[
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\ncover/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\n.pybuilder/\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n#   For a library or package, you might want to ignore these files since the code is\n#   intended to run in multiple environments; otherwise, check them in:\n# .python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# poetry\n#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.\n#   This is especially recommended for binary packages to ensure reproducibility, and is more\n#   commonly ignored for libraries.\n#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control\n#poetry.lock\n\n# pdm\n#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.\n#pdm.lock\n#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it\n#   in version control.\n#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control\n.pdm.toml\n.pdm-python\n.pdm-build/\n\n# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n# pytype static type analyzer\n.pytype/\n\n# Cython debug symbols\ncython_debug/\n\n# PyCharm\n#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can\n#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore\n#  and can be added to the global gitignore or merged into this file.  For a more nuclear\n#  option (not recommended) you can uncomment the following to ignore the entire idea folder.\n#.idea/\n"
  },
  {
    "path": "LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. 
For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. 
Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. (Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "README.md",
    "content": "English | [简体中文](README_cn.md)\n\n## RT-DETRv3: Real-time End-to-End Object Detection with Hierarchical Dense Positive Supervision\n\n:fire::fire:**[WACV 2025 Oral]** The official implementation of the paper \"[RT-DETRv3: Real-time End-to-End Object Detection with Hierarchical Dense Positive Supervision](https://arxiv.org/pdf/2409.08475)\". \\\n[[`arXiv`](https://arxiv.org/pdf/2409.08475)] \n![image](https://github.com/user-attachments/assets/5910d729-cc44-49f4-b404-b6631576930f)\n\n\n## Model Zoo on COCO\n\n| Model | Epoch | Backbone  | Input shape | $AP^{val}$ | $AP^{val}_{50}$| Params(M) | FLOPs(G) |  T4 TensorRT FP16(FPS) | Weight | Config | Log\n|:--------------:|:-----:|:----------:| :-------:|:--------------------------:|:---------------------------:|:---------:|:--------:| :---------------------: |:------------------------------------------------------------------------------------:|:-------------------------------------------:|:---|\n| RT-DETRv3-R18 | 6x |  ResNet-18 | 640 | 48.1 | 66.2 | 20 | 60 | 217 |[baidu 网盘](https://pan.baidu.com/s/1s7lyT6_fHmczoegQZXdX-w?pwd=54jp)  [google drive](https://drive.google.com/file/d/1zIDOjn1qDccC3TBsDlGQHOjVrehd26bk/view?usp=drive_link)| [config](./configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml) | \n| RT-DETRv3-R34 | 6x |  ResNet-34 | 640 | 49.9 | 67.7 | 31 | 92 | 161 | [baidu 网盘](https://pan.baidu.com/s/1VCg6oqNVF9_ZZdmlhUBgSA?pwd=pi32) [google drive](https://drive.google.com/file/d/12-wqAF8i67eqbocaWPK33d4tFkN2wGi2/view?usp=drive_link)| [config](./configs/rtdetrv3/rtdetrv3_r34vd_6x_coco.yml) | \n| RT-DETRv3-R50 | 6x |  ResNet-50 | 640 | 53.4 | 71.7 | 42 | 136 | 108 | [baidu 网盘](https://pan.baidu.com/s/1DuvrpMIqbU5okoDp16C94g?pwd=wrxy) [google drive](https://drive.google.com/file/d/1wfJE-QgdgqKE0IkiTuoD5HEbZwwZg3sQ/view?usp=drive_link)| [config](./configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml) | \n| RT-DETRv3-R101 | 6x |  ResNet-101 | 640 | 54.6 | 73.1 | 76 | 259 | 74 |  | [config](./configs/rtdetrv3/rtdetrv3_r101vd_6x_coco.yml) | \n\n\n**Notes:**\n- RT-DETRv3 uses 4 GPUs for training.\n- RT-DETRv3 was trained on COCO train2017 and evaluated on val2017.\n\n## Model Zoo on LVIS\n\n| Model | Epoch | Backbone  | Input shape | AP | $AP_{r}$ | $AP_{c}$ | $AP_{f}$ | Weight | Config | Log\n|:--------------:|:-----:|:----------:| :-------:|:--------------------------:|:---------------------------:|:---------:| :---------------------: |:------------------------------------------------------------------------------------:|:-------------------------------------------:|:---|\n| RT-DETRv3-R18 | 6x |  ResNet-18 | 640 | 26.5 | 12.5 | 24.3 | 35.2 |  | [config](./configs/rtdetrv3/rtdetrv3_r18vd_6x_lvis.yml) | \n| RT-DETRv3-R50 | 6x |  ResNet-50 | 640 | 33.9 | 20.2 | 32.5 | 41.5 |  | [config](./configs/rtdetrv3/rtdetrv3_r50vd_6x_lvis.yml) |\n\n\n## Quick start\n\n<details open>\n<summary>Install requirements</summary>\n\n<!-- - PaddlePaddle == 2.4.2 -->\n```bash\npip install -r requirements.txt\n```\n\n</details>\n\n<details>\n<summary>Compile (optional)</summary>\n\n```bash\ncd ./ppdet/modeling/transformers/ext_op/\n\npython setup_ms_deformable_attn_op.py install\n```\nSee [details](./ppdet/modeling/transformers/ext_op/)\n</details>\n\n\n<details>\n<summary>Data preparation</summary>\n\n- Download and extract COCO 2017 train and val images.\n```\npath/to/coco/\n  annotations/  # annotation json files\n  train2017/    # train images\n  val2017/      # val images\n```\n- Modify config 
[`dataset_dir`](configs/datasets/coco_detection.yml)\n</details>\n\n\n<details>\n<summary>Training & Evaluation & Testing</summary>\n\n- Training on a Single GPU:\n\n```shell\n# training on single-GPU\nexport CUDA_VISIBLE_DEVICES=0\npython tools/train.py -c configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml --eval\n```\n\n- Training on Multiple GPUs:\n\n```shell\n# training on multi-GPU\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\npython -m paddle.distributed.launch --gpus 0,1,2,3 tools/train.py -c configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml --fleet --eval\n```\n\n- Evaluation:\n\n```shell\npython tools/eval.py -c configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml \\\n              -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetrv3_r18vd_6x_coco.pdparams\n```\n\n- Inference:\n\n```shell\npython tools/infer.py -c configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml \\\n              -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetrv3_r18vd_6x_coco.pdparams \\\n              --infer_img=./demo/000000570688.jpg\n```\n\n</details>\n\n\n## Deploy\n\n<details open>\n<summary>1. Export model </summary>\n\n```shell\npython tools/export_model.py -c configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml \\\n              -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetrv3_r18vd_6x_coco.pdparams trt=True \\\n              --output_dir=output_inference\n```\n\n</details>\n\n<details>\n<summary>2. Convert to ONNX </summary>\n\n- Install [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX) and ONNX\n\n```shell\npip install onnx==1.13.0\npip install paddle2onnx==1.0.5\n```\n\n- Convert:\n\n```shell\npaddle2onnx --model_dir=./output_inference/rtdetrv3_r18vd_6x_coco/ \\\n            --model_filename model.pdmodel  \\\n            --params_filename model.pdiparams \\\n            --opset_version 16 \\\n            --save_file rtdetrv3_r18vd_6x_coco.onnx\n```\n</details>\n\n<details>\n<summary>3. Convert to TensorRT </summary>\n\n- TensorRT version >= 8.5.1\n- Inference can refer to [Bennchmark](../benchmark)\n\n```shell\ntrtexec --onnx=./rtdetrv3_r18vd_6x_coco.onnx \\\n        --workspace=4096 \\\n        --shapes=image:1x3x640x640 \\\n        --saveEngine=rtdetrv3_r18vd_6x_coco.trt \\\n        --avgRuns=100 \\\n        --fp16\n```\n-\n</details>\n\n## Citation\n\nIf you find RT-DETRv3 useful in your research, please consider giving a star ⭐ and citing:\n\n```\n@article{wang2024rt,\n  title={RT-DETRv3: Real-time End-to-End Object Detection with Hierarchical Dense Positive Supervision},\n  author={Wang, Shuo and Xia, Chunlong and Lv, Feng and Shi, Yifeng},\n  journal={arXiv preprint arXiv:2409.08475},\n  year={2024}\n}\n```\n"
  },
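  {
    "path": "deploy/onnx_infer_sketch.py",
    "content": "# Illustrative sketch, not part of the original release: minimal ONNX Runtime\n# inference for the model exported via the README's deploy steps. Assumptions\n# (verify against your own export, e.g. with Netron): the graph exposes inputs\n# named 'image' and 'scale_factor' (the PaddleDetection export defaults) and\n# returns detections as rows of [class_id, score, x1, y1, x2, y2].\nimport cv2\nimport numpy as np\nimport onnxruntime as ort\n\nMODEL = 'rtdetrv3_r18vd_6x_coco.onnx'  # produced by the paddle2onnx step\nIMAGE = 'demo/000000570688.jpg'\n\n\ndef preprocess(path, size=640):\n    # Mirror TestReader: Resize to 640x640 (keep_ratio=False, interp=2/cubic),\n    # NormalizeImage with norm_type 'none' (scale to [0, 1]), then HWC -> CHW.\n    img = cv2.imread(path)\n    h, w = img.shape[:2]\n    img = cv2.resize(img, (size, size), interpolation=cv2.INTER_CUBIC)\n    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0\n    img = img.transpose(2, 0, 1)[None]  # add batch dim -> NCHW\n    scale = np.array([[size / h, size / w]], dtype=np.float32)  # [scale_y, scale_x]\n    return img, scale\n\n\nif __name__ == '__main__':\n    sess = ort.InferenceSession(MODEL, providers=['CPUExecutionProvider'])\n    image, scale_factor = preprocess(IMAGE)\n    dets = sess.run(None, {'image': image, 'scale_factor': scale_factor})[0]\n    for cls_id, score, x1, y1, x2, y2 in dets[dets[:, 1] > 0.5]:\n        print('class=%d score=%.3f box=(%.1f, %.1f, %.1f, %.1f)' %\n              (int(cls_id), score, x1, y1, x2, y2))\n"
  },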
  {
    "path": "configs/datasets/coco_detection.yml",
    "content": "metric: COCO\nnum_classes: 80\n\nTrainDataset:\n  name: COCODataSet\n  image_dir: train2017\n  anno_path: annotations/instances_train2017.json\n  dataset_dir: /root/paddlejob/workspace/env_run/ws/datasets/COCO\n  data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']\n\nEvalDataset:\n  name: COCODataSet\n  image_dir: val2017\n  anno_path: annotations/instances_val2017.json\n  dataset_dir: /root/paddlejob/workspace/env_run/ws/datasets/COCO\n  allow_empty: true\n\nTestDataset:\n  name: ImageFolder\n  anno_path: annotations/instances_val2017.json # also support txt (like VOC's label_list.txt)\n  dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path'\n"
  },
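  {
    "path": "dataset/coco/check_coco_layout.py",
    "content": "# Hypothetical helper (illustrative sketch, not shipped with the repo): sanity\n# check that dataset_dir in configs/datasets/coco_detection.yml points at the\n# COCO layout shown in the README before launching training. Assumes\n# pycocotools is installed, as in most detection environments.\nimport os.path as osp\n\nfrom pycocotools.coco import COCO\n\ndataset_dir = 'dataset/coco'  # keep in sync with the yml\nanno = osp.join(dataset_dir, 'annotations/instances_val2017.json')\ncoco = COCO(anno)\nfirst = coco.loadImgs(coco.getImgIds()[0])[0]\nmsg = 'image_dir val2017 does not match anno_path'\nassert osp.exists(osp.join(dataset_dir, 'val2017', first['file_name'])), msg\nprint('%d images and %d categories look consistent' %\n      (len(coco.getImgIds()), len(coco.getCatIds())))\n"
  },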
  {
    "path": "configs/datasets/coco_instance.yml",
    "content": "metric: COCO\nnum_classes: 80\n\nTrainDataset:\n  name: COCODataSet\n  image_dir: train2017\n  anno_path: annotations/instances_train2017.json\n  dataset_dir: dataset/coco\n  data_fields: ['image', 'gt_bbox', 'gt_class', 'gt_poly', 'is_crowd']\n\nEvalDataset:\n  name: COCODataSet\n  image_dir: val2017\n  anno_path: annotations/instances_val2017.json\n  dataset_dir: dataset/coco\n\nTestDataset:\n  name: ImageFolder\n  anno_path: annotations/instances_val2017.json # also support txt (like VOC's label_list.txt)\n  dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path'\n"
  },
  {
    "path": "configs/datasets/culane.yml",
    "content": "metric: CULaneMetric\nnum_classes: 5 # 4 lanes + background\n\ncut_height: &cut_height 270\ndataset_dir: &dataset_dir dataset/culane\n\nTrainDataset:\n  name: CULaneDataSet\n  dataset_dir: *dataset_dir\n  list_path: 'list/train_gt.txt'\n  split: train\n  cut_height: *cut_height\n\n\nEvalDataset:\n  name: CULaneDataSet\n  dataset_dir: *dataset_dir\n  list_path: 'list/test.txt'\n  split: test\n  cut_height: *cut_height\n\n\nTestDataset:\n  name: CULaneDataSet\n  dataset_dir: *dataset_dir\n  list_path: 'list/test.txt'\n  split: test\n  cut_height: *cut_height\n"
  },
  {
    "path": "configs/datasets/dota.yml",
    "content": "metric: RBOX\nnum_classes: 15\n\nTrainDataset:\n  !COCODataSet\n    image_dir: trainval1024/images\n    anno_path: trainval1024/DOTA_trainval1024.json\n    dataset_dir: dataset/dota/\n    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly']\n\nEvalDataset:\n  !COCODataSet\n    image_dir: trainval1024/images\n    anno_path: trainval1024/DOTA_trainval1024.json\n    dataset_dir: dataset/dota/\n    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly']\n\nTestDataset:\n  !ImageFolder\n    anno_path: test1024/DOTA_test1024.json\n    dataset_dir: dataset/dota/\n"
  },
  {
    "path": "configs/datasets/dota_ms.yml",
    "content": "metric: RBOX\nnum_classes: 15\n\nTrainDataset:\n  !COCODataSet\n    image_dir: trainval1024/images\n    anno_path: trainval1024/DOTA_trainval1024.json\n    dataset_dir: dataset/dota_ms/\n    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly']\n\nEvalDataset:\n  !COCODataSet\n    image_dir: trainval1024/images\n    anno_path: trainval1024/DOTA_trainval1024.json\n    dataset_dir: dataset/dota_ms/\n    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly']\n\nTestDataset:\n  !ImageFolder\n    anno_path: test1024/DOTA_test1024.json\n    dataset_dir: dataset/dota_ms/\n"
  },
  {
    "path": "configs/datasets/lvis_detection.yml",
    "content": "metric: LVIS\nnum_classes: 1203\n\nTrainDataset:\n  name: LVISDataSet\n  image_dir: .\n  anno_path: annotations/lvis_v1_train.json\n  dataset_dir: /root/paddlejob/workspace/env_run/ws/datasets/COCO\n  data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']\n\nEvalDataset:\n  name: LVISDataSet\n  image_dir: .\n  anno_path: annotations/lvis_v1_val.json\n  dataset_dir: /root/paddlejob/workspace/env_run/ws/datasets/COCO\n  allow_empty: true\n\nTestDataset:\n  name: ImageFolder\n  anno_path: annotations/lvis_v1_val.json # also support txt (like VOC's label_list.txt)\n  dataset_dir: /root/paddlejob/workspace/env_run/ws/datasets/COCO # if set, anno_path will be 'dataset_dir/anno_path'"
  },
  {
    "path": "configs/datasets/mcmot.yml",
    "content": "metric: MCMOT\nnum_classes: 10\n# using VisDrone2019 MOT dataset with 10 classes as default, you can modify it for your needs.\n\n# for MCMOT training\nTrainDataset:\n  !MCMOTDataSet\n    dataset_dir: dataset/mot\n    image_lists: ['visdrone_mcmot.train']\n    data_fields: ['image', 'gt_bbox', 'gt_class', 'gt_ide']\n    label_list: label_list.txt\n\n# for MCMOT evaluation\n# If you want to change the MCMOT evaluation dataset, please modify 'data_root'\nEvalMOTDataset:\n  !MOTImageFolder\n    dataset_dir: dataset/mot\n    data_root: visdrone_mcmot/images/val\n    keep_ori_im: False # set True if save visualization images or video, or used in DeepSORT\n\n# for MCMOT video inference\nTestMOTDataset:\n  !MOTImageFolder\n    dataset_dir: dataset/mot\n    keep_ori_im: True # set True if save visualization images or video\n"
  },
  {
    "path": "configs/datasets/mot.yml",
    "content": "metric: MOT\nnum_classes: 1\n\n# for MOT training\nTrainDataset:\n  !MOTDataSet\n    dataset_dir: dataset/mot\n    image_lists: ['mot17.train', 'caltech.all', 'cuhksysu.train', 'prw.train', 'citypersons.train', 'eth.train']\n    data_fields: ['image', 'gt_bbox', 'gt_class', 'gt_ide']\n\n# for MOT evaluation\n# If you want to change the MOT evaluation dataset, please modify 'data_root'\nEvalMOTDataset:\n  !MOTImageFolder\n    dataset_dir: dataset/mot\n    data_root: MOT16/images/train\n    keep_ori_im: False # set True if save visualization images or video, or used in DeepSORT\n\n# for MOT video inference\nTestMOTDataset:\n  !MOTImageFolder\n    dataset_dir: dataset/mot\n    keep_ori_im: True # set True if save visualization images or video\n"
  },
  {
    "path": "configs/datasets/objects365_detection.yml",
    "content": "metric: COCO\nnum_classes: 365\n\nTrainDataset:\n  !COCODataSet\n    image_dir: train\n    anno_path: annotations/zhiyuan_objv2_train.json\n    dataset_dir: dataset/objects365\n    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']\n\nEvalDataset:\n  !COCODataSet\n    image_dir: val\n    anno_path: annotations/zhiyuan_objv2_val.json\n    dataset_dir: dataset/objects365\n    allow_empty: true\n\nTestDataset:\n  !ImageFolder\n    anno_path: annotations/zhiyuan_objv2_val.json\n    dataset_dir: dataset/objects365/\n"
  },
  {
    "path": "configs/datasets/roadsign_voc.yml",
    "content": "metric: VOC\nmap_type: integral\nnum_classes: 4\n\nTrainDataset:\n  name: VOCDataSet\n  dataset_dir: dataset/roadsign_voc\n  anno_path: train.txt\n  label_list: label_list.txt\n  data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult']\n\nEvalDataset:\n  name: VOCDataSet\n  dataset_dir: dataset/roadsign_voc\n  anno_path: valid.txt\n  label_list: label_list.txt\n  data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult']\n\nTestDataset:\n  name: ImageFolder\n  anno_path: dataset/roadsign_voc/label_list.txt\n"
  },
  {
    "path": "configs/datasets/sniper_coco_detection.yml",
    "content": "metric: SNIPERCOCO\nnum_classes: 80\n\nTrainDataset:\n  !SniperCOCODataSet\n    image_dir: train2017\n    anno_path: annotations/instances_train2017.json\n    dataset_dir: dataset/coco\n    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']\n    allow_empty: true\n    is_trainset: true\n    image_target_sizes: [2000, 1000]\n    valid_box_ratio_ranges: [[-1, 0.1],[0.08, -1]]\n    chip_target_size: 512\n    chip_target_stride: 200\n    use_neg_chip: false\n    max_neg_num_per_im: 8\n\n\nEvalDataset:\n  !SniperCOCODataSet\n    image_dir: val2017\n    anno_path: annotations/instances_val2017.json\n    dataset_dir: dataset/coco\n    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']\n    allow_empty: true\n    is_trainset: false\n    image_target_sizes: [2000, 1000]\n    valid_box_ratio_ranges: [[-1, 0.1], [0.08, -1]]\n    chip_target_size: 512\n    chip_target_stride: 200\n    max_per_img: -1\n    nms_thresh: 0.5\n\nTestDataset:\n  !SniperCOCODataSet\n    image_dir: val2017\n    dataset_dir: dataset/coco\n    is_trainset: false\n    image_target_sizes: [2000, 1000]\n    valid_box_ratio_ranges: [[-1, 0.1],[0.08, -1]]\n    chip_target_size: 500\n    chip_target_stride: 200\n    max_per_img: -1\n    nms_thresh: 0.5\n\n\n"
  },
  {
    "path": "configs/datasets/sniper_visdrone_detection.yml",
    "content": "metric: SNIPERCOCO\nnum_classes: 9\n\nTrainDataset:\n  !SniperCOCODataSet\n    image_dir: train\n    anno_path: annotations/train.json\n    dataset_dir: dataset/VisDrone2019_coco\n    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']\n    allow_empty: true\n    is_trainset: true\n    image_target_sizes: [8145, 2742]\n    valid_box_ratio_ranges: [[-1, 0.03142857142857144], [0.02333211853008726, -1]]\n    chip_target_size: 1536\n    chip_target_stride: 1184\n    use_neg_chip: false\n    max_neg_num_per_im: 8\n\n\nEvalDataset:\n  !SniperCOCODataSet\n    image_dir: val\n    anno_path: annotations/val.json\n    dataset_dir: dataset/VisDrone2019_coco\n    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']\n    allow_empty: true\n    is_trainset: false\n    image_target_sizes: [8145, 2742]\n    valid_box_ratio_ranges: [[-1, 0.03142857142857144], [0.02333211853008726, -1]]\n    chip_target_size: 1536\n    chip_target_stride: 1184\n    max_per_img: -1\n    nms_thresh: 0.5\n\nTestDataset:\n  !SniperCOCODataSet\n    image_dir: val\n    dataset_dir: dataset/VisDrone2019_coco\n    is_trainset: false\n    image_target_sizes: [8145, 2742]\n    valid_box_ratio_ranges: [[-1, 0.03142857142857144], [0.02333211853008726, -1]]\n    chip_target_size: 1536\n    chip_target_stride: 1184\n    max_per_img: -1\n    nms_thresh: 0.5\n\n\n"
  },
  {
    "path": "configs/datasets/spine_coco.yml",
    "content": "metric: RBOX\nnum_classes: 9\n\nTrainDataset:\n  !COCODataSet\n    image_dir: images\n    anno_path: annotations/train.json\n    dataset_dir: dataset/spine_coco\n    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly']\n\nEvalDataset:\n  !COCODataSet\n    image_dir: images\n    anno_path: annotations/valid.json\n    dataset_dir: dataset/spine_coco\n    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly']\n\nTestDataset:\n  !ImageFolder\n    anno_path: annotations/valid.json\n    dataset_dir: dataset/spine_coco\n"
  },
  {
    "path": "configs/datasets/visdrone_detection.yml",
    "content": "metric: COCO\nnum_classes: 10\n\nTrainDataset:\n  !COCODataSet\n    image_dir: VisDrone2019-DET-train\n    anno_path: train.json\n    dataset_dir: dataset/visdrone\n    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']\n\nEvalDataset:\n  !COCODataSet\n    image_dir: VisDrone2019-DET-val\n    anno_path: val.json\n    # image_dir: test_dev\n    # anno_path: test_dev.json\n    dataset_dir: dataset/visdrone\n\nTestDataset:\n  !ImageFolder\n    anno_path: val.json\n    dataset_dir: dataset/visdrone\n"
  },
  {
    "path": "configs/datasets/voc.yml",
    "content": "metric: VOC\nmap_type: 11point\nnum_classes: 20\n\nTrainDataset:\n  name: VOCDataSet\n  dataset_dir: dataset/voc\n  anno_path: trainval.txt\n  label_list: label_list.txt\n  data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult']\n\nEvalDataset:\n  name: VOCDataSet\n  dataset_dir: dataset/voc\n  anno_path: test.txt\n  label_list: label_list.txt\n  data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult']\n\nTestDataset:\n  name: ImageFolder\n  anno_path: dataset/voc/label_list.txt\n"
  },
  {
    "path": "configs/datasets/wider_face.yml",
    "content": "metric: WiderFace\nnum_classes: 1\n\nTrainDataset:\n  !WIDERFaceDataSet\n    dataset_dir: dataset/wider_face\n    anno_path: wider_face_split/wider_face_train_bbx_gt.txt\n    image_dir: WIDER_train/images\n    data_fields: ['image', 'gt_bbox', 'gt_class']\n\nEvalDataset:\n  !WIDERFaceValDataset\n    dataset_dir: dataset/wider_face\n    image_dir: WIDER_val/images\n    anno_path: wider_face_split/wider_face_val_bbx_gt.txt\n    gt_mat_path: WIDER_val/ground_truth\n    data_fields: ['image', 'gt_bbox', 'gt_class', 'ori_gt_bbox']\n\nTestDataset:\n  !ImageFolder\n    use_default_label: true\n"
  },
  {
    "path": "configs/rtdetrv3/_base_/optimizer_6x.yml",
    "content": "epoch: 72\n\nLearningRate:\n  base_lr: 0.0004\n  schedulers:\n  - !PiecewiseDecay\n    gamma: 1.0\n    milestones: [100]\n    use_warmup: true\n  - !LinearWarmup\n    start_factor: 0.001\n    steps: 2000\n\nOptimizerBuilder:\n  clip_grad_by_norm: 0.1\n  regularizer: false\n  optimizer:\n    type: AdamW\n    weight_decay: 0.0001\n"
  },
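  {
    "path": "docs/lr_schedule_sketch.py",
    "content": "# Illustrative sketch (hypothetical file, not used by the trainer): the\n# learning-rate curve implied by _base_/optimizer_6x.yml. LinearWarmup ramps\n# from start_factor * base_lr to base_lr over 2000 steps; the PiecewiseDecay\n# step never changes the rate in practice, since gamma is 1.0 and its\n# milestone (epoch 100) lies beyond the 72 training epochs.\n\n\ndef lr_at_step(step, base_lr=4e-4, warmup_steps=2000, start_factor=0.001):\n    if step < warmup_steps:\n        # linear interpolation between start_factor * base_lr and base_lr\n        alpha = step / warmup_steps\n        return base_lr * (start_factor + (1.0 - start_factor) * alpha)\n    return base_lr  # constant afterwards: a gamma=1.0 decay is a no-op\n\n\nif __name__ == '__main__':\n    for s in (0, 500, 1000, 2000, 50000):\n        print('step %6d -> lr %.6f' % (s, lr_at_step(s)))\n"
  },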
  {
    "path": "configs/rtdetrv3/_base_/rtdetr_reader.yml",
    "content": "worker_num: 4\nTrainReader:\n  sample_transforms:\n    - Decode: {}\n    - RandomDistort: {prob: 0.8}\n    - RandomExpand: {fill_value: [123.675, 116.28, 103.53]}\n    - RandomCrop: {prob: 0.8}\n    - RandomFlip: {}\n  batch_transforms:\n    - BatchRandomResize: {target_size: [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800], random_size: True, random_interp: True, keep_ratio: False}\n    - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}\n    - NormalizeBox: {retain_origin_box: true}\n    - BboxXYXY2XYWH: {}\n    - Permute: {}\n    - PadGT: {only_origin_box: true}\n  batch_size: 16\n  shuffle: true\n  drop_last: true\n  collate_batch: false\n  use_shared_memory: true\n\n\nEvalReader:\n  sample_transforms:\n    - Decode: {}\n    - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2}\n    - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}\n    - Permute: {}\n  batch_size: 16\n  shuffle: false\n  drop_last: false\n\n\nTestReader:\n  inputs_def:\n    image_shape: [3, 640, 640]\n  sample_transforms:\n    - Decode: {}\n    - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2}\n    - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}\n    - Permute: {}\n  batch_size: 1\n  shuffle: false\n  drop_last: false\n"
  },
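  {
    "path": "docs/batch_random_resize_sketch.py",
    "content": "# Illustrative sketch (hypothetical file): what the TrainReader's\n# BatchRandomResize transform amounts to with random_size and random_interp\n# enabled -- one target size and one interpolation drawn per batch from the\n# multi-scale list, applied to every image in that batch (keep_ratio: False,\n# so images are squashed to a square). Function and constant names here are\n# illustrative, not the ppdet implementation.\nimport random\n\nimport cv2\n\nSIZES = [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800]\nINTERPS = [cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_AREA,\n           cv2.INTER_CUBIC, cv2.INTER_LANCZOS4]\n\n\ndef batch_random_resize(batch):\n    size = random.choice(SIZES)      # one size for the whole batch\n    interp = random.choice(INTERPS)  # random_interp: True\n    return [cv2.resize(im, (size, size), interpolation=interp) for im in batch]\n"
  },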
  {
    "path": "configs/rtdetrv3/_base_/rtdetrv3_r50vd.yml",
    "content": "architecture: RTDETRV3\npretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams\nnorm_type: sync_bn\nuse_ema: True\nema_decay: 0.9999\nema_decay_type: \"exponential\"\nema_filter_no_grad: True\nhidden_dim: 256\nuse_focal_loss: True\neval_size: [640, 640]\n\n\nRTDETRV3:\n  backbone: ResNet\n  neck: HybridEncoder\n  transformer: RTDETRTransformerv3\n  detr_head: DINOv3Head\n  aux_o2m_head: PPYOLOEHead\n  post_process: DETRPostProcess\n\nResNet:\n  # index 0 stands for res2\n  depth: 50\n  variant: d\n  norm_type: bn\n  freeze_at: 0\n  return_idx: [1, 2, 3]\n  lr_mult_list: [0.1, 0.1, 0.1, 0.1]\n  num_stages: 4\n  freeze_stem_only: True\n\nHybridEncoder:\n  hidden_dim: 256\n  use_encoder_idx: [2]\n  num_encoder_layers: 1\n  encoder_layer:\n    name: TransformerLayer\n    d_model: 256\n    nhead: 8\n    dim_feedforward: 1024\n    dropout: 0.\n    activation: 'gelu'\n  expansion: 1.0\n\n\nRTDETRTransformerv3:\n  num_queries: 300\n  position_embed_type: sine\n  feat_strides: [8, 16, 32]\n  num_levels: 3\n  nhead: 8\n  num_decoder_layers: 6\n  dim_feedforward: 1024\n  dropout: 0.0\n  activation: relu\n  num_denoising: 100\n  label_noise_ratio: 0.5\n  box_noise_scale: 1.0\n  learnt_init_query: False\n  num_noises: 0\n  num_noise_queries: []\n  num_noise_denoising: 100\n  learnt_init_query: False\n\n\nDINOv3Head:\n  o2m: 4\n  loss:\n    name: DINOv3Loss\n    loss_coeff: {class: 1, bbox: 5, giou: 2}\n    aux_loss: True\n    use_vfl: True\n    matcher:\n      name: HungarianMatcher\n      matcher_coeff: {class: 2, bbox: 5, giou: 2}\n\nPPYOLOEHead:\n  fpn_strides: [8, 16, 32]\n  grid_cell_scale: 5.0\n  grid_cell_offset: 0.5\n  static_assigner_epoch: 30\n  use_varifocal_loss: True\n  loss_weight: {class: 1.0, iou: 2.5, dfl: 0.5}\n  static_assigner:\n    name: ATSSAssigner\n    topk: 9\n  assigner:\n    name: TaskAlignedAssigner\n    topk: 13\n    alpha: 1.0\n    beta: 6.0\n  nms:\n    name: MultiClassNMS\n    nms_top_k: 1000\n    keep_top_k: 300\n    score_threshold: 0.01\n    nms_threshold: 0.7\n\nDETRPostProcess:\n  num_top_queries: 300\n"
  },
  {
    "path": "configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml",
    "content": "_BASE_: [\n  '../datasets/coco_detection.yml',\n  '../runtime.yml',\n  '_base_/optimizer_6x.yml',\n  '_base_/rtdetrv3_r50vd.yml',\n  '_base_/rtdetr_reader.yml',\n]\n\nweights: output/rtdetrv3_r18vd_6x_coco/model_final\nfind_unused_parameters: True\nlog_iter: 200\n\no2m_branch: True\nnum_queries_o2m: 450\n\npretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet18_vd_pretrained.pdparams\n\nRTDETRV3:\n  backbone: ResNet\n  neck: HybridEncoder\n  transformer: RTDETRTransformerv3\n  detr_head: DINOv3Head\n  aux_o2m_head: PPYOLOEHead\n  post_process: DETRPostProcess\n  \nResNet:\n  depth: 18\n  variant: d\n  return_idx: [1, 2, 3]\n  freeze_at: -1\n  freeze_norm: false\n  norm_decay: 0.\n\nHybridEncoder:\n  hidden_dim: 256\n  use_encoder_idx: [2]\n  num_encoder_layers: 1\n  encoder_layer:\n    name: TransformerLayer\n    d_model: 256\n    nhead: 8\n    dim_feedforward: 1024\n    dropout: 0.\n    activation: 'gelu'\n  expansion: 0.5\n  depth_mult: 1.0\n\nRTDETRTransformerv3:\n  eval_idx: -1\n  num_decoder_layers: 3\n  num_noises: 3\n  num_noise_queries: [300, 300, 300]\n  num_noise_denoising: 100\n  learnt_init_query: False\n"
  },
  {
    "path": "configs/rtdetrv3/rtdetrv3_r18vd_6x_lvis.yml",
    "content": "_BASE_: [\n  '../datasets/lvis_detection.yml',\n  '../runtime.yml',\n  '_base_/optimizer_6x.yml',\n  '_base_/rtdetrv3_r50vd.yml',\n  '_base_/rtdetr_reader.yml',\n]\n\nweights: output/rtdetrv3vd_r18_6x_lvis/model_final\nfind_unused_parameters: True\nlog_iter: 200\n\no2m_branch: True\nnum_queries_o2m: 450\n\npretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet18_vd_pretrained.pdparams\n\nRTDETRV3:\n  backbone: ResNet\n  neck: HybridEncoder\n  transformer: RTDETRTransformerv3\n  detr_head: DINOv3Head\n  aux_o2m_head: PPYOLOEHead\n  post_process: DETRPostProcess\n  \nResNet:\n  depth: 18\n  variant: d\n  return_idx: [1, 2, 3]\n  freeze_at: -1\n  freeze_norm: false\n  norm_decay: 0.\n\nHybridEncoder:\n  hidden_dim: 256\n  use_encoder_idx: [2]\n  num_encoder_layers: 1\n  encoder_layer:\n    name: TransformerLayer\n    d_model: 256\n    nhead: 8\n    dim_feedforward: 1024\n    dropout: 0.\n    activation: 'gelu'\n  expansion: 0.5\n  depth_mult: 1.0\n\nRTDETRTransformerv3:\n  eval_idx: -1\n  num_decoder_layers: 3\n  num_noises: 2\n  num_noise_queries: [300, 300]\n  num_noise_denoising: 100\n  learnt_init_query: False\n"
  },
  {
    "path": "configs/rtdetrv3/rtdetrv3_r34vd_6x_coco.yml",
    "content": "_BASE_: [\n  '../datasets/coco_detection.yml',\n  '../runtime.yml',\n  '_base_/optimizer_6x.yml',\n  '_base_/rtdetrv3_r50vd.yml',\n  '_base_/rtdetr_reader.yml',\n]\n\nweights: output/rtdetrv3_r34vd_6x_coco/model_final\nfind_unused_parameters: True\nlog_iter: 200\n\no2m_branch: True\nnum_queries_o2m: 450\n\npretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/ResNet34_vd_pretrained.pdparams\n\nRTDETRV3:\n  backbone: ResNet\n  neck: HybridEncoder\n  transformer: RTDETRTransformerv3\n  detr_head: DINOv3Head\n  aux_o2m_head: PPYOLOEHead\n  post_process: DETRPostProcess\n  \nResNet:\n  depth: 34\n  variant: d\n  return_idx: [1, 2, 3]\n  freeze_at: -1\n  freeze_norm: false\n  norm_decay: 0.\n\nHybridEncoder:\n  hidden_dim: 256\n  use_encoder_idx: [2]\n  num_encoder_layers: 1\n  encoder_layer:\n    name: TransformerLayer\n    d_model: 256\n    nhead: 8\n    dim_feedforward: 1024\n    dropout: 0.\n    activation: 'gelu'\n  expansion: 0.5\n  depth_mult: 1.0\n\nRTDETRTransformerv3:\n  eval_idx: -1\n  num_decoder_layers: 4\n  num_noises: 3\n  num_noise_queries: [300, 300, 300]\n  num_noise_denoising: 100\n  learnt_init_query: False\n"
  },
  {
    "path": "configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml",
    "content": "_BASE_: [\n  '../datasets/coco_detection.yml',\n  '../runtime.yml',\n  '_base_/optimizer_6x.yml',\n  '_base_/rtdetrv3_r50vd.yml',\n  '_base_/rtdetr_reader.yml',\n]\n\nweights: output/rtdetrv3_r50vd_6x_coco/model_final\nfind_unused_parameters: True\nlog_iter: 200\n\no2m_branch: True\nnum_queries_o2m: 450\n\npretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams\n\n\nRTDETRTransformerv3:\n  eval_idx: -1\n  num_decoder_layers: 6\n  num_noises: 2\n  num_noise_queries: [300, 300]\n  num_noise_denoising: 100\n  learnt_init_query: False\n"
  },
  {
    "path": "configs/rtdetrv3/rtdetrv3_r50vd_6x_lvis.yml",
    "content": "_BASE_: [\n  '../datasets/lvis_detection.yml',\n  '../runtime.yml',\n  '_base_/optimizer_6x.yml',\n  '_base_/rtdetrv3_r50vd.yml',\n  '_base_/rtdetr_reader.yml',\n]\n\nweights: output/rtdetrv3_r50vd_6x_lvis/model_final\nfind_unused_parameters: True\nlog_iter: 200\nsnapshot_epoch: 2\n\no2m_branch: True\nnum_queries_o2m: 450\n\npretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams\n\n\nRTDETRTransformerv3:\n  eval_idx: -1\n  num_decoder_layers: 6\n  num_noises: 1\n  num_noise_queries: [300]\n  num_noise_denoising: 100\n  learnt_init_query: False\n"
  },
  {
    "path": "configs/runtime.yml",
    "content": "use_gpu: true\nuse_xpu: false\nuse_mlu: false\nuse_npu: false\nlog_iter: 20\nsave_dir: output\nsnapshot_epoch: 1\nprint_flops: false\nprint_params: false\n\n# Exporting the model\nexport:\n  post_process: True  # Whether post-processing is included in the network when export model.\n  nms: True           # Whether NMS is included in the network when export model.\n  benchmark: False    # It is used to testing model performance, if set `True`, post-process and NMS will not be exported.\n  fuse_conv_bn: False\n"
  },
  {
    "path": "dataset/coco/download_coco.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport sys\nimport os.path as osp\nimport logging\n# add python path of PaddleDetection to sys.path\nparent_path = osp.abspath(osp.join(__file__, *(['..'] * 3)))\nif parent_path not in sys.path:\n    sys.path.append(parent_path)\n\nfrom ppdet.utils.download import download_dataset\n\nlogging.basicConfig(level=logging.INFO)\n\ndownload_path = osp.split(osp.realpath(sys.argv[0]))[0]\ndownload_dataset(download_path, 'coco')\n"
  },
  {
    "path": "dataset/dota/.gitignore",
    "content": ""
  },
  {
    "path": "dataset/mot/gen_labels_MOT.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os.path as osp\nimport os\nimport numpy as np\n\nMOT_data = 'MOT16'\n\n# choose a data in ['MOT15', 'MOT16', 'MOT17', 'MOT20']\n# or your custom data (prepare it following the 'docs/tutorials/PrepareMOTDataSet.md')\n\n\ndef mkdirs(d):\n    if not osp.exists(d):\n        os.makedirs(d)\n\n\nseq_root = './{}/images/train'.format(MOT_data)\nlabel_root = './{}/labels_with_ids/train'.format(MOT_data)\nmkdirs(label_root)\nseqs = [s for s in os.listdir(seq_root)]\n\ntid_curr = 0\ntid_last = -1\nfor seq in seqs:\n    seq_info = open(osp.join(seq_root, seq, 'seqinfo.ini')).read()\n    seq_width = int(seq_info[seq_info.find('imWidth=') + 8:seq_info.find(\n        '\\nimHeight')])\n    seq_height = int(seq_info[seq_info.find('imHeight=') + 9:seq_info.find(\n        '\\nimExt')])\n\n    gt_txt = osp.join(seq_root, seq, 'gt', 'gt.txt')\n    gt = np.loadtxt(gt_txt, dtype=np.float64, delimiter=',')\n\n    seq_label_root = osp.join(label_root, seq, 'img1')\n    mkdirs(seq_label_root)\n\n    for fid, tid, x, y, w, h, mark, label, _ in gt:\n        if mark == 0 or not label == 1:\n            continue\n        fid = int(fid)\n        tid = int(tid)\n        if not tid == tid_last:\n            tid_curr += 1\n            tid_last = tid\n        x += w / 2\n        y += h / 2\n        label_fpath = osp.join(seq_label_root, '{:06d}.txt'.format(fid))\n        label_str = '0 {:d} {:.6f} {:.6f} {:.6f} {:.6f}\\n'.format(\n            tid_curr, x / seq_width, y / seq_height, w / seq_width,\n            h / seq_height)\n        with open(label_fpath, 'a') as f:\n            f.write(label_str)\n"
  },
  {
    "path": "dataset/roadsign_voc/download_roadsign_voc.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport sys\nimport os.path as osp\nimport logging\n# add python path of PaddleDetection to sys.path\nparent_path = osp.abspath(osp.join(__file__, *(['..'] * 3)))\nif parent_path not in sys.path:\n    sys.path.append(parent_path)\n\nfrom ppdet.utils.download import download_dataset\n\nlogging.basicConfig(level=logging.INFO)\n\ndownload_path = osp.split(osp.realpath(sys.argv[0]))[0]\ndownload_dataset(download_path, 'roadsign_voc')\n"
  },
  {
    "path": "dataset/roadsign_voc/label_list.txt",
    "content": "speedlimit\ncrosswalk\ntrafficlight\nstop"
  },
  {
    "path": "dataset/spine_coco/download_spine_coco.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport sys\nimport os.path as osp\nimport logging\n# add python path of PaddleDetection to sys.path\nparent_path = osp.abspath(osp.join(__file__, *(['..'] * 3)))\nif parent_path not in sys.path:\n    sys.path.append(parent_path)\n\nfrom ppdet.utils.download import download_dataset\n\nlogging.basicConfig(level=logging.INFO)\n\ndownload_path = osp.split(osp.realpath(sys.argv[0]))[0]\ndownload_dataset(download_path, 'spine_coco')\n"
  },
  {
    "path": "dataset/voc/create_list.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport sys\nimport os.path as osp\nimport logging\n# add python path of PaddleDetection to sys.path\nparent_path = osp.abspath(osp.join(__file__, *(['..'] * 3)))\nif parent_path not in sys.path:\n    sys.path.append(parent_path)\n\nfrom ppdet.utils.download import create_voc_list\n\nlogging.basicConfig(level=logging.INFO)\n\nvoc_path = osp.split(osp.realpath(sys.argv[0]))[0]\ncreate_voc_list(voc_path)\n"
  },
  {
    "path": "dataset/voc/download_voc.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport sys\nimport os.path as osp\nimport logging\n# add python path of PaddleDetection to sys.path\nparent_path = osp.abspath(osp.join(__file__, *(['..'] * 3)))\nif parent_path not in sys.path:\n    sys.path.append(parent_path)\n\nfrom ppdet.utils.download import download_dataset\n\nlogging.basicConfig(level=logging.INFO)\n\ndownload_path = osp.split(osp.realpath(sys.argv[0]))[0]\ndownload_dataset(download_path, 'voc')\n"
  },
  {
    "path": "dataset/voc/label_list.txt",
    "content": "aeroplane\nbicycle\nbird\nboat\nbottle\nbus\ncar\ncat\nchair\ncow\ndiningtable\ndog\nhorse\nmotorbike\nperson\npottedplant\nsheep\nsofa\ntrain\ntvmonitor\n"
  },
  {
    "path": "dataset/wider_face/download_wider_face.sh",
    "content": "# All rights `PaddleDetection` reserved\n# References:\n#   @inproceedings{yang2016wider,\n#   Author = {Yang, Shuo and Luo, Ping and Loy, Chen Change and Tang, Xiaoou},\n#   Booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},\n#   Title = {WIDER FACE: A Face Detection Benchmark},\n#   Year = {2016}}\n\nDIR=\"$( cd \"$(dirname \"$0\")\" ; pwd -P )\"\ncd \"$DIR\"\n\n# Download the data.\necho \"Downloading...\"\nwget https://dataset.bj.bcebos.com/wider_face/WIDER_train.zip\nwget https://dataset.bj.bcebos.com/wider_face/WIDER_val.zip\nwget https://dataset.bj.bcebos.com/wider_face/wider_face_split.zip\n# Extract the data.\necho \"Extracting...\"\nunzip -q WIDER_train.zip\nunzip -q WIDER_val.zip\nunzip -q wider_face_split.zip\n"
  },
  {
    "path": "ppdet/__init__.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom . import (core, data, engine, modeling, model_zoo, optimizer, metrics,\n               utils, slim)\n\n\ntry:\n    from .version import full_version as __version__\n    from .version import commit as __git_commit__\nexcept ImportError:\n    import sys\n    sys.stderr.write(\"Warning: import ppdet from source directory \" \\\n            \"without installing, run 'python setup.py install' to \" \\\n            \"install ppdet firstly\\n\")\n"
  },
  {
    "path": "ppdet/core/__init__.py",
    "content": "#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom . import config\n"
  },
  {
    "path": "ppdet/core/config/__init__.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppdet/core/config/schema.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import print_function\nfrom __future__ import division\n\nimport inspect\nimport importlib\nimport re\n\ntry:\n    from docstring_parser import parse as doc_parse\nexcept Exception:\n\n    def doc_parse(*args):\n        pass\n\n\ntry:\n    from typeguard import check_type\nexcept Exception:\n\n    def check_type(*args):\n        pass\n\n\n__all__ = ['SchemaValue', 'SchemaDict', 'SharedConfig', 'extract_schema']\n\n\nclass SchemaValue(object):\n    def __init__(self, name, doc='', type=None):\n        super(SchemaValue, self).__init__()\n        self.name = name\n        self.doc = doc\n        self.type = type\n\n    def set_default(self, value):\n        self.default = value\n\n    def has_default(self):\n        return hasattr(self, 'default')\n\n\nclass SchemaDict(dict):\n    def __init__(self, **kwargs):\n        super(SchemaDict, self).__init__()\n        self.schema = {}\n        self.strict = False\n        self.doc = \"\"\n        self.update(kwargs)\n\n    def __setitem__(self, key, value):\n        # XXX also update regular dict to SchemaDict??\n        if isinstance(value, dict) and key in self and isinstance(self[key],\n                                                                  SchemaDict):\n            self[key].update(value)\n        else:\n            super(SchemaDict, self).__setitem__(key, value)\n\n    def __missing__(self, key):\n        if self.has_default(key):\n            return self.schema[key].default\n        elif key in self.schema:\n            return self.schema[key]\n        else:\n            raise KeyError(key)\n\n    def copy(self):\n        newone = SchemaDict()\n        newone.__dict__.update(self.__dict__)\n        newone.update(self)\n        return newone\n\n    def set_schema(self, key, value):\n        assert isinstance(value, SchemaValue)\n        self.schema[key] = value\n\n    def set_strict(self, strict):\n        self.strict = strict\n\n    def has_default(self, key):\n        return key in self.schema and self.schema[key].has_default()\n\n    def is_default(self, key):\n        if not self.has_default(key):\n            return False\n        if hasattr(self[key], '__dict__'):\n            return True\n        else:\n            return key not in self or self[key] == self.schema[key].default\n\n    def find_default_keys(self):\n        return [\n            k for k in list(self.keys()) + list(self.schema.keys())\n            if self.is_default(k)\n        ]\n\n    def mandatory(self):\n        return any([k for k in self.schema.keys() if not self.has_default(k)])\n\n    def find_missing_keys(self):\n        missing = [\n            k for k in self.schema.keys()\n            if k not in self and not self.has_default(k)\n        ]\n        placeholders = [k for k in self if self[k] in ('<missing>', '<value>')]\n        return missing + placeholders\n\n  
  def find_extra_keys(self):\n        return list(set(self.keys()) - set(self.schema.keys()))\n\n    def find_mismatch_keys(self):\n        mismatch_keys = []\n        for arg in self.schema.values():\n            if arg.type is not None:\n                try:\n                    check_type(\"{}.{}\".format(self.name, arg.name),\n                               self[arg.name], arg.type)\n                except Exception:\n                    mismatch_keys.append(arg.name)\n        return mismatch_keys\n\n    def validate(self):\n        missing_keys = self.find_missing_keys()\n        if missing_keys:\n            raise ValueError(\"Missing param for class<{}>: {}\".format(\n                self.name, \", \".join(missing_keys)))\n        extra_keys = self.find_extra_keys()\n        if extra_keys and self.strict:\n            raise ValueError(\"Extraneous param for class<{}>: {}\".format(\n                self.name, \", \".join(extra_keys)))\n        mismatch_keys = self.find_mismatch_keys()\n        if mismatch_keys:\n            raise TypeError(\"Wrong param type for class<{}>: {}\".format(\n                self.name, \", \".join(mismatch_keys)))\n\n\nclass SharedConfig(object):\n    \"\"\"\n    Representation class for `__shared__` annotations, which work as follows:\n\n    - if `key` is set for the module in config file, its value will take\n      precedence\n    - if `key` is not set for the module but present in the config file, its\n      value will be used\n    - otherwise, use the provided `default_value` as fallback\n\n    Args:\n        key: config[key] will be injected\n        default_value: fallback value\n    \"\"\"\n\n    def __init__(self, key, default_value=None):\n        super(SharedConfig, self).__init__()\n        self.key = key\n        self.default_value = default_value\n\n\ndef extract_schema(cls):\n    \"\"\"\n    Extract schema from a given class\n\n    Args:\n        cls (type): Class from which to extract.\n\n    Returns:\n        schema (SchemaDict): Extracted schema.\n    \"\"\"\n    ctor = cls.__init__\n    # python 2 compatibility\n    if hasattr(inspect, 'getfullargspec'):\n        argspec = inspect.getfullargspec(ctor)\n        annotations = argspec.annotations\n        has_kwargs = argspec.varkw is not None\n    else:\n        argspec = inspect.getargspec(ctor)\n        # python 2 type hinting workaround, see pep-3107\n        # however, since `typeguard` does not support python 2, type checking\n        # is still python 3 only for now\n        annotations = getattr(ctor, '__annotations__', {})\n        has_kwargs = argspec.keywords is not None\n\n    names = [arg for arg in argspec.args if arg != 'self']\n    defaults = argspec.defaults\n    num_defaults = argspec.defaults is not None and len(argspec.defaults) or 0\n    num_required = len(names) - num_defaults\n\n    docs = cls.__doc__\n    if docs is None and getattr(cls, '__category__', None) == 'op':\n        docs = cls.__call__.__doc__\n    try:\n        docstring = doc_parse(docs)\n    except Exception:\n        docstring = None\n\n    if docstring is None:\n        comments = {}\n    else:\n        comments = {}\n        for p in docstring.params:\n            match_obj = re.match('^([a-zA-Z_]+[a-zA-Z_0-9]*).*', p.arg_name)\n            if match_obj is not None:\n                comments[match_obj.group(1)] = p.description\n\n    schema = SchemaDict()\n    schema.name = cls.__name__\n    schema.doc = \"\"\n    if docs is not None:\n        start_pos = docs[0] == '\\n' and 1 or 0\n        
schema.doc = docs[start_pos:].split(\"\\n\")[0].strip()\n    # XXX handle paddle's weird doc convention\n    if '**' == schema.doc[:2] and '**' == schema.doc[-2:]:\n        schema.doc = schema.doc[2:-2].strip()\n    schema.category = hasattr(cls, '__category__') and getattr(\n        cls, '__category__') or 'module'\n    schema.strict = not has_kwargs\n    schema.pymodule = importlib.import_module(cls.__module__)\n    schema.inject = getattr(cls, '__inject__', [])\n    schema.shared = getattr(cls, '__shared__', [])\n    for idx, name in enumerate(names):\n        comment = name in comments and comments[name] or name\n        if name in schema.inject:\n            type_ = None\n        else:\n            type_ = name in annotations and annotations[name] or None\n        value_schema = SchemaValue(name, comment, type_)\n        if name in schema.shared:\n            assert idx >= num_required, \"shared config must have default value\"\n            default = defaults[idx - num_required]\n            value_schema.set_default(SharedConfig(name, default))\n        elif idx >= num_required:\n            default = defaults[idx - num_required]\n            value_schema.set_default(default)\n        schema.set_schema(name, value_schema)\n\n    return schema\n"
  },
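  {
    "path": "examples/schema_usage_sketch.py",
    "content": "# NOTE: hypothetical usage sketch for extract_schema and SharedConfig;\n# illustrative only and not shipped with the package. It assumes ppdet is\n# importable; FakeHead is a made-up class used purely for demonstration.\nfrom ppdet.core.config.schema import SharedConfig, extract_schema\n\n\nclass FakeHead(object):\n    \"\"\"A fake detection head.\n\n    Args:\n        num_classes (int): number of classes\n        in_channels (int): number of input channels\n    \"\"\"\n    __shared__ = ['num_classes']\n\n    def __init__(self, num_classes=80, in_channels=256):\n        self.num_classes = num_classes\n        self.in_channels = in_channels\n\n\nschema = extract_schema(FakeHead)\nprint(schema.name)    # FakeHead\nprint(schema.shared)  # ['num_classes']\n# a __shared__ parameter gets a SharedConfig default instead of the raw value\nassert isinstance(schema.schema['num_classes'].default, SharedConfig)\nassert schema.schema['in_channels'].default == 256\n"
  },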
  {
    "path": "ppdet/core/config/yaml_helpers.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport importlib\nimport inspect\n\nimport yaml\nfrom .schema import SharedConfig\n\n__all__ = ['serializable', 'Callable']\n\n\ndef represent_dictionary_order(self, dict_data):\n    return self.represent_mapping('tag:yaml.org,2002:map', dict_data.items())\n\n\ndef setup_orderdict():\n    from collections import OrderedDict\n    yaml.add_representer(OrderedDict, represent_dictionary_order)\n\n\ndef _make_python_constructor(cls):\n    def python_constructor(loader, node):\n        if isinstance(node, yaml.SequenceNode):\n            args = loader.construct_sequence(node, deep=True)\n            return cls(*args)\n        else:\n            kwargs = loader.construct_mapping(node, deep=True)\n            try:\n                return cls(**kwargs)\n            except Exception as ex:\n                print(\"Error when construct {} instance from yaml config\".\n                      format(cls.__name__))\n                raise ex\n\n    return python_constructor\n\n\ndef _make_python_representer(cls):\n    # python 2 compatibility\n    if hasattr(inspect, 'getfullargspec'):\n        argspec = inspect.getfullargspec(cls)\n    else:\n        argspec = inspect.getfullargspec(cls.__init__)\n    argnames = [arg for arg in argspec.args if arg != 'self']\n\n    def python_representer(dumper, obj):\n        if argnames:\n            data = {name: getattr(obj, name) for name in argnames}\n        else:\n            data = obj.__dict__\n        if '_id' in data:\n            del data['_id']\n        return dumper.represent_mapping(u'!{}'.format(cls.__name__), data)\n\n    return python_representer\n\n\ndef serializable(cls):\n    \"\"\"\n    Add loader and dumper for given class, which must be\n    \"trivially serializable\"\n\n    Args:\n        cls: class to be serialized\n\n    Returns: cls\n    \"\"\"\n    yaml.add_constructor(u'!{}'.format(cls.__name__),\n                         _make_python_constructor(cls))\n    yaml.add_representer(cls, _make_python_representer(cls))\n    return cls\n\n\nyaml.add_representer(SharedConfig,\n                     lambda d, o: d.represent_data(o.default_value))\n\n\n@serializable\nclass Callable(object):\n    \"\"\"\n    Helper to be used in Yaml for creating arbitrary class objects\n\n    Args:\n        full_type (str): the full module path to target function\n    \"\"\"\n\n    def __init__(self, full_type, args=[], kwargs={}):\n        super(Callable, self).__init__()\n        self.full_type = full_type\n        self.args = args\n        self.kwargs = kwargs\n\n    def __call__(self):\n        if '.' 
in self.full_type:\n            idx = self.full_type.rfind('.')\n            module = importlib.import_module(self.full_type[:idx])\n            func_name = self.full_type[idx + 1:]\n        else:\n            try:\n                module = importlib.import_module('builtins')\n            except Exception:\n                module = importlib.import_module('__builtin__')\n            func_name = self.full_type\n\n        func = getattr(module, func_name)\n        return func(*self.args, **self.kwargs)\n"
  },
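  {
    "path": "examples/yaml_helpers_usage_sketch.py",
    "content": "# NOTE: hypothetical usage sketch for the serializable decorator and the\n# Callable helper; illustrative only and not shipped with the package.\n# It assumes PyYAML is installed; Point is a made-up class.\nimport yaml\n\nfrom ppdet.core.config.yaml_helpers import Callable, serializable\n\n\n@serializable\nclass Point(object):\n    def __init__(self, x=0, y=0):\n        self.x = x\n        self.y = y\n\n\n# the decorator registers a !Point YAML tag for both dumping and loading\ntext = yaml.dump(Point(1, 2))\nrestored = yaml.load(text, Loader=yaml.Loader)\nassert (restored.x, restored.y) == (1, 2)\n\n# Callable defers construction of an arbitrary object described in YAML\nsqrt = Callable('math.sqrt', args=[9.0])\nassert sqrt() == 3.0\n"
  },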
  {
    "path": "ppdet/core/workspace.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import print_function\nfrom __future__ import division\n\nimport importlib\nimport os\nimport sys\n\nimport yaml\nimport collections\n\ntry:\n    collectionsAbc = collections.abc\nexcept AttributeError:\n    collectionsAbc = collections\n\nfrom .config.schema import SchemaDict, SharedConfig, extract_schema\nfrom .config.yaml_helpers import serializable\n\n__all__ = [\n    'global_config',\n    'load_config',\n    'merge_config',\n    'get_registered_modules',\n    'create',\n    'register',\n    'serializable',\n    'dump_value',\n]\n\n\ndef dump_value(value):\n    # XXX this is hackish, but collections.abc is not available in python 2\n    if hasattr(value, '__dict__') or isinstance(value, (dict, tuple, list)):\n        value = yaml.dump(value, default_flow_style=True)\n        value = value.replace('\\n', '')\n        value = value.replace('...', '')\n        return \"'{}'\".format(value)\n    else:\n        # primitive types\n        return str(value)\n\n\nclass AttrDict(dict):\n    \"\"\"Single level attribute dict, NOT recursive\"\"\"\n\n    def __init__(self, **kwargs):\n        super(AttrDict, self).__init__()\n        super(AttrDict, self).update(kwargs)\n\n    def __getattr__(self, key):\n        if key in self:\n            return self[key]\n        raise AttributeError(\"object has no attribute '{}'\".format(key))\n\n    def __setattr__(self, key, value):\n        self[key] = value\n\n    def copy(self):\n        new_dict = AttrDict()\n        for k, v in self.items():\n            new_dict.update({k: v})\n        return new_dict\n\n\nglobal_config = AttrDict()\n\nBASE_KEY = '_BASE_'\n\n\n# parse and load _BASE_ recursively\ndef _load_config_with_base(file_path):\n    with open(file_path) as f:\n        file_cfg = yaml.load(f, Loader=yaml.Loader)\n\n    # NOTE: cfgs outside have higher priority than cfgs in _BASE_\n    if BASE_KEY in file_cfg:\n        all_base_cfg = AttrDict()\n        base_ymls = list(file_cfg[BASE_KEY])\n        for base_yml in base_ymls:\n            if base_yml.startswith(\"~\"):\n                base_yml = os.path.expanduser(base_yml)\n            if not base_yml.startswith('/'):\n                base_yml = os.path.join(os.path.dirname(file_path), base_yml)\n\n            with open(base_yml) as f:\n                base_cfg = _load_config_with_base(base_yml)\n                all_base_cfg = merge_config(base_cfg, all_base_cfg)\n\n        del file_cfg[BASE_KEY]\n        return merge_config(file_cfg, all_base_cfg)\n\n    return file_cfg\n\n\ndef load_config(file_path):\n    \"\"\"\n    Load config from file.\n\n    Args:\n        file_path (str): Path of the config file to be loaded.\n\n    Returns: global config\n    \"\"\"\n    _, ext = os.path.splitext(file_path)\n    assert ext in ['.yml', '.yaml'], \"only support yaml files for now\"\n\n    # load config from file and 
merge into global config\n    cfg = _load_config_with_base(file_path)\n    cfg['filename'] = os.path.splitext(os.path.split(file_path)[-1])[0]\n    merge_config(cfg)\n\n    return global_config\n\n\ndef dict_merge(dct, merge_dct):\n    \"\"\" Recursive dict merge. Inspired by :meth:`dict.update`, but instead of\n    updating only top-level keys, dict_merge recurses down into dicts nested\n    to an arbitrary depth, updating keys. The ``merge_dct`` is merged into\n    ``dct``.\n\n    Args:\n        dct: dict onto which the merge is executed\n        merge_dct: dict merged into ``dct``\n\n    Returns: dct\n    \"\"\"\n    for k, v in merge_dct.items():\n        if (k in dct and isinstance(dct[k], dict) and\n                isinstance(merge_dct[k], collectionsAbc.Mapping)):\n            dict_merge(dct[k], merge_dct[k])\n        else:\n            dct[k] = merge_dct[k]\n    return dct\n\n\ndef merge_config(config, another_cfg=None):\n    \"\"\"\n    Merge config into global config or another_cfg.\n\n    Args:\n        config (dict): Config to be merged.\n        another_cfg (dict): Optional dict to merge config into;\n            defaults to the global config.\n\n    Returns: the merged config (the global config by default)\n    \"\"\"\n    global global_config\n    dct = another_cfg or global_config\n    return dict_merge(dct, config)\n\n\ndef get_registered_modules():\n    return {k: v for k, v in global_config.items() if isinstance(v, SchemaDict)}\n\n\ndef make_partial(cls):\n    op_module = importlib.import_module(cls.__op__.__module__)\n    op = getattr(op_module, cls.__op__.__name__)\n    cls.__category__ = getattr(cls, '__category__', None) or 'op'\n\n    def partial_apply(self, *args, **kwargs):\n        kwargs_ = self.__dict__.copy()\n        kwargs_.update(kwargs)\n        return op(*args, **kwargs_)\n\n    if getattr(cls, '__append_doc__', True):  # XXX should default to True?\n        if sys.version_info[0] > 2:\n            cls.__doc__ = \"Wrapper for `{}` OP\".format(op.__name__)\n            cls.__init__.__doc__ = op.__doc__\n            cls.__call__ = partial_apply\n            cls.__call__.__doc__ = op.__doc__\n        else:\n            # XXX work around for python 2\n            partial_apply.__doc__ = op.__doc__\n            cls.__call__ = partial_apply\n    return cls\n\n\ndef register(cls):\n    \"\"\"\n    Register a given module class.\n\n    Args:\n        cls (type): Module class to be registered.\n\n    Returns: cls\n    \"\"\"\n    if cls.__name__ in global_config:\n        raise ValueError(\"Module class already registered: {}\".format(\n            cls.__name__))\n    if hasattr(cls, '__op__'):\n        cls = make_partial(cls)\n    global_config[cls.__name__] = extract_schema(cls)\n    return cls\n\n\ndef create(cls_or_name, **kwargs):\n    \"\"\"\n    Create an instance of given module class.\n\n    Args:\n        cls_or_name (type or str): Class of which to create instance.\n\n    Returns: instance of type `cls_or_name`\n    \"\"\"\n    assert type(cls_or_name) in [type, str\n                                 ], \"should be a class or name of a class\"\n    name = type(cls_or_name) == str and cls_or_name or cls_or_name.__name__\n    if name in global_config:\n        if isinstance(global_config[name], SchemaDict):\n            pass\n        elif hasattr(global_config[name], \"__dict__\"):\n            # support instance return directly\n            return global_config[name]\n        else:\n            raise ValueError(\"The module {} is not registered\".format(name))\n    else:\n        raise ValueError(\"The module {} is not registered\".format(name))\n\n    config = global_config[name]\n    cls = 
getattr(config.pymodule, name)\n    cls_kwargs = {}\n    cls_kwargs.update(global_config[name])\n\n    # parse `shared` annotation of registered modules\n    if getattr(config, 'shared', None):\n        for k in config.shared:\n            target_key = config[k]\n            shared_conf = config.schema[k].default\n            assert isinstance(shared_conf, SharedConfig)\n            if target_key is not None and not isinstance(target_key,\n                                                         SharedConfig):\n                continue  # value is given for the module\n            elif shared_conf.key in global_config:\n                # `key` is present in config\n                cls_kwargs[k] = global_config[shared_conf.key]\n            else:\n                cls_kwargs[k] = shared_conf.default_value\n\n    # parse `inject` annotation of registered modules\n    if getattr(cls, 'from_config', None):\n        cls_kwargs.update(cls.from_config(config, **kwargs))\n\n    if getattr(config, 'inject', None):\n        for k in config.inject:\n            target_key = config[k]\n            # optional dependency\n            if target_key is None:\n                continue\n\n            if isinstance(target_key, dict) or hasattr(target_key, '__dict__'):\n                if 'name' not in target_key.keys():\n                    continue\n                inject_name = str(target_key['name'])\n                if inject_name not in global_config:\n                    raise ValueError(\n                        \"Missing injection name {}; check its name in the cfg file\".\n                        format(k))\n                target = global_config[inject_name]\n                for i, v in target_key.items():\n                    if i == 'name':\n                        continue\n                    target[i] = v\n                if isinstance(target, SchemaDict):\n                    cls_kwargs[k] = create(inject_name)\n            elif isinstance(target_key, str):\n                if target_key not in global_config:\n                    raise ValueError(\"Missing injection config:\", target_key)\n                target = global_config[target_key]\n                if isinstance(target, SchemaDict):\n                    cls_kwargs[k] = create(target_key)\n                elif hasattr(target, '__dict__'):  # serialized object\n                    cls_kwargs[k] = target\n            else:\n                raise ValueError(\"Unsupported injection type:\", target_key)\n    # prevent modification of global config values of reference types\n    # (e.g., list, dict) from within the created module instances\n    #kwargs = copy.deepcopy(kwargs)\n    return cls(**cls_kwargs)\n"
  },
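  {
    "path": "examples/workspace_usage_sketch.py",
    "content": "# NOTE: hypothetical sketch of the register/merge_config/create workflow;\n# illustrative only and not shipped with the package. TinyNeck is a made-up\n# module class, and the dict below stands in for a parsed YAML config.\nfrom ppdet.core.workspace import create, merge_config, register\n\n\n@register\nclass TinyNeck(object):\n    __shared__ = ['num_classes']\n\n    def __init__(self, num_classes=80, out_channel=64):\n        self.num_classes = num_classes\n        self.out_channel = out_channel\n\n\n# the global num_classes is picked up through the __shared__ annotation,\n# while TinyNeck.out_channel overrides the constructor default\nmerge_config({'num_classes': 20, 'TinyNeck': {'out_channel': 128}})\nneck = create('TinyNeck')\nassert neck.num_classes == 20 and neck.out_channel == 128\n"
  },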
  {
    "path": "ppdet/data/__init__.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom . import source\nfrom . import transform\nfrom . import reader\n\nfrom .source import *\nfrom .transform import *\nfrom .reader import *\n"
  },
  {
    "path": "ppdet/data/crop_utils/__init__.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License."
  },
  {
    "path": "ppdet/data/crop_utils/annotation_cropper.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport copy\nimport math\nimport random\nimport numpy as np\nfrom copy import deepcopy\nfrom typing import List, Tuple\nfrom collections import defaultdict\n\nfrom .chip_box_utils import nms, transform_chip_boxes2image_boxes\nfrom .chip_box_utils import find_chips_to_cover_overlaped_boxes\nfrom .chip_box_utils import transform_chip_box\nfrom .chip_box_utils import intersection_over_box\n\n\nclass AnnoCropper(object):\n    def __init__(self,\n                 image_target_sizes: List[int],\n                 valid_box_ratio_ranges: List[List[float]],\n                 chip_target_size: int,\n                 chip_target_stride: int,\n                 use_neg_chip: bool=False,\n                 max_neg_num_per_im: int=8,\n                 max_per_img: int=-1,\n                 nms_thresh: int=0.5):\n        \"\"\"\n        Generate chips by chip_target_size and chip_target_stride.\n        These two parameters just like kernel_size and stride in cnn.\n\n        Each image has its raw size. After resizing, then get its target size.\n        The resizing scale = target_size / raw_size.\n        So are chips of the image.\n        box_ratio = box_raw_size / image_raw_size = box_target_size / image_target_size\n        The 'size' above mentioned is the size of long-side of image, box or chip.\n\n        :param image_target_sizes: [2000, 1000]\n        :param valid_box_ratio_ranges:  [[-1, 0.1],[0.08, -1]]\n        :param chip_target_size: 500\n        :param chip_target_stride: 200\n        \"\"\"\n        self.target_sizes = image_target_sizes\n        self.valid_box_ratio_ranges = valid_box_ratio_ranges\n        assert len(self.target_sizes) == len(self.valid_box_ratio_ranges)\n        self.scale_num = len(self.target_sizes)\n        self.chip_target_size = chip_target_size  # is target size\n        self.chip_target_stride = chip_target_stride  # is target stride\n        self.use_neg_chip = use_neg_chip\n        self.max_neg_num_per_im = max_neg_num_per_im\n        self.max_per_img = max_per_img\n        self.nms_thresh = nms_thresh\n\n    def crop_anno_records(self, records: List[dict]):\n        \"\"\"\n        The main logic:\n        # foreach record(image):\n        #   foreach scale:\n        #     1 generate chips by chip size and stride for each scale\n        #     2 get pos chips\n        #     - validate boxes: current scale; h,w >= 1\n        #     - find pos chips greedily by valid gt boxes in each scale\n        #     - for every valid gt box, find its corresponding pos chips in each scale\n        #     3 get neg chips\n        #     - If given proposals, find neg boxes in them which are not in pos chips\n        #     - If got neg boxes in last step, we find neg chips and assign neg boxes to neg chips such as 2.\n        # 4 sample neg chips if too much each image\n        #   transform this image-scale annotations to chips(pos chips&neg chips) 
annotations\n\n        :param records: standard coco records, but with an extra key `proposals` (Px4), which are predicted by the stage-1\n                        model and may have neg boxes in them.\n        :return: new_records, list of dict like\n        {\n            'im_file': 'fake_image1.jpg',\n            'im_id': np.array([1]),  # new _global_chip_id as im_id\n            'h': h,  # chip height\n            'w': w,  # chip width\n            'is_crowd': is_crowd,  # Nx1 -> Mx1\n            'gt_class': gt_class,  # Nx1 -> Mx1\n            'gt_bbox': gt_bbox,  # Nx4 -> Mx4, 4 represents [x1,y1,x2,y2]\n            'gt_poly': gt_poly,  # [None]xN -> [None]xM\n            'chip': [x1, y1, x2, y2]  # added\n        }\n\n        Attention:\n        ------------------------------>x\n        |\n        |    (x1,y1)------\n        |       |        |\n        |       |        |\n        |       |        |\n        |       |        |\n        |       |        |\n        |       ----------\n        |                 (x2,y2)\n        |\n        ↓\n        y\n\n        If we use [x1, y1, x2, y2] to represent boxes or chips,\n        (x1,y1) is the left-top point which is in the box,\n        but (x2,y2) is the right-bottom point which is not in the box.\n        So x1 in [0, w-1], x2 in [1, w], y1 in [0, h-1], y2 in [1,h].\n        And you can use x2-x1 to get width, and you can use image[y1:y2, x1:x2] to get the box area.\n        \"\"\"\n\n        self.chip_records = []\n        self._global_chip_id = 1\n        for r in records:\n            self._cur_im_pos_chips = [\n            ]  # element: (chip, boxes_idx), chip is [x1, y1, x2, y2], boxes_idx is List[int]\n            self._cur_im_neg_chips = []  # element: (chip, neg_box_num)\n            for scale_i in range(self.scale_num):\n                self._get_current_scale_parameters(scale_i, r)\n\n                # Cx4\n                chips = self._create_chips(r['h'], r['w'], self._cur_scale)\n\n                # dict: chipid->[box_id, ...]\n                pos_chip2boxes_idx = self._get_valid_boxes_and_pos_chips(\n                    r['gt_bbox'], chips)\n\n                # dict: chipid->neg_box_num\n                neg_chip2box_num = self._get_neg_boxes_and_chips(\n                    chips,\n                    list(pos_chip2boxes_idx.keys()), r.get('proposals', None))\n\n                self._add_to_cur_im_chips(chips, pos_chip2boxes_idx,\n                                          neg_chip2box_num)\n\n            cur_image_records = self._trans_all_chips2annotations(r)\n            self.chip_records.extend(cur_image_records)\n        return self.chip_records\n\n    def _add_to_cur_im_chips(self, chips, pos_chip2boxes_idx, neg_chip2box_num):\n        for pos_chipid, boxes_idx in pos_chip2boxes_idx.items():\n            chip = np.array(chips[pos_chipid])  # copy chips slice\n            self._cur_im_pos_chips.append((chip, boxes_idx))\n\n        if neg_chip2box_num is None:\n            return\n\n        for neg_chipid, neg_box_num in neg_chip2box_num.items():\n            chip = np.array(chips[neg_chipid])\n            self._cur_im_neg_chips.append((chip, neg_box_num))\n\n    def _trans_all_chips2annotations(self, r):\n        gt_bbox = r['gt_bbox']\n        im_file = r['im_file']\n        is_crowd = r['is_crowd']\n        gt_class = r['gt_class']\n        # gt_poly = r['gt_poly']   # [None]xN\n        # remaining keys: im_id, h, w\n        chip_records = self._trans_pos_chips2annotations(im_file, gt_bbox,\n                                  
                       is_crowd, gt_class)\n\n        if not self.use_neg_chip:\n            return chip_records\n\n        sampled_neg_chips = self._sample_neg_chips()\n        neg_chip_records = self._trans_neg_chips2annotations(im_file,\n                                                             sampled_neg_chips)\n        chip_records.extend(neg_chip_records)\n        return chip_records\n\n    def _trans_pos_chips2annotations(self, im_file, gt_bbox, is_crowd,\n                                     gt_class):\n        chip_records = []\n        for chip, boxes_idx in self._cur_im_pos_chips:\n            chip_bbox, final_boxes_idx = transform_chip_box(gt_bbox, boxes_idx,\n                                                            chip)\n            x1, y1, x2, y2 = chip\n            chip_h = y2 - y1\n            chip_w = x2 - x1\n            rec = {\n                'im_file': im_file,\n                'im_id': np.array([self._global_chip_id]),\n                'h': chip_h,\n                'w': chip_w,\n                'gt_bbox': chip_bbox,\n                'is_crowd': is_crowd[final_boxes_idx].copy(),\n                'gt_class': gt_class[final_boxes_idx].copy(),\n                # 'gt_poly': [None] * len(final_boxes_idx),\n                'chip': chip\n            }\n            self._global_chip_id += 1\n            chip_records.append(rec)\n        return chip_records\n\n    def _sample_neg_chips(self):\n        pos_num = len(self._cur_im_pos_chips)\n        neg_num = len(self._cur_im_neg_chips)\n        sample_num = min(pos_num + 2, self.max_neg_num_per_im)\n        assert sample_num >= 1\n        if neg_num <= sample_num:\n            return self._cur_im_neg_chips\n\n        candidate_num = int(sample_num * 1.5)\n        candidate_neg_chips = sorted(\n            self._cur_im_neg_chips, key=lambda x: -x[1])[:candidate_num]\n        random.shuffle(candidate_neg_chips)\n        sampled_neg_chips = candidate_neg_chips[:sample_num]\n        return sampled_neg_chips\n\n    def _trans_neg_chips2annotations(self,\n                                     im_file: str,\n                                     sampled_neg_chips: List[Tuple]):\n        chip_records = []\n        for chip, neg_box_num in sampled_neg_chips:\n            x1, y1, x2, y2 = chip\n            chip_h = y2 - y1\n            chip_w = x2 - x1\n            rec = {\n                'im_file': im_file,\n                'im_id': np.array([self._global_chip_id]),\n                'h': chip_h,\n                'w': chip_w,\n                'gt_bbox': np.zeros(\n                    (0, 4), dtype=np.float32),\n                'is_crowd': np.zeros(\n                    (0, 1), dtype=np.int32),\n                'gt_class': np.zeros(\n                    (0, 1), dtype=np.int32),\n                # 'gt_poly': [],\n                'chip': chip\n            }\n            self._global_chip_id += 1\n            chip_records.append(rec)\n        return chip_records\n\n    def _get_current_scale_parameters(self, scale_i, r):\n        im_size = max(r['h'], r['w'])\n        im_target_size = self.target_sizes[scale_i]\n        self._cur_im_size, self._cur_im_target_size = im_size, im_target_size\n        self._cur_scale = self._get_current_scale(im_target_size, im_size)\n        self._cur_valid_ratio_range = self.valid_box_ratio_ranges[scale_i]\n\n    def _get_current_scale(self, im_target_size, im_size):\n        return im_target_size / im_size\n\n    def _create_chips(self, h: int, w: int, scale: float):\n        \"\"\"\n        Generate 
chips by chip_target_size and chip_target_stride.\n        These two parameters work just like kernel_size and stride in a CNN.\n        :return: chips, Cx4, xy in raw size dimension\n        \"\"\"\n        chip_size = self.chip_target_size  # omit target for simplicity\n        stride = self.chip_target_stride\n        width = int(scale * w)\n        height = int(scale * h)\n        min_chip_location_diff = 20  # in target size\n\n        assert chip_size >= stride\n        chip_overlap = chip_size - stride\n        if (width - chip_overlap\n            ) % stride > min_chip_location_diff:  # if the part not divisible by stride is relatively large, keep it\n            w_steps = max(1, int(math.ceil((width - chip_overlap) / stride)))\n        else:  # if the part not divisible by stride is relatively small, drop it\n            w_steps = max(1, int(math.floor((width - chip_overlap) / stride)))\n        if (height - chip_overlap) % stride > min_chip_location_diff:\n            h_steps = max(1, int(math.ceil((height - chip_overlap) / stride)))\n        else:\n            h_steps = max(1, int(math.floor((height - chip_overlap) / stride)))\n\n        chips = list()\n        for j in range(h_steps):\n            for i in range(w_steps):\n                x1 = i * stride\n                y1 = j * stride\n                x2 = min(x1 + chip_size, width)\n                y2 = min(y1 + chip_size, height)\n                chips.append([x1, y1, x2, y2])\n\n        # check chip size\n        for item in chips:\n            if item[2] - item[0] > chip_size * 1.1 or item[3] - item[\n                    1] > chip_size * 1.1:\n                raise ValueError(item)\n        chips = np.array(chips, dtype=np.float32)\n\n        raw_size_chips = chips / scale\n        return raw_size_chips\n\n    def _get_valid_boxes_and_pos_chips(self, gt_bbox, chips):\n        valid_ratio_range = self._cur_valid_ratio_range\n        im_size = self._cur_im_size\n        scale = self._cur_scale\n        #   Nx4            N\n        valid_boxes, valid_boxes_idx = self._validate_boxes(\n            valid_ratio_range, im_size, gt_bbox, scale)\n        # dict: chipid->[box_id, ...]\n        pos_chip2boxes_idx = self._find_pos_chips(chips, valid_boxes,\n                                                  valid_boxes_idx)\n        return pos_chip2boxes_idx\n\n    def _validate_boxes(self,\n                        valid_ratio_range: List[float],\n                        im_size: int,\n                        gt_boxes: 'np.array of Nx4',\n                        scale: float):\n        \"\"\"\n        :return: valid_boxes: Nx4, valid_boxes_idx: N\n        \"\"\"\n        ws = (gt_boxes[:, 2] - gt_boxes[:, 0]).astype(np.int32)\n        hs = (gt_boxes[:, 3] - gt_boxes[:, 1]).astype(np.int32)\n        maxs = np.maximum(ws, hs)\n        box_ratio = maxs / im_size\n        mins = np.minimum(ws, hs)\n        target_mins = mins * scale\n\n        low = valid_ratio_range[0] if valid_ratio_range[0] > 0 else 0\n        high = valid_ratio_range[1] if valid_ratio_range[1] > 0 else np.finfo(\n            np.float32).max\n\n        valid_boxes_idx = np.nonzero((low <= box_ratio) & (box_ratio < high) & (\n            target_mins >= 2))[0]\n        valid_boxes = gt_boxes[valid_boxes_idx]\n        return valid_boxes, valid_boxes_idx\n\n    def _find_pos_chips(self,\n                        chips: 'Cx4',\n                        valid_boxes: 'Bx4',\n                        valid_boxes_idx: 'B'):\n        \"\"\"\n        :return: pos_chip2boxes_idx, dict: chipid->[box_id, ...]\n        \"\"\"\n        iob = intersection_over_box(chips, 
valid_boxes)  # overlap, CxB\n\n        iob_threshold_to_find_chips = 1.\n        pos_chip_ids, _ = self._find_chips_to_cover_overlaped_boxes(\n            iob, iob_threshold_to_find_chips)\n        pos_chip_ids = set(pos_chip_ids)\n\n        iob_threshold_to_assign_box = 0.5\n        pos_chip2boxes_idx = self._assign_boxes_to_pos_chips(\n            iob, iob_threshold_to_assign_box, pos_chip_ids, valid_boxes_idx)\n        return pos_chip2boxes_idx\n\n    def _find_chips_to_cover_overlaped_boxes(self, iob, overlap_threshold):\n        return find_chips_to_cover_overlaped_boxes(iob, overlap_threshold)\n\n    def _assign_boxes_to_pos_chips(self, iob, overlap_threshold, pos_chip_ids,\n                                   valid_boxes_idx):\n        chip_ids, box_ids = np.nonzero(iob >= overlap_threshold)\n        pos_chip2boxes_idx = defaultdict(list)\n        for chip_id, box_id in zip(chip_ids, box_ids):\n            if chip_id not in pos_chip_ids:\n                continue\n            raw_gt_box_idx = valid_boxes_idx[box_id]\n            pos_chip2boxes_idx[chip_id].append(raw_gt_box_idx)\n        return pos_chip2boxes_idx\n\n    def _get_neg_boxes_and_chips(self,\n                                 chips: 'Cx4',\n                                 pos_chip_ids: 'D',\n                                 proposals: 'Px4'):\n        \"\"\"\n        :param chips:\n        :param pos_chip_ids:\n        :param proposals:\n        :return: neg_chip2box_num, None or dict: chipid->neg_box_num\n        \"\"\"\n        if not self.use_neg_chip:\n            return None\n\n        # train proposals maybe None\n        if proposals is None or len(proposals) < 1:\n            return None\n\n        valid_ratio_range = self._cur_valid_ratio_range\n        im_size = self._cur_im_size\n        scale = self._cur_scale\n\n        valid_props, _ = self._validate_boxes(valid_ratio_range, im_size,\n                                              proposals, scale)\n        neg_boxes = self._find_neg_boxes(chips, pos_chip_ids, valid_props)\n        neg_chip2box_num = self._find_neg_chips(chips, pos_chip_ids, neg_boxes)\n        return neg_chip2box_num\n\n    def _find_neg_boxes(self,\n                        chips: 'Cx4',\n                        pos_chip_ids: 'D',\n                        valid_props: 'Px4'):\n        \"\"\"\n        :return: neg_boxes: Nx4\n        \"\"\"\n        if len(pos_chip_ids) == 0:\n            return valid_props\n\n        pos_chips = chips[pos_chip_ids]\n        iob = intersection_over_box(pos_chips, valid_props)\n        overlap_per_prop = np.max(iob, axis=0)\n        non_overlap_props_idx = overlap_per_prop < 0.5\n        neg_boxes = valid_props[non_overlap_props_idx]\n        return neg_boxes\n\n    def _find_neg_chips(self, chips: 'Cx4', pos_chip_ids: 'D',\n                        neg_boxes: 'Nx4'):\n        \"\"\"\n        :return: neg_chip2box_num, dict: chipid->neg_box_num\n        \"\"\"\n        neg_chip_ids = np.setdiff1d(np.arange(len(chips)), pos_chip_ids)\n        neg_chips = chips[neg_chip_ids]\n\n        iob = intersection_over_box(neg_chips, neg_boxes)\n        iob_threshold_to_find_chips = 0.7\n        chosen_neg_chip_ids, chip_id2overlap_box_num = \\\n            self._find_chips_to_cover_overlaped_boxes(iob, iob_threshold_to_find_chips)\n\n        neg_chipid2box_num = {}\n        for cid in chosen_neg_chip_ids:\n            box_num = chip_id2overlap_box_num[cid]\n            raw_chip_id = neg_chip_ids[cid]\n            neg_chipid2box_num[raw_chip_id] = box_num\n        return 
neg_chipid2box_num\n\n    def crop_infer_anno_records(self, records: List[dict]):\n        \"\"\"\n        transform image record to chips record\n        :param records:\n        :return: new_records, list of dict like\n        {\n            'im_file': 'fake_image1.jpg',\n            'im_id': np.array([1]),  # new _global_chip_id as im_id\n            'h': h,  # chip height\n            'w': w,  # chip width\n            'chip': [x1, y1, x2, y2]  # added\n            'ori_im_h': ori_im_h  # added, origin image height\n            'ori_im_w': ori_im_w  # added, origin image width\n            'scale_i': 0  # added,\n        }\n        \"\"\"\n        self.chip_records = []\n        self._global_chip_id = 1  # im_id start from 1\n        self._global_chip_id2img_id = {}\n\n        for r in records:\n            for scale_i in range(self.scale_num):\n                self._get_current_scale_parameters(scale_i, r)\n                # Cx4\n                chips = self._create_chips(r['h'], r['w'], self._cur_scale)\n                cur_img_chip_record = self._get_chips_records(r, chips, scale_i)\n                self.chip_records.extend(cur_img_chip_record)\n\n        return self.chip_records\n\n    def _get_chips_records(self, rec, chips, scale_i):\n        cur_img_chip_records = []\n        ori_im_h = rec[\"h\"]\n        ori_im_w = rec[\"w\"]\n        im_file = rec[\"im_file\"]\n        ori_im_id = rec[\"im_id\"]\n        for id, chip in enumerate(chips):\n            chip_rec = {}\n            x1, y1, x2, y2 = chip\n            chip_h = y2 - y1\n            chip_w = x2 - x1\n            chip_rec[\"im_file\"] = im_file\n            chip_rec[\"im_id\"] = self._global_chip_id\n            chip_rec[\"h\"] = chip_h\n            chip_rec[\"w\"] = chip_w\n            chip_rec[\"chip\"] = chip\n            chip_rec[\"ori_im_h\"] = ori_im_h\n            chip_rec[\"ori_im_w\"] = ori_im_w\n            chip_rec[\"scale_i\"] = scale_i\n\n            self._global_chip_id2img_id[self._global_chip_id] = int(ori_im_id)\n            self._global_chip_id += 1\n            cur_img_chip_records.append(chip_rec)\n\n        return cur_img_chip_records\n\n    def aggregate_chips_detections(self, results, records=None):\n        \"\"\"\n        # 1. transform chip dets to image dets\n        # 2. nms boxes per image;\n        # 3. format output results\n        :param results:\n        :param roidb:\n        :return:\n        \"\"\"\n        results = deepcopy(results)\n        records = records if records else self.chip_records\n        img_id2bbox = self._transform_chip2image_bboxes(results, records)\n        nms_img_id2bbox = self._nms_dets(img_id2bbox)\n        aggregate_results = self._reformat_results(nms_img_id2bbox)\n        return aggregate_results\n\n    def _transform_chip2image_bboxes(self, results, records):\n        # 1. Transform chip dets to image dets;\n        # 2. Filter valid range;\n        # 3. 
Reformat and Aggregate chip dets to Get scale_cls_dets\n        img_id2bbox = defaultdict(list)\n        for result in results:\n            bbox_locs = result['bbox']\n            bbox_nums = result['bbox_num']\n            if len(bbox_locs) == 1 and bbox_locs[0][\n                    0] == -1:  # current batch has no detections\n                # bbox_locs = array([[-1.]], dtype=float32); bbox_nums = [[1]]\n                # MultiClassNMS output: If there is no detected boxes for all images, lod will be set to {1} and Out only contains one value which is -1.\n                continue\n            im_ids = result['im_id']  # replace with range(len(bbox_nums))\n\n            last_bbox_num = 0\n            for idx, im_id in enumerate(im_ids):\n\n                cur_bbox_len = bbox_nums[idx]\n                bboxes = bbox_locs[last_bbox_num:last_bbox_num + cur_bbox_len]\n                last_bbox_num += cur_bbox_len\n                # box: [num_id, score, xmin, ymin, xmax, ymax]\n                if len(bboxes) == 0:  # current image has no detections\n                    continue\n\n                chip_rec = records[int(im_id) -\n                                   1]  # im_id starts from 1, type is np.int64\n                image_size = max(chip_rec[\"ori_im_h\"], chip_rec[\"ori_im_w\"])\n\n                bboxes = transform_chip_boxes2image_boxes(\n                    bboxes, chip_rec[\"chip\"], chip_rec[\"ori_im_h\"],\n                    chip_rec[\"ori_im_w\"])\n\n                scale_i = chip_rec[\"scale_i\"]\n                cur_scale = self._get_current_scale(self.target_sizes[scale_i],\n                                                    image_size)\n                _, valid_boxes_idx = self._validate_boxes(\n                    self.valid_box_ratio_ranges[scale_i], image_size,\n                    bboxes[:, 2:], cur_scale)\n                ori_img_id = self._global_chip_id2img_id[int(im_id)]\n\n                img_id2bbox[ori_img_id].append(bboxes[valid_boxes_idx])\n\n        return img_id2bbox\n\n    def _nms_dets(self, img_id2bbox):\n        # 1. NMS on each image-class\n        # 2. Limit number of detections to MAX_PER_IMAGE if requested\n        max_per_img = self.max_per_img\n        nms_thresh = self.nms_thresh\n\n        for img_id in img_id2bbox:\n            box = img_id2bbox[\n                img_id]  # list of np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2]\n            box = np.concatenate(box, axis=0)\n            nms_dets = nms(box, nms_thresh)\n            if max_per_img > 0:\n                if len(nms_dets) > max_per_img:\n                    keep = np.argsort(-nms_dets[:, 1])[:max_per_img]\n                    nms_dets = nms_dets[keep]\n\n            img_id2bbox[img_id] = nms_dets\n\n        return img_id2bbox\n\n    def _reformat_results(self, img_id2bbox):\n        \"\"\"reformat results\"\"\"\n        im_ids = img_id2bbox.keys()\n        results = []\n        for img_id in im_ids:  # output by original im_id order\n            if len(img_id2bbox[img_id]) == 0:\n                bbox = np.array(\n                    [[-1., 0., 0., 0., 0., 0.]])  # edge case: no detections\n                bbox_num = np.array([0])\n            else:\n                # np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2]\n                bbox = img_id2bbox[img_id]\n                bbox_num = np.array([len(bbox)])\n            res = dict(im_id=np.array([[img_id]]), bbox=bbox, bbox_num=bbox_num)\n            results.append(res)\n        return results\n"
  },
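  {
    "path": "examples/anno_cropper_usage_sketch.py",
    "content": "# NOTE: hypothetical end-to-end sketch of AnnoCropper on a fabricated record;\n# illustrative only and not shipped with the package. The constructor values\n# are scaled-down versions of the ones quoted in its docstring.\nimport numpy as np\n\nfrom ppdet.data.crop_utils.annotation_cropper import AnnoCropper\n\ncropper = AnnoCropper(\n    image_target_sizes=[800, 400],\n    valid_box_ratio_ranges=[[-1, 0.1], [0.08, -1]],\n    chip_target_size=300,\n    chip_target_stride=200)\n\n# a minimal coco-style record with a single ground-truth box\nrecord = {\n    'im_file': 'fake_image1.jpg',\n    'im_id': np.array([1]),\n    'h': 600,\n    'w': 800,\n    'is_crowd': np.zeros((1, 1), dtype=np.int32),\n    'gt_class': np.zeros((1, 1), dtype=np.int32),\n    'gt_bbox': np.array([[100., 100., 140., 140.]], dtype=np.float32),\n    'gt_poly': [None],\n}\n\n# each returned record describes one chip, with gt boxes remapped into it\nchip_records = cropper.crop_anno_records([record])\nfor rec in chip_records:\n    print(rec['im_id'], rec['chip'], rec['gt_bbox'])\n"
  },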
  {
    "path": "ppdet/data/crop_utils/chip_box_utils.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport numpy as np\n\n\ndef bbox_area(boxes):\n    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])\n\n\ndef intersection_over_box(chips, boxes):\n    \"\"\"\n    intersection area over box area\n    :param chips:  C\n    :param boxes:  B\n    :return: iob, CxB\n    \"\"\"\n    M = chips.shape[0]\n    N = boxes.shape[0]\n    if M * N == 0:\n        return np.zeros([M, N], dtype='float32')\n\n    box_area = bbox_area(boxes)  # B\n\n    inter_x2y2 = np.minimum(np.expand_dims(chips, 1)[:, :, 2:],\n                            boxes[:, 2:])  # CxBX2\n    inter_x1y1 = np.maximum(np.expand_dims(chips, 1)[:, :, :2],\n                            boxes[:, :2])  # CxBx2\n    inter_wh = inter_x2y2 - inter_x1y1\n    inter_wh = np.clip(inter_wh, a_min=0, a_max=None)\n    inter_area = inter_wh[:, :, 0] * inter_wh[:, :, 1]  # CxB\n\n    iob = inter_area / np.expand_dims(box_area, 0)\n    return iob\n\n\ndef clip_boxes(boxes, im_shape):\n    \"\"\"\n    Clip boxes to image boundaries.\n    :param boxes: [N, 4]\n    :param im_shape: tuple of 2, [h, w]\n    :return: [N, 4]\n    \"\"\"\n    # x1 >= 0\n    boxes[:, 0] = np.clip(boxes[:, 0], 0, im_shape[1] - 1)\n    # y1 >= 0\n    boxes[:, 1] = np.clip(boxes[:, 1], 0, im_shape[0] - 1)\n    # x2 < im_shape[1]\n    boxes[:, 2] = np.clip(boxes[:, 2], 1, im_shape[1])\n    # y2 < im_shape[0]\n    boxes[:, 3] = np.clip(boxes[:, 3], 1, im_shape[0])\n    return boxes\n\n\ndef transform_chip_box(gt_bbox: 'Gx4', boxes_idx: 'B', chip: '4'):\n    boxes_idx = np.array(boxes_idx)\n    cur_gt_bbox = gt_bbox[boxes_idx].copy()  # Bx4\n    x1, y1, x2, y2 = chip\n    cur_gt_bbox[:, 0] -= x1\n    cur_gt_bbox[:, 1] -= y1\n    cur_gt_bbox[:, 2] -= x1\n    cur_gt_bbox[:, 3] -= y1\n    h = y2 - y1\n    w = x2 - x1\n    cur_gt_bbox = clip_boxes(cur_gt_bbox, (h, w))\n    ws = (cur_gt_bbox[:, 2] - cur_gt_bbox[:, 0]).astype(np.int32)\n    hs = (cur_gt_bbox[:, 3] - cur_gt_bbox[:, 1]).astype(np.int32)\n    valid_idx = (ws >= 2) & (hs >= 2)\n    return cur_gt_bbox[valid_idx], boxes_idx[valid_idx]\n\n\ndef find_chips_to_cover_overlaped_boxes(iob, overlap_threshold):\n    chip_ids, box_ids = np.nonzero(iob >= overlap_threshold)\n    chip_id2overlap_box_num = np.bincount(chip_ids)  # 1d array\n    chip_id2overlap_box_num = np.pad(\n        chip_id2overlap_box_num, (0, len(iob) - len(chip_id2overlap_box_num)),\n        constant_values=0)\n\n    chosen_chip_ids = []\n    while len(box_ids) > 0:\n        value_counts = np.bincount(chip_ids)  # 1d array\n        max_count_chip_id = np.argmax(value_counts)\n        assert max_count_chip_id not in chosen_chip_ids\n        chosen_chip_ids.append(max_count_chip_id)\n\n        box_ids_in_cur_chip = box_ids[chip_ids == max_count_chip_id]\n        ids_not_in_cur_boxes_mask = np.logical_not(\n            np.isin(box_ids, box_ids_in_cur_chip))\n        chip_ids = chip_ids[ids_not_in_cur_boxes_mask]\n        
box_ids = box_ids[ids_not_in_cur_boxes_mask]\n    return chosen_chip_ids, chip_id2overlap_box_num\n\n\ndef transform_chip_boxes2image_boxes(chip_boxes, chip, img_h, img_w):\n    chip_boxes = np.array(sorted(chip_boxes, key=lambda item: -item[1]))\n    xmin, ymin, _, _ = chip\n    # Transform to origin image loc\n    chip_boxes[:, 2] += xmin\n    chip_boxes[:, 4] += xmin\n    chip_boxes[:, 3] += ymin\n    chip_boxes[:, 5] += ymin\n    chip_boxes = clip_boxes(chip_boxes, (img_h, img_w))\n    return chip_boxes\n\n\ndef nms(dets, thresh):\n    \"\"\"Apply classic DPM-style greedy NMS.\"\"\"\n    if dets.shape[0] == 0:\n        return dets[[], :]\n    scores = dets[:, 1]\n    x1 = dets[:, 2]\n    y1 = dets[:, 3]\n    x2 = dets[:, 4]\n    y2 = dets[:, 5]\n\n    areas = (x2 - x1 + 1) * (y2 - y1 + 1)\n    order = scores.argsort()[::-1]\n\n    ndets = dets.shape[0]\n    suppressed = np.zeros((ndets), dtype=np.int32)\n\n    # nominal indices\n    # _i, _j\n    # sorted indices\n    # i, j\n    # temp variables for box i's (the box currently under consideration)\n    # ix1, iy1, ix2, iy2, iarea\n\n    # variables for computing overlap with box j (lower scoring box)\n    # xx1, yy1, xx2, yy2\n    # w, h\n    # inter, ovr\n\n    for _i in range(ndets):\n        i = order[_i]\n        if suppressed[i] == 1:\n            continue\n        ix1 = x1[i]\n        iy1 = y1[i]\n        ix2 = x2[i]\n        iy2 = y2[i]\n        iarea = areas[i]\n        for _j in range(_i + 1, ndets):\n            j = order[_j]\n            if suppressed[j] == 1:\n                continue\n            xx1 = max(ix1, x1[j])\n            yy1 = max(iy1, y1[j])\n            xx2 = min(ix2, x2[j])\n            yy2 = min(iy2, y2[j])\n            w = max(0.0, xx2 - xx1 + 1)\n            h = max(0.0, yy2 - yy1 + 1)\n            inter = w * h\n            ovr = inter / (iarea + areas[j] - inter)\n            if ovr >= thresh:\n                suppressed[j] = 1\n    keep = np.where(suppressed == 0)[0]\n    dets = dets[keep, :]\n    return dets\n"
  },
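  {
    "path": "examples/chip_box_utils_usage_sketch.py",
    "content": "# NOTE: hypothetical numeric sketch of the box utilities; illustrative only\n# and not shipped with the package. Values are chosen by hand.\nimport numpy as np\n\nfrom ppdet.data.crop_utils.chip_box_utils import intersection_over_box, nms\n\nchips = np.array([[0., 0., 100., 100.], [50., 0., 150., 100.]])\nboxes = np.array([[10., 10., 30., 30.], [90., 40., 120., 60.]])\n# iob is CxB: intersection area divided by box area; box 0 lies fully\n# inside chip 0, so iob[0, 0] == 1.0\niob = intersection_over_box(chips, boxes)\nprint(iob)\n\n# det rows are [label, score, x1, y1, x2, y2]; the second det overlaps the\n# first heavily and is suppressed at IoU threshold 0.5\ndets = np.array([\n    [0., 0.9, 10., 10., 50., 50.],\n    [0., 0.8, 12., 12., 52., 52.],\n    [0., 0.7, 200., 200., 240., 240.],\n])\nkept = nms(dets, thresh=0.5)\nprint(kept[:, 1])  # scores of the surviving detections: [0.9, 0.7]\n"
  },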
  {
    "path": "ppdet/data/culane_utils.py",
    "content": "import math\nimport numpy as np\nfrom imgaug.augmentables.lines import LineString\nfrom scipy.interpolate import InterpolatedUnivariateSpline\n\n\ndef lane_to_linestrings(lanes):\n    lines = []\n    for lane in lanes:\n        lines.append(LineString(lane))\n\n    return lines\n\n\ndef linestrings_to_lanes(lines):\n    lanes = []\n    for line in lines:\n        lanes.append(line.coords)\n\n    return lanes\n\n\ndef sample_lane(points, sample_ys, img_w):\n    # this function expects the points to be sorted\n    points = np.array(points)\n    if not np.all(points[1:, 1] < points[:-1, 1]):\n        raise Exception('Annotaion points have to be sorted')\n    x, y = points[:, 0], points[:, 1]\n\n    # interpolate points inside domain\n    assert len(points) > 1\n    interp = InterpolatedUnivariateSpline(\n        y[::-1], x[::-1], k=min(3, len(points) - 1))\n    domain_min_y = y.min()\n    domain_max_y = y.max()\n    sample_ys_inside_domain = sample_ys[(sample_ys >= domain_min_y) & (\n        sample_ys <= domain_max_y)]\n    assert len(sample_ys_inside_domain) > 0\n    interp_xs = interp(sample_ys_inside_domain)\n\n    # extrapolate lane to the bottom of the image with a straight line using the 2 points closest to the bottom\n    two_closest_points = points[:2]\n    extrap = np.polyfit(\n        two_closest_points[:, 1], two_closest_points[:, 0], deg=1)\n    extrap_ys = sample_ys[sample_ys > domain_max_y]\n    extrap_xs = np.polyval(extrap, extrap_ys)\n    all_xs = np.hstack((extrap_xs, interp_xs))\n\n    # separate between inside and outside points\n    inside_mask = (all_xs >= 0) & (all_xs < img_w)\n    xs_inside_image = all_xs[inside_mask]\n    xs_outside_image = all_xs[~inside_mask]\n\n    return xs_outside_image, xs_inside_image\n\n\ndef filter_lane(lane):\n    assert lane[-1][1] <= lane[0][1]\n    filtered_lane = []\n    used = set()\n    for p in lane:\n        if p[1] not in used:\n            filtered_lane.append(p)\n            used.add(p[1])\n\n    return filtered_lane\n\n\ndef transform_annotation(img_w, img_h, max_lanes, n_offsets, offsets_ys,\n                         n_strips, strip_size, anno):\n    old_lanes = anno['lanes']\n\n    # removing lanes with less than 2 points\n    old_lanes = filter(lambda x: len(x) > 1, old_lanes)\n    # sort lane points by Y (bottom to top of the image)\n    old_lanes = [sorted(lane, key=lambda x: -x[1]) for lane in old_lanes]\n    # remove points with same Y (keep first occurrence)\n    old_lanes = [filter_lane(lane) for lane in old_lanes]\n    # normalize the annotation coordinates\n    old_lanes = [[[x * img_w / float(img_w), y * img_h / float(img_h)]\n                  for x, y in lane] for lane in old_lanes]\n    # create tranformed annotations\n    lanes = np.ones(\n        (max_lanes, 2 + 1 + 1 + 2 + n_offsets), dtype=np.float32\n    ) * -1e5  # 2 scores, 1 start_y, 1 start_x, 1 theta, 1 length, S+1 coordinates\n    lanes_endpoints = np.ones((max_lanes, 2))\n    # lanes are invalid by default\n    lanes[:, 0] = 1\n    lanes[:, 1] = 0\n    for lane_idx, lane in enumerate(old_lanes):\n        if lane_idx >= max_lanes:\n            break\n\n        try:\n            xs_outside_image, xs_inside_image = sample_lane(lane, offsets_ys,\n                                                            img_w)\n        except AssertionError:\n            continue\n        if len(xs_inside_image) <= 1:\n            continue\n        all_xs = np.hstack((xs_outside_image, xs_inside_image))\n        lanes[lane_idx, 0] = 0\n        
lanes[lane_idx, 1] = 1\n        lanes[lane_idx, 2] = len(xs_outside_image) / n_strips\n        lanes[lane_idx, 3] = xs_inside_image[0]\n\n        thetas = []\n        for i in range(1, len(xs_inside_image)):\n            theta = math.atan(\n                i * strip_size /\n                (xs_inside_image[i] - xs_inside_image[0] + 1e-5)) / math.pi\n            theta = theta if theta > 0 else 1 - abs(theta)\n            thetas.append(theta)\n\n        theta_far = sum(thetas) / len(thetas)\n\n        # lanes[lane_idx,\n        #       4] = (theta_closest + theta_far) / 2  # averaged angle\n        lanes[lane_idx, 4] = theta_far\n        lanes[lane_idx, 5] = len(xs_inside_image)\n        lanes[lane_idx, 6:6 + len(all_xs)] = all_xs\n        lanes_endpoints[lane_idx, 0] = (len(all_xs) - 1) / n_strips\n        lanes_endpoints[lane_idx, 1] = xs_inside_image[-1]\n\n    new_anno = {\n        'label': lanes,\n        'old_anno': anno,\n        'lane_endpoints': lanes_endpoints\n    }\n    return new_anno\n"
  },
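  {
    "path": "examples/culane_utils_usage_sketch.py",
    "content": "# NOTE: hypothetical sketch of sample_lane on a tiny hand-made lane;\n# illustrative only and not shipped with the package. It assumes scipy and\n# imgaug are installed, since culane_utils imports them at module level.\nimport numpy as np\n\nfrom ppdet.data.culane_utils import sample_lane\n\n# lane points (x, y) sorted bottom-to-top (strictly decreasing y), as the\n# function asserts\npoints = [(100., 90.), (104., 60.), (108., 30.)]\nsample_ys = np.arange(99, 9, -10)  # 99, 89, ..., 19\nimg_w = 200\n\nxs_outside_image, xs_inside_image = sample_lane(points, sample_ys, img_w)\n# xs_inside_image holds the sampled xs that fall inside [0, img_w); below\n# the lowest point the lane is extrapolated with a straight-line fit\nprint(xs_inside_image)\nprint(xs_outside_image)  # xs falling outside the image; empty here\n"
  },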
  {
    "path": "ppdet/data/reader.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport copy\nimport os\nimport traceback\nimport six\nimport sys\nif sys.version_info >= (3, 0):\n    pass\nelse:\n    pass\nimport numpy as np\nimport paddle\nimport paddle.nn.functional as F\n\nfrom copy import deepcopy\n\nfrom paddle.io import DataLoader, DistributedBatchSampler\nfrom .utils import default_collate_fn\n\nfrom ppdet.core.workspace import register\nfrom . import transform\nfrom .shm_utils import _get_shared_memory_size_in_M\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger('reader')\n\nMAIN_PID = os.getpid()\n\n\nclass Compose(object):\n    def __init__(self, transforms, num_classes=80):\n        self.transforms = transforms\n        self.transforms_cls = []\n        for t in self.transforms:\n            for k, v in t.items():\n                op_cls = getattr(transform, k)\n                f = op_cls(**v)\n                if hasattr(f, 'num_classes'):\n                    f.num_classes = num_classes\n\n                self.transforms_cls.append(f)\n\n    def _update_transforms_cls(self, data):\n        if 'transform_schedulers' in data:\n            def is_valid(op):\n                op_name = op.__class__.__name__\n                for t in data['transform_schedulers']:\n                    for k, v in t.items():\n                        if op_name == k:\n                            # [start_epoch, stop_epoch)\n                            start_epoch = v.get('start_epoch', 0)\n                            if start_epoch > data['curr_epoch']:\n                                return False\n                            stop_epoch = v.get('stop_epoch', float('inf'))\n                            if stop_epoch <= data['curr_epoch']:\n                                return False\n                return True\n\n            return filter(is_valid, self.transforms_cls)\n        else:\n            return self.transforms_cls\n\n    def __call__(self, data):\n        transforms_cls = self._update_transforms_cls(data)\n        for f in transforms_cls:\n            try:\n                data = f(data)\n            except Exception as e:\n                stack_info = traceback.format_exc()\n                logger.warning(\"fail to map sample transform [{}] \"\n                               \"with error: {} and stack:\\n{}\".format(\n                                   f, e, str(stack_info)))\n                raise e\n\n        return data\n\n\nclass BatchCompose(Compose):\n    def __init__(self, transforms, num_classes=80, collate_batch=True):\n        super(BatchCompose, self).__init__(transforms, num_classes)\n        self.collate_batch = collate_batch\n\n    def __call__(self, data):\n        transforms_cls = self._update_transforms_cls(data[0])\n        for f in transforms_cls:\n            try:\n                data = f(data)\n            except Exception as e:\n                stack_info = traceback.format_exc()\n                
logger.warning(\"fail to map batch transform [{}] \"\n                               \"with error: {} and stack:\\n{}\".format(\n                                   f, e, str(stack_info)))\n                raise e\n\n        # remove keys which is not needed by model\n        extra_key = ['h', 'w', 'flipped', 'transform_schedulers']\n        for k in extra_key:\n            for sample in data:\n                if k in sample:\n                    sample.pop(k)\n\n        # batch data, if user-define batch function needed\n        # use user-defined here\n        if self.collate_batch:\n            batch_data = default_collate_fn(data)\n        else:\n            batch_data = {}\n            for k in data[0].keys():\n                tmp_data = []\n                for i in range(len(data)):\n                    tmp_data.append(data[i][k])\n                if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k:\n                    tmp_data = np.stack(tmp_data, axis=0)\n                if 'origin_' in k:\n                    tmp_data = np.stack(tmp_data, axis=0)\n                batch_data[k] = tmp_data\n        return batch_data\n\n\nclass BaseDataLoader(object):\n    \"\"\"\n    Base DataLoader implementation for detection models\n\n    Args:\n        sample_transforms (list): a list of transforms to perform\n                                  on each sample\n        batch_transforms (list): a list of transforms to perform\n                                 on batch\n        batch_size (int): batch size for batch collating, default 1.\n        shuffle (bool): whether to shuffle samples\n        drop_last (bool): whether to drop the last incomplete,\n                          default False\n        num_classes (int): class number of dataset, default 80\n        collate_batch (bool): whether to collate batch in dataloader.\n            If set to True, the samples will collate into batch according\n            to the batch size. 
            which is used when the number of ground-truths differs across\n            samples.\n        use_shared_memory (bool): whether to use shared memory to\n                accelerate data loading, enable this only if you\n                are sure that the shared memory size of your OS\n                is larger than the memory cost of the model's input data.\n                Note that shared memory will be automatically\n                disabled if the shared memory of OS is less than\n                1G, which is not enough for detection models.\n                Default False.\n    \"\"\"\n\n    def __init__(self,\n                 sample_transforms=[],\n                 batch_transforms=[],\n                 batch_size=1,\n                 shuffle=False,\n                 drop_last=False,\n                 num_classes=80,\n                 collate_batch=True,\n                 use_shared_memory=False,\n                 **kwargs):\n        # sample transforms\n        self._sample_transforms = Compose(\n            sample_transforms, num_classes=num_classes)\n\n        # batch transforms\n        self._batch_transforms = BatchCompose(batch_transforms, num_classes,\n                                              collate_batch)\n        self.batch_size = batch_size\n        self.shuffle = shuffle\n        self.drop_last = drop_last\n        self.use_shared_memory = use_shared_memory\n        self.kwargs = kwargs\n\n    def __call__(self,\n                 dataset,\n                 worker_num,\n                 batch_sampler=None,\n                 return_list=False):\n        self.dataset = dataset\n        self.dataset.check_or_download_dataset()\n        self.dataset.parse_dataset()\n        # set sample transforms\n        self.dataset.set_transform(self._sample_transforms)\n        # set kwargs\n        self.dataset.set_kwargs(**self.kwargs)\n        # batch sampler\n        if batch_sampler is None:\n            self._batch_sampler = DistributedBatchSampler(\n                self.dataset,\n                batch_size=self.batch_size,\n                shuffle=self.shuffle,\n                drop_last=self.drop_last)\n        else:\n            self._batch_sampler = batch_sampler\n\n        # DataLoader does not start sub-processes on Windows and macOS,\n        # so shared memory is not needed there\n        use_shared_memory = self.use_shared_memory and \\\n                            sys.platform not in ['win32', 'darwin']\n        # check whether shared memory size is larger than 1G (1024M)\n        if use_shared_memory:\n            shm_size = _get_shared_memory_size_in_M()\n            if shm_size is not None and shm_size < 1024.:\n                logger.warning(\"Shared memory size is less than 1G, \"\n                               \"disable shared_memory in DataLoader\")\n                use_shared_memory = False\n\n        self.dataloader = DataLoader(\n            dataset=self.dataset,\n            batch_sampler=self._batch_sampler,\n            collate_fn=self._batch_transforms,\n            num_workers=worker_num,\n            return_list=return_list,\n            use_shared_memory=use_shared_memory)\n        self.loader = iter(self.dataloader)\n\n        return self\n\n    def __len__(self):\n        return len(self._batch_sampler)\n\n    def __iter__(self):\n        return self\n\n    def __next__(self):\n        try:\n            return next(self.loader)\n        except StopIteration:\n            self.loader = iter(self.dataloader)\n            six.reraise(*sys.exc_info())\n\n    def next(self):\n        # python2 compatibility\n        return self.__next__()\n\n\n@register\nclass TrainReader(BaseDataLoader):\n    __shared__ = ['num_classes']\n\n    def __init__(self,\n                 sample_transforms=[],\n                 batch_transforms=[],\n                 batch_size=1,\n                 shuffle=True,\n                 drop_last=True,\n                 num_classes=80,\n                 collate_batch=True,\n                 **kwargs):\n        super(TrainReader, self).__init__(sample_transforms, batch_transforms,\n                                          batch_size, shuffle, drop_last,\n                                          num_classes, collate_batch, **kwargs)\n\n\n@register\nclass EvalReader(BaseDataLoader):\n    __shared__ = ['num_classes']\n\n    def __init__(self,\n                 sample_transforms=[],\n                 batch_transforms=[],\n                 batch_size=1,\n                 shuffle=False,\n                 drop_last=False,\n                 num_classes=80,\n                 **kwargs):\n        super(EvalReader, self).__init__(sample_transforms, batch_transforms,\n                                         batch_size, shuffle, drop_last,\n                                         num_classes, **kwargs)\n\n\n@register\nclass TestReader(BaseDataLoader):\n    __shared__ = ['num_classes']\n\n    def __init__(self,\n                 sample_transforms=[],\n                 batch_transforms=[],\n                 batch_size=1,\n                 shuffle=False,\n                 drop_last=False,\n                 num_classes=80,\n                 **kwargs):\n        super(TestReader, self).__init__(sample_transforms, batch_transforms,\n                                         batch_size, shuffle, drop_last,\n                                         num_classes, **kwargs)\n\n\n@register\nclass EvalMOTReader(BaseDataLoader):\n    __shared__ = ['num_classes']\n\n    def __init__(self,\n                 sample_transforms=[],\n                 batch_transforms=[],\n                 batch_size=1,\n                 shuffle=False,\n                 drop_last=False,\n                 num_classes=1,\n                 **kwargs):\n        super(EvalMOTReader, self).__init__(sample_transforms, batch_transforms,\n                                            batch_size, shuffle, drop_last,\n                                            num_classes, **kwargs)\n\n\n@register\nclass TestMOTReader(BaseDataLoader):\n    __shared__ = ['num_classes']\n\n    def __init__(self,\n                 sample_transforms=[],\n                 batch_transforms=[],\n                 batch_size=1,\n                 shuffle=False,\n                 drop_last=False,\n                 num_classes=1,\n                 **kwargs):\n        super(TestMOTReader, self).__init__(sample_transforms, batch_transforms,\n                                            batch_size, shuffle, drop_last,\n                                            num_classes, **kwargs)\n\n\n# For Semi-Supervised Object Detection (SSOD)\nclass Compose_SSOD(object):\n    def __init__(self, base_transforms, weak_aug, strong_aug, num_classes=80):\n        self.base_transforms = base_transforms\n        self.base_transforms_cls = []\n        for t in self.base_transforms:\n            for k, v in t.items():\n                op_cls = getattr(transform, k)\n                f = op_cls(**v)\n                if hasattr(f, 'num_classes'):\n                    f.num_classes = num_classes\n
                self.base_transforms_cls.append(f)\n\n        self.weak_augs = weak_aug\n        self.weak_augs_cls = []\n        for t in self.weak_augs:\n            for k, v in t.items():\n                op_cls = getattr(transform, k)\n                f = op_cls(**v)\n                if hasattr(f, 'num_classes'):\n                    f.num_classes = num_classes\n                self.weak_augs_cls.append(f)\n\n        self.strong_augs = strong_aug\n        self.strong_augs_cls = []\n        for t in self.strong_augs:\n            for k, v in t.items():\n                op_cls = getattr(transform, k)\n                f = op_cls(**v)\n                if hasattr(f, 'num_classes'):\n                    f.num_classes = num_classes\n                self.strong_augs_cls.append(f)\n\n    def __call__(self, data):\n        for f in self.base_transforms_cls:\n            try:\n                data = f(data)\n            except Exception as e:\n                stack_info = traceback.format_exc()\n                logger.warning(\"failed to map sample transform [{}] \"\n                               \"with error: {} and stack:\\n{}\".format(\n                                   f, e, str(stack_info)))\n                raise e\n\n        weak_data = deepcopy(data)\n        strong_data = deepcopy(data)\n        for f in self.weak_augs_cls:\n            try:\n                weak_data = f(weak_data)\n            except Exception as e:\n                stack_info = traceback.format_exc()\n                logger.warning(\"failed to map weak aug [{}] \"\n                               \"with error: {} and stack:\\n{}\".format(\n                                   f, e, str(stack_info)))\n                raise e\n\n        for f in self.strong_augs_cls:\n            try:\n                strong_data = f(strong_data)\n            except Exception as e:\n                stack_info = traceback.format_exc()\n                logger.warning(\"failed to map strong aug [{}] \"\n                               \"with error: {} and stack:\\n{}\".format(\n                                   f, e, str(stack_info)))\n                raise e\n\n        weak_data['strong_aug'] = strong_data\n        return weak_data\n\n\nclass BatchCompose_SSOD(Compose):\n    def __init__(self, transforms, num_classes=80, collate_batch=True):\n        super(BatchCompose_SSOD, self).__init__(transforms, num_classes)\n        self.collate_batch = collate_batch\n\n    def __call__(self, data):\n        # split strong_data from data (weak_data)\n        strong_data = []\n        for sample in data:\n            strong_data.append(sample['strong_aug'])\n            sample.pop('strong_aug')\n\n        for f in self.transforms_cls:\n            try:\n                data = f(data)\n                if 'BatchRandomResizeForSSOD' in f._id:\n                    strong_data = f(strong_data, data[1])[0]\n                    data = data[0]\n                else:\n                    strong_data = f(strong_data)\n            except Exception as e:\n                stack_info = traceback.format_exc()\n                logger.warning(\"failed to map batch transform [{}] \"\n                               \"with error: {} and stack:\\n{}\".format(\n                                   f, e, str(stack_info)))\n                raise e\n\n        # remove keys which are not needed by the model\n        extra_key = ['h', 'w', 'flipped']\n        for k in extra_key:\n            for sample in data:\n                if k in sample:\n                    sample.pop(k)\n            for sample in strong_data:\n
                if k in sample:\n                    sample.pop(k)\n\n        # batch data; if a user-defined batch function is needed,\n        # use the user-defined one here\n        if self.collate_batch:\n            batch_data = default_collate_fn(data)\n            strong_batch_data = default_collate_fn(strong_data)\n            return batch_data, strong_batch_data\n        else:\n            batch_data = {}\n            for k in data[0].keys():\n                tmp_data = []\n                for i in range(len(data)):\n                    tmp_data.append(data[i][k])\n                if 'gt_' not in k and 'is_crowd' not in k and 'difficult' not in k:\n                    tmp_data = np.stack(tmp_data, axis=0)\n                batch_data[k] = tmp_data\n\n            strong_batch_data = {}\n            for k in strong_data[0].keys():\n                tmp_data = []\n                for i in range(len(strong_data)):\n                    tmp_data.append(strong_data[i][k])\n                if 'gt_' not in k and 'is_crowd' not in k and 'difficult' not in k:\n                    tmp_data = np.stack(tmp_data, axis=0)\n                strong_batch_data[k] = tmp_data\n\n        return batch_data, strong_batch_data\n\n\nclass CombineSSODLoader(object):\n    def __init__(self, label_loader, unlabel_loader):\n        self.label_loader = label_loader\n        self.unlabel_loader = unlabel_loader\n\n    def __iter__(self):\n        while True:\n            try:\n                label_samples = next(self.label_loader_iter)\n            except (AttributeError, StopIteration):\n                self.label_loader_iter = iter(self.label_loader)\n                label_samples = next(self.label_loader_iter)\n\n            try:\n                unlabel_samples = next(self.unlabel_loader_iter)\n            except (AttributeError, StopIteration):\n                self.unlabel_loader_iter = iter(self.unlabel_loader)\n                unlabel_samples = next(self.unlabel_loader_iter)\n\n            yield (\n                label_samples[0],  # sup weak\n                label_samples[1],  # sup strong\n                unlabel_samples[0],  # unsup weak\n                unlabel_samples[1]  # unsup strong\n            )\n\n    def __call__(self):\n        return self.__iter__()\n\n\nclass BaseSemiDataLoader(object):\n    def __init__(self,\n                 sample_transforms=[],\n                 weak_aug=[],\n                 strong_aug=[],\n                 sup_batch_transforms=[],\n                 unsup_batch_transforms=[],\n                 sup_batch_size=1,\n                 unsup_batch_size=1,\n                 shuffle=True,\n                 drop_last=True,\n                 num_classes=80,\n                 collate_batch=True,\n                 use_shared_memory=False,\n                 **kwargs):\n        # sup transforms\n        self._sample_transforms_label = Compose_SSOD(\n            sample_transforms, weak_aug, strong_aug, num_classes=num_classes)\n        self._batch_transforms_label = BatchCompose_SSOD(\n            sup_batch_transforms, num_classes, collate_batch)\n        self.batch_size_label = sup_batch_size\n\n        # unsup transforms\n        self._sample_transforms_unlabel = Compose_SSOD(\n            sample_transforms, weak_aug, strong_aug, num_classes=num_classes)\n        self._batch_transforms_unlabel = BatchCompose_SSOD(\n            unsup_batch_transforms, num_classes, collate_batch)\n        self.batch_size_unlabel = unsup_batch_size\n\n        # common\n        self.shuffle = shuffle\n        self.drop_last = drop_last\n
        self.use_shared_memory = use_shared_memory\n        self.kwargs = kwargs\n\n    def __call__(self,\n                 dataset_label,\n                 dataset_unlabel,\n                 worker_num,\n                 batch_sampler_label=None,\n                 batch_sampler_unlabel=None,\n                 return_list=False):\n        # sup dataset\n        self.dataset_label = dataset_label\n        self.dataset_label.check_or_download_dataset()\n        self.dataset_label.parse_dataset()\n        self.dataset_label.set_transform(self._sample_transforms_label)\n        self.dataset_label.set_kwargs(**self.kwargs)\n        if batch_sampler_label is None:\n            self._batch_sampler_label = DistributedBatchSampler(\n                self.dataset_label,\n                batch_size=self.batch_size_label,\n                shuffle=self.shuffle,\n                drop_last=self.drop_last)\n        else:\n            self._batch_sampler_label = batch_sampler_label\n\n        # unsup dataset\n        self.dataset_unlabel = dataset_unlabel\n        self.dataset_unlabel.length = len(self.dataset_label)\n        self.dataset_unlabel.check_or_download_dataset()\n        self.dataset_unlabel.parse_dataset()\n        self.dataset_unlabel.set_transform(self._sample_transforms_unlabel)\n        self.dataset_unlabel.set_kwargs(**self.kwargs)\n        if batch_sampler_unlabel is None:\n            self._batch_sampler_unlabel = DistributedBatchSampler(\n                self.dataset_unlabel,\n                batch_size=self.batch_size_unlabel,\n                shuffle=self.shuffle,\n                drop_last=self.drop_last)\n        else:\n            self._batch_sampler_unlabel = batch_sampler_unlabel\n\n        # DataLoader does not start sub-processes on Windows and macOS,\n        # so shared memory is not needed there\n        use_shared_memory = self.use_shared_memory and \\\n                            sys.platform not in ['win32', 'darwin']\n        # check whether shared memory size is larger than 1G (1024M)\n        if use_shared_memory:\n            shm_size = _get_shared_memory_size_in_M()\n            if shm_size is not None and shm_size < 1024.:\n                logger.warning(\"Shared memory size is less than 1G, \"\n                               \"disable shared_memory in DataLoader\")\n                use_shared_memory = False\n\n        self.dataloader_label = DataLoader(\n            dataset=self.dataset_label,\n            batch_sampler=self._batch_sampler_label,\n            collate_fn=self._batch_transforms_label,\n            num_workers=worker_num,\n            return_list=return_list,\n            use_shared_memory=use_shared_memory)\n\n        self.dataloader_unlabel = DataLoader(\n            dataset=self.dataset_unlabel,\n            batch_sampler=self._batch_sampler_unlabel,\n            collate_fn=self._batch_transforms_unlabel,\n            num_workers=worker_num,\n            return_list=return_list,\n            use_shared_memory=use_shared_memory)\n\n        self.dataloader = CombineSSODLoader(self.dataloader_label,\n                                            self.dataloader_unlabel)\n        self.loader = iter(self.dataloader)\n        return self\n\n    def __len__(self):\n        return len(self._batch_sampler_label)\n\n    def __iter__(self):\n        return self\n\n    def __next__(self):\n        return next(self.loader)\n\n    def next(self):\n        # python2 compatibility\n        return self.__next__()\n\n\n@register\nclass SemiTrainReader(BaseSemiDataLoader):\n    __shared__ = ['num_classes']\n\n    def __init__(self,\n                 sample_transforms=[],\n                 weak_aug=[],\n                 strong_aug=[],\n                 sup_batch_transforms=[],\n                 unsup_batch_transforms=[],\n                 sup_batch_size=1,\n                 unsup_batch_size=1,\n                 shuffle=True,\n                 drop_last=True,\n                 num_classes=80,\n                 collate_batch=True,\n                 **kwargs):\n        super(SemiTrainReader, self).__init__(\n            sample_transforms, weak_aug, strong_aug, sup_batch_transforms,\n            unsup_batch_transforms, sup_batch_size, unsup_batch_size, shuffle,\n            drop_last, num_classes, collate_batch, **kwargs)\n"
  },
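# --- Editor's note: a minimal usage sketch for the reader module above, not
# part of the source tree. The transform names and arguments are illustrative
# and assume the `Decode` and `Resize` ops registered in ppdet.data.transform:
#
#     from ppdet.data.reader import Compose
#
#     sample_transforms = [
#         {'Decode': {}},
#         {'Resize': {'target_size': [640, 640], 'keep_ratio': False}},
#     ]
#     pipeline = Compose(sample_transforms, num_classes=80)
#     sample = pipeline({'im_file': 'demo.jpg', 'im_id': 0})
#
# Each op maps a sample dict to a sample dict; TrainReader wires the same
# Compose into paddle.io.DataLoader, with BatchCompose as the collate_fn.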
  {
    "path": "ppdet/data/shm_utils.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\n\nSIZE_UNIT = ['K', 'M', 'G', 'T']\nSHM_QUERY_CMD = 'df -h'\nSHM_KEY = 'shm'\nSHM_DEFAULT_MOUNT = '/dev/shm'\n\n# [ shared memory size check ]\n# In detection models, image/target data occupies a lot of memory, and\n# will occupy lots of shared memory in multi-process DataLoader, we use\n# following code to get shared memory size and perform a size check to\n# disable shared memory use if shared memory size is not enough.\n# Shared memory getting process as follows:\n# 1. use `df -h` get all mount info\n# 2. pick up spaces whose mount info contains 'shm'\n# 3. if 'shm' space number is only 1, return its size\n# 4. if there are multiple 'shm' space, try to find the default mount\n#    directory '/dev/shm' is Linux-like system, otherwise return the\n#    biggest space size.\n\n\ndef _parse_size_in_M(size_str):\n    if size_str[-1] == 'B':\n        num, unit = size_str[:-2], size_str[-2]\n    else:\n        num, unit = size_str[:-1], size_str[-1]\n    assert unit in SIZE_UNIT, \\\n            \"unknown shm size unit {}\".format(unit)\n    return float(num) * \\\n            (1024 ** (SIZE_UNIT.index(unit) - 1))\n\n\ndef _get_shared_memory_size_in_M():\n    try:\n        df_infos = os.popen(SHM_QUERY_CMD).readlines()\n    except:\n        return None\n    else:\n        shm_infos = []\n        for df_info in df_infos:\n            info = df_info.strip()\n            if info.find(SHM_KEY) >= 0:\n                shm_infos.append(info.split())\n\n        if len(shm_infos) == 0:\n            return None\n        elif len(shm_infos) == 1:\n            return _parse_size_in_M(shm_infos[0][3])\n        else:\n            default_mount_infos = [\n                si for si in shm_infos if si[-1] == SHM_DEFAULT_MOUNT\n            ]\n            if default_mount_infos:\n                return _parse_size_in_M(default_mount_infos[0][3])\n            else:\n                return max([_parse_size_in_M(si[3]) for si in shm_infos])\n"
  },
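# --- Editor's note: worked examples for _parse_size_in_M above (illustrative).
# The size string comes from a `df -h` column, and
# 1024 ** (SIZE_UNIT.index(unit) - 1) scales K/M/G/T units to megabytes:
#
#     _parse_size_in_M('512M')   # 512.0 * 1024**0 -> 512.0
#     _parse_size_in_M('64G')    # 64.0  * 1024**1 -> 65536.0
#     _parse_size_in_M('2.0GB')  # trailing 'B' stripped, 'G' unit -> 2048.0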
  {
    "path": "ppdet/data/source/__init__.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom . import coco\nfrom . import voc\nfrom . import widerface\nfrom . import category\nfrom . import keypoint_coco\nfrom . import mot\nfrom . import sniper_coco\nfrom . import culane\nfrom . import lvis\n\nfrom .coco import *\nfrom .voc import *\nfrom .widerface import *\nfrom .category import *\nfrom .keypoint_coco import *\nfrom .mot import *\nfrom .sniper_coco import SniperCOCODataSet\nfrom .dataset import ImageFolder\nfrom .pose3d_cmb import *\nfrom .culane import *\nfrom .lvis import *"
  },
  {
    "path": "ppdet/data/source/category.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\n\nfrom ppdet.data.source.voc import pascalvoc_label\nfrom ppdet.data.source.widerface import widerface_label\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n__all__ = ['get_categories']\n\n\ndef get_categories(metric_type, anno_file=None, arch=None):\n    \"\"\"\n    Get class id to category id map and category id\n    to category name map from annotation file.\n\n    Args:\n        metric_type (str): metric type, currently support 'coco', 'voc', 'oid'\n            and 'widerface'.\n        anno_file (str): annotation file path\n    \"\"\"\n    if arch == 'keypoint_arch':\n        return (None, {'id': 'keypoint'})\n\n    if anno_file == None or (not os.path.isfile(anno_file)):\n        logger.warning(\n            \"anno_file '{}' is None or not set or not exist, \"\n            \"please recheck TrainDataset/EvalDataset/TestDataset.anno_path, \"\n            \"otherwise the default categories will be used by metric_type.\".\n            format(anno_file))\n\n    if metric_type.lower() == 'coco' or metric_type.lower(\n    ) == 'rbox' or metric_type.lower() == 'snipercoco':\n        if anno_file and os.path.isfile(anno_file):\n            if anno_file.endswith('json'):\n                # lazy import pycocotools here\n                from pycocotools.coco import COCO\n                coco = COCO(anno_file)\n                cats = coco.loadCats(coco.getCatIds())\n\n                clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)}\n                catid2name = {cat['id']: cat['name'] for cat in cats}\n\n            elif anno_file.endswith('txt'):\n                cats = []\n                with open(anno_file) as f:\n                    for line in f.readlines():\n                        cats.append(line.strip())\n                if cats[0] == 'background': cats = cats[1:]\n\n                clsid2catid = {i: i for i in range(len(cats))}\n                catid2name = {i: name for i, name in enumerate(cats)}\n\n            else:\n                raise ValueError(\"anno_file {} should be json or txt.\".format(\n                    anno_file))\n            return clsid2catid, catid2name\n\n        # anno file not exist, load default categories of COCO17\n        else:\n            if metric_type.lower() == 'rbox':\n                logger.warning(\n                    \"metric_type: {}, load default categories of DOTA.\".format(\n                        metric_type))\n                return _dota_category()\n            logger.warning(\"metric_type: {}, load default categories of COCO.\".\n                           format(metric_type))\n            return _coco17_category()\n\n    elif metric_type.lower() == 'voc':\n        if anno_file and os.path.isfile(anno_file):\n   
            cats = []\n            with open(anno_file) as f:\n                for line in f.readlines():\n                    cats.append(line.strip())\n\n            if cats[0] == 'background':\n                cats = cats[1:]\n\n            clsid2catid = {i: i for i in range(len(cats))}\n            catid2name = {i: name for i, name in enumerate(cats)}\n\n            return clsid2catid, catid2name\n\n        # anno file does not exist, load the default categories of\n        # VOC (all 20 categories)\n        else:\n            logger.warning(\"metric_type: {}, load default categories of VOC.\".\n                           format(metric_type))\n            return _vocall_category()\n\n    elif metric_type.lower() == 'oid':\n        if anno_file and os.path.isfile(anno_file):\n            logger.warning(\"only default categories are supported for OID19\")\n        return _oid19_category()\n\n    elif metric_type.lower() == 'widerface':\n        return _widerface_category()\n\n    elif metric_type.lower() in [\n            'keypointtopdowncocoeval', 'keypointtopdownmpiieval',\n            'keypointtopdowncocowholebadyhandeval'\n    ]:\n        return (None, {'id': 'keypoint'})\n\n    elif metric_type.lower() == 'pose3deval':\n        return (None, {'id': 'pose3d'})\n\n    elif metric_type.lower() in ['mot', 'motdet', 'reid']:\n        if anno_file and os.path.isfile(anno_file):\n            cats = []\n            with open(anno_file) as f:\n                for line in f.readlines():\n                    cats.append(line.strip())\n            if cats[0] == 'background':\n                cats = cats[1:]\n            clsid2catid = {i: i for i in range(len(cats))}\n            catid2name = {i: name for i, name in enumerate(cats)}\n            return clsid2catid, catid2name\n        # anno file does not exist, load default category 'pedestrian'.\n        else:\n            logger.warning(\n                \"metric_type: {}, load default categories of pedestrian MOT.\".\n                format(metric_type))\n            return _mot_category(category='pedestrian')\n\n    elif metric_type.lower() in ['kitti', 'bdd100kmot']:\n        return _mot_category(category='vehicle')\n\n    elif metric_type.lower() in ['mcmot']:\n        if anno_file and os.path.isfile(anno_file):\n            cats = []\n            with open(anno_file) as f:\n                for line in f.readlines():\n                    cats.append(line.strip())\n            if cats[0] == 'background':\n                cats = cats[1:]\n            clsid2catid = {i: i for i in range(len(cats))}\n            catid2name = {i: name for i, name in enumerate(cats)}\n            return clsid2catid, catid2name\n        # anno file does not exist, load default categories of VisDrone (all 10 categories)\n        else:\n            logger.warning(\n                \"metric_type: {}, load default categories of VisDrone.\".format(\n                    metric_type))\n            return _visdrone_category()\n\n    else:\n        raise ValueError(\"unknown metric type {}\".format(metric_type))\n\n\ndef _mot_category(category='pedestrian'):\n    \"\"\"\n    Get class id to category id map and category id\n    to category name map of mot dataset\n    \"\"\"\n    label_map = {category: 0}\n    label_map = sorted(label_map.items(), key=lambda x: x[1])\n    cats = [l[0] for l in label_map]\n\n    clsid2catid = {i: i for i in range(len(cats))}\n    catid2name = {i: name for i, name in enumerate(cats)}\n\n    return clsid2catid, catid2name\n\n\ndef _coco17_category():\n    \"\"\"\n    Get 
class id to category id map and category id\n    to category name map of COCO2017 dataset\n\n    \"\"\"\n    clsid2catid = {\n        1: 1,\n        2: 2,\n        3: 3,\n        4: 4,\n        5: 5,\n        6: 6,\n        7: 7,\n        8: 8,\n        9: 9,\n        10: 10,\n        11: 11,\n        12: 13,\n        13: 14,\n        14: 15,\n        15: 16,\n        16: 17,\n        17: 18,\n        18: 19,\n        19: 20,\n        20: 21,\n        21: 22,\n        22: 23,\n        23: 24,\n        24: 25,\n        25: 27,\n        26: 28,\n        27: 31,\n        28: 32,\n        29: 33,\n        30: 34,\n        31: 35,\n        32: 36,\n        33: 37,\n        34: 38,\n        35: 39,\n        36: 40,\n        37: 41,\n        38: 42,\n        39: 43,\n        40: 44,\n        41: 46,\n        42: 47,\n        43: 48,\n        44: 49,\n        45: 50,\n        46: 51,\n        47: 52,\n        48: 53,\n        49: 54,\n        50: 55,\n        51: 56,\n        52: 57,\n        53: 58,\n        54: 59,\n        55: 60,\n        56: 61,\n        57: 62,\n        58: 63,\n        59: 64,\n        60: 65,\n        61: 67,\n        62: 70,\n        63: 72,\n        64: 73,\n        65: 74,\n        66: 75,\n        67: 76,\n        68: 77,\n        69: 78,\n        70: 79,\n        71: 80,\n        72: 81,\n        73: 82,\n        74: 84,\n        75: 85,\n        76: 86,\n        77: 87,\n        78: 88,\n        79: 89,\n        80: 90\n    }\n\n    catid2name = {\n        0: 'background',\n        1: 'person',\n        2: 'bicycle',\n        3: 'car',\n        4: 'motorcycle',\n        5: 'airplane',\n        6: 'bus',\n        7: 'train',\n        8: 'truck',\n        9: 'boat',\n        10: 'traffic light',\n        11: 'fire hydrant',\n        13: 'stop sign',\n        14: 'parking meter',\n        15: 'bench',\n        16: 'bird',\n        17: 'cat',\n        18: 'dog',\n        19: 'horse',\n        20: 'sheep',\n        21: 'cow',\n        22: 'elephant',\n        23: 'bear',\n        24: 'zebra',\n        25: 'giraffe',\n        27: 'backpack',\n        28: 'umbrella',\n        31: 'handbag',\n        32: 'tie',\n        33: 'suitcase',\n        34: 'frisbee',\n        35: 'skis',\n        36: 'snowboard',\n        37: 'sports ball',\n        38: 'kite',\n        39: 'baseball bat',\n        40: 'baseball glove',\n        41: 'skateboard',\n        42: 'surfboard',\n        43: 'tennis racket',\n        44: 'bottle',\n        46: 'wine glass',\n        47: 'cup',\n        48: 'fork',\n        49: 'knife',\n        50: 'spoon',\n        51: 'bowl',\n        52: 'banana',\n        53: 'apple',\n        54: 'sandwich',\n        55: 'orange',\n        56: 'broccoli',\n        57: 'carrot',\n        58: 'hot dog',\n        59: 'pizza',\n        60: 'donut',\n        61: 'cake',\n        62: 'chair',\n        63: 'couch',\n        64: 'potted plant',\n        65: 'bed',\n        67: 'dining table',\n        70: 'toilet',\n        72: 'tv',\n        73: 'laptop',\n        74: 'mouse',\n        75: 'remote',\n        76: 'keyboard',\n        77: 'cell phone',\n        78: 'microwave',\n        79: 'oven',\n        80: 'toaster',\n        81: 'sink',\n        82: 'refrigerator',\n        84: 'book',\n        85: 'clock',\n        86: 'vase',\n        87: 'scissors',\n        88: 'teddy bear',\n        89: 'hair drier',\n        90: 'toothbrush'\n    }\n\n    clsid2catid = {k - 1: v for k, v in clsid2catid.items()}\n    catid2name.pop(0)\n\n    return clsid2catid, catid2name\n\n\ndef 
_dota_category():\n    \"\"\"\n    Get class id to category id map and category id\n    to category name map of dota dataset\n    \"\"\"\n    catid2name = {\n        0: 'background',\n        1: 'plane',\n        2: 'baseball-diamond',\n        3: 'bridge',\n        4: 'ground-track-field',\n        5: 'small-vehicle',\n        6: 'large-vehicle',\n        7: 'ship',\n        8: 'tennis-court',\n        9: 'basketball-court',\n        10: 'storage-tank',\n        11: 'soccer-ball-field',\n        12: 'roundabout',\n        13: 'harbor',\n        14: 'swimming-pool',\n        15: 'helicopter'\n    }\n    catid2name.pop(0)\n    clsid2catid = {i: i + 1 for i in range(len(catid2name))}\n    return clsid2catid, catid2name\n\n\ndef _vocall_category():\n    \"\"\"\n    Get class id to category id map and category id\n    to category name map of mixup voc dataset\n\n    \"\"\"\n    label_map = pascalvoc_label()\n    label_map = sorted(label_map.items(), key=lambda x: x[1])\n    cats = [l[0] for l in label_map]\n\n    clsid2catid = {i: i for i in range(len(cats))}\n    catid2name = {i: name for i, name in enumerate(cats)}\n\n    return clsid2catid, catid2name\n\n\ndef _widerface_category():\n    label_map = widerface_label()\n    label_map = sorted(label_map.items(), key=lambda x: x[1])\n    cats = [l[0] for l in label_map]\n    clsid2catid = {i: i for i in range(len(cats))}\n    catid2name = {i: name for i, name in enumerate(cats)}\n\n    return clsid2catid, catid2name\n\n\ndef _oid19_category():\n    clsid2catid = {k: k + 1 for k in range(500)}\n\n    catid2name = {\n        0: \"background\",\n        1: \"Infant bed\",\n        2: \"Rose\",\n        3: \"Flag\",\n        4: \"Flashlight\",\n        5: \"Sea turtle\",\n        6: \"Camera\",\n        7: \"Animal\",\n        8: \"Glove\",\n        9: \"Crocodile\",\n        10: \"Cattle\",\n        11: \"House\",\n        12: \"Guacamole\",\n        13: \"Penguin\",\n        14: \"Vehicle registration plate\",\n        15: \"Bench\",\n        16: \"Ladybug\",\n        17: \"Human nose\",\n        18: \"Watermelon\",\n        19: \"Flute\",\n        20: \"Butterfly\",\n        21: \"Washing machine\",\n        22: \"Raccoon\",\n        23: \"Segway\",\n        24: \"Taco\",\n        25: \"Jellyfish\",\n        26: \"Cake\",\n        27: \"Pen\",\n        28: \"Cannon\",\n        29: \"Bread\",\n        30: \"Tree\",\n        31: \"Shellfish\",\n        32: \"Bed\",\n        33: \"Hamster\",\n        34: \"Hat\",\n        35: \"Toaster\",\n        36: \"Sombrero\",\n        37: \"Tiara\",\n        38: \"Bowl\",\n        39: \"Dragonfly\",\n        40: \"Moths and butterflies\",\n        41: \"Antelope\",\n        42: \"Vegetable\",\n        43: \"Torch\",\n        44: \"Building\",\n        45: \"Power plugs and sockets\",\n        46: \"Blender\",\n        47: \"Billiard table\",\n        48: \"Cutting board\",\n        49: \"Bronze sculpture\",\n        50: \"Turtle\",\n        51: \"Broccoli\",\n        52: \"Tiger\",\n        53: \"Mirror\",\n        54: \"Bear\",\n        55: \"Zucchini\",\n        56: \"Dress\",\n        57: \"Volleyball\",\n        58: \"Guitar\",\n        59: \"Reptile\",\n        60: \"Golf cart\",\n        61: \"Tart\",\n        62: \"Fedora\",\n        63: \"Carnivore\",\n        64: \"Car\",\n        65: \"Lighthouse\",\n        66: \"Coffeemaker\",\n        67: \"Food processor\",\n        68: \"Truck\",\n        69: \"Bookcase\",\n        70: \"Surfboard\",\n        71: \"Footwear\",\n        72: \"Bench\",\n        
73: \"Necklace\",\n        74: \"Flower\",\n        75: \"Radish\",\n        76: \"Marine mammal\",\n        77: \"Frying pan\",\n        78: \"Tap\",\n        79: \"Peach\",\n        80: \"Knife\",\n        81: \"Handbag\",\n        82: \"Laptop\",\n        83: \"Tent\",\n        84: \"Ambulance\",\n        85: \"Christmas tree\",\n        86: \"Eagle\",\n        87: \"Limousine\",\n        88: \"Kitchen & dining room table\",\n        89: \"Polar bear\",\n        90: \"Tower\",\n        91: \"Football\",\n        92: \"Willow\",\n        93: \"Human head\",\n        94: \"Stop sign\",\n        95: \"Banana\",\n        96: \"Mixer\",\n        97: \"Binoculars\",\n        98: \"Dessert\",\n        99: \"Bee\",\n        100: \"Chair\",\n        101: \"Wood-burning stove\",\n        102: \"Flowerpot\",\n        103: \"Beaker\",\n        104: \"Oyster\",\n        105: \"Woodpecker\",\n        106: \"Harp\",\n        107: \"Bathtub\",\n        108: \"Wall clock\",\n        109: \"Sports uniform\",\n        110: \"Rhinoceros\",\n        111: \"Beehive\",\n        112: \"Cupboard\",\n        113: \"Chicken\",\n        114: \"Man\",\n        115: \"Blue jay\",\n        116: \"Cucumber\",\n        117: \"Balloon\",\n        118: \"Kite\",\n        119: \"Fireplace\",\n        120: \"Lantern\",\n        121: \"Missile\",\n        122: \"Book\",\n        123: \"Spoon\",\n        124: \"Grapefruit\",\n        125: \"Squirrel\",\n        126: \"Orange\",\n        127: \"Coat\",\n        128: \"Punching bag\",\n        129: \"Zebra\",\n        130: \"Billboard\",\n        131: \"Bicycle\",\n        132: \"Door handle\",\n        133: \"Mechanical fan\",\n        134: \"Ring binder\",\n        135: \"Table\",\n        136: \"Parrot\",\n        137: \"Sock\",\n        138: \"Vase\",\n        139: \"Weapon\",\n        140: \"Shotgun\",\n        141: \"Glasses\",\n        142: \"Seahorse\",\n        143: \"Belt\",\n        144: \"Watercraft\",\n        145: \"Window\",\n        146: \"Giraffe\",\n        147: \"Lion\",\n        148: \"Tire\",\n        149: \"Vehicle\",\n        150: \"Canoe\",\n        151: \"Tie\",\n        152: \"Shelf\",\n        153: \"Picture frame\",\n        154: \"Printer\",\n        155: \"Human leg\",\n        156: \"Boat\",\n        157: \"Slow cooker\",\n        158: \"Croissant\",\n        159: \"Candle\",\n        160: \"Pancake\",\n        161: \"Pillow\",\n        162: \"Coin\",\n        163: \"Stretcher\",\n        164: \"Sandal\",\n        165: \"Woman\",\n        166: \"Stairs\",\n        167: \"Harpsichord\",\n        168: \"Stool\",\n        169: \"Bus\",\n        170: \"Suitcase\",\n        171: \"Human mouth\",\n        172: \"Juice\",\n        173: \"Skull\",\n        174: \"Door\",\n        175: \"Violin\",\n        176: \"Chopsticks\",\n        177: \"Digital clock\",\n        178: \"Sunflower\",\n        179: \"Leopard\",\n        180: \"Bell pepper\",\n        181: \"Harbor seal\",\n        182: \"Snake\",\n        183: \"Sewing machine\",\n        184: \"Goose\",\n        185: \"Helicopter\",\n        186: \"Seat belt\",\n        187: \"Coffee cup\",\n        188: \"Microwave oven\",\n        189: \"Hot dog\",\n        190: \"Countertop\",\n        191: \"Serving tray\",\n        192: \"Dog bed\",\n        193: \"Beer\",\n        194: \"Sunglasses\",\n        195: \"Golf ball\",\n        196: \"Waffle\",\n        197: \"Palm tree\",\n        198: \"Trumpet\",\n        199: \"Ruler\",\n        200: \"Helmet\",\n        201: \"Ladder\",\n        202: \"Office 
building\",\n        203: \"Tablet computer\",\n        204: \"Toilet paper\",\n        205: \"Pomegranate\",\n        206: \"Skirt\",\n        207: \"Gas stove\",\n        208: \"Cookie\",\n        209: \"Cart\",\n        210: \"Raven\",\n        211: \"Egg\",\n        212: \"Burrito\",\n        213: \"Goat\",\n        214: \"Kitchen knife\",\n        215: \"Skateboard\",\n        216: \"Salt and pepper shakers\",\n        217: \"Lynx\",\n        218: \"Boot\",\n        219: \"Platter\",\n        220: \"Ski\",\n        221: \"Swimwear\",\n        222: \"Swimming pool\",\n        223: \"Drinking straw\",\n        224: \"Wrench\",\n        225: \"Drum\",\n        226: \"Ant\",\n        227: \"Human ear\",\n        228: \"Headphones\",\n        229: \"Fountain\",\n        230: \"Bird\",\n        231: \"Jeans\",\n        232: \"Television\",\n        233: \"Crab\",\n        234: \"Microphone\",\n        235: \"Home appliance\",\n        236: \"Snowplow\",\n        237: \"Beetle\",\n        238: \"Artichoke\",\n        239: \"Jet ski\",\n        240: \"Stationary bicycle\",\n        241: \"Human hair\",\n        242: \"Brown bear\",\n        243: \"Starfish\",\n        244: \"Fork\",\n        245: \"Lobster\",\n        246: \"Corded phone\",\n        247: \"Drink\",\n        248: \"Saucer\",\n        249: \"Carrot\",\n        250: \"Insect\",\n        251: \"Clock\",\n        252: \"Castle\",\n        253: \"Tennis racket\",\n        254: \"Ceiling fan\",\n        255: \"Asparagus\",\n        256: \"Jaguar\",\n        257: \"Musical instrument\",\n        258: \"Train\",\n        259: \"Cat\",\n        260: \"Rifle\",\n        261: \"Dumbbell\",\n        262: \"Mobile phone\",\n        263: \"Taxi\",\n        264: \"Shower\",\n        265: \"Pitcher\",\n        266: \"Lemon\",\n        267: \"Invertebrate\",\n        268: \"Turkey\",\n        269: \"High heels\",\n        270: \"Bust\",\n        271: \"Elephant\",\n        272: \"Scarf\",\n        273: \"Barrel\",\n        274: \"Trombone\",\n        275: \"Pumpkin\",\n        276: \"Box\",\n        277: \"Tomato\",\n        278: \"Frog\",\n        279: \"Bidet\",\n        280: \"Human face\",\n        281: \"Houseplant\",\n        282: \"Van\",\n        283: \"Shark\",\n        284: \"Ice cream\",\n        285: \"Swim cap\",\n        286: \"Falcon\",\n        287: \"Ostrich\",\n        288: \"Handgun\",\n        289: \"Whiteboard\",\n        290: \"Lizard\",\n        291: \"Pasta\",\n        292: \"Snowmobile\",\n        293: \"Light bulb\",\n        294: \"Window blind\",\n        295: \"Muffin\",\n        296: \"Pretzel\",\n        297: \"Computer monitor\",\n        298: \"Horn\",\n        299: \"Furniture\",\n        300: \"Sandwich\",\n        301: \"Fox\",\n        302: \"Convenience store\",\n        303: \"Fish\",\n        304: \"Fruit\",\n        305: \"Earrings\",\n        306: \"Curtain\",\n        307: \"Grape\",\n        308: \"Sofa bed\",\n        309: \"Horse\",\n        310: \"Luggage and bags\",\n        311: \"Desk\",\n        312: \"Crutch\",\n        313: \"Bicycle helmet\",\n        314: \"Tick\",\n        315: \"Airplane\",\n        316: \"Canary\",\n        317: \"Spatula\",\n        318: \"Watch\",\n        319: \"Lily\",\n        320: \"Kitchen appliance\",\n        321: \"Filing cabinet\",\n        322: \"Aircraft\",\n        323: \"Cake stand\",\n        324: \"Candy\",\n        325: \"Sink\",\n        326: \"Mouse\",\n        327: \"Wine\",\n        328: \"Wheelchair\",\n        329: \"Goldfish\",\n        330: 
\"Refrigerator\",\n        331: \"French fries\",\n        332: \"Drawer\",\n        333: \"Treadmill\",\n        334: \"Picnic basket\",\n        335: \"Dice\",\n        336: \"Cabbage\",\n        337: \"Football helmet\",\n        338: \"Pig\",\n        339: \"Person\",\n        340: \"Shorts\",\n        341: \"Gondola\",\n        342: \"Honeycomb\",\n        343: \"Doughnut\",\n        344: \"Chest of drawers\",\n        345: \"Land vehicle\",\n        346: \"Bat\",\n        347: \"Monkey\",\n        348: \"Dagger\",\n        349: \"Tableware\",\n        350: \"Human foot\",\n        351: \"Mug\",\n        352: \"Alarm clock\",\n        353: \"Pressure cooker\",\n        354: \"Human hand\",\n        355: \"Tortoise\",\n        356: \"Baseball glove\",\n        357: \"Sword\",\n        358: \"Pear\",\n        359: \"Miniskirt\",\n        360: \"Traffic sign\",\n        361: \"Girl\",\n        362: \"Roller skates\",\n        363: \"Dinosaur\",\n        364: \"Porch\",\n        365: \"Human beard\",\n        366: \"Submarine sandwich\",\n        367: \"Screwdriver\",\n        368: \"Strawberry\",\n        369: \"Wine glass\",\n        370: \"Seafood\",\n        371: \"Racket\",\n        372: \"Wheel\",\n        373: \"Sea lion\",\n        374: \"Toy\",\n        375: \"Tea\",\n        376: \"Tennis ball\",\n        377: \"Waste container\",\n        378: \"Mule\",\n        379: \"Cricket ball\",\n        380: \"Pineapple\",\n        381: \"Coconut\",\n        382: \"Doll\",\n        383: \"Coffee table\",\n        384: \"Snowman\",\n        385: \"Lavender\",\n        386: \"Shrimp\",\n        387: \"Maple\",\n        388: \"Cowboy hat\",\n        389: \"Goggles\",\n        390: \"Rugby ball\",\n        391: \"Caterpillar\",\n        392: \"Poster\",\n        393: \"Rocket\",\n        394: \"Organ\",\n        395: \"Saxophone\",\n        396: \"Traffic light\",\n        397: \"Cocktail\",\n        398: \"Plastic bag\",\n        399: \"Squash\",\n        400: \"Mushroom\",\n        401: \"Hamburger\",\n        402: \"Light switch\",\n        403: \"Parachute\",\n        404: \"Teddy bear\",\n        405: \"Winter melon\",\n        406: \"Deer\",\n        407: \"Musical keyboard\",\n        408: \"Plumbing fixture\",\n        409: \"Scoreboard\",\n        410: \"Baseball bat\",\n        411: \"Envelope\",\n        412: \"Adhesive tape\",\n        413: \"Briefcase\",\n        414: \"Paddle\",\n        415: \"Bow and arrow\",\n        416: \"Telephone\",\n        417: \"Sheep\",\n        418: \"Jacket\",\n        419: \"Boy\",\n        420: \"Pizza\",\n        421: \"Otter\",\n        422: \"Office supplies\",\n        423: \"Couch\",\n        424: \"Cello\",\n        425: \"Bull\",\n        426: \"Camel\",\n        427: \"Ball\",\n        428: \"Duck\",\n        429: \"Whale\",\n        430: \"Shirt\",\n        431: \"Tank\",\n        432: \"Motorcycle\",\n        433: \"Accordion\",\n        434: \"Owl\",\n        435: \"Porcupine\",\n        436: \"Sun hat\",\n        437: \"Nail\",\n        438: \"Scissors\",\n        439: \"Swan\",\n        440: \"Lamp\",\n        441: \"Crown\",\n        442: \"Piano\",\n        443: \"Sculpture\",\n        444: \"Cheetah\",\n        445: \"Oboe\",\n        446: \"Tin can\",\n        447: \"Mango\",\n        448: \"Tripod\",\n        449: \"Oven\",\n        450: \"Mouse\",\n        451: \"Barge\",\n        452: \"Coffee\",\n        453: \"Snowboard\",\n        454: \"Common fig\",\n        455: \"Salad\",\n        456: \"Marine invertebrates\",\n        
457: \"Umbrella\",\n        458: \"Kangaroo\",\n        459: \"Human arm\",\n        460: \"Measuring cup\",\n        461: \"Snail\",\n        462: \"Loveseat\",\n        463: \"Suit\",\n        464: \"Teapot\",\n        465: \"Bottle\",\n        466: \"Alpaca\",\n        467: \"Kettle\",\n        468: \"Trousers\",\n        469: \"Popcorn\",\n        470: \"Centipede\",\n        471: \"Spider\",\n        472: \"Sparrow\",\n        473: \"Plate\",\n        474: \"Bagel\",\n        475: \"Personal care\",\n        476: \"Apple\",\n        477: \"Brassiere\",\n        478: \"Bathroom cabinet\",\n        479: \"studio couch\",\n        480: \"Computer keyboard\",\n        481: \"Table tennis racket\",\n        482: \"Sushi\",\n        483: \"Cabinetry\",\n        484: \"Street light\",\n        485: \"Towel\",\n        486: \"Nightstand\",\n        487: \"Rabbit\",\n        488: \"Dolphin\",\n        489: \"Dog\",\n        490: \"Jug\",\n        491: \"Wok\",\n        492: \"Fire hydrant\",\n        493: \"Human eye\",\n        494: \"Skyscraper\",\n        495: \"Backpack\",\n        496: \"Potato\",\n        497: \"Paper towel\",\n        498: \"Lifejacket\",\n        499: \"Bicycle wheel\",\n        500: \"Toilet\",\n    }\n\n    return clsid2catid, catid2name\n\n\ndef _visdrone_category():\n    clsid2catid = {i: i for i in range(10)}\n\n    catid2name = {\n        0: 'pedestrian',\n        1: 'people',\n        2: 'bicycle',\n        3: 'car',\n        4: 'van',\n        5: 'truck',\n        6: 'tricycle',\n        7: 'awning-tricycle',\n        8: 'bus',\n        9: 'motor'\n    }\n    return clsid2catid, catid2name\n"
  },
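# --- Editor's note: illustrative use of get_categories above. With no
# annotation file, the COCO branch warns and falls back to _coco17_category(),
# which maps contiguous class ids to the sparse COCO category ids:
#
#     from ppdet.data.source.category import get_categories
#
#     clsid2catid, catid2name = get_categories('COCO', anno_file=None)
#     clsid2catid[0]   # -> 1 (contiguous class 0 is COCO category id 1)
#     catid2name[1]    # -> 'person'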
  {
    "path": "ppdet/data/source/coco.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport os\nimport copy\n\ntry:\n    from collections.abc import Sequence\nexcept Exception:\n    from collections import Sequence\nimport numpy as np\nfrom ppdet.core.workspace import register, serializable\nfrom .dataset import DetDataset\n\nfrom ppdet.utils.logger import setup_logger\n\nlogger = setup_logger(__name__)\n\n__all__ = [\n    'COCODataSet', 'SlicedCOCODataSet', 'SemiCOCODataSet', 'COCODetDataset', 'COCOInstSegDataset'\n]\n\n\n@register\n@serializable\nclass COCODataSet(DetDataset):\n    \"\"\"\n    Load dataset with COCO format.\n\n    Args:\n        dataset_dir (str): root directory for dataset.\n        image_dir (str): directory for images.\n        anno_path (str): coco annotation file path.\n        data_fields (list): key name of data dictionary, at least have 'image'.\n        sample_num (int): number of samples to load, -1 means all.\n        load_crowd (bool): whether to load crowded ground-truth. \n            False as default\n        allow_empty (bool): whether to load empty entry. False as default\n        empty_ratio (float): the ratio of empty record number to total \n            record's, if empty_ratio is out of [0. ,1.), do not sample the \n            records and use all the empty entries. 1. as default\n        repeat (int): repeat times for dataset, use in benchmark.\n    \"\"\"\n\n    def __init__(self,\n                 dataset_dir=None,\n                 image_dir=None,\n                 anno_path=None,\n                 data_fields=['image'],\n                 sample_num=-1,\n                 load_crowd=False,\n                 allow_empty=False,\n                 empty_ratio=1.,\n                 repeat=1):\n        super(COCODataSet, self).__init__(\n            dataset_dir,\n            image_dir,\n            anno_path,\n            data_fields,\n            sample_num,\n            repeat=repeat)\n        self.load_image_only = False\n        self.load_semantic = False\n        self.load_crowd = load_crowd\n        self.allow_empty = allow_empty\n        self.empty_ratio = empty_ratio\n\n    def _sample_empty(self, records, num):\n        # if empty_ratio is out of [0. ,1.), do not sample the records\n        if self.empty_ratio < 0. 
or self.empty_ratio >= 1.:\n            return records\n        import random\n        sample_num = min(\n            int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records))\n        records = random.sample(records, sample_num)\n        return records\n\n    def parse_dataset(self):\n        anno_path = os.path.join(self.dataset_dir, self.anno_path)\n        image_dir = os.path.join(self.dataset_dir, self.image_dir)\n\n        assert anno_path.endswith('.json'), \\\n            'invalid coco annotation file: ' + anno_path\n        from pycocotools.coco import COCO\n        coco = COCO(anno_path)\n        img_ids = coco.getImgIds()\n        img_ids.sort()\n        cat_ids = coco.getCatIds()\n        records = []\n        empty_records = []\n        ct = 0\n\n        self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})\n        self.cname2cid = dict({\n            coco.loadCats(catid)[0]['name']: clsid\n            for catid, clsid in self.catid2clsid.items()\n        })\n\n        if 'annotations' not in coco.dataset:\n            self.load_image_only = True\n            logger.warning('Annotation file: {} does not contain ground truth, '\n                           'loading image information only.'.format(anno_path))\n\n        for img_id in img_ids:\n            img_anno = coco.loadImgs([img_id])[0]\n            im_fname = img_anno['file_name']\n            im_w = float(img_anno['width'])\n            im_h = float(img_anno['height'])\n\n            im_path = os.path.join(image_dir,\n                                   im_fname) if image_dir else im_fname\n            is_empty = False\n            if not os.path.exists(im_path):\n                logger.warning('Illegal image file: {}, and it will be '\n                               'ignored'.format(im_path))\n                continue\n\n            if im_w < 0 or im_h < 0:\n                logger.warning('Illegal width: {} or height: {} in annotation, '\n                               'and im_id: {} will be ignored'.format(\n                    im_w, im_h, img_id))\n                continue\n\n            coco_rec = {\n                'im_file': im_path,\n                'im_id': np.array([img_id]),\n                'h': im_h,\n                'w': im_w,\n            } if 'image' in self.data_fields else {}\n\n            if not self.load_image_only:\n                ins_anno_ids = coco.getAnnIds(\n                    imgIds=[img_id], iscrowd=None if self.load_crowd else False)\n                instances = coco.loadAnns(ins_anno_ids)\n\n                bboxes = []\n                is_rbox_anno = False\n                for inst in instances:\n                    # check gt bbox\n                    if inst.get('ignore', False):\n                        continue\n                    if 'bbox' not in inst:\n                        continue\n                    if not any(np.array(inst['bbox'])):\n                        continue\n\n                    x1, y1, box_w, box_h = inst['bbox']\n                    x2 = x1 + box_w\n                    y2 = y1 + box_h\n                    eps = 1e-5\n                    if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps:\n                        inst['clean_bbox'] = [\n                            round(float(x), 3) for x in [x1, y1, x2, y2]\n                        ]\n                        bboxes.append(inst)\n                    else:\n                        logger.warning(\n
bbox in annotations: im_id: {}, '\n                            'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format(\n                                img_id, float(inst['area']), x1, y1, x2, y2))\n\n                num_bbox = len(bboxes)\n                if num_bbox <= 0 and not self.allow_empty:\n                    continue\n                elif num_bbox <= 0:\n                    is_empty = True\n\n                gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)\n                gt_class = np.zeros((num_bbox, 1), dtype=np.int32)\n                is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)\n                gt_poly = [None] * num_bbox\n                gt_track_id = -np.ones((num_bbox, 1), dtype=np.int32)\n\n                has_segmentation = False\n                has_track_id = False\n                for i, box in enumerate(bboxes):\n                    catid = box['category_id']\n                    gt_class[i][0] = self.catid2clsid[catid]\n                    gt_bbox[i, :] = box['clean_bbox']\n                    is_crowd[i][0] = box['iscrowd']\n                    # check RLE format \n                    if 'segmentation' in box and box['iscrowd'] == 1:\n                        gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]\n                    elif 'segmentation' in box and box['segmentation']:\n                        if not np.array(\n                                box['segmentation'],\n                                dtype=object).size > 0 and not self.allow_empty:\n                            bboxes.pop(i)\n                            gt_poly.pop(i)\n                            np.delete(is_crowd, i)\n                            np.delete(gt_class, i)\n                            np.delete(gt_bbox, i)\n                        else:\n                            gt_poly[i] = box['segmentation']\n                        has_segmentation = True\n\n                    if 'track_id' in box:\n                        gt_track_id[i][0] = box['track_id']\n                        has_track_id = True\n\n                if has_segmentation and not any(\n                        gt_poly) and not self.allow_empty:\n                    continue\n\n                gt_rec = {\n                    'is_crowd': is_crowd,\n                    'gt_class': gt_class,\n                    'gt_bbox': gt_bbox,\n                    'gt_poly': gt_poly,\n                }\n                if has_track_id:\n                    gt_rec.update({'gt_track_id': gt_track_id})\n\n                for k, v in gt_rec.items():\n                    if k in self.data_fields:\n                        coco_rec[k] = v\n\n                # TODO: remove load_semantic\n                if self.load_semantic and 'semantic' in self.data_fields:\n                    seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps',\n                                            'train2017', im_fname[:-3] + 'png')\n                    coco_rec.update({'semantic': seg_path})\n\n            logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format(\n                im_path, img_id, im_h, im_w))\n            if is_empty:\n                empty_records.append(coco_rec)\n            else:\n                records.append(coco_rec)\n            ct += 1\n            if self.sample_num > 0 and ct >= self.sample_num:\n                break\n        assert ct > 0, 'not found any coco record in %s' % (anno_path)\n        logger.info('Load [{} samples valid, {} samples invalid] in file {}.'.\n                    format(ct, len(img_ids) 
- ct, anno_path))\n        if self.allow_empty and len(empty_records) > 0:\n            empty_records = self._sample_empty(empty_records, len(records))\n            records += empty_records\n        self.roidbs = records\n\n\n@register\n@serializable\nclass SlicedCOCODataSet(COCODataSet):\n    \"\"\"Sliced COCODataSet\"\"\"\n\n    def __init__(\n            self,\n            dataset_dir=None,\n            image_dir=None,\n            anno_path=None,\n            data_fields=['image'],\n            sample_num=-1,\n            load_crowd=False,\n            allow_empty=False,\n            empty_ratio=1.,\n            repeat=1,\n            sliced_size=[640, 640],\n            overlap_ratio=[0.25, 0.25], ):\n        super(SlicedCOCODataSet, self).__init__(\n            dataset_dir=dataset_dir,\n            image_dir=image_dir,\n            anno_path=anno_path,\n            data_fields=data_fields,\n            sample_num=sample_num,\n            load_crowd=load_crowd,\n            allow_empty=allow_empty,\n            empty_ratio=empty_ratio,\n            repeat=repeat, )\n        self.sliced_size = sliced_size\n        self.overlap_ratio = overlap_ratio\n\n    def parse_dataset(self):\n        anno_path = os.path.join(self.dataset_dir, self.anno_path)\n        image_dir = os.path.join(self.dataset_dir, self.image_dir)\n\n        assert anno_path.endswith('.json'), \\\n            'invalid coco annotation file: ' + anno_path\n        from pycocotools.coco import COCO\n        coco = COCO(anno_path)\n        img_ids = coco.getImgIds()\n        img_ids.sort()\n        cat_ids = coco.getCatIds()\n        records = []\n        empty_records = []\n        ct = 0\n        ct_sub = 0\n\n        self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})\n        self.cname2cid = dict({\n            coco.loadCats(catid)[0]['name']: clsid\n            for catid, clsid in self.catid2clsid.items()\n        })\n\n        if 'annotations' not in coco.dataset:\n            self.load_image_only = True\n            logger.warning('Annotation file: {} does not contains ground truth '\n                           'and load image information only.'.format(anno_path))\n        try:\n            import sahi\n            from sahi.slicing import slice_image\n        except Exception as e:\n            logger.error(\n                'sahi not found, plaese install sahi. 
'\n                'for example: `pip install sahi`, see https://github.com/obss/sahi.'\n            )\n            raise e\n\n        sub_img_ids = 0\n        for img_id in img_ids:\n            img_anno = coco.loadImgs([img_id])[0]\n            im_fname = img_anno['file_name']\n            im_w = float(img_anno['width'])\n            im_h = float(img_anno['height'])\n\n            im_path = os.path.join(image_dir,\n                                   im_fname) if image_dir else im_fname\n            is_empty = False\n            if not os.path.exists(im_path):\n                logger.warning('Illegal image file: {}, and it will be '\n                               'ignored'.format(im_path))\n                continue\n\n            if im_w < 0 or im_h < 0:\n                logger.warning('Illegal width: {} or height: {} in annotation, '\n                               'and im_id: {} will be ignored'.format(\n                    im_w, im_h, img_id))\n                continue\n\n            slice_image_result = sahi.slicing.slice_image(\n                image=im_path,\n                slice_height=self.sliced_size[0],\n                slice_width=self.sliced_size[1],\n                overlap_height_ratio=self.overlap_ratio[0],\n                overlap_width_ratio=self.overlap_ratio[1])\n\n            sub_img_num = len(slice_image_result)\n            for _ind in range(sub_img_num):\n                im = slice_image_result.images[_ind]\n                coco_rec = {\n                    'image': im,\n                    'im_id': np.array([sub_img_ids + _ind]),\n                    'h': im.shape[0],\n                    'w': im.shape[1],\n                    'ori_im_id': np.array([img_id]),\n                    'st_pix': np.array(\n                        slice_image_result.starting_pixels[_ind],\n                        dtype=np.float32),\n                    'is_last': 1 if _ind == sub_img_num - 1 else 0,\n                } if 'image' in self.data_fields else {}\n                records.append(coco_rec)\n            ct_sub += sub_img_num\n            ct += 1\n            if self.sample_num > 0 and ct >= self.sample_num:\n                break\n        assert ct > 0, 'not found any coco record in %s' % (anno_path)\n        logger.info('{} samples and slice to {} sub_samples in file {}'.format(\n            ct, ct_sub, anno_path))\n        if self.allow_empty and len(empty_records) > 0:\n            empty_records = self._sample_empty(empty_records, len(records))\n            records += empty_records\n        self.roidbs = records\n\n\n@register\n@serializable\nclass SemiCOCODataSet(COCODataSet):\n    \"\"\"Semi-COCODataSet used for supervised and unsupervised dataSet\"\"\"\n\n    def __init__(self,\n                 dataset_dir=None,\n                 image_dir=None,\n                 anno_path=None,\n                 data_fields=['image'],\n                 sample_num=-1,\n                 load_crowd=False,\n                 allow_empty=False,\n                 empty_ratio=1.,\n                 repeat=1,\n                 supervised=True):\n        super(SemiCOCODataSet, self).__init__(\n            dataset_dir, image_dir, anno_path, data_fields, sample_num,\n            load_crowd, allow_empty, empty_ratio, repeat)\n        self.supervised = supervised\n        self.length = -1  # defalut -1 means all\n\n    def parse_dataset(self):\n        anno_path = os.path.join(self.dataset_dir, self.anno_path)\n        image_dir = os.path.join(self.dataset_dir, self.image_dir)\n\n        assert 
anno_path.endswith('.json'), \\\n            'invalid coco annotation file: ' + anno_path\n        from pycocotools.coco import COCO\n        coco = COCO(anno_path)\n        img_ids = coco.getImgIds()\n        img_ids.sort()\n        cat_ids = coco.getCatIds()\n        records = []\n        empty_records = []\n        ct = 0\n\n        self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})\n        self.cname2cid = dict({\n            coco.loadCats(catid)[0]['name']: clsid\n            for catid, clsid in self.catid2clsid.items()\n        })\n\n        if 'annotations' not in coco.dataset or self.supervised == False:\n            self.load_image_only = True\n            logger.warning('Annotation file: {} does not contains ground truth '\n                           'and load image information only.'.format(anno_path))\n\n        for img_id in img_ids:\n            img_anno = coco.loadImgs([img_id])[0]\n            im_fname = img_anno['file_name']\n            im_w = float(img_anno['width'])\n            im_h = float(img_anno['height'])\n\n            im_path = os.path.join(image_dir,\n                                   im_fname) if image_dir else im_fname\n            is_empty = False\n            if not os.path.exists(im_path):\n                logger.warning('Illegal image file: {}, and it will be '\n                               'ignored'.format(im_path))\n                continue\n\n            if im_w < 0 or im_h < 0:\n                logger.warning('Illegal width: {} or height: {} in annotation, '\n                               'and im_id: {} will be ignored'.format(\n                    im_w, im_h, img_id))\n                continue\n\n            coco_rec = {\n                'im_file': im_path,\n                'im_id': np.array([img_id]),\n                'h': im_h,\n                'w': im_w,\n            } if 'image' in self.data_fields else {}\n\n            if not self.load_image_only:\n                ins_anno_ids = coco.getAnnIds(\n                    imgIds=[img_id], iscrowd=None if self.load_crowd else False)\n                instances = coco.loadAnns(ins_anno_ids)\n\n                bboxes = []\n                is_rbox_anno = False\n                for inst in instances:\n                    # check gt bbox\n                    if inst.get('ignore', False):\n                        continue\n                    if 'bbox' not in inst.keys():\n                        continue\n                    else:\n                        if not any(np.array(inst['bbox'])):\n                            continue\n\n                    x1, y1, box_w, box_h = inst['bbox']\n                    x2 = x1 + box_w\n                    y2 = y1 + box_h\n                    eps = 1e-5\n                    if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps:\n                        inst['clean_bbox'] = [\n                            round(float(x), 3) for x in [x1, y1, x2, y2]\n                        ]\n                        bboxes.append(inst)\n                    else:\n                        logger.warning(\n                            'Found an invalid bbox in annotations: im_id: {}, '\n                            'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format(\n                                img_id, float(inst['area']), x1, y1, x2, y2))\n\n                num_bbox = len(bboxes)\n                if num_bbox <= 0 and not self.allow_empty:\n                    continue\n                elif num_bbox <= 0:\n                    is_empty = True\n\n                
gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)\n                gt_class = np.zeros((num_bbox, 1), dtype=np.int32)\n                is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)\n                gt_poly = [None] * num_bbox\n\n                has_segmentation = False\n                for i, box in enumerate(bboxes):\n                    catid = box['category_id']\n                    gt_class[i][0] = self.catid2clsid[catid]\n                    gt_bbox[i, :] = box['clean_bbox']\n                    is_crowd[i][0] = box['iscrowd']\n                    # check RLE format \n                    if 'segmentation' in box and box['iscrowd'] == 1:\n                        gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]\n                    elif 'segmentation' in box and box['segmentation']:\n                        if not np.array(box['segmentation']\n                                        ).size > 0 and not self.allow_empty:\n                            bboxes.pop(i)\n                            gt_poly.pop(i)\n                            np.delete(is_crowd, i)\n                            np.delete(gt_class, i)\n                            np.delete(gt_bbox, i)\n                        else:\n                            gt_poly[i] = box['segmentation']\n                        has_segmentation = True\n\n                if has_segmentation and not any(\n                        gt_poly) and not self.allow_empty:\n                    continue\n\n                gt_rec = {\n                    'is_crowd': is_crowd,\n                    'gt_class': gt_class,\n                    'gt_bbox': gt_bbox,\n                    'gt_poly': gt_poly,\n                }\n\n                for k, v in gt_rec.items():\n                    if k in self.data_fields:\n                        coco_rec[k] = v\n\n                # TODO: remove load_semantic\n                if self.load_semantic and 'semantic' in self.data_fields:\n                    seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps',\n                                            'train2017', im_fname[:-3] + 'png')\n                    coco_rec.update({'semantic': seg_path})\n\n            logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format(\n                im_path, img_id, im_h, im_w))\n            if is_empty:\n                empty_records.append(coco_rec)\n            else:\n                records.append(coco_rec)\n            ct += 1\n            if self.sample_num > 0 and ct >= self.sample_num:\n                break\n        assert ct > 0, 'not found any coco record in %s' % (anno_path)\n        logger.info('Load [{} samples valid, {} samples invalid] in file {}.'.\n                    format(ct, len(img_ids) - ct, anno_path))\n        if self.allow_empty and len(empty_records) > 0:\n            empty_records = self._sample_empty(empty_records, len(records))\n            records += empty_records\n        self.roidbs = records\n\n        if self.supervised:\n            logger.info(f'Use {len(self.roidbs)} sup_samples data as LABELED')\n        else:\n            if self.length > 0:  # unsup length will be decide by sup length\n                all_roidbs = self.roidbs.copy()\n                selected_idxs = [\n                    np.random.choice(len(all_roidbs))\n                    for _ in range(self.length)\n                ]\n                self.roidbs = [all_roidbs[i] for i in selected_idxs]\n            logger.info(\n                f'Use {len(self.roidbs)} unsup_samples data as UNLABELED')\n\n    def 
__getitem__(self, idx):\n        n = len(self.roidbs)\n        if self.repeat > 1:\n            idx %= n\n        # data batch\n        roidb = copy.deepcopy(self.roidbs[idx])\n        if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch:\n            idx = np.random.randint(n)\n            roidb = [roidb, copy.deepcopy(self.roidbs[idx])]\n        elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch:\n            idx = np.random.randint(n)\n            roidb = [roidb, copy.deepcopy(self.roidbs[idx])]\n        elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch:\n            roidb = [roidb, ] + [\n                copy.deepcopy(self.roidbs[np.random.randint(n)])\n                for _ in range(4)\n            ]\n        if isinstance(roidb, Sequence):\n            for r in roidb:\n                r['curr_iter'] = self._curr_iter\n        else:\n            roidb['curr_iter'] = self._curr_iter\n        self._curr_iter += 1\n\n        return self.transform(roidb)\n\n\n# for PaddleX\n@register\n@serializable\nclass COCODetDataset(COCODataSet):\n    pass\n\n\n# for PaddleX\n@register\n@serializable\nclass COCOInstSegDataset(COCODataSet):\n    pass\n"
  },
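  {
    "path": "docs/examples/coco_empty_ratio_demo.py",
    "content": "\"\"\"Hypothetical example (an illustrative addition, not part of the original\ncodebase): a minimal standalone sketch of how COCODataSet._sample_empty in\ncoco.py caps the number of empty (background-only) records. It assumes only\nthe formula visible there: at most num * empty_ratio / (1 - empty_ratio)\nempty records are kept, so empty samples make up roughly empty_ratio of the\nfinal roidbs.\"\"\"\nimport random\n\n\ndef sample_empty(records, num_valid, empty_ratio=0.3):\n    # mirror of COCODataSet._sample_empty: a ratio outside [0, 1) disables\n    # subsampling and keeps every empty record\n    if empty_ratio < 0. or empty_ratio >= 1.:\n        return records\n    sample_num = min(\n        int(num_valid * empty_ratio / (1 - empty_ratio)), len(records))\n    return random.sample(records, sample_num)\n\n\nif __name__ == '__main__':\n    empty = [{'im_id': i} for i in range(100)]\n    kept = sample_empty(empty, num_valid=70, empty_ratio=0.3)\n    # 70 * 0.3 / 0.7 = 30, so the final mix would be 70 valid + 30 empty\n    print(len(kept))\n"
  },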
  {
    "path": "ppdet/data/source/culane.py",
    "content": "from ppdet.core.workspace import register, serializable\nimport cv2\nimport os\nimport tarfile\nimport numpy as np\nimport os.path as osp\nfrom ppdet.data.source.dataset import DetDataset\nfrom imgaug.augmentables.lines import LineStringsOnImage\nfrom imgaug.augmentables.segmaps import SegmentationMapsOnImage\nfrom ppdet.data.culane_utils import lane_to_linestrings\nimport pickle as pkl\nfrom ppdet.utils.logger import setup_logger\ntry:\n    from collections.abc import Sequence\nexcept Exception:\n    from collections import Sequence\nfrom .dataset import DetDataset, _make_dataset, _is_valid_file\nfrom ppdet.utils.download import download_dataset\n\nlogger = setup_logger(__name__)\n\n\n@register\n@serializable\nclass CULaneDataSet(DetDataset):\n    def __init__(\n            self,\n            dataset_dir,\n            cut_height,\n            list_path,\n            split='train',\n            data_fields=['image'],\n            video_file=None,\n            frame_rate=-1, ):\n        super(CULaneDataSet, self).__init__(\n            dataset_dir=dataset_dir,\n            cut_height=cut_height,\n            split=split,\n            data_fields=data_fields)\n        self.dataset_dir = dataset_dir\n        self.list_path = osp.join(dataset_dir, list_path)\n        self.cut_height = cut_height\n        self.data_fields = data_fields\n        self.split = split\n        self.training = 'train' in split\n        self.data_infos = []\n        self.video_file = video_file\n        self.frame_rate = frame_rate\n        self._imid2path = {}\n        self.predict_dir = None\n\n    def __len__(self):\n        return len(self.data_infos)\n\n    def check_or_download_dataset(self):\n        if not osp.exists(self.dataset_dir):\n            download_dataset(\"dataset\", dataset=\"culane\")\n            # extract .tar files in self.dataset_dir\n            for fname in os.listdir(self.dataset_dir):\n                logger.info(\"Decompressing {}...\".format(fname))\n                # ignore .* files\n                if fname.startswith('.'):\n                    continue\n                if fname.find('.tar.gz') >= 0:\n                    with tarfile.open(osp.join(self.dataset_dir, fname)) as tf:\n                        tf.extractall(path=self.dataset_dir)\n        logger.info(\"Dataset files are ready.\")\n\n    def parse_dataset(self):\n        logger.info('Loading CULane annotations...')\n        if self.predict_dir is not None:\n            logger.info('switch to predict mode')\n            return\n        # Waiting for the dataset to load is tedious, let's cache it\n        os.makedirs('cache', exist_ok=True)\n        cache_path = 'cache/culane_paddle_{}.pkl'.format(self.split)\n        if os.path.exists(cache_path):\n            with open(cache_path, 'rb') as cache_file:\n                self.data_infos = pkl.load(cache_file)\n                self.max_lanes = max(\n                    len(anno['lanes']) for anno in self.data_infos)\n                return\n\n        with open(self.list_path) as list_file:\n            for line in list_file:\n                infos = self.load_annotation(line.split())\n                self.data_infos.append(infos)\n\n        # cache data infos to file\n        with open(cache_path, 'wb') as cache_file:\n            pkl.dump(self.data_infos, cache_file)\n\n    def load_annotation(self, line):\n        infos = {}\n        img_line = line[0]\n        img_line = img_line[1 if img_line[0] == '/' else 0::]\n        img_path = 
os.path.join(self.dataset_dir, img_line)\n        infos['img_name'] = img_line\n        infos['img_path'] = img_path\n        if len(line) > 1:\n            mask_line = line[1]\n            mask_line = mask_line[1 if mask_line[0] == '/' else 0::]\n            mask_path = os.path.join(self.dataset_dir, mask_line)\n            infos['mask_path'] = mask_path\n\n        if len(line) > 2:\n            exist_list = [int(l) for l in line[2:]]\n            infos['lane_exist'] = np.array(exist_list)\n\n        anno_path = img_path[:\n                             -3] + 'lines.txt'  # remove suffix jpg and add lines.txt\n        with open(anno_path, 'r') as anno_file:\n            data = [\n                list(map(float, line.split())) for line in anno_file.readlines()\n            ]\n        lanes = [[(lane[i], lane[i + 1]) for i in range(0, len(lane), 2)\n                  if lane[i] >= 0 and lane[i + 1] >= 0] for lane in data]\n        lanes = [list(set(lane)) for lane in lanes]  # remove duplicated points\n        lanes = [lane for lane in lanes\n                 if len(lane) > 2]  # remove lanes with less than 3 points\n\n        lanes = [sorted(\n            lane, key=lambda x: x[1]) for lane in lanes]  # sort by y\n        infos['lanes'] = lanes\n\n        return infos\n\n    def set_images(self, images):\n        self.predict_dir = images\n        self.data_infos = self._load_images()\n\n    def _find_images(self):\n        predict_dir = self.predict_dir\n        if not isinstance(predict_dir, Sequence):\n            predict_dir = [predict_dir]\n        images = []\n        for im_dir in predict_dir:\n            if os.path.isdir(im_dir):\n                im_dir = os.path.join(self.predict_dir, im_dir)\n                images.extend(_make_dataset(im_dir))\n            elif os.path.isfile(im_dir) and _is_valid_file(im_dir):\n                images.append(im_dir)\n        return images\n\n    def _load_images(self):\n        images = self._find_images()\n        ct = 0\n        records = []\n        for image in images:\n            assert image != '' and os.path.isfile(image), \\\n                    \"Image {} not found\".format(image)\n            if self.sample_num > 0 and ct >= self.sample_num:\n                break\n            rec = {\n                'im_id': np.array([ct]),\n                \"img_path\": os.path.abspath(image),\n                \"img_name\": os.path.basename(image),\n                \"lanes\": []\n            }\n            self._imid2path[ct] = image\n            ct += 1\n            records.append(rec)\n        assert len(records) > 0, \"No image file found\"\n        return records\n\n    def get_imid2path(self):\n        return self._imid2path\n\n    def __getitem__(self, idx):\n        data_info = self.data_infos[idx]\n        img = cv2.imread(data_info['img_path'])\n        img = img[self.cut_height:, :, :]\n        sample = data_info.copy()\n        sample.update({'image': img})\n        img_org = sample['image']\n\n        if self.training:\n            label = cv2.imread(sample['mask_path'], cv2.IMREAD_UNCHANGED)\n            if len(label.shape) > 2:\n                label = label[:, :, 0]\n            label = label.squeeze()\n            label = label[self.cut_height:, :]\n            sample.update({'mask': label})\n            if self.cut_height != 0:\n                new_lanes = []\n                for i in sample['lanes']:\n                    lanes = []\n                    for p in i:\n                        lanes.append((p[0], p[1] - 
self.cut_height))\n                    new_lanes.append(lanes)\n                sample.update({'lanes': new_lanes})\n\n            sample['mask'] = SegmentationMapsOnImage(\n                sample['mask'], shape=img_org.shape)\n\n        sample['full_img_path'] = data_info['img_path']\n        sample['img_name'] = data_info['img_name']\n        sample['im_id'] = np.array([idx])\n\n        sample['image'] = sample['image'].copy().astype(np.uint8)\n        sample['lanes'] = lane_to_linestrings(sample['lanes'])\n        sample['lanes'] = LineStringsOnImage(\n            sample['lanes'], shape=img_org.shape)\n        sample['seg'] = np.zeros(img_org.shape)\n\n        return sample\n"
  },
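  {
    "path": "docs/examples/culane_lane_parse_demo.py",
    "content": "\"\"\"Hypothetical example (an illustrative addition, not part of the original\ncodebase): a standalone sketch of the lane-parsing step in\nCULaneDataSet.load_annotation in culane.py. It assumes only the logic\nvisible there: a '*.lines.txt' row holds flat 'x y x y ...' pairs; negative\ncoordinates are padding, duplicated points are dropped, a lane must keep\nmore than two points, and points are sorted by y.\"\"\"\n\n\ndef parse_lane(raw_line):\n    vals = list(map(float, raw_line.split()))\n    # group the flat values into (x, y) points, skipping padding points\n    # that CULane marks with negative coordinates\n    lane = [(vals[i], vals[i + 1]) for i in range(0, len(vals), 2)\n            if vals[i] >= 0 and vals[i + 1] >= 0]\n    lane = list(set(lane))  # remove duplicated points\n    if len(lane) <= 2:  # same filter as the dataset: keep 3+ point lanes\n        return None\n    return sorted(lane, key=lambda p: p[1])  # sort by y\n\n\nif __name__ == '__main__':\n    row = '10 590 20 580 30 570 30 570 -2 -2'\n    # duplicates and (-2, -2) padding are dropped; the result is y-sorted:\n    # [(30.0, 570.0), (20.0, 580.0), (10.0, 590.0)]\n    print(parse_lane(row))\n"
  },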
  {
    "path": "ppdet/data/source/dataset.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n# \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport os\nimport copy\nimport numpy as np\ntry:\n    from collections.abc import Sequence\nexcept Exception:\n    from collections import Sequence\nfrom pycocotools.coco import COCO\nfrom paddle.io import Dataset\nfrom ppdet.core.workspace import register, serializable\nfrom ppdet.utils.download import get_dataset_path\nfrom ppdet.data import source\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n\n@serializable\nclass DetDataset(Dataset):\n    \"\"\"\n    Load detection dataset.\n\n    Args:\n        dataset_dir (str): root directory for dataset.\n        image_dir (str): directory for images.\n        anno_path (str): annotation file path.\n        data_fields (list): key name of data dictionary, at least have 'image'.\n        sample_num (int): number of samples to load, -1 means all.\n        use_default_label (bool): whether to load default label list.\n        repeat (int): repeat times for dataset, use in benchmark.\n    \"\"\"\n\n    def __init__(self,\n                 dataset_dir=None,\n                 image_dir=None,\n                 anno_path=None,\n                 data_fields=['image'],\n                 sample_num=-1,\n                 use_default_label=None,\n                 repeat=1,\n                 **kwargs):\n        super(DetDataset, self).__init__()\n        self.dataset_dir = dataset_dir if dataset_dir is not None else ''\n        self.anno_path = anno_path\n        self.image_dir = image_dir if image_dir is not None else ''\n        self.data_fields = data_fields\n        self.sample_num = sample_num\n        self.use_default_label = use_default_label\n        self.repeat = repeat\n        self._epoch = 0\n        self._curr_iter = 0\n\n    def __len__(self, ):\n        return len(self.roidbs) * self.repeat\n\n    def __call__(self, *args, **kwargs):\n        return self\n\n    def __getitem__(self, idx):\n        n = len(self.roidbs)\n        if self.repeat > 1:\n            idx %= n\n        # data batch\n        roidb = copy.deepcopy(self.roidbs[idx])\n        if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch:\n            idx = np.random.randint(n)\n            roidb = [roidb, copy.deepcopy(self.roidbs[idx])]\n        elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch:\n            idx = np.random.randint(n)\n            roidb = [roidb, copy.deepcopy(self.roidbs[idx])]\n        elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch:\n            roidb = [roidb, ] + [\n                copy.deepcopy(self.roidbs[np.random.randint(n)])\n                for _ in range(4)\n            ]\n        elif self.pre_img_epoch == 0 or self._epoch < self.pre_img_epoch:\n            # Add previous image as input, only used in CenterTrack\n            idx_pre_img = idx - 1\n            if idx_pre_img < 0:\n                idx_pre_img = idx + 1\n      
      roidb = [roidb, ] + [copy.deepcopy(self.roidbs[idx_pre_img])]\n        if isinstance(roidb, Sequence):\n            for r in roidb:\n                r['curr_iter'] = self._curr_iter\n                r['curr_epoch'] = self._epoch\n        else:\n            roidb['curr_iter'] = self._curr_iter\n            roidb['curr_epoch'] = self._epoch\n        self._curr_iter += 1\n        \n        if self.transform_schedulers:\n            assert isinstance(self.transform_schedulers, list)\n            if isinstance(roidb, Sequence):\n                for r in roidb:\n                    r['transform_schedulers'] = self.transform_schedulers\n            else:\n                roidb['transform_schedulers'] = self.transform_schedulers\n        \n        return self.transform(roidb)\n\n    def check_or_download_dataset(self):\n        self.dataset_dir = get_dataset_path(self.dataset_dir, self.anno_path,\n                                            self.image_dir)\n\n    def set_kwargs(self, **kwargs):\n        self.mixup_epoch = kwargs.get('mixup_epoch', -1)\n        self.cutmix_epoch = kwargs.get('cutmix_epoch', -1)\n        self.mosaic_epoch = kwargs.get('mosaic_epoch', -1)\n        self.pre_img_epoch = kwargs.get('pre_img_epoch', -1)\n        self.transform_schedulers = kwargs.get('transform_schedulers', None)\n\n    def set_transform(self, transform):\n        self.transform = transform\n\n    def set_epoch(self, epoch_id):\n        self._epoch = epoch_id\n\n    def parse_dataset(self, ):\n        raise NotImplementedError(\n            \"Need to implement parse_dataset method of Dataset\")\n\n    def get_anno(self):\n        if self.anno_path is None:\n            return\n        return os.path.join(self.dataset_dir, self.anno_path)\n\n\ndef _is_valid_file(f, extensions=('.jpg', '.jpeg', '.png', '.bmp')):\n    return f.lower().endswith(extensions)\n\n\ndef _make_dataset(dir):\n    dir = os.path.expanduser(dir)\n    if not os.path.isdir(dir):\n        # raising a bare string is a TypeError; raise a proper exception\n        raise ValueError('{} should be a dir'.format(dir))\n    images = []\n    for root, _, fnames in sorted(os.walk(dir, followlinks=True)):\n        for fname in sorted(fnames):\n            path = os.path.join(root, fname)\n            if _is_valid_file(path):\n                images.append(path)\n    return images\n\n\n@register\n@serializable\nclass ImageFolder(DetDataset):\n    def __init__(self,\n                 dataset_dir=None,\n                 image_dir=None,\n                 anno_path=None,\n                 sample_num=-1,\n                 use_default_label=None,\n                 **kwargs):\n        super(ImageFolder, self).__init__(\n            dataset_dir,\n            image_dir,\n            anno_path,\n            sample_num=sample_num,\n            use_default_label=use_default_label)\n        self._imid2path = {}\n        self.roidbs = None\n        self.sample_num = sample_num\n\n    def check_or_download_dataset(self):\n        return\n\n    def get_anno(self):\n        if self.anno_path is None:\n            return\n        if self.dataset_dir:\n            return os.path.join(self.dataset_dir, self.anno_path)\n        else:\n            return self.anno_path\n\n    def parse_dataset(self, ):\n        if not self.roidbs:\n            self.roidbs = self._load_images()\n\n    def _parse(self):\n        image_dir = self.image_dir\n        if not isinstance(image_dir, Sequence):\n            image_dir = [image_dir]\n        images = []\n        for im_dir in image_dir:\n            if os.path.isdir(im_dir):\n                im_dir = 
os.path.join(self.dataset_dir, im_dir)\n                images.extend(_make_dataset(im_dir))\n            elif os.path.isfile(im_dir) and _is_valid_file(im_dir):\n                images.append(im_dir)\n        return images\n    \n    def get_images(self):\n        images_path = []\n        coco = COCO(os.path.join(self.dataset_dir, self.anno_path))\n        imgIds = coco.getImgIds(catIds=[])\n        for imgId in imgIds:\n            filename = coco.loadImgs(imgId)[0][\"file_name\"]\n            images_path.append(os.path.join(self.dataset_dir, self.image_dir, filename))\n        return images_path\n\n    def _load_images(self, do_eval=False):\n        images = self._parse()\n        ct = 0\n        records = []\n        anno_file = self.get_anno()\n        coco = COCO(anno_file)\n        for image in images:\n            assert image != '' and os.path.isfile(image), \\\n                    \"Image {} not found\".format(image)\n            if self.sample_num > 0 and ct >= self.sample_num:\n                break\n            if do_eval:\n                image_id = self.get_image_id(image, coco)\n                ct = image_id\n            rec = {'im_id': np.array([ct]), 'im_file': image}\n            self._imid2path[ct] = image\n            ct += 1\n            records.append(rec)\n        assert len(records) > 0, \"No image file found\"\n        return records\n    \n    def get_image_id(self, image, coco):\n        image_ids = coco.getImgIds()\n        for image_id in image_ids:\n            img_info = coco.loadImgs(image_id)[0]\n            if img_info['file_name'] in image:\n                return image_id\n            else:\n                continue\n\n    def get_imid2path(self):\n        return self._imid2path\n\n    def set_images(self, images, do_eval=False):\n        self.image_dir = images\n        self.roidbs = self._load_images(do_eval=do_eval)\n\n    def set_slice_images(self,\n                         images,\n                         slice_size=[640, 640],\n                         overlap_ratio=[0.25, 0.25]):\n        self.image_dir = images\n        ori_records = self._load_images()\n        try:\n            import sahi\n            from sahi.slicing import slice_image\n        except Exception as e:\n            logger.error(\n                'sahi not found, please install sahi. 
'\n                'for example: `pip install sahi`, see https://github.com/obss/sahi.'\n            )\n            raise e\n\n        sub_img_ids = 0\n        ct = 0\n        ct_sub = 0\n        records = []\n        for i, ori_rec in enumerate(ori_records):\n            im_path = ori_rec['im_file']\n            slice_image_result = sahi.slicing.slice_image(\n                image=im_path,\n                slice_height=slice_size[0],\n                slice_width=slice_size[1],\n                overlap_height_ratio=overlap_ratio[0],\n                overlap_width_ratio=overlap_ratio[1])\n\n            sub_img_num = len(slice_image_result)\n            for _ind in range(sub_img_num):\n                im = slice_image_result.images[_ind]\n                rec = {\n                    'image': im,\n                    'im_id': np.array([sub_img_ids + _ind]),\n                    'h': im.shape[0],\n                    'w': im.shape[1],\n                    'ori_im_id': np.array([ori_rec['im_id'][0]]),\n                    'st_pix': np.array(\n                        slice_image_result.starting_pixels[_ind],\n                        dtype=np.float32),\n                    'is_last': 1 if _ind == sub_img_num - 1 else 0,\n                } if 'image' in self.data_fields else {}\n                records.append(rec)\n            ct_sub += sub_img_num\n            ct += 1\n        logger.info('{} samples sliced into {} sub_samples.'.format(\n            ct, ct_sub))\n        self.roidbs = records\n\n    def get_label_list(self):\n        # Only VOC dataset needs label list in ImageFolder\n        return self.anno_path\n\n\n@register\nclass CommonDataset(object):\n    def __init__(self, **dataset_args):\n        super(CommonDataset, self).__init__()\n        dataset_args = copy.deepcopy(dataset_args)\n        type = dataset_args.pop(\"name\")\n        self.dataset = getattr(source, type)(**dataset_args)\n\n    def __call__(self):\n        return self.dataset\n\n\n@register\nclass TrainDataset(CommonDataset):\n    pass\n\n\n@register\nclass EvalMOTDataset(CommonDataset):\n    pass\n\n\n@register\nclass TestMOTDataset(CommonDataset):\n    pass\n\n\n@register\nclass EvalDataset(CommonDataset):\n    pass\n\n\n@register\nclass TestDataset(CommonDataset):\n    pass\n"
  },
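  {
    "path": "docs/examples/detdataset_epoch_gating_demo.py",
    "content": "\"\"\"Hypothetical example (an illustrative addition, not part of the original\ncodebase): a standalone sketch of the epoch gating used by\nDetDataset.__getitem__ in dataset.py. Each mix augmentation stays active\nwhile the current epoch is below its *_epoch threshold; 0 means 'always\non', and the default -1 from set_kwargs disables it, since -1 != 0 and\nepoch < -1 never holds.\"\"\"\n\n\ndef mix_active(mix_epoch, epoch):\n    # the same predicate DetDataset.__getitem__ applies to mixup_epoch,\n    # cutmix_epoch and mosaic_epoch\n    return mix_epoch == 0 or epoch < mix_epoch\n\n\nif __name__ == '__main__':\n    print(mix_active(-1, 5))   # False: disabled by default\n    print(mix_active(0, 5))    # True: 0 keeps the augmentation always on\n    print(mix_active(10, 5))   # True: still within the first 10 epochs\n    print(mix_active(10, 12))  # False: past the threshold\n"
  },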
  {
    "path": "ppdet/data/source/keypoint_coco.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\"\"\"\nthis code is base on https://github.com/open-mmlab/mmpose\n\"\"\"\nimport os\nimport cv2\nimport numpy as np\nimport json\nimport copy\nimport pycocotools\nfrom pycocotools.coco import COCO\nfrom .dataset import DetDataset\nfrom ppdet.core.workspace import register, serializable\n\n\n@serializable\nclass KeypointBottomUpBaseDataset(DetDataset):\n    \"\"\"Base class for bottom-up datasets. \n\n    All datasets should subclass it.\n    All subclasses should overwrite:\n        Methods:`_get_imganno`\n\n    Args:\n        dataset_dir (str): Root path to the dataset.\n        anno_path (str): Relative path to the annotation file.\n        image_dir (str): Path to a directory where images are held.\n            Default: None.\n        num_joints (int): keypoint numbers\n        transform (composed(operators)): A sequence of data transforms.\n        shard (list): [rank, worldsize], the distributed env params\n        test_mode (bool): Store True when building test or\n            validation dataset. Default: False.\n    \"\"\"\n\n    def __init__(self,\n                 dataset_dir,\n                 image_dir,\n                 anno_path,\n                 num_joints,\n                 transform=[],\n                 shard=[0, 1],\n                 test_mode=False):\n        super().__init__(dataset_dir, image_dir, anno_path)\n        self.image_info = {}\n        self.ann_info = {}\n\n        self.img_prefix = os.path.join(dataset_dir, image_dir)\n        self.transform = transform\n        self.test_mode = test_mode\n\n        self.ann_info['num_joints'] = num_joints\n        self.img_ids = []\n\n    def parse_dataset(self):\n        pass\n\n    def __len__(self):\n        \"\"\"Get dataset length.\"\"\"\n        return len(self.img_ids)\n\n    def _get_imganno(self, idx):\n        \"\"\"Get anno for a single image.\"\"\"\n        raise NotImplementedError\n\n    def __getitem__(self, idx):\n        \"\"\"Prepare image for training given the index.\"\"\"\n        records = copy.deepcopy(self._get_imganno(idx))\n        records['image'] = cv2.imread(records['image_file'])\n        records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB)\n        if 'mask' in records:\n            records['mask'] = (records['mask'] + 0).astype('uint8')\n        records = self.transform(records)\n        return records\n\n    def parse_dataset(self):\n        return\n\n\n@register\n@serializable\nclass KeypointBottomUpCocoDataset(KeypointBottomUpBaseDataset):\n    \"\"\"COCO dataset for bottom-up pose estimation. 
\n\n    The dataset loads raw features and apply specified transforms\n    to return a dict containing the image tensors and other information.\n\n    COCO keypoint indexes::\n\n        0: 'nose',\n        1: 'left_eye',\n        2: 'right_eye',\n        3: 'left_ear',\n        4: 'right_ear',\n        5: 'left_shoulder',\n        6: 'right_shoulder',\n        7: 'left_elbow',\n        8: 'right_elbow',\n        9: 'left_wrist',\n        10: 'right_wrist',\n        11: 'left_hip',\n        12: 'right_hip',\n        13: 'left_knee',\n        14: 'right_knee',\n        15: 'left_ankle',\n        16: 'right_ankle'\n\n    Args:\n        dataset_dir (str): Root path to the dataset.\n        anno_path (str): Relative path to the annotation file.\n        image_dir (str): Path to a directory where images are held.\n            Default: None.\n        num_joints (int): keypoint numbers\n        transform (composed(operators)): A sequence of data transforms.\n        shard (list): [rank, worldsize], the distributed env params\n        test_mode (bool): Store True when building test or\n            validation dataset. Default: False.\n    \"\"\"\n\n    def __init__(self,\n                 dataset_dir,\n                 image_dir,\n                 anno_path,\n                 num_joints,\n                 transform=[],\n                 shard=[0, 1],\n                 test_mode=False,\n                 return_mask=True,\n                 return_bbox=True,\n                 return_area=True,\n                 return_class=True):\n        super().__init__(dataset_dir, image_dir, anno_path, num_joints,\n                         transform, shard, test_mode)\n\n        self.ann_file = os.path.join(dataset_dir, anno_path)\n        self.shard = shard\n        self.test_mode = test_mode\n        self.return_mask = return_mask\n        self.return_bbox = return_bbox\n        self.return_area = return_area\n        self.return_class = return_class\n\n    def parse_dataset(self):\n        self.coco = COCO(self.ann_file)\n\n        self.img_ids = self.coco.getImgIds()\n        if not self.test_mode:\n            self.img_ids_tmp = []\n            for img_id in self.img_ids:\n                ann_ids = self.coco.getAnnIds(imgIds=img_id)\n                anno = self.coco.loadAnns(ann_ids)\n                anno = [obj for obj in anno if obj['iscrowd'] == 0]\n                if len(anno) == 0:\n                    continue\n                self.img_ids_tmp.append(img_id)\n            self.img_ids = self.img_ids_tmp\n\n        blocknum = int(len(self.img_ids) / self.shard[1])\n        self.img_ids = self.img_ids[(blocknum * self.shard[0]):(blocknum * (\n            self.shard[0] + 1))]\n        self.num_images = len(self.img_ids)\n        self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs)\n        self.dataset_name = 'coco'\n\n        cat_ids = self.coco.getCatIds()\n        self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})\n        print('=> num_images: {}'.format(self.num_images))\n\n    @staticmethod\n    def _get_mapping_id_name(imgs):\n        \"\"\"\n        Args:\n            imgs (dict): dict of image info.\n\n        Returns:\n            tuple: Image name & id mapping dicts.\n\n            - id2name (dict): Mapping image id to name.\n            - name2id (dict): Mapping image name to id.\n        \"\"\"\n        id2name = {}\n        name2id = {}\n        for image_id, image in imgs.items():\n            file_name = image['file_name']\n            id2name[image_id] 
= file_name\n            name2id[file_name] = image_id\n\n        return id2name, name2id\n\n    def _get_imganno(self, idx):\n        \"\"\"Get anno for a single image.\n\n        Args:\n            idx (int): image idx\n\n        Returns:\n            dict: info for model training\n        \"\"\"\n        coco = self.coco\n        img_id = self.img_ids[idx]\n        ann_ids = coco.getAnnIds(imgIds=img_id)\n        anno = coco.loadAnns(ann_ids)\n\n        anno = [\n            obj for obj in anno\n            if obj['iscrowd'] == 0 and obj['num_keypoints'] > 0\n        ]\n\n        db_rec = {}\n        joints, orgsize = self._get_joints(anno, idx)\n        db_rec['gt_joints'] = joints\n        db_rec['im_shape'] = orgsize\n\n        if self.return_bbox:\n            db_rec['gt_bbox'] = self._get_bboxs(anno, idx)\n\n        if self.return_class:\n            db_rec['gt_class'] = self._get_labels(anno, idx)\n\n        if self.return_area:\n            db_rec['gt_areas'] = self._get_areas(anno, idx)\n\n        if self.return_mask:\n            db_rec['mask'] = self._get_mask(anno, idx)\n\n        db_rec['im_id'] = img_id\n        db_rec['image_file'] = os.path.join(self.img_prefix,\n                                            self.id2name[img_id])\n\n        return db_rec\n\n    def _get_joints(self, anno, idx):\n        \"\"\"Get joints for all people in an image.\"\"\"\n        num_people = len(anno)\n\n        joints = np.zeros(\n            (num_people, self.ann_info['num_joints'], 3), dtype=np.float32)\n\n        for i, obj in enumerate(anno):\n            joints[i, :self.ann_info['num_joints'], :3] = \\\n                np.array(obj['keypoints']).reshape([-1, 3])\n\n        img_info = self.coco.loadImgs(self.img_ids[idx])[0]\n        orgsize = np.array([img_info['height'], img_info['width'], 1])\n\n        return joints, orgsize\n\n    def _get_bboxs(self, anno, idx):\n        num_people = len(anno)\n        gt_bboxes = np.zeros((num_people, 4), dtype=np.float32)\n\n        for idx, obj in enumerate(anno):\n            if 'bbox' in obj:\n                gt_bboxes[idx, :] = obj['bbox']\n\n        gt_bboxes[:, 2] += gt_bboxes[:, 0]\n        gt_bboxes[:, 3] += gt_bboxes[:, 1]\n        return gt_bboxes\n\n    def _get_labels(self, anno, idx):\n        num_people = len(anno)\n        gt_labels = np.zeros((num_people, 1), dtype=np.float32)\n\n        for idx, obj in enumerate(anno):\n            if 'category_id' in obj:\n                catid = obj['category_id']\n                gt_labels[idx, 0] = self.catid2clsid[catid]\n        return gt_labels\n\n    def _get_areas(self, anno, idx):\n        num_people = len(anno)\n        gt_areas = np.zeros((num_people, ), dtype=np.float32)\n\n        for idx, obj in enumerate(anno):\n            if 'area' in obj:\n                gt_areas[idx, ] = obj['area']\n        return gt_areas\n\n    def _get_mask(self, anno, idx):\n        \"\"\"Get ignore masks to mask out losses.\"\"\"\n        coco = self.coco\n        img_info = coco.loadImgs(self.img_ids[idx])[0]\n\n        m = np.zeros((img_info['height'], img_info['width']), dtype=np.float32)\n\n        for obj in anno:\n            if 'segmentation' in obj:\n                if obj['iscrowd']:\n                    rle = pycocotools.mask.frPyObjects(obj['segmentation'],\n                                                       img_info['height'],\n                                                       img_info['width'])\n                    m += pycocotools.mask.decode(rle)\n                elif 
obj['num_keypoints'] == 0:\n                    rles = pycocotools.mask.frPyObjects(obj['segmentation'],\n                                                        img_info['height'],\n                                                        img_info['width'])\n                    for rle in rles:\n                        m += pycocotools.mask.decode(rle)\n\n        return m < 0.5\n\n\n@register\n@serializable\nclass KeypointBottomUpCrowdPoseDataset(KeypointBottomUpCocoDataset):\n    \"\"\"CrowdPose dataset for bottom-up pose estimation. \n\n    The dataset loads raw features and apply specified transforms\n    to return a dict containing the image tensors and other information.\n\n    CrowdPose keypoint indexes::\n\n        0: 'left_shoulder',\n        1: 'right_shoulder',\n        2: 'left_elbow',\n        3: 'right_elbow',\n        4: 'left_wrist',\n        5: 'right_wrist',\n        6: 'left_hip',\n        7: 'right_hip',\n        8: 'left_knee',\n        9: 'right_knee',\n        10: 'left_ankle',\n        11: 'right_ankle',\n        12: 'top_head',\n        13: 'neck'\n\n    Args:\n        dataset_dir (str): Root path to the dataset.\n        anno_path (str): Relative path to the annotation file.\n        image_dir (str): Path to a directory where images are held.\n            Default: None.\n        num_joints (int): keypoint numbers\n        transform (composed(operators)): A sequence of data transforms.\n        shard (list): [rank, worldsize], the distributed env params\n        test_mode (bool): Store True when building test or\n            validation dataset. Default: False.\n    \"\"\"\n\n    def __init__(self,\n                 dataset_dir,\n                 image_dir,\n                 anno_path,\n                 num_joints,\n                 transform=[],\n                 shard=[0, 1],\n                 test_mode=False):\n        super().__init__(dataset_dir, image_dir, anno_path, num_joints,\n                         transform, shard, test_mode)\n\n        self.ann_file = os.path.join(dataset_dir, anno_path)\n        self.shard = shard\n        self.test_mode = test_mode\n\n    def parse_dataset(self):\n        self.coco = COCO(self.ann_file)\n\n        self.img_ids = self.coco.getImgIds()\n        if not self.test_mode:\n            self.img_ids = [\n                img_id for img_id in self.img_ids\n                if len(self.coco.getAnnIds(\n                    imgIds=img_id, iscrowd=None)) > 0\n            ]\n        blocknum = int(len(self.img_ids) / self.shard[1])\n        self.img_ids = self.img_ids[(blocknum * self.shard[0]):(blocknum * (\n            self.shard[0] + 1))]\n        self.num_images = len(self.img_ids)\n        self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs)\n\n        self.dataset_name = 'crowdpose'\n        print('=> num_images: {}'.format(self.num_images))\n\n\n@serializable\nclass KeypointTopDownBaseDataset(DetDataset):\n    \"\"\"Base class for top_down datasets.\n\n    All datasets should subclass it.\n    All subclasses should overwrite:\n        Methods:`_get_db`\n\n    Args:\n        dataset_dir (str): Root path to the dataset.\n        image_dir (str): Path to a directory where images are held.\n        anno_path (str): Relative path to the annotation file.\n        num_joints (int): keypoint numbers\n        transform (composed(operators)): A sequence of data transforms.\n    \"\"\"\n\n    def __init__(self,\n                 dataset_dir,\n                 image_dir,\n                 anno_path,\n                 
num_joints,\n                 transform=[]):\n        super().__init__(dataset_dir, image_dir, anno_path)\n        self.image_info = {}\n        self.ann_info = {}\n\n        self.img_prefix = os.path.join(dataset_dir, image_dir)\n        self.transform = transform\n\n        self.ann_info['num_joints'] = num_joints\n        self.db = []\n\n    def __len__(self):\n        \"\"\"Get dataset length.\"\"\"\n        return len(self.db)\n\n    def _get_db(self):\n        \"\"\"Get a sample\"\"\"\n        raise NotImplementedError\n\n    def __getitem__(self, idx):\n        \"\"\"Prepare sample for training given the index.\"\"\"\n        records = copy.deepcopy(self.db[idx])\n        records['image'] = cv2.imread(records['image_file'], cv2.IMREAD_COLOR |\n                                      cv2.IMREAD_IGNORE_ORIENTATION)\n        records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB)\n        records['score'] = records['score'] if 'score' in records else 1\n        records = self.transform(records)\n        # print('records', records)\n        return records\n\n\n@register\n@serializable\nclass KeypointTopDownCocoDataset(KeypointTopDownBaseDataset):\n    \"\"\"COCO dataset for top-down pose estimation. \n\n    The dataset loads raw features and apply specified transforms\n    to return a dict containing the image tensors and other information.\n\n    COCO keypoint indexes:\n\n        0: 'nose',\n        1: 'left_eye',\n        2: 'right_eye',\n        3: 'left_ear',\n        4: 'right_ear',\n        5: 'left_shoulder',\n        6: 'right_shoulder',\n        7: 'left_elbow',\n        8: 'right_elbow',\n        9: 'left_wrist',\n        10: 'right_wrist',\n        11: 'left_hip',\n        12: 'right_hip',\n        13: 'left_knee',\n        14: 'right_knee',\n        15: 'left_ankle',\n        16: 'right_ankle'\n\n    Args:\n        dataset_dir (str): Root path to the dataset.\n        image_dir (str): Path to a directory where images are held.\n        anno_path (str): Relative path to the annotation file.\n        num_joints (int): Keypoint numbers\n        trainsize (list):[w, h] Image target size\n        transform (composed(operators)): A sequence of data transforms.\n        bbox_file (str): Path to a detection bbox file\n            Default: None.\n        use_gt_bbox (bool): Whether to use ground truth bbox\n            Default: True.\n        pixel_std (int): The pixel std of the scale\n            Default: 200.\n        image_thre (float): The threshold to filter the detection box\n            Default: 0.0.\n    \"\"\"\n\n    def __init__(self,\n                 dataset_dir,\n                 image_dir,\n                 anno_path,\n                 num_joints,\n                 trainsize,\n                 transform=[],\n                 bbox_file=None,\n                 use_gt_bbox=True,\n                 pixel_std=200,\n                 image_thre=0.0,\n                 center_scale=None):\n        super().__init__(dataset_dir, image_dir, anno_path, num_joints,\n                         transform)\n\n        self.bbox_file = bbox_file\n        self.use_gt_bbox = use_gt_bbox\n        self.trainsize = trainsize\n        self.pixel_std = pixel_std\n        self.image_thre = image_thre\n        self.center_scale = center_scale\n        self.dataset_name = 'coco'\n\n    def parse_dataset(self):\n        if self.use_gt_bbox:\n            self.db = self._load_coco_keypoint_annotations()\n        else:\n            self.db = self._load_coco_person_detection_results()\n\n    
def _load_coco_keypoint_annotations(self):\n        coco = COCO(self.get_anno())\n        img_ids = coco.getImgIds()\n        gt_db = []\n        for index in img_ids:\n            im_ann = coco.loadImgs(index)[0]\n            width = im_ann['width']\n            height = im_ann['height']\n            file_name = im_ann['file_name']\n            im_id = int(im_ann[\"id\"])\n\n            annIds = coco.getAnnIds(imgIds=index, iscrowd=False)\n            objs = coco.loadAnns(annIds)\n\n            valid_objs = []\n            for obj in objs:\n                x, y, w, h = obj['bbox']\n                x1 = np.max((0, x))\n                y1 = np.max((0, y))\n                x2 = np.min((width - 1, x1 + np.max((0, w - 1))))\n                y2 = np.min((height - 1, y1 + np.max((0, h - 1))))\n                if obj['area'] > 0 and x2 >= x1 and y2 >= y1:\n                    obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1]\n                    valid_objs.append(obj)\n            objs = valid_objs\n\n            rec = []\n            for obj in objs:\n                if max(obj['keypoints']) == 0:\n                    continue\n\n                joints = np.zeros(\n                    (self.ann_info['num_joints'], 3), dtype=np.float32)\n                joints_vis = np.zeros(\n                    (self.ann_info['num_joints'], 3), dtype=np.float32)\n                for ipt in range(self.ann_info['num_joints']):\n                    joints[ipt, 0] = obj['keypoints'][ipt * 3 + 0]\n                    joints[ipt, 1] = obj['keypoints'][ipt * 3 + 1]\n                    joints[ipt, 2] = 0\n                    t_vis = obj['keypoints'][ipt * 3 + 2]\n                    if t_vis > 1:\n                        t_vis = 1\n                    joints_vis[ipt, 0] = t_vis\n                    joints_vis[ipt, 1] = t_vis\n                    joints_vis[ipt, 2] = 0\n\n                center, scale = self._box2cs(obj['clean_bbox'][:4])\n                rec.append({\n                    'image_file': os.path.join(self.img_prefix, file_name),\n                    'center': center,\n                    'scale': scale,\n                    'gt_joints': joints,\n                    'joints_vis': joints_vis,\n                    'im_id': im_id,\n                })\n            gt_db.extend(rec)\n\n        return gt_db\n\n    def _box2cs(self, box):\n        x, y, w, h = box[:4]\n        center = np.zeros((2), dtype=np.float32)\n        center[0] = x + w * 0.5\n        center[1] = y + h * 0.5\n        aspect_ratio = self.trainsize[0] * 1.0 / self.trainsize[1]\n\n        if self.center_scale is not None and np.random.rand() < 0.3:\n            center += self.center_scale * (np.random.rand(2) - 0.5) * [w, h]\n\n        if w > aspect_ratio * h:\n            h = w * 1.0 / aspect_ratio\n        elif w < aspect_ratio * h:\n            w = h * aspect_ratio\n        scale = np.array(\n            [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],\n            dtype=np.float32)\n        if center[0] != -1:\n            scale = scale * 1.25\n\n        return center, scale\n\n    def _load_coco_person_detection_results(self):\n        all_boxes = None\n        bbox_file_path = os.path.join(self.dataset_dir, self.bbox_file)\n        with open(bbox_file_path, 'r') as f:\n            all_boxes = json.load(f)\n\n        if not all_boxes:\n            print('=> Load %s fail!' 
\n            return None\n\n        kpt_db = []\n        for n_img in range(len(all_boxes)):\n            det_res = all_boxes[n_img]\n            # keep only the person category\n            if det_res['category_id'] != 1:\n                continue\n            file_name = det_res[\n                'filename'] if 'filename' in det_res else '%012d.jpg' % det_res[\n                    'image_id']\n            img_name = os.path.join(self.img_prefix, file_name)\n            box = det_res['bbox']\n            score = det_res['score']\n            im_id = int(det_res['image_id'])\n\n            if score < self.image_thre:\n                continue\n\n            center, scale = self._box2cs(box)\n            joints = np.zeros(\n                (self.ann_info['num_joints'], 3), dtype=np.float32)\n            joints_vis = np.ones(\n                (self.ann_info['num_joints'], 3), dtype=np.float32)\n            kpt_db.append({\n                'image_file': img_name,\n                'im_id': im_id,\n                'center': center,\n                'scale': scale,\n                'score': score,\n                'gt_joints': joints,\n                'joints_vis': joints_vis,\n            })\n\n        return kpt_db\n\n\n@register\n@serializable\nclass KeypointTopDownCocoWholeBodyHandDataset(KeypointTopDownBaseDataset):\n    \"\"\"CocoWholeBody dataset for top-down hand pose estimation.\n\n    The dataset loads raw features and applies the specified transforms\n    to return a dict containing the image tensors and other information.\n\n    COCO-WholeBody Hand keypoint indexes:\n\n        0: 'wrist',\n        1: 'thumb1',\n        2: 'thumb2',\n        3: 'thumb3',\n        4: 'thumb4',\n        5: 'forefinger1',\n        6: 'forefinger2',\n        7: 'forefinger3',\n        8: 'forefinger4',\n        9: 'middle_finger1',\n        10: 'middle_finger2',\n        11: 'middle_finger3',\n        12: 'middle_finger4',\n        13: 'ring_finger1',\n        14: 'ring_finger2',\n        15: 'ring_finger3',\n        16: 'ring_finger4',\n        17: 'pinky_finger1',\n        18: 'pinky_finger2',\n        19: 'pinky_finger3',\n        20: 'pinky_finger4'\n\n    Args:\n        dataset_dir (str): Root path to the dataset.\n        image_dir (str): Path to a directory where images are held.\n        anno_path (str): Relative path to the annotation file.\n        num_joints (int): Number of keypoints.\n        trainsize (list): [w, h] target image size.\n        transform (composed(operators)): A sequence of data transforms.\n        pixel_std (int): The pixel std used to normalize the scale.\n            Default: 200.\n    \"\"\"\n\n    def __init__(self,\n                 dataset_dir,\n                 image_dir,\n                 anno_path,\n                 num_joints,\n                 trainsize,\n                 transform=[],\n                 pixel_std=200):\n        super().__init__(dataset_dir, image_dir, anno_path, num_joints,\n                         transform)\n\n        self.trainsize = trainsize\n        self.pixel_std = pixel_std\n        self.dataset_name = 'coco_wholebady_hand'\n\n    def _box2cs(self, box):\n        x, y, w, h = box[:4]\n        center = np.zeros((2), dtype=np.float32)\n        center[0] = x + w * 0.5\n        center[1] = y + h * 0.5\n        aspect_ratio = self.trainsize[0] * 1.0 / self.trainsize[1]\n\n        if w > aspect_ratio * h:\n            h = w * 1.0 / aspect_ratio\n        elif w < aspect_ratio * h:\n            w = h * aspect_ratio\n        scale = np.array(\n            [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
\n            dtype=np.float32)\n        if center[0] != -1:\n            scale = scale * 1.25\n\n        return center, scale\n\n    def parse_dataset(self):\n        gt_db = []\n        num_joints = self.ann_info['num_joints']\n        coco = COCO(self.get_anno())\n        img_ids = list(coco.imgs.keys())\n        for img_id in img_ids:\n            im_ann = coco.loadImgs(img_id)[0]\n            image_file = os.path.join(self.img_prefix, im_ann['file_name'])\n            im_id = int(im_ann[\"id\"])\n\n            ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False)\n            objs = coco.loadAnns(ann_ids)\n\n            for obj in objs:\n                for hand_type in ['left', 'right']:\n                    if (obj[f'{hand_type}hand_valid'] and\n                            max(obj[f'{hand_type}hand_kpts']) > 0):\n\n                        joints = np.zeros((num_joints, 3), dtype=np.float32)\n                        joints_vis = np.zeros((num_joints, 3), dtype=np.float32)\n\n                        keypoints = np.array(obj[f'{hand_type}hand_kpts'])\n                        keypoints = keypoints.reshape(-1, 3)\n                        joints[:, :2] = keypoints[:, :2]\n                        joints_vis[:, :2] = np.minimum(1, keypoints[:, 2:3])\n\n                        center, scale = self._box2cs(\n                            obj[f'{hand_type}hand_box'][:4])\n                        gt_db.append({\n                            'image_file': image_file,\n                            'center': center,\n                            'scale': scale,\n                            'gt_joints': joints,\n                            'joints_vis': joints_vis,\n                            'im_id': im_id,\n                        })\n\n        self.db = gt_db\n\n\n@register\n@serializable\nclass KeypointTopDownMPIIDataset(KeypointTopDownBaseDataset):\n    \"\"\"MPII dataset for top-down pose estimation.\n\n    The dataset loads raw features and applies the specified transforms\n    to return a dict containing the image tensors and other information.\n\n    MPII keypoint indexes::\n\n        0: 'right_ankle',\n        1: 'right_knee',\n        2: 'right_hip',\n        3: 'left_hip',\n        4: 'left_knee',\n        5: 'left_ankle',\n        6: 'pelvis',\n        7: 'thorax',\n        8: 'upper_neck',\n        9: 'head_top',\n        10: 'right_wrist',\n        11: 'right_elbow',\n        12: 'right_shoulder',\n        13: 'left_shoulder',\n        14: 'left_elbow',\n        15: 'left_wrist',\n\n    Args:\n        dataset_dir (str): Root path to the dataset.\n        image_dir (str): Path to a directory where images are held.\n        anno_path (str): Relative path to the annotation file.\n        num_joints (int): Number of keypoints.\n        transform (composed(operators)): A sequence of data transforms.\n    \"\"\"\n\n    def __init__(self,\n                 dataset_dir,\n                 image_dir,\n                 anno_path,\n                 num_joints,\n                 transform=[]):\n        super().__init__(dataset_dir, image_dir, anno_path, num_joints,\n                         transform)\n\n        self.dataset_name = 'mpii'\n\n    def parse_dataset(self):\n        with open(self.get_anno()) as anno_file:\n            anno = json.load(anno_file)\n\n        gt_db = []\n        for a in anno:\n            image_name = a['image']\n            im_id = a['image_id'] if 'image_id' in a else int(\n                os.path.splitext(image_name)[0])\n\n            c = np.array(a['center'], dtype=np.float32)
\n            s = np.array([a['scale'], a['scale']], dtype=np.float32)\n\n            # Adjust center/scale slightly to avoid cropping limbs\n            if c[0] != -1:\n                c[1] = c[1] + 15 * s[1]\n                s = s * 1.25\n            # MPII uses 1-based pixel coordinates; convert to 0-based\n            c = c - 1\n\n            joints = np.zeros(\n                (self.ann_info['num_joints'], 3), dtype=np.float32)\n            joints_vis = np.zeros(\n                (self.ann_info['num_joints'], 3), dtype=np.float32)\n            if 'gt_joints' in a:\n                joints_ = np.array(a['gt_joints'])\n                joints_[:, 0:2] = joints_[:, 0:2] - 1\n                joints_vis_ = np.array(a['joints_vis'])\n                assert len(joints_) == self.ann_info[\n                    'num_joints'], 'joint num diff: {} vs {}'.format(\n                        len(joints_), self.ann_info['num_joints'])\n\n                joints[:, 0:2] = joints_[:, 0:2]\n                joints_vis[:, 0] = joints_vis_[:]\n                joints_vis[:, 1] = joints_vis_[:]\n\n            gt_db.append({\n                'image_file': os.path.join(self.img_prefix, image_name),\n                'im_id': im_id,\n                'center': c,\n                'scale': s,\n                'gt_joints': joints,\n                'joints_vis': joints_vis\n            })\n        print(\"loaded {} MPII samples\".format(len(gt_db)))\n        self.db = gt_db\n
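\n\n# Usage sketch (placeholder paths; in training these arguments come from the\n# YAML config and a transform pipeline is attached before samples are read):\n#\n# dataset = KeypointTopDownCocoDataset(\n#     dataset_dir='dataset/coco',\n#     image_dir='train2017',\n#     anno_path='annotations/person_keypoints_train2017.json',\n#     num_joints=17,\n#     trainsize=[192, 256])\n# dataset.parse_dataset()  # fills dataset.db with center/scale/joint records\n"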
  },
  {
    "path": "ppdet/data/source/lvis.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport cv2\nimport copy\ntry:\n    from collections.abc import Sequence\nexcept Exception:\n    from collections import Sequence\nimport numpy as np\nfrom ppdet.core.workspace import register, serializable\nfrom .dataset import DetDataset\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n__all__ = [\n    'LVISDataSet',\n]\n\n\n@register\n@serializable\nclass LVISDataSet(DetDataset):\n    \"\"\"\n    Load a dataset in LVIS format.\n\n    Args:\n        dataset_dir (str): root directory for dataset.\n        image_dir (str): directory for images.\n        anno_path (str): lvis annotation file path.\n        data_fields (list): key name of data dictionary, at least have 'image'.\n        sample_num (int): number of samples to load, -1 means all.\n        load_crowd (bool): whether to load crowded ground-truth.\n            False as default\n        allow_empty (bool): whether to load empty entry. False as default\n        empty_ratio (float): the ratio of empty records to the total number\n            of records; if empty_ratio is out of [0., 1.), the empty records\n            are not sampled and all of them are kept. 1. as default\n        repeat (int): repeat times for dataset, used in benchmarking.\n    \"\"\"\n\n    def __init__(self,\n                 dataset_dir=None,\n                 image_dir=None,\n                 anno_path=None,\n                 data_fields=['image'],\n                 sample_num=-1,\n                 load_crowd=False,\n                 allow_empty=False,\n                 empty_ratio=1.,\n                 repeat=1):\n        super(LVISDataSet, self).__init__(\n            dataset_dir,\n            image_dir,\n            anno_path,\n            data_fields,\n            sample_num,\n            repeat=repeat)\n        self.load_image_only = False\n        self.load_semantic = False\n        self.load_crowd = load_crowd\n        self.allow_empty = allow_empty\n        self.empty_ratio = empty_ratio\n\n    def _sample_empty(self, records, num):\n        # if empty_ratio is out of [0., 1.), do not sample the records\n        if self.empty_ratio < 0. or self.empty_ratio >= 1.:
\n            return records\n        import random\n        # keep empty records at a ratio of empty_ratio among all records,\n        # e.g. empty_ratio=0.3 with num=700 non-empty records keeps at most\n        # int(700 * 0.3 / 0.7) = 300 empty records\n        sample_num = min(\n            int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records))\n        records = random.sample(records, sample_num)\n        return records\n\n    def parse_dataset(self):\n        anno_path = os.path.join(self.dataset_dir, self.anno_path)\n        image_dir = os.path.join(self.dataset_dir, self.image_dir)\n\n        assert anno_path.endswith('.json'), \\\n            'invalid lvis annotation file: ' + anno_path\n        from lvis import LVIS\n        lvis_ = LVIS(anno_path)\n        img_ids = lvis_.get_img_ids()\n        img_ids.sort()\n        cat_ids = lvis_.get_cat_ids()\n        records = []\n        empty_records = []\n        ct = 0\n        self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})\n        self.cname2cid = dict({\n            lvis_.load_cats([catid])[0]['name']: clsid\n            for catid, clsid in self.catid2clsid.items()\n        })\n\n        if 'annotations' not in lvis_.dataset:\n            self.load_image_only = True\n            logger.warning('Annotation file: {} does not contain ground truth, '\n                           'loading image information only.'.format(anno_path))\n\n        for img_id in img_ids:\n            img_anno = lvis_.load_imgs([img_id])[0]\n            im_fname = img_anno['coco_url'].replace(\n                'http://images.cocodataset.org/', '')\n            im_w = float(img_anno['width'])\n            im_h = float(img_anno['height'])\n\n            im_path = os.path.join(image_dir,\n                                   im_fname) if image_dir else im_fname\n            is_empty = False\n            if not os.path.exists(im_path):\n                logger.warning('Illegal image file: {}, and it will be '\n                               'ignored'.format(im_path))\n                continue\n\n            if im_w < 0 or im_h < 0:\n                logger.warning('Illegal width: {} or height: {} in annotation, '\n                               'and im_id: {} will be ignored'.format(\n                                   im_w, im_h, img_id))\n                continue\n\n            coco_rec = {\n                'im_file': im_path,\n                'im_id': np.array([img_id]),\n                'h': im_h,\n                'w': im_w,\n            } if 'image' in self.data_fields else {}\n\n            if not self.load_image_only:\n                ins_anno_ids = lvis_.get_ann_ids(img_ids=[img_id])\n                instances = lvis_.load_anns(ins_anno_ids)\n\n                bboxes = []\n                for inst in instances:\n                    # check gt bbox\n                    if inst.get('ignore', False):\n                        continue\n                    if 'bbox' not in inst.keys():\n                        continue\n                    else:\n                        if not any(np.array(inst['bbox'])):\n                            continue\n\n                    x1, y1, box_w, box_h = inst['bbox']\n                    x2 = x1 + box_w\n                    y2 = y1 + box_h\n                    eps = 1e-5\n                    if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps:\n                        inst['clean_bbox'] = [\n                            round(float(x), 3) for x in [x1, y1, x2, y2]\n                        ]\n                        bboxes.append(inst)\n                    else:\n                        logger.warning(\n                            'Found an invalid 
bbox in annotations: im_id: {}, '\n                            'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format(\n                                img_id, float(inst['area']), x1, y1, x2, y2))\n\n                num_bbox = len(bboxes)\n                if num_bbox <= 0 and not self.allow_empty:\n                    continue\n                elif num_bbox <= 0:\n                    is_empty = True\n\n                gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)\n                gt_class = np.zeros((num_bbox, 1), dtype=np.int32)\n                is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)\n                gt_poly = [None] * num_bbox\n                gt_track_id = -np.ones((num_bbox, 1), dtype=np.int32)\n\n                has_segmentation = False\n                has_track_id = False\n                for i, box in enumerate(bboxes):\n                    catid = box['category_id']\n                    gt_class[i][0] = self.catid2clsid[catid]\n                    gt_bbox[i, :] = box['clean_bbox']\n                    \n                    # is_crowd[i][0] = box['iscrowd']\n                    # check RLE format \n                    # if 'segmentation' in box and box['iscrowd'] == 1:\n                    #     gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]\n                    # elif 'segmentation' in box and box['segmentation']:\n                    #     if not np.array(\n                    #             box['segmentation'],\n                    #             dtype=object).size > 0 and not self.allow_empty:\n                    #         bboxes.pop(i)\n                    #         gt_poly.pop(i)\n                    #         np.delete(is_crowd, i)\n                    #         np.delete(gt_class, i)\n                    #         np.delete(gt_bbox, i)\n                    #     else:\n                    #         gt_poly[i] = box['segmentation']\n                    #     has_segmentation = True\n\n                    if 'track_id' in box:\n                        gt_track_id[i][0] = box['track_id']\n                        has_track_id = True\n                if has_segmentation and not any(\n                        gt_poly) and not self.allow_empty:\n                    continue\n\n                gt_rec = {\n                    'is_crowd': is_crowd,\n                    'gt_class': gt_class,\n                    'gt_bbox': gt_bbox,\n                    'gt_poly': gt_poly,\n                }\n                if has_track_id:\n                    gt_rec.update({'gt_track_id': gt_track_id})\n\n                for k, v in gt_rec.items():\n                    if k in self.data_fields:\n                        coco_rec[k] = v\n\n                # TODO: remove load_semantic\n                if self.load_semantic and 'semantic' in self.data_fields:\n                    seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps',\n                                            'train2017', im_fname[:-3] + 'png')\n                    coco_rec.update({'semantic': seg_path})\n\n            logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format(\n                im_path, img_id, im_h, im_w))\n            if is_empty:\n                empty_records.append(coco_rec)\n            else:\n                records.append(coco_rec)\n            ct += 1\n            if self.sample_num > 0 and ct >= self.sample_num:\n                break\n        assert ct > 0, 'not found any coco record in %s' % (anno_path)\n        logger.info('Load [{} samples valid, {} samples invalid] in file 
{}.'.\n                    format(ct, len(img_ids) - ct, anno_path))\n        if self.allow_empty and len(empty_records) > 0:\n            # append the (sub)sampled empty records after the valid ones\n            empty_records = self._sample_empty(empty_records, len(records))\n            records += empty_records\n        self.roidbs = records
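\n\n\n# Usage sketch (placeholder paths; normally the dataset is constructed from\n# the YAML config via the ppdet registry). Requires the lvis package.\n#\n# dataset = LVISDataSet(\n#     dataset_dir='dataset/lvis',\n#     image_dir='',\n#     anno_path='annotations/lvis_v1_train.json',\n#     allow_empty=True,\n#     empty_ratio=0.3)\n# dataset.parse_dataset()  # fills dataset.roidbs and dataset.cname2cid\n"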
  },
  {
    "path": "ppdet/data/source/mot.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport cv2\nimport glob\nimport numpy as np\nfrom collections import OrderedDict, defaultdict\ntry:\n    from collections.abc import Sequence\nexcept Exception:\n    from collections import Sequence\nfrom .dataset import DetDataset, _make_dataset, _is_valid_file\nfrom ppdet.core.workspace import register, serializable\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n\n@register\n@serializable\nclass MOTDataSet(DetDataset):\n    \"\"\"\n    Load a dataset in MOT format; only single-class MOT is supported.\n\n    Args:\n        dataset_dir (str): root directory for dataset.\n        image_lists (str|list): mot data image lists of a multi-source\n            mot dataset.\n        data_fields (list): key name of data dictionary, at least have 'image'.\n        sample_num (int): number of samples to load, -1 means all.\n        repeat (int): repeat times for dataset, used in benchmarking.\n\n    Notes:\n        The MOT dataset root directory is organized as follows:\n            dataset/mot\n            |——————image_lists\n            |        |——————caltech.train\n            |        |——————caltech.val\n            |        |——————mot16.train\n            |        |——————mot17.train\n            |        ......\n            |——————Caltech\n            |——————MOT17\n            |——————......\n\n        All the MOT datasets have the following structure:\n            Caltech\n            |——————images\n            |        └——————00001.jpg\n            |        |—————— ...\n            |        └——————0000N.jpg\n            └——————labels_with_ids\n                        └——————00001.txt\n                        |—————— ...\n                        └——————0000N.txt\n            or\n\n            MOT17\n            |——————images\n            |        └——————train\n            |        └——————test\n            └——————labels_with_ids\n                        └——————train\n    \"\"\"\n\n    def __init__(self,\n                 dataset_dir=None,\n                 image_lists=[],\n                 data_fields=['image'],\n                 sample_num=-1,\n                 repeat=1):\n        super(MOTDataSet, self).__init__(\n            dataset_dir=dataset_dir,\n            data_fields=data_fields,\n            sample_num=sample_num,\n            repeat=repeat)\n        self.dataset_dir = dataset_dir\n        self.image_lists = image_lists\n        if isinstance(self.image_lists, str):\n            self.image_lists = [self.image_lists]\n        self.roidbs = None\n        self.cname2cid = None\n\n    def get_anno(self):\n        if self.image_lists == []:\n            return\n        # only used to get categories and metric;\n        # only check the first dataset, the label_list of all datasets\n        # should be the same.\n        first_mot_data = self.image_lists[0].split('.')[0]\n        anno_file = os.path.join(self.dataset_dir, first_mot_data,\n               
                  'label_list.txt')\n        return anno_file\n\n    def parse_dataset(self):\n        self.img_files = OrderedDict()\n        self.img_start_index = OrderedDict()\n        self.label_files = OrderedDict()\n        self.tid_num = OrderedDict()\n        self.tid_start_index = OrderedDict()\n\n        img_index = 0\n        for data_name in self.image_lists:\n            # check every data image list\n            image_lists_dir = os.path.join(self.dataset_dir, 'image_lists')\n            assert os.path.isdir(image_lists_dir), \\\n                \"The {} is not a directory.\".format(image_lists_dir)\n\n            list_path = os.path.join(image_lists_dir, data_name)\n            assert os.path.exists(list_path), \\\n                \"The list path {} does not exist.\".format(list_path)\n\n            # record img_files, filter out empty ones\n            with open(list_path, 'r') as file:\n                self.img_files[data_name] = file.readlines()\n                self.img_files[data_name] = [\n                    os.path.join(self.dataset_dir, x.strip())\n                    for x in self.img_files[data_name]\n                ]\n                self.img_files[data_name] = list(\n                    filter(lambda x: len(x) > 0, self.img_files[data_name]))\n\n                self.img_start_index[data_name] = img_index\n                img_index += len(self.img_files[data_name])\n\n            # record label_files\n            self.label_files[data_name] = [\n                x.replace('images', 'labels_with_ids').replace(\n                    '.png', '.txt').replace('.jpg', '.txt')\n                for x in self.img_files[data_name]\n            ]\n\n        for data_name, label_paths in self.label_files.items():\n            max_index = -1\n            for lp in label_paths:\n                lb = np.loadtxt(lp)\n                if len(lb) < 1:\n                    continue\n                if len(lb.shape) < 2:\n                    img_max = lb[1]\n                else:\n                    img_max = np.max(lb[:, 1])\n                if img_max > max_index:\n                    max_index = img_max\n            self.tid_num[data_name] = int(max_index + 1)\n\n        last_index = 0\n        for i, (k, v) in enumerate(self.tid_num.items()):\n            self.tid_start_index[k] = last_index\n            last_index += v\n\n        self.num_identities_dict = defaultdict(int)\n        self.num_identities_dict[0] = int(last_index + 1)  # single class\n        self.num_imgs_each_data = [len(x) for x in self.img_files.values()]\n        self.total_imgs = sum(self.num_imgs_each_data)\n\n        logger.info('MOT dataset summary: ')\n        logger.info(self.tid_num)\n        logger.info('Total images: {}'.format(self.total_imgs))\n        logger.info('Image start index: {}'.format(self.img_start_index))\n        logger.info('Total identities: {}'.format(self.num_identities_dict[0]))\n        logger.info('Identity start index: {}'.format(self.tid_start_index))\n\n        records = []\n        cname2cid = mot_label()\n\n        for img_index in range(self.total_imgs):\n            for i, (k, v) in enumerate(self.img_start_index.items()):\n                if img_index >= v:\n                    data_name = list(self.label_files.keys())[i]\n                    start_index = v\n            img_file = self.img_files[data_name][img_index - start_index]\n            lbl_file = self.label_files[data_name][img_index - start_index]\n\n            if not os.path.exists(img_file):\n                
logger.warning('Illegal image file: {}, and it will be ignored'.\n                               format(img_file))\n                continue\n            if not os.path.isfile(lbl_file):\n                logger.warning('Illegal label file: {}, and it will be ignored'.\n                               format(lbl_file))\n                continue\n\n            labels = np.loadtxt(lbl_file, dtype=np.float32).reshape(-1, 6)\n            # each row in labels (N, 6) is [gt_class, gt_identity, cx, cy, w, h]\n\n            cx, cy = labels[:, 2], labels[:, 3]\n            w, h = labels[:, 4], labels[:, 5]\n            gt_bbox = np.stack((cx, cy, w, h)).T.astype('float32')\n            gt_class = labels[:, 0:1].astype('int32')\n            gt_score = np.ones((len(labels), 1)).astype('float32')\n            gt_ide = labels[:, 1:2].astype('int32')\n            for i, _ in enumerate(gt_ide):\n                if gt_ide[i] > -1:\n                    gt_ide[i] += self.tid_start_index[data_name]\n\n            mot_rec = {\n                'im_file': img_file,\n                'im_id': img_index,\n            } if 'image' in self.data_fields else {}\n\n            gt_rec = {\n                'gt_class': gt_class,\n                'gt_score': gt_score,\n                'gt_bbox': gt_bbox,\n                'gt_ide': gt_ide,\n            }\n\n            for k, v in gt_rec.items():\n                if k in self.data_fields:\n                    mot_rec[k] = v\n\n            records.append(mot_rec)\n            if self.sample_num > 0 and img_index >= self.sample_num:\n                break\n        assert len(records) > 0, 'not found any mot record in %s' % (\n            self.image_lists)\n        self.roidbs, self.cname2cid = records, cname2cid\n\n\n@register\n@serializable\nclass MCMOTDataSet(DetDataset):\n    \"\"\"\n    Load a dataset in MOT format; multi-class MOT is supported.\n\n    Args:\n        dataset_dir (str): root directory for dataset.\n        image_lists (list(str)): mcmot data image lists of a multi-source\n            mcmot dataset.\n        data_fields (list): key name of data dictionary, at least have 'image'.\n        label_list (str): if provided, load the mapping between category name\n            and class index from it; otherwise the default VisDrone labels\n            are used.\n        sample_num (int): number of samples to load, -1 means all.\n\n    Notes:\n        The MCMOT dataset root directory is organized as follows:\n            dataset/mot\n            |——————image_lists\n            |        |——————visdrone_mcmot.train\n            |        |——————visdrone_mcmot.val\n            visdrone_mcmot\n            |——————images\n            |        └——————train\n            |        └——————val\n            └——————labels_with_ids\n                        └——————train\n    \"\"\"\n\n    def __init__(self,\n                 dataset_dir=None,\n                 image_lists=[],\n                 data_fields=['image'],\n                 label_list=None,\n                 sample_num=-1):\n        super(MCMOTDataSet, self).__init__(\n            dataset_dir=dataset_dir,\n            data_fields=data_fields,\n            sample_num=sample_num)\n        self.dataset_dir = dataset_dir\n        self.image_lists = image_lists\n        if isinstance(self.image_lists, str):\n            self.image_lists = [self.image_lists]\n        self.label_list = label_list\n        self.roidbs = None\n        self.cname2cid = None\n\n    def get_anno(self):\n        if self.image_lists == []:\n            return\n        # only used to get categories and metric;\n        # only check the first dataset, the label_list of all datasets\n        # should be the same.
\n        first_mot_data = self.image_lists[0].split('.')[0]\n        anno_file = os.path.join(self.dataset_dir, first_mot_data,\n                                 'label_list.txt')\n        return anno_file\n\n    def parse_dataset(self):\n        self.img_files = OrderedDict()\n        self.img_start_index = OrderedDict()\n        self.label_files = OrderedDict()\n        self.tid_num = OrderedDict()\n        self.tid_start_idx_of_cls_ids = defaultdict(dict)  # for MCMOT\n\n        img_index = 0\n        for data_name in self.image_lists:\n            # check every data image list\n            image_lists_dir = os.path.join(self.dataset_dir, 'image_lists')\n            assert os.path.isdir(image_lists_dir), \\\n                \"The {} is not a directory.\".format(image_lists_dir)\n\n            list_path = os.path.join(image_lists_dir, data_name)\n            assert os.path.exists(list_path), \\\n                \"The list path {} does not exist.\".format(list_path)\n\n            # record img_files, filter out empty ones\n            with open(list_path, 'r') as file:\n                self.img_files[data_name] = file.readlines()\n                self.img_files[data_name] = [\n                    os.path.join(self.dataset_dir, x.strip())\n                    for x in self.img_files[data_name]\n                ]\n                self.img_files[data_name] = list(\n                    filter(lambda x: len(x) > 0, self.img_files[data_name]))\n\n                self.img_start_index[data_name] = img_index\n                img_index += len(self.img_files[data_name])\n\n            # record label_files\n            self.label_files[data_name] = [\n                x.replace('images', 'labels_with_ids').replace(\n                    '.png', '.txt').replace('.jpg', '.txt')\n                for x in self.img_files[data_name]\n            ]\n\n        for data_name, label_paths in self.label_files.items():\n            # using max_ids_dict rather than max_index\n            max_ids_dict = defaultdict(int)\n            for lp in label_paths:\n                lb = np.loadtxt(lp)\n                if len(lb) < 1:\n                    continue\n                lb = lb.reshape(-1, 6)\n                for item in lb:\n                    if item[1] > max_ids_dict[int(item[0])]:\n                        # item[0]: cls_id\n                        # item[1]: track id\n                        max_ids_dict[int(item[0])] = int(item[1])\n            # track id number\n            self.tid_num[data_name] = max_ids_dict\n\n        last_idx_dict = defaultdict(int)\n        for i, (k, v) in enumerate(self.tid_num.items()):  # each sub dataset\n            for cls_id, id_num in v.items():  # v is a max_ids_dict\n                self.tid_start_idx_of_cls_ids[k][cls_id] = last_idx_dict[cls_id]\n                last_idx_dict[cls_id] += id_num\n\n        self.num_identities_dict = defaultdict(int)\n        for k, v in last_idx_dict.items():\n            self.num_identities_dict[k] = int(v)  # total ids of each category\n\n        self.num_imgs_each_data = [len(x) for x in self.img_files.values()]\n        self.total_imgs = sum(self.num_imgs_each_data)\n\n        # cname2cid and cid2cname\n        cname2cid = {}\n        if self.label_list is not None:\n            # when using label_list for a multi-source mixed dataset, make\n            # sure label_list exists in the first sub-dataset at least.\n            sub_dataset = self.image_lists[0].split('.')[0]
\n            label_path = os.path.join(self.dataset_dir, sub_dataset,\n                                      self.label_list)\n            if not os.path.exists(label_path):\n                logger.info(\n                    \"Note: label_list {} does not exist; using the default \"\n                    \"VisDrone 10-class labels.\".format(label_path))\n                cname2cid = visdrone_mcmot_label()\n            else:\n                with open(label_path, 'r') as fr:\n                    label_id = 0\n                    for line in fr.readlines():\n                        cname2cid[line.strip()] = label_id\n                        label_id += 1\n        else:\n            cname2cid = visdrone_mcmot_label()\n\n        cid2cname = dict([(v, k) for (k, v) in cname2cid.items()])\n\n        logger.info('MCMOT dataset summary: ')\n        logger.info(self.tid_num)\n        logger.info('Total images: {}'.format(self.total_imgs))\n        logger.info('Image start index: {}'.format(self.img_start_index))\n\n        logger.info('Total identities of each category: ')\n        num_identities_dict = sorted(\n            self.num_identities_dict.items(), key=lambda x: x[0])\n        total_IDs_all_cats = 0\n        for (k, v) in num_identities_dict:\n            logger.info('Category {} [{}] has {} IDs.'.format(k, cid2cname[k],\n                                                              v))\n            total_IDs_all_cats += v\n        logger.info('Total identities of all categories: {}'.format(\n            total_IDs_all_cats))\n\n        logger.info('Identity start index of each category: ')\n        for k, v in self.tid_start_idx_of_cls_ids.items():\n            sorted_v = sorted(v.items(), key=lambda x: x[0])\n            for (cls_id, start_idx) in sorted_v:\n                logger.info('Start index of dataset {} category {:d} is {:d}'\n                            .format(k, cls_id, start_idx))\n\n        records = []\n        for img_index in range(self.total_imgs):\n            for i, (k, v) in enumerate(self.img_start_index.items()):\n                if img_index >= v:\n                    data_name = list(self.label_files.keys())[i]\n                    start_index = v\n            img_file = self.img_files[data_name][img_index - start_index]\n            lbl_file = self.label_files[data_name][img_index - start_index]\n\n            if not os.path.exists(img_file):\n                logger.warning('Illegal image file: {}, and it will be ignored'.\n                               format(img_file))\n                continue\n            if not os.path.isfile(lbl_file):\n                logger.warning('Illegal label file: {}, and it will be ignored'.\n                               format(lbl_file))\n                continue\n\n            labels = np.loadtxt(lbl_file, dtype=np.float32).reshape(-1, 6)\n            # each row in labels (N, 6) is [gt_class, gt_identity, cx, cy, w, h]\n\n            cx, cy = labels[:, 2], labels[:, 3]\n            w, h = labels[:, 4], labels[:, 5]\n            gt_bbox = np.stack((cx, cy, w, h)).T.astype('float32')\n            gt_class = labels[:, 0:1].astype('int32')\n            gt_score = np.ones((len(labels), 1)).astype('float32')\n            gt_ide = labels[:, 1:2].astype('int32')\n            for i, _ in enumerate(gt_ide):\n                if gt_ide[i] > -1:\n                    cls_id = int(gt_class[i])\n                    start_idx = self.tid_start_idx_of_cls_ids[data_name][cls_id]\n                    gt_ide[i] += start_idx\n\n            mot_rec = {
\n                'im_file': img_file,\n                'im_id': img_index,\n            } if 'image' in self.data_fields else {}\n\n            gt_rec = {\n                'gt_class': gt_class,\n                'gt_score': gt_score,\n                'gt_bbox': gt_bbox,\n                'gt_ide': gt_ide,\n            }\n\n            for k, v in gt_rec.items():\n                if k in self.data_fields:\n                    mot_rec[k] = v\n\n            records.append(mot_rec)\n            if self.sample_num > 0 and img_index >= self.sample_num:\n                break\n        assert len(records) > 0, 'not found any mot record in %s' % (\n            self.image_lists)\n        self.roidbs, self.cname2cid = records, cname2cid\n\n\n@register\n@serializable\nclass MOTImageFolder(DetDataset):\n    \"\"\"\n    Load a MOT-format dataset from an image folder or a video.\n\n    Args:\n        video_file (str): path of the video file, default ''.\n        frame_rate (int): frame rate of the video; if not set, it is read\n            with cv2.VideoCapture.\n        dataset_dir (str): root directory for dataset.\n        keep_ori_im (bool): whether to keep original image, default False.\n            Set True when used during MOT model inference while saving\n            images or video, or used in DeepSORT.\n    \"\"\"\n\n    def __init__(self,\n                 video_file=None,\n                 frame_rate=-1,\n                 dataset_dir=None,\n                 data_root=None,\n                 image_dir=None,\n                 sample_num=-1,\n                 keep_ori_im=False,\n                 anno_path=None,\n                 **kwargs):\n        super(MOTImageFolder, self).__init__(\n            dataset_dir, image_dir, sample_num=sample_num)\n        self.video_file = video_file\n        self.data_root = data_root\n        self.keep_ori_im = keep_ori_im\n        self._imid2path = {}\n        self.roidbs = None\n        self.frame_rate = frame_rate\n        self.anno_path = anno_path\n\n    def check_or_download_dataset(self):\n        return\n\n    def parse_dataset(self):\n        if not self.roidbs:\n            if self.video_file is None:\n                self.frame_rate = 30  # default when inferring an image folder\n                self.roidbs = self._load_images()\n            else:\n                self.roidbs = self._load_video_images()\n\n    def _load_video_images(self):\n        if self.frame_rate == -1:\n            # if frame_rate is not set for the video, read it with\n            # cv2.VideoCapture\n            cap = cv2.VideoCapture(self.video_file)\n            self.frame_rate = int(cap.get(cv2.CAP_PROP_FPS))\n\n        extension = self.video_file.split('.')[-1]\n        output_path = self.video_file.replace('.{}'.format(extension), '')\n        frames_path = video2frames(self.video_file, output_path,\n                                   self.frame_rate)\n        self.video_frames = sorted(\n            glob.glob(os.path.join(frames_path, '*.png')))\n\n        self.video_length = len(self.video_frames)\n        logger.info('Length of the video: {:d} frames.'.format(\n            self.video_length))\n        ct = 0\n        records = []\n        for image in self.video_frames:\n            assert image != '' and os.path.isfile(image), \\\n                    \"Image {} not found\".format(image)\n            if self.sample_num > 0 and ct >= self.sample_num:\n                break\n            rec = {'im_id': np.array([ct]), 'im_file': image}\n            if self.keep_ori_im:\n                rec.update({'keep_ori_im': 1})
\n            self._imid2path[ct] = image\n            ct += 1\n            records.append(rec)\n        assert len(records) > 0, \"No image file found\"\n        return records\n\n    def _find_images(self):\n        image_dir = self.image_dir\n        if not isinstance(image_dir, Sequence):\n            image_dir = [image_dir]\n        images = []\n        for im_dir in image_dir:\n            if os.path.isdir(im_dir):\n                im_dir = os.path.join(self.dataset_dir, im_dir)\n                images.extend(_make_dataset(im_dir))\n            elif os.path.isfile(im_dir) and _is_valid_file(im_dir):\n                images.append(im_dir)\n        return images\n\n    def _load_images(self):\n        images = self._find_images()\n        ct = 0\n        records = []\n        for image in images:\n            assert image != '' and os.path.isfile(image), \\\n                    \"Image {} not found\".format(image)\n            if self.sample_num > 0 and ct >= self.sample_num:\n                break\n            rec = {'im_id': np.array([ct]), 'im_file': image}\n            if self.keep_ori_im:\n                rec.update({'keep_ori_im': 1})\n            self._imid2path[ct] = image\n            ct += 1\n            records.append(rec)\n        assert len(records) > 0, \"No image file found\"\n        return records\n\n    def get_imid2path(self):\n        return self._imid2path\n\n    def set_images(self, images):\n        self.image_dir = images\n        self.roidbs = self._load_images()\n\n    def set_video(self, video_file, frame_rate):\n        # update video_file and frame_rate from the command line of\n        # tools/infer_mot.py\n        self.video_file = video_file\n        self.frame_rate = frame_rate\n        assert os.path.isfile(self.video_file) and _is_valid_video(self.video_file), \\\n                \"wrong or unsupported file format: {}\".format(self.video_file)\n        self.roidbs = self._load_video_images()\n\n    def get_anno(self):\n        return self.anno_path\n\n\ndef _is_valid_video(f, extensions=('.mp4', '.avi', '.mov', '.rmvb', '.flv')):\n    return f.lower().endswith(extensions)\n\n\ndef video2frames(video_path, outpath, frame_rate, **kargs):\n    def _dict2str(kargs):\n        cmd_str = ''\n        for k, v in kargs.items():\n            cmd_str += (' ' + str(k) + ' ' + str(v))\n        return cmd_str\n\n    ffmpeg = ['ffmpeg ', ' -y -loglevel ', ' error ']\n    vid_name = os.path.basename(video_path).split('.')[0]\n    out_full_path = os.path.join(outpath, vid_name)\n\n    if not os.path.exists(out_full_path):\n        os.makedirs(out_full_path)\n\n    # frame file name pattern\n    outformat = os.path.join(out_full_path, '%08d.png')\n\n    cmd = ffmpeg + [\n        ' -i ', video_path, ' -r ', str(frame_rate), ' -f image2 ', outformat\n    ]\n    cmd = ''.join(cmd) + _dict2str(kargs)\n\n    if os.system(cmd) != 0:\n        raise RuntimeError('ffmpeg process video: {} error'.format(video_path))\n\n    sys.stdout.flush()\n    return out_full_path\n\n\ndef mot_label():\n    labels_map = {'person': 0}\n    return labels_map\n\n\ndef visdrone_mcmot_label():\n    labels_map = {\n        'pedestrian': 0,\n        'people': 1,\n        'bicycle': 2,\n        'car': 3,\n        'van': 4,\n        'truck': 5,\n        'tricycle': 6,\n        'awning-tricycle': 7,\n        'bus': 8,\n        'motor': 9,\n    }\n    return labels_map\n
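\n\n# Usage sketch (illustrative path): extract frames before MOT inference.\n# Assumes ffmpeg is available on PATH; frames are written to\n# <outpath>/<video name>/%08d.png at the given frame rate.\n#\n# frames_dir = video2frames('demo/test_video.mp4', 'output', frame_rate=30)\n"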
  },
  {
    "path": "ppdet/data/source/pose3d_cmb.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport cv2\nimport numpy as np\nimport json\nimport copy\nimport pycocotools\nfrom pycocotools.coco import COCO\nfrom .dataset import DetDataset\nfrom ppdet.core.workspace import register, serializable\nfrom paddle.io import Dataset\n\n\n@serializable\nclass Pose3DDataset(DetDataset):\n    \"\"\"Pose3D Dataset class.\n\n    Args:\n        dataset_dir (str): Root path to the dataset.\n        anno_list (list of str): each element is a relative path to an\n            annotation file.\n        image_dirs (list of str): each element is a relative path to a\n            directory where images are held.\n        transform (composed(operators)): A sequence of data transforms.\n        test_mode (bool): Store True when building test or\n            validation dataset. Default: False.\n        24 joints order:\n        0-2: 'R_Ankle', 'R_Knee', 'R_Hip',\n        3-5:'L_Hip', 'L_Knee', 'L_Ankle',\n        6-8:'R_Wrist', 'R_Elbow', 'R_Shoulder',\n        9-11:'L_Shoulder','L_Elbow','L_Wrist',\n        12-14:'Neck','Top_of_Head','Pelvis',\n        15-18:'Thorax','Spine','Jaw','Head',\n        19-23:'Nose','L_Eye','R_Eye','L_Ear','R_Ear'\n    \"\"\"\n\n    def __init__(self,\n                 dataset_dir,\n                 image_dirs,\n                 anno_list,\n                 transform=[],\n                 num_joints=24,\n                 test_mode=False):\n        super().__init__(dataset_dir, image_dirs, anno_list)\n        self.image_info = {}\n        self.ann_info = {}\n        self.num_joints = num_joints\n\n        self.transform = transform\n        self.test_mode = test_mode\n\n        self.img_ids = []\n        self.dataset_dir = dataset_dir\n        self.image_dirs = image_dirs\n        self.anno_list = anno_list\n\n    def get_mask(self, mvm_percent=0.3):\n        num_joints = self.num_joints\n        mjm_mask = np.ones((num_joints, 1)).astype(np.float32)\n        if not self.test_mode:\n            pb = np.random.random_sample()\n            masked_num = int(\n                pb * mvm_percent *\n                num_joints)  # at most x% of the joints could be masked\n            indices = np.random.choice(\n                np.arange(num_joints), replace=False, size=masked_num)\n            mjm_mask[indices, :] = 0.0\n\n        # additionally mask a block of 10 vertex entries appended below\n        num_joints = 10\n        mvm_mask = np.ones((num_joints, 1)).astype(np.float32)\n        if not self.test_mode:\n            num_vertices = num_joints\n            pb = np.random.random_sample()\n            masked_num = int(\n                pb * mvm_percent *\n                num_vertices)  # at most x% of the vertices could be masked\n            indices = np.random.choice(\n                np.arange(num_vertices), replace=False, size=masked_num)\n            mvm_mask[indices, :] = 0.0\n\n        mjm_mask = np.concatenate([mjm_mask, mvm_mask], axis=0)\n        return mjm_mask
\n\n    def filterjoints(self, x):\n        if self.num_joints == 24:\n            return x\n        elif self.num_joints == 14:\n            return x[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18], :]\n        elif self.num_joints == 17:\n            return x[\n                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19], :]\n        else:\n            raise ValueError(\n                \"unsupported number of joints, only 24, 17 or 14 is supported!\")\n\n    def parse_dataset(self):\n        print(\"Loading annotations, please wait...\")\n        self.annos = []\n        im_id = 0\n        self.human36m_num = 0\n        for idx, annof in enumerate(self.anno_list):\n            img_prefix = os.path.join(self.dataset_dir, self.image_dirs[idx])\n            dataf = os.path.join(self.dataset_dir, annof)\n            with open(dataf, 'r') as rf:\n                anno_data = json.load(rf)\n                annos = anno_data['data']\n                print(\"{} has {} annotations\".format(dataf, len(annos)))\n                for anno in annos:\n                    new_anno = {}\n                    new_anno['im_id'] = im_id\n                    im_id += 1\n                    imagename = anno['imageName']\n                    if imagename.startswith(\"COCO_train2014_\"):\n                        imagename = imagename[len(\"COCO_train2014_\"):]\n                    elif imagename.startswith(\"COCO_val2014_\"):\n                        imagename = imagename[len(\"COCO_val2014_\"):]\n                    imagename = os.path.join(img_prefix, imagename)\n                    if not os.path.exists(imagename):\n                        if \"train2017\" in imagename:\n                            imagename = imagename.replace(\"train2017\",\n                                                          \"val2017\")\n                            if not os.path.exists(imagename):\n                                print(\"cannot find image path: {}\".format(\n                                    imagename))\n                                continue\n                        else:\n                            print(\"cannot find image path: {}\".format(\n                                imagename))\n                            continue\n                    new_anno['imageName'] = imagename\n                    if 'human3.6m' in imagename:\n                        self.human36m_num += 1\n                    new_anno['bbox_center'] = anno['bbox_center']\n                    new_anno['bbox_scale'] = anno['bbox_scale']\n                    new_anno['joints_2d'] = np.array(anno[\n                        'gt_keypoint_2d']).astype(np.float32)\n                    if new_anno['joints_2d'].shape[0] == 49:\n                        # if joints_2d is in the 49-joint SPIN format\n                        # (generated by EFT), keep the last 24 public joints;\n                        # for details see:\n                        # https://github.com/nkolot/SPIN/blob/master/constants.py\n                        new_anno['joints_2d'] = new_anno['joints_2d'][25:]\n                    new_anno['joints_3d'] = np.array(anno[\n                        'pose3d'])[:, :3].astype(np.float32)\n                    new_anno['mjm_mask'] = self.get_mask()\n                    if 'has_3d_joints' not in anno:\n                        new_anno['has_3d_joints'] = int(1)\n                        new_anno['has_2d_joints'] = int(1)\n                    else:\n                        new_anno['has_3d_joints'] = int(anno['has_3d_joints'])\n                        new_anno['has_2d_joints'] = int(anno['has_2d_joints'])
\n                    new_anno['joints_2d'] = self.filterjoints(new_anno[\n                        'joints_2d'])\n                    self.annos.append(new_anno)\n                del annos\n\n    def get_temp_num(self):\n        \"\"\"Get the number of temporal samples, e.g. from human3.6m.\"\"\"\n        return self.human36m_num\n\n    def __len__(self):\n        \"\"\"Get dataset length.\"\"\"\n        return len(self.annos)\n\n    def _get_imganno(self, idx):\n        \"\"\"Get anno for a single image.\"\"\"\n        return self.annos[idx]\n\n    def __getitem__(self, idx):\n        \"\"\"Prepare image for training given the index.\"\"\"\n        records = copy.deepcopy(self._get_imganno(idx))\n        imgpath = records['imageName']\n        assert os.path.exists(imgpath), \"cannot find image {}\".format(imgpath)\n        records['image'] = cv2.imread(imgpath)\n        records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB)\n        records = self.transform(records)\n        return records\n\n    def check_or_download_dataset(self):\n        alldatafind = True\n        for image_dir in self.image_dirs:\n            image_dir = os.path.join(self.dataset_dir, image_dir)\n            if not os.path.isdir(image_dir):\n                print(\"dataset [{}] is not found\".format(image_dir))\n                alldatafind = False\n        if not alldatafind:\n            raise ValueError(\n                \"Some datasets are missing and cannot be downloaded \"\n                \"automatically yet; please prepare the datasets first\")\n\n\n@register\n@serializable\nclass Keypoint3DMultiFramesDataset(Dataset):\n    \"\"\"24-keypoint 3D dataset for pose estimation.\n\n    Each item is a clip of num_frames consecutive images.\n\n    The dataset loads raw features and applies the specified transforms\n    to return a dict containing the image tensors and other information.\n\n    Args:\n        dataset_dir (str): Root path to the dataset.\n        image_dir (str): Path to a directory where images are held.\n        p3d_dir (str): Path to the directory of 3D-keypoint (.obj) files\n            under each action directory.\n        json_path (str): Name of the annotation json file under each\n            action directory.\n        img_size (int|list): Target image size after resizing.\n        num_frames (int): Length of each frame sequence.\n        anno_path (str): Relative path to the annotation file.\n            Default: None.\n    \"\"\"\n\n    def __init__(\n            self,\n            dataset_dir,  # dataset root directory\n            image_dir,  # image folder\n            p3d_dir,  # folder of 3D keypoint files\n            json_path,\n            img_size,  # target image size after resizing\n            num_frames,  # length of each frame sequence\n            anno_path=None, ):\n\n        self.dataset_dir = dataset_dir\n        self.image_dir = image_dir\n        self.p3d_dir = p3d_dir\n        self.json_path = json_path\n        self.img_size = img_size\n        self.num_frames = num_frames\n        self.anno_path = anno_path\n\n        self.data_labels, self.mf_inds = self._generate_multi_frames_list()\n\n    def _generate_multi_frames_list(self):\n        act_list = os.listdir(self.dataset_dir)  # list of actions\n        count = 0\n        mf_list = []\n        annos_dict = {'images': [], 'annotations': [], 'act_inds': []}\n        for act in act_list:  # generate frame sequences for each action\n            if '.' in act:
\n                continue\n\n            json_path = os.path.join(self.dataset_dir, act, self.json_path)\n            with open(json_path, 'r') as j:\n                annos = json.load(j)\n            length = len(annos['images'])\n            for k, v in annos.items():\n                if k in annos_dict:\n                    annos_dict[k].extend(v)\n            annos_dict['act_inds'].extend([act] * length)\n\n            mf = [[i + j + count for j in range(self.num_frames)]\n                  for i in range(0, length - self.num_frames + 1)]\n            mf_list.extend(mf)\n            count += length\n\n        print(\"total number of frame sequences:\", len(mf_list))\n        return annos_dict, mf_list\n\n    def __call__(self, *args, **kwargs):\n        return self\n\n    def __getitem__(self, index):  # fetch one consecutive frame sequence\n        inds = self.mf_inds[\n            index]  # e.g. [568, 569, 570, 571, 572, 573], length is num_frames\n\n        images = self.data_labels['images']  # all images\n        annots = self.data_labels['annotations']  # all annots\n\n        act = self.data_labels['act_inds'][inds[0]]  # action name (folder name)\n\n        kps3d_list = []\n        kps3d_vis_list = []\n        names = []\n\n        h, w = 0, 0\n        for ind in inds:  # one image\n            height = float(images[ind]['height'])\n            width = float(images[ind]['width'])\n            name = images[ind]['file_name']  # image file name, with extension\n\n            kps3d_name = name.split('.')[0] + '.obj'\n            kps3d_path = os.path.join(self.dataset_dir, act, self.p3d_dir,\n                                      kps3d_name)\n\n            joints, joints_vis = self.kps3d_process(kps3d_path)\n            joints_vis = np.array(joints_vis, dtype=np.float32)\n\n            kps3d_list.append(joints)\n            kps3d_vis_list.append(joints_vis)\n            names.append(name)\n\n        kps3d = np.array(kps3d_list)  # (6, 24, 3),(num_frames, joints_num, 3)\n        kps3d_vis = np.array(kps3d_vis_list)\n\n        # read images\n        imgs = []\n        for name in names:\n            img_path = os.path.join(self.dataset_dir, act, self.image_dir, name)\n\n            image = cv2.imread(img_path, cv2.IMREAD_COLOR |\n                               cv2.IMREAD_IGNORE_ORIENTATION)\n            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)\n\n            imgs.append(np.expand_dims(image, axis=0))\n\n        imgs = np.concatenate(imgs, axis=0)\n        imgs = imgs.astype(\n            np.float32)  # (6, 1080, 1920, 3),(num_frames, h, w, c)\n\n        # attention: images and annotations are mirrored at this point\n        records = {\n            'kps3d': kps3d,\n            'kps3d_vis': kps3d_vis,\n            \"image\": imgs,\n            'act': act,\n            'names': names,\n            'im_id': index\n        }\n\n        return self.transform(records)\n\n    def kps3d_process(self, kps3d_path):\n        count = 0\n        kps = []\n        kps_vis = []\n\n        # parse vertex ('v') lines of the .obj file as keypoints\n        with open(kps3d_path, 'r') as f:\n            lines = f.readlines()\n            for line in lines:\n                if line[0] == 'v':\n                    kps.append([])\n                    line = line.strip('\\n').split(' ')[1:]\n                    for kp in line:\n                        kps[-1].append(float(kp))\n                    count += 1\n\n                    kps_vis.append([1, 1, 1])\n\n        kps = np.array(kps)  # (52, 3)\n        kps_vis = np.array(kps_vis)\n\n        kps *= 10  # scale points\n        kps -= kps[[0], :]  # set root point to zero\n\n        # keep the first 23 joints plus joint 37\n        kps = np.concatenate((kps[0:23], kps[[37]]), axis=0)  # (24, 3)
24,3\n\n        kps *= 10\n\n        kps_vis = np.concatenate((kps_vis[0:23], kps_vis[[37]]), axis=0)  # 24,3\n\n        return kps, kps_vis\n\n    def __len__(self):\n        return len(self.mf_inds)\n\n    def get_anno(self):\n        if self.anno_path is None:\n            return\n        return os.path.join(self.dataset_dir, self.anno_path)\n\n    def check_or_download_dataset(self):\n        return\n\n    def parse_dataset(self, ):\n        return\n\n    def set_transform(self, transform):\n        self.transform = transform\n\n    def set_epoch(self, epoch_id):\n        self._epoch = epoch_id\n\n    def set_kwargs(self, **kwargs):\n        self.mixup_epoch = kwargs.get('mixup_epoch', -1)\n        self.cutmix_epoch = kwargs.get('cutmix_epoch', -1)\n        self.mosaic_epoch = kwargs.get('mosaic_epoch', -1)\n"
  },
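  {
    "path": "examples/obj_keypoints_sketch.py",
    "content": "# Editor's illustrative sketch, not part of the original PaddleDetection\n# source tree: a minimal, self-contained rendition of how the kps3d_process\n# method of the multi-frame 3D keypoint dataset above reads joints from a\n# Wavefront .obj file. Vertex lines look like 'v x y z'; every vertex becomes\n# one joint with full visibility, and joints are re-centered on the root\n# joint. The toy vertex data below is invented for the demo.\nimport numpy as np\n\n\ndef parse_obj_keypoints(lines):\n    # Collect one (x, y, z) triple per 'v ...' vertex line, mirroring\n    # the parsing loop in kps3d_process.\n    kps, kps_vis = [], []\n    for line in lines:\n        if line.startswith('v '):\n            kps.append([float(v) for v in line.strip().split(' ')[1:]])\n            kps_vis.append([1, 1, 1])  # every parsed joint is marked visible\n    kps = np.array(kps, dtype=np.float32)\n    kps -= kps[[0], :]  # set the root joint as the origin, as the dataset does\n    return kps, np.array(kps_vis, dtype=np.float32)\n\n\nif __name__ == '__main__':\n    toy_obj = ['v 0.0 0.0 0.0', 'v 0.1 0.2 0.3', 'v -0.1 0.0 0.5']\n    joints, vis = parse_obj_keypoints(toy_obj)\n    print(joints.shape, vis.shape)  # (3, 3) (3, 3)\n"
  },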
  {
    "path": "ppdet/data/source/sniper_coco.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport cv2\nimport json\nimport copy\nimport numpy as np\n\ntry:\n    from collections.abc import Sequence\nexcept Exception:\n    from collections import Sequence\n\nfrom ppdet.core.workspace import register, serializable\nfrom ppdet.data.crop_utils.annotation_cropper import AnnoCropper\nfrom .coco import COCODataSet\nfrom .dataset import _make_dataset, _is_valid_file\nfrom ppdet.utils.logger import setup_logger\n\nlogger = setup_logger('sniper_coco_dataset')\n\n\n@register\n@serializable\nclass SniperCOCODataSet(COCODataSet):\n    \"\"\"SniperCOCODataSet\"\"\"\n\n    def __init__(self,\n                 dataset_dir=None,\n                 image_dir=None,\n                 anno_path=None,\n                 proposals_file=None,\n                 data_fields=['image'],\n                 sample_num=-1,\n                 load_crowd=False,\n                 allow_empty=True,\n                 empty_ratio=1.,\n                 is_trainset=True,\n                 image_target_sizes=[2000, 1000],\n                 valid_box_ratio_ranges=[[-1, 0.1],[0.08, -1]],\n                 chip_target_size=500,\n                 chip_target_stride=200,\n                 use_neg_chip=False,\n                 max_neg_num_per_im=8,\n                 max_per_img=-1,\n                 nms_thresh=0.5):\n        super(SniperCOCODataSet, self).__init__(\n            dataset_dir=dataset_dir,\n            image_dir=image_dir,\n            anno_path=anno_path,\n            data_fields=data_fields,\n            sample_num=sample_num,\n            load_crowd=load_crowd,\n            allow_empty=allow_empty,\n            empty_ratio=empty_ratio\n        )\n        self.proposals_file = proposals_file\n        self.proposals = None\n        self.anno_cropper = None\n        self.is_trainset = is_trainset\n        self.image_target_sizes = image_target_sizes\n        self.valid_box_ratio_ranges = valid_box_ratio_ranges\n        self.chip_target_size = chip_target_size\n        self.chip_target_stride = chip_target_stride\n        self.use_neg_chip = use_neg_chip\n        self.max_neg_num_per_im = max_neg_num_per_im\n        self.max_per_img = max_per_img\n        self.nms_thresh = nms_thresh\n\n\n    def parse_dataset(self):\n        if not hasattr(self, \"roidbs\"):\n            super(SniperCOCODataSet, self).parse_dataset()\n        if self.is_trainset:\n            self._parse_proposals()\n            self._merge_anno_proposals()\n        self.ori_roidbs = copy.deepcopy(self.roidbs)\n        self.init_anno_cropper()\n        self.roidbs = self.generate_chips_roidbs(self.roidbs, self.is_trainset)\n\n    def set_proposals_file(self, file_path):\n        self.proposals_file = file_path\n\n    def init_anno_cropper(self):\n        logger.info(\"Init AnnoCropper...\")\n        self.anno_cropper = AnnoCropper(\n            image_target_sizes=self.image_target_sizes,\n            
valid_box_ratio_ranges=self.valid_box_ratio_ranges,\n            chip_target_size=self.chip_target_size,\n            chip_target_stride=self.chip_target_stride,\n            use_neg_chip=self.use_neg_chip,\n            max_neg_num_per_im=self.max_neg_num_per_im,\n            max_per_img=self.max_per_img,\n            nms_thresh=self.nms_thresh\n        )\n\n    def generate_chips_roidbs(self, roidbs, is_trainset):\n        if is_trainset:\n            roidbs = self.anno_cropper.crop_anno_records(roidbs)\n        else:\n            roidbs = self.anno_cropper.crop_infer_anno_records(roidbs)\n        return roidbs\n\n    def _parse_proposals(self):\n        if self.proposals_file:\n            self.proposals = {}\n            logger.info(\"Parse proposals file:{}\".format(self.proposals_file))\n            with open(self.proposals_file, 'r') as f:\n                proposals = json.load(f)\n            for prop in proposals:\n                image_id = prop[\"image_id\"]\n                if image_id not in self.proposals:\n                    self.proposals[image_id] = []\n                x, y, w, h = prop[\"bbox\"]\n                self.proposals[image_id].append([x, y, x + w, y + h])\n\n    def _merge_anno_proposals(self):\n        assert self.roidbs\n        if self.proposals and len(self.proposals.keys()) > 0:\n            logger.info(\"merge proposals to annos\")\n            for id, record in enumerate(self.roidbs):\n                image_id = int(record[\"im_id\"])\n                if image_id not in self.proposals.keys():\n                    logger.info(\"image id :{} no proposals\".format(image_id))\n                record[\"proposals\"] = np.array(self.proposals.get(image_id, []), dtype=np.float32)\n                self.roidbs[id] = record\n\n    def get_ori_roidbs(self):\n        if not hasattr(self, \"ori_roidbs\"):\n            return None\n        return self.ori_roidbs\n\n    def get_roidbs(self):\n        if not hasattr(self, \"roidbs\"):\n            self.parse_dataset()\n        return self.roidbs\n\n    def set_roidbs(self, roidbs):\n        self.roidbs = roidbs\n\n    def check_or_download_dataset(self):\n        return\n\n    def _parse(self):\n        image_dir = self.image_dir\n        if not isinstance(image_dir, Sequence):\n            image_dir = [image_dir]\n        images = []\n        for im_dir in image_dir:\n            if os.path.isdir(im_dir):\n                im_dir = os.path.join(self.dataset_dir, im_dir)\n                images.extend(_make_dataset(im_dir))\n            elif os.path.isfile(im_dir) and _is_valid_file(im_dir):\n                images.append(im_dir)\n        return images\n\n    def _load_images(self):\n        images = self._parse()\n        ct = 0\n        records = []\n        for image in images:\n            assert image != '' and os.path.isfile(image), \\\n                \"Image {} not found\".format(image)\n            if self.sample_num > 0 and ct >= self.sample_num:\n                break\n            im = cv2.imread(image)\n            h, w, c = im.shape\n            rec = {'im_id': np.array([ct]), 'im_file': image, \"h\": h, \"w\": w}\n            self._imid2path[ct] = image\n            ct += 1\n            records.append(rec)\n        assert len(records) > 0, \"No image file found\"\n        return records\n\n    def get_imid2path(self):\n        return self._imid2path\n\n    def set_images(self, images):\n        self._imid2path = {}\n        self.image_dir = images\n        self.roidbs = self._load_images()\n\n"
  },
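  {
    "path": "examples/sniper_proposals_sketch.py",
    "content": "# Editor's illustrative sketch, not part of the original PaddleDetection\n# source tree: how SniperCOCODataSet._parse_proposals (above) groups a\n# COCO-style proposals list by image_id and converts each [x, y, w, h] box\n# to [x1, y1, x2, y2]. The toy proposals below are invented for the demo.\nimport numpy as np\n\n\ndef group_proposals(proposals):\n    grouped = {}\n    for prop in proposals:\n        # xywh -> xyxy, keyed by the image the proposal belongs to\n        x, y, w, h = prop['bbox']\n        grouped.setdefault(prop['image_id'], []).append([x, y, x + w, y + h])\n    return {k: np.array(v, dtype=np.float32) for k, v in grouped.items()}\n\n\nif __name__ == '__main__':\n    toy = [{'image_id': 1, 'bbox': [10, 20, 30, 40]},\n           {'image_id': 1, 'bbox': [0, 0, 5, 5]},\n           {'image_id': 2, 'bbox': [7, 7, 2, 2]}]\n    print(group_proposals(toy)[1])  # two xyxy boxes for image 1\n"
  },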
  {
    "path": "ppdet/data/source/voc.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport numpy as np\n\nimport xml.etree.ElementTree as ET\n\nfrom ppdet.core.workspace import register, serializable\n\nfrom .dataset import DetDataset\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n\n@register\n@serializable\nclass VOCDataSet(DetDataset):\n    \"\"\"\n    Load dataset with PascalVOC format.\n\n    Notes:\n    `anno_path` must contains xml file and image file path for annotations.\n\n    Args:\n        dataset_dir (str): root directory for dataset.\n        image_dir (str): directory for images.\n        anno_path (str): voc annotation file path.\n        data_fields (list): key name of data dictionary, at least have 'image'.\n        sample_num (int): number of samples to load, -1 means all.\n        label_list (str): if use_default_label is False, will load\n            mapping between category and class index.\n        allow_empty (bool): whether to load empty entry. False as default\n        empty_ratio (float): the ratio of empty record number to total \n            record's, if empty_ratio is out of [0. ,1.), do not sample the \n            records and use all the empty entries. 1. as default\n        repeat (int): repeat times for dataset, use in benchmark.\n    \"\"\"\n\n    def __init__(self,\n                 dataset_dir=None,\n                 image_dir=None,\n                 anno_path=None,\n                 data_fields=['image'],\n                 sample_num=-1,\n                 label_list=None,\n                 allow_empty=False,\n                 empty_ratio=1.,\n                 repeat=1):\n        super(VOCDataSet, self).__init__(\n            dataset_dir=dataset_dir,\n            image_dir=image_dir,\n            anno_path=anno_path,\n            data_fields=data_fields,\n            sample_num=sample_num,\n            repeat=repeat)\n        self.label_list = label_list\n        self.allow_empty = allow_empty\n        self.empty_ratio = empty_ratio\n\n    def _sample_empty(self, records, num):\n        # if empty_ratio is out of [0. ,1.), do not sample the records\n        if self.empty_ratio < 0. 
or self.empty_ratio >= 1.:\n            return records\n        import random\n        sample_num = min(\n            int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records))\n        records = random.sample(records, sample_num)\n        return records\n\n    def parse_dataset(self, ):\n        anno_path = os.path.join(self.dataset_dir, self.anno_path)\n        image_dir = os.path.join(self.dataset_dir, self.image_dir)\n\n        # mapping category name to class id\n        # first_class:0, second_class:1, ...\n        records = []\n        empty_records = []\n        ct = 0\n        cname2cid = {}\n        if self.label_list:\n            label_path = os.path.join(self.dataset_dir, self.label_list)\n            if not os.path.exists(label_path):\n                raise ValueError(\"label_list {} does not exist\".format(\n                    label_path))\n            with open(label_path, 'r') as fr:\n                label_id = 0\n                for line in fr.readlines():\n                    cname2cid[line.strip()] = label_id\n                    label_id += 1\n        else:\n            cname2cid = pascalvoc_label()\n\n        with open(anno_path, 'r') as fr:\n            while True:\n                line = fr.readline()\n                if not line:\n                    break\n                img_file, xml_file = [os.path.join(image_dir, x) \\\n                        for x in line.strip().split()[:2]]\n                if not os.path.exists(img_file):\n                    logger.warning(\n                        'Illegal image file: {}, and it will be ignored'.format(\n                            img_file))\n                    continue\n                if not os.path.isfile(xml_file):\n                    logger.warning(\n                        'Illegal xml file: {}, and it will be ignored'.format(\n                            xml_file))\n                    continue\n                tree = ET.parse(xml_file)\n                if tree.find('id') is None:\n                    im_id = np.array([ct])\n                else:\n                    im_id = np.array([int(tree.find('id').text)])\n\n                objs = tree.findall('object')\n                im_w = float(tree.find('size').find('width').text)\n                im_h = float(tree.find('size').find('height').text)\n                if im_w < 0 or im_h < 0:\n                    logger.warning(\n                        'Illegal width: {} or height: {} in annotation, '\n                        'and {} will be ignored'.format(im_w, im_h, xml_file))\n                    continue\n\n                num_bbox, i = len(objs), 0\n                gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)\n                gt_class = np.zeros((num_bbox, 1), dtype=np.int32)\n                gt_score = np.zeros((num_bbox, 1), dtype=np.float32)\n                difficult = np.zeros((num_bbox, 1), dtype=np.int32)\n                for obj in objs:\n                    cname = obj.find('name').text\n\n                    # user dataset may not contain difficult field\n                    _difficult = obj.find('difficult')\n                    _difficult = int(\n                        _difficult.text) if _difficult is not None else 0\n\n                    x1 = float(obj.find('bndbox').find('xmin').text)\n                    y1 = float(obj.find('bndbox').find('ymin').text)\n                    x2 = float(obj.find('bndbox').find('xmax').text)\n                    y2 = float(obj.find('bndbox').find('ymax').text)\n                    x1 = max(0, 
x1)\n                    y1 = max(0, y1)\n                    x2 = min(im_w - 1, x2)\n                    y2 = min(im_h - 1, y2)\n                    if x2 > x1 and y2 > y1:\n                        gt_bbox[i, :] = [x1, y1, x2, y2]\n                        gt_class[i, 0] = cname2cid[cname]\n                        gt_score[i, 0] = 1.\n                        difficult[i, 0] = _difficult\n                        i += 1\n                    else:\n                        logger.warning(\n                            'Found an invalid bbox in annotations: xml_file: {}'\n                            ', x1: {}, y1: {}, x2: {}, y2: {}.'.format(\n                                xml_file, x1, y1, x2, y2))\n                gt_bbox = gt_bbox[:i, :]\n                gt_class = gt_class[:i, :]\n                gt_score = gt_score[:i, :]\n                difficult = difficult[:i, :]\n\n                voc_rec = {\n                    'im_file': img_file,\n                    'im_id': im_id,\n                    'h': im_h,\n                    'w': im_w\n                } if 'image' in self.data_fields else {}\n\n                gt_rec = {\n                    'gt_class': gt_class,\n                    'gt_score': gt_score,\n                    'gt_bbox': gt_bbox,\n                    'difficult': difficult\n                }\n                for k, v in gt_rec.items():\n                    if k in self.data_fields:\n                        voc_rec[k] = v\n\n                if len(objs) == 0:\n                    empty_records.append(voc_rec)\n                else:\n                    records.append(voc_rec)\n\n                ct += 1\n                if self.sample_num > 0 and ct >= self.sample_num:\n                    break\n        assert ct > 0, 'not found any voc record in %s' % (self.anno_path)\n        logger.debug('{} samples in file {}'.format(ct, anno_path))\n        if self.allow_empty and len(empty_records) > 0:\n            empty_records = self._sample_empty(empty_records, len(records))\n            records += empty_records\n        self.roidbs, self.cname2cid = records, cname2cid\n\n    def get_label_list(self):\n        return os.path.join(self.dataset_dir, self.label_list)\n\n\ndef pascalvoc_label():\n    labels_map = {\n        'aeroplane': 0,\n        'bicycle': 1,\n        'bird': 2,\n        'boat': 3,\n        'bottle': 4,\n        'bus': 5,\n        'car': 6,\n        'cat': 7,\n        'chair': 8,\n        'cow': 9,\n        'diningtable': 10,\n        'dog': 11,\n        'horse': 12,\n        'motorbike': 13,\n        'person': 14,\n        'pottedplant': 15,\n        'sheep': 16,\n        'sofa': 17,\n        'train': 18,\n        'tvmonitor': 19\n    }\n    return labels_map\n"
  },
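  {
    "path": "examples/voc_empty_ratio_sketch.py",
    "content": "# Editor's illustrative sketch, not part of the original PaddleDetection\n# source tree: the empty-record sampling rule from VOCDataSet._sample_empty\n# (above). Keeping num_empty = num_gt * r / (1 - r) empty records makes them\n# roughly the fraction r of the merged record list; a ratio outside [0., 1.)\n# disables sampling and keeps every empty record.\nimport random\n\n\ndef sample_empty(empty_records, num_gt, empty_ratio):\n    if empty_ratio < 0. or empty_ratio >= 1.:\n        return empty_records\n    k = min(int(num_gt * empty_ratio / (1 - empty_ratio)), len(empty_records))\n    return random.sample(empty_records, k)\n\n\nif __name__ == '__main__':\n    empties = [{'im_id': i} for i in range(100)]\n    kept = sample_empty(empties, num_gt=90, empty_ratio=0.1)\n    print(len(kept))  # 10, i.e. 10 / (90 + 10) == 0.1 of the final dataset\n"
  },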
  {
    "path": "ppdet/data/source/widerface.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom collections import defaultdict\nimport os\nimport numpy as np\nfrom scipy.io import loadmat\n\nfrom ppdet.core.workspace import register, serializable\nfrom .dataset import DetDataset\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n\n@register\n@serializable\nclass WIDERFaceDataSet(DetDataset):\n    \"\"\"\n    Load WiderFace records with 'anno_path'\n\n    Args:\n        dataset_dir (str): root directory for dataset.\n        image_dir (str): directory for images.\n        anno_path (str): WiderFace annotation data.\n        data_fields (list): key name of data dictionary, at least have 'image'.\n        sample_num (int): number of samples to load, -1 means all.\n        with_lmk (bool): whether to load face landmark keypoint labels.\n    \"\"\"\n\n    def __init__(self,\n                 dataset_dir=None,\n                 image_dir=None,\n                 anno_path=None,\n                 data_fields=['image'],\n                 sample_num=-1,\n                 with_lmk=False):\n        super(WIDERFaceDataSet, self).__init__(\n            dataset_dir=dataset_dir,\n            image_dir=image_dir,\n            anno_path=anno_path,\n            data_fields=data_fields,\n            sample_num=sample_num,\n            with_lmk=with_lmk)\n        self.anno_path = anno_path\n        self.sample_num = sample_num\n        self.roidbs = None\n        self.cname2cid = None\n        self.with_lmk = with_lmk\n\n    def parse_dataset(self):\n        anno_path = os.path.join(self.dataset_dir, self.anno_path)\n        image_dir = os.path.join(self.dataset_dir, self.image_dir)\n\n        txt_file = anno_path\n\n        records = []\n        ct = 0\n        file_lists = self._load_file_list(txt_file)\n        cname2cid = widerface_label()\n\n        for item in file_lists:\n            im_fname = item[0]\n            im_id = np.array([ct])\n            gt_bbox = np.zeros((len(item) - 1, 4), dtype=np.float32)\n            gt_class = np.zeros((len(item) - 1, 1), dtype=np.int32)\n            gt_lmk_labels = np.zeros((len(item) - 1, 10), dtype=np.float32)\n            lmk_ignore_flag = np.zeros((len(item) - 1, 1), dtype=np.int32)\n            for index_box in range(len(item)):\n                if index_box < 1:\n                    continue\n                gt_bbox[index_box - 1] = item[index_box][0]\n                if self.with_lmk:\n                    gt_lmk_labels[index_box - 1] = item[index_box][1]\n                    lmk_ignore_flag[index_box - 1] = item[index_box][2]\n            im_fname = os.path.join(image_dir,\n                                    im_fname) if image_dir else im_fname\n            widerface_rec = {\n                'im_file': im_fname,\n                'im_id': im_id,\n            } if 'image' in self.data_fields else {}\n            gt_rec = {\n                'gt_bbox': gt_bbox,\n                
'gt_class': gt_class,\n            }\n            for k, v in gt_rec.items():\n                if k in self.data_fields:\n                    widerface_rec[k] = v\n            if self.with_lmk:\n                widerface_rec['gt_keypoint'] = gt_lmk_labels\n                widerface_rec['keypoint_ignore'] = lmk_ignore_flag\n\n            if len(item) != 0:\n                records.append(widerface_rec)\n\n            ct += 1\n            if self.sample_num > 0 and ct >= self.sample_num:\n                break\n        assert len(records) > 0, 'not found any widerface record in %s' % (anno_path)\n        logger.debug('{} samples in file {}'.format(ct, anno_path))\n        self.roidbs, self.cname2cid = records, cname2cid\n\n    def _load_file_list(self, input_txt):\n        with open(input_txt, 'r') as f_dir:\n            lines_input_txt = f_dir.readlines()\n\n        file_dict = {}\n        num_class = 0\n        exts = ['jpg', 'jpeg', 'png', 'bmp']\n        exts += [ext.upper() for ext in exts]\n        for i in range(len(lines_input_txt)):\n            line_txt = lines_input_txt[i].strip('\\n\\t\\r')\n            split_str = line_txt.split(' ')\n            if len(split_str) == 1:\n                img_file_name = os.path.split(split_str[0])[1]\n                split_txt = img_file_name.split('.')\n                if len(split_txt) < 2:\n                    continue\n                elif split_txt[-1] in exts:\n                    if i != 0:\n                        num_class += 1\n                    file_dict[num_class] = [line_txt]\n            else:\n                if len(line_txt) <= 6:\n                    continue\n                result_boxs = []\n                xmin = float(split_str[0])\n                ymin = float(split_str[1])\n                w = float(split_str[2])\n                h = float(split_str[3])\n                # Filter out wrong labels\n                if w < 0 or h < 0:\n                    logger.warning('Illegal box with w: {}, h: {} in '\n                                   'img: {}, and it will be ignored'.format(\n                                       w, h, file_dict[num_class][0]))\n                    continue\n                xmin = max(0, xmin)\n                ymin = max(0, ymin)\n                xmax = xmin + w\n                ymax = ymin + h\n                gt_bbox = [xmin, ymin, xmax, ymax]\n                result_boxs.append(gt_bbox)\n                if self.with_lmk:\n                    assert len(split_str) > 18, 'When `with_lmk=True`, the number ' \\\n                            'of fields per line in the annotation file should ' \\\n                            'exceed 18.'\n                    lmk0_x = float(split_str[5])\n                    lmk0_y = float(split_str[6])\n                    lmk1_x = float(split_str[8])\n                    lmk1_y = float(split_str[9])\n                    lmk2_x = float(split_str[11])\n                    lmk2_y = float(split_str[12])\n                    lmk3_x = float(split_str[14])\n                    lmk3_y = float(split_str[15])\n                    lmk4_x = float(split_str[17])\n                    lmk4_y = float(split_str[18])\n                    lmk_ignore_flag = 0 if lmk0_x == -1 else 1\n                    gt_lmk_label = [\n                        lmk0_x, lmk0_y, lmk1_x, lmk1_y, lmk2_x, lmk2_y, lmk3_x,\n                        lmk3_y, lmk4_x, lmk4_y\n                    ]\n                    result_boxs.append(gt_lmk_label)\n                    result_boxs.append(lmk_ignore_flag)\n                
file_dict[num_class].append(result_boxs)\n\n        return list(file_dict.values())\n\n\ndef widerface_label():\n    labels_map = {'face': 0}\n    return labels_map\n\n\n@register\n@serializable\nclass WIDERFaceValDataset(WIDERFaceDataSet):\n    def __init__(self,\n                 dataset_dir=None,\n                 image_dir=None,\n                 anno_path=None,\n                 gt_mat_path=None,\n                 data_fields=['image'],\n                 sample_num=-1,\n                 with_lmk=False):\n        super().__init__(\n            dataset_dir=dataset_dir,\n            image_dir=image_dir,\n            anno_path=anno_path,\n            data_fields=data_fields,\n            sample_num=sample_num,\n            with_lmk=with_lmk)\n        self.gt_mat_path = gt_mat_path\n        self.val_mat = os.path.join(self.dataset_dir, self.gt_mat_path, 'wider_face_val.mat')\n        self.hard_mat_path = os.path.join(self.dataset_dir, self.gt_mat_path, 'wider_hard_val.mat')\n        self.medium_mat_path = os.path.join(self.dataset_dir, self.gt_mat_path, 'wider_medium_val.mat')\n        self.easy_mat_path = os.path.join(self.dataset_dir, self.gt_mat_path, 'wider_easy_val.mat')\n\n        assert os.path.exists(self.val_mat), f'{self.val_mat} does not exist'\n        assert os.path.exists(self.hard_mat_path), f'{self.hard_mat_path} does not exist'\n        assert os.path.exists(self.medium_mat_path), f'{self.medium_mat_path} does not exist'\n        assert os.path.exists(self.easy_mat_path), f'{self.easy_mat_path} does not exist'\n\n    def parse_dataset(self):\n        super().parse_dataset()\n\n        box_list, file_list, event_list, hard_info_list, medium_info_list, \\\n            easy_info_list = self.get_gt_infos()\n        setting_infos = [easy_info_list, medium_info_list, hard_info_list]\n        settings = ['easy', 'medium', 'hard']\n        info_by_name = defaultdict(dict)\n        for setting_id in range(3):\n            info_list = setting_infos[setting_id]\n            setting = settings[setting_id]\n            for i in range(len(event_list)):\n                img_list = file_list[i][0]\n                gt_box_list = box_list[i][0]\n                sub_info_list = info_list[i][0]\n                for j in range(len(img_list)):\n                    img_name = str(img_list[j][0][0])\n                    gt_boxes = gt_box_list[j][0].astype(np.float32)\n                    info_by_name[img_name]['gt_ori_bbox'] = gt_boxes\n\n                    keep_index = sub_info_list[j][0]\n                    ignore = np.zeros(gt_boxes.shape[0])\n                    if len(keep_index) != 0:\n                        ignore[keep_index - 1] = 1\n                    info_by_name[img_name][f'gt_{setting}_ignore'] = ignore\n\n        for roidb in self.roidbs:\n            img_file = roidb['im_file'].split('/')[-1]\n            img_name = \".\".join(img_file.split(\".\")[:-1])\n            roidb.update(info_by_name[img_name])\n\n    def get_gt_infos(self):\n        \"\"\"gt dir: (wider_face_val.mat, wider_easy_val.mat, wider_medium_val.mat, wider_hard_val.mat)\"\"\"\n\n        val_mat = loadmat(self.val_mat)\n        hard_mat = loadmat(self.hard_mat_path)\n        medium_mat = loadmat(self.medium_mat_path)\n        easy_mat = loadmat(self.easy_mat_path)\n\n        box_list = val_mat['face_bbx_list']\n        file_list = val_mat['file_list']\n        event_list = val_mat['event_list']\n\n        hard_info_list = hard_mat['gt_list']\n        medium_info_list = medium_mat['gt_list']\n        easy_info_list = 
easy_mat['gt_list']\n\n        return box_list, file_list, event_list, hard_info_list, \\\n            medium_info_list, easy_info_list\n"
  },
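  {
    "path": "examples/widerface_box_line_sketch.py",
    "content": "# Editor's illustrative sketch, not part of the original PaddleDetection\n# source tree: how one WIDER FACE box line ('xmin ymin w h ...') is turned\n# into a clipped [xmin, ymin, xmax, ymax] box inside\n# WIDERFaceDataSet._load_file_list (above). The sample lines are invented.\n\n\ndef parse_box_line(line_txt):\n    xmin, ymin, w, h = [float(t) for t in line_txt.split(' ')[:4]]\n    if w < 0 or h < 0:  # filter out wrong labels, as the loader does\n        return None\n    xmin, ymin = max(0., xmin), max(0., ymin)\n    return [xmin, ymin, xmin + w, ymin + h]\n\n\nif __name__ == '__main__':\n    print(parse_box_line('-5 10 20 30'))  # [0.0, 10.0, 20.0, 40.0]\n    print(parse_box_line('5 10 -1 30'))  # None (illegal width)\n"
  },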
  {
    "path": "ppdet/data/transform/__init__.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom . import operators\nfrom . import batch_operators\nfrom . import keypoint_operators\nfrom . import mot_operators\nfrom . import rotated_operators\nfrom . import keypoints_3d_operators\nfrom . import culane_operators\n\nfrom .operators import *\nfrom .batch_operators import *\nfrom .keypoint_operators import *\nfrom .mot_operators import *\nfrom .rotated_operators import *\nfrom .keypoints_3d_operators import *\nfrom .culane_operators import *\n\n__all__ = []\n__all__ += registered_ops\n__all__ += keypoint_operators.__all__\n__all__ += mot_operators.__all__\n__all__ += culane_operators.__all__\n"
  },
  {
    "path": "ppdet/data/transform/atss_assigner.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# The code is based on:\n# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/assigners/atss_assigner.py\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport numpy as np\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n\ndef bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):\n    \"\"\"Calculate overlap between two set of bboxes.\n    If ``is_aligned `` is ``False``, then calculate the overlaps between each\n    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned\n    pair of bboxes1 and bboxes2.\n    Args:\n        bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.\n        bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.\n            B indicates the batch dim, in shape (B1, B2, ..., Bn).\n            If ``is_aligned `` is ``True``, then m and n must be equal.\n        mode (str): \"iou\" (intersection over union) or \"iof\" (intersection over\n            foreground).\n        is_aligned (bool, optional): If True, then m and n must be equal.\n            Default False.\n        eps (float, optional): A value added to the denominator for numerical\n            stability. Default 1e-6.\n    Returns:\n        Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,)\n    \"\"\"\n    assert mode in ['iou', 'iof', 'giou', 'diou'], 'Unsupported mode {}'.format(\n        mode)\n    # Either the boxes are empty or the length of boxes's last dimenstion is 4\n    assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)\n    assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)\n\n    # Batch dim must be the same\n    # Batch dim: (B1, B2, ... 
Bn)\n    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]\n    batch_shape = bboxes1.shape[:-2]\n\n    rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0\n    cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0\n    if is_aligned:\n        assert rows == cols\n\n    if rows * cols == 0:\n        if is_aligned:\n            return np.random.random(batch_shape + (rows, ))\n        else:\n            return np.random.random(batch_shape + (rows, cols))\n\n    area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (\n        bboxes1[..., 3] - bboxes1[..., 1])\n    area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (\n        bboxes2[..., 3] - bboxes2[..., 1])\n\n    if is_aligned:\n        lt = np.maximum(bboxes1[..., :2], bboxes2[..., :2])  # [B, rows, 2]\n        rb = np.minimum(bboxes1[..., 2:], bboxes2[..., 2:])  # [B, rows, 2]\n\n        wh = (rb - lt).clip(min=0)  # [B, rows, 2]\n        overlap = wh[..., 0] * wh[..., 1]\n\n        if mode in ['iou', 'giou']:\n            union = area1 + area2 - overlap\n        else:\n            union = area1\n        if mode == 'giou':\n            enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2])\n            enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:])\n        if mode == 'diou':\n            enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2])\n            enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:])\n            b1_x1, b1_y1 = bboxes1[..., 0], bboxes1[..., 1]\n            b1_x2, b1_y2 = bboxes1[..., 2], bboxes1[..., 3]\n            b2_x1, b2_y1 = bboxes2[..., 0], bboxes2[..., 1]\n            b2_x2, b2_y2 = bboxes2[..., 2], bboxes2[..., 3]\n    else:\n        lt = np.maximum(bboxes1[..., :, None, :2],\n                        bboxes2[..., None, :, :2])  # [B, rows, cols, 2]\n        rb = np.minimum(bboxes1[..., :, None, 2:],\n                        bboxes2[..., None, :, 2:])  # [B, rows, cols, 2]\n\n        wh = (rb - lt).clip(min=0)  # [B, rows, cols, 2]\n        overlap = wh[..., 0] * wh[..., 1]\n\n        if mode in ['iou', 'giou']:\n            union = area1[..., None] + area2[..., None, :] - overlap\n        else:\n            union = area1[..., None]\n        if mode == 'giou':\n            enclosed_lt = np.minimum(bboxes1[..., :, None, :2],\n                                     bboxes2[..., None, :, :2])\n            enclosed_rb = np.maximum(bboxes1[..., :, None, 2:],\n                                     bboxes2[..., None, :, 2:])\n        if mode == 'diou':\n            enclosed_lt = np.minimum(bboxes1[..., :, None, :2],\n                                     bboxes2[..., None, :, :2])\n            enclosed_rb = np.maximum(bboxes1[..., :, None, 2:],\n                                     bboxes2[..., None, :, 2:])\n            b1_x1, b1_y1 = bboxes1[..., :, None, 0], bboxes1[..., :, None, 1]\n            b1_x2, b1_y2 = bboxes1[..., :, None, 2], bboxes1[..., :, None, 3]\n            b2_x1, b2_y1 = bboxes2[..., None, :, 0], bboxes2[..., None, :, 1]\n            b2_x2, b2_y2 = bboxes2[..., None, :, 2], bboxes2[..., None, :, 3]\n\n    eps = np.array([eps])\n    union = np.maximum(union, eps)\n    ious = overlap / union\n    if mode in ['iou', 'iof']:\n        return ious\n    # calculate gious\n    if mode in ['giou']:\n        enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)\n        enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]\n        enclose_area = np.maximum(enclose_area, eps)\n        gious = ious - (enclose_area - union) / enclose_area\n        return gious\n    if mode in 
['diou']:\n        left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4\n        right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4\n        rho2 = left + right\n        enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)\n        enclose_c = enclose_wh[..., 0]**2 + enclose_wh[..., 1]**2\n        enclose_c = np.maximum(enclose_c, eps)\n        dious = ious - rho2 / enclose_c\n        return dious\n\n\ndef topk_(input, k, axis=1, largest=True):\n    x = -input if largest else input\n    if axis == 0:\n        row_index = np.arange(input.shape[1 - axis])\n        if k == x.shape[0]:  # argpartition requires index < len(input)\n            topk_index = np.argpartition(x, k - 1, axis=axis)[0:k, :]\n        else:\n            topk_index = np.argpartition(x, k, axis=axis)[0:k, :]\n\n        topk_data = x[topk_index, row_index]\n\n        topk_index_sort = np.argsort(topk_data, axis=axis)\n        topk_data_sort = topk_data[topk_index_sort, row_index]\n        topk_index_sort = topk_index[0:k, :][topk_index_sort, row_index]\n    else:\n        column_index = np.arange(x.shape[1 - axis])[:, None]\n        topk_index = np.argpartition(x, k, axis=axis)[:, 0:k]\n        topk_data = x[column_index, topk_index]\n        topk_data = -topk_data if largest else topk_data\n        topk_index_sort = np.argsort(topk_data, axis=axis)\n        topk_data_sort = topk_data[column_index, topk_index_sort]\n        topk_index_sort = topk_index[:, 0:k][column_index, topk_index_sort]\n\n    return topk_data_sort, topk_index_sort\n\n\nclass ATSSAssigner(object):\n    \"\"\"Assign a corresponding gt bbox or background to each bbox.\n\n    Each proposal will be assigned `0` or a positive integer\n    indicating the ground truth index.\n\n    - 0: negative sample, no assigned gt\n    - positive integer: positive sample, index (1-based) of assigned gt\n\n    Args:\n        topk (int): number of bboxes selected in each level\n    \"\"\"\n\n    def __init__(self, topk=9):\n        self.topk = topk\n\n    def __call__(self,\n                 bboxes,\n                 num_level_bboxes,\n                 gt_bboxes,\n                 gt_bboxes_ignore=None,\n                 gt_labels=None):\n        \"\"\"Assign gt to bboxes.\n        The assignment is done in the following steps\n        1. compute iou between all bbox (bbox of all pyramid levels) and gt\n        2. compute center distance between all bbox and gt\n        3. on each pyramid level, for each gt, select k bboxes whose centers\n           are closest to the gt center, so we select k*l bboxes in total as\n           candidates for each gt\n        4. get the corresponding iou for these candidates, and compute the\n           mean and std, set mean + std as the iou threshold\n        5. select these candidates whose iou are greater than or equal to\n           the threshold as positive\n        6. 
limit the positive sample's center in gt\n        Args:\n            bboxes (np.array): Bounding boxes to be assigned, shape(n, 4).\n            num_level_bboxes (List): num of bboxes in each level\n            gt_bboxes (np.array): Groundtruth boxes, shape (k, 4).\n            gt_bboxes_ignore (np.array, optional): Ground truth bboxes that are\n                labelled as `ignored`, e.g., crowd boxes in COCO.\n            gt_labels (np.array, optional): Label of gt_bboxes, shape (k, ).\n        \"\"\"\n        bboxes = bboxes[:, :4]\n        num_gt, num_bboxes = gt_bboxes.shape[0], bboxes.shape[0]\n\n        # assign 0 by default\n        assigned_gt_inds = np.zeros((num_bboxes, ), dtype=np.int64)\n\n        if num_gt == 0 or num_bboxes == 0:\n            # No ground truth or boxes, return empty assignment\n            max_overlaps = np.zeros((num_bboxes, ))\n            if num_gt == 0:\n                # No truth, assign everything to background\n                assigned_gt_inds[:] = 0\n            if not np.any(gt_labels):\n                assigned_labels = None\n            else:\n                assigned_labels = -np.ones((num_bboxes, ), dtype=np.int64)\n            return assigned_gt_inds, max_overlaps\n\n        # compute iou between all bbox and gt\n        overlaps = bbox_overlaps(bboxes, gt_bboxes)\n        # compute center distance between all bbox and gt\n        gt_cx = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0\n        gt_cy = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0\n        gt_points = np.stack((gt_cx, gt_cy), axis=1)\n\n        bboxes_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0\n        bboxes_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0\n        bboxes_points = np.stack((bboxes_cx, bboxes_cy), axis=1)\n\n        distances = np.sqrt(\n            np.power((bboxes_points[:, None, :] - gt_points[None, :, :]), 2)\n            .sum(-1))\n\n        # Selecting candidates based on the center distance\n        candidate_idxs = []\n        start_idx = 0\n        for bboxes_per_level in num_level_bboxes:\n            # on each pyramid level, for each gt,\n            # select k bbox whose center are closest to the gt center\n            end_idx = start_idx + bboxes_per_level\n            distances_per_level = distances[start_idx:end_idx, :]\n            selectable_k = min(self.topk, bboxes_per_level)\n            _, topk_idxs_per_level = topk_(\n                distances_per_level, selectable_k, axis=0, largest=False)\n            candidate_idxs.append(topk_idxs_per_level + start_idx)\n            start_idx = end_idx\n        candidate_idxs = np.concatenate(candidate_idxs, axis=0)\n\n        # get corresponding iou for the these candidates, and compute the\n        # mean and std, set mean + std as the iou threshold\n        candidate_overlaps = overlaps[candidate_idxs, np.arange(num_gt)]\n        overlaps_mean_per_gt = candidate_overlaps.mean(0)\n        overlaps_std_per_gt = candidate_overlaps.std(0)\n        overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt\n\n        is_pos = candidate_overlaps >= overlaps_thr_per_gt[None, :]\n\n        # limit the positive sample's center in gt\n        for gt_idx in range(num_gt):\n            candidate_idxs[:, gt_idx] += gt_idx * num_bboxes\n        ep_bboxes_cx = np.broadcast_to(\n            bboxes_cx.reshape(1, -1), [num_gt, num_bboxes]).reshape(-1)\n        ep_bboxes_cy = np.broadcast_to(\n            bboxes_cy.reshape(1, -1), [num_gt, num_bboxes]).reshape(-1)\n        candidate_idxs = candidate_idxs.reshape(-1)\n\n        # 
calculate the left, top, right, bottom distance between positive\n        # bbox center and gt side\n        l_ = ep_bboxes_cx[candidate_idxs].reshape(-1, num_gt) - gt_bboxes[:, 0]\n        t_ = ep_bboxes_cy[candidate_idxs].reshape(-1, num_gt) - gt_bboxes[:, 1]\n        r_ = gt_bboxes[:, 2] - ep_bboxes_cx[candidate_idxs].reshape(-1, num_gt)\n        b_ = gt_bboxes[:, 3] - ep_bboxes_cy[candidate_idxs].reshape(-1, num_gt)\n        is_in_gts = np.stack([l_, t_, r_, b_], axis=1).min(axis=1) > 0.01\n        is_pos = is_pos & is_in_gts\n\n        # if an anchor box is assigned to multiple gts,\n        # the one with the highest IoU will be selected.\n        overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1)\n        index = candidate_idxs.reshape(-1)[is_pos.reshape(-1)]\n        overlaps_inf[index] = overlaps.T.reshape(-1)[index]\n        overlaps_inf = overlaps_inf.reshape(num_gt, -1).T\n\n        max_overlaps = overlaps_inf.max(axis=1)\n        argmax_overlaps = overlaps_inf.argmax(axis=1)\n        assigned_gt_inds[max_overlaps !=\n                         -np.inf] = argmax_overlaps[max_overlaps != -np.inf] + 1\n\n        return assigned_gt_inds, max_overlaps\n\n    def get_vlr_region(self,\n                       bboxes,\n                       num_level_bboxes,\n                       gt_bboxes,\n                       gt_bboxes_ignore=None,\n                       gt_labels=None):\n        \"\"\"get vlr region for ld distillation.\n        Args:\n            bboxes (np.array): Bounding boxes to be assigned, shape(n, 4).\n            num_level_bboxes (List): num of bboxes in each level\n            gt_bboxes (np.array): Groundtruth boxes, shape (k, 4).\n            gt_bboxes_ignore (np.array, optional): Ground truth bboxes that are\n                labelled as `ignored`, e.g., crowd boxes in COCO.\n            gt_labels (np.array, optional): Label of gt_bboxes, shape (k, ).\n        \"\"\"\n        bboxes = bboxes[:, :4]\n\n        num_gt, num_bboxes = gt_bboxes.shape[0], bboxes.shape[0]\n\n        # compute iou between all bbox and gt\n        overlaps = bbox_overlaps(bboxes, gt_bboxes)\n\n        # compute diou between all bbox and gt\n        diou = bbox_overlaps(bboxes, gt_bboxes, mode='diou')\n\n        # assign 0 by default\n        assigned_gt_inds = np.zeros((num_bboxes, ), dtype=np.int64)\n\n        vlr_region_iou = (assigned_gt_inds + 0).astype(np.float32)\n\n        if num_gt == 0 or num_bboxes == 0:\n            # No ground truth or boxes, return empty assignment\n            max_overlaps = np.zeros((num_bboxes, ))\n            if num_gt == 0:\n                # No truth, assign everything to background\n                assigned_gt_inds[:] = 0\n            if not np.any(gt_labels):\n                assigned_labels = None\n            else:\n                assigned_labels = -np.ones((num_bboxes, ), dtype=np.int64)\n            return assigned_gt_inds, max_overlaps\n\n        # compute center distance between all bbox and gt\n        gt_cx = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0\n        gt_cy = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0\n        gt_points = np.stack((gt_cx, gt_cy), axis=1)\n\n        bboxes_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0\n        bboxes_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0\n        bboxes_points = np.stack((bboxes_cx, bboxes_cy), axis=1)\n\n        distances = np.sqrt(\n            np.power((bboxes_points[:, None, :] - gt_points[None, :, :]), 2)\n            .sum(-1))\n\n        # Selecting candidates based on the center distance\n   
     candidate_idxs = []\n        candidate_idxs_t = []\n        start_idx = 0\n        for bboxes_per_level in num_level_bboxes:\n            # on each pyramid level, for each gt,\n            # select k bbox whose center are closest to the gt center\n            end_idx = start_idx + bboxes_per_level\n            distances_per_level = distances[start_idx:end_idx, :]\n            selectable_t = min(self.topk, bboxes_per_level)\n            selectable_k = bboxes_per_level  #k for all\n            _, topt_idxs_per_level = topk_(\n                distances_per_level, selectable_t, axis=0, largest=False)\n            _, topk_idxs_per_level = topk_(\n                distances_per_level, selectable_k, axis=0, largest=False)\n            candidate_idxs_t.append(topt_idxs_per_level + start_idx)\n            candidate_idxs.append(topk_idxs_per_level + start_idx)\n            start_idx = end_idx\n\n        candidate_idxs_t = np.concatenate(candidate_idxs_t, axis=0)\n        candidate_idxs = np.concatenate(candidate_idxs, axis=0)\n\n        # get corresponding iou for the these candidates, and compute the\n        # mean and std, set mean + std as the iou threshold\n        candidate_overlaps_t = overlaps[candidate_idxs_t, np.arange(num_gt)]\n\n        # compute tdiou\n        t_diou = diou[candidate_idxs, np.arange(num_gt)]\n\n        overlaps_mean_per_gt = candidate_overlaps_t.mean(0)\n        overlaps_std_per_gt = candidate_overlaps_t.std(\n            0, ddof=1)  # NOTE: use Bessel correction\n        overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt\n\n        # compute region        \n        is_pos = (t_diou < overlaps_thr_per_gt[None, :]) & (\n            t_diou >= 0.25 * overlaps_thr_per_gt[None, :])\n\n        # limit the positive sample's center in gt\n        for gt_idx in range(num_gt):\n            candidate_idxs[:, gt_idx] += gt_idx * num_bboxes\n\n        candidate_idxs = candidate_idxs.reshape(-1)\n\n        # if an anchor box is assigned to multiple gts,\n        # the one with the highest IoU will be selected.\n        overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1)\n        index = candidate_idxs.reshape(-1)[is_pos.reshape(-1)]\n\n        overlaps_inf[index] = overlaps.T.reshape(-1)[index]\n        overlaps_inf = overlaps_inf.reshape(num_gt, -1).T\n\n        max_overlaps = overlaps_inf.max(axis=1)\n        argmax_overlaps = overlaps_inf.argmax(axis=1)\n\n        overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1)\n        overlaps_inf = overlaps_inf.reshape(num_gt, -1).T\n\n        assigned_gt_inds[max_overlaps !=\n                         -np.inf] = argmax_overlaps[max_overlaps != -np.inf] + 1\n\n        vlr_region_iou[max_overlaps !=\n                       -np.inf] = max_overlaps[max_overlaps != -np.inf] + 0\n\n        return vlr_region_iou\n"
  },
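  {
    "path": "examples/atss_threshold_sketch.py",
    "content": "# Editor's illustrative sketch, not part of the original PaddleDetection\n# source tree: the adaptive IoU threshold at the heart of ATSSAssigner\n# (above). For each gt, the IoUs of its distance-top-k candidates from every\n# pyramid level are pooled; the cut-off is mean + std, and candidates at or\n# above it become positives. The IoU values below are invented for the demo.\nimport numpy as np\n\n\ndef atss_threshold(candidate_ious):\n    # candidate_ious: (num_candidates, num_gt) IoUs of the top-k anchors\n    # per gt; the threshold is computed independently for each gt column.\n    thr = candidate_ious.mean(axis=0) + candidate_ious.std(axis=0)\n    return thr, candidate_ious >= thr[None, :]\n\n\nif __name__ == '__main__':\n    ious = np.array([[0.1], [0.2], [0.7], [0.8]])\n    thr, is_pos = atss_threshold(ious)\n    print(thr)  # ~[0.754]\n    print(is_pos.ravel())  # [False False False  True]\n"
  },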
  {
    "path": "ppdet/data/transform/autoaugment_utils.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# Reference: \n#   https://github.com/tensorflow/tpu/blob/master/models/official/detection/utils/autoaugment_utils.py\n\"\"\"AutoAugment util file.\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport inspect\nimport math\nfrom PIL import Image, ImageEnhance\nimport numpy as np\nimport cv2\nfrom copy import deepcopy\n\n# This signifies the max integer that the controller RNN could predict for the\n# augmentation scheme.\n_MAX_LEVEL = 10.\n\n# Represents an invalid bounding box that is used for checking for padding\n# lists of bounding box coordinates for a few augmentation operations\n_INVALID_BOX = [[-1.0, -1.0, -1.0, -1.0]]\n\n\ndef policy_v0():\n    \"\"\"Autoaugment policy that was used in AutoAugment Detection Paper.\"\"\"\n    # Each tuple is an augmentation operation of the form\n    # (operation, probability, magnitude). Each element in policy is a\n    # sub-policy that will be applied sequentially on the image.\n    policy = [\n        [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)],\n        [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)],\n        [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)],\n        [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)],\n        [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)],\n    ]\n    return policy\n\n\ndef policy_v1():\n    \"\"\"Autoaugment policy that was used in AutoAugment Detection Paper.\"\"\"\n    # Each tuple is an augmentation operation of the form\n    # (operation, probability, magnitude). 
Each element in policy is a\n    # sub-policy that will be applied sequentially on the image.\n    policy = [\n        [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)],\n        [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)],\n        [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)],\n        [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)],\n        [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)],\n        [('Color', 0.0, 0), ('ShearX_Only_BBoxes', 0.8, 4)],\n        [('ShearY_Only_BBoxes', 0.8, 2), ('Flip_Only_BBoxes', 0.0, 10)],\n        [('Equalize', 0.6, 10), ('TranslateX_BBox', 0.2, 2)],\n        [('Color', 1.0, 10), ('TranslateY_Only_BBoxes', 0.4, 6)],\n        [('Rotate_BBox', 0.8, 10), ('Contrast', 0.0, 10)],  # , \n        [('Cutout', 0.2, 2), ('Brightness', 0.8, 10)],\n        [('Color', 1.0, 6), ('Equalize', 1.0, 2)],\n        [('Cutout_Only_BBoxes', 0.4, 6), ('TranslateY_Only_BBoxes', 0.8, 2)],\n        [('Color', 0.2, 8), ('Rotate_BBox', 0.8, 10)],\n        [('Sharpness', 0.4, 4), ('TranslateY_Only_BBoxes', 0.0, 4)],\n        [('Sharpness', 1.0, 4), ('SolarizeAdd', 0.4, 4)],\n        [('Rotate_BBox', 1.0, 8), ('Sharpness', 0.2, 8)],\n        [('ShearY_BBox', 0.6, 10), ('Equalize_Only_BBoxes', 0.6, 8)],\n        [('ShearX_BBox', 0.2, 6), ('TranslateY_Only_BBoxes', 0.2, 10)],\n        [('SolarizeAdd', 0.6, 8), ('Brightness', 0.8, 10)],\n    ]\n    return policy\n\n\ndef policy_vtest():\n    \"\"\"Autoaugment test policy for debugging.\"\"\"\n    # Each tuple is an augmentation operation of the form\n    # (operation, probability, magnitude). Each element in policy is a\n    # sub-policy that will be applied sequentially on the image.\n    policy = [[('TranslateX_BBox', 1.0, 4), ('Equalize', 1.0, 10)], ]\n    return policy\n\n\ndef policy_v2():\n    \"\"\"Additional policy that performs well on object detection.\"\"\"\n    # Each tuple is an augmentation operation of the form\n    # (operation, probability, magnitude). 
Each element in policy is a\n    # sub-policy that will be applied sequentially on the image.\n    policy = [\n        [('Color', 0.0, 6), ('Cutout', 0.6, 8), ('Sharpness', 0.4, 8)],\n        [('Rotate_BBox', 0.4, 8), ('Sharpness', 0.4, 2),\n         ('Rotate_BBox', 0.8, 10)],\n        [('TranslateY_BBox', 1.0, 8), ('AutoContrast', 0.8, 2)],\n        [('AutoContrast', 0.4, 6), ('ShearX_BBox', 0.8, 8),\n         ('Brightness', 0.0, 10)],\n        [('SolarizeAdd', 0.2, 6), ('Contrast', 0.0, 10),\n         ('AutoContrast', 0.6, 0)],\n        [('Cutout', 0.2, 0), ('Solarize', 0.8, 8), ('Color', 1.0, 4)],\n        [('TranslateY_BBox', 0.0, 4), ('Equalize', 0.6, 8),\n         ('Solarize', 0.0, 10)],\n        [('TranslateY_BBox', 0.2, 2), ('ShearY_BBox', 0.8, 8),\n         ('Rotate_BBox', 0.8, 8)],\n        [('Cutout', 0.8, 8), ('Brightness', 0.8, 8), ('Cutout', 0.2, 2)],\n        [('Color', 0.8, 4), ('TranslateY_BBox', 1.0, 6),\n         ('Rotate_BBox', 0.6, 6)],\n        [('Rotate_BBox', 0.6, 10), ('BBox_Cutout', 1.0, 4), ('Cutout', 0.2, 8)],\n        [('Rotate_BBox', 0.0, 0), ('Equalize', 0.6, 6),\n         ('ShearY_BBox', 0.6, 8)],\n        [('Brightness', 0.8, 8), ('AutoContrast', 0.4, 2),\n         ('Brightness', 0.2, 2)],\n        [('TranslateY_BBox', 0.4, 8), ('Solarize', 0.4, 6),\n         ('SolarizeAdd', 0.2, 10)],\n        [('Contrast', 1.0, 10), ('SolarizeAdd', 0.2, 8), ('Equalize', 0.2, 4)],\n    ]\n    return policy\n\n\ndef policy_v3():\n    \"\"\"\"Additional policy that performs well on object detection.\"\"\"\n    # Each tuple is an augmentation operation of the form\n    # (operation, probability, magnitude). Each element in policy is a\n    # sub-policy that will be applied sequentially on the image.\n    policy = [\n        [('Posterize', 0.8, 2), ('TranslateX_BBox', 1.0, 8)],\n        [('BBox_Cutout', 0.2, 10), ('Sharpness', 1.0, 8)],\n        [('Rotate_BBox', 0.6, 8), ('Rotate_BBox', 0.8, 10)],\n        [('Equalize', 0.8, 10), ('AutoContrast', 0.2, 10)],\n        [('SolarizeAdd', 0.2, 2), ('TranslateY_BBox', 0.2, 8)],\n        [('Sharpness', 0.0, 2), ('Color', 0.4, 8)],\n        [('Equalize', 1.0, 8), ('TranslateY_BBox', 1.0, 8)],\n        [('Posterize', 0.6, 2), ('Rotate_BBox', 0.0, 10)],\n        [('AutoContrast', 0.6, 0), ('Rotate_BBox', 1.0, 6)],\n        [('Equalize', 0.0, 4), ('Cutout', 0.8, 10)],\n        [('Brightness', 1.0, 2), ('TranslateY_BBox', 1.0, 6)],\n        [('Contrast', 0.0, 2), ('ShearY_BBox', 0.8, 0)],\n        [('AutoContrast', 0.8, 10), ('Contrast', 0.2, 10)],\n        [('Rotate_BBox', 1.0, 10), ('Cutout', 1.0, 10)],\n        [('SolarizeAdd', 0.8, 6), ('Equalize', 0.8, 8)],\n    ]\n    return policy\n\n\ndef _equal(val1, val2, eps=1e-8):\n    return abs(val1 - val2) <= eps\n\n\ndef blend(image1, image2, factor):\n    \"\"\"Blend image1 and image2 using 'factor'.\n\n    Factor can be above 0.0.    A value of 0.0 means only image1 is used.\n    A value of 1.0 means only image2 is used.    A value between 0.0 and\n    1.0 means we linearly interpolate the pixel values between the two\n    images.    
A value greater than 1.0 \"extrapolates\" the difference\n    between the two pixel values, and we clip the results to values\n    between 0 and 255.\n\n    Args:\n        image1: An image Tensor of type uint8.\n        image2: An image Tensor of type uint8.\n        factor: A floating point value above 0.0.\n\n    Returns:\n        A blended image Tensor of type uint8.\n    \"\"\"\n    if factor == 0.0:\n        return image1\n    if factor == 1.0:\n        return image2\n\n    image1 = image1.astype(np.float32)\n    image2 = image2.astype(np.float32)\n\n    difference = image2 - image1\n    scaled = factor * difference\n\n    # Do addition in float.\n    temp = image1 + scaled\n\n    # Interpolate\n    if factor > 0.0 and factor < 1.0:\n        # Interpolation means we always stay within 0 and 255.\n        return temp.astype(np.uint8)\n\n    # Extrapolate:\n    #\n    # We need to clip and then cast.\n    return np.clip(temp, a_min=0, a_max=255).astype(np.uint8)\n\n\ndef cutout(image, pad_size, replace=0):\n    \"\"\"Apply cutout (https://arxiv.org/abs/1708.04552) to image.\n\n    This operation applies a (2*pad_size x 2*pad_size) mask of zeros to\n    a random location within `img`. The pixel values filled in will be of the\n    value `replace`. The located where the mask will be applied is randomly\n    chosen uniformly over the whole image.\n\n    Args:\n        image: An image Tensor of type uint8.\n        pad_size: Specifies how big the zero mask that will be generated is that\n            is applied to the image. The mask will be of size\n            (2*pad_size x 2*pad_size).\n        replace: What pixel value to fill in the image in the area that has\n            the cutout mask applied to it.\n\n    Returns:\n        An image Tensor that is of type uint8.\n    Example:\n        img = cv2.imread( \"/home/vis/gry/train/img_data/test.jpg\", cv2.COLOR_BGR2RGB )\n        new_img = cutout(img, pad_size=50, replace=0)\n    \"\"\"\n    image_height, image_width = image.shape[0], image.shape[1]\n\n    cutout_center_height = np.random.randint(low=0, high=image_height)\n    cutout_center_width = np.random.randint(low=0, high=image_width)\n\n    lower_pad = np.maximum(0, cutout_center_height - pad_size)\n    upper_pad = np.maximum(0, image_height - cutout_center_height - pad_size)\n    left_pad = np.maximum(0, cutout_center_width - pad_size)\n    right_pad = np.maximum(0, image_width - cutout_center_width - pad_size)\n\n    cutout_shape = [\n        image_height - (lower_pad + upper_pad),\n        image_width - (left_pad + right_pad)\n    ]\n    padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]]\n    mask = np.pad(np.zeros(\n        cutout_shape, dtype=image.dtype),\n                  padding_dims,\n                  'constant',\n                  constant_values=1)\n    mask = np.expand_dims(mask, -1)\n    mask = np.tile(mask, [1, 1, 3])\n    image = np.where(\n        np.equal(mask, 0),\n        np.ones_like(\n            image, dtype=image.dtype) * replace,\n        image)\n    return image.astype(np.uint8)\n\n\ndef solarize(image, threshold=128):\n    # For each pixel in the image, select the pixel\n    # if the value is less than the threshold.\n    # Otherwise, subtract 255 from the pixel.\n    return np.where(image < threshold, image, 255 - image)\n\n\ndef solarize_add(image, addition=0, threshold=128):\n    # For each pixel in the image less than threshold\n    # we add 'addition' amount to it and then clip the\n    # pixel value to be between 0 and 255. 
The value\n    # of 'addition' is between -128 and 128.\n    added_image = image.astype(np.int64) + addition\n    added_image = np.clip(added_image, a_min=0, a_max=255).astype(np.uint8)\n    return np.where(image < threshold, added_image, image)\n\n\ndef color(image, factor):\n    \"\"\"Equivalent of PIL Color: blend a grayscale copy of the image with the original.\"\"\"\n    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)\n    degenerate = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)\n    return blend(degenerate, image, factor)\n\n\n# refer to https://github.com/4uiiurz1/pytorch-auto-augment/blob/024b2eac4140c38df8342f09998e307234cafc80/auto_augment.py#L197\ndef contrast(img, factor):\n    img = ImageEnhance.Contrast(Image.fromarray(img)).enhance(factor)\n    return np.array(img)\n\n\ndef brightness(image, factor):\n    \"\"\"Equivalent of PIL Brightness.\"\"\"\n    degenerate = np.zeros_like(image)\n    return blend(degenerate, image, factor)\n\n\ndef posterize(image, bits):\n    \"\"\"Equivalent of PIL Posterize.\"\"\"\n    shift = 8 - bits\n    return np.left_shift(np.right_shift(image, shift), shift)\n\n\ndef rotate(image, degrees, replace):\n    \"\"\"Rotates the image by degrees either clockwise or counterclockwise.\n\n    Args:\n        image: An image Tensor of type uint8.\n        degrees: Float, a scalar angle in degrees to rotate all images by. If\n            degrees is positive the image will be rotated clockwise otherwise it will\n            be rotated counterclockwise.\n        replace: A one or three value 1D tensor to fill empty pixels caused by\n            the rotate operation.\n\n    Returns:\n        The rotated version of image.\n    \"\"\"\n    image = wrap(image)\n    image = Image.fromarray(image)\n    image = image.rotate(degrees)\n    image = np.array(image, dtype=np.uint8)\n    return unwrap(image, replace)\n\n\ndef random_shift_bbox(image,\n                      bbox,\n                      pixel_scaling,\n                      replace,\n                      new_min_bbox_coords=None):\n    \"\"\"Move the bbox and the image content to a slightly new random location.\n\n    Args:\n        image: 3D uint8 Tensor.\n        bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)\n            of type float that represents the normalized coordinates between 0 and 1.\n            The potential values for the new min corner of the bbox will be between\n            [old_min - pixel_scaling * bbox_height/2,\n             old_min + pixel_scaling * bbox_height/2].\n        pixel_scaling: A float between 0 and 1 that specifies the pixel range\n            that the new bbox location will be sampled from.\n        replace: A one or three value 1D tensor to fill empty pixels.\n        new_min_bbox_coords: If not None, then this is a tuple that specifies the\n            (min_y, min_x) coordinates of the new bbox. Normally this is randomly\n            specified, but this allows it to be manually set. The coordinates are\n            the absolute coordinates between 0 and image height/width and are int32.\n\n    Returns:\n        The new image that will have the shifted bbox location in it along with\n        the new bbox that contains the new coordinates.\n
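\n    Example (illustrative usage; not part of the original file):\n        img = np.zeros((100, 100, 3), dtype=np.uint8)\n        bbox = np.array([0.2, 0.2, 0.6, 0.6])\n        new_img, new_bbox = random_shift_bbox(\n            img, bbox, pixel_scaling=0.5, replace=[128, 128, 128])\n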
    \"\"\"\n    # Obtain image height and width and create helper clip functions.\n    image_height, image_width = image.shape[0], image.shape[1]\n    image_height = float(image_height)\n    image_width = float(image_width)\n\n    def clip_y(val):\n        return np.clip(val, a_min=0, a_max=image_height - 1).astype(np.int32)\n\n    def clip_x(val):\n        return np.clip(val, a_min=0, a_max=image_width - 1).astype(np.int32)\n\n    # Convert bbox to pixel coordinates.\n    min_y = int(image_height * bbox[0])\n    min_x = int(image_width * bbox[1])\n    max_y = clip_y(image_height * bbox[2])\n    max_x = clip_x(image_width * bbox[3])\n\n    bbox_height, bbox_width = (max_y - min_y + 1, max_x - min_x + 1)\n    image_height = int(image_height)\n    image_width = int(image_width)\n\n    # Select the new min/max bbox ranges that are used for sampling the\n    # new min x/y coordinates of the shifted bbox.\n    minval_y = clip_y(min_y - np.int32(pixel_scaling * float(bbox_height) /\n                                       2.0))\n    maxval_y = clip_y(min_y + np.int32(pixel_scaling * float(bbox_height) /\n                                       2.0))\n    minval_x = clip_x(min_x - np.int32(pixel_scaling * float(bbox_width) / 2.0))\n    maxval_x = clip_x(min_x + np.int32(pixel_scaling * float(bbox_width) / 2.0))\n\n    # Sample and calculate the new unclipped min/max coordinates of the new bbox.\n    if new_min_bbox_coords is None:\n        unclipped_new_min_y = np.random.randint(\n            low=minval_y, high=maxval_y, dtype=np.int32)\n        unclipped_new_min_x = np.random.randint(\n            low=minval_x, high=maxval_x, dtype=np.int32)\n    else:\n        unclipped_new_min_y, unclipped_new_min_x = (\n            clip_y(new_min_bbox_coords[0]), clip_x(new_min_bbox_coords[1]))\n    unclipped_new_max_y = unclipped_new_min_y + bbox_height - 1\n    unclipped_new_max_x = unclipped_new_min_x + bbox_width - 1\n\n    # Determine if any part of the new bbox was shifted outside the current\n    # image. This is used for determining if any of the original bbox content\n    # should be discarded.\n    new_min_y, new_min_x, new_max_y, new_max_x = (\n        clip_y(unclipped_new_min_y), clip_x(unclipped_new_min_x),\n        clip_y(unclipped_new_max_y), clip_x(unclipped_new_max_x))\n    shifted_min_y = (new_min_y - unclipped_new_min_y) + min_y\n    shifted_max_y = max_y - (unclipped_new_max_y - new_max_y)\n    shifted_min_x = (new_min_x - unclipped_new_min_x) + min_x\n    shifted_max_x = max_x - (unclipped_new_max_x - new_max_x)\n\n    # Create the new bbox tensor by converting pixel integer values to floats.\n    new_bbox = np.stack([\n        float(new_min_y) / float(image_height), float(new_min_x) /\n        float(image_width), float(new_max_y) / float(image_height),\n        float(new_max_x) / float(image_width)\n    ])\n\n    # Copy the contents in the bbox and fill the old bbox location\n    # with gray (128).\n    bbox_content = image[shifted_min_y:shifted_max_y + 1, shifted_min_x:\n                         shifted_max_x + 1, :]\n\n    def mask_and_add_image(min_y_, min_x_, max_y_, max_x_, mask, content_tensor,\n                           image_):\n        \"\"\"Applies mask to bbox region in image then adds content_tensor to it.\"\"\"\n        
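# The incoming mask is all zeros over the bbox region. Padding it out to\n        # the full image with ones means multiplying by it clears only that\n        # region, while the content is padded with zeros so that adding it\n        # fills the region back in.\n        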
mask = np.pad(mask, [[min_y_, (image_height - 1) - max_y_],\n                             [min_x_, (image_width - 1) - max_x_], [0, 0]],\n                      'constant',\n                      constant_values=1)\n\n        content_tensor = np.pad(content_tensor,\n                                [[min_y_, (image_height - 1) - max_y_],\n                                 [min_x_, (image_width - 1) - max_x_], [0, 0]],\n                                'constant',\n                                constant_values=0)\n        return image_ * mask + content_tensor\n\n    # Zero out original bbox location.\n    mask = np.zeros_like(image)[min_y:max_y + 1, min_x:max_x + 1, :]\n    grey_tensor = np.zeros_like(mask) + replace[0]\n    image = mask_and_add_image(min_y, min_x, max_y, max_x, mask, grey_tensor,\n                               image)\n\n    # Fill in bbox content to new bbox location.\n    mask = np.zeros_like(bbox_content)\n    image = mask_and_add_image(new_min_y, new_min_x, new_max_y, new_max_x, mask,\n                               bbox_content, image)\n\n    return image.astype(np.uint8), new_bbox\n\n\ndef _clip_bbox(min_y, min_x, max_y, max_x):\n    \"\"\"Clip bounding box coordinates between 0 and 1.\n\n    Args:\n        min_y: Normalized bbox coordinate of type float between 0 and 1.\n        min_x: Normalized bbox coordinate of type float between 0 and 1.\n        max_y: Normalized bbox coordinate of type float between 0 and 1.\n        max_x: Normalized bbox coordinate of type float between 0 and 1.\n\n    Returns:\n        Clipped coordinate values between 0 and 1.\n    \"\"\"\n    min_y = np.clip(min_y, a_min=0, a_max=1.0)\n    min_x = np.clip(min_x, a_min=0, a_max=1.0)\n    max_y = np.clip(max_y, a_min=0, a_max=1.0)\n    max_x = np.clip(max_x, a_min=0, a_max=1.0)\n    return min_y, min_x, max_y, max_x\n\n\ndef _check_bbox_area(min_y, min_x, max_y, max_x, delta=0.05):\n    \"\"\"Adjusts bbox coordinates to make sure the area is > 0.\n\n    Args:\n        min_y: Normalized bbox coordinate of type float between 0 and 1.\n        min_x: Normalized bbox coordinate of type float between 0 and 1.\n        max_y: Normalized bbox coordinate of type float between 0 and 1.\n        max_x: Normalized bbox coordinate of type float between 0 and 1.\n        delta: Float, this is used to create a gap of size 2 * delta between\n            bbox min/max coordinates that are the same on the boundary.\n            This prevents the bbox from having an area of zero.\n\n    Returns:\n        Tuple of new bbox coordinates between 0 and 1 that will now have a\n        guaranteed area > 0.\n    \"\"\"\n    height = max_y - min_y\n    width = max_x - min_x\n\n    def _adjust_bbox_boundaries(min_coord, max_coord):\n        # Make sure max is never 0 and min is never 1.\n        max_coord = np.maximum(max_coord, 0.0 + delta)\n        min_coord = np.minimum(min_coord, 1.0 - delta)\n        return min_coord, max_coord\n\n    if _equal(height, 0):\n        min_y, max_y = _adjust_bbox_boundaries(min_y, max_y)\n\n    if _equal(width, 0):\n        min_x, max_x = _adjust_bbox_boundaries(min_x, max_x)\n\n    return min_y, min_x, max_y, max_x\n\n\ndef _scale_bbox_only_op_probability(prob):\n    \"\"\"Reduce the probability of the bbox-only operation.\n\n    Probability is reduced so that we do not distort the content of too many\n    bounding boxes that are close to each other. 
The value of 3.0 was a\n    hyperparameter chosen when designing the autoaugment algorithm that was\n    found empirically to work well.\n\n    Args:\n        prob: Float that is the probability of applying the bbox-only operation.\n\n    Returns:\n        Reduced probability.\n    \"\"\"\n    return prob / 3.0\n\n\ndef _apply_bbox_augmentation(image, bbox, augmentation_func, *args):\n    \"\"\"Applies augmentation_func to the subsection of image indicated by bbox.\n\n    Args:\n        image: 3D uint8 Tensor.\n        bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)\n            of type float that represents the normalized coordinates between 0 and 1.\n        augmentation_func: Augmentation function that will be applied to the\n            subsection of image.\n        *args: Additional parameters that will be passed into augmentation_func\n            when it is called.\n\n    Returns:\n        A modified version of image, where the bbox location in the image will\n        have `augmentation_func` applied to it.\n    \"\"\"\n    image_height = image.shape[0]\n    image_width = image.shape[1]\n\n    min_y = int(image_height * bbox[0])\n    min_x = int(image_width * bbox[1])\n    max_y = int(image_height * bbox[2])\n    max_x = int(image_width * bbox[3])\n\n    # Clip to be sure the max values do not fall out of range.\n    max_y = np.minimum(max_y, image_height - 1)\n    max_x = np.minimum(max_x, image_width - 1)\n\n    # Get the sub-tensor that is the image within the bounding box region.\n    bbox_content = image[min_y:max_y + 1, min_x:max_x + 1, :]\n\n    # Apply the augmentation function to the bbox portion of the image.\n    augmented_bbox_content = augmentation_func(bbox_content, *args)\n\n    # Pad the augmented_bbox_content with zeros and the mask with ones to\n    # match the shape of the original image.\n    augmented_bbox_content = np.pad(\n        augmented_bbox_content, [[min_y, (image_height - 1) - max_y],\n                                 [min_x, (image_width - 1) - max_x], [0, 0]],\n        'constant',\n        constant_values=0)\n\n    # Create a mask that will be used to zero out a part of the original image.\n    mask_tensor = np.zeros_like(bbox_content)\n\n    mask_tensor = np.pad(mask_tensor,\n                         [[min_y, (image_height - 1) - max_y],\n                          [min_x, (image_width - 1) - max_x], [0, 0]],\n                         'constant',\n                         constant_values=1)\n    # Replace the old bbox content with the new augmented content.\n    image = image * mask_tensor + augmented_bbox_content\n    return image.astype(np.uint8)\n\n\ndef _concat_bbox(bbox, bboxes):\n    \"\"\"Helper function that concatenates bbox to bboxes along the first dimension.\"\"\"\n\n    # Note if all elements in bboxes are -1 (_INVALID_BOX), then this means\n    # we discard bboxes and start the bboxes Tensor with the current bbox.\n    bboxes_sum_check = np.sum(bboxes)\n    bbox = np.expand_dims(bbox, 0)\n    # This check will be true when it is an _INVALID_BOX\n    if _equal(bboxes_sum_check, -4):\n        bboxes = bbox\n    else:\n        bboxes = np.concatenate([bboxes, bbox], 0)\n    return bboxes\n\n\ndef _apply_bbox_augmentation_wrapper(image, bbox, new_bboxes, prob,\n                                     augmentation_func, func_changes_bbox,\n                                     *args):\n    \"\"\"Applies _apply_bbox_augmentation with probability prob.\n\n    Args:\n        image: 3D uint8 Tensor.\n        bbox: 1D Tensor that has 4 elements (min_y, min_x, 
max_y, max_x)\n            of type float that represents the normalized coordinates between 0 and 1.\n        new_bboxes: 2D Tensor that is a list of the bboxes in the image after they\n            have been altered by aug_func. These will only be changed when\n            func_changes_bbox is set to true. Each bbox has 4 elements\n            (min_y, min_x, max_y, max_x) of type float that are the normalized\n            bbox coordinates between 0 and 1.\n        prob: Float that is the probability of applying _apply_bbox_augmentation.\n        augmentation_func: Augmentation function that will be applied to the\n            subsection of image.\n        func_changes_bbox: Boolean. Does augmentation_func return bbox in addition\n            to image.\n        *args: Additional parameters that will be passed into augmentation_func\n            when it is called.\n\n    Returns:\n        A tuple. First element is a modified version of image, where the bbox\n        location in the image will have augmentation_func applied to it if it is\n        chosen to be called with probability `prob`. The second element is a\n        Tensor of Tensors of length 4 that will contain the altered bbox after\n        applying augmentation_func.\n    \"\"\"\n    # rand() + prob >= 1 holds exactly with probability `prob`.\n    should_apply_op = (np.random.rand() + prob >= 1)\n    if func_changes_bbox:\n        if should_apply_op:\n            augmented_image, bbox = augmentation_func(image, bbox, *args)\n        else:\n            augmented_image, bbox = (image, bbox)\n    else:\n        if should_apply_op:\n            augmented_image = _apply_bbox_augmentation(image, bbox,\n                                                       augmentation_func, *args)\n        else:\n            augmented_image = image\n    new_bboxes = _concat_bbox(bbox, new_bboxes)\n    return augmented_image.astype(np.uint8), new_bboxes\n\n\ndef _apply_multi_bbox_augmentation(image, bboxes, prob, aug_func,\n                                   func_changes_bbox, *args):\n    \"\"\"Applies aug_func to the image for each bbox in bboxes.\n\n    Args:\n        image: 3D uint8 Tensor.\n        bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox\n            has 4 elements (min_y, min_x, max_y, max_x) of type float.\n        prob: Float that is the probability of applying aug_func to a specific\n            bounding box within the image.\n        aug_func: Augmentation function that will be applied to the\n            subsections of image indicated by the bbox values in bboxes.\n        func_changes_bbox: Boolean. Does augmentation_func return bbox in addition\n            to image.\n        *args: Additional parameters that will be passed into augmentation_func\n            when it is called.\n\n    Returns:\n        A modified version of image, where each bbox location in the image will\n        have augmentation_func applied to it if it is chosen to be called with\n        probability prob independently across all bboxes. The final bboxes are\n        also returned: unchanged if func_changes_bbox is false, or the new\n        altered ones if it is true.\n    \"\"\"\n    # Will keep track of the new altered bboxes after aug_func is repeatedly\n    # applied. The -1 values are a dummy value and this first Tensor will be\n    # removed upon appending the first real bbox.\n    new_bboxes = np.array(_INVALID_BOX)\n\n    # If the bboxes are empty, then just give it _INVALID_BOX. 
The result\n    # will be thrown away.\n    bboxes = np.array((_INVALID_BOX)) if bboxes.size == 0 else bboxes\n\n    assert bboxes.shape[1] == 4, \"bboxes.shape[1] must be 4\"\n\n    # pylint:disable=g-long-lambda\n    # pylint:disable=line-too-long\n    wrapped_aug_func = lambda _image, bbox, _new_bboxes: _apply_bbox_augmentation_wrapper(_image, bbox, _new_bboxes, prob, aug_func, func_changes_bbox, *args)\n    # pylint:enable=g-long-lambda\n    # pylint:enable=line-too-long\n\n    # Setup the while_loop.\n    num_bboxes = bboxes.shape[0]  # We loop until we go over all bboxes.\n    idx = 0  # Counter for the while loop.\n\n    # Condition function that ends the loop once we have gone over all bboxes.\n    # images_and_bboxes contain (_image, _new_bboxes)\n    def cond(_idx, _images_and_bboxes):\n        return _idx < num_bboxes\n\n    # Shuffle the bboxes so that the augmentation order is not deterministic if\n    # we are not changing the bboxes with aug_func.\n    # if not func_changes_bbox:\n    #     print(bboxes)\n    #     loop_bboxes = np.take(bboxes,np.random.permutation(bboxes.shape[0]),axis=0)\n    #     print(loop_bboxes)\n    # else:\n    #     loop_bboxes = bboxes\n    # We cannot shuffle the bboxes here because they do not carry class\n    # information.\n    loop_bboxes = deepcopy(bboxes)\n\n    # Main function of while_loop where we repeatedly apply augmentation on the\n    # bboxes in the image.\n    # pylint:disable=g-long-lambda\n    body = lambda _idx, _images_and_bboxes: [\n            _idx + 1, wrapped_aug_func(_images_and_bboxes[0],\n                                         loop_bboxes[_idx],\n                                         _images_and_bboxes[1])]\n    # pylint:enable=g-long-lambda\n    while cond(idx, (image, new_bboxes)):\n        idx, (image, new_bboxes) = body(idx, (image, new_bboxes))\n\n    # Either return the altered bboxes or the original ones depending on\n    # whether we altered them in any way.\n    if func_changes_bbox:\n        final_bboxes = new_bboxes\n    else:\n        final_bboxes = bboxes\n    return image, final_bboxes\n\n\ndef _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, aug_func,\n                                           func_changes_bbox, *args):\n    \"\"\"Checks to be sure num bboxes > 0 before calling inner function.\"\"\"\n    num_bboxes = len(bboxes)\n    new_image = deepcopy(image)\n    new_bboxes = deepcopy(bboxes)\n    if num_bboxes != 0:\n        new_image, new_bboxes = _apply_multi_bbox_augmentation(\n            new_image, new_bboxes, prob, aug_func, func_changes_bbox, *args)\n    return new_image, new_bboxes\n\n\ndef rotate_only_bboxes(image, bboxes, prob, degrees, replace):\n    \"\"\"Apply rotate to each bbox in the image with probability prob.\"\"\"\n    func_changes_bbox = False\n    prob = _scale_bbox_only_op_probability(prob)\n    return _apply_multi_bbox_augmentation_wrapper(\n        image, bboxes, prob, rotate, func_changes_bbox, degrees, replace)\n\n\ndef shear_x_only_bboxes(image, bboxes, prob, level, replace):\n    \"\"\"Apply shear_x to each bbox in the image with probability prob.\"\"\"\n    func_changes_bbox = False\n    prob = _scale_bbox_only_op_probability(prob)\n    return _apply_multi_bbox_augmentation_wrapper(\n        image, bboxes, prob, shear_x, func_changes_bbox, level, replace)\n\n\ndef shear_y_only_bboxes(image, bboxes, prob, level, replace):\n    \"\"\"Apply shear_y to each bbox in the image with probability prob.\"\"\"\n    func_changes_bbox = False\n    prob = _scale_bbox_only_op_probability(prob)\n    return 
_apply_multi_bbox_augmentation_wrapper(\n        image, bboxes, prob, shear_y, func_changes_bbox, level, replace)\n\n\ndef translate_x_only_bboxes(image, bboxes, prob, pixels, replace):\n    \"\"\"Apply translate_x to each bbox in the image with probability prob.\"\"\"\n    func_changes_bbox = False\n    prob = _scale_bbox_only_op_probability(prob)\n    return _apply_multi_bbox_augmentation_wrapper(\n        image, bboxes, prob, translate_x, func_changes_bbox, pixels, replace)\n\n\ndef translate_y_only_bboxes(image, bboxes, prob, pixels, replace):\n    \"\"\"Apply translate_y to each bbox in the image with probability prob.\"\"\"\n    func_changes_bbox = False\n    prob = _scale_bbox_only_op_probability(prob)\n    return _apply_multi_bbox_augmentation_wrapper(\n        image, bboxes, prob, translate_y, func_changes_bbox, pixels, replace)\n\n\ndef flip_only_bboxes(image, bboxes, prob):\n    \"\"\"Apply flip_lr to each bbox in the image with probability prob.\"\"\"\n    func_changes_bbox = False\n    prob = _scale_bbox_only_op_probability(prob)\n    return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob,\n                                                  np.fliplr, func_changes_bbox)\n\n\ndef solarize_only_bboxes(image, bboxes, prob, threshold):\n    \"\"\"Apply solarize to each bbox in the image with probability prob.\"\"\"\n    func_changes_bbox = False\n    prob = _scale_bbox_only_op_probability(prob)\n    return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, solarize,\n                                                  func_changes_bbox, threshold)\n\n\ndef equalize_only_bboxes(image, bboxes, prob):\n    \"\"\"Apply equalize to each bbox in the image with probability prob.\"\"\"\n    func_changes_bbox = False\n    prob = _scale_bbox_only_op_probability(prob)\n    return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, equalize,\n                                                  func_changes_bbox)\n\n\ndef cutout_only_bboxes(image, bboxes, prob, pad_size, replace):\n    \"\"\"Apply cutout to each bbox in the image with probability prob.\"\"\"\n    func_changes_bbox = False\n    prob = _scale_bbox_only_op_probability(prob)\n    return _apply_multi_bbox_augmentation_wrapper(\n        image, bboxes, prob, cutout, func_changes_bbox, pad_size, replace)\n\n\ndef _rotate_bbox(bbox, image_height, image_width, degrees):\n    \"\"\"Rotates the bbox coordinates by degrees.\n\n    Args:\n        bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)\n            of type float that represents the normalized coordinates between 0 and 1.\n        image_height: Int, height of the image.\n        image_width: Int, width of the image.\n        degrees: Float, a scalar angle in degrees to rotate all images by. If\n            degrees is positive the image will be rotated clockwise otherwise it will\n            be rotated counterclockwise.\n\n    Returns:\n        A tensor of the same shape as bbox, but now with the rotated coordinates.\n
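\n    Example (illustrative usage; not part of the original file):\n        bbox = np.array([0.25, 0.25, 0.75, 0.75])\n        rotated = _rotate_bbox(bbox, image_height=100, image_width=100,\n                               degrees=10.0)\n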
    \"\"\"\n    image_height, image_width = (float(image_height), float(image_width))\n\n    # Convert from degrees to radians.\n    degrees_to_radians = math.pi / 180.0\n    radians = degrees * degrees_to_radians\n\n    # Translate the bbox to the center of the image and turn the normalized 0-1\n    # coordinates to absolute pixel locations.\n    # Y coordinates are made negative as the y axis of images goes down with\n    # increasing pixel values, so we negate to make sure x axis and y axis points\n    # are in the traditionally positive direction.\n    min_y = -int(image_height * (bbox[0] - 0.5))\n    min_x = int(image_width * (bbox[1] - 0.5))\n    max_y = -int(image_height * (bbox[2] - 0.5))\n    max_x = int(image_width * (bbox[3] - 0.5))\n    coordinates = np.stack([[min_y, min_x], [min_y, max_x], [max_y, min_x],\n                            [max_y, max_x]]).astype(np.float32)\n    # Rotate the coordinates according to the rotation matrix, clockwise if\n    # radians is positive, else counterclockwise.\n    rotation_matrix = np.stack([[math.cos(radians), math.sin(radians)],\n                                [-math.sin(radians), math.cos(radians)]])\n    new_coords = np.matmul(rotation_matrix,\n                           np.transpose(coordinates)).astype(np.int32)\n\n    # Find min/max values and convert them back to normalized 0-1 floats.\n    min_y = -(float(np.max(new_coords[0, :])) / image_height - 0.5)\n    min_x = float(np.min(new_coords[1, :])) / image_width + 0.5\n    max_y = -(float(np.min(new_coords[0, :])) / image_height - 0.5)\n    max_x = float(np.max(new_coords[1, :])) / image_width + 0.5\n\n    # Clip the bboxes to be sure they fall between [0, 1].\n    min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)\n    min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)\n    return np.stack([min_y, min_x, max_y, max_x])\n\n\ndef rotate_with_bboxes(image, bboxes, degrees, replace):\n    \"\"\"Rotates the image and adjusts the bboxes to match the rotated image.\"\"\"\n    # Rotate the image.\n    image = rotate(image, degrees, replace)\n\n    # Convert bbox coordinates to pixel values.\n    image_height, image_width = image.shape[:2]\n    # pylint:disable=g-long-lambda\n    wrapped_rotate_bbox = lambda bbox: _rotate_bbox(bbox, image_height, image_width, degrees)\n    # pylint:enable=g-long-lambda\n    new_bboxes = np.zeros_like(bboxes)\n    for idx in range(len(bboxes)):\n        new_bboxes[idx] = wrapped_rotate_bbox(bboxes[idx])\n    return image, new_bboxes\n\n\ndef translate_x(image, pixels, replace):\n    \"\"\"Equivalent of PIL Translate in X dimension.\"\"\"\n    image = Image.fromarray(wrap(image))\n    image = image.transform(image.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0))\n    return unwrap(np.array(image), replace)\n\n\ndef translate_y(image, pixels, replace):\n    \"\"\"Equivalent of PIL Translate in Y dimension.\"\"\"\n    image = Image.fromarray(wrap(image))\n    image = image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels))\n    return unwrap(np.array(image), replace)\n\n\ndef _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal):\n    \"\"\"Shifts the bbox coordinates by pixels.\n\n    Args:\n        bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)\n            of type float that represents the normalized coordinates between 0 and 1.\n        
image_height: Int, height of the image.\n        image_width: Int, width of the image.\n        pixels: An int. How many pixels to shift the bbox.\n        shift_horizontal: Boolean. If true then shift in X dimension else shift in\n            Y dimension.\n\n    Returns:\n        A tensor of the same shape as bbox, but now with the shifted coordinates.\n    \"\"\"\n    pixels = int(pixels)\n    # Convert bbox to integer pixel locations.\n    min_y = int(float(image_height) * bbox[0])\n    min_x = int(float(image_width) * bbox[1])\n    max_y = int(float(image_height) * bbox[2])\n    max_x = int(float(image_width) * bbox[3])\n\n    if shift_horizontal:\n        min_x = np.maximum(0, min_x - pixels)\n        max_x = np.minimum(image_width, max_x - pixels)\n    else:\n        min_y = np.maximum(0, min_y - pixels)\n        max_y = np.minimum(image_height, max_y - pixels)\n\n    # Convert bbox back to floats.\n    min_y = float(min_y) / float(image_height)\n    min_x = float(min_x) / float(image_width)\n    max_y = float(max_y) / float(image_height)\n    max_x = float(max_x) / float(image_width)\n\n    # Clip the bboxes to be sure they fall between [0, 1].\n    min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)\n    min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)\n    return np.stack([min_y, min_x, max_y, max_x])\n\n\ndef translate_bbox(image, bboxes, pixels, replace, shift_horizontal):\n    \"\"\"Equivalent of PIL Translate in X/Y dimension that shifts image and bbox.\n\n    Args:\n        image: 3D uint8 Tensor.\n        bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox\n            has 4 elements (min_y, min_x, max_y, max_x) of type float with values\n            between [0, 1].\n        pixels: An int. How many pixels to shift the image and bboxes.\n        replace: A one or three value 1D tensor to fill empty pixels.\n        shift_horizontal: Boolean. If true then shift in X dimension else shift in\n            Y dimension.\n\n    Returns:\n        A tuple containing a 3D uint8 Tensor that will be the result of translating\n        image by pixels. 
The second element of the tuple is bboxes, where now\n        the coordinates will be shifted to reflect the shifted image.\n    \"\"\"\n    if shift_horizontal:\n        image = translate_x(image, pixels, replace)\n    else:\n        image = translate_y(image, pixels, replace)\n\n    # Convert bbox coordinates to pixel values.\n    image_height, image_width = image.shape[0], image.shape[1]\n    # pylint:disable=g-long-lambda\n    wrapped_shift_bbox = lambda bbox: _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal)\n    # pylint:enable=g-long-lambda\n    new_bboxes = deepcopy(bboxes)\n    num_bboxes = len(bboxes)\n    for idx in range(num_bboxes):\n        new_bboxes[idx] = wrapped_shift_bbox(bboxes[idx])\n    return image.astype(np.uint8), new_bboxes\n\n\ndef shear_x(image, level, replace):\n    \"\"\"Equivalent of PIL Shearing in X dimension.\"\"\"\n    # Shear parallel to the x axis is a projective transform\n    # with a matrix form of:\n    # [1  level]\n    # [0  1    ].\n    image = Image.fromarray(wrap(image))\n    image = image.transform(image.size, Image.AFFINE, (1, level, 0, 0, 1, 0))\n    return unwrap(np.array(image), replace)\n\n\ndef shear_y(image, level, replace):\n    \"\"\"Equivalent of PIL Shearing in Y dimension.\"\"\"\n    # Shear parallel to the y axis is a projective transform\n    # with a matrix form of:\n    # [1      0]\n    # [level  1].\n    image = Image.fromarray(wrap(image))\n    image = image.transform(image.size, Image.AFFINE, (1, 0, 0, level, 1, 0))\n    return unwrap(np.array(image), replace)\n\n\ndef _shear_bbox(bbox, image_height, image_width, level, shear_horizontal):\n    \"\"\"Shifts the bbox according to how the image was sheared.\n\n    Args:\n        bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)\n            of type float that represents the normalized coordinates between 0 and 1.\n        image_height: Int, height of the image.\n        image_width: Int, width of the image.\n        level: Float. How much to shear the image.\n        shear_horizontal: If true then shear in X dimension else shear in\n            the Y dimension.\n\n    Returns:\n        A tensor of the same shape as bbox, but now with the shifted coordinates.\n
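\n    Example (illustrative usage; not part of the original file):\n        bbox = np.array([0.25, 0.25, 0.75, 0.75])\n        sheared = _shear_bbox(bbox, image_height=100, image_width=100,\n                              level=0.1, shear_horizontal=True)\n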
    \"\"\"\n    image_height, image_width = (float(image_height), float(image_width))\n\n    # Change bbox coordinates to be pixels.\n    min_y = int(image_height * bbox[0])\n    min_x = int(image_width * bbox[1])\n    max_y = int(image_height * bbox[2])\n    max_x = int(image_width * bbox[3])\n    coordinates = np.stack(\n        [[min_y, min_x], [min_y, max_x], [max_y, min_x], [max_y, max_x]])\n    coordinates = coordinates.astype(np.float32)\n\n    # Shear the coordinates according to the shear matrix.\n    if shear_horizontal:\n        translation_matrix = np.stack([[1, 0], [-level, 1]])\n    else:\n        translation_matrix = np.stack([[1, -level], [0, 1]])\n    translation_matrix = translation_matrix.astype(np.float32)\n    new_coords = np.matmul(translation_matrix,\n                           np.transpose(coordinates)).astype(np.int32)\n\n    # Find min/max values and convert them back to floats.\n    min_y = float(np.min(new_coords[0, :])) / image_height\n    min_x = float(np.min(new_coords[1, :])) / image_width\n    max_y = float(np.max(new_coords[0, :])) / image_height\n    max_x = float(np.max(new_coords[1, :])) / image_width\n\n    # Clip the bboxes to be sure they fall between [0, 1].\n    min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)\n    min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)\n    return np.stack([min_y, min_x, max_y, max_x])\n\n\ndef shear_with_bboxes(image, bboxes, level, replace, shear_horizontal):\n    \"\"\"Applies Shear Transformation to the image and shifts the bboxes.\n\n    Args:\n        image: 3D uint8 Tensor.\n        bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox\n            has 4 elements (min_y, min_x, max_y, max_x) of type float with values\n            between [0, 1].\n        level: Float. How much to shear the image. This value will be between\n            -0.3 and 0.3.\n        replace: A one or three value 1D tensor to fill empty pixels.\n        shear_horizontal: Boolean. If true then shear in X dimension else shear in\n            the Y dimension.\n\n    Returns:\n        A tuple containing a 3D uint8 Tensor that will be the result of shearing\n        image by level. 
The second element of the tuple is bboxes, where now\n        the coordinates will be shifted to reflect the sheared image.\n    \"\"\"\n    if shear_horizontal:\n        image = shear_x(image, level, replace)\n    else:\n        image = shear_y(image, level, replace)\n\n    # Convert bbox coordinates to pixel values.\n    image_height, image_width = image.shape[:2]\n    # pylint:disable=g-long-lambda\n    wrapped_shear_bbox = lambda bbox: _shear_bbox(bbox, image_height, image_width, level, shear_horizontal)\n    # pylint:enable=g-long-lambda\n    new_bboxes = deepcopy(bboxes)\n    num_bboxes = len(bboxes)\n    for idx in range(num_bboxes):\n        new_bboxes[idx] = wrapped_shear_bbox(bboxes[idx])\n    return image.astype(np.uint8), new_bboxes\n\n\ndef autocontrast(image):\n    \"\"\"Implements Autocontrast function from PIL.\n\n    Args:\n        image: A 3D uint8 tensor.\n\n    Returns:\n        The image after it has had autocontrast applied to it and will be of type\n        uint8.\n    \"\"\"\n\n    def scale_channel(image):\n        \"\"\"Scale the 2D image using the autocontrast rule.\"\"\"\n        # A possibly cheaper version can be done using cumsum/unique_with_counts\n        # over the histogram values, rather than iterating over the entire image\n        # to compute mins and maxes.\n        lo = float(np.min(image))\n        hi = float(np.max(image))\n\n        # Scale the image, making the lowest value 0 and the highest value 255.\n        def scale_values(im):\n            scale = 255.0 / (hi - lo)\n            offset = -lo * scale\n            im = im.astype(np.float32) * scale + offset\n            im = np.clip(im, a_min=0, a_max=255.0)\n            return im.astype(np.uint8)\n\n        result = scale_values(image) if hi > lo else image\n        return result\n\n    # Assumes RGB for now. Scales each channel independently\n    # and then stacks the result.\n    s1 = scale_channel(image[:, :, 0])\n    s2 = scale_channel(image[:, :, 1])\n    s3 = scale_channel(image[:, :, 2])\n    image = np.stack([s1, s2, s3], 2)\n    return image\n\n\ndef sharpness(image, factor):\n    \"\"\"Implements Sharpness function from PIL.\"\"\"\n    orig_image = image\n    image = image.astype(np.float32)\n    # Smooth the image with the SMOOTH PIL kernel, then blend back towards the\n    # original to sharpen or soften it.\n    kernel = np.array([[1, 1, 1], [1, 5, 1], [1, 1, 1]], dtype=np.float32) / 13.\n    result = cv2.filter2D(image, -1, kernel).astype(np.uint8)\n\n    # Blend the final result.\n    return blend(result, orig_image, factor)\n\n\ndef equalize(image):\n    \"\"\"Implements Equalize function from PIL using NumPy.\"\"\"\n\n    def scale_channel(im, c):\n        \"\"\"Scale the data in the channel to implement equalize.\"\"\"\n        im = im[:, :, c].astype(np.int32)\n        # Compute the histogram of the image channel.\n        histo, _ = np.histogram(im, range=[0, 255], bins=256)\n\n        # For the purposes of computing the step, filter out the zeros.\n        nonzero = np.where(np.not_equal(histo, 0))\n        nonzero_histo = np.reshape(np.take(histo, nonzero), [-1])\n        step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255\n\n        def build_lut(histo, step):\n            # Compute the cumulative sum, shifting by step // 2\n            # and then normalizing by step.\n            lut = (np.cumsum(histo) + (step // 2)) // step\n            # Shift lut, prepending with 0.\n            lut = np.concatenate([[0], lut[:-1]], 0)\n            # Clip the counts to be in range. 
This is done\n            # in the C code for image.point.\n            return np.clip(lut, a_min=0, a_max=255).astype(np.uint8)\n\n        # If step is zero, return the original image. Otherwise, build\n        # lut from the full histogram and step and then index from it.\n        if step == 0:\n            result = im\n        else:\n            result = np.take(build_lut(histo, step), im)\n\n        return result.astype(np.uint8)\n\n    # Assumes RGB for now. Scales each channel independently\n    # and then stacks the result.\n    s1 = scale_channel(image, 0)\n    s2 = scale_channel(image, 1)\n    s3 = scale_channel(image, 2)\n    image = np.stack([s1, s2, s3], 2)\n    return image\n\n\ndef wrap(image):\n    \"\"\"Returns 'image' with an extra channel set to all 1s.\"\"\"\n    shape = image.shape\n    extended_channel = 255 * np.ones([shape[0], shape[1], 1], image.dtype)\n    extended = np.concatenate([image, extended_channel], 2).astype(image.dtype)\n    return extended\n\n\ndef unwrap(image, replace):\n    \"\"\"Unwraps an image produced by wrap.\n\n    Where there is a 0 in the last channel for every spatial position,\n    the rest of the three channels in that spatial dimension are grayed\n    (set to 128). Operations like translate and shear on a wrapped\n    Tensor will leave 0s in empty locations. Some transformations look\n    at the intensity of values to do preprocessing, and we want these\n    empty pixels to assume the 'average' value, rather than pure black.\n\n\n    Args:\n        image: A 3D Image Tensor with 4 channels.\n        replace: A one or three value 1D tensor to fill empty pixels.\n\n    Returns:\n        image: A 3D image Tensor with 3 channels.\n    \"\"\"\n    image_shape = image.shape\n    # Flatten the spatial dimensions.\n    flattened_image = np.reshape(image, [-1, image_shape[2]])\n\n    # Find all pixels where the last channel is zero.\n    alpha_channel = flattened_image[:, 3]\n\n    replace = np.concatenate([replace, np.ones([1], image.dtype)], 0)\n\n    # Where they are zero, fill them in with 'replace'.\n    alpha_channel = np.reshape(alpha_channel, (-1, 1))\n    alpha_channel = np.tile(alpha_channel, reps=(1, flattened_image.shape[1]))\n\n    flattened_image = np.where(\n        np.equal(alpha_channel, 0),\n        np.ones_like(\n            flattened_image, dtype=image.dtype) * replace,\n        flattened_image)\n\n    image = np.reshape(flattened_image, image_shape)\n    image = image[:, :, :3]\n    return image.astype(np.uint8)\n\n\ndef _cutout_inside_bbox(image, bbox, pad_fraction):\n    \"\"\"Generates cutout mask and the mean pixel value of the bbox.\n\n    First a location is randomly chosen within the image as the center where the\n    cutout mask will be applied. Note this can be towards the boundaries of the\n    image, so the full cutout mask may not be applied.\n\n    Args:\n        image: 3D uint8 Tensor.\n        bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)\n            of type float that represents the normalized coordinates between 0 and 1.\n        pad_fraction: Float that specifies how large the cutout mask should be\n            in reference to the size of the original bbox. If pad_fraction is 0.25,\n            then the cutout mask will be of shape\n            (0.25 * bbox height, 0.25 * bbox width).\n\n    Returns:\n        A tuple. 
First element is a tensor of the same shape as image where each\n        element is either a 1 or 0 that is used to determine where the image\n        will have cutout applied. The second element is the mean of the pixels\n        in the image where the bbox is located.\n        mask value: [0,1]\n    \"\"\"\n    image_height, image_width = image.shape[0], image.shape[1]\n    # Transform from shape [1, 4] to [4].\n    bbox = np.squeeze(bbox)\n\n    min_y = int(float(image_height) * bbox[0])\n    min_x = int(float(image_width) * bbox[1])\n    max_y = int(float(image_height) * bbox[2])\n    max_x = int(float(image_width) * bbox[3])\n\n    # Calculate the mean pixel values in the bounding box, which will be used\n    # to fill the cutout region.\n    mean = np.mean(image[min_y:max_y + 1, min_x:max_x + 1], axis=(0, 1))\n    # Cutout mask will be size pad_size_height * 2 by pad_size_width * 2 if the\n    # region lies entirely within the bbox.\n    box_height = max_y - min_y + 1\n    box_width = max_x - min_x + 1\n    pad_size_height = int(pad_fraction * (box_height / 2))\n    pad_size_width = int(pad_fraction * (box_width / 2))\n\n    # Sample the center location in the image where the zero mask will be applied.\n    cutout_center_height = np.random.randint(min_y, max_y + 1, dtype=np.int32)\n    cutout_center_width = np.random.randint(min_x, max_x + 1, dtype=np.int32)\n\n    lower_pad = np.maximum(0, cutout_center_height - pad_size_height)\n    upper_pad = np.maximum(\n        0, image_height - cutout_center_height - pad_size_height)\n    left_pad = np.maximum(0, cutout_center_width - pad_size_width)\n    right_pad = np.maximum(0,\n                           image_width - cutout_center_width - pad_size_width)\n\n    cutout_shape = [\n        image_height - (lower_pad + upper_pad),\n        image_width - (left_pad + right_pad)\n    ]\n    padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]]\n\n    mask = np.pad(np.zeros(\n        cutout_shape, dtype=image.dtype),\n                  padding_dims,\n                  'constant',\n                  constant_values=1)\n\n    mask = np.expand_dims(mask, 2)\n    mask = np.tile(mask, [1, 1, 3])\n    return mask, mean\n\n\ndef bbox_cutout(image, bboxes, pad_fraction, replace_with_mean):\n    \"\"\"Applies cutout to the image according to bbox information.\n\n    This is a cutout variant that uses bbox information to make more informed\n    decisions on where to place the cutout mask.\n\n    Args:\n        image: 3D uint8 Tensor.\n        bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox\n            has 4 elements (min_y, min_x, max_y, max_x) of type float with values\n            between [0, 1].\n        pad_fraction: Float that specifies how large the cutout mask should be\n            in reference to the size of the original bbox. If pad_fraction is 0.25,\n            then the cutout mask will be of shape\n            (0.25 * bbox height, 0.25 * bbox width).\n        replace_with_mean: Boolean that specifies what value should be filled in\n            where the cutout mask is applied. Since the incoming image will be of\n            uint8 and will not have had any mean normalization applied, by default\n            we set the value to be 128. If replace_with_mean is True then we find\n            the mean pixel values across the channel dimension and use those to fill\n            in where the cutout mask is applied.\n\n    Returns:\n        A tuple. First element is a tensor of the same shape as image that has\n        cutout applied to it. Second element is the bboxes that were passed in\n        that will be unchanged.\n
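\n    Example (illustrative usage; not part of the original file):\n        img = np.full((100, 100, 3), 200, dtype=np.uint8)\n        boxes = np.array([[0.1, 0.1, 0.5, 0.5]])\n        img, boxes = bbox_cutout(img, boxes, pad_fraction=0.25,\n                                 replace_with_mean=False)\n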
    \"\"\"\n\n    def apply_bbox_cutout(image, bboxes, pad_fraction):\n        \"\"\"Applies cutout to a single bounding box within image.\"\"\"\n        # Choose a single bounding box to apply cutout to.\n        random_index = np.random.randint(0, bboxes.shape[0], dtype=np.int32)\n        # Select the corresponding bbox and apply cutout.\n        chosen_bbox = np.take(bboxes, random_index, axis=0)\n        mask, mean = _cutout_inside_bbox(image, chosen_bbox, pad_fraction)\n\n        # When applying cutout we either set the pixel value to 128 or to the mean\n        # value inside the bbox.\n        replace = mean if replace_with_mean else [128] * 3\n\n        # Apply the cutout mask to the image. Where the mask is 0 we fill it with\n        # `replace`.\n        image = np.where(\n            np.equal(mask, 0),\n            np.ones_like(\n                image, dtype=image.dtype) * replace,\n            image).astype(image.dtype)\n        return image\n\n    # Check to see if there are boxes; if so, apply bbox cutout.\n    if len(bboxes) != 0:\n        image = apply_bbox_cutout(image, bboxes, pad_fraction)\n\n    return image, bboxes\n\n\nNAME_TO_FUNC = {\n        'AutoContrast': autocontrast,\n        'Equalize': equalize,\n        'Posterize': posterize,\n        'Solarize': solarize,\n        'SolarizeAdd': solarize_add,\n        'Color': color,\n        'Contrast': contrast,\n        'Brightness': brightness,\n        'Sharpness': sharpness,\n        'Cutout': cutout,\n        'BBox_Cutout': bbox_cutout,\n        'Rotate_BBox': rotate_with_bboxes,\n        # pylint:disable=g-long-lambda\n        'TranslateX_BBox': lambda image, bboxes, pixels, replace: translate_bbox(\n                image, bboxes, pixels, replace, shift_horizontal=True),\n        'TranslateY_BBox': lambda image, bboxes, pixels, replace: translate_bbox(\n                image, bboxes, pixels, replace, shift_horizontal=False),\n        'ShearX_BBox': lambda image, bboxes, level, replace: shear_with_bboxes(\n                image, bboxes, level, replace, shear_horizontal=True),\n        'ShearY_BBox': lambda image, bboxes, level, replace: shear_with_bboxes(\n                image, bboxes, level, replace, shear_horizontal=False),\n        # pylint:enable=g-long-lambda\n        'Rotate_Only_BBoxes': rotate_only_bboxes,\n        'ShearX_Only_BBoxes': shear_x_only_bboxes,\n        'ShearY_Only_BBoxes': shear_y_only_bboxes,\n        'TranslateX_Only_BBoxes': translate_x_only_bboxes,\n        'TranslateY_Only_BBoxes': translate_y_only_bboxes,\n        'Flip_Only_BBoxes': flip_only_bboxes,\n        'Solarize_Only_BBoxes': solarize_only_bboxes,\n        'Equalize_Only_BBoxes': equalize_only_bboxes,\n        'Cutout_Only_BBoxes': cutout_only_bboxes,\n}\n\n\ndef _randomly_negate_tensor(tensor):\n    \"\"\"With 50% prob turn the tensor negative.\"\"\"\n    should_flip = np.floor(np.random.rand() + 0.5) >= 1\n    final_tensor = tensor if should_flip else -tensor\n    return final_tensor\n\n\ndef _rotate_level_to_arg(level):\n    level = (level / _MAX_LEVEL) * 30.\n    level = _randomly_negate_tensor(level)\n    return (level, )\n\n\ndef _shrink_level_to_arg(level):\n    \"\"\"Converts level to ratio by which we shrink the image content.\"\"\"\n    if level == 0:\n        return (1.0, )  # if level is zero, do not shrink the image\n    # Maximum shrinking 
ratio is 2.9.\n    level = 2. / (_MAX_LEVEL / level) + 0.9\n    return (level, )\n\n\ndef _enhance_level_to_arg(level):\n    return ((level / _MAX_LEVEL) * 1.8 + 0.1, )\n\n\ndef _shear_level_to_arg(level):\n    level = (level / _MAX_LEVEL) * 0.3\n    # Flip level to negative with 50% chance.\n    level = _randomly_negate_tensor(level)\n    return (level, )\n\n\ndef _translate_level_to_arg(level, translate_const):\n    level = (level / _MAX_LEVEL) * float(translate_const)\n    # Flip level to negative with 50% chance.\n    level = _randomly_negate_tensor(level)\n    return (level, )\n\n\ndef _bbox_cutout_level_to_arg(level, hparams):\n    cutout_pad_fraction = (level /\n                           _MAX_LEVEL) * 0.75  # hparams.cutout_max_pad_fraction\n    return (cutout_pad_fraction, False)  # hparams.cutout_bbox_replace_with_mean\n\n\ndef level_to_arg(hparams):\n    return {\n        'AutoContrast': lambda level: (),\n        'Equalize': lambda level: (),\n        'Posterize': lambda level: (int((level / _MAX_LEVEL) * 4), ),\n        'Solarize': lambda level: (int((level / _MAX_LEVEL) * 256), ),\n        'SolarizeAdd': lambda level: (int((level / _MAX_LEVEL) * 110), ),\n        'Color': _enhance_level_to_arg,\n        'Contrast': _enhance_level_to_arg,\n        'Brightness': _enhance_level_to_arg,\n        'Sharpness': _enhance_level_to_arg,\n        'Cutout':\n        lambda level: (int((level / _MAX_LEVEL) * 100), ),  # hparams.cutout_const=100\n        # pylint:disable=g-long-lambda\n        'BBox_Cutout': lambda level: _bbox_cutout_level_to_arg(level, hparams),\n        'TranslateX_BBox':\n        lambda level: _translate_level_to_arg(level, 250),  # hparams.translate_const=250\n        'TranslateY_BBox':\n        lambda level: _translate_level_to_arg(level, 250),  # hparams.translate_const=250\n        # pylint:enable=g-long-lambda\n        'ShearX_BBox': _shear_level_to_arg,\n        'ShearY_BBox': _shear_level_to_arg,\n        'Rotate_BBox': _rotate_level_to_arg,\n        'Rotate_Only_BBoxes': _rotate_level_to_arg,\n        'ShearX_Only_BBoxes': _shear_level_to_arg,\n        'ShearY_Only_BBoxes': _shear_level_to_arg,\n        # pylint:disable=g-long-lambda\n        'TranslateX_Only_BBoxes':\n        lambda level: _translate_level_to_arg(level, 120),  # hparams.translate_bbox_const\n        'TranslateY_Only_BBoxes':\n        lambda level: _translate_level_to_arg(level, 120),  # hparams.translate_bbox_const\n        # pylint:enable=g-long-lambda\n        'Flip_Only_BBoxes': lambda level: (),\n        'Solarize_Only_BBoxes':\n        lambda level: (int((level / _MAX_LEVEL) * 256), ),\n        'Equalize_Only_BBoxes': lambda level: (),\n        # pylint:disable=g-long-lambda\n        'Cutout_Only_BBoxes':\n        lambda level: (int((level / _MAX_LEVEL) * 50), ),  # hparams.cutout_bbox_const\n        # pylint:enable=g-long-lambda\n    }\n\n\ndef bbox_wrapper(func):\n    \"\"\"Adds a bboxes function argument to func and returns unchanged bboxes.\"\"\"\n\n    def wrapper(images, bboxes, *args, **kwargs):\n        return (func(images, *args, **kwargs), bboxes)\n\n    return wrapper\n\n\ndef _parse_policy_info(name, prob, level, replace_value, augmentation_hparams):\n    \"\"\"Return the function that corresponds to `name` and update `level` param.\"\"\"\n    func = NAME_TO_FUNC[name]\n    args = level_to_arg(augmentation_hparams)[name](level)\n\n
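    # Illustrative mapping (assuming _MAX_LEVEL == 10): for ('Rotate_BBox',\n    # prob, 4), level 4 yields (4 / 10) * 30 = 12 degrees, randomly negated,\n    # so args is (12.0,) or (-12.0,) before prob/replace are added below.\n\n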
    # Check to see if prob is passed into the function. This is used for\n    # operations where we alter bboxes independently.\n    # pytype:disable=wrong-arg-types\n    if 'prob' in inspect.getfullargspec(func)[0]:\n        args = tuple([prob] + list(args))\n    # pytype:enable=wrong-arg-types\n\n    # Add in replace arg if it is required for the function that is being called.\n    if 'replace' in inspect.getfullargspec(func)[0]:\n        # Make sure replace is the final argument\n        assert 'replace' == inspect.getfullargspec(func)[0][-1]\n        args = tuple(list(args) + [replace_value])\n\n    # Add bboxes as the second positional argument for the function if it does\n    # not already exist.\n    if 'bboxes' not in inspect.getfullargspec(func)[0]:\n        func = bbox_wrapper(func)\n    return (func, prob, args)\n\n\ndef _apply_func_with_prob(func, image, args, prob, bboxes):\n    \"\"\"Apply `func` to image w/ `args` as input with probability `prob`.\"\"\"\n    assert isinstance(args, tuple)\n    assert 'bboxes' == inspect.getfullargspec(func)[0][1]\n\n    # If prob is a function argument, then this randomness is being handled\n    # inside the function, so make sure it is always called.\n    if 'prob' in inspect.getfullargspec(func)[0]:\n        prob = 1.0\n\n    # Apply the function with probability `prob`.\n    should_apply_op = np.floor(np.random.rand() + prob) >= 1\n    if should_apply_op:\n        augmented_image, augmented_bboxes = func(image, bboxes, *args)\n    else:\n        augmented_image, augmented_bboxes = (image, bboxes)\n    return augmented_image, augmented_bboxes\n\n\ndef select_and_apply_random_policy(policies, image, bboxes):\n    \"\"\"Select a random policy from `policies` and apply it to `image`.\"\"\"\n    policy_to_select = np.random.randint(0, len(policies), dtype=np.int32)\n    # policy_to_select = 6 # for test\n    for (i, policy) in enumerate(policies):\n        if i == policy_to_select:\n            image, bboxes = policy(image, bboxes)\n    return (image, bboxes)\n\n\ndef build_and_apply_nas_policy(policies, image, bboxes, augmentation_hparams):\n    \"\"\"Build a policy from the given policies passed in and apply to image.\n\n    Args:\n        policies: list of lists of tuples in the form `(func, prob, level)`, `func`\n            is a string name of the augmentation function, `prob` is the probability\n            of applying the `func` operation, `level` is the input argument for\n            `func`.\n        image: numpy array that the resulting policy will be applied to.\n        bboxes: numpy array of normalized bounding boxes of shape [N, 4], with\n            each bbox given as (min_y, min_x, max_y, max_x) between 0 and 1.\n        augmentation_hparams: Hparams associated with the NAS learned policy.\n\n    Returns:\n        A version of image that now has data augmentation applied to it based on\n        the `policies` passed into the function. Additionally, returns bboxes if\n        a value for them is passed in that is not None.\n
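\n    Example (illustrative usage; not part of the original file):\n        policy = [[('Color', 0.8, 6)], [('Rotate_BBox', 0.6, 4)]]\n        img, boxes = build_and_apply_nas_policy(policy, img, boxes, {})\n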
    \"\"\"\n    replace_value = [128, 128, 128]\n\n    # func is the string name of the augmentation function, prob is the\n    # probability of applying the operation and level is the parameter\n    # associated with the function.\n\n    # tf_policies are functions that take in an image and return an augmented\n    # image.\n    tf_policies = []\n    for policy in policies:\n        tf_policy = []\n        # Link string name to the correct python function and make sure the correct\n        # argument is passed into that function.\n        for policy_info in policy:\n            policy_info = list(\n                policy_info) + [replace_value, augmentation_hparams]\n\n            tf_policy.append(_parse_policy_info(*policy_info))\n        # Now build the tf policy that will apply the augmentation procedure\n        # on image.\n        def make_final_policy(tf_policy_):\n            def final_policy(image_, bboxes_):\n                for func, prob, args in tf_policy_:\n                    image_, bboxes_ = _apply_func_with_prob(func, image_, args,\n                                                            prob, bboxes_)\n                return image_, bboxes_\n\n            return final_policy\n\n        tf_policies.append(make_final_policy(tf_policy))\n\n    augmented_images, augmented_bboxes = select_and_apply_random_policy(\n        tf_policies, image, bboxes)\n    # Return the augmented images together with the (possibly altered) bboxes.\n    return (augmented_images, augmented_bboxes)\n\n\n# TODO(barretzoph): Add in ArXiv link once paper is out.\ndef distort_image_with_autoaugment(image, bboxes, augmentation_name):\n    \"\"\"Applies the AutoAugment policy to `image` and `bboxes`.\n\n    Args:\n        image: `Tensor` of shape [height, width, 3] representing an image.\n        bboxes: `Tensor` of shape [N, 4] representing ground truth boxes that are\n            normalized between [0, 1].\n        augmentation_name: The name of the AutoAugment policy to use. The available\n            options are `v0`, `v1`, `v2`, `v3` and `test`. `v0` is the policy used for\n            all of the results in the paper and was found to achieve the best results\n            on the COCO dataset. `v1`, `v2` and `v3` are additional good policies\n            found on the COCO dataset that have slight variation in what operations\n            were used during the search procedure along with how many operations are\n            applied in parallel to a single image (2 vs 3).\n\n    Returns:\n        A tuple containing the augmented versions of `image` and `bboxes`.\n    \"\"\"\n    available_policies = {\n        'v0': policy_v0,\n        'v1': policy_v1,\n        'v2': policy_v2,\n        'v3': policy_v3,\n        'test': policy_vtest\n    }\n    if augmentation_name not in available_policies:\n        raise ValueError('Invalid augmentation_name: {}'.format(\n            augmentation_name))\n\n    policy = available_policies[augmentation_name]()\n    augmentation_hparams = {}\n    return build_and_apply_nas_policy(policy, image, bboxes,\n                                      augmentation_hparams)\n"
  },
  {
    "path": "ppdet/data/transform/batch_operators.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport typing\n\ntry:\n    from collections.abc import Sequence\nexcept Exception:\n    from collections import Sequence\n\nimport cv2\nimport copy\nimport math\nimport numpy as np\nfrom .operators import register_op, BaseOperator, Resize\nfrom .op_helper import jaccard_overlap, gaussian2D, gaussian_radius, draw_umich_gaussian\nfrom .atss_assigner import ATSSAssigner\nfrom scipy import ndimage\n\nfrom ppdet.modeling import bbox_utils\nfrom ppdet.utils.logger import setup_logger\nfrom ppdet.modeling.keypoint_utils import get_affine_transform, affine_transform\nlogger = setup_logger(__name__)\n\n__all__ = [\n    'PadBatch', 'BatchRandomResize', 'Gt2YoloTarget', 'Gt2FCOSTarget',\n    'Gt2TTFTarget', 'Gt2Solov2Target', 'Gt2SparseTarget', 'PadMaskBatch',\n    'Gt2GFLTarget', 'Gt2CenterNetTarget', 'Gt2CenterTrackTarget', 'PadGT',\n    'PadRGT', 'BatchRandomResizeForSSOD'\n]\n\n\n@register_op\nclass PadBatch(BaseOperator):\n    \"\"\"\n    Pad a batch of samples so they can be divisible by a stride.\n    The layout of each image should be 'CHW'.\n    Args:\n        pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure\n            height and width is divisible by `pad_to_stride`.\n    \"\"\"\n\n    def __init__(self, pad_to_stride=0):\n        super(PadBatch, self).__init__()\n        self.pad_to_stride = pad_to_stride\n\n    def __call__(self, samples, context=None):\n        \"\"\"\n        Args:\n            samples (list): a batch of sample, each is dict.\n        \"\"\"\n        coarsest_stride = self.pad_to_stride\n\n        # multi scale input is nested list\n        if isinstance(samples,\n                      typing.Sequence) and len(samples) > 0 and isinstance(\n                          samples[0], typing.Sequence):\n            inner_samples = samples[0]\n        else:\n            inner_samples = samples\n\n        max_shape = np.array(\n            [data['image'].shape for data in inner_samples]).max(axis=0)\n        if coarsest_stride > 0:\n            max_shape[1] = int(\n                np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride)\n            max_shape[2] = int(\n                np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride)\n\n        for data in inner_samples:\n            im = data['image']\n            im_c, im_h, im_w = im.shape[:]\n            padding_im = np.zeros(\n                (im_c, max_shape[1], max_shape[2]), dtype=np.float32)\n            padding_im[:, :im_h, :im_w] = im\n            data['image'] = padding_im\n            if 'semantic' in data and data['semantic'] is not None:\n                semantic = data['semantic']\n                padding_sem = np.zeros(\n                    (1, max_shape[1], max_shape[2]), dtype=np.float32)\n                padding_sem[:, :im_h, :im_w] = 
semantic\n                data['semantic'] = padding_sem\n            if 'gt_segm' in data and data['gt_segm'] is not None:\n                gt_segm = data['gt_segm']\n                padding_segm = np.zeros(\n                    (gt_segm.shape[0], max_shape[1], max_shape[2]),\n                    dtype=np.uint8)\n                padding_segm[:, :im_h, :im_w] = gt_segm\n                data['gt_segm'] = padding_segm\n\n        return samples\n\n\n@register_op\nclass BatchRandomResize(BaseOperator):\n    \"\"\"\n    Resize a batch of images to a target size randomly; the target size and\n    interpolation method can be randomly selected.\n    Args:\n        target_size (int, list, tuple): image target size; if random_size is\n            True, it must be a list or tuple\n        keep_ratio (bool): whether to keep the aspect ratio, True by default\n        interp (int): the interpolation method\n        random_size (bool): whether to randomly select a target size\n        random_interp (bool): whether to randomly select an interpolation method\n    \"\"\"\n\n    def __init__(self,\n                 target_size,\n                 keep_ratio,\n                 interp=cv2.INTER_NEAREST,\n                 random_size=True,\n                 random_interp=False):\n        super(BatchRandomResize, self).__init__()\n        self.keep_ratio = keep_ratio\n        self.interps = [\n            cv2.INTER_NEAREST,\n            cv2.INTER_LINEAR,\n            cv2.INTER_AREA,\n            cv2.INTER_CUBIC,\n            cv2.INTER_LANCZOS4,\n        ]\n        self.interp = interp\n        assert isinstance(target_size, (\n            int, Sequence)), \"target_size must be int, list or tuple\"\n        if random_size and not isinstance(target_size, list):\n            raise TypeError(\n                \"Type of target_size is invalid when random_size is True. 
Must be List, now is {}\".\n                format(type(target_size)))\n        self.target_size = target_size\n        self.random_size = random_size\n        self.random_interp = random_interp\n\n    def __call__(self, samples, context=None):\n        if self.random_size:\n            index = np.random.choice(len(self.target_size))\n            target_size = self.target_size[index]\n        else:\n            target_size = self.target_size\n\n        if self.random_interp:\n            interp = np.random.choice(self.interps)\n        else:\n            interp = self.interp\n\n        resizer = Resize(target_size, keep_ratio=self.keep_ratio, interp=interp)\n        return resizer(samples, context=context)\n\n\n@register_op\nclass Gt2YoloTarget(BaseOperator):\n    __shared__ = ['num_classes']\n    \"\"\"\n    Generate YOLOv3 targets by groud truth data, this operator is only used in\n    fine grained YOLOv3 loss mode\n    \"\"\"\n\n    def __init__(self,\n                 anchors,\n                 anchor_masks,\n                 downsample_ratios,\n                 num_classes=80,\n                 iou_thresh=1.):\n        super(Gt2YoloTarget, self).__init__()\n        self.anchors = anchors\n        self.anchor_masks = anchor_masks\n        self.downsample_ratios = downsample_ratios\n        self.num_classes = num_classes\n        self.iou_thresh = iou_thresh\n\n    def __call__(self, samples, context=None):\n        assert len(self.anchor_masks) == len(self.downsample_ratios), \\\n            \"anchor_masks', and 'downsample_ratios' should have same length.\"\n\n        h, w = samples[0]['image'].shape[1:3]\n        an_hw = np.array(self.anchors) / np.array([[w, h]])\n        for sample in samples:\n            gt_bbox = sample['gt_bbox']\n            gt_class = sample['gt_class']\n            if 'gt_score' not in sample:\n                sample['gt_score'] = np.ones(\n                    (gt_bbox.shape[0], 1), dtype=np.float32)\n            gt_score = sample['gt_score']\n            for i, (\n                    mask, downsample_ratio\n            ) in enumerate(zip(self.anchor_masks, self.downsample_ratios)):\n                grid_h = int(h / downsample_ratio)\n                grid_w = int(w / downsample_ratio)\n                target = np.zeros(\n                    (len(mask), 6 + self.num_classes, grid_h, grid_w),\n                    dtype=np.float32)\n                for b in range(gt_bbox.shape[0]):\n                    gx, gy, gw, gh = gt_bbox[b, :]\n                    cls = gt_class[b]\n                    score = gt_score[b]\n                    if gw <= 0. or gh <= 0. 
or score <= 0.:\n                        continue\n\n                    # find the best matching anchor index\n                    best_iou = 0.\n                    best_idx = -1\n                    for an_idx in range(an_hw.shape[0]):\n                        iou = jaccard_overlap(\n                            [0., 0., gw, gh],\n                            [0., 0., an_hw[an_idx, 0], an_hw[an_idx, 1]])\n                        if iou > best_iou:\n                            best_iou = iou\n                            best_idx = an_idx\n\n                    gi = int(gx * grid_w)\n                    gj = int(gy * grid_h)\n\n                    # the gt box should be regressed in this layer if the best\n                    # matching anchor index is in this layer's anchor mask\n                    if best_idx in mask:\n                        best_n = mask.index(best_idx)\n\n                        # x, y, w, h, scale\n                        target[best_n, 0, gj, gi] = gx * grid_w - gi\n                        target[best_n, 1, gj, gi] = gy * grid_h - gj\n                        target[best_n, 2, gj, gi] = np.log(\n                            gw * w / self.anchors[best_idx][0])\n                        target[best_n, 3, gj, gi] = np.log(\n                            gh * h / self.anchors[best_idx][1])\n                        target[best_n, 4, gj, gi] = 2.0 - gw * gh\n\n                        # objectness records gt_score\n                        target[best_n, 5, gj, gi] = score\n\n                        # classification\n                        target[best_n, 6 + cls, gj, gi] = 1.\n\n                    # For non-matched anchors, calculate the target if the iou\n                    # between anchor and gt is larger than iou_thresh\n                    if self.iou_thresh < 1:\n                        for idx, mask_i in enumerate(mask):\n                            if mask_i == best_idx: continue\n                            iou = jaccard_overlap(\n                                [0., 0., gw, gh],\n                                [0., 0., an_hw[mask_i, 0], an_hw[mask_i, 1]])\n                            if iou > self.iou_thresh and target[idx, 5, gj,\n                                                                gi] == 0.:\n                                # x, y, w, h, scale\n                                target[idx, 0, gj, gi] = gx * grid_w - gi\n                                target[idx, 1, gj, gi] = gy * grid_h - gj\n                                target[idx, 2, gj, gi] = np.log(\n                                    gw * w / self.anchors[mask_i][0])\n                                target[idx, 3, gj, gi] = np.log(\n                                    gh * h / self.anchors[mask_i][1])\n                                target[idx, 4, gj, gi] = 2.0 - gw * gh\n\n                                # objectness records gt_score\n                                target[idx, 5, gj, gi] = score\n\n                                # classification\n                                target[idx, 6 + cls, gj, gi] = 1.\n                sample['target{}'.format(i)] = target\n\n            # remove useless gt_class and gt_score after targets are calculated\n            sample.pop('gt_class')\n            sample.pop('gt_score')\n\n        return samples\n\n\n@register_op\nclass Gt2FCOSTarget(BaseOperator):\n    \"\"\"\n    Generate FCOS targets from ground truth data\n    \"\"\"\n\n    def __init__(self,\n                 object_sizes_boundary,\n                 center_sampling_radius,\n                 downsample_ratios,\n       
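   # num_shift (default 0.5) offsets each sampled point by this fraction\n                 # of the stride, so 0.5 places points at grid-cell centers\n       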
          num_shift=0.5,\n                 multiply_strides_reg_targets=False,\n                 norm_reg_targets=True):\n        super(Gt2FCOSTarget, self).__init__()\n        self.center_sampling_radius = center_sampling_radius\n        self.downsample_ratios = downsample_ratios\n        self.INF = np.inf\n        self.object_sizes_boundary = [-1] + object_sizes_boundary + [self.INF]\n        object_sizes_of_interest = []\n        for i in range(len(self.object_sizes_boundary) - 1):\n            object_sizes_of_interest.append([\n                self.object_sizes_boundary[i], self.object_sizes_boundary[i + 1]\n            ])\n        self.object_sizes_of_interest = object_sizes_of_interest\n        self.num_shift = num_shift\n        self.multiply_strides_reg_targets = multiply_strides_reg_targets\n        self.norm_reg_targets = norm_reg_targets\n\n    def _compute_points(self, w, h):\n        \"\"\"\n        compute the corresponding points in each feature map\n        :param h: image height\n        :param w: image width\n        :return: points from all feature map\n        \"\"\"\n        locations = []\n        for stride in self.downsample_ratios:\n            shift_x = np.arange(0, w, stride).astype(np.float32)\n            shift_y = np.arange(0, h, stride).astype(np.float32)\n            shift_x, shift_y = np.meshgrid(shift_x, shift_y)\n            shift_x = shift_x.flatten()\n            shift_y = shift_y.flatten()\n            location = np.stack(\n                [shift_x, shift_y], axis=1) + stride * self.num_shift\n            locations.append(location)\n        num_points_each_level = [len(location) for location in locations]\n        locations = np.concatenate(locations, axis=0)\n        return locations, num_points_each_level\n\n    def _convert_xywh2xyxy(self, gt_bbox, w, h):\n        \"\"\"\n        convert the bounding box from style xywh to xyxy\n        :param gt_bbox: bounding boxes normalized into [0, 1]\n        :param w: image width\n        :param h: image height\n        :return: bounding boxes in xyxy style\n        \"\"\"\n        bboxes = gt_bbox.copy()\n        bboxes[:, [0, 2]] = bboxes[:, [0, 2]] * w\n        bboxes[:, [1, 3]] = bboxes[:, [1, 3]] * h\n        bboxes[:, 2] = bboxes[:, 0] + bboxes[:, 2]\n        bboxes[:, 3] = bboxes[:, 1] + bboxes[:, 3]\n        return bboxes\n\n    def _check_inside_boxes_limited(self, gt_bbox, xs, ys,\n                                    num_points_each_level):\n        \"\"\"\n        check if points is within the clipped boxes\n        :param gt_bbox: bounding boxes\n        :param xs: horizontal coordinate of points\n        :param ys: vertical coordinate of points\n        :return: the mask of points is within gt_box or not\n        \"\"\"\n        bboxes = np.reshape(\n            gt_bbox, newshape=[1, gt_bbox.shape[0], gt_bbox.shape[1]])\n        bboxes = np.tile(bboxes, reps=[xs.shape[0], 1, 1])\n        ct_x = (bboxes[:, :, 0] + bboxes[:, :, 2]) / 2\n        ct_y = (bboxes[:, :, 1] + bboxes[:, :, 3]) / 2\n        beg = 0\n        clipped_box = bboxes.copy()\n        for lvl, stride in enumerate(self.downsample_ratios):\n            end = beg + num_points_each_level[lvl]\n            stride_exp = self.center_sampling_radius * stride\n            clipped_box[beg:end, :, 0] = np.maximum(\n                bboxes[beg:end, :, 0], ct_x[beg:end, :] - stride_exp)\n            clipped_box[beg:end, :, 1] = np.maximum(\n                bboxes[beg:end, :, 1], ct_y[beg:end, :] - stride_exp)\n            clipped_box[beg:end, 
:, 2] = np.minimum(\n                bboxes[beg:end, :, 2], ct_x[beg:end, :] + stride_exp)\n            clipped_box[beg:end, :, 3] = np.minimum(\n                bboxes[beg:end, :, 3], ct_y[beg:end, :] + stride_exp)\n            beg = end\n        l_res = xs - clipped_box[:, :, 0]\n        r_res = clipped_box[:, :, 2] - xs\n        t_res = ys - clipped_box[:, :, 1]\n        b_res = clipped_box[:, :, 3] - ys\n        clipped_box_reg_targets = np.stack([l_res, t_res, r_res, b_res], axis=2)\n        inside_gt_box = np.min(clipped_box_reg_targets, axis=2) > 0\n        return inside_gt_box\n\n    def __call__(self, samples, context=None):\n        assert len(self.object_sizes_of_interest) == len(self.downsample_ratios), \\\n            \"object_sizes_of_interest', and 'downsample_ratios' should have same length.\"\n\n        for sample in samples:\n            im = sample['image']\n            bboxes = sample['gt_bbox']\n            gt_class = sample['gt_class']\n            # calculate the locations\n            h, w = im.shape[1:3]\n            points, num_points_each_level = self._compute_points(w, h)\n            object_scale_exp = []\n            for i, num_pts in enumerate(num_points_each_level):\n                object_scale_exp.append(\n                    np.tile(\n                        np.array([self.object_sizes_of_interest[i]]),\n                        reps=[num_pts, 1]))\n            object_scale_exp = np.concatenate(object_scale_exp, axis=0)\n\n            gt_area = (bboxes[:, 2] - bboxes[:, 0]) * (\n                bboxes[:, 3] - bboxes[:, 1])\n            xs, ys = points[:, 0], points[:, 1]\n            xs = np.reshape(xs, newshape=[xs.shape[0], 1])\n            xs = np.tile(xs, reps=[1, bboxes.shape[0]])\n            ys = np.reshape(ys, newshape=[ys.shape[0], 1])\n            ys = np.tile(ys, reps=[1, bboxes.shape[0]])\n\n            l_res = xs - bboxes[:, 0]\n            r_res = bboxes[:, 2] - xs\n            t_res = ys - bboxes[:, 1]\n            b_res = bboxes[:, 3] - ys\n            reg_targets = np.stack([l_res, t_res, r_res, b_res], axis=2)\n            if self.center_sampling_radius > 0:\n                is_inside_box = self._check_inside_boxes_limited(\n                    bboxes, xs, ys, num_points_each_level)\n            else:\n                is_inside_box = np.min(reg_targets, axis=2) > 0\n            # check if the targets is inside the corresponding level\n            max_reg_targets = np.max(reg_targets, axis=2)\n            lower_bound = np.tile(\n                np.expand_dims(\n                    object_scale_exp[:, 0], axis=1),\n                reps=[1, max_reg_targets.shape[1]])\n            high_bound = np.tile(\n                np.expand_dims(\n                    object_scale_exp[:, 1], axis=1),\n                reps=[1, max_reg_targets.shape[1]])\n            is_match_current_level = \\\n                (max_reg_targets > lower_bound) & \\\n                (max_reg_targets < high_bound)\n            points2gtarea = np.tile(\n                np.expand_dims(\n                    gt_area, axis=0), reps=[xs.shape[0], 1])\n            points2gtarea[is_inside_box == 0] = self.INF\n            points2gtarea[is_match_current_level == 0] = self.INF\n            points2min_area = points2gtarea.min(axis=1)\n            points2min_area_ind = points2gtarea.argmin(axis=1)\n            labels = gt_class[points2min_area_ind] + 1\n            labels[points2min_area == self.INF] = 0\n            reg_targets = reg_targets[range(xs.shape[0]), points2min_area_ind]\n     
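       # FCOS centerness target: sqrt((min(l, r) / max(l, r)) *\n            # (min(t, b) / max(t, b))), from the regression targets above\n     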
       ctn_targets = np.sqrt((reg_targets[:, [0, 2]].min(axis=1) / \\\n                                  reg_targets[:, [0, 2]].max(axis=1)) * \\\n                                  (reg_targets[:, [1, 3]].min(axis=1) / \\\n                                   reg_targets[:, [1, 3]].max(axis=1))).astype(np.float32)\n            ctn_targets = np.reshape(\n                ctn_targets, newshape=[ctn_targets.shape[0], 1])\n            ctn_targets[labels <= 0] = 0\n            pos_ind = np.nonzero(labels != 0)\n            reg_targets_pos = reg_targets[pos_ind[0], :]\n            split_sections = []\n            beg = 0\n            for lvl in range(len(num_points_each_level)):\n                end = beg + num_points_each_level[lvl]\n                split_sections.append(end)\n                beg = end\n            labels_by_level = np.split(labels, split_sections, axis=0)\n            reg_targets_by_level = np.split(reg_targets, split_sections, axis=0)\n            ctn_targets_by_level = np.split(ctn_targets, split_sections, axis=0)\n            for lvl in range(len(self.downsample_ratios)):\n                grid_w = int(np.ceil(w / self.downsample_ratios[lvl]))\n                grid_h = int(np.ceil(h / self.downsample_ratios[lvl]))\n                if self.norm_reg_targets:\n                    if self.multiply_strides_reg_targets:\n                        sample['reg_target{}'.format(lvl)] = np.reshape(\n                            reg_targets_by_level[lvl],\n                            newshape=[grid_h, grid_w, 4])\n                    else:\n                        sample['reg_target{}'.format(lvl)] = \\\n                            np.reshape(\n                                reg_targets_by_level[lvl] / \\\n                                self.downsample_ratios[lvl],\n                                newshape=[grid_h, grid_w, 4])\n                else:\n                    sample['reg_target{}'.format(lvl)] = np.reshape(\n                        reg_targets_by_level[lvl],\n                        newshape=[grid_h, grid_w, 4])\n                sample['labels{}'.format(lvl)] = np.reshape(\n                    labels_by_level[lvl], newshape=[grid_h, grid_w, 1])\n                sample['centerness{}'.format(lvl)] = np.reshape(\n                    ctn_targets_by_level[lvl], newshape=[grid_h, grid_w, 1])\n\n            sample.pop('is_crowd', None)\n            sample.pop('difficult', None)\n            sample.pop('gt_class', None)\n            sample.pop('gt_bbox', None)\n        return samples\n\n\n@register_op\nclass Gt2GFLTarget(BaseOperator):\n    __shared__ = ['num_classes']\n    \"\"\"\n    Generate GFocal loss targets from ground truth data\n    \"\"\"\n\n    def __init__(self,\n                 num_classes=80,\n                 downsample_ratios=[8, 16, 32, 64, 128],\n                 grid_cell_scale=4,\n                 cell_offset=0,\n                 compute_vlr_region=False):\n        super(Gt2GFLTarget, self).__init__()\n        self.num_classes = num_classes\n        self.downsample_ratios = downsample_ratios\n        self.grid_cell_scale = grid_cell_scale\n        self.cell_offset = cell_offset\n        self.compute_vlr_region = compute_vlr_region\n\n        self.assigner = ATSSAssigner()\n\n    def get_grid_cells(self, featmap_size, scale, stride, offset=0):\n        \"\"\"\n        Generate grid cells of a feature map for target assignment.\n        Args:\n            featmap_size: Size of a single level feature map.\n            scale: Grid cell scale.\n            stride: Down
sample stride of the feature map.\n            offset: Offset of grid cells.\n        Returns:\n            Grid cells' xyxy positions. Size should be [feat_w * feat_h, 4].\n        \"\"\"\n        cell_size = stride * scale\n        h, w = featmap_size\n        x_range = (np.arange(w, dtype=np.float32) + offset) * stride\n        y_range = (np.arange(h, dtype=np.float32) + offset) * stride\n        x, y = np.meshgrid(x_range, y_range)\n        y = y.flatten()\n        x = x.flatten()\n        grid_cells = np.stack(\n            [\n                x - 0.5 * cell_size, y - 0.5 * cell_size, x + 0.5 * cell_size,\n                y + 0.5 * cell_size\n            ],\n            axis=-1)\n        return grid_cells\n\n    def get_sample(self, assign_gt_inds, gt_bboxes):\n        pos_inds = np.unique(np.nonzero(assign_gt_inds > 0)[0])\n        neg_inds = np.unique(np.nonzero(assign_gt_inds == 0)[0])\n        pos_assigned_gt_inds = assign_gt_inds[pos_inds] - 1\n\n        if gt_bboxes.size == 0:\n            # hack for index error case\n            assert pos_assigned_gt_inds.size == 0\n            pos_gt_bboxes = np.empty_like(gt_bboxes).reshape(-1, 4)\n        else:\n            if len(gt_bboxes.shape) < 2:\n                # reshape, not resize: ndarray.resize returns None and\n                # rejects negative dimensions\n                gt_bboxes = gt_bboxes.reshape(-1, 4)\n            pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :]\n        return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds\n\n    def __call__(self, samples, context=None):\n        assert len(samples) > 0\n        batch_size = len(samples)\n        # get grid cells of image\n        h, w = samples[0]['image'].shape[1:3]\n        multi_level_grid_cells = []\n        for stride in self.downsample_ratios:\n            featmap_size = (int(math.ceil(h / stride)),\n                            int(math.ceil(w / stride)))\n            multi_level_grid_cells.append(\n                self.get_grid_cells(featmap_size, self.grid_cell_scale, stride,\n                                    self.cell_offset))\n        mlvl_grid_cells_list = [\n            multi_level_grid_cells for i in range(batch_size)\n        ]\n        # pixel cell number of multi-level feature maps\n        num_level_cells = [\n            grid_cells.shape[0] for grid_cells in mlvl_grid_cells_list[0]\n        ]\n        num_level_cells_list = [num_level_cells] * batch_size\n        # concatenate all level cells into a single array\n        for i in range(batch_size):\n            mlvl_grid_cells_list[i] = np.concatenate(mlvl_grid_cells_list[i])\n        # target assign on all images\n        for sample, grid_cells, num_level_cells in zip(\n                samples, mlvl_grid_cells_list, num_level_cells_list):\n            gt_bboxes = sample['gt_bbox']\n            gt_labels = sample['gt_class'].squeeze()\n            if gt_labels.size == 1:\n                gt_labels = np.array([gt_labels]).astype(np.int32)\n            gt_bboxes_ignore = None\n            assign_gt_inds, _ = self.assigner(grid_cells, num_level_cells,\n                                              gt_bboxes, gt_bboxes_ignore,\n                                              gt_labels)\n\n            if self.compute_vlr_region:\n                vlr_region = self.assigner.get_vlr_region(\n                    grid_cells, num_level_cells, gt_bboxes, gt_bboxes_ignore,\n                    gt_labels)\n                sample['vlr_regions'] = vlr_region\n\n            pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds = self.get_sample(\n                assign_gt_inds, gt_bboxes)\n\n            num_cells = 
grid_cells.shape[0]\n            bbox_targets = np.zeros_like(grid_cells)\n            bbox_weights = np.zeros_like(grid_cells)\n            labels = np.ones([num_cells], dtype=np.int64) * self.num_classes\n            label_weights = np.zeros([num_cells], dtype=np.float32)\n\n            if len(pos_inds) > 0:\n                pos_bbox_targets = pos_gt_bboxes\n                bbox_targets[pos_inds, :] = pos_bbox_targets\n                bbox_weights[pos_inds, :] = 1.0\n                if not np.any(gt_labels):\n                    labels[pos_inds] = 0\n                else:\n                    labels[pos_inds] = gt_labels[pos_assigned_gt_inds]\n\n                label_weights[pos_inds] = 1.0\n            if len(neg_inds) > 0:\n                label_weights[neg_inds] = 1.0\n            sample['grid_cells'] = grid_cells\n            sample['labels'] = labels\n            sample['label_weights'] = label_weights\n            sample['bbox_targets'] = bbox_targets\n            sample['pos_num'] = max(pos_inds.size, 1)\n            sample.pop('is_crowd', None)\n            sample.pop('difficult', None)\n            sample.pop('gt_class', None)\n            sample.pop('gt_bbox', None)\n            sample.pop('gt_score', None)\n        return samples\n\n\n@register_op\nclass Gt2TTFTarget(BaseOperator):\n    __shared__ = ['num_classes']\n    \"\"\"\n    Gt2TTFTarget\n    Generate TTFNet targets by ground truth data\n    \n    Args:\n        num_classes(int): the number of classes.\n        down_ratio(int): the down ratio from images to heatmap, 4 by default.\n        alpha(float): the alpha parameter to generate gaussian target.\n            0.54 by default.\n    \"\"\"\n\n    def __init__(self, num_classes=80, down_ratio=4, alpha=0.54):\n        super(Gt2TTFTarget, self).__init__()\n        self.down_ratio = down_ratio\n        self.num_classes = num_classes\n        self.alpha = alpha\n\n    def __call__(self, samples, context=None):\n        output_size = samples[0]['image'].shape[1]\n        feat_size = output_size // self.down_ratio\n        for sample in samples:\n            heatmap = np.zeros(\n                (self.num_classes, feat_size, feat_size), dtype='float32')\n            box_target = np.ones(\n                (4, feat_size, feat_size), dtype='float32') * -1\n            reg_weight = np.zeros((1, feat_size, feat_size), dtype='float32')\n\n            gt_bbox = sample['gt_bbox']\n            gt_class = sample['gt_class']\n\n            bbox_w = gt_bbox[:, 2] - gt_bbox[:, 0] + 1\n            bbox_h = gt_bbox[:, 3] - gt_bbox[:, 1] + 1\n            area = bbox_w * bbox_h\n            boxes_areas_log = np.log(area)\n            boxes_ind = np.argsort(boxes_areas_log, axis=0)[::-1]\n            boxes_area_topk_log = boxes_areas_log[boxes_ind]\n            gt_bbox = gt_bbox[boxes_ind]\n            gt_class = gt_class[boxes_ind]\n\n            feat_gt_bbox = gt_bbox / self.down_ratio\n            feat_gt_bbox = np.clip(feat_gt_bbox, 0, feat_size - 1)\n            feat_hs, feat_ws = (feat_gt_bbox[:, 3] - feat_gt_bbox[:, 1],\n                                feat_gt_bbox[:, 2] - feat_gt_bbox[:, 0])\n\n            ct_inds = np.stack(\n                [(gt_bbox[:, 0] + gt_bbox[:, 2]) / 2,\n                 (gt_bbox[:, 1] + gt_bbox[:, 3]) / 2],\n                axis=1) / self.down_ratio\n\n            h_radiuses_alpha = (feat_hs / 2. * self.alpha).astype('int32')\n            w_radiuses_alpha = (feat_ws / 2. 
* self.alpha).astype('int32')\n\n            for k in range(len(gt_bbox)):\n                cls_id = gt_class[k]\n                fake_heatmap = np.zeros((feat_size, feat_size), dtype='float32')\n                self.draw_truncate_gaussian(fake_heatmap, ct_inds[k],\n                                            h_radiuses_alpha[k],\n                                            w_radiuses_alpha[k])\n\n                heatmap[cls_id] = np.maximum(heatmap[cls_id], fake_heatmap)\n                box_target_inds = fake_heatmap > 0\n                box_target[:, box_target_inds] = gt_bbox[k][:, None]\n\n                local_heatmap = fake_heatmap[box_target_inds]\n                ct_div = np.sum(local_heatmap)\n                local_heatmap *= boxes_area_topk_log[k]\n                reg_weight[0, box_target_inds] = local_heatmap / ct_div\n            sample['ttf_heatmap'] = heatmap\n            sample['ttf_box_target'] = box_target\n            sample['ttf_reg_weight'] = reg_weight\n            sample.pop('is_crowd', None)\n            sample.pop('difficult', None)\n            sample.pop('gt_class', None)\n            sample.pop('gt_bbox', None)\n            sample.pop('gt_score', None)\n        return samples\n\n    def draw_truncate_gaussian(self, heatmap, center, h_radius, w_radius):\n        h, w = 2 * h_radius + 1, 2 * w_radius + 1\n        sigma_x = w / 6\n        sigma_y = h / 6\n        gaussian = gaussian2D((h, w), sigma_x, sigma_y)\n\n        x, y = int(center[0]), int(center[1])\n\n        height, width = heatmap.shape[0:2]\n\n        left, right = min(x, w_radius), min(width - x, w_radius + 1)\n        top, bottom = min(y, h_radius), min(height - y, h_radius + 1)\n\n        masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]\n        masked_gaussian = gaussian[h_radius - top:h_radius + bottom, w_radius -\n                                   left:w_radius + right]\n        if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0:\n            heatmap[y - top:y + bottom, x - left:x + right] = np.maximum(\n                masked_heatmap, masked_gaussian)\n        return heatmap\n\n\n@register_op\nclass Gt2Solov2Target(BaseOperator):\n    \"\"\"Assign mask target and labels in SOLOv2 network.\n    The code of this function is based on:\n        https://github.com/WXinlong/SOLO/blob/master/mmdet/models/anchor_heads/solov2_head.py#L271\n    Args:\n        num_grids (list): The list of feature map grids size.\n        scale_ranges (list): The list of mask boundary range.\n        coord_sigma (float): The coefficient of coordinate area length.\n        sampling_ratio (float): The ratio of down sampling.\n    \"\"\"\n\n    def __init__(self,\n                 num_grids=[40, 36, 24, 16, 12],\n                 scale_ranges=[[1, 96], [48, 192], [96, 384], [192, 768],\n                               [384, 2048]],\n                 coord_sigma=0.2,\n                 sampling_ratio=4.0):\n        super(Gt2Solov2Target, self).__init__()\n        self.num_grids = num_grids\n        self.scale_ranges = scale_ranges\n        self.coord_sigma = coord_sigma\n        self.sampling_ratio = sampling_ratio\n\n    def _scale_size(self, im, scale):\n        h, w = im.shape[:2]\n        new_size = (int(w * float(scale) + 0.5), int(h * float(scale) + 0.5))\n        resized_img = cv2.resize(\n            im, None, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)\n        return resized_img\n\n    def __call__(self, samples, context=None):\n        sample_id = 0\n        max_ins_num 
= [0] * len(self.num_grids)\n        for sample in samples:\n            gt_bboxes_raw = sample['gt_bbox']\n            gt_labels_raw = sample['gt_class'] + 1\n            im_c, im_h, im_w = sample['image'].shape[:]\n            gt_masks_raw = sample['gt_segm'].astype(np.uint8)\n            mask_feat_size = [\n                int(im_h / self.sampling_ratio), int(im_w / self.sampling_ratio)\n            ]\n            gt_areas = np.sqrt((gt_bboxes_raw[:, 2] - gt_bboxes_raw[:, 0]) *\n                               (gt_bboxes_raw[:, 3] - gt_bboxes_raw[:, 1]))\n            ins_ind_label_list = []\n            idx = 0\n            for (lower_bound, upper_bound), num_grid \\\n                    in zip(self.scale_ranges, self.num_grids):\n\n                hit_indices = ((gt_areas >= lower_bound) &\n                               (gt_areas <= upper_bound)).nonzero()[0]\n                num_ins = len(hit_indices)\n\n                ins_label = []\n                grid_order = []\n                cate_label = np.zeros([num_grid, num_grid], dtype=np.int64)\n                ins_ind_label = np.zeros([num_grid**2], dtype=np.bool_)\n\n                if num_ins == 0:\n                    ins_label = np.zeros(\n                        [1, mask_feat_size[0], mask_feat_size[1]],\n                        dtype=np.uint8)\n                    ins_ind_label_list.append(ins_ind_label)\n                    sample['cate_label{}'.format(idx)] = cate_label.flatten()\n                    sample['ins_label{}'.format(idx)] = ins_label\n                    sample['grid_order{}'.format(idx)] = np.asarray(\n                        [sample_id * num_grid * num_grid + 0], dtype=np.int32)\n                    idx += 1\n                    continue\n                gt_bboxes = gt_bboxes_raw[hit_indices]\n                gt_labels = gt_labels_raw[hit_indices]\n                gt_masks = gt_masks_raw[hit_indices, ...]\n\n                half_ws = 0.5 * (\n                    gt_bboxes[:, 2] - gt_bboxes[:, 0]) * self.coord_sigma\n                half_hs = 0.5 * (\n                    gt_bboxes[:, 3] - gt_bboxes[:, 1]) * self.coord_sigma\n\n                for seg_mask, gt_label, half_h, half_w in zip(\n                        gt_masks, gt_labels, half_hs, half_ws):\n                    if seg_mask.sum() == 0:\n                        continue\n                    # mass center\n                    upsampled_size = (mask_feat_size[0] * 4,\n                                      mask_feat_size[1] * 4)\n                    center_h, center_w = ndimage.measurements.center_of_mass(\n                        seg_mask)\n                    coord_w = int(\n                        (center_w / upsampled_size[1]) // (1. / num_grid))\n                    coord_h = int(\n                        (center_h / upsampled_size[0]) // (1. / num_grid))\n\n                    # left, top, right, down\n                    top_box = max(0,\n                                  int(((center_h - half_h) / upsampled_size[0])\n                                      // (1. / num_grid)))\n                    down_box = min(num_grid - 1,\n                                   int(((center_h + half_h) / upsampled_size[0])\n                                       // (1. / num_grid)))\n                    left_box = max(0,\n                                   int(((center_w - half_w) / upsampled_size[1])\n                                       // (1. 
/ num_grid)))\n                    right_box = min(num_grid - 1,\n                                    int(((center_w + half_w) /\n                                         upsampled_size[1]) // (1. / num_grid)))\n\n                    top = max(top_box, coord_h - 1)\n                    down = min(down_box, coord_h + 1)\n                    left = max(coord_w - 1, left_box)\n                    right = min(right_box, coord_w + 1)\n\n                    cate_label[top:(down + 1), left:(right + 1)] = gt_label\n                    seg_mask = self._scale_size(\n                        seg_mask, scale=1. / self.sampling_ratio)\n                    for i in range(top, down + 1):\n                        for j in range(left, right + 1):\n                            label = int(i * num_grid + j)\n                            cur_ins_label = np.zeros(\n                                [mask_feat_size[0], mask_feat_size[1]],\n                                dtype=np.uint8)\n                            cur_ins_label[:seg_mask.shape[0], :seg_mask.shape[\n                                1]] = seg_mask\n                            ins_label.append(cur_ins_label)\n                            ins_ind_label[label] = True\n                            grid_order.append(sample_id * num_grid * num_grid +\n                                              label)\n                if ins_label == []:\n                    ins_label = np.zeros(\n                        [1, mask_feat_size[0], mask_feat_size[1]],\n                        dtype=np.uint8)\n                    ins_ind_label_list.append(ins_ind_label)\n                    sample['cate_label{}'.format(idx)] = cate_label.flatten()\n                    sample['ins_label{}'.format(idx)] = ins_label\n                    sample['grid_order{}'.format(idx)] = np.asarray(\n                        [sample_id * num_grid * num_grid + 0], dtype=np.int32)\n                else:\n                    ins_label = np.stack(ins_label, axis=0)\n                    ins_ind_label_list.append(ins_ind_label)\n                    sample['cate_label{}'.format(idx)] = cate_label.flatten()\n                    sample['ins_label{}'.format(idx)] = ins_label\n                    sample['grid_order{}'.format(idx)] = np.asarray(\n                        grid_order, dtype=np.int32)\n                    assert len(grid_order) > 0\n                max_ins_num[idx] = max(\n                    max_ins_num[idx],\n                    sample['ins_label{}'.format(idx)].shape[0])\n                idx += 1\n            ins_ind_labels = np.concatenate([\n                ins_ind_labels_level_img\n                for ins_ind_labels_level_img in ins_ind_label_list\n            ])\n            fg_num = np.sum(ins_ind_labels)\n            sample['fg_num'] = fg_num\n            sample_id += 1\n\n            sample.pop('is_crowd')\n            sample.pop('gt_class')\n            sample.pop('gt_bbox')\n            sample.pop('gt_poly')\n            sample.pop('gt_segm')\n\n        # padding batch\n        for data in samples:\n            for idx in range(len(self.num_grids)):\n                gt_ins_data = np.zeros(\n                    [\n                        max_ins_num[idx],\n                        data['ins_label{}'.format(idx)].shape[1],\n                        data['ins_label{}'.format(idx)].shape[2]\n                    ],\n                    dtype=np.uint8)\n                gt_ins_data[0:data['ins_label{}'.format(idx)].shape[\n                    0], :, :] = data['ins_label{}'.format(idx)]\n        
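        # pad grid_order the same way as ins_label so each sample carries\n                # max_ins_num[idx] entries at this level\n        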
        gt_grid_order = np.zeros([max_ins_num[idx]], dtype=np.int32)\n                gt_grid_order[0:data['grid_order{}'.format(idx)].shape[\n                    0]] = data['grid_order{}'.format(idx)]\n                data['ins_label{}'.format(idx)] = gt_ins_data\n                data['grid_order{}'.format(idx)] = gt_grid_order\n\n        return samples\n\n\n@register_op\nclass Gt2SparseTarget(BaseOperator):\n    def __init__(self, use_padding_shape=False):\n        super(Gt2SparseTarget, self).__init__()\n        self.use_padding_shape = use_padding_shape\n\n    def __call__(self, samples, context=None):\n        for sample in samples:\n            ori_h, ori_w = sample['h'], sample['w']\n            if self.use_padding_shape:\n                h, w = sample[\"image\"].shape[1:3]\n                if \"scale_factor\" in sample:\n                    sf_w, sf_h = sample[\"scale_factor\"][1], sample[\n                        \"scale_factor\"][0]\n                    sample[\"scale_factor_whwh\"] = np.array(\n                        [sf_w, sf_h, sf_w, sf_h], dtype=np.float32)\n                else:\n                    sample[\"scale_factor_whwh\"] = np.array(\n                        [1.0, 1.0, 1.0, 1.0], dtype=np.float32)\n            else:\n                h, w = round(sample['im_shape'][0]), round(sample['im_shape'][\n                    1])\n                sample[\"scale_factor_whwh\"] = np.array(\n                    [w / ori_w, h / ori_h, w / ori_w, h / ori_h],\n                    dtype=np.float32)\n\n            sample[\"img_whwh\"] = np.array([w, h, w, h], dtype=np.float32)\n            sample[\"ori_shape\"] = np.array([ori_h, ori_w], dtype=np.int32)\n\n        return samples\n\n\n@register_op\nclass PadMaskBatch(BaseOperator):\n    \"\"\"\n    Pad a batch of samples so that each image's height and width are\n    divisible by a stride. The layout of each image should be 'CHW'.\n    Args:\n        pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure\n            height and width are divisible by `pad_to_stride`.\n        return_pad_mask (bool): If `return_pad_mask = True`, return\n            `pad_mask` for the transformer.\n    \"\"\"\n\n    def __init__(self, pad_to_stride=0, return_pad_mask=True):\n        super(PadMaskBatch, self).__init__()\n        self.pad_to_stride = pad_to_stride\n        self.return_pad_mask = return_pad_mask\n\n    def __call__(self, samples, context=None):\n        \"\"\"\n        Args:\n            samples (list): a batch of samples, each of which is a dict.\n        \"\"\"\n        coarsest_stride = self.pad_to_stride\n\n        max_shape = np.array([data['image'].shape for data in samples]).max(\n            axis=0)\n        if coarsest_stride > 0:\n            max_shape[1] = int(\n                np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride)\n            max_shape[2] = int(\n                np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride)\n\n        for data in samples:\n            im = data['image']\n            im_c, im_h, im_w = im.shape[:]\n            padding_im = np.zeros(\n                (im_c, max_shape[1], max_shape[2]), dtype=np.float32)\n            padding_im[:, :im_h, :im_w] = im.astype(np.float32)\n            data['image'] = padding_im\n            if 'semantic' in data and data['semantic'] is not None:\n                semantic = data['semantic']\n                padding_sem = np.zeros(\n                    (1, max_shape[1], max_shape[2]), dtype=np.float32)\n                padding_sem[:, :im_h, :im_w] = semantic\n                
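# write back the zero-padded semantic map, aligned with the padded image\n                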
data['semantic'] = padding_sem\n            if 'gt_segm' in data and data['gt_segm'] is not None:\n                gt_segm = data['gt_segm']\n                padding_segm = np.zeros(\n                    (gt_segm.shape[0], max_shape[1], max_shape[2]),\n                    dtype=np.uint8)\n                padding_segm[:, :im_h, :im_w] = gt_segm\n                data['gt_segm'] = padding_segm\n            if self.return_pad_mask:\n                padding_mask = np.zeros(\n                    (max_shape[1], max_shape[2]), dtype=np.float32)\n                padding_mask[:im_h, :im_w] = 1.\n                data['pad_mask'] = padding_mask\n\n        return samples\n\n\n@register_op\nclass Gt2CenterNetTarget(BaseOperator):\n    __shared__ = ['num_classes']\n    \"\"\"Gt2CenterNetTarget\n    Generate CenterNet targets from ground-truth data\n    Args:\n        down_ratio (int): The downsample ratio between the output feature map\n                          and the input image.\n        num_classes (int): The number of classes, 80 by default.\n        max_objs (int): The maximum number of detected objects, 128 by default.\n    \"\"\"\n\n    def __init__(self, num_classes=80, down_ratio=4, max_objs=128):\n        super(Gt2CenterNetTarget, self).__init__()\n        self.nc = num_classes\n        self.down_ratio = down_ratio\n        self.max_objs = max_objs\n\n    def __call__(self, sample, context=None):\n        input_h, input_w = sample['image'].shape[1:]\n        output_h = input_h // self.down_ratio\n        output_w = input_w // self.down_ratio\n        gt_bbox = sample['gt_bbox']\n        gt_class = sample['gt_class']\n\n        hm = np.zeros((self.nc, output_h, output_w), dtype=np.float32)\n        wh = np.zeros((self.max_objs, 2), dtype=np.float32)\n        reg = np.zeros((self.max_objs, 2), dtype=np.float32)\n        ind = np.zeros((self.max_objs), dtype=np.int64)\n        reg_mask = np.zeros((self.max_objs), dtype=np.int32)\n        cat_spec_wh = np.zeros((self.max_objs, self.nc * 2), dtype=np.float32)\n        cat_spec_mask = np.zeros((self.max_objs, self.nc * 2), dtype=np.int32)\n\n        trans_output = get_affine_transform(\n            center=sample['center'],\n            input_size=[sample['scale'], sample['scale']],\n            rot=0,\n            output_size=[output_w, output_h])\n\n        gt_det = []\n        for i, (bbox, cls) in enumerate(zip(gt_bbox, gt_class)):\n            cls = int(cls)\n            bbox[:2] = affine_transform(bbox[:2], trans_output)\n            bbox[2:] = affine_transform(bbox[2:], trans_output)\n            bbox_amodal = copy.deepcopy(bbox)\n            bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1)\n            bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1)\n            h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]\n            if h > 0 and w > 0:\n                radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)\n                radius = max(0, int(radius))\n                ct = np.array(\n                    [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],\n                    dtype=np.float32)\n                ct_int = ct.astype(np.int32)\n\n                # get hm, wh, reg, ind, ind_mask\n                draw_umich_gaussian(hm[cls], ct_int, radius)\n                wh[i] = 1. * w, 1. 
* h\n                reg[i] = ct - ct_int\n                ind[i] = ct_int[1] * output_w + ct_int[0]\n                reg_mask[i] = 1\n                cat_spec_wh[i, cls * 2:cls * 2 + 2] = wh[i]\n                cat_spec_mask[i, cls * 2:cls * 2 + 2] = 1\n                gt_det.append([\n                    ct[0] - w / 2, ct[1] - h / 2, ct[0] + w / 2, ct[1] + h / 2,\n                    1, cls\n                ])\n\n        sample.pop('gt_bbox', None)\n        sample.pop('gt_class', None)\n        sample.pop('center', None)\n        sample.pop('scale', None)\n        sample.pop('is_crowd', None)\n        sample.pop('difficult', None)\n\n        sample['index'] = ind\n        sample['index_mask'] = reg_mask\n        sample['heatmap'] = hm\n        sample['size'] = wh\n        sample['offset'] = reg\n        return sample\n\n\n@register_op\nclass PadGT(BaseOperator):\n    \"\"\"\n    Pad zeros to `gt_class`, `gt_bbox`, `gt_score`... so that every sample has\n    num_max_boxes entries, where num_max_boxes is the largest ground-truth box\n    count in the batch.\n    Args:\n        return_gt_mask (bool): If True, return `pad_gt_mask`;\n                                1 means a real bbox, 0 means padding.\n    \"\"\"\n\n    def __init__(self, return_gt_mask=True, pad_img=False, minimum_gtnum=0, only_origin_box=False):\n        super(PadGT, self).__init__()\n        self.return_gt_mask = return_gt_mask\n        self.pad_img = pad_img\n        self.minimum_gtnum = minimum_gtnum\n        self.only_origin_box = only_origin_box\n\n    def _impad(self,\n               img: np.ndarray,\n               *,\n               shape=None,\n               padding=None,\n               pad_val=0,\n               padding_mode='constant') -> np.ndarray:\n        \"\"\"Pad the given image to a certain shape or pad on all sides with\n        specified padding mode and padding value.\n\n        Args:\n            img (ndarray): Image to be padded.\n            shape (tuple[int]): Expected padding shape (h, w). Default: None.\n            padding (int or tuple[int]): Padding on each border. If a single int is\n                provided this is used to pad all borders. If tuple of length 2 is\n                provided this is the padding on left/right and top/bottom\n                respectively. If a tuple of length 4 is provided this is the\n                padding for the left, top, right and bottom borders respectively.\n                Default: None. Note that `shape` and `padding` can not be both\n                set.\n            pad_val (Number | Sequence[Number]): Values to be filled in padding\n                areas when padding_mode is 'constant'. Default: 0.\n            padding_mode (str): Type of padding. Should be: constant, edge,\n                reflect or symmetric. Default: constant.\n                - constant: pads with a constant value, this value is specified\n                with pad_val.\n                - edge: pads with the last value at the edge of the image.\n                - reflect: pads with reflection of image without repeating the last\n                value on the edge. For example, padding [1, 2, 3, 4] with 2\n                elements on both sides in reflect mode will result in\n                [3, 2, 1, 2, 3, 4, 3, 2].\n                - symmetric: pads with reflection of image repeating the last value\n                on the edge. 
For example, padding [1, 2, 3, 4] with 2 elements on\n                both sides in symmetric mode will result in\n                [2, 1, 1, 2, 3, 4, 4, 3]\n\n        Returns:\n            ndarray: The padded image.\n        \"\"\"\n\n        assert (shape is not None) ^ (padding is not None)\n        if shape is not None:\n            width = max(shape[1] - img.shape[1], 0)\n            height = max(shape[0] - img.shape[0], 0)\n            padding = (0, 0, int(width), int(height))\n\n        # check pad_val\n        import numbers\n        if isinstance(pad_val, tuple):\n            assert len(pad_val) == img.shape[-1]\n        elif not isinstance(pad_val, numbers.Number):\n            raise TypeError('pad_val must be an int or a tuple. '\n                            f'But received {type(pad_val)}')\n\n        # check padding\n        if isinstance(padding, tuple) and len(padding) in [2, 4]:\n            if len(padding) == 2:\n                padding = (padding[0], padding[1], padding[0], padding[1])\n        elif isinstance(padding, numbers.Number):\n            padding = (padding, padding, padding, padding)\n        else:\n            raise ValueError('Padding must be an int or a 2- or 4-element tuple. '\n                             f'But received {padding}')\n\n        # check padding mode\n        assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']\n\n        border_type = {\n            'constant': cv2.BORDER_CONSTANT,\n            'edge': cv2.BORDER_REPLICATE,\n            'reflect': cv2.BORDER_REFLECT_101,\n            'symmetric': cv2.BORDER_REFLECT\n        }\n        img = cv2.copyMakeBorder(\n            img,\n            padding[1],\n            padding[3],\n            padding[0],\n            padding[2],\n            border_type[padding_mode],\n            value=pad_val)\n\n        return img\n\n    def checkmaxshape(self, samples):\n        maxh, maxw = 0, 0\n        for sample in samples:\n            h, w = sample['im_shape']\n            if h > maxh:\n                maxh = h\n            if w > maxw:\n                maxw = w\n        return (maxh, maxw)\n\n    def __call__(self, samples, context=None):\n        num_max_boxes = max([len(s['gt_bbox']) for s in samples])\n        num_max_boxes = max(self.minimum_gtnum, num_max_boxes)\n        if self.pad_img:\n            maxshape = self.checkmaxshape(samples)\n\n        if self.only_origin_box:\n            for sample in samples:\n                if self.pad_img:\n                    img = sample['image']\n                    padimg = self._impad(img, shape=maxshape)\n                    sample['image'] = padimg\n                if self.return_gt_mask:\n                    sample['pad_origin_gt_mask'] = np.zeros(\n                        (num_max_boxes, 1), dtype=np.float32)\n                if num_max_boxes == 0:\n                    continue\n                num_gt = len(sample['origin_gt_bbox'])\n                pad_origin_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32)\n                pad_origin_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32)\n                if num_gt > 0:\n                    pad_origin_gt_class[:num_gt] = sample['origin_gt_class']\n                    pad_origin_gt_bbox[:num_gt] = sample['origin_gt_bbox']\n                sample['origin_gt_class'] = pad_origin_gt_class\n                sample['origin_gt_bbox'] = pad_origin_gt_bbox\n                if 'pad_origin_gt_mask' in sample:\n                    sample['pad_origin_gt_mask'][:num_gt] = 1\n        
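# otherwise pad the standard per-sample gt fields up to num_max_boxes\n        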
else:\n            for sample in samples:\n                if self.pad_img:\n                    img = sample['image']\n                    padimg = self._impad(img, shape=maxshape)\n                    sample['image'] = padimg\n                if self.return_gt_mask:\n                    sample['pad_gt_mask'] = np.zeros(\n                        (num_max_boxes, 1), dtype=np.float32)\n                if num_max_boxes == 0:\n                    continue\n\n                num_gt = len(sample['gt_bbox'])\n                pad_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32)\n                pad_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32)\n                if num_gt > 0:\n                    pad_gt_class[:num_gt] = sample['gt_class']\n                    pad_gt_bbox[:num_gt] = sample['gt_bbox']\n                sample['gt_class'] = pad_gt_class\n                sample['gt_bbox'] = pad_gt_bbox\n                # pad_gt_mask\n                if 'pad_gt_mask' in sample:\n                    sample['pad_gt_mask'][:num_gt] = 1\n                # gt_score\n                if 'gt_score' in sample:\n                    pad_gt_score = np.zeros((num_max_boxes, 1), dtype=np.float32)\n                    if num_gt > 0:\n                        pad_gt_score[:num_gt] = sample['gt_score']\n                    sample['gt_score'] = pad_gt_score\n                if 'is_crowd' in sample:\n                    pad_is_crowd = np.zeros((num_max_boxes, 1), dtype=np.int32)\n                    if num_gt > 0:\n                        pad_is_crowd[:num_gt] = sample['is_crowd']\n                    sample['is_crowd'] = pad_is_crowd\n                if 'difficult' in sample:\n                    pad_diff = np.zeros((num_max_boxes, 1), dtype=np.int32)\n                    if num_gt > 0:\n                        pad_diff[:num_gt] = sample['difficult']\n                    sample['difficult'] = pad_diff\n                if 'gt_joints' in sample:\n                    num_joints = sample['gt_joints'].shape[1]\n                    pad_gt_joints = np.zeros(\n                        (num_max_boxes, num_joints, 3), dtype=np.float32)\n                    if num_gt > 0:\n                        pad_gt_joints[:num_gt] = sample['gt_joints']\n                    sample['gt_joints'] = pad_gt_joints\n                if 'gt_areas' in sample:\n                    pad_gt_areas = np.zeros((num_max_boxes, 1), dtype=np.float32)\n                    if num_gt > 0:\n                        pad_gt_areas[:num_gt, 0] = sample['gt_areas']\n                    sample['gt_areas'] = pad_gt_areas\n                # gt_segm\n                if 'gt_segm' in sample:\n                    pad_gt_segm = np.zeros(\n                        (num_max_boxes, *sample['gt_segm'].shape[-2:]),\n                        dtype=np.uint8)\n                    if num_gt > 0:\n                        pad_gt_segm[:num_gt] = sample['gt_segm']\n                    sample['gt_segm'] = pad_gt_segm.astype(np.float32)\n        return samples\n\n\n@register_op\nclass PadRGT(BaseOperator):\n    \"\"\"\n    Pad zeros to `gt_class`, `gt_bbox`, `gt_score`... so that every sample has\n    num_max_boxes entries, where num_max_boxes is the largest ground-truth box\n    count in the batch.\n    Args:\n        return_gt_mask (bool): If True, return `pad_gt_mask`;\n                                1 means a real bbox, 0 means padding.\n    \"\"\"\n\n    def __init__(self, return_gt_mask=True):\n        super(PadRGT, self).__init__()\n        self.return_gt_mask = return_gt_mask\n\n    def pad_field(self, sample, field, num_gt):\n        name, shape, dtype = field\n 
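      # each field is (name, padded_shape, dtype); keys absent from the\n        # sample are left untouched\n 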
       if name in sample:\n            pad_v = np.zeros(shape, dtype=dtype)\n            if num_gt > 0:\n                pad_v[:num_gt] = sample[name]\n            sample[name] = pad_v\n\n    def __call__(self, samples, context=None):\n        num_max_boxes = max([len(s['gt_bbox']) for s in samples])\n        for sample in samples:\n            if self.return_gt_mask:\n                sample['pad_gt_mask'] = np.zeros(\n                    (num_max_boxes, 1), dtype=np.float32)\n            if num_max_boxes == 0:\n                continue\n\n            num_gt = len(sample['gt_bbox'])\n            pad_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32)\n            pad_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32)\n            if num_gt > 0:\n                pad_gt_class[:num_gt] = sample['gt_class']\n                pad_gt_bbox[:num_gt] = sample['gt_bbox']\n            sample['gt_class'] = pad_gt_class\n            sample['gt_bbox'] = pad_gt_bbox\n            # pad_gt_mask\n            if 'pad_gt_mask' in sample:\n                sample['pad_gt_mask'][:num_gt] = 1\n            # pad the remaining per-box fields\n            names = ['gt_score', 'is_crowd', 'difficult', 'gt_poly', 'gt_rbox']\n            dims = [1, 1, 1, 8, 5]\n            dtypes = [np.float32, np.int32, np.int32, np.float32, np.float32]\n\n            for name, dim, dtype in zip(names, dims, dtypes):\n                self.pad_field(sample, [name, (num_max_boxes, dim), dtype],\n                               num_gt)\n\n        return samples\n\n\n@register_op\nclass Gt2CenterTrackTarget(BaseOperator):\n    __shared__ = ['num_classes']\n    \"\"\"Gt2CenterTrackTarget\n    Generate CenterTrack targets from ground-truth data\n    Args:\n        num_classes (int): The number of classes, 1 by default.\n        down_ratio (int): The downsample ratio between the output feature map\n                          and the input image.\n        max_objs (int): The maximum number of detected objects, 256 by default.\n    \"\"\"\n\n    def __init__(self,\n                 num_classes=1,\n                 down_ratio=4,\n                 max_objs=256,\n                 hm_disturb=0.05,\n                 lost_disturb=0.4,\n                 fp_disturb=0.1,\n                 pre_hm=True,\n                 add_tracking=True,\n                 add_ltrb_amodal=True):\n        super(Gt2CenterTrackTarget, self).__init__()\n        self.nc = num_classes\n        self.down_ratio = down_ratio\n        self.max_objs = max_objs\n\n        self.hm_disturb = hm_disturb\n        self.lost_disturb = lost_disturb\n        self.fp_disturb = fp_disturb\n        self.pre_hm = pre_hm\n        self.add_tracking = add_tracking\n        self.add_ltrb_amodal = add_ltrb_amodal\n\n    def _get_pre_dets(self, input_h, input_w, trans_input_pre, gt_bbox_pre,\n                      gt_class_pre, gt_track_id_pre):\n        hm_h, hm_w = input_h, input_w\n        return_hm = self.pre_hm\n        pre_hm = np.zeros(\n            (1, hm_h, hm_w), dtype=np.float32) if return_hm else None\n        pre_cts, track_ids = [], []\n\n        for i, (\n                bbox, cls, track_id\n        ) in enumerate(zip(gt_bbox_pre, gt_class_pre, gt_track_id_pre)):\n            cls = int(cls)\n            bbox[:2] = affine_transform(bbox[:2], trans_input_pre)\n            bbox[2:] = affine_transform(bbox[2:], trans_input_pre)\n            bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, hm_w - 1)\n            bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, hm_h - 1)\n            h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]\n            max_rad = 
1\n            if (h > 0 and w > 0):\n                radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)\n                radius = max(0, int(radius))\n                max_rad = max(max_rad, radius)\n                ct = np.array(\n                    [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],\n                    dtype=np.float32)\n                ct0 = ct.copy()\n                conf = 1\n\n                ct[0] = ct[0] + np.random.randn() * self.hm_disturb * w\n                ct[1] = ct[1] + np.random.randn() * self.hm_disturb * h\n                conf = 1 if np.random.rand() > self.lost_disturb else 0\n\n                ct_int = ct.astype(np.int32)\n                if conf == 0:\n                    pre_cts.append(ct / self.down_ratio)\n                else:\n                    pre_cts.append(ct0 / self.down_ratio)\n\n                track_ids.append(track_id)\n                if reutrn_hm:\n                    draw_umich_gaussian(pre_hm[0], ct_int, radius, k=conf)\n\n                if np.random.rand() < self.fp_disturb and reutrn_hm:\n                    ct2 = ct0.copy()\n                    # Hard code heatmap disturb ratio, haven't tried other numbers.\n                    ct2[0] = ct2[0] + np.random.randn() * 0.05 * w\n                    ct2[1] = ct2[1] + np.random.randn() * 0.05 * h\n                    ct2_int = ct2.astype(np.int32)\n                    draw_umich_gaussian(pre_hm[0], ct2_int, radius, k=conf)\n        return pre_hm, pre_cts, track_ids\n\n    def __call__(self, sample, context=None):\n        input_h, input_w = sample['image'].shape[1:]\n        output_h = input_h // self.down_ratio\n        output_w = input_w // self.down_ratio\n        gt_bbox = sample['gt_bbox']\n        gt_class = sample['gt_class']\n\n        # init\n        hm = np.zeros((self.nc, output_h, output_w), dtype=np.float32)\n        wh = np.zeros((self.max_objs, 2), dtype=np.float32)\n        reg = np.zeros((self.max_objs, 2), dtype=np.float32)\n        ind = np.zeros((self.max_objs), dtype=np.int64)\n        reg_mask = np.zeros((self.max_objs), dtype=np.int32)\n        if self.add_tracking:\n            tr = np.zeros((self.max_objs, 2), dtype=np.float32)\n        if self.add_ltrb_amodal:\n            ltrb_amodal = np.zeros((self.max_objs, 4), dtype=np.float32)\n\n        trans_output = get_affine_transform(\n            center=sample['center'],\n            input_size=[sample['scale'], sample['scale']],\n            rot=0,\n            output_size=[output_w, output_h])\n\n        pre_hm, pre_cts, track_ids = self._get_pre_dets(\n            input_h, input_w, sample['trans_input'], sample['pre_gt_bbox'],\n            sample['pre_gt_class'], sample['pre_gt_track_id'])\n\n        for i, (bbox, cls) in enumerate(zip(gt_bbox, gt_class)):\n            cls = int(cls)\n            rect = np.array(\n                [[bbox[0], bbox[1]], [bbox[0], bbox[3]], [bbox[2], bbox[3]],\n                 [bbox[2], bbox[1]]],\n                dtype=np.float32)\n            for t in range(4):\n                rect[t] = affine_transform(rect[t], trans_output)\n                bbox[:2] = rect[:, 0].min(), rect[:, 1].min()\n                bbox[2:] = rect[:, 0].max(), rect[:, 1].max()\n\n            bbox_amodal = copy.deepcopy(bbox)\n            bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1)\n            bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1)\n\n            h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]\n            if h > 0 and w > 0:\n                radius = 
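                # CenterNet-style assignment: each valid gt center gets a Gaussian\n                # peak on hm[cls]; wh/reg/ind store its size, sub-pixel offset and\n                # flattened position on the output_w x output_h feature map.\n                radius = 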
gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)\n                radius = max(0, int(radius))\n                ct = np.array(\n                    [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],\n                    dtype=np.float32)\n                ct_int = ct.astype(np.int32)\n\n                # get hm,wh,reg,ind,ind_mask\n                draw_umich_gaussian(hm[cls], ct_int, radius)\n                wh[i] = 1. * w, 1. * h\n                reg[i] = ct - ct_int\n                ind[i] = ct_int[1] * output_w + ct_int[0]\n                reg_mask[i] = 1\n                if self.add_tracking:\n                    if sample['gt_track_id'][i] in track_ids:\n                        pre_ct = pre_cts[track_ids.index(sample['gt_track_id'][\n                            i])]\n                        tr[i] = pre_ct - ct_int\n\n                if self.add_ltrb_amodal:\n                    ltrb_amodal[i] = \\\n                        bbox_amodal[0] - ct_int[0], bbox_amodal[1] - ct_int[1], \\\n                        bbox_amodal[2] - ct_int[0], bbox_amodal[3] - ct_int[1]\n\n        new_sample = {'image': sample['image']}\n        new_sample['index'] = ind\n        new_sample['index_mask'] = reg_mask\n        new_sample['heatmap'] = hm\n        new_sample['size'] = wh\n        new_sample['offset'] = reg\n        if self.add_tracking:\n            new_sample['tracking'] = tr\n        if self.add_ltrb_amodal:\n            new_sample['ltrb_amodal'] = ltrb_amodal\n\n        new_sample['pre_image'] = sample['pre_image']\n        new_sample['pre_hm'] = pre_hm\n\n        del sample\n        return new_sample\n\n\n@register_op\nclass BatchRandomResizeForSSOD(BaseOperator):\n    \"\"\"\n    Resize a batch of images to a randomly selected target size, optionally\n    with a randomly selected interpolation method.\n    Args:\n        target_size (int, list, tuple): image target size; must be a list or\n            tuple when random_size is True\n        keep_ratio (bool): whether to keep the aspect ratio, True by default\n        interp (int): the interpolation method\n        random_size (bool): whether to randomly select the target size\n        random_interp (bool): whether to randomly select the interpolation method\n    \"\"\"\n\n    def __init__(self,\n                 target_size,\n                 keep_ratio,\n                 interp=cv2.INTER_NEAREST,\n                 random_size=True,\n                 random_interp=False):\n        super(BatchRandomResizeForSSOD, self).__init__()\n        self.keep_ratio = keep_ratio\n        self.interps = [\n            cv2.INTER_NEAREST,\n            cv2.INTER_LINEAR,\n            cv2.INTER_AREA,\n            cv2.INTER_CUBIC,\n            cv2.INTER_LANCZOS4,\n        ]\n        self.interp = interp\n        assert isinstance(target_size, (\n            int, Sequence)), \"target_size must be int, list or tuple\"\n        if random_size and not isinstance(target_size, list):\n            raise TypeError(\n                \"Type of target_size is invalid when random_size is True. Must be List, now is {}\".\n                format(type(target_size)))\n        self.target_size = target_size\n        self.random_size = random_size\n        self.random_interp = random_interp\n\n    def __call__(self, samples, context=None):\n        if self.random_size:\n            index = np.random.choice(len(self.target_size))\n            target_size = self.target_size[index]\n        else:\n            # index is only meaningful when a size is sampled; define it anyway\n            # so the return below does not raise a NameError\n            index = -1\n            target_size = self.target_size\n        if context is not None:\n            # reuse an externally chosen size index (keeps paired batches in sync)\n            index = context\n            target_size = self.target_size[context]\n        if self.random_interp:\n            interp = np.random.choice(self.interps)\n        else:\n            interp = self.interp\n\n        resizer = Resize(target_size, keep_ratio=self.keep_ratio, interp=interp)\n        return [resizer(samples, context=context), index]\n
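\n\n# Illustrative usage (a sketch, not invoked in this module): assuming samples\n# is a list of decoded sample dicts, the operator resizes the whole batch to one\n# randomly chosen size and also returns the chosen index so a paired call can\n# pass it back via context:\n#\n#   op = BatchRandomResizeForSSOD(\n#       target_size=[608, 640, 672], keep_ratio=True, random_size=True)\n#   resized_samples, size_index = op(samples)\n"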
  },
  {
    "path": "ppdet/data/transform/culane_operators.py",
    "content": "import numpy as np\nimport imgaug.augmenters as iaa\nfrom .operators import BaseOperator, register_op\nfrom ppdet.utils.logger import setup_logger\nfrom ppdet.data.culane_utils import linestrings_to_lanes, transform_annotation\n\nlogger = setup_logger(__name__)\n\n__all__ = [\n    \"CULaneTrainProcess\", \"CULaneDataProcess\", \"HorizontalFlip\",\n    \"ChannelShuffle\", \"CULaneAffine\", \"CULaneResize\", \"OneOfBlur\",\n    \"MultiplyAndAddToBrightness\", \"AddToHueAndSaturation\"\n]\n\n\ndef trainTransforms(img_h, img_w):\n    transforms = [{\n        'name': 'Resize',\n        'parameters': dict(size=dict(\n            height=img_h, width=img_w)),\n        'p': 1.0\n    }, {\n        'name': 'HorizontalFlip',\n        'parameters': dict(p=1.0),\n        'p': 0.5\n    }, {\n        'name': 'ChannelShuffle',\n        'parameters': dict(p=1.0),\n        'p': 0.1\n    }, {\n        'name': 'MultiplyAndAddToBrightness',\n        'parameters': dict(\n            mul=(0.85, 1.15), add=(-10, 10)),\n        'p': 0.6\n    }, {\n        'name': 'AddToHueAndSaturation',\n        'parameters': dict(value=(-10, 10)),\n        'p': 0.7\n    }, {\n        'name': 'OneOf',\n        'transforms': [\n            dict(\n                name='MotionBlur', parameters=dict(k=(3, 5))), dict(\n                    name='MedianBlur', parameters=dict(k=(3, 5)))\n        ],\n        'p': 0.2\n    }, {\n        'name': 'Affine',\n        'parameters': dict(\n            translate_percent=dict(\n                x=(-0.1, 0.1), y=(-0.1, 0.1)),\n            rotate=(-10, 10),\n            scale=(0.8, 1.2)),\n        'p': 0.7\n    }, {\n        'name': 'Resize',\n        'parameters': dict(size=dict(\n            height=img_h, width=img_w)),\n        'p': 1.0\n    }]\n    return transforms\n\n\n@register_op\nclass CULaneTrainProcess(BaseOperator):\n    def __init__(self, img_w, img_h):\n        super(CULaneTrainProcess, self).__init__()\n        self.img_w = img_w\n        self.img_h = img_h\n        self.transforms = trainTransforms(self.img_h, self.img_w)\n\n        if self.transforms is not None:\n            img_transforms = []\n            for aug in self.transforms:\n                p = aug['p']\n                if aug['name'] != 'OneOf':\n                    img_transforms.append(\n                        iaa.Sometimes(\n                            p=p,\n                            then_list=getattr(iaa, aug['name'])(**aug[\n                                'parameters'])))\n                else:\n                    img_transforms.append(\n                        iaa.Sometimes(\n                            p=p,\n                            then_list=iaa.OneOf([\n                                getattr(iaa, aug_['name'])(**aug_['parameters'])\n                                for aug_ in aug['transforms']\n                            ])))\n        else:\n            img_transforms = []\n        self.iaa_transform = iaa.Sequential(img_transforms)\n\n    def apply(self, sample, context=None):\n        img, line_strings, seg = self.iaa_transform(\n            image=sample['image'],\n            line_strings=sample['lanes'],\n            segmentation_maps=sample['mask'])\n        sample['image'] = img\n        sample['lanes'] = line_strings\n        sample['mask'] = seg\n        return sample\n\n\n@register_op\nclass CULaneDataProcess(BaseOperator):\n    def __init__(self, img_w, img_h, num_points, max_lanes):\n        super(CULaneDataProcess, self).__init__()\n        self.img_w = img_w\n        
self.img_h = img_h\n        self.num_points = num_points\n        self.n_offsets = num_points\n        self.n_strips = num_points - 1\n        self.strip_size = self.img_h / self.n_strips\n\n        self.max_lanes = max_lanes\n        self.offsets_ys = np.arange(self.img_h, -1, -self.strip_size)\n\n    def apply(self, sample, context=None):\n        data = {}\n        line_strings = sample['lanes']\n        line_strings.clip_out_of_image_()\n        new_anno = {'lanes': linestrings_to_lanes(line_strings)}\n\n        for i in range(30):\n            try:\n                annos = transform_annotation(\n                    self.img_w, self.img_h, self.max_lanes, self.n_offsets,\n                    self.offsets_ys, self.n_strips, self.strip_size, new_anno)\n                label = annos['label']\n                lane_endpoints = annos['lane_endpoints']\n                break\n            except Exception:\n                if (i + 1) == 30:\n                    logger.critical('Transform annotation failed 30 times :(')\n                    exit()\n\n        sample['image'] = sample['image'].astype(np.float32) / 255.\n        data['image'] = sample['image'].transpose(2, 0, 1)\n        data['lane_line'] = label\n        data['seg'] = sample['seg']\n        data['full_img_path'] = sample['full_img_path']\n        data['img_name'] = sample['img_name']\n        data['im_id'] = sample['im_id']\n\n        if 'mask' in sample.keys():\n            data['seg'] = sample['mask'].get_arr()\n\n        data['im_shape'] = np.array([self.img_w, self.img_h], dtype=np.float32)\n        data['scale_factor'] = np.array([1., 1.], dtype=np.float32)\n\n        return data\n\n\n@register_op\nclass CULaneResize(BaseOperator):\n    def __init__(self, img_h, img_w, prob=0.5):\n        super(CULaneResize, self).__init__()\n        self.img_h = img_h\n        self.img_w = img_w\n        self.prob = prob\n\n    def apply(self, sample, context=None):\n        transform = iaa.Sometimes(self.prob,\n                                  iaa.Resize({\n                                      \"height\": self.img_h,\n                                      \"width\": self.img_w\n                                  }))\n        if 'mask' in sample.keys():\n            img, line_strings, seg = transform(\n                image=sample['image'],\n                line_strings=sample['lanes'],\n                segmentation_maps=sample['mask'])\n            sample['image'] = img\n            sample['lanes'] = line_strings\n            sample['mask'] = seg\n        else:\n            img, line_strings = transform(\n                image=sample['image'].copy().astype(np.uint8),\n                line_strings=sample['lanes'])\n            sample['image'] = img\n            sample['lanes'] = line_strings\n\n        return sample\n\n\n@register_op\nclass HorizontalFlip(BaseOperator):\n    def __init__(self, prob=0.5):\n        super(HorizontalFlip, self).__init__()\n        self.prob = prob\n\n    def apply(self, sample, context=None):\n        transform = iaa.Sometimes(self.prob, iaa.HorizontalFlip(1.0))\n        if 'mask' in sample.keys():\n            img, line_strings, seg = transform(\n                image=sample['image'],\n                line_strings=sample['lanes'],\n                segmentation_maps=sample['mask'])\n            sample['image'] = img\n            sample['lanes'] = line_strings\n            sample['mask'] = seg\n        else:\n            img, line_strings = transform(\n
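                # no segmentation mask in this sample; flip image and lanes only\n                image=sample['image'], 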
line_strings=sample['lanes'])\n            sample['image'] = img\n            sample['lanes'] = line_strings\n\n        return sample\n\n\n@register_op\nclass ChannelShuffle(BaseOperator):\n    def __init__(self, prob=0.1):\n        super(ChannelShuffle, self).__init__()\n        self.prob = prob\n\n    def apply(self, sample, context=None):\n        transform = iaa.Sometimes(self.prob, iaa.ChannelShuffle(1.0))\n        if 'mask' in sample.keys():\n            img, line_strings, seg = transform(\n                image=sample['image'],\n                line_strings=sample['lanes'],\n                segmentation_maps=sample['mask'])\n            sample['image'] = img\n            sample['lanes'] = line_strings\n            sample['mask'] = seg\n        else:\n            img, line_strings = transform(\n                image=sample['image'], line_strings=sample['lanes'])\n            sample['image'] = img\n            sample['lanes'] = line_strings\n\n        return sample\n\n\n@register_op\nclass MultiplyAndAddToBrightness(BaseOperator):\n    def __init__(self, mul=(0.85, 1.15), add=(-10, 10), prob=0.5):\n        super(MultiplyAndAddToBrightness, self).__init__()\n        self.mul = tuple(mul)\n        self.add = tuple(add)\n        self.prob = prob\n\n    def apply(self, sample, context=None):\n        transform = iaa.Sometimes(\n            self.prob,\n            iaa.MultiplyAndAddToBrightness(\n                mul=self.mul, add=self.add))\n        if 'mask' in sample.keys():\n            img, line_strings, seg = transform(\n                image=sample['image'],\n                line_strings=sample['lanes'],\n                segmentation_maps=sample['mask'])\n            sample['image'] = img\n            sample['lanes'] = line_strings\n            sample['mask'] = seg\n        else:\n            img, line_strings = transform(\n                image=sample['image'], line_strings=sample['lanes'])\n            sample['image'] = img\n            sample['lanes'] = line_strings\n\n        return sample\n\n\n@register_op\nclass AddToHueAndSaturation(BaseOperator):\n    def __init__(self, value=(-10, 10), prob=0.5):\n        super(AddToHueAndSaturation, self).__init__()\n        self.value = tuple(value)\n        self.prob = prob\n\n    def apply(self, sample, context=None):\n        transform = iaa.Sometimes(\n            self.prob, iaa.AddToHueAndSaturation(value=self.value))\n        if 'mask' in sample.keys():\n            img, line_strings, seg = transform(\n                image=sample['image'],\n                line_strings=sample['lanes'],\n                segmentation_maps=sample['mask'])\n            sample['image'] = img\n            sample['lanes'] = line_strings\n            sample['mask'] = seg\n        else:\n            img, line_strings = transform(\n                image=sample['image'], line_strings=sample['lanes'])\n            sample['image'] = img\n            sample['lanes'] = line_strings\n\n        return sample\n\n\n@register_op\nclass OneOfBlur(BaseOperator):\n    def __init__(self, MotionBlur_k=(3, 5), MedianBlur_k=(3, 5), prob=0.5):\n        super(OneOfBlur, self).__init__()\n        self.MotionBlur_k = tuple(MotionBlur_k)\n        self.MedianBlur_k = tuple(MedianBlur_k)\n        self.prob = prob\n\n    def apply(self, sample, context=None):\n        transform = iaa.Sometimes(\n            self.prob,\n            iaa.OneOf([\n                iaa.MotionBlur(k=self.MotionBlur_k),\n                iaa.MedianBlur(k=self.MedianBlur_k)\n            ]))\n\n        if 
'mask' in sample.keys():\n            img, line_strings, seg = transform(\n                image=sample['image'],\n                line_strings=sample['lanes'],\n                segmentation_maps=sample['mask'])\n            sample['image'] = img\n            sample['lanes'] = line_strings\n            sample['mask'] = seg\n        else:\n            img, line_strings = transform(\n                image=sample['image'], line_strings=sample['lanes'])\n            sample['image'] = img\n            sample['lanes'] = line_strings\n\n        return sample\n\n\n@register_op\nclass CULaneAffine(BaseOperator):\n    def __init__(self,\n                 translate_percent_x=(-0.1, 0.1),\n                 translate_percent_y=(-0.1, 0.1),\n                 rotate=(3, 5),\n                 scale=(0.8, 1.2),\n                 prob=0.5):\n        super(CULaneAffine, self).__init__()\n        self.translate_percent = {\n            'x': tuple(translate_percent_x),\n            'y': tuple(translate_percent_y)\n        }\n        self.rotate = tuple(rotate)\n        self.scale = tuple(scale)\n        self.prob = prob\n\n    def apply(self, sample, context=None):\n        transform = iaa.Sometimes(\n            self.prob,\n            iaa.Affine(\n                translate_percent=self.translate_percent,\n                rotate=self.rotate,\n                scale=self.scale))\n\n        if 'mask' in sample.keys():\n            img, line_strings, seg = transform(\n                image=sample['image'],\n                line_strings=sample['lanes'],\n                segmentation_maps=sample['mask'])\n            sample['image'] = img\n            sample['lanes'] = line_strings\n            sample['mask'] = seg\n        else:\n            img, line_strings = transform(\n                image=sample['image'], line_strings=sample['lanes'])\n            sample['image'] = img\n            sample['lanes'] = line_strings\n\n        return sample\n"
  },
  {
    "path": "ppdet/data/transform/gridmask_utils.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# The code is based on:\n# https://github.com/dvlab-research/GridMask/blob/master/detection_grid/maskrcnn_benchmark/data/transforms/grid.py\n\nfrom __future__ import absolute_import\nfrom __future__ import print_function\nfrom __future__ import division\n\nimport numpy as np\nfrom PIL import Image\n\n\nclass Gridmask(object):\n    def __init__(self,\n                 use_h=True,\n                 use_w=True,\n                 rotate=1,\n                 offset=False,\n                 ratio=0.5,\n                 mode=1,\n                 prob=0.7,\n                 upper_iter=360000):\n        super(Gridmask, self).__init__()\n        self.use_h = use_h\n        self.use_w = use_w\n        self.rotate = rotate\n        self.offset = offset\n        self.ratio = ratio\n        self.mode = mode\n        self.prob = prob\n        self.st_prob = prob\n        self.upper_iter = upper_iter\n\n    def __call__(self, x, curr_iter):\n        self.prob = self.st_prob * min(1, 1.0 * curr_iter / self.upper_iter)\n        if np.random.rand() > self.prob:\n            return x\n        h, w, _ = x.shape\n        hh = int(1.5 * h)\n        ww = int(1.5 * w)\n        d = np.random.randint(2, h)\n        self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1)\n        mask = np.ones((hh, ww), np.float32)\n        st_h = np.random.randint(d)\n        st_w = np.random.randint(d)\n        if self.use_h:\n            for i in range(hh // d):\n                s = d * i + st_h\n                t = min(s + self.l, hh)\n                mask[s:t, :] *= 0\n        if self.use_w:\n            for i in range(ww // d):\n                s = d * i + st_w\n                t = min(s + self.l, ww)\n                mask[:, s:t] *= 0\n\n        r = np.random.randint(self.rotate)\n        mask = Image.fromarray(np.uint8(mask))\n        mask = mask.rotate(r)\n        mask = np.asarray(mask)\n        mask = mask[(hh - h) // 2:(hh - h) // 2 + h, (ww - w) // 2:(ww - w) // 2\n                    + w].astype(np.float32)\n\n        if self.mode == 1:\n            mask = 1 - mask\n        mask = np.expand_dims(mask, axis=-1)\n        if self.offset:\n            offset = (2 * (np.random.rand(h, w) - 0.5)).astype(np.float32)\n            x = (x * mask + offset * (1 - mask)).astype(x.dtype)\n        else:\n            x = (x * mask).astype(x.dtype)\n\n        return x\n"
  },
  {
    "path": "ppdet/data/transform/keypoint_operators.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# function:\n#    operators to process sample,\n#    eg: decode/resize/crop image\n\nfrom __future__ import absolute_import\n\ntry:\n    from collections.abc import Sequence\nexcept Exception:\n    from collections import Sequence\n\nimport cv2\nimport numpy as np\nimport math\nimport copy\n\nfrom ...modeling.keypoint_utils import get_affine_mat_kernel, warp_affine_joints, get_affine_transform, affine_transform, get_warp_matrix\nfrom ppdet.core.workspace import serializable\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\nregistered_ops = []\n\n__all__ = [\n    'RandomAffine', 'KeyPointFlip', 'TagGenerate', 'ToHeatmaps',\n    'NormalizePermute', 'EvalAffine', 'RandomFlipHalfBodyTransform',\n    'TopDownRandomFlip', 'TopDownRandomShiftBboxCenter', 'TopDownGetRandomScaleRotation',\n    'TopDownAffine', 'ToHeatmapsTopDown', 'ToHeatmapsTopDown_DARK',\n    'ToHeatmapsTopDown_UDP', 'TopDownEvalAffine',\n    'AugmentationbyInformantionDropping', 'SinglePoseAffine', 'NoiseJitter',\n    'FlipPose', 'PETR_Resize'\n]\n\n\ndef register_keypointop(cls):\n    return serializable(cls)\n\n\n@register_keypointop\nclass KeyPointFlip(object):\n    \"\"\"Get the fliped image by flip_prob. 
flip the coords also\n    the left coords and right coords should exchange while flip, for the right keypoint will be left keypoint after image fliped\n\n    Args:\n        flip_permutation (list[17]): the left-right exchange order list corresponding to [0,1,2,...,16]\n        hmsize (list[2]): output heatmap's shape list of different scale outputs of higherhrnet\n        flip_prob (float): the ratio whether to flip the image\n        records(dict): the dict contained the image, mask and coords\n\n    Returns:\n        records(dict): contain the image, mask and coords after tranformed\n\n    \"\"\"\n\n    def __init__(self, flip_permutation, hmsize=None, flip_prob=0.5):\n        super(KeyPointFlip, self).__init__()\n        assert isinstance(flip_permutation, Sequence)\n        self.flip_permutation = flip_permutation\n        self.flip_prob = flip_prob\n        self.hmsize = hmsize\n\n    def _flipjoints(self, records, sizelst):\n        '''\n        records['gt_joints'] is Sequence in higherhrnet\n        '''\n        if not ('gt_joints' in records and len(records['gt_joints']) > 0):\n            return records\n\n        kpts_lst = records['gt_joints']\n        if isinstance(kpts_lst, Sequence):\n            for idx, hmsize in enumerate(sizelst):\n                if kpts_lst[idx].ndim == 3:\n                    kpts_lst[idx] = kpts_lst[idx][:, self.flip_permutation]\n                else:\n                    kpts_lst[idx] = kpts_lst[idx][self.flip_permutation]\n                kpts_lst[idx][..., 0] = hmsize - kpts_lst[idx][..., 0]\n        else:\n            hmsize = sizelst[0]\n            if kpts_lst.ndim == 3:\n                kpts_lst = kpts_lst[:, self.flip_permutation]\n            else:\n                kpts_lst = kpts_lst[self.flip_permutation]\n            kpts_lst[..., 0] = hmsize - kpts_lst[..., 0]\n\n        records['gt_joints'] = kpts_lst\n        return records\n\n    def _flipmask(self, records, sizelst):\n        if not 'mask' in records:\n            return records\n\n        mask_lst = records['mask']\n        for idx, hmsize in enumerate(sizelst):\n            if len(mask_lst) > idx:\n                mask_lst[idx] = mask_lst[idx][:, ::-1]\n        records['mask'] = mask_lst\n        return records\n\n    def _flipbbox(self, records, sizelst):\n        if not 'gt_bbox' in records:\n            return records\n\n        bboxes = records['gt_bbox']\n        hmsize = sizelst[0]\n        bboxes[:, 0::2] = hmsize - bboxes[:, 0::2][:, ::-1]\n        bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, hmsize)\n        records['gt_bbox'] = bboxes\n        return records\n\n    def __call__(self, records):\n        flip = np.random.random() < self.flip_prob\n        if flip:\n            image = records['image']\n            image = image[:, ::-1]\n            records['image'] = image\n            if self.hmsize is None:\n                sizelst = [image.shape[1]]\n            else:\n                sizelst = self.hmsize\n            self._flipjoints(records, sizelst)\n            self._flipmask(records, sizelst)\n            self._flipbbox(records, sizelst)\n\n        return records\n\n\n@register_keypointop\nclass RandomAffine(object):\n    \"\"\"apply affine transform to image, mask and coords\n    to achieve the rotate, scale and shift effect for training image\n\n    Args:\n        max_degree (float): the max abslute rotate degree to apply, transform range is [-max_degree, max_degree]\n        max_scale (list[2]): the scale range to apply, transform range is [min, max]\n        
max_shift (float): the max abslute shift ratio to apply, transform range is [-max_shift*imagesize, max_shift*imagesize]\n        hmsize (list[2]): output heatmap's shape list of different scale outputs of higherhrnet\n        trainsize (list[2]): the standard length used to train, the 'scale_type' of [h,w] will be resize to trainsize for standard\n        scale_type (str): the length of [h,w] to used for trainsize, chosed between 'short' and 'long'\n        records(dict): the dict contained the image, mask and coords\n\n    Returns:\n        records(dict): contain the image, mask and coords after tranformed\n\n    \"\"\"\n\n    def __init__(self,\n                 max_degree=30,\n                 scale=[0.75, 1.5],\n                 max_shift=0.2,\n                 hmsize=None,\n                 trainsize=[512, 512],\n                 scale_type='short',\n                 boldervalue=[114, 114, 114]):\n        super(RandomAffine, self).__init__()\n        self.max_degree = max_degree\n        self.min_scale = scale[0]\n        self.max_scale = scale[1]\n        self.max_shift = max_shift\n        self.hmsize = hmsize\n        self.trainsize = trainsize\n        self.scale_type = scale_type\n        self.boldervalue = boldervalue\n\n    def _get_affine_matrix_old(self, center, scale, res, rot=0):\n        \"\"\"Generate transformation matrix.\"\"\"\n        h = scale\n        t = np.zeros((3, 3), dtype=np.float32)\n        t[0, 0] = float(res[1]) / h\n        t[1, 1] = float(res[0]) / h\n        t[0, 2] = res[1] * (-float(center[0]) / h + .5)\n        t[1, 2] = res[0] * (-float(center[1]) / h + .5)\n        t[2, 2] = 1\n        if rot != 0:\n            rot = -rot  # To match direction of rotation from cropping\n            rot_mat = np.zeros((3, 3), dtype=np.float32)\n            rot_rad = rot * np.pi / 180\n            sn, cs = np.sin(rot_rad), np.cos(rot_rad)\n            rot_mat[0, :2] = [cs, -sn]\n            rot_mat[1, :2] = [sn, cs]\n            rot_mat[2, 2] = 1\n            # Need to rotate around center\n            t_mat = np.eye(3)\n            t_mat[0, 2] = -res[1] / 2\n            t_mat[1, 2] = -res[0] / 2\n            t_inv = t_mat.copy()\n            t_inv[:2, 2] *= -1\n            t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t)))\n        return t\n\n    def _get_affine_matrix(self, center, scale, res, rot=0):\n        \"\"\"Generate transformation matrix.\"\"\"\n        w, h = scale\n        t = np.zeros((3, 3), dtype=np.float32)\n        t[0, 0] = float(res[0]) / w\n        t[1, 1] = float(res[1]) / h\n        t[0, 2] = res[0] * (-float(center[0]) / w + .5)\n        t[1, 2] = res[1] * (-float(center[1]) / h + .5)\n        t[2, 2] = 1\n        if rot != 0:\n            rot = -rot  # To match direction of rotation from cropping\n            rot_mat = np.zeros((3, 3), dtype=np.float32)\n            rot_rad = rot * np.pi / 180\n            sn, cs = np.sin(rot_rad), np.cos(rot_rad)\n            rot_mat[0, :2] = [cs, -sn]\n            rot_mat[1, :2] = [sn, cs]\n            rot_mat[2, 2] = 1\n            # Need to rotate around center\n            t_mat = np.eye(3)\n            t_mat[0, 2] = -res[0] / 2\n            t_mat[1, 2] = -res[1] / 2\n            t_inv = t_mat.copy()\n            t_inv[:2, 2] *= -1\n            t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t)))\n        return t\n\n    def _affine_joints_mask(self,\n                            degree,\n                            center,\n                            roi_size,\n                            dsize,\n    
                        keypoints=None,\n                            heatmap_mask=None,\n                            gt_bbox=None):\n        kpts = None\n        mask = None\n        bbox = None\n        mask_affine_mat = self._get_affine_matrix(center, roi_size, dsize,\n                                                  degree)[:2]\n        if heatmap_mask is not None:\n            mask = cv2.warpAffine(heatmap_mask, mask_affine_mat, dsize)\n            mask = ((mask / 255) > 0.5).astype(np.float32)\n        if keypoints is not None:\n            kpts = copy.deepcopy(keypoints)\n            kpts[..., 0:2] = warp_affine_joints(kpts[..., 0:2].copy(),\n                                                mask_affine_mat)\n            kpts[(kpts[..., 0]) > dsize[0], :] = 0\n            kpts[(kpts[..., 1]) > dsize[1], :] = 0\n            kpts[(kpts[..., 0]) < 0, :] = 0\n            kpts[(kpts[..., 1]) < 0, :] = 0\n        if gt_bbox is not None:\n            temp_bbox = gt_bbox[:, [0, 3, 2, 1]]\n            cat_bbox = np.concatenate((gt_bbox, temp_bbox), axis=-1)\n            gt_bbox_warped = warp_affine_joints(cat_bbox, mask_affine_mat)\n            bbox = np.zeros_like(gt_bbox)\n            bbox[:, 0] = gt_bbox_warped[:, 0::2].min(1).clip(0, dsize[0])\n            bbox[:, 2] = gt_bbox_warped[:, 0::2].max(1).clip(0, dsize[0])\n            bbox[:, 1] = gt_bbox_warped[:, 1::2].min(1).clip(0, dsize[1])\n            bbox[:, 3] = gt_bbox_warped[:, 1::2].max(1).clip(0, dsize[1])\n        return kpts, mask, bbox\n\n    def __call__(self, records):\n        image = records['image']\n        shape = np.array(image.shape[:2][::-1])\n        keypoints = None\n        heatmap_mask = None\n        gt_bbox = None\n        if 'gt_joints' in records:\n            keypoints = records['gt_joints']\n\n        if 'mask' in records:\n            heatmap_mask = records['mask']\n            heatmap_mask *= 255\n\n        if 'gt_bbox' in records:\n            gt_bbox = records['gt_bbox']\n\n        degree = (np.random.random() * 2 - 1) * self.max_degree\n        center = center = np.array((np.array(shape) / 2))\n\n        aug_scale = np.random.random() * (self.max_scale - self.min_scale\n                                          ) + self.min_scale\n        if self.scale_type == 'long':\n            scale = np.array([max(shape[0], shape[1]) / 1.0] * 2)\n        elif self.scale_type == 'short':\n            scale = np.array([min(shape[0], shape[1]) / 1.0] * 2)\n        elif self.scale_type == 'wh':\n            scale = shape\n        else:\n            raise ValueError('Unknown scale type: {}'.format(self.scale_type))\n        roi_size = aug_scale * scale\n        dx = int(0)\n        dy = int(0)\n        if self.max_shift > 0:\n\n            dx = np.random.randint(-self.max_shift * roi_size[0],\n                                   self.max_shift * roi_size[0])\n            dy = np.random.randint(-self.max_shift * roi_size[0],\n                                   self.max_shift * roi_size[1])\n\n        center += np.array([dx, dy])\n        input_size = 2 * center\n        if self.trainsize != -1:\n            dsize = self.trainsize\n            imgshape = (dsize)\n        else:\n            dsize = scale\n            imgshape = (shape.tolist())\n\n        image_affine_mat = self._get_affine_matrix(center, roi_size, dsize,\n                                                   degree)[:2]\n        image = cv2.warpAffine(\n            image,\n            image_affine_mat,\n            imgshape,\n            
flags=cv2.INTER_LINEAR,\n            borderValue=self.boldervalue)\n\n        if self.hmsize is None:\n            kpts, mask, gt_bbox = self._affine_joints_mask(\n                degree, center, roi_size, dsize, keypoints, heatmap_mask,\n                gt_bbox)\n            records['image'] = image\n            if kpts is not None: records['gt_joints'] = kpts\n            if mask is not None: records['mask'] = mask\n            if gt_bbox is not None: records['gt_bbox'] = gt_bbox\n            return records\n\n        kpts_lst = []\n        mask_lst = []\n        for hmsize in self.hmsize:\n            kpts, mask, gt_bbox = self._affine_joints_mask(\n                degree, center, roi_size, [hmsize, hmsize], keypoints,\n                heatmap_mask, gt_bbox)\n            kpts_lst.append(kpts)\n            mask_lst.append(mask)\n        records['image'] = image\n\n        if 'gt_joints' in records:\n            records['gt_joints'] = kpts_lst\n        if 'mask' in records:\n            records['mask'] = mask_lst\n        if 'gt_bbox' in records:\n            records['gt_bbox'] = gt_bbox\n        return records\n\n\n@register_keypointop\nclass EvalAffine(object):\n    \"\"\"apply affine transform to image\n    resize the short of [h,w] to standard size for eval\n\n    Args:\n        size (int): the standard length used to train, the 'short' of [h,w] will be resize to trainsize for standard\n        records(dict): the dict contained the image, mask and coords\n\n    Returns:\n        records(dict): contain the image, mask and coords after tranformed\n\n    \"\"\"\n\n    def __init__(self, size, stride=64):\n        super(EvalAffine, self).__init__()\n        self.size = size\n        self.stride = stride\n\n    def __call__(self, records):\n        image = records['image']\n        mask = records['mask'] if 'mask' in records else None\n        s = self.size\n        h, w, _ = image.shape\n        trans, size_resized = get_affine_mat_kernel(h, w, s, inv=False)\n        image_resized = cv2.warpAffine(image, trans, size_resized)\n        if mask is not None:\n            mask = cv2.warpAffine(mask, trans, size_resized)\n            records['mask'] = mask\n        if 'gt_joints' in records:\n            del records['gt_joints']\n        records['image'] = image_resized\n        records['scale_factor'] = self.size / min(h, w)\n        return records\n\n\n@register_keypointop\nclass NormalizePermute(object):\n    def __init__(self,\n                 mean=[123.675, 116.28, 103.53],\n                 std=[58.395, 57.120, 57.375],\n                 is_scale=True):\n        super(NormalizePermute, self).__init__()\n        self.mean = mean\n        self.std = std\n        self.is_scale = is_scale\n\n    def __call__(self, records):\n        image = records['image']\n        image = image.astype(np.float32)\n        if self.is_scale:\n            image /= 255.\n        image = image.transpose((2, 0, 1))\n        mean = np.array(self.mean, dtype=np.float32)\n        std = np.array(self.std, dtype=np.float32)\n        invstd = 1. 
/ std\n        for v, m, s in zip(image, mean, invstd):\n            # in-place per-channel normalization: v = (v - mean) / std\n            v -= m\n            v *= s\n        records['image'] = image\n        return records\n\n\n@register_keypointop\nclass TagGenerate(object):\n    \"\"\"record gt coords for aeloss to sample coords value in tagmaps\n\n    Args:\n        num_joints (int): the keypoint numbers of dataset to train\n        max_people (int): the maximum number of people supported when sampling for aeloss\n        records(dict): the dict containing the image, mask and coords\n\n    Returns:\n        records(dict): the gt coords used in the tagmap\n\n    \"\"\"\n\n    def __init__(self, num_joints, max_people=30):\n        super(TagGenerate, self).__init__()\n        self.max_people = max_people\n        self.num_joints = num_joints\n\n    def __call__(self, records):\n        kpts_lst = records['gt_joints']\n        kpts = kpts_lst[0]\n        tagmap = np.zeros((self.max_people, self.num_joints, 4), dtype=np.int64)\n        inds = np.where(kpts[..., 2] > 0)\n        p, j = inds[0], inds[1]\n        visible = kpts[inds]\n        # tagmap is [max_people, num_joints, 4]; the last dim is (j, y, x, valid)\n        tagmap[p, j, 0] = j\n        tagmap[p, j, 1] = visible[..., 1]  # y\n        tagmap[p, j, 2] = visible[..., 0]  # x\n        tagmap[p, j, 3] = 1\n        records['tagmap'] = tagmap\n        del records['gt_joints']\n        return records\n\n\n@register_keypointop\nclass ToHeatmaps(object):\n    \"\"\"generate the gaussian heatmaps of keypoints for the heatmap loss\n\n    Args:\n        num_joints (int): the keypoint numbers of dataset to train\n        hmsize (list[2]): output heatmap's shape list of different scale outputs of higherhrnet\n        sigma (float): the std of the generated gaussian kernel\n        records(dict): the dict containing the image, mask and coords\n\n    Returns:\n        records(dict): the heatmaps used by the heatmap loss\n\n    \"\"\"\n\n    def __init__(self, num_joints, hmsize, sigma=None):\n        super(ToHeatmaps, self).__init__()\n        self.num_joints = num_joints\n        self.hmsize = np.array(hmsize)\n        if sigma is None:\n            sigma = hmsize[0] // 64\n        self.sigma = sigma\n\n        r = 6 * sigma + 3\n        x = np.arange(0, r, 1, np.float32)\n        y = x[:, None]\n        x0, y0 = 3 * sigma + 1, 3 * sigma + 1\n        self.gaussian = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2))\n\n    def __call__(self, records):\n        kpts_lst = records['gt_joints']\n        mask_lst = records['mask']\n        for idx, hmsize in enumerate(self.hmsize):\n            mask = mask_lst[idx]\n            kpts = kpts_lst[idx]\n            heatmaps = np.zeros((self.num_joints, hmsize, hmsize))\n            inds = np.where(kpts[..., 2] > 0)\n            visible = kpts[inds].astype(np.int64)[..., :2]\n            ul = np.round(visible - 3 * self.sigma - 1)\n            br = np.round(visible + 3 * self.sigma + 2)\n            sul = np.maximum(0, -ul)\n            sbr = np.minimum(hmsize, br) - ul\n            dul = np.clip(ul, 0, hmsize - 1)\n            dbr = np.clip(br, 0, hmsize)\n            for i in range(len(visible)):\n                if visible[i][0] < 0 or visible[i][1] < 0 or visible[i][\n                        0] >= hmsize or visible[i][1] >= hmsize:\n                    continue\n                dx1, dy1 = dul[i]\n                dx2, dy2 = dbr[i]\n                sx1, sy1 = sul[i]\n                sx2, sy2 = sbr[i]\n                # paste the precomputed gaussian patch, keeping the elementwise max\n                heatmaps[inds[1][i], dy1:dy2, dx1:dx2] = np.maximum(\n                    self.gaussian[sy1:sy2, sx1:sx2],\n                    heatmaps[inds[1][i], dy1:dy2, dx1:dx2])\n            records['heatmap_gt{}x'.format(idx + 1)] = heatmaps\n            records['mask_{}x'.format(idx + 1)] = mask\n        del records['mask']\n        return records\n\n\n@register_keypointop\nclass RandomFlipHalfBodyTransform(object):\n    \"\"\"apply data augmentation to image and coords\n    to achieve the flip, scale, rotate and half body transform effect for training image\n\n    Args:\n        trainsize (list):[w, h], Image target size\n        upper_body_ids (list): The upper body joint ids\n        flip_pairs (list): The left-right joints exchange order list\n        pixel_std (int): The pixel std of the scale\n        scale (float): The scale factor to transform the image\n        rot (int): The rotate factor to transform the image\n        num_joints_half_body (int): The joints threshold of the half body transform\n        prob_half_body (float): The threshold of the half body transform\n        flip (bool): Whether to flip the image\n\n    Returns:\n        records(dict): contain the image and coords after the transform\n\n    \"\"\"\n\n    def __init__(self,\n                 trainsize,\n                 upper_body_ids,\n                 flip_pairs,\n                 pixel_std,\n                 scale=0.35,\n                 rot=40,\n                 num_joints_half_body=8,\n                 prob_half_body=0.3,\n                 flip=True,\n                 rot_prob=0.6):\n        super(RandomFlipHalfBodyTransform, self).__init__()\n        self.trainsize = trainsize\n        self.upper_body_ids = upper_body_ids\n        self.flip_pairs = flip_pairs\n        self.pixel_std = pixel_std\n        self.scale = scale\n        self.rot = rot\n        self.num_joints_half_body = num_joints_half_body\n        self.prob_half_body = prob_half_body\n        self.flip = flip\n        self.aspect_ratio = trainsize[0] * 1.0 / trainsize[1]\n        self.rot_prob = rot_prob\n\n    def halfbody_transform(self, joints, joints_vis):\n        upper_joints = []\n        lower_joints = []\n        for joint_id in range(joints.shape[0]):\n            if joints_vis[joint_id][0] > 0:\n                if joint_id in self.upper_body_ids:\n                    upper_joints.append(joints[joint_id])\n                else:\n                    lower_joints.append(joints[joint_id])\n        # fair coin flip (rand, not randn, which would bias the choice)\n        if np.random.rand() < 0.5 and len(upper_joints) > 2:\n            selected_joints = upper_joints\n        else:\n            selected_joints = lower_joints if len(\n                lower_joints) > 2 else upper_joints\n        if len(selected_joints) < 2:\n            return None, None\n        selected_joints = np.array(selected_joints, dtype=np.float32)\n        center = selected_joints.mean(axis=0)[:2]\n        left_top = np.amin(selected_joints, axis=0)\n        right_bottom = np.amax(selected_joints, axis=0)\n        w = right_bottom[0] - left_top[0]\n        h = right_bottom[1] - left_top[1]\n        if w > self.aspect_ratio * h:\n            h = w * 1.0 / self.aspect_ratio\n        elif w < self.aspect_ratio * h:\n            w = h * self.aspect_ratio\n        scale = np.array(\n            [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],\n            dtype=np.float32)\n        scale = scale * 1.5\n\n        return center, scale\n\n    def flip_joints(self, joints, joints_vis, width, matched_parts):\n        joints[:, 0] = width - joints[:, 0] - 1\n        for pair in matched_parts:\n            joints[pair[0], :], joints[pair[1], :] = \\\n                joints[pair[1], :], joints[pair[0], :].copy()\n            joints_vis[pair[0], :], joints_vis[pair[1], :] = \\\n                joints_vis[pair[1], :], joints_vis[pair[0], :].copy()\n\n        return joints * joints_vis, joints_vis\n\n    def __call__(self, records):\n        image = records['image']\n        joints = records['gt_joints']\n        joints_vis = records['joints_vis']\n        c = records['center']\n        s = records['scale']\n        r = 0\n        if (np.sum(joints_vis[:, 0]) > self.num_joints_half_body and\n                np.random.rand() < self.prob_half_body):\n            c_half_body, s_half_body = self.halfbody_transform(joints,\n                                                               joints_vis)\n            if c_half_body is not None and s_half_body is not None:\n                c, s = c_half_body, s_half_body\n        sf = self.scale\n        rf = self.rot\n        s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)\n        r = np.clip(np.random.randn() * rf, -rf * 2,\n                    rf * 2) if np.random.random() <= self.rot_prob else 0\n\n        if self.flip and np.random.random() <= 0.5:\n            image = image[:, ::-1, :]\n            joints, joints_vis = self.flip_joints(\n                joints, joints_vis, image.shape[1], self.flip_pairs)\n            c[0] = image.shape[1] - c[0] - 1\n        records['image'] = image\n        records['gt_joints'] = joints\n        records['joints_vis'] = joints_vis\n        records['center'] = c\n        records['scale'] = s\n        records['rotate'] = r\n\n        return records\n\n\n@register_keypointop\nclass AugmentationbyInformantionDropping(object):\n    \"\"\"AID: Augmentation by Information Dropping. Please refer\n        to https://arxiv.org/abs/2008.07139\n\n    Args:\n        prob_cutout (float): The probability of the Cutout augmentation.\n        offset_factor (float): Offset factor of cutout center.\n        num_patch (int): Number of patches to be cutout.\n        records(dict): the dict containing the image and coords\n\n    Returns:\n        records (dict): contain the image and coords after the transform\n\n    \"\"\"\n\n    def __init__(self,\n                 trainsize,\n                 prob_cutout=0.0,\n                 offset_factor=0.2,\n                 num_patch=1):\n        self.prob_cutout = prob_cutout\n        self.offset_factor = offset_factor\n        self.num_patch = num_patch\n        self.trainsize = trainsize\n\n    def _cutout(self, img, joints, joints_vis):\n        height, width, _ = img.shape\n        img = img.reshape((height * width, -1))\n        feat_x_int = np.arange(0, width)\n        feat_y_int = np.arange(0, height)\n        feat_x_int, feat_y_int = np.meshgrid(feat_x_int, feat_y_int)\n        feat_x_int = feat_x_int.reshape((-1, ))\n        feat_y_int = feat_y_int.reshape((-1, ))\n        for _ in range(self.num_patch):\n            vis_idx, _ = np.where(joints_vis > 0)\n            occlusion_joint_id = np.random.choice(vis_idx)\n            center = joints[occlusion_joint_id, 0:2]\n            offset = np.random.randn(2) * self.trainsize[0] * self.offset_factor\n            center = center + offset\n            radius = np.random.uniform(0.1, 0.2) * self.trainsize[0]\n            x_offset = (center[0] - feat_x_int) / radius\n            y_offset = (center[1] - feat_y_int) / radius\n            dis = x_offset**2 + y_offset**2\n            keep_pos = np.where((dis <= 1) & (dis >= 0))[0]\n            img[keep_pos, :] = 0\n        img = img.reshape((height, width, -1))\n        return img\n\n    def __call__(self, records):\n        img = records['image']\n        joints = records['gt_joints']\n        joints_vis = records['joints_vis']\n        if np.random.rand() < self.prob_cutout:\n            img = self._cutout(img, joints, joints_vis)\n        records['image'] = img\n        return records\n\n\n@register_keypointop\nclass TopDownRandomFlip(object):\n    \"\"\"Data augmentation with random image flip.\n\n    Args:\n        flip_perm (list[tuple]): Pairs of keypoints which are mirrored\n                (for example, left ear and right ear).\n        flip_prob (float): Probability of flip.\n    \"\"\"\n\n    def __init__(self, flip_perm=[], flip_prob=0.5):\n        self.flip_perm = flip_perm\n        self.flip_prob = flip_prob\n\n    def flip_joints(self, joints_3d, joints_3d_visible, img_width, flip_pairs):\n        assert len(joints_3d) == len(joints_3d_visible)\n        assert img_width > 0\n\n        joints_3d_flipped = joints_3d.copy()\n        joints_3d_visible_flipped = joints_3d_visible.copy()\n\n        # Swap left-right parts\n        for left, right in flip_pairs:\n            joints_3d_flipped[left, :] = joints_3d[right, :]\n            joints_3d_flipped[right, :] = joints_3d[left, :]\n\n            joints_3d_visible_flipped[left, :] = joints_3d_visible[right, :]\n            joints_3d_visible_flipped[right, :] = joints_3d_visible[left, :]\n\n        # Flip horizontally\n        joints_3d_flipped[:, 0] = img_width - 1 - joints_3d_flipped[:, 0]\n        joints_3d_flipped = joints_3d_flipped * (joints_3d_visible_flipped > 0)\n\n        return joints_3d_flipped, joints_3d_visible_flipped\n\n    def __call__(self, results):\n        \"\"\"Perform data augmentation with random image flip.\"\"\"\n        if np.random.rand() > self.flip_prob:\n            # skip so that the flip is applied with probability flip_prob\n            return results\n\n        img = results['image']\n        joints_3d = results['gt_joints']\n
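        # visibility flags are flipped in lockstep with the joint coordinates\n        joints_3d_visible = 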
results['joints_vis']\n        center = results['center']\n\n        # A flag indicating whether the image is flipped,\n        # which can be used by child class.\n        if not isinstance(img, list):\n            img = img[:, ::-1, :]\n        else:\n            img = [i[:, ::-1, :] for i in img]\n        if not isinstance(img, list):\n            joints_3d, joints_3d_visible = self.flip_joints(\n                joints_3d, joints_3d_visible, img.shape[1],\n                self.flip_perm)\n            center[0] = img.shape[1] - center[0] - 1\n        else:\n            joints_3d, joints_3d_visible = self.flip_joints(\n                joints_3d, joints_3d_visible, img[0].shape[1],\n                self.flip_perm)\n            center[0] = img[0].shape[1] - center[0] - 1\n\n        results['image'] = img\n        results['gt_joints'] = joints_3d\n        results['joints_vis'] = joints_3d_visible\n        results['center'] = center\n\n        return results\n\n\n@register_keypointop\nclass TopDownRandomShiftBboxCenter(object):\n    \"\"\"Random shift the bbox center.\n\n    Args:\n        shift_factor (float): The factor to control the shift range, which is\n            scale*pixel_std*scale_factor. Default: 0.16\n        shift_prob (float): Probability of applying random shift. Default: 0.3\n    \"\"\"\n\n    def __init__(self, shift_factor=0.16, shift_prob=0.3):\n        self.shift_factor = shift_factor\n        self.shift_prob = shift_prob\n\n    def __call__(self, results):\n        center = results['center']\n        scale = results['scale']\n        if np.random.rand() < self.shift_prob:\n            center += np.random.uniform(\n                -1, 1, 2) * self.shift_factor * scale * 200.0\n\n        results['center'] = center\n        return results\n\n@register_keypointop\nclass TopDownGetRandomScaleRotation(object):\n    \"\"\"Data augmentation with random scaling & rotating.\n\n    Args:\n        rot_factor (int): Rotating to ``[-2*rot_factor, 2*rot_factor]``.\n        scale_factor (float): Scaling to ``[1-scale_factor, 1+scale_factor]``.\n        rot_prob (float): Probability of random rotation.\n    \"\"\"\n\n    def __init__(self, rot_factor=40, scale_factor=0.5, rot_prob=0.6):\n        self.rot_factor = rot_factor\n        self.scale_factor = scale_factor\n        self.rot_prob = rot_prob\n\n    def __call__(self, results):\n        \"\"\"Perform data augmentation with random scaling & rotating.\"\"\"\n        s = results['scale']\n\n        sf = self.scale_factor\n        rf = self.rot_factor\n\n        s_factor = np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)\n        s = s * s_factor\n\n        r_factor = np.clip(np.random.randn() * rf, -rf * 2, rf * 2)\n        r = r_factor if np.random.rand() <= self.rot_prob else 0\n\n        results['scale'] = s\n        results['rotate'] = r\n\n        return results\n\n\n@register_keypointop\nclass TopDownAffine(object):\n    \"\"\"apply affine transform to image and coords\n\n    Args:\n        trainsize (list): [w, h], the standard size used to train\n        use_udp (bool): whether to use Unbiased Data Processing.\n        records(dict): the dict contained the image and coords\n\n    Returns:\n        records (dict): contain the image and coords after tranformed\n\n    \"\"\"\n\n    def __init__(self, trainsize, use_udp=False):\n        self.trainsize = trainsize\n        self.use_udp = use_udp\n\n    def __call__(self, records):\n        image = records['image']\n        joints = records['gt_joints']\n        joints_vis = 
records['joints_vis']\n        rot = records['rotate'] if \"rotate\" in records else 0\n        if self.use_udp:\n            trans = get_warp_matrix(\n                rot, records['center'] * 2.0,\n                [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0],\n                records['scale'] * 200.0)\n            image = cv2.warpAffine(\n                image,\n                trans, (int(self.trainsize[0]), int(self.trainsize[1])),\n                flags=cv2.INTER_LINEAR)\n            joints[:, 0:2] = warp_affine_joints(joints[:, 0:2].copy(), trans)\n        else:\n            trans = get_affine_transform(records['center'], records['scale'] *\n                                         200, rot, self.trainsize)\n            image = cv2.warpAffine(\n                image,\n                trans, (int(self.trainsize[0]), int(self.trainsize[1])),\n                flags=cv2.INTER_LINEAR)\n            for i in range(joints.shape[0]):\n                if joints_vis[i, 0] > 0.0:\n                    joints[i, 0:2] = affine_transform(joints[i, 0:2], trans)\n\n        records['image'] = image\n        records['gt_joints'] = joints\n\n        return records\n\n\n@register_keypointop\nclass SinglePoseAffine(object):\n    \"\"\"apply affine transform to image and coords\n\n    Args:\n        trainsize (list): [w, h], the standard size used to train\n        use_udp (bool): whether to use Unbiased Data Processing.\n        records(dict): the dict contained the image and coords\n\n    Returns:\n        records (dict): contain the image and coords after tranformed\n\n    \"\"\"\n\n    def __init__(self,\n                 trainsize,\n                 rotate=[1.0, 30],\n                 scale=[1.0, 0.25],\n                 use_udp=False):\n        self.trainsize = trainsize\n        self.use_udp = use_udp\n        self.rot_prob = rotate[0]\n        self.rot_range = rotate[1]\n        self.scale_prob = scale[0]\n        self.scale_ratio = scale[1]\n\n    def __call__(self, records):\n        image = records['image']\n        if 'joints_2d' in records:\n            joints = records['joints_2d'] if 'joints_2d' in records else None\n            joints_vis = records[\n                'joints_vis'] if 'joints_vis' in records else np.ones(\n                    (len(joints), 1))\n        rot = 0\n        s = 1.\n        if np.random.random() < self.rot_prob:\n            rot = np.clip(np.random.randn() * self.rot_range,\n                          -self.rot_range * 2, self.rot_range * 2)\n        if np.random.random() < self.scale_prob:\n            s = np.clip(np.random.randn() * self.scale_ratio + 1,\n                        1 - self.scale_ratio, 1 + self.scale_ratio)\n\n        if self.use_udp:\n            trans = get_warp_matrix(\n                rot,\n                np.array(records['bbox_center']) * 2.0,\n                [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0],\n                records['bbox_scale'] * 200.0 * s)\n            image = cv2.warpAffine(\n                image,\n                trans, (int(self.trainsize[0]), int(self.trainsize[1])),\n                flags=cv2.INTER_LINEAR)\n            if 'joints_2d' in records:\n                joints[:, 0:2] = warp_affine_joints(joints[:, 0:2].copy(),\n                                                    trans)\n        else:\n            trans = get_affine_transform(\n                np.array(records['bbox_center']),\n                records['bbox_scale'] * s * 200, rot, self.trainsize)\n            image = cv2.warpAffine(\n                
image,\n                trans, (int(self.trainsize[0]), int(self.trainsize[1])),\n                flags=cv2.INTER_LINEAR)\n            if 'joints_2d' in records:\n                for i in range(len(joints)):\n                    if joints_vis[i, 0] > 0.0:\n                        joints[i, 0:2] = affine_transform(joints[i, 0:2], trans)\n\n        if 'joints_3d' in records:\n            pose3d = records['joints_3d']\n            if rot != 0:\n                trans_3djoints = np.eye(3)\n                rot_rad = -rot * np.pi / 180\n                sn, cs = np.sin(rot_rad), np.cos(rot_rad)\n                trans_3djoints[0, :2] = [cs, -sn]\n                trans_3djoints[1, :2] = [sn, cs]\n                pose3d[:, :3] = np.einsum('ij,kj->ki', trans_3djoints,\n                                          pose3d[:, :3])\n                records['joints_3d'] = pose3d\n\n        records['image'] = image\n        if 'joints_2d' in records:\n            records['joints_2d'] = joints\n\n        return records\n\n\n@register_keypointop\nclass NoiseJitter(object):\n    \"\"\"apply random noise jitter to the image\n\n    Args:\n        noise_factor (float): the noise factor ratio used to generate the jitter\n\n    Returns:\n        records (dict): contain the image and coords after being transformed\n\n    \"\"\"\n\n    def __init__(self, noise_factor=0.4):\n        self.noise_factor = noise_factor\n\n    def __call__(self, records):\n        self.pn = np.random.uniform(1 - self.noise_factor,\n                                    1 + self.noise_factor, 3)\n        rgb_img = records['image']\n        rgb_img[:, :, 0] = np.minimum(\n            255.0, np.maximum(0.0, rgb_img[:, :, 0] * self.pn[0]))\n        rgb_img[:, :, 1] = np.minimum(\n            255.0, np.maximum(0.0, rgb_img[:, :, 1] * self.pn[1]))\n        rgb_img[:, :, 2] = np.minimum(\n            255.0, np.maximum(0.0, rgb_img[:, :, 2] * self.pn[2]))\n        records['image'] = rgb_img\n        return records\n\n\n@register_keypointop\nclass FlipPose(object):\n    \"\"\"randomly apply horizontal flip to the image and keypoints\n\n    Args:\n        flip_prob (float): the probability of applying the flip\n        img_res (int): the image width, used to mirror the x coordinates\n        num_joints (int): the number of joints, which selects the flip permutation\n\n    Returns:\n        records (dict): contain the image and coords after being transformed\n\n    \"\"\"\n\n    def __init__(self, flip_prob=0.5, img_res=224, num_joints=14):\n        self.flip_prob = flip_prob\n        self.img_res = img_res\n        if num_joints == 24:\n            self.perm = [\n                5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13, 14, 15, 16, 17,\n                18, 19, 21, 20, 23, 22\n            ]\n        elif num_joints == 14:\n            self.perm = [5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13]\n        else:\n            raise ValueError(\n                \"unsupported num_joints in flip: {}\".format(num_joints))\n\n    def __call__(self, records):\n        if np.random.random() < self.flip_prob:\n            img = records['image']\n            img = np.fliplr(img)\n\n            if 'joints_2d' in records:\n                joints_2d = records['joints_2d']\n                joints_2d = joints_2d[self.perm]\n                joints_2d[:, 0] = self.img_res - joints_2d[:, 0]\n                records['joints_2d'] = joints_2d\n\n            if 'joints_3d' in records:\n                joints_3d = records['joints_3d']\n                joints_3d = joints_3d[self.perm]\n                joints_3d[:, 0] = -joints_3d[:, 0]\n                records['joints_3d'] = joints_3d\n\n            records['image'] = img\n        return records
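\n\n\n# A minimal usage sketch (editorial; hypothetical shapes, never called by the\n# pipeline): NoiseJitter and FlipPose only read/write 'image' and the optional\n# 'joints_2d' / 'joints_3d' entries of a record dict.\ndef _single_pose_aug_demo():\n    rec = {\n        'image': np.zeros((224, 224, 3), dtype=np.float32),\n        'joints_2d': np.zeros((14, 2), dtype=np.float32),\n    }\n    rec = NoiseJitter(noise_factor=0.4)(rec)\n    rec = FlipPose(flip_prob=0.5, img_res=224, num_joints=14)(rec)\n    return rec\n\n\n@register_keypointop\nclass 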
TopDownEvalAffine(object):\n    \"\"\"apply affine transform to image and coords\n\n    Args:\n        trainsize (list): [w, h], the standard size used to train\n        use_udp (bool): whether to use Unbiased Data Processing.\n        records (dict): the dict containing the image and coords\n\n    Returns:\n        records (dict): contain the image and coords after being transformed\n\n    \"\"\"\n\n    def __init__(self, trainsize, use_udp=False):\n        self.trainsize = trainsize\n        self.use_udp = use_udp\n\n    def __call__(self, records):\n        image = records['image']\n        rot = 0\n        imshape = records['im_shape'][::-1]\n        center = imshape / 2.\n        scale = imshape\n\n        if self.use_udp:\n            trans = get_warp_matrix(\n                rot, center * 2.0,\n                [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], scale)\n            image = cv2.warpAffine(\n                image,\n                trans, (int(self.trainsize[0]), int(self.trainsize[1])),\n                flags=cv2.INTER_LINEAR)\n        else:\n            trans = get_affine_transform(center, scale, rot, self.trainsize)\n            image = cv2.warpAffine(\n                image,\n                trans, (int(self.trainsize[0]), int(self.trainsize[1])),\n                flags=cv2.INTER_LINEAR)\n        records['image'] = image\n\n        return records\n\n\n@register_keypointop\nclass ToHeatmapsTopDown(object):\n    \"\"\"to generate the gaussian heatmaps of keypoints for the heatmap loss\n\n    Args:\n        hmsize (list): [w, h] output heatmap's size\n        sigma (float): the std of the generated gaussian kernel\n        records (dict): the dict containing the image and coords\n\n    Returns:\n        records (dict): contain the heatmaps used for the heatmap loss\n\n    \"\"\"\n\n    def __init__(self, hmsize, sigma):\n        super(ToHeatmapsTopDown, self).__init__()\n        self.hmsize = np.array(hmsize)\n        self.sigma = sigma\n\n    def __call__(self, records):\n        \"\"\"refer to\n            https://github.com/leoxiaobin/deep-high-resolution-net.pytorch\n            Copyright (c) Microsoft, under the MIT License.\n        \"\"\"\n        joints = records['gt_joints']\n        joints_vis = records['joints_vis']\n        num_joints = joints.shape[0]\n        image_size = np.array(\n            [records['image'].shape[1], records['image'].shape[0]])\n        target_weight = np.ones((num_joints, 1), dtype=np.float32)\n        target_weight[:, 0] = joints_vis[:, 0]\n        target = np.zeros(\n            (num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32)\n        tmp_size = self.sigma * 3\n        feat_stride = image_size / self.hmsize\n        for joint_id in range(num_joints):\n            mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5)\n            mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5)\n            # Check that any part of the gaussian is in-bounds\n            ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]\n            br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]\n            if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[\n                    0] < 0 or br[1] < 0:\n                # If not, mark the joint as unlabeled and skip it\n                target_weight[joint_id] = 0\n                continue\n            # Generate gaussian\n            size = 2 * tmp_size + 1\n            x = np.arange(0, size, 1, np.float32)\n            y = x[:, np.newaxis]\n            x0 = y0 = size // 2
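\n            # tmp_size = 3 * sigma, so the window covers +/- 3 standard deviations\n            # The gaussian is not 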
normalized, we want the center value to equal 1\n            g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * self.sigma**2))\n\n            # Usable gaussian range\n            g_x = max(0, -ul[0]), min(br[0], self.hmsize[0]) - ul[0]\n            g_y = max(0, -ul[1]), min(br[1], self.hmsize[1]) - ul[1]\n            # Image range\n            img_x = max(0, ul[0]), min(br[0], self.hmsize[0])\n            img_y = max(0, ul[1]), min(br[1], self.hmsize[1])\n\n            v = target_weight[joint_id]\n            if v > 0.5:\n                target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[\n                    0]:g_y[1], g_x[0]:g_x[1]]\n        records['target'] = target\n        records['target_weight'] = target_weight\n        del records['gt_joints'], records['joints_vis']\n\n        return records\n\n\n@register_keypointop\nclass ToHeatmapsTopDown_DARK(object):\n    \"\"\"to generate the gaussian heatmaps of keypoints for the heatmap loss\n\n    Args:\n        hmsize (list): [w, h] output heatmap's size\n        sigma (float): the std of the generated gaussian kernel\n        records (dict): the dict containing the image and coords\n\n    Returns:\n        records (dict): contain the heatmaps used for the heatmap loss\n\n    \"\"\"\n\n    def __init__(self, hmsize, sigma):\n        super(ToHeatmapsTopDown_DARK, self).__init__()\n        self.hmsize = np.array(hmsize)\n        self.sigma = sigma\n\n    def __call__(self, records):\n        joints = records['gt_joints']\n        joints_vis = records['joints_vis']\n        num_joints = joints.shape[0]\n        image_size = np.array(\n            [records['image'].shape[1], records['image'].shape[0]])\n        target_weight = np.ones((num_joints, 1), dtype=np.float32)\n        target_weight[:, 0] = joints_vis[:, 0]\n        target = np.zeros(\n            (num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32)\n        tmp_size = self.sigma * 3\n        feat_stride = image_size / self.hmsize\n        for joint_id in range(num_joints):\n            mu_x = joints[joint_id][0] / feat_stride[0]\n            mu_y = joints[joint_id][1] / feat_stride[1]\n            # Check that any part of the gaussian is in-bounds\n            ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]\n            br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]\n            if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[\n                    0] < 0 or br[1] < 0:\n                # If not, mark the joint as unlabeled and skip it\n                target_weight[joint_id] = 0\n                continue\n\n            x = np.arange(0, self.hmsize[0], 1, np.float32)\n            y = np.arange(0, self.hmsize[1], 1, np.float32)\n            y = y[:, np.newaxis]\n\n            v = target_weight[joint_id]\n            if v > 0.5:\n                target[joint_id] = np.exp(-(\n                    (x - mu_x)**2 + (y - mu_y)**2) / (2 * self.sigma**2))\n        records['target'] = target\n        records['target_weight'] = target_weight\n        del records['gt_joints'], records['joints_vis']\n\n        return records
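\n\n\n# Editorial note: ToHeatmapsTopDown_DARK above centers the gaussian on the\n# continuous, unquantized joint location (no int rounding of mu_x/mu_y), so the\n# heatmap keeps the sub-pixel offset that the DARK decoder (Zhang et al.,\n# CVPR 2020) recovers at test time.\n\n\n@register_keypointop\nclass ToHeatmapsTopDown_UDP(object):\n    \"\"\"This code is based on:\n        https://github.com/HuangJunJie2017/UDP-Pose/blob/master/deep-high-resolution-net.pytorch/lib/dataset/JointsDataset.py\n\n        to generate the gaussian heatmaps of keypoint for heatmap loss.\n        ref: Huang et al. 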
The Devil is in the Details: Delving into Unbiased Data Processing\n        for Human Pose Estimation (CVPR 2020).\n\n    Args:\n        hmsize (list): [w, h] output heatmap's size\n        sigma (float): the std of the generated gaussian kernel\n        records (dict): the dict containing the image and coords\n\n    Returns:\n        records (dict): contain the heatmaps used for the heatmap loss\n    \"\"\"\n\n    def __init__(self, hmsize, sigma):\n        super(ToHeatmapsTopDown_UDP, self).__init__()\n        self.hmsize = np.array(hmsize)\n        self.sigma = sigma\n\n    def __call__(self, records):\n        joints = records['gt_joints']\n        joints_vis = records['joints_vis']\n        num_joints = joints.shape[0]\n        image_size = np.array(\n            [records['image'].shape[1], records['image'].shape[0]])\n        target_weight = np.ones((num_joints, 1), dtype=np.float32)\n        target_weight[:, 0] = joints_vis[:, 0]\n        target = np.zeros(\n            (num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32)\n        tmp_size = self.sigma * 3\n        size = 2 * tmp_size + 1\n        x = np.arange(0, size, 1, np.float32)\n        y = x[:, None]\n        # unbiased (UDP) stride: ratio of the (size - 1) spans, not the sizes\n        feat_stride = (image_size - 1.0) / (self.hmsize - 1.0)\n        for joint_id in range(num_joints):\n            mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5)\n            mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5)\n            # Check that any part of the gaussian is in-bounds\n            ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]\n            br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]\n            if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[\n                    0] < 0 or br[1] < 0:\n                # If not, mark the joint as unlabeled and skip it\n                target_weight[joint_id] = 0\n                continue\n\n            mu_x_ac = joints[joint_id][0] / feat_stride[0]\n            mu_y_ac = joints[joint_id][1] / feat_stride[1]\n            x0 = y0 = size // 2\n            x0 += mu_x_ac - mu_x\n            y0 += mu_y_ac - mu_y\n            g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * self.sigma**2))\n            # Usable gaussian range\n            g_x = max(0, -ul[0]), min(br[0], self.hmsize[0]) - ul[0]\n            g_y = max(0, -ul[1]), min(br[1], self.hmsize[1]) - ul[1]\n            # Image range\n            img_x = max(0, ul[0]), min(br[0], self.hmsize[0])\n            img_y = max(0, ul[1]), min(br[1], self.hmsize[1])\n\n            v = target_weight[joint_id]\n            if v > 0.5:\n                target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[\n                    0]:g_y[1], g_x[0]:g_x[1]]\n        records['target'] = target\n        records['target_weight'] = target_weight\n        del records['gt_joints'], records['joints_vis']\n\n        return records\n\n\nfrom typing import Optional, Tuple, Union, List\nimport numbers\n\n\ndef _scale_size(\n        size: Tuple[int, int],\n        scale: Union[float, int, tuple], ) -> Tuple[int, int]:\n    \"\"\"Rescale a size by a ratio.\n\n    Args:\n        size (tuple[int]): (w, h).\n        scale (float | tuple(float)): Scaling factor.\n\n    Returns:\n        tuple[int]: scaled size.\n    \"\"\"\n    if isinstance(scale, (float, int)):\n        scale = (scale, scale)\n    w, h = size\n    return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5)
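\n\n\n# Worked example (editorial; illustrative values): _scale_size((640, 480), 0.5)\n# returns (320, 240), while a per-axis ratio _scale_size((640, 480), (2.0, 0.5))\n# returns (1280, 240).\ndef rescale_size(old_size: tuple,\n                 scale: Union[float, int, tuple],\n                 return_scale: bool=False) -> 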
tuple:\n    \"\"\"Calculate the new size to be rescaled to.\n\n    Args:\n        old_size (tuple[int]): The old size (w, h) of image.\n        scale (float | tuple[int]): The scaling factor or maximum size.\n            If it is a float number, then the image will be rescaled by this\n            factor, else if it is a tuple of 2 integers, then the image will\n            be rescaled as large as possible within the scale.\n        return_scale (bool): Whether to return the scaling factor besides the\n            rescaled image size.\n\n    Returns:\n        tuple[int]: The new rescaled image size.\n    \"\"\"\n    w, h = old_size\n    if isinstance(scale, (float, int)):\n        if scale <= 0:\n            raise ValueError(f'Invalid scale {scale}, must be positive.')\n        scale_factor = scale\n    elif isinstance(scale, (list, tuple)):\n        max_long_edge = max(scale)\n        max_short_edge = min(scale)\n        scale_factor = min(max_long_edge / max(h, w),\n                           max_short_edge / min(h, w))\n    else:\n        raise TypeError(\n            f'Scale must be a number or a list/tuple of int, but got {type(scale)}')\n\n    new_size = _scale_size((w, h), scale_factor)\n\n    if return_scale:\n        return new_size, scale_factor\n    else:\n        return new_size\n\n\ndef imrescale(img: np.ndarray,\n              scale: Union[float, Tuple[int, int]],\n              return_scale: bool=False,\n              interpolation: str='bilinear',\n              backend: Optional[str]=None) -> Union[np.ndarray, Tuple[\n                  np.ndarray, float]]:\n    \"\"\"Resize image while keeping the aspect ratio.\n\n    Args:\n        img (ndarray): The input image.\n        scale (float | tuple[int]): The scaling factor or maximum size.\n            If it is a float number, then the image will be rescaled by this\n            factor, else if it is a tuple of 2 integers, then the image will\n            be rescaled as large as possible within the scale.\n        return_scale (bool): Whether to return the scaling factor besides the\n            rescaled image.\n        interpolation (str): Same as :func:`resize`.\n        backend (str | None): Same as :func:`resize`.\n\n    Returns:\n        ndarray: The rescaled image.\n    \"\"\"\n    h, w = img.shape[:2]\n    new_size, scale_factor = rescale_size((w, h), scale, return_scale=True)\n    rescaled_img = imresize(\n        img, new_size, interpolation=interpolation, backend=backend)\n    if return_scale:\n        return rescaled_img, scale_factor\n    else:\n        return rescaled_img\n\n\ndef imresize(\n        img: np.ndarray,\n        size: Tuple[int, int],\n        return_scale: bool=False,\n        interpolation: str='bilinear',\n        out: Optional[np.ndarray]=None,\n        backend: Optional[str]=None,\n        interp=cv2.INTER_LINEAR, ) -> Union[Tuple[np.ndarray, float, float],\n                                            np.ndarray]:\n    \"\"\"Resize image to a given size.\n\n    Args:\n        img (ndarray): The input image.\n        size (tuple[int]): Target size (w, h).\n        return_scale (bool): Whether to return `w_scale` and `h_scale`.\n        interpolation (str): Interpolation method, accepted values are\n            \"nearest\", \"bilinear\", \"bicubic\", \"area\", \"lanczos\" for 'cv2'\n            backend, \"nearest\", \"bilinear\" for 'pillow' backend.\n        out (ndarray): The output destination.\n        backend (str | None): The image resize backend type. Options are `cv2`,\n            `pillow`, `None`. 
If backend is None, the global imread_backend\n            specified by ``mmcv.use_backend()`` will be used. Default: None.\n\n    Returns:\n        tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or\n        `resized_img`.\n    \"\"\"\n    h, w = img.shape[:2]\n    if backend is None:\n        backend = imread_backend\n    if backend not in ['cv2', 'pillow']:\n        raise ValueError(f'backend: {backend} is not supported for resize. '\n                         f\"Supported backends are 'cv2', 'pillow'\")\n\n    if backend == 'pillow':\n        assert img.dtype == np.uint8, 'Pillow backend only supports uint8 type'\n        pil_image = Image.fromarray(img)\n        pil_image = pil_image.resize(size, pillow_interp_codes[interpolation])\n        resized_img = np.array(pil_image)\n    else:\n        # NOTE: the cv2 branch uses the raw ``interp`` flag directly; the\n        # ``interpolation`` string only selects the pillow filter above.\n        resized_img = cv2.resize(img, size, dst=out, interpolation=interp)\n    if not return_scale:\n        return resized_img\n    else:\n        w_scale = size[0] / w\n        h_scale = size[1] / h\n        return resized_img, w_scale, h_scale\n\n\nclass PETR_Resize:\n    \"\"\"Resize images & bbox & mask.\n\n    This transform resizes the input image to some scale. Bboxes and masks are\n    then resized with the same scale factor. If the input dict contains the key\n    \"scale\", then the scale in the input dict is used, otherwise the specified\n    scale in the init method is used. If the input dict contains the key\n    \"scale_factor\" (if MultiScaleFlipAug does not give img_scale but\n    scale_factor), the actual scale will be computed by image shape and\n    scale_factor.\n\n    `img_scale` can either be a tuple (single-scale) or a list of tuple\n    (multi-scale). There are 3 multiscale modes:\n\n    - ``ratio_range is not None``: randomly sample a ratio from the ratio \\\n      range and multiply it with the image scale.\n    - ``ratio_range is None`` and ``multiscale_mode == \"range\"``: randomly \\\n      sample a scale from the multiscale range.\n    - ``ratio_range is None`` and ``multiscale_mode == \"value\"``: randomly \\\n      sample a scale from multiple scales.\n\n    Args:\n        img_scale (tuple or list[tuple]): Images scales for resizing.\n        multiscale_mode (str): Either \"range\" or \"value\".\n        ratio_range (tuple[float]): (min_ratio, max_ratio)\n        keep_ratio (bool): Whether to keep the aspect ratio when resizing the\n            image.\n        bbox_clip_border (bool, optional): Whether to clip the objects outside\n            the border of the image. In some dataset like MOT17, the gt bboxes\n            are allowed to cross the border of images. Therefore, we don't\n            need to clip the gt bboxes in these cases. Defaults to True.\n        backend (str): Image resize backend, choices are 'cv2' and 'pillow'.\n            These two backends generate slightly different results. Defaults\n            to 'cv2'.\n        interpolation (str): Interpolation method, accepted values are\n            \"nearest\", \"bilinear\", \"bicubic\", \"area\", \"lanczos\" for 'cv2'\n            backend, \"nearest\", \"bilinear\" for 'pillow' backend.\n        override (bool, optional): Whether to override `scale` and\n            `scale_factor` so as to call resize twice. Default False. 
If True,\n            after the first resizing, the existing `scale` and `scale_factor`\n            will be ignored so that a second resizing can be performed.\n            This option is a work-around for the repeated resizes in DETR.\n            Defaults to False.\n    \"\"\"\n\n    def __init__(self,\n                 img_scale=None,\n                 multiscale_mode='range',\n                 ratio_range=None,\n                 keep_ratio=True,\n                 bbox_clip_border=True,\n                 backend='cv2',\n                 interpolation='bilinear',\n                 override=False,\n                 keypoint_clip_border=True):\n        if img_scale is None:\n            self.img_scale = None\n        else:\n            if isinstance(img_scale, list):\n                self.img_scale = img_scale\n            else:\n                self.img_scale = [img_scale]\n            assert isinstance(self.img_scale, list)\n\n        if ratio_range is not None:\n            # mode 1: given a scale and a range of image ratio\n            assert len(self.img_scale) == 1\n        else:\n            # mode 2: given multiple scales or a range of scales\n            assert multiscale_mode in ['value', 'range']\n\n        self.backend = backend\n        self.multiscale_mode = multiscale_mode\n        self.ratio_range = ratio_range\n        self.keep_ratio = keep_ratio\n        # TODO: refactor the override option in Resize\n        self.interpolation = interpolation\n        self.override = override\n        self.bbox_clip_border = bbox_clip_border\n        self.keypoint_clip_border = keypoint_clip_border\n\n    @staticmethod\n    def random_select(img_scales):\n        \"\"\"Randomly select an img_scale from given candidates.\n\n        Args:\n            img_scales (list[tuple]): Images scales for selection.\n\n        Returns:\n            (tuple, int): Returns a tuple ``(img_scale, scale_idx)``, \\\n                where ``img_scale`` is the selected image scale and \\\n                ``scale_idx`` is the selected index in the given candidates.\n        \"\"\"\n\n        assert isinstance(img_scales, list)\n        scale_idx = np.random.randint(len(img_scales))\n        img_scale = img_scales[scale_idx]\n        return img_scale, scale_idx\n\n    @staticmethod\n    def random_sample(img_scales):\n        \"\"\"Randomly sample an img_scale when ``multiscale_mode=='range'``.\n\n        Args:\n            img_scales (list[tuple]): Images scale range for sampling.\n                There must be two tuples in img_scales, which specify the lower\n                and upper bound of image scales.\n\n        Returns:\n            (tuple, None): Returns a tuple ``(img_scale, None)``, where \\\n                ``img_scale`` is sampled scale and None is just a placeholder \\\n                to be consistent with :func:`random_select`.\n        \"\"\"\n\n        assert isinstance(img_scales, list) and len(img_scales) == 2\n        img_scale_long = [max(s) for s in img_scales]\n        img_scale_short = [min(s) for s in img_scales]\n        long_edge = np.random.randint(\n            min(img_scale_long), max(img_scale_long) + 1)\n        short_edge = np.random.randint(\n            min(img_scale_short), max(img_scale_short) + 1)\n        img_scale = (long_edge, short_edge)\n        return img_scale, None
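\n\n    # Example (editorial, illustrative numbers): random_sample_ratio([1333, 800],\n    # (0.8, 1.2)) with a drawn ratio of 1.1 returns (1466, 880).\n    @staticmethod\n    def random_sample_ratio(img_scale, ratio_range):\n        \"\"\"Randomly sample an img_scale when ``ratio_range`` is specified.\n\n        A ratio will be randomly sampled 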
from the range specified by\n        ``ratio_range``. Then it would be multiplied with ``img_scale`` to\n        generate sampled scale.\n\n        Args:\n            img_scale (list): Images scale base to multiply with ratio.\n            ratio_range (tuple[float]): The minimum and maximum ratio to scale\n                the ``img_scale``.\n\n        Returns:\n            (tuple, None): Returns a tuple ``(scale, None)``, where \\\n                ``scale`` is sampled ratio multiplied with ``img_scale`` and \\\n                None is just a placeholder to be consistent with \\\n                :func:`random_select`.\n        \"\"\"\n\n        assert isinstance(img_scale, list) and len(img_scale) == 2\n        min_ratio, max_ratio = ratio_range\n        assert min_ratio <= max_ratio\n        ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio\n        scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio)\n        return scale, None\n\n    def _random_scale(self, results):\n        \"\"\"Randomly sample an img_scale according to ``ratio_range`` and\n        ``multiscale_mode``.\n\n        If ``ratio_range`` is specified, a ratio will be sampled and be\n        multiplied with ``img_scale``.\n        If multiple scales are specified by ``img_scale``, a scale will be\n        sampled according to ``multiscale_mode``.\n        Otherwise, single scale will be used.\n\n        Args:\n            results (dict): Result dict from :obj:`dataset`.\n\n        Returns:\n            dict: Two new keys ``scale`` and ``scale_idx`` are added into \\\n                ``results``, which would be used by subsequent pipelines.\n        \"\"\"\n\n        if self.ratio_range is not None:\n            scale, scale_idx = self.random_sample_ratio(self.img_scale[0],\n                                                        self.ratio_range)\n        elif len(self.img_scale) == 1:\n            scale, scale_idx = self.img_scale[0], 0\n        elif self.multiscale_mode == 'range':\n            scale, scale_idx = self.random_sample(self.img_scale)\n        elif self.multiscale_mode == 'value':\n            scale, scale_idx = self.random_select(self.img_scale)\n        else:\n            raise NotImplementedError\n        results['scale'] = scale\n        results['scale_idx'] = scale_idx\n\n    def _resize_img(self, results):\n        \"\"\"Resize images with ``results['scale']``.\"\"\"\n        for key in ['image'] if 'image' in results else []:\n            if self.keep_ratio:\n                img, scale_factor = imrescale(\n                    results[key],\n                    results['scale'],\n                    return_scale=True,\n                    interpolation=self.interpolation,\n                    backend=self.backend)\n                # the w_scale and h_scale have a minor difference\n                # a real fix should be done in imrescale in the future\n                new_h, new_w = img.shape[:2]\n                h, w = results[key].shape[:2]\n                w_scale = new_w / w\n                h_scale = new_h / h\n            else:\n                img, w_scale, h_scale = imresize(\n                    results[key],\n                    results['scale'],\n                    return_scale=True,\n                    interpolation=self.interpolation,\n                    backend=self.backend)\n\n            scale_factor = np.array(\n                [w_scale, h_scale, w_scale, h_scale], dtype=np.float32)\n            results['im_shape'] = np.array(img.shape)\n            # in case 
that there is no padding\n            results['pad_shape'] = img.shape\n            results['scale_factor'] = scale_factor\n            results['keep_ratio'] = self.keep_ratio\n            # img_pad = self.impad(img, shape=results['scale'])\n            results[key] = img\n\n    def _resize_bboxes(self, results):\n        \"\"\"Resize bounding boxes with ``results['scale_factor']``.\"\"\"\n        for key in ['gt_bbox'] if 'gt_bbox' in results else []:\n            bboxes = results[key] * results['scale_factor']\n            if self.bbox_clip_border:\n                img_shape = results['im_shape']\n                bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])\n                bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])\n            results[key] = bboxes\n\n    def _resize_masks(self, results):\n        \"\"\"Resize masks with ``results['scale']``\"\"\"\n        for key in ['mask'] if 'mask' in results else []:\n            if results[key] is None:\n                continue\n            if self.keep_ratio:\n                results[key] = results[key].rescale(results['scale'])\n            else:\n                results[key] = results[key].resize(results['im_shape'][:2])\n\n    def _resize_seg(self, results):\n        \"\"\"Resize semantic segmentation map with ``results['scale']``.\"\"\"\n        for key in ['seg'] if 'seg' in results else []:\n            if self.keep_ratio:\n                gt_seg = imrescale(\n                    results[key],\n                    results['scale'],\n                    interpolation='nearest',\n                    backend=self.backend)\n            else:\n                gt_seg = imresize(\n                    results[key],\n                    results['scale'],\n                    interpolation='nearest',\n                    backend=self.backend)\n            results[key] = gt_seg\n\n    def _resize_keypoints(self, results):\n        \"\"\"Resize keypoints with ``results['scale_factor']``.\"\"\"\n        for key in ['gt_joints'] if 'gt_joints' in results else []:\n            keypoints = results[key].copy()\n            keypoints[..., 0] = keypoints[..., 0] * results['scale_factor'][0]\n            keypoints[..., 1] = keypoints[..., 1] * results['scale_factor'][1]\n            if self.keypoint_clip_border:\n                img_shape = results['im_shape']\n                keypoints[..., 0] = np.clip(keypoints[..., 0], 0, img_shape[1])\n                keypoints[..., 1] = np.clip(keypoints[..., 1], 0, img_shape[0])\n            results[key] = keypoints\n\n    def _resize_areas(self, results):\n        \"\"\"Resize mask areas with ``results['scale_factor']``.\"\"\"\n        for key in ['gt_areas'] if 'gt_areas' in results else []:\n            areas = results[key].copy()\n            areas = areas * results['scale_factor'][0] * results[\n                'scale_factor'][1]\n            results[key] = areas\n\n    def __call__(self, results):\n        \"\"\"Call function to resize images, bounding boxes, masks, semantic\n        segmentation map.\n\n        Args:\n            results (dict): Result dict from loading pipeline.\n\n        Returns:\n            dict: Resized results, 'im_shape', 'pad_shape', 'scale_factor', \\\n                'keep_ratio' keys are added into result dict.\n        \"\"\"\n        if 'scale' not in results:\n            if 'scale_factor' in results:\n                img_shape = results['image'].shape[:2]\n                scale_factor = results['scale_factor'][0]\n                # assert 
isinstance(scale_factor, float)\n                results['scale'] = [int(x * scale_factor)\n                                    for x in img_shape][::-1]\n            else:\n                self._random_scale(results)\n        else:\n            if not self.override:\n                assert 'scale_factor' not in results, (\n                    'scale and scale_factor cannot both be set.')\n            else:\n                results.pop('scale')\n                if 'scale_factor' in results:\n                    results.pop('scale_factor')\n                self._random_scale(results)\n\n        self._resize_img(results)\n        self._resize_bboxes(results)\n        self._resize_masks(results)\n        self._resize_seg(results)\n        self._resize_keypoints(results)\n        self._resize_areas(results)\n        return results\n\n    def __repr__(self):\n        repr_str = self.__class__.__name__\n        repr_str += f'(img_scale={self.img_scale}, '\n        repr_str += f'multiscale_mode={self.multiscale_mode}, '\n        repr_str += f'ratio_range={self.ratio_range}, '\n        repr_str += f'keep_ratio={self.keep_ratio}, '\n        repr_str += f'bbox_clip_border={self.bbox_clip_border}, '\n        repr_str += f'keypoint_clip_border={self.keypoint_clip_border})'\n        return repr_str\n"
  },
  {
    "path": "ppdet/data/transform/keypoints_3d_operators.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\n\ntry:\n    from collections.abc import Sequence\nexcept Exception:\n    from collections import Sequence\nimport cv2\nimport numpy as np\nimport math\nimport copy\nimport random\nimport uuid\nfrom numbers import Number, Integral\n\nfrom ...modeling.keypoint_utils import get_affine_mat_kernel, warp_affine_joints, get_affine_transform, affine_transform, get_warp_matrix\nfrom ppdet.core.workspace import serializable\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\nregistered_ops = []\n\n__all__ = [\n    'CropAndFlipImages', 'PermuteImages', 'RandomFlipHalfBody3DTransformImages'\n]\n\nimport matplotlib.pyplot as plt\nfrom PIL import Image, ImageDraw\nfrom mpl_toolkits.mplot3d import Axes3D\n\n\ndef register_keypointop(cls):\n    return serializable(cls)\n\n\ndef register_op(cls):\n    registered_ops.append(cls.__name__)\n    if not hasattr(BaseOperator, cls.__name__):\n        setattr(BaseOperator, cls.__name__, cls)\n    else:\n        raise KeyError(\"The {} class has been registered.\".format(cls.__name__))\n    return serializable(cls)\n\n\nclass BaseOperator(object):\n    def __init__(self, name=None):\n        if name is None:\n            name = self.__class__.__name__\n        self._id = name + '_' + str(uuid.uuid4())[-6:]\n\n    def apply(self, sample, context=None):\n        \"\"\" Process a sample.\n        Args:\n            sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx}\n            context (dict): info about this sample processing\n        Returns:\n            result (dict): a processed sample\n        \"\"\"\n        return sample\n\n    def __call__(self, sample, context=None):\n        \"\"\" Process a sample.\n        Args:\n            sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx}\n            context (dict): info about this sample processing\n        Returns:\n            result (dict): a processed sample\n        \"\"\"\n        if isinstance(sample, Sequence):  # for batch_size\n            for i in range(len(sample)):\n                sample[i] = self.apply(sample[i], context)\n        else:\n            # image.shape changed\n            sample = self.apply(sample, context)\n        return sample\n\n    def __str__(self):\n        return str(self._id)\n\n\n@register_keypointop\nclass CropAndFlipImages(object):\n    \"\"\"Crop all images\"\"\"\n\n    def __init__(self, crop_range, flip_pairs=None):\n        super(CropAndFlipImages, self).__init__()\n        self.crop_range = crop_range\n        self.flip_pairs = flip_pairs\n\n    def __call__(self, records):  # tuple\n        images = records[\"image\"]\n        images = images[:, :, ::-1, :]\n        images = images[:, :, self.crop_range[0]:self.crop_range[1]]\n        records[\"image\"] = images\n\n        if \"kps2d\" in records.keys():\n            kps2d = 
records[\"kps2d\"]\n\n            width, height = images.shape[2], images.shape[1]\n            kps2d = np.array(kps2d)\n            kps2d[:, :, 0] = kps2d[:, :, 0] - self.crop_range[0]\n\n            for pair in self.flip_pairs:\n                kps2d[:, pair[0], :], kps2d[:,pair[1], :] = \\\n                    kps2d[:,pair[1], :], kps2d[:,pair[0], :].copy()\n\n            records[\"kps2d\"] = kps2d\n\n        return records\n\n\n@register_op\nclass PermuteImages(BaseOperator):\n    def __init__(self):\n        \"\"\"\n        Change the channel to be (batch_size, C, H, W) #(6, 3, 1080, 1920)\n        \"\"\"\n        super(PermuteImages, self).__init__()\n\n    def apply(self, sample, context=None):\n        images = sample[\"image\"]\n        images = images.transpose((0, 3, 1, 2))\n\n        sample[\"image\"] = images\n\n        return sample\n\n\n@register_keypointop\nclass RandomFlipHalfBody3DTransformImages(object):\n    \"\"\"apply data augment to images and coords\n    to achieve the flip, scale, rotate and half body transform effect for training image\n    Args:\n        trainsize (list):[w, h], Image target size\n        upper_body_ids (list): The upper body joint ids\n        flip_pairs (list): The left-right joints exchange order list\n        pixel_std (int): The pixel std of the scale\n        scale (float): The scale factor to transform the image\n        rot (int): The rotate factor to transform the image\n        num_joints_half_body (int): The joints threshold of the half body transform\n        prob_half_body (float): The threshold of the half body transform\n        flip (bool): Whether to flip the image\n    Returns:\n        records(dict): contain the image and coords after tranformed\n    \"\"\"\n\n    def __init__(self,\n                 trainsize,\n                 upper_body_ids,\n                 flip_pairs,\n                 pixel_std,\n                 scale=0.35,\n                 rot=40,\n                 num_joints_half_body=8,\n                 prob_half_body=0.3,\n                 flip=True,\n                 rot_prob=0.6,\n                 do_occlusion=False):\n        super(RandomFlipHalfBody3DTransformImages, self).__init__()\n        self.trainsize = trainsize\n        self.upper_body_ids = upper_body_ids\n        self.flip_pairs = flip_pairs\n        self.pixel_std = pixel_std\n        self.scale = scale\n        self.rot = rot\n        self.num_joints_half_body = num_joints_half_body\n        self.prob_half_body = prob_half_body\n        self.flip = flip\n        self.aspect_ratio = trainsize[0] * 1.0 / trainsize[1]\n        self.rot_prob = rot_prob\n        self.do_occlusion = do_occlusion\n\n    def halfbody_transform(self, joints, joints_vis):\n        upper_joints = []\n        lower_joints = []\n        for joint_id in range(joints.shape[0]):\n            if joints_vis[joint_id][0] > 0:\n                if joint_id in self.upper_body_ids:\n                    upper_joints.append(joints[joint_id])\n                else:\n                    lower_joints.append(joints[joint_id])\n        if np.random.randn() < 0.5 and len(upper_joints) > 2:\n            selected_joints = upper_joints\n        else:\n            selected_joints = lower_joints if len(\n                lower_joints) > 2 else upper_joints\n        if len(selected_joints) < 2:\n            return None, None\n        selected_joints = np.array(selected_joints, dtype=np.float32)\n        center = selected_joints.mean(axis=0)[:2]\n        left_top = np.amin(selected_joints, axis=0)\n     
   right_bottom = np.amax(selected_joints, axis=0)\n        w = right_bottom[0] - left_top[0]\n        h = right_bottom[1] - left_top[1]\n        if w > self.aspect_ratio * h:\n            h = w * 1.0 / self.aspect_ratio\n        elif w < self.aspect_ratio * h:\n            w = h * self.aspect_ratio\n        scale = np.array(\n            [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],\n            dtype=np.float32)\n        scale = scale * 1.5\n\n        return center, scale\n\n    def flip_joints(self, joints, joints_vis, width, matched_parts, kps2d=None):\n        # joints: (6, 24, 3), i.e. (num_frames, num_joints, 3)\n\n        joints[:, :, 0] = width - joints[:, :, 0] - 1  # x\n        if kps2d is not None:\n            kps2d[:, :, 0] = width - kps2d[:, :, 0] - 1\n\n        for pair in matched_parts:\n            joints[:, pair[0], :], joints[:,pair[1], :] = \\\n                joints[:,pair[1], :], joints[:,pair[0], :].copy()\n\n            joints_vis[:,pair[0], :], joints_vis[:,pair[1], :] = \\\n                joints_vis[:,pair[1], :], joints_vis[:,pair[0], :].copy()\n\n            if kps2d is not None:\n                kps2d[:, pair[0], :], kps2d[:,pair[1], :] = \\\n                    kps2d[:,pair[1], :], kps2d[:,pair[0], :].copy()\n\n        # move to zero\n        joints -= joints[:, [0], :]  # (num_frames, 24, 3), numpy.ndarray\n\n        return joints, joints_vis, kps2d\n\n    def __call__(self, records):\n        images = records[\n            'image']  # images.shape: (num_frames, height, width, 3)\n\n        joints = records['kps3d']\n        joints_vis = records['kps3d_vis']\n\n        kps2d = None\n        if 'kps2d' in records.keys():\n            kps2d = records['kps2d']\n\n        if self.flip and np.random.random() <= 0.5:\n            images = images[:, :, ::-1, :]  # flip the images horizontally, e.g. (6, 1080, 810, 3)\n            joints, joints_vis, kps2d = self.flip_joints(\n                joints, joints_vis, images.shape[2], self.flip_pairs,\n                kps2d)  # mirror the keypoints left/right\n        occlusion = False\n        if self.do_occlusion and random.random() <= 0.5:  # random occlusion\n            height = images[0].shape[0]\n            width = images[0].shape[1]\n            occlusion = True\n            while True:\n                area_min = 0.0\n                area_max = 0.2\n                synth_area = (random.random() *\n                              (area_max - area_min) + area_min) * width * height\n\n                ratio_min = 0.3\n                ratio_max = 1 / 0.3\n                synth_ratio = (random.random() *\n                               (ratio_max - ratio_min) + ratio_min)\n\n                synth_h = math.sqrt(synth_area * synth_ratio)\n                synth_w = math.sqrt(synth_area / synth_ratio)\n                synth_xmin = random.random() * (width - synth_w - 1)\n                synth_ymin = random.random() * (height - synth_h - 1)\n\n                if synth_xmin >= 0 and synth_ymin >= 0 and synth_xmin + synth_w < width and synth_ymin + synth_h < height:\n                    xmin = int(synth_xmin)\n                    ymin = int(synth_ymin)\n                    w = int(synth_w)\n                    h = int(synth_h)\n\n                    mask = np.random.rand(h, w, 3) * 255\n                    images[:, ymin:ymin + h, xmin:xmin + w, :] = mask[\n                        None, :, :, :]\n                    break\n\n        records['image'] = images\n        records['kps3d'] = joints\n        records['kps3d_vis'] = joints_vis\n        if kps2d is not 
None:\n            records['kps2d'] = kps2d\n\n        return records\n"
  },
  {
    "path": "ppdet/data/transform/mot_operators.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\ntry:\n    from collections.abc import Sequence\nexcept Exception:\n    from collections import Sequence\nfrom numbers import Integral\n\nimport cv2\nimport copy\nimport numpy as np\nimport random\nimport math\n\nfrom .operators import BaseOperator, register_op\nfrom .batch_operators import Gt2TTFTarget\nfrom ppdet.modeling.bbox_utils import bbox_iou_np_expand\nfrom ppdet.utils.logger import setup_logger\nfrom .op_helper import gaussian_radius\nlogger = setup_logger(__name__)\n\n__all__ = [\n    'RGBReverse', 'LetterBoxResize', 'MOTRandomAffine', 'Gt2JDETargetThres',\n    'Gt2JDETargetMax', 'Gt2FairMOTTarget'\n]\n\n\n@register_op\nclass RGBReverse(BaseOperator):\n    \"\"\"RGB to BGR, or BGR to RGB, sensitive to MOTRandomAffine\n    \"\"\"\n\n    def __init__(self):\n        super(RGBReverse, self).__init__()\n\n    def apply(self, sample, context=None):\n        im = sample['image']\n        sample['image'] = np.ascontiguousarray(im[:, :, ::-1])\n        return sample\n\n\n@register_op\nclass LetterBoxResize(BaseOperator):\n    def __init__(self, target_size):\n        \"\"\"\n        Resize image to target size, convert normalized xywh to pixel xyxy\n        format ([x_center, y_center, width, height] -> [x0, y0, x1, y1]).\n        Args:\n            target_size (int|list): image target size.\n        \"\"\"\n        super(LetterBoxResize, self).__init__()\n        if not isinstance(target_size, (Integral, Sequence)):\n            raise TypeError(\n                \"Type of target_size is invalid. 
Must be Integer or List or Tuple, now is {}\".\n                format(type(target_size)))\n        if isinstance(target_size, Integral):\n            target_size = [target_size, target_size]\n        self.target_size = target_size\n\n    def apply_image(self, img, height, width, color=(127.5, 127.5, 127.5)):\n        # letterbox: resize a rectangular image to a padded rectangular\n        shape = img.shape[:2]  # [height, width]\n        ratio_h = float(height) / shape[0]\n        ratio_w = float(width) / shape[1]\n        ratio = min(ratio_h, ratio_w)\n        new_shape = (round(shape[1] * ratio),\n                     round(shape[0] * ratio))  # [width, height]\n        padw = (width - new_shape[0]) / 2\n        padh = (height - new_shape[1]) / 2\n        top, bottom = round(padh - 0.1), round(padh + 0.1)\n        left, right = round(padw - 0.1), round(padw + 0.1)\n\n        img = cv2.resize(\n            img, new_shape, interpolation=cv2.INTER_AREA)  # resized, no border\n        img = cv2.copyMakeBorder(\n            img, top, bottom, left, right, cv2.BORDER_CONSTANT,\n            value=color)  # padded rectangular\n        return img, ratio, padw, padh\n\n    def apply_bbox(self, bbox0, h, w, ratio, padw, padh):\n        bboxes = bbox0.copy()\n        bboxes[:, 0] = ratio * w * (bbox0[:, 0] - bbox0[:, 2] / 2) + padw\n        bboxes[:, 1] = ratio * h * (bbox0[:, 1] - bbox0[:, 3] / 2) + padh\n        bboxes[:, 2] = ratio * w * (bbox0[:, 0] + bbox0[:, 2] / 2) + padw\n        bboxes[:, 3] = ratio * h * (bbox0[:, 1] + bbox0[:, 3] / 2) + padh\n        return bboxes\n\n    def apply(self, sample, context=None):\n        \"\"\" Resize the numpy image with letterbox padding.\n        \"\"\"\n        im = sample['image']\n        h, w = sample['im_shape']\n        if not isinstance(im, np.ndarray):\n            raise TypeError(\"{}: image type is not numpy.\".format(self))\n        if len(im.shape) != 3:\n            from PIL import UnidentifiedImageError\n            raise UnidentifiedImageError(\n                '{}: image is not 3-dimensional.'.format(self))\n\n        # apply image\n        height, width = self.target_size\n        img, ratio, padw, padh = self.apply_image(\n            im, height=height, width=width)\n\n        sample['image'] = img\n        new_shape = (round(h * ratio), round(w * ratio))\n        sample['im_shape'] = np.asarray(new_shape, dtype=np.float32)\n        sample['scale_factor'] = np.asarray([ratio, ratio], dtype=np.float32)\n\n        # apply bbox\n        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:\n            sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], h, w, ratio,\n                                                padw, padh)\n        return sample\n\n\n@register_op\nclass MOTRandomAffine(BaseOperator):\n    \"\"\" \n    Affine transform to image and coords to achieve the rotate, scale and\n    shift effect for training image.\n\n    Args:\n        degrees (list[2]): the rotate range to apply, transform range is [min, max]\n        translate (list[2]): the translate range to apply, transform range is [min, max]\n        scale (list[2]): the scale range to apply, transform range is [min, max]\n        shear (list[2]): the shear range to apply, transform range is [min, max]\n        borderValue (list[3]): value used in case of a constant border when applying\n            the perspective transformation\n        reject_outside (bool): reject warped bounding boxes outside of image\n\n    Returns:\n        records (dict): contain the image and coords after 
being transformed\n\n    \"\"\"\n\n    def __init__(self,\n                 degrees=(-5, 5),\n                 translate=(0.10, 0.10),\n                 scale=(0.50, 1.20),\n                 shear=(-2, 2),\n                 borderValue=(127.5, 127.5, 127.5),\n                 reject_outside=True):\n        super(MOTRandomAffine, self).__init__()\n        self.degrees = degrees\n        self.translate = translate\n        self.scale = scale\n        self.shear = shear\n        self.borderValue = borderValue\n        self.reject_outside = reject_outside\n\n    def apply(self, sample, context=None):\n        # https://medium.com/uruvideo/dataset-augmentation-with-random-homographies-a8f4b44830d4\n        border = 0  # width of added border (optional)\n\n        img = sample['image']\n        height, width = img.shape[0], img.shape[1]\n\n        # Rotation and Scale\n        R = np.eye(3)\n        a = random.random() * (self.degrees[1] - self.degrees[0]\n                               ) + self.degrees[0]\n        s = random.random() * (self.scale[1] - self.scale[0]) + self.scale[0]\n        R[:2] = cv2.getRotationMatrix2D(\n            angle=a, center=(width / 2, height / 2), scale=s)\n\n        # Translation\n        # (kept from the original JDE code: the x offset scales with height and\n        # the y offset with width)\n        T = np.eye(3)\n        T[0, 2] = (\n            random.random() * 2 - 1\n        ) * self.translate[0] * height + border  # x translation (pixels)\n        T[1, 2] = (\n            random.random() * 2 - 1\n        ) * self.translate[1] * width + border  # y translation (pixels)\n\n        # Shear\n        S = np.eye(3)\n        S[0, 1] = math.tan((random.random() *\n                            (self.shear[1] - self.shear[0]) + self.shear[0]) *\n                           math.pi / 180)  # x shear (deg)\n        S[1, 0] = math.tan((random.random() *\n                            (self.shear[1] - self.shear[0]) + self.shear[0]) *\n                           math.pi / 180)  # y shear (deg)\n\n        M = S @ T @ R  # Combined rotation matrix. 
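Applied right-to-left: rotation/scale first, then translation, then shear. 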
ORDER IS IMPORTANT HERE!!\n        imw = cv2.warpPerspective(\n            img,\n            M,\n            dsize=(width, height),\n            flags=cv2.INTER_LINEAR,\n            borderValue=self.borderValue)  # BGR order borderValue\n\n        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:\n            targets = sample['gt_bbox']\n            n = targets.shape[0]\n            points = targets.copy()\n            area0 = (points[:, 2] - points[:, 0]) * (\n                points[:, 3] - points[:, 1])\n\n            # warp points\n            xy = np.ones((n * 4, 3))\n            xy[:, :2] = points[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(\n                n * 4, 2)  # x1y1, x2y2, x1y2, x2y1\n            xy = (xy @ M.T)[:, :2].reshape(n, 8)\n\n            # create new boxes\n            x = xy[:, [0, 2, 4, 6]]\n            y = xy[:, [1, 3, 5, 7]]\n            xy = np.concatenate(\n                (x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T\n\n            # apply angle-based reduction\n            radians = a * math.pi / 180\n            reduction = max(abs(math.sin(radians)), abs(math.cos(radians)))**0.5\n            x = (xy[:, 2] + xy[:, 0]) / 2\n            y = (xy[:, 3] + xy[:, 1]) / 2\n            w = (xy[:, 2] - xy[:, 0]) * reduction\n            h = (xy[:, 3] - xy[:, 1]) * reduction\n            xy = np.concatenate(\n                (x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T\n\n            # reject warped points outside of image\n            if self.reject_outside:\n                np.clip(xy[:, 0], 0, width, out=xy[:, 0])\n                np.clip(xy[:, 2], 0, width, out=xy[:, 2])\n                np.clip(xy[:, 1], 0, height, out=xy[:, 1])\n                np.clip(xy[:, 3], 0, height, out=xy[:, 3])\n            w = xy[:, 2] - xy[:, 0]\n            h = xy[:, 3] - xy[:, 1]\n            area = w * h\n            ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16))\n            i = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10)\n\n            if sum(i) > 0:\n                sample['gt_bbox'] = xy[i].astype(sample['gt_bbox'].dtype)\n                sample['gt_class'] = sample['gt_class'][i]\n                if 'difficult' in sample:\n                    sample['difficult'] = sample['difficult'][i]\n                if 'gt_ide' in sample:\n                    sample['gt_ide'] = sample['gt_ide'][i]\n                if 'is_crowd' in sample:\n                    sample['is_crowd'] = sample['is_crowd'][i]\n                sample['image'] = imw\n                return sample\n            else:\n                return sample\n        # no gt boxes: return the sample unchanged instead of implicitly\n        # returning None, which would break downstream transforms\n        return sample\n\n\n@register_op\nclass Gt2JDETargetThres(BaseOperator):\n    __shared__ = ['num_classes']\n    \"\"\"\n    Generate JDE targets by ground truth data when training\n    Args:\n        anchors (list): anchors of JDE model\n        anchor_masks (list): anchor_masks of JDE model\n        downsample_ratios (list): downsample ratios of JDE model\n        ide_thresh (float): thresh of identity, higher is ground truth \n        fg_thresh (float): thresh of foreground, higher is foreground\n        bg_thresh (float): thresh of background, lower is background\n        num_classes (int): number of classes\n    \"\"\"\n\n    def __init__(self,\n                 anchors,\n                 anchor_masks,\n                 downsample_ratios,\n                 ide_thresh=0.5,\n                 fg_thresh=0.5,\n                 bg_thresh=0.4,\n                 num_classes=1):\n        super(Gt2JDETargetThres, self).__init__()\n        
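# anchor layout and IoU thresholds used for JDE target assignment\n        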
self.anchors = anchors\n        self.anchor_masks = anchor_masks\n        self.downsample_ratios = downsample_ratios\n        self.ide_thresh = ide_thresh\n        self.fg_thresh = fg_thresh\n        self.bg_thresh = bg_thresh\n        self.num_classes = num_classes\n\n    def generate_anchor(self, nGh, nGw, anchor_hw):\n        nA = len(anchor_hw)\n        yy, xx = np.meshgrid(np.arange(nGh), np.arange(nGw))\n\n        mesh = np.stack([xx.T, yy.T], axis=0)  # [2, nGh, nGw]\n        mesh = np.repeat(mesh[None, :], nA, axis=0)  # [nA, 2, nGh, nGw]\n\n        anchor_offset_mesh = anchor_hw[:, :, None][:, :, :, None]\n        anchor_offset_mesh = np.repeat(anchor_offset_mesh, nGh, axis=-2)\n        anchor_offset_mesh = np.repeat(anchor_offset_mesh, nGw, axis=-1)\n\n        anchor_mesh = np.concatenate(\n            [mesh, anchor_offset_mesh], axis=1)  # [nA, 4, nGh, nGw]\n        return anchor_mesh\n\n    def encode_delta(self, gt_box_list, fg_anchor_list):\n        px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \\\n                        fg_anchor_list[:, 2], fg_anchor_list[:,3]\n        gx, gy, gw, gh = gt_box_list[:, 0], gt_box_list[:, 1], \\\n                        gt_box_list[:, 2], gt_box_list[:, 3]\n        dx = (gx - px) / pw\n        dy = (gy - py) / ph\n        dw = np.log(gw / pw)\n        dh = np.log(gh / ph)\n        return np.stack([dx, dy, dw, dh], axis=1)\n\n    def pad_box(self, sample, num_max):\n        assert 'gt_bbox' in sample\n        bbox = sample['gt_bbox']\n        gt_num = len(bbox)\n        pad_bbox = np.zeros((num_max, 4), dtype=np.float32)\n        if gt_num > 0:\n            pad_bbox[:gt_num, :] = bbox[:gt_num, :]\n        sample['gt_bbox'] = pad_bbox\n        if 'gt_score' in sample:\n            pad_score = np.zeros((num_max, ), dtype=np.float32)\n            if gt_num > 0:\n                pad_score[:gt_num] = sample['gt_score'][:gt_num, 0]\n            sample['gt_score'] = pad_score\n        if 'difficult' in sample:\n            pad_diff = np.zeros((num_max, ), dtype=np.int32)\n            if gt_num > 0:\n                pad_diff[:gt_num] = sample['difficult'][:gt_num, 0]\n            sample['difficult'] = pad_diff\n        if 'is_crowd' in sample:\n            pad_crowd = np.zeros((num_max, ), dtype=np.int32)\n            if gt_num > 0:\n                pad_crowd[:gt_num] = sample['is_crowd'][:gt_num, 0]\n            sample['is_crowd'] = pad_crowd\n        if 'gt_ide' in sample:\n            pad_ide = np.zeros((num_max, ), dtype=np.int32)\n            if gt_num > 0:\n                pad_ide[:gt_num] = sample['gt_ide'][:gt_num, 0]\n            sample['gt_ide'] = pad_ide\n        return sample\n\n    def __call__(self, samples, context=None):\n        assert len(self.anchor_masks) == len(self.downsample_ratios), \\\n            \"'anchor_masks' and 'downsample_ratios' should have the same length.\"\n        h, w = samples[0]['image'].shape[1:3]\n\n        num_max = 0\n        for sample in samples:\n            num_max = max(num_max, len(sample['gt_bbox']))\n\n        for sample in samples:\n            gt_bbox = sample['gt_bbox']\n            gt_ide = sample['gt_ide']\n            for i, (anchor_hw, downsample_ratio\n                    ) in enumerate(zip(self.anchors, self.downsample_ratios)):\n                anchor_hw = np.array(\n                    anchor_hw, dtype=np.float32) / downsample_ratio\n                nA = len(anchor_hw)\n                nGh, nGw = int(h / downsample_ratio), int(w / downsample_ratio)\n                tbox = 
np.zeros((nA, nGh, nGw, 4), dtype=np.float32)\n                tconf = np.zeros((nA, nGh, nGw), dtype=np.float32)\n                tid = -np.ones((nA, nGh, nGw, 1), dtype=np.float32)\n\n                gxy, gwh = gt_bbox[:, 0:2].copy(), gt_bbox[:, 2:4].copy()\n                gxy[:, 0] = gxy[:, 0] * nGw\n                gxy[:, 1] = gxy[:, 1] * nGh\n                gwh[:, 0] = gwh[:, 0] * nGw\n                gwh[:, 1] = gwh[:, 1] * nGh\n                gxy[:, 0] = np.clip(gxy[:, 0], 0, nGw - 1)\n                gxy[:, 1] = np.clip(gxy[:, 1], 0, nGh - 1)\n                tboxes = np.concatenate([gxy, gwh], axis=1)\n\n                anchor_mesh = self.generate_anchor(nGh, nGw, anchor_hw)\n\n                anchor_list = np.transpose(anchor_mesh,\n                                           (0, 2, 3, 1)).reshape(-1, 4)\n                iou_pdist = bbox_iou_np_expand(\n                    anchor_list, tboxes, x1y1x2y2=False)\n\n                iou_max = np.max(iou_pdist, axis=1)\n                max_gt_index = np.argmax(iou_pdist, axis=1)\n\n                iou_map = iou_max.reshape(nA, nGh, nGw)\n                gt_index_map = max_gt_index.reshape(nA, nGh, nGw)\n\n                id_index = iou_map > self.ide_thresh\n                fg_index = iou_map > self.fg_thresh\n                bg_index = iou_map < self.bg_thresh\n                ign_index = (iou_map < self.fg_thresh) * (\n                    iou_map > self.bg_thresh)\n                tconf[fg_index] = 1\n                tconf[bg_index] = 0\n                tconf[ign_index] = -1\n\n                gt_index = gt_index_map[fg_index]\n                gt_box_list = tboxes[gt_index]\n                gt_id_list = gt_ide[gt_index_map[id_index]]\n\n                if np.sum(fg_index) > 0:\n                    tid[id_index] = gt_id_list\n\n                    fg_anchor_list = anchor_list.reshape(nA, nGh, nGw,\n                                                         4)[fg_index]\n                    delta_target = self.encode_delta(gt_box_list,\n                                                     fg_anchor_list)\n                    tbox[fg_index] = delta_target\n\n                sample['tbox{}'.format(i)] = tbox\n                sample['tconf{}'.format(i)] = tconf\n                sample['tide{}'.format(i)] = tid\n            sample.pop('gt_class')\n            sample = self.pad_box(sample, num_max)\n        return samples\n\n\n@register_op\nclass Gt2JDETargetMax(BaseOperator):\n    __shared__ = ['num_classes']\n    \"\"\"\n    Generate JDE targets by ground truth data when evaluating\n    Args:\n        anchors (list): anchors of JDE model\n        anchor_masks (list): anchor_masks of JDE model\n        downsample_ratios (list): downsample ratios of JDE model\n        max_iou_thresh (float): iou thresh for high quality anchor\n        num_classes (int): number of classes\n    \"\"\"\n\n    def __init__(self,\n                 anchors,\n                 anchor_masks,\n                 downsample_ratios,\n                 max_iou_thresh=0.60,\n                 num_classes=1):\n        super(Gt2JDETargetMax, self).__init__()\n        self.anchors = anchors\n        self.anchor_masks = anchor_masks\n        self.downsample_ratios = downsample_ratios\n        self.max_iou_thresh = max_iou_thresh\n        self.num_classes = num_classes\n\n    def __call__(self, samples, context=None):\n        assert len(self.anchor_masks) == len(self.downsample_ratios), \\\n            \"'anchor_masks' and 'downsample_ratios' should have the same length.\"\n        
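# Assignment sketch: 'image' is assumed CHW at this point, so shape[1:3]\n        # is (h, w). Each gt box keeps only its best (grid cell, anchor) pair by\n        # wh-only IoU; collisions on the same pair go to the higher-IoU box, and\n        # a pair becomes a positive target only if its IoU > max_iou_thresh.\n        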
h, w = samples[0]['image'].shape[1:3]\n        for sample in samples:\n            gt_bbox = sample['gt_bbox']\n            gt_ide = sample['gt_ide']\n            for i, (anchor_hw, downsample_ratio\n                    ) in enumerate(zip(self.anchors, self.downsample_ratios)):\n                anchor_hw = np.array(\n                    anchor_hw, dtype=np.float32) / downsample_ratio\n                nA = len(anchor_hw)\n                nGh, nGw = int(h / downsample_ratio), int(w / downsample_ratio)\n                tbox = np.zeros((nA, nGh, nGw, 4), dtype=np.float32)\n                tconf = np.zeros((nA, nGh, nGw), dtype=np.float32)\n                tid = -np.ones((nA, nGh, nGw, 1), dtype=np.float32)\n\n                gxy, gwh = gt_bbox[:, 0:2].copy(), gt_bbox[:, 2:4].copy()\n                gxy[:, 0] = gxy[:, 0] * nGw\n                gxy[:, 1] = gxy[:, 1] * nGh\n                gwh[:, 0] = gwh[:, 0] * nGw\n                gwh[:, 1] = gwh[:, 1] * nGh\n                gi = np.clip(gxy[:, 0], 0, nGw - 1).astype(int)\n                gj = np.clip(gxy[:, 1], 0, nGh - 1).astype(int)\n\n                # iou of targets-anchors (using wh only)\n                box1 = gwh\n                box2 = anchor_hw[:, None, :]\n                inter_area = np.minimum(box1, box2).prod(2)\n                iou = inter_area / (\n                    box1.prod(1) + box2.prod(2) - inter_area + 1e-16)\n\n                # Select best iou_pred and anchor\n                iou_best = iou.max(0)  # best anchor [0-2] for each target\n                a = np.argmax(iou, axis=0)\n\n                # Select best unique target-anchor combinations\n                iou_order = np.argsort(-iou_best)  # best to worst\n\n                # Unique anchor selection\n                u = np.stack((gi, gj, a), 0)[:, iou_order]\n                _, first_unique = np.unique(u, axis=1, return_index=True)\n                mask = iou_order[first_unique]\n                # best anchor must share significant commonality (iou) with target\n                # TODO: examine arbitrary threshold\n                idx = mask[iou_best[mask] > self.max_iou_thresh]\n\n                if len(idx) > 0:\n                    a_i, gj_i, gi_i = a[idx], gj[idx], gi[idx]\n                    t_box = gt_bbox[idx]\n                    t_id = gt_ide[idx]\n                    if len(t_box.shape) == 1:\n                        t_box = t_box.reshape(1, 4)\n\n                    gxy, gwh = t_box[:, 0:2].copy(), t_box[:, 2:4].copy()\n                    gxy[:, 0] = gxy[:, 0] * nGw\n                    gxy[:, 1] = gxy[:, 1] * nGh\n                    gwh[:, 0] = gwh[:, 0] * nGw\n                    gwh[:, 1] = gwh[:, 1] * nGh\n\n                    # XY coordinates\n                    tbox[:, :, :, 0:2][a_i, gj_i, gi_i] = gxy - gxy.astype(int)\n                    # Width and height in yolo method\n                    tbox[:, :, :, 2:4][a_i, gj_i, gi_i] = np.log(gwh /\n                                                                 anchor_hw[a_i])\n                    tconf[a_i, gj_i, gi_i] = 1\n                    tid[a_i, gj_i, gi_i] = t_id\n\n                sample['tbox{}'.format(i)] = tbox\n                sample['tconf{}'.format(i)] = tconf\n                sample['tide{}'.format(i)] = tid\n        return samples\n\n\n@register_op\nclass Gt2FairMOTTarget(Gt2TTFTarget):\n    __shared__ = ['num_classes']\n    \"\"\"\n    Generate FairMOT targets by ground truth data.\n    Differences between Gt2FairMOTTarget and Gt2TTFTarget are:\n        1. 
the gaussian kernel radius to generate a heatmap.\n        2. the targets needed during training.\n    \n    Args:\n        num_classes(int): the number of classes.\n        down_ratio(int): the down ratio from images to heatmap, 4 by default.\n        max_objs(int): the maximum number of ground truth objects in an image, 500 by default.\n    \"\"\"\n\n    def __init__(self, num_classes=1, down_ratio=4, max_objs=500):\n        super(Gt2TTFTarget, self).__init__()\n        self.down_ratio = down_ratio\n        self.num_classes = num_classes\n        self.max_objs = max_objs\n\n    def __call__(self, samples, context=None):\n        for b_id, sample in enumerate(samples):\n            output_h = sample['image'].shape[1] // self.down_ratio\n            output_w = sample['image'].shape[2] // self.down_ratio\n\n            heatmap = np.zeros(\n                (self.num_classes, output_h, output_w), dtype='float32')\n            bbox_size = np.zeros((self.max_objs, 4), dtype=np.float32)\n            center_offset = np.zeros((self.max_objs, 2), dtype=np.float32)\n            index = np.zeros((self.max_objs, ), dtype=np.int64)\n            index_mask = np.zeros((self.max_objs, ), dtype=np.int32)\n            reid = np.zeros((self.max_objs, ), dtype=np.int64)\n            bbox_xys = np.zeros((self.max_objs, 4), dtype=np.float32)\n            if self.num_classes > 1:\n                # each category corresponds to a set of track ids\n                cls_tr_ids = np.zeros(\n                    (self.num_classes, output_h, output_w), dtype=np.int64)\n                cls_id_map = np.full((output_h, output_w), -1, dtype=np.int64)\n\n            gt_bbox = sample['gt_bbox']\n            gt_class = sample['gt_class']\n            gt_ide = sample['gt_ide']\n\n            for k in range(len(gt_bbox)):\n                cls_id = gt_class[k][0]\n                bbox = gt_bbox[k]\n                ide = gt_ide[k][0]\n                bbox[[0, 2]] = bbox[[0, 2]] * output_w\n                bbox[[1, 3]] = bbox[[1, 3]] * output_h\n                bbox_amodal = copy.deepcopy(bbox)\n                bbox_amodal[0] = bbox_amodal[0] - bbox_amodal[2] / 2.\n                bbox_amodal[1] = bbox_amodal[1] - bbox_amodal[3] / 2.\n                bbox_amodal[2] = bbox_amodal[0] + bbox_amodal[2]\n                bbox_amodal[3] = bbox_amodal[1] + bbox_amodal[3]\n                bbox[0] = np.clip(bbox[0], 0, output_w - 1)\n                bbox[1] = np.clip(bbox[1], 0, output_h - 1)\n                h = bbox[3]\n                w = bbox[2]\n\n                bbox_xy = copy.deepcopy(bbox)\n                bbox_xy[0] = bbox_xy[0] - bbox_xy[2] / 2\n                bbox_xy[1] = bbox_xy[1] - bbox_xy[3] / 2\n                bbox_xy[2] = bbox_xy[0] + bbox_xy[2]\n                bbox_xy[3] = bbox_xy[1] + bbox_xy[3]\n\n                if h > 0 and w > 0:\n                    radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)\n                    radius = max(0, int(radius))\n                    ct = np.array([bbox[0], bbox[1]], dtype=np.float32)\n                    ct_int = ct.astype(np.int32)\n                    self.draw_truncate_gaussian(heatmap[cls_id], ct_int, radius,\n                                                radius)\n                    bbox_size[k] = ct[0] - bbox_amodal[0], ct[1] - bbox_amodal[1], \\\n                            bbox_amodal[2] - ct[0], bbox_amodal[3] - ct[1]\n\n                    index[k] = ct_int[1] * output_w + ct_int[0]\n                    center_offset[k] = ct - ct_int\n                    
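# The two assignments above flatten the integer center onto the\n                    # (output_h, output_w) grid and keep the sub-pixel remainder\n                    # lost by the int cast; index_mask marks slot k as valid.\n                    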
index_mask[k] = 1\n                    reid[k] = ide\n                    bbox_xys[k] = bbox_xy\n                    if self.num_classes > 1:\n                        cls_id_map[ct_int[1], ct_int[0]] = cls_id\n                        cls_tr_ids[cls_id][ct_int[1]][ct_int[0]] = ide - 1\n                        # track ids start from 0\n\n            sample['heatmap'] = heatmap\n            sample['index'] = index\n            sample['offset'] = center_offset\n            sample['size'] = bbox_size\n            sample['index_mask'] = index_mask\n            sample['reid'] = reid\n            if self.num_classes > 1:\n                sample['cls_id_map'] = cls_id_map\n                sample['cls_tr_ids'] = cls_tr_ids\n            sample['bbox_xys'] = bbox_xys\n            sample.pop('is_crowd', None)\n            sample.pop('difficult', None)\n            sample.pop('gt_class', None)\n            sample.pop('gt_bbox', None)\n            sample.pop('gt_score', None)\n            sample.pop('gt_ide', None)\n        return samples\n"
  },
  {
    "path": "ppdet/data/transform/op_helper.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# this file contains helper methods for BBOX processing\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport numpy as np\nimport random\nimport math\nimport cv2\n\n\ndef meet_emit_constraint(src_bbox, sample_bbox):\n    center_x = (src_bbox[2] + src_bbox[0]) / 2\n    center_y = (src_bbox[3] + src_bbox[1]) / 2\n    if center_x >= sample_bbox[0] and \\\n            center_x <= sample_bbox[2] and \\\n            center_y >= sample_bbox[1] and \\\n            center_y <= sample_bbox[3]:\n        return True\n    return False\n\n\ndef clip_bbox(src_bbox):\n    src_bbox[0] = max(min(src_bbox[0], 1.0), 0.0)\n    src_bbox[1] = max(min(src_bbox[1], 1.0), 0.0)\n    src_bbox[2] = max(min(src_bbox[2], 1.0), 0.0)\n    src_bbox[3] = max(min(src_bbox[3], 1.0), 0.0)\n    return src_bbox\n\n\ndef bbox_area(src_bbox):\n    if src_bbox[2] < src_bbox[0] or src_bbox[3] < src_bbox[1]:\n        return 0.\n    else:\n        width = src_bbox[2] - src_bbox[0]\n        height = src_bbox[3] - src_bbox[1]\n        return width * height\n\n\ndef is_overlap(object_bbox, sample_bbox):\n    if object_bbox[0] >= sample_bbox[2] or \\\n       object_bbox[2] <= sample_bbox[0] or \\\n       object_bbox[1] >= sample_bbox[3] or \\\n       object_bbox[3] <= sample_bbox[1]:\n        return False\n    else:\n        return True\n\n\ndef filter_and_process(sample_bbox, bboxes, labels, scores=None,\n                       keypoints=None):\n    new_bboxes = []\n    new_labels = []\n    new_scores = []\n    new_keypoints = []\n    new_kp_ignore = []\n    for i in range(len(bboxes)):\n        new_bbox = [0, 0, 0, 0]\n        obj_bbox = [bboxes[i][0], bboxes[i][1], bboxes[i][2], bboxes[i][3]]\n        if not meet_emit_constraint(obj_bbox, sample_bbox):\n            continue\n        if not is_overlap(obj_bbox, sample_bbox):\n            continue\n        sample_width = sample_bbox[2] - sample_bbox[0]\n        sample_height = sample_bbox[3] - sample_bbox[1]\n        new_bbox[0] = (obj_bbox[0] - sample_bbox[0]) / sample_width\n        new_bbox[1] = (obj_bbox[1] - sample_bbox[1]) / sample_height\n        new_bbox[2] = (obj_bbox[2] - sample_bbox[0]) / sample_width\n        new_bbox[3] = (obj_bbox[3] - sample_bbox[1]) / sample_height\n        new_bbox = clip_bbox(new_bbox)\n        if bbox_area(new_bbox) > 0:\n            new_bboxes.append(new_bbox)\n            new_labels.append([labels[i][0]])\n            if scores is not None:\n                new_scores.append([scores[i][0]])\n            if keypoints is not None:\n                sample_keypoint = keypoints[0][i]\n                for j in range(len(sample_keypoint)):\n                    kp_len = sample_height if j % 2 else sample_width\n                    sample_coord = sample_bbox[1] if j % 2 else sample_bbox[0]\n                    sample_keypoint[j] = (\n                        
sample_keypoint[j] - sample_coord) / kp_len\n                    sample_keypoint[j] = max(min(sample_keypoint[j], 1.0), 0.0)\n                new_keypoints.append(sample_keypoint)\n                new_kp_ignore.append(keypoints[1][i])\n\n    bboxes = np.array(new_bboxes)\n    labels = np.array(new_labels)\n    scores = np.array(new_scores)\n    if keypoints is not None:\n        keypoints = np.array(new_keypoints)\n        new_kp_ignore = np.array(new_kp_ignore)\n        return bboxes, labels, scores, (keypoints, new_kp_ignore)\n    return bboxes, labels, scores\n\n\ndef bbox_area_sampling(bboxes, labels, scores, target_size, min_size):\n    new_bboxes = []\n    new_labels = []\n    new_scores = []\n    for i, bbox in enumerate(bboxes):\n        w = float((bbox[2] - bbox[0]) * target_size)\n        h = float((bbox[3] - bbox[1]) * target_size)\n        if w * h < float(min_size * min_size):\n            continue\n        else:\n            new_bboxes.append(bbox)\n            new_labels.append(labels[i])\n            if scores is not None and scores.size != 0:\n                new_scores.append(scores[i])\n    bboxes = np.array(new_bboxes)\n    labels = np.array(new_labels)\n    scores = np.array(new_scores)\n    return bboxes, labels, scores\n\n\ndef generate_sample_bbox(sampler):\n    scale = np.random.uniform(sampler[2], sampler[3])\n    aspect_ratio = np.random.uniform(sampler[4], sampler[5])\n    aspect_ratio = max(aspect_ratio, (scale**2.0))\n    aspect_ratio = min(aspect_ratio, 1 / (scale**2.0))\n    bbox_width = scale * (aspect_ratio**0.5)\n    bbox_height = scale / (aspect_ratio**0.5)\n    xmin_bound = 1 - bbox_width\n    ymin_bound = 1 - bbox_height\n    xmin = np.random.uniform(0, xmin_bound)\n    ymin = np.random.uniform(0, ymin_bound)\n    xmax = xmin + bbox_width\n    ymax = ymin + bbox_height\n    sampled_bbox = [xmin, ymin, xmax, ymax]\n    return sampled_bbox\n\n\ndef generate_sample_bbox_square(sampler, image_width, image_height):\n    scale = np.random.uniform(sampler[2], sampler[3])\n    aspect_ratio = np.random.uniform(sampler[4], sampler[5])\n    aspect_ratio = max(aspect_ratio, (scale**2.0))\n    aspect_ratio = min(aspect_ratio, 1 / (scale**2.0))\n    bbox_width = scale * (aspect_ratio**0.5)\n    bbox_height = scale / (aspect_ratio**0.5)\n    if image_height < image_width:\n        bbox_width = bbox_height * image_height / image_width\n    else:\n        bbox_height = bbox_width * image_width / image_height\n    xmin_bound = 1 - bbox_width\n    ymin_bound = 1 - bbox_height\n    xmin = np.random.uniform(0, xmin_bound)\n    ymin = np.random.uniform(0, ymin_bound)\n    xmax = xmin + bbox_width\n    ymax = ymin + bbox_height\n    sampled_bbox = [xmin, ymin, xmax, ymax]\n    return sampled_bbox\n\n\ndef data_anchor_sampling(bbox_labels, image_width, image_height, scale_array,\n                         resize_width):\n    num_gt = len(bbox_labels)\n    # np.random.randint range: [low, high)\n    rand_idx = np.random.randint(0, num_gt) if num_gt != 0 else 0\n\n    if num_gt != 0:\n        norm_xmin = bbox_labels[rand_idx][0]\n        norm_ymin = bbox_labels[rand_idx][1]\n        norm_xmax = bbox_labels[rand_idx][2]\n        norm_ymax = bbox_labels[rand_idx][3]\n\n        xmin = norm_xmin * image_width\n        ymin = norm_ymin * image_height\n        wid = image_width * (norm_xmax - norm_xmin)\n        hei = image_height * (norm_ymax - norm_ymin)\n        range_size = 0\n\n        area = wid * hei\n        for scale_ind in range(0, len(scale_array) - 1):\n            if area 
> scale_array[scale_ind] ** 2 and area < \\\n                    scale_array[scale_ind + 1] ** 2:\n                range_size = scale_ind + 1\n                break\n\n        if area > scale_array[len(scale_array) - 2]**2:\n            range_size = len(scale_array) - 2\n\n        scale_choose = 0.0\n        if range_size == 0:\n            rand_idx_size = 0\n        else:\n            # np.random.randint range: [low, high)\n            rng_rand_size = np.random.randint(0, range_size + 1)\n            rand_idx_size = rng_rand_size % (range_size + 1)\n\n        if rand_idx_size == range_size:\n            min_resize_val = scale_array[rand_idx_size] / 2.0\n            max_resize_val = min(2.0 * scale_array[rand_idx_size],\n                                 2 * math.sqrt(wid * hei))\n            scale_choose = random.uniform(min_resize_val, max_resize_val)\n        else:\n            min_resize_val = scale_array[rand_idx_size] / 2.0\n            max_resize_val = 2.0 * scale_array[rand_idx_size]\n            scale_choose = random.uniform(min_resize_val, max_resize_val)\n\n        sample_bbox_size = wid * resize_width / scale_choose\n\n        w_off_orig = 0.0\n        h_off_orig = 0.0\n        if sample_bbox_size < max(image_height, image_width):\n            if wid <= sample_bbox_size:\n                w_off_orig = np.random.uniform(xmin + wid - sample_bbox_size,\n                                               xmin)\n            else:\n                w_off_orig = np.random.uniform(xmin,\n                                               xmin + wid - sample_bbox_size)\n\n            if hei <= sample_bbox_size:\n                h_off_orig = np.random.uniform(ymin + hei - sample_bbox_size,\n                                               ymin)\n            else:\n                h_off_orig = np.random.uniform(ymin,\n                                               ymin + hei - sample_bbox_size)\n\n        else:\n            w_off_orig = np.random.uniform(image_width - sample_bbox_size, 0.0)\n            h_off_orig = np.random.uniform(image_height - sample_bbox_size, 0.0)\n\n        w_off_orig = math.floor(w_off_orig)\n        h_off_orig = math.floor(h_off_orig)\n\n        # Figure out top left coordinates.\n        w_off = float(w_off_orig / image_width)\n        h_off = float(h_off_orig / image_height)\n\n        sampled_bbox = [\n            w_off, h_off, w_off + float(sample_bbox_size / image_width),\n            h_off + float(sample_bbox_size / image_height)\n        ]\n        return sampled_bbox\n    else:\n        return 0\n\n\ndef jaccard_overlap(sample_bbox, object_bbox):\n    if sample_bbox[0] >= object_bbox[2] or \\\n        sample_bbox[2] <= object_bbox[0] or \\\n        sample_bbox[1] >= object_bbox[3] or \\\n        sample_bbox[3] <= object_bbox[1]:\n        return 0\n    intersect_xmin = max(sample_bbox[0], object_bbox[0])\n    intersect_ymin = max(sample_bbox[1], object_bbox[1])\n    intersect_xmax = min(sample_bbox[2], object_bbox[2])\n    intersect_ymax = min(sample_bbox[3], object_bbox[3])\n    intersect_size = (intersect_xmax - intersect_xmin) * (\n        intersect_ymax - intersect_ymin)\n    sample_bbox_size = bbox_area(sample_bbox)\n    object_bbox_size = bbox_area(object_bbox)\n    overlap = intersect_size / (\n        sample_bbox_size + object_bbox_size - intersect_size)\n    return overlap\n\n\ndef intersect_bbox(bbox1, bbox2):\n    if bbox2[0] > bbox1[2] or bbox2[2] < bbox1[0] or \\\n        bbox2[1] > bbox1[3] or bbox2[3] < bbox1[1]:\n        intersection_box = [0.0, 0.0, 
0.0, 0.0]\n    else:\n        intersection_box = [\n            max(bbox1[0], bbox2[0]), max(bbox1[1], bbox2[1]),\n            min(bbox1[2], bbox2[2]), min(bbox1[3], bbox2[3])\n        ]\n    return intersection_box\n\n\ndef bbox_coverage(bbox1, bbox2):\n    inter_box = intersect_bbox(bbox1, bbox2)\n    intersect_size = bbox_area(inter_box)\n\n    if intersect_size > 0:\n        bbox1_size = bbox_area(bbox1)\n        return intersect_size / bbox1_size\n    else:\n        return 0.\n\n\ndef satisfy_sample_constraint(sampler,\n                              sample_bbox,\n                              gt_bboxes,\n                              satisfy_all=False):\n    if sampler[6] == 0 and sampler[7] == 0:\n        return True\n    satisfied = []\n    for i in range(len(gt_bboxes)):\n        object_bbox = [\n            gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3]\n        ]\n        overlap = jaccard_overlap(sample_bbox, object_bbox)\n        if sampler[6] != 0 and \\\n                overlap < sampler[6]:\n            satisfied.append(False)\n            continue\n        if sampler[7] != 0 and \\\n                overlap > sampler[7]:\n            satisfied.append(False)\n            continue\n        satisfied.append(True)\n        if not satisfy_all:\n            return True\n\n    if satisfy_all:\n        return np.all(satisfied)\n    else:\n        return False\n\n\ndef satisfy_sample_constraint_coverage(sampler, sample_bbox, gt_bboxes):\n    if sampler[6] == 0 and sampler[7] == 0:\n        has_jaccard_overlap = False\n    else:\n        has_jaccard_overlap = True\n    if sampler[8] == 0 and sampler[9] == 0:\n        has_object_coverage = False\n    else:\n        has_object_coverage = True\n\n    if not has_jaccard_overlap and not has_object_coverage:\n        return True\n    found = False\n    for i in range(len(gt_bboxes)):\n        object_bbox = [\n            gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3]\n        ]\n        if has_jaccard_overlap:\n            overlap = jaccard_overlap(sample_bbox, object_bbox)\n            if sampler[6] != 0 and \\\n                    overlap < sampler[6]:\n                continue\n            if sampler[7] != 0 and \\\n                    overlap > sampler[7]:\n                continue\n            found = True\n        if has_object_coverage:\n            object_coverage = bbox_coverage(object_bbox, sample_bbox)\n            if sampler[8] != 0 and \\\n                    object_coverage < sampler[8]:\n                continue\n            if sampler[9] != 0 and \\\n                    object_coverage > sampler[9]:\n                continue\n            found = True\n        if found:\n            return True\n    return found\n\n\ndef crop_image_sampling(img, sample_bbox, image_width, image_height,\n                        target_size):\n    # no clipping here\n    xmin = int(sample_bbox[0] * image_width)\n    xmax = int(sample_bbox[2] * image_width)\n    ymin = int(sample_bbox[1] * image_height)\n    ymax = int(sample_bbox[3] * image_height)\n\n    w_off = xmin\n    h_off = ymin\n    width = xmax - xmin\n    height = ymax - ymin\n    cross_xmin = max(0.0, float(w_off))\n    cross_ymin = max(0.0, float(h_off))\n    cross_xmax = min(float(w_off + width - 1.0), float(image_width))\n    cross_ymax = min(float(h_off + height - 1.0), float(image_height))\n    cross_width = cross_xmax - cross_xmin\n    cross_height = cross_ymax - cross_ymin\n\n    roi_xmin = 0 if w_off >= 0 else abs(w_off)\n    roi_ymin = 
0 if h_off >= 0 else abs(h_off)\n    roi_width = cross_width\n    roi_height = cross_height\n\n    roi_y1 = int(roi_ymin)\n    roi_y2 = int(roi_ymin + roi_height)\n    roi_x1 = int(roi_xmin)\n    roi_x2 = int(roi_xmin + roi_width)\n\n    cross_y1 = int(cross_ymin)\n    cross_y2 = int(cross_ymin + cross_height)\n    cross_x1 = int(cross_xmin)\n    cross_x2 = int(cross_xmin + cross_width)\n\n    sample_img = np.zeros((height, width, 3))\n    sample_img[roi_y1: roi_y2, roi_x1: roi_x2] = \\\n        img[cross_y1: cross_y2, cross_x1: cross_x2]\n\n    sample_img = cv2.resize(\n        sample_img, (target_size, target_size), interpolation=cv2.INTER_AREA)\n\n    return sample_img\n\n\ndef is_poly(segm):\n    assert isinstance(segm, (list, dict)), \\\n        \"Invalid segm type: {}\".format(type(segm))\n    return isinstance(segm, list)\n\n\ndef gaussian_radius(bbox_size, min_overlap):\n    height, width = bbox_size\n\n    a1 = 1\n    b1 = (height + width)\n    c1 = width * height * (1 - min_overlap) / (1 + min_overlap)\n    sq1 = np.sqrt(b1**2 - 4 * a1 * c1)\n    radius1 = (b1 + sq1) / (2 * a1)\n\n    a2 = 4\n    b2 = 2 * (height + width)\n    c2 = (1 - min_overlap) * width * height\n    sq2 = np.sqrt(b2**2 - 4 * a2 * c2)\n    radius2 = (b2 + sq2) / 2\n\n    a3 = 4 * min_overlap\n    b3 = -2 * min_overlap * (height + width)\n    c3 = (min_overlap - 1) * width * height\n    sq3 = np.sqrt(b3**2 - 4 * a3 * c3)\n    radius3 = (b3 + sq3) / 2\n    return min(radius1, radius2, radius3)\n\n\ndef draw_gaussian(heatmap, center, radius, k=1, delte=6):\n    diameter = 2 * radius + 1\n    sigma = diameter / delte\n    gaussian = gaussian2D((diameter, diameter), sigma_x=sigma, sigma_y=sigma)\n\n    x, y = center\n\n    height, width = heatmap.shape[0:2]\n\n    left, right = min(x, radius), min(width - x, radius + 1)\n    top, bottom = min(y, radius), min(height - y, radius + 1)\n\n    masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]\n    masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:\n                               radius + right]\n    np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap)\n\n\ndef gaussian2D(shape, sigma_x=1, sigma_y=1):\n    m, n = [(ss - 1.) / 2. 
for ss in shape]\n    y, x = np.ogrid[-m:m + 1, -n:n + 1]\n\n    h = np.exp(-(x * x / (2 * sigma_x * sigma_x) + y * y / (2 * sigma_y *\n                                                            sigma_y)))\n    h[h < np.finfo(h.dtype).eps * h.max()] = 0\n    return h\n\n\ndef draw_umich_gaussian(heatmap, center, radius, k=1):\n    \"\"\"\n    draw_umich_gaussian, refer to https://github.com/xingyizhou/CenterNet/blob/master/src/lib/utils/image.py#L126\n    \"\"\"\n    diameter = 2 * radius + 1\n    gaussian = gaussian2D(\n        (diameter, diameter), sigma_x=diameter / 6, sigma_y=diameter / 6)\n\n    x, y = int(center[0]), int(center[1])\n\n    height, width = heatmap.shape[0:2]\n\n    left, right = min(x, radius), min(width - x, radius + 1)\n    top, bottom = min(y, radius), min(height - y, radius + 1)\n\n    masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]\n    masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:\n                               radius + right]\n    if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0:\n        np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap)\n    return heatmap\n\n\ndef get_border(border, size):\n    i = 1\n    while size - border // i <= border // i:\n        i *= 2\n    return border // i\n"
  },
  {
    "path": "ppdet/data/transform/operators.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# function:\n#    operators to process sample,\n#    eg: decode/resize/crop image\n\nfrom __future__ import absolute_import\nfrom __future__ import print_function\nfrom __future__ import division\n\ntry:\n    from collections.abc import Sequence\nexcept Exception:\n    from collections import Sequence\n\nfrom numbers import Number, Integral\n\nimport uuid\nimport random\nimport math\nimport numpy as np\nimport os\nimport copy\nimport logging\nimport cv2\nfrom PIL import Image, ImageDraw, ImageEnhance\nfrom pycocotools import mask\nimport pickle\nimport threading\nMUTEX = threading.Lock()\n\nimport paddle\nfrom ppdet.core.workspace import serializable\nfrom ..reader import Compose\n\nfrom .op_helper import (satisfy_sample_constraint, filter_and_process,\n                        generate_sample_bbox, clip_bbox, data_anchor_sampling,\n                        satisfy_sample_constraint_coverage, crop_image_sampling,\n                        generate_sample_bbox_square, bbox_area_sampling,\n                        is_poly, get_border)\n\nfrom ppdet.utils.logger import setup_logger\nfrom ppdet.utils.compact import imagedraw_textsize_c\n\nfrom ppdet.modeling.keypoint_utils import get_affine_transform, affine_transform\nlogger = setup_logger(__name__)\n\nregistered_ops = []\n\n\ndef register_op(cls):\n    registered_ops.append(cls.__name__)\n    if not hasattr(BaseOperator, cls.__name__):\n        setattr(BaseOperator, cls.__name__, cls)\n    else:\n        raise KeyError(\"The {} class has been registered.\".format(cls.__name__))\n    return serializable(cls)\n\n\nclass BboxError(ValueError):\n    pass\n\n\nclass ImageError(ValueError):\n    pass\n\n\nclass BaseOperator(object):\n    def __init__(self, name=None):\n        if name is None:\n            name = self.__class__.__name__\n        self._id = name + '_' + str(uuid.uuid4())[-6:]\n\n    def apply(self, sample, context=None):\n        \"\"\" Process a sample.\n        Args:\n            sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx}\n            context (dict): info about this sample processing\n        Returns:\n            result (dict): a processed sample\n        \"\"\"\n        return sample\n\n    def __call__(self, sample, context=None):\n        \"\"\" Process a sample.\n        Args:\n            sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx}\n            context (dict): info about this sample processing\n        Returns:\n            result (dict): a processed sample\n        \"\"\"\n        if isinstance(sample, Sequence):\n            for i in range(len(sample)):\n                sample[i] = self.apply(sample[i], context)\n        else:\n            sample = self.apply(sample, context)\n        return sample\n\n    def __str__(self):\n        return str(self._id)\n\n\n@register_op\nclass Decode(BaseOperator):\n    def __init__(self, rtn_im_file=False):\n        
\"\"\" Transform the image data to numpy format following the rgb format\n        \"\"\"\n        super(Decode, self).__init__()\n        self.rtn_im_file = rtn_im_file\n\n    def apply(self, sample, context=None):\n        \"\"\" load image if 'im_file' field is not empty but 'image' is\"\"\"\n        if 'image' not in sample:\n            with open(sample['im_file'], 'rb') as f:\n                sample['image'] = f.read()\n            if not self.rtn_im_file:\n                sample.pop('im_file')\n\n        try:\n            im = sample['image']\n            data = np.frombuffer(im, dtype='uint8')\n            im = cv2.imdecode(data, 1)  # BGR mode, but need RGB mode\n            if 'keep_ori_im' in sample and sample['keep_ori_im']:\n                sample['ori_image'] = im\n            im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)\n        except:\n            im = sample['image']\n\n        sample['image'] = im\n        if 'h' not in sample:\n            sample['h'] = im.shape[0]\n        elif sample['h'] != im.shape[0]:\n            logger.warning(\n                \"The actual image height: {} is not equal to the \"\n                \"height: {} in annotation, and update sample['h'] by actual \"\n                \"image height.\".format(im.shape[0], sample['h']))\n            sample['h'] = im.shape[0]\n        if 'w' not in sample:\n            sample['w'] = im.shape[1]\n        elif sample['w'] != im.shape[1]:\n            logger.warning(\n                \"The actual image width: {} is not equal to the \"\n                \"width: {} in annotation, and update sample['w'] by actual \"\n                \"image width.\".format(im.shape[1], sample['w']))\n            sample['w'] = im.shape[1]\n\n        sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32)\n        sample['scale_factor'] = np.array([1., 1.], dtype=np.float32)\n        return sample\n\n\ndef _make_dirs(dirname):\n    try:\n        from pathlib import Path\n    except ImportError:\n        from pathlib2 import Path\n    Path(dirname).mkdir(exist_ok=True)\n\n\n@register_op\nclass DecodeCache(BaseOperator):\n    def __init__(self, cache_root=None):\n        '''decode image and caching\n        '''\n        super(DecodeCache, self).__init__()\n\n        self.use_cache = False if cache_root is None else True\n        self.cache_root = cache_root\n\n        if cache_root is not None:\n            _make_dirs(cache_root)\n\n    def apply(self, sample, context=None):\n\n        if self.use_cache and os.path.exists(\n                self.cache_path(self.cache_root, sample['im_file'])):\n            path = self.cache_path(self.cache_root, sample['im_file'])\n            im = self.load(path)\n\n        else:\n            if 'image' not in sample:\n                with open(sample['im_file'], 'rb') as f:\n                    sample['image'] = f.read()\n\n            im = sample['image']\n            data = np.frombuffer(im, dtype='uint8')\n            im = cv2.imdecode(data, 1)  # BGR mode, but need RGB mode\n            if 'keep_ori_im' in sample and sample['keep_ori_im']:\n                sample['ori_image'] = im\n            im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)\n\n            if self.use_cache and not os.path.exists(\n                    self.cache_path(self.cache_root, sample['im_file'])):\n                path = self.cache_path(self.cache_root, sample['im_file'])\n                self.dump(im, path)\n\n        sample['image'] = im\n        sample['h'] = im.shape[0]\n        sample['w'] = im.shape[1]\n\n        
sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32)\n        sample['scale_factor'] = np.array([1., 1.], dtype=np.float32)\n\n        sample.pop('im_file')\n\n        return sample\n\n    @staticmethod\n    def cache_path(dir_root, im_file):\n        return os.path.join(dir_root, os.path.basename(im_file) + '.pkl')\n\n    @staticmethod\n    def load(path):\n        with open(path, 'rb') as f:\n            im = pickle.load(f)\n        return im\n\n    @staticmethod\n    def dump(obj, path):\n        MUTEX.acquire()\n        try:\n            with open(path, 'wb') as f:\n                pickle.dump(obj, f)\n\n        except Exception as e:\n            logger.warning('dump {} failed with exception {}'.format(path, str(e)))\n\n        finally:\n            MUTEX.release()\n\n\n@register_op\nclass SniperDecodeCrop(BaseOperator):\n    def __init__(self):\n        super(SniperDecodeCrop, self).__init__()\n\n    def __call__(self, sample, context=None):\n        if 'image' not in sample:\n            with open(sample['im_file'], 'rb') as f:\n                sample['image'] = f.read()\n            sample.pop('im_file')\n\n        im = sample['image']\n        data = np.frombuffer(im, dtype='uint8')\n        im = cv2.imdecode(data, cv2.IMREAD_COLOR)  # BGR mode, but need RGB mode\n        if 'keep_ori_im' in sample and sample['keep_ori_im']:\n            sample['ori_image'] = im\n        im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)\n\n        chip = sample['chip']\n        x1, y1, x2, y2 = [int(xi) for xi in chip]\n        im = im[max(y1, 0):min(y2, im.shape[0]), max(x1, 0):min(x2, im.shape[\n            1]), :]\n\n        sample['image'] = im\n        h = im.shape[0]\n        w = im.shape[1]\n        # sample['im_info'] = [h, w, 1.0]\n        sample['h'] = h\n        sample['w'] = w\n\n        sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32)\n        sample['scale_factor'] = np.array([1., 1.], dtype=np.float32)\n        return sample\n\n\n@register_op\nclass Permute(BaseOperator):\n    def __init__(self):\n        \"\"\"\n        Change the channel to be (C, H, W)\n        \"\"\"\n        super(Permute, self).__init__()\n\n    def apply(self, sample, context=None):\n        im = sample['image']\n        im = im.transpose((2, 0, 1))\n        sample['image'] = im\n\n        if 'pre_image' in sample:\n            pre_im = sample['pre_image']\n            pre_im = pre_im.transpose((2, 0, 1))\n            sample['pre_image'] = pre_im\n        return sample\n\n\n@register_op\nclass Lighting(BaseOperator):\n    \"\"\"\n    Lighting the image by eigenvalues and eigenvectors\n    Args:\n        eigval (list): eigenvalues\n        eigvec (list): eigenvectors\n        alphastd (float): random weight of lighting, 0.1 by default\n    \"\"\"\n\n    def __init__(self, eigval, eigvec, alphastd=0.1):\n        super(Lighting, self).__init__()\n        self.alphastd = alphastd\n        self.eigval = np.array(eigval).astype('float32')\n        self.eigvec = np.array(eigvec).astype('float32')\n\n    def apply(self, sample, context=None):\n        alpha = np.random.normal(scale=self.alphastd, size=(3, ))\n        sample['image'] += np.dot(self.eigvec, self.eigval * alpha)\n\n        if 'pre_image' in sample:\n            sample['pre_image'] += np.dot(self.eigvec, self.eigval * alpha)\n        return sample\n\n\n@register_op\nclass RandomErasingImage(BaseOperator):\n    def __init__(self, prob=0.5, lower=0.02, higher=0.4, aspect_ratio=0.3):\n        \"\"\"\n        Random Erasing Data Augmentation, see 
https://arxiv.org/abs/1708.04896\n        Args:\n            prob (float): probability to carry out random erasing\n            lower (float): lower limit of the erasing area ratio\n            higher (float): upper limit of the erasing area ratio\n            aspect_ratio (float): aspect ratio of the erasing region\n        \"\"\"\n        super(RandomErasingImage, self).__init__()\n        self.prob = prob\n        self.lower = lower\n        self.higher = higher\n        self.aspect_ratio = aspect_ratio\n\n    def apply(self, sample, context=None):\n        gt_bbox = sample['gt_bbox']\n        im = sample['image']\n        if not isinstance(im, np.ndarray):\n            raise TypeError(\"{}: image is not a numpy array.\".format(self))\n        if len(im.shape) != 3:\n            raise ImageError(\"{}: image is not 3-dimensional.\".format(self))\n\n        for idx in range(gt_bbox.shape[0]):\n            if self.prob <= np.random.rand():\n                continue\n\n            x1, y1, x2, y2 = gt_bbox[idx, :]\n            w_bbox = x2 - x1\n            h_bbox = y2 - y1\n            area = w_bbox * h_bbox\n\n            target_area = random.uniform(self.lower, self.higher) * area\n            aspect_ratio = random.uniform(self.aspect_ratio,\n                                          1 / self.aspect_ratio)\n\n            h = int(round(math.sqrt(target_area * aspect_ratio)))\n            w = int(round(math.sqrt(target_area / aspect_ratio)))\n\n            if w < w_bbox and h < h_bbox:\n                off_y1 = random.randint(0, int(h_bbox - h))\n                off_x1 = random.randint(0, int(w_bbox - w))\n                im[int(y1 + off_y1):int(y1 + off_y1 + h), int(x1 + off_x1):int(\n                    x1 + off_x1 + w), :] = 0\n        sample['image'] = im\n        return sample\n\n\n@register_op\nclass NormalizeImage(BaseOperator):\n    def __init__(self,\n                 mean=[0.485, 0.456, 0.406],\n                 std=[0.229, 0.224, 0.225],\n                 is_scale=True,\n                 norm_type='mean_std'):\n        \"\"\"\n        Args:\n            mean (list): the pixel mean\n            std (list): the pixel standard deviation\n            is_scale (bool): scale the pixel to [0,1]\n            norm_type (str): type in ['mean_std', 'none']\n        \"\"\"\n        super(NormalizeImage, self).__init__()\n        self.mean = mean\n        self.std = std\n        self.is_scale = is_scale\n        self.norm_type = norm_type\n        if not (isinstance(self.mean, list) and isinstance(self.std, list) and\n                isinstance(self.is_scale, bool) and\n                self.norm_type in ['mean_std', 'none']):\n            raise TypeError(\"{}: input type is invalid.\".format(self))\n        from functools import reduce\n        if reduce(lambda x, y: x * y, self.std) == 0:\n            raise ValueError('{}: std is invalid!'.format(self))\n\n    def apply(self, sample, context=None):\n        \"\"\"Normalize the image.\n        Operators:\n            1.(optional) Scale the pixel to [0,1]\n            2.(optional) Each pixel minus mean and is divided by std\n        \"\"\"\n        im = sample['image']\n\n        im = im.astype(np.float32, copy=False)\n        if self.is_scale:\n            scale = 1.0 / 255.0\n            im *= scale\n\n        if self.norm_type == 'mean_std':\n            mean = np.array(self.mean)[np.newaxis, np.newaxis, :]\n            std = np.array(self.std)[np.newaxis, np.newaxis, :]\n            im -= mean\n            im /= std\n\n        sample['image'] = 
im\n\n        if 'pre_image' in sample:\n            pre_im = sample['pre_image']\n            pre_im = pre_im.astype(np.float32, copy=False)\n            if self.is_scale:\n                scale = 1.0 / 255.0\n                pre_im *= scale\n\n            if self.norm_type == 'mean_std':\n                mean = np.array(self.mean)[np.newaxis, np.newaxis, :]\n                std = np.array(self.std)[np.newaxis, np.newaxis, :]\n                pre_im -= mean\n                pre_im /= std\n            sample['pre_image'] = pre_im\n\n        return sample\n\n\n@register_op\nclass GridMask(BaseOperator):\n    def __init__(self,\n                 use_h=True,\n                 use_w=True,\n                 rotate=1,\n                 offset=False,\n                 ratio=0.5,\n                 mode=1,\n                 prob=0.7,\n                 upper_iter=360000):\n        \"\"\"\n        GridMask Data Augmentation, see https://arxiv.org/abs/2001.04086\n        Args:\n            use_h (bool): whether to mask vertically\n            use_w (bool): whether to mask horizontally\n            rotate (float): angle for the mask to rotate\n            offset (bool): whether to mask with offset\n            ratio (float): mask ratio\n            mode (int): gridmask mode\n            prob (float): max probability to carry out gridmask\n            upper_iter (int): suggested to be equal to global max_iter\n        \"\"\"\n        super(GridMask, self).__init__()\n        self.use_h = use_h\n        self.use_w = use_w\n        self.rotate = rotate\n        self.offset = offset\n        self.ratio = ratio\n        self.mode = mode\n        self.prob = prob\n        self.upper_iter = upper_iter\n\n        from .gridmask_utils import Gridmask\n        self.gridmask_op = Gridmask(\n            use_h,\n            use_w,\n            rotate=rotate,\n            offset=offset,\n            ratio=ratio,\n            mode=mode,\n            prob=prob,\n            upper_iter=upper_iter)\n\n    def apply(self, sample, context=None):\n        sample['image'] = self.gridmask_op(sample['image'], sample['curr_iter'])\n        return sample\n\n\n@register_op\nclass RandomDistort(BaseOperator):\n    \"\"\"Random color distortion.\n    Args:\n        hue (list): hue settings. in [lower, upper, probability] format.\n        saturation (list): saturation settings. in [lower, upper, probability] format.\n        contrast (list): contrast settings. in [lower, upper, probability] format.\n        brightness (list): brightness settings. 
in [lower, upper, probability] format.\n        random_apply (bool): whether to apply in random (yolo) or fixed (SSD) order.\n        count (int): the number of distortion operations to apply.\n        random_channel (bool): whether to swap channels randomly.\n        prob (float): the probability of enhancing the sample.\n    \"\"\"\n\n    def __init__(self,\n                 hue=[-18, 18, 0.5],\n                 saturation=[0.5, 1.5, 0.5],\n                 contrast=[0.5, 1.5, 0.5],\n                 brightness=[0.5, 1.5, 0.5],\n                 random_apply=True,\n                 count=4,\n                 random_channel=False,\n                 prob=1.0):\n        super(RandomDistort, self).__init__()\n        self.hue = hue\n        self.saturation = saturation\n        self.contrast = contrast\n        self.brightness = brightness\n        self.random_apply = random_apply\n        self.count = count\n        self.random_channel = random_channel\n        self.prob = prob\n\n    def apply_hue(self, img):\n        low, high, prob = self.hue\n        if np.random.uniform(0., 1.) < prob:\n            return img\n        delta = np.random.uniform(low, high)\n        img = np.array(img.convert('HSV'))\n        img[:, :, 0] = img[:, :, 0] + delta\n        img = Image.fromarray(img, mode='HSV').convert('RGB')\n        return img\n\n    def apply_saturation(self, img):\n        low, high, prob = self.saturation\n        if np.random.uniform(0., 1.) < prob:\n            return img\n        delta = np.random.uniform(low, high)\n        img = ImageEnhance.Color(img).enhance(delta)\n        return img\n\n    def apply_contrast(self, img):\n        low, high, prob = self.contrast\n        if np.random.uniform(0., 1.) < prob:\n            return img\n        delta = np.random.uniform(low, high)\n        img = ImageEnhance.Contrast(img).enhance(delta)\n        return img\n\n    def apply_brightness(self, img):\n        low, high, prob = self.brightness\n        if np.random.uniform(0., 1.) < prob:\n            return img\n        delta = np.random.uniform(low, high)\n        img = ImageEnhance.Brightness(img).enhance(delta)\n        return img\n\n    def apply(self, sample, context=None):\n        if random.random() > self.prob:\n            return sample\n        img = sample['image']\n        img = Image.fromarray(img.astype(np.uint8))\n        if self.random_apply:\n            functions = [\n                self.apply_brightness, self.apply_contrast,\n                self.apply_saturation, self.apply_hue\n            ]\n            distortions = np.random.permutation(functions)[:self.count]\n            for func in distortions:\n                img = func(img)\n            img = np.asarray(img).astype(np.float32)\n            sample['image'] = img\n            return sample\n\n        img = self.apply_brightness(img)\n        mode = np.random.randint(0, 2)\n        if mode:\n            img = self.apply_contrast(img)\n        img = self.apply_saturation(img)\n        img = self.apply_hue(img)\n        if not mode:\n            img = self.apply_contrast(img)\n\n        img = np.asarray(img).astype(np.float32)\n        if self.random_channel:\n            if np.random.randint(0, 2):\n                img = img[..., np.random.permutation(3)]\n        sample['image'] = img\n        return sample\n\n\n@register_op\nclass PhotoMetricDistortion(BaseOperator):\n    \"\"\"Apply photometric distortion to image sequentially, every transformation\n    is applied with a probability of 0.5. 
The position of random contrast is in\n    second or second to last.\n\n    1. random brightness\n    2. random contrast (mode 0)\n    3. convert color from BGR to HSV\n    4. random saturation\n    5. random hue\n    6. convert color from HSV to BGR\n    7. random contrast (mode 1)\n    8. randomly swap channels\n\n    Args:\n        brightness_delta (int): delta of brightness.\n        contrast_range (tuple): range of contrast.\n        saturation_range (tuple): range of saturation.\n        hue_delta (int): delta of hue.\n    \"\"\"\n\n    def __init__(self,\n                 brightness_delta=32,\n                 contrast_range=(0.5, 1.5),\n                 saturation_range=(0.5, 1.5),\n                 hue_delta=18):\n        super(PhotoMetricDistortion, self).__init__()\n        self.brightness_delta = brightness_delta\n        self.contrast_lower, self.contrast_upper = contrast_range\n        self.saturation_lower, self.saturation_upper = saturation_range\n        self.hue_delta = hue_delta\n\n    def apply(self, results, context=None):\n        \"\"\"Call function to perform photometric distortion on images.\n\n        Args:\n            results (dict): Result dict from loading pipeline.\n\n        Returns:\n            dict: Result dict with images distorted.\n        \"\"\"\n\n        img = results['image']\n        img = img.astype(np.float32)\n        # random brightness\n        if np.random.randint(2):\n            delta = np.random.uniform(-self.brightness_delta,\n                                      self.brightness_delta)\n            img += delta\n\n        # mode == 0 --> do random contrast first\n        # mode == 1 --> do random contrast last\n        mode = np.random.randint(2)\n        if mode == 1:\n            if np.random.randint(2):\n                alpha = np.random.uniform(self.contrast_lower,\n                                          self.contrast_upper)\n                img *= alpha\n\n        # convert color from BGR to HSV\n        img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)\n\n        # random saturation\n        if np.random.randint(2):\n            img[..., 1] *= np.random.uniform(self.saturation_lower,\n                                             self.saturation_upper)\n\n        # random hue\n        if np.random.randint(2):\n            img[..., 0] += np.random.uniform(-self.hue_delta, self.hue_delta)\n            img[..., 0][img[..., 0] > 360] -= 360\n            img[..., 0][img[..., 0] < 0] += 360\n\n        # convert color from HSV to BGR\n        img = cv2.cvtColor(img, cv2.COLOR_HSV2BGR)\n\n        # random contrast\n        if mode == 0:\n            if np.random.randint(2):\n                alpha = np.random.uniform(self.contrast_lower,\n                                          self.contrast_upper)\n                img *= alpha\n\n        # randomly swap channels\n        if np.random.randint(2):\n            img = img[..., np.random.permutation(3)]\n\n        results['image'] = img\n        return results\n\n    def __repr__(self):\n        repr_str = self.__class__.__name__\n        repr_str += f'(\\nbrightness_delta={self.brightness_delta},\\n'\n        repr_str += 'contrast_range='\n        repr_str += f'{(self.contrast_lower, self.contrast_upper)},\\n'\n        repr_str += 'saturation_range='\n        repr_str += f'{(self.saturation_lower, self.saturation_upper)},\\n'\n        repr_str += f'hue_delta={self.hue_delta})'\n        return repr_str\n\n\n@register_op\nclass AutoAugment(BaseOperator):\n    def __init__(self, 
autoaug_type=\"v1\"):\n        \"\"\"\n        Args:\n            autoaug_type (str): autoaug type, support v0, v1, v2, v3, test\n        \"\"\"\n        super(AutoAugment, self).__init__()\n        self.autoaug_type = autoaug_type\n\n    def apply(self, sample, context=None):\n        \"\"\"\n        Learning Data Augmentation Strategies for Object Detection, see https://arxiv.org/abs/1906.11172\n        \"\"\"\n        im = sample['image']\n        gt_bbox = sample['gt_bbox']\n        if not isinstance(im, np.ndarray):\n            raise TypeError(\"{}: image is not a numpy array.\".format(self))\n        if len(im.shape) != 3:\n            raise ImageError(\"{}: image is not 3-dimensional.\".format(self))\n        if len(gt_bbox) == 0:\n            return sample\n\n        height, width, _ = im.shape\n        norm_gt_bbox = np.ones_like(gt_bbox, dtype=np.float32)\n        norm_gt_bbox[:, 0] = gt_bbox[:, 1] / float(height)\n        norm_gt_bbox[:, 1] = gt_bbox[:, 0] / float(width)\n        norm_gt_bbox[:, 2] = gt_bbox[:, 3] / float(height)\n        norm_gt_bbox[:, 3] = gt_bbox[:, 2] / float(width)\n\n        from .autoaugment_utils import distort_image_with_autoaugment\n        im, norm_gt_bbox = distort_image_with_autoaugment(im, norm_gt_bbox,\n                                                          self.autoaug_type)\n\n        gt_bbox[:, 0] = norm_gt_bbox[:, 1] * float(width)\n        gt_bbox[:, 1] = norm_gt_bbox[:, 0] * float(height)\n        gt_bbox[:, 2] = norm_gt_bbox[:, 3] * float(width)\n        gt_bbox[:, 3] = norm_gt_bbox[:, 2] * float(height)\n\n        sample['image'] = im\n        sample['gt_bbox'] = gt_bbox\n        return sample\n\n\n@register_op\nclass RandomFlip(BaseOperator):\n    def __init__(self, prob=0.5):\n        \"\"\"\n        Args:\n            prob (float): the probability of flipping image\n        \"\"\"\n        super(RandomFlip, self).__init__()\n        self.prob = prob\n        if not (isinstance(self.prob, float)):\n            raise TypeError(\"{}: input type is invalid.\".format(self))\n\n    def apply_segm(self, segms, height, width):\n        def _flip_poly(poly, width):\n            flipped_poly = np.array(poly)\n            flipped_poly[0::2] = width - np.array(poly[0::2])\n            return flipped_poly.tolist()\n\n        def _flip_rle(rle, height, width):\n            if 'counts' in rle and type(rle['counts']) == list:\n                rle = mask_util.frPyObjects(rle, height, width)\n            mask = mask_util.decode(rle)\n            mask = mask[:, ::-1]\n            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))\n            return rle\n\n        flipped_segms = []\n        for segm in segms:\n            if is_poly(segm):\n                # Polygon format\n                flipped_segms.append([_flip_poly(poly, width) for poly in segm])\n            else:\n                # RLE format\n                import pycocotools.mask as mask_util\n                flipped_segms.append(_flip_rle(segm, height, width))\n        return flipped_segms\n\n    def apply_keypoint(self, gt_keypoint, width):\n        for i in range(gt_keypoint.shape[1]):\n            if i % 2 == 0:\n                old_x = gt_keypoint[:, i].copy()\n                gt_keypoint[:, i] = width - old_x\n        return gt_keypoint\n\n    def apply_image(self, image):\n        return image[:, ::-1, :]\n\n    def apply_bbox(self, bbox, width):\n        oldx1 = bbox[:, 0].copy()\n        oldx2 = bbox[:, 2].copy()\n        bbox[:, 0] = width - oldx2\n        
bbox[:, 2] = width - oldx1\n        return bbox\n\n    def apply(self, sample, context=None):\n        \"\"\"Flip the image and bounding box.\n        Operators:\n            1. Flip the image numpy.\n            2. Transform the bboxes' x coordinates.\n              (Must judge whether the coordinates are normalized!)\n            3. Transform the segmentations' x coordinates.\n              (Must judge whether the coordinates are normalized!)\n        Output:\n            sample: the image, bounding box and segmentation part\n                    in sample are flipped.\n        \"\"\"\n        if np.random.uniform(0, 1) < self.prob:\n            im = sample['image']\n            height, width = im.shape[:2]\n            im = self.apply_image(im)\n            if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:\n                sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], width)\n            if 'gt_poly' in sample and len(sample['gt_poly']) > 0:\n                sample['gt_poly'] = self.apply_segm(sample['gt_poly'], height,\n                                                    width)\n            if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0:\n                sample['gt_keypoint'] = self.apply_keypoint(\n                    sample['gt_keypoint'], width)\n\n            if 'semantic' in sample and sample['semantic']:\n                sample['semantic'] = sample['semantic'][:, ::-1]\n\n            if 'gt_segm' in sample and sample['gt_segm'].any():\n                sample['gt_segm'] = sample['gt_segm'][:, :, ::-1]\n\n            sample['flipped'] = True\n            sample['image'] = im\n        return sample\n\n\n@register_op\nclass Resize(BaseOperator):\n    def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR):\n        \"\"\"\n        Resize image to target size. If keep_ratio is True,\n        resize the image's long side to the maximum of target_size;\n        if keep_ratio is False, resize the image to target size(h, w)\n        Args:\n            target_size (int|list): image target size\n            keep_ratio (bool): whether keep_ratio or not, default true\n            interp (int): the interpolation method\n        \"\"\"\n        super(Resize, self).__init__()\n        self.keep_ratio = keep_ratio\n        self.interp = interp\n        if not isinstance(target_size, (Integral, Sequence)):\n            raise TypeError(\n                \"Type of target_size is invalid. 
Must be Integer or List or Tuple, now is {}\".\n                format(type(target_size)))\n        if isinstance(target_size, Integral):\n            target_size = [target_size, target_size]\n        self.target_size = target_size\n\n    def apply_image(self, image, scale):\n        im_scale_x, im_scale_y = scale\n\n        return cv2.resize(\n            image,\n            None,\n            None,\n            fx=im_scale_x,\n            fy=im_scale_y,\n            interpolation=self.interp)\n\n    def apply_bbox(self, bbox, scale, size):\n        im_scale_x, im_scale_y = scale\n        resize_w, resize_h = size\n        bbox[:, 0::2] *= im_scale_x\n        bbox[:, 1::2] *= im_scale_y\n        bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w)\n        bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h)\n        return bbox\n\n    def apply_area(self, area, scale):\n        im_scale_x, im_scale_y = scale\n        return area * im_scale_x * im_scale_y\n\n    def apply_joints(self, joints, scale, size):\n        im_scale_x, im_scale_y = scale\n        resize_w, resize_h = size\n        joints[..., 0] *= im_scale_x\n        joints[..., 1] *= im_scale_y\n        joints[..., 0] = np.clip(joints[..., 0], 0, resize_w)\n        joints[..., 1] = np.clip(joints[..., 1], 0, resize_h)\n        return joints\n\n    def apply_segm(self, segms, im_size, scale):\n        def _resize_poly(poly, im_scale_x, im_scale_y):\n            resized_poly = np.array(poly).astype('float32')\n            resized_poly[0::2] *= im_scale_x\n            resized_poly[1::2] *= im_scale_y\n            return resized_poly.tolist()\n\n        def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y):\n            if 'counts' in rle and type(rle['counts']) == list:\n                rle = mask_util.frPyObjects(rle, im_h, im_w)\n\n            mask = mask_util.decode(rle)\n            mask = cv2.resize(\n                mask,\n                None,\n                None,\n                fx=im_scale_x,\n                fy=im_scale_y,\n                interpolation=self.interp)\n            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))\n            return rle\n\n        im_h, im_w = im_size\n        im_scale_x, im_scale_y = scale\n        resized_segms = []\n        for segm in segms:\n            if is_poly(segm):\n                # Polygon format\n                resized_segms.append([\n                    _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm\n                ])\n            else:\n                # RLE format\n                import pycocotools.mask as mask_util\n                resized_segms.append(\n                    _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y))\n\n        return resized_segms\n\n    def apply(self, sample, context=None):\n        \"\"\" Resize the image numpy.\n        \"\"\"\n        im = sample['image']\n        if not isinstance(im, np.ndarray):\n            raise TypeError(\"{}: image type is not numpy.\".format(self))\n\n        # apply image\n        if len(im.shape) == 3:\n            im_shape = im.shape\n        else:\n            im_shape = im[0].shape\n\n        if self.keep_ratio:\n            im_size_min = np.min(im_shape[0:2])\n            im_size_max = np.max(im_shape[0:2])\n\n            target_size_min = np.min(self.target_size)\n            target_size_max = np.max(self.target_size)\n\n            im_scale = min(target_size_min / im_size_min,\n                           target_size_max / im_size_max)\n\n            resize_h = 
int(im_scale * float(im_shape[0]) + 0.5)\n            resize_w = int(im_scale * float(im_shape[1]) + 0.5)\n        else:\n            resize_h, resize_w = self.target_size\n\n        im_scale_y = resize_h / im_shape[0]\n        im_scale_x = resize_w / im_shape[1]\n\n        if len(im.shape) == 3:\n            im = self.apply_image(sample['image'], [im_scale_x, im_scale_y])\n            sample['image'] = im.astype(np.float32)\n        else:\n            resized_images = []\n            for one_im in im:\n                applied_im = self.apply_image(one_im, [im_scale_x, im_scale_y])\n                resized_images.append(applied_im)\n\n            sample['image'] = np.array(resized_images)\n\n        # 2d keypoints resize\n        if 'kps2d' in sample.keys():\n            kps2d = sample['kps2d']\n            kps2d[:, :, 0] = kps2d[:, :, 0] * im_scale_x\n            kps2d[:, :, 1] = kps2d[:, :, 1] * im_scale_y\n\n            sample['kps2d'] = kps2d\n\n        sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32)\n        if 'scale_factor' in sample:\n            scale_factor = sample['scale_factor']\n            sample['scale_factor'] = np.asarray(\n                [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],\n                dtype=np.float32)\n        else:\n            sample['scale_factor'] = np.asarray(\n                [im_scale_y, im_scale_x], dtype=np.float32)\n\n        # apply bbox\n        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:\n            sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'],\n                                                [im_scale_x, im_scale_y],\n                                                [resize_w, resize_h])\n\n        # apply areas\n        if 'gt_areas' in sample:\n            sample['gt_areas'] = self.apply_area(sample['gt_areas'],\n                                                 [im_scale_x, im_scale_y])\n\n        # apply polygon\n        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:\n            sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2],\n                                                [im_scale_x, im_scale_y])\n\n        # apply semantic\n        if 'semantic' in sample and sample['semantic'] is not None:\n            semantic = sample['semantic']\n            semantic = cv2.resize(\n                semantic.astype('float32'),\n                None,\n                None,\n                fx=im_scale_x,\n                fy=im_scale_y,\n                interpolation=self.interp)\n            semantic = np.asarray(semantic).astype('int32')\n            semantic = np.expand_dims(semantic, 0)\n            sample['semantic'] = semantic\n\n        # apply gt_segm\n        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:\n            masks = [\n                cv2.resize(\n                    gt_segm,\n                    None,\n                    None,\n                    fx=im_scale_x,\n                    fy=im_scale_y,\n                    interpolation=cv2.INTER_NEAREST)\n                for gt_segm in sample['gt_segm']\n            ]\n            sample['gt_segm'] = np.asarray(masks).astype(np.uint8)\n\n        if 'gt_joints' in sample:\n            sample['gt_joints'] = self.apply_joints(sample['gt_joints'],\n                                                    [im_scale_x, im_scale_y],\n                                                    [resize_w, resize_h])\n\n        return sample\n\n\n
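# Illustrative note (values are hypothetical): with keep_ratio=True and\n# target_size=[800, 1333], a 400x600 image gets im_scale = min(800/400,\n# 1333/600) = 2.0, i.e. an 800x1200 output, and gt_bbox / scale_factor are\n# updated consistently with that scale.\n\n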
@register_op\nclass MultiscaleTestResize(BaseOperator):\n    def __init__(self,\n                 origin_target_size=[800, 1333],\n                 target_size=[],\n                 interp=cv2.INTER_LINEAR,\n                 use_flip=True):\n        \"\"\"\n        Rescale the image to each size in target_size, capped at the maximum size.\n        Args:\n            origin_target_size (list): origin target size of image\n            target_size (list): A list of target sizes of image.\n            interp (int): the interpolation method.\n            use_flip (bool): whether to use flip augmentation.\n        \"\"\"\n        super(MultiscaleTestResize, self).__init__()\n        self.interp = interp\n        self.use_flip = use_flip\n\n        if not isinstance(target_size, Sequence):\n            raise TypeError(\n                \"Type of target_size is invalid. Must be List or Tuple, now is {}\".\n                format(type(target_size)))\n        self.target_size = target_size\n\n        if not isinstance(origin_target_size, Sequence):\n            raise TypeError(\n                \"Type of origin_target_size is invalid. Must be List or Tuple, now is {}\".\n                format(type(origin_target_size)))\n\n        self.origin_target_size = origin_target_size\n\n    def apply(self, sample, context=None):\n        \"\"\" Resize the image numpy array for multi-scale test.\n        \"\"\"\n        samples = []\n        resizer = Resize(\n            self.origin_target_size, keep_ratio=True, interp=self.interp)\n        samples.append(resizer(sample.copy(), context))\n        if self.use_flip:\n            flipper = RandomFlip(1.0)\n            samples.append(flipper(sample.copy(), context=context))\n\n        for size in self.target_size:\n            resizer = Resize(size, keep_ratio=True, interp=self.interp)\n            samples.append(resizer(sample.copy(), context))\n\n        return samples\n\n\n
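# Note: unlike most operators here, MultiscaleTestResize returns a list of\n# samples (one per test scale, plus an optional flipped copy), so downstream\n# batch logic must expect a list rather than a single sample dict.\n\n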
@register_op\nclass RandomResize(BaseOperator):\n    def __init__(self,\n                 target_size,\n                 keep_ratio=True,\n                 interp=cv2.INTER_LINEAR,\n                 random_range=False,\n                 random_size=True,\n                 random_interp=False):\n        \"\"\"\n        Resize image to a randomly selected target size and, optionally,\n        with a randomly selected interpolation method.\n        Args:\n            target_size (int, list, tuple): image target size, if random_size is True, must be list or tuple\n            keep_ratio (bool): whether to keep the aspect ratio, default True\n            interp (int): the interpolation method\n            random_range (bool): whether to randomly select a target size of image, the target_size must be\n                a [[min_short_edge, long_edge], [max_short_edge, long_edge]]\n            random_size (bool): whether to randomly select a target size from target_size\n            random_interp (bool): whether to randomly select an interpolation method\n        \"\"\"\n        super(RandomResize, self).__init__()\n        self.keep_ratio = keep_ratio\n        self.interp = interp\n        self.interps = [\n            cv2.INTER_NEAREST,\n            cv2.INTER_LINEAR,\n            cv2.INTER_AREA,\n            cv2.INTER_CUBIC,\n            cv2.INTER_LANCZOS4,\n        ]\n        assert isinstance(target_size, (\n            Integral, Sequence)), \"target_size must be Integer, List or Tuple\"\n        if (random_range or random_size) and not isinstance(target_size,\n                                                            Sequence):\n            raise TypeError(\n                \"Type of target_size is invalid when random_size or random_range is True. Must be List or Tuple, now is {}\".\n                format(type(target_size)))\n        if random_range and not len(target_size) == 2:\n            raise TypeError(\n                \"target_size must be two lists, [[min_short_edge, long_edge], [max_short_edge, long_edge]], when random_range is True.\"\n            )\n        self.target_size = target_size\n        self.random_range = random_range\n        self.random_size = random_size\n        self.random_interp = random_interp\n\n    def apply(self, sample, context=None):\n        \"\"\" Resize the image numpy array.\n        \"\"\"\n        if self.random_range:\n            short_edge = np.random.randint(self.target_size[0][0],\n                                           self.target_size[1][0] + 1)\n            long_edge = max(self.target_size[0][1], self.target_size[1][1])\n            target_size = [short_edge, long_edge]\n        else:\n            if self.random_size:\n                target_size = random.choice(self.target_size)\n            else:\n                target_size = self.target_size\n\n        if self.random_interp:\n            interp = random.choice(self.interps)\n        else:\n            interp = self.interp\n\n        resizer = Resize(target_size, self.keep_ratio, interp)\n        return resizer(sample, context=context)\n\n\n
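# Illustrative sketch (hypothetical values): with random_range=True and\n# target_size=[[576, 1333], [800, 1333]], the short edge is drawn uniformly\n# from [576, 800] and the long edge is fixed at 1333 before delegating to\n# Resize (keep_ratio defaults to True here).\n\n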
@register_op\nclass RandomExpand(BaseOperator):\n    \"\"\"Random expand the canvas.\n    Args:\n        ratio (float): maximum expansion ratio.\n        prob (float): probability to expand.\n        fill_value (list): color value used to fill the canvas, in RGB order.\n    \"\"\"\n\n    def __init__(self, ratio=4., prob=0.5, fill_value=(127.5, 127.5, 127.5)):\n        super(RandomExpand, self).__init__()\n        assert ratio > 1.01, \"expand ratio must be larger than 1.01\"\n        self.ratio = ratio\n        self.prob = prob\n        assert isinstance(fill_value, (Number, Sequence)), \\\n            \"fill value must be either float or sequence\"\n        if isinstance(fill_value, Number):\n            fill_value = (fill_value, ) * 3\n        if not isinstance(fill_value, tuple):\n            fill_value = tuple(fill_value)\n        self.fill_value = fill_value\n\n    def apply(self, sample, context=None):\n        # expand with probability self.prob\n        if np.random.uniform(0., 1.) > self.prob:\n            return sample\n\n        im = sample['image']\n        height, width = im.shape[:2]\n        ratio = np.random.uniform(1., self.ratio)\n        h = int(height * ratio)\n        w = int(width * ratio)\n        if h <= height or w <= width:\n            return sample\n        y = np.random.randint(0, h - height)\n        x = np.random.randint(0, w - width)\n        offsets, size = [x, y], [h, w]\n\n        pad = Pad(size,\n                  pad_mode=-1,\n                  offsets=offsets,\n                  fill_value=self.fill_value)\n\n        return pad(sample, context=context)\n\n\n@register_op\nclass CropWithSampling(BaseOperator):\n    def __init__(self, batch_sampler, satisfy_all=False, avoid_no_bbox=True):\n        \"\"\"\n        Args:\n            batch_sampler (list): Multiple sets of different\n                                  parameters for cropping.\n            satisfy_all (bool): whether all sampled boxes must satisfy the constraints.\n            e.g.[[1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0],\n                 [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 1.0],\n                 [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 1.0],\n                 [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 1.0],\n                 [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 1.0],\n                 [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 1.0],\n                 [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0]]\n           [max sample, max trial, min scale, max scale,\n            min aspect ratio, max aspect ratio,\n            min overlap, max overlap]\n            avoid_no_bbox (bool): whether to avoid the\n                                  situation where the box does not appear.\n        \"\"\"\n        super(CropWithSampling, self).__init__()\n        self.batch_sampler = batch_sampler\n        self.satisfy_all = satisfy_all\n        self.avoid_no_bbox = avoid_no_bbox\n\n    def apply(self, sample, context):\n        \"\"\"\n        Crop the image and modify bounding box.\n        Operators:\n            1. Scale the image width and height.\n            2. Crop the image according to a random sample.\n            3. Rescale the bounding box.\n            4. 
Determine whether the new bbox satisfies the sampling constraints in the new image.\n        Returns:\n            sample: the image and bounding boxes are replaced.\n        \"\"\"\n        assert 'image' in sample, \"image data not found\"\n        im = sample['image']\n        gt_bbox = sample['gt_bbox']\n        gt_class = sample['gt_class']\n        im_height, im_width = im.shape[:2]\n        gt_score = None\n        if 'gt_score' in sample:\n            gt_score = sample['gt_score']\n        sampled_bbox = []\n        gt_bbox = gt_bbox.tolist()\n        for sampler in self.batch_sampler:\n            found = 0\n            for i in range(sampler[1]):\n                if found >= sampler[0]:\n                    break\n                sample_bbox = generate_sample_bbox(sampler)\n                if satisfy_sample_constraint(sampler, sample_bbox, gt_bbox,\n                                             self.satisfy_all):\n                    sampled_bbox.append(sample_bbox)\n                    found = found + 1\n        im = np.array(im)\n        while sampled_bbox:\n            idx = int(np.random.uniform(0, len(sampled_bbox)))\n            sample_bbox = sampled_bbox.pop(idx)\n            sample_bbox = clip_bbox(sample_bbox)\n            crop_bbox, crop_class, crop_score = \\\n                filter_and_process(sample_bbox, gt_bbox, gt_class, scores=gt_score)\n            if self.avoid_no_bbox:\n                if len(crop_bbox) < 1:\n                    continue\n            xmin = int(sample_bbox[0] * im_width)\n            xmax = int(sample_bbox[2] * im_width)\n            ymin = int(sample_bbox[1] * im_height)\n            ymax = int(sample_bbox[3] * im_height)\n            im = im[ymin:ymax, xmin:xmax]\n            sample['image'] = im\n            sample['gt_bbox'] = crop_bbox\n            sample['gt_class'] = crop_class\n            sample['gt_score'] = crop_score\n            return sample\n        return sample\n\n\n@register_op\nclass CropWithDataAchorSampling(BaseOperator):\n    def __init__(self,\n                 batch_sampler,\n                 anchor_sampler=None,\n                 target_size=None,\n                 das_anchor_scales=[16, 32, 64, 128],\n                 sampling_prob=0.5,\n                 min_size=8.,\n                 avoid_no_bbox=True):\n        \"\"\"\n        Args:\n            anchor_sampler (list): anchor_sampling sets of different\n                                  parameters for cropping.\n            batch_sampler (list): Multiple sets of different\n                                  parameters for cropping.\n              e.g.[[1, 10, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.2, 0.0]]\n                  [[1, 50, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0],\n                   [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0],\n                   [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0],\n                   [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0],\n                   [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0]]\n              [max sample, max trial, min scale, max scale,\n               min aspect ratio, max aspect ratio,\n               min overlap, max overlap, min coverage, max coverage]\n            target_size (int): target image size.\n            das_anchor_scales (list[float]): a list of anchor scales in data\n                anchor sampling.\n            min_size (float): minimum size of sampled bbox.\n            avoid_no_bbox (bool): whether to avoid the\n                                  situation where the box does not appear.\n        \"\"\"\n        
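# Two sampling branches are used in apply(): with probability\n        # (1 - sampling_prob) the data-anchor sampler runs, otherwise the\n        # plain batch sampler is used.\n        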
super(CropWithDataAchorSampling, self).__init__()\n        self.anchor_sampler = anchor_sampler\n        self.batch_sampler = batch_sampler\n        self.target_size = target_size\n        self.sampling_prob = sampling_prob\n        self.min_size = min_size\n        self.avoid_no_bbox = avoid_no_bbox\n        self.das_anchor_scales = np.array(das_anchor_scales)\n\n    def apply(self, sample, context):\n        \"\"\"\n        Crop the image and modify bounding box.\n        Operators:\n            1. Scale the image width and height.\n            2. Crop the image according to a random sample.\n            3. Rescale the bounding box.\n            4. Determine whether the new bbox satisfies the sampling constraints in the new image.\n        Returns:\n            sample: the image and bounding boxes are replaced.\n        \"\"\"\n        assert 'image' in sample, \"image data not found\"\n        im = sample['image']\n        gt_bbox = sample['gt_bbox']\n        gt_class = sample['gt_class']\n        image_height, image_width = im.shape[:2]\n        gt_bbox[:, 0] /= image_width\n        gt_bbox[:, 1] /= image_height\n        gt_bbox[:, 2] /= image_width\n        gt_bbox[:, 3] /= image_height\n        gt_score = None\n        if 'gt_score' in sample:\n            gt_score = sample['gt_score']\n        sampled_bbox = []\n        gt_bbox = gt_bbox.tolist()\n\n        prob = np.random.uniform(0., 1.)\n        if prob > self.sampling_prob:  # anchor sampling\n            assert self.anchor_sampler\n            for sampler in self.anchor_sampler:\n                found = 0\n                for i in range(sampler[1]):\n                    if found >= sampler[0]:\n                        break\n                    sample_bbox = data_anchor_sampling(\n                        gt_bbox, image_width, image_height,\n                        self.das_anchor_scales, self.target_size)\n                    if sample_bbox == 0:\n                        break\n                    if satisfy_sample_constraint_coverage(sampler, sample_bbox,\n                                                          gt_bbox):\n                        sampled_bbox.append(sample_bbox)\n                        found = found + 1\n            im = np.array(im)\n            while sampled_bbox:\n                idx = int(np.random.uniform(0, len(sampled_bbox)))\n                sample_bbox = sampled_bbox.pop(idx)\n\n                if 'gt_keypoint' in sample.keys():\n                    keypoints = (sample['gt_keypoint'],\n                                 sample['keypoint_ignore'])\n                    crop_bbox, crop_class, crop_score, gt_keypoints = \\\n                        filter_and_process(sample_bbox, gt_bbox, gt_class,\n                                scores=gt_score,\n                                keypoints=keypoints)\n                else:\n                    crop_bbox, crop_class, crop_score = filter_and_process(\n                        sample_bbox, gt_bbox, gt_class, scores=gt_score)\n                crop_bbox, crop_class, crop_score = bbox_area_sampling(\n                    crop_bbox, crop_class, crop_score, self.target_size,\n                    self.min_size)\n\n                if self.avoid_no_bbox:\n                    if len(crop_bbox) < 1:\n                        continue\n                im = crop_image_sampling(im, sample_bbox, image_width,\n                                         image_height, self.target_size)\n                height, width = im.shape[:2]\n                
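# gt boxes were normalized to [0, 1] above; scale them back to the\n                # cropped image's pixel coordinates.\n                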
crop_bbox[:, 0] *= width\n                crop_bbox[:, 1] *= height\n                crop_bbox[:, 2] *= width\n                crop_bbox[:, 3] *= height\n                sample['image'] = im\n                sample['gt_bbox'] = crop_bbox\n                sample['gt_class'] = crop_class\n                if 'gt_score' in sample:\n                    sample['gt_score'] = crop_score\n                if 'gt_keypoint' in sample.keys():\n                    sample['gt_keypoint'] = gt_keypoints[0]\n                    sample['keypoint_ignore'] = gt_keypoints[1]\n                return sample\n            return sample\n\n        else:\n            for sampler in self.batch_sampler:\n                found = 0\n                for i in range(sampler[1]):\n                    if found >= sampler[0]:\n                        break\n                    sample_bbox = generate_sample_bbox_square(\n                        sampler, image_width, image_height)\n                    if satisfy_sample_constraint_coverage(sampler, sample_bbox,\n                                                          gt_bbox):\n                        sampled_bbox.append(sample_bbox)\n                        found = found + 1\n            im = np.array(im)\n            while sampled_bbox:\n                idx = int(np.random.uniform(0, len(sampled_bbox)))\n                sample_bbox = sampled_bbox.pop(idx)\n                sample_bbox = clip_bbox(sample_bbox)\n\n                if 'gt_keypoint' in sample.keys():\n                    keypoints = (sample['gt_keypoint'],\n                                 sample['keypoint_ignore'])\n                    crop_bbox, crop_class, crop_score, gt_keypoints = \\\n                        filter_and_process(sample_bbox, gt_bbox, gt_class,\n                                scores=gt_score,\n                                keypoints=keypoints)\n                else:\n                    crop_bbox, crop_class, crop_score = filter_and_process(\n                        sample_bbox, gt_bbox, gt_class, scores=gt_score)\n                # sample bboxes according to the bbox area\n                crop_bbox, crop_class, crop_score = bbox_area_sampling(\n                    crop_bbox, crop_class, crop_score, self.target_size,\n                    self.min_size)\n\n                if self.avoid_no_bbox:\n                    if len(crop_bbox) < 1:\n                        continue\n                xmin = int(sample_bbox[0] * image_width)\n                xmax = int(sample_bbox[2] * image_width)\n                ymin = int(sample_bbox[1] * image_height)\n                ymax = int(sample_bbox[3] * image_height)\n                im = im[ymin:ymax, xmin:xmax]\n                height, width = im.shape[:2]\n                crop_bbox[:, 0] *= width\n                crop_bbox[:, 1] *= height\n                crop_bbox[:, 2] *= width\n                crop_bbox[:, 3] *= height\n                sample['image'] = im\n                sample['gt_bbox'] = crop_bbox\n                sample['gt_class'] = crop_class\n                if 'gt_score' in sample:\n                    sample['gt_score'] = crop_score\n                if 'gt_keypoint' in sample.keys():\n                    sample['gt_keypoint'] = gt_keypoints[0]\n                    sample['keypoint_ignore'] = gt_keypoints[1]\n                return sample\n            return sample\n\n\n@register_op\nclass RandomCrop(BaseOperator):\n    \"\"\"Random crop image and bboxes.\n    Args:\n        aspect_ratio (list): aspect ratio of cropped region,\n            in [min, max] format.\n        thresholds (list): IoU thresholds used to 
decide a valid bbox crop.\n        scaling (list): ratio between a cropped region and the original image,\n             in [min, max] format.\n        num_attempts (int): number of tries before giving up.\n        allow_no_crop (bool): allow returning the sample without cropping.\n        cover_all_box (bool): ensure all bboxes are covered in the final crop.\n        is_mask_crop (bool): whether to crop the segmentation masks.\n    \"\"\"\n\n    def __init__(self,\n                 aspect_ratio=[.5, 2.],\n                 thresholds=[.0, .1, .3, .5, .7, .9],\n                 scaling=[.3, 1.],\n                 num_attempts=50,\n                 allow_no_crop=True,\n                 cover_all_box=False,\n                 is_mask_crop=False,\n                 ioumode=\"iou\",\n                 prob=1.0):\n        super(RandomCrop, self).__init__()\n        self.aspect_ratio = aspect_ratio\n        self.thresholds = thresholds\n        self.scaling = scaling\n        self.num_attempts = num_attempts\n        self.allow_no_crop = allow_no_crop\n        self.cover_all_box = cover_all_box\n        self.is_mask_crop = is_mask_crop\n        self.ioumode = ioumode\n        self.prob = prob\n\n    def crop_segms(self, segms, valid_ids, crop, height, width):\n        def _crop_poly(segm, crop):\n            xmin, ymin, xmax, ymax = crop\n            crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin]\n            crop_p = np.array(crop_coord).reshape(4, 2)\n            crop_p = Polygon(crop_p)\n\n            crop_segm = list()\n            for poly in segm:\n                poly = np.array(poly).reshape(len(poly) // 2, 2)\n                polygon = Polygon(poly)\n                if not polygon.is_valid:\n                    exterior = polygon.exterior\n                    multi_lines = exterior.intersection(exterior)\n                    polygons = shapely.ops.polygonize(multi_lines)\n                    polygon = MultiPolygon(polygons)\n                multi_polygon = list()\n                if isinstance(polygon, MultiPolygon):\n                    multi_polygon = copy.deepcopy(polygon)\n                else:\n                    multi_polygon.append(copy.deepcopy(polygon))\n                for per_polygon in multi_polygon:\n                    inter = per_polygon.intersection(crop_p)\n                    if not inter:\n                        continue\n                    if isinstance(inter, (MultiPolygon, GeometryCollection)):\n                        for part in inter:\n                            if not isinstance(part, Polygon):\n                                continue\n                            part = np.squeeze(\n                                np.array(part.exterior.coords[:-1]).reshape(1,\n                                                                            -1))\n                            part[0::2] -= xmin\n                            part[1::2] -= ymin\n                            crop_segm.append(part.tolist())\n                    elif isinstance(inter, Polygon):\n                        crop_poly = np.squeeze(\n                            np.array(inter.exterior.coords[:-1]).reshape(1, -1))\n                        crop_poly[0::2] -= xmin\n                        crop_poly[1::2] -= ymin\n                        crop_segm.append(crop_poly.tolist())\n                    else:\n                        continue\n            return crop_segm\n\n        def _crop_rle(rle, crop, height, width):\n            if 'counts' in rle and type(rle['counts']) == list:\n                
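# Uncompressed RLE stores 'counts' as a plain Python list; convert it\n                # to the compact encoding so that mask_util.decode can read it.\n                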
rle = mask_util.frPyObjects(rle, height, width)\n            mask = mask_util.decode(rle)\n            mask = mask[crop[1]:crop[3], crop[0]:crop[2]]\n            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))\n            return rle\n\n        crop_segms = []\n        for id in valid_ids:\n            segm = self.polygon_to_rle(segms[id], height, width)\n            if is_poly(segm):\n                import copy\n                import shapely.ops\n                from shapely.geometry import Polygon, MultiPolygon, GeometryCollection\n                logging.getLogger(\"shapely\").setLevel(logging.WARNING)\n                # Polygon format\n                crop_segms.append(_crop_poly(segm, crop))\n            else:\n                # RLE format\n                import pycocotools.mask as mask_util\n                res = _crop_rle(segm, crop, height, width)\n                crop_segms.append(self.rle_to_polygon(res))\n        return crop_segms\n\n    def polygon_to_rle(self, polygons, height, width):\n        # Create an empty mask\n        mask_img = np.zeros((height, width), dtype=np.uint8)\n\n        # Fill the polygon in the mask\n        for polygon in polygons:\n            contour = np.array(polygon).reshape((-1, 1, 2)).astype(int)\n            cv2.drawContours(mask_img, [contour], 0, 255, -1)\n\n        # Convert binary mask to RLE\n        rle = mask.encode(np.asfortranarray(mask_img))\n        return rle\n\n    def rle_to_polygon(self, rle_mask, min_area=5):\n        binary_mask = mask.decode(rle_mask).squeeze()\n        # Find contours in the binary mask\n        contours, _ = cv2.findContours(\n            binary_mask.astype(np.uint8), cv2.RETR_EXTERNAL,\n            cv2.CHAIN_APPROX_SIMPLE)\n        polygons = []\n        for contour in contours:\n            # Convert contour to polygon and filter small areas\n            if cv2.contourArea(contour) >= min_area:\n                # Flatten list and add to polygons\n                polygon = contour.flatten().tolist()\n                if len(polygon) > 4:\n                    polygons.append(polygon)\n        return polygons\n\n    def set_fake_bboxes(self, sample):\n        sample['gt_bbox'] = np.array(\n            [\n                [32, 32, 128, 128],\n                [32, 32, 128, 256],\n                [32, 64, 128, 128],\n                [32, 64, 128, 256],\n                [64, 64, 128, 256],\n                [64, 64, 256, 256],\n                [64, 32, 128, 256],\n                [64, 32, 128, 256],\n                [96, 32, 128, 256],\n                [96, 32, 128, 256],\n            ],\n            dtype=np.float32)\n        sample['gt_class'] = np.array(\n            [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]], np.int32)\n        return sample\n\n    def apply(self, sample, context=None):\n        if random.random() > self.prob:\n            return sample\n\n        if 'gt_bbox' not in sample:\n            # only used in semi-det as unsup data\n            sample = self.set_fake_bboxes(sample)\n            sample = self.random_crop(sample, fake_bboxes=True)\n            del sample['gt_bbox']\n            del sample['gt_class']\n            return sample\n\n        if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0:\n            return sample\n        sample = self.random_crop(sample)\n        return sample\n\n    def random_crop(self, sample, fake_bboxes=False):\n        h, w = sample['image'].shape[:2]\n        gt_bbox = sample['gt_bbox']\n\n        # NOTE Original method attempts to 
generate one candidate for each\n        # threshold then randomly sample one from the resulting list.\n        # Here a short circuit approach is taken, i.e., randomly choose a\n        # threshold and attempt to find a valid crop, and simply return the\n        # first one found.\n        # The probability is not exactly the same, kinda resembling the\n        # \"Monty Hall\" problem. Actually carrying out the attempts will affect\n        # observability (just like opening doors in the \"Monty Hall\" game).\n        thresholds = list(self.thresholds)\n        if self.allow_no_crop:\n            thresholds.append('no_crop')\n        np.random.shuffle(thresholds)\n\n        for thresh in thresholds:\n            if thresh == 'no_crop':\n                return sample\n\n            found = False\n            for i in range(self.num_attempts):\n                scale = np.random.uniform(*self.scaling)\n                if self.aspect_ratio is not None:\n                    min_ar, max_ar = self.aspect_ratio\n                    aspect_ratio = np.random.uniform(\n                        max(min_ar, scale**2), min(max_ar, scale**-2))\n                    h_scale = scale / np.sqrt(aspect_ratio)\n                    w_scale = scale * np.sqrt(aspect_ratio)\n                else:\n                    h_scale = np.random.uniform(*self.scaling)\n                    w_scale = np.random.uniform(*self.scaling)\n                crop_h = h * h_scale\n                crop_w = w * w_scale\n                if self.aspect_ratio is None:\n                    if crop_h / crop_w < 0.5 or crop_h / crop_w > 2.0:\n                        continue\n\n                crop_h = int(crop_h)\n                crop_w = int(crop_w)\n                crop_y = np.random.randint(0, h - crop_h)\n                crop_x = np.random.randint(0, w - crop_w)\n                crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h]\n                if self.ioumode == \"iof\":\n                    iou = self._gtcropiou_matrix(\n                        gt_bbox, np.array(\n                            [crop_box], dtype=np.float32))\n                elif self.ioumode == \"iou\":\n                    iou = self._iou_matrix(\n                        gt_bbox, np.array(\n                            [crop_box], dtype=np.float32))\n                if iou.max() < thresh:\n                    continue\n\n                if self.cover_all_box and iou.min() < thresh:\n                    continue\n\n                cropped_box, valid_ids = self._crop_box_with_center_constraint(\n                    gt_bbox, np.array(\n                        crop_box, dtype=np.float32))\n                if valid_ids.size > 0:\n                    found = True\n                    break\n\n            if found:\n                if self.is_mask_crop and 'gt_poly' in sample and len(sample[\n                        'gt_poly']) > 0:\n                    crop_polys = self.crop_segms(\n                        sample['gt_poly'],\n                        valid_ids,\n                        np.array(\n                            crop_box, dtype=np.int64),\n                        h,\n                        w)\n                    if [] in crop_polys:\n                        delete_id = list()\n                        valid_polys = list()\n                        for id, crop_poly in enumerate(crop_polys):\n                            if crop_poly == []:\n                                delete_id.append(id)\n                            else:\n                            
    valid_polys.append(crop_poly)\n                        valid_ids = np.delete(valid_ids, delete_id)\n                        if len(valid_polys) == 0:\n                            return sample\n                        sample['gt_poly'] = valid_polys\n                    else:\n                        sample['gt_poly'] = crop_polys\n\n                if 'gt_segm' in sample:\n                    sample['gt_segm'] = self._crop_segm(sample['gt_segm'],\n                                                        crop_box)\n                    sample['gt_segm'] = np.take(\n                        sample['gt_segm'], valid_ids, axis=0)\n\n                sample['image'] = self._crop_image(sample['image'], crop_box)\n                if fake_bboxes:\n                    return sample\n\n                sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0)\n                sample['gt_class'] = np.take(\n                    sample['gt_class'], valid_ids, axis=0)\n                if 'gt_score' in sample:\n                    sample['gt_score'] = np.take(\n                        sample['gt_score'], valid_ids, axis=0)\n\n                if 'is_crowd' in sample:\n                    sample['is_crowd'] = np.take(\n                        sample['is_crowd'], valid_ids, axis=0)\n\n                if 'difficult' in sample:\n                    sample['difficult'] = np.take(\n                        sample['difficult'], valid_ids, axis=0)\n\n                if 'gt_joints' in sample:\n                    sample['gt_joints'] = self._crop_joints(sample['gt_joints'],\n                                                            crop_box)\n\n                return sample\n\n        return sample\n\n    def _iou_matrix(self, a, b):\n        tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2])\n        br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])\n\n        area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2)\n        area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)\n        area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)\n        area_o = (area_a[:, np.newaxis] + area_b - area_i)\n        return area_i / (area_o + 1e-10)\n\n    def _gtcropiou_matrix(self, a, b):\n        tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2])\n        br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])\n\n        area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2)\n        area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)\n        area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)\n        area_o = (area_a[:, np.newaxis] + area_b - area_i)\n        return area_i / (area_a + 1e-10)\n\n    def _crop_box_with_center_constraint(self, box, crop):\n        cropped_box = box.copy()\n\n        cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2])\n        cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:])\n        cropped_box[:, :2] -= crop[:2]\n        cropped_box[:, 2:] -= crop[:2]\n\n        centers = (box[:, :2] + box[:, 2:]) / 2\n        valid = np.logical_and(crop[:2] <= centers,\n                               centers < crop[2:]).all(axis=1)\n        valid = np.logical_and(\n            valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1))\n\n        return cropped_box, np.where(valid)[0]\n\n    def _crop_image(self, img, crop):\n        x1, y1, x2, y2 = crop\n        return img[y1:y2, x1:x2, :]\n\n    def _crop_segm(self, segm, crop):\n        x1, y1, x2, y2 = crop\n        return segm[:, y1:y2, x1:x2]\n\n    def _crop_joints(self, joints, crop):\n        x1, y1, x2, y2 = crop\n        
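# Joints outside the crop are zeroed; the remainder are shifted into\n        # the crop's coordinate frame.\n        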
joints[joints[..., 0] > x2, :] = 0\n        joints[joints[..., 1] > y2, :] = 0\n        joints[joints[..., 0] < x1, :] = 0\n        joints[joints[..., 1] < y1, :] = 0\n        joints[..., 0] -= x1\n        joints[..., 1] -= y1\n        return joints\n\n\n@register_op\nclass RandomScaledCrop(BaseOperator):\n    \"\"\"Resize image and bbox based on long side (with optional random scaling),\n       then crop or pad image to target size.\n    Args:\n        target_size (int|list): target size, \"hw\" format.\n        scale_range (list): random scale range.\n        interp (int): interpolation method, default to `cv2.INTER_LINEAR`.\n        fill_value (float|list|tuple): color value used to fill the canvas,\n            in RGB order.\n    \"\"\"\n\n    def __init__(self,\n                 target_size=512,\n                 scale_range=[.1, 2.],\n                 interp=cv2.INTER_LINEAR,\n                 fill_value=(123.675, 116.28, 103.53)):\n        super(RandomScaledCrop, self).__init__()\n        assert isinstance(target_size, (\n            Integral, Sequence)), \"target_size must be Integer, List or Tuple\"\n        if isinstance(target_size, Integral):\n            target_size = [target_size, ] * 2\n\n        self.target_size = target_size\n        self.scale_range = scale_range\n        self.interp = interp\n        assert isinstance(fill_value, (Number, Sequence)), \\\n            \"fill value must be either float or sequence\"\n        if isinstance(fill_value, Number):\n            fill_value = (fill_value, ) * 3\n        if not isinstance(fill_value, tuple):\n            fill_value = tuple(fill_value)\n        self.fill_value = fill_value\n\n    def apply_image(self, img, output_size, offset_x, offset_y):\n        th, tw = self.target_size\n        rh, rw = output_size\n        img = cv2.resize(\n            img, (rw, rh), interpolation=self.interp).astype(np.float32)\n        canvas = np.ones([th, tw, 3], dtype=np.float32)\n        canvas *= np.array(self.fill_value, dtype=np.float32)\n        canvas[:min(th, rh), :min(tw, rw)] = \\\n            img[offset_y:offset_y + th, offset_x:offset_x + tw]\n        return canvas\n\n    def apply_bbox(self, gt_bbox, gt_class, scale, offset_x, offset_y):\n        th, tw = self.target_size\n        shift_array = np.array(\n            [\n                offset_x,\n                offset_y,\n            ] * 2, dtype=np.float32)\n        boxes = gt_bbox * scale - shift_array\n        boxes[:, 0::2] = np.clip(boxes[:, 0::2], 0, tw)\n        boxes[:, 1::2] = np.clip(boxes[:, 1::2], 0, th)\n        # filter boxes with no area\n        area = np.prod(boxes[..., 2:] - boxes[..., :2], axis=1)\n        valid = (area > 1.).nonzero()[0]\n        return boxes[valid], gt_class[valid], valid\n\n    def apply_segm(self, segms, output_size, offset_x, offset_y, valid=None):\n        th, tw = self.target_size\n        rh, rw = output_size\n        out_segms = []\n        for segm in segms:\n            segm = cv2.resize(segm, (rw, rh), interpolation=cv2.INTER_NEAREST)\n            segm = segm.astype(np.float32)\n            canvas = np.zeros([th, tw], dtype=segm.dtype)\n            canvas[:min(th, rh), :min(tw, rw)] = \\\n                segm[offset_y:offset_y + th, offset_x:offset_x + tw]\n            out_segms.append(canvas)\n        out_segms = np.stack(out_segms)\n        return out_segms if valid is None else out_segms[valid]\n\n    def apply(self, sample, context=None):\n        img = sample['image']\n        h, w = img.shape[:2]\n        random_scale = 
np.random.uniform(*self.scale_range)\n        target_scale_size = [t * random_scale for t in self.target_size]\n        # Compute actual rescaling applied to image.\n        scale = min(target_scale_size[0] / h, target_scale_size[1] / w)\n        output_size = [int(round(h * scale)), int(round(w * scale))]\n        # get offset\n        offset_x = int(\n            max(0, np.random.uniform(0., output_size[1] - self.target_size[1])))\n        offset_y = int(\n            max(0, np.random.uniform(0., output_size[0] - self.target_size[0])))\n\n        # apply to image\n        sample['image'] = self.apply_image(img, output_size, offset_x, offset_y)\n\n        # apply to bbox\n        valid = None\n        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:\n            sample['gt_bbox'], sample['gt_class'], valid = self.apply_bbox(\n                sample['gt_bbox'], sample['gt_class'], scale, offset_x,\n                offset_y)\n\n        # apply to segm\n        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:\n            sample['gt_segm'] = self.apply_segm(sample['gt_segm'], output_size,\n                                                offset_x, offset_y, valid)\n\n        sample['im_shape'] = np.asarray(output_size, dtype=np.float32)\n        scale_factor = sample['scale_factor']\n        sample['scale_factor'] = np.asarray(\n            [scale_factor[0] * scale, scale_factor[1] * scale],\n            dtype=np.float32)\n\n        return sample\n\n\n@register_op\nclass Cutmix(BaseOperator):\n    def __init__(self, alpha=1.5, beta=1.5):\n        \"\"\"\n        CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features, see https://arxiv.org/abs/1905.04899\n        Cutmix image and gt_bbox/gt_score\n        Args:\n             alpha (float): alpha parameter of the beta distribution\n             beta (float): beta parameter of the beta distribution\n        \"\"\"\n        super(Cutmix, self).__init__()\n        self.alpha = alpha\n        self.beta = beta\n        if self.alpha <= 0.0:\n            raise ValueError(\"alpha should be positive in {}\".format(self))\n        if self.beta <= 0.0:\n            raise ValueError(\"beta should be positive in {}\".format(self))\n\n    def apply_image(self, img1, img2, factor):\n        \"\"\" _rand_bbox \"\"\"\n        h = max(img1.shape[0], img2.shape[0])\n        w = max(img1.shape[1], img2.shape[1])\n        cut_rat = np.sqrt(1. 
- factor)\n\n        cut_w = np.int32(w * cut_rat)\n        cut_h = np.int32(h * cut_rat)\n\n        # uniform\n        cx = np.random.randint(w)\n        cy = np.random.randint(h)\n\n        bbx1 = np.clip(cx - cut_w // 2, 0, w - 1)\n        bby1 = np.clip(cy - cut_h // 2, 0, h - 1)\n        bbx2 = np.clip(cx + cut_w // 2, 0, w - 1)\n        bby2 = np.clip(cy + cut_h // 2, 0, h - 1)\n\n        img_1_pad = np.zeros((h, w, img1.shape[2]), 'float32')\n        img_1_pad[:img1.shape[0], :img1.shape[1], :] = \\\n            img1.astype('float32')\n        img_2_pad = np.zeros((h, w, img2.shape[2]), 'float32')\n        img_2_pad[:img2.shape[0], :img2.shape[1], :] = \\\n            img2.astype('float32')\n        img_1_pad[bby1:bby2, bbx1:bbx2, :] = img_2_pad[bby1:bby2, bbx1:bbx2, :]\n        return img_1_pad\n\n    def __call__(self, sample, context=None):\n        if not isinstance(sample, Sequence):\n            return sample\n\n        assert len(sample) == 2, 'cutmix needs two samples'\n\n        factor = np.random.beta(self.alpha, self.beta)\n        factor = max(0.0, min(1.0, factor))\n        if factor >= 1.0:\n            return sample[0]\n        if factor <= 0.0:\n            return sample[1]\n        img1 = sample[0]['image']\n        img2 = sample[1]['image']\n        img = self.apply_image(img1, img2, factor)\n        gt_bbox1 = sample[0]['gt_bbox']\n        gt_bbox2 = sample[1]['gt_bbox']\n        gt_bbox = np.concatenate((gt_bbox1, gt_bbox2), axis=0)\n        gt_class1 = sample[0]['gt_class']\n        gt_class2 = sample[1]['gt_class']\n        gt_class = np.concatenate((gt_class1, gt_class2), axis=0)\n        gt_score1 = np.ones_like(sample[0]['gt_class'])\n        gt_score2 = np.ones_like(sample[1]['gt_class'])\n        gt_score = np.concatenate(\n            (gt_score1 * factor, gt_score2 * (1. 
- factor)), axis=0)\n        result = copy.deepcopy(sample[0])\n        result['image'] = img\n        result['gt_bbox'] = gt_bbox\n        result['gt_score'] = gt_score\n        result['gt_class'] = gt_class\n        if 'is_crowd' in sample[0]:\n            is_crowd1 = sample[0]['is_crowd']\n            is_crowd2 = sample[1]['is_crowd']\n            is_crowd = np.concatenate((is_crowd1, is_crowd2), axis=0)\n            result['is_crowd'] = is_crowd\n        if 'difficult' in sample[0]:\n            is_difficult1 = sample[0]['difficult']\n            is_difficult2 = sample[1]['difficult']\n            is_difficult = np.concatenate(\n                (is_difficult1, is_difficult2), axis=0)\n            result['difficult'] = is_difficult\n        return result\n\n\n@register_op\nclass Mixup(BaseOperator):\n    def __init__(self, alpha=1.5, beta=1.5):\n        \"\"\" Mixup image and gt_bbox/gt_score\n        Args:\n            alpha (float): alpha parameter of the beta distribution\n            beta (float): beta parameter of the beta distribution\n        \"\"\"\n        super(Mixup, self).__init__()\n        self.alpha = alpha\n        self.beta = beta\n        if self.alpha <= 0.0:\n            raise ValueError(\"alpha should be positive in {}\".format(self))\n        if self.beta <= 0.0:\n            raise ValueError(\"beta should be positive in {}\".format(self))\n\n    def apply_image(self, img1, img2, factor):\n        h = max(img1.shape[0], img2.shape[0])\n        w = max(img1.shape[1], img2.shape[1])\n        img = np.zeros((h, w, img1.shape[2]), 'float32')\n        img[:img1.shape[0], :img1.shape[1], :] = \\\n            img1.astype('float32') * factor\n        img[:img2.shape[0], :img2.shape[1], :] += \\\n            img2.astype('float32') * (1.0 - factor)\n        return img.astype('uint8')\n\n    def __call__(self, sample, context=None):\n        if not isinstance(sample, Sequence):\n            return sample\n\n        assert len(sample) == 2, 'mixup needs two samples'\n\n        factor = np.random.beta(self.alpha, self.beta)\n        factor = max(0.0, min(1.0, factor))\n        if factor >= 1.0:\n            return sample[0]\n        if factor <= 0.0:\n            return sample[1]\n        im = self.apply_image(sample[0]['image'], sample[1]['image'], factor)\n        result = copy.deepcopy(sample[0])\n        result['image'] = im\n        # apply bbox and score\n        if 'gt_bbox' in sample[0]:\n            gt_bbox1 = sample[0]['gt_bbox']\n            gt_bbox2 = sample[1]['gt_bbox']\n            gt_bbox = np.concatenate((gt_bbox1, gt_bbox2), axis=0)\n            result['gt_bbox'] = gt_bbox\n        if 'gt_class' in sample[0]:\n            gt_class1 = sample[0]['gt_class']\n            gt_class2 = sample[1]['gt_class']\n            gt_class = np.concatenate((gt_class1, gt_class2), axis=0)\n            result['gt_class'] = gt_class\n\n            gt_score1 = np.ones_like(sample[0]['gt_class'])\n            gt_score2 = np.ones_like(sample[1]['gt_class'])\n            gt_score = np.concatenate(\n                (gt_score1 * factor, gt_score2 * (1. - factor)), axis=0)\n            
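# Soft labels: each sample's scores are weighted by its blend factor,\n            # mirroring the pixel-level blend of the two images.\n            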
result['gt_score'] = gt_score.astype('float32')\n        if 'is_crowd' in sample[0]:\n            is_crowd1 = sample[0]['is_crowd']\n            is_crowd2 = sample[1]['is_crowd']\n            is_crowd = np.concatenate((is_crowd1, is_crowd2), axis=0)\n            result['is_crowd'] = is_crowd\n        if 'difficult' in sample[0]:\n            is_difficult1 = sample[0]['difficult']\n            is_difficult2 = sample[1]['difficult']\n            is_difficult = np.concatenate(\n                (is_difficult1, is_difficult2), axis=0)\n            result['difficult'] = is_difficult\n\n        if 'gt_ide' in sample[0]:\n            gt_ide1 = sample[0]['gt_ide']\n            gt_ide2 = sample[1]['gt_ide']\n            gt_ide = np.concatenate((gt_ide1, gt_ide2), axis=0)\n            result['gt_ide'] = gt_ide\n        return result\n\n\n@register_op\nclass NormalizeBox(BaseOperator):\n    \"\"\"Transform the bounding box's coordinates to [0,1].\"\"\"\n\n    def __init__(self, retain_origin_box=False):\n        super(NormalizeBox, self).__init__()\n        self.retain_origin_box = retain_origin_box\n\n    def apply(self, sample, context):\n        im = sample['image']\n        if 'gt_bbox' in sample.keys():\n            if self.retain_origin_box:\n                sample['origin_gt_bbox'] = sample['gt_bbox'].copy()\n                sample['origin_gt_class'] = sample['gt_class'].copy()\n\n            gt_bbox = sample['gt_bbox']\n            height, width, _ = im.shape\n            for i in range(gt_bbox.shape[0]):\n                gt_bbox[i][0] = gt_bbox[i][0] / width\n                gt_bbox[i][1] = gt_bbox[i][1] / height\n                gt_bbox[i][2] = gt_bbox[i][2] / width\n                gt_bbox[i][3] = gt_bbox[i][3] / height\n            sample['gt_bbox'] = gt_bbox\n\n            if 'gt_keypoint' in sample.keys():\n                gt_keypoint = sample['gt_keypoint']\n\n                for i in range(gt_keypoint.shape[1]):\n                    if i % 2:\n                        gt_keypoint[:, i] = gt_keypoint[:, i] / height\n                    else:\n                        gt_keypoint[:, i] = gt_keypoint[:, i] / width\n                sample['gt_keypoint'] = gt_keypoint\n\n            return sample\n        else:\n            return sample\n\n\n@register_op\nclass BboxXYXY2XYWH(BaseOperator):\n    \"\"\"\n    Convert bbox from XYXY format to XYWH (center_x, center_y, w, h) format.\n    \"\"\"\n\n    def __init__(self):\n        super(BboxXYXY2XYWH, self).__init__()\n\n    def apply(self, sample, context=None):\n        if 'gt_bbox' in sample.keys():\n            bbox = sample['gt_bbox']\n            bbox[:, 2:4] = bbox[:, 2:4] - bbox[:, :2]\n            bbox[:, :2] = bbox[:, :2] + bbox[:, 2:4] / 2.\n            sample['gt_bbox'] = bbox\n            return sample\n        else:\n            return sample\n\n\n@register_op\nclass PadBox(BaseOperator):\n    def __init__(self, num_max_boxes=50):\n        \"\"\"\n        Pad zeros to bboxes if number of bboxes is less than num_max_boxes.\n        Args:\n            num_max_boxes (int): the max number of bboxes\n        \"\"\"\n        self.num_max_boxes = num_max_boxes\n        super(PadBox, self).__init__()\n\n    def apply(self, sample, context=None):\n        assert 'gt_bbox' in sample\n        bbox = sample['gt_bbox']\n        gt_num = min(self.num_max_boxes, len(bbox))\n        num_max = self.num_max_boxes\n        # fields = context['fields'] if context else []\n        pad_bbox = np.zeros((num_max, 4), dtype=np.float32)\n        if gt_num > 0:\n            
pad_bbox[:gt_num, :] = bbox[:gt_num, :]\n        sample['gt_bbox'] = pad_bbox\n        if 'gt_class' in sample:\n            pad_class = np.zeros((num_max, ), dtype=np.int32)\n            if gt_num > 0:\n                pad_class[:gt_num] = sample['gt_class'][:gt_num, 0]\n            sample['gt_class'] = pad_class\n        if 'gt_score' in sample:\n            pad_score = np.zeros((num_max, ), dtype=np.float32)\n            if gt_num > 0:\n                pad_score[:gt_num] = sample['gt_score'][:gt_num, 0]\n            sample['gt_score'] = pad_score\n        # in training, ops such as ExpandImage expand gt_bbox and gt_class\n        # but not difficult, so judge difficult by its own length here\n        if 'difficult' in sample:\n            pad_diff = np.zeros((num_max, ), dtype=np.int32)\n            if gt_num > 0:\n                pad_diff[:gt_num] = sample['difficult'][:gt_num, 0]\n            sample['difficult'] = pad_diff\n        if 'is_crowd' in sample:\n            pad_crowd = np.zeros((num_max, ), dtype=np.int32)\n            if gt_num > 0:\n                pad_crowd[:gt_num] = sample['is_crowd'][:gt_num, 0]\n            sample['is_crowd'] = pad_crowd\n        if 'gt_ide' in sample:\n            pad_ide = np.zeros((num_max, ), dtype=np.int32)\n            if gt_num > 0:\n                pad_ide[:gt_num] = sample['gt_ide'][:gt_num, 0]\n            sample['gt_ide'] = pad_ide\n        return sample\n\n\n@register_op\nclass DebugVisibleImage(BaseOperator):\n    \"\"\"\n    In debug mode, visualize images according to `gt_bbox`.\n    (Currently only supported when not cropping and flipping image.)\n    \"\"\"\n\n    def __init__(self, output_dir='output/debug', is_normalized=False):\n        super(DebugVisibleImage, self).__init__()\n        self.is_normalized = is_normalized\n        self.output_dir = output_dir\n        if not os.path.isdir(output_dir):\n            os.makedirs(output_dir)\n        if not isinstance(self.is_normalized, bool):\n            raise TypeError(\"{}: input type is invalid.\".format(self))\n\n    def apply(self, sample, context=None):\n        image = Image.fromarray(sample['image'].astype(np.uint8))\n        out_file_name = '{:012d}.jpg'.format(sample['im_id'][0])\n        width = sample['w']\n        height = sample['h']\n        gt_bbox = sample['gt_bbox']\n        gt_class = sample['gt_class']\n        draw = ImageDraw.Draw(image)\n        for i in range(gt_bbox.shape[0]):\n            if self.is_normalized:\n                gt_bbox[i][0] = gt_bbox[i][0] * width\n                gt_bbox[i][1] = gt_bbox[i][1] * height\n                gt_bbox[i][2] = gt_bbox[i][2] * width\n                gt_bbox[i][3] = gt_bbox[i][3] * height\n\n            xmin, ymin, xmax, ymax = gt_bbox[i]\n            draw.line(\n                [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin),\n                 (xmin, ymin)],\n                width=2,\n                fill='green')\n            # draw label\n            text = str(gt_class[i][0])\n            tw, th = imagedraw_textsize_c(draw, text)\n            draw.rectangle(\n                [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green')\n            draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))\n\n        if 'gt_keypoint' in sample.keys():\n            gt_keypoint = sample['gt_keypoint']\n            if self.is_normalized:\n                for i in range(gt_keypoint.shape[1]):\n                    if i % 2:\n                        gt_keypoint[:, i] = 
gt_keypoint[:, i] * height\n                    else:\n                        gt_keypoint[:, i] = gt_keypoint[:, i] * width\n            for i in range(gt_keypoint.shape[0]):\n                keypoint = gt_keypoint[i]\n                for j in range(keypoint.shape[0] // 2):\n                    x1 = int(round(keypoint[2 * j]))\n                    y1 = int(round(keypoint[2 * j + 1]))\n                    draw.ellipse(\n                        (x1, y1, x1 + 5, y1 + 5), fill='green', outline='green')\n        save_path = os.path.join(self.output_dir, out_file_name)\n        image.save(save_path, quality=95)\n        return sample\n\n\n@register_op\nclass Pad(BaseOperator):\n    def __init__(self,\n                 size=None,\n                 size_divisor=32,\n                 pad_mode=0,\n                 offsets=None,\n                 fill_value=(127.5, 127.5, 127.5)):\n        \"\"\"\n        Pad image to a specified size or to a multiple of size_divisor.\n        Args:\n            size (int, Sequence): image target size; if None, pad to a multiple of size_divisor. Default: None\n            size_divisor (int): size divisor. Default: 32\n            pad_mode (int): pad mode, currently only supports four modes [-1, 0, 1, 2]. If -1, use the specified offsets;\n                if 0, only pad to the right and bottom; if 1, pad according to center; if 2, only pad to the left and top\n            offsets (list): [offset_x, offset_y], the offset to use while padding; only supported when pad_mode=-1\n            fill_value (Sequence[float]): RGB value of the padded area. Default: (127.5, 127.5, 127.5)\n        \"\"\"\n        super(Pad, self).__init__()\n\n        # size=None is allowed and means padding to a multiple of size_divisor\n        if size is not None and not isinstance(size, (int, Sequence)):\n            raise TypeError(\n                \"Type of size is invalid. 
Must be int or Sequence, now is {}\".format(type(size)))\n\n        if isinstance(size, int):\n            size = [size, size]\n\n        assert pad_mode in [\n            -1, 0, 1, 2\n        ], 'currently only supports four modes [-1, 0, 1, 2]'\n        if pad_mode == -1:\n            assert offsets, 'if pad_mode is -1, offsets should not be None'\n\n        self.size = size\n        self.size_divisor = size_divisor\n        self.pad_mode = pad_mode\n        self.fill_value = fill_value\n        self.offsets = offsets\n\n    def apply_segm(self, segms, offsets, im_size, size):\n        def _expand_poly(poly, x, y):\n            expanded_poly = np.array(poly)\n            expanded_poly[0::2] += x\n            expanded_poly[1::2] += y\n            return expanded_poly.tolist()\n\n        def _expand_rle(rle, x, y, height, width, h, w):\n            if 'counts' in rle and type(rle['counts']) == list:\n                rle = mask_util.frPyObjects(rle, height, width)\n            mask = mask_util.decode(rle)\n            expanded_mask = np.full((h, w), 0).astype(mask.dtype)\n            expanded_mask[y:y + height, x:x + width] = mask\n            rle = mask_util.encode(\n                np.array(\n                    expanded_mask, order='F', dtype=np.uint8))\n            return rle\n\n        x, y = offsets\n        height, width = im_size\n        h, w = size\n        expanded_segms = []\n        for segm in segms:\n            if is_poly(segm):\n                # Polygon format\n                expanded_segms.append(\n                    [_expand_poly(poly, x, y) for poly in segm])\n            else:\n                # RLE format\n                import pycocotools.mask as mask_util\n                expanded_segms.append(\n                    _expand_rle(segm, x, y, height, width, h, w))\n        return expanded_segms\n\n    def apply_bbox(self, bbox, offsets):\n        return bbox + np.array(offsets * 2, dtype=np.float32)\n\n    def apply_keypoint(self, keypoints, offsets):\n        n = len(keypoints[0]) // 2\n        return keypoints + np.array(offsets * n, dtype=np.float32)\n\n    def apply_image(self, image, offsets, im_size, size):\n        x, y = offsets\n        im_h, im_w = im_size\n        h, w = size\n        canvas = np.ones((h, w, 3), dtype=np.float32)\n        canvas *= np.array(self.fill_value, dtype=np.float32)\n        canvas[y:y + im_h, x:x + im_w, :] = image.astype(np.float32)\n        return canvas\n\n    def apply(self, sample, context=None):\n        im = sample['image']\n        im_h, im_w = im.shape[:2]\n        if self.size:\n            h, w = self.size\n            assert (\n                im_h <= h and im_w <= w\n            ), '(h, w) of target size should be no smaller than (im_h, im_w)'\n        else:\n            h = int(np.ceil(im_h / self.size_divisor) * self.size_divisor)\n            w = int(np.ceil(im_w / self.size_divisor) * self.size_divisor)\n\n        if h == im_h and w == im_w:\n            sample['image'] = im.astype(np.float32)\n            return sample\n\n        if self.pad_mode == -1:\n            offset_x, offset_y = self.offsets\n        elif self.pad_mode == 0:\n            offset_y, offset_x = 0, 0\n        elif self.pad_mode == 1:\n            offset_y, offset_x = (h - im_h) // 2, (w - im_w) // 2\n        else:\n            offset_y, offset_x = h - im_h, w - im_w\n\n        offsets, im_size, size = [offset_x, offset_y], [im_h, im_w], [h, w]\n\n        sample['image'] = self.apply_image(im, offsets, im_size, size)
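\n\n        # pad_mode 0 pads only to the right and bottom, so the offsets are\n        # (0, 0) and all annotation coordinates stay unchanged\n        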
if self.pad_mode == 0:\n            return sample\n        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:\n            sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], offsets)\n\n        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:\n            sample['gt_poly'] = self.apply_segm(sample['gt_poly'], offsets,\n                                                im_size, size)\n\n        if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0:\n            sample['gt_keypoint'] = self.apply_keypoint(sample['gt_keypoint'],\n                                                        offsets)\n\n        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:\n            masks = [\n                cv2.copyMakeBorder(\n                    gt_segm,\n                    offset_y, h - (offset_y + im_h),\n                    offset_x, w - (offset_x + im_w),\n                    borderType=cv2.BORDER_CONSTANT,\n                    value=0)\n                for gt_segm in sample['gt_segm']\n            ]\n            sample['gt_segm'] = np.asarray(masks, dtype=np.uint8)\n\n        return sample\n\n\n@register_op\nclass Poly2Mask(BaseOperator):\n    \"\"\"\n    Convert gt polygons to mask annotations.\n    Args:\n        del_poly (bool): Whether to delete the polygons after generating masks. Default: False.\n    \"\"\"\n\n    def __init__(self, del_poly=False):\n        super(Poly2Mask, self).__init__()\n        import pycocotools.mask as maskUtils\n        self.maskutils = maskUtils\n        self.del_poly = del_poly\n\n    def _poly2mask(self, mask_ann, img_h, img_w):\n        if isinstance(mask_ann, list):\n            # polygon -- a single object might consist of multiple parts\n            # we merge all parts into one mask rle code\n            rles = self.maskutils.frPyObjects(mask_ann, img_h, img_w)\n            rle = self.maskutils.merge(rles)\n        elif isinstance(mask_ann['counts'], list):\n            # uncompressed RLE\n            rle = self.maskutils.frPyObjects(mask_ann, img_h, img_w)\n        else:\n            # rle\n            rle = mask_ann\n        mask = self.maskutils.decode(rle)\n        return mask\n\n    def apply(self, sample, context=None):\n        assert 'gt_poly' in sample\n        im_h, im_w = sample['im_shape']\n        masks = [\n            self._poly2mask(gt_poly, im_h, im_w)\n            for gt_poly in sample['gt_poly']\n        ]\n        sample['gt_segm'] = np.asarray(masks).astype(np.uint8)\n        if self.del_poly:\n            del sample['gt_poly']\n\n        return sample\n\n\n@register_op\nclass AugmentHSV(BaseOperator):\n    \"\"\"\n    Augment the image in HSV color space. By default the S and V channels are\n    scaled by a random factor controlled by `fraction`; if hgain/sgain/vgain\n    are given, random gains are applied to H, S and V instead and `fraction`\n    is ignored.\n    Args:\n        fraction (float): the fraction for augment. Default: 0.5.\n        is_bgr (bool): whether the image is BGR mode. 
Default: True.\n        hgain (float): H channel gain\n        sgain (float): S channel gain\n        vgain (float): V channel gain\n    \"\"\"\n\n    def __init__(self,\n                 fraction=0.50,\n                 is_bgr=True,\n                 hgain=None,\n                 sgain=None,\n                 vgain=None):\n        super(AugmentHSV, self).__init__()\n        self.fraction = fraction\n        self.is_bgr = is_bgr\n        self.hgain = hgain\n        self.sgain = sgain\n        self.vgain = vgain\n        self.use_hsvgain = hgain is not None\n\n    def apply(self, sample, context=None):\n        img = sample['image']\n        if self.is_bgr:\n            img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)\n        else:\n            img_hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)\n\n        if self.use_hsvgain:\n            hsv_augs = np.random.uniform(\n                -1, 1, 3) * [self.hgain, self.sgain, self.vgain]\n            # random selection of h, s, v\n            hsv_augs *= np.random.randint(0, 2, 3)\n            img_hsv[..., 0] = (img_hsv[..., 0] + hsv_augs[0]) % 180\n            img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_augs[1], 0, 255)\n            img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_augs[2], 0, 255)\n\n        else:\n            S = img_hsv[:, :, 1].astype(np.float32)\n            V = img_hsv[:, :, 2].astype(np.float32)\n\n            a = (random.random() * 2 - 1) * self.fraction + 1\n            S *= a\n            if a > 1:\n                np.clip(S, a_min=0, a_max=255, out=S)\n\n            a = (random.random() * 2 - 1) * self.fraction + 1\n            V *= a\n            if a > 1:\n                np.clip(V, a_min=0, a_max=255, out=V)\n\n            img_hsv[:, :, 1] = S.astype(np.uint8)\n            img_hsv[:, :, 2] = V.astype(np.uint8)\n\n        if self.is_bgr:\n            cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img)\n        else:\n            cv2.cvtColor(img_hsv, cv2.COLOR_HSV2RGB, dst=img)\n\n        sample['image'] = img.astype(np.float32)\n        return sample\n\n\n@register_op\nclass Norm2PixelBbox(BaseOperator):\n    \"\"\"\n    Transform the bounding box's coordinates, which are in [0, 1], to pixels.\n    \"\"\"\n\n    def __init__(self):\n        super(Norm2PixelBbox, self).__init__()\n\n    def apply(self, sample, context=None):\n        assert 'gt_bbox' in sample\n        bbox = sample['gt_bbox']\n        height, width = sample['image'].shape[:2]\n        bbox[:, 0::2] = bbox[:, 0::2] * width\n        bbox[:, 1::2] = bbox[:, 1::2] * height\n        sample['gt_bbox'] = bbox\n        return sample\n\n\n@register_op\nclass BboxCXCYWH2XYXY(BaseOperator):\n    \"\"\"\n    Convert bbox CXCYWH format to XYXY format.\n    [center_x, center_y, width, height] -> [x0, y0, x1, y1]\n    \"\"\"\n\n    def __init__(self):\n        super(BboxCXCYWH2XYXY, self).__init__()\n\n    def apply(self, sample, context=None):\n        assert 'gt_bbox' in sample\n        bbox0 = sample['gt_bbox']\n        bbox = bbox0.copy()\n\n        bbox[:, :2] = bbox0[:, :2] - bbox0[:, 2:4] / 2.\n        bbox[:, 2:4] = bbox0[:, :2] + bbox0[:, 2:4] / 2.\n        sample['gt_bbox'] = bbox\n        return sample\n\n\n@register_op\nclass RandomResizeCrop(BaseOperator):\n    \"\"\"Random resize and crop image and bboxes.\n    Args:\n        resizes (list): resize the image to one of the sizes in `resizes`. 
If keep_ratio is True and mode is\n            'long', the image's long side is resized to the maximum of the target\n            size; if keep_ratio is True and mode is 'short', the image's short\n            side is resized to the minimum of the target size.\n        cropsizes (list): crop sizes after resize, [(min_crop_1, max_crop_1), ...]\n        mode (str): resize mode, `long` or `short`. See `resizes` for details.\n        prob (float): probability of applying this op.\n        keep_ratio (bool): whether to keep the aspect ratio. Default: True\n        interp (int): the interpolation method\n        thresholds (list): IoU thresholds used to decide whether a bbox crop is valid.\n        num_attempts (int): number of tries before giving up.\n        allow_no_crop (bool): allow returning without actually cropping.\n        cover_all_box (bool): ensure all bboxes are covered in the final crop.\n        is_mask_crop (bool): whether to crop the segmentation masks.\n        ioumode (str): overlap measure used to validate a crop, 'iou' or 'iof'.\n    \"\"\"\n\n    def __init__(self,\n                 resizes,\n                 cropsizes,\n                 prob=0.5,\n                 mode='short',\n                 keep_ratio=True,\n                 interp=cv2.INTER_LINEAR,\n                 num_attempts=3,\n                 cover_all_box=False,\n                 allow_no_crop=False,\n                 thresholds=[0.3, 0.5, 0.7],\n                 is_mask_crop=False,\n                 ioumode=\"iou\"):\n        super(RandomResizeCrop, self).__init__()\n\n        assert ioumode in ('iou', 'iof'), \"ioumode must be 'iou' or 'iof'\"\n        self.resizes = resizes\n        self.cropsizes = cropsizes\n        self.prob = prob\n        self.mode = mode\n        self.ioumode = ioumode\n\n        self.resizer = Resize(0, keep_ratio=keep_ratio, interp=interp)\n        self.cropper = RandomCrop(\n            num_attempts=num_attempts,\n            cover_all_box=cover_all_box,\n            thresholds=thresholds,\n            allow_no_crop=allow_no_crop,\n            is_mask_crop=is_mask_crop)\n\n    def _format_size(self, size):\n        if isinstance(size, Integral):\n            size = (size, size)\n        return size\n\n    def apply(self, sample, context=None):\n        if random.random() < self.prob:\n            _resize = self._format_size(random.choice(self.resizes))\n            _cropsize = self._format_size(random.choice(self.cropsizes))\n            sample = self._resize(\n                self.resizer,\n                sample,\n                size=_resize,\n                mode=self.mode,\n                context=context)\n            sample = self._random_crop(\n                self.cropper, sample, size=_cropsize, context=context)\n        return sample\n\n    @staticmethod\n    def _random_crop(cropper, sample, size, context=None):\n        if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0:\n            return sample\n\n        # reuse the RandomCrop instance's helpers below\n        self = cropper\n        h, w = sample['image'].shape[:2]\n        gt_bbox = sample['gt_bbox']\n        cropsize = size\n        min_crop = min(cropsize)\n        max_crop = max(cropsize)\n\n        thresholds = list(self.thresholds)\n        np.random.shuffle(thresholds)\n\n        for thresh in thresholds:\n            found = False\n            for _ in range(self.num_attempts):\n\n                crop_h = random.randint(min_crop, min(h, max_crop))\n                crop_w = random.randint(min_crop, min(w, max_crop))\n\n                crop_y = random.randint(0, h - crop_h)\n                crop_x = random.randint(0, w - crop_w)\n\n                crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h]\n
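                # score the candidate crop against every gt box with the\n                # configured overlap measure ('iof' via _gtcropiou_matrix,\n                # or plain 'iou' via _iou_matrix)\n                if self.ioumode == \"iof\":\n                    iou = 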
self._gtcropiou_matrix(\n                        gt_bbox, np.array(\n                            [crop_box], dtype=np.float32))\n                elif self.ioumode == \"iou\":\n                    iou = self._iou_matrix(\n                        gt_bbox, np.array(\n                            [crop_box], dtype=np.float32))\n                if iou.max() < thresh:\n                    continue\n\n                if self.cover_all_box and iou.min() < thresh:\n                    continue\n\n                cropped_box, valid_ids = self._crop_box_with_center_constraint(\n                    gt_bbox, np.array(\n                        crop_box, dtype=np.float32))\n                if valid_ids.size > 0:\n                    found = True\n                    break\n\n            if found:\n                if self.is_mask_crop and 'gt_poly' in sample and len(sample[\n                        'gt_poly']) > 0:\n                    crop_polys = self.crop_segms(\n                        sample['gt_poly'],\n                        valid_ids,\n                        np.array(\n                            crop_box, dtype=np.int64),\n                        h,\n                        w)\n                    if [] in crop_polys:\n                        delete_id = list()\n                        valid_polys = list()\n                        for idx, crop_poly in enumerate(crop_polys):\n                            if crop_poly == []:\n                                delete_id.append(idx)\n                            else:\n                                valid_polys.append(crop_poly)\n                        valid_ids = np.delete(valid_ids, delete_id)\n                        if len(valid_polys) == 0:\n                            return sample\n                        sample['gt_poly'] = valid_polys\n                    else:\n                        sample['gt_poly'] = crop_polys\n\n                if 'gt_segm' in sample:\n                    sample['gt_segm'] = self._crop_segm(sample['gt_segm'],\n                                                        crop_box)\n                    sample['gt_segm'] = np.take(\n                        sample['gt_segm'], valid_ids, axis=0)\n\n                sample['image'] = self._crop_image(sample['image'], crop_box)\n                sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0)\n                sample['gt_class'] = np.take(\n                    sample['gt_class'], valid_ids, axis=0)\n                if 'gt_score' in sample:\n                    sample['gt_score'] = np.take(\n                        sample['gt_score'], valid_ids, axis=0)\n\n                if 'is_crowd' in sample:\n                    sample['is_crowd'] = np.take(\n                        sample['is_crowd'], valid_ids, axis=0)\n\n                if 'gt_areas' in sample:\n                    sample['gt_areas'] = np.take(\n                        sample['gt_areas'], valid_ids, axis=0)\n\n                if 'gt_joints' in sample:\n                    gt_joints = self._crop_joints(sample['gt_joints'], crop_box)\n                    sample['gt_joints'] = gt_joints[valid_ids]\n                return sample\n\n        return sample\n\n    @staticmethod\n    def _resize(resizer, sample, size, mode='short', context=None):\n        self = resizer\n        im = sample['image']\n        target_size = size\n\n        if not isinstance(im, np.ndarray):\n            raise TypeError(\"{}: image type is not numpy.\".format(self))\n        if len(im.shape) != 3:\n            raise ImageError('{}: image is not 
3-dimensional.'.format(self))\n\n        # apply image\n        im_shape = im.shape\n        if self.keep_ratio:\n\n            im_size_min = np.min(im_shape[0:2])\n            im_size_max = np.max(im_shape[0:2])\n\n            target_size_min = np.min(target_size)\n            target_size_max = np.max(target_size)\n\n            if mode == 'long':\n                im_scale = min(target_size_min / im_size_min,\n                               target_size_max / im_size_max)\n            else:\n                im_scale = max(target_size_min / im_size_min,\n                               target_size_max / im_size_max)\n\n            resize_h = int(im_scale * float(im_shape[0]) + 0.5)\n            resize_w = int(im_scale * float(im_shape[1]) + 0.5)\n\n            im_scale_x = im_scale\n            im_scale_y = im_scale\n        else:\n            resize_h, resize_w = target_size\n            im_scale_y = resize_h / im_shape[0]\n            im_scale_x = resize_w / im_shape[1]\n\n        im = self.apply_image(sample['image'], [im_scale_x, im_scale_y])\n        sample['image'] = im\n        sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32)\n        if 'scale_factor' in sample:\n            scale_factor = sample['scale_factor']\n            sample['scale_factor'] = np.asarray(\n                [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],\n                dtype=np.float32)\n        else:\n            sample['scale_factor'] = np.asarray(\n                [im_scale_y, im_scale_x], dtype=np.float32)\n\n        # apply bbox\n        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:\n            sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'],\n                                                [im_scale_x, im_scale_y],\n                                                [resize_w, resize_h])\n\n        # apply polygon\n        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:\n            sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2],\n                                                [im_scale_x, im_scale_y])\n\n        # apply semantic\n        if 'semantic' in sample and sample['semantic'] is not None:\n            semantic = sample['semantic']\n            semantic = cv2.resize(\n                semantic.astype('float32'),\n                None,\n                None,\n                fx=im_scale_x,\n                fy=im_scale_y,\n                interpolation=self.interp)\n            semantic = np.asarray(semantic).astype('int32')\n            semantic = np.expand_dims(semantic, 0)\n            sample['semantic'] = semantic\n\n        # apply gt_segm\n        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:\n            masks = [\n                cv2.resize(\n                    gt_segm,\n                    None,\n                    None,\n                    fx=im_scale_x,\n                    fy=im_scale_y,\n                    interpolation=cv2.INTER_NEAREST)\n                for gt_segm in sample['gt_segm']\n            ]\n            sample['gt_segm'] = np.asarray(masks).astype(np.uint8)\n\n        if 'gt_joints' in sample:\n            sample['gt_joints'] = self.apply_joints(sample['gt_joints'],\n                                                    [im_scale_x, im_scale_y],\n                                                    [resize_w, resize_h])\n\n        return sample\n\n\n@register_op\nclass RandomSelect(BaseOperator):\n    \"\"\"\n    Randomly choose a transformation between transforms1 and transforms2,\n    and the 
probability of choosing transforms1 is p.\n\n    The code is based on https://github.com/facebookresearch/detr/blob/main/datasets/transforms.py\n\n    \"\"\"\n\n    def __init__(self, transforms1, transforms2, p=0.5):\n        super(RandomSelect, self).__init__()\n        self.transforms1 = Compose(transforms1)\n        self.transforms2 = Compose(transforms2)\n        self.p = p\n\n    def apply(self, sample, context=None):\n        if random.random() < self.p:\n            return self.transforms1(sample)\n        return self.transforms2(sample)\n\n\n@register_op\nclass RandomSelects(BaseOperator):\n    \"\"\"\n    Randomly choose one transformation from transforms_list. If p is given, it\n    holds the cumulative probability thresholds for picking each entry (its\n    last value should be 1); otherwise every entry is equally likely.\n\n    The code is based on https://github.com/facebookresearch/detr/blob/main/datasets/transforms.py\n\n    \"\"\"\n\n    def __init__(self, transforms_list, p=None):\n        super(RandomSelects, self).__init__()\n        if p is not None:\n            assert isinstance(p, (list, tuple))\n            assert len(transforms_list) == len(p)\n        else:\n            assert len(transforms_list) > 0\n        self.transforms = [Compose(t) for t in transforms_list]\n        self.p = p\n\n    def apply(self, sample, context=None):\n        if self.p is None:\n            return random.choice(self.transforms)(sample)\n        else:\n            prob = random.random()\n            # self.p is treated as cumulative thresholds, so the first entry\n            # whose threshold is >= prob is chosen\n            for p, t in zip(self.p, self.transforms):\n                if prob <= p:\n                    return t(sample)\n\n\n@register_op\nclass RandomShortSideResize(BaseOperator):\n    def __init__(self,\n                 short_side_sizes,\n                 max_size=None,\n                 interp=cv2.INTER_LINEAR,\n                 random_interp=False):\n        \"\"\"\n        Resize the image randomly according to the short side. If max_size is not None,\n        the long side is scaled according to max_size. 
The whole process keeps the aspect ratio.\n        Args:\n            short_side_sizes (list|tuple): Image target short side size.\n            max_size (int): The size of the longest side of image after resize.\n            interp (int): The interpolation method.\n            random_interp (bool): Whether to randomly select the interpolation method.\n        \"\"\"\n        super(RandomShortSideResize, self).__init__()\n\n        assert isinstance(short_side_sizes,\n                          Sequence), \"short_side_sizes must be List or Tuple\"\n\n        self.short_side_sizes = short_side_sizes\n        self.max_size = max_size\n        self.interp = interp\n        self.random_interp = random_interp\n        self.interps = [\n            cv2.INTER_NEAREST,\n            cv2.INTER_LINEAR,\n            cv2.INTER_AREA,\n            cv2.INTER_CUBIC,\n            cv2.INTER_LANCZOS4,\n        ]\n\n    def get_size_with_aspect_ratio(self, image_shape, size, max_size=None):\n        h, w = image_shape\n        max_clip = False\n        if max_size is not None:\n            min_original_size = float(min((w, h)))\n            max_original_size = float(max((w, h)))\n            if max_original_size / min_original_size * size > max_size:\n                size = int(max_size * min_original_size / max_original_size)\n                max_clip = True\n\n        if (w <= h and w == size) or (h <= w and h == size):\n            return (w, h)\n\n        if w < h:\n            ow = size\n            oh = int(round(size * h / w)) if not max_clip else max_size\n        else:\n            oh = size\n            ow = int(round(size * w / h)) if not max_clip else max_size\n\n        return (ow, oh)\n\n    def resize(self,\n               sample,\n               target_size,\n               max_size=None,\n               interp=cv2.INTER_LINEAR):\n        im = sample['image']\n        if not isinstance(im, np.ndarray):\n            raise TypeError(\"{}: image type is not numpy.\".format(self))\n        if len(im.shape) != 3:\n            raise ImageError('{}: image is not 3-dimensional.'.format(self))\n\n        target_size = self.get_size_with_aspect_ratio(im.shape[:2], target_size,\n                                                      max_size)\n        im_scale_y, im_scale_x = target_size[1] / im.shape[0], target_size[\n            0] / im.shape[1]\n\n        sample['image'] = cv2.resize(im, target_size, interpolation=interp)\n        sample['im_shape'] = np.asarray(target_size[::-1], dtype=np.float32)\n        if 'scale_factor' in sample:\n            scale_factor = sample['scale_factor']\n            sample['scale_factor'] = np.asarray(\n                [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],\n                dtype=np.float32)\n        else:\n            sample['scale_factor'] = np.asarray(\n                [im_scale_y, im_scale_x], dtype=np.float32)\n\n        # apply bbox\n        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:\n            sample['gt_bbox'] = self.apply_bbox(\n                sample['gt_bbox'], [im_scale_x, im_scale_y], target_size)\n        # apply polygon\n        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:\n            sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im.shape[:2],\n                                                [im_scale_x, im_scale_y])\n        # apply semantic\n        if 'semantic' in sample and sample['semantic'] is not None:\n            semantic = sample['semantic']\n
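            # cv2.resize takes dsize as (width, height), matching the\n            # (ow, oh) tuple returned by get_size_with_aspect_ratio\n            semantic = cv2.resize(\n                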
semantic.astype('float32'),\n                target_size,\n                interpolation=self.interp)\n            semantic = np.asarray(semantic).astype('int32')\n            semantic = np.expand_dims(semantic, 0)\n            sample['semantic'] = semantic\n        # apply gt_segm\n        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:\n            masks = [\n                cv2.resize(\n                    gt_segm, target_size, interpolation=cv2.INTER_NEAREST)\n                for gt_segm in sample['gt_segm']\n            ]\n            sample['gt_segm'] = np.asarray(masks).astype(np.uint8)\n\n        if 'gt_joints' in sample:\n            sample['gt_joints'] = self.apply_joints(\n                sample['gt_joints'], [im_scale_x, im_scale_y], target_size)\n\n        # apply areas\n        if 'gt_areas' in sample:\n            sample['gt_areas'] = self.apply_area(sample['gt_areas'],\n                                                 [im_scale_x, im_scale_y])\n\n        return sample\n\n    def apply_bbox(self, bbox, scale, size):\n        im_scale_x, im_scale_y = scale\n        resize_w, resize_h = size\n        bbox[:, 0::2] *= im_scale_x\n        bbox[:, 1::2] *= im_scale_y\n        bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w)\n        bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h)\n        return bbox.astype('float32')\n\n    def apply_joints(self, joints, scale, size):\n        im_scale_x, im_scale_y = scale\n        resize_w, resize_h = size\n        joints[..., 0] *= im_scale_x\n        joints[..., 1] *= im_scale_y\n        # joints[joints[..., 0] >= resize_w, :] = 0\n        # joints[joints[..., 1] >= resize_h, :] = 0\n        # joints[joints[..., 0] < 0, :] = 0\n        # joints[joints[..., 1] < 0, :] = 0\n        joints[..., 0] = np.clip(joints[..., 0], 0, resize_w)\n        joints[..., 1] = np.clip(joints[..., 1], 0, resize_h)\n        return joints\n\n    def apply_area(self, area, scale):\n        im_scale_x, im_scale_y = scale\n        return area * im_scale_x * im_scale_y\n\n    def apply_segm(self, segms, im_size, scale):\n        def _resize_poly(poly, im_scale_x, im_scale_y):\n            resized_poly = np.array(poly).astype('float32')\n            resized_poly[0::2] *= im_scale_x\n            resized_poly[1::2] *= im_scale_y\n            return resized_poly.tolist()\n\n        def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y):\n            if 'counts' in rle and isinstance(rle['counts'], list):\n                rle = mask_util.frPyObjects(rle, im_h, im_w)\n\n            mask = mask_util.decode(rle)\n            mask = cv2.resize(\n                mask,\n                None,\n                None,\n                fx=im_scale_x,\n                fy=im_scale_y,\n                interpolation=self.interp)\n            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))\n            return rle\n\n        im_h, im_w = im_size\n        im_scale_x, im_scale_y = scale\n        resized_segms = []\n        for segm in segms:\n            if is_poly(segm):\n                # Polygon format\n                resized_segms.append([\n                    _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm\n                ])\n            else:\n                # RLE format\n                import pycocotools.mask as mask_util\n                resized_segms.append(\n                    _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y))\n\n        return resized_segms\n\n    def apply(self, sample, context=None):\n
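        # draw a short-side target at random; the interpolation method may\n        # also be randomly selected\n        target_size = 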
random.choice(self.short_side_sizes)\n        interp = random.choice(\n            self.interps) if self.random_interp else self.interp\n\n        return self.resize(sample, target_size, self.max_size, interp)\n\n\n@register_op\nclass RandomShortSideRangeResize(RandomShortSideResize):\n    def __init__(self, scales, interp=cv2.INTER_LINEAR, random_interp=False):\n        \"\"\"\n        Resize the image randomly according to a short-side range: a (long, short)\n        edge pair is sampled from `scales` and the image is resized keeping the\n        aspect ratio, with the long side capped at the sampled long edge.\n        Args:\n            scales (list[tuple]): Ranges of the image's short and long sides.\n            interp (int): The interpolation method.\n            random_interp (bool): Whether to randomly select the interpolation method.\n        \"\"\"\n        super(RandomShortSideRangeResize, self).__init__(scales, None, interp,\n                                                         random_interp)\n\n        assert isinstance(scales,\n                          Sequence), \"scales must be List or Tuple\"\n\n        self.scales = scales\n\n    def random_sample(self, img_scales):\n        img_scale_long = [max(s) for s in img_scales]\n        img_scale_short = [min(s) for s in img_scales]\n        long_edge = np.random.randint(\n            min(img_scale_long), max(img_scale_long) + 1)\n        short_edge = np.random.randint(\n            min(img_scale_short), max(img_scale_short) + 1)\n        img_scale = (long_edge, short_edge)\n        return img_scale\n\n    def apply(self, sample, context=None):\n        long_edge, short_edge = self.random_sample(self.short_side_sizes)\n        interp = random.choice(\n            self.interps) if self.random_interp else self.interp\n\n        return self.resize(sample, short_edge, long_edge, interp)\n\n\n@register_op\nclass RandomSizeCrop(BaseOperator):\n    \"\"\"\n    Randomly crop the image according to `min_size` and `max_size`\n    Args:\n        min_size (int): Min size for edges of cropped image.\n        max_size (int): Max size for edges of cropped image. 
If it\n                        is set larger than the length of the input image,\n                        the output keeps the original length.\n        keep_empty (bool): Whether to keep a cropped result with no object. If\n                           set to False, a no-object crop is not returned and\n                           the original input is used instead.\n    \"\"\"\n\n    def __init__(self, min_size, max_size, keep_empty=True):\n        super(RandomSizeCrop, self).__init__()\n        self.min_size = min_size\n        self.max_size = max_size\n        self.keep_empty = keep_empty\n\n        from paddle.vision.transforms.functional import crop as paddle_crop\n        self.paddle_crop = paddle_crop\n\n    @staticmethod\n    def get_crop_params(img_shape, output_size):\n        \"\"\"Get parameters for ``crop`` for a random crop.\n        Args:\n            img_shape (list|tuple): Image's height and width.\n            output_size (list|tuple): Expected output size of the crop.\n        Returns:\n            tuple: params (i, j, h, w) to be passed to ``crop`` for random crop.\n        \"\"\"\n        h, w = img_shape\n        th, tw = output_size\n\n        if h + 1 < th or w + 1 < tw:\n            raise ValueError(\n                \"Required crop size {} is larger than input image size {}\".\n                format((th, tw), (h, w)))\n\n        if w == tw and h == th:\n            return 0, 0, h, w\n\n        # random.randint is inclusive on both ends, so the upper bound is\n        # h - th (not h - th + 1) to keep the crop inside the image\n        i = random.randint(0, h - th)\n        j = random.randint(0, w - tw)\n        return i, j, th, tw\n\n    def crop(self, sample, region):\n        keep_index = None\n        # apply bbox and check whether the cropped result is valid\n        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:\n            cropped_bbox = self.apply_bbox(sample['gt_bbox'], region)\n            bbox = cropped_bbox.reshape([-1, 2, 2])\n            area = (bbox[:, 1, :] - bbox[:, 0, :]).prod(axis=1)\n            keep_index = np.where(area > 0)[0]\n\n            if not self.keep_empty and len(keep_index) == 0:\n                # When keep_empty is False, a crop with no objects is not\n                # used and the original content is returned.\n                return sample\n\n            sample['gt_bbox'] = cropped_bbox[keep_index] if len(\n                keep_index) > 0 else np.zeros(\n                    [0, 4], dtype=np.float32)\n            sample['gt_class'] = sample['gt_class'][keep_index] if len(\n                keep_index) > 0 else np.zeros(\n                    [0, 1], dtype=np.float32)\n            if 'gt_score' in sample:\n                sample['gt_score'] = sample['gt_score'][keep_index] if len(\n                    keep_index) > 0 else np.zeros(\n                        [0, 1], dtype=np.float32)\n            if 'is_crowd' in sample:\n                sample['is_crowd'] = sample['is_crowd'][keep_index] if len(\n                    keep_index) > 0 else np.zeros(\n                        [0, 1], dtype=np.float32)\n            if 'gt_areas' in sample:\n                sample['gt_areas'] = np.take(\n                    sample['gt_areas'], keep_index, axis=0)\n\n        image_shape = sample['image'].shape[:2]\n        sample['image'] = self.paddle_crop(sample['image'], *region)\n        sample['im_shape'] = np.array(\n            sample['image'].shape[:2], dtype=np.float32)\n\n        # apply polygon\n        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:\n
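            # crop polygons/RLEs to the region and shift them into the crop's\n            # coordinate frame\n            sample['gt_poly'] = self.apply_segm(sample['gt_poly'], region,\n                           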
                     image_shape)\n            sample['gt_poly'] = np.array(sample['gt_poly'], dtype=object)\n            if keep_index is not None and len(keep_index) > 0:\n                sample['gt_poly'] = sample['gt_poly'][keep_index]\n            sample['gt_poly'] = sample['gt_poly'].tolist()\n        # apply gt_segm\n        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:\n            i, j, h, w = region\n            sample['gt_segm'] = sample['gt_segm'][:, i:i + h, j:j + w]\n            if keep_index is not None and len(keep_index) > 0:\n                sample['gt_segm'] = sample['gt_segm'][keep_index]\n\n        if 'gt_joints' in sample:\n            gt_joints = self._crop_joints(sample['gt_joints'], region)\n            sample['gt_joints'] = gt_joints\n            if keep_index is not None:\n                sample['gt_joints'] = sample['gt_joints'][keep_index]\n\n        return sample\n\n    def apply_bbox(self, bbox, region):\n        i, j, h, w = region\n        region_size = np.asarray([w, h])\n        crop_bbox = bbox - np.asarray([j, i, j, i])\n        crop_bbox = np.minimum(crop_bbox.reshape([-1, 2, 2]), region_size)\n        crop_bbox = crop_bbox.clip(min=0)\n        return crop_bbox.reshape([-1, 4]).astype('float32')\n\n    def _crop_joints(self, joints, region):\n        y1, x1, h, w = region\n        x2 = x1 + w\n        y2 = y1 + h\n        # x1, y1, x2, y2 = crop\n        joints[..., 0] -= x1\n        joints[..., 1] -= y1\n        joints[joints[..., 0] > w, :] = 0\n        joints[joints[..., 1] > h, :] = 0\n        joints[joints[..., 0] < 0, :] = 0\n        joints[joints[..., 1] < 0, :] = 0\n        return joints\n\n    def apply_segm(self, segms, region, image_shape):\n        def _crop_poly(segm, crop):\n            xmin, ymin, xmax, ymax = crop\n            crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin]\n            crop_p = np.array(crop_coord).reshape(4, 2)\n            crop_p = Polygon(crop_p)\n\n            crop_segm = list()\n            for poly in segm:\n                poly = np.array(poly).reshape(len(poly) // 2, 2)\n                polygon = Polygon(poly)\n                if not polygon.is_valid:\n                    # node the self-intersecting exterior so that it can be\n                    # re-polygonized into valid pieces\n                    exterior = polygon.exterior\n                    multi_lines = exterior.intersection(exterior)\n                    polygons = shapely.ops.polygonize(multi_lines)\n                    polygon = MultiPolygon(polygons)\n                multi_polygon = list()\n                if isinstance(polygon, MultiPolygon):\n                    multi_polygon = list(copy.deepcopy(polygon).geoms)\n                else:\n                    multi_polygon.append(copy.deepcopy(polygon))\n                for per_polygon in multi_polygon:\n                    inter = per_polygon.intersection(crop_p)\n                    if not inter:\n                        continue\n                    if isinstance(inter, (MultiPolygon, GeometryCollection)):\n                        for part in inter.geoms:\n                            if not isinstance(part, Polygon):\n                                continue\n                            part = np.squeeze(\n                                np.array(part.exterior.coords[:-1]).reshape(1,\n                                                                            -1))\n                            part[0::2] -= xmin\n                            part[1::2] -= ymin\n                            crop_segm.append(part.tolist())\n                    elif isinstance(inter, Polygon):\n                        crop_poly = np.squeeze(\n               
             np.array(inter.exterior.coords[:-1]).reshape(1, -1))\n                        crop_poly[0::2] -= xmin\n                        crop_poly[1::2] -= ymin\n                        crop_segm.append(crop_poly.tolist())\n                    else:\n                        continue\n            return crop_segm\n\n        def _crop_rle(rle, crop, height, width):\n            if 'counts' in rle and isinstance(rle['counts'], list):\n                rle = mask_util.frPyObjects(rle, height, width)\n            mask = mask_util.decode(rle)\n            mask = mask[crop[1]:crop[3], crop[0]:crop[2]]\n            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))\n            return rle\n\n        i, j, h, w = region\n        crop = [j, i, j + w, i + h]\n        height, width = image_shape\n        crop_segms = []\n        for segm in segms:\n            if is_poly(segm):\n                import copy\n                import shapely.ops\n                from shapely.geometry import Polygon, MultiPolygon, GeometryCollection\n                # Polygon format\n                crop_segms.append(_crop_poly(segm, crop))\n            else:\n                # RLE format\n                import pycocotools.mask as mask_util\n                crop_segms.append(_crop_rle(segm, crop, height, width))\n        return crop_segms\n\n    def apply(self, sample, context=None):\n        h = random.randint(self.min_size,\n                           min(sample['image'].shape[0], self.max_size))\n        w = random.randint(self.min_size,\n                           min(sample['image'].shape[1], self.max_size))\n\n        region = self.get_crop_params(sample['image'].shape[:2], [h, w])\n        return self.crop(sample, region)\n\n\n@register_op\nclass WarpAffine(BaseOperator):\n    def __init__(self,\n                 keep_res=False,\n                 pad=31,\n                 input_h=512,\n                 input_w=512,\n                 scale=0.4,\n                 shift=0.1,\n                 down_ratio=4):\n        \"\"\"WarpAffine\n        Warp-affine the image following CenterNet's preprocessing\n        The code is based on https://github.com/xingyizhou/CenterNet/blob/master/src/lib/datasets/sample/ctdet.py\n        \"\"\"\n        super(WarpAffine, self).__init__()\n        self.keep_res = keep_res\n        self.pad = pad\n        self.input_h = input_h\n        self.input_w = input_w\n        self.scale = scale\n        self.shift = shift\n        self.down_ratio = down_ratio\n\n    def apply(self, sample, context=None):\n        img = sample['image']\n        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)\n\n        h, w = img.shape[:2]\n\n        if self.keep_res:\n            # True in detection eval/infer\n            # (h | pad) + 1 rounds up to a multiple of (pad + 1),\n            # e.g. pad=31 gives multiples of 32\n            input_h = (h | self.pad) + 1\n            input_w = (w | self.pad) + 1\n            s = np.array([input_w, input_h], dtype=np.float32)\n            c = np.array([w // 2, h // 2], dtype=np.float32)\n        else:\n            # False in centertrack eval_mot\n            s = max(h, w) * 1.0\n            input_h, input_w = self.input_h, self.input_w\n            c = np.array([w / 2., h / 2.], dtype=np.float32)\n\n        trans_input = get_affine_transform(c, s, 0, [input_w, input_h])\n        img = cv2.resize(img, (w, h))\n        inp = cv2.warpAffine(\n            img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR)\n        sample['image'] = inp\n\n        if not self.keep_res:\n            out_h = input_h // self.down_ratio\n            out_w = input_w // self.down_ratio\n
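            # affine transform onto the down_ratio-scaled output grid, later\n            # used to project gt boxes into heatmap coordinates\n            trans_output = 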
get_affine_transform(c, s, 0, [out_w, out_h])\n\n            sample.update({\n                'center': c,\n                'scale': s,\n                'out_height': out_h,\n                'out_width': out_w,\n                'inp_height': input_h,\n                'inp_width': input_w,\n                'trans_input': trans_input,\n                'trans_output': trans_output,\n            })\n        return sample\n\n\n@register_op\nclass FlipWarpAffine(BaseOperator):\n    def __init__(self,\n                 keep_res=False,\n                 pad=31,\n                 input_h=512,\n                 input_w=512,\n                 not_rand_crop=False,\n                 scale=0.4,\n                 shift=0.1,\n                 flip=0.5,\n                 is_scale=True,\n                 use_random=True,\n                 add_pre_img=False):\n        \"\"\"FlipWarpAffine\n        1. Random Crop\n        2. Flip the image horizontally\n        3. Warp affine the image\n        4. (Optional) Add the previous image\n        \"\"\"\n        super(FlipWarpAffine, self).__init__()\n        self.keep_res = keep_res\n        self.pad = pad\n        self.input_h = input_h\n        self.input_w = input_w\n        self.not_rand_crop = not_rand_crop\n        self.scale = scale\n        self.shift = shift\n        self.flip = flip\n        self.is_scale = is_scale\n        self.use_random = use_random\n        self.add_pre_img = add_pre_img\n\n    def __call__(self, samples, context=None):\n        if self.add_pre_img:\n            assert isinstance(samples, Sequence) and len(samples) == 2\n            sample, pre_sample = samples[0], samples[1]\n        else:\n            sample = samples\n\n        img = sample['image']\n        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)\n        if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0:\n            return sample\n\n        h, w = img.shape[:2]\n        flipped = 0\n\n        if self.keep_res:\n            input_h = (h | self.pad) + 1\n            input_w = (w | self.pad) + 1\n            s = np.array([input_w, input_h], dtype=np.float32)\n            c = np.array([w // 2, h // 2], dtype=np.float32)\n        else:\n            # centernet training default\n            s = max(h, w) * 1.0\n            input_h, input_w = self.input_h, self.input_w\n            c = np.array([w / 2., h / 2.], dtype=np.float32)\n\n        if self.use_random:\n            gt_bbox = sample['gt_bbox']\n            if not self.not_rand_crop:\n                # centernet default\n                s = s * np.random.choice(np.arange(0.6, 1.4, 0.1))\n                w_border = get_border(128, w)\n                h_border = get_border(128, h)\n                c[0] = np.random.randint(low=w_border, high=w - w_border)\n                c[1] = np.random.randint(low=h_border, high=h - h_border)\n            else:\n                sf = self.scale\n                cf = self.shift\n                c[0] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf)\n                c[1] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf)\n                s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)\n\n            if np.random.random() < self.flip:\n                img = img[:, ::-1, :]\n                c[0] = w - c[0] - 1\n                oldx1 = gt_bbox[:, 0].copy()\n                oldx2 = gt_bbox[:, 2].copy()\n                gt_bbox[:, 0] = w - oldx2 - 1\n                gt_bbox[:, 2] = w - oldx1 - 1\n                flipped = 1\n            sample['gt_bbox'] = gt_bbox
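\n\n        # build the input-resolution affine transform from the (possibly\n        # jittered) center c and scale s\n        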
trans_input = get_affine_transform(c, s, 0, [input_w, input_h])\n        inp = cv2.warpAffine(\n            img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR)\n        if self.is_scale:\n            inp = (inp.astype(np.float32) / 255.)\n\n        sample['image'] = inp\n        sample['center'] = c\n        sample['scale'] = s\n\n        if self.add_pre_img:\n            sample['trans_input'] = trans_input\n\n            # previous image, use same aug trans_input as current image\n            pre_img = pre_sample['image']\n            pre_img = cv2.cvtColor(pre_img, cv2.COLOR_RGB2BGR)\n            if flipped:\n                pre_img = pre_img[:, ::-1, :].copy()\n            pre_inp = cv2.warpAffine(\n                pre_img,\n                trans_input, (input_w, input_h),\n                flags=cv2.INTER_LINEAR)\n            if self.is_scale:\n                pre_inp = (pre_inp.astype(np.float32) / 255.)\n            sample['pre_image'] = pre_inp\n\n            # if empty gt_bbox\n            if 'gt_bbox' in pre_sample and len(pre_sample['gt_bbox']) == 0:\n                return sample\n            pre_gt_bbox = pre_sample['gt_bbox']\n            if flipped:\n                pre_oldx1 = pre_gt_bbox[:, 0].copy()\n                pre_oldx2 = pre_gt_bbox[:, 2].copy()\n                # swap x1/x2 when mirroring so that x1 <= x2 still holds\n                pre_gt_bbox[:, 0] = w - pre_oldx2 - 1\n                pre_gt_bbox[:, 2] = w - pre_oldx1 - 1\n            sample['pre_gt_bbox'] = pre_gt_bbox\n\n            sample['pre_gt_class'] = pre_sample['gt_class']\n            sample['pre_gt_track_id'] = pre_sample['gt_track_id']\n            del pre_sample\n\n        return sample\n\n\n@register_op\nclass CenterRandColor(BaseOperator):\n    \"\"\"Random color for CenterNet series models.\n    Args:\n        saturation (float): saturation settings.\n        contrast (float): contrast settings.\n        brightness (float): brightness settings.\n    \"\"\"\n\n    def __init__(self, saturation=0.4, contrast=0.4, brightness=0.4):\n        super(CenterRandColor, self).__init__()\n        self.saturation = saturation\n        self.contrast = contrast\n        self.brightness = brightness\n\n    def apply_saturation(self, img, img_gray):\n        alpha = 1. + np.random.uniform(\n            low=-self.saturation, high=self.saturation)\n        self._blend(alpha, img, img_gray[:, :, None])\n        return img\n\n    def apply_contrast(self, img, img_gray):\n
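        # blend toward the global gray mean: img <- alpha * img + (1 - alpha) * mean\n        alpha = 1. 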
+ np.random.uniform(low=-self.contrast, high=self.contrast)\n        img_mean = img_gray.mean()\n        self._blend(alpha, img, img_mean)\n        return img\n\n    def apply_brightness(self, img, img_gray):\n        alpha = 1 + np.random.uniform(\n            low=-self.brightness, high=self.brightness)\n        img *= alpha\n        return img\n\n    def _blend(self, alpha, img, img_mean):\n        img *= alpha\n        img_mean *= (1 - alpha)\n        img += img_mean\n\n    def apply(self, sample, context=None):\n        functions = [\n            self.apply_brightness,\n            self.apply_contrast,\n            self.apply_saturation,\n        ]\n\n        img = sample['image']\n        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n        distortions = np.random.permutation(functions)\n        for func in distortions:\n            img = func(img, img_gray)\n        sample['image'] = img\n\n        if 'pre_image' in sample:\n            pre_img = sample['pre_image']\n            pre_img_gray = cv2.cvtColor(pre_img, cv2.COLOR_BGR2GRAY)\n            pre_distortions = np.random.permutation(functions)\n            for func in pre_distortions:\n                pre_img = func(pre_img, pre_img_gray)\n            sample['pre_image'] = pre_img\n\n        return sample\n\n\n@register_op\nclass Mosaic(BaseOperator):\n    \"\"\" Mosaic operator for image and gt_bboxes\n    The code is based on https://github.com/Megvii-BaseDetection/YOLOX/blob/main/yolox/data/datasets/mosaicdetection.py\n\n    1. get mosaic coords\n    2. clip bbox and get mosaic_labels\n    3. random_affine augment\n    4. Mixup augment as copypaste (optional), not used in tiny/nano\n\n    Args:\n        prob (float): probability of using Mosaic, 1.0 as default\n        input_dim (list[int]): input shape\n        degrees (list[2]): the rotate range to apply, transform range is [min, max]\n        translate (list[2]): the translate range to apply, transform range is [min, max]\n        scale (list[2]): the scale range to apply, transform range is [min, max]\n        shear (list[2]): the shear range to apply, transform range is [min, max]\n        enable_mixup (bool): whether to enable Mixup or not\n        mixup_prob (float): probability of using Mixup, 1.0 as default\n        mixup_scale (list[float]): scale range of Mixup\n        remove_outside_box (bool): whether to remove outside boxes, False as\n            default in COCO dataset, True in MOT dataset\n    \"\"\"\n\n    def __init__(self,\n                 prob=1.0,\n                 input_dim=[640, 640],\n                 degrees=[-10, 10],\n                 translate=[-0.1, 0.1],\n                 scale=[0.1, 2],\n                 shear=[-2, 2],\n                 enable_mixup=True,\n                 mixup_prob=1.0,\n                 mixup_scale=[0.5, 1.5],\n                 remove_outside_box=False):\n        super(Mosaic, self).__init__()\n        self.prob = prob\n        if isinstance(input_dim, Integral):\n            input_dim = [input_dim, input_dim]\n        self.input_dim = input_dim\n        self.degrees = degrees\n        self.translate = translate\n        self.scale = scale\n        self.shear = shear\n        self.enable_mixup = enable_mixup\n        self.mixup_prob = mixup_prob\n        self.mixup_scale = mixup_scale\n        self.remove_outside_box = remove_outside_box\n\n    def get_mosaic_coords(self, mosaic_idx, xc, yc, w, h, input_h, input_w):\n        # (x1, y1, x2, y2) means coords in large image,\n        # small_coords means coords in small image in 
mosaic aug.\n        if mosaic_idx == 0:\n            # top left\n            x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc\n            small_coords = w - (x2 - x1), h - (y2 - y1), w, h\n        elif mosaic_idx == 1:\n            # top right\n            x1, y1, x2, y2 = xc, max(yc - h, 0), min(xc + w, input_w * 2), yc\n            small_coords = 0, h - (y2 - y1), min(w, x2 - x1), h\n        elif mosaic_idx == 2:\n            # bottom left\n            x1, y1, x2, y2 = max(xc - w, 0), yc, xc, min(input_h * 2, yc + h)\n            small_coords = w - (x2 - x1), 0, w, min(y2 - y1, h)\n        elif mosaic_idx == 3:\n            # bottom right\n            x1, y1, x2, y2 = xc, yc, min(xc + w, input_w * 2), min(input_h * 2,\n                                                                   yc + h)\n            small_coords = 0, 0, min(w, x2 - x1), min(y2 - y1, h)\n\n        return (x1, y1, x2, y2), small_coords\n\n    def random_affine_augment(self,\n                              img,\n                              labels=[],\n                              input_dim=[640, 640],\n                              degrees=[-10, 10],\n                              scales=[0.1, 2],\n                              shears=[-2, 2],\n                              translates=[-0.1, 0.1]):\n        # random rotation and scale\n        degree = random.uniform(degrees[0], degrees[1])\n        scale = random.uniform(scales[0], scales[1])\n        assert scale > 0, \"Argument scale should be positive.\"\n        R = cv2.getRotationMatrix2D(angle=degree, center=(0, 0), scale=scale)\n        M = np.ones([2, 3])\n\n        # random shear\n        shear = random.uniform(shears[0], shears[1])\n        shear_x = math.tan(shear * math.pi / 180)\n        shear_y = math.tan(shear * math.pi / 180)\n        M[0] = R[0] + shear_y * R[1]\n        M[1] = R[1] + shear_x * R[0]\n\n        # random translation\n        translate = random.uniform(translates[0], translates[1])\n        translation_x = translate * input_dim[0]\n        translation_y = translate * input_dim[1]\n        M[0, 2] = translation_x\n        M[1, 2] = translation_y\n\n        # warpAffine\n        img = cv2.warpAffine(\n            img, M, dsize=tuple(input_dim), borderValue=(114, 114, 114))\n\n        num_gts = len(labels)\n        if num_gts > 0:\n            # warp corner points\n            corner_points = np.ones((4 * num_gts, 3))\n            corner_points[:, :2] = labels[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(\n                4 * num_gts, 2)  # x1y1, x2y2, x1y2, x2y1\n            # apply affine transform\n            corner_points = corner_points @ M.T\n            corner_points = corner_points.reshape(num_gts, 8)\n\n            # create new boxes\n            corner_xs = corner_points[:, 0::2]\n            corner_ys = corner_points[:, 1::2]\n            new_bboxes = np.concatenate((corner_xs.min(1), corner_ys.min(1),\n                                         corner_xs.max(1), corner_ys.max(1)))\n            new_bboxes = new_bboxes.reshape(4, num_gts).T\n\n            # clip boxes\n            new_bboxes[:, 0::2] = np.clip(new_bboxes[:, 0::2], 0, input_dim[0])\n            new_bboxes[:, 1::2] = np.clip(new_bboxes[:, 1::2], 0, input_dim[1])\n            labels[:, :4] = new_bboxes\n\n        return img, labels\n\n    def __call__(self, sample, context=None):\n        if not isinstance(sample, Sequence):\n            return sample\n\n        assert len(\n            sample) == 5, \"Mosaic needs 5 samples, 4 for mosaic and 1 for mixup.\"
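\n        # with probability 1 - prob, skip the mosaic and return the first\n        # sample unchanged\n        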
 if np.random.uniform(0., 1.) > self.prob:\n            return sample[0]\n\n        mosaic_gt_bbox, mosaic_gt_class, mosaic_is_crowd, mosaic_difficult = [], [], [], []\n        input_h, input_w = self.input_dim\n        yc = int(random.uniform(0.5 * input_h, 1.5 * input_h))\n        xc = int(random.uniform(0.5 * input_w, 1.5 * input_w))\n        mosaic_img = np.full((input_h * 2, input_w * 2, 3), 114, dtype=np.uint8)\n\n        # 1. get mosaic coords\n        for mosaic_idx, sp in enumerate(sample[:4]):\n            img = sp['image']\n            gt_bbox = sp['gt_bbox']\n            h0, w0 = img.shape[:2]\n            scale = min(1. * input_h / h0, 1. * input_w / w0)\n            img = cv2.resize(\n                img, (int(w0 * scale), int(h0 * scale)),\n                interpolation=cv2.INTER_LINEAR)\n            (h, w, c) = img.shape[:3]\n\n            # suffix l means large image, while s means small image in mosaic aug.\n            (l_x1, l_y1, l_x2, l_y2), (\n                s_x1, s_y1, s_x2, s_y2) = self.get_mosaic_coords(\n                    mosaic_idx, xc, yc, w, h, input_h, input_w)\n\n            mosaic_img[l_y1:l_y2, l_x1:l_x2] = img[s_y1:s_y2, s_x1:s_x2]\n            padw, padh = l_x1 - s_x1, l_y1 - s_y1\n\n            # scale and shift the pixel xyxy boxes into the mosaic canvas\n            _gt_bbox = gt_bbox.copy()\n            if len(gt_bbox) > 0:\n                _gt_bbox[:, 0] = scale * gt_bbox[:, 0] + padw\n                _gt_bbox[:, 1] = scale * gt_bbox[:, 1] + padh\n                _gt_bbox[:, 2] = scale * gt_bbox[:, 2] + padw\n                _gt_bbox[:, 3] = scale * gt_bbox[:, 3] + padh\n\n            mosaic_gt_bbox.append(_gt_bbox)\n            mosaic_gt_class.append(sp['gt_class'])\n            if 'is_crowd' in sp:\n                mosaic_is_crowd.append(sp['is_crowd'])\n            if 'difficult' in sp:\n                mosaic_difficult.append(sp['difficult'])\n\n        # 2. 
clip bbox and get mosaic_labels([gt_bbox, gt_class, is_crowd])\n        if len(mosaic_gt_bbox):\n            mosaic_gt_bbox = np.concatenate(mosaic_gt_bbox, 0)\n            mosaic_gt_class = np.concatenate(mosaic_gt_class, 0)\n            if mosaic_is_crowd:\n                mosaic_is_crowd = np.concatenate(mosaic_is_crowd, 0)\n                mosaic_labels = np.concatenate([\n                    mosaic_gt_bbox,\n                    mosaic_gt_class.astype(mosaic_gt_bbox.dtype),\n                    mosaic_is_crowd.astype(mosaic_gt_bbox.dtype)\n                ], 1)\n            elif mosaic_difficult:\n                mosaic_difficult = np.concatenate(mosaic_difficult, 0)\n                mosaic_labels = np.concatenate([\n                    mosaic_gt_bbox,\n                    mosaic_gt_class.astype(mosaic_gt_bbox.dtype),\n                    mosaic_difficult.astype(mosaic_gt_bbox.dtype)\n                ], 1)\n            else:\n                mosaic_labels = np.concatenate([\n                    mosaic_gt_bbox, mosaic_gt_class.astype(mosaic_gt_bbox.dtype)\n                ], 1)\n            if self.remove_outside_box:\n                # for MOT dataset\n                flag1 = mosaic_gt_bbox[:, 0] < 2 * input_w\n                flag2 = mosaic_gt_bbox[:, 2] > 0\n                flag3 = mosaic_gt_bbox[:, 1] < 2 * input_h\n                flag4 = mosaic_gt_bbox[:, 3] > 0\n                flag_all = flag1 * flag2 * flag3 * flag4\n                mosaic_labels = mosaic_labels[flag_all]\n            else:\n                mosaic_labels[:, 0] = np.clip(mosaic_labels[:, 0], 0,\n                                              2 * input_w)\n                mosaic_labels[:, 1] = np.clip(mosaic_labels[:, 1], 0,\n                                              2 * input_h)\n                mosaic_labels[:, 2] = np.clip(mosaic_labels[:, 2], 0,\n                                              2 * input_w)\n                mosaic_labels[:, 3] = np.clip(mosaic_labels[:, 3], 0,\n                                              2 * input_h)\n        else:\n            mosaic_labels = np.zeros((1, 6))\n\n        # 3. random_affine augment\n        mosaic_img, mosaic_labels = self.random_affine_augment(\n            mosaic_img,\n            mosaic_labels,\n            input_dim=self.input_dim,\n            degrees=self.degrees,\n            translates=self.translate,\n            scales=self.scale,\n            shears=self.shear)\n\n        # 4. 
Mixup augment as copypaste, https://arxiv.org/abs/2012.07177\n        # optional, not used (enable_mixup=False) in tiny/nano\n        if (self.enable_mixup and not len(mosaic_labels) == 0 and\n                random.random() < self.mixup_prob):\n            sample_mixup = sample[4]\n            mixup_img = sample_mixup['image']\n            if 'is_crowd' in sample_mixup:\n                cp_labels = np.concatenate([\n                    sample_mixup['gt_bbox'],\n                    sample_mixup['gt_class'].astype(mosaic_labels.dtype),\n                    sample_mixup['is_crowd'].astype(mosaic_labels.dtype)\n                ], 1)\n            elif 'difficult' in sample_mixup:\n                cp_labels = np.concatenate([\n                    sample_mixup['gt_bbox'],\n                    sample_mixup['gt_class'].astype(mosaic_labels.dtype),\n                    sample_mixup['difficult'].astype(mosaic_labels.dtype)\n                ], 1)\n            else:\n                cp_labels = np.concatenate([\n                    sample_mixup['gt_bbox'],\n                    sample_mixup['gt_class'].astype(mosaic_labels.dtype)\n                ], 1)\n            mosaic_img, mosaic_labels = self.mixup_augment(\n                mosaic_img, mosaic_labels, self.input_dim, cp_labels, mixup_img)\n\n        sample0 = sample[0]\n        sample0['image'] = mosaic_img.astype(np.uint8)  # cannot be float32\n        sample0['h'] = float(mosaic_img.shape[0])\n        sample0['w'] = float(mosaic_img.shape[1])\n        sample0['im_shape'][0] = sample0['h']\n        sample0['im_shape'][1] = sample0['w']\n        sample0['gt_bbox'] = mosaic_labels[:, :4].astype(np.float32)\n        sample0['gt_class'] = mosaic_labels[:, 4:5].astype(np.float32)\n        if 'is_crowd' in sample[0]:\n            sample0['is_crowd'] = mosaic_labels[:, 5:6].astype(np.float32)\n        if 'difficult' in sample[0]:\n            sample0['difficult'] = mosaic_labels[:, 5:6].astype(np.float32)\n        return sample0\n
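\n    # Mixup here acts as a CopyPaste-style blend (see the paper linked above):\n    # the extra (5th) sample is letterboxed to input_dim, scale-jittered by\n    # `mixup_scale`, randomly h-flipped, padded/cropped to the mosaic size and\n    # finally blended pixel-wise as 0.5 * mosaic + 0.5 * extra image.\n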
\n    def mixup_augment(self, origin_img, origin_labels, input_dim, cp_labels,\n                      img):\n        jit_factor = random.uniform(*self.mixup_scale)\n        FLIP = random.uniform(0, 1) > 0.5\n        if len(img.shape) == 3:\n            cp_img = np.ones(\n                (input_dim[0], input_dim[1], 3), dtype=np.uint8) * 114\n        else:\n            cp_img = np.ones(input_dim, dtype=np.uint8) * 114\n\n        cp_scale_ratio = min(input_dim[0] / img.shape[0],\n                             input_dim[1] / img.shape[1])\n        resized_img = cv2.resize(\n            img, (int(img.shape[1] * cp_scale_ratio),\n                  int(img.shape[0] * cp_scale_ratio)),\n            interpolation=cv2.INTER_LINEAR)\n\n        cp_img[:int(img.shape[0] * cp_scale_ratio), :int(img.shape[\n            1] * cp_scale_ratio)] = resized_img\n\n        cp_img = cv2.resize(cp_img, (int(cp_img.shape[1] * jit_factor),\n                                     int(cp_img.shape[0] * jit_factor)))\n        cp_scale_ratio *= jit_factor\n\n        if FLIP:\n            cp_img = cp_img[:, ::-1, :]\n\n        origin_h, origin_w = cp_img.shape[:2]\n        target_h, target_w = origin_img.shape[:2]\n        padded_img = np.zeros(\n            (max(origin_h, target_h), max(origin_w, target_w), 3),\n            dtype=np.uint8)\n        padded_img[:origin_h, :origin_w] = cp_img\n\n        x_offset, y_offset = 0, 0\n        if padded_img.shape[0] > target_h:\n            y_offset = random.randint(0, padded_img.shape[0] - target_h - 1)\n        if padded_img.shape[1] > target_w:\n            x_offset = random.randint(0, padded_img.shape[1] - target_w - 1)\n        padded_cropped_img = padded_img[y_offset:y_offset + target_h, x_offset:\n                                        x_offset + target_w]\n\n        # adjust boxes\n        cp_bboxes_origin_np = cp_labels[:, :4].copy()\n        cp_bboxes_origin_np[:, 0::2] = np.clip(cp_bboxes_origin_np[:, 0::2] *\n                                               cp_scale_ratio, 0, origin_w)\n        cp_bboxes_origin_np[:, 1::2] = np.clip(cp_bboxes_origin_np[:, 1::2] *\n                                               cp_scale_ratio, 0, origin_h)\n\n        if FLIP:\n            cp_bboxes_origin_np[:, 0::2] = (\n                origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1])\n        cp_bboxes_transformed_np = cp_bboxes_origin_np.copy()\n        if self.remove_outside_box:\n            # for MOT dataset\n            cp_bboxes_transformed_np[:, 0::2] -= x_offset\n            cp_bboxes_transformed_np[:, 1::2] -= y_offset\n        else:\n            cp_bboxes_transformed_np[:, 0::2] = np.clip(\n                cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w)\n            cp_bboxes_transformed_np[:, 1::2] = np.clip(\n                cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h)\n\n        cls_labels = cp_labels[:, 4:5].copy()\n        box_labels = cp_bboxes_transformed_np\n        if cp_labels.shape[-1] == 6:\n            crd_labels = cp_labels[:, 5:6].copy()\n            labels = np.hstack((box_labels, cls_labels, crd_labels))\n        else:\n            labels = np.hstack((box_labels, cls_labels))\n        if self.remove_outside_box:\n            labels = labels[labels[:, 0] < target_w]\n            labels = labels[labels[:, 2] > 0]\n            labels = labels[labels[:, 1] < target_h]\n            labels = labels[labels[:, 3] > 0]\n\n        origin_labels = np.vstack((origin_labels, labels))\n        origin_img = origin_img.astype(np.float32)\n        origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype(\n            np.float32)\n\n        return origin_img.astype(np.uint8), origin_labels\n\n\n@register_op\nclass PadResize(BaseOperator):\n    \"\"\" PadResize for image and gt_bbox\n\n    Args:\n        target_size (list[int]): input shape\n        fill_value (float): pixel value of padded image\n    \"\"\"\n\n    def __init__(self, target_size, fill_value=114):\n        super(PadResize, self).__init__()\n        if isinstance(target_size, Integral):\n            target_size = [target_size, target_size]\n        self.target_size = target_size\n        self.fill_value = fill_value\n\n    def _resize(self, img, bboxes, labels):\n        ratio = min(self.target_size[0] / img.shape[0],\n                    self.target_size[1] / img.shape[1])\n        w, h = int(img.shape[1] * ratio), int(img.shape[0] * ratio)\n        resized_img = cv2.resize(img, (w, h), interpolation=cv2.INTER_LINEAR)\n\n        if len(bboxes) > 0:\n            bboxes *= ratio\n            mask = np.minimum(bboxes[:, 2] - bboxes[:, 0],\n                              bboxes[:, 3] - bboxes[:, 1]) > 1\n            bboxes = bboxes[mask]\n            labels = labels[mask]\n        return resized_img, bboxes, labels\n
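\n    # `_pad` below pads bottom/right only: the resized image stays anchored at\n    # the top-left corner and the remainder is filled with `fill_value`.\n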
\n    def _pad(self, img):\n        h, w, _ = img.shape\n        if h == self.target_size[0] and w == self.target_size[1]:\n            return img\n        padded_img = np.full(\n            (self.target_size[0], self.target_size[1], 3),\n            self.fill_value,\n            dtype=np.uint8)\n        padded_img[:h, :w] = img\n        return padded_img\n\n    def apply(self, sample, context=None):\n        image = sample['image']\n        bboxes = sample['gt_bbox']\n        labels = sample['gt_class']\n        image, bboxes, labels = self._resize(image, bboxes, labels)\n        sample['image'] = self._pad(image).astype(np.float32)\n        sample['gt_bbox'] = bboxes\n        sample['gt_class'] = labels\n        return sample\n\n\n@register_op\nclass RandomShift(BaseOperator):\n    \"\"\"\n    Randomly shift image\n\n    Args:\n        prob (float): probability to do random shift.\n        max_shift (int): max shift pixels\n        filter_thr (int): filter gt bboxes if one side is smaller than this\n    \"\"\"\n\n    def __init__(self, prob=0.5, max_shift=32, filter_thr=1):\n        super(RandomShift, self).__init__()\n        self.prob = prob\n        self.max_shift = max_shift\n        self.filter_thr = filter_thr\n\n    def calc_shift_coor(self, im_h, im_w, shift_h, shift_w):\n        return [\n            max(0, shift_w), max(0, shift_h), min(im_w, im_w + shift_w),\n            min(im_h, im_h + shift_h)\n        ]\n\n    def apply(self, sample, context=None):\n        if random.random() > self.prob:\n            return sample\n\n        im = sample['image']\n        gt_bbox = sample['gt_bbox']\n        gt_class = sample['gt_class']\n        im_h, im_w = im.shape[:2]\n        shift_h = random.randint(-self.max_shift, self.max_shift)\n        shift_w = random.randint(-self.max_shift, self.max_shift)\n\n        gt_bbox[:, 0::2] += shift_w\n        gt_bbox[:, 1::2] += shift_h\n        gt_bbox[:, 0::2] = np.clip(gt_bbox[:, 0::2], 0, im_w)\n        gt_bbox[:, 1::2] = np.clip(gt_bbox[:, 1::2], 0, im_h)\n        gt_bbox_w = gt_bbox[:, 2] - gt_bbox[:, 0]\n        gt_bbox_h = gt_bbox[:, 3] - gt_bbox[:, 1]\n        keep = (gt_bbox_w > self.filter_thr) & (gt_bbox_h > self.filter_thr)\n        if not keep.any():\n            return sample\n\n        gt_bbox = gt_bbox[keep]\n        gt_class = gt_class[keep]\n\n        # shift image\n        coor_new = self.calc_shift_coor(im_h, im_w, shift_h, shift_w)\n        # shift frame to the opposite direction\n        coor_old = self.calc_shift_coor(im_h, im_w, -shift_h, -shift_w)\n        canvas = np.zeros_like(im)\n        canvas[coor_new[1]:coor_new[3], coor_new[0]:coor_new[2]] \\\n            = im[coor_old[1]:coor_old[3], coor_old[0]:coor_old[2]]\n\n        sample['image'] = canvas\n        sample['gt_bbox'] = gt_bbox\n        sample['gt_class'] = gt_class\n        return sample\n\n\n@register_op\nclass StrongAugImage(BaseOperator):\n    def __init__(self, transforms):\n        super(StrongAugImage, self).__init__()\n        self.transforms = Compose(transforms)\n\n    def apply(self, sample, context=None):\n        im = sample\n        im['image'] = sample['image'].astype('uint8')\n        results = self.transforms(im)\n        sample['image'] = results['image'].astype('uint8')\n        return sample\n\n\n@register_op\nclass RandomColorJitter(BaseOperator):\n    def __init__(self,\n                 prob=0.8,\n                 brightness=0.4,\n                 contrast=0.4,\n                 saturation=0.4,\n                 hue=0.1):\n        super(RandomColorJitter, self).__init__()\n        self.prob = prob\n        self.brightness = brightness\n        self.contrast = contrast\n        self.saturation = saturation\n        self.hue = hue\n\n    def apply(self, sample, context=None):\n        if np.random.uniform(0, 1) < 
self.prob:\n            from paddle.vision.transforms import ColorJitter\n            transform = ColorJitter(self.brightness, self.contrast,\n                                    self.saturation, self.hue)\n            sample['image'] = transform(sample['image'].astype(np.uint8))\n            sample['image'] = sample['image'].astype(np.float32)\n        return sample\n\n\n@register_op\nclass RandomGrayscale(BaseOperator):\n    def __init__(self, prob=0.2):\n        super(RandomGrayscale, self).__init__()\n        self.prob = prob\n\n    def apply(self, sample, context=None):\n        if np.random.uniform(0, 1) < self.prob:\n            from paddle.vision.transforms import Grayscale\n            transform = Grayscale(num_output_channels=3)\n            sample['image'] = transform(sample['image'])\n        return sample\n\n\n@register_op\nclass RandomGaussianBlur(BaseOperator):\n    def __init__(self, prob=0.5, sigma=[0.1, 2.0]):\n        super(RandomGaussianBlur, self).__init__()\n        self.prob = prob\n        self.sigma = sigma\n\n    def apply(self, sample, context=None):\n        if np.random.uniform(0, 1) < self.prob:\n            sigma = np.random.uniform(self.sigma[0], self.sigma[1])\n            im = cv2.GaussianBlur(sample['image'], (23, 23), sigma)\n            sample['image'] = im\n        return sample\n\n\n@register_op\nclass RandomErasing(BaseOperator):\n    def __init__(self,\n                 prob=0.5,\n                 scale=(0.02, 0.33),\n                 ratio=(0.3, 3.3),\n                 value=0,\n                 inplace=False):\n        super(RandomErasing, self).__init__()\n        assert isinstance(scale,\n                          (tuple, list)), \"scale should be a tuple or list\"\n        assert (scale[0] >= 0 and scale[1] <= 1 and scale[0] <= scale[1]\n                ), \"scale should be of kind (min, max) and in range [0, 1]\"\n        assert isinstance(ratio,\n                          (tuple, list)), \"ratio should be a tuple or list\"\n        assert (ratio[0] >= 0 and\n                ratio[0] <= ratio[1]), \"ratio should be of kind (min, max)\"\n        assert isinstance(\n            value, (Number, str, tuple,\n                    list)), \"value should be a number, tuple, list or str\"\n        if isinstance(value, str) and value != \"random\":\n            raise ValueError(\"value must be 'random' when type is str\")\n        self.prob = prob\n        self.scale = scale\n        self.ratio = ratio\n        self.value = value\n        self.inplace = inplace\n\n    def _erase(self, img, i, j, h, w, v, inplace=False):\n        if not inplace:\n            img = img.copy()\n        img[i:i + h, j:j + w, ...] 
= v\n        return img\n\n    def _get_param(self, img, scale, ratio, value):\n        shape = np.asarray(img).astype(np.uint8).shape\n        h, w, c = shape[-3], shape[-2], shape[-1]\n        img_area = h * w\n        log_ratio = np.log(ratio)\n        for _ in range(1):\n            erase_area = np.random.uniform(*scale) * img_area\n            aspect_ratio = np.exp(np.random.uniform(*log_ratio))\n            erase_h = int(round(np.sqrt(erase_area * aspect_ratio)))\n            erase_w = int(round(np.sqrt(erase_area / aspect_ratio)))\n            if erase_h >= h or erase_w >= w:\n                continue\n\n            if value is None:\n                v = np.random.normal(size=[erase_h, erase_w, c]) * 255\n            else:\n                v = np.array(value)[None, None, :]\n            top = np.random.randint(0, h - erase_h + 1)\n            left = np.random.randint(0, w - erase_w + 1)\n            return top, left, erase_h, erase_w, v\n        return 0, 0, h, w, img\n\n    def apply(self, sample, context=None):\n        if random.random() < self.prob:\n            if isinstance(self.value, Number):\n                value = [self.value]\n            elif isinstance(self.value, str):\n                value = None\n            else:\n                value = self.value\n            if value is not None and not (len(value) == 1 or len(value) == 3):\n                raise ValueError(\n                    \"Value should be a single number or a sequence with length equals to image's channel.\"\n                )\n            im = sample['image']\n            top, left, erase_h, erase_w, v = self._get_param(im, self.scale,\n                                                             self.ratio, value)\n            im = self._erase(im, top, left, erase_h, erase_w, v, self.inplace)\n            sample['image'] = im\n        return sample\n\n\n@register_op\nclass RandomErasingCrop(BaseOperator):\n    def __init__(self):\n        super(RandomErasingCrop, self).__init__()\n        self.transform1 = RandomErasing(\n            prob=0.7, scale=(0.05, 0.2), ratio=(0.3, 3.3), value=\"random\")\n        self.transform2 = RandomErasing(\n            prob=0.5, scale=(0.05, 0.2), ratio=(0.1, 6), value=\"random\")\n        self.transform3 = RandomErasing(\n            prob=0.3, scale=(0.05, 0.2), ratio=(0.05, 8), value=\"random\")\n\n    def apply(self, sample, context=None):\n        sample = self.transform1(sample)\n        sample = self.transform2(sample)\n        sample = self.transform3(sample)\n        return sample\n"
  },
  {
    "path": "ppdet/data/transform/rotated_operators.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import print_function\nfrom __future__ import division\n\ntry:\n    from collections.abc import Sequence\nexcept Exception:\n    from collections import Sequence\n\nfrom numbers import Number, Integral\n\nimport cv2\nimport numpy as np\nimport math\nimport copy\n\nfrom .operators import register_op, BaseOperator\nfrom ppdet.modeling.rbox_utils import poly2rbox_le135_np, poly2rbox_oc_np, rbox2poly_np\nfrom ppdet.utils.logger import setup_logger\nfrom ppdet.utils.compact import imagedraw_textsize_c\nlogger = setup_logger(__name__)\n\n\n@register_op\nclass RRotate(BaseOperator):\n    \"\"\" Rotate Image, Polygon, Box\n\n    Args:\n        scale (float): rotate scale\n        angle (float): rotate angle\n        fill_value (int, tuple): fill color\n        auto_bound (bool): whether auto bound or not\n    \"\"\"\n\n    def __init__(self, scale=1.0, angle=0., fill_value=0., auto_bound=True):\n        super(RRotate, self).__init__()\n        self.scale = scale\n        self.angle = angle\n        self.fill_value = fill_value\n        self.auto_bound = auto_bound\n\n    def get_rotated_matrix(self, angle, scale, h, w):\n        center = ((w - 1) * 0.5, (h - 1) * 0.5)\n        matrix = cv2.getRotationMatrix2D(center, -angle, scale)\n        # calculate the new size\n        cos = np.abs(matrix[0, 0])\n        sin = np.abs(matrix[0, 1])\n        new_w = h * sin + w * cos\n        new_h = h * cos + w * sin\n        # calculate offset\n        n_w = int(np.round(new_w))\n        n_h = int(np.round(new_h))\n        if self.auto_bound:\n            ratio = min(w / n_w, h / n_h)\n            matrix = cv2.getRotationMatrix2D(center, -angle, ratio)\n        else:\n            matrix[0, 2] += (new_w - w) * 0.5\n            matrix[1, 2] += (new_h - h) * 0.5\n            w = n_w\n            h = n_h\n        return matrix, h, w\n\n    def get_rect_from_pts(self, pts, h, w):\n        \"\"\" get minimum rectangle of points\n        \"\"\"\n        assert pts.shape[-1] % 2 == 0, 'the dim of input [pts] is not correct'\n        min_x, min_y = np.min(pts[:, 0::2], axis=1), np.min(pts[:, 1::2],\n                                                            axis=1)\n        max_x, max_y = np.max(pts[:, 0::2], axis=1), np.max(pts[:, 1::2],\n                                                            axis=1)\n        min_x, min_y = np.clip(min_x, 0, w), np.clip(min_y, 0, h)\n        max_x, max_y = np.clip(max_x, 0, w), np.clip(max_y, 0, h)\n        boxes = np.stack([min_x, min_y, max_x, max_y], axis=-1)\n        return boxes\n\n    def apply_image(self, image, matrix, h, w):\n        return cv2.warpAffine(\n            image, matrix, (w, h), borderValue=self.fill_value)\n\n    def apply_pts(self, pts, matrix, h, w):\n        assert pts.shape[-1] % 2 == 0, 'the dim of input [pts] is not correct'\n        # n is number of samples and 
m is two times the number of points due to (x, y)\n        _, m = pts.shape\n        # transpose points\n        pts_ = pts.reshape(-1, 2).T\n        # pad 1 to convert the points to homogeneous coordinates\n        padding = np.ones((1, pts_.shape[1]), pts.dtype)\n        rotated_pts = np.matmul(matrix, np.concatenate((pts_, padding), axis=0))\n        return rotated_pts[:2, :].T.reshape(-1, m)\n\n    def apply(self, sample, context=None):\n        image = sample['image']\n        h, w = image.shape[:2]\n        matrix, h, w = self.get_rotated_matrix(self.angle, self.scale, h, w)\n        sample['image'] = self.apply_image(image, matrix, h, w)\n        polys = sample['gt_poly']\n        # TODO: segment or keypoint to be processed \n        if len(polys) > 0:\n            pts = self.apply_pts(polys, matrix, h, w)\n            sample['gt_poly'] = pts\n            sample['gt_bbox'] = self.get_rect_from_pts(pts, h, w)\n\n        return sample\n\n\n@register_op\nclass RandomRRotate(BaseOperator):\n    \"\"\" Random Rotate Image\n    Args:\n        scale (float, tuple, list): rotate scale\n        scale_mode (str): mode of scale, [range, value, None]\n        angle (float, tuple, list): rotate angle\n        angle_mode (str): mode of angle, [range, value, None]\n        fill_value (float, tuple, list): fill value\n        rotate_prob (float): probability of rotation\n        auto_bound (bool): whether auto bound or not\n    \"\"\"\n\n    def __init__(self,\n                 scale=1.0,\n                 scale_mode=None,\n                 angle=0.,\n                 angle_mode=None,\n                 fill_value=0.,\n                 rotate_prob=1.0,\n                 auto_bound=True):\n        super(RandomRRotate, self).__init__()\n        self.scale = scale\n        self.scale_mode = scale_mode\n        self.angle = angle\n        self.angle_mode = angle_mode\n        self.fill_value = fill_value\n        self.rotate_prob = rotate_prob\n        self.auto_bound = auto_bound\n\n    def get_angle(self, angle, angle_mode):\n        assert not angle_mode or angle_mode in [\n            'range', 'value'\n        ], 'angle mode should be in [range, value, None]'\n        if not angle_mode:\n            return angle\n        elif angle_mode == 'range':\n            low, high = angle\n            return np.random.rand() * (high - low) + low\n        elif angle_mode == 'value':\n            return np.random.choice(angle)\n\n    def get_scale(self, scale, scale_mode):\n        assert not scale_mode or scale_mode in [\n            'range', 'value'\n        ], 'scale mode should be in [range, value, None]'\n        if not scale_mode:\n            return scale\n        elif scale_mode == 'range':\n            low, high = scale\n            return np.random.rand() * (high - low) + low\n        elif scale_mode == 'value':\n            return np.random.choice(scale)\n\n    def apply(self, sample, context=None):\n        if np.random.rand() > self.rotate_prob:\n            return sample\n\n        angle = self.get_angle(self.angle, self.angle_mode)\n        scale = self.get_scale(self.scale, self.scale_mode)\n        rotator = RRotate(scale, angle, self.fill_value, self.auto_bound)\n        return rotator(sample)\n\n\n@register_op\nclass Poly2RBox(BaseOperator):\n    \"\"\" Polygon to Rotated Box, using new OpenCV definition since 4.5.1\n\n    Args:\n        filter_threshold (int, float): threshold to filter annotations\n        filter_mode (str): filter mode, ['area', 'edge']\n        rbox_type (str): rbox type, 
['le135', 'oc']\n\n    \"\"\"\n\n    def __init__(self, filter_threshold=4, filter_mode=None, rbox_type='le135'):\n        super(Poly2RBox, self).__init__()\n        self.filter_fn = lambda size: self.filter(size, filter_threshold, filter_mode)\n        self.rbox_fn = poly2rbox_le135_np if rbox_type == 'le135' else poly2rbox_oc_np\n\n    def filter(self, size, threshold, mode):\n        if mode == 'area':\n            if size[0] * size[1] < threshold:\n                return True\n        elif mode == 'edge':\n            if min(size) < threshold:\n                return True\n        return False\n\n    def get_rbox(self, polys):\n        valid_ids, rboxes, bboxes = [], [], []\n        for i, poly in enumerate(polys):\n            cx, cy, w, h, angle = self.rbox_fn(poly)\n            if self.filter_fn((w, h)):\n                continue\n            rboxes.append(np.array([cx, cy, w, h, angle], dtype=np.float32))\n            valid_ids.append(i)\n            xmin, ymin = min(poly[0::2]), min(poly[1::2])\n            xmax, ymax = max(poly[0::2]), max(poly[1::2])\n            bboxes.append(np.array([xmin, ymin, xmax, ymax], dtype=np.float32))\n\n        if len(valid_ids) == 0:\n            rboxes = np.zeros((0, 5), dtype=np.float32)\n            bboxes = np.zeros((0, 4), dtype=np.float32)\n        else:\n            rboxes = np.stack(rboxes)\n            bboxes = np.stack(bboxes)\n\n        return rboxes, bboxes, valid_ids\n\n    def apply(self, sample, context=None):\n        rboxes, bboxes, valid_ids = self.get_rbox(sample['gt_poly'])\n        sample['gt_rbox'] = rboxes\n        sample['gt_bbox'] = bboxes\n        for k in ['gt_class', 'gt_score', 'gt_poly', 'is_crowd', 'difficult']:\n            if k in sample:\n                sample[k] = sample[k][valid_ids]\n\n        return sample\n\n\n@register_op\nclass Poly2Array(BaseOperator):\n    \"\"\" convert gt_poly to np.array for rotated bboxes\n    \"\"\"\n\n    def __init__(self):\n        super(Poly2Array, self).__init__()\n\n    def apply(self, sample, context=None):\n        if 'gt_poly' in sample:\n            sample['gt_poly'] = np.array(\n                sample['gt_poly'], dtype=np.float32).reshape((-1, 8))\n\n        return sample\n\n\n@register_op\nclass RResize(BaseOperator):\n    def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR):\n        \"\"\"\n        Resize image to target size. if keep_ratio is True, \n        resize the image's long side to the maximum of target_size\n        if keep_ratio is False, resize the image to target size(h, w)\n        Args:\n            target_size (int|list): image target size\n            keep_ratio (bool): whether keep_ratio or not, default true\n            interp (int): the interpolation method\n        \"\"\"\n        super(RResize, self).__init__()\n        self.keep_ratio = keep_ratio\n        self.interp = interp\n        if not isinstance(target_size, (Integral, Sequence)):\n            raise TypeError(\n                \"Type of target_size is invalid. 
Must be Integer or List or Tuple, now is {}\".\n                format(type(target_size)))\n        if isinstance(target_size, Integral):\n            target_size = [target_size, target_size]\n        self.target_size = target_size\n\n    def apply_image(self, image, scale):\n        im_scale_x, im_scale_y = scale\n\n        return cv2.resize(\n            image,\n            None,\n            None,\n            fx=im_scale_x,\n            fy=im_scale_y,\n            interpolation=self.interp)\n\n    def apply_pts(self, pts, scale, size):\n        im_scale_x, im_scale_y = scale\n        resize_w, resize_h = size\n        pts[:, 0::2] *= im_scale_x\n        pts[:, 1::2] *= im_scale_y\n        pts[:, 0::2] = np.clip(pts[:, 0::2], 0, resize_w)\n        pts[:, 1::2] = np.clip(pts[:, 1::2], 0, resize_h)\n        return pts\n\n    def apply(self, sample, context=None):\n        \"\"\" Resize the image numpy.\n        \"\"\"\n        im = sample['image']\n        if not isinstance(im, np.ndarray):\n            raise TypeError(\"{}: image type is not numpy.\".format(self))\n        if len(im.shape) != 3:\n            raise ImageError('{}: image is not 3-dimensional.'.format(self))\n\n        # apply image\n        im_shape = im.shape\n        if self.keep_ratio:\n\n            im_size_min = np.min(im_shape[0:2])\n            im_size_max = np.max(im_shape[0:2])\n\n            target_size_min = np.min(self.target_size)\n            target_size_max = np.max(self.target_size)\n\n            im_scale = min(target_size_min / im_size_min,\n                           target_size_max / im_size_max)\n\n            resize_h = im_scale * float(im_shape[0])\n            resize_w = im_scale * float(im_shape[1])\n\n            im_scale_x = im_scale\n            im_scale_y = im_scale\n        else:\n            resize_h, resize_w = self.target_size\n            im_scale_y = resize_h / im_shape[0]\n            im_scale_x = resize_w / im_shape[1]\n\n        im = self.apply_image(sample['image'], [im_scale_x, im_scale_y])\n        sample['image'] = im.astype(np.float32)\n        sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32)\n        if 'scale_factor' in sample:\n            scale_factor = sample['scale_factor']\n            sample['scale_factor'] = np.asarray(\n                [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],\n                dtype=np.float32)\n        else:\n            sample['scale_factor'] = np.asarray(\n                [im_scale_y, im_scale_x], dtype=np.float32)\n\n        # apply bbox\n        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:\n            sample['gt_bbox'] = self.apply_pts(sample['gt_bbox'],\n                                               [im_scale_x, im_scale_y],\n                                               [resize_w, resize_h])\n\n        # apply polygon\n        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:\n            sample['gt_poly'] = self.apply_pts(sample['gt_poly'],\n                                               [im_scale_x, im_scale_y],\n                                               [resize_w, resize_h])\n\n        return sample\n\n\n@register_op\nclass RandomRFlip(BaseOperator):\n    def __init__(self, prob=0.5):\n        \"\"\"\n        Args:\n            prob (float): the probability of flipping image\n        \"\"\"\n        super(RandomRFlip, self).__init__()\n        self.prob = prob\n        if not (isinstance(self.prob, float)):\n            raise TypeError(\"{}: input type is 
invalid.\".format(self))\n\n    def apply_image(self, image):\n        return image[:, ::-1, :]\n\n    def apply_pts(self, pts, width):\n        oldx = pts[:, 0::2].copy()\n        pts[:, 0::2] = width - oldx - 1\n        return pts\n\n    def apply(self, sample, context=None):\n        \"\"\"Filp the image and bounding box.\n        Operators:\n            1. Flip the image numpy.\n            2. Transform the bboxes' x coordinates.\n              (Must judge whether the coordinates are normalized!)\n            3. Transform the segmentations' x coordinates.\n              (Must judge whether the coordinates are normalized!)\n        Output:\n            sample: the image, bounding box and segmentation part\n                    in sample are flipped.\n        \"\"\"\n        if np.random.uniform(0, 1) < self.prob:\n            im = sample['image']\n            height, width = im.shape[:2]\n            im = self.apply_image(im)\n            if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:\n                sample['gt_bbox'] = self.apply_pts(sample['gt_bbox'], width)\n            if 'gt_poly' in sample and len(sample['gt_poly']) > 0:\n                sample['gt_poly'] = self.apply_pts(sample['gt_poly'], width)\n\n            sample['flipped'] = True\n            sample['image'] = im\n        return sample\n\n\n@register_op\nclass VisibleRBox(BaseOperator):\n    \"\"\"\n    In debug mode, visualize images according to `gt_box`.\n    (Currently only supported when not cropping and flipping image.)\n    \"\"\"\n\n    def __init__(self, output_dir='debug'):\n        super(VisibleRBox, self).__init__()\n        self.output_dir = output_dir\n        if not os.path.isdir(output_dir):\n            os.makedirs(output_dir)\n\n    def apply(self, sample, context=None):\n        image = Image.fromarray(sample['image'].astype(np.uint8))\n        out_file_name = '{:012d}.jpg'.format(sample['im_id'][0])\n        width = sample['w']\n        height = sample['h']\n        # gt_poly = sample['gt_rbox']\n        gt_poly = sample['gt_poly']\n        gt_class = sample['gt_class']\n        draw = ImageDraw.Draw(image)\n        for i in range(gt_poly.shape[0]):\n            x1, y1, x2, y2, x3, y3, x4, y4 = gt_poly[i]\n            draw.line(\n                [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)],\n                width=2,\n                fill='green')\n            # draw label\n            xmin = min(x1, x2, x3, x4)\n            ymin = min(y1, y2, y3, y4)\n            text = str(gt_class[i][0])\n            tw, th = imagedraw_textsize_c(draw, text)\n            draw.rectangle(\n                [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green')\n            draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))\n\n        if 'gt_keypoint' in sample.keys():\n            gt_keypoint = sample['gt_keypoint']\n            if self.is_normalized:\n                for i in range(gt_keypoint.shape[1]):\n                    if i % 2:\n                        gt_keypoint[:, i] = gt_keypoint[:, i] * height\n                    else:\n                        gt_keypoint[:, i] = gt_keypoint[:, i] * width\n            for i in range(gt_keypoint.shape[0]):\n                keypoint = gt_keypoint[i]\n                for j in range(int(keypoint.shape[0] / 2)):\n                    x1 = round(keypoint[2 * j]).astype(np.int32)\n                    y1 = round(keypoint[2 * j + 1]).astype(np.int32)\n                    draw.ellipse(\n                        (x1, y1, x1 + 5, y1 + 5), fill='green', 
outline='green')\n        save_path = os.path.join(self.output_dir, out_file_name)\n        image.save(save_path, quality=95)\n        return sample\n\n\n@register_op\nclass Rbox2Poly(BaseOperator):\n    \"\"\"\n    Convert rbox format to poly format.\n    \"\"\"\n\n    def __init__(self):\n        super(Rbox2Poly, self).__init__()\n\n    def apply(self, sample, context=None):\n        assert 'gt_rbox' in sample\n        assert sample['gt_rbox'].shape[1] == 5\n        rboxes = sample['gt_rbox']\n        polys = rbox2poly_np(rboxes)\n        sample['gt_poly'] = polys\n        xmin, ymin = polys[:, 0::2].min(1), polys[:, 1::2].min(1)\n        xmax, ymax = polys[:, 0::2].max(1), polys[:, 1::2].max(1)\n        sample['gt_bbox'] = np.stack([xmin, ymin, xmax, ymax], axis=1)\n        return sample\n
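\n\n# A minimal usage sketch (an illustrative assumption, mirroring how the ops\n# above are meant to be chained for rotated-box datasets):\n#\n#   sample = {'image': img, 'gt_poly': polys, 'gt_class': classes}\n#   for op in [Poly2Array(), RandomRFlip(prob=0.5),\n#              RandomRRotate(angle_mode='value', angle=[0, 90, 180, -90]),\n#              Poly2RBox(filter_threshold=2, filter_mode='edge')]:\n#       sample = op(sample)\n#   # sample['gt_rbox'] then holds (cx, cy, w, h, angle) per instance.\n"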
  },
  {
    "path": "ppdet/data/utils.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport numbers\nimport numpy as np\n\ntry:\n    from collections.abc import Sequence, Mapping\nexcept:\n    from collections import Sequence, Mapping\n\n\ndef default_collate_fn(batch):\n    \"\"\"\n    Default batch collating function for :code:`paddle.io.DataLoader`,\n    get input data as a list of sample datas, each element in list\n    if the data of a sample, and sample data should composed of list,\n    dictionary, string, number, numpy array, this\n    function will parse input data recursively and stack number,\n    numpy array and paddle.Tensor datas as batch datas. e.g. for\n    following input data:\n    [{'image': np.array(shape=[3, 224, 224]), 'label': 1},\n     {'image': np.array(shape=[3, 224, 224]), 'label': 3},\n     {'image': np.array(shape=[3, 224, 224]), 'label': 4},\n     {'image': np.array(shape=[3, 224, 224]), 'label': 5},]\n    \n    \n    This default collate function zipped each number and numpy array\n    field together and stack each field as the batch field as follows:\n    {'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])}\n    Args:  \n        batch(list of sample data): batch should be a list of sample data.\n    \n    Returns:\n        Batched data: batched each number, numpy array and paddle.Tensor\n                      in input data.\n    \"\"\"\n    sample = batch[0]\n    if isinstance(sample, np.ndarray):\n        batch = np.stack(batch, axis=0)\n        return batch\n    elif isinstance(sample, numbers.Number):\n        batch = np.array(batch)\n        return batch\n    elif isinstance(sample, (str, bytes)):\n        return batch\n    elif isinstance(sample, Mapping):\n        return {\n            key: default_collate_fn([d[key] for d in batch])\n            for key in sample\n        }\n    elif isinstance(sample, Sequence):\n        sample_fields_num = len(sample)\n        if not all(len(sample) == sample_fields_num for sample in iter(batch)):\n            raise RuntimeError(\n                \"fileds number not same among samples in a batch\")\n        return [default_collate_fn(fields) for fields in zip(*batch)]\n\n    raise TypeError(\"batch data con only contains: tensor, numpy.ndarray, \"\n                    \"dict, list, number, but got {}\".format(type(sample)))\n"
  },
  {
    "path": "ppdet/engine/__init__.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom . import trainer\nfrom .trainer import *\n\nfrom . import trainer_cot\nfrom .trainer_cot import *\n\nfrom . import callbacks\nfrom .callbacks import *\n\nfrom . import env\nfrom .env import *\n\n__all__ = trainer.__all__ \\\n        + callbacks.__all__ \\\n        + env.__all__\n\nfrom . import tracker\nfrom .tracker import *\n__all__ = __all__ + tracker.__all__\n\nfrom . import trainer_ssod\nfrom .trainer_ssod import *\n__all__ = __all__ + trainer_ssod.__all__\n"
  },
  {
    "path": "ppdet/engine/callbacks.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport gc\nimport sys\nimport datetime\nimport six\nimport copy\nimport json\n\nimport paddle\nimport paddle.distributed as dist\n\nfrom ppdet.utils.checkpoint import save_model, save_semi_model, save_model_info, update_train_results\nfrom ppdet.metrics import get_infer_results\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger('ppdet.engine')\n\n__all__ = [\n    'Callback', 'ComposeCallback', 'LogPrinter', 'Checkpointer',\n    'VisualDLWriter', 'SniperProposalsGenerator'\n]\n\n\nclass Callback(object):\n    def __init__(self, model):\n        self.model = model\n        log_ranks = self.model.cfg.get(\"log_ranks\", '0')\n        if isinstance(log_ranks, str):\n            self.log_ranks = [int(i) for i in log_ranks.split(',')]\n        elif isinstance(log_ranks, int):\n            self.log_ranks = [log_ranks]\n        self.logger = setup_logger('ppdet.engine.callbacks',log_ranks=self.log_ranks)\n\n    def on_step_begin(self, status):\n        pass\n\n    def on_step_end(self, status):\n        pass\n\n    def on_epoch_begin(self, status):\n        pass\n\n    def on_epoch_end(self, status):\n        pass\n\n    def on_train_begin(self, status):\n        pass\n\n    def on_train_end(self, status):\n        pass\n\n\nclass ComposeCallback(object):\n    def __init__(self, callbacks):\n        callbacks = [c for c in list(callbacks) if c is not None]\n        for c in callbacks:\n            assert isinstance(\n                c, Callback), \"callback should be subclass of Callback\"\n        self._callbacks = callbacks\n\n    def on_step_begin(self, status):\n        for c in self._callbacks:\n            c.on_step_begin(status)\n\n    def on_step_end(self, status):\n        for c in self._callbacks:\n            c.on_step_end(status)\n\n    def on_epoch_begin(self, status):\n        for c in self._callbacks:\n            c.on_epoch_begin(status)\n\n    def on_epoch_end(self, status):\n        for c in self._callbacks:\n            c.on_epoch_end(status)\n\n    def on_train_begin(self, status):\n        for c in self._callbacks:\n            c.on_train_begin(status)\n\n    def on_train_end(self, status):\n        for c in self._callbacks:\n            c.on_train_end(status)\n\n\nclass LogPrinter(Callback):\n    def __init__(self, model):\n        super(LogPrinter, self).__init__(model)\n\n    def on_step_end(self, status):\n        \n        if dist.get_world_size() < 2 or dist.get_rank() in self.log_ranks:\n            mode = status['mode']\n            if mode == 'train':\n                epoch_id = status['epoch_id']\n                step_id = status['step_id']\n                steps_per_epoch = status['steps_per_epoch']\n                training_staus = status['training_staus']\n                batch_time = 
status['batch_time']\n                data_time = status['data_time']\n\n                epoches = self.model.cfg.epoch\n                batch_size = self.model.cfg['{}Reader'.format(mode.capitalize(\n                ))]['batch_size']\n\n                logs = training_staus.log()\n                space_fmt = ':' + str(len(str(steps_per_epoch))) + 'd'\n                if step_id % self.model.cfg.log_iter == 0:\n                    eta_steps = (epoches - epoch_id) * steps_per_epoch - step_id\n                    eta_sec = eta_steps * batch_time.global_avg\n                    eta_str = str(datetime.timedelta(seconds=int(eta_sec)))\n                    ips = float(batch_size) / batch_time.avg\n                    max_mem_reserved_str = \"\"\n                    max_mem_allocated_str = \"\"\n                    print_mem_info = self.model.cfg.get(\"print_mem_info\", True)\n                    if paddle.device.is_compiled_with_cuda() and print_mem_info:\n                        max_mem_reserved_str = f\", max_mem_reserved: {paddle.device.cuda.max_memory_reserved() // (1024 ** 2)} MB\"\n                        max_mem_allocated_str = f\", max_mem_allocated: {paddle.device.cuda.max_memory_allocated() // (1024 ** 2)} MB\"\n                    fmt = ' '.join([\n                        'Epoch: [{}]',\n                        '[{' + space_fmt + '}/{}]',\n                        'learning_rate: {lr:.6f}',\n                        '{meters}',\n                        'eta: {eta}',\n                        'batch_cost: {btime}',\n                        'data_cost: {dtime}',\n                        'ips: {ips:.4f} images/s'\n                        '{max_mem_reserved_str}'\n                        '{max_mem_allocated_str}'\n                    ])\n                    fmt = fmt.format(\n                        epoch_id,\n                        step_id,\n                        steps_per_epoch,\n                        lr=status['learning_rate'],\n                        meters=logs,\n                        eta=eta_str,\n                        btime=str(batch_time),\n                        dtime=str(data_time),\n                        ips=ips,\n                        max_mem_reserved_str=max_mem_reserved_str,\n                        max_mem_allocated_str=max_mem_allocated_str)\n                    self.logger.info(fmt)\n            if mode == 'eval':\n                step_id = status['step_id']\n                if step_id % 100 == 0:\n                    self.logger.info(\"Eval iter: {}\".format(step_id))\n\n    def on_epoch_end(self, status):\n        if dist.get_world_size() < 2 or dist.get_rank() == 0:\n            mode = status['mode']\n            if mode == 'eval':\n                sample_num = status['sample_num']\n                cost_time = status['cost_time']\n                self.logger.info('Total sample number: {}, average FPS: {}'.format(\n                    sample_num, sample_num / cost_time))\n\n\nclass Checkpointer(Callback):\n    def __init__(self, model):\n        super(Checkpointer, self).__init__(model)\n        self.best_ap = -1000.\n        self.save_dir = self.model.cfg.save_dir\n        self.uniform_output_enabled = self.model.cfg.get(\"uniform_output_enabled\", False)\n        if hasattr(self.model.model, 'student_model'):\n            self.weight = self.model.model.student_model\n        else:\n            self.weight = self.model.model\n        \n    def on_epoch_end(self, status):\n        # Checkpointer only performed during training\n        mode = status['mode']\n 
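       # In 'train' mode weights are snapshotted every `snapshot_epoch` epochs and\n        # at the final epoch; in 'eval' mode the metrics below decide whether\n        # 'best_model' should be refreshed.\n 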
       epoch_id = status['epoch_id']\n        weight = None\n        save_name = None\n        if dist.get_world_size() < 2 or dist.get_rank() == 0:\n            end_epoch = self.model.cfg.epoch\n            save_name = str(epoch_id) if epoch_id != end_epoch - 1 else \"model_final\"\n            if mode == 'train':\n                end_epoch = self.model.cfg.epoch\n                if (\n                        epoch_id + 1\n                ) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1:\n                    save_name = str(\n                        epoch_id) if epoch_id != end_epoch - 1 else \"model_final\"\n                    weight = self.weight.state_dict()\n            elif mode == 'eval':\n                for metric in self.model._metrics:\n                    map_res = metric.get_results()\n                    eval_func = \"ap\"\n                    if 'pose3d' in map_res:\n                        key = 'pose3d'\n                        eval_func = \"mpjpe\"\n                    elif 'bbox' in map_res:\n                        key = 'bbox'\n                    elif 'keypoint' in map_res:\n                        key = 'keypoint'\n                    else:\n                        key = 'mask'\n\n                    key = self.model.cfg.get('target_metrics', key)\n\n                    if key not in map_res:\n                        logger.warning(\"Evaluation results empty, this may be due to \" \\\n                                    \"training iterations being too few or not \" \\\n                                    \"loading the correct weights.\")\n                        return\n                    epoch_ap = map_res[key][0]\n                    epoch_metric = {\n                        'metric': abs(epoch_ap),\n                        'epoch': epoch_id + 1\n                    }\n                    save_path = os.path.join(os.path.join(self.save_dir, save_name) if self.uniform_output_enabled else self.save_dir, f\"{save_name}.pdstates\")\n                    paddle.save(epoch_metric, save_path)\n                    if self.uniform_output_enabled:\n                        save_model_info(epoch_metric, self.save_dir, save_name)\n                        update_train_results(self.model.cfg, save_name, epoch_metric, done_flag=epoch_id + 1 == self.model.cfg.epoch, ema=self.model.use_ema)\n                    if 'save_best_model' in status and status['save_best_model']:\n                        if epoch_ap >= self.best_ap:\n                            self.best_ap = epoch_ap\n                            save_name = 'best_model'\n                            weight = self.weight.state_dict()\n                            best_metric = {\n                                'metric': abs(self.best_ap),\n                                'epoch': epoch_id + 1\n                            }\n                            save_path = os.path.join(os.path.join(self.save_dir, save_name) if self.uniform_output_enabled else self.save_dir, \"best_model.pdstates\")\n                            paddle.save(best_metric, save_path)\n                            if self.uniform_output_enabled:\n                                save_model_info(best_metric, self.save_dir, save_name)\n                                update_train_results(self.model.cfg, save_name, best_metric, done_flag=epoch_id + 1 == self.model.cfg.epoch, ema=self.model.use_ema)\n                        logger.info(\"Best test {} {} is {:0.3f}.\".format(\n                            key, eval_func, abs(self.best_ap)))\n    
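# `weight` is filled only on snapshot/final epochs (train mode) or when a\n            # new best metric is reached (eval mode); otherwise nothing is saved here.\n    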
        if weight:\n                if self.model.use_ema:\n                    exchange_save_model = status.get('exchange_save_model',\n                                                     False)\n                    if not exchange_save_model:\n                        # save model and ema_model\n                        save_model(\n                            status['weight'],\n                            self.model.optimizer,\n                            os.path.join(self.save_dir, save_name) if self.uniform_output_enabled else self.save_dir,\n                            save_name,\n                            epoch_id + 1,\n                            ema_model=weight)\n                        if self.uniform_output_enabled:\n                            self.model.export(output_dir=os.path.join(self.save_dir, save_name, \"inference\"), for_fd=True)\n                            gc.collect()\n                    else:\n                        # save model (student model) and ema_model (teacher model);\n                        # in DenseTeacher SSOD the teacher model is usually the\n                        # stronger one, so the two are exchanged when saving pdparams\n                        student_model = status['weight']  # model\n                        teacher_model = weight  # ema_model\n                        save_model(\n                            teacher_model,\n                            self.model.optimizer,\n                            self.save_dir,\n                            save_name,\n                            epoch_id + 1,\n                            ema_model=student_model)\n                        del teacher_model\n                        del student_model\n                else:\n                    save_model(weight, self.model.optimizer, os.path.join(self.save_dir, save_name) if self.uniform_output_enabled else self.save_dir,\n                               save_name, epoch_id + 1)\n                    if self.uniform_output_enabled:\n                        self.model.export(output_dir=os.path.join(self.save_dir, save_name, \"inference\"), for_fd=True)\n                        gc.collect()\n\n\nclass WiferFaceEval(Callback):\n    def __init__(self, model):\n        super(WiferFaceEval, self).__init__(model)\n\n    def on_epoch_begin(self, status):\n        assert self.model.mode == 'eval', \\\n            \"WiferFaceEval can only be set during evaluation\"\n        for metric in self.model._metrics:\n            metric.update(self.model.model)\n        sys.exit()\n\n\nclass VisualDLWriter(Callback):\n    \"\"\"\n    Use VisualDL to log data or image\n    \"\"\"\n\n    def __init__(self, model):\n        super(VisualDLWriter, self).__init__(model)\n\n        assert six.PY3, \"VisualDL requires Python >= 3.5\"\n        try:\n            from visualdl import LogWriter\n        except Exception as e:\n            logger.error('visualdl not found, please install visualdl. 
'\n                         'for example: `pip install visualdl`.')\n            raise e\n        self.vdl_writer = LogWriter(\n            model.cfg.get('vdl_log_dir', 'vdl_log_dir/scalar'))\n        self.vdl_loss_step = 0\n        self.vdl_mAP_step = 0\n        self.vdl_image_step = 0\n        self.vdl_image_frame = 0\n\n    def on_step_end(self, status):\n        mode = status['mode']\n        if dist.get_world_size() < 2 or dist.get_rank() == 0:\n            if mode == 'train':\n                training_staus = status['training_staus']\n                for loss_name, loss_value in training_staus.get().items():\n                    self.vdl_writer.add_scalar(loss_name, loss_value,\n                                               self.vdl_loss_step)\n                self.vdl_loss_step += 1\n            elif mode == 'test':\n                ori_image = status['original_image']\n                result_image = status['result_image']\n                self.vdl_writer.add_image(\n                    \"original/frame_{}\".format(self.vdl_image_frame), ori_image,\n                    self.vdl_image_step)\n                self.vdl_writer.add_image(\n                    \"result/frame_{}\".format(self.vdl_image_frame),\n                    result_image, self.vdl_image_step)\n                self.vdl_image_step += 1\n                # each frame can display ten pictures at most.\n                if self.vdl_image_step % 10 == 0:\n                    self.vdl_image_step = 0\n                    self.vdl_image_frame += 1\n\n    def on_epoch_end(self, status):\n        mode = status['mode']\n        if dist.get_world_size() < 2 or dist.get_rank() == 0:\n            if mode == 'eval':\n                for metric in self.model._metrics:\n                    for key, map_value in metric.get_results().items():\n                        self.vdl_writer.add_scalar(\"{}-mAP\".format(key),\n                                                   map_value[0],\n                                                   self.vdl_mAP_step)\n                self.vdl_mAP_step += 1\n\n\nclass WandbCallback(Callback):\n    def __init__(self, model):\n        super(WandbCallback, self).__init__(model)\n\n        try:\n            import wandb\n            self.wandb = wandb\n        except Exception as e:\n            logger.error('wandb not found, please install wandb. '\n                         'Use: `pip install wandb`.')\n            raise e\n\n        self.wandb_params = model.cfg.get('wandb', None)\n        self.save_dir = self.model.cfg.save_dir\n        if self.wandb_params is None:\n            self.wandb_params = {}\n        for k, v in model.cfg.items():\n            if k.startswith(\"wandb_\"):\n                # strip the \"wandb_\" prefix; str.lstrip would also eat\n                # leading characters of the remaining key itself\n                self.wandb_params.update({k[len(\"wandb_\"):]: v})\n\n        self._run = None\n        if dist.get_world_size() < 2 or dist.get_rank() == 0:\n            _ = self.run\n            self.run.config.update(self.model.cfg)\n            self.run.define_metric(\"epoch\")\n            self.run.define_metric(\"eval/*\", step_metric=\"epoch\")\n\n        self.best_ap = -1000.\n        self.fps = []\n\n    @property\n    def run(self):\n        if self._run is None:\n            if self.wandb.run is not None:\n                logger.info(\n                    \"There is an ongoing wandb run which will be used \"\n                    \"for logging. Please use `wandb.finish()` to end that \"\n                    \"if the behaviour is not intended.\")\n                self._run = self.wandb.run\n            else:\n                self._run = self.wandb.init(**self.wandb_params)\n        return self._run\n
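\n    # Note: `save_model` below uploads checkpoint files that already exist on\n    # disk (the .pdparams/.pdema written by ppdet's own save_model) to W&B as\n    # Artifacts; it does not serialize the model itself.\n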
Please use `wandb.finish()` to end that \"\n                    \"if the behaviour is not intended\")\n                self._run = self.wandb.run\n            else:\n                self._run = self.wandb.init(**self.wandb_params)\n        return self._run\n\n    def save_model(self,\n                   optimizer,\n                   save_dir,\n                   save_name,\n                   last_epoch,\n                   ema_model=None,\n                   ap=None,\n                   fps=None,\n                   tags=None):\n        if dist.get_world_size() < 2 or dist.get_rank() == 0:\n            model_path = os.path.join(save_dir, save_name)\n            metadata = {}\n            metadata[\"last_epoch\"] = last_epoch\n            if ap:\n                metadata[\"ap\"] = ap\n\n            if fps:\n                metadata[\"fps\"] = fps\n\n            # log the ema artifact only when ema weights were actually saved;\n            # callers may pass a bool (use_ema), so test truthiness here\n            if not ema_model:\n                model_artifact = self.wandb.Artifact(\n                    name=\"model-{}\".format(self.run.id),\n                    type=\"model\",\n                    metadata=metadata)\n                model_artifact.add_file(model_path + \".pdparams\", name=\"model\")\n                self.run.log_artifact(model_artifact, aliases=tags)\n            else:\n                ema_artifact = self.wandb.Artifact(\n                    name=\"ema_model-{}\".format(self.run.id),\n                    type=\"model\",\n                    metadata=metadata)\n                model_artifact = self.wandb.Artifact(\n                    name=\"model-{}\".format(self.run.id),\n                    type=\"model\",\n                    metadata=metadata)\n\n                ema_artifact.add_file(model_path + \".pdema\", name=\"model_ema\")\n                model_artifact.add_file(model_path + \".pdparams\", name=\"model\")\n\n                self.run.log_artifact(ema_artifact, aliases=tags)\n                self.run.log_artifact(model_artifact, aliases=tags)\n\n    def on_step_end(self, status):\n        mode = status['mode']\n        if dist.get_world_size() < 2 or dist.get_rank() == 0:\n            if mode == 'train':\n                training_status = status['training_staus'].get()\n                for k, v in training_status.items():\n                    training_status[k] = float(v)\n\n                # calculate ips, data_cost, batch_cost\n                batch_time = status['batch_time']\n                data_time = status['data_time']\n                batch_size = self.model.cfg['{}Reader'.format(mode.capitalize(\n                ))]['batch_size']\n\n                ips = float(batch_size) / float(batch_time.avg)\n                data_cost = float(data_time.avg)\n                batch_cost = float(batch_time.avg)\n\n                metrics = {\"train/\" + k: v for k, v in training_status.items()}\n\n                metrics[\"train/ips\"] = ips\n                metrics[\"train/data_cost\"] = data_cost\n                metrics[\"train/batch_cost\"] = batch_cost\n\n                self.fps.append(ips)\n                self.run.log(metrics)\n\n    def on_epoch_end(self, status):\n        mode = status['mode']\n        epoch_id = status['epoch_id']\n        save_name = None\n        if dist.get_world_size() < 2 or dist.get_rank() == 0:\n            if mode == 'train':\n                fps = sum(self.fps) / len(self.fps)\n                self.fps = []\n\n                end_epoch = self.model.cfg.epoch\n                if (\n                        epoch_id + 1\n                ) % self.model.cfg.snapshot_epoch == 0 or 
epoch_id == end_epoch - 1:\n                    save_name = str(\n                        epoch_id) if epoch_id != end_epoch - 1 else \"model_final\"\n                    tags = [\"latest\", \"epoch_{}\".format(epoch_id)]\n                    self.save_model(\n                        self.model.optimizer,\n                        self.save_dir,\n                        save_name,\n                        epoch_id + 1,\n                        self.model.use_ema,\n                        fps=fps,\n                        tags=tags)\n            if mode == 'eval':\n                sample_num = status['sample_num']\n                cost_time = status['cost_time']\n\n                fps = sample_num / cost_time\n\n                merged_dict = {}\n                for metric in self.model._metrics:\n                    for key, map_value in metric.get_results().items():\n                        merged_dict[\"eval/{}-mAP\".format(key)] = map_value[0]\n                merged_dict[\"epoch\"] = status[\"epoch_id\"]\n                merged_dict[\"eval/fps\"] = sample_num / cost_time\n\n                self.run.log(merged_dict)\n\n                if 'save_best_model' in status and status['save_best_model']:\n                    for metric in self.model._metrics:\n                        map_res = metric.get_results()\n                        if 'pose3d' in map_res:\n                            key = 'pose3d'\n                        elif 'bbox' in map_res:\n                            key = 'bbox'\n                        elif 'keypoint' in map_res:\n                            key = 'keypoint'\n                        else:\n                            key = 'mask'\n                        if key not in map_res:\n                            logger.warning(\"Evaluation results empty, this may be due to \" \\\n                                        \"training iterations being too few or not \" \\\n                                        \"loading the correct weights.\")\n                            return\n                        if map_res[key][0] >= self.best_ap:\n                            self.best_ap = map_res[key][0]\n                            save_name = 'best_model'\n                            tags = [\"best\", \"epoch_{}\".format(epoch_id)]\n\n                            self.save_model(\n                                self.model.optimizer,\n                                self.save_dir,\n                                save_name,\n                                last_epoch=epoch_id + 1,\n                                ema_model=self.model.use_ema,\n                                ap=abs(self.best_ap),\n                                fps=fps,\n                                tags=tags)\n\n    def on_train_end(self, status):\n        self.run.finish()\n\n\nclass SniperProposalsGenerator(Callback):\n    def __init__(self, model):\n        super(SniperProposalsGenerator, self).__init__(model)\n        ori_dataset = self.model.dataset\n        self.dataset = self._create_new_dataset(ori_dataset)\n        self.loader = self.model.loader\n        self.cfg = self.model.cfg\n        self.infer_model = self.model.model\n\n    def _create_new_dataset(self, ori_dataset):\n        dataset = copy.deepcopy(ori_dataset)\n        # init anno_cropper\n        dataset.init_anno_cropper()\n        # generate infer roidbs\n        ori_roidbs = dataset.get_ori_roidbs()\n        roidbs = dataset.anno_cropper.crop_infer_anno_records(ori_roidbs)\n        # set new roidbs\n        
dataset.set_roidbs(roidbs)\n\n        return dataset\n\n    def _eval_with_loader(self, loader):\n        results = []\n        with paddle.no_grad():\n            self.infer_model.eval()\n            for step_id, data in enumerate(loader):\n                outs = self.infer_model(data)\n                for key in ['im_shape', 'scale_factor', 'im_id']:\n                    outs[key] = data[key]\n                for key, value in outs.items():\n                    if hasattr(value, 'numpy'):\n                        outs[key] = value.numpy()\n\n                results.append(outs)\n\n        return results\n\n    def on_train_end(self, status):\n        self.loader.dataset = self.dataset\n        results = self._eval_with_loader(self.loader)\n        results = self.dataset.anno_cropper.aggregate_chips_detections(results)\n        # sniper\n        proposals = []\n        clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()}\n        for outs in results:\n            batch_res = get_infer_results(outs, clsid2catid)\n            start = 0\n            for i, im_id in enumerate(outs['im_id']):\n                bbox_num = outs['bbox_num']\n                end = start + bbox_num[i]\n                bbox_res = batch_res['bbox'][start:end] \\\n                    if 'bbox' in batch_res else None\n                if bbox_res:\n                    proposals += bbox_res\n                # advance the window so the next image reads its own boxes\n                start = end\n        logger.info(\"Saving proposals to {}\".format(self.cfg.proposals_path))\n        with open(self.cfg.proposals_path, 'w') as f:\n            json.dump(proposals, f)\n\n\nclass SemiLogPrinter(LogPrinter):\n    def __init__(self, model):\n        super(SemiLogPrinter, self).__init__(model)\n\n    def on_step_end(self, status):\n        if dist.get_world_size() < 2 or dist.get_rank() == 0:\n            mode = status['mode']\n            if mode == 'train':\n                epoch_id = status['epoch_id']\n                step_id = status['step_id']\n                iter_id = status['iter_id']\n                steps_per_epoch = status['steps_per_epoch']\n                training_staus = status['training_staus']\n                batch_time = status['batch_time']\n                data_time = status['data_time']\n\n                epochs = self.model.cfg.epoch\n                batch_size = self.model.cfg['{}Reader'.format(mode.capitalize(\n                ))]['batch_size']\n                iters = epochs * steps_per_epoch\n                logs = training_staus.log()\n                iter_space_fmt = ':' + str(len(str(iters))) + 'd'\n                space_fmt = ':' + str(len(str(steps_per_epoch))) + 'd'\n                if step_id % self.model.cfg.log_iter == 0:\n                    eta_steps = (epochs - epoch_id) * steps_per_epoch - step_id\n                    eta_sec = eta_steps * batch_time.global_avg\n                    eta_str = str(datetime.timedelta(seconds=int(eta_sec)))\n                    ips = float(batch_size) / batch_time.avg\n                    fmt = ' '.join([\n                        '{' + iter_space_fmt + '}/{} iters',\n                        'Epoch: [{}]',\n                        '[{' + space_fmt + '}/{}]',\n                        'learning_rate: {lr:.6f}',\n                        '{meters}',\n                        'eta: {eta}',\n                        'batch_cost: {btime}',\n                        'data_cost: {dtime}',\n                        'ips: {ips:.4f} images/s',\n                    ])\n                    fmt = fmt.format(\n                        iter_id,\n                        iters,\n      
                  epoch_id,\n                        step_id,\n                        steps_per_epoch,\n                        lr=status['learning_rate'],\n                        meters=logs,\n                        eta=eta_str,\n                        btime=str(batch_time),\n                        dtime=str(data_time),\n                        ips=ips)\n                    logger.info(fmt)\n            if mode == 'eval':\n                step_id = status['step_id']\n                if step_id % 100 == 0:\n                    logger.info(\"Eval iter: {}\".format(step_id))\n\n\nclass SemiCheckpointer(Checkpointer):\n    def __init__(self, model):\n        super(SemiCheckpointer, self).__init__(model)\n        cfg = self.model.cfg\n        self.best_ap = 0.\n        self.save_dir = os.path.join(self.model.cfg.save_dir,\n                                     self.model.cfg.filename)\n        if hasattr(self.model.model, 'student') and hasattr(self.model.model,\n                                                            'teacher'):\n            self.weight = (self.model.model.teacher, self.model.model.student)\n        elif hasattr(self.model.model, 'student') or hasattr(self.model.model,\n                                                             'teacher'):\n            raise AttributeError(\n                \"model must have both 'student' and 'teacher' attributes\")\n        else:\n            raise AttributeError(\n                \"model has neither 'student' nor 'teacher' attribute\")\n\n    def every_n_iters(self, iter_id, n):\n        return (iter_id + 1) % n == 0 if n > 0 else False\n\n    def on_step_end(self, status):\n        # Checkpointer is only performed during training\n        mode = status['mode']\n        eval_interval = status['eval_interval']\n        save_interval = status['save_interval']\n        iter_id = status['iter_id']\n        epoch_id = status['epoch_id']\n        t_weight = None\n        s_weight = None\n        save_name = None\n        if dist.get_world_size() < 2 or dist.get_rank() == 0:\n            if self.every_n_iters(iter_id, save_interval) and mode == 'train':\n                save_name = \"last_epoch\"\n                # save_name = str(iter_id + 1)\n                t_weight = self.weight[0].state_dict()\n                s_weight = self.weight[1].state_dict()\n                save_semi_model(t_weight, s_weight, self.model.optimizer,\n                                self.save_dir, save_name, epoch_id + 1,\n                                iter_id + 1)\n\n    def on_epoch_end(self, status):\n        # The best checkpoint is saved when an evaluation epoch ends\n        mode = status['mode']\n        eval_interval = status['eval_interval']\n        save_interval = status['save_interval']\n        iter_id = status['iter_id']\n        epoch_id = status['epoch_id']\n        t_weight = None\n        s_weight = None\n        save_name = None\n        if dist.get_world_size() < 2 or dist.get_rank() == 0:\n            if self.every_n_iters(iter_id, eval_interval) and mode == 'eval':\n                if 'save_best_model' in status and status['save_best_model']:\n                    for metric in self.model._metrics:\n                        map_res = metric.get_results()\n                        if 'bbox' in map_res:\n                            key = 'bbox'\n                        elif 'keypoint' in map_res:\n                            key = 'keypoint'\n                        else:\n                            key = 'mask'\n                        if key not in map_res:\n             
               logger.warning(\"Evaluation results empty, this may be due to \" \\\n                                        \"training iterations being too few or not \" \\\n                                        \"loading the correct weights.\")\n                            return\n                        if map_res[key][0] > self.best_ap:\n                            self.best_ap = map_res[key][0]\n                            save_name = 'best_model'\n                            t_weight = self.weight[0].state_dict()\n                            s_weight = self.weight[1].state_dict()\n                        logger.info(\"Best teacher test {} ap is {:0.3f}.\".\n                                    format(key, self.best_ap))\n                    if t_weight and s_weight:\n                        save_semi_model(t_weight, s_weight,\n                                        self.model.optimizer, self.save_dir,\n                                        save_name, epoch_id + 1, iter_id + 1)\n"
  },
  {
    "path": "ppdet/engine/env.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport random\nimport numpy as np\n\nimport paddle\nfrom paddle.distributed import fleet\n\n__all__ = ['init_parallel_env', 'set_random_seed', 'init_fleet_env']\n\n\ndef init_fleet_env(find_unused_parameters=False):\n    strategy = fleet.DistributedStrategy()\n    strategy.find_unused_parameters = find_unused_parameters\n    fleet.init(is_collective=True, strategy=strategy)\n\n\ndef init_parallel_env():\n    env = os.environ\n    dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env\n    if dist:\n        trainer_id = int(env['PADDLE_TRAINER_ID'])\n        local_seed = (99 + trainer_id)\n        random.seed(local_seed)\n        np.random.seed(local_seed)\n\n    paddle.distributed.init_parallel_env()\n\n\ndef set_random_seed(seed):\n    paddle.seed(seed)\n    random.seed(seed)\n    np.random.seed(seed)\n"
  },
  {
    "path": "ppdet/engine/export_utils.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport yaml\nfrom collections import OrderedDict\n\nimport paddle\nfrom ppdet.data.source.category import get_categories\nfrom ppdet.core.workspace import load_config\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger('ppdet.engine')\n\n# Global dictionary\nTRT_MIN_SUBGRAPH = {\n    'YOLO': 3,\n    'PPYOLOE': 3,\n    'SSD': 60,\n    'RCNN': 40,\n    'RetinaNet': 40,\n    'S2ANet': 80,\n    'EfficientDet': 40,\n    'Face': 3,\n    'TTFNet': 60,\n    'FCOS': 16,\n    'SOLOv2': 60,\n    'HigherHRNet': 3,\n    'HRNet': 3,\n    'DeepSORT': 3,\n    'ByteTrack': 10,\n    'CenterTrack': 5,\n    'JDE': 10,\n    'FairMOT': 5,\n    'GFL': 16,\n    'PicoDet': 3,\n    'CenterNet': 5,\n    'TOOD': 5,\n    'YOLOX': 8,\n    'YOLOF': 40,\n    'METRO_Body': 3,\n    'DETR': 3,\n    'CLRNet': 3\n}\n\nKEYPOINT_ARCH = ['HigherHRNet', 'TopDownHRNet']\nMOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']\nLANE_ARCH = ['CLRNet']\n\nTO_STATIC_SPEC = {\n    'yolov3_darknet53_270e_coco': [{\n        'im_id': paddle.static.InputSpec(\n            name='im_id', shape=[-1, 1], dtype='float32'),\n        'is_crowd': paddle.static.InputSpec(\n            name='is_crowd', shape=[-1, 50], dtype='float32'),\n        'gt_bbox': paddle.static.InputSpec(\n            name='gt_bbox', shape=[-1, 50, 4], dtype='float32'),\n        'curr_iter': paddle.static.InputSpec(\n            name='curr_iter', shape=[-1], dtype='float32'),\n        'curr_epoch': paddle.static.InputSpec(\n            name='curr_epoch', shape=[-1], dtype='int64'),\n        'image': paddle.static.InputSpec(\n            name='image', shape=[-1, 3, -1, -1], dtype='float32'),\n        'im_shape': paddle.static.InputSpec(\n            name='im_shape', shape=[-1, 2], dtype='float32'),\n        'scale_factor': paddle.static.InputSpec(\n            name='scale_factor', shape=[-1, 2], dtype='float32'),\n        'target0': paddle.static.InputSpec(\n            name='target0', shape=[-1, 3, 86, -1, -1], dtype='float32'),\n        'target1': paddle.static.InputSpec(\n            name='target1', shape=[-1, 3, 86, -1, -1], dtype='float32'),\n        'target2': paddle.static.InputSpec(\n            name='target2', shape=[-1, 3, 86, -1, -1], dtype='float32'),\n    }],\n    'tinypose_128x96': [{\n        'center': paddle.static.InputSpec(\n            name='center', shape=[-1, 2], dtype='float32'),\n        'scale': paddle.static.InputSpec(\n            name='scale', shape=[-1, 2], dtype='float32'),\n        'im_id': paddle.static.InputSpec(\n            name='im_id', shape=[-1, 1], dtype='float32'),\n        'image': paddle.static.InputSpec(\n            name='image', shape=[-1, 3, 128, 96], dtype='float32'),\n        'score': paddle.static.InputSpec(\n   
         name='score', shape=[-1], dtype='float32'),\n        'rotate': paddle.static.InputSpec(\n            name='rotate', shape=[-1], dtype='float32'),\n        'target': paddle.static.InputSpec(\n            name='target', shape=[-1, 17, 32, 24], dtype='float32'),\n        'target_weight': paddle.static.InputSpec(\n            name='target_weight', shape=[-1, 17, 1], dtype='float32'),\n    }],\n    'fcos_r50_fpn_1x_coco': [{\n        'im_id': paddle.static.InputSpec(\n            name='im_id', shape=[-1, 1], dtype='float32'),\n        'curr_iter': paddle.static.InputSpec(\n            name='curr_iter', shape=[-1], dtype='float32'),\n        'curr_epoch': paddle.static.InputSpec(\n            name='curr_epoch', shape=[-1], dtype='int64'),\n        'image': paddle.static.InputSpec(\n            name='image', shape=[-1, 3, -1, -1], dtype='float32'),\n        'im_shape': paddle.static.InputSpec(\n            name='im_shape', shape=[-1, 2], dtype='float32'),\n        'scale_factor': paddle.static.InputSpec(\n            name='scale_factor', shape=[-1, 2], dtype='float32'),\n        'reg_target0': paddle.static.InputSpec(\n            name='reg_target0', shape=[-1, 160, 160, 4], dtype='float32'),\n        'labels0': paddle.static.InputSpec(\n            name='labels0', shape=[-1, 160, 160, 1], dtype='int32'),\n        'centerness0': paddle.static.InputSpec(\n            name='centerness0', shape=[-1, 160, 160, 1], dtype='float32'),\n        'reg_target1': paddle.static.InputSpec(\n            name='reg_target1', shape=[-1, 80, 80, 4], dtype='float32'),\n        'labels1': paddle.static.InputSpec(\n            name='labels1', shape=[-1, 80, 80, 1], dtype='int32'),\n        'centerness1': paddle.static.InputSpec(\n            name='centerness1', shape=[-1, 80, 80, 1], dtype='float32'),\n        'reg_target2': paddle.static.InputSpec(\n            name='reg_target2', shape=[-1, 40, 40, 4], dtype='float32'),\n        'labels2': paddle.static.InputSpec(\n            name='labels2', shape=[-1, 40, 40, 1], dtype='int32'),\n        'centerness2': paddle.static.InputSpec(\n            name='centerness2', shape=[-1, 40, 40, 1], dtype='float32'),\n        'reg_target3': paddle.static.InputSpec(\n            name='reg_target3', shape=[-1, 20, 20, 4], dtype='float32'),\n        'labels3': paddle.static.InputSpec(\n            name='labels3', shape=[-1, 20, 20, 1], dtype='int32'),\n        'centerness3': paddle.static.InputSpec(\n            name='centerness3', shape=[-1, 20, 20, 1], dtype='float32'),\n        'reg_target4': paddle.static.InputSpec(\n            name='reg_target4', shape=[-1, 10, 10, 4], dtype='float32'),\n        'labels4': paddle.static.InputSpec(\n            name='labels4', shape=[-1, 10, 10, 1], dtype='int32'),\n        'centerness4': paddle.static.InputSpec(\n            name='centerness4', shape=[-1, 10, 10, 1], dtype='float32'),\n    }],\n    'picodet_s_320_coco_lcnet': [{\n        'im_id': paddle.static.InputSpec(\n            name='im_id', shape=[-1, 1], dtype='float32'),\n        'is_crowd': paddle.static.InputSpec(\n            name='is_crowd', shape=[-1, -1, 1], dtype='float32'),\n        'gt_class': paddle.static.InputSpec(\n            name='gt_class', shape=[-1, -1, 1], dtype='int32'),\n        'gt_bbox': paddle.static.InputSpec(\n            name='gt_bbox', shape=[-1, -1, 4], dtype='float32'),\n        'curr_iter': paddle.static.InputSpec(\n            name='curr_iter', shape=[-1], dtype='float32'),\n        'curr_epoch': paddle.static.InputSpec(\n            
name='curr_epoch', shape=[-1], dtype='int64'),\n        'image': paddle.static.InputSpec(\n            name='image', shape=[-1, 3, -1, -1], dtype='float32'),\n        'im_shape': paddle.static.InputSpec(\n            name='im_shape', shape=[-1, 2], dtype='float32'),\n        'scale_factor': paddle.static.InputSpec(\n            name='scale_factor', shape=[-1, 2], dtype='float32'),\n        'pad_gt_mask': paddle.static.InputSpec(\n            name='pad_gt_mask', shape=[-1, -1, 1], dtype='float32'),\n    }],\n    'ppyoloe_crn_s_300e_coco': [{\n        'im_id': paddle.static.InputSpec(\n            name='im_id', shape=[-1, 1], dtype='float32'),\n        'is_crowd': paddle.static.InputSpec(\n            name='is_crowd', shape=[-1, -1, 1], dtype='float32'),\n        'gt_class': paddle.static.InputSpec(\n            name='gt_class', shape=[-1, -1, 1], dtype='int32'),\n        'gt_bbox': paddle.static.InputSpec(\n            name='gt_bbox', shape=[-1, -1, 4], dtype='float32'),\n        'curr_iter': paddle.static.InputSpec(\n            name='curr_iter', shape=[-1], dtype='float32'),\n        'curr_epoch': paddle.static.InputSpec(\n            name='curr_epoch', shape=[-1], dtype='int64'),\n        'image': paddle.static.InputSpec(\n            name='image', shape=[-1, 3, -1, -1], dtype='float32'),\n        'im_shape': paddle.static.InputSpec(\n            name='im_shape', shape=[-1, 2], dtype='float32'),\n        'scale_factor': paddle.static.InputSpec(\n            name='scale_factor', shape=[-1, 2], dtype='float32'),\n        'pad_gt_mask': paddle.static.InputSpec(\n            name='pad_gt_mask', shape=[-1, -1, 1], dtype='float32'),\n    }],\n}\n\n\ndef apply_to_static(config, model):\n    filename = config.get('filename', None)\n    spec = TO_STATIC_SPEC.get(filename, None)\n    model = paddle.jit.to_static(model, input_spec=spec)\n    logger.info(\"Successfully applied @to_static with specs: {}\".format(spec))\n    return model\n\n\ndef _prune_input_spec(input_spec, program, targets):\n    # try to prune static program to figure out pruned input spec\n    # so we perform following operations in static mode\n    device = paddle.get_device()\n    paddle.enable_static()\n    paddle.set_device(device)\n    pruned_input_spec = [{}]\n    program = program.clone()\n    program = program._prune(targets=targets)\n    global_block = program.global_block()\n    pir_value_set = set()\n    if paddle.framework.use_pir_api():\n        for op in global_block.ops:\n            if op.name() == 'pd_op.data':\n                pir_value_set.add(op.attrs()[\"name\"])  # set.add, not set.insert\n    for name, spec in input_spec[0].items():\n        if paddle.framework.use_pir_api():\n            if name in pir_value_set:\n                pruned_input_spec[0][name] = spec\n        else:\n            try:\n                v = global_block.var(name)\n                pruned_input_spec[0][name] = spec\n            except Exception:\n                pass\n    paddle.disable_static(place=device)\n    return pruned_input_spec\n\n\ndef _parse_reader(reader_cfg, dataset_cfg, metric, arch, image_shape):\n    preprocess_list = []\n    label_list = []\n    if arch != \"lane_arch\":\n        anno_file = dataset_cfg.get_anno()\n\n        clsid2catid, catid2name = get_categories(metric, anno_file, arch)\n\n        label_list = [str(cat) for cat in catid2name.values()]\n\n    fuse_normalize = reader_cfg.get('fuse_normalize', False)\n    sample_transforms = reader_cfg['sample_transforms']\n    hpi_dynamic_shape = None\n    for st in 
sample_transforms[1:]:\n        for key, value in st.items():\n            p = {'type': key}\n            if key == 'Resize':\n                if int(image_shape[1]) != -1:\n                    value['target_size'] = image_shape[1:]\n                    hpi_dynamic_shape = image_shape[1:]\n                value['interp'] = value.get('interp', 1)  # cv2.INTER_LINEAR\n            if fuse_normalize and key == 'NormalizeImage':\n                continue\n            p.update(value)\n            preprocess_list.append(p)\n    batch_transforms = reader_cfg.get('batch_transforms', None)\n    if batch_transforms:\n        for bt in batch_transforms:\n            for key, value in bt.items():\n                # for deploy/infer, use PadStride(stride) instead of PadBatch(pad_to_stride)\n                if key == 'PadBatch':\n                    preprocess_list.append({\n                        'type': 'PadStride',\n                        'stride': value['pad_to_stride']\n                    })\n                    break\n                elif key == \"CULaneResize\":\n                    # cut and resize\n                    p = {'type': key}\n                    p.update(value)\n                    p.update({\"cut_height\": dataset_cfg.cut_height})\n                    preprocess_list.append(p)\n                    break\n\n    return preprocess_list, label_list, hpi_dynamic_shape\n\n\ndef _parse_tracker(tracker_cfg):\n    tracker_params = {}\n    for k, v in tracker_cfg.items():\n        tracker_params.update({k: v})\n    return tracker_params\n\n\ndef _dump_infer_config(config, path, image_shape, model):\n    arch_state = False\n    from ppdet.core.config.yaml_helpers import setup_orderdict\n    setup_orderdict()\n    use_dynamic_shape = image_shape[2] == -1\n    infer_cfg = OrderedDict({\n        'mode': 'paddle',\n        'draw_threshold': 0.5,\n        'metric': config['metric'],\n        'use_dynamic_shape': use_dynamic_shape\n    })\n    if config.get('pdx_model_name', None):\n        infer_cfg[\"Global\"] = {\"model_name\": config[\"pdx_model_name\"]}\n    export_onnx = config.get('export_onnx', False)\n    export_eb = config.get('export_eb', False)\n\n    infer_arch = config['architecture']\n    if 'RCNN' in infer_arch and export_onnx:\n        logger.warning(\n            \"Exporting RCNN model to ONNX only supports batch_size = 1\")\n        infer_cfg['export_onnx'] = True\n        infer_cfg['export_eb'] = export_eb\n\n    if infer_arch in MOT_ARCH:\n        if infer_arch == 'DeepSORT':\n            tracker_cfg = config['DeepSORTTracker']\n        elif infer_arch == 'CenterTrack':\n            tracker_cfg = config['CenterTracker']\n        else:\n            tracker_cfg = config['JDETracker']\n        infer_cfg['tracker'] = _parse_tracker(tracker_cfg)\n\n    for arch, min_subgraph_size in TRT_MIN_SUBGRAPH.items():\n        if arch in infer_arch:\n            infer_cfg['arch'] = arch\n            infer_cfg['min_subgraph_size'] = min_subgraph_size\n            arch_state = True\n            break\n\n    if infer_arch == 'PPYOLOEWithAuxHead':\n        infer_arch = 'PPYOLOE'\n\n    if infer_arch in ['PPYOLOE', 'YOLOX', 'YOLOF']:\n        infer_cfg['arch'] = infer_arch\n        infer_cfg['min_subgraph_size'] = TRT_MIN_SUBGRAPH[infer_arch]\n        arch_state = True\n\n    if infer_arch == 'DETR' and config.get('with_mask', False):\n        infer_cfg['mask'] = True\n\n    if not arch_state:\n        logger.error(\n            'Architecture: {} is not supported for exporting 
model now.\\n'.\n            format(infer_arch) +\n            'Please set TRT_MIN_SUBGRAPH in ppdet/engine/export_utils.py')\n        os._exit(0)\n    if 'mask_head' in config[config['architecture']] and config[config[\n            'architecture']]['mask_head']:\n        infer_cfg['mask'] = True\n    if 'with_mask' in config[config['architecture']] and config[config[\n            'architecture']]['with_mask']:\n        infer_cfg['mask'] = True\n    label_arch = 'detection_arch'\n    if infer_arch in KEYPOINT_ARCH:\n        label_arch = 'keypoint_arch'\n\n    if infer_arch in LANE_ARCH:\n        infer_cfg['arch'] = infer_arch\n        infer_cfg['min_subgraph_size'] = TRT_MIN_SUBGRAPH[infer_arch]\n        infer_cfg['img_w'] = config['img_w']\n        infer_cfg['ori_img_h'] = config['ori_img_h']\n        infer_cfg['cut_height'] = config['cut_height']\n        label_arch = 'lane_arch'\n        head_name = \"CLRHead\"\n        infer_cfg['conf_threshold'] = config[head_name]['conf_threshold']\n        infer_cfg['nms_thres'] = config[head_name]['nms_thres']\n        infer_cfg['max_lanes'] = config[head_name]['max_lanes']\n        infer_cfg['num_points'] = config[head_name]['num_points']\n        arch_state = True\n\n    if infer_arch in MOT_ARCH:\n        if config['metric'] in ['COCO', 'VOC']:\n            # MOT model run as Detector\n            reader_cfg = config['TestReader']\n            dataset_cfg = config['TestDataset']\n        else:\n            # 'metric' in ['MOT', 'MCMOT', 'KITTI']\n            label_arch = 'mot_arch'\n            reader_cfg = config['TestMOTReader']\n            dataset_cfg = config['TestMOTDataset']\n    else:\n        reader_cfg = config['TestReader']\n        dataset_cfg = config['TestDataset']\n\n    infer_cfg['Preprocess'], infer_cfg['label_list'], hpi_dynamic_shape = _parse_reader(\n        reader_cfg, dataset_cfg, config['metric'], label_arch, image_shape[1:])\n    if config.get(\"uniform_output_enabled\", None):\n        def get_dynamic_shapes(hpi_shape):\n            return [[1, 3] + hpi_shape, [1, 3] + hpi_shape, [8, 3] + hpi_shape]\n\n        dynamic_shapes = get_dynamic_shapes(hpi_dynamic_shape) if hpi_dynamic_shape else [\n            [1, 3, 320, 320],\n            [1, 3, 640, 640],\n            [8, 3, 1280, 1280]\n        ]\n        shapes = {\n            \"image\": dynamic_shapes,\n            \"im_shape\": [[1, 2], [1, 2], [8, 2]],\n            \"scale_factor\": [[1, 2], [1, 2], [8, 2]]\n        }\n        trt_dynamic_shape = [\n            [dim for _ in range(shape[0]) for dim in shape[2:]]\n            for shape in dynamic_shapes\n        ]\n        trt_dynamic_shape_input_data = {\n            \"im_shape\": trt_dynamic_shape,\n            \"scale_factor\": [\n                [2, 2],\n                [1, 1],\n                [0.67 for _ in range(2 * shapes[\"scale_factor\"][-1][0])]\n            ]\n        }\n        hpi_config = OrderedDict({\n            \"backend_configs\": OrderedDict({\n                \"paddle_infer\": OrderedDict({\n                    \"trt_dynamic_shapes\": shapes,\n                    \"trt_dynamic_shape_input_data\": trt_dynamic_shape_input_data\n                }),\n                \"tensorrt\": OrderedDict({\n                    \"dynamic_shapes\": shapes\n                })\n            })\n        })\n        infer_cfg[\"Hpi\"] = hpi_config\n\n    if infer_arch == 'PicoDet':\n        if hasattr(config, 'export') and config['export'].get(\n                'post_process',\n                False) and not 
config['export'].get('benchmark', False):\n            infer_cfg['arch'] = 'GFL'\n        head_name = 'PicoHeadV2' if config['PicoHeadV2'] else 'PicoHead'\n        infer_cfg['NMS'] = config[head_name]['nms']\n        # To speed up prediction, the nms thresholds are adjusted here;\n        # they can be changed in infer_cfg.yml\n        config[head_name]['nms'][\"score_threshold\"] = 0.3\n        config[head_name]['nms'][\"nms_threshold\"] = 0.5\n        infer_cfg['fpn_stride'] = config[head_name]['fpn_stride']\n\n    with open(path, 'w') as f:\n        yaml.dump(infer_cfg, f)\n    logger.info(\"Export inference config file to {}\".format(path))\n"
  },
  {
    "path": "ppdet/engine/naive_sync_bn.py",
    "content": "import paddle.distributed as dist\nimport math\nimport paddle\nimport paddle.nn as nn\n\n\nclass _AllReduce(paddle.autograd.PyLayer):\n    @staticmethod\n    def forward(ctx, input):\n        input_list = [paddle.zeros_like(input) for k in range(dist.get_world_size())]\n        # Use allgather instead of allreduce since I don't trust in-place operations ..\n        dist.all_gather(input_list, input, sync_op=True)\n        inputs = paddle.stack(input_list, axis=0)\n        return paddle.sum(inputs, axis=0)\n\n    @staticmethod\n    def backward(ctx, grad_output):\n        dist.all_reduce(grad_output, sync_op=True)\n        return grad_output\n\n\ndef differentiable_all_reduce(input):\n    \"\"\"\n    Differentiable counterpart of `dist.all_reduce`.\n    \"\"\"\n    if (\n        not dist.is_available()\n        or not dist.is_initialized()\n        or dist.get_world_size() == 1\n    ):\n        return input\n    return _AllReduce.apply(input)\n\n\nclass NaiveSyncBatchNorm(nn.BatchNorm2D):\n\n    def __init__(self, *args, stats_mode=\"\", **kwargs):\n        super().__init__(*args, **kwargs)\n        assert stats_mode in [\"\", \"N\"]\n        self._stats_mode = stats_mode\n\n    def forward(self, input):\n        if dist.get_world_size() == 1 or not self.training:\n            return super(NaiveSyncBatchNorm, self).forward(input)\n\n        B, C = input.shape[0], input.shape[1]\n\n        mean = paddle.mean(input, axis=[0, 2, 3])\n        meansqr = paddle.mean(input * input, axis=[0, 2, 3])\n\n        if self._stats_mode == \"\":\n            assert B > 0, 'SyncBatchNorm(stats_mode=\"\") does not support zero batch size.'\n            vec = paddle.concat([mean, meansqr], axis=0)\n            vec = differentiable_all_reduce(vec) * (1.0 / dist.get_world_size())\n            mean, meansqr = paddle.split(vec, [C, C])\n            momentum = 1 - self._momentum # NOTE: paddle has reverse momentum defination\n        else:\n            if B == 0:\n                vec = paddle.zeros([2 * C + 1], dtype=mean.dtype)\n                vec = vec + input.sum()  # make sure there is gradient w.r.t input\n            else:\n                vec = paddle.concat(\n                    [\n                        mean,\n                        meansqr,\n                        paddle.ones([1], dtype=mean.dtype),\n                    ],\n                    axis=0,\n                )\n            vec = differentiable_all_reduce(vec * B)\n\n            total_batch = vec[-1].detach()\n            momentum = total_batch.clip(max=1) * (1 - self._momentum)  # no update if total_batch is 0\n            mean, meansqr, _ = paddle.split(vec / total_batch.clip(min=1), [C, C, int(vec.shape[0] - 2*C)])  # avoid div-by-zero\n\n        var = meansqr - mean * mean\n        invstd = paddle.rsqrt(var + self._epsilon)\n        scale = self.weight * invstd\n        bias = self.bias - mean * scale\n        scale = scale.reshape([1, -1, 1, 1])\n        bias = bias.reshape([1, -1, 1, 1])\n\n        tmp_mean = self._mean + momentum * (mean.detach() - self._mean)\n        self._mean.set_value(tmp_mean)\n        tmp_variance = self._variance + (momentum * (var.detach() - self._variance))\n        self._variance.set_value(tmp_variance)\n        ret = input * scale + bias\n        return ret\n\n\ndef convert_syncbn(model):\n    for n, m in model.named_children():\n        if isinstance(m, nn.layer.norm._BatchNormBase):\n            syncbn = NaiveSyncBatchNorm(m._num_features, m._momentum, m._epsilon, m._weight_attr, 
m._bias_attr)\n            setattr(model, n, syncbn)\n        else:\n            convert_syncbn(m)"
  },
  {
    "path": "ppdet/engine/tracker.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport glob\nimport re\nimport paddle\nimport paddle.nn as nn\nimport numpy as np\nfrom tqdm import tqdm\nfrom collections import defaultdict\n\nfrom ppdet.core.workspace import create\nfrom ppdet.utils.checkpoint import load_weight, load_pretrain_weight\nfrom ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box\nfrom ppdet.modeling.mot.utils import MOTTimer, load_det_results, write_mot_results, save_vis_results\nfrom ppdet.modeling.mot.tracker import JDETracker, CenterTracker\nfrom ppdet.modeling.mot.tracker import DeepSORTTracker, OCSORTTracker, BOTSORTTracker\nfrom ppdet.modeling.architectures import YOLOX\nfrom ppdet.metrics import Metric, MOTMetric, KITTIMOTMetric, MCMOTMetric\nfrom ppdet.data.source.category import get_categories\nimport ppdet.utils.stats as stats\n\nfrom .callbacks import Callback, ComposeCallback\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\nMOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']\nMOT_ARCH_JDE = MOT_ARCH[:2]\nMOT_ARCH_SDE = MOT_ARCH[2:4]\nMOT_DATA_TYPE = ['mot', 'mcmot', 'kitti']\n\n__all__ = ['Tracker']\n\n\nclass Tracker(object):\n    def __init__(self, cfg, mode='eval'):\n        self.cfg = cfg\n        assert mode.lower() in ['test', 'eval'], \\\n                \"mode should be 'test' or 'eval'\"\n        self.mode = mode.lower()\n        self.optimizer = None\n\n        # build MOT data loader\n        self.dataset = cfg['{}MOTDataset'.format(self.mode.capitalize())]\n\n        # build model\n        self.model = create(cfg.architecture)\n\n        if isinstance(self.model.detector, YOLOX):\n            for k, m in self.model.named_sublayers():\n                if isinstance(m, nn.BatchNorm2D):\n                    m._epsilon = 1e-3  # for amp(fp16)\n                    m._momentum = 0.97  # 0.03 in pytorch\n\n        anno_file = self.dataset.get_anno()\n        clsid2catid, catid2name = get_categories(\n            self.cfg.metric, anno_file=anno_file)\n        self.ids2names = []\n        for k, v in catid2name.items():\n            self.ids2names.append(v)\n\n        self.status = {}\n        self.start_epoch = 0\n\n        # initial default callbacks\n        self._init_callbacks()\n\n        # initial default metrics\n        self._init_metrics()\n        self._reset_metrics()\n\n    def _init_callbacks(self):\n        self._callbacks = []\n        self._compose_callback = None\n\n    def _init_metrics(self):\n        if self.mode in ['test']:\n            self._metrics = []\n            return\n\n        if self.cfg.metric == 'MOT':\n            self._metrics = [MOTMetric(), ]\n        elif self.cfg.metric == 'MCMOT':\n            self._metrics = [MCMOTMetric(self.cfg.num_classes), ]\n        elif 
self.cfg.metric == 'KITTI':\n            self._metrics = [KITTIMOTMetric(), ]\n        else:\n            logger.warning(\"Metric not supported for metric type {}\".format(\n                self.cfg.metric))\n            self._metrics = []\n\n    def _reset_metrics(self):\n        for metric in self._metrics:\n            metric.reset()\n\n    def register_callbacks(self, callbacks):\n        callbacks = [h for h in list(callbacks) if h is not None]\n        for c in callbacks:\n            assert isinstance(c, Callback), \\\n                    \"callbacks should be instances of subclass of Callback\"\n        self._callbacks.extend(callbacks)\n        self._compose_callback = ComposeCallback(self._callbacks)\n\n    def register_metrics(self, metrics):\n        metrics = [m for m in list(metrics) if m is not None]\n        for m in metrics:\n            assert isinstance(m, Metric), \\\n                    \"metrics should be instances of subclass of Metric\"\n        self._metrics.extend(metrics)\n\n    def load_weights_jde(self, weights):\n        load_weight(self.model, weights, self.optimizer)\n\n    def load_weights_sde(self, det_weights, reid_weights):\n        with_detector = self.model.detector is not None\n        with_reid = self.model.reid is not None\n\n        if with_detector:\n            load_weight(self.model.detector, det_weights)\n            if with_reid:\n                load_weight(self.model.reid, reid_weights)\n        else:\n            load_weight(self.model.reid, reid_weights)\n\n    def _eval_seq_centertrack(self,\n                              dataloader,\n                              save_dir=None,\n                              show_image=False,\n                              frame_rate=30,\n                              draw_threshold=0):\n        assert isinstance(self.model.tracker, CenterTracker)\n        if save_dir:\n            if not os.path.exists(save_dir): os.makedirs(save_dir)\n        tracker = self.model.tracker\n\n        timer = MOTTimer()\n        frame_id = 0\n        self.status['mode'] = 'track'\n        self.model.eval()\n        results = defaultdict(list)  # only single class is supported now\n\n        for step_id, data in enumerate(tqdm(dataloader)):\n            self.status['step_id'] = step_id\n            if step_id == 0:\n                self.model.reset_tracking()\n\n            # forward\n            timer.tic()\n            pred_ret = self.model(data)\n\n            online_targets = tracker.update(pred_ret)\n            online_tlwhs, online_scores, online_ids = [], [], []\n            for t in online_targets:\n                bbox = t['bbox']\n                tlwh = [bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]]\n                tscore = float(t['score'])\n                tid = int(t['tracking_id'])\n                if tlwh[2] * tlwh[3] > 0:\n                    online_tlwhs.append(tlwh)\n                    online_ids.append(tid)\n                    online_scores.append(tscore)\n            timer.toc()\n            # save results\n            results[0].append(\n                (frame_id + 1, online_tlwhs, online_scores, online_ids))\n            save_vis_results(data, frame_id, online_ids, online_tlwhs,\n                             online_scores, timer.average_time, show_image,\n                             save_dir, self.cfg.num_classes, self.ids2names)\n            frame_id += 1\n        return results, frame_id, timer.average_time, timer.calls\n\n    def _eval_seq_jde(self,\n                      dataloader,\n           
           save_dir=None,\n                      show_image=False,\n                      frame_rate=30,\n                      draw_threshold=0):\n        if save_dir:\n            if not os.path.exists(save_dir): os.makedirs(save_dir)\n        tracker = self.model.tracker\n        tracker.max_time_lost = int(frame_rate / 30.0 * tracker.track_buffer)\n\n        timer = MOTTimer()\n        frame_id = 0\n        self.status['mode'] = 'track'\n        self.model.eval()\n        results = defaultdict(list)  # support single class and multi classes\n\n        for step_id, data in enumerate(tqdm(dataloader)):\n            self.status['step_id'] = step_id\n            # forward\n            timer.tic()\n            pred_dets, pred_embs = self.model(data)\n\n            pred_dets, pred_embs = pred_dets.numpy(), pred_embs.numpy()\n            online_targets_dict = self.model.tracker.update(pred_dets,\n                                                            pred_embs)\n            online_tlwhs = defaultdict(list)\n            online_scores = defaultdict(list)\n            online_ids = defaultdict(list)\n            for cls_id in range(self.cfg.num_classes):\n                online_targets = online_targets_dict[cls_id]\n                for t in online_targets:\n                    tlwh = t.tlwh\n                    tid = t.track_id\n                    tscore = t.score\n                    if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue\n                    if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[\n                            3] > tracker.vertical_ratio:\n                        continue\n                    online_tlwhs[cls_id].append(tlwh)\n                    online_ids[cls_id].append(tid)\n                    online_scores[cls_id].append(tscore)\n                # save results\n                results[cls_id].append(\n                    (frame_id + 1, online_tlwhs[cls_id], online_scores[cls_id],\n                     online_ids[cls_id]))\n\n            timer.toc()\n            save_vis_results(data, frame_id, online_ids, online_tlwhs,\n                             online_scores, timer.average_time, show_image,\n                             save_dir, self.cfg.num_classes, self.ids2names)\n            frame_id += 1\n\n        return results, frame_id, timer.average_time, timer.calls\n\n    def _eval_seq_sde(self,\n                      dataloader,\n                      save_dir=None,\n                      show_image=False,\n                      frame_rate=30,\n                      seq_name='',\n                      scaled=False,\n                      det_file='',\n                      draw_threshold=0):\n        if save_dir:\n            if not os.path.exists(save_dir): os.makedirs(save_dir)\n        use_detector = self.model.detector is not None\n        use_reid = hasattr(self.model, 'reid') and self.model.reid is not None\n\n        timer = MOTTimer()\n        results = defaultdict(list)\n        frame_id = 0\n        self.status['mode'] = 'track'\n        self.model.eval()\n        if use_reid:\n            self.model.reid.eval()\n        if not use_detector:\n            dets_list = load_det_results(det_file, len(dataloader))\n            logger.info('Finished loading detection results file {}.'.format(\n                det_file))\n\n        tracker = self.model.tracker\n        for step_id, data in enumerate(tqdm(dataloader)):\n            
self.status['step_id'] = step_id\n            ori_image = data['ori_image']  # [bs, H, W, 3]\n            ori_image_shape = data['ori_image'].shape[1:3]\n            # ori_image_shape: [H, W]\n\n            input_shape = data['image'].shape[2:]\n            # input_shape: [h, w], before data transforms, set in model config\n\n            im_shape = data['im_shape'][0].numpy()\n            # im_shape: [new_h, new_w], after data transforms\n            scale_factor = data['scale_factor'][0].numpy()\n\n            empty_detections = False\n            # when there are no detected bboxes, the reid model is not run;\n            # if visualizing, the original image is used instead\n\n            # forward\n            timer.tic()\n            if not use_detector:\n                dets = dets_list[frame_id]\n                bbox_tlwh = np.array(dets['bbox'], dtype='float32')\n                if bbox_tlwh.shape[0] > 0:\n                    # detector outputs: pred_cls_ids, pred_scores, pred_bboxes\n                    pred_cls_ids = np.array(dets['cls_id'], dtype='float32')\n                    pred_scores = np.array(dets['score'], dtype='float32')\n                    pred_bboxes = np.concatenate(\n                        (bbox_tlwh[:, 0:2],\n                         bbox_tlwh[:, 2:4] + bbox_tlwh[:, 0:2]),\n                        axis=1)\n                else:\n                    logger.warning(\n                        'Frame {} has no detected object, try to modify score threshold.'.\n                        format(frame_id))\n                    empty_detections = True\n            else:\n                outs = self.model.detector(data)\n                outs['bbox'] = outs['bbox'].numpy()\n                outs['bbox_num'] = outs['bbox_num'].numpy()\n\n                if len(outs['bbox']) > 0 and not empty_detections:\n                    # detector outputs: pred_cls_ids, pred_scores, pred_bboxes\n                    pred_cls_ids = outs['bbox'][:, 0:1]\n                    pred_scores = outs['bbox'][:, 1:2]\n                    if not scaled:\n                        # Note: scaled=False only in JDE YOLOv3 or other detectors\n                        # with LetterBoxResize and JDEBBoxPostProcess.\n                        #\n                        # 'scaled' means whether the coords after detector outputs\n                        # have been scaled back to the original image, set True \n                        # in general detector, set False in JDE YOLOv3.\n                        pred_bboxes = scale_coords(outs['bbox'][:, 2:],\n                                                   input_shape, im_shape,\n                                                   scale_factor)\n                    else:\n                        pred_bboxes = outs['bbox'][:, 2:]\n                    pred_dets_old = np.concatenate(\n                        (pred_cls_ids, pred_scores, pred_bboxes), axis=1)\n                else:\n                    logger.warning(\n                        'Frame {} has no detected object, try to modify score threshold.'.\n                        format(frame_id))\n                    empty_detections = True\n\n            if not empty_detections:\n                pred_xyxys, keep_idx = clip_box(pred_bboxes, ori_image_shape)\n                if len(keep_idx[0]) == 0:\n                    logger.warning(\n                        'Frame {} has no detected object left after clip_box.'.\n                        format(frame_id))\n                    empty_detections = True\n\n            if 
empty_detections:\n                timer.toc()\n                # if visualizing, use the original image instead\n                online_ids, online_tlwhs, online_scores = None, None, None\n                save_vis_results(data, frame_id, online_ids, online_tlwhs,\n                                 online_scores, timer.average_time, show_image,\n                                 save_dir, self.cfg.num_classes, self.ids2names)\n                frame_id += 1\n                # thus the reid model will not be run\n                continue\n\n            pred_cls_ids = pred_cls_ids[keep_idx[0]]\n            pred_scores = pred_scores[keep_idx[0]]\n            pred_dets = np.concatenate(\n                (pred_cls_ids, pred_scores, pred_xyxys), axis=1)\n\n            if use_reid:\n                crops = get_crops(\n                    pred_xyxys,\n                    ori_image,\n                    w=tracker.input_size[0],\n                    h=tracker.input_size[1])\n                crops = paddle.to_tensor(crops)\n\n                data.update({'crops': crops})\n                pred_embs = self.model(data)['embeddings'].numpy()\n            else:\n                pred_embs = None\n\n            if isinstance(tracker, DeepSORTTracker):\n                online_tlwhs, online_scores, online_ids = [], [], []\n                tracker.predict()\n                online_targets = tracker.update(pred_dets, pred_embs)\n                for t in online_targets:\n                    if not t.is_confirmed() or t.time_since_update > 1:\n                        continue\n                    tlwh = t.to_tlwh()\n                    tscore = t.score\n                    tid = t.track_id\n                    if tscore < draw_threshold: continue\n                    if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue\n                    if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[\n                            3] > tracker.vertical_ratio:\n                        continue\n                    online_tlwhs.append(tlwh)\n                    online_scores.append(tscore)\n                    online_ids.append(tid)\n                timer.toc()\n\n                # save results\n                results[0].append(\n                    (frame_id + 1, online_tlwhs, online_scores, online_ids))\n                save_vis_results(data, frame_id, online_ids, online_tlwhs,\n                                 online_scores, timer.average_time, show_image,\n                                 save_dir, self.cfg.num_classes, self.ids2names)\n\n            elif isinstance(tracker, JDETracker):\n                # trick hyperparams only used for MOTChallenge (MOT17, MOT20) Test-set\n                tracker.track_buffer, tracker.conf_thres = get_trick_hyperparams(\n                    seq_name, tracker.track_buffer, tracker.conf_thres)\n\n                online_targets_dict = tracker.update(pred_dets_old, pred_embs)\n                online_tlwhs = defaultdict(list)\n                online_scores = defaultdict(list)\n                online_ids = defaultdict(list)\n                for cls_id in range(self.cfg.num_classes):\n                    online_targets = online_targets_dict[cls_id]\n                    for t in online_targets:\n                        tlwh = t.tlwh\n                        tid = t.track_id\n                        tscore = t.score\n                        if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue\n                        if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[\n                                3] > 
tracker.vertical_ratio:\n                            continue\n                        online_tlwhs[cls_id].append(tlwh)\n                        online_ids[cls_id].append(tid)\n                        online_scores[cls_id].append(tscore)\n                    # save results\n                    results[cls_id].append(\n                        (frame_id + 1, online_tlwhs[cls_id],\n                         online_scores[cls_id], online_ids[cls_id]))\n                timer.toc()\n                save_vis_results(data, frame_id, online_ids, online_tlwhs,\n                                 online_scores, timer.average_time, show_image,\n                                 save_dir, self.cfg.num_classes, self.ids2names)\n\n            elif isinstance(tracker, OCSORTTracker):\n                # OC_SORT Tracker\n                online_targets = tracker.update(pred_dets_old, pred_embs)\n                online_tlwhs = []\n                online_ids = []\n                online_scores = []\n                for t in online_targets:\n                    tlwh = [t[0], t[1], t[2] - t[0], t[3] - t[1]]\n                    tscore = float(t[4])\n                    tid = int(t[5])\n                    if tlwh[2] * tlwh[3] > 0:\n                        online_tlwhs.append(tlwh)\n                        online_ids.append(tid)\n                        online_scores.append(tscore)\n                timer.toc()\n                # save results\n                results[0].append(\n                    (frame_id + 1, online_tlwhs, online_scores, online_ids))\n                save_vis_results(data, frame_id, online_ids, online_tlwhs,\n                                 online_scores, timer.average_time, show_image,\n                                 save_dir, self.cfg.num_classes, self.ids2names)\n\n            elif isinstance(tracker, BOTSORTTracker):\n                # BOTSORT Tracker\n                online_targets = tracker.update(\n                    pred_dets_old, img=ori_image.numpy())\n                online_tlwhs = []\n                online_ids = []\n                online_scores = []\n                for t in online_targets:\n                    tlwh = t.tlwh\n                    tid = t.track_id\n                    tscore = t.score\n                    if tlwh[2] * tlwh[3] > 0:\n                        online_tlwhs.append(tlwh)\n                        online_ids.append(tid)\n                        online_scores.append(tscore)\n                timer.toc()\n                # save results\n                results[0].append(\n                    (frame_id + 1, online_tlwhs, online_scores, online_ids))\n                save_vis_results(data, frame_id, online_ids, online_tlwhs,\n                                 online_scores, timer.average_time, show_image,\n                                 save_dir, self.cfg.num_classes, self.ids2names)\n\n            else:\n                raise ValueError(tracker)\n            frame_id += 1\n\n        return results, frame_id, timer.average_time, timer.calls\n\n    def mot_evaluate(self,\n                     data_root,\n                     seqs,\n                     output_dir,\n                     data_type='mot',\n                     model_type='JDE',\n                     save_images=False,\n                     save_videos=False,\n                     show_image=False,\n                     scaled=False,\n                     det_results_dir=''):\n        if not os.path.exists(output_dir): os.makedirs(output_dir)\n        result_root = os.path.join(output_dir, 
'mot_results')\n        if not os.path.exists(result_root): os.makedirs(result_root)\n        assert data_type in MOT_DATA_TYPE, \\\n            \"data_type should be 'mot', 'mcmot' or 'kitti'\"\n        assert model_type in MOT_ARCH, \\\n            \"model_type should be 'JDE', 'DeepSORT', 'FairMOT', 'ByteTrack' or 'CenterTrack'\"\n\n        # run tracking\n        n_frame = 0\n        timer_avgs, timer_calls = [], []\n        for seq in seqs:\n            infer_dir = os.path.join(data_root, seq)\n            if not os.path.exists(infer_dir) or not os.path.isdir(infer_dir):\n                logger.warning(\"Skipping seq {}: {} has no images.\".format(\n                    seq, infer_dir))\n                continue\n            if os.path.exists(os.path.join(infer_dir, 'img1')):\n                infer_dir = os.path.join(infer_dir, 'img1')\n\n            frame_rate = 30\n            seqinfo = os.path.join(data_root, seq, 'seqinfo.ini')\n            if os.path.exists(seqinfo):\n                with open(seqinfo) as f:\n                    meta_info = f.read()\n                frame_rate = int(meta_info[meta_info.find('frameRate') + 10:\n                                           meta_info.find('\\nseqLength')])\n\n            save_dir = os.path.join(output_dir, 'mot_outputs',\n                                    seq) if save_images or save_videos else None\n            logger.info('Evaluate seq: {}'.format(seq))\n\n            self.dataset.set_images(self.get_infer_images(infer_dir))\n            dataloader = create('EvalMOTReader')(self.dataset, 0)\n\n            result_filename = os.path.join(result_root, '{}.txt'.format(seq))\n\n            with paddle.no_grad():\n                if model_type in MOT_ARCH_JDE:\n                    results, nf, ta, tc = self._eval_seq_jde(\n                        dataloader,\n                        save_dir=save_dir,\n                        show_image=show_image,\n                        frame_rate=frame_rate)\n                elif model_type in MOT_ARCH_SDE:\n                    results, nf, ta, tc = self._eval_seq_sde(\n                        dataloader,\n                        save_dir=save_dir,\n                        show_image=show_image,\n                        frame_rate=frame_rate,\n                        seq_name=seq,\n                        scaled=scaled,\n                        det_file=os.path.join(det_results_dir,\n                                              '{}.txt'.format(seq)))\n                elif model_type == 'CenterTrack':\n                    results, nf, ta, tc = self._eval_seq_centertrack(\n                        dataloader,\n                        save_dir=save_dir,\n                        show_image=show_image,\n                        frame_rate=frame_rate)\n                else:\n                    raise ValueError(model_type)\n\n            write_mot_results(result_filename, results, data_type,\n                              self.cfg.num_classes)\n            n_frame += nf\n            timer_avgs.append(ta)\n            timer_calls.append(tc)\n\n            if save_videos:\n                output_video_path = os.path.join(save_dir, '..',\n                                                 '{}_vis.mp4'.format(seq))\n                cmd_str = 'ffmpeg -f image2 -i {}/%05d.jpg {}'.format(\n                    save_dir, output_video_path)\n                os.system(cmd_str)\n                logger.info('Save video in {}.'.format(output_video_path))\n\n            # update metrics\n            for metric in self._metrics:\n                metric.update(data_root, seq, data_type, result_root,\n                              result_filename)
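\n\n        # timer_avgs holds each sequence's average per-frame time and\n        # timer_calls the corresponding frame counts, so their dot product\n        # is the total wall time; overall FPS is total frames / total time.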
\n        timer_avgs = np.asarray(timer_avgs)\n        timer_calls = np.asarray(timer_calls)\n        all_time = np.dot(timer_avgs, timer_calls)\n        avg_time = all_time / np.sum(timer_calls)\n        logger.info('Time elapsed: {:.2f} seconds, FPS: {:.2f}'.format(\n            all_time, 1.0 / avg_time))\n\n        # accumulate metric to log out\n        for metric in self._metrics:\n            metric.accumulate()\n            metric.log()\n        # reset metric states since metrics may be evaluated multiple times\n        self._reset_metrics()\n\n    def get_infer_images(self, infer_dir):\n        assert infer_dir is not None and os.path.isdir(infer_dir), \\\n            \"infer_dir {} is not a directory\".format(infer_dir)\n        images = set()\n        exts = ['jpg', 'jpeg', 'png', 'bmp']\n        exts += [ext.upper() for ext in exts]\n        for ext in exts:\n            images.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))\n        images = list(images)\n        images.sort()\n        assert len(images) > 0, \"no image found in {}\".format(infer_dir)\n        logger.info(\"Found {} inference images in total.\".format(len(images)))\n        return images\n\n    def mot_predict_seq(self,\n                        video_file,\n                        frame_rate,\n                        image_dir,\n                        output_dir,\n                        data_type='mot',\n                        model_type='JDE',\n                        save_images=False,\n                        save_videos=True,\n                        show_image=False,\n                        scaled=False,\n                        det_results_dir='',\n                        draw_threshold=0.5):\n        assert video_file is not None or image_dir is not None, \\\n            \"--video_file or --image_dir should be set.\"\n        assert video_file is None or os.path.isfile(video_file), \\\n                \"{} is not a file\".format(video_file)\n        assert image_dir is None or os.path.isdir(image_dir), \\\n                \"{} is not a directory\".format(image_dir)\n\n        if not os.path.exists(output_dir): os.makedirs(output_dir)\n        result_root = os.path.join(output_dir, 'mot_results')\n        if not os.path.exists(result_root): os.makedirs(result_root)\n        assert data_type in MOT_DATA_TYPE, \\\n            \"data_type should be 'mot', 'mcmot' or 'kitti'\"\n        assert model_type in MOT_ARCH, \\\n            \"model_type should be 'JDE', 'DeepSORT', 'FairMOT', 'ByteTrack' or 'CenterTrack'\"\n\n        # run tracking\n        if video_file:\n            seq = video_file.split('/')[-1].split('.')[0]\n            self.dataset.set_video(video_file, frame_rate)\n            logger.info('Starting tracking video {}'.format(video_file))\n        elif image_dir:\n            seq = image_dir.split('/')[-1].split('.')[0]\n            if os.path.exists(os.path.join(image_dir, 'img1')):\n                image_dir = os.path.join(image_dir, 'img1')\n            images = [\n                '{}/{}'.format(image_dir, x) for x in os.listdir(image_dir)\n            ]\n            images.sort()\n            self.dataset.set_images(images)\n            logger.info('Starting tracking folder {}, found {} images'.format(\n                image_dir, len(images)))\n        else:\n            raise ValueError('--video_file or --image_dir should be set.')
\n\n        save_dir = os.path.join(output_dir, 'mot_outputs',\n                                seq) if save_images or save_videos else None\n\n        dataloader = create('TestMOTReader')(self.dataset, 0)\n        result_filename = os.path.join(result_root, '{}.txt'.format(seq))\n        if frame_rate == -1:\n            frame_rate = self.dataset.frame_rate\n\n        with paddle.no_grad():\n            if model_type in MOT_ARCH_JDE:\n                results, nf, ta, tc = self._eval_seq_jde(\n                    dataloader,\n                    save_dir=save_dir,\n                    show_image=show_image,\n                    frame_rate=frame_rate,\n                    draw_threshold=draw_threshold)\n            elif model_type in MOT_ARCH_SDE:\n                results, nf, ta, tc = self._eval_seq_sde(\n                    dataloader,\n                    save_dir=save_dir,\n                    show_image=show_image,\n                    frame_rate=frame_rate,\n                    seq_name=seq,\n                    scaled=scaled,\n                    det_file=os.path.join(det_results_dir,\n                                          '{}.txt'.format(seq)),\n                    draw_threshold=draw_threshold)\n            elif model_type == 'CenterTrack':\n                results, nf, ta, tc = self._eval_seq_centertrack(\n                    dataloader,\n                    save_dir=save_dir,\n                    show_image=show_image,\n                    frame_rate=frame_rate)\n            else:\n                raise ValueError(model_type)\n\n        if save_videos:\n            output_video_path = os.path.join(save_dir, '..',\n                                             '{}_vis.mp4'.format(seq))\n            cmd_str = 'ffmpeg -f image2 -i {}/%05d.jpg {}'.format(\n                save_dir, output_video_path)\n            os.system(cmd_str)\n            logger.info('Save video in {}'.format(output_video_path))\n\n        write_mot_results(result_filename, results, data_type,\n                          self.cfg.num_classes)\n\n\ndef get_trick_hyperparams(video_name, ori_buffer, ori_thresh):\n    if video_name[:3] != 'MOT':\n        # only used for MOTChallenge (MOT17, MOT20) Test-set\n        return ori_buffer, ori_thresh\n\n    video_name = video_name[:8]\n    if 'MOT17-05' in video_name:\n        track_buffer = 14\n    elif 'MOT17-13' in video_name:\n        track_buffer = 25\n    else:\n        track_buffer = ori_buffer\n\n    if 'MOT17-01' in video_name:\n        track_thresh = 0.65\n    elif 'MOT17-06' in video_name:\n        track_thresh = 0.65\n    elif 'MOT17-12' in video_name:\n        track_thresh = 0.7\n    elif 'MOT17-14' in video_name:\n        track_thresh = 0.67\n    else:\n        track_thresh = ori_thresh\n\n    # MOT20 sequences override the MOT17-specific thresholds above\n    if 'MOT20-06' in video_name or 'MOT20-08' in video_name:\n        track_thresh = 0.3\n\n    return track_buffer, track_thresh\n
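\n\n# Illustrative usage of get_trick_hyperparams (hypothetical sequence name,\n# following the MOTChallenge naming scheme):\n#   track_buffer, track_thresh = get_trick_hyperparams('MOT17-05-SDP', 30, 0.6)\n#   # -> track_buffer == 14, track_thresh == 0.6\n"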
  },
  {
    "path": "ppdet/engine/trainer.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\nimport copy\nimport time\nimport yaml\nfrom tqdm import tqdm\n\nimport numpy as np\nimport typing\nfrom PIL import Image, ImageOps, ImageFile\n\nImageFile.LOAD_TRUNCATED_IMAGES = True\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.distributed as dist\nfrom paddle.distributed import fleet\nfrom paddle.static import InputSpec\nfrom ppdet.optimizer import ModelEMA\n\nfrom ppdet.core.workspace import create\nfrom ppdet.utils.checkpoint import load_weight, load_pretrain_weight, convert_to_dict\nfrom ppdet.utils.visualizer import visualize_results, save_result\nfrom ppdet.metrics import get_infer_results, KeyPointTopDownCOCOEval, KeyPointTopDownCOCOWholeBadyHandEval, KeyPointTopDownMPIIEval, Pose3DEval\nfrom ppdet.metrics import Metric, COCOMetric, LVISMetric, VOCMetric, WiderFaceMetric, RBoxMetric, JDEDetMetric, SNIPERCOCOMetric, CULaneMetric\nfrom ppdet.data.source.sniper_coco import SniperCOCODataSet\nfrom ppdet.data.source.category import get_categories\nimport ppdet.utils.stats as stats\nfrom ppdet.utils.fuse_utils import fuse_conv_bn\nfrom ppdet.utils import profiler\nfrom ppdet.modeling.post_process import multiclass_nms\nfrom ppdet.modeling.lane_utils import imshow_lanes\n\nfrom .callbacks import Callback, ComposeCallback, LogPrinter, Checkpointer, WiferFaceEval, VisualDLWriter, SniperProposalsGenerator, WandbCallback, SemiCheckpointer, SemiLogPrinter\nfrom .export_utils import _dump_infer_config, _prune_input_spec, apply_to_static\nfrom .naive_sync_bn import convert_syncbn\n\nfrom paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger('ppdet.engine')\n\n__all__ = ['Trainer']\n\nMOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']\n\n\nclass Trainer(object):\n    def __init__(self, cfg, mode='train'):\n        self.cfg = cfg.copy()\n        assert mode.lower() in ['train', 'eval', 'test'], \\\n                \"mode should be 'train', 'eval' or 'test'\"\n        self.mode = mode.lower()\n        self.optimizer = None\n        self.is_loaded_weights = False\n        self.use_amp = self.cfg.get('amp', False)\n        self.amp_level = self.cfg.get('amp_level', 'O1')\n        self.custom_white_list = self.cfg.get('custom_white_list', None)\n        self.custom_black_list = self.cfg.get('custom_black_list', None)\n        self.use_master_grad = self.cfg.get('master_grad', False)\n        self.uniform_output_enabled = self.cfg.get('uniform_output_enabled', False)\n        if ('slim' in cfg and cfg['slim_type'] == 'PTQ') or self.uniform_output_enabled:\n            self.cfg['TestDataset'] = create('TestDataset')()\n        log_ranks = cfg.get('log_ranks', '0')\n        if isinstance(log_ranks, str):\n    
        self.log_ranks = [int(i) for i in log_ranks.split(',')]\n        elif isinstance(log_ranks, int):\n            self.log_ranks = [log_ranks]\n        train_results_path = os.path.abspath(os.path.join(self.cfg.save_dir, \"train_result.json\"))\n        if self.uniform_output_enabled:\n            if os.path.exists(train_results_path) and self.mode == 'train':\n                try:\n                    os.remove(train_results_path)\n                except OSError:\n                    pass\n            if not os.path.exists(self.cfg.save_dir):\n                os.mkdir(self.cfg.save_dir)\n            with open(os.path.join(self.cfg.save_dir, \"config.yaml\"), \"w\") as f:\n                config_dict = convert_to_dict(self.cfg)\n                config_dict = {k: v for k, v in config_dict.items() if v != {}}\n                yaml.dump(config_dict, f)\n\n        # build data loader\n        capital_mode = self.mode.capitalize()\n        if cfg.architecture in MOT_ARCH and self.mode in [\n                'eval', 'test'\n        ] and cfg.metric not in ['COCO', 'VOC']:\n            self.dataset = self.cfg['{}MOTDataset'.format(\n                capital_mode)] = create('{}MOTDataset'.format(capital_mode))()\n        else:\n            self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create(\n                '{}Dataset'.format(capital_mode))()\n\n        if cfg.architecture == 'DeepSORT' and self.mode == 'train':\n            logger.error('DeepSORT does not need training on a MOT dataset.')\n            sys.exit(1)\n\n        if cfg.architecture == 'FairMOT' and self.mode == 'eval':\n            images = self.parse_mot_images(cfg)\n            self.dataset.set_images(images)\n\n        if self.mode == 'train':\n            self.loader = create('{}Reader'.format(capital_mode))(\n                self.dataset, cfg.worker_num)\n\n        if cfg.architecture == 'JDE' and self.mode == 'train':\n            self.cfg['JDEEmbeddingHead'][\n                'num_identities'] = self.dataset.num_identities_dict[0]\n            # JDE only supports single-class MOT for now.\n\n        if cfg.architecture == 'FairMOT' and self.mode == 'train':\n            self.cfg['FairMOTEmbeddingHead'][\n                'num_identities_dict'] = self.dataset.num_identities_dict\n            # FairMOT supports both single-class and multi-class MOT.\n\n        # build model\n        if 'model' not in self.cfg:\n            self.model = create(cfg.architecture)\n        else:\n            self.model = self.cfg.model\n            self.is_loaded_weights = True\n\n        if cfg.architecture == 'YOLOX':\n            for k, m in self.model.named_sublayers():\n                if isinstance(m, nn.BatchNorm2D):\n                    m._epsilon = 1e-3  # for amp(fp16)\n                    m._momentum = 0.97  # 0.03 in pytorch\n\n        # reset norm param attr for setting them in optimizer\n        if 'reset_norm_param_attr' in cfg and cfg['reset_norm_param_attr']:\n            self.model = self.reset_norm_param_attr(\n                self.model, weight_attr=None, bias_attr=None)\n\n        # normalize params for deploy\n        if 'slim' in cfg and cfg['slim_type'] == 'OFA':\n            self.model.model.load_meanstd(cfg['TestReader'][\n                'sample_transforms'])\n        elif 'slim' in cfg and cfg['slim_type'] == 'Distill':\n            self.model.student_model.load_meanstd(cfg['TestReader'][\n                'sample_transforms'])\n        elif 'slim' in cfg and cfg[\n                'slim_type'] == 'DistillPrune' and 
self.mode == 'train':\n            self.model.student_model.load_meanstd(cfg['TestReader'][\n                'sample_transforms'])\n        else:\n            self.model.load_meanstd(cfg['TestReader']['sample_transforms'])\n\n        # EvalDataset build with BatchSampler to evaluate in single device\n        # TODO: multi-device evaluate\n        if self.mode == 'eval':\n            if cfg.architecture == 'FairMOT':\n                self.loader = create('EvalMOTReader')(self.dataset, 0)\n            elif cfg.architecture == \"METRO_Body\":\n                reader_name = '{}Reader'.format(self.mode.capitalize())\n                self.loader = create(reader_name)(self.dataset, cfg.worker_num)\n            else:\n                self._eval_batch_sampler = paddle.io.BatchSampler(\n                    self.dataset, batch_size=self.cfg.EvalReader['batch_size'])\n                reader_name = '{}Reader'.format(self.mode.capitalize())\n                # If metric is VOC, need to be set collate_batch=False.\n                if cfg.metric == 'VOC':\n                    self.cfg[reader_name]['collate_batch'] = False\n                self.loader = create(reader_name)(self.dataset, cfg.worker_num,\n                                                  self._eval_batch_sampler)\n        # TestDataset build after user set images, skip loader creation here\n\n        # get Params\n        print_params = self.cfg.get('print_params', False)\n        if print_params:\n            params = sum([\n                p.numel() for n, p in self.model.named_parameters()\n                if all([x not in n for x in ['_mean', '_variance', 'aux_']])\n            ])  # exclude BatchNorm running status\n            logger.info('Model Params : {} M.'.format((params / 1e6).numpy()[\n                0]))\n\n        # build optimizer in train mode\n        if self.mode == 'train':\n            steps_per_epoch = len(self.loader)\n            if steps_per_epoch < 1:\n                logger.warning(\n                    \"Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader.\"\n                )\n            self.lr = create('LearningRate')(steps_per_epoch)\n            self.optimizer = create('OptimizerBuilder')(self.lr, self.model)\n\n            # Unstructured pruner is only enabled in the train mode.\n            if self.cfg.get('unstructured_prune'):\n                self.pruner = create('UnstructuredPruner')(self.model,\n                                                           steps_per_epoch)\n        if self.use_amp and self.amp_level == 'O2':\n            paddle_version = paddle.__version__[:3]\n            # paddle version >= 2.5.0 or develop\n            if paddle_version in [\"2.5\", \"0.0\"]:\n                self.model, self.optimizer = paddle.amp.decorate(\n                    models=self.model,\n                    optimizers=self.optimizer,\n                    level=self.amp_level,\n                    master_grad=self.use_master_grad)\n            else:\n                self.model, self.optimizer = paddle.amp.decorate(\n                    models=self.model,\n                    optimizers=self.optimizer,\n                    level=self.amp_level)\n\n        # support sync_bn for npu/xpu\n        if (paddle.get_device()[:3]=='npu' or paddle.get_device()[:3]=='xpu'):\n            use_npu = ('use_npu' in cfg and cfg['use_npu'])\n            use_xpu = ('use_xpu' in cfg and cfg['use_xpu'])\n            use_mlu = ('use_mlu' in cfg and cfg['use_mlu'])\n            norm_type = 
('norm_type' in cfg and cfg['norm_type'])\n            if norm_type == 'sync_bn' and (use_npu or use_xpu or use_mlu) and dist.get_world_size() > 1:\n                convert_syncbn(self.model)\n\n        self.use_ema = ('use_ema' in cfg and cfg['use_ema'])\n        if self.use_ema:\n            ema_decay = self.cfg.get('ema_decay', 0.9998)\n            ema_decay_type = self.cfg.get('ema_decay_type', 'threshold')\n            cycle_epoch = self.cfg.get('cycle_epoch', -1)\n            ema_black_list = self.cfg.get('ema_black_list', None)\n            ema_filter_no_grad = self.cfg.get('ema_filter_no_grad', False)\n            self.ema = ModelEMA(\n                self.model,\n                decay=ema_decay,\n                ema_decay_type=ema_decay_type,\n                cycle_epoch=cycle_epoch,\n                ema_black_list=ema_black_list,\n                ema_filter_no_grad=ema_filter_no_grad)\n\n        self._nranks = dist.get_world_size()\n        self._local_rank = dist.get_rank()\n\n        self.status = {}\n\n        self.start_epoch = 0\n        self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch\n\n        # initial default callbacks\n        self._init_callbacks()\n\n        # initial default metrics\n        self._init_metrics()\n        self._reset_metrics()\n\n    def _init_callbacks(self):\n        if self.mode == 'train':\n            if self.cfg.get('ssod_method',\n                            False) and self.cfg['ssod_method'] == 'Semi_RTDETR':\n                self._callbacks = [SemiLogPrinter(self), SemiCheckpointer(self)]\n            else:\n                self._callbacks = [LogPrinter(self), Checkpointer(self)]\n            if self.cfg.get('use_vdl', False):\n                self._callbacks.append(VisualDLWriter(self))\n            if self.cfg.get('save_proposals', False):\n                self._callbacks.append(SniperProposalsGenerator(self))\n            if self.cfg.get('use_wandb', False) or 'wandb' in self.cfg:\n                self._callbacks.append(WandbCallback(self))\n            self._compose_callback = ComposeCallback(self._callbacks)\n        elif self.mode == 'eval':\n            self._callbacks = [LogPrinter(self)]\n            # if self.cfg.metric == 'WiderFace':\n            #     self._callbacks.append(WiferFaceEval(self))\n            self._compose_callback = ComposeCallback(self._callbacks)\n        elif self.mode == 'test' and self.cfg.get('use_vdl', False):\n            self._callbacks = [VisualDLWriter(self)]\n            self._compose_callback = ComposeCallback(self._callbacks)\n        else:\n            self._callbacks = []\n            self._compose_callback = None\n\n    def _init_metrics(self, validate=False):\n        if self.mode == 'test' or (self.mode == 'train' and not validate):\n            self._metrics = []\n            return\n        classwise = self.cfg['classwise'] if 'classwise' in self.cfg else False\n        if self.cfg.metric == 'COCO' or self.cfg.metric == \"SNIPERCOCO\" or self.cfg.metric == 'LVIS':\n            # TODO: bias should be unified\n            bias = 1 if self.cfg.get('bias', False) else 0\n            output_eval = self.cfg['output_eval'] \\\n                if 'output_eval' in self.cfg else None\n            save_prediction_only = self.cfg.get('save_prediction_only', False)\n\n            # pass clsid2catid info to metric instance to avoid multiple loading\n            # annotation file\n            clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()} \\\n                                if 
self.mode == 'eval' else None\n\n            save_threshold = self.cfg.get('save_threshold', 0)\n\n            # when doing validation during training, the annotation file should\n            # be taken from EvalDataset instead of self.dataset (the train set)\n            if self.mode == 'train' and validate:\n                eval_dataset = self.cfg['EvalDataset']\n                eval_dataset.check_or_download_dataset()\n                anno_file = eval_dataset.get_anno()\n                dataset = eval_dataset\n            else:\n                dataset = self.dataset\n                anno_file = dataset.get_anno()\n\n            IouType = self.cfg['IouType'] if 'IouType' in self.cfg else 'bbox'\n            if self.cfg.metric == \"COCO\":\n                self._metrics = [\n                    COCOMetric(\n                        anno_file=anno_file,\n                        clsid2catid=clsid2catid,\n                        classwise=classwise,\n                        output_eval=output_eval,\n                        bias=bias,\n                        IouType=IouType,\n                        save_prediction_only=save_prediction_only,\n                        save_threshold=save_threshold)\n                ]\n            elif self.cfg.metric == \"LVIS\":\n                self._metrics = [\n                    LVISMetric(\n                        anno_file=anno_file,\n                        clsid2catid=clsid2catid,\n                        classwise=classwise,\n                        output_eval=output_eval,\n                        bias=bias,\n                        IouType=IouType,\n                        save_prediction_only=save_prediction_only)\n                ]\n            elif self.cfg.metric == \"SNIPERCOCO\":  # sniper\n                self._metrics = [\n                    SNIPERCOCOMetric(\n                        anno_file=anno_file,\n                        dataset=dataset,\n                        clsid2catid=clsid2catid,\n                        classwise=classwise,\n                        output_eval=output_eval,\n                        bias=bias,\n                        IouType=IouType,\n                        save_prediction_only=save_prediction_only)\n                ]\n        elif self.cfg.metric == 'RBOX':\n            # TODO: bias should be unified\n            bias = self.cfg['bias'] if 'bias' in self.cfg else 0\n            output_eval = self.cfg['output_eval'] \\\n                if 'output_eval' in self.cfg else None\n            save_prediction_only = self.cfg.get('save_prediction_only', False)\n            imid2path = self.cfg.get('imid2path', None)\n\n            # when doing validation during training, the annotation file should\n            # be taken from EvalDataset instead of self.dataset (the train set)\n            anno_file = self.dataset.get_anno()\n            if self.mode == 'train' and validate:\n                eval_dataset = self.cfg['EvalDataset']\n                eval_dataset.check_or_download_dataset()\n                anno_file = eval_dataset.get_anno()\n\n            self._metrics = [\n                RBoxMetric(\n                    anno_file=anno_file,\n                    classwise=classwise,\n                    output_eval=output_eval,\n                    bias=bias,\n                    save_prediction_only=save_prediction_only,\n                    imid2path=imid2path)\n            ]
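\n        # VOC mAP below is computed from the dataset's label_list; map_type\n        # is typically '11point' or 'integral' in this codebase.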
\n        elif self.cfg.metric == 'VOC':\n            output_eval = self.cfg['output_eval'] \\\n                if 'output_eval' in self.cfg else None\n            save_prediction_only = self.cfg.get('save_prediction_only', False)\n\n            self._metrics = [\n                VOCMetric(\n                    label_list=self.dataset.get_label_list(),\n                    class_num=self.cfg.num_classes,\n                    map_type=self.cfg.map_type,\n                    classwise=classwise,\n                    output_eval=output_eval,\n                    save_prediction_only=save_prediction_only)\n            ]\n        elif self.cfg.metric == 'WiderFace':\n            self._metrics = [\n                WiderFaceMetric()\n            ]\n        elif self.cfg.metric == 'KeyPointTopDownCOCOEval':\n            eval_dataset = self.cfg['EvalDataset']\n            eval_dataset.check_or_download_dataset()\n            anno_file = eval_dataset.get_anno()\n            save_prediction_only = self.cfg.get('save_prediction_only', False)\n            self._metrics = [\n                KeyPointTopDownCOCOEval(\n                    anno_file,\n                    len(eval_dataset),\n                    self.cfg.num_joints,\n                    self.cfg.save_dir,\n                    save_prediction_only=save_prediction_only)\n            ]\n        elif self.cfg.metric == 'KeyPointTopDownCOCOWholeBadyHandEval':\n            eval_dataset = self.cfg['EvalDataset']\n            eval_dataset.check_or_download_dataset()\n            anno_file = eval_dataset.get_anno()\n            save_prediction_only = self.cfg.get('save_prediction_only', False)\n            self._metrics = [\n                KeyPointTopDownCOCOWholeBadyHandEval(\n                    anno_file,\n                    len(eval_dataset),\n                    self.cfg.num_joints,\n                    self.cfg.save_dir,\n                    save_prediction_only=save_prediction_only)\n            ]\n        elif self.cfg.metric == 'KeyPointTopDownMPIIEval':\n            eval_dataset = self.cfg['EvalDataset']\n            eval_dataset.check_or_download_dataset()\n            anno_file = eval_dataset.get_anno()\n            save_prediction_only = self.cfg.get('save_prediction_only', False)\n            self._metrics = [\n                KeyPointTopDownMPIIEval(\n                    anno_file,\n                    len(eval_dataset),\n                    self.cfg.num_joints,\n                    self.cfg.save_dir,\n                    save_prediction_only=save_prediction_only)\n            ]\n        elif self.cfg.metric == 'Pose3DEval':\n            save_prediction_only = self.cfg.get('save_prediction_only', False)\n            self._metrics = [\n                Pose3DEval(\n                    self.cfg.save_dir,\n                    save_prediction_only=save_prediction_only)\n            ]\n        elif self.cfg.metric == 'MOTDet':\n            self._metrics = [JDEDetMetric(), ]\n        elif self.cfg.metric == 'CULaneMetric':\n            output_eval = self.cfg.get('output_eval', None)\n            self._metrics = [\n                CULaneMetric(\n                    cfg=self.cfg,\n                    output_eval=output_eval,\n                    split=self.dataset.split,\n                    dataset_dir=self.cfg.dataset_dir)\n            ]\n        else:\n            logger.warning(\"Metric not supported for metric type {}\".format(\n                self.cfg.metric))\n            self._metrics = []\n\n    def _reset_metrics(self):\n        for metric in self._metrics:\n            metric.reset()
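\n\n    # Usage sketch (hypothetical user code): extra callbacks and metrics can\n    # be attached via the two registration hooks below, e.g.\n    #   trainer = Trainer(cfg, mode='eval')\n    #   trainer.register_metrics([MyMetric()])            # Metric subclass\n    #   trainer.register_callbacks([MyCallback(trainer)]) # Callback subclass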
\n\n    def register_callbacks(self, callbacks):\n        callbacks = [c for c in list(callbacks) if c is not None]\n        for c in callbacks:\n            assert isinstance(c, Callback), \\\n                    \"callbacks should be instances of a subclass of Callback\"\n        self._callbacks.extend(callbacks)\n        self._compose_callback = ComposeCallback(self._callbacks)\n\n    def register_metrics(self, metrics):\n        metrics = [m for m in list(metrics) if m is not None]\n        for m in metrics:\n            assert isinstance(m, Metric), \\\n                    \"metrics should be instances of a subclass of Metric\"\n        self._metrics.extend(metrics)\n\n    def load_weights(self, weights, ARSL_eval=False):\n        if self.is_loaded_weights:\n            return\n        self.start_epoch = 0\n        load_pretrain_weight(self.model, weights, ARSL_eval)\n        logger.debug(\"Load weights {} to start training\".format(weights))\n\n    def load_weights_sde(self, det_weights, reid_weights):\n        if self.model.detector:\n            load_weight(self.model.detector, det_weights)\n            if self.model.reid:\n                load_weight(self.model.reid, reid_weights)\n        else:\n            load_weight(self.model.reid, reid_weights)\n\n    def resume_weights(self, weights):\n        # support Distill resume weights\n        if hasattr(self.model, 'student_model'):\n            self.start_epoch = load_weight(self.model.student_model, weights,\n                                           self.optimizer)\n        else:\n            self.start_epoch = load_weight(self.model, weights, self.optimizer,\n                                           self.ema if self.use_ema else None)\n        logger.debug(\"Resume weights of epoch {}\".format(self.start_epoch))\n\n    def train(self, validate=False):\n        assert self.mode == 'train', \"Model not in 'train' mode\"\n        Init_mark = False\n        if validate:\n            self.cfg['EvalDataset'] = self.cfg.EvalDataset = create(\n                \"EvalDataset\")()\n\n        model = self.model\n        if self.cfg.get('to_static', False):\n            model = apply_to_static(self.cfg, model)\n        sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and\n                   self.cfg.use_gpu and self._nranks > 1)\n        if sync_bn:\n            model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model)\n\n        # enable auto mixed precision mode\n        if self.use_amp:\n            scaler = paddle.amp.GradScaler(\n                enable=self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu,\n                init_loss_scaling=self.cfg.get('init_loss_scaling', 1024))\n        # get distributed model\n        if self.cfg.get('fleet', False):\n            model = fleet.distributed_model(model)\n            self.optimizer = fleet.distributed_optimizer(self.optimizer)\n        elif self._nranks > 1:\n            find_unused_parameters = self.cfg[\n                'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False\n            model = paddle.DataParallel(\n                model, find_unused_parameters=find_unused_parameters)\n\n        self.status.update({\n            'epoch_id': self.start_epoch,\n            'step_id': 0,\n            'steps_per_epoch': len(self.loader)\n        })\n\n        self.status['batch_time'] = stats.SmoothedValue(\n            self.cfg.log_iter, fmt='{avg:.4f}')\n        self.status['data_time'] = stats.SmoothedValue(\n            self.cfg.log_iter, fmt='{avg:.4f}')\n        self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)\n\n        if 
self.cfg.get('print_flops', False):\n            flops_loader = create('{}Reader'.format(self.mode.capitalize()))(\n                self.dataset, self.cfg.worker_num)\n            self._flops(flops_loader)\n        profiler_options = self.cfg.get('profiler_options', None)\n\n        self._compose_callback.on_train_begin(self.status)\n\n        use_fused_allreduce_gradients = self.cfg[\n            'use_fused_allreduce_gradients'] if 'use_fused_allreduce_gradients' in self.cfg else False\n\n        for epoch_id in range(self.start_epoch, self.cfg.epoch):\n            self.status['mode'] = 'train'\n            self.status['epoch_id'] = epoch_id\n            self._compose_callback.on_epoch_begin(self.status)\n            self.loader.dataset.set_epoch(epoch_id)\n            model.train()\n            iter_tic = time.time()\n            for step_id, data in enumerate(self.loader):\n                def deep_pin(blob, blocking):\n                    if isinstance(blob, paddle.Tensor):\n                        return blob.cuda(blocking=blocking)\n                    elif isinstance(blob, dict):\n                        return {k: deep_pin(v, blocking) for k, v in blob.items()}\n                    elif isinstance(blob, (list, tuple)):\n                        return type(blob)([deep_pin(x, blocking) for x in blob])\n                    else:\n                        return blob\n                # if paddle.base.core.is_compiled_with_cuda():\n                #     data = deep_pin(data, False)\n\n                self.status['data_time'].update(time.time() - iter_tic)\n                self.status['step_id'] = step_id\n                profiler.add_profiler_step(profiler_options)\n                self._compose_callback.on_step_begin(self.status)\n                data['epoch_id'] = epoch_id\n                if self.cfg.get('to_static',\n                                False) and 'image_file' in data.keys():\n                    data.pop('image_file')\n\n                if self.use_amp:\n                    if isinstance(\n                            model, paddle.\n                            DataParallel) and use_fused_allreduce_gradients:\n                        with model.no_sync():\n                            with paddle.amp.auto_cast(\n                                    enable=self.cfg.use_gpu or\n                                    self.cfg.use_npu or self.cfg.use_mlu,\n                                    custom_white_list=self.custom_white_list,\n                                    custom_black_list=self.custom_black_list,\n                                    level=self.amp_level):\n                                # model forward\n                                outputs = model(data)\n                                loss = outputs['loss']\n                            # model backward\n                            scaled_loss = scaler.scale(loss)\n                            scaled_loss.backward()\n                        fused_allreduce_gradients(\n                            list(model.parameters()), None)\n                    else:\n                        with paddle.amp.auto_cast(\n                                enable=self.cfg.use_gpu or self.cfg.use_npu or\n                                self.cfg.use_mlu,\n                                custom_white_list=self.custom_white_list,\n                                custom_black_list=self.custom_black_list,\n                                level=self.amp_level):\n                            # model forward\n                            
outputs = model(data)\n                            loss = outputs['loss']\n                        # model backward\n                        scaled_loss = scaler.scale(loss)\n                        scaled_loss.backward()\n                    # in dygraph mode, optimizer.minimize is equal to optimizer.step\n                    scaler.minimize(self.optimizer, scaled_loss)\n                else:\n                    if isinstance(\n                            model, paddle.\n                            DataParallel) and use_fused_allreduce_gradients:\n                        with model.no_sync():\n                            # model forward\n                            outputs = model(data)\n                            loss = outputs['loss']\n                            # model backward\n                            loss.backward()\n                        fused_allreduce_gradients(\n                            list(model.parameters()), None)\n                    else:\n                        # model forward\n                        outputs = model(data)\n                        loss = outputs['loss']\n                        # model backward\n                        loss.backward()\n                    self.optimizer.step()\n                curr_lr = self.optimizer.get_lr()\n                self.lr.step()\n                if self.cfg.get('unstructured_prune'):\n                    self.pruner.step()\n                self.optimizer.clear_grad()\n                self.status['learning_rate'] = curr_lr\n\n                if self._nranks < 2 or self._local_rank in self.log_ranks:\n                    self.status['training_staus'].update(outputs)\n\n                self.status['batch_time'].update(time.time() - iter_tic)\n                self._compose_callback.on_step_end(self.status)\n                if self.use_ema:\n                    self.ema.update()\n                iter_tic = time.time()\n\n            if self.cfg.get('unstructured_prune'):\n                self.pruner.update_params()\n\n            is_snapshot = (self._nranks < 2 or (self._local_rank == 0 or self.cfg.metric == \"Pose3DEval\")) \\\n                       and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1)\n            if is_snapshot and self.use_ema:\n                # apply ema weight on model\n                weight = copy.deepcopy(self.model.state_dict())\n                self.model.set_dict(self.ema.apply())\n                self.status['weight'] = weight\n\n            self._compose_callback.on_epoch_end(self.status)\n\n            if validate and is_snapshot:\n                if not hasattr(self, '_eval_loader'):\n                    # build evaluation dataset and loader\n                    self._eval_dataset = self.cfg.EvalDataset\n                    self._eval_batch_sampler = \\\n                        paddle.io.BatchSampler(\n                            self._eval_dataset,\n                            batch_size=self.cfg.EvalReader['batch_size'])\n                    # If metric is VOC, need to be set collate_batch=False.\n                    if self.cfg.metric == 'VOC':\n                        self.cfg['EvalReader']['collate_batch'] = False\n                    if self.cfg.metric == \"Pose3DEval\":\n                        self._eval_loader = create('EvalReader')(\n                            self._eval_dataset, self.cfg.worker_num)\n                    else:\n                        self._eval_loader = create('EvalReader')(\n                            self._eval_dataset,\n 
                           self.cfg.worker_num,\n                            batch_sampler=self._eval_batch_sampler)\n                # if validation in training is enabled, metrics should be\n                # re-initialized; Init_mark makes sure this only executes once\n                if validate and not Init_mark:\n                    Init_mark = True\n                    self._init_metrics(validate=validate)\n                    self._reset_metrics()\n\n                with paddle.no_grad():\n                    self.status['save_best_model'] = True\n                    self._eval_with_loader(self._eval_loader)\n\n            if is_snapshot and self.use_ema:\n                # reset original weight\n                self.model.set_dict(weight)\n                self.status.pop('weight')\n\n        self._compose_callback.on_train_end(self.status)\n\n    def _eval_with_loader(self, loader):\n        sample_num = 0\n        tic = time.time()\n        self._compose_callback.on_epoch_begin(self.status)\n        self.status['mode'] = 'eval'\n\n        self.model.eval()\n        if self.cfg.get('print_flops', False):\n            flops_loader = create('{}Reader'.format(self.mode.capitalize()))(\n                self.dataset, self.cfg.worker_num, self._eval_batch_sampler)\n            self._flops(flops_loader)\n        for step_id, data in enumerate(loader):\n            self.status['step_id'] = step_id\n            self._compose_callback.on_step_begin(self.status)\n            # forward\n            if self.use_amp:\n                with paddle.amp.auto_cast(\n                        enable=self.cfg.use_gpu or self.cfg.use_npu or\n                        self.cfg.use_mlu,\n                        custom_white_list=self.custom_white_list,\n                        custom_black_list=self.custom_black_list,\n                        level=self.amp_level):\n                    outs = self.model(data)\n            else:\n                outs = self.model(data)\n\n            # update metrics\n            for metric in self._metrics:\n                metric.update(data, outs)\n\n            # multi-scale inputs: all inputs have same im_id\n            if isinstance(data, typing.Sequence):\n                sample_num += data[0]['im_id'].numpy().shape[0]\n            else:\n                sample_num += data['im_id'].numpy().shape[0]\n            self._compose_callback.on_step_end(self.status)\n\n        self.status['sample_num'] = sample_num\n        self.status['cost_time'] = time.time() - tic\n\n        # accumulate metric to log out\n        for metric in self._metrics:\n            metric.accumulate()\n            metric.log()\n        self._compose_callback.on_epoch_end(self.status)\n        # reset metric states since metrics may be evaluated multiple times\n        self._reset_metrics()\n\n    def evaluate(self):\n        # get distributed model\n        if self.cfg.get('fleet', False):\n            self.model = fleet.distributed_model(self.model)\n            self.optimizer = fleet.distributed_optimizer(self.optimizer)\n        elif self._nranks > 1:\n            find_unused_parameters = self.cfg[\n                'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False\n            self.model = paddle.DataParallel(\n                self.model, find_unused_parameters=find_unused_parameters)\n        with paddle.no_grad():\n            self._eval_with_loader(self.loader)
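\n\n    # The _slice variants below run detection on overlapping image slices and\n    # fuse the per-slice boxes back into full-image results: each box is\n    # shifted by its slice offset ('st_pix'), then the shifted boxes are\n    # combined with NMS or plain concatenation (combine_method).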
\n\n    def _eval_with_loader_slice(self,\n                                loader,\n                                slice_size=[640, 640],\n                                overlap_ratio=[0.25, 0.25],\n                                combine_method='nms',\n                                match_threshold=0.6,\n                                match_metric='iou'):\n        sample_num = 0\n        tic = time.time()\n        self._compose_callback.on_epoch_begin(self.status)\n        self.status['mode'] = 'eval'\n        self.model.eval()\n        if self.cfg.get('print_flops', False):\n            flops_loader = create('{}Reader'.format(self.mode.capitalize()))(\n                self.dataset, self.cfg.worker_num, self._eval_batch_sampler)\n            self._flops(flops_loader)\n\n        merged_bboxs = []\n        for step_id, data in enumerate(loader):\n            self.status['step_id'] = step_id\n            self._compose_callback.on_step_begin(self.status)\n            # forward\n            if self.use_amp:\n                with paddle.amp.auto_cast(\n                        enable=self.cfg.use_gpu or self.cfg.use_npu or\n                        self.cfg.use_mlu,\n                        custom_white_list=self.custom_white_list,\n                        custom_black_list=self.custom_black_list,\n                        level=self.amp_level):\n                    outs = self.model(data)\n            else:\n                outs = self.model(data)\n\n            shift_amount = data['st_pix']\n            outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount\n            outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount\n            merged_bboxs.append(outs['bbox'])\n\n            if data['is_last'] > 0:\n                # merge matching predictions\n                merged_results = {'bbox': []}\n                if combine_method == 'nms':\n                    final_boxes = multiclass_nms(\n                        np.concatenate(merged_bboxs), self.cfg.num_classes,\n                        match_threshold, match_metric)\n                    merged_results['bbox'] = np.concatenate(final_boxes)\n                elif combine_method == 'concat':\n                    merged_results['bbox'] = np.concatenate(merged_bboxs)\n                else:\n                    raise ValueError(\n                        \"Only 'nms' and 'concat' are supported for fusing detection results.\"\n                    )\n                merged_results['im_id'] = np.array([[0]])\n                merged_results['bbox_num'] = np.array(\n                    [len(merged_results['bbox'])])\n\n                merged_bboxs = []\n                data['im_id'] = data['ori_im_id']\n                # update metrics\n                for metric in self._metrics:\n                    metric.update(data, merged_results)\n\n                # multi-scale inputs: all inputs have same im_id\n                if isinstance(data, typing.Sequence):\n                    sample_num += data[0]['im_id'].numpy().shape[0]\n                else:\n                    sample_num += data['im_id'].numpy().shape[0]\n\n            self._compose_callback.on_step_end(self.status)\n\n        self.status['sample_num'] = sample_num\n        self.status['cost_time'] = time.time() - tic\n\n        # accumulate metric to log out\n        for metric in self._metrics:\n            metric.accumulate()\n            metric.log()\n        self._compose_callback.on_epoch_end(self.status)\n        # reset metric states since metrics may be evaluated multiple times\n        self._reset_metrics()\n\n    def evaluate_slice(self,\n                       slice_size=[640, 640],\n   
                    overlap_ratio=[0.25, 0.25],\n                       combine_method='nms',\n                       match_threshold=0.6,\n                       match_metric='iou'):\n        with paddle.no_grad():\n            self._eval_with_loader_slice(self.loader, slice_size, overlap_ratio,\n                                         combine_method, match_threshold,\n                                         match_metric)\n\n    def slice_predict(self,\n                      images,\n                      slice_size=[640, 640],\n                      overlap_ratio=[0.25, 0.25],\n                      combine_method='nms',\n                      match_threshold=0.6,\n                      match_metric='iou',\n                      draw_threshold=0.5,\n                      output_dir='output',\n                      save_results=False,\n                      visualize=True):\n        if not os.path.exists(output_dir):\n            os.makedirs(output_dir)\n\n        self.dataset.set_slice_images(images, slice_size, overlap_ratio)\n        loader = create('TestReader')(self.dataset, 0)\n        imid2path = self.dataset.get_imid2path()\n\n        def setup_metrics_for_loader():\n            # mem\n            metrics = copy.deepcopy(self._metrics)\n            mode = self.mode\n            save_prediction_only = self.cfg[\n                'save_prediction_only'] if 'save_prediction_only' in self.cfg else None\n            output_eval = self.cfg[\n                'output_eval'] if 'output_eval' in self.cfg else None\n\n            # modify\n            self.mode = '_test'\n            self.cfg['save_prediction_only'] = True\n            self.cfg['output_eval'] = output_dir\n            self.cfg['imid2path'] = imid2path\n            self._init_metrics()\n\n            # restore\n            self.mode = mode\n            self.cfg.pop('save_prediction_only')\n            if save_prediction_only is not None:\n                self.cfg['save_prediction_only'] = save_prediction_only\n\n            self.cfg.pop('output_eval')\n            if output_eval is not None:\n                self.cfg['output_eval'] = output_eval\n\n            self.cfg.pop('imid2path')\n\n            _metrics = copy.deepcopy(self._metrics)\n            self._metrics = metrics\n\n            return _metrics\n\n        if save_results:\n            metrics = setup_metrics_for_loader()\n        else:\n            metrics = []\n\n        anno_file = self.dataset.get_anno()\n        clsid2catid, catid2name = get_categories(\n            self.cfg.metric, anno_file=anno_file)\n\n        # Run Infer\n        self.status['mode'] = 'test'\n        self.model.eval()\n        if self.cfg.get('print_flops', False):\n            flops_loader = create('TestReader')(self.dataset, 0)\n            self._flops(flops_loader)\n\n        results = []  # all images\n        merged_bboxs = []  # single image\n        for step_id, data in enumerate(tqdm(loader)):\n            self.status['step_id'] = step_id\n            # forward\n            outs = self.model(data)\n\n            outs['bbox'] = outs['bbox'].numpy()  # only in test mode\n            shift_amount = data['st_pix']\n            outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount.numpy()\n            outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount.numpy()\n            merged_bboxs.append(outs['bbox'])\n\n            if data['is_last'] > 0:\n                # merge matching predictions\n                merged_results = {'bbox': []}\n                if combine_method == 
'nms':\n                    final_boxes = multiclass_nms(\n                        np.concatenate(merged_bboxs), self.cfg.num_classes,\n                        match_threshold, match_metric)\n                    merged_results['bbox'] = np.concatenate(final_boxes)\n                elif combine_method == 'concat':\n                    merged_results['bbox'] = np.concatenate(merged_bboxs)\n                else:\n                    raise ValueError(\n                        \"Only 'nms' and 'concat' are supported for fusing detection results.\"\n                    )\n                merged_results['im_id'] = np.array([[0]])\n                merged_results['bbox_num'] = np.array(\n                    [len(merged_results['bbox'])])\n\n                merged_bboxs = []\n                data['im_id'] = data['ori_im_id']\n\n                for _m in metrics:\n                    _m.update(data, merged_results)\n\n                for key in ['im_shape', 'scale_factor', 'im_id']:\n                    if isinstance(data, typing.Sequence):\n                        merged_results[key] = data[0][key]\n                    else:\n                        merged_results[key] = data[key]\n                for key, value in merged_results.items():\n                    if hasattr(value, 'numpy'):\n                        merged_results[key] = value.numpy()\n                results.append(merged_results)\n\n        for _m in metrics:\n            _m.accumulate()\n            _m.reset()\n\n        if visualize:\n            for outs in results:\n                batch_res = get_infer_results(outs, clsid2catid)\n                bbox_num = outs['bbox_num']\n\n                start = 0\n                for i, im_id in enumerate(outs['im_id']):\n                    image_path = imid2path[int(im_id)]\n                    image = Image.open(image_path).convert('RGB')\n                    image = ImageOps.exif_transpose(image)\n                    self.status['original_image'] = np.array(image.copy())\n\n                    end = start + bbox_num[i]\n                    bbox_res = batch_res['bbox'][start:end] \\\n                            if 'bbox' in batch_res else None\n                    mask_res = batch_res['mask'][start:end] \\\n                            if 'mask' in batch_res else None\n                    segm_res = batch_res['segm'][start:end] \\\n                            if 'segm' in batch_res else None\n                    keypoint_res = batch_res['keypoint'][start:end] \\\n                            if 'keypoint' in batch_res else None\n                    pose3d_res = batch_res['pose3d'][start:end] \\\n                            if 'pose3d' in batch_res else None\n                    image = visualize_results(\n                        image, bbox_res, mask_res, segm_res, keypoint_res,\n                        pose3d_res, int(im_id), catid2name, draw_threshold)\n                    self.status['result_image'] = np.array(image.copy())\n                    if self._compose_callback:\n                        self._compose_callback.on_step_end(self.status)\n                    # save image with detection\n                    save_name = self._get_save_image_name(output_dir,\n                                                          image_path)\n                    logger.info(\"Detection bbox results saved in {}\".format(\n                        save_name))\n                    image.save(save_name, quality=95)\n\n                    start = end\n\n    def predict(self,\n                images,\n             
   draw_threshold=0.5,\n                output_dir='output',\n                save_results=False,\n                visualize=True,\n                save_threshold=0,\n                do_eval=False):\n        if not os.path.exists(output_dir):\n            os.makedirs(output_dir)\n        if do_eval:\n            save_threshold = 0.0\n        self.dataset.set_images(images, do_eval=do_eval)\n        loader = create('TestReader')(self.dataset, 0)\n\n        imid2path = self.dataset.get_imid2path()\n\n        def setup_metrics_for_loader():\n            # mem\n            metrics = copy.deepcopy(self._metrics)\n            mode = self.mode\n            save_prediction_only = self.cfg[\n                'save_prediction_only'] if 'save_prediction_only' in self.cfg else None\n            output_eval = self.cfg[\n                'output_eval'] if 'output_eval' in self.cfg else None\n\n            # modify\n            self.mode = '_test'\n            self.cfg['save_prediction_only'] = True\n            self.cfg['output_eval'] = output_dir\n            self.cfg['imid2path'] = imid2path\n            self.cfg['save_threshold'] = save_threshold\n            self._init_metrics()\n\n            # restore\n            self.mode = mode\n            self.cfg.pop('save_prediction_only')\n            if save_prediction_only is not None:\n                self.cfg['save_prediction_only'] = save_prediction_only            \n\n            self.cfg.pop('output_eval')\n            if output_eval is not None:\n                self.cfg['output_eval'] = output_eval\n\n            self.cfg.pop('imid2path')\n\n            _metrics = copy.deepcopy(self._metrics)\n            self._metrics = metrics\n\n            return _metrics\n\n        if save_results:\n            metrics = setup_metrics_for_loader()\n        else:\n            metrics = []\n\n        anno_file = self.dataset.get_anno()\n        clsid2catid, catid2name = get_categories(\n            self.cfg.metric, anno_file=anno_file)\n\n        # Run Infer\n        self.status['mode'] = 'test'\n        self.model.eval()\n        if self.cfg.get('print_flops', False):\n            flops_loader = create('TestReader')(self.dataset, 0)\n            self._flops(flops_loader)\n        results = []\n        for step_id, data in enumerate(tqdm(loader)):\n            self.status['step_id'] = step_id\n            # forward\n            if hasattr(self.model, 'modelTeacher'):\n                outs = self.model.modelTeacher(data)\n            else:\n                outs = self.model(data)\n            for _m in metrics:\n                _m.update(data, outs)\n\n            for key in ['im_shape', 'scale_factor', 'im_id']:\n                if isinstance(data, typing.Sequence):\n                    outs[key] = data[0][key]\n                else:\n                    outs[key] = data[key]\n            for key, value in outs.items():\n                if hasattr(value, 'numpy'):\n                    outs[key] = value.numpy()\n            results.append(outs)\n\n        # sniper\n        if type(self.dataset) == SniperCOCODataSet:\n            results = self.dataset.anno_cropper.aggregate_chips_detections(\n                results)\n\n        for _m in metrics:\n            _m.accumulate()\n            _m.reset()\n\n        if visualize:\n            for outs in results:\n                batch_res = get_infer_results(outs, clsid2catid)\n                bbox_num = outs['bbox_num']\n\n                start = 0\n                for i, im_id in enumerate(outs['im_id']):\n            
         image_path = imid2path[int(im_id)]\n                    image = Image.open(image_path).convert('RGB')\n                    image = ImageOps.exif_transpose(image)\n                    self.status['original_image'] = np.array(image.copy())\n\n                    end = start + bbox_num[i]\n                    bbox_res = batch_res['bbox'][start:end] \\\n                            if 'bbox' in batch_res else None\n                    mask_res = batch_res['mask'][start:end] \\\n                            if 'mask' in batch_res else None\n                    segm_res = batch_res['segm'][start:end] \\\n                            if 'segm' in batch_res else None\n                    keypoint_res = batch_res['keypoint'][start:end] \\\n                            if 'keypoint' in batch_res else None\n                    pose3d_res = batch_res['pose3d'][start:end] \\\n                            if 'pose3d' in batch_res else None\n                    image = visualize_results(\n                        image, bbox_res, mask_res, segm_res, keypoint_res,\n                        pose3d_res, int(im_id), catid2name, draw_threshold)\n                    self.status['result_image'] = np.array(image.copy())\n                    if self._compose_callback:\n                        self._compose_callback.on_step_end(self.status)\n                    # save image with detection\n                    save_name = self._get_save_image_name(output_dir,\n                                                          image_path)\n                    logger.info(\"Detection bbox results saved in {}\".format(\n                        save_name))\n                    image.save(save_name, quality=95)\n\n                    start = end\n        return results\n\n    def _get_save_image_name(self, output_dir, image_path):\n        \"\"\"\n        Get save image name from source image path.\n        \"\"\"\n        image_name = os.path.split(image_path)[-1]\n        name, ext = os.path.splitext(image_name)\n        return os.path.join(output_dir, \"{}\".format(name)) + ext\n\n    def _get_infer_cfg_and_input_spec(self,\n                                      save_dir,\n                                      prune_input=True,\n                                      kl_quant=False,\n                                      yaml_name=None,\n                                      model=None):\n        if yaml_name is None:\n            yaml_name = 'infer_cfg.yml'\n        if model is None:\n            model = self.model\n        image_shape = None\n        im_shape = [None, 2]\n        scale_factor = [None, 2]\n        if self.cfg.architecture in MOT_ARCH:\n            test_reader_name = 'TestMOTReader'\n        else:\n            test_reader_name = 'TestReader'\n        if 'inputs_def' in self.cfg[test_reader_name]:\n            inputs_def = self.cfg[test_reader_name]['inputs_def']\n            image_shape = inputs_def.get('image_shape', None)\n        # set image_shape=[None, 3, -1, -1] as default\n        if image_shape is None:\n            image_shape = [None, 3, -1, -1]\n\n        if len(image_shape) == 3:\n            image_shape = [None] + image_shape\n        else:\n            im_shape = [image_shape[0], 2]\n            scale_factor = [image_shape[0], 2]\n\n        if hasattr(model, 'deploy'):\n            model.deploy = True\n        if 'slim' not in self.cfg:\n            for layer in model.sublayers():\n                if hasattr(layer, 'convert_to_deploy'):\n                    layer.convert_to_deploy()\n\n        if 
hasattr(self.cfg, 'export') and 'fuse_conv_bn' in self.cfg[\n                'export'] and self.cfg['export']['fuse_conv_bn']:\n            model = fuse_conv_bn(model)\n\n        export_post_process = self.cfg['export'].get(\n            'post_process', False) if hasattr(self.cfg, 'export') else True\n        export_nms = self.cfg['export'].get('nms', False) if hasattr(\n            self.cfg, 'export') else True\n        export_benchmark = self.cfg['export'].get(\n            'benchmark', False) if hasattr(self.cfg, 'export') else False\n        if hasattr(model, 'fuse_norm'):\n            model.fuse_norm = self.cfg['TestReader'].get('fuse_normalize',\n                                                              False)\n        if hasattr(model, 'export_post_process'):\n            model.export_post_process = export_post_process if not export_benchmark else False\n        if hasattr(model, 'export_nms'):\n            model.export_nms = export_nms if not export_benchmark else False\n        if export_post_process and not export_benchmark:\n            image_shape = [None] + image_shape[1:]\n\n        # Save infer cfg\n        _dump_infer_config(self.cfg,\n                           os.path.join(save_dir, yaml_name), image_shape,\n                           model)\n\n        input_spec = [{\n            \"image\": InputSpec(\n                shape=image_shape, name='image'),\n            \"im_shape\": InputSpec(\n                shape=im_shape, name='im_shape'),\n            \"scale_factor\": InputSpec(\n                shape=scale_factor, name='scale_factor')\n        }]\n        if self.cfg.architecture == 'DeepSORT':\n            input_spec[0].update({\n                \"crops\": InputSpec(\n                    shape=[None, 3, 192, 64], name='crops')\n            })\n\n        if self.cfg.architecture == 'CLRNet':\n            input_spec[0].update({\n                \"full_img_path\": str,\n                \"img_name\": str,\n            })\n        if prune_input:\n            static_model = paddle.jit.to_static(\n                model, input_spec=input_spec, full_graph=True)\n            # NOTE: dy2st do not pruned program, but jit.save will prune program\n            # input spec, prune input spec here and save with pruned input spec\n            pruned_input_spec = _prune_input_spec(\n                input_spec, static_model.forward.main_program,\n                static_model.forward.outputs)\n        else:\n            static_model = None\n            pruned_input_spec = input_spec\n\n        # TODO: Hard code, delete it when support prune input_spec.\n        if self.cfg.architecture == 'PicoDet' and not export_post_process:\n            pruned_input_spec = [{\n                \"image\": InputSpec(\n                    shape=image_shape, name='image')\n            }]\n        if kl_quant:\n            if self.cfg.architecture == 'PicoDet' or 'ppyoloe' in self.cfg.weights:\n                pruned_input_spec = [{\n                    \"image\": InputSpec(\n                        shape=image_shape, name='image'),\n                    \"scale_factor\": InputSpec(\n                        shape=scale_factor, name='scale_factor')\n                }]\n            elif 'tinypose' in self.cfg.weights:\n                pruned_input_spec = [{\n                    \"image\": InputSpec(\n                        shape=image_shape, name='image')\n                }]\n\n        return static_model, pruned_input_spec\n\n    def export(self, output_dir='output_inference', for_fd=False):\n        if 
hasattr(self.model, 'aux_neck'):\n            self.model.__delattr__('aux_neck')\n        if hasattr(self.model, 'aux_head'):\n            self.model.__delattr__('aux_head')\n        self.model.eval()\n        model = copy.deepcopy(self.model)\n\n        model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0]\n        if for_fd:\n            save_dir = output_dir\n            save_name = 'inference'\n            yaml_name = 'inference.yml'\n        else:\n            save_dir = os.path.join(output_dir, model_name)\n            save_name = 'model'\n            yaml_name = None\n\n        if not os.path.exists(save_dir):\n            os.makedirs(save_dir)\n\n        static_model, pruned_input_spec = self._get_infer_cfg_and_input_spec(\n            save_dir, yaml_name=yaml_name, model=model)\n\n        # dy2st and save model\n        if 'slim' not in self.cfg or 'QAT' not in self.cfg['slim_type']:\n            paddle.jit.save(\n                static_model,\n                os.path.join(save_dir, save_name),\n                input_spec=pruned_input_spec)\n        else:\n            self.cfg.slim.save_quantized_model(\n                self.model,\n                os.path.join(save_dir, save_name),\n                input_spec=pruned_input_spec)\n        logger.info(\"Exported model saved in {}\".format(save_dir))\n\n    def post_quant(self, output_dir='output_inference'):\n        model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0]\n        save_dir = os.path.join(output_dir, model_name)\n        if not os.path.exists(save_dir):\n            os.makedirs(save_dir)\n\n        for idx, data in enumerate(self.loader):\n            self.model(data)\n            if idx == int(self.cfg.get('quant_batch_num', 10)):\n                break\n\n        # TODO: support prune input_spec\n        kl_quant = hasattr(self.cfg.slim, 'ptq')\n        _, pruned_input_spec = self._get_infer_cfg_and_input_spec(\n            save_dir, prune_input=False, kl_quant=kl_quant)\n\n        self.cfg.slim.save_quantized_model(\n            self.model,\n            os.path.join(save_dir, 'model'),\n            input_spec=pruned_input_spec)\n        logger.info(\"Exported Post-Quant model saved in {}\".format(save_dir))\n\n    def _flops(self, loader):\n        if hasattr(self.model, 'aux_neck'):\n            self.model.__delattr__('aux_neck')\n        if hasattr(self.model, 'aux_head'):\n            self.model.__delattr__('aux_head')\n        self.model.eval()\n        try:\n            import paddleslim\n        except Exception:\n            logger.warning(\n                'Unable to calculate flops, please install paddleslim, for example: `pip install paddleslim`'\n            )\n            return\n\n        from paddleslim.analysis import dygraph_flops as flops\n        input_data = None\n        for data in loader:\n            input_data = data\n            break\n\n        input_spec = [{\n            \"image\": input_data['image'][0].unsqueeze(0),\n            \"im_shape\": input_data['im_shape'][0].unsqueeze(0),\n            \"scale_factor\": input_data['scale_factor'][0].unsqueeze(0)\n        }]\n        total_flops = flops(self.model, input_spec) / (1000**3)\n        logger.info(\"Model FLOPs : {:.6f}G. 
(image shape is {})\".format(\n            total_flops, input_data['image'][0].unsqueeze(0).shape))\n\n    def parse_mot_images(self, cfg):\n        import glob\n        # for quant\n        dataset_dir = cfg['EvalMOTDataset'].dataset_dir\n        data_root = cfg['EvalMOTDataset'].data_root\n        data_root = '{}/{}'.format(dataset_dir, data_root)\n        seqs = os.listdir(data_root)\n        seqs.sort()\n        all_images = []\n        for seq in seqs:\n            infer_dir = os.path.join(data_root, seq)\n            assert os.path.isdir(infer_dir), \\\n                \"{} is not a directory\".format(infer_dir)\n            images = set()\n            exts = ['jpg', 'jpeg', 'png', 'bmp']\n            exts += [ext.upper() for ext in exts]\n            for ext in exts:\n                images.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))\n            images = list(images)\n            images.sort()\n            assert len(images) > 0, \"no image found in {}\".format(infer_dir)\n            all_images.extend(images)\n            logger.info(\"Found {} inference images in {}.\".format(\n                len(images), infer_dir))\n        return all_images\n\n    def predict_culane(self,\n                       images,\n                       output_dir='output',\n                       save_results=False,\n                       visualize=True):\n        if not os.path.exists(output_dir):\n            os.makedirs(output_dir)\n\n        self.dataset.set_images(images)\n        loader = create('TestReader')(self.dataset, 0)\n\n        imid2path = self.dataset.get_imid2path()\n\n        def setup_metrics_for_loader():\n            # mem\n            metrics = copy.deepcopy(self._metrics)\n            mode = self.mode\n            save_prediction_only = self.cfg[\n                'save_prediction_only'] if 'save_prediction_only' in self.cfg else None\n            output_eval = self.cfg[\n                'output_eval'] if 'output_eval' in self.cfg else None\n\n            # modify\n            self.mode = '_test'\n            self.cfg['save_prediction_only'] = True\n            self.cfg['output_eval'] = output_dir\n            self.cfg['imid2path'] = imid2path\n            self._init_metrics()\n\n            # restore\n            self.mode = mode\n            self.cfg.pop('save_prediction_only')\n            if save_prediction_only is not None:\n                self.cfg['save_prediction_only'] = save_prediction_only\n\n            self.cfg.pop('output_eval')\n            if output_eval is not None:\n                self.cfg['output_eval'] = output_eval\n\n            self.cfg.pop('imid2path')\n\n            _metrics = copy.deepcopy(self._metrics)\n            self._metrics = metrics\n\n            return _metrics\n\n        if save_results:\n            metrics = setup_metrics_for_loader()\n        else:\n            metrics = []\n\n        # Run Infer\n        self.status['mode'] = 'test'\n        self.model.eval()\n        if self.cfg.get('print_flops', False):\n            flops_loader = create('TestReader')(self.dataset, 0)\n            self._flops(flops_loader)\n        results = []\n        for step_id, data in enumerate(tqdm(loader)):\n            self.status['step_id'] = step_id\n            # forward\n            outs = self.model(data)\n\n            for _m in metrics:\n                _m.update(data, outs)\n\n            for key in ['im_shape', 'scale_factor', 'im_id']:\n                if isinstance(data, typing.Sequence):\n                    outs[key] = 
data[0][key]\n                else:\n                    outs[key] = data[key]\n            for key, value in outs.items():\n                if hasattr(value, 'numpy'):\n                    outs[key] = value.numpy()\n            results.append(outs)\n\n        for _m in metrics:\n            _m.accumulate()\n            _m.reset()\n\n        if visualize:\n            import cv2\n\n            for outs in results:\n                for i in range(len(outs['img_path'])):\n                    lanes = outs['lanes'][i]\n                    img_path = outs['img_path'][i]\n                    img = cv2.imread(img_path)\n                    out_file = os.path.join(output_dir,\n                                            os.path.basename(img_path))\n                    lanes = [\n                        lane.to_array(\n                            sample_y_range=[\n                                self.cfg['sample_y']['start'],\n                                self.cfg['sample_y']['end'],\n                                self.cfg['sample_y']['step']\n                            ],\n                            img_w=self.cfg.ori_img_w,\n                            img_h=self.cfg.ori_img_h) for lane in lanes\n                    ]\n                    imshow_lanes(img, lanes, out_file=out_file)\n\n        return results\n\n    def reset_norm_param_attr(self, layer, **kwargs):\n        if isinstance(layer, (nn.BatchNorm2D, nn.LayerNorm, nn.GroupNorm)):\n            src_state_dict = layer.state_dict()\n            if isinstance(layer, nn.BatchNorm2D):\n                layer = nn.BatchNorm2D(\n                    num_features=layer._num_features,\n                    momentum=layer._momentum,\n                    epsilon=layer._epsilon,\n                    **kwargs)\n            elif isinstance(layer, nn.LayerNorm):\n                layer = nn.LayerNorm(\n                    normalized_shape=layer._normalized_shape,\n                    epsilon=layer._epsilon,\n                    **kwargs)\n            else:\n                layer = nn.GroupNorm(\n                    num_groups=layer._num_groups,\n                    num_channels=layer._num_channels,\n                    epsilon=layer._epsilon,\n                    **kwargs)\n            layer.set_state_dict(src_state_dict)\n        else:\n            for name, sublayer in layer.named_children():\n                new_sublayer = self.reset_norm_param_attr(sublayer, **kwargs)\n                if new_sublayer is not sublayer:\n                    setattr(layer, name, new_sublayer)\n\n        return layer\n"
  },
  {
    "path": "ppdet/engine/trainer_cot.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom ppdet.core.workspace import create\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger('ppdet.engine')\n\nfrom . import Trainer\n__all__ = ['TrainerCot']\n\nclass TrainerCot(Trainer):\n    \"\"\"\n    Trainer for label-cotuning\n    calculate the relationship between base_classes and novel_classes\n    \"\"\"\n    def __init__(self, cfg, mode='train'):\n        super(TrainerCot, self).__init__(cfg, mode)\n        self.cotuning_init()\n\n    def cotuning_init(self):    \n        num_classes_novel = self.cfg['num_classes']\n\n        self.load_weights(self.cfg.pretrain_weights)\n\n        self.model.eval()\n        relationship = self.model.relationship_learning(self.loader, num_classes_novel)\n    \n        self.model.init_cot_head(relationship)\n        self.optimizer = create('OptimizerBuilder')(self.lr, self.model)\n\n\n"
  },
  {
    "path": "ppdet/engine/trainer_ssod.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport copy\nimport time\nimport typing\nimport numpy as np\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.distributed as dist\nfrom paddle.distributed import fleet\nfrom ppdet.optimizer import ModelEMA, SimpleModelEMA\nfrom ppdet.core.workspace import create\nfrom ppdet.utils.checkpoint import load_weight, load_pretrain_weight, save_model\nimport ppdet.utils.stats as stats\nfrom ppdet.utils import profiler\nfrom ppdet.modeling.ssod.utils import align_weak_strong_shape\nfrom .trainer import Trainer\nfrom ppdet.utils.logger import setup_logger\nfrom paddle.static import InputSpec\nfrom ppdet.engine.export_utils import _dump_infer_config, _prune_input_spec\nMOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']\n\nlogger = setup_logger('ppdet.engine')\n\n__all__ = ['Trainer_DenseTeacher', 'Trainer_ARSL', 'Trainer_Semi_RTDETR']\n\n\nclass Trainer_DenseTeacher(Trainer):\n    def __init__(self, cfg, mode='train'):\n        self.cfg = cfg\n        assert mode.lower() in ['train', 'eval', 'test'], \\\n                \"mode should be 'train', 'eval' or 'test'\"\n        self.mode = mode.lower()\n        self.optimizer = None\n        self.is_loaded_weights = False\n        self.use_amp = self.cfg.get('amp', False)\n        self.amp_level = self.cfg.get('amp_level', 'O1')\n        self.custom_white_list = self.cfg.get('custom_white_list', None)\n        self.custom_black_list = self.cfg.get('custom_black_list', None)\n\n        # build data loader\n        capital_mode = self.mode.capitalize()\n        self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create(\n            '{}Dataset'.format(capital_mode))()\n\n        if self.mode == 'train':\n            self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create(\n                'UnsupTrainDataset')\n            self.loader = create('SemiTrainReader')(\n                self.dataset, self.dataset_unlabel, cfg.worker_num)\n\n        # build model\n        if 'model' not in self.cfg:\n            self.model = create(cfg.architecture)\n        else:\n            self.model = self.cfg.model\n            self.is_loaded_weights = True\n\n        # EvalDataset build with BatchSampler to evaluate in single device\n        # TODO: multi-device evaluate\n        if self.mode == 'eval':\n            self._eval_batch_sampler = paddle.io.BatchSampler(\n                self.dataset, batch_size=self.cfg.EvalReader['batch_size'])\n            # If metric is VOC, need to be set collate_batch=False.\n            if cfg.metric == 'VOC':\n                cfg['EvalReader']['collate_batch'] = False\n            self.loader = create('EvalReader')(self.dataset, cfg.worker_num,\n                                               self._eval_batch_sampler)\n        # TestDataset build 
after user set images, skip loader creation here\n\n        # build optimizer in train mode\n        if self.mode == 'train':\n            steps_per_epoch = len(self.loader)\n            if steps_per_epoch < 1:\n                logger.warning(\n                    \"Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader.\"\n                )\n            self.lr = create('LearningRate')(steps_per_epoch)\n            self.optimizer = create('OptimizerBuilder')(self.lr, self.model)\n\n            # Unstructured pruner is only enabled in the train mode.\n            if self.cfg.get('unstructured_prune'):\n                self.pruner = create('UnstructuredPruner')(self.model,\n                                                           steps_per_epoch)\n        if self.use_amp and self.amp_level == 'O2':\n            self.model, self.optimizer = paddle.amp.decorate(\n                models=self.model,\n                optimizers=self.optimizer,\n                level=self.amp_level)\n\n        self.use_ema = ('use_ema' in cfg and cfg['use_ema'])\n        if self.use_ema:\n            ema_decay = self.cfg.get('ema_decay', 0.9998)\n            ema_decay_type = self.cfg.get('ema_decay_type', 'threshold')\n            cycle_epoch = self.cfg.get('cycle_epoch', -1)\n            ema_black_list = self.cfg.get('ema_black_list', None)\n            self.ema = ModelEMA(\n                self.model,\n                decay=ema_decay,\n                ema_decay_type=ema_decay_type,\n                cycle_epoch=cycle_epoch,\n                ema_black_list=ema_black_list)\n            self.ema_start_iters = self.cfg.get('ema_start_iters', 0)\n\n        # simple_ema for SSOD\n        self.use_simple_ema = ('use_simple_ema' in cfg and\n                               cfg['use_simple_ema'])\n        if self.use_simple_ema:\n            self.use_ema = True\n            ema_decay = self.cfg.get('ema_decay', 0.9996)\n            self.ema = SimpleModelEMA(self.model, decay=ema_decay)\n            self.ema_start_iters = self.cfg.get('ema_start_iters', 0)\n\n        self._nranks = dist.get_world_size()\n        self._local_rank = dist.get_rank()\n\n        self.status = {}\n\n        self.start_epoch = 0\n        self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch\n\n        # initial default callbacks\n        self._init_callbacks()\n\n        # initial default metrics\n        self._init_metrics()\n        self._reset_metrics()\n\n    def load_weights(self, weights):\n        if self.is_loaded_weights:\n            return\n        self.start_epoch = 0\n        load_pretrain_weight(self.model, weights)\n        load_pretrain_weight(self.ema.model, weights)\n        logger.info(\"Load weights {} to start training for teacher and student\".\n                    format(weights))\n\n    def resume_weights(self, weights, exchange=True):\n        # support Distill resume weights\n        if hasattr(self.model, 'student_model'):\n            self.start_epoch = load_weight(self.model.student_model, weights,\n                                           self.optimizer, exchange)\n        else:\n            self.start_epoch = load_weight(self.model, weights, self.optimizer,\n                                           self.ema\n                                           if self.use_ema else None, exchange)\n        logger.debug(\"Resume weights of epoch {}\".format(self.start_epoch))\n\n    def train(self, validate=False):\n        self.semi_start_iters = self.cfg.get('semi_start_iters', 
5000)\n        Init_mark = False\n        if validate:\n            self.cfg['EvalDataset'] = self.cfg.EvalDataset = create(\n                \"EvalDataset\")()\n\n        sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and\n                   self.cfg.use_gpu and self._nranks > 1)\n        if sync_bn:\n            self.model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(\n                self.model)\n\n        if self.cfg.get('fleet', False):\n            self.model = fleet.distributed_model(self.model)\n            self.optimizer = fleet.distributed_optimizer(self.optimizer)\n        elif self._nranks > 1:\n            find_unused_parameters = self.cfg[\n                'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False\n            self.model = paddle.DataParallel(\n                self.model, find_unused_parameters=find_unused_parameters)\n            self.ema.model = paddle.DataParallel(\n                self.ema.model, find_unused_parameters=find_unused_parameters)\n\n        self.status.update({\n            'epoch_id': self.start_epoch,\n            'step_id': 0,\n            'steps_per_epoch': len(self.loader),\n            'exchange_save_model': True,\n        })\n        # Note: exchange_save_model\n        # In DenseTeacher SSOD the EMA teacher usually scores higher than the\n        # student, so teacher and student weights are exchanged when saving pdparams\n\n        self.status['batch_time'] = stats.SmoothedValue(\n            self.cfg.log_iter, fmt='{avg:.4f}')\n        self.status['data_time'] = stats.SmoothedValue(\n            self.cfg.log_iter, fmt='{avg:.4f}')\n        self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)\n        profiler_options = self.cfg.get('profiler_options', None)\n        self._compose_callback.on_train_begin(self.status)\n\n        train_cfg = self.cfg.DenseTeacher['train_cfg']\n        concat_sup_data = train_cfg.get('concat_sup_data', True)\n\n        for param in self.ema.model.parameters():\n            param.stop_gradient = True\n\n        for epoch_id in range(self.start_epoch, self.cfg.epoch):\n            self.status['mode'] = 'train'\n            self.status['epoch_id'] = epoch_id\n            self._compose_callback.on_epoch_begin(self.status)\n            self.loader.dataset_label.set_epoch(epoch_id)\n            self.loader.dataset_unlabel.set_epoch(epoch_id)\n            iter_tic = time.time()\n            loss_dict = {\n                'loss': paddle.to_tensor([0]),\n                'loss_sup_sum': paddle.to_tensor([0]),\n                'loss_unsup_sum': paddle.to_tensor([0]),\n                'fg_sum': paddle.to_tensor([0]),\n            }\n            if self._nranks > 1:\n                for k in self.model._layers.get_loss_keys():\n                    loss_dict.update({k: paddle.to_tensor([0.])})\n                for k in self.model._layers.get_loss_keys():\n                    loss_dict.update({'distill_' + k: paddle.to_tensor([0.])})\n            else:\n                for k in self.model.get_loss_keys():\n                    loss_dict.update({k: paddle.to_tensor([0.])})\n                for k in self.model.get_loss_keys():\n                    loss_dict.update({'distill_' + k: paddle.to_tensor([0.])})\n\n            # Note: avoid `for step_id, data in enumerate(self.loader)` here, it\n            # triggers a bug in this reader; fetch batches with next() instead\n            for step_id in range(len(self.loader)):\n                data = next(self.loader)\n\n                self.model.train()\n                self.ema.model.eval()\n                data_sup_w, data_sup_s, data_unsup_w, data_unsup_s = 
data\n\n                self.status['data_time'].update(time.time() - iter_tic)\n                self.status['step_id'] = step_id\n                profiler.add_profiler_step(profiler_options)\n                self._compose_callback.on_step_begin(self.status)\n\n                if data_sup_w['image'].shape != data_sup_s['image'].shape:\n                    data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w,\n                                                                     data_sup_s)\n\n                data_sup_w['epoch_id'] = epoch_id\n                data_sup_s['epoch_id'] = epoch_id\n                if concat_sup_data:\n                    for k, v in data_sup_s.items():\n                        if k in ['epoch_id']:\n                            continue\n                        data_sup_s[k] = paddle.concat([v, data_sup_w[k]])\n                    loss_dict_sup = self.model(data_sup_s)\n                else:\n                    loss_dict_sup_w = self.model(data_sup_w)\n                    loss_dict_sup = self.model(data_sup_s)\n                    for k, v in loss_dict_sup_w.items():\n                        loss_dict_sup[k] = (loss_dict_sup[k] + v) * 0.5\n\n                losses_sup = loss_dict_sup['loss'] * train_cfg['sup_weight']\n                losses_sup.backward()\n\n                losses = losses_sup.detach()\n                loss_dict.update(loss_dict_sup)\n                loss_dict.update({'loss_sup_sum': loss_dict['loss']})\n\n                curr_iter = len(self.loader) * epoch_id + step_id\n                st_iter = self.semi_start_iters\n                if curr_iter == st_iter:\n                    logger.info(\"***\" * 30)\n                    logger.info('Semi starting ...')\n                    logger.info(\"***\" * 30)\n                if curr_iter > st_iter:\n                    unsup_weight = train_cfg['unsup_weight']\n                    if train_cfg['suppress'] == 'linear':\n                        tar_iter = st_iter * 2\n                        if curr_iter <= tar_iter:\n                            unsup_weight *= (curr_iter - st_iter) / st_iter\n                    elif train_cfg['suppress'] == 'exp':\n                        tar_iter = st_iter + 2000\n                        if curr_iter <= tar_iter:\n                            scale = np.exp((curr_iter - tar_iter) / 1000)\n                            unsup_weight *= scale\n                    elif train_cfg['suppress'] == 'step':\n                        tar_iter = st_iter * 2\n                        if curr_iter <= tar_iter:\n                            unsup_weight *= 0.25\n                    else:\n                        raise ValueError\n\n                    if data_unsup_w['image'].shape != data_unsup_s[\n                            'image'].shape:\n                        data_unsup_w, data_unsup_s = align_weak_strong_shape(\n                            data_unsup_w, data_unsup_s)\n\n                    data_unsup_w['epoch_id'] = epoch_id\n                    data_unsup_s['epoch_id'] = epoch_id\n\n                    data_unsup_s['get_data'] = True\n                    student_preds = self.model(data_unsup_s)\n\n                    with paddle.no_grad():\n                        data_unsup_w['is_teacher'] = True\n                        teacher_preds = self.ema.model(data_unsup_w)\n\n                    train_cfg['curr_iter'] = curr_iter\n                    train_cfg['st_iter'] = st_iter\n                    if self._nranks > 1:\n                        loss_dict_unsup = 
self.model._layers.get_ssod_loss(\n                            student_preds, teacher_preds, train_cfg)\n                    else:\n                        loss_dict_unsup = self.model.get_ssod_loss(\n                            student_preds, teacher_preds, train_cfg)\n\n                    fg_num = loss_dict_unsup[\"fg_sum\"]\n                    del loss_dict_unsup[\"fg_sum\"]\n                    distill_weights = train_cfg['loss_weight']\n                    loss_dict_unsup = {\n                        k: v * distill_weights[k]\n                        for k, v in loss_dict_unsup.items()\n                    }\n\n                    losses_unsup = sum([\n                        metrics_value\n                        for metrics_value in loss_dict_unsup.values()\n                    ]) * unsup_weight\n                    losses_unsup.backward()\n\n                    loss_dict.update(loss_dict_unsup)\n                    loss_dict.update({'loss_unsup_sum': losses_unsup})\n                    losses += losses_unsup.detach()\n                    loss_dict.update({\"fg_sum\": fg_num})\n                    loss_dict['loss'] = losses\n\n                self.optimizer.step()\n                curr_lr = self.optimizer.get_lr()\n                self.lr.step()\n                self.optimizer.clear_grad()\n                self.status['learning_rate'] = curr_lr\n                if self._nranks < 2 or self._local_rank == 0:\n                    self.status['training_staus'].update(loss_dict)\n\n                self.status['batch_time'].update(time.time() - iter_tic)\n                self._compose_callback.on_step_end(self.status)\n                # Note: ema_start_iters\n                if self.use_ema and curr_iter == self.ema_start_iters:\n                    logger.info(\"***\" * 30)\n                    logger.info('EMA starting ...')\n                    logger.info(\"***\" * 30)\n                    self.ema.update(self.model, decay=0)\n                elif self.use_ema and curr_iter > self.ema_start_iters:\n                    self.ema.update(self.model)\n                iter_tic = time.time()\n\n            is_snapshot = (self._nranks < 2 or self._local_rank == 0) \\\n                       and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1)\n            if is_snapshot and self.use_ema:\n                # apply ema weight on model\n                weight = copy.deepcopy(self.ema.model.state_dict())\n                for k, v in weight.items():\n                    if paddle.is_floating_point(v):\n                        weight[k].stop_gradient = True\n                self.status['weight'] = weight\n\n            self._compose_callback.on_epoch_end(self.status)\n\n            if validate and is_snapshot:\n                if not hasattr(self, '_eval_loader'):\n                    # build evaluation dataset and loader\n                    self._eval_dataset = self.cfg.EvalDataset\n                    self._eval_batch_sampler = \\\n                        paddle.io.BatchSampler(\n                            self._eval_dataset,\n                            batch_size=self.cfg.EvalReader['batch_size'])\n                    # If metric is VOC, need to be set collate_batch=False.\n                    if self.cfg.metric == 'VOC':\n                        self.cfg['EvalReader']['collate_batch'] = False\n                    self._eval_loader = create('EvalReader')(\n                        self._eval_dataset,\n                        self.cfg.worker_num,\n              
          batch_sampler=self._eval_batch_sampler)\n                # if validation in training is enabled, metrics should be re-init\n                # Init_mark makes sure this code will only execute once\n                if validate and Init_mark == False:\n                    Init_mark = True\n                    self._init_metrics(validate=validate)\n                    self._reset_metrics()\n\n                with paddle.no_grad():\n                    self.status['save_best_model'] = True\n                    self._eval_with_loader(self._eval_loader)\n\n            if is_snapshot and self.use_ema:\n                self.status.pop('weight')\n\n        self._compose_callback.on_train_end(self.status)\n\n    def evaluate(self):\n        # get distributed model\n        if self.cfg.get('fleet', False):\n            self.model = fleet.distributed_model(self.model)\n            self.optimizer = fleet.distributed_optimizer(self.optimizer)\n        elif self._nranks > 1:\n            find_unused_parameters = self.cfg[\n                'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False\n            self.model = paddle.DataParallel(\n                self.model, find_unused_parameters=find_unused_parameters)\n        with paddle.no_grad():\n            self._eval_with_loader(self.loader)\n\n    def _eval_with_loader(self, loader):\n        sample_num = 0\n        tic = time.time()\n        self._compose_callback.on_epoch_begin(self.status)\n        self.status['mode'] = 'eval'\n\n        test_cfg = self.cfg.DenseTeacher['test_cfg']\n        if test_cfg['inference_on'] == 'teacher':\n            logger.info(\"***** teacher model evaluating *****\")\n            eval_model = self.ema.model\n        else:\n            logger.info(\"***** student model evaluating *****\")\n            eval_model = self.model\n\n        eval_model.eval()\n        if self.cfg.get('print_flops', False):\n            flops_loader = create('{}Reader'.format(self.mode.capitalize()))(\n                self.dataset, self.cfg.worker_num, self._eval_batch_sampler)\n            self._flops(flops_loader)\n        for step_id, data in enumerate(loader):\n            self.status['step_id'] = step_id\n            self._compose_callback.on_step_begin(self.status)\n            # forward\n            if self.use_amp:\n                with paddle.amp.auto_cast(\n                        enable=self.cfg.use_gpu or self.cfg.use_mlu,\n                        custom_white_list=self.custom_white_list,\n                        custom_black_list=self.custom_black_list,\n                        level=self.amp_level):\n                    outs = eval_model(data)\n            else:\n                outs = eval_model(data)\n\n            # update metrics\n            for metric in self._metrics:\n                metric.update(data, outs)\n\n            # multi-scale inputs: all inputs have same im_id\n            if isinstance(data, typing.Sequence):\n                sample_num += data[0]['im_id'].numpy().shape[0]\n            else:\n                sample_num += data['im_id'].numpy().shape[0]\n            self._compose_callback.on_step_end(self.status)\n\n        self.status['sample_num'] = sample_num\n        self.status['cost_time'] = time.time() - tic\n\n        # accumulate metric to log out\n        for metric in self._metrics:\n            metric.accumulate()\n            metric.log()\n        self._compose_callback.on_epoch_end(self.status)\n        self._reset_metrics()\n\n\nclass Trainer_ARSL(Trainer):\n    
def __init__(self, cfg, mode='train'):\n        self.cfg = cfg\n        assert mode.lower() in ['train', 'eval', 'test'], \\\n                \"mode should be 'train', 'eval' or 'test'\"\n        self.mode = mode.lower()\n        self.optimizer = None\n        self.is_loaded_weights = False\n        capital_mode = self.mode.capitalize()\n        self.use_ema = False\n        self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create(\n            '{}Dataset'.format(capital_mode))()\n        if self.mode == 'train':\n            self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create(\n                'UnsupTrainDataset')\n            self.loader = create('SemiTrainReader')(\n                self.dataset, self.dataset_unlabel, cfg.worker_num)\n\n        # build model\n        if 'model' not in self.cfg:\n            self.student_model = create(cfg.architecture)\n            self.teacher_model = create(cfg.architecture)\n            self.model = EnsembleTSModel(self.teacher_model, self.student_model)\n        else:\n            self.model = self.cfg.model\n            self.is_loaded_weights = True\n        # save path for burn-in model\n        self.base_path = cfg.get('weights')\n        self.base_path = os.path.dirname(self.base_path)\n\n        # EvalDataset build with BatchSampler to evaluate in single device\n        # TODO: multi-device evaluate\n        if self.mode == 'eval':\n            self._eval_batch_sampler = paddle.io.BatchSampler(\n                self.dataset, batch_size=self.cfg.EvalReader['batch_size'])\n            self.loader = create('{}Reader'.format(self.mode.capitalize()))(\n                self.dataset, cfg.worker_num, self._eval_batch_sampler)\n        # TestDataset build after user set images, skip loader creation here\n\n        self.start_epoch = 0\n        self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch\n        self.epoch_iter = self.cfg.epoch_iter  # set fixed iter in each epoch to control checkpoint\n\n        # build optimizer in train mode\n        if self.mode == 'train':\n            steps_per_epoch = self.epoch_iter\n            self.lr = create('LearningRate')(steps_per_epoch)\n            self.optimizer = create('OptimizerBuilder')(self.lr,\n                                                        self.model.modelStudent)\n\n        self._nranks = dist.get_world_size()\n        self._local_rank = dist.get_rank()\n\n        self.status = {}\n\n        # initial default callbacks\n        self._init_callbacks()\n\n        # initial default metrics\n        self._init_metrics()\n        self._reset_metrics()\n        self.iter = 0\n\n    def resume_weights(self, weights):\n        # support Distill resume weights\n        if hasattr(self.model, 'student_model'):\n            self.start_epoch = load_weight(self.model.student_model, weights,\n                                           self.optimizer)\n        else:\n            self.start_epoch = load_weight(self.model, weights, self.optimizer)\n        logger.debug(\"Resume weights of epoch {}\".format(self.start_epoch))\n\n    def train(self, validate=False):\n        assert self.mode == 'train', \"Model not in 'train' mode\"\n        Init_mark = False\n\n        # if validation in training is enabled, metrics should be re-init\n        if validate:\n            self._init_metrics(validate=validate)\n            self._reset_metrics()\n\n        if self.cfg.get('fleet', False):\n            self.model.modelStudent = fleet.distributed_model(\n                self.model.modelStudent)\n  
          self.optimizer = fleet.distributed_optimizer(self.optimizer)\n        elif self._nranks > 1:\n            find_unused_parameters = self.cfg[\n                'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False\n            self.model.modelStudent = paddle.DataParallel(\n                self.model.modelStudent,\n                find_unused_parameters=find_unused_parameters)\n\n        # set fixed iter in each epoch to control checkpoint\n        self.status.update({\n            'epoch_id': self.start_epoch,\n            'step_id': 0,\n            'steps_per_epoch': self.epoch_iter\n        })\n        logger.info('Length of DataLoader: {}'.format(len(self.loader)))\n\n        self.status['batch_time'] = stats.SmoothedValue(\n            self.cfg.log_iter, fmt='{avg:.4f}')\n        self.status['data_time'] = stats.SmoothedValue(\n            self.cfg.log_iter, fmt='{avg:.4f}')\n        self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)\n\n        self._compose_callback.on_train_begin(self.status)\n\n        epoch_id = self.start_epoch\n        self.iter = self.start_epoch * self.epoch_iter\n        # use iter rather than epoch to control training schedule\n        while self.iter < self.cfg.max_iter:\n            # epoch loop\n            self.status['mode'] = 'train'\n            self.status['epoch_id'] = epoch_id\n            self._compose_callback.on_epoch_begin(self.status)\n            self.loader.dataset_label.set_epoch(epoch_id)\n            self.loader.dataset_unlabel.set_epoch(epoch_id)\n            paddle.device.cuda.empty_cache()  # clear GPU memory\n            # set model status\n            self.model.modelStudent.train()\n            self.model.modelTeacher.eval()\n            iter_tic = time.time()\n\n            # iter loop in each epoch\n            for step_id in range(self.epoch_iter):\n                data = next(self.loader)\n                self.status['data_time'].update(time.time() - iter_tic)\n                self.status['step_id'] = step_id\n                # profiler.add_profiler_step(profiler_options)\n                self._compose_callback.on_step_begin(self.status)\n\n                # model forward and calculate loss\n                loss_dict = self.run_step_full_semisup(data)\n\n                if (step_id + 1) % self.cfg.optimize_rate == 0:\n                    self.optimizer.step()\n                    self.optimizer.clear_grad()\n                curr_lr = self.optimizer.get_lr()\n                self.lr.step()\n\n                # update log status\n                self.status['learning_rate'] = curr_lr\n                if self._nranks < 2 or self._local_rank == 0:\n                    self.status['training_staus'].update(loss_dict)\n                self.status['batch_time'].update(time.time() - iter_tic)\n                self._compose_callback.on_step_end(self.status)\n                self.iter += 1\n                iter_tic = time.time()\n\n            self._compose_callback.on_epoch_end(self.status)\n\n            if validate and (self._nranks < 2 or self._local_rank == 0) \\\n                    and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 \\\n                             or epoch_id == self.end_epoch - 1):\n                if not hasattr(self, '_eval_loader'):\n                    # build evaluation dataset and loader\n                    self._eval_dataset = self.cfg.EvalDataset\n                    self._eval_batch_sampler = \\\n                        paddle.io.BatchSampler(\n                
            self._eval_dataset,\n                            batch_size=self.cfg.EvalReader['batch_size'])\n                    self._eval_loader = create('EvalReader')(\n                        self._eval_dataset,\n                        self.cfg.worker_num,\n                        batch_sampler=self._eval_batch_sampler)\n                if validate and Init_mark == False:\n                    Init_mark = True\n                    self._init_metrics(validate=validate)\n                    self._reset_metrics()\n                with paddle.no_grad():\n                    self.status['save_best_model'] = True\n                    # before burn-in stage, eval student; after burn-in stage, eval teacher\n                    if self.iter <= self.cfg.SEMISUPNET['BURN_UP_STEP']:\n                        logger.info(\"Start evaluating student model\")\n                        self._eval_with_loader(\n                            self._eval_loader, mode=\"student\")\n                    else:\n                        logger.info(\"Start evaluating teacher model\")\n                        self._eval_with_loader(\n                            self._eval_loader, mode=\"teacher\")\n\n            epoch_id += 1\n\n        self._compose_callback.on_train_end(self.status)\n\n    def merge_data(self, data1, data2):\n        data = copy.deepcopy(data1)\n        for k, v in data1.items():\n            if type(v) is paddle.Tensor:\n                data[k] = paddle.concat(x=[data[k], data2[k]], axis=0)\n            elif type(v) is list:\n                data[k].extend(data2[k])\n        return data\n\n    def run_step_full_semisup(self, data):\n        label_data_k, label_data_q, unlabel_data_k, unlabel_data_q = data\n        data_merge = self.merge_data(label_data_k, label_data_q)\n        loss_sup_dict = self.model.modelStudent(data_merge, branch=\"supervised\")\n        loss_dict = {}\n        for key in loss_sup_dict.keys():\n            if key[:4] == \"loss\":\n                loss_dict[key] = loss_sup_dict[key] * 1\n        losses_sup = paddle.add_n(list(loss_dict.values()))\n        # norm loss when using gradient accumulation\n        losses_sup = losses_sup / self.cfg.optimize_rate\n        losses_sup.backward()\n\n        for key in loss_sup_dict.keys():\n            loss_dict[key + \"_pseudo\"] = paddle.to_tensor([0])\n        loss_dict[\"loss_tot\"] = losses_sup\n\n        # semi-supervised training after burn-in stage\n        if self.iter >= self.cfg.SEMISUPNET['BURN_UP_STEP']:\n            # init teacher model with burn-in weights\n            if self.iter == self.cfg.SEMISUPNET['BURN_UP_STEP']:\n                logger.info(\n                    'Starting semi-supervised learning and loading the teacher model.'\n                )\n                self._update_teacher_model(keep_rate=0.00)\n                # save burn-in model\n                if dist.get_world_size() < 2 or dist.get_rank() == 0:\n                    logger.info('Saving burn-in model.')\n                    save_name = 'burnIn'\n                    epoch_id = self.iter // self.epoch_iter\n                    save_model(self.model, self.optimizer, self.base_path,\n                               save_name, epoch_id)\n            # Update teacher model with EMA\n            elif (self.iter + 1) % self.cfg.optimize_rate == 0:\n                self._update_teacher_model(\n                    keep_rate=self.cfg.SEMISUPNET['EMA_KEEP_RATE'])\n\n            # warm-up weight for pseudo loss\n            pseudo_weight = 
self.cfg.SEMISUPNET['UNSUP_LOSS_WEIGHT']\n            pseudo_warmup_iter = self.cfg.SEMISUPNET['PSEUDO_WARM_UP_STEPS']\n            temp = self.iter - self.cfg.SEMISUPNET['BURN_UP_STEP']\n            if temp <= pseudo_warmup_iter:\n                pseudo_weight *= (temp / pseudo_warmup_iter)\n\n            # get teacher predictions on weak-augmented unlabeled data\n            with paddle.no_grad():\n                teacher_pred = self.model.modelTeacher(\n                    unlabel_data_k, branch='semi_supervised')\n\n            # calculate unsupervised loss on strong-augmented unlabeled data\n            loss_unsup_dict = self.model.modelStudent(\n                unlabel_data_q,\n                branch=\"semi_supervised\",\n                teacher_prediction=teacher_pred, )\n\n            for key in loss_unsup_dict.keys():\n                if key[-6:] == \"pseudo\":\n                    loss_unsup_dict[key] = loss_unsup_dict[key] * pseudo_weight\n            losses_unsup = paddle.add_n(list(loss_unsup_dict.values()))\n            # norm loss when using gradient accumulation\n            losses_unsup = losses_unsup / self.cfg.optimize_rate\n            losses_unsup.backward()\n\n            loss_dict.update(loss_unsup_dict)\n            loss_dict[\"loss_tot\"] += losses_unsup\n        return loss_dict\n\n    def export(self, output_dir='output_inference'):\n        self.model.eval()\n        model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0]\n        save_dir = os.path.join(output_dir, model_name)\n        if not os.path.exists(save_dir):\n            os.makedirs(save_dir)\n        image_shape = None\n        if self.cfg.architecture in MOT_ARCH:\n            test_reader_name = 'TestMOTReader'\n        else:\n            test_reader_name = 'TestReader'\n        if 'inputs_def' in self.cfg[test_reader_name]:\n            inputs_def = self.cfg[test_reader_name]['inputs_def']\n            image_shape = inputs_def.get('image_shape', None)\n        # set image_shape=[3, -1, -1] as default\n        if image_shape is None:\n            image_shape = [3, -1, -1]\n\n        self.model.modelTeacher.eval()\n        if hasattr(self.model.modelTeacher, 'deploy'):\n            self.model.modelTeacher.deploy = True\n\n        # Save infer cfg\n        _dump_infer_config(self.cfg,\n                           os.path.join(save_dir, 'infer_cfg.yml'), image_shape,\n                           self.model.modelTeacher)\n\n        input_spec = [{\n            \"image\": InputSpec(\n                shape=[None] + image_shape, name='image'),\n            \"im_shape\": InputSpec(\n                shape=[None, 2], name='im_shape'),\n            \"scale_factor\": InputSpec(\n                shape=[None, 2], name='scale_factor')\n        }]\n        if self.cfg.architecture == 'DeepSORT':\n            input_spec[0].update({\n                \"crops\": InputSpec(\n                    shape=[None, 3, 192, 64], name='crops')\n            })\n\n        static_model = paddle.jit.to_static(\n            self.model.modelTeacher, input_spec=input_spec)\n        # NOTE: dy2st do not pruned program, but jit.save will prune program\n        # input spec, prune input spec here and save with pruned input spec\n        pruned_input_spec = _prune_input_spec(input_spec,\n                                              static_model.forward.main_program,\n                                              static_model.forward.outputs)\n\n        # dy2st and save model\n        if 'slim' not in self.cfg or 
self.cfg['slim_type'] != 'QAT':\n            paddle.jit.save(\n                static_model,\n                os.path.join(save_dir, 'model'),\n                input_spec=pruned_input_spec)\n        else:\n            self.cfg.slim.save_quantized_model(\n                self.model.modelTeacher,\n                os.path.join(save_dir, 'model'),\n                input_spec=pruned_input_spec)\n        logger.info(\"Exported model saved in {}\".format(save_dir))\n\n    def _eval_with_loader(self, loader, mode=\"teacher\"):\n        sample_num = 0\n        tic = time.time()\n        self._compose_callback.on_epoch_begin(self.status)\n        self.status['mode'] = 'eval'\n        # self.model.eval()\n        self.model.modelTeacher.eval()\n        self.model.modelStudent.eval()\n        for step_id, data in enumerate(loader):\n            self.status['step_id'] = step_id\n            self._compose_callback.on_step_begin(self.status)\n            if mode == \"teacher\":\n                outs = self.model.modelTeacher(data)\n            else:\n                outs = self.model.modelStudent(data)\n\n            # update metrics\n            for metric in self._metrics:\n                metric.update(data, outs)\n\n            sample_num += data['im_id'].numpy().shape[0]\n            self._compose_callback.on_step_end(self.status)\n\n        self.status['sample_num'] = sample_num\n        self.status['cost_time'] = time.time() - tic\n\n        # accumulate metric to log out\n        for metric in self._metrics:\n            metric.accumulate()\n            metric.log()\n        self._compose_callback.on_epoch_end(self.status)\n        # reset metric states since metrics may be evaluated multiple times\n        self._reset_metrics()\n\n    def evaluate(self):\n        with paddle.no_grad():\n            self._eval_with_loader(self.loader)\n\n    @paddle.no_grad()\n    def _update_teacher_model(self, keep_rate=0.996):\n        student_model_dict = copy.deepcopy(self.model.modelStudent.state_dict())\n        new_teacher_dict = dict()\n        for key, value in self.model.modelTeacher.state_dict().items():\n            if key in student_model_dict.keys():\n                v = student_model_dict[key] * (1 - keep_rate\n                                               ) + value * keep_rate\n                v.stop_gradient = True\n                new_teacher_dict[key] = v\n            else:\n                raise Exception(\"{} is not found in student model\".format(key))\n\n        self.model.modelTeacher.set_dict(new_teacher_dict)\n\n\nclass EnsembleTSModel(nn.Layer):\n    def __init__(self, modelTeacher, modelStudent):\n        super(EnsembleTSModel, self).__init__()\n        self.modelTeacher = modelTeacher\n        self.modelStudent = modelStudent\n\n\nclass Trainer_Semi_RTDETR(Trainer):\n    def __init__(self, cfg, mode='train'):\n        self.cfg = cfg\n        assert mode.lower() in ['train', 'eval', 'test'], \\\n                \"mode should be 'train', 'eval' or 'test'\"\n        self.mode = mode.lower()\n        self.optimizer = None\n        self.is_loaded_weights = False\n        self.use_amp = self.cfg.get('amp', False)\n        self.amp_level = self.cfg.get('amp_level', 'O1')\n        self.custom_white_list = self.cfg.get('custom_white_list', None)\n        self.custom_black_list = self.cfg.get('custom_black_list', None)\n\n        # build data loader\n        capital_mode = self.mode.capitalize()\n        self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create(\n            
'{}Dataset'.format(capital_mode))()\n\n        if self.mode == 'train':\n            self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create(\n                'UnsupTrainDataset')\n            self.loader = create('SemiTrainReader')(\n                self.dataset, self.dataset_unlabel, cfg.worker_num)\n\n        # build model\n        if 'model' not in self.cfg:\n            self.model = create(cfg.SSOD)\n        else:\n            self.model = self.cfg.model\n            self.is_loaded_weights = True\n\n        # EvalDataset build with BatchSampler to evaluate in single device\n        # TODO: multi-device evaluate\n        if self.mode == 'eval':\n            self._eval_batch_sampler = paddle.io.BatchSampler(\n                self.dataset, batch_size=self.cfg.EvalReader['batch_size'])\n            # If metric is VOC, need to be set collate_batch=False.\n            if cfg.metric == 'VOC':\n                cfg['EvalReader']['collate_batch'] = False\n            self.loader = create('EvalReader')(self.dataset, cfg.worker_num,\n                                               self._eval_batch_sampler)\n        # TestDataset build after user set images, skip loader creation here\n\n        # build optimizer in train mode\n        if self.mode == 'train':\n            steps_per_epoch = len(self.loader)\n            if steps_per_epoch < 1:\n                logger.warning(\n                    \"Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader.\"\n                )\n            self.lr = create('LearningRate')(steps_per_epoch)\n            self.optimizer = create('OptimizerBuilder')(self.lr, self.model)\n\n            # Unstructured pruner is only enabled in the train mode.\n            if self.cfg.get('unstructured_prune'):\n                self.pruner = create('UnstructuredPruner')(self.model,\n                                                           steps_per_epoch)\n        if self.use_amp and self.amp_level == 'O2':\n            self.model, self.optimizer = paddle.amp.decorate(\n                models=self.model,\n                optimizers=self.optimizer,\n                level=self.amp_level)\n\n        self._nranks = dist.get_world_size()\n        self._local_rank = dist.get_rank()\n\n        self.status = {}\n\n        self.start_epoch = 0\n        self.start_iter = 0\n        self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch\n\n        # initial default callbacks\n        self._init_callbacks()\n\n        # initial default metrics\n        self._init_metrics()\n        self._reset_metrics()\n\n    def load_semi_weights(self, t_weights, s_weights):\n        if self.is_loaded_weights:\n            return\n        self.start_epoch = 0\n        load_pretrain_weight(self.model.teacher, t_weights)\n        load_pretrain_weight(self.model.student, s_weights)\n        logger.info(\"Load teacher weights {} to start training\".format(\n            t_weights))\n        logger.info(\"Load student weights {} to start training\".format(\n            s_weights))\n\n    def resume_weights(self, weights, exchange=True):\n        # support Distill resume weights\n        if hasattr(self.model, 'student_model'):\n            self.start_epoch = load_weight(self.model.student_model, weights,\n                                           self.optimizer, exchange)\n        else:\n            self.start_iter, self.start_epoch = load_weight(\n                self.model, weights, self.optimizer, self.ema\n                if self.use_ema else None, exchange)\n  
      logger.debug(\"Resume weights of epoch {}\".format(self.start_epoch))\n        logger.debug(\"Resume weights of iter {}\".format(self.start_iter))\n\n    def train(self, validate=False):\n        assert self.mode == 'train', \"Model not in 'train' mode\"\n        Init_mark = False\n        if validate:\n            self.cfg.EvalDataset = create(\"EvalDataset\")()\n\n        model = self.model\n        sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and\n                   self.cfg.use_gpu and self._nranks > 1)\n        if sync_bn:\n            # self.model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(\n            #     self.model)\n            model.teacher = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(\n                model.teacher)\n            model.student = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(\n                self.model.student)\n\n        if self.cfg.get('fleet', False):\n            # model = fleet.distributed_model(model)\n            model = fleet.distributed_model(model)\n\n            self.optimizer = fleet.distributed_optimizer(self.optimizer)\n        elif self._nranks > 1:\n            find_unused_parameters = self.cfg[\n                'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False\n            model = paddle.DataParallel(\n                model, find_unused_parameters=find_unused_parameters)\n\n        if self.cfg.get('amp', False):\n            scaler = amp.GradScaler(\n                enable=self.cfg.use_gpu or self.cfg.use_npu,\n                init_loss_scaling=1024)\n\n        self.status.update({\n            'epoch_id': self.start_epoch,\n            'iter_id': self.start_iter,\n            # 'step_id': self.start_step,\n            'steps_per_epoch': len(self.loader),\n        })\n\n        self.status['batch_time'] = stats.SmoothedValue(\n            self.cfg.log_iter, fmt='{avg:.4f}')\n        self.status['data_time'] = stats.SmoothedValue(\n            self.cfg.log_iter, fmt='{avg:.4f}')\n        self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)\n\n        if self.cfg.get('print_flops', False):\n            flops_loader = create('{}Reader'.format(self.mode.capitalize()))(\n                self.dataset, self.cfg.worker_num)\n            self._flops(flops_loader)\n        profiler_options = self.cfg.get('profiler_options', None)\n\n        self._compose_callback.on_train_begin(self.status)\n        iter_id = self.start_iter\n        self.status['iter_id'] = iter_id\n        self.status['eval_interval'] = self.cfg.eval_interval\n        self.status['save_interval'] = self.cfg.save_interval\n        for epoch_id in range(self.start_epoch, self.cfg.epoch):\n            self.status['mode'] = 'train'\n            self.status['epoch_id'] = epoch_id\n            self._compose_callback.on_epoch_begin(self.status)\n            self.loader.dataset_label.set_epoch(epoch_id)\n            self.loader.dataset_unlabel.set_epoch(epoch_id)\n            iter_tic = time.time()\n            if self._nranks > 1:\n                # print(model)\n                model._layers.teacher.eval()\n                model._layers.student.train()\n            else:\n                model.teacher.eval()\n                model.student.train()\n            iter_tic = time.time()\n            for step_id in range(len(self.loader)):\n                data = next(self.loader)\n                data_sup_w, data_sup_s, data_unsup_w, data_unsup_s = data\n                data_sup_w['epoch_id'] = epoch_id\n               
 data_sup_s['epoch_id'] = epoch_id\n                data_unsup_w['epoch_id'] = epoch_id\n                data_unsup_s['epoch_id'] = epoch_id\n                data = [data_sup_w, data_sup_s, data_unsup_w, data_unsup_s]\n                iter_id += 1\n                self.status['data_time'].update(time.time() - iter_tic)\n                self.status['step_id'] = step_id\n                self.status['iter_id'] = iter_id\n                data.append(iter_id)\n                profiler.add_profiler_step(profiler_options)\n                self._compose_callback.on_step_begin(self.status)\n                if self.cfg.get('amp', False):\n                    with amp.auto_cast(enable=self.cfg.use_gpu):\n                        # model forward\n                        if self._nranks > 1:\n                            outputs = model._layers(data)\n                        else:\n                            outputs = model(data)\n                        loss = outputs['loss']\n\n                    scaled_loss = scaler.scale(loss)\n                    scaled_loss.backward()\n                    scaler.minimize(self.optimizer, scaled_loss)\n                else:\n                    outputs = model(data)\n                    loss = outputs['loss']\n                    # model backward\n                    loss.backward()\n                    self.optimizer.step()\n                curr_lr = self.optimizer.get_lr()\n                self.lr.step()\n                if self.cfg.get('unstructured_prune'):\n                    self.pruner.step()\n                self.optimizer.clear_grad()\n                # print(outputs)\n                # outputs=reduce_dict(outputs)\n                # if self.model.debug:\n                #     check_gradient(model)\n                # self.check_gradient()\n                self.status['learning_rate'] = curr_lr\n                if self._nranks < 2 or self._local_rank == 0:\n                    self.status['training_staus'].update(outputs)\n\n                self.status['batch_time'].update(time.time() - iter_tic)\n\n                if validate and (self._nranks < 2 or self._local_rank == 0) and \\\n                                ((iter_id + 1) % self.cfg.eval_interval == 0):\n                    if not hasattr(self, '_eval_loader'):\n                        # build evaluation dataset and loader\n                        self._eval_dataset = self.cfg.EvalDataset\n                        self._eval_batch_sampler = \\\n                            paddle.io.BatchSampler(\n                                self._eval_dataset,\n                                batch_size=self.cfg.EvalReader['batch_size'])\n                        # If metric is VOC, need to be set collate_batch=False.\n                        if self.cfg.metric == 'VOC':\n                            self.cfg['EvalReader']['collate_batch'] = False\n                        self._eval_loader = create('EvalReader')(\n                            self._eval_dataset,\n                            self.cfg.worker_num,\n                            batch_sampler=self._eval_batch_sampler)\n                    # if validation in training is enabled, metrics should be re-init\n                    # Init_mark makes sure this code will only execute once\n                    if validate and Init_mark == False:\n                        Init_mark = True\n                        self._init_metrics(validate=validate)\n                        self._reset_metrics()\n\n                    with paddle.no_grad():\n                        
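# run validation with gradients disabled; mark the result as a candidate for the best-model checkpoint\n                        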
self.status['save_best_model'] = True\n                        self._eval_with_loader(self._eval_loader)\n                    # DataParallel wraps the model only in distributed mode\n                    if self._nranks > 1:\n                        model._layers.student.train()\n                    else:\n                        model.student.train()\n\n                self._compose_callback.on_step_end(self.status)\n\n                iter_tic = time.time()\n\n            if self.cfg.get('unstructured_prune'):\n                self.pruner.update_params()\n            self._compose_callback.on_epoch_end(self.status)\n\n        self._compose_callback.on_train_end(self.status)\n\n    def _eval_with_loader(self, loader):\n        sample_num = 0\n        tic = time.time()\n        self._compose_callback.on_epoch_begin(self.status)\n        self.status['mode'] = 'eval'\n        self.model.eval()\n        if self.cfg.get('print_flops', False):\n            flops_loader = create('{}Reader'.format(self.mode.capitalize()))(\n                self.dataset, self.cfg.worker_num, self._eval_batch_sampler)\n            self._flops(flops_loader)\n        print(\"*****teacher evaluate*****\")\n        for step_id, data in enumerate(loader):\n            self.status['step_id'] = step_id\n            self._compose_callback.on_step_begin(self.status)\n            # forward\n            outs = self.model.teacher(data)\n\n            # update metrics\n            for metric in self._metrics:\n                metric.update(data, outs)\n\n            # multi-scale inputs: all inputs have same im_id\n            if isinstance(data, typing.Sequence):\n                sample_num += data[0]['im_id'].numpy().shape[0]\n            else:\n                sample_num += data['im_id'].numpy().shape[0]\n            self._compose_callback.on_step_end(self.status)\n\n        self.status['sample_num'] = sample_num\n        self.status['cost_time'] = time.time() - tic\n\n        # accumulate metrics and log them\n        for metric in self._metrics:\n            metric.accumulate()\n            metric.log()\n        self._compose_callback.on_epoch_end(self.status)\n        # reset metric states since metrics may be evaluated multiple times\n        self._reset_metrics()\n\n        print(\"*****student evaluate*****\")\n        # restart the counters so student statistics are not mixed with the teacher's\n        sample_num = 0\n        tic = time.time()\n        for step_id, data in enumerate(loader):\n            self.status['step_id'] = step_id\n            self._compose_callback.on_step_begin(self.status)\n            # forward\n            outs = self.model.student(data)\n\n            # update metrics\n            for metric in self._metrics:\n                metric.update(data, outs)\n\n            # multi-scale inputs: all inputs have same im_id\n            if isinstance(data, typing.Sequence):\n                sample_num += data[0]['im_id'].numpy().shape[0]\n            else:\n                sample_num += data['im_id'].numpy().shape[0]\n            self._compose_callback.on_step_end(self.status)\n\n        self.status['sample_num'] = sample_num\n        self.status['cost_time'] = time.time() - tic\n\n        # accumulate metrics and log them\n        for metric in self._metrics:\n            metric.accumulate()\n            metric.log()\n        # reset metric states since metrics may be evaluated multiple times\n        self._reset_metrics()\n        self.status['mode'] = 'train'\n\n    def evaluate(self):\n        with paddle.no_grad():\n            self._eval_with_loader(self.loader)\n"
  },
  {
    "path": "ppdet/ext_op/README.md",
    "content": "# 自定义OP编译\n旋转框IOU计算OP是参考[自定义外部算子](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html) 。\n\n## 1. 环境依赖\n- Paddle >= 2.0.1\n- gcc 8.2\n\n## 2. 安装\n```\npython setup.py install\n```\n\n编译完成后即可使用，以下为`rbox_iou`的使用示例\n```\n# 引入自定义op\nfrom ext_op import rbox_iou\n\npaddle.set_device('gpu:0')\npaddle.disable_static()\n\nrbox1 = np.random.rand(13000, 5)\nrbox2 = np.random.rand(7, 5)\n\npd_rbox1 = paddle.to_tensor(rbox1)\npd_rbox2 = paddle.to_tensor(rbox2)\n\niou = rbox_iou(pd_rbox1, pd_rbox2)\nprint('iou', iou)\n```\n\n## 3. 单元测试\n可以通过执行单元测试来确认自定义算子功能的正确性，执行单元测试的示例如下所示：\n```\npython unittest/test_matched_rbox_iou.py\n```\n"
  },
  {
    "path": "ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cc",
    "content": "//   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n// The code is based on\n// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/\n\n#include \"../rbox_iou/rbox_iou_utils.h\"\n#include \"paddle/extension.h\"\n\ntemplate <typename T>\nvoid matched_rbox_iou_cpu_kernel(const int rbox_num, const T *rbox1_data_ptr,\n                                 const T *rbox2_data_ptr, T *output_data_ptr) {\n\n  int i;\n  for (i = 0; i < rbox_num; i++) {\n    output_data_ptr[i] =\n        rbox_iou_single<T>(rbox1_data_ptr + i * 5, rbox2_data_ptr + i * 5);\n  }\n}\n\n#define CHECK_INPUT_CPU(x)                                                     \\\n  PD_CHECK(x.is_cpu(), #x \" must be a CPU Tensor.\")\n\nstd::vector<paddle::Tensor>\nMatchedRboxIouCPUForward(const paddle::Tensor &rbox1,\n                         const paddle::Tensor &rbox2) {\n  CHECK_INPUT_CPU(rbox1);\n  CHECK_INPUT_CPU(rbox2);\n  PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], \"inputs must be same dim\");\n\n  auto rbox_num = rbox1.shape()[0];\n  auto output = paddle::empty({rbox_num}, rbox1.dtype(), paddle::CPUPlace());\n\n  PD_DISPATCH_FLOATING_TYPES(rbox1.type(), \"matched_rbox_iou_cpu_kernel\", ([&] {\n                               matched_rbox_iou_cpu_kernel<data_t>(\n                                   rbox_num, rbox1.data<data_t>(),\n                                   rbox2.data<data_t>(), output.data<data_t>());\n                             }));\n\n  return {output};\n}\n\n#ifdef PADDLE_WITH_CUDA\nstd::vector<paddle::Tensor>\nMatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,\n                          const paddle::Tensor &rbox2);\n#endif\n\n#define CHECK_INPUT_SAME(x1, x2)                                               \\\n  PD_CHECK(x1.place() == x2.place(), \"input must be smae pacle.\")\n\nstd::vector<paddle::Tensor> MatchedRboxIouForward(const paddle::Tensor &rbox1,\n                                                  const paddle::Tensor &rbox2) {\n  CHECK_INPUT_SAME(rbox1, rbox2);\n  if (rbox1.is_cpu()) {\n    return MatchedRboxIouCPUForward(rbox1, rbox2);\n#ifdef PADDLE_WITH_CUDA\n  } else if (rbox1.is_gpu()) {\n    return MatchedRboxIouCUDAForward(rbox1, rbox2);\n#endif\n  }\n}\n\nstd::vector<std::vector<int64_t>>\nMatchedRboxIouInferShape(std::vector<int64_t> rbox1_shape,\n                         std::vector<int64_t> rbox2_shape) {\n  return {{rbox1_shape[0]}};\n}\n\nstd::vector<paddle::DataType> MatchedRboxIouInferDtype(paddle::DataType t1,\n                                                       paddle::DataType t2) {\n  return {t1};\n}\n\nPD_BUILD_OP(matched_rbox_iou)\n    .Inputs({\"RBOX1\", \"RBOX2\"})\n    .Outputs({\"Output\"})\n    .SetKernelFn(PD_KERNEL(MatchedRboxIouForward))\n    .SetInferShapeFn(PD_INFER_SHAPE(MatchedRboxIouInferShape))\n    .SetInferDtypeFn(PD_INFER_DTYPE(MatchedRboxIouInferDtype));\n"
  },
  {
    "path": "ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cu",
    "content": "//   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n// The code is based on\n// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/\n\n#include \"../rbox_iou/rbox_iou_utils.h\"\n#include \"paddle/extension.h\"\n\ntemplate <typename T>\n__global__ void\nmatched_rbox_iou_cuda_kernel(const int rbox_num, const T *rbox1_data_ptr,\n                             const T *rbox2_data_ptr, T *output_data_ptr) {\n  for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < rbox_num;\n       tid += blockDim.x * gridDim.x) {\n    output_data_ptr[tid] =\n        rbox_iou_single<T>(rbox1_data_ptr + tid * 5, rbox2_data_ptr + tid * 5);\n  }\n}\n\n#define CHECK_INPUT_GPU(x)                                                     \\\n  PD_CHECK(x.is_gpu(), #x \" must be a GPU Tensor.\")\n\nstd::vector<paddle::Tensor>\nMatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,\n                          const paddle::Tensor &rbox2) {\n  CHECK_INPUT_GPU(rbox1);\n  CHECK_INPUT_GPU(rbox2);\n  PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], \"inputs must be same dim\");\n\n  auto rbox_num = rbox1.shape()[0];\n\n  auto output = paddle::empty({rbox_num}, rbox1.dtype(), paddle::GPUPlace());\n\n  const int thread_per_block = 512;\n  const int block_per_grid = CeilDiv(rbox_num, thread_per_block);\n\n  PD_DISPATCH_FLOATING_TYPES(\n      rbox1.type(), \"matched_rbox_iou_cuda_kernel\", ([&] {\n        matched_rbox_iou_cuda_kernel<\n            data_t><<<block_per_grid, thread_per_block, 0, rbox1.stream()>>>(\n            rbox_num, rbox1.data<data_t>(), rbox2.data<data_t>(),\n            output.data<data_t>());\n      }));\n\n  return {output};\n}\n"
  },
  {
    "path": "ppdet/ext_op/csrc/nms_rotated/nms_rotated.cc",
    "content": "//   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"../rbox_iou/rbox_iou_utils.h\"\n#include \"paddle/extension.h\"\n\ntemplate <typename T>\nvoid nms_rotated_cpu_kernel(const T *boxes_data, const float threshold,\n                            const int64_t num_boxes, int64_t *num_keep_boxes,\n                            int64_t *output_data) {\n\n  int num_masks = CeilDiv(num_boxes, 64);\n  std::vector<int64_t> masks(num_masks, 0);\n  for (int64_t i = 0; i < num_boxes; ++i) {\n    if (masks[i / 64] & 1ULL << (i % 64))\n      continue;\n    T box_1[5];\n    for (int k = 0; k < 5; ++k) {\n      box_1[k] = boxes_data[i * 5 + k];\n    }\n    for (int64_t j = i + 1; j < num_boxes; ++j) {\n      if (masks[j / 64] & 1ULL << (j % 64))\n        continue;\n      T box_2[5];\n      for (int k = 0; k < 5; ++k) {\n        box_2[k] = boxes_data[j * 5 + k];\n      }\n      if (rbox_iou_single<T>(box_1, box_2) > threshold) {\n        masks[j / 64] |= 1ULL << (j % 64);\n      }\n    }\n  }\n  int64_t output_data_idx = 0;\n  for (int64_t i = 0; i < num_boxes; ++i) {\n    if (masks[i / 64] & 1ULL << (i % 64))\n      continue;\n    output_data[output_data_idx++] = i;\n  }\n  *num_keep_boxes = output_data_idx;\n  for (; output_data_idx < num_boxes; ++output_data_idx) {\n    output_data[output_data_idx] = 0;\n  }\n}\n\n#define CHECK_INPUT_CPU(x)                                                     \\\n  PD_CHECK(x.is_cpu(), #x \" must be a CPU Tensor.\")\n\nstd::vector<paddle::Tensor> NMSRotatedCPUForward(const paddle::Tensor &boxes,\n                                                 const paddle::Tensor &scores,\n                                                 float threshold) {\n  CHECK_INPUT_CPU(boxes);\n  CHECK_INPUT_CPU(scores);\n\n  auto num_boxes = boxes.shape()[0];\n\n  auto order_t =\n      std::get<1>(paddle::argsort(scores, /* axis=*/0, /* descending=*/true));\n  auto boxes_sorted = paddle::gather(boxes, order_t, /* axis=*/0);\n\n  auto keep =\n      paddle::empty({num_boxes}, paddle::DataType::INT64, paddle::CPUPlace());\n  int64_t num_keep_boxes = 0;\n\n  PD_DISPATCH_FLOATING_TYPES(boxes.type(), \"nms_rotated_cpu_kernel\", ([&] {\n                               nms_rotated_cpu_kernel<data_t>(\n                                   boxes_sorted.data<data_t>(), threshold,\n                                   num_boxes, &num_keep_boxes,\n                                   keep.data<int64_t>());\n                             }));\n\n  keep = keep.slice(0, num_keep_boxes);\n  return {paddle::gather(order_t, keep, /* axis=*/0)};\n}\n\n#ifdef PADDLE_WITH_CUDA\nstd::vector<paddle::Tensor> NMSRotatedCUDAForward(const paddle::Tensor &boxes,\n                                                  const paddle::Tensor &scores,\n                                                  float threshold);\n#endif\n\nstd::vector<paddle::Tensor> NMSRotatedForward(const paddle::Tensor &boxes,\n                                  
            const paddle::Tensor &scores,\n                                              float threshold) {\n  if (boxes.is_cpu()) {\n    return NMSRotatedCPUForward(boxes, scores, threshold);\n#ifdef PADDLE_WITH_CUDA\n  } else if (boxes.is_gpu()) {\n    return NMSRotatedCUDAForward(boxes, scores, threshold);\n#endif\n  }\n  // avoid falling off the end of a value-returning function when the\n  // tensors are on an unsupported place (e.g. GPU without PADDLE_WITH_CUDA)\n  PD_THROW(\"nms_rotated is not implemented for the given place.\");\n}\n\nstd::vector<std::vector<int64_t>>\nNMSRotatedInferShape(std::vector<int64_t> boxes_shape,\n                     std::vector<int64_t> scores_shape) {\n  return {{-1}};\n}\n\nstd::vector<paddle::DataType> NMSRotatedInferDtype(paddle::DataType t1,\n                                                   paddle::DataType t2) {\n  return {paddle::DataType::INT64};\n}\n\nPD_BUILD_OP(nms_rotated)\n    .Inputs({\"Boxes\", \"Scores\"})\n    .Outputs({\"Output\"})\n    .Attrs({\"threshold: float\"})\n    .SetKernelFn(PD_KERNEL(NMSRotatedForward))\n    .SetInferShapeFn(PD_INFER_SHAPE(NMSRotatedInferShape))\n    .SetInferDtypeFn(PD_INFER_DTYPE(NMSRotatedInferDtype));\n
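\n// Example (Python), assuming the extension has been built via setup.py:\n//   from ext_op import nms_rotated\n//   keep = nms_rotated(boxes, scores, 0.1)  # boxes: [N, 5], scores: [N] -> kept indices\n"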
  },
  {
    "path": "ppdet/ext_op/csrc/nms_rotated/nms_rotated.cu",
    "content": "//   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"../rbox_iou/rbox_iou_utils.h\"\n#include \"paddle/extension.h\"\n\nstatic const int64_t threadsPerBlock = sizeof(int64_t) * 8;\n\ntemplate <typename T>\n__global__ void\nnms_rotated_cuda_kernel(const T *boxes_data, const float threshold,\n                        const int64_t num_boxes, int64_t *masks) {\n  auto raw_start = blockIdx.y;\n  auto col_start = blockIdx.x;\n  if (raw_start > col_start)\n    return;\n  const int raw_last_storage =\n      min(num_boxes - raw_start * threadsPerBlock, threadsPerBlock);\n  const int col_last_storage =\n      min(num_boxes - col_start * threadsPerBlock, threadsPerBlock);\n  if (threadIdx.x < raw_last_storage) {\n    int64_t mask = 0;\n    auto current_box_idx = raw_start * threadsPerBlock + threadIdx.x;\n    const T *current_box = boxes_data + current_box_idx * 5;\n    for (int i = 0; i < col_last_storage; ++i) {\n      const T *target_box = boxes_data + (col_start * threadsPerBlock + i) * 5;\n      if (rbox_iou_single<T>(current_box, target_box) > threshold) {\n        mask |= 1ULL << i;\n      }\n    }\n    const int blocks_per_line = CeilDiv(num_boxes, threadsPerBlock);\n    masks[current_box_idx * blocks_per_line + col_start] = mask;\n  }\n}\n\n#define CHECK_INPUT_GPU(x)                                                     \\\n  PD_CHECK(x.is_gpu(), #x \" must be a GPU Tensor.\")\n\nstd::vector<paddle::Tensor> NMSRotatedCUDAForward(const paddle::Tensor &boxes,\n                                                  const paddle::Tensor &scores,\n                                                  float threshold) {\n  CHECK_INPUT_GPU(boxes);\n  CHECK_INPUT_GPU(scores);\n\n  auto num_boxes = boxes.shape()[0];\n  auto order_t =\n      std::get<1>(paddle::argsort(scores, /* axis=*/0, /* descending=*/true));\n  auto boxes_sorted = paddle::gather(boxes, order_t, /* axis=*/0);\n\n  const auto blocks_per_line = CeilDiv(num_boxes, threadsPerBlock);\n  dim3 block(threadsPerBlock);\n  dim3 grid(blocks_per_line, blocks_per_line);\n  auto mask_dev = paddle::empty({num_boxes * blocks_per_line},\n                                paddle::DataType::INT64, paddle::GPUPlace());\n\n  PD_DISPATCH_FLOATING_TYPES(\n      boxes.type(), \"nms_rotated_cuda_kernel\", ([&] {\n        nms_rotated_cuda_kernel<data_t><<<grid, block, 0, boxes.stream()>>>(\n            boxes_sorted.data<data_t>(), threshold, num_boxes,\n            mask_dev.data<int64_t>());\n      }));\n\n  auto mask_host = mask_dev.copy_to(paddle::CPUPlace(), true);\n  auto keep_host =\n      paddle::empty({num_boxes}, paddle::DataType::INT64, paddle::CPUPlace());\n  int64_t *keep_host_ptr = keep_host.data<int64_t>();\n  int64_t *mask_host_ptr = mask_host.data<int64_t>();\n  std::vector<int64_t> remv(blocks_per_line);\n  int64_t last_box_num = 0;\n  for (int64_t i = 0; i < num_boxes; ++i) {\n    auto remv_element_id = i / threadsPerBlock;\n    auto remv_bit_id 
= i % threadsPerBlock;\n    if (!(remv[remv_element_id] & 1ULL << remv_bit_id)) {\n      keep_host_ptr[last_box_num++] = i;\n      int64_t *current_mask = mask_host_ptr + i * blocks_per_line;\n      for (auto j = remv_element_id; j < blocks_per_line; ++j) {\n        remv[j] |= current_mask[j];\n      }\n    }\n  }\n\n  keep_host = keep_host.slice(0, last_box_num);\n  auto keep_dev = keep_host.copy_to(paddle::GPUPlace(), true);\n  return {paddle::gather(order_t, keep_dev, /* axis=*/0)};\n}"
  },
  {
    "path": "ppdet/ext_op/csrc/rbox_iou/rbox_iou.cc",
    "content": "//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n// The code is based on\n// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/\n\n#include \"paddle/extension.h\"\n#include \"rbox_iou_utils.h\"\n\ntemplate <typename T>\nvoid rbox_iou_cpu_kernel(const int rbox1_num, const int rbox2_num,\n                         const T *rbox1_data_ptr, const T *rbox2_data_ptr,\n                         T *output_data_ptr) {\n\n  int i, j;\n  for (i = 0; i < rbox1_num; i++) {\n    for (j = 0; j < rbox2_num; j++) {\n      int offset = i * rbox2_num + j;\n      output_data_ptr[offset] =\n          rbox_iou_single<T>(rbox1_data_ptr + i * 5, rbox2_data_ptr + j * 5);\n    }\n  }\n}\n\n#define CHECK_INPUT_CPU(x)                                                     \\\n  PD_CHECK(x.is_cpu(), #x \" must be a CPU Tensor.\")\n\nstd::vector<paddle::Tensor> RboxIouCPUForward(const paddle::Tensor &rbox1,\n                                              const paddle::Tensor &rbox2) {\n  CHECK_INPUT_CPU(rbox1);\n  CHECK_INPUT_CPU(rbox2);\n\n  auto rbox1_num = rbox1.shape()[0];\n  auto rbox2_num = rbox2.shape()[0];\n\n  auto output =\n      paddle::empty({rbox1_num, rbox2_num}, rbox1.dtype(), paddle::CPUPlace());\n\n  PD_DISPATCH_FLOATING_TYPES(rbox1.type(), \"rbox_iou_cpu_kernel\", ([&] {\n                               rbox_iou_cpu_kernel<data_t>(\n                                   rbox1_num, rbox2_num, rbox1.data<data_t>(),\n                                   rbox2.data<data_t>(), output.data<data_t>());\n                             }));\n\n  return {output};\n}\n\n#ifdef PADDLE_WITH_CUDA\nstd::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor &rbox1,\n                                               const paddle::Tensor &rbox2);\n#endif\n\n#define CHECK_INPUT_SAME(x1, x2)                                               \\\n  PD_CHECK(x1.place() == x2.place(), \"input must be smae pacle.\")\n\nstd::vector<paddle::Tensor> RboxIouForward(const paddle::Tensor &rbox1,\n                                           const paddle::Tensor &rbox2) {\n  CHECK_INPUT_SAME(rbox1, rbox2);\n  if (rbox1.is_cpu()) {\n    return RboxIouCPUForward(rbox1, rbox2);\n#ifdef PADDLE_WITH_CUDA\n  } else if (rbox1.is_gpu()) {\n    return RboxIouCUDAForward(rbox1, rbox2);\n#endif\n  }\n}\n\nstd::vector<std::vector<int64_t>>\nRboxIouInferShape(std::vector<int64_t> rbox1_shape,\n                  std::vector<int64_t> rbox2_shape) {\n  return {{rbox1_shape[0], rbox2_shape[0]}};\n}\n\nstd::vector<paddle::DataType> RboxIouInferDtype(paddle::DataType t1,\n                                                paddle::DataType t2) {\n  return {t1};\n}\n\nPD_BUILD_OP(rbox_iou)\n    .Inputs({\"RBox1\", \"RBox2\"})\n    .Outputs({\"Output\"})\n    .SetKernelFn(PD_KERNEL(RboxIouForward))\n    .SetInferShapeFn(PD_INFER_SHAPE(RboxIouInferShape))\n    .SetInferDtypeFn(PD_INFER_DTYPE(RboxIouInferDtype));\n"
  },
  {
    "path": "ppdet/ext_op/csrc/rbox_iou/rbox_iou.cu",
    "content": "//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n// The code is based on\n// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/\n\n#include \"paddle/extension.h\"\n#include \"rbox_iou_utils.h\"\n\n// 2D block with 32 * 16 = 512 threads per block\nconst int BLOCK_DIM_X = 32;\nconst int BLOCK_DIM_Y = 16;\n\ntemplate <typename T>\n__global__ void rbox_iou_cuda_kernel(const int rbox1_num, const int rbox2_num,\n                                     const T *rbox1_data_ptr,\n                                     const T *rbox2_data_ptr,\n                                     T *output_data_ptr) {\n\n  // get row_start and col_start\n  const int rbox1_block_idx = blockIdx.x * blockDim.x;\n  const int rbox2_block_idx = blockIdx.y * blockDim.y;\n\n  const int rbox1_thread_num = min(rbox1_num - rbox1_block_idx, blockDim.x);\n  const int rbox2_thread_num = min(rbox2_num - rbox2_block_idx, blockDim.y);\n\n  __shared__ T block_boxes1[BLOCK_DIM_X * 5];\n  __shared__ T block_boxes2[BLOCK_DIM_Y * 5];\n\n  // It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y\n  if (threadIdx.x < rbox1_thread_num && threadIdx.y == 0) {\n    block_boxes1[threadIdx.x * 5 + 0] =\n        rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 0];\n    block_boxes1[threadIdx.x * 5 + 1] =\n        rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 1];\n    block_boxes1[threadIdx.x * 5 + 2] =\n        rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 2];\n    block_boxes1[threadIdx.x * 5 + 3] =\n        rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 3];\n    block_boxes1[threadIdx.x * 5 + 4] =\n        rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 4];\n  }\n\n  // threadIdx.x < BLOCK_DIM_Y=rbox2_thread_num, just use same condition as\n  // above: threadIdx.y == 0\n  if (threadIdx.x < rbox2_thread_num && threadIdx.y == 0) {\n    block_boxes2[threadIdx.x * 5 + 0] =\n        rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 0];\n    block_boxes2[threadIdx.x * 5 + 1] =\n        rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 1];\n    block_boxes2[threadIdx.x * 5 + 2] =\n        rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 2];\n    block_boxes2[threadIdx.x * 5 + 3] =\n        rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 3];\n    block_boxes2[threadIdx.x * 5 + 4] =\n        rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 4];\n  }\n\n  // sync\n  __syncthreads();\n\n  if (threadIdx.x < rbox1_thread_num && threadIdx.y < rbox2_thread_num) {\n    int offset = (rbox1_block_idx + threadIdx.x) * rbox2_num + rbox2_block_idx +\n                 threadIdx.y;\n    output_data_ptr[offset] = rbox_iou_single<T>(\n        block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5);\n  }\n}\n\n#define CHECK_INPUT_GPU(x)                                                     \\\n  PD_CHECK(x.is_gpu(), #x \" must be a GPU 
Tensor.\")\n\nstd::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor &rbox1,\n                                               const paddle::Tensor &rbox2) {\n  CHECK_INPUT_GPU(rbox1);\n  CHECK_INPUT_GPU(rbox2);\n\n  auto rbox1_num = rbox1.shape()[0];\n  auto rbox2_num = rbox2.shape()[0];\n\n  auto output =\n      paddle::empty({rbox1_num, rbox2_num}, rbox1.dtype(), paddle::GPUPlace());\n\n  const int blocks_x = CeilDiv(rbox1_num, BLOCK_DIM_X);\n  const int blocks_y = CeilDiv(rbox2_num, BLOCK_DIM_Y);\n\n  dim3 blocks(blocks_x, blocks_y);\n  dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);\n\n  PD_DISPATCH_FLOATING_TYPES(\n      rbox1.type(), \"rbox_iou_cuda_kernel\", ([&] {\n        rbox_iou_cuda_kernel<data_t><<<blocks, threads, 0, rbox1.stream()>>>(\n            rbox1_num, rbox2_num, rbox1.data<data_t>(), rbox2.data<data_t>(),\n            output.data<data_t>());\n      }));\n\n  return {output};\n}\n"
  },
  {
    "path": "ppdet/ext_op/csrc/rbox_iou/rbox_iou_utils.h",
    "content": "//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n// The code is based on\n// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/\n\n#pragma once\n\n#include <cassert>\n#include <cmath>\n#include <vector>\n\n#ifdef __CUDACC__\n// Designates functions callable from the host (CPU) and the device (GPU)\n#define HOST_DEVICE __host__ __device__\n#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__\n#else\n#include <algorithm>\n#define HOST_DEVICE\n#define HOST_DEVICE_INLINE HOST_DEVICE inline\n#endif\n\nnamespace {\n\ntemplate <typename T> struct RotatedBox { T x_ctr, y_ctr, w, h, a; };\n\ntemplate <typename T> struct Point {\n  T x, y;\n  HOST_DEVICE_INLINE Point(const T &px = 0, const T &py = 0) : x(px), y(py) {}\n  HOST_DEVICE_INLINE Point operator+(const Point &p) const {\n    return Point(x + p.x, y + p.y);\n  }\n  HOST_DEVICE_INLINE Point &operator+=(const Point &p) {\n    x += p.x;\n    y += p.y;\n    return *this;\n  }\n  HOST_DEVICE_INLINE Point operator-(const Point &p) const {\n    return Point(x - p.x, y - p.y);\n  }\n  HOST_DEVICE_INLINE Point operator*(const T coeff) const {\n    return Point(x * coeff, y * coeff);\n  }\n};\n\ntemplate <typename T>\nHOST_DEVICE_INLINE T dot_2d(const Point<T> &A, const Point<T> &B) {\n  return A.x * B.x + A.y * B.y;\n}\n\ntemplate <typename T>\nHOST_DEVICE_INLINE T cross_2d(const Point<T> &A, const Point<T> &B) {\n  return A.x * B.y - B.x * A.y;\n}\n\ntemplate <typename T>\nHOST_DEVICE_INLINE void get_rotated_vertices(const RotatedBox<T> &box,\n                                             Point<T> (&pts)[4]) {\n  // M_PI / 180. 
== 0.01745329251\n  // double theta = box.a * 0.01745329251;\n  // MODIFIED\n  double theta = box.a;\n  T cosTheta2 = (T)cos(theta) * 0.5f;\n  T sinTheta2 = (T)sin(theta) * 0.5f;\n\n  // y: top --> down; x: left --> right\n  pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;\n  pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;\n  pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;\n  pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;\n  pts[2].x = 2 * box.x_ctr - pts[0].x;\n  pts[2].y = 2 * box.y_ctr - pts[0].y;\n  pts[3].x = 2 * box.x_ctr - pts[1].x;\n  pts[3].y = 2 * box.y_ctr - pts[1].y;\n}\n\ntemplate <typename T>\nHOST_DEVICE_INLINE int get_intersection_points(const Point<T> (&pts1)[4],\n                                               const Point<T> (&pts2)[4],\n                                               Point<T> (&intersections)[24]) {\n  // Line vector\n  // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]\n  Point<T> vec1[4], vec2[4];\n  for (int i = 0; i < 4; i++) {\n    vec1[i] = pts1[(i + 1) % 4] - pts1[i];\n    vec2[i] = pts2[(i + 1) % 4] - pts2[i];\n  }\n\n  // Line test - test all line combos for intersection\n  int num = 0; // number of intersections\n  for (int i = 0; i < 4; i++) {\n    for (int j = 0; j < 4; j++) {\n      // Solve for 2x2 Ax=b\n      T det = cross_2d<T>(vec2[j], vec1[i]);\n\n      // This takes care of parallel lines\n      if (fabs(det) <= 1e-14) {\n        continue;\n      }\n\n      auto vec12 = pts2[j] - pts1[i];\n\n      T t1 = cross_2d<T>(vec2[j], vec12) / det;\n      T t2 = cross_2d<T>(vec1[i], vec12) / det;\n\n      if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) {\n        intersections[num++] = pts1[i] + vec1[i] * t1;\n      }\n    }\n  }\n\n  // Check for vertices of rect1 inside rect2\n  {\n    const auto &AB = vec2[0];\n    const auto &DA = vec2[3];\n    auto ABdotAB = dot_2d<T>(AB, AB);\n    auto ADdotAD = dot_2d<T>(DA, DA);\n    for (int i = 0; i < 4; i++) {\n      // assume ABCD is the rectangle, and P is the point to be judged\n      // P is inside ABCD iff. 
P's projection on AB lies within AB\n      // and P's projection on AD lies within AD\n\n      auto AP = pts1[i] - pts2[0];\n\n      auto APdotAB = dot_2d<T>(AP, AB);\n      auto APdotAD = -dot_2d<T>(AP, DA);\n\n      if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&\n          (APdotAD <= ADdotAD)) {\n        intersections[num++] = pts1[i];\n      }\n    }\n  }\n\n  // Reverse the check - check for vertices of rect2 inside rect1\n  {\n    const auto &AB = vec1[0];\n    const auto &DA = vec1[3];\n    auto ABdotAB = dot_2d<T>(AB, AB);\n    auto ADdotAD = dot_2d<T>(DA, DA);\n    for (int i = 0; i < 4; i++) {\n      auto AP = pts2[i] - pts1[0];\n\n      auto APdotAB = dot_2d<T>(AP, AB);\n      auto APdotAD = -dot_2d<T>(AP, DA);\n\n      if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&\n          (APdotAD <= ADdotAD)) {\n        intersections[num++] = pts2[i];\n      }\n    }\n  }\n\n  return num;\n}\n\ntemplate <typename T>\nHOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24],\n                                          const int &num_in, Point<T> (&q)[24],\n                                          bool shift_to_zero = false) {\n  assert(num_in >= 2);\n\n  // Step 1:\n  // Find point with minimum y\n  // if more than 1 points have the same minimum y,\n  // pick the one with the minimum x.\n  int t = 0;\n  for (int i = 1; i < num_in; i++) {\n    if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {\n      t = i;\n    }\n  }\n  auto &start = p[t]; // starting point\n\n  // Step 2:\n  // Subtract starting point from every points (for sorting in the next step)\n  for (int i = 0; i < num_in; i++) {\n    q[i] = p[i] - start;\n  }\n\n  // Swap the starting point to position 0\n  auto tmp = q[0];\n  q[0] = q[t];\n  q[t] = tmp;\n\n  // Step 3:\n  // Sort point 1 ~ num_in according to their relative cross-product values\n  // (essentially sorting according to angles)\n  // If the angles are the same, sort according to their distance to origin\n  T dist[24];\n  for (int i = 0; i < num_in; i++) {\n    dist[i] = dot_2d<T>(q[i], q[i]);\n  }\n\n#ifdef __CUDACC__\n  // CUDA version\n  // In the future, we can potentially use thrust\n  // for sorting here to improve speed (though not guaranteed)\n  for (int i = 1; i < num_in - 1; i++) {\n    for (int j = i + 1; j < num_in; j++) {\n      T crossProduct = cross_2d<T>(q[i], q[j]);\n      if ((crossProduct < -1e-6) ||\n          (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {\n        auto q_tmp = q[i];\n        q[i] = q[j];\n        q[j] = q_tmp;\n        auto dist_tmp = dist[i];\n        dist[i] = dist[j];\n        dist[j] = dist_tmp;\n      }\n    }\n  }\n#else\n  // CPU version\n  std::sort(q + 1, q + num_in,\n            [](const Point<T> &A, const Point<T> &B) -> bool {\n              T temp = cross_2d<T>(A, B);\n              if (fabs(temp) < 1e-6) {\n                return dot_2d<T>(A, A) < dot_2d<T>(B, B);\n              } else {\n                return temp > 0;\n              }\n            });\n#endif\n\n  // Step 4:\n  // Make sure there are at least 2 points (that don't overlap with each other)\n  // in the stack\n  int k; // index of the non-overlapped second point\n  for (k = 1; k < num_in; k++) {\n    if (dist[k] > 1e-8) {\n      break;\n    }\n  }\n  if (k == num_in) {\n    // We reach the end, which means the convex hull is just one point\n    q[0] = p[t];\n    return 1;\n  }\n  q[1] = q[k];\n  int m = 2; // 2 points in the stack\n  // Step 5:\n  // Finally we can start the scanning 
process.\n  // When a non-convex relationship between the 3 points is found\n  // (either concave shape or duplicated points),\n  // we pop the previous point from the stack\n  // until the 3-point relationship is convex again, or\n  // until the stack only contains two points\n  for (int i = k + 1; i < num_in; i++) {\n    while (m > 1 && cross_2d<T>(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) {\n      m--;\n    }\n    q[m++] = q[i];\n  }\n\n  // Step 6 (Optional):\n  // In general sense we need the original coordinates, so we\n  // need to shift the points back (reverting Step 2)\n  // But if we're only interested in getting the area/perimeter of the shape\n  // We can simply return.\n  if (!shift_to_zero) {\n    for (int i = 0; i < m; i++) {\n      q[i] += start;\n    }\n  }\n\n  return m;\n}\n\ntemplate <typename T>\nHOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int &m) {\n  if (m <= 2) {\n    return 0;\n  }\n\n  T area = 0;\n  for (int i = 1; i < m - 1; i++) {\n    area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0]));\n  }\n\n  return area / 2.0;\n}\n\ntemplate <typename T>\nHOST_DEVICE_INLINE T rboxes_intersection(const RotatedBox<T> &box1,\n                                         const RotatedBox<T> &box2) {\n  // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned\n  // from rotated_rect_intersection_pts\n  Point<T> intersectPts[24], orderedPts[24];\n\n  Point<T> pts1[4];\n  Point<T> pts2[4];\n  get_rotated_vertices<T>(box1, pts1);\n  get_rotated_vertices<T>(box2, pts2);\n\n  int num = get_intersection_points<T>(pts1, pts2, intersectPts);\n\n  if (num <= 2) {\n    return 0.0;\n  }\n\n  // Convex Hull to order the intersection points in clockwise order and find\n  // the contour area.\n  int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true);\n  return polygon_area<T>(orderedPts, num_convex);\n}\n\n} // namespace\n\ntemplate <typename T>\nHOST_DEVICE_INLINE T rbox_iou_single(T const *const box1_raw,\n                                     T const *const box2_raw) {\n  // shift center to the middle point to achieve higher precision in result\n  RotatedBox<T> box1, box2;\n  auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0;\n  auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0;\n  box1.x_ctr = box1_raw[0] - center_shift_x;\n  box1.y_ctr = box1_raw[1] - center_shift_y;\n  box1.w = box1_raw[2];\n  box1.h = box1_raw[3];\n  box1.a = box1_raw[4];\n  box2.x_ctr = box2_raw[0] - center_shift_x;\n  box2.y_ctr = box2_raw[1] - center_shift_y;\n  box2.w = box2_raw[2];\n  box2.h = box2_raw[3];\n  box2.a = box2_raw[4];\n\n  if (box1.w < 1e-2 || box1.h < 1e-2 || box2.w < 1e-2 || box2.h < 1e-2) {\n    return 0.f;\n  }\n  const T area1 = box1.w * box1.h;\n  const T area2 = box2.w * box2.h;\n\n  const T intersection = rboxes_intersection<T>(box1, box2);\n  const T iou = intersection / (area1 + area2 - intersection);\n  return iou;\n}\n\n/**\n   Computes ceil(a / b)\n*/\n\nHOST_DEVICE inline int CeilDiv(const int a, const int b) {\n  return (a + b - 1) / b;\n}"
  },
  {
    "path": "ppdet/ext_op/setup.py",
    "content": "import os\nimport glob\nimport paddle\nfrom paddle.utils.cpp_extension import CppExtension, CUDAExtension, setup\n\n\ndef get_extensions():\n    root_dir = os.path.dirname(os.path.abspath(__file__))\n    ext_root_dir = os.path.join(root_dir, 'csrc')\n    sources = []\n    for ext_name in os.listdir(ext_root_dir):\n        ext_dir = os.path.join(ext_root_dir, ext_name)\n        source = glob.glob(os.path.join(ext_dir, '*.cc'))\n        kwargs = dict()\n        if paddle.device.is_compiled_with_cuda():\n            source += glob.glob(os.path.join(ext_dir, '*.cu'))\n\n        if not source:\n            continue\n\n        sources += source\n\n    if paddle.device.is_compiled_with_cuda():\n        extension = CUDAExtension(\n            sources, extra_compile_args={'cxx': ['-DPADDLE_WITH_CUDA']})\n    else:\n        extension = CppExtension(sources)\n\n    return extension\n\n\nif __name__ == \"__main__\":\n    setup(name='ext_op', ext_modules=get_extensions())\n"
  },
  {
    "path": "ppdet/ext_op/unittest/test_matched_rbox_iou.py",
    "content": "import numpy as np\nimport sys\nimport time\nfrom shapely.geometry import Polygon\nimport paddle\nimport unittest\n\nfrom ext_op import matched_rbox_iou\n\n\ndef rbox2poly_single(rrect, get_best_begin_point=False):\n    \"\"\"\n    rrect:[x_ctr,y_ctr,w,h,angle]\n    to\n    poly:[x0,y0,x1,y1,x2,y2,x3,y3]\n    \"\"\"\n    x_ctr, y_ctr, width, height, angle = rrect[:5]\n    tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2\n    # rect 2x4\n    rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]])\n    R = np.array([[np.cos(angle), -np.sin(angle)],\n                  [np.sin(angle), np.cos(angle)]])\n    # poly\n    poly = R.dot(rect)\n    x0, x1, x2, x3 = poly[0, :4] + x_ctr\n    y0, y1, y2, y3 = poly[1, :4] + y_ctr\n    poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float64)\n    return poly\n\n\ndef intersection(g, p):\n    \"\"\"\n    Intersection.\n    \"\"\"\n\n    g = g[:8].reshape((4, 2))\n    p = p[:8].reshape((4, 2))\n\n    a = g\n    b = p\n\n    use_filter = True\n    if use_filter:\n        # step1:\n        inter_x1 = np.maximum(np.min(a[:, 0]), np.min(b[:, 0]))\n        inter_x2 = np.minimum(np.max(a[:, 0]), np.max(b[:, 0]))\n        inter_y1 = np.maximum(np.min(a[:, 1]), np.min(b[:, 1]))\n        inter_y2 = np.minimum(np.max(a[:, 1]), np.max(b[:, 1]))\n        if inter_x1 >= inter_x2 or inter_y1 >= inter_y2:\n            return 0.\n        x1 = np.minimum(np.min(a[:, 0]), np.min(b[:, 0]))\n        x2 = np.maximum(np.max(a[:, 0]), np.max(b[:, 0]))\n        y1 = np.minimum(np.min(a[:, 1]), np.min(b[:, 1]))\n        y2 = np.maximum(np.max(a[:, 1]), np.max(b[:, 1]))\n        if x1 >= x2 or y1 >= y2 or (x2 - x1) < 2 or (y2 - y1) < 2:\n            return 0.\n\n    g = Polygon(g)\n    p = Polygon(p)\n    if not g.is_valid or not p.is_valid:\n        return 0\n\n    inter = Polygon(g).intersection(Polygon(p)).area\n    union = g.area + p.area - inter\n    if union == 0:\n        return 0\n    else:\n        return inter / union\n\n\ndef matched_rbox_overlaps(anchors, gt_bboxes, use_cv2=False):\n    \"\"\"\n\n    Args:\n        anchors: [M, 5]  x1,y1,x2,y2,angle\n        gt_bboxes: [M, 5]  x1,y1,x2,y2,angle\n\n    Returns:\n        macthed_iou: [M]\n    \"\"\"\n    assert anchors.shape[1] == 5\n    assert gt_bboxes.shape[1] == 5\n\n    gt_bboxes_ploy = [rbox2poly_single(e) for e in gt_bboxes]\n    anchors_ploy = [rbox2poly_single(e) for e in anchors]\n\n    num = len(anchors_ploy)\n    iou = np.zeros((num, ), dtype=np.float64)\n\n    start_time = time.time()\n    for i in range(num):\n        try:\n            iou[i] = intersection(gt_bboxes_ploy[i], anchors_ploy[i])\n        except Exception as e:\n            print('cur gt_bboxes_ploy[i]', gt_bboxes_ploy[i],\n                  'anchors_ploy[j]', anchors_ploy[i], e)\n    return iou\n\n\ndef gen_sample(n):\n    rbox = np.random.rand(n, 5)\n    rbox[:, 0:4] = rbox[:, 0:4] * 0.45 + 0.001\n    rbox[:, 4] = rbox[:, 4] - 0.5\n    return rbox\n\n\nclass MatchedRBoxIoUTest(unittest.TestCase):\n    def setUp(self):\n        self.initTestCase()\n        self.rbox1 = gen_sample(self.n)\n        self.rbox2 = gen_sample(self.n)\n\n    def initTestCase(self):\n        self.n = 1000\n\n    def assertAllClose(self, x, y, msg, atol=5e-1, rtol=1e-2):\n        self.assertTrue(np.allclose(x, y, atol=atol, rtol=rtol), msg=msg)\n\n    def get_places(self):\n        places = [paddle.CPUPlace()]\n        if paddle.device.is_compiled_with_cuda():\n            
places.append(paddle.CUDAPlace(0))\n\n        return places\n\n    def check_output(self, place):\n        paddle.disable_static()\n        pd_rbox1 = paddle.to_tensor(self.rbox1, place=place)\n        pd_rbox2 = paddle.to_tensor(self.rbox2, place=place)\n        actual_t = matched_rbox_iou(pd_rbox1, pd_rbox2).numpy()\n        poly_rbox1 = self.rbox1\n        poly_rbox2 = self.rbox2\n        poly_rbox1[:, 0:4] = self.rbox1[:, 0:4] * 1024\n        poly_rbox2[:, 0:4] = self.rbox2[:, 0:4] * 1024\n        expect_t = matched_rbox_overlaps(poly_rbox1, poly_rbox2, use_cv2=False)\n        self.assertAllClose(\n            actual_t,\n            expect_t,\n            msg=\"matched_rbox_iou has diff at {} \\nExpect {}\\nBut got {}\".format(\n                str(place), str(expect_t), str(actual_t)))\n\n    def test_output(self):\n        places = self.get_places()\n        for place in places:\n            self.check_output(place)\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"
  },
  {
    "path": "ppdet/ext_op/unittest/test_rbox_iou.py",
    "content": "import numpy as np\nimport sys\nimport time\nfrom shapely.geometry import Polygon\nimport paddle\nimport unittest\n\nfrom ext_op import rbox_iou\n\n\ndef rbox2poly_single(rrect, get_best_begin_point=False):\n    \"\"\"\n    rrect:[x_ctr,y_ctr,w,h,angle]\n    to\n    poly:[x0,y0,x1,y1,x2,y2,x3,y3]\n    \"\"\"\n    x_ctr, y_ctr, width, height, angle = rrect[:5]\n    tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2\n    # rect 2x4\n    rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]])\n    R = np.array([[np.cos(angle), -np.sin(angle)],\n                  [np.sin(angle), np.cos(angle)]])\n    # poly\n    poly = R.dot(rect)\n    x0, x1, x2, x3 = poly[0, :4] + x_ctr\n    y0, y1, y2, y3 = poly[1, :4] + y_ctr\n    poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float64)\n    return poly\n\n\ndef intersection(g, p):\n    \"\"\"\n    Intersection.\n    \"\"\"\n\n    g = g[:8].reshape((4, 2))\n    p = p[:8].reshape((4, 2))\n\n    a = g\n    b = p\n\n    use_filter = True\n    if use_filter:\n        # step1:\n        inter_x1 = np.maximum(np.min(a[:, 0]), np.min(b[:, 0]))\n        inter_x2 = np.minimum(np.max(a[:, 0]), np.max(b[:, 0]))\n        inter_y1 = np.maximum(np.min(a[:, 1]), np.min(b[:, 1]))\n        inter_y2 = np.minimum(np.max(a[:, 1]), np.max(b[:, 1]))\n        if inter_x1 >= inter_x2 or inter_y1 >= inter_y2:\n            return 0.\n        x1 = np.minimum(np.min(a[:, 0]), np.min(b[:, 0]))\n        x2 = np.maximum(np.max(a[:, 0]), np.max(b[:, 0]))\n        y1 = np.minimum(np.min(a[:, 1]), np.min(b[:, 1]))\n        y2 = np.maximum(np.max(a[:, 1]), np.max(b[:, 1]))\n        if x1 >= x2 or y1 >= y2 or (x2 - x1) < 2 or (y2 - y1) < 2:\n            return 0.\n\n    g = Polygon(g)\n    p = Polygon(p)\n    if not g.is_valid or not p.is_valid:\n        return 0\n\n    inter = Polygon(g).intersection(Polygon(p)).area\n    union = g.area + p.area - inter\n    if union == 0:\n        return 0\n    else:\n        return inter / union\n\n\ndef rbox_overlaps(anchors, gt_bboxes, use_cv2=False):\n    \"\"\"\n\n    Args:\n        anchors: [NA, 5]  x1,y1,x2,y2,angle\n        gt_bboxes: [M, 5]  x1,y1,x2,y2,angle\n\n    Returns:\n        iou: [NA, M]\n    \"\"\"\n    assert anchors.shape[1] == 5\n    assert gt_bboxes.shape[1] == 5\n\n    gt_bboxes_ploy = [rbox2poly_single(e) for e in gt_bboxes]\n    anchors_ploy = [rbox2poly_single(e) for e in anchors]\n\n    num_gt, num_anchors = len(gt_bboxes_ploy), len(anchors_ploy)\n    iou = np.zeros((num_anchors, num_gt), dtype=np.float64)\n\n    start_time = time.time()\n    for i in range(num_anchors):\n        for j in range(num_gt):\n            try:\n                iou[i, j] = intersection(anchors_ploy[i], gt_bboxes_ploy[j])\n            except Exception as e:\n                print('cur anchors_ploy[i]', anchors_ploy[i],\n                      'gt_bboxes_ploy[j]', gt_bboxes_ploy[j], e)\n    return iou\n\n\ndef gen_sample(n):\n    rbox = np.random.rand(n, 5)\n    rbox[:, 0:4] = rbox[:, 0:4] * 0.45 + 0.001\n    rbox[:, 4] = rbox[:, 4] - 0.5\n    return rbox\n\n\nclass RBoxIoUTest(unittest.TestCase):\n    def setUp(self):\n        self.initTestCase()\n        self.rbox1 = gen_sample(self.n)\n        self.rbox2 = gen_sample(self.m)\n\n    def initTestCase(self):\n        self.n = 13000\n        self.m = 7\n\n    def assertAllClose(self, x, y, msg, atol=5e-1, rtol=1e-2):\n        self.assertTrue(np.allclose(x, y, atol=atol, rtol=rtol), msg=msg)\n\n    def get_places(self):\n        places = 
[paddle.CPUPlace()]\n        if paddle.device.is_compiled_with_cuda():\n            places.append(paddle.CUDAPlace(0))\n\n        return places\n\n    def check_output(self, place):\n        paddle.disable_static()\n        pd_rbox1 = paddle.to_tensor(self.rbox1, place=place)\n        pd_rbox2 = paddle.to_tensor(self.rbox2, place=place)\n        actual_t = rbox_iou(pd_rbox1, pd_rbox2).numpy()\n        poly_rbox1 = self.rbox1\n        poly_rbox2 = self.rbox2\n        poly_rbox1[:, 0:4] = self.rbox1[:, 0:4] * 1024\n        poly_rbox2[:, 0:4] = self.rbox2[:, 0:4] * 1024\n        expect_t = rbox_overlaps(poly_rbox1, poly_rbox2, use_cv2=False)\n        self.assertAllClose(\n            actual_t,\n            expect_t,\n            msg=\"rbox_iou has diff at {} \\nExpect {}\\nBut got {}\".format(\n                str(place), str(expect_t), str(actual_t)))\n\n    def test_output(self):\n        places = self.get_places()\n        for place in places:\n            self.check_output(place)\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"
  },
  {
    "path": "ppdet/metrics/__init__.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom . import metrics\nfrom . import keypoint_metrics\n\nfrom .metrics import *\nfrom .keypoint_metrics import *\nfrom .pose3d_metrics import *\n\n__all__ = metrics.__all__ + keypoint_metrics.__all__\n\nfrom . import mot_metrics\nfrom .mot_metrics import *\n__all__ = metrics.__all__ + mot_metrics.__all__\n\nfrom . import mcmot_metrics\nfrom .mcmot_metrics import *\n__all__ = metrics.__all__ + mcmot_metrics.__all__\n\nfrom . import culane_metrics\nfrom .culane_metrics import *\n__all__ = metrics.__all__ + culane_metrics.__all__"
  },
  {
    "path": "ppdet/metrics/coco_utils.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\nimport numpy as np\nimport itertools\n\nfrom ppdet.metrics.json_results import get_det_res, get_det_poly_res, get_seg_res, get_solov2_segm_res, get_keypoint_res, get_pose3d_res\nfrom ppdet.metrics.map_utils import draw_pr_curve\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n\ndef get_infer_results(outs, catid, bias=0, save_threshold=0):\n    \"\"\"\n    Get result at the stage of inference.\n    The output format is dictionary containing bbox or mask result.\n\n    For example, bbox result is a list and each element contains\n    image_id, category_id, bbox and score.\n    \"\"\"\n    if outs is None or len(outs) == 0:\n        raise ValueError(\n            'The number of valid detection result if zero. Please use reasonable model and check input data.'\n        )\n\n    im_id = outs['im_id']\n    im_file = outs['im_file'] if 'im_file' in outs else None\n\n    infer_res = {}\n    if 'bbox' in outs:\n        if len(outs['bbox']) > 0 and len(outs['bbox'][0]) > 6:\n            infer_res['bbox'] = get_det_poly_res(\n                outs['bbox'], outs['bbox_num'], im_id, catid, bias=bias)\n        else:\n            infer_res['bbox'] = get_det_res(\n                outs['bbox'],\n                outs['bbox_num'],\n                im_id,\n                catid,\n                bias=bias,\n                im_file=im_file,\n                save_threshold=save_threshold)\n\n    if 'mask' in outs:\n        # mask post process\n        infer_res['mask'] = get_seg_res(outs['mask'], outs['bbox'],\n                                        outs['bbox_num'], im_id, catid)\n\n    if 'segm' in outs:\n        infer_res['segm'] = get_solov2_segm_res(outs, im_id, catid)\n\n    if 'keypoint' in outs:\n        infer_res['keypoint'] = get_keypoint_res(outs, im_id)\n        outs['bbox_num'] = [len(infer_res['keypoint'])]\n\n    if 'pose3d' in outs:\n        infer_res['pose3d'] = get_pose3d_res(outs, im_id)\n        outs['bbox_num'] = [len(infer_res['pose3d'])]\n\n    return infer_res\n\n\ndef cocoapi_eval(jsonfile,\n                 style,\n                 coco_gt=None,\n                 anno_file=None,\n                 max_dets=(100, 300, 1000),\n                 classwise=False,\n                 sigmas=None,\n                 use_area=True):\n    \"\"\"\n    Args:\n        jsonfile (str): Evaluation json file, eg: bbox.json, mask.json.\n        style (str): COCOeval style, can be `bbox` , `segm` , `proposal`, `keypoints` and `keypoints_crowd`.\n        coco_gt (str): Whether to load COCOAPI through anno_file,\n                 eg: coco_gt = COCO(anno_file)\n        anno_file (str): COCO annotations file.\n        max_dets (tuple): COCO evaluation maxDets.\n        classwise (bool): Whether per-category AP 
and draw P-R Curve or not.\n        sigmas (nparray): keypoint labelling sigmas.\n        use_area (bool): If gt annotations (eg. CrowdPose, AIC)\n                         do not have 'area', please set use_area=False.\n    \"\"\"\n    assert coco_gt != None or anno_file != None\n    if style == 'keypoints_crowd':\n        #please install xtcocotools==1.6\n        from xtcocotools.coco import COCO\n        from xtcocotools.cocoeval import COCOeval\n    else:\n        from pycocotools.coco import COCO\n        try:\n            from .fast_cocoeval import FastCOCOeval as COCOeval\n        except:\n            from pycocotools.cocoeval import COCOeval\n\n    if coco_gt == None:\n        coco_gt = COCO(anno_file)\n    logger.info(\"Start evaluate...\")\n    coco_dt = coco_gt.loadRes(jsonfile)\n    if style == 'proposal':\n        coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')\n        coco_eval.params.useCats = 0\n        coco_eval.params.maxDets = list(max_dets)\n    elif style == 'keypoints_crowd':\n        coco_eval = COCOeval(coco_gt, coco_dt, style, sigmas, use_area)\n    else:\n        coco_eval = COCOeval(coco_gt, coco_dt, style)\n    coco_eval.evaluate()\n    coco_eval.accumulate()\n    coco_eval.summarize()\n    if classwise:\n        # Compute per-category AP and PR curve\n        try:\n            from terminaltables import AsciiTable\n        except Exception as e:\n            logger.error(\n                'terminaltables not found, plaese install terminaltables. '\n                'for example: `pip install terminaltables`.')\n            raise e\n        precisions = coco_eval.eval['precision']\n        cat_ids = coco_gt.getCatIds()\n        # precision: (iou, recall, cls, area range, max dets)\n        assert len(cat_ids) == precisions.shape[2]\n        results_per_category = []\n        for idx, catId in enumerate(cat_ids):\n            # area range index 0: all area ranges\n            # max dets index -1: typically 100 per image\n            nm = coco_gt.loadCats(catId)[0]\n            precision = precisions[:, :, idx, 0, -1]\n            precision = precision[precision > -1]\n            if precision.size:\n                ap = np.mean(precision)\n            else:\n                ap = float('nan')\n            results_per_category.append(\n                (str(nm[\"name\"]), '{:0.3f}'.format(float(ap))))\n            pr_array = precisions[0, :, idx, 0, 2]\n            recall_array = np.arange(0.0, 1.01, 0.01)\n            draw_pr_curve(\n                pr_array,\n                recall_array,\n                out_dir=style + '_pr_curve',\n                file_name='{}_precision_recall_curve.jpg'.format(nm[\"name\"]))\n\n        num_columns = min(6, len(results_per_category) * 2)\n        results_flatten = list(itertools.chain(*results_per_category))\n        headers = ['category', 'AP'] * (num_columns // 2)\n        results_2d = itertools.zip_longest(\n            *[results_flatten[i::num_columns] for i in range(num_columns)])\n        table_data = [headers]\n        table_data += [result for result in results_2d]\n        table = AsciiTable(table_data)\n        logger.info('Per-category of {} AP: \\n{}'.format(style, table.table))\n        logger.info(\"per-category PR curve has output to {} folder.\".format(\n            style + '_pr_curve'))\n    # flush coco evaluation result\n    sys.stdout.flush()\n    return coco_eval.stats\n\n\ndef json_eval_results(metric, json_directory, dataset):\n    \"\"\"\n    cocoapi eval with already exists proposal.json, bbox.json or 
mask.json\n    \"\"\"\n    assert metric == 'COCO'\n    anno_file = dataset.get_anno()\n    json_file_list = ['proposal.json', 'bbox.json', 'mask.json']\n    if json_directory:\n        assert os.path.exists(\n            json_directory), \"The json directory:{} does not exist\".format(\n                json_directory)\n        for k, v in enumerate(json_file_list):\n            json_file_list[k] = os.path.join(str(json_directory), v)\n\n    coco_eval_style = ['proposal', 'bbox', 'segm']\n    for i, v_json in enumerate(json_file_list):\n        if os.path.exists(v_json):\n            cocoapi_eval(v_json, coco_eval_style[i], anno_file=anno_file)\n        else:\n            logger.info(\"{} not exists!\".format(v_json))\n"
  },
  {
    "path": "ppdet/metrics/culane_metrics.py",
    "content": "import os\nimport cv2\nimport numpy as np\nimport os.path as osp\nfrom functools import partial\nfrom .metrics import Metric\nfrom scipy.interpolate import splprep, splev\nfrom scipy.optimize import linear_sum_assignment\nfrom shapely.geometry import LineString, Polygon\nfrom ppdet.utils.logger import setup_logger\n\nlogger = setup_logger(__name__)\n\n__all__ = [\n    'draw_lane', 'discrete_cross_iou', 'continuous_cross_iou', 'interp',\n    'culane_metric', 'load_culane_img_data', 'load_culane_data',\n    'eval_predictions', \"CULaneMetric\"\n]\n\nLIST_FILE = {\n    'train': 'list/train_gt.txt',\n    'val': 'list/val.txt',\n    'test': 'list/test.txt',\n}\n\nCATEGORYS = {\n    'normal': 'list/test_split/test0_normal.txt',\n    'crowd': 'list/test_split/test1_crowd.txt',\n    'hlight': 'list/test_split/test2_hlight.txt',\n    'shadow': 'list/test_split/test3_shadow.txt',\n    'noline': 'list/test_split/test4_noline.txt',\n    'arrow': 'list/test_split/test5_arrow.txt',\n    'curve': 'list/test_split/test6_curve.txt',\n    'cross': 'list/test_split/test7_cross.txt',\n    'night': 'list/test_split/test8_night.txt',\n}\n\n\ndef draw_lane(lane, img=None, img_shape=None, width=30):\n    if img is None:\n        img = np.zeros(img_shape, dtype=np.uint8)\n    lane = lane.astype(np.int32)\n    for p1, p2 in zip(lane[:-1], lane[1:]):\n        cv2.line(\n            img, tuple(p1), tuple(p2), color=(255, 255, 255), thickness=width)\n    return img\n\n\ndef discrete_cross_iou(xs, ys, width=30, img_shape=(590, 1640, 3)):\n    xs = [draw_lane(lane, img_shape=img_shape, width=width) > 0 for lane in xs]\n    ys = [draw_lane(lane, img_shape=img_shape, width=width) > 0 for lane in ys]\n\n    ious = np.zeros((len(xs), len(ys)))\n    for i, x in enumerate(xs):\n        for j, y in enumerate(ys):\n            ious[i, j] = (x & y).sum() / (x | y).sum()\n    return ious\n\n\ndef continuous_cross_iou(xs, ys, width=30, img_shape=(590, 1640, 3)):\n    h, w, _ = img_shape\n    image = Polygon([(0, 0), (0, h - 1), (w - 1, h - 1), (w - 1, 0)])\n    xs = [\n        LineString(lane).buffer(\n            distance=width / 2., cap_style=1, join_style=2).intersection(image)\n        for lane in xs\n    ]\n    ys = [\n        LineString(lane).buffer(\n            distance=width / 2., cap_style=1, join_style=2).intersection(image)\n        for lane in ys\n    ]\n\n    ious = np.zeros((len(xs), len(ys)))\n    for i, x in enumerate(xs):\n        for j, y in enumerate(ys):\n            ious[i, j] = x.intersection(y).area / x.union(y).area\n\n    return ious\n\n\ndef interp(points, n=50):\n    x = [x for x, _ in points]\n    y = [y for _, y in points]\n    tck, u = splprep([x, y], s=0, t=n, k=min(3, len(points) - 1))\n\n    u = np.linspace(0., 1., num=(len(u) - 1) * n + 1)\n    return np.array(splev(u, tck)).T\n\n\ndef culane_metric(pred,\n                  anno,\n                  width=30,\n                  iou_thresholds=[0.5],\n                  official=True,\n                  img_shape=(590, 1640, 3)):\n    _metric = {}\n    for thr in iou_thresholds:\n        tp = 0\n        fp = 0 if len(anno) != 0 else len(pred)\n        fn = 0 if len(pred) != 0 else len(anno)\n        _metric[thr] = [tp, fp, fn]\n\n    interp_pred = np.array(\n        [interp(\n            pred_lane, n=5) for pred_lane in pred], dtype=object)  # (4, 50, 2)\n    interp_anno = np.array(\n        [interp(\n            anno_lane, n=5) for anno_lane in anno], dtype=object)  # (4, 50, 2)\n\n    if official:\n        ious = 
discrete_cross_iou(\n            interp_pred, interp_anno, width=width, img_shape=img_shape)\n    else:\n        ious = continuous_cross_iou(\n            interp_pred, interp_anno, width=width, img_shape=img_shape)\n\n    row_ind, col_ind = linear_sum_assignment(1 - ious)\n\n    _metric = {}\n    for thr in iou_thresholds:\n        tp = int((ious[row_ind, col_ind] > thr).sum())\n        fp = len(pred) - tp\n        fn = len(anno) - tp\n        _metric[thr] = [tp, fp, fn]\n    return _metric\n\n\ndef load_culane_img_data(path):\n    with open(path, 'r') as data_file:\n        img_data = data_file.readlines()\n    img_data = [line.split() for line in img_data]\n    img_data = [list(map(float, lane)) for lane in img_data]\n    img_data = [[(lane[i], lane[i + 1]) for i in range(0, len(lane), 2)]\n                for lane in img_data]\n    img_data = [lane for lane in img_data if len(lane) >= 2]\n\n    return img_data\n\n\ndef load_culane_data(data_dir, file_list_path):\n    with open(file_list_path, 'r') as file_list:\n        filepaths = [\n            os.path.join(data_dir,\n                         line[1 if line[0] == '/' else 0:].rstrip().replace(\n                             '.jpg', '.lines.txt'))\n            for line in file_list.readlines()\n        ]\n\n    data = []\n    for path in filepaths:\n        img_data = load_culane_img_data(path)\n        data.append(img_data)\n\n    return data\n\n\ndef eval_predictions(pred_dir,\n                     anno_dir,\n                     list_path,\n                     iou_thresholds=[0.5],\n                     width=30,\n                     official=True,\n                     sequential=False):\n    logger.info('Calculating metric for List: {}'.format(list_path))\n    predictions = load_culane_data(pred_dir, list_path)\n    annotations = load_culane_data(anno_dir, list_path)\n    img_shape = (590, 1640, 3)\n    if sequential:\n        # materialize the lazy map: the results are iterated once per\n        # threshold below, and a bare map iterator would be exhausted\n        # after the first pass\n        results = list(\n            map(partial(\n                culane_metric,\n                width=width,\n                official=official,\n                iou_thresholds=iou_thresholds,\n                img_shape=img_shape),\n                predictions,\n                annotations))\n    else:\n        from multiprocessing import Pool, cpu_count\n        from itertools import repeat\n        with Pool(cpu_count()) as p:\n            results = p.starmap(culane_metric,\n                                zip(predictions, annotations,\n                                    repeat(width),\n                                    repeat(iou_thresholds),\n                                    repeat(official), repeat(img_shape)))\n\n    mean_f1, mean_prec, mean_recall, total_tp, total_fp, total_fn = 0, 0, 0, 0, 0, 0\n    ret = {}\n    for thr in iou_thresholds:\n        tp = sum(m[thr][0] for m in results)\n        fp = sum(m[thr][1] for m in results)\n        fn = sum(m[thr][2] for m in results)\n        precision = float(tp) / (tp + fp) if tp != 0 else 0\n        recall = float(tp) / (tp + fn) if tp != 0 else 0\n        f1 = 2 * precision * recall / (precision + recall) if tp != 0 else 0\n        logger.info('iou thr: {:.2f}, tp: {}, fp: {}, fn: {}, '\n                    'precision: {}, recall: {}, f1: {}'.format(\n                        thr, tp, fp, fn, precision, recall, f1))\n        mean_f1 += f1 / len(iou_thresholds)\n        mean_prec += precision / len(iou_thresholds)\n        mean_recall += recall / len(iou_thresholds)\n        total_tp += tp\n        total_fp += fp\n        total_fn += fn\n        ret[thr] = {\n            'TP': tp,\n            'FP': fp,\n            'FN': fn,\n            'Precision': precision,\n            'Recall': recall,\n            'F1': f1\n        }\n    if len(iou_thresholds) > 2:\n        logger.info(\n            'mean result, total_tp: {}, total_fp: {}, total_fn: {}, '\n            'precision: {}, recall: {}, f1: {}'.format(\n                total_tp, total_fp, total_fn, mean_prec, mean_recall, mean_f1))\n        ret['mean'] = {\n            'TP': total_tp,\n            'FP': total_fp,\n            'FN': total_fn,\n            'Precision': mean_prec,\n            'Recall': mean_recall,\n            'F1': mean_f1\n        }\n    return ret\n\n\nclass CULaneMetric(Metric):\n    def __init__(self,\n                 cfg,\n                 output_eval=None,\n                 split=\"test\",\n                 dataset_dir=\"dataset/CULane/\"):\n        super(CULaneMetric, self).__init__()\n        self.output_eval = \"evaluation\" if output_eval is None else output_eval\n        self.dataset_dir = dataset_dir\n        self.split = split\n        self.list_path = osp.join(dataset_dir, LIST_FILE[split])\n        self.predictions = []\n        self.img_names = []\n        self.lanes = []\n        self.eval_results = {}\n        self.cfg = cfg\n        self.reset()\n\n    def reset(self):\n        self.predictions = []\n        self.img_names = []\n        self.lanes = []\n        self.eval_results = {}\n\n    def get_prediction_string(self, pred):\n        ys = np.arange(270, 590, 8) / self.cfg.ori_img_h\n        out = []\n        for lane in pred:\n            xs = lane(ys)\n            valid_mask = (xs >= 0) & (xs < 1)\n            xs = xs * self.cfg.ori_img_w\n            lane_xs = xs[valid_mask]\n            lane_ys = ys[valid_mask] * self.cfg.ori_img_h\n            lane_xs, lane_ys = lane_xs[::-1], lane_ys[::-1]\n            lane_str = ' '.join([\n                '{:.5f} {:.5f}'.format(x, y) for x, y in zip(lane_xs, lane_ys)\n            ])\n            if lane_str != '':\n                out.append(lane_str)\n\n        return '\\n'.join(out)\n\n    def accumulate(self):\n        loss_lines = [[], [], [], []]\n        for idx, pred in enumerate(self.predictions):\n            output_dir = os.path.join(self.output_eval,\n                                      os.path.dirname(self.img_names[idx]))\n            output_filename = os.path.basename(self.img_names[\n                idx])[:-3] + 'lines.txt'\n            os.makedirs(output_dir, exist_ok=True)\n            output = self.get_prediction_string(pred)\n\n            # store loss lines\n            lanes = self.lanes[idx]\n            if len(lanes) - len(pred) in [1, 2, 3, 4]:\n                loss_lines[len(lanes) - len(pred) - 1].append(self.img_names[\n                    idx])\n\n            with open(os.path.join(output_dir, output_filename),\n                      'w') as out_file:\n                out_file.write(output)\n\n        for i, names in enumerate(loss_lines):\n            with open(\n                    os.path.join(output_dir, 'loss_{}_lines.txt'.format(i + 1)),\n                    'w') as f:\n                for name in names:\n                    f.write(name + '\\n')\n\n        for cate, cate_file in CATEGORYS.items():\n            result = eval_predictions(\n                self.output_eval,\n                self.dataset_dir,\n                os.path.join(self.dataset_dir, cate_file),\n                iou_thresholds=[0.5],\n                official=True)\n\n        result = eval_predictions(\n            self.output_eval,\n            self.dataset_dir,\n            self.list_path,\n            iou_thresholds=np.linspace(0.5, 0.95, 10),\n            official=True)\n        self.eval_results['F1@50'] = result[0.5]['F1']\n        self.eval_results['result'] = result\n\n    def update(self, inputs, outputs):\n        assert len(inputs['img_name']) == len(outputs['lanes'])\n        self.predictions.extend(outputs['lanes'])\n        self.img_names.extend(inputs['img_name'])\n        self.lanes.extend(inputs['lane_line'])\n\n    def log(self):\n        logger.info(self.eval_results)\n\n    # abstract method for getting metric results\n    def get_results(self):\n        return self.eval_results\n"
  },
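  {
    "path": "ppdet/metrics/culane_metrics_demo.py",
    "content": "\"\"\"Illustrative sketch only (not part of the original codebase): a toy\nend-to-end check of culane_metric on two hand-made lanes. The file name and\nthe sample coordinates are hypothetical; each lane is a sequence of (x, y)\npoints inside the 1640x590 CULane image plane.\"\"\"\nimport numpy as np\n\nfrom ppdet.metrics.culane_metrics import culane_metric\n\nif __name__ == '__main__':\n    # one predicted lane and one annotated lane, a few pixels apart\n    pred = [np.array([(100., 580.), (120., 500.), (140., 420.)])]\n    anno = [np.array([(102., 580.), (121., 500.), (139., 420.)])]\n    # returns {threshold: [tp, fp, fn]}; these two lanes overlap well\n    # above IoU 0.5 once rasterized with a 30 pixel stroke width\n    print(culane_metric(pred, anno, width=30, iou_thresholds=[0.5]))\n"
  },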
  {
    "path": "ppdet/metrics/fast_cocoeval/README.md",
    "content": "# COCOeval C++ 扩展编译\n\n## 安装\n```\ncd ext\npython setup.py install\n```\n\n"
  },
  {
    "path": "ppdet/metrics/fast_cocoeval/__init__.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom . import fast_cocoeval\n\nfrom .fast_cocoeval import *\n"
  },
  {
    "path": "ppdet/metrics/fast_cocoeval/ext/cocoeval.cc",
    "content": "// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n// The code is based on\n// https://github.com/facebookresearch/detectron2/tree/main/detectron2/layers/csrc/cocoeval/\n\n#include \"cocoeval.h\"\n#include <time.h>\n#include <algorithm>\n#include <cstdint>\n#include <numeric>\n\nusing namespace pybind11::literals;\n\n// Sort detections from highest score to lowest, such that\n// detection_instances[detection_sorted_indices[t]] >=\n// detection_instances[detection_sorted_indices[t+1]].  Use stable_sort to match\n// original COCO API\nvoid SortInstancesByDetectionScore(\n    const std::vector<InstanceAnnotation>& detection_instances,\n    std::vector<uint64_t>* detection_sorted_indices) {\n  detection_sorted_indices->resize(detection_instances.size());\n  std::iota(\n      detection_sorted_indices->begin(), detection_sorted_indices->end(), 0);\n  std::stable_sort(\n      detection_sorted_indices->begin(),\n      detection_sorted_indices->end(),\n      [&detection_instances](size_t j1, size_t j2) {\n        return detection_instances[j1].score > detection_instances[j2].score;\n      });\n}\n\n// Partition the ground truth objects based on whether or not to ignore them\n// based on area\nvoid SortInstancesByIgnore(\n    const std::array<double, 2>& area_range,\n    const std::vector<InstanceAnnotation>& ground_truth_instances,\n    std::vector<uint64_t>* ground_truth_sorted_indices,\n    std::vector<bool>* ignores) {\n  ignores->clear();\n  ignores->reserve(ground_truth_instances.size());\n  for (auto o : ground_truth_instances) {\n    ignores->push_back(\n        o.ignore || o.area < area_range[0] || o.area > area_range[1]);\n  }\n\n  ground_truth_sorted_indices->resize(ground_truth_instances.size());\n  std::iota(\n      ground_truth_sorted_indices->begin(),\n      ground_truth_sorted_indices->end(),\n      0);\n  std::stable_sort(\n      ground_truth_sorted_indices->begin(),\n      ground_truth_sorted_indices->end(),\n      [&ignores](size_t j1, size_t j2) {\n        return (int)(*ignores)[j1] < (int)(*ignores)[j2];\n      });\n}\n\n// For each IOU threshold, greedily match each detected instance to a ground\n// truth instance (if possible) and store the results\nvoid MatchDetectionsToGroundTruth(\n    const std::vector<InstanceAnnotation>& detection_instances,\n    const std::vector<uint64_t>& detection_sorted_indices,\n    const std::vector<InstanceAnnotation>& ground_truth_instances,\n    const std::vector<uint64_t>& ground_truth_sorted_indices,\n    const std::vector<bool>& ignores,\n    const std::vector<std::vector<double>>& ious,\n    const std::vector<double>& iou_thresholds,\n    const std::array<double, 2>& area_range,\n    ImageEvaluation* results) {\n  // Initialize memory to store return data matches and ignore\n  const int num_iou_thresholds = iou_thresholds.size();\n  const int num_ground_truth = ground_truth_sorted_indices.size();\n  const int num_detections = 
detection_sorted_indices.size();\n  std::vector<uint64_t> ground_truth_matches(\n      num_iou_thresholds * num_ground_truth, 0);\n  std::vector<uint64_t>& detection_matches = results->detection_matches;\n  std::vector<bool>& detection_ignores = results->detection_ignores;\n  std::vector<bool>& ground_truth_ignores = results->ground_truth_ignores;\n  detection_matches.resize(num_iou_thresholds * num_detections, 0);\n  detection_ignores.resize(num_iou_thresholds * num_detections, false);\n  ground_truth_ignores.resize(num_ground_truth);\n  for (auto g = 0; g < num_ground_truth; ++g) {\n    ground_truth_ignores[g] = ignores[ground_truth_sorted_indices[g]];\n  }\n\n  for (auto t = 0; t < num_iou_thresholds; ++t) {\n    for (auto d = 0; d < num_detections; ++d) {\n      // information about best match so far (match=-1 -> unmatched)\n      double best_iou = std::min(iou_thresholds[t], 1 - 1e-10);\n      int match = -1;\n      for (auto g = 0; g < num_ground_truth; ++g) {\n        // if this ground truth instance is already matched and not a\n        // crowd, it cannot be matched to another detection\n        if (ground_truth_matches[t * num_ground_truth + g] > 0 &&\n            !ground_truth_instances[ground_truth_sorted_indices[g]].is_crowd) {\n          continue;\n        }\n\n        // if detected instance matched to a regular ground truth\n        // instance, we can break on the first ground truth instance\n        // tagged as ignore (because they are sorted by the ignore tag)\n        if (match >= 0 && !ground_truth_ignores[match] &&\n            ground_truth_ignores[g]) {\n          break;\n        }\n\n        // if IOU overlap is the best so far, store the match appropriately\n        if (ious[d][ground_truth_sorted_indices[g]] >= best_iou) {\n          best_iou = ious[d][ground_truth_sorted_indices[g]];\n          match = g;\n        }\n      }\n      // if match was made, store id of match for both detection and\n      // ground truth\n      if (match >= 0) {\n        detection_ignores[t * num_detections + d] = ground_truth_ignores[match];\n        detection_matches[t * num_detections + d] =\n            ground_truth_instances[ground_truth_sorted_indices[match]].id;\n        ground_truth_matches[t * num_ground_truth + match] =\n            detection_instances[detection_sorted_indices[d]].id;\n      }\n\n      // set unmatched detections outside of area range to ignore\n      const InstanceAnnotation& detection =\n          detection_instances[detection_sorted_indices[d]];\n      detection_ignores[t * num_detections + d] =\n          detection_ignores[t * num_detections + d] ||\n          (detection_matches[t * num_detections + d] == 0 &&\n           (detection.area < area_range[0] || detection.area > area_range[1]));\n    }\n  }\n\n  // store detection score results\n  results->detection_scores.resize(detection_sorted_indices.size());\n  for (size_t d = 0; d < detection_sorted_indices.size(); ++d) {\n    results->detection_scores[d] =\n        detection_instances[detection_sorted_indices[d]].score;\n  }\n}\n\nstd::vector<ImageEvaluation> EvaluateImages(\n    const std::vector<std::array<double, 2>>& area_ranges,\n    int max_detections,\n    const std::vector<double>& iou_thresholds,\n    const ImageCategoryInstances<std::vector<double>>& image_category_ious,\n    const ImageCategoryInstances<InstanceAnnotation>&\n        image_category_ground_truth_instances,\n    const ImageCategoryInstances<InstanceAnnotation>&\n        image_category_detection_instances) {\n  const int 
num_area_ranges = area_ranges.size();\n  const int num_images = image_category_ground_truth_instances.size();\n  const int num_categories =\n      image_category_ious.size() > 0 ? image_category_ious[0].size() : 0;\n  std::vector<uint64_t> detection_sorted_indices;\n  std::vector<uint64_t> ground_truth_sorted_indices;\n  std::vector<bool> ignores;\n  std::vector<ImageEvaluation> results_all(\n      num_images * num_area_ranges * num_categories);\n\n  // Store results for each image, category, and area range combination. Results\n  // for each IOU threshold are packed into the same ImageEvaluation object\n  for (auto i = 0; i < num_images; ++i) {\n    for (auto c = 0; c < num_categories; ++c) {\n      const std::vector<InstanceAnnotation>& ground_truth_instances =\n          image_category_ground_truth_instances[i][c];\n      const std::vector<InstanceAnnotation>& detection_instances =\n          image_category_detection_instances[i][c];\n\n      SortInstancesByDetectionScore(\n          detection_instances, &detection_sorted_indices);\n      if ((int)detection_sorted_indices.size() > max_detections) {\n        detection_sorted_indices.resize(max_detections);\n      }\n\n      for (size_t a = 0; a < area_ranges.size(); ++a) {\n        SortInstancesByIgnore(\n            area_ranges[a],\n            ground_truth_instances,\n            &ground_truth_sorted_indices,\n            &ignores);\n\n        MatchDetectionsToGroundTruth(\n            detection_instances,\n            detection_sorted_indices,\n            ground_truth_instances,\n            ground_truth_sorted_indices,\n            ignores,\n            image_category_ious[i][c],\n            iou_thresholds,\n            area_ranges[a],\n            &results_all\n                [c * num_area_ranges * num_images + a * num_images + i]);\n      }\n    }\n  }\n\n  return results_all;\n}\n\n// Convert a python list to a vector\ntemplate <typename T>\nstd::vector<T> list_to_vec(const py::list& l) {\n  std::vector<T> v(py::len(l));\n  for (int i = 0; i < (int)py::len(l); ++i) {\n    v[i] = l[i].cast<T>();\n  }\n  return v;\n}\n\n// Helper function to Accumulate()\n// Considers the evaluation results applicable to a particular category, area\n// range, and max_detections parameter setting, which begin at\n// evaluations[evaluation_index].  Extracts a sorted list of length n of all\n// applicable detection instances concatenated across all images in the dataset,\n// which are represented by the outputs evaluation_indices, detection_scores,\n// image_detection_indices, and detection_sorted_indices--all of which are\n// length n. evaluation_indices[i] stores the applicable index into\n// evaluations[] for instance i, which has detection score detection_score[i],\n// and is the image_detection_indices[i]'th of the list of detections\n// for the image containing i.  
detection_sorted_indices[] defines a sorted\n// permutation of the 3 other outputs\nint BuildSortedDetectionList(\n    const std::vector<ImageEvaluation>& evaluations,\n    const int64_t evaluation_index,\n    const int64_t num_images,\n    const int max_detections,\n    std::vector<uint64_t>* evaluation_indices,\n    std::vector<double>* detection_scores,\n    std::vector<uint64_t>* detection_sorted_indices,\n    std::vector<uint64_t>* image_detection_indices) {\n  assert(evaluations.size() >= evaluation_index + num_images);\n\n  // Extract a list of object instances of the applicable category, area\n  // range, and max detections requirements such that they can be sorted\n  image_detection_indices->clear();\n  evaluation_indices->clear();\n  detection_scores->clear();\n  image_detection_indices->reserve(num_images * max_detections);\n  evaluation_indices->reserve(num_images * max_detections);\n  detection_scores->reserve(num_images * max_detections);\n  int num_valid_ground_truth = 0;\n  for (auto i = 0; i < num_images; ++i) {\n    const ImageEvaluation& evaluation = evaluations[evaluation_index + i];\n\n    for (int d = 0;\n         d < (int)evaluation.detection_scores.size() && d < max_detections;\n         ++d) { // detected instances\n      evaluation_indices->push_back(evaluation_index + i);\n      image_detection_indices->push_back(d);\n      detection_scores->push_back(evaluation.detection_scores[d]);\n    }\n    for (auto ground_truth_ignore : evaluation.ground_truth_ignores) {\n      if (!ground_truth_ignore) {\n        ++num_valid_ground_truth;\n      }\n    }\n  }\n\n  // Sort detections by decreasing score, using stable sort to match\n  // python implementation\n  detection_sorted_indices->resize(detection_scores->size());\n  std::iota(\n      detection_sorted_indices->begin(), detection_sorted_indices->end(), 0);\n  std::stable_sort(\n      detection_sorted_indices->begin(),\n      detection_sorted_indices->end(),\n      [&detection_scores](size_t j1, size_t j2) {\n        return (*detection_scores)[j1] > (*detection_scores)[j2];\n      });\n\n  return num_valid_ground_truth;\n}\n\n// Helper function to Accumulate()\n// Compute a precision recall curve given a sorted list of detected instances\n// encoded in evaluations, evaluation_indices, detection_scores,\n// detection_sorted_indices, image_detection_indices (see\n// BuildSortedDetectionList()). 
Using vectors precisions and recalls\n// and temporary storage, output the results into precisions_out, recalls_out,\n// and scores_out, which are large buffers containing many precision/recall curves\n// for all possible parameter settings, with precisions_out_index and\n// recalls_out_index defining the applicable indices to store results.\nvoid ComputePrecisionRecallCurve(\n    const int64_t precisions_out_index,\n    const int64_t precisions_out_stride,\n    const int64_t recalls_out_index,\n    const std::vector<double>& recall_thresholds,\n    const int iou_threshold_index,\n    const int num_iou_thresholds,\n    const int num_valid_ground_truth,\n    const std::vector<ImageEvaluation>& evaluations,\n    const std::vector<uint64_t>& evaluation_indices,\n    const std::vector<double>& detection_scores,\n    const std::vector<uint64_t>& detection_sorted_indices,\n    const std::vector<uint64_t>& image_detection_indices,\n    std::vector<double>* precisions,\n    std::vector<double>* recalls,\n    std::vector<double>* precisions_out,\n    std::vector<double>* scores_out,\n    std::vector<double>* recalls_out) {\n  assert(recalls_out->size() > recalls_out_index);\n\n  // Compute precision/recall for each instance in the sorted list of detections\n  int64_t true_positives_sum = 0, false_positives_sum = 0;\n  precisions->clear();\n  recalls->clear();\n  precisions->reserve(detection_sorted_indices.size());\n  recalls->reserve(detection_sorted_indices.size());\n  assert(!evaluations.empty() || detection_sorted_indices.empty());\n  for (auto detection_sorted_index : detection_sorted_indices) {\n    const ImageEvaluation& evaluation =\n        evaluations[evaluation_indices[detection_sorted_index]];\n    const auto num_detections =\n        evaluation.detection_matches.size() / num_iou_thresholds;\n    const auto detection_index = iou_threshold_index * num_detections +\n        image_detection_indices[detection_sorted_index];\n    assert(evaluation.detection_matches.size() > detection_index);\n    assert(evaluation.detection_ignores.size() > detection_index);\n    const int64_t detection_match =\n        evaluation.detection_matches[detection_index];\n    const bool detection_ignores =\n        evaluation.detection_ignores[detection_index];\n    const auto true_positive = detection_match > 0 && !detection_ignores;\n    const auto false_positive = detection_match == 0 && !detection_ignores;\n    if (true_positive) {\n      ++true_positives_sum;\n    }\n    if (false_positive) {\n      ++false_positives_sum;\n    }\n\n    const double recall =\n        static_cast<double>(true_positives_sum) / num_valid_ground_truth;\n    recalls->push_back(recall);\n    const int64_t num_valid_detections =\n        true_positives_sum + false_positives_sum;\n    const double precision = num_valid_detections > 0\n        ? static_cast<double>(true_positives_sum) / num_valid_detections\n        : 0.0;\n    precisions->push_back(precision);\n  }\n\n  (*recalls_out)[recalls_out_index] = !recalls->empty() ? 
recalls->back() : 0;\n\n  for (int64_t i = static_cast<int64_t>(precisions->size()) - 1; i > 0; --i) {\n    if ((*precisions)[i] > (*precisions)[i - 1]) {\n      (*precisions)[i - 1] = (*precisions)[i];\n    }\n  }\n\n  // Sample the per instance precision/recall list at each recall threshold\n  for (size_t r = 0; r < recall_thresholds.size(); ++r) {\n    // first index in recalls >= recall_thresholds[r]\n    std::vector<double>::iterator low = std::lower_bound(\n        recalls->begin(), recalls->end(), recall_thresholds[r]);\n    size_t precisions_index = low - recalls->begin();\n\n    const auto results_ind = precisions_out_index + r * precisions_out_stride;\n    assert(results_ind < precisions_out->size());\n    assert(results_ind < scores_out->size());\n    if (precisions_index < precisions->size()) {\n      (*precisions_out)[results_ind] = (*precisions)[precisions_index];\n      (*scores_out)[results_ind] =\n          detection_scores[detection_sorted_indices[precisions_index]];\n    } else {\n      (*precisions_out)[results_ind] = 0;\n      (*scores_out)[results_ind] = 0;\n    }\n  }\n}\npy::dict Accumulate(\n    const py::object& params,\n    const std::vector<ImageEvaluation>& evaluations) {\n  const std::vector<double> recall_thresholds =\n      list_to_vec<double>(params.attr(\"recThrs\"));\n  const std::vector<int> max_detections =\n      list_to_vec<int>(params.attr(\"maxDets\"));\n  const int num_iou_thresholds = py::len(params.attr(\"iouThrs\"));\n  const int num_recall_thresholds = py::len(params.attr(\"recThrs\"));\n  const int num_categories = params.attr(\"useCats\").cast<int>() == 1\n      ? py::len(params.attr(\"catIds\"))\n      : 1;\n  const int num_area_ranges = py::len(params.attr(\"areaRng\"));\n  const int num_max_detections = py::len(params.attr(\"maxDets\"));\n  const int num_images = py::len(params.attr(\"imgIds\"));\n\n  std::vector<double> precisions_out(\n      num_iou_thresholds * num_recall_thresholds * num_categories *\n          num_area_ranges * num_max_detections,\n      -1);\n  std::vector<double> recalls_out(\n      num_iou_thresholds * num_categories * num_area_ranges *\n          num_max_detections,\n      -1);\n  std::vector<double> scores_out(\n      num_iou_thresholds * num_recall_thresholds * num_categories *\n          num_area_ranges * num_max_detections,\n      -1);\n\n  // Consider the list of all detected instances in the entire dataset in one\n  // large list.  
evaluation_indices, detection_scores,\n  // image_detection_indices, and detection_sorted_indices all have the same\n  // length as this list, such that each entry corresponds to one detected\n  // instance\n  std::vector<uint64_t> evaluation_indices; // indices into evaluations[]\n  std::vector<double> detection_scores; // detection scores of each instance\n  std::vector<uint64_t> detection_sorted_indices; // sorted indices of all\n                                                  // instances in the dataset\n  std::vector<uint64_t>\n      image_detection_indices; // indices into the list of detected instances in\n                               // the same image as each instance\n  std::vector<double> precisions, recalls;\n\n  for (auto c = 0; c < num_categories; ++c) {\n    for (auto a = 0; a < num_area_ranges; ++a) {\n      for (auto m = 0; m < num_max_detections; ++m) {\n        // The COCO PythonAPI assumes evaluations[] (the return value of\n        // COCOeval::EvaluateImages()) is one long list storing results for each\n        // combination of category, area range, and image id, with categories in\n        // the outermost loop and images in the innermost loop.\n        const int64_t evaluations_index =\n            c * num_area_ranges * num_images + a * num_images;\n        int num_valid_ground_truth = BuildSortedDetectionList(\n            evaluations,\n            evaluations_index,\n            num_images,\n            max_detections[m],\n            &evaluation_indices,\n            &detection_scores,\n            &detection_sorted_indices,\n            &image_detection_indices);\n\n        if (num_valid_ground_truth == 0) {\n          continue;\n        }\n\n        for (auto t = 0; t < num_iou_thresholds; ++t) {\n          // recalls_out is a flattened vector representing a\n          // num_iou_thresholds X num_categories X num_area_ranges X\n          // num_max_detections matrix\n          const int64_t recalls_out_index =\n              t * num_categories * num_area_ranges * num_max_detections +\n              c * num_area_ranges * num_max_detections +\n              a * num_max_detections + m;\n\n          // precisions_out and scores_out are flattened vectors\n          // representing a num_iou_thresholds X num_recall_thresholds X\n          // num_categories X num_area_ranges X num_max_detections matrix\n          const int64_t precisions_out_stride =\n              num_categories * num_area_ranges * num_max_detections;\n          const int64_t precisions_out_index = t * num_recall_thresholds *\n                  num_categories * num_area_ranges * num_max_detections +\n              c * num_area_ranges * num_max_detections +\n              a * num_max_detections + m;\n\n          ComputePrecisionRecallCurve(\n              precisions_out_index,\n              precisions_out_stride,\n              recalls_out_index,\n              recall_thresholds,\n              t,\n              num_iou_thresholds,\n              num_valid_ground_truth,\n              evaluations,\n              evaluation_indices,\n              detection_scores,\n              detection_sorted_indices,\n              image_detection_indices,\n              &precisions,\n              &recalls,\n              &precisions_out,\n              &scores_out,\n              &recalls_out);\n        }\n      }\n    }\n  }\n\n  time_t rawtime;\n  struct tm local_time;\n  std::array<char, 200> buffer;\n  time(&rawtime);\n#ifdef _WIN32\n  localtime_s(&local_time, &rawtime);\n#else\n  localtime_r(&rawtime, &local_time);\n#endif\n  strftime(\n      buffer.data(), 200, \"%Y-%m-%d %H:%M:%S\", &local_time);\n  return py::dict(\n      \"params\"_a = params,\n      \"counts\"_a = std::vector<int64_t>(\n          {num_iou_thresholds,\n           num_recall_thresholds,\n           num_categories,\n           num_area_ranges,\n           num_max_detections}),\n      \"date\"_a = buffer,\n      \"precision\"_a = precisions_out,\n      \"recall\"_a = recalls_out,\n      \"scores\"_a = scores_out);\n}\n\nPYBIND11_MODULE(cocoeval_ext, m) {\n  m.def(\"COCOevalAccumulate\", &Accumulate, \"Accumulate\");\n  m.def(\"COCOevalEvaluateImages\", &EvaluateImages, \"EvaluateImages\");\n  py::class_<InstanceAnnotation>(m, \"InstanceAnnotation\")\n      .def(py::init<uint64_t, double, double, bool, bool>());\n  py::class_<ImageEvaluation>(m, \"ImageEvaluation\")\n      .def(py::init<>());\n}"
  },
  {
    "path": "ppdet/metrics/fast_cocoeval/ext/cocoeval.h",
    "content": "// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n//\n// The code is based on\n// https://github.com/facebookresearch/detectron2/tree/main/detectron2/layers/csrc/cocoeval/\n\n#pragma once\n\n#include <pybind11/numpy.h>\n#include <pybind11/pybind11.h>\n#include <pybind11/stl.h>\n#include <pybind11/stl_bind.h>\n#include <vector>\n\nnamespace py = pybind11;\n\n// Annotation data for a single object instance in an image\nstruct InstanceAnnotation {\n  InstanceAnnotation(\n      uint64_t id,\n      double score,\n      double area,\n      bool is_crowd,\n      bool ignore)\n      : id{id}, score{score}, area{area}, is_crowd{is_crowd}, ignore{ignore} {}\n  uint64_t id;\n  double score = 0.;\n  double area = 0.;\n  bool is_crowd = false;\n  bool ignore = false;\n};\n\n// Stores intermediate results for evaluating detection results for a single\n// image that has D detected instances and G ground truth instances. This stores\n// matches between detected and ground truth instances\nstruct ImageEvaluation {\n  // For each of the D detected instances, the id of the matched ground truth\n  // instance, or 0 if unmatched\n  std::vector<uint64_t> detection_matches;\n\n  // The detection score of each of the D detected instances\n  std::vector<double> detection_scores;\n\n  // Marks whether or not each of G instances was ignored from evaluation (e.g.,\n  // because it's outside area_range)\n  std::vector<bool> ground_truth_ignores;\n\n  // Marks whether or not each of D instances was ignored from evaluation (e.g.,\n  // because it's outside aRng)\n  std::vector<bool> detection_ignores;\n};\n\ntemplate <class T>\nusing ImageCategoryInstances = std::vector<std::vector<std::vector<T>>>;\n\n// C++ implementation of COCO API cocoeval.py::COCOeval.evaluateImg().  For each\n// combination of image, category, area range settings, and IOU thresholds to\n// evaluate, it matches detected instances to ground truth instances and stores\n// the results into a vector of ImageEvaluation results, which will be\n// interpreted by the COCOeval::Accumulate() function to produce precion-recall\n// curves.  
The parameters of nested vectors have the following semantics:\n//   image_category_ious[i][c][d][g] is the intersection over union of the d'th\n//     detected instance and g'th ground truth instance of\n//     category category_ids[c] in image image_ids[i]\n//   image_category_ground_truth_instances[i][c] is a vector of ground truth\n//     instances in image image_ids[i] of category category_ids[c]\n//   image_category_detection_instances[i][c] is a vector of detected\n//     instances in image image_ids[i] of category category_ids[c]\nstd::vector<ImageEvaluation> EvaluateImages(\n    const std::vector<std::array<double, 2>>& area_ranges, // vector of 2-tuples\n    int max_detections,\n    const std::vector<double>& iou_thresholds,\n    const ImageCategoryInstances<std::vector<double>>& image_category_ious,\n    const ImageCategoryInstances<InstanceAnnotation>&\n        image_category_ground_truth_instances,\n    const ImageCategoryInstances<InstanceAnnotation>&\n        image_category_detection_instances);\n\n// C++ implementation of COCOeval.accumulate(), which generates precision\n// recall curves for each set of category, IOU threshold, detection area range,\n// and max number of detections parameters.  It is assumed that the parameter\n// evaluations is the return value of the functon COCOeval::EvaluateImages(),\n// which was called with the same parameter settings params\npy::dict Accumulate(\n    const py::object& params,\n    const std::vector<ImageEvaluation>& evalutations);\n"
  },
  {
    "path": "ppdet/metrics/fast_cocoeval/ext/setup.py",
    "content": "from pybind11.setup_helpers import Pybind11Extension, build_ext\nfrom setuptools import setup\n\next_modules = [Pybind11Extension(\"cocoeval_ext\", [\"cocoeval.cc\"])]\n\nsetup(\n    name=\"cocoeval_ext\",\n    version=\"0.0.0\",\n    ext_modules=ext_modules,\n    cmdclass={\"build_ext\": build_ext},\n)\n"
  },
  {
    "path": "ppdet/metrics/fast_cocoeval/fast_cocoeval.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# The code is based on\n# https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/fast_eval_api.py\n\n\nimport copy\nimport time\n\nimport numpy as np\nfrom cocoeval_ext import InstanceAnnotation, COCOevalEvaluateImages, COCOevalAccumulate\nfrom pycocotools.cocoeval import COCOeval\n\n__all__ = ['FastCOCOeval']\n\n\nclass FastCOCOeval(COCOeval):\n    \"\"\"\n    This is a slightly modified version of the original COCO API, where the functions evaluateImg()\n    and accumulate() are implemented in C++ to speedup evaluation\n    \"\"\"\n\n    def evaluate(self):\n        \"\"\"\n        Run per image evaluation on given images and store results in self.evalImgs_cpp, a\n        datastructure that isn't readable from Python but is used by a c++ implementation of\n        accumulate().  Unlike the original COCO PythonAPI, we don't populate the datastructure\n        self.evalImgs because this datastructure is a computational bottleneck.\n        :return: None\n        \"\"\"\n        tic = time.time()\n        print('Running per image evaluation...')\n        p = self.params\n        # add backward compatibility if useSegm is specified in params\n        if p.useSegm is not None:\n            p.iouType = \"segm\" if p.useSegm == 1 else \"bbox\"\n            print('useSegm (deprecated) is not None. 
Running {} evaluation'.format(p.iouType))\n        print('Evaluate annotation type *{}*'.format(p.iouType))\n        p.imgIds = list(np.unique(p.imgIds))\n        if p.useCats:\n            p.catIds = list(np.unique(p.catIds))\n        p.maxDets = sorted(p.maxDets)\n        self.params = p\n\n        self._prepare()  # bottleneck\n\n        # loop through images, area range, max detection number\n        catIds = p.catIds if p.useCats else [-1]\n\n        if p.iouType == \"segm\" or p.iouType == \"bbox\":\n            computeIoU = self.computeIoU\n        elif p.iouType == \"keypoints\":\n            computeIoU = self.computeOks\n        self.ious = {\n            (imgId, catId): computeIoU(imgId, catId)\n            for imgId in p.imgIds for catId in catIds\n        }  # bottleneck\n\n        maxDet = p.maxDets[-1]\n\n        # <<<< Beginning of code differences with original COCO API\n        def convert_instances_to_cpp(instances, is_det=False):\n            # Convert annotations for a list of instances in an image to a format that's fast\n            # to access in C++\n            instances_cpp = []\n            for instance in instances:\n                instance_cpp = InstanceAnnotation(\n                    int(instance[\"id\"]),\n                    instance[\"score\"] if is_det else instance.get(\"score\", 0.0),\n                    instance[\"area\"],\n                    bool(instance.get(\"iscrowd\", 0)),\n                    bool(instance.get(\"ignore\", 0)),\n                )\n                instances_cpp.append(instance_cpp)\n            return instances_cpp\n\n        # Convert GT annotations, detections, and IOUs to a format that's fast to access in C++\n        ground_truth_instances = [\n            [convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds]\n            for imgId in p.imgIds\n        ]\n        detected_instances = [\n            [convert_instances_to_cpp(self._dts[imgId, catId], is_det=True) for catId in p.catIds]\n            for imgId in p.imgIds\n        ]\n        ious = [[self.ious[imgId, catId] for catId in catIds] for imgId in p.imgIds]\n\n        if not p.useCats:\n            # For each image, flatten per-category lists into a single list\n            ground_truth_instances = [[[o for c in i for o in c]] for i in ground_truth_instances]\n            detected_instances = [[[o for c in i for o in c]] for i in detected_instances]\n\n        # Call C++ implementation of self.evaluateImgs()\n        self._evalImgs_cpp = COCOevalEvaluateImages(\n            p.areaRng, maxDet, p.iouThrs, ious, ground_truth_instances, detected_instances\n        )\n        self._evalImgs = None\n\n        self._paramsEval = copy.deepcopy(self.params)\n        toc = time.time()\n        print('DONE (t={:0.2f}s).'.format(toc-tic))\n        # >>>> End of code differences with original COCO API\n\n    def accumulate(self, p=None):\n        \"\"\"\n        Accumulate per image evaluation results and store the result in self.eval.  
Does not\n        support changing parameter settings from those used by self.evaluate()\n        \"\"\"\n        print('Accumulating evaluation results...')\n        tic = time.time()\n        assert hasattr(\n            self, \"_evalImgs_cpp\"\n        ), \"evaluate() must be called before accumulate() is called.\"\n\n        self.eval = COCOevalAccumulate(self._paramsEval, self._evalImgs_cpp)\n\n        # recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections\n        self.eval[\"recall\"] = np.array(self.eval[\"recall\"]).reshape(\n            self.eval[\"counts\"][:1] + self.eval[\"counts\"][2:]\n        )\n\n        # precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X\n        # num_area_ranges X num_max_detections\n        self.eval[\"precision\"] = np.array(self.eval[\"precision\"]).reshape(self.eval[\"counts\"])\n        self.eval[\"scores\"] = np.array(self.eval[\"scores\"]).reshape(self.eval[\"counts\"])\n        toc = time.time()\n        print('DONE (t={:0.2f}s).'.format(toc - tic))\n"
  },
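  {
    "path": "ppdet/metrics/fast_cocoeval/verify_demo.py",
    "content": "\"\"\"Illustrative sketch only (not part of the original codebase): checks that\nFastCOCOeval reproduces the numbers of pycocotools' pure-Python COCOeval on\nthe same inputs. Requires the built cocoeval_ext extension; the annotation\nand result file names below are hypothetical placeholders.\"\"\"\nimport numpy as np\nfrom pycocotools.coco import COCO\nfrom pycocotools.cocoeval import COCOeval\n\nfrom ppdet.metrics.fast_cocoeval import FastCOCOeval\n\nif __name__ == '__main__':\n    coco_gt = COCO('instances_val.json')    # placeholder annotation file\n    coco_dt = coco_gt.loadRes('bbox.json')  # placeholder result file\n    stats = []\n    for eval_cls in (COCOeval, FastCOCOeval):\n        coco_eval = eval_cls(coco_gt, coco_dt, 'bbox')\n        coco_eval.evaluate()\n        coco_eval.accumulate()\n        coco_eval.summarize()\n        stats.append(coco_eval.stats)\n    # the C++ path should reproduce the pure-Python numbers\n    assert np.allclose(stats[0], stats[1], atol=1e-6)\n    print('FastCOCOeval matches pycocotools COCOeval.')\n"
  },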
  {
    "path": "ppdet/metrics/json_results.py",
    "content": "#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport six\nimport numpy as np\n\n\ndef get_det_res(bboxes,\n                bbox_nums,\n                image_id,\n                label_to_cat_id_map,\n                bias=0,\n                im_file=None,\n                save_threshold=0):\n    det_res = []\n    k = 0\n    for i in range(len(bbox_nums)):\n        cur_image_id = int(image_id[i][0])\n        det_nums = bbox_nums[i]\n        for j in range(det_nums):\n            dt = bboxes[k]\n            k = k + 1\n            num_id, score, xmin, ymin, xmax, ymax = dt.tolist()\n            if int(num_id) < 0 or score < save_threshold:\n                continue\n            category_id = label_to_cat_id_map[int(num_id)]\n            w = xmax - xmin + bias\n            h = ymax - ymin + bias\n            bbox = [xmin, ymin, w, h]\n            dt_res = {\n                'image_id': cur_image_id,\n                'category_id': category_id,\n                'bbox': bbox,\n                'score': score\n            }\n            if im_file:\n                dt_res['im_file'] = im_file\n            det_res.append(dt_res)\n    return det_res\n\n\ndef get_det_poly_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0):\n    det_res = []\n    k = 0\n    for i in range(len(bbox_nums)):\n        cur_image_id = int(image_id[i][0])\n        det_nums = bbox_nums[i]\n        for j in range(det_nums):\n            dt = bboxes[k]\n            k = k + 1\n            num_id, score, x1, y1, x2, y2, x3, y3, x4, y4 = dt.tolist()\n            if int(num_id) < 0:\n                continue\n            category_id = label_to_cat_id_map[int(num_id)]\n            rbox = [x1, y1, x2, y2, x3, y3, x4, y4]\n            dt_res = {\n                'image_id': cur_image_id,\n                'category_id': category_id,\n                'bbox': rbox,\n                'score': score\n            }\n            det_res.append(dt_res)\n    return det_res\n\n\ndef strip_mask(mask):\n    row = mask[0, 0, :]\n    col = mask[0, :, 0]\n    im_h = len(col) - np.count_nonzero(col == -1)\n    im_w = len(row) - np.count_nonzero(row == -1)\n    return mask[:, :im_h, :im_w]\n\n\ndef get_seg_res(masks, bboxes, mask_nums, image_id, label_to_cat_id_map):\n    import pycocotools.mask as mask_util\n    seg_res = []\n    k = 0\n    for i in range(len(mask_nums)):\n        cur_image_id = int(image_id[i][0])\n        det_nums = mask_nums[i]\n        mask_i = masks[k:k + det_nums]\n        mask_i = strip_mask(mask_i)\n        for j in range(det_nums):\n            mask = mask_i[j].astype(np.uint8)\n            score = float(bboxes[k][1])\n            label = int(bboxes[k][0])\n            k = k + 1\n            if label == -1:\n                continue\n            cat_id = label_to_cat_id_map[label]\n            rle = mask_util.encode(\n                np.array(\n                    mask[:, :, None], order=\"F\", dtype=\"uint8\"))[0]\n            
if six.PY3:\n                if 'counts' in rle:\n                    rle['counts'] = rle['counts'].decode(\"utf8\")\n            sg_res = {\n                'image_id': cur_image_id,\n                'category_id': cat_id,\n                'segmentation': rle,\n                'score': score\n            }\n            seg_res.append(sg_res)\n    return seg_res\n\n\ndef get_solov2_segm_res(results, image_id, num_id_to_cat_id_map):\n    import pycocotools.mask as mask_util\n    segm_res = []\n    # for each batch\n    segms = results['segm']\n    if segms is None:\n        return None\n    segms = segms.astype(np.uint8)\n    clsid_labels = results['cate_label']\n    clsid_scores = results['cate_score']\n    lengths = segms.shape[0]\n    im_id = int(image_id[0][0])\n    if lengths == 0:\n        return None\n    # for each sample\n    for i in range(lengths):\n        clsid = int(clsid_labels[i])\n        catid = num_id_to_cat_id_map[clsid]\n        score = float(clsid_scores[i])\n        mask = segms[i]\n        segm = mask_util.encode(np.array(mask[:, :, np.newaxis], order='F'))[0]\n        segm['counts'] = segm['counts'].decode('utf8')\n        coco_res = {\n            'image_id': im_id,\n            'category_id': catid,\n            'segmentation': segm,\n            'score': score\n        }\n        segm_res.append(coco_res)\n    return segm_res\n\n\ndef get_keypoint_res(results, im_id):\n    anns = []\n    preds = results['keypoint']\n    for idx in range(im_id.shape[0]):\n        image_id = im_id[idx].item()\n        kpts, scores = preds[idx]\n        for kpt, score in zip(kpts, scores):\n            kpt = kpt.flatten()\n            ann = {\n                'image_id': image_id,\n                'category_id': 1,  # hard-coded: COCO keypoints use a single 'person' category\n                'keypoints': kpt.tolist(),\n                'score': float(score)\n            }\n            x = kpt[0::3]\n            y = kpt[1::3]\n            x0, x1, y0, y1 = np.min(x).item(), np.max(x).item(), np.min(y).item(\n            ), np.max(y).item()\n            ann['area'] = (x1 - x0) * (y1 - y0)\n            ann['bbox'] = [x0, y0, x1 - x0, y1 - y0]\n            anns.append(ann)\n    return anns\n\n\ndef get_pose3d_res(results, im_id):\n    anns = []\n    preds = results['pose3d']\n    for idx in range(im_id.shape[0]):\n        image_id = im_id[idx].item()\n        pose3d = preds[idx]\n        ann = {\n            'image_id': image_id,\n            'category_id': 1,  # hard-coded: single 'person' category\n            'pose3d': pose3d.tolist(),\n            'score': float(1.)\n        }\n        anns.append(ann)\n    return anns\n"
  },
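  {
    "path": "ppdet/metrics/examples/json_results_sketch.py",
    "content": "# Illustrative sketch (not part of the original codebase): feeds toy numpy\n# outputs through get_det_res() from json_results.py to show the expected\n# input layout. All numbers and the file path of this sketch are made up.\nimport numpy as np\nfrom ppdet.metrics.json_results import get_det_res\n\nif __name__ == '__main__':\n    # two detections for image 7, one for image 9;\n    # each row is [class_id, score, xmin, ymin, xmax, ymax]\n    bboxes = np.array([\n        [0, 0.9, 10., 20., 50., 80.],\n        [1, 0.6, 30., 30., 60., 90.],\n        [0, 0.8, 5., 5., 25., 45.],\n    ])\n    bbox_nums = [2, 1]                  # detections per image\n    image_id = np.array([[7], [9]])     # one im_id row per image\n    label_to_cat_id_map = {0: 1, 1: 2}  # model label -> dataset category id\n    res = get_det_res(bboxes, bbox_nums, image_id, label_to_cat_id_map)\n    # res is a list of COCO-style dicts:\n    # {'image_id': 7, 'category_id': 1, 'bbox': [x, y, w, h], 'score': 0.9}, ...\n    print(res)\n"
  },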
  {
    "path": "ppdet/metrics/keypoint_metrics.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport os\nimport json\nfrom collections import defaultdict, OrderedDict\nimport numpy as np\nimport paddle\nfrom pycocotools.coco import COCO\nfrom pycocotools.cocoeval import COCOeval\nfrom ..modeling.keypoint_utils import oks_nms, keypoint_pck_accuracy, keypoint_auc, keypoint_epe\nfrom scipy.io import loadmat, savemat\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n__all__ = [\n    'KeyPointTopDownCOCOEval', 'KeyPointTopDownCOCOWholeBadyHandEval',\n    'KeyPointTopDownMPIIEval'\n]\n\n\nclass KeyPointTopDownCOCOEval(object):\n    \"\"\"refer to\n        https://github.com/leoxiaobin/deep-high-resolution-net.pytorch\n        Copyright (c) Microsoft, under the MIT License.\n    \"\"\"\n\n    def __init__(self,\n                 anno_file,\n                 num_samples,\n                 num_joints,\n                 output_eval,\n                 iou_type='keypoints',\n                 in_vis_thre=0.2,\n                 oks_thre=0.9,\n                 save_prediction_only=False):\n        super(KeyPointTopDownCOCOEval, self).__init__()\n        self.coco = COCO(anno_file)\n        self.num_samples = num_samples\n        self.num_joints = num_joints\n        self.iou_type = iou_type\n        self.in_vis_thre = in_vis_thre\n        self.oks_thre = oks_thre\n        self.output_eval = output_eval\n        self.res_file = os.path.join(output_eval, \"keypoints_results.json\")\n        self.save_prediction_only = save_prediction_only\n        self.reset()\n\n    def reset(self):\n        self.results = {\n            'all_preds': np.zeros(\n                (self.num_samples, self.num_joints, 3), dtype=np.float32),\n            'all_boxes': np.zeros((self.num_samples, 6)),\n            'image_path': []\n        }\n        self.eval_results = {}\n        self.idx = 0\n\n    def update(self, inputs, outputs):\n        kpts, _ = outputs['keypoint'][0]\n\n        num_images = inputs['image'].shape[0]\n        self.results['all_preds'][self.idx:self.idx + num_images, :, 0:\n                                  3] = kpts[:, :, 0:3]\n        self.results['all_boxes'][self.idx:self.idx + num_images, 0:2] = inputs[\n            'center'].numpy()[:, 0:2] if isinstance(\n                inputs['center'], paddle.Tensor) else inputs['center'][:, 0:2]\n        self.results['all_boxes'][self.idx:self.idx + num_images, 2:4] = inputs[\n            'scale'].numpy()[:, 0:2] if isinstance(\n                inputs['scale'], paddle.Tensor) else inputs['scale'][:, 0:2]\n        self.results['all_boxes'][self.idx:self.idx + num_images, 4] = np.prod(\n            inputs['scale'].numpy() * 200,\n            1) if isinstance(inputs['scale'], paddle.Tensor) else np.prod(\n                inputs['scale'] * 200, 1)\n        self.results['all_boxes'][\n            self.idx:self.idx + num_images,\n            5] = 
np.squeeze(inputs['score'].numpy()) if isinstance(\n                inputs['score'], paddle.Tensor) else np.squeeze(inputs['score'])\n        if isinstance(inputs['im_id'], paddle.Tensor):\n            self.results['image_path'].extend(inputs['im_id'].numpy())\n        else:\n            self.results['image_path'].extend(inputs['im_id'])\n        self.idx += num_images\n\n    def _write_coco_keypoint_results(self, keypoints):\n        data_pack = [{\n            'cat_id': 1,\n            'cls': 'person',\n            'ann_type': 'keypoints',\n            'keypoints': keypoints\n        }]\n        results = self._coco_keypoint_results_one_category_kernel(data_pack[0])\n        if not os.path.exists(self.output_eval):\n            os.makedirs(self.output_eval)\n        with open(self.res_file, 'w') as f:\n            json.dump(results, f, sort_keys=True, indent=4)\n            logger.info(f'The keypoint result is saved to {self.res_file}.')\n        try:\n            json.load(open(self.res_file))\n        except Exception:\n            content = []\n            with open(self.res_file, 'r') as f:\n                for line in f:\n                    content.append(line)\n            content[-1] = ']'\n            with open(self.res_file, 'w') as f:\n                for c in content:\n                    f.write(c)\n\n    def _coco_keypoint_results_one_category_kernel(self, data_pack):\n        cat_id = data_pack['cat_id']\n        keypoints = data_pack['keypoints']\n        cat_results = []\n\n        for img_kpts in keypoints:\n            if len(img_kpts) == 0:\n                continue\n\n            _key_points = np.array(\n                [img_kpts[k]['keypoints'] for k in range(len(img_kpts))])\n            _key_points = _key_points.reshape(_key_points.shape[0], -1)\n\n            result = [{\n                'image_id': img_kpts[k]['image'],\n                'category_id': cat_id,\n                'keypoints': _key_points[k].tolist(),\n                'score': img_kpts[k]['score'],\n                'center': list(img_kpts[k]['center']),\n                'scale': list(img_kpts[k]['scale'])\n            } for k in range(len(img_kpts))]\n            cat_results.extend(result)\n\n        return cat_results\n\n    def get_final_results(self, preds, all_boxes, img_path):\n        _kpts = []\n        for idx, kpt in enumerate(preds):\n            _kpts.append({\n                'keypoints': kpt,\n                'center': all_boxes[idx][0:2],\n                'scale': all_boxes[idx][2:4],\n                'area': all_boxes[idx][4],\n                'score': all_boxes[idx][5],\n                'image': int(img_path[idx])\n            })\n        # image x person x (keypoints)\n        kpts = defaultdict(list)\n        for kpt in _kpts:\n            kpts[kpt['image']].append(kpt)\n\n        # rescoring and oks nms\n        num_joints = preds.shape[1]\n        in_vis_thre = self.in_vis_thre\n        oks_thre = self.oks_thre\n        oks_nmsed_kpts = []\n        for img in kpts.keys():\n            img_kpts = kpts[img]\n            for n_p in img_kpts:\n                box_score = n_p['score']\n                kpt_score = 0\n                valid_num = 0\n                for n_jt in range(0, num_joints):\n                    t_s = n_p['keypoints'][n_jt][2]\n                    if t_s > in_vis_thre:\n                        kpt_score = kpt_score + t_s\n                        valid_num = valid_num + 1\n                if valid_num != 0:\n                    kpt_score = kpt_score / valid_num\n    
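                # rescoring: combined confidence for this person is the detector box\n                # score times the mean score of keypoints deemed visible (score > in_vis_thre)\n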
                n_p['score'] = kpt_score * box_score\n\n            keep = oks_nms([img_kpts[i] for i in range(len(img_kpts))],\n                           oks_thre)\n\n            if len(keep) == 0:\n                oks_nmsed_kpts.append(img_kpts)\n            else:\n                oks_nmsed_kpts.append([img_kpts[_keep] for _keep in keep])\n\n        self._write_coco_keypoint_results(oks_nmsed_kpts)\n\n    def accumulate(self):\n        self.get_final_results(self.results['all_preds'],\n                               self.results['all_boxes'],\n                               self.results['image_path'])\n        if self.save_prediction_only:\n            logger.info(f'The keypoint result is saved to {self.res_file}, '\n                        'so the mAP will not be evaluated.')\n            return\n        coco_dt = self.coco.loadRes(self.res_file)\n        coco_eval = COCOeval(self.coco, coco_dt, 'keypoints')\n        coco_eval.params.useSegm = None\n        coco_eval.evaluate()\n        coco_eval.accumulate()\n        coco_eval.summarize()\n\n        keypoint_stats = []\n        for ind in range(len(coco_eval.stats)):\n            keypoint_stats.append((coco_eval.stats[ind]))\n        self.eval_results['keypoint'] = keypoint_stats\n\n    def log(self):\n        if self.save_prediction_only:\n            return\n        stats_names = [\n            'AP', 'Ap .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5',\n            'AR .75', 'AR (M)', 'AR (L)'\n        ]\n        num_values = len(stats_names)\n        print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |')\n        print('|---' * num_values + '|')\n\n        print(' '.join([\n            '| {:.3f}'.format(value) for value in self.eval_results['keypoint']\n        ]) + ' |')\n\n    def get_results(self):\n        return self.eval_results\n\n\nclass KeyPointTopDownCOCOWholeBadyHandEval(object):\n    def __init__(self,\n                 anno_file,\n                 num_samples,\n                 num_joints,\n                 output_eval,\n                 save_prediction_only=False):\n        super(KeyPointTopDownCOCOWholeBadyHandEval, self).__init__()\n        self.coco = COCO(anno_file)\n        self.num_samples = num_samples\n        self.num_joints = num_joints\n        self.output_eval = output_eval\n        self.res_file = os.path.join(output_eval, \"keypoints_results.json\")\n        self.save_prediction_only = save_prediction_only\n        self.parse_dataset()\n        self.reset()\n\n    def parse_dataset(self):\n        gt_db = []\n        num_joints = self.num_joints\n        coco = self.coco\n        img_ids = coco.getImgIds()\n        for img_id in img_ids:\n            ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False)\n            objs = coco.loadAnns(ann_ids)\n\n            for obj in objs:\n                for hand_type in ['left', 'right']:\n                    if (obj[f'{hand_type}hand_valid'] and\n                            max(obj[f'{hand_type}hand_kpts']) > 0):\n\n                        joints = np.zeros((num_joints, 3), dtype=np.float32)\n                        joints_vis = np.zeros((num_joints, 3), dtype=np.float32)\n\n                        keypoints = np.array(obj[f'{hand_type}hand_kpts'])\n                        keypoints = keypoints.reshape(-1, 3)\n                        joints[:, :2] = keypoints[:, :2]\n                        joints_vis[:, :2] = np.minimum(1, keypoints[:, 2:3])\n\n                        gt_db.append({\n                            'bbox': 
obj[f'{hand_type}hand_box'],\n                            'gt_joints': joints,\n                            'joints_vis': joints_vis,\n                        })\n        self.db = gt_db\n\n    def reset(self):\n        self.results = {\n            'preds': np.zeros(\n                (self.num_samples, self.num_joints, 3), dtype=np.float32),\n        }\n        self.eval_results = {}\n        self.idx = 0\n\n    def update(self, inputs, outputs):\n        kpts, _ = outputs['keypoint'][0]\n        num_images = inputs['image'].shape[0]\n        self.results['preds'][self.idx:self.idx + num_images, :, 0:\n                              3] = kpts[:, :, 0:3]\n        self.idx += num_images\n\n    def accumulate(self):\n        self.get_final_results(self.results['preds'])\n        if self.save_prediction_only:\n            logger.info(f'The keypoint result is saved to {self.res_file}, '\n                        'so the metrics will not be evaluated.')\n            return\n\n        self.eval_results = self.evaluate(self.res_file, ('PCK', 'AUC', 'EPE'))\n\n    def get_final_results(self, preds):\n        kpts = []\n        for idx, kpt in enumerate(preds):\n            kpts.append({'keypoints': kpt.tolist()})\n\n        self._write_keypoint_results(kpts)\n\n    def _write_keypoint_results(self, keypoints):\n        if not os.path.exists(self.output_eval):\n            os.makedirs(self.output_eval)\n        with open(self.res_file, 'w') as f:\n            json.dump(keypoints, f, sort_keys=True, indent=4)\n            logger.info(f'The keypoint result is saved to {self.res_file}.')\n        try:\n            with open(self.res_file) as f:\n                json.load(f)\n        except Exception:\n            # repair a truncated file by forcing a closing bracket on the last line\n            content = []\n            with open(self.res_file, 'r') as f:\n                for line in f:\n                    content.append(line)\n            content[-1] = ']'\n            with open(self.res_file, 'w') as f:\n                for c in content:\n                    f.write(c)\n\n    def log(self):\n        if self.save_prediction_only:\n            return\n        for item, value in self.eval_results.items():\n            print(\"{} : {}\".format(item, value))\n\n    def get_results(self):\n        return self.eval_results\n\n    def evaluate(self, res_file, metrics, pck_thr=0.2, auc_nor=30):\n        \"\"\"Keypoint evaluation.\n\n        Args:\n            res_file (str): Json file storing prediction results.\n            metrics (str | list[str]): Metrics to be performed.\n                Options: 'PCK', 'AUC', 'EPE'.\n            pck_thr (float): PCK threshold, default as 0.2.\n            auc_nor (float): AUC normalization factor, default as 30 pixels.\n\n        Returns:\n            OrderedDict: A (metric name -> value) mapping for the requested\n            metrics.\n        \"\"\"\n        info_str = []\n\n        with open(res_file, 'r') as fin:\n            preds = json.load(fin)\n        assert len(preds) == len(self.db)\n\n        outputs = []\n        gts = []\n        masks = []\n        threshold_bbox = []\n\n        for pred, item in zip(preds, self.db):\n            outputs.append(np.array(pred['keypoints'])[:, :-1])\n            gts.append(np.array(item['gt_joints'])[:, :-1])\n            masks.append((np.array(item['joints_vis'])[:, 0]) > 0)\n            if 'PCK' in metrics:\n                bbox = np.array(item['bbox'])\n                bbox_thr = np.max(bbox[2:])\n                threshold_bbox.append(np.array([bbox_thr, bbox_thr]))\n\n        outputs = np.array(outputs)\n        gts = np.array(gts)\n        masks = np.array(masks)\n
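        # PCK counts a visible joint as correct when the prediction lands within\n        # pck_thr times the hand bbox size of the ground truth; AUC integrates the\n        # PCK curve over distance thresholds normalized by auc_nor pixels; EPE is\n        # the mean end-point (pixel) error over visible joints.\n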
        threshold_bbox = np.array(threshold_bbox)\n\n        if 'PCK' in metrics:\n            _, pck, _ = keypoint_pck_accuracy(outputs, gts, masks, pck_thr,\n                                              threshold_bbox)\n            info_str.append(('PCK', pck))\n\n        if 'AUC' in metrics:\n            info_str.append(('AUC', keypoint_auc(outputs, gts, masks, auc_nor)))\n\n        if 'EPE' in metrics:\n            info_str.append(('EPE', keypoint_epe(outputs, gts, masks)))\n\n        name_value = OrderedDict(info_str)\n\n        return name_value\n\n\nclass KeyPointTopDownMPIIEval(object):\n    def __init__(self,\n                 anno_file,\n                 num_samples,\n                 num_joints,\n                 output_eval,\n                 oks_thre=0.9,\n                 save_prediction_only=False):\n        super(KeyPointTopDownMPIIEval, self).__init__()\n        self.ann_file = anno_file\n        self.res_file = os.path.join(output_eval, \"keypoints_results.json\")\n        self.save_prediction_only = save_prediction_only\n        self.reset()\n\n    def reset(self):\n        self.results = []\n        self.eval_results = {}\n        self.idx = 0\n\n    def update(self, inputs, outputs):\n        kpts, _ = outputs['keypoint'][0]\n\n        num_images = inputs['image'].shape[0]\n        results = {}\n        results['preds'] = kpts[:, :, 0:3]\n        results['boxes'] = np.zeros((num_images, 6))\n        results['boxes'][:, 0:2] = inputs['center'].numpy()[:, 0:2]\n        results['boxes'][:, 2:4] = inputs['scale'].numpy()[:, 0:2]\n        results['boxes'][:, 4] = np.prod(inputs['scale'].numpy() * 200, 1)\n        results['boxes'][:, 5] = np.squeeze(inputs['score'].numpy())\n        results['image_path'] = inputs['image_file']\n\n        self.results.append(results)\n\n    def accumulate(self):\n        self._mpii_keypoint_results_save()\n        if self.save_prediction_only:\n            logger.info(f'The keypoint result is saved to {self.res_file}, '\n                        'so the PCKh will not be evaluated.')\n            return\n\n        self.eval_results = self.evaluate(self.results)\n\n    def _mpii_keypoint_results_save(self):\n        results = []\n        for res in self.results:\n            if len(res) == 0:\n                continue\n            # one record per image in the batch\n            result = [{\n                'preds': res['preds'][k].tolist(),\n                'boxes': res['boxes'][k].tolist(),\n                'image_path': res['image_path'][k],\n            } for k in range(len(res['preds']))]\n            results.extend(result)\n        with open(self.res_file, 'w') as f:\n            json.dump(results, f, sort_keys=True, indent=4)\n            logger.info(f'The keypoint result is saved to {self.res_file}.')\n\n    def log(self):\n        if self.save_prediction_only:\n            return\n        for item, value in self.eval_results.items():\n            print(\"{} : {}\".format(item, value))\n\n    def get_results(self):\n        return self.eval_results\n\n    def evaluate(self, outputs, savepath=None):\n        \"\"\"Evaluate PCKh for MPII dataset. 
refer to\n        https://github.com/leoxiaobin/deep-high-resolution-net.pytorch\n        Copyright (c) Microsoft, under the MIT License.\n\n        Args:\n            outputs(list(preds, boxes)):\n\n                * preds (np.ndarray[N,K,3]): The first two dimensions are\n                  coordinates, score is the third dimension of the array.\n                * boxes (np.ndarray[N,6]): [center[0], center[1], scale[0]\n                  , scale[1],area, score]\n\n        Returns:\n            dict: PCKh for each joint\n        \"\"\"\n\n        kpts = []\n        for output in outputs:\n            preds = output['preds']\n            batch_size = preds.shape[0]\n            for i in range(batch_size):\n                kpts.append({'keypoints': preds[i]})\n\n        preds = np.stack([kpt['keypoints'] for kpt in kpts])\n\n        # convert 0-based index to 1-based index,\n        # and get the first two dimensions.\n        preds = preds[..., :2] + 1.0\n\n        if savepath is not None:\n            pred_file = os.path.join(savepath, 'pred.mat')\n            savemat(pred_file, mdict={'preds': preds})\n\n        SC_BIAS = 0.6\n        threshold = 0.5\n\n        gt_file = os.path.join(\n            os.path.dirname(self.ann_file), 'mpii_gt_val.mat')\n        gt_dict = loadmat(gt_file)\n        dataset_joints = gt_dict['dataset_joints']\n        jnt_missing = gt_dict['jnt_missing']\n        pos_gt_src = gt_dict['pos_gt_src']\n        headboxes_src = gt_dict['headboxes_src']\n\n        pos_pred_src = np.transpose(preds, [1, 2, 0])\n\n        head = np.where(dataset_joints == 'head')[1][0]\n        lsho = np.where(dataset_joints == 'lsho')[1][0]\n        lelb = np.where(dataset_joints == 'lelb')[1][0]\n        lwri = np.where(dataset_joints == 'lwri')[1][0]\n        lhip = np.where(dataset_joints == 'lhip')[1][0]\n        lkne = np.where(dataset_joints == 'lkne')[1][0]\n        lank = np.where(dataset_joints == 'lank')[1][0]\n\n        rsho = np.where(dataset_joints == 'rsho')[1][0]\n        relb = np.where(dataset_joints == 'relb')[1][0]\n        rwri = np.where(dataset_joints == 'rwri')[1][0]\n        rkne = np.where(dataset_joints == 'rkne')[1][0]\n        rank = np.where(dataset_joints == 'rank')[1][0]\n        rhip = np.where(dataset_joints == 'rhip')[1][0]\n\n        jnt_visible = 1 - jnt_missing\n        uv_error = pos_pred_src - pos_gt_src\n        uv_err = np.linalg.norm(uv_error, axis=1)\n        headsizes = headboxes_src[1, :, :] - headboxes_src[0, :, :]\n        headsizes = np.linalg.norm(headsizes, axis=0)\n        headsizes *= SC_BIAS\n        scale = headsizes * np.ones((len(uv_err), 1), dtype=np.float32)\n        scaled_uv_err = uv_err / scale\n        scaled_uv_err = scaled_uv_err * jnt_visible\n        jnt_count = np.sum(jnt_visible, axis=1)\n        less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible\n        PCKh = 100. * np.sum(less_than_threshold, axis=1) / jnt_count\n\n        # save\n        rng = np.arange(0, 0.5 + 0.01, 0.01)\n        pckAll = np.zeros((len(rng), 16), dtype=np.float32)\n\n        for r, threshold in enumerate(rng):\n            less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible\n            pckAll[r, :] = 100. 
* np.sum(less_than_threshold,\n                                         axis=1) / jnt_count\n\n        PCKh = np.ma.array(PCKh, mask=False)\n        PCKh.mask[6:8] = True\n\n        jnt_count = np.ma.array(jnt_count, mask=False)\n        jnt_count.mask[6:8] = True\n        jnt_ratio = jnt_count / np.sum(jnt_count).astype(np.float64)\n\n        name_value = [  #noqa\n            ('Head', PCKh[head]),\n            ('Shoulder', 0.5 * (PCKh[lsho] + PCKh[rsho])),\n            ('Elbow', 0.5 * (PCKh[lelb] + PCKh[relb])),\n            ('Wrist', 0.5 * (PCKh[lwri] + PCKh[rwri])),\n            ('Hip', 0.5 * (PCKh[lhip] + PCKh[rhip])),\n            ('Knee', 0.5 * (PCKh[lkne] + PCKh[rkne])),\n            ('Ankle', 0.5 * (PCKh[lank] + PCKh[rank])),\n            ('PCKh', np.sum(PCKh * jnt_ratio)),\n            ('PCKh@0.1', np.sum(pckAll[11, :] * jnt_ratio))\n        ]\n        name_value = OrderedDict(name_value)\n\n        return name_value\n\n    def _sort_and_unique_bboxes(self, kpts, key='bbox_id'):\n        \"\"\"sort kpts and remove the repeated ones.\"\"\"\n        kpts = sorted(kpts, key=lambda x: x[key])\n        num = len(kpts)\n        for i in range(num - 1, 0, -1):\n            if kpts[i][key] == kpts[i - 1][key]:\n                del kpts[i]\n\n        return kpts\n"
  },
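  {
    "path": "ppdet/metrics/examples/keypoint_eval_sketch.py",
    "content": "# Illustrative sketch (not part of the original codebase): shows the\n# reset/update/accumulate/log driver protocol the keypoint_metrics.py\n# evaluators follow. The annotation path is a placeholder, and `loader` and\n# `model` stand in for a real dataloader and a model whose outputs contain\n# {'keypoint': [(kpts[N, K, 3], scores)]}.\nfrom ppdet.metrics.keypoint_metrics import KeyPointTopDownCOCOEval\n\n\ndef run_eval(loader, model, num_samples):\n    evaluator = KeyPointTopDownCOCOEval(\n        anno_file='data/person_keypoints_val2017.json',  # placeholder path\n        num_samples=num_samples,\n        num_joints=17,\n        output_eval='output/eval')\n    evaluator.reset()\n    for inputs in loader:\n        outputs = model(inputs)\n        evaluator.update(inputs, outputs)\n    evaluator.accumulate()   # writes keypoints_results.json and runs COCOeval\n    evaluator.log()          # prints the AP/AR table\n    return evaluator.get_results()\n"
  },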
  {
    "path": "ppdet/metrics/lvis_utils.py",
    "content": "#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\nimport numpy as np\nimport itertools\n\nfrom ppdet.metrics.json_results import get_det_res, get_det_poly_res, get_seg_res, get_solov2_segm_res, get_keypoint_res, get_pose3d_res\nfrom ppdet.metrics.map_utils import draw_pr_curve\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\ndef lvisapi_eval(jsonfile,\n                 style,\n                 lvis_gt=None,\n                 anno_file=None,\n                 max_dets=(100, 300, 1000),\n                 classwise=False,\n                 sigmas=None,\n                 use_area=True):\n    \"\"\"\n    Args:\n        jsonfile (str): Evaluation json file, eg: bbox.json, mask.json.\n        style (str): COCOeval style, can be `bbox` , `segm` , `proposal`, `keypoints` and `keypoints_crowd`.\n        coco_gt (str): Whether to load COCOAPI through anno_file,\n                 eg: coco_gt = COCO(anno_file)\n        anno_file (str): COCO annotations file.\n        max_dets (tuple): COCO evaluation maxDets.\n        classwise (bool): Whether per-category AP and draw P-R Curve or not.\n        sigmas (nparray): keypoint labelling sigmas.\n        use_area (bool): If gt annotations (eg. CrowdPose, AIC)\n                         do not have 'area', please set use_area=False.\n    \"\"\"\n    assert lvis_gt != None or anno_file != None\n    from lvis import LVIS, LVISEval, LVISResults\n\n    if lvis_gt == None:\n        # coco_gt = COCO(anno_file)\n        lvis_gt = LVIS(anno_file)\n\n    logger.info(\"Start evaluate...\")\n    lvis_dt = LVISResults(lvis_gt, jsonfile)\n    \n    lvis_eval = LVISEval(lvis_gt, lvis_dt, style)\n    lvis_eval.evaluate()\n    lvis_eval.accumulate()\n    lvis_eval.summarize()\n    if classwise:\n        # Compute per-category AP and PR curve\n        try:\n            from terminaltables import AsciiTable\n        except Exception as e:\n            logger.error(\n                'terminaltables not found, plaese install terminaltables. 
'\n                'for example: `pip install terminaltables`.')\n            raise e\n        precisions = coco_eval.eval['precision']\n        cat_ids = coco_gt.getCatIds()\n        # precision: (iou, recall, cls, area range, max dets)\n        assert len(cat_ids) == precisions.shape[2]\n        results_per_category = []\n        for idx, catId in enumerate(cat_ids):\n            # area range index 0: all area ranges\n            # max dets index -1: typically 100 per image\n            nm = coco_gt.loadCats(catId)[0]\n            precision = precisions[:, :, idx, 0, -1]\n            precision = precision[precision > -1]\n            if precision.size:\n                ap = np.mean(precision)\n            else:\n                ap = float('nan')\n            results_per_category.append(\n                (str(nm[\"name\"]), '{:0.3f}'.format(float(ap))))\n            pr_array = precisions[0, :, idx, 0, 2]\n            recall_array = np.arange(0.0, 1.01, 0.01)\n            draw_pr_curve(\n                pr_array,\n                recall_array,\n                out_dir=style + '_pr_curve',\n                file_name='{}_precision_recall_curve.jpg'.format(nm[\"name\"]))\n\n        num_columns = min(6, len(results_per_category) * 2)\n        results_flatten = list(itertools.chain(*results_per_category))\n        headers = ['category', 'AP'] * (num_columns // 2)\n        results_2d = itertools.zip_longest(\n            * [results_flatten[i::num_columns] for i in range(num_columns)])\n        table_data = [headers]\n        table_data += [result for result in results_2d]\n        table = AsciiTable(table_data)\n        logger.info('Per-category of {} AP: \\n{}'.format(style, table.table))\n        logger.info(\"per-category PR curve has output to {} folder.\".format(\n            style + '_pr_curve'))\n    # flush coco evaluation result\n    sys.stdout.flush()\n    \n    return lvis_eval.get_results()"
  },
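  {
    "path": "ppdet/metrics/examples/lvis_eval_sketch.py",
    "content": "# Illustrative sketch (not part of the original codebase): drives\n# ppdet.metrics.lvis_utils.lvisapi_eval on already-dumped detection results.\n# Both file paths below are placeholders, not real repo assets.\nfrom ppdet.metrics.lvis_utils import lvisapi_eval\n\nif __name__ == '__main__':\n    # bbox.json: LVIS-style list of {image_id, category_id, bbox, score}\n    # dicts, e.g. as produced by get_det_res() in json_results.py.\n    results = lvisapi_eval(\n        'output/bbox.json',                     # placeholder detections dump\n        'bbox',                                 # evaluation style\n        anno_file='data/lvis_v1_val.json',      # placeholder LVIS annotations\n        classwise=False)\n    print(results)\n"
  },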
  {
    "path": "ppdet/metrics/map_utils.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nfrom __future__ import unicode_literals\n\nimport os\nimport sys\nimport numpy as np\nimport itertools\nimport paddle\nfrom ppdet.modeling.rbox_utils import poly2rbox_np\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n__all__ = [\n    'draw_pr_curve',\n    'bbox_area',\n    'jaccard_overlap',\n    'prune_zero_padding',\n    'DetectionMAP',\n    'ap_per_class',\n    'compute_ap',\n]\n\n\ndef draw_pr_curve(precision,\n                  recall,\n                  iou=0.5,\n                  out_dir='pr_curve',\n                  file_name='precision_recall_curve.jpg'):\n    if not os.path.exists(out_dir):\n        os.makedirs(out_dir)\n    output_path = os.path.join(out_dir, file_name)\n    try:\n        import matplotlib.pyplot as plt\n    except Exception as e:\n        logger.error('Matplotlib not found, plaese install matplotlib.'\n                     'for example: `pip install matplotlib`.')\n        raise e\n    plt.cla()\n    plt.figure('P-R Curve')\n    plt.title('Precision/Recall Curve(IoU={})'.format(iou))\n    plt.xlabel('Recall')\n    plt.ylabel('Precision')\n    plt.grid(True)\n    plt.plot(recall, precision)\n    plt.savefig(output_path)\n\n\ndef bbox_area(bbox, is_bbox_normalized):\n    \"\"\"\n    Calculate area of a bounding box\n    \"\"\"\n    norm = 1. 
    norm = 1. - float(is_bbox_normalized)\n    width = bbox[2] - bbox[0] + norm\n    height = bbox[3] - bbox[1] + norm\n    return width * height\n\n\ndef jaccard_overlap(pred, gt, is_bbox_normalized=False):\n    \"\"\"\n    Calculate the jaccard overlap ratio between two bounding boxes\n    \"\"\"\n    if pred[0] >= gt[2] or pred[2] <= gt[0] or \\\n        pred[1] >= gt[3] or pred[3] <= gt[1]:\n        return 0.\n    inter_xmin = max(pred[0], gt[0])\n    inter_ymin = max(pred[1], gt[1])\n    inter_xmax = min(pred[2], gt[2])\n    inter_ymax = min(pred[3], gt[3])\n    inter_size = bbox_area([inter_xmin, inter_ymin, inter_xmax, inter_ymax],\n                           is_bbox_normalized)\n    pred_size = bbox_area(pred, is_bbox_normalized)\n    gt_size = bbox_area(gt, is_bbox_normalized)\n    overlap = float(inter_size) / (pred_size + gt_size - inter_size)\n    return overlap\n\n\ndef calc_rbox_iou(pred, gt_poly):\n    \"\"\"\n    Calculate the IoU between two rotated boxes\n    \"\"\"\n    # calc iou of bounding box for speedup\n    pred = np.array(pred, np.float32).reshape(-1, 2)\n    gt_poly = np.array(gt_poly, np.float32).reshape(-1, 2)\n    pred_rect = [\n        np.min(pred[:, 0]), np.min(pred[:, 1]), np.max(pred[:, 0]),\n        np.max(pred[:, 1])\n    ]\n    gt_rect = [\n        np.min(gt_poly[:, 0]), np.min(gt_poly[:, 1]), np.max(gt_poly[:, 0]),\n        np.max(gt_poly[:, 1])\n    ]\n    iou = jaccard_overlap(pred_rect, gt_rect, False)\n\n    if iou <= 0:\n        return iou\n\n    # calc rbox iou\n    pred_rbox = poly2rbox_np(pred.reshape(-1, 8)).reshape(-1, 5)\n    gt_rbox = poly2rbox_np(gt_poly.reshape(-1, 8)).reshape(-1, 5)\n    try:\n        from ext_op import rbox_iou\n    except Exception as e:\n        print(\"failed to import custom ops, please install ext_op \" \\\n                  \"following ppdet/ext_op/README.md\", e)\n        sys.stdout.flush()\n        sys.exit(-1)\n    pd_gt_rbox = paddle.to_tensor(gt_rbox, dtype='float32')\n    pd_pred_rbox = paddle.to_tensor(pred_rbox, dtype='float32')\n    iou = rbox_iou(pd_gt_rbox, pd_pred_rbox)\n    iou = iou.numpy()\n    return iou[0][0]\n\n\ndef prune_zero_padding(gt_box, gt_label, difficult=None):\n    valid_cnt = 0\n    for i in range(len(gt_box)):\n        if (gt_box[i] == 0).all():\n            break\n        valid_cnt += 1\n    return (gt_box[:valid_cnt], gt_label[:valid_cnt], difficult[:valid_cnt]\n            if difficult is not None else None)\n\n\nclass DetectionMAP(object):\n    \"\"\"\n    Calculate detection mean average precision.\n    Currently support two types: 11point and integral\n\n    Args:\n        class_num (int): The class number.\n        overlap_thresh (float): The threshold of overlap\n            ratio between prediction bounding box and\n            ground truth bounding box for deciding\n            true/false positive. Default 0.5.\n        map_type (str): Calculation method of mean average\n            precision, currently support '11point' and\n            'integral'. Default '11point'.\n        is_bbox_normalized (bool): Whether bounding boxes\n            are normalized to range [0, 1]. Default False.\n        evaluate_difficult (bool): Whether to evaluate\n            difficult bounding boxes. 
Default False.\n        catid2name (dict): Mapping between category id and category name.\n        classwise (bool): Whether per-category AP and draw\n            P-R Curve or not.\n    \"\"\"\n\n    def __init__(self,\n                 class_num,\n                 overlap_thresh=0.5,\n                 map_type='11point',\n                 is_bbox_normalized=False,\n                 evaluate_difficult=False,\n                 catid2name=None,\n                 classwise=False):\n        self.class_num = class_num\n        self.overlap_thresh = overlap_thresh\n        assert map_type in ['11point', 'integral'], \\\n                \"map_type currently only support '11point' \"\\\n                \"and 'integral'\"\n        self.map_type = map_type\n        self.is_bbox_normalized = is_bbox_normalized\n        self.evaluate_difficult = evaluate_difficult\n        self.classwise = classwise\n        self.classes = []\n        for cname in catid2name.values():\n            self.classes.append(cname)\n        self.reset()\n\n    def update(self, bbox, score, label, gt_box, gt_label, difficult=None):\n        \"\"\"\n        Update metric statistics from the given prediction and ground\n        truth information.\n        \"\"\"\n        if difficult is None:\n            difficult = np.zeros_like(gt_label)\n\n        # record class gt count\n        for gtl, diff in zip(gt_label, difficult):\n            if self.evaluate_difficult or int(diff) == 0:\n                self.class_gt_counts[int(np.array(gtl))] += 1\n\n        # record class score positive\n        visited = [False] * len(gt_label)\n        for b, s, l in zip(bbox, score, label):\n            pred = b.tolist() if isinstance(b, np.ndarray) else b\n            max_idx = -1\n            max_overlap = -1.0\n            for i, gl in enumerate(gt_label):\n                if int(gl) == int(l):\n                    if len(gt_box[i]) == 8:\n                        overlap = calc_rbox_iou(pred, gt_box[i])\n                    else:\n                        overlap = jaccard_overlap(pred, gt_box[i],\n                                                  self.is_bbox_normalized)\n                    if overlap > max_overlap:\n                        max_overlap = overlap\n                        max_idx = i\n\n            if max_overlap > self.overlap_thresh:\n                if self.evaluate_difficult or \\\n                        int(np.array(difficult[max_idx])) == 0:\n                    if not visited[max_idx]:\n                        # first match of this gt box: a true positive\n                        self.class_score_poss[int(l)].append([s, 1.0])\n                        visited[max_idx] = True\n                    else:\n                        # duplicate match of an already-claimed gt box: a false positive\n                        self.class_score_poss[int(l)].append([s, 0.0])\n            else:\n                self.class_score_poss[int(l)].append([s, 0.0])\n\n    def reset(self):\n        \"\"\"\n        Reset metric statistics\n        \"\"\"\n        self.class_score_poss = [[] for _ in range(self.class_num)]\n        self.class_gt_counts = [0] * self.class_num\n        self.mAP = 0.0\n\n    def accumulate(self):\n        \"\"\"\n        Accumulate metric results and calculate mAP\n        \"\"\"\n        mAP = 0.\n        valid_cnt = 0\n        eval_results = []\n        for score_pos, count in zip(self.class_score_poss,\n                                    self.class_gt_counts):\n            if count == 0: continue\n            if len(score_pos) == 0:\n                valid_cnt += 1\n                continue\n\n            accum_tp_list, accum_fp_list = \\\n                    
self._get_tp_fp_accum(score_pos)\n            precision = []\n            recall = []\n            for ac_tp, ac_fp in zip(accum_tp_list, accum_fp_list):\n                precision.append(float(ac_tp) / (ac_tp + ac_fp))\n                recall.append(float(ac_tp) / count)\n\n            one_class_ap = 0.0\n            if self.map_type == '11point':\n                max_precisions = [0.] * 11\n                start_idx = len(precision) - 1\n                for j in range(10, -1, -1):\n                    for i in range(start_idx, -1, -1):\n                        if recall[i] < float(j) / 10.:\n                            start_idx = i\n                            if j > 0:\n                                max_precisions[j - 1] = max_precisions[j]\n                                break\n                        else:\n                            if max_precisions[j] < precision[i]:\n                                max_precisions[j] = precision[i]\n                one_class_ap = sum(max_precisions) / 11.\n                mAP += one_class_ap\n                valid_cnt += 1\n            elif self.map_type == 'integral':\n                import math\n                prev_recall = 0.\n                for i in range(len(precision)):\n                    recall_gap = math.fabs(recall[i] - prev_recall)\n                    if recall_gap > 1e-6:\n                        one_class_ap += precision[i] * recall_gap\n                        prev_recall = recall[i]\n                mAP += one_class_ap\n                valid_cnt += 1\n            else:\n                logger.error(\"Unsupported mAP type {}\".format(self.map_type))\n                sys.exit(1)\n            eval_results.append({\n                'class': self.classes[valid_cnt - 1],\n                'ap': one_class_ap,\n                'precision': precision,\n                'recall': recall,\n            })\n        self.eval_results = eval_results\n        self.mAP = mAP / float(valid_cnt) if valid_cnt > 0 else mAP\n\n    def get_map(self):\n        \"\"\"\n        Get mAP result\n        \"\"\"\n        if self.mAP is None:\n            logger.error(\"mAP is not calculated.\")\n        if self.classwise:\n            # Compute per-category AP and PR curve\n            try:\n                from terminaltables import AsciiTable\n            except Exception as e:\n                logger.error(\n                    'terminaltables not found, please install terminaltables, 
'\n                    'for example: `pip install terminaltables`.')\n                raise e\n            results_per_category = []\n            for eval_result in self.eval_results:\n                results_per_category.append(\n                    (str(eval_result['class']),\n                     '{:0.3f}'.format(float(eval_result['ap']))))\n                draw_pr_curve(\n                    eval_result['precision'],\n                    eval_result['recall'],\n                    out_dir='voc_pr_curve',\n                    file_name='{}_precision_recall_curve.jpg'.format(\n                        eval_result['class']))\n\n            num_columns = min(6, len(results_per_category) * 2)\n            results_flatten = list(itertools.chain(*results_per_category))\n            headers = ['category', 'AP'] * (num_columns // 2)\n            results_2d = itertools.zip_longest(* [\n                results_flatten[i::num_columns] for i in range(num_columns)\n            ])\n            table_data = [headers]\n            table_data += [result for result in results_2d]\n            table = AsciiTable(table_data)\n            logger.info('Per-category of VOC AP: \\n{}'.format(table.table))\n            logger.info(\n                \"Per-category PR curves have been saved to the voc_pr_curve folder.\")\n        return self.mAP\n\n    def _get_tp_fp_accum(self, score_pos_list):\n        \"\"\"\n        Calculate accumulating true/false positive results from\n        [score, pos] records\n        \"\"\"\n        sorted_list = sorted(score_pos_list, key=lambda s: s[0], reverse=True)\n        accum_tp = 0\n        accum_fp = 0\n        accum_tp_list = []\n        accum_fp_list = []\n        for (score, pos) in sorted_list:\n            accum_tp += int(pos)\n            accum_tp_list.append(accum_tp)\n            accum_fp += 1 - int(pos)\n            accum_fp_list.append(accum_fp)\n        return accum_tp_list, accum_fp_list\n\n\ndef ap_per_class(tp, conf, pred_cls, target_cls):\n    \"\"\"\n    Computes the average precision, given the recall and precision curves.\n    Method originally from https://github.com/rafaelpadilla/Object-Detection-Metrics.\n\n    Args:\n        tp (list): True positives.\n        conf (list): Objectness value from 0-1.\n        pred_cls (list): Predicted object classes.\n        target_cls (list): Target object classes.\n    \"\"\"\n    tp, conf, pred_cls, target_cls = np.array(tp), np.array(conf), np.array(\n        pred_cls), np.array(target_cls)\n\n    # Sort by objectness\n    i = np.argsort(-conf)\n    tp, conf, pred_cls = tp[i], conf[i], pred_cls[i]\n\n    # Find unique classes\n    unique_classes = np.unique(np.concatenate((pred_cls, target_cls), 0))\n\n    # Create Precision-Recall curve and compute AP for each class\n    ap, p, r = [], [], []\n    for c in unique_classes:\n        i = pred_cls == c\n        n_gt = sum(target_cls == c)  # Number of ground truth objects\n        n_p = sum(i)  # Number of predicted objects\n\n        if (n_p == 0) and (n_gt == 0):\n            continue\n        elif (n_p == 0) or (n_gt == 0):\n            ap.append(0)\n            r.append(0)\n            p.append(0)\n        else:\n            # Accumulate FPs and TPs\n            fpc = np.cumsum(1 - tp[i])\n            tpc = np.cumsum(tp[i])\n\n            # Recall\n            recall_curve = tpc / (n_gt + 1e-16)\n            r.append(tpc[-1] / (n_gt + 1e-16))\n\n            # Precision\n            precision_curve = tpc / (tpc + fpc)\n            p.append(tpc[-1] / (tpc[-1] + 
fpc[-1]))\n\n            # AP from recall-precision curve\n            ap.append(compute_ap(recall_curve, precision_curve))\n\n    return np.array(ap), unique_classes.astype('int32'), np.array(r), np.array(\n        p)\n\n\ndef compute_ap(recall, precision):\n    \"\"\"\n    Computes the average precision, given the recall and precision curves.\n    Code originally from https://github.com/rbgirshick/py-faster-rcnn.\n    \n    Args:\n        recall (list): The recall curve.\n        precision (list): The precision curve.\n\n    Returns:\n        The average precision as computed in py-faster-rcnn.\n    \"\"\"\n    # correct AP calculation\n    # first append sentinel values at the end\n    mrec = np.concatenate(([0.], recall, [1.]))\n    mpre = np.concatenate(([0.], precision, [0.]))\n\n    # compute the precision envelope\n    for i in range(mpre.size - 1, 0, -1):\n        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])\n\n    # to calculate area under PR curve, look for points\n    # where X axis (recall) changes value\n    i = np.where(mrec[1:] != mrec[:-1])[0]\n\n    # and sum (\\Delta recall) * prec\n    ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])\n    return ap\n"
  },
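  {
    "path": "ppdet/metrics/examples/map_utils_sketch.py",
    "content": "# Illustrative sketch (not part of the original codebase): exercises two\n# helpers from map_utils.py on made-up numbers.\nfrom ppdet.metrics.map_utils import jaccard_overlap, compute_ap\n\nif __name__ == '__main__':\n    # IoU of two pixel-coordinate boxes [xmin, ymin, xmax, ymax]\n    iou = jaccard_overlap([0, 0, 10, 10], [5, 5, 15, 15])\n    print('IoU:', iou)\n\n    # AP from a toy recall/precision curve (already sorted by score);\n    # compute_ap appends sentinel values and integrates the precision envelope.\n    recall = [0.1, 0.4, 0.8]\n    precision = [1.0, 0.8, 0.6]\n    print('AP:', compute_ap(recall, precision))\n"
  },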
  {
    "path": "ppdet/metrics/mcmot_metrics.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport copy\nimport sys\nimport math\nfrom collections import defaultdict\n\nimport numpy as np\nimport pandas as pd\n\nfrom .metrics import Metric\ntry:\n    import motmetrics as mm\n    from motmetrics.math_util import quiet_divide\n    metrics = mm.metrics.motchallenge_metrics\n    mh = mm.metrics.create()\nexcept:\n    print(\n        'Warning: Unable to use MCMOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics'\n    )\n    pass\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n__all__ = ['MCMOTEvaluator', 'MCMOTMetric']\n\nMETRICS_LIST = [\n    'num_frames', 'num_matches', 'num_switches', 'num_transfer', 'num_ascend',\n    'num_migrate', 'num_false_positives', 'num_misses', 'num_detections',\n    'num_objects', 'num_predictions', 'num_unique_objects', 'mostly_tracked',\n    'partially_tracked', 'mostly_lost', 'num_fragmentations', 'motp', 'mota',\n    'precision', 'recall', 'idfp', 'idfn', 'idtp', 'idp', 'idr', 'idf1'\n]\n\nNAME_MAP = {\n    'num_frames': 'num_frames',\n    'num_matches': 'num_matches',\n    'num_switches': 'IDs',\n    'num_transfer': 'IDt',\n    'num_ascend': 'IDa',\n    'num_migrate': 'IDm',\n    'num_false_positives': 'FP',\n    'num_misses': 'FN',\n    'num_detections': 'num_detections',\n    'num_objects': 'num_objects',\n    'num_predictions': 'num_predictions',\n    'num_unique_objects': 'GT',\n    'mostly_tracked': 'MT',\n    'partially_tracked': 'partially_tracked',\n    'mostly_lost': 'ML',\n    'num_fragmentations': 'FM',\n    'motp': 'MOTP',\n    'mota': 'MOTA',\n    'precision': 'Prcn',\n    'recall': 'Rcll',\n    'idfp': 'idfp',\n    'idfn': 'idfn',\n    'idtp': 'idtp',\n    'idp': 'IDP',\n    'idr': 'IDR',\n    'idf1': 'IDF1'\n}\n\n\ndef parse_accs_metrics(seq_acc, index_name, verbose=False):\n    \"\"\"\n    Parse the evaluation indicators of multiple MOTAccumulator \n    \"\"\"\n    mh = mm.metrics.create()\n    summary = MCMOTEvaluator.get_summary(seq_acc, index_name, METRICS_LIST)\n    summary.loc['OVERALL', 'motp'] = (summary['motp'] * summary['num_detections']).sum() / \\\n                                     summary.loc['OVERALL', 'num_detections']\n    if verbose:\n        strsummary = mm.io.render_summary(\n            summary, formatters=mh.formatters, namemap=NAME_MAP)\n        print(strsummary)\n\n    return summary\n\n\ndef seqs_overall_metrics(summary_df, verbose=False):\n    \"\"\"\n    Calculate overall metrics for multiple sequences\n    \"\"\"\n    add_col = [\n        'num_frames', 'num_matches', 'num_switches', 'num_transfer',\n        'num_ascend', 'num_migrate', 'num_false_positives', 'num_misses',\n        'num_detections', 'num_objects', 
'num_predictions',\n        'num_unique_objects', 'mostly_tracked', 'partially_tracked',\n        'mostly_lost', 'num_fragmentations', 'idfp', 'idfn', 'idtp'\n    ]\n    calc_col = ['motp', 'mota', 'precision', 'recall', 'idp', 'idr', 'idf1']\n    calc_df = summary_df.copy()\n\n    overall_dic = {}\n    for col in add_col:\n        overall_dic[col] = calc_df[col].sum()\n\n    for col in calc_col:\n        overall_dic[col] = getattr(MCMOTMetricOverall, col + '_overall')(\n            calc_df, overall_dic)\n\n    overall_df = pd.DataFrame(overall_dic, index=['overall_calc'])\n    calc_df = pd.concat([calc_df, overall_df])\n\n    if verbose:\n        mh = mm.metrics.create()\n        str_calc_df = mm.io.render_summary(\n            calc_df, formatters=mh.formatters, namemap=NAME_MAP)\n        print(str_calc_df)\n\n    return calc_df\n\n\nclass MCMOTMetricOverall(object):\n    def motp_overall(summary_df, overall_dic):\n        motp = quiet_divide((summary_df['motp'] *\n                             summary_df['num_detections']).sum(),\n                            overall_dic['num_detections'])\n        return motp\n\n    def mota_overall(summary_df, overall_dic):\n        del summary_df\n        mota = 1. - quiet_divide(\n            (overall_dic['num_misses'] + overall_dic['num_switches'] +\n             overall_dic['num_false_positives']), overall_dic['num_objects'])\n        return mota\n\n    def precision_overall(summary_df, overall_dic):\n        del summary_df\n        precision = quiet_divide(overall_dic['num_detections'], (\n            overall_dic['num_false_positives'] + overall_dic['num_detections']))\n        return precision\n\n    def recall_overall(summary_df, overall_dic):\n        del summary_df\n        recall = quiet_divide(overall_dic['num_detections'],\n                              overall_dic['num_objects'])\n        return recall\n\n    def idp_overall(summary_df, overall_dic):\n        del summary_df\n        idp = quiet_divide(overall_dic['idtp'],\n                           (overall_dic['idtp'] + overall_dic['idfp']))\n        return idp\n\n    def idr_overall(summary_df, overall_dic):\n        del summary_df\n        idr = quiet_divide(overall_dic['idtp'],\n                           (overall_dic['idtp'] + overall_dic['idfn']))\n        return idr\n\n    def idf1_overall(summary_df, overall_dic):\n        del summary_df\n        idf1 = quiet_divide(2. 
* overall_dic['idtp'], (\n            overall_dic['num_objects'] + overall_dic['num_predictions']))\n        return idf1\n\n\ndef read_mcmot_results_union(filename, is_gt, is_ignore):\n    results_dict = dict()\n    if os.path.isfile(filename):\n        all_result = np.loadtxt(filename, delimiter=',')\n        if all_result.shape[0] == 0 or all_result.shape[1] < 7:\n            return results_dict\n        if is_ignore:\n            return results_dict\n        if is_gt:\n            # only for test use\n            all_result = all_result[all_result[:, 7] != 0]\n            all_result[:, 7] = all_result[:, 7] - 1\n\n        if all_result.shape[0] == 0:\n            return results_dict\n\n        class_unique = np.unique(all_result[:, 7])\n\n        last_max_id = 0\n        result_cls_list = []\n        for cls in class_unique:\n            result_cls_split = all_result[all_result[:, 7] == cls]\n            result_cls_split[:, 1] = result_cls_split[:, 1] + last_max_id\n            # make sure track id different between every category\n            last_max_id = max(np.unique(result_cls_split[:, 1])) + 1\n            result_cls_list.append(result_cls_split)\n\n        results_con = np.concatenate(result_cls_list)\n\n        for line in range(len(results_con)):\n            linelist = results_con[line]\n            fid = int(linelist[0])\n            if fid < 1:\n                continue\n            results_dict.setdefault(fid, list())\n\n            if is_gt:\n                score = 1\n            else:\n                score = float(linelist[6])\n\n            tlwh = tuple(map(float, linelist[2:6]))\n            target_id = int(linelist[1])\n            cls = int(linelist[7])\n\n            results_dict[fid].append((tlwh, target_id, cls, score))\n\n        return results_dict\n\n\ndef read_mcmot_results(filename, is_gt, is_ignore):\n    results_dict = dict()\n    if os.path.isfile(filename):\n        with open(filename, 'r') as f:\n            for line in f.readlines():\n                linelist = line.strip().split(',')\n                if len(linelist) < 7:\n                    continue\n                fid = int(linelist[0])\n                if fid < 1:\n                    continue\n                cid = int(linelist[7])\n                if is_gt:\n                    score = 1\n                    # only for test use\n                    cid -= 1\n                else:\n                    score = float(linelist[6])\n\n                cls_result_dict = results_dict.setdefault(cid, dict())\n                cls_result_dict.setdefault(fid, list())\n\n                tlwh = tuple(map(float, linelist[2:6]))\n                target_id = int(linelist[1])\n                cls_result_dict[fid].append((tlwh, target_id, score))\n    return results_dict\n\n\ndef read_results(filename,\n                 data_type,\n                 is_gt=False,\n                 is_ignore=False,\n                 multi_class=False,\n                 union=False):\n    if data_type in ['mcmot', 'lab']:\n        if multi_class:\n            if union:\n                # The results are evaluated by union all the categories.\n                # Track IDs between different categories cannot be duplicate.\n                read_fun = read_mcmot_results_union\n            else:\n                # The results are evaluated separately by category.\n                read_fun = read_mcmot_results\n        else:\n            raise ValueError('multi_class: {}, MCMOT should have cls_id.'.\n                             
format(multi_class))\n    else:\n        raise ValueError('Unknown data type: {}'.format(data_type))\n\n    return read_fun(filename, is_gt, is_ignore)\n\n\ndef unzip_objs(objs):\n    if len(objs) > 0:\n        tlwhs, ids, scores = zip(*objs)\n    else:\n        tlwhs, ids, scores = [], [], []\n    tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4)\n    return tlwhs, ids, scores\n\n\ndef unzip_objs_cls(objs):\n    if len(objs) > 0:\n        tlwhs, ids, cls, scores = zip(*objs)\n    else:\n        tlwhs, ids, cls, scores = [], [], [], []\n    tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4)\n    ids = np.array(ids)\n    cls = np.array(cls)\n    scores = np.array(scores)\n    return tlwhs, ids, cls, scores\n\n\nclass MCMOTEvaluator(object):\n    def __init__(self, data_root, seq_name, data_type, num_classes):\n        self.data_root = data_root\n        self.seq_name = seq_name\n        self.data_type = data_type\n        self.num_classes = num_classes\n\n        self.load_annotations()\n        try:\n            import motmetrics as mm\n            mm.lap.default_solver = 'lap'\n        except Exception:\n            raise RuntimeError(\n                'Unable to use MCMOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics'\n            )\n        self.reset_accumulator()\n\n        self.class_accs = []\n\n    def load_annotations(self):\n        assert self.data_type == 'mcmot'\n        self.gt_filename = os.path.join(self.data_root, '../', 'sequences',\n                                        '{}.txt'.format(self.seq_name))\n        if not os.path.exists(self.gt_filename):\n            logger.warning(\n                \"gt_filename '{}' of MCMOTEvaluator does not exist, so the MOTA will be -INF.\".\n                format(self.gt_filename))\n\n    def reset_accumulator(self):\n        self.acc = mm.MOTAccumulator(auto_id=True)\n\n    def eval_frame_dict(self, trk_objs, gt_objs, rtn_events=False, union=False):\n        if union:\n            trk_tlwhs, trk_ids, trk_cls = unzip_objs_cls(trk_objs)[:3]\n            gt_tlwhs, gt_ids, gt_cls = unzip_objs_cls(gt_objs)[:3]\n\n            # get distance matrix\n            iou_distance = mm.distances.iou_matrix(\n                gt_tlwhs, trk_tlwhs, max_iou=0.5)\n\n            # Set the distance between objects of different categories to nan\n            gt_cls_len = len(gt_cls)\n            trk_cls_len = len(trk_cls)\n            # When the number of GT or Trk is 0, iou_distance dimension is (0,0)\n            if gt_cls_len != 0 and trk_cls_len != 0:\n                gt_cls = gt_cls.reshape(gt_cls_len, 1)\n                gt_cls = np.repeat(gt_cls, trk_cls_len, axis=1)\n                trk_cls = trk_cls.reshape(1, trk_cls_len)\n                trk_cls = np.repeat(trk_cls, gt_cls_len, axis=0)\n                iou_distance = np.where(gt_cls == trk_cls, iou_distance, np.nan)\n\n        else:\n            trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2]\n            gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2]\n\n            # get distance matrix\n            iou_distance = mm.distances.iou_matrix(\n                gt_tlwhs, trk_tlwhs, max_iou=0.5)\n\n        self.acc.update(gt_ids, trk_ids, iou_distance)\n\n        if rtn_events and iou_distance.size > 0 and hasattr(self.acc,\n                                                            'mot_events'):\n            events = self.acc.mot_events  # only supported by https://github.com/longcw/py-motmetrics\n        else:\n            events = None\n        
\n    def eval_file(self, result_filename):\n        # evaluate each category separately\n        gt_frame_dict = read_results(\n            self.gt_filename,\n            self.data_type,\n            is_gt=True,\n            multi_class=True,\n            union=False)\n        result_frame_dict = read_results(\n            result_filename,\n            self.data_type,\n            is_gt=False,\n            multi_class=True,\n            union=False)\n\n
        for cid in range(self.num_classes):\n            self.reset_accumulator()\n            cls_result_frame_dict = result_frame_dict.setdefault(cid, dict())\n            cls_gt_frame_dict = gt_frame_dict.setdefault(cid, dict())\n\n            # only labeled frames will be evaluated\n            frames = sorted(list(set(cls_gt_frame_dict.keys())))\n\n
            for frame_id in frames:\n                trk_objs = cls_result_frame_dict.get(frame_id, [])\n                gt_objs = cls_gt_frame_dict.get(frame_id, [])\n                self.eval_frame_dict(trk_objs, gt_objs, rtn_events=False)\n\n            self.class_accs.append(self.acc)\n\n        return self.class_accs\n\n
    @staticmethod\n    def get_summary(accs,\n                    names,\n                    metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1',\n                             'precision', 'recall')):\n        names = copy.deepcopy(names)\n        if metrics is None:\n            metrics = mm.metrics.motchallenge_metrics\n        metrics = copy.deepcopy(metrics)\n\n        mh = mm.metrics.create()\n        summary = mh.compute_many(\n            accs, metrics=metrics, names=names, generate_overall=True)\n\n        return summary\n\n
    @staticmethod\n    def save_summary(summary, filename):\n        import pandas as pd\n        # a context manager flushes and closes the workbook; ExcelWriter.save()\n        # was removed in pandas 2.0\n        with pd.ExcelWriter(filename) as writer:\n            summary.to_excel(writer)\n
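\n\n# Typical driving code for the evaluator above (paths hypothetical):\n#\n#   >>> evaluator = MCMOTEvaluator('dataset/mcmot', 'seq1', 'mcmot', num_classes=2)\n#   >>> accs = evaluator.eval_file('output/mcmot_results/seq1.txt')\n#   >>> MCMOTEvaluator.get_summary(accs, ['seq1_0', 'seq1_1'])\n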
\n\nclass MCMOTMetric(Metric):\n    def __init__(self, num_classes, save_summary=False):\n        self.num_classes = num_classes\n        self.save_summary = save_summary\n        self.MCMOTEvaluator = MCMOTEvaluator\n        self.result_root = None\n        self.reset()\n\n        self.seqs_overall = defaultdict(list)\n\n
    def reset(self):\n        self.accs = []\n        self.seqs = []\n\n
    def update(self, data_root, seq, data_type, result_root, result_filename):\n        evaluator = self.MCMOTEvaluator(data_root, seq, data_type,\n                                        self.num_classes)\n        seq_acc = evaluator.eval_file(result_filename)\n        self.accs.append(seq_acc)\n        self.seqs.append(seq)\n        self.result_root = result_root\n\n
        cls_index_name = [\n            '{}_{}'.format(seq, i) for i in range(self.num_classes)\n        ]\n        summary = parse_accs_metrics(seq_acc, cls_index_name)\n        summary.rename(\n            index={'OVERALL': '{}_OVERALL'.format(seq)}, inplace=True)\n        for row in range(len(summary)):\n            self.seqs_overall[row].append(summary.iloc[row:row + 1])\n\n
    def accumulate(self):\n        self.cls_summary_list = []\n        for row in range(self.num_classes):\n            seqs_cls_df = pd.concat(self.seqs_overall[row])\n            seqs_cls_summary = seqs_overall_metrics(seqs_cls_df)\n            cls_summary_overall = seqs_cls_summary.iloc[-1:].copy()\n            cls_summary_overall.rename(\n                index={'overall_calc': 'overall_calc_{}'.format(row)},\n                inplace=True)\n            self.cls_summary_list.append(cls_summary_overall)\n\n
    def log(self):\n        # seqs_overall_metrics prints the tables when verbose=True; the\n        # returned frames are not used here\n        seqs_summary = seqs_overall_metrics(\n            pd.concat(self.seqs_overall[self.num_classes]), verbose=True)\n        class_summary = seqs_overall_metrics(\n            pd.concat(self.cls_summary_list), verbose=True)\n\n    def get_results(self):\n        return 1\n"
  },
  {
    "path": "ppdet/metrics/metrics.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\nimport json\nimport paddle\nimport numpy as np\nimport typing\nfrom collections import defaultdict\nfrom pathlib import Path\n\nfrom .map_utils import prune_zero_padding, DetectionMAP\nfrom .coco_utils import get_infer_results, cocoapi_eval\nfrom .lvis_utils import lvisapi_eval\nfrom .widerface_utils import (face_eval_run, image_eval, img_pr_info,\n                              dataset_pr_info, voc_ap)\nfrom ppdet.data.source.category import get_categories\nfrom ppdet.modeling.rbox_utils import poly2rbox_np\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n__all__ = [\n    'Metric', 'COCOMetric', 'VOCMetric', 'WiderFaceMetric', 'get_infer_results',\n    'RBoxMetric', 'SNIPERCOCOMetric', 'LVISMetric'\n]\n\nCOCO_SIGMAS = np.array([\n    .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87,\n    .89, .89\n]) / 10.0\nCROWD_SIGMAS = np.array(\n    [.79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89, .79,\n     .79]) / 10.0\n\n\nclass Metric(paddle.metric.Metric):\n    def name(self):\n        return self.__class__.__name__\n\n    def reset(self):\n        pass\n\n    def accumulate(self):\n        pass\n\n    # paddle.metric.Metric defined :metch:`update`, :meth:`accumulate`\n    # :metch:`reset`, in ppdet, we also need following 2 methods:\n\n    # abstract method for logging metric results\n    def log(self):\n        pass\n\n    # abstract method for getting metric results\n    def get_results(self):\n        pass\n\n\nclass COCOMetric(Metric):\n    def __init__(self, anno_file, **kwargs):\n        self.anno_file = anno_file\n        self.clsid2catid = kwargs.get('clsid2catid', None)\n        if self.clsid2catid is None:\n            self.clsid2catid, _ = get_categories('COCO', anno_file)\n        self.classwise = kwargs.get('classwise', False)\n        self.output_eval = kwargs.get('output_eval', None)\n        # TODO: bias should be unified\n        self.bias = kwargs.get('bias', 0)\n        self.save_prediction_only = kwargs.get('save_prediction_only', False)\n        self.iou_type = kwargs.get('IouType', 'bbox')\n\n        if not self.save_prediction_only:\n            assert os.path.isfile(anno_file), \\\n                    \"anno_file {} not a file\".format(anno_file)\n\n        if self.output_eval is not None:\n            Path(self.output_eval).mkdir(exist_ok=True)\n\n        self.save_threshold = kwargs.get('save_threshold', 0)\n\n        self.reset()\n\n    def reset(self):\n        # only bbox and mask evaluation support currently\n        self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []}\n        self.eval_results = {}\n\n    def update(self, inputs, outputs):\n        outs = {}\n        # outputs Tensor -> numpy.ndarray\n        
\n\nclass COCOMetric(Metric):\n    def __init__(self, anno_file, **kwargs):\n        self.anno_file = anno_file\n        self.clsid2catid = kwargs.get('clsid2catid', None)\n        if self.clsid2catid is None:\n            self.clsid2catid, _ = get_categories('COCO', anno_file)\n        self.classwise = kwargs.get('classwise', False)\n        self.output_eval = kwargs.get('output_eval', None)\n        # TODO: bias should be unified\n        self.bias = kwargs.get('bias', 0)\n        self.save_prediction_only = kwargs.get('save_prediction_only', False)\n        self.iou_type = kwargs.get('IouType', 'bbox')\n\n
        if not self.save_prediction_only:\n            assert os.path.isfile(anno_file), \\\n                    \"anno_file {} not a file\".format(anno_file)\n\n        if self.output_eval is not None:\n            Path(self.output_eval).mkdir(exist_ok=True)\n\n        self.save_threshold = kwargs.get('save_threshold', 0)\n\n        self.reset()\n\n
    def reset(self):\n        # only bbox and mask evaluation are supported currently\n        self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []}\n        self.eval_results = {}\n\n
    def update(self, inputs, outputs):\n        outs = {}\n        # outputs Tensor -> numpy.ndarray\n        for k, v in outputs.items():\n            outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v\n\n
        # multi-scale inputs: all inputs have same im_id\n        if isinstance(inputs, typing.Sequence):\n            im_id = inputs[0]['im_id']\n        else:\n            im_id = inputs['im_id']\n        outs['im_id'] = im_id.numpy() if isinstance(im_id,\n                                                    paddle.Tensor) else im_id\n        if 'im_file' in inputs:\n            outs['im_file'] = inputs['im_file']\n\n
        infer_results = get_infer_results(\n            outs,\n            self.clsid2catid,\n            bias=self.bias,\n            save_threshold=self.save_threshold)\n        self.results['bbox'] += infer_results[\n            'bbox'] if 'bbox' in infer_results else []\n        self.results['mask'] += infer_results[\n            'mask'] if 'mask' in infer_results else []\n        self.results['segm'] += infer_results[\n            'segm'] if 'segm' in infer_results else []\n        self.results['keypoint'] += infer_results[\n            'keypoint'] if 'keypoint' in infer_results else []\n\n
    def accumulate(self):\n        if len(self.results['bbox']) > 0:\n            output = \"bbox.json\"\n            if self.output_eval:\n                output = os.path.join(self.output_eval, output)\n            with open(output, 'w') as f:\n                json.dump(self.results['bbox'], f)\n                logger.info('The bbox result is saved to {}.'.format(output))\n\n
            if self.save_prediction_only:\n                logger.info('The bbox result is saved to {}; mAP is not '\n                            'evaluated.'.format(output))\n            else:\n                bbox_stats = cocoapi_eval(\n                    output,\n                    'bbox',\n                    anno_file=self.anno_file,\n                    classwise=self.classwise)\n                self.eval_results['bbox'] = bbox_stats\n                sys.stdout.flush()\n\n
        if len(self.results['mask']) > 0:\n            output = \"mask.json\"\n            if self.output_eval:\n                output = os.path.join(self.output_eval, output)\n            with open(output, 'w') as f:\n                json.dump(self.results['mask'], f)\n                logger.info('The mask result is saved to {}.'.format(output))\n\n
            if self.save_prediction_only:\n                logger.info('The mask result is saved to {}; mAP is not '\n                            'evaluated.'.format(output))\n            else:\n                seg_stats = cocoapi_eval(\n                    output,\n                    'segm',\n                    anno_file=self.anno_file,\n                    classwise=self.classwise)\n                self.eval_results['mask'] = seg_stats\n                sys.stdout.flush()\n\n
        if len(self.results['segm']) > 0:\n            output = \"segm.json\"\n            if self.output_eval:\n                output = os.path.join(self.output_eval, output)\n            with open(output, 'w') as f:\n                json.dump(self.results['segm'], f)\n                logger.info('The segm result is saved to {}.'.format(output))\n\n
            if self.save_prediction_only:\n                logger.info('The segm result is saved to {}; mAP is not '\n                            'evaluated.'.format(output))\n            else:\n                seg_stats = cocoapi_eval(\n                    output,\n                    'segm',\n                    anno_file=self.anno_file,\n                    classwise=self.classwise)\n                # store under 'segm' so mask results are not overwritten\n                self.eval_results['segm'] = seg_stats\n                sys.stdout.flush()\n\n
        if len(self.results['keypoint']) > 0:\n            output = \"keypoint.json\"\n            if self.output_eval:\n                output = os.path.join(self.output_eval, output)\n            with open(output, 'w') as f:\n                json.dump(self.results['keypoint'], f)\n                logger.info('The keypoint result is saved to {}.'.format(output))\n\n
            if self.save_prediction_only:\n                logger.info('The keypoint result is saved to {}; mAP is not '\n                            'evaluated.'.format(output))\n            else:\n                style = 'keypoints'\n                use_area = True\n                sigmas = COCO_SIGMAS\n                if self.iou_type == 'keypoints_crowd':\n                    style = 'keypoints_crowd'\n                    use_area = False\n                    sigmas = CROWD_SIGMAS\n                keypoint_stats = cocoapi_eval(\n                    output,\n                    style,\n                    anno_file=self.anno_file,\n                    classwise=self.classwise,\n                    sigmas=sigmas,\n                    use_area=use_area)\n                self.eval_results['keypoint'] = keypoint_stats\n                sys.stdout.flush()\n\n
    def log(self):\n        pass\n\n    def get_results(self):\n        return self.eval_results\n
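\n\n# Typical standalone usage of the metric above (path hypothetical):\n#\n#   >>> metric = COCOMetric(anno_file='annotations/instances_val2017.json')\n#   >>> metric.update(inputs, outputs)  # once per evaluated batch\n#   >>> metric.accumulate()\n#   >>> metric.get_results()['bbox']\n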
\n\nclass LVISMetric(Metric):\n    def __init__(self, anno_file, **kwargs):\n        self.anno_file = anno_file\n        self.clsid2catid = kwargs.get('clsid2catid', None)\n        if self.clsid2catid is None:\n            self.clsid2catid, _ = get_categories('COCO', anno_file)\n        self.classwise = kwargs.get('classwise', False)\n        self.output_eval = kwargs.get('output_eval', None)\n        # TODO: bias should be unified\n        self.bias = kwargs.get('bias', 0)\n        self.save_prediction_only = kwargs.get('save_prediction_only', False)\n        self.iou_type = kwargs.get('IouType', 'bbox')\n\n
        if not self.save_prediction_only:\n            assert os.path.isfile(anno_file), \\\n                    \"anno_file {} not a file\".format(anno_file)\n\n        if self.output_eval is not None:\n            Path(self.output_eval).mkdir(exist_ok=True)\n\n        self.reset()\n\n
    def reset(self):\n        # only bbox and mask evaluation are supported currently\n        self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []}\n        self.eval_results = {}\n\n
    def update(self, inputs, outputs):\n        outs = {}\n        # outputs Tensor -> numpy.ndarray\n        for k, v in outputs.items():\n            outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v\n\n
        # multi-scale inputs: all inputs have same im_id\n        if isinstance(inputs, typing.Sequence):\n            im_id = inputs[0]['im_id']\n        else:\n            im_id = inputs['im_id']\n        outs['im_id'] = im_id.numpy() if isinstance(im_id, paddle.Tensor) else im_id\n\n
        infer_results = get_infer_results(\n            outs, self.clsid2catid, bias=self.bias)\n        self.results['bbox'] += infer_results[\n            'bbox'] if 'bbox' in infer_results else []\n        self.results['mask'] += infer_results[\n            'mask'] if 'mask' in infer_results else []\n        self.results['segm'] += infer_results[\n            'segm'] if 'segm' in infer_results else []\n        self.results['keypoint'] += infer_results[\n            'keypoint'] if 'keypoint' in infer_results else []\n\n
    def accumulate(self):\n        if len(self.results['bbox']) > 0:\n            output = \"bbox.json\"\n            if self.output_eval:\n                output = os.path.join(self.output_eval, output)\n            with open(output, 'w') as f:\n                json.dump(self.results['bbox'], f)\n                logger.info('The bbox result is saved to {}.'.format(output))\n\n
            if self.save_prediction_only:\n                logger.info('The bbox result is saved to {}; mAP is not '\n                            'evaluated.'.format(output))\n            else:\n                bbox_stats = lvisapi_eval(\n                    output,\n                    'bbox',\n                    anno_file=self.anno_file,\n                    classwise=self.classwise)\n                self.eval_results['bbox'] = bbox_stats\n                sys.stdout.flush()\n\n
        if len(self.results['mask']) > 0:\n            output = \"mask.json\"\n            if self.output_eval:\n                output = os.path.join(self.output_eval, output)\n            with open(output, 'w') as f:\n                json.dump(self.results['mask'], f)\n                logger.info('The mask result is saved to {}.'.format(output))\n\n
            if self.save_prediction_only:\n                logger.info('The mask result is saved to {}; mAP is not '\n                            'evaluated.'.format(output))\n            else:\n                # NOTE: assume masks are evaluated with the LVIS API as well;\n                # the COCO API cannot load LVIS-format annotations\n                seg_stats = lvisapi_eval(\n                    output,\n                    'segm',\n                    anno_file=self.anno_file,\n                    classwise=self.classwise)\n                self.eval_results['mask'] = seg_stats\n                sys.stdout.flush()\n\n
        if len(self.results['segm']) > 0:\n            output = \"segm.json\"\n            if self.output_eval:\n                output = os.path.join(self.output_eval, output)\n            with open(output, 'w') as f:\n                json.dump(self.results['segm'], f)\n                logger.info('The segm result is saved to {}.'.format(output))\n\n
            if self.save_prediction_only:\n                logger.info('The segm result is saved to {}; mAP is not '\n                            'evaluated.'.format(output))\n            else:\n                seg_stats = lvisapi_eval(\n                    output,\n                    'segm',\n                    anno_file=self.anno_file,\n                    classwise=self.classwise)\n                # store under 'segm' so mask results are not overwritten\n                self.eval_results['segm'] = seg_stats\n                sys.stdout.flush()\n\n
        if len(self.results['keypoint']) > 0:\n            output = \"keypoint.json\"\n            if self.output_eval:\n                output = os.path.join(self.output_eval, output)\n            with open(output, 'w') as f:\n                json.dump(self.results['keypoint'], f)\n                logger.info('The keypoint result is saved to {}.'.format(output))\n\n
            if self.save_prediction_only:\n                logger.info('The keypoint result is saved to {}; mAP is not '\n                            'evaluated.'.format(output))\n            else:\n                style = 'keypoints'\n                use_area = True\n                sigmas = COCO_SIGMAS\n                if self.iou_type == 'keypoints_crowd':\n                    style = 'keypoints_crowd'\n                    use_area = False\n                    sigmas = CROWD_SIGMAS\n                keypoint_stats = cocoapi_eval(\n                    output,\n                    style,\n                    anno_file=self.anno_file,\n                    classwise=self.classwise,\n                    sigmas=sigmas,\n                    use_area=use_area)\n                self.eval_results['keypoint'] = keypoint_stats\n                sys.stdout.flush()\n\n
    def log(self):\n        logger.info(self.eval_results['bbox'])\n\n    def get_results(self):\n        return self.eval_results\n
\n\nclass VOCMetric(Metric):\n    def __init__(self,\n                 label_list,\n                 class_num=20,\n                 overlap_thresh=0.5,\n                 map_type='11point',\n                 is_bbox_normalized=False,\n                 evaluate_difficult=False,\n                 classwise=False,\n                 output_eval=None,\n                 save_prediction_only=False):\n        assert os.path.isfile(label_list), \\\n                \"label_list {} not a file\".format(label_list)\n        self.clsid2catid, self.catid2name = get_categories('VOC', label_list)\n\n
        self.overlap_thresh = overlap_thresh\n        self.map_type = map_type\n        self.evaluate_difficult = evaluate_difficult\n        self.output_eval = output_eval\n        self.save_prediction_only = save_prediction_only\n        self.detection_map = DetectionMAP(\n            class_num=class_num,\n            overlap_thresh=overlap_thresh,\n            map_type=map_type,\n            is_bbox_normalized=is_bbox_normalized,\n            evaluate_difficult=evaluate_difficult,\n            catid2name=self.catid2name,\n            classwise=classwise)\n\n        self.reset()\n\n
    def reset(self):\n        self.results = {'bbox': [], 'score': [], 'label': []}\n        self.detection_map.reset()\n\n
    def update(self, inputs, outputs):\n        bbox_np = outputs['bbox'].numpy() if isinstance(\n            outputs['bbox'], paddle.Tensor) else outputs['bbox']\n        bboxes = bbox_np[:, 2:]\n        scores = bbox_np[:, 1]\n        labels = bbox_np[:, 0]\n        bbox_lengths = outputs['bbox_num'].numpy() if isinstance(\n            outputs['bbox_num'], paddle.Tensor) else outputs['bbox_num']\n\n
        self.results['bbox'].append(bboxes.tolist())\n        self.results['score'].append(scores.tolist())\n        self.results['label'].append(labels.tolist())\n\n        # check None first to avoid attribute access on None\n        if bboxes is None or bboxes.shape == (1, 1):\n            return\n        if self.save_prediction_only:\n            return\n\n
        gt_boxes = inputs['gt_bbox']\n        gt_labels = inputs['gt_class']\n        difficults = inputs['difficult'] if not self.evaluate_difficult \\\n                            else None\n\n
        if 'scale_factor' in inputs:\n            scale_factor = inputs['scale_factor'].numpy() if isinstance(\n                inputs['scale_factor'],\n                paddle.Tensor) else inputs['scale_factor']\n        else:\n            scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32')\n\n
        bbox_idx = 0\n        for i in range(len(gt_boxes)):\n            gt_box = gt_boxes[i].numpy() if isinstance(\n                gt_boxes[i], paddle.Tensor) else gt_boxes[i]\n            h, w = scale_factor[i]\n            gt_box = gt_box / np.array([w, h, w, h])\n            gt_label = gt_labels[i].numpy() if isinstance(\n                gt_labels[i], paddle.Tensor) else gt_labels[i]\n            if difficults is not None:\n                difficult = difficults[i].numpy() if isinstance(\n                    difficults[i], paddle.Tensor) else difficults[i]\n            else:\n                difficult = None\n            bbox_num = bbox_lengths[i]\n            bbox = bboxes[bbox_idx:bbox_idx + bbox_num]\n            score = scores[bbox_idx:bbox_idx + bbox_num]\n            label = labels[bbox_idx:bbox_idx + bbox_num]\n            gt_box, gt_label, difficult = prune_zero_padding(gt_box, gt_label,\n                                                             difficult)\n            self.detection_map.update(bbox, score, label, gt_box, gt_label,\n                                      difficult)\n            bbox_idx += bbox_num\n\n
    def accumulate(self):\n        output = \"bbox.json\"\n        if self.output_eval:\n            output = os.path.join(self.output_eval, output)\n            with open(output, 'w') as f:\n                json.dump(self.results, f)\n                logger.info('The bbox result is saved to {}.'.format(output))\n        if self.save_prediction_only:\n            return\n\n        logger.info(\"Accumulating evaluation results...\")\n        self.detection_map.accumulate()\n\n
    def log(self):\n        map_stat = 100. * self.detection_map.get_map()\n        logger.info(\"mAP({:.2f}, {}) = {:.2f}%\".format(self.overlap_thresh,\n                                                       self.map_type, map_stat))\n\n    def get_results(self):\n        return {'bbox': [self.detection_map.get_map()]}\n
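\n\n# map_type='11point' averages the best precision at recalls 0.0, 0.1, ..., 1.0,\n# while 'integral' integrates the whole precision-recall curve. A toy 11-point\n# computation (values hypothetical):\n#\n#   >>> import numpy as np\n#   >>> rec = np.array([0.2, 0.6, 1.0])\n#   >>> prec = np.array([1.0, 0.8, 0.5])\n#   >>> np.mean([prec[rec >= t].max() if (rec >= t).any() else 0.\n#   ...          for t in np.linspace(0., 1., 11)])\n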
\n\nclass WiderFaceMetric(Metric):\n    def __init__(self, iou_thresh=0.5):\n        self.iou_thresh = iou_thresh\n        self.reset()\n\n
    def reset(self):\n        self.pred_boxes_list = []\n        self.gt_boxes_list = []\n        self.aps = []\n\n        self.hard_ignore_list = []\n        self.medium_ignore_list = []\n        self.easy_ignore_list = []\n\n
    def update(self, data, outs):\n        batch_pred_bboxes = outs['bbox']\n        batch_pred_bboxes_num = outs['bbox_num']\n        assert len(batch_pred_bboxes_num) == len(data['gt_bbox'])\n        batch_size = len(data['gt_bbox'])\n        box_cnt = 0\n        for batch_id in range(batch_size):\n            pred_bboxes_num = batch_pred_bboxes_num[batch_id]\n            pred_bboxes = batch_pred_bboxes[box_cnt:box_cnt +\n                                            pred_bboxes_num].numpy()\n            box_cnt += pred_bboxes_num\n\n
            det_conf = pred_bboxes[:, 1]\n            det_xmin = pred_bboxes[:, 2]\n            det_ymin = pred_bboxes[:, 3]\n            det_xmax = pred_bboxes[:, 4]\n            det_ymax = pred_bboxes[:, 5]\n            det = np.column_stack((det_xmin, det_ymin, det_xmax,\n                                   det_ymax, det_conf))\n            self.pred_boxes_list.append(det)  # xyxy conf\n            self.gt_boxes_list.append(data['gt_ori_bbox'][batch_id].numpy())  # xywh\n            self.hard_ignore_list.append(\n                data['gt_hard_ignore'][batch_id].numpy())\n            self.medium_ignore_list.append(\n                data['gt_medium_ignore'][batch_id].numpy())\n            self.easy_ignore_list.append(\n                data['gt_easy_ignore'][batch_id].numpy())\n\n
    def accumulate(self):\n        total_num = len(self.gt_boxes_list)\n        settings = ['easy', 'medium', 'hard']\n        setting_ignores = [self.easy_ignore_list,\n                           self.medium_ignore_list,\n                           self.hard_ignore_list]\n        thresh_num = 1000\n        aps = []\n        for setting_id in range(3):\n            count_face = 0\n            pr_curve = np.zeros((thresh_num, 2)).astype(np.float32)\n            gt_ignore_list = setting_ignores[setting_id]\n            for i in range(total_num):\n                pred_boxes = self.pred_boxes_list[i]  # xyxy conf\n                gt_boxes = self.gt_boxes_list[i]  # xywh\n                ignore = gt_ignore_list[i]\n                count_face += np.sum(ignore)\n\n
                if len(gt_boxes) == 0 or len(pred_boxes) == 0:\n                    continue\n                pred_recall, proposal_list = image_eval(pred_boxes, gt_boxes,\n                                                        ignore, self.iou_thresh)\n                _img_pr_info = img_pr_info(thresh_num, pred_boxes,\n                                           proposal_list, pred_recall)\n                pr_curve += _img_pr_info\n            pr_curve = dataset_pr_info(thresh_num, pr_curve, count_face)\n\n
            propose = pr_curve[:, 0]\n            recall = pr_curve[:, 1]\n\n            ap = voc_ap(recall, propose)\n            aps.append(ap)\n        self.aps = aps\n\n
    def log(self):\n        logger.info(\"==================== Results ====================\")\n        logger.info(\"Easy   Val AP: {}\".format(self.aps[0]))\n        logger.info(\"Medium Val AP: {}\".format(self.aps[1]))\n        logger.info(\"Hard   Val AP: {}\".format(self.aps[2]))\n        logger.info(\"=================================================\")\n\n
    def get_results(self):\n        return {\n            'easy_ap': self.aps[0],\n            'medium_ap': self.aps[1],\n            'hard_ap': self.aps[2]}\n
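\n\n# Evaluation sketch for the metric above: per-image PR statistics are summed\n# into a 1000-threshold pr_curve for each difficulty setting (easy/medium/hard),\n# normalized by the number of faces kept for that setting, and reduced to a\n# single AP with voc_ap.\n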
\n\nclass RBoxMetric(Metric):\n    def __init__(self, anno_file, **kwargs):\n        self.anno_file = anno_file\n        self.clsid2catid, self.catid2name = get_categories('RBOX', anno_file)\n        self.catid2clsid = {v: k for k, v in self.clsid2catid.items()}\n        self.classwise = kwargs.get('classwise', False)\n        self.output_eval = kwargs.get('output_eval', None)\n        self.save_prediction_only = kwargs.get('save_prediction_only', False)\n        self.overlap_thresh = kwargs.get('overlap_thresh', 0.5)\n        self.map_type = kwargs.get('map_type', '11point')\n        self.evaluate_difficult = kwargs.get('evaluate_difficult', False)\n        self.imid2path = kwargs.get('imid2path', None)\n        class_num = len(self.catid2name)\n        self.detection_map = DetectionMAP(\n            class_num=class_num,\n            overlap_thresh=self.overlap_thresh,\n            map_type=self.map_type,\n            is_bbox_normalized=False,\n            evaluate_difficult=self.evaluate_difficult,\n            catid2name=self.catid2name,\n            classwise=self.classwise)\n\n        self.reset()\n\n
    def reset(self):\n        self.results = []\n        self.detection_map.reset()\n\n
    def update(self, inputs, outputs):\n        outs = {}\n        # outputs Tensor -> numpy.ndarray\n        for k, v in outputs.items():\n            outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v\n\n        im_id = inputs['im_id']\n        im_id = im_id.numpy() if isinstance(im_id, paddle.Tensor) else im_id\n        outs['im_id'] = im_id\n\n
        infer_results = get_infer_results(outs, self.clsid2catid)\n        infer_results = infer_results['bbox'] if 'bbox' in infer_results else []\n        self.results += infer_results\n        if self.save_prediction_only:\n            return\n\n
        gt_boxes = inputs['gt_poly']\n        gt_labels = inputs['gt_class']\n\n        if 'scale_factor' in inputs:\n            scale_factor = inputs['scale_factor'].numpy() if isinstance(\n                inputs['scale_factor'],\n                paddle.Tensor) else inputs['scale_factor']\n        else:\n            scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32')\n\n
        for i in range(len(gt_boxes)):\n            gt_box = gt_boxes[i].numpy() if isinstance(\n                gt_boxes[i], paddle.Tensor) else gt_boxes[i]\n            h, w = scale_factor[i]\n            gt_box = gt_box / np.array([w, h, w, h, w, h, w, h])\n            gt_label = gt_labels[i].numpy() if isinstance(\n                gt_labels[i], paddle.Tensor) else gt_labels[i]\n            gt_box, gt_label, _ = prune_zero_padding(gt_box, gt_label)\n            bbox = [\n                res['bbox'] for res in infer_results\n                if int(res['image_id']) == int(im_id[i])\n            ]\n            score = [\n                res['score'] for res in infer_results\n                if int(res['image_id']) == int(im_id[i])\n            ]\n            label = [\n                self.catid2clsid[int(res['category_id'])]\n                for res in infer_results\n                if int(res['image_id']) == int(im_id[i])\n            ]\n            self.detection_map.update(bbox, score, label, gt_box, gt_label)\n
\n    def save_results(self, results, output_dir, imid2path):\n        if imid2path:\n            data_dicts = defaultdict(list)\n            for result in results:\n                image_id = result['image_id']\n                data_dicts[image_id].append(result)\n\n
            for image_id, image_path in imid2path.items():\n                basename = os.path.splitext(os.path.split(image_path)[-1])[0]\n                output = os.path.join(output_dir, \"{}.txt\".format(basename))\n                dets = data_dicts.get(image_id, [])\n                with open(output, 'w') as f:\n                    for det in dets:\n                        catid, bbox, score = det['category_id'], det[\n                            'bbox'], det['score']\n                        bbox_pred = '{} {} '.format(self.catid2name[catid],\n                                                    score) + ' '.join(\n                                                        [str(e) for e in bbox])\n                        f.write(bbox_pred + '\\n')\n\n
            logger.info('The bbox result is saved to {}.'.format(output_dir))\n        else:\n            output = os.path.join(output_dir, \"bbox.json\")\n            with open(output, 'w') as f:\n                json.dump(results, f)\n\n            logger.info('The bbox result is saved to {}.'.format(output))\n\n
    def accumulate(self):\n        if self.output_eval:\n            self.save_results(self.results, self.output_eval, self.imid2path)\n\n        if not self.save_prediction_only:\n            logger.info(\"Accumulating evaluation results...\")\n            self.detection_map.accumulate()\n\n
    def log(self):\n        map_stat = 100. * self.detection_map.get_map()\n        logger.info(\"mAP({:.2f}, {}) = {:.2f}%\".format(self.overlap_thresh,\n                                                       self.map_type, map_stat))\n\n    def get_results(self):\n        return {'bbox': [self.detection_map.get_map()]}\n
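\n\n# With imid2path set, save_results writes one txt per image; each line is\n# '<class_name> <score> <x1> <y1> ... <x4> <y4>', e.g. (values hypothetical):\n#\n#   plane 0.9243 110.0 35.2 180.4 36.1 181.0 88.7 111.2 87.9\n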
\n\nclass SNIPERCOCOMetric(COCOMetric):\n    def __init__(self, anno_file, **kwargs):\n        super(SNIPERCOCOMetric, self).__init__(anno_file, **kwargs)\n        self.dataset = kwargs[\"dataset\"]\n        self.chip_results = []\n\n
    def reset(self):\n        # only bbox and mask evaluation are supported currently\n        self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []}\n        self.eval_results = {}\n        self.chip_results = []\n\n
    def update(self, inputs, outputs):\n        outs = {}\n        # outputs Tensor -> numpy.ndarray\n        for k, v in outputs.items():\n            outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v\n\n        im_id = inputs['im_id']\n        outs['im_id'] = im_id.numpy() if isinstance(im_id,\n                                                    paddle.Tensor) else im_id\n\n        self.chip_results.append(outs)\n
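\n    # update() only buffers raw per-chip outputs; accumulate() below merges\n    # the chips of each image back together through the dataset's anno_cropper\n    # before running the regular COCO evaluation.\n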
\n    def accumulate(self):\n        results = self.dataset.anno_cropper.aggregate_chips_detections(\n            self.chip_results)\n        for outs in results:\n            infer_results = get_infer_results(\n                outs, self.clsid2catid, bias=self.bias)\n            self.results['bbox'] += infer_results[\n                'bbox'] if 'bbox' in infer_results else []\n\n        super(SNIPERCOCOMetric, self).accumulate()\n"
  },
  {
    "path": "ppdet/metrics/mot_metrics.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport copy\nimport sys\nimport math\nfrom collections import defaultdict\nimport numpy as np\n\nfrom ppdet.modeling.bbox_utils import bbox_iou_np_expand\nfrom .map_utils import ap_per_class\nfrom .metrics import Metric\nfrom .munkres import Munkres\n\ntry:\n    import motmetrics as mm\n    mm.lap.default_solver = 'lap'\nexcept:\n    print(\n        'Warning: Unable to use MOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics'\n    )\n    pass\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n__all__ = ['MOTEvaluator', 'MOTMetric', 'JDEDetMetric', 'KITTIMOTMetric']\n\n\ndef read_mot_results(filename, is_gt=False, is_ignore=False):\n    valid_label = [1]\n    ignore_labels = [2, 7, 8, 12]  # only in motchallenge datasets like 'MOT16'\n    if is_gt:\n        logger.info(\n            \"In MOT16/17 dataset the valid_label of ground truth is '{}', \"\n            \"in other dataset it should be '0' for single classs MOT.\".format(\n                valid_label[0]))\n    results_dict = dict()\n    if os.path.isfile(filename):\n        with open(filename, 'r') as f:\n            for line in f.readlines():\n                linelist = line.split(',')\n                if len(linelist) < 7:\n                    continue\n                fid = int(linelist[0])\n                if fid < 1:\n                    continue\n                results_dict.setdefault(fid, list())\n\n                if is_gt:\n                    label = int(float(linelist[7]))\n                    mark = int(float(linelist[6]))\n                    if mark == 0 or label not in valid_label:\n                        continue\n                    score = 1\n                elif is_ignore:\n                    if 'MOT16-' in filename or 'MOT17-' in filename or 'MOT15-' in filename or 'MOT20-' in filename:\n                        label = int(float(linelist[7]))\n                        vis_ratio = float(linelist[8])\n                        if label not in ignore_labels and vis_ratio >= 0:\n                            continue\n                    else:\n                        continue\n                    score = 1\n                else:\n                    score = float(linelist[6])\n\n                tlwh = tuple(map(float, linelist[2:6]))\n                target_id = int(linelist[1])\n\n                results_dict[fid].append((tlwh, target_id, score))\n    return results_dict\n\n\n\"\"\"\nMOT dataset label list, see in https://motchallenge.net\nlabels={'ped', ...\t\t\t    % 1\n        'person_on_vhcl', ...\t% 2\n        'car', ...\t\t\t\t% 3\n        'bicycle', ...\t\t\t% 4\n        'mbike', ...\t\t\t% 5\n        'non_mot_vhcl', ...\t\t% 
\n\n\"\"\"\nMOT dataset label list, see https://motchallenge.net\nlabels={'ped', ...\t\t\t    % 1\n        'person_on_vhcl', ...\t% 2\n        'car', ...\t\t\t\t% 3\n        'bicycle', ...\t\t\t% 4\n        'mbike', ...\t\t\t% 5\n        'non_mot_vhcl', ...\t\t% 6\n        'static_person', ...\t% 7\n        'distractor', ...\t\t% 8\n        'occluder', ...\t\t\t% 9\n        'occluder_on_grnd', ...\t% 10\n        'occluder_full', ...\t% 11\n        'reflection', ...\t\t% 12\n        'crowd' ...\t\t\t    % 13\n};\n\"\"\"\n\n\n
def unzip_objs(objs):\n    if len(objs) > 0:\n        tlwhs, ids, scores = zip(*objs)\n    else:\n        tlwhs, ids, scores = [], [], []\n    tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4)\n    return tlwhs, ids, scores\n\n\n
class MOTEvaluator(object):\n    def __init__(self, data_root, seq_name, data_type):\n        self.data_root = data_root\n        self.seq_name = seq_name\n        self.data_type = data_type\n\n        self.load_annotations()\n        try:\n            import motmetrics as mm\n            mm.lap.default_solver = 'lap'\n        except Exception:\n            raise RuntimeError(\n                'Unable to use MOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics'\n            )\n        self.reset_accumulator()\n\n
    def load_annotations(self):\n        assert self.data_type == 'mot'\n        gt_filename = os.path.join(self.data_root, self.seq_name, 'gt',\n                                   'gt.txt')\n        if not os.path.exists(gt_filename):\n            logger.warning(\n                \"gt_filename '{}' of MOTEvaluator does not exist, so the MOTA will be -INF.\".\n                format(gt_filename))\n        self.gt_frame_dict = read_mot_results(gt_filename, is_gt=True)\n        self.gt_ignore_frame_dict = read_mot_results(\n            gt_filename, is_ignore=True)\n\n
    def reset_accumulator(self):\n        self.acc = mm.MOTAccumulator(auto_id=True)\n\n
    def eval_frame(self, frame_id, trk_tlwhs, trk_ids, rtn_events=False):\n        # results\n        trk_tlwhs = np.copy(trk_tlwhs)\n        trk_ids = np.copy(trk_ids)\n\n        # gts\n        gt_objs = self.gt_frame_dict.get(frame_id, [])\n        gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2]\n\n        # ignore boxes\n        ignore_objs = self.gt_ignore_frame_dict.get(frame_id, [])\n        ignore_tlwhs = unzip_objs(ignore_objs)[0]\n\n
        # remove ignored results\n        keep = np.ones(len(trk_tlwhs), dtype=bool)\n        iou_distance = mm.distances.iou_matrix(\n            ignore_tlwhs, trk_tlwhs, max_iou=0.5)\n        if len(iou_distance) > 0:\n            match_is, match_js = mm.lap.linear_sum_assignment(iou_distance)\n            match_is, match_js = map(lambda a: np.asarray(a, dtype=int),\n                                     [match_is, match_js])\n            match_ious = iou_distance[match_is, match_js]\n\n
            match_js = np.asarray(match_js, dtype=int)\n            match_js = match_js[np.logical_not(np.isnan(match_ious))]\n            keep[match_js] = False\n            trk_tlwhs = trk_tlwhs[keep]\n            trk_ids = trk_ids[keep]\n\n
        # get distance matrix\n        iou_distance = mm.distances.iou_matrix(gt_tlwhs, trk_tlwhs, max_iou=0.5)\n\n        # acc\n        self.acc.update(gt_ids, trk_ids, iou_distance)\n\n        if rtn_events and iou_distance.size > 0 and hasattr(self.acc,\n                                                            'last_mot_events'):\n            events = self.acc.last_mot_events  # only supported by https://github.com/longcw/py-motmetrics\n        else:\n            events = None\n        return events\n
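\n    # Note on the ignore handling above: tracker boxes are assigned to the\n    # annotated ignore regions with the same linear assignment used for\n    # scoring, and every matched box is dropped, so detections inside\n    # crowd/ignore areas count neither as true nor as false positives.\n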
\n    def eval_file(self, filename):\n        self.reset_accumulator()\n\n        result_frame_dict = read_mot_results(filename, is_gt=False)\n        frames = sorted(list(set(result_frame_dict.keys())))\n        for frame_id in frames:\n            trk_objs = result_frame_dict.get(frame_id, [])\n            trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2]\n            self.eval_frame(frame_id, trk_tlwhs, trk_ids, rtn_events=False)\n\n        return self.acc\n\n
    @staticmethod\n    def get_summary(accs,\n                    names,\n                    metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1',\n                             'precision', 'recall')):\n        names = copy.deepcopy(names)\n        if metrics is None:\n            metrics = mm.metrics.motchallenge_metrics\n        metrics = copy.deepcopy(metrics)\n\n        mh = mm.metrics.create()\n        summary = mh.compute_many(\n            accs, metrics=metrics, names=names, generate_overall=True)\n        return summary\n\n
    @staticmethod\n    def save_summary(summary, filename):\n        import pandas as pd\n        # a context manager flushes and closes the workbook; ExcelWriter.save()\n        # was removed in pandas 2.0\n        with pd.ExcelWriter(filename) as writer:\n            summary.to_excel(writer)\n\n\n
class MOTMetric(Metric):\n    def __init__(self, save_summary=False):\n        self.save_summary = save_summary\n        self.MOTEvaluator = MOTEvaluator\n        self.result_root = None\n        self.reset()\n\n
    def reset(self):\n        self.accs = []\n        self.seqs = []\n\n
    def update(self, data_root, seq, data_type, result_root, result_filename):\n        evaluator = self.MOTEvaluator(data_root, seq, data_type)\n        self.accs.append(evaluator.eval_file(result_filename))\n        self.seqs.append(seq)\n        self.result_root = result_root\n\n
    def accumulate(self):\n        metrics = mm.metrics.motchallenge_metrics\n        mh = mm.metrics.create()\n        summary = self.MOTEvaluator.get_summary(self.accs, self.seqs, metrics)\n        self.strsummary = mm.io.render_summary(\n            summary,\n            formatters=mh.formatters,\n            namemap=mm.io.motchallenge_metric_names)\n        if self.save_summary:\n            self.MOTEvaluator.save_summary(\n                summary, os.path.join(self.result_root, 'summary.xlsx'))\n\n
    def log(self):\n        print(self.strsummary)\n\n    def get_results(self):\n        return self.strsummary\n
\n\nclass JDEDetMetric(Metric):\n    # Note: this detection AP metric differs from COCOMetric and VOCMetric;\n    # the bbox coordinates are not scaled back to the original image\n    def __init__(self, overlap_thresh=0.5):\n        self.overlap_thresh = overlap_thresh\n        self.reset()\n\n
    def reset(self):\n        self.AP_accum = np.zeros(1)\n        self.AP_accum_count = np.zeros(1)\n\n
    def update(self, inputs, outputs):\n        bboxes = outputs['bbox'][:, 2:].numpy()\n        scores = outputs['bbox'][:, 1].numpy()\n        labels = outputs['bbox'][:, 0].numpy()\n        bbox_lengths = outputs['bbox_num'].numpy()\n        if bboxes.shape[0] == 1 and bboxes.sum() == 0.0:\n            return\n\n
        gt_boxes = inputs['gt_bbox'].numpy()[0]\n        gt_labels = inputs['gt_class'].numpy()[0]\n        if gt_labels.shape[0] == 0:\n            return\n\n
        correct = []\n        detected = []\n        for i in range(bboxes.shape[0]):\n            obj_pred = 0\n            pred_bbox = bboxes[i].reshape(1, 4)\n            # compute iou with target boxes\n            iou = bbox_iou_np_expand(pred_bbox, gt_boxes, x1y1x2y2=True)[0]\n            # extract index of largest overlap\n            best_i = np.argmax(iou)\n            # if overlap exceeds threshold and classification is correct, mark as correct\n            if iou[best_i] > self.overlap_thresh and obj_pred == gt_labels[\n                    best_i] and best_i not in detected:\n                correct.append(1)\n                detected.append(best_i)\n            else:\n                correct.append(0)\n\n
        # compute Average Precision (AP) per class\n        target_cls = list(gt_labels.T[0])\n        AP, AP_class, R, P = ap_per_class(\n            tp=correct,\n            conf=scores,\n            pred_cls=np.zeros_like(scores),\n            target_cls=target_cls)\n        self.AP_accum_count += np.bincount(AP_class, minlength=1)\n        self.AP_accum += np.bincount(AP_class, minlength=1, weights=AP)\n\n
    def accumulate(self):\n        logger.info(\"Accumulating evaluation results...\")\n        self.map_stat = self.AP_accum[0] / (self.AP_accum_count[0] + 1E-16)\n\n
    def log(self):\n        map_stat = 100. * self.map_stat\n        logger.info(\"mAP({:.2f}) = {:.2f}%\".format(self.overlap_thresh,\n                                                   map_stat))\n\n    def get_results(self):\n        return self.map_stat\n\n\n
\"\"\"\nThe following code is borrowed from https://github.com/xingyizhou/CenterTrack/blob/master/src/tools/eval_kitti_track/evaluate_tracking.py\n\"\"\"\n\n\n
class tData:\n    \"\"\"\n        Utility class to load data.\n    \"\"\"\n    def __init__(self,frame=-1,obj_type=\"unset\",truncation=-1,occlusion=-1,\\\n                 obs_angle=-10,x1=-1,y1=-1,x2=-1,y2=-1,w=-1,h=-1,l=-1,\\\n                 X=-1000,Y=-1000,Z=-1000,yaw=-10,score=-1000,track_id=-1):\n        \"\"\"\n            Constructor, initializes the object given the parameters.\n        \"\"\"\n        self.frame = frame\n        self.track_id = track_id\n        self.obj_type = obj_type\n        self.truncation = truncation\n        self.occlusion = occlusion\n        self.obs_angle = obs_angle\n        self.x1 = x1\n        self.y1 = y1\n        self.x2 = x2\n        self.y2 = y2\n        self.w = w\n        self.h = h\n        self.l = l\n        self.X = X\n        self.Y = Y\n        self.Z = Z\n        self.yaw = yaw\n        self.score = score\n        self.ignored = False\n        self.valid = False\n        self.tracker = -1\n\n
    def __str__(self):\n        attrs = vars(self)\n        return '\\n'.join(\"%s: %s\" % item for item in attrs.items())\n\n\n
class KITTIEvaluation(object):\n    \"\"\" KITTI tracking statistics (CLEAR MOT, id-switches, fragments, ML/PT/MT, precision/recall)\n             MOTA\t- Multi-object tracking accuracy in [0,100]\n             MOTP\t- Multi-object tracking precision in [0,100] (3D) / [td,100] (2D)\n             MOTAL\t- Multi-object tracking accuracy in [0,100] with log10(id-switches)\n\n             id-switches - number of id switches\n             fragments   - number of fragmentations\n\n             MT, PT, ML\t- number of mostly tracked, partially tracked and mostly lost trajectories\n\n             recall\t        - recall = percentage of detected targets\n             precision\t    - precision = percentage of correctly detected targets\n             FAR\t\t    - number of false alarms per frame\n             falsepositives - number of false positives (FP)\n             missed         - number of missed targets (FN)\n    \"\"\"\n    def __init__(self, result_path, gt_path, min_overlap=0.5, max_truncation = 0,\\\n                min_height = 25, max_occlusion = 2, cls=\"car\",\\\n                n_frames=[], seqs=[], n_sequences=0):\n        # get number of sequences and\n        # 
get number of frames per sequence from test mapping\n        # (created while extracting the benchmark)\n        self.gt_path = os.path.join(gt_path, \"../labels\")\n        self.n_frames = n_frames\n        self.sequence_name = seqs\n        self.n_sequences = n_sequences\n\n        self.cls = cls  # class to evaluate, i.e. pedestrian or car\n\n        self.result_path = result_path\n\n
        # statistics and numbers for evaluation\n        self.n_gt = 0  # number of ground truth detections minus ignored false negatives and true positives\n        self.n_igt = 0  # number of ignored ground truth detections\n        self.n_gts = [\n        ]  # number of ground truth detections minus ignored false negatives and true positives PER SEQUENCE\n        self.n_igts = [\n        ]  # number of ignored ground truth detections PER SEQUENCE\n        self.n_gt_trajectories = 0\n        self.n_gt_seq = []\n        self.n_tr = 0  # number of tracker detections minus ignored tracker detections\n        self.n_trs = [\n        ]  # number of tracker detections minus ignored tracker detections PER SEQUENCE\n        self.n_itr = 0  # number of ignored tracker detections\n        self.n_itrs = []  # number of ignored tracker detections PER SEQUENCE\n        self.n_igttr = 0  # number of ignored ground truth detections where the corresponding associated tracker detection is also ignored\n        self.n_tr_trajectories = 0\n        self.n_tr_seq = []\n
        self.MOTA = 0\n        self.MOTP = 0\n        self.MOTAL = 0\n        self.MODA = 0\n        self.MODP = 0\n        self.MODP_t = []\n        self.recall = 0\n        self.precision = 0\n        self.F1 = 0\n        self.FAR = 0\n        self.total_cost = 0\n        self.itp = 0  # number of ignored true positives\n        self.itps = []  # number of ignored true positives PER SEQUENCE\n        self.tp = 0  # number of true positives including ignored true positives!\n        self.tps = [\n        ]  # number of true positives including ignored true positives PER SEQUENCE\n        self.fn = 0  # number of false negatives WITHOUT ignored false negatives\n        self.fns = [\n        ]  # number of false negatives WITHOUT ignored false negatives PER SEQUENCE\n        self.ifn = 0  # number of ignored false negatives\n        self.ifns = []  # number of ignored false negatives PER SEQUENCE\n        self.fp = 0  # number of false positives\n
        # a bit tricky: the number of ignored false negatives and ignored true positives\n        # is subtracted, but if both tracker detection and ground truth detection\n        # are ignored this number is added again to avoid double counting\n        self.fps = []  # above PER SEQUENCE\n        self.mme = 0\n        self.fragments = 0\n        self.id_switches = 0\n        self.MT = 0\n        self.PT = 0\n        self.ML = 0\n\n
        self.min_overlap = min_overlap  # minimum bounding box overlap for 3rd party metrics\n        self.max_truncation = max_truncation  # maximum truncation of an object for evaluation\n        self.max_occlusion = max_occlusion  # maximum occlusion of an object for evaluation\n        self.min_height = min_height  # minimum height of an object for evaluation\n        self.n_sample_points = 500\n\n
        # this should be enough to hold all groundtruth trajectories;\n        # it is expanded if necessary and reduced in any case\n        self.gt_trajectories = [[] for x in range(self.n_sequences)]\n        self.ign_trajectories = [[] for x in range(self.n_sequences)]\n
\n    def loadGroundtruth(self):\n        try:\n            self._loadData(self.gt_path, cls=self.cls, loading_groundtruth=True)\n        except IOError:\n            return False\n        return True\n\n
    def loadTracker(self):\n        try:\n            if not self._loadData(\n                    self.result_path, cls=self.cls, loading_groundtruth=False):\n                return False\n        except IOError:\n            return False\n        return True\n\n
    def _loadData(self,\n                  root_dir,\n                  cls,\n                  min_score=-1000,\n                  loading_groundtruth=False):\n        \"\"\"\n            Generic loader for ground truth and tracking data.\n            Use loadGroundtruth() or loadTracker() to load this data.\n            Loads detections in KITTI format from textfiles.\n        \"\"\"\n        # construct objectDetections object to hold detection data\n        t_data = tData()\n        data = []\n        eval_2d = True\n        eval_3d = True\n\n
        seq_data = []\n        n_trajectories = 0\n        n_trajectories_seq = []\n        for seq, s_name in enumerate(self.sequence_name):\n            i = 0\n            filename = os.path.join(root_dir, \"%s.txt\" % s_name)\n            f = open(filename, \"r\")\n\n
            f_data = [\n                [] for x in range(self.n_frames[seq])\n            ]  # current set has only 1059 entries, sufficient length is checked anyway\n            ids = []\n            n_in_seq = 0\n            id_frame_cache = []\n            for line in f:\n                # KITTI tracking benchmark data format:\n                # (frame,tracklet_id,objectType,truncation,occlusion,alpha,x1,y1,x2,y2,h,w,l,X,Y,Z,ry)\n                line = line.strip()\n                fields = line.split(\" \")\n                # classes that should be loaded (neighboring classes are evaluated jointly)\n                if \"car\" in cls.lower():\n                    classes = [\"car\", \"van\"]\n                elif \"pedestrian\" in cls.lower():\n                    classes = [\"pedestrian\", \"person_sitting\"]\n                else:\n                    classes = [cls.lower()]\n                classes += [\"dontcare\"]\n                if not any([s for s in classes if s in fields[2].lower()]):\n                    continue\n
                # get fields from table\n                t_data.frame = int(float(fields[0]))  # frame\n                t_data.track_id = int(float(fields[1]))  # id\n                t_data.obj_type = fields[\n                    2].lower()  # object type [car, pedestrian, cyclist, ...]\n                t_data.truncation = int(\n                    float(fields[3]))  # truncation [-1,0,1,2]\n                t_data.occlusion = int(\n                    float(fields[4]))  # occlusion  [-1,0,1,2]\n                t_data.obs_angle = float(fields[5])  # observation angle [rad]\n                t_data.x1 = float(fields[6])  # left   [px]\n                t_data.y1 = float(fields[7])  # top    [px]\n                t_data.x2 = float(fields[8])  # right  [px]\n                t_data.y2 = float(fields[9])  # bottom [px]\n                t_data.h = float(fields[10])  # height [m]\n                t_data.w = float(fields[11])  # width  [m]\n                t_data.l = float(fields[12])  # length [m]\n                t_data.X = float(fields[13])  # X [m]\n                t_data.Y = float(fields[14])  # Y [m]\n                t_data.Z = float(fields[15])  # Z [m]\n                t_data.yaw = float(fields[16])  # yaw angle [rad]\n
                if not loading_groundtruth:\n                    if len(fields) == 17:\n                        t_data.score = -1\n                    elif len(fields) == 18:\n                        t_data.score = float(fields[17])  # detection score\n                    else:\n                        logger.info(\"file is not in KITTI format\")\n                        return\n\n
                # do not consider objects marked as invalid\n                if t_data.track_id == -1 and t_data.obj_type != \"dontcare\":\n                    continue\n\n
                idx = t_data.frame\n                # check if length for frame data is sufficient\n                if idx >= len(f_data):\n                    print(\"extend f_data\", idx, len(f_data))\n                    f_data += [[] for x in range(max(500, idx - len(f_data)))]\n                try:\n                    id_frame = (t_data.frame, t_data.track_id)\n                    if id_frame in id_frame_cache and not loading_groundtruth:\n                        logger.info(\n                            \"track ids are not unique for sequence %d: frame %d\"\n                            % (seq, t_data.frame))\n                        logger.info(\n                            \"track id %d occurred at least twice for this frame\"\n                            % t_data.track_id)\n                        logger.info(\"Exiting...\")\n                        #continue # this would allow evaluating non-unique result files\n                        return False\n                    id_frame_cache.append(id_frame)\n                    f_data[t_data.frame].append(copy.copy(t_data))\n                except:\n                    print(len(f_data), idx)\n                    raise\n\n
                if t_data.track_id not in ids and t_data.obj_type != \"dontcare\":\n                    ids.append(t_data.track_id)\n                    n_trajectories += 1\n                    n_in_seq += 1\n\n
                # check if uploaded data provides information for 2D and 3D evaluation\n                if not loading_groundtruth and eval_2d is True and (\n                        t_data.x1 == -1 or t_data.x2 == -1 or t_data.y1 == -1 or\n                        t_data.y2 == -1):\n                    eval_2d = False\n                if not loading_groundtruth and eval_3d is True and (\n                        t_data.X == -1000 or t_data.Y == -1000 or\n                        t_data.Z == -1000):\n                    eval_3d = False\n\n
            # only add existing frames\n            n_trajectories_seq.append(n_in_seq)\n            seq_data.append(f_data)\n            f.close()\n\n
        if not loading_groundtruth:\n            self.tracker = seq_data\n            self.n_tr_trajectories = n_trajectories\n            self.eval_2d = eval_2d\n            self.eval_3d = eval_3d\n            self.n_tr_seq = n_trajectories_seq\n            if self.n_tr_trajectories == 0:\n                return False\n        else:\n
            # split ground truth and DontCare areas\n            self.dcareas = []\n            self.groundtruth = []\n            for seq_idx in range(len(seq_data)):\n                seq_gt = seq_data[seq_idx]\n                s_g, s_dc = [], []\n                for f in range(len(seq_gt)):\n                    all_gt = seq_gt[f]\n                    g, dc = [], []\n                    for gg in all_gt:\n                        if gg.obj_type == \"dontcare\":\n                            dc.append(gg)\n                        else:\n
g.append(gg)\n                    s_g.append(g)\n                    s_dc.append(dc)\n                self.dcareas.append(s_dc)\n                self.groundtruth.append(s_g)\n            self.n_gt_seq = n_trajectories_seq\n            self.n_gt_trajectories = n_trajectories\n        return True\n\n    def boxoverlap(self, a, b, criterion=\"union\"):\n        \"\"\"\n            boxoverlap computes intersection over union for bbox a and b in KITTI format.\n            If the criterion is 'union', overlap = (a inter b) / (a union b).\n            If the criterion is 'a', overlap = (a inter b) / a, where b should be a dontcare area.\n        \"\"\"\n        x1 = max(a.x1, b.x1)\n        y1 = max(a.y1, b.y1)\n        x2 = min(a.x2, b.x2)\n        y2 = min(a.y2, b.y2)\n\n        w = x2 - x1\n        h = y2 - y1\n\n        if w <= 0. or h <= 0.:\n            return 0.\n        inter = w * h\n        aarea = (a.x2 - a.x1) * (a.y2 - a.y1)\n        barea = (b.x2 - b.x1) * (b.y2 - b.y1)\n        # intersection over union overlap\n        if criterion.lower() == \"union\":\n            o = inter / float(aarea + barea - inter)\n        elif criterion.lower() == \"a\":\n            o = float(inter) / float(aarea)\n        else:\n            raise ValueError(\"Unknown criterion: %s\" % criterion)\n        return o\n\n    def compute3rdPartyMetrics(self):\n        \"\"\"\n            Computes the metrics defined in\n                - Stiefelhagen 2008: Evaluating Multiple Object Tracking Performance: The CLEAR MOT Metrics\n                  MOTA, MOTAL, MOTP\n                - Nevatia 2008: Global Data Association for Multi-Object Tracking Using Network Flows\n                  MT/PT/ML\n        \"\"\"\n        # construct Munkres object for Hungarian Method association\n        hm = Munkres()\n        max_cost = 1e9\n\n        # go through all frames and associate ground truth and tracker results\n        # groundtruth and tracker contain lists for every single frame containing lists of KITTI format detections\n        fr, ids = 0, 0\n        for seq_idx in range(len(self.groundtruth)):\n            seq_gt = self.groundtruth[seq_idx]\n            seq_dc = self.dcareas[seq_idx]  # don't care areas\n            seq_tracker = self.tracker[seq_idx]\n            seq_trajectories = defaultdict(list)\n            seq_ignored = defaultdict(list)\n\n            # statistics over the current sequence, check the corresponding\n            # variable comments in __init__ to get their meaning\n            seqtp = 0\n            seqitp = 0\n            seqfn = 0\n            seqifn = 0\n            seqfp = 0\n            seqigt = 0\n            seqitr = 0\n\n            last_ids = [[], []]\n            n_gts = 0\n            n_trs = 0\n\n            for f in range(len(seq_gt)):\n                g = seq_gt[f]\n                dc = seq_dc[f]\n\n                t = seq_tracker[f]\n                # counting total number of ground truth and tracker objects\n                self.n_gt += len(g)\n                self.n_tr += len(t)\n\n                n_gts += len(g)\n                n_trs += len(t)\n\n                # use Hungarian method to associate, using boxoverlap 0..1 as cost\n                # build cost matrix\n                cost_matrix = []\n                this_ids = [[], []]\n                for gg in g:\n                    # save current ids\n                    this_ids[0].append(gg.track_id)\n                    this_ids[1].append(-1)\n                    gg.tracker = -1\n                    gg.id_switch = 0\n       
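             # (cost row below: cost = 1 - IoU; pairs failing the overlap gate are set to max_cost)\n       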
             gg.fragmentation = 0\n                    cost_row = []\n                    for tt in t:\n                        # overlap == 1 means cost == 0\n                        c = 1 - self.boxoverlap(gg, tt)\n                        # gating for boxoverlap\n                        if c <= self.min_overlap:\n                            cost_row.append(c)\n                        else:\n                            cost_row.append(max_cost)  # = 1e9\n                    cost_matrix.append(cost_row)\n                    # all ground truth trajectories are initially not associated\n                    # extend groundtruth trajectories lists (merge lists)\n                    seq_trajectories[gg.track_id].append(-1)\n                    seq_ignored[gg.track_id].append(False)\n\n                if len(g) == 0:\n                    cost_matrix = [[]]\n                # associate\n                association_matrix = hm.compute(cost_matrix)\n\n                # tmp variables for sanity checks and MODP computation\n                tmptp = 0\n                tmpfp = 0\n                tmpfn = 0\n                tmpc = 0  # this will sum up the overlaps for all true positives\n                tmpcs = [0] * len(\n                    g)  # this will save the overlaps for all true positives\n                # the reason is that some true positives might be ignored\n                # later such that the corresponding overlaps can\n                # be subtracted from tmpc for MODP computation\n\n                # mapping for tracker ids and ground truth ids\n                for row, col in association_matrix:\n                    # apply gating on boxoverlap\n                    c = cost_matrix[row][col]\n                    if c < max_cost:\n                        g[row].tracker = t[col].track_id\n                        this_ids[1][row] = t[col].track_id\n                        t[col].valid = True\n                        g[row].distance = c\n                        self.total_cost += 1 - c\n                        tmpc += 1 - c\n                        tmpcs[row] = 1 - c\n                        seq_trajectories[g[row].track_id][-1] = t[col].track_id\n\n                        # true positives are only valid associations\n                        self.tp += 1\n                        tmptp += 1\n                    else:\n                        g[row].tracker = -1\n                        self.fn += 1\n                        tmpfn += 1\n\n                # associate tracker and DontCare areas\n                # ignore tracker in neighboring classes\n                nignoredtracker = 0  # number of ignored tracker detections\n                ignoredtrackers = dict()  # will associate the track_id with -1\n                # if it is not ignored and 1 if it is\n                # ignored;\n                # this is used to avoid double counting ignored\n                # cases, see the next loop\n\n                for tt in t:\n                    ignoredtrackers[tt.track_id] = -1\n                    # ignore detection if it belongs to a neighboring class or is\n                    # smaller or equal to the minimum height\n\n                    tt_height = abs(tt.y1 - tt.y2)\n                    if ((self.cls == \"car\" and tt.obj_type == \"van\") or\n                        (self.cls == \"pedestrian\" and\n                         tt.obj_type == \"person_sitting\") or\n                            tt_height <= self.min_height) and not tt.valid:\n                        nignoredtracker += 1\n         
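               # record the ignore so the ground-truth pass below can avoid double counting\n         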
               tt.ignored = True\n                        ignoredtrackers[tt.track_id] = 1\n                        continue\n                    for d in dc:\n                        overlap = self.boxoverlap(tt, d, \"a\")\n                        if overlap > 0.5 and not tt.valid:\n                            tt.ignored = True\n                            nignoredtracker += 1\n                            ignoredtrackers[tt.track_id] = 1\n                            break\n\n                # check for ignored FN/TP (truncation or neighboring object class)\n                ignoredfn = 0  # the number of ignored false negatives\n                nignoredtp = 0  # the number of ignored true positives\n                nignoredpairs = 0  # the number of ignored pairs, i.e. a true positive\n                # which is ignored but where the associated tracker\n                # detection has already been ignored\n\n                gi = 0\n                for gg in g:\n                    if gg.tracker < 0:\n                        if gg.occlusion>self.max_occlusion or gg.truncation>self.max_truncation\\\n                                or (self.cls==\"car\" and gg.obj_type==\"van\") or (self.cls==\"pedestrian\" and gg.obj_type==\"person_sitting\"):\n                            seq_ignored[gg.track_id][-1] = True\n                            gg.ignored = True\n                            ignoredfn += 1\n\n                    elif gg.tracker >= 0:\n                        if gg.occlusion>self.max_occlusion or gg.truncation>self.max_truncation\\\n                                or (self.cls==\"car\" and gg.obj_type==\"van\") or (self.cls==\"pedestrian\" and gg.obj_type==\"person_sitting\"):\n\n                            seq_ignored[gg.track_id][-1] = True\n                            gg.ignored = True\n                            nignoredtp += 1\n\n                            # if the associated tracker detection is already ignored,\n                            # we want to avoid double counting ignored detections\n                            if ignoredtrackers[gg.tracker] > 0:\n                                nignoredpairs += 1\n\n                            # for computing MODP, the overlaps from ignored detections\n                            # are subtracted\n                            tmpc -= tmpcs[gi]\n                    gi += 1\n\n                # the below might be confusing, check the comments in __init__\n                # to see what the individual statistics represent\n\n                # correct TP by number of ignored TP due to truncation\n                # ignored TP are shown as tracked in visualization\n                tmptp -= nignoredtp\n\n                # count the number of ignored true positives\n                self.itp += nignoredtp\n\n                # adjust the number of ground truth objects considered\n                self.n_gt -= (ignoredfn + nignoredtp)\n\n                # count the number of ignored ground truth objects\n                self.n_igt += ignoredfn + nignoredtp\n\n                # count the number of ignored tracker objects\n                self.n_itr += nignoredtracker\n\n                # count the number of ignored pairs, i.e. 
associated tracker and\n                # ground truth objects that are both ignored\n                self.n_igttr += nignoredpairs\n\n                # false negatives = associated gt bboxes exceeding association threshold + non-associated gt bboxes\n                tmpfn += len(g) - len(association_matrix) - ignoredfn\n                self.fn += len(g) - len(association_matrix) - ignoredfn\n                self.ifn += ignoredfn\n\n                # false positives = tracker bboxes - associated tracker bboxes\n                # mismatches (mme_t)\n                tmpfp += len(\n                    t) - tmptp - nignoredtracker - nignoredtp + nignoredpairs\n                self.fp += len(\n                    t) - tmptp - nignoredtracker - nignoredtp + nignoredpairs\n\n                # update sequence data\n                seqtp += tmptp\n                seqitp += nignoredtp\n                seqfp += tmpfp\n                seqfn += tmpfn\n                seqifn += ignoredfn\n                seqigt += ignoredfn + nignoredtp\n                seqitr += nignoredtracker\n\n                # sanity checks\n                # - the number of true positives minus ignored true positives\n                #   should be greater or equal to 0\n                # - the number of false negatives should be greater or equal to 0\n                # - the number of false positives needs to be greater or equal to 0\n                #   otherwise ignored detections might be counted double\n                # - the number of counted true positives (plus ignored ones)\n                #   and the number of counted false negatives (plus ignored ones)\n                #   should match the total number of ground truth objects\n                # - the number of counted true positives (plus ignored ones)\n                #   and the number of counted false positives\n                #   plus the number of ignored tracker detections should\n                #   match the total number of tracker detections; note that\n                #   nignoredpairs is subtracted here to avoid double counting\n                #   of ignored detections in nignoredtp and nignoredtracker\n                if tmptp < 0:\n                    print(tmptp, nignoredtp)\n                    raise RuntimeError(\"Something went wrong! TP is negative\")\n                if tmpfn < 0:\n                    print(tmpfn,\n                          len(g),\n                          len(association_matrix), ignoredfn, nignoredpairs)\n                    raise RuntimeError(\"Something went wrong! FN is negative\")\n                if tmpfp < 0:\n                    print(tmpfp,\n                          len(t), tmptp, nignoredtracker, nignoredtp,\n                          nignoredpairs)\n                    raise RuntimeError(\"Something went wrong! FP is negative\")\n                if tmptp + tmpfn != len(g) - ignoredfn - nignoredtp:\n                    print(\"seqidx\", seq_idx)\n                    print(\"frame \", f)\n                    print(\"TP    \", tmptp)\n                    print(\"FN    \", tmpfn)\n                    print(\"FP    \", tmpfp)\n                    print(\"nGT   \", len(g))\n                    print(\"nAss  \", len(association_matrix))\n                    print(\"ign GT\", ignoredfn)\n                    print(\"ign TP\", nignoredtp)\n                    raise RuntimeError(\n                        \"Something went wrong! 
nGroundtruth is not TP+FN\")\n                if tmptp + tmpfp + nignoredtp + nignoredtracker - nignoredpairs != len(\n                        t):\n                    print(seq_idx, f, len(t), tmptp, tmpfp)\n                    print(len(association_matrix), association_matrix)\n                    raise RuntimeError(\n                        \"Something went wrong! nTracker is not TP+FP\")\n\n                # check for id switches or fragmentations\n                for i, tt in enumerate(this_ids[0]):\n                    if tt in last_ids[0]:\n                        idx = last_ids[0].index(tt)\n                        tid = this_ids[1][i]\n                        lid = last_ids[1][idx]\n                        if tid != lid and lid != -1 and tid != -1:\n                            if g[i].truncation < self.max_truncation:\n                                g[i].id_switch = 1\n                                ids += 1\n                        if tid != lid and lid != -1:\n                            if g[i].truncation < self.max_truncation:\n                                g[i].fragmentation = 1\n                                fr += 1\n\n                # save current ids\n                last_ids = this_ids\n                # compute MODP_t\n                MODP_t = 1\n                if tmptp != 0:\n                    MODP_t = tmpc / float(tmptp)\n                self.MODP_t.append(MODP_t)\n\n            # remove empty lists for current gt trajectories\n            self.gt_trajectories[seq_idx] = seq_trajectories\n            self.ign_trajectories[seq_idx] = seq_ignored\n\n            # gather per-sequence statistics.\n            self.n_gts.append(n_gts)\n            self.n_trs.append(n_trs)\n            self.tps.append(seqtp)\n            self.itps.append(seqitp)\n            self.fps.append(seqfp)\n            self.fns.append(seqfn)\n            self.ifns.append(seqifn)\n            self.n_igts.append(seqigt)\n            self.n_itrs.append(seqitr)\n\n        # compute MT/PT/ML, fragments, idswitches for all groundtruth trajectories\n        n_ignored_tr_total = 0\n        for seq_idx, (\n                seq_trajectories, seq_ignored\n        ) in enumerate(zip(self.gt_trajectories, self.ign_trajectories)):\n            if len(seq_trajectories) == 0:\n                continue\n            tmpMT, tmpML, tmpPT, tmpId_switches, tmpFragments = [0] * 5\n            n_ignored_tr = 0\n            for g, ign_g in zip(seq_trajectories.values(),\n                                seq_ignored.values()):\n                # all frames of this gt trajectory are ignored\n                if all(ign_g):\n                    n_ignored_tr += 1\n                    n_ignored_tr_total += 1\n                    continue\n                # all frames of this gt trajectory are not assigned to any detections\n                if all([this == -1 for this in g]):\n                    tmpML += 1\n                    self.ML += 1\n                    continue\n                # compute tracked frames in trajectory\n                last_id = g[0]\n                # first detection (necessary to be in gt_trajectories) is always tracked\n                tracked = 1 if g[0] >= 0 else 0\n                lgt = 0 if ign_g[0] else 1\n                for f in range(1, len(g)):\n                    if ign_g[f]:\n                        last_id = -1\n                        continue\n                    lgt += 1\n                    if last_id != g[f] and last_id != -1 and g[f] != -1 and g[\n           
                 f - 1] != -1:\n                        tmpId_switches += 1\n                        self.id_switches += 1\n                    if f < len(g) - 1 and g[f - 1] != g[\n                            f] and last_id != -1 and g[f] != -1 and g[f +\n                                                                      1] != -1:\n                        tmpFragments += 1\n                        self.fragments += 1\n                    if g[f] != -1:\n                        tracked += 1\n                        last_id = g[f]\n                # handle last frame; tracked state is handled in for loop (g[f]!=-1)\n                if len(g) > 1 and g[f - 1] != g[f] and last_id != -1 and g[\n                        f] != -1 and not ign_g[f]:\n                    tmpFragments += 1\n                    self.fragments += 1\n\n                # compute MT/PT/ML\n                tracking_ratio = tracked / float(len(g) - sum(ign_g))\n                if tracking_ratio > 0.8:\n                    tmpMT += 1\n                    self.MT += 1\n                elif tracking_ratio < 0.2:\n                    tmpML += 1\n                    self.ML += 1\n                else:  # 0.2 <= tracking_ratio <= 0.8\n                    tmpPT += 1\n                    self.PT += 1\n\n        if (self.n_gt_trajectories - n_ignored_tr_total) == 0:\n            self.MT = 0.\n            self.PT = 0.\n            self.ML = 0.\n        else:\n            self.MT /= float(self.n_gt_trajectories - n_ignored_tr_total)\n            self.PT /= float(self.n_gt_trajectories - n_ignored_tr_total)\n            self.ML /= float(self.n_gt_trajectories - n_ignored_tr_total)\n\n        # precision/recall etc.\n        if (self.fp + self.tp) == 0 or (self.tp + self.fn) == 0:\n            self.recall = 0.\n            self.precision = 0.\n        else:\n            self.recall = self.tp / float(self.tp + self.fn)\n            self.precision = self.tp / float(self.fp + self.tp)\n        if (self.recall + self.precision) == 0:\n            self.F1 = 0.\n        else:\n            self.F1 = 2. 
* (self.precision * self.recall) / (\n                self.precision + self.recall)\n        if sum(self.n_frames) == 0:\n            self.FAR = \"n/a\"\n        else:\n            self.FAR = self.fp / float(sum(self.n_frames))\n\n        # compute CLEARMOT\n        if self.n_gt == 0:\n            self.MOTA = -float(\"inf\")\n            self.MODA = -float(\"inf\")\n        else:\n            self.MOTA = 1 - (self.fn + self.fp + self.id_switches\n                             ) / float(self.n_gt)\n            self.MODA = 1 - (self.fn + self.fp) / float(self.n_gt)\n        if self.tp == 0:\n            self.MOTP = float(\"inf\")\n        else:\n            self.MOTP = self.total_cost / float(self.tp)\n        if self.n_gt != 0:\n            if self.id_switches == 0:\n                self.MOTAL = 1 - (self.fn + self.fp + self.id_switches\n                                  ) / float(self.n_gt)\n            else:\n                self.MOTAL = 1 - (self.fn + self.fp +\n                                  math.log10(self.id_switches)\n                                  ) / float(self.n_gt)\n        else:\n            self.MOTAL = -float(\"inf\")\n        if sum(self.n_frames) == 0:\n            self.MODP = \"n/a\"\n        else:\n            self.MODP = sum(self.MODP_t) / float(sum(self.n_frames))\n        return True\n\n    def createSummary(self):\n        summary = \"\"\n        summary += \"tracking evaluation summary\".center(80, \"=\") + \"\\n\"\n        summary += self.printEntry(\"Multiple Object Tracking Accuracy (MOTA)\",\n                                   self.MOTA) + \"\\n\"\n        summary += self.printEntry(\"Multiple Object Tracking Precision (MOTP)\",\n                                   self.MOTP) + \"\\n\"\n        summary += self.printEntry(\"Multiple Object Tracking Accuracy (MOTAL)\",\n                                   self.MOTAL) + \"\\n\"\n        summary += self.printEntry(\"Multiple Object Detection Accuracy (MODA)\",\n                                   self.MODA) + \"\\n\"\n        summary += self.printEntry(\"Multiple Object Detection Precision (MODP)\",\n                                   self.MODP) + \"\\n\"\n        summary += \"\\n\"\n        summary += self.printEntry(\"Recall\", self.recall) + \"\\n\"\n        summary += self.printEntry(\"Precision\", self.precision) + \"\\n\"\n        summary += self.printEntry(\"F1\", self.F1) + \"\\n\"\n        summary += self.printEntry(\"False Alarm Rate\", self.FAR) + \"\\n\"\n        summary += \"\\n\"\n        summary += self.printEntry(\"Mostly Tracked\", self.MT) + \"\\n\"\n        summary += self.printEntry(\"Partly Tracked\", self.PT) + \"\\n\"\n        summary += self.printEntry(\"Mostly Lost\", self.ML) + \"\\n\"\n        summary += \"\\n\"\n        summary += self.printEntry(\"True Positives\", self.tp) + \"\\n\"\n        #summary += self.printEntry(\"True Positives per Sequence\", self.tps) + \"\\n\"\n        summary += self.printEntry(\"Ignored True Positives\", self.itp) + \"\\n\"\n        #summary += self.printEntry(\"Ignored True Positives per Sequence\", self.itps) + \"\\n\"\n\n        summary += self.printEntry(\"False Positives\", self.fp) + \"\\n\"\n        #summary += self.printEntry(\"False Positives per Sequence\", self.fps) + \"\\n\"\n        summary += self.printEntry(\"False Negatives\", self.fn) + \"\\n\"\n        #summary += self.printEntry(\"False Negatives per Sequence\", self.fns) + \"\\n\"\n        summary += self.printEntry(\"ID-switches\", self.id_switches) + \"\\n\"\n        self.fp = self.fp 
/ self.n_gt\n        self.fn = self.fn / self.n_gt\n        self.id_switches = self.id_switches / self.n_gt\n        # note: self.fp, self.fn and self.id_switches now hold ratios w.r.t. n_gt\n        summary += self.printEntry(\"False Positives Ratio\", self.fp) + \"\\n\"\n        #summary += self.printEntry(\"False Positives per Sequence\", self.fps) + \"\\n\"\n        summary += self.printEntry(\"False Negatives Ratio\", self.fn) + \"\\n\"\n        #summary += self.printEntry(\"False Negatives per Sequence\", self.fns) + \"\\n\"\n        summary += self.printEntry(\"Ignored False Negatives Ratio\",\n                                   self.ifn) + \"\\n\"\n\n        #summary += self.printEntry(\"Ignored False Negatives per Sequence\", self.ifns) + \"\\n\"\n        summary += self.printEntry(\"Missed Targets\", self.fn) + \"\\n\"\n        summary += self.printEntry(\"ID-switches\", self.id_switches) + \"\\n\"\n        summary += self.printEntry(\"Fragmentations\", self.fragments) + \"\\n\"\n        summary += \"\\n\"\n        summary += self.printEntry(\"Ground Truth Objects (Total)\", self.n_gt +\n                                   self.n_igt) + \"\\n\"\n        #summary += self.printEntry(\"Ground Truth Objects (Total) per Sequence\", self.n_gts) + \"\\n\"\n        summary += self.printEntry(\"Ignored Ground Truth Objects\",\n                                   self.n_igt) + \"\\n\"\n        #summary += self.printEntry(\"Ignored Ground Truth Objects per Sequence\", self.n_igts) + \"\\n\"\n        summary += self.printEntry(\"Ground Truth Trajectories\",\n                                   self.n_gt_trajectories) + \"\\n\"\n        summary += \"\\n\"\n        summary += self.printEntry(\"Tracker Objects (Total)\", self.n_tr) + \"\\n\"\n        #summary += self.printEntry(\"Tracker Objects (Total) per Sequence\", self.n_trs) + \"\\n\"\n        summary += self.printEntry(\"Ignored Tracker Objects\", self.n_itr) + \"\\n\"\n        #summary += self.printEntry(\"Ignored Tracker Objects per Sequence\", self.n_itrs) + \"\\n\"\n        summary += self.printEntry(\"Tracker Trajectories\",\n                                   self.n_tr_trajectories) + \"\\n\"\n        #summary += \"\\n\"\n        #summary += self.printEntry(\"Ignored Tracker Objects with Associated Ignored Ground Truth Objects\", self.n_igttr) + \"\\n\"\n        summary += \"=\" * 80\n        return summary\n\n    def printEntry(self, key, val, width=(70, 10)):\n        \"\"\"\n            Pretty print an entry in a table fashion.\n        \"\"\"\n        s_out = key.ljust(width[0])\n        if isinstance(val, int):\n            s = \"%%%dd\" % width[1]\n            s_out += s % val\n        elif isinstance(val, float):\n            s = \"%%%df\" % (width[1])\n            s_out += s % val\n        else:\n            s_out += (\"%s\" % val).rjust(width[1])\n        return s_out\n\n    def saveToStats(self, save_summary):\n        \"\"\"\n            Save the statistics in a whitespace-separated file.\n        \"\"\"\n        summary = self.createSummary()\n        if save_summary:\n            filename = os.path.join(self.result_path,\n                                    \"summary_%s.txt\" % self.cls)\n            with open(filename, \"w\") as dump:\n                dump.write(summary)\n        return summary\n\n\nclass KITTIMOTMetric(Metric):\n    def __init__(self, save_summary=True):\n        self.save_summary = save_summary\n        self.MOTEvaluator = KITTIEvaluation\n        self.result_root = None\n        self.reset()\n\n    def reset(self):\n        self.seqs = []\n        self.n_sequences = 0\n       
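 # per-sequence frame counts and the cached summary string\n       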
 self.n_frames = []\n        self.strsummary = ''\n\n    def update(self, data_root, seq, data_type, result_root, result_filename):\n        assert data_type == 'kitti', \"data_type should be 'kitti'\"\n        self.result_root = result_root\n        self.gt_path = data_root\n        gt_path = '{}/../labels/{}.txt'.format(data_root, seq)\n        gt = open(gt_path, \"r\")\n        max_frame = 0\n        for line in gt:\n            line = line.strip()\n            line_list = line.split(\" \")\n            if int(line_list[0]) > max_frame:\n                max_frame = int(line_list[0])\n        rs = open(result_filename, \"r\")\n        for line in rs:\n            line = line.strip()\n            line_list = line.split(\" \")\n            if int(line_list[0]) > max_frame:\n                max_frame = int(line_list[0])\n        gt.close()\n        rs.close()\n        self.n_frames.append(max_frame + 1)\n        self.seqs.append(seq)\n        self.n_sequences += 1\n\n    def accumulate(self):\n        logger.info(\"Processing Result for KITTI Tracking Benchmark\")\n        e = self.MOTEvaluator(result_path=self.result_root, gt_path=self.gt_path,\\\n            n_frames=self.n_frames, seqs=self.seqs, n_sequences=self.n_sequences)\n        try:\n            if not e.loadTracker():\n                return\n            logger.info(\"Loading Results - Success\")\n            logger.info(\"Evaluate Object Class: %s\" % e.cls.upper())\n        except Exception:\n            logger.info(\"Caught exception while loading result data.\")\n            return\n        if not e.loadGroundtruth():\n            raise ValueError(\"Ground truth not found.\")\n        logger.info(\"Loading Groundtruth - Success\")\n        # sanity checks\n        if len(e.groundtruth) != len(e.tracker):\n            logger.info(\n                \"The uploaded data does not provide results for every sequence.\")\n            return False\n        logger.info(\"Loaded %d Sequences.\" % len(e.groundtruth))\n        logger.info(\"Start Evaluation...\")\n\n        if e.compute3rdPartyMetrics():\n            self.strsummary = e.saveToStats(self.save_summary)\n        else:\n            logger.info(\n                \"There seem to be no true positives or false positives at all in the submitted data.\"\n            )\n\n    def log(self):\n        print(self.strsummary)\n\n    def get_results(self):\n        return self.strsummary\n"
  },
  {
    "path": "ppdet/metrics/munkres.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\"\"\"\nThis code is borrow from https://github.com/xingyizhou/CenterTrack/blob/master/src/tools/eval_kitti_track/munkres.py\n\"\"\"\n\nimport sys\n\n__all__ = ['Munkres', 'make_cost_matrix']\n\n\nclass Munkres:\n    \"\"\"\n    Calculate the Munkres solution to the classical assignment problem.\n    See the module documentation for usage.\n    \"\"\"\n\n    def __init__(self):\n        \"\"\"Create a new instance\"\"\"\n        self.C = None\n        self.row_covered = []\n        self.col_covered = []\n        self.n = 0\n        self.Z0_r = 0\n        self.Z0_c = 0\n        self.marked = None\n        self.path = None\n\n    def make_cost_matrix(profit_matrix, inversion_function):\n        \"\"\"\n        **DEPRECATED**\n\n        Please use the module function ``make_cost_matrix()``.\n        \"\"\"\n        import munkres\n        return munkres.make_cost_matrix(profit_matrix, inversion_function)\n\n    make_cost_matrix = staticmethod(make_cost_matrix)\n\n    def pad_matrix(self, matrix, pad_value=0):\n        \"\"\"\n        Pad a possibly non-square matrix to make it square.\n\n        :Parameters:\n            matrix : list of lists\n                matrix to pad\n\n            pad_value : int\n                value to use to pad the matrix\n\n        :rtype: list of lists\n        :return: a new, possibly padded, matrix\n        \"\"\"\n        max_columns = 0\n        total_rows = len(matrix)\n\n        for row in matrix:\n            max_columns = max(max_columns, len(row))\n\n        total_rows = max(max_columns, total_rows)\n\n        new_matrix = []\n        for row in matrix:\n            row_len = len(row)\n            new_row = row[:]\n            if total_rows > row_len:\n                # Row too short. Pad it.\n                new_row += [0] * (total_rows - row_len)\n            new_matrix += [new_row]\n\n        while len(new_matrix) < total_rows:\n            new_matrix += [[0] * total_rows]\n\n        return new_matrix\n\n    def compute(self, cost_matrix):\n        \"\"\"\n        Compute the indexes for the lowest-cost pairings between rows and\n        columns in the database. Returns a list of (row, column) tuples\n        that can be used to traverse the matrix.\n\n        :Parameters:\n            cost_matrix : list of lists\n                The cost matrix. If this cost matrix is not square, it\n                will be padded with zeros, via a call to ``pad_matrix()``.\n                (This method does *not* modify the caller's matrix. It\n                operates on a copy of the matrix.)\n\n                **WARNING**: This code handles square and rectangular\n                matrices. 
It does *not* handle irregular matrices.\n\n        :rtype: list\n        :return: A list of ``(row, column)`` tuples that describe the lowest\n                 cost path through the matrix\n\n        \"\"\"\n        self.C = self.pad_matrix(cost_matrix)\n        self.n = len(self.C)\n        self.original_length = len(cost_matrix)\n        self.original_width = len(cost_matrix[0])\n        self.row_covered = [False for i in range(self.n)]\n        self.col_covered = [False for i in range(self.n)]\n        self.Z0_r = 0\n        self.Z0_c = 0\n        self.path = self.__make_matrix(self.n * 2, 0)\n        self.marked = self.__make_matrix(self.n, 0)\n\n        done = False\n        step = 1\n\n        steps = {\n            1: self.__step1,\n            2: self.__step2,\n            3: self.__step3,\n            4: self.__step4,\n            5: self.__step5,\n            6: self.__step6\n        }\n\n        while not done:\n            try:\n                func = steps[step]\n                step = func()\n            except KeyError:\n                done = True\n\n        # Look for the starred columns\n        results = []\n        for i in range(self.original_length):\n            for j in range(self.original_width):\n                if self.marked[i][j] == 1:\n                    results += [(i, j)]\n\n        return results\n\n    def __copy_matrix(self, matrix):\n        \"\"\"Return an exact copy of the supplied matrix\"\"\"\n        return copy.deepcopy(matrix)\n\n    def __make_matrix(self, n, val):\n        \"\"\"Create an *n*x*n* matrix, populating it with the specific value.\"\"\"\n        matrix = []\n        for i in range(n):\n            matrix += [[val for j in range(n)]]\n        return matrix\n\n    def __step1(self):\n        \"\"\"\n        For each row of the matrix, find the smallest element and\n        subtract it from every element in its row. Go to Step 2.\n        \"\"\"\n        C = self.C\n        n = self.n\n        for i in range(n):\n            minval = min(self.C[i])\n            # Find the minimum value for this row and subtract that minimum\n            # from every element in the row.\n            for j in range(n):\n                self.C[i][j] -= minval\n\n        return 2\n\n    def __step2(self):\n        \"\"\"\n        Find a zero (Z) in the resulting matrix. If there is no starred\n        zero in its row or column, star Z. Repeat for each element in the\n        matrix. Go to Step 3.\n        \"\"\"\n        n = self.n\n        for i in range(n):\n            for j in range(n):\n                if (self.C[i][j] == 0) and \\\n                   (not self.col_covered[j]) and \\\n                   (not self.row_covered[i]):\n                    self.marked[i][j] = 1\n                    self.col_covered[j] = True\n                    self.row_covered[i] = True\n\n        self.__clear_covers()\n        return 3\n\n    def __step3(self):\n        \"\"\"\n        Cover each column containing a starred zero. If K columns are\n        covered, the starred zeros describe a complete set of unique\n        assignments. 
In this case, Go to DONE, otherwise, Go to Step 4.\n        \"\"\"\n        n = self.n\n        count = 0\n        for i in range(n):\n            for j in range(n):\n                if self.marked[i][j] == 1:\n                    self.col_covered[j] = True\n                    count += 1\n\n        if count >= n:\n            step = 7  # done\n        else:\n            step = 4\n\n        return step\n\n    def __step4(self):\n        \"\"\"\n        Find a noncovered zero and prime it. If there is no starred zero\n        in the row containing this primed zero, Go to Step 5. Otherwise,\n        cover this row and uncover the column containing the starred\n        zero. Continue in this manner until there are no uncovered zeros\n        left. Save the smallest uncovered value and Go to Step 6.\n        \"\"\"\n        step = 0\n        done = False\n        row = -1\n        col = -1\n        star_col = -1\n        while not done:\n            (row, col) = self.__find_a_zero()\n            if row < 0:\n                done = True\n                step = 6\n            else:\n                self.marked[row][col] = 2\n                star_col = self.__find_star_in_row(row)\n                if star_col >= 0:\n                    col = star_col\n                    self.row_covered[row] = True\n                    self.col_covered[col] = False\n                else:\n                    done = True\n                    self.Z0_r = row\n                    self.Z0_c = col\n                    step = 5\n\n        return step\n\n    def __step5(self):\n        \"\"\"\n        Construct a series of alternating primed and starred zeros as\n        follows. Let Z0 represent the uncovered primed zero found in Step 4.\n        Let Z1 denote the starred zero in the column of Z0 (if any).\n        Let Z2 denote the primed zero in the row of Z1 (there will always\n        be one). Continue until the series terminates at a primed zero\n        that has no starred zero in its column. Unstar each starred zero\n        of the series, star each primed zero of the series, erase all\n        primes and uncover every line in the matrix. 
Return to Step 3\n        \"\"\"\n        count = 0\n        path = self.path\n        path[count][0] = self.Z0_r\n        path[count][1] = self.Z0_c\n        done = False\n        while not done:\n            row = self.__find_star_in_col(path[count][1])\n            if row >= 0:\n                count += 1\n                path[count][0] = row\n                path[count][1] = path[count - 1][1]\n            else:\n                done = True\n\n            if not done:\n                col = self.__find_prime_in_row(path[count][0])\n                count += 1\n                path[count][0] = path[count - 1][0]\n                path[count][1] = col\n\n        self.__convert_path(path, count)\n        self.__clear_covers()\n        self.__erase_primes()\n        return 3\n\n    def __step6(self):\n        \"\"\"\n        Add the value found in Step 4 to every element of each covered\n        row, and subtract it from every element of each uncovered column.\n        Return to Step 4 without altering any stars, primes, or covered\n        lines.\n        \"\"\"\n        minval = self.__find_smallest()\n        for i in range(self.n):\n            for j in range(self.n):\n                if self.row_covered[i]:\n                    self.C[i][j] += minval\n                if not self.col_covered[j]:\n                    self.C[i][j] -= minval\n        return 4\n\n    def __find_smallest(self):\n        \"\"\"Find the smallest uncovered value in the matrix.\"\"\"\n        minval = 2e9  # effectively \"infinity\" (sys.maxint in the Python 2 original)\n        for i in range(self.n):\n            for j in range(self.n):\n                if (not self.row_covered[i]) and (not self.col_covered[j]):\n                    if minval > self.C[i][j]:\n                        minval = self.C[i][j]\n        return minval\n\n    def __find_a_zero(self):\n        \"\"\"Find the first uncovered element with value 0\"\"\"\n        row = -1\n        col = -1\n        i = 0\n        n = self.n\n        done = False\n\n        while not done:\n            j = 0\n            while True:\n                if (self.C[i][j] == 0) and \\\n                   (not self.row_covered[i]) and \\\n                   (not self.col_covered[j]):\n                    row = i\n                    col = j\n                    done = True\n                j += 1\n                if j >= n:\n                    break\n            i += 1\n            if i >= n:\n                done = True\n\n        return (row, col)\n\n    def __find_star_in_row(self, row):\n        \"\"\"\n        Find the first starred element in the specified row. Returns\n        the column index, or -1 if no starred element was found.\n        \"\"\"\n        col = -1\n        for j in range(self.n):\n            if self.marked[row][j] == 1:\n                col = j\n                break\n\n        return col\n\n    def __find_star_in_col(self, col):\n        \"\"\"\n        Find the first starred element in the specified column. Returns\n        the row index, or -1 if no starred element was found.\n        \"\"\"\n        row = -1\n        for i in range(self.n):\n            if self.marked[i][col] == 1:\n                row = i\n                break\n\n        return row\n\n    def __find_prime_in_row(self, row):\n        \"\"\"\n        Find the first prime element in the specified row. 
Returns\n        the column index, or -1 if no prime element was found.\n        \"\"\"\n        col = -1\n        for j in range(self.n):\n            if self.marked[row][j] == 2:\n                col = j\n                break\n\n        return col\n\n    def __convert_path(self, path, count):\n        for i in range(count + 1):\n            if self.marked[path[i][0]][path[i][1]] == 1:\n                self.marked[path[i][0]][path[i][1]] = 0\n            else:\n                self.marked[path[i][0]][path[i][1]] = 1\n\n    def __clear_covers(self):\n        \"\"\"Clear all covered matrix cells\"\"\"\n        for i in range(self.n):\n            self.row_covered[i] = False\n            self.col_covered[i] = False\n\n    def __erase_primes(self):\n        \"\"\"Erase all prime markings\"\"\"\n        for i in range(self.n):\n            for j in range(self.n):\n                if self.marked[i][j] == 2:\n                    self.marked[i][j] = 0\n\n\ndef make_cost_matrix(profit_matrix, inversion_function):\n    \"\"\"\n    Create a cost matrix from a profit matrix by calling\n    'inversion_function' to invert each value. The inversion\n    function must take one numeric argument (of any type) and return\n    another numeric argument which is presumed to be the cost inverse\n    of the original profit.\n\n    This is a static method. Call it like this:\n\n    .. python::\n\n        cost_matrix = Munkres.make_cost_matrix(matrix, inversion_func)\n\n    For example:\n\n    .. python::\n\n        cost_matrix = Munkres.make_cost_matrix(matrix, lambda x: sys.maxsize - x)\n\n    :Parameters:\n        profit_matrix : list of lists\n            The matrix to convert from a profit to a cost matrix\n\n        inversion_function : function\n            The function to use to invert each entry in the profit matrix\n\n    :rtype: list of lists\n    :return: The converted matrix\n    \"\"\"\n    cost_matrix = []\n    for row in profit_matrix:\n        cost_matrix.append([inversion_function(value) for value in row])\n    return cost_matrix\n"
  },
  {
    "path": "ppdet/metrics/pose3d_metrics.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport paddle\nfrom paddle.distributed import ParallelEnv\nimport os\nimport json\nfrom collections import defaultdict, OrderedDict\nimport numpy as np\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n__all__ = ['Pose3DEval']\n\n\nclass AverageMeter(object):\n    def __init__(self):\n        self.reset()\n\n    def reset(self):\n        self.val = 0\n        self.avg = 0\n        self.sum = 0\n        self.count = 0\n\n    def update(self, val, n=1):\n        self.val = val\n        self.sum += val * n\n        self.count += n\n        self.avg = self.sum / self.count\n\n\ndef mean_per_joint_position_error(pred, gt, has_3d_joints):\n    \"\"\" \n    Compute mPJPE\n    \"\"\"\n    gt = gt[has_3d_joints == 1]\n    gt = gt[:, :, :3]\n    pred = pred[has_3d_joints == 1]\n\n    with paddle.no_grad():\n        gt_pelvis = (gt[:, 2, :] + gt[:, 3, :]) / 2\n        gt = gt - gt_pelvis[:, None, :]\n        pred_pelvis = (pred[:, 2, :] + pred[:, 3, :]) / 2\n        pred = pred - pred_pelvis[:, None, :]\n        error = paddle.sqrt(((pred - gt)**2).sum(axis=-1)).mean(axis=-1).numpy()\n        return error\n\n\ndef compute_similarity_transform(S1, S2):\n    \"\"\"Computes a similarity transform (sR, t) that takes\n    a set of 3D points S1 (3 x N) closest to a set of 3D points S2,\n    where R is an 3x3 rotation matrix, t 3x1 translation, s scale.\n    i.e. solves the orthogonal Procrutes problem.\n    \"\"\"\n    transposed = False\n    if S1.shape[0] != 3 and S1.shape[0] != 2:\n        S1 = S1.T\n        S2 = S2.T\n        transposed = True\n    assert (S2.shape[1] == S1.shape[1])\n\n    # 1. Remove mean.\n    mu1 = S1.mean(axis=1, keepdims=True)\n    mu2 = S2.mean(axis=1, keepdims=True)\n    X1 = S1 - mu1\n    X2 = S2 - mu2\n\n    # 2. Compute variance of X1 used for scale.\n    var1 = np.sum(X1**2)\n\n    # 3. The outer product of X1 and X2.\n    K = X1.dot(X2.T)\n\n    # 4. Solution that Maximizes trace(R'K) is R=U*V', where U, V are\n    # singular vectors of K.\n    U, s, Vh = np.linalg.svd(K)\n    V = Vh.T\n    # Construct Z that fixes the orientation of R to get det(R)=1.\n    Z = np.eye(U.shape[0])\n    Z[-1, -1] *= np.sign(np.linalg.det(U.dot(V.T)))\n    # Construct R.\n    R = V.dot(Z.dot(U.T))\n\n    # 5. Recover scale.\n    scale = np.trace(R.dot(K)) / var1\n\n    # 6. Recover translation.\n    t = mu2 - scale * (R.dot(mu1))\n\n    # 7. 
Apply the recovered transform.\n    S1_hat = scale * R.dot(S1) + t\n\n    if transposed:\n        S1_hat = S1_hat.T\n\n    return S1_hat\n\n\ndef compute_similarity_transform_batch(S1, S2):\n    \"\"\"Batched version of compute_similarity_transform.\"\"\"\n    S1_hat = np.zeros_like(S1)\n    for i in range(S1.shape[0]):\n        S1_hat[i] = compute_similarity_transform(S1[i], S2[i])\n    return S1_hat\n\n\ndef reconstruction_error(S1, S2, reduction='mean'):\n    \"\"\"Do Procrustes alignment and compute reconstruction error.\"\"\"\n    S1_hat = compute_similarity_transform_batch(S1, S2)\n    re = np.sqrt(((S1_hat - S2)**2).sum(axis=-1)).mean(axis=-1)\n    if reduction == 'mean':\n        re = re.mean()\n    elif reduction == 'sum':\n        re = re.sum()\n    return re\n\n\ndef all_gather(data):\n    if paddle.distributed.get_world_size() == 1:\n        return data\n    vlist = []\n    paddle.distributed.all_gather(vlist, data)\n    data = paddle.concat(vlist, 0)\n    return data\n\n\nclass Pose3DEval(object):\n    def __init__(self, output_eval, save_prediction_only=False):\n        super(Pose3DEval, self).__init__()\n        self.output_eval = output_eval\n        self.res_file = os.path.join(output_eval, \"pose3d_results.json\")\n        self.save_prediction_only = save_prediction_only\n        self.reset()\n\n    def reset(self):\n        self.PAmPJPE = AverageMeter()\n        self.mPJPE = AverageMeter()\n        self.eval_results = {}\n\n    def get_human36m_joints(self, input):\n        J24_TO_J14 = paddle.to_tensor(\n            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18])\n        J24_TO_J17 = paddle.to_tensor(\n            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19])\n        return paddle.index_select(input, J24_TO_J14, axis=1)\n\n    def update(self, inputs, outputs):\n        gt_3d_joints = all_gather(inputs['joints_3d'].cuda(ParallelEnv()\n                                                           .local_rank))\n        has_3d_joints = all_gather(inputs['has_3d_joints'].cuda(ParallelEnv()\n                                                                .local_rank))\n        pred_3d_joints = all_gather(outputs['pose3d'])\n        if gt_3d_joints.shape[1] == 24:\n            gt_3d_joints = self.get_human36m_joints(gt_3d_joints)\n        if pred_3d_joints.shape[1] == 24:\n            pred_3d_joints = self.get_human36m_joints(pred_3d_joints)\n        mPJPE_val = mean_per_joint_position_error(pred_3d_joints, gt_3d_joints,\n                                                  has_3d_joints).mean()\n        PAmPJPE_val = reconstruction_error(\n            pred_3d_joints.numpy(),\n            gt_3d_joints[:, :, :3].numpy(),\n            reduction=None).mean()\n        count = int(np.sum(has_3d_joints.numpy()))\n        self.PAmPJPE.update(PAmPJPE_val * 1000., count)\n        self.mPJPE.update(mPJPE_val * 1000., count)\n\n    def accumulate(self):\n        if self.save_prediction_only:\n            logger.info(f'The pose3d result is saved to {self.res_file} '\n                        'and evaluation is skipped.')\n            return\n        self.eval_results['pose3d'] = [-self.mPJPE.avg, -self.PAmPJPE.avg]\n\n    def log(self):\n        if self.save_prediction_only:\n            return\n        stats_names = ['mPJPE', 'PAmPJPE']\n        num_values = len(stats_names)\n        print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |')\n        print('|---' * (num_values + 1) + '|')\n\n        print(' '.join([\n            '| {:.3f}'.format(abs(value))\n            for value in 
self.eval_results['pose3d']\n        ]) + ' |')\n\n    def get_results(self):\n        return self.eval_results\n"
  },
  {
    "path": "ppdet/metrics/widerface_utils.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport cv2\nimport numpy as np\nfrom collections import OrderedDict\n\nimport paddle\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n__all__ = ['face_eval_run', 'lmk2out']\n\n\ndef face_eval_run(model,\n                  image_dir,\n                  gt_file,\n                  pred_dir='output/pred',\n                  eval_mode='widerface',\n                  multi_scale=False):\n    # load ground truth files\n    with open(gt_file, 'r') as f:\n        gt_lines = f.readlines()\n    imid2path = []\n    pos_gt = 0\n    while pos_gt < len(gt_lines):\n        name_gt = gt_lines[pos_gt].strip('\\n\\t').split()[0]\n        imid2path.append(name_gt)\n        pos_gt += 1\n        n_gt = int(gt_lines[pos_gt].strip('\\n\\t').split()[0])\n        pos_gt += 1 + n_gt\n    logger.info('The ground truth file load {} images'.format(len(imid2path)))\n\n    dets_dist = OrderedDict()\n    for iter_id, im_path in enumerate(imid2path):\n        image_path = os.path.join(image_dir, im_path)\n        if eval_mode == 'fddb':\n            image_path += '.jpg'\n        assert os.path.exists(image_path)\n        image = cv2.imread(image_path)\n        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)\n        if multi_scale:\n            shrink, max_shrink = get_shrink(image.shape[0], image.shape[1])\n            det0 = detect_face(model, image, shrink)\n            det1 = flip_test(model, image, shrink)\n            [det2, det3] = multi_scale_test(model, image, max_shrink)\n            det4 = multi_scale_test_pyramid(model, image, max_shrink)\n            det = np.row_stack((det0, det1, det2, det3, det4))\n            dets = bbox_vote(det)\n        else:\n            dets = detect_face(model, image, 1)\n        if eval_mode == 'widerface':\n            save_widerface_bboxes(image_path, dets, pred_dir)\n        else:\n            dets_dist[im_path] = dets\n        if iter_id % 100 == 0:\n            logger.info('Test iter {}'.format(iter_id))\n    if eval_mode == 'fddb':\n        save_fddb_bboxes(dets_dist, pred_dir)\n    logger.info(\"Finish evaluation.\")\n\n\ndef detect_face(model, image, shrink):\n    image_shape = [image.shape[0], image.shape[1]]\n    if shrink != 1:\n        h, w = int(image_shape[0] * shrink), int(image_shape[1] * shrink)\n        image = cv2.resize(image, (w, h))\n        image_shape = [h, w]\n\n    img = face_img_process(image)\n    image_shape = np.asarray([image_shape])\n    scale_factor = np.asarray([[shrink, shrink]])\n    data = {\n        \"image\": paddle.to_tensor(\n            img, dtype='float32'),\n        \"im_shape\": paddle.to_tensor(\n            image_shape, dtype='float32'),\n        \"scale_factor\": paddle.to_tensor(\n            scale_factor, dtype='float32')\n    }\n    
model.eval()\n    detection = model(data)\n    detection = detection['bbox'].numpy()\n    # detection rows: [class, score, xmin, ymin, xmax, ymax];\n    # the returned det rows are [xmin, ymin, xmax, ymax, score]\n    if np.prod(detection.shape) == 1:\n        logger.info(\"No face detected\")\n        return np.array([[0, 0, 0, 0, 0]])\n    det_conf = detection[:, 1]\n    det_xmin = detection[:, 2]\n    det_ymin = detection[:, 3]\n    det_xmax = detection[:, 4]\n    det_ymax = detection[:, 5]\n\n    det = np.column_stack((det_xmin, det_ymin, det_xmax, det_ymax, det_conf))\n    return det\n\n\ndef flip_test(model, image, shrink):\n    img = cv2.flip(image, 1)\n    det_f = detect_face(model, img, shrink)\n    det_t = np.zeros(det_f.shape)\n    img_width = image.shape[1]\n    det_t[:, 0] = img_width - det_f[:, 2]\n    det_t[:, 1] = det_f[:, 1]\n    det_t[:, 2] = img_width - det_f[:, 0]\n    det_t[:, 3] = det_f[:, 3]\n    det_t[:, 4] = det_f[:, 4]\n    return det_t\n\n\ndef multi_scale_test(model, image, max_shrink):\n    # Shrinking the image is only used to detect big faces\n    st = 0.5 if max_shrink >= 0.75 else 0.5 * max_shrink\n    det_s = detect_face(model, image, st)\n    index = np.where(\n        np.maximum(det_s[:, 2] - det_s[:, 0] + 1, det_s[:, 3] - det_s[:, 1] + 1)\n        > 30)[0]\n    det_s = det_s[index, :]\n    # Enlarge the image once\n    bt = min(2, max_shrink) if max_shrink > 1 else (st + max_shrink) / 2\n    det_b = detect_face(model, image, bt)\n\n    # Enlarge the image several times for small faces\n    if max_shrink > 2:\n        bt *= 2\n        while bt < max_shrink:\n            det_b = np.row_stack((det_b, detect_face(model, image, bt)))\n            bt *= 2\n        det_b = np.row_stack((det_b, detect_face(model, image, max_shrink)))\n\n    # Enlarged images are only used to detect small faces.\n    if bt > 1:\n        index = np.where(\n            np.minimum(det_b[:, 2] - det_b[:, 0] + 1,\n                       det_b[:, 3] - det_b[:, 1] + 1) < 100)[0]\n        det_b = det_b[index, :]\n    # Shrunken images are only used to detect big faces.\n    else:\n        index = np.where(\n            np.maximum(det_b[:, 2] - det_b[:, 0] + 1,\n                       det_b[:, 3] - det_b[:, 1] + 1) > 30)[0]\n        det_b = det_b[index, :]\n    return det_s, det_b\n\n\ndef multi_scale_test_pyramid(model, image, max_shrink):\n    # Use image pyramids to detect faces\n    det_b = detect_face(model, image, 0.25)\n    index = np.where(\n        np.maximum(det_b[:, 2] - det_b[:, 0] + 1, det_b[:, 3] - det_b[:, 1] + 1)\n        > 30)[0]\n    det_b = det_b[index, :]\n\n    st = [0.75, 1.25, 1.5, 1.75]\n    for i in range(len(st)):\n        if st[i] <= max_shrink:\n            det_temp = detect_face(model, image, st[i])\n            # Enlarged images are only used to detect small faces.\n            if st[i] > 1:\n                index = np.where(\n                    np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1,\n                               det_temp[:, 3] - det_temp[:, 1] + 1) < 100)[0]\n                det_temp = det_temp[index, :]\n            # Shrunken images are only used to detect big faces.\n            else:\n                index = np.where(\n                    np.maximum(det_temp[:, 2] - det_temp[:, 0] + 1,\n                               det_temp[:, 3] - det_temp[:, 1] + 1) > 30)[0]\n                det_temp = det_temp[index, :]\n            det_b = np.row_stack((det_b, det_temp))\n    return det_b\n\n\ndef to_chw(image):\n    \"\"\"\n    Transpose image from HWC to CHW.\n    Args:\n        image (np.array): an image with HWC layout.\n    \"\"\"\n    # 
HWC to CHW\n    if len(image.shape) == 3:\n        image = np.swapaxes(image, 1, 2)\n        image = np.swapaxes(image, 1, 0)\n    return image\n\n\ndef face_img_process(image,\n                     mean=[104., 117., 123.],\n                     std=[127.502231, 127.502231, 127.502231]):\n    img = np.array(image)\n    img = to_chw(img)\n    img = img.astype('float32')\n    img -= np.array(mean)[:, np.newaxis, np.newaxis].astype('float32')\n    img /= np.array(std)[:, np.newaxis, np.newaxis].astype('float32')\n    img = [img]\n    img = np.array(img)\n    return img\n\n\ndef get_shrink(height, width):\n    \"\"\"\n    Args:\n        height (int): image height.\n        width (int): image width.\n    \"\"\"\n    # avoid out of memory\n    max_shrink_v1 = (0x7fffffff / 577.0 / (height * width))**0.5\n    max_shrink_v2 = ((678 * 1024 * 2.0 * 2.0) / (height * width))**0.5\n\n    def get_round(x, loc):\n        str_x = str(x)\n        if '.' in str_x:\n            str_before, str_after = str_x.split('.')\n            len_after = len(str_after)\n            if len_after >= 3:\n                str_final = str_before + '.' + str_after[0:loc]\n                return float(str_final)\n        return x\n\n    max_shrink = get_round(min(max_shrink_v1, max_shrink_v2), 2) - 0.3\n    if max_shrink >= 1.5 and max_shrink < 2:\n        max_shrink = max_shrink - 0.1\n    elif max_shrink >= 2 and max_shrink < 3:\n        max_shrink = max_shrink - 0.2\n    elif max_shrink >= 3 and max_shrink < 4:\n        max_shrink = max_shrink - 0.3\n    elif max_shrink >= 4 and max_shrink < 5:\n        max_shrink = max_shrink - 0.4\n    elif max_shrink >= 5:\n        max_shrink = max_shrink - 0.5\n    elif max_shrink <= 0.1:\n        max_shrink = 0.1\n\n    shrink = max_shrink if max_shrink < 1 else 1\n    return shrink, max_shrink\n\n\ndef bbox_vote(det):\n    order = det[:, 4].ravel().argsort()[::-1]\n    det = det[order, :]\n    if det.shape[0] == 0:\n        dets = np.array([[10, 10, 20, 20, 0.002]])\n        det = np.empty(shape=[0, 5])\n    while det.shape[0] > 0:\n        # IOU\n        area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1)\n        xx1 = np.maximum(det[0, 0], det[:, 0])\n        yy1 = np.maximum(det[0, 1], det[:, 1])\n        xx2 = np.minimum(det[0, 2], det[:, 2])\n        yy2 = np.minimum(det[0, 3], det[:, 3])\n        w = np.maximum(0.0, xx2 - xx1 + 1)\n        h = np.maximum(0.0, yy2 - yy1 + 1)\n        inter = w * h\n        o = inter / (area[0] + area[:] - inter)\n\n        # nms\n        merge_index = np.where(o >= 0.3)[0]\n        det_accu = det[merge_index, :]\n        det = np.delete(det, merge_index, 0)\n        if merge_index.shape[0] <= 1:\n            if det.shape[0] == 0:\n                try:\n                    dets = np.row_stack((dets, det_accu))\n                except NameError:\n                    dets = det_accu\n            continue\n        det_accu[:, 0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:], (1, 4))\n        max_score = np.max(det_accu[:, 4])\n        det_accu_sum = np.zeros((1, 5))\n        det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4],\n                                      axis=0) / np.sum(det_accu[:, -1:])\n        det_accu_sum[:, 4] = max_score\n        try:\n            dets = np.row_stack((dets, det_accu_sum))\n        except NameError:\n            dets = det_accu_sum\n    dets = dets[0:750, :]\n    keep_index = np.where(dets[:, 4] >= 0.01)[0]\n    dets = dets[keep_index, :]\n    return dets\n\n\ndef save_widerface_bboxes(image_path, 
bboxes_scores, output_dir):\n    image_name = image_path.split('/')[-1]\n    image_class = image_path.split('/')[-2]\n    odir = os.path.join(output_dir, image_class)\n    if not os.path.exists(odir):\n        os.makedirs(odir)\n\n    ofname = os.path.join(odir, '%s.txt' % (image_name[:-4]))\n    f = open(ofname, 'w')\n    f.write('{:s}\\n'.format(image_class + '/' + image_name))\n    f.write('{:d}\\n'.format(bboxes_scores.shape[0]))\n    for box_score in bboxes_scores:\n        xmin, ymin, xmax, ymax, score = box_score\n        f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\\n'.format(xmin, ymin, (\n            xmax - xmin + 1), (ymax - ymin + 1), score))\n    f.close()\n    logger.info(\"The predicted result is saved as {}\".format(ofname))\n\n\ndef save_fddb_bboxes(bboxes_scores,\n                     output_dir,\n                     output_fname='pred_fddb_res.txt'):\n    if not os.path.exists(output_dir):\n        os.makedirs(output_dir)\n    predict_file = os.path.join(output_dir, output_fname)\n    f = open(predict_file, 'w')\n    for image_path, dets in bboxes_scores.items():\n        f.write('{:s}\\n'.format(image_path))\n        f.write('{:d}\\n'.format(dets.shape[0]))\n        for box_score in dets:\n            xmin, ymin, xmax, ymax, score = box_score\n            width, height = xmax - xmin, ymax - ymin\n            f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\\n'\n                    .format(xmin, ymin, width, height, score))\n    f.close()\n    logger.info(\"The predicted result is saved as {}\".format(predict_file))\n    return predict_file\n\n\ndef lmk2out(results, is_bbox_normalized=False):\n    \"\"\"\n    Args:\n        results: a list of dicts; each should include `landmark` and `im_id`,\n                 and, if is_bbox_normalized=True, also `im_shape`.\n        is_bbox_normalized: whether or not the landmarks are normalized.\n    \"\"\"\n    xywh_res = []\n    for t in results:\n        bboxes = t['bbox'][0]\n        lengths = t['bbox'][1][0]\n        im_ids = np.array(t['im_id'][0]).flatten()\n        if bboxes is None or bboxes.shape == (1, 1):\n            continue\n        face_index = t['face_index'][0]\n        prior_box = t['prior_boxes'][0]\n        predict_lmk = t['landmark'][0]\n        prior = np.reshape(prior_box, (-1, 4))\n        predictlmk = np.reshape(predict_lmk, (-1, 10))\n\n        k = 0\n        for a in range(len(lengths)):\n            num = lengths[a]\n            im_id = int(im_ids[a])\n            for i in range(num):\n                score = bboxes[k][1]\n                theindex = face_index[i][0]\n                me_prior = prior[theindex, :]\n                lmk_pred = predictlmk[theindex, :]\n                prior_w = me_prior[2] - me_prior[0]\n                prior_h = me_prior[3] - me_prior[1]\n                prior_w_center = (me_prior[2] + me_prior[0]) / 2\n                prior_h_center = (me_prior[3] + me_prior[1]) / 2\n                lmk_decode = np.zeros((10))\n                for j in [0, 2, 4, 6, 8]:\n                    lmk_decode[j] = lmk_pred[j] * 0.1 * prior_w + prior_w_center\n                for j in [1, 3, 5, 7, 9]:\n                    lmk_decode[j] = lmk_pred[j] * 0.1 * prior_h + prior_h_center\n                im_shape = t['im_shape'][0][a].tolist()\n                image_h, image_w = int(im_shape[0]), int(im_shape[1])\n                if is_bbox_normalized:\n                    lmk_decode = lmk_decode * np.array([\n                        image_w, image_h, image_w, image_h, image_w, image_h,\n                        image_w, image_h, image_w, 
image_h\n                    ])\n                lmk_res = {\n                    'image_id': im_id,\n                    'landmark': lmk_decode,\n                    'score': score,\n                }\n                xywh_res.append(lmk_res)\n                k += 1\n    return xywh_res\n\ndef image_eval(pred, gt, ignore, iou_thresh):\n    \"\"\" single image evaluation\n    pred: Nx5 xyxys\n    gt: Nx4 xywh\n    ignore: (N,) flags for the gt boxes, 0 means the gt box is ignored\n    \"\"\"\n    _pred = pred.copy()\n    _gt = gt.copy()\n    pred_recall = np.zeros(_pred.shape[0])\n    recall_list = np.zeros(_gt.shape[0])\n    proposal_list = np.ones(_pred.shape[0])\n\n    _gt[:, 2] = _gt[:, 2] + _gt[:, 0]\n    _gt[:, 3] = _gt[:, 3] + _gt[:, 1]\n\n    overlaps = bbox_overlaps(_pred[:, :4], _gt)\n\n    for h in range(_pred.shape[0]):\n\n        gt_overlap = overlaps[h]\n        max_overlap, max_idx = gt_overlap.max(), gt_overlap.argmax()\n        if max_overlap >= iou_thresh:\n            if ignore[max_idx] == 0:\n                recall_list[max_idx] = -1\n                proposal_list[h] = -1\n            elif recall_list[max_idx] == 0:\n                recall_list[max_idx] = 1\n\n        r_keep_index = np.where(recall_list == 1)[0]\n        pred_recall[h] = len(r_keep_index)\n    return pred_recall, proposal_list\n\n\ndef bbox_overlaps(boxes1, boxes2):\n    \"\"\"\n    Parameters\n    ----------\n    boxes1: (N, 4) ndarray of float\n    boxes2: (K, 4) ndarray of float\n    Returns\n    -------\n    overlaps: (N, K) ndarray of overlap between boxes1 and boxes2\n    \"\"\"\n    # Calculate the area of each box\n    box_areas1 = (boxes1[:, 2] - boxes1[:, 0] + 1) * (\n        boxes1[:, 3] - boxes1[:, 1] + 1)\n    box_areas2 = (boxes2[:, 2] - boxes2[:, 0] + 1) * (\n        boxes2[:, 3] - boxes2[:, 1] + 1)\n    # Calculate the intersection areas\n    iw = np.minimum(boxes1[:, None, 2], boxes2[None, :, 2]) - np.maximum(\n        boxes1[:, None, 0], boxes2[None, :, 0]) + 1\n    ih = np.minimum(boxes1[:, None, 3], boxes2[None, :, 3]) - np.maximum(\n        boxes1[:, None, 1], boxes2[None, :, 1]) + 1\n    # Ensure that the intersection width and height are non-negative\n    iw = np.maximum(iw, 0)\n    ih = np.maximum(ih, 0)\n    # Calculate the intersection area\n    intersection = iw * ih\n    # Calculate the union area, clamped away from zero for stability\n    union = box_areas1[:, None] + box_areas2[None, :] - intersection\n    union = np.maximum(union, 1e-8)\n    # Calculate the overlaps (intersection over union)\n    overlaps = intersection / union\n    return overlaps\n\n\ndef img_pr_info(thresh_num, pred_info, proposal_list, pred_recall):\n    pr_info = np.zeros((thresh_num, 2)).astype('float')\n    for t in range(thresh_num):\n\n        thresh = 1 - (t+1)/thresh_num\n        r_index = np.where(pred_info[:, 4] >= thresh)[0]\n        if len(r_index) == 0:\n            pr_info[t, 0] = 0\n            pr_info[t, 1] = 0\n        else:\n            r_index = r_index[-1]\n            p_index = np.where(proposal_list[:r_index+1] == 1)[0]\n            pr_info[t, 0] = len(p_index)\n            pr_info[t, 1] = pred_recall[r_index]\n    return pr_info\n\n\ndef dataset_pr_info(thresh_num, pr_curve, count_face):\n    _pr_curve = np.zeros((thresh_num, 2))\n    for i in range(thresh_num):\n        _pr_curve[i, 0] = pr_curve[i, 1] / pr_curve[i, 0]\n        _pr_curve[i, 1] = pr_curve[i, 1] / count_face\n    return _pr_curve\n\n\ndef voc_ap(rec, prec):\n\n    # correct AP calculation\n    # first append sentinel values at the end\n  
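  # recall is padded with sentinels 0 and 1, precision with 0, so the envelope below is well-defined\n  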
  mrec = np.concatenate(([0.], rec, [1.]))\n    mpre = np.concatenate(([0.], prec, [0.]))\n\n    # compute the precision envelope\n    for i in range(mpre.size - 1, 0, -1):\n        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])\n\n    # to calculate area under PR curve, look for points\n    # where X axis (recall) changes value\n    i = np.where(mrec[1:] != mrec[:-1])[0]\n\n    # and sum (\\Delta recall) * prec\n    ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])\n    return ap"
  },
  {
    "path": "ppdet/model_zoo/.gitignore",
    "content": "MODEL_ZOO\n"
  },
  {
    "path": "ppdet/model_zoo/__init__.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom . import model_zoo\nfrom .model_zoo import *\n\n__all__ = model_zoo.__all__\n"
  },
  {
    "path": "ppdet/model_zoo/model_zoo.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport os.path as osp\nimport pkg_resources\n\ntry:\n    from collections.abc import Sequence\nexcept:\n    from collections import Sequence\n\nfrom ppdet.core.workspace import load_config, create\nfrom ppdet.utils.checkpoint import load_weight\nfrom ppdet.utils.download import get_config_path\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n__all__ = [\n    'list_model', 'get_config_file', 'get_weights_url', 'get_model',\n    'MODEL_ZOO_FILENAME'\n]\n\nMODEL_ZOO_FILENAME = 'MODEL_ZOO'\n\n\ndef list_model(filters=[]):\n    model_zoo_file = pkg_resources.resource_filename('ppdet.model_zoo',\n                                                     MODEL_ZOO_FILENAME)\n    with open(model_zoo_file) as f:\n        model_names = f.read().splitlines()\n\n    # filter model_name\n    def filt(name):\n        for f in filters:\n            if name.find(f) < 0:\n                return False\n        return True\n\n    if isinstance(filters, str) or not isinstance(filters, Sequence):\n        filters = [filters]\n    model_names = [name for name in model_names if filt(name)]\n    if len(model_names) == 0 and len(filters) > 0:\n        raise ValueError(\"no model found, please check filters seeting, \"\n                         \"filters can be set as following kinds:\\n\"\n                         \"\\tDataset: coco, voc ...\\n\"\n                         \"\\tArchitecture: yolo, rcnn, ssd ...\\n\"\n                         \"\\tBackbone: resnet, vgg, darknet ...\\n\")\n\n    model_str = \"Available Models:\\n\"\n    for model_name in model_names:\n        model_str += \"\\t{}\\n\".format(model_name)\n    logger.info(model_str)\n\n\n# models and configs save on bcebos under dygraph directory\ndef get_config_file(model_name):\n    return get_config_path(\"ppdet://configs/{}.yml\".format(model_name))\n\n\ndef get_weights_url(model_name):\n    return \"ppdet://models/{}.pdparams\".format(osp.split(model_name)[-1])\n\n\ndef get_model(model_name, pretrained=True):\n    cfg_file = get_config_file(model_name)\n    cfg = load_config(cfg_file)\n    model = create(cfg.architecture)\n\n    if pretrained:\n        load_weight(model, get_weights_url(model_name))\n\n    return model\n"
  },
  {
    "path": "ppdet/model_zoo/tests/__init__.py",
    "content": "#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppdet/model_zoo/tests/test_get_model.py",
    "content": "#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport paddle\nimport ppdet\nimport unittest\n\n# NOTE: weights downloading costs time, we choose\n#       a small model for unittesting\nMODEL_NAME = 'ppyolo/ppyolo_tiny_650e_coco'\n\n\nclass TestGetConfigFile(unittest.TestCase):\n    def test_main(self):\n        try:\n            cfg_file = ppdet.model_zoo.get_config_file(MODEL_NAME)\n            assert os.path.isfile(cfg_file)\n        except:\n            self.assertTrue(False)\n\n\nclass TestGetModel(unittest.TestCase):\n    def test_main(self):\n        try:\n            model = ppdet.model_zoo.get_model(MODEL_NAME)\n            assert isinstance(model, paddle.nn.Layer)\n        except:\n            self.assertTrue(False)\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "ppdet/model_zoo/tests/test_list_model.py",
    "content": "#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport unittest\nimport ppdet\n\n\nclass TestListModel(unittest.TestCase):\n    def setUp(self):\n        self._filter = []\n\n    def test_main(self):\n        try:\n            ppdet.model_zoo.list_model(self._filter)\n            self.assertTrue(True)\n        except:\n            self.assertTrue(False)\n\n\nclass TestListModelYOLO(TestListModel):\n    def setUp(self):\n        self._filter = ['yolo']\n\n\nclass TestListModelRCNN(TestListModel):\n    def setUp(self):\n        self._filter = ['rcnn']\n\n\nclass TestListModelSSD(TestListModel):\n    def setUp(self):\n        self._filter = ['ssd']\n\n\nclass TestListModelMultiFilter(TestListModel):\n    def setUp(self):\n        self._filter = ['yolo', 'darknet']\n\n\nclass TestListModelError(unittest.TestCase):\n    def setUp(self):\n        self._filter = ['xxx']\n\n    def test_main(self):\n        try:\n            ppdet.model_zoo.list_model(self._filter)\n            self.assertTrue(False)\n        except ValueError:\n            self.assertTrue(True)\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "ppdet/modeling/__init__.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport warnings\nwarnings.filterwarnings(\n    action='ignore', category=DeprecationWarning, module='ops')\n\nfrom . import ops\nfrom . import backbones\nfrom . import necks\nfrom . import proposal_generator\nfrom . import heads\nfrom . import losses\nfrom . import architectures\nfrom . import post_process\nfrom . import layers\nfrom . import reid\nfrom . import mot\nfrom . import transformers\nfrom . import assigners\nfrom . import rbox_utils\nfrom . import ssod\n\nfrom .ops import *\nfrom .backbones import *\nfrom .necks import *\nfrom .proposal_generator import *\nfrom .heads import *\nfrom .losses import *\nfrom .architectures import *\nfrom .post_process import *\nfrom .layers import *\nfrom .reid import *\nfrom .mot import *\nfrom .transformers import *\nfrom .assigners import *\nfrom .rbox_utils import *\nfrom .ssod import *\n"
  },
  {
    "path": "ppdet/modeling/architectures/__init__.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom . import meta_arch\nfrom . import faster_rcnn\nfrom . import mask_rcnn\nfrom . import yolo\nfrom . import ppyoloe\nfrom . import cascade_rcnn\nfrom . import ssd\nfrom . import fcos\nfrom . import solov2\nfrom . import ttfnet\nfrom . import s2anet\nfrom . import keypoint_hrhrnet\nfrom . import keypoint_hrnet\nfrom . import keypoint_vitpose\nfrom . import jde\nfrom . import deepsort\nfrom . import fairmot\nfrom . import centernet\nfrom . import gfl\nfrom . import picodet\nfrom . import detr\nfrom . import sparse_rcnn\nfrom . import tood\nfrom . import retinanet\nfrom . import bytetrack\nfrom . import yolox\nfrom . import yolof\nfrom . import pose3d_metro\nfrom . import centertrack\nfrom . import queryinst\nfrom . import detr_ssod\nfrom . import multi_stream_detector\nfrom . import clrnet\n\nfrom .meta_arch import *\nfrom .faster_rcnn import *\nfrom .mask_rcnn import *\nfrom .yolo import *\nfrom .ppyoloe import *\nfrom .cascade_rcnn import *\nfrom .ssd import *\nfrom .fcos import *\nfrom .solov2 import *\nfrom .ttfnet import *\nfrom .s2anet import *\nfrom .keypoint_hrhrnet import *\nfrom .keypoint_hrnet import *\nfrom .keypoint_vitpose import *\nfrom .jde import *\nfrom .deepsort import *\nfrom .fairmot import *\nfrom .centernet import *\nfrom .blazeface import *\nfrom .gfl import *\nfrom .picodet import *\nfrom .detr import *\nfrom .sparse_rcnn import *\nfrom .tood import *\nfrom .retinanet import *\nfrom .bytetrack import *\nfrom .yolox import *\nfrom .yolof import *\nfrom .pose3d_metro import *\nfrom .centertrack import *\nfrom .queryinst import *\nfrom .keypoint_petr import *\nfrom .detr_ssod import *\nfrom .multi_stream_detector import *\nfrom .clrnet import *\n\nfrom . import rtdetrv3\nfrom .rtdetrv3 import *"
  },
  {
    "path": "ppdet/modeling/architectures/blazeface.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\nimport paddle\nimport paddle.nn.functional as F\n\n__all__ = ['BlazeFace']\n\n\n@register\nclass BlazeFace(BaseArch):\n    \"\"\"\n    BlazeFace: Sub-millisecond Neural Face Detection on Mobile GPUs,\n               see https://arxiv.org/abs/1907.05047\n\n    Args:\n        backbone (nn.Layer): backbone instance\n        neck (nn.Layer): neck instance\n        blaze_head (nn.Layer): `blazeHead` instance\n        post_process (object): `BBoxPostProcess` instance\n    \"\"\"\n\n    __category__ = 'architecture'\n    __inject__ = ['post_process']\n\n    def __init__(self, backbone, blaze_head, neck, post_process):\n        super(BlazeFace, self).__init__()\n        self.backbone = backbone\n        self.neck = neck\n        self.blaze_head = blaze_head\n        self.post_process = post_process\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        # backbone\n        backbone = create(cfg['backbone'])\n        # fpn\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = create(cfg['neck'], **kwargs)\n        # head\n        kwargs = {'input_shape': neck.out_shape}\n        blaze_head = create(cfg['blaze_head'], **kwargs)\n\n        return {\n            'backbone': backbone,\n            'neck': neck,\n            'blaze_head': blaze_head,\n        }\n\n    def _forward(self):\n        # Backbone\n        body_feats = self.backbone(self.inputs)\n        # neck\n        neck_feats = self.neck(body_feats)\n        # blaze Head\n        if self.training:\n            return self.blaze_head(neck_feats, self.inputs['image'],\n                                   self.inputs['gt_bbox'],\n                                   self.inputs['gt_class'])\n        else:\n            preds, anchors = self.blaze_head(neck_feats, self.inputs['image'])\n            bbox, bbox_num, nms_keep_idx = self.post_process(\n                preds, anchors, self.inputs['im_shape'],\n                self.inputs['scale_factor'])\n            if self.use_extra_data:\n                extra_data = {}  # record the bbox output before nms, such like scores and nms_keep_idx\n                \"\"\"extra_data:{\n                            'scores': predict scores,\n                            'nms_keep_idx': bbox index before nms,\n                           }\n                           \"\"\"\n                preds_logits = preds[1]  # [[1xNumBBoxNumClass]]\n                extra_data['scores'] = F.softmax(paddle.concat(\n                    preds_logits, axis=1)).transpose([0, 2, 1])\n                extra_data['logits'] = paddle.concat(\n                    preds_logits, axis=1).transpose([0, 2, 1])\n                extra_data['nms_keep_idx'] = nms_keep_idx  # bbox 
index before nms\n                return bbox, bbox_num, extra_data\n            else:\n                return bbox, bbox_num\n\n    def get_loss(self, ):\n        return {\"loss\": self._forward()}\n\n    def get_pred(self):\n        if self.use_extra_data:\n            bbox_pred, bbox_num, extra_data = self._forward()\n            output = {\n                \"bbox\": bbox_pred,\n                \"bbox_num\": bbox_num,\n                \"extra_data\": extra_data\n            }\n        else:\n            bbox_pred, bbox_num = self._forward()\n            output = {\n                \"bbox\": bbox_pred,\n                \"bbox_num\": bbox_num,\n            }\n\n        return output\n"
  },
  {
    "path": "ppdet/modeling/architectures/bytetrack.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n# \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\n\n__all__ = ['ByteTrack']\n\n\n@register\nclass ByteTrack(BaseArch):\n    \"\"\"\n    ByteTrack network, see https://arxiv.org/abs/2110.06864\n\n    Args:\n        detector (object): detector model instance\n        reid (object): reid model instance, default None\n        tracker (object): tracker instance\n    \"\"\"\n    __category__ = 'architecture'\n\n    def __init__(self,\n                 detector='YOLOX',\n                 reid=None,\n                 tracker='JDETracker'):\n        super(ByteTrack, self).__init__()\n        self.detector = detector\n        self.reid = reid\n        self.tracker = tracker\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        detector = create(cfg['detector'])\n\n        if cfg['reid'] != 'None':\n            reid = create(cfg['reid'])\n        else:\n            reid = None\n\n        tracker = create(cfg['tracker'])\n\n        return {\n            \"detector\": detector,\n            \"reid\": reid,\n            \"tracker\": tracker,\n        }\n\n    def _forward(self):\n        det_outs = self.detector(self.inputs)\n\n        if self.training:\n            return det_outs\n        else:\n            if self.reid is not None:\n                assert 'crops' in self.inputs\n                crops = self.inputs['crops']\n                pred_embs = self.reid(crops)\n            else:\n                pred_embs = None\n            det_outs['embeddings'] = pred_embs\n            return det_outs\n\n    def get_loss(self):\n        return self._forward()\n\n    def get_pred(self):\n        return self._forward()\n\n"
  },
  {
    "path": "ppdet/modeling/architectures/cascade_rcnn.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\n\n__all__ = ['CascadeRCNN']\n\n\n@register\nclass CascadeRCNN(BaseArch):\n    \"\"\"\n    Cascade R-CNN network, see https://arxiv.org/abs/1712.00726\n\n    Args:\n        backbone (object): backbone instance\n        rpn_head (object): `RPNHead` instance\n        bbox_head (object): `BBoxHead` instance\n        bbox_post_process (object): `BBoxPostProcess` instance\n        neck (object): 'FPN' instance\n        mask_head (object): `MaskHead` instance\n        mask_post_process (object): `MaskPostProcess` instance\n    \"\"\"\n    __category__ = 'architecture'\n    __inject__ = [\n        'bbox_post_process',\n        'mask_post_process',\n    ]\n\n    def __init__(self,\n                 backbone,\n                 rpn_head,\n                 bbox_head,\n                 bbox_post_process,\n                 neck=None,\n                 mask_head=None,\n                 mask_post_process=None):\n        super(CascadeRCNN, self).__init__()\n        self.backbone = backbone\n        self.rpn_head = rpn_head\n        self.bbox_head = bbox_head\n        self.bbox_post_process = bbox_post_process\n        self.neck = neck\n        self.mask_head = mask_head\n        self.mask_post_process = mask_post_process\n        self.with_mask = mask_head is not None\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        backbone = create(cfg['backbone'])\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = cfg['neck'] and create(cfg['neck'], **kwargs)\n\n        out_shape = neck and neck.out_shape or backbone.out_shape\n        kwargs = {'input_shape': out_shape}\n        rpn_head = create(cfg['rpn_head'], **kwargs)\n        bbox_head = create(cfg['bbox_head'], **kwargs)\n\n        out_shape = neck and out_shape or bbox_head.get_head().out_shape\n        kwargs = {'input_shape': out_shape}\n        mask_head = cfg['mask_head'] and create(cfg['mask_head'], **kwargs)\n        return {\n            'backbone': backbone,\n            'neck': neck,\n            \"rpn_head\": rpn_head,\n            \"bbox_head\": bbox_head,\n            \"mask_head\": mask_head,\n        }\n\n    def _forward(self):\n        body_feats = self.backbone(self.inputs)\n        if self.neck is not None:\n            body_feats = self.neck(body_feats)\n\n        if self.training:\n            rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs)\n            bbox_loss, bbox_feat = self.bbox_head(body_feats, rois, rois_num,\n                                                  self.inputs)\n            rois, rois_num = self.bbox_head.get_assigned_rois()\n            bbox_targets = 
self.bbox_head.get_assigned_targets()\n            if self.with_mask:\n                mask_loss = self.mask_head(body_feats, rois, rois_num,\n                                           self.inputs, bbox_targets, bbox_feat)\n                return rpn_loss, bbox_loss, mask_loss\n            else:\n                return rpn_loss, bbox_loss, {}\n        else:\n            rois, rois_num, _ = self.rpn_head(body_feats, self.inputs)\n            preds, _ = self.bbox_head(body_feats, rois, rois_num, self.inputs)\n            refined_rois = self.bbox_head.get_refined_rois()\n\n            im_shape = self.inputs['im_shape']\n            scale_factor = self.inputs['scale_factor']\n\n            bbox, bbox_num, nms_keep_idx = self.bbox_post_process(\n                preds, (refined_rois, rois_num), im_shape, scale_factor)\n            # rescale the prediction back to origin image\n            bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred(\n                bbox, bbox_num, im_shape, scale_factor)\n            if not self.with_mask:\n                return bbox_pred, bbox_num, None\n            mask_out = self.mask_head(body_feats, bbox, bbox_num, self.inputs)\n            origin_shape = self.bbox_post_process.get_origin_shape()\n            mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num,\n                                               origin_shape)\n            return bbox_pred, bbox_num, mask_pred\n\n    def get_loss(self, ):\n        rpn_loss, bbox_loss, mask_loss = self._forward()\n        loss = {}\n        loss.update(rpn_loss)\n        loss.update(bbox_loss)\n        if self.with_mask:\n            loss.update(mask_loss)\n        total_loss = paddle.add_n(list(loss.values()))\n        loss.update({'loss': total_loss})\n        return loss\n\n    def get_pred(self):\n        bbox_pred, bbox_num, mask_pred = self._forward()\n        output = {\n            'bbox': bbox_pred,\n            'bbox_num': bbox_num,\n        }\n        if self.with_mask:\n            output.update({'mask': mask_pred})\n        return output\n"
  },
  {
    "path": "ppdet/modeling/architectures/centernet.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\n\n__all__ = ['CenterNet']\n\n\n@register\nclass CenterNet(BaseArch):\n    \"\"\"\n    CenterNet network, see http://arxiv.org/abs/1904.07850\n\n    Args:\n        backbone (object): backbone instance\n        neck (object): FPN instance, default use 'CenterNetDLAFPN'\n        head (object): 'CenterNetHead' instance\n        post_process (object): 'CenterNetPostProcess' instance\n        for_mot (bool): whether return other features used in tracking model\n\n    \"\"\"\n    __category__ = 'architecture'\n    __inject__ = ['post_process']\n    __shared__ = ['for_mot']\n\n    def __init__(self,\n                 backbone,\n                 neck='CenterNetDLAFPN',\n                 head='CenterNetHead',\n                 post_process='CenterNetPostProcess',\n                 for_mot=False):\n        super(CenterNet, self).__init__()\n        self.backbone = backbone\n        self.neck = neck\n        self.head = head\n        self.post_process = post_process\n        self.for_mot = for_mot\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        backbone = create(cfg['backbone'])\n\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = cfg['neck'] and create(cfg['neck'], **kwargs)\n\n        out_shape = neck and neck.out_shape or backbone.out_shape\n        kwargs = {'input_shape': out_shape}\n        head = create(cfg['head'], **kwargs)\n\n        return {'backbone': backbone, 'neck': neck, \"head\": head}\n\n    def _forward(self):\n        neck_feat = self.backbone(self.inputs)\n        if self.neck is not None:\n            neck_feat = self.neck(neck_feat)\n        head_out = self.head(neck_feat, self.inputs)\n        if self.for_mot:\n            head_out.update({'neck_feat': neck_feat})\n        elif self.training:\n            head_out['loss'] = head_out.pop('det_loss')\n        return head_out\n\n    def get_pred(self):\n        head_out = self._forward()\n        bbox, bbox_num, bbox_inds, topk_clses, topk_ys, topk_xs = self.post_process(\n            head_out['heatmap'],\n            head_out['size'],\n            head_out['offset'],\n            im_shape=self.inputs['im_shape'],\n            scale_factor=self.inputs['scale_factor'])\n\n        if self.for_mot:\n            output = {\n                \"bbox\": bbox,\n                \"bbox_num\": bbox_num,\n                \"bbox_inds\": bbox_inds,\n                \"topk_clses\": topk_clses,\n                \"topk_ys\": topk_ys,\n                \"topk_xs\": topk_xs,\n                \"neck_feat\": head_out['neck_feat']\n            }\n        else:\n            output = {\"bbox\": bbox, \"bbox_num\": bbox_num}\n        return output\n\n  
  def get_loss(self):\n        return self._forward()\n"
  },
  {
    "path": "ppdet/modeling/architectures/centertrack.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport copy\nimport math\nimport numpy as np\nimport paddle\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\n\nfrom ..keypoint_utils import affine_transform\nfrom ppdet.data.transform.op_helper import gaussian_radius, gaussian2D, draw_umich_gaussian\n\n__all__ = ['CenterTrack']\n\n\n@register\nclass CenterTrack(BaseArch):\n    \"\"\"\n    CenterTrack network, see http://arxiv.org/abs/2004.01177\n\n    Args:\n        detector (object): 'CenterNet' instance\n        plugin_head (object): 'CenterTrackHead' instance\n        tracker (object): 'CenterTracker' instance\n    \"\"\"\n    __category__ = 'architecture'\n    __shared__ = ['mot_metric']\n\n    def __init__(self,\n                 detector='CenterNet',\n                 plugin_head='CenterTrackHead',\n                 tracker='CenterTracker',\n                 mot_metric=False):\n        super(CenterTrack, self).__init__()\n        self.detector = detector\n        self.plugin_head = plugin_head\n        self.tracker = tracker\n        self.mot_metric = mot_metric\n        self.pre_image = None\n        self.deploy = False\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        detector = create(cfg['detector'])\n        detector_out_shape = detector.neck and detector.neck.out_shape or detector.backbone.out_shape\n\n        kwargs = {'input_shape': detector_out_shape}\n        plugin_head = create(cfg['plugin_head'], **kwargs)\n        tracker = create(cfg['tracker'])\n\n        return {\n            'detector': detector,\n            'plugin_head': plugin_head,\n            'tracker': tracker,\n        }\n\n    def _forward(self):\n        if self.training:\n            det_outs = self.detector(self.inputs)\n            neck_feat = det_outs['neck_feat']\n\n            losses = {}\n            for k, v in det_outs.items():\n                if 'loss' not in k: continue\n                losses.update({k: v})\n\n            plugin_outs = self.plugin_head(neck_feat, self.inputs)\n            for k, v in plugin_outs.items():\n                if 'loss' not in k: continue\n                losses.update({k: v})\n\n            losses['loss'] = det_outs['det_loss'] + plugin_outs['plugin_loss']\n            return losses\n\n        else:\n            if not self.mot_metric:\n                # detection, support bs>=1\n                det_outs = self.detector(self.inputs)\n                return {\n                    'bbox': det_outs['bbox'],\n                    'bbox_num': det_outs['bbox_num']\n                }\n\n            else:\n                # MOT, only support bs=1\n                if not self.deploy:\n                    if self.pre_image is None:\n                        self.pre_image = 
self.inputs['image']\n                        # initializing tracker for the first frame\n                        self.tracker.init_track([])\n                    self.inputs['pre_image'] = self.pre_image\n                    self.pre_image = self.inputs[\n                        'image']  # Note: update for next image\n\n                    # render input heatmap from tracker status\n                    pre_hm = self.get_additional_inputs(\n                        self.tracker.tracks, self.inputs, with_hm=True)\n                    self.inputs['pre_hm'] = paddle.to_tensor(pre_hm)\n\n                # model inference\n                det_outs = self.detector(self.inputs)\n                neck_feat = det_outs['neck_feat']\n                result = self.plugin_head(\n                    neck_feat, self.inputs, det_outs['bbox'],\n                    det_outs['bbox_inds'], det_outs['topk_clses'],\n                    det_outs['topk_ys'], det_outs['topk_xs'])\n\n                if not self.deploy:\n                    # convert the cropped and 4x downsampled output coordinate system\n                    # back to the input image coordinate system\n                    result = self.plugin_head.centertrack_post_process(\n                        result, self.inputs, self.tracker.out_thresh)\n                return result\n\n    def get_pred(self):\n        return self._forward()\n\n    def get_loss(self):\n        return self._forward()\n\n    def reset_tracking(self):\n        self.tracker.reset()\n        self.pre_image = None\n\n    def get_additional_inputs(self, dets, meta, with_hm=True):\n        # Render input heatmap from previous trackings.\n        trans_input = meta['trans_input'][0].numpy()\n        inp_width, inp_height = int(meta['inp_width'][0]), int(meta[\n            'inp_height'][0])\n        input_hm = np.zeros((1, inp_height, inp_width), dtype=np.float32)\n\n        for det in dets:\n            if det['score'] < self.tracker.pre_thresh:\n                continue\n            bbox = affine_transform_bbox(det['bbox'], trans_input, inp_width,\n                                         inp_height)\n            h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]\n            if (h > 0 and w > 0):\n                radius = gaussian_radius(\n                    (math.ceil(h), math.ceil(w)), min_overlap=0.7)\n                radius = max(0, int(radius))\n                ct = np.array(\n                    [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],\n                    dtype=np.float32)\n                ct_int = ct.astype(np.int32)\n                if with_hm:\n                    input_hm[0] = draw_umich_gaussian(input_hm[0], ct_int,\n                                                      radius)\n        if with_hm:\n            input_hm = input_hm[np.newaxis]\n        return input_hm\n\n\ndef affine_transform_bbox(bbox, trans, width, height):\n    bbox = np.array(copy.deepcopy(bbox), dtype=np.float32)\n    bbox[:2] = affine_transform(bbox[:2], trans)\n    bbox[2:] = affine_transform(bbox[2:], trans)\n    bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, width - 1)\n    bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, height - 1)\n    return bbox\n"
  },
  {
    "path": "ppdet/modeling/architectures/clrnet.py",
    "content": "from .meta_arch import BaseArch\nfrom ppdet.core.workspace import register, create\nfrom paddle import in_dynamic_mode\n\n__all__ = ['CLRNet']\n\n\n@register\nclass CLRNet(BaseArch):\n    __category__ = 'architecture'\n\n    def __init__(self,\n                 backbone=\"CLRResNet\",\n                 neck=\"CLRFPN\",\n                 clr_head=\"CLRHead\",\n                 post_process=None):\n        super(CLRNet, self).__init__()\n        self.backbone = backbone\n        self.neck = neck\n        self.heads = clr_head\n        self.post_process = post_process\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        # backbone\n        backbone = create(cfg['backbone'])\n        # fpn\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = create(cfg['neck'], **kwargs)\n        # head\n        kwargs = {'input_shape': neck.out_shape}\n        clr_head = create(cfg['clr_head'], **kwargs)\n\n        return {\n            'backbone': backbone,\n            'neck': neck,\n            'clr_head': clr_head,\n        }\n\n    def _forward(self):\n        # Backbone\n        body_feats = self.backbone(self.inputs['image'])\n        # neck\n        neck_feats = self.neck(body_feats)\n        # CRL Head\n\n        if self.training:\n            output = self.heads(neck_feats, self.inputs)\n        else:\n            output = self.heads(neck_feats)\n            output = {'lanes': output}\n            # TODO: hard code fix as_lanes=False problem in clrnet_head.py \"get_lanes\" function for static mode\n            if in_dynamic_mode():\n                output = self.heads.get_lanes(output['lanes'])\n                output = {\n                    \"lanes\": output,\n                    \"img_path\": self.inputs['full_img_path'],\n                    \"img_name\": self.inputs['img_name']\n                }\n\n        return output\n\n    def get_loss(self):\n        return self._forward()\n\n    def get_pred(self):\n        return self._forward()\n"
  },
  {
    "path": "ppdet/modeling/architectures/deepsort.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n# \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\nfrom ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box\n\n__all__ = ['DeepSORT']\n\n\n@register\nclass DeepSORT(BaseArch):\n    \"\"\"\n    DeepSORT network, see https://arxiv.org/abs/1703.07402\n\n    Args:\n        detector (object): detector model instance\n        reid (object): reid model instance\n        tracker (object): tracker instance\n    \"\"\"\n    __category__ = 'architecture'\n\n    def __init__(self,\n                 detector='YOLOv3',\n                 reid='PCBPyramid',\n                 tracker='DeepSORTTracker'):\n        super(DeepSORT, self).__init__()\n        self.detector = detector\n        self.reid = reid\n        self.tracker = tracker\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        if cfg['detector'] != 'None':\n            detector = create(cfg['detector'])\n        else:\n            detector = None\n        reid = create(cfg['reid'])\n        tracker = create(cfg['tracker'])\n\n        return {\n            \"detector\": detector,\n            \"reid\": reid,\n            \"tracker\": tracker,\n        }\n\n    def _forward(self):\n        crops = self.inputs['crops']\n        outs = {}\n        outs['embeddings'] = self.reid(crops)\n        return outs\n\n    def get_pred(self):\n        return self._forward()\n"
  },
  {
    "path": "ppdet/modeling/architectures/detr.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nfrom .meta_arch import BaseArch\nfrom ppdet.core.workspace import register, create\n\n__all__ = ['DETR']\n# Deformable DETR, DINO use the same architecture as DETR\n\n\n@register\nclass DETR(BaseArch):\n    __category__ = 'architecture'\n    __inject__ = ['post_process', 'post_process_semi']\n    __shared__ = ['with_mask', 'exclude_post_process']\n\n    def __init__(self,\n                 backbone,\n                 transformer='DETRTransformer',\n                 detr_head='DETRHead',\n                 neck=None,\n                 post_process='DETRPostProcess',\n                 post_process_semi=None,\n                 with_mask=False,\n                 exclude_post_process=False):\n        super(DETR, self).__init__()\n        self.backbone = backbone\n        self.transformer = transformer\n        self.detr_head = detr_head\n        self.neck = neck\n        self.post_process = post_process\n        self.with_mask = with_mask\n        self.exclude_post_process = exclude_post_process\n        self.post_process_semi = post_process_semi\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        # backbone\n        backbone = create(cfg['backbone'])\n        # neck\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = create(cfg['neck'], **kwargs) if cfg['neck'] else None\n\n        # transformer\n        if neck is not None:\n            kwargs = {'input_shape': neck.out_shape}\n        transformer = create(cfg['transformer'], **kwargs)\n        # head\n        kwargs = {\n            'hidden_dim': transformer.hidden_dim,\n            'nhead': transformer.nhead,\n            'input_shape': backbone.out_shape\n        }\n        detr_head = create(cfg['detr_head'], **kwargs)\n\n        return {\n            'backbone': backbone,\n            'transformer': transformer,\n            \"detr_head\": detr_head,\n            \"neck\": neck\n        }\n\n    def _forward(self):\n        # Backbone\n        body_feats = self.backbone(self.inputs)\n\n        # Neck\n        if self.neck is not None:\n            body_feats = self.neck(body_feats)\n\n        # Transformer\n        pad_mask = self.inputs.get('pad_mask', None)\n        out_transformer = self.transformer(body_feats, pad_mask, self.inputs)\n\n        # DETR Head\n        if self.training:\n            detr_losses = self.detr_head(out_transformer, body_feats,\n                                         self.inputs)\n            detr_losses.update({\n                'loss': paddle.add_n(\n                    [v for k, v in detr_losses.items() if 'log' not in k])\n            })\n            return detr_losses\n        else:\n            preds = self.detr_head(out_transformer, body_feats)\n            if self.exclude_post_process:\n             
   bbox, bbox_num, mask = preds\n            else:\n                bbox, bbox_num, mask = self.post_process(\n                    preds, self.inputs['im_shape'], self.inputs['scale_factor'],\n                    self.inputs['image'].shape[2:])\n\n            output = {'bbox': bbox, 'bbox_num': bbox_num}\n            if self.with_mask:\n                output['mask'] = mask\n            return output\n\n    def get_loss(self):\n        return self._forward()\n\n    def get_pred(self):\n        return self._forward()\n"
  },
  {
    "path": "ppdet/modeling/architectures/detr_ssod.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n# \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nfrom ppdet.core.workspace import register, create, merge_config\nimport paddle\n\nimport numpy as np\nimport paddle\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register, create\nfrom ppdet.utils.logger import setup_logger\nfrom ppdet.modeling.ssod.utils import filter_invalid\nfrom .multi_stream_detector import MultiSteamDetector\nlogger = setup_logger(__name__)\n\n__all__ = ['DETR_SSOD']\n__shared__ = ['num_classes']\n\n\n@register\nclass DETR_SSOD(MultiSteamDetector):\n    def __init__(self,\n                 teacher,\n                 student,\n                 train_cfg=None,\n                 test_cfg=None,\n                 RTDETRTransformer=None,\n                 num_classes=80):\n        super(DETR_SSOD, self).__init__(\n            dict(\n                teacher=teacher, student=student),\n            train_cfg=train_cfg,\n            test_cfg=test_cfg, )\n        self.ema_start_iters = train_cfg['ema_start_iters']\n        self.momentum = 0.9996\n        self.cls_thr = None\n        self.cls_thr_ig = None\n        self.num_classes = num_classes\n        if train_cfg is not None:\n            self.freeze(\"teacher\")\n            self.unsup_weight = self.train_cfg['unsup_weight']\n            self.sup_weight = self.train_cfg['sup_weight']\n            self._teacher = None\n            self._student = None\n            self._transformer = None\n\n    @classmethod\n    def from_config(cls, cfg):\n        teacher = create(cfg['teacher'])\n        merge_config(cfg)\n        student = create(cfg['student'])\n        train_cfg = cfg['train_cfg']\n        test_cfg = cfg['test_cfg']\n        RTDETRTransformer = cfg['RTDETRTransformer']\n        return {\n            'teacher': teacher,\n            'student': student,\n            'train_cfg': train_cfg,\n            'test_cfg': test_cfg,\n            'RTDETRTransformer': RTDETRTransformer\n        }\n\n    def forward_train(self, inputs, **kwargs):\n        if isinstance(inputs, dict):\n            iter_id = inputs['iter_id']\n        elif isinstance(inputs, list):\n            iter_id = inputs[-1]\n        if iter_id == self.ema_start_iters:\n            self.update_ema_model(momentum=0)\n        elif iter_id > self.ema_start_iters:\n            self.update_ema_model(momentum=self.momentum)\n        if iter_id > self.ema_start_iters:\n            data_sup_w, data_sup_s, data_unsup_w, data_unsup_s, _ = inputs\n\n            if data_sup_w['image'].shape != data_sup_s['image'].shape:\n                data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w,\n                                                                 data_sup_s)\n\n            if 'gt_bbox' in data_unsup_s.keys():\n                del data_unsup_s['gt_bbox']\n            
            if 'gt_class' in data_unsup_s.keys():\n                del data_unsup_s['gt_class']\n            if 'gt_class' in data_unsup_w.keys():\n                del data_unsup_w['gt_class']\n            if 'gt_bbox' in data_unsup_w.keys():\n                del data_unsup_w['gt_bbox']\n            for k, v in data_sup_s.items():\n                if k in ['epoch_id']:\n                    continue\n                elif k in ['gt_class', 'gt_bbox', 'is_crowd']:\n                    data_sup_s[k].extend(data_sup_w[k])\n                else:\n                    data_sup_s[k] = paddle.concat([v, data_sup_w[k]])\n\n            loss = {}\n            body_feats = self.student.backbone(data_sup_s)\n            if self.student.neck is not None:\n                body_feats = self.student.neck(body_feats)\n            out_transformer = self.student.transformer(body_feats, None,\n                                                       data_sup_s)\n            sup_loss = self.student.detr_head(out_transformer, body_feats,\n                                              data_sup_s)\n            sup_loss.update({\n                'loss': paddle.add_n(\n                    [v for k, v in sup_loss.items() if 'log' not in k])\n            })\n            sup_loss = {\"sup_\" + k: v for k, v in sup_loss.items()}\n\n            loss.update(**sup_loss)\n            unsup_loss = self.forward_unsup_train(data_unsup_w, data_unsup_s)\n            unsup_loss.update({\n                'loss': paddle.add_n(\n                    [v for k, v in unsup_loss.items() if 'log' not in k])\n            })\n            unsup_loss = {\"unsup_\" + k: v for k, v in unsup_loss.items()}\n            loss.update(**unsup_loss)\n            loss.update({'loss': loss['sup_loss'] + loss['unsup_loss']})\n        else:\n            if iter_id == self.ema_start_iters:\n                logger.info(\"start semi_supervised_training\")\n            data_sup_w, data_sup_s, data_unsup_w, data_unsup_s, _ = inputs\n\n            if data_sup_w['image'].shape != data_sup_s['image'].shape:\n                data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w,\n                                                                 data_sup_s)\n            for k, v in data_sup_s.items():\n                if k in ['epoch_id']:\n                    continue\n                elif k in ['gt_class', 'gt_bbox', 'is_crowd']:\n                    data_sup_s[k].extend(data_sup_w[k])\n                else:\n                    data_sup_s[k] = paddle.concat([v, data_sup_w[k]])\n            loss = {}\n            sup_loss = self.student(data_sup_s)\n            # zero placeholders keep the unsupervised loss keys consistent\n            # with the semi-supervised branch for logging\n            unsup_loss = {\"unsup_\" + k: v * 0 for k, v in sup_loss.items()}\n            sup_loss = {\"sup_\" + k: v for k, v in sup_loss.items()}\n            loss.update(**sup_loss)\n            loss.update(**unsup_loss)\n            loss.update({'loss': loss['sup_loss']})\n        return loss\n\n    def forward_unsup_train(self, data_unsup_w, data_unsup_s):\n\n        with paddle.no_grad():\n            body_feats = self.teacher.backbone(data_unsup_w)\n
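            # the frozen EMA teacher only generates pseudo labels; no gradient flows here\n         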
   if self.teacher.neck is not None:\n                body_feats = self.teacher.neck(body_feats, is_teacher=True)\n            out_transformer = self.teacher.transformer(\n                body_feats, None, data_unsup_w, is_teacher=True)\n            preds = self.teacher.detr_head(out_transformer, body_feats)\n            bbox, bbox_num = self.teacher.post_process_semi(preds)\n        self.place = body_feats[0].place\n\n        proposal_bbox_list = bbox[:, -4:]\n        proposal_bbox_list = proposal_bbox_list.split(\n            tuple(np.array(bbox_num)), 0)\n\n        proposal_label_list = paddle.cast(bbox[:, :1], np.float32)\n        proposal_label_list = proposal_label_list.split(\n            tuple(np.array(bbox_num)), 0)\n        proposal_score_list = paddle.cast(bbox[:, 1:self.num_classes + 1],\n                                          np.float32)\n        proposal_score_list = proposal_score_list.split(\n            tuple(np.array(bbox_num)), 0)\n        proposal_bbox_list = [\n            paddle.to_tensor(\n                p, place=self.place) for p in proposal_bbox_list\n        ]\n        proposal_label_list = [\n            paddle.to_tensor(\n                p, place=self.place) for p in proposal_label_list\n        ]\n        # filter invalid box roughly\n        if isinstance(self.train_cfg['pseudo_label_initial_score_thr'], float):\n            thr = self.train_cfg['pseudo_label_initial_score_thr']\n        else:\n            # TODO: use dynamic threshold\n            raise NotImplementedError(\n                \"Dynamic Threshold is not implemented yet.\")\n        proposal_bbox_list, proposal_label_list, proposal_score_list = list(\n            zip(* [\n                filter_invalid(\n                    proposal[:, :4],\n                    proposal_label,\n                    proposal_score,\n                    thr=thr,\n                    min_size=self.train_cfg['min_pseduo_box_size'], )\n                for proposal, proposal_label, proposal_score in\n                zip(proposal_bbox_list, proposal_label_list,\n                    proposal_score_list)\n            ]))\n\n        teacher_bboxes = list(proposal_bbox_list)\n        teacher_labels = proposal_label_list\n        teacher_info = [teacher_bboxes, teacher_labels]\n        student_unsup = data_unsup_s\n        return self.compute_pseudo_label_loss(student_unsup, teacher_info,\n                                              proposal_score_list)\n\n    def compute_pseudo_label_loss(self, student_unsup, teacher_info,\n                                  proposal_score_list):\n\n        pseudo_bboxes = list(teacher_info[0])\n        pseudo_labels = list(teacher_info[1])\n        losses = dict()\n        for i in range(len(pseudo_bboxes)):\n            if pseudo_labels[i].shape[0] == 0:\n                pseudo_bboxes[i] = paddle.zeros([0, 4]).numpy()\n                pseudo_labels[i] = paddle.zeros([0, 1]).numpy()\n            else:\n                pseudo_bboxes[i] = pseudo_bboxes[i][:, :4].numpy()\n                pseudo_labels[i] = pseudo_labels[i].numpy()\n        for i in range(len(pseudo_bboxes)):\n            pseudo_labels[i] = paddle.to_tensor(\n                pseudo_labels[i], dtype=paddle.int32, place=self.place)\n            pseudo_bboxes[i] = paddle.to_tensor(\n                pseudo_bboxes[i], dtype=paddle.float32, place=self.place)\n        student_unsup.update({\n            'gt_bbox': pseudo_bboxes,\n            'gt_class': pseudo_labels\n        })\n        pseudo_sum = 0\n        for i in 
range(len(pseudo_bboxes)):\n            pseudo_sum += pseudo_bboxes[i].sum()\n        if pseudo_sum == 0:  #input fake data when there are no pseudo labels\n            pseudo_bboxes[0] = paddle.ones([1, 4]) - 0.5\n            pseudo_labels[0] = paddle.ones([1, 1]).astype('int32')\n            student_unsup.update({\n                'gt_bbox': pseudo_bboxes,\n                'gt_class': pseudo_labels\n            })\n            body_feats = self.student.backbone(student_unsup)\n            if self.student.neck is not None:\n                body_feats = self.student.neck(body_feats)\n            out_transformer = self.student.transformer(body_feats, None,\n                                                       student_unsup)\n            losses = self.student.detr_head(out_transformer, body_feats,\n                                            student_unsup)\n            for n, v in losses.items():\n                losses[n] = v * 0\n        else:\n            gt_bbox = []\n            gt_class = []\n            images = []\n            proposal_score = []\n            for i in range(len(pseudo_bboxes)):\n                if pseudo_labels[i].shape[0] == 0:\n                    continue\n                else:\n                    proposal_score.append(proposal_score_list[i].max(-1)\n                                          .unsqueeze(-1))\n                    gt_class.append(pseudo_labels[i])\n                    gt_bbox.append(pseudo_bboxes[i])\n                    images.append(student_unsup['image'][i])\n            images = paddle.stack(images)\n            student_unsup.update({\n                'image': images,\n                'gt_bbox': gt_bbox,\n                'gt_class': gt_class\n            })\n            body_feats = self.student.backbone(student_unsup)\n            if self.student.neck is not None:\n                body_feats = self.student.neck(body_feats)\n            out_transformer = self.student.transformer(body_feats, None,\n                                                       student_unsup)\n            student_unsup.update({'gt_score': proposal_score})\n            losses = self.student.detr_head(out_transformer, body_feats,\n                                            student_unsup)\n        return losses\n\n\ndef box_cxcywh_to_xyxy(x):\n    x_c, y_c, w, h = x.unbind(-1)\n    b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)]\n    return paddle.stack(b, axis=-1)\n\n\ndef box_xyxy_to_cxcywh(x):\n    x0, y0, x1, y1 = x.unbind(-1)\n    b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)]\n    return paddle.stack(b, axis=-1)\n\n\ndef get_size_with_aspect_ratio(image_size, size, max_size=None):\n    w, h = image_size\n    if max_size is not None:\n        min_original_size = float(min((w, h)))\n        max_original_size = float(max((w, h)))\n        if max_original_size / min_original_size * size > max_size:\n            size = int(round(max_size * min_original_size / max_original_size))\n\n    if (w <= h and w == size) or (h <= w and h == size):\n        return (w, h)\n\n    if w < h:\n        ow = size\n        oh = int(size * h / w)\n    else:\n        oh = size\n        ow = int(size * w / h)\n\n    return (ow, oh)\n\n\ndef align_weak_strong_shape(data_weak, data_strong):\n    shape_x = data_strong['image'].shape[2]\n    shape_y = data_strong['image'].shape[3]\n\n    target_size = [shape_x, shape_y]\n    data_weak['image'] = F.interpolate(\n        data_weak['image'],\n        size=target_size,\n        mode='bilinear',\n        
align_corners=False)\n    return data_weak, data_strong\n"
  },
  {
    "path": "ppdet/modeling/architectures/fairmot.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\n\n__all__ = ['FairMOT']\n\n\n@register\nclass FairMOT(BaseArch):\n    \"\"\"\n    FairMOT network, see http://arxiv.org/abs/2004.01888\n\n    Args:\n        detector (object): 'CenterNet' instance\n        reid (object): 'FairMOTEmbeddingHead' instance\n        tracker (object): 'JDETracker' instance\n        loss (object): 'FairMOTLoss' instance\n\n    \"\"\"\n\n    __category__ = 'architecture'\n    __inject__ = ['loss']\n\n    def __init__(self,\n                 detector='CenterNet',\n                 reid='FairMOTEmbeddingHead',\n                 tracker='JDETracker',\n                 loss='FairMOTLoss'):\n        super(FairMOT, self).__init__()\n        self.detector = detector\n        self.reid = reid\n        self.tracker = tracker\n        self.loss = loss\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        detector = create(cfg['detector'])\n        detector_out_shape = detector.neck and detector.neck.out_shape or detector.backbone.out_shape\n\n        kwargs = {'input_shape': detector_out_shape}\n        reid = create(cfg['reid'], **kwargs)\n        loss = create(cfg['loss'])\n        tracker = create(cfg['tracker'])\n\n        return {\n            'detector': detector,\n            'reid': reid,\n            'loss': loss,\n            'tracker': tracker\n        }\n\n    def _forward(self):\n        loss = dict()\n        # det_outs keys:\n        # train: neck_feat, det_loss, heatmap_loss, size_loss, offset_loss (optional: iou_loss)\n        # eval/infer: neck_feat, bbox, bbox_inds\n        det_outs = self.detector(self.inputs)\n        neck_feat = det_outs['neck_feat']\n        if self.training:\n            reid_loss = self.reid(neck_feat, self.inputs)\n\n            det_loss = det_outs['det_loss']\n            loss = self.loss(det_loss, reid_loss)\n            for k, v in det_outs.items():\n                if 'loss' not in k:\n                    continue\n                loss.update({k: v})\n            loss.update({'reid_loss': reid_loss})\n            return loss\n        else:\n            pred_dets, pred_embs = self.reid(\n                neck_feat, self.inputs, det_outs['bbox'], det_outs['bbox_inds'],\n                det_outs['topk_clses'])\n            return pred_dets, pred_embs\n\n    def get_pred(self):\n        output = self._forward()\n        return output\n\n    def get_loss(self):\n        loss = self._forward()\n        return loss\n"
  },
  {
    "path": "ppdet/modeling/architectures/faster_rcnn.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\nimport numpy as np\n\n__all__ = ['FasterRCNN']\n\n\n@register\nclass FasterRCNN(BaseArch):\n    \"\"\"\n    Faster R-CNN network, see https://arxiv.org/abs/1506.01497\n\n    Args:\n        backbone (object): backbone instance\n        rpn_head (object): `RPNHead` instance\n        bbox_head (object): `BBoxHead` instance\n        bbox_post_process (object): `BBoxPostProcess` instance\n        neck (object): 'FPN' instance\n    \"\"\"\n    __category__ = 'architecture'\n    __inject__ = ['bbox_post_process']\n\n    def __init__(self,\n                 backbone,\n                 rpn_head,\n                 bbox_head,\n                 bbox_post_process,\n                 neck=None):\n        super(FasterRCNN, self).__init__()\n        self.backbone = backbone\n        self.neck = neck\n        self.rpn_head = rpn_head\n        self.bbox_head = bbox_head\n        self.bbox_post_process = bbox_post_process\n\n    def init_cot_head(self, relationship):\n        self.bbox_head.init_cot_head(relationship)\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        backbone = create(cfg['backbone'])\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = cfg['neck'] and create(cfg['neck'], **kwargs)\n\n        out_shape = neck and neck.out_shape or backbone.out_shape\n        kwargs = {'input_shape': out_shape}\n        rpn_head = create(cfg['rpn_head'], **kwargs)\n        bbox_head = create(cfg['bbox_head'], **kwargs)\n        return {\n            'backbone': backbone,\n            'neck': neck,\n            \"rpn_head\": rpn_head,\n            \"bbox_head\": bbox_head,\n        }\n\n    def _forward(self):\n        body_feats = self.backbone(self.inputs)\n        if self.neck is not None:\n            body_feats = self.neck(body_feats)\n        if self.training:\n            rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs)\n            bbox_loss, _ = self.bbox_head(body_feats, rois, rois_num,\n                                          self.inputs)\n            return rpn_loss, bbox_loss\n        else:\n            rois, rois_num, _ = self.rpn_head(body_feats, self.inputs)\n            preds, _ = self.bbox_head(body_feats, rois, rois_num, None)\n            im_shape = self.inputs['im_shape']\n            scale_factor = self.inputs['scale_factor']\n            bbox, bbox_num, nms_keep_idx = self.bbox_post_process(\n                preds, (rois, rois_num), im_shape, scale_factor)\n\n            # rescale the prediction back to origin image\n            bboxes, bbox_pred, bbox_num = self.bbox_post_process.get_pred(\n                bbox, bbox_num, im_shape, scale_factor)\n\n     
       if self.use_extra_data:\n                extra_data = {\n                }  # record the bbox output before nms, such like scores and nms_keep_idx\n                \"\"\"extra_data:{\n                            'scores': predict scores,\n                            'nms_keep_idx': bbox index before nms,\n                           }\n                \"\"\"\n                extra_data['scores'] = preds[1]  # predict scores (probability)\n                # Todo: get logits output\n                extra_data[\n                    'nms_keep_idx'] = nms_keep_idx  # bbox index before nms\n                return bbox_pred, bbox_num, extra_data\n            else:\n                return bbox_pred, bbox_num\n\n    def get_loss(self, ):\n        rpn_loss, bbox_loss = self._forward()\n        loss = {}\n        loss.update(rpn_loss)\n        loss.update(bbox_loss)\n        total_loss = paddle.add_n(list(loss.values()))\n        loss.update({'loss': total_loss})\n        return loss\n\n    def get_pred(self):\n        if self.use_extra_data:\n            bbox_pred, bbox_num, extra_data = self._forward()\n            output = {\n                'bbox': bbox_pred,\n                'bbox_num': bbox_num,\n                'extra_data': extra_data\n            }\n        else:\n            bbox_pred, bbox_num = self._forward()\n            output = {'bbox': bbox_pred, 'bbox_num': bbox_num}\n        return output\n\n    def target_bbox_forward(self, data):\n        body_feats = self.backbone(data)\n        if self.neck is not None:\n            body_feats = self.neck(body_feats)\n        rois = [roi for roi in data['gt_bbox']]\n        rois_num = paddle.concat([paddle.shape(roi)[0:1] for roi in rois])\n\n        preds, _ = self.bbox_head(body_feats, rois, rois_num, None, cot=True)\n        return preds\n\n    def relationship_learning(self, loader, num_classes_novel):\n        print('computing relationship')\n        train_labels_list = []\n        label_list = []\n\n        for step_id, data in enumerate(loader):\n            _, bbox_prob = self.target_bbox_forward(data)\n            batch_size = data['im_id'].shape[0]\n            for i in range(batch_size):\n                num_bbox = data['gt_class'][i].shape[0]\n                train_labels = data['gt_class'][i]\n                train_labels_list.append(train_labels.numpy().squeeze(1))\n            base_labels = bbox_prob.detach().numpy()[:, :-1]\n            label_list.append(base_labels)\n\n        labels = np.concatenate(train_labels_list, 0)\n        probabilities = np.concatenate(label_list, 0)\n        N_t = np.max(labels) + 1\n        conditional = []\n        for i in range(N_t):\n            this_class = probabilities[labels == i]\n            average = np.mean(this_class, axis=0, keepdims=True)\n            conditional.append(average)\n        return np.concatenate(conditional)\n"
  },
  {
    "path": "ppdet/modeling/architectures/fcos.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\n\n__all__ = ['FCOS', 'ARSL_FCOS']\n\n\n@register\nclass FCOS(BaseArch):\n    \"\"\"\n    FCOS network, see https://arxiv.org/abs/1904.01355\n\n    Args:\n        backbone (object): backbone instance\n        neck (object): 'FPN' instance\n        fcos_head (object): 'FCOSHead' instance\n        ssod_loss (object): 'SSODFCOSLoss' instance, only used for semi-det(ssod) by DenseTeacher\n    \"\"\"\n\n    __category__ = 'architecture'\n    __inject__ = ['ssod_loss']\n\n    def __init__(self,\n                 backbone='ResNet',\n                 neck='FPN',\n                 fcos_head='FCOSHead',\n                 ssod_loss='SSODFCOSLoss'):\n        super(FCOS, self).__init__()\n        self.backbone = backbone\n        self.neck = neck\n        self.fcos_head = fcos_head\n\n        # for ssod, semi-det\n        self.is_teacher = False\n        self.ssod_loss = ssod_loss\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        backbone = create(cfg['backbone'])\n\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = create(cfg['neck'], **kwargs)\n\n        kwargs = {'input_shape': neck.out_shape}\n        fcos_head = create(cfg['fcos_head'], **kwargs)\n\n        return {\n            'backbone': backbone,\n            'neck': neck,\n            \"fcos_head\": fcos_head,\n        }\n\n    def _forward(self):\n        body_feats = self.backbone(self.inputs)\n        fpn_feats = self.neck(body_feats)\n\n        self.is_teacher = self.inputs.get('is_teacher', False)\n        if self.training or self.is_teacher:\n            losses = self.fcos_head(fpn_feats, self.inputs)\n            return losses\n        else:\n            fcos_head_outs = self.fcos_head(fpn_feats)\n            bbox_pred, bbox_num = self.fcos_head.post_process(\n                fcos_head_outs, self.inputs['scale_factor'])\n            return {'bbox': bbox_pred, 'bbox_num': bbox_num}\n\n    def get_loss(self):\n        return self._forward()\n\n    def get_pred(self):\n        return self._forward()\n\n    def get_loss_keys(self):\n        return ['loss_cls', 'loss_box', 'loss_quality']\n\n    def get_ssod_loss(self, student_head_outs, teacher_head_outs, train_cfg):\n        ssod_losses = self.ssod_loss(student_head_outs, teacher_head_outs,\n                                     train_cfg)\n        return ssod_losses\n\n\n@register\nclass ARSL_FCOS(BaseArch):\n    \"\"\"\n    FCOS ARSL network, see https://arxiv.org/abs/\n\n    Args:\n        backbone (object): backbone instance\n        neck (object): 'FPN' instance\n        fcos_head (object): 'FCOSHead_ARSL' instance\n        fcos_cr_loss (object): 'FCOSLossCR' 
instance, only used for semi-det(ssod) by ARSL\n    \"\"\"\n\n    __category__ = 'architecture'\n    __inject__ = ['fcos_cr_loss']\n\n    def __init__(self,\n                 backbone,\n                 neck,\n                 fcos_head='FCOSHead_ARSL',\n                 fcos_cr_loss='FCOSLossCR'):\n        super(ARSL_FCOS, self).__init__()\n        self.backbone = backbone\n        self.neck = neck\n        self.fcos_head = fcos_head\n        self.fcos_cr_loss = fcos_cr_loss\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        backbone = create(cfg['backbone'])\n\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = create(cfg['neck'], **kwargs)\n\n        kwargs = {'input_shape': neck.out_shape}\n        fcos_head = create(cfg['fcos_head'], **kwargs)\n\n        # consistency regularization loss\n        fcos_cr_loss = create(cfg['fcos_cr_loss'])\n\n        return {\n            'backbone': backbone,\n            'neck': neck,\n            'fcos_head': fcos_head,\n            'fcos_cr_loss': fcos_cr_loss,\n        }\n\n    def forward(self, inputs, branch=\"supervised\", teacher_prediction=None):\n        assert branch in ['supervised', 'semi_supervised'], \\\n            'In ARSL, branch must be supervised or semi_supervised.'\n\n        if self.data_format == 'NHWC':\n            image = inputs['image']\n            inputs['image'] = paddle.transpose(image, [0, 2, 3, 1])\n        self.inputs = inputs\n\n        if self.training:\n            if branch == \"supervised\":\n                out = self.get_loss()\n            else:\n                out = self.get_pseudo_loss(teacher_prediction)\n        else:\n            # normal test\n            if branch == \"supervised\":\n                out = self.get_pred()\n            else:\n                # predict pseudo labels\n                out = self.get_pseudo_pred()\n        return out\n\n    # model forward\n    def model_forward(self):\n        body_feats = self.backbone(self.inputs)\n        fpn_feats = self.neck(body_feats)\n        fcos_head_outs = self.fcos_head(fpn_feats)\n        return fcos_head_outs\n\n    # supervised loss for labeled data\n    def get_loss(self):\n        loss = {}\n        tag_labels, tag_bboxes, tag_centerness = [], [], []\n        for i in range(len(self.fcos_head.fpn_stride)):\n            # labels, reg_target, centerness\n            k_lbl = 'labels{}'.format(i)\n            if k_lbl in self.inputs:\n                tag_labels.append(self.inputs[k_lbl])\n            k_box = 'reg_target{}'.format(i)\n            if k_box in self.inputs:\n                tag_bboxes.append(self.inputs[k_box])\n            k_ctn = 'centerness{}'.format(i)\n            if k_ctn in self.inputs:\n                tag_centerness.append(self.inputs[k_ctn])\n        fcos_head_outs = self.model_forward()\n        loss_fcos = self.fcos_head.get_loss(fcos_head_outs, tag_labels,\n                                            tag_bboxes, tag_centerness)\n        loss.update(loss_fcos)\n        return loss\n\n    # unsupervised loss for unlabeled data\n    def get_pseudo_loss(self, teacher_prediction):\n        loss = {}\n        fcos_head_outs = self.model_forward()\n        unsup_loss = self.fcos_cr_loss(fcos_head_outs, teacher_prediction)\n        for k in unsup_loss.keys():\n            loss[k + '_pseudo'] = unsup_loss[k]\n        return loss\n\n    # get detection results for test, decode and rescale the results to original size\n    def get_pred(self):\n        fcos_head_outs = 
self.model_forward()\n        scale_factor = self.inputs['scale_factor']\n        bbox_pred, bbox_num = self.fcos_head.post_process(fcos_head_outs,\n                                                          scale_factor)\n        output = {'bbox': bbox_pred, 'bbox_num': bbox_num}\n        return output\n\n    # generate pseudo labels to guide student\n    def get_pseudo_pred(self):\n        fcos_head_outs = self.model_forward()\n        pred_cls, pred_loc, pred_iou = fcos_head_outs[1:]  # 0 is locations\n        for lvl, _ in enumerate(pred_loc):\n            pred_loc[lvl] = pred_loc[lvl] / self.fcos_head.fpn_stride[lvl]\n\n        return [pred_cls, pred_loc, pred_iou, self.fcos_head.fpn_stride]\n"
  },
  {
    "path": "ppdet/modeling/architectures/gfl.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\n\n__all__ = ['GFL']\n\n\n@register\nclass GFL(BaseArch):\n    \"\"\"\n    Generalized Focal Loss network, see https://arxiv.org/abs/2006.04388\n\n    Args:\n        backbone (object): backbone instance\n        neck (object): 'FPN' instance\n        head (object): 'GFLHead' instance\n    \"\"\"\n\n    __category__ = 'architecture'\n\n    def __init__(self, backbone, neck, head='GFLHead'):\n        super(GFL, self).__init__()\n        self.backbone = backbone\n        self.neck = neck\n        self.head = head\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        backbone = create(cfg['backbone'])\n\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = create(cfg['neck'], **kwargs)\n\n        kwargs = {'input_shape': neck.out_shape}\n        head = create(cfg['head'], **kwargs)\n\n        return {\n            'backbone': backbone,\n            'neck': neck,\n            \"head\": head,\n        }\n\n    def _forward(self):\n        body_feats = self.backbone(self.inputs)\n        fpn_feats = self.neck(body_feats)\n        head_outs = self.head(fpn_feats)\n        if not self.training:\n            im_shape = self.inputs['im_shape']\n            scale_factor = self.inputs['scale_factor']\n            bboxes, bbox_num = self.head.post_process(head_outs, im_shape,\n                                                      scale_factor)\n            return bboxes, bbox_num\n        else:\n            return head_outs\n\n    def get_loss(self, ):\n        loss = {}\n\n        head_outs = self._forward()\n        loss_gfl = self.head.get_loss(head_outs, self.inputs)\n        loss.update(loss_gfl)\n        total_loss = paddle.add_n(list(loss.values()))\n        loss.update({'loss': total_loss})\n        return loss\n\n    def get_pred(self):\n        bbox_pred, bbox_num = self._forward()\n        output = {'bbox': bbox_pred, 'bbox_num': bbox_num}\n        return output\n"
  },
  {
    "path": "ppdet/modeling/architectures/jde.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n# \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\n\n__all__ = ['JDE']\n\n\n@register\nclass JDE(BaseArch):\n    __category__ = 'architecture'\n    __shared__ = ['metric']\n    \"\"\"\n    JDE network, see https://arxiv.org/abs/1909.12605v1\n\n    Args:\n        detector (object): detector model instance\n        reid (object): reid model instance\n        tracker (object): tracker instance\n        metric (str): 'MOTDet' for training and detection evaluation, 'ReID'\n            for ReID embedding evaluation, or 'MOT' for multi object tracking\n            evaluation.\n    \"\"\"\n\n    def __init__(self,\n                 detector='YOLOv3',\n                 reid='JDEEmbeddingHead',\n                 tracker='JDETracker',\n                 metric='MOT'):\n        super(JDE, self).__init__()\n        self.detector = detector\n        self.reid = reid\n        self.tracker = tracker\n        self.metric = metric\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        detector = create(cfg['detector'])\n        kwargs = {'input_shape': detector.neck.out_shape}\n\n        reid = create(cfg['reid'], **kwargs)\n\n        tracker = create(cfg['tracker'])\n\n        return {\n            \"detector\": detector,\n            \"reid\": reid,\n            \"tracker\": tracker,\n        }\n\n    def _forward(self):\n        det_outs = self.detector(self.inputs)\n\n        if self.training:\n            emb_feats = det_outs['emb_feats']\n            loss_confs = det_outs['det_losses']['loss_confs']\n            loss_boxes = det_outs['det_losses']['loss_boxes']\n            jde_losses = self.reid(\n                emb_feats,\n                self.inputs,\n                loss_confs=loss_confs,\n                loss_boxes=loss_boxes)\n            return jde_losses\n        else:\n            if self.metric == 'MOTDet':\n                det_results = {\n                    'bbox': det_outs['bbox'],\n                    'bbox_num': det_outs['bbox_num'],\n                }\n                return det_results\n\n            elif self.metric == 'MOT':\n                emb_feats = det_outs['emb_feats']\n                bboxes = det_outs['bbox']\n                boxes_idx = det_outs['boxes_idx']\n                nms_keep_idx = det_outs['nms_keep_idx']\n\n                pred_dets, pred_embs = self.reid(\n                    emb_feats,\n                    self.inputs,\n                    bboxes=bboxes,\n                    boxes_idx=boxes_idx,\n                    nms_keep_idx=nms_keep_idx)\n                return pred_dets, pred_embs\n\n            else:\n                raise ValueError(\"Unknown metric {} for multi object tracking.\".\n                                 
format(self.metric))\n\n    def get_loss(self):\n        return self._forward()\n\n    def get_pred(self):\n        return self._forward()\n"
  },
  {
    "path": "ppdet/modeling/architectures/keypoint_hrhrnet.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom scipy.optimize import linear_sum_assignment\nfrom collections import abc, defaultdict\nimport numpy as np\nimport paddle\n\nfrom ppdet.core.workspace import register, create, serializable\nfrom .meta_arch import BaseArch\nfrom .. import layers as L\nfrom ..keypoint_utils import transpred\n\n__all__ = ['HigherHRNet']\n\n\n@register\nclass HigherHRNet(BaseArch):\n    __category__ = 'architecture'\n\n    def __init__(self,\n                 backbone='HRNet',\n                 hrhrnet_head='HrHRNetHead',\n                 post_process='HrHRNetPostProcess',\n                 eval_flip=True,\n                 flip_perm=None,\n                 max_num_people=30):\n        \"\"\"\n        HigherHRNet network, see https://arxiv.org/abs/1908.10357；\n        HigherHRNet+swahr, see https://arxiv.org/abs/2012.15175\n\n        Args:\n            backbone (nn.Layer): backbone instance\n            hrhrnet_head (nn.Layer): keypoint_head instance\n            bbox_post_process (object): `BBoxPostProcess` instance\n        \"\"\"\n        super(HigherHRNet, self).__init__()\n        self.backbone = backbone\n        self.hrhrnet_head = hrhrnet_head\n        self.post_process = post_process\n        self.flip = eval_flip\n        self.flip_perm = paddle.to_tensor(flip_perm)\n        self.deploy = False\n        self.interpolate = L.Upsample(2, mode='bilinear')\n        self.pool = L.MaxPool(5, 1, 2)\n        self.max_num_people = max_num_people\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        # backbone\n        backbone = create(cfg['backbone'])\n        # head\n        kwargs = {'input_shape': backbone.out_shape}\n        hrhrnet_head = create(cfg['hrhrnet_head'], **kwargs)\n        post_process = create(cfg['post_process'])\n\n        return {\n            'backbone': backbone,\n            \"hrhrnet_head\": hrhrnet_head,\n            \"post_process\": post_process,\n        }\n\n    def _forward(self):\n        if self.flip and not self.training and not self.deploy:\n            self.inputs['image'] = paddle.concat(\n                (self.inputs['image'], paddle.flip(self.inputs['image'], [3])))\n        body_feats = self.backbone(self.inputs)\n\n        if self.training:\n            return self.hrhrnet_head(body_feats, self.inputs)\n        else:\n            outputs = self.hrhrnet_head(body_feats)\n\n            if self.flip and not self.deploy:\n                outputs = [paddle.split(o, 2) for o in outputs]\n                output_rflip = [\n                    paddle.flip(paddle.gather(o[1], self.flip_perm, 1), [3])\n                    for o in outputs\n                ]\n                output1 = [o[0] for o in outputs]\n                heatmap = (output1[0] + output_rflip[0]) / 2.\n      
                tagmaps = [output1[1], output_rflip[1]]\n                outputs = [heatmap] + tagmaps\n            outputs = self.get_topk(outputs)\n\n            if self.deploy:\n                return outputs\n\n            res_lst = []\n            h = self.inputs['im_shape'][0, 0].numpy().item()\n            w = self.inputs['im_shape'][0, 1].numpy().item()\n            kpts, scores = self.post_process(*outputs, h, w)\n            res_lst.append([kpts, scores])\n            return res_lst\n\n    def get_loss(self):\n        return self._forward()\n\n    def get_pred(self):\n        outputs = {}\n        res_lst = self._forward()\n        outputs['keypoint'] = res_lst\n        return outputs\n\n    def get_topk(self, outputs):\n        # resize to image size\n        outputs = [self.interpolate(x) for x in outputs]\n        if len(outputs) == 3:\n            tagmap = paddle.concat(\n                (outputs[1].unsqueeze(4), outputs[2].unsqueeze(4)), axis=4)\n        else:\n            tagmap = outputs[1].unsqueeze(4)\n\n        heatmap = outputs[0]\n        N, J = 1, self.hrhrnet_head.num_joints\n        heatmap_maxpool = self.pool(heatmap)\n        # topk\n        maxmap = heatmap * (heatmap == heatmap_maxpool)\n        maxmap = maxmap.reshape([N, J, -1])\n        heat_k, inds_k = maxmap.topk(self.max_num_people, axis=2)\n\n        outputs = [heatmap, tagmap, heat_k, inds_k]\n        return outputs\n\n\n@register\n@serializable\nclass HrHRNetPostProcess(object):\n    '''\n    HrHRNet postprocess contains:\n        1) get topk keypoints in the output heatmap\n        2) sample the tagmap's value corresponding to each of the topk coordinates\n        3) match different joints into people with the Hungarian algorithm\n        4) adjust the coordinates by +-0.25 to decrease the error std\n        5) salvage missing joints by checking the positivity of heatmap - tagdiff_norm\n    Args:\n        max_num_people (int): max number of people supported in postprocess\n        heat_thresh (float): topk values below this threshold will be ignored\n        tag_thresh (float): coords whose sampled tagmap value is below this threshold belong to the same person during init\n\n        inputs(list[heatmap]): the output list of the model, [heatmap, heatmap_maxpool, tagmap], heatmap_maxpool used to get topk\n        original_height, original_width (float): the original image size\n    '''\n\n    def __init__(self, max_num_people=30, heat_thresh=0.1, tag_thresh=1.):\n        self.max_num_people = max_num_people\n        self.heat_thresh = heat_thresh\n        self.tag_thresh = tag_thresh\n\n    def lerp(self, j, y, x, heatmap):\n        H, W = heatmap.shape[-2:]\n        left = np.clip(x - 1, 0, W - 1)\n        right = np.clip(x + 1, 0, W - 1)\n        up = np.clip(y - 1, 0, H - 1)\n        down = np.clip(y + 1, 0, H - 1)\n        offset_y = np.where(heatmap[j, down, x] > heatmap[j, up, x], 0.25,\n                            -0.25)\n        offset_x = np.where(heatmap[j, y, right] > heatmap[j, y, left], 0.25,\n                            -0.25)\n        return offset_y + 0.5, offset_x + 0.5\n\n    def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height,\n                 original_width):\n\n        N, J, H, W = heatmap.shape\n        assert N == 1, \"only support batch size 1\"\n        heatmap = heatmap[0].cpu().detach().numpy()\n        tagmap = tagmap[0].cpu().detach().numpy()\n        heats = heat_k[0].cpu().detach().numpy()\n        inds_np = inds_k[0].cpu().detach().numpy()\n        y = inds_np // W\n        x = 
inds_np % W\n        tags = tagmap[np.arange(J)[None, :].repeat(self.max_num_people),\n                      y.flatten(), x.flatten()].reshape(J, -1, tagmap.shape[-1])\n        coords = np.stack((y, x), axis=2)\n        # threshold\n        mask = heats > self.heat_thresh\n        # cluster\n        cluster = defaultdict(lambda: {\n            'coords': np.zeros((J, 2), dtype=np.float32),\n            'scores': np.zeros(J, dtype=np.float32),\n            'tags': []\n        })\n        for jid, m in enumerate(mask):\n            num_valid = m.sum()\n            if num_valid == 0:\n                continue\n            valid_inds = np.where(m)[0]\n            valid_tags = tags[jid, m, :]\n            if len(cluster) == 0:  # initialize\n                for i in valid_inds:\n                    tag = tags[jid, i]\n                    key = tag[0]\n                    cluster[key]['tags'].append(tag)\n                    cluster[key]['scores'][jid] = heats[jid, i]\n                    cluster[key]['coords'][jid] = coords[jid, i]\n                continue\n            candidates = list(cluster.keys())[:self.max_num_people]\n            centroids = [\n                np.mean(\n                    cluster[k]['tags'], axis=0) for k in candidates\n            ]\n            num_clusters = len(centroids)\n            # shape is (num_valid, num_clusters, tag_dim)\n            dist = valid_tags[:, None, :] - np.array(centroids)[None, ...]\n            l2_dist = np.linalg.norm(dist, ord=2, axis=2)\n            # modulate dist with heat value, see `use_detection_val`\n            cost = np.round(l2_dist) * 100 - heats[jid, m, None]\n            # pad the cost matrix, otherwise new pose are ignored\n            if num_valid > num_clusters:\n                cost = np.pad(cost, ((0, 0), (0, num_valid - num_clusters)),\n                              'constant',\n                              constant_values=((0, 0), (0, 1e-10)))\n            rows, cols = linear_sum_assignment(cost)\n            for y, x in zip(rows, cols):\n                tag = tags[jid, y]\n                if y < num_valid and x < num_clusters and \\\n                   l2_dist[y, x] < self.tag_thresh:\n                    key = candidates[x]  # merge to cluster\n                else:\n                    key = tag[0]  # initialize new cluster\n                cluster[key]['tags'].append(tag)\n                cluster[key]['scores'][jid] = heats[jid, y]\n                cluster[key]['coords'][jid] = coords[jid, y]\n\n        # shape is [k, J, 2] and [k, J]\n        pose_tags = np.array([cluster[k]['tags'] for k in cluster])\n        pose_coords = np.array([cluster[k]['coords'] for k in cluster])\n        pose_scores = np.array([cluster[k]['scores'] for k in cluster])\n        valid = pose_scores > 0\n\n        pose_kpts = np.zeros((pose_scores.shape[0], J, 3), dtype=np.float32)\n        if valid.sum() == 0:\n            return pose_kpts, pose_kpts\n\n        # refine coords\n        valid_coords = pose_coords[valid].astype(np.int32)\n        y = valid_coords[..., 0].flatten()\n        x = valid_coords[..., 1].flatten()\n        _, j = np.nonzero(valid)\n        offsets = self.lerp(j, y, x, heatmap)\n        pose_coords[valid, 0] += offsets[0]\n        pose_coords[valid, 1] += offsets[1]\n\n        # mean score before salvage\n        mean_score = pose_scores.mean(axis=1)\n        pose_kpts[valid, 2] = pose_scores[valid]\n\n        # salvage missing joints\n        if True:\n            for pid, coords in enumerate(pose_coords):\n          
      tag_mean = np.array(pose_tags[pid]).mean(axis=0)\n                norm = np.sum((tagmap - tag_mean)**2, axis=3)**0.5\n                score = heatmap - np.round(norm)  # (J, H, W)\n                flat_score = score.reshape(J, -1)\n                max_inds = np.argmax(flat_score, axis=1)\n                max_scores = np.max(flat_score, axis=1)\n                salvage_joints = (pose_scores[pid] == 0) & (max_scores > 0)\n                if salvage_joints.sum() == 0:\n                    continue\n                y = max_inds[salvage_joints] // W\n                x = max_inds[salvage_joints] % W\n                offsets = self.lerp(salvage_joints.nonzero()[0], y, x, heatmap)\n                y = y.astype(np.float32) + offsets[0]\n                x = x.astype(np.float32) + offsets[1]\n                pose_coords[pid][salvage_joints, 0] = y\n                pose_coords[pid][salvage_joints, 1] = x\n                pose_kpts[pid][salvage_joints, 2] = max_scores[salvage_joints]\n        pose_kpts[..., :2] = transpred(pose_coords[..., :2][..., ::-1],\n                                       original_height, original_width,\n                                       min(H, W))\n        return pose_kpts, mean_score\n"
  },
  {
    "path": "ppdet/modeling/architectures/keypoint_hrnet.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License. \n# You may obtain a copy of the License at \n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport numpy as np\nimport math\nimport cv2\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\nfrom ..keypoint_utils import transform_preds\nfrom .. import layers as L\nfrom paddle.nn import functional as F\n\n__all__ = ['TopDownHRNet', 'TinyPose3DHRNet', 'TinyPose3DHRHeatmapNet']\n\n\n@register\nclass TopDownHRNet(BaseArch):\n    __category__ = 'architecture'\n    __inject__ = ['loss']\n\n    def __init__(self,\n                 width,\n                 num_joints,\n                 backbone='HRNet',\n                 loss='KeyPointMSELoss',\n                 post_process='HRNetPostProcess',\n                 flip_perm=None,\n                 flip=True,\n                 shift_heatmap=True,\n                 use_dark=True):\n        \"\"\"\n        HRNet network, see https://arxiv.org/abs/1902.09212\n \n        Args:\n            backbone (nn.Layer): backbone instance\n            post_process (object): `HRNetPostProcess` instance\n            flip_perm (list): The left-right joints exchange order list\n            use_dark(bool): Whether to use DARK in post processing\n        \"\"\"\n        super(TopDownHRNet, self).__init__()\n        self.backbone = backbone\n        self.post_process = HRNetPostProcess(use_dark)\n        self.loss = loss\n        self.flip_perm = flip_perm\n        self.flip = flip\n        self.final_conv = L.Conv2d(width, num_joints, 1, 1, 0, bias=True)\n        self.shift_heatmap = shift_heatmap\n        self.deploy = False\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        # backbone\n        backbone = create(cfg['backbone'])\n\n        return {'backbone': backbone, }\n\n    def _forward(self):\n        feats = self.backbone(self.inputs)\n        hrnet_outputs = self.final_conv(feats[0])\n\n        if self.training:\n            return self.loss(hrnet_outputs, self.inputs)\n        elif self.deploy:\n            outshape = hrnet_outputs.shape\n            max_idx = paddle.argmax(\n                hrnet_outputs.reshape(\n                    (outshape[0], outshape[1], outshape[2] * outshape[3])),\n                axis=-1)\n            return hrnet_outputs, max_idx\n        else:\n            if self.flip:\n                self.inputs['image'] = self.inputs['image'].flip([3])\n                feats = self.backbone(self.inputs)\n                output_flipped = self.final_conv(feats[0])\n                output_flipped = self.flip_back(output_flipped.numpy(),\n                                                self.flip_perm)\n                output_flipped = paddle.to_tensor(output_flipped.copy())\n                if self.shift_heatmap:\n                    output_flipped[:, :, :, 1:] = output_flipped.clone(\n                    )[:, :, :, 
0:-1]\n                hrnet_outputs = (hrnet_outputs + output_flipped) * 0.5\n            imshape = (self.inputs['im_shape'].numpy()\n                       )[:, ::-1] if 'im_shape' in self.inputs else None\n            center = self.inputs['center'].numpy(\n            ) if 'center' in self.inputs else np.round(imshape / 2.)\n            scale = self.inputs['scale'].numpy(\n            ) if 'scale' in self.inputs else imshape / 200.\n            outputs = self.post_process(hrnet_outputs, center, scale)\n            return outputs\n\n    def get_loss(self):\n        return self._forward()\n\n    def get_pred(self):\n        res_lst = self._forward()\n        outputs = {'keypoint': res_lst}\n        return outputs\n\n    def flip_back(self, output_flipped, matched_parts):\n        assert output_flipped.ndim == 4,\\\n                'output_flipped should be [batch_size, num_joints, height, width]'\n\n        output_flipped = output_flipped[:, :, :, ::-1]\n\n        for pair in matched_parts:\n            tmp = output_flipped[:, pair[0], :, :].copy()\n            output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]\n            output_flipped[:, pair[1], :, :] = tmp\n\n        return output_flipped\n\n\nclass HRNetPostProcess(object):\n    def __init__(self, use_dark=True):\n        self.use_dark = use_dark\n\n    def get_max_preds(self, heatmaps):\n        '''get predictions from score maps\n \n        Args:\n            heatmaps: numpy.ndarray([batch_size, num_joints, height, width])\n \n        Returns:\n            preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords\n            maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints\n        '''\n        assert isinstance(heatmaps,\n                          np.ndarray), 'heatmaps should be numpy.ndarray'\n        assert heatmaps.ndim == 4, 'batch_images should be 4-ndim'\n\n        batch_size = heatmaps.shape[0]\n        num_joints = heatmaps.shape[1]\n        width = heatmaps.shape[3]\n        heatmaps_reshaped = heatmaps.reshape((batch_size, num_joints, -1))\n        idx = np.argmax(heatmaps_reshaped, 2)\n        maxvals = np.amax(heatmaps_reshaped, 2)\n\n        maxvals = maxvals.reshape((batch_size, num_joints, 1))\n        idx = idx.reshape((batch_size, num_joints, 1))\n\n        preds = np.tile(idx, (1, 1, 2)).astype(np.float32)\n\n        preds[:, :, 0] = (preds[:, :, 0]) % width\n        preds[:, :, 1] = np.floor((preds[:, :, 1]) / width)\n\n        pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2))\n        pred_mask = pred_mask.astype(np.float32)\n\n        preds *= pred_mask\n\n        return preds, maxvals\n\n    def gaussian_blur(self, heatmap, kernel):\n        border = (kernel - 1) // 2\n        batch_size = heatmap.shape[0]\n        num_joints = heatmap.shape[1]\n        height = heatmap.shape[2]\n        width = heatmap.shape[3]\n        for i in range(batch_size):\n            for j in range(num_joints):\n                origin_max = np.max(heatmap[i, j])\n                dr = np.zeros((height + 2 * border, width + 2 * border))\n                dr[border:-border, border:-border] = heatmap[i, j].copy()\n                dr = cv2.GaussianBlur(dr, (kernel, kernel), 0)\n                heatmap[i, j] = dr[border:-border, border:-border].copy()\n                heatmap[i, j] *= origin_max / np.max(heatmap[i, j])\n        return heatmap\n\n    def dark_parse(self, hm, coord):\n        heatmap_height = hm.shape[0]\n        heatmap_width = hm.shape[1]\n
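        # DARK: refine the integer argmax with a second-order Taylor expansion of the log-heatmap\n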
        px = int(coord[0])\n        py = int(coord[1])\n        if 1 < px < heatmap_width - 2 and 1 < py < heatmap_height - 2:\n            dx = 0.5 * (hm[py][px + 1] - hm[py][px - 1])\n            dy = 0.5 * (hm[py + 1][px] - hm[py - 1][px])\n            dxx = 0.25 * (hm[py][px + 2] - 2 * hm[py][px] + hm[py][px - 2])\n            dxy = 0.25 * (hm[py+1][px+1] - hm[py-1][px+1] - hm[py+1][px-1] \\\n                + hm[py-1][px-1])\n            dyy = 0.25 * (\n                hm[py + 2 * 1][px] - 2 * hm[py][px] + hm[py - 2 * 1][px])\n            derivative = np.matrix([[dx], [dy]])\n            hessian = np.matrix([[dxx, dxy], [dxy, dyy]])\n            if dxx * dyy - dxy**2 != 0:\n                hessianinv = hessian.I\n                offset = -hessianinv * derivative\n                offset = np.squeeze(np.array(offset.T), axis=0)\n                coord += offset\n        return coord\n\n    def dark_postprocess(self, hm, coords, kernelsize):\n        '''DARK postprocessing, Zhang et al. Distribution-Aware Coordinate\n        Representation for Human Pose Estimation (CVPR 2020).\n        '''\n\n        hm = self.gaussian_blur(hm, kernelsize)\n        hm = np.maximum(hm, 1e-10)\n        hm = np.log(hm)\n        for n in range(coords.shape[0]):\n            for p in range(coords.shape[1]):\n                coords[n, p] = self.dark_parse(hm[n][p], coords[n][p])\n        return coords\n\n    def get_final_preds(self, heatmaps, center, scale, kernelsize=3):\n        \"\"\"the highest heatvalue location with a quarter offset in the\n        direction from the highest response to the second highest response.\n \n        Args:\n            heatmaps (numpy.ndarray): The predicted heatmaps\n            center (numpy.ndarray): The boxes center\n            scale (numpy.ndarray): The scale factor\n \n        Returns:\n            preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords\n            maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints\n        \"\"\"\n        coords, maxvals = self.get_max_preds(heatmaps)\n\n        heatmap_height = heatmaps.shape[2]\n        heatmap_width = heatmaps.shape[3]\n\n        if self.use_dark:\n            coords = self.dark_postprocess(heatmaps, coords, kernelsize)\n        else:\n            for n in range(coords.shape[0]):\n                for p in range(coords.shape[1]):\n                    hm = heatmaps[n][p]\n                    px = int(math.floor(coords[n][p][0] + 0.5))\n                    py = int(math.floor(coords[n][p][1] + 0.5))\n                    if 1 < px < heatmap_width - 1 and 1 < py < heatmap_height - 1:\n                        diff = np.array([\n                            hm[py][px + 1] - hm[py][px - 1],\n                            hm[py + 1][px] - hm[py - 1][px]\n                        ])\n                        coords[n][p] += np.sign(diff) * .25\n        preds = coords.copy()\n\n        # Transform back\n        for i in range(coords.shape[0]):\n            preds[i] = transform_preds(coords[i], center[i], scale[i],\n                                       [heatmap_width, heatmap_height])\n\n        return preds, maxvals\n\n    def __call__(self, output, center, scale):\n        preds, maxvals = self.get_final_preds(output.numpy(), center, scale)\n        outputs = [[\n            np.concatenate(\n                (preds, maxvals), axis=-1), np.mean(\n                    maxvals, axis=1)\n        ]]\n        return outputs\n\n\nclass TinyPose3DPostProcess(object):\n
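    # maps predicted keypoint x/y back to the input scale using the per-sample scale factors\n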
    def __init__(self):\n        pass\n\n    def __call__(self, output, center, scale):\n        \"\"\"\n        Args:\n            output (Tensor): [batch_size, num_joints, 3], keypoints coords\n            scale (numpy.ndarray): The scale factor\n        Returns:\n            preds: numpy.ndarray([batch_size, num_joints, 3]), keypoints coords\n        \"\"\"\n\n        preds = output.numpy().copy()\n\n        # Transform back\n        for i in range(output.shape[0]):  # batch_size\n            preds[i][:, 0] = preds[i][:, 0] * scale[i][0]\n            preds[i][:, 1] = preds[i][:, 1] * scale[i][1]\n\n        return preds\n\n\ndef soft_argmax(heatmaps, joint_num):\n    dims = heatmaps.shape\n    depth_dim = int(dims[1] // joint_num)\n    heatmaps = heatmaps.reshape((-1, joint_num, depth_dim * dims[2] * dims[3]))\n    heatmaps = F.softmax(heatmaps, 2)\n    heatmaps = heatmaps.reshape((-1, joint_num, depth_dim, dims[2], dims[3]))\n\n    accu_x = heatmaps.sum(axis=(2, 3))\n    accu_y = heatmaps.sum(axis=(2, 4))\n    accu_z = heatmaps.sum(axis=(3, 4))\n\n    accu_x = accu_x * paddle.arange(1, 33)\n    accu_y = accu_y * paddle.arange(1, 33)\n    accu_z = accu_z * paddle.arange(1, 33)\n\n    accu_x = accu_x.sum(axis=2, keepdim=True) - 1\n    accu_y = accu_y.sum(axis=2, keepdim=True) - 1\n    accu_z = accu_z.sum(axis=2, keepdim=True) - 1\n\n    coord_out = paddle.concat(\n        (accu_x, accu_y, accu_z), axis=2)  # [batch_size, joint_num, 3]\n\n    return coord_out\n\n\n@register\nclass TinyPose3DHRHeatmapNet(BaseArch):\n    __category__ = 'architecture'\n    __inject__ = ['loss']\n\n    def __init__(\n            self,\n            width,  # 40, number of channels output by the backbone\n            num_joints,\n            backbone='HRNet',\n            loss='KeyPointRegressionMSELoss',\n            post_process=TinyPose3DPostProcess):\n        \"\"\"\n        Args:\n            backbone (nn.Layer): backbone instance\n            post_process (object): post process instance\n        \"\"\"\n        super(TinyPose3DHRHeatmapNet, self).__init__()\n\n        self.backbone = backbone\n        self.post_process = TinyPose3DPostProcess()\n        self.loss = loss\n        self.deploy = False\n        self.num_joints = num_joints\n\n        self.final_conv = L.Conv2d(width, num_joints * 32, 1, 1, 0, bias=True)\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        # backbone\n        backbone = create(cfg['backbone'])\n\n        return {'backbone': backbone, }\n\n    def _forward(self):\n        feats = self.backbone(self.inputs)  # feats:[[batch_size, 40, 32, 24]]\n\n        hrnet_outputs = self.final_conv(feats[0])\n        res = soft_argmax(hrnet_outputs, self.num_joints)\n        return res\n\n    def get_loss(self):\n        pose3d = self._forward()\n        loss = self.loss(pose3d, None, self.inputs)\n        outputs = {'loss': loss}\n        return outputs\n\n    def get_pred(self):\n        res_lst = self._forward()\n        outputs = {'pose3d': res_lst}\n        return outputs\n\n    def flip_back(self, output_flipped, matched_parts):\n        assert output_flipped.ndim == 4,\\\n                'output_flipped should be [batch_size, num_joints, height, width]'\n\n        output_flipped = output_flipped[:, :, :, ::-1]\n\n        for pair in matched_parts:\n            tmp = output_flipped[:, pair[0], :, :].copy()\n            output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]\n            output_flipped[:, pair[1], :, :] = tmp\n\n        return output_flipped\n
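\n# Note: TinyPose3DHRHeatmapNet above decodes volumetric heatmaps with\n# soft_argmax (the paddle.arange(1, 33) factors assume 32 bins per axis),\n# while TinyPose3DHRNet below regresses (x, y, z) directly from flattened\n# HRNet features through a small fully-connected head.\n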
\n\n@register\nclass TinyPose3DHRNet(BaseArch):\n    __category__ = 'architecture'\n    __inject__ = ['loss']\n\n    def __init__(self,\n                 width,\n                 num_joints,\n                 fc_channel=768,\n                 backbone='HRNet',\n                 loss='KeyPointRegressionMSELoss',\n                 post_process=TinyPose3DPostProcess):\n        \"\"\"\n        Args:\n            backbone (nn.Layer): backbone instance\n            post_process (object): post process instance\n        \"\"\"\n        super(TinyPose3DHRNet, self).__init__()\n        self.backbone = backbone\n        self.post_process = TinyPose3DPostProcess()\n        self.loss = loss\n        self.deploy = False\n        self.num_joints = num_joints\n\n        self.final_conv = L.Conv2d(width, num_joints, 1, 1, 0, bias=True)\n\n        self.flatten = paddle.nn.Flatten(start_axis=2, stop_axis=3)\n        self.fc1 = paddle.nn.Linear(fc_channel, 256)\n        self.act1 = paddle.nn.ReLU()\n        self.fc2 = paddle.nn.Linear(256, 64)\n        self.act2 = paddle.nn.ReLU()\n        self.fc3 = paddle.nn.Linear(64, 3)\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        # backbone\n        backbone = create(cfg['backbone'])\n\n        return {'backbone': backbone, }\n\n    def _forward(self):\n        '''\n        self.inputs is a dict\n        '''\n        feats = self.backbone(\n            self.inputs)  # feats: [[batch_size, 40, width/4, height/4]]\n\n        hrnet_outputs = self.final_conv(\n            feats[0])  # hrnet_outputs: [batch_size, num_joints, 32, 32]\n\n        flatten_res = self.flatten(\n            hrnet_outputs)  # [batch_size, num_joints, 32*32]\n\n        res = self.fc1(flatten_res)\n        res = self.act1(res)\n        res = self.fc2(res)\n        res = self.act2(res)\n        res = self.fc3(res)\n\n        if self.training:\n            return self.loss(res, self.inputs)\n        else:  # needed for model export\n            return res\n\n    def get_loss(self):\n        return self._forward()\n\n    def get_pred(self):\n        res_lst = self._forward()\n        outputs = {'pose3d': res_lst}\n        return outputs\n\n    def flip_back(self, output_flipped, matched_parts):\n        assert output_flipped.ndim == 4,\\\n                'output_flipped should be [batch_size, num_joints, height, width]'\n\n        output_flipped = output_flipped[:, :, :, ::-1]\n\n        for pair in matched_parts:\n            tmp = output_flipped[:, pair[0], :, :].copy()\n            output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]\n            output_flipped[:, pair[1], :, :] = tmp\n\n        return output_flipped\n"
  },
  {
    "path": "ppdet/modeling/architectures/keypoint_petr.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. \n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License. \n# You may obtain a copy of the License at \n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and \n# limitations under the License.\n\"\"\"\nthis code is base on https://github.com/hikvision-research/opera/blob/main/opera/models/detectors/petr.py\n\"\"\"\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nfrom ppdet.core.workspace import register\nfrom .meta_arch import BaseArch\nfrom .. import layers as L\n\n__all__ = ['PETR']\n\n\n@register\nclass PETR(BaseArch):\n    __category__ = 'architecture'\n    __inject__ = ['backbone', 'neck', 'bbox_head']\n\n    def __init__(self,\n                 backbone='ResNet',\n                 neck='ChannelMapper',\n                 bbox_head='PETRHead'):\n        \"\"\"\n        PETR, see https://openaccess.thecvf.com/content/CVPR2022/papers/Shi_End-to-End_Multi-Person_Pose_Estimation_With_Transformers_CVPR_2022_paper.pdf\n\n        Args:\n            backbone (nn.Layer): backbone instance\n            neck (nn.Layer): neck between backbone and head\n            bbox_head (nn.Layer): model output and loss\n        \"\"\"\n        super(PETR, self).__init__()\n        self.backbone = backbone\n        if neck is not None:\n            self.with_neck = True\n        self.neck = neck\n        self.bbox_head = bbox_head\n        self.deploy = False\n\n    def extract_feat(self, img):\n        \"\"\"Directly extract features from the backbone+neck.\"\"\"\n        x = self.backbone(img)\n        if self.with_neck:\n            x = self.neck(x)\n        return x\n\n    def get_inputs(self):\n        img_metas = []\n        gt_bboxes = []\n        gt_labels = []\n        gt_keypoints = []\n        gt_areas = []\n        pad_gt_mask = self.inputs['pad_gt_mask'].astype(\"bool\").squeeze(-1)\n        for idx, im_shape in enumerate(self.inputs['im_shape']):\n            img_meta = {\n                'img_shape': im_shape.astype(\"int32\").tolist() + [1, ],\n                'batch_input_shape': self.inputs['image'].shape[-2:],\n                'image_name': self.inputs['image_file'][idx]\n            }\n            img_metas.append(img_meta)\n            if (not pad_gt_mask[idx].any()):\n                gt_keypoints.append(self.inputs['gt_joints'][idx][:1])\n                gt_labels.append(self.inputs['gt_class'][idx][:1])\n                gt_bboxes.append(self.inputs['gt_bbox'][idx][:1])\n                gt_areas.append(self.inputs['gt_areas'][idx][:1])\n                continue\n\n            gt_keypoints.append(self.inputs['gt_joints'][idx][pad_gt_mask[idx]])\n            gt_labels.append(self.inputs['gt_class'][idx][pad_gt_mask[idx]])\n            gt_bboxes.append(self.inputs['gt_bbox'][idx][pad_gt_mask[idx]])\n            gt_areas.append(self.inputs['gt_areas'][idx][pad_gt_mask[idx]])\n\n        return img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas\n\n    def get_loss(self):\n        \"\"\"\n        Args:\n            img (Tensor): Input images of shape (N, C, H, W).\n  
              Typically these should be mean centered and std scaled.\n            img_metas (list[dict]): A List of image info dict where each dict\n                has: 'img_shape', 'scale_factor', 'flip', and may also contain\n                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.\n                For details on the values of these keys see\n                :class:`mmdet.datasets.pipelines.Collect`.\n            gt_bboxes (list[Tensor]): Each item are the truth boxes for each\n                image in [tl_x, tl_y, br_x, br_y] format.\n            gt_labels (list[Tensor]): Class indices corresponding to each box.\n            gt_keypoints (list[Tensor]): Each item are the truth keypoints for\n                each image in [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x,\n                p^{K}_y, p^{K}_v] format.\n            gt_areas (list[Tensor]): mask areas corresponding to each box.\n            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding\n                boxes can be ignored when computing the loss.\n\n        Returns:\n            dict[str, Tensor]: A dictionary of loss components.\n        \"\"\"\n\n        img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas = self.get_inputs(\n        )\n        gt_bboxes_ignore = getattr(self.inputs, 'gt_bboxes_ignore', None)\n\n        x = self.extract_feat(self.inputs)\n        losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes,\n                                              gt_labels, gt_keypoints, gt_areas,\n                                              gt_bboxes_ignore)\n        loss = 0\n        for k, v in losses.items():\n            loss += v\n        losses['loss'] = loss\n\n        return losses\n\n    def get_pred_numpy(self):\n        \"\"\"Used for computing network flops.\n        \"\"\"\n\n        img = self.inputs['image']\n        batch_size, _, height, width = img.shape\n        dummy_img_metas = [\n            dict(\n                batch_input_shape=(height, width),\n                img_shape=(height, width, 3),\n                scale_factor=(1., 1., 1., 1.)) for _ in range(batch_size)\n        ]\n        x = self.extract_feat(img)\n        outs = self.bbox_head(x, img_metas=dummy_img_metas)\n        bbox_list = self.bbox_head.get_bboxes(\n            *outs, dummy_img_metas, rescale=True)\n        return bbox_list\n\n    def get_pred(self):\n        \"\"\"\n        \"\"\"\n        img = self.inputs['image']\n        batch_size, _, height, width = img.shape\n        img_metas = [\n            dict(\n                batch_input_shape=(height, width),\n                img_shape=(height, width, 3),\n                scale_factor=self.inputs['scale_factor'][i])\n            for i in range(batch_size)\n        ]\n        kptpred = self.simple_test(\n            self.inputs, img_metas=img_metas, rescale=True)\n        keypoints = kptpred[0][1][0]\n        bboxs = kptpred[0][0][0]\n        keypoints[..., 2] = bboxs[:, None, 4]\n        res_lst = [[keypoints, bboxs[:, 4]]]\n        outputs = {'keypoint': res_lst}\n        return outputs\n\n    def simple_test(self, inputs, img_metas, rescale=False):\n        \"\"\"Test function without test time augmentation.\n\n        Args:\n            inputs (list[paddle.Tensor]): List of multiple images.\n            img_metas (list[dict]): List of image information.\n            rescale (bool, optional): Whether to rescale the results.\n                Defaults to False.\n\n        Returns:\n            list[list[np.ndarray]]: BBox and keypoint 
results of each image\n                and classes. The outer list corresponds to each image.\n                The inner list corresponds to each class.\n        \"\"\"\n        batch_size = len(img_metas)\n        assert batch_size == 1, 'Currently only batch_size 1 for inference ' \\\n            f'mode is supported. Found batch_size {batch_size}.'\n        feat = self.extract_feat(inputs)\n        results_list = self.bbox_head.simple_test(\n            feat, img_metas, rescale=rescale)\n\n        bbox_kpt_results = [\n            self.bbox_kpt2result(det_bboxes, det_labels, det_kpts,\n                                 self.bbox_head.num_classes)\n            for det_bboxes, det_labels, det_kpts in results_list\n        ]\n        return bbox_kpt_results\n\n    def bbox_kpt2result(self, bboxes, labels, kpts, num_classes):\n        \"\"\"Convert detection results to a list of numpy arrays.\n\n        Args:\n            bboxes (paddle.Tensor | np.ndarray): shape (n, 5).\n            labels (paddle.Tensor | np.ndarray): shape (n, ).\n            kpts (paddle.Tensor | np.ndarray): shape (n, K, 3).\n            num_classes (int): class number, including background class.\n\n        Returns:\n            list(ndarray): bbox and keypoint results of each class.\n        \"\"\"\n        if bboxes.shape[0] == 0:\n            return [np.zeros((0, 5), dtype=np.float32) for i in range(num_classes)], \\\n                [np.zeros((0, kpts.shape[1], 3), dtype=np.float32)\n                    for i in range(num_classes)]\n        else:\n            if isinstance(bboxes, paddle.Tensor):\n                bboxes = bboxes.numpy()\n                labels = labels.numpy()\n                kpts = kpts.numpy()\n            return [bboxes[labels == i, :] for i in range(num_classes)], \\\n                [kpts[labels == i, :, :] for i in range(num_classes)]\n"
  },
  {
    "path": "ppdet/modeling/architectures/keypoint_vitpose.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. \n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License. \n# You may obtain a copy of the License at \n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport numpy as np\nimport math\nimport cv2\nfrom ppdet.core.workspace import register, create, serializable\nfrom .meta_arch import BaseArch\nfrom ..keypoint_utils import transform_preds\nfrom .. import layers as L\n\n__all__ = ['VitPose_TopDown', 'VitPosePostProcess']\n\n\n@register\nclass VitPose_TopDown(BaseArch):\n    __category__ = 'architecture'\n    __inject__ = ['loss']\n\n    def __init__(self, backbone, head, loss, post_process, flip_test):\n        \"\"\"\n        VitPose network, see https://arxiv.org/pdf/2204.12484v2.pdf\n\n        Args:\n            backbone (nn.Layer): backbone instance\n            post_process (object): `HRNetPostProcess` instance\n            \n        \"\"\"\n        super(VitPose_TopDown, self).__init__()\n        self.backbone = backbone\n        self.head = head\n        self.loss = loss\n        self.post_process = post_process\n        self.flip_test = flip_test\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        # backbone\n        backbone = create(cfg['backbone'])\n        #head\n        head = create(cfg['head'])\n        #post_process\n        post_process = create(cfg['post_process'])\n\n        return {\n            'backbone': backbone,\n            'head': head,\n            'post_process': post_process\n        }\n\n    def _forward_train(self):\n\n        feats = self.backbone.forward_features(self.inputs['image'])\n        vitpost_output = self.head(feats)\n        return self.loss(vitpost_output, self.inputs)\n\n    def _forward_test(self):\n\n        feats = self.backbone.forward_features(self.inputs['image'])\n        output_heatmap = self.head(feats)\n\n        if self.flip_test:\n            img_flipped = self.inputs['image'].flip(3)\n            features_flipped = self.backbone.forward_features(img_flipped)\n            output_flipped_heatmap = self.head.inference_model(features_flipped,\n                                                               self.flip_test)\n\n            output_heatmap = (output_heatmap + output_flipped_heatmap) * 0.5\n\n        imshape = (self.inputs['im_shape'].numpy()\n                   )[:, ::-1] if 'im_shape' in self.inputs else None\n        center = self.inputs['center'].numpy(\n        ) if 'center' in self.inputs else np.round(imshape / 2.)\n        scale = self.inputs['scale'].numpy(\n        ) if 'scale' in self.inputs else imshape / 200.\n\n        result = self.post_process(output_heatmap.cpu().numpy(), center, scale)\n\n        return result\n\n    def get_loss(self):\n        return self._forward_train()\n\n    def get_pred(self):\n        res_lst = self._forward_test()\n        outputs = {'keypoint': res_lst}\n        return outputs\n\n\n@register\n@serializable\nclass VitPosePostProcess(object):\n  
    def __init__(self, use_dark=False):\n        self.use_dark = use_dark\n\n    def get_max_preds(self, heatmaps):\n        '''Get predictions from score maps.\n\n        Args:\n            heatmaps: numpy.ndarray([batch_size, num_joints, height, width])\n\n        Returns:\n            preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords\n            maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints\n        '''\n        assert isinstance(heatmaps,\n                          np.ndarray), 'heatmaps should be numpy.ndarray'\n        assert heatmaps.ndim == 4, 'batch_images should be 4-ndim'\n\n        batch_size = heatmaps.shape[0]\n        num_joints = heatmaps.shape[1]\n        width = heatmaps.shape[3]\n        heatmaps_reshaped = heatmaps.reshape((batch_size, num_joints, -1))\n        idx = np.argmax(heatmaps_reshaped, 2)\n        maxvals = np.amax(heatmaps_reshaped, 2)\n\n        maxvals = maxvals.reshape((batch_size, num_joints, 1))\n        idx = idx.reshape((batch_size, num_joints, 1))\n\n        preds = np.tile(idx, (1, 1, 2)).astype(np.float32)\n\n        preds[:, :, 0] = (preds[:, :, 0]) % width\n        preds[:, :, 1] = np.floor((preds[:, :, 1]) / width)\n\n        pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2))\n        pred_mask = pred_mask.astype(np.float32)\n\n        preds *= pred_mask\n\n        return preds, maxvals\n\n    def post_dark_udp(self, coords, batch_heatmaps, kernel=3):\n        \"\"\"DARK post-processing, implemented with UDP. Paper ref: Huang et al. The\n        Devil is in the Details: Delving into Unbiased Data Processing for Human\n        Pose Estimation (CVPR 2020). Zhang et al. Distribution-Aware Coordinate\n        Representation for Human Pose Estimation (CVPR 2020).\n\n        Note:\n            - batch size: B\n            - num keypoints: K\n            - num persons: N\n            - height of heatmaps: H\n            - width of heatmaps: W\n\n            B=1 for bottom_up paradigm where all persons share the same heatmap.\n            B=N for top_down paradigm where each person has its own heatmaps.\n\n        Args:\n            coords (np.ndarray[N, K, 2]): Initial coordinates of human pose.\n            batch_heatmaps (np.ndarray[B, K, H, W]): batch_heatmaps\n            kernel (int): Gaussian kernel size (K) for modulation.\n\n        Returns:\n            np.ndarray([N, K, 2]): Refined coordinates.\n        \"\"\"\n        if not isinstance(batch_heatmaps, np.ndarray):\n            batch_heatmaps = batch_heatmaps.cpu().numpy()\n        B, K, H, W = batch_heatmaps.shape\n        N = coords.shape[0]\n        assert (B == 1 or B == N)\n        for heatmaps in batch_heatmaps:\n            for heatmap in heatmaps:\n                cv2.GaussianBlur(heatmap, (kernel, kernel), 0, heatmap)\n        np.clip(batch_heatmaps, 0.001, 50, batch_heatmaps)\n        np.log(batch_heatmaps, batch_heatmaps)\n\n        batch_heatmaps_pad = np.pad(batch_heatmaps, ((0, 0), (0, 0), (1, 1),\n                                                     (1, 1)),\n                                    mode='edge').flatten()\n\n        index = coords[..., 0] + 1 + (coords[..., 1] + 1) * (W + 2)\n        index += (W + 2) * (H + 2) * np.arange(0, B * K).reshape(-1, K)\n        index = index.astype(int).reshape(-1, 1)\n        i_ = batch_heatmaps_pad[index]\n        ix1 = batch_heatmaps_pad[index + 1]\n        iy1 = batch_heatmaps_pad[index + W + 2]\n        ix1y1 = batch_heatmaps_pad[index + W + 3]\n
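        # Neighbour lookups in the flattened, edge-padded maps: 'index' addresses\n        # (x + 1, y + 1) of each keypoint in a [B*K, H + 2, W + 2] volume, so the\n        # offsets of +-1 and +-(W + 2) around it fetch the log-heatmap values used\n        # for the finite-difference gradient and Hessian below.\n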
        ix1_y1_ = batch_heatmaps_pad[index - W - 3]\n        ix1_ = batch_heatmaps_pad[index - 1]\n        iy1_ = batch_heatmaps_pad[index - 2 - W]\n\n        dx = 0.5 * (ix1 - ix1_)\n        dy = 0.5 * (iy1 - iy1_)\n        derivative = np.concatenate([dx, dy], axis=1)\n        derivative = derivative.reshape(N, K, 2, 1)\n        dxx = ix1 - 2 * i_ + ix1_\n        dyy = iy1 - 2 * i_ + iy1_\n        dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_)\n        hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1)\n        hessian = hessian.reshape(N, K, 2, 2)\n        hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2))\n        coords -= np.einsum('ijmn,ijnk->ijmk', hessian, derivative).squeeze()\n        return coords\n\n    def transform_preds_udp(self,\n                            coords,\n                            center,\n                            scale,\n                            output_size,\n                            use_udp=True):\n        \"\"\"Get final keypoint predictions from heatmaps and apply scaling and\n        translation to map them back to the image.\n\n        Note:\n            num_keypoints: K\n\n        Args:\n            coords (np.ndarray[K, ndims]):\n\n                * If ndims=2, coords are predicted keypoint locations.\n                * If ndims=4, coords are composed of (x, y, scores, tags)\n                * If ndims=5, coords are composed of (x, y, scores, tags,\n                flipped_tags)\n\n            center (np.ndarray[2, ]): Center of the bounding box (x, y).\n            scale (np.ndarray[2, ]): Scale of the bounding box\n                wrt [width, height].\n            output_size (np.ndarray[2, ] | list(2,)): Size of the\n                destination heatmaps.\n            use_udp (bool): Use unbiased data processing\n\n        Returns:\n            np.ndarray: Predicted coordinates in the images.\n        \"\"\"\n\n        assert coords.shape[1] in (2, 4, 5)\n        assert len(center) == 2\n        assert len(scale) == 2\n        assert len(output_size) == 2\n\n        # Recover the scale which is normalized by a factor of 200.\n        scale = scale * 200.0\n\n        if use_udp:\n            scale_x = scale[0] / (output_size[0] - 1.0)\n            scale_y = scale[1] / (output_size[1] - 1.0)\n        else:\n            scale_x = scale[0] / output_size[0]\n            scale_y = scale[1] / output_size[1]\n\n        target_coords = np.ones_like(coords)\n        target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[\n            0] * 0.5\n        target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[\n            1] * 0.5\n\n        return target_coords\n\n    def get_final_preds(self, heatmaps, center, scale, kernelsize=11):\n        \"\"\"Get the final keypoint locations: the highest heat-value location, with\n        a quarter offset in the direction from the highest response to the second\n        highest response.\n\n        Args:\n            heatmaps (numpy.ndarray): The predicted heatmaps\n            center (numpy.ndarray): The boxes center\n            scale (numpy.ndarray): The scale factor\n\n        Returns:\n            preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords\n            maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints\n        \"\"\"\n        coords, maxvals = self.get_max_preds(heatmaps)\n\n        N, K, H, W = heatmaps.shape\n\n        if self.use_dark:\n            coords = self.post_dark_udp(coords, heatmaps, kernelsize)\n            preds = 
coords.copy()\n            # Transform back to the image\n            for i in range(N):\n                preds[i] = self.transform_preds_udp(preds[i], center[i],\n                                                    scale[i], [W, H])\n        else:\n            for n in range(coords.shape[0]):\n                for p in range(coords.shape[1]):\n                    hm = heatmaps[n][p]\n                    px = int(math.floor(coords[n][p][0] + 0.5))\n                    py = int(math.floor(coords[n][p][1] + 0.5))\n                    if 1 < px < W - 1 and 1 < py < H - 1:\n                        diff = np.array([\n                            hm[py][px + 1] - hm[py][px - 1],\n                            hm[py + 1][px] - hm[py - 1][px]\n                        ])\n                        coords[n][p] += np.sign(diff) * .25\n            preds = coords.copy()\n\n            # Transform back\n            for i in range(coords.shape[0]):\n                preds[i] = transform_preds(coords[i], center[i], scale[i],\n                                           [W, H])\n\n        return preds, maxvals\n\n    def __call__(self, output, center, scale):\n        preds, maxvals = self.get_final_preds(output, center, scale)\n        outputs = [[\n            np.concatenate(\n                (preds, maxvals), axis=-1), np.mean(\n                    maxvals, axis=1)\n        ]]\n        return outputs"
  },
  {
    "path": "ppdet/modeling/architectures/mask_rcnn.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\n\n__all__ = ['MaskRCNN']\n\n\n@register\nclass MaskRCNN(BaseArch):\n    \"\"\"\n    Mask R-CNN network, see https://arxiv.org/abs/1703.06870\n\n    Args:\n        backbone (object): backbone instance\n        rpn_head (object): `RPNHead` instance\n        bbox_head (object): `BBoxHead` instance\n        mask_head (object): `MaskHead` instance\n        bbox_post_process (object): `BBoxPostProcess` instance\n        mask_post_process (object): `MaskPostProcess` instance\n        neck (object): 'FPN' instance\n    \"\"\"\n\n    __category__ = 'architecture'\n    __inject__ = [\n        'bbox_post_process',\n        'mask_post_process',\n    ]\n\n    def __init__(self,\n                 backbone,\n                 rpn_head,\n                 bbox_head,\n                 mask_head,\n                 bbox_post_process,\n                 mask_post_process,\n                 neck=None):\n        super(MaskRCNN, self).__init__()\n        self.backbone = backbone\n        self.neck = neck\n        self.rpn_head = rpn_head\n        self.bbox_head = bbox_head\n        self.mask_head = mask_head\n\n        self.bbox_post_process = bbox_post_process\n        self.mask_post_process = mask_post_process\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        backbone = create(cfg['backbone'])\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = cfg['neck'] and create(cfg['neck'], **kwargs)\n\n        out_shape = neck and neck.out_shape or backbone.out_shape\n        kwargs = {'input_shape': out_shape}\n        rpn_head = create(cfg['rpn_head'], **kwargs)\n        bbox_head = create(cfg['bbox_head'], **kwargs)\n\n        out_shape = neck and out_shape or bbox_head.get_head().out_shape\n        kwargs = {'input_shape': out_shape}\n        mask_head = create(cfg['mask_head'], **kwargs)\n        return {\n            'backbone': backbone,\n            'neck': neck,\n            \"rpn_head\": rpn_head,\n            \"bbox_head\": bbox_head,\n            \"mask_head\": mask_head,\n        }\n\n    def _forward(self):\n        body_feats = self.backbone(self.inputs)\n        if self.neck is not None:\n            body_feats = self.neck(body_feats)\n\n        if self.training:\n            rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs)\n            bbox_loss, bbox_feat = self.bbox_head(body_feats, rois, rois_num,\n                                                  self.inputs)\n            rois, rois_num = self.bbox_head.get_assigned_rois()\n            bbox_targets = self.bbox_head.get_assigned_targets()\n            # Mask Head needs bbox_feat in Mask RCNN\n            mask_loss = 
self.mask_head(body_feats, rois, rois_num, self.inputs,\n                                       bbox_targets, bbox_feat)\n            return rpn_loss, bbox_loss, mask_loss\n        else:\n            rois, rois_num, _ = self.rpn_head(body_feats, self.inputs)\n            preds, feat_func = self.bbox_head(body_feats, rois, rois_num, None)\n\n            im_shape = self.inputs['im_shape']\n            scale_factor = self.inputs['scale_factor']\n\n            bbox, bbox_num, nms_keep_idx = self.bbox_post_process(\n                preds, (rois, rois_num), im_shape, scale_factor)\n            mask_out = self.mask_head(\n                body_feats, bbox, bbox_num, self.inputs, feat_func=feat_func)\n\n            # rescale the prediction back to origin image\n            bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred(\n                bbox, bbox_num, im_shape, scale_factor)\n            origin_shape = self.bbox_post_process.get_origin_shape()\n            mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num,\n                                               origin_shape)\n\n            if self.use_extra_data:\n                extra_data = {}  # record the bbox output before NMS, such as scores and nms_keep_idx\n                \"\"\"extra_data:{\n                            'scores': predict scores,\n                            'nms_keep_idx': bbox index before nms,\n                           }\n                \"\"\"\n                extra_data['scores'] = preds[1]  # predict scores (probability)\n                # Todo: get logits output\n                extra_data['nms_keep_idx'] = nms_keep_idx  # bbox index before nms\n                return bbox_pred, bbox_num, mask_pred, extra_data\n            else:\n                return bbox_pred, bbox_num, mask_pred\n\n    def get_loss(self, ):\n        rpn_loss, bbox_loss, mask_loss = self._forward()\n        loss = {}\n        loss.update(rpn_loss)\n        loss.update(bbox_loss)\n        loss.update(mask_loss)\n        total_loss = paddle.add_n(list(loss.values()))\n        loss.update({'loss': total_loss})\n        return loss\n\n    def get_pred(self):\n        if self.use_extra_data:\n            bbox_pred, bbox_num, mask_pred, extra_data = self._forward()\n            output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred, 'extra_data': extra_data}\n        else:\n            bbox_pred, bbox_num, mask_pred = self._forward()\n            output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred}\n        return output\n"
  },
  {
    "path": "ppdet/modeling/architectures/meta_arch.py",
    "content": "from __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport typing\n\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling.post_process import nms\n\n__all__ = ['BaseArch']\n\n\n@register\nclass BaseArch(nn.Layer):\n    def __init__(self, data_format='NCHW', use_extra_data=False):\n        super(BaseArch, self).__init__()\n        self.data_format = data_format\n        self.inputs = {}\n        self.fuse_norm = False\n        self.use_extra_data = use_extra_data\n\n    def load_meanstd(self, cfg_transform):\n        scale = 1.\n        mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)\n        std = np.array([0.229, 0.224, 0.225], dtype=np.float32)\n        for item in cfg_transform:\n            if 'NormalizeImage' in item:\n                mean = np.array(\n                    item['NormalizeImage']['mean'], dtype=np.float32)\n                std = np.array(item['NormalizeImage']['std'], dtype=np.float32)\n                if item['NormalizeImage'].get('is_scale', True):\n                    scale = 1. / 255.\n                break\n        if self.data_format == 'NHWC':\n            self.scale = paddle.to_tensor(scale / std).reshape((1, 1, 1, 3))\n            self.bias = paddle.to_tensor(-mean / std).reshape((1, 1, 1, 3))\n        else:\n            self.scale = paddle.to_tensor(scale / std).reshape((1, 3, 1, 1))\n            self.bias = paddle.to_tensor(-mean / std).reshape((1, 3, 1, 1))\n\n    def forward(self, inputs):\n        if self.data_format == 'NHWC':\n            image = inputs['image']\n            inputs['image'] = paddle.transpose(image, [0, 2, 3, 1])\n\n        if self.fuse_norm:\n            image = inputs['image']\n            self.inputs['image'] = image * self.scale + self.bias\n            self.inputs['im_shape'] = inputs['im_shape']\n            self.inputs['scale_factor'] = inputs['scale_factor']\n        else:\n            self.inputs = inputs\n\n        self.model_arch()\n\n        if self.training:\n            out = self.get_loss()\n        else:\n            inputs_list = []\n            # multi-scale input\n            if not isinstance(inputs, typing.Sequence):\n                inputs_list.append(inputs)\n            else:\n                inputs_list.extend(inputs)\n            outs = []\n            for inp in inputs_list:\n                if self.fuse_norm:\n                    self.inputs['image'] = inp['image'] * self.scale + self.bias\n                    self.inputs['im_shape'] = inp['im_shape']\n                    self.inputs['scale_factor'] = inp['scale_factor']\n                else:\n                    self.inputs = inp\n                outs.append(self.get_pred())\n\n            # multi-scale test\n            if len(outs) > 1:\n                out = self.merge_multi_scale_predictions(outs)\n            else:\n                out = outs[0]\n        return out\n\n    def merge_multi_scale_predictions(self, outs):\n        # default values for architectures not included in following list\n        num_classes = 80\n        nms_threshold = 0.5\n        keep_top_k = 100\n\n        if self.__class__.__name__ in ('CascadeRCNN', 'FasterRCNN', 'MaskRCNN'):\n            num_classes = self.bbox_head.num_classes\n            keep_top_k = self.bbox_post_process.nms.keep_top_k\n            nms_threshold = self.bbox_post_process.nms.nms_threshold\n        else:\n            raise Exception(\n                
\"Multi scale test only supports CascadeRCNN, FasterRCNN and MaskRCNN for now\"\n            )\n\n        final_boxes = []\n        all_scale_outs = paddle.concat([o['bbox'] for o in outs]).numpy()\n        for c in range(num_classes):\n            idxs = all_scale_outs[:, 0] == c\n            if np.count_nonzero(idxs) == 0:\n                continue\n            r = nms(all_scale_outs[idxs, 1:], nms_threshold)\n            final_boxes.append(\n                np.concatenate([np.full((r.shape[0], 1), c), r], 1))\n        out = np.concatenate(final_boxes)\n        out = np.concatenate(sorted(\n            out, key=lambda e: e[1])[-keep_top_k:]).reshape((-1, 6))\n        out = {\n            'bbox': paddle.to_tensor(out),\n            'bbox_num': paddle.to_tensor(np.array([out.shape[0], ]))\n        }\n\n        return out\n\n    def build_inputs(self, data, input_def):\n        inputs = {}\n        for i, k in enumerate(input_def):\n            inputs[k] = data[i]\n        return inputs\n\n    def model_arch(self, ):\n        pass\n\n    def get_loss(self, ):\n        raise NotImplementedError(\"Should implement get_loss method!\")\n\n    def get_pred(self, ):\n        raise NotImplementedError(\"Should implement get_pred method!\")\n"
  },
  {
    "path": "ppdet/modeling/architectures/multi_stream_detector.py",
    "content": "from typing import Dict\nfrom collections import OrderedDict\nfrom ppdet.modeling.architectures.meta_arch import BaseArch\n\n\nclass MultiSteamDetector(BaseArch):\n    def __init__(self,\n                 model: Dict[str, BaseArch],\n                 train_cfg=None,\n                 test_cfg=None):\n        super(MultiSteamDetector, self).__init__()\n        self.submodules = list(model.keys())\n        for k, v in model.items():\n            setattr(self, k, v)\n\n        self.train_cfg = train_cfg\n        self.test_cfg = test_cfg\n        self.inference_on = self.test_cfg.get(\"inference_on\",\n                                              self.submodules[0])\n        self.first_load = True\n\n    def forward(self, inputs, return_loss=True, **kwargs):\n        \"\"\"Calls either :func:`forward_train` or :func:`forward_test` depending\n        on whether ``return_loss`` is ``True``.\n\n        Note this setting will change the expected inputs. When\n        ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor\n        and List[dict]), and when ``resturn_loss=False``, img and img_meta\n        should be double nested (i.e.  List[Tensor], List[List[dict]]), with\n        the outer list indicating test time augmentations.\n        \"\"\"\n        if return_loss:\n            return self.forward_train(inputs, **kwargs)\n        else:\n            return self.forward_test(inputs, **kwargs)\n\n    def get_loss(self, **kwargs):\n        # losses = self(**data)\n\n        return self.forward_train(self, **kwargs)\n\n    def model(self, **kwargs) -> BaseArch:\n        if \"submodule\" in kwargs:\n            assert (kwargs[\"submodule\"] in self.submodules\n                    ), \"Detector does not contain submodule {}\".format(kwargs[\n                        \"submodule\"])\n            model: BaseArch = getattr(self, kwargs[\"submodule\"])\n        else:\n            model: BaseArch = getattr(self, self.inference_on)\n        return model\n\n    def freeze(self, model_ref: str):\n        assert model_ref in self.submodules\n        model = getattr(self, model_ref)\n        model.eval()\n        for param in model.parameters():\n            param.stop_gradient = True\n\n    def update_ema_model(self, momentum=0.9996):\n        # print(momentum)\n        model_dict = self.student.state_dict()\n        new_dict = OrderedDict()\n        for key, value in self.teacher.state_dict().items():\n            if key in model_dict.keys():\n                new_dict[key] = (model_dict[key] *\n                                 (1 - momentum) + value * momentum)\n            else:\n                raise Exception(\"{} is not found in student model\".format(key))\n        self.teacher.set_dict(new_dict)\n"
  },
  {
    "path": "ppdet/modeling/architectures/picodet.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\n\n__all__ = ['PicoDet']\n\n\n@register\nclass PicoDet(BaseArch):\n    \"\"\"\n    Generalized Focal Loss network, see https://arxiv.org/abs/2006.04388\n\n    Args:\n        backbone (object): backbone instance\n        neck (object): 'FPN' instance\n        head (object): 'PicoHead' instance\n    \"\"\"\n\n    __category__ = 'architecture'\n\n    def __init__(self, backbone, neck, head='PicoHead', nms_cpu=False):\n        super(PicoDet, self).__init__()\n        self.backbone = backbone\n        self.neck = neck\n        self.head = head\n        self.export_post_process = True\n        self.export_nms = True\n        self.nms_cpu = nms_cpu\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        backbone = create(cfg['backbone'])\n\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = create(cfg['neck'], **kwargs)\n\n        kwargs = {'input_shape': neck.out_shape}\n        head = create(cfg['head'], **kwargs)\n\n        return {\n            'backbone': backbone,\n            'neck': neck,\n            \"head\": head,\n        }\n\n    def _forward(self):\n        body_feats = self.backbone(self.inputs)\n        fpn_feats = self.neck(body_feats)\n        head_outs = self.head(fpn_feats, self.export_post_process)\n        if self.training or not self.export_post_process:\n            return head_outs, None\n        else:\n            scale_factor = self.inputs['scale_factor']\n            bboxes, bbox_num = self.head.post_process(\n                head_outs,\n                scale_factor,\n                export_nms=self.export_nms,\n                nms_cpu=self.nms_cpu)\n            return bboxes, bbox_num\n\n    def get_loss(self, ):\n        loss = {}\n\n        head_outs, _ = self._forward()\n        loss_gfl = self.head.get_loss(head_outs, self.inputs)\n        loss.update(loss_gfl)\n        total_loss = paddle.add_n(list(loss.values()))\n        loss.update({'loss': total_loss})\n        return loss\n\n    def get_pred(self):\n        if not self.export_post_process:\n            return {'picodet': self._forward()[0]}\n        elif self.export_nms:\n            bbox_pred, bbox_num = self._forward()\n            output = {'bbox': bbox_pred, 'bbox_num': bbox_num}\n            return output\n        else:\n            bboxes, mlvl_scores = self._forward()\n            output = {'bbox': bboxes, 'scores': mlvl_scores}\n            return output\n"
  },
  {
    "path": "ppdet/modeling/architectures/pose3d_metro.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. \n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License. \n# You may obtain a copy of the License at \n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\nfrom .. import layers as L\n\n__all__ = ['METRO_Body']\n\n\ndef orthographic_projection(X, camera):\n    \"\"\"Perform orthographic projection of 3D points X using the camera parameters\n    Args:\n        X: size = [B, N, 3]\n        camera: size = [B, 3]\n    Returns:\n        Projected 2D points -- size = [B, N, 2]\n    \"\"\"\n    camera = camera.reshape((-1, 1, 3))\n    X_trans = X[:, :, :2] + camera[:, :, 1:]\n    shape = X_trans.shape\n    X_2d = (camera[:, :, 0] * X_trans.reshape((shape[0], -1))).reshape(shape)\n    return X_2d\n\n\n@register\nclass METRO_Body(BaseArch):\n    __category__ = 'architecture'\n    __inject__ = ['loss']\n\n    def __init__(\n            self,\n            num_joints,\n            backbone='HRNet',\n            trans_encoder='',\n            loss='Pose3DLoss', ):\n        \"\"\"\n        Modified from METRO network, see https://arxiv.org/abs/2012.09760\n\n        Args:\n            backbone (nn.Layer): backbone instance\n        \"\"\"\n        super(METRO_Body, self).__init__()\n        self.num_joints = num_joints\n        self.backbone = backbone\n        self.loss = loss\n        self.deploy = False\n\n        self.trans_encoder = trans_encoder\n        self.conv_learn_tokens = paddle.nn.Conv1D(49, num_joints + 10, 1)\n        self.cam_param_fc = paddle.nn.Linear(3, 2)\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        # backbone\n        backbone = create(cfg['backbone'])\n        trans_encoder = create(cfg['trans_encoder'])\n\n        return {'backbone': backbone, 'trans_encoder': trans_encoder}\n\n    def _forward(self):\n        batch_size = self.inputs['image'].shape[0]\n\n        image_feat = self.backbone(self.inputs)\n        image_feat_flatten = image_feat.reshape((batch_size, 2048, 49))\n        image_feat_flatten = image_feat_flatten.transpose(perm=(0, 2, 1))\n        # and apply a conv layer to learn image token for each 3d joint/vertex position\n        features = self.conv_learn_tokens(image_feat_flatten)  # (B, J, C)\n\n        if self.training:\n            # apply mask vertex/joint modeling\n            # meta_masks is a tensor of all the masks, randomly generated in dataloader\n            # we pre-define a [MASK] token, which is a floating-value vector with 0.01s\n            meta_masks = self.inputs['mjm_mask'].expand((-1, -1, 2048))\n            constant_tensor = paddle.ones_like(features) * 0.01\n            features = features * meta_masks + constant_tensor * (1 - meta_masks\n                                                                  )\n        pred_out = self.trans_encoder(features)\n\n      
  pred_3d_joints = pred_out[:, :self.num_joints, :]\n        cam_features = pred_out[:, self.num_joints:, :]\n\n        # learn camera parameters\n        pred_2d_joints = self.cam_param_fc(cam_features)\n        return pred_3d_joints, pred_2d_joints\n\n    def get_loss(self):\n        preds_3d, preds_2d = self._forward()\n        loss = self.loss(preds_3d, preds_2d, self.inputs)\n        output = {'loss': loss}\n        return output\n\n    def get_pred(self):\n        preds_3d, preds_2d = self._forward()\n        outputs = {'pose3d': preds_3d, 'pose2d': preds_2d}\n        return outputs\n"
  },
  {
    "path": "ppdet/modeling/architectures/ppyoloe.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport copy\nimport paddle\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\n\n__all__ = ['PPYOLOE', 'PPYOLOEWithAuxHead']\n# PP-YOLOE and PP-YOLOE+ are recommended to use this architecture, especially when use distillation or aux head\n# PP-YOLOE and PP-YOLOE+ can also use the same architecture of YOLOv3 in yolo.py when not use distillation or aux head\n\n\n@register\nclass PPYOLOE(BaseArch):\n    \"\"\"\n    PPYOLOE network, see https://arxiv.org/abs/2203.16250\n\n    Args:\n        backbone (nn.Layer): backbone instance\n        neck (nn.Layer): neck instance\n        yolo_head (nn.Layer): anchor_head instance\n        post_process (object): `BBoxPostProcess` instance\n        ssod_loss (object): 'SSODPPYOLOELoss' instance, only used for semi-det(ssod)\n        for_distill (bool): whether for distillation\n        feat_distill_place (str): distill which feature for distillation\n        for_mot (bool): whether return other features for multi-object tracking\n            models, default False in pure object detection models.\n    \"\"\"\n\n    __category__ = 'architecture'\n    __shared__ = ['for_distill']\n    __inject__ = ['post_process', 'ssod_loss']\n\n    def __init__(self,\n                 backbone='CSPResNet',\n                 neck='CustomCSPPAN',\n                 yolo_head='PPYOLOEHead',\n                 post_process='BBoxPostProcess',\n                 ssod_loss='SSODPPYOLOELoss',\n                 for_distill=False,\n                 feat_distill_place='neck_feats',\n                 with_mask=False,\n                 for_mot=False):\n        super(PPYOLOE, self).__init__()\n        self.backbone = backbone\n        self.neck = neck\n        self.yolo_head = yolo_head\n        self.post_process = post_process\n        self.for_mot = for_mot\n        self.with_mask = with_mask\n\n        # for ssod, semi-det\n        self.is_teacher = False\n        self.ssod_loss = ssod_loss\n\n        # distill\n        self.for_distill = for_distill\n        self.feat_distill_place = feat_distill_place\n        if for_distill:\n            assert feat_distill_place in ['backbone_feats', 'neck_feats']\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        backbone = create(cfg['backbone'])\n\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = create(cfg['neck'], **kwargs)\n\n        kwargs = {'input_shape': neck.out_shape}\n        yolo_head = create(cfg['yolo_head'], **kwargs)\n\n        return {\n            'backbone': backbone,\n            'neck': neck,\n            \"yolo_head\": yolo_head,\n        }\n\n    def _forward(self):\n        body_feats = self.backbone(self.inputs)\n        neck_feats = self.neck(body_feats, 
self.for_mot)\n\n        self.is_teacher = self.inputs.get('is_teacher', False)  # for semi-det\n        if self.training or self.is_teacher:\n            yolo_losses = self.yolo_head(neck_feats, self.inputs)\n\n            if self.for_distill:\n                if self.feat_distill_place == 'backbone_feats':\n                    self.yolo_head.distill_pairs['backbone_feats'] = body_feats\n                elif self.feat_distill_place == 'neck_feats':\n                    self.yolo_head.distill_pairs['neck_feats'] = neck_feats\n                else:\n                    raise ValueError\n            return yolo_losses\n        else:\n            yolo_head_outs = self.yolo_head(neck_feats)\n\n            if self.post_process is not None:\n                bbox, bbox_num, nms_keep_idx = self.post_process(\n                    yolo_head_outs, self.yolo_head.mask_anchors,\n                    self.inputs['im_shape'], self.inputs['scale_factor'])\n            else:\n                if not self.with_mask:\n                    bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process(\n                        yolo_head_outs, self.inputs['scale_factor'])\n                else:\n                    bbox, bbox_num, mask, nms_keep_idx = self.yolo_head.post_process(\n                        yolo_head_outs,\n                        im_shape=self.inputs['im_shape'],\n                        scale_factor=self.inputs['scale_factor'],\n                        infer_shape=self.inputs['image'].shape[2:])\n\n            output = {'bbox': bbox, 'bbox_num': bbox_num}\n            if self.with_mask:\n                output['mask'] = mask\n\n            return output\n\n    def get_loss(self):\n        return self._forward()\n\n    def get_pred(self):\n        return self._forward()\n\n    def get_loss_keys(self):\n        return ['loss_cls', 'loss_iou', 'loss_dfl', 'loss_contrast']\n\n    def get_ssod_loss(self, student_head_outs, teacher_head_outs, train_cfg):\n        ssod_losses = self.ssod_loss(student_head_outs, teacher_head_outs,\n                                     train_cfg)\n        return ssod_losses\n\n\n@register\nclass PPYOLOEWithAuxHead(BaseArch):\n    __category__ = 'architecture'\n    __inject__ = ['post_process']\n\n    def __init__(self,\n                 backbone='CSPResNet',\n                 neck='CustomCSPPAN',\n                 yolo_head='PPYOLOEHead',\n                 aux_head='SimpleConvHead',\n                 post_process='BBoxPostProcess',\n                 for_mot=False,\n                 detach_epoch=5):\n        \"\"\"\n        PPYOLOE network, see https://arxiv.org/abs/2203.16250\n\n        Args:\n            backbone (nn.Layer): backbone instance\n            neck (nn.Layer): neck instance\n            yolo_head (nn.Layer): anchor_head instance\n            post_process (object): `BBoxPostProcess` instance\n            for_mot (bool): whether return other features for multi-object tracking\n                models, default False in pure object detection models.\n        \"\"\"\n        super(PPYOLOEWithAuxHead, self).__init__()\n        self.backbone = backbone\n        self.neck = neck\n        self.aux_neck = copy.deepcopy(self.neck)\n\n        self.yolo_head = yolo_head\n        self.aux_head = aux_head\n        self.post_process = post_process\n        self.for_mot = for_mot\n        self.detach_epoch = detach_epoch\n\n    @classmethod\n 
   def from_config(cls, cfg, *args, **kwargs):\n        # backbone\n        backbone = create(cfg['backbone'])\n\n        # fpn\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = create(cfg['neck'], **kwargs)\n        aux_neck = copy.deepcopy(neck)\n\n        # head\n        kwargs = {'input_shape': neck.out_shape}\n        yolo_head = create(cfg['yolo_head'], **kwargs)\n        aux_head = create(cfg['aux_head'], **kwargs)\n\n        return {\n            'backbone': backbone,\n            'neck': neck,\n            \"yolo_head\": yolo_head,\n            'aux_head': aux_head,\n        }\n\n    def _forward(self):\n        body_feats = self.backbone(self.inputs)\n        neck_feats = self.neck(body_feats, self.for_mot)\n\n        if self.training:\n            if self.inputs['epoch_id'] >= self.detach_epoch:\n                aux_neck_feats = self.aux_neck([f.detach() for f in body_feats])\n                dual_neck_feats = (paddle.concat(\n                    [f.detach(), aux_f], axis=1) for f, aux_f in\n                                   zip(neck_feats, aux_neck_feats))\n            else:\n                aux_neck_feats = self.aux_neck(body_feats)\n                dual_neck_feats = (paddle.concat(\n                    [f, aux_f], axis=1) for f, aux_f in\n                                   zip(neck_feats, aux_neck_feats))\n            aux_cls_scores, aux_bbox_preds = self.aux_head(dual_neck_feats)\n            loss = self.yolo_head(\n                neck_feats,\n                self.inputs,\n                aux_pred=[aux_cls_scores, aux_bbox_preds])\n            return loss\n        else:\n            yolo_head_outs = self.yolo_head(neck_feats)\n            if self.post_process is not None:\n                bbox, bbox_num = self.post_process(\n                    yolo_head_outs, self.yolo_head.mask_anchors,\n                    self.inputs['im_shape'], self.inputs['scale_factor'])\n            else:\n                bbox, bbox_num = self.yolo_head.post_process(\n                    yolo_head_outs, self.inputs['scale_factor'])\n            output = {'bbox': bbox, 'bbox_num': bbox_num}\n\n            return output\n\n    def get_loss(self):\n        return self._forward()\n\n    def get_pred(self):\n        return self._forward()"
  },
  {
    "path": "ppdet/modeling/architectures/queryinst.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\n\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\n\n__all__ = ['QueryInst']\n\n\n@register\nclass QueryInst(BaseArch):\n    __category__ = 'architecture'\n    __inject__ = ['post_process']\n\n    def __init__(self,\n                 backbone,\n                 neck,\n                 rpn_head,\n                 roi_head,\n                 post_process='SparsePostProcess'):\n        super(QueryInst, self).__init__()\n        self.backbone = backbone\n        self.neck = neck\n        self.rpn_head = rpn_head\n        self.roi_head = roi_head\n        self.post_process = post_process\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        backbone = create(cfg['backbone'])\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = create(cfg['neck'], **kwargs)\n\n        kwargs = {'input_shape': neck.out_shape}\n        rpn_head = create(cfg['rpn_head'], **kwargs)\n        roi_head = create(cfg['roi_head'], **kwargs)\n\n        return {\n            'backbone': backbone,\n            'neck': neck,\n            'rpn_head': rpn_head,\n            \"roi_head\": roi_head\n        }\n\n    def _forward(self, targets=None):\n        features = self.backbone(self.inputs)\n        features = self.neck(features)\n\n        proposal_bboxes, proposal_features = self.rpn_head(self.inputs[\n            'img_whwh'])\n        outputs = self.roi_head(features, proposal_bboxes, proposal_features,\n                                targets)\n\n        if self.training:\n            return outputs\n        else:\n            bbox_pred, bbox_num, mask_pred = self.post_process(\n                outputs['class_logits'], outputs['bbox_pred'],\n                self.inputs['scale_factor_whwh'], self.inputs['ori_shape'],\n                outputs['mask_logits'])\n            return bbox_pred, bbox_num, mask_pred\n\n    def get_loss(self):\n        targets = []\n        for i in range(len(self.inputs['img_whwh'])):\n            boxes = self.inputs['gt_bbox'][i]\n            labels = self.inputs['gt_class'][i].squeeze(-1)\n            img_whwh = self.inputs['img_whwh'][i]\n            if boxes.shape[0] != 0:\n                img_whwh_tgt = img_whwh.unsqueeze(0).tile([boxes.shape[0], 1])\n            else:\n                img_whwh_tgt = paddle.zeros_like(boxes)\n            gt_segm = self.inputs['gt_segm'][i].astype('float32')\n            targets.append({\n                'boxes': boxes,\n                'labels': labels,\n                'img_whwh': img_whwh,\n                'img_whwh_tgt': img_whwh_tgt,\n                'gt_segm': gt_segm\n            })\n        losses = self._forward(targets)\n        losses.update({'loss': sum(losses.values())})\n        return losses\n\n    def 
get_pred(self):\n        bbox_pred, bbox_num, mask_pred = self._forward()\n        return {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred}\n"
  },
  {
    "path": "ppdet/modeling/architectures/retinanet.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\nimport paddle\nimport paddle.nn.functional as F\n\n__all__ = ['RetinaNet']\n\n\n@register\nclass RetinaNet(BaseArch):\n    __category__ = 'architecture'\n\n    def __init__(self, backbone, neck, head):\n        super(RetinaNet, self).__init__()\n        self.backbone = backbone\n        self.neck = neck\n        self.head = head\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        backbone = create(cfg['backbone'])\n\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = create(cfg['neck'], **kwargs)\n\n        kwargs = {'input_shape': neck.out_shape}\n        head = create(cfg['head'], **kwargs)\n\n        return {\n            'backbone': backbone,\n            'neck': neck,\n            'head': head,\n        }\n\n    def _forward(self):\n        body_feats = self.backbone(self.inputs)\n        neck_feats = self.neck(body_feats)\n\n        if self.training:\n            return self.head(neck_feats, self.inputs)\n        else:\n            head_outs = self.head(neck_feats)\n            bbox, bbox_num, nms_keep_idx = self.head.post_process(\n                head_outs, self.inputs['im_shape'], self.inputs['scale_factor'])\n\n            if self.use_extra_data:\n                extra_data = {}  # record the bbox output before nms, such like scores and nms_keep_idx\n                \"\"\"extra_data:{\n                            'scores': predict scores,\n                            'nms_keep_idx': bbox index before nms,\n                           }\n                           \"\"\"\n                preds_logits = self.head.decode_cls_logits(head_outs[0])\n                preds_scores = F.sigmoid(preds_logits)\n                extra_data['logits'] = preds_logits\n                extra_data['scores'] = preds_scores\n                extra_data['nms_keep_idx'] = nms_keep_idx  # bbox index before nms\n                return {'bbox': bbox, 'bbox_num': bbox_num, \"extra_data\": extra_data}\n            else:\n                return {'bbox': bbox, 'bbox_num': bbox_num}\n\n    def get_loss(self):\n        return self._forward()\n\n    def get_pred(self):\n        return self._forward()\n"
  },
  {
    "path": "ppdet/modeling/architectures/rtdetrv3.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nfrom .meta_arch import BaseArch\nfrom ppdet.core.workspace import register, create\n\n__all__ = ['RTDETRV3']\n# Deformable DETR, DINO use the same architecture as DETR\n\n\n@register\nclass RTDETRV3(BaseArch):\n    __category__ = 'architecture'\n    __inject__ = ['post_process', 'post_process_semi']\n    __shared__ = ['with_mask', 'exclude_post_process']\n\n    def __init__(self,\n                 backbone,\n                 transformer='DETRTransformer',\n                 detr_head='DETRHead',\n                 neck=None,\n                 aux_o2m_head=None,\n                 post_process='DETRPostProcess',\n                 post_process_semi=None,\n                 with_mask=False,\n                 exclude_post_process=False):\n        super(RTDETRV3, self).__init__()\n        self.backbone = backbone\n        self.transformer = transformer\n        self.detr_head = detr_head\n        self.neck = neck\n        self.aux_o2m_head = aux_o2m_head\n        self.post_process = post_process\n        self.with_mask = with_mask\n        self.exclude_post_process = exclude_post_process\n        self.post_process_semi = post_process_semi\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        # backbone\n        backbone = create(cfg['backbone'])\n        # neck\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = create(cfg['neck'], **kwargs) if cfg['neck'] else None\n\n        # transformer\n        if neck is not None:\n            kwargs = {'input_shape': neck.out_shape}\n        transformer = create(cfg['transformer'], **kwargs)\n        # head\n        kwargs = {\n            'hidden_dim': transformer.hidden_dim,\n            'nhead': transformer.nhead,\n            'input_shape': backbone.out_shape\n        }\n        detr_head = create(cfg['detr_head'], **kwargs)\n\n        kwargs = {'input_shape': neck.out_shape}\n        aux_o2m_head = create(cfg['aux_o2m_head'], **kwargs)\n\n        return {\n            'backbone': backbone,\n            'transformer': transformer,\n            \"detr_head\": detr_head,\n            \"neck\": neck,\n            \"aux_o2m_head\": aux_o2m_head\n        }\n\n    def _forward(self):\n        # Backbone\n        body_feats = self.backbone(self.inputs)\n\n        # Neck\n        if self.neck is not None:\n            body_feats = self.neck(body_feats)\n\n        # Transformer\n        pad_mask = self.inputs.get('pad_mask', None)\n        out_transformer = self.transformer(body_feats, pad_mask, self.inputs)\n\n        # DETR Head\n        if self.training:\n            detr_losses = self.detr_head(out_transformer, body_feats,\n                                         self.inputs)\n            detr_losses.update({\n                'loss': paddle.add_n(\n          
            detr_losses = self.detr_head(out_transformer, body_feats,\n                                         self.inputs)\n            detr_losses.update({\n                'loss': paddle.add_n(\n                    [v for k, v in detr_losses.items() if 'log' not in k])\n            })\n            if self.aux_o2m_head is not None:\n                aux_o2m_losses = self.aux_o2m_head(body_feats, self.inputs)\n                for k, v in aux_o2m_losses.items():\n                    if k == 'loss':\n                        # accumulate the auxiliary total loss into the main one\n                        detr_losses[k] += v\n                    detr_losses[k + '_aux_o2m'] = v\n            return detr_losses\n        else:\n            preds = self.detr_head(out_transformer, body_feats)\n            if self.exclude_post_process:\n                bbox, bbox_num, mask = preds\n            else:\n                bbox, bbox_num, mask = self.post_process(\n                    preds, self.inputs['im_shape'], self.inputs['scale_factor'],\n                    self.inputs['image'].shape[2:])\n\n                # aux_o2m_outs = self.aux_o2m_head(body_feats)\n                # bbox, bbox_num, nms_keep_idx = self.aux_o2m_head.post_process(\n                #         aux_o2m_outs, self.inputs['scale_factor'])\n\n            output = {'bbox': bbox, 'bbox_num': bbox_num}\n            if self.with_mask:\n                output['mask'] = mask\n            return output\n\n    def get_loss(self):\n        return self._forward()\n\n    def get_pred(self):\n        return self._forward()\n"
  },
  {
    "path": "ppdet/modeling/architectures/s2anet.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\n\n__all__ = ['S2ANet']\n\n\n@register\nclass S2ANet(BaseArch):\n    __category__ = 'architecture'\n    __inject__ = ['head']\n\n    def __init__(self, backbone, neck, head):\n        \"\"\"\n        S2ANet, see https://arxiv.org/pdf/2008.09397.pdf\n\n        Args:\n            backbone (object): backbone instance\n            neck (object): `FPN` instance\n            head (object): `Head` instance\n        \"\"\"\n        super(S2ANet, self).__init__()\n        self.backbone = backbone\n        self.neck = neck\n        self.s2anet_head = head\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        backbone = create(cfg['backbone'])\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = cfg['neck'] and create(cfg['neck'], **kwargs)\n\n        out_shape = neck and neck.out_shape or backbone.out_shape\n        kwargs = {'input_shape': out_shape}\n        head = create(cfg['head'], **kwargs)\n\n        return {'backbone': backbone, 'neck': neck, \"head\": head}\n\n    def _forward(self):\n        body_feats = self.backbone(self.inputs)\n        if self.neck is not None:\n            body_feats = self.neck(body_feats)\n        if self.training:\n            loss = self.s2anet_head(body_feats, self.inputs)\n            return loss\n        else:\n            head_outs = self.s2anet_head(body_feats)\n            # post_process\n            bboxes, bbox_num = self.s2anet_head.get_bboxes(head_outs)\n            # rescale the prediction back to origin image\n            im_shape = self.inputs['im_shape']\n            scale_factor = self.inputs['scale_factor']\n            bboxes = self.s2anet_head.get_pred(bboxes, bbox_num, im_shape,\n                                               scale_factor)\n            # output\n            output = {'bbox': bboxes, 'bbox_num': bbox_num}\n            return output\n\n    def get_loss(self, ):\n        loss = self._forward()\n        return loss\n\n    def get_pred(self):\n        output = self._forward()\n        return output\n"
  },
  {
    "path": "ppdet/modeling/architectures/solov2.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\n\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\n\n__all__ = ['SOLOv2']\n\n\n@register\nclass SOLOv2(BaseArch):\n    \"\"\"\n    SOLOv2 network, see https://arxiv.org/abs/2003.10152\n\n    Args:\n        backbone (object): an backbone instance\n        solov2_head (object): an `SOLOv2Head` instance\n        mask_head (object): an `SOLOv2MaskHead` instance\n        neck (object): neck of network, such as feature pyramid network instance\n    \"\"\"\n\n    __category__ = 'architecture'\n\n    def __init__(self, backbone, solov2_head, mask_head, neck=None):\n        super(SOLOv2, self).__init__()\n        self.backbone = backbone\n        self.neck = neck\n        self.solov2_head = solov2_head\n        self.mask_head = mask_head\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        backbone = create(cfg['backbone'])\n\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = create(cfg['neck'], **kwargs)\n\n        kwargs = {'input_shape': neck.out_shape}\n        solov2_head = create(cfg['solov2_head'], **kwargs)\n        mask_head = create(cfg['mask_head'], **kwargs)\n\n        return {\n            'backbone': backbone,\n            'neck': neck,\n            'solov2_head': solov2_head,\n            'mask_head': mask_head,\n        }\n\n    def model_arch(self):\n        body_feats = self.backbone(self.inputs)\n\n        body_feats = self.neck(body_feats)\n\n        self.seg_pred = self.mask_head(body_feats)\n\n        self.cate_pred_list, self.kernel_pred_list = self.solov2_head(\n            body_feats)\n\n    def get_loss(self, ):\n        loss = {}\n        # get gt_ins_labels, gt_cate_labels, etc.\n        gt_ins_labels, gt_cate_labels, gt_grid_orders = [], [], []\n        fg_num = self.inputs['fg_num']\n        for i in range(len(self.solov2_head.seg_num_grids)):\n            ins_label = 'ins_label{}'.format(i)\n            if ins_label in self.inputs:\n                gt_ins_labels.append(self.inputs[ins_label])\n            cate_label = 'cate_label{}'.format(i)\n            if cate_label in self.inputs:\n                gt_cate_labels.append(self.inputs[cate_label])\n            grid_order = 'grid_order{}'.format(i)\n            if grid_order in self.inputs:\n                gt_grid_orders.append(self.inputs[grid_order])\n\n        loss_solov2 = self.solov2_head.get_loss(\n            self.cate_pred_list, self.kernel_pred_list, self.seg_pred,\n            gt_ins_labels, gt_cate_labels, gt_grid_orders, fg_num)\n        loss.update(loss_solov2)\n        total_loss = paddle.add_n(list(loss.values()))\n        loss.update({'loss': total_loss})\n        return loss\n\n    def get_pred(self):\n        seg_masks, cate_labels, cate_scores, bbox_num = 
self.solov2_head.get_prediction(\n            self.cate_pred_list, self.kernel_pred_list, self.seg_pred,\n            self.inputs['im_shape'], self.inputs['scale_factor'])\n        outs = {\n            \"segm\": seg_masks,\n            \"bbox_num\": bbox_num,\n            'cate_label': cate_labels,\n            'cate_score': cate_scores\n        }\n        return outs\n"
  },
  {
    "path": "ppdet/modeling/architectures/sparse_rcnn.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\n\n__all__ = [\"SparseRCNN\"]\n\n\n@register\nclass SparseRCNN(BaseArch):\n    __category__ = 'architecture'\n    __inject__ = [\"postprocess\"]\n\n    def __init__(self,\n                 backbone,\n                 neck,\n                 head=\"SparsercnnHead\",\n                 postprocess=\"SparsePostProcess\"):\n        super(SparseRCNN, self).__init__()\n        self.backbone = backbone\n        self.neck = neck\n        self.head = head\n        self.postprocess = postprocess\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        backbone = create(cfg['backbone'])\n\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = create(cfg['neck'], **kwargs)\n\n        kwargs = {'roi_input_shape': neck.out_shape}\n        head = create(cfg['head'], **kwargs)\n\n        return {\n            'backbone': backbone,\n            'neck': neck,\n            \"head\": head,\n        }\n\n    def _forward(self):\n        body_feats = self.backbone(self.inputs)\n        fpn_feats = self.neck(body_feats)\n        head_outs = self.head(fpn_feats, self.inputs[\"img_whwh\"])\n\n        if not self.training:\n            bbox_pred, bbox_num = self.postprocess(\n                head_outs[\"pred_logits\"], head_outs[\"pred_boxes\"],\n                self.inputs[\"scale_factor_whwh\"], self.inputs[\"ori_shape\"])\n            return bbox_pred, bbox_num\n        else:\n            return head_outs\n\n    def get_loss(self):\n        batch_gt_class = self.inputs[\"gt_class\"]\n        batch_gt_box = self.inputs[\"gt_bbox\"]\n        batch_whwh = self.inputs[\"img_whwh\"]\n        targets = []\n\n        for i in range(len(batch_gt_class)):\n            boxes = batch_gt_box[i]\n            labels = batch_gt_class[i].squeeze(-1)\n            img_whwh = batch_whwh[i]\n            img_whwh_tgt = img_whwh.unsqueeze(0).tile([int(boxes.shape[0]), 1])\n            targets.append({\n                \"boxes\": boxes,\n                \"labels\": labels,\n                \"img_whwh\": img_whwh,\n                \"img_whwh_tgt\": img_whwh_tgt\n            })\n\n        outputs = self._forward()\n        loss_dict = self.head.get_loss(outputs, targets)\n        acc = loss_dict[\"acc\"]\n        loss_dict.pop(\"acc\")\n        total_loss = sum(loss_dict.values())\n        loss_dict.update({\"loss\": total_loss, \"acc\": acc})\n        return loss_dict\n\n    def get_pred(self):\n        bbox_pred, bbox_num = self._forward()\n        output = {'bbox': bbox_pred, 'bbox_num': bbox_num}\n        return output\n"
  },
  {
    "path": "ppdet/modeling/architectures/ssd.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\nimport paddle\nimport paddle.nn.functional as F\n\n__all__ = ['SSD']\n\n\n@register\nclass SSD(BaseArch):\n    \"\"\"\n    Single Shot MultiBox Detector, see https://arxiv.org/abs/1512.02325\n\n    Args:\n        backbone (nn.Layer): backbone instance\n        ssd_head (nn.Layer): `SSDHead` instance\n        post_process (object): `BBoxPostProcess` instance\n    \"\"\"\n\n    __category__ = 'architecture'\n    __inject__ = ['post_process']\n\n    def __init__(self, backbone, ssd_head, post_process, r34_backbone=False):\n        super(SSD, self).__init__()\n        self.backbone = backbone\n        self.ssd_head = ssd_head\n        self.post_process = post_process\n        self.r34_backbone = r34_backbone\n        if self.r34_backbone:\n            from ppdet.modeling.backbones.resnet import ResNet\n            assert isinstance(self.backbone, ResNet) and \\\n                   self.backbone.depth == 34, \\\n                \"If you set r34_backbone=True, please use ResNet-34 as backbone.\"\n            self.backbone.res_layers[2].blocks[0].branch2a.conv._stride = [1, 1]\n            self.backbone.res_layers[2].blocks[0].short.conv._stride = [1, 1]\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        # backbone\n        backbone = create(cfg['backbone'])\n\n        # head\n        kwargs = {'input_shape': backbone.out_shape}\n        ssd_head = create(cfg['ssd_head'], **kwargs)\n\n        return {\n            'backbone': backbone,\n            \"ssd_head\": ssd_head,\n        }\n\n    def _forward(self):\n        # Backbone\n        body_feats = self.backbone(self.inputs)\n\n        # SSD Head\n        if self.training:\n            return self.ssd_head(body_feats, self.inputs['image'],\n                                 self.inputs['gt_bbox'],\n                                 self.inputs['gt_class'])\n        else:\n            preds, anchors = self.ssd_head(body_feats, self.inputs['image'])\n            bbox, bbox_num, nms_keep_idx = self.post_process(\n                preds, anchors, self.inputs['im_shape'],\n                self.inputs['scale_factor'])\n\n            if self.use_extra_data:\n                extra_data = {}  # record the bbox output before nms, such like scores and nms_keep_idx\n                \"\"\"extra_data:{\n                            'scores': predict scores,\n                            'nms_keep_idx': bbox index before nms,\n                           }\n                           \"\"\"\n                preds_logits = preds[1]  # [[1xNumBBoxNumClass]]\n                extra_data['scores'] = F.softmax(paddle.concat(\n                    preds_logits, axis=1)).transpose([0, 2, 1])\n                
                extra_data['logits'] = paddle.concat(\n                    preds_logits, axis=1).transpose([0, 2, 1])\n                extra_data['nms_keep_idx'] = nms_keep_idx  # bbox index before nms\n                return bbox, bbox_num, extra_data\n            else:\n                return bbox, bbox_num\n\n    def get_loss(self):\n        return {\"loss\": self._forward()}\n\n    def get_pred(self):\n        if self.use_extra_data:\n            bbox_pred, bbox_num, extra_data = self._forward()\n            output = {\n                \"bbox\": bbox_pred,\n                \"bbox_num\": bbox_num,\n                \"extra_data\": extra_data\n            }\n        else:\n            bbox_pred, bbox_num = self._forward()\n            output = {\n                \"bbox\": bbox_pred,\n                \"bbox_num\": bbox_num,\n            }\n        return output\n"
  },
  {
    "path": "ppdet/modeling/architectures/tood.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\n\n__all__ = ['TOOD']\n\n\n@register\nclass TOOD(BaseArch):\n    \"\"\"\n    TOOD: Task-aligned One-stage Object Detection, see https://arxiv.org/abs/2108.07755\n    Args:\n        backbone (nn.Layer): backbone instance\n        neck (nn.Layer): 'FPN' instance\n        head (nn.Layer): 'TOODHead' instance\n    \"\"\"\n\n    __category__ = 'architecture'\n\n    def __init__(self, backbone, neck, head):\n        super(TOOD, self).__init__()\n        self.backbone = backbone\n        self.neck = neck\n        self.head = head\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        backbone = create(cfg['backbone'])\n\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = create(cfg['neck'], **kwargs)\n\n        kwargs = {'input_shape': neck.out_shape}\n        head = create(cfg['head'], **kwargs)\n\n        return {\n            'backbone': backbone,\n            'neck': neck,\n            \"head\": head,\n        }\n\n    def _forward(self):\n        body_feats = self.backbone(self.inputs)\n        fpn_feats = self.neck(body_feats)\n        head_outs = self.head(fpn_feats)\n        if not self.training:\n            bboxes, bbox_num = self.head.post_process(\n                head_outs, self.inputs['im_shape'], self.inputs['scale_factor'])\n            return bboxes, bbox_num\n        else:\n            loss = self.head.get_loss(head_outs, self.inputs)\n            return loss\n\n    def get_loss(self):\n        return self._forward()\n\n    def get_pred(self):\n        bbox_pred, bbox_num = self._forward()\n        output = {'bbox': bbox_pred, 'bbox_num': bbox_num}\n        return output\n"
  },
  {
    "path": "ppdet/modeling/architectures/ttfnet.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\n\n__all__ = ['TTFNet']\n\n\n@register\nclass TTFNet(BaseArch):\n    \"\"\"\n    TTFNet network, see https://arxiv.org/abs/1909.00700\n\n    Args:\n        backbone (object): backbone instance\n        neck (object): 'TTFFPN' instance\n        ttf_head (object): 'TTFHead' instance\n        post_process (object): 'BBoxPostProcess' instance\n    \"\"\"\n\n    __category__ = 'architecture'\n    __inject__ = ['post_process']\n\n    def __init__(self,\n                 backbone='DarkNet',\n                 neck='TTFFPN',\n                 ttf_head='TTFHead',\n                 post_process='BBoxPostProcess'):\n        super(TTFNet, self).__init__()\n        self.backbone = backbone\n        self.neck = neck\n        self.ttf_head = ttf_head\n        self.post_process = post_process\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        backbone = create(cfg['backbone'])\n\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = create(cfg['neck'], **kwargs)\n\n        kwargs = {'input_shape': neck.out_shape}\n        ttf_head = create(cfg['ttf_head'], **kwargs)\n\n        return {\n            'backbone': backbone,\n            'neck': neck,\n            \"ttf_head\": ttf_head,\n        }\n\n    def _forward(self):\n        body_feats = self.backbone(self.inputs)\n        body_feats = self.neck(body_feats)\n        hm, wh = self.ttf_head(body_feats)\n        if self.training:\n            return hm, wh\n        else:\n            bbox, bbox_num = self.post_process(hm, wh, self.inputs['im_shape'],\n                                               self.inputs['scale_factor'])\n            return bbox, bbox_num\n\n    def get_loss(self, ):\n        loss = {}\n        heatmap = self.inputs['ttf_heatmap']\n        box_target = self.inputs['ttf_box_target']\n        reg_weight = self.inputs['ttf_reg_weight']\n        hm, wh = self._forward()\n        head_loss = self.ttf_head.get_loss(hm, wh, heatmap, box_target,\n                                           reg_weight)\n        loss.update(head_loss)\n        total_loss = paddle.add_n(list(loss.values()))\n        loss.update({'loss': total_loss})\n        return loss\n\n    def get_pred(self):\n        bbox_pred, bbox_num = self._forward()\n        output = {\n            \"bbox\": bbox_pred,\n            \"bbox_num\": bbox_num,\n        }\n        return output\n"
  },
  {
    "path": "ppdet/modeling/architectures/yolo.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\nfrom ..post_process import JDEBBoxPostProcess\n\n__all__ = ['YOLOv3']\n# YOLOv3,PP-YOLO,PP-YOLOv2,PP-YOLOE,PP-YOLOE+ use the same architecture as YOLOv3\n# PP-YOLOE and PP-YOLOE+ are recommended to use PPYOLOE architecture in ppyoloe.py, especially when use distillation or aux head\n\n\n@register\nclass YOLOv3(BaseArch):\n    __category__ = 'architecture'\n    __shared__ = ['data_format']\n    __inject__ = ['post_process']\n\n    def __init__(self,\n                 backbone='DarkNet',\n                 neck='YOLOv3FPN',\n                 yolo_head='YOLOv3Head',\n                 post_process='BBoxPostProcess',\n                 data_format='NCHW',\n                 for_mot=False):\n        \"\"\"\n        YOLOv3 network, see https://arxiv.org/abs/1804.02767\n\n        Args:\n            backbone (nn.Layer): backbone instance\n            neck (nn.Layer): neck instance\n            yolo_head (nn.Layer): anchor_head instance\n            bbox_post_process (object): `BBoxPostProcess` instance\n            data_format (str): data format, NCHW or NHWC\n            for_mot (bool): whether return other features for multi-object tracking\n                models, default False in pure object detection models.\n        \"\"\"\n        super(YOLOv3, self).__init__(data_format=data_format)\n        self.backbone = backbone\n        self.neck = neck\n        self.yolo_head = yolo_head\n        self.post_process = post_process\n        self.for_mot = for_mot\n        self.return_idx = isinstance(post_process, JDEBBoxPostProcess)\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        # backbone\n        backbone = create(cfg['backbone'])\n\n        # fpn\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = create(cfg['neck'], **kwargs)\n\n        # head\n        kwargs = {'input_shape': neck.out_shape}\n        yolo_head = create(cfg['yolo_head'], **kwargs)\n\n        return {\n            'backbone': backbone,\n            'neck': neck,\n            \"yolo_head\": yolo_head,\n        }\n\n    def _forward(self):\n        body_feats = self.backbone(self.inputs)\n        if self.for_mot:\n            neck_feats = self.neck(body_feats, self.for_mot)\n        else:\n            neck_feats = self.neck(body_feats)\n\n        if isinstance(neck_feats, dict):\n            assert self.for_mot == True\n            emb_feats = neck_feats['emb_feats']\n            neck_feats = neck_feats['yolo_feats']\n\n        if self.training:\n            yolo_losses = self.yolo_head(neck_feats, self.inputs)\n\n            if self.for_mot:\n                return {'det_losses': yolo_losses, 'emb_feats': emb_feats}\n            
            else:\n                return yolo_losses\n\n        else:\n            yolo_head_outs = self.yolo_head(neck_feats)\n\n            if self.for_mot:\n                # the detection part of JDE MOT model\n                boxes_idx, bbox, bbox_num, nms_keep_idx = self.post_process(\n                    yolo_head_outs, self.yolo_head.mask_anchors)\n                output = {\n                    'bbox': bbox,\n                    'bbox_num': bbox_num,\n                    'boxes_idx': boxes_idx,\n                    'nms_keep_idx': nms_keep_idx,\n                    'emb_feats': emb_feats,\n                }\n            else:\n                if self.return_idx:\n                    # the detection part of JDE MOT model\n                    _, bbox, bbox_num, nms_keep_idx = self.post_process(\n                        yolo_head_outs, self.yolo_head.mask_anchors)\n                elif self.post_process is not None:\n                    # anchor based YOLOs: YOLOv3, PP-YOLO, PP-YOLOv2 use mask_anchors\n                    bbox, bbox_num, nms_keep_idx = self.post_process(\n                        yolo_head_outs, self.yolo_head.mask_anchors,\n                        self.inputs['im_shape'], self.inputs['scale_factor'])\n                else:\n                    # anchor free YOLOs: PP-YOLOE, PP-YOLOE+\n                    bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process(\n                        yolo_head_outs, self.inputs['scale_factor'])\n\n                if self.use_extra_data:\n                    extra_data = {}  # record the bbox output before nms, such as scores and nms_keep_idx\n                    \"\"\"extra_data:{\n                                'scores': predict scores,\n                                'nms_keep_idx': bbox index before nms,\n                               }\n                    \"\"\"\n                    extra_data['scores'] = yolo_head_outs[0]  # predict scores (probability)\n                    # TODO: get logits output\n                    extra_data['nms_keep_idx'] = nms_keep_idx\n                    # TODO: support mask_anchors-based YOLOs\n                    output = {'bbox': bbox, 'bbox_num': bbox_num, 'extra_data': extra_data}\n                else:\n                    output = {'bbox': bbox, 'bbox_num': bbox_num}\n\n            return output\n\n    def get_loss(self):\n        return self._forward()\n\n    def get_pred(self):\n        return self._forward()\n"
  },
  {
    "path": "ppdet/modeling/architectures/yolof.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\n\n__all__ = ['YOLOF']\n\n\n@register\nclass YOLOF(BaseArch):\n    __category__ = 'architecture'\n\n    def __init__(self,\n                 backbone='ResNet',\n                 neck='DilatedEncoder',\n                 head='YOLOFHead',\n                 for_mot=False):\n        \"\"\"\n        YOLOF network, see https://arxiv.org/abs/2103.09460\n\n        Args:\n            backbone (nn.Layer): backbone instance\n            neck (nn.Layer): DilatedEncoder instance\n            head (nn.Layer): YOLOFHead instance\n            for_mot (bool): whether return other features for multi-object tracking\n                models, default False in pure object detection models.\n        \"\"\"\n        super(YOLOF, self).__init__()\n        self.backbone = backbone\n        self.neck = neck\n        self.head = head\n        self.for_mot = for_mot\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        # backbone\n        backbone = create(cfg['backbone'])\n\n        # fpn\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = create(cfg['neck'], **kwargs)\n\n        # head\n        kwargs = {'input_shape': neck.out_shape}\n        head = create(cfg['head'], **kwargs)\n\n        return {\n            'backbone': backbone,\n            'neck': neck,\n            \"head\": head,\n        }\n\n    def _forward(self):\n        body_feats = self.backbone(self.inputs)\n        neck_feats = self.neck(body_feats, self.for_mot)\n\n        if self.training:\n            yolo_losses = self.head(neck_feats, self.inputs)\n            return yolo_losses\n        else:\n            yolo_head_outs = self.head(neck_feats)\n            bbox, bbox_num = self.head.post_process(yolo_head_outs,\n                                                    self.inputs['im_shape'],\n                                                    self.inputs['scale_factor'])\n            output = {'bbox': bbox, 'bbox_num': bbox_num}\n            return output\n\n    def get_loss(self):\n        return self._forward()\n\n    def get_pred(self):\n        return self._forward()\n"
  },
  {
    "path": "ppdet/modeling/architectures/yolox.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom ppdet.core.workspace import register, create\nfrom .meta_arch import BaseArch\n\nimport random\nimport paddle\nimport paddle.nn.functional as F\nimport paddle.distributed as dist\n\n__all__ = ['YOLOX']\n\n\n@register\nclass YOLOX(BaseArch):\n    \"\"\"\n    YOLOX network, see https://arxiv.org/abs/2107.08430\n\n    Args:\n        backbone (nn.Layer): backbone instance\n        neck (nn.Layer): neck instance\n        head (nn.Layer): head instance\n        for_mot (bool): whether used for MOT or not\n        input_size (list[int]): initial scale, will be reset by self._preprocess()\n        size_stride (int): stride of the size range\n        size_range (list[int]): multi-scale range for training\n        random_interval (int): interval of iter to change self._input_size\n    \"\"\"\n    __category__ = 'architecture'\n\n    def __init__(self,\n                 backbone='CSPDarkNet',\n                 neck='YOLOCSPPAN',\n                 head='YOLOXHead',\n                 for_mot=False,\n                 input_size=[640, 640],\n                 size_stride=32,\n                 size_range=[15, 25],\n                 random_interval=10):\n        super(YOLOX, self).__init__()\n        self.backbone = backbone\n        self.neck = neck\n        self.head = head\n        self.for_mot = for_mot\n\n        self.input_size = input_size\n        self._input_size = paddle.to_tensor(input_size)\n        self.size_stride = size_stride\n        self.size_range = size_range\n        self.random_interval = random_interval\n        self._step = 0\n\n    @classmethod\n    def from_config(cls, cfg, *args, **kwargs):\n        # backbone\n        backbone = create(cfg['backbone'])\n\n        # fpn\n        kwargs = {'input_shape': backbone.out_shape}\n        neck = create(cfg['neck'], **kwargs)\n\n        # head\n        kwargs = {'input_shape': neck.out_shape}\n        head = create(cfg['head'], **kwargs)\n\n        return {\n            'backbone': backbone,\n            'neck': neck,\n            \"head\": head,\n        }\n\n    def _forward(self):\n        if self.training:\n            self._preprocess()\n        body_feats = self.backbone(self.inputs)\n        neck_feats = self.neck(body_feats, self.for_mot)\n\n        if self.training:\n            yolox_losses = self.head(neck_feats, self.inputs)\n            yolox_losses.update({'size': self._input_size[0]})\n            return yolox_losses\n        else:\n            head_outs = self.head(neck_feats)\n            bbox, bbox_num = self.head.post_process(\n                head_outs, self.inputs['im_shape'], self.inputs['scale_factor'])\n            return {'bbox': bbox, 'bbox_num': bbox_num}\n\n    def get_loss(self):\n        return self._forward()\n\n    def get_pred(self):\n        return 
self._forward()\n\n    def _preprocess(self):\n        # YOLOX multi-scale training: resize the images (and scale gt boxes) by\n        # interpolation before they enter the network.\n        self._get_size()\n        scale_y = self._input_size[0] / self.input_size[0]\n        scale_x = self._input_size[1] / self.input_size[1]\n        if scale_x != 1 or scale_y != 1:\n            self.inputs['image'] = F.interpolate(\n                self.inputs['image'],\n                size=self._input_size,\n                mode='bilinear',\n                align_corners=False)\n            gt_bboxes = self.inputs['gt_bbox']\n            for i in range(len(gt_bboxes)):\n                if len(gt_bboxes[i]) > 0:\n                    gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x\n                    gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y\n            self.inputs['gt_bbox'] = gt_bboxes\n\n    def _get_size(self):\n        # every `random_interval` (default 10) iters, resample self._input_size\n        image_ratio = self.input_size[1] * 1.0 / self.input_size[0]\n        if self._step % self.random_interval == 0:\n            size_factor = random.randint(*self.size_range)\n            size = [\n                self.size_stride * size_factor,\n                self.size_stride * int(size_factor * image_ratio)\n            ]\n            self._input_size = paddle.to_tensor(size)\n        self._step += 1\n"
  },
  {
    "path": "ppdet/modeling/assigners/__init__.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom . import utils\nfrom . import task_aligned_assigner\nfrom . import atss_assigner\nfrom . import simota_assigner\nfrom . import max_iou_assigner\nfrom . import fcosr_assigner\nfrom . import rotated_task_aligned_assigner\nfrom . import task_aligned_assigner_cr\nfrom . import uniform_assigner\n\nfrom .utils import *\nfrom .task_aligned_assigner import *\nfrom .atss_assigner import *\nfrom .simota_assigner import *\nfrom .max_iou_assigner import *\nfrom .fcosr_assigner import *\nfrom .rotated_task_aligned_assigner import *\nfrom .task_aligned_assigner_cr import *\nfrom .uniform_assigner import *\nfrom .hungarian_assigner import *\nfrom .pose_utils import *\n"
  },
  {
    "path": "ppdet/modeling/assigners/atss_assigner.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\n\nfrom ppdet.core.workspace import register\nfrom ..bbox_utils import iou_similarity, batch_iou_similarity\nfrom ..bbox_utils import bbox_center\nfrom .utils import (check_points_inside_bboxes, compute_max_iou_anchor,\n                    compute_max_iou_gt)\n\n__all__ = ['ATSSAssigner']\n\n\n@register\nclass ATSSAssigner(nn.Layer):\n    \"\"\"Bridging the Gap Between Anchor-based and Anchor-free Detection\n     via Adaptive Training Sample Selection\n    \"\"\"\n    __shared__ = ['num_classes']\n\n    def __init__(self,\n                 topk=9,\n                 num_classes=80,\n                 force_gt_matching=False,\n                 eps=1e-9,\n                 sm_use=False):\n        super(ATSSAssigner, self).__init__()\n        self.topk = topk\n        self.num_classes = num_classes\n        self.force_gt_matching = force_gt_matching\n        self.eps = eps\n        self.sm_use = sm_use\n\n    def _gather_topk_pyramid(self, gt2anchor_distances, num_anchors_list,\n                             pad_gt_mask):\n        gt2anchor_distances_list = paddle.split(\n            gt2anchor_distances, num_anchors_list, axis=-1)\n        num_anchors_index = np.cumsum(num_anchors_list).tolist()\n        num_anchors_index = [0, ] + num_anchors_index[:-1]\n        is_in_topk_list = []\n        topk_idxs_list = []\n        for distances, anchors_index in zip(gt2anchor_distances_list,\n                                            num_anchors_index):\n            num_anchors = distances.shape[-1]\n            _, topk_idxs = paddle.topk(\n                distances, self.topk, axis=-1, largest=False)\n            topk_idxs_list.append(topk_idxs + anchors_index)\n            is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(\n                axis=-2).astype(gt2anchor_distances.dtype)\n            is_in_topk_list.append(is_in_topk * pad_gt_mask)\n        is_in_topk_list = paddle.concat(is_in_topk_list, axis=-1)\n        topk_idxs_list = paddle.concat(topk_idxs_list, axis=-1)\n        return is_in_topk_list, topk_idxs_list\n\n    @paddle.no_grad()\n    def forward(self,\n                anchor_bboxes,\n                num_anchors_list,\n                gt_labels,\n                gt_bboxes,\n                pad_gt_mask,\n                bg_index,\n                gt_scores=None,\n                pred_bboxes=None):\n        r\"\"\"This code is based on\n            https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/atss_assigner.py\n\n        The assignment is done in following steps\n        1. compute iou between all bbox (bbox of all pyramid levels) and gt\n        2. compute center distance between all bbox and gt\n        3. 
        3. on each pyramid level, for each gt, select k bbox whose center\n           are closest to the gt center, so we select k*l bbox in total as\n           candidates for each gt\n        4. get the corresponding iou for these candidates, and compute the\n           mean and std, set mean + std as the iou threshold\n        5. select these candidates whose iou are greater than or equal to\n           the threshold as positive\n        6. limit the positive samples' centers in gt\n        7. if an anchor box is assigned to multiple gts, the one with the\n           highest iou will be selected.\n        Args:\n            anchor_bboxes (Tensor, float32): pre-defined anchors, shape(L, 4),\n                    \"xmin, ymin, xmax, ymax\" format\n            num_anchors_list (List): num of anchors in each level\n            gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)\n            gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4)\n            pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)\n            bg_index (int): background index\n            gt_scores (Tensor|None, float32): Score of gt_bboxes,\n                    shape(B, n, 1), if None, then it will initialize with one_hot label\n            pred_bboxes (Tensor, float32, optional): predicted bounding boxes, shape(B, L, 4)\n        Returns:\n            assigned_labels (Tensor): (B, L)\n            assigned_bboxes (Tensor): (B, L, 4)\n            assigned_scores (Tensor): (B, L, C), if pred_bboxes is not None, then output ious\n        \"\"\"\n        assert gt_labels.ndim == gt_bboxes.ndim and \\\n               gt_bboxes.ndim == 3\n\n        num_anchors, _ = anchor_bboxes.shape\n        batch_size, num_max_boxes, _ = gt_bboxes.shape\n\n        # negative batch\n        if num_max_boxes == 0:\n            assigned_labels = paddle.full(\n                [batch_size, num_anchors], bg_index, dtype='int32')\n            assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4])\n            assigned_scores = paddle.zeros(\n                [batch_size, num_anchors, self.num_classes])\n            return assigned_labels, assigned_bboxes, assigned_scores\n\n        # 1. compute iou between gt and anchor bbox, [B, n, L]\n        ious = iou_similarity(gt_bboxes.reshape([-1, 4]), anchor_bboxes)\n        ious = ious.reshape([batch_size, -1, num_anchors])\n\n        # 2. compute center distance between all anchors and gt, [B, n, L]\n        gt_centers = bbox_center(gt_bboxes.reshape([-1, 4])).unsqueeze(1)\n        anchor_centers = bbox_center(anchor_bboxes)\n        gt2anchor_distances = (gt_centers - anchor_centers.unsqueeze(0)) \\\n            .norm(2, axis=-1).reshape([batch_size, -1, num_anchors])\n\n        # 3. on each pyramid level, selecting topk closest candidates\n        # based on the center distance, [B, n, L]\n        is_in_topk, topk_idxs = self._gather_topk_pyramid(\n            gt2anchor_distances, num_anchors_list, pad_gt_mask)\n\n        # 4. get the corresponding iou for these candidates, and compute the\n        # mean and std, 5. 
set mean + std as the iou threshold\n        iou_candidates = ious * is_in_topk\n        iou_threshold = paddle.index_sample(\n            iou_candidates.flatten(stop_axis=-2),\n            topk_idxs.flatten(stop_axis=-2))\n        iou_threshold = iou_threshold.reshape([batch_size, num_max_boxes, -1])\n        iou_threshold = iou_threshold.mean(axis=-1, keepdim=True) + \\\n                        iou_threshold.std(axis=-1, keepdim=True)\n        is_in_topk = paddle.where(iou_candidates > iou_threshold, is_in_topk,\n                                  paddle.zeros_like(is_in_topk))\n\n        # 6. check the positive sample's center in gt, [B, n, L]\n        if self.sm_use:\n            is_in_gts = check_points_inside_bboxes(\n                anchor_centers, gt_bboxes, sm_use=True)\n        else:\n            is_in_gts = check_points_inside_bboxes(anchor_centers, gt_bboxes)\n\n        # select positive sample, [B, n, L]\n        mask_positive = is_in_topk * is_in_gts * pad_gt_mask\n\n        # 7. if an anchor box is assigned to multiple gts,\n        # the one with the highest iou will be selected.\n        mask_positive_sum = mask_positive.sum(axis=-2)\n        if mask_positive_sum.max() > 1:\n            mask_multiple_gts = (\n                mask_positive_sum.unsqueeze(1) > 1).astype('int32').tile(\n                    [1, num_max_boxes, 1]).astype('bool')\n            if self.sm_use:\n                is_max_iou = compute_max_iou_anchor(ious * mask_positive)\n            else:\n                is_max_iou = compute_max_iou_anchor(ious)\n            mask_positive = paddle.where(mask_multiple_gts, is_max_iou,\n                                         mask_positive)\n            mask_positive_sum = mask_positive.sum(axis=-2)\n        # 8. make sure every gt_bbox matches the anchor\n        if self.force_gt_matching:\n            is_max_iou = compute_max_iou_gt(ious) * pad_gt_mask\n            mask_max_iou = (is_max_iou.sum(-2, keepdim=True) == 1).tile(\n                [1, num_max_boxes, 1])\n            mask_positive = paddle.where(mask_max_iou, is_max_iou,\n                                         mask_positive)\n            mask_positive_sum = mask_positive.sum(axis=-2)\n        assigned_gt_index = mask_positive.argmax(axis=-2)\n\n        # assigned target\n        batch_ind = paddle.arange(\n            end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1)\n        assigned_gt_index = assigned_gt_index + (batch_ind * num_max_boxes).astype(assigned_gt_index.dtype)\n        assigned_labels = paddle.gather(\n            gt_labels.flatten(), assigned_gt_index.flatten(), axis=0)\n        assigned_labels = assigned_labels.reshape([batch_size, num_anchors])\n        assigned_labels = paddle.where(\n            mask_positive_sum > 0, assigned_labels,\n            paddle.full_like(assigned_labels, bg_index))\n\n        assigned_bboxes = paddle.gather(\n            gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0)\n        assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])\n\n        assigned_scores = F.one_hot(assigned_labels, self.num_classes + 1)\n        ind = list(range(self.num_classes + 1))\n        ind.remove(bg_index)\n        assigned_scores = paddle.index_select(\n            assigned_scores, paddle.to_tensor(ind), axis=-1)\n        if pred_bboxes is not None:\n            # assigned iou\n            ious = batch_iou_similarity(gt_bboxes, pred_bboxes) * mask_positive\n            ious = ious.max(axis=-2).unsqueeze(-1)\n            assigned_scores *= 
ious\n        elif gt_scores is not None:\n            gather_scores = paddle.gather(\n                gt_scores.flatten(), assigned_gt_index.flatten(), axis=0)\n            gather_scores = gather_scores.reshape([batch_size, num_anchors])\n            gather_scores = paddle.where(mask_positive_sum > 0, gather_scores,\n                                         paddle.zeros_like(gather_scores))\n            assigned_scores *= gather_scores.unsqueeze(-1)\n\n        return assigned_labels, assigned_bboxes, assigned_scores\n"
  },
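The mean-plus-std thresholding in steps 4-5 above is the heart of ATSS. A minimal NumPy sketch of that rule on toy data (values are hypothetical; this is not the ppdet API):

```python
# Toy sketch of ATSS steps 4-5: for one gt, the IoU threshold is the
# mean + std of the IoUs of its k*l center-distance candidates.
import numpy as np

rng = np.random.default_rng(0)
candidate_ious = rng.uniform(0.1, 0.9, size=9)  # hypothetical k*l candidates

iou_threshold = candidate_ious.mean() + candidate_ious.std()
is_positive = candidate_ious >= iou_threshold
print(f"threshold={iou_threshold:.3f}, positives={int(is_positive.sum())}")
```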
  {
    "path": "ppdet/modeling/assigners/clrnet_assigner.py",
    "content": "import paddle\nimport paddle.nn.functional as F\nfrom ppdet.modeling.losses.clrnet_line_iou_loss import line_iou\n\n\ndef distance_cost(predictions, targets, img_w):\n    \"\"\"\n    repeat predictions and targets to generate all combinations\n    use the abs distance as the new distance cost\n    \"\"\"\n    num_priors = predictions.shape[0]\n    num_targets = targets.shape[0]\n    predictions = paddle.repeat_interleave(\n        predictions, num_targets, axis=0)[..., 6:]\n    targets = paddle.concat(x=num_priors * [targets])[..., 6:]\n    invalid_masks = (targets < 0) | (targets >= img_w)\n    lengths = (~invalid_masks).sum(axis=1)\n    distances = paddle.abs(x=targets - predictions)\n    distances[invalid_masks] = 0.0\n    distances = distances.sum(axis=1) / (lengths.cast(\"float32\") + 1e-09)\n    distances = distances.reshape([num_priors, num_targets])\n    return distances\n\n\ndef focal_cost(cls_pred, gt_labels, alpha=0.25, gamma=2, eps=1e-12):\n    \"\"\"\n    Args:\n        cls_pred (Tensor): Predicted classification logits, shape\n            [num_query, num_class].\n        gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).\n\n    Returns:\n        torch.Tensor: cls_cost value\n    \"\"\"\n    cls_pred = F.sigmoid(cls_pred)\n    neg_cost = -(1 - cls_pred + eps).log() * (1 - alpha) * cls_pred.pow(gamma)\n    pos_cost = -(cls_pred + eps).log() * alpha * (1 - cls_pred).pow(gamma)\n    cls_cost = pos_cost.index_select(\n        gt_labels, axis=1) - neg_cost.index_select(\n            gt_labels, axis=1)\n    return cls_cost\n\n\ndef dynamic_k_assign(cost, pair_wise_ious):\n    \"\"\"\n    Assign grouth truths with priors dynamically.\n\n    Args:\n        cost: the assign cost.\n        pair_wise_ious: iou of grouth truth and priors.\n\n    Returns:\n        prior_idx: the index of assigned prior.\n        gt_idx: the corresponding ground truth index.\n    \"\"\"\n    matching_matrix = paddle.zeros_like(cost)\n    ious_matrix = pair_wise_ious\n    ious_matrix[ious_matrix < 0] = 0.0\n    n_candidate_k = 4\n    topk_ious, _ = paddle.topk(ious_matrix, n_candidate_k, axis=0)\n    dynamic_ks = paddle.clip(x=topk_ious.sum(0).cast(\"int32\"), min=1)\n    num_gt = cost.shape[1]\n\n    for gt_idx in range(num_gt):\n        _, pos_idx = paddle.topk(\n            x=cost[:, gt_idx], k=dynamic_ks[gt_idx].item(), largest=False)\n        matching_matrix[pos_idx, gt_idx] = 1.0\n    del topk_ious, dynamic_ks, pos_idx\n    matched_gt = matching_matrix.sum(axis=1)\n\n    if (matched_gt > 1).sum() > 0:\n        matched_gt_indices = paddle.nonzero(matched_gt > 1)[:, 0]\n        cost_argmin = paddle.argmin(\n            cost.index_select(matched_gt_indices), axis=1)\n        matching_matrix[matched_gt_indices][0] *= 0.0\n        matching_matrix[matched_gt_indices, cost_argmin] = 1.0\n\n    prior_idx = matching_matrix.sum(axis=1).nonzero()\n    gt_idx = matching_matrix[prior_idx].argmax(axis=-1)\n    return prior_idx.flatten(), gt_idx.flatten()\n\n\ndef cdist_paddle(x1, x2, p=2):\n    assert x1.shape[1] == x2.shape[1]\n    B, M = x1.shape\n    # if p == np.inf:\n    #     dist = np.max(np.abs(x1[:, np.newaxis, :] - x2[np.newaxis, :, :]), axis=-1)\n    if p == 1:\n        dist = paddle.sum(\n            paddle.abs(x1.unsqueeze(axis=1) - x2.unsqueeze(axis=0)), axis=-1)\n    else:\n        dist = paddle.pow(paddle.sum(paddle.pow(\n            paddle.abs(x1.unsqueeze(axis=1) - x2.unsqueeze(axis=0)), p),\n                                     axis=-1),\n                          1 / p)\n  
  return dist\n\n\ndef assign(predictions,\n           targets,\n           img_w,\n           img_h,\n           distance_cost_weight=3.0,\n           cls_cost_weight=1.0):\n    \"\"\"\n    computes dynamicly matching based on the cost, including cls cost and lane similarity cost\n    Args:\n        predictions (Tensor): predictions predicted by each stage, shape: (num_priors, 78)\n        targets (Tensor): lane targets, shape: (num_targets, 78)\n    return:\n        matched_row_inds (Tensor): matched predictions, shape: (num_targets)\n        matched_col_inds (Tensor): matched targets, shape: (num_targets)\n    \"\"\"\n    predictions = predictions.detach().clone()\n    predictions[:, 3] *= img_w - 1\n    predictions[:, 6:] *= img_w - 1\n\n    targets = targets.detach().clone()\n    distances_score = distance_cost(predictions, targets, img_w)\n    distances_score = 1 - distances_score / paddle.max(x=distances_score) + 0.01\n\n    cls_score = focal_cost(predictions[:, :2], targets[:, 1].cast('int64'))\n\n    num_priors = predictions.shape[0]\n    num_targets = targets.shape[0]\n    target_start_xys = targets[:, 2:4]\n    target_start_xys[..., 0] *= (img_h - 1)\n    prediction_start_xys = predictions[:, 2:4]\n    prediction_start_xys[..., 0] *= (img_h - 1)\n    start_xys_score = cdist_paddle(\n        prediction_start_xys, target_start_xys,\n        p=2).reshape([num_priors, num_targets])\n\n    start_xys_score = 1 - start_xys_score / paddle.max(x=start_xys_score) + 0.01\n\n    target_thetas = targets[:, 4].unsqueeze(axis=-1)\n    theta_score = cdist_paddle(\n        predictions[:, 4].unsqueeze(axis=-1), target_thetas,\n        p=1).reshape([num_priors, num_targets]) * 180\n    theta_score = 1 - theta_score / paddle.max(x=theta_score) + 0.01\n\n    cost = -(distances_score * start_xys_score * theta_score\n             )**2 * distance_cost_weight + cls_score * cls_cost_weight\n    iou = line_iou(predictions[..., 6:], targets[..., 6:], img_w, aligned=False)\n\n    matched_row_inds, matched_col_inds = dynamic_k_assign(cost, iou)\n    return matched_row_inds, matched_col_inds\n"
  },
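The per-target budget in `dynamic_k_assign` comes from summing each target's best prior IoUs. A toy NumPy sketch of that rule (hypothetical data, not the repo's paddle code):

```python
# Toy sketch of the dynamic-k rule in dynamic_k_assign: k for each lane
# target is the clipped integer sum of its top-4 prior IoUs.
import numpy as np

rng = np.random.default_rng(0)
ious = rng.uniform(0.0, 1.0, size=(20, 3))   # (num_priors, num_targets)
topk_ious = np.sort(ious, axis=0)[-4:]       # n_candidate_k = 4 per target
dynamic_ks = np.clip(topk_ious.sum(axis=0).astype(int), 1, None)
print(dynamic_ks)  # number of priors each target will be assigned
```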
  {
    "path": "ppdet/modeling/assigners/fcosr_assigner.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\n\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling.rbox_utils import box2corners, check_points_in_polys, paddle_gather\n\n__all__ = ['FCOSRAssigner']\n\nEPS = 1e-9\n\n\n@register\nclass FCOSRAssigner(nn.Layer):\n    \"\"\" FCOSR Assigner, refer to https://arxiv.org/abs/2111.10780 for details\n\n    1. compute normalized gaussian distribution score and refined gaussian distribution score\n    2. refer to ellipse center sampling, sample points whose normalized gaussian distribution score is greater than threshold\n    3. refer to multi-level sampling, assign ground truth to feature map which follows two conditions.\n        i). first, the ratio between the short edge of the target and the stride of the feature map is less than 2.\n        ii). second, the long edge of minimum bounding rectangle of the target is larger than the acceptance range of feature map\n    4. refer to fuzzy sample label assignment, the points satisfying 2 and 3 will be assigned to the ground truth according to gaussian distribution score\n    \"\"\"\n    __shared__ = ['num_classes']\n\n    def __init__(self,\n                 num_classes=80,\n                 factor=12,\n                 threshold=0.23,\n                 boundary=[[-1, 128], [128, 320], [320, 10000]],\n                 score_type='iou'):\n        super(FCOSRAssigner, self).__init__()\n        self.num_classes = num_classes\n        self.factor = factor\n        self.threshold = threshold\n        self.boundary = [\n            paddle.to_tensor(\n                l, dtype=paddle.float32).reshape([1, 1, 2]) for l in boundary\n        ]\n        self.score_type = score_type\n\n    def get_gaussian_distribution_score(self, points, gt_rboxes, gt_polys):\n        # projecting points to coordinate system defined by each rbox\n        # [B, N, 4, 2] -> 4 * [B, N, 1, 2]\n        a, b, c, d = gt_polys.split(4, axis=2)\n        # [1, L, 2] -> [1, 1, L, 2]\n        points = points.unsqueeze(0)\n        ab = b - a\n        ad = d - a\n        # [B, N, 5] -> [B, N, 2], [B, N, 2], [B, N, 1]\n        xy, wh, angle = gt_rboxes.split([2, 2, 1], axis=-1)\n        # [B, N, 2] -> [B, N, 1, 2]\n        xy = xy.unsqueeze(2)\n        # vector of points to center [B, N, L, 2]\n        vec = points - xy\n        # <ab, vec> = |ab| * |vec| * cos(theta) [B, N, L]\n        vec_dot_ab = paddle.sum(vec * ab, axis=-1)\n        # <ad, vec> = |ad| * |vec| * cos(theta) [B, N, L]\n        vec_dot_ad = paddle.sum(vec * ad, axis=-1)\n        # norm_ab [B, N, L]\n        norm_ab = paddle.sum(ab * ab, axis=-1).sqrt()\n        # norm_ad [B, N, L]\n        norm_ad = paddle.sum(ad * ad, axis=-1).sqrt()\n        # min(h, w), [B, N, 1]\n        
min_edge = paddle.min(wh, axis=-1, keepdim=True)\n        # delta_x, delta_y [B, N, L]\n        delta_x = vec_dot_ab.pow(2) / (norm_ab.pow(3) * min_edge + EPS)\n        delta_y = vec_dot_ad.pow(2) / (norm_ad.pow(3) * min_edge + EPS)\n        # score [B, N, L]\n        norm_score = paddle.exp(-0.5 * self.factor * (delta_x + delta_y))\n\n        # simplified calculation\n        sigma = min_edge / self.factor\n        refined_score = norm_score / (2 * np.pi * sigma + EPS)\n        return norm_score, refined_score\n\n    def get_rotated_inside_mask(self, points, gt_polys, scores):\n        inside_mask = check_points_in_polys(points, gt_polys)\n        center_mask = scores >= self.threshold\n        return (inside_mask & center_mask).cast(paddle.float32)\n\n    def get_inside_range_mask(self, points, gt_bboxes, gt_rboxes, stride_tensor,\n                              regress_range):\n        # [1, L, 2] -> [1, 1, L, 2]\n        points = points.unsqueeze(0)\n        # [B, n, 4] -> [B, n, 1, 4]\n        x1y1, x2y2 = gt_bboxes.unsqueeze(2).split(2, axis=-1)\n        # [B, n, L, 2]\n        lt = points - x1y1\n        rb = x2y2 - points\n        # [B, n, L, 4]\n        ltrb = paddle.concat([lt, rb], axis=-1)\n        # [B, n, L, 4] -> [B, n, L]\n        inside_mask = paddle.min(ltrb, axis=-1) > EPS\n        # regress_range [1, L, 2] -> [1, 1, L, 2]\n        regress_range = regress_range.unsqueeze(0)\n        # stride_tensor [1, L, 1] -> [1, 1, L]\n        stride_tensor = stride_tensor.transpose((0, 2, 1))\n        # fcos range\n        # [B, n, L, 4] -> [B, n, L]\n        ltrb_max = paddle.max(ltrb, axis=-1)\n        # [1, 1, L, 2] -> [1, 1, L]\n        low, high = regress_range[..., 0], regress_range[..., 1]\n        # [B, n, L]\n        regress_mask = (ltrb_max >= low) & (ltrb_max <= high)\n        # mask for rotated\n        # [B, n, 1]\n        min_edge = paddle.min(gt_rboxes[..., 2:4], axis=-1, keepdim=True)\n        # [B, n , L]\n        rotated_mask = ((min_edge / stride_tensor) < 2.0) & (ltrb_max > high)\n        mask = inside_mask & (regress_mask | rotated_mask)\n        return mask.cast(paddle.float32)\n\n    @paddle.no_grad()\n    def forward(self,\n                anchor_points,\n                stride_tensor,\n                num_anchors_list,\n                gt_labels,\n                gt_bboxes,\n                gt_rboxes,\n                pad_gt_mask,\n                bg_index,\n                pred_rboxes=None):\n        r\"\"\"\n\n        Args:\n            anchor_points (Tensor, float32): pre-defined anchor points, shape(1, L, 2),\n                    \"x, y\" format\n            stride_tensor (Tensor, float32): stride tensor, shape (1, L, 1)\n            num_anchors_list (List): num of anchors in each level\n            gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)\n            gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4)\n            gt_rboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 5)\n            pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)\n            bg_index (int): background index\n            pred_rboxes (Tensor, float32, optional): predicted bounding boxes, shape(B, L, 5)\n        Returns:\n            assigned_labels (Tensor): (B, L)\n            assigned_rboxes (Tensor): (B, L, 5)\n            assigned_scores (Tensor): (B, L, C), if pred_rboxes is not None, then output ious\n        \"\"\"\n\n        _, num_anchors, _ = anchor_points.shape\n        batch_size, num_max_boxes, _ = 
gt_rboxes.shape\n        if num_max_boxes == 0:\n            assigned_labels = paddle.full(\n                [batch_size, num_anchors], bg_index, dtype=gt_labels.dtype)\n            assigned_rboxes = paddle.zeros([batch_size, num_anchors, 5])\n            assigned_scores = paddle.zeros(\n                [batch_size, num_anchors, self.num_classes])\n            return assigned_labels, assigned_rboxes, assigned_scores\n\n        # get normalized gaussian distribution score and refined distribution score\n        gt_polys = box2corners(gt_rboxes)\n        score, refined_score = self.get_gaussian_distribution_score(\n            anchor_points, gt_rboxes, gt_polys)\n        inside_mask = self.get_rotated_inside_mask(anchor_points, gt_polys,\n                                                   score)\n        regress_ranges = []\n        for num, bound in zip(num_anchors_list, self.boundary):\n            regress_ranges.append(bound.tile((1, num, 1)))\n        regress_ranges = paddle.concat(regress_ranges, axis=1)\n        regress_mask = self.get_inside_range_mask(\n            anchor_points, gt_bboxes, gt_rboxes, stride_tensor, regress_ranges)\n        # [B, n, L]\n        mask_positive = inside_mask * regress_mask * pad_gt_mask\n        refined_score = refined_score * mask_positive - (1. - mask_positive)\n\n        argmax_refined_score = refined_score.argmax(axis=-2)\n        max_refined_score = refined_score.max(axis=-2)\n        assigned_gt_index = argmax_refined_score\n\n        # assigned target\n        batch_ind = paddle.arange(\n            end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1)\n        assigned_gt_index = assigned_gt_index + (batch_ind * num_max_boxes).astype(assigned_gt_index.dtype)\n        assigned_labels = paddle.gather(\n            gt_labels.flatten(), assigned_gt_index.flatten(), axis=0)\n        assigned_labels = assigned_labels.reshape([batch_size, num_anchors])\n        assigned_labels = paddle.where(\n            max_refined_score > 0, assigned_labels,\n            paddle.full_like(assigned_labels, bg_index))\n\n        assigned_rboxes = paddle.gather(\n            gt_rboxes.reshape([-1, 5]), assigned_gt_index.flatten(), axis=0)\n        assigned_rboxes = assigned_rboxes.reshape([batch_size, num_anchors, 5])\n\n        assigned_scores = F.one_hot(assigned_labels, self.num_classes + 1)\n        ind = list(range(self.num_classes + 1))\n        ind.remove(bg_index)\n        assigned_scores = paddle.index_select(\n            assigned_scores, paddle.to_tensor(ind), axis=-1)\n\n        if self.score_type == 'gaussian':\n            selected_scores = paddle_gather(\n                score, 1, argmax_refined_score.unsqueeze(-2)).squeeze(-2)\n            assigned_scores = assigned_scores * selected_scores.unsqueeze(-1)\n        elif self.score_type == 'iou':\n            assert pred_rboxes is not None, 'If score type is iou, pred_rboxes should not be None'\n            from ext_op import matched_rbox_iou\n            b, l = pred_rboxes.shape[:2]\n            iou_score = matched_rbox_iou(\n                pred_rboxes.reshape((-1, 5)), assigned_rboxes.reshape(\n                    (-1, 5))).reshape((b, l, 1))\n            assigned_scores = assigned_scores * iou_score\n\n        return assigned_labels, assigned_rboxes, assigned_scores\n"
  },
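For intuition, the normalized Gaussian score in `get_gaussian_distribution_score` collapses to a simple closed form for an angle-0 rbox, where the corner-vector projections reduce to `(px - cx)` and `(py - cy)`. A toy sketch with hypothetical numbers:

```python
# Toy sketch of the FCOSR normalized Gaussian score for an angle-0 rbox:
# delta_x = (px-cx)^2 / (w * min_edge), delta_y = (py-cy)^2 / (h * min_edge),
# norm_score = exp(-0.5 * factor * (delta_x + delta_y)).
import numpy as np

factor = 12.0                           # default FCOSRAssigner factor
cx, cy, w, h = 50.0, 50.0, 40.0, 20.0   # hypothetical rbox, angle = 0
px, py = 55.0, 48.0                     # hypothetical anchor point

min_edge = min(w, h)
delta_x = (px - cx) ** 2 / (w * min_edge)
delta_y = (py - cy) ** 2 / (h * min_edge)
norm_score = np.exp(-0.5 * factor * (delta_x + delta_y))
print(norm_score)  # compared against threshold=0.23 for center sampling
```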
  {
    "path": "ppdet/modeling/assigners/hungarian_assigner.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\ntry:\n    from scipy.optimize import linear_sum_assignment\nexcept ImportError:\n    linear_sum_assignment = None\n\nimport paddle\n\nfrom ppdet.core.workspace import register\n\n__all__ = ['PoseHungarianAssigner', 'PseudoSampler']\n\n\nclass AssignResult:\n    \"\"\"Stores assignments between predicted and truth boxes.\n\n    Attributes:\n        num_gts (int): the number of truth boxes considered when computing this\n            assignment\n\n        gt_inds (LongTensor): for each predicted box indicates the 1-based\n            index of the assigned truth box. 0 means unassigned and -1 means\n            ignore.\n\n        max_overlaps (FloatTensor): the iou between the predicted box and its\n            assigned truth box.\n\n        labels (None | LongTensor): If specified, for each predicted box\n            indicates the category label of the assigned truth box.\n    \"\"\"\n\n    def __init__(self, num_gts, gt_inds, max_overlaps, labels=None):\n        self.num_gts = num_gts\n        self.gt_inds = gt_inds\n        self.max_overlaps = max_overlaps\n        self.labels = labels\n        # Interface for possible user-defined properties\n        self._extra_properties = {}\n\n    @property\n    def num_preds(self):\n        \"\"\"int: the number of predictions in this assignment\"\"\"\n        return len(self.gt_inds)\n\n    def set_extra_property(self, key, value):\n        \"\"\"Set user-defined new property.\"\"\"\n        assert key not in self.info\n        self._extra_properties[key] = value\n\n    def get_extra_property(self, key):\n        \"\"\"Get user-defined property.\"\"\"\n        return self._extra_properties.get(key, None)\n\n    @property\n    def info(self):\n        \"\"\"dict: a dictionary of info about the object\"\"\"\n        basic_info = {\n            'num_gts': self.num_gts,\n            'num_preds': self.num_preds,\n            'gt_inds': self.gt_inds,\n            'max_overlaps': self.max_overlaps,\n            'labels': self.labels,\n        }\n        basic_info.update(self._extra_properties)\n        return basic_info\n\n\n@register\nclass PoseHungarianAssigner:\n    \"\"\"Computes one-to-one matching between predictions and ground truth.\n\n    This class computes an assignment between the targets and the predictions\n    based on the costs. The costs are weighted sum of three components:\n    classification cost, regression L1 cost and regression oks cost. The\n    targets don't include the no_object, so generally there are more\n    predictions than targets. After the one-to-one matching, the un-matched\n    are treated as backgrounds. 
Thus each query prediction will be assigned\n    with `0` or a positive integer indicating the ground truth index:\n\n    - 0: negative sample, no assigned gt.\n    - positive integer: positive sample, index (1-based) of assigned gt.\n\n    Args:\n        cls_weight (int | float, optional): The scale factor for classification\n            cost. Default 1.0.\n        kpt_weight (int | float, optional): The scale factor for regression\n            L1 cost. Default 1.0.\n        oks_weight (int | float, optional): The scale factor for regression\n            oks cost. Default 1.0.\n    \"\"\"\n    __inject__ = ['cls_cost', 'kpt_cost', 'oks_cost']\n\n    def __init__(self,\n                 cls_cost='ClassificationCost',\n                 kpt_cost='KptL1Cost',\n                 oks_cost='OksCost'):\n        self.cls_cost = cls_cost\n        self.kpt_cost = kpt_cost\n        self.oks_cost = oks_cost\n\n    def assign(self,\n               cls_pred,\n               kpt_pred,\n               gt_labels,\n               gt_keypoints,\n               gt_areas,\n               img_meta,\n               eps=1e-7):\n        \"\"\"Computes one-to-one matching based on the weighted costs.\n\n        This method assigns each query prediction to a ground truth or\n        background. In `assigned_gt_inds`, -1 means don't care,\n        0 means negative sample, and a positive number is the index (1-based)\n        of the assigned gt.\n        The assignment is done in the following steps; the order matters.\n\n        1. assign every prediction to -1\n        2. compute the weighted costs\n        3. do Hungarian matching on CPU based on the costs\n        4. assign all to 0 (background) first, then for each matched pair\n           between predictions and gts, treat this prediction as foreground\n           and assign the corresponding gt index (plus 1) to it.\n\n        Args:\n            cls_pred (Tensor): Predicted classification logits, shape\n                [num_query, num_class].\n            kpt_pred (Tensor): Predicted keypoints with normalized coordinates\n                (x_{i}, y_{i}), which are all in range [0, 1]. Shape\n                [num_query, K*2].\n            gt_labels (Tensor): Label of `gt_keypoints`, shape (num_gt,).\n            gt_keypoints (Tensor): Ground truth keypoints with unnormalized\n                coordinates [p^{1}_x, p^{1}_y, p^{1}_v, ..., \\\n                    p^{K}_x, p^{K}_y, p^{K}_v]. Shape [num_gt, K*3].\n            gt_areas (Tensor): Ground truth mask areas, shape (num_gt,).\n            img_meta (dict): Meta information for current image.\n            eps (int | float, optional): A value added to the denominator for\n                numerical stability. Default 1e-7.\n\n        Returns:\n            :obj:`AssignResult`: The assigned result.\n        \"\"\"\n        num_gts, num_kpts = gt_keypoints.shape[0], kpt_pred.shape[0]\n        if not gt_keypoints.astype('bool').any():\n            num_gts = 0\n\n        # 1. 
assign -1 by default\n        assigned_gt_inds = paddle.full((num_kpts, ), -1, dtype=\"int64\")\n        assigned_labels = paddle.full((num_kpts, ), -1, dtype=\"int64\")\n        if num_gts == 0 or num_kpts == 0:\n            # No ground truth or keypoints, return empty assignment\n            if num_gts == 0:\n                # No ground truth, assign all to background\n                assigned_gt_inds[:] = 0\n            return AssignResult(\n                num_gts, assigned_gt_inds, None, labels=assigned_labels)\n        img_h, img_w, _ = img_meta['img_shape']\n        factor = paddle.to_tensor(\n            [img_w, img_h, img_w, img_h], dtype=gt_keypoints.dtype).reshape(\n                (1, -1))\n\n        # 2. compute the weighted costs\n        # classification cost\n        cls_cost = self.cls_cost(cls_pred, gt_labels)\n\n        # keypoint regression L1 cost\n        gt_keypoints_reshape = gt_keypoints.reshape((gt_keypoints.shape[0], -1,\n                                                     3))\n        valid_kpt_flag = gt_keypoints_reshape[..., -1]\n        kpt_pred_tmp = kpt_pred.clone().detach().reshape((kpt_pred.shape[0], -1,\n                                                          2))\n        normalize_gt_keypoints = gt_keypoints_reshape[\n            ..., :2] / factor[:, :2].unsqueeze(0)\n        kpt_cost = self.kpt_cost(kpt_pred_tmp, normalize_gt_keypoints,\n                                 valid_kpt_flag)\n        # keypoint OKS cost\n        kpt_pred_tmp = kpt_pred.clone().detach().reshape((kpt_pred.shape[0], -1,\n                                                          2))\n        kpt_pred_tmp = kpt_pred_tmp * factor[:, :2].unsqueeze(0)\n        oks_cost = self.oks_cost(kpt_pred_tmp, gt_keypoints_reshape[..., :2],\n                                 valid_kpt_flag, gt_areas)\n        # weighted sum of above three costs\n        cost = cls_cost + kpt_cost + oks_cost\n\n        # 3. do Hungarian matching on CPU using linear_sum_assignment\n        cost = cost.detach().cpu()\n        if linear_sum_assignment is None:\n            raise ImportError('Please run \"pip install scipy\" '\n                              'to install scipy first.')\n        matched_row_inds, matched_col_inds = linear_sum_assignment(cost)\n        matched_row_inds = paddle.to_tensor(matched_row_inds)\n        matched_col_inds = paddle.to_tensor(matched_col_inds)\n\n        # 4. 
assign backgrounds and foregrounds\n        # assign all indices to backgrounds first\n        assigned_gt_inds[:] = 0\n        # assign foregrounds based on matching results\n        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1\n        assigned_labels[matched_row_inds] = gt_labels[matched_col_inds][\n            ..., 0].astype(\"int64\")\n        return AssignResult(\n            num_gts, assigned_gt_inds, None, labels=assigned_labels)\n\n\nclass SamplingResult:\n    \"\"\"Bbox sampling result.\n    \"\"\"\n\n    def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result,\n                 gt_flags):\n        self.pos_inds = pos_inds\n        self.neg_inds = neg_inds\n        if pos_inds.size > 0:\n            self.pos_bboxes = bboxes[pos_inds]\n            self.neg_bboxes = bboxes[neg_inds]\n            self.pos_is_gt = gt_flags[pos_inds]\n\n            self.num_gts = gt_bboxes.shape[0]\n            self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1\n\n            if gt_bboxes.numel() == 0:\n                # hack for index error case\n                assert self.pos_assigned_gt_inds.numel() == 0\n                self.pos_gt_bboxes = paddle.zeros(\n                    gt_bboxes.shape, dtype=gt_bboxes.dtype).reshape((-1, 4))\n            else:\n                if len(gt_bboxes.shape) < 2:\n                    gt_bboxes = gt_bboxes.reshape((-1, 4))\n\n                self.pos_gt_bboxes = paddle.index_select(\n                    gt_bboxes,\n                    self.pos_assigned_gt_inds.astype('int64'),\n                    axis=0)\n\n            if assign_result.labels is not None:\n                self.pos_gt_labels = assign_result.labels[pos_inds]\n            else:\n                self.pos_gt_labels = None\n\n    @property\n    def bboxes(self):\n        \"\"\"paddle.Tensor: concatenated positive and negative boxes\"\"\"\n        return paddle.concat([self.pos_bboxes, self.neg_bboxes])\n\n    def __nice__(self):\n        data = self.info.copy()\n        data['pos_bboxes'] = data.pop('pos_bboxes').shape\n        data['neg_bboxes'] = data.pop('neg_bboxes').shape\n        parts = [f\"'{k}': {v!r}\" for k, v in sorted(data.items())]\n        body = '    ' + ',\\n    '.join(parts)\n        return '{\\n' + body + '\\n}'\n\n    @property\n    def info(self):\n        \"\"\"Returns a dictionary of info about the object.\"\"\"\n        return {\n            'pos_inds': self.pos_inds,\n            'neg_inds': self.neg_inds,\n            'pos_bboxes': self.pos_bboxes,\n            'neg_bboxes': self.neg_bboxes,\n            'pos_is_gt': self.pos_is_gt,\n            'num_gts': self.num_gts,\n            'pos_assigned_gt_inds': self.pos_assigned_gt_inds,\n        }\n\n\n@register\nclass PseudoSampler:\n    \"\"\"A pseudo sampler that does not do sampling actually.\"\"\"\n\n    def __init__(self, **kwargs):\n        pass\n\n    def _sample_pos(self, **kwargs):\n        \"\"\"Sample positive samples.\"\"\"\n        raise NotImplementedError\n\n    def _sample_neg(self, **kwargs):\n        \"\"\"Sample negative samples.\"\"\"\n        raise NotImplementedError\n\n    def sample(self, assign_result, bboxes, gt_bboxes, *args, **kwargs):\n        \"\"\"Directly returns the positive and negative indices  of samples.\n\n        Args:\n            assign_result (:obj:`AssignResult`): Assigned results\n            bboxes (paddle.Tensor): Bounding boxes\n            gt_bboxes (paddle.Tensor): Ground truth boxes\n\n        Returns:\n            :obj:`SamplingResult`: 
sampler results\n        \"\"\"\n        pos_inds = paddle.nonzero(\n            assign_result.gt_inds > 0, as_tuple=False).squeeze(-1)\n        neg_inds = paddle.nonzero(\n            assign_result.gt_inds == 0, as_tuple=False).squeeze(-1)\n        gt_flags = paddle.zeros([bboxes.shape[0]], dtype='int32')\n        sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes,\n                                         assign_result, gt_flags)\n        return sampling_result\n"
  },
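Step 3 of `PoseHungarianAssigner.assign` delegates the one-to-one matching to SciPy. A minimal standalone example on a hypothetical 4-query by 2-gt cost matrix:

```python
# Hungarian matching with scipy, as used in step 3 above (toy cost values).
import numpy as np
from scipy.optimize import linear_sum_assignment

cost = np.array([[0.9, 0.1],
                 [0.4, 0.8],
                 [0.3, 0.6],
                 [0.7, 0.2]])   # (num_query, num_gt), rectangular is fine
rows, cols = linear_sum_assignment(cost)
print(rows, cols)  # query rows[i] matches gt cols[i]; stored 1-based downstream
```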
  {
    "path": "ppdet/modeling/assigners/max_iou_assigner.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling.proposal_generator.target import label_box\n\n__all__ = ['MaxIoUAssigner']\n\n@register\nclass MaxIoUAssigner(object):\n    \"\"\"a standard bbox assigner based on max IoU, use ppdet's label_box \n    as backend.\n    Args:\n        positive_overlap (float): threshold for defining positive samples \n        negative_overlap (float): threshold for denining negative samples\n        allow_low_quality (bool): whether to lower IoU thr if a GT poorly\n            overlaps with candidate bboxes\n    \"\"\"\n    def __init__(self,\n                 positive_overlap,\n                 negative_overlap,\n                 allow_low_quality=True):\n        self.positive_overlap = positive_overlap\n        self.negative_overlap = negative_overlap\n        self.allow_low_quality = allow_low_quality\n\n    def __call__(self, bboxes, gt_bboxes):\n        matches, match_labels = label_box(\n            bboxes,\n            gt_bboxes,\n            positive_overlap=self.positive_overlap,\n            negative_overlap=self.negative_overlap,\n            allow_low_quality=self.allow_low_quality,\n            ignore_thresh=-1,\n            is_crowd=None,\n            assign_on_cpu=False)\n        return matches, match_labels\n"
  },
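The thresholding semantics behind `MaxIoUAssigner` can be sketched in a few lines of NumPy (toy IoUs; this is not ppdet's `label_box` itself, just the labeling rule it implements):

```python
# Toy max-IoU assignment: each proposal matches its best-overlapping gt;
# the match label is 1 (positive) at or above positive_overlap, 0 (negative)
# below negative_overlap, and -1 (ignored) in between.
import numpy as np

positive_overlap, negative_overlap = 0.7, 0.3
ious = np.array([[0.85, 0.10],   # (num_proposals, num_gts)
                 [0.20, 0.55],
                 [0.05, 0.12]])
matches = ious.argmax(axis=1)
max_ious = ious.max(axis=1)
match_labels = np.where(max_ious >= positive_overlap, 1,
                        np.where(max_ious < negative_overlap, 0, -1))
print(matches, match_labels)
```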
  {
    "path": "ppdet/modeling/assigners/pose_utils.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport numpy as np\nimport paddle\nimport paddle.nn.functional as F\n\nfrom ppdet.core.workspace import register\n\n__all__ = ['KptL1Cost', 'OksCost', 'ClassificationCost']\n\n\ndef masked_fill(x, mask, value):\n    y = paddle.full(x.shape, value, x.dtype)\n    return paddle.where(mask, y, x)\n\n\n@register\nclass KptL1Cost(object):\n    \"\"\"KptL1Cost.\n\n    this function based on: https://github.com/hikvision-research/opera/blob/main/opera/core/bbox/match_costs/match_cost.py\n\n    Args:\n        weight (int | float, optional): loss_weight.\n    \"\"\"\n\n    def __init__(self, weight=1.0):\n        self.weight = weight\n\n    def __call__(self, kpt_pred, gt_keypoints, valid_kpt_flag):\n        \"\"\"\n        Args:\n            kpt_pred (Tensor): Predicted keypoints with normalized coordinates\n                (x_{i}, y_{i}), which are all in range [0, 1]. Shape\n                [num_query, K, 2].\n            gt_keypoints (Tensor): Ground truth keypoints with normalized\n                coordinates (x_{i}, y_{i}). 
Shape [num_gt, K, 2].\n            valid_kpt_flag (Tensor): valid flag of ground truth keypoints.\n                Shape [num_gt, K].\n\n        Returns:\n            paddle.Tensor: kpt_cost value with weight.\n        \"\"\"\n        kpt_cost = []\n        for i in range(len(gt_keypoints)):\n            if gt_keypoints[i].size == 0:\n                kpt_cost.append(kpt_pred.sum() * 0)\n            kpt_pred_tmp = kpt_pred.clone()\n            valid_flag = valid_kpt_flag[i] > 0\n            valid_flag_expand = valid_flag.unsqueeze(0).unsqueeze(-1).expand_as(\n                kpt_pred_tmp)\n            if not valid_flag_expand.all():\n                kpt_pred_tmp = masked_fill(kpt_pred_tmp, ~valid_flag_expand, 0)\n            cost = F.pairwise_distance(\n                kpt_pred_tmp.reshape((kpt_pred_tmp.shape[0], -1)),\n                gt_keypoints[i].reshape((-1, )).unsqueeze(0),\n                p=1,\n                keepdim=True)\n            avg_factor = paddle.clip(\n                valid_flag.astype('float32').sum() * 2, 1.0)\n            cost = cost / avg_factor\n            kpt_cost.append(cost)\n        kpt_cost = paddle.concat(kpt_cost, axis=1)\n        return kpt_cost * self.weight\n\n\n@register\nclass OksCost(object):\n    \"\"\"OksCost.\n\n    this function based on: https://github.com/hikvision-research/opera/blob/main/opera/core/bbox/match_costs/match_cost.py\n\n    Args:\n        num_keypoints (int): number of keypoints\n        weight (int | float, optional): loss_weight.\n    \"\"\"\n\n    def __init__(self, num_keypoints=17, weight=1.0):\n        self.weight = weight\n        if num_keypoints == 17:\n            self.sigmas = np.array(\n                [\n                    .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07,\n                    1.07, .87, .87, .89, .89\n                ],\n                dtype=np.float32) / 10.0\n        elif num_keypoints == 14:\n            self.sigmas = np.array(\n                [\n                    .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89,\n                    .89, .79, .79\n                ],\n                dtype=np.float32) / 10.0\n        else:\n            raise ValueError(f'Unsupported keypoints number {num_keypoints}')\n\n    def __call__(self, kpt_pred, gt_keypoints, valid_kpt_flag, gt_areas):\n        \"\"\"\n        Args:\n            kpt_pred (Tensor): Predicted keypoints with unnormalized\n                coordinates (x_{i}, y_{i}). Shape [num_query, K, 2].\n            gt_keypoints (Tensor): Ground truth keypoints with unnormalized\n                coordinates (x_{i}, y_{i}). Shape [num_gt, K, 2].\n            valid_kpt_flag (Tensor): valid flag of ground truth keypoints.\n                Shape [num_gt, K].\n            gt_areas (Tensor): Ground truth mask areas. 
Shape [num_gt,].\n\n        Returns:\n            paddle.Tensor: oks_cost value with weight.\n        \"\"\"\n        sigmas = paddle.to_tensor(self.sigmas)\n        variances = (sigmas * 2)**2\n\n        oks_cost = []\n        assert len(gt_keypoints) == len(gt_areas)\n        for i in range(len(gt_keypoints)):\n            if gt_keypoints[i].size == 0:\n                oks_cost.append(kpt_pred.sum() * 0)\n            squared_distance = \\\n                (kpt_pred[:, :, 0] - gt_keypoints[i, :, 0].unsqueeze(0)) ** 2 + \\\n                (kpt_pred[:, :, 1] - gt_keypoints[i, :, 1].unsqueeze(0)) ** 2\n            vis_flag = (valid_kpt_flag[i] > 0).astype('int')\n            vis_ind = vis_flag.nonzero(as_tuple=False)[:, 0]\n            num_vis_kpt = vis_ind.shape[0]\n            # assert num_vis_kpt > 0\n            if num_vis_kpt == 0:\n                oks_cost.append(paddle.zeros((squared_distance.shape[0], 1)))\n                continue\n            area = gt_areas[i]\n\n            squared_distance0 = squared_distance / (area * variances * 2)\n            squared_distance0 = paddle.index_select(\n                squared_distance0, vis_ind, axis=1)\n            squared_distance1 = paddle.exp(-squared_distance0).sum(axis=1,\n                                                                   keepdim=True)\n            oks = squared_distance1 / num_vis_kpt\n            # The 1 is a constant that doesn't change the matching, so omitted.\n            oks_cost.append(-oks)\n        oks_cost = paddle.concat(oks_cost, axis=1)\n        return oks_cost * self.weight\n\n\n@register\nclass ClassificationCost:\n    \"\"\"ClsSoftmaxCost.\n\n     Args:\n         weight (int | float, optional): loss_weight\n    \"\"\"\n\n    def __init__(self, weight=1.):\n        self.weight = weight\n\n    def __call__(self, cls_pred, gt_labels):\n        \"\"\"\n        Args:\n            cls_pred (Tensor): Predicted classification logits, shape\n                (num_query, num_class).\n            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).\n\n        Returns:\n            paddle.Tensor: cls_cost value with weight\n        \"\"\"\n        # Following the official DETR repo, contrary to the loss that\n        # NLL is used, we approximate it in 1 - cls_score[gt_label].\n        # The 1 is a constant that doesn't change the matching,\n        # so it can be omitted.\n        cls_score = cls_pred.softmax(-1)\n        cls_cost = -cls_score[:, gt_labels]\n        return cls_cost * self.weight\n\n\n@register\nclass FocalLossCost:\n    \"\"\"FocalLossCost.\n\n     Args:\n         weight (int | float, optional): loss_weight\n         alpha (int | float, optional): focal_loss alpha\n         gamma (int | float, optional): focal_loss gamma\n         eps (float, optional): default 1e-12\n         binary_input (bool, optional): Whether the input is binary,\n            default False.\n    \"\"\"\n\n    def __init__(self,\n                 weight=1.,\n                 alpha=0.25,\n                 gamma=2,\n                 eps=1e-12,\n                 binary_input=False):\n        self.weight = weight\n        self.alpha = alpha\n        self.gamma = gamma\n        self.eps = eps\n        self.binary_input = binary_input\n\n    def _focal_loss_cost(self, cls_pred, gt_labels):\n        \"\"\"\n        Args:\n            cls_pred (Tensor): Predicted classification logits, shape\n                (num_query, num_class).\n            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).\n\n        Returns:\n      
      paddle.Tensor: cls_cost value with weight\n        \"\"\"\n        if gt_labels.size == 0:\n            return cls_pred.sum() * 0\n        cls_pred = F.sigmoid(cls_pred)\n        neg_cost = -(1 - cls_pred + self.eps).log() * (\n            1 - self.alpha) * cls_pred.pow(self.gamma)\n        pos_cost = -(cls_pred + self.eps).log() * self.alpha * (\n            1 - cls_pred).pow(self.gamma)\n\n        cls_cost = paddle.index_select(\n            pos_cost, gt_labels, axis=1) - paddle.index_select(\n                neg_cost, gt_labels, axis=1)\n        return cls_cost * self.weight\n\n    def _mask_focal_loss_cost(self, cls_pred, gt_labels):\n        \"\"\"\n        Args:\n            cls_pred (Tensor): Predicted classification logits\n                in shape (num_query, d1, ..., dn), dtype=paddle.float32.\n            gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn),\n                dtype=paddle.int64. Labels should be binary.\n\n        Returns:\n            Tensor: Focal cost matrix with weight in shape\\\n                (num_query, num_gt).\n        \"\"\"\n        cls_pred = cls_pred.flatten(1)\n        gt_labels = gt_labels.flatten(1).float()\n        n = cls_pred.shape[1]\n        cls_pred = F.sigmoid(cls_pred)\n        neg_cost = -(1 - cls_pred + self.eps).log() * (\n            1 - self.alpha) * cls_pred.pow(self.gamma)\n        pos_cost = -(cls_pred + self.eps).log() * self.alpha * (\n            1 - cls_pred).pow(self.gamma)\n\n        cls_cost = paddle.einsum('nc,mc->nm', pos_cost, gt_labels) + \\\n            paddle.einsum('nc,mc->nm', neg_cost, (1 - gt_labels))\n        return cls_cost / n * self.weight\n\n    def __call__(self, cls_pred, gt_labels):\n        \"\"\"\n        Args:\n            cls_pred (Tensor): Predicted classification logits.\n            gt_labels (Tensor): Labels.\n\n        Returns:\n            Tensor: Focal cost matrix with weight in shape\\\n                (num_query, num_gt).\n        \"\"\"\n        if self.binary_input:\n            return self._mask_focal_loss_cost(cls_pred, gt_labels)\n        else:\n            return self._focal_loss_cost(cls_pred, gt_labels)\n"
  },
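The OKS term used by `OksCost` is easy to reproduce standalone. A toy computation with hypothetical keypoints, mirroring the formula in `OksCost.__call__` for one prediction/gt pair with all 17 COCO joints visible:

```python
# Toy OKS: mean over visible joints of exp(-d^2 / (2 * area * (2*sigma)^2)).
import numpy as np

sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62,
                   1.07, 1.07, .87, .87, .89, .89], dtype=np.float32) / 10.0
variances = (sigmas * 2) ** 2

rng = np.random.default_rng(0)
gt = rng.uniform(0, 100, size=(17, 2))      # hypothetical gt keypoints
pred = gt + rng.normal(0, 2, size=(17, 2))  # near-perfect prediction
area = 60.0 * 80.0                          # hypothetical instance area

squared_distance = ((pred - gt) ** 2).sum(axis=-1)
oks = np.exp(-squared_distance / (area * variances * 2)).mean()
print(oks)  # OksCost returns -oks * weight as the matching cost
```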
  {
    "path": "ppdet/modeling/assigners/rotated_task_aligned_assigner.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\n\nfrom ppdet.core.workspace import register\nfrom ..rbox_utils import rotated_iou_similarity, check_points_in_rotated_boxes\nfrom .utils import gather_topk_anchors, compute_max_iou_anchor\n\n__all__ = ['RotatedTaskAlignedAssigner']\n\n\n@register\nclass RotatedTaskAlignedAssigner(nn.Layer):\n    \"\"\"TOOD: Task-aligned One-stage Object Detection\n    \"\"\"\n\n    def __init__(self, topk=13, alpha=1.0, beta=6.0, eps=1e-9):\n        super(RotatedTaskAlignedAssigner, self).__init__()\n        self.topk = topk\n        self.alpha = alpha\n        self.beta = beta\n        self.eps = eps\n\n    @paddle.no_grad()\n    def forward(self,\n                pred_scores,\n                pred_bboxes,\n                anchor_points,\n                num_anchors_list,\n                gt_labels,\n                gt_bboxes,\n                pad_gt_mask,\n                bg_index,\n                gt_scores=None):\n        r\"\"\"This code is based on\n            https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py\n\n        The assignment is done in following steps\n        1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt\n        2. select top-k bbox as candidates for each gt\n        3. limit the positive sample's center in gt (because the anchor-free detector\n           only can predict positive distance)\n        4. 
if an anchor box is assigned to multiple gts, the one with the\n           highest iou will be selected.\n        Args:\n            pred_scores (Tensor, float32): predicted class probability, shape(B, L, C)\n            pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 5)\n            anchor_points (Tensor, float32): pre-defined anchors, shape(1, L, 2), \"cxcy\" format\n            num_anchors_list (List): num of anchors in each level, shape(L)\n            gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)\n            gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 5)\n            pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)\n            bg_index (int): background index\n            gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1)\n        Returns:\n            assigned_labels (Tensor): (B, L)\n            assigned_bboxes (Tensor): (B, L, 5)\n            assigned_scores (Tensor): (B, L, C)\n        \"\"\"\n        assert pred_scores.ndim == pred_bboxes.ndim\n        assert gt_labels.ndim == gt_bboxes.ndim and \\\n               gt_bboxes.ndim == 3\n\n        batch_size, num_anchors, num_classes = pred_scores.shape\n        _, num_max_boxes, _ = gt_bboxes.shape\n\n        # negative batch\n        if num_max_boxes == 0:\n            assigned_labels = paddle.full(\n                [batch_size, num_anchors], bg_index, dtype=gt_labels.dtype)\n            assigned_bboxes = paddle.zeros([batch_size, num_anchors, 5])\n            assigned_scores = paddle.zeros(\n                [batch_size, num_anchors, num_classes])\n            return assigned_labels, assigned_bboxes, assigned_scores\n\n        # compute iou between gt and pred bbox, [B, n, L]\n        ious = rotated_iou_similarity(gt_bboxes, pred_bboxes)\n        ious = paddle.where(ious > 1 + self.eps, paddle.zeros_like(ious), ious)\n        ious.stop_gradient = True\n        # gather pred bboxes class score\n        pred_scores = pred_scores.transpose([0, 2, 1])\n        batch_ind = paddle.arange(\n            end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1)\n        gt_labels_ind = paddle.stack(\n            [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)],\n            axis=-1)\n        bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind)\n        # compute alignment metrics, [B, n, L]\n        alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow(\n            self.beta)\n\n        # check the positive sample's center in gt, [B, n, L]\n        is_in_gts = check_points_in_rotated_boxes(anchor_points, gt_bboxes)\n\n        # select topk largest alignment metrics pred bbox as candidates\n        # for each gt, [B, n, L]\n        is_in_topk = gather_topk_anchors(\n            alignment_metrics * is_in_gts.astype(alignment_metrics.dtype), self.topk, topk_mask=pad_gt_mask)\n\n        # select positive sample, [B, n, L]\n        mask_positive = is_in_topk * is_in_gts.astype(is_in_topk.dtype) * pad_gt_mask\n\n        # if an anchor box is assigned to multiple gts,\n        # the one with the highest iou will be selected, [B, n, L]\n        mask_positive_sum = mask_positive.sum(axis=-2)\n        if mask_positive_sum.max() > 1:\n            mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile(\n                [1, num_max_boxes, 1])\n            is_max_iou = compute_max_iou_anchor(ious)\n            mask_positive = paddle.where(mask_multiple_gts, is_max_iou,\n                                         
mask_positive)\n            mask_positive_sum = mask_positive.sum(axis=-2)\n        assigned_gt_index = mask_positive.argmax(axis=-2)\n\n        # assigned target\n        assigned_gt_index = assigned_gt_index + (batch_ind * num_max_boxes).astype(assigned_gt_index.dtype)\n        assigned_labels = paddle.gather(\n            gt_labels.flatten(), assigned_gt_index.flatten(), axis=0)\n        assigned_labels = assigned_labels.reshape([batch_size, num_anchors])\n        assigned_labels = paddle.where(\n            mask_positive_sum > 0, assigned_labels,\n            paddle.full_like(assigned_labels, bg_index))\n\n        assigned_bboxes = paddle.gather(\n            gt_bboxes.reshape([-1, 5]), assigned_gt_index.flatten(), axis=0)\n        assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 5])\n\n        assigned_scores = F.one_hot(assigned_labels, num_classes + 1)\n        ind = list(range(num_classes + 1))\n        ind.remove(bg_index)\n        assigned_scores = paddle.index_select(\n            assigned_scores, paddle.to_tensor(ind), axis=-1)\n        # rescale alignment metrics\n        alignment_metrics *= mask_positive\n        max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True)\n        max_ious_per_instance = (ious * mask_positive).max(axis=-1,\n                                                           keepdim=True)\n        alignment_metrics = alignment_metrics / (\n            max_metrics_per_instance + self.eps) * max_ious_per_instance\n        alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1)\n        assigned_scores = assigned_scores * alignment_metrics\n\n        assigned_bboxes.stop_gradient = True\n        assigned_scores.stop_gradient = True\n        assigned_labels.stop_gradient = True\n        return assigned_labels, assigned_bboxes, assigned_scores\n"
  },
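The ranking signal in this assigner is the task-alignment metric. A toy NumPy sketch with hypothetical scores, using the class defaults:

```python
# Toy alignment metric from RotatedTaskAlignedAssigner:
# metric = cls_score**alpha * iou**beta with defaults alpha=1.0, beta=6.0,
# so localization quality dominates the candidate ranking.
import numpy as np

alpha, beta = 1.0, 6.0
cls_score = np.array([0.80, 0.60, 0.90])   # predicted class probability
iou = np.array([0.70, 0.90, 0.50])         # rotated IoU with the gt
metric = cls_score ** alpha * iou ** beta
print(metric.round(4), metric.argmax())    # the well-localized anchor wins
```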
  {
    "path": "ppdet/modeling/assigners/simota_assigner.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# The code is based on:\n# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/assigners/sim_ota_assigner.py\n\nimport paddle\nimport numpy as np\nimport paddle.nn.functional as F\n\nfrom ppdet.modeling.losses.varifocal_loss import varifocal_loss\nfrom ppdet.modeling.bbox_utils import batch_bbox_overlaps\nfrom ppdet.core.workspace import register\n\n\n@register\nclass SimOTAAssigner(object):\n    \"\"\"Computes matching between predictions and ground truth.\n    Args:\n        center_radius (int | float, optional): Ground truth center size\n            to judge whether a prior is in center. Default 2.5.\n        candidate_topk (int, optional): The candidate top-k which used to\n            get top-k ious to calculate dynamic-k. Default 10.\n        iou_weight (int | float, optional): The scale factor for regression\n            iou cost. Default 3.0.\n        cls_weight (int | float, optional): The scale factor for classification\n            cost. Default 1.0.\n        num_classes (int): The num_classes of dataset.\n        use_vfl (int): Whether to use varifocal_loss when calculating the cost matrix.\n    \"\"\"\n    __shared__ = ['num_classes']\n\n    def __init__(self,\n                 center_radius=2.5,\n                 candidate_topk=10,\n                 iou_weight=3.0,\n                 cls_weight=1.0,\n                 num_classes=80,\n                 use_vfl=True):\n        self.center_radius = center_radius\n        self.candidate_topk = candidate_topk\n        self.iou_weight = iou_weight\n        self.cls_weight = cls_weight\n        self.num_classes = num_classes\n        self.use_vfl = use_vfl\n\n    def get_in_gt_and_in_center_info(self, flatten_center_and_stride,\n                                     gt_bboxes):\n        num_gt = gt_bboxes.shape[0]\n\n        flatten_x = flatten_center_and_stride[:, 0].unsqueeze(1).tile(\n            [1, num_gt])\n        flatten_y = flatten_center_and_stride[:, 1].unsqueeze(1).tile(\n            [1, num_gt])\n        flatten_stride_x = flatten_center_and_stride[:, 2].unsqueeze(1).tile(\n            [1, num_gt])\n        flatten_stride_y = flatten_center_and_stride[:, 3].unsqueeze(1).tile(\n            [1, num_gt])\n\n        # is prior centers in gt bboxes, shape: [n_center, n_gt]\n        l_ = flatten_x - gt_bboxes[:, 0]\n        t_ = flatten_y - gt_bboxes[:, 1]\n        r_ = gt_bboxes[:, 2] - flatten_x\n        b_ = gt_bboxes[:, 3] - flatten_y\n\n        deltas = paddle.stack([l_, t_, r_, b_], axis=1)\n        is_in_gts = deltas.min(axis=1) > 0\n        is_in_gts_all = is_in_gts.sum(axis=1) > 0\n\n        # is prior centers in gt centers\n        gt_center_xs = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0\n        gt_center_ys = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0\n        ct_bound_l = gt_center_xs - self.center_radius * flatten_stride_x\n        ct_bound_t = gt_center_ys - 
self.center_radius * flatten_stride_y\n        ct_bound_r = gt_center_xs + self.center_radius * flatten_stride_x\n        ct_bound_b = gt_center_ys + self.center_radius * flatten_stride_y\n\n        cl_ = flatten_x - ct_bound_l\n        ct_ = flatten_y - ct_bound_t\n        cr_ = ct_bound_r - flatten_x\n        cb_ = ct_bound_b - flatten_y\n\n        ct_deltas = paddle.stack([cl_, ct_, cr_, cb_], axis=1)\n        is_in_cts = ct_deltas.min(axis=1) > 0\n        is_in_cts_all = is_in_cts.sum(axis=1) > 0\n\n        # in any of gts or gt centers, shape: [n_center]\n        is_in_gts_or_centers_all = paddle.logical_or(is_in_gts_all,\n                                                     is_in_cts_all)\n\n        is_in_gts_or_centers_all_inds = paddle.nonzero(\n            is_in_gts_or_centers_all).squeeze(1)\n\n        # both in gts and gt centers, shape: [num_fg, num_gt]\n        is_in_gts_and_centers = paddle.logical_and(\n            paddle.gather(\n                is_in_gts.cast('int'), is_in_gts_or_centers_all_inds,\n                axis=0).cast('bool'),\n            paddle.gather(\n                is_in_cts.cast('int'), is_in_gts_or_centers_all_inds,\n                axis=0).cast('bool'))\n        return is_in_gts_or_centers_all, is_in_gts_or_centers_all_inds, is_in_gts_and_centers\n\n    def dynamic_k_matching(self, cost_matrix, pairwise_ious, num_gt):\n        match_matrix = np.zeros_like(cost_matrix.numpy())\n        # select candidate topk ious for dynamic-k calculation\n        topk_ious, _ = paddle.topk(\n            pairwise_ious,\n            min(self.candidate_topk, pairwise_ious.shape[0]),\n            axis=0)\n        # calculate dynamic k for each gt\n        dynamic_ks = paddle.clip(topk_ious.sum(0).cast('int'), min=1)\n        for gt_idx in range(num_gt):\n            _, pos_idx = paddle.topk(\n                cost_matrix[:, gt_idx], k=dynamic_ks[gt_idx], largest=False)\n            match_matrix[:, gt_idx][pos_idx.numpy()] = 1.0\n\n        del topk_ious, dynamic_ks, pos_idx\n\n        # a point matched to more than one gt keeps only its lowest-cost gt\n        extra_match_gts_mask = match_matrix.sum(1) > 1\n        if extra_match_gts_mask.sum() > 0:\n            cost_matrix = cost_matrix.numpy()\n            cost_argmin = np.argmin(\n                cost_matrix[extra_match_gts_mask, :], axis=1)\n            match_matrix[extra_match_gts_mask, :] *= 0.0\n            match_matrix[extra_match_gts_mask, cost_argmin] = 1.0\n        # get foreground mask\n        match_fg_mask_inmatrix = match_matrix.sum(1) > 0\n        match_gt_inds_to_fg = match_matrix[match_fg_mask_inmatrix, :].argmax(1)\n\n        return match_gt_inds_to_fg, match_fg_mask_inmatrix\n\n    def get_sample(self, assign_gt_inds, gt_bboxes):\n        pos_inds = np.unique(np.nonzero(assign_gt_inds > 0)[0])\n        neg_inds = np.unique(np.nonzero(assign_gt_inds == 0)[0])\n        pos_assigned_gt_inds = assign_gt_inds[pos_inds] - 1\n\n        if gt_bboxes.size == 0:\n            # hack for index error case\n            assert pos_assigned_gt_inds.size == 0\n            pos_gt_bboxes = np.empty_like(gt_bboxes).reshape(-1, 4)\n        else:\n            if len(gt_bboxes.shape) < 2:\n                # numpy's ndarray.resize rejects -1 dims; reshape is intended\n                gt_bboxes = gt_bboxes.reshape(-1, 4)\n            pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :]\n        return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds\n\n    def __call__(self,\n                 flatten_cls_pred_scores,\n                 flatten_center_and_stride,\n                 flatten_bboxes,\n                 gt_bboxes,\n                
 gt_labels,\n                 eps=1e-7):\n        \"\"\"Assign gt to priors using SimOTA.\n        Args:\n            flatten_cls_pred_scores (Tensor): class scores of all priors, shape(L, C)\n            flatten_center_and_stride (Tensor): center xy and stride xy of all priors, shape(L, 4)\n            flatten_bboxes (Tensor): decoded bboxes of all priors, shape(L, 4)\n            gt_bboxes (Tensor): ground truth bboxes, shape(n, 4)\n            gt_labels (Tensor): labels of gt_bboxes, shape(n, 1)\n            eps (float): value added inside log() for numerical stability. Default: 1e-7\n        Returns:\n            pos_num, label, label_weight and bbox_target of the assignment.\n        \"\"\"\n        num_gt = gt_bboxes.shape[0]\n        num_bboxes = flatten_bboxes.shape[0]\n\n        if num_gt == 0 or num_bboxes == 0:\n            # No ground truth or boxes\n            label = np.ones([num_bboxes], dtype=np.int64) * self.num_classes\n            label_weight = np.ones([num_bboxes], dtype=np.float32)\n            bbox_target = np.zeros_like(flatten_center_and_stride)\n            return 0, label, label_weight, bbox_target\n\n        is_in_gts_or_centers_all, is_in_gts_or_centers_all_inds, is_in_boxes_and_center = self.get_in_gt_and_in_center_info(\n            flatten_center_and_stride, gt_bboxes)\n\n        # bboxes and scores to calculate matrix\n        valid_flatten_bboxes = flatten_bboxes[is_in_gts_or_centers_all_inds]\n        valid_cls_pred_scores = flatten_cls_pred_scores[\n            is_in_gts_or_centers_all_inds]\n        num_valid_bboxes = valid_flatten_bboxes.shape[0]\n\n        pairwise_ious = batch_bbox_overlaps(valid_flatten_bboxes,\n                                            gt_bboxes)  # [num_points,num_gts]\n        if self.use_vfl:\n            gt_vfl_labels = gt_labels.squeeze(-1).unsqueeze(0).tile(\n                [num_valid_bboxes, 1]).reshape([-1])\n            valid_pred_scores = valid_cls_pred_scores.unsqueeze(1).tile(\n                [1, num_gt, 1]).reshape([-1, self.num_classes])\n            vfl_score = np.zeros(valid_pred_scores.shape)\n            vfl_score[np.arange(0, vfl_score.shape[0]), gt_vfl_labels.numpy(\n            )] = pairwise_ious.reshape([-1])\n            vfl_score = paddle.to_tensor(vfl_score)\n            losses_vfl = varifocal_loss(\n                valid_pred_scores, vfl_score,\n                use_sigmoid=False).reshape([num_valid_bboxes, num_gt])\n            losses_giou = batch_bbox_overlaps(\n                valid_flatten_bboxes, gt_bboxes, mode='giou')\n            cost_matrix = (\n                losses_vfl * self.cls_weight + losses_giou * self.iou_weight +\n                paddle.logical_not(is_in_boxes_and_center).cast('float32') *\n                100000000)\n        else:\n            iou_cost = -paddle.log(pairwise_ious + eps)\n            gt_onehot_label = (F.one_hot(\n                gt_labels.squeeze(-1).cast(paddle.int64),\n                flatten_cls_pred_scores.shape[-1]).cast('float32').unsqueeze(0)\n                               .tile([num_valid_bboxes, 1, 1]))\n\n            valid_pred_scores = valid_cls_pred_scores.unsqueeze(1).tile(\n                [1, num_gt, 1])\n            cls_cost = F.binary_cross_entropy(\n                valid_pred_scores, gt_onehot_label, reduction='none').sum(-1)\n\n            cost_matrix = (\n                cls_cost * self.cls_weight + iou_cost * self.iou_weight +\n                paddle.logical_not(is_in_boxes_and_center).cast('float32') *\n                100000000)\n\n        match_gt_inds_to_fg, match_fg_mask_inmatrix = \\\n            self.dynamic_k_matching(\n                cost_matrix, pairwise_ious, num_gt)\n\n        # sample and assign results\n        assigned_gt_inds = np.zeros([num_bboxes], dtype=np.int64)\n        match_fg_mask_inall = np.zeros_like(assigned_gt_inds)\n        match_fg_mask_inall[is_in_gts_or_centers_all.numpy(\n        )] = match_fg_mask_inmatrix\n\n        assigned_gt_inds[match_fg_mask_inall.astype(\n            np.bool_)] = 
match_gt_inds_to_fg + 1\n\n        pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds \\\n            = self.get_sample(assigned_gt_inds, gt_bboxes.numpy())\n\n        bbox_target = np.zeros(flatten_bboxes.shape, paddle.common_ops_import.convert_dtype(flatten_bboxes.dtype))\n        bbox_weight = np.zeros_like(bbox_target)\n        label = np.ones([num_bboxes], dtype=np.int64) * self.num_classes\n        label_weight = np.zeros([num_bboxes], dtype=np.float32)\n\n        if len(pos_inds) > 0:\n            gt_labels = gt_labels.numpy()\n            pos_bbox_targets = pos_gt_bboxes\n            bbox_target[pos_inds, :] = pos_bbox_targets\n            bbox_weight[pos_inds, :] = 1.0\n            if not np.any(gt_labels):\n                label[pos_inds] = 0\n            else:\n                label[pos_inds] = gt_labels.squeeze(-1)[pos_assigned_gt_inds]\n\n            label_weight[pos_inds] = 1.0\n        if len(neg_inds) > 0:\n            label_weight[neg_inds] = 1.0\n\n        pos_num = max(pos_inds.size, 1)\n\n        return pos_num, label, label_weight, bbox_target\n"
  },
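  {
    "path": "ppdet/modeling/assigners/examples/sim_ota_dynamic_k_sketch.py",
    "content": "# Hypothetical companion sketch, NOT a PaddleDetection source file.\n# It restates the matching rule of `dynamic_k_matching` above in plain\n# NumPy so the dynamic-k logic is easy to inspect: for each gt, k is the\n# clipped sum of its top-`candidate_topk` IoUs, the k lowest-cost priors\n# become its positives, and a prior claimed by several gts keeps only\n# the gt with the lowest cost. The file name, function name and all toy\n# shapes/values here are assumptions made for illustration.\nimport numpy as np\n\n\ndef dynamic_k_matching_np(cost, ious, candidate_topk=10):\n    num_priors, num_gt = cost.shape\n    match = np.zeros_like(cost)\n    topk = min(candidate_topk, num_priors)\n    # dynamic k per gt: sum of its top-k IoUs, at least 1\n    topk_ious = -np.sort(-ious, axis=0)[:topk]\n    dynamic_ks = np.maximum(topk_ious.sum(0).astype(int), 1)\n    for gt_idx in range(num_gt):\n        pos = np.argsort(cost[:, gt_idx])[:dynamic_ks[gt_idx]]\n        match[pos, gt_idx] = 1.0\n    # a prior matched to more than one gt keeps only its cheapest gt\n    multi = match.sum(1) > 1\n    if multi.any():\n        best = cost[multi].argmin(1)\n        match[multi] = 0.0\n        match[multi, best] = 1.0\n    return match\n\n\nif __name__ == '__main__':\n    rng = np.random.default_rng(0)\n    ious = rng.uniform(size=(50, 3))\n    cost = -np.log(ious + 1e-7)\n    match = dynamic_k_matching_np(cost, ious)\n    # every prior now belongs to at most one gt\n    print('positives per gt:', match.sum(0))\n"
  },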
  {
    "path": "ppdet/modeling/assigners/task_aligned_assigner.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\n\nfrom ppdet.core.workspace import register\nfrom ..bbox_utils import batch_iou_similarity\nfrom .utils import (gather_topk_anchors, check_points_inside_bboxes,\n                    compute_max_iou_anchor)\n\n__all__ = ['TaskAlignedAssigner']\n\n\ndef is_close_gt(anchor, gt, stride_lst, max_dist=2.0, alpha=2.):\n    \"\"\"Calculate distance ratio of box1 and box2 in batch for larger stride\n        anchors dist/stride to promote the survive of large distance match\n    Args:\n        anchor (Tensor): box with the shape [L, 2]\n        gt (Tensor): box with the shape [N, M2, 4]\n    Return:\n        dist (Tensor): dist ratio between box1 and box2 with the shape [N, M1, M2]\n    \"\"\"\n    center1 = anchor.unsqueeze(0)\n    center2 = (gt[..., :2] + gt[..., -2:]) / 2.\n    center1 = center1.unsqueeze(1)  # [N, M1, 2] -> [N, 1, M1, 2]\n    center2 = center2.unsqueeze(2)  # [N, M2, 2] -> [N, M2, 1, 2]\n\n    stride = paddle.concat([\n        paddle.full([x], 32 / pow(2, idx)) for idx, x in enumerate(stride_lst)\n    ]).unsqueeze(0).unsqueeze(0)\n    dist = paddle.linalg.norm(center1 - center2, p=2, axis=-1) / stride\n    dist_ratio = dist\n    dist_ratio[dist < max_dist] = 1.\n    dist_ratio[dist >= max_dist] = 0.\n    return dist_ratio\n\n\n@register\nclass TaskAlignedAssigner(nn.Layer):\n    \"\"\"TOOD: Task-aligned One-stage Object Detection\n    \"\"\"\n\n    def __init__(self,\n                 topk=13,\n                 alpha=1.0,\n                 beta=6.0,\n                 eps=1e-9,\n                 is_close_gt=False):\n        super(TaskAlignedAssigner, self).__init__()\n        self.topk = topk\n        self.alpha = alpha\n        self.beta = beta\n        self.eps = eps\n        self.is_close_gt = is_close_gt\n\n    @paddle.no_grad()\n    def forward(self,\n                pred_scores,\n                pred_bboxes,\n                anchor_points,\n                num_anchors_list,\n                gt_labels,\n                gt_bboxes,\n                pad_gt_mask,\n                bg_index,\n                gt_segms=None,\n                gt_scores=None):\n        r\"\"\"This code is based on\n            https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py\n\n        The assignment is done in following steps\n        1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt\n        2. select top-k bbox as candidates for each gt\n        3. limit the positive sample's center in gt (because the anchor-free detector\n           only can predict positive distance)\n        4. 
if an anchor box is assigned to multiple gts, the one with the\n           highest iou will be selected.\n        Args:\n            pred_scores (Tensor, float32): predicted class probability, shape(B, L, C)\n            pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4)\n            anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), \"cxcy\" format\n            num_anchors_list (List): num of anchors in each level, shape(L)\n            gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)\n            gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4)\n            pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)\n            bg_index (int): background index\n            gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1)\n        Returns:\n            assigned_labels (Tensor): (B, L)\n            assigned_bboxes (Tensor): (B, L, 4)\n            assigned_scores (Tensor): (B, L, C)\n        \"\"\"\n        assert pred_scores.ndim == pred_bboxes.ndim\n        assert gt_labels.ndim == gt_bboxes.ndim and \\\n               gt_bboxes.ndim == 3\n\n        batch_size, num_anchors, num_classes = pred_scores.shape\n        _, num_max_boxes, _ = gt_bboxes.shape\n\n        # negative batch\n        if num_max_boxes == 0:\n            assigned_labels = paddle.full(\n                [batch_size, num_anchors], bg_index, dtype='int32')\n            assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4])\n            assigned_scores = paddle.zeros(\n                [batch_size, num_anchors, num_classes])\n            return assigned_labels, assigned_bboxes, assigned_scores\n\n        # compute iou between gt and pred bbox, [B, n, L]\n        ious = batch_iou_similarity(gt_bboxes, pred_bboxes)\n        # gather pred bboxes class score\n        pred_scores = pred_scores.transpose([0, 2, 1])\n        batch_ind = paddle.arange(\n            end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1)\n        gt_labels_ind = paddle.stack(\n            [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)],\n            axis=-1)\n        bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind)\n        # compute alignment metrics, [B, n, L]\n        alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow(\n            self.beta)\n\n        # check the positive sample's center in gt, [B, n, L]\n        if self.is_close_gt:\n            is_in_gts = is_close_gt(anchor_points, gt_bboxes, num_anchors_list)\n        else:\n            is_in_gts = check_points_inside_bboxes(anchor_points, gt_bboxes)\n\n        # select topk largest alignment metrics pred bbox as candidates\n        # for each gt, [B, n, L]\n        is_in_topk = gather_topk_anchors(\n            alignment_metrics * is_in_gts, self.topk, topk_mask=pad_gt_mask)\n\n        # select positive sample, [B, n, L]\n        mask_positive = is_in_topk * is_in_gts * pad_gt_mask\n\n        # if an anchor box is assigned to multiple gts,\n        # the one with the highest iou will be selected, [B, n, L]\n        mask_positive_sum = mask_positive.sum(axis=-2)\n        if mask_positive_sum.max() > 1:\n            mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile(\n                [1, num_max_boxes, 1])\n            is_max_iou = compute_max_iou_anchor(ious)\n            mask_positive = paddle.where(mask_multiple_gts, is_max_iou,\n                                         mask_positive)\n            mask_positive_sum = 
mask_positive.sum(axis=-2)\n        assigned_gt_index = mask_positive.argmax(axis=-2)\n\n        # assigned target\n        assigned_gt_index = assigned_gt_index + (batch_ind * num_max_boxes).astype(assigned_gt_index.dtype)\n        assigned_labels = paddle.gather(\n            gt_labels.flatten(), assigned_gt_index.flatten(), axis=0)\n        assigned_labels = assigned_labels.reshape([batch_size, num_anchors])\n        assigned_labels = paddle.where(\n            mask_positive_sum > 0, assigned_labels,\n            paddle.full_like(assigned_labels, bg_index))\n\n        assigned_bboxes = paddle.gather(\n            gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0)\n        assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])\n\n        assigned_scores = F.one_hot(assigned_labels, num_classes + 1)\n        ind = list(range(num_classes + 1))\n        ind.remove(bg_index)\n        assigned_scores = paddle.index_select(\n            assigned_scores, paddle.to_tensor(ind), axis=-1)\n        # rescale alignment metrics\n        alignment_metrics *= mask_positive\n        max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True)\n        max_ious_per_instance = (ious * mask_positive).max(axis=-1,\n                                                           keepdim=True)\n        alignment_metrics = alignment_metrics / (\n            max_metrics_per_instance + self.eps) * max_ious_per_instance\n        alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1)\n        assigned_scores = assigned_scores * alignment_metrics\n\n        if gt_segms is not None:\n            return assigned_labels, assigned_bboxes, assigned_scores, assigned_gt_index\n        else:\n            return assigned_labels, assigned_bboxes, assigned_scores\n"
  },
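  {
    "path": "ppdet/modeling/assigners/examples/task_aligned_assigner_demo.py",
    "content": "# Hypothetical companion sketch, NOT a PaddleDetection source file.\n# A minimal toy call of the `TaskAlignedAssigner` defined above, assuming\n# ppdet is importable; shapes follow the forward() docstring and every\n# value is a random placeholder.\nimport paddle\n\nfrom ppdet.modeling.assigners.task_aligned_assigner import TaskAlignedAssigner\n\nB, L, C, n = 2, 64, 4, 3  # batch, anchors, classes, padded gts per image\nassigner = TaskAlignedAssigner(topk=9)\n\npred_scores = paddle.rand([B, L, C])  # class probabilities in [0, 1]\npxy = paddle.rand([B, L, 2]) * 32.\npred_bboxes = paddle.concat([pxy, pxy + 8.], axis=-1)  # xyxy\nanchor_points = paddle.rand([L, 2]) * 40.  # 'cxcy' centers\nnum_anchors_list = [L]  # single pyramid level in this toy setup\ngt_labels = paddle.randint(0, C, shape=[B, n, 1])\ngxy = paddle.rand([B, n, 2]) * 24.\ngt_bboxes = paddle.concat([gxy, gxy + 12.], axis=-1)  # xyxy\npad_gt_mask = paddle.ones([B, n, 1])\n\nlabels, bboxes, scores = assigner(\n    pred_scores, pred_bboxes, anchor_points, num_anchors_list,\n    gt_labels, gt_bboxes, pad_gt_mask, bg_index=C)\n# anchors left unmatched come back as bg_index with zeroed scores\nprint(labels.shape, bboxes.shape, scores.shape)  # (B, L), (B, L, 4), (B, L, C)\n"
  },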
  {
    "path": "ppdet/modeling/assigners/task_aligned_assigner_cr.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\n\nfrom ppdet.core.workspace import register\nfrom ..bbox_utils import batch_iou_similarity\nfrom .utils import (gather_topk_anchors, check_points_inside_bboxes,\n                    compute_max_iou_anchor)\n\n__all__ = ['TaskAlignedAssigner_CR']\n\n\n@register\nclass TaskAlignedAssigner_CR(nn.Layer):\n    \"\"\"TOOD: Task-aligned One-stage Object Detection with Center R\n    \"\"\"\n\n    def __init__(self,\n                 topk=13,\n                 alpha=1.0,\n                 beta=6.0,\n                 center_radius=None,\n                 eps=1e-9):\n        super(TaskAlignedAssigner_CR, self).__init__()\n        self.topk = topk\n        self.alpha = alpha\n        self.beta = beta\n        self.center_radius = center_radius\n        self.eps = eps\n\n    @paddle.no_grad()\n    def forward(self,\n                pred_scores,\n                pred_bboxes,\n                anchor_points,\n                stride_tensor,\n                gt_labels,\n                gt_bboxes,\n                pad_gt_mask,\n                bg_index,\n                gt_scores=None):\n        r\"\"\"This code is based on\n            https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py\n\n        The assignment is done in following steps\n        1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt\n        2. select top-k bbox as candidates for each gt\n        3. limit the positive sample's center in gt (because the anchor-free detector\n           only can predict positive distance)\n        4. 
if an anchor box is assigned to multiple gts, the one with the\n           highest iou will be selected.\n        Args:\n            pred_scores (Tensor, float32): predicted class probability, shape(B, L, C)\n            pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4)\n            anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), \"cxcy\" format\n            stride_tensor (Tensor, float32): stride of feature map, shape(L, 1)\n            gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)\n            gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4)\n            pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)\n            bg_index (int): background index\n            gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1)\n        Returns:\n            assigned_labels (Tensor): (B, L)\n            assigned_bboxes (Tensor): (B, L, 4)\n            assigned_scores (Tensor): (B, L, C)\n        \"\"\"\n        assert pred_scores.ndim == pred_bboxes.ndim\n        assert gt_labels.ndim == gt_bboxes.ndim and \\\n               gt_bboxes.ndim == 3\n\n        batch_size, num_anchors, num_classes = pred_scores.shape\n        _, num_max_boxes, _ = gt_bboxes.shape\n\n        # negative batch\n        if num_max_boxes == 0:\n            assigned_labels = paddle.full(\n                [batch_size, num_anchors], bg_index, dtype='int32')\n            assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4])\n            assigned_scores = paddle.zeros(\n                [batch_size, num_anchors, num_classes])\n            return assigned_labels, assigned_bboxes, assigned_scores\n\n        # compute iou between gt and pred bbox, [B, n, L]\n        ious = batch_iou_similarity(gt_bboxes, pred_bboxes)\n        # gather pred bboxes class score\n        pred_scores = pred_scores.transpose([0, 2, 1])\n        batch_ind = paddle.arange(\n            end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1)\n        gt_labels_ind = paddle.stack(\n            [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)],\n            axis=-1)\n        bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind)\n        # compute alignment metrics, [B, n, L]\n        alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow(\n            self.beta) * pad_gt_mask\n\n        # select positive sample, [B, n, L]\n        if self.center_radius is None:\n            # check the positive sample's center in gt, [B, n, L]\n            is_in_gts = check_points_inside_bboxes(\n                anchor_points, gt_bboxes, sm_use=True)\n            # select topk largest alignment metrics pred bbox as candidates\n            # for each gt, [B, n, L]\n            mask_positive = gather_topk_anchors(\n                alignment_metrics, self.topk, topk_mask=pad_gt_mask) * is_in_gts\n        else:\n            is_in_gts, is_in_center = check_points_inside_bboxes(\n                anchor_points,\n                gt_bboxes,\n                stride_tensor * self.center_radius,\n                sm_use=True)\n            is_in_gts *= pad_gt_mask\n            is_in_center *= pad_gt_mask\n            candidate_metrics = paddle.where(\n                is_in_gts.sum(-1, keepdim=True) == 0,\n                alignment_metrics + is_in_center,\n                alignment_metrics)\n            mask_positive = gather_topk_anchors(\n                candidate_metrics, self.topk,\n                topk_mask=pad_gt_mask) * 
paddle.cast((is_in_center > 0) |\n                                                     (is_in_gts > 0), 'float32')\n\n        # if an anchor box is assigned to multiple gts,\n        # the one with the highest iou will be selected, [B, n, L]\n        mask_positive_sum = mask_positive.sum(axis=-2)\n        if mask_positive_sum.max() > 1:\n            mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile(\n                [1, num_max_boxes, 1])\n            is_max_iou = compute_max_iou_anchor(ious * mask_positive)\n            mask_positive = paddle.where(mask_multiple_gts, is_max_iou,\n                                         mask_positive)\n            mask_positive_sum = mask_positive.sum(axis=-2)\n        assigned_gt_index = mask_positive.argmax(axis=-2)\n\n        # assigned target\n        assigned_gt_index = assigned_gt_index + (batch_ind * num_max_boxes).astype(assigned_gt_index.dtype)\n        assigned_labels = paddle.gather(\n            gt_labels.flatten(), assigned_gt_index.flatten(), axis=0)\n        assigned_labels = assigned_labels.reshape([batch_size, num_anchors])\n        assigned_labels = paddle.where(\n            mask_positive_sum > 0, assigned_labels,\n            paddle.full_like(assigned_labels, bg_index))\n\n        assigned_bboxes = paddle.gather(\n            gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0)\n        assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])\n\n        assigned_scores = F.one_hot(assigned_labels, num_classes + 1)\n        ind = list(range(num_classes + 1))\n        ind.remove(bg_index)\n        assigned_scores = paddle.index_select(\n            assigned_scores, paddle.to_tensor(ind), axis=-1)\n        # rescale alignment metrics\n        alignment_metrics *= mask_positive\n        max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True)\n        max_ious_per_instance = (ious * mask_positive).max(axis=-1,\n                                                           keepdim=True)\n        alignment_metrics = alignment_metrics / (\n            max_metrics_per_instance + self.eps) * max_ious_per_instance\n        alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1)\n        assigned_scores = assigned_scores * alignment_metrics\n\n        return assigned_labels, assigned_bboxes, assigned_scores\n"
  },
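  {
    "path": "ppdet/modeling/assigners/examples/task_aligned_assigner_cr_demo.py",
    "content": "# Hypothetical companion sketch, NOT a PaddleDetection source file.\n# Same toy setup as the TaskAlignedAssigner demo, but exercising the\n# center-radius branch of `TaskAlignedAssigner_CR` above: with\n# center_radius set, the candidate region around each gt center spans\n# stride_tensor * center_radius pixels, and gts containing no anchor\n# center fall back to anchors inside that radius. All values are random\n# placeholders.\nimport paddle\n\nfrom ppdet.modeling.assigners.task_aligned_assigner_cr import TaskAlignedAssigner_CR\n\nB, L, C, n = 2, 64, 4, 3\nassigner = TaskAlignedAssigner_CR(topk=9, center_radius=2.5)\n\npred_scores = paddle.rand([B, L, C])\npxy = paddle.rand([B, L, 2]) * 32.\npred_bboxes = paddle.concat([pxy, pxy + 8.], axis=-1)\nanchor_points = paddle.rand([L, 2]) * 40.\nstride_tensor = paddle.full([L, 1], 8.)  # one stride-8 level\ngt_labels = paddle.randint(0, C, shape=[B, n, 1])\ngxy = paddle.rand([B, n, 2]) * 24.\ngt_bboxes = paddle.concat([gxy, gxy + 12.], axis=-1)\npad_gt_mask = paddle.ones([B, n, 1])\n\nlabels, bboxes, scores = assigner(\n    pred_scores, pred_bboxes, anchor_points, stride_tensor,\n    gt_labels, gt_bboxes, pad_gt_mask, bg_index=C)\nprint(labels.shape, bboxes.shape, scores.shape)\n"
  },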
  {
    "path": "ppdet/modeling/assigners/uniform_assigner.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register\n\nfrom ppdet.modeling.bbox_utils import batch_bbox_overlaps\nfrom ppdet.modeling.transformers import bbox_xyxy_to_cxcywh\n\n__all__ = ['UniformAssigner']\n\n\ndef batch_p_dist(x, y, p=2):\n    \"\"\"\n    calculate pairwise p_dist, the first index of x and y are batch\n    return [x.shape[0], y.shape[0]]\n    \"\"\"\n    x = x.unsqueeze(1)\n    diff = x - y\n    return paddle.norm(diff, p=p, axis=list(range(2, diff.dim())))\n\n\n@register\nclass UniformAssigner(nn.Layer):\n    def __init__(self, pos_ignore_thr, neg_ignore_thr, match_times=4):\n        super(UniformAssigner, self).__init__()\n        self.pos_ignore_thr = pos_ignore_thr\n        self.neg_ignore_thr = neg_ignore_thr\n        self.match_times = match_times\n\n    def forward(self, bbox_pred, anchor, gt_bboxes, gt_labels=None):\n        num_bboxes = bbox_pred.shape[0]\n        num_gts = gt_bboxes.shape[0]\n        match_labels = paddle.full([num_bboxes], -1, dtype=paddle.int32)\n\n        pred_ious = batch_bbox_overlaps(bbox_pred, gt_bboxes)\n        pred_max_iou = pred_ious.max(axis=1)\n        neg_ignore = pred_max_iou > self.neg_ignore_thr\n        # exclude potential ignored neg samples first, deal with pos samples later\n        #match_labels: -2(ignore), -1(neg) or >=0(pos_inds)\n        match_labels = paddle.where(neg_ignore,\n                                    paddle.full_like(match_labels, -2),\n                                    match_labels)\n\n        bbox_pred_c = bbox_xyxy_to_cxcywh(bbox_pred)\n        anchor_c = bbox_xyxy_to_cxcywh(anchor)\n        gt_bboxes_c = bbox_xyxy_to_cxcywh(gt_bboxes)\n        bbox_pred_dist = batch_p_dist(bbox_pred_c, gt_bboxes_c, p=1)\n        anchor_dist = batch_p_dist(anchor_c, gt_bboxes_c, p=1)\n\n        top_pred = bbox_pred_dist.topk(\n            k=self.match_times, axis=0, largest=False)[1]\n        top_anchor = anchor_dist.topk(\n            k=self.match_times, axis=0, largest=False)[1]\n\n        tar_pred = paddle.arange(num_gts).expand([self.match_times, num_gts])\n        tar_anchor = paddle.arange(num_gts).expand([self.match_times, num_gts])\n        pos_places = paddle.concat([top_pred, top_anchor]).reshape([-1])\n        pos_inds = paddle.concat([tar_pred, tar_anchor]).reshape([-1])\n\n        pos_anchor = anchor[pos_places]\n        pos_tar_bbox = gt_bboxes[pos_inds]\n        pos_ious = batch_bbox_overlaps(\n            pos_anchor, pos_tar_bbox, is_aligned=True)\n        pos_ignore = pos_ious < self.pos_ignore_thr\n        pos_inds = paddle.where(pos_ignore,\n                                paddle.full_like(pos_inds, -2), pos_inds)\n        match_labels[pos_places] = pos_inds\n        match_labels.stop_gradient = True\n        pos_keep = ~pos_ignore\n\n        if pos_keep.sum() > 0:\n            pos_places_keep = 
pos_places[pos_keep]\n            pos_bbox_pred = bbox_pred[pos_places_keep].reshape([-1, 4])\n            pos_bbox_tar = pos_tar_bbox[pos_keep].reshape([-1, 4]).detach()\n        else:\n            pos_bbox_pred = None\n            pos_bbox_tar = None\n\n        return match_labels, pos_bbox_pred, pos_bbox_tar\n"
  },
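  {
    "path": "ppdet/modeling/assigners/examples/uniform_assigner_demo.py",
    "content": "# Hypothetical companion sketch, NOT a PaddleDetection source file.\n# A minimal toy call of `UniformAssigner` above (uniform matching as in\n# YOLOF): each gt takes its `match_times` closest predictions plus its\n# `match_times` closest anchors by L1 center-form distance; matches whose\n# anchor IoU falls below pos_ignore_thr are ignored. All boxes here are\n# synthetic xyxy placeholders.\nimport paddle\n\nfrom ppdet.modeling.assigners.uniform_assigner import UniformAssigner\n\nnum_priors, num_gts = 32, 3\nassigner = UniformAssigner(pos_ignore_thr=0.3, neg_ignore_thr=0.7, match_times=2)\n\naxy = paddle.rand([num_priors, 2]) * 64.\nanchor = paddle.concat([axy, axy + 8.], axis=-1)  # xyxy priors\nbbox_pred = anchor + paddle.rand([num_priors, 4])  # slightly jittered preds\ngt_bboxes = anchor[:num_gts] + 1.  # gts overlapping the first priors\n\nmatch_labels, pos_bbox_pred, pos_bbox_tar = assigner(bbox_pred, anchor, gt_bboxes)\n# match_labels: -2 = ignored, -1 = negative, >= 0 = matched gt index\nprint(int((match_labels >= 0).sum()), 'positive priors')\n"
  },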
  {
    "path": "ppdet/modeling/assigners/utils.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn.functional as F\n\n__all__ = [\n    'pad_gt', 'gather_topk_anchors', 'check_points_inside_bboxes',\n    'compute_max_iou_anchor', 'compute_max_iou_gt',\n    'generate_anchors_for_grid_cell'\n]\n\n\ndef pad_gt(gt_labels, gt_bboxes, gt_scores=None):\n    r\"\"\" Pad 0 in gt_labels and gt_bboxes.\n    Args:\n        gt_labels (Tensor|List[Tensor], int64): Label of gt_bboxes,\n            shape is [B, n, 1] or [[n_1, 1], [n_2, 1], ...], here n = sum(n_i)\n        gt_bboxes (Tensor|List[Tensor], float32): Ground truth bboxes,\n            shape is [B, n, 4] or [[n_1, 4], [n_2, 4], ...], here n = sum(n_i)\n        gt_scores (Tensor|List[Tensor]|None, float32): Score of gt_bboxes,\n            shape is [B, n, 1] or [[n_1, 4], [n_2, 4], ...], here n = sum(n_i)\n    Returns:\n        pad_gt_labels (Tensor, int64): shape[B, n, 1]\n        pad_gt_bboxes (Tensor, float32): shape[B, n, 4]\n        pad_gt_scores (Tensor, float32): shape[B, n, 1]\n        pad_gt_mask (Tensor, float32): shape[B, n, 1], 1 means bbox, 0 means no bbox\n    \"\"\"\n    if isinstance(gt_labels, paddle.Tensor) and isinstance(gt_bboxes,\n                                                           paddle.Tensor):\n        assert gt_labels.ndim == gt_bboxes.ndim and \\\n               gt_bboxes.ndim == 3\n        pad_gt_mask = (\n            gt_bboxes.sum(axis=-1, keepdim=True) > 0).astype(gt_bboxes.dtype)\n        if gt_scores is None:\n            gt_scores = pad_gt_mask.clone()\n        assert gt_labels.ndim == gt_scores.ndim\n\n        return gt_labels, gt_bboxes, gt_scores, pad_gt_mask\n    elif isinstance(gt_labels, list) and isinstance(gt_bboxes, list):\n        assert len(gt_labels) == len(gt_bboxes), \\\n            'The number of `gt_labels` and `gt_bboxes` is not equal. 
'\n        num_max_boxes = max([len(a) for a in gt_bboxes])\n        batch_size = len(gt_bboxes)\n        # pad label and bbox\n        pad_gt_labels = paddle.zeros(\n            [batch_size, num_max_boxes, 1], dtype=gt_labels[0].dtype)\n        pad_gt_bboxes = paddle.zeros(\n            [batch_size, num_max_boxes, 4], dtype=gt_bboxes[0].dtype)\n        pad_gt_scores = paddle.zeros(\n            [batch_size, num_max_boxes, 1], dtype=gt_bboxes[0].dtype)\n        pad_gt_mask = paddle.zeros(\n            [batch_size, num_max_boxes, 1], dtype=gt_bboxes[0].dtype)\n        for i, (label, bbox) in enumerate(zip(gt_labels, gt_bboxes)):\n            if len(label) > 0 and len(bbox) > 0:\n                pad_gt_labels[i, :len(label)] = label\n                pad_gt_bboxes[i, :len(bbox)] = bbox\n                pad_gt_mask[i, :len(bbox)] = 1.\n                if gt_scores is not None:\n                    pad_gt_scores[i, :len(gt_scores[i])] = gt_scores[i]\n        if gt_scores is None:\n            pad_gt_scores = pad_gt_mask.clone()\n        return pad_gt_labels, pad_gt_bboxes, pad_gt_scores, pad_gt_mask\n    else:\n        raise ValueError('The input `gt_labels` or `gt_bboxes` is invalid! ')\n\n\ndef gather_topk_anchors(metrics, topk, largest=True, topk_mask=None, eps=1e-9):\n    r\"\"\"\n    Args:\n        metrics (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors\n        topk (int): The number of top elements to look for along the axis.\n        largest (bool) : largest is a flag, if set to true,\n            algorithm will sort by descending order, otherwise sort by\n            ascending order. Default: True\n        topk_mask (Tensor, float32): shape[B, n, 1], ignore bbox mask,\n            Default: None\n        eps (float): Default: 1e-9\n    Returns:\n        is_in_topk (Tensor, float32): shape[B, n, L], value=1. means selected\n    \"\"\"\n    num_anchors = metrics.shape[-1]\n    topk_metrics, topk_idxs = paddle.topk(\n        metrics, topk, axis=-1, largest=largest)\n    if topk_mask is None:\n        topk_mask = (\n            topk_metrics.max(axis=-1, keepdim=True) > eps).astype(metrics.dtype)\n    is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(\n        axis=-2).astype(metrics.dtype)\n    return is_in_topk * topk_mask\n\n\ndef check_points_inside_bboxes(points,\n                               bboxes,\n                               center_radius_tensor=None,\n                               eps=1e-9,\n                               sm_use=False):\n    r\"\"\"\n    Args:\n        points (Tensor, float32): shape[L, 2], \"xy\" format, L: num_anchors\n        bboxes (Tensor, float32): shape[B, n, 4], \"xmin, ymin, xmax, ymax\" format\n        center_radius_tensor (Tensor, float32): shape [L, 1]. Default: None.\n        eps (float): Default: 1e-9\n    Returns:\n        is_in_bboxes (Tensor, float32): shape[B, n, L], value=1. 
means selected\n    \"\"\"\n    points = points.unsqueeze([0, 1])\n    x, y = points.chunk(2, axis=-1)\n    xmin, ymin, xmax, ymax = bboxes.unsqueeze(2).chunk(4, axis=-1)\n    # check whether `points` is in `bboxes`\n    l = x - xmin\n    t = y - ymin\n    r = xmax - x\n    b = ymax - y\n    delta_ltrb = paddle.concat([l, t, r, b], axis=-1)\n    is_in_bboxes = (delta_ltrb.min(axis=-1) > eps)\n    if center_radius_tensor is not None:\n        # check whether `points` is in `center_radius`\n        center_radius_tensor = center_radius_tensor.unsqueeze([0, 1])\n        cx = (xmin + xmax) * 0.5\n        cy = (ymin + ymax) * 0.5\n        l = x - (cx - center_radius_tensor)\n        t = y - (cy - center_radius_tensor)\n        r = (cx + center_radius_tensor) - x\n        b = (cy + center_radius_tensor) - y\n        delta_ltrb_c = paddle.concat([l, t, r, b], axis=-1)\n        is_in_center = (delta_ltrb_c.min(axis=-1) > eps)\n        if sm_use:\n            return is_in_bboxes.astype(bboxes.dtype), is_in_center.astype(\n                bboxes.dtype)\n        else:\n            return (paddle.logical_and(is_in_bboxes, is_in_center),\n                    paddle.logical_or(is_in_bboxes, is_in_center))\n\n    return is_in_bboxes.astype(bboxes.dtype)\n\n\ndef compute_max_iou_anchor(ious):\n    r\"\"\"\n    For each anchor, find the GT with the largest IOU.\n    Args:\n        ious (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors\n    Returns:\n        is_max_iou (Tensor, float32): shape[B, n, L], value=1. means selected\n    \"\"\"\n    num_max_boxes = ious.shape[-2]\n    max_iou_index = ious.argmax(axis=-2)\n    is_max_iou = F.one_hot(max_iou_index, num_max_boxes).transpose([0, 2, 1])\n    return is_max_iou.astype(ious.dtype)\n\n\ndef compute_max_iou_gt(ious):\n    r\"\"\"\n    For each GT, find the anchor with the largest IOU.\n    Args:\n        ious (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors\n    Returns:\n        is_max_iou (Tensor, float32): shape[B, n, L], value=1. 
means selected\n    \"\"\"\n    num_anchors = ious.shape[-1]\n    max_iou_index = ious.argmax(axis=-1)\n    is_max_iou = F.one_hot(max_iou_index, num_anchors)\n    return is_max_iou.astype(ious.dtype)\n\n\ndef generate_anchors_for_grid_cell(feats,\n                                   fpn_strides,\n                                   grid_cell_size=5.0,\n                                   grid_cell_offset=0.5,\n                                   dtype='float32'):\n    r\"\"\"\n    Like ATSS, generate anchors based on grid size.\n    Args:\n        feats (List[Tensor]): shape[s, (b, c, h, w)]\n        fpn_strides (tuple|list): shape[s], stride for each scale feature\n        grid_cell_size (float): anchor size\n        grid_cell_offset (float): The range is between 0 and 1.\n    Returns:\n        anchors (Tensor): shape[l, 4], \"xmin, ymin, xmax, ymax\" format.\n        anchor_points (Tensor): shape[l, 2], \"x, y\" format.\n        num_anchors_list (List[int]): shape[s], contains [s_1, s_2, ...].\n        stride_tensor (Tensor): shape[l, 1], contains the stride for each scale.\n    \"\"\"\n    assert len(feats) == len(fpn_strides)\n    anchors = []\n    anchor_points = []\n    num_anchors_list = []\n    stride_tensor = []\n    for feat, stride in zip(feats, fpn_strides):\n        _, _, h, w = feat.shape\n        cell_half_size = grid_cell_size * stride * 0.5\n        shift_x = (paddle.arange(end=w) + grid_cell_offset) * stride\n        shift_y = (paddle.arange(end=h) + grid_cell_offset) * stride\n        shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)\n        anchor = paddle.stack(\n            [\n                shift_x - cell_half_size, shift_y - cell_half_size,\n                shift_x + cell_half_size, shift_y + cell_half_size\n            ],\n            axis=-1).astype(dtype)\n        anchor_point = paddle.stack([shift_x, shift_y], axis=-1).astype(dtype)\n\n        anchors.append(anchor.reshape([-1, 4]))\n        anchor_points.append(anchor_point.reshape([-1, 2]))\n        num_anchors_list.append(len(anchors[-1]))\n        stride_tensor.append(\n            paddle.full(\n                [num_anchors_list[-1], 1], stride, dtype=dtype))\n    anchors = paddle.concat(anchors)\n    anchors.stop_gradient = True\n    anchor_points = paddle.concat(anchor_points)\n    anchor_points.stop_gradient = True\n    stride_tensor = paddle.concat(stride_tensor)\n    stride_tensor.stop_gradient = True\n    return anchors, anchor_points, num_anchors_list, stride_tensor\n"
  },
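  {
    "path": "ppdet/modeling/assigners/examples/grid_cell_anchors_demo.py",
    "content": "# Hypothetical companion sketch, NOT a PaddleDetection source file.\n# Shows what `generate_anchors_for_grid_cell` above produces for two toy\n# FPN levels, then counts anchor centers falling inside one gt box via\n# `check_points_inside_bboxes`. Feature shapes and the gt box are made-up\n# values.\nimport paddle\n\nfrom ppdet.modeling.assigners.utils import (\n    check_points_inside_bboxes, generate_anchors_for_grid_cell)\n\n# two pyramid levels: 8x8 cells at stride 8, 4x4 cells at stride 16\nfeats = [paddle.zeros([2, 64, 8, 8]), paddle.zeros([2, 64, 4, 4])]\nfpn_strides = [8, 16]\nanchors, anchor_points, num_anchors_list, stride_tensor = generate_anchors_for_grid_cell(feats, fpn_strides, grid_cell_size=5.0)\n\nprint(anchors.shape)        # [80, 4] xyxy, 8*8 + 4*4 cells\nprint(num_anchors_list)     # [64, 16]\nprint(stride_tensor.shape)  # [80, 1]\n# centers sit at (i + 0.5) * stride on each level\ngt = paddle.to_tensor([[[0., 0., 20., 20.]]])  # [B=1, n=1, 4]\ninside = check_points_inside_bboxes(anchor_points, gt)\nprint(int(inside.sum()))    # number of centers strictly inside the gt box\n"
  },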
  {
    "path": "ppdet/modeling/backbones/__init__.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n# \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom . import vgg\nfrom . import resnet\nfrom . import darknet\nfrom . import mobilenet_v1\nfrom . import mobilenet_v3\nfrom . import hrnet\nfrom . import lite_hrnet\nfrom . import blazenet\nfrom . import ghostnet\nfrom . import senet\nfrom . import res2net\nfrom . import dla\nfrom . import shufflenet_v2\nfrom . import swin_transformer\nfrom . import lcnet\nfrom . import hardnet\nfrom . import esnet\nfrom . import cspresnet\nfrom . import csp_darknet\nfrom . import convnext\nfrom . import vision_transformer\nfrom . import mobileone\nfrom . import trans_encoder\nfrom . import focalnet\nfrom . import vit_mae\nfrom . import hgnet_v2\nfrom . import clrnet_resnet\n\nfrom .vgg import *\nfrom .resnet import *\nfrom .darknet import *\nfrom .mobilenet_v1 import *\nfrom .mobilenet_v3 import *\nfrom .hrnet import *\nfrom .lite_hrnet import *\nfrom .blazenet import *\nfrom .ghostnet import *\nfrom .senet import *\nfrom .res2net import *\nfrom .dla import *\nfrom .shufflenet_v2 import *\nfrom .swin_transformer import *\nfrom .lcnet import *\nfrom .hardnet import *\nfrom .esnet import *\nfrom .cspresnet import *\nfrom .csp_darknet import *\nfrom .convnext import *\nfrom .vision_transformer import *\nfrom .mobileone import *\nfrom .trans_encoder import *\nfrom .focalnet import *\nfrom .vitpose import *\nfrom .vit_mae import *\nfrom .hgnet_v2 import *\nfrom .clrnet_resnet import *\n"
  },
  {
    "path": "ppdet/modeling/backbones/blazenet.py",
    "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#    http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import KaimingNormal\nfrom ppdet.core.workspace import register, serializable\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['BlazeNet']\n\n\ndef hard_swish(x):\n    return x * F.relu6(x + 3) / 6.\n\n\nclass ConvBNLayer(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size,\n                 stride,\n                 padding,\n                 num_groups=1,\n                 act='relu',\n                 conv_lr=0.1,\n                 conv_decay=0.,\n                 norm_decay=0.,\n                 norm_type='bn',\n                 name=None):\n        super(ConvBNLayer, self).__init__()\n        self.act = act\n        self._conv = nn.Conv2D(\n            in_channels,\n            out_channels,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=padding,\n            groups=num_groups,\n            weight_attr=ParamAttr(\n                learning_rate=conv_lr, initializer=KaimingNormal()),\n            bias_attr=False)\n\n        if norm_type in ['bn', 'sync_bn']:\n            self._batch_norm = nn.BatchNorm2D(out_channels)\n\n    def forward(self, x):\n        x = self._conv(x)\n        x = self._batch_norm(x)\n        if self.act == \"relu\":\n            x = F.relu(x)\n        elif self.act == \"relu6\":\n            x = F.relu6(x)\n        elif self.act == 'leaky':\n            x = F.leaky_relu(x)\n        elif self.act == 'hard_swish':\n            x = hard_swish(x)\n        return x\n\n\nclass BlazeBlock(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels1,\n                 out_channels2,\n                 double_channels=None,\n                 stride=1,\n                 use_5x5kernel=True,\n                 act='relu',\n                 name=None):\n        super(BlazeBlock, self).__init__()\n        assert stride in [1, 2]\n        self.use_pool = not stride == 1\n        self.use_double_block = double_channels is not None\n        self.conv_dw = []\n        if use_5x5kernel:\n            self.conv_dw.append(\n                self.add_sublayer(\n                    name + \"1_dw\",\n                    ConvBNLayer(\n                        in_channels=in_channels,\n                        out_channels=out_channels1,\n                        kernel_size=5,\n                        stride=stride,\n                        padding=2,\n                        num_groups=out_channels1,\n                        name=name + \"1_dw\")))\n        else:\n            self.conv_dw.append(\n                self.add_sublayer(\n                    name + \"1_dw_1\",\n  
                  ConvBNLayer(\n                        in_channels=in_channels,\n                        out_channels=out_channels1,\n                        kernel_size=3,\n                        stride=1,\n                        padding=1,\n                        num_groups=out_channels1,\n                        name=name + \"1_dw_1\")))\n            self.conv_dw.append(\n                self.add_sublayer(\n                    name + \"1_dw_2\",\n                    ConvBNLayer(\n                        in_channels=out_channels1,\n                        out_channels=out_channels1,\n                        kernel_size=3,\n                        stride=stride,\n                        padding=1,\n                        num_groups=out_channels1,\n                        name=name + \"1_dw_2\")))\n        self.act = act if self.use_double_block else None\n        self.conv_pw = ConvBNLayer(\n            in_channels=out_channels1,\n            out_channels=out_channels2,\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            act=self.act,\n            name=name + \"1_sep\")\n        if self.use_double_block:\n            self.conv_dw2 = []\n            if use_5x5kernel:\n                self.conv_dw2.append(\n                    self.add_sublayer(\n                        name + \"2_dw\",\n                        ConvBNLayer(\n                            in_channels=out_channels2,\n                            out_channels=out_channels2,\n                            kernel_size=5,\n                            stride=1,\n                            padding=2,\n                            num_groups=out_channels2,\n                            name=name + \"2_dw\")))\n            else:\n                self.conv_dw2.append(\n                    self.add_sublayer(\n                        name + \"2_dw_1\",\n                        ConvBNLayer(\n                            in_channels=out_channels2,\n                            out_channels=out_channels2,\n                            kernel_size=3,\n                            stride=1,\n                            padding=1,\n                            num_groups=out_channels2,\n                            name=name + \"1_dw_1\")))\n                self.conv_dw2.append(\n                    self.add_sublayer(\n                        name + \"2_dw_2\",\n                        ConvBNLayer(\n                            in_channels=out_channels2,\n                            out_channels=out_channels2,\n                            kernel_size=3,\n                            stride=1,\n                            padding=1,\n                            num_groups=out_channels2,\n                            name=name + \"2_dw_2\")))\n            self.conv_pw2 = ConvBNLayer(\n                in_channels=out_channels2,\n                out_channels=double_channels,\n                kernel_size=1,\n                stride=1,\n                padding=0,\n                name=name + \"2_sep\")\n        # shortcut\n        if self.use_pool:\n            shortcut_channel = double_channels or out_channels2\n            self._shortcut = []\n            self._shortcut.append(\n                self.add_sublayer(\n                    name + '_shortcut_pool',\n                    nn.MaxPool2D(\n                        kernel_size=stride, stride=stride, ceil_mode=True)))\n            self._shortcut.append(\n                self.add_sublayer(\n                    name + '_shortcut_conv',\n                    ConvBNLayer(\n   
                     in_channels=in_channels,\n                        out_channels=shortcut_channel,\n                        kernel_size=1,\n                        stride=1,\n                        padding=0,\n                        name=\"shortcut\" + name)))\n\n    def forward(self, x):\n        y = x\n        for conv_dw_block in self.conv_dw:\n            y = conv_dw_block(y)\n        y = self.conv_pw(y)\n        if self.use_double_block:\n            for conv_dw2_block in self.conv_dw2:\n                y = conv_dw2_block(y)\n            y = self.conv_pw2(y)\n        if self.use_pool:\n            for shortcut in self._shortcut:\n                x = shortcut(x)\n        return F.relu(paddle.add(x, y))\n\n\n@register\n@serializable\nclass BlazeNet(nn.Layer):\n    \"\"\"\n    BlazeFace, see https://arxiv.org/abs/1907.05047\n\n    Args:\n        blaze_filters (list): number of filter for each blaze block.\n        double_blaze_filters (list): number of filter for each double_blaze block.\n        use_5x5kernel (bool): whether or not filter size is 5x5 in depth-wise conv.\n    \"\"\"\n\n    def __init__(\n            self,\n            blaze_filters=[[24, 24], [24, 24], [24, 48, 2], [48, 48], [48, 48]],\n            double_blaze_filters=[[48, 24, 96, 2], [96, 24, 96], [96, 24, 96],\n                                  [96, 24, 96, 2], [96, 24, 96], [96, 24, 96]],\n            use_5x5kernel=True,\n            act=None):\n        super(BlazeNet, self).__init__()\n        conv1_num_filters = blaze_filters[0][0]\n        self.conv1 = ConvBNLayer(\n            in_channels=3,\n            out_channels=conv1_num_filters,\n            kernel_size=3,\n            stride=2,\n            padding=1,\n            name=\"conv1\")\n        in_channels = conv1_num_filters\n        self.blaze_block = []\n        self._out_channels = []\n        for k, v in enumerate(blaze_filters):\n            assert len(v) in [2, 3], \\\n                \"blaze_filters {} not in [2, 3]\"\n            if len(v) == 2:\n                self.blaze_block.append(\n                    self.add_sublayer(\n                        'blaze_{}'.format(k),\n                        BlazeBlock(\n                            in_channels,\n                            v[0],\n                            v[1],\n                            use_5x5kernel=use_5x5kernel,\n                            act=act,\n                            name='blaze_{}'.format(k))))\n            elif len(v) == 3:\n                self.blaze_block.append(\n                    self.add_sublayer(\n                        'blaze_{}'.format(k),\n                        BlazeBlock(\n                            in_channels,\n                            v[0],\n                            v[1],\n                            stride=v[2],\n                            use_5x5kernel=use_5x5kernel,\n                            act=act,\n                            name='blaze_{}'.format(k))))\n            in_channels = v[1]\n\n        for k, v in enumerate(double_blaze_filters):\n            assert len(v) in [3, 4], \\\n                \"blaze_filters {} not in [3, 4]\"\n            if len(v) == 3:\n                self.blaze_block.append(\n                    self.add_sublayer(\n                        'double_blaze_{}'.format(k),\n                        BlazeBlock(\n                            in_channels,\n                            v[0],\n                            v[1],\n                            double_channels=v[2],\n                            
use_5x5kernel=use_5x5kernel,\n                            act=act,\n                            name='double_blaze_{}'.format(k))))\n            elif len(v) == 4:\n                self.blaze_block.append(\n                    self.add_sublayer(\n                        'double_blaze_{}'.format(k),\n                        BlazeBlock(\n                            in_channels,\n                            v[0],\n                            v[1],\n                            double_channels=v[2],\n                            stride=v[3],\n                            use_5x5kernel=use_5x5kernel,\n                            act=act,\n                            name='double_blaze_{}'.format(k))))\n            in_channels = v[2]\n            self._out_channels.append(in_channels)\n\n    def forward(self, inputs):\n        outs = []\n        y = self.conv1(inputs['image'])\n        for block in self.blaze_block:\n            y = block(y)\n            outs.append(y)\n        return [outs[-4], outs[-1]]\n\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(channels=c)\n            for c in [self._out_channels[-4], self._out_channels[-1]]\n        ]\n"
  },
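  {
    "path": "ppdet/modeling/backbones/examples/blazenet_demo.py",
    "content": "# Hypothetical companion sketch, NOT a PaddleDetection source file.\n# Builds the default `BlazeNet` above and runs a dummy image through it;\n# the backbone returns the two feature maps consumed by the BlazeFace\n# head (the 4th-from-last and last blocks). The 128x128 input size is an\n# arbitrary assumption divisible by the total stride.\nimport paddle\n\nfrom ppdet.modeling.backbones.blazenet import BlazeNet\n\nmodel = BlazeNet()  # default BlazeFace filter config\ninputs = {'image': paddle.rand([1, 3, 128, 128])}\nfeats = model(inputs)\nprint([tuple(f.shape) for f in feats])        # [(1, 96, 16, 16), (1, 96, 8, 8)]\nprint([s.channels for s in model.out_shape])  # [96, 96]\n"
  },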
  {
    "path": "ppdet/modeling/backbones/clrnet_resnet.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\n\nfrom paddle.utils.download import get_weights_path_from_url\nfrom ppdet.core.workspace import register, serializable\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['CLRResNet']\n\nmodel_urls = {\n    'resnet18':\n    'https://x2paddle.bj.bcebos.com/vision/models/resnet18-pt.pdparams',\n    'resnet34':\n    'https://x2paddle.bj.bcebos.com/vision/models/resnet34-pt.pdparams',\n    'resnet50':\n    'https://x2paddle.bj.bcebos.com/vision/models/resnet50-pt.pdparams',\n    'resnet101':\n    'https://x2paddle.bj.bcebos.com/vision/models/resnet101-pt.pdparams',\n    'resnet152':\n    'https://x2paddle.bj.bcebos.com/vision/models/resnet152-pt.pdparams',\n    'resnext50_32x4d':\n    'https://x2paddle.bj.bcebos.com/vision/models/resnext50_32x4d-pt.pdparams',\n    'resnext101_32x8d':\n    'https://x2paddle.bj.bcebos.com/vision/models/resnext101_32x8d-pt.pdparams',\n    'wide_resnet50_2':\n    'https://x2paddle.bj.bcebos.com/vision/models/wide_resnet50_2-pt.pdparams',\n    'wide_resnet101_2':\n    'https://x2paddle.bj.bcebos.com/vision/models/wide_resnet101_2-pt.pdparams',\n}\n\n\nclass BasicBlock(nn.Layer):\n    expansion = 1\n\n    def __init__(self,\n                 inplanes,\n                 planes,\n                 stride=1,\n                 downsample=None,\n                 groups=1,\n                 base_width=64,\n                 dilation=1,\n                 norm_layer=None):\n        super(BasicBlock, self).__init__()\n        if norm_layer is None:\n            norm_layer = nn.BatchNorm2D\n\n        if dilation > 1:\n            raise NotImplementedError(\n                \"Dilation > 1 not supported in BasicBlock\")\n\n        self.conv1 = nn.Conv2D(\n            inplanes, planes, 3, padding=1, stride=stride, bias_attr=False)\n        self.bn1 = norm_layer(planes)\n        self.relu = nn.ReLU()\n        self.conv2 = nn.Conv2D(planes, planes, 3, padding=1, bias_attr=False)\n        self.bn2 = norm_layer(planes)\n        self.downsample = downsample\n        self.stride = stride\n\n    def forward(self, x):\n        identity = x\n\n        out = self.conv1(x)\n        out = self.bn1(out)\n        out = self.relu(out)\n\n        out = self.conv2(out)\n        out = self.bn2(out)\n\n        if self.downsample is not None:\n            identity = self.downsample(x)\n\n        out += identity\n        out = self.relu(out)\n\n        return out\n\n\nclass BottleneckBlock(nn.Layer):\n\n    expansion = 4\n\n    def __init__(self,\n                 inplanes,\n                 planes,\n                 stride=1,\n                 downsample=None,\n                 groups=1,\n                 base_width=64,\n                 dilation=1,\n                 norm_layer=None):\n        super(BottleneckBlock, self).__init__()\n        if norm_layer is 
None:\n            norm_layer = nn.BatchNorm2D\n        width = int(planes * (base_width / 64.)) * groups\n\n        self.conv1 = nn.Conv2D(inplanes, width, 1, bias_attr=False)\n        self.bn1 = norm_layer(width)\n\n        self.conv2 = nn.Conv2D(\n            width,\n            width,\n            3,\n            padding=dilation,\n            stride=stride,\n            groups=groups,\n            dilation=dilation,\n            bias_attr=False)\n        self.bn2 = norm_layer(width)\n\n        self.conv3 = nn.Conv2D(\n            width, planes * self.expansion, 1, bias_attr=False)\n        self.bn3 = norm_layer(planes * self.expansion)\n        self.relu = nn.ReLU()\n        self.downsample = downsample\n        self.stride = stride\n\n    def forward(self, x):\n        identity = x\n\n        out = self.conv1(x)\n        out = self.bn1(out)\n        out = self.relu(out)\n\n        out = self.conv2(out)\n        out = self.bn2(out)\n        out = self.relu(out)\n\n        out = self.conv3(out)\n        out = self.bn3(out)\n\n        if self.downsample is not None:\n            identity = self.downsample(x)\n\n        out += identity\n        out = self.relu(out)\n\n        return out\n\n\nclass ResNet(nn.Layer):\n    \"\"\"ResNet model from\n    `\"Deep Residual Learning for Image Recognition\" <https://arxiv.org/pdf/1512.03385.pdf>`_.\n    Args:\n        Block (BasicBlock|BottleneckBlock): Block module of model.\n        depth (int, optional): Layers of ResNet, Default: 50.\n        width (int, optional): Base width per convolution group for each convolution block, Default: 64.\n        num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer \n                            will not be defined. Default: 1000.\n        with_pool (bool, optional): Use pool before the last fc layer or not. Default: True.\n        groups (int, optional): Number of groups for each convolution block, Default: 1.\n    Returns:\n        :ref:`api_paddle_nn_Layer`. An instance of ResNet model.\n    Examples:\n        .. 
code-block:: python\n            import paddle\n            from paddle.vision.models import ResNet\n            from paddle.vision.models.resnet import BottleneckBlock, BasicBlock\n            # build ResNet with 18 layers\n            resnet18 = ResNet(BasicBlock, 18)\n            # build ResNet with 50 layers\n            resnet50 = ResNet(BottleneckBlock, 50)\n            # build Wide ResNet model\n            wide_resnet50_2 = ResNet(BottleneckBlock, 50, width=64*2)\n            # build ResNeXt model\n            resnext50_32x4d = ResNet(BottleneckBlock, 50, width=4, groups=32)\n            x = paddle.rand([1, 3, 224, 224])\n            out = resnet18(x)\n            print(out.shape)\n            # [1, 1000]\n    \"\"\"\n\n    def __init__(self, block, depth=50, width=64, with_pool=True, groups=1):\n        super(ResNet, self).__init__()\n        layer_cfg = {\n            18: [2, 2, 2, 2],\n            34: [3, 4, 6, 3],\n            50: [3, 4, 6, 3],\n            101: [3, 4, 23, 3],\n            152: [3, 8, 36, 3]\n        }\n\n        layers = layer_cfg[depth]\n        self.groups = groups\n        self.base_width = width\n        self.with_pool = with_pool\n        self._norm_layer = nn.BatchNorm2D\n\n        self.inplanes = 64\n        self.dilation = 1\n\n        self.conv1 = nn.Conv2D(\n            3,\n            self.inplanes,\n            kernel_size=7,\n            stride=2,\n            padding=3,\n            bias_attr=False)\n        self.bn1 = self._norm_layer(self.inplanes)\n        self.relu = nn.ReLU()\n        self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)\n        self.layer1 = self._make_layer(block, 64, layers[0])\n        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)\n        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)\n        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)\n        if with_pool:\n            self.avgpool = nn.AdaptiveAvgPool2D((1, 1))\n\n        ch_out_list = [64, 128, 256, 512]\n        block = BottleneckBlock if depth >= 50 else BasicBlock\n\n        self._out_channels = [block.expansion * v for v in ch_out_list]\n        self._out_strides = [4, 8, 16, 32]\n        self.return_idx = [0, 1, 2, 3]\n\n    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):\n        norm_layer = self._norm_layer\n        downsample = None\n        previous_dilation = self.dilation\n        if dilate:\n            self.dilation *= stride\n            stride = 1\n        if stride != 1 or self.inplanes != planes * block.expansion:\n            downsample = nn.Sequential(\n                nn.Conv2D(\n                    self.inplanes,\n                    planes * block.expansion,\n                    1,\n                    stride=stride,\n                    bias_attr=False),\n                norm_layer(planes * block.expansion), )\n\n        layers = []\n        layers.append(\n            block(self.inplanes, planes, stride, downsample, self.groups,\n                  self.base_width, previous_dilation, norm_layer))\n        self.inplanes = planes * block.expansion\n        for _ in range(1, blocks):\n            layers.append(\n                block(\n                    self.inplanes,\n                    planes,\n                    groups=self.groups,\n                    base_width=self.base_width,\n                    norm_layer=norm_layer))\n\n        return nn.Sequential(*layers)\n\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(\n   
\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(\n                channels=self._out_channels[i], stride=self._out_strides[i])\n            for i in self.return_idx\n        ]\n\n    def forward(self, x):\n        x = self.conv1(x)\n        x = self.bn1(x)\n        x = self.relu(x)\n        x = self.maxpool(x)\n\n        out_layers = []\n        x = self.layer1(x)\n        out_layers.append(x)\n        x = self.layer2(x)\n        out_layers.append(x)\n        x = self.layer3(x)\n        out_layers.append(x)\n        x = self.layer4(x)\n        out_layers.append(x)\n\n        if self.with_pool:\n            x = self.avgpool(x)\n\n        return out_layers\n\n\n@register\n@serializable\nclass CLRResNet(nn.Layer):\n    def __init__(self,\n                 resnet='resnet18',\n                 pretrained=True,\n                 out_conv=False,\n                 fea_stride=8,\n                 out_channel=128,\n                 in_channels=[64, 128, 256, 512],\n                 cfg=None):\n        super(CLRResNet, self).__init__()\n        self.cfg = cfg\n        self.in_channels = in_channels\n\n        # resolve the builder defined in this module, e.g. 'resnet18' -> resnet18()\n        self.model = eval(resnet)(pretrained=pretrained)\n        self.out = None\n        if out_conv:\n            out_channel = 512\n            for chan in reversed(self.in_channels):\n                if chan < 0: continue\n                out_channel = chan\n                break\n            self.out = nn.Conv2D(\n                out_channel * self.model.expansion,\n                cfg.featuremap_out_channel,\n                kernel_size=1,\n                bias_attr=False)\n\n    @property\n    def out_shape(self):\n        return self.model.out_shape\n\n    def forward(self, x):\n        x = self.model(x)\n        if self.out:\n            x[-1] = self.out(x[-1])\n        return x\n\n\ndef _resnet(arch, Block, depth, pretrained, **kwargs):\n    model = ResNet(Block, depth, **kwargs)\n    if pretrained:\n        assert arch in model_urls, \"{} model does not have pretrained weights now, you should set pretrained=False\".format(\n            arch)\n        weight_path = get_weights_path_from_url(model_urls[arch])\n\n        param = paddle.load(weight_path)\n        model.set_dict(param)\n\n    return model\n
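\n\n# the checkpoints in model_urls are ImageNet classification weights;\n# get_weights_path_from_url caches the download locally, and any keys that do\n# not exist in this backbone (e.g. the fc head) are skipped by set_dict.\n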
\n\ndef resnet18(pretrained=False, **kwargs):\n    \"\"\"ResNet 18-layer model from\n    `\"Deep Residual Learning for Image Recognition\" <https://arxiv.org/pdf/1512.03385.pdf>`_.\n    Args:\n        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained\n                            on ImageNet. Default: False.\n        **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet <api_paddle_vision_ResNet>`.\n    Returns:\n        :ref:`api_paddle_nn_Layer`. An instance of ResNet 18-layer model.\n    Examples:\n        .. code-block:: python\n            import paddle\n            # build model\n            model = resnet18()\n            # build model and load imagenet pretrained weight\n            # model = resnet18(pretrained=True)\n            x = paddle.rand([1, 3, 224, 224])\n            # returns a list of 4 feature maps\n            outs = model(x)\n            print([out.shape for out in outs])\n            # [[1, 64, 56, 56], [1, 128, 28, 28], [1, 256, 14, 14], [1, 512, 7, 7]]\n    \"\"\"\n    return _resnet('resnet18', BasicBlock, 18, pretrained, **kwargs)\n\n\ndef resnet34(pretrained=False, **kwargs):\n    \"\"\"ResNet 34-layer model from\n    `\"Deep Residual Learning for Image Recognition\" <https://arxiv.org/pdf/1512.03385.pdf>`_.\n    Args:\n        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained\n                            on ImageNet. Default: False.\n        **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet <api_paddle_vision_ResNet>`.\n    Returns:\n        :ref:`api_paddle_nn_Layer`. An instance of ResNet 34-layer model.\n    Examples:\n        .. code-block:: python\n            import paddle\n            # build model\n            model = resnet34()\n            # build model and load imagenet pretrained weight\n            # model = resnet34(pretrained=True)\n            x = paddle.rand([1, 3, 224, 224])\n            # returns a list of 4 feature maps\n            outs = model(x)\n            print([out.shape for out in outs])\n            # [[1, 64, 56, 56], [1, 128, 28, 28], [1, 256, 14, 14], [1, 512, 7, 7]]\n    \"\"\"\n    return _resnet('resnet34', BasicBlock, 34, pretrained, **kwargs)\n\n\ndef resnet50(pretrained=False, **kwargs):\n    \"\"\"ResNet 50-layer model from\n    `\"Deep Residual Learning for Image Recognition\" <https://arxiv.org/pdf/1512.03385.pdf>`_.\n    Args:\n        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained\n                            on ImageNet. Default: False.\n        **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet <api_paddle_vision_ResNet>`.\n    Returns:\n        :ref:`api_paddle_nn_Layer`. An instance of ResNet 50-layer model.\n    Examples:\n        .. code-block:: python\n            import paddle\n            # build model\n            model = resnet50()\n            # build model and load imagenet pretrained weight\n            # model = resnet50(pretrained=True)\n            x = paddle.rand([1, 3, 224, 224])\n            # returns a list of 4 feature maps\n            outs = model(x)\n            print([out.shape for out in outs])\n            # [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]\n    \"\"\"\n    return _resnet('resnet50', BottleneckBlock, 50, pretrained, **kwargs)\n\n\ndef resnet101(pretrained=False, **kwargs):\n    \"\"\"ResNet 101-layer model from\n    `\"Deep Residual Learning for Image Recognition\" <https://arxiv.org/pdf/1512.03385.pdf>`_.\n    Args:\n        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained\n                            on ImageNet. Default: False.\n        **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet <api_paddle_vision_ResNet>`.\n    Returns:\n        :ref:`api_paddle_nn_Layer`. An instance of ResNet 101-layer model.\n    Examples:\n        .. code-block:: python\n            import paddle\n            # build model\n            model = resnet101()\n            # build model and load imagenet pretrained weight\n            # model = resnet101(pretrained=True)\n            x = paddle.rand([1, 3, 224, 224])\n            # returns a list of 4 feature maps\n            outs = model(x)\n            print([out.shape for out in outs])\n            # [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]\n    \"\"\"\n    return _resnet('resnet101', BottleneckBlock, 101, pretrained, **kwargs)\n\n\ndef resnet152(pretrained=False, **kwargs):\n    \"\"\"ResNet 152-layer model from\n    `\"Deep Residual Learning for Image Recognition\" <https://arxiv.org/pdf/1512.03385.pdf>`_.\n    Args:\n        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained\n                            on ImageNet. Default: False.\n        **kwargs (optional): Additional keyword arguments. 
For details, please refer to :ref:`ResNet <api_paddle_vision_ResNet>`.\n    Returns:\n        :ref:`api_paddle_nn_Layer`. An instance of ResNet 152-layer model.\n    Examples:\n        .. code-block:: python\n            import paddle\n            # build model\n            model = resnet152()\n            # build model and load imagenet pretrained weight\n            # model = resnet152(pretrained=True)\n            x = paddle.rand([1, 3, 224, 224])\n            # returns a list of 4 feature maps\n            outs = model(x)\n            print([out.shape for out in outs])\n            # [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]\n    \"\"\"\n    return _resnet('resnet152', BottleneckBlock, 152, pretrained, **kwargs)\n\n\ndef resnext50_32x4d(pretrained=False, **kwargs):\n    \"\"\"ResNeXt-50 32x4d model from\n    `\"Aggregated Residual Transformations for Deep Neural Networks\" <https://arxiv.org/pdf/1611.05431.pdf>`_.\n    \n    Args:\n        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained\n                            on ImageNet. Default: False.\n        **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet <api_paddle_vision_ResNet>`.\n    Returns:\n        :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-50 32x4d model.\n    Examples:\n        .. code-block:: python\n            import paddle\n            # build model\n            model = resnext50_32x4d()\n            # build model and load imagenet pretrained weight\n            # model = resnext50_32x4d(pretrained=True)\n            x = paddle.rand([1, 3, 224, 224])\n            # returns a list of 4 feature maps\n            outs = model(x)\n            print([out.shape for out in outs])\n            # [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]\n    \"\"\"\n    kwargs['groups'] = 32\n    kwargs['width'] = 4\n    return _resnet('resnext50_32x4d', BottleneckBlock, 50, pretrained, **kwargs)\n\n\ndef resnext50_64x4d(pretrained=False, **kwargs):\n    \"\"\"ResNeXt-50 64x4d model from\n    `\"Aggregated Residual Transformations for Deep Neural Networks\" <https://arxiv.org/pdf/1611.05431.pdf>`_.\n    \n    Args:\n        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained\n                            on ImageNet. Default: False.\n        **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet <api_paddle_vision_ResNet>`.\n    Returns:\n        :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-50 64x4d model.\n    Examples:\n        .. code-block:: python\n            import paddle\n            # build model\n            model = resnext50_64x4d()\n            # build model and load imagenet pretrained weight\n            # model = resnext50_64x4d(pretrained=True)\n            x = paddle.rand([1, 3, 224, 224])\n            # returns a list of 4 feature maps\n            outs = model(x)\n            print([out.shape for out in outs])\n            # [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]\n    \"\"\"\n    kwargs['groups'] = 64\n    kwargs['width'] = 4\n    return _resnet('resnext50_64x4d', BottleneckBlock, 50, pretrained, **kwargs)\n\n\ndef resnext101_32x4d(pretrained=False, **kwargs):\n    \"\"\"ResNeXt-101 32x4d model from\n    `\"Aggregated Residual Transformations for Deep Neural Networks\" <https://arxiv.org/pdf/1611.05431.pdf>`_.\n    \n    Args:\n        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained\n                            on ImageNet. 
Default: False.\n        **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet <api_paddle_vision_ResNet>`.\n    Returns:\n        :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-101 32x4d model.\n    Examples:\n        .. code-block:: python\n            import paddle\n            # build model\n            model = resnext101_32x4d()\n            # build model and load imagenet pretrained weight\n            # model = resnext101_32x4d(pretrained=True)\n            x = paddle.rand([1, 3, 224, 224])\n            # returns a list of 4 feature maps\n            outs = model(x)\n            print([out.shape for out in outs])\n            # [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]\n    \"\"\"\n    kwargs['groups'] = 32\n    kwargs['width'] = 4\n    return _resnet('resnext101_32x4d', BottleneckBlock, 101, pretrained,\n                   **kwargs)\n\n\ndef resnext101_64x4d(pretrained=False, **kwargs):\n    \"\"\"ResNeXt-101 64x4d model from\n    `\"Aggregated Residual Transformations for Deep Neural Networks\" <https://arxiv.org/pdf/1611.05431.pdf>`_.\n    \n    Args:\n        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained\n                            on ImageNet. Default: False.\n        **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet <api_paddle_vision_ResNet>`.\n    Returns:\n        :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-101 64x4d model.\n    Examples:\n        .. code-block:: python\n            import paddle\n            # build model\n            model = resnext101_64x4d()\n            # build model and load imagenet pretrained weight\n            # model = resnext101_64x4d(pretrained=True)\n            x = paddle.rand([1, 3, 224, 224])\n            # returns a list of 4 feature maps\n            outs = model(x)\n            print([out.shape for out in outs])\n            # [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]\n    \"\"\"\n    kwargs['groups'] = 64\n    kwargs['width'] = 4\n    return _resnet('resnext101_64x4d', BottleneckBlock, 101, pretrained,\n                   **kwargs)\n\n\ndef resnext152_32x4d(pretrained=False, **kwargs):\n    \"\"\"ResNeXt-152 32x4d model from\n    `\"Aggregated Residual Transformations for Deep Neural Networks\" <https://arxiv.org/pdf/1611.05431.pdf>`_.\n    \n    Args:\n        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained\n                            on ImageNet. Default: False.\n        **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet <api_paddle_vision_ResNet>`.\n    Returns:\n        :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-152 32x4d model.\n    Examples:\n        .. 
code-block:: python\n            import paddle\n            # build model\n            model = resnext152_32x4d()\n            # build model and load imagenet pretrained weight\n            # model = resnext152_32x4d(pretrained=True)\n            x = paddle.rand([1, 3, 224, 224])\n            # returns a list of 4 feature maps\n            outs = model(x)\n            print([out.shape for out in outs])\n            # [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]\n    \"\"\"\n    kwargs['groups'] = 32\n    kwargs['width'] = 4\n    return _resnet('resnext152_32x4d', BottleneckBlock, 152, pretrained,\n                   **kwargs)\n\n\ndef resnext152_64x4d(pretrained=False, **kwargs):\n    \"\"\"ResNeXt-152 64x4d model from\n    `\"Aggregated Residual Transformations for Deep Neural Networks\" <https://arxiv.org/pdf/1611.05431.pdf>`_.\n    \n    Args:\n        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained\n                            on ImageNet. Default: False.\n        **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet <api_paddle_vision_ResNet>`.\n    Returns:\n        :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-152 64x4d model.\n    Examples:\n        .. code-block:: python\n            import paddle\n            # build model\n            model = resnext152_64x4d()\n            # build model and load imagenet pretrained weight\n            # model = resnext152_64x4d(pretrained=True)\n            x = paddle.rand([1, 3, 224, 224])\n            # returns a list of 4 feature maps\n            outs = model(x)\n            print([out.shape for out in outs])\n            # [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]\n    \"\"\"\n    kwargs['groups'] = 64\n    kwargs['width'] = 4\n    return _resnet('resnext152_64x4d', BottleneckBlock, 152, pretrained,\n                   **kwargs)\n\n\ndef wide_resnet50_2(pretrained=False, **kwargs):\n    \"\"\"Wide ResNet-50-2 model from\n    `\"Wide Residual Networks\" <https://arxiv.org/pdf/1605.07146.pdf>`_.\n    Args:\n        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained\n                            on ImageNet. Default: False.\n        **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet <api_paddle_vision_ResNet>`.\n    Returns:\n        :ref:`api_paddle_nn_Layer`. An instance of Wide ResNet-50-2 model.\n    Examples:\n        .. code-block:: python\n            import paddle\n            # build model\n            model = wide_resnet50_2()\n            # build model and load imagenet pretrained weight\n            # model = wide_resnet50_2(pretrained=True)\n            x = paddle.rand([1, 3, 224, 224])\n            # returns a list of 4 feature maps\n            outs = model(x)\n            print([out.shape for out in outs])\n            # [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]\n    \"\"\"\n    kwargs['width'] = 64 * 2\n    return _resnet('wide_resnet50_2', BottleneckBlock, 50, pretrained, **kwargs)\n\n\ndef wide_resnet101_2(pretrained=False, **kwargs):\n    \"\"\"Wide ResNet-101-2 model from\n    `\"Wide Residual Networks\" <https://arxiv.org/pdf/1605.07146.pdf>`_.\n    Args:\n        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained\n                            on ImageNet. Default: False.\n        **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet <api_paddle_vision_ResNet>`.\n    Returns:\n        :ref:`api_paddle_nn_Layer`. 
An instance of Wide ResNet-101-2 model.\n    Examples:\n        .. code-block:: python\n            import paddle\n            # build model\n            model = wide_resnet101_2()\n            # build model and load imagenet pretrained weight\n            # model = wide_resnet101_2(pretrained=True)\n            x = paddle.rand([1, 3, 224, 224])\n            # returns a list of 4 feature maps\n            outs = model(x)\n            print([out.shape for out in outs])\n            # [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]\n    \"\"\"\n    kwargs['width'] = 64 * 2\n    return _resnet('wide_resnet101_2', BottleneckBlock, 101, pretrained,\n                   **kwargs)\n"
  },
  {
    "path": "ppdet/modeling/backbones/convnext.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n'''\nModified from https://github.com/facebookresearch/ConvNeXt\nCopyright (c) Meta Platforms, Inc. and affiliates.\nAll rights reserved.\nThis source code is licensed under the license found in the\nLICENSE file in the root directory of this source tree.\n'''\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import Constant\n\nimport numpy as np\n\nfrom ppdet.core.workspace import register, serializable\nfrom ..shape_spec import ShapeSpec\nfrom .transformer_utils import DropPath, trunc_normal_, zeros_\n\n__all__ = ['ConvNeXt']\n\n\nclass Block(nn.Layer):\n    r\"\"\" ConvNeXt Block. There are two equivalent implementations:\n    (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)\n    (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back\n    We use (2) as we find it slightly faster in Pypaddle\n    \n    Args:\n        dim (int): Number of input channels.\n        drop_path (float): Stochastic depth rate. Default: 0.0\n        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.\n    \"\"\"\n\n    def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):\n        super().__init__()\n        self.dwconv = nn.Conv2D(\n            dim, dim, kernel_size=7, padding=3, groups=dim)  # depthwise conv\n        self.norm = LayerNorm(dim, eps=1e-6)\n        self.pwconv1 = nn.Linear(\n            dim, 4 * dim)  # pointwise/1x1 convs, implemented with linear layers\n        self.act = nn.GELU()\n        self.pwconv2 = nn.Linear(4 * dim, dim)\n\n        if layer_scale_init_value > 0:\n            self.gamma = self.create_parameter(\n                shape=(dim, ),\n                attr=ParamAttr(initializer=Constant(layer_scale_init_value)))\n        else:\n            self.gamma = None\n\n        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity(\n        )\n\n    def forward(self, x):\n        input = x\n        x = self.dwconv(x)\n        x = x.transpose([0, 2, 3, 1])\n        x = self.norm(x)\n        x = self.pwconv1(x)\n        x = self.act(x)\n        x = self.pwconv2(x)\n        if self.gamma is not None:\n            x = self.gamma * x\n        x = x.transpose([0, 3, 1, 2])\n        x = input + self.drop_path(x)\n        return x\n\n\nclass LayerNorm(nn.Layer):\n    r\"\"\" LayerNorm that supports two data formats: channels_last (default) or channels_first. \n    The ordering of the dimensions in the inputs. 
\n\nclass LayerNorm(nn.Layer):\n    r\"\"\" LayerNorm that supports two data formats: channels_last (default) or channels_first. \n    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with \n    shape (batch_size, height, width, channels) while channels_first corresponds to inputs \n    with shape (batch_size, channels, height, width).\n    \"\"\"\n\n    def __init__(self, normalized_shape, eps=1e-6, data_format=\"channels_last\"):\n        super().__init__()\n\n        self.weight = self.create_parameter(\n            shape=(normalized_shape, ),\n            attr=ParamAttr(initializer=Constant(1.)))\n        self.bias = self.create_parameter(\n            shape=(normalized_shape, ),\n            attr=ParamAttr(initializer=Constant(0.)))\n\n        self.eps = eps\n        self.data_format = data_format\n        if self.data_format not in [\"channels_last\", \"channels_first\"]:\n            raise NotImplementedError\n        self.normalized_shape = (normalized_shape, )\n\n    def forward(self, x):\n        if self.data_format == \"channels_last\":\n            return F.layer_norm(x, self.normalized_shape, self.weight,\n                                self.bias, self.eps)\n        elif self.data_format == \"channels_first\":\n            u = x.mean(1, keepdim=True)\n            s = (x - u).pow(2).mean(1, keepdim=True)\n            x = (x - u) / paddle.sqrt(s + self.eps)\n            x = self.weight[:, None, None] * x + self.bias[:, None, None]\n            return x\n\n\n@register\n@serializable\nclass ConvNeXt(nn.Layer):\n    r\"\"\" ConvNeXt\n        A PaddlePaddle implementation of : `A ConvNet for the 2020s`  -\n          https://arxiv.org/pdf/2201.03545.pdf\n\n    Args:\n        in_chans (int): Number of input image channels. Default: 3\n        depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]\n        dims (tuple(int)): Feature dimension at each stage. Default: [96, 192, 384, 768]\n        drop_path_rate (float): Stochastic depth rate. Default: 0.\n        layer_scale_init_value (float): Init value for Layer Scale. 
Default: 1e-6.\n    \"\"\"\n\n    arch_settings = {\n        'tiny': {\n            'depths': [3, 3, 9, 3],\n            'dims': [96, 192, 384, 768]\n        },\n        'small': {\n            'depths': [3, 3, 27, 3],\n            'dims': [96, 192, 384, 768]\n        },\n        'base': {\n            'depths': [3, 3, 27, 3],\n            'dims': [128, 256, 512, 1024]\n        },\n        'large': {\n            'depths': [3, 3, 27, 3],\n            'dims': [192, 384, 768, 1536]\n        },\n        'xlarge': {\n            'depths': [3, 3, 27, 3],\n            'dims': [256, 512, 1024, 2048]\n        },\n    }\n\n    def __init__(\n            self,\n            arch='tiny',\n            in_chans=3,\n            drop_path_rate=0.,\n            layer_scale_init_value=1e-6,\n            return_idx=[1, 2, 3],\n            norm_output=True,\n            pretrained=None, ):\n        super().__init__()\n        depths = self.arch_settings[arch]['depths']\n        dims = self.arch_settings[arch]['dims']\n        self.downsample_layers = nn.LayerList(\n        )  # stem and 3 intermediate downsampling conv layers\n        stem = nn.Sequential(\n            nn.Conv2D(\n                in_chans, dims[0], kernel_size=4, stride=4),\n            LayerNorm(\n                dims[0], eps=1e-6, data_format=\"channels_first\"))\n        self.downsample_layers.append(stem)\n        for i in range(3):\n            downsample_layer = nn.Sequential(\n                LayerNorm(\n                    dims[i], eps=1e-6, data_format=\"channels_first\"),\n                nn.Conv2D(\n                    dims[i], dims[i + 1], kernel_size=2, stride=2), )\n            self.downsample_layers.append(downsample_layer)\n\n        self.stages = nn.LayerList(\n        )  # 4 feature resolution stages, each consisting of multiple residual blocks\n        dp_rates = [x for x in np.linspace(0, drop_path_rate, sum(depths))]\n        cur = 0\n        for i in range(4):\n            stage = nn.Sequential(* [\n                Block(\n                    dim=dims[i],\n                    drop_path=dp_rates[cur + j],\n                    layer_scale_init_value=layer_scale_init_value)\n                for j in range(depths[i])\n            ])\n            self.stages.append(stage)\n            cur += depths[i]\n\n        self.return_idx = return_idx\n        self.dims = [dims[i] for i in return_idx]  # [::-1]\n\n        self.norm_output = norm_output\n        if norm_output:\n            self.norms = nn.LayerList([\n                LayerNorm(\n                    c, eps=1e-6, data_format=\"channels_first\")\n                for c in self.dims\n            ])\n\n        self.apply(self._init_weights)\n\n        if pretrained is not None:\n            if 'http' in pretrained:  #URL\n                path = paddle.utils.download.get_weights_path_from_url(\n                    pretrained)\n            else:  #model in local path\n                path = pretrained\n            self.set_state_dict(paddle.load(path))\n\n    def _init_weights(self, m):\n        if isinstance(m, (nn.Conv2D, nn.Linear)):\n            trunc_normal_(m.weight)\n            zeros_(m.bias)\n\n    def forward_features(self, x):\n        output = []\n        for i in range(4):\n            x = self.downsample_layers[i](x)\n            x = self.stages[i](x)\n            output.append(x)\n\n        outputs = [output[i] for i in self.return_idx]\n        if self.norm_output:\n            outputs = [self.norms[i](out) for i, out in enumerate(outputs)]\n\n        return 
outputs\n\n    def forward(self, x):\n        x = self.forward_features(x['image'])\n        return x\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=c) for c in self.dims]\n"
  },
  {
    "path": "ppdet/modeling/backbones/csp_darknet.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n# \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\nfrom ppdet.core.workspace import register, serializable\nfrom ppdet.modeling.initializer import conv_init_\nfrom ..shape_spec import ShapeSpec\n\n__all__ = [\n    'CSPDarkNet', 'BaseConv', 'DWConv', 'BottleNeck', 'SPPLayer', 'SPPFLayer'\n]\n\n\nclass BaseConv(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 ksize,\n                 stride,\n                 groups=1,\n                 bias=False,\n                 act=\"silu\"):\n        super(BaseConv, self).__init__()\n        self.conv = nn.Conv2D(\n            in_channels,\n            out_channels,\n            kernel_size=ksize,\n            stride=stride,\n            padding=(ksize - 1) // 2,\n            groups=groups,\n            bias_attr=bias)\n        self.bn = nn.BatchNorm2D(\n            out_channels,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n\n        self._init_weights()\n\n    def _init_weights(self):\n        conv_init_(self.conv)\n\n    def forward(self, x):\n        # use 'x * F.sigmoid(x)' replace 'silu'\n        x = self.bn(self.conv(x))\n        y = x * F.sigmoid(x)\n        return y\n\n\nclass DWConv(nn.Layer):\n    \"\"\"Depthwise Conv\"\"\"\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 ksize,\n                 stride=1,\n                 bias=False,\n                 act=\"silu\"):\n        super(DWConv, self).__init__()\n        self.dw_conv = BaseConv(\n            in_channels,\n            in_channels,\n            ksize=ksize,\n            stride=stride,\n            groups=in_channels,\n            bias=bias,\n            act=act)\n        self.pw_conv = BaseConv(\n            in_channels,\n            out_channels,\n            ksize=1,\n            stride=1,\n            groups=1,\n            bias=bias,\n            act=act)\n\n    def forward(self, x):\n        return self.pw_conv(self.dw_conv(x))\n\n\nclass Focus(nn.Layer):\n    \"\"\"Focus width and height information into channel space, used in YOLOX.\"\"\"\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 ksize=3,\n                 stride=1,\n                 bias=False,\n                 act=\"silu\"):\n        super(Focus, self).__init__()\n        self.conv = BaseConv(\n            in_channels * 4,\n            out_channels,\n            ksize=ksize,\n            stride=stride,\n            bias=bias,\n            act=act)\n\n    def forward(self, inputs):\n        # inputs [bs, C, H, W] -> outputs [bs, 4C, W/2, H/2]\n        top_left = inputs[:, :, 0::2, 0::2]\n        top_right = 
inputs[:, :, 0::2, 1::2]\n        bottom_left = inputs[:, :, 1::2, 0::2]\n        bottom_right = inputs[:, :, 1::2, 1::2]\n        outputs = paddle.concat(\n            [top_left, bottom_left, top_right, bottom_right], 1)\n        return self.conv(outputs)\n\n\nclass BottleNeck(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 shortcut=True,\n                 expansion=0.5,\n                 depthwise=False,\n                 bias=False,\n                 act=\"silu\"):\n        super(BottleNeck, self).__init__()\n        hidden_channels = int(out_channels * expansion)\n        Conv = DWConv if depthwise else BaseConv\n        self.conv1 = BaseConv(\n            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)\n        self.conv2 = Conv(\n            hidden_channels,\n            out_channels,\n            ksize=3,\n            stride=1,\n            bias=bias,\n            act=act)\n        self.add_shortcut = shortcut and in_channels == out_channels\n\n    def forward(self, x):\n        y = self.conv2(self.conv1(x))\n        if self.add_shortcut:\n            y = y + x\n        return y\n\n\nclass SPPLayer(nn.Layer):\n    \"\"\"Spatial Pyramid Pooling (SPP) layer used in YOLOv3-SPP and YOLOX\"\"\"\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_sizes=(5, 9, 13),\n                 bias=False,\n                 act=\"silu\"):\n        super(SPPLayer, self).__init__()\n        hidden_channels = in_channels // 2\n        self.conv1 = BaseConv(\n            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)\n        self.maxpoolings = nn.LayerList([\n            nn.MaxPool2D(\n                kernel_size=ks, stride=1, padding=ks // 2)\n            for ks in kernel_sizes\n        ])\n        conv2_channels = hidden_channels * (len(kernel_sizes) + 1)\n        self.conv2 = BaseConv(\n            conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act)\n\n    def forward(self, x):\n        x = self.conv1(x)\n        x = paddle.concat([x] + [mp(x) for mp in self.maxpoolings], axis=1)\n        x = self.conv2(x)\n        return x\n\n\nclass SPPFLayer(nn.Layer):\n    \"\"\" Spatial Pyramid Pooling - Fast (SPPF) layer used in YOLOv5 by Glenn Jocher,\n        equivalent to SPP(k=(5, 9, 13))\n    \"\"\"\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 ksize=5,\n                 bias=False,\n                 act='silu'):\n        super(SPPFLayer, self).__init__()\n        hidden_channels = in_channels // 2\n        self.conv1 = BaseConv(\n            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)\n        self.maxpooling = nn.MaxPool2D(\n            kernel_size=ksize, stride=1, padding=ksize // 2)\n        conv2_channels = hidden_channels * 4\n        self.conv2 = BaseConv(\n            conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act)\n\n    def forward(self, x):\n        x = self.conv1(x)\n        y1 = self.maxpooling(x)\n        y2 = self.maxpooling(y1)\n        y3 = self.maxpooling(y2)\n        concats = paddle.concat([x, y1, y2, y3], axis=1)\n        out = self.conv2(concats)\n        return out\n\n\nclass CSPLayer(nn.Layer):\n    \"\"\"CSP (Cross Stage Partial) layer with 3 convs, named C3 in YOLOv5\"\"\"\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 
num_blocks=1,\n                 shortcut=True,\n                 expansion=0.5,\n                 depthwise=False,\n                 bias=False,\n                 act=\"silu\"):\n        super(CSPLayer, self).__init__()\n        hidden_channels = int(out_channels * expansion)\n        self.conv1 = BaseConv(\n            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)\n        self.conv2 = BaseConv(\n            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)\n        self.bottlenecks = nn.Sequential(* [\n            BottleNeck(\n                hidden_channels,\n                hidden_channels,\n                shortcut=shortcut,\n                expansion=1.0,\n                depthwise=depthwise,\n                bias=bias,\n                act=act) for _ in range(num_blocks)\n        ])\n        self.conv3 = BaseConv(\n            hidden_channels * 2,\n            out_channels,\n            ksize=1,\n            stride=1,\n            bias=bias,\n            act=act)\n\n    def forward(self, x):\n        x_1 = self.conv1(x)\n        x_1 = self.bottlenecks(x_1)\n        x_2 = self.conv2(x)\n        x = paddle.concat([x_1, x_2], axis=1)\n        x = self.conv3(x)\n        return x\n\n\n@register\n@serializable\nclass CSPDarkNet(nn.Layer):\n    \"\"\"\n    CSPDarkNet backbone.\n    Args:\n        arch (str): Architecture of CSPDarkNet, from {P5, P6, X}, default as X,\n            and 'X' means used in YOLOX, 'P5/P6' means used in YOLOv5.\n        depth_mult (float): Depth multiplier, multiply number of blocks in\n            each CSPLayer, default as 1.0.\n        width_mult (float): Width multiplier, multiply number of channels in\n            each layer, default as 1.0.\n        depthwise (bool): Whether to use depth-wise conv layer.\n        act (str): Activation function type, default as 'silu'.\n        return_idx (list): Index of stages whose feature maps are returned.\n    \"\"\"\n\n    
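# as a rough scaling sketch (not an official preset): depth_mult=0.33 with\n    # width_mult=0.50 gives a YOLOv5-s sized backbone under the 'P5' setting.\n\n    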
__shared__ = ['depth_mult', 'width_mult', 'act', 'trt']\n\n    # in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf)\n    # 'X' means setting used in YOLOX, 'P5/P6' means setting used in YOLOv5.\n    arch_settings = {\n        'X': [[64, 128, 3, True, False], [128, 256, 9, True, False],\n              [256, 512, 9, True, False], [512, 1024, 3, False, True]],\n        'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],\n               [256, 512, 9, True, False], [512, 1024, 3, True, True]],\n        'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],\n               [256, 512, 9, True, False], [512, 768, 3, True, False],\n               [768, 1024, 3, True, True]],\n    }\n\n    def __init__(self,\n                 arch='X',\n                 depth_mult=1.0,\n                 width_mult=1.0,\n                 depthwise=False,\n                 act='silu',\n                 trt=False,\n                 return_idx=[2, 3, 4]):\n        super(CSPDarkNet, self).__init__()\n        self.arch = arch\n        self.return_idx = return_idx\n        Conv = DWConv if depthwise else BaseConv\n        arch_setting = self.arch_settings[arch]\n        base_channels = int(arch_setting[0][0] * width_mult)\n\n        # Note: differences between the latest YOLOv5 and the original YOLOX\n        # 1. self.stem, use Conv(in YOLOv5) or Focus(in YOLOX)\n        # 2. use SPPF(in YOLOv5) or SPP(in YOLOX)\n        # 3. put SPPF before(YOLOv5) or SPP after(YOLOX) the last cspdark block's CSPLayer\n        # 4. whether the SPPF(SPP) stage's CSPLayer adds a shortcut: True in YOLOv5, False in YOLOX\n        if arch in ['P5', 'P6']:\n            # in the latest YOLOv5, use Conv stem, and SPPF (fast, only a single spp kernel size)\n            self.stem = Conv(\n                3, base_channels, ksize=6, stride=2, bias=False, act=act)\n            spp_kernel_sizes = 5\n        elif arch in ['X']:\n            # in the original YOLOX, use Focus stem, and SPP (three spp kernel sizes)\n            self.stem = Focus(\n                3, base_channels, ksize=3, stride=1, bias=False, act=act)\n            spp_kernel_sizes = (5, 9, 13)\n        else:\n            raise AttributeError(\"Unsupported arch type: {}\".format(arch))\n\n        _out_channels = [base_channels]\n        layers_num = 1\n        self.csp_dark_blocks = []\n\n        for i, (in_channels, out_channels, num_blocks, shortcut,\n                use_spp) in enumerate(arch_setting):\n            in_channels = int(in_channels * width_mult)\n            out_channels = int(out_channels * width_mult)\n            _out_channels.append(out_channels)\n            num_blocks = max(round(num_blocks * depth_mult), 1)\n            stage = []\n\n            conv_layer = self.add_sublayer(\n                'layers{}.stage{}.conv_layer'.format(layers_num, i + 1),\n                Conv(\n                    in_channels, out_channels, 3, 2, bias=False, act=act))\n            stage.append(conv_layer)\n            layers_num += 1\n\n            if use_spp and arch in ['X']:\n                # in YOLOX use SPPLayer\n                spp_layer = self.add_sublayer(\n                    'layers{}.stage{}.spp_layer'.format(layers_num, i + 1),\n                    SPPLayer(\n                        out_channels,\n                        out_channels,\n                        kernel_sizes=spp_kernel_sizes,\n                        bias=False,\n                        act=act))\n                stage.append(spp_layer)\n                layers_num += 1\n\n            csp_layer = self.add_sublayer(\n                'layers{}.stage{}.csp_layer'.format(layers_num, i + 1),\n                CSPLayer(\n                    out_channels,\n                    out_channels,\n                    num_blocks=num_blocks,\n                    shortcut=shortcut,\n                    depthwise=depthwise,\n                    bias=False,\n                    act=act))\n            stage.append(csp_layer)\n            layers_num += 1\n\n            if use_spp and arch in ['P5', 'P6']:\n                # in latest YOLOv5 use SPPFLayer instead of SPPLayer\n                sppf_layer = self.add_sublayer(\n                    'layers{}.stage{}.sppf_layer'.format(layers_num, i + 1),\n                    SPPFLayer(\n                        out_channels,\n                        out_channels,\n                        ksize=5,\n                        bias=False,\n                        act=act))\n                stage.append(sppf_layer)\n                layers_num += 1\n\n            self.csp_dark_blocks.append(nn.Sequential(*stage))\n\n        self._out_channels = [_out_channels[i] for i in self.return_idx]\n        self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx]\n\n    def forward(self, inputs):\n        x = inputs['image']\n        outputs = []\n        x = self.stem(x)\n        for i, layer in enumerate(self.csp_dark_blocks):\n            x = layer(x)\n            if i + 1 in self.return_idx:\n                outputs.append(x)\n        return outputs\n\n    @property\n    def 
out_shape(self):\n        return [\n            ShapeSpec(\n                channels=c, stride=s)\n            for c, s in zip(self._out_channels, self.strides)\n        ]\n"
  },
  {
    "path": "ppdet/modeling/backbones/cspresnet.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\nfrom paddle.nn.initializer import Constant\n\nfrom ppdet.modeling.ops import get_act_fn\nfrom ppdet.core.workspace import register, serializable\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['CSPResNet', 'BasicBlock', 'EffectiveSELayer', 'ConvBNLayer']\n\n\nclass ConvBNLayer(nn.Layer):\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 filter_size=3,\n                 stride=1,\n                 groups=1,\n                 padding=0,\n                 act=None):\n        super(ConvBNLayer, self).__init__()\n\n        self.conv = nn.Conv2D(\n            in_channels=ch_in,\n            out_channels=ch_out,\n            kernel_size=filter_size,\n            stride=stride,\n            padding=padding,\n            groups=groups,\n            bias_attr=False)\n\n        self.bn = nn.BatchNorm2D(\n            ch_out,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n        self.act = get_act_fn(act) if act is None or isinstance(act, (\n            str, dict)) else act\n\n    def forward(self, x):\n        x = self.conv(x)\n        x = self.bn(x)\n        x = self.act(x)\n\n        return x\n\n\nclass RepVggBlock(nn.Layer):\n    def __init__(self, ch_in, ch_out, act='relu', alpha=False):\n        super(RepVggBlock, self).__init__()\n        self.ch_in = ch_in\n        self.ch_out = ch_out\n        self.conv1 = ConvBNLayer(\n            ch_in, ch_out, 3, stride=1, padding=1, act=None)\n        self.conv2 = ConvBNLayer(\n            ch_in, ch_out, 1, stride=1, padding=0, act=None)\n        self.act = get_act_fn(act) if act is None or isinstance(act, (\n            str, dict)) else act\n        if alpha:\n            self.alpha = self.create_parameter(\n                shape=[1],\n                attr=ParamAttr(initializer=Constant(value=1.)),\n                dtype=\"float32\")\n        else:\n            self.alpha = None\n\n    def forward(self, x):\n        if hasattr(self, 'conv'):\n            y = self.conv(x)\n        else:\n            if self.alpha:\n                y = self.conv1(x) + self.alpha * self.conv2(x)\n            else:\n                y = self.conv1(x) + self.conv2(x)\n        y = self.act(y)\n        return y\n\n    def convert_to_deploy(self):\n        if not hasattr(self, 'conv'):\n            self.conv = nn.Conv2D(\n                in_channels=self.ch_in,\n                out_channels=self.ch_out,\n                kernel_size=3,\n                stride=1,\n                padding=1,\n                groups=1)\n        kernel, bias = self.get_equivalent_kernel_bias()\n    
    self.conv.weight.set_value(kernel)\n        self.conv.bias.set_value(bias)\n        self.__delattr__('conv1')\n        self.__delattr__('conv2')\n\n    def get_equivalent_kernel_bias(self):\n        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)\n        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)\n        if self.alpha:\n            return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor(\n                kernel1x1), bias3x3 + self.alpha * bias1x1\n        else:\n            return kernel3x3 + self._pad_1x1_to_3x3_tensor(\n                kernel1x1), bias3x3 + bias1x1\n\n    def _pad_1x1_to_3x3_tensor(self, kernel1x1):\n        if kernel1x1 is None:\n            return 0\n        else:\n            return nn.functional.pad(kernel1x1, [1, 1, 1, 1])\n\n    def _fuse_bn_tensor(self, branch):\n        if branch is None:\n            return 0, 0\n        kernel = branch.conv.weight\n        running_mean = branch.bn._mean\n        running_var = branch.bn._variance\n        gamma = branch.bn.weight\n        beta = branch.bn.bias\n        eps = branch.bn._epsilon\n        std = (running_var + eps).sqrt()\n        t = (gamma / std).reshape((-1, 1, 1, 1))\n        return kernel * t, beta - running_mean * gamma / std\n\n\nclass BasicBlock(nn.Layer):\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 act='relu',\n                 shortcut=True,\n                 use_alpha=False):\n        super(BasicBlock, self).__init__()\n        assert ch_in == ch_out\n        self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act)\n        self.conv2 = RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha)\n        self.shortcut = shortcut\n\n    def forward(self, x):\n        y = self.conv1(x)\n        y = self.conv2(y)\n        if self.shortcut:\n            return paddle.add(x, y)\n        else:\n            return y\n\n\nclass EffectiveSELayer(nn.Layer):\n    \"\"\" Effective Squeeze-Excitation\n    From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667\n    \"\"\"\n\n    def __init__(self, channels, act='hardsigmoid'):\n        super(EffectiveSELayer, self).__init__()\n        self.fc = nn.Conv2D(channels, channels, kernel_size=1, padding=0)\n        self.act = get_act_fn(act) if act is None or isinstance(act, (\n            str, dict)) else act\n\n    def forward(self, x):\n        x_se = x.mean((2, 3), keepdim=True)\n        x_se = self.fc(x_se)\n        return x * self.act(x_se)\n\n\nclass CSPResStage(nn.Layer):\n    def __init__(self,\n                 block_fn,\n                 ch_in,\n                 ch_out,\n                 n,\n                 stride,\n                 act='relu',\n                 attn='eca',\n                 use_alpha=False):\n        super(CSPResStage, self).__init__()\n\n        ch_mid = (ch_in + ch_out) // 2\n        if stride == 2:\n            self.conv_down = ConvBNLayer(\n                ch_in, ch_mid, 3, stride=2, padding=1, act=act)\n        else:\n            self.conv_down = None\n        self.conv1 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act)\n        self.conv2 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act)\n        self.blocks = nn.Sequential(*[\n            block_fn(\n                ch_mid // 2,\n                ch_mid // 2,\n                act=act,\n                shortcut=True,\n                use_alpha=use_alpha) for i in range(n)\n        ])\n        if attn:\n            self.attn = EffectiveSELayer(ch_mid, act='hardsigmoid')\n 
       else:\n            self.attn = None\n\n        self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act)\n\n    def forward(self, x):\n        if self.conv_down is not None:\n            x = self.conv_down(x)\n        y1 = self.conv1(x)\n        y2 = self.blocks(self.conv2(x))\n        y = paddle.concat([y1, y2], axis=1)\n        if self.attn is not None:\n            y = self.attn(y)\n        y = self.conv3(y)\n        return y\n\n\n@register\n@serializable\nclass CSPResNet(nn.Layer):\n    __shared__ = ['width_mult', 'depth_mult', 'trt']\n\n    def __init__(self,\n                 layers=[3, 6, 6, 3],\n                 channels=[64, 128, 256, 512, 1024],\n                 act='swish',\n                 return_idx=[1, 2, 3],\n                 depth_wise=False,\n                 use_large_stem=False,\n                 width_mult=1.0,\n                 depth_mult=1.0,\n                 trt=False,\n                 use_checkpoint=False,\n                 use_alpha=False,\n                 **args):\n        super(CSPResNet, self).__init__()\n        self.use_checkpoint = use_checkpoint\n        channels = [max(round(c * width_mult), 1) for c in channels]\n        layers = [max(round(l * depth_mult), 1) for l in layers]\n        act = get_act_fn(\n            act, trt=trt) if act is None or isinstance(act,\n                                                       (str, dict)) else act\n\n        if use_large_stem:\n            self.stem = nn.Sequential(\n                ('conv1', ConvBNLayer(\n                    3, channels[0] // 2, 3, stride=2, padding=1, act=act)),\n                ('conv2', ConvBNLayer(\n                    channels[0] // 2,\n                    channels[0] // 2,\n                    3,\n                    stride=1,\n                    padding=1,\n                    act=act)), ('conv3', ConvBNLayer(\n                        channels[0] // 2,\n                        channels[0],\n                        3,\n                        stride=1,\n                        padding=1,\n                        act=act)))\n        else:\n            self.stem = nn.Sequential(\n                ('conv1', ConvBNLayer(\n                    3, channels[0] // 2, 3, stride=2, padding=1, act=act)),\n                ('conv2', ConvBNLayer(\n                    channels[0] // 2,\n                    channels[0],\n                    3,\n                    stride=1,\n                    padding=1,\n                    act=act)))\n\n        n = len(channels) - 1\n        self.stages = nn.Sequential(*[(str(i), CSPResStage(\n            BasicBlock,\n            channels[i],\n            channels[i + 1],\n            layers[i],\n            2,\n            act=act,\n            use_alpha=use_alpha)) for i in range(n)])\n\n        self._out_channels = channels[1:]\n        self._out_strides = [4 * 2**i for i in range(n)]\n        self.return_idx = return_idx\n        if use_checkpoint:\n            paddle.seed(0)\n\n    def forward(self, inputs):\n        x = inputs['image']\n        x = self.stem(x)\n        outs = []\n        for idx, stage in enumerate(self.stages):\n            if self.use_checkpoint and self.training:\n                x = paddle.distributed.fleet.utils.recompute(\n                    stage, x, **{\"preserve_rng_state\": True})\n            else:\n                x = stage(x)\n            if idx in self.return_idx:\n                outs.append(x)\n\n        return outs\n\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(\n                
channels=self._out_channels[i], stride=self._out_strides[i])\n            for i in self.return_idx\n        ]\n"
  },
  {
    "path": "ppdet/modeling/backbones/darknet.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\n\nfrom ppdet.core.workspace import register, serializable\nfrom ppdet.modeling.ops import batch_norm, mish\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['DarkNet', 'ConvBNLayer']\n\n\nclass ConvBNLayer(nn.Layer):\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 filter_size=3,\n                 stride=1,\n                 groups=1,\n                 padding=0,\n                 norm_type='bn',\n                 norm_decay=0.,\n                 act=\"leaky\",\n                 freeze_norm=False,\n                 data_format='NCHW',\n                 name=''):\n        \"\"\"\n        conv + bn + activation layer\n\n        Args:\n            ch_in (int): input channel\n            ch_out (int): output channel\n            filter_size (int): filter size, default 3\n            stride (int): stride, default 1\n            groups (int): number of groups of conv layer, default 1\n            padding (int): padding size, default 0\n            norm_type (str): batch norm type, default bn\n            norm_decay (str): decay for weight and bias of batch norm layer, default 0.\n            act (str): activation function type, default 'leaky', which means leaky_relu\n            freeze_norm (bool): whether to freeze norm, default False\n            data_format (str): data format, NCHW or NHWC\n        \"\"\"\n        super(ConvBNLayer, self).__init__()\n\n        self.conv = nn.Conv2D(\n            in_channels=ch_in,\n            out_channels=ch_out,\n            kernel_size=filter_size,\n            stride=stride,\n            padding=padding,\n            groups=groups,\n            data_format=data_format,\n            bias_attr=False)\n        self.batch_norm = batch_norm(\n            ch_out,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            data_format=data_format)\n        self.act = act\n\n    def forward(self, inputs):\n        out = self.conv(inputs)\n        out = self.batch_norm(out)\n        if self.act == 'leaky':\n            out = F.leaky_relu(out, 0.1)\n        else:\n            out = getattr(F, self.act)(out)\n        return out\n\n\nclass DownSample(nn.Layer):\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 filter_size=3,\n                 stride=2,\n                 padding=1,\n                 norm_type='bn',\n                 norm_decay=0.,\n                 freeze_norm=False,\n                 data_format='NCHW'):\n        \"\"\"\n        downsample layer\n\n        Args:\n            ch_in (int): input channel\n            ch_out (int): output channel\n            filter_size (int): filter size, default 3\n            stride (int): stride, default 2\n            padding (int): padding size, default 1\n        
    norm_type (str): batch norm type, default bn\n            norm_decay (float): decay for weight and bias of batch norm layer, default 0.\n            freeze_norm (bool): whether to freeze norm, default False\n            data_format (str): data format, NCHW or NHWC\n        \"\"\"\n\n        super(DownSample, self).__init__()\n\n        self.conv_bn_layer = ConvBNLayer(\n            ch_in=ch_in,\n            ch_out=ch_out,\n            filter_size=filter_size,\n            stride=stride,\n            padding=padding,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            data_format=data_format)\n        self.ch_out = ch_out\n\n    def forward(self, inputs):\n        out = self.conv_bn_layer(inputs)\n        return out\n\n\nclass BasicBlock(nn.Layer):\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 norm_type='bn',\n                 norm_decay=0.,\n                 freeze_norm=False,\n                 data_format='NCHW'):\n        \"\"\"\n        BasicBlock layer of DarkNet\n\n        Args:\n            ch_in (int): input channel\n            ch_out (int): output channel\n            norm_type (str): batch norm type, default bn\n            norm_decay (float): decay for weight and bias of batch norm layer, default 0.\n            freeze_norm (bool): whether to freeze norm, default False\n            data_format (str): data format, NCHW or NHWC\n        \"\"\"\n\n        super(BasicBlock, self).__init__()\n\n        assert ch_in == ch_out and (ch_in % 2) == 0, \\\n            f\"ch_in and ch_out should be the same even int, but got ch_in={ch_in} and ch_out={ch_out}\"\n        # example:\n        # --------------{conv1} --> {conv2}\n        # channel route: 10-->5 --> 5-->10\n        self.conv1 = ConvBNLayer(\n            ch_in=ch_in,\n            ch_out=int(ch_out / 2),\n            filter_size=1,\n            stride=1,\n            padding=0,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            data_format=data_format)\n        self.conv2 = ConvBNLayer(\n            ch_in=int(ch_out / 2),\n            ch_out=ch_out,\n            filter_size=3,\n            stride=1,\n            padding=1,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            data_format=data_format)\n\n    def forward(self, inputs):\n        conv1 = self.conv1(inputs)\n        conv2 = self.conv2(conv1)\n        out = paddle.add(x=inputs, y=conv2)\n        return out\n\n\nclass Blocks(nn.Layer):\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 count,\n                 norm_type='bn',\n                 norm_decay=0.,\n                 freeze_norm=False,\n                 name=None,\n                 data_format='NCHW'):\n        \"\"\"\n        Blocks layer, which consists of stacked BasicBlock layers\n\n        Args:\n            ch_in (int): input channel\n            ch_out (int): output channel\n            count (int): number of BasicBlock layer\n            norm_type (str): batch norm type, default bn\n            norm_decay (float): decay for weight and bias of batch norm layer, default 0.\n            freeze_norm (bool): whether to freeze norm, default False\n            name (str): layer name\n            data_format (str): data format, NCHW or NHWC\n        \"\"\"\n        super(Blocks, self).__init__()\n\n        
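# stack 'count' BasicBlocks; BasicBlock requires ch_in == ch_out, so every\n        # residual add inside the stage sees matching channel widths\n        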
\n        self.basicblock0 = BasicBlock(\n            ch_in,\n            ch_out,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            data_format=data_format)\n        self.res_out_list = []\n        for i in range(1, count):\n            block_name = '{}.{}'.format(name, i)\n            res_out = self.add_sublayer(\n                block_name,\n                BasicBlock(\n                    ch_out,\n                    ch_out,\n                    norm_type=norm_type,\n                    norm_decay=norm_decay,\n                    freeze_norm=freeze_norm,\n                    data_format=data_format))\n            self.res_out_list.append(res_out)\n        self.ch_out = ch_out\n\n    def forward(self, inputs):\n        y = self.basicblock0(inputs)\n        for basic_block_i in self.res_out_list:\n            y = basic_block_i(y)\n        return y\n\n\nDarkNet_cfg = {53: ([1, 2, 8, 8, 4])}\n\n\n@register\n@serializable\nclass DarkNet(nn.Layer):\n    __shared__ = ['norm_type', 'data_format']\n\n    def __init__(self,\n                 depth=53,\n                 freeze_at=-1,\n                 return_idx=[2, 3, 4],\n                 num_stages=5,\n                 norm_type='bn',\n                 norm_decay=0.,\n                 freeze_norm=False,\n                 data_format='NCHW'):\n        \"\"\"\n        Darknet, see https://pjreddie.com/darknet/yolo/\n\n        Args:\n            depth (int): depth of network\n            freeze_at (int): freeze the backbone at which stage\n            return_idx (list): index of stages whose feature maps are returned\n            num_stages (int): number of stages to build, default 5\n            norm_type (str): batch norm type, default bn\n            norm_decay (float): decay for weight and bias of batch norm layer, default 0.\n            freeze_norm (bool): whether to freeze norm, default False\n            data_format (str): data format, NCHW or NHWC\n        \"\"\"\n        super(DarkNet, self).__init__()\n        self.depth = depth\n        self.freeze_at = freeze_at\n        self.return_idx = return_idx\n        self.num_stages = num_stages\n        self.stages = DarkNet_cfg[self.depth][0:num_stages]\n\n        self.conv0 = ConvBNLayer(\n            ch_in=3,\n            ch_out=32,\n            filter_size=3,\n            stride=1,\n            padding=1,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            data_format=data_format)\n\n        self.downsample0 = DownSample(\n            ch_in=32,\n            ch_out=32 * 2,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            data_format=data_format)\n\n        self._out_channels = []\n        self.darknet_conv_block_list = []\n        self.downsample_list = []\n        ch_in = [64, 128, 256, 512, 1024]\n        for i, stage in enumerate(self.stages):\n            name = 'stage.{}'.format(i)\n            conv_block = self.add_sublayer(\n                name,\n                Blocks(\n                    int(ch_in[i]),\n                    int(ch_in[i]),\n                    stage,\n                    norm_type=norm_type,\n                    norm_decay=norm_decay,\n                    freeze_norm=freeze_norm,\n                    data_format=data_format,\n                    name=name))\n            self.darknet_conv_block_list.append(conv_block)\n            if i in return_idx:\n                self._out_channels.append(int(ch_in[i]))\n        for i in range(num_stages - 1):\n            down_name = 'stage.{}.downsample'.format(i)\n            downsample = self.add_sublayer(\n                down_name,\n                DownSample(\n                    ch_in=int(ch_in[i]),\n                    ch_out=int(ch_in[i + 1]),\n                    norm_type=norm_type,\n                    norm_decay=norm_decay,\n                    freeze_norm=freeze_norm,\n                    data_format=data_format))\n            self.downsample_list.append(downsample)\n\n    def forward(self, inputs):\n        x = inputs['image']\n\n        out = self.conv0(x)\n        out = self.downsample0(out)\n        blocks = []\n        for i, conv_block_i in enumerate(self.darknet_conv_block_list):\n            out = conv_block_i(out)\n            if i == self.freeze_at:\n                out.stop_gradient = True\n            if i in self.return_idx:\n                blocks.append(out)\n            if i < self.num_stages - 1:\n                out = self.downsample_list[i](out)\n        return blocks\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=c) for c in self._out_channels]\n"
  },
  {
    "path": "ppdet/modeling/backbones/dla.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register, serializable\nfrom ppdet.modeling.layers import ConvNormLayer\nfrom ..shape_spec import ShapeSpec\n\nDLA_cfg = {34: ([1, 1, 1, 2, 2, 1], [16, 32, 64, 128, 256, 512]), }\n\n\nclass BasicBlock(nn.Layer):\n    def __init__(self, ch_in, ch_out, stride=1):\n        super(BasicBlock, self).__init__()\n        self.conv1 = ConvNormLayer(\n            ch_in,\n            ch_out,\n            filter_size=3,\n            stride=stride,\n            bias_on=False,\n            norm_decay=None)\n        self.conv2 = ConvNormLayer(\n            ch_out,\n            ch_out,\n            filter_size=3,\n            stride=1,\n            bias_on=False,\n            norm_decay=None)\n\n    def forward(self, inputs, residual=None):\n        if residual is None:\n            residual = inputs\n\n        out = self.conv1(inputs)\n        out = F.relu(out)\n\n        out = self.conv2(out)\n\n        out = paddle.add(x=out, y=residual)\n        out = F.relu(out)\n\n        return out\n\n\nclass Root(nn.Layer):\n    def __init__(self, ch_in, ch_out, kernel_size, residual):\n        super(Root, self).__init__()\n        self.conv = ConvNormLayer(\n            ch_in,\n            ch_out,\n            filter_size=1,\n            stride=1,\n            bias_on=False,\n            norm_decay=None)\n        self.residual = residual\n\n    def forward(self, inputs):\n        children = inputs\n        out = self.conv(paddle.concat(inputs, axis=1))\n        if self.residual:\n            out = paddle.add(x=out, y=children[0])\n        out = F.relu(out)\n\n        return out\n\n\nclass Tree(nn.Layer):\n    def __init__(self,\n                 level,\n                 block,\n                 ch_in,\n                 ch_out,\n                 stride=1,\n                 level_root=False,\n                 root_dim=0,\n                 root_kernel_size=1,\n                 root_residual=False):\n        super(Tree, self).__init__()\n        if root_dim == 0:\n            root_dim = 2 * ch_out\n        if level_root:\n            root_dim += ch_in\n        if level == 1:\n            self.tree1 = block(ch_in, ch_out, stride)\n            self.tree2 = block(ch_out, ch_out, 1)\n        else:\n            self.tree1 = Tree(\n                level - 1,\n                block,\n                ch_in,\n                ch_out,\n                stride,\n                root_dim=0,\n                root_kernel_size=root_kernel_size,\n                root_residual=root_residual)\n            self.tree2 = Tree(\n                level - 1,\n                block,\n                ch_out,\n                ch_out,\n                1,\n                root_dim=root_dim + ch_out,\n                root_kernel_size=root_kernel_size,\n                
\n                root_residual=root_residual)\n\n        if level == 1:\n            self.root = Root(root_dim, ch_out, root_kernel_size, root_residual)\n        self.level_root = level_root\n        self.root_dim = root_dim\n        self.downsample = None\n        self.project = None\n        self.level = level\n        if stride > 1:\n            self.downsample = nn.MaxPool2D(stride, stride=stride)\n        if ch_in != ch_out:\n            self.project = ConvNormLayer(\n                ch_in,\n                ch_out,\n                filter_size=1,\n                stride=1,\n                bias_on=False,\n                norm_decay=None)\n\n    def forward(self, x, residual=None, children=None):\n        children = [] if children is None else children\n        bottom = self.downsample(x) if self.downsample else x\n        residual = self.project(bottom) if self.project else bottom\n        if self.level_root:\n            children.append(bottom)\n        x1 = self.tree1(x, residual)\n        if self.level == 1:\n            x2 = self.tree2(x1)\n            x = self.root([x2, x1] + children)\n        else:\n            children.append(x1)\n            x = self.tree2(x1, children=children)\n        return x\n\n\n@register\n@serializable\nclass DLA(nn.Layer):\n    \"\"\"\n    DLA, see https://arxiv.org/pdf/1707.06484.pdf\n\n    Args:\n        depth (int): DLA depth, only 34 is supported now.\n        residual_root (bool): whether to use a residual layer in the root block\n        pre_img (bool): add pre_img, only used in CenterTrack\n        pre_hm (bool): add pre_hm, only used in CenterTrack\n    \"\"\"\n\n    def __init__(self,\n                 depth=34,\n                 residual_root=False,\n                 pre_img=False,\n                 pre_hm=False):\n        super(DLA, self).__init__()\n        assert depth == 34, 'Only DLA with depth 34 is supported now.'\n        if depth == 34:\n            block = BasicBlock\n        levels, channels = DLA_cfg[depth]\n        self.channels = channels\n        self.num_levels = len(levels)\n\n        self.base_layer = nn.Sequential(\n            ConvNormLayer(\n                3,\n                channels[0],\n                filter_size=7,\n                stride=1,\n                bias_on=False,\n                norm_decay=None),\n            nn.ReLU())\n        self.level0 = self._make_conv_level(channels[0], channels[0], levels[0])\n        self.level1 = self._make_conv_level(\n            channels[0], channels[1], levels[1], stride=2)\n        self.level2 = Tree(\n            levels[2],\n            block,\n            channels[1],\n            channels[2],\n            2,\n            level_root=False,\n            root_residual=residual_root)\n        self.level3 = Tree(\n            levels[3],\n            block,\n            channels[2],\n            channels[3],\n            2,\n            level_root=True,\n            root_residual=residual_root)\n        self.level4 = Tree(\n            levels[4],\n            block,\n            channels[3],\n            channels[4],\n            2,\n            level_root=True,\n            root_residual=residual_root)\n        self.level5 = Tree(\n            levels[5],\n            block,\n            channels[4],\n            channels[5],\n            2,\n            level_root=True,\n            root_residual=residual_root)\n\n        if pre_img:\n            self.pre_img_layer = nn.Sequential(\n                ConvNormLayer(\n                    3,\n                    channels[0],\n
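                    # CenterTrack only: embeds the previous frame with a stem-like 7x7 conv\n                    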
filter_size=7,\n                    stride=1,\n                    bias_on=False,\n                    norm_decay=None),\n                nn.ReLU())\n        if pre_hm:\n            self.pre_hm_layer = nn.Sequential(\n                ConvNormLayer(\n                    1,\n                    channels[0],\n                    filter_size=7,\n                    stride=1,\n                    bias_on=False,\n                    norm_decay=None),\n                nn.ReLU())\n        self.pre_img = pre_img\n        self.pre_hm = pre_hm\n\n    def _make_conv_level(self, ch_in, ch_out, conv_num, stride=1):\n        modules = []\n        for i in range(conv_num):\n            modules.extend([\n                ConvNormLayer(\n                    ch_in,\n                    ch_out,\n                    filter_size=3,\n                    stride=stride if i == 0 else 1,\n                    bias_on=False,\n                    norm_decay=None), nn.ReLU()\n            ])\n            ch_in = ch_out\n        return nn.Sequential(*modules)\n\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(channels=self.channels[i]) for i in range(self.num_levels)\n        ]\n\n    def forward(self, inputs):\n        outs = []\n        feats = self.base_layer(inputs['image'])\n\n        if self.pre_img and 'pre_image' in inputs and inputs[\n                'pre_image'] is not None:\n            feats = feats + self.pre_img_layer(inputs['pre_image'])\n\n        if self.pre_hm and 'pre_hm' in inputs and inputs['pre_hm'] is not None:\n            feats = feats + self.pre_hm_layer(inputs['pre_hm'])\n\n        for i in range(self.num_levels):\n            feats = getattr(self, 'level{}'.format(i))(feats)\n            outs.append(feats)\n\n        return outs\n"
  },
  {
    "path": "ppdet/modeling/backbones/esnet.py",
    "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#    http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm\nfrom paddle.nn.initializer import KaimingNormal\nfrom paddle.regularizer import L2Decay\n\nfrom ppdet.core.workspace import register, serializable\nfrom numbers import Integral\nfrom ..shape_spec import ShapeSpec\nfrom ppdet.modeling.ops import channel_shuffle\nfrom ppdet.modeling.backbones.shufflenet_v2 import ConvBNLayer\n\n__all__ = ['ESNet']\n\n\ndef make_divisible(v, divisor=16, min_value=None):\n    if min_value is None:\n        min_value = divisor\n    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)\n    if new_v < 0.9 * v:\n        new_v += divisor\n    return new_v\n\n\nclass SEModule(nn.Layer):\n    def __init__(self, channel, reduction=4):\n        super(SEModule, self).__init__()\n        self.avg_pool = AdaptiveAvgPool2D(1)\n        self.conv1 = Conv2D(\n            in_channels=channel,\n            out_channels=channel // reduction,\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            weight_attr=ParamAttr(),\n            bias_attr=ParamAttr())\n        self.conv2 = Conv2D(\n            in_channels=channel // reduction,\n            out_channels=channel,\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            weight_attr=ParamAttr(),\n            bias_attr=ParamAttr())\n\n    def forward(self, inputs):\n        outputs = self.avg_pool(inputs)\n        outputs = self.conv1(outputs)\n        outputs = F.relu(outputs)\n        outputs = self.conv2(outputs)\n        outputs = F.hardsigmoid(outputs)\n        return paddle.multiply(x=inputs, y=outputs)\n\n\nclass InvertedResidual(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 mid_channels,\n                 out_channels,\n                 stride,\n                 act=\"relu\"):\n        super(InvertedResidual, self).__init__()\n        self._conv_pw = ConvBNLayer(\n            in_channels=in_channels // 2,\n            out_channels=mid_channels // 2,\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            groups=1,\n            act=act)\n        self._conv_dw = ConvBNLayer(\n            in_channels=mid_channels // 2,\n            out_channels=mid_channels // 2,\n            kernel_size=3,\n            stride=stride,\n            padding=1,\n            groups=mid_channels // 2,\n            act=None)\n        self._se = SEModule(mid_channels)\n\n        self._conv_linear = ConvBNLayer(\n            in_channels=mid_channels,\n            out_channels=out_channels // 2,\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            groups=1,\n 
           act=act)\n\n    def forward(self, inputs):\n        x1, x2 = paddle.split(\n            inputs,\n            num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2],\n            axis=1)\n        x2 = self._conv_pw(x2)\n        x3 = self._conv_dw(x2)\n        x3 = paddle.concat([x2, x3], axis=1)\n        x3 = self._se(x3)\n        x3 = self._conv_linear(x3)\n        out = paddle.concat([x1, x3], axis=1)\n        return channel_shuffle(out, 2)\n\n\nclass InvertedResidualDS(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 mid_channels,\n                 out_channels,\n                 stride,\n                 act=\"relu\"):\n        super(InvertedResidualDS, self).__init__()\n\n        # branch1\n        self._conv_dw_1 = ConvBNLayer(\n            in_channels=in_channels,\n            out_channels=in_channels,\n            kernel_size=3,\n            stride=stride,\n            padding=1,\n            groups=in_channels,\n            act=None)\n        self._conv_linear_1 = ConvBNLayer(\n            in_channels=in_channels,\n            out_channels=out_channels // 2,\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            groups=1,\n            act=act)\n        # branch2\n        self._conv_pw_2 = ConvBNLayer(\n            in_channels=in_channels,\n            out_channels=mid_channels // 2,\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            groups=1,\n            act=act)\n        self._conv_dw_2 = ConvBNLayer(\n            in_channels=mid_channels // 2,\n            out_channels=mid_channels // 2,\n            kernel_size=3,\n            stride=stride,\n            padding=1,\n            groups=mid_channels // 2,\n            act=None)\n        self._se = SEModule(mid_channels // 2)\n        self._conv_linear_2 = ConvBNLayer(\n            in_channels=mid_channels // 2,\n            out_channels=out_channels // 2,\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            groups=1,\n            act=act)\n        self._conv_dw_mv1 = ConvBNLayer(\n            in_channels=out_channels,\n            out_channels=out_channels,\n            kernel_size=3,\n            stride=1,\n            padding=1,\n            groups=out_channels,\n            act=\"hard_swish\")\n        self._conv_pw_mv1 = ConvBNLayer(\n            in_channels=out_channels,\n            out_channels=out_channels,\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            groups=1,\n            act=\"hard_swish\")\n\n    def forward(self, inputs):\n        x1 = self._conv_dw_1(inputs)\n        x1 = self._conv_linear_1(x1)\n        x2 = self._conv_pw_2(inputs)\n        x2 = self._conv_dw_2(x2)\n        x2 = self._se(x2)\n        x2 = self._conv_linear_2(x2)\n        out = paddle.concat([x1, x2], axis=1)\n        out = self._conv_dw_mv1(out)\n        out = self._conv_pw_mv1(out)\n\n        return out\n\n\n@register\n@serializable\nclass ESNet(nn.Layer):\n    def __init__(self,\n                 scale=1.0,\n                 act=\"hard_swish\",\n                 feature_maps=[4, 11, 14],\n                 channel_ratio=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]):\n        super(ESNet, self).__init__()\n        self.scale = scale\n        if isinstance(feature_maps, Integral):\n            feature_maps = [feature_maps]\n        self.feature_maps = feature_maps\n        stage_repeats = [3, 7, 3]\n\n        stage_out_channels = [\n            -1, 24, 
make_divisible(128 * scale), make_divisible(256 * scale),\n            make_divisible(512 * scale), 1024\n        ]\n\n        self._out_channels = []\n        self._feature_idx = 0\n        # 1. conv1\n        self._conv1 = ConvBNLayer(\n            in_channels=3,\n            out_channels=stage_out_channels[1],\n            kernel_size=3,\n            stride=2,\n            padding=1,\n            act=act)\n        self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)\n        self._feature_idx += 1\n\n        # 2. bottleneck sequences\n        self._block_list = []\n        arch_idx = 0\n        for stage_id, num_repeat in enumerate(stage_repeats):\n            for i in range(num_repeat):\n                channels_scales = channel_ratio[arch_idx]\n                mid_c = make_divisible(\n                    int(stage_out_channels[stage_id + 2] * channels_scales),\n                    divisor=8)\n                if i == 0:\n                    block = self.add_sublayer(\n                        name=str(stage_id + 2) + '_' + str(i + 1),\n                        sublayer=InvertedResidualDS(\n                            in_channels=stage_out_channels[stage_id + 1],\n                            mid_channels=mid_c,\n                            out_channels=stage_out_channels[stage_id + 2],\n                            stride=2,\n                            act=act))\n                else:\n                    block = self.add_sublayer(\n                        name=str(stage_id + 2) + '_' + str(i + 1),\n                        sublayer=InvertedResidual(\n                            in_channels=stage_out_channels[stage_id + 2],\n                            mid_channels=mid_c,\n                            out_channels=stage_out_channels[stage_id + 2],\n                            stride=1,\n                            act=act))\n                self._block_list.append(block)\n                arch_idx += 1\n                self._feature_idx += 1\n                self._update_out_channels(stage_out_channels[stage_id + 2],\n                                          self._feature_idx, self.feature_maps)\n\n    def _update_out_channels(self, channel, feature_idx, feature_maps):\n        if feature_idx in feature_maps:\n            self._out_channels.append(channel)\n\n    def forward(self, inputs):\n        y = self._conv1(inputs['image'])\n        y = self._max_pool(y)\n        outs = []\n        for i, inv in enumerate(self._block_list):\n            y = inv(y)\n            if i + 2 in self.feature_maps:\n                outs.append(y)\n\n        return outs\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=c) for c in self._out_channels]\n"
  },
  {
    "path": "ppdet/modeling/backbones/focalnet.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nThis code is based on https://github.com/microsoft/FocalNet/blob/main/classification/focalnet.py\n\"\"\"\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.modeling.shape_spec import ShapeSpec\nfrom ppdet.core.workspace import register, serializable\nfrom .transformer_utils import DropPath, Identity\nfrom .transformer_utils import add_parameter, to_2tuple\nfrom .transformer_utils import ones_, zeros_, trunc_normal_\nfrom .swin_transformer import Mlp\n\n__all__ = ['FocalNet']\n\nMODEL_cfg = {\n    'focalnet_T_224_1k_srf': dict(\n        embed_dim=96,\n        depths=[2, 2, 6, 2],\n        focal_levels=[2, 2, 2, 2],\n        focal_windows=[3, 3, 3, 3],\n        drop_path_rate=0.2,\n        use_conv_embed=False,\n        use_postln=False,\n        use_postln_in_modulation=False,\n        use_layerscale=False,\n        normalize_modulator=False,\n        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_srf_pretrained.pdparams',\n    ),\n    'focalnet_S_224_1k_srf': dict(\n        embed_dim=96,\n        depths=[2, 2, 18, 2],\n        focal_levels=[2, 2, 2, 2],\n        focal_windows=[3, 3, 3, 3],\n        drop_path_rate=0.3,\n        use_conv_embed=False,\n        use_postln=False,\n        use_postln_in_modulation=False,\n        use_layerscale=False,\n        normalize_modulator=False,\n        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_srf_pretrained.pdparams',\n    ),\n    'focalnet_B_224_1k_srf': dict(\n        embed_dim=128,\n        depths=[2, 2, 18, 2],\n        focal_levels=[2, 2, 2, 2],\n        focal_windows=[3, 3, 3, 3],\n        drop_path_rate=0.5,\n        use_conv_embed=False,\n        use_postln=False,\n        use_postln_in_modulation=False,\n        use_layerscale=False,\n        normalize_modulator=False,\n        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_srf_pretrained.pdparams',\n    ),\n    'focalnet_T_224_1k_lrf': dict(\n        embed_dim=96,\n        depths=[2, 2, 6, 2],\n        focal_levels=[3, 3, 3, 3],\n        focal_windows=[3, 3, 3, 3],\n        drop_path_rate=0.2,\n        use_conv_embed=False,\n        use_postln=False,\n        use_postln_in_modulation=False,\n        use_layerscale=False,\n        normalize_modulator=False,\n        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_lrf_pretrained.pdparams',\n    ),\n    'focalnet_S_224_1k_lrf': dict(\n        embed_dim=96,\n        depths=[2, 2, 18, 2],\n        focal_levels=[3, 3, 3, 3],\n        focal_windows=[3, 3, 3, 3],\n        drop_path_rate=0.3,\n        use_conv_embed=False,\n        use_postln=False,\n        use_postln_in_modulation=False,\n        use_layerscale=False,\n        normalize_modulator=False,\n        
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_lrf_pretrained.pdparams',\n    ),\n    'focalnet_B_224_1k_lrf': dict(\n        embed_dim=128,\n        depths=[2, 2, 18, 2],\n        focal_levels=[3, 3, 3, 3],\n        focal_windows=[3, 3, 3, 3],\n        drop_path_rate=0.5,\n        use_conv_embed=False,\n        use_postln=False,\n        use_postln_in_modulation=False,\n        use_layerscale=False,\n        normalize_modulator=False,\n        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_lrf_pretrained.pdparams',\n    ),\n    'focalnet_L_384_22k_fl3': dict(\n        embed_dim=192,\n        depths=[2, 2, 18, 2],\n        focal_levels=[3, 3, 3, 3],\n        focal_windows=[5, 5, 5, 5],\n        drop_path_rate=0.5,\n        use_conv_embed=True,\n        use_postln=True,\n        use_postln_in_modulation=False,\n        use_layerscale=True,\n        normalize_modulator=False,\n        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_pretrained.pdparams',\n    ),\n    'focalnet_L_384_22k_fl4': dict(\n        embed_dim=192,\n        depths=[2, 2, 18, 2],\n        focal_levels=[4, 4, 4, 4],\n        focal_windows=[3, 3, 3, 3],\n        drop_path_rate=0.5,\n        use_conv_embed=True,\n        use_postln=True,\n        use_postln_in_modulation=False,\n        use_layerscale=True,\n        normalize_modulator=True,  #\n        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_fl4_pretrained.pdparams',\n    ),\n    'focalnet_XL_384_22k_fl3': dict(\n        embed_dim=256,\n        depths=[2, 2, 18, 2],\n        focal_levels=[3, 3, 3, 3],\n        focal_windows=[5, 5, 5, 5],\n        drop_path_rate=0.5,\n        use_conv_embed=True,\n        use_postln=True,\n        use_postln_in_modulation=False,\n        use_layerscale=True,\n        normalize_modulator=False,\n        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_pretrained.pdparams',\n    ),\n    'focalnet_XL_384_22k_fl4': dict(\n        embed_dim=256,\n        depths=[2, 2, 18, 2],\n        focal_levels=[4, 4, 4, 4],\n        focal_windows=[3, 3, 3, 3],\n        drop_path_rate=0.5,\n        use_conv_embed=True,\n        use_postln=True,\n        use_postln_in_modulation=False,\n        use_layerscale=True,\n        normalize_modulator=False,\n        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_fl4_pretrained.pdparams',\n    ),\n    'focalnet_H_224_22k_fl3': dict(\n        embed_dim=352,\n        depths=[2, 2, 18, 2],\n        focal_levels=[3, 3, 3, 3],\n        focal_windows=[3, 3, 3, 3],\n        drop_path_rate=0.5,\n        use_conv_embed=True,\n        use_postln=True,\n        use_postln_in_modulation=True,  #\n        use_layerscale=True,\n        normalize_modulator=False,\n        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_pretrained.pdparams',\n    ),\n    'focalnet_H_224_22k_fl4': dict(\n        embed_dim=352,\n        depths=[2, 2, 18, 2],\n        focal_levels=[4, 4, 4, 4],\n        focal_windows=[3, 3, 3, 3],\n        drop_path_rate=0.5,\n        use_conv_embed=True,\n        use_postln=True,\n        use_postln_in_modulation=True,  #\n        use_layerscale=True,\n        normalize_modulator=False,\n        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_fl4_pretrained.pdparams',\n    ),\n}\n\n\nclass 
FocalModulation(nn.Layer):\n    \"\"\"\n    Args:\n        dim (int): Number of input channels.\n        proj_drop (float, optional): Dropout ratio of output. Default: 0.0\n        focal_level (int): Number of focal levels\n        focal_window (int): Focal window size at focal level 1\n        focal_factor (int): Step to increase the focal window. Default: 2\n        use_postln_in_modulation (bool): Whether use post-modulation layernorm\n        normalize_modulator (bool): Whether use normalize in modulator\n    \"\"\"\n\n    def __init__(self,\n                 dim,\n                 proj_drop=0.,\n                 focal_level=2,\n                 focal_window=7,\n                 focal_factor=2,\n                 use_postln_in_modulation=False,\n                 normalize_modulator=False):\n        super().__init__()\n        self.dim = dim\n\n        # specific args for focalv3\n        self.focal_level = focal_level\n        self.focal_window = focal_window\n        self.focal_factor = focal_factor\n        self.use_postln_in_modulation = use_postln_in_modulation\n        self.normalize_modulator = normalize_modulator\n\n        self.f = nn.Linear(\n            dim, 2 * dim + (self.focal_level + 1), bias_attr=True)\n        self.h = nn.Conv2D(\n            dim,\n            dim,\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            groups=1,\n            bias_attr=True)\n\n        self.act = nn.GELU()\n        self.proj = nn.Linear(dim, dim)\n        self.proj_drop = nn.Dropout(proj_drop)\n        self.focal_layers = nn.LayerList()\n\n        if self.use_postln_in_modulation:\n            self.ln = nn.LayerNorm(dim)\n\n        for k in range(self.focal_level):\n            kernel_size = self.focal_factor * k + self.focal_window\n            self.focal_layers.append(\n                nn.Sequential(\n                    nn.Conv2D(\n                        dim,\n                        dim,\n                        kernel_size=kernel_size,\n                        stride=1,\n                        groups=dim,\n                        padding=kernel_size // 2,\n                        bias_attr=False),\n                    nn.GELU()))\n\n    def forward(self, x):\n        \"\"\" Forward function.\n        Args:\n            x: input features with shape of (B, H, W, C)\n        \"\"\"\n        _, _, _, C = x.shape\n        x = self.f(x)\n        x = x.transpose([0, 3, 1, 2])\n        q, ctx, gates = paddle.split(x, (C, C, self.focal_level + 1), 1)\n\n        ctx_all = 0\n        for l in range(self.focal_level):\n            ctx = self.focal_layers[l](ctx)\n            ctx_all = ctx_all + ctx * gates[:, l:l + 1]\n        ctx_global = self.act(ctx.mean(2, keepdim=True).mean(3, keepdim=True))\n        ctx_all = ctx_all + ctx_global * gates[:, self.focal_level:]\n        if self.normalize_modulator:\n            ctx_all = ctx_all / (self.focal_level + 1)\n\n        x_out = q * self.h(ctx_all)\n        x_out = x_out.transpose([0, 2, 3, 1])\n        if self.use_postln_in_modulation:\n            x_out = self.ln(x_out)\n        x_out = self.proj(x_out)\n        x_out = self.proj_drop(x_out)\n        return x_out\n\n\nclass FocalModulationBlock(nn.Layer):\n    \"\"\" Focal Modulation Block.\n    Args:\n        dim (int): Number of input channels.\n        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.\n        drop (float, optional): Dropout rate. Default: 0.0\n        drop_path (float, optional): Stochastic depth rate. 
Default: 0.0\n        act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU\n        norm_layer (nn.Layer, optional): Normalization layer.  Default: nn.LayerNorm\n        focal_level (int): number of focal levels\n        focal_window (int): focal kernel size at level 1\n        use_postln (bool): Whether use layernorm after modulation. Default: False.\n        use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.\n        normalize_modulator (bool): Whether use normalize in modulator\n        use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False \n        layerscale_value (float): Value for layer scale. Default: 1e-4 \n    \"\"\"\n\n    def __init__(self,\n                 dim,\n                 mlp_ratio=4.,\n                 drop=0.,\n                 drop_path=0.,\n                 act_layer=nn.GELU,\n                 norm_layer=nn.LayerNorm,\n                 focal_level=2,\n                 focal_window=9,\n                 use_postln=False,\n                 use_postln_in_modulation=False,\n                 normalize_modulator=False,\n                 use_layerscale=False,\n                 layerscale_value=1e-4):\n        super().__init__()\n        self.dim = dim\n        self.mlp_ratio = mlp_ratio\n        self.focal_window = focal_window\n        self.focal_level = focal_level\n        self.use_postln = use_postln\n        self.use_layerscale = use_layerscale\n\n        self.norm1 = norm_layer(dim)\n        self.modulation = FocalModulation(\n            dim,\n            proj_drop=drop,\n            focal_level=self.focal_level,\n            focal_window=self.focal_window,\n            use_postln_in_modulation=use_postln_in_modulation,\n            normalize_modulator=normalize_modulator)\n\n        self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity()\n        self.norm2 = norm_layer(dim)\n        mlp_hidden_dim = int(dim * mlp_ratio)\n        self.mlp = Mlp(in_features=dim,\n                       hidden_features=mlp_hidden_dim,\n                       act_layer=act_layer,\n                       drop=drop)\n        self.H = None\n        self.W = None\n\n        self.gamma_1 = 1.0\n        self.gamma_2 = 1.0\n        if self.use_layerscale:\n            self.gamma_1 = add_parameter(self,\n                                         layerscale_value * paddle.ones([dim]))\n            self.gamma_2 = add_parameter(self,\n                                         layerscale_value * paddle.ones([dim]))\n\n    def forward(self, x):\n        \"\"\"\n        Args:\n            x: Input feature, tensor size (B, H*W, C).\n        \"\"\"\n        B, L, C = x.shape\n        H, W = self.H, self.W\n        assert L == H * W, \"input feature has wrong size\"\n\n        shortcut = x\n        if not self.use_postln:\n            x = self.norm1(x)\n        x = x.reshape([-1, H, W, C])\n\n        # FM\n        x = self.modulation(x).reshape([-1, H * W, C])\n        if self.use_postln:\n            x = self.norm1(x)\n\n        # FFN\n        x = shortcut + self.drop_path(self.gamma_1 * x)\n\n        if self.use_postln:\n            x = x + self.drop_path(self.gamma_2 * self.norm2(self.mlp(x)))\n        else:\n            x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))\n        return x\n\n\nclass BasicLayer(nn.Layer):\n    \"\"\" A basic focal modulation layer for one stage.\n    Args:\n        dim (int): Number of feature channels\n        depth (int): Depths of this stage.\n        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.\n        drop (float, optional): Dropout rate. Default: 0.0\n        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0\n        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm\n        downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None\n        focal_level (int): Number of focal levels\n        focal_window (int): Focal window size at focal level 1\n        use_conv_embed (bool): Whether use overlapped convolution for patch embedding\n        use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False \n        layerscale_value (float): Value of layerscale\n        use_postln (bool): Whether use layernorm after modulation. Default: False.\n        use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.\n        normalize_modulator (bool): Whether use normalize in modulator\n        use_checkpoint (bool): Whether to use checkpointing to save memory. 
Default: False.\n    \"\"\"\n\n    def __init__(self,\n                 dim,\n                 depth,\n                 mlp_ratio=4.,\n                 drop=0.,\n                 drop_path=0.,\n                 norm_layer=nn.LayerNorm,\n                 downsample=None,\n                 focal_level=2,\n                 focal_window=9,\n                 use_conv_embed=False,\n                 use_layerscale=False,\n                 layerscale_value=1e-4,\n                 use_postln=False,\n                 use_postln_in_modulation=False,\n                 normalize_modulator=False,\n                 use_checkpoint=False):\n        super().__init__()\n        self.depth = depth\n        self.use_checkpoint = use_checkpoint\n\n        # build blocks\n        self.blocks = nn.LayerList([\n            FocalModulationBlock(\n                dim=dim,\n                mlp_ratio=mlp_ratio,\n                drop=drop,\n                drop_path=drop_path[i]\n                if isinstance(drop_path, np.ndarray) else drop_path,\n                act_layer=nn.GELU,\n                norm_layer=norm_layer,\n                focal_level=focal_level,\n                focal_window=focal_window,\n                use_postln=use_postln,\n                use_postln_in_modulation=use_postln_in_modulation,\n                normalize_modulator=normalize_modulator,\n                use_layerscale=use_layerscale,\n                layerscale_value=layerscale_value) for i in range(depth)\n        ])\n\n        # patch merging layer\n        if downsample is not None:\n            self.downsample = downsample(\n                patch_size=2,\n                in_chans=dim,\n                embed_dim=2 * dim,\n                use_conv_embed=use_conv_embed,\n                norm_layer=norm_layer,\n                is_stem=False)\n        else:\n            self.downsample = None\n\n    def forward(self, x, H, W):\n        \"\"\"\n        Args:\n            x: Input feature, tensor size (B, H*W, C).\n        \"\"\"\n        for blk in self.blocks:\n            blk.H, blk.W = H, W\n            x = blk(x)\n\n        if self.downsample is not None:\n            x_reshaped = x.transpose([0, 2, 1]).reshape(\n                [x.shape[0], x.shape[-1], H, W])\n            x_down = self.downsample(x_reshaped)\n            x_down = x_down.flatten(2).transpose([0, 2, 1])\n            Wh, Ww = (H + 1) // 2, (W + 1) // 2\n            return x, H, W, x_down, Wh, Ww\n        else:\n            return x, H, W, x, H, W\n\n\nclass PatchEmbed(nn.Layer):\n    \"\"\" Image to Patch Embedding\n    Args:\n        patch_size (int): Patch token size. Default: 4.\n        in_chans (int): Number of input image channels. Default: 3.\n        embed_dim (int): Number of linear projection output channels. Default: 96.\n        norm_layer (nn.Layer, optional): Normalization layer. Default: None\n        use_conv_embed (bool): Whether use overlapped convolution for patch embedding. Default: False\n        is_stem (bool): Is the stem block or not. 
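If use_conv_embed is True, the stem uses a 7x7 stride-4 conv while later stages use a 3x3 stride-2 conv.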
\n    \"\"\"\n\n    def __init__(self,\n                 patch_size=4,\n                 in_chans=3,\n                 embed_dim=96,\n                 norm_layer=None,\n                 use_conv_embed=False,\n                 is_stem=False):\n        super().__init__()\n        patch_size = to_2tuple(patch_size)\n        self.patch_size = patch_size\n\n        self.in_chans = in_chans\n        self.embed_dim = embed_dim\n\n        if use_conv_embed:\n            # if we choose to use conv embedding, then we treat the stem and non-stem differently\n            if is_stem:\n                kernel_size = 7\n                padding = 2\n                stride = 4\n            else:\n                kernel_size = 3\n                padding = 1\n                stride = 2\n            self.proj = nn.Conv2D(\n                in_chans,\n                embed_dim,\n                kernel_size=kernel_size,\n                stride=stride,\n                padding=padding)\n        else:\n            self.proj = nn.Conv2D(\n                in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)\n\n        if norm_layer is not None:\n            self.norm = norm_layer(embed_dim)\n        else:\n            self.norm = None\n\n    def forward(self, x):\n        _, _, H, W = x.shape\n\n        if W % self.patch_size[1] != 0:\n            # for 3D tensor: [pad_left, pad_right]\n            # for 4D tensor: [pad_left, pad_right, pad_top, pad_bottom]\n            x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0])\n            W += W % self.patch_size[1]\n        if H % self.patch_size[0] != 0:\n            x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]])\n            H += H % self.patch_size[0]\n\n        x = self.proj(x)\n        if self.norm is not None:\n            _, _, Wh, Ww = x.shape\n            x = x.flatten(2).transpose([0, 2, 1])\n            x = self.norm(x)\n            x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww])\n\n        return x\n\n\n@register\n@serializable\nclass FocalNet(nn.Layer):\n    \"\"\" FocalNet backbone\n    Args:\n        arch (str): Architecture of FocalNet\n        out_indices (Sequence[int]): Output from which stages.\n        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).\n            -1 means not freezing any parameters.\n        patch_size (int | tuple(int)): Patch size. Default: 4.\n        in_chans (int): Number of input image channels. Default: 3.\n        embed_dim (int): Number of linear projection output channels. Default: 96.\n        depths (tuple[int]): Depths of each FocalNet Transformer stage.\n        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.\n        drop_rate (float): Dropout rate.\n        drop_path_rate (float): Stochastic depth rate. Default: 0.2.\n        norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.\n        patch_norm (bool): If True, add normalization after patch embedding. Default: True.\n        focal_levels (Sequence[int]): Number of focal levels at four stages\n        focal_windows (Sequence[int]): Focal window sizes at first focal level at four stages\n        use_conv_embed (bool): Whether use overlapped convolution for patch embedding\n        use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False \n        layerscale_value (float): Value of layerscale\n        use_postln (bool): Whether use layernorm after modulation. 
Default: False.\n        use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.\n        normalize_modulator (bool): Whether use normalize in modulator\n        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.\n    \"\"\"\n\n    def __init__(\n            self,\n            arch='focalnet_T_224_1k_srf',\n            out_indices=(0, 1, 2, 3),\n            frozen_stages=-1,\n            patch_size=4,\n            in_chans=3,\n            embed_dim=96,\n            depths=[2, 2, 6, 2],\n            mlp_ratio=4.,\n            drop_rate=0.,\n            drop_path_rate=0.2,  # 0.5 better for large+ models\n            norm_layer=nn.LayerNorm,\n            patch_norm=True,\n            focal_levels=[2, 2, 2, 2],\n            focal_windows=[3, 3, 3, 3],\n            use_conv_embed=False,\n            use_layerscale=False,\n            layerscale_value=1e-4,\n            use_postln=False,\n            use_postln_in_modulation=False,\n            normalize_modulator=False,\n            use_checkpoint=False,\n            pretrained=None):\n        super(FocalNet, self).__init__()\n        assert arch in MODEL_cfg.keys(), \"Unsupported arch: {}\".format(arch)\n\n        embed_dim = MODEL_cfg[arch]['embed_dim']\n        depths = MODEL_cfg[arch]['depths']\n        drop_path_rate = MODEL_cfg[arch]['drop_path_rate']\n        focal_levels = MODEL_cfg[arch]['focal_levels']\n        focal_windows = MODEL_cfg[arch]['focal_windows']\n        use_conv_embed = MODEL_cfg[arch]['use_conv_embed']\n        use_layerscale = MODEL_cfg[arch]['use_layerscale']\n        use_postln = MODEL_cfg[arch]['use_postln']\n        use_postln_in_modulation = MODEL_cfg[arch]['use_postln_in_modulation']\n        normalize_modulator = MODEL_cfg[arch]['normalize_modulator']\n        if pretrained is None:\n            pretrained = MODEL_cfg[arch]['pretrained']\n\n        self.out_indices = out_indices\n        self.frozen_stages = frozen_stages\n        self.num_layers = len(depths)\n        self.patch_norm = patch_norm\n\n        # split image into non-overlapping patches\n        self.patch_embed = PatchEmbed(\n            patch_size=patch_size,\n            in_chans=in_chans,\n            embed_dim=embed_dim,\n            norm_layer=norm_layer if self.patch_norm else None,\n            use_conv_embed=use_conv_embed,\n            is_stem=True)\n\n        self.pos_drop = nn.Dropout(p=drop_rate)\n\n        # stochastic depth decay rule\n        dpr = np.linspace(0, drop_path_rate, sum(depths))\n\n        # build layers\n        self.layers = nn.LayerList()\n        for i_layer in range(self.num_layers):\n            layer = BasicLayer(\n                dim=int(embed_dim * 2**i_layer),\n                depth=depths[i_layer],\n                mlp_ratio=mlp_ratio,\n                drop=drop_rate,\n                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],\n                norm_layer=norm_layer,\n                downsample=PatchEmbed\n                if (i_layer < self.num_layers - 1) else None,\n                focal_level=focal_levels[i_layer],\n                focal_window=focal_windows[i_layer],\n                use_conv_embed=use_conv_embed,\n                use_layerscale=use_layerscale,\n                layerscale_value=layerscale_value,\n                use_postln=use_postln,\n                use_postln_in_modulation=use_postln_in_modulation,\n                normalize_modulator=normalize_modulator,\n                
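# note: use_checkpoint is stored by BasicLayer but gradient checkpointing is not applied in its forward\n                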
use_checkpoint=use_checkpoint)\n            self.layers.append(layer)\n\n        num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]\n        self.num_features = num_features\n\n        # add a norm layer for each output\n        for i_layer in out_indices:\n            layer = norm_layer(num_features[i_layer])\n            layer_name = f'norm{i_layer}'\n            self.add_sublayer(layer_name, layer)\n\n        self.apply(self._init_weights)\n        self._freeze_stages()\n        if pretrained:\n            if 'http' in pretrained:  #URL\n                path = paddle.utils.download.get_weights_path_from_url(\n                    pretrained)\n            else:  #model in local path\n                path = pretrained\n            self.set_state_dict(paddle.load(path))\n\n    def _freeze_stages(self):\n        if self.frozen_stages >= 0:\n            self.patch_embed.eval()\n            for param in self.patch_embed.parameters():\n                param.stop_gradient = True\n\n        if self.frozen_stages >= 2:\n            self.pos_drop.eval()\n            for i in range(0, self.frozen_stages - 1):\n                m = self.layers[i]\n                m.eval()\n                for param in m.parameters():\n                    param.stop_gradient = True\n\n    def _init_weights(self, m):\n        if isinstance(m, nn.Linear):\n            trunc_normal_(m.weight)\n            if isinstance(m, nn.Linear) and m.bias is not None:\n                zeros_(m.bias)\n        elif isinstance(m, nn.LayerNorm):\n            zeros_(m.bias)\n            ones_(m.weight)\n\n    def forward(self, x):\n        x = self.patch_embed(x['image'])\n        B, _, Wh, Ww = x.shape\n        x = x.flatten(2).transpose([0, 2, 1])\n        x = self.pos_drop(x)\n        outs = []\n        for i in range(self.num_layers):\n            layer = self.layers[i]\n            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)\n            if i in self.out_indices:\n                norm_layer = getattr(self, f'norm{i}')\n                x_out = norm_layer(x_out)\n                out = x_out.reshape([-1, H, W, self.num_features[i]]).transpose(\n                    (0, 3, 1, 2))\n                outs.append(out)\n\n        return outs\n\n    @property\n    def out_shape(self):\n        out_strides = [4, 8, 16, 32]\n        return [\n            ShapeSpec(\n                channels=self.num_features[i], stride=out_strides[i])\n            for i in self.out_indices\n        ]\n"
  },
  {
    "path": "ppdet/modeling/backbones/ghostnet.py",
    "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#    http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport math\nimport paddle\nfrom paddle import ParamAttr\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn import AdaptiveAvgPool2D, Linear\nfrom paddle.nn.initializer import Uniform\n\nfrom ppdet.core.workspace import register, serializable\nfrom numbers import Integral\nfrom ..shape_spec import ShapeSpec\nfrom .mobilenet_v3 import make_divisible, ConvBNLayer\n\n__all__ = ['GhostNet']\n\n\nclass ExtraBlockDW(nn.Layer):\n    def __init__(self,\n                 in_c,\n                 ch_1,\n                 ch_2,\n                 stride,\n                 lr_mult,\n                 conv_decay=0.,\n                 norm_type='bn',\n                 norm_decay=0.,\n                 freeze_norm=False,\n                 name=None):\n        super(ExtraBlockDW, self).__init__()\n        self.pointwise_conv = ConvBNLayer(\n            in_c=in_c,\n            out_c=ch_1,\n            filter_size=1,\n            stride=1,\n            padding=0,\n            act='relu6',\n            lr_mult=lr_mult,\n            conv_decay=conv_decay,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=name + \"_extra1\")\n        self.depthwise_conv = ConvBNLayer(\n            in_c=ch_1,\n            out_c=ch_2,\n            filter_size=3,\n            stride=stride,\n            padding=1,  #\n            num_groups=int(ch_1),\n            act='relu6',\n            lr_mult=lr_mult,\n            conv_decay=conv_decay,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=name + \"_extra2_dw\")\n        self.normal_conv = ConvBNLayer(\n            in_c=ch_2,\n            out_c=ch_2,\n            filter_size=1,\n            stride=1,\n            padding=0,\n            act='relu6',\n            lr_mult=lr_mult,\n            conv_decay=conv_decay,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=name + \"_extra2_sep\")\n\n    def forward(self, inputs):\n        x = self.pointwise_conv(inputs)\n        x = self.depthwise_conv(x)\n        x = self.normal_conv(x)\n        return x\n\n\nclass SEBlock(nn.Layer):\n    def __init__(self, num_channels, lr_mult, reduction_ratio=4, name=None):\n        super(SEBlock, self).__init__()\n        self.pool2d_gap = AdaptiveAvgPool2D(1)\n        self._num_channels = num_channels\n        stdv = 1.0 / math.sqrt(num_channels * 1.0)\n        med_ch = num_channels // reduction_ratio\n        self.squeeze = Linear(\n            num_channels,\n            med_ch,\n            weight_attr=ParamAttr(\n                learning_rate=lr_mult, initializer=Uniform(-stdv, stdv)),\n            bias_attr=ParamAttr(learning_rate=lr_mult))\n        stdv = 1.0 / math.sqrt(med_ch * 1.0)\n        self.excitation = Linear(\n   
         med_ch,\n            num_channels,\n            weight_attr=ParamAttr(\n                learning_rate=lr_mult, initializer=Uniform(-stdv, stdv)),\n            bias_attr=ParamAttr(learning_rate=lr_mult))\n\n    def forward(self, inputs):\n        pool = self.pool2d_gap(inputs)\n        pool = paddle.squeeze(pool, axis=[2, 3])\n        squeeze = self.squeeze(pool)\n        squeeze = F.relu(squeeze)\n        excitation = self.excitation(squeeze)\n        excitation = paddle.clip(x=excitation, min=0, max=1)\n        excitation = paddle.unsqueeze(excitation, axis=[2, 3])\n        out = paddle.multiply(inputs, excitation)\n        return out\n\n\nclass GhostModule(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 output_channels,\n                 kernel_size=1,\n                 ratio=2,\n                 dw_size=3,\n                 stride=1,\n                 relu=True,\n                 lr_mult=1.,\n                 conv_decay=0.,\n                 norm_type='bn',\n                 norm_decay=0.,\n                 freeze_norm=False,\n                 name=None):\n        super(GhostModule, self).__init__()\n        init_channels = int(math.ceil(output_channels / ratio))\n        new_channels = int(init_channels * (ratio - 1))\n        self.primary_conv = ConvBNLayer(\n            in_c=in_channels,\n            out_c=init_channels,\n            filter_size=kernel_size,\n            stride=stride,\n            padding=int((kernel_size - 1) // 2),\n            num_groups=1,\n            act=\"relu\" if relu else None,\n            lr_mult=lr_mult,\n            conv_decay=conv_decay,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=name + \"_primary_conv\")\n        self.cheap_operation = ConvBNLayer(\n            in_c=init_channels,\n            out_c=new_channels,\n            filter_size=dw_size,\n            stride=1,\n            padding=int((dw_size - 1) // 2),\n            num_groups=init_channels,\n            act=\"relu\" if relu else None,\n            lr_mult=lr_mult,\n            conv_decay=conv_decay,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=name + \"_cheap_operation\")\n\n    def forward(self, inputs):\n        x = self.primary_conv(inputs)\n        y = self.cheap_operation(x)\n        out = paddle.concat([x, y], axis=1)\n        return out\n\n\nclass GhostBottleneck(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 hidden_dim,\n                 output_channels,\n                 kernel_size,\n                 stride,\n                 use_se,\n                 lr_mult,\n                 conv_decay=0.,\n                 norm_type='bn',\n                 norm_decay=0.,\n                 freeze_norm=False,\n                 return_list=False,\n                 name=None):\n        super(GhostBottleneck, self).__init__()\n        self._stride = stride\n        self._use_se = use_se\n        self._num_channels = in_channels\n        self._output_channels = output_channels\n        self.return_list = return_list\n\n        self.ghost_module_1 = GhostModule(\n            in_channels=in_channels,\n            output_channels=hidden_dim,\n            kernel_size=1,\n            stride=1,\n            relu=True,\n            lr_mult=lr_mult,\n            conv_decay=conv_decay,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n          
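  # GhostModule (ratio=2): half the output comes from a cheap 3x3 depthwise conv over the primary 1x1 features\n          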
  freeze_norm=freeze_norm,\n            name=name + \"_ghost_module_1\")\n        if stride == 2:\n            self.depthwise_conv = ConvBNLayer(\n                in_c=hidden_dim,\n                out_c=hidden_dim,\n                filter_size=kernel_size,\n                stride=stride,\n                padding=int((kernel_size - 1) // 2),\n                num_groups=hidden_dim,\n                act=None,\n                lr_mult=lr_mult,\n                conv_decay=conv_decay,\n                norm_type=norm_type,\n                norm_decay=norm_decay,\n                freeze_norm=freeze_norm,\n                name=name +\n                \"_depthwise_depthwise\"  # looks strange due to an old typo, will be fixed later.\n            )\n        if use_se:\n            self.se_block = SEBlock(hidden_dim, lr_mult, name=name + \"_se\")\n        self.ghost_module_2 = GhostModule(\n            in_channels=hidden_dim,\n            output_channels=output_channels,\n            kernel_size=1,\n            relu=False,\n            lr_mult=lr_mult,\n            conv_decay=conv_decay,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=name + \"_ghost_module_2\")\n        if stride != 1 or in_channels != output_channels:\n            self.shortcut_depthwise = ConvBNLayer(\n                in_c=in_channels,\n                out_c=in_channels,\n                filter_size=kernel_size,\n                stride=stride,\n                padding=int((kernel_size - 1) // 2),\n                num_groups=in_channels,\n                act=None,\n                lr_mult=lr_mult,\n                conv_decay=conv_decay,\n                norm_type=norm_type,\n                norm_decay=norm_decay,\n                freeze_norm=freeze_norm,\n                name=name +\n                \"_shortcut_depthwise_depthwise\"  # looks strange due to an old typo, will be fixed later.\n            )\n            self.shortcut_conv = ConvBNLayer(\n                in_c=in_channels,\n                out_c=output_channels,\n                filter_size=1,\n                stride=1,\n                padding=0,\n                num_groups=1,\n                act=None,\n                lr_mult=lr_mult,\n                conv_decay=conv_decay,\n                norm_type=norm_type,\n                norm_decay=norm_decay,\n                freeze_norm=freeze_norm,\n                name=name + \"_shortcut_conv\")\n\n    def forward(self, inputs):\n        y = self.ghost_module_1(inputs)\n        x = y\n        if self._stride == 2:\n            x = self.depthwise_conv(x)\n        if self._use_se:\n            x = self.se_block(x)\n        x = self.ghost_module_2(x)\n\n        if self._stride == 1 and self._num_channels == self._output_channels:\n            shortcut = inputs\n        else:\n            shortcut = self.shortcut_depthwise(inputs)\n            shortcut = self.shortcut_conv(shortcut)\n        x = paddle.add(x=x, y=shortcut)\n\n        if self.return_list:\n            return [y, x]\n        else:\n            return x\n\n\n@register\n@serializable\nclass GhostNet(nn.Layer):\n    __shared__ = ['norm_type']\n\n    def __init__(\n            self,\n            scale=1.3,\n            feature_maps=[6, 12, 15],\n            with_extra_blocks=False,\n            extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]],\n            lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],\n            conv_decay=0.,\n            norm_type='bn',\n            
norm_decay=0.0,\n            freeze_norm=False):\n        super(GhostNet, self).__init__()\n        if isinstance(feature_maps, Integral):\n            feature_maps = [feature_maps]\n        if norm_type == 'sync_bn' and freeze_norm:\n            raise ValueError(\n                \"The norm_type should not be sync_bn when freeze_norm is True\")\n        self.feature_maps = feature_maps\n        self.with_extra_blocks = with_extra_blocks\n        self.extra_block_filters = extra_block_filters\n\n        inplanes = 16\n        self.cfgs = [\n            # k, t, c, SE, s\n            [3, 16, 16, 0, 1],\n            [3, 48, 24, 0, 2],\n            [3, 72, 24, 0, 1],\n            [5, 72, 40, 1, 2],\n            [5, 120, 40, 1, 1],\n            [3, 240, 80, 0, 2],\n            [3, 200, 80, 0, 1],\n            [3, 184, 80, 0, 1],\n            [3, 184, 80, 0, 1],\n            [3, 480, 112, 1, 1],\n            [3, 672, 112, 1, 1],\n            [5, 672, 160, 1, 2],  # SSDLite output\n            [5, 960, 160, 0, 1],\n            [5, 960, 160, 1, 1],\n            [5, 960, 160, 0, 1],\n            [5, 960, 160, 1, 1]\n        ]\n        self.scale = scale\n        conv1_out_ch = int(make_divisible(inplanes * self.scale, 4))\n        self.conv1 = ConvBNLayer(\n            in_c=3,\n            out_c=conv1_out_ch,\n            filter_size=3,\n            stride=2,\n            padding=1,\n            num_groups=1,\n            act=\"relu\",\n            lr_mult=1.,\n            conv_decay=conv_decay,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=\"conv1\")\n\n        # build inverted residual blocks\n        self._out_channels = []\n        self.ghost_bottleneck_list = []\n        idx = 0\n        inplanes = conv1_out_ch\n        for k, exp_size, c, use_se, s in self.cfgs:\n            lr_idx = min(idx // 3, len(lr_mult_list) - 1)\n            lr_mult = lr_mult_list[lr_idx]\n\n            # for SSD/SSDLite, first head input is after ResidualUnit expand_conv\n            return_list = self.with_extra_blocks and idx + 2 in self.feature_maps\n\n            ghost_bottleneck = self.add_sublayer(\n                \"_ghostbottleneck_\" + str(idx),\n                sublayer=GhostBottleneck(\n                    in_channels=inplanes,\n                    hidden_dim=int(make_divisible(exp_size * self.scale, 4)),\n                    output_channels=int(make_divisible(c * self.scale, 4)),\n                    kernel_size=k,\n                    stride=s,\n                    use_se=use_se,\n                    lr_mult=lr_mult,\n                    conv_decay=conv_decay,\n                    norm_type=norm_type,\n                    norm_decay=norm_decay,\n                    freeze_norm=freeze_norm,\n                    return_list=return_list,\n                    name=\"_ghostbottleneck_\" + str(idx)))\n            self.ghost_bottleneck_list.append(ghost_bottleneck)\n            inplanes = int(make_divisible(c * self.scale, 4))\n            idx += 1\n            self._update_out_channels(\n                int(make_divisible(exp_size * self.scale, 4))\n                if return_list else inplanes, idx + 1, feature_maps)\n\n        if self.with_extra_blocks:\n            self.extra_block_list = []\n            extra_out_c = int(make_divisible(self.scale * self.cfgs[-1][1], 4))\n            lr_idx = min(idx // 3, len(lr_mult_list) - 1)\n            lr_mult = lr_mult_list[lr_idx]\n\n            conv_extra = self.add_sublayer(\n           
     \"conv\" + str(idx + 2),\n                sublayer=ConvBNLayer(\n                    in_c=inplanes,\n                    out_c=extra_out_c,\n                    filter_size=1,\n                    stride=1,\n                    padding=0,\n                    num_groups=1,\n                    act=\"relu6\",\n                    lr_mult=lr_mult,\n                    conv_decay=conv_decay,\n                    norm_type=norm_type,\n                    norm_decay=norm_decay,\n                    freeze_norm=freeze_norm,\n                    name=\"conv\" + str(idx + 2)))\n            self.extra_block_list.append(conv_extra)\n            idx += 1\n            self._update_out_channels(extra_out_c, idx + 1, feature_maps)\n\n            for j, block_filter in enumerate(self.extra_block_filters):\n                in_c = extra_out_c if j == 0 else self.extra_block_filters[j -\n                                                                           1][1]\n                conv_extra = self.add_sublayer(\n                    \"conv\" + str(idx + 2),\n                    sublayer=ExtraBlockDW(\n                        in_c,\n                        block_filter[0],\n                        block_filter[1],\n                        stride=2,\n                        lr_mult=lr_mult,\n                        conv_decay=conv_decay,\n                        norm_type=norm_type,\n                        norm_decay=norm_decay,\n                        freeze_norm=freeze_norm,\n                        name='conv' + str(idx + 2)))\n                self.extra_block_list.append(conv_extra)\n                idx += 1\n                self._update_out_channels(block_filter[1], idx + 1,\n                                          feature_maps)\n\n    def _update_out_channels(self, channel, feature_idx, feature_maps):\n        if feature_idx in feature_maps:\n            self._out_channels.append(channel)\n\n    def forward(self, inputs):\n        x = self.conv1(inputs['image'])\n        outs = []\n        for idx, ghost_bottleneck in enumerate(self.ghost_bottleneck_list):\n            x = ghost_bottleneck(x)\n            if idx + 2 in self.feature_maps:\n                if isinstance(x, list):\n                    outs.append(x[0])\n                    x = x[1]\n                else:\n                    outs.append(x)\n\n        if not self.with_extra_blocks:\n            return outs\n\n        for i, block in enumerate(self.extra_block_list):\n            idx = i + len(self.ghost_bottleneck_list)\n            x = block(x)\n            if idx + 2 in self.feature_maps:\n                outs.append(x)\n        return outs\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=c) for c in self._out_channels]\n"
  },
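  {
    "path": "examples/ghostnet_smoke_test.py",
    "content": "# Illustrative smoke test for the GhostNet backbone above. This file and its\n# path are additions for demonstration, not part of the upstream source tree;\n# it assumes paddle and ppdet are importable. It only checks that out_shape\n# agrees with the tensors a forward pass actually produces.\nimport paddle\n\nfrom ppdet.modeling.backbones.ghostnet import GhostNet\n\nif __name__ == '__main__':\n    # Defaults: scale=1.3 and feature_maps=[6, 12, 15]; forward() compares\n    # idx + 2 against feature_maps, so features come from blocks 4, 10 and 13.\n    model = GhostNet()\n    model.eval()\n    # ppdet backbones consume a dict holding an NCHW 'image' tensor.\n    feats = model({'image': paddle.randn([1, 3, 320, 320])})\n    for spec, feat in zip(model.out_shape, feats):\n        assert spec.channels == feat.shape[1]\n        print(feat.shape)\n"
  },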
  {
    "path": "ppdet/modeling/backbones/hardnet.py",
    "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#    http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nfrom ppdet.core.workspace import register\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['HarDNet']\n\n\ndef ConvLayer(in_channels,\n              out_channels,\n              kernel_size=3,\n              stride=1,\n              bias_attr=False):\n    layer = nn.Sequential(\n        ('conv', nn.Conv2D(\n            in_channels,\n            out_channels,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=kernel_size // 2,\n            groups=1,\n            bias_attr=bias_attr)), ('norm', nn.BatchNorm2D(out_channels)),\n        ('relu', nn.ReLU6()))\n    return layer\n\n\ndef DWConvLayer(in_channels,\n                out_channels,\n                kernel_size=3,\n                stride=1,\n                bias_attr=False):\n    layer = nn.Sequential(\n        ('dwconv', nn.Conv2D(\n            in_channels,\n            out_channels,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=1,\n            groups=out_channels,\n            bias_attr=bias_attr)), ('norm', nn.BatchNorm2D(out_channels)))\n    return layer\n\n\ndef CombConvLayer(in_channels, out_channels, kernel_size=1, stride=1):\n    layer = nn.Sequential(\n        ('layer1', ConvLayer(\n            in_channels, out_channels, kernel_size=kernel_size)),\n        ('layer2', DWConvLayer(\n            out_channels, out_channels, stride=stride)))\n    return layer\n\n\nclass HarDBlock(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 growth_rate,\n                 grmul,\n                 n_layers,\n                 keepBase=False,\n                 residual_out=False,\n                 dwconv=False):\n        super().__init__()\n        self.keepBase = keepBase\n        self.links = []\n        layers_ = []\n        self.out_channels = 0\n        for i in range(n_layers):\n            outch, inch, link = self.get_link(i + 1, in_channels, growth_rate,\n                                              grmul)\n            self.links.append(link)\n            if dwconv:\n                layers_.append(CombConvLayer(inch, outch))\n            else:\n                layers_.append(ConvLayer(inch, outch))\n\n            if (i % 2 == 0) or (i == n_layers - 1):\n                self.out_channels += outch\n        self.layers = nn.LayerList(layers_)\n\n    def get_out_ch(self):\n        return self.out_channels\n\n    def get_link(self, layer, base_ch, growth_rate, grmul):\n        if layer == 0:\n            return base_ch, 0, []\n        out_channels = growth_rate\n\n        link = []\n        for i in range(10):\n            dv = 2**i\n            if layer % dv == 0:\n                k = layer - dv\n                link.append(k)\n                if i > 0:\n                    out_channels *= grmul\n\n        out_channels = int(int(out_channels + 1) / 2) * 2\n        
in_channels = 0\n\n        for i in link:\n            ch, _, _ = self.get_link(i, base_ch, growth_rate, grmul)\n            in_channels += ch\n\n        return out_channels, in_channels, link\n\n    def forward(self, x):\n        layers_ = [x]\n\n        for layer in range(len(self.layers)):\n            link = self.links[layer]\n            tin = []\n            for i in link:\n                tin.append(layers_[i])\n            if len(tin) > 1:\n                x = paddle.concat(tin, 1)\n            else:\n                x = tin[0]\n            out = self.layers[layer](x)\n            layers_.append(out)\n\n        t = len(layers_)\n        out_ = []\n        for i in range(t):\n            if (i == 0 and self.keepBase) or (i == t - 1) or (i % 2 == 1):\n                out_.append(layers_[i])\n        out = paddle.concat(out_, 1)\n\n        return out\n\n\n@register\nclass HarDNet(nn.Layer):\n    def __init__(self, depth_wise=False, return_idx=[1, 3, 8, 13], arch=85):\n        super(HarDNet, self).__init__()\n        assert arch in [68, 85], \"HarDNet-{} is not supported.\".format(arch)\n        if arch == 85:\n            first_ch = [48, 96]\n            second_kernel = 3\n            ch_list = [192, 256, 320, 480, 720]\n            grmul = 1.7\n            gr = [24, 24, 28, 36, 48]\n            n_layers = [8, 16, 16, 16, 16]\n        elif arch == 68:\n            first_ch = [32, 64]\n            second_kernel = 3\n            ch_list = [128, 256, 320, 640]\n            grmul = 1.7\n            gr = [14, 16, 20, 40]\n            n_layers = [8, 16, 16, 16]\n        else:\n            raise ValueError(\"HarDNet-{} is not supported.\".format(arch))\n\n        self.return_idx = return_idx\n        self._out_channels = [96, 214, 458, 784]\n\n        avg_pool = True\n        if depth_wise:\n            second_kernel = 1\n            avg_pool = False\n\n        blks = len(n_layers)\n        self.base = nn.LayerList([])\n\n        # First Layer: Standard Conv3x3, Stride=2\n        self.base.append(\n            ConvLayer(\n                in_channels=3,\n                out_channels=first_ch[0],\n                kernel_size=3,\n                stride=2,\n                bias_attr=False))\n\n        # Second Layer\n        self.base.append(\n            ConvLayer(\n                first_ch[0], first_ch[1], kernel_size=second_kernel))\n\n        # Avgpooling or DWConv3x3 downsampling\n        if avg_pool:\n            self.base.append(nn.AvgPool2D(kernel_size=3, stride=2, padding=1))\n        else:\n            self.base.append(DWConvLayer(first_ch[1], first_ch[1], stride=2))\n\n        # Build all HarDNet blocks\n        ch = first_ch[1]\n        for i in range(blks):\n            blk = HarDBlock(ch, gr[i], grmul, n_layers[i], dwconv=depth_wise)\n            ch = blk.out_channels\n            self.base.append(blk)\n\n            if i != blks - 1:\n                self.base.append(ConvLayer(ch, ch_list[i], kernel_size=1))\n            ch = ch_list[i]\n            if i == 0:\n                self.base.append(\n                    nn.AvgPool2D(\n                        kernel_size=2, stride=2, ceil_mode=True))\n            elif i != blks - 1 and i != 1 and i != 3:\n                self.base.append(nn.AvgPool2D(kernel_size=2, stride=2))\n\n    def forward(self, inputs):\n        x = inputs['image']\n        outs = []\n        for i, layer in enumerate(self.base):\n            x = layer(x)\n            if i in self.return_idx:\n                outs.append(x)\n        return outs\n\n    @property\n  
  def out_shape(self):\n        return [ShapeSpec(channels=self._out_channels[i]) for i in range(4)]\n"
  },
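  {
    "path": "examples/hardnet_links_demo.py",
    "content": "# Standalone sketch of the harmonic connectivity rule used by\n# HarDBlock.get_link above: layer L takes input from layer L - 2**i for every\n# power of two 2**i that divides L, and its output width is growth_rate\n# multiplied by grmul once per link beyond the nearest one. Pure Python, no\n# paddle needed; illustrative only, not part of the upstream source tree.\n\n\ndef get_link(layer, base_ch, growth_rate, grmul):\n    # Layer 0 is the block input itself.\n    if layer == 0:\n        return base_ch, 0, []\n    out_channels = growth_rate\n    link = []\n    for i in range(10):\n        dv = 2**i\n        if layer % dv == 0:\n            link.append(layer - dv)\n            if i > 0:\n                out_channels *= grmul\n    # Round to an even channel count, exactly as HarDBlock does.\n    out_channels = int(int(out_channels + 1) / 2) * 2\n    # A layer's input width is the sum of its linked layers' output widths.\n    in_channels = sum(\n        get_link(k, base_ch, growth_rate, grmul)[0] for k in link)\n    return out_channels, in_channels, link\n\n\nif __name__ == '__main__':\n    # First HarDBlock of HarDNet-68: base_ch=64, growth_rate=14, grmul=1.7.\n    for layer in range(1, 9):\n        out_ch, in_ch, link = get_link(layer, 64, 14, 1.7)\n        print(f'layer {layer}: links={link} in={in_ch} out={out_ch}')\n"
  },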
  {
    "path": "ppdet/modeling/backbones/hgnet_v2.py",
    "content": "# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#    http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import KaimingNormal, Constant\nfrom paddle.nn import Conv2D, BatchNorm2D, ReLU, AdaptiveAvgPool2D, MaxPool2D\nfrom paddle.regularizer import L2Decay\nfrom paddle import ParamAttr\n\nimport copy\n\nfrom ppdet.core.workspace import register, serializable\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['PPHGNetV2']\n\nkaiming_normal_ = KaimingNormal()\nzeros_ = Constant(value=0.)\nones_ = Constant(value=1.)\n\n\nclass LearnableAffineBlock(nn.Layer):\n    def __init__(self,\n                 scale_value=1.0,\n                 bias_value=0.0,\n                 lr_mult=1.0,\n                 lab_lr=0.01):\n        super().__init__()\n        self.scale = self.create_parameter(\n            shape=[1, ],\n            default_initializer=Constant(value=scale_value),\n            attr=ParamAttr(learning_rate=lr_mult * lab_lr))\n        self.add_parameter(\"scale\", self.scale)\n        self.bias = self.create_parameter(\n            shape=[1, ],\n            default_initializer=Constant(value=bias_value),\n            attr=ParamAttr(learning_rate=lr_mult * lab_lr))\n        self.add_parameter(\"bias\", self.bias)\n\n    def forward(self, x):\n        return self.scale * x + self.bias\n\n\nclass ConvBNAct(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size=3,\n                 stride=1,\n                 padding=1,\n                 groups=1,\n                 use_act=True,\n                 use_lab=False,\n                 lr_mult=1.0):\n        super().__init__()\n        self.use_act = use_act\n        self.use_lab = use_lab\n        self.conv = Conv2D(\n            in_channels,\n            out_channels,\n            kernel_size,\n            stride,\n            padding=padding\n            if isinstance(padding, str) else (kernel_size - 1) // 2,\n            groups=groups,\n            weight_attr=ParamAttr(learning_rate=lr_mult),\n            bias_attr=False)\n        self.bn = BatchNorm2D(\n            out_channels,\n            weight_attr=ParamAttr(\n                regularizer=L2Decay(0.0), learning_rate=lr_mult),\n            bias_attr=ParamAttr(\n                regularizer=L2Decay(0.0), learning_rate=lr_mult))\n        if self.use_act:\n            self.act = ReLU()\n            if self.use_lab:\n                self.lab = LearnableAffineBlock(lr_mult=lr_mult)\n\n    def forward(self, x):\n        x = self.conv(x)\n        x = self.bn(x)\n        if self.use_act:\n            x = self.act(x)\n            if self.use_lab:\n                x = self.lab(x)\n        return x\n\n\nclass LightConvBNAct(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size,\n                 stride,\n                 groups=1,\n    
             use_lab=False,\n                 lr_mult=1.0):\n        super().__init__()\n        self.conv1 = ConvBNAct(\n            in_channels=in_channels,\n            out_channels=out_channels,\n            kernel_size=1,\n            use_act=False,\n            use_lab=use_lab,\n            lr_mult=lr_mult)\n        self.conv2 = ConvBNAct(\n            in_channels=out_channels,\n            out_channels=out_channels,\n            kernel_size=kernel_size,\n            groups=out_channels,\n            use_act=True,\n            use_lab=use_lab,\n            lr_mult=lr_mult)\n\n    def forward(self, x):\n        x = self.conv1(x)\n        x = self.conv2(x)\n        return x\n\n\nclass StemBlock(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 mid_channels,\n                 out_channels,\n                 use_lab=False,\n                 lr_mult=1.0):\n        super().__init__()\n        self.stem1 = ConvBNAct(\n            in_channels=in_channels,\n            out_channels=mid_channels,\n            kernel_size=3,\n            stride=2,\n            use_lab=use_lab,\n            lr_mult=lr_mult)\n        self.stem2a = ConvBNAct(\n            in_channels=mid_channels,\n            out_channels=mid_channels // 2,\n            kernel_size=2,\n            stride=1,\n            padding=\"SAME\",\n            use_lab=use_lab,\n            lr_mult=lr_mult)\n        self.stem2b = ConvBNAct(\n            in_channels=mid_channels // 2,\n            out_channels=mid_channels,\n            kernel_size=2,\n            stride=1,\n            padding=\"SAME\",\n            use_lab=use_lab,\n            lr_mult=lr_mult)\n        self.stem3 = ConvBNAct(\n            in_channels=mid_channels * 2,\n            out_channels=mid_channels,\n            kernel_size=3,\n            stride=2,\n            use_lab=use_lab,\n            lr_mult=lr_mult)\n        self.stem4 = ConvBNAct(\n            in_channels=mid_channels,\n            out_channels=out_channels,\n            kernel_size=1,\n            stride=1,\n            use_lab=use_lab,\n            lr_mult=lr_mult)\n        self.pool = nn.MaxPool2D(\n            kernel_size=2, stride=1, ceil_mode=True, padding=\"SAME\")\n\n    def forward(self, x):\n        x = self.stem1(x)\n        x2 = self.stem2a(x)\n        x2 = self.stem2b(x2)\n        x1 = self.pool(x)\n        x = paddle.concat([x1, x2], 1)\n        x = self.stem3(x)\n        x = self.stem4(x)\n\n        return x\n\n\nclass HG_Block(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 mid_channels,\n                 out_channels,\n                 kernel_size=3,\n                 layer_num=6,\n                 identity=False,\n                 light_block=True,\n                 use_lab=False,\n                 lr_mult=1.0):\n        super().__init__()\n        self.identity = identity\n\n        self.layers = nn.LayerList()\n        block_type = \"LightConvBNAct\" if light_block else \"ConvBNAct\"\n        for i in range(layer_num):\n            self.layers.append(\n                eval(block_type)(in_channels=in_channels\n                                 if i == 0 else mid_channels,\n                                 out_channels=mid_channels,\n                                 stride=1,\n                                 kernel_size=kernel_size,\n                                 use_lab=use_lab,\n                                 lr_mult=lr_mult))\n        # feature aggregation\n        total_channels = in_channels + layer_num * 
mid_channels\n        self.aggregation_squeeze_conv = ConvBNAct(\n            in_channels=total_channels,\n            out_channels=out_channels // 2,\n            kernel_size=1,\n            stride=1,\n            use_lab=use_lab,\n            lr_mult=lr_mult)\n        self.aggregation_excitation_conv = ConvBNAct(\n            in_channels=out_channels // 2,\n            out_channels=out_channels,\n            kernel_size=1,\n            stride=1,\n            use_lab=use_lab,\n            lr_mult=lr_mult)\n\n    def forward(self, x):\n        identity = x\n        output = []\n        output.append(x)\n        for layer in self.layers:\n            x = layer(x)\n            output.append(x)\n        x = paddle.concat(output, axis=1)\n        x = self.aggregation_squeeze_conv(x)\n        x = self.aggregation_excitation_conv(x)\n        if self.identity:\n            x += identity\n        return x\n\n\nclass HG_Stage(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 mid_channels,\n                 out_channels,\n                 block_num,\n                 layer_num=6,\n                 downsample=True,\n                 light_block=True,\n                 kernel_size=3,\n                 use_lab=False,\n                 lr_mult=1.0):\n        super().__init__()\n        self.downsample = downsample\n        if downsample:\n            self.downsample = ConvBNAct(\n                in_channels=in_channels,\n                out_channels=in_channels,\n                kernel_size=3,\n                stride=2,\n                groups=in_channels,\n                use_act=False,\n                use_lab=use_lab,\n                lr_mult=lr_mult)\n\n        blocks_list = []\n        for i in range(block_num):\n            blocks_list.append(\n                HG_Block(\n                    in_channels=in_channels if i == 0 else out_channels,\n                    mid_channels=mid_channels,\n                    out_channels=out_channels,\n                    kernel_size=kernel_size,\n                    layer_num=layer_num,\n                    identity=False if i == 0 else True,\n                    light_block=light_block,\n                    use_lab=use_lab,\n                    lr_mult=lr_mult))\n        self.blocks = nn.Sequential(*blocks_list)\n\n    def forward(self, x):\n        if self.downsample:\n            x = self.downsample(x)\n        x = self.blocks(x)\n        return x\n\n\ndef _freeze_norm(m: nn.BatchNorm2D):\n    param_attr = ParamAttr(\n        learning_rate=0., regularizer=L2Decay(0.), trainable=False)\n    bias_attr = ParamAttr(\n        learning_rate=0., regularizer=L2Decay(0.), trainable=False)\n    global_stats = True\n    norm = nn.BatchNorm2D(\n        m._num_features,\n        weight_attr=param_attr,\n        bias_attr=bias_attr,\n        use_global_stats=global_stats)\n    for param in norm.parameters():\n        param.stop_gradient = True\n    return norm\n\n\ndef reset_bn(model: nn.Layer, reset_func=_freeze_norm):\n    if isinstance(model, nn.BatchNorm2D):\n        model = reset_func(model)\n    else:\n        for name, child in model.named_children():\n            _child = reset_bn(child, reset_func)\n            if _child is not child:\n                setattr(model, name, _child)\n    return model\n\n\n@register\n@serializable\nclass PPHGNetV2(nn.Layer):\n    \"\"\"\n    PPHGNetV2\n    Args:\n        stem_channels: list. Number of channels for the stem block.\n        stage_config: dict. The stage configuration of each PPHGNetV2 variant, 
such as the number of channels, stride, etc.\n        use_lab: boolean. Whether to use LearnableAffineBlock in network.\n        lr_mult_list: list. Control the learning rate of different stages.\n    Returns:\n        model: nn.Layer. Specific PPHGNetV2 model depends on args.\n    \"\"\"\n\n    arch_configs = {\n        'S': {\n            'stem_channels': [3, 24, 32],\n            'stage_config': {\n                # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num\n                \"stage1\": [32, 32, 64, 1, False, False, 3, 3],\n                \"stage2\": [64, 48, 256, 1, True, False, 3, 3],\n                \"stage3\": [256, 96, 512, 2, True, True, 5, 3],\n                \"stage4\": [512, 192, 1024, 1, True, True, 5, 3],\n            }\n        },\n        'M': {\n            'stem_channels': [3, 24, 32],\n            'stage_config': {\n                # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num\n                \"stage1\": [32, 32, 96, 1, False, False, 3, 4],\n                \"stage2\": [96, 64, 384, 1, True, False, 3, 4],\n                \"stage3\": [384, 128, 768, 3, True, True, 5, 4],\n                \"stage4\": [768, 256, 1536, 1, True, True, 5, 4],\n            }\n        },\n        'L': {\n            'stem_channels': [3, 32, 48],\n            'stage_config': {\n                # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num\n                \"stage1\": [48, 48, 128, 1, False, False, 3, 6],\n                \"stage2\": [128, 96, 512, 1, True, False, 3, 6],\n                \"stage3\": [512, 192, 1024, 3, True, True, 5, 6],\n                \"stage4\": [1024, 384, 2048, 1, True, True, 5, 6],\n            }\n        },\n        'X': {\n            'stem_channels': [3, 32, 64],\n            'stage_config': {\n                # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num\n                \"stage1\": [64, 64, 128, 1, False, False, 3, 6],\n                \"stage2\": [128, 128, 512, 2, True, False, 3, 6],\n                \"stage3\": [512, 256, 1024, 5, True, True, 5, 6],\n                \"stage4\": [1024, 512, 2048, 2, True, True, 5, 6],\n            }\n        },\n        'H': {\n            'stem_channels': [3, 48, 96],\n            'stage_config': {\n                # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num\n                \"stage1\": [96, 96, 192, 2, False, False, 3, 6],\n                \"stage2\": [192, 192, 512, 3, True, False, 3, 6],\n                \"stage3\": [512, 384, 1024, 6, True, True, 5, 6],\n                \"stage4\": [1024, 768, 2048, 3, True, True, 5, 6],\n            }\n        }\n    }\n\n    def __init__(self,\n                 arch,\n                 use_lab=False,\n                 lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],\n                 return_idx=[1, 2, 3],\n                 freeze_stem_only=True,\n                 freeze_at=0,\n                 freeze_norm=True):\n        super().__init__()\n        self.use_lab = use_lab\n        self.return_idx = return_idx\n\n        stem_channels = self.arch_configs[arch]['stem_channels']\n        stage_config = self.arch_configs[arch]['stage_config']\n\n        self._out_strides = [4, 8, 16, 32]\n        self._out_channels = [stage_config[k][2] for k in stage_config]\n\n        # stem\n        self.stem = StemBlock(\n            
in_channels=stem_channels[0],\n            mid_channels=stem_channels[1],\n            out_channels=stem_channels[2],\n            use_lab=use_lab,\n            lr_mult=lr_mult_list[0])\n\n        # stages\n        self.stages = nn.LayerList()\n        for i, k in enumerate(stage_config):\n            in_channels, mid_channels, out_channels, block_num, downsample, light_block, kernel_size, layer_num = stage_config[\n                k]\n            self.stages.append(\n                HG_Stage(\n                    in_channels,\n                    mid_channels,\n                    out_channels,\n                    block_num,\n                    layer_num,\n                    downsample,\n                    light_block,\n                    kernel_size,\n                    use_lab,\n                    lr_mult=lr_mult_list[i + 1]))\n\n        if freeze_at >= 0:\n            self._freeze_parameters(self.stem)\n            if not freeze_stem_only:\n                for i in range(min(freeze_at + 1, len(self.stages))):\n                    self._freeze_parameters(self.stages[i])\n\n        if freeze_norm:\n            reset_bn(self, reset_func=_freeze_norm)\n\n        self._init_weights()\n\n    def _freeze_parameters(self, m):\n        for p in m.parameters():\n            p.stop_gradient = True\n\n    def _init_weights(self):\n        for m in self.sublayers():\n            if isinstance(m, nn.Conv2D):\n                kaiming_normal_(m.weight)\n            elif isinstance(m, (nn.BatchNorm2D)):\n                ones_(m.weight)\n                zeros_(m.bias)\n            elif isinstance(m, nn.Linear):\n                zeros_(m.bias)\n\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(\n                channels=self._out_channels[i], stride=self._out_strides[i])\n            for i in self.return_idx\n        ]\n\n    def forward(self, inputs):\n        x = inputs['image']\n        x = self.stem(x)\n        outs = []\n        for idx, stage in enumerate(self.stages):\n            x = stage(x)\n            if idx in self.return_idx:\n                outs.append(x)\n        return outs\n"
  },
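  {
    "path": "examples/hgnet_v2_smoke_test.py",
    "content": "# Illustrative smoke test for the PPHGNetV2 backbone above. This file and its\n# path are additions for demonstration, not part of the upstream source tree;\n# it assumes paddle and ppdet are importable. With the default\n# return_idx=[1, 2, 3], the stem downsamples by 4 and stages 2-4 each halve\n# the resolution, so the returned pyramid sits at strides 8, 16 and 32.\nimport paddle\n\nfrom ppdet.modeling.backbones.hgnet_v2 import PPHGNetV2\n\nif __name__ == '__main__':\n    model = PPHGNetV2(arch='L')\n    model.eval()\n    feats = model({'image': paddle.randn([1, 3, 640, 640])})\n    for spec, feat in zip(model.out_shape, feats):\n        # out_shape carries channels and stride for every returned stage.\n        assert spec.channels == feat.shape[1]\n        assert feat.shape[2] == 640 // spec.stride\n        print(spec.channels, spec.stride, feat.shape)\n"
  },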
  {
    "path": "ppdet/modeling/backbones/hrnet.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn import AdaptiveAvgPool2D, Linear\nfrom paddle.regularizer import L2Decay\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import Normal, Uniform\nfrom numbers import Integral\nimport math\n\nfrom ppdet.core.workspace import register\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['HRNet']\n\n\nclass ConvNormLayer(nn.Layer):\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 filter_size,\n                 stride=1,\n                 norm_type='bn',\n                 norm_groups=32,\n                 use_dcn=False,\n                 norm_momentum=0.9,\n                 norm_decay=0.,\n                 freeze_norm=False,\n                 act=None,\n                 name=None):\n        super(ConvNormLayer, self).__init__()\n        assert norm_type in ['bn', 'sync_bn', 'gn']\n\n        self.act = act\n        self.conv = nn.Conv2D(\n            in_channels=ch_in,\n            out_channels=ch_out,\n            kernel_size=filter_size,\n            stride=stride,\n            padding=(filter_size - 1) // 2,\n            groups=1,\n            weight_attr=ParamAttr(initializer=Normal(\n                mean=0., std=0.01)),\n            bias_attr=False)\n\n        norm_lr = 0. 
if freeze_norm else 1.\n\n        param_attr = ParamAttr(\n            learning_rate=norm_lr, regularizer=L2Decay(norm_decay))\n        bias_attr = ParamAttr(\n            learning_rate=norm_lr, regularizer=L2Decay(norm_decay))\n        global_stats = True if freeze_norm else None\n        if norm_type in ['bn', 'sync_bn']:\n            self.norm = nn.BatchNorm2D(\n                ch_out,\n                momentum=norm_momentum,\n                weight_attr=param_attr,\n                bias_attr=bias_attr,\n                use_global_stats=global_stats)\n        elif norm_type == 'gn':\n            self.norm = nn.GroupNorm(\n                num_groups=norm_groups,\n                num_channels=ch_out,\n                weight_attr=param_attr,\n                bias_attr=bias_attr)\n        norm_params = self.norm.parameters()\n        if freeze_norm:\n            for param in norm_params:\n                param.stop_gradient = True\n\n    def forward(self, inputs):\n        out = self.conv(inputs)\n        out = self.norm(out)\n\n        if self.act == 'relu':\n            out = F.relu(out)\n        return out\n\n\nclass Layer1(nn.Layer):\n    def __init__(self,\n                 num_channels,\n                 has_se=False,\n                 norm_momentum=0.9,\n                 norm_decay=0.,\n                 freeze_norm=True,\n                 name=None):\n        super(Layer1, self).__init__()\n\n        self.bottleneck_block_list = []\n\n        for i in range(4):\n            bottleneck_block = self.add_sublayer(\n                \"block_{}_{}\".format(name, i + 1),\n                BottleneckBlock(\n                    num_channels=num_channels if i == 0 else 256,\n                    num_filters=64,\n                    has_se=has_se,\n                    stride=1,\n                    downsample=True if i == 0 else False,\n                    norm_momentum=norm_momentum,\n                    norm_decay=norm_decay,\n                    freeze_norm=freeze_norm,\n                    name=name + '_' + str(i + 1)))\n            self.bottleneck_block_list.append(bottleneck_block)\n\n    def forward(self, input):\n        conv = input\n        for block_func in self.bottleneck_block_list:\n            conv = block_func(conv)\n        return conv\n\n\nclass TransitionLayer(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 norm_momentum=0.9,\n                 norm_decay=0.,\n                 freeze_norm=True,\n                 name=None):\n        super(TransitionLayer, self).__init__()\n\n        num_in = len(in_channels)\n        num_out = len(out_channels)\n        out = []\n        self.conv_bn_func_list = []\n        for i in range(num_out):\n            residual = None\n            if i < num_in:\n                if in_channels[i] != out_channels[i]:\n                    residual = self.add_sublayer(\n                        \"transition_{}_layer_{}\".format(name, i + 1),\n                        ConvNormLayer(\n                            ch_in=in_channels[i],\n                            ch_out=out_channels[i],\n                            filter_size=3,\n                            norm_momentum=norm_momentum,\n                            norm_decay=norm_decay,\n                            freeze_norm=freeze_norm,\n                            act='relu',\n                            name=name + '_layer_' + str(i + 1)))\n            else:\n                residual = self.add_sublayer(\n                    
\"transition_{}_layer_{}\".format(name, i + 1),\n                    ConvNormLayer(\n                        ch_in=in_channels[-1],\n                        ch_out=out_channels[i],\n                        filter_size=3,\n                        stride=2,\n                        norm_momentum=norm_momentum,\n                        norm_decay=norm_decay,\n                        freeze_norm=freeze_norm,\n                        act='relu',\n                        name=name + '_layer_' + str(i + 1)))\n            self.conv_bn_func_list.append(residual)\n\n    def forward(self, input):\n        outs = []\n        for idx, conv_bn_func in enumerate(self.conv_bn_func_list):\n            if conv_bn_func is None:\n                outs.append(input[idx])\n            else:\n                if idx < len(input):\n                    outs.append(conv_bn_func(input[idx]))\n                else:\n                    outs.append(conv_bn_func(input[-1]))\n        return outs\n\n\nclass Branches(nn.Layer):\n    def __init__(self,\n                 block_num,\n                 in_channels,\n                 out_channels,\n                 has_se=False,\n                 norm_momentum=0.9,\n                 norm_decay=0.,\n                 freeze_norm=True,\n                 name=None):\n        super(Branches, self).__init__()\n\n        self.basic_block_list = []\n        for i in range(len(out_channels)):\n            self.basic_block_list.append([])\n            for j in range(block_num):\n                in_ch = in_channels[i] if j == 0 else out_channels[i]\n                basic_block_func = self.add_sublayer(\n                    \"bb_{}_branch_layer_{}_{}\".format(name, i + 1, j + 1),\n                    BasicBlock(\n                        num_channels=in_ch,\n                        num_filters=out_channels[i],\n                        has_se=has_se,\n                        norm_momentum=norm_momentum,\n                        norm_decay=norm_decay,\n                        freeze_norm=freeze_norm,\n                        name=name + '_branch_layer_' + str(i + 1) + '_' +\n                        str(j + 1)))\n                self.basic_block_list[i].append(basic_block_func)\n\n    def forward(self, inputs):\n        outs = []\n        for idx, input in enumerate(inputs):\n            conv = input\n            basic_block_list = self.basic_block_list[idx]\n            for basic_block_func in basic_block_list:\n                conv = basic_block_func(conv)\n            outs.append(conv)\n        return outs\n\n\nclass BottleneckBlock(nn.Layer):\n    def __init__(self,\n                 num_channels,\n                 num_filters,\n                 has_se,\n                 stride=1,\n                 downsample=False,\n                 norm_momentum=0.9,\n                 norm_decay=0.,\n                 freeze_norm=True,\n                 name=None):\n        super(BottleneckBlock, self).__init__()\n\n        self.has_se = has_se\n        self.downsample = downsample\n\n        self.conv1 = ConvNormLayer(\n            ch_in=num_channels,\n            ch_out=num_filters,\n            filter_size=1,\n            norm_momentum=norm_momentum,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            act=\"relu\",\n            name=name + \"_conv1\")\n        self.conv2 = ConvNormLayer(\n            ch_in=num_filters,\n            ch_out=num_filters,\n            filter_size=3,\n            stride=stride,\n            norm_momentum=norm_momentum,\n            
norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            act=\"relu\",\n            name=name + \"_conv2\")\n        self.conv3 = ConvNormLayer(\n            ch_in=num_filters,\n            ch_out=num_filters * 4,\n            filter_size=1,\n            norm_momentum=norm_momentum,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            act=None,\n            name=name + \"_conv3\")\n\n        if self.downsample:\n            self.conv_down = ConvNormLayer(\n                ch_in=num_channels,\n                ch_out=num_filters * 4,\n                filter_size=1,\n                norm_momentum=norm_momentum,\n                norm_decay=norm_decay,\n                freeze_norm=freeze_norm,\n                act=None,\n                name=name + \"_downsample\")\n\n        if self.has_se:\n            self.se = SELayer(\n                num_channels=num_filters * 4,\n                num_filters=num_filters * 4,\n                reduction_ratio=16,\n                name='fc' + name)\n\n    def forward(self, input):\n        residual = input\n        conv1 = self.conv1(input)\n        conv2 = self.conv2(conv1)\n        conv3 = self.conv3(conv2)\n\n        if self.downsample:\n            residual = self.conv_down(input)\n\n        if self.has_se:\n            conv3 = self.se(conv3)\n\n        y = paddle.add(x=residual, y=conv3)\n        y = F.relu(y)\n        return y\n\n\nclass BasicBlock(nn.Layer):\n    def __init__(self,\n                 num_channels,\n                 num_filters,\n                 stride=1,\n                 has_se=False,\n                 downsample=False,\n                 norm_momentum=0.9,\n                 norm_decay=0.,\n                 freeze_norm=True,\n                 name=None):\n        super(BasicBlock, self).__init__()\n\n        self.has_se = has_se\n        self.downsample = downsample\n        self.conv1 = ConvNormLayer(\n            ch_in=num_channels,\n            ch_out=num_filters,\n            filter_size=3,\n            norm_momentum=norm_momentum,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            stride=stride,\n            act=\"relu\",\n            name=name + \"_conv1\")\n        self.conv2 = ConvNormLayer(\n            ch_in=num_filters,\n            ch_out=num_filters,\n            filter_size=3,\n            norm_momentum=norm_momentum,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            stride=1,\n            act=None,\n            name=name + \"_conv2\")\n\n        if self.downsample:\n            self.conv_down = ConvNormLayer(\n                ch_in=num_channels,\n                ch_out=num_filters * 4,\n                filter_size=1,\n                norm_momentum=norm_momentum,\n                norm_decay=norm_decay,\n                freeze_norm=freeze_norm,\n                act=None,\n                name=name + \"_downsample\")\n\n        if self.has_se:\n            self.se = SELayer(\n                num_channels=num_filters,\n                num_filters=num_filters,\n                reduction_ratio=16,\n                name='fc' + name)\n\n    def forward(self, input):\n        residual = input\n        conv1 = self.conv1(input)\n        conv2 = self.conv2(conv1)\n\n        if self.downsample:\n            residual = self.conv_down(input)\n\n        if self.has_se:\n            conv2 = self.se(conv2)\n\n        y = paddle.add(x=residual, y=conv2)\n        y = F.relu(y)\n        return y\n\n\nclass 
SELayer(nn.Layer):\n    def __init__(self, num_channels, num_filters, reduction_ratio, name=None):\n        super(SELayer, self).__init__()\n\n        self.pool2d_gap = AdaptiveAvgPool2D(1)\n\n        self._num_channels = num_channels\n\n        med_ch = int(num_channels / reduction_ratio)\n        stdv = 1.0 / math.sqrt(num_channels * 1.0)\n        self.squeeze = Linear(\n            num_channels,\n            med_ch,\n            weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)))\n\n        stdv = 1.0 / math.sqrt(med_ch * 1.0)\n        self.excitation = Linear(\n            med_ch,\n            num_filters,\n            weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)))\n\n    def forward(self, input):\n        pool = self.pool2d_gap(input)\n        pool = paddle.squeeze(pool, axis=[2, 3])\n        squeeze = self.squeeze(pool)\n        squeeze = F.relu(squeeze)\n        excitation = self.excitation(squeeze)\n        excitation = F.sigmoid(excitation)\n        excitation = paddle.unsqueeze(excitation, axis=[2, 3])\n        out = input * excitation\n        return out\n\n\nclass Stage(nn.Layer):\n    def __init__(self,\n                 num_channels,\n                 num_modules,\n                 num_filters,\n                 has_se=False,\n                 norm_momentum=0.9,\n                 norm_decay=0.,\n                 freeze_norm=True,\n                 multi_scale_output=True,\n                 name=None):\n        super(Stage, self).__init__()\n\n        self._num_modules = num_modules\n        self.stage_func_list = []\n        for i in range(num_modules):\n            if i == num_modules - 1 and not multi_scale_output:\n                stage_func = self.add_sublayer(\n                    \"stage_{}_{}\".format(name, i + 1),\n                    HighResolutionModule(\n                        num_channels=num_channels,\n                        num_filters=num_filters,\n                        has_se=has_se,\n                        norm_momentum=norm_momentum,\n                        norm_decay=norm_decay,\n                        freeze_norm=freeze_norm,\n                        multi_scale_output=False,\n                        name=name + '_' + str(i + 1)))\n            else:\n                stage_func = self.add_sublayer(\n                    \"stage_{}_{}\".format(name, i + 1),\n                    HighResolutionModule(\n                        num_channels=num_channels,\n                        num_filters=num_filters,\n                        has_se=has_se,\n                        norm_momentum=norm_momentum,\n                        norm_decay=norm_decay,\n                        freeze_norm=freeze_norm,\n                        name=name + '_' + str(i + 1)))\n\n            self.stage_func_list.append(stage_func)\n\n    def forward(self, input):\n        out = input\n        for idx in range(self._num_modules):\n            out = self.stage_func_list[idx](out)\n        return out\n\n\nclass HighResolutionModule(nn.Layer):\n    def __init__(self,\n                 num_channels,\n                 num_filters,\n                 has_se=False,\n                 multi_scale_output=True,\n                 norm_momentum=0.9,\n                 norm_decay=0.,\n                 freeze_norm=True,\n                 name=None):\n        super(HighResolutionModule, self).__init__()\n        self.branches_func = Branches(\n            block_num=4,\n            in_channels=num_channels,\n            out_channels=num_filters,\n            has_se=has_se,\n            
norm_momentum=norm_momentum,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=name)\n\n        self.fuse_func = FuseLayers(\n            in_channels=num_filters,\n            out_channels=num_filters,\n            multi_scale_output=multi_scale_output,\n            norm_momentum=norm_momentum,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=name)\n\n    def forward(self, input):\n        out = self.branches_func(input)\n        out = self.fuse_func(out)\n        return out\n\n\nclass FuseLayers(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 multi_scale_output=True,\n                 norm_momentum=0.9,\n                 norm_decay=0.,\n                 freeze_norm=True,\n                 name=None):\n        super(FuseLayers, self).__init__()\n\n        self._actual_ch = len(in_channels) if multi_scale_output else 1\n        self._in_channels = in_channels\n\n        self.residual_func_list = []\n        for i in range(self._actual_ch):\n            for j in range(len(in_channels)):\n                residual_func = None\n                if j > i:\n                    residual_func = self.add_sublayer(\n                        \"residual_{}_layer_{}_{}\".format(name, i + 1, j + 1),\n                        ConvNormLayer(\n                            ch_in=in_channels[j],\n                            ch_out=out_channels[i],\n                            filter_size=1,\n                            stride=1,\n                            act=None,\n                            norm_momentum=norm_momentum,\n                            norm_decay=norm_decay,\n                            freeze_norm=freeze_norm,\n                            name=name + '_layer_' + str(i + 1) + '_' +\n                            str(j + 1)))\n                    self.residual_func_list.append(residual_func)\n                elif j < i:\n                    pre_num_filters = in_channels[j]\n                    for k in range(i - j):\n                        if k == i - j - 1:\n                            residual_func = self.add_sublayer(\n                                \"residual_{}_layer_{}_{}_{}\".format(\n                                    name, i + 1, j + 1, k + 1),\n                                ConvNormLayer(\n                                    ch_in=pre_num_filters,\n                                    ch_out=out_channels[i],\n                                    filter_size=3,\n                                    stride=2,\n                                    norm_momentum=norm_momentum,\n                                    norm_decay=norm_decay,\n                                    freeze_norm=freeze_norm,\n                                    act=None,\n                                    name=name + '_layer_' + str(i + 1) + '_' +\n                                    str(j + 1) + '_' + str(k + 1)))\n                            pre_num_filters = out_channels[i]\n                        else:\n                            residual_func = self.add_sublayer(\n                                \"residual_{}_layer_{}_{}_{}\".format(\n                                    name, i + 1, j + 1, k + 1),\n                                ConvNormLayer(\n                                    ch_in=pre_num_filters,\n                                    ch_out=out_channels[j],\n                                    filter_size=3,\n                                    stride=2,\n 
                                   norm_momentum=norm_momentum,\n                                    norm_decay=norm_decay,\n                                    freeze_norm=freeze_norm,\n                                    act=\"relu\",\n                                    name=name + '_layer_' + str(i + 1) + '_' +\n                                    str(j + 1) + '_' + str(k + 1)))\n                            pre_num_filters = out_channels[j]\n                        self.residual_func_list.append(residual_func)\n\n    def forward(self, input):\n        outs = []\n        residual_func_idx = 0\n        for i in range(self._actual_ch):\n            residual = input[i]\n            for j in range(len(self._in_channels)):\n                if j > i:\n                    y = self.residual_func_list[residual_func_idx](input[j])\n                    residual_func_idx += 1\n                    y = F.interpolate(y, scale_factor=2**(j - i))\n                    residual = paddle.add(x=residual, y=y)\n                elif j < i:\n                    y = input[j]\n                    for k in range(i - j):\n                        y = self.residual_func_list[residual_func_idx](y)\n                        residual_func_idx += 1\n                    residual = paddle.add(x=residual, y=y)\n            residual = F.relu(residual)\n            outs.append(residual)\n\n        return outs\n\n\n@register\nclass HRNet(nn.Layer):\n    \"\"\"\n    HRNet, see https://arxiv.org/abs/1908.07919\n\n    Args:\n        width (int): the width of HRNet\n        has_se (bool): whether to add SE block for each stage\n        freeze_at (int): the stage to freeze\n        freeze_norm (bool): whether to freeze norm in HRNet\n        norm_momentum (float): momentum of BatchNorm\n        norm_decay (float): weight decay for normalization layer weights\n        return_idx (List): the stage to return\n        upsample (bool): whether to upsample and concat the backbone feats\n    \"\"\"\n\n    def __init__(self,\n                 width=18,\n                 has_se=False,\n                 freeze_at=0,\n                 freeze_norm=True,\n                 norm_momentum=0.9,\n                 norm_decay=0.,\n                 return_idx=[0, 1, 2, 3],\n                 upsample=False,\n                 downsample=False):\n        super(HRNet, self).__init__()\n\n        self.width = width\n        self.has_se = has_se\n        if isinstance(return_idx, Integral):\n            return_idx = [return_idx]\n\n        assert len(return_idx) > 0, \"need one or more return index\"\n        self.freeze_at = freeze_at\n        self.return_idx = return_idx\n        self.upsample = upsample\n        self.downsample = downsample\n\n        self.channels = {\n            18: [[18, 36], [18, 36, 72], [18, 36, 72, 144]],\n            30: [[30, 60], [30, 60, 120], [30, 60, 120, 240]],\n            32: [[32, 64], [32, 64, 128], [32, 64, 128, 256]],\n            40: [[40, 80], [40, 80, 160], [40, 80, 160, 320]],\n            44: [[44, 88], [44, 88, 176], [44, 88, 176, 352]],\n            48: [[48, 96], [48, 96, 192], [48, 96, 192, 384]],\n            60: [[60, 120], [60, 120, 240], [60, 120, 240, 480]],\n            64: [[64, 128], [64, 128, 256], [64, 128, 256, 512]]\n        }\n\n        channels_2, channels_3, channels_4 = self.channels[width]\n        num_modules_2, num_modules_3, num_modules_4 = 1, 4, 3\n        self._out_channels = [sum(channels_4)] if self.upsample else channels_4\n        self._out_strides = [4] if self.upsample else [4, 8, 
16, 32]\n\n        self.conv_layer1_1 = ConvNormLayer(\n            ch_in=3,\n            ch_out=64,\n            filter_size=3,\n            stride=2,\n            norm_momentum=norm_momentum,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            act='relu',\n            name=\"layer1_1\")\n\n        self.conv_layer1_2 = ConvNormLayer(\n            ch_in=64,\n            ch_out=64,\n            filter_size=3,\n            stride=2,\n            norm_momentum=norm_momentum,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            act='relu',\n            name=\"layer1_2\")\n\n        self.la1 = Layer1(\n            num_channels=64,\n            has_se=has_se,\n            norm_momentum=norm_momentum,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=\"layer2\")\n\n        self.tr1 = TransitionLayer(\n            in_channels=[256],\n            out_channels=channels_2,\n            norm_momentum=norm_momentum,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=\"tr1\")\n\n        self.st2 = Stage(\n            num_channels=channels_2,\n            num_modules=num_modules_2,\n            num_filters=channels_2,\n            has_se=self.has_se,\n            norm_momentum=norm_momentum,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=\"st2\")\n\n        self.tr2 = TransitionLayer(\n            in_channels=channels_2,\n            out_channels=channels_3,\n            norm_momentum=norm_momentum,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=\"tr2\")\n\n        self.st3 = Stage(\n            num_channels=channels_3,\n            num_modules=num_modules_3,\n            num_filters=channels_3,\n            has_se=self.has_se,\n            norm_momentum=norm_momentum,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=\"st3\")\n\n        self.tr3 = TransitionLayer(\n            in_channels=channels_3,\n            out_channels=channels_4,\n            norm_momentum=norm_momentum,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=\"tr3\")\n        self.st4 = Stage(\n            num_channels=channels_4,\n            num_modules=num_modules_4,\n            num_filters=channels_4,\n            has_se=self.has_se,\n            norm_momentum=norm_momentum,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            multi_scale_output=len(return_idx) > 1,\n            name=\"st4\")\n\n        if self.downsample:\n            self.incre_modules, self.downsamp_modules, \\\n                self.final_layer = self._make_head(channels_4, norm_momentum=norm_momentum, has_se=self.has_se)\n\n    def _make_layer(self,\n                    block,\n                    inplanes,\n                    planes,\n                    blocks,\n                    stride=1,\n                    norm_momentum=0.9,\n                    has_se=False,\n                    name=None):\n        downsample = None\n        if stride != 1 or inplanes != planes * 4:\n            downsample = True\n\n        layers = []\n        layers.append(\n            block(\n                inplanes,\n                planes,\n                has_se,\n                stride,\n                downsample,\n                norm_momentum=norm_momentum,\n                freeze_norm=False,\n                
name=name + \"_s0\"))\n        inplanes = planes * 4\n        for i in range(1, blocks):\n            layers.append(\n                block(\n                    inplanes,\n                    planes,\n                    has_se,\n                    norm_momentum=norm_momentum,\n                    freeze_norm=False,\n                    name=name + \"_s\" + str(i)))\n\n        return nn.Sequential(*layers)\n\n    def _make_head(self, pre_stage_channels, norm_momentum=0.9, has_se=False):\n        head_block = BottleneckBlock\n        head_channels = [32, 64, 128, 256]\n\n        # Increasing the #channels on each resolution \n        # from C, 2C, 4C, 8C to 128, 256, 512, 1024\n        incre_modules = []\n        for i, channels in enumerate(pre_stage_channels):\n            incre_module = self._make_layer(\n                head_block,\n                channels,\n                head_channels[i],\n                1,\n                stride=1,\n                norm_momentum=norm_momentum,\n                has_se=has_se,\n                name='incre' + str(i))\n            incre_modules.append(incre_module)\n        incre_modules = nn.LayerList(incre_modules)\n\n        # downsampling modules\n        downsamp_modules = []\n        for i in range(len(pre_stage_channels) - 1):\n            in_channels = head_channels[i] * 4\n            out_channels = head_channels[i + 1] * 4\n\n            downsamp_module = nn.Sequential(\n                nn.Conv2D(\n                    in_channels=in_channels,\n                    out_channels=out_channels,\n                    kernel_size=3,\n                    stride=2,\n                    padding=1),\n                nn.BatchNorm2D(\n                    out_channels, momentum=norm_momentum),\n                nn.ReLU())\n\n            downsamp_modules.append(downsamp_module)\n        downsamp_modules = nn.LayerList(downsamp_modules)\n\n        final_layer = nn.Sequential(\n            nn.Conv2D(\n                in_channels=head_channels[3] * 4,\n                out_channels=2048,\n                kernel_size=1,\n                stride=1,\n                padding=0),\n            nn.BatchNorm2D(\n                2048, momentum=norm_momentum),\n            nn.ReLU())\n\n        return incre_modules, downsamp_modules, final_layer\n\n    def forward(self, inputs):\n        x = inputs['image']\n        conv1 = self.conv_layer1_1(x)\n        conv2 = self.conv_layer1_2(conv1)\n\n        la1 = self.la1(conv2)\n        tr1 = self.tr1([la1])\n        st2 = self.st2(tr1)\n        tr2 = self.tr2(st2)\n\n        st3 = self.st3(tr2)\n        tr3 = self.tr3(st3)\n\n        st4 = self.st4(tr3)\n\n        if self.upsample:\n            # Upsampling\n            x0_h, x0_w = st4[0].shape[2:4]\n            x1 = F.upsample(st4[1], size=(x0_h, x0_w), mode='bilinear')\n            x2 = F.upsample(st4[2], size=(x0_h, x0_w), mode='bilinear')\n            x3 = F.upsample(st4[3], size=(x0_h, x0_w), mode='bilinear')\n            x = paddle.concat([st4[0], x1, x2, x3], 1)\n            return x\n\n        if self.downsample:\n            y = self.incre_modules[0](st4[0])\n            for i in range(len(self.downsamp_modules)):\n                y = self.incre_modules[i+1](st4[i+1]) + \\\n                            self.downsamp_modules[i](y)\n            y = self.final_layer(y)\n            return y\n\n        res = []\n        for i, layer in enumerate(st4):\n            if i == self.freeze_at:\n                layer.stop_gradient = True\n            if i in self.return_idx:\n   
             res.append(layer)\n\n        return res\n\n    @property\n    def out_shape(self):\n        # the upsample branch concatenates all resolutions into one 1/4-stride\n        # feature map, so only a single output spec is valid in that case\n        if self.upsample:\n            self.return_idx = [0]\n        return [\n            ShapeSpec(\n                channels=self._out_channels[i], stride=self._out_strides[i])\n            for i in self.return_idx\n        ]\n
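\n\nif __name__ == '__main__':\n    # Illustrative usage sketch, not part of the original module: builds the\n    # backbone directly and inspects its outputs. Run it as a module so the\n    # relative imports resolve (e.g. `python -m ppdet.modeling.backbones.hrnet`);\n    # the 224x224 input size is an arbitrary assumption.\n    import paddle\n    model = HRNet(width=18, freeze_norm=False, return_idx=[0, 1, 2, 3])\n    feats = model({'image': paddle.rand([1, 3, 224, 224])})\n    for spec, feat in zip(model.out_shape, feats):\n        # strides 4/8/16/32 give 56/28/14/7 spatial sizes for a 224x224 input\n        print(feat.shape, spec.channels, spec.stride)\n"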
  },
  {
    "path": "ppdet/modeling/backbones/lcnet.py",
    "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#    http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nfrom paddle import ParamAttr\nfrom paddle.nn import Conv2D\nfrom paddle.regularizer import L2Decay\nfrom paddle.nn.initializer import KaimingNormal\n\nfrom ppdet.core.workspace import register, serializable\nfrom numbers import Integral\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['LCNet']\n\nNET_CONFIG = {\n    \"blocks2\":\n    #k, in_c, out_c, s, use_se\n    [[3, 16, 32, 1, False], ],\n    \"blocks3\": [\n        [3, 32, 64, 2, False],\n        [3, 64, 64, 1, False],\n    ],\n    \"blocks4\": [\n        [3, 64, 128, 2, False],\n        [3, 128, 128, 1, False],\n    ],\n    \"blocks5\": [\n        [3, 128, 256, 2, False],\n        [5, 256, 256, 1, False],\n        [5, 256, 256, 1, False],\n        [5, 256, 256, 1, False],\n        [5, 256, 256, 1, False],\n        [5, 256, 256, 1, False],\n    ],\n    \"blocks6\": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]]\n}\n\n\ndef make_divisible(v, divisor=8, min_value=None):\n    if min_value is None:\n        min_value = divisor\n    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)\n    if new_v < 0.9 * v:\n        new_v += divisor\n    return new_v\n\n\nclass ConvBNLayer(nn.Layer):\n    def __init__(self,\n                 num_channels,\n                 filter_size,\n                 num_filters,\n                 stride,\n                 num_groups=1,\n                 act='hard_swish'):\n        super().__init__()\n\n        self.conv = Conv2D(\n            in_channels=num_channels,\n            out_channels=num_filters,\n            kernel_size=filter_size,\n            stride=stride,\n            padding=(filter_size - 1) // 2,\n            groups=num_groups,\n            weight_attr=ParamAttr(initializer=KaimingNormal()),\n            bias_attr=False)\n\n        self.bn = nn.BatchNorm2D(\n            num_filters,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n        if act == 'hard_swish':\n            self.act = nn.Hardswish()\n        elif act == 'relu6':\n            self.act = nn.ReLU6()\n\n    def forward(self, x):\n        x = self.conv(x)\n        x = self.bn(x)\n        x = self.act(x)\n        return x\n\n\nclass DepthwiseSeparable(nn.Layer):\n    def __init__(self,\n                 num_channels,\n                 num_filters,\n                 stride,\n                 dw_size=3,\n                 use_se=False,\n                 act='hard_swish'):\n        super().__init__()\n        self.use_se = use_se\n        self.dw_conv = ConvBNLayer(\n            num_channels=num_channels,\n            num_filters=num_channels,\n            filter_size=dw_size,\n            stride=stride,\n            num_groups=num_channels,\n            act=act)\n        
if use_se:\n            self.se = SEModule(num_channels)\n        self.pw_conv = ConvBNLayer(\n            num_channels=num_channels,\n            filter_size=1,\n            num_filters=num_filters,\n            stride=1,\n            act=act)\n\n    def forward(self, x):\n        x = self.dw_conv(x)\n        if self.use_se:\n            x = self.se(x)\n        x = self.pw_conv(x)\n        return x\n\n\nclass AdaptiveAvgPool2D(nn.AdaptiveAvgPool2D):\n    # drop-in subclass of paddle's AdaptiveAvgPool2D that special-cases\n    # global (1x1) pooling when running on NPU devices\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n\n        if paddle.device.get_device().startswith(\"npu\"):\n            self.device = \"npu\"\n        else:\n            self.device = None\n\n        if isinstance(self._output_size, int) and self._output_size == 1:\n            self._gap = True\n        elif isinstance(self._output_size, tuple) and self._output_size[\n                0] == 1 and self._output_size[1] == 1:\n            self._gap = True\n        else:\n            self._gap = False\n\n    def forward(self, x):\n        if self.device == \"npu\" and self._gap:\n            # compute global average pooling with mean(), avoiding the\n            # adaptive pooling kernel on NPU\n            N, C, _, _ = x.shape\n            x_mean = paddle.mean(x, axis=[2, 3])\n            x_mean = paddle.reshape(x_mean, [N, C, 1, 1])\n            return x_mean\n        else:\n            return super(AdaptiveAvgPool2D, self).forward(x)\n\n\nclass SEModule(nn.Layer):\n    def __init__(self, channel, reduction=4):\n        super().__init__()\n        self.avg_pool = AdaptiveAvgPool2D(1)\n        self.conv1 = Conv2D(\n            in_channels=channel,\n            out_channels=channel // reduction,\n            kernel_size=1,\n            stride=1,\n            padding=0)\n        self.relu = nn.ReLU()\n        self.conv2 = Conv2D(\n            in_channels=channel // reduction,\n            out_channels=channel,\n            kernel_size=1,\n            stride=1,\n            padding=0)\n        self.hardsigmoid = nn.Hardsigmoid()\n\n    def forward(self, x):\n        identity = x\n        x = self.avg_pool(x)\n        x = self.conv1(x)\n        x = self.relu(x)\n        x = self.conv2(x)\n        x = self.hardsigmoid(x)\n        x = paddle.multiply(x=identity, y=x)\n        return x\n\n\n@register\n@serializable\nclass LCNet(nn.Layer):\n    def __init__(self, scale=1.0, feature_maps=[3, 4, 5], act='hard_swish'):\n        super().__init__()\n        self.scale = scale\n        self.feature_maps = feature_maps\n\n        out_channels = []\n\n        self.conv1 = ConvBNLayer(\n            num_channels=3,\n            filter_size=3,\n            num_filters=make_divisible(16 * scale),\n            stride=2,\n            act=act)\n\n        self.blocks2 = nn.Sequential(*[\n            DepthwiseSeparable(\n                num_channels=make_divisible(in_c * scale),\n                num_filters=make_divisible(out_c * scale),\n                dw_size=k,\n                stride=s,\n                use_se=se,\n                act=act)\n            for (k, in_c, out_c, s, se) in NET_CONFIG[\"blocks2\"]\n        ])\n\n        self.blocks3 = nn.Sequential(*[\n            DepthwiseSeparable(\n                num_channels=make_divisible(in_c * scale),\n                num_filters=make_divisible(out_c * scale),\n                dw_size=k,\n                stride=s,\n                use_se=se,\n                act=act)\n            for (k, in_c, out_c, s, se) in NET_CONFIG[\"blocks3\"]\n        ])\n\n        out_channels.append(\n            
make_divisible(NET_CONFIG[\"blocks3\"][-1][2] * scale))\n\n        self.blocks4 = nn.Sequential(*[\n            DepthwiseSeparable(\n                num_channels=make_divisible(in_c * scale),\n                num_filters=make_divisible(out_c * scale),\n                dw_size=k,\n                stride=s,\n                use_se=se,\n                act=act)\n            for (k, in_c, out_c, s, se) in NET_CONFIG[\"blocks4\"]\n        ])\n\n        out_channels.append(\n            make_divisible(NET_CONFIG[\"blocks4\"][-1][2] * scale))\n\n        self.blocks5 = nn.Sequential(*[\n            DepthwiseSeparable(\n                num_channels=make_divisible(in_c * scale),\n                num_filters=make_divisible(out_c * scale),\n                dw_size=k,\n                stride=s,\n                use_se=se,\n                act=act)\n            for (k, in_c, out_c, s, se) in NET_CONFIG[\"blocks5\"]\n        ])\n\n        out_channels.append(\n            make_divisible(NET_CONFIG[\"blocks5\"][-1][2] * scale))\n\n        self.blocks6 = nn.Sequential(*[\n            DepthwiseSeparable(\n                num_channels=make_divisible(in_c * scale),\n                num_filters=make_divisible(out_c * scale),\n                dw_size=k,\n                stride=s,\n                use_se=se,\n                act=act)\n            for (k, in_c, out_c, s, se) in NET_CONFIG[\"blocks6\"]\n        ])\n\n        out_channels.append(\n            make_divisible(NET_CONFIG[\"blocks6\"][-1][2] * scale))\n        # blocks3..blocks6 produce feature levels 2..5 (strides 4/8/16/32),\n        # hence the idx + 2 offset when matching against feature_maps\n        self._out_channels = [\n            ch for idx, ch in enumerate(out_channels) if idx + 2 in feature_maps\n        ]\n\n    def forward(self, inputs):\n        x = inputs['image']\n        outs = []\n\n        x = self.conv1(x)\n        x = self.blocks2(x)\n        x = self.blocks3(x)\n        outs.append(x)\n        x = self.blocks4(x)\n        outs.append(x)\n        x = self.blocks5(x)\n        outs.append(x)\n        x = self.blocks6(x)\n        outs.append(x)\n        # keep only the levels requested via feature_maps (same idx + 2 offset)\n        outs = [o for i, o in enumerate(outs) if i + 2 in self.feature_maps]\n        return outs\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=c) for c in self._out_channels]\n
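\n\nif __name__ == '__main__':\n    # Illustrative usage sketch, not part of the original module; run with\n    # `python -m ppdet.modeling.backbones.lcnet` so the relative imports\n    # resolve. The 320x320 input size is an arbitrary assumption.\n    model = LCNet(scale=1.0, feature_maps=[3, 4, 5])\n    feats = model({'image': paddle.rand([1, 3, 320, 320])})\n    for spec, feat in zip(model.out_shape, feats):\n        print(feat.shape, spec.channels)\n"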
  },
  {
    "path": "ppdet/modeling/backbones/lite_hrnet.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nThis code is based on\nhttps://github.com/HRNet/Lite-HRNet/blob/hrnet/models/backbones/litehrnet.py\n\"\"\"\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\n\nfrom numbers import Integral\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\nfrom paddle.nn.initializer import Normal, Constant\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling.shape_spec import ShapeSpec\nfrom ppdet.modeling.ops import channel_shuffle\nfrom .. import layers as L\n\n__all__ = ['LiteHRNet']\n\n\nclass ConvNormLayer(nn.Layer):\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 filter_size,\n                 stride=1,\n                 groups=1,\n                 norm_type=None,\n                 norm_groups=32,\n                 norm_decay=0.,\n                 freeze_norm=False,\n                 act=None):\n        super(ConvNormLayer, self).__init__()\n        self.act = act\n        norm_lr = 0. if freeze_norm else 1.\n        if norm_type is not None:\n            assert norm_type in ['bn', 'sync_bn', 'gn'], \\\n                \"norm_type should be one of ['bn', 'sync_bn', 'gn'], but got {}\".format(norm_type)\n            param_attr = ParamAttr(\n                initializer=Constant(1.0),\n                learning_rate=norm_lr,\n                regularizer=L2Decay(norm_decay), )\n            bias_attr = ParamAttr(\n                learning_rate=norm_lr, regularizer=L2Decay(norm_decay))\n            global_stats = True if freeze_norm else None\n            if norm_type in ['bn', 'sync_bn']:\n                self.norm = nn.BatchNorm2D(\n                    ch_out,\n                    weight_attr=param_attr,\n                    bias_attr=bias_attr,\n                    use_global_stats=global_stats, )\n            elif norm_type == 'gn':\n                self.norm = nn.GroupNorm(\n                    num_groups=norm_groups,\n                    num_channels=ch_out,\n                    weight_attr=param_attr,\n                    bias_attr=bias_attr)\n            norm_params = self.norm.parameters()\n            if freeze_norm:\n                for param in norm_params:\n                    param.stop_gradient = True\n            conv_bias_attr = False\n        else:\n            conv_bias_attr = True\n            self.norm = None\n\n        self.conv = nn.Conv2D(\n            in_channels=ch_in,\n            out_channels=ch_out,\n            kernel_size=filter_size,\n            stride=stride,\n            padding=(filter_size - 1) // 2,\n            groups=groups,\n            weight_attr=ParamAttr(initializer=Normal(\n                mean=0., std=0.001)),\n            bias_attr=conv_bias_attr)\n\n    def forward(self, inputs):\n        out = self.conv(inputs)\n        if self.norm is not None:\n            out = self.norm(out)\n\n        if self.act == 'relu':\n       
     out = F.relu(out)\n        elif self.act == 'sigmoid':\n            out = F.sigmoid(out)\n        return out\n\n\nclass DepthWiseSeparableConvNormLayer(nn.Layer):\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 filter_size,\n                 stride=1,\n                 dw_norm_type=None,\n                 pw_norm_type=None,\n                 norm_decay=0.,\n                 freeze_norm=False,\n                 dw_act=None,\n                 pw_act=None):\n        super(DepthWiseSeparableConvNormLayer, self).__init__()\n        self.depthwise_conv = ConvNormLayer(\n            ch_in=ch_in,\n            ch_out=ch_in,\n            filter_size=filter_size,\n            stride=stride,\n            groups=ch_in,\n            norm_type=dw_norm_type,\n            act=dw_act,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm, )\n        self.pointwise_conv = ConvNormLayer(\n            ch_in=ch_in,\n            ch_out=ch_out,\n            filter_size=1,\n            stride=1,\n            norm_type=pw_norm_type,\n            act=pw_act,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm, )\n\n    def forward(self, x):\n        x = self.depthwise_conv(x)\n        x = self.pointwise_conv(x)\n        return x\n\n\nclass CrossResolutionWeightingModule(nn.Layer):\n    def __init__(self,\n                 channels,\n                 ratio=16,\n                 norm_type='bn',\n                 freeze_norm=False,\n                 norm_decay=0.):\n        super(CrossResolutionWeightingModule, self).__init__()\n        self.channels = channels\n        total_channel = sum(channels)\n        self.conv1 = ConvNormLayer(\n            ch_in=total_channel,\n            ch_out=total_channel // ratio,\n            filter_size=1,\n            stride=1,\n            norm_type=norm_type,\n            act='relu',\n            freeze_norm=freeze_norm,\n            norm_decay=norm_decay)\n        self.conv2 = ConvNormLayer(\n            ch_in=total_channel // ratio,\n            ch_out=total_channel,\n            filter_size=1,\n            stride=1,\n            norm_type=norm_type,\n            act='sigmoid',\n            freeze_norm=freeze_norm,\n            norm_decay=norm_decay)\n\n    def forward(self, x):\n        mini_size = x[-1].shape[-2:]\n        out = [F.adaptive_avg_pool2d(s, mini_size) for s in x[:-1]] + [x[-1]]\n        out = paddle.concat(out, 1)\n        out = self.conv1(out)\n        out = self.conv2(out)\n        out = paddle.split(out, self.channels, 1)\n        out = [\n            s * F.interpolate(\n                a, s.shape[-2:], mode='nearest') for s, a in zip(x, out)\n        ]\n        return out\n\n\nclass SpatialWeightingModule(nn.Layer):\n    def __init__(self, in_channel, ratio=16, freeze_norm=False, norm_decay=0.):\n        super(SpatialWeightingModule, self).__init__()\n        self.global_avgpooling = nn.AdaptiveAvgPool2D(1)\n        self.conv1 = ConvNormLayer(\n            ch_in=in_channel,\n            ch_out=in_channel // ratio,\n            filter_size=1,\n            stride=1,\n            act='relu',\n            freeze_norm=freeze_norm,\n            norm_decay=norm_decay)\n        self.conv2 = ConvNormLayer(\n            ch_in=in_channel // ratio,\n            ch_out=in_channel,\n            filter_size=1,\n            stride=1,\n            act='sigmoid',\n            freeze_norm=freeze_norm,\n            norm_decay=norm_decay)\n\n    def forward(self, x):\n        out = 
self.global_avgpooling(x)\n        out = self.conv1(out)\n        out = self.conv2(out)\n        return x * out\n\n\nclass ConditionalChannelWeightingBlock(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 stride,\n                 reduce_ratio,\n                 norm_type='bn',\n                 freeze_norm=False,\n                 norm_decay=0.):\n        super(ConditionalChannelWeightingBlock, self).__init__()\n        assert stride in [1, 2]\n        branch_channels = [channel // 2 for channel in in_channels]\n\n        self.cross_resolution_weighting = CrossResolutionWeightingModule(\n            branch_channels,\n            ratio=reduce_ratio,\n            norm_type=norm_type,\n            freeze_norm=freeze_norm,\n            norm_decay=norm_decay)\n        self.depthwise_convs = nn.LayerList([\n            ConvNormLayer(\n                channel,\n                channel,\n                filter_size=3,\n                stride=stride,\n                groups=channel,\n                norm_type=norm_type,\n                freeze_norm=freeze_norm,\n                norm_decay=norm_decay) for channel in branch_channels\n        ])\n\n        self.spatial_weighting = nn.LayerList([\n            SpatialWeightingModule(\n                channel,\n                ratio=4,\n                freeze_norm=freeze_norm,\n                norm_decay=norm_decay) for channel in branch_channels\n        ])\n\n    def forward(self, x):\n        x = [s.chunk(2, axis=1) for s in x]\n        x1 = [s[0] for s in x]\n        x2 = [s[1] for s in x]\n\n        x2 = self.cross_resolution_weighting(x2)\n        x2 = [dw(s) for s, dw in zip(x2, self.depthwise_convs)]\n        x2 = [sw(s) for s, sw in zip(x2, self.spatial_weighting)]\n\n        out = [paddle.concat([s1, s2], axis=1) for s1, s2 in zip(x1, x2)]\n        out = [channel_shuffle(s, groups=2) for s in out]\n        return out\n\n\nclass ShuffleUnit(nn.Layer):\n    def __init__(self,\n                 in_channel,\n                 out_channel,\n                 stride,\n                 norm_type='bn',\n                 freeze_norm=False,\n                 norm_decay=0.):\n        super(ShuffleUnit, self).__init__()\n        branch_channel = out_channel // 2\n        self.stride = stride\n        if self.stride == 1:\n            assert in_channel == branch_channel * 2, \\\n                \"when stride=1, in_channel {} should equal to branch_channel*2 {}\".format(in_channel, branch_channel * 2)\n        if stride > 1:\n            self.branch1 = nn.Sequential(\n                ConvNormLayer(\n                    ch_in=in_channel,\n                    ch_out=in_channel,\n                    filter_size=3,\n                    stride=self.stride,\n                    groups=in_channel,\n                    norm_type=norm_type,\n                    freeze_norm=freeze_norm,\n                    norm_decay=norm_decay),\n                ConvNormLayer(\n                    ch_in=in_channel,\n                    ch_out=branch_channel,\n                    filter_size=1,\n                    stride=1,\n                    norm_type=norm_type,\n                    act='relu',\n                    freeze_norm=freeze_norm,\n                    norm_decay=norm_decay), )\n        self.branch2 = nn.Sequential(\n            ConvNormLayer(\n                ch_in=branch_channel if stride == 1 else in_channel,\n                ch_out=branch_channel,\n                filter_size=1,\n                stride=1,\n                
norm_type=norm_type,\n                act='relu',\n                freeze_norm=freeze_norm,\n                norm_decay=norm_decay),\n            ConvNormLayer(\n                ch_in=branch_channel,\n                ch_out=branch_channel,\n                filter_size=3,\n                stride=self.stride,\n                groups=branch_channel,\n                norm_type=norm_type,\n                freeze_norm=freeze_norm,\n                norm_decay=norm_decay),\n            ConvNormLayer(\n                ch_in=branch_channel,\n                ch_out=branch_channel,\n                filter_size=1,\n                stride=1,\n                norm_type=norm_type,\n                act='relu',\n                freeze_norm=freeze_norm,\n                norm_decay=norm_decay), )\n\n    def forward(self, x):\n        if self.stride > 1:\n            x1 = self.branch1(x)\n            x2 = self.branch2(x)\n        else:\n            x1, x2 = x.chunk(2, axis=1)\n            x2 = self.branch2(x2)\n        out = paddle.concat([x1, x2], axis=1)\n        out = channel_shuffle(out, groups=2)\n        return out\n\n\nclass IterativeHead(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 norm_type='bn',\n                 freeze_norm=False,\n                 norm_decay=0.):\n        super(IterativeHead, self).__init__()\n        num_branches = len(in_channels)\n        self.in_channels = in_channels[::-1]\n\n        projects = []\n        for i in range(num_branches):\n            if i != num_branches - 1:\n                projects.append(\n                    DepthWiseSeparableConvNormLayer(\n                        ch_in=self.in_channels[i],\n                        ch_out=self.in_channels[i + 1],\n                        filter_size=3,\n                        stride=1,\n                        dw_act=None,\n                        pw_act='relu',\n                        dw_norm_type=norm_type,\n                        pw_norm_type=norm_type,\n                        freeze_norm=freeze_norm,\n                        norm_decay=norm_decay))\n            else:\n                projects.append(\n                    DepthWiseSeparableConvNormLayer(\n                        ch_in=self.in_channels[i],\n                        ch_out=self.in_channels[i],\n                        filter_size=3,\n                        stride=1,\n                        dw_act=None,\n                        pw_act='relu',\n                        dw_norm_type=norm_type,\n                        pw_norm_type=norm_type,\n                        freeze_norm=freeze_norm,\n                        norm_decay=norm_decay))\n        self.projects = nn.LayerList(projects)\n\n    def forward(self, x):\n        x = x[::-1]\n        y = []\n        last_x = None\n        for i, s in enumerate(x):\n            if last_x is not None:\n                last_x = F.interpolate(\n                    last_x,\n                    size=s.shape[-2:],\n                    mode='bilinear',\n                    align_corners=True)\n                s = s + last_x\n            s = self.projects[i](s)\n            y.append(s)\n            last_x = s\n\n        return y[::-1]\n\n\nclass Stem(nn.Layer):\n    def __init__(self,\n                 in_channel,\n                 stem_channel,\n                 out_channel,\n                 expand_ratio,\n                 norm_type='bn',\n                 freeze_norm=False,\n                 norm_decay=0.):\n        super(Stem, self).__init__()\n        self.conv1 = 
ConvNormLayer(\n            in_channel,\n            stem_channel,\n            filter_size=3,\n            stride=2,\n            norm_type=norm_type,\n            act='relu',\n            freeze_norm=freeze_norm,\n            norm_decay=norm_decay)\n        mid_channel = int(round(stem_channel * expand_ratio))\n        branch_channel = stem_channel // 2\n        if stem_channel == out_channel:\n            inc_channel = out_channel - branch_channel\n        else:\n            inc_channel = out_channel - stem_channel\n        self.branch1 = nn.Sequential(\n            ConvNormLayer(\n                ch_in=branch_channel,\n                ch_out=branch_channel,\n                filter_size=3,\n                stride=2,\n                groups=branch_channel,\n                norm_type=norm_type,\n                freeze_norm=freeze_norm,\n                norm_decay=norm_decay),\n            ConvNormLayer(\n                ch_in=branch_channel,\n                ch_out=inc_channel,\n                filter_size=1,\n                stride=1,\n                norm_type=norm_type,\n                act='relu',\n                freeze_norm=freeze_norm,\n                norm_decay=norm_decay), )\n        self.expand_conv = ConvNormLayer(\n            ch_in=branch_channel,\n            ch_out=mid_channel,\n            filter_size=1,\n            stride=1,\n            norm_type=norm_type,\n            act='relu',\n            freeze_norm=freeze_norm,\n            norm_decay=norm_decay)\n        self.depthwise_conv = ConvNormLayer(\n            ch_in=mid_channel,\n            ch_out=mid_channel,\n            filter_size=3,\n            stride=2,\n            groups=mid_channel,\n            norm_type=norm_type,\n            freeze_norm=freeze_norm,\n            norm_decay=norm_decay)\n        self.linear_conv = ConvNormLayer(\n            ch_in=mid_channel,\n            ch_out=branch_channel\n            if stem_channel == out_channel else stem_channel,\n            filter_size=1,\n            stride=1,\n            norm_type=norm_type,\n            act='relu',\n            freeze_norm=freeze_norm,\n            norm_decay=norm_decay)\n\n    def forward(self, x):\n        x = self.conv1(x)\n        x1, x2 = x.chunk(2, axis=1)\n        x1 = self.branch1(x1)\n        x2 = self.expand_conv(x2)\n        x2 = self.depthwise_conv(x2)\n        x2 = self.linear_conv(x2)\n        out = paddle.concat([x1, x2], axis=1)\n        out = channel_shuffle(out, groups=2)\n\n        return out\n\n\nclass LiteHRNetModule(nn.Layer):\n    def __init__(self,\n                 num_branches,\n                 num_blocks,\n                 in_channels,\n                 reduce_ratio,\n                 module_type,\n                 multiscale_output=False,\n                 with_fuse=True,\n                 norm_type='bn',\n                 freeze_norm=False,\n                 norm_decay=0.):\n        super(LiteHRNetModule, self).__init__()\n        assert num_branches == len(in_channels),\\\n            \"num_branches {} should equal num_in_channels {}\".format(num_branches, len(in_channels))\n        assert module_type in [\n            'LITE', 'NAIVE'\n        ], \"module_type should be one of ['LITE', 'NAIVE']\"\n        self.num_branches = num_branches\n        self.in_channels = in_channels\n        self.multiscale_output = multiscale_output\n        self.with_fuse = with_fuse\n        self.norm_type = norm_type\n        self.module_type = module_type\n\n        if self.module_type == 'LITE':\n            self.layers = 
self._make_weighting_blocks(\n                num_blocks,\n                reduce_ratio,\n                freeze_norm=freeze_norm,\n                norm_decay=norm_decay)\n        elif self.module_type == 'NAIVE':\n            self.layers = self._make_naive_branches(\n                num_branches,\n                num_blocks,\n                freeze_norm=freeze_norm,\n                norm_decay=norm_decay)\n\n        if self.with_fuse:\n            self.fuse_layers = self._make_fuse_layers(\n                freeze_norm=freeze_norm, norm_decay=norm_decay)\n            self.relu = nn.ReLU()\n\n    def _make_weighting_blocks(self,\n                               num_blocks,\n                               reduce_ratio,\n                               stride=1,\n                               freeze_norm=False,\n                               norm_decay=0.):\n        layers = []\n        for i in range(num_blocks):\n            layers.append(\n                ConditionalChannelWeightingBlock(\n                    self.in_channels,\n                    stride=stride,\n                    reduce_ratio=reduce_ratio,\n                    norm_type=self.norm_type,\n                    freeze_norm=freeze_norm,\n                    norm_decay=norm_decay))\n        return nn.Sequential(*layers)\n\n    def _make_naive_branches(self,\n                             num_branches,\n                             num_blocks,\n                             freeze_norm=False,\n                             norm_decay=0.):\n        branches = []\n        for branch_idx in range(num_branches):\n            layers = []\n            for i in range(num_blocks):\n                layers.append(\n                    ShuffleUnit(\n                        self.in_channels[branch_idx],\n                        self.in_channels[branch_idx],\n                        stride=1,\n                        norm_type=self.norm_type,\n                        freeze_norm=freeze_norm,\n                        norm_decay=norm_decay))\n            branches.append(nn.Sequential(*layers))\n        return nn.LayerList(branches)\n\n    def _make_fuse_layers(self, freeze_norm=False, norm_decay=0.):\n        if self.num_branches == 1:\n            return None\n        fuse_layers = []\n        num_out_branches = self.num_branches if self.multiscale_output else 1\n        for i in range(num_out_branches):\n            fuse_layer = []\n            for j in range(self.num_branches):\n                if j > i:\n                    fuse_layer.append(\n                        nn.Sequential(\n                            L.Conv2d(\n                                self.in_channels[j],\n                                self.in_channels[i],\n                                kernel_size=1,\n                                stride=1,\n                                padding=0,\n                                bias=False, ),\n                            nn.BatchNorm2D(self.in_channels[i]),\n                            nn.Upsample(\n                                scale_factor=2**(j - i), mode='nearest')))\n                elif j == i:\n                    fuse_layer.append(None)\n                else:\n                    conv_downsamples = []\n                    for k in range(i - j):\n                        if k == i - j - 1:\n                            conv_downsamples.append(\n                                nn.Sequential(\n                                    L.Conv2d(\n                                        self.in_channels[j],\n                        
                self.in_channels[j],\n                                        kernel_size=3,\n                                        stride=2,\n                                        padding=1,\n                                        groups=self.in_channels[j],\n                                        bias=False, ),\n                                    nn.BatchNorm2D(self.in_channels[j]),\n                                    L.Conv2d(\n                                        self.in_channels[j],\n                                        self.in_channels[i],\n                                        kernel_size=1,\n                                        stride=1,\n                                        padding=0,\n                                        bias=False, ),\n                                    nn.BatchNorm2D(self.in_channels[i])))\n                        else:\n                            conv_downsamples.append(\n                                nn.Sequential(\n                                    L.Conv2d(\n                                        self.in_channels[j],\n                                        self.in_channels[j],\n                                        kernel_size=3,\n                                        stride=2,\n                                        padding=1,\n                                        groups=self.in_channels[j],\n                                        bias=False, ),\n                                    nn.BatchNorm2D(self.in_channels[j]),\n                                    L.Conv2d(\n                                        self.in_channels[j],\n                                        self.in_channels[j],\n                                        kernel_size=1,\n                                        stride=1,\n                                        padding=0,\n                                        bias=False, ),\n                                    nn.BatchNorm2D(self.in_channels[j]),\n                                    nn.ReLU()))\n\n                    fuse_layer.append(nn.Sequential(*conv_downsamples))\n            fuse_layers.append(nn.LayerList(fuse_layer))\n\n        return nn.LayerList(fuse_layers)\n\n    def forward(self, x):\n        if self.num_branches == 1:\n            return [self.layers[0](x[0])]\n        if self.module_type == 'LITE':\n            out = self.layers(x)\n        elif self.module_type == 'NAIVE':\n            for i in range(self.num_branches):\n                x[i] = self.layers[i](x[i])\n            out = x\n        if self.with_fuse:\n            out_fuse = []\n            for i in range(len(self.fuse_layers)):\n                y = out[0] if i == 0 else self.fuse_layers[i][0](out[0])\n                for j in range(self.num_branches):\n                    if j == 0:\n                        # the branch-0 term is counted twice, matching the\n                        # reference Lite-HRNet implementation\n                        y += y\n                    elif i == j:\n                        y += out[j]\n                    else:\n                        y += self.fuse_layers[i][j](out[j])\n                out_fuse.append(self.relu(y))\n            out = out_fuse\n        elif not self.multiscale_output:\n            out = [out[0]]\n        return out\n\n\n@register\nclass LiteHRNet(nn.Layer):\n    \"\"\"\n    @inproceedings{Yulitehrnet21,\n    title={Lite-HRNet: A Lightweight High-Resolution Network},\n        author={Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong},\n        booktitle={CVPR}, year={2021}\n   
 }\n    Args:\n        network_type (str): the network_type should be one of [\"lite_18\", \"lite_30\", \"naive\", \"wider_naive\"],\n            \"naive\": Simply combining the shuffle block in ShuffleNet and the high-resolution design pattern in HRNet.\n            \"wider_naive\": Naive network with wider channels in each block.\n            \"lite_18\": Lite-HRNet-18, which replaces the pointwise convolution in a shuffle block by conditional channel weighting.\n            \"lite_30\": Lite-HRNet-30, with more blocks compared with Lite-HRNet-18.\n        freeze_at (int): the stage to freeze\n        freeze_norm (bool): whether to freeze norm in LiteHRNet\n        norm_decay (float): weight decay for normalization layer weights\n        return_idx (List): the stages to return\n    \"\"\"\n\n    def __init__(self,\n                 network_type,\n                 freeze_at=0,\n                 freeze_norm=True,\n                 norm_decay=0.,\n                 return_idx=[0, 1, 2, 3]):\n        super(LiteHRNet, self).__init__()\n        if isinstance(return_idx, Integral):\n            return_idx = [return_idx]\n        assert network_type in [\"lite_18\", \"lite_30\", \"naive\", \"wider_naive\"], \\\n            \"the network_type should be one of [lite_18, lite_30, naive, wider_naive]\"\n        assert len(return_idx) > 0, \"need one or more return index\"\n        self.freeze_at = freeze_at\n        self.freeze_norm = freeze_norm\n        self.norm_decay = norm_decay\n        self.return_idx = return_idx\n        self.norm_type = 'bn'\n\n        self.module_configs = {\n            \"lite_18\": {\n                \"num_modules\": [2, 4, 2],\n                \"num_branches\": [2, 3, 4],\n                \"num_blocks\": [2, 2, 2],\n                \"module_type\": [\"LITE\", \"LITE\", \"LITE\"],\n                \"reduce_ratios\": [8, 8, 8],\n                \"num_channels\": [[40, 80], [40, 80, 160], [40, 80, 160, 320]],\n            },\n            \"lite_30\": {\n                \"num_modules\": [3, 8, 3],\n                \"num_branches\": [2, 3, 4],\n                \"num_blocks\": [2, 2, 2],\n                \"module_type\": [\"LITE\", \"LITE\", \"LITE\"],\n                \"reduce_ratios\": [8, 8, 8],\n                \"num_channels\": [[40, 80], [40, 80, 160], [40, 80, 160, 320]],\n            },\n            \"naive\": {\n                \"num_modules\": [2, 4, 2],\n                \"num_branches\": [2, 3, 4],\n                \"num_blocks\": [2, 2, 2],\n                \"module_type\": [\"NAIVE\", \"NAIVE\", \"NAIVE\"],\n                \"reduce_ratios\": [1, 1, 1],\n                \"num_channels\": [[30, 60], [30, 60, 120], [30, 60, 120, 240]],\n            },\n            \"wider_naive\": {\n                \"num_modules\": [2, 4, 2],\n                \"num_branches\": [2, 3, 4],\n                \"num_blocks\": [2, 2, 2],\n                \"module_type\": [\"NAIVE\", \"NAIVE\", \"NAIVE\"],\n                \"reduce_ratios\": [1, 1, 1],\n                \"num_channels\": [[40, 80], [40, 80, 160], [40, 80, 160, 320]],\n            },\n        }\n\n        self.stages_config = self.module_configs[network_type]\n\n        # note: the stem keeps its own default norm settings (never frozen)\n        self.stem = Stem(3, 32, 32, 1)\n        num_channels_pre_layer = [32]\n        for stage_idx in range(3):\n            num_channels = self.stages_config[\"num_channels\"][stage_idx]\n            setattr(self, 'transition{}'.format(stage_idx),\n                    self._make_transition_layer(num_channels_pre_layer,\n                                                
num_channels, self.freeze_norm,\n                                                self.norm_decay))\n            stage, num_channels_pre_layer = self._make_stage(\n                self.stages_config, stage_idx, num_channels, True,\n                self.freeze_norm, self.norm_decay)\n            setattr(self, 'stage{}'.format(stage_idx), stage)\n        self.head_layer = IterativeHead(num_channels_pre_layer, 'bn',\n                                        self.freeze_norm, self.norm_decay)\n        # record output specs for out_shape: IterativeHead projects every\n        # branch except the highest-resolution one down to the next finer\n        # branch's channel count, and the branches sit at strides 4/8/16/32\n        self._out_channels = [num_channels_pre_layer[0]\n                              ] + num_channels_pre_layer[:-1]\n        self._out_strides = [4, 8, 16, 32]\n\n    def _make_transition_layer(self,\n                               num_channels_pre_layer,\n                               num_channels_cur_layer,\n                               freeze_norm=False,\n                               norm_decay=0.):\n        num_branches_pre = len(num_channels_pre_layer)\n        num_branches_cur = len(num_channels_cur_layer)\n        transition_layers = []\n        for i in range(num_branches_cur):\n            if i < num_branches_pre:\n                if num_channels_cur_layer[i] != num_channels_pre_layer[i]:\n                    transition_layers.append(\n                        nn.Sequential(\n                            L.Conv2d(\n                                num_channels_pre_layer[i],\n                                num_channels_pre_layer[i],\n                                kernel_size=3,\n                                stride=1,\n                                padding=1,\n                                groups=num_channels_pre_layer[i],\n                                bias=False),\n                            nn.BatchNorm2D(num_channels_pre_layer[i]),\n                            L.Conv2d(\n                                num_channels_pre_layer[i],\n                                num_channels_cur_layer[i],\n                                kernel_size=1,\n                                stride=1,\n                                padding=0,\n                                bias=False, ),\n                            nn.BatchNorm2D(num_channels_cur_layer[i]),\n                            nn.ReLU()))\n                else:\n                    transition_layers.append(None)\n            else:\n                conv_downsamples = []\n                for j in range(i + 1 - num_branches_pre):\n                    conv_downsamples.append(\n                        nn.Sequential(\n                            L.Conv2d(\n                                num_channels_pre_layer[-1],\n                                num_channels_pre_layer[-1],\n                                groups=num_channels_pre_layer[-1],\n                                kernel_size=3,\n                                stride=2,\n                                padding=1,\n                                bias=False, ),\n                            nn.BatchNorm2D(num_channels_pre_layer[-1]),\n                            L.Conv2d(\n                                num_channels_pre_layer[-1],\n                                num_channels_cur_layer[i]\n                                if j == i - num_branches_pre else\n                                num_channels_pre_layer[-1],\n                                kernel_size=1,\n                                stride=1,\n                                padding=0,\n                                bias=False, ),\n                            nn.BatchNorm2D(num_channels_cur_layer[i]\n                                           if j == i - num_branches_pre else\n                                           num_channels_pre_layer[-1]),\n                            
nn.ReLU()))\n                transition_layers.append(nn.Sequential(*conv_downsamples))\n        return nn.LayerList(transition_layers)\n\n    def _make_stage(self,\n                    stages_config,\n                    stage_idx,\n                    in_channels,\n                    multiscale_output,\n                    freeze_norm=False,\n                    norm_decay=0.):\n        num_modules = stages_config[\"num_modules\"][stage_idx]\n        num_branches = stages_config[\"num_branches\"][stage_idx]\n        num_blocks = stages_config[\"num_blocks\"][stage_idx]\n        reduce_ratio = stages_config['reduce_ratios'][stage_idx]\n        module_type = stages_config['module_type'][stage_idx]\n\n        modules = []\n        for i in range(num_modules):\n            if not multiscale_output and i == num_modules - 1:\n                reset_multiscale_output = False\n            else:\n                reset_multiscale_output = True\n            modules.append(\n                LiteHRNetModule(\n                    num_branches,\n                    num_blocks,\n                    in_channels,\n                    reduce_ratio,\n                    module_type,\n                    multiscale_output=reset_multiscale_output,\n                    with_fuse=True,\n                    freeze_norm=freeze_norm,\n                    norm_decay=norm_decay))\n            in_channels = modules[-1].in_channels\n        return nn.Sequential(*modules), in_channels\n\n    def forward(self, inputs):\n        x = inputs['image']\n        dims = x.shape\n        if len(dims) == 5:\n            # fold an extra leading dimension (e.g. frames of a clip) into\n            # the batch, e.g. [2, 3, 3, 128, 96] -> [6, 3, 128, 96]\n            x = paddle.reshape(x, (dims[0] * dims[1], dims[2], dims[3],\n                                   dims[4]))\n\n        x = self.stem(x)\n        y_list = [x]\n        for stage_idx in range(3):\n            x_list = []\n            transition = getattr(self, 'transition{}'.format(stage_idx))\n            for j in range(self.stages_config[\"num_branches\"][stage_idx]):\n                if transition[j] is not None:\n                    # a branch new to this stage is built from the lowest-\n                    # resolution feature of the previous stage\n                    if j >= len(y_list):\n                        x_list.append(transition[j](y_list[-1]))\n                    else:\n                        x_list.append(transition[j](y_list[j]))\n                else:\n                    x_list.append(y_list[j])\n            y_list = getattr(self, 'stage{}'.format(stage_idx))(x_list)\n        x = self.head_layer(y_list)\n        res = []\n        for i, layer in enumerate(x):\n            if i == self.freeze_at:\n                layer.stop_gradient = True\n            if i in self.return_idx:\n                res.append(layer)\n        return res\n\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(\n                channels=self._out_channels[i], stride=self._out_strides[i])\n            for i in self.return_idx\n        ]\n
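\n\nif __name__ == '__main__':\n    # Illustrative usage sketch, not part of the original module; run with\n    # `python -m ppdet.modeling.backbones.lite_hrnet` so the relative imports\n    # resolve. The 256x192 (TinyPose-style) input size is an arbitrary assumption.\n    model = LiteHRNet('lite_18', freeze_norm=False, return_idx=[0])\n    feats = model({'image': paddle.rand([1, 3, 256, 192])})\n    print([f.shape for f in feats])  # expect [[1, 40, 64, 48]]\n"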
  },
  {
    "path": "ppdet/modeling/backbones/mobilenet_v1.py",
    "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#    http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\nfrom paddle.nn.initializer import KaimingNormal\nfrom ppdet.core.workspace import register, serializable\nfrom numbers import Integral\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['MobileNet']\n\n\nclass ConvBNLayer(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size,\n                 stride,\n                 padding,\n                 num_groups=1,\n                 act='relu',\n                 conv_lr=1.,\n                 conv_decay=0.,\n                 norm_decay=0.,\n                 norm_type='bn',\n                 name=None):\n        super(ConvBNLayer, self).__init__()\n        self.act = act\n        self._conv = nn.Conv2D(\n            in_channels,\n            out_channels,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=padding,\n            groups=num_groups,\n            weight_attr=ParamAttr(\n                learning_rate=conv_lr,\n                initializer=KaimingNormal(),\n                regularizer=L2Decay(conv_decay)),\n            bias_attr=False)\n\n        param_attr = ParamAttr(regularizer=L2Decay(norm_decay))\n        bias_attr = ParamAttr(regularizer=L2Decay(norm_decay))\n        if norm_type in ['sync_bn', 'bn']:\n            self._batch_norm = nn.BatchNorm2D(\n                out_channels, weight_attr=param_attr, bias_attr=bias_attr)\n\n    def forward(self, x):\n        x = self._conv(x)\n        x = self._batch_norm(x)\n        if self.act == \"relu\":\n            x = F.relu(x)\n        elif self.act == \"relu6\":\n            x = F.relu6(x)\n        return x\n\n\nclass DepthwiseSeparable(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels1,\n                 out_channels2,\n                 num_groups,\n                 stride,\n                 scale,\n                 conv_lr=1.,\n                 conv_decay=0.,\n                 norm_decay=0.,\n                 norm_type='bn',\n                 name=None):\n        super(DepthwiseSeparable, self).__init__()\n\n        self._depthwise_conv = ConvBNLayer(\n            in_channels,\n            int(out_channels1 * scale),\n            kernel_size=3,\n            stride=stride,\n            padding=1,\n            num_groups=int(num_groups * scale),\n            conv_lr=conv_lr,\n            conv_decay=conv_decay,\n            norm_decay=norm_decay,\n            norm_type=norm_type,\n            name=name + \"_dw\")\n\n        self._pointwise_conv = ConvBNLayer(\n            int(out_channels1 * scale),\n            int(out_channels2 * scale),\n            kernel_size=1,\n 
           stride=1,\n            padding=0,\n            conv_lr=conv_lr,\n            conv_decay=conv_decay,\n            norm_decay=norm_decay,\n            norm_type=norm_type,\n            name=name + \"_sep\")\n\n    def forward(self, x):\n        x = self._depthwise_conv(x)\n        x = self._pointwise_conv(x)\n        return x\n\n\nclass ExtraBlock(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels1,\n                 out_channels2,\n                 num_groups=1,\n                 stride=2,\n                 conv_lr=1.,\n                 conv_decay=0.,\n                 norm_decay=0.,\n                 norm_type='bn',\n                 name=None):\n        super(ExtraBlock, self).__init__()\n\n        self.pointwise_conv = ConvBNLayer(\n            in_channels,\n            int(out_channels1),\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            num_groups=int(num_groups),\n            act='relu6',\n            conv_lr=conv_lr,\n            conv_decay=conv_decay,\n            norm_decay=norm_decay,\n            norm_type=norm_type,\n            name=name + \"_extra1\")\n\n        self.normal_conv = ConvBNLayer(\n            int(out_channels1),\n            int(out_channels2),\n            kernel_size=3,\n            stride=stride,\n            padding=1,\n            num_groups=int(num_groups),\n            act='relu6',\n            conv_lr=conv_lr,\n            conv_decay=conv_decay,\n            norm_decay=norm_decay,\n            norm_type=norm_type,\n            name=name + \"_extra2\")\n\n    def forward(self, x):\n        x = self.pointwise_conv(x)\n        x = self.normal_conv(x)\n        return x\n\n\n@register\n@serializable\nclass MobileNet(nn.Layer):\n    __shared__ = ['norm_type']\n\n    def __init__(self,\n                 norm_type='bn',\n                 norm_decay=0.,\n                 conv_decay=0.,\n                 scale=1,\n                 conv_learning_rate=1.0,\n                 feature_maps=[4, 6, 13],\n                 with_extra_blocks=False,\n                 extra_block_filters=[[256, 512], [128, 256], [128, 256],\n                                      [64, 128]]):\n        super(MobileNet, self).__init__()\n        if isinstance(feature_maps, Integral):\n            feature_maps = [feature_maps]\n        self.feature_maps = feature_maps\n        self.with_extra_blocks = with_extra_blocks\n        self.extra_block_filters = extra_block_filters\n\n        self._out_channels = []\n\n        self.conv1 = ConvBNLayer(\n            in_channels=3,\n            out_channels=int(32 * scale),\n            kernel_size=3,\n            stride=2,\n            padding=1,\n            conv_lr=conv_learning_rate,\n            conv_decay=conv_decay,\n            norm_decay=norm_decay,\n            norm_type=norm_type,\n            name=\"conv1\")\n\n        self.dwsl = []\n        dws21 = self.add_sublayer(\n            \"conv2_1\",\n            sublayer=DepthwiseSeparable(\n                in_channels=int(32 * scale),\n                out_channels1=32,\n                out_channels2=64,\n                num_groups=32,\n                stride=1,\n                scale=scale,\n                conv_lr=conv_learning_rate,\n                conv_decay=conv_decay,\n                norm_decay=norm_decay,\n                norm_type=norm_type,\n                name=\"conv2_1\"))\n        self.dwsl.append(dws21)\n        self._update_out_channels(int(64 * scale), len(self.dwsl), feature_maps)\n       
 dws22 = self.add_sublayer(\n            \"conv2_2\",\n            sublayer=DepthwiseSeparable(\n                in_channels=int(64 * scale),\n                out_channels1=64,\n                out_channels2=128,\n                num_groups=64,\n                stride=2,\n                scale=scale,\n                conv_lr=conv_learning_rate,\n                conv_decay=conv_decay,\n                norm_decay=norm_decay,\n                norm_type=norm_type,\n                name=\"conv2_2\"))\n        self.dwsl.append(dws22)\n        self._update_out_channels(int(128 * scale), len(self.dwsl), feature_maps)\n        # 1/4\n        dws31 = self.add_sublayer(\n            \"conv3_1\",\n            sublayer=DepthwiseSeparable(\n                in_channels=int(128 * scale),\n                out_channels1=128,\n                out_channels2=128,\n                num_groups=128,\n                stride=1,\n                scale=scale,\n                conv_lr=conv_learning_rate,\n                conv_decay=conv_decay,\n                norm_decay=norm_decay,\n                norm_type=norm_type,\n                name=\"conv3_1\"))\n        self.dwsl.append(dws31)\n        self._update_out_channels(int(128 * scale), len(self.dwsl), feature_maps)\n        dws32 = self.add_sublayer(\n            \"conv3_2\",\n            sublayer=DepthwiseSeparable(\n                in_channels=int(128 * scale),\n                out_channels1=128,\n                out_channels2=256,\n                num_groups=128,\n                stride=2,\n                scale=scale,\n                conv_lr=conv_learning_rate,\n                conv_decay=conv_decay,\n                norm_decay=norm_decay,\n                norm_type=norm_type,\n                name=\"conv3_2\"))\n        self.dwsl.append(dws32)\n        self._update_out_channels(int(256 * scale), len(self.dwsl), feature_maps)\n        # 1/8\n        dws41 = self.add_sublayer(\n            \"conv4_1\",\n            sublayer=DepthwiseSeparable(\n                in_channels=int(256 * scale),\n                out_channels1=256,\n                out_channels2=256,\n                num_groups=256,\n                stride=1,\n                scale=scale,\n                conv_lr=conv_learning_rate,\n                conv_decay=conv_decay,\n                norm_decay=norm_decay,\n                norm_type=norm_type,\n                name=\"conv4_1\"))\n        self.dwsl.append(dws41)\n        self._update_out_channels(int(256 * scale), len(self.dwsl), feature_maps)\n        dws42 = self.add_sublayer(\n            \"conv4_2\",\n            sublayer=DepthwiseSeparable(\n                in_channels=int(256 * scale),\n                out_channels1=256,\n                out_channels2=512,\n                num_groups=256,\n                stride=2,\n                scale=scale,\n                conv_lr=conv_learning_rate,\n                conv_decay=conv_decay,\n                norm_decay=norm_decay,\n                norm_type=norm_type,\n                name=\"conv4_2\"))\n        self.dwsl.append(dws42)\n        self._update_out_channels(int(512 * scale), len(self.dwsl), feature_maps)\n        # 1/16\n        for i in range(5):\n            tmp = self.add_sublayer(\n                \"conv5_\" + str(i + 1),\n                sublayer=DepthwiseSeparable(\n                    in_channels=int(512 * scale),\n                    out_channels1=512,\n                    out_channels2=512,\n                    num_groups=512,\n                    stride=1,\n                    
scale=scale,\n                    conv_lr=conv_learning_rate,\n                    conv_decay=conv_decay,\n                    norm_decay=norm_decay,\n                    norm_type=norm_type,\n                    name=\"conv5_\" + str(i + 1)))\n            self.dwsl.append(tmp)\n            self._update_out_channels(int(512 * scale), len(self.dwsl), feature_maps)\n        dws56 = self.add_sublayer(\n            \"conv5_6\",\n            sublayer=DepthwiseSeparable(\n                in_channels=int(512 * scale),\n                out_channels1=512,\n                out_channels2=1024,\n                num_groups=512,\n                stride=2,\n                scale=scale,\n                conv_lr=conv_learning_rate,\n                conv_decay=conv_decay,\n                norm_decay=norm_decay,\n                norm_type=norm_type,\n                name=\"conv5_6\"))\n        self.dwsl.append(dws56)\n        self._update_out_channels(int(1024 * scale), len(self.dwsl), feature_maps)\n        # 1/32\n        dws6 = self.add_sublayer(\n            \"conv6\",\n            sublayer=DepthwiseSeparable(\n                in_channels=int(1024 * scale),\n                out_channels1=1024,\n                out_channels2=1024,\n                num_groups=1024,\n                stride=1,\n                scale=scale,\n                conv_lr=conv_learning_rate,\n                conv_decay=conv_decay,\n                norm_decay=norm_decay,\n                norm_type=norm_type,\n                name=\"conv6\"))\n        self.dwsl.append(dws6)\n        self._update_out_channels(int(1024 * scale), len(self.dwsl), feature_maps)\n\n        if self.with_extra_blocks:\n            self.extra_blocks = []\n            for i, block_filter in enumerate(self.extra_block_filters):\n                in_c = 1024 if i == 0 else self.extra_block_filters[i - 1][1]\n                conv_extra = self.add_sublayer(\n                    \"conv7_\" + str(i + 1),\n                    sublayer=ExtraBlock(\n                        in_c,\n                        block_filter[0],\n                        block_filter[1],\n                        conv_lr=conv_learning_rate,\n                        conv_decay=conv_decay,\n                        norm_decay=norm_decay,\n                        norm_type=norm_type,\n                        name=\"conv7_\" + str(i + 1)))\n                self.extra_blocks.append(conv_extra)\n                self._update_out_channels(\n                    block_filter[1],\n                    len(self.dwsl) + len(self.extra_blocks), feature_maps)\n\n    def _update_out_channels(self, channel, feature_idx, feature_maps):\n        if feature_idx in feature_maps:\n            self._out_channels.append(channel)\n\n    def forward(self, inputs):\n        outs = []\n        y = self.conv1(inputs['image'])\n        for i, block in enumerate(self.dwsl):\n            y = block(y)\n            if i + 1 in self.feature_maps:\n                outs.append(y)\n\n        if not self.with_extra_blocks:\n            return outs\n\n        y = outs[-1]\n        for i, block in enumerate(self.extra_blocks):\n            idx = i + len(self.dwsl)\n            y = block(y)\n            if idx + 1 in self.feature_maps:\n                outs.append(y)\n        return outs\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=c) for c in self._out_channels]\n"
  },
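  {
    "path": "ppdet/modeling/backbones/mobilenet_usage_sketch.py",
    "content": "# Minimal usage sketch (hypothetical file, not part of the upstream repo);\n# assumes paddle and ppdet are importable. It exercises the MobileNet\n# backbone defined in mobilenet.py: feature_maps indices are 1-based\n# positions in the depthwise-separable block list, so [4, 6, 13] picks the\n# 1/8, 1/16 and 1/32 resolution features.\nimport paddle\n\nfrom ppdet.modeling.backbones.mobilenet import MobileNet\n\nif __name__ == '__main__':\n    model = MobileNet(scale=1.0, feature_maps=[4, 6, 13])\n    feats = model({'image': paddle.rand([1, 3, 320, 320])})\n    # out_shape mirrors the selected feature_maps entries.\n    for feat, spec in zip(feats, model.out_shape):\n        print(feat.shape, spec.channels)\n"
  },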
  {
    "path": "ppdet/modeling/backbones/mobilenet_v3.py",
    "content": "# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#    http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\nfrom ppdet.core.workspace import register, serializable\nfrom numbers import Integral\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['MobileNetV3']\n\n\ndef make_divisible(v, divisor=8, min_value=None):\n    if min_value is None:\n        min_value = divisor\n    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)\n    if new_v < 0.9 * v:\n        new_v += divisor\n    return new_v\n\n\nclass ConvBNLayer(nn.Layer):\n    def __init__(self,\n                 in_c,\n                 out_c,\n                 filter_size,\n                 stride,\n                 padding,\n                 num_groups=1,\n                 act=None,\n                 lr_mult=1.,\n                 conv_decay=0.,\n                 norm_type='bn',\n                 norm_decay=0.,\n                 freeze_norm=False,\n                 name=\"\"):\n        super(ConvBNLayer, self).__init__()\n        self.act = act\n        self.conv = nn.Conv2D(\n            in_channels=in_c,\n            out_channels=out_c,\n            kernel_size=filter_size,\n            stride=stride,\n            padding=padding,\n            groups=num_groups,\n            weight_attr=ParamAttr(\n                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),\n            bias_attr=False)\n\n        norm_lr = 0. 
if freeze_norm else lr_mult\n        param_attr = ParamAttr(\n            learning_rate=norm_lr,\n            regularizer=L2Decay(norm_decay),\n            trainable=False if freeze_norm else True)\n        bias_attr = ParamAttr(\n            learning_rate=norm_lr,\n            regularizer=L2Decay(norm_decay),\n            trainable=False if freeze_norm else True)\n        global_stats = True if freeze_norm else None\n        if norm_type in ['sync_bn', 'bn']:\n            self.bn = nn.BatchNorm2D(\n                out_c,\n                weight_attr=param_attr,\n                bias_attr=bias_attr,\n                use_global_stats=global_stats)\n        norm_params = self.bn.parameters()\n        if freeze_norm:\n            for param in norm_params:\n                param.stop_gradient = True\n\n    def forward(self, x):\n        x = self.conv(x)\n        x = self.bn(x)\n        if self.act is not None:\n            if self.act == \"relu\":\n                x = F.relu(x)\n            elif self.act == \"relu6\":\n                x = F.relu6(x)\n            elif self.act == \"hard_swish\":\n                x = F.hardswish(x)\n            else:\n                raise NotImplementedError(\n                    \"The activation function is selected incorrectly.\")\n        return x\n\n\nclass ResidualUnit(nn.Layer):\n    def __init__(self,\n                 in_c,\n                 mid_c,\n                 out_c,\n                 filter_size,\n                 stride,\n                 use_se,\n                 lr_mult,\n                 conv_decay=0.,\n                 norm_type='bn',\n                 norm_decay=0.,\n                 freeze_norm=False,\n                 act=None,\n                 return_list=False,\n                 name=''):\n        super(ResidualUnit, self).__init__()\n        self.if_shortcut = stride == 1 and in_c == out_c\n        self.use_se = use_se\n        self.return_list = return_list\n\n        self.expand_conv = ConvBNLayer(\n            in_c=in_c,\n            out_c=mid_c,\n            filter_size=1,\n            stride=1,\n            padding=0,\n            act=act,\n            lr_mult=lr_mult,\n            conv_decay=conv_decay,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=name + \"_expand\")\n        self.bottleneck_conv = ConvBNLayer(\n            in_c=mid_c,\n            out_c=mid_c,\n            filter_size=filter_size,\n            stride=stride,\n            padding=int((filter_size - 1) // 2),\n            num_groups=mid_c,\n            act=act,\n            lr_mult=lr_mult,\n            conv_decay=conv_decay,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=name + \"_depthwise\")\n        if self.use_se:\n            self.mid_se = SEModule(\n                mid_c, lr_mult, conv_decay, name=name + \"_se\")\n        self.linear_conv = ConvBNLayer(\n            in_c=mid_c,\n            out_c=out_c,\n            filter_size=1,\n            stride=1,\n            padding=0,\n            act=None,\n            lr_mult=lr_mult,\n            conv_decay=conv_decay,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=name + \"_linear\")\n\n    def forward(self, inputs):\n        y = self.expand_conv(inputs)\n        x = self.bottleneck_conv(y)\n        if self.use_se:\n            x = self.mid_se(x)\n        x = 
self.linear_conv(x)\n        if self.if_shortcut:\n            x = paddle.add(inputs, x)\n        if self.return_list:\n            return [y, x]\n        else:\n            return x\n\n\nclass SEModule(nn.Layer):\n    def __init__(self, channel, lr_mult, conv_decay, reduction=4, name=\"\"):\n        super(SEModule, self).__init__()\n        self.avg_pool = nn.AdaptiveAvgPool2D(1)\n        mid_channels = int(channel // reduction)\n        self.conv1 = nn.Conv2D(\n            in_channels=channel,\n            out_channels=mid_channels,\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            weight_attr=ParamAttr(\n                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),\n            bias_attr=ParamAttr(\n                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)))\n        self.conv2 = nn.Conv2D(\n            in_channels=mid_channels,\n            out_channels=channel,\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            weight_attr=ParamAttr(\n                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),\n            bias_attr=ParamAttr(\n                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)))\n\n    def forward(self, inputs):\n        outputs = self.avg_pool(inputs)\n        outputs = self.conv1(outputs)\n        outputs = F.relu(outputs)\n        outputs = self.conv2(outputs)\n        outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5)\n        return paddle.multiply(x=inputs, y=outputs)\n\n\nclass ExtraBlockDW(nn.Layer):\n    def __init__(self,\n                 in_c,\n                 ch_1,\n                 ch_2,\n                 stride,\n                 lr_mult,\n                 conv_decay=0.,\n                 norm_type='bn',\n                 norm_decay=0.,\n                 freeze_norm=False,\n                 name=None):\n        super(ExtraBlockDW, self).__init__()\n        self.pointwise_conv = ConvBNLayer(\n            in_c=in_c,\n            out_c=ch_1,\n            filter_size=1,\n            stride=1,\n            padding='SAME',\n            act='relu6',\n            lr_mult=lr_mult,\n            conv_decay=conv_decay,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=name + \"_extra1\")\n        self.depthwise_conv = ConvBNLayer(\n            in_c=ch_1,\n            out_c=ch_2,\n            filter_size=3,\n            stride=stride,\n            padding='SAME',\n            num_groups=int(ch_1),\n            act='relu6',\n            lr_mult=lr_mult,\n            conv_decay=conv_decay,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=name + \"_extra2_dw\")\n        self.normal_conv = ConvBNLayer(\n            in_c=ch_2,\n            out_c=ch_2,\n            filter_size=1,\n            stride=1,\n            padding='SAME',\n            act='relu6',\n            lr_mult=lr_mult,\n            conv_decay=conv_decay,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=name + \"_extra2_sep\")\n\n    def forward(self, inputs):\n        x = self.pointwise_conv(inputs)\n        x = self.depthwise_conv(x)\n        x = self.normal_conv(x)\n        return x\n\n\n@register\n@serializable\nclass MobileNetV3(nn.Layer):\n    __shared__ = ['norm_type']\n\n    def __init__(\n            self,\n            scale=1.0,\n            
model_name=\"large\",\n            feature_maps=[6, 12, 15],\n            with_extra_blocks=False,\n            extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]],\n            lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],\n            conv_decay=0.0,\n            multiplier=1.0,\n            norm_type='bn',\n            norm_decay=0.0,\n            freeze_norm=False):\n        super(MobileNetV3, self).__init__()\n        if isinstance(feature_maps, Integral):\n            feature_maps = [feature_maps]\n        if norm_type == 'sync_bn' and freeze_norm:\n            raise ValueError(\n                \"The norm_type should not be sync_bn when freeze_norm is True\")\n        self.feature_maps = feature_maps\n        self.with_extra_blocks = with_extra_blocks\n        self.extra_block_filters = extra_block_filters\n\n        inplanes = 16\n        if model_name == \"large\":\n            self.cfg = [\n                # k, exp, c,  se,     nl,  s,\n                [3, 16, 16, False, \"relu\", 1],\n                [3, 64, 24, False, \"relu\", 2],\n                [3, 72, 24, False, \"relu\", 1],\n                [5, 72, 40, True, \"relu\", 2],  # RCNN output\n                [5, 120, 40, True, \"relu\", 1],\n                [5, 120, 40, True, \"relu\", 1],  # YOLOv3 output\n                [3, 240, 80, False, \"hard_swish\", 2],  # RCNN output\n                [3, 200, 80, False, \"hard_swish\", 1],\n                [3, 184, 80, False, \"hard_swish\", 1],\n                [3, 184, 80, False, \"hard_swish\", 1],\n                [3, 480, 112, True, \"hard_swish\", 1],\n                [3, 672, 112, True, \"hard_swish\", 1],  # YOLOv3 output\n                [5, 672, 160, True, \"hard_swish\", 2],  # SSD/SSDLite/RCNN output\n                [5, 960, 160, True, \"hard_swish\", 1],\n                [5, 960, 160, True, \"hard_swish\", 1],  # YOLOv3 output\n            ]\n        elif model_name == \"small\":\n            self.cfg = [\n                # k, exp, c,  se,     nl,  s,\n                [3, 16, 16, True, \"relu\", 2],\n                [3, 72, 24, False, \"relu\", 2],  # RCNN output\n                [3, 88, 24, False, \"relu\", 1],  # YOLOv3 output\n                [5, 96, 40, True, \"hard_swish\", 2],  # RCNN output\n                [5, 240, 40, True, \"hard_swish\", 1],\n                [5, 240, 40, True, \"hard_swish\", 1],\n                [5, 120, 48, True, \"hard_swish\", 1],\n                [5, 144, 48, True, \"hard_swish\", 1],  # YOLOv3 output\n                [5, 288, 96, True, \"hard_swish\", 2],  # SSD/SSDLite/RCNN output\n                [5, 576, 96, True, \"hard_swish\", 1],\n                [5, 576, 96, True, \"hard_swish\", 1],  # YOLOv3 output\n            ]\n        else:\n            raise NotImplementedError(\n                \"mode[{}_model] is not implemented!\".format(model_name))\n\n        if multiplier != 1.0:\n            self.cfg[-3][2] = int(self.cfg[-3][2] * multiplier)\n            self.cfg[-2][1] = int(self.cfg[-2][1] * multiplier)\n            self.cfg[-2][2] = int(self.cfg[-2][2] * multiplier)\n            self.cfg[-1][1] = int(self.cfg[-1][1] * multiplier)\n            self.cfg[-1][2] = int(self.cfg[-1][2] * multiplier)\n\n        self.conv1 = ConvBNLayer(\n            in_c=3,\n            out_c=make_divisible(inplanes * scale),\n            filter_size=3,\n            stride=2,\n            padding=1,\n            num_groups=1,\n            act=\"hard_swish\",\n            lr_mult=lr_mult_list[0],\n            conv_decay=conv_decay,\n         
   norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            name=\"conv1\")\n\n        self._out_channels = []\n        self.block_list = []\n        i = 0\n        inplanes = make_divisible(inplanes * scale)\n        for (k, exp, c, se, nl, s) in self.cfg:\n            lr_idx = min(i // 3, len(lr_mult_list) - 1)\n            lr_mult = lr_mult_list[lr_idx]\n\n            # for SSD/SSDLite, first head input is after ResidualUnit expand_conv\n            return_list = self.with_extra_blocks and i + 2 in self.feature_maps\n\n            block = self.add_sublayer(\n                \"conv\" + str(i + 2),\n                sublayer=ResidualUnit(\n                    in_c=inplanes,\n                    mid_c=make_divisible(scale * exp),\n                    out_c=make_divisible(scale * c),\n                    filter_size=k,\n                    stride=s,\n                    use_se=se,\n                    act=nl,\n                    lr_mult=lr_mult,\n                    conv_decay=conv_decay,\n                    norm_type=norm_type,\n                    norm_decay=norm_decay,\n                    freeze_norm=freeze_norm,\n                    return_list=return_list,\n                    name=\"conv\" + str(i + 2)))\n            self.block_list.append(block)\n            inplanes = make_divisible(scale * c)\n            i += 1\n            self._update_out_channels(\n                make_divisible(scale * exp)\n                if return_list else inplanes, i + 1, feature_maps)\n\n        if self.with_extra_blocks:\n            self.extra_block_list = []\n            extra_out_c = make_divisible(scale * self.cfg[-1][1])\n            lr_idx = min(i // 3, len(lr_mult_list) - 1)\n            lr_mult = lr_mult_list[lr_idx]\n\n            conv_extra = self.add_sublayer(\n                \"conv\" + str(i + 2),\n                sublayer=ConvBNLayer(\n                    in_c=inplanes,\n                    out_c=extra_out_c,\n                    filter_size=1,\n                    stride=1,\n                    padding=0,\n                    num_groups=1,\n                    act=\"hard_swish\",\n                    lr_mult=lr_mult,\n                    conv_decay=conv_decay,\n                    norm_type=norm_type,\n                    norm_decay=norm_decay,\n                    freeze_norm=freeze_norm,\n                    name=\"conv\" + str(i + 2)))\n            self.extra_block_list.append(conv_extra)\n            i += 1\n            self._update_out_channels(extra_out_c, i + 1, feature_maps)\n\n            for j, block_filter in enumerate(self.extra_block_filters):\n                in_c = extra_out_c if j == 0 else self.extra_block_filters[j -\n                                                                           1][1]\n                conv_extra = self.add_sublayer(\n                    \"conv\" + str(i + 2),\n                    sublayer=ExtraBlockDW(\n                        in_c,\n                        block_filter[0],\n                        block_filter[1],\n                        stride=2,\n                        lr_mult=lr_mult,\n                        conv_decay=conv_decay,\n                        norm_type=norm_type,\n                        norm_decay=norm_decay,\n                        freeze_norm=freeze_norm,\n                        name='conv' + str(i + 2)))\n                self.extra_block_list.append(conv_extra)\n                i += 1\n                self._update_out_channels(block_filter[1], i + 1, 
feature_maps)\n\n    def _update_out_channels(self, channel, feature_idx, feature_maps):\n        if feature_idx in feature_maps:\n            self._out_channels.append(channel)\n\n    def forward(self, inputs):\n        x = self.conv1(inputs['image'])\n        outs = []\n        for idx, block in enumerate(self.block_list):\n            x = block(x)\n            if idx + 2 in self.feature_maps:\n                if isinstance(x, list):\n                    outs.append(x[0])\n                    x = x[1]\n                else:\n                    outs.append(x)\n\n        if not self.with_extra_blocks:\n            return outs\n\n        for i, block in enumerate(self.extra_block_list):\n            idx = i + len(self.block_list)\n            x = block(x)\n            if idx + 2 in self.feature_maps:\n                outs.append(x)\n        return outs\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=c) for c in self._out_channels]\n"
  },
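  {
    "path": "ppdet/modeling/backbones/mobilenet_v3_usage_sketch.py",
    "content": "# Minimal usage sketch (hypothetical file, not part of the upstream repo);\n# assumes paddle and ppdet are importable. It exercises the MobileNetV3\n# backbone from mobilenet_v3.py: feature_maps indices start at 2 because\n# conv1 occupies index 1 and the first ResidualUnit is conv2.\nimport paddle\n\nfrom ppdet.modeling.backbones.mobilenet_v3 import MobileNetV3\n\nif __name__ == '__main__':\n    model = MobileNetV3(model_name='large', scale=1.0, feature_maps=[6, 12, 15])\n    feats = model({'image': paddle.rand([1, 3, 320, 320])})\n    print([f.shape for f in feats])\n    print([s.channels for s in model.out_shape])\n"
  },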
  {
    "path": "ppdet/modeling/backbones/mobileone.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nThis code is the paddle implementation of MobileOne block, see: https://arxiv.org/pdf/2206.04040.pdf. \nSome codes are based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py\nThs copyright of microsoft/Swin-Transformer is as follows:\nMIT License [see LICENSE for details]\n\"\"\"\n\nimport paddle\nimport paddle.nn as nn\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\nfrom paddle.nn.initializer import Normal, Constant\n\nfrom ppdet.modeling.ops import get_act_fn\nfrom ppdet.modeling.layers import ConvNormLayer\n\n\nclass MobileOneBlock(nn.Layer):\n    def __init__(\n            self,\n            ch_in,\n            ch_out,\n            stride,\n            kernel_size,\n            conv_num=1,\n            norm_type='bn',\n            norm_decay=0.,\n            norm_groups=32,\n            bias_on=False,\n            lr_scale=1.,\n            freeze_norm=False,\n            initializer=Normal(\n                mean=0., std=0.01),\n            skip_quant=False,\n            act='relu', ):\n        super(MobileOneBlock, self).__init__()\n\n        self.ch_in = ch_in\n        self.ch_out = ch_out\n        self.kernel_size = kernel_size\n        self.stride = stride\n        self.padding = (kernel_size - 1) // 2\n        self.k = conv_num\n\n        self.depth_conv = nn.LayerList()\n        self.point_conv = nn.LayerList()\n        for _ in range(self.k):\n            self.depth_conv.append(\n                ConvNormLayer(\n                    ch_in,\n                    ch_in,\n                    kernel_size,\n                    stride=stride,\n                    groups=ch_in,\n                    norm_type=norm_type,\n                    norm_decay=norm_decay,\n                    norm_groups=norm_groups,\n                    bias_on=bias_on,\n                    lr_scale=lr_scale,\n                    freeze_norm=freeze_norm,\n                    initializer=initializer,\n                    skip_quant=skip_quant))\n            self.point_conv.append(\n                ConvNormLayer(\n                    ch_in,\n                    ch_out,\n                    1,\n                    stride=1,\n                    groups=1,\n                    norm_type=norm_type,\n                    norm_decay=norm_decay,\n                    norm_groups=norm_groups,\n                    bias_on=bias_on,\n                    lr_scale=lr_scale,\n                    freeze_norm=freeze_norm,\n                    initializer=initializer,\n                    skip_quant=skip_quant))\n        self.rbr_1x1 = ConvNormLayer(\n            ch_in,\n            ch_in,\n            1,\n            stride=self.stride,\n            groups=ch_in,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            norm_groups=norm_groups,\n            bias_on=bias_on,\n            lr_scale=lr_scale,\n            
freeze_norm=freeze_norm,\n            initializer=initializer,\n            skip_quant=skip_quant)\n        self.rbr_identity_st1 = nn.BatchNorm2D(\n            num_features=ch_in,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(\n                0.0))) if ch_in == ch_out and self.stride == 1 else None\n        self.rbr_identity_st2 = nn.BatchNorm2D(\n            num_features=ch_out,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(\n                0.0))) if ch_in == ch_out and self.stride == 1 else None\n        self.act = get_act_fn(act) if act is None or isinstance(act, (\n            str, dict)) else act\n\n    def forward(self, x):\n        if hasattr(self, \"conv1\") and hasattr(self, \"conv2\"):\n            y = self.act(self.conv2(self.act(self.conv1(x))))\n        else:\n            if self.rbr_identity_st1 is None:\n                id_out_st1 = 0\n            else:\n                id_out_st1 = self.rbr_identity_st1(x)\n\n            x1_1 = 0\n            for i in range(self.k):\n                x1_1 += self.depth_conv[i](x)\n\n            x1_2 = self.rbr_1x1(x)\n            x1 = self.act(x1_1 + x1_2 + id_out_st1)\n\n            if self.rbr_identity_st2 is None:\n                id_out_st2 = 0\n            else:\n                id_out_st2 = self.rbr_identity_st2(x1)\n\n            x2_1 = 0\n            for i in range(self.k):\n                x2_1 += self.point_conv[i](x1)\n            y = self.act(x2_1 + id_out_st2)\n\n        return y\n\n    def convert_to_deploy(self):\n        if not hasattr(self, 'conv1'):\n            self.conv1 = nn.Conv2D(\n                in_channels=self.ch_in,\n                out_channels=self.ch_in,\n                kernel_size=self.kernel_size,\n                stride=self.stride,\n                padding=self.padding,\n                groups=self.ch_in,\n                bias_attr=ParamAttr(\n                    initializer=Constant(value=0.), learning_rate=1.))\n        if not hasattr(self, 'conv2'):\n            self.conv2 = nn.Conv2D(\n                in_channels=self.ch_in,\n                out_channels=self.ch_out,\n                kernel_size=1,\n                stride=1,\n                padding='SAME',\n                groups=1,\n                bias_attr=ParamAttr(\n                    initializer=Constant(value=0.), learning_rate=1.))\n\n        conv1_kernel, conv1_bias, conv2_kernel, conv2_bias = self.get_equivalent_kernel_bias(\n        )\n        self.conv1.weight.set_value(conv1_kernel)\n        self.conv1.bias.set_value(conv1_bias)\n        self.conv2.weight.set_value(conv2_kernel)\n        self.conv2.bias.set_value(conv2_bias)\n        self.__delattr__('depth_conv')\n        self.__delattr__('point_conv')\n        self.__delattr__('rbr_1x1')\n        if hasattr(self, 'rbr_identity_st1'):\n            self.__delattr__('rbr_identity_st1')\n        if hasattr(self, 'rbr_identity_st2'):\n            self.__delattr__('rbr_identity_st2')\n\n    def get_equivalent_kernel_bias(self):\n        st1_kernel3x3, st1_bias3x3 = self._fuse_bn_tensor(self.depth_conv)\n        st1_kernel1x1, st1_bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)\n        st1_kernelid, st1_biasid = self._fuse_bn_tensor(\n            self.rbr_identity_st1, kernel_size=self.kernel_size)\n\n        st2_kernel1x1, st2_bias1x1 = self._fuse_bn_tensor(self.point_conv)\n        st2_kernelid, st2_biasid = self._fuse_bn_tensor(\n            
self.rbr_identity_st2, kernel_size=1)\n\n        conv1_kernel = st1_kernel3x3 + self._pad_1x1_to_3x3_tensor(\n            st1_kernel1x1) + st1_kernelid\n\n        conv1_bias = st1_bias3x3 + st1_bias1x1 + st1_biasid\n\n        conv2_kernel = st2_kernel1x1 + st2_kernelid\n        conv2_bias = st2_bias1x1 + st2_biasid\n\n        return conv1_kernel, conv1_bias, conv2_kernel, conv2_bias\n\n    def _pad_1x1_to_3x3_tensor(self, kernel1x1):\n        if kernel1x1 is None:\n            return 0\n        else:\n            padding_size = (self.kernel_size - 1) // 2\n            return nn.functional.pad(\n                kernel1x1,\n                [padding_size, padding_size, padding_size, padding_size])\n\n    def _fuse_bn_tensor(self, branch, kernel_size=3):\n        if branch is None:\n            return 0, 0\n\n        if isinstance(branch, nn.LayerList):\n            fused_kernels = []\n            fused_bias = []\n            for block in branch:\n                kernel = block.conv.weight\n                running_mean = block.norm._mean\n                running_var = block.norm._variance\n                gamma = block.norm.weight\n                beta = block.norm.bias\n                eps = block.norm._epsilon\n\n                std = (running_var + eps).sqrt()\n                t = (gamma / std).reshape((-1, 1, 1, 1))\n\n                fused_kernels.append(kernel * t)\n                fused_bias.append(beta - running_mean * gamma / std)\n\n            return sum(fused_kernels), sum(fused_bias)\n\n        elif isinstance(branch, ConvNormLayer):\n            kernel = branch.conv.weight\n            running_mean = branch.norm._mean\n            running_var = branch.norm._variance\n            gamma = branch.norm.weight\n            beta = branch.norm.bias\n            eps = branch.norm._epsilon\n        else:\n            assert isinstance(branch, nn.BatchNorm2D)\n            input_dim = self.ch_in if kernel_size == 1 else 1\n            kernel_value = paddle.zeros(\n                shape=[self.ch_in, input_dim, kernel_size, kernel_size],\n                dtype='float32')\n            if kernel_size > 1:\n                for i in range(self.ch_in):\n                    kernel_value[i, i % input_dim, (kernel_size - 1) // 2, (\n                        kernel_size - 1) // 2] = 1\n            elif kernel_size == 1:\n                for i in range(self.ch_in):\n                    kernel_value[i, i % input_dim, 0, 0] = 1\n            else:\n                raise ValueError(\"Invalid kernel size received!\")\n            kernel = paddle.to_tensor(kernel_value, place=branch.weight.place)\n            running_mean = branch._mean\n            running_var = branch._variance\n            gamma = branch.weight\n            beta = branch.bias\n            eps = branch._epsilon\n\n        std = (running_var + eps).sqrt()\n        t = (gamma / std).reshape((-1, 1, 1, 1))\n\n        return kernel * t, beta - running_mean * gamma / std\n"
  },
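  {
    "path": "ppdet/modeling/backbones/mobileone_reparam_sketch.py",
    "content": "# Minimal sketch (hypothetical file, not part of the upstream repo); assumes\n# paddle and ppdet are importable. It checks the re-parameterization trick in\n# mobileone.py: in eval mode, convert_to_deploy() fuses the multi-branch\n# block into two plain convolutions with numerically matching outputs.\nimport paddle\n\nfrom ppdet.modeling.backbones.mobileone import MobileOneBlock\n\nif __name__ == '__main__':\n    block = MobileOneBlock(ch_in=32, ch_out=64, stride=1, kernel_size=3, conv_num=2)\n    block.eval()  # BN must use running statistics for the fusion to be exact\n    x = paddle.rand([1, 32, 56, 56])\n    y_multi_branch = block(x)\n    block.convert_to_deploy()\n    y_fused = block(x)\n    # The difference should be on the order of float32 rounding error.\n    print(float((y_multi_branch - y_fused).abs().max()))\n"
  },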
  {
    "path": "ppdet/modeling/backbones/name_adapter.py",
    "content": "class NameAdapter(object):\n    \"\"\"Fix the backbones variable names for pretrained weight\"\"\"\n\n    def __init__(self, model):\n        super(NameAdapter, self).__init__()\n        self.model = model\n\n    @property\n    def model_type(self):\n        return getattr(self.model, '_model_type', '')\n\n    @property\n    def variant(self):\n        return getattr(self.model, 'variant', '')\n\n    def fix_conv_norm_name(self, name):\n        if name == \"conv1\":\n            bn_name = \"bn_\" + name\n        else:\n            bn_name = \"bn\" + name[3:]\n        # the naming rule is same as pretrained weight\n        if self.model_type == 'SEResNeXt':\n            bn_name = name + \"_bn\"\n        return bn_name\n\n    def fix_shortcut_name(self, name):\n        if self.model_type == 'SEResNeXt':\n            name = 'conv' + name + '_prj'\n        return name\n\n    def fix_bottleneck_name(self, name):\n        if self.model_type == 'SEResNeXt':\n            conv_name1 = 'conv' + name + '_x1'\n            conv_name2 = 'conv' + name + '_x2'\n            conv_name3 = 'conv' + name + '_x3'\n            shortcut_name = name\n        else:\n            conv_name1 = name + \"_branch2a\"\n            conv_name2 = name + \"_branch2b\"\n            conv_name3 = name + \"_branch2c\"\n            shortcut_name = name + \"_branch1\"\n        return conv_name1, conv_name2, conv_name3, shortcut_name\n\n    def fix_basicblock_name(self, name):\n        if self.model_type == 'SEResNeXt':\n            conv_name1 = 'conv' + name + '_x1'\n            conv_name2 = 'conv' + name + '_x2'\n            shortcut_name = name\n        else:\n            conv_name1 = name + \"_branch2a\"\n            conv_name2 = name + \"_branch2b\"\n            shortcut_name = name + \"_branch1\"\n        return conv_name1, conv_name2, shortcut_name\n\n    def fix_layer_warp_name(self, stage_num, count, i):\n        name = 'res' + str(stage_num)\n        if count > 10 and stage_num == 4:\n            if i == 0:\n                conv_name = name + \"a\"\n            else:\n                conv_name = name + \"b\" + str(i)\n        else:\n            conv_name = name + chr(ord(\"a\") + i)\n        if self.model_type == 'SEResNeXt':\n            conv_name = str(stage_num + 2) + '_' + str(i + 1)\n        return conv_name\n\n    def fix_c1_stage_name(self):\n        return \"res_conv1\" if self.model_type == 'ResNeXt' else \"conv1\"\n"
  },
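  {
    "path": "ppdet/modeling/backbones/name_adapter_usage_sketch.py",
    "content": "# Minimal sketch (hypothetical file, not part of the upstream repo): shows how\n# NameAdapter maps (stage, block) indices to the layer names expected by the\n# pretrained weights. Any object exposing _model_type/variant works as model.\nfrom types import SimpleNamespace\n\nfrom ppdet.modeling.backbones.name_adapter import NameAdapter\n\nif __name__ == '__main__':\n    na = NameAdapter(SimpleNamespace(_model_type='ResNet', variant='b'))\n    print(na.fix_c1_stage_name())            # conv1\n    print(na.fix_layer_warp_name(2, 3, 0))   # res2a\n    print(na.fix_layer_warp_name(4, 23, 2))  # res4b2 (res4 of ResNet-101 has 23 blocks)\n"
  },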
  {
    "path": "ppdet/modeling/backbones/res2net.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom numbers import Integral\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register, serializable\nfrom ..shape_spec import ShapeSpec\nfrom .resnet import ConvNormLayer\n\n__all__ = ['Res2Net', 'Res2NetC5']\n\nRes2Net_cfg = {\n    50: [3, 4, 6, 3],\n    101: [3, 4, 23, 3],\n    152: [3, 8, 36, 3],\n    200: [3, 12, 48, 3]\n}\n\n\nclass BottleNeck(nn.Layer):\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 stride,\n                 shortcut,\n                 width,\n                 scales=4,\n                 variant='b',\n                 groups=1,\n                 lr=1.0,\n                 norm_type='bn',\n                 norm_decay=0.,\n                 freeze_norm=True,\n                 dcn_v2=False):\n        super(BottleNeck, self).__init__()\n\n        self.shortcut = shortcut\n        self.scales = scales\n        self.stride = stride\n        if not shortcut:\n            if variant == 'd' and stride == 2:\n                self.branch1 = nn.Sequential()\n                self.branch1.add_sublayer(\n                    'pool',\n                    nn.AvgPool2D(\n                        kernel_size=2, stride=2, padding=0, ceil_mode=True))\n                self.branch1.add_sublayer(\n                    'conv',\n                    ConvNormLayer(\n                        ch_in=ch_in,\n                        ch_out=ch_out,\n                        filter_size=1,\n                        stride=1,\n                        norm_type=norm_type,\n                        norm_decay=norm_decay,\n                        freeze_norm=freeze_norm,\n                        lr=lr))\n            else:\n                self.branch1 = ConvNormLayer(\n                    ch_in=ch_in,\n                    ch_out=ch_out,\n                    filter_size=1,\n                    stride=stride,\n                    norm_type=norm_type,\n                    norm_decay=norm_decay,\n                    freeze_norm=freeze_norm,\n                    lr=lr)\n\n        self.branch2a = ConvNormLayer(\n            ch_in=ch_in,\n            ch_out=width * scales,\n            filter_size=1,\n            stride=stride if variant == 'a' else 1,\n            groups=1,\n            act='relu',\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            lr=lr)\n\n        self.branch2b = nn.LayerList([\n            ConvNormLayer(\n                ch_in=width,\n                ch_out=width,\n                filter_size=3,\n                stride=1 if variant == 'a' else stride,\n                groups=groups,\n                act='relu',\n                norm_type=norm_type,\n                norm_decay=norm_decay,\n                freeze_norm=freeze_norm,\n                lr=lr,\n                dcn_v2=dcn_v2) for _ in 
range(self.scales - 1)\n        ])\n\n        self.branch2c = ConvNormLayer(\n            ch_in=width * scales,\n            ch_out=ch_out,\n            filter_size=1,\n            stride=1,\n            groups=1,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            lr=lr)\n\n    def forward(self, inputs):\n\n        out = self.branch2a(inputs)\n        feature_split = paddle.split(out, self.scales, 1)\n        out_split = []\n        for i in range(self.scales - 1):\n            if i == 0 or self.stride == 2:\n                out_split.append(self.branch2b[i](feature_split[i]))\n            else:\n                out_split.append(self.branch2b[i](paddle.add(feature_split[i],\n                                                             out_split[-1])))\n        if self.stride == 1:\n            out_split.append(feature_split[-1])\n        else:\n            out_split.append(F.avg_pool2d(feature_split[-1], 3, self.stride, 1))\n        out = self.branch2c(paddle.concat(out_split, 1))\n\n        if self.shortcut:\n            short = inputs\n        else:\n            short = self.branch1(inputs)\n\n        out = paddle.add(out, short)\n        out = F.relu(out)\n\n        return out\n\n\nclass Blocks(nn.Layer):\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 count,\n                 stage_num,\n                 width,\n                 scales=4,\n                 variant='b',\n                 groups=1,\n                 lr=1.0,\n                 norm_type='bn',\n                 norm_decay=0.,\n                 freeze_norm=True,\n                 dcn_v2=False):\n        super(Blocks, self).__init__()\n\n        self.blocks = nn.Sequential()\n        for i in range(count):\n            self.blocks.add_sublayer(\n                str(i),\n                BottleNeck(\n                    ch_in=ch_in if i == 0 else ch_out,\n                    ch_out=ch_out,\n                    stride=2 if i == 0 and stage_num != 2 else 1,\n                    shortcut=False if i == 0 else True,\n                    width=width * (2**(stage_num - 2)),\n                    scales=scales,\n                    variant=variant,\n                    groups=groups,\n                    lr=lr,\n                    norm_type=norm_type,\n                    norm_decay=norm_decay,\n                    freeze_norm=freeze_norm,\n                    dcn_v2=dcn_v2))\n\n    def forward(self, inputs):\n        return self.blocks(inputs)\n\n\n@register\n@serializable\nclass Res2Net(nn.Layer):\n    \"\"\"\n    Res2Net, see https://arxiv.org/abs/1904.01169\n    Args:\n        depth (int): Res2Net depth, should be 50, 101, 152, 200.\n        width (int): base width of each scale branch\n        scales (int): number of scale branches per block\n        variant (str): Res2Net variant, supports 'a', 'b', 'c', 'd' currently\n        lr_mult_list (list): learning rate multipliers of the residual stages (res2-res5);\n                             a lower multiplier is needed for pretrained models obtained\n                             by distillation (default [1.0, 1.0, 1.0, 1.0]).\n        groups (int): The groups number of the Conv Layer.\n        norm_type (str): normalization type, 'bn' or 'sync_bn'\n        norm_decay (float): weight decay for normalization layer weights\n        freeze_norm (bool): freeze normalization layers\n
        freeze_at (int): freeze the backbone at which stage\n        return_idx (list): index of stages whose feature maps are returned,\n                           index 0 stands for res2\n        dcn_v2_stages (list): index of stages that use deformable conv v2\n        num_stages (int): number of stages created\n\n    \"\"\"\n    __shared__ = ['norm_type']\n\n    def __init__(self,\n                 depth=50,\n                 width=26,\n                 scales=4,\n                 variant='b',\n                 lr_mult_list=[1.0, 1.0, 1.0, 1.0],\n                 groups=1,\n                 norm_type='bn',\n                 norm_decay=0.,\n                 freeze_norm=True,\n                 freeze_at=0,\n                 return_idx=[0, 1, 2, 3],\n                 dcn_v2_stages=[-1],\n                 num_stages=4):\n        super(Res2Net, self).__init__()\n\n        self._model_type = 'Res2Net' if groups == 1 else 'Res2NeXt'\n\n        assert depth in [50, 101, 152, 200], \\\n            \"depth {} not in [50, 101, 152, 200]\".format(depth)\n        assert variant in ['a', 'b', 'c', 'd'], \"invalid Res2Net variant\"\n        assert num_stages >= 1 and num_stages <= 4\n\n        self.depth = depth\n        self.variant = variant\n        self.norm_type = norm_type\n        self.norm_decay = norm_decay\n        self.freeze_norm = freeze_norm\n        self.freeze_at = freeze_at\n        if isinstance(return_idx, Integral):\n            return_idx = [return_idx]\n        assert max(return_idx) < num_stages, \\\n            'the maximum return index must be smaller than num_stages, ' \\\n            'but received maximum return index is {} and num_stages ' \\\n            'is {}'.format(max(return_idx), num_stages)\n        self.return_idx = return_idx\n        self.num_stages = num_stages\n        assert len(lr_mult_list) == 4, \\\n            \"lr_mult_list length must be 4 but got {}\".format(len(lr_mult_list))\n        if isinstance(dcn_v2_stages, Integral):\n            dcn_v2_stages = [dcn_v2_stages]\n        assert max(dcn_v2_stages) < num_stages\n        self.dcn_v2_stages = dcn_v2_stages\n\n        block_nums = Res2Net_cfg[depth]\n\n        # C1 stage\n        if self.variant in ['c', 'd']:\n            conv_def = [\n                [3, 32, 3, 2, \"conv1_1\"],\n                [32, 32, 3, 1, \"conv1_2\"],\n                [32, 64, 3, 1, \"conv1_3\"],\n            ]\n        else:\n            conv_def = [[3, 64, 7, 2, \"conv1\"]]\n        self.res1 = nn.Sequential()\n        for (c_in, c_out, k, s, _name) in conv_def:\n            self.res1.add_sublayer(\n                _name,\n                ConvNormLayer(\n                    ch_in=c_in,\n                    ch_out=c_out,\n                    filter_size=k,\n                    stride=s,\n                    groups=1,\n                    act='relu',\n                    norm_type=norm_type,\n                    norm_decay=norm_decay,\n                    freeze_norm=freeze_norm,\n                    lr=1.0))\n\n        self._in_channels = [64, 256, 512, 1024]\n        self._out_channels = [256, 512, 1024, 2048]\n        self._out_strides = [4, 8, 16, 32]\n\n        # C2-C5 stages\n        self.res_layers = []\n        for i in range(num_stages):\n            lr_mult = lr_mult_list[i]\n            stage_num = i + 2\n            self.res_layers.append(\n                self.add_sublayer(\n                    \"res{}\".format(stage_num),\n                    Blocks(\n                        self._in_channels[i],\n                        self._out_channels[i],\n                        count=block_nums[i],\n                        stage_num=stage_num,\n                  
      width=width,\n                        scales=scales,\n                        groups=groups,\n                        lr=lr_mult,\n                        norm_type=norm_type,\n                        norm_decay=norm_decay,\n                        freeze_norm=freeze_norm,\n                        dcn_v2=(i in self.dcn_v2_stages))))\n\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(\n                channels=self._out_channels[i], stride=self._out_strides[i])\n            for i in self.return_idx\n        ]\n\n    def forward(self, inputs):\n        x = inputs['image']\n        res1 = self.res1(x)\n        x = F.max_pool2d(res1, kernel_size=3, stride=2, padding=1)\n        outs = []\n        for idx, stage in enumerate(self.res_layers):\n            x = stage(x)\n            if idx == self.freeze_at:\n                x.stop_gradient = True\n            if idx in self.return_idx:\n                outs.append(x)\n        return outs\n\n\n@register\nclass Res2NetC5(nn.Layer):\n    def __init__(self, depth=50, width=26, scales=4, variant='b'):\n        super(Res2NetC5, self).__init__()\n        feat_in, feat_out = [1024, 2048]\n        self.res5 = Blocks(\n            feat_in,\n            feat_out,\n            count=3,\n            stage_num=5,\n            width=width,\n            scales=scales,\n            variant=variant)\n        self.feat_out = feat_out\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(\n            channels=self.feat_out,\n            stride=32, )]\n\n    def forward(self, roi_feat, stage=0):\n        y = self.res5(roi_feat)\n        return y\n"
  },
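  {
    "path": "ppdet/modeling/backbones/res2net_usage_sketch.py",
    "content": "# Minimal usage sketch (hypothetical file, not part of the upstream repo);\n# assumes paddle and ppdet are importable. Builds the Res2Net-50 backbone\n# from res2net.py and prints the C2-C5 feature maps (strides 4/8/16/32).\nimport paddle\n\nfrom ppdet.modeling.backbones.res2net import Res2Net\n\nif __name__ == '__main__':\n    model = Res2Net(depth=50, width=26, scales=4, return_idx=[0, 1, 2, 3])\n    feats = model({'image': paddle.rand([1, 3, 224, 224])})\n    for feat, spec in zip(feats, model.out_shape):\n        print(feat.shape, spec.channels, spec.stride)\n"
  },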
  {
    "path": "ppdet/modeling/backbones/resnet.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport math\nfrom numbers import Integral\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register, serializable\nfrom paddle.regularizer import L2Decay\nfrom paddle.nn.initializer import Uniform\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import Constant\nfrom paddle.vision.ops import DeformConv2D\nfrom .name_adapter import NameAdapter\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['ResNet', 'Res5Head', 'Blocks', 'BasicBlock', 'BottleNeck']\n\nResNet_cfg = {\n    18: [2, 2, 2, 2],\n    34: [3, 4, 6, 3],\n    50: [3, 4, 6, 3],\n    101: [3, 4, 23, 3],\n    152: [3, 8, 36, 3],\n}\n\n\nclass ConvNormLayer(nn.Layer):\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 filter_size,\n                 stride,\n                 groups=1,\n                 act=None,\n                 norm_type='bn',\n                 norm_decay=0.,\n                 freeze_norm=True,\n                 lr=1.0,\n                 dcn_v2=False):\n        super(ConvNormLayer, self).__init__()\n        assert norm_type in ['bn', 'sync_bn']\n        self.norm_type = norm_type\n        self.act = act\n        self.dcn_v2 = dcn_v2\n\n        if not self.dcn_v2:\n            self.conv = nn.Conv2D(\n                in_channels=ch_in,\n                out_channels=ch_out,\n                kernel_size=filter_size,\n                stride=stride,\n                padding=(filter_size - 1) // 2,\n                groups=groups,\n                weight_attr=ParamAttr(learning_rate=lr),\n                bias_attr=False)\n        else:\n            self.offset_channel = 2 * filter_size**2\n            self.mask_channel = filter_size**2\n\n            self.conv_offset = nn.Conv2D(\n                in_channels=ch_in,\n                out_channels=3 * filter_size**2,\n                kernel_size=filter_size,\n                stride=stride,\n                padding=(filter_size - 1) // 2,\n                weight_attr=ParamAttr(initializer=Constant(0.)),\n                bias_attr=ParamAttr(initializer=Constant(0.)))\n            self.conv = DeformConv2D(\n                in_channels=ch_in,\n                out_channels=ch_out,\n                kernel_size=filter_size,\n                stride=stride,\n                padding=(filter_size - 1) // 2,\n                dilation=1,\n                groups=groups,\n                weight_attr=ParamAttr(learning_rate=lr),\n                bias_attr=False)\n\n        norm_lr = 0. 
if freeze_norm else lr\n        param_attr = ParamAttr(\n            learning_rate=norm_lr,\n            regularizer=L2Decay(norm_decay),\n            trainable=False if freeze_norm else True)\n        bias_attr = ParamAttr(\n            learning_rate=norm_lr,\n            regularizer=L2Decay(norm_decay),\n            trainable=False if freeze_norm else True)\n\n        global_stats = True if freeze_norm else None\n        if norm_type in ['sync_bn', 'bn']:\n            self.norm = nn.BatchNorm2D(\n                ch_out,\n                weight_attr=param_attr,\n                bias_attr=bias_attr,\n                use_global_stats=global_stats)\n        norm_params = self.norm.parameters()\n\n        if freeze_norm:\n            for param in norm_params:\n                param.stop_gradient = True\n\n    def forward(self, inputs):\n        if not self.dcn_v2:\n            out = self.conv(inputs)\n        else:\n            offset_mask = self.conv_offset(inputs)\n            offset, mask = paddle.split(\n                offset_mask,\n                num_or_sections=[self.offset_channel, self.mask_channel],\n                axis=1)\n            mask = F.sigmoid(mask)\n            out = self.conv(inputs, offset, mask=mask)\n\n        if self.norm_type in ['bn', 'sync_bn']:\n            out = self.norm(out)\n        if self.act:\n            out = getattr(F, self.act)(out)\n        return out\n\n\nclass SELayer(nn.Layer):\n    def __init__(self, ch, reduction_ratio=16):\n        super(SELayer, self).__init__()\n        self.pool = nn.AdaptiveAvgPool2D(1)\n        stdv = 1.0 / math.sqrt(ch)\n        c_ = ch // reduction_ratio\n        self.squeeze = nn.Linear(\n            ch,\n            c_,\n            weight_attr=paddle.ParamAttr(initializer=Uniform(-stdv, stdv)),\n            bias_attr=True)\n\n        stdv = 1.0 / math.sqrt(c_)\n        self.extract = nn.Linear(\n            c_,\n            ch,\n            weight_attr=paddle.ParamAttr(initializer=Uniform(-stdv, stdv)),\n            bias_attr=True)\n\n    def forward(self, inputs):\n        out = self.pool(inputs)\n        out = paddle.squeeze(out, axis=[2, 3])\n        out = self.squeeze(out)\n        out = F.relu(out)\n        out = self.extract(out)\n        out = F.sigmoid(out)\n        out = paddle.unsqueeze(out, axis=[2, 3])\n        scale = out * inputs\n        return scale\n\n\nclass BasicBlock(nn.Layer):\n\n    expansion = 1\n\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 stride,\n                 shortcut,\n                 variant='b',\n                 groups=1,\n                 base_width=64,\n                 lr=1.0,\n                 norm_type='bn',\n                 norm_decay=0.,\n                 freeze_norm=True,\n                 dcn_v2=False,\n                 std_senet=False):\n        super(BasicBlock, self).__init__()\n        assert groups == 1 and base_width == 64, 'BasicBlock only supports groups=1 and base_width=64'\n\n        self.shortcut = shortcut\n        if not shortcut:\n            if variant == 'd' and stride == 2:\n                self.short = nn.Sequential()\n                self.short.add_sublayer(\n                    'pool',\n                    nn.AvgPool2D(\n                        kernel_size=2, stride=2, padding=0, ceil_mode=True))\n                self.short.add_sublayer(\n                    'conv',\n                    ConvNormLayer(\n                        ch_in=ch_in,\n                        ch_out=ch_out,\n                        
filter_size=1,\n                        stride=1,\n                        norm_type=norm_type,\n                        norm_decay=norm_decay,\n                        freeze_norm=freeze_norm,\n                        lr=lr))\n            else:\n                self.short = ConvNormLayer(\n                    ch_in=ch_in,\n                    ch_out=ch_out,\n                    filter_size=1,\n                    stride=stride,\n                    norm_type=norm_type,\n                    norm_decay=norm_decay,\n                    freeze_norm=freeze_norm,\n                    lr=lr)\n\n        self.branch2a = ConvNormLayer(\n            ch_in=ch_in,\n            ch_out=ch_out,\n            filter_size=3,\n            stride=stride,\n            act='relu',\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            lr=lr)\n\n        self.branch2b = ConvNormLayer(\n            ch_in=ch_out,\n            ch_out=ch_out,\n            filter_size=3,\n            stride=1,\n            act=None,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            lr=lr,\n            dcn_v2=dcn_v2)\n\n        self.std_senet = std_senet\n        if self.std_senet:\n            self.se = SELayer(ch_out)\n\n    def forward(self, inputs):\n        out = self.branch2a(inputs)\n        out = self.branch2b(out)\n        if self.std_senet:\n            out = self.se(out)\n\n        if self.shortcut:\n            short = inputs\n        else:\n            short = self.short(inputs)\n\n        out = paddle.add(x=out, y=short)\n        out = F.relu(out)\n\n        return out\n\n\nclass BottleNeck(nn.Layer):\n\n    expansion = 4\n\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 stride,\n                 shortcut,\n                 variant='b',\n                 groups=1,\n                 base_width=4,\n                 lr=1.0,\n                 norm_type='bn',\n                 norm_decay=0.,\n                 freeze_norm=True,\n                 dcn_v2=False,\n                 std_senet=False):\n        super(BottleNeck, self).__init__()\n        if variant == 'a':\n            stride1, stride2 = stride, 1\n        else:\n            stride1, stride2 = 1, stride\n\n        # ResNeXt\n        width = int(ch_out * (base_width / 64.)) * groups\n\n        self.branch2a = ConvNormLayer(\n            ch_in=ch_in,\n            ch_out=width,\n            filter_size=1,\n            stride=stride1,\n            groups=1,\n            act='relu',\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            lr=lr)\n\n        self.branch2b = ConvNormLayer(\n            ch_in=width,\n            ch_out=width,\n            filter_size=3,\n            stride=stride2,\n            groups=groups,\n            act='relu',\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            lr=lr,\n            dcn_v2=dcn_v2)\n\n        self.branch2c = ConvNormLayer(\n            ch_in=width,\n            ch_out=ch_out * self.expansion,\n            filter_size=1,\n            stride=1,\n            groups=1,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            lr=lr)\n\n        self.shortcut = shortcut\n        if not shortcut:\n            if variant == 'd' and stride == 2:\n                
self.short = nn.Sequential()\n                self.short.add_sublayer(\n                    'pool',\n                    nn.AvgPool2D(\n                        kernel_size=2, stride=2, padding=0, ceil_mode=True))\n                self.short.add_sublayer(\n                    'conv',\n                    ConvNormLayer(\n                        ch_in=ch_in,\n                        ch_out=ch_out * self.expansion,\n                        filter_size=1,\n                        stride=1,\n                        norm_type=norm_type,\n                        norm_decay=norm_decay,\n                        freeze_norm=freeze_norm,\n                        lr=lr))\n            else:\n                self.short = ConvNormLayer(\n                    ch_in=ch_in,\n                    ch_out=ch_out * self.expansion,\n                    filter_size=1,\n                    stride=stride,\n                    norm_type=norm_type,\n                    norm_decay=norm_decay,\n                    freeze_norm=freeze_norm,\n                    lr=lr)\n\n        self.std_senet = std_senet\n        if self.std_senet:\n            self.se = SELayer(ch_out * self.expansion)\n\n    def forward(self, inputs):\n\n        out = self.branch2a(inputs)\n        out = self.branch2b(out)\n        out = self.branch2c(out)\n\n        if self.std_senet:\n            out = self.se(out)\n\n        if self.shortcut:\n            short = inputs\n        else:\n            short = self.short(inputs)\n\n        out = paddle.add(x=out, y=short)\n        out = F.relu(out)\n\n        return out\n\n\nclass Blocks(nn.Layer):\n    def __init__(self,\n                 block,\n                 ch_in,\n                 ch_out,\n                 count,\n                 name_adapter,\n                 stage_num,\n                 variant='b',\n                 groups=1,\n                 base_width=64,\n                 lr=1.0,\n                 norm_type='bn',\n                 norm_decay=0.,\n                 freeze_norm=True,\n                 dcn_v2=False,\n                 std_senet=False):\n        super(Blocks, self).__init__()\n\n        self.blocks = []\n        for i in range(count):\n            conv_name = name_adapter.fix_layer_warp_name(stage_num, count, i)\n            layer = self.add_sublayer(\n                conv_name,\n                block(\n                    ch_in=ch_in,\n                    ch_out=ch_out,\n                    stride=2 if i == 0 and stage_num != 2 else 1,\n                    shortcut=False if i == 0 else True,\n                    variant=variant,\n                    groups=groups,\n                    base_width=base_width,\n                    lr=lr,\n                    norm_type=norm_type,\n                    norm_decay=norm_decay,\n                    freeze_norm=freeze_norm,\n                    dcn_v2=dcn_v2,\n                    std_senet=std_senet))\n            self.blocks.append(layer)\n            if i == 0:\n                ch_in = ch_out * block.expansion\n\n    def forward(self, inputs):\n        block_out = inputs\n        for block in self.blocks:\n            block_out = block(block_out)\n        return block_out\n\n\n@register\n@serializable\nclass ResNet(nn.Layer):\n    __shared__ = ['norm_type']\n\n    def __init__(self,\n                 depth=50,\n                 ch_in=64,\n                 variant='b',\n                 lr_mult_list=[1.0, 1.0, 1.0, 1.0],\n                 groups=1,\n                 base_width=64,\n                 norm_type='bn',\n                 
norm_decay=0,\n                 freeze_norm=True,\n                 freeze_at=0,\n                 return_idx=[0, 1, 2, 3],\n                 dcn_v2_stages=[-1],\n                 num_stages=4,\n                 std_senet=False,\n                 freeze_stem_only=False):\n        \"\"\"\n        Residual Network, see https://arxiv.org/abs/1512.03385\n        \n        Args:\n            depth (int): ResNet depth, should be 18, 34, 50, 101, 152.\n            ch_in (int): output channel of first stage, default 64\n            variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently\n            lr_mult_list (list): learning rate ratio of different resnet stages (2,3,4,5);\n                                 a lower learning rate ratio is needed for pretrained models\n                                 obtained with distillation (default [1.0, 1.0, 1.0, 1.0]).\n            groups (int): group convolution cardinality\n            base_width (int): base width of each group convolution\n            norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel'\n            norm_decay (float): weight decay for normalization layer weights\n            freeze_norm (bool): freeze normalization layers\n            freeze_at (int): freeze the backbone at which stage\n            return_idx (list): index of the stages whose feature maps are returned\n            dcn_v2_stages (list): index of stages which use deformable conv v2\n            num_stages (int): total num of stages\n            std_senet (bool): whether to use an SE block in each residual block, default False\n            freeze_stem_only (bool): only freeze the stem when freeze_at >= 0, default False\n        \"\"\"\n        super(ResNet, self).__init__()\n        self._model_type = 'ResNet' if groups == 1 else 'ResNeXt'\n        assert num_stages >= 1 and num_stages <= 4\n        self.depth = depth\n        self.variant = variant\n        self.groups = groups\n        self.base_width = base_width\n        self.norm_type = norm_type\n        self.norm_decay = norm_decay\n        self.freeze_norm = freeze_norm\n        self.freeze_at = freeze_at\n        if isinstance(return_idx, Integral):\n            return_idx = [return_idx]\n        assert max(return_idx) < num_stages, \\\n            'the maximum return index must be smaller than num_stages, ' \\\n            'but received maximum return index is {} and num_stages ' \\\n            'is {}'.format(max(return_idx), num_stages)\n        self.return_idx = return_idx\n        self.num_stages = num_stages\n        assert len(lr_mult_list) == 4, \\\n            \"lr_mult_list length must be 4 but got {}\".format(len(lr_mult_list))\n        if isinstance(dcn_v2_stages, Integral):\n            dcn_v2_stages = [dcn_v2_stages]\n        assert max(dcn_v2_stages) < num_stages\n        self.dcn_v2_stages = dcn_v2_stages\n\n        block_nums = ResNet_cfg[depth]\n        na = NameAdapter(self)\n\n        conv1_name = na.fix_c1_stage_name()\n        if variant in ['c', 'd']:\n            conv_def = [\n                [3, ch_in // 2, 3, 2, \"conv1_1\"],\n                [ch_in // 2, ch_in // 2, 3, 1, \"conv1_2\"],\n                [ch_in // 2, ch_in, 3, 1, \"conv1_3\"],\n            ]\n        else:\n            conv_def = [[3, ch_in, 7, 2, conv1_name]]\n        self.conv1 = nn.Sequential()\n        for (c_in, c_out, k, s, _name) in conv_def:\n            self.conv1.add_sublayer(\n                _name,\n                ConvNormLayer(\n                    ch_in=c_in,\n       
             ch_out=c_out,\n                    filter_size=k,\n                    stride=s,\n                    groups=1,\n                    act='relu',\n                    norm_type=norm_type,\n                    norm_decay=norm_decay,\n                    freeze_norm=freeze_norm,\n                    lr=1.0))\n\n        self.ch_in = ch_in\n        ch_out_list = [64, 128, 256, 512]\n        block = BottleNeck if depth >= 50 else BasicBlock\n\n        self._out_channels = [block.expansion * v for v in ch_out_list]\n        self._out_strides = [4, 8, 16, 32]\n\n        self.res_layers = []\n        for i in range(num_stages):\n            lr_mult = lr_mult_list[i]\n            stage_num = i + 2\n            res_name = \"res{}\".format(stage_num)\n            res_layer = self.add_sublayer(\n                res_name,\n                Blocks(\n                    block,\n                    self.ch_in,\n                    ch_out_list[i],\n                    count=block_nums[i],\n                    name_adapter=na,\n                    stage_num=stage_num,\n                    variant=variant,\n                    groups=groups,\n                    base_width=base_width,\n                    lr=lr_mult,\n                    norm_type=norm_type,\n                    norm_decay=norm_decay,\n                    freeze_norm=freeze_norm,\n                    dcn_v2=(i in self.dcn_v2_stages),\n                    std_senet=std_senet))\n            self.res_layers.append(res_layer)\n            self.ch_in = self._out_channels[i]\n\n        if freeze_at >= 0:\n            self._freeze_parameters(self.conv1)\n            if not freeze_stem_only:\n                for i in range(min(freeze_at + 1, num_stages)):\n                    self._freeze_parameters(self.res_layers[i])\n\n    def _freeze_parameters(self, m):\n        for p in m.parameters():\n            p.stop_gradient = True\n\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(\n                channels=self._out_channels[i], stride=self._out_strides[i])\n            for i in self.return_idx\n        ]\n\n    def forward(self, inputs):\n        x = inputs['image']\n        conv1 = self.conv1(x)\n        x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1)\n        outs = []\n        for idx, stage in enumerate(self.res_layers):\n            x = stage(x)\n            if idx in self.return_idx:\n                outs.append(x)\n        return outs\n\n\n@register\nclass Res5Head(nn.Layer):\n    def __init__(self, depth=50):\n        super(Res5Head, self).__init__()\n        feat_in, feat_out = [1024, 512]\n        if depth < 50:\n            feat_in = 256\n        na = NameAdapter(self)\n        block = BottleNeck if depth >= 50 else BasicBlock\n        self.res5 = Blocks(\n            block, feat_in, feat_out, count=3, name_adapter=na, stage_num=5)\n        self.feat_out = feat_out if depth < 50 else feat_out * 4\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(\n            channels=self.feat_out,\n            stride=16, )]\n\n    def forward(self, roi_feat, stage=0):\n        y = self.res5(roi_feat)\n        return y\n"
  },
  {
    "path": "ppdet/modeling/backbones/senet.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport paddle.nn as nn\n\nfrom ppdet.core.workspace import register, serializable\nfrom .resnet import ResNet, Blocks, BasicBlock, BottleNeck\nfrom ..shape_spec import ShapeSpec\nfrom .name_adapter import NameAdapter\n\n__all__ = ['SENet', 'SERes5Head']\n\n\n@register\n@serializable\nclass SENet(ResNet):\n    __shared__ = ['norm_type']\n\n    def __init__(self,\n                 depth=50,\n                 variant='b',\n                 lr_mult_list=[1.0, 1.0, 1.0, 1.0],\n                 groups=1,\n                 base_width=64,\n                 norm_type='bn',\n                 norm_decay=0,\n                 freeze_norm=True,\n                 freeze_at=0,\n                 return_idx=[0, 1, 2, 3],\n                 dcn_v2_stages=[-1],\n                 std_senet=True,\n                 num_stages=4):\n        \"\"\"\n        Squeeze-and-Excitation Networks, see https://arxiv.org/abs/1709.01507\n        \n        Args:\n            depth (int): SENet depth, should be 50, 101, 152\n            variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently\n            lr_mult_list (list): learning rate ratio of different resnet stages(2,3,4,5),\n                                 lower learning rate ratio is need for pretrained model \n                                 got using distillation(default as [1.0, 1.0, 1.0, 1.0]).\n            groups (int): group convolution cardinality\n            base_width (int): base width of each group convolution\n            norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel'\n            norm_decay (float): weight decay for normalization layer weights\n            freeze_norm (bool): freeze normalization layers\n            freeze_at (int): freeze the backbone at which stage\n            return_idx (list): index of the stages whose feature maps are returned\n            dcn_v2_stages (list): index of stages who select deformable conv v2\n            std_senet (bool): whether use senet, default True\n            num_stages (int): total num of stages\n        \"\"\"\n\n        super(SENet, self).__init__(\n            depth=depth,\n            variant=variant,\n            lr_mult_list=lr_mult_list,\n            ch_in=128,\n            groups=groups,\n            base_width=base_width,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            freeze_at=freeze_at,\n            return_idx=return_idx,\n            dcn_v2_stages=dcn_v2_stages,\n            std_senet=std_senet,\n            num_stages=num_stages)\n\n\n@register\nclass SERes5Head(nn.Layer):\n    def __init__(self,\n                 depth=50,\n                 variant='b',\n                 lr_mult=1.0,\n                 groups=1,\n                 base_width=64,\n                 norm_type='bn',\n                 norm_decay=0,\n               
  dcn_v2=False,\n                 freeze_norm=False,\n                 std_senet=True):\n        \"\"\"\n        SERes5Head layer\n\n        Args:\n            depth (int): SENet depth, should be 50, 101, 152\n            variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently\n            lr_mult (float): learning rate ratio of SERes5Head, default 1.0.\n            groups (int): group convolution cardinality\n            base_width (int): base width of each group convolution\n            norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel'\n            norm_decay (float): weight decay for normalization layer weights\n            dcn_v2 (bool): whether to use deformable conv v2, default False\n            freeze_norm (bool): freeze normalization layers, default False\n            std_senet (bool): whether to use an SE block in each residual block, default True\n        \"\"\"\n        super(SERes5Head, self).__init__()\n        ch_out = 512\n        ch_in = 256 if depth < 50 else 1024\n        na = NameAdapter(self)\n        block = BottleNeck if depth >= 50 else BasicBlock\n        self.res5 = Blocks(\n            block,\n            ch_in,\n            ch_out,\n            count=3,\n            name_adapter=na,\n            stage_num=5,\n            variant=variant,\n            groups=groups,\n            base_width=base_width,\n            lr=lr_mult,\n            norm_type=norm_type,\n            norm_decay=norm_decay,\n            freeze_norm=freeze_norm,\n            dcn_v2=dcn_v2,\n            std_senet=std_senet)\n        self.ch_out = ch_out * block.expansion\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(\n            channels=self.ch_out,\n            stride=16, )]\n\n    def forward(self, roi_feat):\n        y = self.res5(roi_feat)\n        return y\n"
  },
  {
    "path": "ppdet/modeling/backbones/shufflenet_v2.py",
    "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#    http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nfrom paddle import ParamAttr\nimport paddle.nn.functional as F\nfrom paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm2D\nfrom paddle.nn.initializer import KaimingNormal\nfrom paddle.regularizer import L2Decay\n\nfrom ppdet.core.workspace import register, serializable\nfrom numbers import Integral\nfrom ..shape_spec import ShapeSpec\nfrom ppdet.modeling.ops import channel_shuffle\n\n__all__ = ['ShuffleNetV2']\n\n\nclass ConvBNLayer(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size,\n                 stride,\n                 padding,\n                 groups=1,\n                 act=None):\n        super(ConvBNLayer, self).__init__()\n        self._conv = Conv2D(\n            in_channels=in_channels,\n            out_channels=out_channels,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=padding,\n            groups=groups,\n            weight_attr=ParamAttr(initializer=KaimingNormal()),\n            bias_attr=False)\n\n        self._batch_norm = BatchNorm2D(\n            out_channels,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n        if act == \"hard_swish\":\n            act = 'hardswish'\n        self.act = act\n\n    def forward(self, inputs):\n        y = self._conv(inputs)\n        y = self._batch_norm(y)\n        if self.act:\n            y = getattr(F, self.act)(y)\n        return y\n\n\nclass InvertedResidual(nn.Layer):\n    def __init__(self, in_channels, out_channels, stride, act=\"relu\"):\n        super(InvertedResidual, self).__init__()\n        self._conv_pw = ConvBNLayer(\n            in_channels=in_channels // 2,\n            out_channels=out_channels // 2,\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            groups=1,\n            act=act)\n        self._conv_dw = ConvBNLayer(\n            in_channels=out_channels // 2,\n            out_channels=out_channels // 2,\n            kernel_size=3,\n            stride=stride,\n            padding=1,\n            groups=out_channels // 2,\n            act=None)\n        self._conv_linear = ConvBNLayer(\n            in_channels=out_channels // 2,\n            out_channels=out_channels // 2,\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            groups=1,\n            act=act)\n\n    def forward(self, inputs):\n        x1, x2 = paddle.split(\n            inputs,\n            num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2],\n            axis=1)\n        x2 = self._conv_pw(x2)\n        x2 = self._conv_dw(x2)\n        x2 = self._conv_linear(x2)\n        out = 
paddle.concat([x1, x2], axis=1)\n        return channel_shuffle(out, 2)\n\n\nclass InvertedResidualDS(nn.Layer):\n    def __init__(self, in_channels, out_channels, stride, act=\"relu\"):\n        super(InvertedResidualDS, self).__init__()\n\n        # branch1\n        self._conv_dw_1 = ConvBNLayer(\n            in_channels=in_channels,\n            out_channels=in_channels,\n            kernel_size=3,\n            stride=stride,\n            padding=1,\n            groups=in_channels,\n            act=None)\n        self._conv_linear_1 = ConvBNLayer(\n            in_channels=in_channels,\n            out_channels=out_channels // 2,\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            groups=1,\n            act=act)\n        # branch2\n        self._conv_pw_2 = ConvBNLayer(\n            in_channels=in_channels,\n            out_channels=out_channels // 2,\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            groups=1,\n            act=act)\n        self._conv_dw_2 = ConvBNLayer(\n            in_channels=out_channels // 2,\n            out_channels=out_channels // 2,\n            kernel_size=3,\n            stride=stride,\n            padding=1,\n            groups=out_channels // 2,\n            act=None)\n        self._conv_linear_2 = ConvBNLayer(\n            in_channels=out_channels // 2,\n            out_channels=out_channels // 2,\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            groups=1,\n            act=act)\n\n    def forward(self, inputs):\n        x1 = self._conv_dw_1(inputs)\n        x1 = self._conv_linear_1(x1)\n        x2 = self._conv_pw_2(inputs)\n        x2 = self._conv_dw_2(x2)\n        x2 = self._conv_linear_2(x2)\n        out = paddle.concat([x1, x2], axis=1)\n\n        return channel_shuffle(out, 2)\n\n\n@register\n@serializable\nclass ShuffleNetV2(nn.Layer):\n    def __init__(self, scale=1.0, act=\"relu\", feature_maps=[5, 13, 17]):\n        super(ShuffleNetV2, self).__init__()\n        self.scale = scale\n        if isinstance(feature_maps, Integral):\n            feature_maps = [feature_maps]\n        self.feature_maps = feature_maps\n        stage_repeats = [4, 8, 4]\n\n        if scale == 0.25:\n            stage_out_channels = [-1, 24, 24, 48, 96, 512]\n        elif scale == 0.33:\n            stage_out_channels = [-1, 24, 32, 64, 128, 512]\n        elif scale == 0.5:\n            stage_out_channels = [-1, 24, 48, 96, 192, 1024]\n        elif scale == 1.0:\n            stage_out_channels = [-1, 24, 116, 232, 464, 1024]\n        elif scale == 1.5:\n            stage_out_channels = [-1, 24, 176, 352, 704, 1024]\n        elif scale == 2.0:\n            stage_out_channels = [-1, 24, 244, 488, 976, 2048]\n        else:\n            raise NotImplementedError(\"This scale size:[\" + str(scale) +\n                                      \"] is not implemented!\")\n        self._out_channels = []\n        self._feature_idx = 0\n        # 1. conv1\n        self._conv1 = ConvBNLayer(\n            in_channels=3,\n            out_channels=stage_out_channels[1],\n            kernel_size=3,\n            stride=2,\n            padding=1,\n            act=act)\n        self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)\n        self._feature_idx += 1\n\n        # 2. 
bottleneck sequences\n        self._block_list = []\n        for stage_id, num_repeat in enumerate(stage_repeats):\n            for i in range(num_repeat):\n                if i == 0:\n                    block = self.add_sublayer(\n                        name=str(stage_id + 2) + '_' + str(i + 1),\n                        sublayer=InvertedResidualDS(\n                            in_channels=stage_out_channels[stage_id + 1],\n                            out_channels=stage_out_channels[stage_id + 2],\n                            stride=2,\n                            act=act))\n                else:\n                    block = self.add_sublayer(\n                        name=str(stage_id + 2) + '_' + str(i + 1),\n                        sublayer=InvertedResidual(\n                            in_channels=stage_out_channels[stage_id + 2],\n                            out_channels=stage_out_channels[stage_id + 2],\n                            stride=1,\n                            act=act))\n                self._block_list.append(block)\n                self._feature_idx += 1\n                self._update_out_channels(stage_out_channels[stage_id + 2],\n                                          self._feature_idx, self.feature_maps)\n\n    def _update_out_channels(self, channel, feature_idx, feature_maps):\n        if feature_idx in feature_maps:\n            self._out_channels.append(channel)\n\n    def forward(self, inputs):\n        y = self._conv1(inputs['image'])\n        y = self._max_pool(y)\n        outs = []\n        for i, inv in enumerate(self._block_list):\n            y = inv(y)\n            if i + 2 in self.feature_maps:\n                outs.append(y)\n\n        return outs\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=c) for c in self._out_channels]\n"
  },
  {
    "path": "ppdet/modeling/backbones/swin_transformer.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nThis code is based on https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py\nThs copyright of microsoft/Swin-Transformer is as follows:\nMIT License [see LICENSE for details]\n\"\"\"\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.modeling.shape_spec import ShapeSpec\nfrom ppdet.core.workspace import register, serializable\nfrom .transformer_utils import DropPath, Identity\nfrom .transformer_utils import add_parameter, to_2tuple\nfrom .transformer_utils import ones_, zeros_, trunc_normal_\n\n__all__ = ['SwinTransformer']\n\nMODEL_cfg = {\n    # use 22kto1k finetune weights as default pretrained, can set by SwinTransformer.pretrained in config\n    'swin_T_224': dict(\n        pretrain_img_size=224,\n        embed_dim=96,\n        depths=[2, 2, 6, 2],\n        num_heads=[3, 6, 12, 24],\n        window_size=7,\n        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams',\n    ),\n    'swin_S_224': dict(\n        pretrain_img_size=224,\n        embed_dim=96,\n        depths=[2, 2, 18, 2],\n        num_heads=[3, 6, 12, 24],\n        window_size=7,\n        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_small_patch4_window7_224_22kto1k_pretrained.pdparams',\n    ),\n    'swin_B_224': dict(\n        pretrain_img_size=224,\n        embed_dim=128,\n        depths=[2, 2, 18, 2],\n        num_heads=[4, 8, 16, 32],\n        window_size=7,\n        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window7_224_22kto1k_pretrained.pdparams',\n    ),\n    'swin_L_224': dict(\n        pretrain_img_size=224,\n        embed_dim=192,\n        depths=[2, 2, 18, 2],\n        num_heads=[6, 12, 24, 48],\n        window_size=7,\n        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window7_224_22kto1k_pretrained.pdparams',\n    ),\n    'swin_B_384': dict(\n        pretrain_img_size=384,\n        embed_dim=128,\n        depths=[2, 2, 18, 2],\n        num_heads=[4, 8, 16, 32],\n        window_size=12,\n        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window12_384_22kto1k_pretrained.pdparams',\n    ),\n    'swin_L_384': dict(\n        pretrain_img_size=384,\n        embed_dim=192,\n        depths=[2, 2, 18, 2],\n        num_heads=[6, 12, 24, 48],\n        window_size=12,\n        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams',\n    ),\n}\n\n\nclass Mlp(nn.Layer):\n    def __init__(self,\n                 in_features,\n                 hidden_features=None,\n                 out_features=None,\n                 act_layer=nn.GELU,\n                 drop=0.):\n        super().__init__()\n        out_features = 
out_features or in_features\n        hidden_features = hidden_features or in_features\n        self.fc1 = nn.Linear(in_features, hidden_features)\n        self.act = act_layer()\n        self.fc2 = nn.Linear(hidden_features, out_features)\n        self.drop = nn.Dropout(drop)\n\n    def forward(self, x):\n        x = self.fc1(x)\n        x = self.act(x)\n        x = self.drop(x)\n        x = self.fc2(x)\n        x = self.drop(x)\n        return x\n\n\ndef window_partition(x, window_size):\n    \"\"\"\n    Args:\n        x: (B, H, W, C)\n        window_size (int): window size\n    Returns:\n        windows: (num_windows*B, window_size, window_size, C)\n    \"\"\"\n    B, H, W, C = x.shape\n    x = x.reshape(\n        [-1, H // window_size, window_size, W // window_size, window_size, C])\n    windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape(\n        [-1, window_size, window_size, C])\n    return windows\n\n\ndef window_reverse(windows, window_size, H, W):\n    \"\"\"\n    Args:\n        windows: (num_windows*B, window_size, window_size, C)\n        window_size (int): Window size\n        H (int): Height of image\n        W (int): Width of image\n    Returns:\n        x: (B, H, W, C)\n    \"\"\"\n    _, _, _, C = windows.shape\n    B = int(windows.shape[0] / (H * W / window_size / window_size))\n    x = windows.reshape(\n        [-1, H // window_size, W // window_size, window_size, window_size, C])\n    x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C])\n    return x\n\n\nclass WindowAttention(nn.Layer):\n    \"\"\" Window based multi-head self attention (W-MSA) module with relative position bias.\n    It supports both of shifted and non-shifted window.\n\n    Args:\n        dim (int): Number of input channels.\n        window_size (tuple[int]): The height and width of the window.\n        num_heads (int): Number of attention heads.\n        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True\n        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set\n        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0\n        proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0\n    \"\"\"\n\n    def __init__(self,\n                 dim,\n                 window_size,\n                 num_heads,\n                 qkv_bias=True,\n                 qk_scale=None,\n                 attn_drop=0.,\n                 proj_drop=0.):\n\n        super().__init__()\n        self.dim = dim\n        self.window_size = window_size  # Wh, Ww\n        self.num_heads = num_heads\n        head_dim = dim // num_heads\n        self.scale = qk_scale or head_dim**-0.5\n\n        # define a parameter table of relative position bias\n        self.relative_position_bias_table = add_parameter(\n            self,\n            paddle.zeros(((2 * window_size[0] - 1) * (2 * window_size[1] - 1),\n                          num_heads)))  # 2*Wh-1 * 2*Ww-1, nH\n\n        # get pair-wise relative position index for each token inside the window\n        coords_h = paddle.arange(self.window_size[0])\n        coords_w = paddle.arange(self.window_size[1])\n        coords = paddle.stack(paddle.meshgrid(\n            [coords_h, coords_w]))  # 2, Wh, Ww\n        coords_flatten = paddle.flatten(coords, 1)  # 2, Wh*Ww\n        coords_flatten_1 = coords_flatten.unsqueeze(axis=2)\n        coords_flatten_2 = coords_flatten.unsqueeze(axis=1)\n        relative_coords = coords_flatten_1 - coords_flatten_2\n        relative_coords = relative_coords.transpose(\n            [1, 2, 0])  # Wh*Ww, Wh*Ww, 2\n        relative_coords[:, :, 0] += self.window_size[\n            0] - 1  # shift to start from 0\n        relative_coords[:, :, 1] += self.window_size[1] - 1\n        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1\n        self.relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww\n\n        self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)\n        self.attn_drop = nn.Dropout(attn_drop)\n        self.proj = nn.Linear(dim, dim)\n        self.proj_drop = nn.Dropout(proj_drop)\n\n        trunc_normal_(self.relative_position_bias_table)\n        self.softmax = nn.Softmax(axis=-1)\n\n    def forward(self, x, mask=None):\n        \"\"\" Forward function.\n        Args:\n            x: input features with shape of (num_windows*B, N, C)\n            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None\n        \"\"\"\n        B_, N, C = x.shape\n        qkv = self.qkv(x).reshape(\n            [-1, N, 3, self.num_heads, C // self.num_heads]).transpose(\n                [2, 0, 3, 1, 4])\n        q, k, v = qkv[0], qkv[1], qkv[2]\n\n        q = q * self.scale\n        attn = paddle.mm(q, k.transpose([0, 1, 3, 2]))\n\n        index = self.relative_position_index.flatten()\n\n        relative_position_bias = paddle.index_select(\n            self.relative_position_bias_table, index)\n        relative_position_bias = relative_position_bias.reshape([\n            self.window_size[0] * self.window_size[1],\n            self.window_size[0] * self.window_size[1], -1\n        ])  # Wh*Ww,Wh*Ww,nH\n        relative_position_bias = relative_position_bias.transpose(\n            [2, 0, 1])  # nH, Wh*Ww, Wh*Ww\n        attn = attn + relative_position_bias.unsqueeze(0)\n\n        if mask is not None:\n            nW = mask.shape[0]\n            attn = attn.reshape([-1, nW, self.num_heads, N, N\n                                 ]) + mask.unsqueeze(1).unsqueeze(0)\n            attn = attn.reshape([-1, self.num_heads, N, N])\n            attn = self.softmax(attn)\n        else:\n            attn = self.softmax(attn)\n\n        attn = self.attn_drop(attn)\n\n        # x = (attn @ 
v).transpose(1, 2).reshape([B_, N, C])\n        x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([-1, N, C])\n        x = self.proj(x)\n        x = self.proj_drop(x)\n        return x\n\n\nclass SwinTransformerBlock(nn.Layer):\n    \"\"\" Swin Transformer Block.\n    Args:\n        dim (int): Number of input channels.\n        num_heads (int): Number of attention heads.\n        window_size (int): Window size.\n        shift_size (int): Shift size for SW-MSA.\n        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.\n        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True\n        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.\n        drop (float, optional): Dropout rate. Default: 0.0\n        attn_drop (float, optional): Attention dropout rate. Default: 0.0\n        drop_path (float, optional): Stochastic depth rate. Default: 0.0\n        act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU\n        norm_layer (nn.Layer, optional): Normalization layer.  Default: nn.LayerNorm\n    \"\"\"\n\n    def __init__(self,\n                 dim,\n                 num_heads,\n                 window_size=7,\n                 shift_size=0,\n                 mlp_ratio=4.,\n                 qkv_bias=True,\n                 qk_scale=None,\n                 drop=0.,\n                 attn_drop=0.,\n                 drop_path=0.,\n                 act_layer=nn.GELU,\n                 norm_layer=nn.LayerNorm):\n        super().__init__()\n        self.dim = dim\n        self.num_heads = num_heads\n        self.window_size = window_size\n        self.shift_size = shift_size\n        self.mlp_ratio = mlp_ratio\n        assert 0 <= self.shift_size < self.window_size, \"shift_size must in 0-window_size\"\n\n        self.norm1 = norm_layer(dim)\n        self.attn = WindowAttention(\n            dim,\n            window_size=to_2tuple(self.window_size),\n            num_heads=num_heads,\n            qkv_bias=qkv_bias,\n            qk_scale=qk_scale,\n            attn_drop=attn_drop,\n            proj_drop=drop)\n\n        self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity()\n        self.norm2 = norm_layer(dim)\n        mlp_hidden_dim = int(dim * mlp_ratio)\n        self.mlp = Mlp(in_features=dim,\n                       hidden_features=mlp_hidden_dim,\n                       act_layer=act_layer,\n                       drop=drop)\n\n        self.H = None\n        self.W = None\n\n    def forward(self, x, mask_matrix):\n        \"\"\" Forward function.\n        Args:\n            x: Input feature, tensor size (B, H*W, C).\n            H, W: Spatial resolution of the input feature.\n            mask_matrix: Attention mask for cyclic shift.\n        \"\"\"\n        B, L, C = x.shape\n        H, W = self.H, self.W\n        assert L == H * W, \"input feature has wrong size\"\n\n        shortcut = x\n        x = self.norm1(x)\n        x = x.reshape([-1, H, W, C])\n\n        # pad feature maps to multiples of window size\n        pad_l = pad_t = 0\n        pad_r = (self.window_size - W % self.window_size) % self.window_size\n        pad_b = (self.window_size - H % self.window_size) % self.window_size\n        x = F.pad(x, [0, pad_l, 0, pad_b, 0, pad_r, 0, pad_t],\n                  data_format='NHWC')\n        _, Hp, Wp, _ = x.shape\n\n        # cyclic shift\n        if self.shift_size > 0:\n            shifted_x = paddle.roll(\n                x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2))\n            attn_mask = mask_matrix\n        else:\n            shifted_x = x\n            attn_mask = None\n\n        # partition windows\n        x_windows = window_partition(\n            shifted_x, self.window_size)  # nW*B, window_size, window_size, C\n        x_windows = x_windows.reshape(\n            [x_windows.shape[0], self.window_size * self.window_size,\n             C])  # nW*B, window_size*window_size, C\n\n        # W-MSA/SW-MSA\n        attn_windows = self.attn(\n            x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C\n\n        # merge windows\n        attn_windows = attn_windows.reshape(\n            [x_windows.shape[0], self.window_size, self.window_size, C])\n        shifted_x = window_reverse(attn_windows, self.window_size, Hp,\n                                   Wp)  # B H' W' C\n\n        # reverse cyclic shift\n        if self.shift_size > 0:\n            x = paddle.roll(\n                shifted_x,\n                shifts=(self.shift_size, self.shift_size),\n                axis=(1, 2))\n        else:\n            x = shifted_x\n\n        if pad_r > 0 or pad_b > 0:\n            x = x[:, :H, :W, :]\n\n        x = x.reshape([-1, H * W, C])\n\n        # FFN\n        x = shortcut + self.drop_path(x)\n        x = x + self.drop_path(self.mlp(self.norm2(x)))\n\n        return x\n\n\nclass PatchMerging(nn.Layer):\n    r\"\"\" Patch Merging Layer.\n    Args:\n        dim (int): Number of input channels.\n        norm_layer (nn.Layer, optional): Normalization layer.  
Default: nn.LayerNorm\n    \"\"\"\n\n    def __init__(self, dim, norm_layer=nn.LayerNorm):\n        super().__init__()\n        self.dim = dim\n        self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False)\n        self.norm = norm_layer(4 * dim)\n\n    def forward(self, x, H, W):\n        \"\"\" Forward function.\n        Args:\n            x: Input feature, tensor size (B, H*W, C).\n            H, W: Spatial resolution of the input feature.\n        \"\"\"\n        B, L, C = x.shape\n        assert L == H * W, \"input feature has wrong size\"\n\n        x = x.reshape([-1, H, W, C])\n\n        # padding\n        pad_input = (H % 2 == 1) or (W % 2 == 1)\n        if pad_input:\n            # paddle F.pad default data_format is 'NCHW'\n            x = F.pad(x, [0, 0, 0, H % 2, 0, W % 2, 0, 0], data_format='NHWC')\n            H += H % 2\n            W += W % 2\n\n        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C\n        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C\n        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C\n        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C\n        x = paddle.concat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C\n        x = x.reshape([-1, H * W // 4, 4 * C])  # B H/2*W/2 4*C\n\n        x = self.norm(x)\n        x = self.reduction(x)\n\n        return x\n\n\nclass BasicLayer(nn.Layer):\n    \"\"\" A basic Swin Transformer layer for one stage.\n    Args:\n        dim (int): Number of input channels.\n        depth (int): Number of blocks.\n        num_heads (int): Number of attention heads.\n        window_size (int): Local window size.\n        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.\n        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True\n        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.\n        drop (float, optional): Dropout rate. Default: 0.0\n        attn_drop (float, optional): Attention dropout rate. Default: 0.0\n        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0\n        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm\n        downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. 
Default: None\n    \"\"\"\n\n    def __init__(self,\n                 dim,\n                 depth,\n                 num_heads,\n                 window_size=7,\n                 mlp_ratio=4.,\n                 qkv_bias=True,\n                 qk_scale=None,\n                 drop=0.,\n                 attn_drop=0.,\n                 drop_path=0.,\n                 norm_layer=nn.LayerNorm,\n                 downsample=None):\n        super().__init__()\n        self.window_size = window_size\n        self.shift_size = window_size // 2\n        self.depth = depth\n\n        # build blocks\n        self.blocks = nn.LayerList([\n            SwinTransformerBlock(\n                dim=dim,\n                num_heads=num_heads,\n                window_size=window_size,\n                shift_size=0 if (i % 2 == 0) else window_size // 2,\n                mlp_ratio=mlp_ratio,\n                qkv_bias=qkv_bias,\n                qk_scale=qk_scale,\n                drop=drop,\n                attn_drop=attn_drop,\n                drop_path=drop_path[i]\n                if isinstance(drop_path, np.ndarray) else drop_path,\n                norm_layer=norm_layer) for i in range(depth)\n        ])\n\n        # patch merging layer\n        if downsample is not None:\n            self.downsample = downsample(dim=dim, norm_layer=norm_layer)\n        else:\n            self.downsample = None\n\n    def forward(self, x, H, W):\n        \"\"\" Forward function.\n        Args:\n            x: Input feature, tensor size (B, H*W, C).\n            H, W: Spatial resolution of the input feature.\n        \"\"\"\n\n        # calculate attention mask for SW-MSA\n        Hp = int(np.ceil(H / self.window_size)) * self.window_size\n        Wp = int(np.ceil(W / self.window_size)) * self.window_size\n        img_mask = paddle.zeros([1, Hp, Wp, 1], dtype='float32')  # 1 Hp Wp 1\n        h_slices = (slice(0, -self.window_size),\n                    slice(-self.window_size, -self.shift_size),\n                    slice(-self.shift_size, None))\n        w_slices = (slice(0, -self.window_size),\n                    slice(-self.window_size, -self.shift_size),\n                    slice(-self.shift_size, None))\n        cnt = 0\n        for h in h_slices:\n            for w in w_slices:\n                img_mask[:, h, w, :] = cnt\n\n                cnt += 1\n\n        mask_windows = window_partition(\n            img_mask, self.window_size)  # nW, window_size, window_size, 1\n        mask_windows = mask_windows.reshape(\n            [-1, self.window_size * self.window_size])\n        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)\n        huns = -100.0 * paddle.ones_like(attn_mask)\n        attn_mask = huns * (attn_mask != 0).astype(\"float32\")\n\n        for blk in self.blocks:\n            blk.H, blk.W = H, W\n            x = blk(x, attn_mask)\n        if self.downsample is not None:\n            x_down = self.downsample(x, H, W)\n            Wh, Ww = (H + 1) // 2, (W + 1) // 2\n            return x, H, W, x_down, Wh, Ww\n        else:\n            return x, H, W, x, H, W\n\n\nclass PatchEmbed(nn.Layer):\n    \"\"\" Image to Patch Embedding\n    Args:\n        patch_size (int): Patch token size. Default: 4.\n        in_chans (int): Number of input image channels. Default: 3.\n        embed_dim (int): Number of linear projection output channels. Default: 96.\n        norm_layer (nn.Layer, optional): Normalization layer. 
Default: None\n    \"\"\"\n\n    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):\n        super().__init__()\n        patch_size = to_2tuple(patch_size)\n        self.patch_size = patch_size\n\n        self.in_chans = in_chans\n        self.embed_dim = embed_dim\n\n        self.proj = nn.Conv2D(\n            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)\n        if norm_layer is not None:\n            self.norm = norm_layer(embed_dim)\n        else:\n            self.norm = None\n\n    def forward(self, x):\n        # TODO: export dynamic shape\n        B, C, H, W = x.shape\n        # assert [H, W] == self.img_size[:2], \"Input image size ({H}*{W}) doesn't match model ({}*{}).\".format(H, W, self.img_size[0], self.img_size[1])\n        if W % self.patch_size[1] != 0:\n            x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0])\n        if H % self.patch_size[0] != 0:\n            x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]])\n\n        x = self.proj(x)\n        if self.norm is not None:\n            _, _, Wh, Ww = x.shape\n            x = x.flatten(2).transpose([0, 2, 1])\n            x = self.norm(x)\n            x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww])\n\n        return x\n\n\n@register\n@serializable\nclass SwinTransformer(nn.Layer):\n    \"\"\" Swin Transformer backbone\n    Args:\n        arch (str): Architecture of Swin Transformer, one of the keys of MODEL_cfg\n        pretrain_img_size (int | tuple(int)): Input image size. Default 224\n        patch_size (int | tuple(int)): Patch size. Default: 4\n        in_chans (int): Number of input image channels. Default: 3\n        embed_dim (int): Patch embedding dimension. Default: 96\n        depths (tuple(int)): Depth of each Swin Transformer layer.\n        num_heads (tuple(int)): Number of attention heads in different layers.\n        window_size (int): Window size. Default: 7\n        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4\n        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True\n        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None\n        drop_rate (float): Dropout rate. Default: 0\n        attn_drop_rate (float): Attention dropout rate. Default: 0\n        drop_path_rate (float): Stochastic depth rate. Default: 0.2\n        norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.\n        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False\n        patch_norm (bool): If True, add normalization after patch embedding. 
Default: True\n    \"\"\"\n\n    def __init__(self,\n                 arch='swin_T_224',\n                 pretrain_img_size=224,\n                 patch_size=4,\n                 in_chans=3,\n                 embed_dim=96,\n                 depths=[2, 2, 6, 2],\n                 num_heads=[3, 6, 12, 24],\n                 window_size=7,\n                 mlp_ratio=4.,\n                 qkv_bias=True,\n                 qk_scale=None,\n                 drop_rate=0.,\n                 attn_drop_rate=0.,\n                 drop_path_rate=0.2,\n                 norm_layer=nn.LayerNorm,\n                 ape=False,\n                 patch_norm=True,\n                 out_indices=(0, 1, 2, 3),\n                 frozen_stages=-1,\n                 pretrained=None):\n        super(SwinTransformer, self).__init__()\n        assert arch in MODEL_cfg.keys(), \"Unsupported arch: {}\".format(arch)\n\n        pretrain_img_size = MODEL_cfg[arch]['pretrain_img_size']\n        embed_dim = MODEL_cfg[arch]['embed_dim']\n        depths = MODEL_cfg[arch]['depths']\n        num_heads = MODEL_cfg[arch]['num_heads']\n        window_size = MODEL_cfg[arch]['window_size']\n        if pretrained is None:\n            pretrained = MODEL_cfg[arch]['pretrained']\n\n        self.num_layers = len(depths)\n        self.ape = ape\n        self.patch_norm = patch_norm\n        self.out_indices = out_indices\n        self.frozen_stages = frozen_stages\n\n        # split image into non-overlapping patches\n        self.patch_embed = PatchEmbed(\n            patch_size=patch_size,\n            in_chans=in_chans,\n            embed_dim=embed_dim,\n            norm_layer=norm_layer if self.patch_norm else None)\n\n        # absolute position embedding\n        if self.ape:\n            pretrain_img_size = to_2tuple(pretrain_img_size)\n            patch_size = to_2tuple(patch_size)\n            patches_resolution = [\n                pretrain_img_size[0] // patch_size[0],\n                pretrain_img_size[1] // patch_size[1]\n            ]\n\n            self.absolute_pos_embed = add_parameter(\n                self,\n                paddle.zeros((1, embed_dim, patches_resolution[0],\n                              patches_resolution[1])))\n            trunc_normal_(self.absolute_pos_embed)\n\n        self.pos_drop = nn.Dropout(p=drop_rate)\n\n        # stochastic depth\n        dpr = np.linspace(0, drop_path_rate,\n                          sum(depths))  # stochastic depth decay rule\n\n        # build layers\n        self.layers = nn.LayerList()\n        for i_layer in range(self.num_layers):\n            layer = BasicLayer(\n                dim=int(embed_dim * 2**i_layer),\n                depth=depths[i_layer],\n                num_heads=num_heads[i_layer],\n                window_size=window_size,\n                mlp_ratio=mlp_ratio,\n                qkv_bias=qkv_bias,\n                qk_scale=qk_scale,\n                drop=drop_rate,\n                attn_drop=attn_drop_rate,\n                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],\n                norm_layer=norm_layer,\n                downsample=PatchMerging\n                if (i_layer < self.num_layers - 1) else None)\n            self.layers.append(layer)\n\n        num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]\n        self.num_features = num_features\n\n        # add a norm layer for each output\n        for i_layer in out_indices:\n            layer = norm_layer(num_features[i_layer])\n            layer_name = 
f'norm{i_layer}'\n            self.add_sublayer(layer_name, layer)\n\n        self.apply(self._init_weights)\n        self._freeze_stages()\n        if pretrained:\n            if 'http' in pretrained:  #URL\n                path = paddle.utils.download.get_weights_path_from_url(\n                    pretrained)\n            else:  #model in local path\n                path = pretrained\n            self.set_state_dict(paddle.load(path))\n\n    def _freeze_stages(self):\n        if self.frozen_stages >= 0:\n            self.patch_embed.eval()\n            for param in self.patch_embed.parameters():\n                param.stop_gradient = True\n\n        if self.frozen_stages >= 1 and self.ape:\n            self.absolute_pos_embed.stop_gradient = True\n\n        if self.frozen_stages >= 2:\n            self.pos_drop.eval()\n            for i in range(0, self.frozen_stages - 1):\n                m = self.layers[i]\n                m.eval()\n                for param in m.parameters():\n                    param.stop_gradient = True\n\n    def _init_weights(self, m):\n        if isinstance(m, nn.Linear):\n            trunc_normal_(m.weight)\n            if isinstance(m, nn.Linear) and m.bias is not None:\n                zeros_(m.bias)\n        elif isinstance(m, nn.LayerNorm):\n            zeros_(m.bias)\n            ones_(m.weight)\n\n    def forward(self, x):\n        \"\"\"Forward function.\"\"\"\n        x = self.patch_embed(x['image'])\n        B, _, Wh, Ww = x.shape\n        if self.ape:\n            # interpolate the position embedding to the corresponding size\n            absolute_pos_embed = F.interpolate(\n                self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')\n            x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1])\n        else:\n            x = x.flatten(2).transpose([0, 2, 1])\n        x = self.pos_drop(x)\n        outs = []\n        for i in range(self.num_layers):\n            layer = self.layers[i]\n            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)\n            if i in self.out_indices:\n                norm_layer = getattr(self, f'norm{i}')\n                x_out = norm_layer(x_out)\n                out = x_out.reshape((-1, H, W, self.num_features[i])).transpose(\n                    (0, 3, 1, 2))\n                outs.append(out)\n\n        return outs\n\n    @property\n    def out_shape(self):\n        out_strides = [4, 8, 16, 32]\n        return [\n            ShapeSpec(\n                channels=self.num_features[i], stride=out_strides[i])\n            for i in self.out_indices\n        ]\n"
  },
  {
    "path": "ppdet/modeling/backbones/trans_encoder.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn import ReLU, Swish, GELU\nimport math\n\nfrom ppdet.core.workspace import register\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['TransEncoder']\n\n\nclass BertEmbeddings(nn.Layer):\n    def __init__(self, word_size, position_embeddings_size, word_type_size,\n                 hidden_size, dropout_prob):\n        super(BertEmbeddings, self).__init__()\n        self.word_embeddings = nn.Embedding(\n            word_size, hidden_size, padding_idx=0)\n        self.position_embeddings = nn.Embedding(position_embeddings_size,\n                                                hidden_size)\n        self.token_type_embeddings = nn.Embedding(word_type_size, hidden_size)\n        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)\n        self.dropout = nn.Dropout(dropout_prob)\n\n    def forward(self, x, token_type_ids=None, position_ids=None):\n        seq_len = x.shape[1]\n        if position_ids is None:\n            position_ids = paddle.arange(seq_len).unsqueeze(0).expand_as(x)\n        if token_type_ids is None:\n            token_type_ids = paddle.zeros(x.shape)\n\n        word_embs = self.word_embeddings(x)\n        position_embs = self.position_embeddings(position_ids)\n        token_type_embs = self.token_type_embeddings(token_type_ids)\n\n        embs_cmb = word_embs + position_embs + token_type_embs\n        embs_out = self.layernorm(embs_cmb)\n        embs_out = self.dropout(embs_out)\n        return embs_out\n\n\nclass BertSelfAttention(nn.Layer):\n    def __init__(self,\n                 hidden_size,\n                 num_attention_heads,\n                 attention_probs_dropout_prob,\n                 output_attentions=False):\n        super(BertSelfAttention, self).__init__()\n        if hidden_size % num_attention_heads != 0:\n            raise ValueError(\n                \"The hidden_size must be a multiple of the number of attention \"\n                \"heads, but got {} % {} != 0\" %\n                (hidden_size, num_attention_heads))\n\n        self.num_attention_heads = num_attention_heads\n        self.attention_head_size = int(hidden_size / num_attention_heads)\n        self.all_head_size = self.num_attention_heads * self.attention_head_size\n\n        self.query = nn.Linear(hidden_size, self.all_head_size)\n        self.key = nn.Linear(hidden_size, self.all_head_size)\n        self.value = nn.Linear(hidden_size, self.all_head_size)\n\n        self.dropout = nn.Dropout(attention_probs_dropout_prob)\n        self.output_attentions = output_attentions\n\n    def forward(self, x, attention_mask, head_mask=None):\n        query = self.query(x)\n        key = self.key(x)\n        value = self.value(x)\n\n        query_dim1, query_dim2 = query.shape[:-1]\n        new_shape = [\n            query_dim1, query_dim2, self.num_attention_heads,\n          
  self.attention_head_size\n        ]\n        query = query.reshape(new_shape).transpose(perm=(0, 2, 1, 3))\n        key = key.reshape(new_shape).transpose(perm=(0, 2, 3, 1))\n        value = value.reshape(new_shape).transpose(perm=(0, 2, 1, 3))\n\n        attention = paddle.matmul(query,\n                                  key) / math.sqrt(self.attention_head_size)\n        attention = attention + attention_mask\n        attention_value = F.softmax(attention, axis=-1)\n        attention_value = self.dropout(attention_value)\n\n        if head_mask is not None:\n            attention_value = attention_value * head_mask\n\n        context = paddle.matmul(attention_value, value).transpose(perm=(0, 2, 1,\n                                                                        3))\n        ctx_dim1, ctx_dim2 = context.shape[:-2]\n        new_context_shape = [\n            ctx_dim1,\n            ctx_dim2,\n            self.all_head_size,\n        ]\n        context = context.reshape(new_context_shape)\n\n        if self.output_attentions:\n            return (context, attention_value)\n        else:\n            return (context, )\n\n\nclass BertAttention(nn.Layer):\n    def __init__(self,\n                 hidden_size,\n                 num_attention_heads,\n                 attention_probs_dropout_prob,\n                 fc_dropout_prob,\n                 output_attentions=False):\n        super(BertAttention, self).__init__()\n        self.bert_selfattention = BertSelfAttention(\n            hidden_size, num_attention_heads, attention_probs_dropout_prob,\n            output_attentions)\n        self.fc = nn.Linear(hidden_size, hidden_size)\n        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)\n        self.dropout = nn.Dropout(fc_dropout_prob)\n\n    def forward(self, x, attention_mask, head_mask=None):\n        attention_feats = self.bert_selfattention(x, attention_mask, head_mask)\n        features = self.fc(attention_feats[0])\n        features = self.dropout(features)\n        features = self.layernorm(features + x)\n        if len(attention_feats) == 2:\n            return (features, attention_feats[1])\n        else:\n            return (features, )\n\n\nclass BertFeedForward(nn.Layer):\n    def __init__(self,\n                 hidden_size,\n                 intermediate_size,\n                 num_attention_heads,\n                 attention_probs_dropout_prob,\n                 fc_dropout_prob,\n                 act_fn='ReLU',\n                 output_attentions=False):\n        super(BertFeedForward, self).__init__()\n        self.fc1 = nn.Linear(hidden_size, intermediate_size)\n        self.act_fn = eval(act_fn)\n        self.fc2 = nn.Linear(intermediate_size, hidden_size)\n        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)\n        self.dropout = nn.Dropout(fc_dropout_prob)\n\n    def forward(self, x):\n        features = self.fc1(x)\n        features = self.act_fn(features)\n        features = self.fc2(features)\n        features = self.dropout(features)\n        features = self.layernorm(features + x)\n        return features\n\n\nclass BertLayer(nn.Layer):\n    def __init__(self,\n                 hidden_size,\n                 intermediate_size,\n                 num_attention_heads,\n                 attention_probs_dropout_prob,\n                 fc_dropout_prob,\n                 act_fn='ReLU',\n                 output_attentions=False):\n        super(BertLayer, self).__init__()\n        self.attention = BertAttention(hidden_size, 
num_attention_heads,\n                                       attention_probs_dropout_prob,\n                                       output_attentions)\n        self.feed_forward = BertFeedForward(\n            hidden_size, intermediate_size, num_attention_heads,\n            attention_probs_dropout_prob, fc_dropout_prob, act_fn,\n            output_attentions)\n\n    def forward(self, x, attention_mask, head_mask=None):\n        attention_feats = self.attention(x, attention_mask, head_mask)\n        features = self.feed_forward(attention_feats[0])\n        if len(attention_feats) == 2:\n            return (features, attention_feats[1])\n        else:\n            return (features, )\n\n\nclass BertEncoder(nn.Layer):\n    def __init__(self,\n                 num_hidden_layers,\n                 hidden_size,\n                 intermediate_size,\n                 num_attention_heads,\n                 attention_probs_dropout_prob,\n                 fc_dropout_prob,\n                 act_fn='ReLU',\n                 output_attentions=False,\n                 output_hidden_feats=False):\n        super(BertEncoder, self).__init__()\n        self.output_attentions = output_attentions\n        self.output_hidden_feats = output_hidden_feats\n        self.layers = nn.LayerList([\n            BertLayer(hidden_size, intermediate_size, num_attention_heads,\n                      attention_probs_dropout_prob, fc_dropout_prob, act_fn,\n                      output_attentions) for _ in range(num_hidden_layers)\n        ])\n\n    def forward(self, x, attention_mask, head_mask=None):\n        all_features = (x, )\n        all_attentions = ()\n\n        for i, layer in enumerate(self.layers):\n            mask = head_mask[i] if head_mask is not None else None\n            layer_out = layer(x, attention_mask, mask)\n\n            if self.output_hidden_feats:\n                all_features = all_features + (x, )\n            x = layer_out[0]\n            if self.output_attentions:\n                all_attentions = all_attentions + (layer_out[1], )\n\n        outputs = (x, )\n        if self.output_hidden_feats:\n            outputs += (all_features, )\n        if self.output_attentions:\n            outputs += (all_attentions, )\n        return outputs\n\n\nclass BertPooler(nn.Layer):\n    def __init__(self, hidden_size):\n        super(BertPooler, self).__init__()\n        self.fc = nn.Linear(hidden_size, hidden_size)\n        self.act = nn.Tanh()\n\n    def forward(self, x):\n        first_token = x[:, 0]\n        pooled_output = self.fc(first_token)\n        pooled_output = self.act(pooled_output)\n        return pooled_output\n\n\nclass METROEncoder(nn.Layer):\n    def __init__(self,\n                 vocab_size,\n                 num_hidden_layers,\n                 features_dims,\n                 position_embeddings_size,\n                 hidden_size,\n                 intermediate_size,\n                 output_feature_dim,\n                 num_attention_heads,\n                 attention_probs_dropout_prob,\n                 fc_dropout_prob,\n                 act_fn='ReLU',\n                 output_attentions=False,\n                 output_hidden_feats=False,\n                 use_img_layernorm=False):\n        super(METROEncoder, self).__init__()\n        self.img_dims = features_dims\n        self.num_hidden_layers = num_hidden_layers\n        self.use_img_layernorm = use_img_layernorm\n        self.output_attentions = output_attentions\n        self.embedding = BertEmbeddings(vocab_size, 
position_embeddings_size, 2,\n                                        hidden_size, fc_dropout_prob)\n        self.encoder = BertEncoder(\n            num_hidden_layers, hidden_size, intermediate_size,\n            num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob,\n            act_fn, output_attentions, output_hidden_feats)\n        self.pooler = BertPooler(hidden_size)\n        self.position_embeddings = nn.Embedding(position_embeddings_size,\n                                                hidden_size)\n        self.img_embedding = nn.Linear(\n            features_dims, hidden_size, bias_attr=True)\n        self.dropout = nn.Dropout(fc_dropout_prob)\n        self.cls_head = nn.Linear(hidden_size, output_feature_dim)\n        self.residual = nn.Linear(features_dims, output_feature_dim)\n\n        self.apply(self.init_weights)\n\n    def init_weights(self, module):\n        \"\"\" Initialize the weights.\n        \"\"\"\n        if isinstance(module, (nn.Linear, nn.Embedding)):\n            module.weight.set_value(\n                paddle.normal(\n                    mean=0.0, std=0.02, shape=module.weight.shape))\n        elif isinstance(module, nn.LayerNorm):\n            module.bias.set_value(paddle.zeros(shape=module.bias.shape))\n            module.weight.set_value(\n                paddle.full(\n                    shape=module.weight.shape, fill_value=1.0))\n        if isinstance(module, nn.Linear) and module.bias is not None:\n            module.bias.set_value(paddle.zeros(shape=module.bias.shape))\n\n    def forward(self, x):\n        batchsize, seq_len = x.shape[:2]\n        input_ids = paddle.zeros((batchsize, seq_len), dtype=\"int64\")\n        position_ids = paddle.arange(\n            seq_len, dtype=\"int64\").unsqueeze(0).expand_as(input_ids)\n\n        attention_mask = paddle.ones_like(input_ids).unsqueeze(1).unsqueeze(2)\n        head_mask = [None] * self.num_hidden_layers\n\n        position_embs = self.position_embeddings(position_ids)\n        attention_mask = (1.0 - attention_mask) * -10000.0\n\n        img_features = self.img_embedding(x)\n\n        # We empirically observe that adding an additional learnable position embedding leads to more stable training\n        embeddings = position_embs + img_features\n        if self.use_img_layernorm:\n            embeddings = self.layernorm(embeddings)\n        embeddings = self.dropout(embeddings)\n\n        encoder_outputs = self.encoder(\n            embeddings, attention_mask, head_mask=head_mask)\n\n        pred_score = self.cls_head(encoder_outputs[0])\n        res_img_feats = self.residual(x)\n        pred_score = pred_score + res_img_feats\n\n        if self.output_attentions and self.output_hidden_feats:\n            return pred_score, encoder_outputs[1], encoder_outputs[-1]\n        else:\n            return pred_score\n\n\ndef gelu(x):\n    \"\"\"Implementation of the gelu activation function.\n        https://arxiv.org/abs/1606.08415\n    \"\"\"\n    return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0)))\n\n\n@register\nclass TransEncoder(nn.Layer):\n    def __init__(self,\n                 vocab_size=30522,\n                 num_hidden_layers=4,\n                 num_attention_heads=4,\n                 position_embeddings_size=512,\n                 intermediate_size=3072,\n                 input_feat_dim=[2048, 512, 128],\n                 hidden_feat_dim=[1024, 256, 128],\n                 attention_probs_dropout_prob=0.1,\n                 fc_dropout_prob=0.1,\n                 
act_fn='gelu',\n                 output_attentions=False,\n                 output_hidden_feats=False):\n        super(TransEncoder, self).__init__()\n        output_feat_dim = input_feat_dim[1:] + [3]\n        trans_encoder = []\n        for i in range(len(output_feat_dim)):\n            features_dims = input_feat_dim[i]\n            output_feature_dim = output_feat_dim[i]\n            hidden_size = hidden_feat_dim[i]\n\n            # init a transformer encoder and append it to a list\n            assert hidden_size % num_attention_heads == 0\n            model = METROEncoder(vocab_size, num_hidden_layers, features_dims,\n                                 position_embeddings_size, hidden_size,\n                                 intermediate_size, output_feature_dim,\n                                 num_attention_heads,\n                                 attention_probs_dropout_prob, fc_dropout_prob,\n                                 act_fn, output_attentions, output_hidden_feats)\n            trans_encoder.append(model)\n        self.trans_encoder = paddle.nn.Sequential(*trans_encoder)\n\n    def forward(self, x):\n        out = self.trans_encoder(x)\n        return out\n"
  },
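  {
    "path": "ppdet/modeling/backbones/trans_encoder_example.py",
    "content": "# Illustrative usage sketch; NOT part of upstream PaddleDetection. The file\n# name is hypothetical. It assumes paddle is installed and trans_encoder.py is\n# importable as ppdet.modeling.backbones.trans_encoder.\nimport paddle\nimport paddle.nn.functional as F\n\nfrom ppdet.modeling.backbones.trans_encoder import TransEncoder, gelu\n\nif __name__ == '__main__':\n    # The erf-based gelu above matches paddle's exact (non-tanh) GELU.\n    x = paddle.linspace(-3., 3., 7)\n    assert bool(paddle.allclose(gelu(x), F.gelu(x, approximate=False), atol=1e-6))\n\n    # With the defaults, three chained METROEncoders map per-token features\n    # 2048 -> 512 -> 128 -> 3 (output_feat_dim = input_feat_dim[1:] + [3]).\n    model = TransEncoder()\n    feats = paddle.rand([2, 49, 2048])  # [batch, tokens, feature_dim]\n    out = model(feats)\n    print(out.shape)  # [2, 49, 3]\n"
  },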
  {
    "path": "ppdet/modeling/backbones/transformer_utils.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\n\nfrom paddle.nn.initializer import TruncatedNormal, Constant, Assign\n\n# Common initializations\nones_ = Constant(value=1.)\nzeros_ = Constant(value=0.)\ntrunc_normal_ = TruncatedNormal(std=.02)\n\n\n# Common Layers\ndef drop_path(x, drop_prob=0., training=False):\n    \"\"\"\n        Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).\n        the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...\n        See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...\n    \"\"\"\n    if drop_prob == 0. or not training:\n        return x\n    keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype)\n    shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)\n    random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)\n    random_tensor = paddle.floor(random_tensor)  # binarize\n    output = x.divide(keep_prob) * random_tensor\n    return output\n\n\nclass DropPath(nn.Layer):\n    def __init__(self, drop_prob=None):\n        super(DropPath, self).__init__()\n        self.drop_prob = drop_prob\n\n    def forward(self, x):\n        return drop_path(x, self.drop_prob, self.training)\n\n\nclass Identity(nn.Layer):\n    def __init__(self):\n        super(Identity, self).__init__()\n\n    def forward(self, input):\n        return input\n\n\n# common funcs\n\n\ndef to_2tuple(x):\n    if isinstance(x, (list, tuple)):\n        return x\n    return tuple([x] * 2)\n\n\ndef add_parameter(layer, datas, name=None):\n    parameter = layer.create_parameter(\n        shape=(datas.shape), default_initializer=Assign(datas))\n    if name:\n        layer.add_parameter(name, parameter)\n    return parameter\n\n\ndef window_partition(x, window_size):\n    \"\"\"\n    Partition into non-overlapping windows with padding if needed.\n    Args:\n        x (tensor): input tokens with [B, H, W, C].\n        window_size (int): window size.\n    Returns:\n        windows: windows after partition with [B * num_windows, window_size, window_size, C].\n        (Hp, Wp): padded height and width before partition\n    \"\"\"\n    B, H, W, C = x.shape\n\n    pad_h = (window_size - H % window_size) % window_size\n    pad_w = (window_size - W % window_size) % window_size\n    x = F.pad(x.transpose([0, 3, 1, 2]),\n              paddle.to_tensor(\n                  [0, int(pad_w), 0, int(pad_h)],\n                  dtype='int32')).transpose([0, 2, 3, 1])\n    Hp, Wp = H + pad_h, W + pad_w\n\n    num_h, num_w = Hp // window_size, Wp // window_size\n\n    x = x.reshape([B, num_h, window_size, num_w, window_size, C])\n    windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape(\n        [-1, window_size, window_size, C])\n    return windows, (Hp, Wp), (num_h, num_w)\n\n\ndef window_unpartition(x, pad_hw, num_hw, hw):\n    
\"\"\"\n    Window unpartition into original sequences and removing padding.\n    Args:\n        x (tensor): input tokens with [B * num_windows, window_size, window_size, C].\n        pad_hw (Tuple): padded height and width (Hp, Wp).\n        hw (Tuple): original height and width (H, W) before padding.\n    Returns:\n        x: unpartitioned sequences with [B, H, W, C].\n    \"\"\"\n    Hp, Wp = pad_hw\n    num_h, num_w = num_hw\n    H, W = hw\n    B, window_size, _, C = x.shape\n    B = B // (num_h * num_w)\n    x = x.reshape([B, num_h, num_w, window_size, window_size, C])\n    x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, Hp, Wp, C])\n\n    return x[:, :H, :W, :]\n"
  },
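  {
    "path": "ppdet/modeling/backbones/transformer_utils_example.py",
    "content": "# Illustrative sketch; NOT part of upstream PaddleDetection. The file name is\n# hypothetical. Demonstrates the window_partition / window_unpartition round\n# trip defined in transformer_utils.py.\nimport paddle\n\nfrom ppdet.modeling.backbones.transformer_utils import (window_partition,\n                                                        window_unpartition)\n\nif __name__ == '__main__':\n    # [B, H, W, C] feature map whose spatial size is not a multiple of the\n    # window size, so the padding path is exercised.\n    x = paddle.rand([2, 14, 14, 32])\n    window_size = 8\n\n    windows, pad_hw, num_hw = window_partition(x, window_size)\n    # Hp, Wp are rounded up to multiples of window_size.\n    print(windows.shape, pad_hw, num_hw)  # [8, 8, 8, 32] (16, 16) (2, 2)\n\n    # Unpartitioning crops the padding away and restores the input exactly.\n    y = window_unpartition(windows, pad_hw, num_hw, (14, 14))\n    assert y.shape == x.shape\n    assert bool(paddle.allclose(x, y))\n"
  },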
  {
    "path": "ppdet/modeling/backbones/vgg.py",
    "content": "from __future__ import division\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.nn import Conv2D, MaxPool2D\nfrom ppdet.core.workspace import register, serializable\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['VGG']\n\nVGG_cfg = {16: [2, 2, 3, 3, 3], 19: [2, 2, 4, 4, 4]}\n\n\nclass ConvBlock(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 groups,\n                 pool_size=2,\n                 pool_stride=2,\n                 pool_padding=0,\n                 name=None):\n        super(ConvBlock, self).__init__()\n\n        self.groups = groups\n        self.conv0 = nn.Conv2D(\n            in_channels=in_channels,\n            out_channels=out_channels,\n            kernel_size=3,\n            stride=1,\n            padding=1)\n        self.conv_out_list = []\n        for i in range(1, groups):\n            conv_out = self.add_sublayer(\n                'conv{}'.format(i),\n                Conv2D(\n                    in_channels=out_channels,\n                    out_channels=out_channels,\n                    kernel_size=3,\n                    stride=1,\n                    padding=1))\n            self.conv_out_list.append(conv_out)\n\n        self.pool = MaxPool2D(\n            kernel_size=pool_size,\n            stride=pool_stride,\n            padding=pool_padding,\n            ceil_mode=True)\n\n    def forward(self, inputs):\n        out = self.conv0(inputs)\n        out = F.relu(out)\n        for conv_i in self.conv_out_list:\n            out = conv_i(out)\n            out = F.relu(out)\n        pool = self.pool(out)\n        return out, pool\n\n\nclass ExtraBlock(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 mid_channels,\n                 out_channels,\n                 padding,\n                 stride,\n                 kernel_size,\n                 name=None):\n        super(ExtraBlock, self).__init__()\n\n        self.conv0 = Conv2D(\n            in_channels=in_channels,\n            out_channels=mid_channels,\n            kernel_size=1,\n            stride=1,\n            padding=0)\n        self.conv1 = Conv2D(\n            in_channels=mid_channels,\n            out_channels=out_channels,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=padding)\n\n    def forward(self, inputs):\n        out = self.conv0(inputs)\n        out = F.relu(out)\n        out = self.conv1(out)\n        out = F.relu(out)\n        return out\n\n\nclass L2NormScale(nn.Layer):\n    def __init__(self, num_channels, scale=1.0):\n        super(L2NormScale, self).__init__()\n        self.scale = self.create_parameter(\n            attr=ParamAttr(initializer=paddle.nn.initializer.Constant(scale)),\n            shape=[num_channels])\n\n    def forward(self, inputs):\n        out = F.normalize(inputs, axis=1, epsilon=1e-10)\n        # out = self.scale.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(\n        #     out) * out\n        out = self.scale.unsqueeze(0).unsqueeze(2).unsqueeze(3) * out\n        return out\n\n\n@register\n@serializable\nclass VGG(nn.Layer):\n    def __init__(self,\n                 depth=16,\n                 normalizations=[20., -1, -1, -1, -1, -1],\n                 extra_block_filters=[[256, 512, 1, 2, 3], [128, 256, 1, 2, 3],\n                                      [128, 256, 0, 1, 3],\n                                      [128, 256, 0, 1, 
3]]):\n        super(VGG, self).__init__()\n\n        assert depth in [16, 19], \\\n                \"depth must be 16 or 19, but got {}\".format(depth)\n        self.depth = depth\n        self.groups = VGG_cfg[depth]\n        self.normalizations = normalizations\n        self.extra_block_filters = extra_block_filters\n\n        self._out_channels = []\n\n        self.conv_block_0 = ConvBlock(\n            3, 64, self.groups[0], 2, 2, 0, name=\"conv1_\")\n        self.conv_block_1 = ConvBlock(\n            64, 128, self.groups[1], 2, 2, 0, name=\"conv2_\")\n        self.conv_block_2 = ConvBlock(\n            128, 256, self.groups[2], 2, 2, 0, name=\"conv3_\")\n        self.conv_block_3 = ConvBlock(\n            256, 512, self.groups[3], 2, 2, 0, name=\"conv4_\")\n        self.conv_block_4 = ConvBlock(\n            512, 512, self.groups[4], 3, 1, 1, name=\"conv5_\")\n        self._out_channels.append(512)\n\n        self.fc6 = Conv2D(\n            in_channels=512,\n            out_channels=1024,\n            kernel_size=3,\n            stride=1,\n            padding=6,\n            dilation=6)\n        self.fc7 = Conv2D(\n            in_channels=1024,\n            out_channels=1024,\n            kernel_size=1,\n            stride=1,\n            padding=0)\n        self._out_channels.append(1024)\n\n        # extra block\n        self.extra_convs = []\n        last_channels = 1024\n        for i, v in enumerate(self.extra_block_filters):\n            assert len(v) == 5, \"each extra_block_filters entry must have 5 elements\"\n            extra_conv = self.add_sublayer(\"conv{}\".format(6 + i),\n                                           ExtraBlock(last_channels, v[0], v[1],\n                                                      v[2], v[3], v[4]))\n            last_channels = v[1]\n            self.extra_convs.append(extra_conv)\n            self._out_channels.append(last_channels)\n\n        self.norms = []\n        for i, n in enumerate(self.normalizations):\n            if n != -1:\n                norm = self.add_sublayer(\"norm{}\".format(i),\n                                         L2NormScale(\n                                             self.extra_block_filters[i][1], n))\n            else:\n                norm = None\n            self.norms.append(norm)\n\n    def forward(self, inputs):\n        outputs = []\n\n        conv, pool = self.conv_block_0(inputs['image'])\n        conv, pool = self.conv_block_1(pool)\n        conv, pool = self.conv_block_2(pool)\n        conv, pool = self.conv_block_3(pool)\n        outputs.append(conv)\n\n        conv, pool = self.conv_block_4(pool)\n        out = self.fc6(pool)\n        out = F.relu(out)\n        out = self.fc7(out)\n        out = F.relu(out)\n        outputs.append(out)\n\n        if not self.extra_block_filters:\n            return outputs\n\n        # extra block\n        for extra_conv in self.extra_convs:\n            out = extra_conv(out)\n            outputs.append(out)\n\n        for i, n in enumerate(self.normalizations):\n            if n != -1:\n                outputs[i] = self.norms[i](outputs[i])\n\n        return outputs\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=c) for c in self._out_channels]\n"
  },
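  {
    "path": "ppdet/modeling/backbones/vgg_example.py",
    "content": "# Illustrative sketch; NOT part of upstream PaddleDetection. The file name is\n# hypothetical. Shows how the SSD-style VGG backbone is instantiated directly\n# and what it returns.\nimport paddle\n\nfrom ppdet.modeling.backbones.vgg import VGG\n\nif __name__ == '__main__':\n    model = VGG(depth=16)\n    # conv4_3 (512), fc7 (1024) and the four extra blocks (512, 256, 256, 256)\n    print([spec.channels for spec in model.out_shape])\n\n    # The backbone consumes the sample dict produced by the data loader.\n    feats = model({'image': paddle.rand([1, 3, 300, 300])})\n    print([list(f.shape) for f in feats])\n"
  },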
  {
    "path": "ppdet/modeling/backbones/vision_transformer.py",
    "content": "# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#    http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport math\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nfrom paddle.nn.initializer import Constant\n\nfrom ppdet.modeling.shape_spec import ShapeSpec\nfrom ppdet.core.workspace import register, serializable\n\nfrom .transformer_utils import zeros_, DropPath, Identity\n\n\nclass Mlp(nn.Layer):\n    def __init__(self,\n                 in_features,\n                 hidden_features=None,\n                 out_features=None,\n                 act_layer=nn.GELU,\n                 drop=0.):\n        super().__init__()\n        out_features = out_features or in_features\n        hidden_features = hidden_features or in_features\n        self.fc1 = nn.Linear(in_features, hidden_features)\n        self.act = act_layer()\n        self.fc2 = nn.Linear(hidden_features, out_features)\n        self.drop = nn.Dropout(drop)\n\n    def forward(self, x):\n        x = self.fc1(x)\n        x = self.act(x)\n        x = self.drop(x)\n        x = self.fc2(x)\n        x = self.drop(x)\n        return x\n\n\nclass Attention(nn.Layer):\n    def __init__(self,\n                 dim,\n                 num_heads=8,\n                 qkv_bias=False,\n                 qk_scale=None,\n                 attn_drop=0.,\n                 proj_drop=0.,\n                 window_size=None):\n        super().__init__()\n        self.num_heads = num_heads\n        head_dim = dim // num_heads\n        self.scale = qk_scale or head_dim**-0.5\n\n        self.qkv = nn.Linear(dim, dim * 3, bias_attr=False)\n\n        if qkv_bias:\n            self.q_bias = self.create_parameter(\n                shape=([dim]), default_initializer=zeros_)\n            self.v_bias = self.create_parameter(\n                shape=([dim]), default_initializer=zeros_)\n        else:\n            self.q_bias = None\n            self.v_bias = None\n        if window_size:\n            self.window_size = window_size\n            self.num_relative_distance = (2 * window_size[0] - 1) * (\n                2 * window_size[1] - 1) + 3\n            self.relative_position_bias_table = self.create_parameter(\n                shape=(self.num_relative_distance, num_heads),\n                default_initializer=zeros_)  # 2*Wh-1 * 2*Ww-1, nH\n            # cls to token & token 2 cls & cls to cls\n\n            # get pair-wise relative position index for each token inside the window\n            coords_h = paddle.arange(window_size[0])\n            coords_w = paddle.arange(window_size[1])\n            coords = paddle.stack(paddle.meshgrid(\n                [coords_h, coords_w]))  # 2, Wh, Ww\n            coords_flatten = paddle.flatten(coords, 1)  # 2, Wh*Ww \n            coords_flatten_1 = paddle.unsqueeze(coords_flatten, 2)\n            coords_flatten_2 = paddle.unsqueeze(coords_flatten, 1)\n            relative_coords = coords_flatten_1.clone() - coords_flatten_2.clone(\n            )\n\n     
       #relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Wh\n            relative_coords = relative_coords.transpose(\n                (1, 2, 0))  #.contiguous()  # Wh*Ww, Wh*Ww, 2\n            relative_coords[:, :, 0] += window_size[\n                0] - 1  # shift to start from 0\n            relative_coords[:, :, 1] += window_size[1] - 1\n            relative_coords[:, :, 0] *= 2 * window_size[1] - 1\n            relative_position_index = \\\n                paddle.zeros(shape=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype)\n            relative_position_index[1:, 1:] = relative_coords.sum(\n                -1)  # Wh*Ww, Wh*Ww\n            relative_position_index[0, 0:] = self.num_relative_distance - 3\n            relative_position_index[0:, 0] = self.num_relative_distance - 2\n            relative_position_index[0, 0] = self.num_relative_distance - 1\n\n            self.register_buffer(\"relative_position_index\",\n                                 relative_position_index)\n            # trunc_normal_(self.relative_position_bias_table, std=.0)\n        else:\n            self.window_size = None\n            self.relative_position_bias_table = None\n            self.relative_position_index = None\n\n        self.attn_drop = nn.Dropout(attn_drop)\n        self.proj = nn.Linear(dim, dim)\n        self.proj_drop = nn.Dropout(proj_drop)\n\n    def forward(self, x, rel_pos_bias=None):\n        x_shape = x.shape\n        N, C = x_shape[1], x_shape[2]\n\n        qkv_bias = None\n        if self.q_bias is not None:\n            qkv_bias = paddle.concat(\n                (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))\n        qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias)\n\n        qkv = qkv.reshape((-1, N, 3, self.num_heads,\n                           C // self.num_heads)).transpose((2, 0, 3, 1, 4))\n        q, k, v = qkv[0], qkv[1], qkv[2]\n        attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale\n\n        if self.relative_position_bias_table is not None:\n            relative_position_bias = self.relative_position_bias_table[\n                self.relative_position_index.reshape([-1])].reshape([\n                    self.window_size[0] * self.window_size[1] + 1,\n                    self.window_size[0] * self.window_size[1] + 1, -1\n                ])  # Wh*Ww,Wh*Ww,nH\n            relative_position_bias = relative_position_bias.transpose(\n                (2, 0, 1))  #.contiguous()  # nH, Wh*Ww, Wh*Ww\n            attn = attn + relative_position_bias.unsqueeze(0)\n        if rel_pos_bias is not None:\n            attn = attn + rel_pos_bias\n\n        attn = nn.functional.softmax(attn, axis=-1)\n        attn = self.attn_drop(attn)\n\n        x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))\n        x = self.proj(x)\n        x = self.proj_drop(x)\n        return x\n\n\nclass Block(nn.Layer):\n    def __init__(self,\n                 dim,\n                 num_heads,\n                 mlp_ratio=4.,\n                 qkv_bias=False,\n                 qk_scale=None,\n                 drop=0.,\n                 attn_drop=0.,\n                 drop_path=0.,\n                 window_size=None,\n                 init_values=None,\n                 act_layer=nn.GELU,\n                 norm_layer='nn.LayerNorm',\n                 epsilon=1e-5):\n        super().__init__()\n        self.norm1 = nn.LayerNorm(dim, epsilon=1e-6)\n        self.attn = Attention(\n            dim,\n           
 num_heads=num_heads,\n            qkv_bias=qkv_bias,\n            qk_scale=qk_scale,\n            attn_drop=attn_drop,\n            proj_drop=drop,\n            window_size=window_size)\n        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here\n        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()\n        self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)\n        mlp_hidden_dim = int(dim * mlp_ratio)\n        self.mlp = Mlp(in_features=dim,\n                       hidden_features=mlp_hidden_dim,\n                       act_layer=act_layer,\n                       drop=drop)\n        if init_values is not None:\n            self.gamma_1 = self.create_parameter(\n                shape=([dim]), default_initializer=Constant(value=init_values))\n            self.gamma_2 = self.create_parameter(\n                shape=([dim]), default_initializer=Constant(value=init_values))\n        else:\n            self.gamma_1, self.gamma_2 = None, None\n\n    def forward(self, x, rel_pos_bias=None):\n\n        if self.gamma_1 is None:\n            x = x + self.drop_path(\n                self.attn(\n                    self.norm1(x), rel_pos_bias=rel_pos_bias))\n            x = x + self.drop_path(self.mlp(self.norm2(x)))\n        else:\n            x = x + self.drop_path(self.gamma_1 * self.attn(\n                self.norm1(x), rel_pos_bias=rel_pos_bias))\n            x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))\n        return x\n\n\nclass PatchEmbed(nn.Layer):\n    \"\"\" Image to Patch Embedding\n    \"\"\"\n\n    def __init__(self,\n                 img_size=[224, 224],\n                 patch_size=16,\n                 in_chans=3,\n                 embed_dim=768):\n        super().__init__()\n        self.num_patches_w = img_size[0] // patch_size\n        self.num_patches_h = img_size[1] // patch_size\n\n        num_patches = self.num_patches_w * self.num_patches_h\n        self.patch_shape = (img_size[0] // patch_size,\n                            img_size[1] // patch_size)\n        self.img_size = img_size\n        self.patch_size = patch_size\n        self.num_patches = num_patches\n\n        self.proj = nn.Conv2D(\n            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)\n\n    @property\n    def num_patches_in_h(self):\n        return self.img_size[1] // self.patch_size\n\n    @property\n    def num_patches_in_w(self):\n        return self.img_size[0] // self.patch_size\n\n    def forward(self, x, mask=None):\n        B, C, H, W = x.shape\n        return self.proj(x)\n\n\nclass RelativePositionBias(nn.Layer):\n    def __init__(self, window_size, num_heads):\n        super().__init__()\n        self.window_size = window_size\n        self.num_relative_distance = (2 * window_size[0] - 1) * (\n            2 * window_size[1] - 1) + 3\n        self.relative_position_bias_table = self.create_parameter(\n            shape=(self.num_relative_distance, num_heads),\n            default_initializer=zeros_)\n        # cls to token & token 2 cls & cls to cls\n\n        # get pair-wise relative position index for each token inside the window\n        coords_h = paddle.arange(window_size[0])\n        coords_w = paddle.arange(window_size[1])\n        coords = paddle.stack(paddle.meshgrid(\n            [coords_h, coords_w]))  # 2, Wh, Ww\n        coords_flatten = coords.flatten(1)  # 2, Wh*Ww\n\n        relative_coords = coords_flatten[:, :,\n                                         None] - coords_flatten[:,\n                                                                None, :]  # 2, Wh*Ww, Wh*Ww\n        relative_coords = relative_coords.transpose(\n            (1, 2, 0))  # Wh*Ww, Wh*Ww, 2\n        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0\n        relative_coords[:, :, 1] += window_size[1] - 1\n        relative_coords[:, :, 0] *= 2 * window_size[1] - 1\n        relative_position_index = \\\n            paddle.zeros(shape=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)\n        relative_position_index[1:, 1:] = relative_coords.sum(\n            -1)  # Wh*Ww, Wh*Ww\n        relative_position_index[0, 0:] = self.num_relative_distance - 3\n        relative_position_index[0:, 0] = self.num_relative_distance - 2\n        relative_position_index[0, 0] = self.num_relative_distance - 1\n        self.register_buffer(\"relative_position_index\", relative_position_index)\n\n    def forward(self):\n        relative_position_bias = \\\n            self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([\n                 self.window_size[0] * self.window_size[1] + 1,\n                 self.window_size[0] * self.window_size[1] + 1, -1])  # Wh*Ww,Wh*Ww,nH\n        return relative_position_bias.transpose((2, 0, 1))  # nH, Wh*Ww, Wh*Ww\n\n\ndef get_sinusoid_encoding_table(n_position, d_hid, token=False):\n    ''' Sinusoid position encoding table '''\n\n    def get_position_angle_vec(position):\n        return [\n            position / np.power(10000, 2 * (hid_j // 2) / d_hid)\n            for hid_j in range(d_hid)\n        ]\n\n    sinusoid_table = np.array(\n        [get_position_angle_vec(pos_i) for pos_i in range(n_position)])\n    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i\n    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1\n    if token:\n        sinusoid_table = np.concatenate(\n            [sinusoid_table, np.zeros([1, d_hid])], axis=0)\n\n    return paddle.to_tensor(sinusoid_table, dtype=paddle.float32).unsqueeze(0)\n\n\n@register\n@serializable\nclass VisionTransformer(nn.Layer):\n    \"\"\" Vision Transformer with support for patch input\n    \"\"\"\n\n    def __init__(self,\n                 img_size=[672, 1092],\n                 patch_size=16,\n                 in_chans=3,\n                 embed_dim=768,\n                 depth=12,\n                 num_heads=12,\n                 mlp_ratio=4,\n                 qkv_bias=False,\n                 qk_scale=None,\n                 drop_rate=0.,\n                 attn_drop_rate=0.,\n                 drop_path_rate=0.,\n                 norm_layer='nn.LayerNorm',\n                 init_values=None,\n                 use_rel_pos_bias=False,\n                 use_shared_rel_pos_bias=False,\n                 epsilon=1e-5,\n                 final_norm=False,\n                 pretrained=None,\n                 out_indices=[3, 5, 7, 11],\n                 use_abs_pos_emb=False,\n                 use_sincos_pos_emb=True,\n                 with_fpn=True,\n                 num_fpn_levels=4,\n                 use_checkpoint=False,\n                 **args):\n        super().__init__()\n        self.img_size = img_size\n        self.embed_dim = embed_dim\n        self.with_fpn = with_fpn\n        self.use_checkpoint = use_checkpoint\n        self.use_sincos_pos_emb = use_sincos_pos_emb\n        self.use_rel_pos_bias = use_rel_pos_bias\n        self.final_norm = final_norm\n        self.out_indices = out_indices\n        
self.num_fpn_levels = num_fpn_levels\n\n        if use_checkpoint:\n            paddle.seed(0)\n\n        self.patch_embed = PatchEmbed(\n            img_size=img_size,\n            patch_size=patch_size,\n            in_chans=in_chans,\n            embed_dim=embed_dim)\n\n        self.pos_w = self.patch_embed.num_patches_in_w\n        self.pos_h = self.patch_embed.num_patches_in_h\n\n        self.cls_token = self.create_parameter(\n            shape=(1, 1, embed_dim),\n            default_initializer=paddle.nn.initializer.Constant(value=0.))\n\n        if use_abs_pos_emb:\n            self.pos_embed = self.create_parameter(\n                shape=(1, self.pos_w * self.pos_h + 1, embed_dim),\n                default_initializer=paddle.nn.initializer.TruncatedNormal(\n                    std=.02))\n        elif use_sincos_pos_emb:\n            pos_embed = self.build_2d_sincos_position_embedding(embed_dim)\n\n            self.pos_embed = pos_embed\n            self.pos_embed = self.create_parameter(shape=pos_embed.shape)\n            self.pos_embed.set_value(pos_embed.numpy())\n            self.pos_embed.stop_gradient = True\n\n        else:\n            self.pos_embed = None\n\n        self.pos_drop = nn.Dropout(p=drop_rate)\n\n        if use_shared_rel_pos_bias:\n            self.rel_pos_bias = RelativePositionBias(\n                window_size=self.patch_embed.patch_shape, num_heads=num_heads)\n        else:\n            self.rel_pos_bias = None\n\n        dpr = np.linspace(0, drop_path_rate, depth)\n\n        self.blocks = nn.LayerList([\n            Block(\n                dim=embed_dim,\n                num_heads=num_heads,\n                mlp_ratio=mlp_ratio,\n                qkv_bias=qkv_bias,\n                qk_scale=qk_scale,\n                drop=drop_rate,\n                attn_drop=attn_drop_rate,\n                drop_path=dpr[i],\n                norm_layer=norm_layer,\n                init_values=init_values,\n                window_size=self.patch_embed.patch_shape\n                if use_rel_pos_bias else None,\n                epsilon=epsilon) for i in range(depth)\n        ])\n\n        self.pretrained = pretrained\n        self.init_weight()\n\n        assert len(out_indices) <= 4, ''\n        self.out_indices = out_indices\n        self.out_channels = [embed_dim for _ in range(num_fpn_levels)]\n        self.out_strides = [4, 8, 16, 32][-num_fpn_levels:] if with_fpn else [\n            patch_size for _ in range(len(out_indices))\n        ]\n\n        self.norm = Identity()\n\n        if self.with_fpn:\n            assert num_fpn_levels <= 4, ''\n            self.init_fpn(\n                embed_dim=embed_dim,\n                patch_size=patch_size, )\n\n    def init_weight(self):\n        pretrained = self.pretrained\n\n        if pretrained:\n            if 'http' in pretrained:  #URL\n                path = paddle.utils.download.get_weights_path_from_url(\n                    pretrained)\n            else:  #model in local path\n                path = pretrained\n\n            load_state_dict = paddle.load(path)\n            model_state_dict = self.state_dict()\n            pos_embed_name = \"pos_embed\"\n\n            if pos_embed_name in load_state_dict.keys():\n                load_pos_embed = paddle.to_tensor(\n                    load_state_dict[pos_embed_name], dtype=\"float32\")\n                if self.pos_embed.shape != load_pos_embed.shape:\n                    pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))\n                    
model_state_dict[pos_embed_name] = self.resize_pos_embed(\n                        load_pos_embed, (pos_size, pos_size),\n                        (self.pos_h, self.pos_w))\n\n                    # self.set_state_dict(model_state_dict)\n                    load_state_dict[pos_embed_name] = model_state_dict[\n                        pos_embed_name]\n\n                    print(\"Load pos_embed and resize it from {} to {}.\".format(\n                        load_pos_embed.shape, self.pos_embed.shape))\n\n            self.set_state_dict(load_state_dict)\n            print(\"Loaded pretrained weights from {}.\".format(path))\n\n    def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):\n        if patch_size == 16:\n            self.fpn1 = nn.Sequential(\n                nn.Conv2DTranspose(\n                    embed_dim, embed_dim, kernel_size=2, stride=2),\n                nn.BatchNorm2D(embed_dim),\n                nn.GELU(),\n                nn.Conv2DTranspose(\n                    embed_dim, embed_dim, kernel_size=2, stride=2), )\n\n            self.fpn2 = nn.Sequential(\n                nn.Conv2DTranspose(\n                    embed_dim, embed_dim, kernel_size=2, stride=2), )\n\n            self.fpn3 = Identity()\n\n            self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)\n        elif patch_size == 8:\n            self.fpn1 = nn.Sequential(\n                nn.Conv2DTranspose(\n                    embed_dim, embed_dim, kernel_size=2, stride=2), )\n\n            self.fpn2 = Identity()\n\n            self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), )\n\n            self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), )\n\n        if not out_with_norm:\n            self.norm = Identity()\n        else:\n            self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6)\n\n    def interpolate_pos_encoding(self, x, w, h):\n        npatch = x.shape[1] - 1\n        N = self.pos_embed.shape[1] - 1\n        w0 = w // self.patch_embed.patch_size\n        h0 = h // self.patch_embed.patch_size\n        if npatch == N and w0 == self.patch_embed.num_patches_w and h0 == self.patch_embed.num_patches_h:\n            return self.pos_embed\n        class_pos_embed = self.pos_embed[:, 0]\n        patch_pos_embed = self.pos_embed[:, 1:]\n        dim = x.shape[-1]\n        # we add a small number to avoid floating point error in the interpolation\n        # see discussion at https://github.com/facebookresearch/dino/issues/8\n        # w0, h0 = w0 + 0.1, h0 + 0.1\n        # patch_pos_embed = nn.functional.interpolate(\n        #     patch_pos_embed.reshape([\n        #         1, self.patch_embed.num_patches_w,\n        #         self.patch_embed.num_patches_h, dim\n        #     ]).transpose((0, 3, 1, 2)),\n        #     scale_factor=(w0 / self.patch_embed.num_patches_w,\n        #                   h0 / self.patch_embed.num_patches_h),\n        #     mode='bicubic', )\n\n        patch_pos_embed = nn.functional.interpolate(\n            patch_pos_embed.reshape([\n                1, self.patch_embed.num_patches_w,\n                self.patch_embed.num_patches_h, dim\n            ]).transpose((0, 3, 1, 2)),\n            (w0, h0),\n            mode='bicubic', )\n\n        assert int(w0) == patch_pos_embed.shape[-2] and int(\n            h0) == patch_pos_embed.shape[-1]\n        patch_pos_embed = patch_pos_embed.transpose(\n            (0, 2, 3, 1)).reshape([1, -1, dim])\n        return paddle.concat(\n            (class_pos_embed.unsqueeze(0), patch_pos_embed), axis=1)\n\n    def 
resize_pos_embed(self, pos_embed, old_hw, new_hw):\n        \"\"\"\n        Resize pos_embed weight.\n        Args:\n            pos_embed (Tensor): the pos_embed weight\n            old_hw (list[int]): the height and width of old pos_embed\n            new_hw (list[int]): the height and width of new pos_embed\n        Returns:\n            Tensor: the resized pos_embed weight\n        \"\"\"\n        cls_pos_embed = pos_embed[:, :1, :]\n        pos_embed = pos_embed[:, 1:, :]\n\n        pos_embed = pos_embed.transpose([0, 2, 1])\n        pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]])\n        pos_embed = F.interpolate(\n            pos_embed, new_hw, mode='bicubic', align_corners=False)\n        pos_embed = pos_embed.flatten(2).transpose([0, 2, 1])\n        pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1)\n\n        return pos_embed\n\n    def build_2d_sincos_position_embedding(\n            self,\n            embed_dim=768,\n            temperature=10000., ):\n        h, w = self.patch_embed.patch_shape\n        grid_w = paddle.arange(w, dtype=paddle.float32)\n        grid_h = paddle.arange(h, dtype=paddle.float32)\n        grid_w, grid_h = paddle.meshgrid(grid_w, grid_h)\n        assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'\n        pos_dim = embed_dim // 4\n        omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim\n        omega = 1. / (temperature**omega)\n\n        out_w = grid_w.flatten()[..., None] @omega[None]\n        out_h = grid_h.flatten()[..., None] @omega[None]\n\n        pos_emb = paddle.concat(\n            [\n                paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),\n                paddle.cos(out_h)\n            ],\n            axis=1)[None, :, :]\n\n        pe_token = paddle.zeros([1, 1, embed_dim], dtype=paddle.float32)\n        pos_embed = paddle.concat([pe_token, pos_emb], axis=1)\n        # pos_embed.stop_gradient = True\n\n        return pos_embed\n\n    def forward(self, x):\n        x = x['image'] if isinstance(x, dict) else x\n        _, _, h, w = x.shape\n\n        x = self.patch_embed(x)\n\n        B, D, Hp, Wp = x.shape  # b * c * h * w\n\n        cls_tokens = self.cls_token.expand(\n            (B, self.cls_token.shape[-2], self.cls_token.shape[-1]))\n        x = x.flatten(2).transpose([0, 2, 1])  # b * hw * c\n        x = paddle.concat([cls_tokens, x], axis=1)\n\n        if self.pos_embed is not None:\n            # x = x + self.interpolate_pos_encoding(x, w, h)\n            x = x + self.interpolate_pos_encoding(x, h, w)\n\n        x = self.pos_drop(x)\n\n        rel_pos_bias = self.rel_pos_bias(\n        ) if self.rel_pos_bias is not None else None\n\n        feats = []\n        for idx, blk in enumerate(self.blocks):\n            if self.use_checkpoint and self.training:\n                x = paddle.distributed.fleet.utils.recompute(\n                    blk, x, rel_pos_bias, **{\"preserve_rng_state\": True})\n            else:\n                x = blk(x, rel_pos_bias)\n\n            if idx in self.out_indices:\n                xp = paddle.reshape(\n                    paddle.transpose(\n                        self.norm(x[:, 1:, :]), perm=[0, 2, 1]),\n                    shape=[B, D, Hp, Wp])\n                feats.append(xp)\n\n        if self.with_fpn:\n            fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][\n                -self.num_fpn_levels:]\n            assert len(fpns) == len(feats) or len(feats) == 1, ''\n            outputs 
= []\n            for i, m in enumerate(fpns):\n                outputs.append(\n                    m(feats[i] if len(feats) == len(fpns) else feats[-1]))\n\n            return outputs\n\n        return feats\n\n    @property\n    def num_layers(self):\n        return len(self.blocks)\n\n    @property\n    def no_weight_decay(self):\n        return {'pos_embed', 'cls_token'}\n\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(\n                channels=c, stride=s)\n            for c, s in zip(self.out_channels, self.out_strides)\n        ]\n"
  },
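  {
    "path": "ppdet/modeling/backbones/vision_transformer_example.py",
    "content": "# Illustrative sketch; NOT part of upstream PaddleDetection. The file name is\n# hypothetical. Exercises the sinusoid position table helper defined in\n# vision_transformer.py.\nimport paddle\n\nfrom ppdet.modeling.backbones.vision_transformer import \\\n    get_sinusoid_encoding_table\n\nif __name__ == '__main__':\n    # One row per position; even channels hold sin, odd channels hold cos.\n    table = get_sinusoid_encoding_table(n_position=196, d_hid=768)\n    print(table.shape)  # [1, 196, 768]\n\n    # Nearby positions should be more similar than distant ones, which is\n    # the property a position encoding is meant to provide.\n    near = paddle.dot(table[0, 0], table[0, 1])\n    far = paddle.dot(table[0, 0], table[0, 100])\n    assert float(near) > float(far)\n"
  },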
  {
    "path": "ppdet/modeling/backbones/vit_mae.py",
    "content": "# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#    http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nimport math\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\nfrom paddle.nn.initializer import Constant, TruncatedNormal\n\nfrom ppdet.modeling.shape_spec import ShapeSpec\nfrom ppdet.core.workspace import register, serializable\n\nfrom .transformer_utils import (zeros_, DropPath, Identity, window_partition,\n                                window_unpartition)\nfrom ..initializer import linear_init_\n\n__all__ = ['VisionTransformer2D', 'SimpleFeaturePyramid']\n\n\nclass Mlp(nn.Layer):\n    def __init__(self,\n                 in_features,\n                 hidden_features=None,\n                 out_features=None,\n                 act_layer='nn.GELU',\n                 drop=0.,\n                 lr_factor=1.0):\n        super().__init__()\n        out_features = out_features or in_features\n        hidden_features = hidden_features or in_features\n        self.fc1 = nn.Linear(\n            in_features,\n            hidden_features,\n            weight_attr=ParamAttr(learning_rate=lr_factor),\n            bias_attr=ParamAttr(learning_rate=lr_factor))\n        self.act = eval(act_layer)()\n        self.fc2 = nn.Linear(\n            hidden_features,\n            out_features,\n            weight_attr=ParamAttr(learning_rate=lr_factor),\n            bias_attr=ParamAttr(learning_rate=lr_factor))\n        self.drop = nn.Dropout(drop)\n\n        self._init_weights()\n\n    def _init_weights(self):\n        linear_init_(self.fc1)\n        linear_init_(self.fc2)\n\n    def forward(self, x):\n        x = self.drop(self.act(self.fc1(x)))\n        x = self.drop(self.fc2(x))\n        return x\n\n\nclass Attention(nn.Layer):\n    def __init__(self,\n                 dim,\n                 num_heads=8,\n                 qkv_bias=False,\n                 attn_bias=False,\n                 attn_drop=0.,\n                 proj_drop=0.,\n                 use_rel_pos=False,\n                 rel_pos_zero_init=True,\n                 window_size=None,\n                 input_size=None,\n                 qk_scale=None,\n                 lr_factor=1.0):\n        super().__init__()\n        self.num_heads = num_heads\n        self.head_dim = dim // num_heads\n        self.scale = qk_scale or self.head_dim**-0.5\n        self.use_rel_pos = use_rel_pos\n        self.input_size = input_size\n        self.rel_pos_zero_init = rel_pos_zero_init\n        self.window_size = window_size\n        self.lr_factor = lr_factor\n\n        self.qkv = nn.Linear(\n            dim,\n            dim * 3,\n            weight_attr=ParamAttr(learning_rate=lr_factor),\n            bias_attr=ParamAttr(learning_rate=lr_factor)\n            if attn_bias else False)\n        if qkv_bias:\n            self.q_bias = self.create_parameter(\n                shape=([dim]), default_initializer=zeros_)\n   
         self.v_bias = self.create_parameter(\n                shape=([dim]), default_initializer=zeros_)\n        else:\n            self.q_bias = None\n            self.v_bias = None\n        self.proj = nn.Linear(\n            dim,\n            dim,\n            weight_attr=ParamAttr(learning_rate=lr_factor),\n            bias_attr=ParamAttr(learning_rate=lr_factor))\n        self.attn_drop = nn.Dropout(attn_drop)\n        if window_size is None:\n            self.window_size = self.input_size[0]\n\n        self._init_weights()\n\n    def _init_weights(self):\n        linear_init_(self.qkv)\n        linear_init_(self.proj)\n\n        if self.use_rel_pos:\n            self.rel_pos_h = self.create_parameter(\n                [2 * self.window_size - 1, self.head_dim],\n                attr=ParamAttr(learning_rate=self.lr_factor),\n                default_initializer=Constant(value=0.))\n            self.rel_pos_w = self.create_parameter(\n                [2 * self.window_size - 1, self.head_dim],\n                attr=ParamAttr(learning_rate=self.lr_factor),\n                default_initializer=Constant(value=0.))\n\n            if not self.rel_pos_zero_init:\n                # initializers are instances applied to the parameter\n                trunc_normal = TruncatedNormal(std=0.02)\n                trunc_normal(self.rel_pos_h)\n                trunc_normal(self.rel_pos_w)\n\n    def get_rel_pos(self, seq_size, rel_pos):\n        max_rel_dist = int(2 * seq_size - 1)\n        # Interpolate rel pos if needed.\n        if rel_pos.shape[0] != max_rel_dist:\n            # Interpolate rel pos.\n            rel_pos = rel_pos.reshape([1, rel_pos.shape[0], -1])\n            rel_pos = rel_pos.transpose([0, 2, 1])\n            rel_pos_resized = F.interpolate(\n                rel_pos,\n                size=(max_rel_dist, ),\n                mode=\"linear\",\n                data_format='NCW')\n            rel_pos_resized = rel_pos_resized.reshape([-1, max_rel_dist])\n            rel_pos_resized = rel_pos_resized.transpose([1, 0])\n        else:\n            rel_pos_resized = rel_pos\n\n        coords = paddle.arange(seq_size, dtype='float32')\n        relative_coords = coords.unsqueeze(-1) - coords.unsqueeze(0)\n        relative_coords += (seq_size - 1)\n        relative_coords = relative_coords.astype('int64').flatten()\n\n        return paddle.index_select(rel_pos_resized, relative_coords).reshape(\n            [seq_size, seq_size, self.head_dim])\n\n    def add_decomposed_rel_pos(self, attn, q, h, w):\n        \"\"\"\n        Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.\n        Args:\n            attn (Tensor): attention map.\n            q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).\n        Returns:\n            attn (Tensor): attention map with added relative positional embeddings.\n        \"\"\"\n        Rh = self.get_rel_pos(h, self.rel_pos_h)\n        Rw = self.get_rel_pos(w, self.rel_pos_w)\n\n        B, _, dim = q.shape\n        r_q = q.reshape([B, h, w, dim])\n        # bhwc, hkc -> bhwk, then unsqueeze(-1) -> b h w k 1\n        # bhwc, wkc -> bhwk, then unsqueeze(-2) -> b h w 1 k\n        rel_h = paddle.einsum(\"bhwc,hkc->bhwk\", r_q, Rh).unsqueeze(-1)\n        rel_w = paddle.einsum(\"bhwc,wkc->bhwk\", r_q, Rw).unsqueeze(-2)\n\n        attn = attn.reshape([B, h, w, h, w]) + rel_h + rel_w\n        return attn.reshape([B, h * w, h * w])\n\n    def forward(self, x):\n        B, H, W, C = x.shape\n\n        if self.q_bias is not None:\n            qkv_bias = paddle.concat(\n                (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))\n            qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias)\n        else:\n            qkv = self.qkv(x)\n        # reshape applies to both branches:\n        # [B, H, W, 3C] -> [3, B * num_heads, H * W, head_dim]\n        qkv = qkv.reshape(\n            [B, H * W, 3, self.num_heads, self.head_dim]).transpose(\n                [2, 0, 3, 1, 4]).reshape(\n                    [3, B * self.num_heads, H * W, self.head_dim])\n\n        q, k, v = qkv[0], qkv[1], qkv[2]\n        attn = q.matmul(k.transpose([0, 2, 1])) * self.scale\n\n        if self.use_rel_pos:\n            attn = self.add_decomposed_rel_pos(attn, q, H, W)\n\n        attn = F.softmax(attn, axis=-1)\n        attn = self.attn_drop(attn)\n        x = attn.matmul(v).reshape(\n            [B, self.num_heads, H * W, self.head_dim]).transpose(\n                [0, 2, 1, 3]).reshape([B, H, W, C])\n        x = self.proj(x)\n        return x\n\n\nclass Block(nn.Layer):\n    def __init__(self,\n                 dim,\n                 num_heads,\n                 mlp_ratio=4.,\n                 qkv_bias=False,\n                 attn_bias=False,\n                 qk_scale=None,\n                 init_values=None,\n                 drop=0.,\n                 attn_drop=0.,\n                 drop_path=0.,\n                 use_rel_pos=True,\n                 rel_pos_zero_init=True,\n                 window_size=None,\n                 input_size=None,\n                 act_layer='nn.GELU',\n                 norm_layer='nn.LayerNorm',\n                 lr_factor=1.0,\n                 epsilon=1e-5):\n        super().__init__()\n        self.window_size = window_size\n\n        self.norm1 = eval(norm_layer)(dim,\n                                      weight_attr=ParamAttr(\n                                          learning_rate=lr_factor,\n                                          regularizer=L2Decay(0.0)),\n                                      bias_attr=ParamAttr(\n                                          learning_rate=lr_factor,\n                                          regularizer=L2Decay(0.0)),\n                                      epsilon=epsilon)\n        self.attn = Attention(\n            dim,\n            num_heads=num_heads,\n            qkv_bias=qkv_bias,\n            attn_bias=attn_bias,\n            qk_scale=qk_scale,\n            attn_drop=attn_drop,\n            proj_drop=drop,\n            use_rel_pos=use_rel_pos,\n            rel_pos_zero_init=rel_pos_zero_init,\n            window_size=window_size,\n            input_size=input_size,\n            lr_factor=lr_factor)\n\n        self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity()\n        self.norm2 = eval(norm_layer)(dim,\n                                      weight_attr=ParamAttr(\n                                          learning_rate=lr_factor,\n                                          regularizer=L2Decay(0.0)),\n                                      bias_attr=ParamAttr(\n                                          learning_rate=lr_factor,\n                                          regularizer=L2Decay(0.0)),\n                                      epsilon=epsilon)\n        self.mlp = Mlp(in_features=dim,\n                       hidden_features=int(dim * mlp_ratio),\n                       act_layer=act_layer,\n                       drop=drop,\n                       lr_factor=lr_factor)\n        if init_values is not None:\n            self.gamma_1 = self.create_parameter(\n                shape=([dim]), default_initializer=Constant(value=init_values))\n            self.gamma_2 = self.create_parameter(\n                shape=([dim]), default_initializer=Constant(value=init_values))\n        else:\n            self.gamma_1, self.gamma_2 = None, None\n\n    def forward(self, x):\n        y = self.norm1(x)\n        if self.window_size is not None:\n            y, pad_hw, num_hw = window_partition(y, self.window_size)\n        y = self.attn(y)\n        if self.gamma_1 is not None:\n            y = self.gamma_1 * y\n\n        if self.window_size is not None:\n            y = window_unpartition(y, pad_hw, num_hw, (x.shape[1], x.shape[2]))\n        x = x + self.drop_path(y)\n        if self.gamma_2 is None:\n            x = x + self.drop_path(self.mlp(self.norm2(x)))\n        else:\n            x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))\n\n        return x\n\n\nclass PatchEmbed(nn.Layer):\n    \"\"\" Image to Patch Embedding\n    \"\"\"\n\n    def __init__(self,\n                 img_size=(224, 224),\n                 patch_size=16,\n                 in_chans=3,\n                 embed_dim=768,\n                 lr_factor=0.01):\n        super().__init__()\n        self.img_size = img_size\n        self.patch_size = patch_size\n        self.proj = nn.Conv2D(\n            in_chans,\n            embed_dim,\n            kernel_size=patch_size,\n            stride=patch_size,\n            weight_attr=ParamAttr(learning_rate=lr_factor),\n            bias_attr=ParamAttr(learning_rate=lr_factor))\n\n    @property\n    def num_patches_in_h(self):\n        return self.img_size[1] // self.patch_size\n\n    @property\n    def num_patches_in_w(self):\n        return self.img_size[0] // self.patch_size\n\n    def forward(self, x):\n        out = self.proj(x)\n        return out\n\n\n@register\n@serializable\nclass VisionTransformer2D(nn.Layer):\n    \"\"\" Vision Transformer with support for patch input\n    \"\"\"\n\n    def __init__(self,\n                 img_size=(1024, 1024),\n                 patch_size=16,\n                 in_chans=3,\n                 embed_dim=768,\n                 depth=12,\n                 num_heads=12,\n                 mlp_ratio=4,\n                 qkv_bias=False,\n                 attn_bias=False,\n                 qk_scale=None,\n                 init_values=None,\n                 drop_rate=0.,\n                 attn_drop_rate=0.,\n                 drop_path_rate=0.,\n                 act_layer='nn.GELU',\n                 norm_layer='nn.LayerNorm',\n                 lr_decay_rate=1.0,\n                 global_attn_indexes=(2, 5, 8, 11),\n                 use_abs_pos=False,\n                 
use_rel_pos=False,\n                 use_abs_pos_emb=False,\n                 use_sincos_pos_emb=False,\n                 rel_pos_zero_init=True,\n                 epsilon=1e-5,\n                 final_norm=False,\n                 pretrained=None,\n                 window_size=None,\n                 out_indices=(11, ),\n                 with_fpn=False,\n                 use_checkpoint=False,\n                 *args,\n                 **kwargs):\n        super().__init__()\n        self.img_size = img_size\n        self.patch_size = patch_size\n        self.embed_dim = embed_dim\n        self.num_heads = num_heads\n        self.depth = depth\n        self.global_attn_indexes = global_attn_indexes\n        self.epsilon = epsilon\n        self.with_fpn = with_fpn\n        self.use_checkpoint = use_checkpoint\n\n        self.patch_h = img_size[0] // patch_size\n        self.patch_w = img_size[1] // patch_size\n        self.num_patches = self.patch_h * self.patch_w\n        self.use_abs_pos = use_abs_pos\n        self.use_abs_pos_emb = use_abs_pos_emb\n\n        self.patch_embed = PatchEmbed(\n            img_size=img_size,\n            patch_size=patch_size,\n            in_chans=in_chans,\n            embed_dim=embed_dim)\n\n        dpr = np.linspace(0, drop_path_rate, depth)\n        if use_checkpoint:\n            paddle.seed(0)\n\n        if use_abs_pos_emb:\n            self.pos_w = self.patch_embed.num_patches_in_w\n            self.pos_h = self.patch_embed.num_patches_in_h\n            self.pos_embed = self.create_parameter(\n                shape=(1, self.pos_w * self.pos_h + 1, embed_dim),\n                default_initializer=paddle.nn.initializer.TruncatedNormal(\n                    std=.02))\n        elif use_sincos_pos_emb:\n            pos_embed = self.get_2d_sincos_position_embedding(self.patch_h,\n                                                              self.patch_w)\n\n            self.pos_embed = pos_embed\n            self.pos_embed = self.create_parameter(shape=pos_embed.shape)\n            self.pos_embed.set_value(pos_embed.numpy())\n            self.pos_embed.stop_gradient = True\n        else:\n            self.pos_embed = None\n\n        self.blocks = nn.LayerList([\n            Block(\n                embed_dim,\n                num_heads=num_heads,\n                mlp_ratio=mlp_ratio,\n                qkv_bias=qkv_bias,\n                attn_bias=attn_bias,\n                qk_scale=qk_scale,\n                drop=drop_rate,\n                attn_drop=attn_drop_rate,\n                drop_path=dpr[i],\n                use_rel_pos=use_rel_pos,\n                rel_pos_zero_init=rel_pos_zero_init,\n                window_size=None\n                if i in self.global_attn_indexes else window_size,\n                input_size=[self.patch_h, self.patch_w],\n                act_layer=act_layer,\n                lr_factor=self.get_vit_lr_decay_rate(i, lr_decay_rate),\n                norm_layer=norm_layer,\n                init_values=init_values,\n                epsilon=epsilon) for i in range(depth)\n        ])\n\n        assert len(out_indices) <= 4, 'out_indices out of bound'\n        self.out_indices = out_indices\n        self.pretrained = pretrained\n        self.init_weight()\n\n        self.out_channels = [embed_dim for _ in range(len(out_indices))]\n        self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [\n            patch_size for _ in range(len(out_indices))\n        ]\n        self.norm = Identity()\n        if self.with_fpn:\n 
           self.init_fpn(\n                embed_dim=embed_dim,\n                patch_size=patch_size,\n                out_with_norm=final_norm)\n\n    def get_vit_lr_decay_rate(self, layer_id, lr_decay_rate):\n        return lr_decay_rate**(self.depth - layer_id)\n\n    def init_weight(self):\n        pretrained = self.pretrained\n        if pretrained:\n            if 'http' in pretrained:\n                path = paddle.utils.download.get_weights_path_from_url(\n                    pretrained)\n            else:\n                path = pretrained\n\n            load_state_dict = paddle.load(path)\n            model_state_dict = self.state_dict()\n            pos_embed_name = \"pos_embed\"\n\n            if pos_embed_name in load_state_dict.keys(\n            ) and self.use_abs_pos_emb:\n                load_pos_embed = paddle.to_tensor(\n                    load_state_dict[pos_embed_name], dtype=\"float32\")\n                if self.pos_embed.shape != load_pos_embed.shape:\n                    pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))\n                    model_state_dict[pos_embed_name] = self.resize_pos_embed(\n                        load_pos_embed, (pos_size, pos_size),\n                        (self.pos_h, self.pos_w))\n\n                    load_state_dict[pos_embed_name] = model_state_dict[\n                        pos_embed_name]\n\n                    print(\"Loaded pos_embed and resized it from {} to {}.\".\n                          format(load_pos_embed.shape, self.pos_embed.shape))\n\n            self.set_state_dict(load_state_dict)\n            print(\"Loaded pretrained weights from {}.\".format(path))\n\n    def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):\n        if patch_size == 16:\n            self.fpn1 = nn.Sequential(\n                nn.Conv2DTranspose(\n                    embed_dim, embed_dim, kernel_size=2, stride=2),\n                nn.BatchNorm2D(embed_dim),\n                nn.GELU(),\n                nn.Conv2DTranspose(\n                    embed_dim, embed_dim, kernel_size=2, stride=2), )\n\n            self.fpn2 = nn.Sequential(\n                nn.Conv2DTranspose(\n                    embed_dim, embed_dim, kernel_size=2, stride=2), )\n\n            self.fpn3 = Identity()\n\n            self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)\n        elif patch_size == 8:\n            self.fpn1 = nn.Sequential(\n                nn.Conv2DTranspose(\n                    embed_dim, embed_dim, kernel_size=2, stride=2), )\n\n            self.fpn2 = Identity()\n\n            self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), )\n\n            self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), )\n\n        if not out_with_norm:\n            self.norm = Identity()\n        else:\n            self.norm = nn.LayerNorm(embed_dim, epsilon=self.epsilon)\n\n    def resize_pos_embed(self, pos_embed, old_hw, new_hw):\n        \"\"\"\n        Resize pos_embed weight.\n        Args:\n            pos_embed (Tensor): the pos_embed weight\n            old_hw (list[int]): the height and width of old pos_embed\n            new_hw (list[int]): the height and width of new pos_embed\n        Returns:\n            Tensor: the resized pos_embed weight\n        \"\"\"\n        cls_pos_embed = pos_embed[:, :1, :]\n        pos_embed = pos_embed[:, 1:, :]\n\n        pos_embed = pos_embed.transpose([0, 2, 1])\n        pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]])\n        pos_embed 
= F.interpolate(\n            pos_embed, new_hw, mode='bicubic', align_corners=False)\n        pos_embed = pos_embed.flatten(2).transpose([0, 2, 1])\n        pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1)\n\n        return pos_embed\n\n    def get_2d_sincos_position_embedding(self, h, w, temperature=10000.):\n        grid_y, grid_x = paddle.meshgrid(\n            paddle.arange(\n                h, dtype=paddle.float32),\n            paddle.arange(\n                w, dtype=paddle.float32))\n        assert self.embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'\n        pos_dim = self.embed_dim // 4\n        omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim\n        omega = (1. / (temperature**omega)).unsqueeze(0)\n\n        out_x = grid_x.reshape([-1, 1]).matmul(omega)\n        out_y = grid_y.reshape([-1, 1]).matmul(omega)\n\n        pos_emb = paddle.concat(\n            [\n                paddle.sin(out_y), paddle.cos(out_y), paddle.sin(out_x),\n                paddle.cos(out_x)\n            ],\n            axis=1)\n\n        return pos_emb.reshape([1, h, w, self.embed_dim])\n\n    def forward(self, inputs):\n        x = self.patch_embed(inputs['image']).transpose([0, 2, 3, 1])\n        B, Hp, Wp, _ = x.shape\n\n        if self.use_abs_pos:\n            x = x + self.get_2d_sincos_position_embedding(Hp, Wp)\n\n        if self.use_abs_pos_emb:\n            x = x + self.resize_pos_embed(self.pos_embed,\n                                          (self.pos_h, self.pos_w), (Hp, Wp))\n\n        feats = []\n        for idx, blk in enumerate(self.blocks):\n            if self.use_checkpoint and self.training:\n                x = paddle.distributed.fleet.utils.recompute(\n                    blk, x, **{\"preserve_rng_state\": True})\n            else:\n                x = blk(x)\n            if idx in self.out_indices:\n                feats.append(self.norm(x.transpose([0, 3, 1, 2])))\n\n        if self.with_fpn:\n            fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]\n            for i in range(len(feats)):\n                feats[i] = fpns[i](feats[i])\n        return feats\n\n    @property\n    def num_layers(self):\n        return len(self.blocks)\n\n    @property\n    def no_weight_decay(self):\n        return {'pos_embed', 'cls_token'}\n\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(\n                channels=c, stride=s)\n            for c, s in zip(self.out_channels, self.out_strides)\n        ]\n\n\nclass LayerNorm(nn.Layer):\n    \"\"\"\n    A LayerNorm variant, popularized by Transformers, that performs point-wise mean and\n    variance normalization over the channel dimension for inputs that have shape\n    (batch_size, channels, height, width).    
\n    Note that this modified LayerNorm is used in ResBlock and SimpleFeaturePyramid,\n    while the ViT backbone uses the standard nn.LayerNorm.\n    \"\"\"\n\n    def __init__(self, normalized_shape, eps=1e-6):\n        super().__init__()\n        # LayerNorm affine parameters: scale starts at 1, shift at 0\n        self.weight = self.create_parameter(\n            [normalized_shape], default_initializer=Constant(value=1.))\n        self.bias = self.create_parameter(\n            [normalized_shape], default_initializer=Constant(value=0.))\n        self.eps = eps\n        self.normalized_shape = (normalized_shape, )\n\n    def forward(self, x):\n        u = x.mean(1, keepdim=True)\n        s = (x - u).pow(2).mean(1, keepdim=True)\n        x = (x - u) / paddle.sqrt(s + self.eps)\n        x = self.weight[:, None, None] * x + self.bias[:, None, None]\n        return x\n\n\n@register\n@serializable\nclass SimpleFeaturePyramid(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 spatial_scales,\n                 num_levels=4,\n                 use_bias=False):\n        \"\"\"\n        Args:\n            in_channels (list[int]): input channels of each level which can be\n                derived from the output shape of backbone by from_config\n            out_channels (int): output channel of each level.\n            spatial_scales (list[float]): list of scaling factors to upsample or downsample\n                the input features for creating pyramid features which can be derived from\n                the output shape of backbone by from_config\n            num_levels (int): number of levels of output features.\n            use_bias (bool): whether to use bias in the pyramid convs.\n        \"\"\"\n        super(SimpleFeaturePyramid, self).__init__()\n\n        self.in_channels = in_channels[0]\n        self.out_channels = out_channels\n        self.num_levels = num_levels\n\n        self.stages = []\n        dim = in_channels[0]\n        if num_levels == 4:\n            scale_factors = [2.0, 1.0, 0.5]\n        elif num_levels == 5:\n            scale_factors = [4.0, 2.0, 1.0, 0.5]\n        else:\n            raise NotImplementedError(\n                f\"num_levels={num_levels} is not supported yet.\")\n\n        for idx, scale in enumerate(scale_factors):\n            out_dim = dim\n            if scale == 4.0:\n                layers = [\n                    nn.Conv2DTranspose(\n                        dim, dim // 2, kernel_size=2, stride=2),\n                    nn.LayerNorm(dim // 2),\n                    nn.GELU(),\n                    nn.Conv2DTranspose(\n                        dim // 2, dim // 4, kernel_size=2, stride=2),\n                ]\n                out_dim = dim // 4\n            elif scale == 2.0:\n                layers = [\n                    nn.Conv2DTranspose(\n                        dim, dim // 2, kernel_size=2, stride=2)\n                ]\n                out_dim = dim // 2\n            elif scale == 1.0:\n                layers = []\n            elif scale == 0.5:\n                layers = [nn.MaxPool2D(kernel_size=2, stride=2)]\n\n            layers.extend([\n                nn.Conv2D(\n                    out_dim,\n                    out_channels,\n                    kernel_size=1,\n                    bias_attr=use_bias, ), LayerNorm(out_channels), nn.Conv2D(\n                        out_channels,\n                        out_channels,\n                        kernel_size=3,\n                        padding=1,\n                        bias_attr=use_bias, ), LayerNorm(out_channels)\n            ])\n            layers = nn.Sequential(*layers)\n\n            stage = 
-int(math.log2(spatial_scales[0] * scale_factors[idx]))\n            self.add_sublayer(f\"simfp_{stage}\", layers)\n            self.stages.append(layers)\n\n        # extra top-level feature map produced by a stride-2 max-pool\n        self.top_block = nn.Sequential(\n            nn.MaxPool2D(\n                kernel_size=1, stride=2, padding=0))\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {\n            'in_channels': [i.channels for i in input_shape],\n            'spatial_scales': [1.0 / i.stride for i in input_shape],\n        }\n\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(channels=self.out_channels)\n            for _ in range(self.num_levels)\n        ]\n\n    def forward(self, feats):\n        \"\"\"\n        Args:\n            feats (list[Tensor]): backbone outputs; only feats[0], with shape\n                (N, C, H, W), is used to build the pyramid.\n        \"\"\"\n        features = feats[0]\n        results = []\n\n        for stage in self.stages:\n            results.append(stage(features))\n\n        top_block_in_feature = results[-1]\n        results.append(self.top_block(top_block_in_feature))\n        assert self.num_levels == len(results)\n\n        return results\n"
  },
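  {
    "path": "examples/sincos_pos_embed_sketch.py",
    "content": "# NOTE: hypothetical usage sketch, not part of ppdet. It re-derives the 2D\n# sin-cos position embedding that VisionTransformer2D builds in\n# get_2d_sincos_position_embedding(), assuming embed_dim % 4 == 0: one quarter\n# of the channels each for sin(y), cos(y), sin(x) and cos(x).\nimport paddle\n\n\ndef sincos_pos_embed(h, w, embed_dim, temperature=10000.):\n    grid_y, grid_x = paddle.meshgrid(\n        paddle.arange(h, dtype='float32'), paddle.arange(w, dtype='float32'))\n    pos_dim = embed_dim // 4\n    # geometric frequency schedule, identical to the backbone's\n    omega = paddle.arange(pos_dim, dtype='float32') / pos_dim\n    omega = (1. / temperature**omega).unsqueeze(0)\n    out_x = grid_x.reshape([-1, 1]).matmul(omega)\n    out_y = grid_y.reshape([-1, 1]).matmul(omega)\n    pos_emb = paddle.concat(\n        [paddle.sin(out_y), paddle.cos(out_y), paddle.sin(out_x),\n         paddle.cos(out_x)], axis=1)\n    return pos_emb.reshape([1, h, w, embed_dim])\n\n\nif __name__ == '__main__':\n    emb = sincos_pos_embed(4, 4, 64)\n    print(emb.shape)  # [1, 4, 4, 64], added to patch tokens in NHWC layout\n"
  },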
  {
    "path": "ppdet/modeling/backbones/vitpose.py",
    "content": "# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#    http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Code was based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py\n# reference: https://arxiv.org/abs/2010.11929\n\nfrom collections.abc import Callable\n\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nfrom paddle.nn.initializer import TruncatedNormal, Constant, Normal\nfrom ppdet.core.workspace import register, serializable\n\ntrunc_normal_ = TruncatedNormal(std=.02)\n\n\ndef to_2tuple(x):\n    if isinstance(x, (list, tuple)):\n        return x\n    return tuple([x] * 2)\n\n\ndef drop_path(x, drop_prob=0., training=False):\n    \"\"\"Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).\n    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...\n    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...\n    \"\"\"\n    if drop_prob == 0. or not training:\n        return x\n    keep_prob = paddle.to_tensor(1.0 - drop_prob).astype(x.dtype)\n    shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)\n    random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype)\n    random_tensor = paddle.floor(random_tensor)  # binarize\n    output = x.divide(keep_prob) * random_tensor\n    return output\n\n\nclass DropPath(nn.Layer):\n    \"\"\"Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).\n    \"\"\"\n\n    def __init__(self, drop_prob=None):\n        super(DropPath, self).__init__()\n        self.drop_prob = drop_prob\n\n    def forward(self, x):\n        return drop_path(x, self.drop_prob, self.training)\n\n\nclass Identity(nn.Layer):\n    def __init__(self):\n        super(Identity, self).__init__()\n\n    def forward(self, input):\n        return input\n\n\nclass Mlp(nn.Layer):\n    def __init__(self,\n                 in_features,\n                 hidden_features=None,\n                 out_features=None,\n                 act_layer=nn.GELU,\n                 drop=0.):\n        super().__init__()\n        out_features = out_features or in_features\n        hidden_features = hidden_features or in_features\n        self.fc1 = nn.Linear(in_features, hidden_features)\n        self.act = act_layer()\n        self.fc2 = nn.Linear(hidden_features, out_features)\n        self.drop = nn.Dropout(drop)\n\n    def forward(self, x):\n        x = self.fc1(x)\n        x = self.act(x)\n\n        x = self.fc2(x)\n        x = self.drop(x)\n        return x\n\n\nclass Attention(nn.Layer):\n    def __init__(self,\n                 dim,\n                 num_heads=8,\n                 qkv_bias=False,\n                 qk_scale=None,\n                 attn_drop=0.,\n                 proj_drop=0.):\n        super().__init__()\n        self.num_heads = num_heads\n        head_dim = dim // num_heads\n        self.scale = qk_scale or head_dim**-0.5\n\n        self.qkv = 
nn.Linear(dim, dim * 3, bias_attr=qkv_bias)\n\n        self.attn_drop = nn.Dropout(attn_drop)\n        self.proj = nn.Linear(dim, dim)\n        self.proj_drop = nn.Dropout(proj_drop)\n\n    def forward(self, x):\n\n        N, C = x.shape[1:]\n        qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C //\n                                   self.num_heads)).transpose((2, 0, 3, 1, 4))\n\n        q, k, v = qkv[0], qkv[1], qkv[2]\n\n        attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale\n        attn = nn.functional.softmax(attn, axis=-1)\n        attn = self.attn_drop(attn)\n\n        x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))\n        x = self.proj(x)\n\n        x = self.proj_drop(x)\n        return x\n\n\nclass Block(nn.Layer):\n    def __init__(self,\n                 dim,\n                 num_heads,\n                 mlp_ratio=4.,\n                 qkv_bias=False,\n                 qk_scale=None,\n                 drop=0.,\n                 attn_drop=0.,\n                 drop_path=0.,\n                 act_layer=nn.GELU,\n                 norm_layer='nn.LayerNorm',\n                 epsilon=1e-5):\n        super().__init__()\n        if isinstance(norm_layer, str):\n            self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)\n        elif isinstance(norm_layer, Callable):\n            self.norm1 = norm_layer(dim)\n        else:\n            raise TypeError(\n                \"The norm_layer must be str or paddle.nn.layer.Layer class\")\n        self.attn = Attention(\n            dim,\n            num_heads=num_heads,\n            qkv_bias=qkv_bias,\n            qk_scale=qk_scale,\n            attn_drop=attn_drop,\n            proj_drop=drop)\n        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here\n        self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity()\n        if isinstance(norm_layer, str):\n            self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)\n        elif isinstance(norm_layer, Callable):\n            self.norm2 = norm_layer(dim)\n        else:\n            raise TypeError(\n                \"The norm_layer must be str or paddle.nn.layer.Layer class\")\n        mlp_hidden_dim = int(dim * mlp_ratio)\n        self.mlp = Mlp(in_features=dim,\n                       hidden_features=mlp_hidden_dim,\n                       act_layer=act_layer,\n                       drop=drop)\n\n    def forward(self, x):\n        x = x + self.drop_path(self.attn(self.norm1(x)))\n        x = x + self.drop_path(self.mlp(self.norm2(x)))\n\n        return x\n\n\nclass PatchEmbed(nn.Layer):\n    \"\"\" Image to Patch Embedding\n    \"\"\"\n\n    def __init__(self,\n                 img_size=224,\n                 patch_size=16,\n                 in_chans=3,\n                 embed_dim=768,\n                 ratio=1):\n        super().__init__()\n        img_size = to_2tuple(img_size)\n        patch_size = to_2tuple(patch_size)\n\n        num_patches = (img_size[1] // patch_size[1]) * (\n            img_size[0] // patch_size[0]) * (ratio**2)\n        self.img_size = img_size\n        self.patch_size = patch_size\n        self.num_patches = num_patches\n\n        self.proj = nn.Conv2D(\n            in_chans,\n            embed_dim,\n            kernel_size=patch_size,\n            stride=(patch_size[0] // ratio),\n            padding=(4 + 2 * (ratio // 2 - 1), 4 + 2 * (ratio // 2 - 1)))\n\n    def forward(self, x):\n        B, C, H, W = x.shape\n        assert H == self.img_size[0] and W == self.img_size[1], \\\n            f\"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]}).\"\n\n        x = self.proj(x)\n        return x\n\n\n@register\n@serializable\nclass ViT(nn.Layer):\n    \"\"\" Vision Transformer with support for patch input\n\n        This module differs from ppdet's VisionTransformer (in\n        ppdet/modeling/backbones/vision_transformer.py) in four ways:\n        1. PatchEmbed.proj sets padding=(4 + 2 * (ratio // 2 - 1), 4 + 2 * (ratio // 2 - 1)),\n           while VisionTransformer does not.\n        2. The Attention module uses a standard qkv projection, while\n           VisionTransformer provides more options.\n        3. The Mlp module applies Dropout once, while VisionTransformer applies\n           it twice.\n        4. VisionTransformer provides FPN layers, while this module does not.\n    \"\"\"\n\n    def __init__(self,\n                 img_size=224,\n                 patch_size=16,\n                 in_chans=3,\n                 embed_dim=768,\n                 depth=12,\n                 num_heads=12,\n                 mlp_ratio=4,\n                 qkv_bias=False,\n                 qk_scale=None,\n                 drop_rate=0.,\n                 attn_drop_rate=0.,\n                 drop_path_rate=0.,\n                 norm_layer='nn.LayerNorm',\n                 epsilon=1e-5,\n                 ratio=1,\n                 pretrained=None,\n                 **kwargs):\n        super().__init__()\n\n        self.pretrained = pretrained\n        self.num_features = self.embed_dim = embed_dim\n\n        self.patch_embed = PatchEmbed(\n            img_size=img_size,\n            patch_size=patch_size,\n            in_chans=in_chans,\n            embed_dim=embed_dim,\n            ratio=ratio)\n        num_patches = self.patch_embed.num_patches\n\n        self.pos_embed = self.create_parameter(\n            shape=(1, 
num_patches + 1, embed_dim),\n            default_initializer=trunc_normal_)\n        self.add_parameter(\"pos_embed\", self.pos_embed)\n\n        dpr = np.linspace(0, drop_path_rate, depth, dtype='float32')\n\n        self.blocks = nn.LayerList([\n            Block(\n                dim=embed_dim,\n                num_heads=num_heads,\n                mlp_ratio=mlp_ratio,\n                qkv_bias=qkv_bias,\n                qk_scale=qk_scale,\n                drop=drop_rate,\n                attn_drop=attn_drop_rate,\n                drop_path=dpr[i],\n                norm_layer=norm_layer,\n                epsilon=epsilon) for i in range(depth)\n        ])\n\n        self.last_norm = eval(norm_layer)(embed_dim, epsilon=epsilon)\n        self._init_weights()\n\n    def _init_weights(self):\n        pretrained = self.pretrained\n\n        if pretrained:\n            if 'http' in pretrained:  # URL\n                path = paddle.utils.download.get_weights_path_from_url(\n                    pretrained)\n            else:  # local file path\n                path = pretrained\n\n            load_state_dict = paddle.load(path)\n            self.set_state_dict(load_state_dict)\n            print(\"Loaded pretrained weights from {}.\".format(path))\n\n    def forward_features(self, x):\n        x = self.patch_embed(x)\n        B, D, Hp, Wp = x.shape\n        x = x.flatten(2).transpose([0, 2, 1])\n        # add patch position embeddings plus the class-token embedding,\n        # following the ViTPose convention\n        x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1]\n\n        for blk in self.blocks:\n            x = blk(x)\n\n        x = self.last_norm(x)\n        xp = paddle.reshape(\n            paddle.transpose(\n                x, perm=[0, 2, 1]), shape=[B, -1, Hp, Wp])\n\n        return xp\n"
  },
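  {
    "path": "examples/vitpose_backbone_sketch.py",
    "content": "# NOTE: hypothetical usage sketch, not part of ppdet. It runs the ViTPose\n# backbone from ppdet/modeling/backbones/vitpose.py on a dummy batch to show\n# how an image becomes a patch-token feature map. The tiny embed_dim/depth\n# values here are illustrative only.\nimport paddle\nfrom ppdet.modeling.backbones.vitpose import ViT\n\n# a 256x192 input with patch_size=16 and ratio=1 gives a 16x12 patch grid\nmodel = ViT(img_size=(256, 192), patch_size=16, embed_dim=96, depth=2,\n            num_heads=4)\nx = paddle.randn([1, 3, 256, 192])\n# the class exposes forward_features() rather than forward()\nfeat = model.forward_features(x)\nprint(feat.shape)  # [1, 96, 16, 12]\n"
  },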
  {
    "path": "ppdet/modeling/bbox_utils.py",
    "content": "#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport math\nimport paddle\nimport numpy as np\n\n\ndef bbox2delta(src_boxes, tgt_boxes, weights=[1.0, 1.0, 1.0, 1.0]):\n    \"\"\"Encode bboxes to deltas.\n    \"\"\"\n    src_w = src_boxes[:, 2] - src_boxes[:, 0]\n    src_h = src_boxes[:, 3] - src_boxes[:, 1]\n    src_ctr_x = src_boxes[:, 0] + 0.5 * src_w\n    src_ctr_y = src_boxes[:, 1] + 0.5 * src_h\n\n    tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0]\n    tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1]\n    tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w\n    tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h\n\n    wx, wy, ww, wh = weights\n    dx = wx * (tgt_ctr_x - src_ctr_x) / src_w\n    dy = wy * (tgt_ctr_y - src_ctr_y) / src_h\n    dw = ww * paddle.log(tgt_w / src_w)\n    dh = wh * paddle.log(tgt_h / src_h)\n\n    deltas = paddle.stack((dx, dy, dw, dh), axis=1)\n    return deltas\n\n\ndef delta2bbox(deltas, boxes, weights=[1.0, 1.0, 1.0, 1.0], max_shape=None):\n    \"\"\"Decode deltas to boxes. Used in RCNNBox,CascadeHead,RCNNHead,RetinaHead.\n    Note: return tensor shape [n,1,4]\n        If you want to add a reshape, please add after the calling code instead of here.\n    \"\"\"\n    clip_scale = math.log(1000.0 / 16)\n\n    widths = boxes[:, 2] - boxes[:, 0]\n    heights = boxes[:, 3] - boxes[:, 1]\n    ctr_x = boxes[:, 0] + 0.5 * widths\n    ctr_y = boxes[:, 1] + 0.5 * heights\n\n    wx, wy, ww, wh = weights\n    dx = deltas[:, 0::4] / wx\n    dy = deltas[:, 1::4] / wy\n    dw = deltas[:, 2::4] / ww\n    dh = deltas[:, 3::4] / wh\n    # Prevent sending too large values into paddle.exp()\n    dw = paddle.clip(dw, max=clip_scale)\n    dh = paddle.clip(dh, max=clip_scale)\n\n    pred_ctr_x = dx * widths.unsqueeze(1) + ctr_x.unsqueeze(1)\n    pred_ctr_y = dy * heights.unsqueeze(1) + ctr_y.unsqueeze(1)\n    pred_w = paddle.exp(dw) * widths.unsqueeze(1)\n    pred_h = paddle.exp(dh) * heights.unsqueeze(1)\n\n    pred_boxes = []\n    pred_boxes.append(pred_ctr_x - 0.5 * pred_w)\n    pred_boxes.append(pred_ctr_y - 0.5 * pred_h)\n    pred_boxes.append(pred_ctr_x + 0.5 * pred_w)\n    pred_boxes.append(pred_ctr_y + 0.5 * pred_h)\n    pred_boxes = paddle.stack(pred_boxes, axis=-1)\n\n    if max_shape is not None:\n        pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip(\n            min=0, max=max_shape[1])\n        pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip(\n            min=0, max=max_shape[0])\n    return pred_boxes\n\n\ndef bbox2delta_v2(src_boxes,\n                  tgt_boxes,\n                  delta_mean=[0.0, 0.0, 0.0, 0.0],\n                  delta_std=[1.0, 1.0, 1.0, 1.0]):\n    \"\"\"Encode bboxes to deltas.\n    Modified from bbox2delta() which just use weight parameters to multiply deltas.\n    \"\"\"\n    src_w = src_boxes[:, 2] - src_boxes[:, 0]\n    src_h = src_boxes[:, 3] - src_boxes[:, 1]\n    src_ctr_x = src_boxes[:, 0] + 0.5 * src_w\n    src_ctr_y = src_boxes[:, 1] + 0.5 * src_h\n\n    tgt_w = 
tgt_boxes[:, 2] - tgt_boxes[:, 0]\n    tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1]\n    tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w\n    tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h\n\n    dx = (tgt_ctr_x - src_ctr_x) / src_w\n    dy = (tgt_ctr_y - src_ctr_y) / src_h\n    dw = paddle.log(tgt_w / src_w)\n    dh = paddle.log(tgt_h / src_h)\n\n    deltas = paddle.stack((dx, dy, dw, dh), axis=1)\n    deltas = (\n        deltas - paddle.to_tensor(delta_mean)) / paddle.to_tensor(delta_std)\n    return deltas\n\n\ndef delta2bbox_v2(deltas,\n                  boxes,\n                  delta_mean=[0.0, 0.0, 0.0, 0.0],\n                  delta_std=[1.0, 1.0, 1.0, 1.0],\n                  max_shape=None,\n                  ctr_clip=32.0):\n    \"\"\"Decode deltas to bboxes.\n    Modified from delta2bbox(): deltas are denormalized with (delta_mean, delta_std)\n    instead of being divided by weights. Used in YOLOFHead.\n    Note: the returned tensor has shape [n, 1, 4]; reshape at the call site if needed.\n    \"\"\"\n    clip_scale = math.log(1000.0 / 16)\n\n    widths = boxes[:, 2] - boxes[:, 0]\n    heights = boxes[:, 3] - boxes[:, 1]\n    ctr_x = boxes[:, 0] + 0.5 * widths\n    ctr_y = boxes[:, 1] + 0.5 * heights\n\n    deltas = deltas * paddle.to_tensor(delta_std) + paddle.to_tensor(delta_mean)\n    dx = deltas[:, 0::4]\n    dy = deltas[:, 1::4]\n    dw = deltas[:, 2::4]\n    dh = deltas[:, 3::4]\n\n    # convert normalized center deltas into pixel offsets\n    dx = dx * widths.unsqueeze(1)\n    dy = dy * heights.unsqueeze(1)\n    # clip centers, and prevent sending too large values into paddle.exp()\n    if ctr_clip is not None:\n        dx = paddle.clip(dx, max=ctr_clip, min=-ctr_clip)\n        dy = paddle.clip(dy, max=ctr_clip, min=-ctr_clip)\n        dw = paddle.clip(dw, max=clip_scale)\n        dh = paddle.clip(dh, max=clip_scale)\n    else:\n        dw = dw.clip(min=-clip_scale, max=clip_scale)\n        dh = dh.clip(min=-clip_scale, max=clip_scale)\n\n    pred_ctr_x = dx + ctr_x.unsqueeze(1)\n    pred_ctr_y = dy + ctr_y.unsqueeze(1)\n    pred_w = paddle.exp(dw) * widths.unsqueeze(1)\n    pred_h = paddle.exp(dh) * heights.unsqueeze(1)\n\n    pred_boxes = []\n    pred_boxes.append(pred_ctr_x - 0.5 * pred_w)\n    pred_boxes.append(pred_ctr_y - 0.5 * pred_h)\n    pred_boxes.append(pred_ctr_x + 0.5 * pred_w)\n    pred_boxes.append(pred_ctr_y + 0.5 * pred_h)\n    pred_boxes = paddle.stack(pred_boxes, axis=-1)\n\n    if max_shape is not None:\n        pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip(\n            min=0, max=max_shape[1])\n        pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip(\n            min=0, max=max_shape[0])\n    return pred_boxes\n\n\ndef expand_bbox(bboxes, scale):\n    w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5\n    h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5\n    x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5\n    y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5\n\n    w_half *= scale\n    h_half *= scale\n\n    bboxes_exp = np.zeros(bboxes.shape, dtype=np.float32)\n    bboxes_exp[:, 0] = x_c - w_half\n    bboxes_exp[:, 2] = x_c + w_half\n    bboxes_exp[:, 1] = y_c - h_half\n    bboxes_exp[:, 3] = y_c + h_half\n\n    return bboxes_exp\n\n\ndef clip_bbox(boxes, im_shape):\n    h, w = im_shape[0], im_shape[1]\n    x1 = boxes[:, 0].clip(0, w)\n    y1 = boxes[:, 1].clip(0, h)\n    x2 = boxes[:, 2].clip(0, w)\n    y2 = boxes[:, 3].clip(0, h)\n    return paddle.stack([x1, y1, x2, y2], axis=1)\n\n\ndef nonempty_bbox(boxes, min_size=0, return_mask=False):\n    w = boxes[:, 2] - boxes[:, 0]\n    h = boxes[:, 3] - boxes[:, 1]\n    mask = paddle.logical_and(h > 
min_size, w > min_size)\n    if return_mask:\n        return mask\n    keep = paddle.nonzero(mask).flatten()\n    return keep\n\n\ndef bbox_area(boxes):\n    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])\n\n\ndef bbox_overlaps(boxes1, boxes2):\n    \"\"\"\n    Calculate overlaps between boxes1 and boxes2\n\n    Args:\n        boxes1 (Tensor): boxes with shape [M, 4]\n        boxes2 (Tensor): boxes with shape [N, 4]\n\n    Return:\n        overlaps (Tensor): overlaps between boxes1 and boxes2 with shape [M, N]\n    \"\"\"\n    M = boxes1.shape[0]\n    N = boxes2.shape[0]\n    if M * N == 0:\n        return paddle.zeros([M, N], dtype='float32')\n    area1 = bbox_area(boxes1)\n    area2 = bbox_area(boxes2)\n\n    xy_max = paddle.minimum(\n        paddle.unsqueeze(boxes1, 1)[:, :, 2:], boxes2[:, 2:])\n    xy_min = paddle.maximum(\n        paddle.unsqueeze(boxes1, 1)[:, :, :2], boxes2[:, :2])\n    width_height = xy_max - xy_min\n    width_height = width_height.clip(min=0)\n    inter = width_height.prod(axis=2)\n\n    overlaps = paddle.where(inter > 0, inter /\n                            (paddle.unsqueeze(area1, 1) + area2 - inter),\n                            paddle.zeros_like(inter))\n    return overlaps\n\n\ndef batch_bbox_overlaps(bboxes1,\n                        bboxes2,\n                        mode='iou',\n                        is_aligned=False,\n                        eps=1e-6):\n    \"\"\"Calculate overlap between two sets of bboxes.\n    If ``is_aligned`` is ``False``, calculate the overlaps between each bbox\n    of bboxes1 and bboxes2; otherwise calculate the overlaps between each\n    aligned pair of bboxes1 and bboxes2.\n    Args:\n        bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.\n        bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.\n            B indicates the batch dim, in shape (B1, B2, ..., Bn).\n            If ``is_aligned`` is ``True``, then m and n must be equal.\n        mode (str): \"iou\" (intersection over union), \"iof\" (intersection over\n            foreground) or \"giou\" (generalized IoU).\n        is_aligned (bool, optional): If True, then m and n must be equal.\n            Default False.\n        eps (float, optional): A value added to the denominator for numerical\n            stability. Default 1e-6.\n    Returns:\n        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,).\n            For mode \"giou\" the returned value is 1 - GIoU, ready to be\n            used as a loss.\n    \"\"\"\n    assert mode in ['iou', 'iof', 'giou'], 'Unsupported mode {}'.format(mode)\n    # Either the boxes are empty or the length of boxes' last dimension is 4\n    assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)\n    assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)\n\n    # Batch dim must be the same\n    # Batch dim: (B1, B2, ... 
Bn)\n    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]\n    batch_shape = bboxes1.shape[:-2]\n\n    rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0\n    cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0\n    if is_aligned:\n        assert rows == cols\n\n    if rows * cols == 0:\n        if is_aligned:\n            return paddle.full(batch_shape + (rows, ), 1)\n        else:\n            return paddle.full(batch_shape + (rows, cols), 1)\n\n    area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])\n    area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1])\n\n    if is_aligned:\n        lt = paddle.maximum(bboxes1[:, :2], bboxes2[:, :2])  # [B, rows, 2]\n        rb = paddle.minimum(bboxes1[:, 2:], bboxes2[:, 2:])  # [B, rows, 2]\n\n        wh = (rb - lt).clip(min=0)  # [B, rows, 2]\n        overlap = wh[:, 0] * wh[:, 1]\n\n        if mode in ['iou', 'giou']:\n            union = area1 + area2 - overlap\n        else:\n            union = area1\n        if mode == 'giou':\n            enclosed_lt = paddle.minimum(bboxes1[:, :2], bboxes2[:, :2])\n            enclosed_rb = paddle.maximum(bboxes1[:, 2:], bboxes2[:, 2:])\n    else:\n        lt = paddle.maximum(bboxes1[:, :2].reshape([rows, 1, 2]),\n                            bboxes2[:, :2])  # [B, rows, cols, 2]\n        rb = paddle.minimum(bboxes1[:, 2:].reshape([rows, 1, 2]),\n                            bboxes2[:, 2:])  # [B, rows, cols, 2]\n\n        wh = (rb - lt).clip(min=0)  # [B, rows, cols, 2]\n        overlap = wh[:, :, 0] * wh[:, :, 1]\n\n        if mode in ['iou', 'giou']:\n            union = area1.reshape([rows,1]) \\\n                    + area2.reshape([1,cols]) - overlap\n        else:\n            union = area1[:, None]\n        if mode == 'giou':\n            enclosed_lt = paddle.minimum(bboxes1[:, :2].reshape([rows, 1, 2]),\n                                         bboxes2[:, :2])\n            enclosed_rb = paddle.maximum(bboxes1[:, 2:].reshape([rows, 1, 2]),\n                                         bboxes2[:, 2:])\n\n    eps = paddle.to_tensor([eps])\n    union = paddle.maximum(union, eps)\n    ious = overlap / union\n    if mode in ['iou', 'iof']:\n        return ious\n    # calculate gious and return 1 - GIoU so it can be used directly as a loss\n    enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)\n    enclose_area = enclose_wh[:, :, 0] * enclose_wh[:, :, 1]\n    enclose_area = paddle.maximum(enclose_area, eps)\n    gious = ious - (enclose_area - union) / enclose_area\n    return 1 - gious\n\n\ndef xywh2xyxy(box):\n    x, y, w, h = box\n    x1 = x - w * 0.5\n    y1 = y - h * 0.5\n    x2 = x + w * 0.5\n    y2 = y + h * 0.5\n    return [x1, y1, x2, y2]\n\n\ndef make_grid(h, w, dtype):\n    yv, xv = paddle.meshgrid([paddle.arange(h), paddle.arange(w)])\n    return paddle.stack((xv, yv), 2).cast(dtype=dtype)\n\n\ndef decode_yolo(box, anchor, downsample_ratio):\n    \"\"\"decode yolo box\n\n    Args:\n        box (list): [x, y, w, h], all have the shape [b, na, h, w, 1]\n        anchor (list): anchor with the shape [na, 2]\n        downsample_ratio (int): downsample ratio, default 32\n\n    Return:\n        box (list): decoded box, [x, y, w, h], all have the shape [b, na, h, w, 1]\n    \"\"\"\n    x, y, w, h = box\n    na, grid_h, grid_w = x.shape[1:4]\n    grid = make_grid(grid_h, grid_w, x.dtype).reshape((1, 1, grid_h, grid_w, 2))\n    x1 = (x + grid[:, :, :, :, 0:1]) / grid_w\n    y1 = (y + grid[:, :, :, :, 1:2]) / grid_h\n\n    anchor = paddle.to_tensor(anchor, dtype=x.dtype)\n    anchor = anchor.reshape((1, na, 1, 1, 2))\n    w1 = paddle.exp(w) * anchor[:, :, :, :, 0:1] / (downsample_ratio * grid_w)\n    h1 = paddle.exp(h) * anchor[:, :, :, :, 1:2] / (downsample_ratio * grid_h)\n\n    return [x1, y1, w1, h1]\n\n\ndef batch_iou_similarity(box1, box2, eps=1e-9):\n    \"\"\"Calculate iou of box1 and box2 in batch\n\n    Args:\n        box1 (Tensor): box with the shape [N, M1, 4]\n        box2 (Tensor): box with the shape [N, M2, 4]\n\n    Return:\n        iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2]\n    \"\"\"\n    box1 = box1.unsqueeze(2)  # [N, M1, 4] -> [N, M1, 1, 4]\n    box2 = box2.unsqueeze(1)  # [N, M2, 4] -> [N, 1, M2, 4]\n    px1y1, px2y2 = box1[:, :, :, 0:2], box1[:, :, :, 2:4]\n    gx1y1, gx2y2 = box2[:, :, :, 0:2], box2[:, :, :, 2:4]\n    x1y1 = paddle.maximum(px1y1, gx1y1)\n    x2y2 = paddle.minimum(px2y2, gx2y2)\n    overlap = (x2y2 - x1y1).clip(0).prod(-1)\n    area1 = (px2y2 - px1y1).clip(0).prod(-1)\n    area2 = (gx2y2 - gx1y1).clip(0).prod(-1)\n    union = area1 + area2 - overlap + eps\n    return overlap / union\n\n\ndef bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9):\n    \"\"\"calculate the iou of box1 and box2\n\n    Args:\n        box1 (list): [x, y, w, h], all have the shape [b, na, h, w, 1]\n        box2 (list): [x, y, w, h], all have the shape [b, na, h, w, 1]\n        giou (bool): whether use giou or not, default False\n        diou (bool): whether use diou or not, default False\n        ciou (bool): whether use ciou or not, default False\n        eps (float): epsilon to avoid divide by zero\n\n    Return:\n        iou (Tensor): iou of box1 and box2, with the shape [b, na, h, w, 1]\n    \"\"\"\n    px1, py1, px2, py2 = box1\n    gx1, gy1, gx2, gy2 = box2\n    x1 = paddle.maximum(px1, gx1)\n    y1 = paddle.maximum(py1, gy1)\n    x2 = paddle.minimum(px2, gx2)\n    y2 = paddle.minimum(py2, gy2)\n\n    overlap = ((x2 - x1).clip(0)) * ((y2 - y1).clip(0))\n\n    area1 = (px2 - px1) * (py2 - py1)\n    area1 = area1.clip(0)\n\n    area2 = (gx2 - gx1) * (gy2 - gy1)\n    area2 = area2.clip(0)\n\n    union = area1 + area2 - overlap + eps\n    iou = overlap / union\n\n    if giou or ciou or diou:\n        # convex w, h\n        cw = paddle.maximum(px2, gx2) - paddle.minimum(px1, gx1)\n        ch = paddle.maximum(py2, gy2) - paddle.minimum(py1, gy1)\n        if giou:\n            c_area = cw * ch + eps\n            return iou - (c_area - union) / c_area\n        else:\n            # convex diagonal squared\n            c2 = cw**2 + ch**2 + eps\n            # center distance\n            rho2 = ((px1 + px2 - gx1 - gx2)**2 + (py1 + py2 - gy1 - gy2)**2) / 4\n            if diou:\n                return iou - rho2 / c2\n            else:\n                w1, h1 = px2 - px1, py2 - py1 + eps\n                w2, h2 = gx2 - gx1, gy2 - gy1 + eps\n                delta = paddle.atan(w1 / h1) - paddle.atan(w2 / h2)\n                v = (4 / math.pi**2) * paddle.pow(delta, 2)\n                alpha = v / (1 + eps - iou + v)\n                alpha.stop_gradient = True\n                return iou - (rho2 / c2 + v * alpha)\n    else:\n        return iou\n\n\ndef bbox_iou_np_expand(box1, box2, x1y1x2y2=True, eps=1e-16):\n    \"\"\"\n    Calculate the iou of box1 and box2 with numpy.\n\n    Args:\n        box1 (ndarray): [N, 4]\n        box2 (ndarray): [M, 4], usually N != M\n        x1y1x2y2 (bool): whether the boxes are in x1y1x2y2 style, default True\n        eps (float): epsilon to avoid divide by zero\n    Return:\n        iou 
(ndarray): iou of box1 and box2, [N, M]\n    \"\"\"\n    N, M = len(box1), len(box2)  # usually N != M\n    if x1y1x2y2:\n        b1_x1, b1_y1 = box1[:, 0], box1[:, 1]\n        b1_x2, b1_y2 = box1[:, 2], box1[:, 3]\n        b2_x1, b2_y1 = box2[:, 0], box2[:, 1]\n        b2_x2, b2_y2 = box2[:, 2], box2[:, 3]\n    else:\n        # cxcywh style\n        # Transform from center and width to exact coordinates\n        b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2\n        b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2\n        b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2\n        b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2\n\n    # get the coordinates of the intersection rectangle\n    inter_rect_x1 = np.zeros((N, M), dtype=np.float32)\n    inter_rect_y1 = np.zeros((N, M), dtype=np.float32)\n    inter_rect_x2 = np.zeros((N, M), dtype=np.float32)\n    inter_rect_y2 = np.zeros((N, M), dtype=np.float32)\n    for i in range(len(box2)):\n        inter_rect_x1[:, i] = np.maximum(b1_x1, b2_x1[i])\n        inter_rect_y1[:, i] = np.maximum(b1_y1, b2_y1[i])\n        inter_rect_x2[:, i] = np.minimum(b1_x2, b2_x2[i])\n        inter_rect_y2[:, i] = np.minimum(b1_y2, b2_y2[i])\n    # Intersection area\n    inter_area = np.maximum(inter_rect_x2 - inter_rect_x1, 0) * np.maximum(\n        inter_rect_y2 - inter_rect_y1, 0)\n    # Union Area\n    b1_area = np.repeat(\n        ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)).reshape(-1, 1), M, axis=-1)\n    b2_area = np.repeat(\n        ((b2_x2 - b2_x1) * (b2_y2 - b2_y1)).reshape(1, -1), N, axis=0)\n\n    ious = inter_area / (b1_area + b2_area - inter_area + eps)\n    return ious\n\n\ndef bbox2distance(points, bbox, max_dis=None, eps=0.1):\n    \"\"\"Encode bounding boxes as distances from points to the four box sides.\n    Args:\n        points (Tensor): Shape (n, 2), [x, y].\n        bbox (Tensor): Shape (n, 4), \"xyxy\" format\n        max_dis (float): Upper bound of the distance.\n        eps (float): a small value to keep targets strictly below max_dis\n    Returns:\n        Tensor: Shape (n, 4), distances (left, top, right, bottom).\n    \"\"\"\n    left = points[:, 0] - bbox[:, 0]\n    top = points[:, 1] - bbox[:, 1]\n    right = bbox[:, 2] - points[:, 0]\n    bottom = bbox[:, 3] - points[:, 1]\n    if max_dis is not None:\n        left = left.clip(min=0, max=max_dis - eps)\n        top = top.clip(min=0, max=max_dis - eps)\n        right = right.clip(min=0, max=max_dis - eps)\n        bottom = bottom.clip(min=0, max=max_dis - eps)\n    return paddle.stack([left, top, right, bottom], -1)\n\n\ndef distance2bbox(points, distance, max_shape=None):\n    \"\"\"Decode distance prediction to bounding box.\n        Args:\n            points (Tensor): Shape (n, 2), [x, y].\n            distance (Tensor): Distance from the given point to 4\n                boundaries (left, top, right, bottom).\n            max_shape (tuple): Shape of the image.\n        Returns:\n            Tensor: Decoded bboxes.\n        \"\"\"\n    x1 = points[:, 0] - distance[:, 0]\n    y1 = points[:, 1] - distance[:, 1]\n    x2 = points[:, 0] + distance[:, 2]\n    y2 = points[:, 1] + distance[:, 3]\n    if max_shape is not None:\n        x1 = x1.clip(min=0, max=max_shape[1])\n        y1 = y1.clip(min=0, max=max_shape[0])\n        x2 = x2.clip(min=0, max=max_shape[1])\n        y2 = y2.clip(min=0, max=max_shape[0])\n    return paddle.stack([x1, y1, x2, y2], -1)\n\n\ndef bbox_center(boxes):\n    \"\"\"Get bbox centers from boxes.\n    Args:\n        boxes (Tensor): 
boxes with shape (..., 4), \"xmin, ymin, xmax, ymax\" format.\n    Returns:\n        Tensor: boxes centers with shape (..., 2), \"cx, cy\" format.\n    \"\"\"\n    boxes_cx = (boxes[..., 0] + boxes[..., 2]) / 2\n    boxes_cy = (boxes[..., 1] + boxes[..., 3]) / 2\n    return paddle.stack([boxes_cx, boxes_cy], axis=-1)\n\n\ndef batch_distance2bbox(points, distance, max_shapes=None):\n    \"\"\"Decode distance prediction to bounding box for batch.\n    Args:\n        points (Tensor): [B, ..., 2], \"xy\" format\n        distance (Tensor): [B, ..., 4], \"ltrb\" format\n        max_shapes (Tensor): [B, 2], \"h,w\" format, Shape of the image.\n    Returns:\n        Tensor: Decoded bboxes, \"x1y1x2y2\" format.\n    \"\"\"\n    lt, rb = paddle.split(distance, 2, -1)\n    # when adding a tensor to a parameter, keep the parameter as the second\n    # operand, hence \"-lt + points\" rather than \"points - lt\"\n    x1y1 = -lt + points\n    x2y2 = rb + points\n    out_bbox = paddle.concat([x1y1, x2y2], -1)\n    if max_shapes is not None:\n        max_shapes = max_shapes.flip(-1).tile([1, 2])\n        delta_dim = out_bbox.ndim - max_shapes.ndim\n        for _ in range(delta_dim):\n            max_shapes.unsqueeze_(1)\n        out_bbox = paddle.where(out_bbox < max_shapes, out_bbox, max_shapes)\n        out_bbox = paddle.where(out_bbox > 0, out_bbox,\n                                paddle.zeros_like(out_bbox))\n    return out_bbox\n\n\ndef iou_similarity(box1, box2, eps=1e-10):\n    \"\"\"Calculate iou of box1 and box2\n\n    Args:\n        box1 (Tensor): box with the shape [M1, 4]\n        box2 (Tensor): box with the shape [M2, 4]\n\n    Return:\n        iou (Tensor): iou between box1 and box2 with the shape [M1, M2]\n    \"\"\"\n    box1 = box1.unsqueeze(1)  # [M1, 4] -> [M1, 1, 4]\n    box2 = box2.unsqueeze(0)  # [M2, 4] -> [1, M2, 4]\n    px1y1, px2y2 = box1[:, :, 0:2], box1[:, :, 2:4]\n    gx1y1, gx2y2 = box2[:, :, 0:2], box2[:, :, 2:4]\n    x1y1 = paddle.maximum(px1y1, gx1y1)\n    x2y2 = paddle.minimum(px2y2, gx2y2)\n    overlap = (x2y2 - x1y1).clip(0).prod(-1)\n    area1 = (px2y2 - px1y1).clip(0).prod(-1)\n    area2 = (gx2y2 - gx1y1).clip(0).prod(-1)\n    union = area1 + area2 - overlap + eps\n    return overlap / union\n"
  },
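  {
    "path": "examples/bbox_delta_roundtrip_sketch.py",
    "content": "# NOTE: hypothetical usage sketch, not part of ppdet. It checks that\n# bbox2delta() and delta2bbox() from ppdet/modeling/bbox_utils.py invert each\n# other for the default weights, and shows the [n, 1, 4] output shape that\n# delta2bbox() documents.\nimport paddle\nfrom ppdet.modeling.bbox_utils import bbox2delta, delta2bbox\n\nsrc = paddle.to_tensor([[10., 10., 50., 50.], [0., 0., 20., 40.]])\ntgt = paddle.to_tensor([[12., 8., 56., 60.], [2., 2., 30., 36.]])\n\ndeltas = bbox2delta(src, tgt)      # [2, 4] encoded (dx, dy, dw, dh)\ndecoded = delta2bbox(deltas, src)  # [2, 1, 4]; reshape at the call site\nprint(paddle.allclose(decoded.squeeze(1), tgt))  # True (up to float error)\n"
  },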
  {
    "path": "ppdet/modeling/clrnet_utils.py",
    "content": "import paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.modeling.initializer import constant_\nfrom paddle.nn.initializer import KaimingNormal\n\n\nclass ConvModule(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size=1,\n                 stride=1,\n                 padding=0,\n                 dilation=1,\n                 groups=1,\n                 bias=False,\n                 norm_type='bn',\n                 wtih_act=True):\n        super(ConvModule, self).__init__()\n        assert norm_type in ['bn', 'sync_bn', 'gn', None]\n        self.with_norm = norm_type is not None\n        self.wtih_act = wtih_act\n        self.conv = nn.Conv2D(\n            in_channels=in_channels,\n            out_channels=out_channels,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=padding,\n            dilation=dilation,\n            groups=groups,\n            bias_attr=bias,\n            weight_attr=KaimingNormal())\n        if self.with_norm:\n            if norm_type == 'bn':\n                self.bn = nn.BatchNorm2D(out_channels)\n            elif norm_type == 'gn':\n                self.bn = nn.GroupNorm(out_channels, out_channels)\n\n        if self.wtih_act:\n            self.act = nn.ReLU()\n\n    def forward(self, inputs):\n        x = self.conv(inputs)\n        if self.with_norm:\n            x = self.bn(x)\n        if self.wtih_act:\n            x = self.act(x)\n        return x\n\n\ndef LinearModule(hidden_dim):\n    return nn.LayerList(\n        [nn.Linear(\n            hidden_dim, hidden_dim, bias_attr=True), nn.ReLU()])\n\n\nclass FeatureResize(nn.Layer):\n    def __init__(self, size=(10, 25)):\n        super(FeatureResize, self).__init__()\n        self.size = size\n\n    def forward(self, x):\n        x = F.interpolate(x, self.size)\n        return x.flatten(2)\n\n\nclass ROIGather(nn.Layer):\n    '''\n    ROIGather module for gather global information\n    Args: \n        in_channels: prior feature channels\n        num_priors: prior numbers we predefined\n        sample_points: the number of sampled points when we extract feature from line\n        fc_hidden_dim: the fc output channel\n        refine_layers: the total number of layers to build refine\n    '''\n\n    def __init__(self,\n                 in_channels,\n                 num_priors,\n                 sample_points,\n                 fc_hidden_dim,\n                 refine_layers,\n                 mid_channels=48):\n        super(ROIGather, self).__init__()\n        self.in_channels = in_channels\n        self.num_priors = num_priors\n        self.f_key = ConvModule(\n            in_channels=self.in_channels,\n            out_channels=self.in_channels,\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            norm_type='bn')\n\n        self.f_query = nn.Sequential(\n            nn.Conv1D(\n                in_channels=num_priors,\n                out_channels=num_priors,\n                kernel_size=1,\n                stride=1,\n                padding=0,\n                groups=num_priors),\n            nn.ReLU(), )\n        self.f_value = nn.Conv2D(\n            in_channels=self.in_channels,\n            out_channels=self.in_channels,\n            kernel_size=1,\n            stride=1,\n            padding=0)\n        self.W = nn.Conv1D(\n            in_channels=num_priors,\n            out_channels=num_priors,\n            
kernel_size=1,\n            stride=1,\n            padding=0,\n            groups=num_priors)\n\n        self.resize = FeatureResize()\n        constant_(self.W.weight, 0)\n        constant_(self.W.bias, 0)\n\n        self.convs = nn.LayerList()\n        self.catconv = nn.LayerList()\n        for i in range(refine_layers):\n            self.convs.append(\n                ConvModule(\n                    in_channels,\n                    mid_channels, (9, 1),\n                    padding=(4, 0),\n                    bias=False,\n                    norm_type='bn'))\n\n            self.catconv.append(\n                ConvModule(\n                    mid_channels * (i + 1),\n                    in_channels, (9, 1),\n                    padding=(4, 0),\n                    bias=False,\n                    norm_type='bn'))\n\n        self.fc = nn.Linear(\n            sample_points * fc_hidden_dim, fc_hidden_dim, bias_attr=True)\n\n        self.fc_norm = nn.LayerNorm(fc_hidden_dim)\n\n    def roi_fea(self, x, layer_index):\n        feats = []\n        for i, feature in enumerate(x):\n            feat_trans = self.convs[i](feature)\n            feats.append(feat_trans)\n        cat_feat = paddle.concat(feats, axis=1)\n        cat_feat = self.catconv[layer_index](cat_feat)\n        return cat_feat\n\n    def forward(self, roi_features, x, layer_index):\n        '''\n        Args:\n            roi_features: prior feature, shape: (Batch * num_priors, prior_feat_channel, sample_point, 1)\n            x: feature map\n            layer_index: currently on which layer to refine\n        Return: \n            roi: prior features with gathered global information, shape: (Batch, num_priors, fc_hidden_dim)\n        '''\n\n        roi = self.roi_fea(roi_features, layer_index)\n        # return roi\n        # print(roi.shape)\n        # return roi\n        bs = x.shape[0]\n        # print(bs)\n        #roi = roi.contiguous().view(bs * self.num_priors, -1)\n        roi = roi.reshape([bs * self.num_priors, -1])\n        # roi = paddle.randn([192,2304])\n        # return roi\n        # print(roi)\n        # print(self.fc)\n        # print(self.fc.weight)\n        roi = self.fc(roi)\n        roi = F.relu(self.fc_norm(roi))\n        # return roi\n        #roi = roi.view(bs, self.num_priors, -1)\n        roi = roi.reshape([bs, self.num_priors, -1])\n        query = roi\n\n        value = self.resize(self.f_value(x))  # (B, C, N) global feature\n        query = self.f_query(\n            query)  # (B, N, 1) sample context feature from prior roi\n        key = self.f_key(x)\n        value = value.transpose(perm=[0, 2, 1])\n        key = self.resize(key)  # (B, C, N) global feature\n        sim_map = paddle.matmul(query, key)\n        sim_map = (self.in_channels**-.5) * sim_map\n        sim_map = F.softmax(sim_map, axis=-1)\n\n        context = paddle.matmul(sim_map, value)\n        context = self.W(context)\n\n        roi = roi + F.dropout(context, p=0.1, training=self.training)\n\n        return roi\n\n\nclass SegDecoder(nn.Layer):\n    '''\n    Optionaly seg decoder\n    '''\n\n    def __init__(self,\n                 image_height,\n                 image_width,\n                 num_class,\n                 prior_feat_channels=64,\n                 refine_layers=3):\n        super().__init__()\n        self.dropout = nn.Dropout2D(0.1)\n        self.conv = nn.Conv2D(prior_feat_channels * refine_layers, num_class, 1)\n        self.image_height = image_height\n        self.image_width = image_width\n\n    def 
forward(self, x):\n        x = self.dropout(x)\n        x = self.conv(x)\n        x = F.interpolate(\n            x,\n            size=[self.image_height, self.image_width],\n            mode='bilinear',\n            align_corners=False)\n        return x\n\n\ndef accuracy(pred, target, topk=1, thresh=None):\n    \"\"\"Calculate accuracy according to the prediction and target.\n\n    Args:\n        pred (paddle.Tensor): The model prediction, shape (N, num_class)\n        target (paddle.Tensor): The target of each prediction, shape (N, )\n        topk (int | tuple[int], optional): If the predictions in ``topk``\n            match the target, the predictions will be regarded as\n            correct ones. Defaults to 1.\n        thresh (float, optional): If not None, predictions with scores under\n            this threshold are considered incorrect. Defaults to None.\n\n    Returns:\n        float | tuple[float]: If the input ``topk`` is a single integer,\n            the function will return a single float as accuracy. If\n            ``topk`` is a tuple containing multiple integers, the\n            function will return a tuple containing accuracies of\n            each ``topk`` number.\n    \"\"\"\n    assert isinstance(topk, (int, tuple))\n    if isinstance(topk, int):\n        topk = (topk, )\n        return_single = True\n    else:\n        return_single = False\n\n    maxk = max(topk)\n    if pred.shape[0] == 0:\n        # paddle.Tensor has no new_tensor(); build the zero accuracies directly\n        accu = [paddle.zeros([1], dtype='float32') for _ in range(len(topk))]\n        return accu[0] if return_single else accu\n    assert pred.ndim == 2 and target.ndim == 1\n    assert pred.shape[0] == target.shape[0]\n    assert maxk <= pred.shape[1], \\\n        f'maxk {maxk} exceeds pred dimension {pred.shape[1]}'\n    pred_value, pred_label = pred.topk(maxk, axis=1)\n    pred_label = pred_label.t()  # transpose to shape (maxk, N)\n    correct = pred_label.equal(target.reshape([1, -1]).expand_as(pred_label))\n    if thresh is not None:\n        # Only prediction values larger than thresh are counted as correct\n        correct = correct & (pred_value > thresh).t()\n    res = []\n    for k in topk:\n        correct_k = correct[:k].reshape([-1]).cast(\"float32\").sum(0,\n                                                                  keepdim=True)\n        correct_k = correct_k * (100.0 / pred.shape[0])\n        res.append(correct_k)\n    return res[0] if return_single else res\n\n\nclass Accuracy(nn.Layer):\n    def __init__(self, topk=(1, ), thresh=None):\n        \"\"\"Module to calculate the accuracy.\n\n        Args:\n            topk (tuple, optional): The criterion used to calculate the\n                accuracy. Defaults to (1,).\n            thresh (float, optional): If not None, predictions with scores\n                under this threshold are considered incorrect. Defaults to None.\n        \"\"\"\n        super().__init__()\n        self.topk = topk\n        self.thresh = thresh\n\n    def forward(self, pred, target):\n        \"\"\"Forward function to calculate accuracy.\n\n        Args:\n            pred (paddle.Tensor): Prediction of models.\n            target (paddle.Tensor): Target for each prediction.\n\n        Returns:\n            tuple[float]: The accuracies under different topk criterions.\n        \"\"\"\n        return accuracy(pred, target, self.topk, self.thresh)\n"
  },
  {
    "path": "ppdet/modeling/cls_utils.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\ndef _get_class_default_kwargs(cls, *args, **kwargs):\n    \"\"\"\n    Get default arguments of a class in dict format, if args and\n    kwargs is specified, it will replace default arguments\n    \"\"\"\n    varnames = cls.__init__.__code__.co_varnames\n    argcount = cls.__init__.__code__.co_argcount\n    keys = varnames[:argcount]\n    assert keys[0] == 'self'\n    keys = keys[1:]\n\n    values = list(cls.__init__.__defaults__)\n    assert len(values) == len(keys)\n\n    if len(args) > 0:\n        for i, arg in enumerate(args):\n            values[i] = arg\n\n    default_kwargs = dict(zip(keys, values))\n\n    if len(kwargs) > 0:\n        for k, v in kwargs.items():\n            default_kwargs[k] = v\n\n    return default_kwargs\n"
  },
  {
    "path": "ppdet/modeling/heads/__init__.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom . import bbox_head\nfrom . import mask_head\nfrom . import yolo_head\nfrom . import roi_extractor\nfrom . import ssd_head\nfrom . import fcos_head\nfrom . import solov2_head\nfrom . import ttf_head\nfrom . import cascade_head\nfrom . import face_head\nfrom . import s2anet_head\nfrom . import keypoint_hrhrnet_head\nfrom . import centernet_head\nfrom . import gfl_head\nfrom . import simota_head\nfrom . import pico_head\nfrom . import detr_head\nfrom . import sparsercnn_head\nfrom . import tood_head\nfrom . import retina_head\nfrom . import ppyoloe_head\nfrom . import fcosr_head\nfrom . import ppyoloe_r_head\nfrom . import yolof_head\nfrom . import ppyoloe_contrast_head\nfrom . import centertrack_head\nfrom . import sparse_roi_head\nfrom . import vitpose_head\nfrom . import clrnet_head\nfrom . import ppyoloe_ins_head\n\nfrom .bbox_head import *\nfrom .mask_head import *\nfrom .yolo_head import *\nfrom .roi_extractor import *\nfrom .ssd_head import *\nfrom .fcos_head import *\nfrom .solov2_head import *\nfrom .ttf_head import *\nfrom .cascade_head import *\nfrom .face_head import *\nfrom .s2anet_head import *\nfrom .keypoint_hrhrnet_head import *\nfrom .centernet_head import *\nfrom .gfl_head import *\nfrom .simota_head import *\nfrom .pico_head import *\nfrom .detr_head import *\nfrom .sparsercnn_head import *\nfrom .tood_head import *\nfrom .retina_head import *\nfrom .ppyoloe_head import *\nfrom .fcosr_head import *\nfrom .ppyoloe_r_head import *\nfrom .yolof_head import *\nfrom .ppyoloe_contrast_head import *\nfrom .centertrack_head import *\nfrom .sparse_roi_head import *\nfrom .petr_head import *\nfrom .vitpose_head import *\nfrom .clrnet_head import *\nfrom .ppyoloe_ins_head import PPYOLOEInsHead\n"
  },
  {
    "path": "ppdet/modeling/heads/bbox_head.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport numpy as np\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import Normal, XavierUniform, KaimingNormal\nfrom paddle.regularizer import L2Decay\n\nfrom ppdet.core.workspace import register, create\nfrom .roi_extractor import RoIAlign\nfrom ..shape_spec import ShapeSpec\nfrom ..bbox_utils import bbox2delta\nfrom ..cls_utils import _get_class_default_kwargs\nfrom ppdet.modeling.layers import ConvNormLayer\n\n__all__ = ['TwoFCHead', 'XConvNormHead', 'BBoxHead']\n\n\n@register\nclass TwoFCHead(nn.Layer):\n    \"\"\"\n    RCNN bbox head with Two fc layers to extract feature\n\n    Args:\n        in_channel (int): Input channel which can be derived by from_config\n        out_channel (int): Output channel\n        resolution (int): Resolution of input feature map, default 7\n    \"\"\"\n\n    def __init__(self, in_channel=256, out_channel=1024, resolution=7):\n        super(TwoFCHead, self).__init__()\n        self.in_channel = in_channel\n        self.out_channel = out_channel\n        fan = in_channel * resolution * resolution\n        self.fc6 = nn.Linear(\n            in_channel * resolution * resolution,\n            out_channel,\n            weight_attr=paddle.ParamAttr(\n                initializer=XavierUniform(fan_out=fan)))\n        self.fc6.skip_quant = True\n\n        self.fc7 = nn.Linear(\n            out_channel,\n            out_channel,\n            weight_attr=paddle.ParamAttr(initializer=XavierUniform()))\n        self.fc7.skip_quant = True\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        s = input_shape\n        s = s[0] if isinstance(s, (list, tuple)) else s\n        return {'in_channel': s.channels}\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=self.out_channel, )]\n\n    def forward(self, rois_feat):\n        rois_feat = paddle.flatten(rois_feat, start_axis=1, stop_axis=-1)\n        fc6 = self.fc6(rois_feat)\n        fc6 = F.relu(fc6)\n        fc7 = self.fc7(fc6)\n        fc7 = F.relu(fc7)\n        return fc7\n\n\n@register\nclass XConvNormHead(nn.Layer):\n    __shared__ = ['norm_type', 'freeze_norm']\n    \"\"\"\n    RCNN bbox head with serveral convolution layers\n\n    Args:\n        in_channel (int): Input channels which can be derived by from_config\n        num_convs (int): The number of conv layers\n        conv_dim (int): The number of channels for the conv layers\n        out_channel (int): Output channels\n        resolution (int): Resolution of input feature map\n        norm_type (string): Norm type, bn, gn, sync_bn are available, \n            default `gn`\n        freeze_norm (bool): Whether to freeze the norm\n        stage_name (string): Prefix name for conv layer,  '' by default\n    \"\"\"\n\n    def __init__(self,\n                 in_channel=256,\n                 
num_convs=4,\n                 conv_dim=256,\n                 out_channel=1024,\n                 resolution=7,\n                 norm_type='gn',\n                 freeze_norm=False,\n                 stage_name=''):\n        super(XConvNormHead, self).__init__()\n        self.in_channel = in_channel\n        self.num_convs = num_convs\n        self.conv_dim = conv_dim\n        self.out_channel = out_channel\n        self.norm_type = norm_type\n        self.freeze_norm = freeze_norm\n\n        self.bbox_head_convs = []\n        fan = conv_dim * 3 * 3\n        initializer = KaimingNormal(fan_in=fan)\n        for i in range(self.num_convs):\n            in_c = in_channel if i == 0 else conv_dim\n            head_conv_name = stage_name + 'bbox_head_conv{}'.format(i)\n            head_conv = self.add_sublayer(\n                head_conv_name,\n                ConvNormLayer(\n                    ch_in=in_c,\n                    ch_out=conv_dim,\n                    filter_size=3,\n                    stride=1,\n                    norm_type=self.norm_type,\n                    freeze_norm=self.freeze_norm,\n                    initializer=initializer))\n            self.bbox_head_convs.append(head_conv)\n\n        fan = conv_dim * resolution * resolution\n        self.fc6 = nn.Linear(\n            conv_dim * resolution * resolution,\n            out_channel,\n            weight_attr=paddle.ParamAttr(\n                initializer=XavierUniform(fan_out=fan)),\n            bias_attr=paddle.ParamAttr(\n                learning_rate=2., regularizer=L2Decay(0.)))\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        s = input_shape\n        s = s[0] if isinstance(s, (list, tuple)) else s\n        return {'in_channel': s.channels}\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=self.out_channel, )]\n\n    def forward(self, rois_feat):\n        for i in range(self.num_convs):\n            rois_feat = F.relu(self.bbox_head_convs[i](rois_feat))\n        rois_feat = paddle.flatten(rois_feat, start_axis=1, stop_axis=-1)\n        fc6 = F.relu(self.fc6(rois_feat))\n        return fc6\n\n\n@register\nclass BBoxHead(nn.Layer):\n    __shared__ = ['num_classes', 'use_cot']\n    __inject__ = ['bbox_assigner', 'bbox_loss', 'loss_cot']\n    \"\"\"\n    RCNN bbox head\n\n    Args:\n        head (nn.Layer): Extract feature in bbox head\n        in_channel (int): Input channel after RoI extractor\n        roi_extractor (object): The module of RoI Extractor\n        bbox_assigner (object): The module of Box Assigner, which labels and samples the\n            boxes.\n        with_pool (bool): Whether to use pooling for the RoI feature.\n        num_classes (int): The number of classes\n        bbox_weight (List[float]): The weight to get the decode box\n        cot_classes (int): The number of base classes\n        loss_cot (object): The module of Label-cotuning\n        use_cot (bool): whether to use Label-cotuning\n    \"\"\"\n\n    def __init__(self,\n                 head,\n                 in_channel,\n                 roi_extractor=_get_class_default_kwargs(RoIAlign),\n                 bbox_assigner='BboxAssigner',\n                 with_pool=False,\n                 num_classes=80,\n                 bbox_weight=[10., 10., 5., 5.],\n                 bbox_loss=None,\n                 loss_normalize_pos=False,\n                 cot_classes=None,\n                 loss_cot='COTLoss',\n                 use_cot=False):\n        super(BBoxHead, self).__init__()\n        
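# roi_extractor may be passed as a plain kwargs dict (see from_config);\n        # in that case it is instantiated as a RoIAlign layer just below\n        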
self.head = head\n        self.roi_extractor = roi_extractor\n        if isinstance(roi_extractor, dict):\n            self.roi_extractor = RoIAlign(**roi_extractor)\n        self.bbox_assigner = bbox_assigner\n\n        self.with_pool = with_pool\n        self.num_classes = num_classes\n        self.bbox_weight = bbox_weight\n        self.bbox_loss = bbox_loss\n        self.loss_normalize_pos = loss_normalize_pos\n\n        self.loss_cot = loss_cot\n        self.cot_relation = None\n        self.cot_classes = cot_classes\n        self.use_cot = use_cot\n        if use_cot:\n            self.cot_bbox_score = nn.Linear(\n                in_channel,\n                self.num_classes + 1,\n                weight_attr=paddle.ParamAttr(initializer=Normal(\n                    mean=0.0, std=0.01)))\n            \n            self.bbox_score = nn.Linear(\n                in_channel,\n                self.cot_classes + 1,\n                weight_attr=paddle.ParamAttr(initializer=Normal(\n                    mean=0.0, std=0.01)))\n            self.cot_bbox_score.skip_quant = True\n        else:\n            self.bbox_score = nn.Linear(\n                in_channel,\n                self.num_classes + 1,\n                weight_attr=paddle.ParamAttr(initializer=Normal(\n                    mean=0.0, std=0.01)))\n        self.bbox_score.skip_quant = True\n\n        self.bbox_delta = nn.Linear(\n            in_channel,\n            4 * self.num_classes,\n            weight_attr=paddle.ParamAttr(initializer=Normal(\n                mean=0.0, std=0.001)))\n        self.bbox_delta.skip_quant = True\n        self.assigned_label = None\n        self.assigned_rois = None\n\n    def init_cot_head(self, relationship):\n        self.cot_relation = relationship\n        \n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        roi_pooler = cfg['roi_extractor']\n        assert isinstance(roi_pooler, dict)\n        kwargs = RoIAlign.from_config(cfg, input_shape)\n        roi_pooler.update(kwargs)\n        kwargs = {'input_shape': input_shape}\n        head = create(cfg['head'], **kwargs)\n        return {\n            'roi_extractor': roi_pooler,\n            'head': head,\n            'in_channel': head.out_shape[0].channels\n        }\n\n    def forward(self, body_feats=None, rois=None, rois_num=None, inputs=None, cot=False):\n        \"\"\"\n        body_feats (list[Tensor]): Feature maps from backbone\n        rois (list[Tensor]): RoIs generated from RPN module\n        rois_num (Tensor): The number of RoIs in each image\n        inputs (dict{Tensor}): The ground-truth of image\n        \"\"\"\n        if self.training:\n            rois, rois_num, targets = self.bbox_assigner(rois, rois_num, inputs)\n            self.assigned_rois = (rois, rois_num)\n            self.assigned_targets = targets\n\n        rois_feat = self.roi_extractor(body_feats, rois, rois_num)\n        bbox_feat = self.head(rois_feat)\n        if self.with_pool:\n            feat = F.adaptive_avg_pool2d(bbox_feat, output_size=1)\n            feat = paddle.squeeze(feat, axis=[2, 3])\n        else:\n            feat = bbox_feat\n        if self.use_cot:\n            scores = self.cot_bbox_score(feat)\n            cot_scores = self.bbox_score(feat)\n        else:\n            scores = self.bbox_score(feat)\n        deltas = self.bbox_delta(feat)\n\n        if self.training:\n            loss = self.get_loss(\n                scores,\n                deltas,\n                targets,\n                rois,\n                
self.bbox_weight,\n                loss_normalize_pos=self.loss_normalize_pos)\n            \n            if self.cot_relation is not None:\n                loss_cot = self.loss_cot(cot_scores, targets, self.cot_relation)\n                loss.update(loss_cot)\n            return loss, bbox_feat\n        else:\n            if cot:\n                pred = self.get_prediction(cot_scores, deltas)\n            else:\n                pred = self.get_prediction(scores, deltas)\n            return pred, self.head\n\n\n    def get_loss(self,\n                 scores,\n                 deltas,\n                 targets,\n                 rois,\n                 bbox_weight,\n                 loss_normalize_pos=False):\n        \"\"\"\n        scores (Tensor): scores from bbox head outputs\n        deltas (Tensor): deltas from bbox head outputs\n        targets (list[List[Tensor]]): bbox targets containing tgt_labels, tgt_bboxes and tgt_gt_inds\n        rois (List[Tensor]): RoIs generated in each batch\n        \"\"\"\n        cls_name = 'loss_bbox_cls'\n        reg_name = 'loss_bbox_reg'\n        loss_bbox = {}\n\n        # TODO: better pass args\n        tgt_labels, tgt_bboxes, tgt_gt_inds = targets\n\n        # bbox cls\n        tgt_labels = paddle.concat(tgt_labels) if len(\n            tgt_labels) > 1 else tgt_labels[0]\n        valid_inds = paddle.nonzero(tgt_labels >= 0).flatten()\n        if valid_inds.shape[0] == 0:\n            loss_bbox[cls_name] = paddle.zeros([1], dtype='float32')\n        else:\n            tgt_labels = tgt_labels.cast('int64')\n            tgt_labels.stop_gradient = True\n\n            if not loss_normalize_pos:\n                loss_bbox_cls = F.cross_entropy(\n                    input=scores, label=tgt_labels, reduction='mean')\n            else:\n                loss_bbox_cls = F.cross_entropy(\n                    input=scores, label=tgt_labels,\n                    reduction='none').sum() / (tgt_labels.shape[0] + 1e-7)\n\n            loss_bbox[cls_name] = loss_bbox_cls\n\n        # bbox reg\n\n        cls_agnostic_bbox_reg = deltas.shape[1] == 4\n\n        fg_inds = paddle.nonzero(\n            paddle.logical_and(tgt_labels >= 0, tgt_labels <\n                               self.num_classes)).flatten()\n\n        if fg_inds.numel() == 0:\n            # loss_bbox[reg_name] = paddle.zeros([1], dtype='float32')\n            loss_bbox[reg_name] = scores.mean() * 0. 
+ deltas.mean() * 0.\n            return loss_bbox\n\n        if cls_agnostic_bbox_reg:\n            reg_delta = paddle.gather(deltas, fg_inds)\n        else:\n            fg_gt_classes = paddle.gather(tgt_labels, fg_inds)\n\n            reg_row_inds = paddle.arange(fg_gt_classes.shape[0]).unsqueeze(1)\n            reg_row_inds = paddle.tile(reg_row_inds, [1, 4]).reshape([-1, 1])\n\n            reg_col_inds = 4 * fg_gt_classes.unsqueeze(1) + paddle.arange(4)\n\n            reg_col_inds = reg_col_inds.reshape([-1, 1])\n            reg_inds = paddle.concat([reg_row_inds, reg_col_inds], axis=1)\n\n            reg_delta = paddle.gather(deltas, fg_inds)\n            reg_delta = paddle.gather_nd(reg_delta, reg_inds).reshape([-1, 4])\n        rois = paddle.concat(rois) if len(rois) > 1 else rois[0]\n        tgt_bboxes = paddle.concat(tgt_bboxes) if len(\n            tgt_bboxes) > 1 else tgt_bboxes[0]\n\n        reg_target = bbox2delta(rois, tgt_bboxes, bbox_weight)\n        reg_target = paddle.gather(reg_target, fg_inds)\n        reg_target.stop_gradient = True\n\n        if self.bbox_loss is not None:\n            reg_delta = self.bbox_transform(reg_delta)\n            reg_target = self.bbox_transform(reg_target)\n\n            if not loss_normalize_pos:\n                loss_bbox_reg = self.bbox_loss(\n                    reg_delta, reg_target).sum() / tgt_labels.shape[0]\n                loss_bbox_reg *= self.num_classes\n\n            else:\n                loss_bbox_reg = self.bbox_loss(\n                    reg_delta, reg_target).sum() / (tgt_labels.shape[0] + 1e-7)\n\n        else:\n            loss_bbox_reg = paddle.abs(reg_delta - reg_target).sum(\n            ) / tgt_labels.shape[0]\n\n        loss_bbox[reg_name] = loss_bbox_reg\n\n        return loss_bbox\n\n    def bbox_transform(self, deltas, weights=[0.1, 0.1, 0.2, 0.2]):\n        wx, wy, ww, wh = weights\n\n        deltas = paddle.reshape(deltas, shape=(0, -1, 4))\n\n        dx = paddle.slice(deltas, axes=[2], starts=[0], ends=[1]) * wx\n        dy = paddle.slice(deltas, axes=[2], starts=[1], ends=[2]) * wy\n        dw = paddle.slice(deltas, axes=[2], starts=[2], ends=[3]) * ww\n        dh = paddle.slice(deltas, axes=[2], starts=[3], ends=[4]) * wh\n\n        dw = paddle.clip(dw, -1.e10, np.log(1000. / 16))\n        dh = paddle.clip(dh, -1.e10, np.log(1000. / 16))\n\n        pred_ctr_x = dx\n        pred_ctr_y = dy\n        pred_w = paddle.exp(dw)\n        pred_h = paddle.exp(dh)\n\n        x1 = pred_ctr_x - 0.5 * pred_w\n        y1 = pred_ctr_y - 0.5 * pred_h\n        x2 = pred_ctr_x + 0.5 * pred_w\n        y2 = pred_ctr_y + 0.5 * pred_h\n\n        x1 = paddle.reshape(x1, shape=(-1, ))\n        y1 = paddle.reshape(y1, shape=(-1, ))\n        x2 = paddle.reshape(x2, shape=(-1, ))\n        y2 = paddle.reshape(y2, shape=(-1, ))\n\n        return paddle.concat([x1, y1, x2, y2])\n\n    def get_prediction(self, score, delta):\n        bbox_prob = F.softmax(score)\n        return delta, bbox_prob\n\n    def get_head(self, ):\n        return self.head\n\n    def get_assigned_targets(self, ):\n        return self.assigned_targets\n\n    def get_assigned_rois(self, ):\n        return self.assigned_rois\n"
  },
  {
    "path": "ppdet/modeling/heads/cascade_head.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import Normal\n\nfrom ppdet.core.workspace import register\nfrom .bbox_head import BBoxHead, TwoFCHead, XConvNormHead\nfrom .roi_extractor import RoIAlign\nfrom ..shape_spec import ShapeSpec\nfrom ..bbox_utils import delta2bbox, clip_bbox, nonempty_bbox\nfrom ..cls_utils import _get_class_default_kwargs\n\n__all__ = ['CascadeTwoFCHead', 'CascadeXConvNormHead', 'CascadeHead']\n\n\n@register\nclass CascadeTwoFCHead(nn.Layer):\n    __shared__ = ['num_cascade_stage']\n    \"\"\"\n    Cascade RCNN bbox head  with Two fc layers to extract feature\n\n    Args:\n        in_channel (int): Input channel which can be derived by from_config\n        out_channel (int): Output channel\n        resolution (int): Resolution of input feature map, default 7\n        num_cascade_stage (int): The number of cascade stage, default 3\n    \"\"\"\n\n    def __init__(self,\n                 in_channel=256,\n                 out_channel=1024,\n                 resolution=7,\n                 num_cascade_stage=3):\n        super(CascadeTwoFCHead, self).__init__()\n\n        self.in_channel = in_channel\n        self.out_channel = out_channel\n\n        self.head_list = []\n        for stage in range(num_cascade_stage):\n            head_per_stage = self.add_sublayer(\n                str(stage), TwoFCHead(in_channel, out_channel, resolution))\n            self.head_list.append(head_per_stage)\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        s = input_shape\n        s = s[0] if isinstance(s, (list, tuple)) else s\n        return {'in_channel': s.channels}\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=self.out_channel, )]\n\n    def forward(self, rois_feat, stage=0):\n        out = self.head_list[stage](rois_feat)\n        return out\n\n\n@register\nclass CascadeXConvNormHead(nn.Layer):\n    __shared__ = ['norm_type', 'freeze_norm', 'num_cascade_stage']\n    \"\"\"\n    Cascade RCNN bbox head with serveral convolution layers\n\n    Args:\n        in_channel (int): Input channels which can be derived by from_config\n        num_convs (int): The number of conv layers\n        conv_dim (int): The number of channels for the conv layers\n        out_channel (int): Output channels\n        resolution (int): Resolution of input feature map\n        norm_type (string): Norm type, bn, gn, sync_bn are available, \n            default `gn`\n        freeze_norm (bool): Whether to freeze the norm\n        num_cascade_stage (int): The number of cascade stage, default 3\n    \"\"\"\n\n    def __init__(self,\n                 in_channel=256,\n                 num_convs=4,\n                 conv_dim=256,\n                 out_channel=1024,\n                 resolution=7,\n                 norm_type='gn',\n            
     freeze_norm=False,\n                 num_cascade_stage=3):\n        super(CascadeXConvNormHead, self).__init__()\n        self.in_channel = in_channel\n        self.out_channel = out_channel\n\n        self.head_list = []\n        for stage in range(num_cascade_stage):\n            head_per_stage = self.add_sublayer(\n                str(stage),\n                XConvNormHead(\n                    in_channel,\n                    num_convs,\n                    conv_dim,\n                    out_channel,\n                    resolution,\n                    norm_type,\n                    freeze_norm,\n                    stage_name='stage{}_'.format(stage)))\n            self.head_list.append(head_per_stage)\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        s = input_shape\n        s = s[0] if isinstance(s, (list, tuple)) else s\n        return {'in_channel': s.channels}\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=self.out_channel, )]\n\n    def forward(self, rois_feat, stage=0):\n        out = self.head_list[stage](rois_feat)\n        return out\n\n\n@register\nclass CascadeHead(BBoxHead):\n    __shared__ = ['num_classes', 'num_cascade_stages']\n    __inject__ = ['bbox_assigner', 'bbox_loss']\n    \"\"\"\n    Cascade RCNN bbox head\n\n    Args:\n        head (nn.Layer): Extract feature in bbox head\n        in_channel (int): Input channel after RoI extractor\n        roi_extractor (object): The module of RoI Extractor\n        bbox_assigner (object): The module of Box Assigner, which labels and samples the\n            boxes.\n        num_classes (int): The number of classes\n        bbox_weight (List[List[float]]): The weight to get the decode box and the\n            length of weight is the number of cascade stages\n        num_cascade_stages (int): The number of stages to refine the box\n    \"\"\"\n\n    def __init__(self,\n                 head,\n                 in_channel,\n                 roi_extractor=_get_class_default_kwargs(RoIAlign),\n                 bbox_assigner='BboxAssigner',\n                 num_classes=80,\n                 bbox_weight=[[10., 10., 5., 5.], [20.0, 20.0, 10.0, 10.0],\n                              [30.0, 30.0, 15.0, 15.0]],\n                 num_cascade_stages=3,\n                 bbox_loss=None,\n                 reg_class_agnostic=True,\n                 stage_loss_weights=None,\n                 loss_normalize_pos=False,\n                 add_gt_as_proposals=[True, False, False]):\n\n        nn.Layer.__init__(self, )\n        self.head = head\n        self.roi_extractor = roi_extractor\n        if isinstance(roi_extractor, dict):\n            self.roi_extractor = RoIAlign(**roi_extractor)\n        self.bbox_assigner = bbox_assigner\n\n        self.num_classes = num_classes\n        self.bbox_weight = bbox_weight\n        self.num_cascade_stages = num_cascade_stages\n        self.bbox_loss = bbox_loss\n        self.stage_loss_weights = [\n            1. 
/ num_cascade_stages for _ in range(num_cascade_stages)\n        ] if stage_loss_weights is None else stage_loss_weights\n        self.add_gt_as_proposals = add_gt_as_proposals\n\n        assert len(\n            self.stage_loss_weights\n        ) == num_cascade_stages, f'stage_loss_weights({len(self.stage_loss_weights)}) does not equal num_cascade_stages({num_cascade_stages})'\n\n        self.reg_class_agnostic = reg_class_agnostic\n        num_bbox_delta = 4 if reg_class_agnostic else 4 * num_classes\n        self.loss_normalize_pos = loss_normalize_pos\n\n        self.bbox_score_list = []\n        self.bbox_delta_list = []\n        for i in range(num_cascade_stages):\n            score_name = 'bbox_score_stage{}'.format(i)\n            delta_name = 'bbox_delta_stage{}'.format(i)\n            bbox_score = self.add_sublayer(\n                score_name,\n                nn.Linear(\n                    in_channel,\n                    self.num_classes + 1,\n                    weight_attr=paddle.ParamAttr(initializer=Normal(\n                        mean=0.0, std=0.01))))\n\n            bbox_delta = self.add_sublayer(\n                delta_name,\n                nn.Linear(\n                    in_channel,\n                    num_bbox_delta,\n                    weight_attr=paddle.ParamAttr(initializer=Normal(\n                        mean=0.0, std=0.001))))\n            self.bbox_score_list.append(bbox_score)\n            self.bbox_delta_list.append(bbox_delta)\n        self.assigned_label = None\n        self.assigned_rois = None\n\n    def forward(self, body_feats=None, rois=None, rois_num=None, inputs=None):\n        \"\"\"\n        body_feats (list[Tensor]): Feature maps from backbone\n        rois (Tensor): RoIs generated from RPN module\n        rois_num (Tensor): The number of RoIs in each image\n        inputs (dict{Tensor}): The ground-truth of image\n        \"\"\"\n        targets = []\n        if self.training:\n            rois, rois_num, targets = self.bbox_assigner(\n                rois,\n                rois_num,\n                inputs,\n                add_gt_as_proposals=self.add_gt_as_proposals[0])\n            targets_list = [targets]\n            self.assigned_rois = (rois, rois_num)\n            self.assigned_targets = targets\n\n        pred_bbox = None\n        head_out_list = []\n        for i in range(self.num_cascade_stages):\n            if i > 0:\n                rois, rois_num = self._get_rois_from_boxes(pred_bbox,\n                                                           inputs['im_shape'])\n                if self.training:\n                    rois, rois_num, targets = self.bbox_assigner(\n                        rois,\n                        rois_num,\n                        inputs,\n                        i,\n                        is_cascade=True,\n                        add_gt_as_proposals=self.add_gt_as_proposals[i])\n                    targets_list.append(targets)\n\n            rois_feat = self.roi_extractor(body_feats, rois, rois_num)\n            bbox_feat = self.head(rois_feat, i)\n            scores = self.bbox_score_list[i](bbox_feat)\n            deltas = self.bbox_delta_list[i](bbox_feat)\n\n            # TODO (lyuwenyu) Is it correct for only one class ?\n            if not self.reg_class_agnostic and i < self.num_cascade_stages - 1:\n                deltas = deltas.reshape([deltas.shape[0], self.num_classes, 4])\n                labels = scores[:, :-1].argmax(axis=-1)\n\n                if self.training:\n                    
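# take the 4 deltas predicted for each RoI's highest-scoring foreground class\n                    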
deltas = deltas[paddle.arange(deltas.shape[0]), labels]\n                else:\n                    deltas = deltas[((deltas + 10000) * F.one_hot(\n                        labels, num_classes=self.num_classes).unsqueeze(-1) != 0\n                                     ).nonzero(as_tuple=True)].reshape(\n                                         [deltas.shape[0], 4])\n\n            head_out_list.append([scores, deltas, rois])\n            pred_bbox = self._get_pred_bbox(deltas, rois, self.bbox_weight[i])\n\n        if self.training:\n            loss = {}\n            for stage, value in enumerate(zip(head_out_list, targets_list)):\n                (scores, deltas, rois), targets = value\n                loss_stage = self.get_loss(\n                    scores,\n                    deltas,\n                    targets,\n                    rois,\n                    self.bbox_weight[stage],\n                    loss_normalize_pos=self.loss_normalize_pos)\n                for k, v in loss_stage.items():\n                    loss[k + \"_stage{}\".format(\n                        stage)] = v * self.stage_loss_weights[stage]\n\n            return loss, bbox_feat\n        else:\n            scores, deltas, self.refined_rois = self.get_prediction(\n                head_out_list)\n            return (deltas, scores), self.head\n\n    def _get_rois_from_boxes(self, boxes, im_shape):\n        rois = []\n        for i, boxes_per_image in enumerate(boxes):\n            clip_box = clip_bbox(boxes_per_image, im_shape[i])\n            if self.training:\n                keep = nonempty_bbox(clip_box)\n                if keep.shape[0] == 0:\n                    keep = paddle.zeros([1], dtype='int32')\n                clip_box = paddle.gather(clip_box, keep)\n            rois.append(clip_box)\n        rois_num = paddle.concat([paddle.shape(r)[0:1] for r in rois])\n        return rois, rois_num\n\n    def _get_pred_bbox(self, deltas, proposals, weights):\n        pred_proposals = paddle.concat(proposals) if len(\n            proposals) > 1 else proposals[0]\n        pred_bbox = delta2bbox(deltas, pred_proposals, weights)\n        pred_bbox = paddle.reshape(pred_bbox, [-1, deltas.shape[-1]])\n        num_prop = []\n        for p in proposals:\n            num_prop.append(p.shape[0])\n\n        # NOTE(dev): num_prop will be tagged as LoDTensorArray because it\n        # depends on batch_size under @to_static. However the argument\n        # num_or_sections in paddle.split does not support LoDTensorArray,\n        # so we use [-1] to replace it if num_prop is not a list. This\n        # modification ensures the correctness of both dynamic and static\n        # graphs.\n        if not isinstance(num_prop, list):\n            num_prop = [-1]\n        return pred_bbox.split(num_prop)\n\n    def get_prediction(self, head_out_list):\n        \"\"\"\n        head_out_list(List[Tensor]): scores, deltas, rois\n        \"\"\"\n        scores_list = [F.softmax(head[0]) for head in head_out_list]\n        scores = paddle.add_n(scores_list) / self.num_cascade_stages\n        # Get deltas and rois from the last stage\n        _, deltas, rois = head_out_list[-1]\n        return scores, deltas, rois\n\n    def get_refined_rois(self, ):\n        return self.refined_rois\n"
  },
  {
    "path": "ppdet/modeling/heads/centernet_head.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport math\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import Constant, Uniform\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling.losses import CTFocalLoss, GIoULoss\n\n\nclass ConvLayer(nn.Layer):\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 kernel_size,\n                 stride=1,\n                 padding=0,\n                 dilation=1,\n                 groups=1,\n                 bias=False):\n        super(ConvLayer, self).__init__()\n        bias_attr = False\n        fan_in = ch_in * kernel_size**2\n        bound = 1 / math.sqrt(fan_in)\n        param_attr = paddle.ParamAttr(initializer=Uniform(-bound, bound))\n        if bias:\n            bias_attr = paddle.ParamAttr(initializer=Constant(0.))\n        self.conv = nn.Conv2D(\n            in_channels=ch_in,\n            out_channels=ch_out,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=padding,\n            dilation=dilation,\n            groups=groups,\n            weight_attr=param_attr,\n            bias_attr=bias_attr)\n\n    def forward(self, inputs):\n        out = self.conv(inputs)\n        return out\n\n\n@register\nclass CenterNetHead(nn.Layer):\n    \"\"\"\n    Args:\n        in_channels (int): the channel number of input to CenterNetHead.\n        num_classes (int): the number of classes, 80 (COCO dataset) by default.\n        head_planes (int): the channel number in all head, 256 by default.\n        prior_bias (float): prior bias in heatmap head, -2.19 by default, -4.6 in CenterTrack\n        regress_ltrb (bool): whether to regress left/top/right/bottom or\n            width/height for a box, True by default.\n        size_loss (str): the type of size regression loss, 'L1' by default, can be 'giou'.\n        loss_weight (dict): the weight of each loss.\n        add_iou (bool): whether to add iou branch, False by default.\n    \"\"\"\n\n    __shared__ = ['num_classes']\n\n    def __init__(self,\n                 in_channels,\n                 num_classes=80,\n                 head_planes=256,\n                 prior_bias=-2.19,\n                 regress_ltrb=True,\n                 size_loss='L1',\n                 loss_weight={\n                     'heatmap': 1.0,\n                     'size': 0.1,\n                     'offset': 1.0,\n                     'iou': 0.0,\n                 },\n                 add_iou=False):\n        super(CenterNetHead, self).__init__()\n        self.regress_ltrb = regress_ltrb\n        self.loss_weight = loss_weight\n        self.add_iou = add_iou\n\n        # heatmap head\n        self.heatmap = nn.Sequential(\n            ConvLayer(\n                in_channels, head_planes, kernel_size=3, padding=1, bias=True),\n            nn.ReLU(),\n            ConvLayer(\n                head_planes,\n   
             num_classes,\n                kernel_size=1,\n                stride=1,\n                padding=0,\n                bias=True))\n        with paddle.no_grad():\n            self.heatmap[2].conv.bias[:] = prior_bias\n\n        # size(ltrb or wh) head\n        self.size = nn.Sequential(\n            ConvLayer(\n                in_channels, head_planes, kernel_size=3, padding=1, bias=True),\n            nn.ReLU(),\n            ConvLayer(\n                head_planes,\n                4 if regress_ltrb else 2,\n                kernel_size=1,\n                stride=1,\n                padding=0,\n                bias=True))\n        self.size_loss = size_loss\n\n        # offset head\n        self.offset = nn.Sequential(\n            ConvLayer(\n                in_channels, head_planes, kernel_size=3, padding=1, bias=True),\n            nn.ReLU(),\n            ConvLayer(\n                head_planes, 2, kernel_size=1, stride=1, padding=0, bias=True))\n\n        # iou head (optional)\n        if self.add_iou and 'iou' in self.loss_weight:\n            self.iou = nn.Sequential(\n                ConvLayer(\n                    in_channels,\n                    head_planes,\n                    kernel_size=3,\n                    padding=1,\n                    bias=True),\n                nn.ReLU(),\n                ConvLayer(\n                    head_planes,\n                    4 if regress_ltrb else 2,\n                    kernel_size=1,\n                    stride=1,\n                    padding=0,\n                    bias=True))\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        if isinstance(input_shape, (list, tuple)):\n            input_shape = input_shape[0]\n        return {'in_channels': input_shape.channels}\n\n    def forward(self, feat, inputs):\n        heatmap = F.sigmoid(self.heatmap(feat))\n        size = self.size(feat)\n        offset = self.offset(feat)\n        head_outs = {'heatmap': heatmap, 'size': size, 'offset': offset}\n        if self.add_iou and 'iou' in self.loss_weight:\n            iou = self.iou(feat)\n            head_outs.update({'iou': iou})\n\n        if self.training:\n            losses = self.get_loss(inputs, self.loss_weight, head_outs)\n            return losses\n        else:\n            return head_outs\n\n    def get_loss(self, inputs, weights, head_outs):\n        # 1.heatmap(hm) head loss: CTFocalLoss\n        heatmap = head_outs['heatmap']\n        heatmap_target = inputs['heatmap']\n        heatmap = paddle.clip(heatmap, 1e-4, 1 - 1e-4)\n        ctfocal_loss = CTFocalLoss()\n        heatmap_loss = ctfocal_loss(heatmap, heatmap_target)\n\n        # 2.size(wh) head loss: L1 loss or GIoU loss\n        size = head_outs['size']\n        index = inputs['index']\n        mask = inputs['index_mask']\n        size = paddle.transpose(size, perm=[0, 2, 3, 1])\n        size_n, _, _, size_c = size.shape\n        size = paddle.reshape(size, shape=[size_n, -1, size_c])\n        index = paddle.unsqueeze(index, 2)\n        batch_inds = list()\n        for i in range(size_n):\n            batch_ind = paddle.full(\n                shape=[1, index.shape[1], 1], fill_value=i, dtype='int64')\n            batch_inds.append(batch_ind)\n        batch_inds = paddle.concat(batch_inds, axis=0)\n        index = paddle.concat(x=[batch_inds, index], axis=2)\n        pos_size = paddle.gather_nd(size, index=index)\n        mask = paddle.unsqueeze(mask, axis=2)\n        size_mask = paddle.expand_as(mask, pos_size)\n        size_mask = 
paddle.cast(size_mask, dtype=pos_size.dtype)\n        pos_num = size_mask.sum()\n        size_mask.stop_gradient = True\n        if self.size_loss == 'L1':\n            if self.regress_ltrb:\n                size_target = inputs['size']\n                # shape: [bs, max_per_img, 4]\n            else:\n                if inputs['size'].shape[-1] == 2:\n                    # inputs['size'] is wh, and regress as wh\n                    # shape: [bs, max_per_img, 2]\n                    size_target = inputs['size']\n                else:\n                    # inputs['size'] is ltrb, but regress as wh\n                    # shape: [bs, max_per_img, 4]\n                    size_target = inputs['size'][:, :, 0:2] + inputs[\n                        'size'][:, :, 2:]\n\n            size_target.stop_gradient = True\n            size_loss = F.l1_loss(\n                pos_size * size_mask, size_target * size_mask, reduction='sum')\n            size_loss = size_loss / (pos_num + 1e-4)\n        elif self.size_loss == 'giou':\n            size_target = inputs['bbox_xys']\n            size_target.stop_gradient = True\n            centers_x = (size_target[:, :, 0:1] + size_target[:, :, 2:3]) / 2.0\n            centers_y = (size_target[:, :, 1:2] + size_target[:, :, 3:4]) / 2.0\n            x1 = centers_x - pos_size[:, :, 0:1]\n            y1 = centers_y - pos_size[:, :, 1:2]\n            x2 = centers_x + pos_size[:, :, 2:3]\n            y2 = centers_y + pos_size[:, :, 3:4]\n            pred_boxes = paddle.concat([x1, y1, x2, y2], axis=-1)\n            giou_loss = GIoULoss(reduction='sum')\n            size_loss = giou_loss(\n                pred_boxes * size_mask,\n                size_target * size_mask,\n                iou_weight=size_mask,\n                loc_reweight=None)\n            size_loss = size_loss / (pos_num + 1e-4)\n\n        # 3.offset(reg) head loss: L1 loss\n        offset = head_outs['offset']\n        offset_target = inputs['offset']\n        offset = paddle.transpose(offset, perm=[0, 2, 3, 1])\n        offset_n, _, _, offset_c = offset.shape\n        offset = paddle.reshape(offset, shape=[offset_n, -1, offset_c])\n        pos_offset = paddle.gather_nd(offset, index=index)\n        offset_mask = paddle.expand_as(mask, pos_offset)\n        offset_mask = paddle.cast(offset_mask, dtype=pos_offset.dtype)\n        pos_num = offset_mask.sum()\n        offset_mask.stop_gradient = True\n        offset_target.stop_gradient = True\n        offset_loss = F.l1_loss(\n            pos_offset * offset_mask,\n            offset_target * offset_mask,\n            reduction='sum')\n        offset_loss = offset_loss / (pos_num + 1e-4)\n\n        # 4.iou head loss: GIoU loss (optional)\n        if self.add_iou and 'iou' in self.loss_weight:\n            iou = head_outs['iou']\n            iou = paddle.transpose(iou, perm=[0, 2, 3, 1])\n            iou_n, _, _, iou_c = iou.shape\n            iou = paddle.reshape(iou, shape=[iou_n, -1, iou_c])\n            pos_iou = paddle.gather_nd(iou, index=index)\n            iou_mask = paddle.expand_as(mask, pos_iou)\n            iou_mask = paddle.cast(iou_mask, dtype=pos_iou.dtype)\n            pos_num = iou_mask.sum()\n            iou_mask.stop_gradient = True\n            gt_bbox_xys = inputs['bbox_xys']\n            gt_bbox_xys.stop_gradient = True\n            centers_x = (gt_bbox_xys[:, :, 0:1] + gt_bbox_xys[:, :, 2:3]) / 2.0\n            centers_y = (gt_bbox_xys[:, :, 1:2] + gt_bbox_xys[:, :, 3:4]) / 2.0\n            x1 = centers_x - pos_size[:, :, 0:1]\n     
       y1 = centers_y - pos_size[:, :, 1:2]\n            x2 = centers_x + pos_size[:, :, 2:3]\n            y2 = centers_y + pos_size[:, :, 3:4]\n            pred_boxes = paddle.concat([x1, y1, x2, y2], axis=-1)\n            giou_loss = GIoULoss(reduction='sum')\n            iou_loss = giou_loss(\n                pred_boxes * iou_mask,\n                gt_bbox_xys * iou_mask,\n                iou_weight=iou_mask,\n                loc_reweight=None)\n            iou_loss = iou_loss / (pos_num + 1e-4)\n\n        losses = {\n            'heatmap_loss': heatmap_loss,\n            'size_loss': size_loss,\n            'offset_loss': offset_loss,\n        }\n        det_loss = weights['heatmap'] * heatmap_loss + weights[\n            'size'] * size_loss + weights['offset'] * offset_loss\n\n        if self.add_iou and 'iou' in self.loss_weight:\n            losses.update({'iou_loss': iou_loss})\n            det_loss += weights['iou'] * iou_loss\n        losses.update({'det_loss': det_loss})\n        return losses\n"
  },
  {
    "path": "ppdet/modeling/heads/centertrack_head.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register\nfrom .centernet_head import ConvLayer\nfrom ..keypoint_utils import get_affine_transform\n\n__all__ = ['CenterTrackHead']\n\n\n@register\nclass CenterTrackHead(nn.Layer):\n    \"\"\"\n    Args:\n        in_channels (int): the channel number of input to CenterNetHead.\n        num_classes (int): the number of classes, 1 (MOT17 dataset) by default.\n        head_planes (int): the channel number in all head, 256 by default.\n        task (str): the type of task for regression, 'tracking' by default.\n        loss_weight (dict): the weight of each loss.\n        add_ltrb_amodal (bool): whether to add ltrb_amodal branch, False by default.\n    \"\"\"\n\n    __shared__ = ['num_classes']\n\n    def __init__(self,\n                 in_channels,\n                 num_classes=1,\n                 head_planes=256,\n                 task='tracking',\n                 loss_weight={\n                     'tracking': 1.0,\n                     'ltrb_amodal': 0.1,\n                 },\n                 add_ltrb_amodal=True):\n        super(CenterTrackHead, self).__init__()\n        self.task = task\n        self.loss_weight = loss_weight\n        self.add_ltrb_amodal = add_ltrb_amodal\n\n        # tracking head\n        self.tracking = nn.Sequential(\n            ConvLayer(\n                in_channels, head_planes, kernel_size=3, padding=1, bias=True),\n            nn.ReLU(),\n            ConvLayer(\n                head_planes, 2, kernel_size=1, stride=1, padding=0, bias=True))\n\n        # ltrb_amodal head\n        if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight:\n            self.ltrb_amodal = nn.Sequential(\n                ConvLayer(\n                    in_channels,\n                    head_planes,\n                    kernel_size=3,\n                    padding=1,\n                    bias=True),\n                nn.ReLU(),\n                ConvLayer(\n                    head_planes,\n                    4,\n                    kernel_size=1,\n                    stride=1,\n                    padding=0,\n                    bias=True))\n\n        # TODO: add more tasks\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        if isinstance(input_shape, (list, tuple)):\n            input_shape = input_shape[0]\n        return {'in_channels': input_shape.channels}\n\n    def forward(self,\n                feat,\n                inputs,\n                bboxes=None,\n                bbox_inds=None,\n                topk_clses=None,\n                topk_ys=None,\n                topk_xs=None):\n        tracking = self.tracking(feat)\n        head_outs = {'tracking': tracking}\n        if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight:\n            ltrb_amodal = self.ltrb_amodal(feat)\n      
      head_outs.update({'ltrb_amodal': ltrb_amodal})\n\n        if self.training:\n            losses = self.get_loss(inputs, self.loss_weight, head_outs)\n            return losses\n        else:\n            ret = self.generic_decode(head_outs, bboxes, bbox_inds, topk_ys,\n                                      topk_xs)\n            return ret\n\n    def get_loss(self, inputs, weights, head_outs):\n        index = inputs['index'].unsqueeze(2)\n        mask = inputs['index_mask'].unsqueeze(2)\n        batch_inds = list()\n        for i in range(head_outs['tracking'].shape[0]):\n            batch_ind = paddle.full(\n                shape=[1, index.shape[1], 1], fill_value=i, dtype='int64')\n            batch_inds.append(batch_ind)\n        batch_inds = paddle.concat(batch_inds, axis=0)\n        index = paddle.concat(x=[batch_inds, index], axis=2)\n\n        # 1.tracking head loss: L1 loss\n        tracking = head_outs['tracking'].transpose([0, 2, 3, 1])\n        tracking_target = inputs['tracking']\n        bs, _, _, c = tracking.shape\n        tracking = tracking.reshape([bs, -1, c])\n        pos_tracking = paddle.gather_nd(tracking, index=index)\n        tracking_mask = paddle.cast(\n            paddle.expand_as(mask, pos_tracking), dtype=pos_tracking.dtype)\n        pos_num = tracking_mask.sum()\n        tracking_mask.stop_gradient = True\n        tracking_target.stop_gradient = True\n        tracking_loss = F.l1_loss(\n            pos_tracking * tracking_mask,\n            tracking_target * tracking_mask,\n            reduction='sum')\n        tracking_loss = tracking_loss / (pos_num + 1e-4)\n\n        # 2.ltrb_amodal head loss (optional): L1 loss\n        if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight:\n            ltrb_amodal = head_outs['ltrb_amodal'].transpose([0, 2, 3, 1])\n            ltrb_amodal_target = inputs['ltrb_amodal']\n            bs, _, _, c = ltrb_amodal.shape\n            ltrb_amodal = ltrb_amodal.reshape([bs, -1, c])\n            pos_ltrb_amodal = paddle.gather_nd(ltrb_amodal, index=index)\n            ltrb_amodal_mask = paddle.cast(\n                paddle.expand_as(mask, pos_ltrb_amodal),\n                dtype=pos_ltrb_amodal.dtype)\n            pos_num = ltrb_amodal_mask.sum()\n            ltrb_amodal_mask.stop_gradient = True\n            ltrb_amodal_target.stop_gradient = True\n            ltrb_amodal_loss = F.l1_loss(\n                pos_ltrb_amodal * ltrb_amodal_mask,\n                ltrb_amodal_target * ltrb_amodal_mask,\n                reduction='sum')\n            ltrb_amodal_loss = ltrb_amodal_loss / (pos_num + 1e-4)\n\n        losses = {'tracking_loss': tracking_loss, }\n        plugin_loss = weights['tracking'] * tracking_loss\n\n        if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight:\n            losses.update({'ltrb_amodal_loss': ltrb_amodal_loss})\n            plugin_loss += weights['ltrb_amodal'] * ltrb_amodal_loss\n        losses.update({'plugin_loss': plugin_loss})\n        return losses\n\n    def generic_decode(self, head_outs, bboxes, bbox_inds, topk_ys, topk_xs):\n        topk_ys = paddle.floor(topk_ys)  # note: flooring gives more accurate integer centers\n        topk_xs = paddle.floor(topk_xs)\n        cts = paddle.concat([topk_xs, topk_ys], 1)\n        ret = {'bboxes': bboxes, 'cts': cts}\n\n        regression_heads = ['tracking']  # todo: add more tasks\n        for head in regression_heads:\n            if head in head_outs:\n                ret[head] = _tranpose_and_gather_feat(head_outs[head],\n                                           
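# gathers each detection's values from the flattened NHWC feature map\n                                           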
           bbox_inds)\n\n        if 'ltrb_amodal' in head_outs:\n            ltrb_amodal = head_outs['ltrb_amodal']\n            ltrb_amodal = _tranpose_and_gather_feat(ltrb_amodal, bbox_inds)\n            bboxes_amodal = paddle.concat(\n                [\n                    topk_xs * 1.0 + ltrb_amodal[..., 0:1],\n                    topk_ys * 1.0 + ltrb_amodal[..., 1:2],\n                    topk_xs * 1.0 + ltrb_amodal[..., 2:3],\n                    topk_ys * 1.0 + ltrb_amodal[..., 3:4]\n                ],\n                axis=1)\n            ret['bboxes'] = paddle.concat([bboxes[:, 0:2], bboxes_amodal], 1)\n            # cls_id, score, x0, y0, x1, y1\n\n        return ret\n\n    def centertrack_post_process(self, dets, meta, out_thresh):\n        if not ('bboxes' in dets):\n            return [{}]\n\n        preds = []\n        c, s = meta['center'].numpy(), meta['scale'].numpy()\n        h, w = meta['out_height'].numpy(), meta['out_width'].numpy()\n        trans = get_affine_transform(\n            center=c[0],\n            input_size=s[0],\n            rot=0,\n            output_size=[w[0], h[0]],\n            shift=(0., 0.),\n            inv=True).astype(np.float32)\n        for i, dets_bbox in enumerate(dets['bboxes']):\n            if dets_bbox[1] < out_thresh:\n                break\n            item = {}\n            item['score'] = dets_bbox[1]\n            item['class'] = int(dets_bbox[0]) + 1\n            item['ct'] = transform_preds_with_trans(\n                dets['cts'][i].reshape([1, 2]), trans).reshape(2)\n\n            if 'tracking' in dets:\n                tracking = transform_preds_with_trans(\n                    (dets['tracking'][i] + dets['cts'][i]).reshape([1, 2]),\n                    trans).reshape(2)\n                item['tracking'] = tracking - item['ct']\n\n            if 'bboxes' in dets:\n                bbox = transform_preds_with_trans(\n                    dets_bbox[2:6].reshape([2, 2]), trans).reshape(4)\n                item['bbox'] = bbox\n\n            preds.append(item)\n        return preds\n\n\ndef transform_preds_with_trans(coords, trans):\n    target_coords = np.ones((coords.shape[0], 3), np.float32)\n    target_coords[:, :2] = coords\n    target_coords = np.dot(trans, target_coords.transpose()).transpose()\n    return target_coords[:, :2]\n\n\ndef _tranpose_and_gather_feat(feat, bbox_inds):\n    feat = feat.transpose([0, 2, 3, 1])\n    feat = feat.reshape([-1, feat.shape[3]])\n    feat = paddle.gather(feat, bbox_inds)\n    return feat\n"
  },
  {
    "path": "ppdet/modeling/heads/clrnet_head.py",
    "content": "import math\nimport paddle\nimport numpy as np\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register\n\nfrom ppdet.modeling.initializer import normal_\nfrom ppdet.modeling.lane_utils import Lane\nfrom ppdet.modeling.losses import line_iou\nfrom ppdet.modeling.clrnet_utils import ROIGather, LinearModule, SegDecoder\n\n__all__ = ['CLRHead']\n\n\n@register\nclass CLRHead(nn.Layer):\n    __inject__ = ['loss']\n    __shared__ = [\n        'img_w', 'img_h', 'ori_img_h', 'num_classes', 'cut_height',\n        'num_points', \"max_lanes\"\n    ]\n\n    def __init__(self,\n                 num_points=72,\n                 prior_feat_channels=64,\n                 fc_hidden_dim=64,\n                 num_priors=192,\n                 img_w=800,\n                 img_h=320,\n                 ori_img_h=590,\n                 cut_height=270,\n                 num_classes=5,\n                 num_fc=2,\n                 refine_layers=3,\n                 sample_points=36,\n                 conf_threshold=0.4,\n                 nms_thres=0.5,\n                 max_lanes=4,\n                 loss='CLRNetLoss'):\n        super(CLRHead, self).__init__()\n        self.img_w = img_w\n        self.img_h = img_h\n        self.n_strips = num_points - 1\n        self.n_offsets = num_points\n        self.num_priors = num_priors\n        self.sample_points = sample_points\n        self.refine_layers = refine_layers\n        self.num_classes = num_classes\n        self.fc_hidden_dim = fc_hidden_dim\n        self.ori_img_h = ori_img_h\n        self.cut_height = cut_height\n        self.conf_threshold = conf_threshold\n        self.nms_thres = nms_thres\n        self.max_lanes = max_lanes\n        self.prior_feat_channels = prior_feat_channels\n        self.loss = loss\n        self.register_buffer(\n            name='sample_x_indexs',\n            tensor=(paddle.linspace(\n                start=0, stop=1, num=self.sample_points,\n                dtype=paddle.float32) * self.n_strips).astype(dtype='int64'))\n        self.register_buffer(\n            name='prior_feat_ys',\n            tensor=paddle.flip(\n                x=(1 - self.sample_x_indexs.astype('float32') / self.n_strips),\n                axis=[-1]))\n        self.register_buffer(\n            name='prior_ys',\n            tensor=paddle.linspace(\n                start=1, stop=0, num=self.n_offsets).astype('float32'))\n        self.prior_feat_channels = prior_feat_channels\n        self._init_prior_embeddings()\n        init_priors, priors_on_featmap = self.generate_priors_from_embeddings()\n        self.register_buffer(name='priors', tensor=init_priors)\n        self.register_buffer(name='priors_on_featmap', tensor=priors_on_featmap)\n        self.seg_decoder = SegDecoder(self.img_h, self.img_w, self.num_classes,\n                                      self.prior_feat_channels,\n                                      self.refine_layers)\n        reg_modules = list()\n        cls_modules = list()\n        for _ in range(num_fc):\n            reg_modules += [*LinearModule(self.fc_hidden_dim)]\n            cls_modules += [*LinearModule(self.fc_hidden_dim)]\n        self.reg_modules = nn.LayerList(sublayers=reg_modules)\n        self.cls_modules = nn.LayerList(sublayers=cls_modules)\n        self.roi_gather = ROIGather(self.prior_feat_channels, self.num_priors,\n                                    self.sample_points, self.fc_hidden_dim,\n                                    self.refine_layers)\n        
        self.reg_layers = nn.Linear(\n            in_features=self.fc_hidden_dim,\n            out_features=self.n_offsets + 1 + 2 + 1,\n            bias_attr=True)\n        self.cls_layers = nn.Linear(\n            in_features=self.fc_hidden_dim, out_features=2, bias_attr=True)\n        self.init_weights()\n\n    def init_weights(self):\n        for m in self.cls_layers.parameters():\n            normal_(m, mean=0.0, std=0.001)\n        for m in self.reg_layers.parameters():\n            normal_(m, mean=0.0, std=0.001)\n\n    def pool_prior_features(self, batch_features, num_priors, prior_xs):\n        \"\"\"\n        Pool prior features from the feature map.\n        Args:\n            batch_features (Tensor): Input feature maps, shape: (B, C, H, W)\n            num_priors (int): Number of lane priors.\n            prior_xs (Tensor): Normalized x coordinates of the sample points\n                of each prior.\n        \"\"\"\n        batch_size = batch_features.shape[0]\n        prior_xs = prior_xs.reshape([batch_size, num_priors, -1, 1])\n\n        prior_ys = self.prior_feat_ys.tile(repeat_times=[\n            batch_size * num_priors\n        ]).reshape([batch_size, num_priors, -1, 1])\n        prior_xs = prior_xs * 2.0 - 1.0\n        prior_ys = prior_ys * 2.0 - 1.0\n        grid = paddle.concat(x=(prior_xs, prior_ys), axis=-1)\n        feature = F.grid_sample(\n            x=batch_features, grid=grid,\n            align_corners=True).transpose(perm=[0, 2, 1, 3])\n        feature = feature.reshape([\n            batch_size * num_priors, self.prior_feat_channels,\n            self.sample_points, 1\n        ])\n        return feature\n\n    def generate_priors_from_embeddings(self):\n        predictions = self.prior_embeddings.weight\n        # Prior layout: 2 scores, 1 start_y, 1 start_x, 1 theta, 1 length,\n        # 72 x coordinates; score[0] = negative prob, score[1] = positive prob\n        priors = paddle.zeros(\n            (self.num_priors, 2 + 2 + 2 + self.n_offsets),\n            dtype=predictions.dtype)\n        priors[:, 2:5] = predictions.clone()\n        priors[:, 6:] = (\n            priors[:, 3].unsqueeze(1).clone().tile([1, self.n_offsets]) *\n            (self.img_w - 1) +\n            ((1 - self.prior_ys.tile([self.num_priors, 1]) -\n              priors[:, 2].unsqueeze(1).clone().tile([1, self.n_offsets])) *\n             self.img_h / paddle.tan(x=priors[:, 4].unsqueeze(1).clone().tile(\n                 [1, self.n_offsets]) * math.pi + 1e-05))) / (self.img_w - 1)\n        priors_on_featmap = paddle.index_select(\n            priors, 6 + self.sample_x_indexs, axis=-1)\n        return priors, priors_on_featmap\n\n    def _init_prior_embeddings(self):\n        self.prior_embeddings = nn.Embedding(self.num_priors, 3)\n        bottom_priors_nums = self.num_priors * 3 // 4\n        left_priors_nums, _ = self.num_priors // 8, self.num_priors // 8\n        strip_size = 0.5 / (left_priors_nums // 2 - 1)\n        bottom_strip_size = 1 / (bottom_priors_nums // 4 + 1)\n\n        with paddle.no_grad():\n            for i in range(left_priors_nums):\n                self.prior_embeddings.weight[i, 0] = i // 2 * strip_size\n                self.prior_embeddings.weight[i, 1] = 0.0\n                self.prior_embeddings.weight[i,\n                                             2] = 0.16 if i % 2 == 0 else 0.32\n\n            for i in range(left_priors_nums,\n                           left_priors_nums + bottom_priors_nums):\n                self.prior_embeddings.weight[i, 0] = 0.0\n                self.prior_embeddings.weight[i, 1] = (\n                    (i - left_priors_nums) // 4 + 1) * bottom_strip_size\n                self.prior_embeddings.weight[i, 2] = 0.2 * (i % 4 + 
1)\n\n            for i in range(left_priors_nums + bottom_priors_nums,\n                           self.num_priors):\n                self.prior_embeddings.weight[i, 0] = (\n                    i - left_priors_nums - bottom_priors_nums) // 2 * strip_size\n                self.prior_embeddings.weight[i, 1] = 1.0\n                self.prior_embeddings.weight[i,\n                                             2] = 0.68 if i % 2 == 0 else 0.84\n\n    def forward(self, x, inputs=None):\n        \"\"\"\n        Take pyramid features as input to perform Cross Layer Refinement and finally output the prediction lanes.\n        Each feature is a 4D tensor.\n        Args:\n            x: input features (list[Tensor])\n        Return:\n            prediction_list: each layer's prediction result\n            seg: segmentation result for auxiliary loss\n        \"\"\"\n        batch_features = list(x[len(x) - self.refine_layers:])\n        batch_features.reverse()\n        batch_size = batch_features[-1].shape[0]\n\n        if self.training:\n            self.priors, self.priors_on_featmap = self.generate_priors_from_embeddings(\n            )\n        priors, priors_on_featmap = self.priors.tile(\n            [batch_size, 1,\n             1]), self.priors_on_featmap.tile([batch_size, 1, 1])\n        predictions_lists = []\n        prior_features_stages = []\n\n        for stage in range(self.refine_layers):\n            num_priors = priors_on_featmap.shape[1]\n            prior_xs = paddle.flip(x=priors_on_featmap, axis=[2])\n            batch_prior_features = self.pool_prior_features(\n                batch_features[stage], num_priors, prior_xs)\n            prior_features_stages.append(batch_prior_features)\n\n            fc_features = self.roi_gather(prior_features_stages,\n                                          batch_features[stage], stage)\n            # return fc_features\n            fc_features = fc_features.reshape(\n                [num_priors, batch_size, -1]).reshape(\n                    [batch_size * num_priors, self.fc_hidden_dim])\n            cls_features = fc_features.clone()\n            reg_features = fc_features.clone()\n\n            for cls_layer in self.cls_modules:\n                cls_features = cls_layer(cls_features)\n\n            # return cls_features\n            for reg_layer in self.reg_modules:\n                reg_features = reg_layer(reg_features)\n            cls_logits = self.cls_layers(cls_features)\n            reg = self.reg_layers(reg_features)\n\n            cls_logits = cls_logits.reshape(\n                [batch_size, -1, cls_logits.shape[1]])\n            reg = reg.reshape([batch_size, -1, reg.shape[1]])\n            predictions = priors.clone()\n            predictions[:, :, :2] = cls_logits\n            predictions[:, :, 2:5] += reg[:, :, :3]\n            predictions[:, :, 5] = reg[:, :, 3]\n\n            def tran_tensor(t):\n                return t.unsqueeze(axis=2).clone().tile([1, 1, self.n_offsets])\n\n            predictions[..., 6:] = (\n                tran_tensor(predictions[..., 3]) * (self.img_w - 1) +\n                ((1 - self.prior_ys.tile([batch_size, num_priors, 1]) -\n                  tran_tensor(predictions[..., 2])) * self.img_h / paddle.tan(\n                      tran_tensor(predictions[..., 4]) * math.pi + 1e-05))) / (\n                          self.img_w - 1)\n\n            prediction_lines = predictions.clone()\n            predictions[..., 6:] += reg[..., 4:]\n            predictions_lists.append(predictions)\n\n            
if stage != self.refine_layers - 1:\n                priors = prediction_lines.detach().clone()\n                priors_on_featmap = priors.index_select(\n                    6 + self.sample_x_indexs, axis=-1)\n\n        if self.training:\n            seg_features = paddle.concat(\n                [\n                    F.interpolate(\n                        feature,\n                        size=[\n                            batch_features[-1].shape[2],\n                            batch_features[-1].shape[3]\n                        ],\n                        mode='bilinear',\n                        align_corners=False) for feature in batch_features\n                ],\n                axis=1)\n\n            seg = self.seg_decoder(seg_features)\n\n            output = {'predictions_lists': predictions_lists, 'seg': seg}\n            return self.loss(output, inputs)\n        return predictions_lists[-1]\n\n    def predictions_to_pred(self, predictions):\n        \"\"\"\n        Convert predictions to the internal Lane structure for evaluation.\n        \"\"\"\n        self.prior_ys = paddle.to_tensor(self.prior_ys)\n        self.prior_ys = self.prior_ys.astype('float64')\n        lanes = []\n        for lane in predictions:\n            lane_xs = lane[6:].clone()\n            start = min(\n                max(0, int(round(lane[2].item() * self.n_strips))),\n                self.n_strips)\n            length = int(round(lane[5].item()))\n            end = start + length - 1\n            end = min(end, len(self.prior_ys) - 1)\n            if start > 0:\n                mask = ((lane_xs[:start] >= 0.) &\n                        (lane_xs[:start] <= 1.)).cpu().detach().numpy()[::-1]\n                mask = ~((mask.cumprod()[::-1]).astype(np.bool_))\n                lane_xs[:start][mask] = -2\n            if end < len(self.prior_ys) - 1:\n                lane_xs[end + 1:] = -2\n\n            lane_ys = self.prior_ys[lane_xs >= 0].clone()\n            lane_xs = lane_xs[lane_xs >= 0]\n            lane_xs = lane_xs.flip(axis=0).astype('float64')\n            lane_ys = lane_ys.flip(axis=0)\n\n            lane_ys = (lane_ys *\n                       (self.ori_img_h - self.cut_height) + self.cut_height\n                       ) / self.ori_img_h\n            if len(lane_xs) <= 1:\n                continue\n            points = paddle.stack(\n                x=(lane_xs.reshape([-1, 1]), lane_ys.reshape([-1, 1])),\n                axis=1).squeeze(axis=2)\n            lane = Lane(\n                points=points.cpu().numpy(),\n                metadata={\n                    'start_x': lane[3],\n                    'start_y': lane[2],\n                    'conf': lane[1]\n                })\n            lanes.append(lane)\n        return lanes\n\n    def lane_nms(self, predictions, scores, nms_overlap_thresh, top_k):\n        \"\"\"\n        NMS for lane detection.\n        predictions: paddle.Tensor [num_lanes, conf, y, x, length, 72 offsets], e.g. [12, 77]\n        scores: paddle.Tensor [num_lanes]\n        nms_overlap_thresh: float\n        top_k: int\n        \"\"\"\n        # sort by scores to get idx\n        idx = scores.argsort(descending=True)\n        keep = []\n\n        candidates = predictions.clone()\n        candidates = candidates.index_select(idx)\n\n        while len(candidates) > 0:\n            keep.append(idx[0])\n            if len(keep) >= top_k or len(candidates) == 1:\n                break\n\n            ious = []\n            for i in range(1, len(candidates)):\n
                ious.append(1 - line_iou(\n                    candidates[i].unsqueeze(0),\n                    candidates[0].unsqueeze(0),\n                    img_w=self.img_w,\n                    length=15))\n            ious = paddle.to_tensor(ious)\n\n            mask = ious <= nms_overlap_thresh\n            keep_ids = paddle.where(paddle.logical_not(mask))[0]\n\n            if keep_ids.shape[0] == 0:\n                break\n            candidates = candidates[1:].index_select(keep_ids)\n            idx = idx[1:].index_select(keep_ids)\n        keep = paddle.stack(keep)\n\n        return keep\n\n    def get_lanes(self, output, as_lanes=True):\n        \"\"\"\n        Convert model output to lanes.\n        \"\"\"\n        softmax = nn.Softmax(axis=1)\n        decoded = []\n\n        for predictions in output:\n            threshold = self.conf_threshold\n            scores = softmax(predictions[:, :2])[:, 1]\n            keep_inds = scores >= threshold\n            predictions = predictions[keep_inds]\n            scores = scores[keep_inds]\n\n            if predictions.shape[0] == 0:\n                decoded.append([])\n                continue\n            nms_predictions = predictions.detach().clone()\n            nms_predictions = paddle.concat(\n                x=[nms_predictions[..., :4], nms_predictions[..., 5:]], axis=-1)\n\n            nms_predictions[..., 4] = nms_predictions[..., 4] * self.n_strips\n            nms_predictions[..., 5:] = nms_predictions[..., 5:] * (\n                self.img_w - 1)\n\n            keep = self.lane_nms(\n                nms_predictions[..., 5:],\n                scores,\n                nms_overlap_thresh=self.nms_thres,\n                top_k=self.max_lanes)\n\n            predictions = predictions.index_select(keep)\n\n            if predictions.shape[0] == 0:\n                decoded.append([])\n                continue\n            predictions[:, 5] = paddle.round(predictions[:, 5] * self.n_strips)\n            if as_lanes:\n                pred = self.predictions_to_pred(predictions)\n            else:\n                pred = predictions\n            decoded.append(pred)\n        return decoded\n
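\n\n# A minimal standalone sketch (not part of CLRHead; defaults assumed from the\n# config above) of the prior geometry in generate_priors_from_embeddings: given\n# a normalized start point (start_y, start_x) and an angle theta, walk along\n# the ray and sample n_offsets normalized x coordinates, one per prior_ys row.\ndef demo_prior_xs(start_y, start_x, theta, n_offsets=72, img_w=800, img_h=320):\n    prior_ys = paddle.linspace(start=1, stop=0, num=n_offsets).astype('float32')\n    theta = paddle.to_tensor(theta, dtype='float32')\n    return (start_x * (img_w - 1) +\n            (1 - prior_ys - start_y) * img_h /\n            paddle.tan(theta * math.pi + 1e-05)) / (img_w - 1)\n\n\nif __name__ == '__main__':\n    # a prior starting at the bottom-left corner, tilted by 0.16 * pi\n    print(demo_prior_xs(start_y=0.0, start_x=0.0, theta=0.16).shape)  # [72]\n"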
  },
  {
    "path": "ppdet/modeling/heads/detr_head.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register\nimport pycocotools.mask as mask_util\nfrom ..initializer import linear_init_, constant_\nfrom ..transformers.utils import inverse_sigmoid\n\n__all__ = ['DETRHead', 'DeformableDETRHead', 'DINOHead', 'MaskDINOHead', 'DINOv3Head']\n\n\nclass MLP(nn.Layer):\n    \"\"\"This code is based on\n        https://github.com/facebookresearch/detr/blob/main/models/detr.py\n    \"\"\"\n\n    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):\n        super().__init__()\n        self.num_layers = num_layers\n        h = [hidden_dim] * (num_layers - 1)\n        self.layers = nn.LayerList(\n            nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        for l in self.layers:\n            linear_init_(l)\n\n    def forward(self, x):\n        for i, layer in enumerate(self.layers):\n            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)\n        return x\n\n\nclass MultiHeadAttentionMap(nn.Layer):\n    \"\"\"This code is based on\n        https://github.com/facebookresearch/detr/blob/main/models/segmentation.py\n\n        This is a 2D attention module, which only returns the attention softmax (no multiplication by value)\n    \"\"\"\n\n    def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0,\n                 bias=True):\n        super().__init__()\n        self.num_heads = num_heads\n        self.hidden_dim = hidden_dim\n        self.dropout = nn.Dropout(dropout)\n\n        weight_attr = paddle.ParamAttr(\n            initializer=paddle.nn.initializer.XavierUniform())\n        bias_attr = paddle.framework.ParamAttr(\n            initializer=paddle.nn.initializer.Constant()) if bias else False\n\n        self.q_proj = nn.Linear(query_dim, hidden_dim, weight_attr, bias_attr)\n        self.k_proj = nn.Conv2D(\n            query_dim,\n            hidden_dim,\n            1,\n            weight_attr=weight_attr,\n            bias_attr=bias_attr)\n\n        self.normalize_fact = float(hidden_dim / self.num_heads)**-0.5\n\n    def forward(self, q, k, mask=None):\n        q = self.q_proj(q)\n        k = self.k_proj(k)\n        bs, num_queries, n, c, h, w = q.shape[0], q.shape[1], self.num_heads,\\\n                                      self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]\n        qh = q.reshape([bs, num_queries, n, c])\n        kh = k.reshape([bs, n, c, h, w])\n        # weights = paddle.einsum(\"bqnc,bnchw->bqnhw\", qh * self.normalize_fact, kh)\n        qh = qh.transpose([0, 2, 1, 3]).reshape([-1, num_queries, c])\n        kh = kh.reshape([-1, c, h * w])\n        weights = paddle.bmm(qh * 
self.normalize_fact, kh).reshape(\n            [bs, n, num_queries, h, w]).transpose([0, 2, 1, 3, 4])\n\n        if mask is not None:\n            weights += mask\n        # fix a potential bug: https://github.com/facebookresearch/detr/issues/247\n        weights = F.softmax(weights.flatten(3), axis=-1).reshape(weights.shape)\n        weights = self.dropout(weights)\n        return weights\n\n\nclass MaskHeadFPNConv(nn.Layer):\n    \"\"\"This code is based on\n        https://github.com/facebookresearch/detr/blob/main/models/segmentation.py\n\n        Simple convolutional head, using group norm.\n        Upsampling is done using an FPN approach.\n    \"\"\"\n\n    def __init__(self, input_dim, fpn_dims, context_dim, num_groups=8):\n        super().__init__()\n\n        inter_dims = [input_dim,\n                      ] + [context_dim // (2**i) for i in range(1, 5)]\n        weight_attr = paddle.ParamAttr(\n            initializer=paddle.nn.initializer.KaimingUniform())\n        bias_attr = paddle.framework.ParamAttr(\n            initializer=paddle.nn.initializer.Constant())\n\n        self.conv0 = self._make_layers(input_dim, input_dim, 3, num_groups,\n                                       weight_attr, bias_attr)\n        self.conv_inter = nn.LayerList()\n        for in_dims, out_dims in zip(inter_dims[:-1], inter_dims[1:]):\n            self.conv_inter.append(\n                self._make_layers(in_dims, out_dims, 3, num_groups, weight_attr,\n                                  bias_attr))\n\n        self.conv_out = nn.Conv2D(\n            inter_dims[-1],\n            1,\n            3,\n            padding=1,\n            weight_attr=weight_attr,\n            bias_attr=bias_attr)\n\n        self.adapter = nn.LayerList()\n        for i in range(len(fpn_dims)):\n            self.adapter.append(\n                nn.Conv2D(\n                    fpn_dims[i],\n                    inter_dims[i + 1],\n                    1,\n                    weight_attr=weight_attr,\n                    bias_attr=bias_attr))\n\n    def _make_layers(self,\n                     in_dims,\n                     out_dims,\n                     kernel_size,\n                     num_groups,\n                     weight_attr=None,\n                     bias_attr=None):\n        return nn.Sequential(\n            nn.Conv2D(\n                in_dims,\n                out_dims,\n                kernel_size,\n                padding=kernel_size // 2,\n                weight_attr=weight_attr,\n                bias_attr=bias_attr),\n            nn.GroupNorm(num_groups, out_dims),\n            nn.ReLU())\n\n    def forward(self, x, bbox_attention_map, fpns):\n        x = paddle.concat([\n            x.tile([bbox_attention_map.shape[1], 1, 1, 1]),\n            bbox_attention_map.flatten(0, 1)\n        ], 1)\n        x = self.conv0(x)\n        for inter_layer, adapter_layer, feat in zip(self.conv_inter[:-1],\n                                                    self.adapter, fpns):\n            feat = adapter_layer(feat).tile(\n                [bbox_attention_map.shape[1], 1, 1, 1])\n            x = inter_layer(x)\n            x = feat + F.interpolate(x, size=feat.shape[-2:])\n\n        x = self.conv_inter[-1](x)\n        x = self.conv_out(x)\n        return x\n\n\n@register\nclass DETRHead(nn.Layer):\n    __shared__ = ['num_classes', 'hidden_dim', 'use_focal_loss']\n    __inject__ = ['loss']\n\n    def __init__(self,\n                 num_classes=80,\n                 hidden_dim=256,\n                 nhead=8,\n                 
num_mlp_layers=3,\n                 loss='DETRLoss',\n                 fpn_dims=[1024, 512, 256],\n                 with_mask_head=False,\n                 use_focal_loss=False):\n        super(DETRHead, self).__init__()\n        # add background class\n        self.num_classes = num_classes if use_focal_loss else num_classes + 1\n        self.hidden_dim = hidden_dim\n        self.loss = loss\n        self.with_mask_head = with_mask_head\n        self.use_focal_loss = use_focal_loss\n\n        self.score_head = nn.Linear(hidden_dim, self.num_classes)\n        self.bbox_head = MLP(hidden_dim,\n                             hidden_dim,\n                             output_dim=4,\n                             num_layers=num_mlp_layers)\n        if self.with_mask_head:\n            self.bbox_attention = MultiHeadAttentionMap(hidden_dim, hidden_dim,\n                                                        nhead)\n            self.mask_head = MaskHeadFPNConv(hidden_dim + nhead, fpn_dims,\n                                             hidden_dim)\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        linear_init_(self.score_head)\n\n    @classmethod\n    def from_config(cls, cfg, hidden_dim, nhead, input_shape):\n\n        return {\n            'hidden_dim': hidden_dim,\n            'nhead': nhead,\n            'fpn_dims': [i.channels for i in input_shape[::-1]][1:]\n        }\n\n    @staticmethod\n    def get_gt_mask_from_polygons(gt_poly, pad_mask):\n        out_gt_mask = []\n        for polygons, padding in zip(gt_poly, pad_mask):\n            height, width = int(padding[:, 0].sum()), int(padding[0, :].sum())\n            masks = []\n            for obj_poly in polygons:\n                rles = mask_util.frPyObjects(obj_poly, height, width)\n                rle = mask_util.merge(rles)\n                masks.append(\n                    paddle.to_tensor(mask_util.decode(rle)).astype('float32'))\n            masks = paddle.stack(masks)\n            masks_pad = paddle.zeros(\n                [masks.shape[0], pad_mask.shape[1], pad_mask.shape[2]])\n            masks_pad[:, :height, :width] = masks\n            out_gt_mask.append(masks_pad)\n        return out_gt_mask\n\n    def forward(self, out_transformer, body_feats, inputs=None):\n        r\"\"\"\n        Args:\n            out_transformer (Tuple): (feats: [num_levels, batch_size,\n                                                num_queries, hidden_dim],\n                            memory: [batch_size, hidden_dim, h, w],\n                            src_proj: [batch_size, h*w, hidden_dim],\n                            src_mask: [batch_size, 1, 1, h, w])\n            body_feats (List(Tensor)): list[[B, C, H, W]]\n            inputs (dict): dict(inputs)\n        \"\"\"\n        feats, memory, src_proj, src_mask = out_transformer\n        outputs_logit = self.score_head(feats)\n        outputs_bbox = F.sigmoid(self.bbox_head(feats))\n        outputs_seg = None\n        if self.with_mask_head:\n            bbox_attention_map = self.bbox_attention(feats[-1], memory,\n                                                     src_mask)\n            fpn_feats = [a for a in body_feats[::-1]][1:]\n            outputs_seg = self.mask_head(src_proj, bbox_attention_map,\n                                         fpn_feats)\n            outputs_seg = outputs_seg.reshape([\n                feats.shape[1], feats.shape[2], outputs_seg.shape[-2],\n                outputs_seg.shape[-1]\n            ])\n\n        if self.training:\n            
assert inputs is not None\n            assert 'gt_bbox' in inputs and 'gt_class' in inputs\n            gt_mask = self.get_gt_mask_from_polygons(\n                inputs['gt_poly'],\n                inputs['pad_mask']) if 'gt_poly' in inputs else None\n            return self.loss(\n                outputs_bbox,\n                outputs_logit,\n                inputs['gt_bbox'],\n                inputs['gt_class'],\n                masks=outputs_seg,\n                gt_mask=gt_mask)\n        else:\n            return (outputs_bbox[-1], outputs_logit[-1], outputs_seg)\n\n\n@register\nclass DeformableDETRHead(nn.Layer):\n    __shared__ = ['num_classes', 'hidden_dim']\n    __inject__ = ['loss']\n\n    def __init__(self,\n                 num_classes=80,\n                 hidden_dim=512,\n                 nhead=8,\n                 num_mlp_layers=3,\n                 loss='DETRLoss'):\n        super(DeformableDETRHead, self).__init__()\n        self.num_classes = num_classes\n        self.hidden_dim = hidden_dim\n        self.nhead = nhead\n        self.loss = loss\n\n        self.score_head = nn.Linear(hidden_dim, self.num_classes)\n        self.bbox_head = MLP(hidden_dim,\n                             hidden_dim,\n                             output_dim=4,\n                             num_layers=num_mlp_layers)\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        linear_init_(self.score_head)\n        constant_(self.score_head.bias, -4.595)\n        constant_(self.bbox_head.layers[-1].weight)\n\n        with paddle.no_grad():\n            bias = paddle.zeros_like(self.bbox_head.layers[-1].bias)\n            bias[2:] = -2.0\n            self.bbox_head.layers[-1].bias.set_value(bias)\n\n    @classmethod\n    def from_config(cls, cfg, hidden_dim, nhead, input_shape):\n        return {'hidden_dim': hidden_dim, 'nhead': nhead}\n\n    def forward(self, out_transformer, body_feats, inputs=None):\n        r\"\"\"\n        Args:\n            out_transformer (Tuple): (feats: [num_levels, batch_size,\n                                                num_queries, hidden_dim],\n                            memory: [batch_size,\n                                \\sum_{l=0}^{L-1} H_l \\cdot W_l, hidden_dim],\n                            reference_points: [batch_size, num_queries, 2])\n            body_feats (List(Tensor)): list[[B, C, H, W]]\n            inputs (dict): dict(inputs)\n        \"\"\"\n        feats, memory, reference_points = out_transformer\n        reference_points = inverse_sigmoid(reference_points.unsqueeze(0))\n        outputs_bbox = self.bbox_head(feats)\n\n        # It's equivalent to \"outputs_bbox[:, :, :, :2] += reference_points\",\n        # but the gradient is wrong in paddle.\n        outputs_bbox = paddle.concat(\n            [\n                outputs_bbox[:, :, :, :2] + reference_points,\n                outputs_bbox[:, :, :, 2:]\n            ],\n            axis=-1)\n\n        outputs_bbox = F.sigmoid(outputs_bbox)\n        outputs_logit = self.score_head(feats)\n\n        if self.training:\n            assert inputs is not None\n            assert 'gt_bbox' in inputs and 'gt_class' in inputs\n\n            return self.loss(outputs_bbox, outputs_logit, inputs['gt_bbox'],\n                             inputs['gt_class'])\n        else:\n            return (outputs_bbox[-1], outputs_logit[-1], None)\n\n\n@register\nclass DINOHead(nn.Layer):\n    __inject__ = ['loss']\n\n    def __init__(self, loss='DINOLoss', eval_idx=-1):\n        super(DINOHead, 
self).__init__()\n        self.loss = loss\n        self.eval_idx = eval_idx\n\n    def forward(self, out_transformer, body_feats, inputs=None):\n        (dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits,\n         dn_meta) = out_transformer\n        if self.training:\n            assert inputs is not None\n            assert 'gt_bbox' in inputs and 'gt_class' in inputs\n\n            if dn_meta is not None:\n                if isinstance(dn_meta, list):\n                    dual_groups = len(dn_meta) - 1\n                    dec_out_bboxes = paddle.split(\n                        dec_out_bboxes, dual_groups + 1, axis=2)\n                    dec_out_logits = paddle.split(\n                        dec_out_logits, dual_groups + 1, axis=2)\n                    enc_topk_bboxes = paddle.split(\n                        enc_topk_bboxes, dual_groups + 1, axis=1)\n                    enc_topk_logits = paddle.split(\n                        enc_topk_logits, dual_groups + 1, axis=1)\n\n                    dec_out_bboxes_list = []\n                    dec_out_logits_list = []\n                    dn_out_bboxes_list = []\n                    dn_out_logits_list = []\n                    loss = {}\n                    for g_id in range(dual_groups + 1):\n                        if dn_meta[g_id] is not None:\n                            dn_out_bboxes_gid, dec_out_bboxes_gid = paddle.split(\n                                dec_out_bboxes[g_id],\n                                dn_meta[g_id]['dn_num_split'],\n                                axis=2)\n                            dn_out_logits_gid, dec_out_logits_gid = paddle.split(\n                                dec_out_logits[g_id],\n                                dn_meta[g_id]['dn_num_split'],\n                                axis=2)\n                        else:\n                            dn_out_bboxes_gid, dn_out_logits_gid = None, None\n                            dec_out_bboxes_gid = dec_out_bboxes[g_id]\n                            dec_out_logits_gid = dec_out_logits[g_id]\n                        out_bboxes_gid = paddle.concat([\n                            enc_topk_bboxes[g_id].unsqueeze(0),\n                            dec_out_bboxes_gid\n                        ])\n                        out_logits_gid = paddle.concat([\n                            enc_topk_logits[g_id].unsqueeze(0),\n                            dec_out_logits_gid\n                        ])\n                        loss_gid = self.loss(\n                            out_bboxes_gid,\n                            out_logits_gid,\n                            inputs['gt_bbox'],\n                            inputs['gt_class'],\n                            dn_out_bboxes=dn_out_bboxes_gid,\n                            dn_out_logits=dn_out_logits_gid,\n                            dn_meta=dn_meta[g_id])\n                        # sum loss\n                        for key, value in loss_gid.items():\n                            loss.update({\n                                key: loss.get(key, paddle.zeros([1])) + value\n                            })\n\n                    # average across (dual_groups + 1)\n                    for key, value in loss.items():\n                        loss.update({key: value / (dual_groups + 1)})\n                    return loss\n                else:\n                    dn_out_bboxes, dec_out_bboxes = paddle.split(\n                        dec_out_bboxes, dn_meta['dn_num_split'], axis=2)\n                    dn_out_logits, 
dec_out_logits = paddle.split(\n                        dec_out_logits, dn_meta['dn_num_split'], axis=2)\n            else:\n                dn_out_bboxes, dn_out_logits = None, None\n\n            out_bboxes = paddle.concat(\n                [enc_topk_bboxes.unsqueeze(0), dec_out_bboxes])\n            out_logits = paddle.concat(\n                [enc_topk_logits.unsqueeze(0), dec_out_logits])\n\n            return self.loss(\n                out_bboxes,\n                out_logits,\n                inputs['gt_bbox'],\n                inputs['gt_class'],\n                dn_out_bboxes=dn_out_bboxes,\n                dn_out_logits=dn_out_logits,\n                dn_meta=dn_meta,\n                gt_score=inputs.get('gt_score', None))\n        else:\n            return (dec_out_bboxes[self.eval_idx],\n                    dec_out_logits[self.eval_idx], None)\n\n\n@register\nclass MaskDINOHead(nn.Layer):\n    __inject__ = ['loss']\n\n    def __init__(self, loss='DINOLoss'):\n        super(MaskDINOHead, self).__init__()\n        self.loss = loss\n\n    def forward(self, out_transformer, body_feats, inputs=None):\n        (dec_out_logits, dec_out_bboxes, dec_out_masks, enc_out, init_out,\n         dn_meta) = out_transformer\n        if self.training:\n            assert inputs is not None\n            assert 'gt_bbox' in inputs and 'gt_class' in inputs\n            assert 'gt_segm' in inputs\n\n            if dn_meta is not None:\n                dn_out_logits, dec_out_logits = paddle.split(\n                    dec_out_logits, dn_meta['dn_num_split'], axis=2)\n                dn_out_bboxes, dec_out_bboxes = paddle.split(\n                    dec_out_bboxes, dn_meta['dn_num_split'], axis=2)\n                dn_out_masks, dec_out_masks = paddle.split(\n                    dec_out_masks, dn_meta['dn_num_split'], axis=2)\n                if init_out is not None:\n                    init_out_logits, init_out_bboxes, init_out_masks = init_out\n                    init_out_logits_dn, init_out_logits = paddle.split(\n                        init_out_logits, dn_meta['dn_num_split'], axis=1)\n                    init_out_bboxes_dn, init_out_bboxes = paddle.split(\n                        init_out_bboxes, dn_meta['dn_num_split'], axis=1)\n                    init_out_masks_dn, init_out_masks = paddle.split(\n                        init_out_masks, dn_meta['dn_num_split'], axis=1)\n\n                    dec_out_logits = paddle.concat(\n                        [init_out_logits.unsqueeze(0), dec_out_logits])\n                    dec_out_bboxes = paddle.concat(\n                        [init_out_bboxes.unsqueeze(0), dec_out_bboxes])\n                    dec_out_masks = paddle.concat(\n                        [init_out_masks.unsqueeze(0), dec_out_masks])\n\n                    dn_out_logits = paddle.concat(\n                        [init_out_logits_dn.unsqueeze(0), dn_out_logits])\n                    dn_out_bboxes = paddle.concat(\n                        [init_out_bboxes_dn.unsqueeze(0), dn_out_bboxes])\n                    dn_out_masks = paddle.concat(\n                        [init_out_masks_dn.unsqueeze(0), dn_out_masks])\n            else:\n                dn_out_bboxes, dn_out_logits = None, None\n                dn_out_masks = None\n\n            enc_out_logits, enc_out_bboxes, enc_out_masks = enc_out\n            out_logits = paddle.concat(\n                [enc_out_logits.unsqueeze(0), dec_out_logits])\n            out_bboxes = paddle.concat(\n                [enc_out_bboxes.unsqueeze(0), 
dec_out_bboxes])\n            out_masks = paddle.concat(\n                [enc_out_masks.unsqueeze(0), dec_out_masks])\n\n            inputs['gt_segm'] = [gt_segm.astype(out_masks.dtype)\n                                 for gt_segm in inputs['gt_segm']]\n\n            return self.loss(\n                out_bboxes,\n                out_logits,\n                inputs['gt_bbox'],\n                inputs['gt_class'],\n                masks=out_masks,\n                gt_mask=inputs['gt_segm'],\n                dn_out_logits=dn_out_logits,\n                dn_out_bboxes=dn_out_bboxes,\n                dn_out_masks=dn_out_masks,\n                dn_meta=dn_meta)\n        else:\n            return (dec_out_bboxes[-1], dec_out_logits[-1], dec_out_masks[-1])\n\n@register\nclass DINOv3Head(nn.Layer):\n    __inject__ = ['loss']\n    __shared__ = ['o2m_branch', 'num_queries_o2m']\n\n\n    def __init__(self, loss='DINOLoss', eval_idx=-1, o2m=4, o2m_branch=False, num_queries_o2m=450):\n        super(DINOv3Head, self).__init__()\n        self.loss = loss\n        self.eval_idx = eval_idx\n        self.o2m = o2m\n        self.o2m_branch = o2m_branch\n        self.num_queries_o2m = num_queries_o2m\n\n    def forward(self, out_transformer, body_feats, inputs=None):\n        (dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits,\n         dn_meta) = out_transformer\n        if self.training:\n            assert inputs is not None\n            assert 'gt_bbox' in inputs and 'gt_class' in inputs\n\n            if dn_meta is not None:\n                num_groups = len(dn_meta)\n                total_dec_queries = dec_out_bboxes.shape[2]\n                total_enc_queries = enc_topk_bboxes.shape[1]\n                loss = {}\n                if self.o2m_branch:\n                    dec_out_bboxes, dec_out_bboxes_o2m = paddle.split(dec_out_bboxes, [total_dec_queries - self.num_queries_o2m, self.num_queries_o2m], axis=2)\n                    dec_out_logits, dec_out_logits_o2m = paddle.split(dec_out_logits, [total_dec_queries - self.num_queries_o2m, self.num_queries_o2m], axis=2)\n                    enc_topk_bboxes, enc_topk_bboxes_o2m = paddle.split(enc_topk_bboxes, [total_enc_queries - self.num_queries_o2m, self.num_queries_o2m], axis=1)\n                    enc_topk_logits, enc_topk_logits_o2m = paddle.split(enc_topk_logits, [total_enc_queries - self.num_queries_o2m, self.num_queries_o2m], axis=1)\n\n                    out_bboxes_o2m = paddle.concat([enc_topk_bboxes_o2m.unsqueeze(0), dec_out_bboxes_o2m])\n                    out_logits_o2m = paddle.concat([enc_topk_logits_o2m.unsqueeze(0), dec_out_logits_o2m])\n                    loss_o2m = self.loss(\n                        out_bboxes_o2m,\n                        out_logits_o2m,\n                        inputs['gt_bbox'],\n                        inputs['gt_class'],\n                        dn_out_bboxes=None,\n                        dn_out_logits=None,\n                        dn_meta=None,\n                        o2m=self.o2m)\n                    for key, value in loss_o2m.items():\n                        key = key + '_o2m_branch'\n                        loss.update({\n                            key: loss.get(key, paddle.zeros([1])) + value\n                        })\n                \n                split_dec_num = [sum(dn['dn_num_split']) for dn in dn_meta]\n                split_enc_num = [dn['dn_num_split'][1] for dn in dn_meta]\n                dec_out_bboxes = paddle.split(dec_out_bboxes, split_dec_num, axis=2)\n               
 dec_out_logits = paddle.split(dec_out_logits, split_dec_num, axis=2)\n                enc_topk_bboxes = paddle.split(enc_topk_bboxes, split_enc_num, axis=1)\n                enc_topk_logits = paddle.split(enc_topk_logits, split_enc_num, axis=1)\n\n                for g_id in range(num_groups):\n                    dn_out_bboxes_gid, dec_out_bboxes_gid = paddle.split(\n                        dec_out_bboxes[g_id], dn_meta[g_id]['dn_num_split'], axis=2)\n                    dn_out_logits_gid, dec_out_logits_gid = paddle.split(\n                        dec_out_logits[g_id], dn_meta[g_id]['dn_num_split'], axis=2)\n                    out_bboxes_gid = paddle.concat([\n                        enc_topk_bboxes[g_id].unsqueeze(0), dec_out_bboxes_gid])\n                    out_logits_gid = paddle.concat([\n                        enc_topk_logits[g_id].unsqueeze(0), dec_out_logits_gid])\n\n                    loss_gid = self.loss(\n                        out_bboxes_gid,\n                        out_logits_gid,\n                        inputs['gt_bbox'],\n                        inputs['gt_class'],\n                        dn_out_bboxes=dn_out_bboxes_gid,\n                        dn_out_logits=dn_out_logits_gid,\n                        dn_meta=dn_meta[g_id])\n                    # sum loss\n                    for key, value in loss_gid.items():\n                        loss.update({\n                            key: loss.get(key, paddle.zeros([1])) + value\n                        })\n\n                # average across num_groups\n                for key, value in loss.items():\n                    if '_o2m_branch' not in key:\n                        loss.update({key: value / num_groups})\n                return loss\n            else:\n                dn_out_bboxes, dn_out_logits = None, None\n\n            out_bboxes = paddle.concat(\n                [enc_topk_bboxes.unsqueeze(0), dec_out_bboxes])\n            out_logits = paddle.concat(\n                [enc_topk_logits.unsqueeze(0), dec_out_logits])\n\n            return self.loss(\n                out_bboxes,\n                out_logits,\n                inputs['gt_bbox'],\n                inputs['gt_class'],\n                dn_out_bboxes=dn_out_bboxes,\n                dn_out_logits=dn_out_logits,\n                dn_meta=dn_meta,\n                gt_score=inputs.get('gt_score', None))\n        else:\n            return (dec_out_bboxes[self.eval_idx],\n                    dec_out_logits[self.eval_idx], None)\n
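\n\n# A minimal usage sketch (shapes assumed, not a released config): the MLP\n# defined at the top of this file maps decoder query embeddings to 4 box\n# parameters, which is how the bbox branches above use it.\nif __name__ == '__main__':\n    demo_queries = paddle.rand([2, 100, 256])  # [batch, num_queries, hidden_dim]\n    demo_bbox_mlp = MLP(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)\n    demo_boxes = F.sigmoid(demo_bbox_mlp(demo_queries))\n    print(demo_boxes.shape)  # [2, 100, 4] -> normalized cx, cy, w, h\n"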
  },
  {
    "path": "ppdet/modeling/heads/face_head.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\n\nfrom ppdet.core.workspace import register\nfrom ..layers import AnchorGeneratorSSD\nfrom ..cls_utils import _get_class_default_kwargs\n\n\n@register\nclass FaceHead(nn.Layer):\n    \"\"\"\n    Head block for Face detection network\n\n    Args:\n        num_classes (int): Number of output classes.\n        in_channels (int): Number of input channels.\n        anchor_generator(object): instance of anchor genertor method.\n        kernel_size (int): kernel size of Conv2D in FaceHead.\n        padding (int): padding of Conv2D in FaceHead.\n        conv_decay (float): norm_decay (float): weight decay for conv layer weights.\n        loss (object): loss of face detection model.\n    \"\"\"\n    __shared__ = ['num_classes']\n    __inject__ = ['anchor_generator', 'loss']\n\n    def __init__(self,\n                 num_classes=80,\n                 in_channels=[96, 96],\n                 anchor_generator=_get_class_default_kwargs(AnchorGeneratorSSD),\n                 kernel_size=3,\n                 padding=1,\n                 conv_decay=0.,\n                 loss='SSDLoss'):\n        super(FaceHead, self).__init__()\n        # add background class\n        self.num_classes = num_classes + 1\n        self.in_channels = in_channels\n        self.anchor_generator = anchor_generator\n        self.loss = loss\n\n        if isinstance(anchor_generator, dict):\n            self.anchor_generator = AnchorGeneratorSSD(**anchor_generator)\n\n        self.num_priors = self.anchor_generator.num_priors\n        self.box_convs = []\n        self.score_convs = []\n        for i, num_prior in enumerate(self.num_priors):\n            box_conv_name = \"boxes{}\".format(i)\n            box_conv = self.add_sublayer(\n                box_conv_name,\n                nn.Conv2D(\n                    in_channels=self.in_channels[i],\n                    out_channels=num_prior * 4,\n                    kernel_size=kernel_size,\n                    padding=padding))\n            self.box_convs.append(box_conv)\n\n            score_conv_name = \"scores{}\".format(i)\n            score_conv = self.add_sublayer(\n                score_conv_name,\n                nn.Conv2D(\n                    in_channels=self.in_channels[i],\n                    out_channels=num_prior * self.num_classes,\n                    kernel_size=kernel_size,\n                    padding=padding))\n            self.score_convs.append(score_conv)\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_channels': [i.channels for i in input_shape], }\n\n    def forward(self, feats, image, gt_bbox=None, gt_class=None):\n        box_preds = []\n        cls_scores = []\n        prior_boxes = []\n        for feat, box_conv, score_conv in zip(feats, self.box_convs,\n                                              
                                              self.score_convs):\n            box_pred = box_conv(feat)\n            box_pred = paddle.transpose(box_pred, [0, 2, 3, 1])\n            box_pred = paddle.reshape(box_pred, [0, -1, 4])\n            box_preds.append(box_pred)\n\n            cls_score = score_conv(feat)\n            cls_score = paddle.transpose(cls_score, [0, 2, 3, 1])\n            cls_score = paddle.reshape(cls_score, [0, -1, self.num_classes])\n            cls_scores.append(cls_score)\n\n        prior_boxes = self.anchor_generator(feats, image)\n\n        if self.training:\n            return self.get_loss(box_preds, cls_scores, gt_bbox, gt_class,\n                                 prior_boxes)\n        else:\n            return (box_preds, cls_scores), prior_boxes\n\n    def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes):\n        return self.loss(boxes, scores, gt_bbox, gt_class, prior_boxes)\n
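\n\n# A minimal sketch (sizes made up) of the per-level reshape used in forward:\n# an SSD-style output [N, num_prior * 4, H, W] becomes [N, H * W * num_prior, 4];\n# the 0 in the reshape keeps the batch dimension unchanged.\nif __name__ == '__main__':\n    demo_pred = paddle.rand([2, 3 * 4, 5, 5])  # N=2, num_prior=3, H=W=5\n    demo_pred = paddle.transpose(demo_pred, [0, 2, 3, 1])\n    demo_pred = paddle.reshape(demo_pred, [0, -1, 4])\n    print(demo_pred.shape)  # [2, 75, 4]\n"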
  },
  {
    "path": "ppdet/modeling/heads/fcos_head.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport math\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import Normal, Constant\n\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling.layers import ConvNormLayer, MultiClassNMS\n\n__all__ = ['FCOSFeat', 'FCOSHead', 'FCOSHead_ARSL']\n\n\nclass ScaleReg(nn.Layer):\n    \"\"\"\n    Parameter for scaling the regression outputs.\n    \"\"\"\n\n    def __init__(self):\n        super(ScaleReg, self).__init__()\n        self.scale_reg = self.create_parameter(\n            shape=[1],\n            attr=ParamAttr(initializer=Constant(value=1.)),\n            dtype=\"float32\")\n\n    def forward(self, inputs):\n        out = inputs * self.scale_reg\n        return out\n\n\n@register\nclass FCOSFeat(nn.Layer):\n    \"\"\"\n    FCOSFeat of FCOS\n\n    Args:\n        feat_in (int): The channel number of input Tensor.\n        feat_out (int): The channel number of output Tensor.\n        num_convs (int): The convolution number of the FCOSFeat.\n        norm_type (str): Normalization type, 'bn'/'sync_bn'/'gn'.\n        use_dcn (bool): Whether to use dcn in tower or not.\n    \"\"\"\n\n    def __init__(self,\n                 feat_in=256,\n                 feat_out=256,\n                 num_convs=4,\n                 norm_type='bn',\n                 use_dcn=False):\n        super(FCOSFeat, self).__init__()\n        self.feat_in = feat_in\n        self.feat_out = feat_out\n        self.num_convs = num_convs\n        self.norm_type = norm_type\n        self.cls_subnet_convs = []\n        self.reg_subnet_convs = []\n        for i in range(self.num_convs):\n            in_c = feat_in if i == 0 else feat_out\n\n            cls_conv_name = 'fcos_head_cls_tower_conv_{}'.format(i)\n            cls_conv = self.add_sublayer(\n                cls_conv_name,\n                ConvNormLayer(\n                    ch_in=in_c,\n                    ch_out=feat_out,\n                    filter_size=3,\n                    stride=1,\n                    norm_type=norm_type,\n                    use_dcn=use_dcn,\n                    bias_on=True,\n                    lr_scale=2.))\n            self.cls_subnet_convs.append(cls_conv)\n\n            reg_conv_name = 'fcos_head_reg_tower_conv_{}'.format(i)\n            reg_conv = self.add_sublayer(\n                reg_conv_name,\n                ConvNormLayer(\n                    ch_in=in_c,\n                    ch_out=feat_out,\n                    filter_size=3,\n                    stride=1,\n                    norm_type=norm_type,\n                    use_dcn=use_dcn,\n                    bias_on=True,\n                    lr_scale=2.))\n            self.reg_subnet_convs.append(reg_conv)\n\n    def 
forward(self, fpn_feat):\n        cls_feat = fpn_feat\n        reg_feat = fpn_feat\n        for i in range(self.num_convs):\n            cls_feat = F.relu(self.cls_subnet_convs[i](cls_feat))\n            reg_feat = F.relu(self.reg_subnet_convs[i](reg_feat))\n        return cls_feat, reg_feat\n\n\n@register\nclass FCOSHead(nn.Layer):\n    \"\"\"\n    FCOSHead\n    Args:\n        num_classes (int): Number of classes\n        fcos_feat (object): Instance of 'FCOSFeat'\n        fpn_stride (list): The stride of each FPN Layer\n        prior_prob (float): Used to set the bias init for the class prediction layer\n        multiply_strides_reg_targets (bool): Always multiply the regression targets by the FPN stride if true\n        norm_reg_targets (bool): Normalize the regression targets if true\n        centerness_on_reg (bool): Whether to predict centerness on the regression branch (otherwise on the classification branch)\n        num_shift (float): Relative offset between the center of the first shift and the top-left corner of the image\n        sqrt_score (bool): Take the square root of the fused classification-centerness score if true\n        fcos_loss (object): Instance of 'FCOSLoss'\n        nms (object): Instance of 'MultiClassNMS'\n        trt (bool): Whether to use TRT in NMS when deploying\n    \"\"\"\n    __inject__ = ['fcos_feat', 'fcos_loss', 'nms']\n    __shared__ = ['num_classes', 'trt']\n\n    def __init__(self,\n                 num_classes=80,\n                 fcos_feat='FCOSFeat',\n                 fpn_stride=[8, 16, 32, 64, 128],\n                 prior_prob=0.01,\n                 multiply_strides_reg_targets=False,\n                 norm_reg_targets=True,\n                 centerness_on_reg=True,\n                 num_shift=0.5,\n                 sqrt_score=False,\n                 fcos_loss='FCOSLoss',\n                 nms='MultiClassNMS',\n                 trt=False):\n        super(FCOSHead, self).__init__()\n        self.fcos_feat = fcos_feat\n        self.num_classes = num_classes\n        self.fpn_stride = fpn_stride\n        self.prior_prob = prior_prob\n        self.fcos_loss = fcos_loss\n        self.norm_reg_targets = norm_reg_targets\n        self.centerness_on_reg = centerness_on_reg\n        self.multiply_strides_reg_targets = multiply_strides_reg_targets\n        self.num_shift = num_shift\n        self.nms = nms\n        if isinstance(self.nms, MultiClassNMS) and trt:\n            self.nms.trt = trt\n        self.sqrt_score = sqrt_score\n        self.is_teacher = False\n\n        conv_cls_name = \"fcos_head_cls\"\n        bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob)\n        self.fcos_head_cls = self.add_sublayer(\n            conv_cls_name,\n            nn.Conv2D(\n                in_channels=256,\n                out_channels=self.num_classes,\n                kernel_size=3,\n                stride=1,\n                padding=1,\n                weight_attr=ParamAttr(initializer=Normal(\n                    mean=0., std=0.01)),\n                bias_attr=ParamAttr(\n                    initializer=Constant(value=bias_init_value))))\n\n        conv_reg_name = \"fcos_head_reg\"\n        self.fcos_head_reg = self.add_sublayer(\n            conv_reg_name,\n            nn.Conv2D(\n                in_channels=256,\n                out_channels=4,\n                kernel_size=3,\n                stride=1,\n                padding=1,\n                weight_attr=ParamAttr(initializer=Normal(\n                    mean=0., std=0.01)),\n                bias_attr=ParamAttr(initializer=Constant(value=0))))\n\n        conv_centerness_name = \"fcos_head_centerness\"\n        self.fcos_head_centerness = self.add_sublayer(\n            conv_centerness_name,\n            nn.Conv2D(\n
     in_channels=256,\n                out_channels=1,\n                kernel_size=3,\n                stride=1,\n                padding=1,\n                weight_attr=ParamAttr(initializer=Normal(\n                    mean=0., std=0.01)),\n                bias_attr=ParamAttr(initializer=Constant(value=0))))\n\n        self.scales_regs = []\n        for i in range(len(self.fpn_stride)):\n            lvl = int(math.log(int(self.fpn_stride[i]), 2))\n            feat_name = 'p{}_feat'.format(lvl)\n            scale_reg = self.add_sublayer(feat_name, ScaleReg())\n            self.scales_regs.append(scale_reg)\n\n    def _compute_locations_by_level(self, fpn_stride, feature, num_shift=0.5):\n        \"\"\"\n        Compute locations of anchor points of each FPN layer\n        Args:\n            fpn_stride (int): The stride of current FPN feature map\n            feature (Tensor): Tensor of current FPN feature map\n        Return:\n            Anchor points locations of current FPN feature map\n        \"\"\"\n        h, w = feature.shape[2], feature.shape[3]\n        shift_x = paddle.arange(0, w * fpn_stride, fpn_stride)\n        shift_y = paddle.arange(0, h * fpn_stride, fpn_stride)\n        shift_x = paddle.unsqueeze(shift_x, axis=0)\n        shift_y = paddle.unsqueeze(shift_y, axis=1)\n        shift_x = paddle.expand(shift_x, shape=[h, w])\n        shift_y = paddle.expand(shift_y, shape=[h, w])\n\n        shift_x = paddle.reshape(shift_x, shape=[-1])\n        shift_y = paddle.reshape(shift_y, shape=[-1])\n        location = paddle.stack(\n            [shift_x, shift_y], axis=-1) + float(fpn_stride * num_shift)\n        return location\n\n    def forward(self, fpn_feats, targets=None):\n        assert len(fpn_feats) == len(\n            self.fpn_stride\n        ), \"The size of fpn_feats is not equal to size of fpn_stride\"\n        cls_logits_list = []\n        bboxes_reg_list = []\n        centerness_list = []\n        for scale_reg, fpn_stride, fpn_feat in zip(self.scales_regs,\n                                                   self.fpn_stride, fpn_feats):\n            fcos_cls_feat, fcos_reg_feat = self.fcos_feat(fpn_feat)\n            cls_logits = self.fcos_head_cls(fcos_cls_feat)\n            bbox_reg = scale_reg(self.fcos_head_reg(fcos_reg_feat))\n            if self.centerness_on_reg:\n                centerness = self.fcos_head_centerness(fcos_reg_feat)\n            else:\n                centerness = self.fcos_head_centerness(fcos_cls_feat)\n            if self.norm_reg_targets:\n                bbox_reg = F.relu(bbox_reg)\n                if self.multiply_strides_reg_targets:\n                    bbox_reg = bbox_reg * fpn_stride\n                else:\n                    if not self.training or targets.get(\n                            'get_data',\n                            False) or targets.get('is_teacher', False):\n                        bbox_reg = bbox_reg * fpn_stride\n            else:\n                bbox_reg = paddle.exp(bbox_reg)\n            cls_logits_list.append(cls_logits)\n            bboxes_reg_list.append(bbox_reg)\n            centerness_list.append(centerness)\n\n        if targets is not None:\n            self.is_teacher = targets.get('is_teacher', False)\n            if self.is_teacher:\n                return [cls_logits_list, bboxes_reg_list, centerness_list]\n\n        if self.training and targets is not None:\n            get_data = targets.get('get_data', False)\n            if get_data:\n                return [cls_logits_list, bboxes_reg_list, 
centerness_list]\n\n            losses = {}\n            fcos_head_outs = [cls_logits_list, bboxes_reg_list, centerness_list]\n            losses_fcos = self.get_loss(fcos_head_outs, targets)\n            losses.update(losses_fcos)\n\n            total_loss = paddle.add_n(list(losses.values()))\n            losses.update({'loss': total_loss})\n            return losses\n        else:\n            # eval or infer\n            locations_list = []\n            for fpn_stride, feature in zip(self.fpn_stride, fpn_feats):\n                location = self._compute_locations_by_level(fpn_stride, feature,\n                                                            self.num_shift)\n                locations_list.append(location)\n\n            fcos_head_outs = [\n                locations_list, cls_logits_list, bboxes_reg_list,\n                centerness_list\n            ]\n            return fcos_head_outs\n\n    def get_loss(self, fcos_head_outs, targets):\n        cls_logits, bboxes_reg, centerness = fcos_head_outs\n\n        # get labels,reg_target,centerness\n        tag_labels, tag_bboxes, tag_centerness = [], [], []\n        for i in range(len(self.fpn_stride)):\n            k_lbl = 'labels{}'.format(i)\n            if k_lbl in targets:\n                tag_labels.append(targets[k_lbl])\n            k_box = 'reg_target{}'.format(i)\n            if k_box in targets:\n                tag_bboxes.append(targets[k_box])\n            k_ctn = 'centerness{}'.format(i)\n            if k_ctn in targets:\n                tag_centerness.append(targets[k_ctn])\n\n        losses_fcos = self.fcos_loss(cls_logits, bboxes_reg, centerness,\n                                     tag_labels, tag_bboxes, tag_centerness)\n        return losses_fcos\n\n    def _post_process_by_level(self,\n                               locations,\n                               box_cls,\n                               box_reg,\n                               box_ctn,\n                               sqrt_score=False):\n        box_scores = F.sigmoid(box_cls).flatten(2).transpose([0, 2, 1])\n        box_centerness = F.sigmoid(box_ctn).flatten(2).transpose([0, 2, 1])\n        pred_scores = box_scores * box_centerness\n        if sqrt_score:\n            pred_scores = paddle.sqrt(pred_scores)\n\n        box_reg_ch_last = box_reg.flatten(2).transpose([0, 2, 1])\n        box_reg_decoding = paddle.stack(\n            [\n                locations[:, 0] - box_reg_ch_last[:, :, 0],\n                locations[:, 1] - box_reg_ch_last[:, :, 1],\n                locations[:, 0] + box_reg_ch_last[:, :, 2],\n                locations[:, 1] + box_reg_ch_last[:, :, 3]\n            ],\n            axis=1)\n        pred_boxes = box_reg_decoding.transpose([0, 2, 1])\n\n        return pred_scores, pred_boxes\n\n    def post_process(self, fcos_head_outs, scale_factor):\n        locations, cls_logits, bboxes_reg, centerness = fcos_head_outs\n        pred_bboxes, pred_scores = [], []\n\n        for pts, cls, reg, ctn in zip(locations, cls_logits, bboxes_reg,\n                                      centerness):\n            scores, boxes = self._post_process_by_level(pts, cls, reg, ctn,\n                                                        self.sqrt_score)\n            pred_scores.append(scores)\n            pred_bboxes.append(boxes)\n        pred_bboxes = paddle.concat(pred_bboxes, axis=1)\n        pred_scores = paddle.concat(pred_scores, axis=1)\n\n        # scale bbox to origin\n        scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1)\n     
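   # scale_factor holds per-image [h_scale, w_scale]; rearrange it into\n        # [w, h, w, h] order so the xyxy boxes map back to the original image scale\n     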
   scale_factor = paddle.concat(\n            [scale_x, scale_y, scale_x, scale_y], axis=-1).reshape([-1, 1, 4])\n        pred_bboxes /= scale_factor\n\n        pred_scores = pred_scores.transpose([0, 2, 1])\n        bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores)\n        return bbox_pred, bbox_num\n\n\n@register\nclass FCOSHead_ARSL(FCOSHead):\n    \"\"\"\n    FCOSHead of ARSL for semi-supervised detection (SSOD)\n    Args:\n        fcos_feat (object): Instance of 'FCOSFeat'\n        num_classes (int): Number of classes\n        fpn_stride (list): The stride of each FPN Layer\n        prior_prob (float): Used to set the bias init for the class prediction layer\n        fcos_loss (object): Instance of 'FCOSLoss'\n        norm_reg_targets (bool): Normalize the regression target if true\n        centerness_on_reg (bool): Whether to predict centerness on the regression or classification branch\n        nms (object): Instance of 'MultiClassNMS'\n        trt (bool): Whether to use TensorRT for NMS during deployment\n    \"\"\"\n    __inject__ = ['fcos_feat', 'fcos_loss', 'nms']\n    __shared__ = ['num_classes', 'trt']\n\n    def __init__(self,\n                 num_classes=80,\n                 fcos_feat='FCOSFeat',\n                 fpn_stride=[8, 16, 32, 64, 128],\n                 prior_prob=0.01,\n                 multiply_strides_reg_targets=False,\n                 norm_reg_targets=True,\n                 centerness_on_reg=True,\n                 num_shift=0.5,\n                 sqrt_score=False,\n                 fcos_loss='FCOSLossMILC',\n                 nms='MultiClassNMS',\n                 trt=False):\n        super(FCOSHead_ARSL, self).__init__()\n        self.fcos_feat = fcos_feat\n        self.num_classes = num_classes\n        self.fpn_stride = fpn_stride\n        self.prior_prob = prior_prob\n        self.fcos_loss = fcos_loss\n        self.norm_reg_targets = norm_reg_targets\n        self.centerness_on_reg = centerness_on_reg\n        self.multiply_strides_reg_targets = multiply_strides_reg_targets\n        self.num_shift = num_shift\n        self.nms = nms\n        if isinstance(self.nms, MultiClassNMS) and trt:\n            self.nms.trt = trt\n        self.sqrt_score = sqrt_score\n\n        conv_cls_name = \"fcos_head_cls\"\n        bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob)\n        self.fcos_head_cls = self.add_sublayer(\n            conv_cls_name,\n            nn.Conv2D(\n                in_channels=256,\n                out_channels=self.num_classes,\n                kernel_size=3,\n                stride=1,\n                padding=1,\n                weight_attr=ParamAttr(initializer=Normal(\n                    mean=0., std=0.01)),\n                bias_attr=ParamAttr(\n                    initializer=Constant(value=bias_init_value))))\n\n        conv_reg_name = \"fcos_head_reg\"\n        self.fcos_head_reg = self.add_sublayer(\n            conv_reg_name,\n            nn.Conv2D(\n                in_channels=256,\n                out_channels=4,\n                kernel_size=3,\n                stride=1,\n                padding=1,\n                weight_attr=ParamAttr(initializer=Normal(\n                    mean=0., std=0.01)),\n                bias_attr=ParamAttr(initializer=Constant(value=0))))\n\n        conv_centerness_name = \"fcos_head_centerness\"\n        self.fcos_head_centerness = self.add_sublayer(\n            conv_centerness_name,\n            nn.Conv2D(\n                in_channels=256,\n                out_channels=1,\n                
kernel_size=3,\n                stride=1,\n                padding=1,\n                weight_attr=ParamAttr(initializer=Normal(\n                    mean=0., std=0.01)),\n                bias_attr=ParamAttr(initializer=Constant(value=0))))\n\n        self.scales_regs = []\n        for i in range(len(self.fpn_stride)):\n            lvl = int(math.log(int(self.fpn_stride[i]), 2))\n            feat_name = 'p{}_feat'.format(lvl)\n            scale_reg = self.add_sublayer(feat_name, ScaleReg())\n            self.scales_regs.append(scale_reg)\n\n    def forward(self, fpn_feats, targets=None):\n        assert len(fpn_feats) == len(\n            self.fpn_stride\n        ), \"The size of fpn_feats is not equal to size of fpn_stride\"\n        cls_logits_list = []\n        bboxes_reg_list = []\n        centerness_list = []\n        for scale_reg, fpn_stride, fpn_feat in zip(self.scales_regs,\n                                                   self.fpn_stride, fpn_feats):\n            fcos_cls_feat, fcos_reg_feat = self.fcos_feat(fpn_feat)\n            cls_logits = self.fcos_head_cls(fcos_cls_feat)\n            bbox_reg = scale_reg(self.fcos_head_reg(fcos_reg_feat))\n            if self.centerness_on_reg:\n                centerness = self.fcos_head_centerness(fcos_reg_feat)\n            else:\n                centerness = self.fcos_head_centerness(fcos_cls_feat)\n            if self.norm_reg_targets:\n                bbox_reg = F.relu(bbox_reg)\n                if not self.training:\n                    bbox_reg = bbox_reg * fpn_stride\n            else:\n                bbox_reg = paddle.exp(bbox_reg)\n            cls_logits_list.append(cls_logits)\n            bboxes_reg_list.append(bbox_reg)\n            centerness_list.append(centerness)\n\n        if not self.training:\n            locations_list = []\n            for fpn_stride, feature in zip(self.fpn_stride, fpn_feats):\n                location = self._compute_locations_by_level(fpn_stride, feature)\n                locations_list.append(location)\n\n            return locations_list, cls_logits_list, bboxes_reg_list, centerness_list\n        else:\n            return cls_logits_list, bboxes_reg_list, centerness_list\n\n    def get_loss(self, fcos_head_outs, tag_labels, tag_bboxes, tag_centerness):\n        cls_logits, bboxes_reg, centerness = fcos_head_outs\n        return self.fcos_loss(cls_logits, bboxes_reg, centerness, tag_labels,\n                              tag_bboxes, tag_centerness)\n"
  },
  {
    "path": "ppdet/modeling/heads/fcosr_head.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\n\nfrom .fcos_head import ScaleReg\nfrom ..initializer import bias_init_with_prob, constant_, normal_\nfrom ..ops import get_act_fn, anchor_generator\nfrom ..rbox_utils import box2corners\nfrom ..losses import ProbIoULoss\nimport numpy as np\n\n__all__ = ['FCOSRHead']\n\n\ndef trunc_div(a, b):\n    ipt = paddle.divide(a, b)\n    sign_ipt = paddle.sign(ipt)\n    abs_ipt = paddle.abs(ipt)\n    abs_ipt = paddle.floor(abs_ipt)\n    out = paddle.multiply(sign_ipt, abs_ipt)\n    return out\n\n\ndef fmod(a, b):\n    return a - trunc_div(a, b) * b\n\n\ndef fmod_eval(a, b):\n    return a - a.divide(b).cast(paddle.int32).cast(paddle.float32) * b\n\n\nclass ConvBNLayer(nn.Layer):\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 filter_size=3,\n                 stride=1,\n                 groups=1,\n                 padding=0,\n                 norm_cfg={'name': 'gn',\n                           'num_groups': 32},\n                 act=None):\n        super(ConvBNLayer, self).__init__()\n\n        self.conv = nn.Conv2D(\n            in_channels=ch_in,\n            out_channels=ch_out,\n            kernel_size=filter_size,\n            stride=stride,\n            padding=padding,\n            groups=groups,\n            bias_attr=False)\n\n        norm_type = norm_cfg['name']\n        if norm_type in ['sync_bn', 'bn']:\n            self.norm = nn.BatchNorm2D(\n                ch_out,\n                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n        else:\n            groups = norm_cfg.get('num_groups', 1)\n            self.norm = nn.GroupNorm(\n                num_groups=groups,\n                num_channels=ch_out,\n                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n        self.act = get_act_fn(act) if act is None or isinstance(act, (\n            str, dict)) else act\n\n    def forward(self, x):\n        x = self.conv(x)\n        x = self.norm(x)\n        x = self.act(x)\n\n        return x\n\n\n@register\nclass FCOSRHead(nn.Layer):\n    \"\"\" FCOSR Head, refer to https://arxiv.org/abs/2111.10780 for details \"\"\"\n\n    __shared__ = ['num_classes', 'trt']\n    __inject__ = ['assigner', 'nms']\n\n    def __init__(self,\n                 num_classes=15,\n                 in_channels=256,\n                 feat_channels=256,\n                 stacked_convs=4,\n                 act='relu',\n                 fpn_strides=[4, 8, 16, 32, 64],\n                 trt=False,\n                 loss_weight={'class': 1.0,\n                              'probiou': 1.0},\n                 norm_cfg={'name': 
'gn',\n                           'num_groups': 32},\n                 assigner='FCOSRAssigner',\n                 nms='MultiClassNMS'):\n\n        super(FCOSRHead, self).__init__()\n        self.in_channels = in_channels\n        self.num_classes = num_classes\n        self.fpn_strides = fpn_strides\n        self.stacked_convs = stacked_convs\n        self.loss_weight = loss_weight\n        self.half_pi = paddle.to_tensor(\n            [1.5707963267948966], dtype=paddle.float32)\n        self.probiou_loss = ProbIoULoss(mode='l1')\n        act = get_act_fn(\n            act, trt=trt) if act is None or isinstance(act,\n                                                       (str, dict)) else act\n        self.trt = trt\n        self.loss_weight = loss_weight\n        self.assigner = assigner\n        self.nms = nms\n        # stem\n        self.stem_cls = nn.LayerList()\n        self.stem_reg = nn.LayerList()\n        for i in range(self.stacked_convs):\n            self.stem_cls.append(\n                ConvBNLayer(\n                    self.in_channels[i],\n                    feat_channels,\n                    filter_size=3,\n                    stride=1,\n                    padding=1,\n                    norm_cfg=norm_cfg,\n                    act=act))\n            self.stem_reg.append(\n                ConvBNLayer(\n                    self.in_channels[i],\n                    feat_channels,\n                    filter_size=3,\n                    stride=1,\n                    padding=1,\n                    norm_cfg=norm_cfg,\n                    act=act))\n\n        self.scales = nn.LayerList(\n            [ScaleReg() for _ in range(len(fpn_strides))])\n\n        # prediction\n        self.pred_cls = nn.Conv2D(feat_channels, self.num_classes, 3, padding=1)\n\n        self.pred_xy = nn.Conv2D(feat_channels, 2, 3, padding=1)\n\n        self.pred_wh = nn.Conv2D(feat_channels, 2, 3, padding=1)\n\n        self.pred_angle = nn.Conv2D(feat_channels, 1, 3, padding=1)\n\n        self._init_weights()\n\n    def _init_weights(self):\n        for cls_, reg_ in zip(self.stem_cls, self.stem_reg):\n            normal_(cls_.conv.weight, std=0.01)\n            normal_(reg_.conv.weight, std=0.01)\n\n        bias_cls = bias_init_with_prob(0.01)\n        normal_(self.pred_cls.weight, std=0.01)\n        constant_(self.pred_cls.bias, bias_cls)\n        normal_(self.pred_xy.weight, std=0.01)\n        normal_(self.pred_wh.weight, std=0.01)\n        normal_(self.pred_angle.weight, std=0.01)\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_channels': [i.channels for i in input_shape], }\n\n    def _generate_anchors(self, feats):\n        if self.trt:\n            anchor_points = []\n            for feat, stride in zip(feats, self.fpn_strides):\n                _, _, h, w = feat.shape\n                anchor, _ = anchor_generator(\n                    feat,\n                    stride * 4,\n                    1.0, [1.0, 1.0, 1.0, 1.0], [stride, stride],\n                    offset=0.5)\n                x1, y1, x2, y2 = paddle.split(anchor, 4, axis=-1)\n                xc = (x1 + x2 + 1) / 2\n                yc = (y1 + y2 + 1) / 2\n                anchor_point = paddle.concat(\n                    [xc, yc], axis=-1).reshape((1, h * w, 2))\n                anchor_points.append(anchor_point)\n            anchor_points = paddle.concat(anchor_points, axis=1)\n            return anchor_points, None, None\n        else:\n            anchor_points = []\n            
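# anchor points sit at cell centers, (index + 0.5) * stride, one set per FPN\n            # level; the matching stride tensor is later passed to the assigner\n            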
stride_tensor = []\n            num_anchors_list = []\n            for feat, stride in zip(feats, self.fpn_strides):\n                _, _, h, w = feat.shape\n                shift_x = (paddle.arange(end=w) + 0.5) * stride\n                shift_y = (paddle.arange(end=h) + 0.5) * stride\n                shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)\n                anchor_point = paddle.cast(\n                    paddle.stack(\n                        [shift_x, shift_y], axis=-1), dtype='float32')\n                anchor_points.append(anchor_point.reshape([1, -1, 2]))\n                stride_tensor.append(\n                    paddle.full(\n                        [1, h * w, 1], stride, dtype='float32'))\n                num_anchors_list.append(h * w)\n            anchor_points = paddle.concat(anchor_points, axis=1)\n            stride_tensor = paddle.concat(stride_tensor, axis=1)\n            return anchor_points, stride_tensor, num_anchors_list\n\n    def forward(self, feats, target=None):\n        if self.training:\n            return self.forward_train(feats, target)\n        else:\n            return self.forward_eval(feats, target)\n\n    def forward_train(self, feats, target=None):\n        anchor_points, stride_tensor, num_anchors_list = self._generate_anchors(\n            feats)\n        cls_pred_list, reg_pred_list = [], []\n        for stride, feat, scale in zip(self.fpn_strides, feats, self.scales):\n            # cls\n            cls_feat = feat\n            for cls_layer in self.stem_cls:\n                cls_feat = cls_layer(cls_feat)\n            cls_pred = F.sigmoid(self.pred_cls(cls_feat))\n            cls_pred_list.append(cls_pred.flatten(2).transpose((0, 2, 1)))\n            # reg\n            reg_feat = feat\n            for reg_layer in self.stem_reg:\n                reg_feat = reg_layer(reg_feat)\n\n            reg_xy = scale(self.pred_xy(reg_feat)) * stride\n            reg_wh = F.elu(scale(self.pred_wh(reg_feat)) + 1.) * stride\n            reg_angle = self.pred_angle(reg_feat)\n            reg_angle = fmod(reg_angle, self.half_pi)\n            reg_pred = paddle.concat([reg_xy, reg_wh, reg_angle], axis=1)\n            reg_pred_list.append(reg_pred.flatten(2).transpose((0, 2, 1)))\n\n        cls_pred_list = paddle.concat(cls_pred_list, axis=1)\n        reg_pred_list = paddle.concat(reg_pred_list, axis=1)\n\n        return self.get_loss([\n            cls_pred_list, reg_pred_list, anchor_points, stride_tensor,\n            num_anchors_list\n        ], target)\n\n    def forward_eval(self, feats, target=None):\n        cls_pred_list, reg_pred_list = [], []\n        anchor_points, _, _ = self._generate_anchors(feats)\n        for stride, feat, scale in zip(self.fpn_strides, feats, self.scales):\n            b, _, h, w = feat.shape\n            # cls\n            cls_feat = feat\n            for cls_layer in self.stem_cls:\n                cls_feat = cls_layer(cls_feat)\n            cls_pred = F.sigmoid(self.pred_cls(cls_feat))\n            cls_pred_list.append(cls_pred.reshape([b, self.num_classes, h * w]))\n            # reg\n            reg_feat = feat\n            for reg_layer in self.stem_reg:\n                reg_feat = reg_layer(reg_feat)\n\n            reg_xy = scale(self.pred_xy(reg_feat)) * stride\n            reg_wh = F.elu(scale(self.pred_wh(reg_feat)) + 1.) 
* stride\n            reg_angle = self.pred_angle(reg_feat)\n            reg_angle = fmod_eval(reg_angle, self.half_pi)\n            reg_pred = paddle.concat([reg_xy, reg_wh, reg_angle], axis=1)\n            reg_pred = reg_pred.reshape([b, 5, h * w]).transpose((0, 2, 1))\n            reg_pred_list.append(reg_pred)\n\n        cls_pred_list = paddle.concat(cls_pred_list, axis=2)\n        reg_pred_list = paddle.concat(reg_pred_list, axis=1)\n        reg_pred_list = self._bbox_decode(anchor_points, reg_pred_list)\n        return cls_pred_list, reg_pred_list\n\n    def _bbox_decode(self, points, reg_pred_list):\n        xy, wha = paddle.split(reg_pred_list, [2, 3], axis=-1)\n        xy = xy + points\n        return paddle.concat([xy, wha], axis=-1)\n\n    def _box2corners(self, pred_bboxes):\n        \"\"\" convert (x, y, w, h, angle) to (x1, y1, x2, y2, x3, y3, x4, y4)\n\n        Args:\n            pred_bboxes (Tensor): [B, N, 5]\n        \n        Returns:\n            polys (Tensor): [B, N, 8]\n        \"\"\"\n        x, y, w, h, angle = paddle.split(pred_bboxes, 5, axis=-1)\n        cos_a_half = paddle.cos(angle) * 0.5\n        sin_a_half = paddle.sin(angle) * 0.5\n        w_x = cos_a_half * w\n        w_y = sin_a_half * w\n        h_x = -sin_a_half * h\n        h_y = cos_a_half * h\n        return paddle.concat(\n            [\n                x + w_x + h_x, y + w_y + h_y, x - w_x + h_x, y - w_y + h_y,\n                x - w_x - h_x, y - w_y - h_y, x + w_x - h_x, y + w_y - h_y\n            ],\n            axis=-1)\n\n    def get_loss(self, head_outs, gt_meta):\n        cls_pred_list, reg_pred_list, anchor_points, stride_tensor, num_anchors_list = head_outs\n        gt_labels = gt_meta['gt_class']\n        gt_bboxes = gt_meta['gt_bbox']\n        gt_rboxes = gt_meta['gt_rbox']\n        pad_gt_mask = gt_meta['pad_gt_mask']\n        # decode\n        pred_rboxes = self._bbox_decode(anchor_points, reg_pred_list)\n        # label assignment\n        assigned_labels, assigned_rboxes, assigned_scores = \\\n            self.assigner(\n                anchor_points,\n                stride_tensor,\n                num_anchors_list,\n                gt_labels,\n                gt_bboxes,\n                gt_rboxes,\n                pad_gt_mask,\n                self.num_classes,\n                pred_rboxes\n            )\n\n        # reg_loss\n        mask_positive = (assigned_labels != self.num_classes)\n        num_pos = mask_positive.sum().item()\n        if num_pos > 0:\n            bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 5])\n            pred_rboxes_pos = paddle.masked_select(pred_rboxes,\n                                                   bbox_mask).reshape([-1, 5])\n            assigned_rboxes_pos = paddle.masked_select(\n                assigned_rboxes, bbox_mask).reshape([-1, 5])\n            bbox_weight = paddle.masked_select(\n                assigned_scores.sum(-1), mask_positive).reshape([-1])\n            avg_factor = bbox_weight.sum()\n            loss_probiou = self.probiou_loss(pred_rboxes_pos,\n                                             assigned_rboxes_pos)\n            loss_probiou = paddle.sum(loss_probiou * bbox_weight) / avg_factor\n        else:\n            loss_probiou = pred_rboxes.sum() * 0.\n\n        avg_factor = max(num_pos, 1.0)\n        # cls_loss\n        loss_cls = self._qfocal_loss(\n            cls_pred_list, assigned_scores, reduction='sum')\n        loss_cls = loss_cls / avg_factor\n\n        loss = self.loss_weight['class'] * loss_cls + \\\n       
        self.loss_weight['probiou'] * loss_probiou\n        out_dict = {\n            'loss': loss,\n            'loss_probiou': loss_probiou,\n            'loss_cls': loss_cls\n        }\n        return out_dict\n\n    @staticmethod\n    def _qfocal_loss(score, label, gamma=2.0, reduction='sum'):\n        weight = (score - label).pow(gamma)\n        loss = F.binary_cross_entropy(\n            score, label, weight=weight, reduction=reduction)\n        return loss\n\n    def post_process(self, head_outs, scale_factor):\n        pred_scores, pred_rboxes = head_outs\n        # [B, N, 5] -> [B, N, 4, 2] -> [B, N, 8]\n        pred_rboxes = self._box2corners(pred_rboxes)\n        # scale bbox to origin\n        scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1)\n        scale_factor = paddle.concat(\n            [\n                scale_x, scale_y, scale_x, scale_y, scale_x, scale_y, scale_x,\n                scale_y\n            ],\n            axis=-1).reshape([-1, 1, 8])\n        pred_rboxes /= scale_factor\n        bbox_pred, bbox_num, before_nms_indexes = self.nms(pred_rboxes,\n                                                           pred_scores)\n        return bbox_pred, bbox_num, before_nms_indexes\n"
  },
  {
    "path": "ppdet/modeling/heads/gfl_head.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\n# The code is based on:\n# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/gfl_head.py\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport math\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import Normal, Constant\n\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling.bbox_utils import distance2bbox, bbox2distance, batch_distance2bbox\nfrom ppdet.data.transform.atss_assigner import bbox_overlaps\n\n__all__ = ['GFLHead', 'LDGFLHead']\n\n\nclass ScaleReg(nn.Layer):\n    \"\"\"\n    Parameter for scaling the regression outputs.\n    \"\"\"\n\n    def __init__(self):\n        super(ScaleReg, self).__init__()\n        self.scale_reg = self.create_parameter(\n            shape=[1],\n            attr=ParamAttr(initializer=Constant(value=1.)),\n            dtype=\"float32\")\n\n    def forward(self, inputs):\n        out = inputs * self.scale_reg\n        return out\n\n\nclass Integral(nn.Layer):\n    \"\"\"A fixed layer for calculating integral result from distribution.\n    This layer calculates the target location by :math: `sum{P(y_i) * y_i}`,\n    P(y_i) denotes the softmax vector that represents the discrete distribution\n    y_i denotes the discrete set, usually {0, 1, 2, ..., reg_max}\n    Args:\n        reg_max (int): The maximal value of the discrete set. Default: 16. 
You\n            may want to reset it according to your new dataset or related\n            settings.\n    \"\"\"\n\n    def __init__(self, reg_max=16):\n        super(Integral, self).__init__()\n        self.reg_max = reg_max\n        self.register_buffer('project',\n                             paddle.linspace(0, self.reg_max, self.reg_max + 1))\n\n    def forward(self, x):\n        \"\"\"Forward feature from the regression head to get integral result of\n        bounding box location.\n        Args:\n            x (Tensor): Features of the regression head, shape (N, 4*(n+1)),\n                n is self.reg_max.\n        Returns:\n            x (Tensor): Integral result of box locations, i.e., distance\n                offsets from the box center in four directions, shape (N, 4).\n        \"\"\"\n        x = F.softmax(x.reshape([-1, self.reg_max + 1]), axis=1)\n        x = F.linear(x, self.project)\n        if self.training:\n            x = x.reshape([-1, 4])\n        return x\n\n\n@register\nclass DGQP(nn.Layer):\n    \"\"\"Distribution-Guided Quality Predictor of GFocal head\n    Args:\n        reg_topk (int): top-k statistics of distribution to guide LQE\n        reg_channels (int): hidden layer unit to generate LQE\n        add_mean (bool): Whether to calculate the mean of top-k statistics\n    \"\"\"\n\n    def __init__(self, reg_topk=4, reg_channels=64, add_mean=True):\n        super(DGQP, self).__init__()\n        self.reg_topk = reg_topk\n        self.reg_channels = reg_channels\n        self.add_mean = add_mean\n        self.total_dim = reg_topk\n        if add_mean:\n            self.total_dim += 1\n        self.reg_conv1 = self.add_sublayer(\n            'dgqp_reg_conv1',\n            nn.Conv2D(\n                in_channels=4 * self.total_dim,\n                out_channels=self.reg_channels,\n                kernel_size=1,\n                weight_attr=ParamAttr(initializer=Normal(\n                    mean=0., std=0.01)),\n                bias_attr=ParamAttr(initializer=Constant(value=0))))\n        self.reg_conv2 = self.add_sublayer(\n            'dgqp_reg_conv2',\n            nn.Conv2D(\n                in_channels=self.reg_channels,\n                out_channels=1,\n                kernel_size=1,\n                weight_attr=ParamAttr(initializer=Normal(\n                    mean=0., std=0.01)),\n                bias_attr=ParamAttr(initializer=Constant(value=0))))\n\n    def forward(self, x):\n        \"\"\"Forward the regression distribution statistics to predict a\n        quality score.\n        Args:\n            x (Tensor): Output of the regression head, shape\n                (N, 4*(reg_max+1), H, W).\n        Returns:\n            y (Tensor): Distribution-guided quality score in [0, 1],\n                shape (N, 1, H, W).\n        \"\"\"\n        N, _, H, W = x.shape[:]\n        prob = F.softmax(x.reshape([N, 4, -1, H, W]), axis=2)\n        prob_topk, _ = prob.topk(self.reg_topk, axis=2)\n        if self.add_mean:\n            stat = paddle.concat(\n                [prob_topk, prob_topk.mean(\n                    axis=2, keepdim=True)], axis=2)\n        else:\n            stat = prob_topk\n        y = F.relu(self.reg_conv1(stat.reshape([N, 4 * self.total_dim, H, W])))\n        y = F.sigmoid(self.reg_conv2(y))\n        return y\n\n\n@register\nclass GFLHead(nn.Layer):\n    \"\"\"\n    GFLHead\n    Args:\n        conv_feat (object): Instance of 'FCOSFeat'\n        num_classes (int): Number of 
classes\n        fpn_stride (list): The stride of each FPN Layer\n        prior_prob (float): Used to set the bias init for the class prediction layer\n        loss_class (object): Instance of QualityFocalLoss.\n        loss_dfl (object): Instance of DistributionFocalLoss.\n        loss_bbox (object): Instance of bbox loss.\n        reg_max: Max value of integral set :math: `{0, ..., reg_max}`\n                in QFL setting. Default: 16.\n    \"\"\"\n    __inject__ = [\n        'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', 'nms'\n    ]\n    __shared__ = ['num_classes']\n\n    def __init__(self,\n                 conv_feat='FCOSFeat',\n                 dgqp_module=None,\n                 num_classes=80,\n                 fpn_stride=[8, 16, 32, 64, 128],\n                 prior_prob=0.01,\n                 loss_class='QualityFocalLoss',\n                 loss_dfl='DistributionFocalLoss',\n                 loss_bbox='GIoULoss',\n                 reg_max=16,\n                 feat_in_chan=256,\n                 nms=None,\n                 nms_pre=1000,\n                 cell_offset=0):\n        super(GFLHead, self).__init__()\n        self.conv_feat = conv_feat\n        self.dgqp_module = dgqp_module\n        self.num_classes = num_classes\n        self.fpn_stride = fpn_stride\n        self.prior_prob = prior_prob\n        self.loss_qfl = loss_class\n        self.loss_dfl = loss_dfl\n        self.loss_bbox = loss_bbox\n        self.reg_max = reg_max\n        self.feat_in_chan = feat_in_chan\n        self.nms = nms\n        self.nms_pre = nms_pre\n        self.cell_offset = cell_offset\n        self.use_sigmoid = self.loss_qfl.use_sigmoid\n        if self.use_sigmoid:\n            self.cls_out_channels = self.num_classes\n        else:\n            self.cls_out_channels = self.num_classes + 1\n\n        conv_cls_name = \"gfl_head_cls\"\n        bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob)\n        self.gfl_head_cls = self.add_sublayer(\n            conv_cls_name,\n            nn.Conv2D(\n                in_channels=self.feat_in_chan,\n                out_channels=self.cls_out_channels,\n                kernel_size=3,\n                stride=1,\n                padding=1,\n                weight_attr=ParamAttr(initializer=Normal(\n                    mean=0., std=0.01)),\n                bias_attr=ParamAttr(\n                    initializer=Constant(value=bias_init_value))))\n\n        conv_reg_name = \"gfl_head_reg\"\n        self.gfl_head_reg = self.add_sublayer(\n            conv_reg_name,\n            nn.Conv2D(\n                in_channels=self.feat_in_chan,\n                out_channels=4 * (self.reg_max + 1),\n                kernel_size=3,\n                stride=1,\n                padding=1,\n                weight_attr=ParamAttr(initializer=Normal(\n                    mean=0., std=0.01)),\n                bias_attr=ParamAttr(initializer=Constant(value=0))))\n\n        self.scales_regs = []\n        for i in range(len(self.fpn_stride)):\n            lvl = int(math.log(int(self.fpn_stride[i]), 2))\n            feat_name = 'p{}_feat'.format(lvl)\n            scale_reg = self.add_sublayer(feat_name, ScaleReg())\n            self.scales_regs.append(scale_reg)\n\n        self.distribution_project = Integral(self.reg_max)\n\n    def forward(self, fpn_feats):\n        assert len(fpn_feats) == len(\n            self.fpn_stride\n        ), \"The size of fpn_feats is not equal to size of fpn_stride\"\n        cls_logits_list = []\n        
bboxes_reg_list = []\n        for stride, scale_reg, fpn_feat in zip(self.fpn_stride,\n                                               self.scales_regs, fpn_feats):\n            conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat)\n            cls_score = self.gfl_head_cls(conv_cls_feat)\n            bbox_pred = scale_reg(self.gfl_head_reg(conv_reg_feat))\n            if self.dgqp_module:\n                quality_score = self.dgqp_module(bbox_pred)\n                cls_score = F.sigmoid(cls_score) * quality_score\n            if not self.training:\n                cls_score = F.sigmoid(cls_score.transpose([0, 2, 3, 1]))\n                bbox_pred = bbox_pred.transpose([0, 2, 3, 1])\n                b, cell_h, cell_w, _ = cls_score.shape\n                y, x = self.get_single_level_center_point(\n                    [cell_h, cell_w], stride, cell_offset=self.cell_offset)\n                center_points = paddle.stack([x, y], axis=-1)\n                cls_score = cls_score.reshape([b, -1, self.cls_out_channels])\n                bbox_pred = self.distribution_project(bbox_pred) * stride\n                bbox_pred = bbox_pred.reshape([-1, cell_h * cell_w, 4])\n\n                # NOTE: If keep_ratio=False and the image shape is a\n                # multiple of 32, distance2bbox does not set the max_shapes\n                # parameter, to speed up prediction. If max_shapes is needed,\n                # please use inputs['im_shape'].\n                bbox_pred = batch_distance2bbox(\n                    center_points, bbox_pred, max_shapes=None)\n\n            cls_logits_list.append(cls_score)\n            bboxes_reg_list.append(bbox_pred)\n\n        return (cls_logits_list, bboxes_reg_list)\n\n    def _images_to_levels(self, target, num_level_anchors):\n        \"\"\"\n        Convert targets by image to targets by feature level.\n        \"\"\"\n        level_targets = []\n        start = 0\n        for n in num_level_anchors:\n            end = start + n\n            level_targets.append(target[:, start:end].squeeze(0))\n            start = end\n        return level_targets\n\n    def _grid_cells_to_center(self, grid_cells):\n        \"\"\"\n        Get center location of each grid cell\n        Args:\n            grid_cells: grid cells of a feature map\n        Returns:\n            center points\n        \"\"\"\n        cells_cx = (grid_cells[:, 2] + grid_cells[:, 0]) / 2\n        cells_cy = (grid_cells[:, 3] + grid_cells[:, 1]) / 2\n        return paddle.stack([cells_cx, cells_cy], axis=-1)\n\n    def get_loss(self, gfl_head_outs, gt_meta):\n        cls_logits, bboxes_reg = gfl_head_outs\n        num_level_anchors = [\n            featmap.shape[-2] * featmap.shape[-1] for featmap in cls_logits\n        ]\n        grid_cells_list = self._images_to_levels(gt_meta['grid_cells'],\n                                                 num_level_anchors)\n        labels_list = self._images_to_levels(gt_meta['labels'],\n                                             num_level_anchors)\n        label_weights_list = self._images_to_levels(gt_meta['label_weights'],\n                                                    num_level_anchors)\n        bbox_targets_list = self._images_to_levels(gt_meta['bbox_targets'],\n                                                   num_level_anchors)\n        num_total_pos = sum(gt_meta['pos_num'])\n        try:\n            paddle.distributed.all_reduce(num_total_pos)\n            num_total_pos = paddle.clip(\n                num_total_pos / 
paddle.distributed.get_world_size(), min=1)\n        except:\n            num_total_pos = max(num_total_pos, 1)\n\n        loss_bbox_list, loss_dfl_list, loss_qfl_list, avg_factor = [], [], [], []\n        for cls_score, bbox_pred, grid_cells, labels, label_weights, bbox_targets, stride in zip(\n                cls_logits, bboxes_reg, grid_cells_list, labels_list,\n                label_weights_list, bbox_targets_list, self.fpn_stride):\n            grid_cells = grid_cells.reshape([-1, 4])\n            cls_score = cls_score.transpose([0, 2, 3, 1]).reshape(\n                [-1, self.cls_out_channels])\n            bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape(\n                [-1, 4 * (self.reg_max + 1)])\n            bbox_targets = bbox_targets.reshape([-1, 4])\n            labels = labels.reshape([-1])\n            label_weights = label_weights.reshape([-1])\n\n            bg_class_ind = self.num_classes\n            pos_inds = paddle.nonzero(\n                paddle.logical_and((labels >= 0), (labels < bg_class_ind)),\n                as_tuple=False).squeeze(1)\n            score = np.zeros(labels.shape)\n            if len(pos_inds) > 0:\n                pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0)\n                pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0)\n                pos_grid_cells = paddle.gather(grid_cells, pos_inds, axis=0)\n                pos_grid_cell_centers = self._grid_cells_to_center(\n                    pos_grid_cells) / stride\n\n                weight_targets = F.sigmoid(cls_score.detach())\n                weight_targets = paddle.gather(\n                    weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0)\n                pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred)\n                pos_decode_bbox_pred = distance2bbox(pos_grid_cell_centers,\n                                                     pos_bbox_pred_corners)\n                pos_decode_bbox_targets = pos_bbox_targets / stride\n                bbox_iou = bbox_overlaps(\n                    pos_decode_bbox_pred.detach().numpy(),\n                    pos_decode_bbox_targets.detach().numpy(),\n                    is_aligned=True)\n                score[pos_inds.numpy()] = bbox_iou\n                pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1])\n                target_corners = bbox2distance(pos_grid_cell_centers,\n                                               pos_decode_bbox_targets,\n                                               self.reg_max).reshape([-1])\n                # regression loss\n                loss_bbox = paddle.sum(\n                    self.loss_bbox(pos_decode_bbox_pred,\n                                   pos_decode_bbox_targets) * weight_targets)\n\n                # dfl loss\n                loss_dfl = self.loss_dfl(\n                    pred_corners,\n                    target_corners,\n                    weight=weight_targets.expand([-1, 4]).reshape([-1]),\n                    avg_factor=4.0)\n            else:\n                loss_bbox = bbox_pred.sum() * 0\n                loss_dfl = bbox_pred.sum() * 0\n                weight_targets = paddle.to_tensor([0], dtype='float32')\n\n            # qfl loss\n            score = paddle.to_tensor(score)\n            loss_qfl = self.loss_qfl(\n                cls_score, (labels, score),\n                weight=label_weights,\n                avg_factor=num_total_pos)\n            loss_bbox_list.append(loss_bbox)\n            
loss_dfl_list.append(loss_dfl)\n            loss_qfl_list.append(loss_qfl)\n            avg_factor.append(weight_targets.sum())\n\n        avg_factor = sum(avg_factor)\n        try:\n            paddle.distributed.all_reduce(avg_factor)\n            avg_factor = paddle.clip(\n                avg_factor / paddle.distributed.get_world_size(), min=1)\n        except:\n            avg_factor = max(avg_factor.item(), 1)\n        if avg_factor <= 0:\n            loss_qfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False)\n            loss_bbox = paddle.to_tensor(\n                0, dtype='float32', stop_gradient=False)\n            loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False)\n        else:\n            losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list))\n            losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list))\n            loss_qfl = sum(loss_qfl_list)\n            loss_bbox = sum(losses_bbox)\n            loss_dfl = sum(losses_dfl)\n\n        loss_states = dict(\n            loss_qfl=loss_qfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl)\n\n        return loss_states\n\n    def get_single_level_center_point(self, featmap_size, stride,\n                                      cell_offset=0):\n        \"\"\"\n        Generate pixel centers of a single stage feature map.\n        Args:\n            featmap_size: height and width of the feature map\n            stride: down sample stride of the feature map\n        Returns:\n            y and x of the center points\n        \"\"\"\n        h, w = featmap_size\n        x_range = (paddle.arange(w, dtype='float32') + cell_offset) * stride\n        y_range = (paddle.arange(h, dtype='float32') + cell_offset) * stride\n        y, x = paddle.meshgrid(y_range, x_range)\n        y = y.flatten()\n        x = x.flatten()\n        return y, x\n\n    def post_process(self, gfl_head_outs, im_shape, scale_factor):\n        cls_scores, bboxes_reg = gfl_head_outs\n        bboxes = paddle.concat(bboxes_reg, axis=1)\n        # rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale]\n        im_scale = scale_factor.flip([1]).tile([1, 2]).unsqueeze(1)\n        bboxes /= im_scale\n        mlvl_scores = paddle.concat(cls_scores, axis=1)\n        mlvl_scores = mlvl_scores.transpose([0, 2, 1])\n        bbox_pred, bbox_num, _ = self.nms(bboxes, mlvl_scores)\n        return bbox_pred, bbox_num\n\n\n@register\nclass LDGFLHead(GFLHead):\n    \"\"\"\n    GFLHead for LD distill\n    Args:\n        conv_feat (object): Instance of 'FCOSFeat'\n        num_classes (int): Number of classes\n        fpn_stride (list): The stride of each FPN Layer\n        prior_prob (float): Used to set the bias init for the class prediction layer\n        loss_class (object): Instance of QualityFocalLoss.\n        loss_dfl (object): Instance of DistributionFocalLoss.\n        loss_bbox (object): Instance of bbox loss.\n        reg_max: Max value of integral set :math: `{0, ..., reg_max}`\n                in QFL setting. 
Default: 16.\n    \"\"\"\n    __inject__ = [\n        'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox',\n        'loss_ld', 'loss_ld_vlr', 'loss_kd', 'nms'\n    ]\n    __shared__ = ['num_classes']\n\n    def __init__(self,\n                 conv_feat='FCOSFeat',\n                 dgqp_module=None,\n                 num_classes=80,\n                 fpn_stride=[8, 16, 32, 64, 128],\n                 prior_prob=0.01,\n                 loss_class='QualityFocalLoss',\n                 loss_dfl='DistributionFocalLoss',\n                 loss_bbox='GIoULoss',\n                 loss_ld='KnowledgeDistillationKLDivLoss',\n                 loss_ld_vlr='KnowledgeDistillationKLDivLoss',\n                 loss_kd='KnowledgeDistillationKLDivLoss',\n                 reg_max=16,\n                 feat_in_chan=256,\n                 nms=None,\n                 nms_pre=1000,\n                 cell_offset=0):\n\n        super(LDGFLHead, self).__init__(\n            conv_feat=conv_feat,\n            dgqp_module=dgqp_module,\n            num_classes=num_classes,\n            fpn_stride=fpn_stride,\n            prior_prob=prior_prob,\n            loss_class=loss_class,\n            loss_dfl=loss_dfl,\n            loss_bbox=loss_bbox,\n            reg_max=reg_max,\n            feat_in_chan=feat_in_chan,\n            nms=nms,\n            nms_pre=nms_pre,\n            cell_offset=cell_offset)\n        self.loss_ld = loss_ld\n        self.loss_kd = loss_kd\n        self.loss_ld_vlr = loss_ld_vlr\n\n    def forward(self, fpn_feats):\n        assert len(fpn_feats) == len(\n            self.fpn_stride\n        ), \"The size of fpn_feats is not equal to size of fpn_stride\"\n        cls_logits_list = []\n        bboxes_reg_list = []\n        for stride, scale_reg, fpn_feat in zip(self.fpn_stride,\n                                               self.scales_regs, fpn_feats):\n            conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat)\n            cls_score = self.gfl_head_cls(conv_cls_feat)\n            bbox_pred = scale_reg(self.gfl_head_reg(conv_reg_feat))\n\n            if self.dgqp_module:\n                quality_score = self.dgqp_module(bbox_pred)\n                cls_score = F.sigmoid(cls_score) * quality_score\n            if not self.training:\n                cls_score = F.sigmoid(cls_score.transpose([0, 2, 3, 1]))\n                bbox_pred = bbox_pred.transpose([0, 2, 3, 1])\n                b, cell_h, cell_w, _ = cls_score.shape\n                y, x = self.get_single_level_center_point(\n                    [cell_h, cell_w], stride, cell_offset=self.cell_offset)\n                center_points = paddle.stack([x, y], axis=-1)\n                cls_score = cls_score.reshape([b, -1, self.cls_out_channels])\n                bbox_pred = self.distribution_project(bbox_pred) * stride\n                bbox_pred = bbox_pred.reshape([b, cell_h * cell_w, 4])\n\n                # NOTE: If keep_ratio=False and the image shape is a\n                # multiple of 32, distance2bbox does not set the max_shapes\n                # parameter, to speed up prediction. 
If max_shapes is needed,\n                # please use inputs['im_shape'].\n                bbox_pred = batch_distance2bbox(\n                    center_points, bbox_pred, max_shapes=None)\n\n            cls_logits_list.append(cls_score)\n            bboxes_reg_list.append(bbox_pred)\n\n        return (cls_logits_list, bboxes_reg_list)\n\n    def get_loss(self, gfl_head_outs, gt_meta, soft_label_list,\n                 soft_targets_list):\n        cls_logits, bboxes_reg = gfl_head_outs\n\n        num_level_anchors = [\n            featmap.shape[-2] * featmap.shape[-1] for featmap in cls_logits\n        ]\n\n        grid_cells_list = self._images_to_levels(gt_meta['grid_cells'],\n                                                 num_level_anchors)\n\n        labels_list = self._images_to_levels(gt_meta['labels'],\n                                             num_level_anchors)\n\n        label_weights_list = self._images_to_levels(gt_meta['label_weights'],\n                                                    num_level_anchors)\n        bbox_targets_list = self._images_to_levels(gt_meta['bbox_targets'],\n                                                   num_level_anchors)\n        # vlr regions\n        vlr_regions_list = self._images_to_levels(gt_meta['vlr_regions'],\n                                                  num_level_anchors)\n\n        num_total_pos = sum(gt_meta['pos_num'])\n        try:\n            paddle.distributed.all_reduce(num_total_pos)\n            num_total_pos = paddle.clip(\n                num_total_pos / paddle.distributed.get_world_size(), min=1.)\n        except:\n            num_total_pos = max(num_total_pos, 1)\n\n        loss_bbox_list, loss_dfl_list, loss_qfl_list, loss_ld_list, avg_factor = [], [], [], [], []\n        loss_ld_vlr_list, loss_kd_list = [], []\n\n        for cls_score, bbox_pred, grid_cells, labels, label_weights, bbox_targets, stride, soft_targets,\\\n                soft_label, vlr_region in zip(\n                cls_logits, bboxes_reg, grid_cells_list, labels_list,\n                label_weights_list, bbox_targets_list, self.fpn_stride, soft_targets_list,\n                soft_label_list, vlr_regions_list):\n\n            grid_cells = grid_cells.reshape([-1, 4])\n            cls_score = cls_score.transpose([0, 2, 3, 1]).reshape(\n                [-1, self.cls_out_channels])\n            bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape(\n                [-1, 4 * (self.reg_max + 1)])\n\n            soft_targets = soft_targets.transpose([0, 2, 3, 1]).reshape(\n                [-1, 4 * (self.reg_max + 1)])\n\n            soft_label = soft_label.transpose([0, 2, 3, 1]).reshape(\n                [-1, self.cls_out_channels])\n\n            # feature-map distillation (currently unused)\n            # teacher_x = teacher_x.transpose([0, 2, 3, 1]).reshape([-1, 256])\n            # x = x.transpose([0, 2, 3, 1]).reshape([-1, 256])\n\n            bbox_targets = bbox_targets.reshape([-1, 4])\n            labels = labels.reshape([-1])\n            label_weights = label_weights.reshape([-1])\n\n            vlr_region = vlr_region.reshape([-1])\n\n            bg_class_ind = self.num_classes\n            pos_inds = paddle.nonzero(\n                paddle.logical_and((labels >= 0), (labels < bg_class_ind)),\n                as_tuple=False).squeeze(1)\n            score = np.zeros(labels.shape)\n\n            remain_inds = (vlr_region > 0).nonzero()\n\n            if len(pos_inds) > 0:\n                pos_bbox_targets = 
paddle.gather(bbox_targets, pos_inds, axis=0)\n                pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0)\n                pos_grid_cells = paddle.gather(grid_cells, pos_inds, axis=0)\n\n                pos_grid_cell_centers = self._grid_cells_to_center(\n                    pos_grid_cells) / stride\n\n                weight_targets = F.sigmoid(cls_score.detach())\n                weight_targets = paddle.gather(\n                    weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0)\n                pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred)\n                pos_decode_bbox_pred = distance2bbox(pos_grid_cell_centers,\n                                                     pos_bbox_pred_corners)\n                pos_decode_bbox_targets = pos_bbox_targets / stride\n                bbox_iou = bbox_overlaps(\n                    pos_decode_bbox_pred.detach().numpy(),\n                    pos_decode_bbox_targets.detach().numpy(),\n                    is_aligned=True)\n                score[pos_inds.numpy()] = bbox_iou\n                pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1])\n\n                pos_soft_targets = paddle.gather(soft_targets, pos_inds, axis=0)\n                soft_corners = pos_soft_targets.reshape([-1, self.reg_max + 1])\n\n                target_corners = bbox2distance(pos_grid_cell_centers,\n                                               pos_decode_bbox_targets,\n                                               self.reg_max).reshape([-1])\n                # regression loss\n                loss_bbox = paddle.sum(\n                    self.loss_bbox(pos_decode_bbox_pred,\n                                   pos_decode_bbox_targets) * weight_targets)\n\n                # dfl loss\n                loss_dfl = self.loss_dfl(\n                    pred_corners,\n                    target_corners,\n                    weight=weight_targets.expand([-1, 4]).reshape([-1]),\n                    avg_factor=4.0)\n\n                # ld loss\n                loss_ld = self.loss_ld(\n                    pred_corners,\n                    soft_corners,\n                    weight=weight_targets.expand([-1, 4]).reshape([-1]),\n                    avg_factor=4.0)\n\n                loss_kd = self.loss_kd(\n                    paddle.gather(\n                        cls_score, pos_inds, axis=0),\n                    paddle.gather(\n                        soft_label, pos_inds, axis=0),\n                    weight=paddle.gather(\n                        label_weights, pos_inds, axis=0),\n                    avg_factor=pos_inds.shape[0])\n\n            else:\n                loss_bbox = bbox_pred.sum() * 0\n                loss_dfl = bbox_pred.sum() * 0\n                loss_ld = bbox_pred.sum() * 0\n                loss_kd = bbox_pred.sum() * 0\n                weight_targets = paddle.to_tensor([0], dtype='float32')\n\n            if len(remain_inds) > 0:\n                neg_pred_corners = bbox_pred[remain_inds].reshape(\n                    [-1, self.reg_max + 1])\n                neg_soft_corners = soft_targets[remain_inds].reshape(\n                    [-1, self.reg_max + 1])\n\n                remain_targets = vlr_region[remain_inds]\n\n                loss_ld_vlr = self.loss_ld_vlr(\n                    neg_pred_corners,\n                    neg_soft_corners,\n                    weight=remain_targets.expand([-1, 4]).reshape([-1]),\n                    avg_factor=16.0)\n            else:\n                loss_ld_vlr = 
bbox_pred.sum() * 0\n\n            # qfl loss\n            score = paddle.to_tensor(score)\n            loss_qfl = self.loss_qfl(\n                cls_score, (labels, score),\n                weight=label_weights,\n                avg_factor=num_total_pos)\n\n            loss_bbox_list.append(loss_bbox)\n            loss_dfl_list.append(loss_dfl)\n            loss_qfl_list.append(loss_qfl)\n            loss_ld_list.append(loss_ld)\n            loss_ld_vlr_list.append(loss_ld_vlr)\n            loss_kd_list.append(loss_kd)\n            avg_factor.append(weight_targets.sum())\n\n        avg_factor = sum(avg_factor)  # + 1e-6\n        try:\n            paddle.distributed.all_reduce(avg_factor)\n            avg_factor = paddle.clip(\n                avg_factor / paddle.distributed.get_world_size(), min=1)\n        except:\n            avg_factor = max(avg_factor.item(), 1)\n\n        if avg_factor <= 0:\n            loss_qfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False)\n            loss_bbox = paddle.to_tensor(\n                0, dtype='float32', stop_gradient=False)\n            loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False)\n            loss_ld = paddle.to_tensor(0, dtype='float32', stop_gradient=False)\n            loss_ld_vlr = paddle.to_tensor(\n                0, dtype='float32', stop_gradient=False)\n            loss_kd = paddle.to_tensor(0, dtype='float32', stop_gradient=False)\n        else:\n            losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list))\n            losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list))\n            loss_qfl = sum(loss_qfl_list)\n            loss_bbox = sum(losses_bbox)\n            loss_dfl = sum(losses_dfl)\n            loss_ld = sum(loss_ld_list)\n            loss_ld_vlr = sum(loss_ld_vlr_list)\n            loss_kd = sum(loss_kd_list)\n\n        loss_states = dict(\n            loss_qfl=loss_qfl,\n            loss_bbox=loss_bbox,\n            loss_dfl=loss_dfl,\n            loss_ld=loss_ld,\n            loss_ld_vlr=loss_ld_vlr,\n            loss_kd=loss_kd)\n\n        return loss_states\n"
  },
  {
    "path": "ppdet/modeling/heads/keypoint_hrhrnet_head.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\n\nfrom ppdet.core.workspace import register\nfrom .. import layers as L\nfrom ..backbones.hrnet import BasicBlock\n\n\n@register\nclass HrHRNetHead(nn.Layer):\n    __inject__ = ['loss']\n\n    def __init__(self, num_joints, loss='HrHRNetLoss', swahr=False, width=32):\n        \"\"\"\n        Head for HigherHRNet network\n\n        Args:\n            num_joints (int): number of keypoints\n            hrloss (object): HrHRNetLoss instance\n            swahr (bool): whether to use swahr\n            width (int): hrnet channel width\n        \"\"\"\n        super(HrHRNetHead, self).__init__()\n        self.loss = loss\n\n        self.num_joints = num_joints\n        num_featout1 = num_joints * 2\n        num_featout2 = num_joints\n        self.swahr = swahr\n        self.conv1 = L.Conv2d(width, num_featout1, 1, 1, 0, bias=True)\n        self.conv2 = L.Conv2d(width, num_featout2, 1, 1, 0, bias=True)\n        self.deconv = nn.Sequential(\n            L.ConvTranspose2d(\n                num_featout1 + width, width, 4, 2, 1, 0, bias=False),\n            L.BatchNorm2d(width),\n            L.ReLU())\n        self.blocks = nn.Sequential(*(BasicBlock(\n            num_channels=width,\n            num_filters=width,\n            has_se=False,\n            freeze_norm=False,\n            name='HrHRNetHead_{}'.format(i)) for i in range(4)))\n\n        self.interpolate = L.Upsample(2, mode='bilinear')\n        self.concat = L.Concat(dim=1)\n        if swahr:\n            self.scalelayer0 = nn.Sequential(\n                L.Conv2d(\n                    width, num_joints, 1, 1, 0, bias=True),\n                L.BatchNorm2d(num_joints),\n                L.ReLU(),\n                L.Conv2d(\n                    num_joints,\n                    num_joints,\n                    9,\n                    1,\n                    4,\n                    groups=num_joints,\n                    bias=True))\n            self.scalelayer1 = nn.Sequential(\n                L.Conv2d(\n                    width, num_joints, 1, 1, 0, bias=True),\n                L.BatchNorm2d(num_joints),\n                L.ReLU(),\n                L.Conv2d(\n                    num_joints,\n                    num_joints,\n                    9,\n                    1,\n                    4,\n                    groups=num_joints,\n                    bias=True))\n\n    def forward(self, feats, targets=None):\n        x1 = feats[0]\n        xo1 = self.conv1(x1)\n        x2 = self.blocks(self.deconv(self.concat((x1, xo1))))\n        xo2 = self.conv2(x2)\n        num_joints = self.num_joints\n        if self.training:\n            heatmap1, tagmap = paddle.split(xo1, 2, axis=1)\n            if self.swahr:\n                so1 = self.scalelayer0(x1)\n                so2 = self.scalelayer1(x2)\n                hrhrnet_outputs = ([heatmap1, 
so1], [xo2, so2], tagmap)\n                return self.loss(hrhrnet_outputs, targets)\n            else:\n                hrhrnet_outputs = (heatmap1, xo2, tagmap)\n                return self.loss(hrhrnet_outputs, targets)\n\n        # averaged heatmap, upsampled tagmap\n        upsampled = self.interpolate(xo1)\n        avg = (upsampled[:, :num_joints] + xo2[:, :num_joints]) / 2\n        return avg, upsampled[:, num_joints:]\n"
  },
  {
    "path": "ppdet/modeling/heads/mask_head.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import KaimingNormal\n\nfrom ppdet.core.workspace import register, create\nfrom ppdet.modeling.layers import ConvNormLayer\nfrom .roi_extractor import RoIAlign\nfrom ..cls_utils import _get_class_default_kwargs\n\n\n@register\nclass MaskFeat(nn.Layer):\n    \"\"\"\n    Feature extraction in Mask head\n\n    Args:\n        in_channel (int): Input channels\n        out_channel (int): Output channels\n        num_convs (int): The number of conv layers, default 4\n        norm_type (string | None): Norm type, bn, gn, sync_bn are available,\n            default None\n    \"\"\"\n\n    def __init__(self,\n                 in_channel=256,\n                 out_channel=256,\n                 num_convs=4,\n                 norm_type=None):\n        super(MaskFeat, self).__init__()\n        self.num_convs = num_convs\n        self.in_channel = in_channel\n        self.out_channel = out_channel\n        self.norm_type = norm_type\n        fan_conv = out_channel * 3 * 3\n        fan_deconv = out_channel * 2 * 2\n\n        mask_conv = nn.Sequential()\n        if norm_type == 'gn':\n            for i in range(self.num_convs):\n                conv_name = 'mask_inter_feat_{}'.format(i + 1)\n                mask_conv.add_sublayer(\n                    conv_name,\n                    ConvNormLayer(\n                        ch_in=in_channel if i == 0 else out_channel,\n                        ch_out=out_channel,\n                        filter_size=3,\n                        stride=1,\n                        norm_type=self.norm_type,\n                        initializer=KaimingNormal(fan_in=fan_conv),\n                        skip_quant=True))\n                mask_conv.add_sublayer(conv_name + 'act', nn.ReLU())\n        else:\n            for i in range(self.num_convs):\n                conv_name = 'mask_inter_feat_{}'.format(i + 1)\n                conv = nn.Conv2D(\n                    in_channels=in_channel if i == 0 else out_channel,\n                    out_channels=out_channel,\n                    kernel_size=3,\n                    padding=1,\n                    weight_attr=paddle.ParamAttr(\n                        initializer=KaimingNormal(fan_in=fan_conv)))\n                conv.skip_quant = True\n                mask_conv.add_sublayer(conv_name, conv)\n                mask_conv.add_sublayer(conv_name + 'act', nn.ReLU())\n        mask_conv.add_sublayer(\n            'conv5_mask',\n            nn.Conv2DTranspose(\n                in_channels=self.out_channel if num_convs > 0 else self.in_channel,\n                out_channels=self.out_channel,\n                kernel_size=2,\n                stride=2,\n                weight_attr=paddle.ParamAttr(\n                    initializer=KaimingNormal(fan_in=fan_deconv))))\n        
mask_conv.add_sublayer('conv5_mask' + 'act', nn.ReLU())\n        self.upsample = mask_conv\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        if isinstance(input_shape, (list, tuple)):\n            input_shape = input_shape[0]\n        return {'in_channel': input_shape.channels, }\n\n    def out_channels(self):\n        return self.out_channel\n\n    def forward(self, feats):\n        return self.upsample(feats)\n\n\n@register\nclass MaskHead(nn.Layer):\n    __shared__ = ['num_classes', 'export_onnx']\n    __inject__ = ['mask_assigner']\n    \"\"\"\n    RCNN mask head\n\n    Args:\n        head (nn.Layer): Extract feature in mask head\n        roi_extractor (object): The module of RoI Extractor\n        mask_assigner (object): The module of Mask Assigner, which\n            labels and samples the masks\n        num_classes (int): The number of classes\n        share_bbox_feat (bool): Whether to share the feature from bbox head,\n            default false\n    \"\"\"\n\n    def __init__(self,\n                 head,\n                 roi_extractor=_get_class_default_kwargs(RoIAlign),\n                 mask_assigner='MaskAssigner',\n                 num_classes=80,\n                 share_bbox_feat=False,\n                 export_onnx=False):\n        super(MaskHead, self).__init__()\n        self.num_classes = num_classes\n        self.export_onnx = export_onnx\n\n        self.roi_extractor = roi_extractor\n        if isinstance(roi_extractor, dict):\n            self.roi_extractor = RoIAlign(**roi_extractor)\n        self.head = head\n        self.in_channels = head.out_channels()\n        self.mask_assigner = mask_assigner\n        self.share_bbox_feat = share_bbox_feat\n        self.bbox_head = None\n\n        self.mask_fcn_logits = nn.Conv2D(\n            in_channels=self.in_channels,\n            out_channels=self.num_classes,\n            kernel_size=1,\n            weight_attr=paddle.ParamAttr(initializer=KaimingNormal(\n                fan_in=self.num_classes)))\n        self.mask_fcn_logits.skip_quant = True\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        roi_pooler = cfg['roi_extractor']\n        assert isinstance(roi_pooler, dict)\n        kwargs = RoIAlign.from_config(cfg, input_shape)\n        roi_pooler.update(kwargs)\n        kwargs = {'input_shape': input_shape}\n        head = create(cfg['head'], **kwargs)\n        return {\n            'roi_extractor': roi_pooler,\n            'head': head,\n        }\n\n    def get_loss(self, mask_logits, mask_label, mask_target, mask_weight):\n        mask_label = F.one_hot(mask_label, self.num_classes).unsqueeze([2, 3])\n        mask_label = paddle.expand_as(mask_label, mask_logits)\n        mask_label.stop_gradient = True\n        mask_pred = paddle.gather_nd(mask_logits, paddle.nonzero(mask_label))\n        shape = mask_logits.shape\n        mask_pred = paddle.reshape(mask_pred, [shape[0], shape[2], shape[3]])\n\n        mask_target = mask_target.cast('float32')\n        mask_weight = mask_weight.unsqueeze([1, 2])\n        loss_mask = F.binary_cross_entropy_with_logits(\n            mask_pred, mask_target, weight=mask_weight, reduction=\"mean\")\n        return loss_mask\n\n    def forward_train(self, body_feats, rois, rois_num, inputs, targets,\n                      bbox_feat):\n        \"\"\"\n        body_feats (list[Tensor]): Multi-level backbone features\n        rois (list[Tensor]): Proposals for each batch with shape [N, 4]\n        rois_num (Tensor): The number of proposals for each 
batch\n        inputs (dict): Ground truth info\n        \"\"\"\n        tgt_labels, _, tgt_gt_inds = targets\n        rois, rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights = self.mask_assigner(\n            rois, tgt_labels, tgt_gt_inds, inputs)\n\n        if self.share_bbox_feat:\n            rois_feat = paddle.gather(bbox_feat, mask_index)\n        else:\n            rois_feat = self.roi_extractor(body_feats, rois, rois_num)\n        mask_feat = self.head(rois_feat)\n        mask_logits = self.mask_fcn_logits(mask_feat)\n\n        loss_mask = self.get_loss(mask_logits, tgt_classes, tgt_masks,\n                                  tgt_weights)\n        return {'loss_mask': loss_mask}\n\n    def forward_test(self,\n                     body_feats,\n                     rois,\n                     rois_num,\n                     scale_factor,\n                     feat_func=None):\n        \"\"\"\n        body_feats (list[Tensor]): Multi-level backbone features\n        rois (Tensor): Prediction from bbox head with shape [N, 6]\n        rois_num (Tensor): The number of predictions for each batch\n        scale_factor (Tensor): The scale factor from origin size to input size\n        \"\"\"\n        if not self.export_onnx and rois.shape[0] == 0:\n            mask_out = paddle.full([1, 1, 1], -1)\n        else:\n            bbox = [rois[:, 2:]]\n            labels = rois[:, 0].cast('int32')\n            rois_feat = self.roi_extractor(body_feats, bbox, rois_num)\n            if self.share_bbox_feat:\n                assert feat_func is not None\n                rois_feat = feat_func(rois_feat)\n\n            mask_feat = self.head(rois_feat)\n            mask_logit = self.mask_fcn_logits(mask_feat)\n            if self.num_classes == 1:\n                mask_out = F.sigmoid(mask_logit)[:, 0, :, :]\n            else:\n                num_masks = mask_logit.shape[0]\n                index = paddle.arange(num_masks).cast('int32')\n                mask_out = mask_logit[index, labels]\n                mask_out_shape = mask_out.shape\n                mask_out = paddle.reshape(mask_out, \n                    index.shape + [mask_out_shape[-2]] + [mask_out_shape[-1]])\n                mask_out = F.sigmoid(mask_out)\n        return mask_out\n\n    def forward(self,\n                body_feats,\n                rois,\n                rois_num,\n                inputs,\n                targets=None,\n                bbox_feat=None,\n                feat_func=None):\n        if self.training:\n            return self.forward_train(body_feats, rois, rois_num, inputs,\n                                      targets, bbox_feat)\n        else:\n            im_scale = inputs['scale_factor']\n            return self.forward_test(body_feats, rois, rois_num, im_scale,\n                                     feat_func)\n"
  },
  {
    "path": "ppdet/modeling/heads/petr_head.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\"\"\"\nthis code is base on https://github.com/hikvision-research/opera/blob/main/opera/models/dense_heads/petr_head.py\n\"\"\"\nimport copy\nimport numpy as np\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register\nimport paddle.distributed as dist\n\nfrom ..transformers.petr_transformer import inverse_sigmoid, masked_fill\nfrom ..initializer import constant_, normal_\n\n__all__ = [\"PETRHead\"]\n\nfrom functools import partial\n\n\ndef bias_init_with_prob(prior_prob: float) -> float:\n    \"\"\"initialize conv/fc bias value according to a given probability value.\"\"\"\n    bias_init = float(-np.log((1 - prior_prob) / prior_prob))\n    return bias_init\n\n\ndef multi_apply(func, *args, **kwargs):\n    \"\"\"Apply function to a list of arguments.\n\n    Note:\n        This function applies the ``func`` to multiple inputs and\n        map the multiple outputs of the ``func`` into different\n        list. Each list contains the same type of outputs corresponding\n        to different inputs.\n\n    Args:\n        func (Function): A function that will be applied to a list of\n            arguments\n\n    Returns:\n        tuple(list): A tuple containing multiple list, each list contains \\\n            a kind of returned results by the function\n    \"\"\"\n    pfunc = partial(func, **kwargs) if kwargs else func\n    map_results = map(pfunc, *args)\n    res = tuple(map(list, zip(*map_results)))\n    return res\n\n\ndef reduce_mean(tensor):\n    \"\"\"\"Obtain the mean of tensor on different GPUs.\"\"\"\n    if not (dist.get_world_size() and dist.is_initialized()):\n        return tensor\n    tensor = tensor.clone()\n    dist.all_reduce(\n        tensor.divide(\n            paddle.to_tensor(\n                dist.get_world_size(), dtype='float32')),\n        op=dist.ReduceOp.SUM)\n    return tensor\n\n\ndef gaussian_radius(det_size, min_overlap=0.7):\n    \"\"\"calculate gaussian radius according to object size.\n    \"\"\"\n    height, width = det_size\n\n    a1 = 1\n    b1 = (height + width)\n    c1 = width * height * (1 - min_overlap) / (1 + min_overlap)\n    sq1 = paddle.sqrt(b1**2 - 4 * a1 * c1)\n    r1 = (b1 + sq1) / 2\n\n    a2 = 4\n    b2 = 2 * (height + width)\n    c2 = (1 - min_overlap) * width * height\n    sq2 = paddle.sqrt(b2**2 - 4 * a2 * c2)\n    r2 = (b2 + sq2) / 2\n\n    a3 = 4 * min_overlap\n    b3 = -2 * min_overlap * (height + width)\n    c3 = (min_overlap - 1) * width * height\n    sq3 = paddle.sqrt(b3**2 - 4 * a3 * c3)\n    r3 = (b3 + sq3) / 2\n    return min(r1, r2, r3)\n\n\ndef gaussian2D(shape, sigma=1):\n    m, n = [(ss - 1.) / 2. 
for ss in shape]\n    y = paddle.arange(-m, m + 1, dtype=\"float32\")[:, None]\n    x = paddle.arange(-n, n + 1, dtype=\"float32\")[None, :]\n    # y, x = np.ogrid[-m:m + 1, -n:n + 1]\n\n    h = paddle.exp(-(x * x + y * y) / (2 * sigma * sigma))\n    h[h < np.finfo(np.float32).eps * h.max()] = 0\n    return h\n\n\ndef draw_umich_gaussian(heatmap, center, radius, k=1):\n    diameter = 2 * radius + 1\n    gaussian = gaussian2D((diameter, diameter), sigma=diameter / 6)\n    gaussian = paddle.to_tensor(gaussian, dtype=heatmap.dtype)\n\n    x, y = int(center[0]), int(center[1])\n    radius = int(radius)\n\n    height, width = heatmap.shape[0:2]\n\n    left, right = min(x, radius), min(width - x, radius + 1)\n    top, bottom = min(y, radius), min(height - y, radius + 1)\n\n    masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]\n    masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:\n                               radius + right]\n    # assert masked_gaussian.equal(1).float().sum() == 1\n    if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0:\n        heatmap[y - top:y + bottom, x - left:x + right] = paddle.maximum(\n            masked_heatmap, masked_gaussian * k)\n    return heatmap\n\n\n@register\nclass PETRHead(nn.Layer):\n    \"\"\"Head of `End-to-End Multi-Person Pose Estimation with Transformers`.\n\n    Args:\n        num_classes (int): Number of categories excluding the background.\n        in_channels (int): Number of channels in the input feature map.\n        num_query (int): Number of queries in Transformer.\n        num_kpt_fcs (int, optional): Number of fully-connected layers used in\n            `FFN`, which is then used for the keypoint regression head.\n            Default 2.\n        transformer (obj:`mmcv.ConfigDict`|dict): ConfigDict is used for\n            building the Encoder and Decoder. Default: None.\n        sync_cls_avg_factor (bool): Whether to sync the avg_factor of\n            all ranks. Defaults to True.\n        positional_encoding (obj:`mmcv.ConfigDict`|dict):\n            Config for position encoding.\n        loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the\n            classification loss. Default `FocalLoss`.\n        loss_kpt (obj:`mmcv.ConfigDict`|dict): Config of the\n            regression loss. Default `L1Loss`.\n        loss_oks (obj:`mmcv.ConfigDict`|dict): Config of the\n            regression oks loss. Default `OKSLoss`.\n        loss_hm (obj:`mmcv.ConfigDict`|dict): Config of the\n            regression heatmap loss. Default `CenterFocalLoss`.\n        as_two_stage (bool): Whether to generate the proposal from\n            the outputs of encoder.\n        with_kpt_refine (bool): Whether to refine the reference points\n            in the decoder. 
Defaults to True.\n        test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of\n            transformer head.\n        init_cfg (dict or list[dict], optional): Initialization config dict.\n            Default: None.\n    \"\"\"\n    __inject__ = [\n        \"transformer\", \"positional_encoding\", \"assigner\", \"sampler\", \"loss_cls\",\n        \"loss_kpt\", \"loss_oks\", \"loss_hm\", \"loss_kpt_rpn\", \"loss_kpt_refine\",\n        \"loss_oks_refine\"\n    ]\n\n    def __init__(self,\n                 num_classes,\n                 in_channels,\n                 num_query=100,\n                 num_kpt_fcs=2,\n                 num_keypoints=17,\n                 transformer=None,\n                 sync_cls_avg_factor=True,\n                 positional_encoding='SinePositionalEncoding',\n                 loss_cls='FocalLoss',\n                 loss_kpt='L1Loss',\n                 loss_oks='OKSLoss',\n                 loss_hm='CenterFocalLoss',\n                 with_kpt_refine=True,\n                 assigner='PoseHungarianAssigner',\n                 sampler='PseudoSampler',\n                 loss_kpt_rpn='L1Loss',\n                 loss_kpt_refine='L1Loss',\n                 loss_oks_refine='opera.OKSLoss',\n                 test_cfg=dict(max_per_img=100),\n                 init_cfg=None,\n                 **kwargs):\n        # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,\n        # since it brings inconvenience when the initialization of\n        # `AnchorFreeHead` is called.\n        super().__init__()\n        self.bg_cls_weight = 0\n        self.sync_cls_avg_factor = sync_cls_avg_factor\n        self.assigner = assigner\n        self.sampler = sampler\n        self.num_query = num_query\n        self.num_classes = num_classes\n        self.in_channels = in_channels\n        self.num_kpt_fcs = num_kpt_fcs\n        self.test_cfg = test_cfg\n        self.fp16_enabled = False\n        self.as_two_stage = transformer.as_two_stage\n        self.with_kpt_refine = with_kpt_refine\n        self.num_keypoints = num_keypoints\n        self.loss_cls = loss_cls\n        self.loss_kpt = loss_kpt\n        self.loss_kpt_rpn = loss_kpt_rpn\n        self.loss_kpt_refine = loss_kpt_refine\n        self.loss_oks = loss_oks\n        self.loss_oks_refine = loss_oks_refine\n        self.loss_hm = loss_hm\n        if self.loss_cls.use_sigmoid:\n            self.cls_out_channels = num_classes\n        else:\n            self.cls_out_channels = num_classes + 1\n        self.positional_encoding = positional_encoding\n        self.transformer = transformer\n        self.embed_dims = self.transformer.embed_dims\n        # assert 'num_feats' in positional_encoding\n        num_feats = positional_encoding.num_pos_feats\n        assert num_feats * 2 == self.embed_dims, 'embed_dims should' \\\n            f' be exactly 2 times of num_feats. 
Found {self.embed_dims}' \\\n            f' and {num_feats}.'\n        self._init_layers()\n        self.init_weights()\n\n    def _init_layers(self):\n        \"\"\"Initialize classification branch and keypoint branch of head.\"\"\"\n\n        fc_cls = nn.Linear(self.embed_dims, self.cls_out_channels)\n\n        kpt_branch = []\n        kpt_branch.append(nn.Linear(self.embed_dims, 512))\n        kpt_branch.append(nn.ReLU())\n        for _ in range(self.num_kpt_fcs):\n            kpt_branch.append(nn.Linear(512, 512))\n            kpt_branch.append(nn.ReLU())\n        kpt_branch.append(nn.Linear(512, 2 * self.num_keypoints))\n        kpt_branch = nn.Sequential(*kpt_branch)\n\n        def _get_clones(module, N):\n            return nn.LayerList([copy.deepcopy(module) for i in range(N)])\n\n        # last kpt_branch is used to generate proposal from\n        # encode feature map when as_two_stage is True.\n        num_pred = (self.transformer.decoder.num_layers + 1) if \\\n            self.as_two_stage else self.transformer.decoder.num_layers\n\n        if self.with_kpt_refine:\n            self.cls_branches = _get_clones(fc_cls, num_pred)\n            self.kpt_branches = _get_clones(kpt_branch, num_pred)\n        else:\n            self.cls_branches = nn.LayerList([fc_cls for _ in range(num_pred)])\n            self.kpt_branches = nn.LayerList(\n                [kpt_branch for _ in range(num_pred)])\n\n        self.query_embedding = nn.Embedding(self.num_query, self.embed_dims * 2)\n\n        refine_kpt_branch = []\n        for _ in range(self.num_kpt_fcs):\n            refine_kpt_branch.append(\n                nn.Linear(self.embed_dims, self.embed_dims))\n            refine_kpt_branch.append(nn.ReLU())\n        refine_kpt_branch.append(nn.Linear(self.embed_dims, 2))\n        refine_kpt_branch = nn.Sequential(*refine_kpt_branch)\n        if self.with_kpt_refine:\n            num_pred = self.transformer.refine_decoder.num_layers\n            self.refine_kpt_branches = _get_clones(refine_kpt_branch, num_pred)\n        self.fc_hm = nn.Linear(self.embed_dims, self.num_keypoints)\n\n    def init_weights(self):\n        \"\"\"Initialize weights of the PETR head.\"\"\"\n        self.transformer.init_weights()\n        if self.loss_cls.use_sigmoid:\n            bias_init = bias_init_with_prob(0.01)\n            for m in self.cls_branches:\n                constant_(m.bias, bias_init)\n        for m in self.kpt_branches:\n            constant_(m[-1].bias, 0)\n        # initialization of keypoint refinement branch\n        if self.with_kpt_refine:\n            for m in self.refine_kpt_branches:\n                constant_(m[-1].bias, 0)\n        # initialize bias for heatmap prediction\n        bias_init = bias_init_with_prob(0.1)\n        normal_(self.fc_hm.weight, std=0.01)\n        constant_(self.fc_hm.bias, bias_init)\n\n    def forward(self, mlvl_feats, img_metas):\n        \"\"\"Forward function.\n\n        Args:\n            mlvl_feats (tuple[Tensor]): Features from the upstream\n                network, each is a 4D-tensor with shape\n                (N, C, H, W).\n            img_metas (list[dict]): List of image information.\n\n        Returns:\n            outputs_classes (Tensor): Outputs from the classification head,\n                shape [nb_dec, bs, num_query, cls_out_channels]. 
Note\n                cls_out_channels should include background.\n            outputs_kpts (Tensor): Sigmoid outputs from the regression\n                head with normalized coordinate format (x_{i}, y_{i}).\n                Shape [nb_dec, bs, num_query, K*2].\n            enc_outputs_class (Tensor): The score of each point on encode\n                feature map, has shape (N, h*w, num_class). Only\n                returned when as_two_stage is True, otherwise\n                `None` is returned.\n            enc_outputs_kpt (Tensor): The proposals generated from the\n                encode feature map, has shape (N, h*w, K*2). Only\n                returned when as_two_stage is True, otherwise\n                `None` is returned.\n        \"\"\"\n\n        batch_size = mlvl_feats[0].shape[0]\n        input_img_h, input_img_w = img_metas[0]['batch_input_shape']\n        img_masks = paddle.zeros(\n            (batch_size, input_img_h, input_img_w), dtype=mlvl_feats[0].dtype)\n        for img_id in range(batch_size):\n            img_h, img_w, _ = img_metas[img_id]['img_shape']\n            img_masks[img_id, :img_h, :img_w] = 1\n\n        mlvl_masks = []\n        mlvl_positional_encodings = []\n        for feat in mlvl_feats:\n            mlvl_masks.append(\n                F.interpolate(\n                    img_masks[None], size=feat.shape[-2:]).squeeze(0))\n            mlvl_positional_encodings.append(\n                self.positional_encoding(mlvl_masks[-1]).transpose(\n                    [0, 3, 1, 2]))\n\n        query_embeds = self.query_embedding.weight\n        hs, init_reference, inter_references, \\\n            enc_outputs_class, enc_outputs_kpt, hm_proto, memory = \\\n                self.transformer(\n                    mlvl_feats,\n                    mlvl_masks,\n                    query_embeds,\n                    mlvl_positional_encodings,\n                    kpt_branches=self.kpt_branches \\\n                        if self.with_kpt_refine else None,  # noqa:E501\n                    cls_branches=self.cls_branches \\\n                        if self.as_two_stage else None  # noqa:E501\n            )\n\n        outputs_classes = []\n        outputs_kpts = []\n\n        for lvl in range(hs.shape[0]):\n            if lvl == 0:\n                reference = init_reference\n            else:\n                reference = inter_references[lvl - 1]\n            reference = inverse_sigmoid(reference)\n            outputs_class = self.cls_branches[lvl](hs[lvl])\n            tmp_kpt = self.kpt_branches[lvl](hs[lvl])\n            assert reference.shape[-1] == self.num_keypoints * 2\n            tmp_kpt += reference\n            outputs_kpt = F.sigmoid(tmp_kpt)\n            outputs_classes.append(outputs_class)\n            outputs_kpts.append(outputs_kpt)\n\n        outputs_classes = paddle.stack(outputs_classes)\n        outputs_kpts = paddle.stack(outputs_kpts)\n\n        if hm_proto is not None:\n            # get heatmap prediction (training phase)\n            hm_memory, hm_mask = hm_proto\n            hm_pred = self.fc_hm(hm_memory)\n            hm_proto = (hm_pred.transpose((0, 3, 1, 2)), hm_mask)\n\n        if self.as_two_stage:\n            return outputs_classes, outputs_kpts, \\\n                enc_outputs_class, F.sigmoid(enc_outputs_kpt), \\\n                hm_proto, memory, mlvl_masks\n        else:\n            raise RuntimeError('only \"as_two_stage=True\" is supported.')\n\n    def forward_refine(self, memory, mlvl_masks, 
refine_targets, losses,\n                       img_metas):\n        \"\"\"Forward function.\n\n        Args:\n            mlvl_masks (tuple[Tensor]): The key_padding_mask from\n                different level used for encoder and decoder,\n                each is a 3D-tensor with shape (bs, H, W).\n            losses (dict[str, Tensor]): A dictionary of loss components.\n            img_metas (list[dict]): List of image information.\n\n        Returns:\n            dict[str, Tensor]: A dictionary of loss components.\n        \"\"\"\n        kpt_preds, kpt_targets, area_targets, kpt_weights = refine_targets\n        pos_inds = kpt_weights.sum(-1) > 0\n        if not pos_inds.any():\n            pos_kpt_preds = paddle.zeros_like(kpt_preds[:1])\n            pos_img_inds = paddle.zeros([1], dtype=\"int64\")\n        else:\n            pos_kpt_preds = kpt_preds[pos_inds]\n            pos_img_inds = (pos_inds.nonzero() /\n                            self.num_query).squeeze(1).astype(\"int64\")\n        hs, init_reference, inter_references = self.transformer.forward_refine(\n            mlvl_masks,\n            memory,\n            pos_kpt_preds.detach(),\n            pos_img_inds,\n            kpt_branches=self.refine_kpt_branches\n            if self.with_kpt_refine else None,  # noqa:E501\n        )\n\n        outputs_kpts = []\n\n        for lvl in range(hs.shape[0]):\n            if lvl == 0:\n                reference = init_reference\n            else:\n                reference = inter_references[lvl - 1]\n            reference = inverse_sigmoid(reference)\n            tmp_kpt = self.refine_kpt_branches[lvl](hs[lvl])\n            assert reference.shape[-1] == 2\n            tmp_kpt += reference\n            outputs_kpt = F.sigmoid(tmp_kpt)\n            outputs_kpts.append(outputs_kpt)\n        outputs_kpts = paddle.stack(outputs_kpts)\n\n        if not self.training:\n            return outputs_kpts\n\n        num_valid_kpt = paddle.clip(\n            reduce_mean(kpt_weights.sum()), min=1).item()\n        num_total_pos = paddle.to_tensor(\n            [outputs_kpts.shape[1]], dtype=kpt_weights.dtype)\n        num_total_pos = paddle.clip(reduce_mean(num_total_pos), min=1).item()\n\n        if not pos_inds.any():\n            for i, kpt_refine_preds in enumerate(outputs_kpts):\n                loss_kpt = loss_oks = kpt_refine_preds.sum() * 0\n                losses[f'd{i}.loss_kpt_refine'] = loss_kpt\n                losses[f'd{i}.loss_oks_refine'] = loss_oks\n                continue\n            return losses\n\n        batch_size = mlvl_masks[0].shape[0]\n        factors = []\n        for img_id in range(batch_size):\n            img_h, img_w, _ = img_metas[img_id]['img_shape']\n            factor = paddle.to_tensor(\n                [img_w, img_h, img_w, img_h],\n                dtype=\"float32\").squeeze(-1).unsqueeze(0).tile(\n                    (self.num_query, 1))\n            factors.append(factor)\n        factors = paddle.concat(factors, 0)\n        factors = factors[pos_inds][:, :2].tile((1, kpt_preds.shape[-1] // 2))\n\n        pos_kpt_weights = kpt_weights[pos_inds]\n        pos_kpt_targets = kpt_targets[pos_inds]\n        pos_kpt_targets_scaled = pos_kpt_targets * factors\n        pos_areas = area_targets[pos_inds]\n        pos_valid = kpt_weights[pos_inds][:, 0::2]\n        for i, kpt_refine_preds in enumerate(outputs_kpts):\n            if not pos_inds.any():\n                print(\"refine kpt and oks skip\")\n                loss_kpt = loss_oks = kpt_refine_preds.sum() 
* 0\n                losses[f'd{i}.loss_kpt_refine'] = loss_kpt\n                losses[f'd{i}.loss_oks_refine'] = loss_oks\n                continue\n\n            # kpt L1 Loss\n            pos_refine_preds = kpt_refine_preds.reshape(\n                (kpt_refine_preds.shape[0], -1))\n            loss_kpt = self.loss_kpt_refine(\n                pos_refine_preds,\n                pos_kpt_targets,\n                pos_kpt_weights,\n                avg_factor=num_valid_kpt)\n            losses[f'd{i}.loss_kpt_refine'] = loss_kpt\n            # kpt oks loss\n            pos_refine_preds_scaled = pos_refine_preds * factors\n            assert (pos_areas > 0).all()\n            loss_oks = self.loss_oks_refine(\n                pos_refine_preds_scaled,\n                pos_kpt_targets_scaled,\n                pos_valid,\n                pos_areas,\n                avg_factor=num_total_pos)\n            losses[f'd{i}.loss_oks_refine'] = loss_oks\n        return losses\n\n    # over-write because img_metas are needed as inputs for bbox_head.\n    def forward_train(self,\n                      x,\n                      img_metas,\n                      gt_bboxes,\n                      gt_labels=None,\n                      gt_keypoints=None,\n                      gt_areas=None,\n                      gt_bboxes_ignore=None,\n                      proposal_cfg=None,\n                      **kwargs):\n        \"\"\"Forward function for training mode.\n\n        Args:\n            x (list[Tensor]): Features from backbone.\n            img_metas (list[dict]): Meta information of each image, e.g.,\n                image size, scaling factor, etc.\n            gt_bboxes (list[Tensor]): Ground truth bboxes of the image,\n                shape (num_gts, 4).\n            gt_labels (list[Tensor]): Ground truth labels of each box,\n                shape (num_gts,).\n            gt_keypoints (list[Tensor]): Ground truth keypoints of the image,\n                shape (num_gts, K*3).\n            gt_areas (list[Tensor]): Ground truth mask areas of each box,\n                shape (num_gts,).\n            gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be\n                ignored, shape (num_ignored_gts, 4).\n            proposal_cfg (mmcv.Config): Test / postprocessing configuration,\n                if None, test_cfg would be used.\n\n        Returns:\n            dict[str, Tensor]: A dictionary of loss components.\n        \"\"\"\n        assert proposal_cfg is None, '\"proposal_cfg\" must be None'\n        outs = self(x, img_metas)\n        memory, mlvl_masks = outs[-2:]\n        outs = outs[:-2]\n        if gt_labels is None:\n            loss_inputs = outs + (gt_bboxes, gt_keypoints, gt_areas, img_metas)\n        else:\n            loss_inputs = outs + (gt_bboxes, gt_labels, gt_keypoints, gt_areas,\n                                  img_metas)\n        losses_and_targets = self.loss(\n            *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)\n        # losses = losses_and_targets\n        losses, refine_targets = losses_and_targets\n        # get pose refinement loss\n        losses = self.forward_refine(memory, mlvl_masks, refine_targets, losses,\n                                     img_metas)\n        return losses\n\n    def loss(self,\n             all_cls_scores,\n             all_kpt_preds,\n             enc_cls_scores,\n             enc_kpt_preds,\n             enc_hm_proto,\n             gt_bboxes_list,\n             gt_labels_list,\n             gt_keypoints_list,\n             
gt_areas_list,\n             img_metas,\n             gt_bboxes_ignore=None):\n        \"\"\"Loss function.\n\n        Args:\n            all_cls_scores (Tensor): Classification score of all\n                decoder layers, has shape\n                [nb_dec, bs, num_query, cls_out_channels].\n            all_kpt_preds (Tensor): Sigmoid regression\n                outputs of all decoder layers. Each is a 4D-tensor with\n                normalized coordinate format (x_{i}, y_{i}) and shape\n                [nb_dec, bs, num_query, K*2].\n            enc_cls_scores (Tensor): Classification scores of\n                points on encode feature map, has shape\n                (N, h*w, num_classes). Only passed when as_two_stage is\n                True, otherwise is None.\n            enc_kpt_preds (Tensor): Regression results of each point\n                on the encode feature map, has shape (N, h*w, K*2). Only\n                passed when as_two_stage is True, otherwise is None.\n            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image\n                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.\n            gt_labels_list (list[Tensor]): Ground truth class indices for each\n                image with shape (num_gts, ).\n            gt_keypoints_list (list[Tensor]): Ground truth keypoints for each\n                image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v,\n                    ..., p^{K}_x, p^{K}_y, p^{K}_v] format.\n            gt_areas_list (list[Tensor]): Ground truth mask areas for each\n                image with shape (num_gts, ).\n            img_metas (list[dict]): List of image meta information.\n            gt_bboxes_ignore (list[Tensor], optional): Bounding boxes\n                which can be ignored for each image. 
Default None.\n\n        Returns:\n            dict[str, Tensor]: A dictionary of loss components.\n        \"\"\"\n        assert gt_bboxes_ignore is None, \\\n            f'{self.__class__.__name__} only supports ' \\\n            f'gt_bboxes_ignore set to None.'\n\n        num_dec_layers = len(all_cls_scores)\n        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]\n        all_gt_keypoints_list = [\n            gt_keypoints_list for _ in range(num_dec_layers)\n        ]\n        all_gt_areas_list = [gt_areas_list for _ in range(num_dec_layers)]\n        img_metas_list = [img_metas for _ in range(num_dec_layers)]\n\n        losses_cls, losses_kpt, losses_oks, kpt_preds_list, kpt_targets_list, \\\n            area_targets_list, kpt_weights_list = multi_apply(\n                self.loss_single, all_cls_scores, all_kpt_preds,\n                all_gt_labels_list, all_gt_keypoints_list,\n                all_gt_areas_list, img_metas_list)\n\n        loss_dict = dict()\n        # loss of proposal generated from encode feature map.\n        if enc_cls_scores is not None:\n            binary_labels_list = [\n                paddle.zeros_like(gt_labels_list[i])\n                for i in range(len(img_metas))\n            ]\n            enc_loss_cls, enc_losses_kpt = \\\n                self.loss_single_rpn(\n                    enc_cls_scores, enc_kpt_preds, binary_labels_list,\n                    gt_keypoints_list, gt_areas_list, img_metas)\n            loss_dict['enc_loss_cls'] = enc_loss_cls\n            loss_dict['enc_loss_kpt'] = enc_losses_kpt\n\n        # loss from the last decoder layer\n        loss_dict['loss_cls'] = losses_cls[-1]\n        loss_dict['loss_kpt'] = losses_kpt[-1]\n        loss_dict['loss_oks'] = losses_oks[-1]\n        # loss from other decoder layers\n        num_dec_layer = 0\n        for loss_cls_i, loss_kpt_i, loss_oks_i in zip(\n                losses_cls[:-1], losses_kpt[:-1], losses_oks[:-1]):\n            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i\n            loss_dict[f'd{num_dec_layer}.loss_kpt'] = loss_kpt_i\n            loss_dict[f'd{num_dec_layer}.loss_oks'] = loss_oks_i\n            num_dec_layer += 1\n\n        # losses of heatmap generated from P3 feature map\n        hm_pred, hm_mask = enc_hm_proto\n        loss_hm = self.loss_heatmap(hm_pred, hm_mask, gt_keypoints_list,\n                                    gt_labels_list, gt_bboxes_list)\n        loss_dict['loss_hm'] = loss_hm\n\n        return loss_dict, (kpt_preds_list[-1], kpt_targets_list[-1],\n                           area_targets_list[-1], kpt_weights_list[-1])\n\n    def loss_heatmap(self, hm_pred, hm_mask, gt_keypoints, gt_labels,\n                     gt_bboxes):\n        assert hm_pred.shape[-2:] == hm_mask.shape[-2:]\n        num_img, _, h, w = hm_pred.shape\n        # placeholder of heatmap target (Gaussian distribution)\n        hm_target = paddle.zeros(hm_pred.shape, hm_pred.dtype)\n        for i, (gt_label, gt_bbox, gt_keypoint\n                ) in enumerate(zip(gt_labels, gt_bboxes, gt_keypoints)):\n            if gt_label.shape[0] == 0:\n                continue\n            gt_keypoint = gt_keypoint.reshape((gt_keypoint.shape[0], -1,\n                                               3)).clone()\n            gt_keypoint[..., :2] /= 8\n\n            assert gt_keypoint[..., 0].max() <= w + 0.5  # new coordinate system\n            assert gt_keypoint[..., 1].max() <= h + 0.5  # new coordinate system\n            gt_bbox /= 8\n            gt_w = 
gt_bbox[:, 2] - gt_bbox[:, 0]\n            gt_h = gt_bbox[:, 3] - gt_bbox[:, 1]\n            for j in range(gt_label.shape[0]):\n                # get heatmap radius\n                kp_radius = paddle.clip(\n                    paddle.floor(\n                        gaussian_radius(\n                            (gt_h[j], gt_w[j]), min_overlap=0.9)),\n                    min=0,\n                    max=3)\n                for k in range(self.num_keypoints):\n                    if gt_keypoint[j, k, 2] > 0:\n                        gt_kp = gt_keypoint[j, k, :2]\n                        gt_kp_int = paddle.floor(gt_kp)\n                        hm_target[i, k] = draw_umich_gaussian(\n                            hm_target[i, k], gt_kp_int, kp_radius)\n        # compute heatmap loss\n        hm_pred = paddle.clip(\n            F.sigmoid(hm_pred), min=1e-4, max=1 - 1e-4)  # refer to CenterNet\n        loss_hm = self.loss_hm(\n            hm_pred,\n            hm_target.detach(),\n            mask=~hm_mask.astype(\"bool\").unsqueeze(1))\n        return loss_hm\n\n    def loss_single(self, cls_scores, kpt_preds, gt_labels_list,\n                    gt_keypoints_list, gt_areas_list, img_metas):\n        \"\"\"Loss function for outputs from a single decoder layer of a single\n        feature level.\n\n        Args:\n            cls_scores (Tensor): Box score logits from a single decoder layer\n                for all images. Shape [bs, num_query, cls_out_channels].\n            kpt_preds (Tensor): Sigmoid outputs from a single decoder layer\n                for all images, with normalized coordinate (x_{i}, y_{i}) and\n                shape [bs, num_query, K*2].\n            gt_labels_list (list[Tensor]): Ground truth class indices for each\n                image with shape (num_gts, ).\n            gt_keypoints_list (list[Tensor]): Ground truth keypoints for each\n                image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v,\n                ..., p^{K}_x, p^{K}_y, p^{K}_v] format.\n            gt_areas_list (list[Tensor]): Ground truth mask areas for each\n                image with shape (num_gts, ).\n            img_metas (list[dict]): List of image meta information.\n\n        Returns:\n            dict[str, Tensor]: A dictionary of loss components for outputs from\n                a single decoder layer.\n        \"\"\"\n        num_imgs = cls_scores.shape[0]\n        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]\n        kpt_preds_list = [kpt_preds[i] for i in range(num_imgs)]\n        cls_reg_targets = self.get_targets(cls_scores_list, kpt_preds_list,\n                                           gt_labels_list, gt_keypoints_list,\n                                           gt_areas_list, img_metas)\n        (labels_list, label_weights_list, kpt_targets_list, kpt_weights_list,\n         area_targets_list, num_total_pos, num_total_neg) = cls_reg_targets\n        labels = paddle.concat(labels_list, 0)\n        label_weights = paddle.concat(label_weights_list, 0)\n        kpt_targets = paddle.concat(kpt_targets_list, 0)\n        kpt_weights = paddle.concat(kpt_weights_list, 0)\n        area_targets = paddle.concat(area_targets_list, 0)\n\n        # classification loss\n        cls_scores = cls_scores.reshape((-1, self.cls_out_channels))\n        # construct weighted avg_factor to match with the official DETR repo\n        cls_avg_factor = num_total_pos * 1.0 + \\\n            num_total_neg * self.bg_cls_weight\n        if self.sync_cls_avg_factor:\n            
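# NOTE: reduce_mean averages the normalizer over all ranks, so\n            # every GPU divides its classification loss by the same factor\n            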
cls_avg_factor = reduce_mean(\n                paddle.to_tensor(\n                    [cls_avg_factor], dtype=cls_scores.dtype))\n        cls_avg_factor = max(cls_avg_factor, 1)\n\n        loss_cls = self.loss_cls(\n            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)\n\n        # Compute the average number of gt keypoints across all GPUs, for\n        # normalization purposes\n        num_total_pos = paddle.to_tensor([num_total_pos], dtype=loss_cls.dtype)\n        num_total_pos = paddle.clip(reduce_mean(num_total_pos), min=1).item()\n\n        # construct factors used to rescale keypoints\n        factors = []\n        for img_meta, kpt_pred in zip(img_metas, kpt_preds):\n            img_h, img_w, _ = img_meta['img_shape']\n            factor = paddle.to_tensor(\n                [img_w, img_h, img_w, img_h],\n                dtype=kpt_pred.dtype).squeeze().unsqueeze(0).tile(\n                    (kpt_pred.shape[0], 1))\n            factors.append(factor)\n        factors = paddle.concat(factors, 0)\n\n        # keypoint regression loss\n        kpt_preds = kpt_preds.reshape((-1, kpt_preds.shape[-1]))\n        num_valid_kpt = paddle.clip(\n            reduce_mean(kpt_weights.sum()), min=1).item()\n        # assert num_valid_kpt == (kpt_targets>0).sum().item()\n        loss_kpt = self.loss_kpt(\n            kpt_preds,\n            kpt_targets.detach(),\n            kpt_weights.detach(),\n            avg_factor=num_valid_kpt)\n\n        # keypoint oks loss\n        pos_inds = kpt_weights.sum(-1) > 0\n        if not pos_inds.any():\n            loss_oks = kpt_preds.sum() * 0\n        else:\n            factors = factors[pos_inds][:, :2].tile((\n                (1, kpt_preds.shape[-1] // 2)))\n            pos_kpt_preds = kpt_preds[pos_inds] * factors\n            pos_kpt_targets = kpt_targets[pos_inds] * factors\n            pos_areas = area_targets[pos_inds]\n            pos_valid = kpt_weights[pos_inds][..., 0::2]\n            assert (pos_areas > 0).all()\n            loss_oks = self.loss_oks(\n                pos_kpt_preds,\n                pos_kpt_targets,\n                pos_valid,\n                pos_areas,\n                avg_factor=num_total_pos)\n        return loss_cls, loss_kpt, loss_oks, kpt_preds, kpt_targets, \\\n            area_targets, kpt_weights\n\n    def get_targets(self, cls_scores_list, kpt_preds_list, gt_labels_list,\n                    gt_keypoints_list, gt_areas_list, img_metas):\n        \"\"\"Compute regression and classification targets for a batch of images.\n\n        Outputs from a single decoder layer of a single feature level are used.\n\n        Args:\n            cls_scores_list (list[Tensor]): Box score logits from a single\n                decoder layer for each image with shape [num_query,\n                cls_out_channels].\n            kpt_preds_list (list[Tensor]): Sigmoid outputs from a single\n                decoder layer for each image, with normalized coordinate\n                (x_{i}, y_{i}) and shape [num_query, K*2].\n            gt_labels_list (list[Tensor]): Ground truth class indices for each\n                image with shape (num_gts, ).\n            gt_keypoints_list (list[Tensor]): Ground truth keypoints for each\n                image with shape (num_gts, K*3).\n            gt_areas_list (list[Tensor]): Ground truth mask areas for each\n                image with shape (num_gts, ).\n            img_metas (list[dict]): List of image meta information.\n\n        Returns:\n            tuple: a tuple containing the 
following targets.\n\n                - labels_list (list[Tensor]): Labels for all images.\n                - label_weights_list (list[Tensor]): Label weights for all\n                    images.\n                - kpt_targets_list (list[Tensor]): Keypoint targets for all\n                    images.\n                - kpt_weights_list (list[Tensor]): Keypoint weights for all\n                    images.\n                - area_targets_list (list[Tensor]): area targets for all\n                    images.\n                - num_total_pos (int): Number of positive samples in all\n                    images.\n                - num_total_neg (int): Number of negative samples in all\n                    images.\n        \"\"\"\n        (labels_list, label_weights_list, kpt_targets_list, kpt_weights_list,\n         area_targets_list, pos_inds_list, neg_inds_list) = multi_apply(\n             self._get_target_single, cls_scores_list, kpt_preds_list,\n             gt_labels_list, gt_keypoints_list, gt_areas_list, img_metas)\n        num_total_pos = sum((inds.numel() for inds in pos_inds_list))\n        num_total_neg = sum((inds.numel() for inds in neg_inds_list))\n        return (labels_list, label_weights_list, kpt_targets_list,\n                kpt_weights_list, area_targets_list, num_total_pos,\n                num_total_neg)\n\n    def _get_target_single(self, cls_score, kpt_pred, gt_labels, gt_keypoints,\n                           gt_areas, img_meta):\n        \"\"\"Compute regression and classification targets for one image.\n\n        Outputs from a single decoder layer of a single feature level are used.\n\n        Args:\n            cls_score (Tensor): Box score logits from a single decoder layer\n                for one image. Shape [num_query, cls_out_channels].\n            kpt_pred (Tensor): Sigmoid outputs from a single decoder layer\n                for one image, with normalized coordinate (x_{i}, y_{i}) and\n                shape [num_query, K*2].\n            gt_labels (Tensor): Ground truth class indices for one image\n                with shape (num_gts, ).\n            gt_keypoints (Tensor): Ground truth keypoints for one image with\n                shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v, ..., \\\n                    p^{K}_x, p^{K}_y, p^{K}_v] format.\n            gt_areas (Tensor): Ground truth mask areas for one image\n                with shape (num_gts, ).\n            img_meta (dict): Meta information for one image.\n\n        Returns:\n            tuple[Tensor]: a tuple containing the following for one image.\n\n                - labels (Tensor): Labels of each image.\n                - label_weights (Tensor): Label weights of each image.\n                - kpt_targets (Tensor): Keypoint targets of each image.\n                - kpt_weights (Tensor): Keypoint weights of each image.\n                - area_targets (Tensor): Area targets of each image.\n                - pos_inds (Tensor): Sampled positive indices for each image.\n                - neg_inds (Tensor): Sampled negative indices for each image.\n        \"\"\"\n        num_bboxes = kpt_pred.shape[0]\n        # assigner and sampler\n        assign_result = self.assigner.assign(cls_score, kpt_pred, gt_labels,\n                                             gt_keypoints, gt_areas, img_meta)\n        sampling_result = self.sampler.sample(assign_result, kpt_pred,\n                                              gt_keypoints)\n\n        pos_inds = sampling_result.pos_inds\n        neg_inds = 
sampling_result.neg_inds\n\n        # label targets\n        labels = paddle.full((num_bboxes, ), self.num_classes, dtype=\"int64\")\n        label_weights = paddle.ones((num_bboxes, ), dtype=gt_labels.dtype)\n        kpt_targets = paddle.zeros_like(kpt_pred)\n        kpt_weights = paddle.zeros_like(kpt_pred)\n        area_targets = paddle.zeros((kpt_pred.shape[0], ), dtype=kpt_pred.dtype)\n\n        if pos_inds.size == 0:\n            return (labels, label_weights, kpt_targets, kpt_weights,\n                    area_targets, pos_inds, neg_inds)\n\n        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds][\n            ..., 0].astype(\"int64\")\n\n        img_h, img_w, _ = img_meta['img_shape']\n        # keypoint targets\n        pos_gt_kpts = gt_keypoints[sampling_result.pos_assigned_gt_inds]\n        pos_gt_kpts = pos_gt_kpts.reshape(\n            (len(sampling_result.pos_assigned_gt_inds), -1, 3))\n        valid_idx = pos_gt_kpts[:, :, 2] > 0\n        pos_kpt_weights = kpt_weights[pos_inds].reshape(\n            (pos_gt_kpts.shape[0], kpt_weights.shape[-1] // 2, 2))\n        # pos_kpt_weights[valid_idx][...] = 1.0\n        pos_kpt_weights = masked_fill(pos_kpt_weights,\n                                      valid_idx.unsqueeze(-1), 1.0)\n        kpt_weights[pos_inds] = pos_kpt_weights.reshape(\n            (pos_kpt_weights.shape[0], kpt_pred.shape[-1]))\n\n        factor = paddle.to_tensor(\n            [img_w, img_h], dtype=kpt_pred.dtype).squeeze().unsqueeze(0)\n        pos_gt_kpts_normalized = pos_gt_kpts[..., :2]\n        pos_gt_kpts_normalized[..., 0] = pos_gt_kpts_normalized[..., 0] / \\\n            factor[:, 0:1]\n        pos_gt_kpts_normalized[..., 1] = pos_gt_kpts_normalized[..., 1] / \\\n            factor[:, 1:2]\n        kpt_targets[pos_inds] = pos_gt_kpts_normalized.reshape(\n            (pos_gt_kpts.shape[0], kpt_pred.shape[-1]))\n\n        pos_gt_areas = gt_areas[sampling_result.pos_assigned_gt_inds][..., 0]\n        area_targets[pos_inds] = pos_gt_areas\n\n        return (labels, label_weights, kpt_targets, kpt_weights, area_targets,\n                pos_inds, neg_inds)\n\n    def loss_single_rpn(self, cls_scores, kpt_preds, gt_labels_list,\n                        gt_keypoints_list, gt_areas_list, img_metas):\n        \"\"\"Loss function for outputs from a single decoder layer of a single\n        feature level.\n\n        Args:\n            cls_scores (Tensor): Box score logits from a single decoder layer\n                for all images. 
Shape [bs, num_query, cls_out_channels].\n            kpt_preds (Tensor): Sigmoid outputs from a single decoder layer\n                for all images, with normalized coordinate (x_{i}, y_{i}) and\n                shape [bs, num_query, K*2].\n            gt_labels_list (list[Tensor]): Ground truth class indices for each\n                image with shape (num_gts, ).\n            gt_keypoints_list (list[Tensor]): Ground truth keypoints for each\n                image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v,\n                ..., p^{K}_x, p^{K}_y, p^{K}_v] format.\n            gt_areas_list (list[Tensor]): Ground truth mask areas for each\n                image with shape (num_gts, ).\n            img_metas (list[dict]): List of image meta information.\n\n        Returns:\n            dict[str, Tensor]: A dictionary of loss components for outputs from\n                a single decoder layer.\n        \"\"\"\n        num_imgs = cls_scores.shape[0]\n        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]\n        kpt_preds_list = [kpt_preds[i] for i in range(num_imgs)]\n        cls_reg_targets = self.get_targets(cls_scores_list, kpt_preds_list,\n                                           gt_labels_list, gt_keypoints_list,\n                                           gt_areas_list, img_metas)\n        (labels_list, label_weights_list, kpt_targets_list, kpt_weights_list,\n         area_targets_list, num_total_pos, num_total_neg) = cls_reg_targets\n        labels = paddle.concat(labels_list, 0)\n        label_weights = paddle.concat(label_weights_list, 0)\n        kpt_targets = paddle.concat(kpt_targets_list, 0)\n        kpt_weights = paddle.concat(kpt_weights_list, 0)\n\n        # classification loss\n        cls_scores = cls_scores.reshape((-1, self.cls_out_channels))\n        # construct weighted avg_factor to match with the official DETR repo\n        cls_avg_factor = num_total_pos * 1.0 + \\\n            num_total_neg * self.bg_cls_weight\n        if self.sync_cls_avg_factor:\n            cls_avg_factor = reduce_mean(\n                paddle.to_tensor(\n                    [cls_avg_factor], dtype=cls_scores.dtype))\n        cls_avg_factor = max(cls_avg_factor, 1)\n\n        loss_cls = self.loss_cls(\n            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)\n\n        # Compute the average number of gt keypoints across all GPUs, for\n        # normalization purposes\n        # num_total_pos = loss_cls.to_tensor([num_total_pos])\n        # num_total_pos = paddle.clip(reduce_mean(num_total_pos), min=1).item()\n\n        # keypoint regression loss\n        kpt_preds = kpt_preds.reshape((-1, kpt_preds.shape[-1]))\n        num_valid_kpt = paddle.clip(\n            reduce_mean(kpt_weights.sum()), min=1).item()\n        # assert num_valid_kpt == (kpt_targets>0).sum().item()\n        loss_kpt = self.loss_kpt_rpn(\n            kpt_preds, kpt_targets, kpt_weights, avg_factor=num_valid_kpt)\n\n        return loss_cls, loss_kpt\n\n    def get_bboxes(self,\n                   all_cls_scores,\n                   all_kpt_preds,\n                   enc_cls_scores,\n                   enc_kpt_preds,\n                   hm_proto,\n                   memory,\n                   mlvl_masks,\n                   img_metas,\n                   rescale=False):\n        \"\"\"Transform network outputs for a batch into bbox predictions.\n\n        Args:\n            all_cls_scores (Tensor): Classification score of all\n        
        decoder layers, has shape\n                [nb_dec, bs, num_query, cls_out_channels].\n            all_kpt_preds (Tensor): Sigmoid regression\n                outputs of all decoder layers. Each is a 4D-tensor with\n                normalized coordinate format (x_{i}, y_{i}) and shape\n                [nb_dec, bs, num_query, K*2].\n            enc_cls_scores (Tensor): Classification scores of points on\n                encode feature map, has shape (N, h*w, num_classes).\n                Only passed when as_two_stage is True, otherwise is None.\n            enc_kpt_preds (Tensor): Regression results of each point\n                on the encode feature map, has shape (N, h*w, K*2). Only\n                passed when as_two_stage is True, otherwise is None.\n            img_metas (list[dict]): Meta information of each image.\n            rescale (bool, optional): If True, return boxes in original\n                image space. Default False.\n\n        Returns:\n            list[list[Tensor, Tensor]]: Each item in result_list is a 3-tuple.\n                The first item is an (n, 5) tensor, where the first 4 columns\n                are bounding box positions (tl_x, tl_y, br_x, br_y) and the\n                5-th column is a score between 0 and 1. The second item is a\n                (n,) tensor where each item is the predicted class label of\n                the corresponding box. The third item is an (n, K, 3) tensor\n                with [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x, p^{K}_y,\n                p^{K}_v] format.\n        \"\"\"\n        cls_scores = all_cls_scores[-1]\n        kpt_preds = all_kpt_preds[-1]\n\n        result_list = []\n        for img_id in range(len(img_metas)):\n            cls_score = cls_scores[img_id]\n            kpt_pred = kpt_preds[img_id]\n            img_shape = img_metas[img_id]['img_shape']\n            scale_factor = img_metas[img_id]['scale_factor']\n            # TODO: only support single image test\n            # memory_i = memory[:, img_id, :]\n            # mlvl_mask = mlvl_masks[img_id]\n            proposals = self._get_bboxes_single(cls_score, kpt_pred, img_shape,\n                                                scale_factor, memory,\n                                                mlvl_masks, rescale)\n            result_list.append(proposals)\n        return result_list\n\n    def _get_bboxes_single(self,\n                           cls_score,\n                           kpt_pred,\n                           img_shape,\n                           scale_factor,\n                           memory,\n                           mlvl_masks,\n                           rescale=False):\n        \"\"\"Transform outputs from the last decoder layer into bbox predictions\n        for each image.\n\n        Args:\n            cls_score (Tensor): Box score logits from the last decoder layer\n                for each image. Shape [num_query, cls_out_channels].\n            kpt_pred (Tensor): Sigmoid outputs from the last decoder layer\n                for each image, with coordinate format (x_{i}, y_{i}) and\n                shape [num_query, K*2].\n            img_shape (tuple[int]): Shape of input image, (height, width, 3).\n            scale_factor (ndarray, optional): Scale factor of the image arranged\n                as (w_scale, h_scale, w_scale, h_scale).\n            rescale (bool, optional): If True, return boxes in original image\n                space. 
 Default False.\n\n        Returns:\n            tuple[Tensor]: Results of detected bboxes and labels.\n\n                - det_bboxes: Predicted bboxes with shape [num_query, 5],\n                    where the first 4 columns are bounding box positions\n                    (tl_x, tl_y, br_x, br_y) and the 5-th column is a score\n                    between 0 and 1.\n                - det_labels: Predicted labels of the corresponding box with\n                    shape [num_query].\n                - det_kpts: Predicted keypoints with shape [num_query, K, 3].\n        \"\"\"\n        assert len(cls_score) == len(kpt_pred)\n        max_per_img = self.test_cfg.get('max_per_img', self.num_query)\n        # exclude background\n        if self.loss_cls.use_sigmoid:\n            cls_score = F.sigmoid(cls_score)\n            scores, indexs = cls_score.reshape([-1]).topk(max_per_img)\n            det_labels = indexs % self.num_classes\n            bbox_index = indexs // self.num_classes\n            kpt_pred = kpt_pred[bbox_index]\n        else:\n            # paddle's Tensor.max only returns the values, so take the labels\n            # with a separate argmax\n            cls_prob = F.softmax(cls_score, axis=-1)[..., :-1]\n            scores = cls_prob.max(-1)\n            det_labels = cls_prob.argmax(-1)\n            scores, bbox_index = scores.topk(max_per_img)\n            kpt_pred = kpt_pred[bbox_index]\n            det_labels = det_labels[bbox_index]\n\n        # ----- results after pose decoder -----\n        # det_kpts = kpt_pred.reshape((kpt_pred.shape[0], -1, 2))\n\n        # ----- results after joint decoder (default) -----\n        refine_targets = (kpt_pred, None, None, paddle.ones_like(kpt_pred))\n        refine_outputs = self.forward_refine(memory, mlvl_masks, refine_targets,\n                                             None, None)\n        det_kpts = refine_outputs[-1]\n\n        det_kpts[..., 0] = det_kpts[..., 0] * img_shape[1]\n        det_kpts[..., 1] = det_kpts[..., 1] * img_shape[0]\n        det_kpts[..., 0].clip_(min=0, max=img_shape[1])\n        det_kpts[..., 1].clip_(min=0, max=img_shape[0])\n        if rescale:\n            det_kpts /= paddle.to_tensor(\n                scale_factor[:2],\n                dtype=det_kpts.dtype).unsqueeze(0).unsqueeze(0)\n\n        # use circumscribed rectangle box of keypoints as det bboxes\n        x1 = det_kpts[..., 0].min(axis=1, keepdim=True)\n        y1 = det_kpts[..., 1].min(axis=1, keepdim=True)\n        x2 = det_kpts[..., 0].max(axis=1, keepdim=True)\n        y2 = det_kpts[..., 1].max(axis=1, keepdim=True)\n        det_bboxes = paddle.concat([x1, y1, x2, y2], axis=1)\n        det_bboxes = paddle.concat((det_bboxes, scores.unsqueeze(1)), -1)\n\n        det_kpts = paddle.concat(\n            (det_kpts, paddle.ones(\n                det_kpts[..., :1].shape, dtype=det_kpts.dtype)),\n            axis=2)\n\n        return det_bboxes, det_labels, det_kpts\n\n    def simple_test(self, feats, img_metas, rescale=False):\n        \"\"\"Test det bboxes without test-time augmentation.\n\n        Args:\n            feats (tuple[paddle.Tensor]): Multi-level features from the\n                upstream network, each is a 4D-tensor.\n            img_metas (list[dict]): List of image information.\n            rescale (bool, optional): Whether to rescale the results.\n                Defaults to False.\n\n        Returns:\n            list[tuple[Tensor, Tensor, Tensor]]: Each item in result_list is\n                a 3-tuple.
 The first item is ``bboxes`` with shape (n, 5),\n                where the 5 columns represent (tl_x, tl_y, br_x, br_y, score).\n                The second item is ``labels`` with shape (n,). The third item\n                is ``kpts`` with shape (n, K, 3), in [p^{1}_x, p^{1}_y,\n                p^{1}_v, ..., p^{K}_x, p^{K}_y, p^{K}_v] format.\n        \"\"\"\n        # forward of this head requires img_metas\n        outs = self.forward(feats, img_metas)\n        results_list = self.get_bboxes(*outs, img_metas, rescale=rescale)\n        return results_list\n\n    def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes):\n        return self.loss(boxes, scores, gt_bbox, gt_class, prior_boxes)\n"
  },
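# NOTE: a minimal, illustrative sketch (not part of the repository) of the
# flattened top-k decoding used in `_get_bboxes_single` above: sigmoid scores
# of all (query, class) pairs are ranked jointly, then each flat index is
# split back into a class label (mod) and a query index (floor division).
# All sizes below are toy assumptions.
import paddle
import paddle.nn.functional as F

num_query, num_classes, max_per_img = 300, 1, 10
cls_score = paddle.randn([num_query, num_classes])  # decoder logits per query

scores, flat_idx = F.sigmoid(cls_score).reshape([-1]).topk(max_per_img)
det_labels = flat_idx % num_classes   # class id of each kept prediction
bbox_index = flat_idx // num_classes  # query that produced each prediction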
  {
    "path": "ppdet/modeling/heads/pico_head.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport math\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import Normal, Constant\n\nfrom ppdet.modeling.ops import get_static_shape\nfrom ..initializer import normal_\nfrom ..assigners.utils import generate_anchors_for_grid_cell\nfrom ..bbox_utils import bbox_center, batch_distance2bbox, bbox2distance\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling.layers import ConvNormLayer\nfrom .simota_head import OTAVFLHead\nfrom .gfl_head import Integral, GFLHead\nfrom ppdet.modeling.necks.csp_pan import DPModule\n\neps = 1e-9\n\n__all__ = ['PicoHead', 'PicoHeadV2', 'PicoFeat']\n\n\ndef npu_avg_pool2d(feat, w, h):\n    batch_size, channels, _, _ = feat.shape\n    feat_flat = paddle.reshape(feat, [batch_size, channels, -1])\n    feat_mean = paddle.mean(feat_flat, axis=2)\n    feat_mean = paddle.reshape(\n        feat_mean, [batch_size, channels, w, h])\n    return feat_mean\n\nclass PicoSE(nn.Layer):\n    def __init__(self, feat_channels):\n        super(PicoSE, self).__init__()\n        self.fc = nn.Conv2D(feat_channels, feat_channels, 1)\n        self.conv = ConvNormLayer(feat_channels, feat_channels, 1, 1)\n\n        self._init_weights()\n\n    def _init_weights(self):\n        normal_(self.fc.weight, std=0.001)\n\n    def forward(self, feat, avg_feat):\n        weight = F.sigmoid(self.fc(avg_feat))\n        out = self.conv(feat * weight)\n        return out\n\n\n@register\nclass PicoFeat(nn.Layer):\n    \"\"\"\n    PicoFeat of PicoDet\n\n    Args:\n        feat_in (int): The channel number of input Tensor.\n        feat_out (int): The channel number of output Tensor.\n        num_convs (int): The convolution number of the LiteGFLFeat.\n        norm_type (str): Normalization type, 'bn'/'sync_bn'/'gn'.\n        share_cls_reg (bool): Whether to share the cls and reg output.\n        act (str): The act of per layers.\n        use_se (bool): Whether to use se module.\n    \"\"\"\n\n    def __init__(self,\n                 feat_in=256,\n                 feat_out=96,\n                 num_fpn_stride=3,\n                 num_convs=2,\n                 norm_type='bn',\n                 share_cls_reg=False,\n                 act='hard_swish',\n                 use_se=False):\n        super(PicoFeat, self).__init__()\n        self.num_convs = num_convs\n        self.norm_type = norm_type\n        self.share_cls_reg = share_cls_reg\n        self.act = act\n        self.use_se = use_se\n        self.cls_convs = []\n        self.reg_convs = []\n\n        if paddle.device.get_device().startswith(\"npu\"):\n            self.device = \"npu\"\n        else:\n            self.device = None\n            \n        if use_se:\n            assert 
share_cls_reg == True, \\\n                'In the case of using se, share_cls_reg must be set to True'\n            self.se = nn.LayerList()\n        for stage_idx in range(num_fpn_stride):\n            cls_subnet_convs = []\n            reg_subnet_convs = []\n            for i in range(self.num_convs):\n                in_c = feat_in if i == 0 else feat_out\n                cls_conv_dw = self.add_sublayer(\n                    'cls_conv_dw{}.{}'.format(stage_idx, i),\n                    ConvNormLayer(\n                        ch_in=in_c,\n                        ch_out=feat_out,\n                        filter_size=5,\n                        stride=1,\n                        groups=feat_out,\n                        norm_type=norm_type,\n                        bias_on=False,\n                        lr_scale=2.))\n                cls_subnet_convs.append(cls_conv_dw)\n                cls_conv_pw = self.add_sublayer(\n                    'cls_conv_pw{}.{}'.format(stage_idx, i),\n                    ConvNormLayer(\n                        ch_in=in_c,\n                        ch_out=feat_out,\n                        filter_size=1,\n                        stride=1,\n                        norm_type=norm_type,\n                        bias_on=False,\n                        lr_scale=2.))\n                cls_subnet_convs.append(cls_conv_pw)\n\n                if not self.share_cls_reg:\n                    reg_conv_dw = self.add_sublayer(\n                        'reg_conv_dw{}.{}'.format(stage_idx, i),\n                        ConvNormLayer(\n                            ch_in=in_c,\n                            ch_out=feat_out,\n                            filter_size=5,\n                            stride=1,\n                            groups=feat_out,\n                            norm_type=norm_type,\n                            bias_on=False,\n                            lr_scale=2.))\n                    reg_subnet_convs.append(reg_conv_dw)\n                    reg_conv_pw = self.add_sublayer(\n                        'reg_conv_pw{}.{}'.format(stage_idx, i),\n                        ConvNormLayer(\n                            ch_in=in_c,\n                            ch_out=feat_out,\n                            filter_size=1,\n                            stride=1,\n                            norm_type=norm_type,\n                            bias_on=False,\n                            lr_scale=2.))\n                    reg_subnet_convs.append(reg_conv_pw)\n            self.cls_convs.append(cls_subnet_convs)\n            self.reg_convs.append(reg_subnet_convs)\n            if use_se:\n                self.se.append(PicoSE(feat_out))\n\n    def act_func(self, x):\n        if self.act == \"leaky_relu\":\n            x = F.leaky_relu(x)\n        elif self.act == \"hard_swish\":\n            x = F.hardswish(x)\n        elif self.act == \"relu6\":\n            x = F.relu6(x)\n        return x\n\n    def forward(self, fpn_feat, stage_idx):\n        assert stage_idx < len(self.cls_convs)\n        cls_feat = fpn_feat\n        reg_feat = fpn_feat\n        for i in range(len(self.cls_convs[stage_idx])):\n            cls_feat = self.act_func(self.cls_convs[stage_idx][i](cls_feat))\n            reg_feat = cls_feat\n            if not self.share_cls_reg:\n                reg_feat = self.act_func(self.reg_convs[stage_idx][i](reg_feat))\n        if self.use_se:\n            if self.device == \"npu\":\n                avg_feat = npu_avg_pool2d(cls_feat, 1, 1)\n            else:\n                
avg_feat = F.adaptive_avg_pool2d(cls_feat, (1, 1))\n            se_feat = self.act_func(self.se[stage_idx](cls_feat, avg_feat))\n            return cls_feat, se_feat\n        return cls_feat, reg_feat\n\n\n@register\nclass PicoHead(OTAVFLHead):\n    \"\"\"\n    PicoHead\n    Args:\n        conv_feat (object): Instance of 'PicoFeat'\n        num_classes (int): Number of classes\n        fpn_stride (list): The stride of each FPN Layer\n        prior_prob (float): Used to set the bias init for the class prediction layer\n        loss_class (object): Instance of VariFocalLoss.\n        loss_dfl (object): Instance of DistributionFocalLoss.\n        loss_bbox (object): Instance of bbox loss.\n        assigner (object): Instance of label assigner.\n        reg_max: Max value of integral set :math:`{0, ..., reg_max}`\n                in QFL setting. Default: 16.\n    \"\"\"\n    __inject__ = [\n        'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox',\n        'assigner', 'nms'\n    ]\n    __shared__ = ['num_classes', 'eval_size']\n\n    def __init__(self,\n                 conv_feat='PicoFeat',\n                 dgqp_module=None,\n                 num_classes=80,\n                 fpn_stride=[8, 16, 32],\n                 prior_prob=0.01,\n                 loss_class='VariFocalLoss',\n                 loss_dfl='DistributionFocalLoss',\n                 loss_bbox='GIoULoss',\n                 assigner='SimOTAAssigner',\n                 reg_max=16,\n                 feat_in_chan=96,\n                 nms=None,\n                 nms_pre=1000,\n                 cell_offset=0,\n                 eval_size=None):\n        super(PicoHead, self).__init__(\n            conv_feat=conv_feat,\n            dgqp_module=dgqp_module,\n            num_classes=num_classes,\n            fpn_stride=fpn_stride,\n            prior_prob=prior_prob,\n            loss_class=loss_class,\n            loss_dfl=loss_dfl,\n            loss_bbox=loss_bbox,\n            assigner=assigner,\n            reg_max=reg_max,\n            feat_in_chan=feat_in_chan,\n            nms=nms,\n            nms_pre=nms_pre,\n            cell_offset=cell_offset)\n        self.conv_feat = conv_feat\n        self.num_classes = num_classes\n        self.fpn_stride = fpn_stride\n        self.prior_prob = prior_prob\n        self.loss_vfl = loss_class\n        self.loss_dfl = loss_dfl\n        self.loss_bbox = loss_bbox\n        self.assigner = assigner\n        self.reg_max = reg_max\n        self.feat_in_chan = feat_in_chan\n        self.nms = nms\n        self.nms_pre = nms_pre\n        self.cell_offset = cell_offset\n        self.eval_size = eval_size\n        self.device = paddle.device.get_device()\n\n        self.use_sigmoid = self.loss_vfl.use_sigmoid\n        if self.use_sigmoid:\n            self.cls_out_channels = self.num_classes\n        else:\n            self.cls_out_channels = self.num_classes + 1\n        bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob)\n        # Clear the super class initialization\n        self.gfl_head_cls = None\n        self.gfl_head_reg = None\n        self.scales_regs = None\n\n        self.head_cls_list = []\n        self.head_reg_list = []\n        for i in range(len(fpn_stride)):\n            head_cls = self.add_sublayer(\n                \"head_cls\" + str(i),\n                nn.Conv2D(\n                    in_channels=self.feat_in_chan,\n                    out_channels=self.cls_out_channels + 4 * (self.reg_max + 1)\n                    if 
self.conv_feat.share_cls_reg else self.cls_out_channels,\n                    kernel_size=1,\n                    stride=1,\n                    padding=0,\n                    weight_attr=ParamAttr(initializer=Normal(\n                        mean=0., std=0.01)),\n                    bias_attr=ParamAttr(\n                        initializer=Constant(value=bias_init_value))))\n            self.head_cls_list.append(head_cls)\n            if not self.conv_feat.share_cls_reg:\n                head_reg = self.add_sublayer(\n                    \"head_reg\" + str(i),\n                    nn.Conv2D(\n                        in_channels=self.feat_in_chan,\n                        out_channels=4 * (self.reg_max + 1),\n                        kernel_size=1,\n                        stride=1,\n                        padding=0,\n                        weight_attr=ParamAttr(initializer=Normal(\n                            mean=0., std=0.01)),\n                        bias_attr=ParamAttr(initializer=Constant(value=0))))\n                self.head_reg_list.append(head_reg)\n\n        # initialize the anchor points\n        if self.eval_size:\n            self.anchor_points, self.stride_tensor = self._generate_anchors()\n\n    def forward(self, fpn_feats, export_post_process=True):\n        assert len(fpn_feats) == len(\n            self.fpn_stride\n        ), \"The size of fpn_feats is not equal to size of fpn_stride\"\n\n        if self.training:\n            return self.forward_train(fpn_feats)\n        else:\n            return self.forward_eval(\n                fpn_feats, export_post_process=export_post_process)\n\n    def forward_train(self, fpn_feats):\n        cls_logits_list, bboxes_reg_list = [], []\n        for i, fpn_feat in enumerate(fpn_feats):\n            conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat, i)\n            if self.conv_feat.share_cls_reg:\n                cls_logits = self.head_cls_list[i](conv_cls_feat)\n                cls_score, bbox_pred = paddle.split(\n                    cls_logits,\n                    [self.cls_out_channels, 4 * (self.reg_max + 1)],\n                    axis=1)\n            else:\n                cls_score = self.head_cls_list[i](conv_cls_feat)\n                bbox_pred = self.head_reg_list[i](conv_reg_feat)\n\n            if self.dgqp_module:\n                quality_score = self.dgqp_module(bbox_pred)\n                cls_score = F.sigmoid(cls_score) * quality_score\n\n            cls_logits_list.append(cls_score)\n            bboxes_reg_list.append(bbox_pred)\n\n        return (cls_logits_list, bboxes_reg_list)\n\n    def forward_eval(self, fpn_feats, export_post_process=True):\n        if self.eval_size:\n            anchor_points, stride_tensor = self.anchor_points, self.stride_tensor\n        else:\n            anchor_points, stride_tensor = self._generate_anchors(fpn_feats)\n        cls_logits_list, bboxes_reg_list = [], []\n        for i, fpn_feat in enumerate(fpn_feats):\n            conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat, i)\n            if self.conv_feat.share_cls_reg:\n                cls_logits = self.head_cls_list[i](conv_cls_feat)\n                cls_score, bbox_pred = paddle.split(\n                    cls_logits,\n                    [self.cls_out_channels, 4 * (self.reg_max + 1)],\n                    axis=1)\n            else:\n                cls_score = self.head_cls_list[i](conv_cls_feat)\n                bbox_pred = self.head_reg_list[i](conv_reg_feat)\n\n            if self.dgqp_module:\n            
    quality_score = self.dgqp_module(bbox_pred)\n                cls_score = F.sigmoid(cls_score) * quality_score\n\n            if not export_post_process:\n                # Now only supports batch size = 1 in deploy\n                # TODO(ygh): support batch size > 1\n                cls_score_out = F.sigmoid(cls_score).reshape(\n                    [1, self.cls_out_channels, -1]).transpose([0, 2, 1])\n                bbox_pred = bbox_pred.reshape([1, (self.reg_max + 1) * 4,\n                                               -1]).transpose([0, 2, 1])\n            else:\n                _, _, h, w = fpn_feat.shape\n                l = h * w\n                cls_score_out = F.sigmoid(\n                    cls_score.reshape([-1, self.cls_out_channels, l]))\n                bbox_pred = bbox_pred.transpose([0, 2, 3, 1])\n                bbox_pred = self.distribution_project(bbox_pred)\n                bbox_pred = bbox_pred.reshape([-1, l, 4])\n\n            cls_logits_list.append(cls_score_out)\n            bboxes_reg_list.append(bbox_pred)\n\n        if export_post_process:\n            cls_logits_list = paddle.concat(cls_logits_list, axis=-1)\n            bboxes_reg_list = paddle.concat(bboxes_reg_list, axis=1)\n            bboxes_reg_list = batch_distance2bbox(anchor_points,\n                                                  bboxes_reg_list)\n            bboxes_reg_list *= stride_tensor\n\n        return (cls_logits_list, bboxes_reg_list)\n\n    def _generate_anchors(self, feats=None):\n        # just use in eval time\n        anchor_points = []\n        stride_tensor = []\n        for i, stride in enumerate(self.fpn_stride):\n            if feats is not None:\n                _, _, h, w = feats[i].shape\n            else:\n                h = math.ceil(self.eval_size[0] / stride)\n                w = math.ceil(self.eval_size[1] / stride)\n            shift_x = paddle.arange(end=w) + self.cell_offset\n            shift_y = paddle.arange(end=h) + self.cell_offset\n            shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)\n            anchor_point = paddle.cast(\n                paddle.stack(\n                    [shift_x, shift_y], axis=-1), dtype='float32')\n            anchor_points.append(anchor_point.reshape([-1, 2]))\n            stride_tensor.append(\n                paddle.full(\n                    [h * w, 1], stride, dtype='float32'))\n        anchor_points = paddle.concat(anchor_points)\n        stride_tensor = paddle.concat(stride_tensor)\n        return anchor_points, stride_tensor\n\n    def post_process(self,\n                     head_outs,\n                     scale_factor,\n                     export_nms=True,\n                     nms_cpu=False):\n        pred_scores, pred_bboxes = head_outs\n        if not export_nms:\n            return pred_bboxes, pred_scores\n        else:\n            # rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale]\n            scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1)\n            scale_factor = paddle.concat(\n                [scale_x, scale_y, scale_x, scale_y],\n                axis=-1).reshape([-1, 1, 4])\n            # scale bbox to origin image size.\n            pred_bboxes /= scale_factor\n            if nms_cpu:\n                paddle.set_device(\"cpu\")\n                bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores)\n                paddle.set_device(self.device)\n            else:\n                bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores)\n            return 
bbox_pred, bbox_num\n\n\n@register\nclass PicoHeadV2(GFLHead):\n    \"\"\"\n    PicoHeadV2\n    Args:\n        conv_feat (object): Instance of 'PicoFeat'\n        num_classes (int): Number of classes\n        fpn_stride (list): The stride of each FPN Layer\n        prior_prob (float): Used to set the bias init for the class prediction layer\n        loss_class (object): Instance of VariFocalLoss.\n        loss_dfl (object): Instance of DistributionFocalLoss.\n        loss_bbox (object): Instance of bbox loss.\n        assigner (object): Instance of label assigner.\n        reg_max: Max value of integral set :math:`{0, ..., reg_max}`\n                in QFL setting. Default: 16.\n    \"\"\"\n    __inject__ = [\n        'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox',\n        'static_assigner', 'assigner', 'nms'\n    ]\n    __shared__ = ['num_classes', 'eval_size']\n\n    def __init__(self,\n                 conv_feat='PicoFeatV2',\n                 dgqp_module=None,\n                 num_classes=80,\n                 fpn_stride=[8, 16, 32],\n                 prior_prob=0.01,\n                 use_align_head=True,\n                 loss_class='VariFocalLoss',\n                 loss_dfl='DistributionFocalLoss',\n                 loss_bbox='GIoULoss',\n                 static_assigner_epoch=60,\n                 static_assigner='ATSSAssigner',\n                 assigner='TaskAlignedAssigner',\n                 reg_max=16,\n                 feat_in_chan=96,\n                 nms=None,\n                 nms_pre=1000,\n                 cell_offset=0,\n                 act='hard_swish',\n                 grid_cell_scale=5.0,\n                 eval_size=None):\n        super(PicoHeadV2, self).__init__(\n            conv_feat=conv_feat,\n            dgqp_module=dgqp_module,\n            num_classes=num_classes,\n            fpn_stride=fpn_stride,\n            prior_prob=prior_prob,\n            loss_class=loss_class,\n            loss_dfl=loss_dfl,\n            loss_bbox=loss_bbox,\n            reg_max=reg_max,\n            feat_in_chan=feat_in_chan,\n            nms=nms,\n            nms_pre=nms_pre,\n            cell_offset=cell_offset, )\n        self.conv_feat = conv_feat\n        self.num_classes = num_classes\n        self.fpn_stride = fpn_stride\n        self.prior_prob = prior_prob\n        self.loss_vfl = loss_class\n        self.loss_dfl = loss_dfl\n        self.loss_bbox = loss_bbox\n\n        self.static_assigner_epoch = static_assigner_epoch\n        self.static_assigner = static_assigner\n        self.assigner = assigner\n\n        self.reg_max = reg_max\n        self.feat_in_chan = feat_in_chan\n        self.nms = nms\n        self.nms_pre = nms_pre\n        self.cell_offset = cell_offset\n        self.act = act\n        self.grid_cell_scale = grid_cell_scale\n        self.use_align_head = use_align_head\n        self.cls_out_channels = self.num_classes\n        self.eval_size = eval_size\n\n        bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob)\n        # Clear the super class initialization\n        self.gfl_head_cls = None\n        self.gfl_head_reg = None\n        self.scales_regs = None\n\n        self.head_cls_list = nn.LayerList()\n        self.head_reg_list = nn.LayerList()\n        self.cls_align = nn.LayerList()\n\n        for i in range(len(fpn_stride)):\n            head_cls = self.add_sublayer(\n                \"head_cls\" + str(i),\n                nn.Conv2D(\n                    in_channels=self.feat_in_chan,\n                    
out_channels=self.cls_out_channels,\n                    kernel_size=1,\n                    stride=1,\n                    padding=0,\n                    weight_attr=ParamAttr(initializer=Normal(\n                        mean=0., std=0.01)),\n                    bias_attr=ParamAttr(\n                        initializer=Constant(value=bias_init_value))))\n            self.head_cls_list.append(head_cls)\n            head_reg = self.add_sublayer(\n                \"head_reg\" + str(i),\n                nn.Conv2D(\n                    in_channels=self.feat_in_chan,\n                    out_channels=4 * (self.reg_max + 1),\n                    kernel_size=1,\n                    stride=1,\n                    padding=0,\n                    weight_attr=ParamAttr(initializer=Normal(\n                        mean=0., std=0.01)),\n                    bias_attr=ParamAttr(initializer=Constant(value=0))))\n            self.head_reg_list.append(head_reg)\n            if self.use_align_head:\n                self.cls_align.append(\n                    DPModule(\n                        self.feat_in_chan,\n                        1,\n                        5,\n                        act=self.act,\n                        use_act_in_out=False))\n\n        # initialize the anchor points\n        if self.eval_size:\n            self.anchor_points, self.stride_tensor = self._generate_anchors()\n\n    def forward(self, fpn_feats, export_post_process=True):\n        assert len(fpn_feats) == len(\n            self.fpn_stride\n        ), \"The size of fpn_feats is not equal to size of fpn_stride\"\n\n        if self.training:\n            return self.forward_train(fpn_feats)\n        else:\n            return self.forward_eval(\n                fpn_feats, export_post_process=export_post_process)\n\n    def forward_train(self, fpn_feats):\n        cls_score_list, reg_list, box_list = [], [], []\n        for i, (fpn_feat, stride) in enumerate(zip(fpn_feats, self.fpn_stride)):\n            b, _, h, w = get_static_shape(fpn_feat)\n            # task decomposition\n            conv_cls_feat, se_feat = self.conv_feat(fpn_feat, i)\n            cls_logit = self.head_cls_list[i](se_feat)\n            reg_pred = self.head_reg_list[i](se_feat)\n\n            # cls prediction and alignment\n            if self.use_align_head:\n                cls_prob = F.sigmoid(self.cls_align[i](conv_cls_feat))\n                cls_score = (F.sigmoid(cls_logit) * cls_prob + eps).sqrt()\n            else:\n                cls_score = F.sigmoid(cls_logit)\n\n            cls_score_out = cls_score.transpose([0, 2, 3, 1])\n            bbox_pred = reg_pred.transpose([0, 2, 3, 1])\n            b, cell_h, cell_w, _ = cls_score_out.shape\n            y, x = self.get_single_level_center_point(\n                [cell_h, cell_w], stride, cell_offset=self.cell_offset)\n            center_points = paddle.stack([x, y], axis=-1)\n            cls_score_out = cls_score_out.reshape(\n                [b, -1, self.cls_out_channels])\n            bbox_pred = self.distribution_project(bbox_pred) * stride\n            bbox_pred = bbox_pred.reshape([b, cell_h * cell_w, 4])\n            bbox_pred = batch_distance2bbox(\n                center_points, bbox_pred, max_shapes=None)\n            cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1]))\n            reg_list.append(reg_pred.flatten(2).transpose([0, 2, 1]))\n            box_list.append(bbox_pred / stride)\n\n        cls_score_list = paddle.concat(cls_score_list, axis=1)\n        box_list = 
paddle.concat(box_list, axis=1)\n        reg_list = paddle.concat(reg_list, axis=1)\n        return cls_score_list, reg_list, box_list, fpn_feats\n\n    def forward_eval(self, fpn_feats, export_post_process=True):\n        if self.eval_size:\n            anchor_points, stride_tensor = self.anchor_points, self.stride_tensor\n        else:\n            anchor_points, stride_tensor = self._generate_anchors(fpn_feats)\n        cls_score_list, box_list = [], []\n        for i, (fpn_feat, stride) in enumerate(zip(fpn_feats, self.fpn_stride)):\n            _, _, h, w = fpn_feat.shape\n            # task decomposition\n            conv_cls_feat, se_feat = self.conv_feat(fpn_feat, i)\n            cls_logit = self.head_cls_list[i](se_feat)\n            reg_pred = self.head_reg_list[i](se_feat)\n\n            # cls prediction and alignment\n            if self.use_align_head:\n                cls_prob = F.sigmoid(self.cls_align[i](conv_cls_feat))\n                cls_score = (F.sigmoid(cls_logit) * cls_prob + eps).sqrt()\n            else:\n                cls_score = F.sigmoid(cls_logit)\n\n            if not export_post_process:\n                # Now only supports batch size = 1 in deploy\n                cls_score_list.append(\n                    cls_score.reshape([1, self.cls_out_channels, -1]).transpose(\n                        [0, 2, 1]))\n                box_list.append(\n                    reg_pred.reshape([1, (self.reg_max + 1) * 4, -1]).transpose(\n                        [0, 2, 1]))\n            else:\n                l = h * w\n                cls_score_out = cls_score.reshape(\n                    [-1, self.cls_out_channels, l])\n                bbox_pred = reg_pred.transpose([0, 2, 3, 1])\n                bbox_pred = self.distribution_project(bbox_pred)\n                bbox_pred = bbox_pred.reshape([-1, l, 4])\n                cls_score_list.append(cls_score_out)\n                box_list.append(bbox_pred)\n\n        if export_post_process:\n            cls_score_list = paddle.concat(cls_score_list, axis=-1)\n            box_list = paddle.concat(box_list, axis=1)\n            box_list = batch_distance2bbox(anchor_points, box_list)\n            box_list *= stride_tensor\n\n        return cls_score_list, box_list\n\n    def get_loss(self, head_outs, gt_meta):\n        pred_scores, pred_regs, pred_bboxes, fpn_feats = head_outs\n        gt_labels = gt_meta['gt_class']\n        gt_bboxes = gt_meta['gt_bbox']\n        gt_scores = gt_meta['gt_score'] if 'gt_score' in gt_meta else None\n        num_imgs = gt_meta['im_id'].shape[0]\n        pad_gt_mask = gt_meta['pad_gt_mask']\n\n        anchors, _, num_anchors_list, stride_tensor_list = generate_anchors_for_grid_cell(\n            fpn_feats, self.fpn_stride, self.grid_cell_scale, self.cell_offset)\n\n        centers = bbox_center(anchors)\n\n        # label assignment\n        if gt_meta['epoch_id'] < self.static_assigner_epoch:\n            assigned_labels, assigned_bboxes, assigned_scores = self.static_assigner(\n                anchors,\n                num_anchors_list,\n                gt_labels,\n                gt_bboxes,\n                pad_gt_mask,\n                bg_index=self.num_classes,\n                gt_scores=gt_scores,\n                pred_bboxes=pred_bboxes.detach() * stride_tensor_list)\n\n        else:\n            assigned_labels, assigned_bboxes, assigned_scores = self.assigner(\n                pred_scores.detach(),\n                pred_bboxes.detach() * stride_tensor_list,\n                centers,\n            
    num_anchors_list,\n                gt_labels,\n                gt_bboxes,\n                pad_gt_mask,\n                bg_index=self.num_classes,\n                gt_scores=gt_scores)\n\n        assigned_bboxes /= stride_tensor_list\n\n        centers_shape = centers.shape\n        flatten_centers = centers.expand(\n            [num_imgs, centers_shape[0], centers_shape[1]]).reshape([-1, 2])\n        flatten_strides = stride_tensor_list.expand(\n            [num_imgs, centers_shape[0], 1]).reshape([-1, 1])\n        flatten_cls_preds = pred_scores.reshape([-1, self.num_classes])\n        flatten_regs = pred_regs.reshape([-1, 4 * (self.reg_max + 1)])\n        flatten_bboxes = pred_bboxes.reshape([-1, 4])\n        flatten_bbox_targets = assigned_bboxes.reshape([-1, 4])\n        flatten_labels = assigned_labels.reshape([-1])\n        flatten_assigned_scores = assigned_scores.reshape(\n            [-1, self.num_classes])\n\n        pos_inds = paddle.nonzero(\n            paddle.logical_and((flatten_labels >= 0),\n                               (flatten_labels < self.num_classes)),\n            as_tuple=False).squeeze(1)\n\n        num_total_pos = len(pos_inds)\n\n        if num_total_pos > 0:\n            pos_bbox_targets = paddle.gather(\n                flatten_bbox_targets, pos_inds, axis=0)\n            pos_decode_bbox_pred = paddle.gather(\n                flatten_bboxes, pos_inds, axis=0)\n            pos_reg = paddle.gather(flatten_regs, pos_inds, axis=0)\n            pos_strides = paddle.gather(flatten_strides, pos_inds, axis=0)\n            pos_centers = paddle.gather(\n                flatten_centers, pos_inds, axis=0) / pos_strides\n\n            weight_targets = flatten_assigned_scores.detach()\n            weight_targets = paddle.gather(\n                weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0)\n\n            pred_corners = pos_reg.reshape([-1, self.reg_max + 1])\n            target_corners = bbox2distance(pos_centers, pos_bbox_targets,\n                                           self.reg_max).reshape([-1])\n            # regression loss\n            loss_bbox = paddle.sum(\n                self.loss_bbox(pos_decode_bbox_pred,\n                               pos_bbox_targets) * weight_targets)\n\n            # dfl loss\n            loss_dfl = self.loss_dfl(\n                pred_corners,\n                target_corners,\n                weight=weight_targets.expand([-1, 4]).reshape([-1]),\n                avg_factor=4.0)\n        else:\n            loss_bbox = paddle.zeros([])\n            loss_dfl = paddle.zeros([])\n\n        avg_factor = flatten_assigned_scores.sum()\n        if paddle.distributed.get_world_size() > 1:\n            paddle.distributed.all_reduce(avg_factor)\n            avg_factor = paddle.clip(\n                avg_factor / paddle.distributed.get_world_size(), min=1)\n        loss_vfl = self.loss_vfl(\n            flatten_cls_preds, flatten_assigned_scores, avg_factor=avg_factor)\n\n        loss_bbox = loss_bbox / avg_factor\n        loss_dfl = loss_dfl / avg_factor\n\n        loss_states = dict(\n            loss_vfl=loss_vfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl)\n\n        return loss_states\n\n    def _generate_anchors(self, feats=None):\n        # just use in eval time\n        anchor_points = []\n        stride_tensor = []\n        for i, stride in enumerate(self.fpn_stride):\n            if feats is not None:\n                _, _, h, w = feats[i].shape\n            else:\n                h = math.ceil(self.eval_size[0] / 
stride)\n                w = math.ceil(self.eval_size[1] / stride)\n            shift_x = paddle.arange(end=w) + self.cell_offset\n            shift_y = paddle.arange(end=h) + self.cell_offset\n            shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)\n            anchor_point = paddle.cast(\n                paddle.stack(\n                    [shift_x, shift_y], axis=-1), dtype='float32')\n            anchor_points.append(anchor_point.reshape([-1, 2]))\n            stride_tensor.append(\n                paddle.full(\n                    [h * w, 1], stride, dtype='float32'))\n        anchor_points = paddle.concat(anchor_points)\n        stride_tensor = paddle.concat(stride_tensor)\n        return anchor_points, stride_tensor\n\n    def post_process(self,\n                     head_outs,\n                     scale_factor,\n                     export_nms=True,\n                     nms_cpu=False):\n        pred_scores, pred_bboxes = head_outs\n        if not export_nms:\n            return pred_bboxes, pred_scores\n        else:\n            # rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale]\n            scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1)\n            scale_factor = paddle.concat(\n                [scale_x, scale_y, scale_x, scale_y],\n                axis=-1).reshape([-1, 1, 4])\n            # scale bbox to origin image size.\n            pred_bboxes /= scale_factor\n            bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores)\n            return bbox_pred, bbox_num\n"
  },
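# NOTE: a minimal sketch (toy sizes, not repository code) of what the
# distribution_project (Integral) step used by PicoHead/PicoHeadV2 computes:
# the 4*(reg_max+1) logits per anchor are treated as four discrete
# distributions over the bins {0, ..., reg_max}, and each l/t/r/b distance is
# the expectation under the softmax of its distribution.
import paddle
import paddle.nn.functional as F

reg_max, num_anchors = 7, 100                             # assumed toy sizes
pred = paddle.randn([1, num_anchors, 4 * (reg_max + 1)])  # raw DFL logits
prob = F.softmax(pred.reshape([1, num_anchors, 4, reg_max + 1]), axis=-1)
bins = paddle.arange(reg_max + 1, dtype='float32')        # {0, ..., reg_max}
ltrb = (prob * bins).sum(-1)  # [1, num_anchors, 4] distances in cell units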
  {
    "path": "ppdet/modeling/heads/ppyoloe_contrast_head.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register\n\nfrom ..initializer import bias_init_with_prob, constant_\nfrom ..assigners.utils import generate_anchors_for_grid_cell\nfrom ppdet.modeling.heads.ppyoloe_head import PPYOLOEHead\n\n__all__ = ['PPYOLOEContrastHead']\n\n\n@register\nclass PPYOLOEContrastHead(PPYOLOEHead):\n    __shared__ = [\n        'num_classes', 'eval_size', 'trt', 'exclude_nms',\n        'exclude_post_process', 'use_shared_conv', 'for_distill'\n    ]\n    __inject__ = ['static_assigner', 'assigner', 'nms', 'contrast_loss']\n\n    def __init__(self,\n                 in_channels=[1024, 512, 256],\n                 num_classes=80,\n                 act='swish',\n                 fpn_strides=(32, 16, 8),\n                 grid_cell_scale=5.0,\n                 grid_cell_offset=0.5,\n                 reg_max=16,\n                 reg_range=None,\n                 static_assigner_epoch=4,\n                 use_varifocal_loss=True,\n                 static_assigner='ATSSAssigner',\n                 assigner='TaskAlignedAssigner',\n                 contrast_loss='SupContrast',\n                 nms='MultiClassNMS',\n                 eval_size=None,\n                 loss_weight={\n                     'class': 1.0,\n                     'iou': 2.5,\n                     'dfl': 0.5,\n                 },\n                 trt=False,\n                 attn_conv='convbn',\n                 exclude_nms=False,\n                 exclude_post_process=False,\n                 use_shared_conv=True,\n                 for_distill=False):\n        super().__init__(in_channels, num_classes, act, fpn_strides,\n                         grid_cell_scale, grid_cell_offset, reg_max, reg_range,\n                         static_assigner_epoch, use_varifocal_loss,\n                         static_assigner, assigner, nms, eval_size, loss_weight,\n                         trt, attn_conv, exclude_nms, exclude_post_process,\n                         use_shared_conv, for_distill)\n\n        assert len(in_channels) > 0, \"len(in_channels) should > 0\"\n        self.contrast_loss = contrast_loss\n        self.contrast_encoder = nn.LayerList()\n        for in_c in self.in_channels:\n            self.contrast_encoder.append(nn.Conv2D(in_c, 128, 3, padding=1))\n        self._init_contrast_encoder()\n\n    def _init_contrast_encoder(self):\n        bias_en = bias_init_with_prob(0.01)\n        for en_ in self.contrast_encoder:\n            constant_(en_.weight)\n            constant_(en_.bias, bias_en)\n\n    def forward_train(self, feats, targets, aux_pred=None):\n        anchors, anchor_points, num_anchors_list, stride_tensor = \\\n            generate_anchors_for_grid_cell(\n                feats, self.fpn_strides, self.grid_cell_scale,\n                self.grid_cell_offset)\n\n        cls_score_list, 
reg_distri_list = [], []\n        contrast_encoder_list = []\n        for i, feat in enumerate(feats):\n            avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))\n            cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) +\n                                         feat)\n            reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))\n            contrast_logit = self.contrast_encoder[i](self.stem_cls[i](\n                feat, avg_feat) + feat)\n            contrast_encoder_list.append(\n                contrast_logit.flatten(2).transpose([0, 2, 1]))\n            # cls and reg\n            cls_score = F.sigmoid(cls_logit)\n            cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1]))\n            reg_distri_list.append(reg_distri.flatten(2).transpose([0, 2, 1]))\n        cls_score_list = paddle.concat(cls_score_list, axis=1)\n        reg_distri_list = paddle.concat(reg_distri_list, axis=1)\n        contrast_encoder_list = paddle.concat(contrast_encoder_list, axis=1)\n\n        return self.get_loss([\n            cls_score_list, reg_distri_list, contrast_encoder_list, anchors,\n            anchor_points, num_anchors_list, stride_tensor\n        ], targets)\n\n    def get_loss(self, head_outs, gt_meta):\n        pred_scores, pred_distri, pred_contrast_encoder, anchors,\\\n        anchor_points, num_anchors_list, stride_tensor = head_outs\n\n        anchor_points_s = anchor_points / stride_tensor\n        pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri)\n\n        gt_labels = gt_meta['gt_class']\n        gt_bboxes = gt_meta['gt_bbox']\n        pad_gt_mask = gt_meta['pad_gt_mask']\n        # label assignment\n        if gt_meta['epoch_id'] < self.static_assigner_epoch:\n            assigned_labels, assigned_bboxes, assigned_scores = \\\n                self.static_assigner(\n                    anchors,\n                    num_anchors_list,\n                    gt_labels,\n                    gt_bboxes,\n                    pad_gt_mask,\n                    bg_index=self.num_classes,\n                    pred_bboxes=pred_bboxes.detach() * stride_tensor)\n            alpha_l = 0.25\n        else:\n            if self.sm_use:\n                assigned_labels, assigned_bboxes, assigned_scores = \\\n                    self.assigner(\n                    pred_scores.detach(),\n                    pred_bboxes.detach() * stride_tensor,\n                    anchor_points,\n                    stride_tensor,\n                    gt_labels,\n                    gt_bboxes,\n                    pad_gt_mask,\n                    bg_index=self.num_classes)\n            else:\n                assigned_labels, assigned_bboxes, assigned_scores = \\\n                    self.assigner(\n                    pred_scores.detach(),\n                    pred_bboxes.detach() * stride_tensor,\n                    anchor_points,\n                    num_anchors_list,\n                    gt_labels,\n                    gt_bboxes,\n                    pad_gt_mask,\n                    bg_index=self.num_classes)\n            alpha_l = -1\n        # rescale bbox\n        assigned_bboxes /= stride_tensor\n        # cls loss\n        if self.use_varifocal_loss:\n            one_hot_label = F.one_hot(assigned_labels,\n                                      self.num_classes + 1)[..., :-1]\n            loss_cls = self._varifocal_loss(pred_scores, assigned_scores,\n                                            one_hot_label)\n        else:\n            loss_cls = 
self._focal_loss(pred_scores, assigned_scores, alpha_l)\n\n        assigned_scores_sum = assigned_scores.sum()\n        if paddle.distributed.get_world_size() > 1:\n            paddle.distributed.all_reduce(assigned_scores_sum)\n            assigned_scores_sum /= paddle.distributed.get_world_size()\n        assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.)\n        loss_cls /= assigned_scores_sum\n\n        loss_l1, loss_iou, loss_dfl = \\\n            self._bbox_loss(pred_distri, pred_bboxes, anchor_points_s,\n                            assigned_labels, assigned_bboxes, assigned_scores,\n                            assigned_scores_sum)\n        # contrast loss\n        loss_contrast = self.contrast_loss(pred_contrast_encoder.reshape([-1, pred_contrast_encoder.shape[-1]]), \\\n            assigned_labels.reshape([-1]), assigned_scores.max(-1).reshape([-1]))\n\n        loss = self.loss_weight['class'] * loss_cls + \\\n               self.loss_weight['iou'] * loss_iou + \\\n               self.loss_weight['dfl'] * loss_dfl + \\\n               self.loss_weight['contrast'] * loss_contrast\n\n        out_dict = {\n            'loss': loss,\n            'loss_cls': loss_cls,\n            'loss_iou': loss_iou,\n            'loss_dfl': loss_dfl,\n            'loss_l1': loss_l1,\n            'loss_contrast': loss_contrast\n        }\n        return out_dict\n"
  },
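# NOTE: a minimal sketch (toy sizes, not repository code) of how
# PPYOLOEContrastHead.get_loss above flattens the inputs of its contrastive
# term: one row per anchor for the projected embedding, the assigned label,
# and the peak assigned score. These three tensors are what
# self.contrast_loss(...) receives; the SupContrast loss itself is injected
# from elsewhere in ppdet and is not shown here.
import paddle

bs, num_anchors, embed_dim, num_classes = 2, 50, 128, 80
pred_contrast_encoder = paddle.randn([bs, num_anchors, embed_dim])
assigned_labels = paddle.randint(0, num_classes + 1, [bs, num_anchors])
assigned_scores = paddle.rand([bs, num_anchors, num_classes])

feats = pred_contrast_encoder.reshape([-1, embed_dim])  # [bs*A, embed_dim]
labels = assigned_labels.reshape([-1])                  # [bs*A]
scores = assigned_scores.max(-1).reshape([-1])          # [bs*A]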
  {
    "path": "ppdet/modeling/heads/ppyoloe_head.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import KaimingNormal\nfrom paddle.nn.initializer import Normal, Constant\n\nfrom ..bbox_utils import batch_distance2bbox\nfrom ..losses import GIoULoss\nfrom ..initializer import bias_init_with_prob, constant_, normal_\nfrom ..assigners.utils import generate_anchors_for_grid_cell\nfrom ppdet.modeling.backbones.cspresnet import ConvBNLayer, RepVggBlock\nfrom ppdet.modeling.ops import get_static_shape, get_act_fn\nfrom ppdet.modeling.layers import MultiClassNMS\n\n__all__ = ['PPYOLOEHead', 'SimpleConvHead']\n\n\nclass ESEAttn(nn.Layer):\n    def __init__(self, feat_channels, act='swish', attn_conv='convbn'):\n        super(ESEAttn, self).__init__()\n        self.fc = nn.Conv2D(feat_channels, feat_channels, 1)\n        if attn_conv == 'convbn':\n            self.conv = ConvBNLayer(feat_channels, feat_channels, 1, act=act)\n        elif attn_conv == 'repvgg':\n            self.conv = RepVggBlock(feat_channels, feat_channels, act=act)\n        else:\n            self.conv = None\n        self._init_weights()\n\n    def _init_weights(self):\n        normal_(self.fc.weight, std=0.001)\n\n    def forward(self, feat, avg_feat):\n        weight = F.sigmoid(self.fc(avg_feat))\n        if self.conv:\n            return self.conv(feat * weight)\n        else:\n            return feat * weight\n\n\n@register\nclass PPYOLOEHead(nn.Layer):\n    __shared__ = [\n        'num_classes', 'eval_size', 'trt', 'exclude_nms',\n        'exclude_post_process', 'use_shared_conv', 'for_distill'\n    ]\n    __inject__ = ['static_assigner', 'assigner', 'nms']\n\n    def __init__(self,\n                 in_channels=[1024, 512, 256],\n                 num_classes=80,\n                 act='swish',\n                 fpn_strides=(32, 16, 8),\n                 grid_cell_scale=5.0,\n                 grid_cell_offset=0.5,\n                 reg_max=16,\n                 reg_range=None,\n                 static_assigner_epoch=4,\n                 use_varifocal_loss=True,\n                 static_assigner='ATSSAssigner',\n                 assigner='TaskAlignedAssigner',\n                 nms='MultiClassNMS',\n                 eval_size=None,\n                 loss_weight={\n                     'class': 1.0,\n                     'iou': 2.5,\n                     'dfl': 0.5,\n                 },\n                 trt=False,\n                 attn_conv='convbn',\n                 exclude_nms=False,\n                 exclude_post_process=False,\n                 use_shared_conv=True,\n                 for_distill=False):\n        super(PPYOLOEHead, self).__init__()\n        assert len(in_channels) > 0, \"len(in_channels) should > 0\"\n        self.in_channels = in_channels\n        self.num_classes = num_classes\n        
self.fpn_strides = fpn_strides\n        self.grid_cell_scale = grid_cell_scale\n        self.grid_cell_offset = grid_cell_offset\n        if reg_range:\n            self.sm_use = True\n            self.reg_range = reg_range\n        else:\n            self.sm_use = False\n            self.reg_range = (0, reg_max + 1)\n        self.reg_channels = self.reg_range[1] - self.reg_range[0]\n        self.iou_loss = GIoULoss()\n        self.loss_weight = loss_weight\n        self.use_varifocal_loss = use_varifocal_loss\n        self.eval_size = eval_size\n\n        self.static_assigner_epoch = static_assigner_epoch\n        self.static_assigner = static_assigner\n        self.assigner = assigner\n        self.nms = nms\n        if isinstance(self.nms, MultiClassNMS) and trt:\n            self.nms.trt = trt\n        self.exclude_nms = exclude_nms\n        self.exclude_post_process = exclude_post_process\n        self.use_shared_conv = use_shared_conv\n        self.for_distill = for_distill\n        self.is_teacher = False\n\n        # stem\n        self.stem_cls = nn.LayerList()\n        self.stem_reg = nn.LayerList()\n        act = get_act_fn(\n            act, trt=trt) if act is None or isinstance(act,\n                                                       (str, dict)) else act\n        for in_c in self.in_channels:\n            self.stem_cls.append(ESEAttn(in_c, act=act, attn_conv=attn_conv))\n            self.stem_reg.append(ESEAttn(in_c, act=act, attn_conv=attn_conv))\n        # pred head\n        self.pred_cls = nn.LayerList()\n        self.pred_reg = nn.LayerList()\n        for in_c in self.in_channels:\n            self.pred_cls.append(\n                nn.Conv2D(\n                    in_c, self.num_classes, 3, padding=1))\n            self.pred_reg.append(\n                nn.Conv2D(\n                    in_c, 4 * self.reg_channels, 3, padding=1))\n        # projection conv\n        self.proj_conv = nn.Conv2D(self.reg_channels, 1, 1, bias_attr=False)\n        self.proj_conv.skip_quant = True\n        self._init_weights()\n\n        if self.for_distill:\n            self.distill_pairs = {}\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_channels': [i.channels for i in input_shape], }\n\n    def _init_weights(self):\n        bias_cls = bias_init_with_prob(0.01)\n        for cls_, reg_ in zip(self.pred_cls, self.pred_reg):\n            constant_(cls_.weight)\n            constant_(cls_.bias, bias_cls)\n            constant_(reg_.weight)\n            constant_(reg_.bias, 1.0)\n\n        proj = paddle.linspace(self.reg_range[0], self.reg_range[1] - 1,\n                               self.reg_channels).reshape(\n                                   [1, self.reg_channels, 1, 1])\n        self.proj_conv.weight.set_value(proj)\n        self.proj_conv.weight.stop_gradient = True\n        if self.eval_size:\n            anchor_points, stride_tensor = self._generate_anchors()\n            self.anchor_points = anchor_points\n            self.stride_tensor = stride_tensor\n\n    def m_avg_pool2d(self, feat, w, h):\n        batch_size, channels, _, _ = feat.shape\n        feat_flat = paddle.reshape(feat, [batch_size, channels, -1])\n        feat_mean = paddle.mean(feat_flat, axis=2)\n        feat_mean = paddle.reshape(\n            feat_mean, [batch_size, channels, w, h])\n        return feat_mean\n\n    def forward_train(self, feats, targets, aux_pred=None):\n        anchors, anchor_points, num_anchors_list, stride_tensor = \\\n            
generate_anchors_for_grid_cell(\n                feats, self.fpn_strides, self.grid_cell_scale,\n                self.grid_cell_offset)\n\n        cls_score_list, reg_distri_list = [], []\n        for i, feat in enumerate(feats):\n            if (paddle.get_device()[:3]=='npu'):\n                avg_feat = self.m_avg_pool2d(feat, 1, 1)\n            else:\n                avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))\n            cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) +\n                                         feat)\n            reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))\n            # cls and reg\n            cls_score = F.sigmoid(cls_logit)\n            cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1]))\n            reg_distri_list.append(reg_distri.flatten(2).transpose([0, 2, 1]))\n        cls_score_list = paddle.concat(cls_score_list, axis=1)\n        reg_distri_list = paddle.concat(reg_distri_list, axis=1)\n\n        if targets.get('is_teacher', False):\n            pred_deltas, pred_dfls = self._bbox_decode_fake(reg_distri_list)\n            return cls_score_list, pred_deltas * stride_tensor, pred_dfls\n\n        if targets.get('get_data', False):\n            pred_deltas, pred_dfls = self._bbox_decode_fake(reg_distri_list)\n            return cls_score_list, pred_deltas * stride_tensor, pred_dfls\n\n        return self.get_loss([\n            cls_score_list, reg_distri_list, anchors, anchor_points,\n            num_anchors_list, stride_tensor\n        ], targets, aux_pred)\n\n    def _generate_anchors(self, feats=None, dtype='float32'):\n        # just use in eval time\n        anchor_points = []\n        stride_tensor = []\n        for i, stride in enumerate(self.fpn_strides):\n            if feats is not None:\n                _, _, h, w = feats[i].shape\n            else:\n                h = int(self.eval_size[0] / stride)\n                w = int(self.eval_size[1] / stride)\n            shift_x = paddle.arange(end=w) + self.grid_cell_offset\n            shift_y = paddle.arange(end=h) + self.grid_cell_offset\n            shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)\n            anchor_point = paddle.cast(\n                paddle.stack(\n                    [shift_x, shift_y], axis=-1), dtype=dtype)\n            anchor_points.append(anchor_point.reshape([-1, 2]))\n            stride_tensor.append(paddle.full([h * w, 1], stride, dtype=dtype))\n        anchor_points = paddle.concat(anchor_points)\n        stride_tensor = paddle.concat(stride_tensor)\n        return anchor_points, stride_tensor\n\n    def forward_eval(self, feats):\n        if self.eval_size:\n            anchor_points, stride_tensor = self.anchor_points, self.stride_tensor\n        else:\n            anchor_points, stride_tensor = self._generate_anchors(feats)\n        cls_score_list, reg_dist_list = [], []\n        for i, feat in enumerate(feats):\n            _, _, h, w = feat.shape\n            l = h * w\n            if (paddle.device.get_device()[:3]=='npu'):\n                avg_feat = self.m_avg_pool2d(feat, 1, 1)\n            else:\n                avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))\n            cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) +\n                                         feat)\n            reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))\n            reg_dist = reg_dist.reshape(\n                [-1, 4, self.reg_channels, l]).transpose([0, 2, 3, 1])\n            if self.use_shared_conv:\n  
              reg_dist = self.proj_conv(F.softmax(\n                    reg_dist, axis=1)).squeeze(1)\n            else:\n                reg_dist = F.softmax(reg_dist, axis=1)\n            # cls and reg\n            cls_score = F.sigmoid(cls_logit)\n            cls_score_list.append(cls_score.reshape([-1, self.num_classes, l]))\n            reg_dist_list.append(reg_dist)\n\n        cls_score_list = paddle.concat(cls_score_list, axis=-1)\n        if self.use_shared_conv:\n            reg_dist_list = paddle.concat(reg_dist_list, axis=1)\n        else:\n            reg_dist_list = paddle.concat(reg_dist_list, axis=2)\n            reg_dist_list = self.proj_conv(reg_dist_list).squeeze(1)\n\n        return cls_score_list, reg_dist_list, anchor_points, stride_tensor\n\n    def forward(self, feats, targets=None, aux_pred=None):\n        assert len(feats) == len(self.fpn_strides), \\\n            \"The size of feats is not equal to size of fpn_strides\"\n\n        if self.training:\n            return self.forward_train(feats, targets, aux_pred)\n        else:\n            if targets is not None:\n                # only for semi-det\n                self.is_teacher = targets.get('is_teacher', False)\n                if self.is_teacher:\n                    return self.forward_train(feats, targets, aux_pred=None)\n                else:\n                    return self.forward_eval(feats)\n\n            return self.forward_eval(feats)\n\n    @staticmethod\n    def _focal_loss(score, label, alpha=0.25, gamma=2.0):\n        weight = (score - label).pow(gamma)\n        if alpha > 0:\n            alpha_t = alpha * label + (1 - alpha) * (1 - label)\n            weight *= alpha_t\n        loss = F.binary_cross_entropy(\n            score, label, weight=weight, reduction='sum')\n        return loss\n\n    @staticmethod\n    def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0):\n        weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label\n        loss = F.binary_cross_entropy(\n            pred_score, gt_score, weight=weight, reduction='sum')\n        return loss\n\n    def _bbox_decode(self, anchor_points, pred_dist):\n        _, l, _ = get_static_shape(pred_dist)\n        pred_dist = F.softmax(pred_dist.reshape([-1, l, 4, self.reg_channels]))\n        pred_dist = self.proj_conv(pred_dist.transpose([0, 3, 1, 2])).squeeze(1)\n        return batch_distance2bbox(anchor_points, pred_dist)\n\n    def _bbox_decode_fake(self, pred_dist):\n        _, l, _ = get_static_shape(pred_dist)\n        pred_dist_dfl = F.softmax(\n            pred_dist.reshape([-1, l, 4, self.reg_channels]))\n        pred_dist = self.proj_conv(pred_dist_dfl.transpose([0, 3, 1, 2\n                                                            ])).squeeze(1)\n        return pred_dist, pred_dist_dfl\n\n    def _bbox2distance(self, points, bbox):\n        x1y1, x2y2 = paddle.split(bbox, 2, -1)\n        lt = points - x1y1\n        rb = x2y2 - points\n        return paddle.concat([lt, rb], -1).clip(self.reg_range[0],\n                                                self.reg_range[1] - 1 - 0.01)\n\n    def _df_loss(self, pred_dist, target, lower_bound=0):\n        target_left = paddle.cast(target.floor(), 'int64')\n        target_right = target_left + 1\n        weight_left = target_right.astype('float32') - target\n        weight_right = 1 - weight_left\n        loss_left = F.cross_entropy(\n            pred_dist, target_left - lower_bound,\n            reduction='none') * weight_left\n        loss_right = 
F.cross_entropy(\n            pred_dist, target_right - lower_bound,\n            reduction='none') * weight_right\n        return (loss_left + loss_right).mean(-1, keepdim=True)\n\n    def _bbox_loss(self, pred_dist, pred_bboxes, anchor_points, assigned_labels,\n                   assigned_bboxes, assigned_scores, assigned_scores_sum):\n        # select positive samples mask\n        mask_positive = (assigned_labels != self.num_classes)\n\n        if self.for_distill:\n            # only used for LD main_kd distill\n            self.distill_pairs['mask_positive_select'] = mask_positive\n\n        num_pos = mask_positive.sum()\n        # pos/neg loss\n        if num_pos > 0:\n            # l1 + iou\n            bbox_mask = mask_positive.astype('int32').unsqueeze(-1).tile(\n                [1, 1, 4]).astype('bool')\n            pred_bboxes_pos = paddle.masked_select(pred_bboxes,\n                                                   bbox_mask).reshape([-1, 4])\n            assigned_bboxes_pos = paddle.masked_select(\n                assigned_bboxes, bbox_mask).reshape([-1, 4])\n            bbox_weight = paddle.masked_select(\n                assigned_scores.sum(-1), mask_positive).unsqueeze(-1)\n\n            loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos)\n\n            loss_iou = self.iou_loss(pred_bboxes_pos,\n                                     assigned_bboxes_pos) * bbox_weight\n            loss_iou = loss_iou.sum() / assigned_scores_sum\n\n            dist_mask = mask_positive.unsqueeze(-1).astype('int32').tile(\n                [1, 1, self.reg_channels * 4]).astype('bool')\n            pred_dist_pos = paddle.masked_select(\n                pred_dist, dist_mask).reshape([-1, 4, self.reg_channels])\n            assigned_ltrb = self._bbox2distance(anchor_points, assigned_bboxes)\n            assigned_ltrb_pos = paddle.masked_select(\n                assigned_ltrb, bbox_mask).reshape([-1, 4])\n            loss_dfl = self._df_loss(pred_dist_pos, assigned_ltrb_pos,\n                                     self.reg_range[0]) * bbox_weight\n            loss_dfl = loss_dfl.sum() / assigned_scores_sum\n            if self.for_distill:\n                self.distill_pairs['pred_bboxes_pos'] = pred_bboxes_pos\n                self.distill_pairs['pred_dist_pos'] = pred_dist_pos\n                self.distill_pairs['bbox_weight'] = bbox_weight\n        else:\n            loss_l1 = paddle.zeros([])\n            loss_iou = paddle.zeros([])\n            loss_dfl = pred_dist.sum() * 0.\n        return loss_l1, loss_iou, loss_dfl\n\n    def get_loss(self, head_outs, gt_meta, aux_pred=None):\n        pred_scores, pred_distri, anchors,\\\n        anchor_points, num_anchors_list, stride_tensor = head_outs\n\n        anchor_points_s = anchor_points / stride_tensor\n        pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri)\n\n        if aux_pred is not None:\n            pred_scores_aux = aux_pred[0]\n            pred_bboxes_aux = self._bbox_decode(anchor_points_s, aux_pred[1])\n\n        if 'origin_gt_class' in gt_meta:\n            gt_labels = gt_meta['origin_gt_class']\n            gt_bboxes = gt_meta['origin_gt_bbox']\n            pad_gt_mask = gt_meta['pad_origin_gt_mask']\n        else:\n            gt_labels = gt_meta['gt_class']\n            gt_bboxes = gt_meta['gt_bbox']\n            pad_gt_mask = gt_meta['pad_gt_mask']\n        # label assignment\n        if gt_meta['epoch_id'] < self.static_assigner_epoch:\n            assigned_labels, assigned_bboxes, assigned_scores = \\\n        
        self.static_assigner(\n                    anchors,\n                    num_anchors_list,\n                    gt_labels,\n                    gt_bboxes,\n                    pad_gt_mask,\n                    bg_index=self.num_classes,\n                    pred_bboxes=pred_bboxes.detach() * stride_tensor)\n            alpha_l = 0.25\n        else:\n            if self.sm_use:\n                # only used in smalldet of PPYOLOE-SOD model\n                assigned_labels, assigned_bboxes, assigned_scores = \\\n                    self.assigner(\n                    pred_scores.detach(),\n                    pred_bboxes.detach() * stride_tensor,\n                    anchor_points,\n                    stride_tensor,\n                    gt_labels,\n                    gt_bboxes,\n                    pad_gt_mask,\n                    bg_index=self.num_classes)\n            else:\n                if aux_pred is None:\n                    if not hasattr(self, \"assigned_labels\"):\n                        assigned_labels, assigned_bboxes, assigned_scores = \\\n                            self.assigner(\n                            pred_scores.detach(),\n                            pred_bboxes.detach() * stride_tensor,\n                            anchor_points,\n                            num_anchors_list,\n                            gt_labels,\n                            gt_bboxes,\n                            pad_gt_mask,\n                            bg_index=self.num_classes)\n                        if self.for_distill:\n                            self.assigned_labels = assigned_labels\n                            self.assigned_bboxes = assigned_bboxes\n                            self.assigned_scores = assigned_scores\n\n                    else:\n                        # only used in distill\n                        assigned_labels = self.assigned_labels\n                        assigned_bboxes = self.assigned_bboxes\n                        assigned_scores = self.assigned_scores\n\n                else:\n                    assigned_labels, assigned_bboxes, assigned_scores = \\\n                            self.assigner(\n                            pred_scores_aux.detach(),\n                            pred_bboxes_aux.detach() * stride_tensor,\n                            anchor_points,\n                            num_anchors_list,\n                            gt_labels,\n                            gt_bboxes,\n                            pad_gt_mask,\n                            bg_index=self.num_classes)\n            alpha_l = -1\n        # rescale bbox\n        assigned_bboxes /= stride_tensor\n\n        assign_out_dict = self.get_loss_from_assign(\n            pred_scores, pred_distri, pred_bboxes, anchor_points_s,\n            assigned_labels, assigned_bboxes, assigned_scores, alpha_l)\n\n        if aux_pred is not None:\n            assign_out_dict_aux = self.get_loss_from_assign(\n                aux_pred[0], aux_pred[1], pred_bboxes_aux, anchor_points_s,\n                assigned_labels, assigned_bboxes, assigned_scores, alpha_l)\n            loss = {}\n            for key in assign_out_dict.keys():\n                loss[key] = assign_out_dict[key] + assign_out_dict_aux[key]\n        else:\n            loss = assign_out_dict\n\n        return loss\n\n    def get_loss_from_assign(self, pred_scores, pred_distri, pred_bboxes,\n                             anchor_points_s, assigned_labels, assigned_bboxes,\n                             assigned_scores, alpha_l):\n      
  # cls loss\n        if self.use_varifocal_loss:\n            one_hot_label = F.one_hot(assigned_labels,\n                                      self.num_classes + 1)[..., :-1]\n            loss_cls = self._varifocal_loss(pred_scores, assigned_scores,\n                                            one_hot_label)\n        else:\n            loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l)\n\n        assigned_scores_sum = assigned_scores.sum()\n        if paddle.distributed.get_world_size() > 1:\n            paddle.distributed.all_reduce(assigned_scores_sum)\n            assigned_scores_sum /= paddle.distributed.get_world_size()\n        assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.)\n        loss_cls /= assigned_scores_sum\n\n        if self.for_distill:\n            self.distill_pairs['pred_cls_scores'] = pred_scores\n            self.distill_pairs['pos_num'] = assigned_scores_sum\n            self.distill_pairs['assigned_scores'] = assigned_scores\n\n            one_hot_label = F.one_hot(assigned_labels,\n                                      self.num_classes + 1)[..., :-1]\n            self.distill_pairs['target_labels'] = one_hot_label\n\n        loss_l1, loss_iou, loss_dfl = \\\n            self._bbox_loss(pred_distri, pred_bboxes, anchor_points_s,\n                            assigned_labels, assigned_bboxes, assigned_scores,\n                            assigned_scores_sum)\n        loss = self.loss_weight['class'] * loss_cls + \\\n               self.loss_weight['iou'] * loss_iou + \\\n               self.loss_weight['dfl'] * loss_dfl\n        out_dict = {\n            'loss': loss,\n            'loss_cls': loss_cls,\n            'loss_iou': loss_iou,\n            'loss_dfl': loss_dfl,\n            'loss_l1': loss_l1,\n        }\n        return out_dict\n\n    def post_process(self, head_outs, scale_factor):\n        pred_scores, pred_dist, anchor_points, stride_tensor = head_outs\n        pred_bboxes = batch_distance2bbox(anchor_points, pred_dist)\n        pred_bboxes *= stride_tensor\n        if self.exclude_post_process:\n            return paddle.concat(\n                [pred_bboxes, pred_scores.transpose([0, 2, 1])],\n                axis=-1), None, None\n        else:\n            # scale bbox to origin\n            scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1)\n            scale_factor = paddle.concat(\n                [scale_x, scale_y, scale_x, scale_y],\n                axis=-1).reshape([-1, 1, 4])\n            pred_bboxes /= scale_factor\n            if self.exclude_nms:\n                # `exclude_nms=True` just use in benchmark\n                return pred_bboxes, pred_scores, None\n            else:\n                bbox_pred, bbox_num, nms_keep_idx = self.nms(pred_bboxes,\n                                                             pred_scores)\n                return bbox_pred, bbox_num, nms_keep_idx\n\n\ndef get_activation(name=\"LeakyReLU\"):\n    if name == \"silu\":\n        module = nn.Silu()\n    elif name == \"relu\":\n        module = nn.ReLU()\n    elif name in [\"LeakyReLU\", 'leakyrelu', 'lrelu']:\n        module = nn.LeakyReLU(0.1)\n    elif name is None:\n        module = nn.Identity()\n    else:\n        raise AttributeError(\"Unsupported act type: {}\".format(name))\n    return module\n\n\nclass ConvNormLayer(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size,\n                 stride=1,\n                 padding=0,\n               
  dilation=1,\n                 groups=1,\n                 norm_type='gn',\n                 activation=\"LeakyReLU\"):\n        super(ConvNormLayer, self).__init__()\n        assert norm_type in ['bn', 'sync_bn', 'syncbn', 'gn', None]\n        self.conv = nn.Conv2D(\n            in_channels,\n            out_channels,\n            kernel_size,\n            stride=stride,\n            padding=padding,\n            dilation=dilation,\n            groups=groups,\n            bias_attr=False,\n            weight_attr=ParamAttr(initializer=KaimingNormal()))\n\n        if norm_type in ['bn', 'sync_bn', 'syncbn']:\n            self.norm = nn.BatchNorm2D(out_channels)\n        elif norm_type == 'gn':\n            self.norm = nn.GroupNorm(num_groups=32, num_channels=out_channels)\n        else:\n            self.norm = None\n\n        self.act = get_activation(activation)\n\n    def forward(self, x):\n        y = self.conv(x)\n        if self.norm is not None:\n            y = self.norm(y)\n        y = self.act(y)\n        return y\n\n\nclass ScaleReg(nn.Layer):\n    \"\"\"\n    Parameter for scaling the regression outputs.\n    \"\"\"\n\n    def __init__(self, scale=1.0):\n        super(ScaleReg, self).__init__()\n        scale = paddle.to_tensor(scale)\n        self.scale = self.create_parameter(\n            shape=[1],\n            dtype='float32',\n            default_initializer=nn.initializer.Assign(scale))\n\n    def forward(self, x):\n        return x * self.scale\n\n\n@register\nclass SimpleConvHead(nn.Layer):\n    __shared__ = ['num_classes']\n\n    def __init__(self,\n                 num_classes=80,\n                 feat_in=288,\n                 feat_out=288,\n                 num_convs=1,\n                 fpn_strides=[32, 16, 8, 4],\n                 norm_type='gn',\n                 act='LeakyReLU',\n                 prior_prob=0.01,\n                 reg_max=16):\n        super(SimpleConvHead, self).__init__()\n        self.num_classes = num_classes\n        self.feat_in = feat_in\n        self.feat_out = feat_out\n        self.num_convs = num_convs\n        self.fpn_strides = fpn_strides\n        self.reg_max = reg_max\n\n        self.cls_convs = nn.LayerList()\n        self.reg_convs = nn.LayerList()\n        for i in range(self.num_convs):\n            in_c = feat_in if i == 0 else feat_out\n            self.cls_convs.append(\n                ConvNormLayer(\n                    in_c,\n                    feat_out,\n                    3,\n                    stride=1,\n                    padding=1,\n                    norm_type=norm_type,\n                    activation=act))\n            self.reg_convs.append(\n                ConvNormLayer(\n                    in_c,\n                    feat_out,\n                    3,\n                    stride=1,\n                    padding=1,\n                    norm_type=norm_type,\n                    activation=act))\n\n        bias_cls = bias_init_with_prob(prior_prob)\n        self.gfl_cls = nn.Conv2D(\n            feat_out,\n            self.num_classes,\n            kernel_size=3,\n            stride=1,\n            padding=1,\n            weight_attr=ParamAttr(initializer=Normal(\n                mean=0.0, std=0.01)),\n            bias_attr=ParamAttr(initializer=Constant(value=bias_cls)))\n        self.gfl_reg = nn.Conv2D(\n            feat_out,\n            4 * (self.reg_max + 1),\n            kernel_size=3,\n            stride=1,\n            padding=1,\n            weight_attr=ParamAttr(initializer=Normal(\n             
   mean=0.0, std=0.01)),\n            bias_attr=ParamAttr(initializer=Constant(value=0)))\n\n        self.scales = nn.LayerList()\n        for i in range(len(self.fpn_strides)):\n            self.scales.append(ScaleReg(1.0))\n\n    def forward(self, feats):\n        cls_scores = []\n        bbox_preds = []\n        for x, scale in zip(feats, self.scales):\n            cls_feat = x\n            reg_feat = x\n            for cls_conv in self.cls_convs:\n                cls_feat = cls_conv(cls_feat)\n            for reg_conv in self.reg_convs:\n                reg_feat = reg_conv(reg_feat)\n\n            cls_score = self.gfl_cls(cls_feat)\n            cls_score = F.sigmoid(cls_score)\n            cls_score = cls_score.flatten(2).transpose([0, 2, 1])\n            cls_scores.append(cls_score)\n\n            bbox_pred = scale(self.gfl_reg(reg_feat))\n            bbox_pred = bbox_pred.flatten(2).transpose([0, 2, 1])\n            bbox_preds.append(bbox_pred)\n\n        cls_scores = paddle.concat(cls_scores, axis=1)\n        bbox_preds = paddle.concat(bbox_preds, axis=1)\n        return cls_scores, bbox_preds\n"
  },
  {
    "path": "ppdet/modeling/heads/ppyoloe_ins_head.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\n\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling.backbones.csp_darknet import BaseConv\nfrom ppdet.modeling.layers import MultiClassNMS\nfrom ppdet.modeling.ops import get_static_shape, get_act_fn\nfrom .ppyoloe_head import ESEAttn\nfrom ..assigners.utils import generate_anchors_for_grid_cell\nfrom ..bbox_utils import batch_distance2bbox\nfrom ..initializer import bias_init_with_prob, constant_\nfrom ..losses import GIoULoss\n\n__all__ = ['PPYOLOEInsHead']\n\n\ndef custom_binary_cross_entropy_with_logits(x, y):\n    max_val = paddle.maximum(-x, paddle.to_tensor(0.0))\n    loss = (1 - y) * x + max_val + paddle.log(\n        paddle.exp(-max_val) + paddle.exp(-x - max_val))\n    return loss\n\n\nclass MaskProto(nn.Layer):\n    # YOLOv8 mask Proto module for instance segmentation models\n    def __init__(self, ch_in, num_protos=256, num_masks=32, act='silu'):\n        super().__init__()\n        self.conv1 = BaseConv(ch_in, num_protos, 3, 1, act=act)\n        self.upsample = nn.Conv2DTranspose(num_protos,\n                                           num_protos,\n                                           2,\n                                           2,\n                                           0,\n                                           bias_attr=True)\n        self.conv2 = BaseConv(num_protos, num_protos, 3, 1, act=act)\n        self.conv3 = BaseConv(num_protos, num_masks, 1, 1, act=act)\n\n    def forward(self, x):\n        return self.conv3(self.conv2(self.upsample(self.conv1(x))))\n\n\ndef xyxy2xywh(x):\n    \"\"\"\n    Convert bounding box coordinates from (x1, y1, x2, y2) format to (x, y, width, height) format where (x1, y1) is the\n    top-left corner and (x2, y2) is the bottom-right corner.\n    \"\"\"\n    assert x.shape[\n        -1] == 4, f'input shape last dimension expected 4 but input shape is {x.shape}'\n    y = paddle.empty_like(x) if isinstance(\n        x, paddle.Tensor) else np.empty_like(x)  # faster than clone/copy\n    y[..., 0] = (x[..., 0] + x[..., 2]) / 2  # x center\n    y[..., 1] = (x[..., 1] + x[..., 3]) / 2  # y center\n    y[..., 2] = x[..., 2] - x[..., 0]  # width\n    y[..., 3] = x[..., 3] - x[..., 1]  # height\n    return y\n\n\ndef crop_mask(masks, boxes):\n    \"\"\"\n    It takes a mask and a bounding box, and returns a mask that is cropped to the bounding box\n\n    Args:\n      masks (paddle.Tensor): [h, w, n] tensor of masks\n      boxes (paddle.Tensor): [n, 4] tensor of bbox coordinates in relative point form\n\n    Returns:\n      (paddle.Tensor): The masks are being cropped to the bounding box.\n    \"\"\"\n    _, h, w = masks.shape\n    x1, y1, x2, y2 = paddle.chunk(boxes[:, :, None], 4, axis=1)\n    r = paddle.arange(w, dtype=x1.dtype)[None, None, :]\n    c = paddle.arange(h, dtype=y1.dtype)[None, :, 
None]\n    if \"npu\" in paddle.device.get_all_custom_device_type():\n        # bool tensor broadcast multiply is extreamly slow on npu, so we cast it to float32.\n        m_dtype = masks.dtype\n        return masks * ((r >= x1).cast(m_dtype) * (r < x2).cast(m_dtype) *\n                        (c >= y1).cast(m_dtype) * (c < y2).cast(m_dtype))\n    else:\n        return masks * ((r >= x1) * (r < x2) * (c >= y1) *\n                        (c < y2)).astype(masks.dtype)\n\n\ndef process_mask_upsample(protos, masks_in, bboxes, shape):\n    \"\"\"\n    It takes the output of the mask head, and applies the mask to the bounding boxes. This produces masks of higher\n    quality but is slower.\n\n    Args:\n      protos (paddle.Tensor): [mask_dim, mask_h, mask_w]\n      masks_in (paddle.Tensor): [n, mask_dim], n is number of masks after nms\n      bboxes (paddle.Tensor): [n, 4], n is number of masks after nms\n      shape (tuple): the size of the input image (h,w)\n\n    Returns:\n      (paddle.Tensor): The upsampled masks.\n    \"\"\"\n    c, mh, mw = protos.shape  # CHW\n    masks = F.sigmoid(masks_in @ protos.reshape([c, -1])).reshape([-1, mh, mw])\n    masks = F.interpolate(masks[None],\n                          shape,\n                          mode='bilinear',\n                          align_corners=False)[0]  # CHW\n    masks = crop_mask(masks, bboxes)  # CHW\n    return masks\n\n\n@register\nclass PPYOLOEInsHead(nn.Layer):\n    __shared__ = [\n        'num_classes', 'eval_size', 'trt', 'exclude_nms',\n        'exclude_post_process', 'use_shared_conv', 'for_distill', 'width_mult'\n    ]\n    __inject__ = ['static_assigner', 'assigner', 'nms']\n\n    def __init__(self,\n                 in_channels=[1024, 512, 256],\n                 num_classes=80,\n                 act='swish',\n                 fpn_strides=(32, 16, 8),\n                 grid_cell_scale=5.0,\n                 grid_cell_offset=0.5,\n                 reg_max=16,\n                 reg_range=None,\n                 static_assigner_epoch=4,\n                 use_varifocal_loss=True,\n                 static_assigner='ATSSAssigner',\n                 assigner='TaskAlignedAssigner',\n                 nms='MultiClassNMS',\n                 eval_size=None,\n                 loss_weight={\n                     'class': 1.0,\n                     'iou': 2.5,\n                     'dfl': 0.5,\n                 },\n                 trt=False,\n                 attn_conv='convbn',\n                 exclude_nms=False,\n                 exclude_post_process=False,\n                 use_shared_conv=True,\n                 mask_thr_binary=0.5,\n                 num_masks=32,\n                 num_protos=256,\n                 width_mult=1.0,\n                 for_distill=False):\n        super(PPYOLOEInsHead, self).__init__()\n        assert len(in_channels) > 0, \"len(in_channels) should > 0\"\n\n        self.mask_thr_binary = mask_thr_binary\n        self.num_masks = num_masks\n        self.num_protos = int(num_protos * width_mult)\n\n        self.in_channels = in_channels\n        self.num_classes = num_classes\n        self.fpn_strides = fpn_strides\n        self.grid_cell_scale = grid_cell_scale\n        self.grid_cell_offset = grid_cell_offset\n        if reg_range:\n            self.sm_use = True\n            self.reg_range = reg_range\n        else:\n            self.sm_use = False\n            self.reg_range = (0, reg_max + 1)\n        self.reg_channels = self.reg_range[1] - self.reg_range[0]\n        self.iou_loss = GIoULoss()\n   
     self.loss_weight = loss_weight\n        self.use_varifocal_loss = use_varifocal_loss\n        self.eval_size = eval_size\n\n        self.static_assigner_epoch = static_assigner_epoch\n        self.static_assigner = static_assigner\n        self.assigner = assigner\n        self.nms = nms\n        if isinstance(self.nms, MultiClassNMS) and trt:\n            self.nms.trt = trt\n        self.exclude_nms = exclude_nms\n        self.exclude_post_process = exclude_post_process\n        self.use_shared_conv = use_shared_conv\n        self.for_distill = for_distill\n        self.is_teacher = False\n\n        # stem\n        self.stem_cls = nn.LayerList()\n        self.stem_reg = nn.LayerList()\n        self.stem_ins = nn.LayerList()\n        act = get_act_fn(\n            act, trt=trt) if act is None or isinstance(act,\n                                                       (str, dict)) else act\n        for in_c in self.in_channels:\n            self.stem_cls.append(ESEAttn(in_c, act=act, attn_conv=attn_conv))\n            self.stem_reg.append(ESEAttn(in_c, act=act, attn_conv=attn_conv))\n            self.stem_ins.append(ESEAttn(in_c, act=act, attn_conv=attn_conv))\n        # pred head\n        self.pred_cls = nn.LayerList()\n        self.pred_reg = nn.LayerList()\n        self.pred_ins = nn.LayerList()\n        for in_c in self.in_channels:\n            self.pred_cls.append(\n                nn.Conv2D(in_c, self.num_classes, 3, padding=1))\n            self.pred_reg.append(\n                nn.Conv2D(in_c, 4 * self.reg_channels, 3, padding=1))\n            self.pred_ins.append(nn.Conv2D(in_c, self.num_masks, 3, padding=1))\n        # projection conv\n        self.proj_conv = nn.Conv2D(self.reg_channels, 1, 1, bias_attr=False)\n        self.proj_conv.skip_quant = True\n        self._init_weights()\n\n        self.proto = MaskProto(in_channels[-1],\n                               self.num_protos,\n                               self.num_masks,\n                               act=act)\n\n        if self.for_distill:\n            self.distill_pairs = {}\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {\n            'in_channels': [i.channels for i in input_shape],\n        }\n\n    def _init_weights(self):\n        bias_cls = bias_init_with_prob(0.01)\n        for cls_, reg_ in zip(self.pred_cls, self.pred_reg):\n            constant_(cls_.weight)\n            constant_(cls_.bias, bias_cls)\n            constant_(reg_.weight)\n            constant_(reg_.bias, 1.0)\n\n        proj = paddle.linspace(self.reg_range[0], self.reg_range[1] - 1,\n                               self.reg_channels).reshape(\n                                   [1, self.reg_channels, 1, 1])\n        self.proj_conv.weight.set_value(proj)\n        self.proj_conv.weight.stop_gradient = True\n        if self.eval_size:\n            anchor_points, stride_tensor = self._generate_anchors()\n            self.anchor_points = anchor_points\n            self.stride_tensor = stride_tensor\n\n    def forward_train(self, feats, targets):\n        anchors, anchor_points, num_anchors_list, stride_tensor = \\\n            generate_anchors_for_grid_cell(\n                feats, self.fpn_strides, self.grid_cell_scale,\n                self.grid_cell_offset)\n\n        cls_score_list, reg_distri_list = [], []\n        mask_feat = self.proto(feats[-1])\n        mask_coeff_list = []\n        for i, feat in enumerate(feats):\n            _, _, h, w = feat.shape\n            l = h * w\n            if \"npu\" in 
paddle.device.get_all_custom_device_type(\n            ):  # backward in avgpool is extremely slow in npu kernel, replace it with mean\n                avg_feat = feat.mean(axis=[2, 3], keepdim=True)\n            else:\n                avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))\n            cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) +\n                                         feat)\n            reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))\n            msk_coeff = self.pred_ins[i](self.stem_ins[i](feat, avg_feat) +\n                                         feat)\n\n            # cls and reg\n            cls_score = F.sigmoid(cls_logit)\n            cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1]))\n            mask_coeff_list.append(msk_coeff.flatten(2).transpose([0, 2,\n                                                                   1]))  ###\n            reg_distri_list.append(reg_distri.flatten(2).transpose([0, 2, 1]))\n        cls_score_list = paddle.concat(cls_score_list, axis=1)\n        mask_coeff_list = paddle.concat(mask_coeff_list, axis=1)\n        reg_distri_list = paddle.concat(reg_distri_list, axis=1)\n\n        return self.get_loss([\n            cls_score_list, reg_distri_list, mask_coeff_list, mask_feat,\n            anchors, anchor_points, num_anchors_list, stride_tensor\n        ], targets)\n\n    def _generate_anchors(self, feats=None, dtype='float32'):\n        # just use in eval time\n        anchor_points = []\n        stride_tensor = []\n        for i, stride in enumerate(self.fpn_strides):\n            if feats is not None:\n                _, _, h, w = feats[i].shape\n            else:\n                h = int(self.eval_size[0] / stride)\n                w = int(self.eval_size[1] / stride)\n            shift_x = paddle.arange(end=w) + self.grid_cell_offset\n            shift_y = paddle.arange(end=h) + self.grid_cell_offset\n            shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)\n            anchor_point = paddle.cast(paddle.stack([shift_x, shift_y],\n                                                    axis=-1),\n                                       dtype=dtype)\n            anchor_points.append(anchor_point.reshape([-1, 2]))\n            stride_tensor.append(paddle.full([h * w, 1], stride, dtype=dtype))\n        anchor_points = paddle.concat(anchor_points)\n        stride_tensor = paddle.concat(stride_tensor)\n        return anchor_points, stride_tensor\n\n    def forward_eval(self, feats):\n        mask_proto = self.proto(feats[-1])\n\n        if self.eval_size:\n            anchor_points, stride_tensor = self.anchor_points, self.stride_tensor\n        else:\n            anchor_points, stride_tensor = self._generate_anchors(feats)\n        cls_score_list, reg_dist_list, pred_mask_list = [], [], []\n        feats_shapes = []\n        for i, feat in enumerate(feats):\n            _, _, h, w = feat.shape\n            l = h * w\n            feats_shapes.append(l)\n\n            if \"npu\" in paddle.device.get_all_custom_device_type():\n                # backward in avgpool is extremely slow in npu kernel, replace it with mean\n                avg_feat = feat.mean(axis=[2, 3], keepdim=True)\n            else:\n                avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))\n            cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) +\n                                         feat)\n            reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))\n            mask_coeff = 
self.pred_ins[i](self.stem_ins[i](feat, avg_feat) +\n                                          feat)\n            pred_mask_list.append(mask_coeff.reshape([-1, self.num_masks, l]))\n\n            reg_dist = reg_dist.reshape([-1, 4, self.reg_channels,\n                                         l]).transpose([0, 2, 3, 1])\n\n            if self.use_shared_conv:\n                reg_dist = self.proj_conv(F.softmax(reg_dist,\n                                                    axis=1)).squeeze(1)\n            else:\n                reg_dist = F.softmax(reg_dist, axis=1)\n            # cls and reg\n            cls_score = F.sigmoid(cls_logit)\n            cls_score_list.append(cls_score.reshape([-1, self.num_classes, l]))\n            reg_dist_list.append(reg_dist)\n\n        cls_score_list = paddle.concat(cls_score_list, axis=-1)\n        pred_mask_list = paddle.concat(pred_mask_list, axis=-1)\n\n        if self.use_shared_conv:\n            reg_dist_list = paddle.concat(reg_dist_list, axis=1)\n        else:\n            reg_dist_list = paddle.concat(reg_dist_list, axis=2)\n            reg_dist_list = self.proj_conv(reg_dist_list).squeeze(1)\n\n        return cls_score_list, reg_dist_list, pred_mask_list, mask_proto, anchor_points, stride_tensor\n\n    def forward(self, feats, targets=None):\n        assert len(feats) == len(self.fpn_strides), \\\n            \"The size of feats is not equal to size of fpn_strides\"\n        if self.training:\n            return self.forward_train(feats, targets)\n        else:\n            return self.forward_eval(feats)\n\n    @staticmethod\n    def _focal_loss(score, label, alpha=0.25, gamma=2.0):\n        weight = (score - label).pow(gamma)\n        if alpha > 0:\n            alpha_t = alpha * label + (1 - alpha) * (1 - label)\n            weight *= alpha_t\n        loss = F.binary_cross_entropy(score,\n                                      label,\n                                      weight=weight,\n                                      reduction='sum')\n        return loss\n\n    @staticmethod\n    def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0):\n        weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label\n        loss = F.binary_cross_entropy(pred_score,\n                                      gt_score,\n                                      weight=weight,\n                                      reduction='sum')\n        return loss\n\n    def _bbox_decode(self, anchor_points, pred_dist):\n        _, l, _ = get_static_shape(pred_dist)\n        pred_dist = F.softmax(pred_dist.reshape([-1, l, 4, self.reg_channels]))\n        pred_dist = self.proj_conv(pred_dist.transpose([0, 3, 1,\n                                                        2])).squeeze(1)\n        return batch_distance2bbox(anchor_points, pred_dist)\n\n    def _bbox_decode_fake(self, pred_dist):\n        _, l, _ = get_static_shape(pred_dist)\n        pred_dist_dfl = F.softmax(\n            pred_dist.reshape([-1, l, 4, self.reg_channels]))\n        pred_dist = self.proj_conv(pred_dist_dfl.transpose([0, 3, 1,\n                                                            2])).squeeze(1)\n        return pred_dist, pred_dist_dfl\n\n    def _bbox2distance(self, points, bbox):\n        x1y1, x2y2 = paddle.split(bbox, 2, -1)\n        lt = points - x1y1\n        rb = x2y2 - points\n        if \"npu\" in paddle.device.get_all_custom_device_type(\n        ):  # npu clip kernel causes nan grad, replace it with maximum & minimum.\n            out = 
paddle.concat([lt, rb], -1)\n            out = paddle.maximum(\n                out, paddle.to_tensor(self.reg_range[0], dtype=out.dtype))\n            out = paddle.minimum(\n                out,\n                paddle.to_tensor(self.reg_range[1] - 1 - 0.01,\n                                 dtype=out.dtype))\n            return out\n        else:\n            return paddle.concat([lt, rb],\n                                 -1).clip(self.reg_range[0],\n                                          self.reg_range[1] - 1 - 0.01)\n\n    def _df_loss(self, pred_dist, target, lower_bound=0):\n        target_left = paddle.cast(target.floor(), 'int64')\n        target_right = target_left + 1\n        weight_left = target_right.astype('float32') - target\n        weight_right = 1 - weight_left\n        loss_left = F.cross_entropy(pred_dist,\n                                    target_left - lower_bound,\n                                    reduction='none') * weight_left\n        loss_right = F.cross_entropy(pred_dist,\n                                     target_right - lower_bound,\n                                     reduction='none') * weight_right\n        return (loss_left + loss_right).mean(-1, keepdim=True)\n\n    def get_loss(self, head_outs, gt_meta):\n        assert 'gt_bbox' in gt_meta and 'gt_class' in gt_meta\n        assert 'gt_segm' in gt_meta\n\n        pred_scores, pred_distri, pred_mask_coeffs, mask_proto, anchors, \\\n            anchor_points, num_anchors_list, stride_tensor = head_outs\n\n        bs = pred_scores.shape[0]\n        imgsz = paddle.to_tensor(\n            [640, 640]\n        )  # paddle.to_tensor(pred_scores[0].shape[2:]) * self.fpn_strides[0]  # image size (h,w)\n        mask_h, mask_w = mask_proto.shape[-2:]\n\n        anchor_points_s = anchor_points / stride_tensor\n        pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri)\n\n        gt_labels = paddle.stack(gt_meta['gt_class'])\n        gt_bboxes = paddle.stack(gt_meta['gt_bbox'])\n        pad_gt_mask = paddle.stack(gt_meta['pad_gt_mask'])\n        gt_segms = paddle.stack(gt_meta['gt_segm']).cast('float32')\n        if tuple(gt_segms.shape[-2:]) != (mask_h, mask_w):  # downsample\n            gt_segms = F.interpolate(gt_segms, (mask_h, mask_w),\n                                     mode='nearest').reshape(\n                                         [bs, -1, mask_h * mask_w])\n\n        # label assignment\n        assigned_labels, assigned_bboxes, assigned_scores, assigned_gt_index = \\\n            self.assigner(\n                pred_scores.detach(),\n                pred_bboxes.detach() * stride_tensor,\n                anchor_points,\n                num_anchors_list,\n                gt_labels,\n                gt_bboxes,\n                pad_gt_mask,\n                bg_index=self.num_classes,\n                gt_segms=gt_segms)\n        # rescale bbox\n        assigned_bboxes /= stride_tensor\n\n        # assign segms for masks\n        assigned_masks = paddle.gather(gt_segms.reshape([-1, mask_h * mask_w]),\n                                       assigned_gt_index.flatten(),\n                                       axis=0)\n        assigned_masks = assigned_masks.reshape(\n            [bs, assigned_gt_index.shape[1], mask_h * mask_w])\n\n        assign_out_dict = self.get_loss_from_assign(\n            pred_scores, pred_distri, pred_bboxes, anchor_points_s,\n            assigned_labels, assigned_bboxes, assigned_scores, assigned_masks,\n            pred_mask_coeffs, mask_proto, stride_tensor, 
imgsz)\n\n        loss = assign_out_dict\n        return loss\n\n    def get_loss_from_assign(self, pred_scores, pred_distri, pred_bboxes,\n                             anchor_points_s, assigned_labels, assigned_bboxes,\n                             assigned_scores, assigned_masks, pred_mask_coeffs,\n                             mask_proto, stride_tensor, imgsz):\n        # cls loss\n        if self.use_varifocal_loss:\n            one_hot_label = F.one_hot(assigned_labels,\n                                      self.num_classes + 1)[..., :-1]\n            loss_cls = self._varifocal_loss(pred_scores, assigned_scores,\n                                            one_hot_label)\n        else:\n            loss_cls = self._focal_loss(pred_scores,\n                                        assigned_scores,\n                                        alpha=-1)\n\n        assigned_scores_sum = assigned_scores.sum()\n        if paddle.distributed.get_world_size() > 1:\n            paddle.distributed.all_reduce(assigned_scores_sum)\n            assigned_scores_sum /= paddle.distributed.get_world_size()\n        if \"npu\" in paddle.device.get_all_custom_device_type():\n            # npu clip kernel causes nan grad, replace it with maximum & minimum.\n            assigned_scores_sum = paddle.maximum(\n                assigned_scores_sum,\n                paddle.to_tensor(1., dtype=assigned_scores_sum.dtype))\n        else:\n            assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.)\n\n        loss_cls /= assigned_scores_sum\n\n        # select positive samples mask\n        mask_positive = (assigned_labels != self.num_classes)\n        num_pos = mask_positive.sum()\n        # pos/neg loss\n        if num_pos > 0:\n            # l1 + iou\n            bbox_mask = mask_positive.astype('int32').unsqueeze(-1).tile(\n                [1, 1, 4]).astype('bool')\n            pred_bboxes_pos = paddle.masked_select(pred_bboxes,\n                                                   bbox_mask).reshape([-1, 4])\n            assigned_bboxes_pos = paddle.masked_select(\n                assigned_bboxes, bbox_mask).reshape([-1, 4])\n            bbox_weight = paddle.masked_select(assigned_scores.sum(-1),\n                                               mask_positive).unsqueeze(-1)\n\n            loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos)\n\n            loss_iou = self.iou_loss(pred_bboxes_pos,\n                                     assigned_bboxes_pos) * bbox_weight\n            loss_iou = loss_iou.sum() / assigned_scores_sum\n\n            # dfl loss\n            dist_mask = mask_positive.unsqueeze(-1).astype('int32').tile(\n                [1, 1, self.reg_channels * 4]).astype('bool')\n            pred_dist_pos = paddle.masked_select(pred_distri,\n                                                 dist_mask).reshape([\n                                                     -1, 4, self.reg_channels\n                                                 ])  # pred_dist in func\n            assigned_ltrb = self._bbox2distance(\n                anchor_points_s, assigned_bboxes)  # anchor_points in func\n            assigned_ltrb_pos = paddle.masked_select(\n                assigned_ltrb, bbox_mask).reshape([-1, 4])\n            loss_dfl = self._df_loss(pred_dist_pos, assigned_ltrb_pos,\n                                     self.reg_range[0]) * bbox_weight\n            loss_dfl = loss_dfl.sum() / assigned_scores_sum\n\n            # mask loss\n            loss_mask = self.calculate_segmentation_loss(\n     
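# args: positive-anchor mask, assigned GT masks, assigned boxes rescaled\n     # back to the input resolution, prototype masks, mask coefficients and\n     # the input image size (h, w)\n     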
           mask_positive, assigned_masks, assigned_bboxes * stride_tensor,\n                mask_proto, pred_mask_coeffs, imgsz)\n            # [bs, 8400] [bs, 8400, 160 * 160] [bs, 8400, 4] [bs, 32, 160, 160] [bs, 8400, 32]\n            loss_mask /= assigned_scores_sum\n        else:\n            loss_l1 = paddle.zeros([1])\n            loss_iou = paddle.zeros([1])\n            loss_mask = paddle.zeros([1])\n            loss_dfl = paddle.zeros([1])\n\n        loss = self.loss_weight['class'] * loss_cls + \\\n               self.loss_weight['iou'] * loss_iou + \\\n               self.loss_weight['dfl'] * loss_dfl + \\\n               self.loss_weight['iou'] * loss_mask\n\n        out_dict = {\n            'loss': loss,\n            'loss_cls': loss_cls,\n            'loss_iou': loss_iou,\n            'loss_dfl': loss_dfl,\n            'loss_mask': loss_mask,\n            'loss_l1': loss_l1,\n        }\n        return out_dict\n\n    def calculate_segmentation_loss(self,\n                                    fg_mask,\n                                    masks,\n                                    target_bboxes,\n                                    proto,\n                                    pred_masks,\n                                    imgsz,\n                                    overlap=True):\n        \"\"\"\n        Calculate the loss for instance segmentation.\n\n        Args:\n            fg_mask (paddle.Tensor): A binary tensor of shape (BS, N_anchors) indicating which anchors are positive.\n            masks (paddle.Tensor): Ground truth masks assigned to each anchor, of shape (BS, N_anchors, H * W).\n            target_bboxes (paddle.Tensor): Ground truth bounding boxes for each anchor of shape (BS, N_anchors, 4).\n            proto (paddle.Tensor): Prototype masks of shape (BS, 32, H, W).\n            pred_masks (paddle.Tensor): Predicted mask coefficients for each anchor of shape (BS, N_anchors, 32).\n            imgsz (paddle.Tensor): Size of the input image as a tensor of shape (2), i.e., (H, W).\n            overlap (bool): Whether the masks in `masks` tensor overlap.\n\n        Returns:\n            (paddle.Tensor): The calculated loss for instance segmentation.\n\n        Notes:\n            The batch loss can be computed for improved speed at higher memory usage.\n            For example, pred_mask can be computed as follows:\n                pred_mask = paddle.einsum('in,nhw->ihw', pred, proto)  # (i, 32) @ (32, 160, 160) -> (i, 160, 160)\n        \"\"\"\n        _, _, mask_h, mask_w = proto.shape\n        loss = paddle.to_tensor([0.])\n\n        # Normalize to 0-1\n        target_bboxes_normalized = target_bboxes / imgsz[[1, 0, 1, 0]].cast(\n            target_bboxes.dtype)\n        # [8, 8400, 4]\n\n        # Areas of target bboxes\n        marea = xyxy2xywh(target_bboxes_normalized)[...,\n                                                    2:].prod(2).unsqueeze(-1)\n\n        # Normalize to mask size\n        mxyxy = target_bboxes_normalized * paddle.to_tensor(\n            [mask_w, mask_h, mask_w, mask_h],\n            dtype=target_bboxes_normalized.dtype)\n\n        for i, single_i in enumerate(\n                zip(fg_mask, pred_masks, proto, mxyxy, marea, masks)):\n            fg_mask_i, pred_masks_i, proto_i, mxyxy_i, marea_i, masks_i = single_i\n            #  
[8400] [8400, 32] [32, 160, 160] [8400, 4]  [8400, 1]  [8400, 25600]\n            if fg_mask_i.any():\n                loss += self.single_mask_loss(masks_i[fg_mask_i],\n                                              pred_masks_i[fg_mask_i], proto_i,\n                                              mxyxy_i[fg_mask_i],\n                                              marea_i[fg_mask_i])\n                # [10, 25600]  [10, 32]  [32, 160, 160]  [10, 4]  [10, 1]\n            else:\n                loss += (proto * 0).sum() + (\n                    pred_masks * 0).sum()  # inf sums may lead to nan loss\n        return loss\n\n    @staticmethod\n    def single_mask_loss(gt_mask, pred, proto, xyxy, area):\n        \"\"\"\n        Compute the instance segmentation loss for a single image.\n        Args:\n            gt_mask (paddle.Tensor): Ground truth masks of shape (n, H * W), where n is the number of objects; reshaped\n                internally to (n, H, W).\n            pred (paddle.Tensor): Predicted mask coefficients of shape (n, 32).\n            proto (paddle.Tensor): Prototype masks of shape (32, H, W).\n            xyxy (paddle.Tensor): Ground truth bounding boxes in xyxy format, in mask-space pixel coordinates, of shape (n, 4).\n            area (paddle.Tensor): Area of each ground truth bounding box of shape (n, 1).\n        Returns:\n            (paddle.Tensor): The calculated mask loss for a single image.\n\n        Notes:\n            The function uses the equation pred_mask = paddle.einsum('in,nhw->ihw', pred, proto) to produce the\n            predicted masks from the prototype masks and predicted mask coefficients.\n        \"\"\"\n        nt = pred.shape[0]\n        gt_mask = gt_mask.reshape([nt, *proto.shape[1:]])\n        nmasks = proto.shape[0]\n        pred_mask = (pred @ proto.reshape([nmasks, -1])).reshape(\n            [-1, *proto.shape[1:]])  # (n,32) @ (32,80,80) -> (n,80,80)\n\n        if \"npu\" in paddle.device.get_all_custom_device_type():\n            # bce npu kernel causes nan grad, replace it with numeric stable custom implementation.\n            loss = custom_binary_cross_entropy_with_logits(pred_mask, gt_mask)\n        else:\n            loss = F.binary_cross_entropy_with_logits(pred_mask,\n                                                      gt_mask,\n                                                      reduction='none')\n        return (crop_mask(loss, xyxy).mean(axis=(1, 2)) /\n                area.squeeze(-1)).sum()\n\n    def post_process(self,\n                     head_outs,\n                     im_shape,\n                     scale_factor,\n                     infer_shape=[640, 640],\n                     rescale=True):\n        pred_scores, pred_dist, pred_mask_coeffs, mask_feat, anchor_points, stride_tensor = head_outs\n\n        pred_bboxes = batch_distance2bbox(anchor_points, pred_dist)\n        pred_bboxes *= stride_tensor\n\n        if self.exclude_post_process:\n            return paddle.concat([\n                pred_bboxes,\n                pred_scores.transpose([0, 2, 1]),\n                pred_mask_coeffs.transpose([0, 2, 1])\n            ],\n                                 axis=-1), mask_feat, None\n            # [1, 8400, 4+80+32], [1, 32, 160, 160]\n\n        bbox_pred, bbox_num, keep_idxs = self.nms(pred_bboxes, pred_scores)\n\n        if bbox_num.sum() > 0:\n            pred_mask_coeffs = pred_mask_coeffs.transpose([0, 2, 1])\n            mask_coeffs = paddle.gather(\n                pred_mask_coeffs.reshape([-1, self.num_masks]), keep_idxs)\n\n            mask_logits = process_mask_upsample(mask_feat[0], 
mask_coeffs,\n                                                bbox_pred[:, 2:6], infer_shape)\n            if rescale:\n                ori_h, ori_w = im_shape[0] / scale_factor[0]\n                mask_logits = F.interpolate(\n                    mask_logits.unsqueeze(0),\n                    size=[\n                        int(paddle.round(mask_logits.shape[-2] /\n                              scale_factor[0][0])),\n                        int(paddle.round(mask_logits.shape[-1] /\n                              scale_factor[0][1]))\n                    ],\n                    mode='bilinear',\n                    align_corners=False)\n                if \"npu\" in paddle.device.get_all_custom_device_type():\n                    # due to npu numeric error, we need to round the image size.\n                    mask_logits = mask_logits[\n                        ..., :round(ori_h.item()), :round(ori_w.item())]\n                else:\n                    mask_logits = mask_logits[..., :int(ori_h), :int(ori_w)]\n\n            masks = mask_logits.squeeze(0)\n            mask_pred = (masks > self.mask_thr_binary).cast(\"float32\")\n\n            # scale bbox to origin\n            scale_factor = scale_factor.flip(-1).tile([1, 2])\n            bbox_pred[:, 2:6] /= scale_factor\n        else:\n            ori_h, ori_w = im_shape[0] / scale_factor[0]\n            bbox_num = paddle.to_tensor([1]).cast(\"int32\")\n            bbox_pred = paddle.zeros([bbox_num, 6])\n            mask_pred = paddle.zeros([bbox_num, int(ori_h), int(ori_w)])\n\n        return bbox_pred, bbox_num, mask_pred, keep_idxs"
  },
  {
    "path": "ppdet/modeling/heads/ppyoloe_r_head.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register\n\nfrom ..losses import ProbIoULoss\nfrom ..initializer import bias_init_with_prob, constant_, normal_, vector_\nfrom ppdet.modeling.backbones.cspresnet import ConvBNLayer\nfrom ppdet.modeling.ops import get_static_shape, get_act_fn, anchor_generator\nfrom ppdet.modeling.layers import MultiClassNMS\n\n__all__ = ['PPYOLOERHead']\n\n\nclass ESEAttn(nn.Layer):\n    def __init__(self, feat_channels, act='swish'):\n        super(ESEAttn, self).__init__()\n        self.fc = nn.Conv2D(feat_channels, feat_channels, 1)\n        self.conv = ConvBNLayer(feat_channels, feat_channels, 1, act=act)\n\n        self._init_weights()\n\n    def _init_weights(self):\n        normal_(self.fc.weight, std=0.01)\n\n    def forward(self, feat, avg_feat):\n        weight = F.sigmoid(self.fc(avg_feat))\n        return self.conv(feat * weight)\n\n\n@register\nclass PPYOLOERHead(nn.Layer):\n    __shared__ = ['num_classes', 'trt', 'export_onnx']\n    __inject__ = ['static_assigner', 'assigner', 'nms']\n\n    def __init__(self,\n                 in_channels=[1024, 512, 256],\n                 num_classes=15,\n                 act='swish',\n                 fpn_strides=(32, 16, 8),\n                 grid_cell_offset=0.5,\n                 angle_max=90,\n                 use_varifocal_loss=True,\n                 static_assigner_epoch=4,\n                 trt=False,\n                 export_onnx=False,\n                 static_assigner='ATSSAssigner',\n                 assigner='TaskAlignedAssigner',\n                 nms='MultiClassNMS',\n                 loss_weight={'class': 1.0,\n                              'iou': 2.5,\n                              'dfl': 0.05}):\n        super(PPYOLOERHead, self).__init__()\n        assert len(in_channels) > 0, \"len(in_channels) should > 0\"\n        self.in_channels = in_channels\n        self.num_classes = num_classes\n        self.fpn_strides = fpn_strides\n        self.grid_cell_offset = grid_cell_offset\n        self.angle_max = angle_max\n        self.loss_weight = loss_weight\n        self.use_varifocal_loss = use_varifocal_loss\n        self.half_pi = paddle.to_tensor(\n            [1.5707963267948966], dtype=paddle.float32)\n        self.half_pi_bin = self.half_pi / angle_max\n        self.iou_loss = ProbIoULoss()\n        self.static_assigner_epoch = static_assigner_epoch\n        self.static_assigner = static_assigner\n        self.assigner = assigner\n        self.nms = nms\n        # stem\n        self.stem_cls = nn.LayerList()\n        self.stem_reg = nn.LayerList()\n        self.stem_angle = nn.LayerList()\n        trt = False if export_onnx else trt\n        self.export_onnx = export_onnx\n        act = get_act_fn(\n            act, trt=trt) if act is None or isinstance(act,\n                                            
           (str, dict)) else act\n        self.trt = trt\n        for in_c in self.in_channels:\n            self.stem_cls.append(ESEAttn(in_c, act=act))\n            self.stem_reg.append(ESEAttn(in_c, act=act))\n            self.stem_angle.append(ESEAttn(in_c, act=act))\n        # pred head\n        self.pred_cls = nn.LayerList()\n        self.pred_reg = nn.LayerList()\n        self.pred_angle = nn.LayerList()\n        for in_c in self.in_channels:\n            self.pred_cls.append(\n                nn.Conv2D(\n                    in_c, self.num_classes, 3, padding=1))\n            self.pred_reg.append(nn.Conv2D(in_c, 4, 3, padding=1))\n            self.pred_angle.append(\n                nn.Conv2D(\n                    in_c, self.angle_max + 1, 3, padding=1))\n        self.angle_proj_conv = nn.Conv2D(\n            self.angle_max + 1, 1, 1, bias_attr=False)\n        self._init_weights()\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_channels': [i.channels for i in input_shape], }\n\n    def _init_weights(self):\n        bias_cls = bias_init_with_prob(0.01)\n        bias_angle = [10.] + [1.] * self.angle_max\n        for cls_, reg_, angle_ in zip(self.pred_cls, self.pred_reg,\n                                      self.pred_angle):\n            normal_(cls_.weight, std=0.01)\n            constant_(cls_.bias, bias_cls)\n            normal_(reg_.weight, std=0.01)\n            constant_(reg_.bias)\n            constant_(angle_.weight)\n            vector_(angle_.bias, bias_angle)\n\n        angle_proj = paddle.linspace(0, self.angle_max, self.angle_max + 1)\n        self.angle_proj = angle_proj * self.half_pi_bin\n        self.angle_proj_conv.weight.set_value(\n            self.angle_proj.reshape([1, self.angle_max + 1, 1, 1]))\n        self.angle_proj_conv.weight.stop_gradient = True\n\n    def _generate_anchors(self, feats):\n        if self.trt:\n            anchor_points = []\n            for feat, stride in zip(feats, self.fpn_strides):\n                _, _, h, w = feat.shape\n                anchor, _ = anchor_generator(\n                    feat,\n                    stride * 4,\n                    1.0, [1.0, 1.0, 1.0, 1.0], [stride, stride],\n                    offset=0.5)\n                x1, y1, x2, y2 = paddle.split(anchor, 4, axis=-1)\n                xc = (x1 + x2 + 1) / 2\n                yc = (y1 + y2 + 1) / 2\n                anchor_point = paddle.concat(\n                    [xc, yc], axis=-1).reshape((1, h * w, 2))\n                anchor_points.append(anchor_point)\n            anchor_points = paddle.concat(anchor_points, axis=1)\n            return anchor_points, None, None\n        else:\n            anchor_points = []\n            stride_tensor = []\n            num_anchors_list = []\n            for feat, stride in zip(feats, self.fpn_strides):\n                _, _, h, w = feat.shape\n                shift_x = (paddle.arange(end=w) + 0.5) * stride\n                shift_y = (paddle.arange(end=h) + 0.5) * stride\n                shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)\n                anchor_point = paddle.cast(\n                    paddle.stack(\n                        [shift_x, shift_y], axis=-1), dtype='float32')\n                anchor_points.append(anchor_point.reshape([1, -1, 2]))\n                stride_tensor.append(\n                    paddle.full(\n                        [1, h * w, 1], stride, dtype='float32'))\n                num_anchors_list.append(h * w)\n            anchor_points = 
paddle.concat(anchor_points, axis=1)\n            stride_tensor = paddle.concat(stride_tensor, axis=1)\n            return anchor_points, stride_tensor, num_anchors_list\n\n    def forward(self, feats, targets=None):\n        assert len(feats) == len(self.fpn_strides), \\\n            \"The size of feats is not equal to the size of fpn_strides\"\n\n        if self.training:\n            return self.forward_train(feats, targets)\n        else:\n            return self.forward_eval(feats)\n\n    def forward_train(self, feats, targets):\n        anchor_points, stride_tensor, num_anchors_list = self._generate_anchors(\n            feats)\n\n        cls_score_list, reg_dist_list, reg_angle_list = [], [], []\n        for i, feat in enumerate(feats):\n            avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))\n            cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) +\n                                         feat)\n            reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))\n            reg_angle = self.pred_angle[i](self.stem_angle[i](feat, avg_feat))\n            # cls and reg\n            cls_score = F.sigmoid(cls_logit)\n            cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1]))\n            reg_dist_list.append(reg_dist.flatten(2).transpose([0, 2, 1]))\n            reg_angle_list.append(reg_angle.flatten(2).transpose([0, 2, 1]))\n        cls_score_list = paddle.concat(cls_score_list, axis=1)\n        reg_dist_list = paddle.concat(reg_dist_list, axis=1)\n        reg_angle_list = paddle.concat(reg_angle_list, axis=1)\n\n        return self.get_loss([\n            cls_score_list, reg_dist_list, reg_angle_list, anchor_points,\n            num_anchors_list, stride_tensor\n        ], targets)\n\n    def forward_eval(self, feats):\n        cls_score_list, reg_box_list = [], []\n        anchor_points, _, _ = self._generate_anchors(feats)\n        for i, (feat, stride) in enumerate(zip(feats, self.fpn_strides)):\n            b, _, h, w = feat.shape\n            l = h * w\n            # cls\n            avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))\n            cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) +\n                                         feat)\n            # reg\n            reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))\n            reg_xy, reg_wh = paddle.split(reg_dist, 2, axis=1)\n            reg_xy = reg_xy * stride\n            reg_wh = (F.elu(reg_wh) + 1.) 
* stride\n            reg_angle = self.pred_angle[i](self.stem_angle[i](feat, avg_feat))\n            reg_angle = self.angle_proj_conv(F.softmax(reg_angle, axis=1))\n            reg_box = paddle.concat([reg_xy, reg_wh, reg_angle], axis=1)\n            # cls and reg\n            cls_score = F.sigmoid(cls_logit)\n            cls_score_list.append(cls_score.reshape([b, self.num_classes, l]))\n            reg_box_list.append(reg_box.reshape([b, 5, l]))\n\n        cls_score_list = paddle.concat(cls_score_list, axis=-1)\n        reg_box_list = paddle.concat(reg_box_list, axis=-1).transpose([0, 2, 1])\n        reg_xy, reg_wha = paddle.split(reg_box_list, [2, 3], axis=-1)\n        reg_xy = reg_xy + anchor_points\n        reg_box_list = paddle.concat([reg_xy, reg_wha], axis=-1)\n        return cls_score_list, reg_box_list\n\n    def _bbox_decode(self, points, pred_dist, pred_angle, stride_tensor):\n        # predict vector to x, y, w, h, angle\n        b, l = pred_angle.shape[:2]\n        xy, wh = paddle.split(pred_dist, 2, axis=-1)\n        xy = xy * stride_tensor + points\n        wh = (F.elu(wh) + 1.) * stride_tensor\n        angle = F.softmax(pred_angle.reshape([b, l, 1, self.angle_max + 1\n                                              ])).matmul(self.angle_proj)\n        return paddle.concat([xy, wh, angle], axis=-1)\n\n    def get_loss(self, head_outs, gt_meta):\n        pred_scores, pred_dist, pred_angle, \\\n        anchor_points, num_anchors_list, stride_tensor = head_outs\n        # decode pred_dist [B, N, 4] and pred_angle [B, N, angle_max + 1]\n        # to rotated boxes [B, N, 5]\n        pred_bboxes = self._bbox_decode(anchor_points, pred_dist, pred_angle,\n                                        stride_tensor)\n        gt_labels = gt_meta['gt_class']\n        # [B, N, 5]\n        gt_bboxes = gt_meta['gt_rbox']\n        pad_gt_mask = gt_meta['pad_gt_mask']\n        # label assignment\n        if gt_meta['epoch_id'] < self.static_assigner_epoch:\n            assigned_labels, assigned_bboxes, assigned_scores = \\\n                self.static_assigner(\n                    anchor_points,\n                    stride_tensor,\n                    num_anchors_list,\n                    gt_labels,\n                    gt_meta['gt_bbox'],\n                    gt_bboxes,\n                    pad_gt_mask,\n                    self.num_classes,\n                    pred_bboxes.detach()\n                )\n        else:\n            assigned_labels, assigned_bboxes, assigned_scores = \\\n                self.assigner(\n                pred_scores.detach(),\n                pred_bboxes.detach(),\n                anchor_points,\n                num_anchors_list,\n                gt_labels,\n                gt_bboxes,\n                pad_gt_mask,\n                bg_index=self.num_classes)\n        alpha_l = -1\n        # cls loss\n        if self.use_varifocal_loss:\n            one_hot_label = F.one_hot(assigned_labels,\n                                      self.num_classes + 1)[..., :-1]\n            loss_cls = self._varifocal_loss(pred_scores, assigned_scores,\n                                            one_hot_label)\n        else:\n            loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l)\n\n        assigned_scores_sum = assigned_scores.sum()\n        if paddle.distributed.get_world_size() > 1:\n            paddle.distributed.all_reduce(assigned_scores_sum)\n            assigned_scores_sum = paddle.clip(\n                assigned_scores_sum / paddle.distributed.get_world_size(),\n                min=1.)\n        else:\n            
assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.)\n        loss_cls /= assigned_scores_sum\n\n        loss_iou, loss_dfl = self._bbox_loss(pred_angle, pred_bboxes,\n                                             anchor_points, assigned_labels,\n                                             assigned_bboxes, assigned_scores,\n                                             assigned_scores_sum, stride_tensor)\n\n        loss = self.loss_weight['class'] * loss_cls + \\\n               self.loss_weight['iou'] * loss_iou + \\\n               self.loss_weight['dfl'] * loss_dfl\n        out_dict = {\n            'loss': loss,\n            'loss_cls': loss_cls,\n            'loss_iou': loss_iou,\n            'loss_dfl': loss_dfl\n        }\n        return out_dict\n\n    @staticmethod\n    def _focal_loss(score, label, alpha=0.25, gamma=2.0):\n        weight = (score - label).pow(gamma)\n        if alpha > 0:\n            alpha_t = alpha * label + (1 - alpha) * (1 - label)\n            weight *= alpha_t\n        loss = F.binary_cross_entropy(\n            score, label, weight=weight, reduction='sum')\n        return loss\n\n    @staticmethod\n    def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0):\n        weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label\n        loss = F.binary_cross_entropy(\n            pred_score, gt_score, weight=weight, reduction='sum')\n        return loss\n\n    @staticmethod\n    def _df_loss(pred_dist, target):\n        target_left = paddle.cast(target, 'int64')\n        target_right = target_left + 1\n        weight_left = target_right.astype('float32') - target\n        weight_right = 1 - weight_left\n        loss_left = F.cross_entropy(\n            pred_dist, target_left, reduction='none') * weight_left\n        loss_right = F.cross_entropy(\n            pred_dist, target_right, reduction='none') * weight_right\n        return (loss_left + loss_right).mean(-1, keepdim=True)\n\n    def _bbox_loss(self, pred_angle, pred_bboxes, anchor_points,\n                   assigned_labels, assigned_bboxes, assigned_scores,\n                   assigned_scores_sum, stride_tensor):\n        # select positive samples mask\n        mask_positive = (assigned_labels != self.num_classes)\n        num_pos = mask_positive.sum()\n        # pos/neg loss\n        if num_pos > 0:\n            # iou\n            bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 5])\n            pred_bboxes_pos = paddle.masked_select(pred_bboxes,\n                                                   bbox_mask).reshape([-1, 5])\n            assigned_bboxes_pos = paddle.masked_select(\n                assigned_bboxes, bbox_mask).reshape([-1, 5])\n            bbox_weight = paddle.masked_select(\n                assigned_scores.sum(-1), mask_positive).reshape([-1])\n\n            loss_iou = self.iou_loss(pred_bboxes_pos,\n                                     assigned_bboxes_pos) * bbox_weight\n            loss_iou = loss_iou.sum() / assigned_scores_sum\n\n            # dfl\n            angle_mask = mask_positive.unsqueeze(-1).tile(\n                [1, 1, self.angle_max + 1])\n            pred_angle_pos = paddle.masked_select(\n                pred_angle, angle_mask).reshape([-1, self.angle_max + 1])\n            assigned_angle_pos = (\n                assigned_bboxes_pos[:, 4] /\n                self.half_pi_bin).clip(0, self.angle_max - 0.01)\n            loss_dfl = self._df_loss(pred_angle_pos, assigned_angle_pos)\n        else:\n            loss_iou = 
pred_bboxes.sum() * 0.\n            loss_dfl = paddle.zeros([1])\n\n        return loss_iou, loss_dfl\n\n    def _box2corners(self, pred_bboxes):\n        \"\"\" convert (x, y, w, h, angle) to (x1, y1, x2, y2, x3, y3, x4, y4)\n\n        Args:\n            pred_bboxes (Tensor): [B, N, 5]\n        \n        Returns:\n            polys (Tensor): [B, N, 8]\n        \"\"\"\n        x, y, w, h, angle = paddle.split(pred_bboxes, 5, axis=-1)\n        cos_a_half = paddle.cos(angle) * 0.5\n        sin_a_half = paddle.sin(angle) * 0.5\n        w_x = cos_a_half * w\n        w_y = sin_a_half * w\n        h_x = -sin_a_half * h\n        h_y = cos_a_half * h\n        return paddle.concat(\n            [\n                x + w_x + h_x, y + w_y + h_y, x - w_x + h_x, y - w_y + h_y,\n                x - w_x - h_x, y - w_y - h_y, x + w_x - h_x, y + w_y - h_y\n            ],\n            axis=-1)\n\n    def post_process(self, head_outs, scale_factor):\n        pred_scores, pred_bboxes = head_outs\n        # [B, N, 5] -> [B, N, 8]\n        pred_bboxes = self._box2corners(pred_bboxes)\n        # scale bbox to origin\n        scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1)\n        scale_factor = paddle.concat(\n            [\n                scale_x, scale_y, scale_x, scale_y, scale_x, scale_y, scale_x,\n                scale_y\n            ],\n            axis=-1).reshape([-1, 1, 8])\n        pred_bboxes /= scale_factor\n        if self.export_onnx:\n            return pred_bboxes, pred_scores, None\n        bbox_pred, bbox_num, nms_keep_idx = self.nms(pred_bboxes,\n                                                           pred_scores)\n        return bbox_pred, bbox_num, nms_keep_idx\n"
  },
  {
    "path": "ppdet/modeling/heads/retina_head.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport math\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import Normal, Constant\nfrom ppdet.modeling.bbox_utils import bbox2delta, delta2bbox\nfrom ppdet.modeling.heads.fcos_head import FCOSFeat\n\nfrom ppdet.core.workspace import register\n\n__all__ = ['RetinaHead']\n\n\n@register\nclass RetinaFeat(FCOSFeat):\n    \"\"\"We use FCOSFeat to construct conv layers in RetinaNet.\n    We rename FCOSFeat to RetinaFeat to avoid confusion.\n    \"\"\"\n    pass\n\n\n@register\nclass RetinaHead(nn.Layer):\n    \"\"\"Used in RetinaNet proposed in paper https://arxiv.org/pdf/1708.02002.pdf\n    \"\"\"\n    __shared__ = ['num_classes']\n    __inject__ = [\n        'conv_feat', 'anchor_generator', 'bbox_assigner', 'loss_class',\n        'loss_bbox', 'nms'\n    ]\n\n    def __init__(self,\n                 num_classes=80,\n                 conv_feat='RetinaFeat',\n                 anchor_generator='RetinaAnchorGenerator',\n                 bbox_assigner='MaxIoUAssigner',\n                 loss_class='FocalLoss',\n                 loss_bbox='SmoothL1Loss',\n                 nms='MultiClassNMS',\n                 prior_prob=0.01,\n                 nms_pre=1000,\n                 weights=[1., 1., 1., 1.]):\n        super(RetinaHead, self).__init__()\n        self.num_classes = num_classes\n        self.conv_feat = conv_feat\n        self.anchor_generator = anchor_generator\n        self.bbox_assigner = bbox_assigner\n        self.loss_class = loss_class\n        self.loss_bbox = loss_bbox\n        self.nms = nms\n        self.nms_pre = nms_pre\n        self.weights = weights\n\n        bias_init_value = -math.log((1 - prior_prob) / prior_prob)\n        num_anchors = self.anchor_generator.num_anchors\n        self.retina_cls = nn.Conv2D(\n            in_channels=self.conv_feat.feat_out,\n            out_channels=self.num_classes * num_anchors,\n            kernel_size=3,\n            stride=1,\n            padding=1,\n            weight_attr=ParamAttr(initializer=Normal(\n                mean=0.0, std=0.01)),\n            bias_attr=ParamAttr(initializer=Constant(value=bias_init_value)))\n        self.retina_reg = nn.Conv2D(\n            in_channels=self.conv_feat.feat_out,\n            out_channels=4 * num_anchors,\n            kernel_size=3,\n            stride=1,\n            padding=1,\n            weight_attr=ParamAttr(initializer=Normal(\n                mean=0.0, std=0.01)),\n            bias_attr=ParamAttr(initializer=Constant(value=0)))\n\n    def forward(self, neck_feats, targets=None):\n        cls_logits_list = []\n        bboxes_reg_list = []\n        for neck_feat in neck_feats:\n            conv_cls_feat, conv_reg_feat = self.conv_feat(neck_feat)\n            
cls_logits = self.retina_cls(conv_cls_feat)\n            bbox_reg = self.retina_reg(conv_reg_feat)\n            cls_logits_list.append(cls_logits)\n            bboxes_reg_list.append(bbox_reg)\n\n        if self.training:\n            return self.get_loss([cls_logits_list, bboxes_reg_list], targets)\n        else:\n            return [cls_logits_list, bboxes_reg_list]\n\n    def get_loss(self, head_outputs, targets):\n        \"\"\"Here we calculate loss for a batch of images.\n        We assign anchors to gts in each image and gather all the assigned\n        positive and negative samples. Then loss is calculated on the gathered\n        samples.\n        \"\"\"\n        cls_logits_list, bboxes_reg_list = head_outputs\n        anchors = self.anchor_generator(cls_logits_list)\n        anchors = paddle.concat(anchors)\n\n        # matches: contain gt_inds\n        # match_labels: -1(ignore), 0(neg) or 1(pos)\n        matches_list, match_labels_list = [], []\n        # assign anchors to gts, no sampling is involved\n        for gt_bbox in targets['gt_bbox']:\n            matches, match_labels = self.bbox_assigner(anchors, gt_bbox)\n            matches_list.append(matches)\n            match_labels_list.append(match_labels)\n\n        # reshape network outputs\n        cls_logits = [\n            _.transpose([0, 2, 3, 1]).reshape([0, -1, self.num_classes])\n            for _ in cls_logits_list\n        ]\n        bboxes_reg = [\n            _.transpose([0, 2, 3, 1]).reshape([0, -1, 4])\n            for _ in bboxes_reg_list\n        ]\n        cls_logits = paddle.concat(cls_logits, axis=1)\n        bboxes_reg = paddle.concat(bboxes_reg, axis=1)\n\n        cls_pred_list, cls_tar_list = [], []\n        reg_pred_list, reg_tar_list = [], []\n        # find and gather preds and targets in each image\n        for matches, match_labels, cls_logit, bbox_reg, gt_bbox, gt_class in \\\n            zip(matches_list, match_labels_list, cls_logits, bboxes_reg,\n                targets['gt_bbox'], targets['gt_class']):\n            pos_mask = (match_labels == 1)\n            neg_mask = (match_labels == 0)\n            chosen_mask = paddle.logical_or(pos_mask, neg_mask)\n\n            gt_class = gt_class.reshape([-1])\n            bg_class = paddle.to_tensor(\n                [self.num_classes], dtype=gt_class.dtype)\n            # a trick to assign num_classes to negative targets\n            gt_class = paddle.concat([gt_class, bg_class], axis=-1)\n            matches = paddle.where(neg_mask,\n                                   paddle.full_like(matches, gt_class.size - 1),\n                                   matches)\n\n            cls_pred = cls_logit[chosen_mask]\n            cls_tar = gt_class[matches[chosen_mask]]\n            reg_pred = bbox_reg[pos_mask].reshape([-1, 4])\n            reg_tar = gt_bbox[matches[pos_mask]].reshape([-1, 4])\n            reg_tar = bbox2delta(anchors[pos_mask], reg_tar, self.weights)\n            cls_pred_list.append(cls_pred)\n            cls_tar_list.append(cls_tar)\n            reg_pred_list.append(reg_pred)\n            reg_tar_list.append(reg_tar)\n        cls_pred = paddle.concat(cls_pred_list)\n        cls_tar = paddle.concat(cls_tar_list)\n        reg_pred = paddle.concat(reg_pred_list)\n        reg_tar = paddle.concat(reg_tar_list)\n\n        avg_factor = max(1.0, reg_pred.shape[0])\n        cls_loss = self.loss_class(\n            cls_pred, cls_tar, reduction='sum') / avg_factor\n\n        if reg_pred.shape[0] == 0:\n            reg_loss = paddle.zeros([])\n         
   reg_loss.stop_gradient = False\n        else:\n            reg_loss = self.loss_bbox(\n                reg_pred, reg_tar, reduction='sum') / avg_factor\n\n        loss = cls_loss + reg_loss\n        out_dict = {\n            'loss_cls': cls_loss,\n            'loss_reg': reg_loss,\n            'loss': loss,\n        }\n        return out_dict\n\n    def get_bboxes_single(self,\n                          anchors,\n                          cls_scores_list,\n                          bbox_preds_list,\n                          im_shape,\n                          scale_factor,\n                          rescale=True):\n        assert len(cls_scores_list) == len(bbox_preds_list)\n        mlvl_bboxes = []\n        mlvl_scores = []\n        for anchor, cls_score, bbox_pred in zip(anchors, cls_scores_list,\n                                                bbox_preds_list):\n            cls_score = cls_score.reshape([-1, self.num_classes])\n            bbox_pred = bbox_pred.reshape([-1, 4])\n            if self.nms_pre is not None and cls_score.shape[0] > self.nms_pre:\n                max_score = cls_score.max(axis=1)\n                _, topk_inds = max_score.topk(self.nms_pre)\n                bbox_pred = bbox_pred.gather(topk_inds)\n                anchor = anchor.gather(topk_inds)\n                cls_score = cls_score.gather(topk_inds)\n            bbox_pred = delta2bbox(bbox_pred, anchor, self.weights).squeeze()\n            mlvl_bboxes.append(bbox_pred)\n            mlvl_scores.append(F.sigmoid(cls_score))\n        mlvl_bboxes = paddle.concat(mlvl_bboxes)\n        mlvl_bboxes = paddle.squeeze(mlvl_bboxes)\n        if rescale:\n            mlvl_bboxes = mlvl_bboxes / paddle.concat(\n                [scale_factor[::-1], scale_factor[::-1]])\n        mlvl_scores = paddle.concat(mlvl_scores)\n        mlvl_scores = mlvl_scores.transpose([1, 0])\n        return mlvl_bboxes, mlvl_scores\n\n    def decode(self, anchors, cls_logits, bboxes_reg, im_shape, scale_factor):\n        batch_bboxes = []\n        batch_scores = []\n        for img_id in range(cls_logits[0].shape[0]):\n            num_lvls = len(cls_logits)\n            cls_scores_list = [cls_logits[i][img_id] for i in range(num_lvls)]\n            bbox_preds_list = [bboxes_reg[i][img_id] for i in range(num_lvls)]\n            bboxes, scores = self.get_bboxes_single(\n                anchors, cls_scores_list, bbox_preds_list, im_shape[img_id],\n                scale_factor[img_id])\n            batch_bboxes.append(bboxes)\n            batch_scores.append(scores)\n        batch_bboxes = paddle.stack(batch_bboxes, axis=0)\n        batch_scores = paddle.stack(batch_scores, axis=0)\n        return batch_bboxes, batch_scores\n\n    def post_process(self, head_outputs, im_shape, scale_factor):\n        cls_logits_list, bboxes_reg_list = head_outputs\n        anchors = self.anchor_generator(cls_logits_list)\n        cls_logits = [_.transpose([0, 2, 3, 1]) for _ in cls_logits_list]\n        bboxes_reg = [_.transpose([0, 2, 3, 1]) for _ in bboxes_reg_list]\n        bboxes, scores = self.decode(anchors, cls_logits, bboxes_reg, im_shape,\n                                     scale_factor)\n\n        bbox_pred, bbox_num, nms_keep_idx = self.nms(bboxes, scores)\n        return bbox_pred, bbox_num, nms_keep_idx\n\n    def get_scores_single(self, cls_scores_list):\n        mlvl_logits = []\n        for cls_score in cls_scores_list:\n            cls_score = cls_score.reshape([-1, self.num_classes])\n            if self.nms_pre is not None and 
cls_score.shape[0] > self.nms_pre:\n                max_score = cls_score.max(axis=1)\n                _, topk_inds = max_score.topk(self.nms_pre)\n                cls_score = cls_score.gather(topk_inds)\n\n            mlvl_logits.append(cls_score)\n\n        mlvl_logits = paddle.concat(mlvl_logits)\n        mlvl_logits = mlvl_logits.transpose([1, 0])\n\n        return mlvl_logits\n\n    def decode_cls_logits(self, cls_logits_list):\n        cls_logits = [_.transpose([0, 2, 3, 1]) for _ in cls_logits_list]\n        batch_logits = []\n        for img_id in range(cls_logits[0].shape[0]):\n            num_lvls = len(cls_logits)\n            cls_scores_list = [cls_logits[i][img_id] for i in range(num_lvls)]\n            logits = self.get_scores_single(cls_scores_list)\n            batch_logits.append(logits)\n        batch_logits = paddle.stack(batch_logits, axis=0)\n        return batch_logits\n\n"
  },
  {
    "path": "ppdet/modeling/heads/roi_extractor.py",
    "content": "#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling import ops\nimport paddle.nn as nn\n\n\ndef _to_list(v):\n    if not isinstance(v, (list, tuple)):\n        return [v]\n    return v\n\n\n@register\nclass RoIAlign(nn.Layer):\n    \"\"\"\n    RoI Align module\n\n    For more details, please refer to the document of roi_align in\n    in https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/vision/ops.py\n\n    Args:\n        resolution (int): The output size, default 14\n        spatial_scale (float): Multiplicative spatial scale factor to translate\n            ROI coords from their input scale to the scale used when pooling.\n            default 0.0625\n        sampling_ratio (int): The number of sampling points in the interpolation\n            grid, default 0\n        canconical_level (int): The referring level of FPN layer with \n            specified level. default 4\n        canonical_size (int): The referring scale of FPN layer with \n            specified scale. default 224\n        start_level (int): The start level of FPN layer to extract RoI feature,\n            default 0\n        end_level (int): The end level of FPN layer to extract RoI feature,\n            default 3\n        aligned (bool): Whether to add offset to rois' coord in roi_align.\n            default false\n    \"\"\"\n\n    def __init__(self,\n                 resolution=14,\n                 spatial_scale=0.0625,\n                 sampling_ratio=0,\n                 canconical_level=4,\n                 canonical_size=224,\n                 start_level=0,\n                 end_level=3,\n                 aligned=False):\n        super(RoIAlign, self).__init__()\n        self.resolution = resolution\n        self.spatial_scale = _to_list(spatial_scale)\n        self.sampling_ratio = sampling_ratio\n        self.canconical_level = canconical_level\n        self.canonical_size = canonical_size\n        self.start_level = start_level\n        self.end_level = end_level\n        self.aligned = False # TODO: npu kernel do not support aligned=True\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'spatial_scale': [1. 
/ i.stride for i in input_shape]}\n\n    def forward(self, feats, roi, rois_num):\n        roi = paddle.concat(roi) if len(roi) > 1 else roi[0]\n        if len(feats) == 1:\n            rois_feat = paddle.vision.ops.roi_align(\n                x=feats[self.start_level],\n                boxes=roi,\n                boxes_num=rois_num,\n                output_size=self.resolution,\n                spatial_scale=self.spatial_scale[0],\n                aligned=self.aligned)\n        else:\n            offset = 2\n            k_min = self.start_level + offset\n            k_max = self.end_level + offset\n            if hasattr(paddle.vision.ops, \"distribute_fpn_proposals\"):\n                distribute_fpn_proposals = getattr(paddle.vision.ops,\n                                                   \"distribute_fpn_proposals\")\n            else:\n                distribute_fpn_proposals = ops.distribute_fpn_proposals\n            rois_dist, restore_index, rois_num_dist = distribute_fpn_proposals(\n                roi,\n                k_min,\n                k_max,\n                self.canconical_level,\n                self.canonical_size,\n                rois_num=rois_num)\n\n            rois_feat_list = []\n            for lvl in range(self.start_level, self.end_level + 1):\n                roi_feat = paddle.vision.ops.roi_align(\n                    x=feats[lvl],\n                    boxes=rois_dist[lvl],\n                    boxes_num=rois_num_dist[lvl],\n                    output_size=self.resolution,\n                    spatial_scale=self.spatial_scale[lvl],\n                    sampling_ratio=self.sampling_ratio,\n                    aligned=self.aligned)\n                rois_feat_list.append(roi_feat)\n            rois_feat_shuffle = paddle.concat(rois_feat_list)\n            rois_feat = paddle.gather(rois_feat_shuffle, restore_index)\n\n        return rois_feat\n"
  },
  {
    "path": "ppdet/modeling/heads/s2anet_head.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# The code is based on https://github.com/csuhan/s2anet/blob/master/mmdet/models/anchor_heads_rotated/s2anet_head.py\n\nimport paddle\nfrom paddle import ParamAttr\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import Normal, Constant\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling.proposal_generator.target_layer import RBoxAssigner\nfrom ppdet.modeling.proposal_generator.anchor_generator import S2ANetAnchorGenerator\nfrom ppdet.modeling.layers import AlignConv\nfrom ..cls_utils import _get_class_default_kwargs\nimport numpy as np\n\n\n@register\nclass S2ANetHead(nn.Layer):\n    \"\"\"\n    S2Anet head\n    Args:\n        stacked_convs (int): number of stacked_convs\n        feat_in (int): input channels of feat\n        feat_out (int): output channels of feat\n        num_classes (int): num_classes\n        anchor_strides (list): stride of anchors\n        anchor_scales (list): scale of anchors\n        anchor_ratios (list): ratios of anchors\n        target_means (list): target_means\n        target_stds (list): target_stds\n        align_conv_type (str): align_conv_type ['Conv', 'AlignConv']\n        align_conv_size (int): kernel size of align_conv\n        use_sigmoid_cls (bool): use sigmoid_cls or not\n        reg_loss_weight (list): loss weight for regression\n    \"\"\"\n    __shared__ = ['num_classes']\n    __inject__ = ['anchor_assign', 'nms']\n\n    def __init__(self,\n                 stacked_convs=2,\n                 feat_in=256,\n                 feat_out=256,\n                 num_classes=15,\n                 anchor_strides=[8, 16, 32, 64, 128],\n                 anchor_scales=[4],\n                 anchor_ratios=[1.0],\n                 target_means=0.0,\n                 target_stds=1.0,\n                 align_conv_type='AlignConv',\n                 align_conv_size=3,\n                 use_sigmoid_cls=True,\n                 anchor_assign=_get_class_default_kwargs(RBoxAssigner),\n                 reg_loss_weight=[1.0, 1.0, 1.0, 1.0, 1.1],\n                 cls_loss_weight=[1.1, 1.05],\n                 reg_loss_type='l1',\n                 nms_pre=2000,\n                 nms='MultiClassNMS'):\n        super(S2ANetHead, self).__init__()\n        self.stacked_convs = stacked_convs\n        self.feat_in = feat_in\n        self.feat_out = feat_out\n        self.anchor_list = None\n        self.anchor_scales = anchor_scales\n        self.anchor_ratios = anchor_ratios\n        self.anchor_strides = anchor_strides\n        self.anchor_strides = paddle.to_tensor(anchor_strides)\n        self.anchor_base_sizes = list(anchor_strides)\n        self.means = paddle.ones(shape=[5]) * target_means\n        self.stds = paddle.ones(shape=[5]) * target_stds\n        assert align_conv_type in ['AlignConv', 'Conv', 'DCN']\n        self.align_conv_type = align_conv_type\n        self.align_conv_size = 
align_conv_size\n\n        self.use_sigmoid_cls = use_sigmoid_cls\n        self.cls_out_channels = num_classes if self.use_sigmoid_cls else num_classes + 1\n        self.sampling = False\n        self.anchor_assign = anchor_assign\n        self.reg_loss_weight = reg_loss_weight\n        self.cls_loss_weight = cls_loss_weight\n        self.alpha = 1.0\n        self.beta = 1.0\n        self.reg_loss_type = reg_loss_type\n        self.nms_pre = nms_pre\n        self.nms = nms\n        self.fake_bbox = paddle.to_tensor(\n            np.array(\n                [[-1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],\n                dtype='float32'))\n        self.fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32'))\n\n        # anchor\n        self.anchor_generators = []\n        for anchor_base in self.anchor_base_sizes:\n            self.anchor_generators.append(\n                S2ANetAnchorGenerator(anchor_base, anchor_scales,\n                                      anchor_ratios))\n\n        self.anchor_generators = nn.LayerList(self.anchor_generators)\n        self.fam_cls_convs = nn.Sequential()\n        self.fam_reg_convs = nn.Sequential()\n\n        for i in range(self.stacked_convs):\n            chan_in = self.feat_in if i == 0 else self.feat_out\n\n            self.fam_cls_convs.add_sublayer(\n                'fam_cls_conv_{}'.format(i),\n                nn.Conv2D(\n                    in_channels=chan_in,\n                    out_channels=self.feat_out,\n                    kernel_size=3,\n                    padding=1,\n                    weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),\n                    bias_attr=ParamAttr(initializer=Constant(0))))\n\n            self.fam_cls_convs.add_sublayer('fam_cls_conv_{}_act'.format(i),\n                                            nn.ReLU())\n\n            self.fam_reg_convs.add_sublayer(\n                'fam_reg_conv_{}'.format(i),\n                nn.Conv2D(\n                    in_channels=chan_in,\n                    out_channels=self.feat_out,\n                    kernel_size=3,\n                    padding=1,\n                    weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),\n                    bias_attr=ParamAttr(initializer=Constant(0))))\n\n            self.fam_reg_convs.add_sublayer('fam_reg_conv_{}_act'.format(i),\n                                            nn.ReLU())\n\n        self.fam_reg = nn.Conv2D(\n            self.feat_out,\n            5,\n            1,\n            weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),\n            bias_attr=ParamAttr(initializer=Constant(0)))\n        prior_prob = 0.01\n        bias_init = float(-np.log((1 - prior_prob) / prior_prob))\n        self.fam_cls = nn.Conv2D(\n            self.feat_out,\n            self.cls_out_channels,\n            1,\n            weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),\n            bias_attr=ParamAttr(initializer=Constant(bias_init)))\n\n        if self.align_conv_type == \"AlignConv\":\n            self.align_conv = AlignConv(self.feat_out, self.feat_out,\n                                        self.align_conv_size)\n        elif self.align_conv_type == \"Conv\":\n            self.align_conv = nn.Conv2D(\n                self.feat_out,\n                self.feat_out,\n                self.align_conv_size,\n                padding=(self.align_conv_size - 1) // 2,\n                bias_attr=ParamAttr(initializer=Constant(0)))\n\n        elif self.align_conv_type == \"DCN\":\n            self.align_conv_offset = 
nn.Conv2D(\n                self.feat_out,\n                2 * self.align_conv_size**2,\n                1,\n                weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),\n                bias_attr=ParamAttr(initializer=Constant(0)))\n\n            self.align_conv = paddle.vision.ops.DeformConv2D(\n                self.feat_out,\n                self.feat_out,\n                self.align_conv_size,\n                padding=(self.align_conv_size - 1) // 2,\n                weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),\n                bias_attr=False)\n\n        self.or_conv = nn.Conv2D(\n            self.feat_out,\n            self.feat_out,\n            kernel_size=3,\n            padding=1,\n            weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),\n            bias_attr=ParamAttr(initializer=Constant(0)))\n\n        # ODM\n        self.odm_cls_convs = nn.Sequential()\n        self.odm_reg_convs = nn.Sequential()\n\n        for i in range(self.stacked_convs):\n            ch_in = self.feat_out\n            # ch_in = int(self.feat_out / 8) if i == 0 else self.feat_out\n\n            self.odm_cls_convs.add_sublayer(\n                'odm_cls_conv_{}'.format(i),\n                nn.Conv2D(\n                    in_channels=ch_in,\n                    out_channels=self.feat_out,\n                    kernel_size=3,\n                    stride=1,\n                    padding=1,\n                    weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),\n                    bias_attr=ParamAttr(initializer=Constant(0))))\n\n            self.odm_cls_convs.add_sublayer('odm_cls_conv_{}_act'.format(i),\n                                            nn.ReLU())\n\n            self.odm_reg_convs.add_sublayer(\n                'odm_reg_conv_{}'.format(i),\n                nn.Conv2D(\n                    in_channels=self.feat_out,\n                    out_channels=self.feat_out,\n                    kernel_size=3,\n                    stride=1,\n                    padding=1,\n                    weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),\n                    bias_attr=ParamAttr(initializer=Constant(0))))\n\n            self.odm_reg_convs.add_sublayer('odm_reg_conv_{}_act'.format(i),\n                                            nn.ReLU())\n\n        self.odm_cls = nn.Conv2D(\n            self.feat_out,\n            self.cls_out_channels,\n            3,\n            padding=1,\n            weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),\n            bias_attr=ParamAttr(initializer=Constant(bias_init)))\n        self.odm_reg = nn.Conv2D(\n            self.feat_out,\n            5,\n            3,\n            padding=1,\n            weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),\n            bias_attr=ParamAttr(initializer=Constant(0)))\n\n    def forward(self, feats, targets=None):\n        fam_reg_list, fam_cls_list = [], []\n        odm_reg_list, odm_cls_list = [], []\n        num_anchors_list, base_anchors_list, refine_anchors_list = [], [], []\n\n        for i, feat in enumerate(feats):\n            # get shape\n            B = feat.shape[0]\n            H, W = feat.shape[2], feat.shape[3]\n\n            NA = H * W\n            num_anchors_list.append(NA)\n\n            fam_cls_feat = self.fam_cls_convs(feat)\n            fam_cls = self.fam_cls(fam_cls_feat)\n            # [N, CLS, H, W] --> [N, H, W, CLS]\n            fam_cls = fam_cls.transpose([0, 2, 3, 1]).reshape(\n                [B, NA, self.cls_out_channels])\n            
fam_cls_list.append(fam_cls)\n\n            fam_reg_feat = self.fam_reg_convs(feat)\n            fam_reg = self.fam_reg(fam_reg_feat)\n            # [N, 5, H, W] --> [N, H, W, 5]\n            fam_reg = fam_reg.transpose([0, 2, 3, 1]).reshape([B, NA, 5])\n            fam_reg_list.append(fam_reg)\n\n            # prepare anchor\n            init_anchors = self.anchor_generators[i]((H, W),\n                                                     self.anchor_strides[i])\n            init_anchors = init_anchors.reshape([1, NA, 5])\n            base_anchors_list.append(init_anchors.squeeze(0))\n\n            if self.training:\n                refine_anchor = self.bbox_decode(fam_reg.detach(), init_anchors)\n            else:\n                refine_anchor = self.bbox_decode(fam_reg, init_anchors)\n\n            refine_anchors_list.append(refine_anchor)\n\n            if self.align_conv_type == 'AlignConv':\n                align_feat = self.align_conv(feat,\n                                             refine_anchor.clone(), (H, W),\n                                             self.anchor_strides[i])\n            elif self.align_conv_type == 'DCN':\n                align_offset = self.align_conv_offset(feat)\n                align_feat = self.align_conv(feat, align_offset)\n            elif self.align_conv_type == 'Conv':\n                align_feat = self.align_conv(feat)\n\n            or_feat = self.or_conv(align_feat)\n            odm_reg_feat = or_feat\n            odm_cls_feat = or_feat\n\n            odm_reg_feat = self.odm_reg_convs(odm_reg_feat)\n            odm_cls_feat = self.odm_cls_convs(odm_cls_feat)\n\n            odm_cls = self.odm_cls(odm_cls_feat)\n            # [N, CLS, H, W] --> [N, H, W, CLS]\n            odm_cls = odm_cls.transpose([0, 2, 3, 1]).reshape(\n                [B, NA, self.cls_out_channels])\n            odm_cls_list.append(odm_cls)\n\n            odm_reg = self.odm_reg(odm_reg_feat)\n            # [N, 5, H, W] --> [N, H, W, 5]\n            odm_reg = odm_reg.transpose([0, 2, 3, 1]).reshape([B, NA, 5])\n            odm_reg_list.append(odm_reg)\n\n        if self.training:\n            return self.get_loss([\n                fam_cls_list, fam_reg_list, odm_cls_list, odm_reg_list,\n                num_anchors_list, base_anchors_list, refine_anchors_list\n            ], targets)\n        else:\n            odm_bboxes_list = []\n            for odm_reg, refine_anchor in zip(odm_reg_list,\n                                              refine_anchors_list):\n                odm_bboxes = self.bbox_decode(odm_reg, refine_anchor)\n                odm_bboxes_list.append(odm_bboxes)\n            return [odm_bboxes_list, odm_cls_list]\n\n    def get_bboxes(self, head_outs):\n        pred_bboxes_list, pred_scores_list = head_outs\n        batch = pred_scores_list[0].shape[0]\n        bboxes, bbox_num = [], []\n        for i in range(batch):\n            pred_scores_per_image = [t[i] for t in pred_scores_list]\n            pred_bboxes_per_image = [t[i] for t in pred_bboxes_list]\n            bbox_per_image, bbox_num_per_image = self.get_bboxes_single(\n                pred_scores_per_image, pred_bboxes_per_image)\n            bboxes.append(bbox_per_image)\n            bbox_num.append(bbox_num_per_image)\n\n        bboxes = paddle.concat(bboxes)\n        bbox_num = paddle.concat(bbox_num)\n        return bboxes, bbox_num\n\n    def get_pred(self, bboxes, bbox_num, im_shape, scale_factor):\n        \"\"\"\n        Rescale, clip and filter the bbox from the output of NMS to\n        
get final prediction.\n        Args:\n            bboxes (Tensor): bboxes [N, 10]\n            bbox_num (Tensor): the number of kept bboxes in each image\n            im_shape (Tensor): [1, 2]\n            scale_factor (Tensor): [1, 2]\n        Returns:\n            bbox_pred (Tensor): The output prediction with shape [N, 10],\n                                including label, score and the 8 polygon\n                                coordinates. The coordinates correspond to\n                                the original image.\n        \"\"\"\n        origin_shape = paddle.floor(im_shape / scale_factor + 0.5)\n\n        origin_shape_list = []\n        scale_factor_list = []\n        # scale_factor: scale_y, scale_x\n        for i in range(bbox_num.shape[0]):\n            expand_shape = paddle.expand(origin_shape[i:i + 1, :],\n                                         [bbox_num[i], 2])\n            scale_y, scale_x = scale_factor[i, 0:1], scale_factor[i, 1:2]\n            scale = paddle.concat([\n                scale_x, scale_y, scale_x, scale_y, scale_x, scale_y, scale_x,\n                scale_y\n            ])\n            expand_scale = paddle.expand(scale, [bbox_num[i], 8])\n            origin_shape_list.append(expand_shape)\n            scale_factor_list.append(expand_scale)\n\n        origin_shape_list = paddle.concat(origin_shape_list)\n        scale_factor_list = paddle.concat(scale_factor_list)\n\n        # bboxes: [N, 10], label, score, bbox\n        pred_label_score = bboxes[:, 0:2]\n        pred_bbox = bboxes[:, 2:]\n\n        # rescale bbox to original image\n        pred_bbox = pred_bbox.reshape([-1, 8])\n        scaled_bbox = pred_bbox / scale_factor_list\n        origin_h = origin_shape_list[:, 0]\n        origin_w = origin_shape_list[:, 1]\n\n        bboxes = scaled_bbox\n        zeros = paddle.zeros_like(origin_h)\n        x1 = paddle.maximum(paddle.minimum(bboxes[:, 0], origin_w - 1), zeros)\n        y1 = paddle.maximum(paddle.minimum(bboxes[:, 1], origin_h - 1), zeros)\n        x2 = paddle.maximum(paddle.minimum(bboxes[:, 2], origin_w - 1), zeros)\n        y2 = paddle.maximum(paddle.minimum(bboxes[:, 3], origin_h - 1), zeros)\n        x3 = paddle.maximum(paddle.minimum(bboxes[:, 4], origin_w - 1), zeros)\n        y3 = paddle.maximum(paddle.minimum(bboxes[:, 5], origin_h - 1), zeros)\n        x4 = paddle.maximum(paddle.minimum(bboxes[:, 6], origin_w - 1), zeros)\n        y4 = paddle.maximum(paddle.minimum(bboxes[:, 7], origin_h - 1), zeros)\n        pred_bbox = paddle.stack([x1, y1, x2, y2, x3, y3, x4, y4], axis=-1)\n        pred_result = paddle.concat([pred_label_score, pred_bbox], axis=1)\n        return pred_result\n\n    def get_bboxes_single(self, cls_score_list, bbox_pred_list):\n        mlvl_bboxes = []\n        mlvl_scores = []\n\n        for cls_score, bbox_pred in zip(cls_score_list, bbox_pred_list):\n            if self.use_sigmoid_cls:\n                scores = F.sigmoid(cls_score)\n            else:\n                scores = F.softmax(cls_score, axis=-1)\n\n            if scores.shape[0] > self.nms_pre:\n                # Get maximum scores for foreground classes.\n                if self.use_sigmoid_cls:\n                    max_scores = paddle.max(scores, axis=1)\n                else:\n                    max_scores = paddle.max(scores[:, :-1], axis=1)\n\n                topk_val, topk_inds = paddle.topk(max_scores, self.nms_pre)\n                bbox_pred = paddle.gather(bbox_pred, topk_inds)\n                scores = paddle.gather(scores, topk_inds)\n\n            mlvl_bboxes.append(bbox_pred)\n            
mlvl_scores.append(scores)\n\n        mlvl_bboxes = paddle.concat(mlvl_bboxes)\n        mlvl_scores = paddle.concat(mlvl_scores)\n\n        mlvl_polys = self.rbox2poly(mlvl_bboxes).unsqueeze(0)\n        mlvl_scores = paddle.transpose(mlvl_scores, [1, 0]).unsqueeze(0)\n\n        bbox, bbox_num, _ = self.nms(mlvl_polys, mlvl_scores)\n        if bbox.shape[0] <= 0:\n            bbox = self.fake_bbox\n            bbox_num = self.fake_bbox_num\n\n        return bbox, bbox_num\n\n    def smooth_l1_loss(self, pred, label, delta=1.0 / 9.0):\n        \"\"\"Smooth L1 loss.\n        Args:\n            pred: predicted values, same shape as label\n            label: regression targets\n            delta: transition point between the L2 and L1 parts of the loss\n        Returns: elementwise smooth L1 loss\n        \"\"\"\n        assert pred.shape == label.shape and label.numel() > 0\n        assert delta > 0\n        diff = paddle.abs(pred - label)\n        loss = paddle.where(diff < delta, 0.5 * diff * diff / delta,\n                            diff - 0.5 * delta)\n        return loss\n\n    def get_fam_loss(self, fam_target, s2anet_head_out, reg_loss_type='l1'):\n        (labels, label_weights, bbox_targets, bbox_weights, bbox_gt_bboxes,\n         pos_inds, neg_inds) = fam_target\n        fam_cls_branch_list, fam_reg_branch_list, odm_cls_branch_list, odm_reg_branch_list, num_anchors_list = s2anet_head_out\n\n        fam_cls_losses = []\n        fam_bbox_losses = []\n        st_idx = 0\n        num_total_samples = len(pos_inds) + len(\n            neg_inds) if self.sampling else len(pos_inds)\n        num_total_samples = max(1, num_total_samples)\n\n        for idx, feat_anchor_num in enumerate(num_anchors_list):\n            # step1:  get data\n            feat_labels = labels[st_idx:st_idx + feat_anchor_num]\n            feat_label_weights = label_weights[st_idx:st_idx + feat_anchor_num]\n\n            feat_bbox_targets = bbox_targets[st_idx:st_idx + feat_anchor_num, :]\n            feat_bbox_weights = bbox_weights[st_idx:st_idx + feat_anchor_num, :]\n\n            # step2: calc cls loss\n            feat_labels = feat_labels.reshape(-1)\n            feat_label_weights = feat_label_weights.reshape(-1)\n\n            fam_cls_score = fam_cls_branch_list[idx]\n            fam_cls_score = paddle.squeeze(fam_cls_score, axis=0)\n            fam_cls_score1 = fam_cls_score\n\n            feat_labels = paddle.to_tensor(feat_labels)\n            feat_labels_one_hot = paddle.nn.functional.one_hot(\n                feat_labels, self.cls_out_channels + 1)\n            feat_labels_one_hot = feat_labels_one_hot[:, 1:]\n            feat_labels_one_hot.stop_gradient = True\n\n            num_total_samples = paddle.to_tensor(\n                num_total_samples, dtype='float32', stop_gradient=True)\n\n            fam_cls = F.sigmoid_focal_loss(\n                fam_cls_score1,\n                feat_labels_one_hot,\n                normalizer=num_total_samples,\n                reduction='none')\n\n            feat_label_weights = feat_label_weights.reshape(\n                feat_label_weights.shape[0], 1)\n            feat_label_weights = np.repeat(\n                feat_label_weights, self.cls_out_channels, axis=1)\n            feat_label_weights = paddle.to_tensor(\n                feat_label_weights, stop_gradient=True)\n\n            fam_cls = fam_cls * feat_label_weights\n            fam_cls_total = paddle.sum(fam_cls)\n            fam_cls_losses.append(fam_cls_total)\n\n            # step3: regression loss\n            feat_bbox_targets = paddle.to_tensor(\n                feat_bbox_targets, dtype='float32', stop_gradient=True)\n         
   feat_bbox_targets = paddle.reshape(feat_bbox_targets, [-1, 5])\n\n            fam_bbox_pred = fam_reg_branch_list[idx]\n            fam_bbox_pred = paddle.squeeze(fam_bbox_pred, axis=0)\n            fam_bbox_pred = paddle.reshape(fam_bbox_pred, [-1, 5])\n            fam_bbox = self.smooth_l1_loss(fam_bbox_pred, feat_bbox_targets)\n            loss_weight = paddle.to_tensor(\n                self.reg_loss_weight, dtype='float32', stop_gradient=True)\n            fam_bbox = paddle.multiply(fam_bbox, loss_weight)\n            feat_bbox_weights = paddle.to_tensor(\n                feat_bbox_weights, stop_gradient=True)\n\n            fam_bbox = fam_bbox * feat_bbox_weights\n            fam_bbox_total = paddle.sum(fam_bbox) / num_total_samples\n            fam_bbox_losses.append(fam_bbox_total)\n            st_idx += feat_anchor_num\n\n        fam_cls_loss = paddle.add_n(fam_cls_losses)\n        fam_cls_loss_weight = paddle.to_tensor(\n            self.cls_loss_weight[0], dtype='float32', stop_gradient=True)\n        fam_cls_loss = fam_cls_loss * fam_cls_loss_weight\n        fam_reg_loss = paddle.add_n(fam_bbox_losses)\n        return fam_cls_loss, fam_reg_loss\n\n    def get_odm_loss(self, odm_target, s2anet_head_out, reg_loss_type='l1'):\n        (labels, label_weights, bbox_targets, bbox_weights, bbox_gt_bboxes,\n         pos_inds, neg_inds) = odm_target\n        fam_cls_branch_list, fam_reg_branch_list, odm_cls_branch_list, odm_reg_branch_list, num_anchors_list = s2anet_head_out\n\n        odm_cls_losses = []\n        odm_bbox_losses = []\n        st_idx = 0\n        num_total_samples = len(pos_inds) + len(\n            neg_inds) if self.sampling else len(pos_inds)\n        num_total_samples = max(1, num_total_samples)\n\n        for idx, feat_anchor_num in enumerate(num_anchors_list):\n            # step1:  get data\n            feat_labels = labels[st_idx:st_idx + feat_anchor_num]\n            feat_label_weights = label_weights[st_idx:st_idx + feat_anchor_num]\n\n            feat_bbox_targets = bbox_targets[st_idx:st_idx + feat_anchor_num, :]\n            feat_bbox_weights = bbox_weights[st_idx:st_idx + feat_anchor_num, :]\n\n            # step2: calc cls loss\n            feat_labels = feat_labels.reshape(-1)\n            feat_label_weights = feat_label_weights.reshape(-1)\n\n            odm_cls_score = odm_cls_branch_list[idx]\n            odm_cls_score = paddle.squeeze(odm_cls_score, axis=0)\n            odm_cls_score1 = odm_cls_score\n\n            feat_labels = paddle.to_tensor(feat_labels)\n            feat_labels_one_hot = paddle.nn.functional.one_hot(\n                feat_labels, self.cls_out_channels + 1)\n            feat_labels_one_hot = feat_labels_one_hot[:, 1:]\n            feat_labels_one_hot.stop_gradient = True\n\n            num_total_samples = paddle.to_tensor(\n                num_total_samples, dtype='float32', stop_gradient=True)\n            odm_cls = F.sigmoid_focal_loss(\n                odm_cls_score1,\n                feat_labels_one_hot,\n                normalizer=num_total_samples,\n                reduction='none')\n\n            feat_label_weights = feat_label_weights.reshape(\n                feat_label_weights.shape[0], 1)\n            feat_label_weights = np.repeat(\n                feat_label_weights, self.cls_out_channels, axis=1)\n            feat_label_weights = paddle.to_tensor(feat_label_weights)\n            feat_label_weights.stop_gradient = True\n\n            odm_cls = odm_cls * feat_label_weights\n            odm_cls_total = 
paddle.sum(odm_cls)\n            odm_cls_losses.append(odm_cls_total)\n\n            # step3: regression loss\n            feat_bbox_targets = paddle.to_tensor(\n                feat_bbox_targets, dtype='float32')\n            feat_bbox_targets = paddle.reshape(feat_bbox_targets, [-1, 5])\n            feat_bbox_targets.stop_gradient = True\n\n            odm_bbox_pred = odm_reg_branch_list[idx]\n            odm_bbox_pred = paddle.squeeze(odm_bbox_pred, axis=0)\n            odm_bbox_pred = paddle.reshape(odm_bbox_pred, [-1, 5])\n            odm_bbox = self.smooth_l1_loss(odm_bbox_pred, feat_bbox_targets)\n\n            loss_weight = paddle.to_tensor(\n                self.reg_loss_weight, dtype='float32', stop_gradient=True)\n            odm_bbox = paddle.multiply(odm_bbox, loss_weight)\n            feat_bbox_weights = paddle.to_tensor(\n                feat_bbox_weights, stop_gradient=True)\n\n            odm_bbox = odm_bbox * feat_bbox_weights\n            odm_bbox_total = paddle.sum(odm_bbox) / num_total_samples\n\n            odm_bbox_losses.append(odm_bbox_total)\n            st_idx += feat_anchor_num\n\n        odm_cls_loss = paddle.add_n(odm_cls_losses)\n        odm_cls_loss_weight = paddle.to_tensor(\n            self.cls_loss_weight[1], dtype='float32', stop_gradient=True)\n        odm_cls_loss = odm_cls_loss * odm_cls_loss_weight\n        odm_reg_loss = paddle.add_n(odm_bbox_losses)\n        return odm_cls_loss, odm_reg_loss\n\n    def get_loss(self, head_outs, inputs):\n        fam_cls_list, fam_reg_list, odm_cls_list, odm_reg_list, \\\n            num_anchors_list, base_anchors_list, refine_anchors_list = head_outs\n\n        # compute loss\n        fam_cls_loss_lst = []\n        fam_reg_loss_lst = []\n        odm_cls_loss_lst = []\n        odm_reg_loss_lst = []\n\n        batch = len(inputs['gt_rbox'])\n        for i in range(batch):\n            # data_format: (xc, yc, w, h, theta)\n            gt_mask = inputs['pad_gt_mask'][i, :, 0]\n            gt_idx = paddle.nonzero(gt_mask).squeeze(-1)\n            gt_bboxes = paddle.gather(inputs['gt_rbox'][i], gt_idx).numpy()\n            gt_labels = paddle.gather(inputs['gt_class'][i], gt_idx).numpy()\n            is_crowd = paddle.gather(inputs['is_crowd'][i], gt_idx).numpy()\n            gt_labels = gt_labels + 1\n\n            anchors_per_image = np.concatenate(base_anchors_list)\n\n            fam_cls_per_image = [t[i] for t in fam_cls_list]\n            fam_reg_per_image = [t[i] for t in fam_reg_list]\n            odm_cls_per_image = [t[i] for t in odm_cls_list]\n            odm_reg_per_image = [t[i] for t in odm_reg_list]\n            im_s2anet_head_out = (fam_cls_per_image, fam_reg_per_image,\n                                  odm_cls_per_image, odm_reg_per_image,\n                                  num_anchors_list)\n            # FAM\n            im_fam_target = self.anchor_assign(anchors_per_image, gt_bboxes,\n                                               gt_labels, is_crowd)\n            if im_fam_target is not None:\n                im_fam_cls_loss, im_fam_reg_loss = self.get_fam_loss(\n                    im_fam_target, im_s2anet_head_out, self.reg_loss_type)\n                fam_cls_loss_lst.append(im_fam_cls_loss)\n                fam_reg_loss_lst.append(im_fam_reg_loss)\n\n            # ODM\n            refine_anchors_per_image = [t[i] for t in refine_anchors_list]\n            refine_anchors_per_image = paddle.concat(\n                refine_anchors_per_image).numpy()\n            im_odm_target = 
self.anchor_assign(refine_anchors_per_image,\n                                               gt_bboxes, gt_labels, is_crowd)\n\n            if im_odm_target is not None:\n                im_odm_cls_loss, im_odm_reg_loss = self.get_odm_loss(\n                    im_odm_target, im_s2anet_head_out, self.reg_loss_type)\n                odm_cls_loss_lst.append(im_odm_cls_loss)\n                odm_reg_loss_lst.append(im_odm_reg_loss)\n\n        fam_cls_loss = paddle.add_n(fam_cls_loss_lst) / batch\n        fam_reg_loss = paddle.add_n(fam_reg_loss_lst) / batch\n        odm_cls_loss = paddle.add_n(odm_cls_loss_lst) / batch\n        odm_reg_loss = paddle.add_n(odm_reg_loss_lst) / batch\n        loss = fam_cls_loss + fam_reg_loss + odm_cls_loss + odm_reg_loss\n\n        return {\n            'loss': loss,\n            'fam_cls_loss': fam_cls_loss,\n            'fam_reg_loss': fam_reg_loss,\n            'odm_cls_loss': odm_cls_loss,\n            'odm_reg_loss': odm_reg_loss\n        }\n\n    def bbox_decode(self, preds, anchors, wh_ratio_clip=1e-6):\n        \"\"\"decode bbox from deltas\n        Args:\n            preds: [B, L, 5]\n            anchors: [1, L, 5]\n        return:\n            bboxes: [B, L, 5]\n        \"\"\"\n        preds = paddle.add(paddle.multiply(preds, self.stds), self.means)\n\n        dx, dy, dw, dh, dangle = paddle.split(preds, 5, axis=-1)\n        max_ratio = np.abs(np.log(wh_ratio_clip))\n        dw = paddle.clip(dw, min=-max_ratio, max=max_ratio)\n        dh = paddle.clip(dh, min=-max_ratio, max=max_ratio)\n\n        rroi_x, rroi_y, rroi_w, rroi_h, rroi_angle = paddle.split(\n            anchors, 5, axis=-1)\n\n        gx = dx * rroi_w * paddle.cos(rroi_angle) - dy * rroi_h * paddle.sin(\n            rroi_angle) + rroi_x\n        gy = dx * rroi_w * paddle.sin(rroi_angle) + dy * rroi_h * paddle.cos(\n            rroi_angle) + rroi_y\n        gw = rroi_w * dw.exp()\n        gh = rroi_h * dh.exp()\n        ga = np.pi * dangle + rroi_angle\n        ga = (ga + np.pi / 4) % np.pi - np.pi / 4\n        bboxes = paddle.concat([gx, gy, gw, gh, ga], axis=-1)\n        return bboxes\n\n    def rbox2poly(self, rboxes):\n        \"\"\"\n        rboxes: [x_ctr,y_ctr,w,h,angle]\n        to\n        polys: [x0,y0,x1,y1,x2,y2,x3,y3]\n        \"\"\"\n        N = rboxes.shape[0]\n\n        x_ctr = rboxes[:, 0]\n        y_ctr = rboxes[:, 1]\n        width = rboxes[:, 2]\n        height = rboxes[:, 3]\n        angle = rboxes[:, 4]\n\n        tl_x, tl_y, br_x, br_y = -width * 0.5, -height * 0.5, width * 0.5, height * 0.5\n\n        normal_rects = paddle.stack(\n            [tl_x, br_x, br_x, tl_x, tl_y, tl_y, br_y, br_y], axis=0)\n        normal_rects = paddle.reshape(normal_rects, [2, 4, N])\n        normal_rects = paddle.transpose(normal_rects, [2, 0, 1])\n\n        sin, cos = paddle.sin(angle), paddle.cos(angle)\n        # M: [N,2,2]\n        M = paddle.stack([cos, -sin, sin, cos], axis=0)\n        M = paddle.reshape(M, [2, 2, N])\n        M = paddle.transpose(M, [2, 0, 1])\n\n        # polys: [N,8]\n        polys = paddle.matmul(M, normal_rects)\n        polys = paddle.transpose(polys, [2, 1, 0])\n        polys = paddle.reshape(polys, [-1, N])\n        polys = paddle.transpose(polys, [1, 0])\n\n        tmp = paddle.stack(\n            [x_ctr, y_ctr, x_ctr, y_ctr, x_ctr, y_ctr, x_ctr, y_ctr], axis=1)\n        polys = polys + tmp\n        return polys\n"
  },
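The `rbox2poly` method that closes s2anet_head.py above is pure rotation geometry: stack the four axis-aligned corner offsets, rotate them by the box angle with the 2x2 matrix `M`, and shift by the box center. Below is a minimal NumPy sketch of the same math, handy for sanity-checking decoded rotated boxes offline; the helper name `rbox2poly_np` is hypothetical, not part of ppdet.

# Standalone NumPy version of the rbox2poly math: rotate the four
# axis-aligned corner offsets by the box angle, then shift by the center.
import numpy as np

def rbox2poly_np(rboxes):
    """rboxes: [N, 5] as (x_ctr, y_ctr, w, h, angle) -> polys: [N, 8]."""
    x, y, w, h, ang = (rboxes[:, i] for i in range(5))
    # corner offsets relative to the center, order: tl, tr, br, bl
    xs = np.stack([-w, w, w, -w], axis=1) * 0.5
    ys = np.stack([-h, -h, h, h], axis=1) * 0.5
    cos, sin = np.cos(ang)[:, None], np.sin(ang)[:, None]
    # same 2x2 rotation as M in rbox2poly: [[cos, -sin], [sin, cos]]
    px = cos * xs - sin * ys + x[:, None]
    py = sin * xs + cos * ys + y[:, None]
    return np.stack([px, py], axis=2).reshape(len(rboxes), 8)

# axis-aligned 4x2 box at the origin: corners come back un-rotated
print(rbox2poly_np(np.array([[0., 0., 4., 2., 0.]])))
# -> [[-2. -1.  2. -1.  2.  1. -2.  1.]]

With angle 0 the corners come back in the (tl, tr, br, bl) order that `normal_rects` encodes column by column.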
  {
    "path": "ppdet/modeling/heads/simota_head.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# The code is based on:\n# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/yolox_head.py\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport math\nfrom functools import partial\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import Normal, Constant\n\nfrom ppdet.core.workspace import register\n\nfrom ppdet.modeling.bbox_utils import distance2bbox, bbox2distance\nfrom ppdet.data.transform.atss_assigner import bbox_overlaps\n\nfrom .gfl_head import GFLHead\n\n\n@register\nclass OTAHead(GFLHead):\n    \"\"\"\n    OTAHead\n    Args:\n        conv_feat (object): Instance of 'FCOSFeat'\n        num_classes (int): Number of classes\n        fpn_stride (list): The stride of each FPN Layer\n        prior_prob (float): Used to set the bias init for the class prediction layer\n        loss_qfl (object): Instance of QualityFocalLoss.\n        loss_dfl (object): Instance of DistributionFocalLoss.\n        loss_bbox (object): Instance of bbox loss.\n        assigner (object): Instance of label assigner.\n        reg_max: Max value of integral set :math: `{0, ..., reg_max}`\n                n QFL setting. 
Default: 16.\n    \"\"\"\n    __inject__ = [\n        'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox',\n        'assigner', 'nms'\n    ]\n    __shared__ = ['num_classes']\n\n    def __init__(self,\n                 conv_feat='FCOSFeat',\n                 dgqp_module=None,\n                 num_classes=80,\n                 fpn_stride=[8, 16, 32, 64, 128],\n                 prior_prob=0.01,\n                 loss_class='QualityFocalLoss',\n                 loss_dfl='DistributionFocalLoss',\n                 loss_bbox='GIoULoss',\n                 assigner='SimOTAAssigner',\n                 reg_max=16,\n                 feat_in_chan=256,\n                 nms=None,\n                 nms_pre=1000,\n                 cell_offset=0):\n        super(OTAHead, self).__init__(\n            conv_feat=conv_feat,\n            dgqp_module=dgqp_module,\n            num_classes=num_classes,\n            fpn_stride=fpn_stride,\n            prior_prob=prior_prob,\n            loss_class=loss_class,\n            loss_dfl=loss_dfl,\n            loss_bbox=loss_bbox,\n            reg_max=reg_max,\n            feat_in_chan=feat_in_chan,\n            nms=nms,\n            nms_pre=nms_pre,\n            cell_offset=cell_offset)\n        self.conv_feat = conv_feat\n        self.dgqp_module = dgqp_module\n        self.num_classes = num_classes\n        self.fpn_stride = fpn_stride\n        self.prior_prob = prior_prob\n        self.loss_qfl = loss_class\n        self.loss_dfl = loss_dfl\n        self.loss_bbox = loss_bbox\n        self.reg_max = reg_max\n        self.feat_in_chan = feat_in_chan\n        self.nms = nms\n        self.nms_pre = nms_pre\n        self.cell_offset = cell_offset\n        self.use_sigmoid = self.loss_qfl.use_sigmoid\n\n        self.assigner = assigner\n\n    def _get_target_single(self, flatten_cls_pred, flatten_center_and_stride,\n                           flatten_bbox, gt_bboxes, gt_labels):\n        \"\"\"Compute targets for priors in a single image.\n        \"\"\"\n        pos_num, label, label_weight, bbox_target = self.assigner(\n            F.sigmoid(flatten_cls_pred), flatten_center_and_stride,\n            flatten_bbox, gt_bboxes, gt_labels)\n\n        return (pos_num, label, label_weight, bbox_target)\n\n    def get_loss(self, head_outs, gt_meta):\n        cls_scores, bbox_preds = head_outs\n        num_level_anchors = [\n            featmap.shape[-2] * featmap.shape[-1] for featmap in cls_scores\n        ]\n        num_imgs = gt_meta['im_id'].shape[0]\n        featmap_sizes = [[featmap.shape[-2], featmap.shape[-1]]\n                         for featmap in cls_scores]\n\n        decode_bbox_preds = []\n        center_and_strides = []\n        for featmap_size, stride, bbox_pred in zip(featmap_sizes,\n                                                   self.fpn_stride, bbox_preds):\n\n            # center in origin image\n            yy, xx = self.get_single_level_center_point(featmap_size, stride,\n                                                        self.cell_offset)\n\n            center_and_stride = paddle.stack([xx, yy, stride, stride], -1).tile(\n                [num_imgs, 1, 1])\n            center_and_strides.append(center_and_stride)\n            center_in_feature = center_and_stride.reshape(\n                [-1, 4])[:, :-2] / stride\n            bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape(\n                [num_imgs, -1, 4 * (self.reg_max + 1)])\n            pred_distances = self.distribution_project(bbox_pred)\n            
decode_bbox_pred_wo_stride = distance2bbox(\n                center_in_feature, pred_distances).reshape([num_imgs, -1, 4])\n            decode_bbox_preds.append(decode_bbox_pred_wo_stride * stride)\n\n        flatten_cls_preds = [\n            cls_pred.transpose([0, 2, 3, 1]).reshape(\n                [num_imgs, -1, self.cls_out_channels])\n            for cls_pred in cls_scores\n        ]\n        flatten_cls_preds = paddle.concat(flatten_cls_preds, axis=1)\n        flatten_bboxes = paddle.concat(decode_bbox_preds, axis=1)\n        flatten_center_and_strides = paddle.concat(center_and_strides, axis=1)\n\n        gt_boxes, gt_labels = gt_meta['gt_bbox'], gt_meta['gt_class']\n        pos_num_l, label_l, label_weight_l, bbox_target_l = [], [], [], []\n        for flatten_cls_pred,flatten_center_and_stride,flatten_bbox,gt_box, gt_label \\\n            in zip(flatten_cls_preds.detach(),flatten_center_and_strides.detach(), \\\n                   flatten_bboxes.detach(),gt_boxes, gt_labels):\n            pos_num, label, label_weight, bbox_target = self._get_target_single(\n                flatten_cls_pred, flatten_center_and_stride, flatten_bbox,\n                gt_box, gt_label)\n            pos_num_l.append(pos_num)\n            label_l.append(label)\n            label_weight_l.append(label_weight)\n            bbox_target_l.append(bbox_target)\n\n        labels = paddle.to_tensor(np.stack(label_l, axis=0))\n        label_weights = paddle.to_tensor(np.stack(label_weight_l, axis=0))\n        bbox_targets = paddle.to_tensor(np.stack(bbox_target_l, axis=0))\n\n        center_and_strides_list = self._images_to_levels(\n            flatten_center_and_strides, num_level_anchors)\n        labels_list = self._images_to_levels(labels, num_level_anchors)\n        label_weights_list = self._images_to_levels(label_weights,\n                                                    num_level_anchors)\n        bbox_targets_list = self._images_to_levels(bbox_targets,\n                                                   num_level_anchors)\n        num_total_pos = sum(pos_num_l)\n        try:\n            paddle.distributed.all_reduce(paddle.to_tensor(num_total_pos))\n            num_total_pos = paddle.clip(\n                num_total_pos / paddle.distributed.get_world_size(), min=1.)\n        except:\n            num_total_pos = max(num_total_pos, 1)\n\n        loss_bbox_list, loss_dfl_list, loss_qfl_list, avg_factor = [], [], [], []\n        for cls_score, bbox_pred, center_and_strides, labels, label_weights, bbox_targets, stride in zip(\n                cls_scores, bbox_preds, center_and_strides_list, labels_list,\n                label_weights_list, bbox_targets_list, self.fpn_stride):\n            center_and_strides = center_and_strides.reshape([-1, 4])\n            cls_score = cls_score.transpose([0, 2, 3, 1]).reshape(\n                [-1, self.cls_out_channels])\n            bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape(\n                [-1, 4 * (self.reg_max + 1)])\n            bbox_targets = bbox_targets.reshape([-1, 4])\n            labels = labels.reshape([-1])\n            label_weights = label_weights.reshape([-1])\n\n            bg_class_ind = self.num_classes\n            pos_inds = paddle.nonzero(\n                paddle.logical_and((labels >= 0), (labels < bg_class_ind)),\n                as_tuple=False).squeeze(1)\n            score = np.zeros(labels.shape)\n\n            if len(pos_inds) > 0:\n                pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0)\n              
  pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0)\n                pos_centers = paddle.gather(\n                    center_and_strides[:, :-2], pos_inds, axis=0) / stride\n\n                weight_targets = F.sigmoid(cls_score.detach())\n                weight_targets = paddle.gather(\n                    weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0)\n                pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred)\n                pos_decode_bbox_pred = distance2bbox(pos_centers,\n                                                     pos_bbox_pred_corners)\n                pos_decode_bbox_targets = pos_bbox_targets / stride\n                bbox_iou = bbox_overlaps(\n                    pos_decode_bbox_pred.detach().numpy(),\n                    pos_decode_bbox_targets.detach().numpy(),\n                    is_aligned=True)\n                score[pos_inds.numpy()] = bbox_iou\n\n                pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1])\n                target_corners = bbox2distance(pos_centers,\n                                               pos_decode_bbox_targets,\n                                               self.reg_max).reshape([-1])\n                # regression loss\n                loss_bbox = paddle.sum(\n                    self.loss_bbox(pos_decode_bbox_pred,\n                                   pos_decode_bbox_targets) * weight_targets)\n\n                # dfl loss\n                loss_dfl = self.loss_dfl(\n                    pred_corners,\n                    target_corners,\n                    weight=weight_targets.expand([-1, 4]).reshape([-1]),\n                    avg_factor=4.0)\n            else:\n                loss_bbox = bbox_pred.sum() * 0\n                loss_dfl = bbox_pred.sum() * 0\n                weight_targets = paddle.to_tensor([0], dtype='float32')\n\n            # qfl loss\n            score = paddle.to_tensor(score)\n            loss_qfl = self.loss_qfl(\n                cls_score, (labels, score),\n                weight=label_weights,\n                avg_factor=num_total_pos)\n            loss_bbox_list.append(loss_bbox)\n            loss_dfl_list.append(loss_dfl)\n            loss_qfl_list.append(loss_qfl)\n            avg_factor.append(weight_targets.sum())\n\n        avg_factor = sum(avg_factor)\n        try:\n            paddle.distributed.all_reduce(paddle.to_tensor(avg_factor))\n            avg_factor = paddle.clip(\n                avg_factor / paddle.distributed.get_world_size(), min=1)\n        except:\n            avg_factor = max(avg_factor.item(), 1)\n        if avg_factor <= 0:\n            loss_qfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False)\n            loss_bbox = paddle.to_tensor(\n                0, dtype='float32', stop_gradient=False)\n            loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False)\n        else:\n            losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list))\n            losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list))\n            loss_qfl = sum(loss_qfl_list)\n            loss_bbox = sum(losses_bbox)\n            loss_dfl = sum(losses_dfl)\n\n        loss_states = dict(\n            loss_qfl=loss_qfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl)\n\n        return loss_states\n\n\n@register\nclass OTAVFLHead(OTAHead):\n    __inject__ = [\n        'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox',\n        'assigner', 'nms'\n    ]\n    __shared__ = ['num_classes']\n\n    def 
__init__(self,\n                 conv_feat='FCOSFeat',\n                 dgqp_module=None,\n                 num_classes=80,\n                 fpn_stride=[8, 16, 32, 64, 128],\n                 prior_prob=0.01,\n                 loss_class='VarifocalLoss',\n                 loss_dfl='DistributionFocalLoss',\n                 loss_bbox='GIoULoss',\n                 assigner='SimOTAAssigner',\n                 reg_max=16,\n                 feat_in_chan=256,\n                 nms=None,\n                 nms_pre=1000,\n                 cell_offset=0):\n        super(OTAVFLHead, self).__init__(\n            conv_feat=conv_feat,\n            dgqp_module=dgqp_module,\n            num_classes=num_classes,\n            fpn_stride=fpn_stride,\n            prior_prob=prior_prob,\n            loss_class=loss_class,\n            loss_dfl=loss_dfl,\n            loss_bbox=loss_bbox,\n            reg_max=reg_max,\n            feat_in_chan=feat_in_chan,\n            nms=nms,\n            nms_pre=nms_pre,\n            cell_offset=cell_offset)\n        self.conv_feat = conv_feat\n        self.dgqp_module = dgqp_module\n        self.num_classes = num_classes\n        self.fpn_stride = fpn_stride\n        self.prior_prob = prior_prob\n        self.loss_vfl = loss_class\n        self.loss_dfl = loss_dfl\n        self.loss_bbox = loss_bbox\n        self.reg_max = reg_max\n        self.feat_in_chan = feat_in_chan\n        self.nms = nms\n        self.nms_pre = nms_pre\n        self.cell_offset = cell_offset\n        self.use_sigmoid = self.loss_vfl.use_sigmoid\n\n        self.assigner = assigner\n\n    def get_loss(self, head_outs, gt_meta):\n        cls_scores, bbox_preds = head_outs\n        num_level_anchors = [\n            featmap.shape[-2] * featmap.shape[-1] for featmap in cls_scores\n        ]\n        num_imgs = gt_meta['im_id'].shape[0]\n        featmap_sizes = [[featmap.shape[-2], featmap.shape[-1]]\n                         for featmap in cls_scores]\n\n        decode_bbox_preds = []\n        center_and_strides = []\n        for featmap_size, stride, bbox_pred in zip(featmap_sizes,\n                                                   self.fpn_stride, bbox_preds):\n            # center in origin image\n            yy, xx = self.get_single_level_center_point(featmap_size, stride,\n                                                        self.cell_offset)\n            strides = paddle.full((len(xx), ), stride)\n            center_and_stride = paddle.stack([xx, yy, strides, strides],\n                                             -1).tile([num_imgs, 1, 1])\n            center_and_strides.append(center_and_stride)\n            center_in_feature = center_and_stride.reshape(\n                [-1, 4])[:, :-2] / stride\n            bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape(\n                [num_imgs, -1, 4 * (self.reg_max + 1)])\n            pred_distances = self.distribution_project(bbox_pred)\n            decode_bbox_pred_wo_stride = distance2bbox(\n                center_in_feature, pred_distances).reshape([num_imgs, -1, 4])\n            decode_bbox_preds.append(decode_bbox_pred_wo_stride * stride)\n\n        flatten_cls_preds = [\n            cls_pred.transpose([0, 2, 3, 1]).reshape(\n                [num_imgs, -1, self.cls_out_channels])\n            for cls_pred in cls_scores\n        ]\n        flatten_cls_preds = paddle.concat(flatten_cls_preds, axis=1)\n        flatten_bboxes = paddle.concat(decode_bbox_preds, axis=1)\n        flatten_center_and_strides = paddle.concat(center_and_strides, 
axis=1)\n\n        gt_boxes, gt_labels = gt_meta['gt_bbox'], gt_meta['gt_class']\n        pos_num_l, label_l, label_weight_l, bbox_target_l = [], [], [], []\n        for flatten_cls_pred, flatten_center_and_stride, flatten_bbox,gt_box,gt_label \\\n                in zip(flatten_cls_preds.detach(), flatten_center_and_strides.detach(), \\\n                       flatten_bboxes.detach(),gt_boxes,gt_labels):\n            pos_num, label, label_weight, bbox_target = self._get_target_single(\n                flatten_cls_pred, flatten_center_and_stride, flatten_bbox,\n                gt_box, gt_label)\n            pos_num_l.append(pos_num)\n            label_l.append(label)\n            label_weight_l.append(label_weight)\n            bbox_target_l.append(bbox_target)\n\n        labels = paddle.to_tensor(np.stack(label_l, axis=0))\n        label_weights = paddle.to_tensor(np.stack(label_weight_l, axis=0))\n        bbox_targets = paddle.to_tensor(np.stack(bbox_target_l, axis=0))\n\n        center_and_strides_list = self._images_to_levels(\n            flatten_center_and_strides, num_level_anchors)\n        labels_list = self._images_to_levels(labels, num_level_anchors)\n        label_weights_list = self._images_to_levels(label_weights,\n                                                    num_level_anchors)\n        bbox_targets_list = self._images_to_levels(bbox_targets,\n                                                   num_level_anchors)\n        num_total_pos = sum(pos_num_l)\n        try:\n            paddle.distributed.all_reduce(paddle.to_tensor(num_total_pos))\n            num_total_pos = paddle.clip(\n                num_total_pos / paddle.distributed.get_world_size(), min=1.)\n        except:\n            num_total_pos = max(num_total_pos, 1)\n\n        loss_bbox_list, loss_dfl_list, loss_vfl_list, avg_factor = [], [], [], []\n        for cls_score, bbox_pred, center_and_strides, labels, label_weights, bbox_targets, stride in zip(\n                cls_scores, bbox_preds, center_and_strides_list, labels_list,\n                label_weights_list, bbox_targets_list, self.fpn_stride):\n            center_and_strides = center_and_strides.reshape([-1, 4])\n            cls_score = cls_score.transpose([0, 2, 3, 1]).reshape(\n                [-1, self.cls_out_channels])\n            bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape(\n                [-1, 4 * (self.reg_max + 1)])\n            bbox_targets = bbox_targets.reshape([-1, 4])\n            labels = labels.reshape([-1])\n\n            bg_class_ind = self.num_classes\n            pos_inds = paddle.nonzero(\n                paddle.logical_and((labels >= 0), (labels < bg_class_ind)),\n                as_tuple=False).squeeze(1)\n            # vfl\n            vfl_score = np.zeros(cls_score.shape)\n\n            if len(pos_inds) > 0:\n                pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0)\n                pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0)\n                pos_centers = paddle.gather(\n                    center_and_strides[:, :-2], pos_inds, axis=0) / stride\n\n                weight_targets = F.sigmoid(cls_score.detach())\n                weight_targets = paddle.gather(\n                    weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0)\n                pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred)\n                pos_decode_bbox_pred = distance2bbox(pos_centers,\n                                                     pos_bbox_pred_corners)\n                
pos_decode_bbox_targets = pos_bbox_targets / stride\n                bbox_iou = bbox_overlaps(\n                    pos_decode_bbox_pred.detach().numpy(),\n                    pos_decode_bbox_targets.detach().numpy(),\n                    is_aligned=True)\n\n                # vfl\n                pos_labels = paddle.gather(labels, pos_inds, axis=0)\n                vfl_score[pos_inds.numpy(), pos_labels] = bbox_iou\n\n                pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1])\n                target_corners = bbox2distance(pos_centers,\n                                               pos_decode_bbox_targets,\n                                               self.reg_max).reshape([-1])\n                # regression loss\n                loss_bbox = paddle.sum(\n                    self.loss_bbox(pos_decode_bbox_pred,\n                                   pos_decode_bbox_targets) * weight_targets)\n\n                # dfl loss\n                loss_dfl = self.loss_dfl(\n                    pred_corners,\n                    target_corners,\n                    weight=weight_targets.expand([-1, 4]).reshape([-1]),\n                    avg_factor=4.0)\n            else:\n                loss_bbox = bbox_pred.sum() * 0\n                loss_dfl = bbox_pred.sum() * 0\n                weight_targets = paddle.to_tensor([0], dtype='float32')\n\n            # vfl loss\n            num_pos_avg_per_gpu = num_total_pos\n            vfl_score = paddle.to_tensor(vfl_score)\n            loss_vfl = self.loss_vfl(\n                cls_score, vfl_score, avg_factor=num_pos_avg_per_gpu)\n\n            loss_bbox_list.append(loss_bbox)\n            loss_dfl_list.append(loss_dfl)\n            loss_vfl_list.append(loss_vfl)\n            avg_factor.append(weight_targets.sum())\n\n        avg_factor = sum(avg_factor)\n        try:\n            paddle.distributed.all_reduce(paddle.to_tensor(avg_factor))\n            avg_factor = paddle.clip(\n                avg_factor / paddle.distributed.get_world_size(), min=1)\n        except:\n            avg_factor = max(avg_factor.item(), 1)\n        if avg_factor <= 0:\n            loss_vfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False)\n            loss_bbox = paddle.to_tensor(\n                0, dtype='float32', stop_gradient=False)\n            loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False)\n        else:\n            losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list))\n            losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list))\n            loss_vfl = sum(loss_vfl_list)\n            loss_bbox = sum(losses_bbox)\n            loss_dfl = sum(losses_dfl)\n\n        loss_states = dict(\n            loss_vfl=loss_vfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl)\n\n        return loss_states\n"
  },
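Both heads in simota_head.py above decode boxes the same way: `self.distribution_project` (inherited from `GFLHead`) collapses the 4*(reg_max+1) bin logits into four expected distances, and `distance2bbox` turns those distances from a center point into xyxy corners, which the head then scales by the FPN stride. A rough standalone sketch of the two steps follows; both function bodies are illustrative re-implementations under that reading, not the ppdet sources.

# Sketch of the two decoding steps OTAHead.get_loss relies on: the GFL
# integral projection (expectation over reg_max + 1 discrete bins) and
# a distance-to-box conversion around each center point.
import paddle
import paddle.nn.functional as F

REG_MAX = 16  # matches the heads' default reg_max

def project_distribution(bbox_pred):
    """[N, 4 * (REG_MAX + 1)] bin logits -> [N, 4] expected distances."""
    prob = F.softmax(bbox_pred.reshape([-1, 4, REG_MAX + 1]), axis=-1)
    bins = paddle.arange(REG_MAX + 1, dtype='float32')
    return (prob * bins).sum(axis=-1)  # expectation over the bins

def distance2bbox_sketch(points, distances):
    """points: [N, 2] centers; distances: [N, 4] (l, t, r, b) -> xyxy."""
    x1 = points[:, 0] - distances[:, 0]
    y1 = points[:, 1] - distances[:, 1]
    x2 = points[:, 0] + distances[:, 2]
    y2 = points[:, 1] + distances[:, 3]
    return paddle.stack([x1, y1, x2, y2], axis=-1)

logits = paddle.randn([8, 4 * (REG_MAX + 1)])
centers = paddle.rand([8, 2]) * 32  # centers in feature-map coordinates
print(distance2bbox_sketch(centers, project_distribution(logits)).shape)
# [8, 4]; the heads multiply this by the level's stride afterwards

This is why `decode_bbox_pred_wo_stride * stride` appears in both `get_loss` implementations above: decoding happens in feature-map coordinates and is rescaled per FPN level.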
  {
    "path": "ppdet/modeling/heads/solov2_head.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nfrom paddle import ParamAttr\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import Normal, Constant\n\nfrom ppdet.modeling.layers import ConvNormLayer, MaskMatrixNMS, DropBlock\nfrom ppdet.core.workspace import register\n\nfrom six.moves import zip\nimport numpy as np\n\n__all__ = ['SOLOv2Head']\n\n\n@register\nclass SOLOv2MaskHead(nn.Layer):\n    \"\"\"\n    MaskHead of SOLOv2.\n    The code of this function is based on:\n        https://github.com/WXinlong/SOLO/blob/master/mmdet/models/mask_heads/mask_feat_head.py\n\n    Args:\n        in_channels (int): The channel number of input Tensor.\n        out_channels (int): The channel number of output Tensor.\n        start_level (int): The position where the input starts.\n        end_level (int): The position where the input ends.\n        use_dcn_in_tower (bool): Whether to use dcn in tower or not.\n    \"\"\"\n    __shared__ = ['norm_type']\n\n    def __init__(self,\n                 in_channels=256,\n                 mid_channels=128,\n                 out_channels=256,\n                 start_level=0,\n                 end_level=3,\n                 use_dcn_in_tower=False,\n                 norm_type='gn'):\n        super(SOLOv2MaskHead, self).__init__()\n        assert start_level >= 0 and end_level >= start_level\n        self.in_channels = in_channels\n        self.out_channels = out_channels\n        self.mid_channels = mid_channels\n        self.use_dcn_in_tower = use_dcn_in_tower\n        self.range_level = end_level - start_level + 1\n        self.use_dcn = True if self.use_dcn_in_tower else False\n        self.convs_all_levels = []\n        self.norm_type = norm_type\n        for i in range(start_level, end_level + 1):\n            conv_feat_name = 'mask_feat_head.convs_all_levels.{}'.format(i)\n            conv_pre_feat = nn.Sequential()\n            if i == start_level:\n                conv_pre_feat.add_sublayer(\n                    conv_feat_name + '.conv' + str(i),\n                    ConvNormLayer(\n                        ch_in=self.in_channels,\n                        ch_out=self.mid_channels,\n                        filter_size=3,\n                        stride=1,\n                        use_dcn=self.use_dcn,\n                        norm_type=self.norm_type))\n                self.add_sublayer('conv_pre_feat' + str(i), conv_pre_feat)\n                self.convs_all_levels.append(conv_pre_feat)\n            else:\n                for j in range(i):\n                    ch_in = 0\n                    if j == 0:\n                        ch_in = self.in_channels + 2 if i == end_level else self.in_channels\n                    else:\n                        ch_in = self.mid_channels\n                    
conv_pre_feat.add_sublayer(\n                        conv_feat_name + '.conv' + str(j),\n                        ConvNormLayer(\n                            ch_in=ch_in,\n                            ch_out=self.mid_channels,\n                            filter_size=3,\n                            stride=1,\n                            use_dcn=self.use_dcn,\n                            norm_type=self.norm_type))\n                    conv_pre_feat.add_sublayer(\n                        conv_feat_name + '.conv' + str(j) + 'act', nn.ReLU())\n                    conv_pre_feat.add_sublayer(\n                        'upsample' + str(i) + str(j),\n                        nn.Upsample(\n                            scale_factor=2, mode='bilinear'))\n                self.add_sublayer('conv_pre_feat' + str(i), conv_pre_feat)\n                self.convs_all_levels.append(conv_pre_feat)\n\n        conv_pred_name = 'mask_feat_head.conv_pred.0'\n        self.conv_pred = self.add_sublayer(\n            conv_pred_name,\n            ConvNormLayer(\n                ch_in=self.mid_channels,\n                ch_out=self.out_channels,\n                filter_size=1,\n                stride=1,\n                use_dcn=self.use_dcn,\n                norm_type=self.norm_type))\n\n    def forward(self, inputs):\n        \"\"\"\n        Get SOLOv2MaskHead output.\n\n        Args:\n            inputs(list[Tensor]): feature maps from each neck with shape of [N, C, H, W]\n        Returns:\n            ins_pred(Tensor): Output of SOLOv2MaskHead\n        \"\"\"\n        feat_all_level = F.relu(self.convs_all_levels[0](inputs[0]))\n        for i in range(1, self.range_level):\n            input_p = inputs[i]\n            if i == (self.range_level - 1):\n                input_feat = input_p\n                x_range = paddle.linspace(\n                    -1, 1, input_feat.shape[-1], dtype='float32')\n                y_range = paddle.linspace(\n                    -1, 1, input_feat.shape[-2], dtype='float32')\n                y, x = paddle.meshgrid([y_range, x_range])\n                x = paddle.unsqueeze(x, [0, 1])\n                y = paddle.unsqueeze(y, [0, 1])\n                y = paddle.expand(\n                    y, shape=[input_feat.shape[0], 1, -1, -1])\n                x = paddle.expand(\n                    x, shape=[input_feat.shape[0], 1, -1, -1])\n                coord_feat = paddle.concat([x, y], axis=1)\n                input_p = paddle.concat([input_p, coord_feat], axis=1)\n            feat_all_level = paddle.add(feat_all_level,\n                                        self.convs_all_levels[i](input_p))\n        ins_pred = F.relu(self.conv_pred(feat_all_level))\n\n        return ins_pred\n\n\n@register\nclass SOLOv2Head(nn.Layer):\n    \"\"\"\n    Head block for SOLOv2 network\n\n    Args:\n        num_classes (int): Number of output classes.\n        in_channels (int): Number of input channels.\n        seg_feat_channels (int): Num_filters of kernel & category branch convolution operation.\n        stacked_convs (int): Number of stacked convolutions.\n        num_grids (list[int]): List of feature map grid sizes.\n        kernel_out_channels (int): Number of output channels in kernel branch.\n        dcn_v2_stages (list): Which stages use dcn v2 in tower. 
It is between [0, stacked_convs).\n        segm_strides (list[int]): List of segmentation area strides.\n        solov2_loss (object): SOLOv2Loss instance.\n        score_threshold (float): Threshold of category score.\n        mask_nms (object): MaskMatrixNMS instance.\n    \"\"\"\n    __inject__ = ['solov2_loss', 'mask_nms']\n    __shared__ = ['norm_type', 'num_classes']\n\n    def __init__(self,\n                 num_classes=80,\n                 in_channels=256,\n                 seg_feat_channels=256,\n                 stacked_convs=4,\n                 num_grids=[40, 36, 24, 16, 12],\n                 kernel_out_channels=256,\n                 dcn_v2_stages=[],\n                 segm_strides=[8, 8, 16, 32, 32],\n                 solov2_loss=None,\n                 score_threshold=0.1,\n                 mask_threshold=0.5,\n                 mask_nms=None,\n                 norm_type='gn',\n                 drop_block=False):\n        super(SOLOv2Head, self).__init__()\n        self.num_classes = num_classes\n        self.in_channels = in_channels\n        self.seg_num_grids = num_grids\n        self.cate_out_channels = self.num_classes\n        self.seg_feat_channels = seg_feat_channels\n        self.stacked_convs = stacked_convs\n        self.kernel_out_channels = kernel_out_channels\n        self.dcn_v2_stages = dcn_v2_stages\n        self.segm_strides = segm_strides\n        self.solov2_loss = solov2_loss\n        self.mask_nms = mask_nms\n        self.score_threshold = score_threshold\n        self.mask_threshold = mask_threshold\n        self.norm_type = norm_type\n        self.drop_block = drop_block\n\n        self.kernel_pred_convs = []\n        self.cate_pred_convs = []\n        for i in range(self.stacked_convs):\n            use_dcn = True if i in self.dcn_v2_stages else False\n            ch_in = self.in_channels + 2 if i == 0 else self.seg_feat_channels\n            kernel_conv = self.add_sublayer(\n                'bbox_head.kernel_convs.' + str(i),\n                ConvNormLayer(\n                    ch_in=ch_in,\n                    ch_out=self.seg_feat_channels,\n                    filter_size=3,\n                    stride=1,\n                    use_dcn=use_dcn,\n                    norm_type=self.norm_type))\n            self.kernel_pred_convs.append(kernel_conv)\n            ch_in = self.in_channels if i == 0 else self.seg_feat_channels\n            cate_conv = self.add_sublayer(\n                'bbox_head.cate_convs.' 
+ str(i),\n                ConvNormLayer(\n                    ch_in=ch_in,\n                    ch_out=self.seg_feat_channels,\n                    filter_size=3,\n                    stride=1,\n                    use_dcn=use_dcn,\n                    norm_type=self.norm_type))\n            self.cate_pred_convs.append(cate_conv)\n\n        self.solo_kernel = self.add_sublayer(\n            'bbox_head.solo_kernel',\n            nn.Conv2D(\n                self.seg_feat_channels,\n                self.kernel_out_channels,\n                kernel_size=3,\n                stride=1,\n                padding=1,\n                weight_attr=ParamAttr(initializer=Normal(\n                    mean=0., std=0.01)),\n                bias_attr=True))\n        self.solo_cate = self.add_sublayer(\n            'bbox_head.solo_cate',\n            nn.Conv2D(\n                self.seg_feat_channels,\n                self.cate_out_channels,\n                kernel_size=3,\n                stride=1,\n                padding=1,\n                weight_attr=ParamAttr(initializer=Normal(\n                    mean=0., std=0.01)),\n                bias_attr=ParamAttr(initializer=Constant(\n                    value=float(-np.log((1 - 0.01) / 0.01))))))\n\n        if self.drop_block and self.training:\n            self.drop_block_fun = DropBlock(\n                block_size=3, keep_prob=0.9, name='solo_cate.dropblock')\n\n    def _points_nms(self, heat, kernel_size=2):\n        hmax = F.max_pool2d(heat, kernel_size=kernel_size, stride=1, padding=1)\n        keep = paddle.cast((hmax[:, :, :-1, :-1] == heat), 'float32')\n        return heat * keep\n\n    def _split_feats(self, feats):\n        return (F.interpolate(\n            feats[0],\n            scale_factor=0.5,\n            align_corners=False,\n            align_mode=0,\n            mode='bilinear'), feats[1], feats[2], feats[3], F.interpolate(\n                feats[4],\n                size=feats[3].shape[-2:],\n                mode='bilinear',\n                align_corners=False,\n                align_mode=0))\n\n    def forward(self, input):\n        \"\"\"\n        Get SOLOv2 head output\n\n        Args:\n            input (list): List of Tensors, output of backbone or neck stages\n        Returns:\n            cate_pred_list (list): Tensors of each category branch layer\n            kernel_pred_list (list): Tensors of each kernel branch layer\n        \"\"\"\n        feats = self._split_feats(input)\n        cate_pred_list = []\n        kernel_pred_list = []\n        for idx in range(len(self.seg_num_grids)):\n            cate_pred, kernel_pred = self._get_output_single(feats[idx], idx)\n            cate_pred_list.append(cate_pred)\n            kernel_pred_list.append(kernel_pred)\n\n        return cate_pred_list, kernel_pred_list\n\n    def _get_output_single(self, input, idx):\n        ins_kernel_feat = input\n        # CoordConv\n        x_range = paddle.linspace(\n            -1, 1, ins_kernel_feat.shape[-1], dtype='float32')\n        y_range = paddle.linspace(\n            -1, 1, ins_kernel_feat.shape[-2], dtype='float32')\n        y, x = paddle.meshgrid([y_range, x_range])\n        x = paddle.unsqueeze(x, [0, 1])\n        y = paddle.unsqueeze(y, [0, 1])\n        y = paddle.expand(\n            y, shape=[ins_kernel_feat.shape[0], 1, -1, -1])\n        x = paddle.expand(\n            x, shape=[ins_kernel_feat.shape[0], 1, -1, -1])\n        coord_feat = paddle.concat([x, y], axis=1)\n        ins_kernel_feat = paddle.concat([ins_kernel_feat, 
coord_feat], axis=1)\n\n        # kernel branch\n        kernel_feat = ins_kernel_feat\n        seg_num_grid = self.seg_num_grids[idx]\n        kernel_feat = F.interpolate(\n            kernel_feat,\n            size=[seg_num_grid, seg_num_grid],\n            mode='bilinear',\n            align_corners=False,\n            align_mode=0)\n        cate_feat = kernel_feat[:, :-2, :, :]\n\n        for kernel_layer in self.kernel_pred_convs:\n            kernel_feat = F.relu(kernel_layer(kernel_feat))\n        if self.drop_block and self.training:\n            kernel_feat = self.drop_block_fun(kernel_feat)\n        kernel_pred = self.solo_kernel(kernel_feat)\n        # cate branch\n        for cate_layer in self.cate_pred_convs:\n            cate_feat = F.relu(cate_layer(cate_feat))\n        if self.drop_block and self.training:\n            cate_feat = self.drop_block_fun(cate_feat)\n        cate_pred = self.solo_cate(cate_feat)\n\n        if not self.training:\n            cate_pred = self._points_nms(F.sigmoid(cate_pred), kernel_size=2)\n            cate_pred = paddle.transpose(cate_pred, [0, 2, 3, 1])\n        return cate_pred, kernel_pred\n\n    def get_loss(self, cate_preds, kernel_preds, ins_pred, ins_labels,\n                 cate_labels, grid_order_list, fg_num):\n        \"\"\"\n        Get loss of network of SOLOv2.\n\n        Args:\n            cate_preds (list): Tensor list of category branch output.\n            kernel_preds (list): Tensor list of kernel branch output.\n            ins_pred (list): Tensor list of instance branch output.\n            ins_labels (list): List of instance labels per batch.\n            cate_labels (list): List of category labels per batch.\n            grid_order_list (list): List of indexes per grid.\n            fg_num (int): Number of positive samples in a mini-batch.\n        Returns:\n            loss_ins (Tensor): The instance loss Tensor of SOLOv2 network.\n            loss_cate (Tensor): The category loss Tensor of SOLOv2 network.\n        \"\"\"\n        batch_size = grid_order_list[0].shape[0]\n        ins_pred_list = []\n        for kernel_preds_level, grid_orders_level in zip(kernel_preds,\n                                                         grid_order_list):\n            if grid_orders_level.shape[1] == 0:\n                ins_pred_list.append(None)\n                continue\n            grid_orders_level = paddle.reshape(grid_orders_level, [-1])\n            reshape_pred = paddle.reshape(\n                kernel_preds_level,\n                shape=(kernel_preds_level.shape[0],\n                       kernel_preds_level.shape[1], -1))\n            reshape_pred = paddle.transpose(reshape_pred, [0, 2, 1])\n            reshape_pred = paddle.reshape(\n                reshape_pred, shape=(-1, reshape_pred.shape[2]))\n            gathered_pred = paddle.gather(reshape_pred, index=grid_orders_level)\n            gathered_pred = paddle.reshape(\n                gathered_pred,\n                shape=[batch_size, -1, gathered_pred.shape[1]])\n            cur_ins_pred = ins_pred\n            cur_ins_pred = paddle.reshape(\n                cur_ins_pred,\n                shape=(cur_ins_pred.shape[0],\n                       cur_ins_pred.shape[1], -1))\n            ins_pred_conv = paddle.matmul(gathered_pred, cur_ins_pred)\n            cur_ins_pred = paddle.reshape(\n                ins_pred_conv,\n                shape=(-1, ins_pred.shape[-2],\n                       ins_pred.shape[-1]))\n            ins_pred_list.append(cur_ins_pred)\n\n        
num_ins = paddle.sum(fg_num)\n        cate_preds = [\n            paddle.reshape(\n                paddle.transpose(cate_pred, [0, 2, 3, 1]),\n                shape=(-1, self.cate_out_channels)) for cate_pred in cate_preds\n        ]\n        flatten_cate_preds = paddle.concat(cate_preds)\n        new_cate_labels = []\n        for cate_label in cate_labels:\n            new_cate_labels.append(paddle.reshape(cate_label, shape=[-1]))\n        cate_labels = paddle.concat(new_cate_labels)\n\n        loss_ins, loss_cate = self.solov2_loss(\n            ins_pred_list, ins_labels, flatten_cate_preds, cate_labels, num_ins)\n\n        return {'loss_ins': loss_ins, 'loss_cate': loss_cate}\n\n    def get_prediction(self, cate_preds, kernel_preds, seg_pred, im_shape,\n                       scale_factor):\n        \"\"\"\n        Get prediction result of SOLOv2 network\n\n        Args:\n            cate_preds (list): List of Variables, output of category branch.\n            kernel_preds (list): List of Variables, output of kernel branch.\n            seg_pred (list): List of Variables, output of mask head stages.\n            im_shape (Variables): [h, w] for input images.\n            scale_factor (Variables): [scale, scale] for input images.\n        Returns:\n            seg_masks (Tensor): The prediction segmentation.\n            cate_labels (Tensor): The prediction category label of each segmentation.\n            cate_scores (Tensor): The prediction score of each segmentation.\n        \"\"\"\n        num_levels = len(cate_preds)\n        featmap_size = seg_pred.shape[-2:]\n        seg_masks_list = []\n        cate_labels_list = []\n        cate_scores_list = []\n        cate_preds = [cate_pred * 1.0 for cate_pred in cate_preds]\n        kernel_preds = [kernel_pred * 1.0 for kernel_pred in kernel_preds]\n        # Currently only supports batch size == 1\n        for idx in range(1):\n            cate_pred_list = [\n                paddle.reshape(\n                    cate_preds[i][idx], shape=(-1, self.cate_out_channels))\n                for i in range(num_levels)\n            ]\n            seg_pred_list = seg_pred\n            kernel_pred_list = [\n                paddle.reshape(\n                    paddle.transpose(kernel_preds[i][idx], [1, 2, 0]),\n                    shape=(-1, self.kernel_out_channels))\n                for i in range(num_levels)\n            ]\n            cate_pred_list = paddle.concat(cate_pred_list, axis=0)\n            kernel_pred_list = paddle.concat(kernel_pred_list, axis=0)\n\n            seg_masks, cate_labels, cate_scores = self.get_seg_single(\n                cate_pred_list, seg_pred_list, kernel_pred_list, featmap_size,\n                im_shape[idx], scale_factor[idx][0])\n            bbox_num = cate_labels.shape[0:1]\n        return seg_masks, cate_labels, cate_scores, bbox_num\n\n    def get_seg_single(self, cate_preds, seg_preds, kernel_preds, featmap_size,\n                       im_shape, scale_factor):\n        \"\"\"\n        The code of this function is based on:\n            https://github.com/WXinlong/SOLO/blob/master/mmdet/models/anchor_heads/solov2_head.py#L385\n        \"\"\"\n        h = paddle.cast(im_shape[0], 'int32')\n        w = paddle.cast(im_shape[1], 'int32')\n        upsampled_size_out = [featmap_size[0] * 4, featmap_size[1] * 4]\n\n        y = paddle.zeros(shape=cate_preds.shape, dtype='float32')\n        inds = paddle.where(cate_preds > self.score_threshold, cate_preds, y)\n        inds = paddle.nonzero(inds)\n        cate_preds = 
paddle.reshape(cate_preds, shape=[-1])\n        # Prevent empty and increase fake data\n        ind_a = paddle.cast(paddle.shape(kernel_preds)[0:1], 'int64')\n        ind_b = paddle.zeros(shape=[1], dtype='int64')\n        inds_end = paddle.unsqueeze(paddle.concat([ind_a, ind_b]), 0)\n        inds = paddle.concat([inds, inds_end])\n        kernel_preds_end = paddle.ones(\n            shape=[1, self.kernel_out_channels], dtype='float32')\n        kernel_preds = paddle.concat([kernel_preds, kernel_preds_end])\n        cate_preds = paddle.concat(\n            [cate_preds, paddle.zeros(\n                shape=[1], dtype='float32')])\n\n        # cate_labels & kernel_preds\n        cate_labels = inds[:, 1]\n        kernel_preds = paddle.gather(kernel_preds, index=inds[:, 0])\n        cate_score_idx = paddle.add(inds[:, 0] * self.cate_out_channels,\n                                    cate_labels)\n        cate_scores = paddle.gather(cate_preds, index=cate_score_idx)\n\n        size_trans = np.power(self.seg_num_grids, 2)\n        strides = []\n        for _ind in range(len(self.segm_strides)):\n            strides.append(\n                paddle.full(\n                    shape=[int(size_trans[_ind])],\n                    fill_value=self.segm_strides[_ind],\n                    dtype=\"int32\"))\n        strides = paddle.concat(strides)\n        strides = paddle.concat(\n            [strides, paddle.zeros(\n                shape=[1], dtype='int32')])\n        strides = paddle.gather(strides, index=inds[:, 0])\n\n        # mask encoding.\n        kernel_preds = paddle.unsqueeze(kernel_preds, [2, 3])\n        seg_preds = F.conv2d(seg_preds, kernel_preds)\n        seg_preds = F.sigmoid(paddle.squeeze(seg_preds, [0]))\n        seg_masks = seg_preds > self.mask_threshold\n        seg_masks = paddle.cast(seg_masks, 'float32')\n        sum_masks = paddle.sum(seg_masks, axis=[1, 2])\n\n        y = paddle.zeros(shape=paddle.shape(sum_masks), dtype='float32')\n        keep = paddle.where(sum_masks > strides.cast(sum_masks.dtype), sum_masks, y)\n        keep = paddle.nonzero(keep)\n        keep = paddle.squeeze(keep, axis=[1])\n        # Prevent empty and increase fake data\n        keep_other = paddle.concat(\n            [keep, paddle.cast(paddle.shape(sum_masks)[0:1] - 1, 'int64')])\n        keep_scores = paddle.concat(\n            [keep, paddle.cast(paddle.shape(sum_masks)[0:1], 'int64')])\n        cate_scores_end = paddle.zeros(shape=[1], dtype='float32')\n        cate_scores = paddle.concat([cate_scores, cate_scores_end])\n\n        seg_masks = paddle.gather(seg_masks, index=keep_other)\n        seg_preds = paddle.gather(seg_preds, index=keep_other)\n        sum_masks = paddle.gather(sum_masks, index=keep_other)\n        cate_labels = paddle.gather(cate_labels, index=keep_other)\n        cate_scores = paddle.gather(cate_scores, index=keep_scores)\n\n        # mask scoring.\n        seg_mul = paddle.cast(seg_preds * seg_masks, 'float32')\n        seg_scores = paddle.sum(seg_mul, axis=[1, 2]) / sum_masks\n        cate_scores *= seg_scores\n        # Matrix NMS\n        seg_preds, cate_scores, cate_labels = self.mask_nms(\n            seg_preds, seg_masks, cate_labels, cate_scores, sum_masks=sum_masks)\n        ori_shape = im_shape[:2] / scale_factor + 0.5\n        ori_shape = paddle.cast(ori_shape, 'int32')\n        seg_preds = F.interpolate(\n            paddle.unsqueeze(seg_preds, 0),\n            size=upsampled_size_out,\n            mode='bilinear',\n            align_corners=False,\n            
align_mode=0)\n        seg_preds = paddle.slice(\n            seg_preds, axes=[2, 3], starts=[0, 0], ends=[h, w])\n        seg_masks = paddle.squeeze(\n            F.interpolate(\n                seg_preds,\n                size=ori_shape[:2],\n                mode='bilinear',\n                align_corners=False,\n                align_mode=0),\n            axis=[0])\n        seg_masks = paddle.cast(seg_masks > self.mask_threshold, 'uint8')\n        return seg_masks, cate_labels, cate_scores\n"
  },
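The heart of `get_seg_single` in solov2_head.py above is dynamic convolution: the kernel branch emits one C-dim filter per surviving grid cell, `paddle.unsqueeze(kernel_preds, [2, 3])` reshapes those filters into 1x1 conv weights, and a single `F.conv2d` call produces every instance mask at once from the shared mask feature. A small sketch of just that step, with assumed illustrative shapes:

# Sketch of SOLOv2's dynamic-kernel mask assembly: each predicted kernel
# acts as a 1x1 convolution over the shared mask feature map, yielding
# one mask logit map per instance.
import paddle
import paddle.nn.functional as F

C, H, W, num_inst = 256, 104, 104, 5   # assumed illustrative shapes
seg_feat = paddle.rand([1, C, H, W])   # SOLOv2MaskHead output (ins_pred)
kernels = paddle.rand([num_inst, C])   # kernel predictions after gathering

# each row of `kernels` becomes one 1x1 conv filter: [num_inst, C, 1, 1]
kernels = paddle.unsqueeze(kernels, [2, 3])
masks = F.sigmoid(F.conv2d(seg_feat, kernels))   # [1, num_inst, H, W]
binary = paddle.cast(masks > 0.5, 'float32')     # cf. mask_threshold=0.5
print(masks.shape)

The head then scores each mask by its mean foreground probability (`seg_scores`) and suppresses duplicates with Matrix NMS, as in `get_seg_single` above.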
  {
    "path": "ppdet/modeling/heads/sparse_roi_head.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# This code is referenced from: https://github.com/open-mmlab/mmdetection\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport copy\n\nimport paddle\nfrom paddle import nn\n\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling import initializer as init\nfrom .roi_extractor import RoIAlign\nfrom ..bbox_utils import delta2bbox_v2\nfrom ..cls_utils import _get_class_default_kwargs\nfrom ..layers import MultiHeadAttention\n\n__all__ = ['SparseRoIHead', 'DIIHead', 'DynamicMaskHead']\n\n\nclass DynamicConv(nn.Layer):\n    def __init__(self,\n                 in_channels=256,\n                 feature_channels=64,\n                 out_channels=None,\n                 roi_resolution=7,\n                 with_proj=True):\n        super(DynamicConv, self).__init__()\n\n        self.in_channels = in_channels\n        self.feature_channels = feature_channels\n        self.out_channels = out_channels if out_channels else in_channels\n\n        self.num_params_in = self.in_channels * self.feature_channels\n        self.num_params_out = self.out_channels * self.feature_channels\n        self.dynamic_layer = nn.Linear(self.in_channels,\n                                       self.num_params_in + self.num_params_out)\n\n        self.norm_in = nn.LayerNorm(self.feature_channels)\n        self.norm_out = nn.LayerNorm(self.out_channels)\n\n        self.activation = nn.ReLU()\n\n        self.with_proj = with_proj\n        if self.with_proj:\n            num_output = self.out_channels * roi_resolution**2\n            self.fc_layer = nn.Linear(num_output, self.out_channels)\n            self.fc_norm = nn.LayerNorm(self.out_channels)\n\n    def forward(self, param_feature, input_feature):\n        input_feature = input_feature.flatten(2).transpose([2, 0, 1])\n        input_feature = input_feature.transpose([1, 0, 2])\n\n        parameters = self.dynamic_layer(param_feature)\n\n        param_in = parameters[:, :self.num_params_in].reshape(\n            [-1, self.in_channels, self.feature_channels])\n        param_out = parameters[:, -self.num_params_out:].reshape(\n            [-1, self.feature_channels, self.out_channels])\n\n        features = paddle.bmm(input_feature, param_in)\n        features = self.norm_in(features)\n        features = self.activation(features)\n\n        features = paddle.bmm(features, param_out)\n        features = self.norm_out(features)\n        features = self.activation(features)\n\n        if self.with_proj:\n            features = features.flatten(1)\n            features = self.fc_layer(features)\n            features = self.fc_norm(features)\n            features = self.activation(features)\n\n        return features\n\n\nclass FFN(nn.Layer):\n    def __init__(self,\n                 embed_dims=256,\n                 feedforward_channels=2048,\n                 
num_fcs=2,\n                 ffn_drop=0.0,\n                 add_identity=True):\n        super(FFN, self).__init__()\n\n        layers = []\n        in_channels = embed_dims\n        for _ in range(num_fcs - 1):\n            layers.append(\n                nn.Sequential(\n                    nn.Linear(in_channels, feedforward_channels),\n                    nn.ReLU(), nn.Dropout(ffn_drop)))\n            in_channels = feedforward_channels\n        layers.append(nn.Linear(feedforward_channels, embed_dims))\n        layers.append(nn.Dropout(ffn_drop))\n        self.layers = nn.Sequential(*layers)\n\n        self.add_identity = add_identity\n\n    def forward(self, x):\n        identity = x\n        out = self.layers(x)\n        if not self.add_identity:\n            return out\n        else:\n            return out + identity\n\n\n@register\nclass DynamicMaskHead(nn.Layer):\n    __shared__ = ['num_classes', 'proposal_embedding_dim', 'norm_type']\n\n    def __init__(self,\n                 num_classes=80,\n                 proposal_embedding_dim=256,\n                 dynamic_feature_channels=64,\n                 roi_resolution=14,\n                 num_convs=4,\n                 conv_kernel_size=3,\n                 conv_channels=256,\n                 upsample_method='deconv',\n                 upsample_scale_factor=2,\n                 norm_type='bn'):\n        super(DynamicMaskHead, self).__init__()\n\n        self.d_model = proposal_embedding_dim\n\n        self.instance_interactive_conv = DynamicConv(\n            self.d_model,\n            dynamic_feature_channels,\n            roi_resolution=roi_resolution,\n            with_proj=False)\n\n        self.convs = nn.LayerList()\n        for i in range(num_convs):\n            self.convs.append(\n                nn.Sequential(\n                    nn.Conv2D(\n                        self.d_model if i == 0 else conv_channels,\n                        conv_channels,\n                        conv_kernel_size,\n                        padding='same',\n                        bias_attr=False),\n                    nn.BatchNorm2D(conv_channels),\n                    nn.ReLU()))\n        if norm_type == 'sync_bn':\n            self.convs = nn.SyncBatchNorm.convert_sync_batchnorm(self.convs)\n\n        self.upsample_method = upsample_method\n        if upsample_method is None:\n            self.upsample = None\n        elif upsample_method == 'deconv':\n            self.upsample = nn.Conv2DTranspose(\n                conv_channels if num_convs > 0 else self.d_model,\n                conv_channels,\n                upsample_scale_factor,\n                stride=upsample_scale_factor)\n            self.relu = nn.ReLU()\n        else:\n            self.upsample = nn.Upsample(None, upsample_scale_factor)\n\n        cls_in_channels = conv_channels if num_convs > 0 else self.d_model\n        cls_in_channels = conv_channels if upsample_method == 'deconv' else cls_in_channels\n        self.conv_cls = nn.Conv2D(cls_in_channels, num_classes, 1)\n\n        self._init_weights()\n\n    def _init_weights(self):\n        for p in self.parameters():\n            if p.dim() > 1:\n                init.xavier_uniform_(p)\n\n        init.constant_(self.conv_cls.bias, 0.)\n\n    def forward(self, roi_features, attn_features):\n        attn_features = attn_features.reshape([-1, self.d_model])\n        attn_features_iic = self.instance_interactive_conv(attn_features,\n                                                           roi_features)\n\n        x = 
attn_features_iic.transpose([0, 2, 1]).reshape(roi_features.shape)\n\n        for conv in self.convs:\n            x = conv(x)\n        if self.upsample is not None:\n            x = self.upsample(x)\n            if self.upsample_method == 'deconv':\n                x = self.relu(x)\n        mask_pred = self.conv_cls(x)\n        return mask_pred\n\n\n@register\nclass DIIHead(nn.Layer):\n    __shared__ = ['num_classes', 'proposal_embedding_dim']\n\n    def __init__(self,\n                 num_classes=80,\n                 proposal_embedding_dim=256,\n                 feedforward_channels=2048,\n                 dynamic_feature_channels=64,\n                 roi_resolution=7,\n                 num_attn_heads=8,\n                 dropout=0.0,\n                 num_ffn_fcs=2,\n                 num_cls_fcs=1,\n                 num_reg_fcs=3):\n        super(DIIHead, self).__init__()\n\n        self.num_classes = num_classes\n        self.d_model = proposal_embedding_dim\n\n        self.attention = MultiHeadAttention(self.d_model, num_attn_heads,\n                                            dropout)\n        self.attention_norm = nn.LayerNorm(self.d_model)\n\n        self.instance_interactive_conv = DynamicConv(\n            self.d_model,\n            dynamic_feature_channels,\n            roi_resolution=roi_resolution,\n            with_proj=True)\n        self.instance_interactive_conv_dropout = nn.Dropout(dropout)\n        self.instance_interactive_conv_norm = nn.LayerNorm(self.d_model)\n\n        self.ffn = FFN(self.d_model, feedforward_channels, num_ffn_fcs, dropout)\n        self.ffn_norm = nn.LayerNorm(self.d_model)\n\n        self.cls_fcs = nn.LayerList()\n        for _ in range(num_cls_fcs):\n            self.cls_fcs.append(\n                nn.Linear(\n                    self.d_model, self.d_model, bias_attr=False))\n            self.cls_fcs.append(nn.LayerNorm(self.d_model))\n            self.cls_fcs.append(nn.ReLU())\n        self.fc_cls = nn.Linear(self.d_model, self.num_classes)\n\n        self.reg_fcs = nn.LayerList()\n        for _ in range(num_reg_fcs):\n            self.reg_fcs.append(\n                nn.Linear(\n                    self.d_model, self.d_model, bias_attr=False))\n            self.reg_fcs.append(nn.LayerNorm(self.d_model))\n            self.reg_fcs.append(nn.ReLU())\n        self.fc_reg = nn.Linear(self.d_model, 4)\n\n        self._init_weights()\n\n    def _init_weights(self):\n        for p in self.parameters():\n            if p.dim() > 1:\n                init.xavier_uniform_(p)\n\n        bias_init = init.bias_init_with_prob(0.01)\n        init.constant_(self.fc_cls.bias, bias_init)\n\n    def forward(self, roi_features, proposal_features):\n        N, num_proposals = proposal_features.shape[:2]\n\n        proposal_features = proposal_features + self.attention(\n            proposal_features)\n        attn_features = self.attention_norm(proposal_features)\n\n        proposal_features = attn_features.reshape([-1, self.d_model])\n        proposal_features_iic = self.instance_interactive_conv(\n            proposal_features, roi_features)\n        proposal_features = proposal_features + self.instance_interactive_conv_dropout(\n            proposal_features_iic)\n        obj_features = self.instance_interactive_conv_norm(proposal_features)\n\n        obj_features = self.ffn(obj_features)\n        obj_features = self.ffn_norm(obj_features)\n\n        cls_feature = obj_features.clone()\n        reg_feature = obj_features.clone()\n\n        for cls_layer in 
self.cls_fcs:\n            cls_feature = cls_layer(cls_feature)\n        class_logits = self.fc_cls(cls_feature)\n        for reg_layer in self.reg_fcs:\n            reg_feature = reg_layer(reg_feature)\n        bbox_deltas = self.fc_reg(reg_feature)\n\n        class_logits = class_logits.reshape(\n            [N, num_proposals, self.num_classes])\n        bbox_deltas = bbox_deltas.reshape([N, num_proposals, 4])\n        obj_features = obj_features.reshape([N, num_proposals, self.d_model])\n\n        return class_logits, bbox_deltas, obj_features, attn_features\n\n    @staticmethod\n    def refine_bboxes(proposal_bboxes, bbox_deltas):\n        pred_bboxes = delta2bbox_v2(\n            bbox_deltas.reshape([-1, 4]),\n            proposal_bboxes.reshape([-1, 4]),\n            delta_mean=[0.0, 0.0, 0.0, 0.0],\n            delta_std=[0.5, 0.5, 1.0, 1.0],\n            ctr_clip=None)\n        return pred_bboxes.reshape(proposal_bboxes.shape)\n\n\n@register\nclass SparseRoIHead(nn.Layer):\n    __inject__ = ['bbox_head', 'mask_head', 'loss_func']\n\n    def __init__(self,\n                 num_stages=6,\n                 bbox_roi_extractor=_get_class_default_kwargs(RoIAlign),\n                 mask_roi_extractor=_get_class_default_kwargs(RoIAlign),\n                 bbox_head='DIIHead',\n                 mask_head='DynamicMaskHead',\n                 loss_func='QueryInstLoss'):\n        super(SparseRoIHead, self).__init__()\n\n        self.num_stages = num_stages\n\n        self.bbox_roi_extractor = bbox_roi_extractor\n        self.mask_roi_extractor = mask_roi_extractor\n        if isinstance(bbox_roi_extractor, dict):\n            self.bbox_roi_extractor = RoIAlign(**bbox_roi_extractor)\n        if isinstance(mask_roi_extractor, dict):\n            self.mask_roi_extractor = RoIAlign(**mask_roi_extractor)\n\n        self.bbox_heads = nn.LayerList(\n            [copy.deepcopy(bbox_head) for _ in range(num_stages)])\n        self.mask_heads = nn.LayerList(\n            [copy.deepcopy(mask_head) for _ in range(num_stages)])\n\n        self.loss_helper = loss_func\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        bbox_roi_extractor = cfg['bbox_roi_extractor']\n        mask_roi_extractor = cfg['mask_roi_extractor']\n        assert isinstance(bbox_roi_extractor, dict)\n        assert isinstance(mask_roi_extractor, dict)\n\n        kwargs = RoIAlign.from_config(cfg, input_shape)\n        bbox_roi_extractor.update(kwargs)\n        mask_roi_extractor.update(kwargs)\n\n        return {\n            'bbox_roi_extractor': bbox_roi_extractor,\n            'mask_roi_extractor': mask_roi_extractor\n        }\n\n    @staticmethod\n    def get_roi_features(features, bboxes, roi_extractor):\n        rois_list = [\n            bboxes[i] for i in range(len(bboxes)) if len(bboxes[i]) > 0\n        ]\n        rois_num = paddle.to_tensor(\n            [len(bboxes[i]) for i in range(len(bboxes))], dtype='int32')\n\n        pos_ids = paddle.cast(rois_num, dtype='bool')\n        if pos_ids.sum() != len(rois_num):\n            rois_num = rois_num[pos_ids]\n            features = [features[i][pos_ids] for i in range(len(features))]\n\n        return roi_extractor(features, rois_list, rois_num)\n\n    def _forward_train(self, body_feats, pro_bboxes, pro_feats, targets):\n        all_stage_losses = {}\n        for stage in range(self.num_stages):\n            bbox_head = self.bbox_heads[stage]\n            mask_head = self.mask_heads[stage]\n\n            roi_feats = self.get_roi_features(body_feats, 
pro_bboxes,\n                                              self.bbox_roi_extractor)\n            class_logits, bbox_deltas, pro_feats, attn_feats = bbox_head(\n                roi_feats, pro_feats)\n            bbox_pred = self.bbox_heads[stage].refine_bboxes(pro_bboxes,\n                                                             bbox_deltas)\n\n            indices = self.loss_helper.matcher({\n                'pred_logits': class_logits.detach(),\n                'pred_boxes': bbox_pred.detach()\n            }, targets)\n            avg_factor = paddle.to_tensor(\n                [sum(len(tgt['labels']) for tgt in targets)], dtype='float32')\n            if paddle.distributed.get_world_size() > 1:\n                paddle.distributed.all_reduce(avg_factor)\n                avg_factor /= paddle.distributed.get_world_size()\n            avg_factor = paddle.clip(avg_factor, min=1.)\n\n            loss_classes = self.loss_helper.loss_classes(class_logits, targets,\n                                                         indices, avg_factor)\n            if sum(len(v['labels']) for v in targets) == 0:\n                loss_bboxes = {\n                    'loss_bbox': paddle.to_tensor([0.]),\n                    'loss_giou': paddle.to_tensor([0.])\n                }\n                loss_masks = {'loss_mask': paddle.to_tensor([0.])}\n            else:\n                loss_bboxes = self.loss_helper.loss_bboxes(bbox_pred, targets,\n                                                           indices, avg_factor)\n\n                pos_attn_feats = paddle.concat([\n                    paddle.gather(\n                        src, src_idx, axis=0)\n                    for src, (src_idx, _) in zip(attn_feats, indices)\n                ])\n                pos_bbox_pred = [\n                    paddle.gather(\n                        src, src_idx, axis=0)\n                    for src, (src_idx, _) in zip(bbox_pred.detach(), indices)\n                ]\n                pos_roi_feats = self.get_roi_features(body_feats, pos_bbox_pred,\n                                                      self.mask_roi_extractor)\n                mask_logits = mask_head(pos_roi_feats, pos_attn_feats)\n                loss_masks = self.loss_helper.loss_masks(\n                    pos_bbox_pred, mask_logits, targets, indices, avg_factor)\n\n            for loss in [loss_classes, loss_bboxes, loss_masks]:\n                for key in loss.keys():\n                    all_stage_losses[f'stage{stage}_{key}'] = loss[key]\n\n            pro_bboxes = bbox_pred.detach()\n\n        return all_stage_losses\n\n    def _forward_test(self, body_feats, pro_bboxes, pro_feats):\n        for stage in range(self.num_stages):\n            roi_feats = self.get_roi_features(body_feats, pro_bboxes,\n                                              self.bbox_roi_extractor)\n            class_logits, bbox_deltas, pro_feats, attn_feats = self.bbox_heads[\n                stage](roi_feats, pro_feats)\n            bbox_pred = self.bbox_heads[stage].refine_bboxes(pro_bboxes,\n                                                             bbox_deltas)\n\n            pro_bboxes = bbox_pred.detach()\n\n        roi_feats = self.get_roi_features(body_feats, bbox_pred,\n                                          self.mask_roi_extractor)\n        mask_logits = self.mask_heads[stage](roi_feats, attn_feats)\n\n        return {\n            'class_logits': class_logits,\n            'bbox_pred': bbox_pred,\n            'mask_logits': mask_logits\n        }\n\n    
def forward(self,\n                body_features,\n                proposal_bboxes,\n                proposal_features,\n                targets=None):\n        if self.training:\n            return self._forward_train(body_features, proposal_bboxes,\n                                       proposal_features, targets)\n        else:\n            return self._forward_test(body_features, proposal_bboxes,\n                                      proposal_features)\n"
  },
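  {
    "path": "examples/hypothetical/decode_deltas_sketch.py",
    "content": "# Hypothetical illustration file, NOT part of PaddleDetection.\n# A minimal sketch of the standard delta -> bbox decoding that\n# DIIHead.refine_bboxes delegates to delta2bbox_v2; the real helper may\n# differ in detail, but the delta_mean/delta_std handling mirrors the\n# call site in refine_bboxes.\nimport paddle\n\n\ndef decode_deltas_sketch(deltas, boxes,\n                         delta_mean=(0., 0., 0., 0.),\n                         delta_std=(0.5, 0.5, 1.0, 1.0)):\n    # de-normalize the predicted deltas\n    mean = paddle.to_tensor(delta_mean, dtype=deltas.dtype)\n    std = paddle.to_tensor(delta_std, dtype=deltas.dtype)\n    deltas = deltas * std + mean\n    # proposal sizes and centers (boxes are x1y1x2y2)\n    w = boxes[:, 2] - boxes[:, 0]\n    h = boxes[:, 3] - boxes[:, 1]\n    cx = boxes[:, 0] + 0.5 * w\n    cy = boxes[:, 1] + 0.5 * h\n    # shift centers by (dx * w, dy * h); rescale sizes by exp(dw), exp(dh)\n    pcx = cx + deltas[:, 0] * w\n    pcy = cy + deltas[:, 1] * h\n    pw = w * paddle.exp(deltas[:, 2])\n    ph = h * paddle.exp(deltas[:, 3])\n    return paddle.stack(\n        [pcx - 0.5 * pw, pcy - 0.5 * ph, pcx + 0.5 * pw, pcy + 0.5 * ph],\n        axis=-1)\n\n\nif __name__ == '__main__':\n    boxes = paddle.to_tensor([[0., 0., 10., 10.]])\n    deltas = paddle.zeros([1, 4])\n    # zero deltas leave the proposal box unchanged\n    print(decode_deltas_sketch(deltas, boxes))\n"
  },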
  {
    "path": "ppdet/modeling/heads/sparsercnn_head.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nThis code is based on https://github.com/PeizeSun/SparseR-CNN/blob/main/projects/SparseRCNN/sparsercnn/head.py\nThs copyright of PeizeSun/SparseR-CNN is as follows:\nMIT License [see LICENSE for details]\n\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport math\nimport copy\nimport paddle\nimport paddle.nn as nn\n\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling.heads.roi_extractor import RoIAlign\nfrom ppdet.modeling.bbox_utils import delta2bbox\nfrom .. import initializer as init\n\n_DEFAULT_SCALE_CLAMP = math.log(100000. / 16)\n\n\nclass DynamicConv(nn.Layer):\n    def __init__(\n            self,\n            head_hidden_dim,\n            head_dim_dynamic,\n            head_num_dynamic, ):\n        super().__init__()\n\n        self.hidden_dim = head_hidden_dim\n        self.dim_dynamic = head_dim_dynamic\n        self.num_dynamic = head_num_dynamic\n        self.num_params = self.hidden_dim * self.dim_dynamic\n        self.dynamic_layer = nn.Linear(self.hidden_dim,\n                                       self.num_dynamic * self.num_params)\n\n        self.norm1 = nn.LayerNorm(self.dim_dynamic)\n        self.norm2 = nn.LayerNorm(self.hidden_dim)\n\n        self.activation = nn.ReLU()\n\n        pooler_resolution = 7\n        num_output = self.hidden_dim * pooler_resolution**2\n        self.out_layer = nn.Linear(num_output, self.hidden_dim)\n        self.norm3 = nn.LayerNorm(self.hidden_dim)\n\n    def forward(self, pro_features, roi_features):\n        '''\n        pro_features: (1,  N * nr_boxes, self.d_model)\n        roi_features: (49, N * nr_boxes, self.d_model)\n        '''\n        features = roi_features.transpose(perm=[1, 0, 2])\n        parameters = self.dynamic_layer(pro_features).transpose(perm=[1, 0, 2])\n\n        param1 = parameters[:, :, :self.num_params].reshape(\n            [-1, self.hidden_dim, self.dim_dynamic])\n        param2 = parameters[:, :, self.num_params:].reshape(\n            [-1, self.dim_dynamic, self.hidden_dim])\n\n        features = paddle.bmm(features, param1)\n        features = self.norm1(features)\n        features = self.activation(features)\n\n        features = paddle.bmm(features, param2)\n        features = self.norm2(features)\n        features = self.activation(features)\n\n        features = features.flatten(1)\n        features = self.out_layer(features)\n        features = self.norm3(features)\n        features = self.activation(features)\n\n        return features\n\n\nclass RCNNHead(nn.Layer):\n    def __init__(\n            self,\n            d_model,\n            num_classes,\n            dim_feedforward,\n            nhead,\n            dropout,\n            head_cls,\n            head_reg,\n            head_dim_dynamic,\n            head_num_dynamic,\n            scale_clamp: 
float=_DEFAULT_SCALE_CLAMP,\n            bbox_weights=(2.0, 2.0, 1.0, 1.0), ):\n        super().__init__()\n\n        self.d_model = d_model\n\n        # dynamic.\n        self.self_attn = nn.MultiHeadAttention(d_model, nhead, dropout=dropout)\n        self.inst_interact = DynamicConv(d_model, head_dim_dynamic,\n                                         head_num_dynamic)\n\n        self.linear1 = nn.Linear(d_model, dim_feedforward)\n        self.dropout = nn.Dropout(dropout)\n        self.linear2 = nn.Linear(dim_feedforward, d_model)\n\n        self.norm1 = nn.LayerNorm(d_model)\n        self.norm2 = nn.LayerNorm(d_model)\n        self.norm3 = nn.LayerNorm(d_model)\n        self.dropout1 = nn.Dropout(dropout)\n        self.dropout2 = nn.Dropout(dropout)\n        self.dropout3 = nn.Dropout(dropout)\n\n        self.activation = nn.ReLU()\n\n        # cls.\n        num_cls = head_cls\n        cls_module = list()\n        for _ in range(num_cls):\n            cls_module.append(nn.Linear(d_model, d_model, bias_attr=False))\n            cls_module.append(nn.LayerNorm(d_model))\n            cls_module.append(nn.ReLU())\n        self.cls_module = nn.LayerList(cls_module)\n\n        # reg.\n        num_reg = head_reg\n        reg_module = list()\n        for _ in range(num_reg):\n            reg_module.append(nn.Linear(d_model, d_model, bias_attr=False))\n            reg_module.append(nn.LayerNorm(d_model))\n            reg_module.append(nn.ReLU())\n        self.reg_module = nn.LayerList(reg_module)\n\n        # pred.\n        self.class_logits = nn.Linear(d_model, num_classes)\n        self.bboxes_delta = nn.Linear(d_model, 4)\n        self.scale_clamp = scale_clamp\n        self.bbox_weights = bbox_weights\n\n    def forward(self, features, bboxes, pro_features, pooler):\n        \"\"\"\n        :param bboxes: (N, nr_boxes, 4)\n        :param pro_features: (N, nr_boxes, d_model)\n        \"\"\"\n\n        N, nr_boxes = bboxes.shape[:2]\n\n        proposal_boxes = list()\n        for b in range(N):\n            proposal_boxes.append(bboxes[b])\n        roi_num = paddle.full([N], nr_boxes).astype(\"int32\")\n\n        roi_features = pooler(features, proposal_boxes, roi_num)\n        roi_features = roi_features.reshape(\n            [N * nr_boxes, self.d_model, -1]).transpose(perm=[2, 0, 1])\n\n        # self_att.\n        pro_features = pro_features.reshape([N, nr_boxes, self.d_model])\n        pro_features2 = self.self_attn(\n            pro_features, pro_features, value=pro_features)\n        pro_features = pro_features.transpose(perm=[1, 0, 2]) + self.dropout1(\n            pro_features2.transpose(perm=[1, 0, 2]))\n        pro_features = self.norm1(pro_features)\n\n        # inst_interact.\n        pro_features = pro_features.reshape(\n            [nr_boxes, N, self.d_model]).transpose(perm=[1, 0, 2]).reshape(\n                [1, N * nr_boxes, self.d_model])\n        pro_features2 = self.inst_interact(pro_features, roi_features)\n        pro_features = pro_features + self.dropout2(pro_features2)\n        obj_features = self.norm2(pro_features)\n\n        # obj_feature.\n        obj_features2 = self.linear2(\n            self.dropout(self.activation(self.linear1(obj_features))))\n        obj_features = obj_features + self.dropout3(obj_features2)\n        obj_features = self.norm3(obj_features)\n\n        fc_feature = obj_features.transpose(perm=[1, 0, 2]).reshape(\n            [N * nr_boxes, -1])\n        cls_feature = fc_feature.clone()\n        reg_feature = fc_feature.clone()\n        for 
cls_layer in self.cls_module:\n            cls_feature = cls_layer(cls_feature)\n        for reg_layer in self.reg_module:\n            reg_feature = reg_layer(reg_feature)\n        class_logits = self.class_logits(cls_feature)\n        bboxes_deltas = self.bboxes_delta(reg_feature)\n        pred_bboxes = delta2bbox(bboxes_deltas,\n                                 bboxes.reshape([-1, 4]), self.bbox_weights)\n\n        return class_logits.reshape([N, nr_boxes, -1]), pred_bboxes.reshape(\n            [N, nr_boxes, -1]), obj_features\n\n\n@register\nclass SparseRCNNHead(nn.Layer):\n    '''\n    SparseRCNNHead\n    Args:\n        roi_input_shape (list[ShapeSpec]): The output shape of the FPN\n        num_classes (int): Number of classes,\n        head_hidden_dim (int): The embedding dimension of MultiHeadAttention,\n        head_dim_feedforward (int): The hidden dimension of the feedforward network,\n        nhead (int): The number of heads in MultiHeadAttention,\n        head_dropout (float): The dropout probability,\n        head_cls (int): The number of layers in the classification branch,\n        head_reg (int): The number of layers in the regression branch,\n        head_dim_dynamic (int): The hidden dimension of DynamicConv,\n        head_num_dynamic (int): The number of dynamic layers in DynamicConv,\n        head_num_heads (int): The number of RCNNHead stages,\n        deep_supervision (bool): whether to supervise the intermediate results,\n        num_proposals (int): the number of proposal boxes and features\n    '''\n    __inject__ = ['loss_func']\n    __shared__ = ['num_classes']\n\n    def __init__(\n            self,\n            head_hidden_dim,\n            head_dim_feedforward,\n            nhead,\n            head_dropout,\n            head_cls,\n            head_reg,\n            head_dim_dynamic,\n            head_num_dynamic,\n            head_num_heads,\n            deep_supervision,\n            num_proposals,\n            num_classes=80,\n            loss_func=\"SparseRCNNLoss\",\n            roi_input_shape=None, ):\n        super().__init__()\n        assert head_num_heads > 0, \\\n            f'At least one RoI Head is required, but got {head_num_heads}.'\n\n        # Build RoI.\n        box_pooler = self._init_box_pooler(roi_input_shape)\n        self.box_pooler = box_pooler\n\n        # Build heads.\n        rcnn_head = RCNNHead(\n            head_hidden_dim,\n            num_classes,\n            head_dim_feedforward,\n            nhead,\n            head_dropout,\n            head_cls,\n            head_reg,\n            head_dim_dynamic,\n            head_num_dynamic, )\n        self.head_series = nn.LayerList(\n            [copy.deepcopy(rcnn_head) for i in range(head_num_heads)])\n        self.return_intermediate = deep_supervision\n\n        self.num_classes = num_classes\n\n        # build init proposal\n        self.init_proposal_features = nn.Embedding(num_proposals,\n                                                   head_hidden_dim)\n        self.init_proposal_boxes = nn.Embedding(num_proposals, 4)\n\n        self.lossfunc = loss_func\n\n        # Init parameters.\n        init.reset_initialized_parameter(self)\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        # init all parameters.\n        prior_prob = 0.01\n        bias_value = -math.log((1 - prior_prob) / prior_prob)\n\n        for m in self.sublayers():\n            if isinstance(m, nn.Linear):\n                init.xavier_normal_(m.weight, reverse=True)\n            elif not isinstance(m, nn.Embedding) and hasattr(\n                    m, \"weight\") and m.weight.dim() > 1:\n                init.xavier_normal_(m.weight, reverse=False)\n\n            
if hasattr(m, \"bias\") and m.bias is not None and m.bias.shape[\n                    -1] == self.num_classes:\n                init.constant_(m.bias, bias_value)\n\n        init_bboxes = paddle.empty_like(self.init_proposal_boxes.weight)\n        init_bboxes[:, :2] = 0.5\n        init_bboxes[:, 2:] = 1.0\n        self.init_proposal_boxes.weight.set_value(init_bboxes)\n\n    @staticmethod\n    def _init_box_pooler(input_shape):\n\n        pooler_resolution = 7\n        sampling_ratio = 2\n\n        if input_shape is not None:\n            pooler_scales = tuple(1.0 / input_shape[k].stride\n                                  for k in range(len(input_shape)))\n            in_channels = [\n                input_shape[f].channels for f in range(len(input_shape))\n            ]\n            end_level = len(input_shape) - 1\n            # Check all channel counts are equal\n            assert len(set(in_channels)) == 1, in_channels\n        else:\n            pooler_scales = [1.0 / 4.0, 1.0 / 8.0, 1.0 / 16.0, 1.0 / 32.0]\n            end_level = 3\n\n        aligned = True\n        if paddle.device.is_compiled_with_custom_device('npu'):\n            aligned = False\n        box_pooler = RoIAlign(\n            resolution=pooler_resolution,\n            spatial_scale=pooler_scales,\n            sampling_ratio=sampling_ratio,\n            end_level=end_level,\n            aligned=aligned)\n        return box_pooler\n\n    def forward(self, features, input_whwh):\n\n        bs = len(features[0])\n        bboxes = box_cxcywh_to_xyxy(self.init_proposal_boxes.weight.clone(\n        )).unsqueeze(0)\n        bboxes = bboxes * input_whwh.unsqueeze(-2)\n\n        init_features = self.init_proposal_features.weight.unsqueeze(0).tile(\n            [1, bs, 1])\n        proposal_features = init_features.clone()\n\n        inter_class_logits = []\n        inter_pred_bboxes = []\n\n        for stage, rcnn_head in enumerate(self.head_series):\n            class_logits, pred_bboxes, proposal_features = rcnn_head(\n                features, bboxes, proposal_features, self.box_pooler)\n\n            if self.return_intermediate or stage == len(self.head_series) - 1:\n                inter_class_logits.append(class_logits)\n                inter_pred_bboxes.append(pred_bboxes)\n            bboxes = pred_bboxes.detach()\n\n        output = {\n            'pred_logits': inter_class_logits[-1],\n            'pred_boxes': inter_pred_bboxes[-1]\n        }\n        if self.return_intermediate:\n            output['aux_outputs'] = [{\n                'pred_logits': a,\n                'pred_boxes': b\n            } for a, b in zip(inter_class_logits[:-1], inter_pred_bboxes[:-1])]\n\n        return output\n\n    def get_loss(self, outputs, targets):\n        losses = self.lossfunc(outputs, targets)\n        weight_dict = self.lossfunc.weight_dict\n\n        for k in losses.keys():\n            if k in weight_dict:\n                losses[k] *= weight_dict[k]\n\n        return losses\n\n\ndef box_cxcywh_to_xyxy(x):\n    x_c, y_c, w, h = x.unbind(-1)\n    b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)]\n    return paddle.stack(b, axis=-1)\n"
  },
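  {
    "path": "examples/hypothetical/sparsercnn_init_proposals_sketch.py",
    "content": "# Hypothetical illustration file, NOT part of PaddleDetection.\n# Shows how SparseRCNNHead's learned proposal boxes, initialized to\n# (cx, cy, w, h) = (0.5, 0.5, 1.0, 1.0) in _reset_parameters, become\n# full-image xyxy boxes once scaled by the per-image whwh tensor in\n# SparseRCNNHead.forward.\nimport paddle\n\n\ndef box_cxcywh_to_xyxy(x):\n    # same decoding as the helper at the bottom of sparsercnn_head.py\n    x_c, y_c, w, h = x.unbind(-1)\n    return paddle.stack(\n        [x_c - 0.5 * w, y_c - 0.5 * h, x_c + 0.5 * w, y_c + 0.5 * h],\n        axis=-1)\n\n\nif __name__ == '__main__':\n    # two proposals in normalized cxcywh form, as after initialization\n    proposals = paddle.to_tensor([[0.5, 0.5, 1.0, 1.0],\n                                  [0.5, 0.5, 1.0, 1.0]])\n    input_whwh = paddle.to_tensor([800., 600., 800., 600.])\n    bboxes = box_cxcywh_to_xyxy(proposals) * input_whwh\n    print(bboxes)  # each row -> [0., 0., 800., 600.]\n"
  },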
  {
    "path": "ppdet/modeling/heads/ssd_head.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register\nfrom paddle.regularizer import L2Decay\nfrom paddle import ParamAttr\n\nfrom ..layers import AnchorGeneratorSSD\nfrom ..cls_utils import _get_class_default_kwargs\n\n\nclass SepConvLayer(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size=3,\n                 padding=1,\n                 conv_decay=0.):\n        super(SepConvLayer, self).__init__()\n        self.dw_conv = nn.Conv2D(\n            in_channels=in_channels,\n            out_channels=in_channels,\n            kernel_size=kernel_size,\n            stride=1,\n            padding=padding,\n            groups=in_channels,\n            weight_attr=ParamAttr(regularizer=L2Decay(conv_decay)),\n            bias_attr=False)\n\n        self.bn = nn.BatchNorm2D(\n            in_channels,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.)))\n\n        self.pw_conv = nn.Conv2D(\n            in_channels=in_channels,\n            out_channels=out_channels,\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            weight_attr=ParamAttr(regularizer=L2Decay(conv_decay)),\n            bias_attr=False)\n\n    def forward(self, x):\n        x = self.dw_conv(x)\n        x = F.relu6(self.bn(x))\n        x = self.pw_conv(x)\n        return x\n\n\nclass SSDExtraHead(nn.Layer):\n    def __init__(self,\n                 in_channels=256,\n                 out_channels=([256, 512], [256, 512], [128, 256], [128, 256],\n                               [128, 256]),\n                 strides=(2, 2, 2, 1, 1),\n                 paddings=(1, 1, 1, 0, 0)):\n        super(SSDExtraHead, self).__init__()\n        self.convs = nn.LayerList()\n        for out_channel, stride, padding in zip(out_channels, strides,\n                                                paddings):\n            self.convs.append(\n                self._make_layers(in_channels, out_channel[0], out_channel[1],\n                                  stride, padding))\n            in_channels = out_channel[-1]\n\n    def _make_layers(self, c_in, c_hidden, c_out, stride_3x3, padding_3x3):\n        return nn.Sequential(\n            nn.Conv2D(c_in, c_hidden, 1),\n            nn.ReLU(),\n            nn.Conv2D(c_hidden, c_out, 3, stride_3x3, padding_3x3), nn.ReLU())\n\n    def forward(self, x):\n        out = [x]\n        for conv_layer in self.convs:\n            out.append(conv_layer(out[-1]))\n        return out\n\n\n@register\nclass SSDHead(nn.Layer):\n    \"\"\"\n    SSDHead\n\n    Args:\n        num_classes (int): Number of classes\n        in_channels (list): Number of channels per input feature\n        anchor_generator (dict): Configuration of 
'AnchorGeneratorSSD' instance\n        kernel_size (int): Conv kernel size\n        padding (int): Conv padding\n        use_sepconv (bool): Use SepConvLayer if True\n        conv_decay (float): Conv regularization coefficient\n        loss (object): 'SSDLoss' instance\n        use_extra_head (bool): If ResNet34 is used as the backbone, `use_extra_head` should be set to True\n    \"\"\"\n\n    __shared__ = ['num_classes']\n    __inject__ = ['anchor_generator', 'loss']\n\n    def __init__(self,\n                 num_classes=80,\n                 in_channels=(512, 1024, 512, 256, 256, 256),\n                 anchor_generator=_get_class_default_kwargs(AnchorGeneratorSSD),\n                 kernel_size=3,\n                 padding=1,\n                 use_sepconv=False,\n                 conv_decay=0.,\n                 loss='SSDLoss',\n                 use_extra_head=False):\n        super(SSDHead, self).__init__()\n        # add background class\n        self.num_classes = num_classes + 1\n        self.in_channels = in_channels\n        self.anchor_generator = anchor_generator\n        self.loss = loss\n        self.use_extra_head = use_extra_head\n\n        if self.use_extra_head:\n            self.ssd_extra_head = SSDExtraHead()\n            self.in_channels = [256, 512, 512, 256, 256, 256]\n\n        if isinstance(anchor_generator, dict):\n            self.anchor_generator = AnchorGeneratorSSD(**anchor_generator)\n\n        self.num_priors = self.anchor_generator.num_priors\n        self.box_convs = []\n        self.score_convs = []\n        for i, num_prior in enumerate(self.num_priors):\n            box_conv_name = \"boxes{}\".format(i)\n            if not use_sepconv:\n                box_conv = self.add_sublayer(\n                    box_conv_name,\n                    nn.Conv2D(\n                        in_channels=self.in_channels[i],\n                        out_channels=num_prior * 4,\n                        kernel_size=kernel_size,\n                        padding=padding))\n            else:\n                box_conv = self.add_sublayer(\n                    box_conv_name,\n                    SepConvLayer(\n                        in_channels=self.in_channels[i],\n                        out_channels=num_prior * 4,\n                        kernel_size=kernel_size,\n                        padding=padding,\n                        conv_decay=conv_decay))\n            self.box_convs.append(box_conv)\n\n            score_conv_name = \"scores{}\".format(i)\n            if not use_sepconv:\n                score_conv = self.add_sublayer(\n                    score_conv_name,\n                    nn.Conv2D(\n                        in_channels=self.in_channels[i],\n                        out_channels=num_prior * self.num_classes,\n                        kernel_size=kernel_size,\n                        padding=padding))\n            else:\n                score_conv = self.add_sublayer(\n                    score_conv_name,\n                    SepConvLayer(\n                        in_channels=self.in_channels[i],\n                        out_channels=num_prior * self.num_classes,\n                        kernel_size=kernel_size,\n                        padding=padding,\n                        conv_decay=conv_decay))\n            self.score_convs.append(score_conv)\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_channels': [i.channels for i in input_shape], }\n\n    def forward(self, feats, image, gt_bbox=None, gt_class=None):\n        if 
self.use_extra_head:\n            assert len(feats) == 1, \\\n                (\"If you set use_extra_head=True, backbone feature \"\n                 \"list length should be 1.\")\n            feats = self.ssd_extra_head(feats[0])\n        box_preds = []\n        cls_scores = []\n        for feat, box_conv, score_conv in zip(feats, self.box_convs,\n                                              self.score_convs):\n            box_pred = box_conv(feat)\n            box_pred = paddle.transpose(box_pred, [0, 2, 3, 1])\n            box_pred = paddle.reshape(box_pred, [0, -1, 4])\n            box_preds.append(box_pred)\n\n            cls_score = score_conv(feat)\n            cls_score = paddle.transpose(cls_score, [0, 2, 3, 1])\n            cls_score = paddle.reshape(cls_score, [0, -1, self.num_classes])\n            cls_scores.append(cls_score)\n\n        prior_boxes = self.anchor_generator(feats, image)\n\n        if self.training:\n            return self.get_loss(box_preds, cls_scores, gt_bbox, gt_class,\n                                 prior_boxes)\n        else:\n            return (box_preds, cls_scores), prior_boxes\n\n    def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes):\n        return self.loss(boxes, scores, gt_bbox, gt_class, prior_boxes)\n"
  },
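  {
    "path": "examples/hypothetical/ssd_head_reshape_sketch.py",
    "content": "# Hypothetical illustration file, NOT part of PaddleDetection.\n# Shows the transpose + reshape used in SSDHead.forward to turn one\n# level's box prediction map of shape [N, num_prior * 4, H, W] into\n# per-anchor predictions of shape [N, H * W * num_prior, 4].\nimport paddle\n\nn, num_prior, h, w = 2, 6, 10, 10\nbox_pred = paddle.rand([n, num_prior * 4, h, w])\nbox_pred = paddle.transpose(box_pred, [0, 2, 3, 1])  # [N, H, W, num_prior * 4]\n# a 0 in the target shape keeps the corresponding input dim (the batch)\nbox_pred = paddle.reshape(box_pred, [0, -1, 4])\nprint(box_pred.shape)  # [2, 600, 4]\n"
  },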
  {
    "path": "ppdet/modeling/heads/tood_head.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import Constant\n\nfrom ppdet.core.workspace import register\nfrom ..initializer import normal_, constant_, bias_init_with_prob\nfrom ppdet.modeling.bbox_utils import bbox_center, batch_distance2bbox\nfrom ..losses import GIoULoss\nfrom ppdet.modeling.layers import ConvNormLayer\nfrom ppdet.modeling.ops import get_static_shape\nfrom ppdet.modeling.assigners.utils import generate_anchors_for_grid_cell\n\n\nclass ScaleReg(nn.Layer):\n    \"\"\"\n    Parameter for scaling the regression outputs.\n    \"\"\"\n\n    def __init__(self, init_scale=1.):\n        super(ScaleReg, self).__init__()\n        self.scale_reg = self.create_parameter(\n            shape=[1],\n            attr=ParamAttr(initializer=Constant(value=init_scale)),\n            dtype=\"float32\")\n\n    def forward(self, inputs):\n        out = inputs * self.scale_reg\n        return out\n\n\nclass TaskDecomposition(nn.Layer):\n    \"\"\"This code is based on\n        https://github.com/fcjian/TOOD/blob/master/mmdet/models/dense_heads/tood_head.py\n    \"\"\"\n\n    def __init__(\n            self,\n            feat_channels,\n            stacked_convs,\n            la_down_rate=8,\n            norm_type='gn',\n            norm_groups=32, ):\n        super(TaskDecomposition, self).__init__()\n        self.feat_channels = feat_channels\n        self.stacked_convs = stacked_convs\n        self.norm_type = norm_type\n        self.norm_groups = norm_groups\n        self.in_channels = self.feat_channels * self.stacked_convs\n        self.la_conv1 = nn.Conv2D(self.in_channels,\n                                  self.in_channels // la_down_rate, 1)\n        self.la_conv2 = nn.Conv2D(self.in_channels // la_down_rate,\n                                  self.stacked_convs, 1)\n\n        self.reduction_conv = ConvNormLayer(\n            self.in_channels,\n            self.feat_channels,\n            filter_size=1,\n            stride=1,\n            norm_type=self.norm_type,\n            norm_groups=self.norm_groups)\n\n        self._init_weights()\n\n    def _init_weights(self):\n        normal_(self.la_conv1.weight, std=0.001)\n        normal_(self.la_conv2.weight, std=0.001)\n\n    def forward(self, feat, avg_feat):\n        feat_shape = get_static_shape(feat)\n        b = feat_shape[0:1]\n        h = feat_shape[2:3]\n        w = feat_shape[3:4]\n        weight = F.relu(self.la_conv1(avg_feat))\n        weight = F.sigmoid(self.la_conv2(weight)).unsqueeze(-1)\n        feat = paddle.reshape(\n            feat, [b, self.stacked_convs, self.feat_channels, h, w]) * weight\n        feat = self.reduction_conv(feat.flatten(1, 2))\n        feat = F.relu(feat)\n        
return feat\n\n\n@register\nclass TOODHead(nn.Layer):\n    \"\"\"This code is based on\n        https://github.com/fcjian/TOOD/blob/master/mmdet/models/dense_heads/tood_head.py\n    \"\"\"\n    __inject__ = ['nms', 'static_assigner', 'assigner']\n    __shared__ = ['num_classes']\n\n    def __init__(self,\n                 num_classes=80,\n                 feat_channels=256,\n                 stacked_convs=6,\n                 fpn_strides=(8, 16, 32, 64, 128),\n                 grid_cell_scale=8,\n                 grid_cell_offset=0.5,\n                 norm_type='gn',\n                 norm_groups=32,\n                 static_assigner_epoch=4,\n                 use_align_head=True,\n                 loss_weight={\n                     'class': 1.0,\n                     'bbox': 1.0,\n                     'iou': 2.0,\n                 },\n                 nms='MultiClassNMS',\n                 static_assigner='ATSSAssigner',\n                 assigner='TaskAlignedAssigner'):\n        super(TOODHead, self).__init__()\n        self.num_classes = num_classes\n        self.feat_channels = feat_channels\n        self.stacked_convs = stacked_convs\n        self.fpn_strides = fpn_strides\n        self.grid_cell_scale = grid_cell_scale\n        self.grid_cell_offset = grid_cell_offset\n        self.static_assigner_epoch = static_assigner_epoch\n        self.use_align_head = use_align_head\n        self.nms = nms\n        self.static_assigner = static_assigner\n        self.assigner = assigner\n        self.loss_weight = loss_weight\n        self.giou_loss = GIoULoss()\n\n        self.inter_convs = nn.LayerList()\n        for i in range(self.stacked_convs):\n            self.inter_convs.append(\n                ConvNormLayer(\n                    self.feat_channels,\n                    self.feat_channels,\n                    filter_size=3,\n                    stride=1,\n                    norm_type=norm_type,\n                    norm_groups=norm_groups))\n\n        self.cls_decomp = TaskDecomposition(\n            self.feat_channels,\n            self.stacked_convs,\n            self.stacked_convs * 8,\n            norm_type=norm_type,\n            norm_groups=norm_groups)\n        self.reg_decomp = TaskDecomposition(\n            self.feat_channels,\n            self.stacked_convs,\n            self.stacked_convs * 8,\n            norm_type=norm_type,\n            norm_groups=norm_groups)\n\n        self.tood_cls = nn.Conv2D(\n            self.feat_channels, self.num_classes, 3, padding=1)\n        self.tood_reg = nn.Conv2D(self.feat_channels, 4, 3, padding=1)\n\n        if self.use_align_head:\n            self.cls_prob_conv1 = nn.Conv2D(self.feat_channels *\n                                            self.stacked_convs,\n                                            self.feat_channels // 4, 1)\n            self.cls_prob_conv2 = nn.Conv2D(\n                self.feat_channels // 4, 1, 3, padding=1)\n            self.reg_offset_conv1 = nn.Conv2D(self.feat_channels *\n                                              self.stacked_convs,\n                                              self.feat_channels // 4, 1)\n            self.reg_offset_conv2 = nn.Conv2D(\n                self.feat_channels // 4, 4 * 2, 3, padding=1)\n\n        self.scales_regs = nn.LayerList([ScaleReg() for _ in self.fpn_strides])\n\n        self._init_weights()\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {\n            'feat_channels': input_shape[0].channels,\n            'fpn_strides': 
[i.stride for i in input_shape],\n        }\n\n    def _init_weights(self):\n        bias_cls = bias_init_with_prob(0.01)\n        normal_(self.tood_cls.weight, std=0.01)\n        constant_(self.tood_cls.bias, bias_cls)\n        normal_(self.tood_reg.weight, std=0.01)\n\n        if self.use_align_head:\n            normal_(self.cls_prob_conv1.weight, std=0.01)\n            normal_(self.cls_prob_conv2.weight, std=0.01)\n            constant_(self.cls_prob_conv2.bias, bias_cls)\n            normal_(self.reg_offset_conv1.weight, std=0.001)\n            constant_(self.reg_offset_conv2.weight)\n            constant_(self.reg_offset_conv2.bias)\n\n    def _reg_grid_sample(self, feat, offset, anchor_points):\n        feat_shape = get_static_shape(feat)\n        b = feat_shape[0:1]\n        h = feat_shape[2:3]\n        w = feat_shape[3:4]\n        feat = paddle.reshape(feat, [-1, 1, h, w])\n        offset = paddle.reshape(offset, [-1, 2, h, w]).transpose([0, 2, 3, 1])\n        grid_shape = paddle.concat([w, h]).astype('float32')\n        grid = (offset + anchor_points) / grid_shape\n        grid = 2 * grid.clip(0., 1.) - 1\n        feat = F.grid_sample(feat, grid)\n        feat = paddle.reshape(feat, [b, -1, h, w])\n        return feat\n\n    def forward(self, feats):\n        assert len(feats) == len(self.fpn_strides), \\\n            \"The size of feats is not equal to size of fpn_strides\"\n\n        anchors, anchor_points, num_anchors_list, stride_tensor =\\\n            generate_anchors_for_grid_cell(\n            feats, self.fpn_strides, self.grid_cell_scale,\n            self.grid_cell_offset)\n        anchor_centers_split = paddle.split(anchor_points / stride_tensor,\n                                            num_anchors_list)\n\n        cls_score_list, bbox_pred_list = [], []\n        for feat, scale_reg, anchor_centers, stride in zip(\n                feats, self.scales_regs, anchor_centers_split,\n                self.fpn_strides):\n            b, _, h, w = get_static_shape(feat)\n            inter_feats = []\n            for inter_conv in self.inter_convs:\n                feat = F.relu(inter_conv(feat))\n                inter_feats.append(feat)\n            feat = paddle.concat(inter_feats, axis=1)\n\n            # task decomposition\n            avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))\n            cls_feat = self.cls_decomp(feat, avg_feat)\n            reg_feat = self.reg_decomp(feat, avg_feat)\n\n            # cls prediction and alignment\n            cls_logits = self.tood_cls(cls_feat)\n            if self.use_align_head:\n                cls_prob = F.relu(self.cls_prob_conv1(feat))\n                cls_prob = F.sigmoid(self.cls_prob_conv2(cls_prob))\n                cls_score = (F.sigmoid(cls_logits) * cls_prob).sqrt()\n            else:\n                cls_score = F.sigmoid(cls_logits)\n            cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1]))\n\n            # reg prediction and alignment\n            reg_dist = scale_reg(self.tood_reg(reg_feat).exp())\n            reg_dist = reg_dist.flatten(2).transpose([0, 2, 1])\n            reg_bbox = batch_distance2bbox(\n                anchor_centers.unsqueeze(0), reg_dist)\n            if self.use_align_head:\n                reg_offset = F.relu(self.reg_offset_conv1(feat))\n                reg_offset = self.reg_offset_conv2(reg_offset)\n                reg_bbox = reg_bbox.transpose([0, 2, 1]).reshape([b, 4, h, w])\n                anchor_centers = anchor_centers.reshape([1, h, w, 2])\n                
bbox_pred = self._reg_grid_sample(reg_bbox, reg_offset,\n                                                  anchor_centers)\n                bbox_pred = bbox_pred.flatten(2).transpose([0, 2, 1])\n            else:\n                bbox_pred = reg_bbox\n\n            if not self.training:\n                bbox_pred *= stride\n            bbox_pred_list.append(bbox_pred)\n        cls_score_list = paddle.concat(cls_score_list, axis=1)\n        bbox_pred_list = paddle.concat(bbox_pred_list, axis=1)\n\n        return cls_score_list, bbox_pred_list, anchors, num_anchors_list, stride_tensor\n\n    @staticmethod\n    def _focal_loss(score, label, alpha=0.25, gamma=2.0):\n        weight = (score - label).pow(gamma)\n        if alpha > 0:\n            alpha_t = alpha * label + (1 - alpha) * (1 - label)\n            weight *= alpha_t\n        loss = F.binary_cross_entropy(\n            score, label, weight=weight, reduction='sum')\n        return loss\n\n    def get_loss(self, head_outs, gt_meta):\n        pred_scores, pred_bboxes, anchors, \\\n        num_anchors_list, stride_tensor = head_outs\n        gt_labels = gt_meta['gt_class']\n        gt_bboxes = gt_meta['gt_bbox']\n        pad_gt_mask = gt_meta['pad_gt_mask']\n        # label assignment\n        if gt_meta['epoch_id'] < self.static_assigner_epoch:\n            assigned_labels, assigned_bboxes, assigned_scores = self.static_assigner(\n                anchors,\n                num_anchors_list,\n                gt_labels,\n                gt_bboxes,\n                pad_gt_mask,\n                bg_index=self.num_classes)\n            alpha_l = 0.25\n        else:\n            assigned_labels, assigned_bboxes, assigned_scores = self.assigner(\n                pred_scores.detach(),\n                pred_bboxes.detach() * stride_tensor,\n                bbox_center(anchors),\n                num_anchors_list,\n                gt_labels,\n                gt_bboxes,\n                pad_gt_mask,\n                bg_index=self.num_classes)\n            alpha_l = -1\n\n        # rescale bbox\n        assigned_bboxes /= stride_tensor\n        # classification loss\n        loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha=alpha_l)\n        # select positive samples mask\n        mask_positive = (assigned_labels != self.num_classes)\n        num_pos = mask_positive.astype(paddle.float32).sum()\n        # bbox regression loss\n        if num_pos > 0:\n            bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4])\n            pred_bboxes_pos = paddle.masked_select(pred_bboxes,\n                                                   bbox_mask).reshape([-1, 4])\n            assigned_bboxes_pos = paddle.masked_select(\n                assigned_bboxes, bbox_mask).reshape([-1, 4])\n            bbox_weight = paddle.masked_select(\n                assigned_scores.sum(-1), mask_positive).unsqueeze(-1)\n            # iou loss\n            loss_iou = self.giou_loss(pred_bboxes_pos,\n                                      assigned_bboxes_pos) * bbox_weight\n            loss_iou = loss_iou.sum() / bbox_weight.sum()\n            # l1 loss\n            loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos)\n        else:\n            loss_iou = paddle.zeros([])\n            loss_l1 = paddle.zeros([])\n\n        loss_cls /= assigned_scores.sum().clip(min=1)\n        loss = self.loss_weight['class'] * loss_cls + self.loss_weight[\n            'iou'] * loss_iou\n\n        return {\n            'loss': loss,\n            'loss_class': loss_cls,\n       
     'loss_iou': loss_iou,\n            'loss_l1': loss_l1\n        }\n\n    def post_process(self, head_outs, img_shape, scale_factor):\n        pred_scores, pred_bboxes, _, _, _ = head_outs\n        pred_scores = pred_scores.transpose([0, 2, 1])\n\n        for i in range(len(pred_bboxes)):\n            pred_bboxes[i, :, 0] = pred_bboxes[i, :, 0].clip(\n                min=0, max=img_shape[i, 1])\n            pred_bboxes[i, :, 1] = pred_bboxes[i, :, 1].clip(\n                min=0, max=img_shape[i, 0])\n            pred_bboxes[i, :, 2] = pred_bboxes[i, :, 2].clip(\n                min=0, max=img_shape[i, 1])\n            pred_bboxes[i, :, 3] = pred_bboxes[i, :, 3].clip(\n                min=0, max=img_shape[i, 0])\n        # scale bbox to origin\n        scale_factor = scale_factor.flip([1]).tile([1, 2]).unsqueeze(1)\n        pred_bboxes /= scale_factor\n        bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores)\n        return bbox_pred, bbox_num\n"
  },
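  {
    "path": "examples/hypothetical/tood_alignment_sketch.py",
    "content": "# Hypothetical illustration file, NOT part of PaddleDetection.\n# The classification alignment used in TOODHead.forward when\n# use_align_head=True: the final score is the geometric mean of the\n# per-class probability and the spatial alignment probability.\nimport paddle\nimport paddle.nn.functional as F\n\ncls_logits = paddle.rand([1, 80, 8, 8])  # raw class logits\ncls_prob = paddle.rand([1, 1, 8, 8])     # alignment probability in (0, 1)\ncls_score = (F.sigmoid(cls_logits) * cls_prob).sqrt()\nprint(cls_score.shape)  # [1, 80, 8, 8]; cls_prob broadcasts over classes\n"
  },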
  {
    "path": "ppdet/modeling/heads/ttf_head.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import Constant, Normal\nfrom paddle.regularizer import L2Decay\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling.layers import DeformableConvV2, LiteConv\nimport numpy as np\n\n\n@register\nclass HMHead(nn.Layer):\n    \"\"\"\n    Args:\n        ch_in (int): The channel number of input Tensor.\n        ch_out (int): The channel number of output Tensor.\n        num_classes (int): Number of classes.\n        conv_num (int): The convolution number of hm_feat.\n        dcn_head(bool): whether use dcn in head. False by default. \n        lite_head(bool): whether use lite version. False by default.\n        norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional.\n            bn by default\n\n    Return:\n        Heatmap head output\n    \"\"\"\n    __shared__ = ['num_classes', 'norm_type']\n\n    def __init__(\n            self,\n            ch_in,\n            ch_out=128,\n            num_classes=80,\n            conv_num=2,\n            dcn_head=False,\n            lite_head=False,\n            norm_type='bn', ):\n        super(HMHead, self).__init__()\n        head_conv = nn.Sequential()\n        for i in range(conv_num):\n            name = 'conv.{}'.format(i)\n            if lite_head:\n                lite_name = 'hm.' 
+ name\n                head_conv.add_sublayer(\n                    lite_name,\n                    LiteConv(\n                        in_channels=ch_in if i == 0 else ch_out,\n                        out_channels=ch_out,\n                        norm_type=norm_type))\n            else:\n                if dcn_head:\n                    head_conv.add_sublayer(\n                        name,\n                        DeformableConvV2(\n                            in_channels=ch_in if i == 0 else ch_out,\n                            out_channels=ch_out,\n                            kernel_size=3,\n                            weight_attr=ParamAttr(initializer=Normal(0, 0.01))))\n                else:\n                    head_conv.add_sublayer(\n                        name,\n                        nn.Conv2D(\n                            in_channels=ch_in if i == 0 else ch_out,\n                            out_channels=ch_out,\n                            kernel_size=3,\n                            padding=1,\n                            weight_attr=ParamAttr(initializer=Normal(0, 0.01)),\n                            bias_attr=ParamAttr(\n                                learning_rate=2., regularizer=L2Decay(0.))))\n                head_conv.add_sublayer(name + '.act', nn.ReLU())\n        self.feat = head_conv\n        bias_init = float(-np.log((1 - 0.01) / 0.01))\n        weight_attr = None if lite_head else ParamAttr(initializer=Normal(0,\n                                                                          0.01))\n        self.head = nn.Conv2D(\n            in_channels=ch_out,\n            out_channels=num_classes,\n            kernel_size=1,\n            weight_attr=weight_attr,\n            bias_attr=ParamAttr(\n                learning_rate=2.,\n                regularizer=L2Decay(0.),\n                initializer=Constant(bias_init)))\n\n    def forward(self, feat):\n        out = self.feat(feat)\n        out = self.head(out)\n        return out\n\n\n@register\nclass WHHead(nn.Layer):\n    \"\"\"\n    Args:\n        ch_in (int): The channel number of input Tensor.\n        ch_out (int): The channel number of output Tensor.\n        conv_num (int): The number of convolution layers in wh_feat.\n        dcn_head(bool): whether to use dcn in head. False by default.\n        lite_head(bool): whether to use the lite version. False by default.\n        norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional.\n            bn by default\n    Return:\n        Width & Height head output\n    \"\"\"\n    __shared__ = ['norm_type']\n\n    def __init__(self,\n                 ch_in,\n                 ch_out=64,\n                 conv_num=2,\n                 dcn_head=False,\n                 lite_head=False,\n                 norm_type='bn'):\n        super(WHHead, self).__init__()\n        head_conv = nn.Sequential()\n        for i in range(conv_num):\n            name = 'conv.{}'.format(i)\n            if lite_head:\n                lite_name = 'wh.' 
+ name\n                head_conv.add_sublayer(\n                    lite_name,\n                    LiteConv(\n                        in_channels=ch_in if i == 0 else ch_out,\n                        out_channels=ch_out,\n                        norm_type=norm_type))\n            else:\n                if dcn_head:\n                    head_conv.add_sublayer(\n                        name,\n                        DeformableConvV2(\n                            in_channels=ch_in if i == 0 else ch_out,\n                            out_channels=ch_out,\n                            kernel_size=3,\n                            weight_attr=ParamAttr(initializer=Normal(0, 0.01))))\n                else:\n                    head_conv.add_sublayer(\n                        name,\n                        nn.Conv2D(\n                            in_channels=ch_in if i == 0 else ch_out,\n                            out_channels=ch_out,\n                            kernel_size=3,\n                            padding=1,\n                            weight_attr=ParamAttr(initializer=Normal(0, 0.01)),\n                            bias_attr=ParamAttr(\n                                learning_rate=2., regularizer=L2Decay(0.))))\n                head_conv.add_sublayer(name + '.act', nn.ReLU())\n\n        weight_attr = None if lite_head else ParamAttr(initializer=Normal(0,\n                                                                          0.01))\n        self.feat = head_conv\n        self.head = nn.Conv2D(\n            in_channels=ch_out,\n            out_channels=4,\n            kernel_size=1,\n            weight_attr=weight_attr,\n            bias_attr=ParamAttr(\n                learning_rate=2., regularizer=L2Decay(0.)))\n\n    def forward(self, feat):\n        out = self.feat(feat)\n        out = self.head(out)\n        out = F.relu(out)\n        return out\n\n\n@register\nclass TTFHead(nn.Layer):\n    \"\"\"\n    TTFHead\n    Args:\n        in_channels (int): the channel number of input to TTFHead.\n        num_classes (int): the number of classes, 80 by default.\n        hm_head_planes (int): the channel number in heatmap head,\n            128 by default.\n        wh_head_planes (int): the channel number in width & height head,\n            64 by default.\n        hm_head_conv_num (int): the number of convolution layers in heatmap\n            head, 2 by default.\n        wh_head_conv_num (int): the number of convolution layers in width &\n            height head, 2 by default.\n        hm_loss (object): Instance of 'CTFocalLoss'.\n        wh_loss (object): Instance of 'GIoULoss'.\n        wh_offset_base (float): the base offset of width and height,\n            16.0 by default.\n        down_ratio (int): the actual down_ratio is calculated by base_down_ratio\n            (default 16) and the number of upsample layers.\n        dcn_head(bool): whether to use dcn in head. False by default.\n        lite_head(bool): whether to use the lite version. 
False by default.\n        norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional.\n            bn by default\n        ags_module(bool): whether to use the AGS module to reweight location\n            features. False by default.\n\n    \"\"\"\n\n    __shared__ = ['num_classes', 'down_ratio', 'norm_type']\n    __inject__ = ['hm_loss', 'wh_loss']\n\n    def __init__(self,\n                 in_channels,\n                 num_classes=80,\n                 hm_head_planes=128,\n                 wh_head_planes=64,\n                 hm_head_conv_num=2,\n                 wh_head_conv_num=2,\n                 hm_loss='CTFocalLoss',\n                 wh_loss='GIoULoss',\n                 wh_offset_base=16.,\n                 down_ratio=4,\n                 dcn_head=False,\n                 lite_head=False,\n                 norm_type='bn',\n                 ags_module=False):\n        super(TTFHead, self).__init__()\n        self.in_channels = in_channels\n        self.hm_head = HMHead(in_channels, hm_head_planes, num_classes,\n                              hm_head_conv_num, dcn_head, lite_head, norm_type)\n        self.wh_head = WHHead(in_channels, wh_head_planes, wh_head_conv_num,\n                              dcn_head, lite_head, norm_type)\n        self.hm_loss = hm_loss\n        self.wh_loss = wh_loss\n\n        self.wh_offset_base = wh_offset_base\n        self.down_ratio = down_ratio\n        self.ags_module = ags_module\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        if isinstance(input_shape, (list, tuple)):\n            input_shape = input_shape[0]\n        return {'in_channels': input_shape.channels, }\n\n    def forward(self, feats):\n        hm = self.hm_head(feats)\n        wh = self.wh_head(feats) * self.wh_offset_base\n        return hm, wh\n\n    def filter_box_by_weight(self, pred, target, weight):\n        \"\"\"\n        Filter out boxes where ttf_reg_weight is 0, only keep positive samples.\n        \"\"\"\n        index = paddle.nonzero(weight > 0)\n        index.stop_gradient = True\n        weight = paddle.gather_nd(weight, index)\n        pred = paddle.gather_nd(pred, index)\n        target = paddle.gather_nd(target, index)\n        return pred, target, weight\n\n    def filter_loc_by_weight(self, score, weight):\n        index = paddle.nonzero(weight > 0)\n        index.stop_gradient = True\n        score = paddle.gather_nd(score, index)\n        return score\n\n    def get_loss(self, pred_hm, pred_wh, target_hm, box_target, target_weight):\n        pred_hm = paddle.clip(F.sigmoid(pred_hm), 1e-4, 1 - 1e-4)\n        hm_loss = self.hm_loss(pred_hm, target_hm)\n        H, W = target_hm.shape[2:]\n        mask = paddle.reshape(target_weight, [-1, H, W])\n        avg_factor = paddle.sum(mask) + 1e-4\n\n        base_step = self.down_ratio\n        shifts_x = paddle.arange(0, W * base_step, base_step, dtype='int32')\n        shifts_y = paddle.arange(0, H * base_step, base_step, dtype='int32')\n        shift_y, shift_x = paddle.tensor.meshgrid([shifts_y, shifts_x])\n        base_loc = paddle.stack([shift_x, shift_y], axis=0)\n        base_loc.stop_gradient = True\n\n        pred_boxes = paddle.concat(\n            [\n                0 - pred_wh[:, 0:2, :, :] + base_loc.astype(pred_wh.dtype),\n                pred_wh[:, 2:4] + base_loc.astype(pred_wh.dtype)\n            ],\n            axis=1)\n        pred_boxes = paddle.transpose(pred_boxes, [0, 2, 3, 1])\n        boxes = paddle.transpose(box_target, [0, 2, 3, 1])\n        boxes.stop_gradient = True\n\n        if self.ags_module:\n            
pred_hm_max = paddle.max(pred_hm, axis=1, keepdim=True)\n            pred_hm_max_softmax = F.softmax(pred_hm_max, axis=1)\n            pred_hm_max_softmax = paddle.transpose(pred_hm_max_softmax,\n                                                   [0, 2, 3, 1])\n            pred_hm_max_softmax = self.filter_loc_by_weight(pred_hm_max_softmax,\n                                                            mask)\n        else:\n            pred_hm_max_softmax = None\n\n        pred_boxes, boxes, mask = self.filter_box_by_weight(pred_boxes, boxes,\n                                                            mask)\n        mask.stop_gradient = True\n        wh_loss = self.wh_loss(\n            pred_boxes,\n            boxes,\n            iou_weight=mask.unsqueeze(1),\n            loc_reweight=pred_hm_max_softmax)\n        wh_loss = wh_loss / avg_factor\n\n        ttf_loss = {'hm_loss': hm_loss, 'wh_loss': wh_loss}\n        return ttf_loss\n"
  },
  {
    "path": "ppdet/modeling/heads/vitpose_head.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\n\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling.keypoint_utils import resize, flip_back\nfrom paddle.nn.initializer import TruncatedNormal, Constant, Normal\nfrom ppdet.modeling.layers import ConvTranspose2d, BatchNorm2d\n\ntrunc_normal_ = TruncatedNormal(std=.02)\nnormal_ = Normal(std=0.001)\nzeros_ = Constant(value=0.)\nones_ = Constant(value=1.)\n\n__all__ = ['TopdownHeatmapSimpleHead']\n\n\n@register\nclass TopdownHeatmapSimpleHead(nn.Layer):\n    def __init__(self,\n                 in_channels=768,\n                 out_channels=17,\n                 num_deconv_layers=3,\n                 num_deconv_filters=(256, 256, 256),\n                 num_deconv_kernels=(4, 4, 4),\n                 extra=None,\n                 in_index=0,\n                 input_transform=None,\n                 align_corners=False,\n                 upsample=0,\n                 flip_pairs=None,\n                 shift_heatmap=False,\n                 target_type='GaussianHeatmap'):\n        super(TopdownHeatmapSimpleHead, self).__init__()\n\n        self.in_channels = in_channels\n        self.upsample = upsample\n        self.flip_pairs = flip_pairs\n        self.shift_heatmap = shift_heatmap\n        self.target_type = target_type\n\n        self._init_inputs(in_channels, in_index, input_transform)\n        self.in_index = in_index\n        self.align_corners = align_corners\n\n        if extra is not None and not isinstance(extra, dict):\n            raise TypeError('extra should be dict or None.')\n\n        if num_deconv_layers > 0:\n            self.deconv_layers = self._make_deconv_layer(\n                num_deconv_layers,\n                num_deconv_filters,\n                num_deconv_kernels, )\n        elif num_deconv_layers == 0:\n            self.deconv_layers = nn.Identity()\n        else:\n            raise ValueError(\n                f'num_deconv_layers ({num_deconv_layers}) should >= 0.')\n\n        identity_final_layer = False\n        if extra is not None and 'final_conv_kernel' in extra:\n            assert extra['final_conv_kernel'] in [0, 1, 3]\n            if extra['final_conv_kernel'] == 3:\n                padding = 1\n            elif extra['final_conv_kernel'] == 1:\n                padding = 0\n            else:\n                # 0 for Identity mapping.\n                identity_final_layer = True\n            kernel_size = extra['final_conv_kernel']\n        else:\n            kernel_size = 1\n            padding = 0\n\n        if identity_final_layer:\n            self.final_layer = nn.Identity()\n        else:\n            conv_channels = num_deconv_filters[\n                -1] if num_deconv_layers > 0 else self.in_channels\n\n            layers = []\n            if extra is not None:\n                num_conv_layers = 
extra.get('num_conv_layers', 0)\n                num_conv_kernels = extra.get('num_conv_kernels',\n                                             [1] * num_conv_layers)\n\n                for i in range(num_conv_layers):\n                    layers.append(\n                        nn.Conv2D(\n                            in_channels=conv_channels,\n                            out_channels=conv_channels,\n                            kernel_size=num_conv_kernels[i],\n                            stride=1,\n                            padding=(num_conv_kernels[i] - 1) // 2))\n                    layers.append(nn.BatchNorm2D(conv_channels))\n                    layers.append(nn.ReLU())\n\n            layers.append(\n                nn.Conv2D(\n                    in_channels=conv_channels,\n                    out_channels=out_channels,\n                    kernel_size=kernel_size,\n                    stride=1,\n                    padding=(padding, padding)))\n\n            if len(layers) > 1:\n                self.final_layer = nn.Sequential(*layers)\n            else:\n                self.final_layer = layers[0]\n\n        self.init_weights()\n\n    @staticmethod\n    def _get_deconv_cfg(deconv_kernel):\n        \"\"\"Get configurations for deconv layers.\"\"\"\n        if deconv_kernel == 4:\n            padding = 1\n            output_padding = 0\n        elif deconv_kernel == 3:\n            padding = 1\n            output_padding = 1\n        elif deconv_kernel == 2:\n            padding = 0\n            output_padding = 0\n        else:\n            raise ValueError(f'Not supported num_kernels ({deconv_kernel}).')\n\n        return deconv_kernel, padding, output_padding\n
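\n    # Sanity check: with stride 2 in _make_deconv_layer, the deconv output\n    # size is (in - 1) * 2 - 2 * padding + kernel + output_padding, so the\n    # configs above (4 -> (1, 0), 3 -> (1, 1), 2 -> (0, 0)) each give exactly\n    # 2x upsampling, e.g. 16 -> 32.\n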
\n    def _init_inputs(self, in_channels, in_index, input_transform):\n        \"\"\"Check and initialize input transforms.\n        \"\"\"\n\n        if input_transform is not None:\n            assert input_transform in ['resize_concat', 'multiple_select']\n        self.input_transform = input_transform\n        self.in_index = in_index\n        if input_transform is not None:\n            assert isinstance(in_channels, (list, tuple))\n            assert isinstance(in_index, (list, tuple))\n            assert len(in_channels) == len(in_index)\n            if input_transform == 'resize_concat':\n                self.in_channels = sum(in_channels)\n            else:\n                self.in_channels = in_channels\n        else:\n            assert isinstance(in_channels, int)\n            assert isinstance(in_index, int)\n            self.in_channels = in_channels\n\n    def _transform_inputs(self, inputs):\n        \"\"\"Transform inputs for decoder.\n        \"\"\"\n        if not isinstance(inputs, list):\n            if self.upsample > 0:\n                inputs = resize(\n                    input=F.relu(inputs),\n                    scale_factor=self.upsample,\n                    mode='bilinear',\n                    align_corners=self.align_corners)\n            return inputs\n\n        if self.input_transform == 'resize_concat':\n            inputs = [inputs[i] for i in self.in_index]\n            upsampled_inputs = [\n                resize(\n                    input=x,\n                    size=inputs[0].shape[2:],\n                    mode='bilinear',\n                    align_corners=self.align_corners) for x in inputs\n            ]\n            inputs = paddle.concat(upsampled_inputs, axis=1)\n        elif self.input_transform == 'multiple_select':\n            inputs = [inputs[i] for i in self.in_index]\n        else:\n            inputs = inputs[self.in_index]\n\n        return inputs\n\n    def forward(self, x):\n        \"\"\"Forward function.\"\"\"\n        x = self._transform_inputs(x)\n        x = self.deconv_layers(x)\n        x = self.final_layer(x)\n\n        return x\n\n    def inference_model(self, x, flip_pairs=None):\n        \"\"\"Inference function.\n\n        Args:\n            x (paddle.Tensor[N,K,H,W]): Input features.\n            flip_pairs (None | list[tuple]):\n                Pairs of keypoints which are mirrored.\n\n        Returns:\n            output_heatmap (paddle.Tensor): Output heatmaps.\n        \"\"\"\n        output = self.forward(x)\n\n        if flip_pairs is not None:\n            output_heatmap = flip_back(\n                output, flip_pairs, target_type=self.target_type)\n            # feature is not aligned, shift flipped heatmap for higher accuracy\n            if self.shift_heatmap:\n                output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1]\n        else:\n            output_heatmap = output\n        return output_heatmap\n\n    def _make_deconv_layer(self, num_layers, num_filters, num_kernels):\n        \"\"\"Make deconv layers.\"\"\"\n        if num_layers != len(num_filters):\n            error_msg = f'num_layers({num_layers}) ' \\\n                        f'!= length of num_filters({len(num_filters)})'\n            raise ValueError(error_msg)\n        if num_layers != len(num_kernels):\n            error_msg = f'num_layers({num_layers}) ' \\\n                        f'!= length of num_kernels({len(num_kernels)})'\n            raise ValueError(error_msg)\n\n        layers = []\n        for i in range(num_layers):\n            kernel, padding, output_padding = \\\n                self._get_deconv_cfg(num_kernels[i])\n\n            planes = num_filters[i]\n            layers.append(\n                ConvTranspose2d(\n                    in_channels=self.in_channels,\n                    out_channels=planes,\n                    kernel_size=kernel,\n                    stride=2,\n                    padding=padding,\n                    output_padding=output_padding,\n                    bias=False))\n            layers.append(nn.BatchNorm2D(planes))\n            layers.append(nn.ReLU())\n            self.in_channels = planes\n\n        return nn.Sequential(*layers)\n\n    def init_weights(self):\n        \"\"\"Initialize model weights.\"\"\"\n        if not isinstance(self.deconv_layers, nn.Identity):\n            for m in self.deconv_layers:\n                if isinstance(m, nn.BatchNorm2D):\n                    ones_(m.weight)\n                    zeros_(m.bias)\n        if not isinstance(self.final_layer, nn.Conv2D):\n            for m in self.final_layer:\n                if isinstance(m, nn.Conv2D):\n                    normal_(m.weight)\n                    zeros_(m.bias)\n                elif isinstance(m, nn.BatchNorm2D):\n                    ones_(m.weight)\n                    zeros_(m.bias)\n        else:\n            normal_(self.final_layer.weight)\n            zeros_(self.final_layer.bias)\n"
  },
  {
    "path": "ppdet/modeling/heads/yolo_head.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\nfrom ppdet.core.workspace import register\n\nimport math\nimport numpy as np\nfrom ..initializer import bias_init_with_prob, constant_\nfrom ..backbones.csp_darknet import BaseConv, DWConv\nfrom ..losses import IouLoss\nfrom ppdet.modeling.assigners.simota_assigner import SimOTAAssigner\nfrom ppdet.modeling.bbox_utils import bbox_overlaps\nfrom ppdet.modeling.layers import MultiClassNMS\n\n__all__ = ['YOLOv3Head', 'YOLOXHead']\n\n\ndef _de_sigmoid(x, eps=1e-7):\n    x = paddle.clip(x, eps, 1. / eps)\n    x = paddle.clip(1. / x - 1., eps, 1. / eps)\n    x = -paddle.log(x)\n    return x\n\n\n@register\nclass YOLOv3Head(nn.Layer):\n    __shared__ = ['num_classes', 'data_format']\n    __inject__ = ['loss']\n\n    def __init__(self,\n                 in_channels=[1024, 512, 256],\n                 anchors=[[10, 13], [16, 30], [33, 23], [30, 61], [62, 45],\n                          [59, 119], [116, 90], [156, 198], [373, 326]],\n                 anchor_masks=[[6, 7, 8], [3, 4, 5], [0, 1, 2]],\n                 num_classes=80,\n                 loss='YOLOv3Loss',\n                 iou_aware=False,\n                 iou_aware_factor=0.4,\n                 data_format='NCHW'):\n        \"\"\"\n        Head for YOLOv3 network\n\n        Args:\n            num_classes (int): number of foreground classes\n            anchors (list): anchors\n            anchor_masks (list): anchor masks\n            loss (object): YOLOv3Loss instance\n            iou_aware (bool): whether to use iou_aware\n            iou_aware_factor (float): iou aware factor\n            data_format (str): data format, NCHW or NHWC\n        \"\"\"\n        super(YOLOv3Head, self).__init__()\n        assert len(in_channels) > 0, \"in_channels length should > 0\"\n        self.in_channels = in_channels\n        self.num_classes = num_classes\n        self.loss = loss\n\n        self.iou_aware = iou_aware\n        self.iou_aware_factor = iou_aware_factor\n\n        self.parse_anchor(anchors, anchor_masks)\n        self.num_outputs = len(self.anchors)\n        self.data_format = data_format\n\n        self.yolo_outputs = []\n        for i in range(len(self.anchors)):\n\n            if self.iou_aware:\n                num_filters = len(self.anchors[i]) * (self.num_classes + 6)\n            else:\n                num_filters = len(self.anchors[i]) * (self.num_classes + 5)\n            name = 'yolo_output.{}'.format(i)\n            conv = nn.Conv2D(\n                in_channels=self.in_channels[i],\n                out_channels=num_filters,\n                kernel_size=1,\n                stride=1,\n                padding=0,\n                data_format=data_format,\n                bias_attr=ParamAttr(regularizer=L2Decay(0.)))\n            conv.skip_quant = True\n 
           yolo_output = self.add_sublayer(name, conv)\n            self.yolo_outputs.append(yolo_output)\n\n    def parse_anchor(self, anchors, anchor_masks):\n        self.anchors = [[anchors[i] for i in mask] for mask in anchor_masks]\n        self.mask_anchors = []\n        anchor_num = len(anchors)\n        for masks in anchor_masks:\n            self.mask_anchors.append([])\n            for mask in masks:\n                assert mask < anchor_num, \"anchor mask index overflow\"\n                self.mask_anchors[-1].extend(anchors[mask])\n\n    def forward(self, feats, targets=None):\n        assert len(feats) == len(self.anchors)\n        yolo_outputs = []\n        for i, feat in enumerate(feats):\n            yolo_output = self.yolo_outputs[i](feat)\n            if self.data_format == 'NHWC':\n                yolo_output = paddle.transpose(yolo_output, [0, 3, 1, 2])\n            yolo_outputs.append(yolo_output)\n\n        if self.training:\n            return self.loss(yolo_outputs, targets, self.anchors)\n        else:\n            if self.iou_aware:\n                y = []\n                for i, out in enumerate(yolo_outputs):\n                    na = len(self.anchors[i])\n                    ioup, x = out[:, 0:na, :, :], out[:, na:, :, :]\n                    b, c, h, w = x.shape\n                    no = c // na\n                    x = x.reshape((b, na, no, h * w))\n                    ioup = ioup.reshape((b, na, 1, h * w))\n                    obj = x[:, :, 4:5, :]\n                    ioup = F.sigmoid(ioup)\n                    obj = F.sigmoid(obj)\n                    obj_t = (obj**(1 - self.iou_aware_factor)) * (\n                        ioup**self.iou_aware_factor)\n                    obj_t = _de_sigmoid(obj_t)\n                    loc_t = x[:, :, :4, :]\n                    cls_t = x[:, :, 5:, :]\n                    y_t = paddle.concat([loc_t, obj_t, cls_t], axis=2)\n                    y_t = y_t.reshape((b, c, h, w))\n                    y.append(y_t)\n                return y\n            else:\n                return yolo_outputs\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_channels': [i.channels for i in input_shape], }\n\n\n@register\nclass YOLOXHead(nn.Layer):\n    __shared__ = ['num_classes', 'width_mult', 'act', 'trt', 'exclude_nms']\n    __inject__ = ['assigner', 'nms']\n\n    def __init__(self,\n                 num_classes=80,\n                 width_mult=1.0,\n                 depthwise=False,\n                 in_channels=[256, 512, 1024],\n                 feat_channels=256,\n                 fpn_strides=(8, 16, 32),\n                 l1_epoch=285,\n                 act='silu',\n                 assigner=SimOTAAssigner(use_vfl=False),\n                 nms='MultiClassNMS',\n                 loss_weight={\n                     'cls': 1.0,\n                     'obj': 1.0,\n                     'iou': 5.0,\n                     'l1': 1.0,\n                 },\n                 trt=False,\n                 exclude_nms=False):\n        super(YOLOXHead, self).__init__()\n        self._dtype = paddle.framework.get_default_dtype()\n        self.num_classes = num_classes\n        assert len(in_channels) > 0, \"in_channels length should > 0\"\n        self.in_channels = in_channels\n        feat_channels = int(feat_channels * width_mult)\n        self.fpn_strides = fpn_strides\n        self.l1_epoch = l1_epoch\n        self.assigner = assigner\n        self.nms = nms\n        if isinstance(self.nms, MultiClassNMS) and 
trt:\n            self.nms.trt = trt\n        self.exclude_nms = exclude_nms\n        self.loss_weight = loss_weight\n        self.iou_loss = IouLoss(loss_weight=1.0)  # default loss_weight 2.5\n\n        ConvBlock = DWConv if depthwise else BaseConv\n\n        self.stem_conv = nn.LayerList()\n        self.conv_cls = nn.LayerList()\n        self.conv_reg = nn.LayerList()  # reg [x,y,w,h] + obj\n        for in_c in self.in_channels:\n            self.stem_conv.append(BaseConv(in_c, feat_channels, 1, 1, act=act))\n\n            self.conv_cls.append(\n                nn.Sequential(* [\n                    ConvBlock(\n                        feat_channels, feat_channels, 3, 1, act=act), ConvBlock(\n                            feat_channels, feat_channels, 3, 1, act=act),\n                    nn.Conv2D(\n                        feat_channels,\n                        self.num_classes,\n                        1,\n                        bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n                ]))\n\n            self.conv_reg.append(\n                nn.Sequential(* [\n                    ConvBlock(\n                        feat_channels, feat_channels, 3, 1, act=act),\n                    ConvBlock(\n                        feat_channels, feat_channels, 3, 1, act=act),\n                    nn.Conv2D(\n                        feat_channels,\n                        4 + 1,  # reg [x,y,w,h] + obj\n                        1,\n                        bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n                ]))\n\n        self._init_weights()\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_channels': [i.channels for i in input_shape], }\n\n    def _init_weights(self):\n        bias_cls = bias_init_with_prob(0.01)\n        bias_reg = paddle.full([5], math.log(5.), dtype=self._dtype)\n        bias_reg[:2] = 0.\n        bias_reg[-1] = bias_cls\n        for cls_, reg_ in zip(self.conv_cls, self.conv_reg):\n            constant_(cls_[-1].weight)\n            constant_(cls_[-1].bias, bias_cls)\n            constant_(reg_[-1].weight)\n            reg_[-1].bias.set_value(bias_reg)\n\n    def _generate_anchor_point(self, feat_sizes, strides, offset=0.):\n        anchor_points, stride_tensor = [], []\n        num_anchors_list = []\n        for feat_size, stride in zip(feat_sizes, strides):\n            h, w = feat_size\n            x = (paddle.arange(w) + offset) * stride\n            y = (paddle.arange(h) + offset) * stride\n            y, x = paddle.meshgrid(y, x)\n            anchor_points.append(paddle.stack([x, y], axis=-1).reshape([-1, 2]))\n            stride_tensor.append(\n                paddle.full(\n                    [len(anchor_points[-1]), 1], stride, dtype=self._dtype))\n            num_anchors_list.append(len(anchor_points[-1]))\n        anchor_points = paddle.concat(anchor_points).astype(self._dtype)\n        anchor_points.stop_gradient = True\n        stride_tensor = paddle.concat(stride_tensor)\n        stride_tensor.stop_gradient = True\n        return anchor_points, stride_tensor, num_anchors_list\n\n    def forward(self, feats, targets=None):\n        assert len(feats) == len(self.fpn_strides), \\\n            \"The size of feats is not equal to size of fpn_strides\"\n\n        feat_sizes = [[f.shape[-2], f.shape[-1]] for f in feats]\n        cls_score_list, reg_pred_list = [], []\n        obj_score_list = []\n        for i, feat in enumerate(feats):\n            feat = self.stem_conv[i](feat)\n            cls_logit = 
self.conv_cls[i](feat)\n            reg_pred = self.conv_reg[i](feat)\n            # cls prediction\n            cls_score = F.sigmoid(cls_logit)\n            cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1]))\n            # reg prediction\n            reg_xywh, obj_logit = paddle.split(reg_pred, [4, 1], axis=1)\n            reg_xywh = reg_xywh.flatten(2).transpose([0, 2, 1])\n            reg_pred_list.append(reg_xywh)\n            # obj prediction\n            obj_score = F.sigmoid(obj_logit)\n            obj_score_list.append(obj_score.flatten(2).transpose([0, 2, 1]))\n\n        cls_score_list = paddle.concat(cls_score_list, axis=1)\n        reg_pred_list = paddle.concat(reg_pred_list, axis=1)\n        obj_score_list = paddle.concat(obj_score_list, axis=1)\n\n        # bbox decode\n        anchor_points, stride_tensor, _ =\\\n            self._generate_anchor_point(feat_sizes, self.fpn_strides)\n        reg_xy, reg_wh = paddle.split(reg_pred_list, 2, axis=-1)\n        reg_xy += (anchor_points / stride_tensor)\n        reg_wh = paddle.exp(reg_wh) * 0.5\n        bbox_pred_list = paddle.concat(\n            [reg_xy - reg_wh, reg_xy + reg_wh], axis=-1)\n\n        if self.training:\n            anchor_points, stride_tensor, num_anchors_list =\\\n                self._generate_anchor_point(feat_sizes, self.fpn_strides, 0.5)\n            yolox_losses = self.get_loss([\n                cls_score_list, bbox_pred_list, obj_score_list, anchor_points,\n                stride_tensor, num_anchors_list\n            ], targets)\n            return yolox_losses\n        else:\n            pred_scores = (cls_score_list * obj_score_list).sqrt()\n            return pred_scores, bbox_pred_list, stride_tensor\n\n    def get_loss(self, head_outs, targets):\n        pred_cls, pred_bboxes, pred_obj,\\\n        anchor_points, stride_tensor, num_anchors_list = head_outs\n        gt_labels = targets['gt_class']\n        gt_bboxes = targets['gt_bbox']\n        pred_scores = (pred_cls * pred_obj).sqrt()\n        # label assignment\n        center_and_strides = paddle.concat(\n            [anchor_points, stride_tensor, stride_tensor], axis=-1)\n        pos_num_list, label_list, bbox_target_list = [], [], []\n        for pred_score, pred_bbox, gt_box, gt_label in zip(\n                pred_scores.detach(),\n                pred_bboxes.detach() * stride_tensor, gt_bboxes, gt_labels):\n            pos_num, label, _, bbox_target = self.assigner(\n                pred_score, center_and_strides, pred_bbox, gt_box, gt_label)\n            pos_num_list.append(pos_num)\n            label_list.append(label)\n            bbox_target_list.append(bbox_target)\n        labels = paddle.to_tensor(np.stack(label_list, axis=0))\n        bbox_targets = paddle.to_tensor(np.stack(bbox_target_list, axis=0))\n        bbox_targets /= stride_tensor  # rescale bbox\n\n        # 1. obj score loss\n        mask_positive = (labels != self.num_classes)\n        loss_obj = F.binary_cross_entropy(\n            pred_obj,\n            mask_positive.astype(pred_obj.dtype).unsqueeze(-1),\n            reduction='sum')\n\n        num_pos = sum(pos_num_list)\n\n        if num_pos > 0:\n            num_pos = paddle.to_tensor(num_pos, dtype=self._dtype).clip(min=1)\n            loss_obj /= num_pos\n\n            # 2. 
iou loss\n            bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4])\n            pred_bboxes_pos = paddle.masked_select(pred_bboxes,\n                                                   bbox_mask).reshape([-1, 4])\n            assigned_bboxes_pos = paddle.masked_select(\n                bbox_targets, bbox_mask).reshape([-1, 4])\n            bbox_iou = bbox_overlaps(pred_bboxes_pos, assigned_bboxes_pos)\n            bbox_iou = paddle.diag(bbox_iou)\n\n            loss_iou = self.iou_loss(\n                pred_bboxes_pos.split(\n                    4, axis=-1),\n                assigned_bboxes_pos.split(\n                    4, axis=-1))\n            loss_iou = loss_iou.sum() / num_pos\n\n            # 3. cls loss\n            cls_mask = mask_positive.unsqueeze(-1).tile(\n                [1, 1, self.num_classes])\n            pred_cls_pos = paddle.masked_select(\n                pred_cls, cls_mask).reshape([-1, self.num_classes])\n            assigned_cls_pos = paddle.masked_select(labels, mask_positive)\n            assigned_cls_pos = F.one_hot(assigned_cls_pos,\n                                         self.num_classes + 1)[..., :-1]\n            assigned_cls_pos *= bbox_iou.unsqueeze(-1)\n            loss_cls = F.binary_cross_entropy(\n                pred_cls_pos, assigned_cls_pos, reduction='sum')\n            loss_cls /= num_pos\n\n            # 4. l1 loss\n            if targets['epoch_id'] >= self.l1_epoch:\n                loss_l1 = F.l1_loss(\n                    pred_bboxes_pos, assigned_bboxes_pos, reduction='sum')\n                loss_l1 /= num_pos\n            else:\n                loss_l1 = paddle.zeros([])\n                loss_l1.stop_gradient = False\n        else:\n            loss_cls = paddle.zeros([])\n            loss_iou = paddle.zeros([])\n            loss_l1 = paddle.zeros([])\n            loss_cls.stop_gradient = False\n            loss_iou.stop_gradient = False\n            loss_l1.stop_gradient = False\n\n        loss = self.loss_weight['obj'] * loss_obj + \\\n               self.loss_weight['cls'] * loss_cls + \\\n               self.loss_weight['iou'] * loss_iou\n\n        if targets['epoch_id'] >= self.l1_epoch:\n            loss += (self.loss_weight['l1'] * loss_l1)\n\n        yolox_losses = {\n            'loss': loss,\n            'loss_cls': loss_cls,\n            'loss_obj': loss_obj,\n            'loss_iou': loss_iou,\n            'loss_l1': loss_l1,\n        }\n        return yolox_losses\n\n    def post_process(self, head_outs, img_shape, scale_factor):\n        pred_scores, pred_bboxes, stride_tensor = head_outs\n        pred_scores = pred_scores.transpose([0, 2, 1])\n        pred_bboxes *= stride_tensor\n        # scale bbox to origin image\n        scale_factor = scale_factor.flip(-1).tile([1, 2]).unsqueeze(1)\n        pred_bboxes /= scale_factor\n        if self.exclude_nms:\n            # `exclude_nms=True` just use in benchmark\n            return pred_bboxes.sum(), pred_scores.sum()\n        else:\n            bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores)\n            return bbox_pred, bbox_num\n"
  },
  {
    "path": "ppdet/modeling/heads/yolof_head.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport math\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\nfrom paddle.nn.initializer import Normal, Constant\n\nfrom ppdet.modeling.layers import MultiClassNMS\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling.bbox_utils import delta2bbox_v2\n\n__all__ = ['YOLOFHead']\n\nINF = 1e8\n\n\ndef reduce_mean(tensor):\n    world_size = paddle.distributed.get_world_size()\n    if world_size == 1:\n        return tensor\n    paddle.distributed.all_reduce(tensor)\n    return tensor / world_size\n\n\ndef find_inside_anchor(feat_size, stride, num_anchors, im_shape):\n    feat_h, feat_w = feat_size[:2]\n    im_h, im_w = im_shape[:2]\n    inside_h = min(int(np.ceil(im_h / stride)), feat_h)\n    inside_w = min(int(np.ceil(im_w / stride)), feat_w)\n    inside_mask = paddle.zeros([feat_h, feat_w], dtype=paddle.bool)\n    inside_mask[:inside_h, :inside_w] = True\n    inside_mask = inside_mask.unsqueeze(-1).expand(\n        [feat_h, feat_w, num_anchors])\n    return inside_mask.reshape([-1])\n\n\n@register\nclass YOLOFFeat(nn.Layer):\n    def __init__(self,\n                 feat_in=256,\n                 feat_out=256,\n                 num_cls_convs=2,\n                 num_reg_convs=4,\n                 norm_type='bn'):\n        super(YOLOFFeat, self).__init__()\n        assert norm_type == 'bn', \"YOLOFFeat only support BN now.\"\n        self.feat_in = feat_in\n        self.feat_out = feat_out\n        self.num_cls_convs = num_cls_convs\n        self.num_reg_convs = num_reg_convs\n        self.norm_type = norm_type\n\n        cls_subnet, reg_subnet = [], []\n        for i in range(self.num_cls_convs):\n            feat_in = self.feat_in if i == 0 else self.feat_out\n            cls_subnet.append(\n                nn.Conv2D(\n                    feat_in,\n                    self.feat_out,\n                    3,\n                    stride=1,\n                    padding=1,\n                    weight_attr=ParamAttr(initializer=Normal(\n                        mean=0.0, std=0.01)),\n                    bias_attr=ParamAttr(initializer=Constant(value=0.0))))\n            cls_subnet.append(\n                nn.BatchNorm2D(\n                    self.feat_out,\n                    weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                    bias_attr=ParamAttr(regularizer=L2Decay(0.0))))\n            cls_subnet.append(nn.ReLU())\n\n        for i in range(self.num_reg_convs):\n            feat_in = self.feat_in if i == 0 else self.feat_out\n            reg_subnet.append(\n                nn.Conv2D(\n                    feat_in,\n                    self.feat_out,\n                    3,\n                    stride=1,\n                    padding=1,\n                    weight_attr=ParamAttr(initializer=Normal(\n                        
mean=0.0, std=0.01)),\n                    bias_attr=ParamAttr(initializer=Constant(value=0.0))))\n            reg_subnet.append(\n                nn.BatchNorm2D(\n                    self.feat_out,\n                    weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                    bias_attr=ParamAttr(regularizer=L2Decay(0.0))))\n            reg_subnet.append(nn.ReLU())\n\n        self.cls_subnet = nn.Sequential(*cls_subnet)\n        self.reg_subnet = nn.Sequential(*reg_subnet)\n\n    def forward(self, fpn_feat):\n        cls_feat = self.cls_subnet(fpn_feat)\n        reg_feat = self.reg_subnet(fpn_feat)\n        return cls_feat, reg_feat\n\n\n@register\nclass YOLOFHead(nn.Layer):\n    __shared__ = ['num_classes', 'trt', 'exclude_nms']\n    __inject__ = [\n        'conv_feat', 'anchor_generator', 'bbox_assigner', 'loss_class',\n        'loss_bbox', 'nms'\n    ]\n\n    def __init__(self,\n                 num_classes=80,\n                 conv_feat='YOLOFFeat',\n                 anchor_generator='AnchorGenerator',\n                 bbox_assigner='UniformAssigner',\n                 loss_class='FocalLoss',\n                 loss_bbox='GIoULoss',\n                 ctr_clip=32.0,\n                 delta_mean=[0.0, 0.0, 0.0, 0.0],\n                 delta_std=[1.0, 1.0, 1.0, 1.0],\n                 nms='MultiClassNMS',\n                 prior_prob=0.01,\n                 nms_pre=1000,\n                 use_inside_anchor=False,\n                 trt=False,\n                 exclude_nms=False):\n        super(YOLOFHead, self).__init__()\n        self.num_classes = num_classes\n        self.conv_feat = conv_feat\n        self.anchor_generator = anchor_generator\n        self.na = self.anchor_generator.num_anchors\n        self.bbox_assigner = bbox_assigner\n        self.loss_class = loss_class\n        self.loss_bbox = loss_bbox\n        self.ctr_clip = ctr_clip\n        self.delta_mean = delta_mean\n        self.delta_std = delta_std\n        self.nms = nms\n        self.nms_pre = nms_pre\n        self.use_inside_anchor = use_inside_anchor\n        if isinstance(self.nms, MultiClassNMS) and trt:\n            self.nms.trt = trt\n        self.exclude_nms = exclude_nms\n\n        bias_init_value = -math.log((1 - prior_prob) / prior_prob)\n        self.cls_score = self.add_sublayer(\n            'cls_score',\n            nn.Conv2D(\n                in_channels=conv_feat.feat_out,\n                out_channels=self.num_classes * self.na,\n                kernel_size=3,\n                stride=1,\n                padding=1,\n                weight_attr=ParamAttr(initializer=Normal(\n                    mean=0.0, std=0.01)),\n                bias_attr=ParamAttr(initializer=Constant(\n                    value=bias_init_value))))\n\n        self.bbox_pred = self.add_sublayer(\n            'bbox_pred',\n            nn.Conv2D(\n                in_channels=conv_feat.feat_out,\n                out_channels=4 * self.na,\n                kernel_size=3,\n                stride=1,\n                padding=1,\n                weight_attr=ParamAttr(initializer=Normal(\n                    mean=0.0, std=0.01)),\n                bias_attr=ParamAttr(initializer=Constant(value=0))))\n\n        self.object_pred = self.add_sublayer(\n            'object_pred',\n            nn.Conv2D(\n                in_channels=conv_feat.feat_out,\n                out_channels=self.na,\n                kernel_size=3,\n                stride=1,\n                padding=1,\n                weight_attr=ParamAttr(initializer=Normal(\n 
                   mean=0.0, std=0.01)),\n                bias_attr=ParamAttr(initializer=Constant(value=0))))\n\n    def forward(self, feats, targets=None):\n        assert len(feats) == 1, \"YOLOF only has one level feature.\"\n        conv_cls_feat, conv_reg_feat = self.conv_feat(feats[0])\n        cls_logits = self.cls_score(conv_cls_feat)\n        objectness = self.object_pred(conv_reg_feat)\n        bboxes_reg = self.bbox_pred(conv_reg_feat)\n\n        N, C, H, W = cls_logits.shape[:]\n        cls_logits = cls_logits.reshape((N, self.na, self.num_classes, H, W))\n        objectness = objectness.reshape((N, self.na, 1, H, W))\n        norm_cls_logits = cls_logits + objectness - paddle.log(\n            1.0 + paddle.clip(\n                cls_logits.exp(), max=INF) + paddle.clip(\n                    objectness.exp(), max=INF))\n        norm_cls_logits = norm_cls_logits.reshape((N, C, H, W))\n\n        anchors = self.anchor_generator([norm_cls_logits])\n\n        if self.training:\n            yolof_losses = self.get_loss(\n                [anchors[0], norm_cls_logits, bboxes_reg], targets)\n            return yolof_losses\n        else:\n            return anchors[0], norm_cls_logits, bboxes_reg\n\n    def get_loss(self, head_outs, targets):\n        anchors, cls_logits, bbox_preds = head_outs\n\n        feat_size = cls_logits.shape[-2:]\n        cls_logits = cls_logits.transpose([0, 2, 3, 1])\n        cls_logits = cls_logits.reshape([0, -1, self.num_classes])\n        bbox_preds = bbox_preds.transpose([0, 2, 3, 1])\n        bbox_preds = bbox_preds.reshape([0, -1, 4])\n\n        num_pos_list = []\n        cls_pred_list, cls_tar_list = [], []\n        reg_pred_list, reg_tar_list = [], []\n        # find and gather preds and targets in each image\n        for cls_logit, bbox_pred, gt_bbox, gt_class, im_shape in zip(\n                cls_logits, bbox_preds, targets['gt_bbox'], targets['gt_class'],\n                targets['im_shape']):\n            if self.use_inside_anchor:\n                inside_mask = find_inside_anchor(\n                    feat_size, self.anchor_generator.strides[0], self.na,\n                    im_shape.tolist())\n                cls_logit = cls_logit[inside_mask]\n                bbox_pred = bbox_pred[inside_mask]\n                anchors = anchors[inside_mask]\n\n            bbox_pred = delta2bbox_v2(\n                bbox_pred,\n                anchors,\n                self.delta_mean,\n                self.delta_std,\n                ctr_clip=self.ctr_clip)\n            bbox_pred = bbox_pred.reshape([-1, bbox_pred.shape[-1]])\n\n            # -2:ignore, -1:neg, >=0:pos\n            match_labels, pos_bbox_pred, pos_bbox_tar = self.bbox_assigner(\n                bbox_pred, anchors, gt_bbox)\n            pos_mask = (match_labels >= 0)\n            neg_mask = (match_labels == -1)\n            chosen_mask = paddle.logical_or(pos_mask, neg_mask)\n\n            gt_class = gt_class.reshape([-1])\n            bg_class = paddle.to_tensor(\n                [self.num_classes], dtype=gt_class.dtype)\n            # a trick to assign num_classes to negative targets\n            gt_class = paddle.concat([gt_class, bg_class], axis=-1)\n            match_labels = paddle.where(\n                neg_mask,\n                paddle.full_like(match_labels, gt_class.size - 1), match_labels)\n            num_pos_list.append(max(1.0, pos_mask.sum().item()))\n\n            cls_pred_list.append(cls_logit[chosen_mask])\n            
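# Negatives were remapped above to index gt_class.size - 1, i.e. the\n            # appended background label (num_classes), so this single gather\n            # yields foreground labels for positives and background for\n            # negatives.\n            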
cls_tar_list.append(gt_class[match_labels[chosen_mask]])\n            reg_pred_list.append(pos_bbox_pred)\n            reg_tar_list.append(pos_bbox_tar)\n\n        num_tot_pos = paddle.to_tensor(sum(num_pos_list))\n        num_tot_pos = reduce_mean(num_tot_pos).item()\n        num_tot_pos = max(1.0, num_tot_pos)\n\n        cls_pred = paddle.concat(cls_pred_list)\n        cls_tar = paddle.concat(cls_tar_list)\n        cls_loss = self.loss_class(\n            cls_pred, cls_tar, reduction='sum') / num_tot_pos\n\n        reg_pred_list = [_ for _ in reg_pred_list if _ is not None]\n        reg_tar_list = [_ for _ in reg_tar_list if _ is not None]\n        if len(reg_pred_list) == 0:\n            reg_loss = bbox_preds.sum() * 0.0\n        else:\n            reg_pred = paddle.concat(reg_pred_list)\n            reg_tar = paddle.concat(reg_tar_list)\n            reg_loss = self.loss_bbox(reg_pred, reg_tar).sum() / num_tot_pos\n\n        yolof_losses = {\n            'loss': cls_loss + reg_loss,\n            'loss_cls': cls_loss,\n            'loss_reg': reg_loss,\n        }\n        return yolof_losses\n\n    def get_bboxes_single(self,\n                          anchors,\n                          cls_scores,\n                          bbox_preds,\n                          im_shape,\n                          scale_factor,\n                          rescale=True):\n        assert len(cls_scores) == len(bbox_preds)\n        mlvl_bboxes = []\n        mlvl_scores = []\n        for anchor, cls_score, bbox_pred in zip(anchors, cls_scores,\n                                                bbox_preds):\n            cls_score = cls_score.reshape([-1, self.num_classes])\n            bbox_pred = bbox_pred.reshape([-1, 4])\n            if self.nms_pre is not None and cls_score.shape[0] > self.nms_pre:\n                max_score = cls_score.max(axis=1)\n                _, topk_inds = max_score.topk(self.nms_pre)\n                bbox_pred = bbox_pred.gather(topk_inds)\n                anchor = anchor.gather(topk_inds)\n                cls_score = cls_score.gather(topk_inds)\n\n            bbox_pred = delta2bbox_v2(\n                bbox_pred,\n                anchor,\n                self.delta_mean,\n                self.delta_std,\n                max_shape=im_shape,\n                ctr_clip=self.ctr_clip).squeeze()\n            mlvl_bboxes.append(bbox_pred)\n            mlvl_scores.append(F.sigmoid(cls_score))\n        mlvl_bboxes = paddle.concat(mlvl_bboxes)\n        mlvl_bboxes = paddle.squeeze(mlvl_bboxes)\n        if rescale:\n            mlvl_bboxes = mlvl_bboxes / paddle.concat(\n                [scale_factor[::-1], scale_factor[::-1]])\n        mlvl_scores = paddle.concat(mlvl_scores)\n        mlvl_scores = mlvl_scores.transpose([1, 0])\n        return mlvl_bboxes, mlvl_scores\n\n    def decode(self, anchors, cls_scores, bbox_preds, im_shape, scale_factor):\n        batch_bboxes = []\n        batch_scores = []\n        for img_id in range(cls_scores[0].shape[0]):\n            num_lvls = len(cls_scores)\n            cls_score_list = [cls_scores[i][img_id] for i in range(num_lvls)]\n            bbox_pred_list = [bbox_preds[i][img_id] for i in range(num_lvls)]\n            bboxes, scores = self.get_bboxes_single(\n                anchors, cls_score_list, bbox_pred_list, im_shape[img_id],\n                scale_factor[img_id])\n            batch_bboxes.append(bboxes)\n            batch_scores.append(scores)\n        batch_bboxes = paddle.stack(batch_bboxes, 0)\n        batch_scores = 
paddle.stack(batch_scores, 0)\n        return batch_bboxes, batch_scores\n\n    def post_process(self, head_outs, im_shape, scale_factor):\n        anchors, cls_scores, bbox_preds = head_outs\n        cls_scores = cls_scores.transpose([0, 2, 3, 1])\n        bbox_preds = bbox_preds.transpose([0, 2, 3, 1])\n        pred_bboxes, pred_scores = self.decode(\n            [anchors], [cls_scores], [bbox_preds], im_shape, scale_factor)\n\n        if self.exclude_nms:\n            # `exclude_nms=True` just use in benchmark\n            return pred_bboxes.sum(), pred_scores.sum()\n        else:\n            bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores)\n            return bbox_pred, bbox_num\n"
  },
  {
    "path": "ppdet/modeling/initializer.py",
    "content": "#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nThis code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py\nThs copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file.\n\"\"\"\n\nimport math\nimport numpy as np\n\nimport paddle\nimport paddle.nn as nn\n\n__all__ = [\n    'uniform_',\n    'normal_',\n    'constant_',\n    'ones_',\n    'zeros_',\n    'xavier_uniform_',\n    'xavier_normal_',\n    'kaiming_uniform_',\n    'kaiming_normal_',\n    'linear_init_',\n    'conv_init_',\n    'reset_initialized_parameter',\n]\n\n\ndef _no_grad_uniform_(tensor, a, b):\n    with paddle.no_grad():\n        tensor.set_value(\n            paddle.uniform(\n                shape=tensor.shape, dtype=tensor.dtype, min=a, max=b))\n    return tensor\n\n\ndef _no_grad_normal_(tensor, mean=0., std=1.):\n    with paddle.no_grad():\n        tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape))\n    return tensor\n\n\ndef _no_grad_fill_(tensor, value=0.):\n    with paddle.no_grad():\n        tensor.set_value(paddle.full_like(tensor, value, dtype=tensor.dtype))\n    return tensor\n\n\ndef uniform_(tensor, a, b):\n    \"\"\"\n    Modified tensor inspace using uniform_\n    Args:\n        tensor (paddle.Tensor): paddle Tensor\n        a (float|int): min value.\n        b (float|int): max value.\n    Return:\n        tensor\n    \"\"\"\n    return _no_grad_uniform_(tensor, a, b)\n\n\ndef normal_(tensor, mean=0., std=1.):\n    \"\"\"\n    Modified tensor inspace using normal_\n    Args:\n        tensor (paddle.Tensor): paddle Tensor\n        mean (float|int): mean value.\n        std (float|int): std value.\n    Return:\n        tensor\n    \"\"\"\n    return _no_grad_normal_(tensor, mean, std)\n\n\ndef constant_(tensor, value=0.):\n    \"\"\"\n    Modified tensor inspace using constant_\n    Args:\n        tensor (paddle.Tensor): paddle Tensor\n        value (float|int): value to fill tensor.\n    Return:\n        tensor\n    \"\"\"\n    return _no_grad_fill_(tensor, value)\n\n\ndef ones_(tensor):\n    \"\"\"\n    Modified tensor inspace using ones_\n    Args:\n        tensor (paddle.Tensor): paddle Tensor\n    Return:\n        tensor\n    \"\"\"\n    return _no_grad_fill_(tensor, 1)\n\n\ndef zeros_(tensor):\n    \"\"\"\n    Modified tensor inspace using zeros_\n    Args:\n        tensor (paddle.Tensor): paddle Tensor\n    Return:\n        tensor\n    \"\"\"\n    return _no_grad_fill_(tensor, 0)\n\n\ndef vector_(tensor, vector):\n    with paddle.no_grad():\n        tensor.set_value(paddle.to_tensor(vector, dtype=tensor.dtype))\n    return tensor\n\n\ndef _calculate_fan_in_and_fan_out(tensor, reverse=False):\n    \"\"\"\n    Calculate (fan_in, _fan_out) for tensor\n\n    Args:\n        tensor (Tensor): paddle.Tensor\n        reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. e.g. 
\n\ndef xavier_uniform_(tensor, gain=1., reverse=False):\n    \"\"\"\n    Modify tensor in place using xavier_uniform_\n    Args:\n        tensor (paddle.Tensor): paddle Tensor\n        gain (float): scaling factor, 1. by default.\n        reverse (bool): tensor data format order, False by default as [fout, fin, ...].\n    Return:\n        tensor\n    \"\"\"\n    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)\n    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))\n    k = math.sqrt(3.0) * std\n    return _no_grad_uniform_(tensor, -k, k)\n\n\ndef xavier_normal_(tensor, gain=1., reverse=False):\n    \"\"\"\n    Modify tensor in place using xavier_normal_\n    Args:\n        tensor (paddle.Tensor): paddle Tensor\n        gain (float): scaling factor, 1. by default.\n        reverse (bool): tensor data format order, False by default as [fout, fin, ...].\n    Return:\n        tensor\n    \"\"\"\n    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)\n    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))\n    return _no_grad_normal_(tensor, 0, std)\n\n\n# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html\ndef _calculate_correct_fan(tensor, mode, reverse=False):\n    mode = mode.lower()\n    valid_modes = ['fan_in', 'fan_out']\n    if mode not in valid_modes:\n        raise ValueError(\"Mode {} not supported, please use one of {}\".format(\n            mode, valid_modes))\n\n    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse)\n\n    return fan_in if mode == 'fan_in' else fan_out\n\n\ndef _calculate_gain(nonlinearity, param=None):\n    linear_fns = [\n        'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d',\n        'conv_transpose2d', 'conv_transpose3d'\n    ]\n    if nonlinearity in linear_fns or nonlinearity == 'sigmoid':\n        return 1\n    elif nonlinearity == 'tanh':\n        return 5.0 / 3\n    elif nonlinearity == 'relu':\n        return math.sqrt(2.0)\n    elif nonlinearity == 'leaky_relu':\n        if param is None:\n            negative_slope = 0.01\n        elif not isinstance(param, bool) and isinstance(\n                param, int) or isinstance(param, float):\n            # True/False are instances of int, hence check above\n            negative_slope = param\n        else:\n            raise ValueError(\"negative_slope {} not a valid number\".format(\n                param))\n        return math.sqrt(2.0 / (1 + negative_slope**2))\n    elif nonlinearity == 'selu':\n        return 3.0 / 4\n    else:\n        raise ValueError(\"Unsupported nonlinearity {}\".format(nonlinearity))\n
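\n\n# Reference values: _calculate_gain('relu') = sqrt(2) ~ 1.414 and\n# _calculate_gain('leaky_relu', 0.01) = sqrt(2 / (1 + 0.01**2)) ~ 1.414,\n# matching torch.nn.init.calculate_gain.\n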
\n\ndef kaiming_uniform_(tensor,\n                     a=0,\n                     mode='fan_in',\n                     nonlinearity='leaky_relu',\n                     reverse=False):\n    \"\"\"\n    Modify tensor in place using the kaiming_uniform method\n    Args:\n        tensor (paddle.Tensor): paddle Tensor\n        mode (str): ['fan_in', 'fan_out'], 'fan_in' by default\n        nonlinearity (str): nonlinearity method name\n        reverse (bool): tensor data format order, False by default as [fout, fin, ...].\n    Return:\n        tensor\n    \"\"\"\n    fan = _calculate_correct_fan(tensor, mode, reverse)\n    gain = _calculate_gain(nonlinearity, a)\n    std = gain / math.sqrt(fan)\n    k = math.sqrt(3.0) * std\n    return _no_grad_uniform_(tensor, -k, k)\n\n\ndef kaiming_normal_(tensor,\n                    a=0,\n                    mode='fan_in',\n                    nonlinearity='leaky_relu',\n                    reverse=False):\n    \"\"\"\n    Modify tensor in place using kaiming_normal_\n    Args:\n        tensor (paddle.Tensor): paddle Tensor\n        mode (str): ['fan_in', 'fan_out'], 'fan_in' by default\n        nonlinearity (str): nonlinearity method name\n        reverse (bool): tensor data format order, False by default as [fout, fin, ...].\n    Return:\n        tensor\n    \"\"\"\n    fan = _calculate_correct_fan(tensor, mode, reverse)\n    gain = _calculate_gain(nonlinearity, a)\n    std = gain / math.sqrt(fan)\n    return _no_grad_normal_(tensor, 0, std)\n\n\ndef linear_init_(module):\n    bound = 1 / math.sqrt(module.weight.shape[0])\n    uniform_(module.weight, -bound, bound)\n    if hasattr(module, \"bias\") and module.bias is not None:\n        uniform_(module.bias, -bound, bound)\n\n\ndef conv_init_(module):\n    bound = 1 / np.sqrt(np.prod(module.weight.shape[1:]))\n    uniform_(module.weight, -bound, bound)\n    if module.bias is not None:\n        uniform_(module.bias, -bound, bound)\n\n\ndef bias_init_with_prob(prior_prob=0.01):\n    \"\"\"initialize conv/fc bias value according to a given probability value.\"\"\"\n    bias_init = float(-np.log((1 - prior_prob) / prior_prob))\n    return bias_init\n\n\n@paddle.no_grad()\ndef reset_initialized_parameter(model, include_self=True):\n    \"\"\"\n    Reset initialized parameter using following method for [conv, linear, embedding, bn]\n\n    Args:\n        model (paddle.Layer): paddle Layer\n        include_self (bool): passed to Layer.named_sublayers; whether to\n            include the model itself. True by default\n    Return:\n        None\n    \"\"\"\n    for _, m in model.named_sublayers(include_self=include_self):\n        if isinstance(m, nn.Conv2D):\n            k = float(m._groups) / (m._in_channels * m._kernel_size[0] *\n                                    m._kernel_size[1])\n            k = math.sqrt(k)\n            _no_grad_uniform_(m.weight, -k, k)\n            if hasattr(m, 'bias') and getattr(m, 'bias') is not None:\n                _no_grad_uniform_(m.bias, -k, k)\n\n        elif isinstance(m, nn.Linear):\n            k = math.sqrt(1. / m.weight.shape[0])\n            _no_grad_uniform_(m.weight, -k, k)\n            if hasattr(m, 'bias') and getattr(m, 'bias') is not None:\n                _no_grad_uniform_(m.bias, -k, k)\n\n        elif isinstance(m, nn.Embedding):\n            _no_grad_normal_(m.weight, mean=0., std=1.)\n\n        elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)):\n            _no_grad_fill_(m.weight, 1.)\n            if hasattr(m, 'bias') and getattr(m, 'bias') is not None:\n                _no_grad_fill_(m.bias, 0)\n"
  },
  {
    "path": "ppdet/modeling/keypoint_utils.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nthis code is based on https://github.com/open-mmlab/mmpose\n\"\"\"\n\nimport cv2\nimport numpy as np\nimport paddle.nn.functional as F\n\n\ndef get_affine_mat_kernel(h, w, s, inv=False):\n    if w < h:\n        w_ = s\n        h_ = int(np.ceil((s / w * h) / 64.) * 64)\n        scale_w = w\n        scale_h = h_ / w_ * w\n\n    else:\n        h_ = s\n        w_ = int(np.ceil((s / h * w) / 64.) * 64)\n        scale_h = h\n        scale_w = w_ / h_ * h\n\n    center = np.array([np.round(w / 2.), np.round(h / 2.)])\n\n    size_resized = (w_, h_)\n    trans = get_affine_transform(\n        center, np.array([scale_w, scale_h]), 0, size_resized, inv=inv)\n\n    return trans, size_resized\n\n\ndef get_affine_transform(center,\n                         input_size,\n                         rot,\n                         output_size,\n                         shift=(0., 0.),\n                         inv=False):\n    \"\"\"Get the affine transform matrix, given the center/scale/rot/output_size.\n\n    Args:\n        center (np.ndarray[2, ]): Center of the bounding box (x, y).\n        input_size (np.ndarray[2, ]): Size of input feature (width, height).\n        rot (float): Rotation angle (degree).\n        output_size (np.ndarray[2, ]): Size of the destination heatmaps.\n        shift (0-100%): Shift translation ratio wrt the width/height.\n            Default (0., 0.).\n        inv (bool): Option to inverse the affine transform direction.\n            (inv=False: src->dst or inv=True: dst->src)\n\n    Returns:\n        np.ndarray: The transform matrix.\n    \"\"\"\n    assert len(center) == 2\n    assert len(output_size) == 2\n    assert len(shift) == 2\n\n    if not isinstance(input_size, (np.ndarray, list)):\n        input_size = np.array([input_size, input_size], dtype=np.float32)\n    scale_tmp = input_size\n\n    shift = np.array(shift)\n    src_w = scale_tmp[0]\n    dst_w = output_size[0]\n    dst_h = output_size[1]\n\n    rot_rad = np.pi * rot / 180\n    src_dir = rotate_point([0., src_w * -0.5], rot_rad)\n    dst_dir = np.array([0., dst_w * -0.5])\n\n    src = np.zeros((3, 2), dtype=np.float32)\n\n    src[0, :] = center + scale_tmp * shift\n    src[1, :] = center + src_dir + scale_tmp * shift\n    src[2, :] = _get_3rd_point(src[0, :], src[1, :])\n\n    dst = np.zeros((3, 2), dtype=np.float32)\n    dst[0, :] = [dst_w * 0.5, dst_h * 0.5]\n    dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir\n    dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])\n\n    if inv:\n        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))\n    else:\n        trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))\n\n    return trans\n\n\ndef get_warp_matrix(theta, size_input, size_dst, size_target):\n    \"\"\"This code is based on\n        https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py\n\n      
  Calculate the transformation matrix under the constraint of unbiased.\n    Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased\n    Data Processing for Human Pose Estimation (CVPR 2020).\n\n    Args:\n        theta (float): Rotation angle in degrees.\n        size_input (np.ndarray): Size of input image [w, h].\n        size_dst (np.ndarray): Size of output image [w, h].\n        size_target (np.ndarray): Size of ROI in input plane [w, h].\n\n    Returns:\n        matrix (np.ndarray): A matrix for transformation.\n    \"\"\"\n    theta = np.deg2rad(theta)\n    matrix = np.zeros((2, 3), dtype=np.float32)\n    scale_x = size_dst[0] / size_target[0]\n    scale_y = size_dst[1] / size_target[1]\n    matrix[0, 0] = np.cos(theta) * scale_x\n    matrix[0, 1] = -np.sin(theta) * scale_x\n    matrix[0, 2] = scale_x * (\n        -0.5 * size_input[0] * np.cos(theta) + 0.5 * size_input[1] *\n        np.sin(theta) + 0.5 * size_target[0])\n    matrix[1, 0] = np.sin(theta) * scale_y\n    matrix[1, 1] = np.cos(theta) * scale_y\n    matrix[1, 2] = scale_y * (\n        -0.5 * size_input[0] * np.sin(theta) - 0.5 * size_input[1] *\n        np.cos(theta) + 0.5 * size_target[1])\n    return matrix\n\n\ndef _get_3rd_point(a, b):\n    \"\"\"To calculate the affine matrix, three pairs of points are required. This\n    function is used to get the 3rd point, given 2D points a & b.\n\n    The 3rd point is defined by rotating vector `a - b` by 90 degrees\n    anticlockwise, using b as the rotation center.\n\n    Args:\n        a (np.ndarray): point(x,y)\n        b (np.ndarray): point(x,y)\n\n    Returns:\n        np.ndarray: The 3rd point.\n    \"\"\"\n    assert len(\n        a) == 2, 'input of _get_3rd_point should be point with length of 2'\n    assert len(\n        b) == 2, 'input of _get_3rd_point should be point with length of 2'\n    direction = a - b\n    third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32)\n\n    return third_pt\n\n\ndef rotate_point(pt, angle_rad):\n    \"\"\"Rotate a point by an angle.\n\n    Args:\n        pt (list[float]): 2 dimensional point to be rotated\n        angle_rad (float): rotation angle by radian\n\n    Returns:\n        list[float]: Rotated point.\n    \"\"\"\n    assert len(pt) == 2\n    sn, cs = np.sin(angle_rad), np.cos(angle_rad)\n    new_x = pt[0] * cs - pt[1] * sn\n    new_y = pt[0] * sn + pt[1] * cs\n    rotated_pt = [new_x, new_y]\n\n    return rotated_pt\n\n\ndef transpred(kpts, h, w, s):\n    trans, _ = get_affine_mat_kernel(h, w, s, inv=True)\n\n    return warp_affine_joints(kpts[..., :2].copy(), trans)\n\n\ndef warp_affine_joints(joints, mat):\n    \"\"\"Apply affine transformation defined by the transform matrix on the\n    joints.\n\n    Args:\n        joints (np.ndarray[..., 2]): Origin coordinate of joints.\n        mat (np.ndarray[3, 2]): The affine matrix.\n\n    Returns:\n        matrix (np.ndarray[..., 2]): Result coordinate of joints.\n    \"\"\"\n    joints = np.array(joints)\n    shape = joints.shape\n    joints = joints.reshape(-1, 2)\n    return np.dot(np.concatenate(\n        (joints, joints[:, 0:1] * 0 + 1), axis=1),\n                  mat.T).reshape(shape)\n\n\ndef affine_transform(pt, t):\n    new_pt = np.array([pt[0], pt[1], 1.]).T\n    new_pt = np.dot(t, new_pt)\n    return new_pt[:2]\n\n\ndef transform_preds(coords, center, scale, output_size):\n    target_coords = np.zeros(coords.shape)\n    trans = get_affine_transform(center, scale * 200, 0, output_size, inv=1)\n    for p in 
range(coords.shape[0]):\n        target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans)\n    return target_coords\n\n\ndef oks_iou(g, d, a_g, a_d, sigmas=None, in_vis_thre=None):\n    if not isinstance(sigmas, np.ndarray):\n        sigmas = np.array([\n            .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07,\n            .87, .87, .89, .89\n        ]) / 10.0\n    vars = (sigmas * 2)**2\n    xg = g[0::3]\n    yg = g[1::3]\n    vg = g[2::3]\n    ious = np.zeros((d.shape[0]))\n    for n_d in range(0, d.shape[0]):\n        xd = d[n_d, 0::3]\n        yd = d[n_d, 1::3]\n        vd = d[n_d, 2::3]\n        dx = xd - xg\n        dy = yd - yg\n        e = (dx**2 + dy**2) / vars / ((a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2\n        if in_vis_thre is not None:\n            # keep only keypoints visible in both poses (the original\n            # `list(...) and list(...)` silently dropped the first condition)\n            ind = np.logical_and(vg > in_vis_thre, vd > in_vis_thre)\n            e = e[ind]\n        ious[n_d] = np.sum(np.exp(-e)) / e.shape[0] if e.shape[0] != 0 else 0.0\n    return ious\n\n\ndef oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None):\n    \"\"\"greedily select boxes with high confidence and overlap with current maximum <= thresh\n    rule out overlap >= thresh\n\n    Args:\n        kpts_db (list): The predicted keypoints within the image\n        thresh (float): The threshold to select the boxes\n        sigmas (np.array): The variance to calculate the oks iou\n            Default: None\n        in_vis_thre (float): The threshold to select the high confidence boxes\n            Default: None\n\n    Return:\n        keep (list): indexes to keep\n    \"\"\"\n\n    if len(kpts_db) == 0:\n        return []\n\n    scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))])\n    kpts = np.array(\n        [kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))])\n    areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))])\n\n    order = scores.argsort()[::-1]\n\n    keep = []\n    while order.size > 0:\n        i = order[0]\n        keep.append(i)\n\n        oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]],\n                          sigmas, in_vis_thre)\n\n        inds = np.where(oks_ovr <= thresh)[0]\n        order = order[inds + 1]\n\n    return keep\n\n\ndef rescore(overlap, scores, thresh, type='gaussian'):\n    assert overlap.shape[0] == scores.shape[0]\n    if type == 'linear':\n        inds = np.where(overlap >= thresh)[0]\n        scores[inds] = scores[inds] * (1 - overlap[inds])\n    else:\n        scores = scores * np.exp(-overlap**2 / thresh)\n\n    return scores\n\n\ndef soft_oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None):\n    \"\"\"greedily select boxes with high confidence and overlap with current maximum <= thresh\n    rule out overlap >= thresh\n\n    Args:\n        kpts_db (list): The predicted keypoints within the image\n        thresh (float): The threshold to select the boxes\n        sigmas (np.array): The variance to calculate the oks iou\n            Default: None\n        in_vis_thre (float): The threshold to select the high confidence boxes\n            Default: None\n\n    Return:\n        keep (list): indexes to keep\n    \"\"\"\n\n    if len(kpts_db) == 0:\n        return []\n\n    scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))])\n    kpts = np.array(\n        [kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))])\n    areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))])\n\n    order = scores.argsort()[::-1]\n    scores = scores[order]\n\n    
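# Soft OKS-NMS: instead of discarding overlapping poses outright, each\n    # remaining candidate is rescored (Gaussian decay by default in\n    # rescore()) and re-sorted, keeping at most max_dets detections.\n    # max_dets = 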
order.size\n    max_dets = 20\n    keep = np.zeros(max_dets, dtype=np.intp)\n    keep_cnt = 0\n    while order.size > 0 and keep_cnt < max_dets:\n        i = order[0]\n\n        oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]],\n                          sigmas, in_vis_thre)\n\n        order = order[1:]\n        scores = rescore(oks_ovr, scores[1:], thresh)\n\n        tmp = scores.argsort()[::-1]\n        order = order[tmp]\n        scores = scores[tmp]\n\n        keep[keep_cnt] = i\n        keep_cnt += 1\n\n    keep = keep[:keep_cnt]\n\n    return keep\n\n\ndef resize(input,\n           size=None,\n           scale_factor=None,\n           mode='nearest',\n           align_corners=None,\n           warning=True):\n    if warning:\n        if size is not None and align_corners:\n            input_h, input_w = tuple(int(x) for x in input.shape[2:])\n            output_h, output_w = tuple(int(x) for x in size)\n            if output_h > input_h or output_w > input_w:\n                if ((output_h > 1 and output_w > 1 and input_h > 1 and\n                     input_w > 1) and (output_h - 1) % (input_h - 1) and\n                    (output_w - 1) % (input_w - 1)):\n                    warnings.warn(\n                        f'When align_corners={align_corners}, '\n                        'the output would be more aligned if '\n                        f'input size {(input_h, input_w)} is `x+1` and '\n                        f'out size {(output_h, output_w)} is `nx+1`')\n\n    return F.interpolate(input, size, scale_factor, mode, align_corners)\n\n\ndef flip_back(output_flipped, flip_pairs, target_type='GaussianHeatmap'):\n    \"\"\"Flip the flipped heatmaps back to the original form.\n    Note:\n        - batch_size: N\n        - num_keypoints: K\n        - heatmap height: H\n        - heatmap width: W\n    Args:\n        output_flipped (np.ndarray[N, K, H, W]): The output heatmaps obtained\n            from the flipped images.\n        flip_pairs (list[tuple]): Pairs of keypoints which are mirrored\n            (for example, left ear -- right ear).\n        target_type (str): GaussianHeatmap or CombinedTarget\n    Returns:\n        np.ndarray: heatmaps that flipped back to the original image\n    \"\"\"\n    assert len(output_flipped.shape) == 4, \\\n        'output_flipped should be [batch_size, num_keypoints, height, width]'\n    shape_ori = output_flipped.shape\n    channels = 1\n    if target_type.lower() == 'CombinedTarget'.lower():\n        channels = 3\n        output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...]\n    output_flipped = output_flipped.reshape((shape_ori[0], -1, channels,\n                                             shape_ori[2], shape_ori[3]))\n    output_flipped_back = output_flipped.copy()\n\n    # Swap left-right parts\n    for left, right in flip_pairs:\n        output_flipped_back[:, left, ...] = output_flipped[:, right, ...]\n        output_flipped_back[:, right, ...] 
= output_flipped[:, left, ...]\n    output_flipped_back = output_flipped_back.reshape(shape_ori)\n    # Flip horizontally\n    output_flipped_back = output_flipped_back[..., ::-1]\n    return output_flipped_back\n\n\ndef _calc_distances(preds, targets, mask, normalize):\n    \"\"\"Calculate the normalized distances between preds and target.\n\n    Note:\n        batch_size: N\n        num_keypoints: K\n        dimension of keypoints: D (normally, D=2 or D=3)\n\n    Args:\n        preds (np.ndarray[N, K, D]): Predicted keypoint location.\n        targets (np.ndarray[N, K, D]): Groundtruth keypoint location.\n        mask (np.ndarray[N, K]): Visibility of the target. False for invisible\n            joints, and True for visible. Invisible joints will be ignored for\n            accuracy calculation.\n        normalize (np.ndarray[N, D]): Typical value is heatmap_size\n\n    Returns:\n        np.ndarray[K, N]: The normalized distances. \\\n            If target keypoints are missing, the distance is -1.\n    \"\"\"\n    N, K, _ = preds.shape\n    # set mask=0 when normalize==0\n    _mask = mask.copy()\n    _mask[np.where((normalize == 0).sum(1))[0], :] = False\n    distances = np.full((N, K), -1, dtype=np.float32)\n    # handle invalid values\n    normalize[np.where(normalize <= 0)] = 1e6\n    distances[_mask] = np.linalg.norm(\n        ((preds - targets) / normalize[:, None, :])[_mask], axis=-1)\n    return distances.T\n\n\ndef _distance_acc(distances, thr=0.5):\n    \"\"\"Return the percentage below the distance threshold, while ignoring\n    distance values of -1.\n\n    Note:\n        batch_size: N\n    Args:\n        distances (np.ndarray[N, ]): The normalized distances.\n        thr (float): Threshold of the distances.\n\n    Returns:\n        float: Percentage of distances below the threshold. \\\n            If all target keypoints are missing, return -1.\n    \"\"\"\n    distance_valid = distances != -1\n    num_distance_valid = distance_valid.sum()\n    if num_distance_valid > 0:\n        return (distances[distance_valid] < thr).sum() / num_distance_valid\n    return -1\n\n\ndef keypoint_pck_accuracy(pred, gt, mask, thr, normalize):\n    \"\"\"Calculate the pose accuracy of PCK for each individual keypoint and the\n    averaged accuracy across all keypoints for coordinates.\n\n    Note:\n        PCK metric measures accuracy of the localization of the body joints.\n        The distances between predicted positions and the ground-truth ones\n        are typically normalized by the bounding box size.\n        The threshold (thr) of the normalized distance is commonly set\n        as 0.05, 0.1 or 0.2 etc.\n\n        - batch_size: N\n        - num_keypoints: K\n\n    Args:\n        pred (np.ndarray[N, K, 2]): Predicted keypoint location.\n        gt (np.ndarray[N, K, 2]): Groundtruth keypoint location.\n        mask (np.ndarray[N, K]): Visibility of the target. False for invisible\n            joints, and True for visible. 
Invisible joints will be ignored for\n            accuracy calculation.\n        thr (float): Threshold of PCK calculation.\n        normalize (np.ndarray[N, 2]): Normalization factor for H&W.\n\n    Returns:\n        tuple: A tuple containing keypoint accuracy.\n\n        - acc (np.ndarray[K]): Accuracy of each keypoint.\n        - avg_acc (float): Averaged accuracy across all keypoints.\n        - cnt (int): Number of valid keypoints.\n    \"\"\"\n    distances = _calc_distances(pred, gt, mask, normalize)\n\n    acc = np.array([_distance_acc(d, thr) for d in distances])\n    valid_acc = acc[acc >= 0]\n    cnt = len(valid_acc)\n    avg_acc = valid_acc.mean() if cnt > 0 else 0\n    return acc, avg_acc, cnt\n\n\ndef keypoint_auc(pred, gt, mask, normalize, num_step=20):\n    \"\"\"Calculate the Area under curve (AUC) of keypoint PCK accuracy, i.e. the\n    mean PCK over num_step evenly spaced distance thresholds in [0, 1).\n\n    Note:\n        - batch_size: N\n        - num_keypoints: K\n\n    Args:\n        pred (np.ndarray[N, K, 2]): Predicted keypoint location.\n        gt (np.ndarray[N, K, 2]): Groundtruth keypoint location.\n        mask (np.ndarray[N, K]): Visibility of the target. False for invisible\n            joints, and True for visible. Invisible joints will be ignored for\n            accuracy calculation.\n        normalize (float): Normalization factor.\n        num_step (int): Number of thresholds sampled in [0, 1). Default 20.\n\n    Returns:\n        float: Area under curve.\n    \"\"\"\n    nor = np.tile(np.array([[normalize, normalize]]), (pred.shape[0], 1))\n    x = [1.0 * i / num_step for i in range(num_step)]\n    y = []\n    for thr in x:\n        _, avg_acc, _ = keypoint_pck_accuracy(pred, gt, mask, thr, nor)\n        y.append(avg_acc)\n\n    auc = 0\n    for i in range(num_step):\n        auc += 1.0 / num_step * y[i]\n    return auc\n\n\ndef keypoint_epe(pred, gt, mask):\n    \"\"\"Calculate the end-point error.\n\n    Note:\n        - batch_size: N\n        - num_keypoints: K\n\n    Args:\n        pred (np.ndarray[N, K, 2]): Predicted keypoint location.\n        gt (np.ndarray[N, K, 2]): Groundtruth keypoint location.\n        mask (np.ndarray[N, K]): Visibility of the target. False for invisible\n            joints, and True for visible. Invisible joints will be ignored for\n            accuracy calculation.\n\n    Returns:\n        float: Average end-point error.\n    \"\"\"\n\n    normalize = np.ones((pred.shape[0], pred.shape[2]), dtype=np.float32)\n    distances = _calc_distances(pred, gt, mask, normalize)\n    distance_valid = distances[distances != -1]\n    return distance_valid.sum() / max(1, len(distance_valid))\n
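\n\nif __name__ == '__main__':\n    # Minimal smoke test, a sketch rather than part of the original module:\n    # it exercises oks_nms and keypoint_pck_accuracy on synthetic COCO-style\n    # inputs (17 keypoints); all shapes and values below are demo assumptions.\n    np.random.seed(0)\n    kpts_db = [\n        {'score': 0.9, 'keypoints': np.random.rand(17, 3) * 100, 'area': 900.},\n        {'score': 0.8, 'keypoints': np.random.rand(17, 3) * 100, 'area': 850.},\n    ]\n    print('oks_nms keep:', oks_nms(kpts_db, thresh=0.9))\n    pred = np.random.rand(1, 17, 2)\n    gt = pred + 0.01  # near-perfect predictions\n    mask = np.ones((1, 17), dtype=bool)\n    acc, avg_acc, cnt = keypoint_pck_accuracy(\n        pred, gt, mask, thr=0.5, normalize=np.ones((1, 2)))\n    print('PCK avg acc: %.3f over %d valid keypoints' % (avg_acc, cnt))\n"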
  },
  {
    "path": "ppdet/modeling/lane_utils.py",
    "content": "import os\nimport cv2\nimport numpy as np\nfrom scipy.interpolate import InterpolatedUnivariateSpline\n\n\nclass Lane:\n    def __init__(self, points=None, invalid_value=-2., metadata=None):\n        super(Lane, self).__init__()\n        self.curr_iter = 0\n        self.points = points\n        self.invalid_value = invalid_value\n        self.function = InterpolatedUnivariateSpline(\n            points[:, 1], points[:, 0], k=min(3, len(points) - 1))\n        self.min_y = points[:, 1].min() - 0.01\n        self.max_y = points[:, 1].max() + 0.01\n        self.metadata = metadata or {}\n\n    def __repr__(self):\n        return '[Lane]\\n' + str(self.points) + '\\n[/Lane]'\n\n    def __call__(self, lane_ys):\n        lane_xs = self.function(lane_ys)\n\n        lane_xs[(lane_ys < self.min_y) | (lane_ys > self.max_y\n                                          )] = self.invalid_value\n        return lane_xs\n\n    def to_array(self, sample_y_range, img_w, img_h):\n        self.sample_y = range(sample_y_range[0], sample_y_range[1],\n                              sample_y_range[2])\n        sample_y = self.sample_y\n        img_w, img_h = img_w, img_h\n        ys = np.array(sample_y) / float(img_h)\n        xs = self(ys)\n        valid_mask = (xs >= 0) & (xs < 1)\n        lane_xs = xs[valid_mask] * img_w\n        lane_ys = ys[valid_mask] * img_h\n        lane = np.concatenate(\n            (lane_xs.reshape(-1, 1), lane_ys.reshape(-1, 1)), axis=1)\n        return lane\n\n    def __iter__(self):\n        return self\n\n    def __next__(self):\n        if self.curr_iter < len(self.points):\n            self.curr_iter += 1\n            return self.points[self.curr_iter - 1]\n        self.curr_iter = 0\n        raise StopIteration\n\n\nCOLORS = [\n    (255, 0, 0),\n    (0, 255, 0),\n    (0, 0, 255),\n    (255, 255, 0),\n    (255, 0, 255),\n    (0, 255, 255),\n    (128, 255, 0),\n    (255, 128, 0),\n    (128, 0, 255),\n    (255, 0, 128),\n    (0, 128, 255),\n    (0, 255, 128),\n    (128, 255, 255),\n    (255, 128, 255),\n    (255, 255, 128),\n    (60, 180, 0),\n    (180, 60, 0),\n    (0, 60, 180),\n    (0, 180, 60),\n    (60, 0, 180),\n    (180, 0, 60),\n    (255, 0, 0),\n    (0, 255, 0),\n    (0, 0, 255),\n    (255, 255, 0),\n    (255, 0, 255),\n    (0, 255, 255),\n    (128, 255, 0),\n    (255, 128, 0),\n    (128, 0, 255),\n]\n\n\ndef imshow_lanes(img, lanes, show=False, out_file=None, width=4):\n    lanes_xys = []\n    for _, lane in enumerate(lanes):\n        xys = []\n        for x, y in lane:\n            if x <= 0 or y <= 0:\n                continue\n            x, y = int(x), int(y)\n            xys.append((x, y))\n        lanes_xys.append(xys)\n    lanes_xys.sort(key=lambda xys: xys[0][0] if len(xys) > 0 else 0)\n\n    for idx, xys in enumerate(lanes_xys):\n        for i in range(1, len(xys)):\n            cv2.line(img, xys[i - 1], xys[i], COLORS[idx], thickness=width)\n\n    if show:\n        cv2.imshow('view', img)\n        cv2.waitKey(0)\n\n    if out_file:\n        if not os.path.exists(os.path.dirname(out_file)):\n            os.makedirs(os.path.dirname(out_file))\n        cv2.imwrite(out_file, img)\n"
  },
  {
    "path": "ppdet/modeling/layers.py",
    "content": "#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport math\nimport six\nimport numpy as np\nfrom numbers import Integral\n\nimport paddle\nimport paddle.nn as nn\nfrom paddle import ParamAttr\nfrom paddle import to_tensor\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import Normal, Constant, XavierUniform\nfrom paddle.regularizer import L2Decay\n\nfrom ppdet.core.workspace import register, serializable\nfrom ppdet.modeling.bbox_utils import delta2bbox\nfrom . import ops\nfrom .initializer import xavier_uniform_, constant_\n\nfrom paddle.vision.ops import DeformConv2D\n\n\ndef _to_list(l):\n    if isinstance(l, (list, tuple)):\n        return list(l)\n    return [l]\n\n\nclass AlignConv(nn.Layer):\n    def __init__(self, in_channels, out_channels, kernel_size=3, groups=1):\n        super(AlignConv, self).__init__()\n        self.kernel_size = kernel_size\n        self.align_conv = paddle.vision.ops.DeformConv2D(\n            in_channels,\n            out_channels,\n            kernel_size=self.kernel_size,\n            padding=(self.kernel_size - 1) // 2,\n            groups=groups,\n            weight_attr=ParamAttr(initializer=Normal(0, 0.01)),\n            bias_attr=None)\n\n    @paddle.no_grad()\n    def get_offset(self, anchors, featmap_size, stride):\n        \"\"\"\n        Args:\n            anchors: [B, L, 5] xc,yc,w,h,angle\n            featmap_size: (feat_h, feat_w)\n            stride: 8\n        Returns:\n\n        \"\"\"\n        batch = anchors.shape[0]\n        dtype = anchors.dtype\n        feat_h, feat_w = featmap_size\n        pad = (self.kernel_size - 1) // 2\n        idx = paddle.arange(-pad, pad + 1, dtype=dtype)\n\n        yy, xx = paddle.meshgrid(idx, idx)\n        xx = paddle.reshape(xx, [-1])\n        yy = paddle.reshape(yy, [-1])\n\n        # get sampling locations of default conv\n        xc = paddle.arange(0, feat_w, dtype=dtype)\n        yc = paddle.arange(0, feat_h, dtype=dtype)\n        yc, xc = paddle.meshgrid(yc, xc)\n\n        xc = paddle.reshape(xc, [-1, 1])\n        yc = paddle.reshape(yc, [-1, 1])\n        x_conv = xc + xx\n        y_conv = yc + yy\n\n        # get sampling locations of anchors\n        x_ctr, y_ctr, w, h, a = paddle.split(anchors, 5, axis=-1)\n        x_ctr = x_ctr / stride\n        y_ctr = y_ctr / stride\n        w_s = w / stride\n        h_s = h / stride\n        cos, sin = paddle.cos(a), paddle.sin(a)\n        dw, dh = w_s / self.kernel_size, h_s / self.kernel_size\n        x, y = dw * xx, dh * yy\n        xr = cos * x - sin * y\n        yr = sin * x + cos * y\n        x_anchor, y_anchor = xr + x_ctr, yr + y_ctr\n        # get offset filed\n        offset_x = x_anchor - x_conv\n        offset_y = y_anchor - y_conv\n        offset = paddle.stack([offset_y, offset_x], axis=-1)\n        offset = offset.reshape(\n            [batch, feat_h, feat_w, self.kernel_size * self.kernel_size * 2])\n        offset = offset.transpose([0, 3, 
1, 2])\n\n        return offset\n\n    def forward(self, x, refine_anchors, featmap_size, stride):\n        offset = self.get_offset(refine_anchors, featmap_size, stride)\n        if self.training:\n            x = F.relu(self.align_conv(x, offset.detach()))\n        else:\n            x = F.relu(self.align_conv(x, offset))\n        return x\n\n\nclass DeformableConvV2(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size,\n                 stride=1,\n                 padding=0,\n                 dilation=1,\n                 groups=1,\n                 weight_attr=None,\n                 bias_attr=None,\n                 lr_scale=1,\n                 regularizer=None,\n                 skip_quant=False,\n                 dcn_bias_regularizer=L2Decay(0.),\n                 dcn_bias_lr_scale=2.):\n        super(DeformableConvV2, self).__init__()\n        self.offset_channel = 2 * kernel_size**2\n        self.mask_channel = kernel_size**2\n\n        if lr_scale == 1 and regularizer is None:\n            offset_bias_attr = ParamAttr(initializer=Constant(0.))\n        else:\n            offset_bias_attr = ParamAttr(\n                initializer=Constant(0.),\n                learning_rate=lr_scale,\n                regularizer=regularizer)\n        self.conv_offset = nn.Conv2D(\n            in_channels,\n            3 * kernel_size**2,\n            kernel_size,\n            stride=stride,\n            padding=(kernel_size - 1) // 2,\n            weight_attr=ParamAttr(initializer=Constant(0.0)),\n            bias_attr=offset_bias_attr)\n        if skip_quant:\n            self.conv_offset.skip_quant = True\n\n        if bias_attr:\n            # in FCOS-DCN head, specifically need learning_rate and regularizer\n            dcn_bias_attr = ParamAttr(\n                initializer=Constant(value=0),\n                regularizer=dcn_bias_regularizer,\n                learning_rate=dcn_bias_lr_scale)\n        else:\n            # in ResNet backbone, do not need bias\n            dcn_bias_attr = False\n        self.conv_dcn = DeformConv2D(\n            in_channels,\n            out_channels,\n            kernel_size,\n            stride=stride,\n            padding=(kernel_size - 1) // 2 * dilation,\n            dilation=dilation,\n            groups=groups,\n            weight_attr=weight_attr,\n            bias_attr=dcn_bias_attr)\n\n    def forward(self, x):\n        offset_mask = self.conv_offset(x)\n        offset, mask = paddle.split(\n            offset_mask,\n            num_or_sections=[self.offset_channel, self.mask_channel],\n            axis=1)\n        mask = F.sigmoid(mask)\n        y = self.conv_dcn(x, offset, mask=mask)\n        return y\n\n\nclass ConvNormLayer(nn.Layer):\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 filter_size,\n                 stride,\n                 groups=1,\n                 norm_type='bn',\n                 norm_decay=0.,\n                 norm_groups=32,\n                 use_dcn=False,\n                 bias_on=False,\n                 lr_scale=1.,\n                 freeze_norm=False,\n                 initializer=Normal(\n                     mean=0., std=0.01),\n                 skip_quant=False,\n                 dcn_lr_scale=2.,\n                 dcn_regularizer=L2Decay(0.)):\n        super(ConvNormLayer, self).__init__()\n        assert norm_type in ['bn', 'sync_bn', 'gn', None]\n\n        
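# only materialize a conv bias when bias_on is set; otherwise the\n        # following norm layer (when present) provides the shift term\n        if 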
bias_on:\n            bias_attr = ParamAttr(\n                initializer=Constant(value=0.), learning_rate=lr_scale)\n        else:\n            bias_attr = False\n\n        if not use_dcn:\n            self.conv = nn.Conv2D(\n                in_channels=ch_in,\n                out_channels=ch_out,\n                kernel_size=filter_size,\n                stride=stride,\n                padding=(filter_size - 1) // 2,\n                groups=groups,\n                weight_attr=ParamAttr(\n                    initializer=initializer, learning_rate=1.),\n                bias_attr=bias_attr)\n            if skip_quant:\n                self.conv.skip_quant = True\n        else:\n            # in FCOS-DCN head, specifically need learning_rate and regularizer\n            self.conv = DeformableConvV2(\n                in_channels=ch_in,\n                out_channels=ch_out,\n                kernel_size=filter_size,\n                stride=stride,\n                padding=(filter_size - 1) // 2,\n                groups=groups,\n                weight_attr=ParamAttr(\n                    initializer=initializer, learning_rate=1.),\n                bias_attr=True,\n                lr_scale=dcn_lr_scale,\n                regularizer=dcn_regularizer,\n                dcn_bias_regularizer=dcn_regularizer,\n                dcn_bias_lr_scale=dcn_lr_scale,\n                skip_quant=skip_quant)\n\n        norm_lr = 0. if freeze_norm else 1.\n        param_attr = ParamAttr(\n            learning_rate=norm_lr,\n            regularizer=L2Decay(norm_decay) if norm_decay is not None else None)\n        bias_attr = ParamAttr(\n            learning_rate=norm_lr,\n            regularizer=L2Decay(norm_decay) if norm_decay is not None else None)\n        if norm_type in ['bn', 'sync_bn']:\n            self.norm = nn.BatchNorm2D(\n                ch_out, weight_attr=param_attr, bias_attr=bias_attr)\n        elif norm_type == 'gn':\n            self.norm = nn.GroupNorm(\n                num_groups=norm_groups,\n                num_channels=ch_out,\n                weight_attr=param_attr,\n                bias_attr=bias_attr)\n        else:\n            self.norm = None\n\n    def forward(self, inputs):\n        out = self.conv(inputs)\n        if self.norm is not None:\n            out = self.norm(out)\n        return out\n\n\nclass LiteConv(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 stride=1,\n                 with_act=True,\n                 norm_type='sync_bn',\n                 name=None):\n        super(LiteConv, self).__init__()\n        self.lite_conv = nn.Sequential()\n        conv1 = ConvNormLayer(\n            in_channels,\n            in_channels,\n            filter_size=5,\n            stride=stride,\n            groups=in_channels,\n            norm_type=norm_type,\n            initializer=XavierUniform())\n        conv2 = ConvNormLayer(\n            in_channels,\n            out_channels,\n            filter_size=1,\n            stride=stride,\n            norm_type=norm_type,\n            initializer=XavierUniform())\n        conv3 = ConvNormLayer(\n            out_channels,\n            out_channels,\n            filter_size=1,\n            stride=stride,\n            norm_type=norm_type,\n            initializer=XavierUniform())\n        conv4 = ConvNormLayer(\n            out_channels,\n            out_channels,\n            filter_size=5,\n            stride=stride,\n            groups=out_channels,\n            
norm_type=norm_type,\n            initializer=XavierUniform())\n        self.lite_conv.add_sublayer('conv1', conv1)\n        self.lite_conv.add_sublayer('relu6_1', nn.ReLU6())\n        self.lite_conv.add_sublayer('conv2', conv2)\n        if with_act:\n            self.lite_conv.add_sublayer('relu6_2', nn.ReLU6())\n        self.lite_conv.add_sublayer('conv3', conv3)\n        self.lite_conv.add_sublayer('relu6_3', nn.ReLU6())\n        self.lite_conv.add_sublayer('conv4', conv4)\n        if with_act:\n            self.lite_conv.add_sublayer('relu6_4', nn.ReLU6())\n\n    def forward(self, inputs):\n        out = self.lite_conv(inputs)\n        return out\n\n\nclass DropBlock(nn.Layer):\n    def __init__(self, block_size, keep_prob, name=None, data_format='NCHW'):\n        \"\"\"\n        DropBlock layer, see https://arxiv.org/abs/1810.12890\n\n        Args:\n            block_size (int): block size\n            keep_prob (float): keep probability\n            name (str): layer name\n            data_format (str): data format, NCHW or NHWC\n        \"\"\"\n        super(DropBlock, self).__init__()\n        self.block_size = block_size\n        self.keep_prob = keep_prob\n        self.name = name\n        self.data_format = data_format\n\n    def forward(self, x):\n        if not self.training or self.keep_prob == 1:\n            return x\n        else:\n            gamma = (1. - self.keep_prob) / (self.block_size**2)\n            if self.data_format == 'NCHW':\n                shape = x.shape[2:]\n            else:\n                shape = x.shape[1:3]\n            for s in shape:\n                gamma *= s / (s - self.block_size + 1)\n\n            matrix = paddle.cast(paddle.rand(x.shape) < gamma, x.dtype)\n            mask_inv = F.max_pool2d(\n                matrix,\n                self.block_size,\n                stride=1,\n                padding=self.block_size // 2,\n                data_format=self.data_format)\n            
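# max_pool2d marks every position within block_size of a sampled\n            # center; inverting it yields the keep-mask (zeros inside\n            # dropped blocks)\n            mask = 1. 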
- mask_inv\n            mask = mask.astype('float32')\n            x = x.astype('float32')\n            y = x * mask * (mask.numel() / mask.sum())\n            return y\n\n\n@register\n@serializable\nclass AnchorGeneratorSSD(object):\n    def __init__(self,\n                 steps=[8, 16, 32, 64, 100, 300],\n                 aspect_ratios=[[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]],\n                 min_ratio=15,\n                 max_ratio=90,\n                 base_size=300,\n                 min_sizes=[30.0, 60.0, 111.0, 162.0, 213.0, 264.0],\n                 max_sizes=[60.0, 111.0, 162.0, 213.0, 264.0, 315.0],\n                 offset=0.5,\n                 flip=True,\n                 clip=False,\n                 min_max_aspect_ratios_order=False):\n        self.steps = steps\n        self.aspect_ratios = aspect_ratios\n        self.min_ratio = min_ratio\n        self.max_ratio = max_ratio\n        self.base_size = base_size\n        self.min_sizes = min_sizes\n        self.max_sizes = max_sizes\n        self.offset = offset\n        self.flip = flip\n        self.clip = clip\n        self.min_max_aspect_ratios_order = min_max_aspect_ratios_order\n\n        if self.min_sizes == [] and self.max_sizes == []:\n            num_layer = len(aspect_ratios)\n            step = int(\n                math.floor(((self.max_ratio - self.min_ratio)) / (num_layer - 2\n                                                                  )))\n            for ratio in six.moves.range(self.min_ratio, self.max_ratio + 1,\n                                         step):\n                self.min_sizes.append(self.base_size * ratio / 100.)\n                self.max_sizes.append(self.base_size * (ratio + step) / 100.)\n            self.min_sizes = [self.base_size * .10] + self.min_sizes\n            self.max_sizes = [self.base_size * .20] + self.max_sizes\n\n        self.num_priors = []\n        for aspect_ratio, min_size, max_size in zip(\n                aspect_ratios, self.min_sizes, self.max_sizes):\n            if isinstance(min_size, (list, tuple)):\n                self.num_priors.append(\n                    len(_to_list(min_size)) + len(_to_list(max_size)))\n            else:\n                self.num_priors.append((len(aspect_ratio) * 2 + 1) * len(\n                    _to_list(min_size)) + len(_to_list(max_size)))\n\n    def __call__(self, inputs, image):\n        boxes = []\n        for input, min_size, max_size, aspect_ratio, step in zip(\n                inputs, self.min_sizes, self.max_sizes, self.aspect_ratios,\n                self.steps):\n            box, _ = ops.prior_box(\n                input=input,\n                image=image,\n                min_sizes=_to_list(min_size),\n                max_sizes=_to_list(max_size),\n                aspect_ratios=aspect_ratio,\n                flip=self.flip,\n                clip=self.clip,\n                steps=[step, step],\n                offset=self.offset,\n                min_max_aspect_ratios_order=self.min_max_aspect_ratios_order)\n            boxes.append(paddle.reshape(box, [-1, 4]))\n        return boxes\n\n\n@register\n@serializable\nclass RCNNBox(object):\n    __shared__ = ['num_classes', 'export_onnx']\n\n    def __init__(self,\n                 prior_box_var=[10., 10., 5., 5.],\n                 code_type=\"decode_center_size\",\n                 box_normalized=False,\n                 num_classes=80,\n                 export_onnx=False):\n        super(RCNNBox, self).__init__()\n        self.prior_box_var = 
prior_box_var\n        self.code_type = code_type\n        self.box_normalized = box_normalized\n        self.num_classes = num_classes\n        self.export_onnx = export_onnx\n\n    def __call__(self, bbox_head_out, rois, im_shape, scale_factor):\n        bbox_pred = bbox_head_out[0]\n        cls_prob = bbox_head_out[1]\n        roi = rois[0]\n        rois_num = rois[1]\n\n        if self.export_onnx:\n            onnx_rois_num_per_im = rois_num[0]\n            origin_shape = paddle.expand(im_shape[0, :],\n                                         [onnx_rois_num_per_im, 2])\n\n        else:\n            origin_shape_list = []\n            if isinstance(roi, list):\n                batch_size = len(roi)\n            else:\n                batch_size = paddle.slice(paddle.shape(im_shape), [0], [0], [1])\n\n            # bbox_pred.shape: [N, C*4]\n            for idx in range(batch_size):\n                rois_num_per_im = rois_num[idx]\n                expand_im_shape = paddle.expand(im_shape[idx, :],\n                                                [rois_num_per_im, 2])\n                origin_shape_list.append(expand_im_shape)\n\n            origin_shape = paddle.concat(origin_shape_list)\n\n        # bbox_pred.shape: [N, C*4]\n        # C=num_classes in faster/mask rcnn(bbox_head), C=1 in cascade rcnn(cascade_head)\n        bbox = paddle.concat(roi)\n        bbox = delta2bbox(bbox_pred, bbox, self.prior_box_var)\n        scores = cls_prob[:, :-1]\n\n        # bbox.shape: [N, C, 4]\n        # bbox.shape[1] must be equal to scores.shape[1]\n        total_num = bbox.shape[0]\n        bbox_dim = bbox.shape[-1]\n        bbox = paddle.expand(bbox, [total_num, self.num_classes, bbox_dim])\n\n        origin_h = paddle.unsqueeze(origin_shape[:, 0], axis=1)\n        origin_w = paddle.unsqueeze(origin_shape[:, 1], axis=1)\n        zeros = paddle.zeros_like(origin_h)\n        x1 = paddle.maximum(paddle.minimum(bbox[:, :, 0], origin_w), zeros)\n        y1 = paddle.maximum(paddle.minimum(bbox[:, :, 1], origin_h), zeros)\n        x2 = paddle.maximum(paddle.minimum(bbox[:, :, 2], origin_w), zeros)\n        y2 = paddle.maximum(paddle.minimum(bbox[:, :, 3], origin_h), zeros)\n        bbox = paddle.stack([x1, y1, x2, y2], axis=-1)\n        bboxes = (bbox, rois_num)\n        return bboxes, scores\n\n\n@register\n@serializable\nclass MultiClassNMS(object):\n    def __init__(self,\n                 score_threshold=.05,\n                 nms_top_k=-1,\n                 keep_top_k=100,\n                 nms_threshold=.5,\n                 normalized=True,\n                 nms_eta=1.0,\n                 return_index=False,\n                 return_rois_num=True,\n                 trt=False,\n                 cpu=False):\n        super(MultiClassNMS, self).__init__()\n        self.score_threshold = score_threshold\n        self.nms_top_k = nms_top_k\n        self.keep_top_k = keep_top_k\n        self.nms_threshold = nms_threshold\n        self.normalized = normalized\n        self.nms_eta = nms_eta\n        self.return_index = return_index\n        self.return_rois_num = return_rois_num\n        self.trt = trt\n        self.cpu = cpu\n\n    def __call__(self, bboxes, score, background_label=-1):\n        \"\"\"\n        bboxes (Tensor|List[Tensor]): 1. (Tensor) Predicted bboxes with shape \n                                         [N, M, 4], N is the batch size and M\n                                         is the number of bboxes\n                                      2. 
(List[Tensor]) bboxes and bbox_num,\n                                         bboxes have shape of [M, C, 4], C\n                                         is the class number and bbox_num means\n                                         the number of bboxes of each batch with\n                                         shape [N,] \n        score (Tensor): Predicted scores with shape [N, C, M] or [M, C]\n        background_label (int): Ignore the background label; For example, RCNN\n                                is num_classes and YOLO is -1. \n        \"\"\"\n        kwargs = self.__dict__.copy()\n        if isinstance(bboxes, tuple):\n            bboxes, bbox_num = bboxes\n            kwargs.update({'rois_num': bbox_num})\n        if background_label > -1:\n            kwargs.update({'background_label': background_label})\n        kwargs.pop('trt')\n        kwargs.pop('cpu')\n\n        # TODO(wangxinxin08): paddle version should be develop or 2.3 and above to run nms on tensorrt\n        if self.trt and (int(paddle.version.major) == 0 or\n                         (int(paddle.version.major) >= 2 and\n                          int(paddle.version.minor) >= 3)):\n            # TODO(wangxinxin08): tricky switch to run nms on tensorrt\n            kwargs.update({'nms_eta': 1.1})\n            bbox, bbox_num, _ = ops.multiclass_nms(bboxes, score, **kwargs)\n            bbox = bbox.reshape([1, -1, 6])\n            idx = paddle.nonzero(bbox[..., 0] != -1)\n            bbox = paddle.gather_nd(bbox, idx)\n            return bbox, bbox_num, None\n        else:\n            if self.cpu:\n                device = paddle.device.get_device()\n                paddle.set_device('cpu')\n                outputs = ops.multiclass_nms(bboxes, score, **kwargs)\n                paddle.set_device(device)\n                return outputs\n            else:\n                return ops.multiclass_nms(bboxes, score, **kwargs)\n\n\n@register\n@serializable\nclass MatrixNMS(object):\n    __append_doc__ = True\n\n    def __init__(self,\n                 score_threshold=.05,\n                 post_threshold=.05,\n                 nms_top_k=-1,\n                 keep_top_k=100,\n                 use_gaussian=False,\n                 gaussian_sigma=2.,\n                 normalized=False,\n                 background_label=0):\n        super(MatrixNMS, self).__init__()\n        self.score_threshold = score_threshold\n        self.post_threshold = post_threshold\n        self.nms_top_k = nms_top_k\n        self.keep_top_k = keep_top_k\n        self.normalized = normalized\n        self.use_gaussian = use_gaussian\n        self.gaussian_sigma = gaussian_sigma\n        self.background_label = background_label\n\n    def __call__(self, bbox, score, *args):\n        return ops.matrix_nms(\n            bboxes=bbox,\n            scores=score,\n            score_threshold=self.score_threshold,\n            post_threshold=self.post_threshold,\n            nms_top_k=self.nms_top_k,\n            keep_top_k=self.keep_top_k,\n            use_gaussian=self.use_gaussian,\n            gaussian_sigma=self.gaussian_sigma,\n            background_label=self.background_label,\n            normalized=self.normalized)\n\n\n@register\n@serializable\nclass YOLOBox(object):\n    __shared__ = ['num_classes']\n\n    def __init__(self,\n                 num_classes=80,\n                 conf_thresh=0.005,\n                 downsample_ratio=32,\n                 clip_bbox=True,\n                 scale_x_y=1.):\n        self.num_classes = num_classes\n       
 self.conf_thresh = conf_thresh\n        self.downsample_ratio = downsample_ratio\n        self.clip_bbox = clip_bbox\n        self.scale_x_y = scale_x_y\n\n    def __call__(self,\n                 yolo_head_out,\n                 anchors,\n                 im_shape,\n                 scale_factor,\n                 var_weight=None):\n        boxes_list = []\n        scores_list = []\n        origin_shape = im_shape / scale_factor\n        origin_shape = paddle.cast(origin_shape, 'int32')\n        for i, head_out in enumerate(yolo_head_out):\n            boxes, scores = paddle.vision.ops.yolo_box(\n                head_out,\n                origin_shape,\n                anchors[i],\n                self.num_classes,\n                self.conf_thresh,\n                self.downsample_ratio // 2**i,\n                self.clip_bbox,\n                scale_x_y=self.scale_x_y)\n            boxes_list.append(boxes)\n            scores_list.append(paddle.transpose(scores, perm=[0, 2, 1]))\n        yolo_boxes = paddle.concat(boxes_list, axis=1)\n        yolo_scores = paddle.concat(scores_list, axis=2)\n        return yolo_boxes, yolo_scores\n\n\n@register\n@serializable\nclass SSDBox(object):\n    def __init__(self,\n                 is_normalized=True,\n                 prior_box_var=[0.1, 0.1, 0.2, 0.2],\n                 use_fuse_decode=False):\n        self.is_normalized = is_normalized\n        self.norm_delta = float(not self.is_normalized)\n        self.prior_box_var = prior_box_var\n        self.use_fuse_decode = use_fuse_decode\n\n    def __call__(self,\n                 preds,\n                 prior_boxes,\n                 im_shape,\n                 scale_factor,\n                 var_weight=None):\n        boxes, scores = preds\n        boxes = paddle.concat(boxes, axis=1)\n        prior_boxes = paddle.concat(prior_boxes)\n        if self.use_fuse_decode:\n            output_boxes = ops.box_coder(\n                prior_boxes,\n                self.prior_box_var,\n                boxes,\n                code_type=\"decode_center_size\",\n                box_normalized=self.is_normalized)\n        else:\n            pb_w = prior_boxes[:, 2] - prior_boxes[:, 0] + self.norm_delta\n            pb_h = prior_boxes[:, 3] - prior_boxes[:, 1] + self.norm_delta\n            pb_x = prior_boxes[:, 0] + pb_w * 0.5\n            pb_y = prior_boxes[:, 1] + pb_h * 0.5\n            out_x = pb_x + boxes[:, :, 0] * pb_w * self.prior_box_var[0]\n            out_y = pb_y + boxes[:, :, 1] * pb_h * self.prior_box_var[1]\n            out_w = paddle.exp(boxes[:, :, 2] * self.prior_box_var[2]) * pb_w\n            out_h = paddle.exp(boxes[:, :, 3] * self.prior_box_var[3]) * pb_h\n            output_boxes = paddle.stack(\n                [\n                    out_x - out_w / 2., out_y - out_h / 2., out_x + out_w / 2.,\n                    out_y + out_h / 2.\n                ],\n                axis=-1)\n\n        if self.is_normalized:\n            h = (im_shape[:, 0] / scale_factor[:, 0]).unsqueeze(-1)\n            w = (im_shape[:, 1] / scale_factor[:, 1]).unsqueeze(-1)\n            im_shape = paddle.stack([w, h, w, h], axis=-1)\n            output_boxes *= im_shape\n        else:\n            output_boxes[..., -2:] -= 1.0\n        output_scores = F.softmax(paddle.concat(\n            scores, axis=1)).transpose([0, 2, 1])\n\n        return output_boxes, output_scores\n\n\n@register\nclass TTFBox(object):\n    __shared__ = ['down_ratio']\n\n    def __init__(self, max_per_img=100, score_thresh=0.01, 
down_ratio=4):\n        super(TTFBox, self).__init__()\n        self.max_per_img = max_per_img\n        self.score_thresh = score_thresh\n        self.down_ratio = down_ratio\n\n    def _simple_nms(self, heat, kernel=3):\n        \"\"\"\n        Use maxpool to filter the max score, get local peaks.\n        \"\"\"\n        pad = (kernel - 1) // 2\n        hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad)\n        keep = paddle.cast(hmax == heat, 'float32')\n        return heat * keep\n\n    def _topk(self, scores):\n        \"\"\"\n        Select top k scores and decode to get xy coordinates.\n        \"\"\"\n        k = self.max_per_img\n        shape_fm = paddle.shape(scores)\n        shape_fm.stop_gradient = True\n        cat, height, width = shape_fm[1], shape_fm[2], shape_fm[3]\n        # batch size is 1\n        scores_r = paddle.reshape(scores, [cat, -1])\n        topk_scores, topk_inds = paddle.topk(scores_r, k)\n        topk_ys = topk_inds // width\n        topk_xs = topk_inds % width\n\n        topk_score_r = paddle.reshape(topk_scores, [-1])\n        topk_score, topk_ind = paddle.topk(topk_score_r, k)\n        k_t = paddle.full(topk_ind.shape, k, dtype='int64')\n        topk_clses = paddle.cast(paddle.floor_divide(topk_ind, k_t), 'float32')\n\n        topk_inds = paddle.reshape(topk_inds, [-1])\n        topk_ys = paddle.reshape(topk_ys, [-1, 1])\n        topk_xs = paddle.reshape(topk_xs, [-1, 1])\n        topk_inds = paddle.gather(topk_inds, topk_ind)\n        topk_ys = paddle.gather(topk_ys, topk_ind)\n        topk_xs = paddle.gather(topk_xs, topk_ind)\n\n        return topk_score, topk_inds, topk_clses, topk_ys, topk_xs\n\n    def _decode(self, hm, wh, im_shape, scale_factor):\n        heatmap = F.sigmoid(hm)\n        heat = self._simple_nms(heatmap)\n        scores, inds, clses, ys, xs = self._topk(heat)\n        ys = paddle.cast(ys, 'float32') * self.down_ratio\n        xs = paddle.cast(xs, 'float32') * self.down_ratio\n        scores = paddle.tensor.unsqueeze(scores, [1])\n        clses = paddle.tensor.unsqueeze(clses, [1])\n\n        wh_t = paddle.transpose(wh, [0, 2, 3, 1])\n        wh = paddle.reshape(wh_t, [-1, wh_t.shape[-1]])\n        wh = paddle.gather(wh, inds)\n\n        x1 = xs - wh[:, 0:1]\n        y1 = ys - wh[:, 1:2]\n        x2 = xs + wh[:, 2:3]\n        y2 = ys + wh[:, 3:4]\n\n        bboxes = paddle.concat([x1, y1, x2, y2], axis=1)\n\n        scale_y = scale_factor[:, 0:1]\n        scale_x = scale_factor[:, 1:2]\n        scale_expand = paddle.concat(\n            [scale_x, scale_y, scale_x, scale_y], axis=1)\n        boxes_shape = paddle.shape(bboxes)\n        boxes_shape.stop_gradient = True\n        scale_expand = paddle.expand(scale_expand, shape=boxes_shape)\n        bboxes = paddle.divide(bboxes, scale_expand)\n        results = paddle.concat([clses, scores, bboxes], axis=1)\n        # hack: append result with cls=-1 and score=1. 
to avoid all scores\n        # are less than score_thresh which may cause error in gather.\n        fill_r = paddle.to_tensor(np.array([[-1, 1, 0, 0, 0, 0]]))\n        fill_r = paddle.cast(fill_r, results.dtype)\n        results = paddle.concat([results, fill_r])\n        scores = results[:, 1]\n        valid_ind = paddle.nonzero(scores > self.score_thresh)\n        results = paddle.gather(results, valid_ind)\n        return results, results.shape[0:1]\n\n    def __call__(self, hm, wh, im_shape, scale_factor):\n        results = []\n        results_num = []\n        for i in range(scale_factor.shape[0]):\n            result, num = self._decode(hm[i:i + 1, ], wh[i:i + 1, ],\n                                       im_shape[i:i + 1, ],\n                                       scale_factor[i:i + 1, ])\n            results.append(result)\n            results_num.append(num)\n        results = paddle.concat(results, axis=0)\n        results_num = paddle.concat(results_num, axis=0)\n        return results, results_num\n\n\n@register\n@serializable\nclass JDEBox(object):\n    __shared__ = ['num_classes']\n\n    def __init__(self, num_classes=1, conf_thresh=0.3, downsample_ratio=32):\n        self.num_classes = num_classes\n        self.conf_thresh = conf_thresh\n        self.downsample_ratio = downsample_ratio\n\n    def generate_anchor(self, nGh, nGw, anchor_wh):\n        nA = len(anchor_wh)\n        yv, xv = paddle.meshgrid([paddle.arange(nGh), paddle.arange(nGw)])\n        mesh = paddle.stack(\n            (xv, yv), axis=0).cast(dtype='float32')  # 2 x nGh x nGw\n        meshs = paddle.tile(mesh, [nA, 1, 1, 1])\n\n        anchor_offset_mesh = anchor_wh[:, :, None][:, :, :, None].repeat(\n            int(nGh), axis=-2).repeat(\n                int(nGw), axis=-1)\n        anchor_offset_mesh = paddle.to_tensor(\n            anchor_offset_mesh.astype(np.float32))\n        # nA x 2 x nGh x nGw\n\n        anchor_mesh = paddle.concat([meshs, anchor_offset_mesh], axis=1)\n        anchor_mesh = paddle.transpose(anchor_mesh,\n                                       [0, 2, 3, 1])  # (nA x nGh x nGw) x 4\n        return anchor_mesh\n\n    def decode_delta(self, delta, fg_anchor_list):\n        px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \\\n                        fg_anchor_list[:, 2], fg_anchor_list[:,3]\n        dx, dy, dw, dh = delta[:, 0], delta[:, 1], delta[:, 2], delta[:, 3]\n        gx = pw * dx + px\n        gy = ph * dy + py\n        gw = pw * paddle.exp(dw)\n        gh = ph * paddle.exp(dh)\n        gx1 = gx - gw * 0.5\n        gy1 = gy - gh * 0.5\n        gx2 = gx + gw * 0.5\n        gy2 = gy + gh * 0.5\n        return paddle.stack([gx1, gy1, gx2, gy2], axis=1)\n\n    def decode_delta_map(self, nA, nGh, nGw, delta_map, anchor_vec):\n        anchor_mesh = self.generate_anchor(nGh, nGw, anchor_vec)\n        anchor_mesh = paddle.unsqueeze(anchor_mesh, 0)\n        pred_list = self.decode_delta(\n            paddle.reshape(\n                delta_map, shape=[-1, 4]),\n            paddle.reshape(\n                anchor_mesh, shape=[-1, 4]))\n        pred_map = paddle.reshape(pred_list, shape=[nA * nGh * nGw, 4])\n        return pred_map\n\n    def _postprocessing_by_level(self, nA, stride, head_out, anchor_vec):\n        boxes_shape = head_out.shape  # [nB, nA*6, nGh, nGw]\n        nGh, nGw = boxes_shape[-2], boxes_shape[-1]\n        nB = 1  # TODO: only support bs=1 now\n        boxes_list, scores_list = [], []\n        for idx in range(nB):\n            p = paddle.reshape(\n           
     head_out[idx], shape=[nA, self.num_classes + 5, nGh, nGw])\n            p = paddle.transpose(p, perm=[0, 2, 3, 1])  # [nA, nGh, nGw, 6]\n            delta_map = p[:, :, :, :4]\n            boxes = self.decode_delta_map(nA, nGh, nGw, delta_map, anchor_vec)\n            # [nA * nGh * nGw, 4]\n            boxes_list.append(boxes * stride)\n\n            p_conf = paddle.transpose(\n                p[:, :, :, 4:6], perm=[3, 0, 1, 2])  # [2, nA, nGh, nGw]\n            p_conf = F.softmax(\n                p_conf, axis=0)[1, :, :, :].unsqueeze(-1)  # [nA, nGh, nGw, 1]\n            scores = paddle.reshape(p_conf, shape=[nA * nGh * nGw, 1])\n            scores_list.append(scores)\n\n        boxes_results = paddle.stack(boxes_list)\n        scores_results = paddle.stack(scores_list)\n        return boxes_results, scores_results\n\n    def __call__(self, yolo_head_out, anchors):\n        bbox_pred_list = []\n        for i, head_out in enumerate(yolo_head_out):\n            stride = self.downsample_ratio // 2**i\n            anc_w, anc_h = anchors[i][0::2], anchors[i][1::2]\n            anchor_vec = np.stack((anc_w, anc_h), axis=1) / stride\n            nA = len(anc_w)\n            boxes, scores = self._postprocessing_by_level(nA, stride, head_out,\n                                                          anchor_vec)\n            bbox_pred_list.append(paddle.concat([boxes, scores], axis=-1))\n\n        yolo_boxes_scores = paddle.concat(bbox_pred_list, axis=1)\n        boxes_idx_over_conf_thr = paddle.nonzero(\n            yolo_boxes_scores[:, :, -1] > self.conf_thresh)\n        boxes_idx_over_conf_thr.stop_gradient = True\n\n        return boxes_idx_over_conf_thr, yolo_boxes_scores\n\n\n@register\n@serializable\nclass MaskMatrixNMS(object):\n    \"\"\"\n    Matrix NMS for multi-class masks.\n    Args:\n        update_threshold (float): Updated threshold of category score for the\n            second-round filtering after the scores are decayed.\n        pre_nms_top_n (int): Number of total instances to be kept per image before NMS.\n        post_nms_top_n (int): Number of total instances to be kept per image after NMS.\n        kernel (str): 'linear' or 'gaussian'.\n        sigma (float): std in gaussian method.\n    Input:\n        seg_preds (Variable): shape (n, h, w), segmentation feature maps\n        seg_masks (Variable): shape (n, h, w), binarized segmentation masks\n        cate_labels (Variable): shape (n), mask labels in descending order\n        cate_scores (Variable): shape (n), mask scores in descending order\n        sum_masks (Variable): a float tensor of the sum of seg_masks\n    Returns:\n        Variable: cate_scores, tensors of shape (n)\n    \"\"\"\n\n    def __init__(self,\n                 update_threshold=0.05,\n                 pre_nms_top_n=500,\n                 post_nms_top_n=100,\n                 kernel='gaussian',\n                 sigma=2.0):\n        super(MaskMatrixNMS, self).__init__()\n        self.update_threshold = update_threshold\n        self.pre_nms_top_n = pre_nms_top_n\n        self.post_nms_top_n = post_nms_top_n\n        self.kernel = kernel\n        self.sigma = sigma\n\n    def _sort_score(self, scores, top_num):\n        if scores.shape[0] > top_num:\n            return paddle.topk(scores, top_num)[1]\n        else:\n            return paddle.argsort(scores, descending=True)\n\n    def __call__(self,\n                 seg_preds,\n                 seg_masks,\n                 cate_labels,\n                 cate_scores,\n                 sum_masks=None):\n        # sort and keep top nms_pre\n        sort_inds = 
self._sort_score(cate_scores, self.pre_nms_top_n)\n        seg_masks = paddle.gather(seg_masks, index=sort_inds)\n        seg_preds = paddle.gather(seg_preds, index=sort_inds)\n        sum_masks = paddle.gather(sum_masks, index=sort_inds)\n        cate_scores = paddle.gather(cate_scores, index=sort_inds)\n        cate_labels = paddle.gather(cate_labels, index=sort_inds)\n\n        seg_masks = paddle.flatten(seg_masks, start_axis=1, stop_axis=-1)\n        # inter.\n        inter_matrix = paddle.mm(seg_masks, paddle.transpose(seg_masks, [1, 0]))\n        n_samples = cate_labels.shape\n        n_samples = paddle.to_tensor(n_samples, dtype=\"int32\")\n        # union.\n        sum_masks_x = paddle.expand(sum_masks, shape=[n_samples, n_samples])\n        # iou.\n        iou_matrix = (inter_matrix / (\n            sum_masks_x + paddle.transpose(sum_masks_x, [1, 0]) - inter_matrix))\n        iou_matrix = paddle.triu(iou_matrix, diagonal=1)\n        # label_specific matrix.\n        cate_labels_x = paddle.expand(cate_labels, shape=[n_samples, n_samples])\n        label_matrix = paddle.cast(\n            (cate_labels_x == paddle.transpose(cate_labels_x, [1, 0])),\n            'float32')\n        label_matrix = paddle.triu(label_matrix, diagonal=1)\n\n        # IoU compensation\n        compensate_iou = paddle.max((iou_matrix * label_matrix), axis=0)\n        compensate_iou = paddle.expand(\n            compensate_iou, shape=[n_samples, n_samples])\n        compensate_iou = paddle.transpose(compensate_iou, [1, 0])\n\n        # IoU decay\n        decay_iou = iou_matrix * label_matrix\n\n        # matrix nms\n        if self.kernel == 'gaussian':\n            decay_matrix = paddle.exp(-1 * self.sigma * (decay_iou**2))\n            compensate_matrix = paddle.exp(-1 * self.sigma *\n                                           (compensate_iou**2))\n            decay_coefficient = paddle.min(decay_matrix / compensate_matrix,\n                                           axis=0)\n        elif self.kernel == 'linear':\n            decay_matrix = (1 - decay_iou) / (1 - compensate_iou)\n            decay_coefficient = paddle.min(decay_matrix, axis=0)\n        else:\n            raise NotImplementedError\n\n        # update the score.\n        cate_scores = cate_scores * decay_coefficient\n        y = paddle.zeros(shape=cate_scores.shape, dtype='float32')\n        keep = paddle.where(cate_scores >= self.update_threshold, cate_scores,\n                            y)\n        keep = paddle.nonzero(keep)\n        keep = paddle.squeeze(keep, axis=[1])\n        # Prevent empty and increase fake data\n        keep = paddle.concat(\n            [keep, paddle.cast(paddle.shape(cate_scores)[0:1] - 1, 'int64')])\n\n        seg_preds = paddle.gather(seg_preds, index=keep)\n        cate_scores = paddle.gather(cate_scores, index=keep)\n        cate_labels = paddle.gather(cate_labels, index=keep)\n\n        # sort and keep top_k\n        sort_inds = self._sort_score(cate_scores, self.post_nms_top_n)\n        seg_preds = paddle.gather(seg_preds, index=sort_inds)\n        cate_scores = paddle.gather(cate_scores, index=sort_inds)\n        cate_labels = paddle.gather(cate_labels, index=sort_inds)\n        return seg_preds, cate_scores, cate_labels\n\n\ndef Conv2d(in_channels,\n           out_channels,\n           kernel_size,\n           stride=1,\n           padding=0,\n           dilation=1,\n           groups=1,\n           bias=True,\n           weight_init=Normal(std=0.001),\n           bias_init=Constant(0.)):\n    
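# torch-style convenience wrapper: accepts a bool bias flag plus\n    # initializer objects and maps them onto Paddle ParamAttr before\n    # building nn.Conv2D\n    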
weight_attr = paddle.framework.ParamAttr(initializer=weight_init)\n    if bias:\n        bias_attr = paddle.framework.ParamAttr(initializer=bias_init)\n    else:\n        bias_attr = False\n    conv = nn.Conv2D(\n        in_channels,\n        out_channels,\n        kernel_size,\n        stride,\n        padding,\n        dilation,\n        groups,\n        weight_attr=weight_attr,\n        bias_attr=bias_attr)\n    return conv\n\n\ndef ConvTranspose2d(in_channels,\n                    out_channels,\n                    kernel_size,\n                    stride=1,\n                    padding=0,\n                    output_padding=0,\n                    groups=1,\n                    bias=True,\n                    dilation=1,\n                    weight_init=Normal(std=0.001),\n                    bias_init=Constant(0.)):\n    weight_attr = paddle.framework.ParamAttr(initializer=weight_init)\n    if bias:\n        bias_attr = paddle.framework.ParamAttr(initializer=bias_init)\n    else:\n        bias_attr = False\n    # pass groups/dilation by keyword so they cannot be swapped by the\n    # positional order of nn.Conv2DTranspose\n    conv = nn.Conv2DTranspose(\n        in_channels,\n        out_channels,\n        kernel_size,\n        stride,\n        padding,\n        output_padding,\n        groups=groups,\n        dilation=dilation,\n        weight_attr=weight_attr,\n        bias_attr=bias_attr)\n    return conv\n\n\ndef BatchNorm2d(num_features, eps=1e-05, momentum=0.9, affine=True):\n    if not affine:\n        weight_attr = False\n        bias_attr = False\n    else:\n        weight_attr = None\n        bias_attr = None\n    batchnorm = nn.BatchNorm2D(\n        num_features,\n        momentum,\n        eps,\n        weight_attr=weight_attr,\n        bias_attr=bias_attr)\n    return batchnorm\n\n\ndef ReLU():\n    return nn.ReLU()\n\n\ndef Upsample(scale_factor=None, mode='nearest', align_corners=False):\n    return nn.Upsample(None, scale_factor, mode, align_corners)\n\n\ndef MaxPool(kernel_size, stride, padding, ceil_mode=False):\n    return nn.MaxPool2D(kernel_size, stride, padding, ceil_mode=ceil_mode)\n\n\nclass Concat(nn.Layer):\n    def __init__(self, dim=0):\n        super(Concat, self).__init__()\n        self.dim = dim\n\n    def forward(self, inputs):\n        return paddle.concat(inputs, axis=self.dim)\n\n    def extra_repr(self):\n        return 'dim={}'.format(self.dim)\n\n\ndef _convert_attention_mask(attn_mask, dtype):\n    \"\"\"\n    Convert the attention mask to the target dtype we expect.\n    Parameters:\n        attn_mask (Tensor, optional): A tensor used in multi-head attention\n                to prevent attention to some unwanted positions, usually the\n                paddings or the subsequent positions. It is a tensor with shape\n                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.\n                When the data type is bool, the unwanted positions have `False`\n                values and the others have `True` values. When the data type is\n                int, the unwanted positions have 0 values and the others have 1\n                values. When the data type is float, the unwanted positions have\n                `-INF` values and the others have 0 values. It can be None when\n                no position needs to be masked. 
Default None.\n        dtype (VarType): The target type of `attn_mask` we expect.\n    Returns:\n        Tensor: A Tensor with the same shape as the input `attn_mask`, with data type `dtype`.\n    \"\"\"\n    return nn.layer.transformer._convert_attention_mask(attn_mask, dtype)\n\n\n@register\nclass MultiHeadAttention(nn.Layer):\n    \"\"\"\n    Attention maps queries and a set of key-value pairs to outputs, and\n    Multi-Head Attention runs multiple attention heads in parallel to jointly attend\n    to information from different representation subspaces.\n\n    Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_\n    for more details.\n\n    Parameters:\n        embed_dim (int): The expected feature size in the input and output.\n        num_heads (int): The number of heads in multi-head attention.\n        dropout (float, optional): The dropout probability used on attention\n            weights to drop some attention targets. 0 for no dropout. Default 0.\n        kdim (int, optional): The feature size in key. If None, assumed equal to\n            `embed_dim`. Default None.\n        vdim (int, optional): The feature size in value. If None, assumed equal to\n            `embed_dim`. Default None.\n        need_weights (bool, optional): Indicate whether to return the attention\n            weights. Default False.\n\n    Examples:\n\n        .. code-block:: python\n\n            import paddle\n\n            # encoder input: [batch_size, sequence_length, d_model]\n            query = paddle.rand((2, 4, 128))\n            # self attention mask: [batch_size, num_heads, query_len, query_len]\n            attn_mask = paddle.rand((2, 2, 4, 4))\n            multi_head_attn = paddle.nn.MultiHeadAttention(128, 2)\n            output = multi_head_attn(query, None, None, attn_mask=attn_mask)  # [2, 4, 128]\n    \"\"\"\n\n    def __init__(self,\n                 embed_dim,\n                 num_heads,\n                 dropout=0.,\n                 kdim=None,\n                 vdim=None,\n                 need_weights=False):\n        super(MultiHeadAttention, self).__init__()\n        self.embed_dim = embed_dim\n        self.kdim = kdim if kdim is not None else embed_dim\n        self.vdim = vdim if vdim is not None else embed_dim\n        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim\n\n        self.num_heads = num_heads\n        self.dropout = dropout\n        self.need_weights = need_weights\n\n        self.head_dim = embed_dim // num_heads\n        assert self.head_dim * num_heads == self.embed_dim, \"embed_dim must be divisible by num_heads\"\n\n        if self._qkv_same_embed_dim:\n            self.in_proj_weight = self.create_parameter(\n                shape=[embed_dim, 3 * embed_dim],\n                attr=None,\n                dtype=self._dtype,\n                is_bias=False)\n            self.in_proj_bias = self.create_parameter(\n                shape=[3 * embed_dim],\n                attr=None,\n                dtype=self._dtype,\n                is_bias=True)\n        else:\n            self.q_proj = nn.Linear(embed_dim, embed_dim)\n            self.k_proj = nn.Linear(self.kdim, embed_dim)\n            self.v_proj = nn.Linear(self.vdim, embed_dim)\n\n        self.out_proj = nn.Linear(embed_dim, embed_dim)\n        self._type_list = ('q_proj', 'k_proj', 'v_proj')\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        for p in self.parameters():\n            if p.dim() > 1:\n                
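# weight matrices (dim > 1) use Xavier-uniform init; 1-D params fall to constant 0\n                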
xavier_uniform_(p)\n            else:\n                constant_(p)\n\n    def compute_qkv(self, tensor, index):\n        if self._qkv_same_embed_dim:\n            tensor = F.linear(\n                x=tensor,\n                weight=self.in_proj_weight[:, index * self.embed_dim:(index + 1)\n                                           * self.embed_dim],\n                bias=self.in_proj_bias[index * self.embed_dim:(index + 1) *\n                                       self.embed_dim]\n                if self.in_proj_bias is not None else None)\n        else:\n            tensor = getattr(self, self._type_list[index])(tensor)\n        tensor = tensor.reshape(\n            [0, 0, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3])\n        return tensor\n\n    def forward(self, query, key=None, value=None, attn_mask=None):\n        r\"\"\"\n        Applies multi-head attention to map queries and a set of key-value pairs\n        to outputs.\n\n        Parameters:\n            query (Tensor): The queries for multi-head attention. It is a\n                tensor with shape `[batch_size, query_length, embed_dim]`. The\n                data type should be float32 or float64.\n            key (Tensor, optional): The keys for multi-head attention. It is\n                a tensor with shape `[batch_size, key_length, kdim]`. The\n                data type should be float32 or float64. If None, use `query` as\n                `key`. Default None.\n            value (Tensor, optional): The values for multi-head attention. It\n                is a tensor with shape `[batch_size, value_length, vdim]`.\n                The data type should be float32 or float64. If None, use `query` as\n                `value`. Default None.\n            attn_mask (Tensor, optional): A tensor used in multi-head attention\n                to prevent attention to some unwanted positions, usually the\n                paddings or the subsequent positions. It is a tensor with shape\n                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.\n                When the data type is bool, the unwanted positions have `False`\n                values and the others have `True` values. When the data type is\n                int, the unwanted positions have 0 values and the others have 1\n                values. When the data type is float, the unwanted positions have\n                `-INF` values and the others have 0 values. It can be None when\n                no positions need to be masked. Default None.\n\n        Returns:\n            Tensor|tuple: It is a tensor that has the same shape and data type \\\n                as `query`, representing attention output. Or a tuple if \\\n                `need_weights` is True: besides the attention output, the tuple \\\n                also includes the attention weights tensor shaped \\\n                `[batch_size, num_heads, query_length, key_length]`.\n        
\"\"\"\n        key = query if key is None else key\n        value = query if value is None else value\n        # compute q, k, v\n        q, k, v = (self.compute_qkv(t, i)\n                   for i, t in enumerate([query, key, value]))\n\n        # scaled dot-product attention\n        product = paddle.matmul(x=q, y=k, transpose_y=True)\n        scaling = float(self.head_dim)**-0.5\n        product = product * scaling\n\n        if attn_mask is not None:\n            # Support bool or int mask\n            attn_mask = _convert_attention_mask(attn_mask, product.dtype)\n            product = product + attn_mask\n        weights = F.softmax(product)\n        if self.dropout:\n            weights = F.dropout(\n                weights,\n                self.dropout,\n                training=self.training,\n                mode=\"upscale_in_train\")\n        out = paddle.matmul(weights, v)\n\n        # combine heads\n        out = paddle.transpose(out, perm=[0, 2, 1, 3])\n        out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])\n\n        # project to output\n        out = self.out_proj(out)\n\n        outs = [out]\n        if self.need_weights:\n            outs.append(weights)\n        return out if len(outs) == 1 else tuple(outs)\n\n\n@register\nclass ConvMixer(nn.Layer):\n    def __init__(\n            self,\n            dim,\n            depth,\n            kernel_size=3, ):\n        super().__init__()\n        self.dim = dim\n        self.depth = depth\n        self.kernel_size = kernel_size\n\n        self.mixer = self.conv_mixer(dim, depth, kernel_size)\n\n    def forward(self, x):\n        return self.mixer(x)\n\n    @staticmethod\n    def conv_mixer(\n            dim,\n            depth,\n            kernel_size, ):\n        Seq, ActBn = nn.Sequential, lambda x: Seq(x, nn.GELU(), nn.BatchNorm2D(dim))\n        Residual = type('Residual', (Seq, ),\n                        {'forward': lambda self, x: self[0](x) + x})\n        return Seq(*[\n            Seq(Residual(\n                ActBn(\n                    nn.Conv2D(\n                        dim, dim, kernel_size, groups=dim, padding=\"same\"))),\n                ActBn(nn.Conv2D(dim, dim, 1))) for _ in range(depth)\n        ])\n"
  },
  {
    "path": "ppdet/modeling/losses/__init__.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom . import yolo_loss\nfrom . import iou_aware_loss\nfrom . import iou_loss\nfrom . import ssd_loss\nfrom . import fcos_loss\nfrom . import solov2_loss\nfrom . import ctfocal_loss\nfrom . import keypoint_loss\nfrom . import jde_loss\nfrom . import fairmot_loss\nfrom . import gfocal_loss\nfrom . import detr_loss\nfrom . import sparsercnn_loss\nfrom . import focal_loss\nfrom . import smooth_l1_loss\nfrom . import probiou_loss\nfrom . import cot_loss\nfrom . import supcontrast\nfrom . import queryinst_loss\nfrom . import clrnet_loss\nfrom . import clrnet_line_iou_loss\n\nfrom .yolo_loss import *\nfrom .iou_aware_loss import *\nfrom .iou_loss import *\nfrom .ssd_loss import *\nfrom .fcos_loss import *\nfrom .solov2_loss import *\nfrom .ctfocal_loss import *\nfrom .keypoint_loss import *\nfrom .jde_loss import *\nfrom .fairmot_loss import *\nfrom .gfocal_loss import *\nfrom .detr_loss import *\nfrom .sparsercnn_loss import *\nfrom .focal_loss import *\nfrom .smooth_l1_loss import *\nfrom .pose3d_loss import *\nfrom .probiou_loss import *\nfrom .cot_loss import *\nfrom .supcontrast import *\nfrom .queryinst_loss import *\nfrom .clrnet_loss import *\nfrom .clrnet_line_iou_loss import *"
  },
  {
    "path": "ppdet/modeling/losses/clrnet_line_iou_loss.py",
    "content": "import paddle\n\n\ndef line_iou(pred, target, img_w, length=15, aligned=True):\n    '''\n    Calculate the line iou value between predictions and targets\n    Args:\n        pred: lane predictions, shape: (num_pred, 72)\n        target: ground truth, shape: (num_target, 72)\n        img_w: image width\n        length: extended radius\n        aligned: True for iou loss calculation, False for pair-wise ious in assign\n    '''\n    px1 = pred - length\n    px2 = pred + length\n    tx1 = target - length\n    tx2 = target + length\n\n    if aligned:\n        invalid_mask = target\n        ovr = paddle.minimum(px2, tx2) - paddle.maximum(px1, tx1)\n        union = paddle.maximum(px2, tx2) - paddle.minimum(px1, tx1)\n    else:\n        num_pred = pred.shape[0]\n        invalid_mask = target.tile([num_pred, 1, 1])\n\n        ovr = (paddle.minimum(px2[:, None, :], tx2[None, ...]) - paddle.maximum(\n            px1[:, None, :], tx1[None, ...]))\n        union = (paddle.maximum(px2[:, None, :], tx2[None, ...]) -\n                 paddle.minimum(px1[:, None, :], tx1[None, ...]))\n\n    invalid_masks = (invalid_mask < 0) | (invalid_mask >= img_w)\n\n    ovr[invalid_masks] = 0.\n    union[invalid_masks] = 0.\n    iou = ovr.sum(axis=-1) / (union.sum(axis=-1) + 1e-9)\n    return iou\n\n\ndef liou_loss(pred, target, img_w, length=15):\n    return (1 - line_iou(pred, target, img_w, length)).mean()\n"
  },
  {
    "path": "ppdet/modeling/losses/clrnet_loss.py",
    "content": "import paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling.clrnet_utils import accuracy\nfrom ppdet.modeling.assigners.clrnet_assigner import assign\nfrom ppdet.modeling.losses.clrnet_line_iou_loss import liou_loss\n\n__all__ = ['CLRNetLoss']\n\n\nclass SoftmaxFocalLoss(nn.Layer):\n    def __init__(self, gamma, ignore_lb=255, *args, **kwargs):\n        super(SoftmaxFocalLoss, self).__init__()\n        self.gamma = gamma\n        self.nll = nn.NLLLoss(ignore_index=ignore_lb)\n\n    def forward(self, logits, labels):\n        scores = F.softmax(logits, dim=1)\n        factor = paddle.pow(1. - scores, self.gamma)\n        log_score = F.log_softmax(logits, dim=1)\n        log_score = factor * log_score\n        loss = self.nll(log_score, labels)\n        return loss\n\n\ndef focal_loss(input: paddle.Tensor,\n               target: paddle.Tensor,\n               alpha: float,\n               gamma: float=2.0,\n               reduction: str='none',\n               eps: float=1e-8) -> paddle.Tensor:\n    r\"\"\"Function that computes Focal loss.\n\n    See :class:`~kornia.losses.FocalLoss` for details.\n    \"\"\"\n    if not paddle.is_tensor(input):\n        raise TypeError(\"Input type is not a torch.Tensor. Got {}\".format(\n            type(input)))\n\n    if not len(input.shape) >= 2:\n        raise ValueError(\"Invalid input shape, we expect BxCx*. Got: {}\".format(\n            input.shape))\n\n    if input.shape[0] != target.shape[0]:\n        raise ValueError(\n            'Expected input batch_size ({}) to match target batch_size ({}).'.\n            format(input.shape[0], target.shape[0]))\n\n    n = input.shape[0]\n    out_size = (n, ) + tuple(input.shape[2:])\n    if target.shape[1:] != input.shape[2:]:\n        raise ValueError('Expected target size {}, got {}'.format(out_size,\n                                                                  target.shape))\n    if (isinstance(input.place, paddle.CUDAPlace) and\n            isinstance(target.place, paddle.CPUPlace)) | (isinstance(\n                input.place, paddle.CPUPlace) and isinstance(target.place,\n                                                             paddle.CUDAPlace)):\n        raise ValueError(\n            \"input and target must be in the same device. Got: {} and {}\".\n            format(input.place, target.place))\n\n    # compute softmax over the classes axis\n    input_soft: paddle.Tensor = F.softmax(input, axis=1) + eps\n\n    # create the labels one hot tensor\n    target_one_hot: paddle.Tensor = paddle.to_tensor(\n        F.one_hot(\n            target, num_classes=input.shape[1]).cast(input.dtype),\n        place=input.place)\n\n    # compute the actual focal loss\n    weight = paddle.pow(-input_soft + 1., gamma)\n\n    focal = -alpha * weight * paddle.log(input_soft)\n    loss_tmp = paddle.sum(target_one_hot * focal, axis=1)\n\n    if reduction == 'none':\n        loss = loss_tmp\n    elif reduction == 'mean':\n        loss = paddle.mean(loss_tmp)\n    elif reduction == 'sum':\n        loss = paddle.sum(loss_tmp)\n    else:\n        raise NotImplementedError(\"Invalid reduction mode: {}\".format(\n            reduction))\n    return loss\n\n\nclass FocalLoss(nn.Layer):\n    r\"\"\"Criterion that computes Focal loss.\n\n    According to [1], the Focal loss is computed as follows:\n\n    .. 
math::\n\n        \\text{FL}(p_t) = -\\alpha_t (1 - p_t)^{\\gamma} \\, \\text{log}(p_t)\n\n    where:\n       - :math:`p_t` is the model's estimated probability for each class.\n\n\n    Arguments:\n        alpha (float): Weighting factor :math:`\\alpha \\in [0, 1]`.\n        gamma (float): Focusing parameter :math:`\\gamma \\geq 0`.\n        reduction (str, optional): Specifies the reduction to apply to the\n         output: 'none' | 'mean' | 'sum'. 'none': no reduction will be applied,\n         'mean': the sum of the output will be divided by the number of elements\n         in the output, 'sum': the output will be summed. Default: 'none'.\n\n    Shape:\n        - Input: :math:`(N, C, *)` where C = number of classes.\n        - Target: :math:`(N, *)` where each value is\n          :math:`0 \\leq targets[i] \\leq C-1`.\n\n    Examples:\n        >>> N = 5  # num_classes\n        >>> kwargs = {\"alpha\": 0.5, \"gamma\": 2.0, \"reduction\": 'mean'}\n        >>> loss = FocalLoss(**kwargs)\n        >>> input = paddle.randn([1, N, 3, 5])\n        >>> input.stop_gradient = False\n        >>> target = paddle.randint(0, N, [1, 3, 5])\n        >>> output = loss(input, target)\n        >>> output.backward()\n\n    References:\n        [1] https://arxiv.org/abs/1708.02002\n    \"\"\"\n\n    def __init__(self, alpha: float, gamma: float=2.0,\n                 reduction: str='none') -> None:\n        super(FocalLoss, self).__init__()\n        self.alpha: float = alpha\n        self.gamma: float = gamma\n        self.reduction: str = reduction\n        self.eps: float = 1e-6\n\n    def forward(  # type: ignore\n            self, input: paddle.Tensor, target: paddle.Tensor) -> paddle.Tensor:\n        return focal_loss(input, target, self.alpha, self.gamma, self.reduction,\n                          self.eps)\n\n\n@register\nclass CLRNetLoss(nn.Layer):\n    __shared__ = ['img_w', 'img_h', 'num_classes', 'num_points']\n\n    def __init__(self,\n                 cls_loss_weight=2.0,\n                 xyt_loss_weight=0.2,\n                 iou_loss_weight=2.0,\n                 seg_loss_weight=1.0,\n                 refine_layers=3,\n                 num_points=72,\n                 img_w=800,\n                 img_h=320,\n                 num_classes=5,\n                 ignore_label=255,\n                 bg_weight=0.4):\n        super(CLRNetLoss, self).__init__()\n        self.cls_loss_weight = cls_loss_weight\n        self.xyt_loss_weight = xyt_loss_weight\n        self.iou_loss_weight = iou_loss_weight\n        self.seg_loss_weight = seg_loss_weight\n        self.refine_layers = refine_layers\n        self.img_w = img_w\n        self.img_h = img_h\n        self.n_strips = num_points - 1\n        self.num_classes = num_classes\n        self.ignore_label = ignore_label\n        weights = paddle.ones(shape=[self.num_classes])\n        weights[0] = bg_weight\n        self.criterion = nn.NLLLoss(\n            ignore_index=self.ignore_label, weight=weights)\n\n    def forward(self, output, batch):\n        predictions_lists = output['predictions_lists']\n        targets = batch['lane_line'].clone()\n        cls_criterion = FocalLoss(alpha=0.25, gamma=2.0)\n        cls_loss = paddle.to_tensor(0.0)\n        reg_xytl_loss = paddle.to_tensor(0.0)\n        iou_loss = paddle.to_tensor(0.0)\n        cls_acc = []\n        cls_acc_stage = []\n        for stage in range(self.refine_layers):\n            predictions_list = predictions_lists[stage]\n            for predictions, target in zip(predictions_list, targets):\n    
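            # keep only ground-truth lanes whose existence flag (column 1) is set\n    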
            target = target[target[:, 1] == 1]\n\n                if len(target) == 0:\n                    # If there are no targets, all predictions have to be negatives (i.e., 0 confidence)\n                    cls_target = paddle.zeros(\n                        [predictions.shape[0]], dtype='int64')\n                    cls_pred = predictions[:, :2]\n                    cls_loss = cls_loss + cls_criterion(cls_pred,\n                                                        cls_target).sum()\n                    continue\n\n                with paddle.no_grad():\n                    matched_row_inds, matched_col_inds = assign(\n                        predictions, target, self.img_w, self.img_h)\n\n                # classification targets\n                cls_target = paddle.zeros([predictions.shape[0]], dtype='int64')\n                cls_target[matched_row_inds] = 1\n                cls_pred = predictions[:, :2]\n\n                # regression targets -> [start_y, start_x, theta] (all transformed to absolute values), only on matched pairs\n                reg_yxtl = predictions.index_select(matched_row_inds)[..., 2:6]\n\n                reg_yxtl[:, 0] *= self.n_strips\n                reg_yxtl[:, 1] *= (self.img_w - 1)\n                reg_yxtl[:, 2] *= 180\n                reg_yxtl[:, 3] *= self.n_strips\n\n                target_yxtl = target.index_select(matched_col_inds)[..., 2:\n                                                                    6].clone()\n\n                # regression targets -> S coordinates (all transformed to absolute values)\n                reg_pred = predictions.index_select(matched_row_inds)[..., 6:]\n                reg_pred *= (self.img_w - 1)\n                reg_targets = target.index_select(matched_col_inds)[...,\n                                                                    6:].clone()\n\n                with paddle.no_grad():\n                    predictions_starts = paddle.clip(\n                        (predictions.index_select(matched_row_inds)[..., 2] *\n                         self.n_strips).round().cast(\"int64\"),\n                        min=0,\n                        max=self.\n                        n_strips)  # ensure the predictions starts is valid\n\n                    target_starts = (\n                        target.index_select(matched_col_inds)[..., 2] *\n                        self.n_strips).round().cast(\"int64\")\n                    target_yxtl[:, -1] -= (\n                        predictions_starts - target_starts)  # reg length\n\n                # Loss calculation\n                cls_loss = cls_loss + cls_criterion(\n                    cls_pred, cls_target).sum() / target.shape[0]\n\n                target_yxtl[:, 0] *= self.n_strips\n                target_yxtl[:, 2] *= 180\n\n                reg_xytl_loss = reg_xytl_loss + F.smooth_l1_loss(\n                    input=reg_yxtl, label=target_yxtl, reduction='none').mean()\n\n                iou_loss = iou_loss + liou_loss(\n                    reg_pred, reg_targets, self.img_w, length=15)\n\n                cls_accuracy = accuracy(cls_pred, cls_target)\n                cls_acc_stage.append(cls_accuracy)\n\n            cls_acc.append(sum(cls_acc_stage) / (len(cls_acc_stage) + 1e-5))\n\n        # extra segmentation loss\n        seg_loss = self.criterion(\n            F.log_softmax(\n                output['seg'], axis=1), batch['seg'].cast('int64'))\n\n        cls_loss /= (len(targets) * self.refine_layers)\n        reg_xytl_loss /= (len(targets) * 
self.refine_layers)\n        iou_loss /= (len(targets) * self.refine_layers)\n\n        loss = cls_loss * self.cls_loss_weight \\\n            + reg_xytl_loss * self.xyt_loss_weight \\\n            + seg_loss * self.seg_loss_weight \\\n            + iou_loss * self.iou_loss_weight\n\n        return_value = {\n            'loss': loss,\n            'cls_loss': cls_loss * self.cls_loss_weight,\n            'reg_xytl_loss': reg_xytl_loss * self.xyt_loss_weight,\n            'seg_loss': seg_loss * self.seg_loss_weight,\n            'iou_loss': iou_loss * self.iou_loss_weight\n        }\n\n        for i in range(self.refine_layers):\n            if not isinstance(cls_acc[i], paddle.Tensor):\n                cls_acc[i] = paddle.to_tensor(cls_acc[i])\n            return_value['stage_{}_acc'.format(i)] = cls_acc[i]\n\n        return return_value\n"
  },
  {
    "path": "ppdet/modeling/losses/cot_loss.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport numpy as np\nfrom ppdet.core.workspace import register\n\n__all__ = ['COTLoss']\n\n@register\nclass COTLoss(nn.Layer):\n    __shared__ = ['num_classes']\n    def __init__(self,\n                 num_classes=80, \n                 cot_scale=1,\n                 cot_lambda=1):\n        super(COTLoss, self).__init__()\n        self.cot_scale = cot_scale\n        self.cot_lambda = cot_lambda    \n        self.num_classes = num_classes    \n        \n    def forward(self, scores, targets, cot_relation):    \n        cls_name = 'loss_bbox_cls_cot'\n        loss_bbox = {}\n\n        tgt_labels, tgt_bboxes, tgt_gt_inds = targets\n        tgt_labels = paddle.concat(tgt_labels) if len(\n            tgt_labels) > 1 else tgt_labels[0]\n        mask = (tgt_labels < self.num_classes)\n        valid_inds = paddle.nonzero(tgt_labels >= 0).flatten()\n        if valid_inds.shape[0] == 0:\n            loss_bbox[cls_name] = paddle.zeros([1], dtype='float32')\n        else:\n            tgt_labels = tgt_labels.cast('int64')\n            valid_cot_targets = []\n            for i in range(tgt_labels.shape[0]):\n                train_label = tgt_labels[i]\n                if train_label < self.num_classes:\n                    valid_cot_targets.append(cot_relation[train_label])\n            coco_targets = paddle.to_tensor(valid_cot_targets)\n            coco_targets.stop_gradient = True\n            coco_loss = - coco_targets * F.log_softmax(scores[mask][:, :-1] * self.cot_scale)\n            loss_bbox[cls_name] = self.cot_lambda * paddle.mean(paddle.sum(coco_loss, axis=-1))\n        return loss_bbox\n"
  },
  {
    "path": "ppdet/modeling/losses/ctfocal_loss.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\n\nfrom ppdet.core.workspace import register, serializable\n\n__all__ = ['CTFocalLoss']\n\n\n@register\n@serializable\nclass CTFocalLoss(object):\n    \"\"\"\n    CTFocalLoss: CornerNet & CenterNet Focal Loss\n    Args:\n        loss_weight (float): loss weight\n        gamma (float): gamma parameter for Focal Loss\n    \"\"\"\n\n    def __init__(self, loss_weight=1., gamma=2.0):\n        self.loss_weight = loss_weight\n        self.gamma = gamma\n\n    def __call__(self, pred, target):\n        \"\"\"\n        Calculate the loss\n        Args:\n            pred (Tensor): heatmap prediction\n            target (Tensor): target for positive samples\n        Return:\n            ct_focal_loss (Tensor): Focal Loss used in CornerNet & CenterNet.\n                Note that the values in target are in [0, 1] since gaussian is\n                used to reduce the punishment and we treat [0, 1) as neg example.\n        \"\"\"\n        fg_map = paddle.cast(target == 1, 'float32')\n        fg_map.stop_gradient = True\n        bg_map = paddle.cast(target < 1, 'float32')\n        bg_map.stop_gradient = True\n\n        neg_weights = paddle.pow(1 - target, 4)\n        pos_loss = 0 - paddle.log(pred) * paddle.pow(1 - pred,\n                                                     self.gamma) * fg_map\n\n        neg_loss = 0 - paddle.log(1 - pred) * paddle.pow(\n            pred, self.gamma) * neg_weights * bg_map\n        pos_loss = paddle.sum(pos_loss)\n        neg_loss = paddle.sum(neg_loss)\n\n        fg_num = paddle.sum(fg_map)\n        ct_focal_loss = (pos_loss + neg_loss) / (\n            fg_num + paddle.cast(fg_num == 0, 'float32'))\n        return ct_focal_loss * self.loss_weight\n"
  },
  {
    "path": "ppdet/modeling/losses/detr_loss.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register\nfrom .iou_loss import GIoULoss\nfrom ..transformers import bbox_cxcywh_to_xyxy, sigmoid_focal_loss, varifocal_loss_with_logits\nfrom ..bbox_utils import bbox_iou\n\n__all__ = ['DETRLoss', 'DINOLoss', 'DINOv3Loss']\n\n\n@register\nclass DETRLoss(nn.Layer):\n    __shared__ = ['num_classes', 'use_focal_loss']\n    __inject__ = ['matcher']\n\n    def __init__(self,\n                 num_classes=80,\n                 matcher='HungarianMatcher',\n                 loss_coeff={\n                     'class': 1,\n                     'bbox': 5,\n                     'giou': 2,\n                     'no_object': 0.1,\n                     'mask': 1,\n                     'dice': 1\n                 },\n                 aux_loss=True,\n                 use_focal_loss=False,\n                 use_vfl=False,\n                 vfl_iou_type='bbox',\n                 use_uni_match=False,\n                 uni_match_ind=0):\n        r\"\"\"\n        Args:\n            num_classes (int): The number of classes.\n            matcher (HungarianMatcher): It computes an assignment between the targets\n                and the predictions of the network.\n            loss_coeff (dict): The coefficient of loss.\n            aux_loss (bool): If 'aux_loss = True', loss at each decoder layer are to be used.\n            use_focal_loss (bool): Use focal loss or not.\n        \"\"\"\n        super(DETRLoss, self).__init__()\n\n        self.num_classes = num_classes\n        self.matcher = matcher\n        self.loss_coeff = loss_coeff\n        self.aux_loss = aux_loss\n        self.use_focal_loss = use_focal_loss\n        self.use_vfl = use_vfl\n        self.vfl_iou_type = vfl_iou_type\n        self.use_uni_match = use_uni_match\n        self.uni_match_ind = uni_match_ind\n\n        if not self.use_focal_loss:\n            self.loss_coeff['class'] = paddle.full([num_classes + 1],\n                                                   loss_coeff['class'])\n            self.loss_coeff['class'][-1] = loss_coeff['no_object']\n        self.giou_loss = GIoULoss()\n\n    def _get_loss_class(self,\n                        logits,\n                        gt_class,\n                        match_indices,\n                        bg_index,\n                        num_gts,\n                        postfix=\"\",\n                        iou_score=None,\n                        gt_score=None):\n        # logits: [b, query, num_classes], gt_class: list[[n, 1]]\n        name_class = \"loss_class\" + postfix\n\n        target_label = paddle.full(logits.shape[:2], bg_index, dtype='int64')\n        bs, num_query_objects = target_label.shape\n        num_gt = sum(len(a) for a in 
gt_class)\n        if num_gt > 0:\n            index, updates = self._get_index_updates(num_query_objects,\n                                                     gt_class, match_indices)\n            target_label = paddle.scatter(\n                target_label.reshape([-1, 1]), index, updates.astype('int64'))\n            target_label = target_label.reshape([bs, num_query_objects])\n        if self.use_focal_loss:\n            target_label = F.one_hot(target_label,\n                                     self.num_classes + 1)[..., :-1]\n            if iou_score is not None and self.use_vfl:\n                if gt_score is not None:\n                    target_score = paddle.zeros([bs, num_query_objects])\n                    target_score = paddle.scatter(\n                        target_score.reshape([-1, 1]), index, gt_score)\n                    target_score = target_score.reshape(\n                        [bs, num_query_objects, 1]) * target_label\n\n                    target_score_iou = paddle.zeros([bs, num_query_objects])\n                    target_score_iou = paddle.scatter(\n                        target_score_iou.reshape([-1, 1]), index, iou_score)\n                    target_score_iou = target_score_iou.reshape(\n                        [bs, num_query_objects, 1]) * target_label\n                    target_score = paddle.multiply(target_score,\n                                                   target_score_iou)\n                    loss_ = self.loss_coeff[\n                        'class'] * varifocal_loss_with_logits(\n                            logits, target_score, target_label,\n                            num_gts / num_query_objects)\n                else:\n                    target_score = paddle.zeros([bs, num_query_objects])\n                    if num_gt > 0:\n                        target_score = paddle.scatter(\n                            target_score.reshape([-1, 1]), index, iou_score)\n                    target_score = target_score.reshape(\n                        [bs, num_query_objects, 1]) * target_label\n                    loss_ = self.loss_coeff[\n                        'class'] * varifocal_loss_with_logits(\n                            logits, target_score, target_label,\n                            num_gts / num_query_objects)\n            else:\n                loss_ = self.loss_coeff['class'] * sigmoid_focal_loss(\n                    logits, target_label, num_gts / num_query_objects)\n        else:\n            loss_ = F.cross_entropy(\n                logits, target_label, weight=self.loss_coeff['class'])\n        return {name_class: loss_}\n\n    def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts,\n                       postfix=\"\"):\n        # boxes: [b, query, 4], gt_bbox: list[[n, 4]]\n        name_bbox = \"loss_bbox\" + postfix\n        name_giou = \"loss_giou\" + postfix\n\n        loss = dict()\n        if sum(len(a) for a in gt_bbox) == 0:\n            loss[name_bbox] = paddle.to_tensor([0.])\n            loss[name_giou] = paddle.to_tensor([0.])\n            return loss\n\n        src_bbox, target_bbox = self._get_src_target_assign(boxes, gt_bbox,\n                                                            match_indices)\n        loss[name_bbox] = self.loss_coeff['bbox'] * F.l1_loss(\n            src_bbox, target_bbox, reduction='sum') / num_gts\n        loss[name_giou] = self.giou_loss(\n            bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox))\n        loss[name_giou] = loss[name_giou].sum() / num_gts\n  
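      # scale the GT-normalized GIoU term by its loss coefficient\n  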
      loss[name_giou] = self.loss_coeff['giou'] * loss[name_giou]\n        return loss\n\n    def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts,\n                       postfix=\"\"):\n        # masks: [b, query, h, w], gt_mask: list[[n, H, W]]\n        name_mask = \"loss_mask\" + postfix\n        name_dice = \"loss_dice\" + postfix\n\n        loss = dict()\n        if sum(len(a) for a in gt_mask) == 0:\n            loss[name_mask] = paddle.to_tensor([0.])\n            loss[name_dice] = paddle.to_tensor([0.])\n            return loss\n\n        src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,\n                                                              match_indices)\n        src_masks = F.interpolate(\n            src_masks.unsqueeze(0),\n            size=target_masks.shape[-2:],\n            mode=\"bilinear\")[0]\n        loss[name_mask] = self.loss_coeff['mask'] * F.sigmoid_focal_loss(\n            src_masks,\n            target_masks,\n            paddle.to_tensor(\n                [num_gts], dtype='float32'))\n        loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss(\n            src_masks, target_masks, num_gts)\n        return loss\n\n    def _dice_loss(self, inputs, targets, num_gts):\n        inputs = F.sigmoid(inputs)\n        inputs = inputs.flatten(1)\n        targets = targets.flatten(1)\n        numerator = 2 * (inputs * targets).sum(1)\n        denominator = inputs.sum(-1) + targets.sum(-1)\n        loss = 1 - (numerator + 1) / (denominator + 1)\n        return loss.sum() / num_gts\n\n    def _get_loss_aux(self,\n                      boxes,\n                      logits,\n                      gt_bbox,\n                      gt_class,\n                      bg_index,\n                      num_gts,\n                      dn_match_indices=None,\n                      postfix=\"\",\n                      masks=None,\n                      gt_mask=None,\n                      gt_score=None):\n        loss_class = []\n        loss_bbox, loss_giou = [], []\n        loss_mask, loss_dice = [], []\n        if dn_match_indices is not None:\n            match_indices = dn_match_indices\n        elif self.use_uni_match:\n            match_indices = self.matcher(\n                boxes[self.uni_match_ind],\n                logits[self.uni_match_ind],\n                gt_bbox,\n                gt_class,\n                masks=masks[self.uni_match_ind] if masks is not None else None,\n                gt_mask=gt_mask)\n        for i, (aux_boxes, aux_logits) in enumerate(zip(boxes, logits)):\n            aux_masks = masks[i] if masks is not None else None\n            if not self.use_uni_match and dn_match_indices is None:\n                match_indices = self.matcher(\n                    aux_boxes,\n                    aux_logits,\n                    gt_bbox,\n                    gt_class,\n                    masks=aux_masks,\n                    gt_mask=gt_mask)\n            if self.use_vfl:\n                if sum(len(a) for a in gt_bbox) > 0:\n                    src_bbox, target_bbox = self._get_src_target_assign(\n                        aux_boxes.detach(), gt_bbox, match_indices)\n                    iou_score = bbox_iou(\n                        bbox_cxcywh_to_xyxy(src_bbox).split(4, -1),\n                        bbox_cxcywh_to_xyxy(target_bbox).split(4, -1))\n                else:\n                    iou_score = None\n                if gt_score is not None:\n                    _, target_score = self._get_src_target_assign(\n    
                    logits[-1].detach(), gt_score, match_indices)\n            else:\n                iou_score = None\n            loss_class.append(\n                self._get_loss_class(\n                    aux_logits,\n                    gt_class,\n                    match_indices,\n                    bg_index,\n                    num_gts,\n                    postfix,\n                    iou_score,\n                    gt_score=target_score\n                    if gt_score is not None else None)['loss_class' + postfix])\n            loss_ = self._get_loss_bbox(aux_boxes, gt_bbox, match_indices,\n                                        num_gts, postfix)\n            loss_bbox.append(loss_['loss_bbox' + postfix])\n            loss_giou.append(loss_['loss_giou' + postfix])\n            if masks is not None and gt_mask is not None:\n                loss_ = self._get_loss_mask(aux_masks, gt_mask, match_indices,\n                                            num_gts, postfix)\n                loss_mask.append(loss_['loss_mask' + postfix])\n                loss_dice.append(loss_['loss_dice' + postfix])\n        loss = {\n            \"loss_class_aux\" + postfix: paddle.add_n(loss_class),\n            \"loss_bbox_aux\" + postfix: paddle.add_n(loss_bbox),\n            \"loss_giou_aux\" + postfix: paddle.add_n(loss_giou)\n        }\n        if masks is not None and gt_mask is not None:\n            loss[\"loss_mask_aux\" + postfix] = paddle.add_n(loss_mask)\n            loss[\"loss_dice_aux\" + postfix] = paddle.add_n(loss_dice)\n        return loss\n\n    def _get_index_updates(self, num_query_objects, target, match_indices):\n        batch_idx = paddle.concat([\n            paddle.full_like(src, i) for i, (src, _) in enumerate(match_indices)\n        ])\n        src_idx = paddle.concat([src for (src, _) in match_indices])\n        src_idx += (batch_idx * num_query_objects)\n        target_assign = paddle.concat([\n            paddle.gather(\n                t, dst, axis=0) for t, (_, dst) in zip(target, match_indices)\n        ])\n        return src_idx, target_assign\n\n    def _get_src_target_assign(self, src, target, match_indices):\n        src_assign = paddle.concat([\n            paddle.gather(\n                t, I, axis=0) if len(I) > 0 else paddle.zeros([0, t.shape[-1]])\n            for t, (I, _) in zip(src, match_indices)\n        ])\n        target_assign = paddle.concat([\n            paddle.gather(\n                t, J, axis=0) if len(J) > 0 else paddle.zeros([0, t.shape[-1]])\n            for t, (_, J) in zip(target, match_indices)\n        ])\n        return src_assign, target_assign\n\n    def _get_num_gts(self, targets, dtype=\"float32\"):\n        num_gts = sum(len(a) for a in targets)\n        num_gts = paddle.to_tensor([num_gts], dtype=dtype)\n        if paddle.distributed.get_world_size() > 1:\n            paddle.distributed.all_reduce(num_gts)\n            num_gts /= paddle.distributed.get_world_size()\n        num_gts = paddle.clip(num_gts, min=1.)\n        return num_gts\n\n    def _get_prediction_loss(self,\n                             boxes,\n                             logits,\n                             gt_bbox,\n                             gt_class,\n                             masks=None,\n                             gt_mask=None,\n                             postfix=\"\",\n                             dn_match_indices=None,\n                             num_gts=1,\n                             gt_score=None):\n        if dn_match_indices is None:\n 
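           # no denoising indices provided: run the Hungarian matcher\n 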
           match_indices = self.matcher(\n                boxes, logits, gt_bbox, gt_class, masks=masks, gt_mask=gt_mask)\n        else:\n            match_indices = dn_match_indices\n\n        if self.use_vfl:\n            if gt_score is not None:  # ssod\n                _, target_score = self._get_src_target_assign(\n                    logits[-1].detach(), gt_score, match_indices)\n            elif sum(len(a) for a in gt_bbox) > 0:\n                if self.vfl_iou_type == 'bbox':\n                    src_bbox, target_bbox = self._get_src_target_assign(\n                        boxes.detach(), gt_bbox, match_indices)\n                    iou_score = bbox_iou(\n                        bbox_cxcywh_to_xyxy(src_bbox).split(4, -1),\n                        bbox_cxcywh_to_xyxy(target_bbox).split(4, -1))\n                elif self.vfl_iou_type == 'mask':\n                    assert masks is not None and gt_mask is not None, \\\n                        'Make sure the input has `mask` and `gt_mask`'\n                    assert sum(len(a) for a in gt_mask) > 0\n                    src_mask, target_mask = self._get_src_target_assign(\n                        masks.detach(), gt_mask, match_indices)\n                    src_mask = F.interpolate(\n                        src_mask.unsqueeze(0),\n                        scale_factor=2,\n                        mode='bilinear',\n                        align_corners=False).squeeze(0)\n                    target_mask = F.interpolate(\n                        target_mask.unsqueeze(0),\n                        size=src_mask.shape[-2:],\n                        mode='bilinear',\n                        align_corners=False).squeeze(0)\n                    src_mask = src_mask.flatten(1)\n                    src_mask = F.sigmoid(src_mask)\n                    src_mask = paddle.where(\n                        src_mask > 0.5, 1., 0.).astype(masks.dtype)\n                    target_mask = target_mask.flatten(1)\n                    target_mask = paddle.where(\n                        target_mask > 0.5, 1., 0.).astype(masks.dtype)\n                    inter = (src_mask * target_mask).sum(1)\n                    union = src_mask.sum(1) + target_mask.sum(1) - inter\n                    iou_score = (inter + 1e-2) / (union + 1e-2)\n                    iou_score = iou_score.unsqueeze(-1)\n                else:\n                    iou_score = None\n            else:\n                iou_score = None\n        else:\n            iou_score = None\n\n        loss = dict()\n        loss.update(\n            self._get_loss_class(\n                logits,\n                gt_class,\n                match_indices,\n                self.num_classes,\n                num_gts,\n                postfix,\n                iou_score,\n                gt_score=target_score if gt_score is not None else None))\n        loss.update(\n            self._get_loss_bbox(boxes, gt_bbox, match_indices, num_gts,\n                                postfix))\n        if masks is not None and gt_mask is not None:\n            loss.update(\n                self._get_loss_mask(masks, gt_mask, match_indices, num_gts,\n                                    postfix))\n        return loss\n\n    def forward(self,\n                boxes,\n                logits,\n                gt_bbox,\n                gt_class,\n                masks=None,\n                gt_mask=None,\n                postfix=\"\",\n                gt_score=None,\n                o2m=1,\n                **kwargs):\n        r\"\"\"\n    
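    Compute DETR losses from the stacked decoder predictions.\n\n    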
    Args:\n            boxes (Tensor): [l, b, query, 4]\n            logits (Tensor): [l, b, query, num_classes]\n            gt_bbox (List(Tensor)): list[[n, 4]]\n            gt_class (List(Tensor)): list[[n, 1]]\n            masks (Tensor, optional): [l, b, query, h, w]\n            gt_mask (List(Tensor), optional): list[[n, H, W]]\n            postfix (str): postfix of loss name\n        \"\"\"\n\n        dn_match_indices = kwargs.get(\"dn_match_indices\", None)\n        num_gts = kwargs.get(\"num_gts\", None)\n        if num_gts is None:\n            num_gts = self._get_num_gts(gt_class)\n\n        total_loss = self._get_prediction_loss(\n            boxes[-1],\n            logits[-1],\n            gt_bbox,\n            gt_class,\n            masks=masks[-1] if masks is not None else None,\n            gt_mask=gt_mask,\n            postfix=postfix,\n            dn_match_indices=dn_match_indices,\n            num_gts=num_gts,\n            gt_score=gt_score if gt_score is not None else None)\n\n        if self.aux_loss:\n            total_loss.update(\n                self._get_loss_aux(\n                    boxes[:-1],\n                    logits[:-1],\n                    gt_bbox,\n                    gt_class,\n                    self.num_classes,\n                    num_gts,\n                    dn_match_indices,\n                    postfix,\n                    masks=masks[:-1] if masks is not None else None,\n                    gt_mask=gt_mask,\n                    gt_score=gt_score if gt_score is not None else None))\n\n        return total_loss\n\n\n@register\nclass DINOLoss(DETRLoss):\n    def forward(self,\n                boxes,\n                logits,\n                gt_bbox,\n                gt_class,\n                masks=None,\n                gt_mask=None,\n                postfix=\"\",\n                dn_out_bboxes=None,\n                dn_out_logits=None,\n                dn_meta=None,\n                gt_score=None,\n                **kwargs):\n        num_gts = self._get_num_gts(gt_class)\n        total_loss = super(DINOLoss, self).forward(\n            boxes,\n            logits,\n            gt_bbox,\n            gt_class,\n            num_gts=num_gts,\n            gt_score=gt_score)\n\n        if dn_meta is not None:\n            dn_positive_idx, dn_num_group = \\\n                dn_meta[\"dn_positive_idx\"], dn_meta[\"dn_num_group\"]\n            assert len(gt_class) == len(dn_positive_idx)\n\n            # denoising match indices\n            dn_match_indices = self.get_dn_match_indices(\n                gt_class, dn_positive_idx, dn_num_group)\n\n            # compute denoising training loss\n            num_gts *= dn_num_group\n            dn_loss = super(DINOLoss, self).forward(\n                dn_out_bboxes,\n                dn_out_logits,\n                gt_bbox,\n                gt_class,\n                postfix=\"_dn\",\n                dn_match_indices=dn_match_indices,\n                num_gts=num_gts,\n                gt_score=gt_score)\n            total_loss.update(dn_loss)\n        else:\n            total_loss.update(\n                {k + '_dn': paddle.to_tensor([0.])\n                 for k in total_loss.keys()})\n\n        return total_loss\n\n    @staticmethod\n    def get_dn_match_indices(labels, dn_positive_idx, dn_num_group):\n        dn_match_indices = []\n        for i in range(len(labels)):\n            num_gt = len(labels[i])\n            if num_gt > 0:\n                gt_idx = paddle.arange(end=num_gt, dtype=\"int64\")\n    
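            # repeat the GT indices once per denoising group\n    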
            gt_idx = gt_idx.tile([dn_num_group])\n                assert len(dn_positive_idx[i]) == len(gt_idx)\n                dn_match_indices.append((dn_positive_idx[i], gt_idx))\n            else:\n                dn_match_indices.append((paddle.zeros(\n                    [0], dtype=\"int64\"), paddle.zeros(\n                        [0], dtype=\"int64\")))\n        return dn_match_indices\n\n@register\nclass DINOv3Loss(DETRLoss):\n    def forward(self,\n                boxes,\n                logits,\n                gt_bbox,\n                gt_class,\n                masks=None,\n                gt_mask=None,\n                postfix=\"\",\n                dn_out_bboxes=None,\n                dn_out_logits=None,\n                dn_meta=None,\n                gt_score=None,\n                o2m=1,\n                **kwargs):\n        \n        if o2m != 1:\n            gt_boxes_copy = [box.tile([o2m, 1]) for box in gt_bbox]\n            gt_class_copy = [label.tile([o2m, 1]) for label in gt_class]\n        else:\n            gt_boxes_copy = gt_bbox\n            gt_class_copy = gt_class\n        num_gts_copy = self._get_num_gts(gt_class_copy)\n        total_loss = self._get_prediction_loss(\n            boxes[-1],\n            logits[-1],\n            gt_boxes_copy,\n            gt_class_copy,\n            masks=masks[-1] if masks is not None else None,\n            gt_mask=gt_mask,\n            postfix=postfix,\n            dn_match_indices=None,\n            num_gts=num_gts_copy,\n            gt_score=gt_score if gt_score is not None else None)\n\n        if self.aux_loss:\n            total_loss.update(\n                self._get_loss_aux(\n                    boxes[:-1],\n                    logits[:-1],\n                    gt_boxes_copy,\n                    gt_class_copy,\n                    self.num_classes,\n                    num_gts_copy,\n                    dn_match_indices=None,\n                    postfix=postfix,\n                    masks=masks[:-1] if masks is not None else None,\n                    gt_mask=gt_mask,\n                    gt_score=gt_score if gt_score is not None else None))\n\n        if dn_meta is not None:\n            num_gts = self._get_num_gts(gt_class)\n            dn_positive_idx, dn_num_group = \\\n                dn_meta[\"dn_positive_idx\"], dn_meta[\"dn_num_group\"]\n            assert len(gt_class) == len(dn_positive_idx)\n\n            # denoising match indices\n            dn_match_indices = self.get_dn_match_indices(\n                gt_class, dn_positive_idx, dn_num_group)\n\n            # compute denoising training loss\n            num_gts *= dn_num_group\n            dn_loss = super(DINOv3Loss, self).forward(\n                dn_out_bboxes,\n                dn_out_logits,\n                gt_bbox,\n                gt_class,\n                postfix=\"_dn\",\n                dn_match_indices=dn_match_indices,\n                num_gts=num_gts,\n                gt_score=gt_score)\n            total_loss.update(dn_loss)\n        else:\n            total_loss.update(\n                {k + '_dn': paddle.to_tensor([0.])\n                 for k in total_loss.keys()})\n\n        return total_loss\n\n    @staticmethod\n    def get_dn_match_indices(labels, dn_positive_idx, dn_num_group):\n        dn_match_indices = []\n        for i in range(len(labels)):\n            num_gt = len(labels[i])\n            if num_gt > 0:\n                gt_idx = paddle.arange(end=num_gt, dtype=\"int64\")\n                gt_idx = 
gt_idx.tile([dn_num_group])\n                assert len(dn_positive_idx[i]) == len(gt_idx)\n                dn_match_indices.append((dn_positive_idx[i], gt_idx))\n            else:\n                dn_match_indices.append((paddle.zeros(\n                    [0], dtype=\"int64\"), paddle.zeros(\n                        [0], dtype=\"int64\")))\n        return dn_match_indices\n\n@register\nclass MaskDINOLoss(DETRLoss):\n    __shared__ = ['num_classes', 'use_focal_loss', 'num_sample_points']\n    __inject__ = ['matcher']\n\n    def __init__(self,\n                 num_classes=80,\n                 matcher='HungarianMatcher',\n                 loss_coeff={\n                     'class': 4,\n                     'bbox': 5,\n                     'giou': 2,\n                     'mask': 5,\n                     'dice': 5\n                 },\n                 aux_loss=True,\n                 use_focal_loss=False,\n                 use_vfl=False,\n                 vfl_iou_type='bbox',\n                 num_sample_points=12544,\n                 oversample_ratio=3.0,\n                 important_sample_ratio=0.75):\n        super(MaskDINOLoss, self).__init__(num_classes, matcher, loss_coeff,\n                                           aux_loss, use_focal_loss, use_vfl, vfl_iou_type)\n        assert oversample_ratio >= 1\n        assert important_sample_ratio <= 1 and important_sample_ratio >= 0\n\n        self.num_sample_points = num_sample_points\n        self.oversample_ratio = oversample_ratio\n        self.important_sample_ratio = important_sample_ratio\n        self.num_oversample_points = int(num_sample_points * oversample_ratio)\n        self.num_important_points = int(num_sample_points *\n                                        important_sample_ratio)\n        self.num_random_points = num_sample_points - self.num_important_points\n\n    def forward(self,\n                boxes,\n                logits,\n                gt_bbox,\n                gt_class,\n                masks=None,\n                gt_mask=None,\n                postfix=\"\",\n                dn_out_bboxes=None,\n                dn_out_logits=None,\n                dn_out_masks=None,\n                dn_meta=None,\n                **kwargs):\n        num_gts = self._get_num_gts(gt_class)\n        total_loss = super(MaskDINOLoss, self).forward(\n            boxes,\n            logits,\n            gt_bbox,\n            gt_class,\n            masks=masks,\n            gt_mask=gt_mask,\n            num_gts=num_gts)\n\n        if dn_meta is not None:\n            dn_positive_idx, dn_num_group = \\\n                dn_meta[\"dn_positive_idx\"], dn_meta[\"dn_num_group\"]\n            assert len(gt_class) == len(dn_positive_idx)\n\n            # denoising match indices\n            dn_match_indices = DINOLoss.get_dn_match_indices(\n                gt_class, dn_positive_idx, dn_num_group)\n\n            # compute denoising training loss\n            num_gts *= dn_num_group\n            dn_loss = super(MaskDINOLoss, self).forward(\n                dn_out_bboxes,\n                dn_out_logits,\n                gt_bbox,\n                gt_class,\n                masks=dn_out_masks,\n                gt_mask=gt_mask,\n                postfix=\"_dn\",\n                dn_match_indices=dn_match_indices,\n                num_gts=num_gts)\n            total_loss.update(dn_loss)\n        else:\n            total_loss.update(\n                {k + '_dn': paddle.to_tensor([0.])\n                 for k in total_loss.keys()})\n\n        return 
total_loss\n\n    def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts,\n                       postfix=\"\"):\n        # masks: [b, query, h, w], gt_mask: list[[n, H, W]]\n        name_mask = \"loss_mask\" + postfix\n        name_dice = \"loss_dice\" + postfix\n\n        loss = dict()\n        if sum(len(a) for a in gt_mask) == 0:\n            loss[name_mask] = paddle.to_tensor([0.])\n            loss[name_dice] = paddle.to_tensor([0.])\n            return loss\n\n        src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,\n                                                              match_indices)\n        # sample points\n        sample_points = self._get_point_coords_by_uncertainty(src_masks)\n        sample_points = 2.0 * sample_points.unsqueeze(1) - 1.0\n\n        src_masks = F.grid_sample(\n            src_masks.unsqueeze(1), sample_points,\n            align_corners=False).squeeze([1, 2])\n\n        target_masks = F.grid_sample(\n            target_masks.unsqueeze(1), sample_points,\n            align_corners=False).squeeze([1, 2]).detach()\n\n        loss[name_mask] = self.loss_coeff[\n            'mask'] * F.binary_cross_entropy_with_logits(\n                src_masks, target_masks,\n                reduction='none').mean(1).sum() / num_gts\n        loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss(\n            src_masks, target_masks, num_gts)\n        return loss\n\n    def _get_point_coords_by_uncertainty(self, masks):\n        # Sample points based on their uncertainty.\n        masks = masks.detach()\n        num_masks = masks.shape[0]\n        sample_points = paddle.rand(\n            [num_masks, 1, self.num_oversample_points, 2])\n\n        out_mask = F.grid_sample(\n            masks.unsqueeze(1), 2.0 * sample_points - 1.0,\n            align_corners=False).squeeze([1, 2])\n        out_mask = -paddle.abs(out_mask)\n\n        _, topk_ind = paddle.topk(out_mask, self.num_important_points, axis=1)\n        batch_ind = paddle.arange(end=num_masks, dtype=topk_ind.dtype)\n        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_important_points])\n        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)\n\n        sample_points = paddle.gather_nd(sample_points.squeeze(1), topk_ind)\n        if self.num_random_points > 0:\n            sample_points = paddle.concat(\n                [\n                    sample_points,\n                    paddle.rand([num_masks, self.num_random_points, 2])\n                ],\n                axis=1)\n        return sample_points"
  },
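  {
    "path": "docs/hypothetical_examples/maskdino_point_sampling_sketch.py",
    "content": "# NOTE: hypothetical usage sketch added for illustration; this file is not\n# part of the original PaddleDetection sources. It re-implements, standalone,\n# the uncertainty-based point sampling that MaskDINOLoss performs in\n# _get_point_coords_by_uncertainty: oversample random points, keep those whose\n# mask logits are closest to the 0 decision boundary (most uncertain), then\n# pad with fresh random points. All names below are illustrative.\nimport paddle\nimport paddle.nn.functional as F\n\n\ndef sample_uncertain_points(mask_logits, num_points=16, oversample_ratio=3.0,\n                            important_ratio=0.75):\n    num_over = int(num_points * oversample_ratio)\n    num_imp = int(num_points * important_ratio)\n    num_rand = num_points - num_imp\n    num_masks = mask_logits.shape[0]\n    # random candidate points in [0, 1]^2, shaped [N, 1, P, 2] for grid_sample\n    points = paddle.rand([num_masks, 1, num_over, 2])\n    logits = F.grid_sample(\n        mask_logits.unsqueeze(1), 2.0 * points - 1.0,\n        align_corners=False).squeeze([1, 2])\n    # uncertainty peaks where the predicted logit crosses zero\n    uncertainty = -paddle.abs(logits)\n    _, topk_ind = paddle.topk(uncertainty, num_imp, axis=1)\n    batch_ind = paddle.arange(end=num_masks, dtype=topk_ind.dtype)\n    batch_ind = batch_ind.unsqueeze(-1).tile([1, num_imp])\n    gather_ind = paddle.stack([batch_ind, topk_ind], axis=-1)\n    picked = paddle.gather_nd(points.squeeze(1), gather_ind)\n    if num_rand > 0:\n        picked = paddle.concat(\n            [picked, paddle.rand([num_masks, num_rand, 2])], axis=1)\n    return picked\n\n\nif __name__ == '__main__':\n    fake_masks = paddle.randn([2, 32, 32])  # two predicted mask logit maps\n    print(sample_uncertain_points(fake_masks).shape)  # [2, 16, 2]\n"
  },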
  {
    "path": "ppdet/modeling/losses/fairmot_loss.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nfrom paddle.nn.initializer import Constant\nfrom ppdet.core.workspace import register\n\n__all__ = ['FairMOTLoss']\n\n\n@register\nclass FairMOTLoss(nn.Layer):\n    def __init__(self):\n        super(FairMOTLoss, self).__init__()\n        self.det_weight = self.create_parameter(\n            shape=[1], default_initializer=Constant(-1.85))\n        self.reid_weight = self.create_parameter(\n            shape=[1], default_initializer=Constant(-1.05))\n\n    def forward(self, det_loss, reid_loss):\n        loss = paddle.exp(-self.det_weight) * det_loss + paddle.exp(\n            -self.reid_weight) * reid_loss + (self.det_weight + self.reid_weight\n                                              )\n        loss *= 0.5\n        return {'loss': loss}\n"
  },
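  {
    "path": "docs/hypothetical_examples/fairmot_loss_sketch.py",
    "content": "# NOTE: hypothetical worked example added for illustration; this file is not\n# part of the original PaddleDetection sources. It spells out the learnable\n# uncertainty weighting (Kendall et al., 'Multi-Task Learning Using\n# Uncertainty to Weigh Losses') that FairMOTLoss applies to its detection and\n# re-id branches: loss = 0.5 * (exp(-w1) * L1 + exp(-w2) * L2 + w1 + w2).\nimport paddle\n\ndet_loss = paddle.to_tensor([2.3])  # illustrative branch losses\nreid_loss = paddle.to_tensor([4.1])\ndet_w = paddle.to_tensor([-1.85])   # initial values of the learned weights\nreid_w = paddle.to_tensor([-1.05])\n\n# exp(-w) down-weights a branch as its (log-)uncertainty w grows, while the\n# additive w term keeps the weights from drifting to -inf\ntotal = 0.5 * (paddle.exp(-det_w) * det_loss +\n               paddle.exp(-reid_w) * reid_loss + det_w + reid_w)\nprint(float(total))\n"
  },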
  {
    "path": "ppdet/modeling/losses/fcos_loss.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling import ops\nfrom functools import partial\n\n__all__ = ['FCOSLoss', 'FCOSLossMILC', 'FCOSLossCR']\n\n\ndef flatten_tensor(inputs, channel_first=False):\n    \"\"\"\n    Flatten a Tensor\n    Args:\n        inputs (Tensor): 4-D Tensor with shape [N, C, H, W] or [N, H, W, C]\n        channel_first (bool): If true the dimension order of Tensor is \n            [N, C, H, W], otherwise is [N, H, W, C]\n    Return:\n        output_channel_last (Tensor): The flattened Tensor in channel_last style\n    \"\"\"\n    if channel_first:\n        input_channel_last = paddle.transpose(inputs, perm=[0, 2, 3, 1])\n    else:\n        input_channel_last = inputs\n    output_channel_last = paddle.flatten(\n        input_channel_last, start_axis=0, stop_axis=2)\n    return output_channel_last\n\n\n@register\nclass FCOSLoss(nn.Layer):\n    \"\"\"\n    FCOSLoss\n    Args:\n        loss_alpha (float): alpha in focal loss\n        loss_gamma (float): gamma in focal loss\n        iou_loss_type (str): location loss type, IoU/GIoU/LINEAR_IoU\n        reg_weights (float): weight for location loss\n        quality (str): quality branch, centerness/iou\n    \"\"\"\n\n    def __init__(self,\n                 loss_alpha=0.25,\n                 loss_gamma=2.0,\n                 iou_loss_type=\"giou\",\n                 reg_weights=1.0,\n                 quality='centerness'):\n        super(FCOSLoss, self).__init__()\n        self.loss_alpha = loss_alpha\n        self.loss_gamma = loss_gamma\n        self.iou_loss_type = iou_loss_type\n        self.reg_weights = reg_weights\n        self.quality = quality\n\n    def _iou_loss(self,\n                  pred,\n                  targets,\n                  positive_mask,\n                  weights=None,\n                  return_iou=False):\n        \"\"\"\n        Calculate the loss for location prediction\n        Args:\n            pred (Tensor): bounding boxes prediction\n            targets (Tensor): targets for positive samples\n            positive_mask (Tensor): mask of positive samples\n            weights (Tensor): weights for each positive samples\n        Return:\n            loss (Tensor): location loss\n        \"\"\"\n        plw = pred[:, 0] * positive_mask\n        pth = pred[:, 1] * positive_mask\n        prw = pred[:, 2] * positive_mask\n        pbh = pred[:, 3] * positive_mask\n\n        tlw = targets[:, 0] * positive_mask\n        tth = targets[:, 1] * positive_mask\n        trw = targets[:, 2] * positive_mask\n        tbh = targets[:, 3] * positive_mask\n        tlw.stop_gradient = True\n        trw.stop_gradient = True\n        tth.stop_gradient = True\n        
tbh.stop_gradient = True\n\n        ilw = paddle.minimum(plw, tlw)\n        irw = paddle.minimum(prw, trw)\n        ith = paddle.minimum(pth, tth)\n        ibh = paddle.minimum(pbh, tbh)\n\n        clw = paddle.maximum(plw, tlw)\n        crw = paddle.maximum(prw, trw)\n        cth = paddle.maximum(pth, tth)\n        cbh = paddle.maximum(pbh, tbh)\n\n        area_predict = (plw + prw) * (pth + pbh)\n        area_target = (tlw + trw) * (tth + tbh)\n        area_inter = (ilw + irw) * (ith + ibh)\n        ious = (area_inter + 1.0) / (\n            area_predict + area_target - area_inter + 1.0)\n        ious = ious * positive_mask\n\n        if return_iou:\n            return ious\n\n        if self.iou_loss_type.lower() == \"linear_iou\":\n            loss = 1.0 - ious\n        elif self.iou_loss_type.lower() == \"giou\":\n            area_uniou = area_predict + area_target - area_inter\n            area_circum = (clw + crw) * (cth + cbh) + 1e-7\n            giou = ious - (area_circum - area_uniou) / area_circum\n            loss = 1.0 - giou\n        elif self.iou_loss_type.lower() == \"iou\":\n            loss = 0.0 - paddle.log(ious)\n        else:\n            raise KeyError\n        if weights is not None:\n            loss = loss * weights\n        return loss\n\n    def forward(self, cls_logits, bboxes_reg, centerness, tag_labels,\n                tag_bboxes, tag_center):\n        \"\"\"\n        Calculate the loss for classification, location and centerness\n        Args:\n            cls_logits (list): list of Tensor, which is predicted\n                score for all anchor points with shape [N, M, C]\n            bboxes_reg (list): list of Tensor, which is predicted\n                offsets for all anchor points with shape [N, M, 4]\n            centerness (list): list of Tensor, which is predicted\n                centerness for all anchor points with shape [N, M, 1]\n            tag_labels (list): list of Tensor, which is category\n                targets for each anchor point\n            tag_bboxes (list): list of Tensor, which is bounding\n                boxes targets for positive samples\n            tag_center (list): list of Tensor, which is centerness\n                targets for positive samples\n        Return:\n            loss (dict): loss composed of classification loss, bounding box\n                loss and quality loss\n        \"\"\"\n        cls_logits_flatten_list = []\n        bboxes_reg_flatten_list = []\n        centerness_flatten_list = []\n        tag_labels_flatten_list = []\n        tag_bboxes_flatten_list = []\n        tag_center_flatten_list = []\n        num_lvl = len(cls_logits)\n        for lvl in range(num_lvl):\n            cls_logits_flatten_list.append(\n                flatten_tensor(cls_logits[lvl], True))\n            bboxes_reg_flatten_list.append(\n                flatten_tensor(bboxes_reg[lvl], True))\n            centerness_flatten_list.append(\n                flatten_tensor(centerness[lvl], True))\n\n            tag_labels_flatten_list.append(\n                flatten_tensor(tag_labels[lvl], False))\n            tag_bboxes_flatten_list.append(\n                flatten_tensor(tag_bboxes[lvl], False))\n            tag_center_flatten_list.append(\n                flatten_tensor(tag_center[lvl], False))\n\n        cls_logits_flatten = paddle.concat(cls_logits_flatten_list, axis=0)\n        bboxes_reg_flatten = paddle.concat(bboxes_reg_flatten_list, axis=0)\n        centerness_flatten = paddle.concat(centerness_flatten_list, axis=0)\n\n        tag_labels_flatten = 
paddle.concat(tag_labels_flatten_list, axis=0)\n        tag_bboxes_flatten = paddle.concat(tag_bboxes_flatten_list, axis=0)\n        tag_center_flatten = paddle.concat(tag_center_flatten_list, axis=0)\n        tag_labels_flatten.stop_gradient = True\n        tag_bboxes_flatten.stop_gradient = True\n        tag_center_flatten.stop_gradient = True\n\n        mask_positive_bool = tag_labels_flatten > 0\n        mask_positive_bool.stop_gradient = True\n        mask_positive_float = paddle.cast(mask_positive_bool, dtype=\"float32\")\n        mask_positive_float.stop_gradient = True\n\n        num_positive_fp32 = paddle.sum(mask_positive_float)\n        num_positive_fp32.stop_gradient = True\n        num_positive_int32 = paddle.cast(num_positive_fp32, dtype=\"int32\")\n        num_positive_int32 = num_positive_int32 * 0 + 1\n        num_positive_int32.stop_gradient = True\n\n        normalize_sum = paddle.sum(tag_center_flatten * mask_positive_float)\n        normalize_sum.stop_gradient = True\n\n        # 1. cls_logits: sigmoid_focal_loss\n        # expand onehot labels\n        num_classes = cls_logits_flatten.shape[-1]\n        tag_labels_flatten = paddle.squeeze(tag_labels_flatten, axis=-1)\n        tag_labels_flatten_bin = F.one_hot(\n            tag_labels_flatten, num_classes=1 + num_classes)\n        tag_labels_flatten_bin = tag_labels_flatten_bin[:, 1:]\n        # sigmoid_focal_loss\n        cls_loss = F.sigmoid_focal_loss(\n            cls_logits_flatten, tag_labels_flatten_bin) / num_positive_fp32\n\n        if self.quality == 'centerness':\n            # 2. bboxes_reg: giou_loss\n            mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1)\n            tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1)\n            reg_loss = self._iou_loss(\n                bboxes_reg_flatten,\n                tag_bboxes_flatten,\n                mask_positive_float,\n                weights=tag_center_flatten)\n            reg_loss = reg_loss * mask_positive_float / normalize_sum\n\n            # 3. centerness: sigmoid_cross_entropy_with_logits_loss\n            centerness_flatten = paddle.squeeze(centerness_flatten, axis=-1)\n            quality_loss = ops.sigmoid_cross_entropy_with_logits(\n                centerness_flatten, tag_center_flatten)\n            quality_loss = quality_loss * mask_positive_float / num_positive_fp32\n\n        elif self.quality == 'iou':\n            # 2. bboxes_reg: giou_loss\n            mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1)\n            tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1)\n            reg_loss = self._iou_loss(\n                bboxes_reg_flatten,\n                tag_bboxes_flatten,\n                mask_positive_float,\n                weights=None)\n            reg_loss = reg_loss * mask_positive_float / num_positive_fp32\n            # num_positive_fp32 is num_foreground\n\n            # 3. 
centerness: sigmoid_cross_entropy_with_logits_loss\n            centerness_flatten = paddle.squeeze(centerness_flatten, axis=-1)\n            gt_ious = self._iou_loss(\n                bboxes_reg_flatten,\n                tag_bboxes_flatten,\n                mask_positive_float,\n                weights=None,\n                return_iou=True)\n            quality_loss = ops.sigmoid_cross_entropy_with_logits(\n                centerness_flatten, gt_ious)\n            quality_loss = quality_loss * mask_positive_float / num_positive_fp32\n        else:\n            raise Exception(f'Unknown quality type: {self.quality}')\n\n        loss_all = {\n            \"loss_cls\": paddle.sum(cls_loss),\n            \"loss_box\": paddle.sum(reg_loss),\n            \"loss_quality\": paddle.sum(quality_loss),\n        }\n        return loss_all\n\n\n@register\nclass FCOSLossMILC(FCOSLoss):\n    \"\"\"\n    FCOSLossMILC for ARSL in semi-det(ssod)\n    Args:\n        loss_alpha (float): alpha in focal loss\n        loss_gamma (float): gamma in focal loss\n        iou_loss_type (str): location loss type, IoU/GIoU/LINEAR_IoU\n        reg_weights (float): weight for location loss\n    \"\"\"\n\n    def __init__(self,\n                 loss_alpha=0.25,\n                 loss_gamma=2.0,\n                 iou_loss_type=\"giou\",\n                 reg_weights=1.0):\n        super(FCOSLossMILC, self).__init__()\n        self.loss_alpha = loss_alpha\n        self.loss_gamma = loss_gamma\n        self.iou_loss_type = iou_loss_type\n        self.reg_weights = reg_weights\n\n    def iou_loss(self, pred, targets, weights=None, avg_factor=None):\n        \"\"\"\n        Calculate the loss for location prediction\n        Args:\n            pred (Tensor): bounding boxes prediction\n            targets (Tensor): targets for positive samples\n            weights (Tensor): weights for each positive samples\n        Return:\n            loss (Tensor): location loss\n        \"\"\"\n        plw = pred[:, 0]\n        pth = pred[:, 1]\n        prw = pred[:, 2]\n        pbh = pred[:, 3]\n\n        tlw = targets[:, 0]\n        tth = targets[:, 1]\n        trw = targets[:, 2]\n        tbh = targets[:, 3]\n        tlw.stop_gradient = True\n        trw.stop_gradient = True\n        tth.stop_gradient = True\n        tbh.stop_gradient = True\n\n        ilw = paddle.minimum(plw, tlw)\n        irw = paddle.minimum(prw, trw)\n        ith = paddle.minimum(pth, tth)\n        ibh = paddle.minimum(pbh, tbh)\n\n        clw = paddle.maximum(plw, tlw)\n        crw = paddle.maximum(prw, trw)\n        cth = paddle.maximum(pth, tth)\n        cbh = paddle.maximum(pbh, tbh)\n\n        area_predict = (plw + prw) * (pth + pbh)\n        area_target = (tlw + trw) * (tth + tbh)\n        area_inter = (ilw + irw) * (ith + ibh)\n        ious = (area_inter + 1.0) / (\n            area_predict + area_target - area_inter + 1.0)\n        ious = ious\n\n        if self.iou_loss_type.lower() == \"linear_iou\":\n            loss = 1.0 - ious\n        elif self.iou_loss_type.lower() == \"giou\":\n            area_uniou = area_predict + area_target - area_inter\n            area_circum = (clw + crw) * (cth + cbh) + 1e-7\n            giou = ious - (area_circum - area_uniou) / area_circum\n            loss = 1.0 - giou\n        elif self.iou_loss_type.lower() == \"iou\":\n            loss = 0.0 - paddle.log(ious)\n        else:\n            raise KeyError\n        if weights is not None:\n            loss = loss * weights\n        loss = paddle.sum(loss)\n        if 
avg_factor is not None:\n            loss = loss / avg_factor\n        return loss\n\n    # temp function: calculate iou between bbox and target\n    def _bbox_overlap_align(self, pred, targets):\n        assert pred.shape[0] == targets.shape[0], \\\n        'the pred should be aligned with target.'\n\n        plw = pred[:, 0]\n        pth = pred[:, 1]\n        prw = pred[:, 2]\n        pbh = pred[:, 3]\n\n        tlw = targets[:, 0]\n        tth = targets[:, 1]\n        trw = targets[:, 2]\n        tbh = targets[:, 3]\n\n        ilw = paddle.minimum(plw, tlw)\n        irw = paddle.minimum(prw, trw)\n        ith = paddle.minimum(pth, tth)\n        ibh = paddle.minimum(pbh, tbh)\n\n        area_predict = (plw + prw) * (pth + pbh)\n        area_target = (tlw + trw) * (tth + tbh)\n        area_inter = (ilw + irw) * (ith + ibh)\n        ious = (area_inter + 1.0) / (\n            area_predict + area_target - area_inter + 1.0)\n\n        return ious\n\n    def iou_based_soft_label_loss(self,\n                                  pred,\n                                  target,\n                                  alpha=0.75,\n                                  gamma=2.0,\n                                  iou_weighted=False,\n                                  implicit_iou=None,\n                                  avg_factor=None):\n        assert pred.shape == target.shape\n        pred = F.sigmoid(pred)\n        target = target.cast(pred.dtype)\n\n        if implicit_iou is not None:\n            pred = pred * implicit_iou\n\n        if iou_weighted:\n            focal_weight = (pred - target).abs().pow(gamma) * target * (target > 0.0).cast('float32') + \\\n                alpha * (pred - target).abs().pow(gamma) * \\\n                (target <= 0.0).cast('float32')\n        else:\n            focal_weight = (pred - target).abs().pow(gamma) * (target > 0.0).cast('float32') + \\\n                alpha * (pred - target).abs().pow(gamma) * \\\n                (target <= 0.0).cast('float32')\n\n        # focal loss\n        loss = F.binary_cross_entropy(\n            pred, target, reduction='none') * focal_weight\n        if avg_factor is not None:\n            loss = loss / avg_factor\n        return loss\n\n    def forward(self, cls_logits, bboxes_reg, centerness, tag_labels,\n                tag_bboxes, tag_center):\n        \"\"\"\n        Calculate the loss for classification, location and centerness\n        Args:\n            cls_logits (list): list of Tensor, which is predicted\n                score for all anchor points with shape [N, M, C]\n            bboxes_reg (list): list of Tensor, which is predicted\n                offsets for all anchor points with shape [N, M, 4]\n            centerness (list): list of Tensor, which is predicted\n                centerness for all anchor points with shape [N, M, 1]\n            tag_labels (list): list of Tensor, which is category\n                targets for each anchor point\n            tag_bboxes (list): list of Tensor, which is bounding\n                boxes targets for positive samples\n            tag_center (list): list of Tensor, which is centerness\n                targets for positive samples\n        Return:\n            loss (dict): loss composed of classification loss, bounding box\n                loss and iou loss\n        \"\"\"\n        cls_logits_flatten_list = []\n        bboxes_reg_flatten_list = []\n        centerness_flatten_list = []\n        tag_labels_flatten_list = []\n        tag_bboxes_flatten_list = []\n        tag_center_flatten_list = []\n        num_lvl 
= len(cls_logits)\n        for lvl in range(num_lvl):\n            cls_logits_flatten_list.append(\n                flatten_tensor(cls_logits[lvl], True))\n            bboxes_reg_flatten_list.append(\n                flatten_tensor(bboxes_reg[lvl], True))\n            centerness_flatten_list.append(\n                flatten_tensor(centerness[lvl], True))\n\n            tag_labels_flatten_list.append(\n                flatten_tensor(tag_labels[lvl], False))\n            tag_bboxes_flatten_list.append(\n                flatten_tensor(tag_bboxes[lvl], False))\n            tag_center_flatten_list.append(\n                flatten_tensor(tag_center[lvl], False))\n\n        cls_logits_flatten = paddle.concat(cls_logits_flatten_list, axis=0)\n        bboxes_reg_flatten = paddle.concat(bboxes_reg_flatten_list, axis=0)\n        centerness_flatten = paddle.concat(centerness_flatten_list, axis=0)\n\n        tag_labels_flatten = paddle.concat(tag_labels_flatten_list, axis=0)\n        tag_bboxes_flatten = paddle.concat(tag_bboxes_flatten_list, axis=0)\n        tag_center_flatten = paddle.concat(tag_center_flatten_list, axis=0)\n        tag_labels_flatten.stop_gradient = True\n        tag_bboxes_flatten.stop_gradient = True\n        tag_center_flatten.stop_gradient = True\n\n        # find positive index\n        mask_positive_bool = tag_labels_flatten > 0\n        mask_positive_bool.stop_gradient = True\n        mask_positive_float = paddle.cast(mask_positive_bool, dtype=\"float32\")\n        mask_positive_float.stop_gradient = True\n\n        num_positive_fp32 = paddle.sum(mask_positive_float)\n        num_positive_fp32.stop_gradient = True\n        num_positive_int32 = paddle.cast(num_positive_fp32, dtype=\"int32\")\n        num_positive_int32 = num_positive_int32 * 0 + 1\n        num_positive_int32.stop_gradient = True\n\n        # centerness target is used as reg weight\n        normalize_sum = paddle.sum(tag_center_flatten * mask_positive_float)\n        normalize_sum.stop_gradient = True\n\n        # 1. IoU-Based soft label loss\n        # calculate iou\n        with paddle.no_grad():\n            pos_ind = paddle.nonzero(\n                tag_labels_flatten.reshape([-1]) > 0).reshape([-1])\n            pos_pred = bboxes_reg_flatten[pos_ind]\n            pos_target = tag_bboxes_flatten[pos_ind]\n            bbox_iou = self._bbox_overlap_align(pos_pred, pos_target)\n        # pos labels\n        pos_labels = tag_labels_flatten[pos_ind].squeeze(1)\n        cls_target = paddle.zeros(cls_logits_flatten.shape)\n        cls_target[pos_ind, pos_labels - 1] = bbox_iou\n        cls_loss = self.iou_based_soft_label_loss(\n            cls_logits_flatten,\n            cls_target,\n            implicit_iou=F.sigmoid(centerness_flatten),\n            avg_factor=num_positive_fp32)\n\n        # 2. bboxes_reg: giou_loss\n        mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1)\n        tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1)\n        reg_loss = self._iou_loss(\n            bboxes_reg_flatten,\n            tag_bboxes_flatten,\n            mask_positive_float,\n            weights=tag_center_flatten)\n        reg_loss = reg_loss * mask_positive_float / normalize_sum\n\n        # 3. 
iou loss\n        pos_iou_pred = paddle.squeeze(centerness_flatten, axis=-1)[pos_ind]\n        loss_iou = ops.sigmoid_cross_entropy_with_logits(pos_iou_pred, bbox_iou)\n        loss_iou = loss_iou / num_positive_fp32 * 0.5\n\n        loss_all = {\n            \"loss_cls\": paddle.sum(cls_loss),\n            \"loss_box\": paddle.sum(reg_loss),\n            \"loss_iou\": paddle.sum(loss_iou),\n        }\n\n        return loss_all\n\n\n# Concat multi-level feature maps by image\ndef levels_to_images(mlvl_tensor):\n    batch_size = mlvl_tensor[0].shape[0]\n    batch_list = [[] for _ in range(batch_size)]\n    channels = mlvl_tensor[0].shape[1]\n    for t in mlvl_tensor:\n        t = t.transpose([0, 2, 3, 1])\n        t = t.reshape([batch_size, -1, channels])\n        for img in range(batch_size):\n            batch_list[img].append(t[img])\n    return [paddle.concat(item, axis=0) for item in batch_list]\n\n\ndef multi_apply(func, *args, **kwargs):\n    \"\"\"Apply function to a list of arguments.\n\n    Note:\n        This function applies the ``func`` to multiple inputs and\n        maps the multiple outputs of the ``func`` into different\n        lists. Each list contains the same type of outputs corresponding\n        to different inputs.\n\n    Args:\n        func (Function): A function that will be applied to a list of\n            arguments\n\n    Returns:\n        tuple(list): A tuple containing multiple lists, each of which contains \\\n            one kind of result returned by the function\n    \"\"\"\n    pfunc = partial(func, **kwargs) if kwargs else func\n    map_results = map(pfunc, *args)\n    return tuple(map(list, zip(*map_results)))\n\n\n@register\nclass FCOSLossCR(FCOSLossMILC):\n    \"\"\"\n    FCOSLoss of Consistency Regularization\n    \"\"\"\n\n    def __init__(self,\n                 iou_loss_type=\"giou\",\n                 cls_weight=2.0,\n                 reg_weight=2.0,\n                 iou_weight=0.5,\n                 hard_neg_mining_flag=True):\n        super(FCOSLossCR, self).__init__()\n        self.iou_loss_type = iou_loss_type\n        self.cls_weight = cls_weight\n        self.reg_weight = reg_weight\n        self.iou_weight = iou_weight\n        self.hard_neg_mining_flag = hard_neg_mining_flag\n\n    def iou_loss(self, pred, targets, weights=None, avg_factor=None):\n        \"\"\"\n        Calculate the loss for location prediction\n        Args:\n            pred (Tensor): bounding boxes prediction\n            targets (Tensor): targets for positive samples\n            weights (Tensor): weights for each positive samples\n        Return:\n            loss (Tensor): location loss\n        \"\"\"\n        plw = pred[:, 0]\n        pth = pred[:, 1]\n        prw = pred[:, 2]\n        pbh = pred[:, 3]\n\n        tlw = targets[:, 0]\n        tth = targets[:, 1]\n        trw = targets[:, 2]\n        tbh = targets[:, 3]\n        tlw.stop_gradient = True\n        trw.stop_gradient = True\n        tth.stop_gradient = True\n        tbh.stop_gradient = True\n\n        ilw = paddle.minimum(plw, tlw)\n        irw = paddle.minimum(prw, trw)\n        ith = paddle.minimum(pth, tth)\n        ibh = paddle.minimum(pbh, tbh)\n\n        clw = paddle.maximum(plw, tlw)\n        crw = paddle.maximum(prw, trw)\n        cth = paddle.maximum(pth, tth)\n        cbh = paddle.maximum(pbh, tbh)\n\n        area_predict = (plw + prw) * (pth + pbh)\n        area_target = (tlw + trw) * (tth + tbh)\n        area_inter = (ilw + irw) * (ith + ibh)\n        ious = (area_inter + 1.0) / (\n            area_predict + area_target - area_inter + 1.0)\n        ious = ious\n\n        if self.iou_loss_type.lower() == \"linear_iou\":\n            loss = 1.0 - ious\n        elif self.iou_loss_type.lower() == \"giou\":\n            area_uniou = area_predict + area_target - area_inter\n            area_circum = (clw + crw) * (cth + cbh) + 1e-7\n            giou = ious - (area_circum - area_uniou) / area_circum\n            loss = 1.0 - giou\n        elif self.iou_loss_type.lower() == \"iou\":\n            loss = 0.0 - paddle.log(ious)\n        else:\n            raise KeyError\n        if weights is not None:\n            loss = loss * weights\n        loss = paddle.sum(loss)\n        if avg_factor is not None:\n            loss = loss / avg_factor\n        return loss\n\n    # calculate iou between bbox and target\n    def bbox_overlap_align(self, pred, targets):\n        assert pred.shape[0] == targets.shape[0], \\\n        'the pred should be aligned with target.'\n\n        plw = pred[:, 0]\n        pth = pred[:, 1]\n        prw = pred[:, 2]\n        pbh = pred[:, 3]\n\n        tlw = targets[:, 0]\n        tth = targets[:, 1]\n        trw = targets[:, 2]\n        tbh = targets[:, 3]\n\n        ilw = paddle.minimum(plw, tlw)\n        irw = paddle.minimum(prw, trw)\n        ith = paddle.minimum(pth, tth)\n        ibh = paddle.minimum(pbh, tbh)\n\n        area_predict = (plw + prw) * (pth + pbh)\n        area_target = (tlw + trw) * (tth + tbh)\n        area_inter = (ilw + irw) * (ith + ibh)\n        ious = (area_inter + 1.0) / (\n            area_predict + area_target - area_inter + 1.0)\n        return ious\n\n    # cls loss: iou-based soft label with joint iou\n    def quality_focal_loss(self,\n                           stu_cls,\n                           targets,\n                           quality=None,\n                           weights=None,\n                           alpha=0.75,\n                           gamma=2.0,\n                           avg_factor='sum'):\n        stu_cls = F.sigmoid(stu_cls)\n        if quality is not None:\n            stu_cls = stu_cls * F.sigmoid(quality)\n\n        focal_weight = (stu_cls - targets).abs().pow(gamma) * (targets > 0.0).cast('float32') + \\\n            alpha * (stu_cls - targets).abs().pow(gamma) * \\\n            (targets <= 0.0).cast('float32')\n\n        loss = F.binary_cross_entropy(\n            stu_cls, targets, reduction='none') * focal_weight\n\n        if weights is not None:\n            loss = loss * weights.reshape([-1, 1])\n        loss = paddle.sum(loss)\n        if avg_factor is not None:\n            loss = loss / avg_factor\n        return loss\n\n    # generate points according to feature maps\n    def compute_locations_by_level(self, fpn_stride, h, w):\n        \"\"\"\n        Compute locations of anchor points of each FPN layer\n        Return:\n            Anchor points locations of current FPN feature map\n        \"\"\"\n        shift_x = paddle.arange(0, w * fpn_stride, fpn_stride)\n        shift_y = paddle.arange(0, h * fpn_stride, fpn_stride)\n        shift_x = paddle.unsqueeze(shift_x, axis=0)\n        shift_y = paddle.unsqueeze(shift_y, axis=1)\n        shift_x = paddle.expand(shift_x, shape=[h, w])\n        shift_y = paddle.expand(shift_y, shape=[h, w])\n        shift_x = paddle.reshape(shift_x, shape=[-1])\n        shift_y = paddle.reshape(shift_y, shape=[-1])\n        location = paddle.stack(\n            [shift_x, shift_y], axis=-1) + float(fpn_stride) / 2\n        return location\n\n    # decode bbox from ltrb to x1y1x2y2\n    def decode_bbox(self, ltrb, points):\n        assert ltrb.shape[0] == points.shape[0], \\\n        \"When decoding bbox in one image, the number of locations should match the number of points.\"\n        bbox_decoding = paddle.stack(\n            [\n                points[:, 0] - ltrb[:, 0], points[:, 1] - ltrb[:, 1],\n                points[:, 0] + ltrb[:, 2], points[:, 1] + ltrb[:, 3]\n            ],\n            axis=1)\n        return bbox_decoding\n\n    # encode bbox from x1y1x2y2 to ltrb\n    def encode_bbox(self, bbox, points):\n        assert bbox.shape[0] == points.shape[0], \\\n        \"When encoding bbox in one image, the number of bboxes should match the number of points.\"\n        bbox_encoding = paddle.stack(\n            [\n                points[:, 0] - bbox[:, 0], points[:, 1] - bbox[:, 1],\n                bbox[:, 2] - points[:, 0], bbox[:, 3] - points[:, 1]\n            ],\n            axis=1)\n        return bbox_encoding\n\n    def calcualate_iou(self, gt_bbox, predict_bbox):\n        # bbox area\n        gt_area = (gt_bbox[:, 2] - gt_bbox[:, 0]) * \\\n             (gt_bbox[:, 3] - gt_bbox[:, 1])\n        predict_area = (predict_bbox[:, 2] - predict_bbox[:, 0]) * \\\n             (predict_bbox[:, 3] - predict_bbox[:, 1])\n        # overlap area\n        lt = paddle.fmax(gt_bbox[:, None, :2], predict_bbox[None, :, :2])\n        rb = paddle.fmin(gt_bbox[:, None, 2:], predict_bbox[None, :, 2:])\n        wh = paddle.clip(rb - lt, min=0)\n        overlap = wh[..., 0] * wh[..., 1]\n        # iou\n        iou = overlap / (gt_area[:, None] + predict_area[None, :] - overlap)\n        return iou\n\n    # select potential positives from hard negatives\n    def hard_neg_mining(self,\n                        cls_score,\n                        loc_ltrb,\n                        quality,\n                        pos_ind,\n                        hard_neg_ind,\n                        loc_mask,\n                        loc_targets,\n                        iou_thresh=0.6):\n        # get points locations and strides\n        points_list = []\n        strides_list = []\n        scale_list = []\n        scale = [0, 1, 2, 3, 4]\n        for fpn_scale, fpn_stride, HW in zip(scale, self.fpn_stride,\n                                             self.lvl_hw):\n            h, w = HW\n            lvl_points = self.compute_locations_by_level(fpn_stride, h, w)\n            points_list.append(lvl_points)\n            lvl_strides = paddle.full([h * w, 1], fpn_stride)\n            strides_list.append(lvl_strides)\n            lvl_scales = paddle.full([h * w, 1], fpn_scale)\n            scale_list.append(lvl_scales)\n        points = paddle.concat(points_list, axis=0)\n        strides = paddle.concat(strides_list, axis=0)\n        scales = paddle.concat(scale_list, axis=0)\n\n        # cls scores\n        cls_vals = F.sigmoid(cls_score) * F.sigmoid(quality)\n        max_vals = paddle.max(cls_vals, axis=-1)\n        class_ind = paddle.argmax(cls_vals, axis=-1)\n\n        ### calculate iou between positive and hard negative\n        # decode pos bbox\n        pos_cls = max_vals[pos_ind]\n        pos_loc = loc_ltrb[pos_ind].reshape([-1, 4])\n        pos_strides = strides[pos_ind]\n        pos_points = points[pos_ind].reshape([-1, 2])\n        pos_loc = pos_loc * pos_strides\n        pos_bbox = self.decode_bbox(pos_loc, pos_points)\n        pos_scales = scales[pos_ind]\n        # decode hard negative bbox\n        hard_neg_loc = loc_ltrb[hard_neg_ind].reshape([-1, 4])\n        hard_neg_strides = strides[hard_neg_ind]\n        hard_neg_points = points[hard_neg_ind].reshape([-1, 2])\n        hard_neg_loc = hard_neg_loc * hard_neg_strides\n        hard_neg_bbox = self.decode_bbox(hard_neg_loc, hard_neg_points)\n        hard_neg_scales = scales[hard_neg_ind]\n        # iou between pos bbox and hard negative bbox\n        hard_neg_pos_iou = self.calcualate_iou(hard_neg_bbox, pos_bbox)\n\n        ### select potential positives from hard negatives\n        # scale flag\n        scale_temp = paddle.abs(\n            pos_scales.reshape([-1])[None, :] - hard_neg_scales.reshape([-1])\n            [:, None])\n        scale_flag = (scale_temp <= 1.)\n        # iou flag\n        iou_flag = (hard_neg_pos_iou >= iou_thresh)\n        # same class flag\n        pos_class = class_ind[pos_ind]\n        hard_neg_class = class_ind[hard_neg_ind]\n        class_flag = pos_class[None, :] - hard_neg_class[:, None]\n        class_flag = (class_flag == 0)\n        # hard negative point inside positive bbox flag\n        ltrb_temp = paddle.stack(\n            [\n                hard_neg_points[:, None, 0] - pos_bbox[None, :, 0],\n                hard_neg_points[:, None, 1] - pos_bbox[None, :, 1],\n                pos_bbox[None, :, 2] - hard_neg_points[:, None, 0],\n                pos_bbox[None, :, 3] - hard_neg_points[:, None, 1]\n            ],\n            axis=-1)\n        inside_flag = ltrb_temp.min(axis=-1) > 0\n        # reset iou\n        valid_flag = (iou_flag & class_flag & inside_flag & scale_flag)\n        invalid_iou = paddle.zeros_like(hard_neg_pos_iou)\n        hard_neg_pos_iou = paddle.where(valid_flag, hard_neg_pos_iou,\n                                        invalid_iou)\n        pos_hard_neg_max_iou = hard_neg_pos_iou.max(axis=-1)\n        # select potential pos\n        potential_pos_ind = (pos_hard_neg_max_iou > 0.)\n        num_potential_pos = paddle.nonzero(potential_pos_ind).shape[0]\n        if num_potential_pos == 0:\n            return None\n\n        ### calculate loc target: aggregate all matching bboxes as the bbox targets of potential pos\n        # prepare data\n        potential_points = hard_neg_points[potential_pos_ind].reshape([-1, 2])\n        potential_strides = hard_neg_strides[potential_pos_ind]\n        potential_valid_flag = valid_flag[potential_pos_ind]\n        potential_pos_ind = hard_neg_ind[potential_pos_ind]\n\n        # get cls and box of matching positives\n        pos_cls = max_vals[pos_ind]\n        expand_pos_bbox = paddle.expand(\n            pos_bbox,\n            shape=[num_potential_pos, pos_bbox.shape[0], pos_bbox.shape[1]])\n        expand_pos_cls = paddle.expand(\n            pos_cls, shape=[num_potential_pos, pos_cls.shape[0]])\n        invalid_cls = paddle.zeros_like(expand_pos_cls)\n        expand_pos_cls = paddle.where(potential_valid_flag, expand_pos_cls,\n                                      invalid_cls)\n        expand_pos_cls = paddle.unsqueeze(expand_pos_cls, axis=-1)\n        # aggregate box based on cls_score\n        agg_bbox = (expand_pos_bbox * expand_pos_cls).sum(axis=1) \\\n            / expand_pos_cls.sum(axis=1)\n        agg_ltrb = self.encode_bbox(agg_bbox, potential_points)\n        agg_ltrb = agg_ltrb / potential_strides\n\n        # loc target for all pos\n        loc_targets[potential_pos_ind] = agg_ltrb\n        loc_mask[potential_pos_ind] = 1.\n\n        return loc_mask, loc_targets\n\n    # get training targets\n    def get_targets_per_img(self, tea_cls, tea_loc, tea_iou, stu_cls, stu_loc,\n                            stu_iou):\n\n        ### sample selection\n        # prepare data\n        tea_cls_scores = F.sigmoid(tea_cls) * F.sigmoid(tea_iou)\n        class_ind = paddle.argmax(tea_cls_scores, axis=-1)\n        max_vals = paddle.max(tea_cls_scores, axis=-1)\n        cls_mask = paddle.zeros_like(\n            max_vals\n        )  # set cls valid mask: pos is 1, hard_negative and negative are 0.\n        num_pos, num_hard_neg = 0, 0\n\n        # mean-std selection\n        # use nonzero to turn the boolean mask into integer indices, because they are used to compose a two-dim index below.\n        # use squeeze rather than reshape to avoid errors when no score is larger than the threshold.\n        candidate_ind = paddle.nonzero(max_vals >= 0.1).squeeze(axis=-1)\n        num_candidate = candidate_ind.shape[0]\n        if num_candidate > 0:\n            # pos thresh = mean + std to select pos samples\n            candidate_score = max_vals[candidate_ind]\n            candidate_score_mean = candidate_score.mean()\n            candidate_score_std = candidate_score.std()\n            pos_thresh = (candidate_score_mean + candidate_score_std).clip(\n                max=0.4)\n            # select pos\n            pos_ind = paddle.nonzero(max_vals >= pos_thresh).squeeze(axis=-1)\n            num_pos = pos_ind.shape[0]\n            # select hard negatives as potential pos\n            hard_neg_ind = (max_vals >= 0.1) & (max_vals < pos_thresh)\n            hard_neg_ind = paddle.nonzero(hard_neg_ind).squeeze(axis=-1)\n            num_hard_neg = hard_neg_ind.shape[0]\n        # if there is no positive, directly select the top-10 as pos.\n        if (num_pos == 0):\n            num_pos = 10\n            _, pos_ind = paddle.topk(max_vals, k=num_pos)\n        cls_mask[pos_ind] = 1.\n\n        ### Consistency Regularization Training targets\n        # cls targets\n        pos_class_ind = class_ind[pos_ind]\n        cls_targets = paddle.zeros_like(tea_cls)\n        cls_targets[pos_ind, pos_class_ind] = tea_cls_scores[pos_ind,\n                                                             pos_class_ind]\n        # hard negative cls target\n        if num_hard_neg != 0:\n            cls_targets[hard_neg_ind] = tea_cls_scores[hard_neg_ind]\n        # loc targets\n        loc_targets = paddle.zeros_like(tea_loc)\n        loc_targets[pos_ind] = tea_loc[pos_ind]\n        # iou targets\n        iou_targets = paddle.zeros(\n            shape=[tea_iou.shape[0]], dtype=tea_iou.dtype)\n        iou_targets[pos_ind] = F.sigmoid(\n            paddle.squeeze(\n                tea_iou, axis=-1)[pos_ind])\n\n        loc_mask = cls_mask.clone()\n        # select potential positive from hard negatives for loc_task training\n        if (num_hard_neg > 0) and self.hard_neg_mining_flag:\n            results = self.hard_neg_mining(tea_cls, tea_loc, tea_iou, pos_ind,\n                                           hard_neg_ind, loc_mask, loc_targets)\n            if results is not None:\n                loc_mask, loc_targets = results\n                loc_pos_ind = paddle.nonzero(loc_mask > 0.).squeeze(axis=-1)\n                iou_targets[loc_pos_ind] = F.sigmoid(\n                    paddle.squeeze(\n                        tea_iou, axis=-1)[loc_pos_ind])\n\n        return cls_mask, loc_mask, \\\n               cls_targets, loc_targets, iou_targets\n\n    def forward(self, student_prediction, teacher_prediction):\n        stu_cls_lvl, stu_loc_lvl, stu_iou_lvl = student_prediction\n        tea_cls_lvl, tea_loc_lvl, tea_iou_lvl, self.fpn_stride = teacher_prediction\n\n        # H and W of level (used for aggregating targets)\n        self.lvl_hw = []\n        for t in tea_cls_lvl:\n            _, _, H, W = t.shape\n            self.lvl_hw.append([H, W])\n\n        # levels to images\n        stu_cls_img = levels_to_images(stu_cls_lvl)\n        stu_loc_img = levels_to_images(stu_loc_lvl)\n        stu_iou_img = levels_to_images(stu_iou_lvl)\n        tea_cls_img = levels_to_images(tea_cls_lvl)\n        tea_loc_img = levels_to_images(tea_loc_lvl)\n        tea_iou_img = levels_to_images(tea_iou_lvl)\n\n        with paddle.no_grad():\n            cls_mask, loc_mask, \\\n            cls_targets, loc_targets, iou_targets = multi_apply(\n                self.get_targets_per_img,\n                tea_cls_img,\n                tea_loc_img,\n                tea_iou_img,\n                stu_cls_img,\n                stu_loc_img,\n                stu_iou_img\n            )\n\n        # flatten predictions\n        stu_cls = paddle.concat(stu_cls_img, axis=0)\n        stu_loc = paddle.concat(stu_loc_img, axis=0)\n        stu_iou = paddle.concat(stu_iou_img, axis=0)\n        # flatten targets\n        cls_mask = paddle.concat(cls_mask, axis=0)\n        loc_mask = paddle.concat(loc_mask, axis=0)\n        cls_targets = paddle.concat(cls_targets, axis=0)\n        loc_targets = paddle.concat(loc_targets, axis=0)\n        iou_targets = paddle.concat(iou_targets, axis=0)\n\n        ### Training Weights and avg factor\n        # find positives\n        cls_pos_ind = paddle.nonzero(cls_mask > 0.).squeeze(axis=-1)\n        loc_pos_ind = paddle.nonzero(loc_mask > 0.).squeeze(axis=-1)\n        # cls weight\n        cls_sample_weights = paddle.ones([cls_targets.shape[0]])\n        cls_avg_factor = paddle.max(cls_targets[cls_pos_ind],\n                                    axis=-1).sum().item()\n        # loc weight\n        loc_sample_weights = paddle.max(cls_targets[loc_pos_ind], axis=-1)\n        loc_avg_factor = loc_sample_weights.sum().item()\n        # iou weight\n        iou_sample_weights = paddle.ones([loc_pos_ind.shape[0]])\n        iou_avg_factor = loc_pos_ind.shape[0]\n\n        ### unsupervised loss\n        # cls loss\n        loss_cls = self.quality_focal_loss(\n            stu_cls,\n            cls_targets,\n            quality=stu_iou,\n            weights=cls_sample_weights,\n            avg_factor=cls_avg_factor) * self.cls_weight\n        # iou loss\n        pos_stu_iou = paddle.squeeze(stu_iou, axis=-1)[loc_pos_ind]\n        pos_iou_targets = iou_targets[loc_pos_ind]\n        loss_iou = F.binary_cross_entropy(\n            F.sigmoid(pos_stu_iou), pos_iou_targets,\n            reduction='none') * iou_sample_weights\n        loss_iou = loss_iou.sum() / iou_avg_factor * self.iou_weight\n        # box loss\n        pos_stu_loc = stu_loc[loc_pos_ind]\n        pos_loc_targets = loc_targets[loc_pos_ind]\n\n        loss_box = self.iou_loss(\n            pos_stu_loc,\n            pos_loc_targets,\n            weights=loc_sample_weights,\n            avg_factor=loc_avg_factor)\n        loss_box = loss_box * self.reg_weight\n\n        loss_all = {\n            \"loss_cls\": loss_cls,\n            \"loss_box\": loss_box,\n            \"loss_iou\": loss_iou,\n        }\n        return loss_all\n"
  },
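  {
    "path": "docs/hypothetical_examples/fcos_ltrb_iou_sketch.py",
    "content": "# NOTE: hypothetical worked example added for illustration; this file is not\n# part of the original PaddleDetection sources. FCOSLoss regresses (l, t, r, b)\n# distances from an anchor point to the four box sides, so box width is l + r\n# and height is t + b; _iou_loss computes IoU directly on these distances\n# (with +1.0 smoothing in numerator and denominator). Numbers are made up.\nimport paddle\n\npred = paddle.to_tensor([[2., 2., 2., 2.]])    # a 4x4 box centred on the point\ntarget = paddle.to_tensor([[1., 1., 3., 3.]])  # a shifted 4x4 box\n\ninter_w = paddle.minimum(pred[:, 0], target[:, 0]) + paddle.minimum(\n    pred[:, 2], target[:, 2])\ninter_h = paddle.minimum(pred[:, 1], target[:, 1]) + paddle.minimum(\n    pred[:, 3], target[:, 3])\narea_pred = (pred[:, 0] + pred[:, 2]) * (pred[:, 1] + pred[:, 3])\narea_target = (target[:, 0] + target[:, 2]) * (target[:, 1] + target[:, 3])\narea_inter = inter_w * inter_h\nious = (area_inter + 1.0) / (area_pred + area_target - area_inter + 1.0)\nprint(float(ious))  # overlap 3x3=9, union 16+16-9=23 -> (9+1)/(23+1) ~= 0.417\n"
  },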
  {
    "path": "ppdet/modeling/losses/focal_loss.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn.functional as F\nimport paddle.nn as nn\nfrom ppdet.core.workspace import register\n\n__all__ = ['FocalLoss', 'Weighted_FocalLoss']\n\n@register\nclass FocalLoss(nn.Layer):\n    \"\"\"A wrapper around paddle.nn.functional.sigmoid_focal_loss.\n    Args:\n        use_sigmoid (bool): currently only support use_sigmoid=True\n        alpha (float): parameter alpha in Focal Loss\n        gamma (float): parameter gamma in Focal Loss\n        loss_weight (float): final loss will be multiplied by this\n    \"\"\"\n    def __init__(self,\n                 use_sigmoid=True,\n                 alpha=0.25,\n                 gamma=2.0,\n                 loss_weight=1.0):\n        super(FocalLoss, self).__init__()\n        assert use_sigmoid == True, \\\n            'Focal Loss only supports sigmoid at the moment'\n        self.use_sigmoid = use_sigmoid\n        self.alpha = alpha\n        self.gamma = gamma\n        self.loss_weight = loss_weight\n\n    def forward(self, pred, target, reduction='none'):\n        \"\"\"forward function.\n        Args:\n            pred (Tensor): logits of class prediction, of shape (N, num_classes)\n            target (Tensor): target class label, of shape (N, )\n            reduction (str): the way to reduce loss, one of (none, sum, mean)\n        \"\"\"\n        num_classes = pred.shape[1]\n        target = F.one_hot(target, num_classes+1).cast(pred.dtype)\n        target = target[:, :-1].detach()\n        loss = F.sigmoid_focal_loss(\n            pred, target, alpha=self.alpha, gamma=self.gamma,\n            reduction=reduction)\n        return loss * self.loss_weight\n\n\n@register\nclass Weighted_FocalLoss(FocalLoss):\n    \"\"\"A wrapper around paddle.nn.functional.sigmoid_focal_loss.\n    Args:\n        use_sigmoid (bool): currently only support use_sigmoid=True\n        alpha (float): parameter alpha in Focal Loss\n        gamma (float): parameter gamma in Focal Loss\n        loss_weight (float): final loss will be multiplied by this\n    \"\"\"\n    def __init__(self,\n                 use_sigmoid=True,\n                 alpha=0.25,\n                 gamma=2.0,\n                 loss_weight=1.0,\n                 reduction=\"mean\"):\n        super(FocalLoss, self).__init__()\n        assert use_sigmoid == True, \\\n            'Focal Loss only supports sigmoid at the moment'\n        self.use_sigmoid = use_sigmoid\n        self.alpha = alpha\n        self.gamma = gamma\n        self.loss_weight = loss_weight\n        self.reduction = reduction\n\n    def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None):\n        \"\"\"forward function.\n        Args:\n            pred (Tensor): logits of class prediction, of shape (N, num_classes)\n            target 
(Tensor): target class label, of shape (N, )\n            reduction (str): the way to reduce loss, one of (none, sum, mean)\n        \"\"\"\n        assert reduction_override in (None, 'none', 'mean', 'sum')\n        reduction = (\n            reduction_override if reduction_override else self.reduction)\n        num_classes = pred.shape[1]\n        target = F.one_hot(target, num_classes + 1).astype(pred.dtype)\n        target = target[:, :-1].detach()\n        loss = F.sigmoid_focal_loss(\n            pred, target, alpha=self.alpha, gamma=self.gamma,\n            reduction='none')\n\n        if weight is not None:\n            if weight.shape != loss.shape:\n                if weight.shape[0] == loss.shape[0]:\n                    # For most cases, weight is of shape (num_priors, ),\n                    #  which means it does not have the second axis num_class\n                    weight = weight.reshape((-1, 1))\n                else:\n                    # Sometimes, weight per anchor per class is also needed. e.g.\n                    #  in FSAF. But it may be flattened of shape\n                    #  (num_priors x num_class, ), while loss is still of shape\n                    #  (num_priors, num_class).\n                    assert weight.numel() == loss.numel()\n                    weight = weight.reshape((loss.shape[0], -1))\n            assert weight.ndim == loss.ndim\n            loss = loss * weight\n\n        # if avg_factor is not specified, just reduce the loss\n        if avg_factor is None:\n            if reduction == 'mean':\n                loss = loss.mean()\n            elif reduction == 'sum':\n                loss = loss.sum()\n        else:\n            # if reduction is mean, then average the loss by avg_factor\n            if reduction == 'mean':\n                # Avoid causing ZeroDivisionError when avg_factor is 0.0,\n                # i.e., all labels of an image belong to ignore index.\n                eps = 1e-10\n                loss = loss.sum() / (avg_factor + eps)\n            # if reduction is 'none', then do nothing, otherwise raise an error\n            elif reduction != 'none':\n                raise ValueError('avg_factor can not be used with reduction=\"sum\"')\n\n        return loss * self.loss_weight\n"
  },
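  {
    "path": "docs/hypothetical_examples/focal_loss_target_sketch.py",
    "content": "# NOTE: hypothetical sketch added for illustration; this file is not part of\n# the original PaddleDetection sources. It shows how FocalLoss builds its\n# one-hot targets: integer labels use num_classes as the background id, so\n# one_hot is taken over num_classes + 1 bins and the last (background) column\n# is dropped, leaving an all-zero row for negative samples.\nimport paddle\nimport paddle.nn.functional as F\n\nnum_classes = 3\nlogits = paddle.randn([4, num_classes])\nlabels = paddle.to_tensor([0, 2, 3, 1])  # 3 == background here\n\ntarget = F.one_hot(labels, num_classes + 1).astype(logits.dtype)[:, :-1]\nloss = F.sigmoid_focal_loss(\n    logits, target.detach(), alpha=0.25, gamma=2.0, reduction='none')\nprint(target.numpy())  # the row for label 3 is all zeros\nprint(loss.shape)      # [4, 3]\n"
  },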
  {
    "path": "ppdet/modeling/losses/gfocal_loss.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# The code is based on:\n# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/gfocal_loss.py\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register, serializable\nfrom ppdet.modeling import ops\n\n__all__ = ['QualityFocalLoss', 'DistributionFocalLoss']\n\n\ndef quality_focal_loss(pred, target, beta=2.0, use_sigmoid=True):\n    \"\"\"\n    Quality Focal Loss (QFL) is from `Generalized Focal Loss: Learning\n    Qualified and Distributed Bounding Boxes for Dense Object Detection\n    <https://arxiv.org/abs/2006.04388>`_.\n    Args:\n        pred (Tensor): Predicted joint representation of classification\n            and quality (IoU) estimation with shape (N, C), C is the number of\n            classes.\n        target (tuple([Tensor])): Target category label with shape (N,)\n            and target quality label with shape (N,).\n        beta (float): The beta parameter for calculating the modulating factor.\n            Defaults to 2.0.\n    Returns:\n        Tensor: Loss tensor with shape (N,).\n    \"\"\"\n    assert len(target) == 2, \"\"\"target for QFL must be a tuple of two elements,\n        including category label and quality label, respectively\"\"\"\n    # label denotes the category id, score denotes the quality score\n    label, score = target\n    if use_sigmoid:\n        func = F.binary_cross_entropy_with_logits\n    else:\n        func = F.binary_cross_entropy\n\n    # negatives are supervised by 0 quality score\n    pred_sigmoid = F.sigmoid(pred) if use_sigmoid else pred\n    scale_factor = pred_sigmoid\n    zerolabel = paddle.zeros(pred.shape, dtype='float32')\n    loss = func(pred, zerolabel, reduction='none') * scale_factor.pow(beta)\n\n    # FG cat_id: [0, num_classes -1], BG cat_id: num_classes\n    bg_class_ind = pred.shape[1]\n    pos = paddle.logical_and((label >= 0),\n                             (label < bg_class_ind)).nonzero().squeeze(1)\n    if pos.shape[0] == 0:\n        return loss.sum(axis=1)\n    pos_label = paddle.gather(label, pos, axis=0)\n    pos_mask = np.zeros(pred.shape, dtype=np.int32)\n    pos_mask[pos.numpy(), pos_label.numpy()] = 1\n    pos_mask = paddle.to_tensor(pos_mask, dtype='bool')\n    score = score.unsqueeze(-1).expand([-1, pred.shape[1]]).cast('float32')\n    # positives are supervised by bbox quality (IoU) score\n    scale_factor_new = score - pred_sigmoid\n\n    loss_pos = func(\n        pred, score, reduction='none') * scale_factor_new.abs().pow(beta)\n    loss = loss * paddle.logical_not(pos_mask).astype(loss.dtype) + loss_pos * pos_mask.astype(loss.dtype)\n    loss = loss.sum(axis=1)\n    return loss\n\n\ndef distribution_focal_loss(pred, label):\n    \"\"\"Distribution Focal Loss (DFL) 
is from `Generalized Focal Loss: Learning\n    Qualified and Distributed Bounding Boxes for Dense Object Detection\n    <https://arxiv.org/abs/2006.04388>`_.\n    Args:\n        pred (Tensor): Predicted general distribution of bounding boxes\n            (before softmax) with shape (N, n+1), n is the max value of the\n            integral set `{0, ..., n}` in paper.\n        label (Tensor): Target distance label for bounding boxes with\n            shape (N,).\n    Returns:\n        Tensor: Loss tensor with shape (N,).\n    \"\"\"\n    dis_left = label.cast('int64')\n    dis_right = dis_left + 1\n    weight_left = dis_right.cast('float32') - label\n    weight_right = label - dis_left.cast('float32')\n    loss = F.cross_entropy(pred, dis_left, reduction='none') * weight_left \\\n        + F.cross_entropy(pred, dis_right, reduction='none') * weight_right\n    return loss\n\n\n@register\n@serializable\nclass QualityFocalLoss(nn.Layer):\n    r\"\"\"Quality Focal Loss (QFL) is a variant of `Generalized Focal Loss:\n    Learning Qualified and Distributed Bounding Boxes for Dense Object\n    Detection <https://arxiv.org/abs/2006.04388>`_.\n    Args:\n        use_sigmoid (bool): Whether sigmoid operation is conducted in QFL.\n            Defaults to True.\n        beta (float): The beta parameter for calculating the modulating factor.\n            Defaults to 2.0.\n        reduction (str): Options are \"none\", \"mean\" and \"sum\".\n        loss_weight (float): Loss weight of current loss.\n    \"\"\"\n\n    def __init__(self,\n                 use_sigmoid=True,\n                 beta=2.0,\n                 reduction='mean',\n                 loss_weight=1.0):\n        super(QualityFocalLoss, self).__init__()\n        self.use_sigmoid = use_sigmoid\n        self.beta = beta\n        assert reduction in ('none', 'mean', 'sum')\n        self.reduction = reduction\n        self.loss_weight = loss_weight\n\n    def forward(self, pred, target, weight=None, avg_factor=None):\n        \"\"\"Forward function.\n        Args:\n            pred (Tensor): Predicted joint representation of\n                classification and quality (IoU) estimation with shape (N, C),\n                C is the number of classes.\n            target (tuple([Tensor])): Target category label with shape\n                (N,) and target quality label with shape (N,).\n            weight (Tensor, optional): The weight of loss for each\n                prediction. Defaults to None.\n            avg_factor (int, optional): Average factor that is used to average\n                the loss. 
Defaults to None.\n        \"\"\"\n\n        loss = self.loss_weight * quality_focal_loss(\n            pred, target, beta=self.beta, use_sigmoid=self.use_sigmoid)\n\n        if weight is not None:\n            loss = loss * weight\n        if avg_factor is None:\n            if self.reduction == 'none':\n                return loss\n            elif self.reduction == 'mean':\n                return loss.mean()\n            elif self.reduction == 'sum':\n                return loss.sum()\n        else:\n            # if reduction is mean, then average the loss by avg_factor\n            if self.reduction == 'mean':\n                loss = loss.sum() / avg_factor\n            # if reduction is 'none', then do nothing, otherwise raise an error\n            elif self.reduction != 'none':\n                raise ValueError(\n                    'avg_factor can not be used with reduction=\"sum\"')\n        return loss\n\n\n@register\n@serializable\nclass DistributionFocalLoss(nn.Layer):\n    \"\"\"Distribution Focal Loss (DFL) is a variant of `Generalized Focal Loss:\n    Learning Qualified and Distributed Bounding Boxes for Dense Object\n    Detection <https://arxiv.org/abs/2006.04388>`_.\n    Args:\n        reduction (str): Options are `'none'`, `'mean'` and `'sum'`.\n        loss_weight (float): Loss weight of current loss.\n    \"\"\"\n\n    def __init__(self, reduction='mean', loss_weight=1.0):\n        super(DistributionFocalLoss, self).__init__()\n        assert reduction in ('none', 'mean', 'sum')\n        self.reduction = reduction\n        self.loss_weight = loss_weight\n\n    def forward(self, pred, target, weight=None, avg_factor=None):\n        \"\"\"Forward function.\n        Args:\n            pred (Tensor): Predicted general distribution of bounding\n                boxes (before softmax) with shape (N, n+1), n is the max value\n                of the integral set `{0, ..., n}` in paper.\n            target (Tensor): Target distance label for bounding boxes\n                with shape (N,).\n            weight (Tensor, optional): The weight of loss for each\n                prediction. Defaults to None.\n            avg_factor (int, optional): Average factor that is used to average\n                the loss. Defaults to None.\n        \"\"\"\n        loss = self.loss_weight * distribution_focal_loss(pred, target)\n        if weight is not None:\n            loss = loss * weight\n        if avg_factor is None:\n            if self.reduction == 'none':\n                return loss\n            elif self.reduction == 'mean':\n                return loss.mean()\n            elif self.reduction == 'sum':\n                return loss.sum()\n        else:\n            # if reduction is mean, then average the loss by avg_factor\n            if self.reduction == 'mean':\n                loss = loss.sum() / avg_factor\n            # if reduction is 'none', then do nothing, otherwise raise an error\n            elif self.reduction != 'none':\n                raise ValueError(\n                    'avg_factor can not be used with reduction=\"sum\"')\n        return loss\n"
  },
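A quick shape-level smoke test for the two losses defined in gfocal_loss.py above. This is a hedged sketch: the anchor count, class count, and `reg_max` are illustrative assumptions, not values from any config.

```python
# Smoke test for QualityFocalLoss / DistributionFocalLoss; shapes follow the
# docstrings, all numbers are made up for illustration.
import paddle

num_anchors, num_classes, reg_max = 8, 80, 7

# QFL: joint cls/IoU-quality logits, one row per anchor; label == num_classes
# marks background, and `quality` is the IoU of the matched GT box.
pred_logits = paddle.randn([num_anchors, num_classes])
labels = paddle.randint(0, num_classes + 1, [num_anchors])
quality = paddle.rand([num_anchors])
qfl = QualityFocalLoss(use_sigmoid=True, beta=2.0, reduction='mean')
print(qfl(pred_logits, (labels, quality)))  # scalar loss

# DFL: distance distribution over the integral set {0, ..., reg_max}; the
# continuous targets must stay below reg_max so that dis_right <= reg_max.
pred_dist = paddle.randn([num_anchors, reg_max + 1])
target_dist = paddle.rand([num_anchors]) * (reg_max - 1)
dfl = DistributionFocalLoss(reduction='mean')
print(dfl(pred_dist, target_dist))
```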
  {
    "path": "ppdet/modeling/losses/iou_aware_loss.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register, serializable\nfrom .iou_loss import IouLoss\nfrom ..bbox_utils import bbox_iou\n\n\n@register\n@serializable\nclass IouAwareLoss(IouLoss):\n    \"\"\"\n    iou aware loss, see https://arxiv.org/abs/1912.05992\n    Args:\n        loss_weight (float): iou aware loss weight, default is 1.0\n        max_height (int): max height of input to support random shape input\n        max_width (int): max width of input to support random shape input\n    \"\"\"\n\n    def __init__(self, loss_weight=1.0, giou=False, diou=False, ciou=False):\n        super(IouAwareLoss, self).__init__(\n            loss_weight=loss_weight, giou=giou, diou=diou, ciou=ciou)\n\n    def __call__(self, ioup, pbox, gbox):\n        iou = bbox_iou(\n            pbox, gbox, giou=self.giou, diou=self.diou, ciou=self.ciou)\n        iou.stop_gradient = True\n        loss_iou_aware = F.binary_cross_entropy_with_logits(\n            ioup, iou, reduction='none')\n        loss_iou_aware = loss_iou_aware * self.loss_weight\n        return loss_iou_aware\n"
  },
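For context, IouAwareLoss is consumed by the YOLOv3-style heads, where boxes arrive as `[x, y, w, h]` lists of per-anchor tensors (the layout `bbox_utils.bbox_iou` expects). A shape-level sketch, with every dimension below an invented assumption:

```python
# Hedged sketch: feed IouAwareLoss with decoded center-format boxes and raw
# IoU-prediction logits, assuming bbox_iou accepts [x, y, w, h] tensor lists.
import paddle

shape = [2, 3, 13, 13, 1]                      # [batch, anchors, grid_h, grid_w, 1]
pbox = [paddle.rand(shape) for _ in range(4)]  # predicted x, y, w, h
gbox = [paddle.rand(shape) for _ in range(4)]  # target x, y, w, h
ioup = paddle.randn(shape)                     # IoU-awareness logits

loss_fn = IouAwareLoss(loss_weight=1.0)
loss = loss_fn(ioup, pbox, gbox)               # elementwise BCE(ioup, detached IoU)
print(loss.shape)
```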
  {
    "path": "ppdet/modeling/losses/iou_loss.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport numpy as np\nimport math\nimport paddle\n\nfrom ppdet.core.workspace import register, serializable\nfrom ..bbox_utils import bbox_iou\n\n__all__ = ['IouLoss', 'GIoULoss', 'DIouLoss', 'SIoULoss']\n\n\n@register\n@serializable\nclass IouLoss(object):\n    \"\"\"\n    iou loss, see https://arxiv.org/abs/1908.03851\n    loss = 1.0 - iou * iou\n    Args:\n        loss_weight (float): iou loss weight, default is 2.5\n        max_height (int): max height of input to support random shape input\n        max_width (int): max width of input to support random shape input\n        ciou_term (bool): whether to add ciou_term\n        loss_square (bool): whether to square the iou term\n    \"\"\"\n\n    def __init__(self,\n                 loss_weight=2.5,\n                 giou=False,\n                 diou=False,\n                 ciou=False,\n                 loss_square=True):\n        self.loss_weight = loss_weight\n        self.giou = giou\n        self.diou = diou\n        self.ciou = ciou\n        self.loss_square = loss_square\n\n    def __call__(self, pbox, gbox):\n        iou = bbox_iou(\n            pbox, gbox, giou=self.giou, diou=self.diou, ciou=self.ciou)\n        if self.loss_square:\n            loss_iou = 1 - iou * iou\n        else:\n            loss_iou = 1 - iou\n\n        loss_iou = loss_iou * self.loss_weight\n        return loss_iou\n\n\n@register\n@serializable\nclass GIoULoss(object):\n    \"\"\"\n    Generalized Intersection over Union, see https://arxiv.org/abs/1902.09630\n    Args:\n        loss_weight (float): giou loss weight, default as 1\n        eps (float): epsilon to avoid divide by zero, default as 1e-10\n        reduction (string): Options are \"none\", \"mean\" and \"sum\". 
default as none\n    \"\"\"\n\n    def __init__(self, loss_weight=1., eps=1e-10, reduction='none'):\n        self.loss_weight = loss_weight\n        self.eps = eps\n        assert reduction in ('none', 'mean', 'sum')\n        self.reduction = reduction\n\n    def bbox_overlap(self, box1, box2, eps=1e-10):\n        \"\"\"calculate the iou of box1 and box2\n        Args:\n            box1 (Tensor): box1 with the shape (..., 4)\n            box2 (Tensor): box1 with the shape (..., 4)\n            eps (float): epsilon to avoid divide by zero\n        Return:\n            iou (Tensor): iou of box1 and box2\n            overlap (Tensor): overlap of box1 and box2\n            union (Tensor): union of box1 and box2\n        \"\"\"\n        x1, y1, x2, y2 = box1\n        x1g, y1g, x2g, y2g = box2\n\n        xkis1 = paddle.maximum(x1, x1g)\n        ykis1 = paddle.maximum(y1, y1g)\n        xkis2 = paddle.minimum(x2, x2g)\n        ykis2 = paddle.minimum(y2, y2g)\n        w_inter = (xkis2 - xkis1).clip(0)\n        h_inter = (ykis2 - ykis1).clip(0)\n        overlap = w_inter * h_inter\n\n        area1 = (x2 - x1) * (y2 - y1)\n        area2 = (x2g - x1g) * (y2g - y1g)\n        union = area1 + area2 - overlap + eps\n        iou = overlap / union\n\n        return iou, overlap, union\n\n    def __call__(self, pbox, gbox, iou_weight=1., loc_reweight=None):\n        x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1)\n        x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1)\n        box1 = [x1, y1, x2, y2]\n        box2 = [x1g, y1g, x2g, y2g]\n        iou, overlap, union = self.bbox_overlap(box1, box2, self.eps)\n        xc1 = paddle.minimum(x1, x1g)\n        yc1 = paddle.minimum(y1, y1g)\n        xc2 = paddle.maximum(x2, x2g)\n        yc2 = paddle.maximum(y2, y2g)\n\n        area_c = (xc2 - xc1) * (yc2 - yc1) + self.eps\n        miou = iou - ((area_c - union) / area_c)\n        if loc_reweight is not None:\n            loc_reweight = paddle.reshape(loc_reweight, shape=(-1, 1))\n            loc_thresh = 0.9\n            giou = 1 - (1 - loc_thresh\n                        ) * miou - loc_thresh * miou * loc_reweight\n        else:\n            giou = 1 - miou\n        if self.reduction == 'none':\n            loss = giou\n        elif self.reduction == 'sum':\n            loss = paddle.sum(giou * iou_weight)\n        else:\n            loss = paddle.mean(giou * iou_weight)\n        return loss * self.loss_weight\n\n\n@register\n@serializable\nclass DIouLoss(GIoULoss):\n    \"\"\"\n    Distance-IoU Loss, see https://arxiv.org/abs/1911.08287\n    Args:\n        loss_weight (float): giou loss weight, default as 1\n        eps (float): epsilon to avoid divide by zero, default as 1e-10\n        use_complete_iou_loss (bool): whether to use complete iou loss\n    \"\"\"\n\n    def __init__(self, loss_weight=1., eps=1e-10, use_complete_iou_loss=True):\n        super(DIouLoss, self).__init__(loss_weight=loss_weight, eps=eps)\n        self.use_complete_iou_loss = use_complete_iou_loss\n\n    def __call__(self, pbox, gbox, iou_weight=1.):\n        x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1)\n        x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1)\n        cx = (x1 + x2) / 2\n        cy = (y1 + y2) / 2\n        w = x2 - x1\n        h = y2 - y1\n\n        cxg = (x1g + x2g) / 2\n        cyg = (y1g + y2g) / 2\n        wg = x2g - x1g\n        hg = y2g - y1g\n\n        x2 = paddle.maximum(x1, x2)\n        y2 = paddle.maximum(y1, y2)\n\n        # A and B\n   
     xkis1 = paddle.maximum(x1, x1g)\n        ykis1 = paddle.maximum(y1, y1g)\n        xkis2 = paddle.minimum(x2, x2g)\n        ykis2 = paddle.minimum(y2, y2g)\n\n        # A or B\n        xc1 = paddle.minimum(x1, x1g)\n        yc1 = paddle.minimum(y1, y1g)\n        xc2 = paddle.maximum(x2, x2g)\n        yc2 = paddle.maximum(y2, y2g)\n\n        intsctk = (xkis2 - xkis1) * (ykis2 - ykis1)\n        intsctk = intsctk * paddle.greater_than(\n            xkis2, xkis1).astype(intsctk.dtype) * paddle.greater_than(ykis2, ykis1).astype(intsctk.dtype)\n        unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g\n                                                        ) - intsctk + self.eps\n        iouk = intsctk / unionk\n\n        # DIOU term\n        dist_intersection = (cx - cxg) * (cx - cxg) + (cy - cyg) * (cy - cyg)\n        dist_union = (xc2 - xc1) * (xc2 - xc1) + (yc2 - yc1) * (yc2 - yc1)\n        diou_term = (dist_intersection + self.eps) / (dist_union + self.eps)\n\n        # CIOU term\n        ciou_term = 0\n        if self.use_complete_iou_loss:\n            ar_gt = wg / hg\n            ar_pred = w / h\n            arctan = paddle.atan(ar_gt) - paddle.atan(ar_pred)\n            ar_loss = 4. / np.pi / np.pi * arctan * arctan\n            alpha = ar_loss / (1 - iouk + ar_loss + self.eps)\n            alpha.stop_gradient = True\n            ciou_term = alpha * ar_loss\n\n        diou = paddle.mean((1 - iouk + ciou_term + diou_term) * iou_weight)\n\n        return diou * self.loss_weight\n\n\n@register\n@serializable\nclass SIoULoss(GIoULoss):\n    \"\"\"\n    see https://arxiv.org/pdf/2205.12740.pdf \n    Args:\n        loss_weight (float): siou loss weight, default as 1\n        eps (float): epsilon to avoid divide by zero, default as 1e-10\n        theta (float): default as 4\n        reduction (str): Options are \"none\", \"mean\" and \"sum\". 
default as none\n    \"\"\"\n\n    def __init__(self, loss_weight=1., eps=1e-10, theta=4., reduction='none'):\n        super(SIoULoss, self).__init__(loss_weight=loss_weight, eps=eps)\n        self.loss_weight = loss_weight\n        self.eps = eps\n        self.theta = theta\n        self.reduction = reduction\n\n    def __call__(self, pbox, gbox):\n        x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1)\n        x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1)\n\n        box1 = [x1, y1, x2, y2]\n        box2 = [x1g, y1g, x2g, y2g]\n        iou = bbox_iou(box1, box2)\n\n        cx = (x1 + x2) / 2\n        cy = (y1 + y2) / 2\n        w = x2 - x1 + self.eps\n        h = y2 - y1 + self.eps\n\n        cxg = (x1g + x2g) / 2\n        cyg = (y1g + y2g) / 2\n        wg = x2g - x1g + self.eps\n        hg = y2g - y1g + self.eps\n\n        x2 = paddle.maximum(x1, x2)\n        y2 = paddle.maximum(y1, y2)\n\n        # A or B\n        xc1 = paddle.minimum(x1, x1g)\n        yc1 = paddle.minimum(y1, y1g)\n        xc2 = paddle.maximum(x2, x2g)\n        yc2 = paddle.maximum(y2, y2g)\n\n        cw_out = xc2 - xc1\n        ch_out = yc2 - yc1\n\n        ch = paddle.maximum(cy, cyg) - paddle.minimum(cy, cyg)\n        cw = paddle.maximum(cx, cxg) - paddle.minimum(cx, cxg)\n\n        # angle cost\n        dist_intersection = paddle.sqrt((cx - cxg)**2 + (cy - cyg)**2)\n        sin_angle_alpha = ch / dist_intersection\n        sin_angle_beta = cw / dist_intersection\n        thred = paddle.pow(paddle.to_tensor(2), 0.5) / 2\n        thred.stop_gradient = True\n        sin_alpha = paddle.where(sin_angle_alpha > thred, sin_angle_beta,\n                                 sin_angle_alpha)\n        angle_cost = paddle.cos(paddle.asin(sin_alpha) * 2 - math.pi / 2)\n\n        # distance cost\n        gamma = 2 - angle_cost\n        # gamma.stop_gradient = True\n        beta_x = ((cxg - cx) / cw_out)**2\n        beta_y = ((cyg - cy) / ch_out)**2\n        dist_cost = 1 - paddle.exp(-gamma * beta_x) + 1 - paddle.exp(-gamma *\n                                                                     beta_y)\n\n        # shape cost\n        omega_w = paddle.abs(w - wg) / paddle.maximum(w, wg)\n        omega_h = paddle.abs(hg - h) / paddle.maximum(h, hg)\n        omega = (1 - paddle.exp(-omega_w))**self.theta + (\n            1 - paddle.exp(-omega_h))**self.theta\n        siou_loss = 1 - iou + (omega + dist_cost) / 2\n\n        if self.reduction == 'mean':\n            siou_loss = paddle.mean(siou_loss)\n        elif self.reduction == 'sum':\n            siou_loss = paddle.sum(siou_loss)\n\n        return siou_loss * self.loss_weight\n"
  },
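The corner-format losses in iou_loss.py can be exercised directly on small tensors. A minimal sketch (box values invented for illustration):

```python
# GIoULoss / DIouLoss consume axis-aligned boxes as (x1, y1, x2, y2) rows.
import paddle

pbox = paddle.to_tensor([[0., 0., 10., 10.], [5., 5., 15., 15.]])
gbox = paddle.to_tensor([[1., 1., 11., 11.], [5., 5., 15., 15.]])

giou_loss = GIoULoss(loss_weight=1., reduction='mean')
print(giou_loss(pbox, gbox))   # 0 only when every pair matches exactly

diou_loss = DIouLoss(use_complete_iou_loss=True)
print(diou_loss(pbox, gbox))   # CIoU variant, already mean-reduced
```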
  {
    "path": "ppdet/modeling/losses/jde_loss.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register\n\n__all__ = ['JDEDetectionLoss', 'JDEEmbeddingLoss', 'JDELoss']\n\n\n@register\nclass JDEDetectionLoss(nn.Layer):\n    __shared__ = ['num_classes']\n\n    def __init__(self, num_classes=1, for_mot=True):\n        super(JDEDetectionLoss, self).__init__()\n        self.num_classes = num_classes\n        self.for_mot = for_mot\n\n    def det_loss(self, p_det, anchor, t_conf, t_box):\n        pshape = paddle.shape(p_det)\n        pshape.stop_gradient = True\n        nB, nGh, nGw = pshape[0], pshape[-2], pshape[-1]\n        nA = len(anchor)\n        p_det = paddle.reshape(\n            p_det, [nB, nA, self.num_classes + 5, nGh, nGw]).transpose(\n                (0, 1, 3, 4, 2))\n\n        # 1. loss_conf: cross_entropy\n        p_conf = p_det[:, :, :, :, 4:6]\n        p_conf_flatten = paddle.reshape(p_conf, [-1, 2])\n        t_conf_flatten = t_conf.flatten()\n        t_conf_flatten = paddle.cast(t_conf_flatten, dtype=\"int64\")\n        t_conf_flatten.stop_gradient = True\n        loss_conf = F.cross_entropy(\n            p_conf_flatten, t_conf_flatten, ignore_index=-1, reduction='mean')\n        loss_conf.stop_gradient = False\n\n        # 2. 
loss_box: smooth_l1_loss\n        p_box = p_det[:, :, :, :, :4]\n        p_box_flatten = paddle.reshape(p_box, [-1, 4])\n        t_box_flatten = paddle.reshape(t_box, [-1, 4])\n        fg_inds = paddle.nonzero(t_conf_flatten > 0).flatten()\n        if fg_inds.numel() > 0:\n            reg_delta = paddle.gather(p_box_flatten, fg_inds)\n            reg_target = paddle.gather(t_box_flatten, fg_inds)\n        else:\n            reg_delta = paddle.to_tensor([0, 0, 0, 0], dtype='float32')\n            reg_delta.stop_gradient = False\n            reg_target = paddle.to_tensor([0, 0, 0, 0], dtype='float32')\n        reg_target.stop_gradient = True\n        loss_box = F.smooth_l1_loss(\n            reg_delta, reg_target, reduction='mean', delta=1.0)\n        loss_box.stop_gradient = False\n\n        return loss_conf, loss_box\n\n    def forward(self, det_outs, targets, anchors):\n        \"\"\"\n        Args:\n            det_outs (list[Tensor]): output from detection head, each one\n                is a 4-D Tensor with shape [N, C, H, W].\n            targets (dict): contains 'im_id', 'gt_bbox', 'gt_ide', 'image',\n                'im_shape', 'scale_factor' and 'tbox', 'tconf', 'tide' of\n                each FPN level.\n            anchors (list[list]): anchor settings of the JDE model, laid out\n                as N rows by M columns, where N is the number of anchor\n                levels (FPN levels) and M is the number of anchor scales\n                per level.\n        \"\"\"\n        assert len(det_outs) == len(anchors)\n        loss_confs = []\n        loss_boxes = []\n        for i, (p_det, anchor) in enumerate(zip(det_outs, anchors)):\n            t_conf = targets['tconf{}'.format(i)]\n            t_box = targets['tbox{}'.format(i)]\n\n            loss_conf, loss_box = self.det_loss(p_det, anchor, t_conf, t_box)\n            loss_confs.append(loss_conf)\n            loss_boxes.append(loss_box)\n        if self.for_mot:\n            return {'loss_confs': loss_confs, 'loss_boxes': loss_boxes}\n        else:\n            jde_conf_losses = sum(loss_confs)\n            jde_box_losses = sum(loss_boxes)\n            jde_det_losses = {\n                \"loss_conf\": jde_conf_losses,\n                \"loss_box\": jde_box_losses,\n                \"loss\": jde_conf_losses + jde_box_losses,\n            }\n            return jde_det_losses\n\n\n@register\nclass JDEEmbeddingLoss(nn.Layer):\n    def __init__(self, ):\n        super(JDEEmbeddingLoss, self).__init__()\n        self.phony = self.create_parameter(shape=[1], dtype=\"float32\")\n\n    def emb_loss(self, p_ide, t_conf, t_ide, emb_scale, classifier):\n        emb_dim = p_ide.shape[1]\n        p_ide = p_ide.transpose((0, 2, 3, 1))\n        p_ide_flatten = paddle.reshape(p_ide, [-1, emb_dim])\n        mask = t_conf > 0\n        mask = paddle.cast(mask, dtype=\"int64\")\n        mask.stop_gradient = True\n        emb_mask = mask.max(1).flatten()\n        emb_mask_inds = paddle.nonzero(emb_mask > 0).flatten()\n        emb_mask_inds.stop_gradient = True\n        # use max(1) to decide the id, TODO: a more reasonable strategy\n        t_ide_flatten = t_ide.max(1).flatten()\n        t_ide_flatten = paddle.cast(t_ide_flatten, dtype=\"int64\")\n        valid_inds = paddle.nonzero(t_ide_flatten != -1).flatten()\n\n        if emb_mask_inds.numel() == 0 or valid_inds.numel() == 0:\n            # a plain paddle.to_tensor([0]) would break gradient backward, so\n            # return the phony parameter multiplied by 0 to keep the loss\n            # connected to the graph\n            loss_ide = self.phony * 0\n        else:\n            embedding = paddle.gather(p_ide_flatten, emb_mask_inds)\n            embedding = 
emb_scale * F.normalize(embedding)\n            logits = classifier(embedding)\n\n            ide_target = paddle.gather(t_ide_flatten, emb_mask_inds)\n\n            loss_ide = F.cross_entropy(\n                logits, ide_target, ignore_index=-1, reduction='mean')\n        loss_ide.stop_gradient = False\n\n        return loss_ide\n\n    def forward(self, ide_outs, targets, emb_scale, classifier):\n        loss_ides = []\n        for i, p_ide in enumerate(ide_outs):\n            t_conf = targets['tconf{}'.format(i)]\n            t_ide = targets['tide{}'.format(i)]\n\n            loss_ide = self.emb_loss(p_ide, t_conf, t_ide, emb_scale,\n                                     classifier)\n            loss_ides.append(loss_ide)\n        return loss_ides\n\n\n@register\nclass JDELoss(nn.Layer):\n    def __init__(self):\n        super(JDELoss, self).__init__()\n\n    def forward(self, loss_confs, loss_boxes, loss_ides, loss_params_cls,\n                loss_params_reg, loss_params_ide, targets):\n        assert len(loss_confs) == len(loss_boxes) == len(loss_ides)\n        assert len(loss_params_cls) == len(loss_params_reg) == len(\n            loss_params_ide)\n        assert len(loss_confs) == len(loss_params_cls)\n\n        batchsize = targets['gt_bbox'].shape[0]\n        nTargets = paddle.nonzero(paddle.sum(targets['gt_bbox'], axis=2)).shape[\n            0] / batchsize\n        nTargets = paddle.to_tensor(nTargets, dtype='float32')\n        nTargets.stop_gradient = True\n\n        jde_losses = []\n        for i, (loss_conf, loss_box, loss_ide, l_conf_p, l_box_p,\n                l_ide_p) in enumerate(\n                    zip(loss_confs, loss_boxes, loss_ides, loss_params_cls,\n                        loss_params_reg, loss_params_ide)):\n\n            jde_loss = l_conf_p(loss_conf) + l_box_p(loss_box) + l_ide_p(\n                loss_ide)\n            jde_losses.append(jde_loss)\n\n        loss_all = {\n            \"loss_conf\": sum(loss_confs),\n            \"loss_box\": sum(loss_boxes),\n            \"loss_ide\": sum(loss_ides),\n            \"loss\": sum(jde_losses),\n            \"nTargets\": nTargets,\n        }\n        return loss_all\n"
  },
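JDEDetectionLoss needs targets laid out per FPN level. Below is a shape-level sketch with a single dummy level; the sizes and anchors are invented, and `tconf` uses 1/0/-1 for positive/negative/ignored cells as `det_loss` expects:

```python
# Dummy single-level input for JDEDetectionLoss with num_classes=1, where the
# head emits 6 channels per anchor (4 box deltas + 2 conf logits).
import paddle

nB, nA, nGh, nGw = 2, 4, 10, 18
anchors = [[[10, 13], [16, 30], [33, 23], [30, 61]]]  # one level, nA scales
det_outs = [paddle.randn([nB, nA * 6, nGh, nGw])]
targets = {
    'tconf0': paddle.randint(-1, 2, [nB, nA, nGh, nGw]),  # 1 fg / 0 bg / -1 ignore
    'tbox0': paddle.rand([nB, nA, nGh, nGw, 4]),
}
loss_fn = JDEDetectionLoss(num_classes=1, for_mot=False)
print(loss_fn(det_outs, targets, anchors))  # {'loss_conf', 'loss_box', 'loss'}
```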
  {
    "path": "ppdet/modeling/losses/keypoint_loss.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom itertools import cycle, islice\nfrom collections import abc\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\n\nfrom ppdet.core.workspace import register, serializable\n\n__all__ = ['HrHRNetLoss', 'KeyPointMSELoss', 'OKSLoss', 'CenterFocalLoss', 'L1Loss']\n\n\n@register\n@serializable\nclass KeyPointMSELoss(nn.Layer):\n    def __init__(self, use_target_weight=True, loss_scale=0.5):\n        \"\"\"\n        KeyPointMSELoss layer\n\n        Args:\n            use_target_weight (bool): whether to use target weight\n        \"\"\"\n        super(KeyPointMSELoss, self).__init__()\n        self.criterion = nn.MSELoss(reduction='mean')\n        self.use_target_weight = use_target_weight\n        self.loss_scale = loss_scale\n\n    def forward(self, output, records):\n        target = records['target']\n        target_weight = records['target_weight']\n        batch_size = output.shape[0]\n        num_joints = output.shape[1]\n        heatmaps_pred = output.reshape(\n            (batch_size, num_joints, -1)).split(num_joints, 1)\n        heatmaps_gt = target.reshape(\n            (batch_size, num_joints, -1)).split(num_joints, 1)\n        loss = 0\n        for idx in range(num_joints):\n            heatmap_pred = heatmaps_pred[idx].squeeze()\n            heatmap_gt = heatmaps_gt[idx].squeeze()\n            if self.use_target_weight:\n                loss += self.loss_scale * self.criterion(\n                    heatmap_pred.multiply(target_weight[:, idx]),\n                    heatmap_gt.multiply(target_weight[:, idx]))\n            else:\n                loss += self.loss_scale * self.criterion(heatmap_pred,\n                                                         heatmap_gt)\n        keypoint_losses = dict()\n        keypoint_losses['loss'] = loss / num_joints\n        return keypoint_losses\n\n\n@register\n@serializable\nclass HrHRNetLoss(nn.Layer):\n    def __init__(self, num_joints, swahr):\n        \"\"\"\n        HrHRNetLoss layer\n\n        Args:\n            num_joints (int): number of keypoints\n        \"\"\"\n        super(HrHRNetLoss, self).__init__()\n        if swahr:\n            self.heatmaploss = HeatMapSWAHRLoss(num_joints)\n        else:\n            self.heatmaploss = HeatMapLoss()\n        self.aeloss = AELoss()\n        self.ziploss = ZipLoss(\n            [self.heatmaploss, self.heatmaploss, self.aeloss])\n\n    def forward(self, inputs, records):\n        targets = []\n        targets.append([records['heatmap_gt1x'], records['mask_1x']])\n        targets.append([records['heatmap_gt2x'], records['mask_2x']])\n        targets.append(records['tagmap'])\n        keypoint_losses = dict()\n        loss = self.ziploss(inputs, targets)\n        keypoint_losses['heatmap_loss'] = loss[0] + loss[1]\n        
keypoint_losses['pull_loss'] = loss[2][0]\n        keypoint_losses['push_loss'] = loss[2][1]\n        keypoint_losses['loss'] = recursive_sum(loss)\n        return keypoint_losses\n\n\nclass HeatMapLoss(object):\n    def __init__(self, loss_factor=1.0):\n        super(HeatMapLoss, self).__init__()\n        self.loss_factor = loss_factor\n\n    def __call__(self, preds, targets):\n        heatmap, mask = targets\n        loss = ((preds - heatmap)**2 * mask.cast('float').unsqueeze(1))\n        loss = paddle.clip(loss, min=0, max=2).mean()\n        loss *= self.loss_factor\n        return loss\n\n\nclass HeatMapSWAHRLoss(object):\n    def __init__(self, num_joints, loss_factor=1.0):\n        super(HeatMapSWAHRLoss, self).__init__()\n        self.loss_factor = loss_factor\n        self.num_joints = num_joints\n\n    def __call__(self, preds, targets):\n        heatmaps_gt, mask = targets\n        heatmaps_pred = preds[0]\n        scalemaps_pred = preds[1]\n\n        heatmaps_scaled_gt = paddle.where(heatmaps_gt > 0, 0.5 * heatmaps_gt * (\n            1 + (1 +\n                 (scalemaps_pred - 1.) * paddle.log(heatmaps_gt + 1e-10))**2),\n                                          heatmaps_gt)\n\n        regularizer_loss = paddle.mean(\n            paddle.pow((scalemaps_pred - 1.) * (heatmaps_gt > 0).astype(float),\n                       2))\n        omiga = 0.01\n        # thres = 2**(-1/omiga), threshold for positive weight\n        hm_weight = heatmaps_scaled_gt**(\n            omiga\n        ) * paddle.abs(1 - heatmaps_pred) + paddle.abs(heatmaps_pred) * (\n            1 - heatmaps_scaled_gt**(omiga))\n\n        loss = (((heatmaps_pred - heatmaps_scaled_gt)**2) *\n                mask.cast('float').unsqueeze(1)) * hm_weight\n        loss = loss.mean()\n        loss = self.loss_factor * (loss + 1.0 * regularizer_loss)\n        return loss\n\n\nclass AELoss(object):\n    def __init__(self, pull_factor=0.001, push_factor=0.001):\n        super(AELoss, self).__init__()\n        self.pull_factor = pull_factor\n        self.push_factor = push_factor\n\n    def apply_single(self, pred, tagmap):\n        if tagmap.numpy()[:, :, 3].sum() == 0:\n            return (paddle.zeros([1]), paddle.zeros([1]))\n        nonzero = paddle.nonzero(tagmap[:, :, 3] > 0)\n        if nonzero.shape[0] == 0:\n            return (paddle.zeros([1]), paddle.zeros([1]))\n        p_inds = paddle.unique(nonzero[:, 0])\n        num_person = p_inds.shape[0]\n        if num_person == 0:\n            return (paddle.zeros([1]), paddle.zeros([1]))\n\n        pull = 0\n        tagpull_num = 0\n        embs_all = []\n        person_unvalid = 0\n        for person_idx in p_inds.numpy():\n            valid_single = tagmap[person_idx.item()]\n            validkpts = paddle.nonzero(valid_single[:, 3] > 0)\n            valid_single = paddle.index_select(valid_single, validkpts)\n            emb = paddle.gather_nd(pred, valid_single[:, :3])\n            if emb.shape[0] == 1:\n                person_unvalid += 1\n            mean = paddle.mean(emb, axis=0)\n            embs_all.append(mean)\n            pull += paddle.mean(paddle.pow(emb - mean, 2), axis=0)\n            tagpull_num += emb.shape[0]\n        pull /= max(num_person - person_unvalid, 1)\n        if num_person < 2:\n            return pull, paddle.zeros([1])\n\n        embs_all = paddle.stack(embs_all)\n        A = embs_all.expand([num_person, num_person])\n        B = A.transpose([1, 0])\n        diff = A - B\n\n        diff = paddle.pow(diff, 2)\n        push = 
paddle.exp(-diff)\n        push = paddle.sum(push) - num_person\n\n        push /= 2 * num_person * (num_person - 1)\n        return pull, push\n\n    def __call__(self, preds, tagmaps):\n        bs = preds.shape[0]\n        losses = [\n            self.apply_single(preds[i:i + 1].squeeze(),\n                              tagmaps[i:i + 1].squeeze()) for i in range(bs)\n        ]\n        pull = self.pull_factor * sum(loss[0] for loss in losses) / len(losses)\n        push = self.push_factor * sum(loss[1] for loss in losses) / len(losses)\n        return pull, push\n\n\nclass ZipLoss(object):\n    def __init__(self, loss_funcs):\n        super(ZipLoss, self).__init__()\n        self.loss_funcs = loss_funcs\n\n    def __call__(self, inputs, targets):\n        assert len(self.loss_funcs) == len(targets) >= len(inputs)\n\n        def zip_repeat(*args):\n            longest = max(map(len, args))\n            filled = [islice(cycle(x), longest) for x in args]\n            return zip(*filled)\n\n        return tuple(\n            fn(x, y)\n            for x, y, fn in zip_repeat(inputs, targets, self.loss_funcs))\n\n\ndef recursive_sum(inputs):\n    if isinstance(inputs, abc.Sequence):\n        return sum([recursive_sum(x) for x in inputs])\n    return inputs\n\n\ndef oks_overlaps(kpt_preds, kpt_gts, kpt_valids, kpt_areas, sigmas):\n    if not kpt_gts.astype('bool').any():\n        return kpt_preds.sum()*0\n    \n    sigmas = paddle.to_tensor(sigmas, dtype=kpt_preds.dtype)\n    variances = (sigmas * 2)**2\n\n    assert kpt_preds.shape[0] == kpt_gts.shape[0]\n    kpt_preds = kpt_preds.reshape((-1, kpt_preds.shape[-1] // 2, 2))\n    kpt_gts = kpt_gts.reshape((-1, kpt_gts.shape[-1] // 2, 2))\n\n    squared_distance = (kpt_preds[:, :, 0] - kpt_gts[:, :, 0]) ** 2 + \\\n        (kpt_preds[:, :, 1] - kpt_gts[:, :, 1]) ** 2\n    assert (kpt_valids.sum(-1) > 0).all()\n    squared_distance0 = squared_distance / (\n        kpt_areas[:, None] * variances[None, :] * 2)\n    squared_distance1 = paddle.exp(-squared_distance0)\n    squared_distance1 = squared_distance1 * kpt_valids\n    oks = squared_distance1.sum(axis=1) / kpt_valids.sum(axis=1)\n\n    return oks\n\n\ndef oks_loss(pred,\n             target,\n             weight,\n             valid=None,\n             area=None,\n             linear=False,\n             sigmas=None,\n             eps=1e-6,\n             avg_factor=None, \n             reduction=None):\n    \"\"\"Oks loss.\n\n    Computing the oks loss between a set of predicted poses and target poses.\n    The loss is calculated as negative log of oks.\n\n    Args:\n        pred (Tensor): Predicted poses of format (x1, y1, x2, y2, ...),\n            shape (n, K*2).\n        target (Tensor): Corresponding gt poses, shape (n, K*2).\n        linear (bool, optional): If True, use linear scale of loss instead of\n            log scale. Default: False.\n        eps (float): Eps to avoid log(0).\n\n    Returns:\n        Tensor: Loss tensor.\n    \"\"\"\n    oks = oks_overlaps(pred, target, valid, area, sigmas).clip(min=eps)\n    if linear:\n        loss = 1 - oks\n    else:\n        loss = -oks.log()\n\n    if weight is not None:\n        if weight.shape != loss.shape:\n            if weight.shape[0] == loss.shape[0]:\n                # For most cases, weight is of shape (num_priors, ),\n                #  which means it does not have the second axis num_class\n                weight = weight.reshape((-1, 1))\n            else:\n                # Sometimes, weight per anchor per class is also needed. 
e.g.\n                #  in FSAF. But it may be flattened of shape\n                #  (num_priors x num_class, ), while loss is still of shape\n                #  (num_priors, num_class).\n                assert weight.numel() == loss.numel()\n                weight = weight.reshape((loss.shape[0], -1))\n        assert weight.ndim == loss.ndim\n        loss = loss * weight\n\n    # if avg_factor is not specified, just reduce the loss\n    if avg_factor is None:\n        if reduction == 'mean':\n            loss = loss.mean()\n        elif reduction == 'sum':\n            loss = loss.sum()\n    else:\n        # if reduction is mean, then average the loss by avg_factor\n        if reduction == 'mean':\n            # Avoid causing ZeroDivisionError when avg_factor is 0.0,\n            # i.e., all labels of an image belong to ignore index.\n            eps = 1e-10\n            loss = loss.sum() / (avg_factor + eps)\n        # if reduction is 'none', then do nothing, otherwise raise an error\n        elif reduction != 'none':\n            raise ValueError('avg_factor can not be used with reduction=\"sum\"')\n\n\n    return loss\n\n@register\n@serializable\nclass OKSLoss(nn.Layer):\n    \"\"\"OKSLoss.\n\n    Computing the oks loss between a set of predicted poses and target poses.\n\n    Args:\n        linear (bool): If True, use linear scale of loss instead of log scale.\n            Default: False.\n        eps (float): Eps to avoid log(0).\n        reduction (str): Options are \"none\", \"mean\" and \"sum\".\n        loss_weight (float): Weight of loss.\n    \"\"\"\n\n    def __init__(self,\n                 linear=False,\n                 num_keypoints=17,\n                 eps=1e-6,\n                 reduction='mean',\n                 loss_weight=1.0):\n        super(OKSLoss, self).__init__()\n        self.linear = linear\n        self.eps = eps\n        self.reduction = reduction\n        self.loss_weight = loss_weight\n        if num_keypoints == 17:\n            self.sigmas = np.array([\n                .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07,\n                1.07, .87, .87, .89, .89\n            ], dtype=np.float32) / 10.0\n        elif num_keypoints == 14:\n            self.sigmas = np.array([\n                .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89,\n                .79, .79\n            ]) / 10.0\n        else:\n            raise ValueError(f'Unsupported keypoints number {num_keypoints}')\n\n    def forward(self,\n                pred,\n                target,\n                valid,\n                area,\n                weight=None,\n                avg_factor=None,\n                reduction_override=None,\n                **kwargs):\n        \"\"\"Forward function.\n\n        Args:\n            pred (Tensor): The prediction.\n            target (Tensor): The learning target of the prediction.\n            valid (Tensor): The visible flag of the target pose.\n            area (Tensor): The area of the target pose.\n            weight (Tensor, optional): The weight of loss for each\n                prediction. Defaults to None.\n            avg_factor (int, optional): Average factor that is used to average\n                the loss. Defaults to None.\n            reduction_override (str, optional): The reduction method used to\n                override the original reduction method of the loss.\n                Defaults to None. 
Options are \"none\", \"mean\" and \"sum\".\n        \"\"\"\n        assert reduction_override in (None, 'none', 'mean', 'sum')\n        reduction = (\n            reduction_override if reduction_override else self.reduction)\n        if (weight is not None) and (not paddle.any(weight > 0)) and (\n                reduction != 'none'):\n            if pred.dim() == weight.dim() + 1:\n                weight = weight.unsqueeze(1)\n            return (pred * weight).sum()  # 0\n        if weight is not None and weight.dim() > 1:\n            # TODO: remove this in the future\n            # reduce the weight of shape (n, 4) to (n,) to match the\n            # iou_loss of shape (n,)\n            assert weight.shape == pred.shape\n            weight = weight.mean(-1)\n        loss = self.loss_weight * oks_loss(\n            pred,\n            target,\n            weight,\n            valid=valid,\n            area=area,\n            linear=self.linear,\n            sigmas=self.sigmas,\n            eps=self.eps,\n            reduction=reduction,\n            avg_factor=avg_factor,\n            **kwargs)\n        return loss\n\n\ndef center_focal_loss(pred, gt, weight=None, mask=None, avg_factor=None, reduction=None):\n    \"\"\"Modified focal loss. Exactly the same as CornerNet.\n    Runs faster and costs a little bit more memory.\n\n    Args:\n        pred (Tensor): The prediction with shape [bs, c, h, w].\n        gt (Tensor): The learning target of the prediction in gaussian\n            distribution, with shape [bs, c, h, w].\n        mask (Tensor): The valid mask. Defaults to None.\n    \"\"\"\n    if not gt.astype('bool').any():\n        return pred.sum()*0\n    pos_inds = gt.equal(1).astype('float32')\n    if mask is None:\n        neg_inds = gt.less_than(paddle.to_tensor([1], dtype='float32')).astype('float32')\n    else:\n        neg_inds = gt.less_than(paddle.to_tensor([1], dtype='float32')).astype('float32') * mask.equal(0).astype('float32')\n\n    neg_weights = paddle.pow(1 - gt, 4)\n\n    loss = 0\n\n    pos_loss = paddle.log(pred) * paddle.pow(1 - pred, 2) * pos_inds\n    neg_loss = paddle.log(1 - pred) * paddle.pow(pred, 2) * neg_weights * \\\n        neg_inds\n\n    num_pos = pos_inds.astype('float32').sum()\n    pos_loss = pos_loss.sum()\n    neg_loss = neg_loss.sum()\n\n    if num_pos == 0:\n        loss = loss - neg_loss\n    else:\n        loss = loss - (pos_loss + neg_loss) / num_pos\n\n    if weight is not None:\n        if weight.shape != loss.shape:\n            if weight.shape[0] == loss.shape[0]:\n                # For most cases, weight is of shape (num_priors, ),\n                #  which means it does not have the second axis num_class\n                weight = weight.reshape((-1, 1))\n            else:\n                # Sometimes, weight per anchor per class is also needed. e.g.\n                #  in FSAF. 
But it may be flattened of shape\n                #  (num_priors x num_class, ), while loss is still of shape\n                #  (num_priors, num_class).\n                assert weight.numel() == loss.numel()\n                weight = weight.reshape((loss.shape[0], -1))\n        assert weight.ndim == loss.ndim\n        loss = loss * weight\n\n    # if avg_factor is not specified, just reduce the loss\n    if avg_factor is None:\n        if reduction == 'mean':\n            loss = loss.mean()\n        elif reduction == 'sum':\n            loss = loss.sum()\n    else:\n        # if reduction is mean, then average the loss by avg_factor\n        if reduction == 'mean':\n            # Avoid causing ZeroDivisionError when avg_factor is 0.0,\n            # i.e., all labels of an image belong to ignore index.\n            eps = 1e-10\n            loss = loss.sum() / (avg_factor + eps)\n        # if reduction is 'none', then do nothing, otherwise raise an error\n        elif reduction != 'none':\n            raise ValueError('avg_factor can not be used with reduction=\"sum\"')\n\n    return loss\n\n@register\n@serializable\nclass CenterFocalLoss(nn.Layer):\n    \"\"\"CenterFocalLoss is a variant of focal loss.\n\n    More details can be found in the `paper\n    <https://arxiv.org/abs/1808.01244>`_\n\n    Args:\n        reduction (str): Options are \"none\", \"mean\" and \"sum\".\n        loss_weight (float): Loss weight of current loss.\n    \"\"\"\n\n    def __init__(self,\n                 reduction='none',\n                 loss_weight=1.0):\n        super(CenterFocalLoss, self).__init__()\n        self.reduction = reduction\n        self.loss_weight = loss_weight\n\n    def forward(self,\n                pred,\n                target,\n                weight=None,\n                mask=None,\n                avg_factor=None,\n                reduction_override=None):\n        \"\"\"Forward function.\n\n        Args:\n            pred (Tensor): The prediction.\n            target (Tensor): The learning target of the prediction in gaussian\n                distribution.\n            weight (Tensor, optional): The weight of loss for each\n                prediction. Defaults to None.\n            mask (Tensor): The valid mask. Defaults to None.\n            avg_factor (int, optional): Average factor that is used to average\n                the loss. 
Defaults to None.\n            reduction_override (str, optional): The reduction method used to\n                override the original reduction method of the loss.\n                Defaults to None.\n        \"\"\"\n        assert reduction_override in (None, 'none', 'mean', 'sum')\n        reduction = (\n            reduction_override if reduction_override else self.reduction)\n        loss_reg = self.loss_weight * center_focal_loss(\n            pred,\n            target,\n            weight,\n            mask=mask,\n            reduction=reduction,\n            avg_factor=avg_factor)\n        return loss_reg\n\ndef l1_loss(pred, target, weight=None, reduction='mean', avg_factor=None):\n    \"\"\"L1 loss.\n\n    Args:\n        pred (Tensor): The prediction.\n        target (Tensor): The learning target of the prediction.\n\n    Returns:\n        Tensor: Calculated loss\n    \"\"\"\n    if not target.astype('bool').any():\n        return pred.sum() * 0\n\n    assert pred.shape == target.shape\n    loss = paddle.abs(pred - target)\n\n    if weight is not None:\n        if weight.shape != loss.shape:\n            if weight.shape[0] == loss.shape[0]:\n                # For most cases, weight is of shape (num_priors, ),\n                #  which means it does not have the second axis num_class\n                weight = weight.reshape((-1, 1))\n            else:\n                # Sometimes, weight per anchor per class is also needed. e.g.\n                #  in FSAF. But it may be flattened of shape\n                #  (num_priors x num_class, ), while loss is still of shape\n                #  (num_priors, num_class).\n                assert weight.numel() == loss.numel()\n                weight = weight.reshape((loss.shape[0], -1))\n        assert weight.ndim == loss.ndim\n        loss = loss * weight\n\n    # if avg_factor is not specified, just reduce the loss\n    if avg_factor is None:\n        if reduction == 'mean':\n            loss = loss.mean()\n        elif reduction == 'sum':\n            loss = loss.sum()\n    else:\n        # if reduction is mean, then average the loss by avg_factor\n        if reduction == 'mean':\n            # Avoid causing ZeroDivisionError when avg_factor is 0.0,\n            # i.e., all labels of an image belong to ignore index.\n            eps = 1e-10\n            loss = loss.sum() / (avg_factor + eps)\n        # if reduction is 'none', then do nothing, otherwise raise an error\n        elif reduction != 'none':\n            raise ValueError('avg_factor can not be used with reduction=\"sum\"')\n\n\n    return loss\n\n@register\n@serializable\nclass L1Loss(nn.Layer):\n    \"\"\"L1 loss.\n\n    Args:\n        reduction (str, optional): The method to reduce the loss.\n            Options are \"none\", \"mean\" and \"sum\".\n        loss_weight (float, optional): The weight of loss.\n    \"\"\"\n\n    def __init__(self, reduction='mean', loss_weight=1.0):\n        super(L1Loss, self).__init__()\n        self.reduction = reduction\n        self.loss_weight = loss_weight\n\n    def forward(self,\n                pred,\n                target,\n                weight=None,\n                avg_factor=None,\n                reduction_override=None):\n        \"\"\"Forward function.\n\n        Args:\n            pred (Tensor): The prediction.\n            target (Tensor): The learning target of the prediction.\n            weight (Tensor, optional): The weight of loss for each\n                prediction. 
Defaults to None.\n            avg_factor (int, optional): Average factor that is used to average\n                the loss. Defaults to None.\n            reduction_override (str, optional): The reduction method used to\n                override the original reduction method of the loss.\n                Defaults to None.\n        \"\"\"\n        assert reduction_override in (None, 'none', 'mean', 'sum')\n        reduction = (\n            reduction_override if reduction_override else self.reduction)\n        loss_bbox = self.loss_weight * l1_loss(\n            pred, target, weight, reduction=reduction, avg_factor=avg_factor)\n        return loss_bbox\n\n"
  },
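KeyPointMSELoss, the simplest entry point in keypoint_loss.py, just needs predicted and target heatmaps plus per-joint weights. A hedged sketch (the 17-joint COCO layout and the 64x48 heatmap size are assumptions):

```python
# Heatmap-regression smoke test for KeyPointMSELoss; `records` mimics the
# fields the keypoint dataloader would provide.
import paddle

batch, num_joints, hm_h, hm_w = 2, 17, 64, 48
output = paddle.rand([batch, num_joints, hm_h, hm_w])        # predicted heatmaps
records = {
    'target': paddle.rand([batch, num_joints, hm_h, hm_w]),  # gt gaussian heatmaps
    'target_weight': paddle.ones([batch, num_joints, 1]),    # per-joint visibility
}
loss_fn = KeyPointMSELoss(use_target_weight=True)
print(loss_fn(output, records)['loss'])
```

OKSLoss, CenterFocalLoss, and L1Loss in the same file all follow the same `weight` / `avg_factor` / `reduction_override` calling pattern shown in their forward docstrings.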
  {
    "path": "ppdet/modeling/losses/pose3d_loss.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom itertools import cycle, islice\nfrom collections import abc\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\n\nfrom ppdet.core.workspace import register, serializable\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger('ppdet.engine')\n\n__all__ = ['Pose3DLoss']\n\n\n@register\n@serializable\nclass Pose3DLoss(nn.Layer):\n    def __init__(self, weight_3d=1.0, weight_2d=0.0, reduction='none'):\n        \"\"\"\n        KeyPointMSELoss layer\n\n        Args:\n            weight_3d (float): weight of 3d loss\n            weight_2d (float): weight of 2d loss\n            reduction (bool): whether use reduction to loss\n        \"\"\"\n        super(Pose3DLoss, self).__init__()\n        self.weight_3d = weight_3d\n        self.weight_2d = weight_2d\n        self.criterion_2dpose = nn.MSELoss(reduction=reduction)\n        self.criterion_3dpose = nn.L1Loss(reduction=reduction)\n        self.criterion_smoothl1 = nn.SmoothL1Loss(\n            reduction=reduction, delta=1.0)\n        self.criterion_vertices = nn.L1Loss()\n\n    def forward(self, pred3d, pred2d, inputs):\n        \"\"\"\n        mpjpe: mpjpe loss between 3d joints\n        keypoint_2d_loss: 2d joints loss compute by criterion_2dpose\n        \"\"\"\n        gt_3d_joints = inputs['joints_3d']\n        gt_2d_joints = inputs['joints_2d']\n        has_3d_joints = inputs['has_3d_joints']\n        has_2d_joints = inputs['has_2d_joints']\n\n        loss_3d = mpjpe_focal(pred3d, gt_3d_joints, has_3d_joints)\n        loss = self.weight_3d * loss_3d\n        epoch = inputs['epoch_id']\n        if self.weight_2d > 0:\n            weight = self.weight_2d * pow(0.1, (epoch // 8))\n            if epoch > 8:\n                weight = 0\n            loss_2d = keypoint_2d_loss(self.criterion_2dpose, pred2d,\n                                       gt_2d_joints, has_2d_joints)\n            loss += weight * loss_2d\n        return loss\n\n\ndef filter_3d_joints(pred, gt, has_3d_joints):\n    \"\"\" \n    filter 3d joints\n    \"\"\"\n    gt = gt[has_3d_joints == 1]\n    gt = gt[:, :, :3]\n    pred = pred[has_3d_joints == 1]\n\n    gt_pelvis = (gt[:, 2, :] + gt[:, 3, :]) / 2\n    gt = gt - gt_pelvis[:, None, :]\n    pred_pelvis = (pred[:, 2, :] + pred[:, 3, :]) / 2\n    pred = pred - pred_pelvis[:, None, :]\n    return pred, gt\n\n\ndef mpjpe(pred, gt, has_3d_joints):\n    \"\"\" \n    mPJPE loss\n    \"\"\"\n    pred, gt = filter_3d_joints(pred, gt, has_3d_joints)\n    error = paddle.sqrt((paddle.minimum((pred - gt), paddle.to_tensor(1.2))**2\n                         ).sum(axis=-1)).mean()\n    return error\n\n\ndef mpjpe_focal(pred, gt, has_3d_joints):\n    \"\"\" \n    mPJPE loss\n    \"\"\"\n    pred, gt = filter_3d_joints(pred, gt, has_3d_joints)\n    mse_error 
= ((pred - gt)**2).sum(axis=-1)\n    mpjpe_error = paddle.sqrt(mse_error)\n    mean = mpjpe_error.mean()\n    std = mpjpe_error.std()\n    atte = 2 * F.sigmoid(6 * (mpjpe_error - mean) / std)\n    mse_error *= atte\n    return mse_error.mean()\n\n\ndef mpjpe_mse(pred, gt, has_3d_joints, weight=1.):\n    \"\"\"\n    MSE-style mPJPE loss\n    \"\"\"\n    pred, gt = filter_3d_joints(pred, gt, has_3d_joints)\n    error = (((pred - gt)**2).sum(axis=-1)).mean()\n    return error\n\n\ndef mpjpe_criterion(pred, gt, has_3d_joints, criterion_pose3d):\n    \"\"\"\n    mPJPE loss with a user-defined criterion\n    \"\"\"\n    pred, gt = filter_3d_joints(pred, gt, has_3d_joints)\n    error = paddle.sqrt(criterion_pose3d(pred, gt)).mean()\n    return error\n\n\n@register\n@serializable\ndef weighted_mpjpe(pred, gt, has_3d_joints):\n    \"\"\"\n    Weighted_mPJPE\n    \"\"\"\n    pred, gt = filter_3d_joints(pred, gt, has_3d_joints)\n    # fixed per-joint weights for the 14-joint skeleton\n    weight = paddle.to_tensor(\n        [1.5, 1.3, 1.2, 1.2, 1.3, 1.5, 1.5, 1.3, 1.2, 1.2, 1.3, 1.5, 1., 1.])\n    error = (weight * paddle.linalg.norm(pred - gt, p=2, axis=-1)).mean()\n    return error\n\n\n@register\n@serializable\ndef normed_mpjpe(pred, gt, has_3d_joints):\n    \"\"\"\n    Normalized MPJPE (scale only), adapted from:\n    https://github.com/hrhodin/UnsupervisedGeometryAwareRepresentationLearning/blob/master/losses/poses.py\n    \"\"\"\n    assert pred.shape == gt.shape\n    pred, gt = filter_3d_joints(pred, gt, has_3d_joints)\n\n    norm_predicted = paddle.mean(\n        paddle.sum(pred**2, axis=-1, keepdim=True), axis=-2, keepdim=True)\n    norm_target = paddle.mean(\n        paddle.sum(gt * pred, axis=-1, keepdim=True), axis=-2, keepdim=True)\n    scale = norm_target / norm_predicted\n    # the joints are already filtered and pelvis-centered here, so compute\n    # the error directly instead of calling mpjpe() and filtering twice\n    return paddle.sqrt(((scale * pred - gt)**2).sum(axis=-1)).mean()\n\n\n@register\n@serializable\ndef mpjpe_np(pred, gt, has_3d_joints):\n    \"\"\"\n    mPJPE_NP\n    \"\"\"\n    pred, gt = filter_3d_joints(pred, gt, has_3d_joints)\n    error = np.sqrt(((pred - gt)**2).sum(axis=-1)).mean()\n    return error\n\n\n@register\n@serializable\ndef mean_per_vertex_error(pred, gt, has_smpl):\n    \"\"\"\n    Compute mPVE\n    \"\"\"\n    pred = pred[has_smpl == 1]\n    gt = gt[has_smpl == 1]\n    with paddle.no_grad():\n        error = paddle.sqrt(((pred - gt)**2).sum(axis=-1)).mean()\n        return error\n\n\n@register\n@serializable\ndef keypoint_2d_loss(criterion_keypoints, pred_keypoints_2d, gt_keypoints_2d,\n                     has_pose_2d):\n    \"\"\"\n    Compute 2D reprojection loss if 2D keypoint annotations are available.\n    The confidence (conf) is binary and indicates whether the keypoints exist or not.\n    \"\"\"\n    conf = gt_keypoints_2d[:, :, -1].unsqueeze(-1).clone()\n    loss = (conf * criterion_keypoints(\n        pred_keypoints_2d, gt_keypoints_2d[:, :, :-1] * 0.001)).mean()\n    return loss\n\n\n@register\n@serializable\ndef keypoint_3d_loss(criterion_keypoints, pred_keypoints_3d, gt_keypoints_3d,\n                     has_pose_3d):\n    \"\"\"\n    Compute 3D keypoint loss if 3D keypoint annotations are available.\n    \"\"\"\n    conf = gt_keypoints_3d[:, :, -1].unsqueeze(-1).clone()\n    gt_keypoints_3d = gt_keypoints_3d[:, :, :-1].clone()\n    gt_keypoints_3d = gt_keypoints_3d[has_pose_3d == 1]\n    conf = conf[has_pose_3d == 1]\n    pred_keypoints_3d = pred_keypoints_3d[has_pose_3d == 1]\n    if len(gt_keypoints_3d) > 0:\n        gt_pelvis = (gt_keypoints_3d[:, 2, :] + gt_keypoints_3d[:, 3, :]) / 2\n        gt_keypoints_3d = gt_keypoints_3d - 
gt_pelvis[:, None, :]\n        pred_pelvis = (\n            pred_keypoints_3d[:, 2, :] + pred_keypoints_3d[:, 3, :]) / 2\n        pred_keypoints_3d = pred_keypoints_3d - pred_pelvis[:, None, :]\n        return (conf * criterion_keypoints(pred_keypoints_3d,\n                                           gt_keypoints_3d)).mean()\n    else:\n        return paddle.to_tensor([1.]).fill_(0.)\n\n\n@register\n@serializable\ndef vertices_loss(criterion_vertices, pred_vertices, gt_vertices, has_smpl):\n    \"\"\"\n    Compute per-vertex loss if vertex annotations are available.\n    \"\"\"\n    pred_vertices_with_shape = pred_vertices[has_smpl == 1]\n    gt_vertices_with_shape = gt_vertices[has_smpl == 1]\n    if len(gt_vertices_with_shape) > 0:\n        return criterion_vertices(pred_vertices_with_shape,\n                                  gt_vertices_with_shape)\n    else:\n        return paddle.to_tensor([1.]).fill_(0.)\n\n\n@register\n@serializable\ndef rectify_pose(pose):\n    pose = pose.copy()\n    R_mod = cv2.Rodrigues(np.array([np.pi, 0, 0]))[0]\n    R_root = cv2.Rodrigues(pose[:3])[0]\n    new_root = R_root.dot(R_mod)\n    pose[:3] = cv2.Rodrigues(new_root)[0].reshape(3)\n    return pose\n"
  },
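Pose3DLoss expects `joints_3d` as (x, y, z, conf) and `joints_2d` as (x, y, conf), with joints 2 and 3 acting as the hips for pelvis centering. A shape-level sketch with random stand-ins (the 14-joint count mirrors the weights in the code above; all values are invented):

```python
# Random stand-ins shaped the way Pose3DLoss.forward reads them.
import paddle

batch, num_joints = 2, 14
inputs = {
    'joints_3d': paddle.rand([batch, num_joints, 4]),     # x, y, z, conf
    'joints_2d': paddle.rand([batch, num_joints, 3]),     # x, y, conf
    'has_3d_joints': paddle.ones([batch], dtype='int64'),
    'has_2d_joints': paddle.ones([batch], dtype='int64'),
    'epoch_id': 0,
}
pred3d = paddle.rand([batch, num_joints, 3])
pred2d = paddle.rand([batch, num_joints, 2])

loss_fn = Pose3DLoss(weight_3d=1.0, weight_2d=0.1)
print(loss_fn(pred3d, pred2d, inputs))
```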
  {
    "path": "ppdet/modeling/losses/probiou_loss.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport numpy as np\n\nimport paddle\nimport paddle.nn.functional as F\n\nfrom ppdet.core.workspace import register, serializable\n\n__all__ = ['ProbIoULoss']\n\n\ndef gbb_form(boxes):\n    xy, wh, angle = paddle.split(boxes, [2, 2, 1], axis=-1)\n    return paddle.concat([xy, wh.pow(2) / 12., angle], axis=-1)\n\n\ndef rotated_form(a_, b_, angles):\n    cos_a = paddle.cos(angles)\n    sin_a = paddle.sin(angles)\n    a = a_ * paddle.pow(cos_a, 2) + b_ * paddle.pow(sin_a, 2)\n    b = a_ * paddle.pow(sin_a, 2) + b_ * paddle.pow(cos_a, 2)\n    c = (a_ - b_) * cos_a * sin_a\n    return a, b, c\n\n\ndef probiou_loss(pred, target, eps=1e-3, mode='l1'):\n    \"\"\"\n        pred    -> a matrix [N,5](x,y,w,h,angle - in radians) containing ours predicted box ;in case of HBB angle == 0\n        target  -> a matrix [N,5](x,y,w,h,angle - in radians) containing ours target    box ;in case of HBB angle == 0\n        eps     -> threshold to avoid infinite values\n        mode    -> ('l1' in [0,1] or 'l2' in [0,inf]) metrics according our paper\n\n    \"\"\"\n\n    gbboxes1 = gbb_form(pred)\n    gbboxes2 = gbb_form(target)\n\n    x1, y1, a1_, b1_, c1_ = gbboxes1[:,\n                                     0], gbboxes1[:,\n                                                  1], gbboxes1[:,\n                                                               2], gbboxes1[:,\n                                                                            3], gbboxes1[:,\n                                                                                         4]\n    x2, y2, a2_, b2_, c2_ = gbboxes2[:,\n                                     0], gbboxes2[:,\n                                                  1], gbboxes2[:,\n                                                               2], gbboxes2[:,\n                                                                            3], gbboxes2[:,\n                                                                                         4]\n\n    a1, b1, c1 = rotated_form(a1_, b1_, c1_)\n    a2, b2, c2 = rotated_form(a2_, b2_, c2_)\n\n    t1 = 0.25 * ((a1 + a2) * (paddle.pow(y1 - y2, 2)) + (b1 + b2) * (paddle.pow(x1 - x2, 2))) + \\\n         0.5 * ((c1+c2)*(x2-x1)*(y1-y2))\n    t2 = (a1 + a2) * (b1 + b2) - paddle.pow(c1 + c2, 2)\n    t3_ = (a1 * b1 - c1 * c1) * (a2 * b2 - c2 * c2)\n    t3 = 0.5 * paddle.log(t2 / (4 * paddle.sqrt(F.relu(t3_)) + eps))\n\n    B_d = (t1 / t2) + t3\n    # B_d = t1 + t2 + t3\n\n    B_d = paddle.clip(B_d, min=eps, max=100.0)\n    l1 = paddle.sqrt(1.0 - paddle.exp(-B_d) + eps)\n    l_i = paddle.pow(l1, 2.0)\n    l2 = -paddle.log(1.0 - l_i + eps)\n\n    if mode == 'l1':\n        probiou = l1\n    if mode == 'l2':\n        probiou = l2\n\n    return probiou\n\n\n@serializable\n@register\nclass ProbIoULoss(object):\n 
   \"\"\" ProbIoU Loss, refer to https://arxiv.org/abs/2106.06072 for details \"\"\"\n\n    def __init__(self, mode='l1', eps=1e-3):\n        super(ProbIoULoss, self).__init__()\n        self.mode = mode\n        self.eps = eps\n\n    def __call__(self, pred_rboxes, assigned_rboxes):\n        return probiou_loss(pred_rboxes, assigned_rboxes, self.eps, self.mode)\n"
  },
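  {
    "path": "ppdet/modeling/losses/probiou_loss_demo.py",
    "content": "# Hypothetical demo file, not part of the upstream PaddleDetection repo.\n# A minimal usage sketch for ProbIoULoss, assuming only the interfaces defined\n# in probiou_loss.py above: boxes are [N, 5] tensors (x, y, w, h, angle in\n# radians) and the loss is returned per box.\nimport paddle\n\nfrom ppdet.modeling.losses.probiou_loss import ProbIoULoss, probiou_loss\n\npaddle.seed(0)\n# 8 fabricated rotated boxes: centers in [0, 100), sizes in [1, 21), any angle\npred = paddle.concat([paddle.rand([8, 2]) * 100.,\n                      paddle.rand([8, 2]) * 20. + 1.,\n                      paddle.rand([8, 1])], axis=-1)\ntarget = pred + paddle.randn([8, 5]) * 0.1  # slightly perturbed copy\n\nloss_fn = ProbIoULoss(mode='l1', eps=1e-3)\nloss = loss_fn(pred, target)  # per-box loss, shape [8]\nprint(loss.shape, float(loss.mean()))\n\n# identical boxes give a small (eps-floored, not exactly zero) 'l1' loss\nprint(float(probiou_loss(pred, pred, mode='l1').max()))\n"
  },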
  {
    "path": "ppdet/modeling/losses/queryinst_loss.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn.functional as F\n\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling.losses.iou_loss import GIoULoss\nfrom .sparsercnn_loss import HungarianMatcher\n\n__all__ = ['QueryInstLoss']\n\n\n@register\nclass QueryInstLoss(object):\n    __shared__ = ['num_classes']\n\n    def __init__(self,\n                 num_classes=80,\n                 focal_loss_alpha=0.25,\n                 focal_loss_gamma=2.0,\n                 class_weight=2.0,\n                 l1_weight=5.0,\n                 giou_weight=2.0,\n                 mask_weight=8.0):\n        super(QueryInstLoss, self).__init__()\n\n        self.num_classes = num_classes\n        self.focal_loss_alpha = focal_loss_alpha\n        self.focal_loss_gamma = focal_loss_gamma\n        self.loss_weights = {\n            \"loss_cls\": class_weight,\n            \"loss_bbox\": l1_weight,\n            \"loss_giou\": giou_weight,\n            \"loss_mask\": mask_weight\n        }\n        self.giou_loss = GIoULoss(eps=1e-6, reduction='sum')\n\n        self.matcher = HungarianMatcher(focal_loss_alpha, focal_loss_gamma,\n                                        class_weight, l1_weight, giou_weight)\n\n    def loss_classes(self, class_logits, targets, indices, avg_factor):\n        tgt_labels = paddle.full(\n            class_logits.shape[:2], self.num_classes, dtype='int32')\n\n        if sum(len(v['labels']) for v in targets) > 0:\n            tgt_classes = paddle.concat([\n                paddle.gather(\n                    tgt['labels'], tgt_idx, axis=0)\n                for tgt, (_, tgt_idx) in zip(targets, indices)\n            ])\n            batch_idx, src_idx = self._get_src_permutation_idx(indices)\n            for i, (batch_i, src_i) in enumerate(zip(batch_idx, src_idx)):\n                tgt_labels[int(batch_i), int(src_i)] = tgt_classes[i]\n\n        tgt_labels = tgt_labels.flatten(0, 1).unsqueeze(-1)\n\n        tgt_labels_onehot = paddle.cast(\n            tgt_labels == paddle.arange(0, self.num_classes), dtype='float32')\n        tgt_labels_onehot.stop_gradient = True\n\n        src_logits = class_logits.flatten(0, 1)\n\n        loss_cls = F.sigmoid_focal_loss(\n            src_logits,\n            tgt_labels_onehot,\n            alpha=self.focal_loss_alpha,\n            gamma=self.focal_loss_gamma,\n            reduction='sum') / avg_factor\n        losses = {'loss_cls': loss_cls * self.loss_weights['loss_cls']}\n        return losses\n\n    def loss_bboxes(self, bbox_pred, targets, indices, avg_factor):\n        bboxes = paddle.concat([\n            paddle.gather(\n                src, src_idx, axis=0)\n            for src, (src_idx, _) in zip(bbox_pred, indices)\n        ])\n\n        tgt_bboxes = paddle.concat([\n            paddle.gather(\n         
       tgt['boxes'], tgt_idx, axis=0)\n            for tgt, (_, tgt_idx) in zip(targets, indices)\n        ])\n        tgt_bboxes.stop_gradient = True\n\n        im_shapes = paddle.concat([tgt['img_whwh_tgt'] for tgt in targets])\n        bboxes_norm = bboxes / im_shapes\n        tgt_bboxes_norm = tgt_bboxes / im_shapes\n\n        loss_giou = self.giou_loss(bboxes, tgt_bboxes) / avg_factor\n        loss_bbox = F.l1_loss(\n            bboxes_norm, tgt_bboxes_norm, reduction='sum') / avg_factor\n        losses = {\n            'loss_bbox': loss_bbox * self.loss_weights['loss_bbox'],\n            'loss_giou': loss_giou * self.loss_weights['loss_giou']\n        }\n        return losses\n\n    def loss_masks(self, pos_bbox_pred, mask_logits, targets, indices,\n                   avg_factor):\n        tgt_segm = [\n            paddle.gather(\n                tgt['gt_segm'], tgt_idx, axis=0)\n            for tgt, (_, tgt_idx) in zip(targets, indices)\n        ]\n\n        tgt_masks = []\n        for i in range(len(indices)):\n            gt_segm = tgt_segm[i].unsqueeze(1)\n            if len(gt_segm) == 0:\n                continue\n            boxes = pos_bbox_pred[i]\n            boxes[:, 0::2] = paddle.clip(\n                boxes[:, 0::2], min=0, max=gt_segm.shape[3])\n            boxes[:, 1::2] = paddle.clip(\n                boxes[:, 1::2], min=0, max=gt_segm.shape[2])\n            boxes_num = paddle.to_tensor([1] * len(boxes), dtype='int32')\n            gt_mask = paddle.vision.ops.roi_align(\n                gt_segm,\n                boxes,\n                boxes_num,\n                output_size=mask_logits.shape[-2:],\n                aligned=True)\n            tgt_masks.append(gt_mask)\n        tgt_masks = paddle.concat(tgt_masks).squeeze(1)\n        tgt_masks = paddle.cast(tgt_masks >= 0.5, dtype='float32')\n        tgt_masks.stop_gradient = True\n\n        tgt_labels = paddle.concat([\n            paddle.gather(\n                tgt['labels'], tgt_idx, axis=0)\n            for tgt, (_, tgt_idx) in zip(targets, indices)\n        ])\n\n        mask_label = F.one_hot(tgt_labels, self.num_classes).unsqueeze([2, 3])\n        mask_label = paddle.expand_as(mask_label, mask_logits)\n        mask_label.stop_gradient = True\n\n        src_masks = paddle.gather_nd(mask_logits, paddle.nonzero(mask_label))\n        shape = mask_logits.shape\n        src_masks = paddle.reshape(src_masks, [shape[0], shape[2], shape[3]])\n        src_masks = F.sigmoid(src_masks)\n\n        X = src_masks.flatten(1)\n        Y = tgt_masks.flatten(1)\n        inter = paddle.sum(X * Y, 1)\n        union = paddle.sum(X * X, 1) + paddle.sum(Y * Y, 1)\n        dice = (2 * inter) / (union + 2e-5)\n\n        loss_mask = (1 - dice).sum() / avg_factor\n        losses = {'loss_mask': loss_mask * self.loss_weights['loss_mask']}\n        return losses\n\n    @staticmethod\n    def _get_src_permutation_idx(indices):\n        batch_idx = paddle.concat(\n            [paddle.full_like(src, i) for i, (src, _) in enumerate(indices)])\n        src_idx = paddle.concat([src for (src, _) in indices])\n        return batch_idx, src_idx\n"
  },
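  {
    "path": "ppdet/modeling/losses/queryinst_loss_demo.py",
    "content": "# Hypothetical demo file, not part of the upstream PaddleDetection repo.\n# A sketch of the inputs QueryInstLoss.loss_classes / loss_bboxes expect:\n# per-image target dicts plus matcher-style (src_idx, tgt_idx) index pairs.\n# All shapes and values below are fabricated.\nimport paddle\n\nfrom ppdet.modeling.losses.queryinst_loss import QueryInstLoss\n\npaddle.seed(0)\nnum_classes, num_queries = 4, 6\nloss_fn = QueryInstLoss(num_classes=num_classes)\n\nclass_logits = paddle.randn([2, num_queries, num_classes])\nbbox_pred = paddle.rand([2, num_queries, 4]) * 100.  # xyxy, absolute pixels\n\n# two images, each with two ground-truth objects in a 100x100 image\ntargets = [{\n    'labels': paddle.to_tensor([0, 2], dtype='int32'),\n    'boxes': paddle.to_tensor([[10., 10., 40., 40.], [20., 5., 90., 60.]]),\n    'img_whwh_tgt': paddle.full([2, 4], 100., dtype='float32'),\n} for _ in range(2)]\n\n# pretend the Hungarian matcher assigned queries 1 and 4 to targets 0 and 1\nindices = [(paddle.to_tensor([1, 4]), paddle.to_tensor([0, 1]))\n           for _ in range(2)]\navg_factor = 4.0  # total number of matched pairs across the batch\n\nprint(loss_fn.loss_classes(class_logits, targets, indices, avg_factor))\nprint(loss_fn.loss_bboxes(bbox_pred, targets, indices, avg_factor))\n"
  },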
  {
    "path": "ppdet/modeling/losses/smooth_l1_loss.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register\n\n__all__ = ['SmoothL1Loss']\n\n@register\nclass SmoothL1Loss(nn.Layer):\n    \"\"\"Smooth L1 Loss.\n    Args:\n        beta (float): controls smooth region, it becomes L1 Loss when beta=0.0\n        loss_weight (float): the final loss will be multiplied by this \n    \"\"\"\n    def __init__(self,\n                 beta=1.0,\n                 loss_weight=1.0):\n        super(SmoothL1Loss, self).__init__()\n        assert beta >= 0\n        self.beta = beta\n        self.loss_weight = loss_weight\n\n    def forward(self, pred, target, reduction='none'):\n        \"\"\"forward function, based on fvcore.\n        Args:\n            pred (Tensor): prediction tensor\n            target (Tensor): target tensor, pred.shape must be the same as target.shape\n            reduction (str): the way to reduce loss, one of (none, sum, mean)\n        \"\"\"\n        assert reduction in ('none', 'sum', 'mean')\n        target = target.detach()\n        if self.beta < 1e-5:\n            loss = paddle.abs(pred - target)\n        else:\n            n = paddle.abs(pred - target)\n            cond = n < self.beta\n            loss = paddle.where(cond, 0.5 * n ** 2 / self.beta, n - 0.5 * self.beta)\n        if reduction == 'mean':\n            loss = loss.mean() if loss.size > 0 else 0.0 * loss.sum()\n        elif reduction == 'sum':\n            loss = loss.sum()\n        return loss * self.loss_weight\n"
  },
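  {
    "path": "ppdet/modeling/losses/smooth_l1_loss_demo.py",
    "content": "# Hypothetical demo file, not part of the upstream PaddleDetection repo.\n# A quick check of the two SmoothL1Loss branches: quadratic for |x| < beta,\n# linear with a -0.5 * beta offset outside.\nimport paddle\n\nfrom ppdet.modeling.losses.smooth_l1_loss import SmoothL1Loss\n\nloss_fn = SmoothL1Loss(beta=1.0, loss_weight=1.0)\npred = paddle.to_tensor([0.0, 0.5, 3.0])\ntarget = paddle.zeros([3])\n\n# elementwise: [0.0, 0.5 * 0.5**2 / 1.0, 3.0 - 0.5] = [0.0, 0.125, 2.5]\nprint(loss_fn(pred, target, reduction='none').numpy())\nprint(float(loss_fn(pred, target, reduction='mean')))  # (0 + 0.125 + 2.5) / 3\n"
  },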
  {
    "path": "ppdet/modeling/losses/solov2_loss.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register, serializable\n\n__all__ = ['SOLOv2Loss']\n\n\n@register\n@serializable\nclass SOLOv2Loss(object):\n    \"\"\"\n    SOLOv2Loss\n    Args:\n        ins_loss_weight (float): Weight of instance loss.\n        focal_loss_gamma (float): Gamma parameter for focal loss.\n        focal_loss_alpha (float): Alpha parameter for focal loss.\n    \"\"\"\n\n    def __init__(self,\n                 ins_loss_weight=3.0,\n                 focal_loss_gamma=2.0,\n                 focal_loss_alpha=0.25):\n        self.ins_loss_weight = ins_loss_weight\n        self.focal_loss_gamma = focal_loss_gamma\n        self.focal_loss_alpha = focal_loss_alpha\n\n    def _dice_loss(self, input, target):\n        input = paddle.reshape(input, shape=(input.shape[0], -1))\n        target = paddle.reshape(target, shape=(target.shape[0], -1))\n        a = paddle.sum(input * target, axis=1)\n        b = paddle.sum(input * input, axis=1) + 0.001\n        c = paddle.sum(target * target, axis=1) + 0.001\n        d = (2 * a) / (b + c)\n        return 1 - d\n\n    def __call__(self, ins_pred_list, ins_label_list, cate_preds, cate_labels,\n                 num_ins):\n        \"\"\"\n        Get loss of network of SOLOv2.\n        Args:\n            ins_pred_list (list): Variable list of instance branch output.\n            ins_label_list (list): List of instance labels pre batch.\n            cate_preds (list): Concat Variable list of categroy branch output.\n            cate_labels (list): Concat list of categroy labels pre batch.\n            num_ins (int): Number of positive samples in a mini-batch.\n        Returns:\n            loss_ins (Variable): The instance loss Variable of SOLOv2 network.\n            loss_cate (Variable): The category loss Variable of SOLOv2 network.\n        \"\"\"\n\n        #1. Ues dice_loss to calculate instance loss\n        loss_ins = []\n        total_weights = paddle.zeros(shape=[1], dtype='float32')\n        for input, target in zip(ins_pred_list, ins_label_list):\n            if input is None:\n                continue\n            target = paddle.cast(target, 'float32')\n            target = paddle.reshape(\n                target,\n                shape=[-1, input.shape[-2], input.shape[-1]])\n            weights = paddle.cast(\n                paddle.sum(target, axis=[1, 2]) > 0, 'float32')\n            input = F.sigmoid(input)\n            dice_out = paddle.multiply(self._dice_loss(input, target), weights)\n            total_weights += paddle.sum(weights)\n            loss_ins.append(dice_out)\n        loss_ins = paddle.sum(paddle.concat(loss_ins)) / total_weights\n        loss_ins = loss_ins * self.ins_loss_weight\n\n        #2. 
Ues sigmoid_focal_loss to calculate category loss\n        # expand onehot labels\n        num_classes = cate_preds.shape[-1]\n        cate_labels_bin = F.one_hot(cate_labels, num_classes=num_classes + 1)\n        cate_labels_bin = cate_labels_bin[:, 1:]\n\n        loss_cate = F.sigmoid_focal_loss(\n            cate_preds,\n            label=cate_labels_bin,\n            normalizer=num_ins + 1.,\n            gamma=self.focal_loss_gamma,\n            alpha=self.focal_loss_alpha)\n\n        return loss_ins, loss_cate\n"
  },
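  {
    "path": "ppdet/modeling/losses/solov2_loss_demo.py",
    "content": "# Hypothetical demo file, not part of the upstream PaddleDetection repo.\n# A fabricated single-level call of SOLOv2Loss. Note that F.sigmoid_focal_loss\n# expects a Tensor normalizer, so num_ins is passed as a shape-[1] tensor here\n# (inside PaddleDetection it arrives as a tensor from the SOLOv2 head).\nimport paddle\n\nfrom ppdet.modeling.losses.solov2_loss import SOLOv2Loss\n\npaddle.seed(0)\nloss_fn = SOLOv2Loss(ins_loss_weight=3.0)\n\n# one FPN level: 4 instance mask logits with matching binary mask labels\nins_pred_list = [paddle.randn([4, 24, 24])]\nins_label_list = [paddle.cast(paddle.rand([4, 24, 24]) > 0.5, 'int32')]\n\n# flattened category branch: 100 grid cells x 80 classes; label 0 = background\ncate_preds = paddle.randn([100, 80])\ncate_labels = paddle.randint(0, 81, [100])\nnum_ins = paddle.to_tensor([float((cate_labels > 0).sum())])\n\nloss_ins, loss_cate = loss_fn(ins_pred_list, ins_label_list, cate_preds,\n                              cate_labels, num_ins)\nprint(float(loss_ins), float(loss_cate))\n"
  },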
  {
    "path": "ppdet/modeling/losses/sparsercnn_loss.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nThis code is based on https://github.com/PeizeSun/SparseR-CNN/blob/main/projects/SparseRCNN/sparsercnn/loss.py\nThs copyright of PeizeSun/SparseR-CNN is as follows:\nMIT License [see LICENSE for details]\n\"\"\"\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nfrom scipy.optimize import linear_sum_assignment\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.metric import accuracy\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling.losses.iou_loss import GIoULoss\n\n__all__ = [\"SparseRCNNLoss\"]\n\n\n@register\nclass SparseRCNNLoss(nn.Layer):\n    \"\"\" This class computes the loss for SparseRCNN.\n    The process happens in two steps:\n        1) we compute hungarian assignment between ground truth boxes and the outputs of the model\n        2) we supervise each pair of matched ground-truth / prediction (supervise class and box)\n    \"\"\"\n    __shared__ = ['num_classes']\n\n    def __init__(self,\n                 losses,\n                 focal_loss_alpha,\n                 focal_loss_gamma,\n                 num_classes=80,\n                 class_weight=2.,\n                 l1_weight=5.,\n                 giou_weight=2.):\n        \"\"\" Create the criterion.\n        Parameters:\n            num_classes: number of object categories, omitting the special no-object category\n            weight_dict: dict containing as key the names of the losses and as values their relative weight.\n            losses: list of all the losses to be applied. 
See get_loss for list of available losses.\n            matcher: module able to compute a matching between targets and proposals\n        \"\"\"\n        super().__init__()\n        self.num_classes = num_classes\n        weight_dict = {\n            \"loss_ce\": class_weight,\n            \"loss_bbox\": l1_weight,\n            \"loss_giou\": giou_weight\n        }\n        self.weight_dict = weight_dict\n        self.losses = losses\n        self.giou_loss = GIoULoss(reduction=\"sum\")\n\n        self.focal_loss_alpha = focal_loss_alpha\n        self.focal_loss_gamma = focal_loss_gamma\n\n        self.matcher = HungarianMatcher(focal_loss_alpha, focal_loss_gamma,\n                                        class_weight, l1_weight, giou_weight)\n\n    def loss_labels(self, outputs, targets, indices, num_boxes, log=True):\n        \"\"\"Classification loss (NLL)\n        targets dicts must contain the key \"labels\" containing a tensor of dim [nb_target_boxes]\n        \"\"\"\n        assert 'pred_logits' in outputs\n        src_logits = outputs['pred_logits']\n\n        idx = self._get_src_permutation_idx(indices)\n        target_classes_o = paddle.concat([\n            paddle.gather(\n                t[\"labels\"], J, axis=0) for t, (_, J) in zip(targets, indices)\n        ])\n        target_classes = paddle.full(\n            src_logits.shape[:2], self.num_classes, dtype=\"int32\")\n        for i, ind in enumerate(zip(idx[0], idx[1])):\n            target_classes[int(ind[0]), int(ind[1])] = target_classes_o[i]\n        target_classes.stop_gradient = True\n\n        src_logits = src_logits.flatten(start_axis=0, stop_axis=1)\n\n        # prepare one_hot target.\n        target_classes = target_classes.flatten(start_axis=0, stop_axis=1)\n        class_ids = paddle.arange(0, self.num_classes)\n        labels = (target_classes.unsqueeze(-1) == class_ids.astype(target_classes.dtype)).astype(\"float32\")\n        labels.stop_gradient = True\n\n        # comp focal loss.\n        class_loss = sigmoid_focal_loss(\n            src_logits,\n            labels,\n            alpha=self.focal_loss_alpha,\n            gamma=self.focal_loss_gamma,\n            reduction=\"sum\", ) / num_boxes\n        losses = {'loss_ce': class_loss}\n\n        if log:\n            label_acc = target_classes_o.unsqueeze(-1)\n            src_idx = [src for (src, _) in indices]\n\n            pred_list = []\n            for i in range(outputs[\"pred_logits\"].shape[0]):\n                pred_list.append(\n                    paddle.gather(\n                        outputs[\"pred_logits\"][i], src_idx[i], axis=0))\n\n            pred = F.sigmoid(paddle.concat(pred_list, axis=0))\n            acc = accuracy(pred, label_acc.astype(\"int64\"))\n            losses[\"acc\"] = acc\n\n        return losses\n\n    def loss_boxes(self, outputs, targets, indices, num_boxes):\n        \"\"\"Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss\n           targets dicts must contain the key \"boxes\" containing a tensor of dim [nb_target_boxes, 4]\n           The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.\n        \"\"\"\n        assert 'pred_boxes' in outputs  # [batch_size, num_proposals, 4]\n        src_idx = [src for (src, _) in indices]\n        src_boxes_list = []\n\n        for i in range(outputs[\"pred_boxes\"].shape[0]):\n            src_boxes_list.append(\n                paddle.gather(\n                    outputs[\"pred_boxes\"][i], 
src_idx[i], axis=0))\n\n        src_boxes = paddle.concat(src_boxes_list, axis=0)\n\n        target_boxes = paddle.concat(\n            [\n                paddle.gather(\n                    t['boxes'], I, axis=0)\n                for t, (_, I) in zip(targets, indices)\n            ],\n            axis=0)\n        target_boxes.stop_gradient = True\n        losses = {}\n\n        losses['loss_giou'] = self.giou_loss(src_boxes,\n                                             target_boxes) / num_boxes\n\n        image_size = paddle.concat([v[\"img_whwh_tgt\"] for v in targets])\n        src_boxes_ = src_boxes / image_size\n        target_boxes_ = target_boxes / image_size\n\n        loss_bbox = F.l1_loss(src_boxes_, target_boxes_, reduction='sum')\n        losses['loss_bbox'] = loss_bbox / num_boxes\n\n        return losses\n\n    def _get_src_permutation_idx(self, indices):\n        # permute predictions following indices\n        batch_idx = paddle.concat(\n            [paddle.full_like(src, i) for i, (src, _) in enumerate(indices)])\n        src_idx = paddle.concat([src for (src, _) in indices])\n        return batch_idx, src_idx\n\n    def _get_tgt_permutation_idx(self, indices):\n        # permute targets following indices\n        batch_idx = paddle.concat(\n            [paddle.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])\n        tgt_idx = paddle.concat([tgt for (_, tgt) in indices])\n        return batch_idx, tgt_idx\n\n    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):\n        loss_map = {\n            'labels': self.loss_labels,\n            'boxes': self.loss_boxes,\n        }\n        assert loss in loss_map, f'do you really want to compute {loss} loss?'\n        return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)\n\n    def forward(self, outputs, targets):\n        \"\"\" This performs the loss computation.\n        Parameters:\n             outputs: dict of tensors, see the output specification of the model for the format\n             targets: list of dicts, such that len(targets) == batch_size.\n                      The expected keys in each dict depends on the losses applied, see each loss' doc\n        \"\"\"\n        outputs_without_aux = {\n            k: v\n            for k, v in outputs.items() if k != 'aux_outputs'\n        }\n\n        # Retrieve the matching between the outputs of the last layer and the targets\n        indices = self.matcher(outputs_without_aux, targets)\n\n        # Compute the average number of target boxes across all nodes, for normalization purposes\n        num_boxes = sum(len(t[\"labels\"]) for t in targets)\n        num_boxes = paddle.to_tensor(\n            [num_boxes],\n            dtype=\"float32\",\n            place=next(iter(outputs.values())).place)\n\n        # Compute all the requested losses\n        losses = {}\n        for loss in self.losses:\n            losses.update(\n                self.get_loss(loss, outputs, targets, indices, num_boxes))\n\n        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.\n        if 'aux_outputs' in outputs:\n            for i, aux_outputs in enumerate(outputs['aux_outputs']):\n                indices = self.matcher(aux_outputs, targets)\n                for loss in self.losses:\n                    kwargs = {}\n                    if loss == 'labels':\n                        # Logging is enabled only for the last layer\n                        kwargs = {'log': False}\n                    
l_dict = self.get_loss(loss, aux_outputs, targets, indices,\n                                           num_boxes, **kwargs)\n\n                    w_dict = {}\n                    for k in l_dict.keys():\n                        if k in self.weight_dict:\n                            w_dict[k + f'_{i}'] = l_dict[k] * self.weight_dict[\n                                k]\n                        else:\n                            w_dict[k + f'_{i}'] = l_dict[k]\n                    losses.update(w_dict)\n\n        return losses\n\n\nclass HungarianMatcher(nn.Layer):\n    \"\"\"This class computes an assignment between the targets and the predictions of the network\n    For efficiency reasons, the targets don't include the no_object. Because of this, in general,\n    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,\n    while the others are un-matched (and thus treated as non-objects).\n    \"\"\"\n\n    def __init__(self,\n                 focal_loss_alpha,\n                 focal_loss_gamma,\n                 cost_class: float=1,\n                 cost_bbox: float=1,\n                 cost_giou: float=1):\n        \"\"\"Creates the matcher\n        Params:\n            cost_class: This is the relative weight of the classification error in the matching cost\n            cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost\n            cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost\n        \"\"\"\n        super().__init__()\n        self.cost_class = cost_class\n        self.cost_bbox = cost_bbox\n        self.cost_giou = cost_giou\n        self.focal_loss_alpha = focal_loss_alpha\n        self.focal_loss_gamma = focal_loss_gamma\n        assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, \"all costs cant be 0\"\n\n    @paddle.no_grad()\n    def forward(self, outputs, targets):\n        \"\"\" Performs the matching\n        Args:\n            outputs: This is a dict that contains at least these entries:\n                 \"pred_logits\": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits\n                 \"pred_boxes\": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates\n                 eg. outputs = {\"pred_logits\": pred_logits, \"pred_boxes\": pred_boxes}\n            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:\n                 \"labels\": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth\n                           objects in the target) containing the class labels\n                 \"boxes\": Tensor of dim [num_target_boxes, 4] containing the target box coordinates\n                 eg. 
targets = [{\"labels\":labels, \"boxes\": boxes}, ...,{\"labels\":labels, \"boxes\": boxes}]\n        Returns:\n            A list of size batch_size, containing tuples of (index_i, index_j) where:\n                - index_i is the indices of the selected predictions (in order)\n                - index_j is the indices of the corresponding selected targets (in order)\n            For each batch element, it holds:\n                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)\n        \"\"\"\n        bs, num_queries = outputs[\"pred_logits\"].shape[:2]\n\n        if sum(len(v[\"labels\"]) for v in targets) == 0:\n            return [(paddle.to_tensor(\n                [], dtype=paddle.int64), paddle.to_tensor(\n                    [], dtype=paddle.int64)) for _ in range(bs)]\n\n        # We flatten to compute the cost matrices in a batch\n        out_prob = F.sigmoid(outputs[\"pred_logits\"].flatten(\n            start_axis=0, stop_axis=1))\n        out_bbox = outputs[\"pred_boxes\"].flatten(start_axis=0, stop_axis=1)\n\n        # Also concat the target labels and boxes\n        tgt_ids = paddle.concat([v[\"labels\"] for v in targets])\n        assert (tgt_ids > -1).all()\n        tgt_bbox = paddle.concat([v[\"boxes\"] for v in targets])\n\n        # Compute the classification cost. Contrary to the loss, we don't use the NLL,\n        # but approximate it in 1 - proba[target class].\n        # The 1 is a constant that doesn't change the matching, it can be ommitted.\n\n        # Compute the classification cost.\n        alpha = self.focal_loss_alpha\n        gamma = self.focal_loss_gamma\n\n        neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(\n            1 - out_prob + 1e-8).log())\n        pos_cost_class = alpha * ((1 - out_prob)\n                                  **gamma) * (-(out_prob + 1e-8).log())\n\n        cost_class = paddle.gather(\n            pos_cost_class, tgt_ids, axis=1) - paddle.gather(\n                neg_cost_class, tgt_ids, axis=1)\n\n        # Compute the L1 cost between boxes\n        image_size_out = paddle.concat(\n            [v[\"img_whwh\"].unsqueeze(0) for v in targets])\n        image_size_out = image_size_out.unsqueeze(1).tile(\n            [1, num_queries, 1]).flatten(\n                start_axis=0, stop_axis=1)\n        image_size_tgt = paddle.concat([v[\"img_whwh_tgt\"] for v in targets])\n\n        out_bbox_ = out_bbox / image_size_out\n        tgt_bbox_ = tgt_bbox / image_size_tgt\n        cost_bbox = F.l1_loss(\n            out_bbox_.unsqueeze(-2), tgt_bbox_,\n            reduction='none').sum(-1)  # [batch_size * num_queries, num_tgts]\n\n        # Compute the giou cost betwen boxes\n        cost_giou = -get_bboxes_giou(out_bbox, tgt_bbox)\n\n        # Final cost matrix\n        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou\n        C = C.reshape([bs, num_queries, -1])\n\n        sizes = [len(v[\"boxes\"]) for v in targets]\n\n        indices = [\n            linear_sum_assignment(c[i].numpy())\n            for i, c in enumerate(C.split(sizes, -1))\n        ]\n        return [(paddle.to_tensor(\n            i, dtype=\"int32\"), paddle.to_tensor(\n                j, dtype=\"int32\")) for i, j in indices]\n\n\ndef box_area(boxes):\n    assert (boxes[:, 2:] >= boxes[:, :2]).all()\n    wh = boxes[:, 2:] - boxes[:, :2]\n    return wh[:, 0] * wh[:, 1]\n\n\ndef boxes_iou(boxes1, boxes2):\n    '''\n    Compute iou\n\n    Args:\n        boxes1 (paddle.tensor) shape (N, 4)\n        boxes2 
(paddle.tensor) shape (M, 4)\n\n    Return:\n        (paddle.tensor) shape (N, M)\n    '''\n    area1 = box_area(boxes1)\n    area2 = box_area(boxes2)\n\n    lt = paddle.maximum(boxes1.unsqueeze(-2)[:, :, :2], boxes2[:, :2])\n    rb = paddle.minimum(boxes1.unsqueeze(-2)[:, :, 2:], boxes2[:, 2:])\n\n    wh = (rb - lt).astype(\"float32\").clip(min=1e-9)\n    inter = wh[:, :, 0] * wh[:, :, 1]\n\n    union = area1.unsqueeze(-1) + area2 - inter + 1e-9\n\n    iou = inter / union\n    return iou, union\n\n\ndef get_bboxes_giou(boxes1, boxes2, eps=1e-9):\n    \"\"\"calculate the ious of boxes1 and boxes2\n\n    Args:\n        boxes1 (Tensor): shape [N, 4]\n        boxes2 (Tensor): shape [M, 4]\n        eps (float): epsilon to avoid divide by zero\n\n    Return:\n        ious (Tensor): ious of boxes1 and boxes2, with the shape [N, M]\n    \"\"\"\n    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()\n    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()\n\n    iou, union = boxes_iou(boxes1, boxes2)\n\n    lt = paddle.minimum(boxes1.unsqueeze(-2)[:, :, :2], boxes2[:, :2])\n    rb = paddle.maximum(boxes1.unsqueeze(-2)[:, :, 2:], boxes2[:, 2:])\n\n    wh = (rb - lt).astype(\"float32\").clip(min=eps)\n    enclose_area = wh[:, :, 0] * wh[:, :, 1]\n\n    giou = iou - (enclose_area - union) / enclose_area\n\n    return giou\n\n\ndef sigmoid_focal_loss(inputs, targets, alpha, gamma, reduction=\"sum\"):\n\n    assert reduction in [\"sum\", \"mean\"\n                         ], f'do not support this {reduction} reduction?'\n\n    p = F.sigmoid(inputs)\n    ce_loss = F.binary_cross_entropy_with_logits(\n        inputs, targets, reduction=\"none\")\n    p_t = p * targets + (1 - p) * (1 - targets)\n    loss = ce_loss * ((1 - p_t)**gamma)\n\n    if alpha >= 0:\n        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)\n        loss = alpha_t * loss\n\n    if reduction == \"mean\":\n        loss = loss.mean()\n    elif reduction == \"sum\":\n        loss = loss.sum()\n\n    return loss\n"
  },
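  {
    "path": "ppdet/modeling/losses/sparsercnn_loss_demo.py",
    "content": "# Hypothetical demo file, not part of the upstream PaddleDetection repo.\n# Runs HungarianMatcher on a fabricated one-image batch to show the expected\n# outputs/targets layout; boxes are xyxy in absolute pixels with x1 < x2 and\n# y1 < y2 so the GIoU asserts hold.\nimport paddle\n\nfrom ppdet.modeling.losses.sparsercnn_loss import HungarianMatcher\n\npaddle.seed(0)\nmatcher = HungarianMatcher(\n    focal_loss_alpha=0.25, focal_loss_gamma=2.0,\n    cost_class=2., cost_bbox=5., cost_giou=2.)\n\nnum_queries, num_classes = 10, 80\noutputs = {\n    'pred_logits': paddle.randn([1, num_queries, num_classes]),\n    'pred_boxes': paddle.concat(\n        [paddle.rand([1, num_queries, 2]) * 50.,\n         paddle.rand([1, num_queries, 2]) * 50. + 50.], axis=-1),\n}\ntargets = [{\n    'labels': paddle.to_tensor([3, 17], dtype='int64'),\n    'boxes': paddle.to_tensor([[10., 10., 40., 40.], [20., 5., 90., 60.]]),\n    'img_whwh': paddle.to_tensor([100., 100., 100., 100.]),\n    'img_whwh_tgt': paddle.full([2, 4], 100., dtype='float32'),\n}]\n\n# one (pred_idx, tgt_idx) tensor pair per image\nprint(matcher(outputs, targets))\n"
  },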
  {
    "path": "ppdet/modeling/losses/ssd_loss.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register\nfrom ..bbox_utils import iou_similarity, bbox2delta\n\n__all__ = ['SSDLoss']\n\n\n@register\nclass SSDLoss(nn.Layer):\n    \"\"\"\n    SSDLoss\n\n    Args:\n        overlap_threshold (float32, optional): IoU threshold for negative bboxes\n            and positive bboxes, 0.5 by default.\n        neg_pos_ratio (float): The ratio of negative samples / positive samples.\n        loc_loss_weight (float): The weight of loc_loss.\n        conf_loss_weight (float): The weight of conf_loss.\n        prior_box_var (list): Variances corresponding to prior box coord, [0.1,\n            0.1, 0.2, 0.2] by default.\n    \"\"\"\n\n    def __init__(self,\n                 overlap_threshold=0.5,\n                 neg_pos_ratio=3.0,\n                 loc_loss_weight=1.0,\n                 conf_loss_weight=1.0,\n                 prior_box_var=[0.1, 0.1, 0.2, 0.2]):\n        super(SSDLoss, self).__init__()\n        self.overlap_threshold = overlap_threshold\n        self.neg_pos_ratio = neg_pos_ratio\n        self.loc_loss_weight = loc_loss_weight\n        self.conf_loss_weight = conf_loss_weight\n        self.prior_box_var = [1. 
/ a for a in prior_box_var]\n\n    def _bipartite_match_for_batch(self, gt_bbox, gt_label, prior_boxes,\n                                   bg_index):\n        \"\"\"\n        Args:\n            gt_bbox (Tensor): [B, N, 4]\n            gt_label (Tensor): [B, N, 1]\n            prior_boxes (Tensor): [A, 4]\n            bg_index (int): Background class index\n        \"\"\"\n        batch_size, num_priors = gt_bbox.shape[0], prior_boxes.shape[0]\n        ious = iou_similarity(gt_bbox.reshape((-1, 4)), prior_boxes).reshape(\n            (batch_size, -1, num_priors))\n\n        # For each prior box, get the max IoU of all GTs.\n        prior_max_iou, prior_argmax_iou = ious.max(axis=1), ious.argmax(axis=1)\n        # For each GT, get the max IoU of all prior boxes.\n        gt_max_iou, gt_argmax_iou = ious.max(axis=2), ious.argmax(axis=2)\n\n        # Gather target bbox and label according to 'prior_argmax_iou' index.\n        batch_ind = paddle.arange(end=batch_size, dtype='int64').unsqueeze(-1)\n        prior_argmax_iou = paddle.stack(\n            [batch_ind.tile([1, num_priors]), prior_argmax_iou], axis=-1)\n        targets_bbox = paddle.gather_nd(gt_bbox, prior_argmax_iou)\n        targets_label = paddle.gather_nd(gt_label, prior_argmax_iou)\n        # Assign negative\n        bg_index_tensor = paddle.full([batch_size, num_priors, 1], bg_index,\n                                      'int64')\n        targets_label = paddle.where(\n            prior_max_iou.unsqueeze(-1) < self.overlap_threshold,\n            bg_index_tensor, targets_label)\n\n        # Ensure each GT can match the max IoU prior box.\n        batch_ind = (batch_ind * num_priors + gt_argmax_iou).flatten()\n        targets_bbox = paddle.scatter(\n            targets_bbox.reshape([-1, 4]), batch_ind,\n            gt_bbox.reshape([-1, 4])).reshape([batch_size, -1, 4])\n        targets_label = paddle.scatter(\n            targets_label.reshape([-1, 1]), batch_ind,\n            gt_label.reshape([-1, 1])).reshape([batch_size, -1, 1])\n        targets_label[:, :1] = bg_index\n\n        # Encode box\n        prior_boxes = prior_boxes.unsqueeze(0).tile([batch_size, 1, 1])\n        targets_bbox = bbox2delta(\n            prior_boxes.reshape([-1, 4]),\n            targets_bbox.reshape([-1, 4]), self.prior_box_var)\n        targets_bbox = targets_bbox.reshape([batch_size, -1, 4])\n\n        return targets_bbox, targets_label\n\n    def _mine_hard_example(self,\n                           conf_loss,\n                           targets_label,\n                           bg_index,\n                           mine_neg_ratio=0.01):\n        pos = (targets_label != bg_index).astype(conf_loss.dtype)\n        num_pos = pos.sum(axis=1, keepdim=True)\n        neg = (targets_label == bg_index).astype(conf_loss.dtype)\n\n        conf_loss = conf_loss.detach() * neg\n        loss_idx = conf_loss.argsort(axis=1, descending=True)\n        idx_rank = loss_idx.argsort(axis=1)\n        num_negs = []\n        for i in range(conf_loss.shape[0]):\n            cur_num_pos = num_pos[i]\n            num_neg = paddle.clip(\n                cur_num_pos * self.neg_pos_ratio, max=pos.shape[1])\n            num_neg = num_neg if num_neg > 0 else paddle.to_tensor(\n                [pos.shape[1] * mine_neg_ratio])\n            num_negs.append(num_neg)\n        num_negs = paddle.stack(num_negs).expand_as(idx_rank)\n        neg_mask = (idx_rank.astype(num_negs.dtype) < num_negs).astype(conf_loss.dtype)\n\n        return (neg_mask + pos).astype('bool')\n\n    def 
forward(self, boxes, scores, gt_bbox, gt_label, prior_boxes):\n        boxes = paddle.concat(boxes, axis=1)\n        scores = paddle.concat(scores, axis=1)\n        gt_label = gt_label.unsqueeze(-1).astype('int64')\n        prior_boxes = paddle.concat(prior_boxes, axis=0)\n        bg_index = scores.shape[-1] - 1\n\n        # Match bbox and get targets.\n        targets_bbox, targets_label = \\\n            self._bipartite_match_for_batch(gt_bbox, gt_label, prior_boxes, bg_index)\n        targets_bbox.stop_gradient = True\n        targets_label.stop_gradient = True\n\n        # Compute regression loss.\n        # Select positive samples.\n        bbox_mask = paddle.tile(targets_label != bg_index, [1, 1, 4])\n        if bbox_mask.astype(boxes.dtype).sum() > 0:\n            location = paddle.masked_select(boxes, bbox_mask)\n            targets_bbox_tmp = paddle.masked_select(targets_bbox, bbox_mask)\n            loc_loss = F.smooth_l1_loss(location, targets_bbox_tmp, reduction='sum')\n            loc_loss = loc_loss * self.loc_loss_weight\n        else:\n            loc_loss = paddle.zeros([])\n\n        # Compute confidence loss.\n        conf_loss = F.cross_entropy(scores, targets_label, reduction=\"none\")\n        # Mining hard examples.\n        label_mask = self._mine_hard_example(\n            conf_loss.squeeze(-1), targets_label.squeeze(-1), bg_index)\n        conf_loss = paddle.masked_select(conf_loss, label_mask.unsqueeze(-1))\n        conf_loss = conf_loss.sum() * self.conf_loss_weight\n\n        # Compute overall weighted loss.\n        normalizer = (targets_label != bg_index).astype('float32').sum().clip(\n            min=1)\n        loss = (conf_loss + loc_loss) / normalizer\n\n        return loss\n"
  },
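  {
    "path": "ppdet/modeling/losses/ssd_loss_demo.py",
    "content": "# Hypothetical demo file, not part of the upstream PaddleDetection repo.\n# SSDLoss consumes per-level lists of predicted deltas/scores plus prior\n# (anchor) boxes; every shape below is fabricated, with valid xyxy geometry\n# (x1 < x2, y1 < y2) so the delta encoding stays finite.\nimport paddle\n\nfrom ppdet.modeling.losses.ssd_loss import SSDLoss\n\npaddle.seed(0)\nloss_fn = SSDLoss(overlap_threshold=0.5, neg_pos_ratio=3.0)\n\nnum_priors, num_classes = 100, 80  # scores carry one extra background column\nboxes = [paddle.randn([2, num_priors, 4])]  # predicted box deltas\nscores = [paddle.randn([2, num_priors, num_classes + 1])]\nprior_boxes = [paddle.concat([paddle.rand([num_priors, 2]) * 50.,\n                              paddle.rand([num_priors, 2]) * 50. + 50.],\n                             axis=-1)]\ngt_bbox = paddle.concat([paddle.rand([2, 3, 2]) * 50.,\n                         paddle.rand([2, 3, 2]) * 50. + 50.], axis=-1)\ngt_label = paddle.randint(0, num_classes, [2, 3])\n\nprint(float(loss_fn(boxes, scores, gt_bbox, gt_label, prior_boxes)))\n"
  },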
  {
    "path": "ppdet/modeling/losses/supcontrast.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\n\nimport random\nfrom ppdet.core.workspace import register\n\n\n__all__ = ['SupContrast']\n\n\n@register\nclass SupContrast(nn.Layer):\n    __shared__ = [\n        'num_classes'\n    ]\n    def __init__(self, num_classes=80, temperature=2.5, sample_num=4096, thresh=0.75):\n        super(SupContrast, self).__init__()\n        self.num_classes = num_classes\n        self.temperature = temperature\n        self.sample_num = sample_num\n        self.thresh = thresh\n    def forward(self, features, labels, scores):\n        \n        assert features.shape[0] == labels.shape[0] == scores.shape[0]\n        positive_mask = (labels < self.num_classes)\n        positive_features, positive_labels, positive_scores = features[positive_mask], labels[positive_mask], \\\n                                                              scores[positive_mask]\n        \n        negative_mask = (labels == self.num_classes)\n        negative_features, negative_labels, negative_scores = features[negative_mask], labels[negative_mask], \\\n                                                              scores[negative_mask]\n        \n        N = negative_features.shape[0]\n        S = self.sample_num - positive_mask.sum()   \n        index = paddle.to_tensor(random.sample(range(N), int(S)), dtype='int32')\n\n        negative_features = paddle.index_select(x=negative_features, index=index, axis=0)\n        negative_labels = paddle.index_select(x=negative_labels, index=index, axis=0)\n        negative_scores = paddle.index_select(x=negative_scores, index=index, axis=0)\n        \n        features = paddle.concat([positive_features, negative_features], 0)\n        labels = paddle.concat([positive_labels, negative_labels], 0)\n        scores = paddle.concat([positive_scores, negative_scores], 0)\n\n        if len(labels.shape) == 1:\n            labels = labels.reshape([-1, 1])\n        label_mask = paddle.equal(labels, labels.T).detach()\n        similarity = (paddle.matmul(features, features.T) / self.temperature)\n\n        sim_row_max = paddle.max(similarity, axis=1, keepdim=True)\n        similarity = similarity - sim_row_max\n\n        logits_mask = paddle.ones_like(similarity).detach()\n        logits_mask.fill_diagonal_(0)\n\n        exp_sim = paddle.exp(similarity) * logits_mask\n        log_prob = similarity - paddle.log(exp_sim.sum(axis=1, keepdim=True))\n\n        per_label_log_prob = (log_prob * logits_mask * label_mask).sum(1) / label_mask.sum(1)\n        keep = scores > self.thresh\n        per_label_log_prob = per_label_log_prob[keep]\n        loss = -per_label_log_prob\n\n        return loss.mean()"
  },
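  {
    "path": "ppdet/modeling/losses/supcontrast_demo.py",
    "content": "# Hypothetical demo file, not part of the upstream PaddleDetection repo.\n# SupContrast expects per-RoI embeddings, class labels (num_classes marks\n# background) and confidence scores. sample_num must not exceed the number of\n# RoIs: the forward pass keeps all positives and randomly subsamples\n# negatives up to sample_num in total. All values below are fabricated.\nimport paddle\nimport paddle.nn.functional as F\n\nfrom ppdet.modeling.losses.supcontrast import SupContrast\n\npaddle.seed(0)\nnum_classes = 80\nloss_fn = SupContrast(num_classes=num_classes, temperature=2.5,\n                      sample_num=16, thresh=0.75)\n\nfeats = F.normalize(paddle.randn([64, 128]), axis=1)\nlabels = paddle.full([64], num_classes, dtype='int64')  # all background...\nlabels[:8] = paddle.to_tensor([0, 0, 1, 1, 2, 2, 3, 3])  # ...plus 8 positives\nscores = paddle.full([64], 0.9)  # above thresh, so nothing is filtered out\n\nprint(float(loss_fn(feats, labels, scores)))\n"
  },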
  {
    "path": "ppdet/modeling/losses/varifocal_loss.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\n# The code is based on:\n# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/varifocal_loss.py\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register, serializable\nfrom ppdet.modeling import ops\n# from paddle.base.framework import in_dygraph_mode\n__all__ = ['VarifocalLoss']\n\n\ndef varifocal_loss(pred,\n                   target,\n                   alpha=0.75,\n                   gamma=2.0,\n                   iou_weighted=True,\n                   use_sigmoid=True):\n    \"\"\"`Varifocal Loss <https://arxiv.org/abs/2008.13367>`_\n\n    Args:\n        pred (Tensor): The prediction with shape (N, C), C is the\n            number of classes\n        target (Tensor): The learning target of the iou-aware\n            classification score with shape (N, C), C is the number of classes.\n        alpha (float, optional): A balance factor for the negative part of\n            Varifocal Loss, which is different from the alpha of Focal Loss.\n            Defaults to 0.75.\n        gamma (float, optional): The gamma for calculating the modulating\n            factor. Defaults to 2.0.\n        iou_weighted (bool, optional): Whether to weight the loss of the\n            positive example with the iou target. 
Defaults to True.\n    \"\"\"\n    # pred and target should be of the same size\n    assert len(pred.shape) == len(target.shape) # rank\n    # if in_dygraph_mode():\n    #     assert pred.shape == target.shape\n    if use_sigmoid:\n        pred_new = F.sigmoid(pred)\n    else:\n        pred_new = pred\n    target = target.cast(pred.dtype)\n    if iou_weighted:\n        focal_weight = target * (target > 0.0).cast('float32') + \\\n            alpha * (pred_new - target).abs().pow(gamma) * \\\n            (target <= 0.0).cast('float32')\n    else:\n        focal_weight = (target > 0.0).cast('float32') + \\\n            alpha * (pred_new - target).abs().pow(gamma) * \\\n            (target <= 0.0).cast('float32')\n\n    if use_sigmoid:\n        loss = F.binary_cross_entropy_with_logits(\n            pred, target, reduction='none') * focal_weight\n    else:\n        loss = F.binary_cross_entropy(\n            pred, target, reduction='none') * focal_weight\n        loss = loss.sum(axis=1)\n    return loss\n\n\n@register\n@serializable\nclass VarifocalLoss(nn.Layer):\n    def __init__(self,\n                 use_sigmoid=True,\n                 alpha=0.75,\n                 gamma=2.0,\n                 iou_weighted=True,\n                 reduction='mean',\n                 loss_weight=1.0):\n        \"\"\"`Varifocal Loss <https://arxiv.org/abs/2008.13367>`_\n\n        Args:\n            use_sigmoid (bool, optional): Whether the prediction is\n                used for sigmoid or softmax. Defaults to True.\n            alpha (float, optional): A balance factor for the negative part of\n                Varifocal Loss, which is different from the alpha of Focal\n                Loss. Defaults to 0.75.\n            gamma (float, optional): The gamma for calculating the modulating\n                factor. Defaults to 2.0.\n            iou_weighted (bool, optional): Whether to weight the loss of the\n                positive examples with the iou target. Defaults to True.\n            reduction (str, optional): The method used to reduce the loss into\n                a scalar. Defaults to 'mean'. Options are \"none\", \"mean\" and\n                \"sum\".\n            loss_weight (float, optional): Weight of loss. Defaults to 1.0.\n        \"\"\"\n        super(VarifocalLoss, self).__init__()\n        assert alpha >= 0.0\n        self.use_sigmoid = use_sigmoid\n        self.alpha = alpha\n        self.gamma = gamma\n        self.iou_weighted = iou_weighted\n        self.reduction = reduction\n        self.loss_weight = loss_weight\n\n    def forward(self, pred, target, weight=None, avg_factor=None):\n        \"\"\"Forward function.\n\n        Args:\n            pred (Tensor): The prediction.\n            target (Tensor): The learning target of the prediction.\n            weight (Tensor, optional): The weight of loss for each\n                prediction. Defaults to None.\n            avg_factor (int, optional): Average factor that is used to average\n                the loss. 
Defaults to None.\n        Returns:\n            Tensor: The calculated loss\n        \"\"\"\n        loss = self.loss_weight * varifocal_loss(\n            pred,\n            target,\n            alpha=self.alpha,\n            gamma=self.gamma,\n            iou_weighted=self.iou_weighted,\n            use_sigmoid=self.use_sigmoid)\n\n        if weight is not None:\n            loss = loss * weight\n        if avg_factor is None:\n            if self.reduction == 'none':\n                return loss\n            elif self.reduction == 'mean':\n                return loss.mean()\n            elif self.reduction == 'sum':\n                return loss.sum()\n        else:\n            # if reduction is mean, then average the loss by avg_factor\n            if self.reduction == 'mean':\n                loss = loss.sum() / avg_factor\n            # if reduction is 'none', then do nothing, otherwise raise an error\n            elif self.reduction != 'none':\n                raise ValueError(\n                    'avg_factor can not be used with reduction=\"sum\"')\n        return loss\n"
  },
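  {
    "path": "ppdet/modeling/losses/varifocal_loss_demo.py",
    "content": "# Hypothetical demo file, not part of the upstream PaddleDetection repo.\n# The Varifocal Loss target is the IoU-aware classification score: the IoU of\n# the predicted box for the ground-truth class, 0 for all other classes.\nimport paddle\n\nfrom ppdet.modeling.losses.varifocal_loss import VarifocalLoss\n\npaddle.seed(0)\nloss_fn = VarifocalLoss(alpha=0.75, gamma=2.0, reduction='mean')\n\npred = paddle.randn([4, 80])  # raw logits: 4 samples, 80 classes\ntarget = paddle.zeros([4, 80])\ntarget[0, 3] = 0.9  # sample 0 is a positive of class 3 with IoU 0.9\ntarget[1, 10] = 0.6  # sample 1 is a positive of class 10 with IoU 0.6\n\nprint(float(loss_fn(pred, target)))\n# with avg_factor, the summed loss is divided by the given normalizer instead\nprint(float(loss_fn(pred, target, avg_factor=2.0)))\n"
  },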
  {
    "path": "ppdet/modeling/losses/yolo_loss.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register\n\nfrom ..bbox_utils import decode_yolo, xywh2xyxy, batch_iou_similarity\n\n__all__ = ['YOLOv3Loss']\n\n\ndef bbox_transform(pbox, anchor, downsample):\n    pbox = decode_yolo(pbox, anchor, downsample)\n    pbox = xywh2xyxy(pbox)\n    return pbox\n\n\n@register\nclass YOLOv3Loss(nn.Layer):\n\n    __inject__ = ['iou_loss', 'iou_aware_loss']\n    __shared__ = ['num_classes']\n\n    def __init__(self,\n                 num_classes=80,\n                 ignore_thresh=0.7,\n                 label_smooth=False,\n                 downsample=[32, 16, 8],\n                 scale_x_y=1.,\n                 iou_loss=None,\n                 iou_aware_loss=None):\n        \"\"\"\n        YOLOv3Loss layer\n\n        Args:\n            num_calsses (int): number of foreground classes\n            ignore_thresh (float): threshold to ignore confidence loss\n            label_smooth (bool): whether to use label smoothing\n            downsample (list): downsample ratio for each detection block\n            scale_x_y (float): scale_x_y factor\n            iou_loss (object): IoULoss instance\n            iou_aware_loss (object): IouAwareLoss instance  \n        \"\"\"\n        super(YOLOv3Loss, self).__init__()\n        self.num_classes = num_classes\n        self.ignore_thresh = ignore_thresh\n        self.label_smooth = label_smooth\n        self.downsample = downsample\n        self.scale_x_y = scale_x_y\n        self.iou_loss = iou_loss\n        self.iou_aware_loss = iou_aware_loss\n        self.distill_pairs = []\n\n    def obj_loss(self, pbox, gbox, pobj, tobj, anchor, downsample):\n        # pbox\n        pbox = decode_yolo(pbox, anchor, downsample)\n        pbox = xywh2xyxy(pbox)\n        pbox = paddle.concat(pbox, axis=-1)\n        b = pbox.shape[0]\n        pbox = pbox.reshape((b, -1, 4))\n        # gbox\n        gxy = gbox[:, :, 0:2] - gbox[:, :, 2:4] * 0.5\n        gwh = gbox[:, :, 0:2] + gbox[:, :, 2:4] * 0.5\n        gbox = paddle.concat([gxy, gwh], axis=-1)\n\n        iou = batch_iou_similarity(pbox, gbox)\n        iou.stop_gradient = True\n        iou_max = iou.max(2)  # [N, M1]\n        iou_mask = paddle.cast(iou_max <= self.ignore_thresh, dtype=pbox.dtype)\n        iou_mask.stop_gradient = True\n\n        pobj = pobj.reshape((b, -1))\n        tobj = tobj.reshape((b, -1))\n        obj_mask = paddle.cast(tobj > 0, dtype=pbox.dtype)\n        obj_mask.stop_gradient = True\n\n        loss_obj = F.binary_cross_entropy_with_logits(\n            pobj, obj_mask, reduction='none')\n        loss_obj_pos = (loss_obj * tobj)\n        loss_obj_neg = (loss_obj * (1 - obj_mask) * iou_mask)\n        return loss_obj_pos + loss_obj_neg\n\n    def 
cls_loss(self, pcls, tcls):\n        if self.label_smooth:\n            delta = min(1. / self.num_classes, 1. / 40)\n            pos, neg = 1 - delta, delta\n            # 1 for positive, 0 for negative\n            tcls = pos * paddle.cast(\n                tcls > 0., dtype=tcls.dtype) + neg * paddle.cast(\n                    tcls <= 0., dtype=tcls.dtype)\n\n        loss_cls = F.binary_cross_entropy_with_logits(\n            pcls, tcls, reduction='none')\n        return loss_cls\n\n    def yolov3_loss(self, p, t, gt_box, anchor, downsample, scale=1.,\n                    eps=1e-10):\n        na = len(anchor)\n        b, c, h, w = p.shape\n        if self.iou_aware_loss:\n            ioup, p = p[:, 0:na, :, :], p[:, na:, :, :]\n            ioup = ioup.unsqueeze(-1)\n        p = p.reshape((b, na, -1, h, w)).transpose((0, 1, 3, 4, 2))\n        x, y = p[:, :, :, :, 0:1], p[:, :, :, :, 1:2]\n        w, h = p[:, :, :, :, 2:3], p[:, :, :, :, 3:4]\n        obj, pcls = p[:, :, :, :, 4:5], p[:, :, :, :, 5:]\n        self.distill_pairs.append([x, y, w, h, obj, pcls])\n\n        t = t.transpose((0, 1, 3, 4, 2))\n        tx, ty = t[:, :, :, :, 0:1], t[:, :, :, :, 1:2]\n        tw, th = t[:, :, :, :, 2:3], t[:, :, :, :, 3:4]\n        tscale = t[:, :, :, :, 4:5]\n        tobj, tcls = t[:, :, :, :, 5:6], t[:, :, :, :, 6:]\n\n        tscale_obj = tscale * tobj\n        loss = dict()\n\n        x = scale * F.sigmoid(x) - 0.5 * (scale - 1.)\n        y = scale * F.sigmoid(y) - 0.5 * (scale - 1.)\n\n        if abs(scale - 1.) < eps:\n            loss_x = F.binary_cross_entropy(x, tx, reduction='none')\n            loss_y = F.binary_cross_entropy(y, ty, reduction='none')\n            loss_xy = tscale_obj * (loss_x + loss_y)\n        else:\n            loss_x = paddle.abs(x - tx)\n            loss_y = paddle.abs(y - ty)\n            loss_xy = tscale_obj * (loss_x + loss_y)\n\n        loss_xy = loss_xy.sum([1, 2, 3, 4]).mean()\n\n        loss_w = paddle.abs(w - tw)\n        loss_h = paddle.abs(h - th)\n        loss_wh = tscale_obj * (loss_w + loss_h)\n        loss_wh = loss_wh.sum([1, 2, 3, 4]).mean()\n\n        loss['loss_xy'] = loss_xy\n        loss['loss_wh'] = loss_wh\n\n        if self.iou_loss is not None:\n            # warn: do not modify x, y, w, h in place\n            box, tbox = [x, y, w, h], [tx, ty, tw, th]\n            pbox = bbox_transform(box, anchor, downsample)\n            gbox = bbox_transform(tbox, anchor, downsample)\n            loss_iou = self.iou_loss(pbox, gbox)\n            loss_iou = loss_iou * tscale_obj\n            loss_iou = loss_iou.sum([1, 2, 3, 4]).mean()\n            loss['loss_iou'] = loss_iou\n\n        if self.iou_aware_loss is not None:\n            box, tbox = [x, y, w, h], [tx, ty, tw, th]\n            pbox = bbox_transform(box, anchor, downsample)\n            gbox = bbox_transform(tbox, anchor, downsample)\n            loss_iou_aware = self.iou_aware_loss(ioup, pbox, gbox)\n            loss_iou_aware = loss_iou_aware * tobj\n            loss_iou_aware = loss_iou_aware.sum([1, 2, 3, 4]).mean()\n            loss['loss_iou_aware'] = loss_iou_aware\n\n        box = [x, y, w, h]\n        loss_obj = self.obj_loss(box, gt_box, obj, tobj, anchor, downsample)\n        loss_obj = loss_obj.sum(-1).mean()\n        loss['loss_obj'] = loss_obj\n        loss_cls = self.cls_loss(pcls, tcls) * tobj\n        loss_cls = loss_cls.sum([1, 2, 3, 4]).mean()\n        loss['loss_cls'] = loss_cls\n        return loss\n\n    def forward(self, inputs, targets, anchors):\n        np = 
len(inputs)\n        gt_targets = [targets['target{}'.format(i)] for i in range(np)]\n        gt_box = targets['gt_bbox']\n        yolo_losses = dict()\n        self.distill_pairs.clear()\n        for x, t, anchor, downsample in zip(inputs, gt_targets, anchors,\n                                            self.downsample):\n            yolo_loss = self.yolov3_loss(\n                x.astype('float32'), t, gt_box, anchor, downsample,\n                self.scale_x_y)\n            for k, v in yolo_loss.items():\n                if k in yolo_losses:\n                    yolo_losses[k] += v\n                else:\n                    yolo_losses[k] = v\n\n        loss = 0\n        for k, v in yolo_losses.items():\n            loss += v\n\n        yolo_losses['loss'] = loss\n        return yolo_losses\n"
  },
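  {
    "path": "ppdet/modeling/losses/yolo_loss_demo.py",
    "content": "# Hypothetical demo file, not part of the upstream PaddleDetection repo.\n# Isolates YOLOv3Loss.cls_loss on fabricated logits to show the\n# label-smoothing branch: 0/1 targets become delta / 1 - delta with\n# delta = min(1 / num_classes, 1 / 40).\nimport paddle\n\nfrom ppdet.modeling.losses.yolo_loss import YOLOv3Loss\n\npaddle.seed(0)\nloss_fn = YOLOv3Loss(num_classes=80, label_smooth=True)\n\n# per-anchor class logits and sparse 0/1 targets: [b, na, h, w, num_classes]\npcls = paddle.randn([2, 3, 4, 4, 80])\ntcls = paddle.cast(paddle.rand([2, 3, 4, 4, 80]) > 0.95, 'float32')\n\nloss_cls = loss_fn.cls_loss(pcls, tcls)  # elementwise BCE, same shape\nprint(loss_cls.shape, float(loss_cls.mean()))\n"
  },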
  {
    "path": "ppdet/modeling/mot/__init__.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom . import matching\nfrom . import tracker\nfrom . import motion\nfrom . import visualization\nfrom . import utils\n\nfrom .matching import *\nfrom .tracker import *\nfrom .motion import *\nfrom .visualization import *\nfrom .utils import *\n"
  },
  {
    "path": "ppdet/modeling/mot/matching/__init__.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom . import jde_matching\nfrom . import deepsort_matching\nfrom . import ocsort_matching\n\nfrom .jde_matching import *\nfrom .deepsort_matching import *\nfrom .ocsort_matching import *\n"
  },
  {
    "path": "ppdet/modeling/mot/matching/deepsort_matching.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nThis code is based on https://github.com/nwojke/deep_sort/tree/master/deep_sort\n\"\"\"\n\nimport numpy as np\nfrom scipy.optimize import linear_sum_assignment\nfrom ..motion import kalman_filter\n\nINFTY_COST = 1e+5\n\n__all__ = [\n    'iou_1toN',\n    'iou_cost',\n    '_nn_euclidean_distance',\n    '_nn_cosine_distance',\n    'NearestNeighborDistanceMetric',\n    'min_cost_matching',\n    'matching_cascade',\n    'gate_cost_matrix',\n]\n\n\ndef iou_1toN(bbox, candidates):\n    \"\"\"\n    Computer intersection over union (IoU) by one box to N candidates.\n\n    Args:\n        bbox (ndarray): A bounding box in format `(top left x, top left y, width, height)`.\n            candidates (ndarray): A matrix of candidate bounding boxes (one per row) in the\n            same format as `bbox`.\n\n    Returns:\n        ious (ndarray): The intersection over union in [0, 1] between the `bbox`\n            and each candidate. A higher score means a larger fraction of the\n            `bbox` is occluded by the candidate.\n    \"\"\"\n    bbox_tl = bbox[:2]\n    bbox_br = bbox[:2] + bbox[2:]\n    candidates_tl = candidates[:, :2]\n    candidates_br = candidates[:, :2] + candidates[:, 2:]\n\n    tl = np.c_[np.maximum(bbox_tl[0], candidates_tl[:, 0])[:, np.newaxis],\n               np.maximum(bbox_tl[1], candidates_tl[:, 1])[:, np.newaxis]]\n    br = np.c_[np.minimum(bbox_br[0], candidates_br[:, 0])[:, np.newaxis],\n               np.minimum(bbox_br[1], candidates_br[:, 1])[:, np.newaxis]]\n    wh = np.maximum(0., br - tl)\n\n    area_intersection = wh.prod(axis=1)\n    area_bbox = bbox[2:].prod()\n    area_candidates = candidates[:, 2:].prod(axis=1)\n    ious = area_intersection / (area_bbox + area_candidates - area_intersection)\n    return ious\n\n\ndef iou_cost(tracks, detections, track_indices=None, detection_indices=None):\n    \"\"\"\n    IoU distance metric.\n\n    Args:\n        tracks (list[Track]): A list of tracks.\n        detections (list[Detection]): A list of detections.\n        track_indices (Optional[list[int]]): A list of indices to tracks that\n            should be matched. Defaults to all `tracks`.\n        detection_indices (Optional[list[int]]): A list of indices to detections\n            that should be matched. 
Defaults to all `detections`.\n\n    Returns:\n        cost_matrix (ndarray): A cost matrix of shape len(track_indices), \n            len(detection_indices) where entry (i, j) is \n            `1 - iou(tracks[track_indices[i]], detections[detection_indices[j]])`.\n    \"\"\"\n    if track_indices is None:\n        track_indices = np.arange(len(tracks))\n    if detection_indices is None:\n        detection_indices = np.arange(len(detections))\n\n    cost_matrix = np.zeros((len(track_indices), len(detection_indices)))\n    for row, track_idx in enumerate(track_indices):\n        if tracks[track_idx].time_since_update > 1:\n            cost_matrix[row, :] = 1e+5\n            continue\n\n        bbox = tracks[track_idx].to_tlwh()\n        candidates = np.asarray([detections[i].tlwh for i in detection_indices])\n        cost_matrix[row, :] = 1. - iou_1toN(bbox, candidates)\n    return cost_matrix\n\n\ndef _nn_euclidean_distance(s, q):\n    \"\"\"\n    Compute pair-wise squared (Euclidean) distance between points in `s` and `q`.\n\n    Args:\n        s (ndarray): Sample points: an NxM matrix of N samples of dimensionality M.\n        q (ndarray): Query points: an LxM matrix of L samples of dimensionality M.\n\n    Returns:\n        distances (ndarray): A vector of length L that contains for each entry in `q` the\n            smallest squared Euclidean distance to a sample in `s`.\n    \"\"\"\n    s, q = np.asarray(s), np.asarray(q)\n    if len(s) == 0 or len(q) == 0:\n        return np.zeros((len(s), len(q)))\n    s2, q2 = np.square(s).sum(axis=1), np.square(q).sum(axis=1)\n    distances = -2. * np.dot(s, q.T) + s2[:, None] + q2[None, :]\n    distances = np.clip(distances, 0., float(np.inf))\n\n    return np.maximum(0.0, distances.min(axis=0))\n\n\ndef _nn_cosine_distance(s, q):\n    \"\"\"\n    Compute pair-wise cosine distance between points in `s` and `q`.\n\n    Args:\n        s (ndarray): Sample points: an NxM matrix of N samples of dimensionality M.\n        q (ndarray): Query points: an LxM matrix of L samples of dimensionality M.\n\n    Returns:\n        distances (ndarray): A vector of length L that contains for each entry in `q` the\n            smallest cosine distance to a sample in `s`.\n    \"\"\"\n    s = np.asarray(s) / np.linalg.norm(s, axis=1, keepdims=True)\n    q = np.asarray(q) / np.linalg.norm(q, axis=1, keepdims=True)\n    distances = 1. - np.dot(s, q.T)\n\n    return distances.min(axis=0)\n\n\nclass NearestNeighborDistanceMetric(object):\n    \"\"\"\n    A nearest neighbor distance metric that, for each target, returns\n    the closest distance to any sample that has been observed so far.\n\n    Args:\n        metric (str): Either \"euclidean\" or \"cosine\".\n        matching_threshold (float): The matching threshold. Samples with larger\n            distance are considered an invalid match.\n        budget (Optional[int]): If not None, fix samples per class to at most\n            this number. 
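For example, `budget=100` keeps only the 100\n            most recent feature vectors per identity. 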
Removes the oldest samples when the budget is reached.\n\n    Attributes: \n        samples (Dict[int -> List[ndarray]]): A dictionary that maps from target\n            identities to the list of samples that have been observed so far.\n    \"\"\"\n\n    def __init__(self, metric, matching_threshold, budget=None):\n        if metric == \"euclidean\":\n            self._metric = _nn_euclidean_distance\n        elif metric == \"cosine\":\n            self._metric = _nn_cosine_distance\n        else:\n            raise ValueError(\n                \"Invalid metric; must be either 'euclidean' or 'cosine'\")\n        self.matching_threshold = matching_threshold\n        self.budget = budget\n        self.samples = {}\n\n    def partial_fit(self, features, targets, active_targets):\n        \"\"\"\n        Update the distance metric with new data.\n\n        Args:\n            features (ndarray): An NxM matrix of N features of dimensionality M.\n            targets (ndarray): An integer array of associated target identities.\n            active_targets (List[int]): A list of targets that are currently\n                present in the scene.\n        \"\"\"\n        for feature, target in zip(features, targets):\n            self.samples.setdefault(target, []).append(feature)\n            if self.budget is not None:\n                self.samples[target] = self.samples[target][-self.budget:]\n        self.samples = {k: self.samples[k] for k in active_targets}\n\n    def distance(self, features, targets):\n        \"\"\"\n        Compute distance between features and targets.\n\n        Args:\n            features (ndarray): An NxM matrix of N features of dimensionality M.\n            targets (list[int]): A list of targets to match the given `features` against.\n\n        Returns:\n            cost_matrix (ndarray): a cost matrix of shape len(targets), len(features),\n                where element (i, j) contains the closest squared distance between\n                `targets[i]` and `features[j]`.\n        \"\"\"\n        cost_matrix = np.zeros((len(targets), len(features)))\n        for i, target in enumerate(targets):\n            cost_matrix[i, :] = self._metric(self.samples[target], features)\n        return cost_matrix\n\n\ndef min_cost_matching(distance_metric,\n                      max_distance,\n                      tracks,\n                      detections,\n                      track_indices=None,\n                      detection_indices=None):\n    \"\"\"\n    Solve linear assignment problem.\n\n    Args:\n        distance_metric :\n            Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray\n            The distance metric is given a list of tracks and detections as \n            well as a list of N track indices and M detection indices. The \n            metric should return the NxM dimensional cost matrix, where element\n            (i, j) is the association cost between the i-th track in the given\n            track indices and the j-th detection in the given detection_indices.\n        max_distance (float): Gating threshold. 
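For IoU matching this is typically\n            around 0.7, i.e. a minimum IoU overlap of 0.3. 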
Associations with cost larger\n            than this value are disregarded.\n        tracks (list[Track]): A list of predicted tracks at the current time\n            step.\n        detections (list[Detection]): A list of detections at the current time\n            step.\n        track_indices (list[int]): List of track indices that maps rows in\n            `cost_matrix` to tracks in `tracks`.\n        detection_indices (List[int]): List of detection indices that maps\n            columns in `cost_matrix` to detections in `detections`.\n\n    Returns:\n        A tuple (List[(int, int)], List[int], List[int]) with the following\n        three entries:\n            * A list of matched track and detection indices.\n            * A list of unmatched track indices.\n            * A list of unmatched detection indices.\n    \"\"\"\n    if track_indices is None:\n        track_indices = np.arange(len(tracks))\n    if detection_indices is None:\n        detection_indices = np.arange(len(detections))\n\n    if len(detection_indices) == 0 or len(track_indices) == 0:\n        return [], track_indices, detection_indices  # Nothing to match.\n\n    cost_matrix = distance_metric(tracks, detections, track_indices,\n                                  detection_indices)\n\n    cost_matrix[cost_matrix > max_distance] = max_distance + 1e-5\n    indices = linear_sum_assignment(cost_matrix)\n\n    matches, unmatched_tracks, unmatched_detections = [], [], []\n    for col, detection_idx in enumerate(detection_indices):\n        if col not in indices[1]:\n            unmatched_detections.append(detection_idx)\n    for row, track_idx in enumerate(track_indices):\n        if row not in indices[0]:\n            unmatched_tracks.append(track_idx)\n    for row, col in zip(indices[0], indices[1]):\n        track_idx = track_indices[row]\n        detection_idx = detection_indices[col]\n        if cost_matrix[row, col] > max_distance:\n            unmatched_tracks.append(track_idx)\n            unmatched_detections.append(detection_idx)\n        else:\n            matches.append((track_idx, detection_idx))\n    return matches, unmatched_tracks, unmatched_detections\n\n\ndef matching_cascade(distance_metric,\n                     max_distance,\n                     cascade_depth,\n                     tracks,\n                     detections,\n                     track_indices=None,\n                     detection_indices=None):\n    \"\"\"\n    Run matching cascade.\n\n    Args:\n        distance_metric :\n            Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray\n            The distance metric is given a list of tracks and detections as \n            well as a list of N track indices and M detection indices. The \n            metric should return the NxM dimensional cost matrix, where element\n            (i, j) is the association cost between the i-th track in the given\n            track indices and the j-th detection in the given detection_indices.\n        max_distance (float): Gating threshold. 
Associations with cost larger\n            than this value are disregarded.\n        cascade_depth (int): The cascade depth, should be set to the maximum\n            track age.\n        tracks (list[Track]): A list of predicted tracks at the current time\n            step.\n        detections (list[Detection]): A list of detections at the current time\n            step.\n        track_indices (list[int]): List of track indices that maps rows in\n            `cost_matrix` to tracks in `tracks`.\n        detection_indices (List[int]): List of detection indices that maps\n            columns in `cost_matrix` to detections in `detections`.\n\n    Returns:\n        A tuple (List[(int, int)], List[int], List[int]) with the following\n        three entries:\n            * A list of matched track and detection indices.\n            * A list of unmatched track indices.\n            * A list of unmatched detection indices.\n    \"\"\"\n    if track_indices is None:\n        track_indices = list(range(len(tracks)))\n    if detection_indices is None:\n        detection_indices = list(range(len(detections)))\n\n    unmatched_detections = detection_indices\n    matches = []\n    for level in range(cascade_depth):\n        if len(unmatched_detections) == 0:  # No detections left\n            break\n\n        track_indices_l = [\n            k for k in track_indices if tracks[k].time_since_update == 1 + level\n        ]\n        if len(track_indices_l) == 0:  # Nothing to match at this level\n            continue\n\n        matches_l, _, unmatched_detections = \\\n            min_cost_matching(\n                distance_metric, max_distance, tracks, detections,\n                track_indices_l, unmatched_detections)\n        matches += matches_l\n    unmatched_tracks = list(set(track_indices) - set(k for k, _ in matches))\n    return matches, unmatched_tracks, unmatched_detections\n\n\ndef gate_cost_matrix(kf,\n                     cost_matrix,\n                     tracks,\n                     detections,\n                     track_indices,\n                     detection_indices,\n                     gated_cost=INFTY_COST,\n                     only_position=False):\n    \"\"\"\n    Invalidate infeasible entries in cost matrix based on the state\n    distributions obtained by Kalman filtering.\n\n    Args:\n        kf (object): The Kalman filter.\n        cost_matrix (ndarray): The NxM dimensional cost matrix, where N is the\n            number of track indices and M is the number of detection indices,\n            such that entry (i, j) is the association cost between\n            `tracks[track_indices[i]]` and `detections[detection_indices[j]]`.\n        tracks (list[Track]): A list of predicted tracks at the current time\n            step.\n        detections (list[Detection]): A list of detections at the current time\n            step.\n        track_indices (List[int]): List of track indices that maps rows in\n            `cost_matrix` to tracks in `tracks`.\n        detection_indices (List[int]): List of detection indices that maps\n            columns in `cost_matrix` to detections in `detections`.\n        gated_cost (Optional[float]): Entries in the cost matrix corresponding\n            to infeasible associations are set to this value. Defaults to a\n            very large value.\n        only_position (Optional[bool]): If True, only the x, y position of the\n            state distribution is considered during gating. 
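The gate then uses\n            `chi2inv95[2]` instead of `chi2inv95[4]`. 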
Default False.\n    \"\"\"\n    gating_dim = 2 if only_position else 4\n    gating_threshold = kalman_filter.chi2inv95[gating_dim]\n    measurements = np.asarray(\n        [detections[i].to_xyah() for i in detection_indices])\n    for row, track_idx in enumerate(track_indices):\n        track = tracks[track_idx]\n        gating_distance = kf.gating_distance(track.mean, track.covariance,\n                                             measurements, only_position)\n        cost_matrix[row, gating_distance > gating_threshold] = gated_cost\n    return cost_matrix\n"
  },
  {
    "path": "ppdet/modeling/mot/matching/jde_matching.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\r\n#\r\n# Licensed under the Apache License, Version 2.0 (the \"License\");\r\n# you may not use this file except in compliance with the License.\r\n# You may obtain a copy of the License at\r\n#\r\n#     http://www.apache.org/licenses/LICENSE-2.0\r\n#\r\n# Unless required by applicable law or agreed to in writing, software\r\n# distributed under the License is distributed on an \"AS IS\" BASIS,\r\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n# See the License for the specific language governing permissions and\r\n# limitations under the License.\r\n\"\"\"\r\nThis code is based on https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/matching.py\r\n\"\"\"\r\n\r\ntry:\r\n    import lap\r\nexcept:\r\n    print(\r\n        'Warning: Unable to use JDE/FairMOT/ByteTrack, please install lap, for example: `pip install lap`, see https://github.com/gatagat/lap'\r\n    )\r\n    pass\r\n\r\nimport scipy\r\nimport numpy as np\r\nfrom scipy.spatial.distance import cdist\r\nfrom ..motion import kalman_filter\r\nimport warnings\r\nwarnings.filterwarnings(\"ignore\")\r\n\r\n__all__ = [\r\n    'merge_matches',\r\n    'linear_assignment',\r\n    'bbox_ious',\r\n    'iou_distance',\r\n    'embedding_distance',\r\n    'fuse_motion',\r\n]\r\n\r\n\r\ndef merge_matches(m1, m2, shape):\r\n    O, P, Q = shape\r\n    m1 = np.asarray(m1)\r\n    m2 = np.asarray(m2)\r\n\r\n    M1 = scipy.sparse.coo_matrix(\r\n        (np.ones(len(m1)), (m1[:, 0], m1[:, 1])), shape=(O, P))\r\n    M2 = scipy.sparse.coo_matrix(\r\n        (np.ones(len(m2)), (m2[:, 0], m2[:, 1])), shape=(P, Q))\r\n\r\n    mask = M1 * M2\r\n    match = mask.nonzero()\r\n    match = list(zip(match[0], match[1]))\r\n    unmatched_O = tuple(set(range(O)) - set([i for i, j in match]))\r\n    unmatched_Q = tuple(set(range(Q)) - set([j for i, j in match]))\r\n\r\n    return match, unmatched_O, unmatched_Q\r\n\r\n\r\ndef linear_assignment(cost_matrix, thresh):\r\n    try:\r\n        import lap\r\n    except Exception as e:\r\n        raise RuntimeError(\r\n            'Unable to use JDE/FairMOT/ByteTrack, please install lap, for example: `pip install lap`, see https://github.com/gatagat/lap'\r\n        )\r\n    if cost_matrix.size == 0:\r\n        return np.empty(\r\n            (0, 2), dtype=int), tuple(range(cost_matrix.shape[0])), tuple(\r\n                range(cost_matrix.shape[1]))\r\n    matches, unmatched_a, unmatched_b = [], [], []\r\n    cost, x, y = lap.lapjv(cost_matrix, extend_cost=True, cost_limit=thresh)\r\n    for ix, mx in enumerate(x):\r\n        if mx >= 0:\r\n            matches.append([ix, mx])\r\n    unmatched_a = np.where(x < 0)[0]\r\n    unmatched_b = np.where(y < 0)[0]\r\n    matches = np.asarray(matches)\r\n    return matches, unmatched_a, unmatched_b\r\n\r\n\r\ndef bbox_ious(atlbrs, btlbrs):\r\n    boxes = np.ascontiguousarray(atlbrs, dtype=np.float32)\r\n    query_boxes = np.ascontiguousarray(btlbrs, dtype=np.float32)\r\n    N = boxes.shape[0]\r\n    K = query_boxes.shape[0]\r\n    ious = np.zeros((N, K), dtype=boxes.dtype)\r\n    if N * K == 0:\r\n        return ious\r\n\r\n    for k in range(K):\r\n        box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + 1) *\r\n                    (query_boxes[k, 3] - query_boxes[k, 1] + 1))\r\n        for n in range(N):\r\n            iw = (min(boxes[n, 2], query_boxes[k, 2]) - max(\r\n                boxes[n, 0], query_boxes[k, 0]) + 1)\r\n            if iw > 0:\r\n   
             ih = (min(boxes[n, 3], query_boxes[k, 3]) - max(\r\n                    boxes[n, 1], query_boxes[k, 1]) + 1)\r\n                if ih > 0:\r\n                    ua = float((boxes[n, 2] - boxes[n, 0] + 1) * (boxes[\r\n                        n, 3] - boxes[n, 1] + 1) + box_area - iw * ih)\r\n                    ious[n, k] = iw * ih / ua\r\n    return ious\r\n\r\n\r\ndef iou_distance(atracks, btracks):\r\n    \"\"\"\r\n    Compute cost based on IoU between two list[STrack].\r\n    \"\"\"\r\n    if (len(atracks) > 0 and isinstance(atracks[0], np.ndarray)) or (\r\n            len(btracks) > 0 and isinstance(btracks[0], np.ndarray)):\r\n        atlbrs = atracks\r\n        btlbrs = btracks\r\n    else:\r\n        atlbrs = [track.tlbr for track in atracks]\r\n        btlbrs = [track.tlbr for track in btracks]\r\n    _ious = bbox_ious(atlbrs, btlbrs)\r\n    cost_matrix = 1 - _ious\r\n\r\n    return cost_matrix\r\n\r\n\r\ndef embedding_distance(tracks, detections, metric='euclidean'):\r\n    \"\"\"\r\n    Compute cost based on features between two list[STrack].\r\n    \"\"\"\r\n    cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float32)\r\n    if cost_matrix.size == 0:\r\n        return cost_matrix\r\n    det_features = np.asarray(\r\n        [track.curr_feat for track in detections], dtype=np.float32)\r\n    track_features = np.asarray(\r\n        [track.smooth_feat for track in tracks], dtype=np.float32)\r\n    cost_matrix = np.maximum(0.0, cdist(track_features, det_features,\r\n                                        metric))  # Normalized features\r\n    return cost_matrix\r\n\r\n\r\ndef fuse_motion(kf,\r\n                cost_matrix,\r\n                tracks,\r\n                detections,\r\n                only_position=False,\r\n                lambda_=0.98):\r\n    if cost_matrix.size == 0:\r\n        return cost_matrix\r\n    gating_dim = 2 if only_position else 4\r\n    gating_threshold = kalman_filter.chi2inv95[gating_dim]\r\n    measurements = np.asarray([det.to_xyah() for det in detections])\r\n    for row, track in enumerate(tracks):\r\n        gating_distance = kf.gating_distance(\r\n            track.mean,\r\n            track.covariance,\r\n            measurements,\r\n            only_position,\r\n            metric='maha')\r\n        cost_matrix[row, gating_distance > gating_threshold] = np.inf\r\n        cost_matrix[row] = lambda_ * cost_matrix[row] + (1 - lambda_\r\n                                                         ) * gating_distance\r\n    return cost_matrix\r\n"
  },
  {
    "path": "ppdet/modeling/mot/matching/ocsort_matching.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nThis code is based on https://github.com/noahcao/OC_SORT/blob/master/trackers/ocsort_tracker/association.py\n\"\"\"\n\nimport os\nimport numpy as np\n\n\ndef iou_batch(bboxes1, bboxes2):\n    bboxes2 = np.expand_dims(bboxes2, 0)\n    bboxes1 = np.expand_dims(bboxes1, 1)\n\n    xx1 = np.maximum(bboxes1[..., 0], bboxes2[..., 0])\n    yy1 = np.maximum(bboxes1[..., 1], bboxes2[..., 1])\n    xx2 = np.minimum(bboxes1[..., 2], bboxes2[..., 2])\n    yy2 = np.minimum(bboxes1[..., 3], bboxes2[..., 3])\n    w = np.maximum(0., xx2 - xx1)\n    h = np.maximum(0., yy2 - yy1)\n    area = w * h\n    iou_matrix = area / ((bboxes1[..., 2] - bboxes1[..., 0]) *\n                         (bboxes1[..., 3] - bboxes1[..., 1]) +\n                         (bboxes2[..., 2] - bboxes2[..., 0]) *\n                         (bboxes2[..., 3] - bboxes2[..., 1]) - area)\n    return iou_matrix\n\n\ndef speed_direction_batch(dets, tracks):\n    tracks = tracks[..., np.newaxis]\n    CX1, CY1 = (dets[:, 0] + dets[:, 2]) / 2.0, (dets[:, 1] + dets[:, 3]) / 2.0\n    CX2, CY2 = (tracks[:, 0] + tracks[:, 2]) / 2.0, (\n        tracks[:, 1] + tracks[:, 3]) / 2.0\n    dx = CX1 - CX2\n    dy = CY1 - CY2\n    norm = np.sqrt(dx**2 + dy**2) + 1e-6\n    dx = dx / norm\n    dy = dy / norm\n    return dy, dx\n\n\ndef linear_assignment(cost_matrix):\n    try:\n        import lap\n        _, x, y = lap.lapjv(cost_matrix, extend_cost=True)\n        return np.array([[y[i], i] for i in x if i >= 0])\n    except ImportError:\n        from scipy.optimize import linear_sum_assignment\n        x, y = linear_sum_assignment(cost_matrix)\n        return np.array(list(zip(x, y)))\n\n\ndef associate(detections, trackers, iou_threshold, velocities, previous_obs,\n              vdc_weight):\n    if (len(trackers) == 0):\n        return np.empty(\n            (0, 2), dtype=int), np.arange(len(detections)), np.empty(\n                (0, 5), dtype=int)\n\n    Y, X = speed_direction_batch(detections, previous_obs)\n    inertia_Y, inertia_X = velocities[:, 0], velocities[:, 1]\n    inertia_Y = np.repeat(inertia_Y[:, np.newaxis], Y.shape[1], axis=1)\n    inertia_X = np.repeat(inertia_X[:, np.newaxis], X.shape[1], axis=1)\n    diff_angle_cos = inertia_X * X + inertia_Y * Y\n    diff_angle_cos = np.clip(diff_angle_cos, a_min=-1, a_max=1)\n    diff_angle = np.arccos(diff_angle_cos)\n    diff_angle = (np.pi / 2.0 - np.abs(diff_angle)) / np.pi\n\n    valid_mask = np.ones(previous_obs.shape[0])\n    valid_mask[np.where(previous_obs[:, 4] < 0)] = 0\n\n    iou_matrix = iou_batch(detections, trackers)\n    scores = np.repeat(\n        detections[:, -1][:, np.newaxis], trackers.shape[0], axis=1)\n    # iou_matrix = iou_matrix * scores # a trick sometiems works, we don't encourage this\n    valid_mask = np.repeat(valid_mask[:, np.newaxis], X.shape[1], axis=1)\n\n    angle_diff_cost = (valid_mask * diff_angle) * vdc_weight\n    
angle_diff_cost = angle_diff_cost.T\n    angle_diff_cost = angle_diff_cost * scores\n\n    if min(iou_matrix.shape) > 0:\n        a = (iou_matrix > iou_threshold).astype(np.int32)\n        if a.sum(1).max() == 1 and a.sum(0).max() == 1:\n            matched_indices = np.stack(np.where(a), axis=1)\n        else:\n            matched_indices = linear_assignment(-(iou_matrix + angle_diff_cost))\n    else:\n        matched_indices = np.empty(shape=(0, 2))\n\n    unmatched_detections = []\n    for d, det in enumerate(detections):\n        if (d not in matched_indices[:, 0]):\n            unmatched_detections.append(d)\n    unmatched_trackers = []\n    for t, trk in enumerate(trackers):\n        if (t not in matched_indices[:, 1]):\n            unmatched_trackers.append(t)\n\n    # filter out matched with low IOU\n    matches = []\n    for m in matched_indices:\n        if (iou_matrix[m[0], m[1]] < iou_threshold):\n            unmatched_detections.append(m[0])\n            unmatched_trackers.append(m[1])\n        else:\n            matches.append(m.reshape(1, 2))\n    if (len(matches) == 0):\n        matches = np.empty((0, 2), dtype=int)\n    else:\n        matches = np.concatenate(matches, axis=0)\n\n    return matches, np.array(unmatched_detections), np.array(unmatched_trackers)\n\n\ndef associate_only_iou(detections, trackers, iou_threshold):\n    if (len(trackers) == 0):\n        return np.empty(\n            (0, 2), dtype=int), np.arange(len(detections)), np.empty(\n                (0, 5), dtype=int)\n\n    iou_matrix = iou_batch(detections, trackers)\n\n    if min(iou_matrix.shape) > 0:\n        a = (iou_matrix > iou_threshold).astype(np.int32)\n        if a.sum(1).max() == 1 and a.sum(0).max() == 1:\n            matched_indices = np.stack(np.where(a), axis=1)\n        else:\n            matched_indices = linear_assignment(-iou_matrix)\n    else:\n        matched_indices = np.empty(shape=(0, 2))\n\n    unmatched_detections = []\n    for d, det in enumerate(detections):\n        if (d not in matched_indices[:, 0]):\n            unmatched_detections.append(d)\n    unmatched_trackers = []\n    for t, trk in enumerate(trackers):\n        if (t not in matched_indices[:, 1]):\n            unmatched_trackers.append(t)\n\n    # filter out matched with low IOU\n    matches = []\n    for m in matched_indices:\n        if (iou_matrix[m[0], m[1]] < iou_threshold):\n            unmatched_detections.append(m[0])\n            unmatched_trackers.append(m[1])\n        else:\n            matches.append(m.reshape(1, 2))\n    if (len(matches) == 0):\n        matches = np.empty((0, 2), dtype=int)\n    else:\n        matches = np.concatenate(matches, axis=0)\n    return matches, np.array(unmatched_detections), np.array(unmatched_trackers)\n"
  },
  {
    "path": "ppdet/modeling/mot/motion/__init__.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom . import kalman_filter\n\nfrom .kalman_filter import *\nfrom .gmc import *"
  },
  {
    "path": "ppdet/modeling/mot/motion/gmc.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nThis code is based on https://github.com/WWangYuHsiang/SMILEtrack/blob/main/BoT-SORT/tracker/gmc.py\n\"\"\"\n\nimport cv2\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport copy\nimport time\nfrom ppdet.core.workspace import register, serializable\n\n\n@register\n@serializable\nclass GMC:\n    def __init__(self, method='sparseOptFlow', downscale=2, verbose=None):\n        super(GMC, self).__init__()\n\n        self.method = method\n        self.downscale = max(1, int(downscale))\n\n        if self.method == 'orb':\n            self.detector = cv2.FastFeatureDetector_create(20)\n            self.extractor = cv2.ORB_create()\n            self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING)\n\n        elif self.method == 'sift':\n            self.detector = cv2.SIFT_create(\n                nOctaveLayers=3, contrastThreshold=0.02, edgeThreshold=20)\n            self.extractor = cv2.SIFT_create(\n                nOctaveLayers=3, contrastThreshold=0.02, edgeThreshold=20)\n            self.matcher = cv2.BFMatcher(cv2.NORM_L2)\n\n        elif self.method == 'ecc':\n            number_of_iterations = 5000\n            termination_eps = 1e-6\n            self.warp_mode = cv2.MOTION_EUCLIDEAN\n            self.criteria = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT,\n                             number_of_iterations, termination_eps)\n\n        elif self.method == 'sparseOptFlow':\n            self.feature_params = dict(\n                maxCorners=1000,\n                qualityLevel=0.01,\n                minDistance=1,\n                blockSize=3,\n                useHarrisDetector=False,\n                k=0.04)\n            # self.gmc_file = open('GMC_results.txt', 'w')\n\n        elif self.method == 'file' or self.method == 'files':\n            seqName = verbose[0]\n            ablation = verbose[1]\n            if ablation:\n                filePath = r'tracker/GMC_files/MOT17_ablation'\n            else:\n                filePath = r'tracker/GMC_files/MOTChallenge'\n\n            if '-FRCNN' in seqName:\n                seqName = seqName[:-6]\n            elif '-DPM' in seqName:\n                seqName = seqName[:-4]\n            elif '-SDP' in seqName:\n                seqName = seqName[:-4]\n\n            self.gmcFile = open(filePath + \"/GMC-\" + seqName + \".txt\", 'r')\n\n            if self.gmcFile is None:\n                raise ValueError(\"Error: Unable to open GMC file in directory:\"\n                                 + filePath)\n        elif self.method == 'none' or self.method == 'None':\n            self.method = 'none'\n        else:\n            raise ValueError(\"Error: Unknown CMC method:\" + method)\n\n        self.prevFrame = None\n        self.prevKeyPoints = None\n        self.prevDescriptors = None\n\n        self.initializedFirstFrame = False\n\n    def apply(self, raw_frame, detections=None):\n        if self.method == 
'orb' or self.method == 'sift':\n            return self.applyFeatures(raw_frame, detections)\n        elif self.method == 'ecc':\n            return self.applyEcc(raw_frame, detections)\n        elif self.method == 'sparseOptFlow':\n            return self.applySparseOptFlow(raw_frame, detections)\n        elif self.method == 'file':\n            return self.applyFile(raw_frame, detections)\n        elif self.method == 'none':\n            return np.eye(2, 3)\n        else:\n            return np.eye(2, 3)\n\n    def applyEcc(self, raw_frame, detections=None):\n\n        # Initialize\n        height, width, _ = raw_frame.shape\n        frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2GRAY)\n        H = np.eye(2, 3, dtype=np.float32)\n\n        # Downscale image (TODO: consider using pyramids)\n        if self.downscale > 1.0:\n            frame = cv2.GaussianBlur(frame, (3, 3), 1.5)\n            frame = cv2.resize(frame, (width // self.downscale,\n                                       height // self.downscale))\n            width = width // self.downscale\n            height = height // self.downscale\n\n        # Handle first frame\n        if not self.initializedFirstFrame:\n            # Initialize data\n            self.prevFrame = frame.copy()\n\n            # Initialization done\n            self.initializedFirstFrame = True\n\n            return H\n\n        # Run the ECC algorithm. The results are stored in warp_matrix.\n        # (cc, H) = cv2.findTransformECC(self.prevFrame, frame, H, self.warp_mode, self.criteria)\n        try:\n            (cc,\n             H) = cv2.findTransformECC(self.prevFrame, frame, H, self.warp_mode,\n                                       self.criteria, None, 1)\n        except:\n            print('Warning: find transform failed. Set warp as identity')\n\n        return H\n\n
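    # Feature-based GMC: detect keypoints (FAST for 'orb', SIFT for 'sift'),\n    # match descriptors between consecutive frames, drop spatially\n    # inconsistent matches, then fit a partial affine transform.\n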
    def applyFeatures(self, raw_frame, detections=None):\n\n        # Initialize\n        height, width, _ = raw_frame.shape\n        frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2GRAY)\n        H = np.eye(2, 3)\n\n        # Downscale image (TODO: consider using pyramids)\n        if self.downscale > 1.0:\n            # frame = cv2.GaussianBlur(frame, (3, 3), 1.5)\n            frame = cv2.resize(frame, (width // self.downscale,\n                                       height // self.downscale))\n            width = width // self.downscale\n            height = height // self.downscale\n\n        # find the keypoints\n        mask = np.zeros_like(frame)\n        # mask[int(0.05 * height): int(0.95 * height), int(0.05 * width): int(0.95 * width)] = 255\n        mask[int(0.02 * height):int(0.98 * height), int(0.02 * width):int(\n            0.98 * width)] = 255\n        if detections is not None:\n            for det in detections:\n                tlbr = (det[:4] / self.downscale).astype(np.int_)\n                mask[tlbr[1]:tlbr[3], tlbr[0]:tlbr[2]] = 0\n\n        keypoints = self.detector.detect(frame, mask)\n\n        # compute the descriptors\n        keypoints, descriptors = self.extractor.compute(frame, keypoints)\n\n        # Handle first frame\n        if not self.initializedFirstFrame:\n            # Initialize data\n            self.prevFrame = frame.copy()\n            self.prevKeyPoints = copy.copy(keypoints)\n            self.prevDescriptors = copy.copy(descriptors)\n\n            # Initialization done\n            self.initializedFirstFrame = True\n\n            return H\n\n        # Match descriptors.\n        knnMatches = self.matcher.knnMatch(self.prevDescriptors, descriptors, 2)\n\n        # Filtered matches based on smallest spatial distance\n        matches = []\n        spatialDistances = []\n\n        maxSpatialDistance = 0.25 * np.array([width, height])\n\n        # Handle empty matches case\n        if len(knnMatches) == 0:\n            # Store to next iteration\n            self.prevFrame = frame.copy()\n            self.prevKeyPoints = copy.copy(keypoints)\n            self.prevDescriptors = copy.copy(descriptors)\n\n            return H\n\n        for m, n in knnMatches:\n            if m.distance < 0.9 * n.distance:\n                prevKeyPointLocation = self.prevKeyPoints[m.queryIdx].pt\n                currKeyPointLocation = keypoints[m.trainIdx].pt\n\n                spatialDistance = (\n                    prevKeyPointLocation[0] - currKeyPointLocation[0],\n                    prevKeyPointLocation[1] - currKeyPointLocation[1])\n\n                if (np.abs(spatialDistance[0]) < maxSpatialDistance[0]) and \\\n                        (np.abs(spatialDistance[1]) < maxSpatialDistance[1]):\n                    spatialDistances.append(spatialDistance)\n                    matches.append(m)\n\n        meanSpatialDistances = np.mean(spatialDistances, 0)\n        stdSpatialDistances = np.std(spatialDistances, 0)\n\n        inliers = (spatialDistances - meanSpatialDistances\n                   ) < 2.5 * stdSpatialDistances\n\n        goodMatches = []\n        prevPoints = []\n        currPoints = []\n        for i in range(len(matches)):\n            if inliers[i, 0] and inliers[i, 1]:\n                goodMatches.append(matches[i])\n                prevPoints.append(self.prevKeyPoints[matches[i].queryIdx].pt)\n                currPoints.append(keypoints[matches[i].trainIdx].pt)\n\n
        prevPoints = np.array(prevPoints)\n        currPoints = np.array(currPoints)\n\n        # Draw the keypoint matches on the output image\n        if 0:\n            matches_img = np.hstack((self.prevFrame, frame))\n            matches_img = cv2.cvtColor(matches_img, cv2.COLOR_GRAY2BGR)\n            W = np.size(self.prevFrame, 1)\n            for m in goodMatches:\n                prev_pt = np.array(\n                    self.prevKeyPoints[m.queryIdx].pt, dtype=np.int_)\n                curr_pt = np.array(keypoints[m.trainIdx].pt, dtype=np.int_)\n                curr_pt[0] += W\n                color = np.random.randint(0, 255, (3, ))\n                color = (int(color[0]), int(color[1]), int(color[2]))\n\n                matches_img = cv2.line(matches_img, prev_pt, curr_pt,\n                                       tuple(color), 1, cv2.LINE_AA)\n                matches_img = cv2.circle(matches_img, prev_pt, 2,\n                                         tuple(color), -1)\n                matches_img = cv2.circle(matches_img, curr_pt, 2,\n                                         tuple(color), -1)\n\n            plt.figure()\n            plt.imshow(matches_img)\n            plt.show()\n\n        # Find rigid matrix\n        if (np.size(prevPoints, 0) > 4) and (\n                np.size(prevPoints, 0) == np.size(currPoints, 0)):\n            H, inliers = cv2.estimateAffinePartial2D(prevPoints, currPoints,\n                                                     cv2.RANSAC)\n\n            # Handle downscale\n            if self.downscale > 1.0:\n                H[0, 2] *= self.downscale\n                H[1, 2] *= self.downscale\n        else:\n            print('Warning: not enough matching points')\n\n        # Store to next iteration\n        self.prevFrame = frame.copy()\n        self.prevKeyPoints = copy.copy(keypoints)\n        self.prevDescriptors = copy.copy(descriptors)\n\n        return H\n\n    def applySparseOptFlow(self, raw_frame, detections=None):\n\n        t0 = time.time()\n\n        # Initialize\n        height, width, _ = raw_frame.shape\n        frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2GRAY)\n        H = np.eye(2, 3)\n\n        # Downscale image\n        if self.downscale > 1.0:\n            # frame = cv2.GaussianBlur(frame, (3, 3), 1.5)\n            frame = cv2.resize(frame, (width // self.downscale,\n                                       height // self.downscale))\n\n        # find the keypoints\n        keypoints = cv2.goodFeaturesToTrack(\n            frame, mask=None, **self.feature_params)\n\n        # Handle first frame\n        if not self.initializedFirstFrame:\n            # Initialize data\n            self.prevFrame = frame.copy()\n            self.prevKeyPoints = copy.copy(keypoints)\n\n            # Initialization done\n            self.initializedFirstFrame = True\n\n            return H\n\n        if self.prevFrame.shape != frame.shape:\n            self.prevFrame = frame.copy()\n            self.prevKeyPoints = copy.copy(keypoints)\n            return H\n\n        # find correspondences\n        matchedKeypoints, status, err = cv2.calcOpticalFlowPyrLK(\n            self.prevFrame, frame, self.prevKeyPoints, None)\n\n        # leave good correspondences only\n        prevPoints = []\n        currPoints = []\n\n        for i in range(len(status)):\n            if status[i]:\n                prevPoints.append(self.prevKeyPoints[i])\n                currPoints.append(matchedKeypoints[i])\n\n        prevPoints = np.array(prevPoints)\n        currPoints = np.array(currPoints)\n\n
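        # estimateAffinePartial2D fits a 4-DoF transform (rotation, uniform\n        # scale, translation); RANSAC rejects inconsistent correspondences.\n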
        # Find rigid matrix\n        if (np.size(prevPoints, 0) > 4) and (\n                np.size(prevPoints, 0) == np.size(currPoints, 0)):\n            H, inliers = cv2.estimateAffinePartial2D(prevPoints, currPoints,\n                                                     cv2.RANSAC)\n\n            # Handle downscale\n            if self.downscale > 1.0:\n                H[0, 2] *= self.downscale\n                H[1, 2] *= self.downscale\n        else:\n            print('Warning: not enough matching points')\n\n        # Store to next iteration\n        self.prevFrame = frame.copy()\n        self.prevKeyPoints = copy.copy(keypoints)\n\n        t1 = time.time()\n\n        # gmc_line = str(1000 * (t1 - t0)) + \"\\t\" + str(H[0, 0]) + \"\\t\" + str(H[0, 1]) + \"\\t\" + str(\n        #     H[0, 2]) + \"\\t\" + str(H[1, 0]) + \"\\t\" + str(H[1, 1]) + \"\\t\" + str(H[1, 2]) + \"\\n\"\n        # self.gmc_file.write(gmc_line)\n\n        return H\n\n    def applyFile(self, raw_frame, detections=None):\n        line = self.gmcFile.readline()\n        tokens = line.split(\"\\t\")\n        H = np.eye(2, 3, dtype=np.float_)\n        H[0, 0] = float(tokens[1])\n        H[0, 1] = float(tokens[2])\n        H[0, 2] = float(tokens[3])\n        H[1, 0] = float(tokens[4])\n        H[1, 1] = float(tokens[5])\n        H[1, 2] = float(tokens[6])\n\n        return H\n"
  },
  {
    "path": "ppdet/modeling/mot/motion/kalman_filter.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\r\n#\r\n# Licensed under the Apache License, Version 2.0 (the \"License\");\r\n# you may not use this file except in compliance with the License.\r\n# You may obtain a copy of the License at\r\n#\r\n#     http://www.apache.org/licenses/LICENSE-2.0\r\n#\r\n# Unless required by applicable law or agreed to in writing, software\r\n# distributed under the License is distributed on an \"AS IS\" BASIS,\r\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n# See the License for the specific language governing permissions and\r\n# limitations under the License.\r\n\"\"\"\r\nThis code is based on https://github.com/nwojke/deep_sort/blob/master/deep_sort/kalman_filter.py\r\n\"\"\"\r\n\r\nimport numpy as np\r\nimport scipy.linalg\r\n\r\nuse_numba = True\r\ntry:\r\n    import numba as nb\r\n\r\n    @nb.njit(fastmath=True, cache=True)\r\n    def nb_project(mean, covariance, std, _update_mat):\r\n        innovation_cov = np.diag(np.square(std))\r\n        mean = np.dot(_update_mat, mean)\r\n        covariance = np.dot(np.dot(_update_mat, covariance), _update_mat.T)\r\n        return mean, covariance + innovation_cov\r\n\r\n    @nb.njit(fastmath=True, cache=True)\r\n    def nb_multi_predict(mean, covariance, motion_cov, motion_mat):\r\n        mean = np.dot(mean, motion_mat.T)\r\n        left = np.dot(motion_mat, covariance)\r\n        covariance = np.dot(left, motion_mat.T) + motion_cov\r\n        return mean, covariance\r\n\r\n    @nb.njit(fastmath=True, cache=True)\r\n    def nb_update(mean, covariance, proj_mean, proj_cov, measurement, meas_mat):\r\n        kalman_gain = np.linalg.solve(proj_cov, (covariance @meas_mat.T).T).T\r\n        innovation = measurement - proj_mean\r\n        mean = mean + innovation @kalman_gain.T\r\n        covariance = covariance - kalman_gain @proj_cov @kalman_gain.T\r\n        return mean, covariance\r\n\r\nexcept:\r\n    use_numba = False\r\n    print(\r\n        'Warning: Unable to use numba in PP-Tracking, please install numba, for example(python3.7): `pip install numba==0.56.4`'\r\n    )\r\n    pass\r\n\r\n__all__ = ['KalmanFilter']\r\n\"\"\"\r\nTable for the 0.95 quantile of the chi-square distribution with N degrees of\r\nfreedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv\r\nfunction and used as Mahalanobis gating threshold.\r\n\"\"\"\r\n\r\nchi2inv95 = {\r\n    1: 3.8415,\r\n    2: 5.9915,\r\n    3: 7.8147,\r\n    4: 9.4877,\r\n    5: 11.070,\r\n    6: 12.592,\r\n    7: 14.067,\r\n    8: 15.507,\r\n    9: 16.919\r\n}\r\n\r\n\r\nclass KalmanFilter(object):\r\n    \"\"\"\r\n    A simple Kalman filter for tracking bounding boxes in image space.\r\n\r\n    The 8-dimensional state space\r\n\r\n        x, y, a, h, vx, vy, va, vh\r\n\r\n    contains the bounding box center position (x, y), aspect ratio a, height h,\r\n    and their respective velocities.\r\n\r\n    Object motion follows a constant velocity model. 
The bounding box location\r\n    (x, y, a, h) is taken as direct observation of the state space (linear\r\n    observation model).\r\n\r\n    \"\"\"\r\n\r\n    def __init__(self):\r\n        ndim, dt = 4, 1.\r\n\r\n        # Create Kalman filter model matrices.\r\n        self._motion_mat = np.eye(2 * ndim, 2 * ndim, dtype=np.float32)\r\n        for i in range(ndim):\r\n            self._motion_mat[i, ndim + i] = dt\r\n        self._update_mat = np.eye(ndim, 2 * ndim, dtype=np.float32)\r\n\r\n        # Motion and observation uncertainty are chosen relative to the current\r\n        # state estimate. These weights control the amount of uncertainty in\r\n        # the model. This is a bit hacky.\r\n        self._std_weight_position = 1. / 20\r\n        self._std_weight_velocity = 1. / 160\r\n\r\n    def initiate(self, measurement):\r\n        \"\"\"\r\n        Create track from unassociated measurement.\r\n\r\n        Args:\r\n            measurement (ndarray): Bounding box coordinates (x, y, a, h) with\r\n                center position (x, y), aspect ratio a, and height h.\r\n\r\n        Returns:\r\n            The mean vector (8 dimensional) and covariance matrix (8x8\r\n            dimensional) of the new track. Unobserved velocities are \r\n            initialized to 0 mean.\r\n        \"\"\"\r\n        mean_pos = measurement\r\n        mean_vel = np.zeros_like(mean_pos)\r\n        mean = np.r_[mean_pos, mean_vel]\r\n\r\n        std = [\r\n            2 * self._std_weight_position * measurement[3],\r\n            2 * self._std_weight_position * measurement[3], 1e-2,\r\n            2 * self._std_weight_position * measurement[3],\r\n            10 * self._std_weight_velocity * measurement[3],\r\n            10 * self._std_weight_velocity * measurement[3], 1e-5,\r\n            10 * self._std_weight_velocity * measurement[3]\r\n        ]\r\n        covariance = np.diag(np.square(std))\r\n        return mean, np.float32(covariance)\r\n\r\n    def predict(self, mean, covariance):\r\n        \"\"\"\r\n        Run Kalman filter prediction step.\r\n\r\n        Args:\r\n            mean (ndarray): The 8 dimensional mean vector of the object state\r\n                at the previous time step.\r\n            covariance (ndarray): The 8x8 dimensional covariance matrix of the\r\n                object state at the previous time step.\r\n\r\n        Returns:\r\n            The mean vector and covariance matrix of the predicted state. 
\r\n            Unobserved velocities are initialized to 0 mean.\r\n        \"\"\"\r\n        std_pos = [\r\n            self._std_weight_position * mean[3], self._std_weight_position *\r\n            mean[3], 1e-2, self._std_weight_position * mean[3]\r\n        ]\r\n        std_vel = [\r\n            self._std_weight_velocity * mean[3], self._std_weight_velocity *\r\n            mean[3], 1e-5, self._std_weight_velocity * mean[3]\r\n        ]\r\n        motion_cov = np.diag(np.square(np.r_[std_pos, std_vel]))\r\n\r\n        #mean = np.dot(self._motion_mat, mean)\r\n        mean = np.dot(mean, self._motion_mat.T)\r\n        covariance = np.linalg.multi_dot(\r\n            (self._motion_mat, covariance, self._motion_mat.T)) + motion_cov\r\n\r\n        return mean, covariance\r\n\r\n    def project(self, mean, covariance):\r\n        \"\"\"\r\n        Project state distribution to measurement space.\r\n\r\n        Args:\r\n            mean (ndarray): The state's mean vector (8 dimensional array).\r\n            covariance (ndarray): The state's covariance matrix (8x8 dimensional).\r\n\r\n        Returns:\r\n            The projected mean and covariance matrix of the given state estimate.\r\n        \"\"\"\r\n        std = np.array(\r\n            [\r\n                self._std_weight_position * mean[3], self._std_weight_position *\r\n                mean[3], 1e-1, self._std_weight_position * mean[3]\r\n            ],\r\n            dtype=np.float32)\r\n\r\n        if use_numba:\r\n            return nb_project(mean, covariance, std, self._update_mat)\r\n\r\n        innovation_cov = np.diag(np.square(std))\r\n\r\n        mean = np.dot(self._update_mat, mean)\r\n        covariance = np.linalg.multi_dot((self._update_mat, covariance,\r\n                                          self._update_mat.T))\r\n        return mean, covariance + innovation_cov\r\n\r\n    def multi_predict(self, mean, covariance):\r\n        \"\"\"\r\n        Run Kalman filter prediction step (Vectorized version).\r\n        \r\n        Args:\r\n            mean (ndarray): The Nx8 dimensional mean matrix of the object states\r\n                at the previous time step.\r\n            covariance (ndarray): The Nx8x8 dimensional covariance matrices of the\r\n                object states at the previous time step.\r\n\r\n        Returns:\r\n            The mean vector and covariance matrix of the predicted state.\r\n            Unobserved velocities are initialized to 0 mean.\r\n        \"\"\"\r\n        std_pos = np.array([\r\n            self._std_weight_position * mean[:, 3], self._std_weight_position *\r\n            mean[:, 3], 1e-2 * np.ones_like(mean[:, 3]),\r\n            self._std_weight_position * mean[:, 3]\r\n        ])\r\n        std_vel = np.array([\r\n            self._std_weight_velocity * mean[:, 3], self._std_weight_velocity *\r\n            mean[:, 3], 1e-5 * np.ones_like(mean[:, 3]),\r\n            self._std_weight_velocity * mean[:, 3]\r\n        ])\r\n        sqr = np.square(np.r_[std_pos, std_vel]).T\r\n\r\n        if use_numba:\r\n\r\n            means = []\r\n            covariances = []\r\n            for i in range(len(mean)):\r\n                a, b = nb_multi_predict(mean[i], covariance[i],\r\n                                        np.diag(sqr[i]), self._motion_mat)\r\n                means.append(a)\r\n                covariances.append(b)\r\n            return np.asarray(means), np.asarray(covariances)\r\n\r\n        motion_cov = []\r\n        for i in range(len(mean)):\r\n            
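# per-track 8x8 motion covariance from the squared stds\r\n            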
motion_cov.append(np.diag(sqr[i]))\r\n        motion_cov = np.asarray(motion_cov)\r\n\r\n        mean = np.dot(mean, self._motion_mat.T)\r\n        left = np.dot(self._motion_mat, covariance).transpose((1, 0, 2))\r\n        covariance = np.dot(left, self._motion_mat.T) + motion_cov\r\n\r\n        return mean, covariance\r\n\r\n    def update(self, mean, covariance, measurement):\r\n        \"\"\"\r\n        Run Kalman filter correction step.\r\n\r\n        Args:\r\n            mean (ndarray): The predicted state's mean vector (8 dimensional).\r\n            covariance (ndarray): The state's covariance matrix (8x8 dimensional).\r\n            measurement (ndarray): The 4 dimensional measurement vector\r\n                (x, y, a, h), where (x, y) is the center position, a the aspect\r\n                ratio, and h the height of the bounding box.\r\n\r\n        Returns:\r\n            The measurement-corrected state distribution.\r\n        \"\"\"\r\n        projected_mean, projected_cov = self.project(mean, covariance)\r\n\r\n        if use_numba:\r\n\r\n            return nb_update(mean, covariance, projected_mean, projected_cov,\r\n                             measurement, self._update_mat)\r\n\r\n        kalman_gain = np.linalg.solve(projected_cov,\r\n                                      (covariance @self._update_mat.T).T).T\r\n        innovation = measurement - projected_mean\r\n        mean = mean + innovation @kalman_gain.T\r\n        covariance = covariance - kalman_gain @projected_cov @kalman_gain.T\r\n        return mean, covariance\r\n\r\n    def gating_distance(self,\r\n                        mean,\r\n                        covariance,\r\n                        measurements,\r\n                        only_position=False,\r\n                        metric='maha'):\r\n        \"\"\"\r\n        Compute gating distance between state distribution and measurements.\r\n        A suitable distance threshold can be obtained from `chi2inv95`. 
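For the full (x, y, a, h) state\r\n        that is `chi2inv95[4] = 9.4877`. 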
If\r\n        `only_position` is False, the chi-square distribution has 4 degrees of\r\n        freedom, otherwise 2.\r\n        \r\n        Args:\r\n            mean (ndarray): Mean vector over the state distribution (8\r\n                dimensional).\r\n            covariance (ndarray): Covariance of the state distribution (8x8\r\n                dimensional).\r\n            measurements (ndarray): An Nx4 dimensional matrix of N measurements,\r\n                each in format (x, y, a, h) where (x, y) is the bounding box center\r\n                position, a the aspect ratio, and h the height.\r\n            only_position (Optional[bool]): If True, distance computation is \r\n                done with respect to the bounding box center position only.\r\n            metric (str): Metric type, 'gaussian' or 'maha'.\r\n\r\n        Returns:\r\n            An array of length N, where the i-th element contains the squared\r\n            Mahalanobis distance between (mean, covariance) and `measurements[i]`.\r\n        \"\"\"\r\n        mean, covariance = self.project(mean, covariance)\r\n        if only_position:\r\n            mean, covariance = mean[:2], covariance[:2, :2]\r\n            measurements = measurements[:, :2]\r\n\r\n        d = measurements - mean\r\n        if metric == 'gaussian':\r\n            return np.sum(d * d, axis=1)\r\n        elif metric == 'maha':\r\n            cholesky_factor = np.linalg.cholesky(covariance)\r\n            z = scipy.linalg.solve_triangular(\r\n                cholesky_factor,\r\n                d.T,\r\n                lower=True,\r\n                check_finite=False,\r\n                overwrite_b=True)\r\n            squared_maha = np.sum(z * z, axis=0)\r\n            return squared_maha\r\n        else:\r\n            raise ValueError('invalid distance metric')\r\n"
  },
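  {
    "path": "examples/mot_kalman_gating_demo.py",
    "content": "# Hypothetical usage sketch, not part of the upstream repo: pairs\n# KalmanFilter.gating_distance with the chi2inv95 table its docstring points\n# to, in order to gate track-detection associations. The file path and all\n# numbers here are illustrative assumptions; run inside a PaddleDetection\n# checkout.\nimport numpy as np\n\nfrom ppdet.modeling.mot.motion.kalman_filter import KalmanFilter, chi2inv95\n\nkf = KalmanFilter()\n# Start a track from one (x, y, a, h) measurement and predict one step.\nmean, cov = kf.initiate(np.array([100., 200., 0.5, 80.]))\nmean, cov = kf.predict(mean, cov)\n# Two candidates: one near the predicted state, one far away.\nmeasurements = np.array([[102., 198., 0.5, 81.],\n                         [400., 50., 0.5, 60.]])\nd2 = kf.gating_distance(mean, cov, measurements)  # squared Mahalanobis\n# 4 degrees of freedom since only_position=False; should print [True, False].\nprint(d2 <= chi2inv95[4])\n"
  },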
  {
    "path": "ppdet/modeling/mot/motion/ocsort_kalman_filter.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nThis code is based on https://github.com/danbochman/SORT/blob/danny_opencv/kalman_filter.py\n\"\"\"\n\nimport numpy as np\nfrom numpy import dot, zeros, eye\nfrom numpy.linalg import inv\n\nuse_numba = True\ntry:\n    import numba as nb\n\n    @nb.njit(fastmath=True, cache=True)\n    def nb_predict(x, F, P, Q):\n        x = dot(F, x)\n        P = dot(dot(F, P), F.T) + Q\n        return x, P\n\n    @nb.njit(fastmath=True, cache=True)\n    def nb_update(x, z, H, P, R, _I):\n\n        y = z - np.dot(H, x)\n        PHT = dot(P, H.T)\n\n        S = dot(H, PHT) + R\n        K = dot(PHT, inv(S))\n\n        x = x + dot(K, y)\n\n        I_KH = _I - dot(K, H)\n        P = dot(dot(I_KH, P), I_KH.T) + dot(dot(K, R), K.T)\n        return x, P\nexcept:\n    use_numba = False\n    print(\n        'Warning: Unable to use numba in PP-Tracking, please install numba, for example(python3.7): `pip install numba==0.56.4`'\n    )\n    pass\n\n\nclass OCSORTKalmanFilter:\n    def __init__(self, dim_x, dim_z):\n        self.dim_x = dim_x\n        self.dim_z = dim_z\n        self.x = zeros((dim_x, 1))\n        self.P = eye(dim_x)\n        self.Q = eye(dim_x)\n        self.F = eye(dim_x)\n        self.H = zeros((dim_z, dim_x))\n        self.R = eye(dim_z)\n        self.M = zeros((dim_z, dim_z))\n\n        self._I = eye(dim_x)\n\n    def predict(self):\n        if use_numba:\n            self.x, self.P = nb_predict(self.x, self.F, self.P, self.Q)\n        else:\n            self.x = dot(self.F, self.x)\n            self.P = dot(dot(self.F, self.P), self.F.T) + self.Q\n\n    def update(self, z):\n\n        if z is None:\n            return\n\n        if use_numba:\n            self.x, self.P = nb_update(self.x, z, self.H, self.P, self.R,\n                                       self._I)\n        else:\n            y = z - np.dot(self.H, self.x)\n            PHT = dot(self.P, self.H.T)\n\n            S = dot(self.H, PHT) + self.R\n            K = dot(PHT, inv(S))\n\n            self.x = self.x + dot(K, y)\n\n            I_KH = self._I - dot(K, self.H)\n            self.P = dot(dot(I_KH, self.P), I_KH.T) + dot(dot(K, self.R), K.T)\n"
  },
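  {
    "path": "examples/ocsort_kalman_filter_demo.py",
    "content": "# Hypothetical usage sketch, not part of the upstream repo: drives\n# OCSORTKalmanFilter as a plain 1D constant-velocity filter with state\n# x = [position, velocity] and measurement z = [position]. The file path\n# and all numbers are made up; the class itself is generic in dim_x/dim_z.\nimport numpy as np\n\nfrom ppdet.modeling.mot.motion.ocsort_kalman_filter import OCSORTKalmanFilter\n\nkf = OCSORTKalmanFilter(dim_x=2, dim_z=1)\nkf.F = np.array([[1., 1.], [0., 1.]])  # x_{k+1} = x_k + v_k, v constant\nkf.H = np.array([[1., 0.]])            # we observe position only\nkf.R = kf.R * 0.5                      # modest measurement noise\nfor z in [1.0, 2.1, 2.9, 4.2]:         # a roughly unit-velocity target\n    kf.predict()\n    kf.update(np.array([[z]]))\n# Estimated [position, velocity]; should land near [4.2, 1.0].\nprint(kf.x.ravel())\n"
  },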
  {
    "path": "ppdet/modeling/mot/tracker/__init__.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom . import base_jde_tracker\nfrom . import base_sde_tracker\n\nfrom .base_jde_tracker import *\nfrom .base_sde_tracker import *\n\nfrom . import jde_tracker\nfrom . import deepsort_tracker\nfrom . import ocsort_tracker\nfrom . import center_tracker\n\nfrom .jde_tracker import *\nfrom .deepsort_tracker import *\nfrom .ocsort_tracker import *\nfrom .botsort_tracker import *\nfrom .center_tracker import *\n"
  },
  {
    "path": "ppdet/modeling/mot/tracker/base_jde_tracker.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nThis code is based on https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/multitracker.py\n\"\"\"\n\nimport numpy as np\nfrom collections import defaultdict\nfrom collections import deque, OrderedDict\nfrom ..matching import jde_matching as matching\nfrom ppdet.core.workspace import register, serializable\nimport warnings\nwarnings.filterwarnings(\"ignore\")\n\n__all__ = [\n    'TrackState',\n    'BaseTrack',\n    'STrack',\n    'joint_stracks',\n    'sub_stracks',\n    'remove_duplicate_stracks',\n]\n\n\nclass TrackState(object):\n    New = 0\n    Tracked = 1\n    Lost = 2\n    Removed = 3\n\n\n@register\n@serializable\nclass BaseTrack(object):\n    _count_dict = defaultdict(int)  # support single class and multi classes\n\n    track_id = 0\n    is_activated = False\n    state = TrackState.New\n\n    history = OrderedDict()\n    features = []\n    curr_feat = None\n    score = 0\n    start_frame = 0\n    frame_id = 0\n    time_since_update = 0\n\n    # multi-camera\n    location = (np.inf, np.inf)\n\n    @property\n    def end_frame(self):\n        return self.frame_id\n\n    @staticmethod\n    def next_id(cls_id):\n        BaseTrack._count_dict[cls_id] += 1\n        return BaseTrack._count_dict[cls_id]\n\n    # @even: reset track id\n    @staticmethod\n    def init_count(num_classes):\n        \"\"\"\n        Initiate _count for all object classes\n        :param num_classes:\n        \"\"\"\n        for cls_id in range(num_classes):\n            BaseTrack._count_dict[cls_id] = 0\n\n    @staticmethod\n    def reset_track_count(cls_id):\n        BaseTrack._count_dict[cls_id] = 0\n\n    def activate(self, *args):\n        raise NotImplementedError\n\n    def predict(self):\n        raise NotImplementedError\n\n    def update(self, *args, **kwargs):\n        raise NotImplementedError\n\n    def mark_lost(self):\n        self.state = TrackState.Lost\n\n    def mark_removed(self):\n        self.state = TrackState.Removed\n\n\n@register\n@serializable\nclass STrack(BaseTrack):\n    def __init__(self, tlwh, score, cls_id, buff_size=30, temp_feat=None):\n        # wait activate\n        self._tlwh = np.asarray(tlwh, dtype=np.float32)\n        self.score = score\n        self.cls_id = cls_id\n        self.track_len = 0\n\n        self.kalman_filter = None\n        self.mean, self.covariance = None, None\n        self.is_activated = False\n\n        self.use_reid = True if temp_feat is not None else False\n        if self.use_reid:\n            self.smooth_feat = None\n            self.update_features(temp_feat)\n            self.features = deque([], maxlen=buff_size)\n            self.alpha = 0.9\n\n    def update_features(self, feat):\n        # L2 normalizing, this function has no use for BYTETracker\n        feat /= np.linalg.norm(feat)\n        self.curr_feat = feat\n        if self.smooth_feat is None:\n            self.smooth_feat = 
feat\n        else:\n            self.smooth_feat = self.alpha * self.smooth_feat + (1.0 - self.alpha\n                                                                ) * feat\n        self.features.append(feat)\n        self.smooth_feat /= np.linalg.norm(self.smooth_feat)\n\n    def predict(self):\n        mean_state = self.mean.copy()\n        if self.state != TrackState.Tracked:\n            mean_state[7] = 0\n        self.mean, self.covariance = self.kalman_filter.predict(mean_state,\n                                                                self.covariance)\n\n    @staticmethod\n    def multi_predict(tracks, kalman_filter):\n        if len(tracks) > 0:\n            multi_mean = np.asarray([track.mean.copy() for track in tracks])\n            multi_covariance = np.asarray(\n                [track.covariance for track in tracks])\n            for i, st in enumerate(tracks):\n                if st.state != TrackState.Tracked:\n                    multi_mean[i][7] = 0\n            multi_mean, multi_covariance = kalman_filter.multi_predict(\n                multi_mean, multi_covariance)\n            for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)):\n                tracks[i].mean = mean\n                tracks[i].covariance = cov\n\n    @staticmethod\n    def multi_gmc(stracks, H=np.eye(2, 3)):\n        if len(stracks) > 0:\n            multi_mean = np.asarray([st.mean.copy() for st in stracks])\n            multi_covariance = np.asarray([st.covariance for st in stracks])\n\n            R = H[:2, :2]\n            R8x8 = np.kron(np.eye(4, dtype=float), R)\n            t = H[:2, 2]\n\n            for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)):\n                mean = R8x8.dot(mean)\n                mean[:2] += t\n                cov = R8x8.dot(cov).dot(R8x8.transpose())\n\n                stracks[i].mean = mean\n                stracks[i].covariance = cov\n\n    def reset_track_id(self):\n        self.reset_track_count(self.cls_id)\n\n    def activate(self, kalman_filter, frame_id):\n        \"\"\"Start a new track\"\"\"\n        self.kalman_filter = kalman_filter\n        # update track id for the object class\n        self.track_id = self.next_id(self.cls_id)\n        self.mean, self.covariance = self.kalman_filter.initiate(\n            self.tlwh_to_xyah(self._tlwh))\n\n        self.track_len = 0\n        self.state = TrackState.Tracked  # set flag 'tracked'\n\n        if frame_id == 1:  # to record the first frame's detection result\n            self.is_activated = True\n\n        self.frame_id = frame_id\n        self.start_frame = frame_id\n\n    def re_activate(self, new_track, frame_id, new_id=False):\n        self.mean, self.covariance = self.kalman_filter.update(\n            self.mean, self.covariance, self.tlwh_to_xyah(new_track.tlwh))\n        if self.use_reid:\n            self.update_features(new_track.curr_feat)\n        self.track_len = 0\n        self.state = TrackState.Tracked\n        self.is_activated = True\n        self.frame_id = frame_id\n        if new_id:  # update track id for the object class\n            self.track_id = self.next_id(self.cls_id)\n\n    def update(self, new_track, frame_id, update_feature=True):\n        self.frame_id = frame_id\n        self.track_len += 1\n\n        new_tlwh = new_track.tlwh\n        self.mean, self.covariance = self.kalman_filter.update(\n            self.mean, self.covariance, self.tlwh_to_xyah(new_tlwh))\n        self.state = TrackState.Tracked  # set flag 'tracked'\n        
self.is_activated = True  # set flag 'activated'\n\n        self.score = new_track.score\n        if update_feature and self.use_reid:\n            self.update_features(new_track.curr_feat)\n\n    @property\n    def tlwh(self):\n        \"\"\"Get current position in bounding box format `(top left x, top left y,\n                width, height)`.\n        \"\"\"\n        if self.mean is None:\n            return self._tlwh.copy()\n\n        ret = self.mean[:4].copy()\n        ret[2] *= ret[3]\n        ret[:2] -= ret[2:] / 2\n        return ret\n\n    @property\n    def tlbr(self):\n        \"\"\"Convert bounding box to format `(min x, min y, max x, max y)`, i.e.,\n        `(top left, bottom right)`.\n        \"\"\"\n        ret = self.tlwh.copy()\n        ret[2:] += ret[:2]\n        return ret\n\n    @staticmethod\n    def tlwh_to_xyah(tlwh):\n        \"\"\"Convert bounding box to format `(center x, center y, aspect ratio,\n        height)`, where the aspect ratio is `width / height`.\n        \"\"\"\n        ret = np.asarray(tlwh).copy()\n        ret[:2] += ret[2:] / 2\n        ret[2] /= ret[3]\n        return ret\n\n    def to_xyah(self):\n        return self.tlwh_to_xyah(self.tlwh)\n\n    @staticmethod\n    def tlbr_to_tlwh(tlbr):\n        ret = np.asarray(tlbr).copy()\n        ret[2:] -= ret[:2]\n        return ret\n\n    @staticmethod\n    def tlwh_to_tlbr(tlwh):\n        ret = np.asarray(tlwh).copy()\n        ret[2:] += ret[:2]\n        return ret\n\n    def __repr__(self):\n        return 'OT_({}-{})_({}-{})'.format(self.cls_id, self.track_id,\n                                           self.start_frame, self.end_frame)\n\n\ndef joint_stracks(tlista, tlistb):\n    exists = {}\n    res = []\n    for t in tlista:\n        exists[t.track_id] = 1\n        res.append(t)\n    for t in tlistb:\n        tid = t.track_id\n        if not exists.get(tid, 0):\n            exists[tid] = 1\n            res.append(t)\n    return res\n\n\ndef sub_stracks(tlista, tlistb):\n    stracks = {}\n    for t in tlista:\n        stracks[t.track_id] = t\n    for t in tlistb:\n        tid = t.track_id\n        if stracks.get(tid, 0):\n            del stracks[tid]\n    return list(stracks.values())\n\n\ndef remove_duplicate_stracks(stracksa, stracksb):\n    pdist = matching.iou_distance(stracksa, stracksb)\n    pairs = np.where(pdist < 0.15)\n    dupa, dupb = list(), list()\n    for p, q in zip(*pairs):\n        timep = stracksa[p].frame_id - stracksa[p].start_frame\n        timeq = stracksb[q].frame_id - stracksb[q].start_frame\n        if timep > timeq:\n            dupb.append(q)\n        else:\n            dupa.append(p)\n    resa = [t for i, t in enumerate(stracksa) if i not in dupa]\n    resb = [t for i, t in enumerate(stracksb) if i not in dupb]\n    return resa, resb\n"
  },
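  {
    "path": "examples/strack_box_format_demo.py",
    "content": "# Hypothetical usage sketch, not part of the upstream repo: exercises the\n# static box-format converters on STrack. The file path and box values are\n# made up; tlwh means (top-left x, top-left y, width, height).\nimport numpy as np\n\nfrom ppdet.modeling.mot.tracker.base_jde_tracker import STrack\n\ntlwh = np.array([10., 20., 30., 60.])\nxyah = STrack.tlwh_to_xyah(tlwh)  # (center x, center y, w/h, height)\ntlbr = STrack.tlwh_to_tlbr(tlwh)  # (min x, min y, max x, max y)\nassert np.allclose(xyah, [25., 50., 0.5, 60.])\nassert np.allclose(tlbr, [10., 20., 40., 80.])\nassert np.allclose(STrack.tlbr_to_tlwh(tlbr), tlwh)  # round-trip\n"
  },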
  {
    "path": "ppdet/modeling/mot/tracker/base_sde_tracker.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nThis code is based on https://github.com/nwojke/deep_sort/blob/master/deep_sort/track.py\n\"\"\"\n\nimport datetime\nfrom ppdet.core.workspace import register, serializable\n\n__all__ = ['TrackState', 'Track']\n\n\nclass TrackState(object):\n    \"\"\"\n    Enumeration type for the single target track state. Newly created tracks are\n    classified as `tentative` until enough evidence has been collected. Then,\n    the track state is changed to `confirmed`. Tracks that are no longer alive\n    are classified as `deleted` to mark them for removal from the set of active\n    tracks.\n    \"\"\"\n    Tentative = 1\n    Confirmed = 2\n    Deleted = 3\n\n\n@register\n@serializable\nclass Track(object):\n    \"\"\"\n    A single target track with state space `(x, y, a, h)` and associated\n    velocities, where `(x, y)` is the center of the bounding box, `a` is the\n    aspect ratio and `h` is the height.\n\n    Args:\n        mean (ndarray): Mean vector of the initial state distribution.\n        covariance (ndarray): Covariance matrix of the initial state distribution.\n        track_id (int): A unique track identifier.\n        n_init (int): Number of consecutive detections before the track is confirmed.\n            The track state is set to `Deleted` if a miss occurs within the first\n            `n_init` frames.\n        max_age (int): The maximum number of consecutive misses before the track\n            state is set to `Deleted`.\n        cls_id (int): The category id of the tracked box.\n        score (float): The confidence score of the tracked box.\n        feature (Optional[ndarray]): Feature vector of the detection this track\n            originates from. If not None, this feature is added to the `features` cache.\n\n    Attributes:\n        hits (int): Total number of measurement updates.\n        age (int): Total number of frames since first occurance.\n        time_since_update (int): Total number of frames since last measurement\n            update.\n        state (TrackState): The current track state.\n        features (List[ndarray]): A cache of features. 
On each measurement update,\n            the associated feature vector is added to this list.\n    \"\"\"\n\n    def __init__(self,\n                 mean,\n                 covariance,\n                 track_id,\n                 n_init,\n                 max_age,\n                 cls_id,\n                 score,\n                 feature=None):\n        self.mean = mean\n        self.covariance = covariance\n        self.track_id = track_id\n        self.hits = 1\n        self.age = 1\n        self.time_since_update = 0\n        self.cls_id = cls_id\n        self.score = score\n        self.start_time = datetime.datetime.now()\n\n        self.state = TrackState.Tentative\n        self.features = []\n        self.feat = feature\n        if feature is not None:\n            self.features.append(feature)\n\n        self._n_init = n_init\n        self._max_age = max_age\n\n    def to_tlwh(self):\n        \"\"\"Get position in format `(top left x, top left y, width, height)`.\"\"\"\n        ret = self.mean[:4].copy()\n        ret[2] *= ret[3]\n        ret[:2] -= ret[2:] / 2\n        return ret\n\n    def to_tlbr(self):\n        \"\"\"Get position in bounding box format `(min x, miny, max x, max y)`.\"\"\"\n        ret = self.to_tlwh()\n        ret[2:] = ret[:2] + ret[2:]\n        return ret\n\n    def predict(self, kalman_filter):\n        \"\"\"\n        Propagate the state distribution to the current time step using a Kalman\n        filter prediction step.\n        \"\"\"\n        self.mean, self.covariance = kalman_filter.predict(self.mean,\n                                                           self.covariance)\n        self.age += 1\n        self.time_since_update += 1\n\n    def update(self, kalman_filter, detection):\n        \"\"\"\n        Perform Kalman filter measurement update step and update the associated\n        detection feature cache.\n        \"\"\"\n        self.mean, self.covariance = kalman_filter.update(self.mean,\n                                                          self.covariance,\n                                                          detection.to_xyah())\n        self.features.append(detection.feature)\n        self.feat = detection.feature\n        self.cls_id = detection.cls_id\n        self.score = detection.score\n\n        self.hits += 1\n        self.time_since_update = 0\n        if self.state == TrackState.Tentative and self.hits >= self._n_init:\n            self.state = TrackState.Confirmed\n\n    def mark_missed(self):\n        \"\"\"Mark this track as missed (no association at the current time step).\n        \"\"\"\n        if self.state == TrackState.Tentative:\n            self.state = TrackState.Deleted\n        elif self.time_since_update > self._max_age:\n            self.state = TrackState.Deleted\n\n    def is_tentative(self):\n        \"\"\"Returns True if this track is tentative (unconfirmed).\"\"\"\n        return self.state == TrackState.Tentative\n\n    def is_confirmed(self):\n        \"\"\"Returns True if this track is confirmed.\"\"\"\n        return self.state == TrackState.Confirmed\n\n    def is_deleted(self):\n        \"\"\"Returns True if this track is dead and should be deleted.\"\"\"\n        return self.state == TrackState.Deleted\n"
  },
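  {
    "path": "examples/sde_track_lifecycle_demo.py",
    "content": "# Hypothetical usage sketch, not part of the upstream repo: walks one SDE\n# Track from Tentative to Confirmed with n_init=2. The file path and all\n# numbers are made up; the Detection signature follows deepsort_tracker.py.\nimport numpy as np\n\nfrom ppdet.modeling.mot.motion import KalmanFilter\nfrom ppdet.modeling.mot.tracker.base_sde_tracker import Track\nfrom ppdet.modeling.mot.utils import Detection\n\nkf = KalmanFilter()\ndet = Detection(np.array([10., 20., 30., 60.]), 0.9, np.ones(128), 0)\nmean, cov = kf.initiate(det.to_xyah())\ntrack = Track(mean, cov, track_id=1, n_init=2, max_age=30, cls_id=0, score=0.9)\nprint(track.is_tentative())  # True: not enough evidence yet\ntrack.predict(kf)\ntrack.update(kf, det)        # second hit reaches n_init\nprint(track.is_confirmed())  # True\n"
  },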
  {
    "path": "ppdet/modeling/mot/tracker/botsort_tracker.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nThis code is based on https://github.com/WWangYuHsiang/SMILEtrack/blob/main/BoT-SORT/tracker/bot_sort.py\n\"\"\"\n\nimport cv2\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom collections import deque\n\nfrom ..matching import jde_matching as matching\nfrom ..motion import GMC\nfrom .base_jde_tracker import TrackState, STrack\nfrom .base_jde_tracker import joint_stracks, sub_stracks, remove_duplicate_stracks\nfrom ..motion import KalmanFilter\n\nfrom ppdet.core.workspace import register, serializable\n\n\n@register\n@serializable\nclass BOTSORTTracker(object):\n    \"\"\"\n    BOTSORT tracker, support single class\n\n    Args:\n        track_high_thresh (float): threshold of detection high score\n        track_low_thresh (float): threshold of remove detection score\n        new_track_thresh (float): threshold of new track score\n        match_thresh (float): iou threshold for associate\n        track_buffer (int): tracking reserved frames,default 30\n        min_box_area (float): reserved min box\n        camera_motion (bool): Whether use camera motion, default False\n        cmc_method (str): camera motion method,defalut sparseOptFlow\n        frame_rate (int): fps buffer_size=int(frame_rate / 30.0 * track_buffer)\n    \"\"\"\n\n    def __init__(self,\n                 track_high_thresh=0.3,\n                 track_low_thresh=0.2,\n                 new_track_thresh=0.4,\n                 match_thresh=0.7,\n                 track_buffer=30,\n                 min_box_area=0,\n                 camera_motion=False,\n                 cmc_method='sparseOptFlow',\n                 frame_rate=30):\n\n        self.tracked_stracks = []  # type: list[STrack]\n        self.lost_stracks = []  # type: list[STrack]\n        self.removed_stracks = []  # type: list[STrack]\n\n        self.frame_id = 0\n\n        self.track_high_thresh = track_high_thresh\n        self.track_low_thresh = track_low_thresh\n        self.new_track_thresh = new_track_thresh\n        self.match_thresh = match_thresh\n        self.buffer_size = int(frame_rate / 30.0 * track_buffer)\n        self.max_time_lost = self.buffer_size\n        self.kalman_filter = KalmanFilter()\n        self.min_box_area = min_box_area\n\n        self.camera_motion = camera_motion\n        self.gmc = GMC(method=cmc_method)\n\n    def update(self, output_results, img=None):\n        self.frame_id += 1\n        activated_starcks = []\n        refind_stracks = []\n        lost_stracks = []\n        removed_stracks = []\n\n        if len(output_results):\n            bboxes = output_results[:, 2:6]\n            scores = output_results[:, 1]\n            classes = output_results[:, 0]\n\n            # Remove bad detections\n            lowest_inds = scores > self.track_low_thresh\n            bboxes = bboxes[lowest_inds]\n            scores = scores[lowest_inds]\n            classes = 
classes[lowest_inds]\n\n            # Find high threshold detections\n            remain_inds = scores > self.track_high_thresh\n            dets = bboxes[remain_inds]\n            scores_keep = scores[remain_inds]\n            classes_keep = classes[remain_inds]\n\n        else:\n            bboxes = []\n            scores = []\n            classes = []\n            dets = []\n            scores_keep = []\n            classes_keep = []\n\n        if len(dets) > 0:\n            '''Detections'''\n            detections = [\n                STrack(STrack.tlbr_to_tlwh(tlbr), s, c)\n                for (tlbr, s, c) in zip(dets, scores_keep, classes_keep)\n            ]\n        else:\n            detections = []\n        ''' Add newly detected tracklets to tracked_stracks'''\n        unconfirmed = []\n        tracked_stracks = []  # type: list[STrack]\n        for track in self.tracked_stracks:\n            if not track.is_activated:\n                unconfirmed.append(track)\n            else:\n                tracked_stracks.append(track)\n        ''' Step 2: First association, with high score detection boxes'''\n        strack_pool = joint_stracks(tracked_stracks, self.lost_stracks)\n\n        # Predict the current location with KF\n        STrack.multi_predict(strack_pool, self.kalman_filter)\n\n        # Fix camera motion\n        if self.camera_motion:\n            warp = self.gmc.apply(img[0], dets)\n            STrack.multi_gmc(strack_pool, warp)\n            STrack.multi_gmc(unconfirmed, warp)\n\n        # Associate with high score detection boxes\n        ious_dists = matching.iou_distance(strack_pool, detections)\n        matches, u_track, u_detection = matching.linear_assignment(\n            ious_dists, thresh=self.match_thresh)\n\n        for itracked, idet in matches:\n            track = strack_pool[itracked]\n            det = detections[idet]\n            if track.state == TrackState.Tracked:\n                track.update(detections[idet], self.frame_id)\n                activated_stracks.append(track)\n            else:\n                track.re_activate(det, self.frame_id, new_id=False)\n                refind_stracks.append(track)\n        ''' Step 3: Second association, with low score detection boxes'''\n        if len(scores):\n            inds_high = scores < self.track_high_thresh\n            inds_low = scores > self.track_low_thresh\n            inds_second = np.logical_and(inds_low, inds_high)\n            dets_second = bboxes[inds_second]\n            scores_second = scores[inds_second]\n            classes_second = classes[inds_second]\n        else:\n            dets_second = []\n            scores_second = []\n            classes_second = []\n\n        # associate the remaining tracks with the low score detections\n        if len(dets_second) > 0:\n            '''Detections'''\n            detections_second = [\n                STrack(STrack.tlbr_to_tlwh(tlbr), s, c) for (tlbr, s, c) in\n                zip(dets_second, scores_second, classes_second)\n            ]\n        else:\n            detections_second = []\n\n        r_tracked_stracks = [\n            strack_pool[i] for i in u_track\n            if strack_pool[i].state == TrackState.Tracked\n        ]\n        dists = matching.iou_distance(r_tracked_stracks, detections_second)\n        matches, u_track, u_detection_second = matching.linear_assignment(\n            dists, thresh=0.5)\n        for itracked, idet in matches:\n            track = r_tracked_stracks[itracked]\n            det = 
detections_second[idet]\n            if track.state == TrackState.Tracked:\n                track.update(det, self.frame_id)\n                activated_stracks.append(track)\n            else:\n                track.re_activate(det, self.frame_id, new_id=False)\n                refind_stracks.append(track)\n\n        for it in u_track:\n            track = r_tracked_stracks[it]\n            if not track.state == TrackState.Lost:\n                track.mark_lost()\n                lost_stracks.append(track)\n        '''Deal with unconfirmed tracks, usually tracks with only one beginning frame'''\n        detections = [detections[i] for i in u_detection]\n        dists = matching.iou_distance(unconfirmed, detections)\n\n        matches, u_unconfirmed, u_detection = matching.linear_assignment(\n            dists, thresh=0.7)\n        for itracked, idet in matches:\n            unconfirmed[itracked].update(detections[idet], self.frame_id)\n            activated_stracks.append(unconfirmed[itracked])\n        for it in u_unconfirmed:\n            track = unconfirmed[it]\n            track.mark_removed()\n            removed_stracks.append(track)\n        \"\"\" Step 4: Init new stracks\"\"\"\n        for inew in u_detection:\n            track = detections[inew]\n            if track.score < self.new_track_thresh:\n                continue\n\n            track.activate(self.kalman_filter, self.frame_id)\n            activated_stracks.append(track)\n        \"\"\" Step 5: Update state\"\"\"\n        for track in self.lost_stracks:\n            if self.frame_id - track.end_frame > self.max_time_lost:\n                track.mark_removed()\n                removed_stracks.append(track)\n        \"\"\" Merge \"\"\"\n        self.tracked_stracks = [\n            t for t in self.tracked_stracks if t.state == TrackState.Tracked\n        ]\n        self.tracked_stracks = joint_stracks(self.tracked_stracks,\n                                             activated_stracks)\n        self.tracked_stracks = joint_stracks(self.tracked_stracks,\n                                             refind_stracks)\n        self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks)\n        self.lost_stracks.extend(lost_stracks)\n        self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks)\n        self.removed_stracks.extend(removed_stracks)\n        self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks(\n            self.tracked_stracks, self.lost_stracks)\n\n        # output all tracked stracks, not only the activated ones\n        # output_stracks = [track for track in self.tracked_stracks if track.is_activated]\n        output_stracks = [track for track in self.tracked_stracks]\n\n        return output_stracks\n"
  },
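  {
    "path": "examples/botsort_tracker_demo.py",
    "content": "# Hypothetical usage sketch, not part of the upstream repo: feeds\n# BOTSORTTracker two frames of made-up detections in the\n# [cls_id, score, x0, y0, x1, y1] layout that update() slices.\nimport numpy as np\n\nfrom ppdet.modeling.mot.tracker.botsort_tracker import BOTSORTTracker\n\ntracker = BOTSORTTracker()\nframe1 = np.array([[0., 0.90, 100., 100., 150., 200.]])\nframe2 = np.array([[0., 0.85, 104., 102., 154., 202.]])  # same object, shifted\nfor dets in (frame1, frame2):\n    online = tracker.update(dets)\nfor t in online:\n    print(t.track_id, t.tlbr)  # the id stays stable across both frames\n"
  },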
  {
    "path": "ppdet/modeling/mot/tracker/center_tracker.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\r\n#\r\n# Licensed under the Apache License, Version 2.0 (the \"License\");\r\n# you may not use this file except in compliance with the License.\r\n# You may obtain a copy of the License at\r\n#\r\n#     http://www.apache.org/licenses/LICENSE-2.0\r\n#\r\n# Unless required by applicable law or agreed to in writing, software\r\n# distributed under the License is distributed on an \"AS IS\" BASIS,\r\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n# See the License for the specific language governing permissions and\r\n# limitations under the License.\r\n\"\"\"\r\nThis code is based on https://github.com/xingyizhou/CenterTrack/blob/master/src/lib/utils/tracker.py\r\n\"\"\"\r\n\r\nimport copy\r\nimport numpy as np\r\nimport sklearn\r\n\r\nfrom ppdet.core.workspace import register, serializable\r\nfrom ppdet.utils.logger import setup_logger\r\nlogger = setup_logger(__name__)\r\n\r\n__all__ = ['CenterTracker']\r\n\r\n\r\n@register\r\n@serializable\r\nclass CenterTracker(object):\r\n    __shared__ = ['num_classes']\r\n\r\n    def __init__(self,\r\n                 num_classes=1,\r\n                 min_box_area=0,\r\n                 vertical_ratio=-1,\r\n                 track_thresh=0.4,\r\n                 pre_thresh=0.5,\r\n                 new_thresh=0.4,\r\n                 out_thresh=0.4,\r\n                 hungarian=False):\r\n        self.num_classes = num_classes\r\n        self.min_box_area = min_box_area\r\n        self.vertical_ratio = vertical_ratio\r\n\r\n        self.track_thresh = track_thresh\r\n        self.pre_thresh = max(track_thresh, pre_thresh)\r\n        self.new_thresh = max(track_thresh, new_thresh)\r\n        self.out_thresh = max(track_thresh, out_thresh)\r\n        self.hungarian = hungarian\r\n\r\n        self.reset()\r\n\r\n    def init_track(self, results):\r\n        print('Initialize tracking!')\r\n        for item in results:\r\n            if item['score'] > self.new_thresh:\r\n                self.id_count += 1\r\n                item['tracking_id'] = self.id_count\r\n                if not ('ct' in item):\r\n                    bbox = item['bbox']\r\n                    item['ct'] = [(bbox[0] + bbox[2]) / 2,\r\n                                  (bbox[1] + bbox[3]) / 2]\r\n                self.tracks.append(item)\r\n\r\n    def reset(self):\r\n        self.id_count = 0\r\n        self.tracks = []\r\n\r\n    def update(self, results, public_det=None):\r\n        N = len(results)\r\n        M = len(self.tracks)\r\n\r\n        dets = np.array([det['ct'] + det['tracking'] for det in results],\r\n                        np.float32)  # N x 2\r\n        track_size = np.array([((track['bbox'][2] - track['bbox'][0]) * \\\r\n            (track['bbox'][3] - track['bbox'][1])) \\\r\n            for track in self.tracks], np.float32) # M\r\n        track_cat = np.array([track['class'] for track in self.tracks],\r\n                             np.int32)  # M\r\n        item_size = np.array([((item['bbox'][2] - item['bbox'][0]) * \\\r\n            (item['bbox'][3] - item['bbox'][1])) \\\r\n            for item in results], np.float32) # N\r\n        item_cat = np.array([item['class'] for item in results], np.int32)  # N\r\n        tracks = np.array([pre_det['ct'] for pre_det in self.tracks],\r\n                          np.float32)  # M x 2\r\n        dist = (((tracks.reshape(1, -1, 2) - \\\r\n            dets.reshape(-1, 1, 2)) ** 2).sum(axis=2)) # N x M\r\n\r\n  
      invalid = ((dist > track_size.reshape(1, M)) + \\\r\n            (dist > item_size.reshape(N, 1)) + \\\r\n            (item_cat.reshape(N, 1) != track_cat.reshape(1, M))) > 0\r\n        dist = dist + invalid * 1e18\r\n\r\n        if self.hungarian:\r\n            item_score = np.array([item['score'] for item in results],\r\n                                  np.float32)\r\n            dist[dist > 1e18] = 1e18\r\n            from sklearn.utils.linear_assignment_ import linear_assignment\r\n            matched_indices = linear_assignment(dist)\r\n        else:\r\n            matched_indices = greedy_assignment(copy.deepcopy(dist))\r\n\r\n        unmatched_dets = [d for d in range(dets.shape[0]) \\\r\n            if not (d in matched_indices[:, 0])]\r\n        unmatched_tracks = [d for d in range(tracks.shape[0]) \\\r\n            if not (d in matched_indices[:, 1])]\r\n\r\n        if self.hungarian:\r\n            matches = []\r\n            for m in matched_indices:\r\n                if dist[m[0], m[1]] > 1e16:\r\n                    unmatched_dets.append(m[0])\r\n                    unmatched_tracks.append(m[1])\r\n                else:\r\n                    matches.append(m)\r\n            matches = np.array(matches).reshape(-1, 2)\r\n        else:\r\n            matches = matched_indices\r\n\r\n        ret = []\r\n        for m in matches:\r\n            track = results[m[0]]\r\n            track['tracking_id'] = self.tracks[m[1]]['tracking_id']\r\n            ret.append(track)\r\n\r\n        # Private detection: create tracks for all un-matched detections\r\n        for i in unmatched_dets:\r\n            track = results[i]\r\n            if track['score'] > self.new_thresh:\r\n                self.id_count += 1\r\n                track['tracking_id'] = self.id_count\r\n                ret.append(track)\r\n\r\n        self.tracks = ret\r\n        return ret\r\n\r\n\r\ndef greedy_assignment(dist):\r\n    matched_indices = []\r\n    if dist.shape[1] == 0:\r\n        return np.array(matched_indices, np.int32).reshape(-1, 2)\r\n    for i in range(dist.shape[0]):\r\n        j = dist[i].argmin()\r\n        if dist[i][j] < 1e16:\r\n            dist[:, j] = 1e18\r\n            matched_indices.append([i, j])\r\n    return np.array(matched_indices, np.int32).reshape(-1, 2)\r\n"
  },
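  {
    "path": "examples/greedy_assignment_demo.py",
    "content": "# Hypothetical usage sketch, not part of the upstream repo: shows the\n# row-by-row greedy matching of greedy_assignment on a made-up cost matrix,\n# and where it differs from the optional Hungarian branch in update().\nimport numpy as np\n\nfrom ppdet.modeling.mot.tracker.center_tracker import greedy_assignment\n\ndist = np.array([[0.2, 0.3],\n                 [0.25, 5.0]])\n# Greedy: row 0 claims column 0 (0.2), forcing row 1 onto column 1 (5.0).\n# A Hungarian solver would instead pick the cheaper pairing 0.3 + 0.25.\nprint(greedy_assignment(dist.copy()))  # [[0 0], [1 1]]\n"
  },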
  {
    "path": "ppdet/modeling/mot/tracker/deepsort_tracker.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\r\n#\r\n# Licensed under the Apache License, Version 2.0 (the \"License\");\r\n# you may not use this file except in compliance with the License.\r\n# You may obtain a copy of the License at\r\n#\r\n#     http://www.apache.org/licenses/LICENSE-2.0\r\n#\r\n# Unless required by applicable law or agreed to in writing, software\r\n# distributed under the License is distributed on an \"AS IS\" BASIS,\r\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n# See the License for the specific language governing permissions and\r\n# limitations under the License.\r\n\"\"\"\r\nThis code is based on https://github.com/nwojke/deep_sort/blob/master/deep_sort/tracker.py\r\n\"\"\"\r\n\r\nimport numpy as np\r\n\r\nfrom ..motion import KalmanFilter\r\nfrom ..matching.deepsort_matching import NearestNeighborDistanceMetric\r\nfrom ..matching.deepsort_matching import iou_cost, min_cost_matching, matching_cascade, gate_cost_matrix\r\nfrom .base_sde_tracker import Track\r\nfrom ..utils import Detection\r\n\r\nfrom ppdet.core.workspace import register, serializable\r\nfrom ppdet.utils.logger import setup_logger\r\nlogger = setup_logger(__name__)\r\n\r\n__all__ = ['DeepSORTTracker']\r\n\r\n\r\n@register\r\n@serializable\r\nclass DeepSORTTracker(object):\r\n    \"\"\"\r\n    DeepSORT tracker\r\n\r\n    Args:\r\n        input_size (list): input feature map size to reid model, [h, w] format,\r\n            [64, 192] as default.\r\n        min_box_area (int): min box area to filter out low quality boxes\r\n        vertical_ratio (float): w/h, the vertical ratio of the bbox to filter\r\n            bad results, set 1.6 default for pedestrian tracking. If set <=0\r\n            means no need to filter bboxes.\r\n        budget (int): If not None, fix samples per class to at most this number.\r\n            Removes the oldest samples when the budget is reached.\r\n        max_age (int): maximum number of missed misses before a track is deleted\r\n        n_init (float): Number of frames that a track remains in initialization\r\n            phase. Number of consecutive detections before the track is confirmed. 
\r\n            The track state is set to `Deleted` if a miss occurs within the first \r\n            `n_init` frames.\r\n        metric_type (str): either \"euclidean\" or \"cosine\", the distance metric \r\n            used for measurement to track association.\r\n        matching_threshold (float): samples with larger distance are \r\n            considered an invalid match.\r\n        max_iou_distance (float): max iou distance threshold\r\n        motion (object): KalmanFilter instance\r\n    \"\"\"\r\n\r\n    def __init__(self,\r\n                 input_size=[64, 192],\r\n                 min_box_area=0,\r\n                 vertical_ratio=-1,\r\n                 budget=100,\r\n                 max_age=70,\r\n                 n_init=3,\r\n                 metric_type='cosine',\r\n                 matching_threshold=0.2,\r\n                 max_iou_distance=0.9,\r\n                 motion='KalmanFilter'):\r\n        self.input_size = input_size\r\n        self.min_box_area = min_box_area\r\n        self.vertical_ratio = vertical_ratio\r\n        self.max_age = max_age\r\n        self.n_init = n_init\r\n        self.metric = NearestNeighborDistanceMetric(metric_type,\r\n                                                    matching_threshold, budget)\r\n        self.max_iou_distance = max_iou_distance\r\n        if motion == 'KalmanFilter':\r\n            self.motion = KalmanFilter()\r\n\r\n        self.tracks = []\r\n        self._next_id = 1\r\n\r\n    def predict(self):\r\n        \"\"\"\r\n        Propagate track state distributions one time step forward.\r\n        This function should be called once every time step, before `update`.\r\n        \"\"\"\r\n        for track in self.tracks:\r\n            track.predict(self.motion)\r\n\r\n    def update(self, pred_dets, pred_embs):\r\n        \"\"\"\r\n        Perform measurement update and track management.\r\n        Args:\r\n            pred_dets (np.array): Detection results of the image, the shape is\r\n                [N, 6], means 'cls_id, score, x0, y0, x1, y1'.\r\n            pred_embs (np.array): Embedding results of the image, the shape is\r\n                [N, 128], usually pred_embs.shape[1] is a multiple of 128.\r\n        \"\"\"\r\n        pred_cls_ids = pred_dets[:, 0:1]\r\n        pred_scores = pred_dets[:, 1:2]\r\n        pred_xyxys = pred_dets[:, 2:6]\r\n        pred_tlwhs = np.concatenate((pred_xyxys[:, 0:2], pred_xyxys[:, 2:4] - pred_xyxys[:, 0:2] + 1), axis=1)\r\n\r\n        detections = [\r\n            Detection(tlwh, score, feat, cls_id)\r\n            for tlwh, score, feat, cls_id in zip(pred_tlwhs, pred_scores,\r\n                                                 pred_embs, pred_cls_ids)\r\n        ]\r\n\r\n        # Run matching cascade.\r\n        matches, unmatched_tracks, unmatched_detections = \\\r\n            self._match(detections)\r\n\r\n        # Update track set.\r\n        for track_idx, detection_idx in matches:\r\n            self.tracks[track_idx].update(self.motion,\r\n                                          detections[detection_idx])\r\n        for track_idx in unmatched_tracks:\r\n            self.tracks[track_idx].mark_missed()\r\n        for detection_idx in unmatched_detections:\r\n            self._initiate_track(detections[detection_idx])\r\n        self.tracks = [t for t in self.tracks if not t.is_deleted()]\r\n\r\n        # Update distance metric.\r\n        active_targets = [t.track_id for t in self.tracks if t.is_confirmed()]\r\n        features, targets = [], []\r\n        for 
track in self.tracks:\r\n            if not track.is_confirmed():\r\n                continue\r\n            features += track.features\r\n            targets += [track.track_id for _ in track.features]\r\n            track.features = []\r\n        self.metric.partial_fit(\r\n            np.asarray(features), np.asarray(targets), active_targets)\r\n        output_stracks = self.tracks\r\n        return output_stracks\r\n\r\n    def _match(self, detections):\r\n        def gated_metric(tracks, dets, track_indices, detection_indices):\r\n            features = np.array([dets[i].feature for i in detection_indices])\r\n            targets = np.array([tracks[i].track_id for i in track_indices])\r\n            cost_matrix = self.metric.distance(features, targets)\r\n            cost_matrix = gate_cost_matrix(self.motion, cost_matrix, tracks,\r\n                                           dets, track_indices,\r\n                                           detection_indices)\r\n            return cost_matrix\r\n\r\n        # Split track set into confirmed and unconfirmed tracks.\r\n        confirmed_tracks = [\r\n            i for i, t in enumerate(self.tracks) if t.is_confirmed()\r\n        ]\r\n        unconfirmed_tracks = [\r\n            i for i, t in enumerate(self.tracks) if not t.is_confirmed()\r\n        ]\r\n\r\n        # Associate confirmed tracks using appearance features.\r\n        matches_a, unmatched_tracks_a, unmatched_detections = \\\r\n            matching_cascade(\r\n                gated_metric, self.metric.matching_threshold, self.max_age,\r\n                self.tracks, detections, confirmed_tracks)\r\n\r\n        # Associate remaining tracks together with unconfirmed tracks using IOU.\r\n        iou_track_candidates = unconfirmed_tracks + [\r\n            k for k in unmatched_tracks_a\r\n            if self.tracks[k].time_since_update == 1\r\n        ]\r\n        unmatched_tracks_a = [\r\n            k for k in unmatched_tracks_a\r\n            if self.tracks[k].time_since_update != 1\r\n        ]\r\n        matches_b, unmatched_tracks_b, unmatched_detections = \\\r\n            min_cost_matching(\r\n                iou_cost, self.max_iou_distance, self.tracks,\r\n                detections, iou_track_candidates, unmatched_detections)\r\n\r\n        matches = matches_a + matches_b\r\n        unmatched_tracks = list(set(unmatched_tracks_a + unmatched_tracks_b))\r\n        return matches, unmatched_tracks, unmatched_detections\r\n\r\n    def _initiate_track(self, detection):\r\n        mean, covariance = self.motion.initiate(detection.to_xyah())\r\n        self.tracks.append(\r\n            Track(mean, covariance, self._next_id, self.n_init, self.max_age,\r\n                  detection.cls_id, detection.score, detection.feature))\r\n        self._next_id += 1\r\n"
  },
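  {
    "path": "examples/deepsort_tracker_demo.py",
    "content": "# Hypothetical usage sketch, not part of the upstream repo: one\n# predict/update cycle of DeepSORTTracker with a single made-up detection\n# ([cls_id, score, x0, y0, x1, y1]) and a dummy 128-d embedding.\nimport numpy as np\n\nfrom ppdet.modeling.mot.tracker.deepsort_tracker import DeepSORTTracker\n\ntracker = DeepSORTTracker()\npred_dets = np.array([[0., 0.9, 100., 100., 150., 200.]])\npred_embs = np.ones((1, 128), dtype=np.float32)\ntracker.predict()                       # no tracks yet, so a no-op\ntracks = tracker.update(pred_dets, pred_embs)\nprint(len(tracks), tracks[0].track_id)  # 1 1: one new tentative track\n"
  },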
  {
    "path": "ppdet/modeling/mot/tracker/jde_tracker.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\r\n#\r\n# Licensed under the Apache License, Version 2.0 (the \"License\");\r\n# you may not use this file except in compliance with the License.\r\n# You may obtain a copy of the License at\r\n#\r\n#     http://www.apache.org/licenses/LICENSE-2.0\r\n#\r\n# Unless required by applicable law or agreed to in writing, software\r\n# distributed under the License is distributed on an \"AS IS\" BASIS,\r\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n# See the License for the specific language governing permissions and\r\n# limitations under the License.\r\n\"\"\"\r\nThis code is based on https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/multitracker.py\r\n\"\"\"\r\n\r\nimport numpy as np\r\nfrom collections import defaultdict\r\n\r\nfrom ..matching import jde_matching as matching\r\nfrom ..motion import KalmanFilter\r\nfrom .base_jde_tracker import TrackState, STrack\r\nfrom .base_jde_tracker import joint_stracks, sub_stracks, remove_duplicate_stracks\r\n\r\nfrom ppdet.core.workspace import register, serializable\r\nfrom ppdet.utils.logger import setup_logger\r\nlogger = setup_logger(__name__)\r\n\r\n__all__ = ['JDETracker']\r\n\r\n\r\n@register\r\n@serializable\r\nclass JDETracker(object):\r\n    __shared__ = ['num_classes']\r\n    \"\"\"\r\n    JDE tracker, support single class and multi classes\r\n\r\n    Args:\r\n        use_byte (bool): Whether use ByteTracker, default False\r\n        num_classes (int): the number of classes\r\n        det_thresh (float): threshold of detection score\r\n        track_buffer (int): buffer for tracker\r\n        min_box_area (int): min box area to filter out low quality boxes\r\n        vertical_ratio (float): w/h, the vertical ratio of the bbox to filter\r\n            bad results. 
If set <= 0 means no need to filter bboxes，usually set\r\n            1.6 for pedestrian tracking.\r\n        tracked_thresh (float): linear assignment threshold of tracked \r\n            stracks and detections\r\n        r_tracked_thresh (float): linear assignment threshold of \r\n            tracked stracks and unmatched detections\r\n        unconfirmed_thresh (float): linear assignment threshold of \r\n            unconfirmed stracks and unmatched detections\r\n        conf_thres (float): confidence threshold for tracking, also used in\r\n            ByteTracker as higher confidence threshold\r\n        match_thres (float): linear assignment threshold of tracked \r\n            stracks and detections in ByteTracker\r\n        low_conf_thres (float): lower confidence threshold for tracking in\r\n            ByteTracker\r\n        input_size (list): input feature map size to reid model, [h, w] format,\r\n            [64, 192] as default.\r\n        motion (str): motion model, KalmanFilter as default\r\n        metric_type (str): either \"euclidean\" or \"cosine\", the distance metric \r\n            used for measurement to track association.\r\n    \"\"\"\r\n\r\n    def __init__(self,\r\n                 use_byte=False,\r\n                 num_classes=1,\r\n                 det_thresh=0.3,\r\n                 track_buffer=30,\r\n                 min_box_area=0,\r\n                 vertical_ratio=0,\r\n                 tracked_thresh=0.7,\r\n                 r_tracked_thresh=0.5,\r\n                 unconfirmed_thresh=0.7,\r\n                 conf_thres=0,\r\n                 match_thres=0.8,\r\n                 low_conf_thres=0.2,\r\n                 input_size=[64, 192],\r\n                 motion='KalmanFilter',\r\n                 metric_type='euclidean'):\r\n        self.use_byte = use_byte\r\n        self.num_classes = num_classes\r\n        self.det_thresh = det_thresh if not use_byte else conf_thres + 0.1\r\n        self.track_buffer = track_buffer\r\n        self.min_box_area = min_box_area\r\n        self.vertical_ratio = vertical_ratio\r\n\r\n        self.tracked_thresh = tracked_thresh\r\n        self.r_tracked_thresh = r_tracked_thresh\r\n        self.unconfirmed_thresh = unconfirmed_thresh\r\n        self.conf_thres = conf_thres\r\n        self.match_thres = match_thres\r\n        self.low_conf_thres = low_conf_thres\r\n\r\n        self.input_size = input_size\r\n        if motion == 'KalmanFilter':\r\n            self.motion = KalmanFilter()\r\n        self.metric_type = metric_type\r\n\r\n        self.frame_id = 0\r\n        self.tracked_tracks_dict = defaultdict(list)  # dict(list[STrack])\r\n        self.lost_tracks_dict = defaultdict(list)  # dict(list[STrack])\r\n        self.removed_tracks_dict = defaultdict(list)  # dict(list[STrack])\r\n\r\n        self.max_time_lost = 0\r\n        # max_time_lost will be calculated: int(frame_rate / 30.0 * track_buffer)\r\n\r\n    def update(self, pred_dets, pred_embs=None):\r\n        \"\"\"\r\n        Processes the image frame and finds bounding box(detections).\r\n        Associates the detection with corresponding tracklets and also handles\r\n            lost, removed, refound and active tracklets.\r\n\r\n        Args:\r\n            pred_dets (np.array): Detection results of the image, the shape is\r\n                [N, 6], means 'cls_id, score, x0, y0, x1, y1'.\r\n            pred_embs (np.array): Embedding results of the image, the shape is\r\n                [N, 128] or [N, 512].\r\n\r\n        Return:\r\n            
output_stracks_dict (dict(list)): The list contains information\r\n                regarding the online_tracklets for the received image tensor.\r\n        \"\"\"\r\n        self.frame_id += 1\r\n        if self.frame_id == 1:\r\n            STrack.init_count(self.num_classes)\r\n        activated_tracks_dict = defaultdict(list)\r\n        refined_tracks_dict = defaultdict(list)\r\n        lost_tracks_dict = defaultdict(list)\r\n        removed_tracks_dict = defaultdict(list)\r\n        output_tracks_dict = defaultdict(list)\r\n\r\n        pred_dets_dict = defaultdict(list)\r\n        pred_embs_dict = defaultdict(list)\r\n\r\n        # unify single and multi classes detection and embedding results\r\n        for cls_id in range(self.num_classes):\r\n            cls_idx = (pred_dets[:, 0:1] == cls_id).squeeze(-1)\r\n            pred_dets_dict[cls_id] = pred_dets[cls_idx]\r\n            if pred_embs is not None:\r\n                pred_embs_dict[cls_id] = pred_embs[cls_idx]\r\n            else:\r\n                pred_embs_dict[cls_id] = None\r\n\r\n        for cls_id in range(self.num_classes):\r\n            \"\"\" Step 1: Get detections by class\"\"\"\r\n            pred_dets_cls = pred_dets_dict[cls_id]\r\n            pred_embs_cls = pred_embs_dict[cls_id]\r\n            remain_inds = (pred_dets_cls[:, 1:2] > self.conf_thres).squeeze(-1)\r\n            if remain_inds.sum() > 0:\r\n                pred_dets_cls = pred_dets_cls[remain_inds]\r\n                if pred_embs_cls is None:\r\n                    # in original ByteTrack\r\n                    detections = [\r\n                        STrack(\r\n                            STrack.tlbr_to_tlwh(tlbrs[2:6]),\r\n                            tlbrs[1],\r\n                            cls_id,\r\n                            30,\r\n                            temp_feat=None) for tlbrs in pred_dets_cls\r\n                    ]\r\n                else:\r\n                    pred_embs_cls = pred_embs_cls[remain_inds]\r\n                    detections = [\r\n                        STrack(\r\n                            STrack.tlbr_to_tlwh(tlbrs[2:6]), tlbrs[1], cls_id,\r\n                            30, temp_feat) for (tlbrs, temp_feat) in\r\n                        zip(pred_dets_cls, pred_embs_cls)\r\n                    ]\r\n            else:\r\n                detections = []\r\n            ''' Add newly detected tracklets to tracked_stracks'''\r\n            unconfirmed_dict = defaultdict(list)\r\n            tracked_tracks_dict = defaultdict(list)\r\n            for track in self.tracked_tracks_dict[cls_id]:\r\n                if not track.is_activated:\r\n                    # previous tracks which are not active in the current frame are added in unconfirmed list\r\n                    unconfirmed_dict[cls_id].append(track)\r\n                else:\r\n                    # Active tracks are added to the local list 'tracked_stracks'\r\n                    tracked_tracks_dict[cls_id].append(track)\r\n            \"\"\" Step 2: First association, with embedding\"\"\"\r\n            # building tracking pool for the current frame\r\n            track_pool_dict = defaultdict(list)\r\n            track_pool_dict[cls_id] = joint_stracks(\r\n                tracked_tracks_dict[cls_id], self.lost_tracks_dict[cls_id])\r\n\r\n            # Predict the current location with KalmanFilter\r\n            STrack.multi_predict(track_pool_dict[cls_id], self.motion)\r\n\r\n            if pred_embs_cls is None:\r\n                # in original ByteTrack\r\n 
               dists = matching.iou_distance(track_pool_dict[cls_id],\r\n                                              detections)\r\n                matches, u_track, u_detection = matching.linear_assignment(\r\n                    dists, thresh=self.match_thres)  # not self.tracked_thresh\r\n            else:\r\n                dists = matching.embedding_distance(\r\n                    track_pool_dict[cls_id],\r\n                    detections,\r\n                    metric=self.metric_type)\r\n                dists = matching.fuse_motion(\r\n                    self.motion, dists, track_pool_dict[cls_id], detections)\r\n                matches, u_track, u_detection = matching.linear_assignment(\r\n                    dists, thresh=self.tracked_thresh)\r\n\r\n            for i_tracked, idet in matches:\r\n                # i_tracked is the id of the track and idet is the detection\r\n                track = track_pool_dict[cls_id][i_tracked]\r\n                det = detections[idet]\r\n                if track.state == TrackState.Tracked:\r\n                    # If the track is active, add the detection to the track\r\n                    track.update(detections[idet], self.frame_id)\r\n                    activated_tracks_dict[cls_id].append(track)\r\n                else:\r\n                    # We have obtained a detection from a track which is not active,\r\n                    # hence put the track in refind_stracks list\r\n                    track.re_activate(det, self.frame_id, new_id=False)\r\n                    refined_tracks_dict[cls_id].append(track)\r\n\r\n            # None of the steps below happen if there are no undetected tracks.\r\n            \"\"\" Step 3: Second association, with IOU\"\"\"\r\n            if self.use_byte:\r\n                inds_low = pred_dets_dict[cls_id][:, 1:2] > self.low_conf_thres\r\n                inds_high = pred_dets_dict[cls_id][:, 1:2] < self.conf_thres\r\n                inds_second = np.logical_and(inds_low, inds_high).squeeze(-1)\r\n                pred_dets_cls_second = pred_dets_dict[cls_id][inds_second]\r\n\r\n                # association the untrack to the low score detections\r\n                if len(pred_dets_cls_second) > 0:\r\n                    if pred_embs_dict[cls_id] is None:\r\n                        # in original ByteTrack\r\n                        detections_second = [\r\n                            STrack(\r\n                                STrack.tlbr_to_tlwh(tlbrs[2:6]),\r\n                                tlbrs[1],\r\n                                cls_id,\r\n                                30,\r\n                                temp_feat=None)\r\n                            for tlbrs in pred_dets_cls_second\r\n                        ]\r\n                    else:\r\n                        pred_embs_cls_second = pred_embs_dict[cls_id][\r\n                            inds_second]\r\n                        detections_second = [\r\n                            STrack(\r\n                                STrack.tlbr_to_tlwh(tlbrs[2:6]), tlbrs[1],\r\n                                cls_id, 30, temp_feat) for (tlbrs, temp_feat) in\r\n                            zip(pred_dets_cls_second, pred_embs_cls_second)\r\n                        ]\r\n                else:\r\n                    detections_second = []\r\n                r_tracked_stracks = [\r\n                    track_pool_dict[cls_id][i] for i in u_track\r\n                    if track_pool_dict[cls_id][i].state == TrackState.Tracked\r\n                
]\r\n                dists = matching.iou_distance(r_tracked_stracks,\r\n                                              detections_second)\r\n                matches, u_track, u_detection_second = matching.linear_assignment(\r\n                    dists, thresh=0.4)  # not r_tracked_thresh\r\n            else:\r\n                detections = [detections[i] for i in u_detection]\r\n                r_tracked_stracks = []\r\n                for i in u_track:\r\n                    if track_pool_dict[cls_id][i].state == TrackState.Tracked:\r\n                        r_tracked_stracks.append(track_pool_dict[cls_id][i])\r\n                dists = matching.iou_distance(r_tracked_stracks, detections)\r\n\r\n                matches, u_track, u_detection = matching.linear_assignment(\r\n                    dists, thresh=self.r_tracked_thresh)\r\n\r\n            for i_tracked, idet in matches:\r\n                track = r_tracked_stracks[i_tracked]\r\n                det = detections[\r\n                    idet] if not self.use_byte else detections_second[idet]\r\n                if track.state == TrackState.Tracked:\r\n                    track.update(det, self.frame_id)\r\n                    activated_tracks_dict[cls_id].append(track)\r\n                else:\r\n                    track.re_activate(det, self.frame_id, new_id=False)\r\n                    refined_tracks_dict[cls_id].append(track)\r\n\r\n            for it in u_track:\r\n                track = r_tracked_stracks[it]\r\n                if not track.state == TrackState.Lost:\r\n                    track.mark_lost()\r\n                    lost_tracks_dict[cls_id].append(track)\r\n            '''Deal with unconfirmed tracks, usually tracks with only one beginning frame'''\r\n            detections = [detections[i] for i in u_detection]\r\n            dists = matching.iou_distance(unconfirmed_dict[cls_id], detections)\r\n            matches, u_unconfirmed, u_detection = matching.linear_assignment(\r\n                dists, thresh=self.unconfirmed_thresh)\r\n            for i_tracked, idet in matches:\r\n                unconfirmed_dict[cls_id][i_tracked].update(detections[idet],\r\n                                                           self.frame_id)\r\n                activated_tracks_dict[cls_id].append(unconfirmed_dict[cls_id][\r\n                    i_tracked])\r\n            for it in u_unconfirmed:\r\n                track = unconfirmed_dict[cls_id][it]\r\n                track.mark_removed()\r\n                removed_tracks_dict[cls_id].append(track)\r\n            \"\"\" Step 4: Init new stracks\"\"\"\r\n            for inew in u_detection:\r\n                track = detections[inew]\r\n                if track.score < self.det_thresh:\r\n                    continue\r\n                track.activate(self.motion, self.frame_id)\r\n                activated_tracks_dict[cls_id].append(track)\r\n            \"\"\" Step 5: Update state\"\"\"\r\n            for track in self.lost_tracks_dict[cls_id]:\r\n                if self.frame_id - track.end_frame > self.max_time_lost:\r\n                    track.mark_removed()\r\n                    removed_tracks_dict[cls_id].append(track)\r\n\r\n            self.tracked_tracks_dict[cls_id] = [\r\n                t for t in self.tracked_tracks_dict[cls_id]\r\n                if t.state == TrackState.Tracked\r\n            ]\r\n            self.tracked_tracks_dict[cls_id] = joint_stracks(\r\n                self.tracked_tracks_dict[cls_id], activated_tracks_dict[cls_id])\r\n            
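# also merge the tracks that were re-activated (refound) in this frame\r\n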
            self.tracked_tracks_dict[cls_id] = joint_stracks(\r\n                self.tracked_tracks_dict[cls_id], refined_tracks_dict[cls_id])\r\n            self.lost_tracks_dict[cls_id] = sub_stracks(\r\n                self.lost_tracks_dict[cls_id], self.tracked_tracks_dict[cls_id])\r\n            self.lost_tracks_dict[cls_id].extend(lost_tracks_dict[cls_id])\r\n            self.lost_tracks_dict[cls_id] = sub_stracks(\r\n                self.lost_tracks_dict[cls_id], self.removed_tracks_dict[cls_id])\r\n            self.removed_tracks_dict[cls_id].extend(removed_tracks_dict[cls_id])\r\n            self.tracked_tracks_dict[cls_id], self.lost_tracks_dict[\r\n                cls_id] = remove_duplicate_stracks(\r\n                    self.tracked_tracks_dict[cls_id],\r\n                    self.lost_tracks_dict[cls_id])\r\n\r\n            # collect the activated tracks as the output of this frame\r\n            output_tracks_dict[cls_id] = [\r\n                track for track in self.tracked_tracks_dict[cls_id]\r\n                if track.is_activated\r\n            ]\r\n\r\n            logger.debug('===========Frame {}=========='.format(self.frame_id))\r\n            logger.debug('Activated: {}'.format(\r\n                [track.track_id for track in activated_tracks_dict[cls_id]]))\r\n            logger.debug('Refind: {}'.format(\r\n                [track.track_id for track in refined_tracks_dict[cls_id]]))\r\n            logger.debug('Lost: {}'.format(\r\n                [track.track_id for track in lost_tracks_dict[cls_id]]))\r\n            logger.debug('Removed: {}'.format(\r\n                [track.track_id for track in removed_tracks_dict[cls_id]]))\r\n\r\n        return output_tracks_dict\r\n"
  },
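  {
    "path": "examples/mot/split_dets_by_class_demo.py",
    "content": "# Editor's note: an illustrative, self-contained sketch, not part of the\n# original repo; the file path is hypothetical and the detection values are\n# made up. It mirrors how the tracker's update() above unifies single- and\n# multi-class results: detections arrive as one [cls_id, score, x0, y0, x1, y1]\n# array and are split per class before any matching happens.\nfrom collections import defaultdict\n\nimport numpy as np\n\nnum_classes = 2\npred_dets = np.array([\n    [0, 0.9, 10, 10, 50, 80],    # class 0, high score\n    [0, 0.4, 12, 11, 52, 82],    # class 0, low score (kept for BYTE matching)\n    [1, 0.8, 100, 40, 160, 90],  # class 1\n])\npred_dets_dict = defaultdict(list)\nfor cls_id in range(num_classes):\n    cls_idx = (pred_dets[:, 0:1] == cls_id).squeeze(-1)\n    pred_dets_dict[cls_id] = pred_dets[cls_idx]\nprint(pred_dets_dict[0].shape, pred_dets_dict[1].shape)  # (2, 6) (1, 6)\n"
  },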
  {
    "path": "ppdet/modeling/mot/tracker/ocsort_tracker.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nThis code is based on https://github.com/noahcao/OC_SORT/blob/master/trackers/ocsort_tracker/ocsort.py\n\"\"\"\n\nimport numpy as np\nfrom ..matching.ocsort_matching import associate, linear_assignment, iou_batch, associate_only_iou\nfrom ..motion.ocsort_kalman_filter import OCSORTKalmanFilter\nfrom ppdet.core.workspace import register, serializable\n\n\ndef k_previous_obs(observations, cur_age, k):\n    if len(observations) == 0:\n        return [-1, -1, -1, -1, -1]\n    for i in range(k):\n        dt = k - i\n        if cur_age - dt in observations:\n            return observations[cur_age - dt]\n    max_age = max(observations.keys())\n    return observations[max_age]\n\n\ndef convert_bbox_to_z(bbox):\n    \"\"\"\n    Takes a bounding box in the form [x1,y1,x2,y2] and returns z in the form\n      [x,y,s,r] where x,y is the centre of the box and s is the scale/area and r is\n      the aspect ratio\n    \"\"\"\n    w = bbox[2] - bbox[0]\n    h = bbox[3] - bbox[1]\n    x = bbox[0] + w / 2.\n    y = bbox[1] + h / 2.\n    s = w * h  # scale is just area\n    r = w / float(h + 1e-6)\n    return np.array([x, y, s, r]).reshape((4, 1))\n\n\ndef convert_x_to_bbox(x, score=None):\n    \"\"\"\n    Takes a bounding box in the centre form [x,y,s,r] and returns it in the form\n      [x1,y1,x2,y2] where x1,y1 is the top left and x2,y2 is the bottom right\n    \"\"\"\n    w = np.sqrt(x[2] * x[3])\n    h = x[2] / w\n    if (score == None):\n        return np.array(\n            [x[0] - w / 2., x[1] - h / 2., x[0] + w / 2.,\n             x[1] + h / 2.]).reshape((1, 4))\n    else:\n        score = np.array([score])\n        return np.array([\n            x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., x[1] + h / 2., score\n        ]).reshape((1, 5))\n\n\ndef speed_direction(bbox1, bbox2):\n    cx1, cy1 = (bbox1[0] + bbox1[2]) / 2.0, (bbox1[1] + bbox1[3]) / 2.0\n    cx2, cy2 = (bbox2[0] + bbox2[2]) / 2.0, (bbox2[1] + bbox2[3]) / 2.0\n    speed = np.array([cy2 - cy1, cx2 - cx1])\n    norm = np.sqrt((cy2 - cy1)**2 + (cx2 - cx1)**2) + 1e-6\n    return speed / norm\n\n\nclass KalmanBoxTracker(object):\n    \"\"\"\n    This class represents the internal state of individual tracked objects observed as bbox.\n\n    Args:\n        bbox (np.array): bbox in [x1,y1,x2,y2,score] format.\n        delta_t (int): delta_t of previous observation\n    \"\"\"\n    count = 0\n\n    def __init__(self, bbox, delta_t=3):\n\n        self.kf = OCSORTKalmanFilter(dim_x=7, dim_z=4)\n        self.kf.F = np.array([[1., 0, 0, 0, 1., 0, 0], [0, 1., 0, 0, 0, 1., 0],\n                              [0, 0, 1., 0, 0, 0, 1], [0, 0, 0, 1., 0, 0, 0],\n                              [0, 0, 0, 0, 1., 0, 0], [0, 0, 0, 0, 0, 1., 0],\n                              [0, 0, 0, 0, 0, 0, 1.]])\n        self.kf.H = np.array([[1., 0, 0, 0, 0, 0, 0], [0, 1., 0, 0, 0, 0, 0],\n                              [0, 0, 1., 0, 0, 0, 
0], [0, 0, 0, 1., 0, 0, 0]])\n        self.kf.R[2:, 2:] *= 10.\n        self.kf.P[4:, 4:] *= 1000.\n        # give high uncertainty to the unobservable initial velocities\n        self.kf.P *= 10.\n        self.kf.Q[-1, -1] *= 0.01\n        self.kf.Q[4:, 4:] *= 0.01\n\n        self.score = bbox[4]\n        self.kf.x[:4] = convert_bbox_to_z(bbox)\n        self.time_since_update = 0\n        self.id = KalmanBoxTracker.count\n        KalmanBoxTracker.count += 1\n        self.history = []\n        self.hits = 0\n        self.hit_streak = 0\n        self.age = 0\n        \"\"\"\n        NOTE: [-1,-1,-1,-1,-1] is a compromising placeholder for non-observation status, the same for the return of \n        function k_previous_obs. It is ugly and I do not like it. But to support generating the observation array in a \n        fast and unified way (see k_observations = np.array([k_previous_obs(...)]) below), let's bear it for now.\n        \"\"\"\n        self.last_observation = np.array([-1, -1, -1, -1, -1])  # placeholder\n        self.observations = dict()\n        self.history_observations = []\n        self.velocity = None\n        self.delta_t = delta_t\n\n    def update(self, bbox, angle_cost=False):\n        \"\"\"\n        Updates the state vector with observed bbox.\n        \"\"\"\n        if bbox is not None:\n            if angle_cost and self.last_observation.sum(\n            ) >= 0:  # a previous observation exists\n                previous_box = None\n                for i in range(self.delta_t):\n                    dt = self.delta_t - i\n                    if self.age - dt in self.observations:\n                        previous_box = self.observations[self.age - dt]\n                        break\n                if previous_box is None:\n                    previous_box = self.last_observation\n                \"\"\"\n                  Estimate the track speed direction with observations \\Delta t steps away\n                \"\"\"\n                self.velocity = speed_direction(previous_box, bbox)\n            \"\"\"\n              Insert new observations. This is an ugly way to maintain both self.observations\n              and self.history_observations. 
Bear it for the moment.\n            \"\"\"\n            self.last_observation = bbox\n            self.observations[self.age] = bbox\n            self.history_observations.append(bbox)\n\n            self.time_since_update = 0\n            self.history = []\n            self.hits += 1\n            self.hit_streak += 1\n            self.kf.update(convert_bbox_to_z(bbox))\n        else:\n            self.kf.update(bbox)\n\n    def predict(self):\n        \"\"\"\n        Advances the state vector and returns the predicted bounding box estimate.\n        \"\"\"\n        if ((self.kf.x[6] + self.kf.x[2]) <= 0):\n            self.kf.x[6] *= 0.0\n\n        self.kf.predict()\n        self.age += 1\n        if (self.time_since_update > 0):\n            self.hit_streak = 0\n        self.time_since_update += 1\n        self.history.append(convert_x_to_bbox(self.kf.x, score=self.score))\n        return self.history[-1]\n\n    def get_state(self):\n        return convert_x_to_bbox(self.kf.x, score=self.score)\n\n\n@register\n@serializable\nclass OCSORTTracker(object):\n    \"\"\"\n    OCSORT tracker, supports a single class\n\n    Args:\n        det_thresh (float): threshold of detection score\n        max_age (int): maximum number of consecutive missed frames before a track is deleted\n        min_hits (int): minimum number of hits before a track is output\n        iou_threshold (float): IoU threshold for association\n        delta_t (int): number of frames to look back for the previous observation\n        inertia (float): vdc_weight of angle_diff_cost for association\n        vertical_ratio (float): w/h, the vertical ratio of the bbox to filter\n            bad results. If set <= 0, no filtering is applied; usually set to\n            1.6 for pedestrian tracking.\n        min_box_area (int): min box area to filter out low quality boxes\n        use_byte (bool): Whether to use BYTE association, default False\n    \"\"\"\n\n    def __init__(self,\n                 det_thresh=0.6,\n                 max_age=30,\n                 min_hits=3,\n                 iou_threshold=0.3,\n                 delta_t=3,\n                 inertia=0.2,\n                 vertical_ratio=-1,\n                 min_box_area=0,\n                 use_byte=False,\n                 use_angle_cost=False):\n        self.det_thresh = det_thresh\n        self.max_age = max_age\n        self.min_hits = min_hits\n        self.iou_threshold = iou_threshold\n        self.delta_t = delta_t\n        self.inertia = inertia\n        self.vertical_ratio = vertical_ratio\n        self.min_box_area = min_box_area\n        self.use_byte = use_byte\n        self.use_angle_cost = use_angle_cost\n\n        self.trackers = []\n        self.frame_count = 0\n        KalmanBoxTracker.count = 0\n\n    def update(self, pred_dets, pred_embs=None):\n        \"\"\"\n        Args:\n            pred_dets (np.array): Detection results of the image, the shape is\n                [N, 6], means 'cls_id, score, x0, y0, x1, y1'.\n            pred_embs (np.array): Embedding results of the image, the shape is\n                [N, 128] or [N, 512], default as None.\n\n        Return:\n            tracking boxes (np.array): [M, 6], means 'x0, y0, x1, y1, score, id'.\n        \"\"\"\n        if pred_dets is None:\n            return np.empty((0, 6))\n\n        self.frame_count += 1\n\n        bboxes = pred_dets[:, 2:]\n        scores = pred_dets[:, 1:2]\n        dets = np.concatenate((bboxes, scores), axis=1)\n        scores = scores.squeeze(-1)\n\n        inds_low = scores > 0.1\n        inds_high = scores < self.det_thresh\n        
inds_second = np.logical_and(inds_low, inds_high)\n        # self.det_thresh > score > 0.1, for second matching\n        dets_second = dets[inds_second]  # detections for second matching\n        remain_inds = scores > self.det_thresh\n        dets = dets[remain_inds]\n\n        # get predicted locations from existing trackers.\n        trks = np.zeros((len(self.trackers), 5))\n        to_del = []\n        ret = []\n        for t, trk in enumerate(trks):\n            pos = self.trackers[t].predict()[0]\n            trk[:] = [pos[0], pos[1], pos[2], pos[3], 0]\n            if np.any(np.isnan(pos)):\n                to_del.append(t)\n        trks = np.ma.compress_rows(np.ma.masked_invalid(trks))\n        for t in reversed(to_del):\n            self.trackers.pop(t)\n\n        if self.use_angle_cost:\n            velocities = np.array([\n                trk.velocity if trk.velocity is not None else np.array((0, 0))\n                for trk in self.trackers\n            ])\n\n            k_observations = np.array([\n                k_previous_obs(trk.observations, trk.age, self.delta_t)\n                for trk in self.trackers\n            ])\n        last_boxes = np.array([trk.last_observation for trk in self.trackers])\n        \"\"\"\n            First round of association\n        \"\"\"\n        if self.use_angle_cost:\n            matched, unmatched_dets, unmatched_trks = associate(\n                dets, trks, self.iou_threshold, velocities, k_observations,\n                self.inertia)\n        else:\n            matched, unmatched_dets, unmatched_trks = associate_only_iou(\n                dets, trks, self.iou_threshold)\n\n        for m in matched:\n            self.trackers[m[1]].update(\n                dets[m[0], :], angle_cost=self.use_angle_cost)\n        \"\"\"\n            Second round of association by OCR (Observation-Centric Recovery)\n        \"\"\"\n        # BYTE association\n        if self.use_byte and len(dets_second) > 0 and unmatched_trks.shape[\n                0] > 0:\n            u_trks = trks[unmatched_trks]\n            iou_left = iou_batch(\n                dets_second,\n                u_trks)  # iou between low score detections and unmatched tracks\n            iou_left = np.array(iou_left)\n            if iou_left.max() > self.iou_threshold:\n                \"\"\"\n                    NOTE: by using a lower threshold, e.g., self.iou_threshold - 0.1, you may\n                    get a higher performance especially on MOT17/MOT20 datasets. 
But we keep it\n                    uniform here for simplicity\n                \"\"\"\n                matched_indices = linear_assignment(-iou_left)\n                to_remove_trk_indices = []\n                for m in matched_indices:\n                    det_ind, trk_ind = m[0], unmatched_trks[m[1]]\n                    if iou_left[m[0], m[1]] < self.iou_threshold:\n                        continue\n                    self.trackers[trk_ind].update(\n                        dets_second[det_ind, :], angle_cost=self.use_angle_cost)\n                    to_remove_trk_indices.append(trk_ind)\n                unmatched_trks = np.setdiff1d(unmatched_trks,\n                                              np.array(to_remove_trk_indices))\n\n        if unmatched_dets.shape[0] > 0 and unmatched_trks.shape[0] > 0:\n            left_dets = dets[unmatched_dets]\n            left_trks = last_boxes[unmatched_trks]\n            iou_left = iou_batch(left_dets, left_trks)\n            iou_left = np.array(iou_left)\n            if iou_left.max() > self.iou_threshold:\n                \"\"\"\n                    NOTE: by using a lower threshold, e.g., self.iou_threshold - 0.1, you may\n                    get a higher performance especially on MOT17/MOT20 datasets. But we keep it\n                    uniform here for simplicity\n                \"\"\"\n                rematched_indices = linear_assignment(-iou_left)\n                to_remove_det_indices = []\n                to_remove_trk_indices = []\n                for m in rematched_indices:\n                    det_ind, trk_ind = unmatched_dets[m[0]], unmatched_trks[m[\n                        1]]\n                    if iou_left[m[0], m[1]] < self.iou_threshold:\n                        continue\n                    self.trackers[trk_ind].update(\n                        dets[det_ind, :], angle_cost=self.use_angle_cost)\n                    to_remove_det_indices.append(det_ind)\n                    to_remove_trk_indices.append(trk_ind)\n                unmatched_dets = np.setdiff1d(unmatched_dets,\n                                              np.array(to_remove_det_indices))\n                unmatched_trks = np.setdiff1d(unmatched_trks,\n                                              np.array(to_remove_trk_indices))\n\n        for m in unmatched_trks:\n            self.trackers[m].update(None)\n\n        # create and initialise new trackers for unmatched detections\n        for i in unmatched_dets:\n            trk = KalmanBoxTracker(dets[i, :], delta_t=self.delta_t)\n            self.trackers.append(trk)\n\n        i = len(self.trackers)\n        for trk in reversed(self.trackers):\n            if trk.last_observation.sum() < 0:\n                d = trk.get_state()[0]\n            else:\n                d = trk.last_observation  # tlbr + score\n            if (trk.time_since_update < 1) and (\n                    trk.hit_streak >= self.min_hits or\n                    self.frame_count <= self.min_hits):\n                # +1 as MOT benchmark requires positive\n                ret.append(np.concatenate((d, [trk.id + 1])).reshape(1, -1))\n            i -= 1\n            # remove dead tracklet\n            if (trk.time_since_update > self.max_age):\n                self.trackers.pop(i)\n        if (len(ret) > 0):\n            return np.concatenate(ret)\n        return np.empty((0, 6))\n"
  },
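  {
    "path": "examples/mot/ocsort_tracker_demo.py",
    "content": "# Editor's note: an illustrative sketch, not part of the original repo; the\n# file path is hypothetical, the detections are made up, and it assumes\n# PaddleDetection (ppdet) is installed so the import below resolves. It feeds\n# two frames of single-class detections in the [cls_id, score, x0, y0, x1, y1]\n# layout documented by OCSORTTracker.update() and prints the returned\n# [x0, y0, x1, y1, score, id] tracking boxes.\nimport numpy as np\nfrom ppdet.modeling.mot.tracker.ocsort_tracker import OCSORTTracker\n\ntracker = OCSORTTracker(det_thresh=0.6, min_hits=1)\nfor frame in range(2):\n    pred_dets = np.array([\n        [0., 0.9, 10. + frame, 10., 50. + frame, 80.],\n        [0., 0.8, 100., 40. + frame, 160., 90. + frame],\n    ])\n    online_targets = tracker.update(pred_dets)\n    print('frame', frame, online_targets)  # one row per tracked box\n"
  },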
  {
    "path": "ppdet/modeling/mot/utils.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport cv2\nimport time\nimport numpy as np\nfrom .visualization import plot_tracking_dict, plot_tracking\n\n__all__ = [\n    'MOTTimer',\n    'Detection',\n    'write_mot_results',\n    'save_vis_results',\n    'load_det_results',\n    'preprocess_reid',\n    'get_crops',\n    'clip_box',\n    'scale_coords',\n]\n\n\nclass MOTTimer(object):\n    \"\"\"\n    This class used to compute and print the current FPS while evaling.\n    \"\"\"\n\n    def __init__(self):\n        self.total_time = 0.\n        self.calls = 0\n        self.start_time = 0.\n        self.diff = 0.\n        self.average_time = 0.\n        self.duration = 0.\n\n    def tic(self):\n        # using time.time instead of time.clock because time time.clock\n        # does not normalize for multithreading\n        self.start_time = time.time()\n\n    def toc(self, average=True):\n        self.diff = time.time() - self.start_time\n        self.total_time += self.diff\n        self.calls += 1\n        self.average_time = self.total_time / self.calls\n        if average:\n            self.duration = self.average_time\n        else:\n            self.duration = self.diff\n        return self.duration\n\n    def clear(self):\n        self.total_time = 0.\n        self.calls = 0\n        self.start_time = 0.\n        self.diff = 0.\n        self.average_time = 0.\n        self.duration = 0.\n\n\nclass Detection(object):\n    \"\"\"\n    This class represents a bounding box detection in a single image.\n\n    Args:\n        tlwh (Tensor): Bounding box in format `(top left x, top left y,\n            width, height)`.\n        score (Tensor): Bounding box confidence score.\n        feature (Tensor): A feature vector that describes the object \n            contained in this image.\n        cls_id (Tensor): Bounding box category id.\n    \"\"\"\n\n    def __init__(self, tlwh, score, feature, cls_id):\n        self.tlwh = np.asarray(tlwh, dtype=np.float32)\n        self.score = float(score)\n        self.feature = np.asarray(feature, dtype=np.float32)\n        self.cls_id = int(cls_id)\n\n    def to_tlbr(self):\n        \"\"\"\n        Convert bounding box to format `(min x, min y, max x, max y)`, i.e.,\n        `(top left, bottom right)`.\n        \"\"\"\n        ret = self.tlwh.copy()\n        ret[2:] += ret[:2]\n        return ret\n\n    def to_xyah(self):\n        \"\"\"\n        Convert bounding box to format `(center x, center y, aspect ratio,\n        height)`, where the aspect ratio is `width / height`.\n        \"\"\"\n        ret = self.tlwh.copy()\n        ret[:2] += ret[2:] / 2\n        ret[2] /= ret[3]\n        return ret\n\n\ndef write_mot_results(filename, results, data_type='mot', num_classes=1):\n    # support single and multi classes\n    if data_type in ['mot', 'mcmot']:\n        save_format = '{frame},{id},{x1},{y1},{w},{h},{score},{cls_id},-1,-1\\n'\n    elif data_type == 
'kitti':\n        save_format = '{frame} {id} car 0 0 -10 {x1} {y1} {x2} {y2} -10 -10 -10 -1000 -1000 -1000 -10\\n'\n    else:\n        raise ValueError(data_type)\n\n    f = open(filename, 'w')\n    for cls_id in range(num_classes):\n        for frame_id, tlwhs, tscores, track_ids in results[cls_id]:\n            if data_type == 'kitti':\n                frame_id -= 1\n            for tlwh, score, track_id in zip(tlwhs, tscores, track_ids):\n                if track_id < 0: continue\n                if data_type == 'mot':\n                    cls_id = -1\n\n                x1, y1, w, h = tlwh\n                x2, y2 = x1 + w, y1 + h\n                line = save_format.format(\n                    frame=frame_id,\n                    id=track_id,\n                    x1=x1,\n                    y1=y1,\n                    x2=x2,\n                    y2=y2,\n                    w=w,\n                    h=h,\n                    score=score,\n                    cls_id=cls_id)\n                f.write(line)\n    f.close()\n    print('MOT results saved in {}'.format(filename))\n\n\ndef save_vis_results(data,\n                     frame_id,\n                     online_ids,\n                     online_tlwhs,\n                     online_scores,\n                     average_time,\n                     show_image,\n                     save_dir,\n                     num_classes=1,\n                     ids2names=[]):\n    if show_image or save_dir is not None:\n        assert 'ori_image' in data\n        img0 = data['ori_image'].numpy()[0]\n        if online_ids is None:\n            online_im = img0\n        else:\n            if isinstance(online_tlwhs, dict):\n                online_im = plot_tracking_dict(\n                    img0,\n                    num_classes,\n                    online_tlwhs,\n                    online_ids,\n                    online_scores,\n                    frame_id=frame_id,\n                    fps=1. / average_time,\n                    ids2names=ids2names)\n            else:\n                online_im = plot_tracking(\n                    img0,\n                    online_tlwhs,\n                    online_ids,\n                    online_scores,\n                    frame_id=frame_id,\n                    fps=1. 
/ average_time,\n                    ids2names=ids2names)\n    if show_image:\n        cv2.imshow('online_im', online_im)\n    if save_dir is not None:\n        cv2.imwrite(\n            os.path.join(save_dir, '{:05d}.jpg'.format(frame_id)), online_im)\n\n\ndef load_det_results(det_file, num_frames):\n    assert os.path.exists(det_file) and os.path.isfile(det_file), \\\n        '{} does not exist or is not a file.'.format(det_file)\n    labels = np.loadtxt(det_file, dtype='float32', delimiter=',')\n    assert labels.shape[1] == 7, \\\n        \"Each line of {} should have 7 items: '[frame_id],[x0],[y0],[w],[h],[score],[class_id]'.\".format(det_file)\n    results_list = []\n    for frame_i in range(num_frames):\n        results = {'bbox': [], 'score': [], 'cls_id': []}\n        labels_with_frame = labels[labels[:, 0] == frame_i + 1]\n        # each line of labels_with_frame:\n        # [frame_id],[x0],[y0],[w],[h],[score],[class_id]\n        for l in labels_with_frame:\n            results['bbox'].append(l[1:5])\n            results['score'].append(l[5:6])\n            results['cls_id'].append(l[6:7])\n        results_list.append(results)\n    return results_list\n\n\ndef scale_coords(coords, input_shape, im_shape, scale_factor):\n    # Note: ratio has only one value, scale_factor[0] == scale_factor[1]\n    # \n    # This function is only used for JDE YOLOv3 or other detectors with \n    # LetterBoxResize and JDEBBoxPostProcess, whose coords output from the\n    # detector have not been scaled back to the original image.\n\n    ratio = scale_factor[0]\n    pad_w = (input_shape[1] - int(im_shape[1])) / 2\n    pad_h = (input_shape[0] - int(im_shape[0])) / 2\n    coords[:, 0::2] -= pad_w\n    coords[:, 1::2] -= pad_h\n    coords[:, 0:4] /= ratio\n    coords[:, :4] = np.clip(coords[:, :4], a_min=0, a_max=coords[:, :4].max())\n    return coords.round()\n\n\ndef clip_box(xyxy, ori_image_shape):\n    H, W = ori_image_shape\n    xyxy[:, 0::2] = np.clip(xyxy[:, 0::2], a_min=0, a_max=W)\n    xyxy[:, 1::2] = np.clip(xyxy[:, 1::2], a_min=0, a_max=H)\n    w = xyxy[:, 2:3] - xyxy[:, 0:1]\n    h = xyxy[:, 3:4] - xyxy[:, 1:2]\n    mask = np.logical_and(h > 0, w > 0)\n    keep_idx = np.nonzero(mask)\n    return xyxy[keep_idx[0]], keep_idx\n\n\ndef get_crops(xyxy, ori_img, w, h):\n    crops = []\n    xyxy = xyxy.astype(np.int64)\n    ori_img = ori_img.numpy()\n    ori_img = np.squeeze(ori_img, axis=0).transpose(1, 0, 2)  # [h,w,3]->[w,h,3]\n    for i, bbox in enumerate(xyxy):\n        crop = ori_img[bbox[0]:bbox[2], bbox[1]:bbox[3], :]\n        crops.append(crop)\n    crops = preprocess_reid(crops, w, h)\n    return crops\n\n\ndef preprocess_reid(imgs,\n                    w=64,\n                    h=192,\n                    mean=[0.485, 0.456, 0.406],\n                    std=[0.229, 0.224, 0.225]):\n    im_batch = []\n    for img in imgs:\n        img = cv2.resize(img, (w, h))\n        img = img[:, :, ::-1].astype('float32').transpose((2, 0, 1)) / 255\n        img_mean = np.array(mean).reshape((3, 1, 1))\n        img_std = np.array(std).reshape((3, 1, 1))\n        img -= img_mean\n        img /= img_std\n        img = np.expand_dims(img, axis=0)\n        im_batch.append(img)\n    im_batch = np.concatenate(im_batch, 0)\n    return im_batch\n"
  },
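  {
    "path": "examples/mot/detection_box_formats_demo.py",
    "content": "# Editor's note: an illustrative sketch, not part of the original repo; the\n# file path is hypothetical and it assumes ppdet is installed. It shows the\n# two box conversions provided by Detection above: tlwh -> tlbr and\n# tlwh -> xyah (center x, center y, aspect ratio w/h, height).\nimport numpy as np\nfrom ppdet.modeling.mot.utils import Detection\n\ndet = Detection(\n    tlwh=[10, 20, 40, 80],  # top-left x/y, width, height\n    score=0.9,\n    feature=np.zeros(128, dtype=np.float32),\n    cls_id=0)\nprint(det.to_tlbr())  # [ 10.  20.  50. 100.]\nprint(det.to_xyah())  # [30., 60., 0.5, 80.]\n"
  },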
  {
    "path": "ppdet/modeling/mot/visualization.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport cv2\nimport numpy as np\n\n\ndef get_color(idx):\n    idx = idx * 3\n    color = ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255)\n    return color\n\n\ndef plot_tracking(image,\n                  tlwhs,\n                  obj_ids,\n                  scores=None,\n                  frame_id=0,\n                  fps=0.,\n                  ids2names=[]):\n    im = np.ascontiguousarray(np.copy(image))\n    im_h, im_w = im.shape[:2]\n\n    top_view = np.zeros([im_w, im_w, 3], dtype=np.uint8) + 255\n\n    text_scale = max(1, image.shape[1] / 1600.)\n    text_thickness = 2\n    line_thickness = max(1, int(image.shape[1] / 500.))\n\n    radius = max(5, int(im_w / 140.))\n    cv2.putText(\n        im,\n        'frame: %d fps: %.2f num: %d' % (frame_id, fps, len(tlwhs)),\n        (0, int(15 * text_scale)),\n        cv2.FONT_HERSHEY_PLAIN,\n        text_scale, (0, 0, 255),\n        thickness=2)\n\n    for i, tlwh in enumerate(tlwhs):\n        x1, y1, w, h = tlwh\n        intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h)))\n        obj_id = int(obj_ids[i])\n        id_text = '{}'.format(int(obj_id))\n        if ids2names != []:\n            assert len(\n                ids2names) == 1, \"plot_tracking only supports single classes.\"\n            id_text = '{}_'.format(ids2names[0]) + id_text\n        _line_thickness = 1 if obj_id <= 0 else line_thickness\n        color = get_color(abs(obj_id))\n        cv2.rectangle(\n            im, intbox[0:2], intbox[2:4], color=color, thickness=line_thickness)\n        cv2.putText(\n            im,\n            id_text, (intbox[0], intbox[1] - 10),\n            cv2.FONT_HERSHEY_PLAIN,\n            text_scale, (0, 0, 255),\n            thickness=text_thickness)\n\n        if scores is not None:\n            text = '{:.2f}'.format(float(scores[i]))\n            cv2.putText(\n                im,\n                text, (intbox[0], intbox[1] + 10),\n                cv2.FONT_HERSHEY_PLAIN,\n                text_scale, (0, 255, 255),\n                thickness=text_thickness)\n    return im\n\n\ndef plot_tracking_dict(image,\n                       num_classes,\n                       tlwhs_dict,\n                       obj_ids_dict,\n                       scores_dict,\n                       frame_id=0,\n                       fps=0.,\n                       ids2names=[]):\n    im = np.ascontiguousarray(np.copy(image))\n    im_h, im_w = im.shape[:2]\n\n    top_view = np.zeros([im_w, im_w, 3], dtype=np.uint8) + 255\n\n    text_scale = max(1, image.shape[1] / 1600.)\n    text_thickness = 2\n    line_thickness = max(1, int(image.shape[1] / 500.))\n\n    radius = max(5, int(im_w / 140.))\n\n    for cls_id in range(num_classes):\n        tlwhs = tlwhs_dict[cls_id]\n        obj_ids = obj_ids_dict[cls_id]\n        scores = scores_dict[cls_id]\n        cv2.putText(\n            im,\n            'frame: %d fps: %.2f num: %d' % 
(frame_id, fps, len(tlwhs)),\n            (0, int(15 * text_scale)),\n            cv2.FONT_HERSHEY_PLAIN,\n            text_scale, (0, 0, 255),\n            thickness=2)\n\n        for i, tlwh in enumerate(tlwhs):\n            x1, y1, w, h = tlwh\n            intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h)))\n            obj_id = int(obj_ids[i])\n\n            id_text = '{}'.format(int(obj_id))\n            if ids2names != []:\n                id_text = '{}_{}'.format(ids2names[cls_id], id_text)\n            else:\n                id_text = 'class{}_{}'.format(cls_id, id_text)\n\n            _line_thickness = 1 if obj_id <= 0 else line_thickness\n            color = get_color(abs(obj_id))\n            cv2.rectangle(\n                im,\n                intbox[0:2],\n                intbox[2:4],\n                color=color,\n                thickness=line_thickness)\n            cv2.putText(\n                im,\n                id_text, (intbox[0], intbox[1] - 10),\n                cv2.FONT_HERSHEY_PLAIN,\n                text_scale, (0, 0, 255),\n                thickness=text_thickness)\n\n            if scores is not None:\n                text = '{:.2f}'.format(float(scores[i]))\n                cv2.putText(\n                    im,\n                    text, (intbox[0], intbox[1] + 10),\n                    cv2.FONT_HERSHEY_PLAIN,\n                    text_scale, (0, 255, 255),\n                    thickness=text_thickness)\n    return im\n"
  },
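  {
    "path": "examples/mot/plot_tracking_demo.py",
    "content": "# Editor's note: an illustrative sketch, not part of the original repo; the\n# file path is hypothetical and the boxes are made up; it assumes ppdet and\n# OpenCV are installed. It draws two tracked boxes (tlwh format) with their\n# ids and scores on a blank frame using plot_tracking above.\nimport cv2\nimport numpy as np\nfrom ppdet.modeling.mot.visualization import plot_tracking\n\nimage = np.full((480, 640, 3), 255, dtype=np.uint8)\ntlwhs = [(50, 60, 120, 240), (300, 100, 90, 180)]\nobj_ids = [1, 2]\nscores = [0.92, 0.81]\nim = plot_tracking(image, tlwhs, obj_ids, scores, frame_id=0, fps=25.)\ncv2.imwrite('tracking_vis.jpg', im)\n"
  },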
  {
    "path": "ppdet/modeling/necks/__init__.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom . import fpn\nfrom . import yolo_fpn\nfrom . import hrfpn\nfrom . import ttf_fpn\nfrom . import centernet_fpn\nfrom . import bifpn\nfrom . import csp_pan\nfrom . import es_pan\nfrom . import lc_pan\nfrom . import custom_pan\nfrom . import dilated_encoder\nfrom . import clrnet_fpn\n\nfrom .fpn import *\nfrom .yolo_fpn import *\nfrom .hrfpn import *\nfrom .ttf_fpn import *\nfrom .centernet_fpn import *\nfrom .blazeface_fpn import *\nfrom .bifpn import *\nfrom .csp_pan import *\nfrom .es_pan import *\nfrom .lc_pan import *\nfrom .custom_pan import *\nfrom .dilated_encoder import *\nfrom .channel_mapper import *\nfrom .clrnet_fpn import *\n"
  },
  {
    "path": "ppdet/modeling/necks/bifpn.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import Constant\n\nfrom ppdet.core.workspace import register, serializable\nfrom ppdet.modeling.layers import ConvNormLayer\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['BiFPN']\n\n\nclass SeparableConvLayer(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels=None,\n                 kernel_size=3,\n                 norm_type='bn',\n                 norm_groups=32,\n                 act='swish'):\n        super(SeparableConvLayer, self).__init__()\n        assert norm_type in ['bn', 'sync_bn', 'gn', None]\n        assert act in ['swish', 'relu', None]\n\n        self.in_channels = in_channels\n        if out_channels is None:\n            self.out_channels = self.in_channels\n        self.norm_type = norm_type\n        self.norm_groups = norm_groups\n        self.depthwise_conv = nn.Conv2D(\n            in_channels,\n            in_channels,\n            kernel_size,\n            padding=kernel_size // 2,\n            groups=in_channels,\n            bias_attr=False)\n        self.pointwise_conv = nn.Conv2D(in_channels, self.out_channels, 1)\n\n        # norm type\n        if self.norm_type in ['bn', 'sync_bn']:\n            self.norm = nn.BatchNorm2D(self.out_channels)\n        elif self.norm_type == 'gn':\n            self.norm = nn.GroupNorm(\n                num_groups=self.norm_groups, num_channels=self.out_channels)\n\n        # activation\n        if act == 'swish':\n            self.act = nn.Swish()\n        elif act == 'relu':\n            self.act = nn.ReLU()\n\n    def forward(self, x):\n        if self.act is not None:\n            x = self.act(x)\n        out = self.depthwise_conv(x)\n        out = self.pointwise_conv(out)\n        if self.norm_type is not None:\n            out = self.norm(out)\n        return out\n\n\nclass BiFPNCell(nn.Layer):\n    def __init__(self,\n                 channels=256,\n                 num_levels=5,\n                 eps=1e-5,\n                 use_weighted_fusion=True,\n                 kernel_size=3,\n                 norm_type='bn',\n                 norm_groups=32,\n                 act='swish'):\n        super(BiFPNCell, self).__init__()\n        self.channels = channels\n        self.num_levels = num_levels\n        self.eps = eps\n        self.use_weighted_fusion = use_weighted_fusion\n\n        # up\n        self.conv_up = nn.LayerList([\n            SeparableConvLayer(\n                self.channels,\n                kernel_size=kernel_size,\n                norm_type=norm_type,\n                norm_groups=norm_groups,\n                act=act) for _ in range(self.num_levels - 1)\n        ])\n        # down\n        self.conv_down = nn.LayerList([\n            SeparableConvLayer(\n                self.channels,\n              
  kernel_size=kernel_size,\n                norm_type=norm_type,\n                norm_groups=norm_groups,\n                act=act) for _ in range(self.num_levels - 1)\n        ])\n\n        if self.use_weighted_fusion:\n            self.up_weights = self.create_parameter(\n                shape=[self.num_levels - 1, 2],\n                attr=ParamAttr(initializer=Constant(1.)))\n            self.down_weights = self.create_parameter(\n                shape=[self.num_levels - 1, 3],\n                attr=ParamAttr(initializer=Constant(1.)))\n\n    def _feature_fusion_cell(self,\n                             conv_layer,\n                             lateral_feat,\n                             sampling_feat,\n                             route_feat=None,\n                             weights=None):\n        if self.use_weighted_fusion:\n            weights = F.relu(weights)\n            weights = weights / (weights.sum() + self.eps)\n            if route_feat is not None:\n                out_feat = weights[0] * lateral_feat + \\\n                           weights[1] * sampling_feat + \\\n                           weights[2] * route_feat\n            else:\n                out_feat = weights[0] * lateral_feat + \\\n                           weights[1] * sampling_feat\n        else:\n            if route_feat is not None:\n                out_feat = lateral_feat + sampling_feat + route_feat\n            else:\n                out_feat = lateral_feat + sampling_feat\n\n        out_feat = conv_layer(out_feat)\n        return out_feat\n\n    def forward(self, feats):\n        # feats: [P3 - P7]\n        lateral_feats = []\n\n        # up\n        up_feature = feats[-1]\n        for i, feature in enumerate(feats[::-1]):\n            if i == 0:\n                lateral_feats.append(feature)\n            else:\n                shape = feature.shape\n                up_feature = F.interpolate(\n                    up_feature, size=[shape[2], shape[3]])\n                lateral_feature = self._feature_fusion_cell(\n                    self.conv_up[i - 1],\n                    feature,\n                    up_feature,\n                    weights=self.up_weights[i - 1]\n                    if self.use_weighted_fusion else None)\n                lateral_feats.append(lateral_feature)\n                up_feature = lateral_feature\n\n        out_feats = []\n        # down\n        down_feature = lateral_feats[-1]\n        for i, (lateral_feature,\n                route_feature) in enumerate(zip(lateral_feats[::-1], feats)):\n            if i == 0:\n                out_feats.append(lateral_feature)\n            else:\n                down_feature = F.max_pool2d(down_feature, 3, 2, 1)\n                if i == len(feats) - 1:\n                    route_feature = None\n                    weights = self.down_weights[\n                        i - 1][:2] if self.use_weighted_fusion else None\n                else:\n                    weights = self.down_weights[\n                        i - 1] if self.use_weighted_fusion else None\n                out_feature = self._feature_fusion_cell(\n                    self.conv_down[i - 1],\n                    lateral_feature,\n                    down_feature,\n                    route_feature,\n                    weights=weights)\n                out_feats.append(out_feature)\n                down_feature = out_feature\n\n        return out_feats\n\n\n@register\n@serializable\nclass BiFPN(nn.Layer):\n    \"\"\"\n    Bidirectional Feature Pyramid Network, see 
https://arxiv.org/abs/1911.09070\n\n    Args:\n        in_channels (list[int]): input channels of each level which can be\n            derived from the output shape of backbone by from_config.\n        out_channel (int): output channel of each level.\n        num_extra_levels (int): the number of extra stages added to the last level.\n            default: 2\n        fpn_strides (List): The stride of each level.\n        num_stacks (int): the number of stacks for BiFPN, default: 1.\n        use_weighted_fusion (bool): use weighted feature fusion in BiFPN, default: True.\n        norm_type (string|None): the normalization type used in the BiFPN module. If\n            norm_type is None, no norm is applied after conv; otherwise one of\n            'bn', 'gn' or 'sync_bn' is available. default: bn.\n        norm_groups (int): the number of groups when norm_type is 'gn'.\n        act (string|None): the activation function of BiFPN.\n    \"\"\"\n\n    def __init__(self,\n                 in_channels=(512, 1024, 2048),\n                 out_channel=256,\n                 num_extra_levels=2,\n                 fpn_strides=[8, 16, 32, 64, 128],\n                 num_stacks=1,\n                 use_weighted_fusion=True,\n                 norm_type='bn',\n                 norm_groups=32,\n                 act='swish'):\n        super(BiFPN, self).__init__()\n        assert num_stacks > 0, \"The number of stacks of BiFPN is at least 1.\"\n        assert norm_type in ['bn', 'sync_bn', 'gn', None]\n        assert act in ['swish', 'relu', None]\n        assert num_extra_levels >= 0, \\\n            \"The `num_extra_levels` must be non-negative (>= 0).\"\n\n        self.in_channels = in_channels\n        self.out_channel = out_channel\n        self.num_extra_levels = num_extra_levels\n        self.num_stacks = num_stacks\n        self.use_weighted_fusion = use_weighted_fusion\n        self.norm_type = norm_type\n        self.norm_groups = norm_groups\n        self.act = act\n        self.num_levels = len(self.in_channels) + self.num_extra_levels\n        if len(fpn_strides) != self.num_levels:\n            for i in range(self.num_extra_levels):\n                fpn_strides += [fpn_strides[-1] * 2]\n        self.fpn_strides = fpn_strides\n\n        self.lateral_convs = nn.LayerList()\n        for in_c in in_channels:\n            self.lateral_convs.append(\n                ConvNormLayer(in_c, self.out_channel, 1, 1))\n        if self.num_extra_levels > 0:\n            self.extra_convs = nn.LayerList()\n            for i in range(self.num_extra_levels):\n                if i == 0:\n                    self.extra_convs.append(\n                        ConvNormLayer(self.in_channels[-1], self.out_channel, 3,\n                                      2))\n                else:\n                    self.extra_convs.append(nn.MaxPool2D(3, 2, 1))\n\n        self.bifpn_cells = nn.LayerList()\n        for i in range(self.num_stacks):\n            self.bifpn_cells.append(\n                BiFPNCell(\n                    self.out_channel,\n                    self.num_levels,\n                    use_weighted_fusion=self.use_weighted_fusion,\n                    norm_type=self.norm_type,\n                    norm_groups=self.norm_groups,\n                    act=self.act))\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {\n            'in_channels': [i.channels for i in input_shape],\n            'fpn_strides': [i.stride for i in input_shape]\n        }\n\n    @property\n    def out_shape(self):\n        
return [\n            ShapeSpec(\n                channels=self.out_channel, stride=s) for s in self.fpn_strides\n        ]\n\n    def forward(self, feats):\n        assert len(feats) == len(self.in_channels)\n        fpn_feats = []\n        for conv_layer, feature in zip(self.lateral_convs, feats):\n            fpn_feats.append(conv_layer(feature))\n        if self.num_extra_levels > 0:\n            feat = feats[-1]\n            for conv_layer in self.extra_convs:\n                feat = conv_layer(feat)\n                fpn_feats.append(feat)\n\n        for bifpn_cell in self.bifpn_cells:\n            fpn_feats = bifpn_cell(fpn_feats)\n        return fpn_feats\n"
  },
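  {
    "path": "examples/necks/bifpn_demo.py",
    "content": "# Editor's note: an illustrative sketch, not part of the original repo; the\n# file path is hypothetical and it assumes ppdet and paddle are installed.\n# With the defaults (num_extra_levels=2), BiFPN above turns three backbone\n# levels (e.g. C3-C5 with 512/1024/2048 channels) into five fused maps for\n# strides 8-128, all with out_channel channels.\nimport paddle\nfrom ppdet.modeling.necks.bifpn import BiFPN\n\nneck = BiFPN(in_channels=(512, 1024, 2048), out_channel=256)\nfeats = [\n    paddle.randn([1, 512, 64, 64]),   # stride 8\n    paddle.randn([1, 1024, 32, 32]),  # stride 16\n    paddle.randn([1, 2048, 16, 16]),  # stride 32\n]\nouts = neck(feats)\nprint([tuple(o.shape) for o in outs])\n# [(1, 256, 64, 64), (1, 256, 32, 32), (1, 256, 16, 16), (1, 256, 8, 8), (1, 256, 4, 4)]\n"
  },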
  {
    "path": "ppdet/modeling/necks/blazeface_fpn.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nimport paddle.nn as nn\nfrom paddle.nn.initializer import KaimingNormal\nfrom ppdet.core.workspace import register, serializable\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['BlazeNeck']\n\n\ndef hard_swish(x):\n    return x * F.relu6(x + 3) / 6.\n\n\nclass ConvBNLayer(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size,\n                 stride,\n                 padding,\n                 num_groups=1,\n                 act='relu',\n                 conv_lr=0.1,\n                 conv_decay=0.,\n                 norm_decay=0.,\n                 norm_type='bn',\n                 name=None):\n        super(ConvBNLayer, self).__init__()\n        self.act = act\n        self._conv = nn.Conv2D(\n            in_channels,\n            out_channels,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=padding,\n            groups=num_groups,\n            weight_attr=ParamAttr(\n                learning_rate=conv_lr, initializer=KaimingNormal()),\n            bias_attr=False)\n\n        if norm_type in ['sync_bn', 'bn']:\n            self._batch_norm = nn.BatchNorm2D(out_channels)\n\n    def forward(self, x):\n        x = self._conv(x)\n        x = self._batch_norm(x)\n        if self.act == \"relu\":\n            x = F.relu(x)\n        elif self.act == \"relu6\":\n            x = F.relu6(x)\n        elif self.act == 'leaky':\n            x = F.leaky_relu(x)\n        elif self.act == 'hard_swish':\n            x = hard_swish(x)\n        return x\n\n\nclass FPN(nn.Layer):\n    def __init__(self, in_channels, out_channels, name=None):\n        super(FPN, self).__init__()\n        self.conv1_fpn = ConvBNLayer(\n            in_channels,\n            out_channels // 2,\n            kernel_size=1,\n            padding=0,\n            stride=1,\n            act='leaky',\n            name=name + '_output1')\n        self.conv2_fpn = ConvBNLayer(\n            in_channels,\n            out_channels // 2,\n            kernel_size=1,\n            padding=0,\n            stride=1,\n            act='leaky',\n            name=name + '_output2')\n        self.conv3_fpn = ConvBNLayer(\n            out_channels // 2,\n            out_channels // 2,\n            kernel_size=3,\n            padding=1,\n            stride=1,\n            act='leaky',\n            name=name + '_merge')\n\n    def forward(self, input):\n        output1 = self.conv1_fpn(input[0])\n        output2 = self.conv2_fpn(input[1])\n        up2 = F.upsample(\n            output2, size=output1.shape[-2:], mode='nearest')\n        output1 = paddle.add(output1, up2)\n        output1 = self.conv3_fpn(output1)\n        return output1, output2\n\n\nclass SSH(nn.Layer):\n    def __init__(self, in_channels, out_channels, 
name=None):\n        super(SSH, self).__init__()\n        assert out_channels % 4 == 0\n        self.conv0_ssh = ConvBNLayer(\n            in_channels,\n            out_channels // 2,\n            kernel_size=3,\n            padding=1,\n            stride=1,\n            act=None,\n            name=name + 'ssh_conv3')\n        self.conv1_ssh = ConvBNLayer(\n            out_channels // 2,\n            out_channels // 4,\n            kernel_size=3,\n            padding=1,\n            stride=1,\n            act='leaky',\n            name=name + 'ssh_conv5_1')\n        self.conv2_ssh = ConvBNLayer(\n            out_channels // 4,\n            out_channels // 4,\n            kernel_size=3,\n            padding=1,\n            stride=1,\n            act=None,\n            name=name + 'ssh_conv5_2')\n        self.conv3_ssh = ConvBNLayer(\n            out_channels // 4,\n            out_channels // 4,\n            kernel_size=3,\n            padding=1,\n            stride=1,\n            act='leaky',\n            name=name + 'ssh_conv7_1')\n        self.conv4_ssh = ConvBNLayer(\n            out_channels // 4,\n            out_channels // 4,\n            kernel_size=3,\n            padding=1,\n            stride=1,\n            act=None,\n            name=name + 'ssh_conv7_2')\n\n    def forward(self, x):\n        conv0 = self.conv0_ssh(x)\n        conv1 = self.conv1_ssh(conv0)\n        conv2 = self.conv2_ssh(conv1)\n        conv3 = self.conv3_ssh(conv2)\n        conv4 = self.conv4_ssh(conv3)\n        concat = paddle.concat([conv0, conv2, conv4], axis=1)\n        return F.relu(concat)\n\n\n@register\n@serializable\nclass BlazeNeck(nn.Layer):\n    def __init__(self, in_channel, neck_type=\"None\", data_format='NCHW'):\n        super(BlazeNeck, self).__init__()\n        self.neck_type = neck_type\n        self.return_input = False\n        self._out_channels = in_channel\n        if self.neck_type == 'None':\n            self.return_input = True\n        if \"fpn\" in self.neck_type:\n            self.fpn = FPN(self._out_channels[0],\n                           self._out_channels[1],\n                           name='fpn')\n            self._out_channels = [\n                self._out_channels[0] // 2, self._out_channels[1] // 2\n            ]\n        if \"ssh\" in self.neck_type:\n            self.ssh1 = SSH(self._out_channels[0],\n                            self._out_channels[0],\n                            name='ssh1')\n            self.ssh2 = SSH(self._out_channels[1],\n                            self._out_channels[1],\n                            name='ssh2')\n            self._out_channels = [self._out_channels[0], self._out_channels[1]]\n\n    def forward(self, inputs):\n        if self.return_input:\n            return inputs\n        output1, output2 = None, None\n        if \"fpn\" in self.neck_type:\n            backout_4, backout_1 = inputs\n            output1, output2 = self.fpn([backout_4, backout_1])\n        if self.neck_type == \"only_fpn\":\n            return [output1, output2]\n        if self.neck_type == \"only_ssh\":\n            output1, output2 = inputs\n        feature1 = self.ssh1(output1)\n        feature2 = self.ssh2(output2)\n        return [feature1, feature2]\n\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(channels=c)\n            for c in [self._out_channels[0], self._out_channels[1]]\n        ]\n"
  },
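  {
    "path": "examples/necks/blazeneck_demo.py",
    "content": "# Editor's note: an illustrative sketch, not part of the original repo; the\n# file path is hypothetical, the channel counts and spatial sizes are made-up\n# stand-ins for BlazeNet's two output levels, and it assumes ppdet and paddle\n# are installed. With neck_type='fpn_ssh' the two inputs pass through the FPN\n# (which halves the channels) and then the SSH context modules.\nimport paddle\nfrom ppdet.modeling.necks.blazeface_fpn import BlazeNeck\n\nneck = BlazeNeck(in_channel=[96, 96], neck_type='fpn_ssh')\ninputs = [paddle.randn([1, 96, 32, 32]), paddle.randn([1, 96, 16, 16])]\nfeat1, feat2 = neck(inputs)\nprint(tuple(feat1.shape), tuple(feat2.shape))  # (1, 48, 32, 32) (1, 48, 16, 16)\n"
  },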
  {
    "path": "ppdet/modeling/necks/centernet_fpn.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport numpy as np\nimport math\nimport paddle\nimport paddle.nn as nn\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import Uniform\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register, serializable\nfrom ppdet.modeling.layers import ConvNormLayer\nfrom ppdet.modeling.backbones.hardnet import ConvLayer, HarDBlock\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['CenterNetDLAFPN', 'CenterNetHarDNetFPN']\n\n\n# SGE attention\nclass BasicConv(nn.Layer):\n    def __init__(self,\n                 in_planes,\n                 out_planes,\n                 kernel_size,\n                 stride=1,\n                 padding=0,\n                 dilation=1,\n                 groups=1,\n                 relu=True,\n                 bn=True,\n                 bias_attr=False):\n        super(BasicConv, self).__init__()\n        self.out_channels = out_planes\n        self.conv = nn.Conv2D(\n            in_planes,\n            out_planes,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=padding,\n            dilation=dilation,\n            groups=groups,\n            bias_attr=bias_attr)\n        self.bn = nn.BatchNorm2D(\n            out_planes,\n            epsilon=1e-5,\n            momentum=0.01,\n            weight_attr=False,\n            bias_attr=False) if bn else None\n        self.relu = nn.ReLU() if relu else None\n\n    def forward(self, x):\n        x = self.conv(x)\n        if self.bn is not None:\n            x = self.bn(x)\n        if self.relu is not None:\n            x = self.relu(x)\n        return x\n\n\nclass ChannelPool(nn.Layer):\n    def forward(self, x):\n        return paddle.concat(\n            (paddle.max(x, 1).unsqueeze(1), paddle.mean(x, 1).unsqueeze(1)),\n            axis=1)\n\n\nclass SpatialGate(nn.Layer):\n    def __init__(self):\n        super(SpatialGate, self).__init__()\n        kernel_size = 7\n        self.compress = ChannelPool()\n        self.spatial = BasicConv(\n            2,\n            1,\n            kernel_size,\n            stride=1,\n            padding=(kernel_size - 1) // 2,\n            relu=False)\n\n    def forward(self, x):\n        x_compress = self.compress(x)\n        x_out = self.spatial(x_compress)\n        scale = F.sigmoid(x_out)  # broadcasting\n        return x * scale\n\n\ndef fill_up_weights(up):\n    weight = up.weight.numpy()\n    f = math.ceil(weight.shape[2] / 2)\n    c = (2 * f - 1 - f % 2) / (2. 
* f)\n    for i in range(weight.shape[2]):\n        for j in range(weight.shape[3]):\n            weight[0, 0, i, j] = \\\n                (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c))\n    for c in range(1, weight.shape[0]):\n        weight[c, 0, :, :] = weight[0, 0, :, :]\n    up.weight.set_value(weight)\n\n\nclass IDAUp(nn.Layer):\n    def __init__(self, ch_ins, ch_out, up_strides, dcn_v2=True):\n        super(IDAUp, self).__init__()\n        for i in range(1, len(ch_ins)):\n            ch_in = ch_ins[i]\n            up_s = int(up_strides[i])\n            fan_in = ch_in * 3 * 3\n            stdv = 1. / math.sqrt(fan_in)\n            proj = nn.Sequential(\n                ConvNormLayer(\n                    ch_in,\n                    ch_out,\n                    filter_size=3,\n                    stride=1,\n                    use_dcn=dcn_v2,\n                    bias_on=dcn_v2,\n                    norm_decay=None,\n                    dcn_lr_scale=1.,\n                    dcn_regularizer=None,\n                    initializer=Uniform(-stdv, stdv)),\n                nn.ReLU())\n            node = nn.Sequential(\n                ConvNormLayer(\n                    ch_out,\n                    ch_out,\n                    filter_size=3,\n                    stride=1,\n                    use_dcn=dcn_v2,\n                    bias_on=dcn_v2,\n                    norm_decay=None,\n                    dcn_lr_scale=1.,\n                    dcn_regularizer=None,\n                    initializer=Uniform(-stdv, stdv)),\n                nn.ReLU())\n\n            kernel_size = up_s * 2\n            fan_in = ch_out * kernel_size * kernel_size\n            stdv = 1. / math.sqrt(fan_in)\n            up = nn.Conv2DTranspose(\n                ch_out,\n                ch_out,\n                kernel_size=up_s * 2,\n                stride=up_s,\n                padding=up_s // 2,\n                groups=ch_out,\n                weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)),\n                bias_attr=False)\n            fill_up_weights(up)\n            setattr(self, 'proj_' + str(i), proj)\n            setattr(self, 'up_' + str(i), up)\n            setattr(self, 'node_' + str(i), node)\n\n    def forward(self, inputs, start_level, end_level):\n        for i in range(start_level + 1, end_level):\n            upsample = getattr(self, 'up_' + str(i - start_level))\n            project = getattr(self, 'proj_' + str(i - start_level))\n            inputs[i] = project(inputs[i])\n            inputs[i] = upsample(inputs[i])\n            node = getattr(self, 'node_' + str(i - start_level))\n            inputs[i] = node(paddle.add(inputs[i], inputs[i - 1]))\n        return inputs\n\n\nclass DLAUp(nn.Layer):\n    def __init__(self, start_level, channels, scales, ch_in=None, dcn_v2=True):\n        super(DLAUp, self).__init__()\n        self.start_level = start_level\n        if ch_in is None:\n            ch_in = channels\n        self.channels = channels\n        channels = list(channels)\n        scales = np.array(scales, dtype=int)\n        for i in range(len(channels) - 1):\n            j = -i - 2\n            setattr(\n                self,\n                'ida_{}'.format(i),\n                IDAUp(\n                    ch_in[j:],\n                    channels[j],\n                    scales[j:] // scales[j],\n                    dcn_v2=dcn_v2))\n            scales[j + 1:] = scales[j]\n            ch_in[j + 1:] = [channels[j] for _ in channels[j + 1:]]\n\n    def forward(self, inputs):\n   
     out = [inputs[-1]]  # start with 32\n        for i in range(len(inputs) - self.start_level - 1):\n            ida = getattr(self, 'ida_{}'.format(i))\n            outputs = ida(inputs, len(inputs) - i - 2, len(inputs))\n            out.insert(0, outputs[-1])\n        return out\n\n\n@register\n@serializable\nclass CenterNetDLAFPN(nn.Layer):\n    \"\"\"\n    Args:\n        in_channels (list): number of input feature channels from backbone.\n            [16, 32, 64, 128, 256, 512] by default, means the channels of DLA-34\n        down_ratio (int): the down ratio from images to heatmap, 4 by default\n        last_level (int): the last level of input feature fed into the upsampling block\n        out_channel (int): the channel of the output feature, 0 by default means\n            the channel of the input feature whose down ratio is `down_ratio`\n        first_level (int|None): the first level of input feature fed into the upsampling block.\n            if None, the first level stands for log2(down_ratio)\n        dcn_v2 (bool): whether to use DCNv2, True by default\n        with_sge (bool): whether to use SGE attention, False by default\n    \"\"\"\n\n    def __init__(self,\n                 in_channels,\n                 down_ratio=4,\n                 last_level=5,\n                 out_channel=0,\n                 first_level=None,\n                 dcn_v2=True,\n                 with_sge=False):\n        super(CenterNetDLAFPN, self).__init__()\n        self.first_level = int(np.log2(\n            down_ratio)) if first_level is None else first_level\n        assert self.first_level >= 0, \"first level in CenterNetDLAFPN should be greater than or equal to 0, but received {}\".format(\n            self.first_level)\n        self.down_ratio = down_ratio\n        self.last_level = last_level\n        scales = [2**i for i in range(len(in_channels[self.first_level:]))]\n        self.dla_up = DLAUp(\n            self.first_level,\n            in_channels[self.first_level:],\n            scales,\n            dcn_v2=dcn_v2)\n        self.out_channel = out_channel\n        if out_channel == 0:\n            self.out_channel = in_channels[self.first_level]\n        self.ida_up = IDAUp(\n            in_channels[self.first_level:self.last_level],\n            self.out_channel,\n            [2**i for i in range(self.last_level - self.first_level)],\n            dcn_v2=dcn_v2)\n\n        self.with_sge = with_sge\n        if self.with_sge:\n            self.sge_attention = SpatialGate()\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_channels': [i.channels for i in input_shape]}\n\n    def forward(self, body_feats):\n\n        inputs = [body_feats[i] for i in range(len(body_feats))]\n\n        dla_up_feats = self.dla_up(inputs)\n\n        ida_up_feats = []\n        for i in range(self.last_level - self.first_level):\n            ida_up_feats.append(dla_up_feats[i].clone())\n\n        self.ida_up(ida_up_feats, 0, len(ida_up_feats))\n\n        feat = ida_up_feats[-1]\n        if self.with_sge:\n            feat = self.sge_attention(feat)\n        if self.down_ratio != 4:\n            feat = F.interpolate(\n                feat,\n                scale_factor=self.down_ratio // 4,\n                mode=\"bilinear\",\n                align_corners=True)\n        return feat\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=self.out_channel, stride=self.down_ratio)]\n\n\nclass TransitionUp(nn.Layer):\n    def __init__(self, in_channels, out_channels):\n  
       super().__init__()\n\n    def forward(self, x, skip):\n        h, w = skip.shape[2], skip.shape[3]\n        out = F.interpolate(x, size=(h, w), mode=\"bilinear\", align_corners=True)\n        out = paddle.concat([out, skip], 1)\n        return out\n\n\n@register\n@serializable\nclass CenterNetHarDNetFPN(nn.Layer):\n    \"\"\"\n    Args:\n        in_channels (list): number of input feature channels from backbone.\n            [96, 214, 458, 784] by default, means the channels of HarDNet85\n        num_layers (int): HarDNet layers, 85 by default\n        down_ratio (int): the down ratio from images to heatmap, 4 by default\n        first_level (int|None): the first level of input feature fed into the upsampling block.\n            if None, the first level stands for log2(down_ratio) - 1\n\n        last_level (int): the last level of input feature fed into the upsampling block\n        out_channel (int): the channel of the output feature, 0 by default means\n            the channel of the input feature whose down ratio is `down_ratio`\n    \"\"\"\n\n    def __init__(self,\n                 in_channels,\n                 num_layers=85,\n                 down_ratio=4,\n                 first_level=None,\n                 last_level=4,\n                 out_channel=0):\n        super(CenterNetHarDNetFPN, self).__init__()\n        self.first_level = int(np.log2(\n            down_ratio)) - 1 if first_level is None else first_level\n        assert self.first_level >= 0, \"first level in CenterNetHarDNetFPN should be greater than or equal to 0, but received {}\".format(\n            self.first_level)\n        self.down_ratio = down_ratio\n        self.last_level = last_level\n        self.last_pool = nn.AvgPool2D(kernel_size=2, stride=2)\n\n        assert num_layers in [68, 85], \"HarDNet-{} is not supported.\".format(\n            num_layers)\n        if num_layers == 85:\n            self.last_proj = ConvLayer(784, 256, kernel_size=1)\n            self.last_blk = HarDBlock(768, 80, 1.7, 8)\n            self.skip_nodes = [1, 3, 8, 13]\n            self.SC = [32, 32, 0]\n            gr = [64, 48, 28]\n            layers = [8, 8, 4]\n            ch_list2 = [224 + self.SC[0], 160 + self.SC[1], 96 + self.SC[2]]\n            channels = [96, 214, 458, 784]\n            self.skip_lv = 3\n\n        elif num_layers == 68:\n            self.last_proj = ConvLayer(654, 192, kernel_size=1)\n            self.last_blk = HarDBlock(576, 72, 1.7, 8)\n            self.skip_nodes = [1, 3, 8, 11]\n            self.SC = [32, 32, 0]\n            gr = [48, 32, 20]\n            layers = [8, 8, 4]\n            ch_list2 = [224 + self.SC[0], 96 + self.SC[1], 64 + self.SC[2]]\n            channels = [64, 124, 328, 654]\n            self.skip_lv = 2\n\n        self.transUpBlocks = nn.LayerList([])\n        self.denseBlocksUp = nn.LayerList([])\n        self.conv1x1_up = nn.LayerList([])\n        self.avg9x9 = nn.AvgPool2D(kernel_size=(9, 9), stride=1, padding=(4, 4))\n        prev_ch = self.last_blk.get_out_ch()\n\n        for i in range(3):\n            skip_ch = channels[3 - i]\n            self.transUpBlocks.append(TransitionUp(prev_ch, prev_ch))\n            if i < self.skip_lv:\n                cur_ch = prev_ch + skip_ch\n            else:\n                cur_ch = prev_ch\n            self.conv1x1_up.append(\n                ConvLayer(\n                    cur_ch, ch_list2[i], kernel_size=1))\n            cur_ch = ch_list2[i]\n            cur_ch -= self.SC[i]\n            cur_ch *= 3\n\n            blk = HarDBlock(cur_ch, gr[i], 
1.7, layers[i])\n            self.denseBlocksUp.append(blk)\n            prev_ch = blk.get_out_ch()\n\n        prev_ch += self.SC[0] + self.SC[1] + self.SC[2]\n        self.out_channel = prev_ch\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_channels': [i.channels for i in input_shape]}\n\n    def forward(self, body_feats):\n        x = body_feats[-1]\n        x_sc = []\n        x = self.last_proj(x)\n        x = self.last_pool(x)\n        x2 = self.avg9x9(x)\n        x3 = x / (x.sum((2, 3), keepdim=True) + 0.1)\n        x = paddle.concat([x, x2, x3], 1)\n        x = self.last_blk(x)\n\n        for i in range(3):\n            skip_x = body_feats[3 - i]\n            x_up = self.transUpBlocks[i](x, skip_x)\n            x_ch = self.conv1x1_up[i](x_up)\n            if self.SC[i] > 0:\n                end = x_ch.shape[1]\n                new_st = end - self.SC[i]\n                x_sc.append(x_ch[:, new_st:, :, :])\n                x_ch = x_ch[:, :new_st, :, :]\n            x2 = self.avg9x9(x_ch)\n            x3 = x_ch / (x_ch.sum((2, 3), keepdim=True) + 0.1)\n            x_new = paddle.concat([x_ch, x2, x3], 1)\n            x = self.denseBlocksUp[i](x_new)\n\n        scs = [x]\n        for i in range(3):\n            if self.SC[i] > 0:\n                scs.insert(\n                    0,\n                    F.interpolate(\n                        x_sc[i],\n                        size=(x.shape[2], x.shape[3]),\n                        mode=\"bilinear\",\n                        align_corners=True))\n        neck_feat = paddle.concat(scs, 1)\n        return neck_feat\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=self.out_channel, stride=self.down_ratio)]\n"
  },
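`fill_up_weights` above seeds each channel of IDAUp's grouped `Conv2DTranspose` with a fixed bilinear-interpolation kernel, so the learned upsampling starts out as exact bilinear resizing. A minimal NumPy sketch of the same arithmetic (k = 4 corresponds to IDAUp's stride-2 case, since it uses kernel_size = 2 * up_s):

import math
import numpy as np

def bilinear_kernel(k):
    # Same formula as fill_up_weights: f is the half-size, c the kernel center.
    f = math.ceil(k / 2)
    c = (2 * f - 1 - f % 2) / (2. * f)
    w = np.zeros((k, k))
    for i in range(k):
        for j in range(k):
            w[i, j] = (1 - abs(i / f - c)) * (1 - abs(j / f - c))
    return w

print(bilinear_kernel(4))
# Outer product of [0.25, 0.75, 0.75, 0.25] with itself: the classic 2x
# bilinear upsampling kernel, copied into every channel of the grouped conv.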
  {
    "path": "ppdet/modeling/necks/channel_mapper.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\"\"\"\nthis code is base on mmdet: git@github.com:open-mmlab/mmdetection.git\n\"\"\"\nimport paddle.nn as nn\n\nfrom ppdet.core.workspace import register, serializable\nfrom ..backbones.hrnet import ConvNormLayer\nfrom ..shape_spec import ShapeSpec\nfrom ..initializer import xavier_uniform_, constant_\n\n__all__ = ['ChannelMapper']\n\n\n@register\n@serializable\nclass ChannelMapper(nn.Layer):\n    \"\"\"Channel Mapper to reduce/increase channels of backbone features.\n\n    This is used to reduce/increase channels of backbone features.\n\n    Args:\n        in_channels (List[int]): Number of input channels per scale.\n        out_channels (int): Number of output channels (used at each scale).\n        kernel_size (int, optional): kernel_size for reducing channels (used\n            at each scale). Default: 3.\n        conv_cfg (dict, optional): Config dict for convolution layer.\n            Default: None.\n        norm_cfg (dict, optional): Config dict for normalization layer.\n            Default: None.\n        act_cfg (dict, optional): Config dict for activation layer in\n            ConvModule. Default: dict(type='ReLU').\n        num_outs (int, optional): Number of output feature maps. 
There\n            would be extra_convs when num_outs larger than the length\n            of in_channels.\n        init_cfg (dict or list[dict], optional): Initialization config dict.\n        \n    \"\"\"\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size=3,\n                 norm_type=\"gn\",\n                 norm_groups=32,\n                 act='relu',\n                 num_outs=None,\n                 init_cfg=dict(\n                     type='Xavier', layer='Conv2d', distribution='uniform')):\n        super(ChannelMapper, self).__init__()\n        assert isinstance(in_channels, list)\n        self.extra_convs = None\n        if num_outs is None:\n            num_outs = len(in_channels)\n        self.convs = nn.LayerList()\n        for in_channel in in_channels:\n            self.convs.append(\n                ConvNormLayer(\n                    ch_in=in_channel,\n                    ch_out=out_channels,\n                    filter_size=kernel_size,\n                    norm_type='gn',\n                    norm_groups=32,\n                    act=act))\n\n        if num_outs > len(in_channels):\n            self.extra_convs = nn.LayerList()\n            for i in range(len(in_channels), num_outs):\n                if i == len(in_channels):\n                    in_channel = in_channels[-1]\n                else:\n                    in_channel = out_channels\n                self.extra_convs.append(\n                    ConvNormLayer(\n                        ch_in=in_channel,\n                        ch_out=out_channels,\n                        filter_size=3,\n                        stride=2,\n                        norm_type='gn',\n                        norm_groups=32,\n                        act=act))\n        self.init_weights()\n\n    def forward(self, inputs):\n        \"\"\"Forward function.\"\"\"\n        assert len(inputs) == len(self.convs)\n        outs = [self.convs[i](inputs[i]) for i in range(len(inputs))]\n        if self.extra_convs:\n            for i in range(len(self.extra_convs)):\n                if i == 0:\n                    outs.append(self.extra_convs[0](inputs[-1]))\n                else:\n                    outs.append(self.extra_convs[i](outs[-1]))\n        return tuple(outs)\n\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(\n                channels=self.out_channel, stride=1. / s)\n            for s in self.spatial_scales\n        ]\n\n    def init_weights(self):\n        \"\"\"Initialize the transformer weights.\"\"\"\n        for p in self.parameters():\n            if p.rank() > 1:\n                xavier_uniform_(p)\n                if hasattr(p, 'bias') and p.bias is not None:\n                    constant_(p.bais)\n"
  },
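The forward path of `ChannelMapper` is easiest to follow as a shape trace: one ConvNormLayer per input scale keeps the spatial size, and when `num_outs` exceeds `len(in_channels)` the first extra stride-2 conv reads the last input map while later ones chain on the previous extra output. A shape-only sketch with assumed channel counts and sizes:

# Hypothetical C3..C5 inputs as (channels, (h, w)); out_channels/num_outs assumed.
in_channels = [512, 1024, 2048]
sizes = [(100, 152), (50, 76), (25, 38)]
out_channels, num_outs = 256, 4

outs = [(out_channels, hw) for hw in sizes]     # per-scale ConvNormLayers
h, w = sizes[-1]
for _ in range(num_outs - len(in_channels)):    # extra 3x3, stride-2 convs
    h, w = (h + 1) // 2, (w + 1) // 2           # k=3, s=2, p=1 halves, rounded up
    outs.append((out_channels, (h, w)))
print(outs)   # four 256-channel maps; the extra level at half the C5 resolution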
  {
    "path": "ppdet/modeling/necks/clrnet_fpn.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import XavierUniform\nfrom ppdet.modeling.initializer import kaiming_normal_, constant_\nfrom ppdet.core.workspace import register, serializable\nfrom ppdet.modeling.layers import ConvNormLayer\nfrom ppdet.modeling.shape_spec import ShapeSpec\n\n__all__ = ['CLRFPN']\n\n\n@register\n@serializable\nclass CLRFPN(nn.Layer):\n    \"\"\"\n    Feature Pyramid Network, see https://arxiv.org/abs/1612.03144\n    Args:\n        in_channels (list[int]): input channels of each level which can be \n            derived from the output shape of backbone by from_config\n        out_channel (int): output channel of each level\n        spatial_scales (list[float]): the spatial scales between input feature\n            maps and original input image which can be derived from the output \n            shape of backbone by from_config\n        has_extra_convs (bool): whether to add extra conv to the last level.\n            default False\n        extra_stage (int): the number of extra stages added to the last level.\n            default 1\n        use_c5 (bool): Whether to use c5 as the input of extra stage, \n            otherwise p5 is used. default True\n        norm_type (string|None): The normalization type in FPN module. If \n            norm_type is None, norm will not be used after conv and if \n            norm_type is string, bn, gn, sync_bn are available. default None\n        norm_decay (float): weight decay for normalization layer weights.\n            default 0.\n        freeze_norm (bool): whether to freeze normalization layer.  
\n            default False\n        relu_before_extra_convs (bool): whether to add relu before extra convs.\n            default False\n        \n    \"\"\"\n\n    def __init__(self,\n                 in_channels,\n                 out_channel,\n                 spatial_scales=[0.25, 0.125, 0.0625, 0.03125],\n                 has_extra_convs=False,\n                 extra_stage=1,\n                 use_c5=True,\n                 norm_type=None,\n                 norm_decay=0.,\n                 freeze_norm=False,\n                 relu_before_extra_convs=True):\n        super(CLRFPN, self).__init__()\n        self.out_channel = out_channel\n        for s in range(extra_stage):\n            spatial_scales = spatial_scales + [spatial_scales[-1] / 2.]\n        self.spatial_scales = spatial_scales\n        self.has_extra_convs = has_extra_convs\n        self.extra_stage = extra_stage\n        self.use_c5 = use_c5\n        self.relu_before_extra_convs = relu_before_extra_convs\n        self.norm_type = norm_type\n        self.norm_decay = norm_decay\n        self.freeze_norm = freeze_norm\n        self.in_channels = in_channels\n        self.lateral_convs = []\n        self.fpn_convs = []\n        fan = out_channel * 3 * 3\n\n        # stage index 0,1,2,3 stands for res2,res3,res4,res5 on ResNet Backbone\n        # 0 <= st_stage < ed_stage <= 3\n        st_stage = 4 - len(in_channels)\n        ed_stage = st_stage + len(in_channels) - 1\n\n        for i in range(st_stage, ed_stage + 1):\n            # if i == 3:\n            #     lateral_name = 'fpn_inner_res5_sum'\n            # else:\n            #     lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2)\n            lateral_name = \"lateral_convs.{}.conv\".format(i - 1)\n            in_c = in_channels[i - st_stage]\n            if self.norm_type is not None:\n                lateral = self.add_sublayer(\n                    lateral_name,\n                    ConvNormLayer(\n                        ch_in=in_c,\n                        ch_out=out_channel,\n                        filter_size=1,\n                        stride=1,\n                        norm_type=self.norm_type,\n                        norm_decay=self.norm_decay,\n                        freeze_norm=self.freeze_norm,\n                        initializer=XavierUniform(fan_out=in_c)))\n            else:\n                lateral = self.add_sublayer(\n                    lateral_name,\n                    nn.Conv2D(\n                        in_channels=in_c,\n                        out_channels=out_channel,\n                        kernel_size=1,\n                        weight_attr=ParamAttr(\n                            initializer=XavierUniform(fan_out=in_c))))\n            self.lateral_convs.append(lateral)\n\n            fpn_name = \"fpn_convs.{}.conv\".format(i - 1)\n            if self.norm_type is not None:\n                fpn_conv = self.add_sublayer(\n                    fpn_name,\n                    ConvNormLayer(\n                        ch_in=out_channel,\n                        ch_out=out_channel,\n                        filter_size=3,\n                        stride=1,\n                        norm_type=self.norm_type,\n                        norm_decay=self.norm_decay,\n                        freeze_norm=self.freeze_norm,\n                        initializer=XavierUniform(fan_out=fan)))\n            else:\n                fpn_conv = self.add_sublayer(\n                    fpn_name,\n                    nn.Conv2D(\n                        
in_channels=out_channel,\n                        out_channels=out_channel,\n                        kernel_size=3,\n                        padding=1,\n                        weight_attr=ParamAttr(\n                            initializer=XavierUniform(fan_out=fan))))\n            self.fpn_convs.append(fpn_conv)\n\n        # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5)\n        if self.has_extra_convs:\n            for i in range(self.extra_stage):\n                lvl = ed_stage + 1 + i\n                if i == 0 and self.use_c5:\n                    in_c = in_channels[-1]\n                else:\n                    in_c = out_channel\n                extra_fpn_name = 'fpn_{}'.format(lvl + 2)\n                if self.norm_type is not None:\n                    extra_fpn_conv = self.add_sublayer(\n                        extra_fpn_name,\n                        ConvNormLayer(\n                            ch_in=in_c,\n                            ch_out=out_channel,\n                            filter_size=3,\n                            stride=2,\n                            norm_type=self.norm_type,\n                            norm_decay=self.norm_decay,\n                            freeze_norm=self.freeze_norm,\n                            initializer=XavierUniform(fan_out=fan)))\n                else:\n                    extra_fpn_conv = self.add_sublayer(\n                        extra_fpn_name,\n                        nn.Conv2D(\n                            in_channels=in_c,\n                            out_channels=out_channel,\n                            kernel_size=3,\n                            stride=2,\n                            padding=1,\n                            weight_attr=ParamAttr(\n                                initializer=XavierUniform(fan_out=fan))))\n                self.fpn_convs.append(extra_fpn_conv)\n        self.init_weights()\n\n    def init_weights(self):\n        for m in self.lateral_convs:\n            if isinstance(m, (nn.Conv1D, nn.Conv2D)):\n                kaiming_normal_(\n                    m.weight, a=0, mode='fan_out', nonlinearity='relu')\n                if m.bias is not None:\n                    constant_(m.bias, value=0.)\n            elif isinstance(m, (nn.BatchNorm1D, nn.BatchNorm2D)):\n                constant_(m.weight, value=1)\n                constant_(m.bias, value=0)\n        for m in self.fpn_convs:\n            if isinstance(m, (nn.Conv1D, nn.Conv2D)):\n                kaiming_normal_(\n                    m.weight, a=0, mode='fan_out', nonlinearity='relu')\n                if m.bias is not None:\n                    constant_(m.bias, value=0.)\n            elif isinstance(m, (nn.BatchNorm1D, nn.BatchNorm2D)):\n                constant_(m.weight, value=1)\n                constant_(m.bias, value=0)\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {}\n\n    def forward(self, body_feats):\n        laterals = []\n        if len(body_feats) > len(self.in_channels):\n            for _ in range(len(body_feats) - len(self.in_channels)):\n                del body_feats[0]\n        num_levels = len(body_feats)\n        # print(\"body_feats\",num_levels)\n        for i in range(num_levels):\n            laterals.append(self.lateral_convs[i](body_feats[i]))\n\n        for i in range(1, num_levels):\n            lvl = num_levels - i\n            upsample = F.interpolate(\n                laterals[lvl],\n                scale_factor=2.,\n                mode='nearest', )\n            
laterals[lvl - 1] += upsample\n\n        fpn_output = []\n        for lvl in range(num_levels):\n            fpn_output.append(self.fpn_convs[lvl](laterals[lvl]))\n\n        if self.extra_stage > 0:\n            # use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN)\n            if not self.has_extra_convs:\n                assert self.extra_stage == 1, 'extra_stage should be 1 if FPN has no extra convs'\n                fpn_output.append(F.max_pool2d(fpn_output[-1], 1, stride=2))\n            # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5)\n            else:\n                if self.use_c5:\n                    extra_source = body_feats[-1]\n                else:\n                    extra_source = fpn_output[-1]\n                fpn_output.append(self.fpn_convs[num_levels](extra_source))\n\n                for i in range(1, self.extra_stage):\n                    if self.relu_before_extra_convs:\n                        fpn_output.append(self.fpn_convs[num_levels + i](F.relu(\n                            fpn_output[-1])))\n                    else:\n                        fpn_output.append(self.fpn_convs[num_levels + i](\n                            fpn_output[-1]))\n        return fpn_output\n\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(\n                channels=self.out_channel, stride=1. / s)\n            for s in self.spatial_scales\n        ]\n"
  },
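The heart of `CLRFPN.forward` is the top-down accumulation: each lateral receives a 2x nearest-neighbour upsampling of the coarser lateral before the 3x3 output convs run. A toy NumPy trace of just that loop (array contents are made up):

import numpy as np

def upsample2x_nearest(x):
    # Stand-in for F.interpolate(..., scale_factor=2., mode='nearest')
    return x.repeat(2, axis=-2).repeat(2, axis=-1)

laterals = [np.ones((1, 8, 8)), 2 * np.ones((1, 4, 4)), 3 * np.ones((1, 2, 2))]
for lvl in range(len(laterals) - 1, 0, -1):
    laterals[lvl - 1] += upsample2x_nearest(laterals[lvl])
print([float(l[0, 0, 0]) for l in laterals])
# [6.0, 5.0, 3.0]: the coarsest level cascades into every finer level.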
  {
    "path": "ppdet/modeling/necks/csp_pan.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\n# The code is based on:\n# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/necks/yolox_pafpn.py\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom ppdet.core.workspace import register, serializable\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['CSPPAN']\n\n\nclass ConvBNLayer(nn.Layer):\n    def __init__(self,\n                 in_channel=96,\n                 out_channel=96,\n                 kernel_size=3,\n                 stride=1,\n                 groups=1,\n                 act='leaky_relu'):\n        super(ConvBNLayer, self).__init__()\n        initializer = nn.initializer.KaimingUniform()\n        self.conv = nn.Conv2D(\n            in_channels=in_channel,\n            out_channels=out_channel,\n            kernel_size=kernel_size,\n            groups=groups,\n            padding=(kernel_size - 1) // 2,\n            stride=stride,\n            weight_attr=ParamAttr(initializer=initializer),\n            bias_attr=False)\n        self.bn = nn.BatchNorm2D(out_channel)\n        if act == \"hard_swish\":\n            act = 'hardswish'\n        self.act = act\n\n    def forward(self, x):\n        x = self.bn(self.conv(x))\n        if self.act:\n            x = getattr(F, self.act)(x)\n        return x\n\n\nclass DPModule(nn.Layer):\n    \"\"\"\n    Depth-wise and point-wise module.\n     Args:\n        in_channel (int): The input channels of this Module.\n        out_channel (int): The output channels of this Module.\n        kernel_size (int): The conv2d kernel size of this Module.\n        stride (int): The conv2d's stride of this Module.\n        act (str): The activation function of this Module,\n                   Now support `leaky_relu` and `hard_swish`.\n    \"\"\"\n\n    def __init__(self,\n                 in_channel=96,\n                 out_channel=96,\n                 kernel_size=3,\n                 stride=1,\n                 act='leaky_relu',\n                 use_act_in_out=True):\n        super(DPModule, self).__init__()\n        initializer = nn.initializer.KaimingUniform()\n        self.use_act_in_out = use_act_in_out\n        self.dwconv = nn.Conv2D(\n            in_channels=in_channel,\n            out_channels=out_channel,\n            kernel_size=kernel_size,\n            groups=out_channel,\n            padding=(kernel_size - 1) // 2,\n            stride=stride,\n            weight_attr=ParamAttr(initializer=initializer),\n            bias_attr=False)\n        self.bn1 = nn.BatchNorm2D(out_channel)\n        self.pwconv = nn.Conv2D(\n            in_channels=out_channel,\n            out_channels=out_channel,\n            kernel_size=1,\n            groups=1,\n            padding=0,\n            weight_attr=ParamAttr(initializer=initializer),\n            bias_attr=False)\n        self.bn2 = 
nn.BatchNorm2D(out_channel)\n        if act == \"hard_swish\":\n            act = 'hardswish'\n        self.act = act\n\n    def forward(self, x):\n        x = self.bn1(self.dwconv(x))\n        if self.act:\n            x = getattr(F, self.act)(x)\n        x = self.bn2(self.pwconv(x))\n        if self.use_act_in_out and self.act:\n            x = getattr(F, self.act)(x)\n        return x\n\n\nclass DarknetBottleneck(nn.Layer):\n    \"\"\"The basic bottleneck block used in Darknet.\n\n    Each Block consists of two ConvModules and the input is added to the\n    final output. Each ConvModule is composed of Conv, BN, and act.\n    The first convLayer has filter size of 1x1 and the second one has the\n    filter size of 3x3.\n\n    Args:\n        in_channels (int): The input channels of this Module.\n        out_channels (int): The output channels of this Module.\n        expansion (float): The expand ratio of the hidden channels. Default: 0.5\n        add_identity (bool): Whether to add identity to the out.\n            Default: True\n        use_depthwise (bool): Whether to use depthwise separable convolution.\n            Default: False\n    \"\"\"\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size=3,\n                 expansion=0.5,\n                 add_identity=True,\n                 use_depthwise=False,\n                 act=\"leaky_relu\"):\n        super(DarknetBottleneck, self).__init__()\n        hidden_channels = int(out_channels * expansion)\n        conv_func = DPModule if use_depthwise else ConvBNLayer\n        self.conv1 = ConvBNLayer(\n            in_channel=in_channels,\n            out_channel=hidden_channels,\n            kernel_size=1,\n            act=act)\n        self.conv2 = conv_func(\n            in_channel=hidden_channels,\n            out_channel=out_channels,\n            kernel_size=kernel_size,\n            stride=1,\n            act=act)\n        self.add_identity = \\\n            add_identity and in_channels == out_channels\n\n    def forward(self, x):\n        identity = x\n        out = self.conv1(x)\n        out = self.conv2(out)\n\n        if self.add_identity:\n            return out + identity\n        else:\n            return out\n\n\nclass CSPLayer(nn.Layer):\n    \"\"\"Cross Stage Partial Layer.\n\n    Args:\n        in_channels (int): The input channels of the CSP layer.\n        out_channels (int): The output channels of the CSP layer.\n        expand_ratio (float): Ratio to adjust the number of channels of the\n            hidden layer. Default: 0.5\n        num_blocks (int): Number of blocks. Default: 1\n        add_identity (bool): Whether to add identity in blocks.\n            Default: True\n        use_depthwise (bool): Whether to use depthwise separable convolution in\n            blocks. 
Default: False\n    \"\"\"\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size=3,\n                 expand_ratio=0.5,\n                 num_blocks=1,\n                 add_identity=True,\n                 use_depthwise=False,\n                 act=\"leaky_relu\"):\n        super().__init__()\n        mid_channels = int(out_channels * expand_ratio)\n        self.main_conv = ConvBNLayer(in_channels, mid_channels, 1, act=act)\n        self.short_conv = ConvBNLayer(in_channels, mid_channels, 1, act=act)\n        self.final_conv = ConvBNLayer(\n            2 * mid_channels, out_channels, 1, act=act)\n\n        self.blocks = nn.Sequential(* [\n            DarknetBottleneck(\n                mid_channels,\n                mid_channels,\n                kernel_size,\n                1.0,\n                add_identity,\n                use_depthwise,\n                act=act) for _ in range(num_blocks)\n        ])\n\n    def forward(self, x):\n        x_short = self.short_conv(x)\n\n        x_main = self.main_conv(x)\n        x_main = self.blocks(x_main)\n\n        x_final = paddle.concat((x_main, x_short), axis=1)\n        return self.final_conv(x_final)\n\n\nclass Channel_T(nn.Layer):\n    def __init__(self,\n                 in_channels=[116, 232, 464],\n                 out_channels=96,\n                 act=\"leaky_relu\"):\n        super(Channel_T, self).__init__()\n        self.convs = nn.LayerList()\n        for i in range(len(in_channels)):\n            self.convs.append(\n                ConvBNLayer(\n                    in_channels[i], out_channels, 1, act=act))\n\n    def forward(self, x):\n        outs = [self.convs[i](x[i]) for i in range(len(x))]\n        return outs\n\n\n@register\n@serializable\nclass CSPPAN(nn.Layer):\n    \"\"\"Path Aggregation Network with CSP module.\n\n    Args:\n        in_channels (List[int]): Number of input channels per scale.\n        out_channels (int): Number of output channels (used at each scale)\n        kernel_size (int): The conv2d kernel size of this Module.\n        num_features (int): Number of output features of CSPPAN module.\n        num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1\n        use_depthwise (bool): Whether to use depthwise separable convolution in\n            blocks. 
Default: True\n    \"\"\"\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size=5,\n                 num_features=3,\n                 num_csp_blocks=1,\n                 use_depthwise=True,\n                 act='hard_swish',\n                 spatial_scales=[0.125, 0.0625, 0.03125]):\n        super(CSPPAN, self).__init__()\n        self.conv_t = Channel_T(in_channels, out_channels, act=act)\n        in_channels = [out_channels] * len(spatial_scales)\n        self.in_channels = in_channels\n        self.out_channels = out_channels\n        self.spatial_scales = spatial_scales\n        self.num_features = num_features\n        conv_func = DPModule if use_depthwise else ConvBNLayer\n\n        if self.num_features == 4:\n            self.first_top_conv = conv_func(\n                in_channels[0], in_channels[0], kernel_size, stride=2, act=act)\n            self.second_top_conv = conv_func(\n                in_channels[0], in_channels[0], kernel_size, stride=2, act=act)\n            self.spatial_scales.append(self.spatial_scales[-1] / 2)\n\n        # build top-down blocks\n        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')\n        self.top_down_blocks = nn.LayerList()\n        for idx in range(len(in_channels) - 1, 0, -1):\n            self.top_down_blocks.append(\n                CSPLayer(\n                    in_channels[idx - 1] * 2,\n                    in_channels[idx - 1],\n                    kernel_size=kernel_size,\n                    num_blocks=num_csp_blocks,\n                    add_identity=False,\n                    use_depthwise=use_depthwise,\n                    act=act))\n\n        # build bottom-up blocks\n        self.downsamples = nn.LayerList()\n        self.bottom_up_blocks = nn.LayerList()\n        for idx in range(len(in_channels) - 1):\n            self.downsamples.append(\n                conv_func(\n                    in_channels[idx],\n                    in_channels[idx],\n                    kernel_size=kernel_size,\n                    stride=2,\n                    act=act))\n            self.bottom_up_blocks.append(\n                CSPLayer(\n                    in_channels[idx] * 2,\n                    in_channels[idx + 1],\n                    kernel_size=kernel_size,\n                    num_blocks=num_csp_blocks,\n                    add_identity=False,\n                    use_depthwise=use_depthwise,\n                    act=act))\n\n    def forward(self, inputs):\n        \"\"\"\n        Args:\n            inputs (tuple[Tensor]): input features.\n\n        Returns:\n            tuple[Tensor]: CSPPAN features.\n        \"\"\"\n        assert len(inputs) == len(self.in_channels)\n        inputs = self.conv_t(inputs)\n\n        # top-down path\n        inner_outs = [inputs[-1]]\n        for idx in range(len(self.in_channels) - 1, 0, -1):\n            feat_high = inner_outs[0]\n            feat_low = inputs[idx - 1]\n\n            upsample_feat = self.upsample(feat_high)\n\n            inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx](\n                paddle.concat([upsample_feat, feat_low], 1))\n            inner_outs.insert(0, inner_out)\n\n        # bottom-up path\n        outs = [inner_outs[0]]\n        for idx in range(len(self.in_channels) - 1):\n            feat_low = outs[-1]\n            feat_height = inner_outs[idx + 1]\n            downsample_feat = self.downsamples[idx](feat_low)\n            out = 
self.bottom_up_blocks[idx](paddle.concat(\n                [downsample_feat, feat_height], 1))\n            outs.append(out)\n\n        top_features = None\n        if self.num_features == 4:\n            top_features = self.first_top_conv(inputs[-1])\n            top_features = top_features + self.second_top_conv(outs[-1])\n            outs.append(top_features)\n\n        return tuple(outs)\n\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(\n                channels=self.out_channels, stride=1. / s)\n            for s in self.spatial_scales\n        ]\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_channels': [i.channels for i in input_shape], }\n"
  },
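`DPModule` above is a standard depthwise-separable block; the point of `use_depthwise=True` throughout CSPPAN is the parameter saving versus a dense conv. A standalone Paddle sketch (channel count and kernel size are illustrative, not taken from any config):

import paddle
import paddle.nn as nn

C = 96
dw = nn.Conv2D(C, C, kernel_size=5, padding=2, groups=C, bias_attr=False)  # depthwise
pw = nn.Conv2D(C, C, kernel_size=1, bias_attr=False)                       # pointwise
x = paddle.randn([1, C, 20, 20])
print(pw(dw(x)).shape)              # [1, 96, 20, 20], same as a dense 5x5 conv

dense_params = C * C * 5 * 5        # 230400 weights for a dense 5x5 conv
sep_params = C * 5 * 5 + C * C      # 11616 weights for depthwise + pointwise
print(dense_params / sep_params)    # roughly 20x fewer weights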
  {
    "path": "ppdet/modeling/necks/custom_pan.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\nimport copy\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register, serializable\nfrom ppdet.modeling.layers import DropBlock, MultiHeadAttention\nfrom ppdet.modeling.ops import get_act_fn\nfrom ..backbones.cspresnet import ConvBNLayer, BasicBlock\nfrom ..shape_spec import ShapeSpec\nfrom ..initializer import linear_init_\n\n__all__ = ['CustomCSPPAN']\n\n\ndef _get_clones(module, N):\n    return nn.LayerList([copy.deepcopy(module) for _ in range(N)])\n\n\nclass SPP(nn.Layer):\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 k,\n                 pool_size,\n                 act='swish',\n                 data_format='NCHW'):\n        super(SPP, self).__init__()\n        self.pool = []\n        self.data_format = data_format\n        for i, size in enumerate(pool_size):\n            pool = self.add_sublayer(\n                'pool{}'.format(i),\n                nn.MaxPool2D(\n                    kernel_size=size,\n                    stride=1,\n                    padding=size // 2,\n                    data_format=data_format,\n                    ceil_mode=False))\n            self.pool.append(pool)\n        self.conv = ConvBNLayer(ch_in, ch_out, k, padding=k // 2, act=act)\n\n    def forward(self, x):\n        outs = [x]\n        for pool in self.pool:\n            outs.append(pool(x))\n        if self.data_format == 'NCHW':\n            y = paddle.concat(outs, axis=1)\n        else:\n            y = paddle.concat(outs, axis=-1)\n\n        y = self.conv(y)\n        return y\n\n\nclass CSPStage(nn.Layer):\n    def __init__(self,\n                 block_fn,\n                 ch_in,\n                 ch_out,\n                 n,\n                 act='swish',\n                 spp=False,\n                 use_alpha=False):\n        super(CSPStage, self).__init__()\n\n        ch_mid = int(ch_out // 2)\n        self.conv1 = ConvBNLayer(ch_in, ch_mid, 1, act=act)\n        self.conv2 = ConvBNLayer(ch_in, ch_mid, 1, act=act)\n        self.convs = nn.Sequential()\n        next_ch_in = ch_mid\n        for i in range(n):\n            self.convs.add_sublayer(\n                str(i),\n                eval(block_fn)(next_ch_in,\n                               ch_mid,\n                               act=act,\n                               shortcut=False,\n                               use_alpha=use_alpha))\n            if i == (n - 1) // 2 and spp:\n                self.convs.add_sublayer(\n                    'spp', SPP(ch_mid * 4, ch_mid, 1, [5, 9, 13], act=act))\n            next_ch_in = ch_mid\n        self.conv3 = ConvBNLayer(ch_mid * 2, ch_out, 1, act=act)\n\n    def forward(self, x):\n        y1 = self.conv1(x)\n        y2 = self.conv2(x)\n        y2 = self.convs(y2)\n        y = paddle.concat([y1, y2], axis=1)\n        y = 
self.conv3(y)\n        return y\n\n\nclass TransformerEncoderLayer(nn.Layer):\n    def __init__(self,\n                 d_model,\n                 nhead,\n                 dim_feedforward=2048,\n                 dropout=0.1,\n                 activation=\"relu\",\n                 attn_dropout=None,\n                 act_dropout=None,\n                 normalize_before=False):\n        super(TransformerEncoderLayer, self).__init__()\n        attn_dropout = dropout if attn_dropout is None else attn_dropout\n        act_dropout = dropout if act_dropout is None else act_dropout\n        self.normalize_before = normalize_before\n\n        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)\n        # Implementation of Feedforward model\n        self.linear1 = nn.Linear(d_model, dim_feedforward)\n        self.dropout = nn.Dropout(act_dropout, mode=\"upscale_in_train\")\n        self.linear2 = nn.Linear(dim_feedforward, d_model)\n\n        self.norm1 = nn.LayerNorm(d_model)\n        self.norm2 = nn.LayerNorm(d_model)\n        self.dropout1 = nn.Dropout(dropout, mode=\"upscale_in_train\")\n        self.dropout2 = nn.Dropout(dropout, mode=\"upscale_in_train\")\n        self.activation = getattr(F, activation)\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        linear_init_(self.linear1)\n        linear_init_(self.linear2)\n\n    @staticmethod\n    def with_pos_embed(tensor, pos_embed):\n        return tensor if pos_embed is None else tensor + pos_embed\n\n    def forward(self, src, src_mask=None, pos_embed=None):\n        residual = src\n        if self.normalize_before:\n            src = self.norm1(src)\n        q = k = self.with_pos_embed(src, pos_embed)\n        src = self.self_attn(q, k, value=src, attn_mask=src_mask)\n\n        src = residual + self.dropout1(src)\n        if not self.normalize_before:\n            src = self.norm1(src)\n\n        residual = src\n        if self.normalize_before:\n            src = self.norm2(src)\n        src = self.linear2(self.dropout(self.activation(self.linear1(src))))\n        src = residual + self.dropout2(src)\n        if not self.normalize_before:\n            src = self.norm2(src)\n        return src\n\n\nclass TransformerEncoder(nn.Layer):\n    def __init__(self, encoder_layer, num_layers, norm=None):\n        super(TransformerEncoder, self).__init__()\n        self.layers = _get_clones(encoder_layer, num_layers)\n        self.num_layers = num_layers\n        self.norm = norm\n\n    def forward(self, src, src_mask=None, pos_embed=None):\n        output = src\n        for layer in self.layers:\n            output = layer(output, src_mask=src_mask, pos_embed=pos_embed)\n\n        if self.norm is not None:\n            output = self.norm(output)\n\n        return output\n\n\n@register\n@serializable\nclass CustomCSPPAN(nn.Layer):\n    __shared__ = [\n        'norm_type', 'data_format', 'width_mult', 'depth_mult', 'trt',\n        'eval_size'\n    ]\n\n    def __init__(self,\n                 in_channels=[256, 512, 1024],\n                 out_channels=[1024, 512, 256],\n                 norm_type='bn',\n                 act='leaky',\n                 stage_fn='CSPStage',\n                 block_fn='BasicBlock',\n                 stage_num=1,\n                 block_num=3,\n                 drop_block=False,\n                 block_size=3,\n                 keep_prob=0.9,\n                 spp=False,\n                 data_format='NCHW',\n                 width_mult=1.0,\n                 depth_mult=1.0,\n        
         use_alpha=False,\n                 trt=False,\n                 dim_feedforward=2048,\n                 dropout=0.1,\n                 activation='gelu',\n                 nhead=4,\n                 num_layers=4,\n                 attn_dropout=None,\n                 act_dropout=None,\n                 normalize_before=False,\n                 use_trans=False,\n                 eval_size=None):\n\n        super(CustomCSPPAN, self).__init__()\n        out_channels = [max(round(c * width_mult), 1) for c in out_channels]\n        block_num = max(round(block_num * depth_mult), 1)\n        act = get_act_fn(\n            act, trt=trt) if act is None or isinstance(act,\n                                                       (str, dict)) else act\n        self.num_blocks = len(in_channels)\n        self.data_format = data_format\n        self._out_channels = out_channels\n\n        self.hidden_dim = in_channels[-1]\n        in_channels = in_channels[::-1]\n\n        self.use_trans = use_trans\n        self.eval_size = eval_size\n        if use_trans:\n            if eval_size is not None:\n                self.pos_embed = self.build_2d_sincos_position_embedding(\n                    eval_size[1] // 32,\n                    eval_size[0] // 32,\n                    embed_dim=self.hidden_dim)\n            else:\n                self.pos_embed = None\n\n            encoder_layer = TransformerEncoderLayer(\n                self.hidden_dim, nhead, dim_feedforward, dropout, activation,\n                attn_dropout, act_dropout, normalize_before)\n            encoder_norm = nn.LayerNorm(\n                self.hidden_dim) if normalize_before else None\n            self.encoder = TransformerEncoder(encoder_layer, num_layers,\n                                              encoder_norm)\n\n        fpn_stages = []\n        fpn_routes = []\n        for i, (ch_in, ch_out) in enumerate(zip(in_channels, out_channels)):\n            if i > 0:\n                ch_in += ch_pre // 2\n\n            stage = nn.Sequential()\n            for j in range(stage_num):\n                stage.add_sublayer(\n                    str(j),\n                    eval(stage_fn)(block_fn,\n                                   ch_in if j == 0 else ch_out,\n                                   ch_out,\n                                   block_num,\n                                   act=act,\n                                   spp=(spp and i == 0),\n                                   use_alpha=use_alpha))\n\n            if drop_block:\n                stage.add_sublayer('drop', DropBlock(block_size, keep_prob))\n\n            fpn_stages.append(stage)\n\n            if i < self.num_blocks - 1:\n                fpn_routes.append(\n                    ConvBNLayer(\n                        ch_in=ch_out,\n                        ch_out=ch_out // 2,\n                        filter_size=1,\n                        stride=1,\n                        padding=0,\n                        act=act))\n\n            ch_pre = ch_out\n\n        self.fpn_stages = nn.LayerList(fpn_stages)\n        self.fpn_routes = nn.LayerList(fpn_routes)\n\n        pan_stages = []\n        pan_routes = []\n        for i in reversed(range(self.num_blocks - 1)):\n            pan_routes.append(\n                ConvBNLayer(\n                    ch_in=out_channels[i + 1],\n                    ch_out=out_channels[i + 1],\n                    filter_size=3,\n                    stride=2,\n                    padding=1,\n                    act=act))\n\n            ch_in = 
out_channels[i] + out_channels[i + 1]\n            ch_out = out_channels[i]\n            stage = nn.Sequential()\n            for j in range(stage_num):\n                stage.add_sublayer(\n                    str(j),\n                    eval(stage_fn)(block_fn,\n                                   ch_in if j == 0 else ch_out,\n                                   ch_out,\n                                   block_num,\n                                   act=act,\n                                   spp=False,\n                                   use_alpha=use_alpha))\n            if drop_block:\n                stage.add_sublayer('drop', DropBlock(block_size, keep_prob))\n\n            pan_stages.append(stage)\n\n        self.pan_stages = nn.LayerList(pan_stages[::-1])\n        self.pan_routes = nn.LayerList(pan_routes[::-1])\n\n    def build_2d_sincos_position_embedding(\n            self,\n            w,\n            h,\n            embed_dim=1024,\n            temperature=10000., ):\n        grid_w = paddle.arange(int(w), dtype=paddle.float32)\n        grid_h = paddle.arange(int(h), dtype=paddle.float32)\n        grid_w, grid_h = paddle.meshgrid(grid_w, grid_h)\n        assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'\n        pos_dim = embed_dim // 4\n        omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim\n        omega = 1. / (temperature**omega)\n\n        out_w = grid_w.flatten()[..., None] @omega[None]\n        out_h = grid_h.flatten()[..., None] @omega[None]\n\n        pos_emb = paddle.concat(\n            [\n                paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),\n                paddle.cos(out_h)\n            ],\n            axis=1)[None, :, :]\n\n        return pos_emb\n\n    def forward(self, blocks, for_mot=False):\n        if self.use_trans:\n            last_feat = blocks[-1]\n            n, c, h, w = last_feat.shape\n\n            # flatten [B, C, H, W] to [B, HxW, C]\n            src_flatten = last_feat.flatten(2).transpose([0, 2, 1])\n            if self.eval_size is not None and not self.training:\n                pos_embed = self.pos_embed\n            else:\n                pos_embed = self.build_2d_sincos_position_embedding(\n                    w=w, h=h, embed_dim=self.hidden_dim)\n\n            memory = self.encoder(src_flatten, pos_embed=pos_embed)\n            last_feat_encode = memory.transpose([0, 2, 1]).reshape([n, c, h, w])\n            blocks[-1] = last_feat_encode\n\n        blocks = blocks[::-1]\n        fpn_feats = []\n\n        for i, block in enumerate(blocks):\n            if i > 0:\n                block = paddle.concat([route, block], axis=1)\n            route = self.fpn_stages[i](block)\n            fpn_feats.append(route)\n\n            if i < self.num_blocks - 1:\n                route = self.fpn_routes[i](route)\n                route = F.interpolate(\n                    route, scale_factor=2., data_format=self.data_format)\n\n        pan_feats = [fpn_feats[-1], ]\n        route = fpn_feats[-1]\n        for i in reversed(range(self.num_blocks - 1)):\n            block = fpn_feats[i]\n            route = self.pan_routes[i](route)\n            block = paddle.concat([route, block], axis=1)\n            route = self.pan_stages[i](block)\n            pan_feats.append(route)\n\n        return pan_feats[::-1]\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_channels': [i.channels for i in input_shape], }\n\n    @property\n    def 
out_shape(self):\n        return [ShapeSpec(channels=c) for c in self._out_channels]\n"
  },
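`build_2d_sincos_position_embedding` above follows the usual 2D sin-cos recipe: `embed_dim // 4` frequencies, with sin/cos of the x coordinate filling the first half of the embedding and sin/cos of the y coordinate the second. A NumPy re-derivation to make the layout concrete (grid size and embed_dim are arbitrary example values):

import numpy as np

def sincos_pos_embed_2d(w, h, embed_dim=64, temperature=10000.0):
    assert embed_dim % 4 == 0
    pos_dim = embed_dim // 4
    omega = 1.0 / temperature ** (np.arange(pos_dim) / pos_dim)
    # 'ij' indexing matches paddle.meshgrid's default layout
    grid_w, grid_h = np.meshgrid(np.arange(w), np.arange(h), indexing='ij')
    out_w = grid_w.reshape(-1, 1) * omega[None]   # (w*h, pos_dim)
    out_h = grid_h.reshape(-1, 1) * omega[None]
    return np.concatenate(
        [np.sin(out_w), np.cos(out_w), np.sin(out_h), np.cos(out_h)], axis=1)

print(sincos_pos_embed_2d(4, 3).shape)   # (12, 64): one embedding per grid cell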
  {
    "path": "ppdet/modeling/necks/dilated_encoder.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\nfrom paddle.nn.initializer import KaimingUniform, Constant, Normal\nfrom ppdet.core.workspace import register, serializable\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['DilatedEncoder']\n\n\nclass Bottleneck(nn.Layer):\n    def __init__(self, in_channels, mid_channels, dilation):\n        super(Bottleneck, self).__init__()\n        self.conv1 = nn.Sequential(* [\n            nn.Conv2D(\n                in_channels,\n                mid_channels,\n                1,\n                padding=0,\n                weight_attr=ParamAttr(initializer=Normal(\n                    mean=0, std=0.01)),\n                bias_attr=ParamAttr(initializer=Constant(0.0))),\n            nn.BatchNorm2D(\n                mid_channels,\n                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                bias_attr=ParamAttr(regularizer=L2Decay(0.0))),\n            nn.ReLU(),\n        ])\n        self.conv2 = nn.Sequential(* [\n            nn.Conv2D(\n                mid_channels,\n                mid_channels,\n                3,\n                padding=dilation,\n                dilation=dilation,\n                weight_attr=ParamAttr(initializer=Normal(\n                    mean=0, std=0.01)),\n                bias_attr=ParamAttr(initializer=Constant(0.0))),\n            nn.BatchNorm2D(\n                mid_channels,\n                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                bias_attr=ParamAttr(regularizer=L2Decay(0.0))),\n            nn.ReLU(),\n        ])\n        self.conv3 = nn.Sequential(* [\n            nn.Conv2D(\n                mid_channels,\n                in_channels,\n                1,\n                padding=0,\n                weight_attr=ParamAttr(initializer=Normal(\n                    mean=0, std=0.01)),\n                bias_attr=ParamAttr(initializer=Constant(0.0))),\n            nn.BatchNorm2D(\n                in_channels,\n                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                bias_attr=ParamAttr(regularizer=L2Decay(0.0))),\n            nn.ReLU(),\n        ])\n\n    def forward(self, x):\n        identity = x\n        y = self.conv3(self.conv2(self.conv1(x)))\n        return y + identity\n\n\n@register\nclass DilatedEncoder(nn.Layer):\n    \"\"\"\n    DilatedEncoder used in YOLOF\n    \"\"\"\n\n    def __init__(self,\n                 in_channels=[2048],\n                 out_channels=[512],\n                 block_mid_channels=128,\n                 num_residual_blocks=4,\n                 block_dilations=[2, 4, 6, 8]):\n        super(DilatedEncoder, self).__init__()\n        self.in_channels = in_channels\n        self.out_channels = out_channels\n        assert len(self.in_channels) == 1, \"YOLOF only has one level feature.\"\n        assert len(self.out_channels) == 1, 
\"YOLOF only has one level feature.\"\n\n        self.block_mid_channels = block_mid_channels\n        self.num_residual_blocks = num_residual_blocks\n        self.block_dilations = block_dilations\n\n        out_ch = self.out_channels[0]\n        self.lateral_conv = nn.Conv2D(\n            self.in_channels[0],\n            out_ch,\n            1,\n            weight_attr=ParamAttr(initializer=KaimingUniform(\n                negative_slope=1, nonlinearity='leaky_relu')),\n            bias_attr=ParamAttr(initializer=Constant(value=0.0)))\n        self.lateral_norm = nn.BatchNorm2D(\n            out_ch,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n\n        self.fpn_conv = nn.Conv2D(\n            out_ch,\n            out_ch,\n            3,\n            padding=1,\n            weight_attr=ParamAttr(initializer=KaimingUniform(\n                negative_slope=1, nonlinearity='leaky_relu')))\n        self.fpn_norm = nn.BatchNorm2D(\n            out_ch,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n\n        encoder_blocks = []\n        for i in range(self.num_residual_blocks):\n            encoder_blocks.append(\n                Bottleneck(\n                    out_ch,\n                    self.block_mid_channels,\n                    dilation=block_dilations[i]))\n        self.dilated_encoder_blocks = nn.Sequential(*encoder_blocks)\n\n    def forward(self, inputs, for_mot=False):\n        out = self.lateral_norm(self.lateral_conv(inputs[0]))\n        out = self.fpn_norm(self.fpn_conv(out))\n        out = self.dilated_encoder_blocks(out)\n        return [out]\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_channels': [i.channels for i in input_shape], }\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=c) for c in self.out_channels]\n"
  },
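  {
    "path": "ppdet/modeling/necks/examples/dilated_encoder_usage.py",
    "content": "# Editorial usage sketch -- not part of the original PaddlePaddle sources.\n# A minimal smoke test for the DilatedEncoder above, assuming ppdet is installed\n# and the module is importable as ppdet.modeling.necks.dilated_encoder; the\n# tensor shapes are illustrative placeholders for a ResNet C5 feature map.\nimport paddle\n\nfrom ppdet.modeling.necks.dilated_encoder import DilatedEncoder\n\nif __name__ == '__main__':\n    encoder = DilatedEncoder(\n        in_channels=[2048],\n        out_channels=[512],\n        block_mid_channels=128,\n        num_residual_blocks=4,\n        block_dilations=[2, 4, 6, 8])\n    # YOLOF consumes a single C5-level feature, hence the one-element list.\n    c5 = paddle.randn([1, 2048, 20, 20])\n    outs = encoder([c5])\n    # Channels are projected 2048 -> 512; the spatial size is unchanged.\n    print(outs[0].shape)  # [1, 512, 20, 20]\n"
  },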
  {
    "path": "ppdet/modeling/necks/es_pan.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\nfrom ppdet.core.workspace import register, serializable\n\nfrom ..shape_spec import ShapeSpec\nfrom ..backbones.esnet import SEModule\nfrom .csp_pan import ConvBNLayer, Channel_T, DPModule\n\n__all__ = ['ESPAN']\n\n\nclass ES_Block(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 mid_channels,\n                 out_channels,\n                 kernel_size=5,\n                 stride=1,\n                 act='leaky_relu'):\n        super(ES_Block, self).__init__()\n        self._residual = ConvBNLayer(\n            in_channel=in_channels,\n            out_channel=out_channels,\n            kernel_size=1,\n            stride=1,\n            groups=1,\n            act=act)\n        self._conv_pw = ConvBNLayer(\n            in_channel=in_channels,\n            out_channel=mid_channels // 2,\n            kernel_size=1,\n            stride=1,\n            groups=1,\n            act=act)\n        self._conv_dw = ConvBNLayer(\n            in_channel=mid_channels // 2,\n            out_channel=mid_channels // 2,\n            kernel_size=kernel_size,\n            stride=stride,\n            groups=mid_channels // 2,\n            act=None)\n        self._se = SEModule(mid_channels)\n\n        self._conv_linear = ConvBNLayer(\n            in_channel=mid_channels,\n            out_channel=out_channels,\n            kernel_size=1,\n            stride=1,\n            groups=1,\n            act=act)\n\n        self._out_conv = ConvBNLayer(\n            in_channel=out_channels * 2,\n            out_channel=out_channels,\n            kernel_size=1,\n            stride=1,\n            groups=1,\n            act=act)\n\n    def forward(self, inputs):\n        x1 = self._residual(inputs)\n        x2 = self._conv_pw(inputs)\n        x3 = self._conv_dw(x2)\n        x3 = paddle.concat([x2, x3], axis=1)\n        x3 = self._se(x3)\n        x3 = self._conv_linear(x3)\n        out = paddle.concat([x1, x3], axis=1)\n        out = self._out_conv(out)\n        return out\n\n\n@register\n@serializable\nclass ESPAN(nn.Layer):\n    \"\"\"Path Aggregation Network with ES module.\n\n    Args:\n        in_channels (List[int]): Number of input channels per scale.\n        out_channels (int): Number of output channels (used at each scale)\n        kernel_size (int): The conv2d kernel size of this Module.\n        num_features (int): Number of output features of CSPPAN module.\n        num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1\n        use_depthwise (bool): Whether to depthwise separable convolution in\n            blocks. 
Default: True\n    \"\"\"\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size=5,\n                 num_features=3,\n                 use_depthwise=True,\n                 act='hard_swish',\n                 spatial_scales=[0.125, 0.0625, 0.03125]):\n        super(ESPAN, self).__init__()\n        self.conv_t = Channel_T(in_channels, out_channels, act=act)\n        in_channels = [out_channels] * len(spatial_scales)\n        self.in_channels = in_channels\n        self.out_channels = out_channels\n        self.spatial_scales = spatial_scales\n        self.num_features = num_features\n        conv_func = DPModule if use_depthwise else ConvBNLayer\n\n        if self.num_features == 4:\n            self.first_top_conv = conv_func(\n                in_channels[0], in_channels[0], kernel_size, stride=2, act=act)\n            self.second_top_conv = conv_func(\n                in_channels[0], in_channels[0], kernel_size, stride=2, act=act)\n            self.spatial_scales.append(self.spatial_scales[-1] / 2)\n\n        # build top-down blocks\n        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')\n        self.top_down_blocks = nn.LayerList()\n        for idx in range(len(in_channels) - 1, 0, -1):\n            self.top_down_blocks.append(\n                ES_Block(\n                    in_channels[idx - 1] * 2,\n                    in_channels[idx - 1],\n                    in_channels[idx - 1],\n                    kernel_size=kernel_size,\n                    stride=1,\n                    act=act))\n\n        # build bottom-up blocks\n        self.downsamples = nn.LayerList()\n        self.bottom_up_blocks = nn.LayerList()\n        for idx in range(len(in_channels) - 1):\n            self.downsamples.append(\n                conv_func(\n                    in_channels[idx],\n                    in_channels[idx],\n                    kernel_size=kernel_size,\n                    stride=2,\n                    act=act))\n            self.bottom_up_blocks.append(\n                ES_Block(\n                    in_channels[idx] * 2,\n                    in_channels[idx + 1],\n                    in_channels[idx + 1],\n                    kernel_size=kernel_size,\n                    stride=1,\n                    act=act))\n\n    def forward(self, inputs):\n        \"\"\"\n        Args:\n            inputs (tuple[Tensor]): input features.\n\n        Returns:\n            tuple[Tensor]: CSPPAN features.\n        \"\"\"\n        assert len(inputs) == len(self.in_channels)\n        inputs = self.conv_t(inputs)\n\n        # top-down path\n        inner_outs = [inputs[-1]]\n        for idx in range(len(self.in_channels) - 1, 0, -1):\n            feat_heigh = inner_outs[0]\n            feat_low = inputs[idx - 1]\n\n            upsample_feat = self.upsample(feat_heigh)\n\n            inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx](\n                paddle.concat([upsample_feat, feat_low], 1))\n            inner_outs.insert(0, inner_out)\n\n        # bottom-up path\n        outs = [inner_outs[0]]\n        for idx in range(len(self.in_channels) - 1):\n            feat_low = outs[-1]\n            feat_height = inner_outs[idx + 1]\n            downsample_feat = self.downsamples[idx](feat_low)\n            out = self.bottom_up_blocks[idx](paddle.concat(\n                [downsample_feat, feat_height], 1))\n            outs.append(out)\n\n        top_features = None\n        if self.num_features == 4:\n            
top_features = self.first_top_conv(inputs[-1])\n            top_features = top_features + self.second_top_conv(outs[-1])\n            outs.append(top_features)\n\n        return tuple(outs)\n\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(\n                channels=self.out_channels, stride=1. / s)\n            for s in self.spatial_scales\n        ]\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_channels': [i.channels for i in input_shape], }\n"
  },
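  {
    "path": "ppdet/modeling/necks/examples/es_pan_usage.py",
    "content": "# Editorial usage sketch -- not part of the original PaddlePaddle sources.\n# Shows how the ESPAN neck above fuses three backbone levels into a fixed-width\n# pyramid; the channel counts and spatial sizes below are illustrative and not\n# tied to a particular backbone.\nimport paddle\n\nfrom ppdet.modeling.necks.es_pan import ESPAN\n\nif __name__ == '__main__':\n    neck = ESPAN(in_channels=[116, 232, 464], out_channels=96, num_features=3)\n    feats = [\n        paddle.randn([1, 116, 40, 40]),  # stride 8\n        paddle.randn([1, 232, 20, 20]),  # stride 16\n        paddle.randn([1, 464, 10, 10]),  # stride 32\n    ]\n    outs = neck(feats)\n    # Every output level carries out_channels=96 channels.\n    for o in outs:\n        print(o.shape)  # [1, 96, 40, 40], [1, 96, 20, 20], [1, 96, 10, 10]\n"
  },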
  {
    "path": "ppdet/modeling/necks/fpn.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import XavierUniform\n\nfrom ppdet.core.workspace import register, serializable\nfrom ppdet.modeling.layers import ConvNormLayer\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['FPN']\n\n\n@register\n@serializable\nclass FPN(nn.Layer):\n    \"\"\"\n    Feature Pyramid Network, see https://arxiv.org/abs/1612.03144\n\n    Args:\n        in_channels (list[int]): input channels of each level which can be \n            derived from the output shape of backbone by from_config\n        out_channel (int): output channel of each level\n        spatial_scales (list[float]): the spatial scales between input feature\n            maps and original input image which can be derived from the output \n            shape of backbone by from_config\n        has_extra_convs (bool): whether to add extra conv to the last level.\n            default False\n        extra_stage (int): the number of extra stages added to the last level.\n            default 1\n        use_c5 (bool): Whether to use c5 as the input of extra stage, \n            otherwise p5 is used. default True\n        norm_type (string|None): The normalization type in FPN module. If \n            norm_type is None, norm will not be used after conv and if \n            norm_type is string, bn, gn, sync_bn are available. default None\n        norm_decay (float): weight decay for normalization layer weights.\n            default 0.\n        freeze_norm (bool): whether to freeze normalization layer.  
\n            default False\n        relu_before_extra_convs (bool): whether to add relu before extra convs.\n            default True\n        \n    \"\"\"\n\n    def __init__(self,\n                 in_channels,\n                 out_channel,\n                 spatial_scales=[0.25, 0.125, 0.0625, 0.03125],\n                 has_extra_convs=False,\n                 extra_stage=1,\n                 use_c5=True,\n                 norm_type=None,\n                 norm_decay=0.,\n                 freeze_norm=False,\n                 relu_before_extra_convs=True):\n        super(FPN, self).__init__()\n        self.out_channel = out_channel\n        for s in range(extra_stage):\n            spatial_scales = spatial_scales + [spatial_scales[-1] / 2.]\n        self.spatial_scales = spatial_scales\n        self.has_extra_convs = has_extra_convs\n        self.extra_stage = extra_stage\n        self.use_c5 = use_c5\n        self.relu_before_extra_convs = relu_before_extra_convs\n        self.norm_type = norm_type\n        self.norm_decay = norm_decay\n        self.freeze_norm = freeze_norm\n\n        self.lateral_convs = []\n        self.fpn_convs = []\n        fan = out_channel * 3 * 3\n\n        # stage index 0,1,2,3 stands for res2,res3,res4,res5 on ResNet Backbone\n        # 0 <= st_stage < ed_stage <= 3\n        st_stage = 4 - len(in_channels)\n        ed_stage = st_stage + len(in_channels) - 1\n        for i in range(st_stage, ed_stage + 1):\n            if i == 3:\n                lateral_name = 'fpn_inner_res5_sum'\n            else:\n                lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2)\n            in_c = in_channels[i - st_stage]\n            if self.norm_type is not None:\n                lateral = self.add_sublayer(\n                    lateral_name,\n                    ConvNormLayer(\n                        ch_in=in_c,\n                        ch_out=out_channel,\n                        filter_size=1,\n                        stride=1,\n                        norm_type=self.norm_type,\n                        norm_decay=self.norm_decay,\n                        freeze_norm=self.freeze_norm,\n                        initializer=XavierUniform(fan_out=in_c)))\n            else:\n                lateral = self.add_sublayer(\n                    lateral_name,\n                    nn.Conv2D(\n                        in_channels=in_c,\n                        out_channels=out_channel,\n                        kernel_size=1,\n                        weight_attr=ParamAttr(\n                            initializer=XavierUniform(fan_out=in_c))))\n            self.lateral_convs.append(lateral)\n\n            fpn_name = 'fpn_res{}_sum'.format(i + 2)\n            if self.norm_type is not None:\n                fpn_conv = self.add_sublayer(\n                    fpn_name,\n                    ConvNormLayer(\n                        ch_in=out_channel,\n                        ch_out=out_channel,\n                        filter_size=3,\n                        stride=1,\n                        norm_type=self.norm_type,\n                        norm_decay=self.norm_decay,\n                        freeze_norm=self.freeze_norm,\n                        initializer=XavierUniform(fan_out=fan)))\n            else:\n                fpn_conv = self.add_sublayer(\n                    fpn_name,\n                    nn.Conv2D(\n                        in_channels=out_channel,\n                        out_channels=out_channel,\n                        kernel_size=3,\n                     
   padding=1,\n                        weight_attr=ParamAttr(\n                            initializer=XavierUniform(fan_out=fan))))\n            self.fpn_convs.append(fpn_conv)\n\n        # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5)\n        if self.has_extra_convs:\n            for i in range(self.extra_stage):\n                lvl = ed_stage + 1 + i\n                if i == 0 and self.use_c5:\n                    in_c = in_channels[-1]\n                else:\n                    in_c = out_channel\n                extra_fpn_name = 'fpn_{}'.format(lvl + 2)\n                if self.norm_type is not None:\n                    extra_fpn_conv = self.add_sublayer(\n                        extra_fpn_name,\n                        ConvNormLayer(\n                            ch_in=in_c,\n                            ch_out=out_channel,\n                            filter_size=3,\n                            stride=2,\n                            norm_type=self.norm_type,\n                            norm_decay=self.norm_decay,\n                            freeze_norm=self.freeze_norm,\n                            initializer=XavierUniform(fan_out=fan)))\n                else:\n                    extra_fpn_conv = self.add_sublayer(\n                        extra_fpn_name,\n                        nn.Conv2D(\n                            in_channels=in_c,\n                            out_channels=out_channel,\n                            kernel_size=3,\n                            stride=2,\n                            padding=1,\n                            weight_attr=ParamAttr(\n                                initializer=XavierUniform(fan_out=fan))))\n                self.fpn_convs.append(extra_fpn_conv)\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {\n            'in_channels': [i.channels for i in input_shape],\n            'spatial_scales': [1.0 / i.stride for i in input_shape],\n        }\n\n    def forward(self, body_feats):\n        laterals = []\n        num_levels = len(body_feats)\n        for i in range(num_levels):\n            laterals.append(self.lateral_convs[i](body_feats[i]))\n\n        for i in range(1, num_levels):\n            lvl = num_levels - i\n            upsample = F.interpolate(\n                laterals[lvl],\n                scale_factor=2.,\n                mode='nearest', )\n            laterals[lvl - 1] += upsample\n\n        fpn_output = []\n        for lvl in range(num_levels):\n            fpn_output.append(self.fpn_convs[lvl](laterals[lvl]))\n\n        if self.extra_stage > 0:\n            # use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN)\n            if not self.has_extra_convs:\n                assert self.extra_stage == 1, 'extra_stage should be 1 if FPN has no extra convs'\n                fpn_output.append(F.max_pool2d(fpn_output[-1], 1, stride=2))\n            # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5)\n            else:\n                if self.use_c5:\n                    extra_source = body_feats[-1]\n                else:\n                    extra_source = fpn_output[-1]\n                fpn_output.append(self.fpn_convs[num_levels](extra_source))\n\n                for i in range(1, self.extra_stage):\n                    if self.relu_before_extra_convs:\n                        fpn_output.append(self.fpn_convs[num_levels + i](F.relu(\n                            fpn_output[-1])))\n                    else:\n                        
fpn_output.append(self.fpn_convs[num_levels + i](\n                            fpn_output[-1]))\n        return fpn_output\n\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(\n                channels=self.out_channel, stride=1. / s)\n            for s in self.spatial_scales\n        ]\n"
  },
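  {
    "path": "ppdet/modeling/necks/examples/fpn_usage.py",
    "content": "# Editorial usage sketch -- not part of the original PaddlePaddle sources.\n# Demonstrates the default FPN configuration above: four ResNet stages in,\n# extra_stage=1 without extra convs, so the P6 level comes from max-pooling P5.\n# Channel and shape values are illustrative.\nimport paddle\n\nfrom ppdet.modeling.necks.fpn import FPN\n\nif __name__ == '__main__':\n    fpn = FPN(in_channels=[256, 512, 1024, 2048], out_channel=256)\n    body_feats = [\n        paddle.randn([1, 256, 160, 160]),  # res2, stride 4\n        paddle.randn([1, 512, 80, 80]),    # res3, stride 8\n        paddle.randn([1, 1024, 40, 40]),   # res4, stride 16\n        paddle.randn([1, 2048, 20, 20]),   # res5, stride 32\n    ]\n    outs = fpn(body_feats)\n    # Five levels (P2..P6), all with 256 channels; P6 is [1, 256, 10, 10].\n    for o in outs:\n        print(o.shape)\n"
  },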
  {
    "path": "ppdet/modeling/necks/hrfpn.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn.functional as F\nimport paddle.nn as nn\nfrom ppdet.core.workspace import register\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['HRFPN']\n\n\n@register\nclass HRFPN(nn.Layer):\n    \"\"\"\n    Args:\n        in_channels (list): number of input feature channels from backbone\n        out_channel (int): number of output feature channels\n        share_conv (bool): whether to share conv for different layers' reduction\n        extra_stage (int): add extra stage for returning HRFPN fpn_feats\n        spatial_scales (list): feature map scaling factor\n    \"\"\"\n\n    def __init__(self,\n                 in_channels=[18, 36, 72, 144],\n                 out_channel=256,\n                 share_conv=False,\n                 extra_stage=1,\n                 spatial_scales=[1. / 4, 1. / 8, 1. / 16, 1. / 32],\n                 use_bias=False):\n        super(HRFPN, self).__init__()\n        in_channel = sum(in_channels)\n        self.in_channel = in_channel\n        self.out_channel = out_channel\n        self.share_conv = share_conv\n        for i in range(extra_stage):\n            spatial_scales = spatial_scales + [spatial_scales[-1] / 2.]\n        self.spatial_scales = spatial_scales\n        self.num_out = len(self.spatial_scales)\n        self.use_bias = use_bias\n        bias_attr = False if use_bias is False else None\n\n        self.reduction = nn.Conv2D(\n            in_channels=in_channel,\n            out_channels=out_channel,\n            kernel_size=1,\n            bias_attr=bias_attr)\n\n        if share_conv:\n            self.fpn_conv = nn.Conv2D(\n                in_channels=out_channel,\n                out_channels=out_channel,\n                kernel_size=3,\n                padding=1,\n                bias_attr=bias_attr)\n        else:\n            self.fpn_conv = []\n            for i in range(self.num_out):\n                conv_name = \"fpn_conv_\" + str(i)\n                conv = self.add_sublayer(\n                    conv_name,\n                    nn.Conv2D(\n                        in_channels=out_channel,\n                        out_channels=out_channel,\n                        kernel_size=3,\n                        padding=1,\n                        bias_attr=bias_attr))\n                self.fpn_conv.append(conv)\n\n    def forward(self, body_feats):\n        num_backbone_stages = len(body_feats)\n\n        outs = []\n        outs.append(body_feats[0])\n\n        # resize\n        for i in range(1, num_backbone_stages):\n            resized = F.interpolate(\n                body_feats[i], scale_factor=2**i, mode='bilinear')\n            outs.append(resized)\n\n        # concat\n        out = paddle.concat(outs, axis=1)\n        assert out.shape[\n            1] == self.in_channel, 'in_channel should be {}, be received {}'.format(\n                out.shape[1], self.in_channel)\n\n        # 
reduction\n        out = self.reduction(out)\n\n        # conv\n        outs = [out]\n        for i in range(1, self.num_out):\n            outs.append(F.avg_pool2d(out, kernel_size=2**i, stride=2**i))\n        outputs = []\n\n        for i in range(self.num_out):\n            conv_func = self.fpn_conv if self.share_conv else self.fpn_conv[i]\n            conv = conv_func(outs[i])\n            outputs.append(conv)\n\n        fpn_feats = [outputs[k] for k in range(self.num_out)]\n        return fpn_feats\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {\n            'in_channels': [i.channels for i in input_shape],\n            'spatial_scales': [1.0 / i.stride for i in input_shape],\n        }\n\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(\n                channels=self.out_channel, stride=1. / s)\n            for s in self.spatial_scales\n        ]\n"
  },
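  {
    "path": "ppdet/modeling/necks/examples/hrfpn_usage.py",
    "content": "# Editorial usage sketch -- not part of the original PaddlePaddle sources.\n# The HRFPN above first upsamples all HRNet branches to the finest resolution\n# and concatenates them (18+36+72+144 = 270 channels here), then rebuilds the\n# pyramid by average pooling. Shapes are illustrative.\nimport paddle\n\nfrom ppdet.modeling.necks.hrfpn import HRFPN\n\nif __name__ == '__main__':\n    neck = HRFPN(in_channels=[18, 36, 72, 144], out_channel=256)\n    feats = [\n        paddle.randn([1, 18, 80, 80]),\n        paddle.randn([1, 36, 40, 40]),\n        paddle.randn([1, 72, 20, 20]),\n        paddle.randn([1, 144, 10, 10]),\n    ]\n    outs = neck(feats)\n    # extra_stage=1 adds a fifth level: sides of 80, 40, 20, 10 and 5 pixels.\n    for o in outs:\n        print(o.shape)  # all levels have 256 channels\n"
  },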
  {
    "path": "ppdet/modeling/necks/lc_pan.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\nfrom ppdet.core.workspace import register, serializable\n\nfrom ..shape_spec import ShapeSpec\nfrom ..backbones.lcnet import DepthwiseSeparable\nfrom .csp_pan import ConvBNLayer, Channel_T, DPModule\n\n__all__ = ['LCPAN']\n\n\n@register\n@serializable\nclass LCPAN(nn.Layer):\n    \"\"\"Path Aggregation Network with LCNet module.\n    Args:\n        in_channels (List[int]): Number of input channels per scale.\n        out_channels (int): Number of output channels (used at each scale)\n        kernel_size (int): The conv2d kernel size of this Module.\n        num_features (int): Number of output features of CSPPAN module.\n        num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1\n        use_depthwise (bool): Whether to depthwise separable convolution in\n            blocks. Default: True\n    \"\"\"\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size=5,\n                 num_features=3,\n                 use_depthwise=True,\n                 act='hard_swish',\n                 spatial_scales=[0.125, 0.0625, 0.03125]):\n        super(LCPAN, self).__init__()\n        self.conv_t = Channel_T(in_channels, out_channels, act=act)\n        in_channels = [out_channels] * len(spatial_scales)\n        self.in_channels = in_channels\n        self.out_channels = out_channels\n        self.spatial_scales = spatial_scales\n        self.num_features = num_features\n        conv_func = DPModule if use_depthwise else ConvBNLayer\n\n        NET_CONFIG = {\n            #k, in_c, out_c, stride, use_se\n            \"block1\": [\n                [kernel_size, out_channels * 2, out_channels * 2, 1, False],\n                [kernel_size, out_channels * 2, out_channels, 1, False],\n            ],\n            \"block2\": [\n                [kernel_size, out_channels * 2, out_channels * 2, 1, False],\n                [kernel_size, out_channels * 2, out_channels, 1, False],\n            ]\n        }\n\n        if self.num_features == 4:\n            self.first_top_conv = conv_func(\n                in_channels[0], in_channels[0], kernel_size, stride=2, act=act)\n            self.second_top_conv = conv_func(\n                in_channels[0], in_channels[0], kernel_size, stride=2, act=act)\n            self.spatial_scales.append(self.spatial_scales[-1] / 2)\n\n        # build top-down blocks\n        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')\n        self.top_down_blocks = nn.LayerList()\n        for idx in range(len(in_channels) - 1, 0, -1):\n            self.top_down_blocks.append(\n                nn.Sequential(* [\n                    DepthwiseSeparable(\n                        num_channels=in_c,\n                        
num_filters=out_c,\n                        dw_size=k,\n                        stride=s,\n                        use_se=se)\n                    for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG[\n                        \"block1\"])\n                ]))\n\n        # build bottom-up blocks\n        self.downsamples = nn.LayerList()\n        self.bottom_up_blocks = nn.LayerList()\n        for idx in range(len(in_channels) - 1):\n            self.downsamples.append(\n                conv_func(\n                    in_channels[idx],\n                    in_channels[idx],\n                    kernel_size=kernel_size,\n                    stride=2,\n                    act=act))\n            self.bottom_up_blocks.append(\n                nn.Sequential(* [\n                    DepthwiseSeparable(\n                        num_channels=in_c,\n                        num_filters=out_c,\n                        dw_size=k,\n                        stride=s,\n                        use_se=se)\n                    for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG[\n                        \"block2\"])\n                ]))\n\n    def forward(self, inputs):\n        \"\"\"\n        Args:\n            inputs (tuple[Tensor]): input features.\n        Returns:\n            tuple[Tensor]: LCPAN features.\n        \"\"\"\n        assert len(inputs) == len(self.in_channels)\n        inputs = self.conv_t(inputs)\n\n        # top-down path\n        inner_outs = [inputs[-1]]\n        for idx in range(len(self.in_channels) - 1, 0, -1):\n            feat_high = inner_outs[0]\n            feat_low = inputs[idx - 1]\n\n            upsample_feat = self.upsample(feat_high)\n\n            inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx](\n                paddle.concat([upsample_feat, feat_low], 1))\n            inner_outs.insert(0, inner_out)\n\n        # bottom-up path\n        outs = [inner_outs[0]]\n        for idx in range(len(self.in_channels) - 1):\n            feat_low = outs[-1]\n            feat_high = inner_outs[idx + 1]\n            downsample_feat = self.downsamples[idx](feat_low)\n            out = self.bottom_up_blocks[idx](paddle.concat(\n                [downsample_feat, feat_high], 1))\n            outs.append(out)\n\n        top_features = None\n        if self.num_features == 4:\n            top_features = self.first_top_conv(inputs[-1])\n            top_features = top_features + self.second_top_conv(outs[-1])\n            outs.append(top_features)\n\n        return tuple(outs)\n\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(\n                channels=self.out_channels, stride=1. / s)\n            for s in self.spatial_scales\n        ]\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_channels': [i.channels for i in input_shape], }\n"
  },
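  {
    "path": "ppdet/modeling/necks/examples/lc_pan_usage.py",
    "content": "# Editorial usage sketch -- not part of the original PaddlePaddle sources.\n# The LCPAN above has the same interface as ESPAN but fuses features with\n# LCNet DepthwiseSeparable blocks; the values below are illustrative.\nimport paddle\n\nfrom ppdet.modeling.necks.lc_pan import LCPAN\n\nif __name__ == '__main__':\n    neck = LCPAN(in_channels=[128, 256, 512], out_channels=128, num_features=3)\n    feats = [\n        paddle.randn([1, 128, 40, 40]),  # stride 8\n        paddle.randn([1, 256, 20, 20]),  # stride 16\n        paddle.randn([1, 512, 10, 10]),  # stride 32\n    ]\n    outs = neck(feats)\n    for o in outs:\n        print(o.shape)  # [1, 128, 40, 40], [1, 128, 20, 20], [1, 128, 10, 10]\n"
  },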
  {
    "path": "ppdet/modeling/necks/ttf_fpn.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.nn.initializer import Constant, Uniform, Normal, XavierUniform\nfrom ppdet.core.workspace import register, serializable\nfrom paddle.regularizer import L2Decay\nfrom ppdet.modeling.layers import DeformableConvV2, ConvNormLayer, LiteConv\nimport math\nfrom ppdet.modeling.ops import batch_norm\nfrom ..shape_spec import ShapeSpec\n\n__all__ = ['TTFFPN']\n\n\nclass Upsample(nn.Layer):\n    def __init__(self, ch_in, ch_out, norm_type='bn'):\n        super(Upsample, self).__init__()\n        fan_in = ch_in * 3 * 3\n        stdv = 1. / math.sqrt(fan_in)\n        self.dcn = DeformableConvV2(\n            ch_in,\n            ch_out,\n            kernel_size=3,\n            weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)),\n            bias_attr=ParamAttr(\n                initializer=Constant(0),\n                regularizer=L2Decay(0.),\n                learning_rate=2.),\n            lr_scale=2.,\n            regularizer=L2Decay(0.))\n\n        self.bn = batch_norm(\n            ch_out, norm_type=norm_type, initializer=Constant(1.))\n\n    def forward(self, feat):\n        dcn = self.dcn(feat)\n        bn = self.bn(dcn)\n        relu = F.relu(bn)\n        out = F.interpolate(relu, scale_factor=2., mode='bilinear')\n        return out\n\n\nclass DeConv(nn.Layer):\n    def __init__(self, ch_in, ch_out, norm_type='bn'):\n        super(DeConv, self).__init__()\n        self.deconv = nn.Sequential()\n        conv1 = ConvNormLayer(\n            ch_in=ch_in,\n            ch_out=ch_out,\n            stride=1,\n            filter_size=1,\n            norm_type=norm_type,\n            initializer=XavierUniform())\n        conv2 = nn.Conv2DTranspose(\n            in_channels=ch_out,\n            out_channels=ch_out,\n            kernel_size=4,\n            padding=1,\n            stride=2,\n            groups=ch_out,\n            weight_attr=ParamAttr(initializer=XavierUniform()),\n            bias_attr=False)\n        bn = batch_norm(ch_out, norm_type=norm_type, norm_decay=0.)\n        conv3 = ConvNormLayer(\n            ch_in=ch_out,\n            ch_out=ch_out,\n            stride=1,\n            filter_size=1,\n            norm_type=norm_type,\n            initializer=XavierUniform())\n\n        self.deconv.add_sublayer('conv1', conv1)\n        self.deconv.add_sublayer('relu6_1', nn.ReLU6())\n        self.deconv.add_sublayer('conv2', conv2)\n        self.deconv.add_sublayer('bn', bn)\n        self.deconv.add_sublayer('relu6_2', nn.ReLU6())\n        self.deconv.add_sublayer('conv3', conv3)\n        self.deconv.add_sublayer('relu6_3', nn.ReLU6())\n\n    def forward(self, inputs):\n        return self.deconv(inputs)\n\n\nclass LiteUpsample(nn.Layer):\n    def __init__(self, ch_in, ch_out, norm_type='bn'):\n        super(LiteUpsample, self).__init__()\n    
    self.deconv = DeConv(ch_in, ch_out, norm_type=norm_type)\n        self.conv = LiteConv(ch_in, ch_out, norm_type=norm_type)\n\n    def forward(self, inputs):\n        deconv_up = self.deconv(inputs)\n        conv = self.conv(inputs)\n        interp_up = F.interpolate(conv, scale_factor=2., mode='bilinear')\n        return deconv_up + interp_up\n\n\nclass ShortCut(nn.Layer):\n    def __init__(self,\n                 layer_num,\n                 ch_in,\n                 ch_out,\n                 norm_type='bn',\n                 lite_neck=False,\n                 name=None):\n        super(ShortCut, self).__init__()\n        shortcut_conv = nn.Sequential()\n        for i in range(layer_num):\n            fan_out = 3 * 3 * ch_out\n            std = math.sqrt(2. / fan_out)\n            in_channels = ch_in if i == 0 else ch_out\n            shortcut_name = name + '.conv.{}'.format(i)\n            if lite_neck:\n                shortcut_conv.add_sublayer(\n                    shortcut_name,\n                    LiteConv(\n                        in_channels=in_channels,\n                        out_channels=ch_out,\n                        with_act=i < layer_num - 1,\n                        norm_type=norm_type))\n            else:\n                shortcut_conv.add_sublayer(\n                    shortcut_name,\n                    nn.Conv2D(\n                        in_channels=in_channels,\n                        out_channels=ch_out,\n                        kernel_size=3,\n                        padding=1,\n                        weight_attr=ParamAttr(initializer=Normal(0, std)),\n                        bias_attr=ParamAttr(\n                            learning_rate=2., regularizer=L2Decay(0.))))\n                if i < layer_num - 1:\n                    shortcut_conv.add_sublayer(shortcut_name + '.act',\n                                               nn.ReLU())\n        self.shortcut = self.add_sublayer('shortcut', shortcut_conv)\n\n    def forward(self, feat):\n        out = self.shortcut(feat)\n        return out\n\n\n@register\n@serializable\nclass TTFFPN(nn.Layer):\n    \"\"\"\n    Args:\n        in_channels (list): number of input feature channels from backbone.\n            [128,256,512,1024] by default, means the channels of DarkNet53\n            backbone return_idx [1,2,3,4].\n        planes (list): the number of output feature channels of FPN.\n            [256, 128, 64] by default\n        shortcut_num (list): the number of convolution layers in each shortcut.\n            [3,2,1] by default, means DarkNet53 backbone return_idx_1 has 3 convs\n            in its shortcut, return_idx_2 has 2 convs and return_idx_3 has 1 conv.\n        norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. 
\n            bn by default\n        lite_neck (bool): whether to use lite conv in TTFNet FPN, \n            False by default\n        fusion_method (string): the method to fuse upsample and shortcut layers.\n            'add' and 'concat' are optional, add by default\n    \"\"\"\n\n    __shared__ = ['norm_type']\n\n    def __init__(self,\n                 in_channels,\n                 planes=[256, 128, 64],\n                 shortcut_num=[3, 2, 1],\n                 norm_type='bn',\n                 lite_neck=False,\n                 fusion_method='add'):\n        super(TTFFPN, self).__init__()\n        self.planes = planes\n        self.shortcut_num = shortcut_num[::-1]\n        self.shortcut_len = len(shortcut_num)\n        self.ch_in = in_channels[::-1]\n        self.fusion_method = fusion_method\n\n        self.upsample_list = []\n        self.shortcut_list = []\n        self.upper_list = []\n        for i, out_c in enumerate(self.planes):\n            in_c = self.ch_in[i] if i == 0 else self.upper_list[-1]\n            upsample_module = LiteUpsample if lite_neck else Upsample\n            upsample = self.add_sublayer(\n                'upsample.' + str(i),\n                upsample_module(\n                    in_c, out_c, norm_type=norm_type))\n            self.upsample_list.append(upsample)\n            if i < self.shortcut_len:\n                shortcut = self.add_sublayer(\n                    'shortcut.' + str(i),\n                    ShortCut(\n                        self.shortcut_num[i],\n                        self.ch_in[i + 1],\n                        out_c,\n                        norm_type=norm_type,\n                        lite_neck=lite_neck,\n                        name='shortcut.' + str(i)))\n                self.shortcut_list.append(shortcut)\n                if self.fusion_method == 'add':\n                    upper_c = out_c\n                elif self.fusion_method == 'concat':\n                    upper_c = out_c * 2\n                else:\n                    raise ValueError(\n                        'Illegal fusion method. Expected add or concat, '\n                        'but received {}'.format(self.fusion_method))\n                self.upper_list.append(upper_c)\n\n    def forward(self, inputs):\n        feat = inputs[-1]\n        for i, out_c in enumerate(self.planes):\n            feat = self.upsample_list[i](feat)\n            if i < self.shortcut_len:\n                shortcut = self.shortcut_list[i](inputs[-i - 2])\n                if self.fusion_method == 'add':\n                    feat = feat + shortcut\n                else:\n                    feat = paddle.concat([feat, shortcut], axis=1)\n        return feat\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_channels': [i.channels for i in input_shape], }\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=self.upper_list[-1], )]\n"
  },
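  {
    "path": "ppdet/modeling/necks/examples/ttf_fpn_usage.py",
    "content": "# Editorial usage sketch -- not part of the original PaddlePaddle sources.\n# The TTFFPN above upsamples the deepest feature three times and fuses a\n# shortcut branch at each step, returning a single high-resolution map.\n# Channel/shape values are illustrative for a DarkNet53-like backbone.\nimport paddle\n\nfrom ppdet.modeling.necks.ttf_fpn import TTFFPN\n\nif __name__ == '__main__':\n    neck = TTFFPN(in_channels=[128, 256, 512, 1024])\n    feats = [\n        paddle.randn([1, 128, 64, 64]),   # stride 4\n        paddle.randn([1, 256, 32, 32]),   # stride 8\n        paddle.randn([1, 512, 16, 16]),   # stride 16\n        paddle.randn([1, 1024, 8, 8]),    # stride 32\n    ]\n    out = neck(feats)\n    print(out.shape)  # [1, 64, 64, 64]: planes[-1] channels, 8x upsampled\n"
  },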
  {
    "path": "ppdet/modeling/necks/yolo_fpn.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n# \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register, serializable\nfrom ppdet.modeling.layers import DropBlock\nfrom ppdet.modeling.ops import get_act_fn\nfrom ..backbones.darknet import ConvBNLayer\nfrom ..shape_spec import ShapeSpec\nfrom ..backbones.csp_darknet import BaseConv, DWConv, CSPLayer\n\n__all__ = ['YOLOv3FPN', 'PPYOLOFPN', 'PPYOLOTinyFPN', 'PPYOLOPAN', 'YOLOCSPPAN']\n\n\ndef add_coord(x, data_format):\n    b = x.shape[0]\n    if data_format == 'NCHW':\n        h, w = x.shape[2], x.shape[3]\n    else:\n        h, w = x.shape[1], x.shape[2]\n\n    gx = paddle.cast(paddle.arange(w) / ((w - 1.) * 2.0) - 1., x.dtype)\n    gy = paddle.cast(paddle.arange(h) / ((h - 1.) * 2.0) - 1., x.dtype)\n\n    if data_format == 'NCHW':\n        gx = gx.reshape([1, 1, 1, w]).expand([b, 1, h, w])\n        gy = gy.reshape([1, 1, h, 1]).expand([b, 1, h, w])\n    else:\n        gx = gx.reshape([1, 1, w, 1]).expand([b, h, w, 1])\n        gy = gy.reshape([1, h, 1, 1]).expand([b, h, w, 1])\n\n    gx.stop_gradient = True\n    gy.stop_gradient = True\n    return gx, gy\n\n\nclass YoloDetBlock(nn.Layer):\n    def __init__(self,\n                 ch_in,\n                 channel,\n                 norm_type,\n                 freeze_norm=False,\n                 name='',\n                 data_format='NCHW'):\n        \"\"\"\n        YOLODetBlock layer for yolov3, see https://arxiv.org/abs/1804.02767\n\n        Args:\n            ch_in (int): input channel\n            channel (int): base channel\n            norm_type (str): batch norm type\n            freeze_norm (bool): whether to freeze norm, default False\n            name (str): layer name\n            data_format (str): data format, NCHW or NHWC\n        \"\"\"\n        super(YoloDetBlock, self).__init__()\n        self.ch_in = ch_in\n        self.channel = channel\n        assert channel % 2 == 0, \\\n            \"channel {} cannot be divided by 2\".format(channel)\n        conv_def = [\n            ['conv0', ch_in, channel, 1, '.0.0'],\n            ['conv1', channel, channel * 2, 3, '.0.1'],\n            ['conv2', channel * 2, channel, 1, '.1.0'],\n            ['conv3', channel, channel * 2, 3, '.1.1'],\n            ['route', channel * 2, channel, 1, '.2'],\n        ]\n\n        self.conv_module = nn.Sequential()\n        for idx, (conv_name, ch_in, ch_out, filter_size,\n                  post_name) in enumerate(conv_def):\n            self.conv_module.add_sublayer(\n                conv_name,\n                ConvBNLayer(\n                    ch_in=ch_in,\n                    ch_out=ch_out,\n                    filter_size=filter_size,\n                    padding=(filter_size - 1) // 2,\n                    norm_type=norm_type,\n                    freeze_norm=freeze_norm,\n                    data_format=data_format,\n   
                 name=name + post_name))\n\n        self.tip = ConvBNLayer(\n            ch_in=channel,\n            ch_out=channel * 2,\n            filter_size=3,\n            padding=1,\n            norm_type=norm_type,\n            freeze_norm=freeze_norm,\n            data_format=data_format,\n            name=name + '.tip')\n\n    def forward(self, inputs):\n        route = self.conv_module(inputs)\n        tip = self.tip(route)\n        return route, tip\n\n\nclass SPP(nn.Layer):\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 k,\n                 pool_size,\n                 norm_type='bn',\n                 freeze_norm=False,\n                 name='',\n                 act='leaky',\n                 data_format='NCHW'):\n        \"\"\"\n        SPP layer, which consists of several parallel pooling layers followed by a conv layer\n\n        Args:\n            ch_in (int): input channel of conv layer\n            ch_out (int): output channel of conv layer\n            k (int): kernel size of conv layer\n            pool_size (list): kernel sizes of the parallel pooling layers\n            norm_type (str): batch norm type\n            freeze_norm (bool): whether to freeze norm, default False\n            name (str): layer name\n            act (str): activation function\n            data_format (str): data format, NCHW or NHWC\n        \"\"\"\n        super(SPP, self).__init__()\n        self.pool = []\n        self.data_format = data_format\n        for size in pool_size:\n            pool = self.add_sublayer(\n                '{}.pool{}'.format(name, size),\n                nn.MaxPool2D(\n                    kernel_size=size,\n                    stride=1,\n                    padding=size // 2,\n                    data_format=data_format,\n                    ceil_mode=False))\n            self.pool.append(pool)\n        self.conv = ConvBNLayer(\n            ch_in,\n            ch_out,\n            k,\n            padding=k // 2,\n            norm_type=norm_type,\n            freeze_norm=freeze_norm,\n            name=name,\n            act=act,\n            data_format=data_format)\n\n    def forward(self, x):\n        outs = [x]\n        for pool in self.pool:\n            outs.append(pool(x))\n        if self.data_format == \"NCHW\":\n            y = paddle.concat(outs, axis=1)\n        else:\n            y = paddle.concat(outs, axis=-1)\n\n        y = self.conv(y)\n        return y\n\n\nclass CoordConv(nn.Layer):\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 filter_size,\n                 padding,\n                 norm_type,\n                 freeze_norm=False,\n                 name='',\n                 data_format='NCHW'):\n        \"\"\"\n        CoordConv layer, see https://arxiv.org/abs/1807.03247\n\n        Args:\n            ch_in (int): input channel\n            ch_out (int): output channel\n            filter_size (int): filter size\n            padding (int): padding size\n            norm_type (str): batch norm type, default bn\n            name (str): layer name\n            data_format (str): data format, NCHW or NHWC\n\n        \"\"\"\n        super(CoordConv, self).__init__()\n        self.conv = ConvBNLayer(\n            ch_in + 2,\n            ch_out,\n            filter_size=filter_size,\n            padding=padding,\n            norm_type=norm_type,\n            freeze_norm=freeze_norm,\n            data_format=data_format,\n            name=name)\n        self.data_format = data_format\n\n    def forward(self, x):\n        gx, gy = 
add_coord(x, self.data_format)\n        if self.data_format == 'NCHW':\n            y = paddle.concat([x, gx, gy], axis=1)\n        else:\n            y = paddle.concat([x, gx, gy], axis=-1)\n        y = self.conv(y)\n        return y\n\n\nclass PPYOLODetBlock(nn.Layer):\n    def __init__(self, cfg, name, data_format='NCHW'):\n        \"\"\"\n        PPYOLODetBlock layer\n\n        Args:\n            cfg (list): layer configs for this block\n            name (str): block name\n            data_format (str): data format, NCHW or NHWC\n        \"\"\"\n        super(PPYOLODetBlock, self).__init__()\n        self.conv_module = nn.Sequential()\n        for idx, (conv_name, layer, args, kwargs) in enumerate(cfg[:-1]):\n            kwargs.update(\n                name='{}.{}'.format(name, conv_name), data_format=data_format)\n            self.conv_module.add_sublayer(conv_name, layer(*args, **kwargs))\n\n        conv_name, layer, args, kwargs = cfg[-1]\n        kwargs.update(\n            name='{}.{}'.format(name, conv_name), data_format=data_format)\n        self.tip = layer(*args, **kwargs)\n\n    def forward(self, inputs):\n        route = self.conv_module(inputs)\n        tip = self.tip(route)\n        return route, tip\n\n\nclass PPYOLOTinyDetBlock(nn.Layer):\n    def __init__(self,\n                 ch_in,\n                 ch_out,\n                 name,\n                 drop_block=False,\n                 block_size=3,\n                 keep_prob=0.9,\n                 data_format='NCHW'):\n        \"\"\"\n        PPYOLO Tiny DetBlock layer\n        Args:\n            ch_in (int): input channel number\n            ch_out (int): output channel number\n            name (str): block name\n            drop_block (bool): whether to use DropBlock\n            block_size (int): drop block size\n            keep_prob (float): probability to keep block in DropBlock\n            data_format (str): data format, NCHW or NHWC\n        \"\"\"\n        super(PPYOLOTinyDetBlock, self).__init__()\n        self.drop_block_ = drop_block\n        self.conv_module = nn.Sequential()\n\n        cfgs = [\n            # name, in channels, out channels, filter_size, \n            # stride, padding, groups\n            ['.0', ch_in, ch_out, 1, 1, 0, 1],\n            ['.1', ch_out, ch_out, 5, 1, 2, ch_out],\n            ['.2', ch_out, ch_out, 1, 1, 0, 1],\n            ['.route', ch_out, ch_out, 5, 1, 2, ch_out],\n        ]\n        for cfg in cfgs:\n            conv_name, conv_ch_in, conv_ch_out, filter_size, stride, padding, \\\n                    groups = cfg\n            self.conv_module.add_sublayer(\n                name + conv_name,\n                ConvBNLayer(\n                    ch_in=conv_ch_in,\n                    ch_out=conv_ch_out,\n                    filter_size=filter_size,\n                    stride=stride,\n                    padding=padding,\n                    groups=groups,\n                    name=name + conv_name))\n\n        self.tip = ConvBNLayer(\n            ch_in=ch_out,\n            ch_out=ch_out,\n            filter_size=1,\n            stride=1,\n            padding=0,\n            groups=1,\n            name=name + conv_name)\n\n        if self.drop_block_:\n            self.drop_block = DropBlock(\n                block_size=block_size,\n                keep_prob=keep_prob,\n                data_format=data_format,\n                name=name + '.dropblock')\n\n    def forward(self, inputs):\n        if self.drop_block_:\n            inputs = self.drop_block(inputs)\n        route = 
self.conv_module(inputs)\n        tip = self.tip(route)\n        return route, tip\n\n\nclass PPYOLODetBlockCSP(nn.Layer):\n    def __init__(self,\n                 cfg,\n                 ch_in,\n                 ch_out,\n                 act,\n                 norm_type,\n                 name,\n                 data_format='NCHW'):\n        \"\"\"\n        PPYOLODetBlockCSP layer\n\n        Args:\n            cfg (list): layer configs for this block\n            ch_in (int): input channel\n            ch_out (int): output channel\n            act (str): activation function, default mish\n            norm_type (str): batch norm type\n            name (str): block name\n            data_format (str): data format, NCHW or NHWC\n        \"\"\"\n        super(PPYOLODetBlockCSP, self).__init__()\n        self.data_format = data_format\n        self.conv1 = ConvBNLayer(\n            ch_in,\n            ch_out,\n            1,\n            padding=0,\n            act=act,\n            norm_type=norm_type,\n            name=name + '.left',\n            data_format=data_format)\n        self.conv2 = ConvBNLayer(\n            ch_in,\n            ch_out,\n            1,\n            padding=0,\n            act=act,\n            norm_type=norm_type,\n            name=name + '.right',\n            data_format=data_format)\n        self.conv3 = ConvBNLayer(\n            ch_out * 2,\n            ch_out * 2,\n            1,\n            padding=0,\n            act=act,\n            norm_type=norm_type,\n            name=name,\n            data_format=data_format)\n        self.conv_module = nn.Sequential()\n        for idx, (layer_name, layer, args, kwargs) in enumerate(cfg):\n            kwargs.update(name=name + layer_name, data_format=data_format)\n            self.conv_module.add_sublayer(layer_name, layer(*args, **kwargs))\n\n    def forward(self, inputs):\n        conv_left = self.conv1(inputs)\n        conv_right = self.conv2(inputs)\n        conv_left = self.conv_module(conv_left)\n        if self.data_format == 'NCHW':\n            conv = paddle.concat([conv_left, conv_right], axis=1)\n        else:\n            conv = paddle.concat([conv_left, conv_right], axis=-1)\n\n        conv = self.conv3(conv)\n        return conv, conv\n\n\n@register\n@serializable\nclass YOLOv3FPN(nn.Layer):\n    __shared__ = ['norm_type', 'data_format']\n\n    def __init__(self,\n                 in_channels=[256, 512, 1024],\n                 norm_type='bn',\n                 freeze_norm=False,\n                 data_format='NCHW'):\n        \"\"\"\n        YOLOv3FPN layer\n\n        Args:\n            in_channels (list): input channels for fpn\n            norm_type (str): batch norm type, default bn\n            data_format (str): data format, NCHW or NHWC\n\n        \"\"\"\n        super(YOLOv3FPN, self).__init__()\n        assert len(in_channels) > 0, \"in_channels length should be > 0\"\n        self.in_channels = in_channels\n        self.num_blocks = len(in_channels)\n\n        self._out_channels = []\n        self.yolo_blocks = []\n        self.routes = []\n        self.data_format = data_format\n        for i in range(self.num_blocks):\n            name = 'yolo_block.{}'.format(i)\n            in_channel = in_channels[-i - 1]\n            if i > 0:\n                in_channel += 512 // (2**i)\n            yolo_block = self.add_sublayer(\n                name,\n                YoloDetBlock(\n                    in_channel,\n                    channel=512 // (2**i),\n                    norm_type=norm_type,\n                    freeze_norm=freeze_norm,\n                    
data_format=data_format,\n                    name=name))\n            self.yolo_blocks.append(yolo_block)\n            # tip layer output channel doubled\n            self._out_channels.append(1024 // (2**i))\n\n            if i < self.num_blocks - 1:\n                name = 'yolo_transition.{}'.format(i)\n                route = self.add_sublayer(\n                    name,\n                    ConvBNLayer(\n                        ch_in=512 // (2**i),\n                        ch_out=256 // (2**i),\n                        filter_size=1,\n                        stride=1,\n                        padding=0,\n                        norm_type=norm_type,\n                        freeze_norm=freeze_norm,\n                        data_format=data_format,\n                        name=name))\n                self.routes.append(route)\n\n    def forward(self, blocks, for_mot=False):\n        assert len(blocks) == self.num_blocks\n        blocks = blocks[::-1]\n        yolo_feats = []\n\n        # add embedding features output for multi-object tracking model\n        if for_mot:\n            emb_feats = []\n\n        for i, block in enumerate(blocks):\n            if i > 0:\n                if self.data_format == 'NCHW':\n                    block = paddle.concat([route, block], axis=1)\n                else:\n                    block = paddle.concat([route, block], axis=-1)\n            route, tip = self.yolo_blocks[i](block)\n            yolo_feats.append(tip)\n\n            if for_mot:\n                # add embedding features output\n                emb_feats.append(route)\n\n            if i < self.num_blocks - 1:\n                route = self.routes[i](route)\n                route = F.interpolate(\n                    route, scale_factor=2., data_format=self.data_format)\n\n        if for_mot:\n            return {'yolo_feats': yolo_feats, 'emb_feats': emb_feats}\n        else:\n            return yolo_feats\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_channels': [i.channels for i in input_shape], }\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=c) for c in self._out_channels]\n\n\n@register\n@serializable\nclass PPYOLOFPN(nn.Layer):\n    __shared__ = ['norm_type', 'data_format']\n\n    def __init__(self,\n                 in_channels=[512, 1024, 2048],\n                 norm_type='bn',\n                 freeze_norm=False,\n                 data_format='NCHW',\n                 coord_conv=False,\n                 conv_block_num=2,\n                 drop_block=False,\n                 block_size=3,\n                 keep_prob=0.9,\n                 spp=False):\n        \"\"\"\n        PPYOLOFPN layer\n\n        Args:\n            in_channels (list): input channels for fpn\n            norm_type (str): batch norm type, default bn\n            data_format (str): data format, NCHW or NHWC\n            coord_conv (bool): whether to use CoordConv or not\n            conv_block_num (int): conv block num of each pan block\n            drop_block (bool): whether to use DropBlock or not\n            block_size (int): block size of DropBlock\n            keep_prob (float): keep probability of DropBlock\n            spp (bool): whether to use spp or not\n\n        \"\"\"\n        super(PPYOLOFPN, self).__init__()\n        assert len(in_channels) > 0, \"in_channels length should be > 0\"\n        self.in_channels = in_channels\n        self.num_blocks = len(in_channels)\n        # parse kwargs\n        self.coord_conv = coord_conv\n        
self.drop_block = drop_block\n        self.block_size = block_size\n        self.keep_prob = keep_prob\n        self.spp = spp\n        self.conv_block_num = conv_block_num\n        self.data_format = data_format\n        if self.coord_conv:\n            ConvLayer = CoordConv\n        else:\n            ConvLayer = ConvBNLayer\n\n        if self.drop_block:\n            dropblock_cfg = [[\n                'dropblock', DropBlock, [self.block_size, self.keep_prob],\n                dict()\n            ]]\n        else:\n            dropblock_cfg = []\n\n        self._out_channels = []\n        self.yolo_blocks = []\n        self.routes = []\n        for i, ch_in in enumerate(self.in_channels[::-1]):\n            if i > 0:\n                ch_in += 512 // (2**i)\n            channel = 64 * (2**self.num_blocks) // (2**i)\n            base_cfg = []\n            c_in, c_out = ch_in, channel\n            for j in range(self.conv_block_num):\n                base_cfg += [\n                    [\n                        'conv{}'.format(2 * j), ConvLayer, [c_in, c_out, 1],\n                        dict(\n                            padding=0,\n                            norm_type=norm_type,\n                            freeze_norm=freeze_norm)\n                    ],\n                    [\n                        'conv{}'.format(2 * j + 1), ConvBNLayer,\n                        [c_out, c_out * 2, 3], dict(\n                            padding=1,\n                            norm_type=norm_type,\n                            freeze_norm=freeze_norm)\n                    ],\n                ]\n                c_in, c_out = c_out * 2, c_out\n\n            base_cfg += [[\n                'route', ConvLayer, [c_in, c_out, 1], dict(\n                    padding=0, norm_type=norm_type, freeze_norm=freeze_norm)\n            ], [\n                'tip', ConvLayer, [c_out, c_out * 2, 3], dict(\n                    padding=1, norm_type=norm_type, freeze_norm=freeze_norm)\n            ]]\n\n            if self.conv_block_num == 2:\n                if i == 0:\n                    if self.spp:\n                        spp_cfg = [[\n                            'spp', SPP, [channel * 4, channel, 1], dict(\n                                pool_size=[5, 9, 13],\n                                norm_type=norm_type,\n                                freeze_norm=freeze_norm)\n                        ]]\n                    else:\n                        spp_cfg = []\n                    cfg = base_cfg[0:3] + spp_cfg + base_cfg[\n                        3:4] + dropblock_cfg + base_cfg[4:6]\n                else:\n                    cfg = base_cfg[0:2] + dropblock_cfg + base_cfg[2:6]\n            elif self.conv_block_num == 0:\n                if self.spp and i == 0:\n                    spp_cfg = [[\n                        'spp', SPP, [c_in * 4, c_in, 1], dict(\n                            pool_size=[5, 9, 13],\n                            norm_type=norm_type,\n                            freeze_norm=freeze_norm)\n                    ]]\n                else:\n                    spp_cfg = []\n                cfg = spp_cfg + dropblock_cfg + base_cfg\n            name = 'yolo_block.{}'.format(i)\n            yolo_block = self.add_sublayer(name, PPYOLODetBlock(cfg, name))\n            self.yolo_blocks.append(yolo_block)\n            self._out_channels.append(channel * 2)\n            if i < self.num_blocks - 1:\n                name = 'yolo_transition.{}'.format(i)\n                route = self.add_sublayer(\n            
        name,\n                    ConvBNLayer(\n                        ch_in=channel,\n                        ch_out=256 // (2**i),\n                        filter_size=1,\n                        stride=1,\n                        padding=0,\n                        norm_type=norm_type,\n                        freeze_norm=freeze_norm,\n                        data_format=data_format,\n                        name=name))\n                self.routes.append(route)\n\n    def forward(self, blocks, for_mot=False):\n        assert len(blocks) == self.num_blocks\n        blocks = blocks[::-1]\n        yolo_feats = []\n\n        # add embedding features output for multi-object tracking model\n        if for_mot:\n            emb_feats = []\n\n        for i, block in enumerate(blocks):\n            if i > 0:\n                if self.data_format == 'NCHW':\n                    block = paddle.concat([route, block], axis=1)\n                else:\n                    block = paddle.concat([route, block], axis=-1)\n            route, tip = self.yolo_blocks[i](block)\n            yolo_feats.append(tip)\n\n            if for_mot:\n                # add embedding features output\n                emb_feats.append(route)\n\n            if i < self.num_blocks - 1:\n                route = self.routes[i](route)\n                route = F.interpolate(\n                    route, scale_factor=2., data_format=self.data_format)\n\n        if for_mot:\n            return {'yolo_feats': yolo_feats, 'emb_feats': emb_feats}\n        else:\n            return yolo_feats\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_channels': [i.channels for i in input_shape], }\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=c) for c in self._out_channels]\n\n\n@register\n@serializable\nclass PPYOLOTinyFPN(nn.Layer):\n    __shared__ = ['norm_type', 'data_format']\n\n    def __init__(self,\n                 in_channels=[80, 56, 34],\n                 detection_block_channels=[160, 128, 96],\n                 norm_type='bn',\n                 data_format='NCHW',\n                 **kwargs):\n        \"\"\"\n        PPYOLO Tiny FPN layer\n        Args:\n            in_channels (list): input channels for fpn\n            detection_block_channels (list): channels in fpn\n            norm_type (str): batch norm type, default bn\n            data_format (str): data format, NCHW or NHWC\n            kwargs: extra key-value pairs, such as parameters of DropBlock and SPP \n        \"\"\"\n        super(PPYOLOTinyFPN, self).__init__()\n        assert len(in_channels) > 0, \"in_channels length should > 0\"\n        self.in_channels = in_channels[::-1]\n        assert len(detection_block_channels\n                   ) > 0, \"detection_block_channels length should > 0\"\n        self.detection_block_channels = detection_block_channels\n        self.data_format = data_format\n        self.num_blocks = len(in_channels)\n        # parse kwargs\n        self.drop_block = kwargs.get('drop_block', False)\n        self.block_size = kwargs.get('block_size', 3)\n        self.keep_prob = kwargs.get('keep_prob', 0.9)\n\n        self.spp_ = kwargs.get('spp', False)\n        if self.spp_:\n            self.spp = SPP(self.in_channels[0] * 4,\n                           self.in_channels[0],\n                           k=1,\n                           pool_size=[5, 9, 13],\n                           norm_type=norm_type,\n                           name='spp')\n\n        self._out_channels 
= []\n        self.yolo_blocks = []\n        self.routes = []\n        for i, (\n                ch_in, ch_out\n        ) in enumerate(zip(self.in_channels, self.detection_block_channels)):\n            name = 'yolo_block.{}'.format(i)\n            if i > 0:\n                ch_in += self.detection_block_channels[i - 1]\n            yolo_block = self.add_sublayer(\n                name,\n                PPYOLOTinyDetBlock(\n                    ch_in,\n                    ch_out,\n                    name,\n                    drop_block=self.drop_block,\n                    block_size=self.block_size,\n                    keep_prob=self.keep_prob))\n            self.yolo_blocks.append(yolo_block)\n            self._out_channels.append(ch_out)\n\n            if i < self.num_blocks - 1:\n                name = 'yolo_transition.{}'.format(i)\n                route = self.add_sublayer(\n                    name,\n                    ConvBNLayer(\n                        ch_in=ch_out,\n                        ch_out=ch_out,\n                        filter_size=1,\n                        stride=1,\n                        padding=0,\n                        norm_type=norm_type,\n                        data_format=data_format,\n                        name=name))\n                self.routes.append(route)\n\n    def forward(self, blocks, for_mot=False):\n        assert len(blocks) == self.num_blocks\n        blocks = blocks[::-1]\n        yolo_feats = []\n\n        # add embedding features output for multi-object tracking model\n        if for_mot:\n            emb_feats = []\n\n        for i, block in enumerate(blocks):\n            if i == 0 and self.spp_:\n                block = self.spp(block)\n\n            if i > 0:\n                if self.data_format == 'NCHW':\n                    block = paddle.concat([route, block], axis=1)\n                else:\n                    block = paddle.concat([route, block], axis=-1)\n            route, tip = self.yolo_blocks[i](block)\n            yolo_feats.append(tip)\n\n            if for_mot:\n                # add embedding features output\n                emb_feats.append(route)\n\n            if i < self.num_blocks - 1:\n                route = self.routes[i](route)\n                route = F.interpolate(\n                    route, scale_factor=2., data_format=self.data_format)\n\n        if for_mot:\n            return {'yolo_feats': yolo_feats, 'emb_feats': emb_feats}\n        else:\n            return yolo_feats\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_channels': [i.channels for i in input_shape], }\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=c) for c in self._out_channels]\n\n\n@register\n@serializable\nclass PPYOLOPAN(nn.Layer):\n    __shared__ = ['norm_type', 'data_format']\n\n    def __init__(self,\n                 in_channels=[512, 1024, 2048],\n                 norm_type='bn',\n                 data_format='NCHW',\n                 act='mish',\n                 conv_block_num=3,\n                 drop_block=False,\n                 block_size=3,\n                 keep_prob=0.9,\n                 spp=False):\n        \"\"\"\n        PPYOLOPAN layer with SPP, DropBlock and CSP connection.\n\n        Args:\n            in_channels (list): input channels for fpn\n            norm_type (str): batch norm type, default bn\n            data_format (str): data format, NCHW or NHWC\n            act (str): activation function, default mish\n            conv_block_num 
(int): conv block num of each pan block\n            drop_block (bool): whether use DropBlock or not\n            block_size (int): block size of DropBlock\n            keep_prob (float): keep probability of DropBlock\n            spp (bool): whether use spp or not\n\n        \"\"\"\n        super(PPYOLOPAN, self).__init__()\n        assert len(in_channels) > 0, \"in_channels length should > 0\"\n        self.in_channels = in_channels\n        self.num_blocks = len(in_channels)\n        # parse kwargs\n        self.drop_block = drop_block\n        self.block_size = block_size\n        self.keep_prob = keep_prob\n        self.spp = spp\n        self.conv_block_num = conv_block_num\n        self.data_format = data_format\n        if self.drop_block:\n            dropblock_cfg = [[\n                'dropblock', DropBlock, [self.block_size, self.keep_prob],\n                dict()\n            ]]\n        else:\n            dropblock_cfg = []\n\n        # fpn\n        self.fpn_blocks = []\n        self.fpn_routes = []\n        fpn_channels = []\n        for i, ch_in in enumerate(self.in_channels[::-1]):\n            if i > 0:\n                ch_in += 512 // (2**(i - 1))\n            channel = 512 // (2**i)\n            base_cfg = []\n            for j in range(self.conv_block_num):\n                base_cfg += [\n                    # name, layer, args\n                    [\n                        '{}.0'.format(j), ConvBNLayer, [channel, channel, 1],\n                        dict(\n                            padding=0, act=act, norm_type=norm_type)\n                    ],\n                    [\n                        '{}.1'.format(j), ConvBNLayer, [channel, channel, 3],\n                        dict(\n                            padding=1, act=act, norm_type=norm_type)\n                    ]\n                ]\n\n            if i == 0 and self.spp:\n                base_cfg[3] = [\n                    'spp', SPP, [channel * 4, channel, 1], dict(\n                        pool_size=[5, 9, 13], act=act, norm_type=norm_type)\n                ]\n\n            cfg = base_cfg[:4] + dropblock_cfg + base_cfg[4:]\n            name = 'fpn.{}'.format(i)\n            fpn_block = self.add_sublayer(\n                name,\n                PPYOLODetBlockCSP(cfg, ch_in, channel, act, norm_type, name,\n                                  data_format))\n            self.fpn_blocks.append(fpn_block)\n            fpn_channels.append(channel * 2)\n            if i < self.num_blocks - 1:\n                name = 'fpn_transition.{}'.format(i)\n                route = self.add_sublayer(\n                    name,\n                    ConvBNLayer(\n                        ch_in=channel * 2,\n                        ch_out=channel,\n                        filter_size=1,\n                        stride=1,\n                        padding=0,\n                        act=act,\n                        norm_type=norm_type,\n                        data_format=data_format,\n                        name=name))\n                self.fpn_routes.append(route)\n        # pan\n        self.pan_blocks = []\n        self.pan_routes = []\n        self._out_channels = [512 // (2**(self.num_blocks - 2)), ]\n        for i in reversed(range(self.num_blocks - 1)):\n            name = 'pan_transition.{}'.format(i)\n            route = self.add_sublayer(\n                name,\n                ConvBNLayer(\n                    ch_in=fpn_channels[i + 1],\n                    ch_out=fpn_channels[i + 1],\n                    
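# editorial note: a 3x3, stride-2 conv, so each PAN transition below\n                    # halves the spatial size while keeping the channel count\n                    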
filter_size=3,\n                    stride=2,\n                    padding=1,\n                    act=act,\n                    norm_type=norm_type,\n                    data_format=data_format,\n                    name=name))\n            self.pan_routes = [route, ] + self.pan_routes\n            base_cfg = []\n            ch_in = fpn_channels[i] + fpn_channels[i + 1]\n            channel = 512 // (2**i)\n            for j in range(self.conv_block_num):\n                base_cfg += [\n                    # name, layer, args\n                    [\n                        '{}.0'.format(j), ConvBNLayer, [channel, channel, 1],\n                        dict(\n                            padding=0, act=act, norm_type=norm_type)\n                    ],\n                    [\n                        '{}.1'.format(j), ConvBNLayer, [channel, channel, 3],\n                        dict(\n                            padding=1, act=act, norm_type=norm_type)\n                    ]\n                ]\n\n            cfg = base_cfg[:4] + dropblock_cfg + base_cfg[4:]\n            name = 'pan.{}'.format(i)\n            pan_block = self.add_sublayer(\n                name,\n                PPYOLODetBlockCSP(cfg, ch_in, channel, act, norm_type, name,\n                                  data_format))\n\n            self.pan_blocks = [pan_block, ] + self.pan_blocks\n            self._out_channels.append(channel * 2)\n\n        self._out_channels = self._out_channels[::-1]\n\n    def forward(self, blocks, for_mot=False):\n        assert len(blocks) == self.num_blocks\n        blocks = blocks[::-1]\n        fpn_feats = []\n\n        # add embedding features output for multi-object tracking model\n        if for_mot:\n            emb_feats = []\n\n        for i, block in enumerate(blocks):\n            if i > 0:\n                if self.data_format == 'NCHW':\n                    block = paddle.concat([route, block], axis=1)\n                else:\n                    block = paddle.concat([route, block], axis=-1)\n            route, tip = self.fpn_blocks[i](block)\n            fpn_feats.append(tip)\n\n            if for_mot:\n                # add embedding features output\n                emb_feats.append(route)\n\n            if i < self.num_blocks - 1:\n                route = self.fpn_routes[i](route)\n                route = F.interpolate(\n                    route, scale_factor=2., data_format=self.data_format)\n\n        pan_feats = [fpn_feats[-1], ]\n        route = fpn_feats[self.num_blocks - 1]\n        for i in reversed(range(self.num_blocks - 1)):\n            block = fpn_feats[i]\n            route = self.pan_routes[i](route)\n            if self.data_format == 'NCHW':\n                block = paddle.concat([route, block], axis=1)\n            else:\n                block = paddle.concat([route, block], axis=-1)\n\n            route, tip = self.pan_blocks[i](block)\n            pan_feats.append(tip)\n\n        if for_mot:\n            return {'yolo_feats': pan_feats[::-1], 'emb_feats': emb_feats}\n        else:\n            return pan_feats[::-1]\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_channels': [i.channels for i in input_shape], }\n\n    @property\n    def out_shape(self):\n        return [ShapeSpec(channels=c) for c in self._out_channels]\n\n\n@register\n@serializable\nclass YOLOCSPPAN(nn.Layer):\n    \"\"\"\n    YOLO CSP-PAN, used in YOLOv5 and YOLOX.\n    \"\"\"\n    __shared__ = ['depth_mult', 'data_format', 'act', 'trt']\n\n    def 
__init__(self,\n                 depth_mult=1.0,\n                 in_channels=[256, 512, 1024],\n                 depthwise=False,\n                 data_format='NCHW',\n                 act='silu',\n                 trt=False):\n        super(YOLOCSPPAN, self).__init__()\n        self.in_channels = in_channels\n        self._out_channels = in_channels\n        Conv = DWConv if depthwise else BaseConv\n\n        self.data_format = data_format\n        act = get_act_fn(\n            act, trt=trt) if act is None or isinstance(act,\n                                                       (str, dict)) else act\n        self.upsample = nn.Upsample(scale_factor=2, mode=\"nearest\")\n\n        # top-down fpn\n        self.lateral_convs = nn.LayerList()\n        self.fpn_blocks = nn.LayerList()\n        for idx in range(len(in_channels) - 1, 0, -1):\n            self.lateral_convs.append(\n                BaseConv(\n                    int(in_channels[idx]),\n                    int(in_channels[idx - 1]),\n                    1,\n                    1,\n                    act=act))\n            self.fpn_blocks.append(\n                CSPLayer(\n                    int(in_channels[idx - 1] * 2),\n                    int(in_channels[idx - 1]),\n                    round(3 * depth_mult),\n                    shortcut=False,\n                    depthwise=depthwise,\n                    act=act))\n\n        # bottom-up pan\n        self.downsample_convs = nn.LayerList()\n        self.pan_blocks = nn.LayerList()\n        for idx in range(len(in_channels) - 1):\n            self.downsample_convs.append(\n                Conv(\n                    int(in_channels[idx]),\n                    int(in_channels[idx]),\n                    3,\n                    stride=2,\n                    act=act))\n            self.pan_blocks.append(\n                CSPLayer(\n                    int(in_channels[idx] * 2),\n                    int(in_channels[idx + 1]),\n                    round(3 * depth_mult),\n                    shortcut=False,\n                    depthwise=depthwise,\n                    act=act))\n\n    def forward(self, feats, for_mot=False):\n        assert len(feats) == len(self.in_channels)\n\n        # top-down fpn\n        inner_outs = [feats[-1]]\n        for idx in range(len(self.in_channels) - 1, 0, -1):\n            feat_heigh = inner_outs[0]\n            feat_low = feats[idx - 1]\n            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](\n                feat_heigh)\n            inner_outs[0] = feat_heigh\n\n            upsample_feat = F.interpolate(\n                feat_heigh,\n                scale_factor=2.,\n                mode=\"nearest\",\n                data_format=self.data_format)\n            inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](\n                paddle.concat(\n                    [upsample_feat, feat_low], axis=1))\n            inner_outs.insert(0, inner_out)\n\n        # bottom-up pan\n        outs = [inner_outs[0]]\n        for idx in range(len(self.in_channels) - 1):\n            feat_low = outs[-1]\n            feat_height = inner_outs[idx + 1]\n            downsample_feat = self.downsample_convs[idx](feat_low)\n            out = self.pan_blocks[idx](paddle.concat(\n                [downsample_feat, feat_height], axis=1))\n            outs.append(out)\n\n        return outs\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_channels': [i.channels for i in input_shape], }\n\n    
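# A minimal smoke test (editorial sketch; the shapes and names below are\n    # assumptions, not taken from the original source):\n    #\n    #   fpn = YOLOCSPPAN(in_channels=[256, 512, 1024])\n    #   feats = [paddle.rand([1, c, 640 // s, 640 // s])\n    #            for c, s in zip([256, 512, 1024], [8, 16, 32])]\n    #   outs = fpn(feats)  # three maps; channel counts match in_channels\n\n    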
@property\n    def out_shape(self):\n        return [ShapeSpec(channels=c) for c in self._out_channels]\n"
  },
  {
    "path": "ppdet/modeling/ops.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n# \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport paddle\nimport paddle.nn.functional as F\nimport paddle.nn as nn\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\ntry:\n    import paddle._legacy_C_ops as C_ops\nexcept:\n    import paddle._C_ops as C_ops\n\ntry:\n    from paddle.framework import in_dynamic_or_pir_mode\n    HAVE_PIR = True\nexcept:\n    HAVE_PIR = False\n\nfrom paddle import in_dynamic_mode\nfrom paddle.common_ops_import import Variable, LayerHelper, check_variable_and_dtype, check_type, check_dtype\n\n__all__ = [\n    'prior_box', 'generate_proposals', 'box_coder', 'multiclass_nms',\n    'distribute_fpn_proposals', 'matrix_nms', 'batch_norm', 'mish', 'silu',\n    'swish', 'identity', 'anchor_generator'\n]\n\n\ndef identity(x):\n    return x\n\n\ndef mish(x):\n    return F.mish(x) if hasattr(F, mish) else x * F.tanh(F.softplus(x))\n\n\ndef silu(x):\n    return F.silu(x)\n\n\ndef swish(x):\n    return x * F.sigmoid(x)\n\n\nTRT_ACT_SPEC = {'swish': swish, 'silu': swish}\n\nACT_SPEC = {'mish': mish, 'silu': silu}\n\n\ndef get_act_fn(act=None, trt=False):\n    assert act is None or isinstance(act, (\n        str, dict)), 'name of activation should be str, dict or None'\n    if not act:\n        return identity\n\n    if isinstance(act, dict):\n        name = act['name']\n        act.pop('name')\n        kwargs = act\n    else:\n        name = act\n        kwargs = dict()\n\n    if trt and name in TRT_ACT_SPEC:\n        fn = TRT_ACT_SPEC[name]\n    elif name in ACT_SPEC:\n        fn = ACT_SPEC[name]\n    else:\n        fn = getattr(F, name)\n\n    return lambda x: fn(x, **kwargs)\n\n\ndef batch_norm(ch,\n               norm_type='bn',\n               norm_decay=0.,\n               freeze_norm=False,\n               initializer=None,\n               data_format='NCHW'):\n\n    norm_lr = 0. 
if freeze_norm else 1.\n    weight_attr = ParamAttr(\n        initializer=initializer,\n        learning_rate=norm_lr,\n        regularizer=L2Decay(norm_decay),\n        trainable=False if freeze_norm else True)\n    bias_attr = ParamAttr(\n        learning_rate=norm_lr,\n        regularizer=L2Decay(norm_decay),\n        trainable=False if freeze_norm else True)\n\n    if norm_type in ['sync_bn', 'bn']:\n        norm_layer = nn.BatchNorm2D(\n            ch,\n            weight_attr=weight_attr,\n            bias_attr=bias_attr,\n            data_format=data_format)\n\n    norm_params = norm_layer.parameters()\n    if freeze_norm:\n        for param in norm_params:\n            param.stop_gradient = True\n\n    return norm_layer\n\n\n@paddle.jit.not_to_static\ndef anchor_generator(input,\n                     anchor_sizes=None,\n                     aspect_ratios=None,\n                     variance=[0.1, 0.1, 0.2, 0.2],\n                     stride=None,\n                     offset=0.5):\n    \"\"\"\n    **Anchor generator operator**\n    Generate anchors for Faster RCNN algorithm.\n    Each position of the input produces N anchors, N =\n    size(anchor_sizes) * size(aspect_ratios). The order of generated anchors\n    is first the aspect_ratios loop, then the anchor_sizes loop.\n    Args:\n       input(Variable): 4-D Tensor with shape [N,C,H,W]. The input feature map.\n       anchor_sizes(float32|list|tuple, optional): The anchor sizes of generated\n          anchors, given in absolute pixels e.g. [64., 128., 256., 512.].\n          For instance, the anchor size of 64 means the area of this anchor \n          equals 64**2. None by default.\n       aspect_ratios(float32|list|tuple, optional): The height / width ratios \n           of generated anchors, e.g. [0.5, 1.0, 2.0]. None by default.\n       variance(list|tuple, optional): The variances to be used in box \n           regression deltas. The data type is float32, [0.1, 0.1, 0.2, 0.2] by \n           default.\n       stride(list|tuple, optional): The anchors stride across width and height.\n           The data type is float32. e.g. [16.0, 16.0]. None by default.\n       offset(float32, optional): Prior boxes center offset. 0.5 by default.\n    Returns:\n        Tuple:\n        Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4].\n        H is the height of input, W is the width of input,\n        num_anchors is the box count of each position. \n        Each anchor is in (xmin, ymin, xmax, ymax) format and unnormalized.\n \n        Variances(Variable): The expanded variances of anchors\n        with a layout of [H, W, num_priors, 4].\n        H is the height of input, W is the width of input,\n        num_anchors is the box count of each position.\n        Each variance is in (xcenter, ycenter, w, h) format.\n    Examples:\n        .. 
code-block:: python\n            import paddle.fluid as fluid\n            conv1 = fluid.data(name='conv1', shape=[None, 48, 16, 16], dtype='float32')\n            anchor, var = fluid.layers.anchor_generator(\n                input=conv1,\n                anchor_sizes=[64, 128, 256, 512],\n                aspect_ratios=[0.5, 1.0, 2.0],\n                variance=[0.1, 0.1, 0.2, 0.2],\n                stride=[16.0, 16.0],\n                offset=0.5)\n    \"\"\"\n\n    def _is_list_or_tuple_(data):\n        return (isinstance(data, list) or isinstance(data, tuple))\n\n    if not _is_list_or_tuple_(anchor_sizes):\n        anchor_sizes = [anchor_sizes]\n    if not _is_list_or_tuple_(aspect_ratios):\n        aspect_ratios = [aspect_ratios]\n    if not (_is_list_or_tuple_(stride) and len(stride) == 2):\n        raise ValueError('stride should be a list or tuple with length 2, '\n                         '(stride_width, stride_height).')\n\n    anchor_sizes = list(map(float, anchor_sizes))\n    aspect_ratios = list(map(float, aspect_ratios))\n    stride = list(map(float, stride))\n\n    if in_dynamic_mode():\n        attrs = ('anchor_sizes', anchor_sizes, 'aspect_ratios', aspect_ratios,\n                 'variances', variance, 'stride', stride, 'offset', offset)\n        anchor, var = C_ops.anchor_generator(input, *attrs)\n        return anchor, var\n\n    helper = LayerHelper(\"anchor_generator\", **locals())\n    dtype = helper.input_dtype()\n    attrs = {\n        'anchor_sizes': anchor_sizes,\n        'aspect_ratios': aspect_ratios,\n        'variances': variance,\n        'stride': stride,\n        'offset': offset\n    }\n\n    anchor = helper.create_variable_for_type_inference(dtype)\n    var = helper.create_variable_for_type_inference(dtype)\n    helper.append_op(\n        type=\"anchor_generator\",\n        inputs={\"Input\": input},\n        outputs={\"Anchors\": anchor,\n                 \"Variances\": var},\n        attrs=attrs, )\n    anchor.stop_gradient = True\n    var.stop_gradient = True\n    return anchor, var\n\n\n@paddle.jit.not_to_static\ndef distribute_fpn_proposals(fpn_rois,\n                             min_level,\n                             max_level,\n                             refer_level,\n                             refer_scale,\n                             pixel_offset=False,\n                             rois_num=None,\n                             name=None):\n    r\"\"\"\n    \n    **This op only takes LoDTensor as input.** In Feature Pyramid Networks \n    (FPN) models, it is needed to distribute all proposals into different FPN \n    levels, with respect to the scale of the proposals, the referring scale and the \n    referring level. Besides, to restore the order of proposals, we return an \n    array which indicates the original index of rois in current proposals. \n    To compute FPN level for each roi, the formula is given as follows:\n    \n    .. math::\n\n        roi\\_scale &= \\sqrt{BBoxArea(fpn\\_roi)}\n\n        level &= floor(\\log(\\frac{roi\\_scale}{refer\\_scale}) + refer\\_level)\n\n    where BBoxArea is a function to compute the area of each roi.\n\n    Args:\n\n        fpn_rois(Variable): 2-D Tensor with shape [N, 4] and data type is \n            float32 or float64. 
The input fpn_rois.\n        min_level(int32): The lowest level of FPN layer where the proposals come \n            from.\n        max_level(int32): The highest level of FPN layer where the proposals\n            come from.\n        refer_level(int32): The referring level of FPN layer with specified scale.\n        refer_scale(int32): The referring scale of FPN layer with specified level.\n        rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. \n            The shape is [B] and data type is int32. B is the number of images.\n            If it is not None then return a list of 1-D Tensor. Each element \n            is the output RoIs' number of each image on the corresponding level\n            and the shape is [B]. None by default.\n        name(str, optional): For detailed information, please refer \n            to :ref:`api_guide_Name`. Usually name is no need to set and \n            None by default. \n\n    Returns:\n        Tuple:\n\n        multi_rois(List) : A list of 2-D LoDTensor with shape [M, 4] \n        and data type of float32 and float64. The length is \n        max_level-min_level+1. The proposals in each FPN level.\n\n        restore_ind(Variable): A 2-D Tensor with shape [N, 1], N is \n        the number of total rois. The data type is int32. It is\n        used to restore the order of fpn_rois.\n\n        rois_num_per_level(List): A list of 1-D Tensor and each Tensor is \n        the RoIs' number in each image on the corresponding level. The shape \n        is [B] and data type of int32. B is the number of images\n\n\n    Examples:\n        .. code-block:: python\n\n            import paddle\n            from ppdet.modeling import ops\n            paddle.enable_static()\n            fpn_rois = paddle.static.data(\n                name='data', shape=[None, 4], dtype='float32', lod_level=1)\n            multi_rois, restore_ind = ops.distribute_fpn_proposals(\n                fpn_rois=fpn_rois,\n                min_level=2,\n                max_level=5,\n                refer_level=4,\n                refer_scale=224)\n    \"\"\"\n    num_lvl = max_level - min_level + 1\n\n    if in_dynamic_mode():\n        assert rois_num is not None, \"rois_num should not be None in dygraph mode.\"\n        attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level',\n                 refer_level, 'refer_scale', refer_scale, 'pixel_offset',\n                 pixel_offset)\n        multi_rois, restore_ind, rois_num_per_level = C_ops.distribute_fpn_proposals(\n            fpn_rois, rois_num, num_lvl, num_lvl, *attrs)\n\n        return multi_rois, restore_ind, rois_num_per_level\n\n    else:\n        check_variable_and_dtype(fpn_rois, 'fpn_rois', ['float32', 'float64'],\n                                 'distribute_fpn_proposals')\n        helper = LayerHelper('distribute_fpn_proposals', **locals())\n        dtype = helper.input_dtype('fpn_rois')\n        multi_rois = [\n            helper.create_variable_for_type_inference(dtype)\n            for i in range(num_lvl)\n        ]\n\n        restore_ind = helper.create_variable_for_type_inference(dtype='int32')\n\n        inputs = {'FpnRois': fpn_rois}\n        outputs = {\n            'MultiFpnRois': multi_rois,\n            'RestoreIndex': restore_ind,\n        }\n\n        if rois_num is not None:\n            inputs['RoisNum'] = rois_num\n            rois_num_per_level = [\n                helper.create_variable_for_type_inference(dtype='int32')\n                for i in range(num_lvl)\n            ]\n    
        outputs['MultiLevelRoIsNum'] = rois_num_per_level\n        else:\n            rois_num_per_level = None\n\n        helper.append_op(\n            type='distribute_fpn_proposals',\n            inputs=inputs,\n            outputs=outputs,\n            attrs={\n                'min_level': min_level,\n                'max_level': max_level,\n                'refer_level': refer_level,\n                'refer_scale': refer_scale,\n                'pixel_offset': pixel_offset\n            })\n        return multi_rois, restore_ind, rois_num_per_level\n\n\n@paddle.jit.not_to_static\ndef prior_box(input,\n              image,\n              min_sizes,\n              max_sizes=None,\n              aspect_ratios=[1.],\n              variance=[0.1, 0.1, 0.2, 0.2],\n              flip=False,\n              clip=False,\n              steps=[0.0, 0.0],\n              offset=0.5,\n              min_max_aspect_ratios_order=False,\n              name=None):\n    \"\"\"\n\n    This op generates prior boxes for the SSD (Single Shot MultiBox Detector) algorithm.\n    Each position of the input produces N prior boxes, N is determined by\n    the count of min_sizes, max_sizes and aspect_ratios. The size of the\n    box is in the range(min_size, max_size) interval, which is generated in\n    sequence according to the aspect_ratios.\n\n    Parameters:\n       input(Tensor): 4-D tensor(NCHW), the data type should be float32 or float64.\n       image(Tensor): 4-D tensor(NCHW), the input image data of PriorBoxOp,\n            the data type should be float32 or float64.\n       min_sizes(list|tuple|float): the min sizes of generated prior boxes.\n       max_sizes(list|tuple|None): the max sizes of generated prior boxes.\n            Default: None.\n       aspect_ratios(list|tuple|float): the aspect ratios of generated\n            prior boxes. Default: [1.].\n       variance(list|tuple): the variances to be encoded in prior boxes.\n            Default:[0.1, 0.1, 0.2, 0.2].\n       flip(bool): Whether to flip aspect ratios. Default:False.\n       clip(bool): Whether to clip out-of-boundary boxes. Default: False.\n       steps(list|tuple): Prior boxes step across width and height. If\n            steps[0] equals 0.0 or steps[1] equals 0.0, the prior boxes step across\n            height or width of the input will be automatically calculated.\n            Default: [0., 0.]\n       offset(float): Prior boxes center offset. Default: 0.5\n       min_max_aspect_ratios_order(bool): If set True, the output prior box is\n            in order of [min, max, aspect_ratios], which is consistent with\n            Caffe. Please note, this order affects the weight order of the\n            following convolution layer and does not affect the final\n            detection results. Default: False.\n       name(str, optional): The default value is None.  Normally there is no need for \n            user to set this property. 
For more information, please refer to :ref:`api_guide_Name`\n\n    Returns:\n        Tuple: A tuple with two Variable (boxes, variances)\n\n        boxes(Tensor): the output prior boxes of PriorBox.\n        4-D tensor, the layout is [H, W, num_priors, 4].\n        H is the height of input, W is the width of input,\n        num_priors is the total box count of each position of input.\n\n        variances(Tensor): the expanded variances of PriorBox.\n        4-D tensor, the layout is [H, W, num_priors, 4].\n        H is the height of input, W is the width of input,\n        num_priors is the total box count of each position of input\n\n    Examples:\n        .. code-block:: python\n\n        import paddle\n        from ppdet.modeling import ops\n\n        paddle.enable_static()\n        input = paddle.static.data(name=\"input\", shape=[None,3,6,9])\n        image = paddle.static.data(name=\"image\", shape=[None,3,9,12])\n        box, var = ops.prior_box(\n                    input=input,\n                    image=image,\n                    min_sizes=[100.],\n                    clip=True,\n                    flip=True)\n    \"\"\"\n    return paddle.vision.ops.prior_box(\n        input,\n        image,\n        min_sizes,\n        max_sizes,\n        aspect_ratios,\n        variance,\n        flip,\n        clip,\n        steps,\n        offset,\n        min_max_aspect_ratios_order,\n        name,\n    )\n\n\n@paddle.jit.not_to_static\ndef multiclass_nms(bboxes,\n                   scores,\n                   score_threshold,\n                   nms_top_k,\n                   keep_top_k,\n                   nms_threshold=0.3,\n                   normalized=True,\n                   nms_eta=1.,\n                   background_label=-1,\n                   return_index=False,\n                   return_rois_num=True,\n                   rois_num=None,\n                   name=None):\n    \"\"\"\n    This operator is to do multi-class non maximum suppression (NMS) on\n    boxes and scores.\n    In the NMS step, this operator greedily selects a subset of detection bounding\n    boxes that have high scores larger than score_threshold, if this\n    threshold is provided, then selects the largest nms_top_k confidence scores if nms_top_k\n    is larger than -1. Then this operator prunes away boxes that have high IOU\n    (intersection over union) overlap with already selected boxes by adaptive\n    threshold NMS based on parameters of nms_threshold and nms_eta.\n    After the NMS step, at most keep_top_k number of total bboxes are to be kept\n    per image if keep_top_k is larger than -1.\n    Args:\n        bboxes (Tensor): Two types of bboxes are supported:\n                           1. (Tensor) A 3-D Tensor with shape\n                           [N, M, 4 or 8, 16, 24, 32] represents the\n                           predicted locations of M bounding bboxes,\n                           N is the batch size. Each bounding box has four\n                           coordinate values and the layout is\n                           [xmin, ymin, xmax, ymax], when box size equals to 4.\n                           2. (LoDTensor) A 3-D Tensor with shape [M, C, 4]\n                           M is the number of bounding boxes, C is the\n                           class number\n        scores (Tensor): Two types of scores are supported:\n                           1. 
(Tensor) A 3-D Tensor with shape [N, C, M]\n                           represents the predicted confidence predictions.\n                           N is the batch size, C is the class number, M is\n                           number of bounding boxes. For each category there\n                           are in total M scores corresponding to the M bounding\n                           boxes. Please note, M is equal to the 2nd dimension\n                           of BBoxes.\n                           2. (LoDTensor) A 2-D LoDTensor with shape [M, C].\n                           M is the number of bbox, C is the class number.\n                           In this case, input BBoxes should be the second\n                           case with shape [M, C, 4].\n        background_label (int): The index of background label, the background\n                                label will be ignored. If set to -1, then all\n                                categories will be considered. Default: 0\n        score_threshold (float): Threshold to filter out bounding boxes with\n                                 low confidence score. If not provided,\n                                 consider all boxes.\n        nms_top_k (int): Maximum number of detections to be kept according to\n                         the confidences after the filtering detections based\n                         on score_threshold.\n        nms_threshold (float): The threshold to be used in NMS. Default: 0.3\n        nms_eta (float): The threshold to be used in NMS. Default: 1.0\n        keep_top_k (int): Number of total bboxes to be kept per image after NMS\n                          step. -1 means keeping all bboxes after NMS step.\n        normalized (bool): Whether detections are normalized. Default: True\n        return_index(bool): Whether return selected index. Default: False\n        rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. \n            The shape is [B] and data type is int32. B is the number of images.\n            If it is not None then return a list of 1-D Tensor. Each element \n            is the output RoIs' number of each image on the corresponding level\n            and the shape is [B]. None by default.\n        name(str): Name of the multiclass nms op. Default: None.\n    Returns:\n        A tuple with two Variables: (Out, Index) if return_index is True,\n        otherwise, a tuple with one Variable(Out) is returned.\n        Out: A 2-D LoDTensor with shape [No, 6] represents the detections.\n        Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax]\n        or A 2-D LoDTensor with shape [No, 10] represents the detections.\n        Each row has 10 values: [label, confidence, x1, y1, x2, y2, x3, y3,\n        x4, y4]. No is the total number of detections.\n        If no results are detected in any image, all elements in LoD will be\n        0, and the output tensor is empty (None).\n        Index: Only return when return_index is True. A 2-D LoDTensor with\n        shape [No, 1] represents the selected index, whose type is integer.\n        The index is the absolute value across batches. No is the same number\n        as Out. If the index is used to gather other attributes such as age,\n        one needs to reshape the input (N, M, 1) to (N * M, 1) first, where\n        N is the batch size and M is the number of boxes.\n    Examples:\n        .. 
code-block:: python\n\n            import paddle\n            from ppdet.modeling import ops\n            boxes = paddle.static.data(name='bboxes', shape=[81, 4],\n                                      dtype='float32', lod_level=1)\n            scores = paddle.static.data(name='scores', shape=[81],\n                                      dtype='float32', lod_level=1)\n            out, index = ops.multiclass_nms(bboxes=boxes,\n                                            scores=scores,\n                                            background_label=0,\n                                            score_threshold=0.5,\n                                            nms_top_k=400,\n                                            nms_threshold=0.3,\n                                            keep_top_k=200,\n                                            normalized=False,\n                                            return_index=True)\n    \"\"\"\n    helper = LayerHelper('multiclass_nms3', **locals())\n\n    if HAVE_PIR and in_dynamic_or_pir_mode():\n        # https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/phi/ops/yaml/ops.yaml#L3175\n        attrs = (score_threshold, nms_top_k, keep_top_k, nms_threshold, normalized, nms_eta, background_label, )\n        output, index, nms_rois_num = paddle._C_ops.multiclass_nms3(bboxes, scores, rois_num, *attrs)\n\n        if not return_index:\n            index = None\n        return output, nms_rois_num, index\n\n    elif in_dynamic_mode():\n        attrs = ('background_label', background_label, 'score_threshold',\n                 score_threshold, 'nms_top_k', nms_top_k, 'nms_threshold',\n                 nms_threshold, 'keep_top_k', keep_top_k, 'nms_eta', nms_eta,\n                 'normalized', normalized)\n        output, index, nms_rois_num = C_ops.multiclass_nms3(bboxes, scores,\n                                                            rois_num, *attrs)\n        if not return_index:\n            index = None\n        return output, nms_rois_num, index\n        \n    else:\n        output = helper.create_variable_for_type_inference(dtype=bboxes.dtype)\n        index = helper.create_variable_for_type_inference(dtype='int32')\n\n        inputs = {'BBoxes': bboxes, 'Scores': scores}\n        outputs = {'Out': output, 'Index': index}\n\n        if rois_num is not None:\n            inputs['RoisNum'] = rois_num\n\n        if return_rois_num:\n            nms_rois_num = helper.create_variable_for_type_inference(\n                dtype='int32')\n            outputs['NmsRoisNum'] = nms_rois_num\n\n        helper.append_op(\n            type=\"multiclass_nms3\",\n            inputs=inputs,\n            attrs={\n                'background_label': background_label,\n                'score_threshold': score_threshold,\n                'nms_top_k': nms_top_k,\n                'nms_threshold': nms_threshold,\n                'keep_top_k': keep_top_k,\n                'nms_eta': nms_eta,\n                'normalized': normalized\n            },\n            outputs=outputs)\n        output.stop_gradient = True\n        index.stop_gradient = True\n        if not return_index:\n            index = None\n        if not return_rois_num:\n            nms_rois_num = None\n\n        return output, nms_rois_num, index\n\n\n@paddle.jit.not_to_static\ndef matrix_nms(bboxes,\n               scores,\n               score_threshold,\n               post_threshold,\n               nms_top_k,\n               keep_top_k,\n               use_gaussian=False,\n               
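# editorial note: gaussian_sigma below only takes effect when\n               # use_gaussian=True; otherwise the decay is linear\n               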
gaussian_sigma=2.,\n               background_label=0,\n               normalized=True,\n               return_index=False,\n               return_rois_num=True,\n               name=None):\n    \"\"\"\n    **Matrix NMS**\n    This operator does matrix non maximum suppression (NMS).\n    First selects a subset of candidate bounding boxes that have higher scores\n    than score_threshold (if provided), then the top k candidates are selected if\n    nms_top_k is larger than -1. Scores of the remaining candidates are then\n    decayed according to the Matrix NMS scheme.\n    After the NMS step, at most keep_top_k number of total bboxes are to be kept\n    per image if keep_top_k is larger than -1.\n    Args:\n        bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the\n                           predicted locations of M bounding bboxes,\n                           N is the batch size. Each bounding box has four\n                           coordinate values and the layout is\n                           [xmin, ymin, xmax, ymax], when box size equals to 4.\n                           The data type is float32 or float64.\n        scores (Tensor): A 3-D Tensor with shape [N, C, M]\n                           represents the predicted confidence predictions.\n                           N is the batch size, C is the class number, M is\n                           number of bounding boxes. For each category there\n                           are in total M scores corresponding to the M bounding\n                           boxes. Please note, M is equal to the 2nd dimension\n                           of BBoxes. The data type is float32 or float64.\n        score_threshold (float): Threshold to filter out bounding boxes with\n                                 low confidence score.\n        post_threshold (float): Threshold to filter out bounding boxes with\n                                low confidence score AFTER decaying.\n        nms_top_k (int): Maximum number of detections to be kept according to\n                         the confidences after the filtering detections based\n                         on score_threshold.\n        keep_top_k (int): Number of total bboxes to be kept per image after NMS\n                          step. -1 means keeping all bboxes after NMS step.\n        use_gaussian (bool): Use Gaussian as the decay function. Default: False\n        gaussian_sigma (float): Sigma for Gaussian decay function. Default: 2.0\n        background_label (int): The index of background label, the background\n                                label will be ignored. If set to -1, then all\n                                categories will be considered. Default: 0\n        normalized (bool): Whether detections are normalized. Default: True\n        return_index(bool): Whether return selected index. Default: False\n        return_rois_num(bool): whether return rois_num. Default: True\n        name(str): Name of the matrix nms op. 
Default: None.\n    Returns:\n        A tuple with three Tensor: (Out, Index, RoisNum) if return_index is True,\n        otherwise, a tuple with two Tensor (Out, RoisNum) is returned.\n        Out (Tensor): A 2-D Tensor with shape [No, 6] containing the\n             detection results.\n             Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax]\n             (After version 1.3, when no boxes detected, the lod is changed\n             from {0} to {1})\n        Index (Tensor): A 2-D Tensor with shape [No, 1] containing the\n            selected indices, which are absolute values across batches.\n        rois_num (Tensor): A 1-D Tensor with shape [N] containing \n            the number of detected boxes in each image.\n    Examples:\n        .. code-block:: python\n            import paddle\n            from ppdet.modeling import ops\n            boxes = paddle.static.data(name='bboxes', shape=[None,81, 4],\n                                      dtype='float32', lod_level=1)\n            scores = paddle.static.data(name='scores', shape=[None,81],\n                                      dtype='float32', lod_level=1)\n            out = ops.matrix_nms(bboxes=boxes, scores=scores, background_label=0,\n                                 score_threshold=0.5, post_threshold=0.1,\n                                 nms_top_k=400, keep_top_k=200, normalized=False)\n    \"\"\"\n    check_variable_and_dtype(bboxes, 'BBoxes', ['float32', 'float64'],\n                             'matrix_nms')\n    check_variable_and_dtype(scores, 'Scores', ['float32', 'float64'],\n                             'matrix_nms')\n    check_type(score_threshold, 'score_threshold', float, 'matrix_nms')\n    check_type(post_threshold, 'post_threshold', float, 'matrix_nms')\n    check_type(nms_top_k, 'nms_top_k', int, 'matrix_nms')\n    check_type(keep_top_k, 'keep_top_k', int, 'matrix_nms')\n    check_type(normalized, 'normalized', bool, 'matrix_nms')\n    check_type(use_gaussian, 'use_gaussian', bool, 'matrix_nms')\n    check_type(gaussian_sigma, 'gaussian_sigma', float, 'matrix_nms')\n    check_type(background_label, 'background_label', int, 'matrix_nms')\n\n    if in_dynamic_mode():\n        attrs = ('background_label', background_label, 'score_threshold',\n                 score_threshold, 'post_threshold', post_threshold, 'nms_top_k',\n                 nms_top_k, 'gaussian_sigma', gaussian_sigma, 'use_gaussian',\n                 use_gaussian, 'keep_top_k', keep_top_k, 'normalized',\n                 normalized)\n        out, index, rois_num = C_ops.matrix_nms(bboxes, scores, *attrs)\n        if not return_index:\n            index = None\n        if not return_rois_num:\n            rois_num = None\n        return out, rois_num, index\n    else:\n        helper = LayerHelper('matrix_nms', **locals())\n        output = helper.create_variable_for_type_inference(dtype=bboxes.dtype)\n        index = helper.create_variable_for_type_inference(dtype='int32')\n        outputs = {'Out': output, 'Index': index}\n        if return_rois_num:\n            rois_num = helper.create_variable_for_type_inference(dtype='int32')\n            outputs['RoisNum'] = rois_num\n\n        helper.append_op(\n            type=\"matrix_nms\",\n            inputs={'BBoxes': bboxes,\n                    'Scores': scores},\n            attrs={\n                'background_label': background_label,\n                'score_threshold': score_threshold,\n                'post_threshold': post_threshold,\n                'nms_top_k': 
nms_top_k,\n                'gaussian_sigma': gaussian_sigma,\n                'use_gaussian': use_gaussian,\n                'keep_top_k': keep_top_k,\n                'normalized': normalized\n            },\n            outputs=outputs)\n        output.stop_gradient = True\n\n        if not return_index:\n            index = None\n        if not return_rois_num:\n            rois_num = None\n        return output, rois_num, index\n\n\n@paddle.jit.not_to_static\ndef box_coder(prior_box,\n              prior_box_var,\n              target_box,\n              code_type=\"encode_center_size\",\n              box_normalized=True,\n              axis=0,\n              name=None):\n    r\"\"\"\n    **Box Coder Layer**\n    Encode/Decode the target bounding box with the priorbox information.\n    \n    The Encoding schema is described below:\n    .. math::\n        ox = (tx - px) / pw / pxv\n        oy = (ty - py) / ph / pyv\n        ow = \\log(|tw / pw|) / pwv \n        oh = \\log(|th / ph|) / phv \n    The Decoding schema is described below:\n    \n    .. math::\n  \n        ow = \\exp(pwv * tw) * pw\n        oh = \\exp(phv * th) * ph\n        ox = pw * pxv * tx + px - ow / 2\n        oy = ph * pyv * ty + py - oh / 2   \n    where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, \n    width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote \n    the priorbox's (anchor) center coordinates, width and height. `pxv`, \n    `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, \n    `ow`, `oh` denote the encoded/decoded coordinates, width and height. \n    During Box Decoding, two modes for broadcast are supported. Say target \n    box has shape [N, M, 4], and the shape of prior box can be [N, 4] or \n    [M, 4]. Then prior box will broadcast to target box along the \n    assigned axis. \n\n    Args:\n        prior_box(Tensor): Box list prior_box is a 2-D Tensor with shape \n            [M, 4] holds M boxes and data type is float32 or float64. Each box\n            is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is the \n            left top coordinate of the anchor box, if the input is image feature\n            map, they are close to the origin of the coordinate system. \n            [xmax, ymax] is the right bottom coordinate of the anchor box.       \n        prior_box_var(List|Tensor|None): prior_box_var supports three types \n            of input. One is Tensor with shape [M, 4] which holds M groups and \n            data type is float32 or float64. The second is a list consisting of \n            4 elements shared by all boxes and data type is float32 or float64. \n            The third is None, which is not involved in calculation. \n        target_box(Tensor): This input can be a 2-D LoDTensor with shape \n            [N, 4] when code_type is 'encode_center_size'. This input also can \n            be a 3-D Tensor with shape [N, M, 4] when code_type is \n            'decode_center_size'. Each box is represented as \n            [xmin, ymin, xmax, ymax]. The data type is float32 or float64. \n        code_type(str): The code type used with the target box. It can be\n            `encode_center_size` or `decode_center_size`. 
`encode_center_size` \n            by default.\n        box_normalized(bool): Whether treat the priorbox as a normalized box.\n            Set true by default.\n        axis(int): Which axis in PriorBox to broadcast for box decode, \n            for example, if axis is 0 and TargetBox has shape [N, M, 4] and \n            PriorBox has shape [M, 4], then PriorBox will broadcast to [N, M, 4]\n            for decoding. It is only valid when code type is \n            `decode_center_size`. Set 0 by default. \n        name(str, optional): For detailed information, please refer \n            to :ref:`api_guide_Name`. Usually name is no need to set and \n            None by default. \n\n    Returns:\n        Tensor:\n        output_box(Tensor): When code_type is 'encode_center_size', the \n        output tensor of box_coder_op with shape [N, M, 4] representing the \n        result of N target boxes encoded with M Prior boxes and variances. \n        When code_type is 'decode_center_size', N represents the batch size \n        and M represents the number of decoded boxes.\n\n    Examples:\n \n        .. code-block:: python\n \n            import paddle\n            from ppdet.modeling import ops\n            paddle.enable_static()\n            # For encode\n            prior_box_encode = paddle.static.data(name='prior_box_encode',\n                                  shape=[512, 4],\n                                  dtype='float32')\n            target_box_encode = paddle.static.data(name='target_box_encode',\n                                   shape=[81, 4],\n                                   dtype='float32')\n            output_encode = ops.box_coder(prior_box=prior_box_encode,\n                                    prior_box_var=[0.1,0.1,0.2,0.2],\n                                    target_box=target_box_encode,\n                                    code_type=\"encode_center_size\")\n            # For decode\n            prior_box_decode = paddle.static.data(name='prior_box_decode',\n                                  shape=[512, 4],\n                                  dtype='float32')\n            target_box_decode = paddle.static.data(name='target_box_decode',\n                                   shape=[512, 81, 4],\n                                   dtype='float32')\n            output_decode = ops.box_coder(prior_box=prior_box_decode,\n                                    prior_box_var=[0.1,0.1,0.2,0.2],\n                                    target_box=target_box_decode,\n                                    code_type=\"decode_center_size\",\n                                    box_normalized=False,\n                                    axis=1)\n    \"\"\"\n    check_variable_and_dtype(prior_box, 'prior_box', ['float32', 'float64'],\n                             'box_coder')\n    check_variable_and_dtype(target_box, 'target_box', ['float32', 'float64'],\n                             'box_coder')\n\n    if in_dynamic_mode():\n        if isinstance(prior_box_var, Variable):\n            output_box = C_ops.box_coder(\n                prior_box, prior_box_var, target_box, \"code_type\", code_type,\n                \"box_normalized\", box_normalized, \"axis\", axis)\n\n        elif isinstance(prior_box_var, list):\n            output_box = C_ops.box_coder(\n                prior_box, None, target_box, \"code_type\", code_type,\n                \"box_normalized\", box_normalized, \"axis\", axis, \"variance\",\n                prior_box_var)\n        else:\n            raise TypeError(\n                
\"Input variance of box_coder must be Variable or list\")\n        return output_box\n    else:\n        helper = LayerHelper(\"box_coder\", **locals())\n\n        output_box = helper.create_variable_for_type_inference(\n            dtype=prior_box.dtype)\n\n        inputs = {\"PriorBox\": prior_box, \"TargetBox\": target_box}\n        attrs = {\n            \"code_type\": code_type,\n            \"box_normalized\": box_normalized,\n            \"axis\": axis\n        }\n        if isinstance(prior_box_var, Variable):\n            inputs['PriorBoxVar'] = prior_box_var\n        elif isinstance(prior_box_var, list):\n            attrs['variance'] = prior_box_var\n        else:\n            raise TypeError(\n                \"Input variance of box_coder must be Variable or list\")\n        helper.append_op(\n            type=\"box_coder\",\n            inputs=inputs,\n            attrs=attrs,\n            outputs={\"OutputBox\": output_box})\n        return output_box\n\n\n@paddle.jit.not_to_static\ndef generate_proposals(scores,\n                       bbox_deltas,\n                       im_shape,\n                       anchors,\n                       variances,\n                       pre_nms_top_n=6000,\n                       post_nms_top_n=1000,\n                       nms_thresh=0.5,\n                       min_size=0.1,\n                       eta=1.0,\n                       pixel_offset=False,\n                       return_rois_num=False,\n                       name=None):\n    \"\"\"\n    **Generate proposal Faster-RCNN**\n    This operation proposes RoIs according to each box with their\n    probability to be a foreground object and \n    the box can be calculated by anchors. Bbox_deltais and scores\n    to be an object are the output of RPN. Final proposals\n    could be used to train detection net.\n    For generating proposals, this operation performs following steps:\n    1. Transposes and resizes scores and bbox_deltas in size of\n       (H*W*A, 1) and (H*W*A, 4)\n    2. Calculate box locations as proposals candidates. \n    3. Clip boxes to image\n    4. Remove predicted boxes with small area. \n    5. Apply NMS to get final proposals as output.\n    Args:\n        scores(Tensor): A 4-D Tensor with shape [N, A, H, W] represents\n            the probability for each box to be an object.\n            N is batch size, A is number of anchors, H and W are height and\n            width of the feature map. The data type must be float32.\n        bbox_deltas(Tensor): A 4-D Tensor with shape [N, 4*A, H, W]\n            represents the difference between predicted box location and\n            anchor location. The data type must be float32.\n        im_shape(Tensor): A 2-D Tensor with shape [N, 2] represents H, W, the\n            origin image size or input size. The data type can be float32 or \n            float64.\n        anchors(Tensor):   A 4-D Tensor represents the anchors with a layout\n            of [H, W, A, 4]. H and W are height and width of the feature map,\n            num_anchors is the box count of each position. Each anchor is\n            in (xmin, ymin, xmax, ymax) format an unnormalized. The data type must be float32.\n        variances(Tensor): A 4-D Tensor. The expanded variances of anchors with a layout of\n            [H, W, num_priors, 4]. Each variance is in\n            (xcenter, ycenter, w, h) format. The data type must be float32.\n        pre_nms_top_n(float): Number of total bboxes to be kept per\n            image before NMS. 
`6000` by default.\n        post_nms_top_n(int): Number of total bboxes to be kept per\n            image after NMS. `1000` by default.\n        nms_thresh(float): Threshold in NMS. The data type must be float32. `0.5` by default.\n        min_size(float): Remove predicted boxes with either height or\n            width < min_size. The data type must be float32. `0.1` by default.\n        eta(float): Apply in adaptive NMS, if adaptive `threshold > 0.5`,\n            `adaptive_threshold = adaptive_threshold * eta` in each iteration.\n        return_rois_num(bool): When set to True, a 1D Tensor with shape [N, ] is returned that holds the\n            RoI number of each image in the batch, where N is the number of images. For example, the values\n            [4, 5] mean the first image has 4 RoIs and the second image has 5 RoIs. It is only used in RCNN models.\n            'False' by default.\n        name(str, optional): For detailed information, please refer\n            to :ref:`api_guide_Name`. Usually name does not need to be set and\n            is None by default.\n\n    Returns:\n        tuple:\n        A tuple with format ``(rpn_rois, rpn_roi_probs)``.\n        - **rpn_rois**: The generated RoIs. 2-D Tensor with shape ``[N, 4]`` while ``N`` is the number of RoIs. The data type is the same as ``scores``.\n        - **rpn_roi_probs**: The scores of generated RoIs. 2-D Tensor with shape ``[N, 1]`` while ``N`` is the number of RoIs. The data type is the same as ``scores``.\n\n    Examples:\n        .. code-block:: python\n\n            import paddle\n            from ppdet.modeling import ops\n            paddle.enable_static()\n            scores = paddle.static.data(name='scores', shape=[None, 4, 5, 5], dtype='float32')\n            bbox_deltas = paddle.static.data(name='bbox_deltas', shape=[None, 16, 5, 5], dtype='float32')\n            im_shape = paddle.static.data(name='im_shape', shape=[None, 2], dtype='float32')\n            anchors = paddle.static.data(name='anchors', shape=[None, 5, 4, 4], dtype='float32')\n            variances = paddle.static.data(name='variances', shape=[None, 5, 10, 4], dtype='float32')\n            rois, roi_probs = ops.generate_proposals(scores, bbox_deltas,\n                         im_shape, anchors, variances)\n    \"\"\"\n    if in_dynamic_mode():\n        assert return_rois_num, \"return_rois_num should be True in dygraph mode.\"\n        attrs = ('pre_nms_topN', pre_nms_top_n, 'post_nms_topN', post_nms_top_n,\n                 'nms_thresh', nms_thresh, 'min_size', min_size, 'eta', eta,\n                 'pixel_offset', pixel_offset)\n        rpn_rois, rpn_roi_probs, rpn_rois_num = C_ops.generate_proposals_v2(\n            scores, bbox_deltas, im_shape, anchors, variances, *attrs)\n        if not return_rois_num:\n            rpn_rois_num = None\n        return rpn_rois, rpn_roi_probs, rpn_rois_num\n\n    else:\n        helper = LayerHelper('generate_proposals_v2', **locals())\n\n        check_variable_and_dtype(scores, 'scores', ['float32'],\n                                 'generate_proposals_v2')\n        check_variable_and_dtype(bbox_deltas, 'bbox_deltas', ['float32'],\n                                 'generate_proposals_v2')\n        check_variable_and_dtype(im_shape, 'im_shape', ['float32', 'float64'],\n                                 'generate_proposals_v2')\n        check_variable_and_dtype(anchors, 'anchors', ['float32'],\n                                 
'generate_proposals_v2')\n        check_variable_and_dtype(variances, 'variances', ['float32'],\n                                 'generate_proposals_v2')\n\n        rpn_rois = helper.create_variable_for_type_inference(\n            dtype=bbox_deltas.dtype)\n        rpn_roi_probs = helper.create_variable_for_type_inference(\n            dtype=scores.dtype)\n        outputs = {\n            'RpnRois': rpn_rois,\n            'RpnRoiProbs': rpn_roi_probs,\n        }\n        if return_rois_num:\n            rpn_rois_num = helper.create_variable_for_type_inference(\n                dtype='int32')\n            rpn_rois_num.stop_gradient = True\n            outputs['RpnRoisNum'] = rpn_rois_num\n\n        helper.append_op(\n            type=\"generate_proposals_v2\",\n            inputs={\n                'Scores': scores,\n                'BboxDeltas': bbox_deltas,\n                'ImShape': im_shape,\n                'Anchors': anchors,\n                'Variances': variances\n            },\n            attrs={\n                'pre_nms_topN': pre_nms_top_n,\n                'post_nms_topN': post_nms_top_n,\n                'nms_thresh': nms_thresh,\n                'min_size': min_size,\n                'eta': eta,\n                'pixel_offset': pixel_offset\n            },\n            outputs=outputs)\n        rpn_rois.stop_gradient = True\n        rpn_roi_probs.stop_gradient = True\n        if not return_rois_num:\n            rpn_rois_num = None\n\n        return rpn_rois, rpn_roi_probs, rpn_rois_num\n\n\ndef sigmoid_cross_entropy_with_logits(input,\n                                      label,\n                                      ignore_index=-100,\n                                      normalize=False):\n    output = F.binary_cross_entropy_with_logits(input, label, reduction='none')\n    mask_tensor = paddle.cast(label != ignore_index, 'float32')\n    output = paddle.multiply(output, mask_tensor)\n    if normalize:\n        sum_valid_mask = paddle.sum(mask_tensor)\n        output = output / sum_valid_mask\n    return output\n\n\ndef smooth_l1(input, label, inside_weight=None, outside_weight=None,\n              sigma=None):\n    input_new = paddle.multiply(input, inside_weight)\n    label_new = paddle.multiply(label, inside_weight)\n    delta = 1 / (sigma * sigma)\n    out = F.smooth_l1_loss(input_new, label_new, reduction='none', delta=delta)\n    out = paddle.multiply(out, outside_weight)\n    out = out / delta\n    out = paddle.reshape(out, shape=[out.shape[0], -1])\n    out = paddle.sum(out, axis=1)\n    return out\n\n\ndef channel_shuffle(x, groups):\n    batch_size, num_channels, height, width = x.shape[0:4]\n    assert num_channels % groups == 0, 'num_channels should be divisible by groups'\n    channels_per_group = num_channels // groups\n    x = paddle.reshape(\n        x=x, shape=[batch_size, groups, channels_per_group, height, width])\n    x = paddle.transpose(x=x, perm=[0, 2, 1, 3, 4])\n    x = paddle.reshape(x=x, shape=[batch_size, num_channels, height, width])\n    return x\n\n\ndef get_static_shape(tensor):\n    shape = paddle.shape(tensor)\n    shape.stop_gradient = True\n    return shape\n"
  },
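To make the corrected box_coder encode/decode schema above concrete, here is a minimal NumPy round-trip sketch (illustrative only, not part of ppdet; the box and variance values are made up): encoding a target box against a prior box and then decoding the result recovers the original corners.

.. code-block:: python

    import numpy as np

    prior = np.array([10., 10., 50., 30.])    # prior box [xmin, ymin, xmax, ymax]
    target = np.array([12., 14., 48., 34.])   # target box in the same format
    var = np.array([0.1, 0.1, 0.2, 0.2])      # pxv, pyv, pwv, phv

    px, py = (prior[:2] + prior[2:]) / 2      # prior center
    pw, ph = prior[2:] - prior[:2]            # prior width/height
    tx, ty = (target[:2] + target[2:]) / 2
    tw, th = target[2:] - target[:2]

    # encode_center_size
    ox = (tx - px) / pw / var[0]
    oy = (ty - py) / ph / var[1]
    ow = np.log(tw / pw) / var[2]
    oh = np.log(th / ph) / var[3]

    # decode_center_size: invert the encoding, then emit corners
    cx = pw * var[0] * ox + px
    cy = ph * var[1] * oy + py
    dw = np.exp(var[2] * ow) * pw
    dh = np.exp(var[3] * oh) * ph
    decoded = np.array([cx - dw / 2, cy - dh / 2, cx + dw / 2, cy + dh / 2])
    assert np.allclose(decoded, target)       # the round trip recovers the box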
  {
    "path": "ppdet/modeling/post_process.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling.bbox_utils import nonempty_bbox\nfrom .transformers import bbox_cxcywh_to_xyxy\ntry:\n    from collections.abc import Sequence\nexcept Exception:\n    from collections import Sequence\n\n__all__ = [\n    'BBoxPostProcess', 'MaskPostProcess', 'JDEBBoxPostProcess',\n    'CenterNetPostProcess', 'DETRPostProcess', 'SparsePostProcess',\n    'DETRBBoxSemiPostProcess'\n]\n\n\n@register\nclass BBoxPostProcess(object):\n    __shared__ = ['num_classes', 'export_onnx', 'export_eb']\n    __inject__ = ['decode', 'nms']\n\n    def __init__(self,\n                 num_classes=80,\n                 decode=None,\n                 nms=None,\n                 export_onnx=False,\n                 export_eb=False):\n        super(BBoxPostProcess, self).__init__()\n        self.num_classes = num_classes\n        self.decode = decode\n        self.nms = nms\n        self.export_onnx = export_onnx\n        self.export_eb = export_eb\n\n    def __call__(self, head_out, rois, im_shape, scale_factor):\n        \"\"\"\n        Decode the bbox and do NMS if needed.\n\n        Args:\n            head_out (tuple): bbox_pred and cls_prob of bbox_head output.\n            rois (tuple): roi and rois_num of rpn_head output.\n            im_shape (Tensor): The shape of the input image.\n            scale_factor (Tensor): The scale factor of the input image.\n            export_onnx (bool): whether export model to onnx\n        Returns:\n            bbox_pred (Tensor): The output prediction with shape [N, 6], including\n                labels, scores and bboxes. 
The sizes of the bboxes correspond\n                to the input image, and the bboxes may be used in other branches.\n            bbox_num (Tensor): The number of prediction boxes of each batch with\n                shape [1], and is N.\n        \"\"\"\n        if self.nms is not None:\n            bboxes, score = self.decode(head_out, rois, im_shape, scale_factor)\n            bbox_pred, bbox_num, before_nms_indexes = self.nms(bboxes, score,\n                                                               self.num_classes)\n\n        else:\n            bbox_pred, bbox_num = self.decode(head_out, rois, im_shape,\n                                              scale_factor)\n\n        if self.export_onnx:\n            # add a fake box after postprocess when exporting to onnx\n            fake_bboxes = paddle.to_tensor(\n                np.array(\n                    [[0., 0.0, 0.0, 0.0, 1.0, 1.0]], dtype='float32'))\n\n            bbox_pred = paddle.concat([bbox_pred, fake_bboxes])\n            bbox_num = bbox_num + 1\n\n        if self.nms is not None:\n            return bbox_pred, bbox_num, before_nms_indexes\n        else:\n            return bbox_pred, bbox_num\n\n    def get_pred(self, bboxes, bbox_num, im_shape, scale_factor):\n        \"\"\"\n        Rescale, clip and filter the bbox from the output of NMS to\n        get the final prediction.\n\n        Note:\n        Currently only supports bs = 1.\n\n        Args:\n            bboxes (Tensor): The output bboxes with shape [N, 6] after decode\n                and NMS, including labels, scores and bboxes.\n            bbox_num (Tensor): The number of prediction boxes of each batch with\n                shape [1], and is N.\n            im_shape (Tensor): The shape of the input image.\n            scale_factor (Tensor): The scale factor of the input image.\n        Returns:\n            pred_result (Tensor): The final prediction results with shape [N, 6]\n                including labels, scores and bboxes.\n        \"\"\"\n        if self.export_eb:\n            # enable rcnn models for edgeboard hw to skip the following postprocess.\n            return bboxes, bboxes, bbox_num\n\n        if not self.export_onnx:\n            bboxes_list = []\n            bbox_num_list = []\n            id_start = 0\n            fake_bboxes = paddle.to_tensor(\n                np.array(\n                    [[0., 0.0, 0.0, 0.0, 1.0, 1.0]], dtype='float32'))\n            fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32'))\n\n            # add fake bbox when output is empty for each batch\n            for i in range(bbox_num.shape[0]):\n                if bbox_num[i] == 0:\n                    bboxes_i = fake_bboxes\n                    bbox_num_i = fake_bbox_num\n                else:\n                    bboxes_i = bboxes[id_start:id_start + bbox_num[i], :]\n                    bbox_num_i = bbox_num[i:i + 1]\n                    # id_start: 0-dim, bbox_num: 1-dim. 
Use bbox_num[i] instead of bbox_num[i:i+1] in pir.\n                    id_start += bbox_num[i]\n                bboxes_list.append(bboxes_i)\n                bbox_num_list.append(bbox_num_i)\n            bboxes = paddle.concat(bboxes_list)\n            bbox_num = paddle.concat(bbox_num_list)\n\n        origin_shape = paddle.floor(im_shape / scale_factor + 0.5)\n\n        if not self.export_onnx:\n            origin_shape_list = []\n            scale_factor_list = []\n            # scale_factor: scale_y, scale_x\n            for i in range(bbox_num.shape[0]):\n                expand_shape = paddle.expand(origin_shape[i:i + 1, :],\n                                             [bbox_num[i:i + 1], 2])                          \n                scale_y, scale_x = scale_factor[i, 0], scale_factor[i, 1]\n                # TODO(PIR): something wrong with slice op, remove unsqueeze in the future.\n                scale_y = paddle.unsqueeze(scale_y, 0)\n                scale_x = paddle.unsqueeze(scale_x, 0)\n                scale = paddle.concat([scale_x, scale_y, scale_x, scale_y])\n                expand_scale = paddle.expand(scale, [bbox_num[i:i + 1], 4])\n                origin_shape_list.append(expand_shape)\n                scale_factor_list.append(expand_scale)\n\n            self.origin_shape_list = paddle.concat(origin_shape_list)\n            scale_factor_list = paddle.concat(scale_factor_list)\n\n        else:\n            # simplify the computation for bs=1 when exporting onnx\n            scale_y, scale_x = scale_factor[0][0], scale_factor[0][1]\n            scale = paddle.concat(\n                [scale_x, scale_y, scale_x, scale_y]).unsqueeze(0)\n            self.origin_shape_list = paddle.expand(origin_shape,\n                                                   [bbox_num[0:1], 2])\n            scale_factor_list = paddle.expand(scale, [bbox_num[0:1], 4])\n\n        # bboxes: [N, 6], label, score, bbox\n        pred_label = bboxes[:, 0:1]\n        pred_score = bboxes[:, 1:2]\n        pred_bbox = bboxes[:, 2:]\n        # rescale bbox to original image\n        scaled_bbox = pred_bbox / scale_factor_list\n        origin_h = self.origin_shape_list[:, 0]\n        origin_w = self.origin_shape_list[:, 1]\n        zeros = paddle.zeros_like(origin_h)\n        # clip bbox to [0, original_size]\n        x1 = paddle.maximum(paddle.minimum(scaled_bbox[:, 0], origin_w), zeros)\n        y1 = paddle.maximum(paddle.minimum(scaled_bbox[:, 1], origin_h), zeros)\n        x2 = paddle.maximum(paddle.minimum(scaled_bbox[:, 2], origin_w), zeros)\n        y2 = paddle.maximum(paddle.minimum(scaled_bbox[:, 3], origin_h), zeros)\n        pred_bbox = paddle.stack([x1, y1, x2, y2], axis=-1)\n        # filter empty bbox\n        keep_mask = nonempty_bbox(pred_bbox, return_mask=True)\n        keep_mask = paddle.unsqueeze(keep_mask, [1])\n        pred_label = paddle.where(keep_mask, pred_label,\n                                  paddle.ones_like(pred_label) * -1)\n        pred_result = paddle.concat([pred_label, pred_score, pred_bbox], axis=1)\n        return bboxes, pred_result, bbox_num\n\n    def get_origin_shape(self, ):\n        return self.origin_shape_list\n\n\n@register\nclass MaskPostProcess(object):\n    __shared__ = ['export_onnx', 'assign_on_cpu']\n    \"\"\"\n    refer to:\n    https://github.com/facebookresearch/detectron2/layers/mask_ops.py\n\n    Get Mask output according to the output from model\n    \"\"\"\n\n    def __init__(self,\n                 binary_thresh=0.5,\n                 
export_onnx=False,\n                 assign_on_cpu=False):\n        super(MaskPostProcess, self).__init__()\n        self.binary_thresh = binary_thresh\n        self.export_onnx = export_onnx\n        self.assign_on_cpu = assign_on_cpu\n\n    def __call__(self, mask_out, bboxes, bbox_num, origin_shape):\n        \"\"\"\n        Decode the mask_out and paste the mask to the origin image.\n\n        Args:\n            mask_out (Tensor): mask_head output with shape [N, 28, 28].\n            bbox_pred (Tensor): The output bboxes with shape [N, 6] after decode\n                and NMS, including labels, scores and bboxes.\n            bbox_num (Tensor): The number of prediction boxes of each batch with\n                shape [1], and is N.\n            origin_shape (Tensor): The origin shape of the input image, the tensor\n                shape is [N, 2], and each row is [h, w].\n        Returns:\n            pred_result (Tensor): The final prediction mask results with shape\n                [N, h, w] in binary mask style.\n        \"\"\"\n        num_mask = mask_out.shape[0]\n        origin_shape = paddle.cast(origin_shape, 'int32')\n        device = paddle.device.get_device()\n\n        if self.export_onnx:\n            h, w = origin_shape[0][0], origin_shape[0][1]\n            mask_onnx = paste_mask(mask_out[:, None, :, :], bboxes[:, 2:], h, w,\n                                   self.assign_on_cpu)\n            mask_onnx = mask_onnx >= self.binary_thresh\n            pred_result = paddle.cast(mask_onnx, 'int32')\n\n        else:\n            max_h = paddle.max(origin_shape[:, 0])\n            max_w = paddle.max(origin_shape[:, 1])\n            pred_result = paddle.zeros(\n                [num_mask, max_h, max_w], dtype='int32') - 1\n\n            id_start = 0\n            for i in range(bbox_num.shape[0]):\n                bboxes_i = bboxes[id_start:id_start + bbox_num[i], :]\n                mask_out_i = mask_out[id_start:id_start + bbox_num[i], :, :]\n                im_h = origin_shape[i, 0]\n                im_w = origin_shape[i, 1]\n                pred_mask = paste_mask(mask_out_i[:, None, :, :],\n                                       bboxes_i[:, 2:], im_h, im_w,\n                                       self.assign_on_cpu)\n                pred_mask = paddle.cast(pred_mask >= self.binary_thresh,\n                                        'int32')\n                pred_result[id_start:id_start + bbox_num[i], :im_h, :\n                            im_w] = pred_mask\n                id_start += bbox_num[i]\n        if self.assign_on_cpu:\n            paddle.set_device(device)\n\n        return pred_result\n\n\n@register\nclass JDEBBoxPostProcess(nn.Layer):\n    __shared__ = ['num_classes']\n    __inject__ = ['decode', 'nms']\n\n    def __init__(self, num_classes=1, decode=None, nms=None, return_idx=True):\n        super(JDEBBoxPostProcess, self).__init__()\n        self.num_classes = num_classes\n        self.decode = decode\n        self.nms = nms\n        self.return_idx = return_idx\n\n        self.fake_bbox_pred = paddle.to_tensor(\n            np.array(\n                [[-1, 0.0, 0.0, 0.0, 0.0, 0.0]], dtype='float32'))\n        self.fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32'))\n        self.fake_nms_keep_idx = paddle.to_tensor(\n            np.array(\n                [[0]], dtype='int32'))\n\n        self.fake_yolo_boxes_out = paddle.to_tensor(\n            np.array(\n                [[[0.0, 0.0, 0.0, 0.0]]], dtype='float32'))\n        self.fake_yolo_scores_out = 
paddle.to_tensor(\n            np.array(\n                [[[0.0]]], dtype='float32'))\n        self.fake_boxes_idx = paddle.to_tensor(np.array([[0]], dtype='int64'))\n\n    def forward(self, head_out, anchors):\n        \"\"\"\n        Decode the bbox and do NMS for JDE model. \n\n        Args:\n            head_out (list): Bbox_pred and cls_prob of bbox_head output.\n            anchors (list): Anchors of JDE model.\n\n        Returns:\n            boxes_idx (Tensor): The index of kept bboxes after decode 'JDEBox'. \n            bbox_pred (Tensor): The output is the prediction with shape [N, 6]\n                including labels, scores and bboxes.\n            bbox_num (Tensor): The number of prediction of each batch with shape [N].\n            nms_keep_idx (Tensor): The index of kept bboxes after NMS. \n        \"\"\"\n        boxes_idx, yolo_boxes_scores = self.decode(head_out, anchors)\n\n        if len(boxes_idx) == 0:\n            boxes_idx = self.fake_boxes_idx\n            yolo_boxes_out = self.fake_yolo_boxes_out\n            yolo_scores_out = self.fake_yolo_scores_out\n        else:\n            yolo_boxes = paddle.gather_nd(yolo_boxes_scores, boxes_idx)\n            # TODO: only support bs=1 now\n            yolo_boxes_out = paddle.reshape(\n                yolo_boxes[:, :4], shape=[1, len(boxes_idx), 4])\n            yolo_scores_out = paddle.reshape(\n                yolo_boxes[:, 4:5], shape=[1, 1, len(boxes_idx)])\n            boxes_idx = boxes_idx[:, 1:]\n\n        if self.return_idx:\n            bbox_pred, bbox_num, nms_keep_idx = self.nms(\n                yolo_boxes_out, yolo_scores_out, self.num_classes)\n            if bbox_pred.shape[0] == 0:\n                bbox_pred = self.fake_bbox_pred\n                bbox_num = self.fake_bbox_num\n                nms_keep_idx = self.fake_nms_keep_idx\n            return boxes_idx, bbox_pred, bbox_num, nms_keep_idx\n        else:\n            bbox_pred, bbox_num, _ = self.nms(yolo_boxes_out, yolo_scores_out,\n                                              self.num_classes)\n            if bbox_pred.shape[0] == 0:\n                bbox_pred = self.fake_bbox_pred\n                bbox_num = self.fake_bbox_num\n            return _, bbox_pred, bbox_num, _\n\n\n@register\nclass CenterNetPostProcess(object):\n    \"\"\"\n    Postprocess the model outputs to get final prediction:\n        1. Do NMS for heatmap to get top `max_per_img` bboxes.\n        2. Decode bboxes using center offset and box size.\n        3. Rescale decoded bboxes reference to the origin image shape.\n    Args:\n        max_per_img(int): the maximum number of predicted objects in a image,\n            500 by default.\n        down_ratio(int): the down ratio from images to heatmap, 4 by default.\n        regress_ltrb (bool): whether to regress left/top/right/bottom or\n            width/height for a box, true by default.\n    \"\"\"\n    __shared__ = ['down_ratio']\n\n    def __init__(self, max_per_img=500, down_ratio=4, regress_ltrb=True):\n        super(CenterNetPostProcess, self).__init__()\n        self.max_per_img = max_per_img\n        self.down_ratio = down_ratio\n        self.regress_ltrb = regress_ltrb\n        # _simple_nms() _topk() are same as TTFBox in ppdet/modeling/layers.py\n\n    def _simple_nms(self, heat, kernel=3):\n        \"\"\" Use maxpool to filter the max score, get local peaks. 
\"\"\"\n        pad = (kernel - 1) // 2\n        hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad)\n        keep = paddle.cast(hmax == heat, 'float32')\n        return heat * keep\n\n    def _topk(self, scores):\n        \"\"\" Select top k scores and decode to get xy coordinates. \"\"\"\n        k = self.max_per_img\n        shape_fm = paddle.shape(scores)\n        shape_fm.stop_gradient = True\n        cat, height, width = shape_fm[1], shape_fm[2], shape_fm[3]\n        # batch size is 1\n        scores_r = paddle.reshape(scores, [cat, -1])\n        topk_scores, topk_inds = paddle.topk(scores_r, k)\n        topk_ys = topk_inds // width\n        topk_xs = topk_inds % width\n\n        topk_score_r = paddle.reshape(topk_scores, [-1])\n        topk_score, topk_ind = paddle.topk(topk_score_r, k)\n        k_t = paddle.full(topk_ind.shape, k, dtype='int64')\n        topk_clses = paddle.cast(paddle.floor_divide(topk_ind, k_t), 'float32')\n\n        topk_inds = paddle.reshape(topk_inds, [-1])\n        topk_ys = paddle.reshape(topk_ys, [-1, 1])\n        topk_xs = paddle.reshape(topk_xs, [-1, 1])\n        topk_inds = paddle.gather(topk_inds, topk_ind)\n        topk_ys = paddle.gather(topk_ys, topk_ind)\n        topk_xs = paddle.gather(topk_xs, topk_ind)\n        return topk_score, topk_inds, topk_clses, topk_ys, topk_xs\n\n    def __call__(self, hm, wh, reg, im_shape, scale_factor):\n        # 1.get clses and scores, note that hm had been done sigmoid\n        heat = self._simple_nms(hm)\n        scores, inds, topk_clses, ys, xs = self._topk(heat)\n        clses = topk_clses.unsqueeze(1)\n        scores = scores.unsqueeze(1)\n\n        # 2.get bboxes, note only support batch_size=1 now\n        reg_t = paddle.transpose(reg, [0, 2, 3, 1])\n        reg = paddle.reshape(reg_t, [-1, reg_t.shape[-1]])\n        reg = paddle.gather(reg, inds)\n        xs = paddle.cast(xs, 'float32')\n        ys = paddle.cast(ys, 'float32')\n        xs = xs + reg[:, 0:1]\n        ys = ys + reg[:, 1:2]\n        wh_t = paddle.transpose(wh, [0, 2, 3, 1])\n        wh = paddle.reshape(wh_t, [-1, wh_t.shape[-1]])\n        wh = paddle.gather(wh, inds)\n        if self.regress_ltrb:\n            x1 = xs - wh[:, 0:1]\n            y1 = ys - wh[:, 1:2]\n            x2 = xs + wh[:, 2:3]\n            y2 = ys + wh[:, 3:4]\n        else:\n            x1 = xs - wh[:, 0:1] / 2\n            y1 = ys - wh[:, 1:2] / 2\n            x2 = xs + wh[:, 0:1] / 2\n            y2 = ys + wh[:, 1:2] / 2\n        n, c, feat_h, feat_w = paddle.shape(hm)\n        padw = (feat_w * self.down_ratio - im_shape[0, 1]) / 2\n        padh = (feat_h * self.down_ratio - im_shape[0, 0]) / 2\n        x1 = x1 * self.down_ratio\n        y1 = y1 * self.down_ratio\n        x2 = x2 * self.down_ratio\n        y2 = y2 * self.down_ratio\n        x1 = x1 - padw\n        y1 = y1 - padh\n        x2 = x2 - padw\n        y2 = y2 - padh\n        bboxes = paddle.concat([x1, y1, x2, y2], axis=1)\n        scale_y = scale_factor[:, 0:1]\n        scale_x = scale_factor[:, 1:2]\n        scale_expand = paddle.concat(\n            [scale_x, scale_y, scale_x, scale_y], axis=1)\n        boxes_shape = bboxes.shape[:]\n        scale_expand = paddle.expand(scale_expand, shape=boxes_shape)\n        bboxes = paddle.divide(bboxes, scale_expand)\n\n        results = paddle.concat([clses, scores, bboxes], axis=1)\n        return results, paddle.shape(results)[0:1], inds, topk_clses, ys, xs\n\n\n@register\nclass DETRPostProcess(object):\n    __shared__ = ['num_classes', 'use_focal_loss', 
'with_mask']\n    __inject__ = []\n\n    def __init__(self,\n                 num_classes=80,\n                 num_top_queries=100,\n                 dual_queries=False,\n                 dual_groups=0,\n                 use_focal_loss=False,\n                 with_mask=False,\n                 mask_stride=4,\n                 mask_threshold=0.5,\n                 use_avg_mask_score=False,\n                 bbox_decode_type='origin'):\n        super(DETRPostProcess, self).__init__()\n        assert bbox_decode_type in ['origin', 'pad']\n\n        self.num_classes = num_classes\n        self.num_top_queries = num_top_queries\n        self.dual_queries = dual_queries\n        self.dual_groups = dual_groups\n        self.use_focal_loss = use_focal_loss\n        self.with_mask = with_mask\n        self.mask_stride = mask_stride\n        self.mask_threshold = mask_threshold\n        self.use_avg_mask_score = use_avg_mask_score\n        self.bbox_decode_type = bbox_decode_type\n\n    def _mask_postprocess(self, mask_pred, score_pred):\n        mask_score = F.sigmoid(mask_pred)\n        mask_pred = (mask_score > self.mask_threshold).astype(mask_score.dtype)\n        if self.use_avg_mask_score:\n            avg_mask_score = (mask_pred * mask_score).sum([-2, -1]) / (\n                mask_pred.sum([-2, -1]) + 1e-6)\n            score_pred *= avg_mask_score\n\n        return mask_pred.flatten(0, 1).astype('int32'), score_pred\n\n    def __call__(self, head_out, im_shape, scale_factor, pad_shape):\n        \"\"\"\n        Decode the bbox and mask.\n\n        Args:\n            head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output.\n            im_shape (Tensor): The shape of the input image without padding.\n            scale_factor (Tensor): The scale factor of the input image.\n            pad_shape (Tensor): The shape of the input image with padding.\n        Returns:\n            bbox_pred (Tensor): The output prediction with shape [N, 6], including\n                labels, scores and bboxes. 
The sizes of the bboxes correspond\n                to the input image, and the bboxes may be used in other branches.\n            bbox_num (Tensor): The number of prediction boxes of each batch with\n                shape [bs], and is N.\n        \"\"\"\n        bboxes, logits, masks = head_out\n        if self.dual_queries:\n            num_queries = logits.shape[1]\n            logits, bboxes = logits[:, :int(num_queries // (self.dual_groups + 1)), :], \\\n                             bboxes[:, :int(num_queries // (self.dual_groups + 1)), :]\n\n        bbox_pred = bbox_cxcywh_to_xyxy(bboxes)\n        # calculate the original shape of the image\n        origin_shape = paddle.floor(im_shape / scale_factor + 0.5)\n        img_h, img_w = paddle.split(origin_shape, 2, axis=-1)\n        if self.bbox_decode_type == 'pad':\n            # calculate the shape of the image with padding\n            out_shape = pad_shape / im_shape * origin_shape\n            out_shape = out_shape.flip(1).tile([1, 2]).unsqueeze(1)\n        elif self.bbox_decode_type == 'origin':\n            out_shape = origin_shape.flip(1).tile([1, 2]).unsqueeze(1)\n        else:\n            raise Exception(\n                f'Wrong `bbox_decode_type`: {self.bbox_decode_type}.')\n        bbox_pred *= out_shape\n\n        scores = F.sigmoid(logits) if self.use_focal_loss else F.softmax(\n            logits)[:, :, :-1]\n\n        if not self.use_focal_loss:\n            scores, labels = scores.max(-1), scores.argmax(-1)\n            if scores.shape[1] > self.num_top_queries:\n                scores, index = paddle.topk(\n                    scores, self.num_top_queries, axis=-1)\n                batch_ind = paddle.arange(\n                    end=scores.shape[0]).unsqueeze(-1).tile(\n                        [1, self.num_top_queries])\n                index = paddle.stack([batch_ind, index], axis=-1)\n                labels = paddle.gather_nd(labels, index)\n                bbox_pred = paddle.gather_nd(bbox_pred, index)\n        else:\n            scores, index = paddle.topk(\n                scores.flatten(1), self.num_top_queries, axis=-1)\n            labels = index % self.num_classes\n            index = index // self.num_classes\n            batch_ind = paddle.arange(end=scores.shape[0]).unsqueeze(-1).tile(\n                [1, self.num_top_queries])\n            index = paddle.stack([batch_ind, index], axis=-1)\n            bbox_pred = paddle.gather_nd(bbox_pred, index)\n\n        mask_pred = None\n        if self.with_mask:\n            assert masks is not None\n            assert masks.shape[0] == 1\n            masks = paddle.gather_nd(masks, index)\n            if self.bbox_decode_type == 'pad':\n                masks = F.interpolate(\n                    masks,\n                    scale_factor=self.mask_stride,\n                    mode=\"bilinear\",\n                    align_corners=False)\n                # TODO: Support prediction with bs>1.\n                # remove padding for input image\n                h, w = im_shape.astype('int32')[0]\n                masks = masks[..., :h, :w]\n            # get pred_mask in the original resolution.\n            img_h = img_h[0].astype('int32')\n            img_w = img_w[0].astype('int32')\n            masks = F.interpolate(\n                masks,\n                size=[img_h, img_w],\n                mode=\"bilinear\",\n                align_corners=False)\n            mask_pred, scores = self._mask_postprocess(masks, scores)\n\n        bbox_pred = paddle.concat(\n            
[\n                labels.unsqueeze(-1).astype('float32'), scores.unsqueeze(-1),\n                bbox_pred\n            ],\n            axis=-1)\n        bbox_num = paddle.to_tensor(\n            self.num_top_queries, dtype='int32').tile([bbox_pred.shape[0]])\n        bbox_pred = bbox_pred.reshape([-1, 6])\n        return bbox_pred, bbox_num, mask_pred\n\n\n@register\nclass SparsePostProcess(object):\n    __shared__ = ['num_classes', 'assign_on_cpu']\n\n    def __init__(self,\n                 num_proposals,\n                 num_classes=80,\n                 binary_thresh=0.5,\n                 assign_on_cpu=False):\n        super(SparsePostProcess, self).__init__()\n        self.num_classes = num_classes\n        self.num_proposals = num_proposals\n        self.binary_thresh = binary_thresh\n        self.assign_on_cpu = assign_on_cpu\n\n    def __call__(self, scores, bboxes, scale_factor, ori_shape, masks=None):\n        assert len(scores) == len(bboxes) == \\\n               len(ori_shape) == len(scale_factor)\n        device = paddle.device.get_device()\n        batch_size = len(ori_shape)\n\n        scores = F.sigmoid(scores)\n        has_mask = masks is not None\n        if has_mask:\n            masks = F.sigmoid(masks)\n            masks = masks.reshape([batch_size, -1, *masks.shape[1:]])\n\n        bbox_pred = []\n        mask_pred = [] if has_mask else None\n        bbox_num = paddle.zeros([batch_size], dtype='int32')\n        for i in range(batch_size):\n            score = scores[i]\n            bbox = bboxes[i]\n            score, indices = score.flatten(0, 1).topk(\n                self.num_proposals, sorted=False)\n            label = indices % self.num_classes\n            if has_mask:\n                mask = masks[i]\n                mask = mask.flatten(0, 1)[indices]\n\n            H, W = ori_shape[i][0], ori_shape[i][1]\n            bbox = bbox[paddle.cast(indices / self.num_classes, indices.dtype)]\n            bbox /= scale_factor[i]\n            bbox[:, 0::2] = paddle.clip(bbox[:, 0::2], 0, W)\n            bbox[:, 1::2] = paddle.clip(bbox[:, 1::2], 0, H)\n\n            keep = ((bbox[:, 2] - bbox[:, 0]).numpy() > 1.) 
& \\\n                   ((bbox[:, 3] - bbox[:, 1]).numpy() > 1.)\n            if keep.sum() == 0:\n                bbox = paddle.zeros([1, 6], dtype='float32')\n                if has_mask:\n                    mask = paddle.zeros([1, H, W], dtype='uint8')\n            else:\n                label = paddle.to_tensor(label.numpy()[keep]).astype(\n                    'float32').unsqueeze(-1)\n                score = paddle.to_tensor(score.numpy()[keep]).astype(\n                    'float32').unsqueeze(-1)\n                bbox = paddle.to_tensor(bbox.numpy()[keep]).astype('float32')\n                if has_mask:\n                    mask = paddle.to_tensor(mask.numpy()[keep]).astype(\n                        'float32').unsqueeze(1)\n                    mask = paste_mask(mask, bbox, H, W, self.assign_on_cpu)\n                    mask = paddle.cast(mask >= self.binary_thresh, 'uint8')\n                bbox = paddle.concat([label, score, bbox], axis=-1)\n\n            bbox_num[i] = bbox.shape[0]\n            bbox_pred.append(bbox)\n            if has_mask:\n                mask_pred.append(mask)\n\n        bbox_pred = paddle.concat(bbox_pred)\n        mask_pred = paddle.concat(mask_pred) if has_mask else None\n\n        if self.assign_on_cpu:\n            paddle.set_device(device)\n\n        if has_mask:\n            return bbox_pred, bbox_num, mask_pred\n        else:\n            return bbox_pred, bbox_num\n\n\ndef paste_mask(masks, boxes, im_h, im_w, assign_on_cpu=False):\n    \"\"\"\n    Paste the mask prediction to the original image.\n    \"\"\"\n    x0_int, y0_int = 0, 0\n    x1_int, y1_int = im_w, im_h\n    x0, y0, x1, y1 = paddle.split(boxes, 4, axis=1)\n    N = masks.shape[0]\n    img_y = paddle.arange(y0_int, y1_int) + 0.5\n    img_x = paddle.arange(x0_int, x1_int) + 0.5\n\n    img_y = (img_y - y0) / (y1 - y0) * 2 - 1\n    img_x = (img_x - x0) / (x1 - x0) * 2 - 1\n    # img_x, img_y have shapes (N, w), (N, h)\n\n    if assign_on_cpu:\n        paddle.set_device('cpu')\n    gx = img_x[:, None, :].expand(\n        [N, img_y.shape[1], img_x.shape[1]])\n    gy = img_y[:, :, None].expand(\n        [N, img_y.shape[1], img_x.shape[1]])\n    grid = paddle.stack([gx, gy], axis=3)\n    img_masks = F.grid_sample(masks, grid, align_corners=False)\n    return img_masks[:, 0]\n\n\ndef multiclass_nms(bboxs, num_classes, match_threshold=0.6, match_metric='iou'):\n    final_boxes = []\n    for c in range(num_classes):\n        idxs = bboxs[:, 0] == c\n        if np.count_nonzero(idxs) == 0: continue\n        r = nms(bboxs[idxs, 1:], match_threshold, match_metric)\n        final_boxes.append(np.concatenate([np.full((r.shape[0], 1), c), r], 1))\n    return final_boxes\n\n\ndef nms(dets, match_threshold=0.6, match_metric='iou'):\n    \"\"\" Apply NMS to avoid detecting too many overlapping bounding boxes.\n        Args:\n            dets: shape [N, 5], [score, x1, y1, x2, y2]\n            match_metric: 'iou' or 'ios'\n            match_threshold: overlap thresh for match metric.\n    \"\"\"\n    if dets.shape[0] == 0:\n        return dets[[], :]\n    scores = dets[:, 0]\n    x1 = dets[:, 1]\n    y1 = dets[:, 2]\n    x2 = dets[:, 3]\n    y2 = dets[:, 4]\n    areas = (x2 - x1 + 1) * (y2 - y1 + 1)\n    order = scores.argsort()[::-1]\n\n    keep = []\n    while order.size > 0:\n        i = order[0]\n        keep.append(i)\n\n        xx1 = np.maximum(x1[i], x1[order[1:]])\n        yy1 = np.maximum(y1[i], y1[order[1:]])\n        xx2 = np.minimum(x2[i], x2[order[1:]])\n        yy2 = np.minimum(y2[i], 
y2[order[1:]])\n\n        w = np.maximum(0.0, xx2 - xx1 + 1)\n        h = np.maximum(0.0, yy2 - yy1 + 1)\n        inter = w * h\n\n        if match_metric == 'iou':\n            union = areas[i] + areas[order[1:]] - inter\n            match_value = inter / union\n        elif match_metric == 'ios':\n            smaller = np.minimum(areas[i], areas[order[1:]])\n            match_value = inter / smaller\n        else:\n            raise ValueError(\"match_metric must be 'iou' or 'ios'\")\n\n        inds = np.where(match_value < match_threshold)[0]\n        order = order[inds + 1]\n\n    dets = dets[keep, :]\n    return dets\n\n\n@register\nclass DETRBBoxSemiPostProcess(object):\n    __shared__ = ['num_classes', 'use_focal_loss']\n    __inject__ = []\n\n    def __init__(self,\n                 num_classes=80,\n                 num_top_queries=100,\n                 use_focal_loss=False):\n        super(DETRBBoxSemiPostProcess, self).__init__()\n        self.num_classes = num_classes\n        self.num_top_queries = num_top_queries\n        self.use_focal_loss = use_focal_loss\n\n    def __call__(self, head_out):\n        \"\"\"\n        Decode the bbox.\n        Args:\n            head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output.\n        Returns:\n            bbox_pred (Tensor): The output prediction with shape [N, 6], including\n                labels, scores and bboxes. The sizes of the bboxes correspond\n                to the input image, and the bboxes may be used in other branches.\n            bbox_num (Tensor): The number of prediction boxes of each batch with\n                shape [bs], and is N.\n        \"\"\"\n        bboxes, logits, masks = head_out\n        bbox_pred = bboxes\n\n        scores = F.softmax(logits, axis=2)\n\n        # keep the per-class distribution before reducing to per-query max scores\n        soft_scores = scores.clone()\n        scores, index = paddle.topk(scores.max(-1), 300, axis=-1)\n\n        batch_ind = paddle.arange(end=scores.shape[0]).unsqueeze(-1).tile(\n            [1, 300])\n        index = paddle.stack([batch_ind, index], axis=-1)\n        labels = paddle.gather_nd(soft_scores.argmax(-1), index).astype('int32')\n        score_class = paddle.gather_nd(soft_scores, index)\n        bbox_pred = paddle.gather_nd(bbox_pred, index)\n        bbox_pred = paddle.concat(\n            [\n                labels.unsqueeze(-1).astype('float32'), score_class,\n                scores.unsqueeze(-1), bbox_pred\n            ],\n            axis=-1)\n        bbox_num = paddle.to_tensor(\n            bbox_pred.shape[1], dtype='int32').tile([bbox_pred.shape[0]])\n        bbox_pred = bbox_pred.reshape([-1, bbox_pred.shape[-1]])\n        return bbox_pred, bbox_num"
  },
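The pure-NumPy `nms`/`multiclass_nms` helpers at the end of post_process.py can be exercised standalone. A small hedged example (the box values are made up; the import path assumes this module's location): two heavily overlapping class-0 boxes collapse to the higher-scoring one at an IoU threshold of 0.6, while the distant class-1 box survives.

.. code-block:: python

    import numpy as np
    from ppdet.modeling.post_process import multiclass_nms

    # rows: [class_id, score, x1, y1, x2, y2]
    dets = np.array([
        [0, 0.9, 10, 10, 50, 50],
        [0, 0.8, 12, 12, 52, 52],      # IoU with the box above is ~0.83 > 0.6
        [1, 0.7, 100, 100, 140, 140],  # different class, kept independently
    ])
    kept = multiclass_nms(dets, num_classes=2, match_threshold=0.6)
    for per_class in kept:
        print(per_class)  # one [class_id, score, x1, y1, x2, y2] array per class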
  {
    "path": "ppdet/modeling/proposal_generator/__init__.py",
    "content": "from . import rpn_head\nfrom . import embedding_rpn_head\n\nfrom .rpn_head import *\nfrom .embedding_rpn_head import *\n"
  },
  {
    "path": "ppdet/modeling/proposal_generator/anchor_generator.py",
    "content": "#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# The code is based on \n# https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/anchor_generator.py\n\nimport math\n\nimport paddle\nimport paddle.nn as nn\nimport numpy as np\n\nfrom ppdet.core.workspace import register\n\n__all__ = ['AnchorGenerator', 'RetinaAnchorGenerator', 'S2ANetAnchorGenerator']\n\n\n@register\nclass AnchorGenerator(nn.Layer):\n    \"\"\"\n    Generate anchors according to the feature maps\n\n    Args:\n        anchor_sizes (list[float] | list[list[float]]): The anchor sizes at \n            each feature point. list[float] means all feature levels share the \n            same sizes. list[list[float]] means the anchor sizes for \n            each level. The sizes stand for the scale of input size.\n        aspect_ratios (list[float] | list[list[float]]): The aspect ratios at\n            each feature point. list[float] means all feature levels share the\n            same ratios. list[list[float]] means the aspect ratios for\n            each level.\n        strides (list[float]): The strides of feature maps which generate \n            anchors\n        offset (float): The offset of the coordinate of anchors, default 0.\n        \n    \"\"\"\n\n    def __init__(self,\n                 anchor_sizes=[32, 64, 128, 256, 512],\n                 aspect_ratios=[0.5, 1.0, 2.0],\n                 strides=[16.0],\n                 variance=[1.0, 1.0, 1.0, 1.0],\n                 offset=0.):\n        super(AnchorGenerator, self).__init__()\n        self.anchor_sizes = anchor_sizes\n        self.aspect_ratios = aspect_ratios\n        self.strides = strides\n        self.variance = variance\n        self.cell_anchors = self._calculate_anchors(len(strides))\n        self.offset = offset\n\n    def _broadcast_params(self, params, num_features):\n        if not isinstance(params[0], (list, tuple)):  # list[float]\n            return [params] * num_features\n        if len(params) == 1:\n            return list(params) * num_features\n        return params\n\n    def generate_cell_anchors(self, sizes, aspect_ratios):\n        anchors = []\n        for size in sizes:\n            area = size**2.0\n            for aspect_ratio in aspect_ratios:\n                w = math.sqrt(area / aspect_ratio)\n                h = aspect_ratio * w\n                x0, y0, x1, y1 = -w / 2.0, -h / 2.0, w / 2.0, h / 2.0\n                anchors.append([x0, y0, x1, y1])\n        return paddle.to_tensor(anchors, dtype='float32')\n\n    def _calculate_anchors(self, num_features):\n        sizes = self._broadcast_params(self.anchor_sizes, num_features)\n        aspect_ratios = self._broadcast_params(self.aspect_ratios, num_features)\n        cell_anchors = [\n            self.generate_cell_anchors(s, a)\n            for s, a in zip(sizes, aspect_ratios)\n        ]\n        [\n            self.register_buffer(\n                t.name, t, 
persistable=False) for t in cell_anchors\n        ]\n        return cell_anchors\n\n    def _create_grid_offsets(self, size, stride, offset):\n        grid_height, grid_width = size[0], size[1]\n        shifts_x = paddle.arange(\n            offset * stride, grid_width * stride, step=stride, dtype='float32')\n        shifts_y = paddle.arange(\n            offset * stride, grid_height * stride, step=stride, dtype='float32')\n        shift_y, shift_x = paddle.meshgrid(shifts_y, shifts_x)\n        shift_x = paddle.reshape(shift_x, [-1])\n        shift_y = paddle.reshape(shift_y, [-1])\n        return shift_x, shift_y\n\n    def _grid_anchors(self, grid_sizes):\n        anchors = []\n        for size, stride, base_anchors in zip(grid_sizes, self.strides,\n                                              self.cell_anchors):\n            shift_x, shift_y = self._create_grid_offsets(size, stride,\n                                                         self.offset)\n            shifts = paddle.stack((shift_x, shift_y, shift_x, shift_y), axis=1)\n            shifts = paddle.reshape(shifts, [-1, 1, 4])\n            base_anchors = paddle.reshape(base_anchors, [1, -1, 4])\n\n            anchors.append(paddle.reshape(shifts + base_anchors, [-1, 4]))\n\n        return anchors\n\n    def forward(self, input):\n        grid_sizes = [feature_map.shape[-2:] for feature_map in input]\n        anchors_over_all_feature_maps = self._grid_anchors(grid_sizes)\n        return anchors_over_all_feature_maps\n\n    @property\n    def num_anchors(self):\n        \"\"\"\n        Returns:\n            int: number of anchors at every pixel\n                location, on that feature map.\n                For example, if at every pixel we use anchors of 3 aspect\n                ratios and 5 sizes, the number of anchors is 15.\n                For FPN models, `num_anchors` on every feature map is the same.\n        \"\"\"\n        return len(self.cell_anchors[0])\n\n\n@register\nclass RetinaAnchorGenerator(AnchorGenerator):\n    def __init__(self,\n                 octave_base_scale=4,\n                 scales_per_octave=3,\n                 aspect_ratios=[0.5, 1.0, 2.0],\n                 strides=[8.0, 16.0, 32.0, 64.0, 128.0],\n                 variance=[1.0, 1.0, 1.0, 1.0],\n                 offset=0.0):\n        anchor_sizes = []\n        for s in strides:\n            anchor_sizes.append([\n                s * octave_base_scale * 2**(i/scales_per_octave) \\\n                for i in range(scales_per_octave)])\n        super(RetinaAnchorGenerator, self).__init__(\n            anchor_sizes=anchor_sizes,\n            aspect_ratios=aspect_ratios,\n            strides=strides,\n            variance=variance,\n            offset=offset)\n\n\n@register\nclass S2ANetAnchorGenerator(nn.Layer):\n    \"\"\"\n    AnchorGenerator by paddle\n    \"\"\"\n\n    def __init__(self, base_size, scales, ratios, scale_major=True, ctr=None):\n        super(S2ANetAnchorGenerator, self).__init__()\n        self.base_size = base_size\n        self.scales = paddle.to_tensor(scales)\n        self.ratios = paddle.to_tensor(ratios)\n        self.scale_major = scale_major\n        self.ctr = ctr\n        self.base_anchors = self.gen_base_anchors()\n\n    @property\n    def num_base_anchors(self):\n        return self.base_anchors.shape[0]\n\n    def gen_base_anchors(self):\n        w = self.base_size\n        h = self.base_size\n        if self.ctr is None:\n            x_ctr = 0.5 * (w - 1)\n            y_ctr = 0.5 * (h - 1)\n        else:\n        
    x_ctr, y_ctr = self.ctr\n\n        h_ratios = paddle.sqrt(self.ratios)\n        w_ratios = 1 / h_ratios\n        if self.scale_major:\n            ws = (w * w_ratios[:] * self.scales[:].astype(w_ratios.dtype)).reshape([-1])\n            hs = (h * h_ratios[:] * self.scales[:].astype(h_ratios.dtype)).reshape([-1])\n        else:\n            ws = (w * self.scales[:].astype(w_ratios.dtype) * w_ratios[:]).reshape([-1])\n            hs = (h * self.scales[:].astype(h_ratios.dtype) * h_ratios[:]).reshape([-1])\n\n        base_anchors = paddle.stack(\n            [\n                x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1),\n                x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1)\n            ],\n            axis=-1)\n        base_anchors = paddle.round(base_anchors)\n        return base_anchors\n\n    def _meshgrid(self, x, y, row_major=True):\n        yy, xx = paddle.meshgrid(y, x)\n        yy = yy.reshape([-1])\n        xx = xx.reshape([-1])\n        if row_major:\n            return xx, yy\n        else:\n            return yy, xx\n\n    def forward(self, featmap_size, stride=16):\n        # featmap_size*stride project it to original area\n\n        feat_h = featmap_size[0]\n        feat_w = featmap_size[1]\n        shift_x = paddle.arange(0, feat_w, 1, 'int32') * stride\n        shift_y = paddle.arange(0, feat_h, 1, 'int32') * stride\n        shift_xx, shift_yy = self._meshgrid(shift_x, shift_y)\n        shifts = paddle.stack([shift_xx, shift_yy, shift_xx, shift_yy], axis=-1)\n\n        all_anchors = self.base_anchors[:, :] + shifts[:, :].astype(self.base_anchors.dtype)\n        all_anchors = all_anchors.cast(paddle.float32).reshape(\n            [feat_h * feat_w, 4])\n        all_anchors = self.rect2rbox(all_anchors)\n        return all_anchors\n\n    def valid_flags(self, featmap_size, valid_size):\n        feat_h, feat_w = featmap_size\n        valid_h, valid_w = valid_size\n        assert valid_h <= feat_h and valid_w <= feat_w\n        valid_x = paddle.zeros([feat_w], dtype='int32')\n        valid_y = paddle.zeros([feat_h], dtype='int32')\n        valid_x[:valid_w] = 1\n        valid_y[:valid_h] = 1\n        valid_xx, valid_yy = self._meshgrid(valid_x, valid_y)\n        valid = valid_xx & valid_yy\n        valid = paddle.reshape(valid, [-1, 1])\n        valid = paddle.expand(valid, [-1, self.num_base_anchors]).reshape([-1])\n        return valid\n\n    def rect2rbox(self, bboxes):\n        \"\"\"\n        :param bboxes: shape (L, 4) (xmin, ymin, xmax, ymax)\n        :return: dbboxes: shape (L, 5) (x_ctr, y_ctr, w, h, angle)\n        \"\"\"\n        x1, y1, x2, y2 = paddle.split(bboxes, 4, axis=-1)\n\n        x_ctr = (x1 + x2) / 2.0\n        y_ctr = (y1 + y2) / 2.0\n        edges1 = paddle.abs(x2 - x1)\n        edges2 = paddle.abs(y2 - y1)\n\n        rbox_w = paddle.maximum(edges1, edges2)\n        rbox_h = paddle.minimum(edges1, edges2)\n\n        # set angle\n        inds = edges1 < edges2\n        inds = paddle.cast(inds, paddle.float32)\n        rboxes_angle = inds * np.pi / 2.0\n\n        rboxes = paddle.concat(\n            (x_ctr, y_ctr, rbox_w, rbox_h, rboxes_angle), axis=-1)\n        return rboxes\n"
  },
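As a sanity check on AnchorGenerator.generate_cell_anchors above, the per-cell anchor math can be reproduced without paddle: each (size, aspect_ratio) pair yields a box of area size**2 with h/w equal to the ratio, centered at the origin. A minimal sketch (the function name is illustrative):

.. code-block:: python

    import math

    def cell_anchors(sizes, aspect_ratios):
        anchors = []
        for size in sizes:
            area = float(size) ** 2
            for ratio in aspect_ratios:
                w = math.sqrt(area / ratio)   # same derivation as generate_cell_anchors
                h = ratio * w
                anchors.append((-w / 2, -h / 2, w / 2, h / 2))
        return anchors

    for x0, y0, x1, y1 in cell_anchors([32], [0.5, 1.0, 2.0]):
        w, h = x1 - x0, y1 - y0
        print(f"w={w:.1f} h={h:.1f} area={w * h:.0f} h/w={h / w:.2f}")
    # every anchor keeps area 1024; only the aspect ratio changes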
  {
    "path": "ppdet/modeling/proposal_generator/embedding_rpn_head.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# This code is referenced from: https://github.com/open-mmlab/mmdetection\n\nimport paddle\nfrom paddle import nn\n\nfrom ppdet.core.workspace import register\n\n__all__ = ['EmbeddingRPNHead']\n\n\n@register\nclass EmbeddingRPNHead(nn.Layer):\n    __shared__ = ['proposal_embedding_dim']\n\n    def __init__(self, num_proposals, proposal_embedding_dim=256):\n        super(EmbeddingRPNHead, self).__init__()\n\n        self.num_proposals = num_proposals\n        self.proposal_embedding_dim = proposal_embedding_dim\n\n        self._init_layers()\n        self._init_weights()\n\n    def _init_layers(self):\n        self.init_proposal_bboxes = nn.Embedding(self.num_proposals, 4)\n        self.init_proposal_features = nn.Embedding(self.num_proposals,\n                                                   self.proposal_embedding_dim)\n\n    def _init_weights(self):\n        init_bboxes = paddle.empty_like(self.init_proposal_bboxes.weight)\n        init_bboxes[:, :2] = 0.5\n        init_bboxes[:, 2:] = 1.0\n        self.init_proposal_bboxes.weight.set_value(init_bboxes)\n\n    @staticmethod\n    def bbox_cxcywh_to_xyxy(x):\n        cxcy, wh = paddle.split(x, 2, axis=-1)\n        return paddle.concat([cxcy - 0.5 * wh, cxcy + 0.5 * wh], axis=-1)\n\n    def forward(self, img_whwh):\n        proposal_bboxes = self.init_proposal_bboxes.weight.clone()\n        proposal_bboxes = self.bbox_cxcywh_to_xyxy(proposal_bboxes)\n        proposal_bboxes = proposal_bboxes.unsqueeze(0) * img_whwh.unsqueeze(1)\n\n        proposal_features = self.init_proposal_features.weight.clone()\n        proposal_features = proposal_features.unsqueeze(0).tile(\n            [img_whwh.shape[0], 1, 1])\n\n        return proposal_bboxes, proposal_features\n"
  },
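At initialization every learned proposal row in EmbeddingRPNHead is cxcywh = (0.5, 0.5, 1.0, 1.0), so after bbox_cxcywh_to_xyxy and scaling by img_whwh each proposal covers the whole image. A minimal dygraph sketch of that first forward step (the image size is made up):

.. code-block:: python

    import paddle

    img_whwh = paddle.to_tensor([[640., 480., 640., 480.]])  # one image: w, h, w, h
    proposal = paddle.to_tensor([[0.5, 0.5, 1.0, 1.0]])      # initial cxcywh row
    cxcy, wh = paddle.split(proposal, 2, axis=-1)
    xyxy = paddle.concat([cxcy - 0.5 * wh, cxcy + 0.5 * wh], axis=-1)
    print(xyxy * img_whwh)  # [[0., 0., 640., 480.]] -> the full-image box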
  {
    "path": "ppdet/modeling/proposal_generator/proposal_generator.py",
    "content": "#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\n\nfrom ppdet.core.workspace import register, serializable\nfrom .. import ops\n\n\n@register\n@serializable\nclass ProposalGenerator(object):\n    \"\"\"\n    Proposal generation module\n\n    For more details, please refer to the document of generate_proposals \n    in ppdet/modeing/ops.py\n\n    Args:\n        pre_nms_top_n (int): Number of total bboxes to be kept per\n            image before NMS. default 6000\n        post_nms_top_n (int): Number of total bboxes to be kept per\n            image after NMS. default 1000\n        nms_thresh (float): Threshold in NMS. default 0.5\n        min_size (flaot): Remove predicted boxes with either height or\n             width < min_size. default 0.1\n        eta (float): Apply in adaptive NMS, if adaptive `threshold > 0.5`,\n             `adaptive_threshold = adaptive_threshold * eta` in each iteration.\n             default 1.\n        topk_after_collect (bool): whether to adopt topk after batch \n             collection. If topk_after_collect is true, box filter will not be \n             used after NMS at each image in proposal generation. default false\n    \"\"\"\n\n    def __init__(self,\n                 pre_nms_top_n=12000,\n                 post_nms_top_n=2000,\n                 nms_thresh=.5,\n                 min_size=.1,\n                 eta=1.,\n                 topk_after_collect=False):\n        super(ProposalGenerator, self).__init__()\n        self.pre_nms_top_n = pre_nms_top_n\n        self.post_nms_top_n = post_nms_top_n\n        self.nms_thresh = nms_thresh\n        self.min_size = min_size\n        self.eta = eta\n        self.topk_after_collect = topk_after_collect\n\n    def __call__(self, scores, bbox_deltas, anchors, im_shape):\n\n        top_n = self.pre_nms_top_n if self.topk_after_collect else self.post_nms_top_n\n        variances = paddle.ones_like(anchors)\n        if hasattr(paddle.vision.ops, \"generate_proposals\"):\n            generate_proposals = getattr(paddle.vision.ops,\n                                         \"generate_proposals\")\n        else:\n            generate_proposals = ops.generate_proposals\n        rpn_rois, rpn_rois_prob, rpn_rois_num = generate_proposals(\n            scores,\n            bbox_deltas,\n            im_shape,\n            anchors,\n            variances,\n            pre_nms_top_n=self.pre_nms_top_n,\n            post_nms_top_n=top_n,\n            nms_thresh=self.nms_thresh,\n            min_size=self.min_size,\n            eta=self.eta,\n            return_rois_num=True)\n\n        return rpn_rois, rpn_rois_prob, rpn_rois_num, self.post_nms_top_n\n"
  },
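  {
    "path": "ppdet/modeling/proposal_generator/_sketch_collect_topk.py",
    "content": "# Illustrative sketch added by the editor; NOT part of PaddleDetection.\n# It demonstrates, on random data, the collect-then-topk step applied to\n# ProposalGenerator outputs: per-level proposals are generated\n# independently (kept to top_n each), then concatenated and cut to the\n# post_nms_top_n highest-scoring boxes. All shapes below are made up.\nimport paddle\n\n\ndef collect_topk(rois_per_level, probs_per_level, post_nms_top_n):\n    # concat all levels, then keep the globally best-scoring proposals\n    rois = paddle.concat(rois_per_level)\n    prob = paddle.concat(probs_per_level).flatten()\n    k = min(post_nms_top_n, prob.shape[0])\n    topk_prob, topk_inds = paddle.topk(prob, k)\n    return paddle.gather(rois, topk_inds), topk_prob\n\n\nif __name__ == '__main__':\n    paddle.seed(0)\n    # three FPN levels with a different number of surviving proposals each\n    rois = [paddle.rand([n, 4]) for n in (30, 20, 10)]\n    probs = [paddle.rand([n]) for n in (30, 20, 10)]\n    top_rois, top_prob = collect_topk(rois, probs, post_nms_top_n=16)\n    print(top_rois.shape, top_prob.shape)  # [16, 4] [16]\n"
  },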
  {
    "path": "ppdet/modeling/proposal_generator/rpn_head.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import Normal\n\nfrom ppdet.core.workspace import register\nfrom .anchor_generator import AnchorGenerator\nfrom .target_layer import RPNTargetAssign\nfrom .proposal_generator import ProposalGenerator\nfrom ..cls_utils import _get_class_default_kwargs\n\n\nclass RPNFeat(nn.Layer):\n    \"\"\"\n    Feature extraction in RPN head\n\n    Args:\n        in_channel (int): Input channel\n        out_channel (int): Output channel\n    \"\"\"\n\n    def __init__(self, in_channel=1024, out_channel=1024):\n        super(RPNFeat, self).__init__()\n        # rpn feat is shared with each level\n        self.rpn_conv = nn.Conv2D(\n            in_channels=in_channel,\n            out_channels=out_channel,\n            kernel_size=3,\n            padding=1,\n            weight_attr=paddle.ParamAttr(initializer=Normal(\n                mean=0., std=0.01)))\n        self.rpn_conv.skip_quant = True\n\n    def forward(self, feats):\n        rpn_feats = []\n        for feat in feats:\n            rpn_feats.append(F.relu(self.rpn_conv(feat)))\n        return rpn_feats\n\n\n@register\nclass RPNHead(nn.Layer):\n    \"\"\"\n    Region Proposal Network\n\n    Args:\n        anchor_generator (dict): configure of anchor generation\n        rpn_target_assign (dict): configure of rpn targets assignment\n        train_proposal (dict): configure of proposals generation\n            at the stage of training\n        test_proposal (dict): configure of proposals generation\n            at the stage of prediction\n        in_channel (int): channel of input feature maps which can be\n            derived by from_config\n    \"\"\"\n    __shared__ = ['export_onnx']\n    __inject__ = ['loss_rpn_bbox']\n\n    def __init__(self,\n                 anchor_generator=_get_class_default_kwargs(AnchorGenerator),\n                 rpn_target_assign=_get_class_default_kwargs(RPNTargetAssign),\n                 train_proposal=_get_class_default_kwargs(ProposalGenerator,\n                                                          12000, 2000),\n                 test_proposal=_get_class_default_kwargs(ProposalGenerator),\n                 in_channel=1024,\n                 export_onnx=False,\n                 loss_rpn_bbox=None):\n        super(RPNHead, self).__init__()\n        self.anchor_generator = anchor_generator\n        self.rpn_target_assign = rpn_target_assign\n        self.train_proposal = train_proposal\n        self.test_proposal = test_proposal\n        self.export_onnx = export_onnx\n        if isinstance(anchor_generator, dict):\n            self.anchor_generator = AnchorGenerator(**anchor_generator)\n        if isinstance(rpn_target_assign, dict):\n            self.rpn_target_assign = RPNTargetAssign(**rpn_target_assign)\n        if isinstance(train_proposal, 
dict):\n            self.train_proposal = ProposalGenerator(**train_proposal)\n        if isinstance(test_proposal, dict):\n            self.test_proposal = ProposalGenerator(**test_proposal)\n        self.loss_rpn_bbox = loss_rpn_bbox\n\n        num_anchors = self.anchor_generator.num_anchors\n        self.rpn_feat = RPNFeat(in_channel, in_channel)\n        # rpn head is shared with each level\n        # rpn roi classification scores\n        self.rpn_rois_score = nn.Conv2D(\n            in_channels=in_channel,\n            out_channels=num_anchors,\n            kernel_size=1,\n            padding=0,\n            weight_attr=paddle.ParamAttr(initializer=Normal(\n                mean=0., std=0.01)))\n        self.rpn_rois_score.skip_quant = True\n\n        # rpn roi bbox regression deltas\n        self.rpn_rois_delta = nn.Conv2D(\n            in_channels=in_channel,\n            out_channels=4 * num_anchors,\n            kernel_size=1,\n            padding=0,\n            weight_attr=paddle.ParamAttr(initializer=Normal(\n                mean=0., std=0.01)))\n        self.rpn_rois_delta.skip_quant = True\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        # FPN share same rpn head\n        if isinstance(input_shape, (list, tuple)):\n            input_shape = input_shape[0]\n        return {'in_channel': input_shape.channels}\n\n    def forward(self, feats, inputs):\n        rpn_feats = self.rpn_feat(feats)\n        scores = []\n        deltas = []\n\n        for rpn_feat in rpn_feats:\n            rrs = self.rpn_rois_score(rpn_feat)\n            rrd = self.rpn_rois_delta(rpn_feat)\n            scores.append(rrs)\n            deltas.append(rrd)\n\n        anchors = self.anchor_generator(rpn_feats)\n\n        rois, rois_num = self._gen_proposal(scores, deltas, anchors, inputs)\n        if self.training:\n            loss = self.get_loss(scores, deltas, anchors, inputs)\n            return rois, rois_num, loss\n        else:\n            return rois, rois_num, None\n\n    def _gen_proposal(self, scores, bbox_deltas, anchors, inputs):\n        \"\"\"\n        scores (list[Tensor]): Multi-level scores prediction\n        bbox_deltas (list[Tensor]): Multi-level deltas prediction\n        anchors (list[Tensor]): Multi-level anchors\n        inputs (dict): ground truth info\n        \"\"\"\n        prop_gen = self.train_proposal if self.training else self.test_proposal\n        im_shape = inputs['im_shape']\n\n        # Collect multi-level proposals for each batch\n        # Get 'topk' of them as final output\n\n        if self.export_onnx:\n            # bs = 1 when exporting onnx\n            onnx_rpn_rois_list = []\n            onnx_rpn_prob_list = []\n            onnx_rpn_rois_num_list = []\n\n            for rpn_score, rpn_delta, anchor in zip(scores, bbox_deltas,\n                                                    anchors):\n                onnx_rpn_rois, onnx_rpn_rois_prob, onnx_rpn_rois_num, onnx_post_nms_top_n = prop_gen(\n                    scores=rpn_score[0:1],\n                    bbox_deltas=rpn_delta[0:1],\n                    anchors=anchor,\n                    im_shape=im_shape[0:1])\n                onnx_rpn_rois_list.append(onnx_rpn_rois)\n                onnx_rpn_prob_list.append(onnx_rpn_rois_prob)\n                onnx_rpn_rois_num_list.append(onnx_rpn_rois_num)\n\n            onnx_rpn_rois = paddle.concat(onnx_rpn_rois_list)\n            onnx_rpn_prob = paddle.concat(onnx_rpn_prob_list).flatten()\n\n            onnx_top_n = 
paddle.to_tensor(onnx_post_nms_top_n).cast('int32')\n            onnx_num_rois = paddle.shape(onnx_rpn_prob)[0].cast('int32')\n            k = paddle.minimum(onnx_top_n, onnx_num_rois)\n            onnx_topk_prob, onnx_topk_inds = paddle.topk(onnx_rpn_prob, k)\n            onnx_topk_rois = paddle.gather(onnx_rpn_rois, onnx_topk_inds)\n            # TODO(wangguanzhong): Now bs_rois_collect in export_onnx is moved outside conditional branch\n            # due to problems in dy2static of paddle. Will fix it when updating paddle framework.\n            # bs_rois_collect = [onnx_topk_rois]\n            # bs_rois_num_collect = paddle.shape(onnx_topk_rois)[0]\n\n        else:\n            bs_rois_collect = []\n            bs_rois_num_collect = []\n\n            batch_size = im_shape.shape[0]\n\n            # Generate proposals for each level and each batch.\n            # Discard batch-computing to avoid sorting bbox across different batches.\n            for i in range(batch_size):\n                rpn_rois_list = []\n                rpn_prob_list = []\n                rpn_rois_num_list = []\n\n                for rpn_score, rpn_delta, anchor in zip(scores, bbox_deltas,\n                                                        anchors):\n                    rpn_rois, rpn_rois_prob, rpn_rois_num, post_nms_top_n = prop_gen(\n                        scores=rpn_score[i:i + 1],\n                        bbox_deltas=rpn_delta[i:i + 1],\n                        anchors=anchor,\n                        im_shape=im_shape[i:i + 1])\n                    rpn_rois_list.append(rpn_rois)\n                    rpn_prob_list.append(rpn_rois_prob)\n                    rpn_rois_num_list.append(rpn_rois_num)\n\n                if len(scores) > 1:\n                    rpn_rois = paddle.concat(rpn_rois_list)\n                    rpn_prob = paddle.concat(rpn_prob_list).flatten()\n\n                    num_rois = paddle.shape(rpn_prob)[0].cast('int32')\n                    if num_rois > post_nms_top_n:\n                        topk_prob, topk_inds = paddle.topk(rpn_prob,\n                                                           post_nms_top_n)\n                        topk_rois = paddle.gather(rpn_rois, topk_inds)\n                    else:\n                        topk_rois = rpn_rois\n                        topk_prob = rpn_prob\n                        topk_inds = paddle.zeros(shape=[post_nms_top_n], dtype=\"int64\")\n                else:\n                    topk_rois = rpn_rois_list[0]\n                    topk_prob = rpn_prob_list[0].flatten()\n\n                bs_rois_collect.append(topk_rois)\n                bs_rois_num_collect.append(paddle.shape(topk_rois)[0:1])\n\n                # TODO(PIR): remove this after pir bug fixed\n                rpn_rois_list = None\n                rpn_prob_list = None\n                rpn_rois_num_list = None\n\n            bs_rois_num_collect = paddle.concat(bs_rois_num_collect)\n\n        if self.export_onnx:\n            output_rois = [onnx_topk_rois]\n            output_rois_num = paddle.shape(onnx_topk_rois)[0]\n        else:\n            output_rois = bs_rois_collect\n            output_rois_num = bs_rois_num_collect\n\n        return output_rois, output_rois_num\n\n    def get_loss(self, pred_scores, pred_deltas, anchors, inputs):\n        \"\"\"\n        pred_scores (list[Tensor]): Multi-level scores prediction\n        pred_deltas (list[Tensor]): Multi-level deltas prediction\n        anchors (list[Tensor]): 
Multi-level anchors\n        inputs (dict): ground truth info, including im, gt_bbox, gt_score\n        \"\"\"\n        anchors = [paddle.reshape(a, shape=(-1, 4)) for a in anchors]\n        anchors = paddle.concat(anchors)\n\n        scores = [\n            paddle.reshape(\n                paddle.transpose(\n                    v, perm=[0, 2, 3, 1]),\n                shape=(v.shape[0], -1, 1)) for v in pred_scores\n        ]\n        scores = paddle.concat(scores, axis=1)\n\n        deltas = [\n            paddle.reshape(\n                paddle.transpose(\n                    v, perm=[0, 2, 3, 1]),\n                shape=(v.shape[0], -1, 4)) for v in pred_deltas\n        ]\n        deltas = paddle.concat(deltas, axis=1)\n\n        score_tgt, bbox_tgt, loc_tgt, norm = self.rpn_target_assign(inputs,\n                                                                    anchors)\n\n        scores = paddle.reshape(x=scores, shape=(-1, ))\n        deltas = paddle.reshape(x=deltas, shape=(-1, 4))\n\n        score_tgt = paddle.concat(score_tgt)\n        score_tgt.stop_gradient = True\n\n        pos_mask = score_tgt == 1\n        pos_ind = paddle.nonzero(pos_mask)\n\n        valid_mask = score_tgt >= 0\n        valid_ind = paddle.nonzero(valid_mask)\n\n        # cls loss\n        if valid_ind.shape[0] == 0:\n            loss_rpn_cls = paddle.zeros([1], dtype='float32')\n        else:\n            score_pred = paddle.gather(scores, valid_ind)\n            score_label = paddle.gather(score_tgt, valid_ind).cast('float32')\n            score_label.stop_gradient = True\n            loss_rpn_cls = F.binary_cross_entropy_with_logits(\n                logit=score_pred, label=score_label, reduction=\"sum\")\n\n        # reg loss\n        if pos_ind.shape[0] == 0:\n            loss_rpn_reg = paddle.zeros([1], dtype='float32')\n        else:\n            loc_pred = paddle.gather(deltas, pos_ind)\n            loc_tgt = paddle.concat(loc_tgt)\n            loc_tgt = paddle.gather(loc_tgt, pos_ind)\n            loc_tgt.stop_gradient = True\n\n            if self.loss_rpn_bbox is None:\n                loss_rpn_reg = paddle.abs(loc_pred - loc_tgt).sum()\n            else:\n                loss_rpn_reg = self.loss_rpn_bbox(loc_pred, loc_tgt).sum()\n\n        return {\n            'loss_rpn_cls': loss_rpn_cls / norm,\n            'loss_rpn_reg': loss_rpn_reg / norm\n        }\n"
  },
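  {
    "path": "ppdet/modeling/proposal_generator/_sketch_flatten_scores.py",
    "content": "# Illustrative sketch added by the editor; NOT part of PaddleDetection.\n# It mirrors the tensor reshaping done in RPNHead.get_loss: per-level\n# score maps of shape [N, A, H, W] are transposed to [N, H, W, A] and\n# reshaped to [N, H*W*A, 1] so predictions from all levels line up with\n# the flattened anchor list. The level shapes below are made up.\nimport paddle\n\n\ndef flatten_level_scores(score_maps):\n    flat = [\n        paddle.transpose(v, perm=[0, 2, 3, 1]).reshape([v.shape[0], -1, 1])\n        for v in score_maps\n    ]\n    return paddle.concat(flat, axis=1)\n\n\nif __name__ == '__main__':\n    lvl1 = paddle.randn([2, 3, 8, 8])  # batch 2, 3 anchors per cell, 8x8\n    lvl2 = paddle.randn([2, 3, 4, 4])\n    scores = flatten_level_scores([lvl1, lvl2])\n    print(scores.shape)  # [2, 240, 1] == [2, 8*8*3 + 4*4*3, 1]\n"
  },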
  {
    "path": "ppdet/modeling/proposal_generator/target.py",
    "content": "#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport numpy as np\nimport paddle\nfrom ..bbox_utils import bbox2delta, bbox_overlaps\n\n\ndef rpn_anchor_target(anchors,\n                      gt_boxes,\n                      rpn_batch_size_per_im,\n                      rpn_positive_overlap,\n                      rpn_negative_overlap,\n                      rpn_fg_fraction,\n                      use_random=True,\n                      batch_size=1,\n                      ignore_thresh=-1,\n                      is_crowd=None,\n                      weights=[1., 1., 1., 1.],\n                      assign_on_cpu=False):\n    tgt_labels = []\n    tgt_bboxes = []\n    tgt_deltas = []\n    for i in range(batch_size):\n        gt_bbox = gt_boxes[i]\n        is_crowd_i = is_crowd[i] if is_crowd else None\n        # Step1: match anchor and gt_bbox\n        matches, match_labels = label_box(\n            anchors, gt_bbox, rpn_positive_overlap, rpn_negative_overlap, True,\n            ignore_thresh, is_crowd_i, assign_on_cpu)\n        # Step2: sample anchor \n        fg_inds, bg_inds = subsample_labels(match_labels, rpn_batch_size_per_im,\n                                            rpn_fg_fraction, 0, use_random)\n        # Fill with the ignore label (-1), then set positive and negative labels\n        labels = paddle.full(match_labels.shape, -1, dtype='int32')\n        if bg_inds.shape[0] > 0:\n            labels = paddle.scatter(labels, bg_inds, paddle.zeros_like(bg_inds))\n        if fg_inds.shape[0] > 0:\n            labels = paddle.scatter(labels, fg_inds, paddle.ones_like(fg_inds))\n        # Step3: make output  \n        if gt_bbox.shape[0] == 0:\n            matched_gt_boxes = paddle.zeros([matches.shape[0], 4])\n            tgt_delta = paddle.zeros([matches.shape[0], 4])\n        else:\n            matched_gt_boxes = paddle.gather(gt_bbox, matches)\n            tgt_delta = bbox2delta(anchors, matched_gt_boxes, weights)\n            matched_gt_boxes.stop_gradient = True\n            tgt_delta.stop_gradient = True\n        labels.stop_gradient = True\n        tgt_labels.append(labels)\n        tgt_bboxes.append(matched_gt_boxes)\n        tgt_deltas.append(tgt_delta)\n\n    return tgt_labels, tgt_bboxes, tgt_deltas\n\n\ndef label_box(anchors,\n              gt_boxes,\n              positive_overlap,\n              negative_overlap,\n              allow_low_quality,\n              ignore_thresh,\n              is_crowd=None,\n              assign_on_cpu=False):\n    if assign_on_cpu:\n        device = paddle.device.get_device()\n        paddle.set_device(\"cpu\")\n        iou = bbox_overlaps(gt_boxes, anchors)\n        paddle.set_device(device)\n\n    else:\n        iou = bbox_overlaps(gt_boxes, anchors)\n    n_gt = gt_boxes.shape[0]\n    if n_gt == 0 or is_crowd is None:\n        n_gt_crowd = 0\n    else:\n        n_gt_crowd = paddle.nonzero(is_crowd).shape[0]\n    if iou.shape[0] == 0 or n_gt_crowd == 
n_gt:\n        # No truth, assign everything to background\n        default_matches = paddle.full((iou.shape[1], ), 0, dtype='int64')\n        default_match_labels = paddle.full((iou.shape[1], ), 0, dtype='int32')\n        return default_matches, default_match_labels\n    # if ignore_thresh > 0, remove anchor if it is close to \n    # one of the crowded ground-truth\n    if n_gt_crowd > 0:\n        N_a = anchors.shape[0]\n        ones = paddle.ones([N_a])\n        mask = is_crowd * ones\n\n        if ignore_thresh > 0:\n            crowd_iou = iou * mask\n            valid = (paddle.sum((crowd_iou > ignore_thresh).cast('int32'),\n                                axis=0) > 0).cast('float32')\n            iou = iou * (1 - valid) - valid\n\n        # ignore the iou between anchor and crowded ground-truth\n        iou = iou * (1 - mask) - mask\n\n    matched_vals, matches = paddle.topk(iou, k=1, axis=0)\n    match_labels = paddle.full(matches.shape, -1, dtype='int32')\n    # set ignored anchor with iou = -1\n    neg_cond = paddle.logical_and(matched_vals > -1,\n                                  matched_vals < negative_overlap)\n    match_labels = paddle.where(neg_cond,\n                                paddle.zeros_like(match_labels), match_labels)\n    match_labels = paddle.where(matched_vals >= positive_overlap,\n                                paddle.ones_like(match_labels), match_labels)\n    if allow_low_quality:\n        highest_quality_foreach_gt = iou.max(axis=1, keepdim=True)\n        pred_inds_with_highest_quality = paddle.logical_and(\n            iou > 0, iou == highest_quality_foreach_gt).cast('int32').sum(\n                0, keepdim=True)\n        match_labels = paddle.where(pred_inds_with_highest_quality > 0,\n                                    paddle.ones_like(match_labels),\n                                    match_labels)\n\n    matches = matches.flatten()\n    match_labels = match_labels.flatten()\n\n    return matches, match_labels\n\n\ndef subsample_labels(labels,\n                     num_samples,\n                     fg_fraction,\n                     bg_label=0,\n                     use_random=True):\n    positive = paddle.nonzero(\n        paddle.logical_and(labels != -1, labels != bg_label))\n    negative = paddle.nonzero(labels == bg_label)\n\n    fg_num = int(num_samples * fg_fraction)\n    fg_num = min(positive.numel(), fg_num)\n    bg_num = num_samples - fg_num\n    bg_num = min(negative.numel(), bg_num)\n    if fg_num == 0 and bg_num == 0:\n        fg_inds = paddle.zeros([0], dtype='int32')\n        bg_inds = paddle.zeros([0], dtype='int32')\n        return fg_inds, bg_inds\n\n    # randomly select positive and negative examples\n\n    negative = negative.cast('int32').flatten()\n    bg_perm = paddle.randperm(negative.numel(), dtype='int32')\n    bg_perm = paddle.slice(bg_perm, axes=[0], starts=[0], ends=[bg_num])\n    if use_random:\n        bg_inds = paddle.gather(negative, bg_perm)\n    else:\n        bg_inds = paddle.slice(negative, axes=[0], starts=[0], ends=[bg_num])\n    if fg_num == 0:\n        fg_inds = paddle.zeros([0], dtype='int32')\n        return fg_inds, bg_inds\n\n    positive = positive.cast('int32').flatten()\n    fg_perm = paddle.randperm(positive.numel(), dtype='int32')\n    fg_perm = paddle.slice(fg_perm, axes=[0], starts=[0], ends=[fg_num])\n    if use_random:\n        fg_inds = paddle.gather(positive, fg_perm)\n    else:\n        fg_inds = paddle.slice(positive, axes=[0], starts=[0], ends=[fg_num])\n\n    return fg_inds, 
bg_inds\n\n\ndef generate_proposal_target(rpn_rois,\n                             gt_classes,\n                             gt_boxes,\n                             batch_size_per_im,\n                             fg_fraction,\n                             fg_thresh,\n                             bg_thresh,\n                             num_classes,\n                             ignore_thresh=-1.,\n                             is_crowd=None,\n                             use_random=True,\n                             is_cascade=False,\n                             cascade_iou=0.5,\n                             assign_on_cpu=False,\n                             add_gt_as_proposals=True):\n\n    rois_with_gt = []\n    tgt_labels = []\n    tgt_bboxes = []\n    tgt_gt_inds = []\n    new_rois_num = []\n\n    # In cascade rcnn, the threshold for foreground and background\n    # is taken from cascade_iou\n    fg_thresh = cascade_iou if is_cascade else fg_thresh\n    bg_thresh = cascade_iou if is_cascade else bg_thresh\n    for i, rpn_roi in enumerate(rpn_rois):\n        gt_bbox = gt_boxes[i]\n        is_crowd_i = is_crowd[i] if is_crowd else None\n        gt_class = paddle.squeeze(gt_classes[i], axis=-1)\n\n        # Concat RoIs and gt boxes, except in cascade rcnn or when there is no gt\n        if add_gt_as_proposals and gt_bbox.shape[0] > 0:\n            bbox = paddle.concat([rpn_roi, gt_bbox])\n        else:\n            bbox = rpn_roi\n\n        # Step1: label bbox\n        matches, match_labels = label_box(bbox, gt_bbox, fg_thresh, bg_thresh,\n                                          False, ignore_thresh, is_crowd_i,\n                                          assign_on_cpu)\n        # Step2: sample bbox \n        sampled_inds, sampled_gt_classes = sample_bbox(\n            matches, match_labels, gt_class, batch_size_per_im, fg_fraction,\n            num_classes, use_random, is_cascade)\n\n        # Step3: make output \n        rois_per_image = bbox if is_cascade else paddle.gather(bbox,\n                                                               sampled_inds)\n        sampled_gt_ind = matches if is_cascade else paddle.gather(matches,\n                                                                  sampled_inds)\n        if gt_bbox.shape[0] > 0:\n            sampled_bbox = paddle.gather(gt_bbox, sampled_gt_ind)\n        else:\n            num = rois_per_image.shape[0]\n            sampled_bbox = paddle.zeros([num, 4], dtype='float32')\n\n        rois_per_image.stop_gradient = True\n        sampled_gt_ind.stop_gradient = True\n        sampled_bbox.stop_gradient = True\n        tgt_labels.append(sampled_gt_classes)\n        tgt_bboxes.append(sampled_bbox)\n        rois_with_gt.append(rois_per_image)\n        tgt_gt_inds.append(sampled_gt_ind)\n        new_rois_num.append(paddle.shape(sampled_inds)[0:1])\n    new_rois_num = paddle.concat(new_rois_num)\n    return rois_with_gt, tgt_labels, tgt_bboxes, tgt_gt_inds, new_rois_num\n\n\ndef sample_bbox(matches,\n                match_labels,\n                gt_classes,\n                batch_size_per_im,\n                fg_fraction,\n                num_classes,\n                use_random=True,\n                is_cascade=False):\n\n    n_gt = gt_classes.shape[0]\n    if n_gt == 0:\n        # No truth, assign everything to background\n        gt_classes = paddle.ones(matches.shape, dtype='int32') * num_classes\n        #return matches, match_labels + num_classes\n    else:\n        gt_classes = paddle.gather(gt_classes, matches)\n        gt_classes = 
paddle.where(match_labels == 0,\n                                  paddle.ones_like(gt_classes) * num_classes,\n                                  gt_classes)\n        gt_classes = paddle.where(match_labels == -1,\n                                  paddle.ones_like(gt_classes) * -1, gt_classes)\n    if is_cascade:\n        index = paddle.arange(matches.shape[0])\n        return index, gt_classes\n    rois_per_image = int(batch_size_per_im)\n\n    fg_inds, bg_inds = subsample_labels(gt_classes, rois_per_image, fg_fraction,\n                                        num_classes, use_random)\n    if fg_inds.shape[0] == 0 and bg_inds.shape[0] == 0:\n        # fake output labeled with -1 when all boxes are neither\n        # foreground nor background\n        sampled_inds = paddle.zeros([1], dtype='int32')\n    else:\n        sampled_inds = paddle.concat([fg_inds, bg_inds])\n    sampled_gt_classes = paddle.gather(gt_classes, sampled_inds)\n    return sampled_inds, sampled_gt_classes\n\n\ndef polygons_to_mask(polygons, height, width):\n    \"\"\"\n    Convert the polygons to mask format\n\n    Args:\n        polygons (list[ndarray]): each array has shape (Nx2,)\n        height (int): mask height\n        width (int): mask width\n    Returns:\n        ndarray: a bool mask of shape (height, width)\n    \"\"\"\n    import pycocotools.mask as mask_util\n    assert len(polygons) > 0, \"COCOAPI does not support empty polygons\"\n    rles = mask_util.frPyObjects(polygons, height, width)\n    rle = mask_util.merge(rles)\n    return mask_util.decode(rle).astype(np.bool_)\n\n\ndef rasterize_polygons_within_box(poly, box, resolution):\n    w, h = box[2] - box[0], box[3] - box[1]\n    polygons = [np.asarray(p, dtype=np.float64) for p in poly]\n    for p in polygons:\n        p[0::2] = p[0::2] - box[0]\n        p[1::2] = p[1::2] - box[1]\n\n    ratio_h = resolution / max(h, 0.1)\n    ratio_w = resolution / max(w, 0.1)\n\n    if ratio_h == ratio_w:\n        for p in polygons:\n            p *= ratio_h\n    else:\n        for p in polygons:\n            p[0::2] *= ratio_w\n            p[1::2] *= ratio_h\n\n    # 3. 
Rasterize the polygons with coco api\n    mask = polygons_to_mask(polygons, resolution, resolution)\n    mask = paddle.to_tensor(mask, dtype='int32')\n    return mask\n\n\ndef generate_mask_target(gt_segms, rois, labels_int32, sampled_gt_inds,\n                         num_classes, resolution):\n    mask_rois = []\n    mask_rois_num = []\n    tgt_masks = []\n    tgt_classes = []\n    mask_index = []\n    tgt_weights = []\n    for k in range(len(rois)):\n        labels_per_im = labels_int32[k]\n        # select rois labeled with foreground\n        fg_inds = paddle.nonzero(\n            paddle.logical_and(labels_per_im != -1, labels_per_im !=\n                               num_classes))\n        has_fg = True\n        # generate fake roi if foreground is empty\n        if fg_inds.numel() == 0:\n            has_fg = False\n            fg_inds = paddle.ones([1, 1], dtype='int64')\n        inds_per_im = sampled_gt_inds[k]\n        inds_per_im = paddle.gather(inds_per_im, fg_inds)\n\n        rois_per_im = rois[k]\n        fg_rois = paddle.gather(rois_per_im, fg_inds)\n        # Copy the foreground roi to cpu\n        # to generate mask target with ground-truth\n        boxes = fg_rois.numpy()\n        gt_segms_per_im = gt_segms[k]\n\n        new_segm = []\n        inds_per_im = inds_per_im.numpy()\n        if len(gt_segms_per_im) > 0:\n            for i in inds_per_im:\n                new_segm.append(gt_segms_per_im[i])\n        fg_inds_new = fg_inds.reshape([-1]).numpy()\n        results = []\n        if len(gt_segms_per_im) > 0:\n            for j in range(fg_inds_new.shape[0]):\n                results.append(\n                    rasterize_polygons_within_box(new_segm[j], boxes[j],\n                                                  resolution))\n        else:\n            results.append(paddle.ones([resolution, resolution], dtype='int32'))\n\n        fg_classes = paddle.gather(labels_per_im, fg_inds)\n        weight = paddle.ones([fg_rois.shape[0]], dtype='float32')\n        if not has_fg:\n            # now all sampled classes are background\n            # which will cause error in loss calculation,\n            # make fake classes with weight of 0.\n            fg_classes = paddle.zeros([1], dtype='int32')\n            weight = weight - 1\n        tgt_mask = paddle.stack(results)\n        tgt_mask.stop_gradient = True\n        fg_rois.stop_gradient = True\n\n        mask_index.append(fg_inds)\n        mask_rois.append(fg_rois)\n        mask_rois_num.append(paddle.shape(fg_rois)[0:1])\n        tgt_classes.append(fg_classes)\n        tgt_masks.append(tgt_mask)\n        tgt_weights.append(weight)\n\n    mask_index = paddle.concat(mask_index)\n    mask_rois_num = paddle.concat(mask_rois_num)\n    tgt_classes = paddle.concat(tgt_classes, axis=0)\n    tgt_masks = paddle.concat(tgt_masks, axis=0)\n    tgt_weights = paddle.concat(tgt_weights, axis=0)\n\n    return mask_rois, mask_rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights\n\n\ndef libra_sample_pos(max_overlaps, max_classes, pos_inds, num_expected):\n    if len(pos_inds) <= num_expected:\n        return pos_inds\n    else:\n        unique_gt_inds = np.unique(max_classes[pos_inds])\n        num_gts = len(unique_gt_inds)\n        num_per_gt = int(round(num_expected / float(num_gts)) + 1)\n\n        sampled_inds = []\n        for i in unique_gt_inds:\n            inds = np.nonzero(max_classes == i)[0]\n            before_len = len(inds)\n            inds = list(set(inds) & set(pos_inds))\n            after_len = len(inds)\n           
 if len(inds) > num_per_gt:\n                inds = np.random.choice(inds, size=num_per_gt, replace=False)\n            sampled_inds.extend(list(inds))  # combine as a new sampler\n        if len(sampled_inds) < num_expected:\n            num_extra = num_expected - len(sampled_inds)\n            extra_inds = np.array(list(set(pos_inds) - set(sampled_inds)))\n            assert len(sampled_inds) + len(extra_inds) == len(pos_inds), \\\n                \"sum of sampled_inds({}) and extra_inds({}) length must be equal with pos_inds({})!\".format(\n                    len(sampled_inds), len(extra_inds), len(pos_inds))\n            if len(extra_inds) > num_extra:\n                extra_inds = np.random.choice(\n                    extra_inds, size=num_extra, replace=False)\n            sampled_inds.extend(extra_inds.tolist())\n        elif len(sampled_inds) > num_expected:\n            sampled_inds = np.random.choice(\n                sampled_inds, size=num_expected, replace=False)\n        return paddle.to_tensor(sampled_inds)\n\n\ndef libra_sample_via_interval(max_overlaps, full_set, num_expected, floor_thr,\n                              num_bins, bg_thresh):\n    max_iou = max_overlaps.max()\n    iou_interval = (max_iou - floor_thr) / num_bins\n    per_num_expected = int(num_expected / num_bins)\n\n    sampled_inds = []\n    for i in range(num_bins):\n        start_iou = floor_thr + i * iou_interval\n        end_iou = floor_thr + (i + 1) * iou_interval\n\n        tmp_set = set(\n            np.where(\n                np.logical_and(max_overlaps >= start_iou, max_overlaps <\n                               end_iou))[0])\n        tmp_inds = list(tmp_set & full_set)\n\n        if len(tmp_inds) > per_num_expected:\n            tmp_sampled_set = np.random.choice(\n                tmp_inds, size=per_num_expected, replace=False)\n        else:\n            tmp_sampled_set = np.array(tmp_inds, dtype=np.int32)\n        sampled_inds.append(tmp_sampled_set)\n\n    sampled_inds = np.concatenate(sampled_inds)\n    if len(sampled_inds) < num_expected:\n        num_extra = num_expected - len(sampled_inds)\n        extra_inds = np.array(list(full_set - set(sampled_inds)))\n        assert len(sampled_inds) + len(extra_inds) == len(full_set), \\\n            \"sum of sampled_inds({}) and extra_inds({}) length must be equal with full_set({})!\".format(\n                len(sampled_inds), len(extra_inds), len(full_set))\n\n        if len(extra_inds) > num_extra:\n            extra_inds = np.random.choice(extra_inds, num_extra, replace=False)\n        sampled_inds = np.concatenate([sampled_inds, extra_inds])\n\n    return sampled_inds\n\n\ndef libra_sample_neg(max_overlaps,\n                     max_classes,\n                     neg_inds,\n                     num_expected,\n                     floor_thr=-1,\n                     floor_fraction=0,\n                     num_bins=3,\n                     bg_thresh=0.5):\n    if len(neg_inds) <= num_expected:\n        return neg_inds\n    else:\n        # balance sampling for negative samples\n        neg_set = set(neg_inds.tolist())\n        if floor_thr > 0:\n            floor_set = set(\n                np.where(\n                    np.logical_and(max_overlaps >= 0, max_overlaps < floor_thr))\n                [0])\n            iou_sampling_set = set(np.where(max_overlaps >= floor_thr)[0])\n        elif floor_thr == 0:\n            floor_set = set(np.where(max_overlaps == 0)[0])\n            iou_sampling_set = set(np.where(max_overlaps > floor_thr)[0])\n        
else:\n            floor_set = set()\n            iou_sampling_set = set(np.where(max_overlaps > floor_thr)[0])\n            floor_thr = 0\n\n        floor_neg_inds = list(floor_set & neg_set)\n        iou_sampling_neg_inds = list(iou_sampling_set & neg_set)\n\n        num_expected_iou_sampling = int(num_expected * (1 - floor_fraction))\n        if len(iou_sampling_neg_inds) > num_expected_iou_sampling:\n            if num_bins >= 2:\n                iou_sampled_inds = libra_sample_via_interval(\n                    max_overlaps,\n                    set(iou_sampling_neg_inds), num_expected_iou_sampling,\n                    floor_thr, num_bins, bg_thresh)\n            else:\n                iou_sampled_inds = np.random.choice(\n                    iou_sampling_neg_inds,\n                    size=num_expected_iou_sampling,\n                    replace=False)\n        else:\n            iou_sampled_inds = np.array(iou_sampling_neg_inds, dtype=np.int32)\n        num_expected_floor = num_expected - len(iou_sampled_inds)\n        if len(floor_neg_inds) > num_expected_floor:\n            sampled_floor_inds = np.random.choice(\n                floor_neg_inds, size=num_expected_floor, replace=False)\n        else:\n            sampled_floor_inds = np.array(floor_neg_inds, dtype=np.int32)\n        sampled_inds = np.concatenate((sampled_floor_inds, iou_sampled_inds))\n        if len(sampled_inds) < num_expected:\n            num_extra = num_expected - len(sampled_inds)\n            extra_inds = np.array(list(neg_set - set(sampled_inds)))\n            if len(extra_inds) > num_extra:\n                extra_inds = np.random.choice(\n                    extra_inds, size=num_extra, replace=False)\n            sampled_inds = np.concatenate((sampled_inds, extra_inds))\n        return paddle.to_tensor(sampled_inds)\n\n\ndef libra_label_box(anchors, gt_boxes, gt_classes, positive_overlap,\n                    negative_overlap, num_classes):\n    # TODO: use paddle API to speed up\n    gt_classes = gt_classes.numpy()\n    gt_overlaps = np.zeros((anchors.shape[0], num_classes))\n    matches = np.zeros((anchors.shape[0]), dtype=np.int32)\n    if len(gt_boxes) > 0:\n        proposal_to_gt_overlaps = bbox_overlaps(anchors, gt_boxes).numpy()\n        overlaps_argmax = proposal_to_gt_overlaps.argmax(axis=1)\n        overlaps_max = proposal_to_gt_overlaps.max(axis=1)\n        # Boxes which with non-zero overlap with gt boxes\n        overlapped_boxes_ind = np.where(overlaps_max > 0)[0]\n        overlapped_boxes_gt_classes = gt_classes[overlaps_argmax[\n            overlapped_boxes_ind]]\n\n        for idx in range(len(overlapped_boxes_ind)):\n            gt_overlaps[overlapped_boxes_ind[idx], overlapped_boxes_gt_classes[\n                idx]] = overlaps_max[overlapped_boxes_ind[idx]]\n            matches[overlapped_boxes_ind[idx]] = overlaps_argmax[\n                overlapped_boxes_ind[idx]]\n\n    gt_overlaps = paddle.to_tensor(gt_overlaps)\n    matches = paddle.to_tensor(matches)\n\n    matched_vals = paddle.max(gt_overlaps, axis=1)\n    match_labels = paddle.full(matches.shape, -1, dtype='int32')\n    match_labels = paddle.where(matched_vals < negative_overlap,\n                                paddle.zeros_like(match_labels), match_labels)\n    match_labels = paddle.where(matched_vals >= positive_overlap,\n                                paddle.ones_like(match_labels), match_labels)\n\n    return matches, match_labels, matched_vals\n\n\ndef libra_sample_bbox(matches,\n                      match_labels,\n      
                matched_vals,\n                      gt_classes,\n                      batch_size_per_im,\n                      num_classes,\n                      fg_fraction,\n                      fg_thresh,\n                      bg_thresh,\n                      num_bins,\n                      use_random=True,\n                      is_cascade_rcnn=False):\n    rois_per_image = int(batch_size_per_im)\n    fg_rois_per_im = int(np.round(fg_fraction * rois_per_image))\n    bg_rois_per_im = rois_per_image - fg_rois_per_im\n\n    if is_cascade_rcnn:\n        # In cascade rcnn, keep all foreground/background candidates\n        # without subsampling\n        fg_inds = paddle.nonzero(matched_vals >= fg_thresh).flatten()\n        bg_inds = paddle.nonzero(matched_vals < bg_thresh).flatten()\n    else:\n        matched_vals_np = matched_vals.numpy()\n        match_labels_np = match_labels.numpy()\n\n        # sample fg\n        fg_inds = paddle.nonzero(matched_vals >= fg_thresh).flatten()\n        fg_nums = int(np.minimum(fg_rois_per_im, fg_inds.shape[0]))\n        if (fg_inds.shape[0] > fg_nums) and use_random:\n            fg_inds = libra_sample_pos(matched_vals_np, match_labels_np,\n                                       fg_inds.numpy(), fg_rois_per_im)\n        fg_inds = fg_inds[:fg_nums]\n\n        # sample bg\n        bg_inds = paddle.nonzero(matched_vals < bg_thresh).flatten()\n        bg_nums = int(np.minimum(rois_per_image - fg_nums, bg_inds.shape[0]))\n        if (bg_inds.shape[0] > bg_nums) and use_random:\n            bg_inds = libra_sample_neg(\n                matched_vals_np,\n                match_labels_np,\n                bg_inds.numpy(),\n                bg_rois_per_im,\n                num_bins=num_bins,\n                bg_thresh=bg_thresh)\n        bg_inds = bg_inds[:bg_nums]\n\n    sampled_inds = paddle.concat([fg_inds, bg_inds])\n\n    gt_classes = paddle.gather(gt_classes, matches)\n    gt_classes = paddle.where(match_labels == 0,\n                              paddle.ones_like(gt_classes) * num_classes,\n                              gt_classes)\n    gt_classes = paddle.where(match_labels == -1,\n                              paddle.ones_like(gt_classes) * -1, gt_classes)\n    sampled_gt_classes = paddle.gather(gt_classes, sampled_inds)\n\n    return sampled_inds, sampled_gt_classes\n\n\ndef libra_generate_proposal_target(rpn_rois,\n                                   gt_classes,\n                                   gt_boxes,\n                                   batch_size_per_im,\n                                   fg_fraction,\n                                   fg_thresh,\n                                   bg_thresh,\n                                   num_classes,\n                                   use_random=True,\n                                   is_cascade_rcnn=False,\n                                   max_overlaps=None,\n                                   num_bins=3):\n\n    rois_with_gt = []\n    tgt_labels = []\n    tgt_bboxes = []\n    sampled_max_overlaps = []\n    tgt_gt_inds = []\n    new_rois_num = []\n\n    for i, rpn_roi in enumerate(rpn_rois):\n        max_overlap = max_overlaps[i] if is_cascade_rcnn else None\n        gt_bbox = gt_boxes[i]\n        gt_class = paddle.squeeze(gt_classes[i], axis=-1)\n        if is_cascade_rcnn:\n            rpn_roi = filter_roi(rpn_roi, max_overlap)\n        bbox = paddle.concat([rpn_roi, gt_bbox])\n\n        # Step1: label bbox\n        matches, match_labels, matched_vals = libra_label_box(\n            bbox, gt_bbox, gt_class, fg_thresh, bg_thresh, num_classes)\n\n        # Step2: sample bbox\n 
       sampled_inds, sampled_gt_classes = libra_sample_bbox(\n            matches, match_labels, matched_vals, gt_class, batch_size_per_im,\n            num_classes, fg_fraction, fg_thresh, bg_thresh, num_bins,\n            use_random, is_cascade_rcnn)\n\n        # Step3: make output\n        rois_per_image = paddle.gather(bbox, sampled_inds)\n        sampled_gt_ind = paddle.gather(matches, sampled_inds)\n        sampled_bbox = paddle.gather(gt_bbox, sampled_gt_ind)\n        sampled_overlap = paddle.gather(matched_vals, sampled_inds)\n\n        rois_per_image.stop_gradient = True\n        sampled_gt_ind.stop_gradient = True\n        sampled_bbox.stop_gradient = True\n        sampled_overlap.stop_gradient = True\n\n        tgt_labels.append(sampled_gt_classes)\n        tgt_bboxes.append(sampled_bbox)\n        rois_with_gt.append(rois_per_image)\n        sampled_max_overlaps.append(sampled_overlap)\n        tgt_gt_inds.append(sampled_gt_ind)\n        new_rois_num.append(paddle.shape(sampled_inds)[0:1])\n    new_rois_num = paddle.concat(new_rois_num)\n    # rois_with_gt, tgt_labels, tgt_bboxes, tgt_gt_inds, new_rois_num\n    return rois_with_gt, tgt_labels, tgt_bboxes, tgt_gt_inds, new_rois_num\n"
  },
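  {
    "path": "ppdet/modeling/proposal_generator/_sketch_subsample_labels.py",
    "content": "# Illustrative sketch added by the editor; NOT part of PaddleDetection.\n# A simplified re-statement of target.subsample_labels: given per-anchor\n# labels (1 = fg, 0 = bg, -1 = ignore), pick at most\n# num_samples * fg_fraction foreground indices and fill the remainder\n# with background indices. The toy labels below are made up.\nimport paddle\n\n\ndef sample_fg_bg(labels, num_samples, fg_fraction, use_random=True):\n    positive = paddle.nonzero(labels == 1).flatten()\n    negative = paddle.nonzero(labels == 0).flatten()\n    fg_num = min(int(num_samples * fg_fraction), int(positive.shape[0]))\n    bg_num = min(num_samples - fg_num, int(negative.shape[0]))\n    if use_random:\n        fg_perm = paddle.randperm(positive.shape[0])[:fg_num]\n        bg_perm = paddle.randperm(negative.shape[0])[:bg_num]\n        return paddle.gather(positive, fg_perm), paddle.gather(negative,\n                                                               bg_perm)\n    return positive[:fg_num], negative[:bg_num]\n\n\nif __name__ == '__main__':\n    labels = paddle.to_tensor([1, 0, 0, -1, 1, 0, 0, 0], dtype='int32')\n    fg_inds, bg_inds = sample_fg_bg(labels, num_samples=4, fg_fraction=0.5)\n    print(fg_inds.numpy(), bg_inds.numpy())  # 2 fg indices, 2 bg indices\n"
  },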
  {
    "path": "ppdet/modeling/proposal_generator/target_layer.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\nimport sys\nimport paddle\nfrom ppdet.core.workspace import register, serializable\n\nfrom .target import rpn_anchor_target, generate_proposal_target, generate_mask_target, libra_generate_proposal_target\nimport numpy as np\n\n\n@register\n@serializable\nclass RPNTargetAssign(object):\n    __shared__ = ['assign_on_cpu']\n    \"\"\"\n    RPN targets assignment module\n\n    The assignment consists of three steps:\n        1. Match anchor and ground-truth box, label the anchor with foreground\n           or background sample\n        2. Sample anchors to keep the properly ratio between foreground and \n           background\n        3. Generate the targets for classification and regression branch\n\n\n    Args:\n        batch_size_per_im (int): Total number of RPN samples per image. \n            default 256\n        fg_fraction (float): Fraction of anchors that is labeled\n            foreground, default 0.5\n        positive_overlap (float): Minimum overlap required between an anchor\n            and ground-truth box for the (anchor, gt box) pair to be \n            a foreground sample. default 0.7\n        negative_overlap (float): Maximum overlap allowed between an anchor\n            and ground-truth box for the (anchor, gt box) pair to be \n            a background sample. 
default 0.3\n        ignore_thresh (float): Threshold for ignoring the is_crowd ground-truth\n            if the value is larger than zero.\n        use_random (bool): Use random sampling to choose foreground and \n            background boxes, default true.\n        assign_on_cpu (bool): In case the number of gt box is too large, \n            compute IoU on CPU, default false.\n    \"\"\"\n\n    def __init__(self,\n                 batch_size_per_im=256,\n                 fg_fraction=0.5,\n                 positive_overlap=0.7,\n                 negative_overlap=0.3,\n                 ignore_thresh=-1.,\n                 use_random=True,\n                 assign_on_cpu=False):\n        super(RPNTargetAssign, self).__init__()\n        self.batch_size_per_im = batch_size_per_im\n        self.fg_fraction = fg_fraction\n        self.positive_overlap = positive_overlap\n        self.negative_overlap = negative_overlap\n        self.ignore_thresh = ignore_thresh\n        self.use_random = use_random\n        self.assign_on_cpu = assign_on_cpu\n\n    def __call__(self, inputs, anchors):\n        \"\"\"\n        inputs: ground-truth instances.\n        anchors (Tensor): [num_anchors, 4], num_anchors are all anchors in all feature maps.\n        \"\"\"\n        gt_boxes = inputs['gt_bbox']\n        is_crowd = inputs.get('is_crowd', None)\n        batch_size = len(gt_boxes)\n        tgt_labels, tgt_bboxes, tgt_deltas = rpn_anchor_target(\n            anchors,\n            gt_boxes,\n            self.batch_size_per_im,\n            self.positive_overlap,\n            self.negative_overlap,\n            self.fg_fraction,\n            self.use_random,\n            batch_size,\n            self.ignore_thresh,\n            is_crowd,\n            assign_on_cpu=self.assign_on_cpu)\n        norm = self.batch_size_per_im * batch_size\n\n        return tgt_labels, tgt_bboxes, tgt_deltas, norm\n\n\n@register\nclass BBoxAssigner(object):\n    __shared__ = ['num_classes', 'assign_on_cpu']\n    \"\"\"\n    RCNN targets assignment module\n\n    The assignment consists of three steps:\n        1. Match RoIs and ground-truth box, label the RoIs with foreground\n           or background sample\n        2. Sample RoIs to keep a proper ratio between foreground and \n           background\n        3. Generate the targets for classification and regression branch\n\n    Args:\n        batch_size_per_im (int): Total number of RoIs per image. \n            default 512 \n        fg_fraction (float): Fraction of RoIs that are labeled\n            foreground, default 0.25\n        fg_thresh (float): Minimum overlap required between a RoI\n            and ground-truth box for the (roi, gt box) pair to be\n            a foreground sample. default 0.5\n        bg_thresh (float): Maximum overlap allowed between a RoI\n            and ground-truth box for the (roi, gt box) pair to be\n            a background sample. 
default 0.5\n        ignore_thresh (float): Threshold for ignoring the is_crowd ground-truth\n            if the value is larger than zero.\n        use_random (bool): Use random sampling to choose foreground and \n            background boxes, default true\n        cascade_iou (list[iou]): The list of overlap to select foreground and\n            background of each stage, which is only used in Cascade RCNN.\n        num_classes (int): The number of classes.\n        assign_on_cpu (bool): In case the number of gt box is too large, \n            compute IoU on CPU, default false.\n    \"\"\"\n\n    def __init__(self,\n                 batch_size_per_im=512,\n                 fg_fraction=.25,\n                 fg_thresh=.5,\n                 bg_thresh=.5,\n                 ignore_thresh=-1.,\n                 use_random=True,\n                 cascade_iou=[0.5, 0.6, 0.7],\n                 num_classes=80,\n                 assign_on_cpu=False):\n        super(BBoxAssigner, self).__init__()\n        self.batch_size_per_im = batch_size_per_im\n        self.fg_fraction = fg_fraction\n        self.fg_thresh = fg_thresh\n        self.bg_thresh = bg_thresh\n        self.ignore_thresh = ignore_thresh\n        self.use_random = use_random\n        self.cascade_iou = cascade_iou\n        self.num_classes = num_classes\n        self.assign_on_cpu = assign_on_cpu\n\n    def __call__(self,\n                 rpn_rois,\n                 rpn_rois_num,\n                 inputs,\n                 stage=0,\n                 is_cascade=False,\n                 add_gt_as_proposals=True):\n        gt_classes = inputs['gt_class']\n        gt_boxes = inputs['gt_bbox']\n        is_crowd = inputs.get('is_crowd', None)\n        # rois, tgt_labels, tgt_bboxes, tgt_gt_inds\n        # new_rois_num\n        outs = generate_proposal_target(\n            rpn_rois, gt_classes, gt_boxes, self.batch_size_per_im,\n            self.fg_fraction, self.fg_thresh, self.bg_thresh, self.num_classes,\n            self.ignore_thresh, is_crowd, self.use_random, is_cascade,\n            self.cascade_iou[stage], self.assign_on_cpu, add_gt_as_proposals)\n        rois = outs[0]\n        rois_num = outs[-1]\n        # tgt_labels, tgt_bboxes, tgt_gt_inds\n        targets = outs[1:4]\n        return rois, rois_num, targets\n\n\n@register\nclass BBoxLibraAssigner(object):\n    __shared__ = ['num_classes']\n    \"\"\"\n    Libra-RCNN targets assignment module\n\n    The assignment consists of three steps:\n        1. Match RoIs and ground-truth box, label the RoIs with foreground\n           or background sample\n        2. Sample RoIs to keep a proper ratio between foreground and\n           background\n        3. Generate the targets for classification and regression branch\n\n    Args:\n        batch_size_per_im (int): Total number of RoIs per image.\n            default 512\n        fg_fraction (float): Fraction of RoIs that are labeled\n            foreground, default 0.25\n        fg_thresh (float): Minimum overlap required between a RoI\n            and ground-truth box for the (roi, gt box) pair to be\n            a foreground sample. default 0.5\n        bg_thresh (float): Maximum overlap allowed between a RoI\n            and ground-truth box for the (roi, gt box) pair to be\n            a background sample. 
default 0.5\n        use_random (bool): Use random sampling to choose foreground and\n            background boxes, default true\n        cascade_iou (list[iou]): The list of overlap to select foreground and\n            background of each stage, which is only used in Cascade RCNN.\n        num_classes (int): The number of classes.\n        num_bins (int): The number of bins used in libra sampling.\n    \"\"\"\n\n    def __init__(self,\n                 batch_size_per_im=512,\n                 fg_fraction=.25,\n                 fg_thresh=.5,\n                 bg_thresh=.5,\n                 use_random=True,\n                 cascade_iou=[0.5, 0.6, 0.7],\n                 num_classes=80,\n                 num_bins=3):\n        super(BBoxLibraAssigner, self).__init__()\n        self.batch_size_per_im = batch_size_per_im\n        self.fg_fraction = fg_fraction\n        self.fg_thresh = fg_thresh\n        self.bg_thresh = bg_thresh\n        self.use_random = use_random\n        self.cascade_iou = cascade_iou\n        self.num_classes = num_classes\n        self.num_bins = num_bins\n\n    def __call__(self,\n                 rpn_rois,\n                 rpn_rois_num,\n                 inputs,\n                 stage=0,\n                 is_cascade=False):\n        gt_classes = inputs['gt_class']\n        gt_boxes = inputs['gt_bbox']\n        # rois, tgt_labels, tgt_bboxes, tgt_gt_inds\n        outs = libra_generate_proposal_target(\n            rpn_rois, gt_classes, gt_boxes, self.batch_size_per_im,\n            self.fg_fraction, self.fg_thresh, self.bg_thresh, self.num_classes,\n            self.use_random, is_cascade, self.cascade_iou[stage], self.num_bins)\n        rois = outs[0]\n        rois_num = outs[-1]\n        # tgt_labels, tgt_bboxes, tgt_gt_inds\n        targets = outs[1:4]\n        return rois, rois_num, targets\n\n\n@register\n@serializable\nclass MaskAssigner(object):\n    __shared__ = ['num_classes', 'mask_resolution']\n    \"\"\"\n    Mask targets assignment module\n\n    The assignment consists of two steps:\n        1. Select RoIs labeled as foreground.\n        2. 
Encode the RoIs and corresponding gt polygons to generate \n           mask target\n\n    Args:\n        num_classes (int): The number of classes\n        mask_resolution (int): The resolution of mask target, default 14\n    \"\"\"\n\n    def __init__(self, num_classes=80, mask_resolution=14):\n        super(MaskAssigner, self).__init__()\n        self.num_classes = num_classes\n        self.mask_resolution = mask_resolution\n\n    def __call__(self, rois, tgt_labels, tgt_gt_inds, inputs):\n        gt_segms = inputs['gt_poly']\n\n        outs = generate_mask_target(gt_segms, rois, tgt_labels, tgt_gt_inds,\n                                    self.num_classes, self.mask_resolution)\n\n        # mask_rois, mask_rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights\n        return outs\n\n\n@register\nclass RBoxAssigner(object):\n    \"\"\"\n    Assigner of rbox\n    Args:\n        pos_iou_thr (float): threshold of pos samples\n        neg_iou_thr (float): threshold of neg samples\n        min_iou_thr (float): the min threshold of samples\n        ignore_iof_thr (int): the ignored threshold\n    \"\"\"\n\n    def __init__(self,\n                 pos_iou_thr=0.5,\n                 neg_iou_thr=0.4,\n                 min_iou_thr=0.0,\n                 ignore_iof_thr=-2):\n        super(RBoxAssigner, self).__init__()\n\n        self.pos_iou_thr = pos_iou_thr\n        self.neg_iou_thr = neg_iou_thr\n        self.min_iou_thr = min_iou_thr\n        self.ignore_iof_thr = ignore_iof_thr\n\n    def anchor_valid(self, anchors):\n        \"\"\"\n        Return the indices of all anchors; every anchor is treated as valid.\n\n        Args:\n            anchors: M x 4 anchor boxes (np.ndarray)\n\n        Returns:\n            anchor_inds: indices of anchors, shape (M,)\n        \"\"\"\n        if anchors.ndim == 3:\n            anchors = anchors.reshape(-1, anchors.shape[-1])\n        assert anchors.ndim == 2\n        anchor_num = anchors.shape[0]\n        anchor_inds = np.arange(anchor_num)\n        return anchor_inds\n\n    def rbox2delta(self,\n                   proposals,\n                   gt,\n                   means=[0, 0, 0, 0, 0],\n                   stds=[1, 1, 1, 1, 1]):\n        \"\"\"\n        Args:\n            proposals: np.ndarray [N, 5]\n            gt: np.ndarray [N, 5]\n            means: means [5]\n            stds: stds [5]\n        Returns:\n            deltas: np.ndarray [N, 5], normalized (dx, dy, dw, dh, da)\n        \"\"\"\n        proposals = proposals.astype(np.float64)\n\n        PI = np.pi\n\n        gt_widths = gt[..., 2]\n        gt_heights = gt[..., 3]\n        gt_angle = gt[..., 4]\n\n        proposals_widths = proposals[..., 2]\n        proposals_heights = proposals[..., 3]\n        proposals_angle = proposals[..., 4]\n\n        coord = gt[..., 0:2] - proposals[..., 0:2]\n        dx = (np.cos(proposals[..., 4]) * coord[..., 0] +\n              np.sin(proposals[..., 4]) * coord[..., 1]) / proposals_widths\n        dy = (-np.sin(proposals[..., 4]) * coord[..., 0] +\n              np.cos(proposals[..., 4]) * coord[..., 1]) / proposals_heights\n        dw = np.log(gt_widths / proposals_widths)\n        dh = np.log(gt_heights / proposals_heights)\n        da = (gt_angle - proposals_angle)\n\n        da = (da + PI / 4) % PI - PI / 4\n        da /= PI\n\n        deltas = np.stack([dx, dy, dw, dh, da], axis=-1)\n        means = np.array(means, dtype=deltas.dtype)\n        stds = np.array(stds, dtype=deltas.dtype)\n        deltas = (deltas - means) / stds\n        deltas = deltas.astype(np.float32)\n        return deltas\n\n    def assign_anchor(self,\n                      anchors,\n                      gt_bboxes,\n                      gt_labels,\n         
             pos_iou_thr,\n                      neg_iou_thr,\n                      min_iou_thr=0.0,\n                      ignore_iof_thr=-2):\n        assert anchors.shape[1] == 4 or anchors.shape[1] == 5\n        assert gt_bboxes.shape[1] == 4 or gt_bboxes.shape[1] == 5\n        anchors_xc_yc = anchors\n        gt_bboxes_xc_yc = gt_bboxes\n\n        # calc rbox iou\n        anchors_xc_yc = anchors_xc_yc.astype(np.float32)\n        gt_bboxes_xc_yc = gt_bboxes_xc_yc.astype(np.float32)\n        anchors_xc_yc = paddle.to_tensor(anchors_xc_yc)\n        gt_bboxes_xc_yc = paddle.to_tensor(gt_bboxes_xc_yc)\n\n        try:\n            from ext_op import rbox_iou\n        except Exception as e:\n            print(\"import custom_ops error, try install ext_op \" \\\n                  \"following ppdet/ext_op/README.md\", e)\n            sys.stdout.flush()\n            sys.exit(-1)\n\n        iou = rbox_iou(gt_bboxes_xc_yc, anchors_xc_yc)\n        iou = iou.numpy()\n        iou = iou.T\n\n        # index of the best-matching anchor for each gt\n        gt_bbox_anchor_inds = iou.argmax(axis=0)\n        gt_bbox_anchor_iou = iou[gt_bbox_anchor_inds, np.arange(iou.shape[1])]\n        gt_bbox_anchor_iou_inds = np.where(iou == gt_bbox_anchor_iou)[0]\n\n        # index of the best-matching gt bbox for each anchor\n        anchor_gt_bbox_inds = iou.argmax(axis=1)\n        anchor_gt_bbox_iou = iou[np.arange(iou.shape[0]), anchor_gt_bbox_inds]\n\n        # (1) fill labels with ignore_iof_thr (-2) as default\n        labels = np.ones((iou.shape[0], ), dtype=np.int32) * ignore_iof_thr\n\n        # (2) assign ignore\n        labels[anchor_gt_bbox_iou < min_iou_thr] = ignore_iof_thr\n\n        # (3) assign neg_ids -1\n        assign_neg_ids1 = anchor_gt_bbox_iou >= min_iou_thr\n        assign_neg_ids2 = anchor_gt_bbox_iou < neg_iou_thr\n        assign_neg_ids = np.logical_and(assign_neg_ids1, assign_neg_ids2)\n        labels[assign_neg_ids] = -1\n\n        # anchor_gt_bbox_iou_inds\n        # (4) assign max_iou as pos_ids >=0\n        anchor_gt_bbox_iou_inds = anchor_gt_bbox_inds[gt_bbox_anchor_iou_inds]\n        # gt_bbox_anchor_iou_inds = np.logical_and(gt_bbox_anchor_iou_inds, anchor_gt_bbox_iou >= min_iou_thr)\n        labels[gt_bbox_anchor_iou_inds] = gt_labels[anchor_gt_bbox_iou_inds]\n\n        # (5) assign >= pos_iou_thr as pos_ids\n        iou_pos_iou_thr_ids = anchor_gt_bbox_iou >= pos_iou_thr\n        iou_pos_iou_thr_ids_box_inds = anchor_gt_bbox_inds[iou_pos_iou_thr_ids]\n        labels[iou_pos_iou_thr_ids] = gt_labels[iou_pos_iou_thr_ids_box_inds]\n        return anchor_gt_bbox_inds, anchor_gt_bbox_iou, labels\n\n    def __call__(self, anchors, gt_bboxes, gt_labels, is_crowd):\n\n        assert anchors.ndim == 2\n        assert anchors.shape[1] == 5\n        assert gt_bboxes.ndim == 2\n        assert gt_bboxes.shape[1] == 5\n\n        pos_iou_thr = self.pos_iou_thr\n        neg_iou_thr = self.neg_iou_thr\n        min_iou_thr = self.min_iou_thr\n        ignore_iof_thr = self.ignore_iof_thr\n\n        # Step1: match anchor and gt_bbox\n        anchor_gt_bbox_inds, anchor_gt_bbox_iou, labels = self.assign_anchor(\n            anchors, gt_bboxes,\n            gt_labels.reshape(-1), pos_iou_thr, neg_iou_thr, min_iou_thr,\n            ignore_iof_thr)\n\n        # Step2: sample anchor\n        pos_inds = np.where(labels >= 0)[0]\n        neg_inds = np.where(labels == -1)[0]\n\n        # Step3: make 
output\n        bbox_targets = np.zeros_like(anchors)\n        bbox_weights = np.zeros_like(anchors)\n        bbox_gt_bboxes = np.zeros_like(anchors)\n        pos_labels = np.zeros(anchor_num, dtype=np.int32)\n        pos_labels_weights = np.zeros(anchor_num, dtype=np.float32)\n\n        pos_sampled_anchors = anchors[pos_inds]\n        pos_sampled_gt_boxes = gt_bboxes[anchor_gt_bbox_inds[pos_inds]]\n        if len(pos_inds) > 0:\n            pos_bbox_targets = self.rbox2delta(pos_sampled_anchors,\n                                               pos_sampled_gt_boxes)\n            bbox_targets[pos_inds, :] = pos_bbox_targets\n            bbox_gt_bboxes[pos_inds, :] = pos_sampled_gt_boxes\n            bbox_weights[pos_inds, :] = 1.0\n\n            pos_labels[pos_inds] = labels[pos_inds]\n            pos_labels_weights[pos_inds] = 1.0\n\n        if len(neg_inds) > 0:\n            pos_labels_weights[neg_inds] = 1.0\n        return (pos_labels, pos_labels_weights, bbox_targets, bbox_weights,\n                bbox_gt_bboxes, pos_inds, neg_inds)\n
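\n\n# NOTE: a minimal, hypothetical usage sketch of RBoxAssigner (added note, not\n# part of the original API): anchors and gt_bboxes are numpy arrays in\n# [cx, cy, w, h, angle] form.\n#\n#   assigner = RBoxAssigner(pos_iou_thr=0.5, neg_iou_thr=0.4)\n#   (labels, label_weights, bbox_targets, bbox_weights, bbox_gt_bboxes,\n#    pos_inds, neg_inds) = assigner(anchors, gt_bboxes, gt_labels, is_crowd)\n#\n# labels and label_weights have shape [num_anchors]; bbox_targets,\n# bbox_weights and bbox_gt_bboxes have shape [num_anchors, 5] and are\n# non-zero only at positive anchors.\n"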
  },
  {
    "path": "ppdet/modeling/rbox_utils.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport math\nimport paddle\nimport numpy as np\nimport cv2\n\n\ndef norm_angle(angle, range=[-np.pi / 4, np.pi]):\n    return (angle - range[0]) % range[1] + range[0]\n\n\n# rbox function implemented using numpy\ndef poly2rbox_le135_np(poly):\n    \"\"\"convert poly to rbox [-pi / 4, 3 * pi / 4]\n\n    Args:\n        poly: [x1, y1, x2, y2, x3, y3, x4, y4]\n\n    Returns:\n        rbox: [cx, cy, w, h, angle]\n    \"\"\"\n    poly = np.array(poly[:8], dtype=np.float32)\n\n    pt1 = (poly[0], poly[1])\n    pt2 = (poly[2], poly[3])\n    pt3 = (poly[4], poly[5])\n    pt4 = (poly[6], poly[7])\n\n    edge1 = np.sqrt((pt1[0] - pt2[0]) * (pt1[0] - pt2[0]) + (pt1[1] - pt2[1]) *\n                    (pt1[1] - pt2[1]))\n    edge2 = np.sqrt((pt2[0] - pt3[0]) * (pt2[0] - pt3[0]) + (pt2[1] - pt3[1]) *\n                    (pt2[1] - pt3[1]))\n\n    width = max(edge1, edge2)\n    height = min(edge1, edge2)\n\n    rbox_angle = 0\n    if edge1 > edge2:\n        rbox_angle = np.arctan2(float(pt2[1] - pt1[1]), float(pt2[0] - pt1[0]))\n    elif edge2 >= edge1:\n        rbox_angle = np.arctan2(float(pt4[1] - pt1[1]), float(pt4[0] - pt1[0]))\n\n    rbox_angle = norm_angle(rbox_angle)\n\n    x_ctr = float(pt1[0] + pt3[0]) / 2\n    y_ctr = float(pt1[1] + pt3[1]) / 2\n    return [x_ctr, y_ctr, width, height, rbox_angle]\n\n\ndef poly2rbox_oc_np(poly):\n    \"\"\"convert poly to rbox (0, pi / 2]\n\n    Args:\n        poly: [x1, y1, x2, y2, x3, y3, x4, y4]\n\n    Returns:\n        rbox: [cx, cy, w, h, angle]\n    \"\"\"\n    points = np.array(poly, dtype=np.float32).reshape((-1, 2))\n    (cx, cy), (w, h), angle = cv2.minAreaRect(points)\n    # using the new OpenCV Rotated BBox definition since 4.5.1\n    # if angle < 0, opencv is older than 4.5.1, angle is in [-90, 0)\n    if angle < 0:\n        angle += 90\n        w, h = h, w\n\n    # convert angle to [0, 90)\n    if angle == -0.0:\n        angle = 0.0\n    if angle == 90.0:\n        angle = 0.0\n        w, h = h, w\n\n    angle = angle / 180 * np.pi\n    return [cx, cy, w, h, angle]\n\n\ndef poly2rbox_np(polys, rbox_type='oc'):\n    \"\"\"\n    polys: [x0,y0,x1,y1,x2,y2,x3,y3]\n    to\n    rboxes: [x_ctr,y_ctr,w,h,angle]\n    \"\"\"\n    assert rbox_type in ['oc', 'le135'], 'only oc or le135 is supported now'\n    poly2rbox_fn = poly2rbox_oc_np if rbox_type == 'oc' else poly2rbox_le135_np\n    rboxes = []\n    for poly in polys:\n        x, y, w, h, angle = poly2rbox_fn(poly)\n        rbox = np.array([x, y, w, h, angle], dtype=np.float32)\n        rboxes.append(rbox)\n\n    return np.array(rboxes)\n\n\ndef cal_line_length(point1, point2):\n    return math.sqrt(\n        math.pow(point1[0] - point2[0], 2) + math.pow(point1[1] - point2[1], 2))\n\n\ndef get_best_begin_point_single(coordinate):\n    x1, y1, x2, y2, x3, y3, x4, y4 = coordinate\n    xmin = min(x1, x2, x3, x4)\n    ymin = min(y1, y2, y3, y4)\n    xmax = max(x1, x2, x3, 
x4)\n    ymax = max(y1, y2, y3, y4)\n    combinate = [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]],\n                 [[x4, y4], [x1, y1], [x2, y2], [x3, y3]],\n                 [[x3, y3], [x4, y4], [x1, y1], [x2, y2]],\n                 [[x2, y2], [x3, y3], [x4, y4], [x1, y1]]]\n    dst_coordinate = [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]]\n    force = 100000000.0\n    force_flag = 0\n    for i in range(4):\n        temp_force = cal_line_length(combinate[i][0], dst_coordinate[0]) \\\n                     + cal_line_length(combinate[i][1], dst_coordinate[1]) \\\n                     + cal_line_length(combinate[i][2], dst_coordinate[2]) \\\n                     + cal_line_length(combinate[i][3], dst_coordinate[3])\n        if temp_force < force:\n            force = temp_force\n            force_flag = i\n    return np.array(combinate[force_flag]).reshape(8)\n\n\ndef rbox2poly_np(rboxes):\n    \"\"\"\n    rboxes:[x_ctr,y_ctr,w,h,angle]\n    to\n    poly:[x0,y0,x1,y1,x2,y2,x3,y3]\n    \"\"\"\n    polys = []\n    for i in range(len(rboxes)):\n        x_ctr, y_ctr, width, height, angle = rboxes[i][:5]\n        tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2\n        rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]])\n        R = np.array([[np.cos(angle), -np.sin(angle)],\n                      [np.sin(angle), np.cos(angle)]])\n        poly = R.dot(rect)\n        x0, x1, x2, x3 = poly[0, :4] + x_ctr\n        y0, y1, y2, y3 = poly[1, :4] + y_ctr\n        poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32)\n        poly = get_best_begin_point_single(poly)\n        polys.append(poly)\n    polys = np.array(polys)\n    return polys\n\n\n# rbox function implemented using paddle\ndef box2corners(box):\n    \"\"\"convert box coordinate to corners\n    Args:\n        box (Tensor): (B, N, 5) with (x, y, w, h, alpha), alpha in radians\n    Returns:\n        corners (Tensor): (B, N, 4, 2) with (x1, y1, x2, y2, x3, y3, x4, y4)\n    \"\"\"\n    B = box.shape[0]\n    x, y, w, h, alpha = paddle.split(box, 5, axis=-1)\n    x4 = paddle.to_tensor(\n        [0.5, 0.5, -0.5, -0.5], dtype=paddle.float32).reshape(\n            (1, 1, 4))  # (1,1,4)\n    x4 = x4 * w  # (B, N, 4)\n    y4 = paddle.to_tensor(\n        [-0.5, 0.5, 0.5, -0.5], dtype=paddle.float32).reshape((1, 1, 4))\n    y4 = y4 * h  # (B, N, 4)\n    corners = paddle.stack([x4, y4], axis=-1)  # (B, N, 4, 2)\n    sin = paddle.sin(alpha)\n    cos = paddle.cos(alpha)\n    row1 = paddle.concat([cos, sin], axis=-1)\n    row2 = paddle.concat([-sin, cos], axis=-1)  # (B, N, 2)\n    rot_T = paddle.stack([row1, row2], axis=-2)  # (B, N, 2, 2)\n    rotated = paddle.bmm(corners.reshape([-1, 4, 2]), rot_T.reshape([-1, 2, 2]))\n    rotated = rotated.reshape([B, -1, 4, 2])  # (B*N, 4, 2) -> (B, N, 4, 2)\n    rotated[..., 0] += x\n    rotated[..., 1] += y\n    return rotated\n\n\ndef paddle_gather(x, dim, index):\n    index_shape = index.shape\n    index_flatten = index.flatten()\n    if dim < 0:\n        dim = len(x.shape) + dim\n    nd_index = []\n    for k in range(len(x.shape)):\n        if k == dim:\n            nd_index.append(index_flatten)\n        else:\n            reshape_shape = [1] * len(x.shape)\n            reshape_shape[k] = x.shape[k]\n            x_arange = paddle.arange(x.shape[k], dtype=index.dtype)\n            x_arange = x_arange.reshape(reshape_shape)\n            dim_index = paddle.expand(x_arange, index_shape).flatten()\n            
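# each non-gathered axis contributes its own coordinate grid so that,\n            # together with the flattened query indices on dim, every row of the\n            # stacked index below addresses exactly one element of x\n            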
nd_index.append(dim_index)\n    ind2 = paddle.transpose(paddle.stack(nd_index), [1, 0]).astype(\"int64\")\n    paddle_out = paddle.gather_nd(x, ind2).reshape(index_shape)\n    return paddle_out\n\n\ndef check_points_in_polys(points, polys):\n    \"\"\"Check whether the points are inside the given polygons\n\n    Args:\n        points (tensor): (1, L, 2) anchor points\n        polys (tensor): [B, N, 4, 2] gt polygons\n\n    Returns:\n        is_in_polys (tensor): (B, N, L)\n    \"\"\"\n    # [1, L, 2] -> [1, 1, L, 2]\n    points = points.unsqueeze(0)\n    # split the corners: a, b, c, d are each [B, N, 1, 2]\n    a, b, c, d = polys.split(4, axis=2)\n    ab = b - a\n    ad = d - a\n    # [B, N, L, 2]\n    ap = points - a\n    # [B, N, 1]\n    norm_ab = paddle.sum(ab * ab, axis=-1)\n    # [B, N, 1]\n    norm_ad = paddle.sum(ad * ad, axis=-1)\n    # [B, N, L] dot product\n    ap_dot_ab = paddle.sum(ap * ab, axis=-1)\n    # [B, N, L] dot product\n    ap_dot_ad = paddle.sum(ap * ad, axis=-1)\n    # a point is inside iff 0 <= ap.ab <= |ab|^2 and 0 <= ap.ad <= |ad|^2\n    is_in_polys = (ap_dot_ab >= 0) & (ap_dot_ab <= norm_ab) & (\n        ap_dot_ad >= 0) & (ap_dot_ad <= norm_ad)\n    return is_in_polys\n\n\ndef check_points_in_rotated_boxes(points, boxes):\n    \"\"\"Check whether the points are inside the given rotated boxes\n\n    Args:\n        points (tensor): (1, L, 2) anchor points\n        boxes (tensor): [B, N, 5] gt rotated boxes\n\n    Returns:\n        is_in_box (tensor): (B, N, L)\n    \"\"\"\n    # [B, N, 5] -> [B, N, 4, 2]\n    corners = box2corners(boxes)\n    # [1, L, 2] -> [1, 1, L, 2]\n    points = points.unsqueeze(0)\n    # split the corners: a, b, c, d are each [B, N, 1, 2]\n    a, b, c, d = corners.split(4, axis=2)\n    ab = b - a\n    ad = d - a\n    # [B, N, L, 2]\n    ap = points - a\n    # [B, N, 1]\n    norm_ab = paddle.sum(ab * ab, axis=-1)\n    # [B, N, 1]\n    norm_ad = paddle.sum(ad * ad, axis=-1)\n    # [B, N, L] dot product\n    ap_dot_ab = paddle.sum(ap * ab, axis=-1)\n    # [B, N, L] dot product\n    ap_dot_ad = paddle.sum(ap * ad, axis=-1)\n    # a point is inside iff 0 <= ap.ab <= |ab|^2 and 0 <= ap.ad <= |ad|^2\n    is_in_box = (ap_dot_ab >= 0) & (ap_dot_ab <= norm_ab) & (ap_dot_ad >= 0) & (\n        ap_dot_ad <= norm_ad)\n    return is_in_box\n\n\ndef rotated_iou_similarity(box1, box2, eps=1e-9, func=''):\n    \"\"\"Calculate the rotated IoU of box1 and box2\n\n    Args:\n        box1 (Tensor): box with the shape [N, M1, 5]\n        box2 (Tensor): box with the shape [N, M2, 5]\n\n    Return:\n        iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2]\n    \"\"\"\n    from ext_op import rbox_iou\n    rotated_ious = []\n    for b1, b2 in zip(box1, box2):\n        rotated_ious.append(rbox_iou(b1, b2))\n\n    return paddle.stack(rotated_ious, axis=0)\n
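\n\n# NOTE: a minimal round-trip sketch (added note, not part of the original\n# module): polys is a [K, 8] array of corner points.\n#\n#   polys = np.array([[0., 0., 4., 0., 4., 2., 0., 2.]], dtype=np.float32)\n#   rboxes = poly2rbox_np(polys, rbox_type='oc')  # [K, 5] (cx, cy, w, h, angle)\n#   polys_rt = rbox2poly_np(rboxes)               # back to [K, 8] corner form\n"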
  },
  {
    "path": "ppdet/modeling/reid/__init__.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom . import jde_embedding_head\nfrom . import fairmot_embedding_head\nfrom . import resnet\nfrom . import pyramidal_embedding\nfrom . import pplcnet_embedding\nfrom . import resnet_embedding\n\nfrom .fairmot_embedding_head import *\nfrom .jde_embedding_head import *\nfrom .resnet import *\nfrom .pyramidal_embedding import *\nfrom .pplcnet_embedding import *\nfrom .resnet_embedding import *\n"
  },
  {
    "path": "ppdet/modeling/reid/fairmot_embedding_head.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport numpy as np\nimport math\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import KaimingUniform, Uniform\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling.heads.centernet_head import ConvLayer\n\n__all__ = ['FairMOTEmbeddingHead']\n\n\n@register\nclass FairMOTEmbeddingHead(nn.Layer):\n    __shared__ = ['num_classes']\n    \"\"\"\n    Args:\n        in_channels (int): the channel number of input to FairMOTEmbeddingHead.\n        ch_head (int): the channel of features before fed into embedding, 256 by default.\n        ch_emb (int): the channel of the embedding feature, 128 by default.\n        num_identities_dict (dict): the number of identities of each category,\n            support single class and multi-calss, {0: 14455} as default. \n    \"\"\"\n\n    def __init__(self,\n                 in_channels,\n                 ch_head=256,\n                 ch_emb=128,\n                 num_classes=1,\n                 num_identities_dict={0: 14455}):\n        super(FairMOTEmbeddingHead, self).__init__()\n        assert num_classes >= 1\n        self.num_classes = num_classes\n        self.ch_emb = ch_emb\n        self.num_identities_dict = num_identities_dict\n        self.reid = nn.Sequential(\n            ConvLayer(\n                in_channels, ch_head, kernel_size=3, padding=1, bias=True),\n            nn.ReLU(),\n            ConvLayer(\n                ch_head, ch_emb, kernel_size=1, stride=1, padding=0, bias=True))\n        param_attr = paddle.ParamAttr(initializer=KaimingUniform())\n        bound = 1 / math.sqrt(ch_emb)\n        bias_attr = paddle.ParamAttr(initializer=Uniform(-bound, bound))\n        self.reid_loss = nn.CrossEntropyLoss(ignore_index=-1, reduction='sum')\n\n        if num_classes == 1:\n            nID = self.num_identities_dict[0]  # single class\n            self.classifier = nn.Linear(\n                ch_emb, nID, weight_attr=param_attr, bias_attr=bias_attr)\n            # When num_identities(nID) is 1, emb_scale is set as 1\n            self.emb_scale = math.sqrt(2) * math.log(nID - 1) if nID > 1 else 1\n        else:\n            self.classifiers = dict()\n            self.emb_scale_dict = dict()\n            for cls_id, nID in self.num_identities_dict.items():\n                self.classifiers[str(cls_id)] = nn.Linear(\n                    ch_emb, nID, weight_attr=param_attr, bias_attr=bias_attr)\n                # When num_identities(nID) is 1, emb_scale is set as 1\n                self.emb_scale_dict[str(cls_id)] = math.sqrt(2) * math.log(\n                    nID - 1) if nID > 1 else 1\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        if isinstance(input_shape, (list, tuple)):\n            input_shape = input_shape[0]\n        return {'in_channels': input_shape.channels}\n\n    def 
process_by_class(self, bboxes, embedding, bbox_inds, topk_clses):\n        pred_dets, pred_embs = [], []\n        for cls_id in range(self.num_classes):\n            inds_masks = topk_clses == cls_id\n            inds_masks = paddle.cast(inds_masks, 'float32')\n\n            pos_num = inds_masks.sum().numpy()\n            if pos_num == 0:\n                continue\n\n            cls_inds_mask = inds_masks > 0\n\n            bbox_mask = paddle.nonzero(cls_inds_mask)\n            cls_bboxes = paddle.gather_nd(bboxes, bbox_mask)\n            pred_dets.append(cls_bboxes)\n\n            cls_inds = paddle.masked_select(bbox_inds, cls_inds_mask)\n            cls_inds = cls_inds.unsqueeze(-1)\n            cls_embedding = paddle.gather_nd(embedding, cls_inds)\n            pred_embs.append(cls_embedding)\n\n        return paddle.concat(pred_dets), paddle.concat(pred_embs)\n\n    def forward(self,\n                neck_feat,\n                inputs,\n                bboxes=None,\n                bbox_inds=None,\n                topk_clses=None):\n        reid_feat = self.reid(neck_feat)\n        if self.training:\n            if self.num_classes == 1:\n                loss = self.get_loss(reid_feat, inputs)\n            else:\n                loss = self.get_mc_loss(reid_feat, inputs)\n            return loss\n        else:\n            assert bboxes is not None and bbox_inds is not None\n            reid_feat = F.normalize(reid_feat)\n            embedding = paddle.transpose(reid_feat, [0, 2, 3, 1])\n            embedding = paddle.reshape(embedding, [-1, self.ch_emb])\n            # embedding shape: [bs * h * w, ch_emb]\n\n            if self.num_classes == 1:\n                pred_dets = bboxes\n                pred_embs = paddle.gather(embedding, bbox_inds)\n            else:\n                pred_dets, pred_embs = self.process_by_class(\n                    bboxes, embedding, bbox_inds, topk_clses)\n            return pred_dets, pred_embs\n\n    def get_loss(self, feat, inputs):\n        index = inputs['index']\n        mask = inputs['index_mask']\n        target = inputs['reid']\n        target = paddle.masked_select(target, mask > 0)\n        target = paddle.unsqueeze(target, 1)\n\n        feat = paddle.transpose(feat, perm=[0, 2, 3, 1])\n        feat_n, feat_h, feat_w, feat_c = feat.shape\n        feat = paddle.reshape(feat, shape=[feat_n, -1, feat_c])\n        index = paddle.unsqueeze(index, 2)\n        batch_inds = list()\n        for i in range(feat_n):\n            batch_ind = paddle.full(\n                shape=[1, index.shape[1], 1], fill_value=i, dtype='int64')\n            batch_inds.append(batch_ind)\n        batch_inds = paddle.concat(batch_inds, axis=0)\n        index = paddle.concat(x=[batch_inds, index], axis=2)\n        feat = paddle.gather_nd(feat, index=index)\n\n        mask = paddle.unsqueeze(mask, axis=2)\n        mask = paddle.expand_as(mask, feat)\n        mask.stop_gradient = True\n        feat = paddle.masked_select(feat, mask > 0)\n        feat = paddle.reshape(feat, shape=[-1, feat_c])\n        feat = F.normalize(feat)\n        feat = self.emb_scale * feat\n        logit = self.classifier(feat)\n        target.stop_gradient = True\n        loss = self.reid_loss(logit, target)\n        valid = (target != self.reid_loss.ignore_index)\n        valid.stop_gradient = True\n        count = paddle.sum((paddle.cast(valid, dtype=np.int32)))\n        count.stop_gradient = True\n        if count > 0:\n            loss = loss / count\n\n        return loss\n\n    def 
get_mc_loss(self, feat, inputs):\n        # feat.shape = [bs, ch_emb, h, w]\n        assert 'cls_id_map' in inputs and 'cls_tr_ids' in inputs\n        index = inputs['index']\n        mask = inputs['index_mask']\n        cls_id_map = inputs['cls_id_map']  # [bs, h, w]\n        cls_tr_ids = inputs['cls_tr_ids']  # [bs, num_classes, h, w]\n\n        feat = paddle.transpose(feat, perm=[0, 2, 3, 1])\n        feat_n, feat_h, feat_w, feat_c = feat.shape\n        feat = paddle.reshape(feat, shape=[feat_n, -1, feat_c])\n\n        index = paddle.unsqueeze(index, 2)\n        batch_inds = list()\n        for i in range(feat_n):\n            batch_ind = paddle.full(\n                shape=[1, index.shape[1], 1], fill_value=i, dtype='int64')\n            batch_inds.append(batch_ind)\n        batch_inds = paddle.concat(batch_inds, axis=0)\n        index = paddle.concat(x=[batch_inds, index], axis=2)\n        feat = paddle.gather_nd(feat, index=index)\n\n        mask = paddle.unsqueeze(mask, axis=2)\n        mask = paddle.expand_as(mask, feat)\n        mask.stop_gradient = True\n        feat = paddle.masked_select(feat, mask > 0)\n        feat = paddle.reshape(feat, shape=[-1, feat_c])\n\n        reid_losses = 0\n        for cls_id, id_num in self.num_identities_dict.items():\n            # target\n            cur_cls_tr_ids = paddle.reshape(\n                cls_tr_ids[:, cls_id, :, :], shape=[feat_n, -1])  # [bs, h*w]\n            cls_id_target = paddle.gather_nd(cur_cls_tr_ids, index=index)\n            mask = inputs['index_mask']\n            cls_id_target = paddle.masked_select(cls_id_target, mask > 0)\n            cls_id_target.stop_gradient = True\n\n            # feat\n            cls_id_feat = self.emb_scale_dict[str(cls_id)] * F.normalize(feat)\n            cls_id_pred = self.classifiers[str(cls_id)](cls_id_feat)\n\n            loss = self.reid_loss(cls_id_pred, cls_id_target)\n            valid = (cls_id_target != self.reid_loss.ignore_index)\n            valid.stop_gradient = True\n            count = paddle.sum(paddle.cast(valid, dtype=np.int32))\n            count.stop_gradient = True\n            if count > 0:\n                loss = loss / count\n            reid_losses += loss\n\n        return reid_losses\n
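\n\n# NOTE (added note, not part of the original module): with the default\n# num_identities_dict = {0: 14455}, the embedding scale evaluates to\n#   emb_scale = sqrt(2) * log(14455 - 1) ~= 1.4142 * 9.5787 ~= 13.55,\n# which rescales the L2-normalized embeddings before the identity classifier.\n"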
  },
  {
    "path": "ppdet/modeling/reid/jde_embedding_head.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport math\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\nfrom ppdet.core.workspace import register\nfrom paddle.nn.initializer import Normal, Constant\n\n__all__ = ['JDEEmbeddingHead']\n\n\nclass LossParam(nn.Layer):\n    def __init__(self, init_value=0., use_uncertainy=True):\n        super(LossParam, self).__init__()\n        self.loss_param = self.create_parameter(\n            shape=[1],\n            attr=ParamAttr(initializer=Constant(value=init_value)),\n            dtype=\"float32\")\n\n    def forward(self, inputs):\n        out = paddle.exp(-self.loss_param) * inputs + self.loss_param\n        return out * 0.5\n\n\n@register\nclass JDEEmbeddingHead(nn.Layer):\n    __shared__ = ['num_classes']\n    __inject__ = ['emb_loss', 'jde_loss']\n    \"\"\"\n    JDEEmbeddingHead\n    Args:\n        num_classes(int): Number of classes. Only support one class tracking.\n        num_identities(int): Number of identities.\n        anchor_levels(int): Number of anchor levels, same as FPN levels.\n        anchor_scales(int): Number of anchor scales on each FPN level.\n        embedding_dim(int): Embedding dimension. 
Default: 512.\n        emb_loss(object): Instance of 'JDEEmbeddingLoss'\n        jde_loss(object): Instance of 'JDELoss'\n    \"\"\"\n\n    def __init__(\n            self,\n            num_classes=1,\n            num_identities=14455,  # dataset.num_identities_dict[0]\n            anchor_levels=3,\n            anchor_scales=4,\n            embedding_dim=512,\n            emb_loss='JDEEmbeddingLoss',\n            jde_loss='JDELoss'):\n        super(JDEEmbeddingHead, self).__init__()\n        self.num_classes = num_classes\n        self.num_identities = num_identities\n        self.anchor_levels = anchor_levels\n        self.anchor_scales = anchor_scales\n        self.embedding_dim = embedding_dim\n        self.emb_loss = emb_loss\n        self.jde_loss = jde_loss\n\n        self.emb_scale = math.sqrt(2) * math.log(\n            self.num_identities - 1) if self.num_identities > 1 else 1\n\n        self.identify_outputs = []\n        self.loss_params_cls = []\n        self.loss_params_reg = []\n        self.loss_params_ide = []\n        for i in range(self.anchor_levels):\n            name = 'identify_output.{}'.format(i)\n            identify_output = self.add_sublayer(\n                name,\n                nn.Conv2D(\n                    in_channels=64 * (2**self.anchor_levels) // (2**i),\n                    out_channels=self.embedding_dim,\n                    kernel_size=3,\n                    stride=1,\n                    padding=1,\n                    bias_attr=ParamAttr(regularizer=L2Decay(0.))))\n            self.identify_outputs.append(identify_output)\n\n            loss_p_cls = self.add_sublayer('cls.{}'.format(i), LossParam(-4.15))\n            self.loss_params_cls.append(loss_p_cls)\n            loss_p_reg = self.add_sublayer('reg.{}'.format(i), LossParam(-4.85))\n            self.loss_params_reg.append(loss_p_reg)\n            loss_p_ide = self.add_sublayer('ide.{}'.format(i), LossParam(-2.3))\n            self.loss_params_ide.append(loss_p_ide)\n\n        self.classifier = self.add_sublayer(\n            'classifier',\n            nn.Linear(\n                self.embedding_dim,\n                self.num_identities,\n                weight_attr=ParamAttr(\n                    learning_rate=1., initializer=Normal(\n                        mean=0.0, std=0.01)),\n                bias_attr=ParamAttr(\n                    learning_rate=2., regularizer=L2Decay(0.))))\n\n    def forward(self,\n                identify_feats,\n                targets,\n                loss_confs=None,\n                loss_boxes=None,\n                bboxes=None,\n                boxes_idx=None,\n                nms_keep_idx=None):\n        assert self.num_classes == 1, 'JDE only supports single-class MOT.'\n        assert len(identify_feats) == self.anchor_levels\n        ide_outs = []\n        for feat, ide_head in zip(identify_feats, self.identify_outputs):\n            ide_outs.append(ide_head(feat))\n\n        if self.training:\n            assert len(loss_confs) == len(loss_boxes) == self.anchor_levels\n            loss_ides = self.emb_loss(ide_outs, targets, self.emb_scale,\n                                      self.classifier)\n            jde_losses = self.jde_loss(\n                loss_confs, loss_boxes, loss_ides, self.loss_params_cls,\n                self.loss_params_reg, self.loss_params_ide, targets)\n            return jde_losses\n        else:\n            assert bboxes is not None\n            assert boxes_idx is not None\n            assert nms_keep_idx is not None\n\n        
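    # inference path: flatten per-level embeddings, keep the ones that\n            # survive NMS, and map the boxes back to the original image scale\n        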
    emb_outs = self.get_emb_outs(ide_outs)\n            emb_valid = paddle.gather_nd(emb_outs, boxes_idx)\n            pred_embs = paddle.gather_nd(emb_valid, nms_keep_idx)\n\n            input_shape = targets['image'].shape[2:]\n            # input_shape: [h, w], before data transforms, set in model config\n            im_shape = targets['im_shape'][0].numpy()\n            # im_shape: [new_h, new_w], after data transforms\n            scale_factor = targets['scale_factor'][0].numpy()\n            bboxes[:, 2:] = self.scale_coords(bboxes[:, 2:], input_shape,\n                                              im_shape, scale_factor)\n            # cls_ids, scores, tlwhs \n            pred_dets = bboxes\n            return pred_dets, pred_embs\n\n    def scale_coords(self, coords, input_shape, im_shape, scale_factor):\n        ratio = scale_factor[0]\n        pad_w = (input_shape[1] - int(im_shape[1])) / 2\n        pad_h = (input_shape[0] - int(im_shape[0])) / 2\n        coords = paddle.cast(coords, 'float32')\n        coords[:, 0::2] -= pad_w\n        coords[:, 1::2] -= pad_h\n        coords[:, 0:4] /= ratio\n        coords[:, :4] = paddle.clip(\n            coords[:, :4], min=0, max=coords[:, :4].max())\n        return coords.round()\n\n    def get_emb_and_gt_outs(self, ide_outs, targets):\n        emb_and_gts = []\n        for i, p_ide in enumerate(ide_outs):\n            t_conf = targets['tconf{}'.format(i)]\n            t_ide = targets['tide{}'.format(i)]\n\n            p_ide = p_ide.transpose((0, 2, 3, 1))\n            p_ide_flatten = paddle.reshape(p_ide, [-1, self.embedding_dim])\n\n            mask = t_conf > 0\n            mask = paddle.cast(mask, dtype=\"int64\")\n            emb_mask = mask.max(1).flatten()\n            emb_mask_inds = paddle.nonzero(emb_mask > 0).flatten()\n            if len(emb_mask_inds) > 0:\n                t_ide_flatten = paddle.reshape(t_ide.max(1), [-1, 1])\n                tids = paddle.gather(t_ide_flatten, emb_mask_inds)\n\n                embedding = paddle.gather(p_ide_flatten, emb_mask_inds)\n                embedding = self.emb_scale * F.normalize(embedding)\n                emb_and_gt = paddle.concat([embedding, tids], axis=1)\n                emb_and_gts.append(emb_and_gt)\n\n        if len(emb_and_gts) > 0:\n            return paddle.concat(emb_and_gts, axis=0)\n        else:\n            return paddle.zeros((1, self.embedding_dim + 1))\n\n    def get_emb_outs(self, ide_outs):\n        emb_outs = []\n        for i, p_ide in enumerate(ide_outs):\n            p_ide = p_ide.transpose((0, 2, 3, 1))\n\n            p_ide_repeat = paddle.tile(p_ide, [self.anchor_scales, 1, 1, 1])\n            embedding = F.normalize(p_ide_repeat, axis=-1)\n            emb = paddle.reshape(embedding, [-1, self.embedding_dim])\n            emb_outs.append(emb)\n\n        if len(emb_outs) > 0:\n            return paddle.concat(emb_outs, axis=0)\n        else:\n            return paddle.zeros((1, self.embedding_dim))\n"
  },
  {
    "path": "ppdet/modeling/reid/pplcnet_embedding.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import Normal, Constant\nfrom paddle import ParamAttr\nfrom paddle.nn import AdaptiveAvgPool2D, BatchNorm2D, Conv2D, Linear\nfrom paddle.regularizer import L2Decay\nfrom paddle.nn.initializer import KaimingNormal, XavierNormal\nfrom ppdet.core.workspace import register\n\n__all__ = ['PPLCNetEmbedding']\n\n\n# Each element(list) represents a depthwise block, which is composed of k, in_c, out_c, s, use_se.\n# k: kernel_size\n# in_c: input channel number in depthwise block\n# out_c: output channel number in depthwise block\n# s: stride in depthwise block\n# use_se: whether to use SE block\n\nNET_CONFIG = {\n    \"blocks2\":\n    #k, in_c, out_c, s, use_se\n    [[3, 16, 32, 1, False]],\n    \"blocks3\": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]],\n    \"blocks4\": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]],\n    \"blocks5\": [[3, 128, 256, 2, False], [5, 256, 256, 1, False],\n                [5, 256, 256, 1, False], [5, 256, 256, 1, False],\n                [5, 256, 256, 1, False], [5, 256, 256, 1, False]],\n    \"blocks6\": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]]\n}\n\n\ndef make_divisible(v, divisor=8, min_value=None):\n    if min_value is None:\n        min_value = divisor\n    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)\n    if new_v < 0.9 * v:\n        new_v += divisor\n    return new_v\n\n\nclass ConvBNLayer(nn.Layer):\n    def __init__(self,\n                 num_channels,\n                 filter_size,\n                 num_filters,\n                 stride,\n                 num_groups=1):\n        super().__init__()\n\n        self.conv = Conv2D(\n            in_channels=num_channels,\n            out_channels=num_filters,\n            kernel_size=filter_size,\n            stride=stride,\n            padding=(filter_size - 1) // 2,\n            groups=num_groups,\n            weight_attr=ParamAttr(initializer=KaimingNormal()),\n            bias_attr=False)\n\n        self.bn = BatchNorm2D(\n            num_filters,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n        self.hardswish = nn.Hardswish()\n\n    def forward(self, x):\n        x = self.conv(x)\n        x = self.bn(x)\n        x = self.hardswish(x)\n        return x\n\n\nclass DepthwiseSeparable(nn.Layer):\n    def __init__(self,\n                 num_channels,\n                 num_filters,\n                 stride,\n                 dw_size=3,\n                 use_se=False):\n        super().__init__()\n        self.use_se = use_se\n        self.dw_conv = ConvBNLayer(\n            num_channels=num_channels,\n            
num_filters=num_channels,\n            filter_size=dw_size,\n            stride=stride,\n            num_groups=num_channels)\n        if use_se:\n            self.se = SEModule(num_channels)\n        self.pw_conv = ConvBNLayer(\n            num_channels=num_channels,\n            filter_size=1,\n            num_filters=num_filters,\n            stride=1)\n\n    def forward(self, x):\n        x = self.dw_conv(x)\n        if self.use_se:\n            x = self.se(x)\n        x = self.pw_conv(x)\n        return x\n\n\nclass SEModule(nn.Layer):\n    def __init__(self, channel, reduction=4):\n        super().__init__()\n        self.avg_pool = AdaptiveAvgPool2D(1)\n        self.conv1 = Conv2D(\n            in_channels=channel,\n            out_channels=channel // reduction,\n            kernel_size=1,\n            stride=1,\n            padding=0)\n        self.relu = nn.ReLU()\n        self.conv2 = Conv2D(\n            in_channels=channel // reduction,\n            out_channels=channel,\n            kernel_size=1,\n            stride=1,\n            padding=0)\n        self.hardsigmoid = nn.Hardsigmoid()\n\n    def forward(self, x):\n        identity = x\n        x = self.avg_pool(x)\n        x = self.conv1(x)\n        x = self.relu(x)\n        x = self.conv2(x)\n        x = self.hardsigmoid(x)\n        x = paddle.multiply(x=identity, y=x)\n        return x\n\n\nclass PPLCNet(nn.Layer):\n    \"\"\"\n    PP-LCNet, see https://arxiv.org/abs/2109.15099.\n    This code is different from PPLCNet in ppdet/modeling/backbones/lcnet.py\n    or in PaddleClas, because the output is the flatten feature of last_conv.\n\n    Args:\n        scale (float): Scale ratio of channels.\n        class_expand (int): Number of channels of conv feature.\n    \"\"\"\n\n    def __init__(self, scale=1.0, class_expand=1280):\n        super(PPLCNet, self).__init__()\n        self.scale = scale\n        self.class_expand = class_expand\n\n        self.conv1 = ConvBNLayer(\n            num_channels=3,\n            filter_size=3,\n            num_filters=make_divisible(16 * scale),\n            stride=2)\n\n        self.blocks2 = nn.Sequential(*[\n            DepthwiseSeparable(\n                num_channels=make_divisible(in_c * scale),\n                num_filters=make_divisible(out_c * scale),\n                dw_size=k,\n                stride=s,\n                use_se=se)\n            for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG[\"blocks2\"])\n        ])\n\n        self.blocks3 = nn.Sequential(*[\n            DepthwiseSeparable(\n                num_channels=make_divisible(in_c * scale),\n                num_filters=make_divisible(out_c * scale),\n                dw_size=k,\n                stride=s,\n                use_se=se)\n            for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG[\"blocks3\"])\n        ])\n\n        self.blocks4 = nn.Sequential(*[\n            DepthwiseSeparable(\n                num_channels=make_divisible(in_c * scale),\n                num_filters=make_divisible(out_c * scale),\n                dw_size=k,\n                stride=s,\n                use_se=se)\n            for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG[\"blocks4\"])\n        ])\n\n        self.blocks5 = nn.Sequential(*[\n            DepthwiseSeparable(\n                num_channels=make_divisible(in_c * scale),\n                num_filters=make_divisible(out_c * scale),\n                dw_size=k,\n                stride=s,\n                use_se=se)\n            for i, (k, in_c, out_c, s, se) in 
enumerate(NET_CONFIG[\"blocks5\"])\n        ])\n\n        self.blocks6 = nn.Sequential(*[\n            DepthwiseSeparable(\n                num_channels=make_divisible(in_c * scale),\n                num_filters=make_divisible(out_c * scale),\n                dw_size=k,\n                stride=s,\n                use_se=se)\n            for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG[\"blocks6\"])\n        ])\n\n        self.avg_pool = AdaptiveAvgPool2D(1)\n        self.last_conv = Conv2D(\n            in_channels=make_divisible(NET_CONFIG[\"blocks6\"][-1][2] * scale),\n            out_channels=self.class_expand,\n            kernel_size=1,\n            stride=1,\n            padding=0,\n            bias_attr=False)\n        self.hardswish = nn.Hardswish()\n        self.flatten = nn.Flatten(start_axis=1, stop_axis=-1)\n\n    def forward(self, x):\n        x = self.conv1(x)\n\n        x = self.blocks2(x)\n        x = self.blocks3(x)\n        x = self.blocks4(x)\n        x = self.blocks5(x)\n        x = self.blocks6(x)\n\n        x = self.avg_pool(x)\n        x = self.last_conv(x)\n        x = self.hardswish(x)\n        x = self.flatten(x)\n        return x\n\n\nclass FC(nn.Layer):\n    def __init__(self, input_ch, output_ch):\n        super(FC, self).__init__()\n        weight_attr = ParamAttr(initializer=XavierNormal())\n        self.fc = paddle.nn.Linear(input_ch, output_ch, weight_attr=weight_attr)\n\n    def forward(self, x):\n        out = self.fc(x)\n        return out\n\n\n@register\nclass PPLCNetEmbedding(nn.Layer):\n    \"\"\"\n    PPLCNet Embedding\n\n    Args:\n        scale (float): Scale ratio of channels of the PPLCNet backbone, 2.5 by default.\n        input_ch (int): Number of channels of the input feature (the flattened\n            last_conv feature of PPLCNet).\n        output_ch (int): Number of channels of the output embedding.\n    \"\"\"\n\n    def __init__(self, scale=2.5, input_ch=1280, output_ch=512):\n        super(PPLCNetEmbedding, self).__init__()\n        self.backbone = PPLCNet(scale=scale)\n        self.neck = FC(input_ch, output_ch)\n\n    def forward(self, x):\n        feat = self.backbone(x)\n        feat_out = self.neck(feat)\n        return feat_out\n
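\n\n# NOTE: a minimal, hypothetical usage sketch (added note, not part of the\n# original module):\n#\n#   model = PPLCNetEmbedding(scale=2.5, input_ch=1280, output_ch=512)\n#   x = paddle.rand([8, 3, 192, 64])  # assumed NCHW person crops\n#   feat = model(x)                   # shape: [8, 512]\n#\n# With scale=2.5 the last_conv feature has make_divisible(512 * 2.5) = 1280\n# channels, which matches the default input_ch of the FC neck.\n"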
  },
  {
    "path": "ppdet/modeling/reid/pyramidal_embedding.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import Normal, Constant\nfrom paddle import ParamAttr\nfrom .resnet import ResNet50, ResNet101\nfrom ppdet.core.workspace import register\n\n__all__ = ['PCBPyramid']\n\n\n@register\nclass PCBPyramid(nn.Layer):\n    \"\"\"\n    PCB (Part-based Convolutional Baseline), see https://arxiv.org/abs/1711.09349,\n    Pyramidal Person Re-IDentification, see https://arxiv.org/abs/1810.12193\n\n    Args:\n        input_ch (int): Number of channels of the input feature.\n        num_stripes (int): Number of sub-parts.\n        used_levels (tuple): Whether the level is used, 1 means used.\n        num_classes (int): Number of classes for identities, default 751 in\n            Market-1501 dataset.\n        last_conv_stride (int): Stride of the last conv.\n        last_conv_dilation (int): Dilation of the last conv.\n        num_conv_out_channels (int): Number of channels of conv feature.\n    \"\"\"\n\n    def __init__(self,\n                 input_ch=2048,\n                 model_name='ResNet101',\n                 num_stripes=6,\n                 used_levels=(1, 1, 1, 1, 1, 1),\n                 num_classes=751,\n                 last_conv_stride=1,\n                 last_conv_dilation=1,\n                 num_conv_out_channels=128):\n        super(PCBPyramid, self).__init__()\n        self.num_stripes = num_stripes\n        self.used_levels = used_levels\n        self.num_classes = num_classes\n\n        self.num_in_each_level = [i for i in range(self.num_stripes, 0, -1)]\n        self.num_branches = sum(self.num_in_each_level)\n\n        assert model_name in ['ResNet50', 'ResNet101'], \"Unsupported ReID arch: {}\".format(model_name)\n        self.base = eval(model_name)(\n            lr_mult=0.1,\n            last_conv_stride=last_conv_stride,\n            last_conv_dilation=last_conv_dilation)\n        self.dropout_layer = nn.Dropout(p=0.2)\n        self.pyramid_conv_list0, self.pyramid_fc_list0 = self.basic_branch(\n            num_conv_out_channels, input_ch)\n\n    def basic_branch(self, num_conv_out_channels, input_ch):\n        # the level indexes are defined from fine to coarse,\n        # the branch will contain one more part than that of its previous level\n        # the sliding step is set to 1\n        pyramid_conv_list = nn.LayerList()\n        pyramid_fc_list = nn.LayerList()\n\n        idx_levels = 0\n        for idx_branches in range(self.num_branches):\n            if idx_branches >= sum(self.num_in_each_level[0:idx_levels + 1]):\n                idx_levels += 1\n\n            pyramid_conv_list.append(\n                nn.Sequential(\n                    nn.Conv2D(input_ch, num_conv_out_channels, 1),\n         
           nn.BatchNorm2D(num_conv_out_channels), nn.ReLU()))\n\n        idx_levels = 0\n        for idx_branches in range(self.num_branches):\n            if idx_branches >= sum(self.num_in_each_level[0:idx_levels + 1]):\n                idx_levels += 1\n\n            fc = nn.Linear(\n                in_features=num_conv_out_channels,\n                out_features=self.num_classes,\n                weight_attr=ParamAttr(initializer=Normal(\n                    mean=0., std=0.001)),\n                bias_attr=ParamAttr(initializer=Constant(value=0.)))\n            pyramid_fc_list.append(fc)\n        return pyramid_conv_list, pyramid_fc_list\n\n    def pyramid_forward(self, feat):\n        each_stripe_size = feat.shape[2] // self.num_stripes\n\n        feat_list, logits_list = [], []\n        idx_levels = 0\n        used_branches = 0\n        for idx_branches in range(self.num_branches):\n            if idx_branches >= sum(self.num_in_each_level[0:idx_levels + 1]):\n                idx_levels += 1\n            idx_in_each_level = idx_branches - sum(self.num_in_each_level[\n                0:idx_levels])\n            stripe_size_in_each_level = each_stripe_size * (idx_levels + 1)\n            start = idx_in_each_level * each_stripe_size\n            end = start + stripe_size_in_each_level\n\n            k = feat.shape[-1]\n            local_feat_avgpool = F.avg_pool2d(\n                feat[:, :, start:end, :],\n                kernel_size=(stripe_size_in_each_level, k))\n            local_feat_maxpool = F.max_pool2d(\n                feat[:, :, start:end, :],\n                kernel_size=(stripe_size_in_each_level, k))\n            local_feat = local_feat_avgpool + local_feat_maxpool\n\n            local_feat = self.pyramid_conv_list0[used_branches](local_feat)\n            local_feat = paddle.reshape(\n                local_feat, shape=[local_feat.shape[0], -1])\n            feat_list.append(local_feat)\n\n            local_logits = self.pyramid_fc_list0[used_branches](\n                self.dropout_layer(local_feat))\n            logits_list.append(local_logits)\n\n            used_branches += 1\n\n        return feat_list, logits_list\n\n    def forward(self, x):\n        feat = self.base(x)\n        assert feat.shape[2] % self.num_stripes == 0\n        feat_list, logits_list = self.pyramid_forward(feat)\n        feat_out = paddle.concat(feat_list, axis=-1)\n        return feat_out\n
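\n\n# NOTE (added note, not part of the original module): with the default\n# num_stripes=6 and every level used, the number of branches is\n#   6 + 5 + 4 + 3 + 2 + 1 = 21,\n# so with num_conv_out_channels=128 the concatenated output embedding\n# has 21 * 128 = 2688 dimensions.\n"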
  },
  {
    "path": "ppdet/modeling/reid/resnet.py",
    "content": "# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#    http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport math\nimport paddle\nfrom paddle import ParamAttr\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import Normal\n\n__all__ = [\"ResNet18\", \"ResNet34\", \"ResNet50\", \"ResNet101\", \"ResNet152\"]\n\n\nclass ConvBNLayer(nn.Layer):\n    def __init__(self,\n                 num_channels,\n                 num_filters,\n                 filter_size,\n                 stride=1,\n                 dilation=1,\n                 groups=1,\n                 act=None,\n                 lr_mult=1.0,\n                 name=None,\n                 data_format=\"NCHW\"):\n        super(ConvBNLayer, self).__init__()\n        conv_stdv = filter_size * filter_size * num_filters\n        self._conv = nn.Conv2D(\n            in_channels=num_channels,\n            out_channels=num_filters,\n            kernel_size=filter_size,\n            stride=stride,\n            padding=(filter_size - 1) // 2,\n            dilation=dilation,\n            groups=groups,\n            weight_attr=ParamAttr(\n                learning_rate=lr_mult,\n                initializer=Normal(0, math.sqrt(2. 
/ conv_stdv))),\n            bias_attr=False,\n            data_format=data_format)\n\n        self._batch_norm = nn.BatchNorm2D(num_filters)\n        self.act = act\n\n    def forward(self, inputs):\n        y = self._conv(inputs)\n        y = self._batch_norm(y)\n        if self.act:\n            y = getattr(F, self.act)(y)\n        return y\n\n\nclass BottleneckBlock(nn.Layer):\n    def __init__(self,\n                 num_channels,\n                 num_filters,\n                 stride,\n                 shortcut=True,\n                 name=None,\n                 lr_mult=1.0,\n                 dilation=1,\n                 data_format=\"NCHW\"):\n        super(BottleneckBlock, self).__init__()\n        self.conv0 = ConvBNLayer(\n            num_channels=num_channels,\n            num_filters=num_filters,\n            filter_size=1,\n            dilation=dilation,\n            act=\"relu\",\n            lr_mult=lr_mult,\n            name=name + \"_branch2a\",\n            data_format=data_format)\n        self.conv1 = ConvBNLayer(\n            num_channels=num_filters,\n            num_filters=num_filters,\n            filter_size=3,\n            dilation=dilation,\n            stride=stride,\n            act=\"relu\",\n            lr_mult=lr_mult,\n            name=name + \"_branch2b\",\n            data_format=data_format)\n        self.conv2 = ConvBNLayer(\n            num_channels=num_filters,\n            num_filters=num_filters * 4,\n            filter_size=1,\n            dilation=dilation,\n            act=None,\n            lr_mult=lr_mult,\n            name=name + \"_branch2c\",\n            data_format=data_format)\n        if not shortcut:\n            self.short = ConvBNLayer(\n                num_channels=num_channels,\n                num_filters=num_filters * 4,\n                filter_size=1,\n                dilation=dilation,\n                stride=stride,\n                lr_mult=lr_mult,\n                name=name + \"_branch1\",\n                data_format=data_format)\n        self.shortcut = shortcut\n        self._num_channels_out = num_filters * 4\n\n    def forward(self, inputs):\n        y = self.conv0(inputs)\n        conv1 = self.conv1(y)\n        conv2 = self.conv2(conv1)\n        if self.shortcut:\n            short = inputs\n        else:\n            short = self.short(inputs)\n        y = paddle.add(x=short, y=conv2)\n        y = F.relu(y)\n        return y\n\n\nclass BasicBlock(nn.Layer):\n    def __init__(self,\n                 num_channels,\n                 num_filters,\n                 stride,\n                 shortcut=True,\n                 name=None,\n                 data_format=\"NCHW\"):\n        super(BasicBlock, self).__init__()\n        self.stride = stride\n        self.conv0 = ConvBNLayer(\n            num_channels=num_channels,\n            num_filters=num_filters,\n            filter_size=3,\n            stride=stride,\n            act=\"relu\",\n            name=name + \"_branch2a\",\n            data_format=data_format)\n        self.conv1 = ConvBNLayer(\n            num_channels=num_filters,\n            num_filters=num_filters,\n            filter_size=3,\n            act=None,\n            name=name + \"_branch2b\",\n            data_format=data_format)\n        if not shortcut:\n            self.short = ConvBNLayer(\n                num_channels=num_channels,\n                num_filters=num_filters,\n                filter_size=1,\n                stride=stride,\n                name=name + \"_branch1\",\n               
 data_format=data_format)\n        self.shortcut = shortcut\n\n    def forward(self, inputs):\n        y = self.conv0(inputs)\n        conv1 = self.conv1(y)\n        if self.shortcut:\n            short = inputs\n        else:\n            short = self.short(inputs)\n        y = paddle.add(x=short, y=conv1)\n        y = F.relu(y)\n        return y\n\n\nclass ResNet(nn.Layer):\n    def __init__(self,\n                 layers=50,\n                 lr_mult=1.0,\n                 last_conv_stride=2,\n                 last_conv_dilation=1):\n        super(ResNet, self).__init__()\n        self.layers = layers\n        self.data_format = \"NCHW\"\n        self.input_image_channel = 3\n        supported_layers = [18, 34, 50, 101, 152]\n        assert layers in supported_layers, \\\n            \"supported layers are {} but input layer is {}\".format(\n                supported_layers, layers)\n        if layers == 18:\n            depth = [2, 2, 2, 2]\n        elif layers == 34 or layers == 50:\n            depth = [3, 4, 6, 3]\n        elif layers == 101:\n            depth = [3, 4, 23, 3]\n        elif layers == 152:\n            depth = [3, 8, 36, 3]\n        num_channels = [64, 256, 512,\n                        1024] if layers >= 50 else [64, 64, 128, 256]\n        num_filters = [64, 128, 256, 512]\n        self.conv = ConvBNLayer(\n            num_channels=self.input_image_channel,\n            num_filters=64,\n            filter_size=7,\n            stride=2,\n            act=\"relu\",\n            lr_mult=lr_mult,\n            name=\"conv1\",\n            data_format=self.data_format)\n        self.pool2d_max = nn.MaxPool2D(\n            kernel_size=3, stride=2, padding=1, data_format=self.data_format)\n        self.block_list = []\n        if layers >= 50:\n            for block in range(len(depth)):\n                shortcut = False\n                for i in range(depth[block]):\n                    if layers in [101, 152] and block == 2:\n                        if i == 0:\n                            conv_name = \"res\" + str(block + 2) + \"a\"\n                        else:\n                            conv_name = \"res\" + str(block + 2) + \"b\" + str(i)\n                    else:\n                        conv_name = \"res\" + str(block + 2) + chr(97 + i)\n                    if i != 0 or block == 0:\n                        stride = 1\n                    elif block == len(depth) - 1:\n                        stride = last_conv_stride\n                    else:\n                        stride = 2\n                    bottleneck_block = self.add_sublayer(\n                        conv_name,\n                        BottleneckBlock(\n                            num_channels=num_channels[block]\n                            if i == 0 else num_filters[block] * 4,\n                            num_filters=num_filters[block],\n                            stride=stride,\n                            shortcut=shortcut,\n                            name=conv_name,\n                            lr_mult=lr_mult,\n                            dilation=last_conv_dilation\n                            if block == len(depth) - 1 else 1,\n                            data_format=self.data_format))\n                    self.block_list.append(bottleneck_block)\n                    shortcut = True\n        else:\n            for block in range(len(depth)):\n                shortcut = False\n                for i in range(depth[block]):\n                    conv_name = \"res\" + str(block + 2) + chr(97 + 
i)\n                    basic_block = self.add_sublayer(\n                        conv_name,\n                        BasicBlock(\n                            num_channels=num_channels[block]\n                            if i == 0 else num_filters[block],\n                            num_filters=num_filters[block],\n                            stride=2 if i == 0 and block != 0 else 1,\n                            shortcut=shortcut,\n                            name=conv_name,\n                            data_format=self.data_format))\n                    self.block_list.append(basic_block)\n                    shortcut = True\n\n    def forward(self, inputs):\n        y = self.conv(inputs)\n        y = self.pool2d_max(y)\n        for block in self.block_list:\n            y = block(y)\n        return y\n\n\ndef ResNet18(**args):\n    model = ResNet(layers=18, **args)\n    return model\n\n\ndef ResNet34(**args):\n    model = ResNet(layers=34, **args)\n    return model\n\n\ndef ResNet50(pretrained=None, **args):\n    model = ResNet(layers=50, **args)\n    if pretrained is not None:\n        if not (os.path.isdir(pretrained) or\n                os.path.exists(pretrained + '.pdparams')):\n            raise ValueError(\"Pretrained model path {} does not \"\n                             \"exist.\".format(pretrained))\n        param_state_dict = paddle.load(pretrained + '.pdparams')\n        model.set_dict(param_state_dict)\n    return model\n\n\ndef ResNet101(pretrained=None, **args):\n    model = ResNet(layers=101, **args)\n    if pretrained is not None:\n        if not (os.path.isdir(pretrained) or\n                os.path.exists(pretrained + '.pdparams')):\n            raise ValueError(\"Pretrained model path {} does not \"\n                             \"exist.\".format(pretrained))\n        param_state_dict = paddle.load(pretrained + '.pdparams')\n        model.set_dict(param_state_dict)\n    return model\n\n\ndef ResNet152(**args):\n    model = ResNet(layers=152, **args)\n    return model\n
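\n\n# NOTE: a minimal, hypothetical usage sketch (added note, not part of the\n# original module):\n#\n#   net = ResNet50(last_conv_stride=1)  # ReID models usually keep stride 1\n#   x = paddle.rand([2, 3, 256, 128])   # in the last stage\n#   feat = net(x)                       # [2, 2048, 16, 8]\n#\n# With last_conv_stride=1 the total downsampling is 16x instead of 32x.\n"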
  },
  {
    "path": "ppdet/modeling/reid/resnet_embedding.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nimport os\nimport paddle\nimport paddle.nn.functional as F\nfrom paddle import nn\nfrom .resnet import ResNet50, ResNet101\nfrom ppdet.core.workspace import register\n\n__all__ = ['ResNetEmbedding']\n\n\n@register\nclass ResNetEmbedding(nn.Layer):\n    in_planes = 2048\n    def __init__(self, model_name='ResNet50', last_stride=1):\n        super(ResNetEmbedding, self).__init__()\n        assert model_name in ['ResNet50', 'ResNet101'], \"Unsupported ReID arch: {}\".format(model_name)\n        self.base = eval(model_name)(last_conv_stride=last_stride)\n        self.gap = nn.AdaptiveAvgPool2D(output_size=1)\n        self.flatten = nn.Flatten(start_axis=1, stop_axis=-1)\n        self.bn = nn.BatchNorm1D(self.in_planes, bias_attr=False)\n\n    def forward(self, x):\n        base_out = self.base(x)\n        global_feat = self.gap(base_out)\n        global_feat = self.flatten(global_feat)\n        global_feat = self.bn(global_feat)\n        return global_feat\n"
  },
  {
    "path": "ppdet/modeling/shape_spec.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\n# The code is based on:\n# https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/shape_spec.py\n\nfrom collections import namedtuple\n\n\nclass ShapeSpec(\n        namedtuple(\"_ShapeSpec\", [\"channels\", \"height\", \"width\", \"stride\"])):\n    def __new__(cls, channels=None, height=None, width=None, stride=None):\n        return super(ShapeSpec, cls).__new__(cls, channels, height, width,\n                                             stride)\n"
  },
  {
    "path": "ppdet/modeling/ssod/__init__.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. \n#   \n# Licensed under the Apache License, Version 2.0 (the \"License\");   \n# you may not use this file except in compliance with the License.  \n# You may obtain a copy of the License at   \n#   \n#     http://www.apache.org/licenses/LICENSE-2.0    \n#   \n# Unless required by applicable law or agreed to in writing, software   \n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  \n# See the License for the specific language governing permissions and   \n# limitations under the License.\n\nfrom . import utils\nfrom . import losses\n\nfrom .utils import *\nfrom .losses import *\n"
  },
  {
    "path": "ppdet/modeling/ssod/losses.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\n\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling.losses.iou_loss import GIoULoss\nfrom .utils import QFLv2\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n__all__ = [\n    'SSODFCOSLoss',\n    'SSODPPYOLOELoss',\n]\n\n\n@register\nclass SSODFCOSLoss(nn.Layer):\n    def __init__(self, loss_weight=1.0):\n        super(SSODFCOSLoss, self).__init__()\n        self.loss_weight = loss_weight\n\n    def forward(self, student_head_outs, teacher_head_outs, train_cfg):\n        # for semi-det distill\n        student_logits, student_deltas, student_quality = student_head_outs\n        teacher_logits, teacher_deltas, teacher_quality = teacher_head_outs\n        nc = student_logits[0].shape[1]\n\n        student_logits = paddle.concat(\n            [\n                _.transpose([0, 2, 3, 1]).reshape([-1, nc])\n                for _ in student_logits\n            ],\n            axis=0)\n        teacher_logits = paddle.concat(\n            [\n                _.transpose([0, 2, 3, 1]).reshape([-1, nc])\n                for _ in teacher_logits\n            ],\n            axis=0)\n\n        student_deltas = paddle.concat(\n            [\n                _.transpose([0, 2, 3, 1]).reshape([-1, 4])\n                for _ in student_deltas\n            ],\n            axis=0)\n        teacher_deltas = paddle.concat(\n            [\n                _.transpose([0, 2, 3, 1]).reshape([-1, 4])\n                for _ in teacher_deltas\n            ],\n            axis=0)\n\n        student_quality = paddle.concat(\n            [\n                _.transpose([0, 2, 3, 1]).reshape([-1, 1])\n                for _ in student_quality\n            ],\n            axis=0)\n        teacher_quality = paddle.concat(\n            [\n                _.transpose([0, 2, 3, 1]).reshape([-1, 1])\n                for _ in teacher_quality\n            ],\n            axis=0)\n\n        ratio = train_cfg.get('ratio', 0.01)\n        with paddle.no_grad():\n            # Region Selection\n            count_num = int(teacher_logits.shape[0] * ratio)\n            teacher_probs = F.sigmoid(teacher_logits)\n            max_vals = paddle.max(teacher_probs, 1)\n            sorted_vals, sorted_inds = paddle.topk(max_vals,\n                                                   teacher_logits.shape[0])\n            mask = paddle.zeros_like(max_vals)\n            mask[sorted_inds[:count_num]] = 1.\n            fg_num = sorted_vals[:count_num].sum()\n            b_mask = mask > 0\n\n        # distill_loss_cls\n        loss_logits = QFLv2(\n            F.sigmoid(student_logits),\n            teacher_probs,\n            weight=mask,\n            reduction=\"sum\") / fg_num\n\n       
 # distill_loss_box\n        inputs = paddle.concat(\n            (-student_deltas[b_mask][..., :2], student_deltas[b_mask][..., 2:]),\n            axis=-1)\n        targets = paddle.concat(\n            (-teacher_deltas[b_mask][..., :2], teacher_deltas[b_mask][..., 2:]),\n            axis=-1)\n        iou_loss = GIoULoss(reduction='mean')\n        loss_deltas = iou_loss(inputs, targets)\n\n        # distill_loss_quality\n        loss_quality = F.binary_cross_entropy(\n            F.sigmoid(student_quality[b_mask]),\n            F.sigmoid(teacher_quality[b_mask]),\n            reduction='mean')\n\n        return {\n            \"distill_loss_cls\": loss_logits,\n            \"distill_loss_box\": loss_deltas,\n            \"distill_loss_quality\": loss_quality,\n            \"fg_sum\": fg_num,\n        }\n\n\n@register\nclass SSODPPYOLOELoss(nn.Layer):\n    def __init__(self, loss_weight=1.0):\n        super(SSODPPYOLOELoss, self).__init__()\n        self.loss_weight = loss_weight\n\n    def forward(self, student_head_outs, teacher_head_outs, train_cfg):\n        # for semi-det distill\n        # student_probs: already sigmoid\n        student_probs, student_deltas, student_dfl = student_head_outs\n        teacher_probs, teacher_deltas, teacher_dfl = teacher_head_outs\n        bs, l, nc = student_probs.shape[:]  # bs, l, num_classes\n        bs, l, _, reg_ch = student_dfl.shape[:]  # bs, l, 4, reg_ch\n        student_probs = student_probs.reshape([-1, nc])\n        teacher_probs = teacher_probs.reshape([-1, nc])\n        student_deltas = student_deltas.reshape([-1, 4])\n        teacher_deltas = teacher_deltas.reshape([-1, 4])\n        student_dfl = student_dfl.reshape([-1, 4, reg_ch])\n        teacher_dfl = teacher_dfl.reshape([-1, 4, reg_ch])\n\n        ratio = train_cfg.get('ratio', 0.01)\n\n        # for contrast loss\n        curr_iter = train_cfg['curr_iter']\n        st_iter = train_cfg['st_iter']\n        if curr_iter == st_iter + 1:\n            # start semi-det training\n            self.queue_ptr = 0\n            self.queue_size = int(bs * l * ratio)\n            self.queue_feats = paddle.zeros([self.queue_size, nc])\n            self.queue_probs = paddle.zeros([self.queue_size, nc])\n        contrast_loss_cfg = train_cfg['contrast_loss']\n        temperature = contrast_loss_cfg.get('temperature', 0.2)\n        alpha = contrast_loss_cfg.get('alpha', 0.9)\n        smooth_iter = contrast_loss_cfg.get('smooth_iter', 100) + st_iter\n\n        with paddle.no_grad():\n            # Region Selection\n            count_num = int(teacher_probs.shape[0] * ratio)\n            max_vals = paddle.max(teacher_probs, 1)\n            sorted_vals, sorted_inds = paddle.topk(max_vals,\n                                                   teacher_probs.shape[0])\n            mask = paddle.zeros_like(max_vals)\n            mask[sorted_inds[:count_num]] = 1.\n            fg_num = sorted_vals[:count_num].sum()\n            b_mask = mask > 0.\n\n            # for contrast loss\n            probs = teacher_probs[b_mask].detach()\n            if curr_iter > smooth_iter:  # memory-smoothing\n                A = paddle.exp(\n                    paddle.mm(teacher_probs[b_mask], self.queue_probs.t()) /\n                    temperature)\n                A = A / A.sum(1, keepdim=True)\n                probs = alpha * probs + (1 - alpha) * paddle.mm(\n                    A, self.queue_probs)\n            n = student_probs[b_mask].shape[0]\n            # update memory bank\n            
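# NOTE: the memory bank acts as a circular buffer; both queue_feats and\n            # queue_probs are refreshed with detached teacher probabilities, and\n            # queue_ptr wraps around via the modulo on queue_size.\n            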
self.queue_feats[self.queue_ptr:self.queue_ptr +\n                             n, :] = teacher_probs[b_mask].detach()\n            self.queue_probs[self.queue_ptr:self.queue_ptr +\n                             n, :] = teacher_probs[b_mask].detach()\n            self.queue_ptr = (self.queue_ptr + n) % self.queue_size\n\n        # embedding similarity\n        sim = paddle.exp(\n            paddle.mm(student_probs[b_mask], teacher_probs[b_mask].t()) /\n            temperature)\n        sim_probs = sim / sim.sum(1, keepdim=True)\n        # pseudo-label graph with self-loop\n        Q = paddle.mm(probs, probs.t())\n        Q.fill_diagonal_(1)\n        pos_mask = (Q >= 0.5).astype('float32')\n        Q = Q * pos_mask\n        Q = Q / Q.sum(1, keepdim=True)\n        # contrastive loss\n        loss_contrast = -(paddle.log(sim_probs + 1e-7) * Q).sum(1)\n        loss_contrast = loss_contrast.mean()\n\n        # distill_loss_cls\n        loss_cls = QFLv2(\n            student_probs, teacher_probs, weight=mask, reduction=\"sum\") / fg_num\n\n        # distill_loss_iou\n        inputs = paddle.concat(\n            (-student_deltas[b_mask][..., :2], student_deltas[b_mask][..., 2:]),\n            -1)\n        targets = paddle.concat(\n            (-teacher_deltas[b_mask][..., :2], teacher_deltas[b_mask][..., 2:]),\n            -1)\n        iou_loss = GIoULoss(reduction='mean')\n        loss_iou = iou_loss(inputs, targets)\n\n        # distill_loss_dfl\n        loss_dfl = F.cross_entropy(\n            student_dfl[b_mask].reshape([-1, reg_ch]),\n            teacher_dfl[b_mask].reshape([-1, reg_ch]),\n            soft_label=True,\n            reduction='mean')\n\n        return {\n            \"distill_loss_cls\": loss_cls,\n            \"distill_loss_iou\": loss_iou,\n            \"distill_loss_dfl\": loss_dfl,\n            \"distill_loss_contrast\": loss_contrast,\n            \"fg_sum\": fg_num,\n        }\n"
  },
  {
    "path": "ppdet/modeling/ssod/utils.py",
    "content": "#   Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn.functional as F\n\n\ndef align_weak_strong_shape(data_weak, data_strong):\n    max_shape_x = max(data_strong['image'].shape[2],\n                      data_weak['image'].shape[2])\n    max_shape_y = max(data_strong['image'].shape[3],\n                      data_weak['image'].shape[3])\n\n    scale_x_s = max_shape_x / data_strong['image'].shape[2]\n    scale_y_s = max_shape_y / data_strong['image'].shape[3]\n    scale_x_w = max_shape_x / data_weak['image'].shape[2]\n    scale_y_w = max_shape_y / data_weak['image'].shape[3]\n    target_size = [max_shape_x, max_shape_y]\n\n    if scale_x_s != 1 or scale_y_s != 1:\n        data_strong['image'] = F.interpolate(\n            data_strong['image'],\n            size=target_size,\n            mode='bilinear',\n            align_corners=False)\n        if 'gt_bbox' in data_strong:\n            gt_bboxes = data_strong['gt_bbox'].numpy()\n            for i in range(len(gt_bboxes)):\n                if len(gt_bboxes[i]) > 0:\n                    gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x_s\n                    gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y_s\n            data_strong['gt_bbox'] = paddle.to_tensor(gt_bboxes)\n\n    if scale_x_w != 1 or scale_y_w != 1:\n        data_weak['image'] = F.interpolate(\n            data_weak['image'],\n            size=target_size,\n            mode='bilinear',\n            align_corners=False)\n        if 'gt_bbox' in data_weak:\n            gt_bboxes = data_weak['gt_bbox'].numpy()\n            for i in range(len(gt_bboxes)):\n                if len(gt_bboxes[i]) > 0:\n                    gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x_w\n                    gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y_w\n            data_weak['gt_bbox'] = paddle.to_tensor(gt_bboxes)\n    return data_weak, data_strong\n\n\ndef QFLv2(pred_sigmoid,\n          teacher_sigmoid,\n          weight=None,\n          beta=2.0,\n          reduction='mean'):\n    pt = pred_sigmoid\n    zerolabel = paddle.zeros_like(pt)\n    loss = F.binary_cross_entropy(\n        pred_sigmoid, zerolabel, reduction='none') * pt.pow(beta)\n    pos = weight > 0\n\n    pt = teacher_sigmoid[pos] - pred_sigmoid[pos]\n    loss[pos] = F.binary_cross_entropy(\n        pred_sigmoid[pos], teacher_sigmoid[pos],\n        reduction='none') * pt.pow(beta)\n\n    valid = weight >= 0\n    if reduction == \"mean\":\n        loss = loss[valid].mean()\n    elif reduction == \"sum\":\n        loss = loss[valid].sum()\n    return loss\n\n\ndef filter_invalid(bbox, label=None, score=None, thr=0.0, min_size=0):\n    if score.numel() > 0:\n        soft_score = score.max(-1)\n        valid = soft_score >= thr\n        bbox = bbox[valid]\n\n        if label is not None:\n            label = label[valid]\n        score = score[valid]\n    if min_size is not None and 
bbox.shape[0] > 0:\n        bw = bbox[:, 2]\n        bh = bbox[:, 3]\n        valid = (bw > min_size) & (bh > min_size)\n        bbox = bbox[valid]\n\n        if label is not None:\n            label = label[valid]\n            score = score[valid]\n\n    return bbox, label, score\n"
  },
  {
    "path": "ppdet/modeling/tests/__init__.py",
    "content": "#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppdet/modeling/tests/test_architectures.py",
    "content": "#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport unittest\nimport ppdet\n\n\nclass TestFasterRCNN(unittest.TestCase):\n    def setUp(self):\n        self.set_config()\n\n    def set_config(self):\n        self.cfg_file = 'configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml'\n\n    def test_trainer(self):\n        # Trainer __init__ will build model and DataLoader\n        # 'train' and 'eval' mode include dataset loading\n        # use 'test' mode to simplify tests\n        cfg = ppdet.core.workspace.load_config(self.cfg_file)\n        trainer = ppdet.engine.Trainer(cfg, mode='test')\n\n\nclass TestMaskRCNN(TestFasterRCNN):\n    def set_config(self):\n        self.cfg_file = 'configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.yml'\n\n\nclass TestCascadeRCNN(TestFasterRCNN):\n    def set_config(self):\n        self.cfg_file = 'configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.yml'\n\n\nclass TestYolov3(TestFasterRCNN):\n    def set_config(self):\n        self.cfg_file = 'configs/yolov3/yolov3_darknet53_270e_coco.yml'\n\n\nclass TestSSD(TestFasterRCNN):\n    def set_config(self):\n        self.cfg_file = 'configs/ssd/ssd_vgg16_300_240e_voc.yml'\n\n\nclass TestGFL(TestFasterRCNN):\n    def set_config(self):\n        self.cfg_file = 'configs/gfl/gfl_r50_fpn_1x_coco.yml'\n\n\nclass TestPicoDet(TestFasterRCNN):\n    def set_config(self):\n        self.cfg_file = 'configs/picodet/picodet_s_320_coco_lcnet.yml'\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "ppdet/modeling/tests/test_base.py",
    "content": "#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import print_function\nimport unittest\n\nimport contextlib\n\nimport paddle\nfrom paddle.static import Program\n\n\nclass LayerTest(unittest.TestCase):\n    @classmethod\n    def setUpClass(cls):\n        cls.seed = 111\n\n    @classmethod\n    def tearDownClass(cls):\n        pass\n\n    def _get_place(self, force_to_use_cpu=False):\n        # this option for ops that only have cpu kernel\n        if force_to_use_cpu:\n            return 'cpu'\n        else:\n            return paddle.device.get_device()\n\n    @contextlib.contextmanager\n    def static_graph(self):\n        paddle.enable_static()\n        scope = paddle.static.Scope()\n        program = Program()\n        with paddle.static.scope_guard(scope):\n            with paddle.static.program_guard(program):\n                paddle.seed(self.seed)\n                paddle.framework.random._manual_program_seed(self.seed)\n                yield\n\n    def get_static_graph_result(self,\n                                feed,\n                                fetch_list,\n                                with_lod=False,\n                                force_to_use_cpu=False):\n        exe = paddle.static.Executor(self._get_place(force_to_use_cpu))\n        exe.run(paddle.static.default_startup_program())\n        return exe.run(paddle.static.default_main_program(),\n                       feed=feed,\n                       fetch_list=fetch_list,\n                       return_numpy=(not with_lod))\n\n    @contextlib.contextmanager\n    def dynamic_graph(self, force_to_use_cpu=False):\n        paddle.disable_static()\n        place = self._get_place(force_to_use_cpu=force_to_use_cpu)\n        paddle.device.set_device(place)\n        paddle.seed(self.seed)\n        paddle.framework.random._manual_program_seed(self.seed)\n        yield\n"
  },
  {
    "path": "ppdet/modeling/tests/test_mstest.py",
    "content": "#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport unittest\nfrom ppdet.core.workspace import load_config\nfrom ppdet.engine import Trainer\n\n\nclass TestMultiScaleInference(unittest.TestCase):\n    def setUp(self):\n        self.set_config()\n\n    def set_config(self):\n        self.mstest_cfg_file = 'configs/faster_rcnn/faster_rcnn_r34_fpn_multiscaletest_1x_coco.yml'\n\n    # test evaluation with multi scale test\n    def test_eval_mstest(self):\n        cfg = load_config(self.mstest_cfg_file)\n        trainer = Trainer(cfg, mode='eval')\n\n        cfg.weights = 'https://paddledet.bj.bcebos.com/models/faster_rcnn_r34_fpn_1x_coco.pdparams'\n        trainer.load_weights(cfg.weights)\n\n        trainer.evaluate()\n\n    # test inference with multi scale test\n    def test_infer_mstest(self):\n        cfg = load_config(self.mstest_cfg_file)\n        trainer = Trainer(cfg, mode='test')\n\n        cfg.weights = 'https://paddledet.bj.bcebos.com/models/faster_rcnn_r34_fpn_1x_coco.pdparams'\n        trainer.load_weights(cfg.weights)\n        tests_img_root = os.path.join(os.path.dirname(__file__), 'imgs')\n\n        # input images to predict\n        imgs = [\n            'coco2017_val2017_000000000139.jpg',\n            'coco2017_val2017_000000000724.jpg'\n        ]\n        imgs = [os.path.join(tests_img_root, img) for img in imgs]\n        trainer.predict(\n            imgs, draw_threshold=0.5, output_dir='output', save_results=False)\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "ppdet/modeling/tests/test_ops.py",
    "content": "#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import print_function\nimport os, sys\n# add python path of PaddleDetection to sys.path\nparent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 4)))\nif parent_path not in sys.path:\n    sys.path.append(parent_path)\n\nimport unittest\nimport numpy as np\n\nimport paddle\n\nimport ppdet.modeling.ops as ops\nfrom ppdet.modeling.tests.test_base import LayerTest\n\n\ndef make_rois(h, w, rois_num, output_size):\n    rois = np.zeros((0, 4)).astype('float32')\n    for roi_num in rois_num:\n        roi = np.zeros((roi_num, 4)).astype('float32')\n        roi[:, 0] = np.random.randint(0, h - output_size[0], size=roi_num)\n        roi[:, 1] = np.random.randint(0, w - output_size[1], size=roi_num)\n        roi[:, 2] = np.random.randint(roi[:, 0] + output_size[0], h)\n        roi[:, 3] = np.random.randint(roi[:, 1] + output_size[1], w)\n        rois = np.vstack((rois, roi))\n    return rois\n\n\ndef softmax(x):\n    # clip to shiftx, otherwise, when calc loss with\n    # log(exp(shiftx)), may get log(0)=INF\n    shiftx = (x - np.max(x)).clip(-64.)\n    exps = np.exp(shiftx)\n    return exps / np.sum(exps)\n\n\nclass TestROIAlign(LayerTest):\n    def test_roi_align(self):\n        b, c, h, w = 2, 12, 20, 20\n        inputs_np = np.random.rand(b, c, h, w).astype('float32')\n        rois_num = [4, 6]\n        output_size = (7, 7)\n        rois_np = make_rois(h, w, rois_num, output_size)\n        rois_num_np = np.array(rois_num).astype('int32')\n        with self.static_graph():\n            inputs = paddle.static.data(\n                name='inputs', shape=[b, c, h, w], dtype='float32')\n            rois = paddle.static.data(\n                name='rois', shape=[10, 4], dtype='float32')\n            rois_num = paddle.static.data(\n                name='rois_num', shape=[None], dtype='int32')\n\n            output = paddle.vision.ops.roi_align(\n                x=inputs,\n                boxes=rois,\n                boxes_num=rois_num,\n                output_size=output_size)\n            output_np, = self.get_static_graph_result(\n                feed={\n                    'inputs': inputs_np,\n                    'rois': rois_np,\n                    'rois_num': rois_num_np\n                },\n                fetch_list=output,\n                with_lod=False)\n\n        with self.dynamic_graph():\n            inputs_dy = paddle.to_tensor(inputs_np)\n            rois_dy = paddle.to_tensor(rois_np)\n            rois_num_dy = paddle.to_tensor(rois_num_np)\n\n            output_dy = paddle.vision.ops.roi_align(\n                x=inputs_dy,\n                boxes=rois_dy,\n                boxes_num=rois_num_dy,\n                output_size=output_size)\n            output_dy_np = output_dy.numpy()\n\n        self.assertTrue(np.array_equal(output_np, output_dy_np))\n\n    def test_roi_align_error(self):\n        with self.static_graph():\n       
     inputs = paddle.static.data(\n                name='inputs', shape=[2, 12, 20, 20], dtype='float32')\n            rois = paddle.static.data(\n                name='data_error', shape=[10, 4], dtype='int32', lod_level=1)\n            self.assertRaises(\n                TypeError,\n                paddle.vision.ops.roi_align,\n                input=inputs,\n                rois=rois,\n                output_size=(7, 7))\n\n        paddle.disable_static()\n\n\nclass TestROIPool(LayerTest):\n    def test_roi_pool(self):\n        b, c, h, w = 2, 12, 20, 20\n        inputs_np = np.random.rand(b, c, h, w).astype('float32')\n        rois_num = [4, 6]\n        output_size = (7, 7)\n        rois_np = make_rois(h, w, rois_num, output_size)\n        rois_num_np = np.array(rois_num).astype('int32')\n        with self.static_graph():\n            inputs = paddle.static.data(\n                name='inputs', shape=[b, c, h, w], dtype='float32')\n            rois = paddle.static.data(\n                name='rois', shape=[10, 4], dtype='float32')\n            rois_num = paddle.static.data(\n                name='rois_num', shape=[None], dtype='int32')\n\n            output = paddle.vision.ops.roi_pool(\n                x=inputs,\n                boxes=rois,\n                boxes_num=rois_num,\n                output_size=output_size)\n            output_np, = self.get_static_graph_result(\n                feed={\n                    'inputs': inputs_np,\n                    'rois': rois_np,\n                    'rois_num': rois_num_np\n                },\n                fetch_list=[output],\n                with_lod=False)\n\n        with self.dynamic_graph():\n            inputs_dy = paddle.to_tensor(inputs_np)\n            rois_dy = paddle.to_tensor(rois_np)\n            rois_num_dy = paddle.to_tensor(rois_num_np)\n\n            output_dy = paddle.vision.ops.roi_pool(\n                x=inputs_dy,\n                boxes=rois_dy,\n                boxes_num=rois_num_dy,\n                output_size=output_size)\n            output_dy_np = output_dy.numpy()\n\n        self.assertTrue(np.array_equal(output_np, output_dy_np))\n\n    def test_roi_pool_error(self):\n        with self.static_graph():\n            inputs = paddle.static.data(\n                name='inputs', shape=[2, 12, 20, 20], dtype='float32')\n            rois = paddle.static.data(\n                name='data_error', shape=[10, 4], dtype='int32', lod_level=1)\n            self.assertRaises(\n                TypeError,\n                paddle.vision.ops.roi_pool,\n                input=inputs,\n                rois=rois,\n                output_size=(7, 7))\n\n        paddle.disable_static()\n\n\nclass TestPriorBox(LayerTest):\n    def test_prior_box(self):\n        input_np = np.random.rand(2, 10, 32, 32).astype('float32')\n        image_np = np.random.rand(2, 10, 40, 40).astype('float32')\n        min_sizes = [2, 4]\n        with self.static_graph():\n            input = paddle.static.data(\n                name='input', shape=[2, 10, 32, 32], dtype='float32')\n            image = paddle.static.data(\n                name='image', shape=[2, 10, 40, 40], dtype='float32')\n\n            box, var = ops.prior_box(\n                input=input,\n                image=image,\n                min_sizes=min_sizes,\n                clip=True,\n                flip=True)\n            box_np, var_np = self.get_static_graph_result(\n                feed={\n                    'input': input_np,\n                    'image': image_np,\n             
   },\n                fetch_list=[box, var],\n                with_lod=False)\n\n        with self.dynamic_graph():\n            inputs_dy = paddle.to_tensor(input_np)\n            image_dy = paddle.to_tensor(image_np)\n\n            box_dy, var_dy = ops.prior_box(\n                input=inputs_dy,\n                image=image_dy,\n                min_sizes=min_sizes,\n                clip=True,\n                flip=True)\n            box_dy_np = box_dy.numpy()\n            var_dy_np = var_dy.numpy()\n\n        self.assertTrue(np.array_equal(box_np, box_dy_np))\n        self.assertTrue(np.array_equal(var_np, var_dy_np))\n\n    def test_prior_box_error(self):\n        with self.static_graph():\n            input = paddle.static.data(\n                name='input', shape=[2, 10, 32, 32], dtype='int32')\n            image = paddle.static.data(\n                name='image', shape=[2, 10, 40, 40], dtype='int32')\n            self.assertRaises(\n                TypeError,\n                ops.prior_box,\n                input=input,\n                image=image,\n                min_sizes=[2, 4],\n                clip=True,\n                flip=True)\n\n        paddle.disable_static()\n\n\nclass TestMulticlassNms(LayerTest):\n    def test_multiclass_nms(self):\n        boxes_np = np.random.rand(10, 81, 4).astype('float32')\n        scores_np = np.random.rand(10, 81).astype('float32')\n        rois_num_np = np.array([2, 8]).astype('int32')\n        with self.static_graph():\n            boxes = paddle.static.data(\n                name='bboxes',\n                shape=[None, 81, 4],\n                dtype='float32',\n                lod_level=1)\n            scores = paddle.static.data(\n                name='scores', shape=[None, 81], dtype='float32', lod_level=1)\n            rois_num = paddle.static.data(\n                name='rois_num', shape=[None], dtype='int32')\n\n            output = ops.multiclass_nms(\n                bboxes=boxes,\n                scores=scores,\n                background_label=0,\n                score_threshold=0.5,\n                nms_top_k=400,\n                nms_threshold=0.3,\n                keep_top_k=200,\n                normalized=False,\n                return_index=True,\n                rois_num=rois_num)\n            out_np, index_np, nms_rois_num_np = self.get_static_graph_result(\n                feed={\n                    'bboxes': boxes_np,\n                    'scores': scores_np,\n                    'rois_num': rois_num_np\n                },\n                fetch_list=output,\n                with_lod=True)\n            out_np = np.array(out_np)\n            index_np = np.array(index_np)\n            nms_rois_num_np = np.array(nms_rois_num_np)\n\n        with self.dynamic_graph():\n            boxes_dy = paddle.to_tensor(boxes_np)\n            scores_dy = paddle.to_tensor(scores_np)\n            rois_num_dy = paddle.to_tensor(rois_num_np)\n\n            out_dy, index_dy, nms_rois_num_dy = ops.multiclass_nms(\n                bboxes=boxes_dy,\n                scores=scores_dy,\n                background_label=0,\n                score_threshold=0.5,\n                nms_top_k=400,\n                nms_threshold=0.3,\n                keep_top_k=200,\n                normalized=False,\n                return_index=True,\n                rois_num=rois_num_dy)\n            out_dy_np = out_dy.numpy()\n            index_dy_np = index_dy.numpy()\n            nms_rois_num_dy_np = nms_rois_num_dy.numpy()\n\n        
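# parity check: static-graph and dygraph execution paths should produce\n        # identical detections, indices and per-image box counts\n        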
self.assertTrue(np.array_equal(out_np, out_dy_np))\n        self.assertTrue(np.array_equal(index_np, index_dy_np))\n        self.assertTrue(np.array_equal(nms_rois_num_np, nms_rois_num_dy_np))\n\n    def test_multiclass_nms_error(self):\n        with self.static_graph():\n            boxes = paddle.static.data(\n                name='bboxes', shape=[81, 4], dtype='float32', lod_level=1)\n            scores = paddle.static.data(\n                name='scores', shape=[81], dtype='float32', lod_level=1)\n            rois_num = paddle.static.data(\n                name='rois_num', shape=[40, 41], dtype='int32')\n            self.assertRaises(\n                TypeError,\n                ops.multiclass_nms,\n                boxes=boxes,\n                scores=scores,\n                background_label=0,\n                score_threshold=0.5,\n                nms_top_k=400,\n                nms_threshold=0.3,\n                keep_top_k=200,\n                normalized=False,\n                return_index=True,\n                rois_num=rois_num)\n\n\nclass TestMatrixNMS(LayerTest):\n    def test_matrix_nms(self):\n        N, M, C = 7, 1200, 21\n        BOX_SIZE = 4\n        nms_top_k = 400\n        keep_top_k = 200\n        score_threshold = 0.01\n        post_threshold = 0.\n\n        scores_np = np.random.random((N * M, C)).astype('float32')\n        scores_np = np.apply_along_axis(softmax, 1, scores_np)\n        scores_np = np.reshape(scores_np, (N, M, C))\n        scores_np = np.transpose(scores_np, (0, 2, 1))\n\n        boxes_np = np.random.random((N, M, BOX_SIZE)).astype('float32')\n        boxes_np[:, :, 0:2] = boxes_np[:, :, 0:2] * 0.5\n        boxes_np[:, :, 2:4] = boxes_np[:, :, 2:4] * 0.5 + 0.5\n\n        with self.static_graph():\n            boxes = paddle.static.data(\n                name='boxes', shape=[N, M, BOX_SIZE], dtype='float32')\n            scores = paddle.static.data(\n                name='scores', shape=[N, C, M], dtype='float32')\n            out, index, _ = ops.matrix_nms(\n                bboxes=boxes,\n                scores=scores,\n                score_threshold=score_threshold,\n                post_threshold=post_threshold,\n                nms_top_k=nms_top_k,\n                keep_top_k=keep_top_k,\n                return_index=True)\n            out_np, index_np = self.get_static_graph_result(\n                feed={'boxes': boxes_np,\n                      'scores': scores_np},\n                fetch_list=[out, index],\n                with_lod=True)\n\n        with self.dynamic_graph():\n            boxes_dy = paddle.to_tensor(boxes_np)\n            scores_dy = paddle.to_tensor(scores_np)\n\n            out_dy, index_dy, _ = ops.matrix_nms(\n                bboxes=boxes_dy,\n                scores=scores_dy,\n                score_threshold=score_threshold,\n                post_threshold=post_threshold,\n                nms_top_k=nms_top_k,\n                keep_top_k=keep_top_k,\n                return_index=True)\n            out_dy_np = out_dy.numpy()\n            index_dy_np = index_dy.numpy()\n\n        self.assertTrue(np.array_equal(out_np, out_dy_np))\n        self.assertTrue(np.array_equal(index_np, index_dy_np))\n\n    def test_matrix_nms_error(self):\n        with self.static_graph():\n            bboxes = paddle.static.data(\n                name='bboxes', shape=[7, 1200, 4], dtype='float32')\n            scores = paddle.static.data(\n                name='data_error', shape=[7, 21, 1200], dtype='int32')\n            self.assertRaises(\n     
           TypeError,\n                ops.matrix_nms,\n                bboxes=bboxes,\n                scores=scores,\n                score_threshold=0.01,\n                post_threshold=0.,\n                nms_top_k=400,\n                keep_top_k=200,\n                return_index=True)\n\n        paddle.disable_static()\n\n\nclass TestBoxCoder(LayerTest):\n    def test_box_coder(self):\n\n        prior_box_np = np.random.random((81, 4)).astype('float32')\n        prior_box_var_np = np.random.random((81, 4)).astype('float32')\n        target_box_np = np.random.random((20, 81, 4)).astype('float32')\n\n        # static\n        with self.static_graph():\n            prior_box = paddle.static.data(\n                name='prior_box', shape=[81, 4], dtype='float32')\n            prior_box_var = paddle.static.data(\n                name='prior_box_var', shape=[81, 4], dtype='float32')\n            target_box = paddle.static.data(\n                name='target_box', shape=[20, 81, 4], dtype='float32')\n\n            boxes = ops.box_coder(\n                prior_box=prior_box,\n                prior_box_var=prior_box_var,\n                target_box=target_box,\n                code_type=\"decode_center_size\",\n                box_normalized=False)\n\n            boxes_np, = self.get_static_graph_result(\n                feed={\n                    'prior_box': prior_box_np,\n                    'prior_box_var': prior_box_var_np,\n                    'target_box': target_box_np,\n                },\n                fetch_list=[boxes],\n                with_lod=False)\n\n        # dygraph\n        with self.dynamic_graph():\n            prior_box_dy = paddle.to_tensor(prior_box_np)\n            prior_box_var_dy = paddle.to_tensor(prior_box_var_np)\n            target_box_dy = paddle.to_tensor(target_box_np)\n\n            boxes_dy = ops.box_coder(\n                prior_box=prior_box_dy,\n                prior_box_var=prior_box_var_dy,\n                target_box=target_box_dy,\n                code_type=\"decode_center_size\",\n                box_normalized=False)\n\n            boxes_dy_np = boxes_dy.numpy()\n\n            self.assertTrue(np.array_equal(boxes_np, boxes_dy_np))\n\n    def test_box_coder_error(self):\n        with self.static_graph():\n            prior_box = paddle.static.data(\n                name='prior_box', shape=[81, 4], dtype='int32')\n            prior_box_var = paddle.static.data(\n                name='prior_box_var', shape=[81, 4], dtype='float32')\n            target_box = paddle.static.data(\n                name='target_box', shape=[20, 81, 4], dtype='float32')\n\n            self.assertRaises(TypeError, ops.box_coder, prior_box,\n                              prior_box_var, target_box)\n\n        paddle.disable_static()\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "ppdet/modeling/tests/test_yolov3_loss.py",
    "content": "#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import division\n\nimport unittest\n\nimport paddle\nimport paddle.nn.functional as F\n# add python path of PaddleDetection to sys.path\nimport os\nimport sys\nparent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 4)))\nif parent_path not in sys.path:\n    sys.path.append(parent_path)\n\nfrom ppdet.modeling.losses import YOLOv3Loss\nfrom ppdet.data.transform.op_helper import jaccard_overlap\nfrom ppdet.modeling.bbox_utils import iou_similarity\nimport numpy as np\nnp.random.seed(0)\n\n\ndef _split_output(output, an_num, num_classes):\n    \"\"\"\n    Split output feature map to x, y, w, h, objectness, classification\n    along channel dimension\n    \"\"\"\n    x = paddle.strided_slice(\n        output,\n        axes=[1],\n        starts=[0],\n        ends=[output.shape[1]],\n        strides=[5 + num_classes])\n    y = paddle.strided_slice(\n        output,\n        axes=[1],\n        starts=[1],\n        ends=[output.shape[1]],\n        strides=[5 + num_classes])\n    w = paddle.strided_slice(\n        output,\n        axes=[1],\n        starts=[2],\n        ends=[output.shape[1]],\n        strides=[5 + num_classes])\n    h = paddle.strided_slice(\n        output,\n        axes=[1],\n        starts=[3],\n        ends=[output.shape[1]],\n        strides=[5 + num_classes])\n    obj = paddle.strided_slice(\n        output,\n        axes=[1],\n        starts=[4],\n        ends=[output.shape[1]],\n        strides=[5 + num_classes])\n    clss = []\n    stride = output.shape[1] // an_num\n    for m in range(an_num):\n        clss.append(\n            paddle.slice(\n                output,\n                axes=[1],\n                starts=[stride * m + 5],\n                ends=[stride * m + 5 + num_classes]))\n    cls = paddle.transpose(paddle.stack(clss, axis=1), perm=[0, 1, 3, 4, 2])\n    return (x, y, w, h, obj, cls)\n\n\ndef _split_target(target):\n    \"\"\"\n    split target to x, y, w, h, objectness, classification\n    along dimension 2\n    target is in shape [N, an_num, 6 + class_num, H, W]\n    \"\"\"\n    tx = target[:, :, 0, :, :]\n    ty = target[:, :, 1, :, :]\n    tw = target[:, :, 2, :, :]\n    th = target[:, :, 3, :, :]\n    tscale = target[:, :, 4, :, :]\n    tobj = target[:, :, 5, :, :]\n    tcls = paddle.transpose(target[:, :, 6:, :, :], perm=[0, 1, 3, 4, 2])\n    tcls.stop_gradient = True\n    return (tx, ty, tw, th, tscale, tobj, tcls)\n\n\ndef _calc_obj_loss(output, obj, tobj, gt_box, batch_size, anchors, num_classes,\n                   downsample, ignore_thresh, scale_x_y):\n    # A prediction bbox overlap any gt_bbox over ignore_thresh, \n    # objectness loss will be ignored, process as follows:\n    # 1. 
get pred bbox, which is the same as in YOLOv3 infer mode, use yolo_box here\n    # NOTE: img_size is set to 1.0 to get normalized pred bbox\n    bbox, prob = paddle.vision.ops.yolo_box(\n        x=output,\n        img_size=paddle.ones(\n            shape=[batch_size, 2], dtype=\"int32\"),\n        anchors=anchors,\n        class_num=num_classes,\n        conf_thresh=0.,\n        downsample_ratio=downsample,\n        clip_bbox=False,\n        scale_x_y=scale_x_y)\n    # 2. split pred bbox and gt bbox by sample, calculate IoU between pred bbox\n    #    and gt bbox in each sample\n    if batch_size > 1:\n        preds = paddle.split(bbox, batch_size, axis=0)\n        gts = paddle.split(gt_box, batch_size, axis=0)\n    else:\n        preds = [bbox]\n        gts = [gt_box]\n        probs = [prob]\n    ious = []\n    for pred, gt in zip(preds, gts):\n\n        def box_xywh2xyxy(box):\n            x = box[:, 0]\n            y = box[:, 1]\n            w = box[:, 2]\n            h = box[:, 3]\n            return paddle.stack(\n                [\n                    x - w / 2.,\n                    y - h / 2.,\n                    x + w / 2.,\n                    y + h / 2.,\n                ], axis=1)\n\n        pred = paddle.squeeze(pred, axis=[0])\n        gt = box_xywh2xyxy(paddle.squeeze(gt, axis=[0]))\n        ious.append(iou_similarity(pred, gt))\n    iou = paddle.stack(ious, axis=0)\n    # 3. Get iou_mask by IoU between gt bbox and prediction bbox,\n    #    Get obj_mask by tobj (holds gt_score), calculate objectness loss\n    max_iou = paddle.max(iou, axis=-1)\n    iou_mask = paddle.cast(max_iou <= ignore_thresh, dtype=\"float32\")\n    output_shape = output.shape\n    an_num = len(anchors) // 2\n    iou_mask = paddle.reshape(iou_mask, (-1, an_num, output_shape[2],\n                                         output_shape[3]))\n    iou_mask.stop_gradient = True\n    # NOTE: tobj holds gt_score, obj_mask holds object existence mask\n    obj_mask = paddle.cast(tobj > 0., dtype=\"float32\")\n    obj_mask.stop_gradient = True\n    # For positive objectness grids, objectness loss should be calculated\n    # For negative objectness grids, objectness loss is calculated only where iou_mask == 1.0\n    obj_sigmoid = F.sigmoid(obj)\n    loss_obj = F.binary_cross_entropy(obj_sigmoid, obj_mask, reduction='none')\n    loss_obj_pos = paddle.sum(loss_obj * tobj, axis=[1, 2, 3])\n    loss_obj_neg = paddle.sum(loss_obj * (1.0 - obj_mask) * iou_mask,\n                              axis=[1, 2, 3])\n    return loss_obj_pos, loss_obj_neg\n\n\ndef fine_grained_loss(output,\n                      target,\n                      gt_box,\n                      batch_size,\n                      num_classes,\n                      anchors,\n                      ignore_thresh,\n                      downsample,\n                      scale_x_y=1.,\n                      eps=1e-10):\n    an_num = len(anchors) // 2\n    x, y, w, h, obj, cls = _split_output(output, an_num, num_classes)\n    tx, ty, tw, th, tscale, tobj, tcls = _split_target(target)\n\n    tscale_tobj = tscale * tobj\n\n    scale_x_y = scale_x_y\n\n    if (abs(scale_x_y - 1.0) < eps):\n        x = F.sigmoid(x)\n        y = F.sigmoid(y)\n        loss_x = F.binary_cross_entropy(x, tx, reduction='none') * tscale_tobj\n        loss_x = paddle.sum(loss_x, axis=[1, 2, 3])\n        loss_y = F.binary_cross_entropy(y, ty, reduction='none') * tscale_tobj\n        loss_y = paddle.sum(loss_y, axis=[1, 2, 3])\n    else:\n        dx = scale_x_y * F.sigmoid(x) - 0.5 * (scale_x_y - 
1.0)\n        dy = scale_x_y * F.sigmoid(y) - 0.5 * (scale_x_y - 1.0)\n        loss_x = paddle.abs(dx - tx) * tscale_tobj\n        loss_x = paddle.sum(loss_x, axis=[1, 2, 3])\n        loss_y = paddle.abs(dy - ty) * tscale_tobj\n        loss_y = paddle.sum(loss_y, axis=[1, 2, 3])\n\n    # NOTE: we refined loss function of (w, h) as L1Loss\n    loss_w = paddle.abs(w - tw) * tscale_tobj\n    loss_w = paddle.sum(loss_w, axis=[1, 2, 3])\n    loss_h = paddle.abs(h - th) * tscale_tobj\n    loss_h = paddle.sum(loss_h, axis=[1, 2, 3])\n\n    loss_obj_pos, loss_obj_neg = _calc_obj_loss(\n        output, obj, tobj, gt_box, batch_size, anchors, num_classes, downsample,\n        ignore_thresh, scale_x_y)\n\n    cls = F.sigmoid(cls)\n    loss_cls = F.binary_cross_entropy(cls, tcls, reduction='none')\n    tobj = paddle.unsqueeze(tobj, axis=-1)\n\n    loss_cls = paddle.multiply(loss_cls, tobj)\n    loss_cls = paddle.sum(loss_cls, axis=[1, 2, 3, 4])\n\n    loss_xys = paddle.mean(loss_x + loss_y)\n    loss_whs = paddle.mean(loss_w + loss_h)\n    loss_objs = paddle.mean(loss_obj_pos + loss_obj_neg)\n    loss_clss = paddle.mean(loss_cls)\n\n    losses_all = {\n        \"loss_xy\": paddle.sum(loss_xys),\n        \"loss_wh\": paddle.sum(loss_whs),\n        \"loss_loc\": paddle.sum(loss_xys) + paddle.sum(loss_whs),\n        \"loss_obj\": paddle.sum(loss_objs),\n        \"loss_cls\": paddle.sum(loss_clss),\n    }\n    return losses_all, x, y, tx, ty\n\n\ndef gt2yolotarget(gt_bbox, gt_class, gt_score, anchors, mask, num_classes, size,\n                  stride):\n    grid_h, grid_w = size\n    h, w = grid_h * stride, grid_w * stride\n    an_hw = np.array(anchors) / np.array([[w, h]])\n    target = np.zeros(\n        (len(mask), 6 + num_classes, grid_h, grid_w), dtype=np.float32)\n    for b in range(gt_bbox.shape[0]):\n        gx, gy, gw, gh = gt_bbox[b, :]\n        cls = gt_class[b]\n        score = gt_score[b]\n        if gw <= 0. or gh <= 0. 
or score <= 0.:\n            continue\n\n        # find best match anchor index\n        best_iou = 0.\n        best_idx = -1\n        for an_idx in range(an_hw.shape[0]):\n            iou = jaccard_overlap([0., 0., gw, gh],\n                                  [0., 0., an_hw[an_idx, 0], an_hw[an_idx, 1]])\n            if iou > best_iou:\n                best_iou = iou\n                best_idx = an_idx\n\n        gi = int(gx * grid_w)\n        gj = int(gy * grid_h)\n\n        # gt box should be regressed in this layer if the best match\n        # anchor index is in the anchor mask of this layer\n        if best_idx in mask:\n            best_n = mask.index(best_idx)\n\n            # x, y, w, h, scale\n            target[best_n, 0, gj, gi] = gx * grid_w - gi\n            target[best_n, 1, gj, gi] = gy * grid_h - gj\n            target[best_n, 2, gj, gi] = np.log(gw * w / anchors[best_idx][0])\n            target[best_n, 3, gj, gi] = np.log(gh * h / anchors[best_idx][1])\n            target[best_n, 4, gj, gi] = 2.0 - gw * gh\n\n            # objectness record gt_score\n            # if target[best_n, 5, gj, gi] > 0:\n            #     print('find 1 duplicate')\n            target[best_n, 5, gj, gi] = score\n\n            # classification\n            target[best_n, 6 + cls, gj, gi] = 1.\n\n    return target\n\n\nclass TestYolov3LossOp(unittest.TestCase):\n    def setUp(self):\n        self.initTestCase()\n        x = np.random.uniform(0, 1, self.x_shape).astype('float64')\n        gtbox = np.random.random(size=self.gtbox_shape).astype('float64')\n        gtlabel = np.random.randint(0, self.class_num, self.gtbox_shape[:2])\n        gtmask = np.random.randint(0, 2, self.gtbox_shape[:2])\n        gtbox = gtbox * gtmask[:, :, np.newaxis]\n        gtlabel = gtlabel * gtmask\n\n        gtscore = np.ones(self.gtbox_shape[:2]).astype('float64')\n        if self.gtscore:\n            gtscore = np.random.random(self.gtbox_shape[:2]).astype('float64')\n\n        target = []\n        for box, label, score in zip(gtbox, gtlabel, gtscore):\n            target.append(\n                gt2yolotarget(box, label, score, self.anchors, self.anchor_mask,\n                              self.class_num, (self.h, self.w\n                                               ), self.downsample_ratio))\n\n        self.target = np.array(target).astype('float64')\n\n        self.mask_anchors = []\n        for i in self.anchor_mask:\n            self.mask_anchors.extend(self.anchors[i])\n        self.x = x\n        self.gtbox = gtbox\n        self.gtlabel = gtlabel\n        self.gtscore = gtscore\n\n    def initTestCase(self):\n        self.b = 8\n        self.h = 19\n        self.w = 19\n        self.anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45],\n                        [59, 119], [116, 90], [156, 198], [373, 326]]\n        self.anchor_mask = [6, 7, 8]\n        self.na = len(self.anchor_mask)\n        self.class_num = 80\n        self.ignore_thresh = 0.7\n        self.downsample_ratio = 32\n        self.x_shape = (self.b, len(self.anchor_mask) * (5 + self.class_num),\n                        self.h, self.w)\n        self.gtbox_shape = (self.b, 40, 4)\n        self.gtscore = True\n        self.use_label_smooth = False\n        self.scale_x_y = 1.\n\n    def test_loss(self):\n        x, gtbox, gtlabel, gtscore, target = self.x, self.gtbox, self.gtlabel, self.gtscore, self.target\n        yolo_loss = YOLOv3Loss(\n            ignore_thresh=self.ignore_thresh,\n            label_smooth=self.use_label_smooth,\n            
num_classes=self.class_num,\n            downsample=self.downsample_ratio,\n            scale_x_y=self.scale_x_y)\n        x = paddle.to_tensor(x.astype(np.float32))\n        gtbox = paddle.to_tensor(gtbox.astype(np.float32))\n        gtlabel = paddle.to_tensor(gtlabel.astype(np.float32))\n        gtscore = paddle.to_tensor(gtscore.astype(np.float32))\n        t = paddle.to_tensor(target.astype(np.float32))\n        anchor = [self.anchors[i] for i in self.anchor_mask]\n        (yolo_loss1, px, py, tx, ty) = fine_grained_loss(\n            output=x,\n            target=t,\n            gt_box=gtbox,\n            batch_size=self.b,\n            num_classes=self.class_num,\n            anchors=self.mask_anchors,\n            ignore_thresh=self.ignore_thresh,\n            downsample=self.downsample_ratio,\n            scale_x_y=self.scale_x_y)\n        yolo_loss2 = yolo_loss.yolov3_loss(\n            x, t, gtbox, anchor, self.downsample_ratio, self.scale_x_y)\n        for k in yolo_loss2:\n            self.assertAlmostEqual(\n                float(yolo_loss1[k]), float(yolo_loss2[k]), delta=1e-2, msg=k)\n\n\nclass TestYolov3LossNoGTScore(TestYolov3LossOp):\n    def initTestCase(self):\n        self.b = 1\n        self.h = 76\n        self.w = 76\n        self.anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45],\n                        [59, 119], [116, 90], [156, 198], [373, 326]]\n        self.anchor_mask = [0, 1, 2]\n        self.na = len(self.anchor_mask)\n        self.class_num = 80\n        self.ignore_thresh = 0.7\n        self.downsample_ratio = 8\n        self.x_shape = (self.b, len(self.anchor_mask) * (5 + self.class_num),\n                        self.h, self.w)\n        self.gtbox_shape = (self.b, 40, 4)\n        self.gtscore = False\n        self.use_label_smooth = False\n        self.scale_x_y = 1.\n\n\nclass TestYolov3LossWithScaleXY(TestYolov3LossOp):\n    def initTestCase(self):\n        self.b = 5\n        self.h = 38\n        self.w = 38\n        self.anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45],\n                        [59, 119], [116, 90], [156, 198], [373, 326]]\n        self.anchor_mask = [3, 4, 5]\n        self.na = len(self.anchor_mask)\n        self.class_num = 80\n        self.ignore_thresh = 0.7\n        self.downsample_ratio = 16\n        self.x_shape = (self.b, len(self.anchor_mask) * (5 + self.class_num),\n                        self.h, self.w)\n        self.gtbox_shape = (self.b, 40, 4)\n        self.gtscore = True\n        self.use_label_smooth = False\n        self.scale_x_y = 1.2\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"
  },
  {
    "path": "ppdet/modeling/transformers/__init__.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom . import detr_transformer\nfrom . import utils\nfrom . import matchers\nfrom . import position_encoding\nfrom . import deformable_transformer\nfrom . import dino_transformer\nfrom . import group_detr_transformer\nfrom . import mask_dino_transformer\nfrom . import rtdetr_transformer\nfrom . import hybrid_encoder\nfrom . import mask_rtdetr_transformer\nfrom . import rtdetr_transformerv2\nfrom . import rtdetr_transformerv3\n\nfrom .detr_transformer import *\nfrom .utils import *\nfrom .matchers import *\nfrom .position_encoding import *\nfrom .deformable_transformer import *\nfrom .dino_transformer import *\nfrom .petr_transformer import *\nfrom .group_detr_transformer import *\nfrom .mask_dino_transformer import *\nfrom .rtdetr_transformer import *\nfrom .hybrid_encoder import *\nfrom .mask_rtdetr_transformer import *\nfrom .rtdetr_transformerv2 import *\nfrom .rtdetr_transformerv3 import *"
  },
  {
    "path": "ppdet/modeling/transformers/deformable_transformer.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport math\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\n\nfrom ppdet.core.workspace import register\nfrom ..layers import MultiHeadAttention\nfrom .position_encoding import PositionEmbedding\nfrom .utils import _get_clones, get_valid_ratio\nfrom ..initializer import linear_init_, constant_, xavier_uniform_, normal_\n\n__all__ = ['DeformableTransformer']\n\n\nclass MSDeformableAttention(nn.Layer):\n    def __init__(self,\n                 embed_dim=256,\n                 num_heads=8,\n                 num_levels=4,\n                 num_points=4,\n                 lr_mult=0.1):\n        \"\"\"\n        Multi-Scale Deformable Attention Module\n        \"\"\"\n        super(MSDeformableAttention, self).__init__()\n        self.embed_dim = embed_dim\n        self.num_heads = num_heads\n        self.num_levels = num_levels\n        self.num_points = num_points\n        self.total_points = num_heads * num_levels * num_points\n\n        self.head_dim = embed_dim // num_heads\n        assert self.head_dim * num_heads == self.embed_dim, \"embed_dim must be divisible by num_heads\"\n\n        self.sampling_offsets = nn.Linear(\n            embed_dim,\n            self.total_points * 2,\n            weight_attr=ParamAttr(learning_rate=lr_mult),\n            bias_attr=ParamAttr(learning_rate=lr_mult))\n\n        self.attention_weights = nn.Linear(embed_dim, self.total_points)\n        self.value_proj = nn.Linear(embed_dim, embed_dim)\n        self.output_proj = nn.Linear(embed_dim, embed_dim)\n        try:\n            # use cuda op\n            from deformable_detr_ops import ms_deformable_attn\n        except:\n            # use paddle func\n            from .utils import deformable_attention_core_func as ms_deformable_attn\n        self.ms_deformable_attn_core = ms_deformable_attn\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        # sampling_offsets\n        constant_(self.sampling_offsets.weight)\n        thetas = paddle.arange(\n            self.num_heads,\n            dtype=paddle.float32) * (2.0 * math.pi / self.num_heads)\n        grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1)\n        grid_init = grid_init / grid_init.abs().max(-1, keepdim=True)\n        grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile(\n            [1, self.num_levels, self.num_points, 1])\n        scaling = paddle.arange(\n            1, self.num_points + 1,\n            dtype=paddle.float32).reshape([1, 1, -1, 1])\n        grid_init *= scaling\n        self.sampling_offsets.bias.set_value(grid_init.flatten())\n        # 
attention_weights\n        constant_(self.attention_weights.weight)\n        constant_(self.attention_weights.bias)\n        # proj\n        xavier_uniform_(self.value_proj.weight)\n        constant_(self.value_proj.bias)\n        xavier_uniform_(self.output_proj.weight)\n        constant_(self.output_proj.bias)\n\n    def forward(self,\n                query,\n                reference_points,\n                value,\n                value_spatial_shapes,\n                value_level_start_index,\n                value_mask=None):\n        \"\"\"\n        Args:\n            query (Tensor): [bs, query_length, C]\n            reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),\n                bottom-right (1, 1), including padding area\n            value (Tensor): [bs, value_length, C]\n            value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]\n            value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...]\n            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements\n\n        Returns:\n            output (Tensor): [bs, Length_{query}, C]\n        \"\"\"\n        bs, Len_q = query.shape[:2]\n        Len_v = value.shape[1]\n        assert int(value_spatial_shapes.prod(1).sum()) == Len_v\n\n        value = self.value_proj(value)\n        if value_mask is not None:\n            value_mask = value_mask.astype(value.dtype).unsqueeze(-1)\n            value *= value_mask\n        value = value.reshape([bs, Len_v, self.num_heads, self.head_dim])\n\n        sampling_offsets = self.sampling_offsets(query).reshape(\n            [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2])\n        attention_weights = self.attention_weights(query).reshape(\n            [bs, Len_q, self.num_heads, self.num_levels * self.num_points])\n        attention_weights = F.softmax(attention_weights).reshape(\n            [bs, Len_q, self.num_heads, self.num_levels, self.num_points])\n\n        if reference_points.shape[-1] == 2:\n            offset_normalizer = value_spatial_shapes.flip([1]).reshape(\n                [1, 1, 1, self.num_levels, 1, 2])\n            sampling_locations = reference_points.reshape([\n                bs, Len_q, 1, self.num_levels, 1, 2\n            ]) + sampling_offsets / offset_normalizer.astype(sampling_offsets.dtype)\n        elif reference_points.shape[-1] == 4:\n            sampling_locations = (\n                reference_points[:, :, None, :, None, :2] + sampling_offsets /\n                self.num_points * reference_points[:, :, None, :, None, 2:] *\n                0.5)\n        else:\n            raise ValueError(\n                \"Last dim of reference_points must be 2 or 4, but get {} instead.\".\n                format(reference_points.shape[-1]))\n\n        output = self.ms_deformable_attn_core(\n            value, value_spatial_shapes, value_level_start_index,\n            sampling_locations, attention_weights)\n        output = self.output_proj(output)\n\n        return output\n\n\nclass DeformableTransformerEncoderLayer(nn.Layer):\n    def __init__(self,\n                 d_model=256,\n                 n_head=8,\n                 dim_feedforward=1024,\n                 dropout=0.1,\n                 activation=\"relu\",\n                 n_levels=4,\n                 n_points=4,\n                 lr_mult=0.1,\n                 weight_attr=None,\n                 bias_attr=None):\n        
super(DeformableTransformerEncoderLayer, self).__init__()\n        # self attention\n        self.self_attn = MSDeformableAttention(d_model, n_head, n_levels,\n                                               n_points, lr_mult)\n        self.dropout1 = nn.Dropout(dropout)\n        self.norm1 = nn.LayerNorm(\n            d_model, weight_attr=weight_attr, bias_attr=bias_attr)\n        # ffn\n        self.linear1 = nn.Linear(d_model, dim_feedforward)\n        self.activation = getattr(F, activation)\n        self.dropout2 = nn.Dropout(dropout)\n        self.linear2 = nn.Linear(dim_feedforward, d_model)\n        self.dropout3 = nn.Dropout(dropout)\n        self.norm2 = nn.LayerNorm(\n            d_model, weight_attr=weight_attr, bias_attr=bias_attr)\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        linear_init_(self.linear1)\n        linear_init_(self.linear2)\n        xavier_uniform_(self.linear1.weight)\n        xavier_uniform_(self.linear2.weight)\n\n    def with_pos_embed(self, tensor, pos):\n        return tensor if pos is None else tensor + pos\n\n    def forward_ffn(self, src):\n        src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))\n        src = src + self.dropout3(src2)\n        src = self.norm2(src)\n        return src\n\n    def forward(self,\n                src,\n                reference_points,\n                spatial_shapes,\n                level_start_index,\n                src_mask=None,\n                query_pos_embed=None):\n        # self attention\n        src2 = self.self_attn(\n            self.with_pos_embed(src, query_pos_embed), reference_points, src,\n            spatial_shapes, level_start_index, src_mask)\n        src = src + self.dropout1(src2)\n        src = self.norm1(src)\n        # ffn\n        src = self.forward_ffn(src)\n\n        return src\n\n\nclass DeformableTransformerEncoder(nn.Layer):\n    def __init__(self, encoder_layer, num_layers):\n        super(DeformableTransformerEncoder, self).__init__()\n        self.layers = _get_clones(encoder_layer, num_layers)\n        self.num_layers = num_layers\n\n    @staticmethod\n    def get_reference_points(spatial_shapes, valid_ratios, offset=0.5):\n        valid_ratios = valid_ratios.unsqueeze(1)\n        reference_points = []\n        for i, (H, W) in enumerate(spatial_shapes):\n            ref_y, ref_x = paddle.meshgrid(\n                paddle.arange(end=H) + offset, paddle.arange(end=W) + offset)\n            ref_y = ref_y.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 1] *\n                                                    H)\n            ref_x = ref_x.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 0] *\n                                                    W)\n            reference_points.append(paddle.stack((ref_x, ref_y), axis=-1))\n        reference_points = paddle.concat(reference_points, 1).unsqueeze(2)\n        reference_points = reference_points * valid_ratios\n        return reference_points\n\n    def forward(self,\n                feat,\n                spatial_shapes,\n                level_start_index,\n                feat_mask=None,\n                query_pos_embed=None,\n                valid_ratios=None):\n        if valid_ratios is None:\n            valid_ratios = paddle.ones(\n                [feat.shape[0], spatial_shapes.shape[0], 2])\n        reference_points = self.get_reference_points(spatial_shapes,\n                                                     valid_ratios)\n        for layer in self.layers:\n            feat = 
layer(feat, reference_points, spatial_shapes,\n                         level_start_index, feat_mask, query_pos_embed)\n\n        return feat\n\n\nclass DeformableTransformerDecoderLayer(nn.Layer):\n    def __init__(self,\n                 d_model=256,\n                 n_head=8,\n                 dim_feedforward=1024,\n                 dropout=0.1,\n                 activation=\"relu\",\n                 n_levels=4,\n                 n_points=4,\n                 lr_mult=0.1,\n                 weight_attr=None,\n                 bias_attr=None):\n        super(DeformableTransformerDecoderLayer, self).__init__()\n\n        # self attention\n        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)\n        self.dropout1 = nn.Dropout(dropout)\n        self.norm1 = nn.LayerNorm(\n            d_model, weight_attr=weight_attr, bias_attr=bias_attr)\n\n        # cross attention\n        self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels,\n                                                n_points, lr_mult)\n        self.dropout2 = nn.Dropout(dropout)\n        self.norm2 = nn.LayerNorm(\n            d_model, weight_attr=weight_attr, bias_attr=bias_attr)\n\n        # ffn\n        self.linear1 = nn.Linear(d_model, dim_feedforward)\n        self.activation = getattr(F, activation)\n        self.dropout3 = nn.Dropout(dropout)\n        self.linear2 = nn.Linear(dim_feedforward, d_model)\n        self.dropout4 = nn.Dropout(dropout)\n        self.norm3 = nn.LayerNorm(\n            d_model, weight_attr=weight_attr, bias_attr=bias_attr)\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        linear_init_(self.linear1)\n        linear_init_(self.linear2)\n        xavier_uniform_(self.linear1.weight)\n        xavier_uniform_(self.linear2.weight)\n\n    def with_pos_embed(self, tensor, pos):\n        return tensor if pos is None else tensor + pos\n\n    def forward_ffn(self, tgt):\n        tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))\n        tgt = tgt + self.dropout4(tgt2)\n        tgt = self.norm3(tgt)\n        return tgt\n\n    def forward(self,\n                tgt,\n                reference_points,\n                memory,\n                memory_spatial_shapes,\n                memory_level_start_index,\n                memory_mask=None,\n                query_pos_embed=None):\n        # self attention\n        q = k = self.with_pos_embed(tgt, query_pos_embed)\n        tgt2 = self.self_attn(q, k, value=tgt)\n        tgt = tgt + self.dropout1(tgt2)\n        tgt = self.norm1(tgt)\n\n        # cross attention\n        tgt2 = self.cross_attn(\n            self.with_pos_embed(tgt, query_pos_embed), reference_points, memory,\n            memory_spatial_shapes, memory_level_start_index, memory_mask)\n        tgt = tgt + self.dropout2(tgt2)\n        tgt = self.norm2(tgt)\n\n        # ffn\n        tgt = self.forward_ffn(tgt)\n\n        return tgt\n\n\nclass DeformableTransformerDecoder(nn.Layer):\n    def __init__(self, decoder_layer, num_layers, return_intermediate=False):\n        super(DeformableTransformerDecoder, self).__init__()\n        self.layers = _get_clones(decoder_layer, num_layers)\n        self.num_layers = num_layers\n        self.return_intermediate = return_intermediate\n\n    def forward(self,\n                tgt,\n                reference_points,\n                memory,\n                memory_spatial_shapes,\n                memory_level_start_index,\n                memory_mask=None,\n                
query_pos_embed=None):\n        output = tgt\n        intermediate = []\n        for lid, layer in enumerate(self.layers):\n            output = layer(output, reference_points, memory,\n                           memory_spatial_shapes, memory_level_start_index,\n                           memory_mask, query_pos_embed)\n\n            if self.return_intermediate:\n                intermediate.append(output)\n\n        if self.return_intermediate:\n            return paddle.stack(intermediate)\n\n        return output.unsqueeze(0)\n\n\n@register\nclass DeformableTransformer(nn.Layer):\n    __shared__ = ['hidden_dim']\n\n    def __init__(self,\n                 num_queries=300,\n                 position_embed_type='sine',\n                 return_intermediate_dec=True,\n                 in_feats_channel=[512, 1024, 2048],\n                 num_feature_levels=4,\n                 num_encoder_points=4,\n                 num_decoder_points=4,\n                 hidden_dim=256,\n                 nhead=8,\n                 num_encoder_layers=6,\n                 num_decoder_layers=6,\n                 dim_feedforward=1024,\n                 dropout=0.1,\n                 activation=\"relu\",\n                 lr_mult=0.1,\n                 pe_temperature=10000,\n                 pe_offset=-0.5):\n        super(DeformableTransformer, self).__init__()\n        assert position_embed_type in ['sine', 'learned'], \\\n            f'ValueError: position_embed_type not supported {position_embed_type}!'\n        assert len(in_feats_channel) <= num_feature_levels\n\n        self.hidden_dim = hidden_dim\n        self.nhead = nhead\n        self.num_feature_levels = num_feature_levels\n\n        encoder_layer = DeformableTransformerEncoderLayer(\n            hidden_dim, nhead, dim_feedforward, dropout, activation,\n            num_feature_levels, num_encoder_points, lr_mult)\n        self.encoder = DeformableTransformerEncoder(encoder_layer,\n                                                    num_encoder_layers)\n\n        decoder_layer = DeformableTransformerDecoderLayer(\n            hidden_dim, nhead, dim_feedforward, dropout, activation,\n            num_feature_levels, num_decoder_points)\n        self.decoder = DeformableTransformerDecoder(\n            decoder_layer, num_decoder_layers, return_intermediate_dec)\n\n        self.level_embed = nn.Embedding(num_feature_levels, hidden_dim)\n        self.tgt_embed = nn.Embedding(num_queries, hidden_dim)\n        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)\n\n        self.reference_points = nn.Linear(\n            hidden_dim,\n            2,\n            weight_attr=ParamAttr(learning_rate=lr_mult),\n            bias_attr=ParamAttr(learning_rate=lr_mult))\n\n        self.input_proj = nn.LayerList()\n        for in_channels in in_feats_channel:\n            self.input_proj.append(\n                nn.Sequential(\n                    nn.Conv2D(\n                        in_channels, hidden_dim, kernel_size=1),\n                    nn.GroupNorm(32, hidden_dim)))\n        in_channels = in_feats_channel[-1]\n        for _ in range(num_feature_levels - len(in_feats_channel)):\n            self.input_proj.append(\n                nn.Sequential(\n                    nn.Conv2D(\n                        in_channels,\n                        hidden_dim,\n                        kernel_size=3,\n                        stride=2,\n                        padding=1),\n                    nn.GroupNorm(32, hidden_dim)))\n            in_channels = 
hidden_dim\n\n        self.position_embedding = PositionEmbedding(\n            hidden_dim // 2,\n            temperature=pe_temperature,\n            normalize=True if position_embed_type == 'sine' else False,\n            embed_type=position_embed_type,\n            offset=pe_offset,\n            eps=1e-4)\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        normal_(self.level_embed.weight)\n        normal_(self.tgt_embed.weight)\n        normal_(self.query_pos_embed.weight)\n        xavier_uniform_(self.reference_points.weight)\n        constant_(self.reference_points.bias)\n        for l in self.input_proj:\n            xavier_uniform_(l[0].weight)\n            constant_(l[0].bias)\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_feats_channel': [i.channels for i in input_shape], }\n\n    def forward(self, src_feats, src_mask=None, *args, **kwargs):\n        srcs = []\n        for i in range(len(src_feats)):\n            srcs.append(self.input_proj[i](src_feats[i]))\n        if self.num_feature_levels > len(srcs):\n            len_srcs = len(srcs)\n            for i in range(len_srcs, self.num_feature_levels):\n                if i == len_srcs:\n                    srcs.append(self.input_proj[i](src_feats[-1]))\n                else:\n                    srcs.append(self.input_proj[i](srcs[-1]))\n        src_flatten = []\n        mask_flatten = []\n        lvl_pos_embed_flatten = []\n        spatial_shapes = []\n        valid_ratios = []\n        for level, src in enumerate(srcs):\n            src_shape = paddle.shape(src)\n            bs = src_shape[0:1]\n            h = src_shape[2:3]\n            w = src_shape[3:4]\n            spatial_shapes.append(paddle.concat([h, w]))\n            src = src.flatten(2).transpose([0, 2, 1])\n            src_flatten.append(src)\n            if src_mask is not None:\n                mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]\n            else:\n                mask = paddle.ones([bs, h, w])\n            valid_ratios.append(get_valid_ratio(mask))\n            pos_embed = self.position_embedding(mask).flatten(1, 2)\n            lvl_pos_embed = pos_embed + self.level_embed.weight[level]\n            lvl_pos_embed_flatten.append(lvl_pos_embed)\n            mask = mask.flatten(1)\n            mask_flatten.append(mask)\n        src_flatten = paddle.concat(src_flatten, 1)\n        mask_flatten = None if src_mask is None else paddle.concat(mask_flatten,\n                                                                   1)\n        lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1)\n        # [l, 2]\n        spatial_shapes = paddle.to_tensor(\n            paddle.stack(spatial_shapes).astype('int64'))\n        # [l], start index of each level\n        level_start_index = paddle.concat([\n            paddle.zeros(\n                [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1]\n        ])\n        # [b, l, 2]\n        valid_ratios = paddle.stack(valid_ratios, 1)\n\n        # encoder\n        memory = self.encoder(src_flatten, spatial_shapes, level_start_index,\n                              mask_flatten, lvl_pos_embed_flatten, valid_ratios)\n\n        # prepare input for decoder\n        bs, _, c = memory.shape\n        query_embed = self.query_pos_embed.weight.unsqueeze(0).tile([bs, 1, 1])\n        tgt = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])\n        reference_points = F.sigmoid(self.reference_points(query_embed))\n        reference_points_input = 
reference_points.unsqueeze(\n            2) * valid_ratios.unsqueeze(1)\n\n        # decoder\n        hs = self.decoder(tgt, reference_points_input, memory, spatial_shapes,\n                          level_start_index, mask_flatten, query_embed)\n\n        return (hs, memory, reference_points)\n\n\nclass QRDeformableTransformerDecoder(DeformableTransformerDecoder):\n    def __init__(self, decoder_layer, num_layers,\n                 start_q=None, end_q=None, return_intermediate=False):\n        super(QRDeformableTransformerDecoder, self).__init__(\n            decoder_layer, num_layers, return_intermediate=return_intermediate)\n        self.start_q = start_q\n        self.end_q = end_q\n\n    def forward(self,\n                tgt,\n                reference_points,\n                memory,\n                memory_spatial_shapes,\n                memory_level_start_index,\n                memory_mask=None,\n                query_pos_embed=None):\n\n        if not self.training:\n            return super(QRDeformableTransformerDecoder, self).forward(\n                tgt, reference_points,\n                memory, memory_spatial_shapes,\n                memory_level_start_index,\n                memory_mask=memory_mask,\n                query_pos_embed=query_pos_embed)\n\n        batchsize = tgt.shape[0]\n        query_list_reserve = [tgt]\n        intermediate = []\n        for lid, layer in enumerate(self.layers):\n\n            start_q = self.start_q[lid]\n            end_q = self.end_q[lid]\n            query_list = query_list_reserve.copy()[start_q:end_q]\n\n            # prepare for parallel process\n            output = paddle.concat(query_list, axis=0)\n            fakesetsize = int(output.shape[0] / batchsize)\n            reference_points_tiled = reference_points.tile([fakesetsize, 1, 1, 1])\n\n            memory_tiled = memory.tile([fakesetsize, 1, 1])\n            query_pos_embed_tiled = query_pos_embed.tile([fakesetsize, 1, 1])\n            memory_mask_tiled = memory_mask.tile([fakesetsize, 1])\n\n            output = layer(output, reference_points_tiled, memory_tiled,\n                           memory_spatial_shapes, memory_level_start_index,\n                           memory_mask_tiled, query_pos_embed_tiled)\n\n            for i in range(fakesetsize):\n                query_list_reserve.append(output[batchsize*i:batchsize*(i+1)])\n\n            if self.return_intermediate:\n                for i in range(fakesetsize):\n                    intermediate.append(output[batchsize*i:batchsize*(i+1)])\n\n        if self.return_intermediate:\n            return paddle.stack(intermediate)\n\n        return output.unsqueeze(0)\n\n\n@register\nclass QRDeformableTransformer(DeformableTransformer):\n\n    def __init__(self,\n                 num_queries=300,\n                 position_embed_type='sine',\n                 return_intermediate_dec=True,\n                 in_feats_channel=[512, 1024, 2048],\n                 num_feature_levels=4,\n                 num_encoder_points=4,\n                 num_decoder_points=4,\n                 hidden_dim=256,\n                 nhead=8,\n                 num_encoder_layers=6,\n                 num_decoder_layers=6,\n                 dim_feedforward=1024,\n                 dropout=0.1,\n                 activation=\"relu\",\n                 lr_mult=0.1,\n                 pe_temperature=10000,\n                 pe_offset=-0.5,\n                 start_q=None,\n                 end_q=None):\n        super(QRDeformableTransformer, 
self).__init__(\n                 num_queries=num_queries,\n                 position_embed_type=position_embed_type,\n                 return_intermediate_dec=return_intermediate_dec,\n                 in_feats_channel=in_feats_channel,\n                 num_feature_levels=num_feature_levels,\n                 num_encoder_points=num_encoder_points,\n                 num_decoder_points=num_decoder_points,\n                 hidden_dim=hidden_dim,\n                 nhead=nhead,\n                 num_encoder_layers=num_encoder_layers,\n                 num_decoder_layers=num_decoder_layers,\n                 dim_feedforward=dim_feedforward,\n                 dropout=dropout,\n                 activation=activation,\n                 lr_mult=lr_mult,\n                 pe_temperature=pe_temperature,\n                 pe_offset=pe_offset)\n\n        decoder_layer = DeformableTransformerDecoderLayer(\n            hidden_dim, nhead, dim_feedforward, dropout, activation,\n            num_feature_levels, num_decoder_points)\n        self.decoder = QRDeformableTransformerDecoder(\n            decoder_layer, num_decoder_layers, start_q, end_q, return_intermediate_dec)\n"
  },
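The tensor contract of `MSDeformableAttention.forward` above is easiest to verify with a quick smoke test. Below is a minimal sketch, assuming `ppdet` is importable; all shapes and values are illustrative, and the module transparently uses the pure-Paddle fallback when the CUDA op is not compiled:

```
import paddle
from ppdet.modeling.transformers.deformable_transformer import MSDeformableAttention

bs, len_q, embed_dim, n_levels = 2, 50, 256, 4
# per-level (H, W) of the flattened multi-scale value tensor
spatial_shapes = paddle.to_tensor(
    [[32, 32], [16, 16], [8, 8], [4, 4]], dtype='int64')
level_start_index = paddle.concat([
    paddle.zeros([1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1]])
len_v = int(spatial_shapes.prod(1).sum())  # 1024 + 256 + 64 + 16 = 1360

attn = MSDeformableAttention(embed_dim=embed_dim, num_heads=8,
                             num_levels=n_levels, num_points=4)
query = paddle.rand([bs, len_q, embed_dim])
value = paddle.rand([bs, len_v, embed_dim])
# normalized (x, y) reference points in [0, 1], one per query per level
reference_points = paddle.rand([bs, len_q, n_levels, 2])

out = attn(query, reference_points, value, spatial_shapes, level_start_index)
print(out.shape)  # [2, 50, 256]
```

Each query attends to only `num_heads * num_levels * num_points` sampled locations rather than all `len_v` keys, which is what keeps the attention cost independent of feature-map resolution.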
  {
    "path": "ppdet/modeling/transformers/detr_transformer.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# Modified from DETR (https://github.com/facebookresearch/detr)\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\n\nfrom ppdet.core.workspace import register\nfrom ..layers import MultiHeadAttention, _convert_attention_mask\nfrom .position_encoding import PositionEmbedding\nfrom .utils import _get_clones\nfrom ..initializer import linear_init_, conv_init_, xavier_uniform_, normal_\n\n__all__ = ['DETRTransformer']\n\n\nclass TransformerEncoderLayer(nn.Layer):\n    def __init__(self,\n                 d_model,\n                 nhead,\n                 dim_feedforward=2048,\n                 dropout=0.1,\n                 activation=\"relu\",\n                 attn_dropout=None,\n                 act_dropout=None,\n                 normalize_before=False):\n        super(TransformerEncoderLayer, self).__init__()\n        attn_dropout = dropout if attn_dropout is None else attn_dropout\n        act_dropout = dropout if act_dropout is None else act_dropout\n        self.normalize_before = normalize_before\n\n        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)\n        # Implementation of Feedforward model\n        self.linear1 = nn.Linear(d_model, dim_feedforward)\n        self.dropout = nn.Dropout(act_dropout, mode=\"upscale_in_train\")\n        self.linear2 = nn.Linear(dim_feedforward, d_model)\n\n        self.norm1 = nn.LayerNorm(d_model)\n        self.norm2 = nn.LayerNorm(d_model)\n        self.dropout1 = nn.Dropout(dropout, mode=\"upscale_in_train\")\n        self.dropout2 = nn.Dropout(dropout, mode=\"upscale_in_train\")\n        self.activation = getattr(F, activation)\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        linear_init_(self.linear1)\n        linear_init_(self.linear2)\n\n    @staticmethod\n    def with_pos_embed(tensor, pos_embed):\n        return tensor if pos_embed is None else tensor + pos_embed\n\n    def forward(self, src, src_mask=None, pos_embed=None):\n        residual = src\n        if self.normalize_before:\n            src = self.norm1(src)\n        q = k = self.with_pos_embed(src, pos_embed)\n        src = self.self_attn(q, k, value=src, attn_mask=src_mask)\n\n        src = residual + self.dropout1(src)\n        if not self.normalize_before:\n            src = self.norm1(src)\n\n        residual = src\n        if self.normalize_before:\n            src = self.norm2(src)\n        src = self.linear2(self.dropout(self.activation(self.linear1(src))))\n        src = residual + self.dropout2(src)\n        if not self.normalize_before:\n            src = self.norm2(src)\n        return src\n\n\nclass TransformerEncoder(nn.Layer):\n    def __init__(self, encoder_layer, num_layers, 
norm=None):\n        super(TransformerEncoder, self).__init__()\n        self.layers = _get_clones(encoder_layer, num_layers)\n        self.num_layers = num_layers\n        self.norm = norm\n\n    def forward(self, src, src_mask=None, pos_embed=None):\n        output = src\n        for layer in self.layers:\n            output = layer(output, src_mask=src_mask, pos_embed=pos_embed)\n\n        if self.norm is not None:\n            output = self.norm(output)\n\n        return output\n\n\nclass TransformerDecoderLayer(nn.Layer):\n    def __init__(self,\n                 d_model,\n                 nhead,\n                 dim_feedforward=2048,\n                 dropout=0.1,\n                 activation=\"relu\",\n                 attn_dropout=None,\n                 act_dropout=None,\n                 normalize_before=False):\n        super(TransformerDecoderLayer, self).__init__()\n        attn_dropout = dropout if attn_dropout is None else attn_dropout\n        act_dropout = dropout if act_dropout is None else act_dropout\n        self.normalize_before = normalize_before\n\n        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)\n        self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout)\n        # Implementation of Feedforward model\n        self.linear1 = nn.Linear(d_model, dim_feedforward)\n        self.dropout = nn.Dropout(act_dropout, mode=\"upscale_in_train\")\n        self.linear2 = nn.Linear(dim_feedforward, d_model)\n\n        self.norm1 = nn.LayerNorm(d_model)\n        self.norm2 = nn.LayerNorm(d_model)\n        self.norm3 = nn.LayerNorm(d_model)\n        self.dropout1 = nn.Dropout(dropout, mode=\"upscale_in_train\")\n        self.dropout2 = nn.Dropout(dropout, mode=\"upscale_in_train\")\n        self.dropout3 = nn.Dropout(dropout, mode=\"upscale_in_train\")\n        self.activation = getattr(F, activation)\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        linear_init_(self.linear1)\n        linear_init_(self.linear2)\n\n    @staticmethod\n    def with_pos_embed(tensor, pos_embed):\n        return tensor if pos_embed is None else tensor + pos_embed\n\n    def forward(self,\n                tgt,\n                memory,\n                tgt_mask=None,\n                memory_mask=None,\n                pos_embed=None,\n                query_pos_embed=None):\n        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)\n\n        residual = tgt\n        if self.normalize_before:\n            tgt = self.norm1(tgt)\n        q = k = self.with_pos_embed(tgt, query_pos_embed)\n        tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask)\n        tgt = residual + self.dropout1(tgt)\n        if not self.normalize_before:\n            tgt = self.norm1(tgt)\n\n        residual = tgt\n        if self.normalize_before:\n            tgt = self.norm2(tgt)\n        q = self.with_pos_embed(tgt, query_pos_embed)\n        k = self.with_pos_embed(memory, pos_embed)\n        tgt = self.cross_attn(q, k, value=memory, attn_mask=memory_mask)\n        tgt = residual + self.dropout2(tgt)\n        if not self.normalize_before:\n            tgt = self.norm2(tgt)\n\n        residual = tgt\n        if self.normalize_before:\n            tgt = self.norm3(tgt)\n        tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt))))\n        tgt = residual + self.dropout3(tgt)\n        if not self.normalize_before:\n            tgt = self.norm3(tgt)\n        return tgt\n\n\nclass TransformerDecoder(nn.Layer):\n    def __init__(self,\n     
            decoder_layer,\n                 num_layers,\n                 norm=None,\n                 return_intermediate=False):\n        super(TransformerDecoder, self).__init__()\n        self.layers = _get_clones(decoder_layer, num_layers)\n        self.num_layers = num_layers\n        self.norm = norm\n        self.return_intermediate = return_intermediate\n\n    def forward(self,\n                tgt,\n                memory,\n                tgt_mask=None,\n                memory_mask=None,\n                pos_embed=None,\n                query_pos_embed=None):\n        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)\n\n        output = tgt\n        intermediate = []\n        for layer in self.layers:\n            output = layer(\n                output,\n                memory,\n                tgt_mask=tgt_mask,\n                memory_mask=memory_mask,\n                pos_embed=pos_embed,\n                query_pos_embed=query_pos_embed)\n            if self.return_intermediate:\n                intermediate.append(self.norm(output))\n\n        if self.norm is not None:\n            output = self.norm(output)\n\n        if self.return_intermediate:\n            return paddle.stack(intermediate)\n\n        return output.unsqueeze(0)\n\n\n@register\nclass DETRTransformer(nn.Layer):\n    __shared__ = ['hidden_dim']\n\n    def __init__(self,\n                 num_queries=100,\n                 position_embed_type='sine',\n                 return_intermediate_dec=True,\n                 backbone_num_channels=2048,\n                 hidden_dim=256,\n                 nhead=8,\n                 num_encoder_layers=6,\n                 num_decoder_layers=6,\n                 dim_feedforward=2048,\n                 dropout=0.1,\n                 activation=\"relu\",\n                 pe_temperature=10000,\n                 pe_offset=0.,\n                 attn_dropout=None,\n                 act_dropout=None,\n                 normalize_before=False):\n        super(DETRTransformer, self).__init__()\n        assert position_embed_type in ['sine', 'learned'],\\\n            f'ValueError: position_embed_type not supported {position_embed_type}!'\n        self.hidden_dim = hidden_dim\n        self.nhead = nhead\n\n        encoder_layer = TransformerEncoderLayer(\n            hidden_dim, nhead, dim_feedforward, dropout, activation,\n            attn_dropout, act_dropout, normalize_before)\n        encoder_norm = nn.LayerNorm(hidden_dim) if normalize_before else None\n        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,\n                                          encoder_norm)\n\n        decoder_layer = TransformerDecoderLayer(\n            hidden_dim, nhead, dim_feedforward, dropout, activation,\n            attn_dropout, act_dropout, normalize_before)\n        decoder_norm = nn.LayerNorm(hidden_dim)\n        self.decoder = TransformerDecoder(\n            decoder_layer,\n            num_decoder_layers,\n            decoder_norm,\n            return_intermediate=return_intermediate_dec)\n\n        self.input_proj = nn.Conv2D(\n            backbone_num_channels, hidden_dim, kernel_size=1)\n        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)\n        self.position_embedding = PositionEmbedding(\n            hidden_dim // 2,\n            temperature=pe_temperature,\n            normalize=True if position_embed_type == 'sine' else False,\n            embed_type=position_embed_type,\n            offset=pe_offset)\n\n        
self._reset_parameters()\n\n    def _reset_parameters(self):\n        for p in self.parameters():\n            if p.dim() > 1:\n                xavier_uniform_(p)\n        conv_init_(self.input_proj)\n        normal_(self.query_pos_embed.weight)\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {\n            'backbone_num_channels': [i.channels for i in input_shape][-1],\n        }\n\n    def _convert_attention_mask(self, mask):\n        return (mask - 1.0) * 1e9\n\n    def forward(self, src, src_mask=None, *args, **kwargs):\n        r\"\"\"\n        Applies a Transformer model on the inputs.\n\n        Parameters:\n            src (List(Tensor)): Backbone feature maps with shape [[bs, c, h, w]].\n            src_mask (Tensor, optional): A tensor used in multi-head attention\n                to prevent attention to some unwanted positions, usually the\n                paddings or the subsequent positions. It is a tensor with shape\n                `[bs, H, W]`. When the data type is bool, the unwanted positions\n                have `False` values and the others have `True` values. When the\n                data type is int, the unwanted positions have 0 values and the\n                others have 1 values. When the data type is float, the unwanted\n                positions have `-INF` values and the others have 0 values. It\n                can be None when no position needs to be masked. Default None.\n\n        Returns:\n            output (Tensor): [num_levels, batch_size, num_queries, hidden_dim]\n            memory (Tensor): [batch_size, hidden_dim, h, w]\n        \"\"\"\n        # use last level feature map\n        src_proj = self.input_proj(src[-1])\n        bs, c, h, w = src_proj.shape\n        # flatten [B, C, H, W] to [B, HxW, C]\n        src_flatten = src_proj.flatten(2).transpose([0, 2, 1])\n        if src_mask is not None:\n            src_mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]\n        else:\n            src_mask = paddle.ones([bs, h, w])\n        pos_embed = self.position_embedding(src_mask).flatten(1, 2)\n\n        if self.training:\n            src_mask = self._convert_attention_mask(src_mask)\n            src_mask = src_mask.reshape([bs, 1, 1, h * w])\n        else:\n            src_mask = None\n\n        memory = self.encoder(\n            src_flatten, src_mask=src_mask, pos_embed=pos_embed)\n\n        query_pos_embed = self.query_pos_embed.weight.unsqueeze(0).tile(\n            [bs, 1, 1])\n        tgt = paddle.zeros_like(query_pos_embed)\n        output = self.decoder(\n            tgt,\n            memory,\n            memory_mask=src_mask,\n            pos_embed=pos_embed,\n            query_pos_embed=query_pos_embed)\n\n        if self.training:\n            src_mask = src_mask.reshape([bs, 1, 1, h, w])\n        else:\n            src_mask = None\n\n        return (output, memory.transpose([0, 2, 1]).reshape([bs, c, h, w]),\n                src_proj, src_mask)\n"
  },
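A small usage sketch of the DETR encoder layer above, assuming `ppdet` is importable; note that the positional embedding is added to the queries and keys only, never to the values:

```
import paddle
from ppdet.modeling.transformers.detr_transformer import TransformerEncoderLayer

layer = TransformerEncoderLayer(d_model=256, nhead=8, dim_feedforward=2048)
src = paddle.rand([2, 100, 256])  # [bs, H*W, C] flattened feature map
pos = paddle.rand([2, 100, 256])  # positional embedding for q/k
out = layer(src, src_mask=None, pos_embed=pos)
print(out.shape)  # [2, 100, 256]
```

Passing `normalize_before=True` switches the layer from post-norm (LayerNorm after each residual add) to pre-norm (LayerNorm before each sublayer), which is the only difference between the two branches in `forward`.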
  {
    "path": "ppdet/modeling/transformers/dino_transformer.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# Modified from detrex (https://github.com/IDEA-Research/detrex)\n# Copyright 2022 The IDEA Authors. All rights reserved.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\n\nfrom ppdet.core.workspace import register\nfrom ..layers import MultiHeadAttention\nfrom .position_encoding import PositionEmbedding\nfrom ..heads.detr_head import MLP\nfrom .deformable_transformer import (MSDeformableAttention,\n                                     DeformableTransformerEncoderLayer,\n                                     DeformableTransformerEncoder)\nfrom ..initializer import (linear_init_, constant_, xavier_uniform_, normal_,\n                           bias_init_with_prob)\nfrom .utils import (_get_clones, get_valid_ratio,\n                    get_contrastive_denoising_training_group,\n                    get_sine_pos_embed, inverse_sigmoid)\n\n__all__ = ['DINOTransformer']\n\n\nclass DINOTransformerDecoderLayer(nn.Layer):\n    def __init__(self,\n                 d_model=256,\n                 n_head=8,\n                 dim_feedforward=1024,\n                 dropout=0.,\n                 activation=\"relu\",\n                 n_levels=4,\n                 n_points=4,\n                 lr_mult=1.0,\n                 weight_attr=None,\n                 bias_attr=None):\n        super(DINOTransformerDecoderLayer, self).__init__()\n\n        # self attention\n        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)\n        self.dropout1 = nn.Dropout(dropout)\n        self.norm1 = nn.LayerNorm(\n            d_model, weight_attr=weight_attr, bias_attr=bias_attr)\n\n        # cross attention\n        self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels,\n                                                n_points, lr_mult)\n        self.dropout2 = nn.Dropout(dropout)\n        self.norm2 = nn.LayerNorm(\n            d_model, weight_attr=weight_attr, bias_attr=bias_attr)\n\n        # ffn\n        self.linear1 = nn.Linear(d_model, dim_feedforward)\n        self.activation = getattr(F, activation)\n        self.dropout3 = nn.Dropout(dropout)\n        self.linear2 = nn.Linear(dim_feedforward, d_model)\n        self.dropout4 = nn.Dropout(dropout)\n        self.norm3 = nn.LayerNorm(\n            d_model, weight_attr=weight_attr, bias_attr=bias_attr)\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        linear_init_(self.linear1)\n        linear_init_(self.linear2)\n        xavier_uniform_(self.linear1.weight)\n        xavier_uniform_(self.linear2.weight)\n\n    def 
with_pos_embed(self, tensor, pos):\n        return tensor if pos is None else tensor + pos\n\n    def forward_ffn(self, tgt):\n        return self.linear2(self.dropout3(self.activation(self.linear1(tgt))))\n\n    def forward(self,\n                tgt,\n                reference_points,\n                memory,\n                memory_spatial_shapes,\n                memory_level_start_index,\n                attn_mask=None,\n                memory_mask=None,\n                query_pos_embed=None):\n        # self attention\n        q = k = self.with_pos_embed(tgt, query_pos_embed)\n        if attn_mask is not None:\n            attn_mask = paddle.where(\n                attn_mask.astype('bool'),\n                paddle.zeros(attn_mask.shape, tgt.dtype),\n                paddle.full(attn_mask.shape, float(\"-inf\"), tgt.dtype))\n        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask)\n        tgt = tgt + self.dropout1(tgt2)\n        tgt = self.norm1(tgt)\n\n        # cross attention\n        tgt2 = self.cross_attn(\n            self.with_pos_embed(tgt, query_pos_embed), reference_points, memory,\n            memory_spatial_shapes, memory_level_start_index, memory_mask)\n        tgt = tgt + self.dropout2(tgt2)\n        tgt = self.norm2(tgt)\n\n        # ffn\n        tgt2 = self.forward_ffn(tgt)\n        tgt = tgt + self.dropout4(tgt2)\n        tgt = self.norm3(tgt)\n\n        return tgt\n\n\nclass DINOTransformerDecoder(nn.Layer):\n    def __init__(self,\n                 hidden_dim,\n                 decoder_layer,\n                 num_layers,\n                 weight_attr=None,\n                 bias_attr=None):\n        super(DINOTransformerDecoder, self).__init__()\n        self.layers = _get_clones(decoder_layer, num_layers)\n        self.hidden_dim = hidden_dim\n        self.num_layers = num_layers\n        self.norm = nn.LayerNorm(\n            hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)\n\n    def forward(self,\n                tgt,\n                ref_points_unact,\n                memory,\n                memory_spatial_shapes,\n                memory_level_start_index,\n                bbox_head,\n                query_pos_head,\n                valid_ratios=None,\n                attn_mask=None,\n                memory_mask=None):\n        if valid_ratios is None:\n            valid_ratios = paddle.ones(\n                [memory.shape[0], memory_spatial_shapes.shape[0], 2])\n\n        output = tgt\n        intermediate = []\n        inter_bboxes = []\n        ref_points = F.sigmoid(ref_points_unact)\n        for i, layer in enumerate(self.layers):\n            reference_points_input = ref_points.detach().unsqueeze(\n                2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1)\n            query_pos_embed = get_sine_pos_embed(\n                reference_points_input[..., 0, :], self.hidden_dim // 2)\n            query_pos_embed = query_pos_head(query_pos_embed)\n\n            output = layer(output, reference_points_input, memory,\n                           memory_spatial_shapes, memory_level_start_index,\n                           attn_mask, memory_mask, query_pos_embed)\n\n            ref_points = F.sigmoid(bbox_head[i](output) + inverse_sigmoid(\n                ref_points.detach()))\n\n            intermediate.append(self.norm(output))\n            inter_bboxes.append(ref_points)\n\n        return paddle.stack(intermediate), paddle.stack(inter_bboxes)\n\n\n@register\nclass DINOTransformer(nn.Layer):\n    __shared__ = ['num_classes', 
'hidden_dim']\n\n    def __init__(self,\n                 num_classes=80,\n                 hidden_dim=256,\n                 num_queries=900,\n                 position_embed_type='sine',\n                 in_feats_channel=[512, 1024, 2048],\n                 num_levels=4,\n                 num_encoder_points=4,\n                 num_decoder_points=4,\n                 nhead=8,\n                 num_encoder_layers=6,\n                 num_decoder_layers=6,\n                 dim_feedforward=1024,\n                 dropout=0.,\n                 activation=\"relu\",\n                 lr_mult=1.0,\n                 pe_temperature=10000,\n                 pe_offset=-0.5,\n                 num_denoising=100,\n                 label_noise_ratio=0.5,\n                 box_noise_scale=1.0,\n                 learnt_init_query=True,\n                 eps=1e-2):\n        super(DINOTransformer, self).__init__()\n        assert position_embed_type in ['sine', 'learned'], \\\n            f'ValueError: position_embed_type not supported {position_embed_type}!'\n        assert len(in_feats_channel) <= num_levels\n\n        self.hidden_dim = hidden_dim\n        self.nhead = nhead\n        self.num_levels = num_levels\n        self.num_classes = num_classes\n        self.num_queries = num_queries\n        self.eps = eps\n        self.num_decoder_layers = num_decoder_layers\n\n        weight_attr = ParamAttr(regularizer=L2Decay(0.0))\n        bias_attr = ParamAttr(regularizer=L2Decay(0.0))\n        # backbone feature projection\n        self._build_input_proj_layer(in_feats_channel, weight_attr, bias_attr)\n\n        # Transformer module\n        encoder_layer = DeformableTransformerEncoderLayer(\n            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,\n            num_encoder_points, lr_mult, weight_attr, bias_attr)\n        self.encoder = DeformableTransformerEncoder(encoder_layer,\n                                                    num_encoder_layers)\n        decoder_layer = DINOTransformerDecoderLayer(\n            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,\n            num_decoder_points, lr_mult, weight_attr, bias_attr)\n        self.decoder = DINOTransformerDecoder(hidden_dim, decoder_layer,\n                                              num_decoder_layers, weight_attr,\n                                              bias_attr)\n\n        # denoising part\n        self.denoising_class_embed = nn.Embedding(\n            num_classes,\n            hidden_dim,\n            weight_attr=ParamAttr(initializer=nn.initializer.Normal()))\n        self.num_denoising = num_denoising\n        self.label_noise_ratio = label_noise_ratio\n        self.box_noise_scale = box_noise_scale\n\n        # position embedding\n        self.position_embedding = PositionEmbedding(\n            hidden_dim // 2,\n            temperature=pe_temperature,\n            normalize=True if position_embed_type == 'sine' else False,\n            embed_type=position_embed_type,\n            offset=pe_offset)\n        self.level_embed = nn.Embedding(num_levels, hidden_dim)\n        # decoder embedding\n        self.learnt_init_query = learnt_init_query\n        if learnt_init_query:\n            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)\n        self.query_pos_head = MLP(2 * hidden_dim,\n                                  hidden_dim,\n                                  hidden_dim,\n                                  num_layers=2)\n\n        # encoder head\n        self.enc_output = 
nn.Sequential(\n            nn.Linear(hidden_dim, hidden_dim),\n            nn.LayerNorm(\n                hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr))\n        self.enc_score_head = nn.Linear(hidden_dim, num_classes)\n        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)\n        # decoder head\n        self.dec_score_head = nn.LayerList([\n            nn.Linear(hidden_dim, num_classes)\n            for _ in range(num_decoder_layers)\n        ])\n        self.dec_bbox_head = nn.LayerList([\n            MLP(hidden_dim, hidden_dim, 4, num_layers=3)\n            for _ in range(num_decoder_layers)\n        ])\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        # class and bbox head init\n        bias_cls = bias_init_with_prob(0.01)\n        linear_init_(self.enc_score_head)\n        constant_(self.enc_score_head.bias, bias_cls)\n        constant_(self.enc_bbox_head.layers[-1].weight)\n        constant_(self.enc_bbox_head.layers[-1].bias)\n        for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):\n            linear_init_(cls_)\n            constant_(cls_.bias, bias_cls)\n            constant_(reg_.layers[-1].weight)\n            constant_(reg_.layers[-1].bias)\n\n        linear_init_(self.enc_output[0])\n        xavier_uniform_(self.enc_output[0].weight)\n        normal_(self.level_embed.weight)\n        if self.learnt_init_query:\n            xavier_uniform_(self.tgt_embed.weight)\n        xavier_uniform_(self.query_pos_head.layers[0].weight)\n        xavier_uniform_(self.query_pos_head.layers[1].weight)\n        for l in self.input_proj:\n            xavier_uniform_(l[0].weight)\n            constant_(l[0].bias)\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_feats_channel': [i.channels for i in input_shape], }\n\n    def _build_input_proj_layer(self,\n                                in_feats_channel,\n                                weight_attr=None,\n                                bias_attr=None):\n        self.input_proj = nn.LayerList()\n        for in_channels in in_feats_channel:\n            self.input_proj.append(\n                nn.Sequential(\n                    ('conv', nn.Conv2D(\n                        in_channels, self.hidden_dim, kernel_size=1)), (\n                            'norm', nn.GroupNorm(\n                                32,\n                                self.hidden_dim,\n                                weight_attr=weight_attr,\n                                bias_attr=bias_attr))))\n        in_channels = in_feats_channel[-1]\n        for _ in range(self.num_levels - len(in_feats_channel)):\n            self.input_proj.append(\n                nn.Sequential(\n                    ('conv', nn.Conv2D(\n                        in_channels,\n                        self.hidden_dim,\n                        kernel_size=3,\n                        stride=2,\n                        padding=1)), ('norm', nn.GroupNorm(\n                            32,\n                            self.hidden_dim,\n                            weight_attr=weight_attr,\n                            bias_attr=bias_attr))))\n            in_channels = self.hidden_dim\n\n    def _get_encoder_input(self, feats, pad_mask=None):\n        # get projection features\n        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]\n        if self.num_levels > len(proj_feats):\n            len_srcs = len(proj_feats)\n            for i in range(len_srcs, 
self.num_levels):\n                if i == len_srcs:\n                    proj_feats.append(self.input_proj[i](feats[-1]))\n                else:\n                    proj_feats.append(self.input_proj[i](proj_feats[-1]))\n\n        # get encoder inputs\n        feat_flatten = []\n        mask_flatten = []\n        lvl_pos_embed_flatten = []\n        spatial_shapes = []\n        valid_ratios = []\n        for i, feat in enumerate(proj_feats):\n            bs, _, h, w = paddle.shape(feat)\n            spatial_shapes.append(paddle.stack([h, w]))\n            # [b,c,h,w] -> [b,h*w,c]\n            feat_flatten.append(feat.flatten(2).transpose([0, 2, 1]))\n            if pad_mask is not None:\n                mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0]\n            else:\n                mask = paddle.ones([bs, h, w])\n            valid_ratios.append(get_valid_ratio(mask))\n            # [b, h*w, c]\n            pos_embed = self.position_embedding(mask).flatten(1, 2)\n            lvl_pos_embed = pos_embed + self.level_embed.weight[i]\n            lvl_pos_embed_flatten.append(lvl_pos_embed)\n            if pad_mask is not None:\n                # [b, h*w]\n                mask_flatten.append(mask.flatten(1))\n\n        # [b, l, c]\n        feat_flatten = paddle.concat(feat_flatten, 1)\n        # [b, l]\n        mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten,\n                                                                   1)\n        # [b, l, c]\n        lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1)\n        # [num_levels, 2]\n        spatial_shapes = paddle.to_tensor(\n            paddle.stack(spatial_shapes).astype('int64'))\n        # [l] start index of each level\n        level_start_index = paddle.concat([\n            paddle.zeros(\n                [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1]\n        ])\n        # [b, num_levels, 2]\n        valid_ratios = paddle.stack(valid_ratios, 1)\n        return (feat_flatten, spatial_shapes, level_start_index, mask_flatten,\n                lvl_pos_embed_flatten, valid_ratios)\n\n    def forward(self, feats, pad_mask=None, gt_meta=None):\n        # input projection and embedding\n        (feat_flatten, spatial_shapes, level_start_index, mask_flatten,\n         lvl_pos_embed_flatten,\n         valid_ratios) = self._get_encoder_input(feats, pad_mask)\n\n        # encoder\n        memory = self.encoder(feat_flatten, spatial_shapes, level_start_index,\n                              mask_flatten, lvl_pos_embed_flatten, valid_ratios)\n\n        # prepare denoising training\n        if self.training:\n            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \\\n                get_contrastive_denoising_training_group(gt_meta,\n                                            self.num_classes,\n                                            self.num_queries,\n                                            self.denoising_class_embed.weight,\n                                            self.num_denoising,\n                                            self.label_noise_ratio,\n                                            self.box_noise_scale)\n        else:\n            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None\n\n        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \\\n            self._get_decoder_input(\n            memory, spatial_shapes, mask_flatten, denoising_class,\n            denoising_bbox_unact)\n\n        # 
decoder\n        inter_feats, inter_bboxes = self.decoder(\n            target, init_ref_points_unact, memory, spatial_shapes,\n            level_start_index, self.dec_bbox_head, self.query_pos_head,\n            valid_ratios, attn_mask, mask_flatten)\n        out_bboxes = []\n        out_logits = []\n        for i in range(self.num_decoder_layers):\n            out_logits.append(self.dec_score_head[i](inter_feats[i]))\n            if i == 0:\n                out_bboxes.append(\n                    F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) +\n                              init_ref_points_unact))\n            else:\n                out_bboxes.append(\n                    F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) +\n                              inverse_sigmoid(inter_bboxes[i - 1])))\n        out_bboxes = paddle.stack(out_bboxes)\n        out_logits = paddle.stack(out_logits)\n\n        return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,\n                dn_meta)\n\n    def _get_encoder_output_anchors(self,\n                                    memory,\n                                    spatial_shapes,\n                                    memory_mask=None,\n                                    grid_size=0.05):\n        output_anchors = []\n        idx = 0\n        for lvl, (h, w) in enumerate(spatial_shapes):\n            if memory_mask is not None:\n                mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w])\n                valid_H = paddle.sum(mask_[:, :, 0], 1)\n                valid_W = paddle.sum(mask_[:, 0, :], 1)\n            else:\n                valid_H, valid_W = h, w\n\n            grid_y, grid_x = paddle.meshgrid(\n                paddle.arange(end=h), paddle.arange(end=w))\n            grid_xy = paddle.stack([grid_x, grid_y], -1).astype(memory.dtype)\n\n            valid_WH = paddle.stack([valid_W, valid_H], -1).reshape(\n                [-1, 1, 1, 2]).astype(grid_xy.dtype)\n            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH\n            wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl)\n            output_anchors.append(\n                paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))\n            idx += h * w\n\n        output_anchors = paddle.concat(output_anchors, 1)\n        valid_mask = ((output_anchors > self.eps) *\n                      (output_anchors < 1 - self.eps)).all(-1, keepdim=True)\n        output_anchors = paddle.log(output_anchors / (1 - output_anchors))\n        if memory_mask is not None:\n            valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0\n        output_anchors = paddle.where(valid_mask, output_anchors,\n                                      paddle.to_tensor(float(\"inf\")))\n\n        memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))\n        output_memory = self.enc_output(memory)\n        return output_memory, output_anchors\n\n    def _get_decoder_input(self,\n                           memory,\n                           spatial_shapes,\n                           memory_mask=None,\n                           denoising_class=None,\n                           denoising_bbox_unact=None):\n        bs, _, _ = memory.shape\n        # prepare input for decoder\n        output_memory, output_anchors = self._get_encoder_output_anchors(\n            memory, spatial_shapes, memory_mask)\n        enc_outputs_class = self.enc_score_head(output_memory)\n        enc_outputs_coord_unact = self.enc_bbox_head(\n            output_memory) + output_anchors\n\n  
      _, topk_ind = paddle.topk(\n            enc_outputs_class.max(-1), self.num_queries, axis=1)\n        # extract region proposal boxes\n        batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype)\n        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])\n        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)\n        reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact,\n                                                  topk_ind)  # unsigmoided.\n        enc_topk_bboxes = F.sigmoid(reference_points_unact)\n        if denoising_bbox_unact is not None:\n            reference_points_unact = paddle.concat(\n                [denoising_bbox_unact, reference_points_unact], 1)\n        enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind)\n\n        # extract region features\n        if self.learnt_init_query:\n            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])\n        else:\n            target = paddle.gather_nd(output_memory, topk_ind).detach()\n        if denoising_class is not None:\n            target = paddle.concat([denoising_class, target], 1)\n\n        return target, reference_points_unact.detach(\n        ), enc_topk_bboxes, enc_topk_logits\n"
  },
  {
    "path": "ppdet/modeling/transformers/ext_op/README.md",
    "content": "# Multi-scale deformable attention自定义OP编译\n该自定义OP是参考[自定义外部算子](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html) 。\n\n## 1. 环境依赖\n- Paddle >= 2.3.2\n- gcc 8.2\n\n## 2. 安装\n请在当前路径下进行编译安装\n```\ncd PaddleDetection/ppdet/modeling/transformers/ext_op/\npython setup_ms_deformable_attn_op.py install\n```\n\n编译完成后即可使用，以下为`ms_deformable_attn`的使用示例\n```\n# 引入自定义op\nfrom deformable_detr_ops import ms_deformable_attn\n\n# 构造fake input tensor\nbs, n_heads, c = 2, 8, 8\nquery_length, n_levels, n_points = 2, 2, 2\nspatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)\nlevel_start_index = paddle.concat((paddle.to_tensor(\n    [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))\nvalue_length = sum([(H * W).item() for H, W in spatial_shapes])\n\ndef get_test_tensors(channels):\n    value = paddle.rand(\n        [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01\n    sampling_locations = paddle.rand(\n        [bs, query_length, n_heads, n_levels, n_points, 2],\n        dtype=paddle.float32)\n    attention_weights = paddle.rand(\n        [bs, query_length, n_heads, n_levels, n_points],\n        dtype=paddle.float32) + 1e-5\n    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(\n        -2, keepdim=True)\n    return [value, sampling_locations, attention_weights]\n\nvalue, sampling_locations, attention_weights = get_test_tensors(c)\n\noutput = ms_deformable_attn(value,\n                            spatial_shapes,\n                            level_start_index,\n                            sampling_locations,\n                            attention_weights)\n```\n\n## 3. 单元测试\n可以通过执行单元测试来确认自定义算子功能的正确性，执行单元测试的示例如下所示：\n```\npython test_ms_deformable_attn_op.py\n```\n运行成功后，打印如下：\n```\n*True check_forward_equal_with_paddle_float: max_abs_err 6.98e-10 max_rel_err 2.03e-07\n*tensor1 True check_gradient_numerical(D=30)\n*tensor2 True check_gradient_numerical(D=30)\n*tensor3 True check_gradient_numerical(D=30)\n*tensor1 True check_gradient_numerical(D=32)\n*tensor2 True check_gradient_numerical(D=32)\n*tensor3 True check_gradient_numerical(D=32)\n*tensor1 True check_gradient_numerical(D=64)\n*tensor2 True check_gradient_numerical(D=64)\n*tensor3 True check_gradient_numerical(D=64)\n*tensor1 True check_gradient_numerical(D=71)\n*tensor2 True check_gradient_numerical(D=71)\n*tensor3 True check_gradient_numerical(D=71)\n*tensor1 True check_gradient_numerical(D=128)\n*tensor2 True check_gradient_numerical(D=128)\n*tensor3 True check_gradient_numerical(D=128)\n*tensor1 True check_gradient_numerical(D=1024)\n*tensor2 True check_gradient_numerical(D=1024)\n*tensor3 True check_gradient_numerical(D=1024)\n*tensor1 True check_gradient_numerical(D=1025)\n*tensor2 True check_gradient_numerical(D=1025)\n*tensor3 True check_gradient_numerical(D=1025)\n*tensor1 True check_gradient_numerical(D=2048)\n*tensor2 True check_gradient_numerical(D=2048)\n*tensor3 True check_gradient_numerical(D=2048)\n*tensor1 True check_gradient_numerical(D=3096)\n*tensor2 True check_gradient_numerical(D=3096)\n*tensor3 True check_gradient_numerical(D=3096)\n```\n"
  },
  {
    "path": "ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cc",
    "content": "/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n    http://www.apache.org/licenses/LICENSE-2.0\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License. */\n\n#include \"paddle/extension.h\"\n\n#include <vector>\n\n// declare GPU implementation\nstd::vector<paddle::Tensor>\nMSDeformableAttnCUDAForward(const paddle::Tensor &value,\n                            const paddle::Tensor &value_spatial_shapes,\n                            const paddle::Tensor &value_level_start_index,\n                            const paddle::Tensor &sampling_locations,\n                            const paddle::Tensor &attention_weights);\n\nstd::vector<paddle::Tensor> MSDeformableAttnCUDABackward(\n    const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,\n    const paddle::Tensor &value_level_start_index,\n    const paddle::Tensor &sampling_locations,\n    const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out);\n\n//// CPU not implemented\n\nstd::vector<std::vector<int64_t>>\nMSDeformableAttnInferShape(std::vector<int64_t> value_shape,\n                           std::vector<int64_t> value_spatial_shapes_shape,\n                           std::vector<int64_t> value_level_start_index_shape,\n                           std::vector<int64_t> sampling_locations_shape,\n                           std::vector<int64_t> attention_weights_shape) {\n  return {{value_shape[0], sampling_locations_shape[1],\n           value_shape[2] * value_shape[3]}};\n}\n\nstd::vector<paddle::DataType>\nMSDeformableAttnInferDtype(paddle::DataType value_dtype,\n                           paddle::DataType value_spatial_shapes_dtype,\n                           paddle::DataType value_level_start_index_dtype,\n                           paddle::DataType sampling_locations_dtype,\n                           paddle::DataType attention_weights_dtype) {\n  return {value_dtype};\n}\n\nPD_BUILD_OP(ms_deformable_attn)\n    .Inputs({\"Value\", \"SpatialShapes\", \"LevelIndex\", \"SamplingLocations\",\n             \"AttentionWeights\"})\n    .Outputs({\"Out\"})\n    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDAForward))\n    .SetInferShapeFn(PD_INFER_SHAPE(MSDeformableAttnInferShape))\n    .SetInferDtypeFn(PD_INFER_DTYPE(MSDeformableAttnInferDtype));\n\nPD_BUILD_GRAD_OP(ms_deformable_attn)\n    .Inputs({\"Value\", \"SpatialShapes\", \"LevelIndex\", \"SamplingLocations\",\n             \"AttentionWeights\", paddle::Grad(\"Out\")})\n    .Outputs({paddle::Grad(\"Value\"), paddle::Grad(\"SpatialShapes\"),\n              paddle::Grad(\"LevelIndex\"), paddle::Grad(\"SamplingLocations\"),\n              paddle::Grad(\"AttentionWeights\")})\n    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDABackward));\n"
  },
  {
    "path": "ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cu",
    "content": "/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n    http://www.apache.org/licenses/LICENSE-2.0\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License. */\n\n#include \"paddle/extension.h\"\n\n#define CUDA_KERNEL_LOOP(i, n)                                                 \\\n  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);                 \\\n       i += blockDim.x * gridDim.x)\n\nconst int CUDA_NUM_THREADS = 1024;\ninline int GET_BLOCKS(const int N, const int num_threads) {\n  return (N + num_threads - 1) / num_threads;\n}\n\n// forward bilinear\ntemplate <typename data_t>\n__device__ data_t deformable_attn_bilinear_forward(\n    const data_t *&bottom_data, const int &height, const int &width,\n    const int &nheads, const int &channels, const data_t &h, const data_t &w,\n    const int &m, const int &c) {\n  const int h_low = floor(h);\n  const int w_low = floor(w);\n  const int h_high = h_low + 1;\n  const int w_high = w_low + 1;\n\n  const data_t lh = h - h_low;\n  const data_t lw = w - w_low;\n  const data_t hh = 1 - lh, hw = 1 - lw;\n\n  const int w_stride = nheads * channels;\n  const int h_stride = width * w_stride;\n  const int h_low_ptr_offset = h_low * h_stride;\n  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;\n  const int w_low_ptr_offset = w_low * w_stride;\n  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;\n  const int base_ptr = m * channels + c;\n\n  data_t v1 = 0;\n  if (h_low >= 0 && w_low >= 0) {\n    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;\n    v1 = bottom_data[ptr1];\n  }\n  data_t v2 = 0;\n  if (h_low >= 0 && w_high <= width - 1) {\n    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;\n    v2 = bottom_data[ptr2];\n  }\n  data_t v3 = 0;\n  if (h_high <= height - 1 && w_low >= 0) {\n    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;\n    v3 = bottom_data[ptr3];\n  }\n  data_t v4 = 0;\n  if (h_high <= height - 1 && w_high <= width - 1) {\n    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;\n    v4 = bottom_data[ptr4];\n  }\n\n  const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;\n\n  const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n  return val;\n}\n\n// forward kernel\ntemplate <typename data_t>\n__global__ void deformable_attn_cuda_kernel_forward(\n    const int n, const data_t *data_value, const int64_t *data_spatial_shapes,\n    const int64_t *data_level_start_index, const data_t *data_sampling_loc,\n    const data_t *data_attn_weight, const int batch_size,\n    const int value_length, const int num_heads, const int channels,\n    const int num_levels, const int query_length, const int num_points,\n    data_t *output_data_ptr) {\n  CUDA_KERNEL_LOOP(index, n) {\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    const int q_col = _temp % query_length;\n    _temp /= query_length;\n    const int b_col = _temp;\n\n    data_t 
*data_ptr = output_data_ptr + index;\n    int data_weight_ptr = sampling_index * num_levels * num_points;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;\n    data_t col = 0;\n\n    for (int l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const data_t *data_value_ptr = data_value + (data_value_ptr_init_offset +\n                                                   level_start_id * qid_stride);\n      for (int p_col = 0; p_col < num_points; ++p_col) {\n        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const data_t weight = data_attn_weight[data_weight_ptr];\n\n        const data_t h_im = loc_h * spatial_h - 0.5;\n        const data_t w_im = loc_w * spatial_w - 0.5;\n\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {\n          col += deformable_attn_bilinear_forward(\n                     data_value_ptr, spatial_h, spatial_w, num_heads, channels,\n                     h_im, w_im, m_col, c_col) *\n                 weight;\n        }\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n      }\n    }\n    *data_ptr = col;\n  }\n}\n\n#define CHECK_INPUT_GPU(x) PD_CHECK(x.is_gpu(), #x \" must be a GPU Tensor.\")\n// forward\nstd::vector<paddle::Tensor>\nMSDeformableAttnCUDAForward(const paddle::Tensor &value,\n                            const paddle::Tensor &value_spatial_shapes,\n                            const paddle::Tensor &value_level_start_index,\n                            const paddle::Tensor &sampling_locations,\n                            const paddle::Tensor &attention_weights) {\n\n  CHECK_INPUT_GPU(value);\n  CHECK_INPUT_GPU(value_spatial_shapes);\n  CHECK_INPUT_GPU(value_level_start_index);\n  CHECK_INPUT_GPU(sampling_locations);\n  CHECK_INPUT_GPU(attention_weights);\n\n  const int batch_size = value.shape()[0];\n  const int value_length = value.shape()[1];\n  const int num_heads = value.shape()[2];\n  const int channels = value.shape()[3];\n\n  const int num_levels = value_spatial_shapes.shape()[0];\n  const int query_length = sampling_locations.shape()[1];\n  const int num_points = sampling_locations.shape()[4];\n\n  auto output = paddle::full({batch_size, query_length, num_heads * channels},\n                             0, value.dtype(), paddle::GPUPlace());\n\n  const int num_kernels = batch_size * query_length * num_heads * channels;\n  deformable_attn_cuda_kernel_forward<float>\n      <<<GET_BLOCKS(num_kernels, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0,\n         value.stream()>>>(num_kernels, value.data<float>(),\n                           value_spatial_shapes.data<int64_t>(),\n                           value_level_start_index.data<int64_t>(),\n                           sampling_locations.data<float>(),\n                           attention_weights.data<float>(), batch_size,\n                           value_length, num_heads, channels, num_levels,\n                           query_length, num_points, output.data<float>());\n  return {output};\n}\n\n// backward bilinear\ntemplate <typename data_t>\n__device__ void deformable_attn_bilinear_backward(\n    const data_t 
*&bottom_data, const int &height, const int &width,\n    const int &nheads, const int &channels, const data_t &h, const data_t &w,\n    const int &m, const int &c, const data_t &top_grad,\n    const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc,\n    data_t *grad_attn_weight) {\n  const int h_low = floor(h);\n  const int w_low = floor(w);\n  const int h_high = h_low + 1;\n  const int w_high = w_low + 1;\n\n  const data_t lh = h - h_low;\n  const data_t lw = w - w_low;\n  const data_t hh = 1 - lh, hw = 1 - lw;\n\n  const int w_stride = nheads * channels;\n  const int h_stride = width * w_stride;\n  const int h_low_ptr_offset = h_low * h_stride;\n  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;\n  const int w_low_ptr_offset = w_low * w_stride;\n  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;\n  const int base_ptr = m * channels + c;\n\n  const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;\n  const data_t top_grad_value = top_grad * attn_weight;\n  data_t grad_h_weight = 0, grad_w_weight = 0;\n\n  data_t v1 = 0;\n  if (h_low >= 0 && w_low >= 0) {\n    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;\n    v1 = bottom_data[ptr1];\n    grad_h_weight -= hw * v1;\n    grad_w_weight -= hh * v1;\n    atomicAdd(grad_value + ptr1, w1 * top_grad_value);\n  }\n  data_t v2 = 0;\n  if (h_low >= 0 && w_high <= width - 1) {\n    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;\n    v2 = bottom_data[ptr2];\n    grad_h_weight -= lw * v2;\n    grad_w_weight += hh * v2;\n    atomicAdd(grad_value + ptr2, w2 * top_grad_value);\n  }\n  data_t v3 = 0;\n  if (h_high <= height - 1 && w_low >= 0) {\n    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;\n    v3 = bottom_data[ptr3];\n    grad_h_weight += hw * v3;\n    grad_w_weight -= lh * v3;\n    atomicAdd(grad_value + ptr3, w3 * top_grad_value);\n  }\n  data_t v4 = 0;\n  if (h_high <= height - 1 && w_high <= width - 1) {\n    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;\n    v4 = bottom_data[ptr4];\n    grad_h_weight += lw * v4;\n    grad_w_weight += lh * v4;\n    atomicAdd(grad_value + ptr4, w4 * top_grad_value);\n  }\n\n  const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n  *grad_attn_weight = top_grad * val;\n  *grad_sampling_loc = width * grad_w_weight * top_grad_value;\n  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;\n}\n\ntemplate <typename data_t>\n__device__ void deformable_attn_bilinear_backward_gm(\n    const data_t *&bottom_data, const int &height, const int &width,\n    const int &nheads, const int &channels, const data_t &h, const data_t &w,\n    const int &m, const int &c, const data_t &top_grad,\n    const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc,\n    data_t *grad_attn_weight) {\n  const int h_low = floor(h);\n  const int w_low = floor(w);\n  const int h_high = h_low + 1;\n  const int w_high = w_low + 1;\n\n  const data_t lh = h - h_low;\n  const data_t lw = w - w_low;\n  const data_t hh = 1 - lh, hw = 1 - lw;\n\n  const int w_stride = nheads * channels;\n  const int h_stride = width * w_stride;\n  const int h_low_ptr_offset = h_low * h_stride;\n  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;\n  const int w_low_ptr_offset = w_low * w_stride;\n  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;\n  const int base_ptr = m * channels + c;\n\n  const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;\n  const data_t top_grad_value = 
top_grad * attn_weight;\n  data_t grad_h_weight = 0, grad_w_weight = 0;\n\n  data_t v1 = 0;\n  if (h_low >= 0 && w_low >= 0) {\n    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;\n    v1 = bottom_data[ptr1];\n    grad_h_weight -= hw * v1;\n    grad_w_weight -= hh * v1;\n    atomicAdd(grad_value + ptr1, w1 * top_grad_value);\n  }\n  data_t v2 = 0;\n  if (h_low >= 0 && w_high <= width - 1) {\n    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;\n    v2 = bottom_data[ptr2];\n    grad_h_weight -= lw * v2;\n    grad_w_weight += hh * v2;\n    atomicAdd(grad_value + ptr2, w2 * top_grad_value);\n  }\n  data_t v3 = 0;\n  if (h_high <= height - 1 && w_low >= 0) {\n    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;\n    v3 = bottom_data[ptr3];\n    grad_h_weight += hw * v3;\n    grad_w_weight -= lh * v3;\n    atomicAdd(grad_value + ptr3, w3 * top_grad_value);\n  }\n  data_t v4 = 0;\n  if (h_high <= height - 1 && w_high <= width - 1) {\n    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;\n    v4 = bottom_data[ptr4];\n    grad_h_weight += lw * v4;\n    grad_w_weight += lh * v4;\n    atomicAdd(grad_value + ptr4, w4 * top_grad_value);\n  }\n\n  const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n  atomicAdd(grad_attn_weight, top_grad * val);\n  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);\n  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);\n}\n\n// backward kernels\n// channels > 1024\ntemplate <typename data_t>\n__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks(\n    const int n, const data_t *grad_col, const data_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const data_t *data_sampling_loc, const data_t *data_attn_weight,\n    const int batch_size, const int value_length, const int num_heads,\n    const int channels, const int num_levels, const int query_length,\n    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,\n    data_t *grad_attn_weight) {\n  CUDA_KERNEL_LOOP(index, n) {\n    extern __shared__ int _s[];\n    data_t *cache_grad_sampling_loc = (data_t *)_s;\n    data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;\n    unsigned int tid = threadIdx.x;\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    const int q_col = _temp % query_length;\n    _temp /= query_length;\n    const int b_col = _temp;\n\n    const data_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_points;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    grad_sampling_loc += grad_sampling_ptr << 1;\n    grad_attn_weight += grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;\n\n    for (int l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset =\n          data_value_ptr_init_offset + level_start_id * 
qid_stride;\n      const data_t *data_value_ptr = data_value + value_ptr_offset;\n      data_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col = 0; p_col < num_points; ++p_col) {\n        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const data_t weight = data_attn_weight[data_weight_ptr];\n\n        const data_t h_im = loc_h * spatial_h - 0.5;\n        const data_t w_im = loc_w * spatial_w - 0.5;\n        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;\n        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;\n        *(cache_grad_attn_weight + threadIdx.x) = 0;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {\n          deformable_attn_bilinear_backward(\n              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,\n              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,\n              cache_grad_sampling_loc + (threadIdx.x << 1),\n              cache_grad_attn_weight + threadIdx.x);\n        }\n\n        __syncthreads();\n\n        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;\n             s >>= 1, spre >>= 1) {\n          if (tid < s) {\n            const unsigned int xid1 = tid << 1;\n            const unsigned int xid2 = (tid + s) << 1;\n            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];\n            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];\n            cache_grad_sampling_loc[xid1 + 1] +=\n                cache_grad_sampling_loc[xid2 + 1];\n            if (tid + (s << 1) < spre) {\n              cache_grad_attn_weight[tid] +=\n                  cache_grad_attn_weight[tid + (s << 1)];\n              cache_grad_sampling_loc[xid1] +=\n                  cache_grad_sampling_loc[xid2 + (s << 1)];\n              cache_grad_sampling_loc[xid1 + 1] +=\n                  cache_grad_sampling_loc[xid2 + 1 + (s << 1)];\n            }\n          }\n          __syncthreads();\n        }\n\n        if (tid == 0) {\n          atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);\n          atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);\n          atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);\n        }\n        __syncthreads();\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight += grad_weight_stride;\n        grad_sampling_loc += grad_loc_stride;\n      }\n    }\n  }\n}\n\ntemplate <typename data_t>\n__global__ void deformable_attn_cuda_kernel_backward_gm(\n    const int n, const data_t *grad_col, const data_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const data_t *data_sampling_loc, const data_t *data_attn_weight,\n    const int batch_size, const int value_length, const int num_heads,\n    const int channels, const int num_levels, const int query_length,\n    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,\n    data_t *grad_attn_weight) {\n  CUDA_KERNEL_LOOP(index, n) {\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    const int q_col = _temp % query_length;\n    _temp /= query_length;\n    const int b_col = _temp;\n\n    const data_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_points;\n    int data_loc_w_ptr = 
data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    grad_sampling_loc += grad_sampling_ptr << 1;\n    grad_attn_weight += grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;\n\n    for (int l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset =\n          data_value_ptr_init_offset + level_start_id * qid_stride;\n      const data_t *data_value_ptr = data_value + value_ptr_offset;\n      data_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col = 0; p_col < num_points; ++p_col) {\n        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const data_t weight = data_attn_weight[data_weight_ptr];\n\n        const data_t h_im = loc_h * spatial_h - 0.5;\n        const data_t w_im = loc_w * spatial_w - 0.5;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {\n          deformable_attn_bilinear_backward_gm(\n              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,\n              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,\n              grad_sampling_loc, grad_attn_weight);\n        }\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight += grad_weight_stride;\n        grad_sampling_loc += grad_loc_stride;\n      }\n    }\n  }\n}\n\n// channels <= 1024\ntemplate <typename data_t, unsigned int blockSize>\n__global__ void\ndeformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1(\n    const int n, const data_t *grad_col, const data_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const data_t *data_sampling_loc, const data_t *data_attn_weight,\n    const int batch_size, const int value_length, const int num_heads,\n    const int channels, const int num_levels, const int query_length,\n    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,\n    data_t *grad_attn_weight) {\n  CUDA_KERNEL_LOOP(index, n) {\n    __shared__ data_t cache_grad_sampling_loc[blockSize * 2];\n    __shared__ data_t cache_grad_attn_weight[blockSize];\n    unsigned int tid = threadIdx.x;\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    const int q_col = _temp % query_length;\n    _temp /= query_length;\n    const int b_col = _temp;\n\n    const data_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_points;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    grad_sampling_loc += grad_sampling_ptr << 1;\n    grad_attn_weight += grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;\n\n    for (int l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = 
data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset =\n          data_value_ptr_init_offset + level_start_id * qid_stride;\n      const data_t *data_value_ptr = data_value + value_ptr_offset;\n      data_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col = 0; p_col < num_points; ++p_col) {\n        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const data_t weight = data_attn_weight[data_weight_ptr];\n\n        const data_t h_im = loc_h * spatial_h - 0.5;\n        const data_t w_im = loc_w * spatial_w - 0.5;\n        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;\n        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;\n        *(cache_grad_attn_weight + threadIdx.x) = 0;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {\n          deformable_attn_bilinear_backward(\n              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,\n              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,\n              cache_grad_sampling_loc + (threadIdx.x << 1),\n              cache_grad_attn_weight + threadIdx.x);\n        }\n\n        __syncthreads();\n        if (tid == 0) {\n          data_t _grad_w = cache_grad_sampling_loc[0],\n                 _grad_h = cache_grad_sampling_loc[1],\n                 _grad_a = cache_grad_attn_weight[0];\n          int sid = 2;\n          for (unsigned int tid = 1; tid < blockSize; ++tid) {\n            _grad_w += cache_grad_sampling_loc[sid];\n            _grad_h += cache_grad_sampling_loc[sid + 1];\n            _grad_a += cache_grad_attn_weight[tid];\n            sid += 2;\n          }\n\n          *grad_sampling_loc = _grad_w;\n          *(grad_sampling_loc + 1) = _grad_h;\n          *grad_attn_weight = _grad_a;\n        }\n        __syncthreads();\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight += grad_weight_stride;\n        grad_sampling_loc += grad_loc_stride;\n      }\n    }\n  }\n}\n\ntemplate <typename data_t, unsigned int blockSize>\n__global__ void\ndeformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2(\n    const int n, const data_t *grad_col, const data_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const data_t *data_sampling_loc, const data_t *data_attn_weight,\n    const int batch_size, const int value_length, const int num_heads,\n    const int channels, const int num_levels, const int query_length,\n    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,\n    data_t *grad_attn_weight) {\n  CUDA_KERNEL_LOOP(index, n) {\n    __shared__ data_t cache_grad_sampling_loc[blockSize * 2];\n    __shared__ data_t cache_grad_attn_weight[blockSize];\n    unsigned int tid = threadIdx.x;\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    const int q_col = _temp % query_length;\n    _temp /= query_length;\n    const int b_col = _temp;\n\n    const data_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_points;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int 
grad_sampling_ptr = data_weight_ptr;\n    grad_sampling_loc += grad_sampling_ptr << 1;\n    grad_attn_weight += grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;\n\n    for (int l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset =\n          data_value_ptr_init_offset + level_start_id * qid_stride;\n      const data_t *data_value_ptr = data_value + value_ptr_offset;\n      data_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col = 0; p_col < num_points; ++p_col) {\n        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const data_t weight = data_attn_weight[data_weight_ptr];\n\n        const data_t h_im = loc_h * spatial_h - 0.5;\n        const data_t w_im = loc_w * spatial_w - 0.5;\n        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;\n        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;\n        *(cache_grad_attn_weight + threadIdx.x) = 0;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {\n          deformable_attn_bilinear_backward(\n              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,\n              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,\n              cache_grad_sampling_loc + (threadIdx.x << 1),\n              cache_grad_attn_weight + threadIdx.x);\n        }\n\n        __syncthreads();\n\n        for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {\n          if (tid < s) {\n            const unsigned int xid1 = tid << 1;\n            const unsigned int xid2 = (tid + s) << 1;\n            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];\n            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];\n            cache_grad_sampling_loc[xid1 + 1] +=\n                cache_grad_sampling_loc[xid2 + 1];\n          }\n          __syncthreads();\n        }\n\n        if (tid == 0) {\n          *grad_sampling_loc = cache_grad_sampling_loc[0];\n          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];\n          *grad_attn_weight = cache_grad_attn_weight[0];\n        }\n        __syncthreads();\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight += grad_weight_stride;\n        grad_sampling_loc += grad_loc_stride;\n      }\n    }\n  }\n}\n\ntemplate <typename data_t>\n__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v1(\n    const int n, const data_t *grad_col, const data_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const data_t *data_sampling_loc, const data_t *data_attn_weight,\n    const int batch_size, const int value_length, const int num_heads,\n    const int channels, const int num_levels, const int query_length,\n    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,\n    data_t *grad_attn_weight) {\n  CUDA_KERNEL_LOOP(index, n) {\n    extern __shared__ int _s[];\n    data_t *cache_grad_sampling_loc = (data_t *)_s;\n    data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * 
blockDim.x;\n    unsigned int tid = threadIdx.x;\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    const int q_col = _temp % query_length;\n    _temp /= query_length;\n    const int b_col = _temp;\n\n    const data_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_points;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    grad_sampling_loc += grad_sampling_ptr << 1;\n    grad_attn_weight += grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;\n\n    for (int l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset =\n          data_value_ptr_init_offset + level_start_id * qid_stride;\n      const data_t *data_value_ptr = data_value + value_ptr_offset;\n      data_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col = 0; p_col < num_points; ++p_col) {\n        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const data_t weight = data_attn_weight[data_weight_ptr];\n\n        const data_t h_im = loc_h * spatial_h - 0.5;\n        const data_t w_im = loc_w * spatial_w - 0.5;\n        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;\n        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;\n        *(cache_grad_attn_weight + threadIdx.x) = 0;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {\n          deformable_attn_bilinear_backward(\n              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,\n              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,\n              cache_grad_sampling_loc + (threadIdx.x << 1),\n              cache_grad_attn_weight + threadIdx.x);\n        }\n\n        __syncthreads();\n        if (tid == 0) {\n          data_t _grad_w = cache_grad_sampling_loc[0],\n                 _grad_h = cache_grad_sampling_loc[1],\n                 _grad_a = cache_grad_attn_weight[0];\n          int sid = 2;\n          for (unsigned int tid = 1; tid < blockDim.x; ++tid) {\n            _grad_w += cache_grad_sampling_loc[sid];\n            _grad_h += cache_grad_sampling_loc[sid + 1];\n            _grad_a += cache_grad_attn_weight[tid];\n            sid += 2;\n          }\n\n          *grad_sampling_loc = _grad_w;\n          *(grad_sampling_loc + 1) = _grad_h;\n          *grad_attn_weight = _grad_a;\n        }\n        __syncthreads();\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight += grad_weight_stride;\n        grad_sampling_loc += grad_loc_stride;\n      }\n    }\n  }\n}\n\ntemplate <typename data_t>\n__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2(\n    const int n, const data_t *grad_col, const data_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const data_t *data_sampling_loc, const data_t *data_attn_weight,\n    const int 
batch_size, const int value_length, const int num_heads,\n    const int channels, const int num_levels, const int query_length,\n    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,\n    data_t *grad_attn_weight) {\n  CUDA_KERNEL_LOOP(index, n) {\n    extern __shared__ int _s[];\n    data_t *cache_grad_sampling_loc = (data_t *)_s;\n    data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;\n    unsigned int tid = threadIdx.x;\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    const int q_col = _temp % query_length;\n    _temp /= query_length;\n    const int b_col = _temp;\n\n    const data_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_points;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    grad_sampling_loc += grad_sampling_ptr << 1;\n    grad_attn_weight += grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;\n\n    for (int l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset =\n          data_value_ptr_init_offset + level_start_id * qid_stride;\n      const data_t *data_value_ptr = data_value + value_ptr_offset;\n      data_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col = 0; p_col < num_points; ++p_col) {\n        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const data_t weight = data_attn_weight[data_weight_ptr];\n\n        const data_t h_im = loc_h * spatial_h - 0.5;\n        const data_t w_im = loc_w * spatial_w - 0.5;\n        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;\n        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;\n        *(cache_grad_attn_weight + threadIdx.x) = 0;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {\n          deformable_attn_bilinear_backward(\n              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,\n              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,\n              cache_grad_sampling_loc + (threadIdx.x << 1),\n              cache_grad_attn_weight + threadIdx.x);\n        }\n\n        __syncthreads();\n\n        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;\n             s >>= 1, spre >>= 1) {\n          if (tid < s) {\n            const unsigned int xid1 = tid << 1;\n            const unsigned int xid2 = (tid + s) << 1;\n            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];\n            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];\n            cache_grad_sampling_loc[xid1 + 1] +=\n                cache_grad_sampling_loc[xid2 + 1];\n            if (tid + (s << 1) < spre) {\n              cache_grad_attn_weight[tid] +=\n                  cache_grad_attn_weight[tid + (s << 1)];\n              cache_grad_sampling_loc[xid1] +=\n                  cache_grad_sampling_loc[xid2 + (s << 
1)];\n              cache_grad_sampling_loc[xid1 + 1] +=\n                  cache_grad_sampling_loc[xid2 + 1 + (s << 1)];\n            }\n          }\n          __syncthreads();\n        }\n\n        if (tid == 0) {\n          *grad_sampling_loc = cache_grad_sampling_loc[0];\n          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];\n          *grad_attn_weight = cache_grad_attn_weight[0];\n        }\n        __syncthreads();\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight += grad_weight_stride;\n        grad_sampling_loc += grad_loc_stride;\n      }\n    }\n  }\n}\n\n// backward branch\ntemplate <typename data_t>\nvoid deformable_attn_cuda_backward(\n    cudaStream_t stream, const data_t *grad_out, const data_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const data_t *data_sampling_loc, const data_t *data_attn_weight,\n    const int batch_size, const int value_length, const int num_heads,\n    const int channels, const int num_levels, const int query_length,\n    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,\n    data_t *grad_attn_weight) {\n  const int num_threads =\n      (channels > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS : channels;\n  const int num_kernels = batch_size * query_length * num_heads * channels;\n  const int num_actual_kernels =\n      batch_size * query_length * num_heads * channels;\n  if (channels > 1024) {\n    if ((channels & 1023) == 0) {\n      deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks<data_t>\n          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n             num_threads * 3 * sizeof(data_t), stream>>>(\n              num_kernels, grad_out, data_value, data_spatial_shapes,\n              data_level_start_index, data_sampling_loc, data_attn_weight,\n              batch_size, value_length, num_heads, channels, num_levels,\n              query_length, num_points, grad_value, grad_sampling_loc,\n              grad_attn_weight);\n    } else {\n      deformable_attn_cuda_kernel_backward_gm<data_t>\n          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,\n                       data_level_start_index, data_sampling_loc,\n                       data_attn_weight, batch_size, value_length, num_heads,\n                       channels, num_levels, query_length, num_points,\n                       grad_value, grad_sampling_loc, grad_attn_weight);\n    }\n  } else {\n    switch (channels) {\n    case 1:\n      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,\n                                                                         1>\n          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,\n                       data_level_start_index, data_sampling_loc,\n                       data_attn_weight, batch_size, value_length, num_heads,\n                       channels, num_levels, query_length, num_points,\n                       grad_value, grad_sampling_loc, grad_attn_weight);\n      break;\n    case 2:\n      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,\n                                                                         2>\n          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n             stream>>>(num_kernels, grad_out, data_value, 
data_spatial_shapes,\n                       data_level_start_index, data_sampling_loc,\n                       data_attn_weight, batch_size, value_length, num_heads,\n                       channels, num_levels, query_length, num_points,\n                       grad_value, grad_sampling_loc, grad_attn_weight);\n      break;\n    case 4:\n      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,\n                                                                         4>\n          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,\n                       data_level_start_index, data_sampling_loc,\n                       data_attn_weight, batch_size, value_length, num_heads,\n                       channels, num_levels, query_length, num_points,\n                       grad_value, grad_sampling_loc, grad_attn_weight);\n      break;\n    case 8:\n      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,\n                                                                         8>\n          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,\n                       data_level_start_index, data_sampling_loc,\n                       data_attn_weight, batch_size, value_length, num_heads,\n                       channels, num_levels, query_length, num_points,\n                       grad_value, grad_sampling_loc, grad_attn_weight);\n      break;\n    case 16:\n      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,\n                                                                         16>\n          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,\n                       data_level_start_index, data_sampling_loc,\n                       data_attn_weight, batch_size, value_length, num_heads,\n                       channels, num_levels, query_length, num_points,\n                       grad_value, grad_sampling_loc, grad_attn_weight);\n      break;\n    case 32:\n      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,\n                                                                         32>\n          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,\n                       data_level_start_index, data_sampling_loc,\n                       data_attn_weight, batch_size, value_length, num_heads,\n                       channels, num_levels, query_length, num_points,\n                       grad_value, grad_sampling_loc, grad_attn_weight);\n      break;\n    case 64:\n      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t,\n                                                                         64>\n          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,\n                       data_level_start_index, data_sampling_loc,\n                       data_attn_weight, batch_size, value_length, num_heads,\n                       channels, num_levels, query_length, num_points,\n                       grad_value, grad_sampling_loc, grad_attn_weight);\n      break;\n    case 128:\n      
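// blockSize is a compile-time constant here: static shared memory and a\n      // power-of-two tree reduction inside the kernel\n      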
deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t,\n                                                                         128>\n          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,\n                       data_level_start_index, data_sampling_loc,\n                       data_attn_weight, batch_size, value_length, num_heads,\n                       channels, num_levels, query_length, num_points,\n                       grad_value, grad_sampling_loc, grad_attn_weight);\n      break;\n    case 256:\n      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t,\n                                                                         256>\n          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,\n                       data_level_start_index, data_sampling_loc,\n                       data_attn_weight, batch_size, value_length, num_heads,\n                       channels, num_levels, query_length, num_points,\n                       grad_value, grad_sampling_loc, grad_attn_weight);\n      break;\n    case 512:\n      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t,\n                                                                         512>\n          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,\n                       data_level_start_index, data_sampling_loc,\n                       data_attn_weight, batch_size, value_length, num_heads,\n                       channels, num_levels, query_length, num_points,\n                       grad_value, grad_sampling_loc, grad_attn_weight);\n      break;\n    case 1024:\n      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t,\n                                                                         1024>\n          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,\n                       data_level_start_index, data_sampling_loc,\n                       data_attn_weight, batch_size, value_length, num_heads,\n                       channels, num_levels, query_length, num_points,\n                       grad_value, grad_sampling_loc, grad_attn_weight);\n      break;\n    default:\n      if (channels < 64) {\n        deformable_attn_cuda_kernel_backward_shm_reduce_v1<data_t>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n               num_threads * 3 * sizeof(data_t), stream>>>(\n                num_kernels, grad_out, data_value, data_spatial_shapes,\n                data_level_start_index, data_sampling_loc, data_attn_weight,\n                batch_size, value_length, num_heads, channels, num_levels,\n                query_length, num_points, grad_value, grad_sampling_loc,\n                grad_attn_weight);\n      } else {\n        deformable_attn_cuda_kernel_backward_shm_reduce_v2<data_t>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n               num_threads * 3 * sizeof(data_t), stream>>>(\n                num_kernels, grad_out, data_value, data_spatial_shapes,\n                data_level_start_index, data_sampling_loc, data_attn_weight,\n                batch_size, value_length, num_heads, channels, 
num_levels,\n                query_length, num_points, grad_value, grad_sampling_loc,\n                grad_attn_weight);\n      }\n    }\n  }\n}\n\n// backward\nstd::vector<paddle::Tensor> MSDeformableAttnCUDABackward(\n    const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,\n    const paddle::Tensor &value_level_start_index,\n    const paddle::Tensor &sampling_locations,\n    const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out) {\n\n  CHECK_INPUT_GPU(value);\n  CHECK_INPUT_GPU(value_spatial_shapes);\n  CHECK_INPUT_GPU(value_level_start_index);\n  CHECK_INPUT_GPU(sampling_locations);\n  CHECK_INPUT_GPU(attention_weights);\n  CHECK_INPUT_GPU(grad_out);\n\n  const int batch_size = value.shape()[0];\n  const int value_length = value.shape()[1];\n  const int num_heads = value.shape()[2];\n  const int channels = value.shape()[3];\n\n  const int num_levels = value_spatial_shapes.shape()[0];\n  const int query_length = sampling_locations.shape()[1];\n  const int num_points = sampling_locations.shape()[4];\n\n  auto grad_value =\n      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());\n  // placeholder zero tensors returned as gradients for the integer inputs\n  // (SpatialShapes, LevelIndex) declared in PD_BUILD_GRAD_OP; they are\n  // never consumed by autograd.\n  auto grad_spatial_shapes =\n      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());\n  auto grad_level_start_index =\n      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());\n  auto grad_sampling_locations =\n      paddle::full(sampling_locations.shape(), 0, sampling_locations.dtype(),\n                   paddle::GPUPlace());\n  auto grad_attention_weights =\n      paddle::full(attention_weights.shape(), 0, attention_weights.dtype(),\n                   paddle::GPUPlace());\n\n  deformable_attn_cuda_backward<float>(\n      value.stream(), grad_out.data<float>(), value.data<float>(),\n      value_spatial_shapes.data<int64_t>(),\n      value_level_start_index.data<int64_t>(), sampling_locations.data<float>(),\n      attention_weights.data<float>(), batch_size, value_length, num_heads,\n      channels, num_levels, query_length, num_points, grad_value.data<float>(),\n      grad_sampling_locations.data<float>(),\n      grad_attention_weights.data<float>());\n\n  return {grad_value, grad_spatial_shapes, grad_level_start_index,\n          grad_sampling_locations, grad_attention_weights};\n}\n"
  },
  {
    "path": "ppdet/modeling/transformers/ext_op/setup_ms_deformable_attn_op.py",
    "content": "from paddle.utils.cpp_extension import CUDAExtension, setup\n\nif __name__ == \"__main__\":\n    setup(\n        name='deformable_detr_ops',\n        ext_modules=CUDAExtension(\n            sources=['ms_deformable_attn_op.cc', 'ms_deformable_attn_op.cu']))\n"
  },
  {
    "path": "ppdet/modeling/transformers/ext_op/test_ms_deformable_attn_op.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import print_function\nfrom __future__ import division\n\nimport os\nimport sys\nimport random\nimport numpy as np\nimport paddle\n# add python path of PaddleDetection to sys.path\nparent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 5)))\nif parent_path not in sys.path:\n    sys.path.append(parent_path)\n\nfrom ppdet.modeling.transformers.utils import deformable_attention_core_func\nms_deform_attn_core_paddle = deformable_attention_core_func\n\ntry:\n    gpu_index = int(sys.argv[1])\nexcept:\n    gpu_index = 0\nprint(f'Use gpu {gpu_index} to test...')\npaddle.set_device(f'gpu:{gpu_index}')\n\ntry:\n    from deformable_detr_ops import ms_deformable_attn\nexcept Exception as e:\n    print('import deformable_detr_ops error', e)\n    sys.exit(-1)\n\npaddle.seed(1)\nrandom.seed(1)\nnp.random.seed(1)\n\nbs, n_heads, c = 2, 8, 8\nquery_length, n_levels, n_points = 2, 2, 2\nspatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)\nlevel_start_index = paddle.concat((paddle.to_tensor(\n    [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))\nvalue_length = sum([(H * W).item() for H, W in spatial_shapes])\n\n\ndef get_test_tensors(channels):\n    value = paddle.rand(\n        [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01\n    sampling_locations = paddle.rand(\n        [bs, query_length, n_heads, n_levels, n_points, 2],\n        dtype=paddle.float32)\n    attention_weights = paddle.rand(\n        [bs, query_length, n_heads, n_levels, n_points],\n        dtype=paddle.float32) + 1e-5\n    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(\n        -2, keepdim=True)\n\n    return [value, sampling_locations, attention_weights]\n\n\n@paddle.no_grad()\ndef check_forward_equal_with_paddle_float():\n    value, sampling_locations, attention_weights = get_test_tensors(c)\n\n    output_paddle = ms_deform_attn_core_paddle(\n        value, spatial_shapes, level_start_index, sampling_locations,\n        attention_weights).detach().cpu()\n    output_cuda = ms_deformable_attn(value, spatial_shapes, level_start_index,\n                                     sampling_locations,\n                                     attention_weights).detach().cpu()\n    fwdok = paddle.allclose(\n        output_cuda, output_paddle, rtol=1e-2, atol=1e-3).item()\n    max_abs_err = (output_cuda - output_paddle).abs().max().item()\n    max_rel_err = (\n        (output_cuda - output_paddle).abs() / output_paddle.abs()).max().item()\n\n    print(\n        f'*{fwdok} check_forward_equal_with_paddle_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}'\n    )\n\n\ndef check_gradient_numerical(channels=4):\n    value_paddle, sampling_locations_paddle, attention_weights_paddle = get_test_tensors(\n        channels)\n    value_paddle.stop_gradient = False\n    
sampling_locations_paddle.stop_gradient = False\n    attention_weights_paddle.stop_gradient = False\n\n    value_cuda = value_paddle.detach().clone()\n    sampling_locations_cuda = sampling_locations_paddle.detach().clone()\n    attention_weights_cuda = attention_weights_paddle.detach().clone()\n    value_cuda.stop_gradient = False\n    sampling_locations_cuda.stop_gradient = False\n    attention_weights_cuda.stop_gradient = False\n\n    output_paddle = ms_deform_attn_core_paddle(\n        value_paddle, spatial_shapes, level_start_index,\n        sampling_locations_paddle, attention_weights_paddle)\n    output_paddle.sum().backward()\n\n    output_cuda = ms_deformable_attn(value_cuda, spatial_shapes,\n                                     level_start_index, sampling_locations_cuda,\n                                     attention_weights_cuda)\n    output_cuda.sum().backward()\n\n    res = paddle.allclose(\n        value_paddle.grad, value_cuda.grad, rtol=1e-2, atol=1e-3).item()\n    print(f'*tensor1 {res} check_gradient_numerical(D={channels})')\n\n    res = paddle.allclose(\n        sampling_locations_paddle.grad,\n        sampling_locations_cuda.grad,\n        rtol=1e-2,\n        atol=1e-3).item()\n    print(f'*tensor2 {res} check_gradient_numerical(D={channels})')\n\n    res = paddle.allclose(\n        attention_weights_paddle.grad,\n        attention_weights_cuda.grad,\n        rtol=1e-2,\n        atol=1e-3).item()\n    print(f'*tensor3 {res} check_gradient_numerical(D={channels})')\n\n\nif __name__ == '__main__':\n    check_forward_equal_with_paddle_float()\n\n    for channels in [30, 32, 64, 71, 128, 1024, 1025, 2048, 3096]:\n        check_gradient_numerical(channels)\n"
  },
  {
    "path": "ppdet/modeling/transformers/group_detr_transformer.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# Modified from detrex (https://github.com/IDEA-Research/detrex)\n# Copyright 2022 The IDEA Authors. All rights reserved.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\n\nfrom ppdet.core.workspace import register\nfrom ..layers import MultiHeadAttention\nfrom .position_encoding import PositionEmbedding\nfrom ..heads.detr_head import MLP\nfrom .deformable_transformer import MSDeformableAttention\nfrom ..initializer import (linear_init_, constant_, xavier_uniform_, normal_,\n                           bias_init_with_prob)\nfrom .utils import (_get_clones, get_valid_ratio,\n                    get_contrastive_denoising_training_group,\n                    get_sine_pos_embed, inverse_sigmoid)\n\n__all__ = ['GroupDINOTransformer']\n\n\nclass DINOTransformerEncoderLayer(nn.Layer):\n    def __init__(self,\n                 d_model=256,\n                 n_head=8,\n                 dim_feedforward=1024,\n                 dropout=0.,\n                 activation=\"relu\",\n                 n_levels=4,\n                 n_points=4,\n                 weight_attr=None,\n                 bias_attr=None):\n        super(DINOTransformerEncoderLayer, self).__init__()\n        # self attention\n        self.self_attn = MSDeformableAttention(d_model, n_head, n_levels,\n                                               n_points, 1.0)\n        self.dropout1 = nn.Dropout(dropout)\n        self.norm1 = nn.LayerNorm(\n            d_model,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n        # ffn\n        self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr,\n                                 bias_attr)\n        self.activation = getattr(F, activation)\n        self.dropout2 = nn.Dropout(dropout)\n        self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr,\n                                 bias_attr)\n        self.dropout3 = nn.Dropout(dropout)\n        self.norm2 = nn.LayerNorm(\n            d_model,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        linear_init_(self.linear1)\n        linear_init_(self.linear2)\n        xavier_uniform_(self.linear1.weight)\n        xavier_uniform_(self.linear2.weight)\n\n    def with_pos_embed(self, tensor, pos):\n        return tensor if pos is None else tensor + pos\n\n    def forward_ffn(self, src):\n        src2 = 
self.linear2(self.dropout2(self.activation(self.linear1(src))))\n        src = src + self.dropout3(src2)\n        src = self.norm2(src)\n        return src\n\n    def forward(self,\n                src,\n                reference_points,\n                spatial_shapes,\n                level_start_index,\n                src_mask=None,\n                query_pos_embed=None):\n        # self attention\n        src2 = self.self_attn(\n            self.with_pos_embed(src, query_pos_embed), reference_points, src,\n            spatial_shapes, level_start_index, src_mask)\n        src = src + self.dropout1(src2)\n        src = self.norm1(src)\n        # ffn\n        src = self.forward_ffn(src)\n\n        return src\n\n\nclass DINOTransformerEncoder(nn.Layer):\n    def __init__(self, encoder_layer, num_layers):\n        super(DINOTransformerEncoder, self).__init__()\n        self.layers = _get_clones(encoder_layer, num_layers)\n        self.num_layers = num_layers\n\n    @staticmethod\n    def get_reference_points(spatial_shapes, valid_ratios, offset=0.5):\n        valid_ratios = valid_ratios.unsqueeze(1)\n        reference_points = []\n        for i, (H, W) in enumerate(spatial_shapes):\n            ref_y, ref_x = paddle.meshgrid(\n                paddle.arange(end=H) + offset, paddle.arange(end=W) + offset)\n            ref_y = ref_y.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 1] *\n                                                    H)\n            ref_x = ref_x.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 0] *\n                                                    W)\n            reference_points.append(paddle.stack((ref_x, ref_y), axis=-1))\n        reference_points = paddle.concat(reference_points, 1).unsqueeze(2)\n        reference_points = reference_points * valid_ratios\n        return reference_points\n\n    def forward(self,\n                feat,\n                spatial_shapes,\n                level_start_index,\n                feat_mask=None,\n                query_pos_embed=None,\n                valid_ratios=None):\n        if valid_ratios is None:\n            valid_ratios = paddle.ones(\n                [feat.shape[0], spatial_shapes.shape[0], 2])\n        reference_points = self.get_reference_points(spatial_shapes,\n                                                     valid_ratios)\n        for layer in self.layers:\n            feat = layer(feat, reference_points, spatial_shapes,\n                         level_start_index, feat_mask, query_pos_embed)\n\n        return feat\n\n\nclass DINOTransformerDecoderLayer(nn.Layer):\n    def __init__(self,\n                 d_model=256,\n                 n_head=8,\n                 dim_feedforward=1024,\n                 dropout=0.,\n                 activation=\"relu\",\n                 n_levels=4,\n                 n_points=4,\n                 dual_queries=False,\n                 dual_groups=0,\n                 weight_attr=None,\n                 bias_attr=None):\n        super(DINOTransformerDecoderLayer, self).__init__()\n\n        # self attention\n        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)\n        self.dropout1 = nn.Dropout(dropout)\n        self.norm1 = nn.LayerNorm(\n            d_model,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n\n        # cross attention\n        self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels,\n                                                n_points, 1.0)\n        
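# deformable cross-attention: every query samples n_points locations per\n        # feature level from the encoder memory around its reference point\n        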
self.dropout2 = nn.Dropout(dropout)\n        self.norm2 = nn.LayerNorm(\n            d_model,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n\n        # ffn\n        self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr,\n                                 bias_attr)\n        self.activation = getattr(F, activation)\n        self.dropout3 = nn.Dropout(dropout)\n        self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr,\n                                 bias_attr)\n        self.dropout4 = nn.Dropout(dropout)\n        self.norm3 = nn.LayerNorm(\n            d_model,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n\n        # for dual groups \n        self.dual_queries = dual_queries\n        self.dual_groups = dual_groups\n        self.n_head = n_head\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        linear_init_(self.linear1)\n        linear_init_(self.linear2)\n        xavier_uniform_(self.linear1.weight)\n        xavier_uniform_(self.linear2.weight)\n\n    def with_pos_embed(self, tensor, pos):\n        return tensor if pos is None else tensor + pos\n\n    def forward_ffn(self, tgt):\n        return self.linear2(self.dropout3(self.activation(self.linear1(tgt))))\n\n    def forward(self,\n                tgt,\n                reference_points,\n                memory,\n                memory_spatial_shapes,\n                memory_level_start_index,\n                attn_mask=None,\n                memory_mask=None,\n                query_pos_embed=None):\n        # self attention\n        q = k = self.with_pos_embed(tgt, query_pos_embed)\n        if self.dual_queries:\n            dual_groups = self.dual_groups\n            bs, num_queries, n_model = q.shape\n            q = paddle.concat(q.split(dual_groups + 1, axis=1), axis=0)\n            k = paddle.concat(k.split(dual_groups + 1, axis=1), axis=0)\n            tgt = paddle.concat(tgt.split(dual_groups + 1, axis=1), axis=0)\n\n            g_num_queries = num_queries // (dual_groups + 1)\n            if attn_mask is None or attn_mask[0] is None:\n                attn_mask = None\n            else:\n                # [(dual_groups + 1), g_num_queries, g_num_queries]\n                attn_mask = paddle.concat(\n                    [sa_mask.unsqueeze(0) for sa_mask in attn_mask], axis=0)\n                # [1, (dual_groups + 1), 1, g_num_queries, g_num_queries]\n                # --> [bs, (dual_groups + 1), nhead, g_num_queries, g_num_queries]\n                # --> [bs * (dual_groups + 1), nhead, g_num_queries, g_num_queries]\n                attn_mask = attn_mask.unsqueeze(0).unsqueeze(2).tile(\n                    [bs, 1, self.n_head, 1, 1])\n                attn_mask = attn_mask.reshape([\n                    bs * (dual_groups + 1), self.n_head, g_num_queries,\n                    g_num_queries\n                ])\n\n        if attn_mask is not None:\n            attn_mask = attn_mask.astype('bool')\n\n        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask)\n        tgt = tgt + self.dropout1(tgt2)\n        tgt = self.norm2(tgt)\n\n        # trace back\n        if self.dual_queries:\n            tgt = paddle.concat(tgt.split(dual_groups + 1, axis=0), axis=1)\n\n        # cross attention\n        tgt2 = self.cross_attn(\n            self.with_pos_embed(tgt, query_pos_embed), reference_points, memory,\n            
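# spatial shapes and level start index locate each scale inside the\n            # flattened memory; memory_mask hides padded positions\n            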
memory_spatial_shapes, memory_level_start_index, memory_mask)\n        tgt = tgt + self.dropout2(tgt2)\n        tgt = self.norm1(tgt)\n\n        # ffn\n        tgt2 = self.forward_ffn(tgt)\n        tgt = tgt + self.dropout4(tgt2)\n        tgt = self.norm3(tgt)\n\n        return tgt\n\n\nclass DINOTransformerDecoder(nn.Layer):\n    def __init__(self,\n                 hidden_dim,\n                 decoder_layer,\n                 num_layers,\n                 return_intermediate=True):\n        super(DINOTransformerDecoder, self).__init__()\n        self.layers = _get_clones(decoder_layer, num_layers)\n        self.hidden_dim = hidden_dim\n        self.num_layers = num_layers\n        self.return_intermediate = return_intermediate\n\n        self.norm = nn.LayerNorm(\n            hidden_dim,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n\n    def forward(self,\n                tgt,\n                reference_points,\n                memory,\n                memory_spatial_shapes,\n                memory_level_start_index,\n                bbox_head,\n                query_pos_head,\n                valid_ratios=None,\n                attn_mask=None,\n                memory_mask=None):\n        if valid_ratios is None:\n            valid_ratios = paddle.ones(\n                [memory.shape[0], memory_spatial_shapes.shape[0], 2])\n\n        output = tgt\n        intermediate = []\n        inter_ref_bboxes = []\n        for i, layer in enumerate(self.layers):\n            reference_points_input = reference_points.unsqueeze(\n                2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1)\n            query_pos_embed = get_sine_pos_embed(\n                reference_points_input[..., 0, :], self.hidden_dim // 2)\n            query_pos_embed = query_pos_head(query_pos_embed)\n\n            output = layer(output, reference_points_input, memory,\n                           memory_spatial_shapes, memory_level_start_index,\n                           attn_mask, memory_mask, query_pos_embed)\n            inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid(\n                reference_points))\n\n            if self.return_intermediate:\n                intermediate.append(self.norm(output))\n                inter_ref_bboxes.append(inter_ref_bbox)\n\n            reference_points = inter_ref_bbox.detach()\n\n        if self.return_intermediate:\n            return paddle.stack(intermediate), paddle.stack(inter_ref_bboxes)\n\n        return output, reference_points\n\n\n@register\nclass GroupDINOTransformer(nn.Layer):\n    __shared__ = ['num_classes', 'hidden_dim']\n\n    def __init__(self,\n                 num_classes=80,\n                 hidden_dim=256,\n                 num_queries=900,\n                 position_embed_type='sine',\n                 return_intermediate_dec=True,\n                 backbone_feat_channels=[512, 1024, 2048],\n                 num_levels=4,\n                 num_encoder_points=4,\n                 num_decoder_points=4,\n                 nhead=8,\n                 num_encoder_layers=6,\n                 num_decoder_layers=6,\n                 dim_feedforward=1024,\n                 dropout=0.,\n                 activation=\"relu\",\n                 pe_temperature=10000,\n                 pe_offset=-0.5,\n                 num_denoising=100,\n                 label_noise_ratio=0.5,\n                 box_noise_scale=1.0,\n                 learnt_init_query=True,\n                 
use_input_proj=True,\n                 dual_queries=False,\n                 dual_groups=0,\n                 eps=1e-2):\n        super(GroupDINOTransformer, self).__init__()\n        assert position_embed_type in ['sine', 'learned'], \\\n            f'ValueError: position_embed_type not supported {position_embed_type}!'\n        assert len(backbone_feat_channels) <= num_levels\n\n        self.hidden_dim = hidden_dim\n        self.nhead = nhead\n        self.num_levels = num_levels\n        self.num_classes = num_classes\n        self.num_queries = num_queries\n        self.eps = eps\n        self.num_decoder_layers = num_decoder_layers\n        self.use_input_proj = use_input_proj\n\n        if use_input_proj:\n            # backbone feature projection\n            self._build_input_proj_layer(backbone_feat_channels)\n\n        # Transformer module\n        encoder_layer = DINOTransformerEncoderLayer(\n            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,\n            num_encoder_points)\n        self.encoder = DINOTransformerEncoder(encoder_layer, num_encoder_layers)\n        decoder_layer = DINOTransformerDecoderLayer(\n            hidden_dim,\n            nhead,\n            dim_feedforward,\n            dropout,\n            activation,\n            num_levels,\n            num_decoder_points,\n            dual_queries=dual_queries,\n            dual_groups=dual_groups)\n        self.decoder = DINOTransformerDecoder(hidden_dim, decoder_layer,\n                                              num_decoder_layers,\n                                              return_intermediate_dec)\n\n        # denoising part\n        self.denoising_class_embed = nn.Embedding(\n            num_classes,\n            hidden_dim,\n            weight_attr=ParamAttr(initializer=nn.initializer.Normal()))\n        self.num_denoising = num_denoising\n        self.label_noise_ratio = label_noise_ratio\n        self.box_noise_scale = box_noise_scale\n\n        # for dual group\n        self.dual_queries = dual_queries\n        self.dual_groups = dual_groups\n        if self.dual_queries:\n            self.denoising_class_embed_groups = nn.LayerList([\n                nn.Embedding(\n                    num_classes,\n                    hidden_dim,\n                    weight_attr=ParamAttr(initializer=nn.initializer.Normal()))\n                for _ in range(self.dual_groups)\n            ])\n\n        # position embedding\n        self.position_embedding = PositionEmbedding(\n            hidden_dim // 2,\n            temperature=pe_temperature,\n            normalize=True if position_embed_type == 'sine' else False,\n            embed_type=position_embed_type,\n            offset=pe_offset)\n        self.level_embed = nn.Embedding(num_levels, hidden_dim)\n        # decoder embedding\n        self.learnt_init_query = learnt_init_query\n        if learnt_init_query:\n            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)\n            normal_(self.tgt_embed.weight)\n            if self.dual_queries:\n                self.tgt_embed_dual = nn.LayerList([\n                    nn.Embedding(num_queries, hidden_dim)\n                    for _ in range(self.dual_groups)\n                ])\n                for dual_tgt_module in self.tgt_embed_dual:\n                    normal_(dual_tgt_module.weight)\n        self.query_pos_head = MLP(2 * hidden_dim,\n                                  hidden_dim,\n                                  hidden_dim,\n                                  
num_layers=2)\n\n        # encoder head\n        self.enc_output = nn.Sequential(\n            nn.Linear(hidden_dim, hidden_dim),\n            nn.LayerNorm(\n                hidden_dim,\n                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                bias_attr=ParamAttr(regularizer=L2Decay(0.0))))\n        if self.dual_queries:\n            self.enc_output = _get_clones(self.enc_output, self.dual_groups + 1)\n        else:\n            self.enc_output = _get_clones(self.enc_output, 1)\n\n        self.enc_score_head = nn.Linear(hidden_dim, num_classes)\n        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)\n\n        if self.dual_queries:\n            self.enc_bbox_head_dq = nn.LayerList([\n                MLP(hidden_dim, hidden_dim, 4, num_layers=3)\n                for i in range(self.dual_groups)\n            ])\n            self.enc_score_head_dq = nn.LayerList([\n                nn.Linear(hidden_dim, num_classes)\n                for i in range(self.dual_groups)\n            ])\n\n        # decoder head\n        self.dec_score_head = nn.LayerList([\n            nn.Linear(hidden_dim, num_classes)\n            for _ in range(num_decoder_layers)\n        ])\n        self.dec_bbox_head = nn.LayerList([\n            MLP(hidden_dim, hidden_dim, 4, num_layers=3)\n            for _ in range(num_decoder_layers)\n        ])\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        # class and bbox head init\n        bias_cls = bias_init_with_prob(0.01)\n        linear_init_(self.enc_score_head)\n        constant_(self.enc_score_head.bias, bias_cls)\n        constant_(self.enc_bbox_head.layers[-1].weight)\n        constant_(self.enc_bbox_head.layers[-1].bias)\n        for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):\n            linear_init_(cls_)\n            constant_(cls_.bias, bias_cls)\n            constant_(reg_.layers[-1].weight)\n            constant_(reg_.layers[-1].bias)\n\n        for enc_output in self.enc_output:\n            linear_init_(enc_output[0])\n            xavier_uniform_(enc_output[0].weight)\n        normal_(self.level_embed.weight)\n        if self.learnt_init_query:\n            xavier_uniform_(self.tgt_embed.weight)\n        xavier_uniform_(self.query_pos_head.layers[0].weight)\n        xavier_uniform_(self.query_pos_head.layers[1].weight)\n        normal_(self.denoising_class_embed.weight)\n        if self.use_input_proj:\n            for l in self.input_proj:\n                xavier_uniform_(l[0].weight)\n                constant_(l[0].bias)\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'backbone_feat_channels': [i.channels for i in input_shape], }\n\n    def _build_input_proj_layer(self, backbone_feat_channels):\n        self.input_proj = nn.LayerList()\n        for in_channels in backbone_feat_channels:\n            self.input_proj.append(\n                nn.Sequential(\n                    ('conv', nn.Conv2D(\n                        in_channels, self.hidden_dim, kernel_size=1)),\n                    ('norm', nn.GroupNorm(\n                        32,\n                        self.hidden_dim,\n                        weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                        bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))\n        in_channels = backbone_feat_channels[-1]\n        for _ in range(self.num_levels - len(backbone_feat_channels)):\n            self.input_proj.append(\n                nn.Sequential(\n                    
('conv', nn.Conv2D(\n                        in_channels,\n                        self.hidden_dim,\n                        kernel_size=3,\n                        stride=2,\n                        padding=1)), ('norm', nn.GroupNorm(\n                            32,\n                            self.hidden_dim,\n                            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                            bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))\n            in_channels = self.hidden_dim\n\n    def _get_encoder_input(self, feats, pad_mask=None):\n        if self.use_input_proj:\n            # get projection features\n            proj_feats = [\n                self.input_proj[i](feat) for i, feat in enumerate(feats)\n            ]\n            if self.num_levels > len(proj_feats):\n                len_srcs = len(proj_feats)\n                for i in range(len_srcs, self.num_levels):\n                    if i == len_srcs:\n                        proj_feats.append(self.input_proj[i](feats[-1]))\n                    else:\n                        proj_feats.append(self.input_proj[i](proj_feats[-1]))\n        else:\n            proj_feats = feats\n        # get encoder inputs\n        feat_flatten = []\n        mask_flatten = []\n        lvl_pos_embed_flatten = []\n        spatial_shapes = []\n        valid_ratios = []\n        for i, feat in enumerate(proj_feats):\n            bs, _, h, w = feat.shape\n            spatial_shapes.append(paddle.concat([h, w]))\n            # [b,c,h,w] -> [b,h*w,c]\n            feat_flatten.append(feat.flatten(2).transpose([0, 2, 1]))\n            if pad_mask is not None:\n                mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0]\n            else:\n                mask = paddle.ones([bs, h, w])\n            valid_ratios.append(get_valid_ratio(mask))\n            # [b, h*w, c]\n            pos_embed = self.position_embedding(mask).flatten(1, 2)\n            lvl_pos_embed = pos_embed + self.level_embed.weight[i].reshape(\n                [1, 1, -1])\n            lvl_pos_embed_flatten.append(lvl_pos_embed)\n            if pad_mask is not None:\n                # [b, h*w]\n                mask_flatten.append(mask.flatten(1))\n\n        # [b, l, c]\n        feat_flatten = paddle.concat(feat_flatten, 1)\n        # [b, l]\n        mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten,\n                                                                   1)\n        # [b, l, c]\n        lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1)\n        # [num_levels, 2]\n        spatial_shapes = paddle.to_tensor(\n            paddle.stack(spatial_shapes).astype('int64'))\n        # [l] start index of each level\n        level_start_index = paddle.concat([\n            paddle.zeros(\n                [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1]\n        ])\n        # [b, num_levels, 2]\n        valid_ratios = paddle.stack(valid_ratios, 1)\n        return (feat_flatten, spatial_shapes, level_start_index, mask_flatten,\n                lvl_pos_embed_flatten, valid_ratios)\n\n    def forward(self, feats, pad_mask=None, gt_meta=None):\n        # input projection and embedding\n        (feat_flatten, spatial_shapes, level_start_index, mask_flatten,\n         lvl_pos_embed_flatten,\n         valid_ratios) = self._get_encoder_input(feats, pad_mask)\n\n        # encoder\n        memory = self.encoder(feat_flatten, spatial_shapes, level_start_index,\n                              mask_flatten, 
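# per-level position embeddings are added to the queries in each layer\n                              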
lvl_pos_embed_flatten, valid_ratios)\n\n        # prepare denoising training\n        if self.training:\n            denoising_class, denoising_bbox, attn_mask, dn_meta = \\\n                get_contrastive_denoising_training_group(gt_meta,\n                                            self.num_classes,\n                                            self.num_queries,\n                                            self.denoising_class_embed.weight,\n                                            self.num_denoising,\n                                            self.label_noise_ratio,\n                                            self.box_noise_scale)\n            if self.dual_queries:\n                denoising_class_groups = []\n                denoising_bbox_groups = []\n                attn_mask_groups = []\n                dn_meta_groups = []\n                for g_id in range(self.dual_groups):\n                    denoising_class_gid, denoising_bbox_gid, attn_mask_gid, dn_meta_gid = \\\n                        get_contrastive_denoising_training_group(gt_meta,\n                                                    self.num_classes,\n                                                    self.num_queries,\n                                                    self.denoising_class_embed_groups[g_id].weight,\n                                                    self.num_denoising,\n                                                    self.label_noise_ratio,\n                                                    self.box_noise_scale)\n                    denoising_class_groups.append(denoising_class_gid)\n                    denoising_bbox_groups.append(denoising_bbox_gid)\n                    attn_mask_groups.append(attn_mask_gid)\n                    dn_meta_groups.append(dn_meta_gid)\n\n                # combine\n                denoising_class = [denoising_class] + denoising_class_groups\n                denoising_bbox = [denoising_bbox] + denoising_bbox_groups\n                attn_mask = [attn_mask] + attn_mask_groups\n                dn_meta = [dn_meta] + dn_meta_groups\n        else:\n            denoising_class, denoising_bbox, attn_mask, dn_meta = None, None, None, None\n\n        target, init_ref_points, enc_topk_bboxes, enc_topk_logits = \\\n            self._get_decoder_input(\n            memory, spatial_shapes, mask_flatten, denoising_class,\n            denoising_bbox)\n\n        # decoder\n        inter_feats, inter_ref_bboxes = self.decoder(\n            target, init_ref_points, memory, spatial_shapes, level_start_index,\n            self.dec_bbox_head, self.query_pos_head, valid_ratios, attn_mask,\n            mask_flatten)\n        # solve hang during distributed training\n        inter_feats[0] += self.denoising_class_embed.weight[0, 0] * 0.\n        if self.dual_queries:\n            for g_id in range(self.dual_groups):\n                inter_feats[0] += self.denoising_class_embed_groups[\n                    g_id].weight[0, 0] * 0.0\n\n        out_bboxes = []\n        out_logits = []\n        for i in range(self.num_decoder_layers):\n            out_logits.append(self.dec_score_head[i](inter_feats[i]))\n            if i == 0:\n                out_bboxes.append(\n                    F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) +\n                              inverse_sigmoid(init_ref_points)))\n            else:\n                out_bboxes.append(\n                    F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) +\n                              inverse_sigmoid(inter_ref_bboxes[i - 
1])))\n\n        out_bboxes = paddle.stack(out_bboxes)\n        out_logits = paddle.stack(out_logits)\n        return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,\n                dn_meta)\n\n    def _get_encoder_output_anchors(self,\n                                    memory,\n                                    spatial_shapes,\n                                    memory_mask=None,\n                                    grid_size=0.05):\n        output_anchors = []\n        idx = 0\n        for lvl, (h, w) in enumerate(spatial_shapes):\n            if memory_mask is not None:\n                mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w])\n                valid_H = paddle.sum(mask_[:, :, 0], 1)\n                valid_W = paddle.sum(mask_[:, 0, :], 1)\n            else:\n                valid_H, valid_W = h, w\n\n            grid_y, grid_x = paddle.meshgrid(\n                paddle.arange(\n                    end=h, dtype=memory.dtype),\n                paddle.arange(\n                    end=w, dtype=memory.dtype))\n            grid_xy = paddle.stack([grid_x, grid_y], -1)\n\n            valid_WH = paddle.stack([valid_W, valid_H], -1).reshape(\n                [-1, 1, 1, 2]).astype(grid_xy.dtype)\n            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH\n            wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl)\n            output_anchors.append(\n                paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))\n            idx += h * w\n\n        output_anchors = paddle.concat(output_anchors, 1)\n        valid_mask = ((output_anchors > self.eps) *\n                      (output_anchors < 1 - self.eps)).all(-1, keepdim=True)\n        output_anchors = paddle.log(output_anchors / (1 - output_anchors))\n        if memory_mask is not None:\n            valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0\n        output_anchors = paddle.where(valid_mask, output_anchors,\n                                      paddle.to_tensor(float(\"inf\")))\n\n        memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))\n        if self.dual_queries:\n            output_memory = [\n                self.enc_output[g_id](memory)\n                for g_id in range(self.dual_groups + 1)\n            ]\n        else:\n            output_memory = self.enc_output[0](memory)\n        return output_memory, output_anchors\n\n    def _get_decoder_input(self,\n                           memory,\n                           spatial_shapes,\n                           memory_mask=None,\n                           denoising_class=None,\n                           denoising_bbox=None):\n        bs, _, _ = memory.shape\n        # prepare input for decoder\n        output_memory, output_anchors = self._get_encoder_output_anchors(\n            memory, spatial_shapes, memory_mask)\n        if self.dual_queries:\n            enc_outputs_class = self.enc_score_head(output_memory[0])\n            enc_outputs_coord_unact = self.enc_bbox_head(output_memory[\n                0]) + output_anchors\n        else:\n            enc_outputs_class = self.enc_score_head(output_memory)\n            enc_outputs_coord_unact = self.enc_bbox_head(\n                output_memory) + output_anchors\n\n        _, topk_ind = paddle.topk(\n            enc_outputs_class.max(-1), self.num_queries, axis=1)\n        # extract region proposal boxes\n        batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype)\n        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])\n   
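     # gather_nd needs [batch_idx, query_idx] pairs to pick the top-k proposals\n   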
     topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)\n        topk_coords_unact = paddle.gather_nd(enc_outputs_coord_unact,\n                                             topk_ind)  # unsigmoided.\n        enc_topk_bboxes = F.sigmoid(topk_coords_unact)\n        reference_points = enc_topk_bboxes.detach()\n        enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind)\n\n        if self.dual_queries:\n            enc_topk_logits_groups = []\n            enc_topk_bboxes_groups = []\n            reference_points_groups = []\n            topk_ind_groups = []\n            for g_id in range(self.dual_groups):\n                enc_outputs_class_gid = self.enc_score_head_dq[g_id](\n                    output_memory[g_id + 1])\n                enc_outputs_coord_unact_gid = self.enc_bbox_head_dq[g_id](\n                    output_memory[g_id + 1]) + output_anchors\n                _, topk_ind_gid = paddle.topk(\n                    enc_outputs_class_gid.max(-1), self.num_queries, axis=1)\n                # extract region proposal boxes\n                batch_ind = paddle.arange(end=bs, dtype=topk_ind_gid.dtype)\n                batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])\n                topk_ind_gid = paddle.stack([batch_ind, topk_ind_gid], axis=-1)\n                topk_coords_unact_gid = paddle.gather_nd(\n                    enc_outputs_coord_unact_gid, topk_ind_gid)  # unsigmoided.\n                enc_topk_bboxes_gid = F.sigmoid(topk_coords_unact_gid)\n                reference_points_gid = enc_topk_bboxes_gid.detach()\n                enc_topk_logits_gid = paddle.gather_nd(enc_outputs_class_gid,\n                                                       topk_ind_gid)\n\n                # append and combine\n                topk_ind_groups.append(topk_ind_gid)\n                enc_topk_logits_groups.append(enc_topk_logits_gid)\n                enc_topk_bboxes_groups.append(enc_topk_bboxes_gid)\n                reference_points_groups.append(reference_points_gid)\n\n            enc_topk_bboxes = paddle.concat(\n                [enc_topk_bboxes] + enc_topk_bboxes_groups, 1)\n            enc_topk_logits = paddle.concat(\n                [enc_topk_logits] + enc_topk_logits_groups, 1)\n            reference_points = paddle.concat(\n                [reference_points] + reference_points_groups, 1)\n            topk_ind = paddle.concat([topk_ind] + topk_ind_groups, 1)\n\n        # extract region features\n        if self.learnt_init_query:\n            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])\n            if self.dual_queries:\n                target = paddle.concat([target] + [\n                    self.tgt_embed_dual[g_id].weight.unsqueeze(0).tile(\n                        [bs, 1, 1]) for g_id in range(self.dual_groups)\n                ], 1)\n        else:\n            if self.dual_queries:\n                target = paddle.gather_nd(output_memory[0], topk_ind)\n                target_groups = []\n                for g_id in range(self.dual_groups):\n                    target_gid = paddle.gather_nd(output_memory[g_id + 1],\n                                                  topk_ind_groups[g_id])\n                    target_groups.append(target_gid)\n                target = paddle.concat([target] + target_groups, 1).detach()\n            else:\n                target = paddle.gather_nd(output_memory, topk_ind).detach()\n\n        if denoising_bbox is not None:\n            if isinstance(denoising_bbox, list) and isinstance(\n                    
denoising_class, list) and self.dual_queries:\n                if denoising_bbox[0] is not None:\n                    reference_points_list = paddle.split(\n                        reference_points, self.dual_groups + 1, axis=1)\n                    reference_points = paddle.concat(\n                        [\n                            paddle.concat(\n                                [ref, ref_], axis=1)\n                            for ref, ref_ in zip(denoising_bbox,\n                                                 reference_points_list)\n                        ],\n                        axis=1)\n\n                    target_list = paddle.split(\n                        target, self.dual_groups + 1, axis=1)\n                    target = paddle.concat(\n                        [\n                            paddle.concat(\n                                [tgt, tgt_], axis=1)\n                            for tgt, tgt_ in zip(denoising_class, target_list)\n                        ],\n                        axis=1)\n                else:\n                    reference_points, target = reference_points, target\n            else:\n                reference_points = paddle.concat(\n                    [denoising_bbox, reference_points], 1)\n                target = paddle.concat([denoising_class, target], 1)\n\n        return target, reference_points, enc_topk_bboxes, enc_topk_logits\n"
  },
  {
    "path": "ppdet/modeling/transformers/hybrid_encoder.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom ppdet.core.workspace import register, serializable\nfrom ppdet.modeling.ops import get_act_fn\nfrom ..shape_spec import ShapeSpec\nfrom ..backbones.csp_darknet import BaseConv\nfrom ..backbones.cspresnet import RepVggBlock\nfrom ppdet.modeling.transformers.detr_transformer import TransformerEncoder\nfrom ..initializer import xavier_uniform_, linear_init_\nfrom ..layers import MultiHeadAttention\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\n\n__all__ = ['HybridEncoder', 'MaskHybridEncoder']\n\n\nclass CSPRepLayer(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 num_blocks=3,\n                 expansion=1.0,\n                 bias=False,\n                 act=\"silu\"):\n        super(CSPRepLayer, self).__init__()\n        hidden_channels = int(out_channels * expansion)\n        self.conv1 = BaseConv(\n            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)\n        self.conv2 = BaseConv(\n            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)\n        self.bottlenecks = nn.Sequential(* [\n            RepVggBlock(\n                hidden_channels, hidden_channels, act=act)\n            for _ in range(num_blocks)\n        ])\n        if hidden_channels != out_channels:\n            self.conv3 = BaseConv(\n                hidden_channels,\n                out_channels,\n                ksize=1,\n                stride=1,\n                bias=bias,\n                act=act)\n        else:\n            self.conv3 = nn.Identity()\n\n    def forward(self, x):\n        x_1 = self.conv1(x)\n        x_1 = self.bottlenecks(x_1)\n        x_2 = self.conv2(x)\n        return self.conv3(x_1 + x_2)\n\n\n@register\nclass TransformerLayer(nn.Layer):\n    def __init__(self,\n                 d_model,\n                 nhead,\n                 dim_feedforward=1024,\n                 dropout=0.,\n                 activation=\"relu\",\n                 attn_dropout=None,\n                 act_dropout=None,\n                 normalize_before=False):\n        super(TransformerLayer, self).__init__()\n        attn_dropout = dropout if attn_dropout is None else attn_dropout\n        act_dropout = dropout if act_dropout is None else act_dropout\n        self.normalize_before = normalize_before\n\n        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)\n        # Implementation of Feedforward model\n        self.linear1 = nn.Linear(d_model, dim_feedforward)\n        self.dropout = nn.Dropout(act_dropout, mode=\"upscale_in_train\")\n        self.linear2 = nn.Linear(dim_feedforward, d_model)\n\n        self.norm1 = nn.LayerNorm(d_model)\n        self.norm2 = nn.LayerNorm(d_model)\n        self.dropout1 = nn.Dropout(dropout, 
mode=\"upscale_in_train\")\n        self.dropout2 = nn.Dropout(dropout, mode=\"upscale_in_train\")\n        self.activation = getattr(F, activation)\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        linear_init_(self.linear1)\n        linear_init_(self.linear2)\n\n    @staticmethod\n    def with_pos_embed(tensor, pos_embed):\n        return tensor if pos_embed is None else tensor + pos_embed\n\n    def forward(self, src, src_mask=None, pos_embed=None):\n        residual = src\n        if self.normalize_before:\n            src = self.norm1(src)\n        q = k = self.with_pos_embed(src, pos_embed)\n        src = self.self_attn(q, k, value=src, attn_mask=src_mask)\n\n        src = residual + self.dropout1(src)\n        if not self.normalize_before:\n            src = self.norm1(src)\n\n        residual = src\n        if self.normalize_before:\n            src = self.norm2(src)\n        src = self.linear2(self.dropout(self.activation(self.linear1(src))))\n        src = residual + self.dropout2(src)\n        if not self.normalize_before:\n            src = self.norm2(src)\n        return src\n\n\n@register\n@serializable\nclass HybridEncoder(nn.Layer):\n    __shared__ = ['depth_mult', 'act', 'trt', 'eval_size']\n    __inject__ = ['encoder_layer']\n\n    def __init__(self,\n                 in_channels=[512, 1024, 2048],\n                 feat_strides=[8, 16, 32],\n                 hidden_dim=256,\n                 use_encoder_idx=[2],\n                 num_encoder_layers=1,\n                 encoder_layer='TransformerLayer',\n                 pe_temperature=10000,\n                 expansion=1.0,\n                 depth_mult=1.0,\n                 act='silu',\n                 trt=False,\n                 eval_size=None):\n        super(HybridEncoder, self).__init__()\n        self.in_channels = in_channels\n        self.feat_strides = feat_strides\n        self.hidden_dim = hidden_dim\n        self.use_encoder_idx = use_encoder_idx\n        self.num_encoder_layers = num_encoder_layers\n        self.pe_temperature = pe_temperature\n        self.eval_size = eval_size\n\n        # channel projection\n        self.input_proj = nn.LayerList()\n        for in_channel in in_channels:\n            self.input_proj.append(\n                nn.Sequential(\n                    nn.Conv2D(\n                        in_channel, hidden_dim, kernel_size=1, bias_attr=False),\n                    nn.BatchNorm2D(\n                        hidden_dim,\n                        weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                        bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))\n        # encoder transformer\n        self.encoder = nn.LayerList([\n            TransformerEncoder(encoder_layer, num_encoder_layers)\n            for _ in range(len(use_encoder_idx))\n        ])\n\n        act = get_act_fn(\n            act, trt=trt) if act is None or isinstance(act,\n                                                       (str, dict)) else act\n        # top-down fpn\n        self.lateral_convs = nn.LayerList()\n        self.fpn_blocks = nn.LayerList()\n        for idx in range(len(in_channels) - 1, 0, -1):\n            self.lateral_convs.append(\n                BaseConv(\n                    hidden_dim, hidden_dim, 1, 1, act=act))\n            self.fpn_blocks.append(\n                CSPRepLayer(\n                    hidden_dim * 2,\n                    hidden_dim,\n                    round(3 * depth_mult),\n                    act=act,\n                    
expansion=expansion))\n\n        # bottom-up pan\n        self.downsample_convs = nn.LayerList()\n        self.pan_blocks = nn.LayerList()\n        for idx in range(len(in_channels) - 1):\n            self.downsample_convs.append(\n                BaseConv(\n                    hidden_dim, hidden_dim, 3, stride=2, act=act))\n            self.pan_blocks.append(\n                CSPRepLayer(\n                    hidden_dim * 2,\n                    hidden_dim,\n                    round(3 * depth_mult),\n                    act=act,\n                    expansion=expansion))\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        if self.eval_size:\n            for idx in self.use_encoder_idx:\n                stride = self.feat_strides[idx]\n                pos_embed = self.build_2d_sincos_position_embedding(\n                    self.eval_size[1] // stride, self.eval_size[0] // stride,\n                    self.hidden_dim, self.pe_temperature)\n                setattr(self, f'pos_embed{idx}', pos_embed)\n\n    @staticmethod\n    def build_2d_sincos_position_embedding(w,\n                                           h,\n                                           embed_dim=256,\n                                           temperature=10000.):\n        grid_w = paddle.arange(int(w), dtype=paddle.float32)\n        grid_h = paddle.arange(int(h), dtype=paddle.float32)\n        grid_w, grid_h = paddle.meshgrid(grid_w, grid_h)\n        assert embed_dim % 4 == 0, \\\n            'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'\n        pos_dim = embed_dim // 4\n        omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim\n        omega = 1. / (temperature**omega)\n\n        out_w = grid_w.flatten()[..., None] @omega[None]\n        out_h = grid_h.flatten()[..., None] @omega[None]\n\n        return paddle.concat(\n            [\n                paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),\n                paddle.cos(out_h)\n            ],\n            axis=1)[None, :, :]\n\n    def forward(self, feats, for_mot=False, is_teacher=False):\n        assert len(feats) == len(self.in_channels)\n        # get projection features\n        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]\n        # encoder\n        if self.num_encoder_layers > 0:\n            for i, enc_ind in enumerate(self.use_encoder_idx):\n                h, w = proj_feats[enc_ind].shape[2:]\n                # flatten [B, C, H, W] to [B, HxW, C]\n                src_flatten = proj_feats[enc_ind].flatten(2).transpose(\n                    [0, 2, 1])\n                if self.training or self.eval_size is None or is_teacher:\n                    pos_embed = self.build_2d_sincos_position_embedding(\n                        w, h, self.hidden_dim, self.pe_temperature)\n                else:\n                    pos_embed = getattr(self, f'pos_embed{enc_ind}', None)\n                memory = self.encoder[i](src_flatten, pos_embed=pos_embed)\n                proj_feats[enc_ind] = memory.transpose([0, 2, 1]).reshape(\n                    [-1, self.hidden_dim, h, w])\n\n        # top-down fpn\n        inner_outs = [proj_feats[-1]]\n        for idx in range(len(self.in_channels) - 1, 0, -1):\n            feat_height = inner_outs[0]\n            feat_low = proj_feats[idx - 1]\n            feat_height = self.lateral_convs[len(self.in_channels) - 1 - idx](\n                feat_height)\n            inner_outs[0] = feat_height\n            upsample_feat = 
F.interpolate(\n                feat_height, scale_factor=2., mode=\"nearest\")\n            inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](\n                paddle.concat(\n                    [upsample_feat, feat_low], axis=1))\n            inner_outs.insert(0, inner_out)\n\n        # bottom-up pan\n        outs = [inner_outs[0]]\n        for idx in range(len(self.in_channels) - 1):\n            feat_low = outs[-1]\n            feat_height = inner_outs[idx + 1]\n            downsample_feat = self.downsample_convs[idx](feat_low)\n            out = self.pan_blocks[idx](paddle.concat(\n                [downsample_feat, feat_height], axis=1))\n            outs.append(out)\n\n        return outs\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {\n            'in_channels': [i.channels for i in input_shape],\n            'feat_strides': [i.stride for i in input_shape]\n        }\n\n    @property\n    def out_shape(self):\n        return [\n            ShapeSpec(\n                channels=self.hidden_dim, stride=self.feat_strides[idx])\n            for idx in range(len(self.in_channels))\n        ]\n\n\nclass MaskFeatFPN(nn.Layer):\n    def __init__(self,\n                 in_channels=[256, 256, 256],\n                 fpn_strides=[32, 16, 8],\n                 feat_channels=256,\n                 dropout_ratio=0.0,\n                 out_channels=256,\n                 align_corners=False,\n                 act='swish'):\n        super(MaskFeatFPN, self).__init__()\n        assert len(in_channels) == len(fpn_strides)\n        reorder_index = np.argsort(fpn_strides, axis=0)\n        in_channels = [in_channels[i] for i in reorder_index]\n        fpn_strides = [fpn_strides[i] for i in reorder_index]\n        assert min(fpn_strides) == fpn_strides[0]\n        self.reorder_index = reorder_index\n        self.fpn_strides = fpn_strides\n        self.dropout_ratio = dropout_ratio\n        self.align_corners = align_corners\n        if self.dropout_ratio > 0:\n            self.dropout = nn.Dropout2D(dropout_ratio)\n\n        self.scale_heads = nn.LayerList()\n        for i in range(len(fpn_strides)):\n            head_length = max(\n                1, int(np.log2(fpn_strides[i]) - np.log2(fpn_strides[0])))\n            scale_head = []\n            for k in range(head_length):\n                in_c = in_channels[i] if k == 0 else feat_channels\n                scale_head.append(\n                    nn.Sequential(\n                        BaseConv(in_c, feat_channels, 3, 1, act=act))\n                )\n                if fpn_strides[i] != fpn_strides[0]:\n                    scale_head.append(\n                        nn.Upsample(\n                            scale_factor=2,\n                            mode='bilinear',\n                            align_corners=align_corners))\n\n            self.scale_heads.append(nn.Sequential(*scale_head))\n\n        self.output_conv = BaseConv(\n            feat_channels, out_channels, 3, 1, act=act)\n\n    def forward(self, inputs):\n        x = [inputs[i] for i in self.reorder_index]\n\n        output = self.scale_heads[0](x[0])\n        for i in range(1, len(self.fpn_strides)):\n            output = output + F.interpolate(\n                self.scale_heads[i](x[i]),\n                size=output.shape[2:],\n                mode='bilinear',\n                align_corners=self.align_corners)\n\n        if self.dropout_ratio > 0:\n            output = self.dropout(output)\n        output = self.output_conv(output)\n       
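 # fused single-scale mask feature at the finest FPN stride\n       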
 return output\n\n\n@register\n@serializable\nclass MaskHybridEncoder(HybridEncoder):\n    __shared__ = ['depth_mult', 'act', 'trt', 'eval_size', 'num_prototypes']\n    __inject__ = ['encoder_layer']\n\n    def __init__(self,\n                 in_channels=[256, 512, 1024, 2048],\n                 feat_strides=[4, 8, 16, 32],\n                 hidden_dim=256,\n                 use_encoder_idx=[3],\n                 num_encoder_layers=1,\n                 encoder_layer='TransformerLayer',\n                 num_prototypes=32,\n                 pe_temperature=10000,\n                 expansion=1.0,\n                 depth_mult=1.0,\n                 mask_feat_channels=[64, 64],\n                 act='silu',\n                 trt=False,\n                 eval_size=None):\n        assert len(in_channels) == len(feat_strides)\n        x4_feat_dim = in_channels.pop(0)\n        x4_feat_stride = feat_strides.pop(0)\n        use_encoder_idx = [i - 1 for i in use_encoder_idx]\n        assert x4_feat_stride == 4\n\n        super(MaskHybridEncoder, self).__init__(\n            in_channels=in_channels,\n            feat_strides=feat_strides,\n            hidden_dim=hidden_dim,\n            use_encoder_idx=use_encoder_idx,\n            num_encoder_layers=num_encoder_layers,\n            encoder_layer=encoder_layer,\n            pe_temperature=pe_temperature,\n            expansion=expansion,\n            depth_mult=depth_mult,\n            act=act,\n            trt=trt,\n            eval_size=eval_size)\n\n        self.mask_feat_head = MaskFeatFPN(\n            [hidden_dim] * len(feat_strides),\n            feat_strides,\n            feat_channels=mask_feat_channels[0],\n            out_channels=mask_feat_channels[1],\n            act=act)\n        self.enc_mask_lateral = BaseConv(\n            x4_feat_dim, mask_feat_channels[1], 3, 1, act=act)\n        self.enc_mask_output = nn.Sequential(\n            BaseConv(\n                mask_feat_channels[1],\n                mask_feat_channels[1], 3, 1, act=act),\n            nn.Conv2D(mask_feat_channels[1], num_prototypes, 1))\n\n    def forward(self, feats, for_mot=False, is_teacher=False):\n        x4_feat = feats.pop(0)\n\n        enc_feats = super(MaskHybridEncoder, self).forward(\n            feats, for_mot=for_mot, is_teacher=is_teacher)\n\n        mask_feat = self.mask_feat_head(enc_feats)\n        mask_feat = F.interpolate(\n            mask_feat,\n            scale_factor=2,\n            mode='bilinear',\n            align_corners=False)\n        mask_feat += self.enc_mask_lateral(x4_feat)\n        mask_feat = self.enc_mask_output(mask_feat)\n\n        return enc_feats, mask_feat\n"
  },
  {
    "path": "ppdet/modeling/transformers/mask_dino_transformer.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# Modified from detrex (https://github.com/IDEA-Research/detrex)\n# Copyright 2022 The IDEA Authors. All rights reserved.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport math\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\n\nfrom ppdet.core.workspace import register\nfrom .position_encoding import PositionEmbedding\nfrom ..heads.detr_head import MLP\nfrom .deformable_transformer import (DeformableTransformerEncoderLayer,\n                                     DeformableTransformerEncoder)\nfrom .dino_transformer import (DINOTransformerDecoderLayer)\nfrom ..initializer import (linear_init_, constant_, xavier_uniform_,\n                           bias_init_with_prob)\nfrom .utils import (_get_clones, get_valid_ratio, get_denoising_training_group,\n                    get_sine_pos_embed, inverse_sigmoid, mask_to_box_coordinate)\n\n__all__ = ['MaskDINO']\n\n\nclass ConvGNBlock(nn.Layer):\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size,\n                 stride=1,\n                 groups=1,\n                 num_groups=32,\n                 bias=False,\n                 act=None):\n        super(ConvGNBlock, self).__init__()\n        self.conv = nn.Conv2D(\n            in_channels,\n            out_channels,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=(kernel_size - 1) // 2,\n            groups=groups,\n            bias_attr=bias)\n        self.norm = nn.GroupNorm(\n            num_groups,\n            out_channels,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n        self.act = getattr(F, act) if act is not None else None\n\n        self._init_weights()\n\n    def _init_weights(self):\n        xavier_uniform_(self.conv.weight)\n\n    def forward(self, x):\n        x = self.norm(self.conv(x))\n        if self.act is not None:\n            x = self.act(x)\n        return x\n\n\nclass MaskDINOTransformerDecoder(nn.Layer):\n    def __init__(self, hidden_dim, decoder_layer, num_layers):\n        super(MaskDINOTransformerDecoder, self).__init__()\n        self.layers = _get_clones(decoder_layer, num_layers)\n        self.hidden_dim = hidden_dim\n        self.num_layers = num_layers\n\n    def forward(self,\n                tgt,\n                ref_points_unact,\n                memory,\n                memory_spatial_shapes,\n                memory_level_start_index,\n                bbox_head,\n                query_pos_head,\n                dec_norm,\n                
valid_ratios=None,\n                attn_mask=None,\n                memory_mask=None):\n        if valid_ratios is None:\n            valid_ratios = paddle.ones(\n                [memory.shape[0], memory_spatial_shapes.shape[0], 2])\n\n        output = tgt\n        intermediate = []\n        inter_bboxes = []\n        ref_points = F.sigmoid(ref_points_unact)\n        for i, layer in enumerate(self.layers):\n            reference_points_input = ref_points.detach().unsqueeze(\n                2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1)\n            query_pos_embed = get_sine_pos_embed(\n                reference_points_input[..., 0, :], self.hidden_dim // 2)\n            query_pos_embed = query_pos_head(query_pos_embed)\n\n            output = layer(output, reference_points_input, memory,\n                           memory_spatial_shapes, memory_level_start_index,\n                           attn_mask, memory_mask, query_pos_embed)\n\n            ref_points = F.sigmoid(\n                bbox_head(output) + inverse_sigmoid(ref_points.detach()))\n\n            intermediate.append(dec_norm(output))\n            inter_bboxes.append(ref_points)\n\n        return paddle.stack(intermediate), paddle.stack(inter_bboxes)\n\n\n@register\nclass MaskDINO(nn.Layer):\n    __shared__ = ['num_classes', 'hidden_dim']\n\n    def __init__(self,\n                 num_classes=80,\n                 hidden_dim=256,\n                 num_queries=300,\n                 position_embed_type='sine',\n                 in_feats_channel=[256, 512, 1024, 2048],\n                 num_levels=3,\n                 num_encoder_points=4,\n                 num_decoder_points=4,\n                 nhead=8,\n                 num_encoder_layers=6,\n                 num_decoder_layers=9,\n                 enc_dim_feedforward=1024,\n                 dec_dim_feedforward=2048,\n                 dropout=0.,\n                 activation=\"relu\",\n                 lr_mult=1.0,\n                 pe_temperature=10000,\n                 pe_offset=-0.5,\n                 num_denoising=100,\n                 label_noise_ratio=0.4,\n                 box_noise_scale=0.4,\n                 learnt_init_query=False,\n                 mask_enhanced=True,\n                 eps=1e-2):\n        super(MaskDINO, self).__init__()\n        assert position_embed_type in ['sine', 'learned'], \\\n            f'ValueError: position_embed_type not supported {position_embed_type}!'\n        feat0_dim = in_feats_channel.pop(0)\n        assert len(in_feats_channel) <= num_levels\n\n        self.hidden_dim = hidden_dim\n        self.nhead = nhead\n        self.num_levels = num_levels\n        self.num_classes = num_classes\n        self.num_queries = num_queries\n        self.eps = eps\n        self.num_decoder_layers = num_decoder_layers\n        self.mask_enhanced = mask_enhanced\n\n        weight_attr = ParamAttr(regularizer=L2Decay(0.0))\n        bias_attr = ParamAttr(regularizer=L2Decay(0.0))\n        # backbone feature projection\n        self._build_input_proj_layer(in_feats_channel, weight_attr, bias_attr)\n\n        # Transformer module\n        encoder_layer = DeformableTransformerEncoderLayer(\n            hidden_dim, nhead, enc_dim_feedforward, dropout, activation,\n            num_levels, num_encoder_points, lr_mult, weight_attr, bias_attr)\n        self.encoder = DeformableTransformerEncoder(encoder_layer,\n                                                    num_encoder_layers)\n        decoder_layer = DINOTransformerDecoderLayer(\n            
hidden_dim, nhead, dec_dim_feedforward, dropout, activation,\n            num_levels, num_decoder_points, lr_mult, weight_attr, bias_attr)\n        self.decoder = MaskDINOTransformerDecoder(hidden_dim, decoder_layer,\n                                                  num_decoder_layers)\n\n        # denoising part\n        self.denoising_class_embed = nn.Embedding(\n            num_classes,\n            hidden_dim,\n            weight_attr=ParamAttr(initializer=nn.initializer.Normal()))\n        self.num_denoising = num_denoising\n        self.label_noise_ratio = label_noise_ratio\n        self.box_noise_scale = box_noise_scale\n\n        # position embedding\n        self.position_embedding = PositionEmbedding(\n            hidden_dim // 2,\n            temperature=pe_temperature,\n            normalize=True if position_embed_type == 'sine' else False,\n            embed_type=position_embed_type,\n            offset=pe_offset)\n        self.level_embed = nn.Embedding(\n            num_levels,\n            hidden_dim,\n            weight_attr=ParamAttr(initializer=nn.initializer.Normal()))\n        # decoder embedding\n        self.learnt_init_query = learnt_init_query\n        if learnt_init_query:\n            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)\n        self.query_pos_head = MLP(2 * hidden_dim,\n                                  hidden_dim,\n                                  hidden_dim,\n                                  num_layers=2)\n        # mask embedding\n        self.mask_query_head = MLP(hidden_dim,\n                                   hidden_dim,\n                                   hidden_dim,\n                                   num_layers=3)\n\n        # encoder mask head\n        self.enc_mask_lateral = ConvGNBlock(feat0_dim, hidden_dim, 1)\n        self.enc_mask_output = nn.Sequential(\n            ConvGNBlock(\n                hidden_dim, hidden_dim, 3, act=activation),\n            nn.Conv2D(hidden_dim, hidden_dim, 1))\n        # encoder head\n        self.enc_output = nn.Sequential(\n            nn.Linear(hidden_dim, hidden_dim),\n            nn.LayerNorm(\n                hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr))\n        # decoder norm layer\n        self.dec_norm = nn.LayerNorm(\n            hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)\n        # shared prediction head\n        self.class_head = nn.Linear(hidden_dim, num_classes)\n        self.bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        # class and bbox head init\n        bias_cls = bias_init_with_prob(0.01)\n        linear_init_(self.class_head)\n        constant_(self.class_head.bias, bias_cls)\n        constant_(self.bbox_head.layers[-1].weight)\n        constant_(self.bbox_head.layers[-1].bias)\n\n        xavier_uniform_(self.enc_mask_output[1].weight)\n        linear_init_(self.enc_output[0])\n        xavier_uniform_(self.enc_output[0].weight)\n        if self.learnt_init_query:\n            xavier_uniform_(self.tgt_embed.weight)\n        xavier_uniform_(self.query_pos_head.layers[0].weight)\n        xavier_uniform_(self.query_pos_head.layers[1].weight)\n        for l in self.input_proj:\n            xavier_uniform_(l[0].weight)\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'in_feats_channel': [i.channels for i in input_shape], }\n\n    def _build_input_proj_layer(self,\n                                in_feats_channel,\n                   
             weight_attr=None,\n                                bias_attr=None):\n        self.input_proj = nn.LayerList()\n        for in_channels in in_feats_channel:\n            self.input_proj.append(\n                nn.Sequential(\n                    ('conv', nn.Conv2D(\n                        in_channels, self.hidden_dim, kernel_size=1)), (\n                            'norm', nn.GroupNorm(\n                                32,\n                                self.hidden_dim,\n                                weight_attr=weight_attr,\n                                bias_attr=bias_attr))))\n        in_channels = in_feats_channel[-1]\n        for _ in range(self.num_levels - len(in_feats_channel)):\n            self.input_proj.append(\n                nn.Sequential(\n                    ('conv', nn.Conv2D(\n                        in_channels,\n                        self.hidden_dim,\n                        kernel_size=3,\n                        stride=2,\n                        padding=1)), ('norm', nn.GroupNorm(\n                            32,\n                            self.hidden_dim,\n                            weight_attr=weight_attr,\n                            bias_attr=bias_attr))))\n            in_channels = self.hidden_dim\n\n    def _get_encoder_input(self, feats, pad_mask=None):\n        # get projection features\n        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]\n        if self.num_levels > len(proj_feats):\n            len_srcs = len(proj_feats)\n            for i in range(len_srcs, self.num_levels):\n                if i == len_srcs:\n                    proj_feats.append(self.input_proj[i](feats[-1]))\n                else:\n                    proj_feats.append(self.input_proj[i](proj_feats[-1]))\n\n        # get encoder inputs\n        feat_flatten = []\n        mask_flatten = []\n        lvl_pos_embed_flatten = []\n        spatial_shapes = []\n        valid_ratios = []\n        for i, feat in enumerate(proj_feats):\n            bs, _, h, w = feat.shape\n            spatial_shapes.append(paddle.concat([h, w]))\n            # [b,c,h,w] -> [b,h*w,c]\n            feat_flatten.append(feat.flatten(2).transpose([0, 2, 1]))\n            if pad_mask is not None:\n                mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0]\n            else:\n                mask = paddle.ones([bs, h, w])\n            valid_ratios.append(get_valid_ratio(mask))\n            # [b, h*w, c]\n            pos_embed = self.position_embedding(mask).flatten(1, 2)\n            lvl_pos_embed = pos_embed + self.level_embed.weight[i]\n            lvl_pos_embed_flatten.append(lvl_pos_embed)\n            if pad_mask is not None:\n                # [b, h*w]\n                mask_flatten.append(mask.flatten(1))\n\n        # [b, l, c]\n        feat_flatten = paddle.concat(feat_flatten, 1)\n        # [b, l]\n        mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten,\n                                                                   1)\n        # [b, l, c]\n        lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1)\n        # [num_levels, 2]\n        spatial_shapes = paddle.to_tensor(\n            paddle.stack(spatial_shapes).astype('int64'))\n        # [l], start index of each level\n        level_start_index = paddle.concat([\n            paddle.zeros(\n                [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1]\n        ])\n        # [b, num_levels, 2]\n        valid_ratios = paddle.stack(valid_ratios, 1)\n   
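     # Worked example (hypothetical shapes, for illustration only): with three levels\n        # of spatial_shapes [[80, 80], [40, 40], [20, 20]], level_start_index is\n        # [0, 6400, 8000] and feat_flatten packs l = 6400 + 1600 + 400 = 8400 tokens\n        # per image, matching the concatenation order built above.\n   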
     return (feat_flatten, spatial_shapes, level_start_index, mask_flatten,\n                lvl_pos_embed_flatten, valid_ratios)\n\n    def forward(self, feats, pad_mask=None, gt_meta=None):\n        feat0 = feats.pop(0)\n        # input projection and embedding\n        (feat_flatten, spatial_shapes, level_start_index, mask_flatten,\n         lvl_pos_embed_flatten,\n         valid_ratios) = self._get_encoder_input(feats, pad_mask)\n\n        # encoder\n        memory = self.encoder(feat_flatten, spatial_shapes, level_start_index,\n                              mask_flatten, lvl_pos_embed_flatten, valid_ratios)\n\n        mask_feat = self._get_encoder_mask_feature(feat0, memory,\n                                                   spatial_shapes)\n\n        # prepare denoising training\n        if self.training:\n            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \\\n                get_denoising_training_group(gt_meta,\n                                            self.num_classes,\n                                            self.num_queries,\n                                            self.denoising_class_embed.weight,\n                                            self.num_denoising,\n                                            self.label_noise_ratio,\n                                            self.box_noise_scale)\n        else:\n            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None\n\n        target, init_ref_points_unact, enc_out, init_out = \\\n            self._get_decoder_input(\n            memory, mask_feat, spatial_shapes, mask_flatten, denoising_class,\n            denoising_bbox_unact)\n\n        # decoder\n        inter_feats, inter_bboxes = self.decoder(\n            target, init_ref_points_unact, memory, spatial_shapes,\n            level_start_index, self.bbox_head, self.query_pos_head,\n            self.dec_norm, valid_ratios, attn_mask, mask_flatten)\n\n        out_logits = []\n        out_bboxes = []\n        out_masks = []\n        for i in range(self.num_decoder_layers):\n            if self.training or i == self.num_decoder_layers - 1:\n                logits_, masks_ = self._get_pred_class_and_mask(inter_feats[i],\n                                                                mask_feat)\n            else:\n                continue\n            out_logits.append(logits_)\n            out_masks.append(masks_)\n            if i == 0:\n                out_bboxes.append(\n                    F.sigmoid(\n                        self.bbox_head(inter_feats[i]) + init_ref_points_unact))\n            else:\n                out_bboxes.append(\n                    F.sigmoid(\n                        self.bbox_head(inter_feats[i]) + inverse_sigmoid(\n                            inter_bboxes[i - 1])))\n        out_bboxes = paddle.stack(out_bboxes)\n        out_logits = paddle.stack(out_logits)\n        out_masks = paddle.stack(out_masks)\n\n        return (out_logits, out_bboxes, out_masks, enc_out, init_out, dn_meta)\n\n    def _get_encoder_mask_feature(self, in_feat, memory, spatial_shapes):\n        memory_feat0 = memory.split(\n            spatial_shapes.prod(1).split(self.num_levels), axis=1)[0]\n        h, w = spatial_shapes[0]\n        memory_feat0 = memory_feat0.reshape(\n            [0, h, w, self.hidden_dim]).transpose([0, 3, 1, 2])\n        out = self.enc_mask_lateral(in_feat) + F.interpolate(\n            memory_feat0,\n            scale_factor=2.0,\n            mode='bilinear',\n            
align_corners=False)\n        return self.enc_mask_output(out)\n\n    def _get_encoder_output_anchors(self,\n                                    memory,\n                                    spatial_shapes,\n                                    memory_mask=None,\n                                    grid_size=0.05):\n        output_anchors = []\n        idx = 0\n        for lvl, (h, w) in enumerate(spatial_shapes):\n            if memory_mask is not None:\n                mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w])\n                valid_H = paddle.sum(mask_[:, :, 0], 1)\n                valid_W = paddle.sum(mask_[:, 0, :], 1)\n            else:\n                valid_H, valid_W = h, w\n\n            grid_y, grid_x = paddle.meshgrid(\n                paddle.arange(end=h), paddle.arange(end=w))\n            grid_xy = paddle.stack([grid_x, grid_y], -1).astype(memory.dtype)\n\n            valid_WH = paddle.stack([valid_W, valid_H], -1).reshape(\n                [-1, 1, 1, 2]).astype(grid_xy.dtype)\n            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH\n            wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl)\n            output_anchors.append(\n                paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))\n            idx += h * w\n\n        output_anchors = paddle.concat(output_anchors, 1)\n        valid_mask = ((output_anchors > self.eps) *\n                      (output_anchors < 1 - self.eps)).all(-1, keepdim=True)\n        output_anchors = paddle.log(output_anchors / (1 - output_anchors))\n        if memory_mask is not None:\n            valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0\n        output_anchors = paddle.where(valid_mask, output_anchors,\n                                      paddle.to_tensor(float(\"inf\")))\n\n        memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))\n        output_memory = self.enc_output(memory)\n        return output_memory, output_anchors\n\n    def _get_decoder_input(self,\n                           memory,\n                           mask_feat,\n                           spatial_shapes,\n                           memory_mask=None,\n                           denoising_class=None,\n                           denoising_bbox_unact=None):\n        # prepare input for decoder\n        bs, _, _ = memory.shape\n        output_memory, output_anchors = self._get_encoder_output_anchors(\n            memory, spatial_shapes, memory_mask)\n        enc_logits_unact = self.class_head(output_memory)\n        enc_bboxes_unact = self.bbox_head(output_memory) + output_anchors\n\n        # get topk index\n        _, topk_ind = paddle.topk(\n            enc_logits_unact.max(-1), self.num_queries, axis=1)\n        batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype)\n        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])\n        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)\n\n        # extract content and position query embedding\n        target = paddle.gather_nd(output_memory, topk_ind)\n        reference_points_unact = paddle.gather_nd(enc_bboxes_unact,\n                                                  topk_ind)  # unsigmoided.\n        # get encoder output: {logits, bboxes, masks}\n        enc_out_logits, enc_out_masks = self._get_pred_class_and_mask(target,\n                                                                      mask_feat)\n        enc_out_bboxes = F.sigmoid(reference_points_unact)\n        enc_out = (enc_out_logits, enc_out_bboxes, 
enc_out_masks)\n\n        # concat denoising query\n        if self.learnt_init_query:\n            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])\n        else:\n            target = target.detach()\n        if denoising_class is not None:\n            target = paddle.concat([denoising_class, target], 1)\n        if self.mask_enhanced:\n            # use mask-enhanced anchor box initialization\n            reference_points = mask_to_box_coordinate(\n                enc_out_masks > 0, normalize=True, format=\"xywh\")\n            reference_points_unact = inverse_sigmoid(reference_points)\n        if denoising_bbox_unact is not None:\n            reference_points_unact = paddle.concat(\n                [denoising_bbox_unact, reference_points_unact], 1)\n\n        # direct prediction from the matching and denoising part in the beginning\n        if self.training and denoising_class is not None:\n            init_out_logits, init_out_masks = self._get_pred_class_and_mask(\n                target, mask_feat)\n            init_out_bboxes = F.sigmoid(reference_points_unact)\n            init_out = (init_out_logits, init_out_bboxes, init_out_masks)\n        else:\n            init_out = None\n\n        return target, reference_points_unact.detach(), enc_out, init_out\n\n    def _get_pred_class_and_mask(self, query_embed, mask_feat):\n        out_query = self.dec_norm(query_embed)\n        out_logits = self.class_head(out_query)\n        mask_query_embed = self.mask_query_head(out_query)\n        _, _, h, w = mask_feat.shape\n        # [b, q, c] x [b, c, h, w] -> [b, q, h, w]\n        out_mask = paddle.bmm(mask_query_embed, mask_feat.flatten(2)).reshape(\n            [0, 0, h, w])\n        return out_logits, out_mask\n"
  },
  {
    "path": "ppdet/modeling/transformers/mask_rtdetr_transformer.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\n\nfrom ppdet.core.workspace import register\nfrom .rtdetr_transformer import TransformerDecoderLayer\nfrom .utils import (_get_clones, inverse_sigmoid, get_denoising_training_group,\n                    mask_to_box_coordinate)\nfrom ..heads.detr_head import MLP\nfrom ..initializer import (linear_init_, constant_, xavier_uniform_, bias_init_with_prob)\n\n__all__ = ['MaskRTDETR']\n\n\ndef _get_pred_class_and_mask(query_embed,\n                             mask_feat,\n                             dec_norm,\n                             score_head,\n                             mask_query_head):\n    out_query = dec_norm(query_embed)\n    out_logits = score_head(out_query)\n    mask_query_embed = mask_query_head(out_query)\n    batch_size, mask_dim, _ = mask_query_embed.shape\n    _, _, mask_h, mask_w = mask_feat.shape\n    out_mask = paddle.bmm(\n        mask_query_embed, mask_feat.flatten(2)).reshape(\n        [batch_size, mask_dim, mask_h, mask_w])\n    return out_logits, out_mask\n\n\nclass MaskTransformerDecoder(nn.Layer):\n    def __init__(self,\n                 hidden_dim,\n                 decoder_layer,\n                 num_layers,\n                 eval_idx=-1,\n                 eval_topk=100):\n        super(MaskTransformerDecoder, self).__init__()\n        self.layers = _get_clones(decoder_layer, num_layers)\n        self.hidden_dim = hidden_dim\n        self.num_layers = num_layers\n        self.eval_idx = eval_idx if eval_idx >= 0 \\\n            else num_layers + eval_idx\n        self.eval_topk = eval_topk\n\n    def forward(self,\n                tgt,\n                ref_points_unact,\n                memory,\n                memory_spatial_shapes,\n                memory_level_start_index,\n                mask_feat,\n                bbox_head,\n                score_head,\n                query_pos_head,\n                mask_query_head,\n                dec_norm,\n                attn_mask=None,\n                memory_mask=None,\n                query_pos_head_inv_sig=False):\n        output = tgt\n        dec_out_bboxes = []\n        dec_out_logits = []\n        dec_out_masks = []\n        ref_points_detach = F.sigmoid(ref_points_unact)\n        for i, layer in enumerate(self.layers):\n            ref_points_input = ref_points_detach.unsqueeze(2)\n            if not query_pos_head_inv_sig:\n                query_pos_embed = query_pos_head(ref_points_detach)\n            else:\n                query_pos_embed = query_pos_head(\n                    inverse_sigmoid(ref_points_detach))\n\n            output = layer(output, ref_points_input, memory,\n                           
memory_spatial_shapes, memory_level_start_index,\n                           attn_mask, memory_mask, query_pos_embed)\n\n            inter_ref_bbox = F.sigmoid(bbox_head(output) +\n                                       inverse_sigmoid(ref_points_detach))\n\n            if self.training:\n                logits_, masks_ = _get_pred_class_and_mask(\n                    output, mask_feat, dec_norm,\n                    score_head, mask_query_head)\n                dec_out_logits.append(logits_)\n                dec_out_masks.append(masks_)\n                if i == 0:\n                    dec_out_bboxes.append(inter_ref_bbox)\n                else:\n                    dec_out_bboxes.append(\n                        F.sigmoid(bbox_head(output) +\n                                  inverse_sigmoid(ref_points)))\n            elif i == self.eval_idx:\n                logits_, masks_ = _get_pred_class_and_mask(\n                    output, mask_feat, dec_norm,\n                    score_head, mask_query_head)\n                dec_out_logits.append(logits_)\n                dec_out_masks.append(masks_)\n                dec_out_bboxes.append(inter_ref_bbox)\n                return (paddle.stack(dec_out_bboxes),\n                        paddle.stack(dec_out_logits),\n                        paddle.stack(dec_out_masks))\n\n            ref_points = inter_ref_bbox\n            ref_points_detach = inter_ref_bbox.detach(\n            ) if self.training else inter_ref_bbox\n\n        return (paddle.stack(dec_out_bboxes),\n                paddle.stack(dec_out_logits),\n                paddle.stack(dec_out_masks))\n\n\n@register\nclass MaskRTDETR(nn.Layer):\n    __shared__ = ['num_classes', 'hidden_dim', 'eval_size', 'num_prototypes']\n\n    def __init__(self,\n                 num_classes=80,\n                 hidden_dim=256,\n                 num_queries=300,\n                 position_embed_type='sine',\n                 backbone_feat_channels=[512, 1024, 2048],\n                 feat_strides=[8, 16, 32],\n                 num_prototypes=32,\n                 num_levels=3,\n                 num_decoder_points=4,\n                 nhead=8,\n                 num_decoder_layers=6,\n                 dim_feedforward=1024,\n                 dropout=0.,\n                 activation=\"relu\",\n                 num_denoising=100,\n                 label_noise_ratio=0.4,\n                 box_noise_scale=0.4,\n                 learnt_init_query=False,\n                 query_pos_head_inv_sig=False,\n                 mask_enhanced=True,\n                 eval_size=None,\n                 eval_idx=-1,\n                 eps=1e-2):\n        super(MaskRTDETR, self).__init__()\n        assert position_embed_type in ['sine', 'learned'], \\\n            f'ValueError: position_embed_type not supported {position_embed_type}!'\n        assert len(backbone_feat_channels) <= num_levels\n        assert len(feat_strides) == len(backbone_feat_channels)\n        for _ in range(num_levels - len(feat_strides)):\n            feat_strides.append(feat_strides[-1] * 2)\n\n        self.hidden_dim = hidden_dim\n        self.nhead = nhead\n        self.feat_strides = feat_strides\n        self.num_levels = num_levels\n        self.num_classes = num_classes\n        self.num_queries = num_queries\n        self.eps = eps\n        self.num_decoder_layers = num_decoder_layers\n        self.mask_enhanced = mask_enhanced\n        self.eval_size = eval_size\n\n        # backbone feature projection\n        
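# NOTE (descriptive): each backbone level is projected to hidden_dim by a bias-free\n        # 1x1 conv plus BatchNorm whose affine params are excluded from weight decay;\n        # when num_levels > len(backbone_feat_channels), extra pyramid levels are\n        # synthesized with stride-2 3x3 convs (see _build_input_proj_layer below).\n        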
self._build_input_proj_layer(backbone_feat_channels)\n\n        # Transformer module\n        decoder_layer = TransformerDecoderLayer(\n            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,\n            num_decoder_points)\n        self.decoder = MaskTransformerDecoder(hidden_dim, decoder_layer,\n                                              num_decoder_layers, eval_idx)\n\n        # denoising part\n        self.denoising_class_embed = nn.Embedding(\n            num_classes,\n            hidden_dim,\n            weight_attr=ParamAttr(initializer=nn.initializer.Normal()))\n        self.num_denoising = num_denoising\n        self.label_noise_ratio = label_noise_ratio\n        self.box_noise_scale = box_noise_scale\n\n        # decoder embedding\n        self.learnt_init_query = learnt_init_query\n        if learnt_init_query:\n            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)\n        self.query_pos_head = MLP(4, 2 * hidden_dim,\n                                  hidden_dim, num_layers=2)\n        self.query_pos_head_inv_sig = query_pos_head_inv_sig\n\n        # mask embedding\n        self.mask_query_head = MLP(hidden_dim, hidden_dim,\n                                   num_prototypes, num_layers=3)\n\n        # encoder head\n        self.enc_output = nn.Sequential(\n            nn.Linear(hidden_dim, hidden_dim),\n            nn.LayerNorm(\n                hidden_dim,\n                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                bias_attr=ParamAttr(regularizer=L2Decay(0.0))))\n\n        # decoder norm layer\n        self.dec_norm = nn.LayerNorm(\n            hidden_dim,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n\n        # shared prediction head\n        self.score_head = nn.Linear(hidden_dim, num_classes)\n        self.bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        # class and bbox head init\n        bias_cls = bias_init_with_prob(0.01)\n        linear_init_(self.score_head)\n        constant_(self.score_head.bias, bias_cls)\n        constant_(self.bbox_head.layers[-1].weight)\n        constant_(self.bbox_head.layers[-1].bias)\n\n        linear_init_(self.enc_output[0])\n        xavier_uniform_(self.enc_output[0].weight)\n        if self.learnt_init_query:\n            xavier_uniform_(self.tgt_embed.weight)\n        xavier_uniform_(self.query_pos_head.layers[0].weight)\n        xavier_uniform_(self.query_pos_head.layers[1].weight)\n        for l in self.input_proj:\n            xavier_uniform_(l[0].weight)\n\n        # init encoder output anchors and valid_mask\n        if self.eval_size:\n            self.anchors, self.valid_mask = self._generate_anchors()\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'backbone_feat_channels': [i.channels for i in input_shape],\n                'feat_strides': [i.stride for i in input_shape]}\n\n    def _build_input_proj_layer(self, backbone_feat_channels):\n        self.input_proj = nn.LayerList()\n        for in_channels in backbone_feat_channels:\n            self.input_proj.append(\n                nn.Sequential(\n                    ('conv', nn.Conv2D(\n                        in_channels,\n                        self.hidden_dim,\n                        kernel_size=1,\n                        bias_attr=False)),\n                    ('norm', nn.BatchNorm2D(\n                        
self.hidden_dim,\n                        weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                        bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))\n        in_channels = backbone_feat_channels[-1]\n        for _ in range(self.num_levels - len(backbone_feat_channels)):\n            self.input_proj.append(\n                nn.Sequential(\n                    ('conv', nn.Conv2D(\n                        in_channels,\n                        self.hidden_dim,\n                        kernel_size=3,\n                        stride=2,\n                        padding=1,\n                        bias_attr=False)),\n                    ('norm', nn.BatchNorm2D(\n                        self.hidden_dim,\n                        weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                        bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))\n            in_channels = self.hidden_dim\n\n    def _get_encoder_input(self, feats):\n        # get projection features\n        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]\n        if self.num_levels > len(proj_feats):\n            len_srcs = len(proj_feats)\n            for i in range(len_srcs, self.num_levels):\n                if i == len_srcs:\n                    proj_feats.append(self.input_proj[i](feats[-1]))\n                else:\n                    proj_feats.append(self.input_proj[i](proj_feats[-1]))\n\n        # get encoder inputs\n        feat_flatten = []\n        spatial_shapes = []\n        level_start_index = [0, ]\n        for i, feat in enumerate(proj_feats):\n            _, _, h, w = feat.shape\n            # [b, c, h, w] -> [b, h*w, c]\n            feat_flatten.append(feat.flatten(2).transpose([0, 2, 1]))\n            # [num_levels, 2]\n            spatial_shapes.append([h, w])\n            # [l], start index of each level\n            level_start_index.append(h * w + level_start_index[-1])\n\n        # [b, l, c]\n        feat_flatten = paddle.concat(feat_flatten, 1)\n        level_start_index.pop()\n        return feat_flatten, spatial_shapes, level_start_index\n\n    def forward(self, feats, pad_mask=None, gt_meta=None, is_teacher=False):\n        enc_feats, mask_feat = feats\n        # input projection and embedding\n        (memory, spatial_shapes,\n         level_start_index) = self._get_encoder_input(enc_feats)\n\n        # prepare denoising training\n        if self.training:\n            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \\\n                get_denoising_training_group(gt_meta,\n                                             self.num_classes,\n                                             self.num_queries,\n                                             self.denoising_class_embed.weight,\n                                             self.num_denoising,\n                                             self.label_noise_ratio,\n                                             self.box_noise_scale)\n        else:\n            denoising_class, denoising_bbox_unact,\\\n                attn_mask, dn_meta = None, None, None, None\n\n        target, init_ref_points_unact, enc_out, init_out = \\\n            self._get_decoder_input(\n                memory, mask_feat, spatial_shapes,\n                denoising_class, denoising_bbox_unact, is_teacher)\n\n        # decoder\n        out_bboxes, out_logits, out_masks = self.decoder(\n            target,\n            init_ref_points_unact,\n            memory,\n            spatial_shapes,\n            level_start_index,\n       
     mask_feat,\n            self.bbox_head,\n            self.score_head,\n            self.query_pos_head,\n            self.mask_query_head,\n            self.dec_norm,\n            attn_mask=attn_mask,\n            memory_mask=None,\n            query_pos_head_inv_sig=self.query_pos_head_inv_sig)\n\n        return out_logits, out_bboxes, out_masks, enc_out, init_out, dn_meta\n\n    def _generate_anchors(self,\n                          spatial_shapes=None,\n                          grid_size=0.05,\n                          dtype=paddle.float32):\n        if spatial_shapes is None:\n            spatial_shapes = [\n                [int(self.eval_size[0] / s), int(self.eval_size[1] / s)]\n                for s in self.feat_strides\n            ]\n        anchors = []\n        for lvl, (h, w) in enumerate(spatial_shapes):\n            grid_y, grid_x = paddle.meshgrid(\n                paddle.arange(\n                    end=h, dtype=dtype),\n                paddle.arange(\n                    end=w, dtype=dtype))\n            grid_xy = paddle.stack([grid_x, grid_y], -1)\n\n            # (w, h) order to match the (x, y) layout of grid_xy\n            valid_WH = paddle.to_tensor([w, h]).astype(dtype)\n            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH\n            wh = paddle.ones_like(grid_xy) * grid_size * (2.0 ** lvl)\n            anchors.append(\n                paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))\n\n        anchors = paddle.concat(anchors, 1)\n        valid_mask = ((anchors > self.eps) *\n                      (anchors < 1 - self.eps)).all(-1, keepdim=True)\n        anchors = paddle.log(anchors / (1 - anchors))\n        anchors = paddle.where(valid_mask, anchors,\n                               paddle.to_tensor(float(\"inf\")))\n        return anchors, valid_mask\n\n    def _get_decoder_input(self,\n                           memory,\n                           mask_feat,\n                           spatial_shapes,\n                           denoising_class=None,\n                           denoising_bbox_unact=None,\n                           is_teacher=False):\n        bs, _, _ = memory.shape\n        # prepare input for decoder\n        if self.training or self.eval_size is None or is_teacher:\n            anchors, valid_mask = self._generate_anchors(spatial_shapes)\n        else:\n            anchors, valid_mask = self.anchors, self.valid_mask\n        memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))\n        output_memory = self.enc_output(memory)\n\n        enc_logits_unact = self.score_head(output_memory)\n        enc_bboxes_unact = self.bbox_head(output_memory) + anchors\n\n        # get topk index\n        _, topk_ind = paddle.topk(\n            enc_logits_unact.max(-1), self.num_queries, axis=1)\n        batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype)\n        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])\n        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)\n\n        # extract content and position query embedding\n        target = paddle.gather_nd(output_memory, topk_ind)\n        reference_points_unact = paddle.gather_nd(enc_bboxes_unact,\n                                                  topk_ind)  # unsigmoided.\n        # get encoder output: {logits, bboxes, masks}\n        enc_out_logits, enc_out_masks = _get_pred_class_and_mask(\n            target, mask_feat, self.dec_norm,\n            self.score_head, self.mask_query_head)\n        enc_out_bboxes = F.sigmoid(reference_points_unact)\n        enc_out = (enc_out_logits, enc_out_bboxes, 
enc_out_masks)\n\n        # concat denoising query\n        if self.learnt_init_query:\n            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])\n        else:\n            target = target.detach()\n        if denoising_class is not None:\n            target = paddle.concat([denoising_class, target], 1)\n        if self.mask_enhanced:\n            # use mask-enhanced anchor box initialization\n            reference_points = mask_to_box_coordinate(\n                enc_out_masks > 0, normalize=True, format=\"xywh\")\n            reference_points_unact = inverse_sigmoid(reference_points)\n        if denoising_bbox_unact is not None:\n            reference_points_unact = paddle.concat(\n                [denoising_bbox_unact, reference_points_unact], 1)\n\n        # direct prediction from the matching and denoising part in the beginning\n        if self.training and denoising_class is not None:\n            init_out_logits, init_out_masks = _get_pred_class_and_mask(\n                target, mask_feat, self.dec_norm,\n                self.score_head, self.mask_query_head)\n            init_out_bboxes = F.sigmoid(reference_points_unact)\n            init_out = (init_out_logits, init_out_bboxes, init_out_masks)\n        else:\n            init_out = None\n\n        return target, reference_points_unact.detach(), enc_out, init_out\n"
  },
  {
    "path": "ppdet/modeling/transformers/matchers.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# Modified from DETR (https://github.com/facebookresearch/detr)\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom scipy.optimize import linear_sum_assignment\n\nfrom ppdet.core.workspace import register, serializable\nfrom ..losses.iou_loss import GIoULoss\nfrom .utils import bbox_cxcywh_to_xyxy\n\n__all__ = ['HungarianMatcher']\n\n\n@register\n@serializable\nclass HungarianMatcher(nn.Layer):\n    __shared__ = ['use_focal_loss', 'with_mask', 'num_sample_points']\n\n    def __init__(self,\n                 matcher_coeff={\n                     'class': 1,\n                     'bbox': 5,\n                     'giou': 2,\n                     'mask': 1,\n                     'dice': 1\n                 },\n                 use_focal_loss=False,\n                 with_mask=False,\n                 num_sample_points=12544,\n                 alpha=0.25,\n                 gamma=2.0):\n        r\"\"\"\n        Args:\n            matcher_coeff (dict): The coefficient of hungarian matcher cost.\n        \"\"\"\n        super(HungarianMatcher, self).__init__()\n        self.matcher_coeff = matcher_coeff\n        self.use_focal_loss = use_focal_loss\n        self.with_mask = with_mask\n        self.num_sample_points = num_sample_points\n        self.alpha = alpha\n        self.gamma = gamma\n\n        self.giou_loss = GIoULoss()\n\n    def forward(self,\n                boxes,\n                logits,\n                gt_bbox,\n                gt_class,\n                masks=None,\n                gt_mask=None):\n        r\"\"\"\n        Args:\n            boxes (Tensor): [b, query, 4]\n            logits (Tensor): [b, query, num_classes]\n            gt_bbox (List(Tensor)): list[[n, 4]]\n            gt_class (List(Tensor)): list[[n, 1]]\n            masks (Tensor|None): [b, query, h, w]\n            gt_mask (List(Tensor)): list[[n, H, W]]\n\n        Returns:\n            A list of size batch_size, containing tuples of (index_i, index_j) where:\n                - index_i is the indices of the selected predictions (in order)\n                - index_j is the indices of the corresponding selected targets (in order)\n            For each batch element, it holds:\n                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)\n        \"\"\"\n        bs, num_queries = boxes.shape[:2]\n\n        num_gts = [len(a) for a in gt_class]\n        if sum(num_gts) == 0:\n            return [(paddle.to_tensor(\n                [], dtype=paddle.int64), paddle.to_tensor(\n                    [], dtype=paddle.int64)) for _ in range(bs)]\n\n        # We flatten to compute the cost matrices in a batch\n        # [batch_size * num_queries, num_classes]\n  
      logits = logits.detach()\n        out_prob = F.sigmoid(logits.flatten(\n            0, 1)) if self.use_focal_loss else F.softmax(logits.flatten(0, 1))\n        # [batch_size * num_queries, 4]\n        out_bbox = boxes.detach().flatten(0, 1)\n\n        # Also concat the target labels and boxes\n        tgt_ids = paddle.concat(gt_class).flatten()\n        tgt_bbox = paddle.concat(gt_bbox)\n\n        # Compute the classification cost\n        out_prob = paddle.gather(out_prob, tgt_ids, axis=1)\n        if self.use_focal_loss:\n            neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-(\n                1 - out_prob + 1e-8).log())\n            pos_cost_class = self.alpha * (\n                (1 - out_prob)**self.gamma) * (-(out_prob + 1e-8).log())\n            cost_class = pos_cost_class - neg_cost_class\n        else:\n            cost_class = -out_prob\n\n        # Compute the L1 cost between boxes\n        cost_bbox = (\n            out_bbox.unsqueeze(1) - tgt_bbox.unsqueeze(0)).abs().sum(-1)\n\n        # Compute the giou cost between boxes\n        giou_loss = self.giou_loss(\n            bbox_cxcywh_to_xyxy(out_bbox.unsqueeze(1)),\n            bbox_cxcywh_to_xyxy(tgt_bbox.unsqueeze(0))).squeeze(-1)\n        cost_giou = giou_loss - 1\n\n        # Final cost matrix\n        C = self.matcher_coeff['class'] * cost_class + \\\n            self.matcher_coeff['bbox'] * cost_bbox + \\\n            self.matcher_coeff['giou'] * cost_giou\n        # Compute the mask cost and dice cost\n        if self.with_mask:\n            assert masks is not None and gt_mask is not None, \\\n                'Make sure the input has `mask` and `gt_mask`'\n            # all masks share the same set of points for efficient matching\n            sample_points = paddle.rand([bs, 1, self.num_sample_points, 2])\n            sample_points = 2.0 * sample_points - 1.0\n\n            out_mask = F.grid_sample(\n                masks.detach(), sample_points, align_corners=False).squeeze(-2)\n            out_mask = out_mask.flatten(0, 1)\n\n            tgt_mask = paddle.concat(gt_mask).unsqueeze(1)\n            sample_points = paddle.concat([\n                a.tile([b, 1, 1, 1]) for a, b in zip(sample_points, num_gts)\n                if b > 0\n            ])\n            tgt_mask = F.grid_sample(\n                tgt_mask, sample_points, align_corners=False).squeeze([1, 2])\n\n            with paddle.amp.auto_cast(enable=False):\n                # binary cross entropy cost\n                pos_cost_mask = F.binary_cross_entropy_with_logits(\n                    out_mask, paddle.ones_like(out_mask), reduction='none')\n                neg_cost_mask = F.binary_cross_entropy_with_logits(\n                    out_mask, paddle.zeros_like(out_mask), reduction='none')\n                cost_mask = paddle.matmul(\n                    pos_cost_mask, tgt_mask, transpose_y=True) + paddle.matmul(\n                        neg_cost_mask, 1 - tgt_mask, transpose_y=True)\n                cost_mask /= self.num_sample_points\n\n                # dice cost\n                out_mask = F.sigmoid(out_mask)\n                numerator = 2 * paddle.matmul(\n                    out_mask, tgt_mask, transpose_y=True)\n                denominator = out_mask.sum(\n                    -1, keepdim=True) + tgt_mask.sum(-1).unsqueeze(0)\n                cost_dice = 1 - (numerator + 1) / (denominator + 1)\n\n                C = C + self.matcher_coeff['mask'] * cost_mask + \\\n                    self.matcher_coeff['dice'] * 
cost_dice\n\n        C = C.reshape([bs, num_queries, -1])\n        C = [a.squeeze(0) for a in C.chunk(bs)]\n        sizes = [a.shape[0] for a in gt_bbox]\n        if hasattr(paddle.Tensor, \"contiguous\"):\n            indices = [\n                linear_sum_assignment(c.split(sizes, -1)[i].contiguous().numpy())\n                for i, c in enumerate(C)\n            ]\n        else:\n            indices = [\n                linear_sum_assignment(c.split(sizes, -1)[i].numpy())\n                for i, c in enumerate(C)\n            ]\n        return [(paddle.to_tensor(\n            i, dtype=paddle.int64), paddle.to_tensor(\n                j, dtype=paddle.int64)) for i, j in indices]\n"
  },
  {
    "path": "ppdet/modeling/transformers/petr_transformer.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nthis code is base on https://github.com/hikvision-research/opera/blob/main/opera/models/utils/transformer.py\n\"\"\"\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport math\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\n\nfrom ppdet.core.workspace import register\nfrom ..layers import MultiHeadAttention, _convert_attention_mask\nfrom .utils import _get_clones\nfrom ..initializer import linear_init_, normal_, constant_, xavier_uniform_\n\n__all__ = [\n    'PETRTransformer', 'MultiScaleDeformablePoseAttention',\n    'PETR_TransformerDecoderLayer', 'PETR_TransformerDecoder',\n    'PETR_DeformableDetrTransformerDecoder',\n    'PETR_DeformableTransformerDecoder', 'TransformerEncoderLayer',\n    'TransformerEncoder', 'MSDeformableAttention'\n]\n\n\ndef masked_fill(x, mask, value):\n    y = paddle.full(x.shape, value, x.dtype)\n    return paddle.where(mask, y, x)\n\n\ndef inverse_sigmoid(x, eps=1e-5):\n    \"\"\"Inverse function of sigmoid.\n\n    Args:\n        x (Tensor): The tensor to do the\n            inverse.\n        eps (float): EPS avoid numerical\n            overflow. 
Defaults to 1e-5.\n    Returns:\n        Tensor: The inverse sigmoid of x, with the\n            same shape as the input.\n    \"\"\"\n    x = x.clip(min=0, max=1)\n    x1 = x.clip(min=eps)\n    x2 = (1 - x).clip(min=eps)\n    return paddle.log(x1 / x2)\n\n\n@register\nclass TransformerEncoderLayer(nn.Layer):\n    __inject__ = ['attn']\n\n    def __init__(self,\n                 d_model,\n                 attn=None,\n                 nhead=8,\n                 dim_feedforward=2048,\n                 dropout=0.1,\n                 activation=\"relu\",\n                 attn_dropout=None,\n                 act_dropout=None,\n                 normalize_before=False):\n        super(TransformerEncoderLayer, self).__init__()\n        attn_dropout = dropout if attn_dropout is None else attn_dropout\n        act_dropout = dropout if act_dropout is None else act_dropout\n        self.normalize_before = normalize_before\n        self.embed_dims = d_model\n\n        if attn is None:\n            self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)\n        else:\n            self.self_attn = attn\n        # Implementation of Feedforward model\n        self.linear1 = nn.Linear(d_model, dim_feedforward)\n        self.dropout = nn.Dropout(act_dropout, mode=\"upscale_in_train\")\n        self.linear2 = nn.Linear(dim_feedforward, d_model)\n\n        self.norm1 = nn.LayerNorm(d_model)\n        self.norm2 = nn.LayerNorm(d_model)\n        self.dropout1 = nn.Dropout(dropout, mode=\"upscale_in_train\")\n        self.dropout2 = nn.Dropout(dropout, mode=\"upscale_in_train\")\n        self.activation = getattr(F, activation)\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        linear_init_(self.linear1)\n        linear_init_(self.linear2)\n\n    @staticmethod\n    def with_pos_embed(tensor, pos_embed):\n        return tensor if pos_embed is None else tensor + pos_embed\n\n    def forward(self, src, src_mask=None, pos_embed=None, **kwargs):\n        residual = src\n        if self.normalize_before:\n            src = self.norm1(src)\n        q = k = self.with_pos_embed(src, pos_embed)\n        src = self.self_attn(q, k, value=src, attn_mask=src_mask, **kwargs)\n\n        src = residual + self.dropout1(src)\n        if not self.normalize_before:\n            src = self.norm1(src)\n\n        residual = src\n        if self.normalize_before:\n            src = self.norm2(src)\n        src = self.linear2(self.dropout(self.activation(self.linear1(src))))\n        src = residual + self.dropout2(src)\n        if not self.normalize_before:\n            src = self.norm2(src)\n        return src\n\n\n@register\nclass TransformerEncoder(nn.Layer):\n    __inject__ = ['encoder_layer']\n\n    def __init__(self, encoder_layer, num_layers, norm=None):\n        super(TransformerEncoder, self).__init__()\n        self.layers = _get_clones(encoder_layer, num_layers)\n        self.num_layers = num_layers\n        self.norm = norm\n        self.embed_dims = encoder_layer.embed_dims\n\n    def forward(self, src, src_mask=None, pos_embed=None, **kwargs):\n        output = src\n        for layer in self.layers:\n            output = layer(\n                output, src_mask=src_mask, pos_embed=pos_embed, **kwargs)\n\n        if self.norm is not None:\n            output = self.norm(output)\n\n        return output\n\n\n@register\nclass MSDeformableAttention(nn.Layer):\n    def __init__(self,\n                 embed_dim=256,\n                 num_heads=8,\n                 
num_levels=4,\n                 num_points=4,\n                 lr_mult=0.1):\n        \"\"\"\n        Multi-Scale Deformable Attention Module\n        \"\"\"\n        super(MSDeformableAttention, self).__init__()\n        self.embed_dim = embed_dim\n        self.num_heads = num_heads\n        self.num_levels = num_levels\n        self.num_points = num_points\n        self.total_points = num_heads * num_levels * num_points\n\n        self.head_dim = embed_dim // num_heads\n        assert self.head_dim * num_heads == self.embed_dim, \"embed_dim must be divisible by num_heads\"\n\n        self.sampling_offsets = nn.Linear(\n            embed_dim,\n            self.total_points * 2,\n            weight_attr=ParamAttr(learning_rate=lr_mult),\n            bias_attr=ParamAttr(learning_rate=lr_mult))\n\n        self.attention_weights = nn.Linear(embed_dim, self.total_points)\n        self.value_proj = nn.Linear(embed_dim, embed_dim)\n        self.output_proj = nn.Linear(embed_dim, embed_dim)\n        try:\n            # use cuda op\n            from deformable_detr_ops import ms_deformable_attn\n            print(\"use deformable_detr_ops in ms_deformable_attn\")\n        except ImportError:\n            # use paddle func\n            from .utils import deformable_attention_core_func as ms_deformable_attn\n        self.ms_deformable_attn_core = ms_deformable_attn\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        # sampling_offsets\n        constant_(self.sampling_offsets.weight)\n        thetas = paddle.arange(\n            self.num_heads,\n            dtype=paddle.float32) * (2.0 * math.pi / self.num_heads)\n        grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1)\n        grid_init = grid_init / grid_init.abs().max(-1, keepdim=True)\n        grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile(\n            [1, self.num_levels, self.num_points, 1])\n        scaling = paddle.arange(\n            1, self.num_points + 1,\n            dtype=paddle.float32).reshape([1, 1, -1, 1])\n        grid_init *= scaling\n        self.sampling_offsets.bias.set_value(grid_init.flatten())\n        # attention_weights\n        constant_(self.attention_weights.weight)\n        constant_(self.attention_weights.bias)\n        # proj\n        xavier_uniform_(self.value_proj.weight)\n        constant_(self.value_proj.bias)\n        xavier_uniform_(self.output_proj.weight)\n        constant_(self.output_proj.bias)\n\n    def forward(self,\n                query,\n                key,\n                value,\n                reference_points,\n                value_spatial_shapes,\n                value_level_start_index,\n                attn_mask=None,\n                **kwargs):\n        \"\"\"\n        Args:\n            query (Tensor): [bs, query_length, C]\n            reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),\n                bottom-right (1, 1), including padding area\n            value (Tensor): [bs, value_length, C]\n            value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]\n            value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...]\n            attn_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements\n\n        Returns:\n            output (Tensor): [bs, Length_{query}, C]\n        \"\"\"\n        bs, Len_q = query.shape[:2]\n        Len_v = value.shape[1]\n        assert 
int(value_spatial_shapes.prod(1).sum()) == Len_v\n\n        value = self.value_proj(value)\n        if attn_mask is not None:\n            attn_mask = attn_mask.astype(value.dtype).unsqueeze(-1)\n            value *= attn_mask\n        value = value.reshape([bs, Len_v, self.num_heads, self.head_dim])\n\n        sampling_offsets = self.sampling_offsets(query).reshape(\n            [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2])\n        attention_weights = self.attention_weights(query).reshape(\n            [bs, Len_q, self.num_heads, self.num_levels * self.num_points])\n        attention_weights = F.softmax(attention_weights).reshape(\n            [bs, Len_q, self.num_heads, self.num_levels, self.num_points])\n\n        if reference_points.shape[-1] == 2:\n            offset_normalizer = value_spatial_shapes.flip([1]).reshape(\n                [1, 1, 1, self.num_levels, 1, 2])\n            sampling_locations = reference_points.reshape([\n                bs, Len_q, 1, self.num_levels, 1, 2\n            ]) + sampling_offsets / offset_normalizer\n        elif reference_points.shape[-1] == 4:\n            sampling_locations = (\n                reference_points[:, :, None, :, None, :2] + sampling_offsets /\n                self.num_points * reference_points[:, :, None, :, None, 2:] *\n                0.5)\n        else:\n            raise ValueError(\n                \"Last dim of reference_points must be 2 or 4, but got {} instead.\".\n                format(reference_points.shape[-1]))\n\n        output = self.ms_deformable_attn_core(\n            value, value_spatial_shapes, value_level_start_index,\n            sampling_locations, attention_weights)\n        output = self.output_proj(output)\n\n        return output\n\n\n@register\nclass MultiScaleDeformablePoseAttention(nn.Layer):\n    \"\"\"An attention module used in PETR. `End-to-End Multi-Person\n    Pose Estimation with Transformers`.\n\n    Args:\n        embed_dims (int): The embedding dimension of Attention.\n            Default: 256.\n        num_heads (int): Parallel attention heads. Default: 8.\n        num_levels (int): The number of feature maps used in\n            Attention. Default: 4.\n        num_points (int): The number of sampling points for\n            each query in each head. 
Default: 17.\n        im2col_step (int): The step used in image_to_column.\n            Default: 64.\n        dropout (float): A Dropout layer on `inp_residual`.\n            Default: 0.1.\n        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.\n            Default: None.\n    \"\"\"\n\n    def __init__(self,\n                 embed_dims=256,\n                 num_heads=8,\n                 num_levels=4,\n                 num_points=17,\n                 im2col_step=64,\n                 dropout=0.1,\n                 norm_cfg=None,\n                 init_cfg=None,\n                 batch_first=False,\n                 lr_mult=0.1):\n        super().__init__()\n        if embed_dims % num_heads != 0:\n            raise ValueError(f'embed_dims must be divisible by num_heads, '\n                             f'but got {embed_dims} and {num_heads}')\n        dim_per_head = embed_dims // num_heads\n        self.norm_cfg = norm_cfg\n        self.init_cfg = init_cfg\n        self.dropout = nn.Dropout(dropout)\n        self.batch_first = batch_first\n\n        # you'd better set dim_per_head to a power of 2\n        # which is more efficient in the CUDA implementation\n        def _is_power_of_2(n):\n            if (not isinstance(n, int)) or (n < 0):\n                raise ValueError(\n                    'invalid input for _is_power_of_2: {} (type: {})'.format(\n                        n, type(n)))\n            return (n & (n - 1) == 0) and n != 0\n\n        if not _is_power_of_2(dim_per_head):\n            warnings.warn(\"You'd better set embed_dims in \"\n                          'MultiScaleDeformAttention to make '\n                          'the dimension of each attention head a power of 2 '\n                          'which is more efficient in our CUDA implementation.')\n\n        self.im2col_step = im2col_step\n        self.embed_dims = embed_dims\n        self.num_levels = num_levels\n        self.num_heads = num_heads\n        self.num_points = num_points\n        self.sampling_offsets = nn.Linear(\n            embed_dims,\n            num_heads * num_levels * num_points * 2,\n            weight_attr=ParamAttr(learning_rate=lr_mult),\n            bias_attr=ParamAttr(learning_rate=lr_mult))\n        self.attention_weights = nn.Linear(embed_dims,\n                                           num_heads * num_levels * num_points)\n        self.value_proj = nn.Linear(embed_dims, embed_dims)\n        self.output_proj = nn.Linear(embed_dims, embed_dims)\n\n        try:\n            # prefer the custom CUDA op when it is available\n            from deformable_detr_ops import ms_deformable_attn\n        except ImportError:\n            # fall back to the pure-Paddle implementation\n            from .utils import deformable_attention_core_func as ms_deformable_attn\n        self.ms_deformable_attn_core = ms_deformable_attn\n\n        self.init_weights()\n\n    def init_weights(self):\n        \"\"\"Default initialization for Parameters of Module.\"\"\"\n        constant_(self.sampling_offsets.weight)\n        constant_(self.sampling_offsets.bias)\n        constant_(self.attention_weights.weight)\n        constant_(self.attention_weights.bias)\n        xavier_uniform_(self.value_proj.weight)\n        constant_(self.value_proj.bias)\n        xavier_uniform_(self.output_proj.weight)\n        constant_(self.output_proj.bias)\n\n    def forward(self,\n                query,\n                key,\n                value,\n                residual=None,\n                attn_mask=None,\n                reference_points=None,\n                
value_spatial_shapes=None,\n                value_level_start_index=None,\n                **kwargs):\n        \"\"\"Forward function of MultiScaleDeformablePoseAttention.\n\n        Args:\n            query (Tensor): Query of Transformer with shape\n                (num_query, bs, embed_dims).\n            key (Tensor): The key tensor with shape (num_key, bs, embed_dims).\n            value (Tensor): The value tensor with shape\n                (num_key, bs, embed_dims).\n            residual (Tensor): The tensor used for addition, with the\n                same shape as `query`. Default None. If None, `query` will\n                be used.\n            reference_points (Tensor): The normalized reference points with\n                shape (bs, num_query, num_levels, K*2), all elements are in\n                range [0, 1], top-left (0,0), bottom-right (1, 1), including\n                padding area.\n            attn_mask (Tensor): ByteTensor for `query`, with\n                shape [bs, num_key].\n            value_spatial_shapes (Tensor): Spatial shape of features in\n                different levels. With shape (num_levels, 2),\n                last dimension represents (h, w).\n            value_level_start_index (Tensor): The start index of each level.\n                A tensor has shape (num_levels) and can be represented\n                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].\n\n        Returns:\n            Tensor: forwarded results with shape [num_query, bs, embed_dims].\n        \"\"\"\n\n        if key is None:\n            key = query\n        if value is None:\n            value = key\n\n        bs, num_query, _ = query.shape\n        bs, num_key, _ = value.shape\n        assert (value_spatial_shapes[:, 0].numpy() *\n                value_spatial_shapes[:, 1].numpy()).sum() == num_key\n\n        value = self.value_proj(value)\n        if attn_mask is not None:\n            # value = value.masked_fill(attn_mask[..., None], 0.0)\n            value *= attn_mask.unsqueeze(-1)\n        value = value.reshape([bs, num_key, self.num_heads, -1])\n        sampling_offsets = self.sampling_offsets(query).reshape([\n            bs, num_query, self.num_heads, self.num_levels, self.num_points, 2\n        ])\n        attention_weights = self.attention_weights(query).reshape(\n            [bs, num_query, self.num_heads, self.num_levels * self.num_points])\n        attention_weights = F.softmax(attention_weights, axis=-1)\n\n        attention_weights = attention_weights.reshape(\n            [bs, num_query, self.num_heads, self.num_levels, self.num_points])\n        if reference_points.shape[-1] == self.num_points * 2:\n            reference_points_reshape = reference_points.reshape(\n                (bs, num_query, self.num_levels, -1, 2)).unsqueeze(2)\n            x1 = reference_points[:, :, :, 0::2].min(axis=-1, keepdim=True)\n            y1 = reference_points[:, :, :, 1::2].min(axis=-1, keepdim=True)\n            x2 = reference_points[:, :, :, 0::2].max(axis=-1, keepdim=True)\n            y2 = reference_points[:, :, :, 1::2].max(axis=-1, keepdim=True)\n            w = paddle.clip(x2 - x1, min=1e-4)\n            h = paddle.clip(y2 - y1, min=1e-4)\n            wh = paddle.concat([w, h], axis=-1)[:, :, None, :, None, :]\n\n            sampling_locations = reference_points_reshape \\\n                                 + sampling_offsets * wh * 0.5\n        else:\n            raise ValueError(\n                f'Last dim of reference_points must be'\n                f' 2K, but got {reference_points.shape[-1]} instead.')\n\n       
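# aggregate the sampled values with the attention weights (CUDA op or Paddle fallback)\n       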
 output = self.ms_deformable_attn_core(\n            value, value_spatial_shapes, value_level_start_index,\n            sampling_locations, attention_weights)\n\n        output = self.output_proj(output)\n        return output\n\n\n@register\nclass PETR_TransformerDecoderLayer(nn.Layer):\n    __inject__ = ['self_attn', 'cross_attn']\n\n    def __init__(self,\n                 d_model,\n                 nhead=8,\n                 self_attn=None,\n                 cross_attn=None,\n                 dim_feedforward=2048,\n                 dropout=0.1,\n                 activation=\"relu\",\n                 attn_dropout=None,\n                 act_dropout=None,\n                 normalize_before=False):\n        super(PETR_TransformerDecoderLayer, self).__init__()\n        attn_dropout = dropout if attn_dropout is None else attn_dropout\n        act_dropout = dropout if act_dropout is None else act_dropout\n        self.normalize_before = normalize_before\n\n        if self_attn is None:\n            self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)\n        else:\n            self.self_attn = self_attn\n        if cross_attn is None:\n            self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout)\n        else:\n            self.cross_attn = cross_attn\n        # Implementation of Feedforward model\n        self.linear1 = nn.Linear(d_model, dim_feedforward)\n        self.dropout = nn.Dropout(act_dropout, mode=\"upscale_in_train\")\n        self.linear2 = nn.Linear(dim_feedforward, d_model)\n\n        self.norm1 = nn.LayerNorm(d_model)\n        self.norm2 = nn.LayerNorm(d_model)\n        self.norm3 = nn.LayerNorm(d_model)\n        self.dropout1 = nn.Dropout(dropout, mode=\"upscale_in_train\")\n        self.dropout2 = nn.Dropout(dropout, mode=\"upscale_in_train\")\n        self.dropout3 = nn.Dropout(dropout, mode=\"upscale_in_train\")\n        self.activation = getattr(F, activation)\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        linear_init_(self.linear1)\n        linear_init_(self.linear2)\n\n    @staticmethod\n    def with_pos_embed(tensor, pos_embed):\n        return tensor if pos_embed is None else tensor + pos_embed\n\n    def forward(self,\n                tgt,\n                memory,\n                tgt_mask=None,\n                memory_mask=None,\n                pos_embed=None,\n                query_pos_embed=None,\n                **kwargs):\n        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)\n\n        residual = tgt\n        if self.normalize_before:\n            tgt = self.norm1(tgt)\n        q = k = self.with_pos_embed(tgt, query_pos_embed)\n        tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask)\n        tgt = residual + self.dropout1(tgt)\n        if not self.normalize_before:\n            tgt = self.norm1(tgt)\n\n        residual = tgt\n        if self.normalize_before:\n            tgt = self.norm2(tgt)\n        q = self.with_pos_embed(tgt, query_pos_embed)\n        key_tmp = tgt\n        # k = self.with_pos_embed(memory, pos_embed)\n        tgt = self.cross_attn(\n            q, key=key_tmp, value=memory, attn_mask=memory_mask, **kwargs)\n        tgt = residual + self.dropout2(tgt)\n        if not self.normalize_before:\n            tgt = self.norm2(tgt)\n\n        residual = tgt\n        if self.normalize_before:\n            tgt = self.norm3(tgt)\n        tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt))))\n        tgt = residual + self.dropout3(tgt)\n        if not 
self.normalize_before:\n            tgt = self.norm3(tgt)\n        return tgt\n\n\n@register\nclass PETR_TransformerDecoder(nn.Layer):\n    \"\"\"Implements the decoder in PETR transformer.\n\n    Args:\n        return_intermediate (bool): Whether to return intermediate outputs.\n        coder_norm_cfg (dict): Config of last normalization layer. Default:\n            `LN`.\n    \"\"\"\n    __inject__ = ['decoder_layer']\n\n    def __init__(self,\n                 decoder_layer,\n                 num_layers,\n                 norm=None,\n                 return_intermediate=False,\n                 num_keypoints=17,\n                 **kwargs):\n        super(PETR_TransformerDecoder, self).__init__()\n        self.layers = _get_clones(decoder_layer, num_layers)\n        self.num_layers = num_layers\n        self.norm = norm\n        self.return_intermediate = return_intermediate\n        self.num_keypoints = num_keypoints\n\n    def forward(self,\n                query,\n                *args,\n                reference_points=None,\n                valid_ratios=None,\n                kpt_branches=None,\n                **kwargs):\n        \"\"\"Forward function for `TransformerDecoder`.\n\n        Args:\n            query (Tensor): Input query with shape (num_query, bs, embed_dims).\n            reference_points (Tensor): The reference points of offset,\n                has shape (bs, num_query, K*2).\n            valid_ratios (Tensor): The ratios of valid points on the feature\n                map, has shape (bs, num_levels, 2).\n            kpt_branches (obj:`nn.LayerList`): Used for refining the\n                regression results. Only passed when `with_box_refine`\n                is True, otherwise `None` is passed.\n\n        Returns:\n            tuple (Tensor): Results with shape [1, num_query, bs, embed_dims] when\n                return_intermediate is `False`, otherwise it has shape\n                [num_layers, num_query, bs, embed_dims] and\n                [num_layers, bs, num_query, K*2].\n        \"\"\"\n        output = query\n        intermediate = []\n        intermediate_reference_points = []\n        for lid, layer in enumerate(self.layers):\n            if reference_points.shape[-1] == self.num_keypoints * 2:\n                reference_points_input = \\\n                    reference_points[:, :, None] * \\\n                    valid_ratios.tile((1, 1, self.num_keypoints))[:, None]\n            else:\n                assert reference_points.shape[-1] == 2\n                reference_points_input = reference_points[:, :, None] * \\\n                                         valid_ratios[:, None]\n            output = layer(\n                output,\n                *args,\n                reference_points=reference_points_input,\n                **kwargs)\n\n            if kpt_branches is not None:\n                tmp = kpt_branches[lid](output)\n                if reference_points.shape[-1] == self.num_keypoints * 2:\n                    new_reference_points = tmp + inverse_sigmoid(\n                        reference_points)\n                    new_reference_points = F.sigmoid(new_reference_points)\n                else:\n                    raise NotImplementedError\n                reference_points = new_reference_points.detach()\n\n            if self.return_intermediate:\n                intermediate.append(output)\n                intermediate_reference_points.append(reference_points)\n\n        if self.return_intermediate:\n            return 
paddle.stack(intermediate), paddle.stack(\n                intermediate_reference_points)\n\n        return output, reference_points\n\n\n@register\nclass PETR_DeformableTransformerDecoder(nn.Layer):\n    __inject__ = ['decoder_layer']\n\n    def __init__(self, decoder_layer, num_layers, return_intermediate=False):\n        super(PETR_DeformableTransformerDecoder, self).__init__()\n        self.layers = _get_clones(decoder_layer, num_layers)\n        self.num_layers = num_layers\n        self.return_intermediate = return_intermediate\n\n    def forward(self,\n                tgt,\n                reference_points,\n                memory,\n                memory_spatial_shapes,\n                memory_mask=None,\n                query_pos_embed=None):\n        output = tgt\n        intermediate = []\n        for lid, layer in enumerate(self.layers):\n            output = layer(output, reference_points, memory,\n                           memory_spatial_shapes, memory_mask, query_pos_embed)\n\n            if self.return_intermediate:\n                intermediate.append(output)\n\n        if self.return_intermediate:\n            return paddle.stack(intermediate)\n\n        return output.unsqueeze(0)\n\n\n@register\nclass PETR_DeformableDetrTransformerDecoder(PETR_DeformableTransformerDecoder):\n    \"\"\"Implements the decoder in DETR transformer.\n\n    Args:\n        return_intermediate (bool): Whether to return intermediate outputs.\n        coder_norm_cfg (dict): Config of last normalization layer. Default:\n            `LN`.\n    \"\"\"\n\n    def __init__(self, *args, return_intermediate=False, **kwargs):\n\n        super(PETR_DeformableDetrTransformerDecoder, self).__init__(*args,\n                                                                    **kwargs)\n        self.return_intermediate = return_intermediate\n\n    def forward(self,\n                query,\n                *args,\n                reference_points=None,\n                valid_ratios=None,\n                reg_branches=None,\n                **kwargs):\n        \"\"\"Forward function for `TransformerDecoder`.\n\n        Args:\n            query (Tensor): Input query with shape\n                `(num_query, bs, embed_dims)`.\n            reference_points (Tensor): The reference\n                points of offset, has shape\n                (bs, num_query, 4) when as_two_stage,\n                otherwise has shape (bs, num_query, 2).\n            valid_ratios (Tensor): The ratios of valid\n                points on the feature map, has shape\n                (bs, num_levels, 2)\n            reg_branches (obj:`nn.LayerList`): Used for\n                refining the regression results. 
Only passed\n                when with_box_refine is True, otherwise\n                `None` is passed.\n\n        Returns:\n            Tensor: Results with shape [1, num_query, bs, embed_dims] when\n                return_intermediate is `False`, otherwise it has shape\n                [num_layers, num_query, bs, embed_dims].\n        \"\"\"\n        output = query\n        intermediate = []\n        intermediate_reference_points = []\n        for lid, layer in enumerate(self.layers):\n            if reference_points.shape[-1] == 4:\n                reference_points_input = reference_points[:, :, None] * \\\n                    paddle.concat([valid_ratios, valid_ratios], -1)[:, None]\n            else:\n                assert reference_points.shape[-1] == 2\n                reference_points_input = reference_points[:, :, None] * \\\n                    valid_ratios[:, None]\n            output = layer(\n                output,\n                *args,\n                reference_points=reference_points_input,\n                **kwargs)\n\n            if reg_branches is not None:\n                tmp = reg_branches[lid](output)\n                if reference_points.shape[-1] == 4:\n                    new_reference_points = tmp + inverse_sigmoid(\n                        reference_points)\n                    new_reference_points = F.sigmoid(new_reference_points)\n                else:\n                    assert reference_points.shape[-1] == 2\n                    new_reference_points = tmp\n                    new_reference_points[..., :2] = tmp[\n                        ..., :2] + inverse_sigmoid(reference_points)\n                    new_reference_points = F.sigmoid(new_reference_points)\n                reference_points = new_reference_points.detach()\n\n            if self.return_intermediate:\n                intermediate.append(output)\n                intermediate_reference_points.append(reference_points)\n\n        if self.return_intermediate:\n            return paddle.stack(intermediate), paddle.stack(\n                intermediate_reference_points)\n\n        return output, reference_points\n\n\n@register\nclass PETRTransformer(nn.Layer):\n    \"\"\"Implements the PETR transformer.\n\n    Args:\n        as_two_stage (bool): Generate query from encoder features.\n            Default: False.\n        num_feature_levels (int): Number of feature maps from FPN.\n            Default: 4.\n        two_stage_num_proposals (int): Number of proposals when set\n            `as_two_stage` as True. 
Default: 300.\n    \"\"\"\n    __inject__ = [\"encoder\", \"decoder\", \"hm_encoder\", \"refine_decoder\"]\n\n    def __init__(self,\n                 encoder=\"\",\n                 decoder=\"\",\n                 hm_encoder=\"\",\n                 refine_decoder=\"\",\n                 as_two_stage=True,\n                 num_feature_levels=4,\n                 two_stage_num_proposals=300,\n                 num_keypoints=17,\n                 **kwargs):\n        super(PETRTransformer, self).__init__(**kwargs)\n        self.as_two_stage = as_two_stage\n        self.num_feature_levels = num_feature_levels\n        self.two_stage_num_proposals = two_stage_num_proposals\n        self.num_keypoints = num_keypoints\n        self.encoder = encoder\n        self.decoder = decoder\n        self.embed_dims = self.encoder.embed_dims\n        self.hm_encoder = hm_encoder\n        self.refine_decoder = refine_decoder\n        self.init_layers()\n        self.init_weights()\n\n    def init_layers(self):\n        \"\"\"Initialize layers of the PETRTransformer.\"\"\"\n        self.level_embeds = paddle.create_parameter(\n            (self.num_feature_levels, self.embed_dims), dtype=\"float32\")\n\n        if self.as_two_stage:\n            self.enc_output = nn.Linear(self.embed_dims, self.embed_dims)\n            self.enc_output_norm = nn.LayerNorm(self.embed_dims)\n            self.refine_query_embedding = nn.Embedding(self.num_keypoints,\n                                                       self.embed_dims * 2)\n        else:\n            self.reference_points = nn.Linear(self.embed_dims,\n                                              2 * self.num_keypoints)\n\n    def init_weights(self):\n        \"\"\"Initialize the transformer weights.\"\"\"\n        for p in self.parameters():\n            if p.rank() > 1:\n                xavier_uniform_(p)\n                if hasattr(p, 'bias') and p.bias is not None:\n                    constant_(p.bias)\n        for m in self.sublayers():\n            if isinstance(m, MSDeformableAttention):\n                m._reset_parameters()\n        for m in self.sublayers():\n            if isinstance(m, MultiScaleDeformablePoseAttention):\n                m.init_weights()\n        if not self.as_two_stage:\n            xavier_uniform_(self.reference_points.weight)\n            constant_(self.reference_points.bias)\n        normal_(self.level_embeds)\n        normal_(self.refine_query_embedding.weight)\n\n    def gen_encoder_output_proposals(self, memory, memory_padding_mask,\n                                     spatial_shapes):\n        \"\"\"Generate proposals from encoded memory.\n\n        Args:\n            memory (Tensor): The output of encoder, has shape\n                (bs, num_key, embed_dim). num_key equals the number of points\n                on the feature maps from all levels.\n            memory_padding_mask (Tensor): Padding mask for memory,\n                has shape (bs, num_key).\n            spatial_shapes (Tensor): The shape of all feature maps,\n                has shape (num_level, 2).\n\n        Returns:\n            tuple: A tuple of feature map and bbox prediction.\n\n                - output_memory (Tensor): The input of decoder, has shape\n                    (bs, num_key, embed_dim). 
num_key equals the number of\n                    points on the feature maps from all levels.\n                - output_proposals (Tensor): The normalized proposal\n                    after an inverse sigmoid, has shape (bs, num_keys, 4).\n        \"\"\"\n\n        N, S, C = memory.shape\n        proposals = []\n        _cur = 0\n        for lvl, (H, W) in enumerate(spatial_shapes):\n            mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H * W)].reshape(\n                [N, H, W, 1])\n            valid_H = paddle.sum(mask_flatten_[:, :, 0, 0], 1)\n            valid_W = paddle.sum(mask_flatten_[:, 0, :, 0], 1)\n\n            grid_y, grid_x = paddle.meshgrid(\n                paddle.linspace(\n                    0, H - 1, H, dtype=\"float32\"),\n                paddle.linspace(\n                    0, W - 1, W, dtype=\"float32\"))\n            grid = paddle.concat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)],\n                                 -1)\n\n            scale = paddle.concat(\n                [valid_W.unsqueeze(-1),\n                 valid_H.unsqueeze(-1)], 1).reshape([N, 1, 1, 2])\n            grid = (grid.unsqueeze(0).expand((N, -1, -1, -1)) + 0.5) / scale\n            proposal = grid.reshape([N, -1, 2])\n            proposals.append(proposal)\n            _cur += (H * W)\n        output_proposals = paddle.concat(proposals, 1)\n        output_proposals_valid = ((output_proposals > 0.01) &\n                                  (output_proposals < 0.99)).all(\n                                      -1, keepdim=True).astype(\"bool\")\n        output_proposals = paddle.log(output_proposals / (1 - output_proposals))\n        output_proposals = masked_fill(\n            output_proposals, ~memory_padding_mask.astype(\"bool\").unsqueeze(-1),\n            float('inf'))\n        output_proposals = masked_fill(output_proposals,\n                                       ~output_proposals_valid, float('inf'))\n\n        output_memory = memory\n        output_memory = masked_fill(\n            output_memory, ~memory_padding_mask.astype(\"bool\").unsqueeze(-1),\n            float(0))\n        output_memory = masked_fill(output_memory, ~output_proposals_valid,\n                                    float(0))\n        output_memory = self.enc_output_norm(self.enc_output(output_memory))\n        return output_memory, output_proposals\n\n    @staticmethod\n    def get_reference_points(spatial_shapes, valid_ratios):\n        \"\"\"Get the reference points used in decoder.\n\n        Args:\n            spatial_shapes (Tensor): The shape of all feature maps,\n                has shape (num_level, 2).\n            valid_ratios (Tensor): The ratios of valid points on the\n                feature map, has shape (bs, num_levels, 2).\n\n        Returns:\n            Tensor: reference points used in decoder, has \\\n                shape (bs, num_keys, num_levels, 2).\n        \"\"\"\n        reference_points_list = []\n        for lvl, (H, W) in enumerate(spatial_shapes):\n            ref_y, ref_x = paddle.meshgrid(\n                paddle.linspace(\n                    0.5, H - 0.5, H, dtype=\"float32\"),\n                paddle.linspace(\n                    0.5, W - 0.5, W, dtype=\"float32\"))\n            ref_y = ref_y.reshape(\n                (-1, ))[None] / (valid_ratios[:, None, lvl, 1] * H)\n            ref_x = ref_x.reshape(\n                (-1, ))[None] / (valid_ratios[:, None, lvl, 0] * W)\n            ref = paddle.stack((ref_x, ref_y), -1)\n            reference_points_list.append(ref)\n        
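# concatenate per-level points and rescale by per-image valid ratios -> [bs, num_keys, num_levels, 2]\n        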
reference_points = paddle.concat(reference_points_list, 1)\n        reference_points = reference_points[:, :, None] * valid_ratios[:, None]\n        return reference_points\n\n    def get_valid_ratio(self, mask):\n        \"\"\"Get the valid ratios of feature maps at all levels.\"\"\"\n        _, H, W = mask.shape\n        valid_H = paddle.sum(mask[:, :, 0].astype('float'), 1)\n        valid_W = paddle.sum(mask[:, 0, :].astype('float'), 1)\n        valid_ratio_h = valid_H.astype('float') / H\n        valid_ratio_w = valid_W.astype('float') / W\n        valid_ratio = paddle.stack([valid_ratio_w, valid_ratio_h], -1)\n        return valid_ratio\n\n    def get_proposal_pos_embed(self,\n                               proposals,\n                               num_pos_feats=128,\n                               temperature=10000):\n        \"\"\"Get the position embedding of proposal.\"\"\"\n        scale = 2 * math.pi\n        dim_t = paddle.arange(num_pos_feats, dtype=\"float32\")\n        dim_t = temperature**(2 * (dim_t // 2) / num_pos_feats)\n        # N, L, 4\n        proposals = F.sigmoid(proposals) * scale\n        # N, L, 4, 128\n        pos = proposals[:, :, :, None] / dim_t\n        # N, L, 4, 64, 2\n        pos = paddle.stack(\n            (pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()),\n            axis=4).flatten(2)\n        return pos\n\n    def forward(self,\n                mlvl_feats,\n                mlvl_masks,\n                query_embed,\n                mlvl_pos_embeds,\n                kpt_branches=None,\n                cls_branches=None):\n        \"\"\"Forward function for `Transformer`.\n\n        Args:\n            mlvl_feats (list(Tensor)): Input queries from different levels.\n                Each element has shape [bs, embed_dims, h, w].\n            mlvl_masks (list(Tensor)): The key_padding_mask from different\n                levels used for encoder and decoder, each element has shape\n                [bs, h, w].\n            query_embed (Tensor): The query embedding for decoder,\n                with shape [num_query, c].\n            mlvl_pos_embeds (list(Tensor)): The positional encoding\n                of feats from different levels, has the shape\n                [bs, embed_dims, h, w].\n            kpt_branches (obj:`nn.LayerList`): Keypoint regression heads for\n                feature maps from each decoder layer. Only passed when\n                `with_box_refine` is True. Default to None.\n            cls_branches (obj:`nn.LayerList`): Classification heads for\n                feature maps from each decoder layer. Only passed when\n                `as_two_stage` is True. Default to None.\n\n        Returns:\n            tuple[Tensor]: results of decoder containing the following tensor.\n\n                - inter_states: Outputs from decoder. 
If\n                    `return_intermediate_dec` is True output has shape \\\n                    (num_dec_layers, bs, num_query, embed_dims), else has \\\n                    shape (1, bs, num_query, embed_dims).\n                - init_reference_out: The initial value of reference \\\n                    points, has shape (bs, num_queries, 4).\n                - inter_references_out: The internal value of reference \\\n                    points in decoder, has shape \\\n                    (num_dec_layers, bs, num_query, embed_dims)\n                - enc_outputs_class: The classification score of proposals \\\n                    generated from encoder's feature maps, has shape \\\n                    (batch, h*w, num_classes). \\\n                    Only would be returned when `as_two_stage` is True, \\\n                    otherwise None.\n                - enc_outputs_kpt_unact: The regression results generated from \\\n                    encoder's feature maps, has shape (batch, h*w, K*2).\n                    Only would be returned when `as_two_stage` is True, \\\n                    otherwise None.\n        \"\"\"\n        assert self.as_two_stage or query_embed is not None\n\n        feat_flatten = []\n        mask_flatten = []\n        lvl_pos_embed_flatten = []\n        spatial_shapes = []\n        for lvl, (feat, mask, pos_embed\n                  ) in enumerate(zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)):\n            bs, c, h, w = feat.shape\n            spatial_shape = (h, w)\n            spatial_shapes.append(spatial_shape)\n            feat = feat.flatten(2).transpose((0, 2, 1))\n            mask = mask.flatten(1)\n            pos_embed = pos_embed.flatten(2).transpose((0, 2, 1))\n            lvl_pos_embed = pos_embed + self.level_embeds[lvl].reshape(\n                [1, 1, -1])\n            lvl_pos_embed_flatten.append(lvl_pos_embed)\n            feat_flatten.append(feat)\n            mask_flatten.append(mask)\n        feat_flatten = paddle.concat(feat_flatten, 1)\n        mask_flatten = paddle.concat(mask_flatten, 1)\n        lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1)\n        spatial_shapes_cumsum = paddle.to_tensor(\n            np.array(spatial_shapes).prod(1).cumsum(0))\n        spatial_shapes = paddle.to_tensor(spatial_shapes, dtype=\"int64\")\n        level_start_index = paddle.concat((paddle.zeros(\n            (1, ), dtype=spatial_shapes.dtype), spatial_shapes_cumsum[:-1]))\n        valid_ratios = paddle.stack(\n            [self.get_valid_ratio(m) for m in mlvl_masks], 1)\n\n        reference_points = \\\n            self.get_reference_points(spatial_shapes,\n                                      valid_ratios)\n\n        memory = self.encoder(\n            src=feat_flatten,\n            pos_embed=lvl_pos_embed_flatten,\n            src_mask=mask_flatten,\n            value_spatial_shapes=spatial_shapes,\n            reference_points=reference_points,\n            value_level_start_index=level_start_index,\n            valid_ratios=valid_ratios)\n\n        bs, _, c = memory.shape\n\n        hm_proto = None\n        if self.training:\n            hm_memory = paddle.slice(\n                memory,\n                starts=level_start_index[0],\n                ends=level_start_index[1],\n                axes=[1])\n            hm_pos_embed = paddle.slice(\n                lvl_pos_embed_flatten,\n                starts=level_start_index[0],\n                ends=level_start_index[1],\n                axes=[1])\n            
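# padding mask for the level-0 tokens fed to the heatmap encoder\n            hm_mask = 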
paddle.slice(\n                mask_flatten,\n                starts=level_start_index[0],\n                ends=level_start_index[1],\n                axes=[1])\n            hm_reference_points = paddle.slice(\n                reference_points,\n                starts=level_start_index[0],\n                ends=level_start_index[1],\n                axes=[1])[:, :, :1, :]\n\n            # the official code mistakenly passes `pose_embed` instead of\n            # `pos_embed`, which disables the positional embedding; kept here\n            # for compatibility\n            hm_memory = self.hm_encoder(\n                src=hm_memory,\n                pose_embed=hm_pos_embed,\n                src_mask=hm_mask,\n                value_spatial_shapes=spatial_shapes[[0]],\n                reference_points=hm_reference_points,\n                value_level_start_index=level_start_index[0],\n                valid_ratios=valid_ratios[:, :1, :])\n            hm_memory = hm_memory.reshape((bs, spatial_shapes[0, 0],\n                                           spatial_shapes[0, 1], -1))\n            hm_proto = (hm_memory, mlvl_masks[0])\n\n        if self.as_two_stage:\n            output_memory, output_proposals = \\\n                self.gen_encoder_output_proposals(\n                    memory, mask_flatten, spatial_shapes)\n            enc_outputs_class = cls_branches[self.decoder.num_layers](\n                output_memory)\n            enc_outputs_kpt_unact = \\\n                kpt_branches[self.decoder.num_layers](output_memory)\n            enc_outputs_kpt_unact[..., 0::2] += output_proposals[..., 0:1]\n            enc_outputs_kpt_unact[..., 1::2] += output_proposals[..., 1:2]\n\n            topk = self.two_stage_num_proposals\n            topk_proposals = paddle.topk(\n                enc_outputs_class[..., 0], topk, axis=1)[1].unsqueeze(-1)\n\n            # paddle.take_along_axis corresponds to torch.gather\n            topk_kpts_unact = paddle.take_along_axis(enc_outputs_kpt_unact,\n                                                     topk_proposals, 1)\n            topk_kpts_unact = topk_kpts_unact.detach()\n\n            reference_points = F.sigmoid(topk_kpts_unact)\n            init_reference_out = reference_points\n            # learnable query and query_pos\n            query_pos, query = paddle.split(\n                query_embed, query_embed.shape[1] // c, axis=1)\n            query_pos = query_pos.unsqueeze(0).expand((bs, -1, -1))\n            query = query.unsqueeze(0).expand((bs, -1, -1))\n        else:\n            query_pos, query = paddle.split(\n                query_embed, query_embed.shape[1] // c, axis=1)\n            query_pos = query_pos.unsqueeze(0).expand((bs, -1, -1))\n            query = query.unsqueeze(0).expand((bs, -1, -1))\n            reference_points = F.sigmoid(self.reference_points(query_pos))\n            init_reference_out = reference_points\n\n        # decoder\n        inter_states, inter_references = self.decoder(\n            query=query,\n            memory=memory,\n            query_pos_embed=query_pos,\n            memory_mask=mask_flatten,\n            reference_points=reference_points,\n            value_spatial_shapes=spatial_shapes,\n            value_level_start_index=level_start_index,\n            valid_ratios=valid_ratios,\n            kpt_branches=kpt_branches)\n\n        inter_references_out = inter_references\n        if self.as_two_stage:\n            return inter_states, init_reference_out, \\\n                   inter_references_out, enc_outputs_class, \\\n                   enc_outputs_kpt_unact, hm_proto, memory\n        return inter_states, 
init_reference_out, \\\n               inter_references_out, None, None, None, None, None, hm_proto\n\n    def forward_refine(self,\n                       mlvl_masks,\n                       memory,\n                       reference_points_pose,\n                       img_inds,\n                       kpt_branches=None,\n                       **kwargs):\n        mask_flatten = []\n        spatial_shapes = []\n        for lvl, mask in enumerate(mlvl_masks):\n            bs, h, w = mask.shape\n            spatial_shape = (h, w)\n            spatial_shapes.append(spatial_shape)\n            mask = mask.flatten(1)\n            mask_flatten.append(mask)\n        mask_flatten = paddle.concat(mask_flatten, 1)\n        spatial_shapes_cumsum = paddle.to_tensor(\n            np.array(\n                spatial_shapes, dtype='int64').prod(1).cumsum(0))\n        spatial_shapes = paddle.to_tensor(spatial_shapes, dtype=\"int64\")\n        level_start_index = paddle.concat((paddle.zeros(\n            (1, ), dtype=spatial_shapes.dtype), spatial_shapes_cumsum[:-1]))\n        valid_ratios = paddle.stack(\n            [self.get_valid_ratio(m) for m in mlvl_masks], 1)\n\n        # pose refinement (17 queries corresponding to 17 keypoints)\n        # learnable query and query_pos\n        refine_query_embedding = self.refine_query_embedding.weight\n        query_pos, query = paddle.split(refine_query_embedding, 2, axis=1)\n        pos_num = reference_points_pose.shape[0]\n        query_pos = query_pos.unsqueeze(0).expand((pos_num, -1, -1))\n        query = query.unsqueeze(0).expand((pos_num, -1, -1))\n        reference_points = reference_points_pose.reshape(\n            (pos_num, reference_points_pose.shape[1] // 2, 2))\n        pos_memory = memory[img_inds]\n        mask_flatten = mask_flatten[img_inds]\n        valid_ratios = valid_ratios[img_inds]\n        if img_inds.size == 1:\n            pos_memory = pos_memory.unsqueeze(0)\n            mask_flatten = mask_flatten.unsqueeze(0)\n            valid_ratios = valid_ratios.unsqueeze(0)\n        inter_states, inter_references = self.refine_decoder(\n            query=query,\n            memory=pos_memory,\n            query_pos_embed=query_pos,\n            memory_mask=mask_flatten,\n            reference_points=reference_points,\n            value_spatial_shapes=spatial_shapes,\n            value_level_start_index=level_start_index,\n            valid_ratios=valid_ratios,\n            reg_branches=kpt_branches,\n            **kwargs)\n        # inter_states: [num_decoder, num_query, bs, embed_dim]\n\n        init_reference_out = reference_points\n        return inter_states, init_reference_out, inter_references\n
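\n\n# --- Hedged usage sketch (illustrative only; not part of the original module) ---\n# A minimal shape walkthrough for MSDeformableAttention under assumed sizes.\n# All tensors below are dummies; the pure-Paddle deformable_attention_core_func\n# is used automatically when the custom CUDA op is unavailable.\n#\n# import paddle\n# attn = MSDeformableAttention(embed_dim=256, num_heads=8, num_levels=2,\n#                              num_points=4)\n# bs, Len_q = 2, 100\n# spatial_shapes = paddle.to_tensor([[32, 32], [16, 16]], dtype='int64')\n# level_start_index = paddle.to_tensor([0, 32 * 32], dtype='int64')\n# Len_v = int(spatial_shapes.prod(1).sum())           # 1024 + 256 = 1280\n# query = paddle.rand([bs, Len_q, 256])\n# value = paddle.rand([bs, Len_v, 256])\n# # normalized (x, y) reference point per query and level\n# reference_points = paddle.rand([bs, Len_q, 2, 2])\n# out = attn(query, None, value, reference_points, spatial_shapes,\n#            level_start_index)\n# assert out.shape == [bs, Len_q, 256]\n"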
  },
  {
    "path": "ppdet/modeling/transformers/position_encoding.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# Modified from DETR (https://github.com/facebookresearch/detr)\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport math\nimport paddle\nimport paddle.nn as nn\n\nfrom ppdet.core.workspace import register, serializable\n\n\n@register\n@serializable\nclass PositionEmbedding(nn.Layer):\n    def __init__(self,\n                 num_pos_feats=128,\n                 temperature=10000,\n                 normalize=True,\n                 scale=2 * math.pi,\n                 embed_type='sine',\n                 num_embeddings=50,\n                 offset=0.,\n                 eps=1e-6):\n        super(PositionEmbedding, self).__init__()\n        assert embed_type in ['sine', 'learned']\n\n        self.embed_type = embed_type\n        self.offset = offset\n        self.eps = eps\n        if self.embed_type == 'sine':\n            self.num_pos_feats = num_pos_feats\n            self.temperature = temperature\n            self.normalize = normalize\n            self.scale = scale\n        elif self.embed_type == 'learned':\n            self.row_embed = nn.Embedding(num_embeddings, num_pos_feats)\n            self.col_embed = nn.Embedding(num_embeddings, num_pos_feats)\n        else:\n            raise ValueError(f\"{self.embed_type} is not supported.\")\n\n    def forward(self, mask):\n        \"\"\"\n        Args:\n            mask (Tensor): [B, H, W]\n        Returns:\n            pos (Tensor): [B, H, W, C]\n        \"\"\"\n        if self.embed_type == 'sine':\n            y_embed = mask.cumsum(1)\n            x_embed = mask.cumsum(2)\n            if self.normalize:\n                y_embed = (y_embed + self.offset) / (\n                    y_embed[:, -1:, :] + self.eps) * self.scale\n                x_embed = (x_embed + self.offset) / (\n                    x_embed[:, :, -1:] + self.eps) * self.scale\n\n            dim_t = 2 * (paddle.arange(self.num_pos_feats) //\n                         2).astype('float32')\n            dim_t = self.temperature**(dim_t / self.num_pos_feats)\n\n            pos_x = x_embed.unsqueeze(-1) / dim_t\n            pos_y = y_embed.unsqueeze(-1) / dim_t\n            pos_x = paddle.stack(\n                (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()),\n                axis=4).flatten(3)\n            pos_y = paddle.stack(\n                (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()),\n                axis=4).flatten(3)\n            return paddle.concat((pos_y, pos_x), axis=3)\n        elif self.embed_type == 'learned':\n            h, w = mask.shape[-2:]\n            i = paddle.arange(w)\n            j = paddle.arange(h)\n            x_emb = self.col_embed(i)\n            y_emb = self.row_embed(j)\n            return paddle.concat(\n                [\n                    
x_emb.unsqueeze(0).tile([h, 1, 1]),\n                    y_emb.unsqueeze(1).tile([1, w, 1]),\n                ],\n                axis=-1).unsqueeze(0)\n        else:\n            raise ValueError(f\"{self.embed_type} is not supported.\")\n
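\n\n# --- Hedged usage sketch (illustrative only; not part of the original module) ---\n# Shape check for the sine embedding, assuming num_pos_feats=128 so the output\n# channel dim is 2 * 128 = 256 (y-channels first, then x-channels).\n#\n# import paddle\n# pe = PositionEmbedding(num_pos_feats=128, embed_type='sine')\n# mask = paddle.ones([2, 32, 32])  # [B, H, W], 1 marks valid pixels\n# pos = pe(mask)                   # [B, H, W, 256]\n# assert pos.shape == [2, 32, 32, 256]\n"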
  },
  {
    "path": "ppdet/modeling/transformers/rtdetr_transformer.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# Modified from detrex (https://github.com/IDEA-Research/detrex)\n# Copyright 2022 The IDEA Authors. All rights reserved.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\n\nfrom ppdet.core.workspace import register\nfrom ..layers import MultiHeadAttention\nfrom ..heads.detr_head import MLP\nfrom .deformable_transformer import MSDeformableAttention\nfrom ..initializer import (linear_init_, constant_, xavier_uniform_, normal_,\n                           bias_init_with_prob)\nfrom .utils import (_get_clones, get_sine_pos_embed,\n                    get_contrastive_denoising_training_group, inverse_sigmoid)\n\n__all__ = ['RTDETRTransformer']\n\n\nclass PPMSDeformableAttention(MSDeformableAttention):\n    def forward(self,\n                query,\n                reference_points,\n                value,\n                value_spatial_shapes,\n                value_level_start_index,\n                value_mask=None):\n        \"\"\"\n        Args:\n            query (Tensor): [bs, query_length, C]\n            reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),\n                bottom-right (1, 1), including padding area\n            value (Tensor): [bs, value_length, C]\n            value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]\n            value_level_start_index (List): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...]\n            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements\n\n        Returns:\n            output (Tensor): [bs, Length_{query}, C]\n        \"\"\"\n        bs, Len_q = query.shape[:2]\n        Len_v = value.shape[1]\n\n        value = self.value_proj(value)\n        if value_mask is not None:\n            value_mask = value_mask.astype(value.dtype).unsqueeze(-1)\n            value *= value_mask\n        value = value.reshape([bs, Len_v, self.num_heads, self.head_dim])\n\n        sampling_offsets = self.sampling_offsets(query).reshape(\n            [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2])\n        attention_weights = self.attention_weights(query).reshape(\n            [bs, Len_q, self.num_heads, self.num_levels * self.num_points])\n        attention_weights = F.softmax(attention_weights).reshape(\n            [bs, Len_q, self.num_heads, self.num_levels, self.num_points])\n\n        if reference_points.shape[-1] == 2:\n            offset_normalizer = paddle.to_tensor(value_spatial_shapes)\n            offset_normalizer = 
offset_normalizer.flip([1]).reshape(\n                [1, 1, 1, self.num_levels, 1, 2])\n            sampling_locations = reference_points.reshape([\n                bs, Len_q, 1, self.num_levels, 1, 2\n            ]) + sampling_offsets / offset_normalizer\n        elif reference_points.shape[-1] == 4:\n            sampling_locations = (\n                reference_points[:, :, None, :, None, :2] + sampling_offsets /\n                self.num_points * reference_points[:, :, None, :, None, 2:] *\n                0.5)\n        else:\n            raise ValueError(\n                \"Last dim of reference_points must be 2 or 4, but got {} instead.\".\n                format(reference_points.shape[-1]))\n\n        if not isinstance(query, paddle.Tensor):\n            from ppdet.modeling.transformers.utils import deformable_attention_core_func\n            output = deformable_attention_core_func(\n                value, value_spatial_shapes, value_level_start_index,\n                sampling_locations, attention_weights)\n        else:\n            value_spatial_shapes = paddle.to_tensor(value_spatial_shapes)\n            value_level_start_index = paddle.to_tensor(value_level_start_index)\n            output = self.ms_deformable_attn_core(\n                value, value_spatial_shapes, value_level_start_index,\n                sampling_locations, attention_weights)\n        output = self.output_proj(output)\n\n        return output\n\n\nclass TransformerDecoderLayer(nn.Layer):\n    def __init__(self,\n                 d_model=256,\n                 n_head=8,\n                 dim_feedforward=1024,\n                 dropout=0.,\n                 activation=\"relu\",\n                 n_levels=4,\n                 n_points=4,\n                 weight_attr=None,\n                 bias_attr=None):\n        super(TransformerDecoderLayer, self).__init__()\n\n        # self attention\n        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)\n        self.dropout1 = nn.Dropout(dropout)\n        self.norm1 = nn.LayerNorm(\n            d_model,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n\n        # cross attention\n        self.cross_attn = PPMSDeformableAttention(d_model, n_head, n_levels,\n                                                  n_points, 1.0)\n        self.dropout2 = nn.Dropout(dropout)\n        self.norm2 = nn.LayerNorm(\n            d_model,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n\n        # ffn\n        self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr,\n                                 bias_attr)\n        self.activation = getattr(F, activation)\n        self.dropout3 = nn.Dropout(dropout)\n        self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr,\n                                 bias_attr)\n        self.dropout4 = nn.Dropout(dropout)\n        self.norm3 = nn.LayerNorm(\n            d_model,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        linear_init_(self.linear1)\n        linear_init_(self.linear2)\n        xavier_uniform_(self.linear1.weight)\n        xavier_uniform_(self.linear2.weight)\n\n    def with_pos_embed(self, tensor, pos):\n        return tensor if pos is None else tensor + pos\n\n    def forward_ffn(self, tgt):\n      
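# position-wise feed-forward: Linear -> activation -> dropout -> Linear\n      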
  return self.linear2(self.dropout3(self.activation(self.linear1(tgt))))\n\n    def forward(self,\n                tgt,\n                reference_points,\n                memory,\n                memory_spatial_shapes,\n                memory_level_start_index,\n                attn_mask=None,\n                memory_mask=None,\n                query_pos_embed=None):\n        # self attention\n        q = k = self.with_pos_embed(tgt, query_pos_embed)\n        if attn_mask is not None:\n            attn_mask = paddle.where(\n                attn_mask.astype('bool'),\n                paddle.zeros(attn_mask.shape, tgt.dtype),\n                paddle.full(attn_mask.shape, float(\"-inf\"), tgt.dtype))\n        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask)\n        tgt = tgt + self.dropout1(tgt2)\n        tgt = self.norm1(tgt)\n\n        # cross attention\n        tgt2 = self.cross_attn(\n            self.with_pos_embed(tgt, query_pos_embed), reference_points, memory,\n            memory_spatial_shapes, memory_level_start_index, memory_mask)\n        tgt = tgt + self.dropout2(tgt2)\n        tgt = self.norm2(tgt)\n\n        # ffn\n        tgt2 = self.forward_ffn(tgt)\n        tgt = tgt + self.dropout4(tgt2)\n        tgt = self.norm3(tgt)\n\n        return tgt\n\n\nclass TransformerDecoder(nn.Layer):\n    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):\n        super(TransformerDecoder, self).__init__()\n        self.layers = _get_clones(decoder_layer, num_layers)\n        self.hidden_dim = hidden_dim\n        self.num_layers = num_layers\n        self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx\n\n    def forward(self,\n                tgt,\n                ref_points_unact,\n                memory,\n                memory_spatial_shapes,\n                memory_level_start_index,\n                bbox_head,\n                score_head,\n                query_pos_head,\n                attn_mask=None,\n                memory_mask=None,\n                query_pos_head_inv_sig=False):\n        output = tgt\n        dec_out_bboxes = []\n        dec_out_logits = []\n        ref_points_detach = F.sigmoid(ref_points_unact)\n        for i, layer in enumerate(self.layers):\n            ref_points_input = ref_points_detach.unsqueeze(2)\n            if not query_pos_head_inv_sig:\n                query_pos_embed = query_pos_head(ref_points_detach)\n            else:\n                query_pos_embed = query_pos_head(\n                    inverse_sigmoid(ref_points_detach))\n\n            output = layer(output, ref_points_input, memory,\n                           memory_spatial_shapes, memory_level_start_index,\n                           attn_mask, memory_mask, query_pos_embed)\n\n            inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid(\n                ref_points_detach))\n\n            if self.training:\n                dec_out_logits.append(score_head[i](output))\n                if i == 0:\n                    dec_out_bboxes.append(inter_ref_bbox)\n                else:\n                    dec_out_bboxes.append(\n                        F.sigmoid(bbox_head[i](output) + inverse_sigmoid(\n                            ref_points)))\n            elif i == self.eval_idx:\n                dec_out_logits.append(score_head[i](output))\n                dec_out_bboxes.append(inter_ref_bbox)\n                break\n\n            ref_points = inter_ref_bbox\n            ref_points_detach = inter_ref_bbox.detach(\n            ) if 
self.training else inter_ref_bbox\n\n        return paddle.stack(dec_out_bboxes), paddle.stack(dec_out_logits)\n\n\n@register\nclass RTDETRTransformer(nn.Layer):\n    __shared__ = ['num_classes', 'hidden_dim', 'eval_size']\n\n    def __init__(self,\n                 num_classes=80,\n                 hidden_dim=256,\n                 num_queries=300,\n                 position_embed_type='sine',\n                 backbone_feat_channels=[512, 1024, 2048],\n                 feat_strides=[8, 16, 32],\n                 num_levels=3,\n                 num_decoder_points=4,\n                 nhead=8,\n                 num_decoder_layers=6,\n                 dim_feedforward=1024,\n                 dropout=0.,\n                 activation=\"relu\",\n                 num_denoising=100,\n                 label_noise_ratio=0.5,\n                 box_noise_scale=1.0,\n                 learnt_init_query=True,\n                 query_pos_head_inv_sig=False,\n                 eval_size=None,\n                 eval_idx=-1,\n                 eps=1e-2):\n        super(RTDETRTransformer, self).__init__()\n        assert position_embed_type in ['sine', 'learned'], \\\n            f'ValueError: position_embed_type not supported {position_embed_type}!'\n        assert len(backbone_feat_channels) <= num_levels\n        assert len(feat_strides) == len(backbone_feat_channels)\n        for _ in range(num_levels - len(feat_strides)):\n            feat_strides.append(feat_strides[-1] * 2)\n\n        self.hidden_dim = hidden_dim\n        self.nhead = nhead\n        self.feat_strides = feat_strides\n        self.num_levels = num_levels\n        self.num_classes = num_classes\n        self.num_queries = num_queries\n        self.eps = eps\n        self.num_decoder_layers = num_decoder_layers\n        self.eval_size = eval_size\n\n        # backbone feature projection\n        self._build_input_proj_layer(backbone_feat_channels)\n\n        # Transformer module\n        decoder_layer = TransformerDecoderLayer(\n            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,\n            num_decoder_points)\n        self.decoder = TransformerDecoder(hidden_dim, decoder_layer,\n                                          num_decoder_layers, eval_idx)\n\n        # denoising part\n        self.denoising_class_embed = nn.Embedding(\n            num_classes,\n            hidden_dim,\n            weight_attr=ParamAttr(initializer=nn.initializer.Normal()))\n        self.num_denoising = num_denoising\n        self.label_noise_ratio = label_noise_ratio\n        self.box_noise_scale = box_noise_scale\n\n        # decoder embedding\n        self.learnt_init_query = learnt_init_query\n        if learnt_init_query:\n            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)\n        self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2)\n        self.query_pos_head_inv_sig = query_pos_head_inv_sig\n\n        # encoder head\n        self.enc_output = nn.Sequential(\n            nn.Linear(hidden_dim, hidden_dim),\n            nn.LayerNorm(\n                hidden_dim,\n                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                bias_attr=ParamAttr(regularizer=L2Decay(0.0))))\n        self.enc_score_head = nn.Linear(hidden_dim, num_classes)\n        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)\n\n        # decoder head\n        self.dec_score_head = nn.LayerList([\n            nn.Linear(hidden_dim, num_classes)\n            for _ in 
range(num_decoder_layers)\n        ])\n        self.dec_bbox_head = nn.LayerList([\n            MLP(hidden_dim, hidden_dim, 4, num_layers=3)\n            for _ in range(num_decoder_layers)\n        ])\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        # class and bbox head init\n        bias_cls = bias_init_with_prob(0.01)\n        linear_init_(self.enc_score_head)\n        constant_(self.enc_score_head.bias, bias_cls)\n        constant_(self.enc_bbox_head.layers[-1].weight)\n        constant_(self.enc_bbox_head.layers[-1].bias)\n        for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):\n            linear_init_(cls_)\n            constant_(cls_.bias, bias_cls)\n            constant_(reg_.layers[-1].weight)\n            constant_(reg_.layers[-1].bias)\n\n        linear_init_(self.enc_output[0])\n        xavier_uniform_(self.enc_output[0].weight)\n        if self.learnt_init_query:\n            xavier_uniform_(self.tgt_embed.weight)\n        xavier_uniform_(self.query_pos_head.layers[0].weight)\n        xavier_uniform_(self.query_pos_head.layers[1].weight)\n        for l in self.input_proj:\n            xavier_uniform_(l[0].weight)\n\n        # init encoder output anchors and valid_mask\n        if self.eval_size:\n            self.anchors, self.valid_mask = self._generate_anchors()\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'backbone_feat_channels': [i.channels for i in input_shape]}\n\n    def _build_input_proj_layer(self, backbone_feat_channels):\n        self.input_proj = nn.LayerList()\n        for in_channels in backbone_feat_channels:\n            self.input_proj.append(\n                nn.Sequential(\n                    ('conv', nn.Conv2D(\n                        in_channels,\n                        self.hidden_dim,\n                        kernel_size=1,\n                        bias_attr=False)), ('norm', nn.BatchNorm2D(\n                            self.hidden_dim,\n                            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                            bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))\n        in_channels = backbone_feat_channels[-1]\n        for _ in range(self.num_levels - len(backbone_feat_channels)):\n            self.input_proj.append(\n                nn.Sequential(\n                    ('conv', nn.Conv2D(\n                        in_channels,\n                        self.hidden_dim,\n                        kernel_size=3,\n                        stride=2,\n                        padding=1,\n                        bias_attr=False)), ('norm', nn.BatchNorm2D(\n                            self.hidden_dim,\n                            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                            bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))\n            in_channels = self.hidden_dim\n\n    def _get_encoder_input(self, feats):\n        # get projection features\n        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]\n        if self.num_levels > len(proj_feats):\n            len_srcs = len(proj_feats)\n            for i in range(len_srcs, self.num_levels):\n                if i == len_srcs:\n                    proj_feats.append(self.input_proj[i](feats[-1]))\n                else:\n                    proj_feats.append(self.input_proj[i](proj_feats[-1]))\n\n        # get encoder inputs\n        feat_flatten = []\n        spatial_shapes = []\n        level_start_index = [0, ]\n        for i, feat in 
enumerate(proj_feats):\n            _, _, h, w = feat.shape\n            # [b, c, h, w] -> [b, h*w, c]\n            feat_flatten.append(feat.flatten(2).transpose([0, 2, 1]))\n            # [num_levels, 2]\n            spatial_shapes.append([h, w])\n            # [l], start index of each level\n            level_start_index.append(h * w + level_start_index[-1])\n\n        # [b, l, c]\n        feat_flatten = paddle.concat(feat_flatten, 1)\n        level_start_index.pop()\n        return (feat_flatten, spatial_shapes, level_start_index)\n\n    def forward(self, feats, pad_mask=None, gt_meta=None, is_teacher=False):\n        # input projection and embedding\n        (memory, spatial_shapes,\n         level_start_index) = self._get_encoder_input(feats)\n\n        # prepare denoising training\n        if self.training:\n            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \\\n                get_contrastive_denoising_training_group(gt_meta,\n                                            self.num_classes,\n                                            self.num_queries,\n                                            self.denoising_class_embed.weight,\n                                            self.num_denoising,\n                                            self.label_noise_ratio,\n                                            self.box_noise_scale)\n        else:\n            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None\n\n        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \\\n            self._get_decoder_input(\n            memory, spatial_shapes, denoising_class, denoising_bbox_unact,is_teacher)\n\n        # decoder\n        out_bboxes, out_logits = self.decoder(\n            target,\n            init_ref_points_unact,\n            memory,\n            spatial_shapes,\n            level_start_index,\n            self.dec_bbox_head,\n            self.dec_score_head,\n            self.query_pos_head,\n            attn_mask=attn_mask,\n            memory_mask=None,\n            query_pos_head_inv_sig=self.query_pos_head_inv_sig)\n        return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,\n                dn_meta)\n\n    def _generate_anchors(self,\n                          spatial_shapes=None,\n                          grid_size=0.05,\n                          dtype=\"float32\"):\n        if spatial_shapes is None:\n            spatial_shapes = [\n                [int(self.eval_size[0] / s), int(self.eval_size[1] / s)]\n                for s in self.feat_strides\n            ]\n        anchors = []\n        for lvl, (h, w) in enumerate(spatial_shapes):\n            grid_y, grid_x = paddle.meshgrid(\n                paddle.arange(\n                    end=h, dtype=dtype),\n                paddle.arange(\n                    end=w, dtype=dtype))\n            grid_xy = paddle.stack([grid_x, grid_y], -1)\n\n            valid_WH = paddle.to_tensor([h, w]).astype(dtype)\n            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH\n            wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl)\n            anchors.append(\n                paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))\n\n        anchors = paddle.concat(anchors, 1)\n        valid_mask = ((anchors > self.eps) *\n                      (anchors < 1 - self.eps)).all(-1, keepdim=True)\n        anchors = paddle.log(anchors / (1 - anchors))\n        anchors = paddle.where(valid_mask, anchors,\n                               
paddle.to_tensor(float(\"inf\")))\n        return anchors, valid_mask\n\n    def _get_decoder_input(self,\n                           memory,\n                           spatial_shapes,\n                           denoising_class=None,\n                           denoising_bbox_unact=None,\n                           is_teacher=False):\n        bs, _, _ = memory.shape\n        # prepare input for decoder\n        if self.training or self.eval_size is None or is_teacher:\n            anchors, valid_mask = self._generate_anchors(spatial_shapes)\n        else:\n            anchors, valid_mask = self.anchors, self.valid_mask\n        memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))\n        output_memory = self.enc_output(memory)\n\n        enc_outputs_class = self.enc_score_head(output_memory)\n        enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors\n\n        _, topk_ind = paddle.topk(\n            enc_outputs_class.max(-1), self.num_queries, axis=1)\n        # extract region proposal boxes\n        batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype)\n        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])\n        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)\n\n        reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact,\n                                                  topk_ind)  # unsigmoided.\n        enc_topk_bboxes = F.sigmoid(reference_points_unact)\n        if denoising_bbox_unact is not None:\n            reference_points_unact = paddle.concat(\n                [denoising_bbox_unact, reference_points_unact], 1)\n        if self.training:\n            reference_points_unact = reference_points_unact.detach()\n        enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind)\n\n        # extract region features\n        if self.learnt_init_query:\n            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])\n        else:\n            target = paddle.gather_nd(output_memory, topk_ind)\n            if self.training:\n                target = target.detach()\n        if denoising_class is not None:\n            target = paddle.concat([denoising_class, target], 1)\n\n        return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits\n"
  },
  {
    "path": "ppdet/modeling/transformers/rtdetr_transformerv2.py",
    "content": "# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# Modified from detrex (https://github.com/IDEA-Research/detrex)\n# Copyright 2022 The IDEA Authors. All rights reserved.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport functools\nimport math\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\n\nfrom ppdet.core.workspace import register\nfrom .rtdetr_transformer import TransformerDecoder\nfrom .utils import deformable_attention_core_func_v2, get_contrastive_denoising_training_group\nfrom ..heads.detr_head import MLP\nfrom ..initializer import (linear_init_, constant_, xavier_uniform_, bias_init_with_prob)\nfrom ..layers import MultiHeadAttention\n\n__all__ = ['RTDETRTransformerv2']\n\n\nclass MSDeformableAttention(nn.Layer):\n    def __init__(self,\n                 embed_dim=256,\n                 num_heads=8,\n                 num_levels=4,\n                 num_points=4,\n                 sampling_method='default',\n                 offset_scale=0.5,\n                 lr_mult=0.1):\n        \"\"\"\n        Multi-Scale Deformable Attention Module\n        \"\"\"\n        super(MSDeformableAttention, self).__init__()\n        self.embed_dim = embed_dim\n        self.num_heads = num_heads\n        self.num_levels = num_levels\n\n        if isinstance(num_points, list):\n            assert len(num_points) == num_levels, ValueError\n            num_points_list = num_points\n        else:\n            num_points_list = [num_points for _ in range(num_levels)]\n\n        self.num_points_list = num_points_list\n        self.total_points = num_heads * sum(num_points_list)\n\n        num_points_scale = [1 / n for n in num_points_list for _ in range(n)]\n        self.register_buffer('num_points_scale',\n                             paddle.to_tensor(num_points_scale, dtype=paddle.float32))\n\n        self.sampling_method = sampling_method\n        self.offset_scale = offset_scale\n\n        self.head_dim = embed_dim // num_heads\n        assert self.head_dim * num_heads == self.embed_dim, \"embed_dim must be divisible by num_heads\"\n\n        self.sampling_offsets = nn.Linear(\n            embed_dim,\n            self.total_points * 2,\n            weight_attr=ParamAttr(learning_rate=lr_mult),\n            bias_attr=ParamAttr(learning_rate=lr_mult))\n\n        self.attention_weights = nn.Linear(embed_dim, self.total_points)\n        self.value_proj = nn.Linear(embed_dim, embed_dim)\n        self.output_proj = nn.Linear(embed_dim, embed_dim)\n\n        self.ms_deformable_attn_core = functools.partial(\n            deformable_attention_core_func_v2,\n            num_points_list=self.num_points_list,\n          
  sampling_method=self.sampling_method)\n\n        self._reset_parameters()\n\n        if self.sampling_method == 'discrete':\n            for p in self.sampling_offsets.parameters():\n                p.stop_gradient = True\n\n    def _reset_parameters(self):\n        # sampling_offsets\n        constant_(self.sampling_offsets.weight)\n        thetas = paddle.arange(\n            self.num_heads,\n            dtype=paddle.float32) * (2.0 * math.pi / self.num_heads)\n        grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1)\n        grid_init = grid_init / grid_init.abs().max(-1, keepdim=True)\n        grid_init = grid_init.reshape([self.num_heads, 1, 2]).tile(\n            [1, sum(self.num_points_list), 1])\n        scaling = paddle.concat(\n            [paddle.arange(1, n + 1, dtype=paddle.float32)\n             for n in self.num_points_list]).reshape([1, -1, 1])\n        grid_init *= scaling\n        self.sampling_offsets.bias.set_value(grid_init.flatten())\n        # attention_weights\n        constant_(self.attention_weights.weight)\n        constant_(self.attention_weights.bias)\n        # proj\n        xavier_uniform_(self.value_proj.weight)\n        constant_(self.value_proj.bias)\n        xavier_uniform_(self.output_proj.weight)\n        constant_(self.output_proj.bias)\n\n    def forward(self,\n                query,\n                reference_points,\n                value,\n                value_spatial_shapes,\n                value_mask=None):\n        \"\"\"\n        Args:\n            query (Tensor): [batch_num, query_len, num_heads * head_dim]\n            reference_points (Tensor): [batch_num, query_length, n_levels, 2], range in [0, 1], top-left (0,0),\n                bottom-right (1, 1), including padding area\n            value (Tensor): [batch_num, value_len, num_heads * head_dim]\n            value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]\n            value_mask (Tensor): [batch_num, value_len], True for non-padding elements, False for padding elements\n\n        Returns:\n            output (Tensor): [bs, Length_{query}, C]\n        \"\"\"\n        batch_num, query_len = query.shape[:2]\n        value_len = value.shape[1]\n\n        value = self.value_proj(value)\n        if value_mask is not None:\n            value_mask = value_mask.astype(value.dtype).unsqueeze(-1)\n            value *= value_mask\n        value = value.reshape([batch_num, value_len, self.num_heads, self.head_dim])\n\n        sampling_offsets = self.sampling_offsets(query).reshape(\n            [batch_num, query_len, self.num_heads, sum(self.num_points_list), 2])\n        attention_weights = self.attention_weights(query).reshape(\n            [batch_num, query_len, self.num_heads, sum(self.num_points_list)])\n        attention_weights = F.softmax(attention_weights, axis=-1)\n\n        if reference_points.shape[-1] == 2:\n            offset_normalizer = value_spatial_shapes.flip([1]).reshape(\n                [1, 1, 1, self.num_levels, 1, 2])\n            sampling_locations = reference_points.reshape([\n                batch_num, query_len, 1, self.num_levels, 1, 2\n            ]) + sampling_offsets / offset_normalizer.astype(sampling_offsets.dtype)\n        elif reference_points.shape[-1] == 4:\n            offset = sampling_offsets * reference_points[:, :, None, :, 2:]\n            num_points_scale = self.num_points_scale.astype(query.dtype).unsqueeze(-1)\n            offset = offset * num_points_scale * self.offset_scale\n            
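# In this 4-d reference-point branch each learned offset is expressed in\n            # units of the reference box: it is scaled by the box size (w, h) from\n            # reference_points[..., 2:], by the per-point factor num_points_scale\n            # (1 / num_points of its level), and by the global offset_scale, then\n            # added to the box center (cx, cy) below. All coordinates stay in the\n            # normalized [0, 1] space consumed by the deformable attention core.\n            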
sampling_locations = reference_points[:, :, None, :, :2] + offset\n        else:\n            raise ValueError(\n                \"Last dim of reference_points must be 2 or 4, but get {} instead.\".\n                format(reference_points.shape[-1]))\n\n        output = self.ms_deformable_attn_core(value, value_spatial_shapes,\n                                              sampling_locations, attention_weights)\n        output = self.output_proj(output)\n\n        return output\n\n\nclass TransformerDecoderLayer(nn.Layer):\n    def __init__(self,\n                 d_model=256,\n                 n_head=8,\n                 dim_feedforward=1024,\n                 dropout=0.,\n                 activation=\"relu\",\n                 n_levels=4,\n                 n_points=4,\n                 sampling_method='default',\n                 weight_attr=None,\n                 bias_attr=None):\n        super(TransformerDecoderLayer, self).__init__()\n\n        # self attention\n        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)\n        self.dropout1 = nn.Dropout(dropout)\n        self.norm1 = nn.LayerNorm(\n            d_model,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n\n        # cross attention\n        self.cross_attn = MSDeformableAttention(\n            d_model, n_head, n_levels, n_points,\n            sampling_method=sampling_method, lr_mult=1.0)\n        self.dropout2 = nn.Dropout(dropout)\n        self.norm2 = nn.LayerNorm(\n            d_model,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n\n        # ffn\n        self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr,\n                                 bias_attr)\n        self.activation = getattr(F, activation)\n        self.dropout3 = nn.Dropout(dropout)\n        self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr,\n                                 bias_attr)\n        self.dropout4 = nn.Dropout(dropout)\n        self.norm3 = nn.LayerNorm(\n            d_model,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        linear_init_(self.linear1)\n        linear_init_(self.linear2)\n        xavier_uniform_(self.linear1.weight)\n        xavier_uniform_(self.linear2.weight)\n\n    def with_pos_embed(self, tensor, pos):\n        return tensor if pos is None else tensor + pos\n\n    def forward_ffn(self, tgt):\n        return self.linear2(self.dropout3(self.activation(self.linear1(tgt))))\n\n    def forward(self,\n                tgt,\n                reference_points,\n                memory,\n                memory_spatial_shapes,\n                memory_level_start_index,\n                attn_mask=None,\n                memory_mask=None,\n                query_pos_embed=None):\n        # self attention\n        q = k = self.with_pos_embed(tgt, query_pos_embed)\n        if attn_mask is not None:\n            attn_mask = paddle.where(\n                attn_mask.astype('bool'),\n                paddle.zeros(attn_mask.shape, tgt.dtype),\n                paddle.full(attn_mask.shape, float(\"-inf\"), tgt.dtype))\n        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask)\n        tgt = tgt + self.dropout1(tgt2)\n        tgt = self.norm1(tgt)\n\n        # cross attention\n        tgt2 = 
self.cross_attn(\n            self.with_pos_embed(tgt, query_pos_embed), reference_points, memory,\n            memory_spatial_shapes, memory_mask)\n        tgt = tgt + self.dropout2(tgt2)\n        tgt = self.norm2(tgt)\n\n        # ffn\n        tgt2 = self.forward_ffn(tgt)\n        tgt = tgt + self.dropout4(tgt2)\n        tgt = self.norm3(tgt)\n\n        return tgt\n\n\n@register\nclass RTDETRTransformerv2(nn.Layer):\n    __shared__ = ['num_classes', 'hidden_dim', 'eval_size']\n\n    def __init__(self,\n                 num_classes=80,\n                 hidden_dim=256,\n                 num_queries=300,\n                 position_embed_type='sine',\n                 backbone_feat_channels=[512, 1024, 2048],\n                 feat_strides=[8, 16, 32],\n                 num_levels=3,\n                 num_decoder_points=4,\n                 nhead=8,\n                 num_decoder_layers=6,\n                 dim_feedforward=1024,\n                 dropout=0.,\n                 activation=\"relu\",\n                 num_denoising=100,\n                 label_noise_ratio=0.5,\n                 box_noise_scale=1.0,\n                 learnt_init_query=True,\n                 query_pos_head_inv_sig=False,\n                 eval_size=None,\n                 eval_idx=-1,\n                 eps=1e-2,\n                 cross_attn_sampling_method='default'):\n        super(RTDETRTransformerv2, self).__init__()\n        assert position_embed_type in ['sine', 'learned'], \\\n            f'ValueError: position_embed_type not supported {position_embed_type}!'\n        assert len(backbone_feat_channels) <= num_levels\n        assert len(feat_strides) == len(backbone_feat_channels)\n        for _ in range(num_levels - len(feat_strides)):\n            feat_strides.append(feat_strides[-1] * 2)\n\n        self.hidden_dim = hidden_dim\n        self.nhead = nhead\n        self.feat_strides = feat_strides\n        self.num_levels = num_levels\n        self.num_classes = num_classes\n        self.num_queries = num_queries\n        self.eps = eps\n        self.num_decoder_layers = num_decoder_layers\n        self.eval_size = eval_size\n\n        assert cross_attn_sampling_method in ['default', 'discrete'], NotImplementedError\n        self.cross_attn_sampling_method = cross_attn_sampling_method\n\n        # backbone feature projection\n        self._build_input_proj_layer(backbone_feat_channels)\n\n        # Transformer module\n        decoder_layer = TransformerDecoderLayer(\n            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,\n            num_decoder_points, sampling_method=cross_attn_sampling_method)\n        self.decoder = TransformerDecoder(hidden_dim, decoder_layer,\n                                          num_decoder_layers, eval_idx)\n\n        # denoising part\n        self.denoising_class_embed = nn.Embedding(\n            num_classes,\n            hidden_dim,\n            weight_attr=ParamAttr(initializer=nn.initializer.Normal()))\n        self.num_denoising = num_denoising\n        self.label_noise_ratio = label_noise_ratio\n        self.box_noise_scale = box_noise_scale\n\n        # decoder embedding\n        self.learnt_init_query = learnt_init_query\n        if learnt_init_query:\n            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)\n        self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2)\n        self.query_pos_head_inv_sig = query_pos_head_inv_sig\n\n        # encoder head\n        self.enc_output = nn.Sequential(\n            
nn.Linear(hidden_dim, hidden_dim),\n            nn.LayerNorm(\n                hidden_dim,\n                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                bias_attr=ParamAttr(regularizer=L2Decay(0.0))))\n        self.enc_score_head = nn.Linear(hidden_dim, num_classes)\n        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)\n\n        # decoder head\n        self.dec_score_head = nn.LayerList([\n            nn.Linear(hidden_dim, num_classes)\n            for _ in range(num_decoder_layers)\n        ])\n        self.dec_bbox_head = nn.LayerList([\n            MLP(hidden_dim, hidden_dim, 4, num_layers=3)\n            for _ in range(num_decoder_layers)\n        ])\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        # class and bbox head init\n        bias_cls = bias_init_with_prob(0.01)\n        linear_init_(self.enc_score_head)\n        constant_(self.enc_score_head.bias, bias_cls)\n        constant_(self.enc_bbox_head.layers[-1].weight)\n        constant_(self.enc_bbox_head.layers[-1].bias)\n        for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):\n            linear_init_(cls_)\n            constant_(cls_.bias, bias_cls)\n            constant_(reg_.layers[-1].weight)\n            constant_(reg_.layers[-1].bias)\n\n        linear_init_(self.enc_output[0])\n        xavier_uniform_(self.enc_output[0].weight)\n        if self.learnt_init_query:\n            xavier_uniform_(self.tgt_embed.weight)\n        xavier_uniform_(self.query_pos_head.layers[0].weight)\n        xavier_uniform_(self.query_pos_head.layers[1].weight)\n        for l in self.input_proj:\n            xavier_uniform_(l[0].weight)\n\n        # init encoder output anchors and valid_mask\n        if self.eval_size:\n            self.anchors, self.valid_mask = self._generate_anchors()\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'backbone_feat_channels': [i.channels for i in input_shape]}\n\n    def _build_input_proj_layer(self, backbone_feat_channels):\n        self.input_proj = nn.LayerList()\n        for in_channels in backbone_feat_channels:\n            self.input_proj.append(\n                nn.Sequential(\n                    ('conv', nn.Conv2D(\n                        in_channels,\n                        self.hidden_dim,\n                        kernel_size=1,\n                        bias_attr=False)), ('norm', nn.BatchNorm2D(\n                            self.hidden_dim,\n                            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                            bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))\n        in_channels = backbone_feat_channels[-1]\n        for _ in range(self.num_levels - len(backbone_feat_channels)):\n            self.input_proj.append(\n                nn.Sequential(\n                    ('conv', nn.Conv2D(\n                        in_channels,\n                        self.hidden_dim,\n                        kernel_size=3,\n                        stride=2,\n                        padding=1,\n                        bias_attr=False)), ('norm', nn.BatchNorm2D(\n                            self.hidden_dim,\n                            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                            bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))\n            in_channels = self.hidden_dim\n\n    def _get_encoder_input(self, feats):\n        # get projection features\n        proj_feats = [self.input_proj[i](feat) for i, feat in 
enumerate(feats)]\n        if self.num_levels > len(proj_feats):\n            len_srcs = len(proj_feats)\n            for i in range(len_srcs, self.num_levels):\n                if i == len_srcs:\n                    proj_feats.append(self.input_proj[i](feats[-1]))\n                else:\n                    proj_feats.append(self.input_proj[i](proj_feats[-1]))\n\n        # get encoder inputs\n        feat_flatten = []\n        spatial_shapes = []\n        level_start_index = [0, ]\n        for i, feat in enumerate(proj_feats):\n            _, _, h, w = feat.shape\n            # [b, c, h, w] -> [b, h*w, c]\n            feat_flatten.append(feat.flatten(2).transpose([0, 2, 1]))\n            # [num_levels, 2]\n            spatial_shapes.append([h, w])\n            # [l], start index of each level\n            level_start_index.append(h * w + level_start_index[-1])\n\n        # [b, l, c]\n        feat_flatten = paddle.concat(feat_flatten, 1)\n        level_start_index.pop()\n        return (feat_flatten, spatial_shapes, level_start_index)\n\n    def forward(self, feats, pad_mask=None, gt_meta=None, is_teacher=False):\n        # input projection and embedding\n        (memory, spatial_shapes,\n         level_start_index) = self._get_encoder_input(feats)\n\n        # prepare denoising training\n        if self.training:\n            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \\\n                get_contrastive_denoising_training_group(gt_meta,\n                                            self.num_classes,\n                                            self.num_queries,\n                                            self.denoising_class_embed.weight,\n                                            self.num_denoising,\n                                            self.label_noise_ratio,\n                                            self.box_noise_scale)\n        else:\n            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None\n\n        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \\\n            self._get_decoder_input(\n            memory, spatial_shapes, denoising_class, denoising_bbox_unact,is_teacher)\n\n        # decoder\n        out_bboxes, out_logits = self.decoder(\n            target,\n            init_ref_points_unact,\n            memory,\n            spatial_shapes,\n            level_start_index,\n            self.dec_bbox_head,\n            self.dec_score_head,\n            self.query_pos_head,\n            attn_mask=attn_mask,\n            memory_mask=None,\n            query_pos_head_inv_sig=self.query_pos_head_inv_sig)\n        return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,\n                dn_meta)\n\n    def _generate_anchors(self,\n                          spatial_shapes=None,\n                          grid_size=0.05,\n                          dtype=\"float32\"):\n        if spatial_shapes is None:\n            spatial_shapes = [\n                [int(self.eval_size[0] / s), int(self.eval_size[1] / s)]\n                for s in self.feat_strides\n            ]\n        anchors = []\n        for lvl, (h, w) in enumerate(spatial_shapes):\n            grid_y, grid_x = paddle.meshgrid(\n                paddle.arange(\n                    end=h, dtype=dtype),\n                paddle.arange(\n                    end=w, dtype=dtype))\n            grid_xy = paddle.stack([grid_x, grid_y], -1)\n\n            valid_WH = paddle.to_tensor([h, w]).astype(dtype)\n            grid_xy = 
(grid_xy.unsqueeze(0) + 0.5) / valid_WH\n            wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl)\n            anchors.append(\n                paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))\n\n        anchors = paddle.concat(anchors, 1)\n        valid_mask = ((anchors > self.eps) *\n                      (anchors < 1 - self.eps)).all(-1, keepdim=True)\n        anchors = paddle.log(anchors / (1 - anchors))\n        anchors = paddle.where(valid_mask, anchors,\n                               paddle.to_tensor(float(\"inf\")))\n        return anchors, valid_mask\n\n    def _get_decoder_input(self,\n                           memory,\n                           spatial_shapes,\n                           denoising_class=None,\n                           denoising_bbox_unact=None,\n                           is_teacher=False):\n        bs, _, _ = memory.shape\n        # prepare input for decoder\n        if self.training or self.eval_size is None or is_teacher:\n            anchors, valid_mask = self._generate_anchors(spatial_shapes)\n        else:\n            anchors, valid_mask = self.anchors, self.valid_mask\n        memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))\n        output_memory = self.enc_output(memory)\n\n        enc_outputs_class = self.enc_score_head(output_memory)\n        enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors\n\n        _, topk_ind = paddle.topk(\n            enc_outputs_class.max(-1), self.num_queries, axis=1)\n        # extract region proposal boxes\n        batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype)\n        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])\n        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)\n\n        reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact,\n                                                  topk_ind)  # unsigmoided.\n        enc_topk_bboxes = F.sigmoid(reference_points_unact)\n        if denoising_bbox_unact is not None:\n            reference_points_unact = paddle.concat(\n                [denoising_bbox_unact, reference_points_unact], 1)\n        if self.training:\n            reference_points_unact = reference_points_unact.detach()\n        enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind)\n\n        # extract region features\n        if self.learnt_init_query:\n            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])\n        else:\n            target = paddle.gather_nd(output_memory, topk_ind)\n            if self.training:\n                target = target.detach()\n        if denoising_class is not None:\n            target = paddle.concat([denoising_class, target], 1)\n\n        return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits\n"
  },
  {
    "path": "ppdet/modeling/transformers/rtdetr_transformerv3.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# Modified from detrex (https://github.com/IDEA-Research/detrex)\n# Copyright 2022 The IDEA Authors. All rights reserved.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\nfrom paddle.regularizer import L2Decay\n\nfrom ppdet.core.workspace import register\nfrom ..layers import MultiHeadAttention\nfrom ..heads.detr_head import MLP\nfrom .deformable_transformer import MSDeformableAttention\nfrom ..initializer import (linear_init_, constant_, xavier_uniform_, normal_,\n                           bias_init_with_prob)\nfrom .utils import (_get_clones, get_sine_pos_embed,\n                    get_contrastive_denoising_training_group, inverse_sigmoid)\n\n__all__ = ['RTDETRTransformerv3']\n\n\nclass PPMSDeformableAttention(MSDeformableAttention):\n    def forward(self,\n                query,\n                reference_points,\n                value,\n                value_spatial_shapes,\n                value_level_start_index,\n                value_mask=None):\n        \"\"\"\n        Args:\n            query (Tensor): [bs, query_length, C]\n            reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),\n                bottom-right (1, 1), including padding area\n            value (Tensor): [bs, value_length, C]\n            value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]\n            value_level_start_index (List): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...]\n            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements\n\n        Returns:\n            output (Tensor): [bs, Length_{query}, C]\n        \"\"\"\n        bs, Len_q = query.shape[:2]\n        Len_v = value.shape[1]\n\n        value = self.value_proj(value)\n        if value_mask is not None:\n            value_mask = value_mask.astype(value.dtype).unsqueeze(-1)\n            value *= value_mask\n        value = value.reshape([bs, Len_v, self.num_heads, self.head_dim])\n\n        sampling_offsets = self.sampling_offsets(query).reshape(\n            [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2])\n        attention_weights = self.attention_weights(query).reshape(\n            [bs, Len_q, self.num_heads, self.num_levels * self.num_points])\n        attention_weights = F.softmax(attention_weights).reshape(\n            [bs, Len_q, self.num_heads, self.num_levels, self.num_points])\n\n        if reference_points.shape[-1] == 2:\n            offset_normalizer = paddle.to_tensor(value_spatial_shapes)\n            offset_normalizer = 
offset_normalizer.flip([1]).reshape(\n                [1, 1, 1, self.num_levels, 1, 2])\n            sampling_locations = reference_points.reshape([\n                bs, Len_q, 1, self.num_levels, 1, 2\n            ]) + sampling_offsets / offset_normalizer\n        elif reference_points.shape[-1] == 4:\n            sampling_locations = (\n                reference_points[:, :, None, :, None, :2] + sampling_offsets /\n                self.num_points * reference_points[:, :, None, :, None, 2:] *\n                0.5)\n        else:\n            raise ValueError(\n                \"Last dim of reference_points must be 2 or 4, but get {} instead.\".\n                format(reference_points.shape[-1]))\n\n        if not isinstance(query, paddle.Tensor):\n            from ppdet.modeling.transformers.utils import deformable_attention_core_func\n            output = deformable_attention_core_func(\n                value, value_spatial_shapes, value_level_start_index,\n                sampling_locations, attention_weights)\n        else:\n            value_spatial_shapes = paddle.to_tensor(value_spatial_shapes)\n            value_level_start_index = paddle.to_tensor(value_level_start_index)\n            output = self.ms_deformable_attn_core(\n                value, value_spatial_shapes, value_level_start_index,\n                sampling_locations, attention_weights)\n        output = self.output_proj(output)\n\n        return output\n\n\nclass TransformerDecoderLayer(nn.Layer):\n    def __init__(self,\n                 d_model=256,\n                 n_head=8,\n                 dim_feedforward=1024,\n                 dropout=0.,\n                 activation=\"relu\",\n                 n_levels=4,\n                 n_points=4,\n                 weight_attr=None,\n                 bias_attr=None):\n        super(TransformerDecoderLayer, self).__init__()\n\n        # self attention\n        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)\n        self.dropout1 = nn.Dropout(dropout)\n        self.norm1 = nn.LayerNorm(\n            d_model,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n\n        # cross attention\n        self.cross_attn = PPMSDeformableAttention(d_model, n_head, n_levels,\n                                                  n_points, 1.0)\n        self.dropout2 = nn.Dropout(dropout)\n        self.norm2 = nn.LayerNorm(\n            d_model,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n\n        # ffn\n        self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr,\n                                 bias_attr)\n        self.activation = getattr(F, activation)\n        self.dropout3 = nn.Dropout(dropout)\n        self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr,\n                                 bias_attr)\n        self.dropout4 = nn.Dropout(dropout)\n        self.norm3 = nn.LayerNorm(\n            d_model,\n            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        linear_init_(self.linear1)\n        linear_init_(self.linear2)\n        xavier_uniform_(self.linear1.weight)\n        xavier_uniform_(self.linear2.weight)\n\n    def with_pos_embed(self, tensor, pos):\n        return tensor if pos is None else tensor + pos\n\n    def forward_ffn(self, tgt):\n      
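  # Position-wise FFN: linear1 -> activation -> dropout3 -> linear2; the\n        # residual connection, dropout4 and norm3 are applied by the caller.\n      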
  return self.linear2(self.dropout3(self.activation(self.linear1(tgt))))\n\n    def forward(self,\n                tgt,\n                reference_points,\n                memory,\n                memory_spatial_shapes,\n                memory_level_start_index,\n                attn_mask=None,\n                memory_mask=None,\n                query_pos_embed=None):\n        # self attention\n        q = k = self.with_pos_embed(tgt, query_pos_embed)\n        if attn_mask is not None:\n            attn_mask = paddle.where(\n                attn_mask.astype('bool'),\n                paddle.zeros(attn_mask.shape, tgt.dtype),\n                paddle.full(attn_mask.shape, float(\"-inf\"), tgt.dtype))\n        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask)\n        tgt = tgt + self.dropout1(tgt2)\n        tgt = self.norm1(tgt)\n\n        # cross attention\n        tgt2 = self.cross_attn(\n            self.with_pos_embed(tgt, query_pos_embed), reference_points, memory,\n            memory_spatial_shapes, memory_level_start_index, memory_mask)\n        tgt = tgt + self.dropout2(tgt2)\n        tgt = self.norm2(tgt)\n\n        # ffn\n        tgt2 = self.forward_ffn(tgt)\n        tgt = tgt + self.dropout4(tgt2)\n        tgt = self.norm3(tgt)\n\n        return tgt\n\n\nclass TransformerDecoder(nn.Layer):\n    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):\n        super(TransformerDecoder, self).__init__()\n        self.layers = _get_clones(decoder_layer, num_layers)\n        self.hidden_dim = hidden_dim\n        self.num_layers = num_layers\n        self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx\n\n    def forward(self,\n                tgt,\n                ref_points_unact,\n                memory,\n                memory_spatial_shapes,\n                memory_level_start_index,\n                bbox_head,\n                score_head,\n                query_pos_head,\n                attn_mask=None,\n                memory_mask=None,\n                query_pos_head_inv_sig=False):\n        output = tgt\n        dec_out_bboxes = []\n        dec_out_logits = []\n        ref_points_detach = F.sigmoid(ref_points_unact)\n        for i, layer in enumerate(self.layers):\n            ref_points_input = ref_points_detach.unsqueeze(2)\n            if not query_pos_head_inv_sig:\n                query_pos_embed = query_pos_head(ref_points_detach)\n            else:\n                query_pos_embed = query_pos_head(\n                    inverse_sigmoid(ref_points_detach))\n\n            output = layer(output, ref_points_input, memory,\n                           memory_spatial_shapes, memory_level_start_index,\n                           attn_mask, memory_mask, query_pos_embed)\n\n            inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid(\n                ref_points_detach))\n\n            if self.training:\n                dec_out_logits.append(score_head[i](output))\n                if i == 0:\n                    dec_out_bboxes.append(inter_ref_bbox)\n                else:\n                    dec_out_bboxes.append(\n                        F.sigmoid(bbox_head[i](output) + inverse_sigmoid(\n                            ref_points)))\n            elif i == self.eval_idx:\n                dec_out_logits.append(score_head[i](output))\n                dec_out_bboxes.append(inter_ref_bbox)\n                break\n\n            ref_points = inter_ref_bbox\n            ref_points_detach = inter_ref_bbox.detach(\n            ) if 
self.training else inter_ref_bbox\n\n        return paddle.stack(dec_out_bboxes), paddle.stack(dec_out_logits)\n\n\n@register\nclass RTDETRTransformerv3(nn.Layer):\n    __shared__ = ['num_classes', 'hidden_dim', 'eval_size',\n                  'o2m_branch', 'num_queries_o2m']\n\n    def __init__(self,\n                 num_classes=80,\n                 hidden_dim=256,\n                 num_queries=300,\n                 position_embed_type='sine',\n                 backbone_feat_channels=[512, 1024, 2048],\n                 feat_strides=[8, 16, 32],\n                 num_levels=3,\n                 num_decoder_points=4,\n                 nhead=8,\n                 num_decoder_layers=6,\n                 dim_feedforward=1024,\n                 dropout=0.,\n                 activation=\"relu\",\n                 num_denoising=100,\n                 label_noise_ratio=0.5,\n                 box_noise_scale=1.0,\n                 learnt_init_query=True,\n                 query_pos_head_inv_sig=False,\n                 eval_size=None,\n                 eval_idx=-1,\n                 num_noises=0,\n                 num_noise_queries=[],\n                 num_noise_denoising=100,\n                 o2m_branch=False,\n                 num_queries_o2m=450,\n                 eps=1e-2):\n        super(RTDETRTransformerv3, self).__init__()\n        assert position_embed_type in ['sine', 'learned'], \\\n            f'ValueError: position_embed_type not supported {position_embed_type}!'\n        assert len(backbone_feat_channels) <= num_levels\n        assert len(feat_strides) == len(backbone_feat_channels)\n        assert len(num_noise_queries) == num_noises\n        for _ in range(num_levels - len(feat_strides)):\n            feat_strides.append(feat_strides[-1] * 2)\n\n        self.hidden_dim = hidden_dim\n        self.nhead = nhead\n        self.feat_strides = feat_strides\n        self.num_levels = num_levels\n        self.num_classes = num_classes\n        self.num_queries = [num_queries]\n        self.eps = eps\n        self.num_decoder_layers = num_decoder_layers\n        self.eval_size = eval_size\n\n        self.num_noises = num_noises\n        self.num_noise_denoising = num_noise_denoising\n        self.num_groups = 1\n        if num_noises > 0:\n            self.num_queries.extend(num_noise_queries)\n            self.num_groups += num_noises\n        \n        self.o2m_branch = o2m_branch\n        self.num_queries_o2m = num_queries_o2m\n        if o2m_branch:\n            self.num_queries.append(num_queries_o2m)\n            self.num_groups += 1\n\n        # backbone feature projection\n        self._build_input_proj_layer(backbone_feat_channels)\n\n        # Transformer module\n        decoder_layer = TransformerDecoderLayer(\n            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,\n            num_decoder_points)\n        self.decoder = TransformerDecoder(hidden_dim, decoder_layer,\n                                          num_decoder_layers, eval_idx)\n\n        # denoising part\n        self.denoising_class_embed = nn.Embedding(\n            num_classes,\n            hidden_dim,\n            weight_attr=ParamAttr(initializer=nn.initializer.Normal()))\n        self.num_denoising = num_denoising\n        self.label_noise_ratio = label_noise_ratio\n        self.box_noise_scale = box_noise_scale\n\n        # decoder embedding\n        self.learnt_init_query = learnt_init_query\n        if learnt_init_query:\n            self.tgt_embed = nn.Embedding(num_queries, 
hidden_dim)\n        self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2)\n        self.query_pos_head_inv_sig = query_pos_head_inv_sig\n\n        # encoder head\n        self.enc_output = nn.LayerList([\n            nn.Sequential(\n                nn.Linear(hidden_dim, hidden_dim),\n                nn.LayerNorm(\n                    hidden_dim,\n                    weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                    bias_attr=ParamAttr(regularizer=L2Decay(0.0))))\n            for _ in range(self.num_groups)\n        ])\n        self.enc_score_head = nn.LayerList([\n            nn.Linear(hidden_dim, num_classes)\n            for _ in range(self.num_groups)\n        ])\n        self.enc_bbox_head = nn.LayerList([\n            MLP(hidden_dim, hidden_dim, 4, num_layers=3)\n            for _ in range(self.num_groups)\n        ])\n\n        self.map_memory = nn.Sequential(\n            nn.Linear(hidden_dim, hidden_dim),\n            nn.LayerNorm(\n                hidden_dim,\n                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                bias_attr=ParamAttr(regularizer=L2Decay(0.0)))\n            )\n\n        # decoder head\n        self.dec_score_head = nn.LayerList([\n            nn.Linear(hidden_dim, num_classes)\n            for _ in range(num_decoder_layers)\n        ])\n        self.dec_bbox_head = nn.LayerList([\n            MLP(hidden_dim, hidden_dim, 4, num_layers=3)\n            for _ in range(num_decoder_layers)\n        ])\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        # class and bbox head init\n        bias_cls = bias_init_with_prob(0.01)\n        for enc_score_head in self.enc_score_head:\n            linear_init_(enc_score_head)\n            constant_(enc_score_head.bias, bias_cls)\n        for enc_bbox_head in self.enc_bbox_head:\n            constant_(enc_bbox_head.layers[-1].weight)\n            constant_(enc_bbox_head.layers[-1].bias)\n        for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):\n            linear_init_(cls_)\n            constant_(cls_.bias, bias_cls)\n            constant_(reg_.layers[-1].weight)\n            constant_(reg_.layers[-1].bias)\n\n        for enc_output in self.enc_output:\n            linear_init_(enc_output[0])\n            xavier_uniform_(enc_output[0].weight)\n        linear_init_(self.map_memory[0])\n        xavier_uniform_(self.map_memory[0].weight)\n\n        if self.learnt_init_query:\n            xavier_uniform_(self.tgt_embed.weight)\n        xavier_uniform_(self.query_pos_head.layers[0].weight)\n        xavier_uniform_(self.query_pos_head.layers[1].weight)\n        for l in self.input_proj:\n            xavier_uniform_(l[0].weight)\n\n        # init encoder output anchors and valid_mask\n        if self.eval_size:\n            self.anchors, self.valid_mask = self._generate_anchors()\n\n    @classmethod\n    def from_config(cls, cfg, input_shape):\n        return {'backbone_feat_channels': [i.channels for i in input_shape]}\n\n    def _build_input_proj_layer(self, backbone_feat_channels):\n        self.input_proj = nn.LayerList()\n        for in_channels in backbone_feat_channels:\n            self.input_proj.append(\n                nn.Sequential(\n                    ('conv', nn.Conv2D(\n                        in_channels,\n                        self.hidden_dim,\n                        kernel_size=1,\n                        bias_attr=False)), ('norm', nn.BatchNorm2D(\n                            self.hidden_dim,\n               
             weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                            bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))\n        in_channels = backbone_feat_channels[-1]\n        for _ in range(self.num_levels - len(backbone_feat_channels)):\n            self.input_proj.append(\n                nn.Sequential(\n                    ('conv', nn.Conv2D(\n                        in_channels,\n                        self.hidden_dim,\n                        kernel_size=3,\n                        stride=2,\n                        padding=1,\n                        bias_attr=False)), ('norm', nn.BatchNorm2D(\n                            self.hidden_dim,\n                            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),\n                            bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))\n            in_channels = self.hidden_dim\n\n    def _get_encoder_input(self, feats):\n        # get projection features\n        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]\n        if self.num_levels > len(proj_feats):\n            len_srcs = len(proj_feats)\n            for i in range(len_srcs, self.num_levels):\n                if i == len_srcs:\n                    proj_feats.append(self.input_proj[i](feats[-1]))\n                else:\n                    proj_feats.append(self.input_proj[i](proj_feats[-1]))\n\n        # get encoder inputs\n        feat_flatten = []\n        spatial_shapes = []\n        level_start_index = [0, ]\n        for i, feat in enumerate(proj_feats):\n            _, _, h, w = feat.shape\n            # [b, c, h, w] -> [b, h*w, c]\n            feat_flatten.append(feat.flatten(2).transpose([0, 2, 1]))\n            # [num_levels, 2]\n            spatial_shapes.append([h, w])\n            # [l], start index of each level\n            level_start_index.append(h * w + level_start_index[-1])\n\n        # [b, l, c]\n        feat_flatten = paddle.concat(feat_flatten, 1)\n        level_start_index.pop()\n        return (feat_flatten, spatial_shapes, level_start_index)\n\n    def forward(self, feats, pad_mask=None, gt_meta=None, is_teacher=False):\n        # input projection and embedding\n        (memory, spatial_shapes,\n         level_start_index) = self._get_encoder_input(feats)\n\n        # prepare denoising training\n        if self.training:\n            denoising_classes, denoising_bbox_unacts, attn_masks, dn_metas = [], [], [], []\n            for g_id in range(self.num_noises + 1):\n                if g_id == 0:\n                    num_denoising = self.num_denoising\n                else:\n                    num_denoising = self.num_noise_denoising\n                denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \\\n                    get_contrastive_denoising_training_group(gt_meta,\n                                                self.num_classes,\n                                                self.num_queries[g_id],\n                                                self.denoising_class_embed.weight,\n                                                num_denoising,\n                                                self.label_noise_ratio,\n                                                self.box_noise_scale)\n                denoising_classes.append(denoising_class)\n                denoising_bbox_unacts.append(denoising_bbox_unact)\n                attn_masks.append(attn_mask)\n                dn_metas.append(dn_meta)\n        else:\n            denoising_classes, denoising_bbox_unacts, attn_masks, 
dn_metas = None, None, None, None\n\n        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \\\n            self._get_decoder_input(\n                memory, spatial_shapes, denoising_classes, denoising_bbox_unacts, is_teacher)\n\n        # multi group noise attention\n        if self.training:\n            new_size = target.shape[1]\n            new_attn_mask = paddle.ones([new_size, new_size]) < 0\n            begin, end = 0, 0\n            mask = None\n            for g_id in range(self.num_groups):\n                new_mask = paddle.rand([self.num_queries[g_id], self.num_queries[g_id]])\n                if self.o2m_branch and g_id == self.num_groups - 1:\n                    end = end + self.num_queries_o2m\n                    new_mask = new_mask >= 0.0\n                    new_attn_mask[begin: end, begin: end] = new_mask\n                else:\n                    end = end + attn_masks[g_id].shape[1]\n                    dn_size, q_size = dn_metas[g_id]['dn_num_split']\n                    if g_id > 0:\n                        new_mask = new_mask > 0.1\n                    else:\n                        new_mask = new_mask >= 0.0\n                    attn_masks[g_id][dn_size: dn_size + q_size, dn_size: dn_size + q_size] = new_mask\n                    new_attn_mask[begin: end, begin: end] = attn_masks[g_id]\n                begin = end\n            attn_masks = new_attn_mask\n\n        # decoder\n        out_bboxes, out_logits = self.decoder(\n            target,\n            init_ref_points_unact,\n            memory,\n            spatial_shapes,\n            level_start_index,\n            self.dec_bbox_head,\n            self.dec_score_head,\n            self.query_pos_head,\n            attn_mask=attn_masks,\n            memory_mask=None,\n            query_pos_head_inv_sig=self.query_pos_head_inv_sig)\n        return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,\n                dn_metas)\n\n    def _generate_anchors(self,\n                          spatial_shapes=None,\n                          grid_size=0.05,\n                          dtype=\"float32\"):\n        if spatial_shapes is None:\n            spatial_shapes = [\n                [int(self.eval_size[0] / s), int(self.eval_size[1] / s)]\n                for s in self.feat_strides\n            ]\n        anchors = []\n        for lvl, (h, w) in enumerate(spatial_shapes):\n            grid_y, grid_x = paddle.meshgrid(\n                paddle.arange(\n                    end=h, dtype=dtype),\n                paddle.arange(\n                    end=w, dtype=dtype))\n            grid_xy = paddle.stack([grid_x, grid_y], -1)\n\n            valid_WH = paddle.to_tensor([h, w]).astype(dtype)\n            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH\n            wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl)\n            anchors.append(\n                paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))\n\n        anchors = paddle.concat(anchors, 1)\n        valid_mask = ((anchors > self.eps) *\n                      (anchors < 1 - self.eps)).all(-1, keepdim=True)\n        anchors = paddle.log(anchors / (1 - anchors))\n        anchors = paddle.where(valid_mask, anchors,\n                               paddle.to_tensor(float(\"inf\")))\n        return anchors, valid_mask\n\n    def _get_decoder_input(self,\n                           memory,\n                           spatial_shapes,\n                           denoising_classes=None,\n                           
denoising_bbox_unacts=None,\n                           is_teacher=False):\n        bs, _, _ = memory.shape\n        # prepare input for decoder\n        if self.training or self.eval_size is None or is_teacher:\n            anchors, valid_mask = self._generate_anchors(spatial_shapes)\n        else:\n            anchors, valid_mask = self.anchors, self.valid_mask\n        memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))\n        map_memory = self.map_memory(memory.detach())\n        targets, reference_points_unacts, enc_topk_bboxes, enc_topk_logits = [], [], [], []\n\n        for g_id in range(self.num_groups):\n            output_memory = self.enc_output[g_id](memory)\n            enc_outputs_class = self.enc_score_head[g_id](output_memory)\n            enc_outputs_coord_unact = self.enc_bbox_head[g_id](output_memory) + anchors\n\n            _, topk_ind = paddle.topk(\n                enc_outputs_class.max(-1), self.num_queries[g_id], axis=1)\n            # extract region proposal boxes\n            batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype)\n            batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries[g_id]])\n            topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)\n\n            reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact, topk_ind)  # unsigmoided.\n            enc_topk_bbox = F.sigmoid(reference_points_unact)\n            enc_topk_logit = paddle.gather_nd(enc_outputs_class, topk_ind)\n\n            if denoising_bbox_unacts is not None and not (self.o2m_branch and g_id == self.num_groups - 1):\n                reference_points_unact = paddle.concat(\n                    [denoising_bbox_unacts[g_id], reference_points_unact], 1)\n            if self.training:\n                reference_points_unact = reference_points_unact.detach()\n\n            # extract region features\n            if self.learnt_init_query:\n                target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])\n            else:\n                if g_id == 0:\n                    target = paddle.gather_nd(output_memory, topk_ind)\n                    if self.training:\n                        target = target.detach()\n                else:\n                    target = paddle.gather_nd(map_memory, topk_ind)\n            if denoising_classes is not None and not (self.o2m_branch and g_id == self.num_groups - 1):\n                target = paddle.concat([denoising_classes[g_id], target], 1)\n            \n            if not self.training:\n                return target, reference_points_unact, enc_topk_bbox, enc_topk_logit\n            \n            targets.append(target)\n            reference_points_unacts.append(reference_points_unact)\n            enc_topk_bboxes.append(enc_topk_bbox)\n            enc_topk_logits.append(enc_topk_logit)\n\n        targets = paddle.concat(targets, 1)\n        reference_points_unacts = paddle.concat(reference_points_unacts, 1)\n        enc_topk_bboxes = paddle.concat(enc_topk_bboxes, 1)\n        enc_topk_logits = paddle.concat(enc_topk_logits, 1)\n        return targets, reference_points_unacts, enc_topk_bboxes, enc_topk_logits\n"
  },
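  {
    "path": "examples/sketches/anchor_logit_space_sketch.py",
    "content": "# Illustrative sketch, NOT part of the library (file name and all sizes are\n# hypothetical). It replays the anchor construction used by `_generate_anchors`\n# in the transformer head above for one 2x2 feature level: anchors are the\n# normalized cell centers plus a level-scaled width/height, moved to logit\n# (inverse-sigmoid) space so the decoder can refine them additively before a\n# final sigmoid.\nimport paddle\n\nh, w, lvl, grid_size, eps = 2, 2, 0, 0.05, 1e-2\ngrid_y, grid_x = paddle.meshgrid(\n    paddle.arange(end=h, dtype='float32'),\n    paddle.arange(end=w, dtype='float32'))\ngrid_xy = paddle.stack([grid_x, grid_y], -1)\n# normalized cell centers; the toy map is square, so the [h, w] vs [w, h]\n# normalization order makes no difference here\ngrid_xy = (grid_xy.unsqueeze(0) + 0.5) / paddle.to_tensor([h, w], 'float32')\nwh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl)\nanchors = paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])\n# keep anchors strictly inside the image, then map to logit space\nvalid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True)\nanchors = paddle.log(anchors / (1 - anchors))\nanchors = paddle.where(valid_mask, anchors, paddle.to_tensor(float('inf')))\nprint(anchors.shape, valid_mask.shape)  # [1, 4, 4] [1, 4, 1]\n"
  },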
  {
    "path": "ppdet/modeling/transformers/utils.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# Modified from DETR (https://github.com/facebookresearch/detr)\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n# Modified from detrex (https://github.com/IDEA-Research/detrex)\n# Copyright 2022 The IDEA Authors. All rights reserved.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport copy\nimport math\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\n\nfrom ..bbox_utils import bbox_overlaps\n\n__all__ = [\n    '_get_clones', 'bbox_overlaps', 'bbox_cxcywh_to_xyxy',\n    'bbox_xyxy_to_cxcywh', 'sigmoid_focal_loss', 'inverse_sigmoid',\n    'deformable_attention_core_func', 'varifocal_loss_with_logits'\n]\n\n\ndef _get_clones(module, N):\n    return nn.LayerList([copy.deepcopy(module) for _ in range(N)])\n\n\ndef bbox_cxcywh_to_xyxy(x):\n    cxcy, wh = paddle.split(x, 2, axis=-1)\n    return paddle.concat([cxcy - 0.5 * wh, cxcy + 0.5 * wh], axis=-1)\n\n\ndef bbox_xyxy_to_cxcywh(x):\n    x1, y1, x2, y2 = x.split(4, axis=-1)\n    return paddle.concat([(x1 + x2) / 2, (y1 + y2) / 2, (x2 - x1), (y2 - y1)],\n                         axis=-1)\n\n\ndef sigmoid_focal_loss(logit, label, normalizer=1.0, alpha=0.25, gamma=2.0):\n    prob = F.sigmoid(logit)\n    ce_loss = F.binary_cross_entropy_with_logits(logit, label, reduction=\"none\")\n    p_t = prob * label + (1 - prob) * (1 - label)\n    loss = ce_loss * ((1 - p_t)**gamma)\n\n    if alpha >= 0:\n        alpha_t = alpha * label + (1 - alpha) * (1 - label)\n        loss = alpha_t * loss\n    return loss.mean(1).sum() / normalizer\n\n\ndef inverse_sigmoid(x, eps=1e-5):\n    x = x.clip(min=0., max=1.)\n    return paddle.log(x.clip(min=eps) / (1 - x).clip(min=eps))\n\n\ndef deformable_attention_core_func(value, value_spatial_shapes,\n                                   value_level_start_index, sampling_locations,\n                                   attention_weights):\n    \"\"\"\n    Args:\n        value (Tensor): [bs, value_length, n_head, c]\n        value_spatial_shapes (Tensor|List): [n_levels, 2]\n        value_level_start_index (Tensor|List): [n_levels]\n        sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2]\n        attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points]\n\n    Returns:\n        output (Tensor): [bs, Length_{query}, C]\n    \"\"\"\n    bs, _, n_head, c = value.shape\n    _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape\n\n    split_shape = [h * w for h, w in value_spatial_shapes]\n    value_list = value.split(split_shape, axis=1)\n    sampling_grids = 2 * sampling_locations - 1\n    sampling_value_list = []\n    for level, (h, w) in enumerate(value_spatial_shapes):\n        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_\n        value_l_ = value_list[level].flatten(2).transpose([0, 
2, 1]).reshape(\n            [bs * n_head, c, h, w])\n        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2\n        sampling_grid_l_ = sampling_grids[:, :, :,\n                                          level].transpose([0, 2, 1, 3,\n                                                            4]).flatten(0, 1)\n        # N_*M_, D_, Lq_, P_\n        sampling_value_l_ = F.grid_sample(value_l_,\n                                          sampling_grid_l_,\n                                          mode='bilinear',\n                                          padding_mode='zeros',\n                                          align_corners=False)\n        sampling_value_list.append(sampling_value_l_)\n    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_)\n    attention_weights = attention_weights.transpose([0, 2, 1, 3, 4]).reshape(\n        [bs * n_head, 1, Len_q, n_levels * n_points])\n    output = (paddle.stack(sampling_value_list, axis=-2).flatten(-2) *\n              attention_weights).sum(-1).reshape([bs, n_head * c, Len_q])\n\n    return output.transpose([0, 2, 1])\n\n\ndef discrete_sample(x, grid):\n    \"\"\"\n    Args:\n        x (Tensor): [N, C, H, W]\n        grid (Tensor): [N, grid_H, grid_W, 2]\n    Returns:\n        output (Tensor): [N, C, grid_H, grid_W]\n    \"\"\"\n    N, C, H, W = x.shape\n    _, grid_H, grid_W, _ = grid.shape\n    spatial_shape = paddle.to_tensor([[W, H]], dtype=paddle.float32)\n    index = (grid * spatial_shape + 0.5).astype(paddle.int64).flatten(1, 2)\n    h_index = index[:, :, 1].clip(0, H - 1)\n    w_index = index[:, :, 0].clip(0, W - 1)\n    batch_index = paddle.arange(N).unsqueeze(-1).tile([1, grid_H * grid_W])\n    output = x[batch_index, :, h_index, w_index]\n    output = output.transpose([0, 2, 1]).reshape([N, C, grid_H, grid_W])\n    return output\n\n\ndef deformable_attention_core_func_v2(value,\n                                      value_spatial_shapes,\n                                      sampling_locations,\n                                      attention_weights,\n                                      num_points_list,\n                                      sampling_method='default'):\n    \"\"\"\n    Args:\n        value (Tensor): [batch_num, value_len, num_heads, head_dim]\n        value_spatial_shapes (Tensor|List): [n_levels, 2]\n        sampling_locations (Tensor): [batch_num, query_len, num_heads, total_num_points, 2]\n        attention_weights (Tensor): [batch_num, query_len, num_heads, total_num_points]\n        num_points_list (List): The number of sampling points at each level\n        sampling_method (str): 'default' (grid_sample) or 'discrete' (discrete_sample)\n\n    Returns:\n        output (Tensor): [batch_num, query_len, num_heads * head_dim]\n    \"\"\"\n    assert sampling_method in ['default', 'discrete'], NotImplementedError\n    batch_num, _, num_heads, head_dim = value.shape\n    query_len = sampling_locations.shape[1]\n    num_levels = len(num_points_list)\n\n    value = value.transpose([0, 2, 3, 1]).flatten(0, 1)\n    split_shape = [h * w for h, w in value_spatial_shapes]\n    value_list = value.split(split_shape, axis=-1)\n    value_list = [\n        value.reshape([batch_num * num_heads, head_dim, h, w])\n        for value, (h, w) in zip(value_list, value_spatial_shapes)\n    ]\n\n    if sampling_method == 'default':\n        sampling_grids = 2 * sampling_locations - 1\n    else:\n        sampling_grids = sampling_locations\n\n    sampling_grids = 
sampling_grids.transpose([0, 2, 1, 3, 4]).flatten(0, 1)\n    sampling_grids_list = sampling_grids.split(num_points_list, axis=-2)\n\n    sampling_value_list = []\n    for idx in range(num_levels):\n        # value_list[idx]: [batch_num * num_heads, head_dim, h, w]\n        # sampling_grids_list[idx]: [batch_num * num_heads, query_len, num_points, 2]\n        # _sampling_value: [batch_num * num_heads, head_dim, query_len, num_points]\n        if sampling_method == 'default':\n            _sampling_value = F.grid_sample(value_list[idx],\n                                            sampling_grids_list[idx],\n                                            mode='bilinear',\n                                            padding_mode='zeros',\n                                            align_corners=False)\n        else:\n            _sampling_value = discrete_sample(value_list[idx],\n                                              sampling_grids_list[idx])\n        sampling_value_list.append(_sampling_value)\n\n    attn_weights = attention_weights.transpose([0, 2, 1, 3])\n    attn_weights = attn_weights.flatten(0, 1).unsqueeze(1)\n    sampling_value = paddle.concat(sampling_value_list, axis=-1)\n    # attn_weights: [batch_num * num_heads, 1, query_len, total_num_points]\n    # sampling_value: [batch_num * num_heads, head_dim, query_len, total_num_points]\n    # output: [batch_num * num_heads, head_dim, query_len]\n    output = (sampling_value * attn_weights).sum(-1)\n    output = output.reshape([batch_num, num_heads * head_dim, query_len])\n    return output.transpose([0, 2, 1])\n\n\ndef get_valid_ratio(mask):\n    _, H, W = mask.shape\n    valid_ratio_h = paddle.sum(mask[:, :, 0], 1) / H\n    valid_ratio_w = paddle.sum(mask[:, 0, :], 1) / W\n    # [b, 2]\n    return paddle.stack([valid_ratio_w, valid_ratio_h], -1)\n\n\ndef get_denoising_training_group(targets,\n                                 num_classes,\n                                 num_queries,\n                                 class_embed,\n                                 num_denoising=100,\n                                 label_noise_ratio=0.5,\n                                 box_noise_scale=1.0):\n    if num_denoising <= 0:\n        return None, None, None, None\n    num_gts = [len(t) for t in targets[\"gt_class\"]]\n    max_gt_num = max(num_gts)\n    if max_gt_num == 0:\n        return None, None, None, None\n\n    num_group = num_denoising // max_gt_num\n    num_group = 1 if num_group == 0 else num_group\n    # pad gt to max_num of a batch\n    bs = len(targets[\"gt_class\"])\n    input_query_class = paddle.full([bs, max_gt_num],\n                                    num_classes,\n                                    dtype='int32')\n    input_query_bbox = paddle.zeros([bs, max_gt_num, 4])\n    pad_gt_mask = paddle.zeros([bs, max_gt_num])\n    for i in range(bs):\n        num_gt = num_gts[i]\n        if num_gt > 0:\n            input_query_class[i, :num_gt] = targets[\"gt_class\"][i].squeeze(-1)\n            input_query_bbox[i, :num_gt] = targets[\"gt_bbox\"][i]\n            pad_gt_mask[i, :num_gt] = 1\n\n    input_query_class = input_query_class.tile([1, num_group])\n    input_query_bbox = input_query_bbox.tile([1, num_group, 1])\n    pad_gt_mask = pad_gt_mask.tile([1, num_group])\n\n    dn_positive_idx = paddle.nonzero(pad_gt_mask)[:, 1]\n    dn_positive_idx = paddle.split(dn_positive_idx,\n                                   [n * num_group for n in num_gts])\n    # total denoising queries\n    num_denoising = int(max_gt_num * 
num_group)\n\n    if label_noise_ratio > 0:\n        input_query_class = input_query_class.flatten()\n        pad_gt_mask = pad_gt_mask.flatten()\n        # half of bbox prob, cast mask from bool to float because dtype promotion\n        # between bool and float is not supported in static mode.\n        mask = paddle.cast(\n            paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5),\n            paddle.float32)\n        chosen_idx = paddle.nonzero(mask * pad_gt_mask).squeeze(-1)\n        # randomly put a new one here\n        new_label = paddle.randint_like(chosen_idx,\n                                        0,\n                                        num_classes,\n                                        dtype=input_query_class.dtype)\n        input_query_class.scatter_(chosen_idx, new_label)\n        input_query_class.reshape_([bs, num_denoising])\n        pad_gt_mask.reshape_([bs, num_denoising])\n\n    if box_noise_scale > 0:\n        diff = paddle.concat(\n            [input_query_bbox[..., 2:] * 0.5, input_query_bbox[..., 2:]],\n            axis=-1) * box_noise_scale\n        diff *= (paddle.rand(input_query_bbox.shape) * 2.0 - 1.0)\n        input_query_bbox += diff\n        input_query_bbox = inverse_sigmoid(input_query_bbox)\n\n    class_embed = paddle.concat(\n        [class_embed, paddle.zeros([1, class_embed.shape[-1]])])\n    input_query_class = paddle.gather(class_embed,\n                                      input_query_class.flatten(),\n                                      axis=0).reshape([bs, num_denoising, -1])\n\n    tgt_size = num_denoising + num_queries\n    attn_mask = paddle.ones([tgt_size, tgt_size]) < 0\n    # match query cannot see the reconstruction\n    attn_mask[num_denoising:, :num_denoising] = True\n    # reconstruct cannot see each other\n    for i in range(num_group):\n        if i == 0:\n            attn_mask[max_gt_num * i:max_gt_num * (i + 1),\n                      max_gt_num * (i + 1):num_denoising] = True\n        if i == num_group - 1:\n            attn_mask[max_gt_num * i:max_gt_num * (i + 1), :max_gt_num *\n                      i] = True\n        else:\n            attn_mask[max_gt_num * i:max_gt_num * (i + 1),\n                      max_gt_num * (i + 1):num_denoising] = True\n            attn_mask[max_gt_num * i:max_gt_num * (i + 1), :max_gt_num *\n                      i] = True\n    attn_mask = ~attn_mask\n    dn_meta = {\n        \"dn_positive_idx\": dn_positive_idx,\n        \"dn_num_group\": num_group,\n        \"dn_num_split\": [num_denoising, num_queries]\n    }\n\n    return input_query_class, input_query_bbox, attn_mask, dn_meta\n\n\ndef get_contrastive_denoising_training_group(targets,\n                                             num_classes,\n                                             num_queries,\n                                             class_embed,\n                                             num_denoising=100,\n                                             label_noise_ratio=0.5,\n                                             box_noise_scale=1.0):\n    if num_denoising <= 0:\n        return None, None, None, None\n    # listcomp is not well-supported in SOT mode for now.\n    num_gts = []\n    for t in targets[\"gt_class\"]:\n        num_gts.append(len(t))\n    max_gt_num = max(num_gts)\n    if max_gt_num == 0:\n        return None, None, None, None\n\n    num_group = num_denoising // max_gt_num\n    num_group = 1 if num_group == 0 else num_group\n    # pad gt to max_num of a batch\n    bs = 
len(targets[\"gt_class\"])\n    input_query_class = paddle.full([bs, max_gt_num],\n                                    num_classes,\n                                    dtype='int32')\n    input_query_bbox = paddle.zeros([bs, max_gt_num, 4])\n    pad_gt_mask = paddle.zeros([bs, max_gt_num])\n    for i in range(bs):\n        num_gt = num_gts[i]\n        if num_gt > 0:\n            input_query_class[i, :num_gt] = targets[\"gt_class\"][i].squeeze(-1)\n            input_query_bbox[i, :num_gt] = targets[\"gt_bbox\"][i]\n            pad_gt_mask[i, :num_gt] = 1\n    # each group has positive and negative queries.\n    input_query_class = input_query_class.tile([1, 2 * num_group])\n    input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1])\n    pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group])\n    # positive and negative mask\n    negative_gt_mask = paddle.zeros([bs, max_gt_num * 2, 1])\n    negative_gt_mask[:, max_gt_num:] = 1\n    negative_gt_mask = negative_gt_mask.tile([1, num_group, 1])\n    positive_gt_mask = 1 - negative_gt_mask\n    # contrastive denoising training positive index\n    positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask\n    dn_positive_idx = paddle.nonzero(positive_gt_mask)[:, 1]\n    dn_positive_idx = paddle.split(dn_positive_idx,\n                                   [n * num_group for n in num_gts])\n    # total denoising queries\n    num_denoising = int(max_gt_num * 2 * num_group)\n\n    if label_noise_ratio > 0:\n        input_query_class = input_query_class.flatten()\n        pad_gt_mask = pad_gt_mask.flatten()\n        # half of bbox prob\n        mask = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5)\n        chosen_idx = paddle.nonzero(mask.cast(pad_gt_mask.dtype) *\n                                    pad_gt_mask).squeeze(-1)\n        # randomly put a new one here\n        new_label = paddle.randint_like(chosen_idx,\n                                        0,\n                                        num_classes,\n                                        dtype=input_query_class.dtype)\n        input_query_class.scatter_(chosen_idx, new_label)\n        input_query_class.reshape_([bs, num_denoising])\n        pad_gt_mask.reshape_([bs, num_denoising])\n\n    if box_noise_scale > 0:\n        known_bbox = bbox_cxcywh_to_xyxy(input_query_bbox)\n\n        diff = paddle.tile(input_query_bbox[..., 2:] * 0.5,\n                           [1, 1, 2]) * box_noise_scale\n\n        rand_sign = paddle.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0\n        rand_part = paddle.rand(input_query_bbox.shape)\n        rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (\n            1 - negative_gt_mask)\n        rand_part *= rand_sign\n        known_bbox += rand_part * diff\n        known_bbox.clip_(min=0.0, max=1.0)\n        input_query_bbox = bbox_xyxy_to_cxcywh(known_bbox)\n        input_query_bbox = inverse_sigmoid(input_query_bbox)\n\n    class_embed = paddle.concat(\n        [class_embed, paddle.zeros([1, class_embed.shape[-1]])])\n    input_query_class = paddle.gather(class_embed,\n                                      input_query_class.flatten(),\n                                      axis=0).reshape([bs, num_denoising, -1])\n\n    tgt_size = num_denoising + num_queries\n    attn_mask = paddle.ones([tgt_size, tgt_size]) < 0\n    # match query cannot see the reconstruction\n    attn_mask[num_denoising:, :num_denoising] = True\n    # reconstruct cannot see each other\n    for i in range(num_group):\n        if i == 0:\n            
attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1),\n                      max_gt_num * 2 * (i + 1):num_denoising] = True\n        if i == num_group - 1:\n            attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), :max_gt_num *\n                      i * 2] = True\n        else:\n            attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1),\n                      max_gt_num * 2 * (i + 1):num_denoising] = True\n            attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), :max_gt_num *\n                      2 * i] = True\n    attn_mask = ~attn_mask\n    dn_meta = {\n        \"dn_positive_idx\": dn_positive_idx,\n        \"dn_num_group\": num_group,\n        \"dn_num_split\": [num_denoising, num_queries]\n    }\n\n    return input_query_class, input_query_bbox, attn_mask, dn_meta\n\n\ndef get_sine_pos_embed(pos_tensor,\n                       num_pos_feats=128,\n                       temperature=10000,\n                       exchange_xy=True):\n    \"\"\"generate sine position embedding from a position tensor\n\n    Args:\n        pos_tensor (Tensor): Shape as `(None, n)`.\n        num_pos_feats (int): projected shape for each float in the tensor. Default: 128\n        temperature (int): The temperature used for scaling\n            the position embedding. Default: 10000.\n        exchange_xy (bool, optional): exchange pos x and pos y. \\\n            For example, input tensor is `[x, y]`, the results will  # noqa\n            be `[pos(y), pos(x)]`. Defaults: True.\n\n    Returns:\n        Tensor: Returned position embedding  # noqa\n        with shape `(None, n * num_pos_feats)`.\n    \"\"\"\n    scale = 2. * math.pi\n    dim_t = 2. * paddle.floor_divide(paddle.arange(num_pos_feats),\n                                     paddle.to_tensor(2))\n    dim_t = scale / temperature**(dim_t / num_pos_feats)\n\n    def sine_func(x):\n        x *= dim_t\n        return paddle.stack((x[:, :, 0::2].sin(), x[:, :, 1::2].cos()),\n                            axis=3).flatten(2)\n\n    pos_res = [sine_func(x) for x in pos_tensor.split(pos_tensor.shape[-1], -1)]\n    if exchange_xy:\n        pos_res[0], pos_res[1] = pos_res[1], pos_res[0]\n    pos_res = paddle.concat(pos_res, axis=2)\n    return pos_res\n\n\ndef mask_to_box_coordinate(mask,\n                           normalize=False,\n                           format=\"xyxy\",\n                           dtype=\"float32\"):\n    \"\"\"\n    Compute the bounding boxes around the provided mask.\n    Args:\n        mask (Tensor:bool): [b, c, h, w]\n\n    Returns:\n        bbox (Tensor): [b, c, 4]\n    \"\"\"\n    assert mask.ndim == 4\n    assert format in [\"xyxy\", \"xywh\"]\n\n    h, w = mask.shape[-2:]\n    y, x = paddle.meshgrid(paddle.arange(end=h, dtype=dtype),\n                           paddle.arange(end=w, dtype=dtype))\n\n    x_mask = x * mask.astype(x.dtype)\n    x_max = x_mask.flatten(-2).max(-1) + 1\n    x_min = paddle.where(mask.astype(bool), x_mask,\n                         paddle.to_tensor(1e8)).flatten(-2).min(-1)\n\n    y_mask = y * mask.astype(y.dtype)\n    y_max = y_mask.flatten(-2).max(-1) + 1\n    y_min = paddle.where(mask.astype(bool), y_mask,\n                         paddle.to_tensor(1e8)).flatten(-2).min(-1)\n    out_bbox = paddle.stack([x_min, y_min, x_max, y_max], axis=-1)\n    mask = mask.any(axis=[2, 3]).unsqueeze(2)\n    out_bbox = out_bbox * mask.astype(out_bbox.dtype)\n    if normalize:\n        out_bbox /= paddle.to_tensor([w, h, w, h]).astype(dtype)\n\n    return out_bbox if format == \"xyxy\" else 
bbox_xyxy_to_cxcywh(out_bbox)\n\n\ndef varifocal_loss_with_logits(pred_logits,\n                               gt_score,\n                               label,\n                               normalizer=1.0,\n                               alpha=0.75,\n                               gamma=2.0):\n    pred_score = F.sigmoid(pred_logits)\n    weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label\n    loss = F.binary_cross_entropy_with_logits(pred_logits,\n                                              gt_score,\n                                              weight=weight,\n                                              reduction='none')\n    return loss.mean(1).sum() / normalizer\n"
  },
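  {
    "path": "examples/sketches/deformable_attention_shapes_sketch.py",
    "content": "# Illustrative sketch, NOT part of the library (file name and all sizes are\n# made up): a shape-only smoke test for `deformable_attention_core_func` from\n# ppdet/modeling/transformers/utils.py. Attention weights are softmax-\n# normalized over all levels and points, as the attention module does.\nimport paddle\nimport paddle.nn.functional as F\n\nfrom ppdet.modeling.transformers.utils import deformable_attention_core_func\n\nbs, n_head, c = 2, 8, 32                 # batch, heads, per-head channels\nspatial_shapes = [[16, 16], [8, 8]]      # two pyramid levels\nlevel_start_index = [0, 16 * 16]\nvalue_len = sum(h * w for h, w in spatial_shapes)\nlen_q, n_levels, n_points = 100, 2, 4\n\nvalue = paddle.rand([bs, value_len, n_head, c])\nsampling_locations = paddle.rand([bs, len_q, n_head, n_levels, n_points, 2])\nattention_weights = F.softmax(\n    paddle.rand([bs, len_q, n_head, n_levels * n_points]), -1).reshape(\n        [bs, len_q, n_head, n_levels, n_points])\n\nout = deformable_attention_core_func(value, spatial_shapes,\n                                     level_start_index, sampling_locations,\n                                     attention_weights)\nprint(out.shape)  # [2, 100, 256] == [bs, len_q, n_head * c]\n"
  },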
  {
    "path": "ppdet/optimizer/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom . import optimizer\nfrom . import ema\n\nfrom .optimizer import *\nfrom .ema import *\n"
  },
  {
    "path": "ppdet/optimizer/adamw.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nfrom paddle.optimizer import AdamW\nfrom functools import partial\nimport re\n\nIS_PADDLE_LATER_2_4 = (\n    int(paddle.version.major) >= 2 and\n    int(paddle.version.minor) >= 4) or int(paddle.version.major) == 0\n\n\ndef layerwise_lr_decay(decay_rate, name_dict, n_layers, param):\n    \"\"\"\n    Args:\n        decay_rate (float): \n            The layer-wise decay ratio.\n        name_dict (dict): \n            The keys of name_dict is dynamic name of model while the value\n            of name_dict is static name.\n            Use model.named_parameters() to get name_dict.\n        n_layers (int):\n            Total number of layers in the transformer encoder.\n    \"\"\"\n    ratio = 1.0\n    static_name = name_dict[param.name]\n    if 'blocks.' in static_name or 'layers.' in static_name:\n        idx_1 = static_name.find('blocks.')\n        idx_2 = static_name.find('layers.')\n        assert any([x >= 0 for x in [idx_1, idx_2]]), ''\n        idx = idx_1 if idx_1 >= 0 else idx_2\n        # idx = re.findall('[blocks|layers]\\.(\\d+)\\.', static_name)[0]\n\n        layer = int(static_name[idx:].split('.')[1])\n        ratio = decay_rate**(n_layers - layer)\n\n    elif 'cls_token' in static_name or 'patch_embed' in static_name or 'pos_embed' in static_name:\n        ratio = decay_rate**(n_layers + 1)\n\n    if IS_PADDLE_LATER_2_4:\n        return ratio\n    else:\n        param.optimize_attr['learning_rate'] *= ratio\n\n\nclass AdamWDL(AdamW):\n    r\"\"\"\n    The AdamWDL optimizer is implemented based on the AdamW Optimization with dynamic lr setting.\n    Generally it's used for transformer model.\n\n    We use \"layerwise_lr_decay\" as default dynamic lr setting method of AdamWDL.\n    “Layer-wise decay” means exponentially decaying the learning rates of individual \n    layers in a top-down manner. For example, suppose the 24-th layer uses a learning\n    rate l, and the Layer-wise decay rate is α, then the learning rate of layer m \n    is lα^(24-m). See more details on: https://arxiv.org/abs/1906.08237.\n\n    .. math::\n        & t = t + 1\n    \n        & moment\\_1\\_out = {\\beta}_1 * moment\\_1 + (1 - {\\beta}_1) * grad\n\n        & moment\\_2\\_out = {\\beta}_2 * moment\\_2 + (1 - {\\beta}_2) * grad * grad\n\n        & learning\\_rate = learning\\_rate * \\frac{\\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}\n\n        & param\\_out = param - learning\\_rate * (\\frac{moment\\_1}{\\sqrt{moment\\_2} + \\epsilon} + \\lambda * param)\n\n    Args:\n        learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``.\n            It can be a float value or a LRScheduler. 
The default value is 0.001.\n        beta1 (float, optional): The exponential decay rate for the 1st moment estimates.\n            It should be a float number or a Tensor with shape [1] and data type as float32.\n            The default value is 0.9.\n        beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.\n            It should be a float number or a Tensor with shape [1] and data type as float32.\n            The default value is 0.999.\n        epsilon (float, optional): A small float value for numerical stability.\n            It should be a float number or a Tensor with shape [1] and data type as float32.\n            The default value is 1e-08.\n        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \\\n            This parameter is required in dygraph mode. \\\n            The default value is None in static mode, at this time all parameters will be updated.\n        weight_decay (float, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01.\n        apply_decay_param_fun (function|None, optional): If it is not None,\n            only tensors for which apply_decay_param_fun(Tensor.name)==True\n            will be updated with weight decay. It only works when we want to specify tensors.\n            Default: None.\n        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of\n            some derived class of ``GradientClipBase`` . There are three clipping strategies\n            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,\n            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.\n        lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.\n            The accumulators are updated at every step. Every element of the two moving averages\n            is updated in both dense mode and sparse mode. If the size of the parameter is very large,\n            then the update may be very slow. The lazy mode only updates the elements that have\n            gradients in the current mini-batch, so it will be much faster. But this mode has\n            different semantics from the original Adam algorithm and may lead to different results.\n            The default value is False.\n        multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is False.\n        layerwise_decay (float, optional): The layer-wise decay ratio. Defaults to 1.0.\n        n_layers (int, optional): The total number of encoder layers. Defaults to 12.\n        set_param_lr_func (function|None, optional): If it is not None, set_param_lr_func() will set the parameter\n            learning rate before it executes the Adam operator. Defaults to :ref:`layerwise_lr_decay`.\n        name_dict (dict, optional): A dict mapping each parameter's `param.name` to the structured\n            name given by model.named_parameters().\n        name (str, optional): Normally there is no need for user to set this property.\n            For more information, please refer to :ref:`api_guide_Name`.\n            The default value is None.\n\n    Examples:\n        .. 
code-block:: python\n\n            import paddle\n            from paddlenlp.ops.optimizer import AdamWDL\n            def simple_lr_setting(decay_rate, name_dict, n_layers, param):\n                ratio = 1.0\n                static_name = name_dict[param.name]\n                if \"weight\" in static_name:\n                    ratio = decay_rate**0.5\n                param.optimize_attr[\"learning_rate\"] *= ratio\n            \n            linear = paddle.nn.Linear(10, 10)\n\n            name_dict = dict()\n            for n, p in linear.named_parameters():\n                name_dict[p.name] = n\n\n            inp = paddle.rand([10,10], dtype=\"float32\")\n            out = linear(inp)\n            loss = paddle.mean(out)\n\n            adamwdl = AdamWDL(\n                learning_rate=1e-4,\n                parameters=linear.parameters(),\n                set_param_lr_func=simple_lr_setting,\n                layerwise_decay=0.8,\n                name_dict=name_dict)\n            \n            loss.backward()\n            adamwdl.step()\n            adamwdl.clear_grad()\n    \"\"\"\n\n    def __init__(self,\n                 learning_rate=0.001,\n                 beta1=0.9,\n                 beta2=0.999,\n                 epsilon=1e-8,\n                 parameters=None,\n                 weight_decay=0.01,\n                 apply_decay_param_fun=None,\n                 grad_clip=None,\n                 lazy_mode=False,\n                 multi_precision=False,\n                 layerwise_decay=1.0,\n                 n_layers=12,\n                 set_param_lr_func=None,\n                 name_dict=None,\n                 name=None):\n        if not isinstance(layerwise_decay, float):\n            raise TypeError(\"layerwise_decay should be a float.\")\n        self.layerwise_decay = layerwise_decay\n        self.n_layers = n_layers\n        self.set_param_lr_func = partial(\n            set_param_lr_func, layerwise_decay, name_dict,\n            n_layers) if set_param_lr_func is not None else set_param_lr_func\n\n        if IS_PADDLE_LATER_2_4:\n            super(AdamWDL, self).__init__(\n                learning_rate=learning_rate,\n                parameters=parameters,\n                beta1=beta1,\n                beta2=beta2,\n                epsilon=epsilon,\n                grad_clip=grad_clip,\n                name=name,\n                apply_decay_param_fun=apply_decay_param_fun,\n                weight_decay=weight_decay,\n                lazy_mode=lazy_mode,\n                multi_precision=multi_precision,\n                lr_ratio=self.set_param_lr_func)\n        else:\n            super(AdamWDL, self).__init__(\n                learning_rate=learning_rate,\n                parameters=parameters,\n                beta1=beta1,\n                beta2=beta2,\n                epsilon=epsilon,\n                grad_clip=grad_clip,\n                name=name,\n                apply_decay_param_fun=apply_decay_param_fun,\n                weight_decay=weight_decay,\n                lazy_mode=lazy_mode,\n                multi_precision=multi_precision)\n\n\ndef _append_optimize_op(self, block, param_and_grad):\n    if self.set_param_lr_func is None:\n        return super(AdamWDL, self)._append_optimize_op(block, param_and_grad)\n\n    self._append_decoupled_weight_decay(block, param_and_grad)\n    prev_lr = param_and_grad[0].optimize_attr[\"learning_rate\"]\n    self.set_param_lr_func(param_and_grad[0])\n    # execute Adam op\n    res = super(AdamW, self)._append_optimize_op(block, 
param_and_grad)\n    param_and_grad[0].optimize_attr[\"learning_rate\"] = prev_lr\n    return res\n\n\nif not IS_PADDLE_LATER_2_4:\n    AdamWDL._append_optimize_op = _append_optimize_op\n\n\ndef build_adamwdl(model,\n                  lr=1e-4,\n                  weight_decay=0.05,\n                  betas=(0.9, 0.999),\n                  layer_decay=0.65,\n                  num_layers=None,\n                  filter_bias_and_bn=True,\n                  skip_decay_names=None,\n                  set_param_lr_func='layerwise_lr_decay'):\n\n    # keep `decay_dict` defined even when no skip list is given, otherwise\n    # the `decay_dict is not None` check below raises a NameError\n    decay_dict = None\n    if skip_decay_names and filter_bias_and_bn:\n        decay_dict = {\n            param.name: not (len(param.shape) == 1 or name.endswith('.bias') or\n                             any([_n in name for _n in skip_decay_names]))\n            for name, param in model.named_parameters()\n        }\n        parameters = [p for p in model.parameters()]\n\n    else:\n        parameters = model.parameters()\n\n    opt_args = dict(\n        parameters=parameters, learning_rate=lr, weight_decay=weight_decay)\n\n    if decay_dict is not None:\n        opt_args['apply_decay_param_fun'] = lambda n: decay_dict[n]\n\n    if isinstance(set_param_lr_func, str):\n        set_param_lr_func = eval(set_param_lr_func)\n        opt_args['set_param_lr_func'] = set_param_lr_func\n\n    opt_args['beta1'] = betas[0]\n    opt_args['beta2'] = betas[1]\n\n    opt_args['layerwise_decay'] = layer_decay\n    name_dict = {p.name: n for n, p in model.named_parameters()}\n\n    opt_args['name_dict'] = name_dict\n    opt_args['n_layers'] = num_layers\n\n    optimizer = AdamWDL(**opt_args)\n\n    return optimizer\n"
  },
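  {
    "path": "examples/sketches/adamwdl_usage_sketch.py",
    "content": "# Illustrative sketch, NOT part of the library (the toy model, depths and\n# hyper-parameters are invented): wiring `build_adamwdl` from\n# ppdet/optimizer/adamw.py to a transformer-like model. Parameters that live\n# under `blocks.<i>` get lr scaled by layer_decay**(num_layers - i), while\n# `patch_embed` gets the strongest decay, layer_decay**(num_layers + 1).\nimport paddle\n\nfrom ppdet.optimizer.adamw import build_adamwdl\n\n\nclass TinyBackbone(paddle.nn.Layer):\n    def __init__(self):\n        super(TinyBackbone, self).__init__()\n        self.patch_embed = paddle.nn.Linear(16, 32)\n        self.blocks = paddle.nn.LayerList(\n            [paddle.nn.Linear(32, 32) for _ in range(2)])\n        self.head = paddle.nn.Linear(32, 4)\n\n    def forward(self, x):\n        x = self.patch_embed(x)\n        for blk in self.blocks:\n            x = blk(x)\n        return self.head(x)\n\n\nmodel = TinyBackbone()\nopt = build_adamwdl(model, lr=1e-4, layer_decay=0.75, num_layers=2,\n                    skip_decay_names=['head'])  # no weight decay on the head\n\nloss = model(paddle.rand([2, 16])).mean()\nloss.backward()\nopt.step()\nopt.clear_grad()\n"
  },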
  {
    "path": "ppdet/optimizer/ema.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport math\nimport paddle\nimport weakref\nfrom copy import deepcopy\n\nfrom .utils import get_bn_running_state_names\n\n__all__ = ['ModelEMA', 'SimpleModelEMA']\n\n\nclass ModelEMA(object):\n    \"\"\"\n    Exponential Weighted Average for Deep Neutal Networks\n    Args:\n        model (nn.Layer): Detector of model.\n        decay (int):  The decay used for updating ema parameter.\n            Ema's parameter are updated with the formula:\n           `ema_param = decay * ema_param + (1 - decay) * cur_param`.\n            Defaults is 0.9998.\n        ema_decay_type (str): type in ['threshold', 'normal', 'exponential'],\n            'threshold' as default.\n        cycle_epoch (int): The epoch of interval to reset ema_param and\n            step. Defaults is -1, which means not reset. Its function is to\n            add a regular effect to ema, which is set according to experience\n            and is effective when the total training epoch is large.\n        ema_black_list (set|list|tuple, optional): The custom EMA black_list.\n            Blacklist of weight names that will not participate in EMA\n            calculation. 
Default: None.\n    \"\"\"\n\n    def __init__(self,\n                 model,\n                 decay=0.9998,\n                 ema_decay_type='threshold',\n                 cycle_epoch=-1,\n                 ema_black_list=None,\n                 ema_filter_no_grad=False):\n        self.step = 0\n        self.epoch = 0\n        self.decay = decay\n        self.ema_decay_type = ema_decay_type\n        self.cycle_epoch = cycle_epoch\n        self.ema_black_list = self._match_ema_black_list(\n            model.state_dict().keys(), ema_black_list)\n        bn_states_names = get_bn_running_state_names(model)\n        if ema_filter_no_grad:\n            for n, p in model.named_parameters():\n                if p.stop_gradient and n not in bn_states_names:\n                    self.ema_black_list.add(n)\n\n        self.state_dict = dict()\n        for k, v in model.state_dict().items():\n            if k in self.ema_black_list:\n                self.state_dict[k] = v\n            else:\n                self.state_dict[k] = paddle.zeros_like(v, dtype='float32')\n\n        self._model_state = {\n            k: weakref.ref(p)\n            for k, p in model.state_dict().items()\n        }\n\n    def reset(self):\n        self.step = 0\n        self.epoch = 0\n        for k, v in self.state_dict.items():\n            if k in self.ema_black_list:\n                self.state_dict[k] = v\n            else:\n                self.state_dict[k] = paddle.zeros_like(v)\n\n    def resume(self, state_dict, step=0):\n        for k, v in state_dict.items():\n            if k in self.state_dict:\n                if self.state_dict[k].dtype == v.dtype:\n                    self.state_dict[k] = v\n                else:\n                    self.state_dict[k] = v.astype(self.state_dict[k].dtype)\n        self.step = step\n\n    def update(self, model=None):\n        if self.ema_decay_type == 'threshold':\n            decay = min(self.decay, (1 + self.step) / (10 + self.step))\n        elif self.ema_decay_type == 'exponential':\n            decay = self.decay * (1 - math.exp(-(self.step + 1) / 2000))\n        else:\n            decay = self.decay\n        self._decay = decay\n\n        if model is not None:\n            model_dict = model.state_dict()\n        else:\n            model_dict = {k: p() for k, p in self._model_state.items()}\n            assert all(\n                [v is not None for _, v in model_dict.items()]), 'python gc.'\n\n        for k, v in self.state_dict.items():\n            if k not in self.ema_black_list:\n                v = decay * v + (1 - decay) * model_dict[k].astype('float32')\n                v.stop_gradient = True\n                self.state_dict[k] = v\n        self.step += 1\n\n    def apply(self):\n        if self.step == 0:\n            return self.state_dict\n        state_dict = dict()\n        model_dict = {k: p() for k, p in self._model_state.items()}\n        for k, v in self.state_dict.items():\n            if k in self.ema_black_list:\n                v.stop_gradient = True\n                state_dict[k] = v\n            else:\n                if self.ema_decay_type != 'exponential':\n                    v = v / (1 - self._decay**self.step)\n                    v = v.astype(model_dict[k].dtype)\n                v.stop_gradient = True\n                state_dict[k] = v\n        self.epoch += 1\n        if self.cycle_epoch > 0 and self.epoch == self.cycle_epoch:\n            self.reset()\n\n        return state_dict\n\n    def _match_ema_black_list(self, weight_name, 
ema_black_list=None):\n        out_list = set()\n        if ema_black_list:\n            for name in weight_name:\n                for key in ema_black_list:\n                    if key in name:\n                        out_list.add(name)\n        return out_list\n\n\nclass SimpleModelEMA(object):\n    \"\"\"\n    Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models\n    Keep a moving average of everything in the model state_dict (parameters and buffers).\n    This is intended to allow functionality like\n    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage\n    A smoothed version of the weights is necessary for some training schemes to perform well.\n    This class is sensitive to where it is initialized in the sequence of model init,\n    GPU assignment and distributed training wrappers.\n    \"\"\"\n\n    def __init__(self, model=None, decay=0.9996):\n        \"\"\"\n        Args:\n            model (nn.Layer): model to apply EMA.\n            decay (float): ema decay rate.\n        \"\"\"\n        self.model = deepcopy(model)\n        self.decay = decay\n\n    def update(self, model, decay=None):\n        if decay is None:\n            decay = self.decay\n\n        with paddle.no_grad():\n            state = {}\n            msd = model.state_dict()\n            for k, v in self.model.state_dict().items():\n                if paddle.is_floating_point(v):\n                    v *= decay\n                    v += (1.0 - decay) * msd[k].detach()\n                state[k] = v\n            self.model.set_state_dict(state)\n\n    def resume(self, state_dict, step=0):\n        state = {}\n        msd = state_dict\n        for k, v in self.model.state_dict().items():\n            if paddle.is_floating_point(v):\n                v = msd[k].detach()\n            state[k] = v\n        self.model.set_state_dict(state)\n        self.step = step\n"
  },
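  {
    "path": "examples/sketches/model_ema_usage_sketch.py",
    "content": "# Illustrative sketch, NOT part of the library (layer and decay are toy\n# values): a minimal ModelEMA round trip with ppdet/optimizer/ema.py. With\n# ema_decay_type='threshold' the effective decay per step is\n# min(decay, (1 + step) / (10 + step)), and `apply` divides by\n# (1 - decay**step) to bias-correct the zero-initialized average.\nimport paddle\n\nfrom ppdet.optimizer.ema import ModelEMA\n\nlayer = paddle.nn.Linear(4, 4)\nema = ModelEMA(layer, decay=0.9, ema_decay_type='threshold')\n\nfor _ in range(5):\n    # stand-in for an optimizer step that moves the weights\n    layer.weight.set_value(layer.weight + 0.01)\n    ema.update()\n\nema_state = ema.apply()  # bias-corrected averaged weights\neval_layer = paddle.nn.Linear(4, 4)\neval_layer.set_state_dict(ema_state)\n"
  },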
  {
    "path": "ppdet/optimizer/optimizer.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport re\nimport sys\nimport math\nimport paddle\nimport paddle.nn as nn\n\nimport paddle.optimizer as optimizer\nimport paddle.regularizer as regularizer\n\nfrom ppdet.core.workspace import register, serializable\nimport copy\n\nfrom .adamw import AdamWDL, build_adamwdl\n\n__all__ = ['LearningRate', 'OptimizerBuilder']\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n\n@serializable\nclass CosineDecay(object):\n    \"\"\"\n    Cosine learning rate decay\n\n    Args:\n        max_epochs (int): max epochs for the training process.\n            if you commbine cosine decay with warmup, it is recommended that\n            the max_iters is much larger than the warmup iter\n        use_warmup (bool): whether to use warmup. Default: True.\n        min_lr_ratio (float): minimum learning rate ratio. Default: 0.\n        last_plateau_epochs (int): use minimum learning rate in\n            the last few epochs. Default: 0.\n    \"\"\"\n\n    def __init__(self,\n                 max_epochs=1000,\n                 use_warmup=True,\n                 min_lr_ratio=0.,\n                 last_plateau_epochs=0):\n        self.max_epochs = max_epochs\n        self.use_warmup = use_warmup\n        self.min_lr_ratio = min_lr_ratio\n        self.last_plateau_epochs = last_plateau_epochs\n\n    def __call__(self,\n                 base_lr=None,\n                 boundary=None,\n                 value=None,\n                 step_per_epoch=None):\n        assert base_lr is not None, \"either base LR or values should be provided\"\n\n        max_iters = self.max_epochs * int(step_per_epoch)\n        last_plateau_iters = self.last_plateau_epochs * int(step_per_epoch)\n        min_lr = base_lr * self.min_lr_ratio\n        if boundary is not None and value is not None and self.use_warmup:\n            # use warmup\n            warmup_iters = len(boundary)\n            for i in range(int(boundary[-1]), max_iters):\n                boundary.append(i)\n                if i < max_iters - last_plateau_iters:\n                    decayed_lr = min_lr + (base_lr - min_lr) * 0.5 * (math.cos(\n                        (i - warmup_iters) * math.pi /\n                        (max_iters - warmup_iters - last_plateau_iters)) + 1)\n                    value.append(decayed_lr)\n                else:\n                    value.append(min_lr)\n            return optimizer.lr.PiecewiseDecay(boundary, value)\n        elif last_plateau_iters > 0:\n            # not use warmup, but set `last_plateau_epochs` > 0\n            boundary = []\n            value = []\n            for i in range(max_iters):\n                if i < max_iters - last_plateau_iters:\n                    decayed_lr = min_lr + (base_lr - min_lr) * 0.5 * (math.cos(\n                        i 
* math.pi / (max_iters - last_plateau_iters)) + 1)\n                    value.append(decayed_lr)\n                else:\n                    value.append(min_lr)\n                if i > 0:\n                    boundary.append(i)\n            return optimizer.lr.PiecewiseDecay(boundary, value)\n\n        return optimizer.lr.CosineAnnealingDecay(\n            base_lr, T_max=max_iters, eta_min=min_lr)\n\n\n@serializable\nclass PiecewiseDecay(object):\n    \"\"\"\n    Multi step learning rate decay\n\n    Args:\n        gamma (float | list): decay factor\n        milestones (list): steps at which to decay learning rate\n    \"\"\"\n\n    def __init__(self,\n                 gamma=[0.1, 0.01],\n                 milestones=[8, 11],\n                 values=None,\n                 use_warmup=True):\n        super(PiecewiseDecay, self).__init__()\n        if type(gamma) is not list:\n            self.gamma = []\n            for i in range(len(milestones)):\n                self.gamma.append(gamma / 10**i)\n        else:\n            self.gamma = gamma\n        self.milestones = milestones\n        self.values = values\n        self.use_warmup = use_warmup\n\n    def __call__(self,\n                 base_lr=None,\n                 boundary=None,\n                 value=None,\n                 step_per_epoch=None):\n        if boundary is not None and self.use_warmup:\n            boundary.extend([int(step_per_epoch) * i for i in self.milestones])\n        else:\n            # do not use LinearWarmup\n            boundary = [int(step_per_epoch) * i for i in self.milestones]\n            value = [base_lr]  # lr is base_lr during steps [0, boundary[0])\n\n        # self.values is set directly in the config\n        if self.values is not None:\n            assert len(self.milestones) + 1 == len(self.values)\n            return optimizer.lr.PiecewiseDecay(boundary, self.values)\n\n        # value is computed by self.gamma\n        value = value if value is not None else [base_lr]\n        for i in self.gamma:\n            value.append(base_lr * i)\n\n        return optimizer.lr.PiecewiseDecay(boundary, value)\n\n\n@serializable\nclass LinearWarmup(object):\n    \"\"\"\n    Warm up learning rate linearly\n\n    Args:\n        steps (int): warm up steps\n        start_factor (float): initial learning rate factor\n        epochs (int|None): use epochs as warm up steps, the priority\n            of `epochs` is higher than `steps`. Default: None.\n    \"\"\"\n\n    def __init__(self, steps=500, start_factor=1. 
/ 3, epochs=None, epochs_first=True):\n        super(LinearWarmup, self).__init__()\n        self.steps = steps\n        self.start_factor = start_factor\n        self.epochs = epochs\n        self.epochs_first = epochs_first\n\n    def __call__(self, base_lr, step_per_epoch):\n        boundary = []\n        value = []\n        if self.epochs_first and self.epochs is not None:\n            warmup_steps = self.epochs * step_per_epoch\n        else:\n            warmup_steps = self.steps\n        warmup_steps = max(warmup_steps, 1)\n        for i in range(warmup_steps + 1):\n            if warmup_steps > 0:\n                alpha = i / warmup_steps\n                factor = self.start_factor * (1 - alpha) + alpha\n                lr = base_lr * factor\n                value.append(lr)\n            if i > 0:\n                boundary.append(i)\n        return boundary, value\n\n\n@serializable\nclass ExpWarmup(object):\n    \"\"\"\n    Warm up learning rate in exponential mode\n    Args:\n        steps (int): warm up steps.\n        epochs (int|None): use epochs as warm up steps, the priority\n            of `epochs` is higher than `steps`. Default: None.\n        power (int): Exponential coefficient. Default: 2.\n    \"\"\"\n\n    def __init__(self, steps=1000, epochs=None, power=2):\n        super(ExpWarmup, self).__init__()\n        self.steps = steps\n        self.epochs = epochs\n        self.power = power\n\n    def __call__(self, base_lr, step_per_epoch):\n        boundary = []\n        value = []\n        warmup_steps = self.epochs * step_per_epoch if self.epochs is not None else self.steps\n        warmup_steps = max(warmup_steps, 1)\n        for i in range(warmup_steps + 1):\n            factor = (i / float(warmup_steps))**self.power\n            value.append(base_lr * factor)\n            if i > 0:\n                boundary.append(i)\n        return boundary, value\n\n\n@register\nclass LearningRate(object):\n    \"\"\"\n    Learning Rate configuration\n\n    Args:\n        base_lr (float): base learning rate\n        schedulers (list): learning rate schedulers\n    \"\"\"\n    __category__ = 'optim'\n\n    def __init__(self,\n                 base_lr=0.01,\n                 schedulers=[PiecewiseDecay(), LinearWarmup()]):\n        super(LearningRate, self).__init__()\n        self.base_lr = base_lr\n        self.schedulers = []\n\n        schedulers = copy.deepcopy(schedulers)\n        for sched in schedulers:\n            if isinstance(sched, dict):\n                # support instantiating a scheduler from a dict config\n                module = sys.modules[__name__]\n                type = sched.pop(\"name\")\n                scheduler = getattr(module, type)(**sched)\n                self.schedulers.append(scheduler)\n            else:\n                self.schedulers.append(sched)\n\n    def __call__(self, step_per_epoch):\n        assert len(self.schedulers) >= 1\n        if not self.schedulers[0].use_warmup:\n            return self.schedulers[0](base_lr=self.base_lr,\n                                      step_per_epoch=step_per_epoch)\n\n        # TODO: split warmup & decay\n        # warmup\n        boundary, value = self.schedulers[1](self.base_lr, step_per_epoch)\n        # decay\n        decay_lr = self.schedulers[0](self.base_lr, boundary, value,\n                                      step_per_epoch)\n        return decay_lr\n\n\n@register\nclass OptimizerBuilder():\n    \"\"\"\n    Build optimizer handles\n    Args:\n        regularizer (object): a `Regularizer` instance\n        
optimizer (object): an `Optimizer` instance\n    \"\"\"\n    __category__ = 'optim'\n\n    def __init__(self,\n                 clip_grad_by_norm=None,\n                 clip_grad_by_value=None,\n                 regularizer={'type': 'L2',\n                              'factor': .0001},\n                 optimizer={'type': 'Momentum',\n                            'momentum': .9}):\n        self.clip_grad_by_norm = clip_grad_by_norm\n        self.clip_grad_by_value = clip_grad_by_value\n        self.regularizer = regularizer\n        self.optimizer = optimizer\n\n    def __call__(self, learning_rate, model=None):\n        if self.clip_grad_by_norm is not None:\n            grad_clip = nn.ClipGradByGlobalNorm(\n                clip_norm=self.clip_grad_by_norm)\n        elif self.clip_grad_by_value is not None:\n            var = abs(self.clip_grad_by_value)\n            grad_clip = nn.ClipGradByValue(min=-var, max=var)\n        else:\n            grad_clip = None\n        if self.regularizer and self.regularizer != 'None':\n            reg_type = self.regularizer['type'] + 'Decay'\n            reg_factor = self.regularizer['factor']\n            regularization = getattr(regularizer, reg_type)(reg_factor)\n        else:\n            regularization = None\n\n        optim_args = self.optimizer.copy()\n        optim_type = optim_args['type']\n        del optim_args['type']\n\n        if optim_type == 'AdamWDL':\n            return build_adamwdl(model, lr=learning_rate, **optim_args)\n\n        if optim_type != 'AdamW':\n            optim_args['weight_decay'] = regularization\n\n        op = getattr(optimizer, optim_type)\n\n        if 'param_groups' in optim_args:\n            assert isinstance(optim_args['param_groups'], list), ''\n\n            param_groups = optim_args.pop('param_groups')\n\n            params, visited = [], []\n            for group in param_groups:\n                assert isinstance(group,\n                                  dict) and 'params' in group and isinstance(\n                                      group['params'], list), ''\n                _params = {}\n                for n, p in model.named_parameters():\n                    if not p.trainable:\n                        continue\n                    for k in group['params']:\n                        if re.search(k, n):\n                            _params.update({n: p})\n                            break\n\n                _group = group.copy()\n                _group.update({'params': list(_params.values())})\n\n                params.append(_group)\n                visited.extend(list(_params.keys()))\n\n            ext_params = [\n                p for n, p in model.named_parameters()\n                if n not in visited and p.trainable is True\n            ]\n\n            if len(ext_params) < len(model.parameters()):\n                params.append({'params': ext_params})\n\n            elif len(ext_params) > len(model.parameters()):\n                raise RuntimeError\n\n        else:\n            _params = model.parameters()\n            params = [param for param in _params if param.trainable is True]\n\n        return op(learning_rate=learning_rate,\n                  parameters=params,\n                  grad_clip=grad_clip,\n                  **optim_args)\n"
  },
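# --- Illustrative sketch (not part of the repo): LinearWarmup schedule math ---
# A minimal pure-Python re-derivation of what LinearWarmup.__call__ above
# returns, assuming the 1/3 start factor visible in its signature: the LR ramps
# linearly from base_lr * start_factor up to base_lr over warmup_steps steps,
# producing the (boundary, value) pair that the decay scheduler then consumes.
# `linear_warmup` is a hypothetical helper written only for this demo.

def linear_warmup(base_lr, steps, start_factor=1. / 3):
    boundary, value = [], []
    warmup_steps = max(steps, 1)
    for i in range(warmup_steps + 1):
        alpha = i / warmup_steps
        # interpolate the factor from start_factor (i=0) up to 1.0 (i=warmup_steps)
        value.append(base_lr * (start_factor * (1 - alpha) + alpha))
        if i > 0:
            boundary.append(i)
    return boundary, value

boundary, value = linear_warmup(0.01, steps=4)
assert abs(value[0] - 0.01 / 3) < 1e-12   # warmup starts at base_lr * start_factor
assert abs(value[-1] - 0.01) < 1e-12      # and ends exactly at base_lr
assert len(value) == len(boundary) + 1    # shape expected by the piecewise decay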
  {
    "path": "ppdet/optimizer/utils.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\n\nfrom typing import List\n\n\ndef get_bn_running_state_names(model: nn.Layer) -> List[str]:\n    \"\"\"Get all bn state full names including running mean and variance\n    \"\"\"\n    names = []\n    for n, m in model.named_sublayers():\n        if isinstance(m, (nn.BatchNorm2D, nn.SyncBatchNorm)):\n            assert hasattr(m, '_mean'), f'assert {m} has _mean'\n            assert hasattr(m, '_variance'), f'assert {m} has _variance'\n            running_mean = f'{n}._mean'\n            running_var = f'{n}._variance'\n            names.extend([running_mean, running_var])\n\n    return names\n"
  },
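# --- Illustrative sketch (not part of the repo): using get_bn_running_state_names ---
# The helper above returns '<sublayer>._mean' / '<sublayer>._variance' for every
# BatchNorm2D / SyncBatchNorm sublayer. A common use is excluding those running
# statistics when filtering a state dict. `ToyNet` is a made-up model for the demo.

import paddle.nn as nn
from ppdet.optimizer.utils import get_bn_running_state_names

class ToyNet(nn.Layer):
    def __init__(self):
        super(ToyNet, self).__init__()
        self.conv = nn.Conv2D(3, 8, 3)
        self.bn = nn.BatchNorm2D(8)

model = ToyNet()
bn_states = get_bn_running_state_names(model)
print(bn_states)  # ['bn._mean', 'bn._variance']

# e.g. keep only state entries that are not BN running statistics
filtered = {k: v for k, v in model.state_dict().items() if k not in bn_states}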
  {
    "path": "ppdet/slim/__init__.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom . import distill_loss\nfrom . import distill_model\nfrom . import ofa\nfrom . import prune\nfrom . import quant\nfrom . import unstructured_prune\n\nfrom .distill_loss import *\nfrom .distill_model import *\nfrom .ofa import *\nfrom .prune import *\nfrom .quant import *\nfrom .unstructured_prune import *\n\nimport yaml\nfrom ppdet.core.workspace import load_config\nfrom ppdet.utils.checkpoint import load_pretrain_weight\n\n\ndef build_slim_model(cfg, slim_cfg, mode='train'):\n    with open(slim_cfg) as f:\n        slim_load_cfg = yaml.load(f, Loader=yaml.Loader)\n\n    if mode != 'train' and slim_load_cfg['slim'] == 'Distill':\n        return cfg\n\n    if slim_load_cfg['slim'] == 'Distill':\n        if \"slim_method\" in slim_load_cfg and slim_load_cfg[\n                'slim_method'] == \"FGD\":\n            model = FGDDistillModel(cfg, slim_cfg)\n        elif \"slim_method\" in slim_load_cfg and slim_load_cfg[\n                'slim_method'] == \"LD\":\n            model = LDDistillModel(cfg, slim_cfg)\n        elif \"slim_method\" in slim_load_cfg and slim_load_cfg[\n                'slim_method'] == \"CWD\":\n            model = CWDDistillModel(cfg, slim_cfg)\n        elif \"slim_method\" in slim_load_cfg and slim_load_cfg[\n                'slim_method'] == \"PPYOLOEDistill\":\n            model = PPYOLOEDistillModel(cfg, slim_cfg)\n        else:\n            # common distillation model\n            model = DistillModel(cfg, slim_cfg)\n        cfg['model'] = model\n        cfg['slim_type'] = cfg.slim\n    elif slim_load_cfg['slim'] == 'OFA':\n        load_config(slim_cfg)\n        model = create(cfg.architecture)\n        load_pretrain_weight(model, cfg.weights)\n        slim = create(cfg.slim)\n        cfg['slim'] = slim\n        cfg['model'] = slim(model, model.state_dict())\n        cfg['slim_type'] = cfg.slim\n    elif slim_load_cfg['slim'] == 'DistillPrune':\n        if mode == 'train':\n            model = DistillModel(cfg, slim_cfg)\n            pruner = create(cfg.pruner)\n            pruner(model.student_model)\n        else:\n            model = create(cfg.architecture)\n            weights = cfg.weights\n            load_config(slim_cfg)\n            pruner = create(cfg.pruner)\n            model = pruner(model)\n            load_pretrain_weight(model, weights)\n        cfg['model'] = model\n        cfg['slim_type'] = cfg.slim\n    elif slim_load_cfg['slim'] == 'PTQ':\n        model = create(cfg.architecture)\n        load_config(slim_cfg)\n        load_pretrain_weight(model, cfg.weights)\n        slim = create(cfg.slim)\n        cfg['slim_type'] = cfg.slim\n        cfg['slim'] = slim\n        cfg['model'] = slim(model)\n    elif slim_load_cfg['slim'] == 'UnstructuredPruner':\n        load_config(slim_cfg)\n        slim = create(cfg.slim)\n        cfg['slim_type'] = cfg.slim\n        cfg['slim'] = slim\n        
cfg['unstructured_prune'] = True\n    else:\n        load_config(slim_cfg)\n        model = create(cfg.architecture)\n        if mode == 'train':\n            load_pretrain_weight(model, cfg.pretrain_weights)\n        slim = create(cfg.slim)\n        cfg['slim_type'] = cfg.slim\n        # TODO: fix quant export model in the framework.\n        if mode == 'test' and 'QAT' in slim_load_cfg['slim']:\n            slim.quant_config['activation_preprocess_type'] = None\n        cfg['model'] = slim(model)\n        cfg['slim'] = slim\n        if mode != 'train':\n            load_pretrain_weight(cfg['model'], cfg.weights)\n\n    return cfg\n"
  },
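# --- Illustrative sketch (not part of the repo): driving build_slim_model ---
# build_slim_model() dispatches on the 'slim' key of the slim YAML (Distill,
# OFA, DistillPrune, PTQ, UnstructuredPruner, QAT/...), rewrites cfg['model']
# accordingly and records cfg['slim_type']. Both config paths below are
# placeholders, not files guaranteed to exist.

from ppdet.core.workspace import load_config
from ppdet.slim import build_slim_model

cfg = load_config('path/to/model_config.yml')            # placeholder path
cfg = build_slim_model(cfg, 'path/to/slim_config.yml',   # placeholder path
                       mode='train')
model = cfg['model']  # e.g. a DistillModel when slim == 'Distill'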
  {
    "path": "ppdet/slim/distill_loss.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport math\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle import ParamAttr\n\nfrom ppdet.core.workspace import register\nfrom ppdet.modeling import ops\nfrom ppdet.modeling.losses.iou_loss import GIoULoss\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n__all__ = [\n    'DistillYOLOv3Loss',\n    'KnowledgeDistillationKLDivLoss',\n    'DistillPPYOLOELoss',\n    'FGDFeatureLoss',\n    'CWDFeatureLoss',\n    'PKDFeatureLoss',\n    'MGDFeatureLoss',\n]\n\n\ndef parameter_init(mode=\"kaiming\", value=0.):\n    if mode == \"kaiming\":\n        weight_attr = paddle.nn.initializer.KaimingUniform()\n    elif mode == \"constant\":\n        weight_attr = paddle.nn.initializer.Constant(value=value)\n    else:\n        weight_attr = paddle.nn.initializer.KaimingUniform()\n\n    weight_init = ParamAttr(initializer=weight_attr)\n    return weight_init\n\n\ndef feature_norm(feat):\n    # Normalize the feature maps to have zero mean and unit variances.\n    assert len(feat.shape) == 4\n    N, C, H, W = feat.shape\n    feat = feat.transpose([1, 0, 2, 3]).reshape([C, -1])\n    mean = feat.mean(axis=-1, keepdim=True)\n    std = feat.std(axis=-1, keepdim=True)\n    feat = (feat - mean) / (std + 1e-6)\n    return feat.reshape([C, N, H, W]).transpose([1, 0, 2, 3])\n\n\n@register\nclass DistillYOLOv3Loss(nn.Layer):\n    def __init__(self, weight=1000):\n        super(DistillYOLOv3Loss, self).__init__()\n        self.loss_weight = weight\n\n    def obj_weighted_reg(self, sx, sy, sw, sh, tx, ty, tw, th, tobj):\n        loss_x = ops.sigmoid_cross_entropy_with_logits(sx, F.sigmoid(tx))\n        loss_y = ops.sigmoid_cross_entropy_with_logits(sy, F.sigmoid(ty))\n        loss_w = paddle.abs(sw - tw)\n        loss_h = paddle.abs(sh - th)\n        loss = paddle.add_n([loss_x, loss_y, loss_w, loss_h])\n        weighted_loss = paddle.mean(loss * F.sigmoid(tobj))\n        return weighted_loss\n\n    def obj_weighted_cls(self, scls, tcls, tobj):\n        loss = ops.sigmoid_cross_entropy_with_logits(scls, F.sigmoid(tcls))\n        weighted_loss = paddle.mean(paddle.multiply(loss, F.sigmoid(tobj)))\n        return weighted_loss\n\n    def obj_loss(self, sobj, tobj):\n        obj_mask = paddle.cast(tobj > 0., dtype=\"float32\")\n        obj_mask.stop_gradient = True\n        loss = paddle.mean(\n            ops.sigmoid_cross_entropy_with_logits(sobj, obj_mask))\n        return loss\n\n    def forward(self, teacher_model, student_model):\n        teacher_distill_pairs = teacher_model.yolo_head.loss.distill_pairs\n        student_distill_pairs = student_model.yolo_head.loss.distill_pairs\n        distill_reg_loss, distill_cls_loss, distill_obj_loss = [], [], []\n        for s_pair, t_pair in 
zip(student_distill_pairs, teacher_distill_pairs):\n            distill_reg_loss.append(\n                self.obj_weighted_reg(s_pair[0], s_pair[1], s_pair[2], s_pair[\n                    3], t_pair[0], t_pair[1], t_pair[2], t_pair[3], t_pair[4]))\n            distill_cls_loss.append(\n                self.obj_weighted_cls(s_pair[5], t_pair[5], t_pair[4]))\n            distill_obj_loss.append(self.obj_loss(s_pair[4], t_pair[4]))\n        distill_reg_loss = paddle.add_n(distill_reg_loss)\n        distill_cls_loss = paddle.add_n(distill_cls_loss)\n        distill_obj_loss = paddle.add_n(distill_obj_loss)\n        loss = (distill_reg_loss + distill_cls_loss + distill_obj_loss\n                ) * self.loss_weight\n        return loss\n\n\n@register\nclass KnowledgeDistillationKLDivLoss(nn.Layer):\n    \"\"\"Loss function for knowledge distilling using KL divergence.\n\n    Args:\n        reduction (str): Options are `'none'`, `'mean'` and `'sum'`.\n        loss_weight (float): Loss weight of current loss.\n        T (int): Temperature for distillation.\n    \"\"\"\n\n    def __init__(self, reduction='mean', loss_weight=1.0, T=10):\n        super(KnowledgeDistillationKLDivLoss, self).__init__()\n        assert reduction in ('none', 'mean', 'sum')\n        assert T >= 1\n        self.reduction = reduction\n        self.loss_weight = loss_weight\n        self.T = T\n\n    def knowledge_distillation_kl_div_loss(self,\n                                           pred,\n                                           soft_label,\n                                           T,\n                                           detach_target=True):\n        r\"\"\"Loss function for knowledge distilling using KL divergence.\n\n        Args:\n            pred (Tensor): Predicted logits with shape (N, n + 1).\n            soft_label (Tensor): Target logits with shape (N, n + 1).\n            T (int): Temperature for distillation.\n            detach_target (bool): Remove soft_label from automatic\n                differentiation. Defaults to True.\n        \"\"\"\n        assert pred.shape == soft_label.shape\n        target = F.softmax(soft_label / T, axis=1)\n        if detach_target:\n            target = target.detach()\n\n        kd_loss = F.kl_div(\n            F.log_softmax(\n                pred / T, axis=1), target, reduction='none').mean(1) * (T * T)\n\n        return kd_loss\n\n    def forward(self,\n                pred,\n                soft_label,\n                weight=None,\n                avg_factor=None,\n                reduction_override=None):\n        \"\"\"Forward function.\n\n        Args:\n            pred (Tensor): Predicted logits with shape (N, n + 1).\n            soft_label (Tensor): Target logits with shape (N, n + 1).\n            weight (Tensor, optional): The weight of loss for each\n                prediction. Defaults to None.\n            avg_factor (int, optional): Average factor that is used to average\n                the loss. 
Defaults to None.\n            reduction_override (str, optional): The reduction method used to\n                override the original reduction method of the loss.\n                Defaults to None.\n        \"\"\"\n        assert reduction_override in (None, 'none', 'mean', 'sum')\n\n        reduction = (reduction_override\n                     if reduction_override else self.reduction)\n\n        loss_kd_out = self.knowledge_distillation_kl_div_loss(\n            pred, soft_label, T=self.T)\n\n        if weight is not None:\n            loss_kd_out = weight * loss_kd_out\n\n        if avg_factor is None:\n            if reduction == 'none':\n                loss = loss_kd_out\n            elif reduction == 'mean':\n                loss = loss_kd_out.mean()\n            elif reduction == 'sum':\n                loss = loss_kd_out.sum()\n        else:\n            # if reduction is mean, then average the loss by avg_factor\n            if reduction == 'mean':\n                loss = loss_kd_out.sum() / avg_factor\n            # if reduction is 'none', then do nothing, otherwise raise an error\n            elif reduction != 'none':\n                raise ValueError(\n                    'avg_factor can not be used with reduction=\"sum\"')\n\n        loss_kd = self.loss_weight * loss\n        return loss_kd\n\n\n@register\nclass DistillPPYOLOELoss(nn.Layer):\n    def __init__(\n            self,\n            loss_weight={'logits': 4.0,\n                         'feat': 1.0},\n            logits_distill=True,\n            logits_loss_weight={'class': 1.0,\n                                'iou': 2.5,\n                                'dfl': 0.5},\n            logits_ld_distill=False,\n            logits_ld_params={'weight': 20000,\n                              'T': 10},\n            feat_distill=True,\n            feat_distiller='fgd',\n            feat_distill_place='neck_feats',\n            teacher_width_mult=1.0,  # L\n            student_width_mult=0.75,  # M\n            feat_out_channels=[768, 384, 192]):\n        super(DistillPPYOLOELoss, self).__init__()\n        self.loss_weight_logits = loss_weight['logits']\n        self.loss_weight_feat = loss_weight['feat']\n        self.logits_distill = logits_distill\n        self.logits_ld_distill = logits_ld_distill\n        self.feat_distill = feat_distill\n\n        if logits_distill and self.loss_weight_logits > 0:\n            self.bbox_loss_weight = logits_loss_weight['iou']\n            self.dfl_loss_weight = logits_loss_weight['dfl']\n            self.qfl_loss_weight = logits_loss_weight['class']\n            self.loss_bbox = GIoULoss()\n\n        if logits_ld_distill:\n            self.loss_kd = KnowledgeDistillationKLDivLoss(\n                loss_weight=logits_ld_params['weight'], T=logits_ld_params['T'])\n\n        if feat_distill and self.loss_weight_feat > 0:\n            assert feat_distiller in ['cwd', 'fgd', 'pkd', 'mgd', 'mimic']\n            assert feat_distill_place in ['backbone_feats', 'neck_feats']\n            self.feat_distill_place = feat_distill_place\n            self.t_channel_list = [\n                int(c * teacher_width_mult) for c in feat_out_channels\n            ]\n            self.s_channel_list = [\n                int(c * student_width_mult) for c in feat_out_channels\n            ]\n            self.distill_feat_loss_modules = []\n            for i in range(len(feat_out_channels)):\n                if feat_distiller == 'cwd':\n                    feat_loss_module = CWDFeatureLoss(\n                      
  student_channels=self.s_channel_list[i],\n                        teacher_channels=self.t_channel_list[i],\n                        normalize=True)\n                elif feat_distiller == 'fgd':\n                    feat_loss_module = FGDFeatureLoss(\n                        student_channels=self.s_channel_list[i],\n                        teacher_channels=self.t_channel_list[i],\n                        normalize=True,\n                        alpha_fgd=0.00001,\n                        beta_fgd=0.000005,\n                        gamma_fgd=0.00001,\n                        lambda_fgd=0.00000005)\n                elif feat_distiller == 'pkd':\n                    feat_loss_module = PKDFeatureLoss(\n                        student_channels=self.s_channel_list[i],\n                        teacher_channels=self.t_channel_list[i],\n                        normalize=True,\n                        resize_stu=True)\n                elif feat_distiller == 'mgd':\n                    feat_loss_module = MGDFeatureLoss(\n                        student_channels=self.s_channel_list[i],\n                        teacher_channels=self.t_channel_list[i],\n                        normalize=True,\n                        loss_func='ssim')\n                elif feat_distiller == 'mimic':\n                    feat_loss_module = MimicFeatureLoss(\n                        student_channels=self.s_channel_list[i],\n                        teacher_channels=self.t_channel_list[i],\n                        normalize=True)\n                else:\n                    raise ValueError\n                self.distill_feat_loss_modules.append(feat_loss_module)\n\n    def quality_focal_loss(self,\n                           pred_logits,\n                           soft_target_logits,\n                           beta=2.0,\n                           use_sigmoid=False,\n                           num_total_pos=None):\n        if use_sigmoid:\n            func = F.binary_cross_entropy_with_logits\n            soft_target = F.sigmoid(soft_target_logits)\n            pred_sigmoid = F.sigmoid(pred_logits)\n            preds = pred_logits\n        else:\n            func = F.binary_cross_entropy\n            soft_target = soft_target_logits\n            pred_sigmoid = pred_logits\n            preds = pred_sigmoid\n\n        scale_factor = pred_sigmoid - soft_target\n        loss = func(\n            preds, soft_target, reduction='none') * scale_factor.abs().pow(beta)\n        loss = loss.sum(1)\n\n        if num_total_pos is not None:\n            loss = loss.sum() / num_total_pos\n        else:\n            loss = loss.mean()\n        return loss\n\n    def bbox_loss(self, s_bbox, t_bbox, weight_targets=None):\n        # [x,y,w,h]\n        if weight_targets is not None:\n            loss = paddle.sum(self.loss_bbox(s_bbox, t_bbox) * weight_targets)\n            avg_factor = weight_targets.sum()\n            loss = loss / avg_factor\n        else:\n            loss = paddle.mean(self.loss_bbox(s_bbox, t_bbox))\n        return loss\n\n    def distribution_focal_loss(self,\n                                pred_corners,\n                                target_corners,\n                                weight_targets=None):\n        target_corners_label = F.softmax(target_corners, axis=-1)\n        loss_dfl = F.cross_entropy(\n            pred_corners,\n            target_corners_label,\n            soft_label=True,\n            reduction='none')\n        loss_dfl = loss_dfl.sum(1)\n\n        if weight_targets is not None:\n          
  loss_dfl = loss_dfl * (weight_targets.expand([-1, 4]).reshape([-1]))\n            loss_dfl = loss_dfl.sum(-1) / weight_targets.sum()\n        else:\n            loss_dfl = loss_dfl.mean(-1)\n        return loss_dfl / 4.0  # 4 direction\n\n    def main_kd(self, mask_positive, pred_scores, soft_cls, num_classes):\n        num_pos = mask_positive.sum()\n        if num_pos > 0:\n            cls_mask = mask_positive.unsqueeze(-1).tile([1, 1, num_classes])\n            pred_scores_pos = paddle.masked_select(\n                pred_scores, cls_mask).reshape([-1, num_classes])\n            soft_cls_pos = paddle.masked_select(\n                soft_cls, cls_mask).reshape([-1, num_classes])\n            loss_kd = self.loss_kd(\n                pred_scores_pos, soft_cls_pos, avg_factor=num_pos)\n        else:\n            loss_kd = paddle.zeros([])\n        return loss_kd\n\n    def forward(self, teacher_model, student_model):\n        teacher_distill_pairs = teacher_model.yolo_head.distill_pairs\n        student_distill_pairs = student_model.yolo_head.distill_pairs\n        if self.logits_distill and self.loss_weight_logits > 0:\n            distill_bbox_loss, distill_dfl_loss, distill_cls_loss = [], [], []\n\n            distill_cls_loss.append(\n                self.quality_focal_loss(\n                    student_distill_pairs['pred_cls_scores'].reshape(\n                        (-1, student_distill_pairs['pred_cls_scores'].shape[-1]\n                         )),\n                    teacher_distill_pairs['pred_cls_scores'].detach().reshape(\n                        (-1, teacher_distill_pairs['pred_cls_scores'].shape[-1]\n                         )),\n                    num_total_pos=student_distill_pairs['pos_num'],\n                    use_sigmoid=False))\n\n            distill_bbox_loss.append(\n                self.bbox_loss(student_distill_pairs['pred_bboxes_pos'],\n                                teacher_distill_pairs['pred_bboxes_pos'].detach(),\n                                weight_targets=student_distill_pairs['bbox_weight']\n                    ) if 'pred_bboxes_pos' in student_distill_pairs and \\\n                        'pred_bboxes_pos' in teacher_distill_pairs and \\\n                            'bbox_weight' in student_distill_pairs\n                    else paddle.zeros([]))\n\n            distill_dfl_loss.append(\n                self.distribution_focal_loss(\n                        student_distill_pairs['pred_dist_pos'].reshape((-1, student_distill_pairs['pred_dist_pos'].shape[-1])),\n                        teacher_distill_pairs['pred_dist_pos'].detach().reshape((-1, teacher_distill_pairs['pred_dist_pos'].shape[-1])), \\\n                        weight_targets=student_distill_pairs['bbox_weight']\n                    ) if 'pred_dist_pos' in student_distill_pairs and \\\n                        'pred_dist_pos' in teacher_distill_pairs and \\\n                            'bbox_weight' in student_distill_pairs\n                    else paddle.zeros([]))\n\n            distill_cls_loss = paddle.add_n(distill_cls_loss)\n            distill_bbox_loss = paddle.add_n(distill_bbox_loss)\n            distill_dfl_loss = paddle.add_n(distill_dfl_loss)\n            logits_loss = distill_bbox_loss * self.bbox_loss_weight + distill_cls_loss * self.qfl_loss_weight + distill_dfl_loss * self.dfl_loss_weight\n\n            if self.logits_ld_distill:\n                loss_kd = self.main_kd(\n                    student_distill_pairs['mask_positive_select'],\n                    
student_distill_pairs['pred_cls_scores'],\n                    teacher_distill_pairs['pred_cls_scores'],\n                    student_model.yolo_head.num_classes, )\n                logits_loss += loss_kd\n        else:\n            logits_loss = paddle.zeros([])\n\n        if self.feat_distill and self.loss_weight_feat > 0:\n            feat_loss_list = []\n            inputs = student_model.inputs\n            assert 'gt_bbox' in inputs\n            assert self.feat_distill_place in student_distill_pairs\n            assert self.feat_distill_place in teacher_distill_pairs\n            stu_feats = student_distill_pairs[self.feat_distill_place]\n            tea_feats = teacher_distill_pairs[self.feat_distill_place]\n            for i, loss_module in enumerate(self.distill_feat_loss_modules):\n                feat_loss_list.append(\n                    loss_module(stu_feats[i], tea_feats[i], inputs))\n            feat_loss = paddle.add_n(feat_loss_list)\n        else:\n            feat_loss = paddle.zeros([])\n\n        student_model.yolo_head.distill_pairs.clear()\n        teacher_model.yolo_head.distill_pairs.clear()\n        return logits_loss * self.loss_weight_logits, feat_loss * self.loss_weight_feat\n\n\n@register\nclass CWDFeatureLoss(nn.Layer):\n    def __init__(self,\n                 student_channels,\n                 teacher_channels,\n                 normalize=False,\n                 tau=1.0,\n                 weight=1.0):\n        super(CWDFeatureLoss, self).__init__()\n        self.normalize = normalize\n        self.tau = tau\n        self.loss_weight = weight\n\n        if student_channels != teacher_channels:\n            self.align = nn.Conv2D(\n                student_channels,\n                teacher_channels,\n                kernel_size=1,\n                stride=1,\n                padding=0)\n        else:\n            self.align = None\n\n    def distill_softmax(self, x, tau):\n        _, _, w, h = x.shape\n        x = paddle.reshape(x, [-1, w * h])\n        x /= tau\n        return F.softmax(x, axis=1)\n\n    def forward(self, preds_s, preds_t, inputs=None):\n        assert preds_s.shape[-2:] == preds_t.shape[-2:]\n        N, C, H, W = preds_s.shape\n        eps = 1e-5\n        if self.align is not None:\n            preds_s = self.align(preds_s)\n\n        if self.normalize:\n            preds_s = feature_norm(preds_s)\n            preds_t = feature_norm(preds_t)\n\n        softmax_pred_s = self.distill_softmax(preds_s, self.tau)\n        softmax_pred_t = self.distill_softmax(preds_t, self.tau)\n\n        loss = paddle.sum(-softmax_pred_t * paddle.log(eps + softmax_pred_s) +\n                          softmax_pred_t * paddle.log(eps + softmax_pred_t))\n        return self.loss_weight * loss / (C * N)\n\n\n@register\nclass FGDFeatureLoss(nn.Layer):\n    \"\"\"\n    Focal and Global Knowledge Distillation for Detectors\n    The code is reference from https://github.com/yzd-v/FGD/blob/master/mmdet/distillation/losses/fgd.py\n   \n    Args:\n        student_channels (int): The number of channels in the student's FPN feature map. Default to 256.\n        teacher_channels (int): The number of channels in the teacher's FPN feature map. Default to 256.\n        normalize (bool): Whether to normalize the feature maps.\n        temp (float, optional): The temperature coefficient. Defaults to 0.5.\n        alpha_fgd (float, optional): The weight of fg_loss. Defaults to 0.001\n        beta_fgd (float, optional): The weight of bg_loss. 
Defaults to 0.0005\n        gamma_fgd (float, optional): The weight of mask_loss. Defaults to 0.001\n        lambda_fgd (float, optional): The weight of relation_loss. Defaults to 0.000005\n    \"\"\"\n\n    def __init__(self,\n                 student_channels,\n                 teacher_channels,\n                 normalize=False,\n                 loss_weight=1.0,\n                 temp=0.5,\n                 alpha_fgd=0.001,\n                 beta_fgd=0.0005,\n                 gamma_fgd=0.001,\n                 lambda_fgd=0.000005):\n        super(FGDFeatureLoss, self).__init__()\n        self.normalize = normalize\n        self.loss_weight = loss_weight\n        self.temp = temp\n        self.alpha_fgd = alpha_fgd\n        self.beta_fgd = beta_fgd\n        self.gamma_fgd = gamma_fgd\n        self.lambda_fgd = lambda_fgd\n        kaiming_init = parameter_init(\"kaiming\")\n        zeros_init = parameter_init(\"constant\", 0.0)\n\n        if student_channels != teacher_channels:\n            self.align = nn.Conv2D(\n                student_channels,\n                teacher_channels,\n                kernel_size=1,\n                stride=1,\n                padding=0,\n                weight_attr=kaiming_init)\n            student_channels = teacher_channels\n        else:\n            self.align = None\n\n        self.conv_mask_s = nn.Conv2D(\n            student_channels, 1, kernel_size=1, weight_attr=kaiming_init)\n        self.conv_mask_t = nn.Conv2D(\n            teacher_channels, 1, kernel_size=1, weight_attr=kaiming_init)\n\n        self.stu_conv_block = nn.Sequential(\n            nn.Conv2D(\n                student_channels,\n                student_channels // 2,\n                kernel_size=1,\n                weight_attr=zeros_init),\n            nn.LayerNorm([student_channels // 2, 1, 1]),\n            nn.ReLU(),\n            nn.Conv2D(\n                student_channels // 2,\n                student_channels,\n                kernel_size=1,\n                weight_attr=zeros_init))\n        self.tea_conv_block = nn.Sequential(\n            nn.Conv2D(\n                teacher_channels,\n                teacher_channels // 2,\n                kernel_size=1,\n                weight_attr=zeros_init),\n            nn.LayerNorm([teacher_channels // 2, 1, 1]),\n            nn.ReLU(),\n            nn.Conv2D(\n                teacher_channels // 2,\n                teacher_channels,\n                kernel_size=1,\n                weight_attr=zeros_init))\n\n    def spatial_channel_attention(self, x, t=0.5):\n        shape = x.shape\n        N, C, H, W = shape\n        _f = paddle.abs(x)\n        spatial_map = paddle.reshape(\n            paddle.mean(\n                _f, axis=1, keepdim=True) / t, [N, -1])\n        spatial_map = F.softmax(spatial_map, axis=1, dtype=\"float32\") * H * W\n        spatial_att = paddle.reshape(spatial_map, [N, H, W])\n\n        channel_map = paddle.mean(\n            paddle.mean(\n                _f, axis=2, keepdim=False), axis=2, keepdim=False)\n        channel_att = F.softmax(channel_map / t, axis=1, dtype=\"float32\") * C\n        return [spatial_att, channel_att]\n\n    def spatial_pool(self, x, mode=\"teacher\"):\n        batch, channel, width, height = x.shape\n        x_copy = x\n        x_copy = paddle.reshape(x_copy, [batch, channel, height * width])\n        x_copy = x_copy.unsqueeze(1)\n        if mode.lower() == \"student\":\n            context_mask = self.conv_mask_s(x)\n        else:\n            context_mask = self.conv_mask_t(x)\n\n  
      context_mask = paddle.reshape(context_mask, [batch, 1, height * width])\n        context_mask = F.softmax(context_mask, axis=2)\n        context_mask = context_mask.unsqueeze(-1)\n        context = paddle.matmul(x_copy, context_mask)\n        context = paddle.reshape(context, [batch, channel, 1, 1])\n        return context\n\n    def mask_loss(self, stu_channel_att, tea_channel_att, stu_spatial_att,\n                  tea_spatial_att):\n        def _func(a, b):\n            return paddle.sum(paddle.abs(a - b)) / len(a)\n\n        mask_loss = _func(stu_channel_att, tea_channel_att) + _func(\n            stu_spatial_att, tea_spatial_att)\n        return mask_loss\n\n    def feature_loss(self, stu_feature, tea_feature, mask_fg, mask_bg,\n                     tea_channel_att, tea_spatial_att):\n        mask_fg = mask_fg.unsqueeze(axis=1)\n        mask_bg = mask_bg.unsqueeze(axis=1)\n        tea_channel_att = tea_channel_att.unsqueeze(axis=-1).unsqueeze(axis=-1)\n        tea_spatial_att = tea_spatial_att.unsqueeze(axis=1)\n\n        fea_t = paddle.multiply(tea_feature, paddle.sqrt(tea_spatial_att))\n        fea_t = paddle.multiply(fea_t, paddle.sqrt(tea_channel_att))\n        fg_fea_t = paddle.multiply(fea_t, paddle.sqrt(mask_fg))\n        bg_fea_t = paddle.multiply(fea_t, paddle.sqrt(mask_bg))\n\n        fea_s = paddle.multiply(stu_feature, paddle.sqrt(tea_spatial_att))\n        fea_s = paddle.multiply(fea_s, paddle.sqrt(tea_channel_att))\n        fg_fea_s = paddle.multiply(fea_s, paddle.sqrt(mask_fg))\n        bg_fea_s = paddle.multiply(fea_s, paddle.sqrt(mask_bg))\n\n        fg_loss = F.mse_loss(fg_fea_s, fg_fea_t, reduction=\"sum\") / len(mask_fg)\n        bg_loss = F.mse_loss(bg_fea_s, bg_fea_t, reduction=\"sum\") / len(mask_bg)\n        return fg_loss, bg_loss\n\n    def relation_loss(self, stu_feature, tea_feature):\n        context_s = self.spatial_pool(stu_feature, \"student\")\n        context_t = self.spatial_pool(tea_feature, \"teacher\")\n        out_s = stu_feature + self.stu_conv_block(context_s)\n        out_t = tea_feature + self.tea_conv_block(context_t)\n        rela_loss = F.mse_loss(out_s, out_t, reduction=\"sum\") / len(out_s)\n        return rela_loss\n\n    def mask_value(self, mask, xl, xr, yl, yr, value):\n        mask[xl:xr, yl:yr] = paddle.maximum(mask[xl:xr, yl:yr], value)\n        return mask\n\n    def forward(self, stu_feature, tea_feature, inputs):\n        assert stu_feature.shape[-2:] == tea_feature.shape[-2:]\n        assert \"gt_bbox\" in inputs.keys() and \"im_shape\" in inputs.keys()\n        gt_bboxes = inputs['gt_bbox']\n        ins_shape = [\n            inputs['im_shape'][i] for i in range(inputs['im_shape'].shape[0])\n        ]\n        index_gt = []\n        for i in range(len(gt_bboxes)):\n            if gt_bboxes[i].size > 2:\n                index_gt.append(i)\n        # only distill features for samples with labeled gt boxes\n        if len(index_gt) != len(gt_bboxes):\n            index_gt_t = paddle.to_tensor(index_gt)\n            stu_feature = paddle.index_select(stu_feature, index_gt_t)\n            tea_feature = paddle.index_select(tea_feature, index_gt_t)\n\n            ins_shape = [ins_shape[c] for c in index_gt]\n            gt_bboxes = [gt_bboxes[c] for c in index_gt]\n            assert len(gt_bboxes) == tea_feature.shape[0]\n\n        if self.align is not None:\n            stu_feature = self.align(stu_feature)\n\n        if self.normalize:\n            stu_feature = feature_norm(stu_feature)\n            tea_feature = 
feature_norm(tea_feature)\n\n        tea_spatial_att, tea_channel_att = self.spatial_channel_attention(\n            tea_feature, self.temp)\n        stu_spatial_att, stu_channel_att = self.spatial_channel_attention(\n            stu_feature, self.temp)\n\n        mask_fg = paddle.zeros(tea_spatial_att.shape)\n        mask_bg = paddle.ones_like(tea_spatial_att)\n        one_tmp = paddle.ones([*tea_spatial_att.shape[1:]])\n        zero_tmp = paddle.zeros([*tea_spatial_att.shape[1:]])\n        mask_fg.stop_gradient = True\n        mask_bg.stop_gradient = True\n        one_tmp.stop_gradient = True\n        zero_tmp.stop_gradient = True\n\n        wmin, wmax, hmin, hmax = [], [], [], []\n\n        if len(gt_bboxes) == 0:\n            loss = self.relation_loss(stu_feature, tea_feature)\n            return self.lambda_fgd * loss\n\n        N, _, H, W = stu_feature.shape\n        for i in range(N):\n            tmp_box = paddle.ones_like(gt_bboxes[i])\n            tmp_box.stop_gradient = True\n            tmp_box[:, 0] = gt_bboxes[i][:, 0] / ins_shape[i][1] * W\n            tmp_box[:, 2] = gt_bboxes[i][:, 2] / ins_shape[i][1] * W\n            tmp_box[:, 1] = gt_bboxes[i][:, 1] / ins_shape[i][0] * H\n            tmp_box[:, 3] = gt_bboxes[i][:, 3] / ins_shape[i][0] * H\n\n            zero = paddle.zeros_like(tmp_box[:, 0], dtype=\"int32\")\n            ones = paddle.ones_like(tmp_box[:, 2], dtype=\"int32\")\n            zero.stop_gradient = True\n            ones.stop_gradient = True\n            wmin.append(\n                paddle.cast(paddle.floor(tmp_box[:, 0]), \"int32\").maximum(zero))\n            wmax.append(paddle.cast(paddle.ceil(tmp_box[:, 2]), \"int32\"))\n            hmin.append(\n                paddle.cast(paddle.floor(tmp_box[:, 1]), \"int32\").maximum(zero))\n            hmax.append(paddle.cast(paddle.ceil(tmp_box[:, 3]), \"int32\"))\n\n            area_recip = 1.0 / (\n                hmax[i].reshape([1, -1]) + 1 - hmin[i].reshape([1, -1])) / (\n                    wmax[i].reshape([1, -1]) + 1 - wmin[i].reshape([1, -1]))\n\n            for j in range(len(gt_bboxes[i])):\n                if gt_bboxes[i][j].sum() > 0:\n                    mask_fg[i] = self.mask_value(\n                        mask_fg[i], hmin[i][j], hmax[i][j] + 1, wmin[i][j],\n                        wmax[i][j] + 1, area_recip[0][j])\n\n            mask_bg[i] = paddle.where(mask_fg[i] > zero_tmp, zero_tmp, one_tmp)\n\n            if paddle.sum(mask_bg[i]):\n                mask_bg[i] /= paddle.sum(mask_bg[i])\n\n        fg_loss, bg_loss = self.feature_loss(stu_feature, tea_feature, mask_fg,\n                                             mask_bg, tea_channel_att,\n                                             tea_spatial_att)\n        mask_loss = self.mask_loss(stu_channel_att, tea_channel_att,\n                                   stu_spatial_att, tea_spatial_att)\n        rela_loss = self.relation_loss(stu_feature, tea_feature)\n        loss = self.alpha_fgd * fg_loss + self.beta_fgd * bg_loss \\\n               + self.gamma_fgd * mask_loss + self.lambda_fgd * rela_loss\n        return loss * self.loss_weight\n\n\n@register\nclass PKDFeatureLoss(nn.Layer):\n    \"\"\"\n    PKD: General Distillation Framework for Object Detectors via Pearson Correlation Coefficient.\n\n    Args:\n        loss_weight (float): Weight of loss. 
Defaults to 1.0.\n        resize_stu (bool): If True, we'll down/up sample the features of the\n            student model to the spatial size of those of the teacher model if\n            their spatial sizes are different. And vice versa. Defaults to\n            True.\n    \"\"\"\n\n    def __init__(self,\n                 student_channels=256,\n                 teacher_channels=256,\n                 normalize=True,\n                 loss_weight=1.0,\n                 resize_stu=True):\n        super(PKDFeatureLoss, self).__init__()\n        self.normalize = normalize\n        self.loss_weight = loss_weight\n        self.resize_stu = resize_stu\n\n    def forward(self, stu_feature, tea_feature, inputs=None):\n        size_s, size_t = stu_feature.shape[2:], tea_feature.shape[2:]\n        if size_s[0] != size_t[0]:\n            if self.resize_stu:\n                stu_feature = F.interpolate(\n                    stu_feature, size_t, mode='bilinear')\n            else:\n                tea_feature = F.interpolate(\n                    tea_feature, size_s, mode='bilinear')\n        assert stu_feature.shape == tea_feature.shape\n\n        if self.normalize:\n            stu_feature = feature_norm(stu_feature)\n            tea_feature = feature_norm(tea_feature)\n\n        loss = F.mse_loss(stu_feature, tea_feature) / 2\n        return loss * self.loss_weight\n\n\n@register\nclass MimicFeatureLoss(nn.Layer):\n    def __init__(self,\n                 student_channels=256,\n                 teacher_channels=256,\n                 normalize=True,\n                 loss_weight=1.0):\n        super(MimicFeatureLoss, self).__init__()\n        self.normalize = normalize\n        self.loss_weight = loss_weight\n        self.mse_loss = nn.MSELoss()\n\n        if student_channels != teacher_channels:\n            self.align = nn.Conv2D(\n                student_channels,\n                teacher_channels,\n                kernel_size=1,\n                stride=1,\n                padding=0)\n        else:\n            self.align = None\n\n    def forward(self, stu_feature, tea_feature, inputs=None):\n        if self.align is not None:\n            stu_feature = self.align(stu_feature)\n\n        if self.normalize:\n            stu_feature = feature_norm(stu_feature)\n            tea_feature = feature_norm(tea_feature)\n\n        loss = self.mse_loss(stu_feature, tea_feature)\n        return loss * self.loss_weight\n\n\n@register\nclass MGDFeatureLoss(nn.Layer):\n    def __init__(self,\n                 student_channels=256,\n                 teacher_channels=256,\n                 normalize=True,\n                 loss_weight=1.0,\n                 loss_func='mse'):\n        super(MGDFeatureLoss, self).__init__()\n        self.normalize = normalize\n        self.loss_weight = loss_weight\n        assert loss_func in ['mse', 'ssim']\n        self.loss_func = loss_func\n        self.mse_loss = nn.MSELoss(reduction='sum')\n        self.ssim_loss = SSIM(11)\n\n        kaiming_init = parameter_init(\"kaiming\")\n        if student_channels != teacher_channels:\n            self.align = nn.Conv2D(\n                student_channels,\n                teacher_channels,\n                kernel_size=1,\n                stride=1,\n                padding=0,\n                weight_attr=kaiming_init,\n                bias_attr=False)\n        else:\n            self.align = None\n\n        self.generation = nn.Sequential(\n            nn.Conv2D(\n                teacher_channels, teacher_channels, kernel_size=3, 
padding=1),\n            nn.ReLU(),\n            nn.Conv2D(\n                teacher_channels, teacher_channels, kernel_size=3, padding=1))\n\n    def forward(self, stu_feature, tea_feature, inputs=None):\n        N = stu_feature.shape[0]\n        if self.align is not None:\n            stu_feature = self.align(stu_feature)\n        stu_feature = self.generation(stu_feature)\n\n        if self.normalize:\n            stu_feature = feature_norm(stu_feature)\n            tea_feature = feature_norm(tea_feature)\n\n        if self.loss_func == 'mse':\n            loss = self.mse_loss(stu_feature, tea_feature) / N\n        elif self.loss_func == 'ssim':\n            ssim_loss = self.ssim_loss(stu_feature, tea_feature)\n            loss = paddle.clip((1 - ssim_loss) / 2, 0, 1)\n        else:\n            raise ValueError\n        return loss * self.loss_weight\n\n\nclass SSIM(nn.Layer):\n    def __init__(self, window_size=11, size_average=True):\n        super(SSIM, self).__init__()\n        self.window_size = window_size\n        self.size_average = size_average\n        self.channel = 1\n        self.window = self.create_window(window_size, self.channel)\n\n    def gaussian(self, window_size, sigma):\n        gauss = paddle.to_tensor([\n            math.exp(-(x - window_size // 2)**2 / float(2 * sigma**2))\n            for x in range(window_size)\n        ])\n        return gauss / gauss.sum()\n\n    def create_window(self, window_size, channel):\n        _1D_window = self.gaussian(window_size, 1.5).unsqueeze(1)\n        _2D_window = _1D_window.mm(_1D_window.t()).unsqueeze(0).unsqueeze(0)\n        window = _2D_window.expand([channel, 1, window_size, window_size])\n        return window\n\n    def _ssim(self, img1, img2, window, window_size, channel,\n              size_average=True):\n        mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)\n        mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)\n        mu1_sq = mu1.pow(2)\n        mu2_sq = mu2.pow(2)\n        mu1_mu2 = mu1 * mu2\n\n        sigma1_sq = F.conv2d(\n            img1 * img1, window, padding=window_size // 2,\n            groups=channel) - mu1_sq\n        sigma2_sq = F.conv2d(\n            img2 * img2, window, padding=window_size // 2,\n            groups=channel) - mu2_sq\n        sigma12 = F.conv2d(\n            img1 * img2, window, padding=window_size // 2,\n            groups=channel) - mu1_mu2\n\n        C1 = 0.01**2\n        C2 = 0.03**2\n        ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / (\n            1e-12 + (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))\n\n        if size_average:\n            return ssim_map.mean()\n        else:\n            return ssim_map.mean([1, 2, 3])\n\n    def forward(self, img1, img2):\n        channel = img1.shape[1]\n        if channel == self.channel and self.window.dtype == img1.dtype:\n            window = self.window\n        else:\n            window = self.create_window(self.window_size, channel)\n            self.window = window\n            self.channel = channel\n\n        return self._ssim(img1, img2, window, self.window_size, channel,\n                          self.size_average)\n"
  },
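# --- Illustrative sketch (not part of the repo): the KL-distillation term ---
# Reproduces knowledge_distillation_kl_div_loss() above on random logits:
# soften teacher and student with temperature T, evaluate the pointwise KL
# terms p * (log p - log q) against the softened teacher (i.e. KL(teacher ||
# student)), average over classes, and rescale by T*T so gradient magnitude
# stays roughly comparable across temperatures.

import paddle
import paddle.nn.functional as F

T = 10
pred = paddle.randn([4, 80])        # student logits, shape (N, num_classes)
soft_label = paddle.randn([4, 80])  # teacher logits, same shape

target = F.softmax(soft_label / T, axis=1).detach()
kd = F.kl_div(F.log_softmax(pred / T, axis=1), target,
              reduction='none').mean(1) * (T * T)
print(kd.shape)  # [4]: one loss value per sample before reduction/weighting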
  {
    "path": "ppdet/slim/distill_model.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\n\nfrom ppdet.core.workspace import register, create, load_config\nfrom ppdet.utils.checkpoint import load_pretrain_weight\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n__all__ = [\n    'DistillModel',\n    'FGDDistillModel',\n    'CWDDistillModel',\n    'LDDistillModel',\n    'PPYOLOEDistillModel',\n]\n\n\n@register\nclass DistillModel(nn.Layer):\n    \"\"\"\n    Build common distill model.\n    Args:\n        cfg: The student config.\n        slim_cfg: The teacher and distill config.\n    \"\"\"\n\n    def __init__(self, cfg, slim_cfg):\n        super(DistillModel, self).__init__()\n        self.arch = cfg.architecture\n\n        self.stu_cfg = cfg\n        self.student_model = create(self.stu_cfg.architecture)\n        if 'pretrain_weights' in self.stu_cfg and self.stu_cfg.pretrain_weights:\n            stu_pretrain = self.stu_cfg.pretrain_weights\n        else:\n            stu_pretrain = None\n\n        slim_cfg = load_config(slim_cfg)\n        self.tea_cfg = slim_cfg\n        self.teacher_model = create(self.tea_cfg.architecture)\n        if 'pretrain_weights' in self.tea_cfg and self.tea_cfg.pretrain_weights:\n            tea_pretrain = self.tea_cfg.pretrain_weights\n        else:\n            tea_pretrain = None\n        self.distill_cfg = slim_cfg\n\n        # load pretrain weights\n        self.is_inherit = False\n        if stu_pretrain:\n            if self.is_inherit and tea_pretrain:\n                load_pretrain_weight(self.student_model, tea_pretrain)\n                logger.debug(\n                    \"Inheriting! 
loading teacher weights to student model!\")\n            load_pretrain_weight(self.student_model, stu_pretrain)\n            logger.info(\"Student model has loaded pretrain weights!\")\n        if tea_pretrain:\n            load_pretrain_weight(self.teacher_model, tea_pretrain)\n            logger.info(\"Teacher model has loaded pretrain weights!\")\n\n        self.teacher_model.eval()\n        for param in self.teacher_model.parameters():\n            param.trainable = False\n\n        self.distill_loss = self.build_loss(self.distill_cfg)\n\n    def build_loss(self, distill_cfg):\n        if 'distill_loss' in distill_cfg and distill_cfg.distill_loss:\n            return create(distill_cfg.distill_loss)\n        else:\n            return None\n\n    def parameters(self):\n        return self.student_model.parameters()\n\n    def forward(self, inputs):\n        if self.training:\n            student_loss = self.student_model(inputs)\n            with paddle.no_grad():\n                teacher_loss = self.teacher_model(inputs)\n\n            loss = self.distill_loss(self.teacher_model, self.student_model)\n            student_loss['distill_loss'] = loss\n            student_loss['teacher_loss'] = teacher_loss['loss']\n            student_loss['loss'] += student_loss['distill_loss']\n            return student_loss\n        else:\n            return self.student_model(inputs)\n\n\n@register\nclass FGDDistillModel(DistillModel):\n    \"\"\"\n    Build FGD distill model.\n    Args:\n        cfg: The student config.\n        slim_cfg: The teacher and distill config.\n    \"\"\"\n\n    def __init__(self, cfg, slim_cfg):\n        super(FGDDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg)\n        assert self.arch in ['RetinaNet', 'PicoDet'\n                             ], 'Unsupported arch: {}'.format(self.arch)\n        self.is_inherit = True\n\n    def build_loss(self, distill_cfg):\n        assert 'distill_loss_name' in distill_cfg and distill_cfg.distill_loss_name\n        assert 'distill_loss' in distill_cfg and distill_cfg.distill_loss\n        loss_func = dict()\n        name_list = distill_cfg.distill_loss_name\n        for name in name_list:\n            loss_func[name] = create(distill_cfg.distill_loss)\n        return loss_func\n\n    def forward(self, inputs):\n        if self.training:\n            s_body_feats = self.student_model.backbone(inputs)\n            s_neck_feats = self.student_model.neck(s_body_feats)\n            with paddle.no_grad():\n                t_body_feats = self.teacher_model.backbone(inputs)\n                t_neck_feats = self.teacher_model.neck(t_body_feats)\n\n            loss_dict = {}\n            for idx, k in enumerate(self.distill_loss):\n                loss_dict[k] = self.distill_loss[k](s_neck_feats[idx],\n                                                    t_neck_feats[idx], inputs)\n            if self.arch == \"RetinaNet\":\n                loss = self.student_model.head(s_neck_feats, inputs)\n            elif self.arch == \"PicoDet\":\n                head_outs = self.student_model.head(\n                    s_neck_feats, self.student_model.export_post_process)\n                loss_gfl = self.student_model.head.get_loss(head_outs, inputs)\n                total_loss = paddle.add_n(list(loss_gfl.values()))\n                loss = {}\n                loss.update(loss_gfl)\n                loss.update({'loss': total_loss})\n            else:\n                raise ValueError(f\"Unsupported model {self.arch}\")\n\n            for k in 
loss_dict:\n                loss['loss'] += loss_dict[k]\n                loss[k] = loss_dict[k]\n            return loss\n        else:\n            body_feats = self.student_model.backbone(inputs)\n            neck_feats = self.student_model.neck(body_feats)\n            head_outs = self.student_model.head(neck_feats)\n            if self.arch == \"RetinaNet\":\n                bbox, bbox_num = self.student_model.head.post_process(\n                    head_outs, inputs['im_shape'], inputs['scale_factor'])\n                return {'bbox': bbox, 'bbox_num': bbox_num}\n            elif self.arch == \"PicoDet\":\n                head_outs = self.student_model.head(\n                    neck_feats, self.student_model.export_post_process)\n                scale_factor = inputs['scale_factor']\n                bboxes, bbox_num = self.student_model.head.post_process(\n                    head_outs,\n                    scale_factor,\n                    export_nms=self.student_model.export_nms)\n                return {'bbox': bboxes, 'bbox_num': bbox_num}\n            else:\n                raise ValueError(f\"Unsupported model {self.arch}\")\n\n\n@register\nclass CWDDistillModel(DistillModel):\n    \"\"\"\n    Build CWD distill model.\n    Args:\n        cfg: The student config.\n        slim_cfg: The teacher and distill config.
\n    \"\"\"\n\n    def __init__(self, cfg, slim_cfg):\n        super(CWDDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg)\n        assert self.arch in ['GFL', 'RetinaNet'], 'Unsupported arch: {}'.format(\n            self.arch)\n\n    def build_loss(self, distill_cfg):\n        assert 'distill_loss_name' in distill_cfg and distill_cfg.distill_loss_name\n        assert 'distill_loss' in distill_cfg and distill_cfg.distill_loss\n        loss_func = dict()\n        name_list = distill_cfg.distill_loss_name\n        for name in name_list:\n            loss_func[name] = create(distill_cfg.distill_loss)\n        return loss_func\n\n    def get_loss_retinanet(self, stu_fea_list, tea_fea_list, inputs):\n        loss = self.student_model.head(stu_fea_list, inputs)\n        loss_dict = {}\n        for idx, k in enumerate(self.distill_loss):\n            loss_dict[k] = self.distill_loss[k](stu_fea_list[idx],\n                                                tea_fea_list[idx])\n\n            loss['loss'] += loss_dict[k]\n            loss[k] = loss_dict[k]\n        return loss\n\n    def get_loss_gfl(self, stu_fea_list, tea_fea_list, inputs):\n        loss = {}\n        head_outs = self.student_model.head(stu_fea_list)\n        loss_gfl = self.student_model.head.get_loss(head_outs, inputs)\n        loss.update(loss_gfl)\n        total_loss = paddle.add_n(list(loss.values()))\n        loss.update({'loss': total_loss})\n\n        feat_loss = {}\n        loss_dict = {}\n        s_cls_feat, t_cls_feat = [], []\n        for s_neck_f, t_neck_f in zip(stu_fea_list, tea_fea_list):\n            conv_cls_feat, _ = self.student_model.head.conv_feat(s_neck_f)\n            cls_score = self.student_model.head.gfl_head_cls(conv_cls_feat)\n            t_conv_cls_feat, _ = self.teacher_model.head.conv_feat(t_neck_f)\n            t_cls_score = self.teacher_model.head.gfl_head_cls(t_conv_cls_feat)\n            s_cls_feat.append(cls_score)\n            t_cls_feat.append(t_cls_score)\n\n        for idx, k in enumerate(self.distill_loss):\n            loss_dict[k] = self.distill_loss[k](s_cls_feat[idx],\n                                                t_cls_feat[idx])\n            feat_loss[f\"neck_f_{idx}\"] = self.distill_loss[k](stu_fea_list[idx],\n                                                              tea_fea_list[idx])\n\n        for k in feat_loss:\n            loss['loss'] += feat_loss[k]\n            loss[k] = feat_loss[k]\n\n        for k in loss_dict:\n            loss['loss'] += loss_dict[k]\n            loss[k] = loss_dict[k]\n        return loss\n\n    def forward(self, inputs):\n        if self.training:\n            s_body_feats = self.student_model.backbone(inputs)\n            s_neck_feats = self.student_model.neck(s_body_feats)\n            with paddle.no_grad():\n                t_body_feats = self.teacher_model.backbone(inputs)\n                t_neck_feats = self.teacher_model.neck(t_body_feats)\n\n            if self.arch == \"RetinaNet\":\n                loss = self.get_loss_retinanet(s_neck_feats, t_neck_feats,\n                                               inputs)\n            elif self.arch == \"GFL\":\n                loss = self.get_loss_gfl(s_neck_feats, t_neck_feats, inputs)\n            else:\n                raise ValueError(f\"unsupported arch {self.arch}\")\n            return loss\n        else:\n            body_feats = self.student_model.backbone(inputs)\n            neck_feats = self.student_model.neck(body_feats)\n            head_outs = 
self.student_model.head(neck_feats)\n            if self.arch == \"RetinaNet\":\n                bbox, bbox_num = self.student_model.head.post_process(\n                    head_outs, inputs['im_shape'], inputs['scale_factor'])\n                return {'bbox': bbox, 'bbox_num': bbox_num}\n            elif self.arch == \"GFL\":\n                bbox_pred, bbox_num = head_outs\n                output = {'bbox': bbox_pred, 'bbox_num': bbox_num}\n                return output\n            else:\n                raise ValueError(f\"unsupported arch {self.arch}\")\n\n\n@register\nclass LDDistillModel(DistillModel):\n    \"\"\"\n    Build LD distill model.\n    Args:\n        cfg: The student config.\n        slim_cfg: The teacher and distill config.\n    \"\"\"\n\n    def __init__(self, cfg, slim_cfg):\n        super(LDDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg)\n        assert self.arch in ['GFL'], 'Unsupported arch: {}'.format(self.arch)\n\n    def forward(self, inputs):\n        if self.training:\n            s_body_feats = self.student_model.backbone(inputs)\n            s_neck_feats = self.student_model.neck(s_body_feats)\n            s_head_outs = self.student_model.head(s_neck_feats)\n            with paddle.no_grad():\n                t_body_feats = self.teacher_model.backbone(inputs)\n                t_neck_feats = self.teacher_model.neck(t_body_feats)\n                t_head_outs = self.teacher_model.head(t_neck_feats)\n\n            soft_label_list = t_head_outs[0]\n            soft_targets_list = t_head_outs[1]\n            student_loss = self.student_model.head.get_loss(\n                s_head_outs, inputs, soft_label_list, soft_targets_list)\n            total_loss = paddle.add_n(list(student_loss.values()))\n            student_loss['loss'] = total_loss\n            return student_loss\n        else:\n            return self.student_model(inputs)\n\n\n@register\nclass PPYOLOEDistillModel(DistillModel):\n    \"\"\"\n    Build PPYOLOE distill model, only used in PPYOLOE\n    Args:\n        cfg: The student config.\n        slim_cfg: The teacher and distill config.\n    \"\"\"\n\n    def __init__(self, cfg, slim_cfg):\n        super(PPYOLOEDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg)\n        assert self.arch in ['PPYOLOE'], 'Unsupported arch: {}'.format(\n            self.arch)\n\n    def forward(self, inputs, alpha=0.125):\n        if self.training:\n            with paddle.no_grad():\n                teacher_loss = self.teacher_model(inputs)\n            if hasattr(self.teacher_model.yolo_head, \"assigned_labels\"):\n                self.student_model.yolo_head.assigned_labels, self.student_model.yolo_head.assigned_bboxes, self.student_model.yolo_head.assigned_scores = \\\n                    self.teacher_model.yolo_head.assigned_labels, self.teacher_model.yolo_head.assigned_bboxes, self.teacher_model.yolo_head.assigned_scores\n                delattr(self.teacher_model.yolo_head, \"assigned_labels\")\n                delattr(self.teacher_model.yolo_head, \"assigned_bboxes\")\n                delattr(self.teacher_model.yolo_head, \"assigned_scores\")\n            student_loss = self.student_model(inputs)\n\n            logits_loss, feat_loss = self.distill_loss(self.teacher_model,\n                                                       self.student_model)\n            det_total_loss = student_loss['loss']\n            total_loss = alpha * (det_total_loss + logits_loss + feat_loss)\n            student_loss['loss'] = total_loss\n            
student_loss['det_loss'] = det_total_loss\n            student_loss['logits_loss'] = logits_loss\n            student_loss['feat_loss'] = feat_loss\n            return student_loss\n        else:\n            return self.student_model(inputs)\n"
  },
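As a hedged illustration of the channel-wise distillation (CWD) objective that the distill_loss entries built in build_loss above are typically configured to compute, here is a minimal paddle sketch; the function name and temperature value are assumptions, not the DistillCWDLoss implementation registered in the config:

import paddle
import paddle.nn.functional as F

def cwd_kl_loss(stu_feat, tea_feat, tau=4.0):
    # Channel-wise distillation: treat each channel's spatial activations
    # as a distribution and align student with teacher via KL divergence,
    # scaled by tau**2 as is conventional for temperature-softened softmax.
    n, c, h, w = tea_feat.shape
    stu_logp = F.log_softmax(stu_feat.reshape([n * c, h * w]) / tau, axis=-1)
    tea_prob = F.softmax(tea_feat.reshape([n * c, h * w]) / tau, axis=-1)
    return F.kl_div(stu_logp, tea_prob, reduction='sum') * tau ** 2 / (n * c)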
  {
    "path": "ppdet/slim/ofa.py",
    "content": "from __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\n\nfrom ppdet.core.workspace import load_config, merge_config, create\nfrom ppdet.utils.checkpoint import load_weight, load_pretrain_weight\nfrom ppdet.utils.logger import setup_logger\nfrom ppdet.core.workspace import register, serializable\n\nfrom paddle.utils import try_import\n\nlogger = setup_logger(__name__)\n\n\n@register\n@serializable\nclass OFA(object):\n    def __init__(self, ofa_config):\n        super(OFA, self).__init__()\n        self.ofa_config = ofa_config\n\n    def __call__(self, model, param_state_dict):\n\n        paddleslim = try_import('paddleslim')\n        from paddleslim.nas.ofa import OFA, RunConfig, utils\n        from paddleslim.nas.ofa.convert_super import Convert, supernet\n        task = self.ofa_config['task']\n        expand_ratio = self.ofa_config['expand_ratio']\n\n        skip_neck = self.ofa_config['skip_neck']\n        skip_head = self.ofa_config['skip_head']\n\n        run_config = self.ofa_config['RunConfig']\n        if 'skip_layers' in run_config:\n            skip_layers = run_config['skip_layers']\n        else:\n            skip_layers = []\n\n        # supernet config\n        sp_config = supernet(expand_ratio=expand_ratio)\n        # convert to supernet\n        model = Convert(sp_config).convert(model)\n\n        skip_names = []\n        if skip_neck:\n            skip_names.append('neck.')\n        if skip_head:\n            skip_names.append('head.')\n\n        for name, sublayer in model.named_sublayers():\n            for n in skip_names:\n                if n in name:\n                    skip_layers.append(name)\n\n        run_config['skip_layers'] = skip_layers\n        run_config = RunConfig(**run_config)\n\n        # build ofa model\n        ofa_model = OFA(model, run_config=run_config)\n\n        ofa_model.set_epoch(0)\n        ofa_model.set_task(task)\n\n        input_spec = [{\n            \"image\": paddle.ones(\n                shape=[1, 3, 640, 640], dtype='float32'),\n            \"im_shape\": paddle.full(\n                [1, 2], 640, dtype='float32'),\n            \"scale_factor\": paddle.ones(\n                shape=[1, 2], dtype='float32')\n        }]\n\n        ofa_model._clear_search_space(input_spec=input_spec)\n        ofa_model._build_ss = True\n        check_ss = ofa_model._sample_config('expand_ratio', phase=None)\n        # tokenize the search space\n        ofa_model.tokenize()\n        # check token map, search cands and search space\n        logger.info('Token map is {}'.format(ofa_model.token_map))\n        logger.info('Search candidates is {}'.format(ofa_model.search_cands))\n        logger.info('The length of search_space is {}, search_space is {}'.\n                    format(len(ofa_model._ofa_layers), ofa_model._ofa_layers))\n        # set model state dict into ofa model\n        utils.set_state_dict(ofa_model.model, param_state_dict)\n        return ofa_model\n"
  },
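The keys read by OFA.__call__ above imply a slim config of roughly the following shape; the values are placeholders, and which additional RunConfig fields paddleslim requires depends on the installed version:

ofa_config = {
    'task': 'expand_ratio',
    'expand_ratio': [0.25, 0.5, 1.0],  # candidate channel expand ratios
    'skip_neck': True,                 # exclude neck layers from the supernet
    'skip_head': True,                 # exclude head layers from the supernet
    'RunConfig': {
        'skip_layers': [],             # extra layer names to exclude
    },
}
# OFA here is the ppdet wrapper registered above, not paddleslim's class
ofa_model = OFA(ofa_config)(model, param_state_dict)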
  {
    "path": "ppdet/slim/prune.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport paddle\nfrom paddle.utils import try_import\n\nfrom ppdet.core.workspace import register, serializable\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n\ndef print_prune_params(model):\n    model_dict = model.state_dict()\n    for key in model_dict.keys():\n        weight_name = model_dict[key].name\n        logger.info('Parameter name: {}, shape: {}'.format(\n            weight_name, model_dict[key].shape))\n\n\n@register\n@serializable\nclass Pruner(object):\n    def __init__(self,\n                 criterion,\n                 pruned_params,\n                 pruned_ratios,\n                 print_params=False):\n        super(Pruner, self).__init__()\n        assert criterion in ['l1_norm', 'fpgm'], \\\n            \"unsupported prune criterion: {}\".format(criterion)\n        self.criterion = criterion\n        self.pruned_params = pruned_params\n        self.pruned_ratios = pruned_ratios\n        self.print_params = print_params\n\n    def __call__(self, model):\n        # FIXME: adapt to network graph when Training and inference are\n        # inconsistent, now only supports prune inference network graph.\n        model.eval()\n        paddleslim = try_import('paddleslim')\n        from paddleslim.analysis import dygraph_flops as flops\n        input_spec = [{\n            \"image\": paddle.ones(\n                shape=[1, 3, 640, 640], dtype='float32'),\n            \"im_shape\": paddle.full(\n                [1, 2], 640, dtype='float32'),\n            \"scale_factor\": paddle.ones(\n                shape=[1, 2], dtype='float32')\n        }]\n        if self.print_params:\n            print_prune_params(model)\n\n        ori_flops = flops(model, input_spec) / (1000**3)\n        logger.info(\"FLOPs before pruning: {}GFLOPs\".format(ori_flops))\n        if self.criterion == 'fpgm':\n            pruner = paddleslim.dygraph.FPGMFilterPruner(model, input_spec)\n        elif self.criterion == 'l1_norm':\n            pruner = paddleslim.dygraph.L1NormFilterPruner(model, input_spec)\n\n        logger.info(\"pruned params: {}\".format(self.pruned_params))\n        pruned_ratios = [float(n) for n in self.pruned_ratios]\n        ratios = {}\n        for i, param in enumerate(self.pruned_params):\n            ratios[param] = pruned_ratios[i]\n        pruner.prune_vars(ratios, [0])\n        pruned_flops = flops(model, input_spec) / (1000**3)\n        logger.info(\"FLOPs after pruning: {}GFLOPs; pruned ratio: {}\".format(\n            pruned_flops, (ori_flops - pruned_flops) / ori_flops))\n\n        return model\n\n\n@register\n@serializable\nclass PrunerQAT(object):\n    def __init__(self, criterion, pruned_params, pruned_ratios,\n                 print_prune_params, quant_config, print_qat_model):\n        
super(PrunerQAT, self).__init__()\n        assert criterion in ['l1_norm', 'fpgm'], \\\n            \"unsupported prune criterion: {}\".format(criterion)\n        # Pruner hyperparameter\n        self.criterion = criterion\n        self.pruned_params = pruned_params\n        self.pruned_ratios = pruned_ratios\n        self.print_prune_params = print_prune_params\n        # QAT hyperparameter\n        self.quant_config = quant_config\n        self.print_qat_model = print_qat_model\n\n    def __call__(self, model):\n        # FIXME: adapt to network graph when Training and inference are\n        # inconsistent, now only supports prune inference network graph.\n        model.eval()\n        paddleslim = try_import('paddleslim')\n        from paddleslim.analysis import dygraph_flops as flops\n        input_spec = [{\n            \"image\": paddle.ones(\n                shape=[1, 3, 640, 640], dtype='float32'),\n            \"im_shape\": paddle.full(\n                [1, 2], 640, dtype='float32'),\n            \"scale_factor\": paddle.ones(\n                shape=[1, 2], dtype='float32')\n        }]\n        if self.print_prune_params:\n            print_prune_params(model)\n\n        ori_flops = flops(model, input_spec) / (1000**3)\n        logger.info(\"FLOPs before pruning: {}GFLOPs\".format(ori_flops))\n        if self.criterion == 'fpgm':\n            pruner = paddleslim.dygraph.FPGMFilterPruner(model, input_spec)\n        elif self.criterion == 'l1_norm':\n            pruner = paddleslim.dygraph.L1NormFilterPruner(model, input_spec)\n\n        logger.info(\"pruned params: {}\".format(self.pruned_params))\n        pruned_ratios = [float(n) for n in self.pruned_ratios]\n        ratios = {}\n        for i, param in enumerate(self.pruned_params):\n            ratios[param] = pruned_ratios[i]\n        pruner.prune_vars(ratios, [0])\n        pruned_flops = flops(model, input_spec) / (1000**3)\n        logger.info(\"FLOPs after pruning: {}GFLOPs; pruned ratio: {}\".format(\n            pruned_flops, (ori_flops - pruned_flops) / ori_flops))\n\n        self.quanter = paddleslim.dygraph.quant.QAT(config=self.quant_config)\n\n        self.quanter.quantize(model)\n\n        if self.print_qat_model:\n            logger.info(\"Quantized model:\")\n            logger.info(model)\n\n        return model\n\n    def save_quantized_model(self, layer, path, input_spec=None, **config):\n        self.quanter.save_quantized_model(\n            model=layer, path=path, input_spec=input_spec, **config)\n"
  },
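An illustrative use of Pruner; the weight name and ratio below are hypothetical and must name conv filter weights that actually exist in the model's state dict:

pruner = Pruner(
    criterion='fpgm',                # or 'l1_norm'
    pruned_params=['conv2d_1.w_0'],  # hypothetical parameter name
    pruned_ratios=[0.3],             # drop 30% of filters along axis 0
    print_params=False)
model = pruner(model)                # prunes in place, logs FLOPs before/after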
  {
    "path": "ppdet/slim/quant.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom paddle.utils import try_import\n\nfrom ppdet.core.workspace import register, serializable\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n\n@register\n@serializable\nclass QAT(object):\n    def __init__(self, quant_config, print_model):\n        super(QAT, self).__init__()\n        self.quant_config = quant_config\n        self.print_model = print_model\n\n    def __call__(self, model):\n        paddleslim = try_import('paddleslim')\n        self.quanter = paddleslim.dygraph.quant.QAT(config=self.quant_config)\n        if self.print_model:\n            logger.info(\"Model before quant:\")\n            logger.info(model)\n\n        # For PP-YOLOE, convert model to deploy firstly.\n        for layer in model.sublayers():\n            if hasattr(layer, 'convert_to_deploy'):\n                layer.convert_to_deploy()\n\n        self.quanter.quantize(model)\n\n        if self.print_model:\n            logger.info(\"Quantized model:\")\n            logger.info(model)\n\n        return model\n\n    def save_quantized_model(self, layer, path, input_spec=None, **config):\n        self.quanter.save_quantized_model(\n            model=layer, path=path, input_spec=input_spec, **config)\n\n\n@register\n@serializable\nclass PTQ(object):\n    def __init__(self,\n                 ptq_config,\n                 quant_batch_num=10,\n                 output_dir='output_inference',\n                 fuse=True,\n                 fuse_list=None):\n        super(PTQ, self).__init__()\n        self.ptq_config = ptq_config\n        self.quant_batch_num = quant_batch_num\n        self.output_dir = output_dir\n        self.fuse = fuse\n        self.fuse_list = fuse_list\n\n    def __call__(self, model):\n        paddleslim = try_import('paddleslim')\n        self.ptq = paddleslim.PTQ(**self.ptq_config)\n        model.eval()\n        quant_model = self.ptq.quantize(\n            model, fuse=self.fuse, fuse_list=self.fuse_list)\n\n        return quant_model\n\n    def save_quantized_model(self,\n                             quant_model,\n                             quantize_model_path,\n                             input_spec=None):\n        self.ptq.save_quantized_model(quant_model, quantize_model_path,\n                                      input_spec)\n"
  },
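A sketch of driving QAT; the quant_config keys below mirror common paddleslim dygraph QAT options, but the exact set accepted depends on the paddleslim version:

quant_config = {
    'weight_quantize_type': 'channel_wise_abs_max',
    'activation_quantize_type': 'moving_average_abs_max',
    'weight_bits': 8,
    'activation_bits': 8,
    'quantizable_layer_type': ['Conv2D', 'Linear'],
}
model = QAT(quant_config=quant_config, print_model=False)(model)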
  {
    "path": "ppdet/slim/unstructured_prune.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom paddle.utils import try_import\n\nfrom ppdet.core.workspace import register, serializable\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger(__name__)\n\n\n@register\n@serializable\nclass UnstructuredPruner(object):\n    def __init__(self,\n                 stable_epochs,\n                 pruning_epochs,\n                 tunning_epochs,\n                 pruning_steps,\n                 ratio,\n                 initial_ratio,\n                 prune_params_type=None):\n        self.stable_epochs = stable_epochs\n        self.pruning_epochs = pruning_epochs\n        self.tunning_epochs = tunning_epochs\n        self.ratio = ratio\n        self.prune_params_type = prune_params_type\n        self.initial_ratio = initial_ratio\n        self.pruning_steps = pruning_steps\n\n    def __call__(self, model, steps_per_epoch, skip_params_func=None):\n        paddleslim = try_import('paddleslim')\n        from paddleslim import GMPUnstructuredPruner\n        configs = {\n            'pruning_strategy': 'gmp',\n            'stable_iterations': self.stable_epochs * steps_per_epoch,\n            'pruning_iterations': self.pruning_epochs * steps_per_epoch,\n            'tunning_iterations': self.tunning_epochs * steps_per_epoch,\n            'resume_iteration': 0,\n            'pruning_steps': self.pruning_steps,\n            'initial_ratio': self.initial_ratio,\n        }\n\n        pruner = GMPUnstructuredPruner(\n            model,\n            ratio=self.ratio,\n            skip_params_func=skip_params_func,\n            prune_params_type=self.prune_params_type,\n            local_sparsity=True,\n            configs=configs)\n\n        return pruner\n"
  },
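A hypothetical GMP schedule for UnstructuredPruner above: sparsity ramps from initial_ratio up to ratio over pruning_epochs, then stays fixed while fine-tuning for tunning_epochs:

builder = UnstructuredPruner(
    stable_epochs=0,      # warm up with no pruning
    pruning_epochs=60,    # ramp sparsity from initial_ratio to ratio
    tunning_epochs=40,    # fine-tune at the final sparsity
    pruning_steps=100,    # number of sparsity increments during the ramp
    ratio=0.75,
    initial_ratio=0.15)
pruner = builder(model, steps_per_epoch=1000)
# the returned paddleslim pruner is then stepped once per training iteration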
  {
    "path": "ppdet/utils/__init__.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppdet/utils/cam_utils.py",
    "content": "import numpy as np\nimport cv2\nimport os\nimport sys\nimport glob\nfrom ppdet.utils.logger import setup_logger\nimport copy\nlogger = setup_logger('ppdet_cam')\n\nimport paddle\nfrom ppdet.engine import Trainer\n\n\ndef get_test_images(infer_dir, infer_img):\n    \"\"\"\n    Get image path list in TEST mode\n    \"\"\"\n    assert infer_img is not None or infer_dir is not None, \\\n        \"--infer_img or --infer_dir should be set\"\n    assert infer_img is None or os.path.isfile(infer_img), \\\n            \"{} is not a file\".format(infer_img)\n    assert infer_dir is None or os.path.isdir(infer_dir), \\\n            \"{} is not a directory\".format(infer_dir)\n\n    # infer_img has a higher priority\n    if infer_img and os.path.isfile(infer_img):\n        return [infer_img]\n\n    images = set()\n    infer_dir = os.path.abspath(infer_dir)\n    assert os.path.isdir(infer_dir), \\\n        \"infer_dir {} is not a directory\".format(infer_dir)\n    exts = ['jpg', 'jpeg', 'png', 'bmp']\n    exts += [ext.upper() for ext in exts]\n    for ext in exts:\n        images.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))\n    images = list(images)\n\n    assert len(images) > 0, \"no image found in {}\".format(infer_dir)\n    logger.info(\"Found {} inference images in total.\".format(len(images)))\n\n    return images\n\n\ndef compute_ious(boxes1, boxes2):\n    \"\"\"[Compute pairwise IOU matrix for given two sets of boxes]\n\n        Args:\n            boxes1 ([numpy ndarray with shape N,4]): [representing bounding boxes with format (xmin,ymin,xmax,ymax)]\n            boxes2 ([numpy ndarray with shape M,4]): [representing bounding boxes with format (xmin,ymin,xmax,ymax)]\n        Returns:\n            pairwise IOU maxtrix with shape (N,M)，where the value at ith row jth column hold the iou between ith\n            box and jth box from box1 and box2 respectively.\n    \"\"\"\n    lu = np.maximum(\n        boxes1[:, None, :2], boxes2[:, :2]\n    )  # lu with shape N,M,2 ; boxes1[:,None,:2] with shape (N,1,2) boxes2 with shape(M,2)\n    rd = np.minimum(boxes1[:, None, 2:], boxes2[:, 2:])  # rd same to lu\n    intersection_wh = np.maximum(0.0, rd - lu)\n    intersection_area = intersection_wh[:, :,\n                                        0] * intersection_wh[:, :,\n                                                             1]  # with shape (N,M)\n    boxes1_wh = np.maximum(0.0, boxes1[:, 2:] - boxes1[:, :2])\n    boxes1_area = boxes1_wh[:, 0] * boxes1_wh[:, 1]  # with shape (N,)\n    boxes2_wh = np.maximum(0.0, boxes2[:, 2:] - boxes2[:, :2])\n    boxes2_area = boxes2_wh[:, 0] * boxes2_wh[:, 1]  # with shape (M,)\n    union_area = np.maximum(\n        boxes1_area[:, None] + boxes2_area - intersection_area,\n        1e-8)  # with shape (N,M)\n    ious = np.clip(intersection_area / union_area, 0.0, 1.0)\n    return ious\n\n\ndef grad_cam(feat, grad):\n    \"\"\"\n\n    Args:\n        feat:  CxHxW\n        grad:  CxHxW\n\n    Returns:\n           cam: HxW\n    \"\"\"\n    exp = (feat * grad.mean((1, 2), keepdims=True)).mean(axis=0)\n    exp = np.maximum(-exp, 0)\n    return exp\n\n\ndef resize_cam(explanation, resize_shape) -> np.ndarray:\n    \"\"\"\n\n    Args:\n        explanation: (width, height)\n        resize_shape: (width, height)\n\n    Returns:\n\n    \"\"\"\n    assert len(explanation.shape) == 2, f\"{explanation.shape}. \" \\\n                                        f\"Currently support 2D explanation results for visualization. 
\" \\\n                                        \"Reduce higher dimensions to 2D for visualization.\"\n\n    explanation = (explanation - explanation.min()) / (\n        explanation.max() - explanation.min())\n\n    explanation = cv2.resize(explanation, resize_shape)\n    explanation = np.uint8(255 * explanation)\n    explanation = cv2.applyColorMap(explanation, cv2.COLORMAP_JET)\n    explanation = cv2.cvtColor(explanation, cv2.COLOR_BGR2RGB)\n\n    return explanation\n\n\nclass BBoxCAM:\n    def __init__(self, FLAGS, cfg):\n        self.FLAGS = FLAGS\n        self.cfg = cfg\n        # build model\n        self.trainer = self.build_trainer(cfg)\n        # num_class\n        self.num_class = cfg.num_classes\n        # set hook for extraction of featuremaps and grads\n        self.set_hook(cfg)\n        self.nms_idx_need_divid_numclass_arch = [\n            'FasterRCNN', 'MaskRCNN', 'CascadeRCNN'\n        ]\n        \"\"\"\n        In these networks, the bbox array shape before nms contain num_class,\n        the nms_keep_idx of the bbox need to divide the num_class; \n        \"\"\"\n\n        # cam image output_dir\n        try:\n            os.makedirs(FLAGS.cam_out)\n        except:\n            print('Path already exists.')\n            pass\n\n    def build_trainer(self, cfg):\n        # build trainer\n        trainer = Trainer(cfg, mode='test')\n        # load weights\n        trainer.load_weights(cfg.weights)\n\n        # set for get extra_data before nms\n        trainer.model.use_extra_data = True\n        # set for record the bbox index before nms\n        if cfg.architecture in ['FasterRCNN', 'MaskRCNN']:\n            trainer.model.bbox_post_process.nms.return_index = True\n        elif cfg.architecture in ['YOLOv3', 'PPYOLOE', 'PPYOLOEWithAuxHead']:\n            if trainer.model.post_process is not None:\n                # anchor based YOLOs: YOLOv3,PP-YOLO\n                trainer.model.post_process.nms.return_index = True\n            else:\n                # anchor free YOLOs: PP-YOLOE, PP-YOLOE+\n                trainer.model.yolo_head.nms.return_index = True\n        elif cfg.architecture == 'BlazeFace' or cfg.architecture == 'SSD':\n            trainer.model.post_process.nms.return_index = True\n        elif cfg.architecture == 'RetinaNet':\n            trainer.model.head.nms.return_index = True\n        else:\n            print(cfg.architecture + ' is not supported for cam temporarily!')\n            sys.exit()\n        # Todo: Unify the head/post_process name in each model\n\n        return trainer\n\n    def set_hook(self, cfg):\n        # set hook for extraction of featuremaps and grads\n        self.target_feats = {}\n        self.target_layer_name = cfg.target_feature_layer_name\n\n        # such as trainer.model.backbone, trainer.model.bbox_head.roi_extractor\n\n        def hook(layer, input, output):\n            self.target_feats[layer._layer_name_for_hook] = output\n\n        try:\n            exec('self.trainer.' + self.target_layer_name +\n                 '._layer_name_for_hook = self.target_layer_name')\n            # self.trainer.target_layer_name._layer_name_for_hook = self.target_layer_name\n            exec('self.trainer.' + self.target_layer_name +\n                 '.register_forward_post_hook(hook)')\n            # self.trainer.target_layer_name.register_forward_post_hook(hook)\n        except:\n            print(\"Error! \"\n                  \"The target_layer_name--\" + self.target_layer_name +\n                  \" is not in model! 
\"\n                  \"Please check the spelling and \"\n                  \"the network's architecture!\")\n            sys.exit()\n\n    def get_bboxes(self):\n        # get inference images\n        images = get_test_images(self.FLAGS.infer_dir, self.FLAGS.infer_img)\n\n        # inference\n        result = self.trainer.predict(\n            images,\n            draw_threshold=self.FLAGS.draw_threshold,\n            output_dir=self.FLAGS.output_dir,\n            save_results=self.FLAGS.save_results,\n            visualize=False)[0]\n        return result\n\n    def get_bboxes_cams(self):\n        # Get the bboxes prediction(after nms result) of the input\n        inference_result = self.get_bboxes()\n\n        # read input image\n        # Todo: Support folder multi-images process\n        from PIL import Image\n        img = np.array(Image.open(self.cfg.infer_img))\n\n        # data for calaulate bbox grad_cam\n        extra_data = inference_result['extra_data']\n        \"\"\"\n        Example of Faster_RCNN based architecture:\n            extra_data: {'scores': tensor with shape [num_of_bboxes_before_nms, num_classes], for example: [1000, 80]\n                       'nms_keep_idx': tensor with shape [num_of_bboxes_after_nms, 1], for example: [300, 1]\n                      }\n        Example of YOLOv3 based architecture:\n            extra_data: {'scores': tensor with shape [1, num_classes, num_of_yolo_bboxes_before_nms], #for example: [1, 80, 8400]\n                       'nms_keep_idx': tensor with shape [num_of_yolo_bboxes_after_nms, 1], # for example: [300, 1]\n                      }\n        \"\"\"\n\n        # array index of the predicted bbox before nms\n        if self.cfg.architecture in self.nms_idx_need_divid_numclass_arch:\n            # some network's bbox array shape before nms may be like [num_of_bboxes_before_nms, num_classes, 4],\n            # we need to divide num_classes to get the before_nms_index；\n            # currently, only include the rcnn architectures （fasterrcnn, maskrcnn, cascadercnn);\n            before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy(\n            ) // self.num_class  # num_class\n        else:\n            before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy()\n\n        # Calculate and visualize the heatmap of per predict bbox\n        for index, target_bbox in enumerate(inference_result['bbox']):\n            # target_bbox: [cls, score, x1, y1, x2, y2]\n            # filter bboxes with low predicted scores\n            if target_bbox[1] < self.FLAGS.draw_threshold:\n                continue\n\n            target_bbox_before_nms = int(before_nms_indexes[index])\n\n            if len(extra_data['scores'].shape) == 2:\n                score_out = extra_data['scores'][target_bbox_before_nms]\n            else:\n                score_out = extra_data['scores'][0, :, target_bbox_before_nms]\n            \"\"\"\n            There are two kinds array shape of bbox score output :\n                1) [num_of_bboxes_before_nms, num_classes], for example: [1000, 80]\n                2) [num_of_image, num_classes, num_of_yolo_bboxes_before_nms], for example: [1, 80, 1000]\n            \"\"\"\n\n            # construct one_hot label and do backward to get the gradients\n            predicted_label = paddle.argmax(score_out)\n            label_onehot = paddle.nn.functional.one_hot(\n                predicted_label, num_classes=len(score_out))\n            label_onehot = label_onehot.squeeze()\n            target = 
paddle.sum(score_out * label_onehot)\n            target.backward(retain_graph=True)\n\n\n            if 'backbone' in self.target_layer_name or \\\n                    'neck' in self.target_layer_name: # backbone/neck level feature\n                if isinstance(self.target_feats[self.target_layer_name], list):\n                    # when the featuremap consists of multiple scales,\n                    # take the featuremap of the last scale\n                    # Todo: fuse the cam results from multiscale featuremaps\n                    if self.target_feats[self.target_layer_name][-1].shape[\n                            -1] == 1:\n                        \"\"\"\n                        if the last level featuremap is of size 1x1,\n                        we take the second-to-last one\n                        \"\"\"\n                        cam_grad = self.target_feats[self.target_layer_name][\n                            -2].grad.squeeze().cpu().numpy()\n                        cam_feat = self.target_feats[self.target_layer_name][\n                            -2].squeeze().cpu().numpy()\n                    else:\n                        cam_grad = self.target_feats[self.target_layer_name][\n                            -1].grad.squeeze().cpu().numpy()\n                        cam_feat = self.target_feats[self.target_layer_name][\n                            -1].squeeze().cpu().numpy()\n                else:\n                    cam_grad = self.target_feats[\n                        self.target_layer_name].grad.squeeze().cpu().numpy()\n                    cam_feat = self.target_feats[\n                        self.target_layer_name].squeeze().cpu().numpy()\n            else:  # roi level feature\n                cam_grad = self.target_feats[\n                    self.target_layer_name].grad.squeeze().cpu().numpy()[\n                        target_bbox_before_nms]\n                cam_feat = self.target_feats[self.target_layer_name].squeeze(\n                ).cpu().numpy()[target_bbox_before_nms]\n\n            # grad_cam:\n            exp = grad_cam(cam_feat, cam_grad)\n\n            if 'backbone' in self.target_layer_name or \\\n                    'neck' in self.target_layer_name:\n                \"\"\"\n                when using a backbone/neck featuremap,\n                we first compute the cam on the whole image,\n                and then set the area outside the predicted bbox to 0\n                \"\"\"\n                # reshape the cam image to the input image size\n                resized_exp = resize_cam(exp, (img.shape[1], img.shape[0]))\n                mask = np.zeros((img.shape[0], img.shape[1], 3))\n                mask[int(target_bbox[3]):int(target_bbox[5]), int(target_bbox[\n                    2]):int(target_bbox[4]), :] = 1\n                resized_exp = resized_exp * mask\n                # add the bbox cam back to the input image\n                overlay_vis = np.uint8(resized_exp * 0.4 + img * 0.6)\n            elif 'roi' in self.target_layer_name:\n                # get the bbox part of the image\n                bbox_img = copy.deepcopy(img[int(target_bbox[3]):int(\n                    target_bbox[5]), int(target_bbox[2]):int(target_bbox[\n                        4]), :])\n                # reshape the cam image to the bbox size\n                resized_exp = resize_cam(exp,\n                                         (bbox_img.shape[1], bbox_img.shape[0]))\n                # add the bbox cam back to the bbox image\n                bbox_overlay_vis = np.uint8(resized_exp * 0.4 
+ bbox_img * 0.6)\n                # put the bbox_cam image back into the original image\n                overlay_vis = copy.deepcopy(img)\n                overlay_vis[int(target_bbox[3]):int(target_bbox[5]), int(\n                    target_bbox[2]):int(target_bbox[4]), :] = bbox_overlay_vis\n            else:\n                print(\n                    'cam is only supported for backbone/neck features and roi features; other layers are not supported yet!'\n                )\n                sys.exit()\n\n            # draw the bbox rectangle on the image\n            cv2.rectangle(\n                overlay_vis, (int(target_bbox[2]), int(target_bbox[3])),\n                (int(target_bbox[4]), int(target_bbox[5])), (0, 0, 255), 2)\n\n            # save visualization result\n            cam_image = Image.fromarray(overlay_vis)\n            cam_image.save(self.FLAGS.cam_out + '/' + str(index) + '.jpg')\n\n            # clear gradients after each bbox grad_cam\n            target.clear_gradient()\n            for n, v in self.trainer.model.named_sublayers():\n                v.clear_gradients()\n"
  },
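A quick smoke test of the two pure-numpy helpers above on random data; the shapes are arbitrary:

import numpy as np
from ppdet.utils.cam_utils import grad_cam, resize_cam

feat = np.random.randn(256, 32, 32).astype('float32')  # CxHxW feature map
grad = np.random.randn(256, 32, 32).astype('float32')  # matching gradients
cam = grad_cam(feat, grad)             # HxW saliency map
heatmap = resize_cam(cam, (640, 480))  # 480x640x3 uint8 RGB heatmap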
  {
    "path": "ppdet/utils/check.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport sys\n\nimport paddle\nimport six\nimport paddle.version as paddle_version\n\nfrom .logger import setup_logger\nlogger = setup_logger(__name__)\n\n__all__ = [\n    'check_gpu', 'check_npu', 'check_xpu', 'check_mlu', 'check_version',\n    'check_config'\n]\n\n\ndef check_mlu(use_mlu):\n    \"\"\"\n    Log error and exit when set use_mlu=true in paddlepaddle\n    cpu/gpu/xpu/npu version.\n    \"\"\"\n    err = \"Config use_mlu cannot be set as true while you are \" \\\n          \"using paddlepaddle cpu/gpu/xpu/npu version ! \\nPlease try: \\n\" \\\n          \"\\t1. Install paddlepaddle-mlu to run model on MLU \\n\" \\\n          \"\\t2. Set use_mlu as false in config file to run \" \\\n          \"model on CPU/GPU/XPU/NPU\"\n\n    try:\n        if use_mlu and not paddle.is_compiled_with_mlu():\n            logger.error(err)\n            sys.exit(1)\n    except Exception as e:\n        pass\n\n\ndef check_npu(use_npu):\n    \"\"\"\n    Log error and exit when set use_npu=true in paddlepaddle\n    version without paddle-custom-npu installed.\n    \"\"\"\n    err = \"Config use_npu cannot be set as true while you are \" \\\n          \"using paddlepaddle version without paddle-custom-npu \" \\\n          \"installed! \\nPlease try: \\n\" \\\n          \"\\t1. Install paddle-custom-npu to run model on NPU \\n\" \\\n          \"\\t2. Set use_npu as false in config file to run \" \\\n          \"model on other devices supported.\"\n\n    try:\n        if use_npu and not 'npu' in paddle.device.get_all_custom_device_type():\n            logger.error(err)\n            sys.exit(1)\n    except Exception as e:\n        pass\n\n\ndef check_xpu(use_xpu):\n    \"\"\"\n    Log error and exit when set use_xpu=true in paddlepaddle\n    cpu/gpu/npu version.\n    \"\"\"\n    err = \"Config use_xpu cannot be set as true while you are \" \\\n          \"using paddlepaddle cpu/gpu/npu version ! \\nPlease try: \\n\" \\\n          \"\\t1. Install paddlepaddle-xpu to run model on XPU \\n\" \\\n          \"\\t2. Set use_xpu as false in config file to run \" \\\n          \"model on CPU/GPU/NPU\"\n\n    try:\n        if use_xpu and not paddle.is_compiled_with_xpu():\n            logger.error(err)\n            sys.exit(1)\n    except Exception as e:\n        pass\n\n\ndef check_gpu(use_gpu):\n    \"\"\"\n    Log error and exit when set use_gpu=true in paddlepaddle\n    cpu version.\n    \"\"\"\n    err = \"Config use_gpu cannot be set as true while you are \" \\\n          \"using paddlepaddle cpu version ! \\nPlease try: \\n\" \\\n          \"\\t1. Install paddlepaddle-gpu to run model on GPU \\n\" \\\n          \"\\t2. 
Set use_gpu as false in config file to run \" \\\n          \"model on CPU\"\n\n    try:\n        if use_gpu and not paddle.is_compiled_with_cuda():\n            logger.error(err)\n            sys.exit(1)\n    except Exception as e:\n        pass\n\n\ndef check_version(version='2.2'):\n    \"\"\"\n    Log error and exit when the installed version of paddlepaddle is\n    not satisfied.\n    \"\"\"\n    err = \"PaddlePaddle version {} or higher is required, \" \\\n          \"or a suitable develop version is satisfied as well. \\n\" \\\n          \"Please make sure the version is good with your code.\".format(version)\n\n    version_installed = [\n        paddle_version.major, paddle_version.minor, paddle_version.patch,\n        paddle_version.rc\n    ]\n\n    if version_installed == ['0', '0', '0', '0']:\n        return\n\n    version_split = version.split('.')\n\n    length = min(len(version_installed), len(version_split))\n    for i in six.moves.range(length):\n        if version_installed[i] > version_split[i]:\n            return\n        if version_installed[i] < version_split[i]:\n            raise Exception(err)\n\n\ndef check_config(cfg):\n    \"\"\"\n    Check the correctness of the configuration file. Log error and exit\n    when Config is not compliant.\n    \"\"\"\n    err = \"'{}' not specified in config file. Please set it in config file.\"\n    check_list = ['architecture', 'num_classes']\n    try:\n        for var in check_list:\n            if not var in cfg:\n                logger.error(err.format(var))\n                sys.exit(1)\n    except Exception as e:\n        pass\n\n    if 'log_iter' not in cfg:\n        cfg.log_iter = 20\n\n    return cfg\n"
  },
  {
    "path": "ppdet/utils/checkpoint.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nfrom __future__ import unicode_literals\n\nimport os\nimport json\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nfrom .download import get_weights_path\n\nfrom .logger import setup_logger\nlogger = setup_logger(__name__)\n\ndef convert_to_dict(obj):\n    if isinstance(obj, dict):\n        return {k: convert_to_dict(v) for k, v in obj.items()}\n    elif isinstance(obj, list):\n        return [convert_to_dict(i) for i in obj]\n    else:\n        return obj\n\ndef is_url(path):\n    \"\"\"\n    Whether path is URL.\n    Args:\n        path (string): URL string or not.\n    \"\"\"\n    return path.startswith('http://') \\\n            or path.startswith('https://') \\\n            or path.startswith('ppdet://')\n\n\ndef _strip_postfix(path):\n    path, ext = os.path.splitext(path)\n    assert ext in ['', '.pdparams', '.pdopt', '.pdmodel'], \\\n            \"Unknown postfix {} from weights\".format(ext)\n    return path\n\n\ndef load_weight(model, weight, optimizer=None, ema=None, exchange=True):\n    if is_url(weight):\n        weight = get_weights_path(weight)\n\n    path = _strip_postfix(weight)\n    pdparam_path = path + '.pdparams'\n    if not os.path.exists(pdparam_path):\n        raise ValueError(\"Model pretrain path {} does not \"\n                         \"exists.\".format(pdparam_path))\n\n    if ema is not None and os.path.exists(path + '.pdema'):\n        if exchange:\n            # Exchange model and ema_model to load\n            logger.info('Exchange model and ema_model to load:')\n            ema_state_dict = paddle.load(pdparam_path)\n            logger.info('Loading ema_model weights from {}'.format(path +\n                                                                   '.pdparams'))\n            param_state_dict = paddle.load(path + '.pdema')\n            logger.info('Loading model weights from {}'.format(path + '.pdema'))\n        else:\n            ema_state_dict = paddle.load(path + '.pdema')\n            logger.info('Loading ema_model weights from {}'.format(path +\n                                                                   '.pdema'))\n            param_state_dict = paddle.load(pdparam_path)\n            logger.info('Loading model weights from {}'.format(path +\n                                                               '.pdparams'))\n    else:\n        ema_state_dict = None\n        param_state_dict = paddle.load(pdparam_path)\n\n    if hasattr(model, 'modelTeacher') and hasattr(model, 'modelStudent'):\n        print('Loading pretrain weights for Teacher-Student framework.')\n        print('Loading pretrain weights for Student model.')\n        student_model_dict = model.modelStudent.state_dict()\n        student_param_state_dict = match_state_dict(\n            student_model_dict, param_state_dict, 
mode='student')\n        model.modelStudent.set_dict(student_param_state_dict)\n        print('Loading pretrain weights for Teacher model.')\n        teacher_model_dict = model.modelTeacher.state_dict()\n\n        teacher_param_state_dict = match_state_dict(\n            teacher_model_dict, param_state_dict, mode='teacher')\n        model.modelTeacher.set_dict(teacher_param_state_dict)\n\n    else:\n        model_dict = model.state_dict()\n        model_weight = {}\n        incorrect_keys = 0\n        for key in model_dict.keys():\n            if key in param_state_dict.keys():\n                model_weight[key] = param_state_dict[key]\n            else:\n                logger.info('Unmatched key: {}'.format(key))\n                incorrect_keys += 1\n        assert incorrect_keys == 0, \"Loaded weight {} incorrectly, \\\n                {} keys unmatched, please check again.\".format(weight,\n                                                               incorrect_keys)\n        logger.info('Finish resuming model weights: {}'.format(pdparam_path))\n        model.set_dict(model_weight)\n\n    last_epoch = 0\n    if optimizer is not None and os.path.exists(path + '.pdopt'):\n        optim_state_dict = paddle.load(path + '.pdopt')\n        # work around a resume bug; may be fixed in a later paddle release\n        for key in optimizer.state_dict().keys():\n            if not key in optim_state_dict.keys():\n                optim_state_dict[key] = optimizer.state_dict()[key]\n        if 'last_epoch' in optim_state_dict:\n            last_epoch = optim_state_dict.pop('last_epoch')\n        optimizer.set_state_dict(optim_state_dict)\n\n        if ema_state_dict is not None:\n            ema.resume(ema_state_dict,\n                       optim_state_dict['LR_Scheduler']['last_epoch'])\n    elif ema_state_dict is not None:\n        ema.resume(ema_state_dict)\n    return last_epoch\n\n\ndef match_state_dict(model_state_dict, weight_state_dict, mode='default'):\n    \"\"\"\n    Match between the model state dict and pretrained weight state dict.\n    Return the matched state dict.\n\n    The method assumes that all the names in the pretrained weight state dict are\n    suffixes of the names in the model, once the prefix 'backbone.' in the pretrained\n    weight keys is stripped. This gives the candidates for each model key. Then we\n    select the name with the longest matched size as the final match result. For\n    example, the model state dict has the name\n    'backbone.res2.res2a.branch2a.conv.weight' and the pretrained weight has the\n    names 'res2.res2a.branch2a.conv.weight' and 'branch2a.conv.weight'. 
We\n    match the 'res2.res2a.branch2a.conv.weight' to the model key.\n    \"\"\"\n\n    model_keys = sorted(model_state_dict.keys())\n    weight_keys = sorted(weight_state_dict.keys())\n\n    def teacher_match(a, b):\n        # skip student params\n        if b.startswith('modelStudent'):\n            return False\n        return a == b or a.endswith(\".\" + b) or b.endswith(\".\" + a)\n\n    def student_match(a, b):\n        # skip teacher params\n        if b.startswith('modelTeacher'):\n            return False\n        return a == b or a.endswith(\".\" + b) or b.endswith(\".\" + a)\n\n    def match(a, b):\n        if b.startswith('backbone.res5'):\n            b = b[9:]\n        return a == b or a.endswith(\".\" + b)\n\n    if mode == 'student':\n        match_op = student_match\n    elif mode == 'teacher':\n        match_op = teacher_match\n    else:\n        match_op = match\n\n    match_matrix = np.zeros([len(model_keys), len(weight_keys)])\n    for i, m_k in enumerate(model_keys):\n        for j, w_k in enumerate(weight_keys):\n            if match_op(m_k, w_k):\n                match_matrix[i, j] = len(w_k)\n    max_id = match_matrix.argmax(1)\n    max_len = match_matrix.max(1)\n    max_id[max_len == 0] = -1\n    load_id = set(max_id)\n    load_id.discard(-1)\n    not_load_weight_name = []\n    if weight_keys[0].startswith('modelStudent') or weight_keys[0].startswith(\n            'modelTeacher'):\n        for match_idx in range(len(max_id)):\n            if max_id[match_idx] == -1:\n                not_load_weight_name.append(model_keys[match_idx])\n        if len(not_load_weight_name) > 0:\n            logger.info('{} in model is not matched with pretrained weights, '\n                        'and they will be trained from scratch'.format(\n                            not_load_weight_name))\n\n    else:\n        for idx in range(len(weight_keys)):\n            if idx not in load_id:\n                not_load_weight_name.append(weight_keys[idx])\n\n        if len(not_load_weight_name) > 0:\n            logger.info('{} in pretrained weight is not used in the model, '\n                        'and they will not be loaded'.format(\n                            not_load_weight_name))\n    matched_keys = {}\n    result_state_dict = {}\n    for model_id, weight_id in enumerate(max_id):\n        if weight_id == -1:\n            continue\n        model_key = model_keys[model_id]\n        weight_key = weight_keys[weight_id]\n        weight_value = weight_state_dict[weight_key]\n        model_value_shape = list(model_state_dict[model_key].shape)\n\n        if list(weight_value.shape) != model_value_shape:\n            logger.info(\n                'The shape {} in pretrained weight {} is unmatched with '\n                'the shape {} in model {}. 
And the weight {} will not be '\n                'loaded'.format(weight_value.shape, weight_key,\n                                model_value_shape, model_key, weight_key))\n            continue\n\n        assert model_key not in result_state_dict\n        result_state_dict[model_key] = weight_value\n        if weight_key in matched_keys:\n            raise ValueError('Ambiguous weight {} loaded, it matches at least '\n                             '{} and {} in the model'.format(\n                                 weight_key, model_key, matched_keys[\n                                     weight_key]))\n        matched_keys[weight_key] = model_key\n    return result_state_dict\n\n\ndef load_pretrain_weight(model, pretrain_weight, ARSL_eval=False):\n    if is_url(pretrain_weight):\n        pretrain_weight = get_weights_path(pretrain_weight)\n\n    path = _strip_postfix(pretrain_weight)\n    if not (os.path.isdir(path) or os.path.isfile(path) or\n            os.path.exists(path + '.pdparams')):\n        raise ValueError(\"Model pretrain path `{}` does not exist. \"\n                         \"If you don't want to load pretrain model, \"\n                         \"please delete `pretrain_weights` field in \"\n                         \"config file.\".format(path))\n    teacher_student_flag = False\n    if not ARSL_eval:\n        if hasattr(model, 'modelTeacher') and hasattr(model, 'modelStudent'):\n            print('Loading pretrain weights for Teacher-Student framework.')\n            print(\n                'Assume the Teacher model has the same structure as the Student model.'\n            )\n            model_dict = model.modelStudent.state_dict()\n            teacher_student_flag = True\n        else:\n            model_dict = model.state_dict()\n\n        weights_path = path + '.pdparams'\n        param_state_dict = paddle.load(weights_path)\n        param_state_dict = match_state_dict(model_dict, param_state_dict)\n        for k, v in param_state_dict.items():\n            if isinstance(v, np.ndarray):\n                v = paddle.to_tensor(v)\n            if model_dict[k].dtype != v.dtype:\n                param_state_dict[k] = v.astype(model_dict[k].dtype)\n\n        if teacher_student_flag:\n            model.modelStudent.set_dict(param_state_dict)\n            model.modelTeacher.set_dict(param_state_dict)\n        else:\n            model.set_dict(param_state_dict)\n        logger.info('Finish loading model weights: {}'.format(weights_path))\n\n    else:\n        weights_path = path + '.pdparams'\n        param_state_dict = paddle.load(weights_path)\n        student_model_dict = model.modelStudent.state_dict()\n        student_param_state_dict = match_state_dict(\n            student_model_dict, param_state_dict, mode='student')\n        model.modelStudent.set_dict(student_param_state_dict)\n        print('Loading pretrain weights for Teacher model.')\n        teacher_model_dict = model.modelTeacher.state_dict()\n\n        teacher_param_state_dict = match_state_dict(\n            teacher_model_dict, param_state_dict, mode='teacher')\n        model.modelTeacher.set_dict(teacher_param_state_dict)\n        logger.info('Finish loading model weights: {}'.format(weights_path))\n\n\ndef save_model(model,\n               optimizer,\n               save_dir,\n               save_name,\n               last_epoch,\n               ema_model=None):\n    \"\"\"\n    save the model to disk.\n\n    Args:\n        model (dict): the model state_dict to save parameters.\n        optimizer 
(paddle.optimizer.Optimizer): the Optimizer instance to\n            save optimizer states.\n        save_dir (str): the directory where the checkpoint is saved.\n        save_name (str): the checkpoint file name.\n        last_epoch (int): the epoch index.\n        ema_model (dict|None): the ema_model state_dict to save parameters.\n    \"\"\"\n    if paddle.distributed.get_rank() != 0:\n        return\n        \n    save_dir = os.path.normpath(save_dir)\n    if not os.path.exists(save_dir):\n        os.makedirs(save_dir)\n\n    if save_name == \"best_model\":\n        best_model_path = os.path.join(save_dir, 'best_model')\n        if not os.path.exists(best_model_path):\n            os.makedirs(best_model_path)\n\n    save_path = os.path.join(save_dir, save_name)\n    # save model\n    if isinstance(model, nn.Layer):\n        paddle.save(model.state_dict(), save_path + \".pdparams\")\n        best_model = model.state_dict()\n    else:\n        assert isinstance(model,\n                          dict), 'model is not an instance of nn.Layer or dict'\n        if ema_model is None:\n            paddle.save(model, save_path + \".pdparams\")\n            best_model = model\n        else:\n            assert isinstance(ema_model,\n                              dict), (\"ema_model is not an instance of dict, \"\n                                      \"please call model.state_dict() to get it.\")\n            # Exchange model and ema_model to save\n            paddle.save(ema_model, save_path + \".pdparams\")\n            paddle.save(model, save_path + \".pdema\")\n            best_model = ema_model\n\n    if save_name == 'best_model':\n        best_model_path = os.path.join(best_model_path, 'model')\n        paddle.save(best_model, best_model_path + \".pdparams\")\n    # save optimizer\n    state_dict = optimizer.state_dict()\n    state_dict['last_epoch'] = last_epoch\n    paddle.save(state_dict, save_path + \".pdopt\")\n    logger.info(\"Save checkpoint: {}\".format(save_dir))\n\n\ndef save_semi_model(teacher_model, student_model, optimizer, save_dir,\n                    save_name, last_epoch, last_iter):\n    \"\"\"\n    save the teacher and student models to disk.\n    Args:\n        teacher_model (dict): the teacher_model state_dict to save parameters.\n        student_model (dict): the student_model state_dict to save parameters.\n        optimizer (paddle.optimizer.Optimizer): the Optimizer instance to\n            save optimizer states.\n        save_dir (str): the directory where the checkpoint is saved.\n        save_name (str): the checkpoint file name.\n        last_epoch (int): the epoch index.\n        last_iter (int): the iter index.\n    \"\"\"\n    if paddle.distributed.get_rank() != 0:\n        return\n    assert isinstance(teacher_model, dict), (\n        \"teacher_model is not an instance of dict, \"\n        \"please call teacher_model.state_dict() to get it.\")\n    assert isinstance(student_model, dict), (\n        \"student_model is not an instance of dict, \"\n        \"please call student_model.state_dict() to get it.\")\n    if not os.path.exists(save_dir):\n        os.makedirs(save_dir)\n    save_path = os.path.join(save_dir, save_name)\n    # save model\n    paddle.save(teacher_model, save_path + str(last_epoch) + \"epoch_t.pdparams\")\n    paddle.save(student_model, save_path + str(last_epoch) + \"epoch_s.pdparams\")\n\n    # save optimizer\n    state_dict = optimizer.state_dict()\n    state_dict['last_epoch'] = last_epoch\n    state_dict['last_iter'] = last_iter\n    paddle.save(state_dict, save_path + str(last_epoch) + 
\"epoch.pdopt\")\n    logger.info(\"Save checkpoint: {}\".format(save_dir))\n\ndef save_model_info(model_info, save_path, prefix):\n    \"\"\"\n    save model info to the target path\n    \"\"\"\n    save_path = os.path.join(save_path, prefix)\n    if not os.path.exists(save_path):\n        os.makedirs(save_path)\n    with open(os.path.join(save_path, f'{prefix}.info.json'), 'w') as f:\n        json.dump(model_info, f)\n    logger.info(\"Already save model info in {}\".format(save_path))\n\ndef update_train_results(config,\n                         prefix,\n                         metric_info,\n                         done_flag=False,\n                         last_num=5,\n                         ema=False):\n    if paddle.distributed.get_rank() != 0:\n        return\n    assert last_num >= 1\n    train_results_path = os.path.join(config[\"save_dir\"],\n                                      \"train_result.json\")\n    save_model_tag = [\"pdparams\", \"pdopt\", \"pdstates\"]\n    save_inference_tag = [\n        \"inference_config\", \"pdmodel\", \"pdiparams\", \"pdiparams.info\"\n    ]\n    if ema:\n        save_model_tag.append(\"pdema\")\n    if os.path.exists(train_results_path):\n        with open(train_results_path, \"r\") as fp:\n            train_results = json.load(fp)\n    else:\n        train_results = {}\n        train_results[\"model_name\"] = config[\"pdx_model_name\"]\n        train_results[\"label_dict\"] = \"\"\n        train_results[\"visualdl_log\"] = \"\"\n        train_results[\"train_log\"] = \"train.log\"\n        train_results[\"config\"] = \"config.yaml\"\n        train_results[\"models\"] = {}\n        for i in range(1, last_num + 1):\n            train_results[\"models\"][f\"last_{i}\"] = {}\n        train_results[\"models\"][\"best\"] = {}\n    train_results[\"done_flag\"] = done_flag\n    if prefix == \"best_model\":\n        train_results[\"models\"][\"best\"][\"score\"] = metric_info[\"metric\"]\n        for tag in save_model_tag:\n            train_results[\"models\"][\"best\"][tag] = os.path.join(\n                prefix, f\"{prefix}.{tag}\")\n        for tag in save_inference_tag:\n            train_results[\"models\"][\"best\"][tag] = os.path.join(\n                prefix, \"inference\", f\"inference.{tag}\" if tag != \"inference_config\" else \"inference.yml\")\n    else:\n        for i in range(last_num - 1, 0, -1):\n            train_results[\"models\"][f\"last_{i + 1}\"] = train_results[\"models\"][\n                f\"last_{i}\"].copy()\n        train_results[\"models\"][f\"last_{1}\"][\"score\"] = metric_info[\"metric\"]\n        for tag in save_model_tag:\n            train_results[\"models\"][f\"last_{1}\"][tag] = os.path.join(\n                prefix, f\"{prefix}.{tag}\")\n        for tag in save_inference_tag:\n            train_results[\"models\"][f\"last_{1}\"][tag] = os.path.join(\n                prefix, \"inference\", f\"inference.{tag}\" if tag != \"inference_config\" else \"inference.yml\")\n\n    with open(train_results_path, \"w\") as fp:\n        json.dump(train_results, fp)"
  },
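A toy illustration of the suffix matching performed by match_state_dict; the keys and shapes are invented:

import numpy as np
from ppdet.utils.checkpoint import match_state_dict

# 'res2.conv.weight' is a suffix of 'backbone.res2.conv.weight', so it is
# matched and returned under the model's key name.
model_sd = {'backbone.res2.conv.weight': np.zeros([8, 3, 3, 3])}
weight_sd = {'res2.conv.weight': np.ones([8, 3, 3, 3])}
matched = match_state_dict(model_sd, weight_sd)
assert list(matched) == ['backbone.res2.conv.weight']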
  {
    "path": "ppdet/utils/cli.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom argparse import ArgumentParser, RawDescriptionHelpFormatter\n\nimport yaml\nimport re\nfrom ppdet.core.workspace import get_registered_modules, dump_value\n\n__all__ = ['ColorTTY', 'ArgsParser']\n\n\nclass ColorTTY(object):\n    def __init__(self):\n        super(ColorTTY, self).__init__()\n        self.colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan']\n\n    def __getattr__(self, attr):\n        if attr in self.colors:\n            color = self.colors.index(attr) + 31\n\n            def color_message(message):\n                return \"\u001b[{}m{}\u001b[0m\".format(color, message)\n\n            setattr(self, attr, color_message)\n            return color_message\n\n    def bold(self, message):\n        return self.with_code('01', message)\n\n    def with_code(self, code, message):\n        return \"\u001b[{}m{}\u001b[0m\".format(code, message)\n\n\nclass ArgsParser(ArgumentParser):\n    def __init__(self):\n        super(ArgsParser, self).__init__(\n            formatter_class=RawDescriptionHelpFormatter)\n        self.add_argument(\"-c\", \"--config\", help=\"configuration file to use\")\n        self.add_argument(\n            \"-o\", \"--opt\", nargs='*', help=\"set configuration options\")\n\n    def parse_args(self, argv=None):\n        args = super(ArgsParser, self).parse_args(argv)\n        assert args.config is not None, \\\n            \"Please specify --config=configure_file_path.\"\n        args.opt = self._parse_opt(args.opt)\n        return args\n\n    def _parse_opt(self, opts):\n        config = {}\n        if not opts:\n            return config\n        for s in opts:\n            s = s.strip()\n            k, v = s.split('=', 1)\n            if '.' 
not in k:\n                config[k] = yaml.load(v, Loader=yaml.Loader)\n            else:\n                keys = k.split('.')\n                if keys[0] not in config:\n                    config[keys[0]] = {}\n                cur = config[keys[0]]\n                for idx, key in enumerate(keys[1:]):\n                    if idx == len(keys) - 2:\n                        cur[key] = yaml.load(v, Loader=yaml.Loader)\n                    else:\n                        cur[key] = {}\n                        cur = cur[key]\n        return config\n\n\ndef merge_args(config, args, exclude_args=['config', 'opt', 'slim_config']):\n    for k, v in vars(args).items():\n        if k not in exclude_args:\n            config[k] = v\n    return config\n\n\ndef print_total_cfg(config):\n    modules = get_registered_modules()\n    color_tty = ColorTTY()\n    green = '___{}___'.format(color_tty.colors.index('green') + 31)\n\n    styled = {}\n    for key in config.keys():\n        if not config[key]:  # empty schema\n            continue\n\n        if key not in modules and not hasattr(config[key], '__dict__'):\n            styled[key] = config[key]\n            continue\n        elif key in modules:\n            module = modules[key]\n        else:\n            type_name = type(config[key]).__name__\n            if type_name in modules:\n                module = modules[type_name].copy()\n                module.update({\n                    k: v\n                    for k, v in config[key].__dict__.items()\n                    if k in module.schema\n                })\n                key += \" ({})\".format(type_name)\n            else:\n                # not a registered module type: print the value as-is\n                styled[key] = config[key]\n                continue\n        default = module.find_default_keys()\n        missing = module.find_missing_keys()\n        mismatch = module.find_mismatch_keys()\n        extra = module.find_extra_keys()\n        dep_missing = []\n        for dep in module.inject:\n            if isinstance(module[dep], str) and module[dep] != '<value>':\n                if module[dep] not in modules:  # not a valid module\n                    dep_missing.append(dep)\n                else:\n                    dep_mod = modules[module[dep]]\n                    # empty dict but mandatory\n                    if not dep_mod and dep_mod.mandatory():\n                        dep_missing.append(dep)\n        override = list(\n            set(module.keys()) - set(default) - set(extra) - set(dep_missing))\n        replacement = {}\n        for name in set(override + default + extra + mismatch + missing):\n            new_name = name\n            if name in missing:\n                value = \"<missing>\"\n            else:\n                value = module[name]\n\n            if name in extra:\n                value = dump_value(value) + \" <extraneous>\"\n            elif name in mismatch:\n                value = dump_value(value) + \" <type mismatch>\"\n            elif name in dep_missing:\n                value = dump_value(value) + \" <module config missing>\"\n            elif name in override and value != '<missing>':\n                mark = green\n                new_name = mark + name\n            replacement[new_name] = value\n        styled[key] = replacement\n    buffer = yaml.dump(styled, default_flow_style=False, default_style='')\n    buffer = (re.sub(r\"<missing>\", r\"\u001b[31m<missing>\u001b[0m\", buffer))\n    buffer = (re.sub(r\"<extraneous>\", r\"\u001b[33m<extraneous>\u001b[0m\", buffer))\n    buffer = (re.sub(r\"<type mismatch>\", r\"\u001b[31m<type mismatch>\u001b[0m\", buffer))\n    buffer = (re.sub(r\"<module config missing>\",\n                     r\"\u001b[31m<module config missing>\u001b[0m\", buffer))\n    buffer = re.sub(r\"___(\\d+)___(.*?):\", r\"\u001b[\\1m\\2\u001b[0m:\", buffer)\n    print(buffer)\n"
  },
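A small usage sketch for the `ArgsParser` in ppdet/utils/cli.py above; the config path is illustrative, and the file does not need to exist for option parsing itself:

```python
parser = ArgsParser()
args = parser.parse_args([
    "-c", "configs/yolov3/yolov3_darknet53_270e_coco.yml",
    "-o", "use_gpu=true", "TrainReader.batch_size=8",
])
# Dotted -o keys are expanded into nested dicts:
assert args.opt == {"use_gpu": True, "TrainReader": {"batch_size": 8}}
```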
  {
    "path": "ppdet/utils/colormap.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nfrom __future__ import unicode_literals\n\nimport numpy as np\n\n\ndef colormap(rgb=False):\n    \"\"\"\n    Get colormap\n\n    The code of this function is copied from https://github.com/facebookresearch/Detectron/blob/main/detectron/utils/colormap.py\n    \"\"\"\n    color_list = np.array([\n        0.000, 0.447, 0.741, 0.850, 0.325, 0.098, 0.929, 0.694, 0.125, 0.494,\n        0.184, 0.556, 0.466, 0.674, 0.188, 0.301, 0.745, 0.933, 0.635, 0.078,\n        0.184, 0.300, 0.300, 0.300, 0.600, 0.600, 0.600, 1.000, 0.000, 0.000,\n        1.000, 0.500, 0.000, 0.749, 0.749, 0.000, 0.000, 1.000, 0.000, 0.000,\n        0.000, 1.000, 0.667, 0.000, 1.000, 0.333, 0.333, 0.000, 0.333, 0.667,\n        0.000, 0.333, 1.000, 0.000, 0.667, 0.333, 0.000, 0.667, 0.667, 0.000,\n        0.667, 1.000, 0.000, 1.000, 0.333, 0.000, 1.000, 0.667, 0.000, 1.000,\n        1.000, 0.000, 0.000, 0.333, 0.500, 0.000, 0.667, 0.500, 0.000, 1.000,\n        0.500, 0.333, 0.000, 0.500, 0.333, 0.333, 0.500, 0.333, 0.667, 0.500,\n        0.333, 1.000, 0.500, 0.667, 0.000, 0.500, 0.667, 0.333, 0.500, 0.667,\n        0.667, 0.500, 0.667, 1.000, 0.500, 1.000, 0.000, 0.500, 1.000, 0.333,\n        0.500, 1.000, 0.667, 0.500, 1.000, 1.000, 0.500, 0.000, 0.333, 1.000,\n        0.000, 0.667, 1.000, 0.000, 1.000, 1.000, 0.333, 0.000, 1.000, 0.333,\n        0.333, 1.000, 0.333, 0.667, 1.000, 0.333, 1.000, 1.000, 0.667, 0.000,\n        1.000, 0.667, 0.333, 1.000, 0.667, 0.667, 1.000, 0.667, 1.000, 1.000,\n        1.000, 0.000, 1.000, 1.000, 0.333, 1.000, 1.000, 0.667, 1.000, 0.167,\n        0.000, 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000,\n        0.000, 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000,\n        0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000,\n        0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, 0.000,\n        0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, 0.833,\n        0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.143, 0.143, 0.143, 0.286,\n        0.286, 0.286, 0.429, 0.429, 0.429, 0.571, 0.571, 0.571, 0.714, 0.714,\n        0.714, 0.857, 0.857, 0.857, 1.000, 1.000, 1.000\n    ]).astype(np.float32)\n    color_list = color_list.reshape((-1, 3)) * 255\n    if not rgb:\n        color_list = color_list[:, ::-1]\n    return color_list.astype('int32')\n"
  },
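A quick check of what `colormap` returns; the first row corresponds to the leading 0.000, 0.447, 0.741 triple above, scaled by 255 and truncated to int32:

```python
colors = colormap(rgb=True)   # (N, 3) int32 array of RGB rows
print(colors[0])              # [  0 113 188]
bgr = colormap()              # default flips channels to BGR for OpenCV use
```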
  {
    "path": "ppdet/utils/compact.py",
    "content": "import PIL\n\ndef imagedraw_textsize_c(draw, text, font=None):\n    if int(PIL.__version__.split('.')[0]) < 10:\n        tw, th = draw.textsize(text, font=font)\n    else:\n        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)\n        tw, th = right - left, bottom - top\n\n    return tw, th\n\n"
  },
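Usage sketch for the Pillow compatibility shim in ppdet/utils/compact.py above:

```python
from PIL import Image, ImageDraw

img = Image.new("RGB", (200, 50))
draw = ImageDraw.Draw(img)
# Works on Pillow < 10 (draw.textsize) and >= 10 (draw.textbbox) alike:
tw, th = imagedraw_textsize_c(draw, "hello world")
```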
  {
    "path": "ppdet/utils/download.py",
    "content": "#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport os.path as osp\nimport sys\nimport yaml\nimport time\nimport shutil\nimport requests\nimport tqdm\nimport hashlib\nimport base64\nimport binascii\nimport tarfile\nimport zipfile\nimport errno\n\nfrom paddle.utils.download import _get_unique_endpoints\nfrom ppdet.core.workspace import BASE_KEY\nfrom .logger import setup_logger\nfrom .voc_utils import create_list\n\nlogger = setup_logger(__name__)\n\n__all__ = [\n    'get_weights_path', 'get_dataset_path', 'get_config_path',\n    'download_dataset', 'create_voc_list'\n]\n\nWEIGHTS_HOME = osp.expanduser(\"~/.cache/paddle/weights\")\nDATASET_HOME = osp.expanduser(\"~/.cache/paddle/dataset\")\nCONFIGS_HOME = osp.expanduser(\"~/.cache/paddle/configs\")\n\n# dict of {dataset_name: (download_info, sub_dirs)}\n# download info: [(url, md5sum)]\nDATASETS = {\n    'coco': ([\n        (\n            'http://images.cocodataset.org/zips/train2017.zip',\n            'cced6f7f71b7629ddf16f17bbcfab6b2', ),\n        (\n            'http://images.cocodataset.org/zips/val2017.zip',\n            '442b8da7639aecaf257c1dceb8ba8c80', ),\n        (\n            'http://images.cocodataset.org/annotations/annotations_trainval2017.zip',\n            'f4bbac642086de4f52a3fdda2de5fa2c', ),\n    ], [\"annotations\", \"train2017\", \"val2017\"]),\n    'voc': ([\n        (\n            'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar',\n            '6cd6e144f989b92b3379bac3b3de84fd', ),\n        (\n            'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar',\n            'c52e279531787c972589f7e41ab4ae64', ),\n        (\n            'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar',\n            'b6e924de25625d8de591ea690078ad9f', ),\n        (\n            'https://paddledet.bj.bcebos.com/data/label_list.txt',\n            '5ae5d62183cfb6f6d3ac109359d06a1b', ),\n    ], [\"VOCdevkit/VOC2012\", \"VOCdevkit/VOC2007\"]),\n    'wider_face': ([\n        (\n            'https://dataset.bj.bcebos.com/wider_face/WIDER_train.zip',\n            '3fedf70df600953d25982bcd13d91ba2', ),\n        (\n            'https://dataset.bj.bcebos.com/wider_face/WIDER_val.zip',\n            'dfa7d7e790efa35df3788964cf0bbaea', ),\n        (\n            'https://dataset.bj.bcebos.com/wider_face/wider_face_split.zip',\n            'a4a898d6193db4b9ef3260a68bad0dc7', ),\n    ], [\"WIDER_train\", \"WIDER_val\", \"wider_face_split\"]),\n    'fruit': ([(\n        'https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit.tar',\n        'baa8806617a54ccf3685fa7153388ae6', ), ],\n              ['Annotations', 'JPEGImages']),\n    'roadsign_voc': ([(\n        'https://paddlemodels.bj.bcebos.com/object_detection/roadsign_voc.tar',\n        
'8d629c0f880dd8b48de9aeff44bf1f3e', ), ], ['annotations', 'images']),\n    'roadsign_coco': ([(\n        'https://paddlemodels.bj.bcebos.com/object_detection/roadsign_coco.tar',\n        '49ce5a9b5ad0d6266163cd01de4b018e', ), ], ['annotations', 'images']),\n    'spine_coco': ([(\n        'https://paddledet.bj.bcebos.com/data/spine.tar',\n        '8a3a353c2c54a2284ad7d2780b65f6a6', ), ], ['annotations', 'images']),\n    'coco_ce': ([(\n        'https://paddledet.bj.bcebos.com/data/coco_ce.tar',\n        'eadd1b79bc2f069f2744b1dd4e0c0329', ), ], []),\n    'culane': ([('https://bj.bcebos.com/v1/paddledet/data/culane.tar', None, ), ], [])\n}\n\nDOWNLOAD_DATASETS_LIST = DATASETS.keys()\n\nDOWNLOAD_RETRY_LIMIT = 3\n\nPPDET_WEIGHTS_DOWNLOAD_URL_PREFIX = 'https://paddledet.bj.bcebos.com/'\n\n\n# When running unit tests, there could be multiple processes\n# trying to create the DATA_HOME directory simultaneously, so we cannot\n# use an if condition to check for the existence of the directory;\n# instead, we use the filesystem as the synchronization mechanism by\n# catching returned errors.\ndef must_mkdirs(path):\n    try:\n        os.makedirs(path)\n    except OSError as exc:\n        if exc.errno != errno.EEXIST:\n            raise\n        pass\n\n\ndef parse_url(url):\n    url = url.replace(\"ppdet://\", PPDET_WEIGHTS_DOWNLOAD_URL_PREFIX)\n    return url\n\n\ndef get_weights_path(url):\n    \"\"\"Get weights path from WEIGHTS_HOME, if not exists,\n    download it from url.\n    \"\"\"\n    url = parse_url(url)\n    path, _ = get_path(url, WEIGHTS_HOME)\n    return path\n\n\ndef get_config_path(url):\n    \"\"\"Get config path from CONFIGS_HOME, if not exists,\n    download it from url.\n    \"\"\"\n    url = parse_url(url)\n    path = map_path(url, CONFIGS_HOME, path_depth=2)\n    if os.path.isfile(path):\n        return path\n\n    # config file not found, try downloading\n    # 1. clear configs directory\n    if osp.isdir(CONFIGS_HOME):\n        shutil.rmtree(CONFIGS_HOME)\n\n    # 2. get url\n    try:\n        from ppdet import __version__ as version\n    except ImportError:\n        version = None\n\n    cfg_url = \"ppdet://configs/{}/configs.tar\".format(version) \\\n                if version else \"ppdet://configs/configs.tar\"\n    cfg_url = parse_url(cfg_url)\n\n    # 3. download and decompress\n    cfg_fullname = _download_dist(cfg_url, osp.dirname(CONFIGS_HOME))\n    _decompress_dist(cfg_fullname)\n\n    # 4. 
check that the config file now exists\n    if os.path.isfile(path):\n        return path\n    else:\n        logger.error(\"Get config {} failed after download, please contact us on \" \\\n            \"https://github.com/PaddlePaddle/PaddleDetection/issues\".format(path))\n        sys.exit(1)\n\n\ndef get_dataset_path(path, annotation, image_dir):\n    \"\"\"\n    If path exists, return path.\n    Otherwise, get dataset path from DATASET_HOME, if not exists,\n    download it.\n    \"\"\"\n    if _dataset_exists(path, annotation, image_dir):\n        return path\n\n    data_name = os.path.split(path.strip().lower())[-1]\n    if data_name not in DOWNLOAD_DATASETS_LIST:\n        raise ValueError(\n            \"Dataset {} is not valid for the reason above, please check again.\".\n            format(osp.realpath(path)))\n    else:\n        logger.warning(\n            \"Dataset {} is not valid for the reason above, try searching {} or \"\n            \"downloading dataset...\".format(osp.realpath(path), DATASET_HOME))\n\n    for name, dataset in DATASETS.items():\n        if data_name == name:\n            logger.debug(\"Parse dataset_dir {} as dataset \"\n                         \"{}\".format(path, name))\n            data_dir = osp.join(DATASET_HOME, name)\n\n            if name == \"spine_coco\":\n                if _dataset_exists(data_dir, annotation, image_dir):\n                    return data_dir\n\n            # For voc, only check dir VOCdevkit/VOC2012, VOCdevkit/VOC2007\n            if name in ['voc', 'fruit', 'roadsign_voc']:\n                exists = True\n                for sub_dir in dataset[1]:\n                    check_dir = osp.join(data_dir, sub_dir)\n                    if osp.exists(check_dir):\n                        logger.info(\"Found {}\".format(check_dir))\n                    else:\n                        exists = False\n                if exists:\n                    return data_dir\n\n            # voc existence is checked above; voc does not exist here\n            check_exist = name != 'voc' and name != 'fruit' and name != 'roadsign_voc'\n            for url, md5sum in dataset[0]:\n                get_path(url, data_dir, md5sum, check_exist)\n\n            # voc should create the file list after download\n            if name == 'voc':\n                create_voc_list(data_dir)\n            return data_dir\n\n    raise ValueError(\"Dataset automatic downloading failed.\")\n\n\ndef create_voc_list(data_dir, devkit_subdir='VOCdevkit'):\n    logger.debug(\"Create voc file list...\")\n    devkit_dir = osp.join(data_dir, devkit_subdir)\n    years = ['2007', '2012']\n\n    # NOTE: since using auto download VOC\n    # dataset, VOC default label list should be used,\n    # do not generate label_list.txt here. 
For the default\n    # label list, see ../data/source/voc.py\n    create_list(devkit_dir, years, data_dir)\n    logger.debug(\"Create voc file list finished\")\n\n\ndef map_path(url, root_dir, path_depth=1):\n    # parse path after download to decompress under root_dir\n    assert path_depth > 0, \"path_depth should be a positive integer\"\n    dirname = url\n    for _ in range(path_depth):\n        dirname = osp.dirname(dirname)\n    fpath = osp.relpath(url, dirname)\n\n    zip_formats = ['.zip', '.tar', '.gz']\n    for zip_format in zip_formats:\n        fpath = fpath.replace(zip_format, '')\n    return osp.join(root_dir, fpath)\n\n\ndef get_path(url, root_dir, md5sum=None, check_exist=True):\n    \"\"\" Download from given url to root_dir.\n    If the file or directory specified by url exists under\n    root_dir, return the path directly; otherwise download\n    from url, decompress it and return the path.\n\n    url (str): download url\n    root_dir (str): root dir for downloading, it should be\n                    WEIGHTS_HOME or DATASET_HOME\n    md5sum (str): md5 sum of download package\n    \"\"\"\n    # parse path after download to decompress under root_dir\n    fullpath = map_path(url, root_dir)\n\n    # For some zip files, the decompressed directory name differs\n    # from the zip file name; rename it using the following map\n    decompress_name_map = {\n        \"VOCtrainval_11-May-2012\": \"VOCdevkit/VOC2012\",\n        \"VOCtrainval_06-Nov-2007\": \"VOCdevkit/VOC2007\",\n        \"VOCtest_06-Nov-2007\": \"VOCdevkit/VOC2007\",\n        \"annotations_trainval\": \"annotations\"\n    }\n    for k, v in decompress_name_map.items():\n        if fullpath.find(k) >= 0:\n            fullpath = osp.join(osp.split(fullpath)[0], v)\n\n    if osp.exists(fullpath) and check_exist:\n        if not osp.isfile(fullpath) or \\\n                _check_exist_file_md5(fullpath, md5sum, url):\n            logger.debug(\"Found {}\".format(fullpath))\n            return fullpath, True\n        else:\n            os.remove(fullpath)\n\n    fullname = _download_dist(url, root_dir, md5sum)\n\n    # the new weights format with postfix 'pdparams' does not\n    # need to be decompressed\n    if osp.splitext(fullname)[-1] not in ['.pdparams', '.yml', '.ttf']:\n        _decompress_dist(fullname)\n\n    return fullpath, False\n\n\ndef download_dataset(path, dataset=None):\n    if dataset not in DATASETS.keys():\n        logger.error(\"Unknown dataset {}, it should be \"\n                     \"{}\".format(dataset, DATASETS.keys()))\n        return\n    dataset_info = DATASETS[dataset][0]\n    for info in dataset_info:\n        get_path(info[0], path, info[1], False)\n    logger.debug(\"Download dataset {} finished.\".format(dataset))\n\n\ndef _dataset_exists(path, annotation, image_dir):\n    \"\"\"\n    Check if the user-defined dataset exists\n    \"\"\"\n    if not osp.exists(path):\n        logger.warning(\"Config dataset_dir {} does not exist, \"\n                       \"dataset config is not valid\".format(path))\n        return False\n\n    if annotation:\n        annotation_path = osp.join(path, annotation)\n        if not osp.isfile(annotation_path):\n            logger.warning(\"Config annotation {} is not a \"\n                           \"file, dataset config is not \"\n                           \"valid\".format(annotation_path))\n            return False\n    if image_dir:\n        image_path = osp.join(path, image_dir)\n        if not osp.isdir(image_path):\n            logger.warning(\"Config image_dir {} is not a \"\n               
            \"directory, dataset config is not \"\n                           \"valid\".format(image_path))\n            return False\n    return True\n\n\ndef _download(url, path, md5sum=None):\n    \"\"\"\n    Download from url, save to path.\n\n    url (str): download url\n    path (str): download to given path\n    \"\"\"\n    must_mkdirs(path)\n\n    fname = osp.split(url)[-1]\n    fullname = osp.join(path, fname)\n    retry_cnt = 0\n\n    while not (osp.exists(fullname) and _check_exist_file_md5(fullname, md5sum,\n                                                              url)):\n        if retry_cnt < DOWNLOAD_RETRY_LIMIT:\n            retry_cnt += 1\n        else:\n            raise RuntimeError(\"Download from {} failed. \"\n                               \"Retry limit reached\".format(url))\n\n        logger.info(\"Downloading {} from {}\".format(fname, url))\n\n        # NOTE: windows path join may incur \\, which is invalid in url\n        if sys.platform == \"win32\":\n            url = url.replace('\\\\', '/')\n\n        req = requests.get(url, stream=True)\n        if req.status_code != 200:\n            raise RuntimeError(\"Downloading from {} failed with code \"\n                               \"{}!\".format(url, req.status_code))\n\n        # To protect against interrupted downloads, download to\n        # tmp_fullname first and move tmp_fullname to fullname\n        # after the download finishes\n        tmp_fullname = fullname + \"_tmp\"\n        total_size = req.headers.get('content-length')\n        with open(tmp_fullname, 'wb') as f:\n            if total_size:\n                for chunk in tqdm.tqdm(\n                        req.iter_content(chunk_size=1024),\n                        total=(int(total_size) + 1023) // 1024,\n                        unit='KB'):\n                    f.write(chunk)\n            else:\n                for chunk in req.iter_content(chunk_size=1024):\n                    if chunk:\n                        f.write(chunk)\n        shutil.move(tmp_fullname, fullname)\n    return fullname\n\n\ndef _download_dist(url, path, md5sum=None):\n    env = os.environ\n    if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env:\n        # Avoid duplicate downloads when training on multiple machines:\n        # each node downloads the data, and within a node the data is\n        # downloaded only once.\n        # Reference https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/utils/download.py#L108\n        rank_id_curr_node = int(os.environ.get(\"PADDLE_RANK_IN_NODE\", 0))\n        num_trainers = int(env['PADDLE_TRAINERS_NUM'])\n        if num_trainers <= 1:\n            return _download(url, path, md5sum)\n        else:\n            fname = osp.split(url)[-1]\n            fullname = osp.join(path, fname)\n            lock_path = fullname + '.download.lock'\n\n            must_mkdirs(path)\n\n            if not osp.exists(fullname):\n                with open(lock_path, 'w'):  # touch\n                    os.utime(lock_path, None)\n                if rank_id_curr_node == 0:\n                    _download(url, path, md5sum)\n                    os.remove(lock_path)\n                else:\n                    while os.path.exists(lock_path):\n                        time.sleep(0.5)\n            return fullname\n    else:\n        return _download(url, path, md5sum)\n\n\ndef _check_exist_file_md5(filename, md5sum, url):\n    # if md5sum is None and the file to check is a weights file,\n    # 
read the md5sum from the url and check it; otherwise check md5sum directly\n    return _md5check_from_url(filename, url) if md5sum is None \\\n            and filename.endswith('pdparams') \\\n            else _md5check(filename, md5sum)\n\n\ndef _md5check_from_url(filename, url):\n    # For weights in bcebos URLs, MD5 value is contained\n    # in request header as 'content_md5'\n    req = requests.get(url, stream=True)\n    content_md5 = req.headers.get('content-md5')\n    req.close()\n    if not content_md5 or _md5check(\n            filename,\n            binascii.hexlify(base64.b64decode(content_md5.strip('\"'))).decode(\n            )):\n        return True\n    else:\n        return False\n\n\ndef _md5check(fullname, md5sum=None):\n    if md5sum is None:\n        return True\n\n    logger.debug(\"File {} md5 checking...\".format(fullname))\n    md5 = hashlib.md5()\n    with open(fullname, 'rb') as f:\n        for chunk in iter(lambda: f.read(4096), b\"\"):\n            md5.update(chunk)\n    calc_md5sum = md5.hexdigest()\n\n    if calc_md5sum != md5sum:\n        logger.warning(\"File {} md5 check failed, {}(calc) != \"\n                       \"{}(base)\".format(fullname, calc_md5sum, md5sum))\n        return False\n    return True\n\n\ndef _decompress(fname):\n    \"\"\"\n    Decompress for zip and tar file\n    \"\"\"\n    logger.info(\"Decompressing {}...\".format(fname))\n\n    # To protect against interrupted decompression, decompress\n    # to the fpath_tmp directory first; if decompression\n    # succeeds, move the decompressed files to fpath, delete\n    # fpath_tmp and remove the downloaded compressed file.\n    fpath = osp.split(fname)[0]\n    fpath_tmp = osp.join(fpath, 'tmp')\n    if osp.isdir(fpath_tmp):\n        shutil.rmtree(fpath_tmp)\n    # always (re)create the temporary directory before extraction\n    os.makedirs(fpath_tmp)\n\n    if fname.find('tar') >= 0:\n        with tarfile.open(fname) as tf:\n            tf.extractall(path=fpath_tmp)\n    elif fname.find('zip') >= 0:\n        with zipfile.ZipFile(fname) as zf:\n            zf.extractall(path=fpath_tmp)\n    elif fname.find('.txt') >= 0:\n        return\n    else:\n        raise TypeError(\"Unsupported compress file type {}\".format(fname))\n\n    for f in os.listdir(fpath_tmp):\n        src_dir = osp.join(fpath_tmp, f)\n        dst_dir = osp.join(fpath, f)\n        _move_and_merge_tree(src_dir, dst_dir)\n\n    shutil.rmtree(fpath_tmp)\n    os.remove(fname)\n\n\ndef _decompress_dist(fname):\n    env = os.environ\n    if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env:\n        trainer_id = int(env['PADDLE_TRAINER_ID'])\n        num_trainers = int(env['PADDLE_TRAINERS_NUM'])\n        if num_trainers <= 1:\n            _decompress(fname)\n        else:\n            lock_path = fname + '.decompress.lock'\n            from paddle.distributed import ParallelEnv\n            unique_endpoints = _get_unique_endpoints(ParallelEnv()\n                                                     .trainer_endpoints[:])\n            # NOTE(dkp): _decompress_dist is always performed after\n            # _download_dist; in _download_dist, sub-trainers wait for the\n            # download lock file to be released by sleeping. If decompression\n            # is very fast and finishes within the sleeping gap (e.g. for\n            # tiny datasets such as coco_ce and spine_coco), the main trainer\n            # may finish decompressing and release the lock file before the\n            # sub-trainers check it, so we only create the lock file in the\n            # main trainer, and all sub-trainers wait 1s for the main trainer\n            # to create the lock file, since 1s is\n            # twice the 
sleeping gap; this waiting time keeps all\n            # trainer pipelines in order\n            # **change this if you have more elegant methods**\n            if ParallelEnv().current_endpoint in unique_endpoints:\n                with open(lock_path, 'w'):  # touch\n                    os.utime(lock_path, None)\n                _decompress(fname)\n                os.remove(lock_path)\n            else:\n                time.sleep(1)\n                while os.path.exists(lock_path):\n                    time.sleep(0.5)\n    else:\n        _decompress(fname)\n\n\ndef _move_and_merge_tree(src, dst):\n    \"\"\"\n    Move src directory to dst; if dst already exists,\n    merge src into dst\n    \"\"\"\n    if not osp.exists(dst):\n        shutil.move(src, dst)\n    elif osp.isfile(src):\n        shutil.move(src, dst)\n    else:\n        for fp in os.listdir(src):\n            src_fp = osp.join(src, fp)\n            dst_fp = osp.join(dst, fp)\n            if osp.isdir(src_fp):\n                if osp.isdir(dst_fp):\n                    _move_and_merge_tree(src_fp, dst_fp)\n                else:\n                    shutil.move(src_fp, dst_fp)\n            elif osp.isfile(src_fp) and \\\n                    not osp.isfile(dst_fp):\n                shutil.move(src_fp, dst_fp)\n"
  },
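A sketch of the weights-download flow in ppdet/utils/download.py above; the `.pdparams` URL follows the `ppdet://` convention but is shown for illustration only:

```python
# "ppdet://" is rewritten to the BOS prefix by parse_url, so this resolves to
# https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams
path = get_weights_path("ppdet://models/yolov3_darknet53_270e_coco.pdparams")
# Cached under ~/.cache/paddle/weights; later calls return the cached path
# after an MD5 check against the content-md5 response header.
```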
  {
    "path": "ppdet/utils/fuse_utils.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport copy\nimport paddle\nimport paddle.nn as nn\n\n__all__ = ['fuse_conv_bn']\n\n\ndef fuse_conv_bn(model):\n    is_train = False\n    if model.training:\n        model.eval()\n        is_train = True\n    fuse_list = []\n    tmp_pair = [None, None]\n    for name, layer in model.named_sublayers():\n        if isinstance(layer, nn.Conv2D):\n            tmp_pair[0] = name\n        if isinstance(layer, nn.BatchNorm2D):\n            tmp_pair[1] = name\n\n        if tmp_pair[0] and tmp_pair[1] and len(tmp_pair) == 2:\n            fuse_list.append(tmp_pair)\n            tmp_pair = [None, None]\n    model = fuse_layers(model, fuse_list)\n    if is_train:\n        model.train()\n    return model\n\n\ndef find_parent_layer_and_sub_name(model, name):\n    \"\"\"\n    Given the model and the name of a layer, find the parent layer and\n    the sub_name of the layer.\n    For example, if name is 'block_1/convbn_1/conv_1', the parent layer is\n    'block_1/convbn_1' and the sub_name is `conv_1`.\n    Args:\n        model(paddle.nn.Layer): the model to be quantized.\n        name(string): the name of a layer\n\n    Returns:\n        parent_layer, subname\n    \"\"\"\n    assert isinstance(model, nn.Layer), \\\n            \"The model must be the instance of paddle.nn.Layer.\"\n    assert len(name) > 0, \"The input (name) should not be empty.\"\n\n    last_idx = 0\n    idx = 0\n    parent_layer = model\n    while idx < len(name):\n        if name[idx] == '.':\n            sub_name = name[last_idx:idx]\n            if hasattr(parent_layer, sub_name):\n                parent_layer = getattr(parent_layer, sub_name)\n                last_idx = idx + 1\n        idx += 1\n    sub_name = name[last_idx:idx]\n    return parent_layer, sub_name\n\n\nclass Identity(nn.Layer):\n    '''a layer to replace bn or relu layers'''\n\n    def __init__(self, *args, **kwargs):\n        super(Identity, self).__init__()\n\n    def forward(self, input):\n        return input\n\n\ndef fuse_layers(model, layers_to_fuse, inplace=False):\n    '''\n       fuse layers in layers_to_fuse\n\n       Args:\n           model(nn.Layer): The model to be fused.\n           layers_to_fuse(list): The layers' names to be fused. 
For\n               example, [[\"conv1\", \"bn1\"], [\"conv2\", \"bn2\"]].\n           inplace(bool): Whether to apply fusing to the input model.\n                          Default: False.\n\n       Return\n           fused_model(paddle.nn.Layer): The fused model.\n    '''\n    if not inplace:\n        model = copy.deepcopy(model)\n    for layers_list in layers_to_fuse:\n        layer_list = []\n        for layer_name in layers_list:\n            parent_layer, sub_name = find_parent_layer_and_sub_name(model,\n                                                                    layer_name)\n            layer_list.append(getattr(parent_layer, sub_name))\n        new_layers = _fuse_func(layer_list)\n        for i, item in enumerate(layers_list):\n            parent_layer, sub_name = find_parent_layer_and_sub_name(model, item)\n            setattr(parent_layer, sub_name, new_layers[i])\n    return model\n\n\ndef _fuse_func(layer_list):\n    '''choose the fuser method and fuse layers'''\n    types = tuple(type(m) for m in layer_list)\n    fusion_method = types_to_fusion_method.get(types, None)\n    assert fusion_method is not None, \\\n        'no fusion method registered for layer types: {}'.format(types)\n    new_layers = [None] * len(layer_list)\n    fused_layer = fusion_method(*layer_list)\n    for handle_id, pre_hook_fn in layer_list[0]._forward_pre_hooks.items():\n        fused_layer.register_forward_pre_hook(pre_hook_fn)\n        del layer_list[0]._forward_pre_hooks[handle_id]\n    for handle_id, hook_fn in layer_list[-1]._forward_post_hooks.items():\n        fused_layer.register_forward_post_hook(hook_fn)\n        del layer_list[-1]._forward_post_hooks[handle_id]\n    new_layers[0] = fused_layer\n    for i in range(1, len(layer_list)):\n        identity = Identity()\n        identity.training = layer_list[0].training\n        new_layers[i] = identity\n    return new_layers\n\n\ndef _fuse_conv_bn(conv, bn):\n    '''fuse conv and bn for train or eval'''\n    assert(conv.training == bn.training),\\\n        \"Conv and BN both must be in the same mode (train or eval).\"\n    if conv.training:\n        assert bn._num_features == conv._out_channels, 'Output channel of Conv2d must match num_features of BatchNorm2d'\n        raise NotImplementedError\n    else:\n        return _fuse_conv_bn_eval(conv, bn)\n\n\ndef _fuse_conv_bn_eval(conv, bn):\n    '''fuse conv and bn for eval'''\n    assert (not (conv.training or bn.training)), \"Fusion only for eval!\"\n    fused_conv = copy.deepcopy(conv)\n\n    fused_weight, fused_bias = _fuse_conv_bn_weights(\n        fused_conv.weight, fused_conv.bias, bn._mean, bn._variance, bn._epsilon,\n        bn.weight, bn.bias)\n    fused_conv.weight.set_value(fused_weight)\n    if fused_conv.bias is None:\n        fused_conv.bias = paddle.create_parameter(\n            shape=[fused_conv._out_channels], is_bias=True, dtype=bn.bias.dtype)\n    fused_conv.bias.set_value(fused_bias)\n    return fused_conv\n\n\ndef _fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b):\n    '''fuse weights and bias of conv and bn'''\n    if conv_b is None:\n        conv_b = paddle.zeros_like(bn_rm)\n    if bn_w is None:\n        bn_w = paddle.ones_like(bn_rm)\n    if bn_b is None:\n        bn_b = paddle.zeros_like(bn_rm)\n    bn_var_rsqrt = paddle.rsqrt(bn_rv + bn_eps)\n    conv_w = conv_w * \\\n        (bn_w * bn_var_rsqrt).reshape([-1] + [1] * (len(conv_w.shape) - 1))\n    conv_b = (conv_b - bn_rm) * 
bn_var_rsqrt * bn_w + bn_b\n    return conv_w, conv_b\n\n\ntypes_to_fusion_method = {(nn.Conv2D, nn.BatchNorm2D): _fuse_conv_bn, }\n"
  },
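A minimal end-to-end sketch of `fuse_conv_bn` above on a toy eval-mode model (fusion in train mode raises NotImplementedError, as shown in the file):

```python
import paddle
import paddle.nn as nn

model = nn.Sequential(nn.Conv2D(3, 8, 3), nn.BatchNorm2D(8), nn.ReLU())
model.eval()                 # fusion is only implemented for eval mode
fused = fuse_conv_bn(model)  # BatchNorm2D is replaced by Identity in a copy

x = paddle.rand([1, 3, 32, 32])
# The fused copy matches the original within floating-point tolerance:
assert float(paddle.abs(model(x) - fused(x)).max()) < 1e-5
```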
  {
    "path": "ppdet/utils/logger.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport logging\nimport os\nimport sys\n\nimport paddle.distributed as dist\n\n__all__ = ['setup_logger']\n\nlogger_initialized = []\n\n\ndef setup_logger(name=\"ppdet\", output=None, log_ranks=\"0\"):\n    \"\"\"\n    Initialize logger and set its verbosity level to INFO.\n    Args:\n        output (str): a file name or a directory to save log. If None, will not save log file.\n            If ends with \".txt\" or \".log\", assumed to be a file name.\n            Otherwise, logs will be saved to `output/log.txt`.\n        name (str): the root module name of this logger\n        log_ranks (str|int): comma-separated ids of the ranks that log to stdout; \"0\" by default.\n\n    Returns:\n        logging.Logger: a logger\n    \"\"\"\n    logger = logging.getLogger(name)\n    if name in logger_initialized:\n        return logger\n\n    logger.setLevel(logging.INFO)\n    logger.propagate = False\n\n    formatter = logging.Formatter(\n        \"[%(asctime)s] %(name)s %(levelname)s: %(message)s\",\n        datefmt=\"%m/%d %H:%M:%S\")\n\n    if isinstance(log_ranks, str):\n        log_ranks = [int(i) for i in log_ranks.split(',')]\n    elif isinstance(log_ranks, int):\n        log_ranks = [log_ranks]\n\n    # stdout logging: master only\n    local_rank = dist.get_rank()\n    if local_rank in log_ranks:\n        ch = logging.StreamHandler(stream=sys.stdout)\n        ch.setLevel(logging.DEBUG)\n        ch.setFormatter(formatter)\n        logger.addHandler(ch)\n\n    # file logging: all workers\n    if output is not None:\n        if output.endswith(\".txt\") or output.endswith(\".log\"):\n            filename = output\n        else:\n            filename = os.path.join(output, \"log.txt\")\n        if local_rank > 0:\n            filename = filename + \".rank{}\".format(local_rank)\n        # create the log directory if needed; tolerate it already existing\n        dirname = os.path.dirname(filename)\n        if dirname:\n            os.makedirs(dirname, exist_ok=True)\n        fh = logging.FileHandler(filename, mode='a')\n        fh.setLevel(logging.DEBUG)\n        # reuse the formatter so file logs keep timestamps\n        fh.setFormatter(formatter)\n        logger.addHandler(fh)\n\n    logger_initialized.append(name)\n    return logger\n"
  },
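Typical wiring of `setup_logger` above, assuming a writable `output/` location:

```python
logger = setup_logger("ppdet.demo", output="output")  # file: output/log.txt
logger.info("training started")
# stdout shows "[mm/dd HH:MM:SS] ppdet.demo INFO: training started" on rank 0;
# every rank also appends to its own log file (suffixed .rankN for rank > 0).
```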
  {
    "path": "ppdet/utils/profiler.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport sys\nimport paddle\nimport paddle.profiler as profiler\n\n# A global variable to record the number of calling times for profiler\n# functions. It is used to specify the tracing range of training steps.\n_profiler_step_id = 0\n\n# A global variable to avoid parsing from string every time.\n_profiler_options = None\n_prof = None\n\nclass ProfilerOptions(object):\n    '''\n    Use a string to initialize a ProfilerOptions.\n    The string should be in the format: \"key1=value1;key2=value;key3=value3\".\n    For example:\n      \"profile_path=model.profile\"\n      \"batch_range=[50, 60]; profile_path=model.profile\"\n      \"batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile\"\n\n    ProfilerOptions supports following key-value pair:\n      batch_range      - an integer list, e.g. [100, 110].\n      state            - a string, the optional values are 'CPU', 'GPU' or 'All'.\n      sorted_key       - a string, the optional values are 'calls', 'total',\n                         'max', 'min' or 'ave'.\n      tracer_option    - a string, the optional values are 'Default', 'OpDetail',\n                         'AllOpDetail'.\n      profile_path     - a string, the path to save the serialized profile data,\n                         which can be used to generate a timeline.\n      exit_on_finished - a boolean.\n    '''\n\n    def __init__(self, options_str):\n        assert isinstance(options_str, str)\n\n        self._options = {\n            'batch_range': [10, 20],\n            'state': 'All',\n            'sorted_key': 'total',\n            'tracer_option': 'Default',\n            'profile_path': '/tmp/profile',\n            'exit_on_finished': True,\n            'timer_only': True\n        }\n        self._parse_from_string(options_str)\n\n    def _parse_from_string(self, options_str):\n        for kv in options_str.replace(' ', '').split(';'):\n            key, value = kv.split('=')\n            if key == 'batch_range':\n                value_list = value.replace('[', '').replace(']', '').split(',')\n                value_list = list(map(int, value_list))\n                if len(value_list) >= 2 and value_list[0] >= 0 and value_list[\n                        1] > value_list[0]:\n                    self._options[key] = value_list\n            elif key == 'exit_on_finished':\n                self._options[key] = value.lower() in (\"yes\", \"true\", \"t\", \"1\")\n            elif key in [\n                    'state', 'sorted_key', 'tracer_option', 'profile_path'\n            ]:\n                self._options[key] = value\n            elif key == 'timer_only':\n                self._options[key] = value\n\n    def __getitem__(self, name):\n        if self._options.get(name, None) is None:\n            raise ValueError(\n                \"ProfilerOptions does not have an option named %s.\" % name)\n        return 
self._options[name]\n\n\ndef add_profiler_step(options_str=None):\n    '''\n    Enable the operator-level timing using PaddlePaddle's profiler.\n    The profiler uses an independent variable to count the profiler steps.\n    One call of this function is treated as a profiler step.\n    Args:\n      options_str - a string to initialize the ProfilerOptions.\n                    Default is None, and the profiler is disabled.\n    '''\n    if options_str is None:\n        return\n\n    global _prof \n    global _profiler_step_id\n    global _profiler_options\n\n    if _profiler_options is None:\n        _profiler_options = ProfilerOptions(options_str)\n    # profile : https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/profiling_model.html#chakanxingnengshujudetongjibiaodan\n    # timer_only = True: only the model's throughput and time overhead are displayed\n    # timer_only = False: calling summary() prints statistics that present performance data from different perspectives\n    # timer_only = False: the exported timeline can be found in the profiler_log directory\n    if _prof is None:\n        _timer_only = str(_profiler_options['timer_only']) == str(True)\n        _prof = profiler.Profiler(\n                   scheduler = (_profiler_options['batch_range'][0], _profiler_options['batch_range'][1]),\n                   on_trace_ready = profiler.export_chrome_tracing('./profiler_log'),\n                   timer_only = _timer_only)\n        _prof.start()\n    else:\n        _prof.step()\n        \n    if _profiler_step_id == _profiler_options['batch_range'][1]:\n        _prof.stop()\n        _prof.summary(\n             op_detail=True,\n             thread_sep=False,\n             time_unit='ms')\n        _prof = None\n        if _profiler_options['exit_on_finished']:\n            sys.exit(0)\n\n    _profiler_step_id += 1\n"
  },
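A hypothetical training-loop hookup for `add_profiler_step` above; `range(100)` stands in for the real dataloader loop:

```python
opts = "batch_range=[10, 20]; timer_only=false; exit_on_finished=true"
for step in range(100):
    add_profiler_step(opts)   # starts the profiler lazily, then steps it;
                              # stops, prints a summary and exits around step 20
    # ... forward / backward / optimizer.step() would go here ...
```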
  {
    "path": "ppdet/utils/stats.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport collections\nimport numpy as np\n\n__all__ = ['SmoothedValue', 'TrainingStats']\n\n\nclass SmoothedValue(object):\n    \"\"\"Track a series of values and provide access to smoothed values over a\n    window or the global series average.\n    \"\"\"\n\n    def __init__(self, window_size=20, fmt=None):\n        if fmt is None:\n            fmt = \"{median:.4f} ({avg:.4f})\"\n        self.deque = collections.deque(maxlen=window_size)\n        self.fmt = fmt\n        self.total = 0.\n        self.count = 0\n\n    def update(self, value, n=1):\n        self.deque.append(value)\n        self.count += n\n        self.total += value * n\n\n    @property\n    def median(self):\n        return np.median(self.deque)\n\n    @property\n    def avg(self):\n        return np.mean(self.deque)\n\n    @property\n    def max(self):\n        return np.max(self.deque)\n\n    @property\n    def value(self):\n        return self.deque[-1]\n\n    @property\n    def global_avg(self):\n        return self.total / self.count\n\n    def __str__(self):\n        return self.fmt.format(\n            median=self.median, avg=self.avg, max=self.max, value=self.value)\n\n\nclass TrainingStats(object):\n    def __init__(self, window_size, delimiter=' '):\n        self.meters = None\n        self.window_size = window_size\n        self.delimiter = delimiter\n\n    def update(self, stats):\n        if self.meters is None:\n            self.meters = {\n                k: SmoothedValue(self.window_size)\n                for k in stats.keys()\n            }\n        for k, v in self.meters.items():\n            v.update(float(stats[k]))\n\n    def get(self, extras=None):\n        stats = collections.OrderedDict()\n        if extras:\n            for k, v in extras.items():\n                stats[k] = v\n        for k, v in self.meters.items():\n            stats[k] = format(v.median, '.6f')\n\n        return stats\n\n    def log(self, extras=None):\n        d = self.get(extras)\n        strs = []\n        for k, v in d.items():\n            strs.append(\"{}: {}\".format(k, str(v)))\n        return self.delimiter.join(strs)\n"
  },
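A self-contained example of the smoothing behaviour above; medians are taken over the last `window_size` updates:

```python
stats = TrainingStats(window_size=20)
for it in range(5):
    stats.update({"loss": 2.0 - 0.1 * it, "lr": 0.01})
print(stats.log(extras={"epoch": 1}))
# -> epoch: 1 loss: 1.800000 lr: 0.010000
```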
  {
    "path": "ppdet/utils/visualizer.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nfrom __future__ import unicode_literals\n\nimport os\nimport numpy as np\nfrom PIL import Image, ImageDraw, ImageFont\nimport cv2\nimport math\n\nfrom .colormap import colormap\nfrom ppdet.utils.logger import setup_logger\nfrom ppdet.utils.compact import imagedraw_textsize_c\nfrom ppdet.utils.download import get_path\nlogger = setup_logger(__name__)\n\n__all__ = ['visualize_results']\n\n\ndef visualize_results(image,\n                      bbox_res,\n                      mask_res,\n                      segm_res,\n                      keypoint_res,\n                      pose3d_res,\n                      im_id,\n                      catid2name,\n                      threshold=0.5):\n    \"\"\"\n    Visualize bbox and mask results\n    \"\"\"\n    if bbox_res is not None:\n        image = draw_bbox(image, im_id, catid2name, bbox_res, threshold)\n    if mask_res is not None:\n        image = draw_mask(image, im_id, mask_res, threshold)\n    if segm_res is not None:\n        image = draw_segm(image, im_id, catid2name, segm_res, threshold)\n    if keypoint_res is not None:\n        image = draw_pose(image, keypoint_res, threshold)\n    if pose3d_res is not None:\n        pose3d = np.array(pose3d_res[0]['pose3d']) * 1000\n        image = draw_pose3d(image, pose3d, visual_thread=threshold)\n    return image\n\n\ndef draw_mask(image, im_id, segms, threshold, alpha=0.7):\n    \"\"\"\n    Draw mask on image\n    \"\"\"\n    mask_color_id = 0\n    w_ratio = .4\n    color_list = colormap(rgb=True)\n    img_array = np.array(image).astype('float32')\n    for dt in np.array(segms):\n        if im_id != dt['image_id']:\n            continue\n        segm, score = dt['segmentation'], dt['score']\n        if score < threshold:\n            continue\n        import pycocotools.mask as mask_util\n        mask = mask_util.decode(segm) * 255\n        color_mask = color_list[mask_color_id % len(color_list), 0:3]\n        mask_color_id += 1\n        for c in range(3):\n            color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255\n        idx = np.nonzero(mask)\n        img_array[idx[0], idx[1], :] *= 1.0 - alpha\n        img_array[idx[0], idx[1], :] += alpha * color_mask\n    return Image.fromarray(img_array.astype('uint8'))\n\n\ndef draw_bbox(image, im_id, catid2name, bboxes, threshold):\n    \"\"\"\n    Draw bbox on image\n    \"\"\"\n    font_url = \"https://paddledet.bj.bcebos.com/simfang.ttf\"\n    font_path, _ = get_path(font_url, \"~/.cache/paddle/\")\n    font_size = 18\n    font = ImageFont.truetype(font_path, font_size, encoding=\"utf-8\")\n\n    draw = ImageDraw.Draw(image)\n\n    catid2color = {}\n    color_list = colormap(rgb=True)[:40]\n    for dt in np.array(bboxes):\n        if im_id != dt['image_id']:\n            continue\n        
catid, bbox, score = dt['category_id'], dt['bbox'], dt['score']\n        if score < threshold:\n            continue\n\n        if catid not in catid2color:\n            idx = np.random.randint(len(color_list))\n            catid2color[catid] = color_list[idx]\n        color = tuple(catid2color[catid])\n\n        # draw bbox\n        if len(bbox) == 4:\n            # draw bbox\n            xmin, ymin, w, h = bbox\n            xmax = xmin + w\n            ymax = ymin + h\n            draw.line(\n                [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin),\n                 (xmin, ymin)],\n                width=2,\n                fill=color)\n        elif len(bbox) == 8:\n            x1, y1, x2, y2, x3, y3, x4, y4 = bbox\n            draw.line(\n                [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)],\n                width=2,\n                fill=color)\n            xmin = min(x1, x2, x3, x4)\n            ymin = min(y1, y2, y3, y4)\n        else:\n            logger.error('the shape of bbox must be [M, 4] or [M, 8]!')\n\n        # draw label\n        text = \"{} {:.2f}\".format(catid2name[catid], score)\n        tw, th = imagedraw_textsize_c(draw, text, font=font)\n        draw.rectangle(\n            [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color)\n        draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255), font=font)\n\n    return image\n\n\ndef save_result(save_path, results, catid2name, threshold):\n    \"\"\"\n    save result as txt\n    \"\"\"\n    img_id = int(results[\"im_id\"])\n    with open(save_path, 'w') as f:\n        if \"bbox_res\" in results:\n            for dt in results[\"bbox_res\"]:\n                catid, bbox, score = dt['category_id'], dt['bbox'], dt['score']\n                if score < threshold:\n                    continue\n                # each bbox result as a line\n                # for rbox: classname score x1 y1 x2 y2 x3 y3 x4 y4\n                # for bbox: classname score x1 y1 w h\n                bbox_pred = '{} {} '.format(catid2name[catid],\n                                            score) + ' '.join(\n                                                [str(e) for e in bbox])\n                f.write(bbox_pred + '\\n')\n        elif \"keypoint_res\" in results:\n            for dt in results[\"keypoint_res\"]:\n                kpts = dt['keypoints']\n                scores = dt['score']\n                keypoint_pred = [img_id, scores, kpts]\n                print(keypoint_pred, file=f)\n        else:\n            print(\"No valid results found, skip txt save\")\n\n\ndef draw_segm(image,\n              im_id,\n              catid2name,\n              segms,\n              threshold,\n              alpha=0.7,\n              draw_box=True):\n    \"\"\"\n    Draw segmentation on image\n    \"\"\"\n    mask_color_id = 0\n    w_ratio = .4\n    color_list = colormap(rgb=True)\n    img_array = np.array(image).astype('float32')\n    for dt in np.array(segms):\n        if im_id != dt['image_id']:\n            continue\n        segm, score, catid = dt['segmentation'], dt['score'], dt['category_id']\n        if score < threshold:\n            continue\n        import pycocotools.mask as mask_util\n        mask = mask_util.decode(segm) * 255\n        color_mask = color_list[mask_color_id % len(color_list), 0:3]\n        mask_color_id += 1\n        for c in range(3):\n            color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255\n        idx = np.nonzero(mask)\n        img_array[idx[0], idx[1], :] *= 1.0 - 
alpha\n        img_array[idx[0], idx[1], :] += alpha * color_mask\n\n        if not draw_box:\n            # lazy import, like pycocotools above; scipy is only needed here\n            from scipy import ndimage\n            center_y, center_x = ndimage.center_of_mass(mask)\n            label_text = \"{}\".format(catid2name[catid])\n            vis_pos = (max(int(center_x) - 10, 0), int(center_y))\n            cv2.putText(img_array, label_text, vis_pos,\n                        cv2.FONT_HERSHEY_COMPLEX, 0.3, (255, 255, 255))\n        else:\n            mask = mask_util.decode(segm) * 255\n            sum_x = np.sum(mask, axis=0)\n            x = np.where(sum_x > 0.5)[0]\n            sum_y = np.sum(mask, axis=1)\n            y = np.where(sum_y > 0.5)[0]\n            x0, x1, y0, y1 = x[0], x[-1], y[0], y[-1]\n            cv2.rectangle(img_array, (x0, y0), (x1, y1),\n                          tuple(color_mask.astype('int32').tolist()), 1)\n            bbox_text = '%s %.2f' % (catid2name[catid], score)\n            t_size = cv2.getTextSize(bbox_text, 0, 0.3, thickness=1)[0]\n            cv2.rectangle(img_array, (x0, y0), (x0 + t_size[0],\n                                                y0 - t_size[1] - 3),\n                          tuple(color_mask.astype('int32').tolist()), -1)\n            cv2.putText(\n                img_array,\n                bbox_text, (x0, y0 - 2),\n                cv2.FONT_HERSHEY_SIMPLEX,\n                0.3, (0, 0, 0),\n                1,\n                lineType=cv2.LINE_AA)\n\n    return Image.fromarray(img_array.astype('uint8'))\n\n\ndef draw_pose(image,\n              results,\n              visual_thread=0.6,\n              save_name='pose.jpg',\n              save_dir='output',\n              returnimg=False,\n              ids=None):\n    try:\n        import matplotlib.pyplot as plt\n        import matplotlib\n        plt.switch_backend('agg')\n    except Exception as e:\n        logger.error('Matplotlib not found, please install matplotlib, '\n                     'for example: `pip install matplotlib`.')\n        raise e\n\n    skeletons = np.array([item['keypoints'] for item in results])\n    kpt_nums = 17\n    if len(skeletons) > 0:\n        kpt_nums = int(skeletons.shape[1] / 3)\n    skeletons = skeletons.reshape(-1, kpt_nums, 3)\n    if kpt_nums == 17:  #plot coco keypoint\n        EDGES = [(0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6), (5, 7), (6, 8),\n                 (7, 9), (8, 10), (5, 11), (6, 12), (11, 13), (12, 14),\n                 (13, 15), (14, 16), (11, 12)]\n    else:  #plot mpii keypoint\n        EDGES = [(0, 1), (1, 2), (3, 4), (4, 5), (2, 6), (3, 6), (6, 7), (7, 8),\n                 (8, 9), (10, 11), (11, 12), (13, 14), (14, 15), (8, 12),\n                 (8, 13)]\n    NUM_EDGES = len(EDGES)\n\n    colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \\\n            [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \\\n            [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]\n    cmap = matplotlib.cm.get_cmap('hsv')\n    plt.figure()\n\n    img = np.array(image).astype('float32')\n\n    color_set = results['colors'] if 'colors' in results else None\n\n    if 'bbox' in results and ids is None:\n        bboxs = results['bbox']\n        for j, rect in enumerate(bboxs):\n            xmin, ymin, xmax, ymax = rect\n            color = colors[0] if color_set is None else colors[color_set[j] %\n                                                               len(colors)]\n            cv2.rectangle(img, (xmin, ymin), (xmax, 
ymax), color, 1)\n\n    canvas = img.copy()\n    for i in range(kpt_nums):\n        for j in range(len(skeletons)):\n            if skeletons[j][i, 2] < visual_thread:\n                continue\n            if ids is None:\n                color = colors[i] if color_set is None else colors[color_set[j]\n                                                                   %\n                                                                   len(colors)]\n            else:\n                color = get_color(ids[j])\n\n            cv2.circle(\n                canvas,\n                tuple(skeletons[j][i, 0:2].astype('int32')),\n                2,\n                color,\n                thickness=-1)\n\n    to_plot = cv2.addWeighted(img, 0.3, canvas, 0.7, 0)\n    fig = matplotlib.pyplot.gcf()\n\n    stickwidth = 2\n\n    for i in range(NUM_EDGES):\n        for j in range(len(skeletons)):\n            edge = EDGES[i]\n            if skeletons[j][edge[0], 2] < visual_thread or skeletons[j][edge[\n                    1], 2] < visual_thread:\n                continue\n\n            cur_canvas = canvas.copy()\n            X = [skeletons[j][edge[0], 1], skeletons[j][edge[1], 1]]\n            Y = [skeletons[j][edge[0], 0], skeletons[j][edge[1], 0]]\n            mX = np.mean(X)\n            mY = np.mean(Y)\n            length = ((X[0] - X[1])**2 + (Y[0] - Y[1])**2)**0.5\n            angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))\n            polygon = cv2.ellipse2Poly((int(mY), int(mX)),\n                                       (int(length / 2), stickwidth),\n                                       int(angle), 0, 360, 1)\n            if ids is None:\n                color = colors[i] if color_set is None else colors[color_set[j]\n                                                                   %\n                                                                   len(colors)]\n            else:\n                color = get_color(ids[j])\n            cv2.fillConvexPoly(cur_canvas, polygon, color)\n            canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0)\n    image = Image.fromarray(canvas.astype('uint8'))\n    plt.close()\n    return image\n\n\ndef draw_pose3d(image,\n                pose3d,\n                pose2d=None,\n                visual_thread=0.6,\n                save_name='pose3d.jpg',\n                returnimg=True):\n    try:\n        import matplotlib.pyplot as plt\n        import matplotlib\n        plt.switch_backend('agg')\n    except Exception as e:\n        logger.error('Matplotlib not found, please install matplotlib, '\n                     'for example: `pip install matplotlib`.')\n        raise e\n\n    if pose3d.shape[0] == 24:\n        joints_connectivity_dict = [\n            [0, 1, 0], [1, 2, 0], [5, 4, 1], [4, 3, 1], [2, 3, 0], [2, 14, 1],\n            [3, 14, 1], [14, 16, 1], [15, 16, 1], [15, 12, 1], [6, 7, 0],\n            [7, 8, 0], [11, 10, 1], [10, 9, 1], [8, 12, 0], [9, 12, 1],\n            [12, 19, 1], [19, 18, 1], [19, 20, 0], [19, 21, 1], [22, 20, 0],\n            [23, 21, 1]\n        ]\n    elif pose3d.shape[0] == 14:\n        joints_connectivity_dict = [\n            [0, 1, 0], [1, 2, 0], [5, 4, 1], [4, 3, 1], [2, 3, 0], [2, 12, 0],\n            [3, 12, 1], [6, 7, 0], [7, 8, 0], [11, 10, 1], [10, 9, 1],\n            [8, 12, 0], [9, 12, 1], [12, 13, 1]\n        ]\n    else:\n        print(\n            \"undefined joints number: {}, cannot visualize because the joint connectivity is unknown\".\n            format(pose3d.shape[0]))\n        
return\n\n    def draw3Dpose(pose3d,\n                   ax,\n                   lcolor=\"#3498db\",\n                   rcolor=\"#e74c3c\",\n                   add_labels=False):\n        #    pose3d = orthographic_projection(pose3d, cam)\n        for i in joints_connectivity_dict:\n            x, y, z = [\n                np.array([pose3d[i[0], j], pose3d[i[1], j]]) for j in range(3)\n            ]\n            ax.plot(-x, -z, -y, lw=2, c=lcolor if i[2] else rcolor)\n\n        RADIUS = 1000\n        center_xy = 2 if pose3d.shape[0] == 14 else 14\n        x, y, z = pose3d[center_xy, 0], pose3d[center_xy, 1], pose3d[center_xy,\n                                                                     2]\n        ax.set_xlim3d([-RADIUS + x, RADIUS + x])\n        ax.set_ylim3d([-RADIUS + y, RADIUS + y])\n        ax.set_zlim3d([-RADIUS + z, RADIUS + z])\n\n        ax.set_xlabel(\"x\")\n        ax.set_ylabel(\"y\")\n        ax.set_zlabel(\"z\")\n\n    def draw2Dpose(pose2d,\n                   ax,\n                   lcolor=\"#3498db\",\n                   rcolor=\"#e74c3c\",\n                   add_labels=False):\n        for i in joints_connectivity_dict:\n            if pose2d[i[0], 2] and pose2d[i[1], 2]:\n                x, y = [\n                    np.array([pose2d[i[0], j], pose2d[i[1], j]])\n                    for j in range(2)\n                ]\n                ax.plot(x, y, 0, lw=2, c=lcolor if i[2] else rcolor)\n\n    def draw_img_pose(pose3d,\n                      pose2d=None,\n                      frame=None,\n                      figsize=(12, 12),\n                      savepath=None):\n        fig = plt.figure(figsize=figsize, dpi=80)\n        # fig.clear()\n        fig.tight_layout()\n\n        ax = fig.add_subplot(221)\n        if frame is not None:\n            ax.imshow(frame, interpolation='nearest')\n        if pose2d is not None:\n            draw2Dpose(pose2d, ax)\n\n        ax = fig.add_subplot(222, projection='3d')\n        ax.view_init(45, 45)\n        draw3Dpose(pose3d, ax)\n        ax = fig.add_subplot(223, projection='3d')\n        ax.view_init(0, 0)\n        draw3Dpose(pose3d, ax)\n        ax = fig.add_subplot(224, projection='3d')\n        ax.view_init(0, 90)\n        draw3Dpose(pose3d, ax)\n\n        if savepath is not None:\n            plt.savefig(savepath)\n            plt.close()\n        else:\n            return fig\n\n    def fig2data(fig):\n        \"\"\"\n        fig = plt.figure()\n        image = fig2data(fig)\n        @brief Convert a Matplotlib figure to a 4D numpy array with RGBA channels and return it\n        @param fig a matplotlib figure\n        @return a numpy 3D array of RGBA values\n        \"\"\"\n        # draw the renderer\n        fig.canvas.draw()\n\n        # Get the RGBA buffer from the figure\n        w, h = fig.canvas.get_width_height()\n        buf = np.fromstring(fig.canvas.tostring_argb(), dtype=np.uint8)\n        buf.shape = (w, h, 4)\n\n        # canvas.tostring_argb give pixmap in ARGB mode. Roll the ALPHA channel to have it in RGBA mode\n        buf = np.roll(buf, 3, axis=2)\n        image = Image.frombytes(\"RGBA\", (w, h), buf.tostring())\n        return image.convert(\"RGB\")\n\n    fig = draw_img_pose(pose3d, pose2d, frame=image)\n    data = fig2data(fig)\n    if returnimg is False:\n        data.save(save_name)\n    else:\n        return data\n"
  },
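The `fig2data` helper above depends on `fig.canvas.tostring_argb()`, which newer Matplotlib releases have removed. A minimal, version-tolerant sketch of the same figure-to-image conversion, assuming an Agg backend (the helper name `fig_to_pil` is mine, not part of the repo):

```python
# Standalone sketch: render a Matplotlib figure to an RGB PIL image using
# buffer_rgba(), which the Agg canvas has provided since Matplotlib 3.1.
import numpy as np
from PIL import Image
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt


def fig_to_pil(fig):
    fig.canvas.draw()
    buf = np.asarray(fig.canvas.buffer_rgba())  # (h, w, 4) uint8, RGBA
    return Image.fromarray(buf).convert('RGB')


if __name__ == '__main__':
    fig = plt.figure()
    plt.plot([0, 1], [0, 1])
    fig_to_pil(fig).save('check.png')
```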
  {
    "path": "ppdet/utils/voc_utils.py",
    "content": "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport os.path as osp\nimport re\nimport random\n\n__all__ = ['create_list']\n\n\ndef create_list(devkit_dir, years, output_dir):\n    \"\"\"\n    create following list:\n        1. trainval.txt\n        2. test.txt\n    \"\"\"\n    trainval_list = []\n    test_list = []\n    for year in years:\n        trainval, test = _walk_voc_dir(devkit_dir, year, output_dir)\n        trainval_list.extend(trainval)\n        test_list.extend(test)\n\n    random.shuffle(trainval_list)\n    with open(osp.join(output_dir, 'trainval.txt'), 'w') as ftrainval:\n        for item in trainval_list:\n            ftrainval.write(item[0] + ' ' + item[1] + '\\n')\n\n    with open(osp.join(output_dir, 'test.txt'), 'w') as fval:\n        ct = 0\n        for item in test_list:\n            ct += 1\n            fval.write(item[0] + ' ' + item[1] + '\\n')\n\n\ndef _get_voc_dir(devkit_dir, year, type):\n    return osp.join(devkit_dir, 'VOC' + year, type)\n\n\ndef _walk_voc_dir(devkit_dir, year, output_dir):\n    filelist_dir = _get_voc_dir(devkit_dir, year, 'ImageSets/Main')\n    annotation_dir = _get_voc_dir(devkit_dir, year, 'Annotations')\n    img_dir = _get_voc_dir(devkit_dir, year, 'JPEGImages')\n    trainval_list = []\n    test_list = []\n    added = set()\n\n    for _, _, files in os.walk(filelist_dir):\n        for fname in files:\n            img_ann_list = []\n            if re.match(r'[a-z]+_trainval\\.txt', fname):\n                img_ann_list = trainval_list\n            elif re.match(r'[a-z]+_test\\.txt', fname):\n                img_ann_list = test_list\n            else:\n                continue\n            fpath = osp.join(filelist_dir, fname)\n            for line in open(fpath):\n                name_prefix = line.strip().split()[0]\n                if name_prefix in added:\n                    continue\n                added.add(name_prefix)\n                ann_path = osp.join(\n                    osp.relpath(annotation_dir, output_dir),\n                    name_prefix + '.xml')\n                img_path = osp.join(\n                    osp.relpath(img_dir, output_dir), name_prefix + '.jpg')\n                img_ann_list.append((img_path, ann_path))\n\n    return trainval_list, test_list\n"
  },
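For reference, a minimal invocation of `create_list` from `ppdet/utils/voc_utils.py`; the devkit path below is an assumption, substitute your own VOCdevkit location:

```python
# Sketch: generate trainval.txt / test.txt for VOC2007+2012. Each output
# line is "<relative image path> <relative annotation path>".
from ppdet.utils.voc_utils import create_list

create_list(
    devkit_dir='dataset/voc/VOCdevkit',  # assumed layout: VOCdevkit/VOC2007, VOC2012
    years=['2007', '2012'],
    output_dir='dataset/voc')
```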
  {
    "path": "requirements.txt",
    "content": "numpy < 2.0\ntqdm\ntypeguard\nvisualdl>=2.2.0\nopencv-python <= 4.6.0\nPyYAML\nshapely\nscipy\nterminaltables\nCython\npycocotools\nsetuptools\nPillow\n\n# for MOT evaluation and inference\nlapx\nmotmetrics\nsklearn==0.0\n\n# for vehicleplate in deploy/pipeline/ppvehicle\npyclipper\n\n# for culane data augumetation\nimgaug>=0.4.0"
  },
  {
    "path": "scripts/build_wheel.sh",
    "content": "#!/usr/bin/env bash\n\n# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n#=================================================\n#                   Utils\n#=================================================\n\n\n# directory config\nDIST_DIR=\"dist\"\nBUILD_DIR=\"build\"\nEGG_DIR=\"paddledet.egg-info\"\n\nCFG_DIR=\"configs\"\nTEST_DIR=\".tests\"\nDATA_DIR=\"dataset\"\n\n# command line log config\nRED='\\033[0;31m'\nBLUE='\\033[0;34m'\nGREEN='\\033[1;32m'\nBOLD='\\033[1m'\nNONE='\\033[0m'\n\nfunction python_version_check() {\n  PY_MAIN_VERSION=`python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1}'`\n  PY_SUB_VERSION=`python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $2}'`\n  echo -e \"find python version ${PY_MAIN_VERSION}.${PY_SUB_VERSION}\"\n  if [ $PY_MAIN_VERSION -ne \"3\" -o $PY_SUB_VERSION -lt \"5\" ]; then\n    echo -e \"${RED}FAIL:${NONE} please use Python >= 3.5 !\"\n    exit 1\n  fi\n}\n\nfunction init() {\n    echo -e \"${BLUE}[init]${NONE} removing building directory...\"\n    rm -rf $DIST_DIR $BUILD_DIR $EGG_DIR $TEST_DIR\n    if [ `pip list | grep paddledet | wc -l` -gt 0  ]; then\n      echo -e \"${BLUE}[init]${NONE} uninstalling paddledet...\"\n      pip uninstall -y paddledet\n    fi\n    echo -e \"${BLUE}[init]${NONE} ${GREEN}init success\\n\"\n}\n\nfunction build_and_install() {\n  echo -e \"${BLUE}[build]${NONE} building paddledet wheel...\"\n  python setup.py sdist bdist_wheel\n  if [ $? -ne 0 ]; then\n    echo -e \"${RED}[FAIL]${NONE} build paddledet wheel failed !\"\n    exit 1\n  fi\n  echo -e \"${BLUE}[build]${NONE} ${GREEN}build paddldet wheel success\\n\"\n\n  echo -e \"${BLUE}[install]${NONE} installing paddledet...\"\n  cd $DIST_DIR\n  find . -name \"paddledet*.whl\" | xargs pip install\n  if [ $? -ne 0 ]; then\n    cd ..\n    echo -e \"${RED}[FAIL]${NONE} install paddledet wheel failed !\"\n    exit 1\n  fi\n  echo -e \"${BLUE}[install]${NONE} ${GREEN}paddledet install success\\n\"\n  cd ..\n}\n\nfunction unittest() {\n  if [ -d $TEST_DIR ]; then\n    rm -rf $TEST_DIR\n  fi;\n\n  echo -e \"${BLUE}[unittest]${NONE} run unittests...\"\n\n  # NOTE: perform unittests under TEST_DIR to\n  #       make sure installed paddledet is used\n  mkdir $TEST_DIR\n  cp -r $CFG_DIR $TEST_DIR\n  cp -r $DATA_DIR $TEST_DIR\n  cd $TEST_DIR\n\n  if [ $? 
!= 0  ]; then\n    exit 1\n  fi\n  find \"../ppdet\" -wholename '*tests/test_*' -type f -print0 | \\\n      xargs -0 -I{} -n1 -t bash -c  'python -u -s {}'\n\n  # clean TEST_DIR\n  cd ..\n  rm -rf $TEST_DIR\n  echo -e \"${BLUE}[unittest]${NONE} ${GREEN}unittests success\\n${NONE}\"\n}\n\nfunction cleanup() {\n  if [ -d $TEST_DIR ]; then\n    rm -rf $TEST_DIR\n  fi\n\n  rm -rf $BUILD_DIR $EGG_DIR\n  pip uninstall -y paddledet\n}\n\nfunction abort() {\n  echo -e \"${RED}[FAIL]${NONE} build wheel and unittest failed !\n          please check your code\" 1>&2\n\n  cur_dir=`basename \"$PWD\"`\n  if [ \"$cur_dir\" == \"$TEST_DIR\" -o \"$cur_dir\" == \"$DIST_DIR\" ]; then\n    cd ..\n  fi\n\n  rm -rf $BUILD_DIR $EGG_DIR $DIST_DIR $TEST_DIR\n  pip uninstall -y paddledet\n}\n\npython_version_check\n\ntrap 'abort' 0\nset -e\n\ninit\nbuild_and_install\nunittest\ncleanup\n\n# get Paddle version\nPADDLE_VERSION=`python -c \"import paddle; print(paddle.version.full_version)\"`\nPADDLE_COMMIT=`python -c \"import paddle; print(paddle.version.commit)\"`\nPADDLE_COMMIT=`git rev-parse --short $PADDLE_COMMIT`\n\n# get PaddleDetection branch\nPPDET_BRANCH=`git rev-parse --abbrev-ref HEAD`\nPPDET_COMMIT=`git rev-parse --short HEAD`\n\n# get Python version\nPYTHON_VERSION=`python -c \"import platform; print(platform.python_version())\"`\n\necho -e \"\\n${GREEN}paddledet wheel compiled and checked success !${NONE}\n        ${BLUE}Python version:${NONE} $PYTHON_VERSION\n        ${BLUE}Paddle version:${NONE} $PADDLE_VERSION ($PADDLE_COMMIT)\n        ${BLUE}PaddleDetection branch:${NONE} $PPDET_BRANCH ($PPDET_COMMIT)\\n\"\n\necho -e \"${GREEN}wheel saved under${NONE} ${RED}${BOLD}./dist\"\n\ntrap : 0\n"
  },
  {
    "path": "scripts/eval.sh",
    "content": "../../../../py37_meta_pd-2.4_cu11_comer/bin/python3.7 tools/eval.py \\\n    --config configs/artdetrv3_final/cortdetr_noisegroupx3_o2m_r18vd_120e_coco.yml \\\n    -o weights=outputs/"
  },
  {
    "path": "scripts/kill.sh",
    "content": "ps -ef | grep train.py | awk '{print $2}' | xargs kill -9\nkill -9 $(lsof -t /dev/nvidia*)\n"
  },
  {
    "path": "scripts/train.sh",
    "content": "PY37=/root/paddlejob/workspace/env_run/ws/py37_meta_pd-2.4_cu11_comer/bin/python3.7\n# PY37=../anaconda3/envs/py37_meta_pd-2.3.0_cu11/bin/python3.7\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\n\nnohup $PY37 -m paddle.distributed.launch --gpus=0,1,2,3 \\\n            tools/train.py \\\n            -c configs/artdetrv3/rtdetrv3_final_r18vd_6x_coco.yml --eval\\\n            -r output/rtdetrv3_final_r18vd_6x_coco/1 \\\n            -o save_dir=output/rtdetrv3_final_r18vd_6x_coco \\\n            &> output/train_rtdetrv3_final_r18vd_6x_coco.log&"
  },
  {
    "path": "tools/anchor_cluster.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\n# add python path of PaddleDetection to sys.path\nparent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2)))\nsys.path.insert(0, parent_path)\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger('ppdet.anchor_cluster')\n\nfrom scipy.cluster.vq import kmeans\nimport numpy as np\nfrom tqdm import tqdm\n\nfrom ppdet.utils.cli import ArgsParser\nfrom ppdet.utils.check import check_gpu, check_version, check_config\nfrom ppdet.core.workspace import load_config, merge_config\n\n\nclass BaseAnchorCluster(object):\n    def __init__(self, n, cache_path, cache, verbose=True):\n        \"\"\"\n        Base Anchor Cluster\n\n        Args:\n            n (int): number of clusters\n            cache_path (str): cache directory path\n            cache (bool): whether using cache\n            verbose (bool): whether print results\n        \"\"\"\n        super(BaseAnchorCluster, self).__init__()\n        self.n = n\n        self.cache_path = cache_path\n        self.cache = cache\n        self.verbose = verbose\n\n    def print_result(self, centers):\n        raise NotImplementedError('%s.print_result is not available' %\n                                  self.__class__.__name__)\n\n    def get_whs(self):\n        whs_cache_path = os.path.join(self.cache_path, 'whs.npy')\n        shapes_cache_path = os.path.join(self.cache_path, 'shapes.npy')\n        if self.cache and os.path.exists(whs_cache_path) and os.path.exists(\n                shapes_cache_path):\n            self.whs = np.load(whs_cache_path)\n            self.shapes = np.load(shapes_cache_path)\n            return self.whs, self.shapes\n        whs = np.zeros((0, 2))\n        shapes = np.zeros((0, 2))\n        self.dataset.parse_dataset()\n        roidbs = self.dataset.roidbs\n        for rec in tqdm(roidbs):\n            h, w = rec['h'], rec['w']\n            bbox = rec['gt_bbox']\n            wh = bbox[:, 2:4] - bbox[:, 0:2] + 1\n            wh = wh / np.array([[w, h]])\n            shape = np.ones_like(wh) * np.array([[w, h]])\n            whs = np.vstack((whs, wh))\n            shapes = np.vstack((shapes, shape))\n\n        if self.cache:\n            os.makedirs(self.cache_path, exist_ok=True)\n            np.save(whs_cache_path, whs)\n            np.save(shapes_cache_path, shapes)\n\n        self.whs = whs\n        self.shapes = shapes\n        return self.whs, self.shapes\n\n    def calc_anchors(self):\n        raise NotImplementedError('%s.calc_anchors is not available' %\n                                  self.__class__.__name__)\n\n    def __call__(self):\n        self.get_whs()\n        centers = self.calc_anchors()\n        if self.verbose:\n            self.print_result(centers)\n        return centers\n\n\nclass 
YOLOv2AnchorCluster(BaseAnchorCluster):\n    def __init__(self,\n                 n,\n                 dataset,\n                 size,\n                 cache_path,\n                 cache,\n                 iters=1000,\n                 verbose=True):\n        super(YOLOv2AnchorCluster, self).__init__(\n            n, cache_path, cache, verbose=verbose)\n        \"\"\"\n        YOLOv2 Anchor Cluster\n\n        The code is based on https://github.com/AlexeyAB/darknet/blob/master/scripts/gen_anchors.py\n\n        Args:\n            n (int): number of clusters\n            dataset (DataSet): DataSet instance, VOC or COCO\n            size (list): [w, h]\n            cache_path (str): cache directory path\n            cache (bool): whether using cache\n            iters (int): kmeans algorithm iters\n            verbose (bool): whether print results\n        \"\"\"\n        self.dataset = dataset\n        self.size = size\n        self.iters = iters\n\n    def print_result(self, centers):\n        logger.info('%d anchor cluster result: [w, h]' % self.n)\n        for w, h in centers:\n            logger.info('[%d, %d]' % (round(w), round(h)))\n\n    def metric(self, whs, centers):\n        wh1 = whs[:, None]\n        wh2 = centers[None]\n        inter = np.minimum(wh1, wh2).prod(2)\n        return inter / (wh1.prod(2) + wh2.prod(2) - inter)\n\n    def kmeans_expectation(self, whs, centers, assignments):\n        dist = self.metric(whs, centers)\n        new_assignments = dist.argmax(1)\n        converged = (new_assignments == assignments).all()\n        return converged, new_assignments\n\n    def kmeans_maximizations(self, whs, centers, assignments):\n        new_centers = np.zeros_like(centers)\n        for i in range(centers.shape[0]):\n            mask = (assignments == i)\n            if mask.sum():\n                new_centers[i, :] = whs[mask].mean(0)\n        return new_centers\n\n    def calc_anchors(self):\n        self.whs = self.whs * np.array([self.size])\n        # random select k centers\n        whs, n, iters = self.whs, self.n, self.iters\n        logger.info('Running kmeans for %d anchors on %d points...' 
%\n                    (n, len(whs)))\n        idx = np.random.choice(whs.shape[0], size=n, replace=False)\n        centers = whs[idx]\n        assignments = np.zeros(whs.shape[0:1]) * -1\n        # kmeans\n        if n == 1:\n            return self.kmeans_maximizations(whs, centers, assignments)\n\n        pbar = tqdm(range(iters), desc='Cluster anchors with k-means algorithm')\n        for _ in pbar:\n            # E step\n            converged, assignments = self.kmeans_expectation(whs, centers,\n                                                             assignments)\n            if converged:\n                logger.info('kmeans algorithm has converged')\n                break\n            # M step\n            centers = self.kmeans_maximizations(whs, centers, assignments)\n            ious = self.metric(whs, centers)\n            pbar.desc = 'avg_iou: %.4f' % (ious.max(1).mean())\n\n        centers = sorted(centers, key=lambda x: x[0] * x[1])\n        return centers\n\n\ndef main():\n    parser = ArgsParser()\n    parser.add_argument(\n        '--n', '-n', default=9, type=int, help='num of clusters')\n    parser.add_argument(\n        '--iters',\n        '-i',\n        default=1000,\n        type=int,\n        help='num of iterations for kmeans')\n    parser.add_argument(\n        '--verbose', '-v', default=True, type=bool, help='whether print result')\n    parser.add_argument(\n        '--size',\n        '-s',\n        default=None,\n        type=str,\n        help='image size: w,h, using comma as delimiter')\n    parser.add_argument(\n        '--method',\n        '-m',\n        default='v2',\n        type=str,\n        help='cluster method, v2 is only supported now')\n    parser.add_argument(\n        '--cache_path', default='cache', type=str, help='cache path')\n    parser.add_argument(\n        '--cache', action='store_true', help='whether use cache')\n    FLAGS = parser.parse_args()\n\n    cfg = load_config(FLAGS.config)\n    merge_config(FLAGS.opt)\n    check_config(cfg)\n    # check if set use_gpu=True in paddlepaddle cpu version\n    if 'use_gpu' not in cfg:\n        cfg.use_gpu = False\n    check_gpu(cfg.use_gpu)\n    # check if paddlepaddle version is satisfied\n    check_version('develop')\n\n    # get dataset\n    dataset = cfg['TrainDataset']\n    if FLAGS.size:\n        if ',' in FLAGS.size:\n            size = list(map(int, FLAGS.size.split(',')))\n            assert len(size) == 2, \"the format of size is incorrect\"\n        else:\n            size = int(FLAGS.size)\n            size = [size, size]\n    elif 'inputs_def' in cfg['TestReader'] and 'image_shape' in cfg[\n            'TestReader']['inputs_def']:\n        size = cfg['TestReader']['inputs_def']['image_shape'][1:]\n    else:\n        raise ValueError('size is not specified')\n\n    if FLAGS.method == 'v2':\n        cluster = YOLOv2AnchorCluster(FLAGS.n, dataset, size, FLAGS.cache_path,\n                                      FLAGS.cache, FLAGS.iters, FLAGS.verbose)\n    else:\n        raise ValueError('cluster method: %s is not supported' % FLAGS.method)\n\n    anchors = cluster()\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
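The clustering distance in `YOLOv2AnchorCluster.metric` is the IoU of two boxes treated as concentric, which is why `kmeans_expectation` assigns points with `argmax` rather than `argmin`. A tiny standalone check of that metric:

```python
# Standalone check of the concentric-box IoU used as the k-means affinity.
import numpy as np


def wh_iou(whs, centers):
    wh1 = whs[:, None]      # (N, 1, 2)
    wh2 = centers[None]     # (1, K, 2)
    inter = np.minimum(wh1, wh2).prod(2)
    return inter / (wh1.prod(2) + wh2.prod(2) - inter)


whs = np.array([[10., 20.], [30., 30.]])
centers = np.array([[10., 20.], [15., 15.]])
print(wh_iou(whs, centers))
# Row 0: IoU 1.0 against the identical anchor, ~0.545 against [15, 15].
```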
  {
    "path": "tools/box_distribution.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport matplotlib.pyplot as plt\nimport json\nimport numpy as np\nimport argparse\nfrom pycocotools.coco import COCO\nfrom tqdm import tqdm\n\n\ndef median(data):\n    data.sort()\n    mid = len(data) // 2\n    median = (data[mid] + data[~mid]) / 2\n    return median\n\n\ndef draw_distribution(width, height, out_path):\n    w_bins = int((max(width) - min(width)) // 10)\n    h_bins = int((max(height) - min(height)) // 10)\n    plt.figure()\n    plt.subplot(221)\n    plt.hist(width, bins=w_bins, color='green')\n    plt.xlabel('Width rate *1000')\n    plt.ylabel('number')\n    plt.title('Distribution of Width')\n    plt.subplot(222)\n    plt.hist(height, bins=h_bins, color='blue')\n    plt.xlabel('Height rate *1000')\n    plt.title('Distribution of Height')\n    plt.savefig(out_path)\n    print(f'Distribution saved as {out_path}')\n    plt.show()\n\n\ndef get_ratio_infos(jsonfile, out_img, eval_size, small_stride):\n    coco = COCO(annotation_file=jsonfile)\n    allannjson = json.load(open(jsonfile, 'r'))\n    be_im_id = allannjson['annotations'][0]['image_id']\n    be_im_w = []\n    be_im_h = []\n    ratio_w = []\n    ratio_h = []\n    im_wid, im_hei = [], []\n    for ann in tqdm(allannjson['annotations']):\n        if ann['iscrowd']:\n            continue\n        x0, y0, w, h = ann['bbox'][:]\n        if be_im_id == ann['image_id']:\n            be_im_w.append(w)\n            be_im_h.append(h)\n        else:\n            im_w = coco.imgs[be_im_id]['width']\n            im_h = coco.imgs[be_im_id]['height']\n            im_wid.append(im_w)\n            im_hei.append(im_h)\n            im_m_w = np.mean(be_im_w)\n            im_m_h = np.mean(be_im_h)\n            dis_w = im_m_w / im_w\n            dis_h = im_m_h / im_h\n            ratio_w.append(dis_w)\n            ratio_h.append(dis_h)\n            be_im_id = ann['image_id']\n            be_im_w = [w]\n            be_im_h = [h]\n\n    im_w = coco.imgs[be_im_id]['width']\n    im_h = coco.imgs[be_im_id]['height']\n    im_wid.append(im_w)\n    im_hei.append(im_h)\n    all_im_m_w = np.mean(im_wid)\n    all_im_m_h = np.mean(im_hei)\n\n    im_m_w = np.mean(be_im_w)\n    im_m_h = np.mean(be_im_h)\n    dis_w = im_m_w / im_w\n    dis_h = im_m_h / im_h\n    ratio_w.append(dis_w)\n    ratio_h.append(dis_h)\n    mid_w = median(ratio_w)\n    mid_h = median(ratio_h)\n\n    reg_ratio = []\n    ratio_all = ratio_h + ratio_w\n    for r in ratio_all:\n        if r < 0.2:\n            reg_ratio.append(r)\n        elif r < 0.4:\n            reg_ratio.append(r / 2)\n        else:\n            reg_ratio.append(r / 4)\n    reg_ratio = sorted(reg_ratio)\n    max_ratio = reg_ratio[int(0.95 * len(reg_ratio))]\n    reg_max = round(max_ratio * eval_size / small_stride)\n\n    ratio_w = [i * 1000 for i in ratio_w]\n    ratio_h = [i * 1000 for i in ratio_h]\n    print(f'Suggested reg_range[1] is {reg_max+1}')\n    print(f'Mean of all 
img_w is {all_im_m_w}')\n    print(f'Mean of all img_h is {all_im_m_h}')\n    print(f'Median of ratio_w is {mid_w}')\n    print(f'Median of ratio_h is {mid_h}')\n    print('all_img with box: ', len(ratio_h))\n    print('all_ann: ', len(allannjson['annotations']))\n    draw_distribution(ratio_w, ratio_h, out_img)\n\n\ndef main():\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        '--json_path', type=str, default=None, help=\"Dataset json path.\")\n    parser.add_argument('--eval_size', type=int, default=640, help=\"eval size.\")\n    parser.add_argument(\n        '--small_stride', type=int, default=8, help=\"smallest stride.\")\n    parser.add_argument(\n        '--out_img',\n        type=str,\n        default='box_distribution.jpg',\n        help=\"Name of distribution img.\")\n    args = parser.parse_args()\n\n    get_ratio_infos(args.json_path, args.out_img, args.eval_size,\n                    args.small_stride)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
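The `median` helper relies on Python's bitwise-complement indexing: `data[~mid]` is `data[-mid - 1]`, so after sorting, an even-length list averages its two central elements while an odd-length list reads the middle element twice. A quick standalone check:

```python
# Check of the data[mid] / data[~mid] median trick used above.
def median(data):
    data.sort()
    mid = len(data) // 2
    return (data[mid] + data[~mid]) / 2


assert median([1, 3, 2, 4]) == 2.5  # (data[2] + data[-3]) / 2 == (3 + 2) / 2
assert median([5, 1, 9]) == 5.0     # odd length: both indices hit the middle
```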
  {
    "path": "tools/cam_ppdet.py",
    "content": "from __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\n# add python path of PaddleDetection to sys.path\nparent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2)))\nsys.path.insert(0, parent_path)\n\n# ignore warning log\nimport warnings\nwarnings.filterwarnings('ignore')\nfrom ppdet.utils.cli import ArgsParser, merge_args\nfrom ppdet.core.workspace import load_config, merge_config\nfrom ppdet.utils.check import check_gpu, check_npu, check_xpu, check_version, check_config\nfrom ppdet.utils.cam_utils import BBoxCAM\nimport paddle\n\n\n\ndef parse_args():\n    parser = ArgsParser()\n    parser.add_argument(\n        \"--infer_img\",\n        type=str,\n        default='demo/000000014439.jpg',    # hxw: 404x640\n        help=\"Image path, has higher priority over --infer_dir\")\n    parser.add_argument(\"--weights\",\n                        type=str,\n                        default='output/faster_rcnn_r50_vd_fpn_2x_coco_paddlejob/best_model.pdparams'\n                        )\n    parser.add_argument(\"--cam_out\",\n                        type=str,\n                        default='cam_faster_rcnn'\n                        )\n    parser.add_argument(\"--use_gpu\",\n                        type=bool,\n                        default=True)\n    parser.add_argument(\n        \"--infer_dir\",\n        type=str,\n        default=None,\n        help=\"Directory for images to perform inference on.\")\n    parser.add_argument(\n        \"--output_dir\",\n        type=str,\n        default=\"output\",\n        help=\"Directory for storing the output visualization files.\")\n    parser.add_argument(\n        \"--draw_threshold\",\n        type=float,\n        default=0.8,\n        help=\"Threshold to reserve the result for visualization.\")\n    parser.add_argument(\n        \"--save_results\",\n        type=bool,\n        default=False,\n        help=\"Whether to save inference results to output_dir.\")\n    parser.add_argument(\n        \"--target_feature_layer_name\",\n        type=str,\n        default='model.backbone', # define the featuremap to show grad cam, such as model.backbone, model.bbox_head.roi_extractor\n        help=\"Whether to save inference results to output_dir.\")\n    args = parser.parse_args()\n\n    return args\n\ndef run(FLAGS, cfg):\n    assert cfg.architecture in ['FasterRCNN', 'MaskRCNN', 'YOLOv3', 'PPYOLOE',\n                                'PPYOLOEWithAuxHead', 'BlazeFace', 'SSD', 'RetinaNet'],  \\\n        'Only supported cam for faster_rcnn based and yolov3 based architecture for now,  ' \\\n        'the others are not supported temporarily!'\n\n    bbox_cam = BBoxCAM(FLAGS, cfg)\n    bbox_cam.get_bboxes_cams()\n\n    print('finish')\n\n\n\ndef main():\n    FLAGS = parse_args()\n    cfg = load_config(FLAGS.config)\n    merge_args(cfg, FLAGS)\n    merge_config(FLAGS.opt)\n\n    # disable npu in config by default\n    if 'use_npu' not in cfg:\n        cfg.use_npu = False\n\n    # disable xpu in config by default\n    if 'use_xpu' not in cfg:\n        cfg.use_xpu = False\n\n    if cfg.use_gpu:\n        place = paddle.set_device('gpu')\n    elif cfg.use_npu:\n        place = paddle.set_device('npu')\n    elif cfg.use_xpu:\n        place = paddle.set_device('xpu')\n    else:\n        place = paddle.set_device('cpu')\n\n    check_config(cfg)\n    check_gpu(cfg.use_gpu)\n    check_npu(cfg.use_npu)\n    check_xpu(cfg.use_xpu)\n    check_version()\n\n    run(FLAGS, 
cfg)\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "tools/eval.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\n\n# add python path of PaddleDetection to sys.path\nparent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2)))\nsys.path.insert(0, parent_path)\n\n# ignore warning log\nimport warnings\nwarnings.filterwarnings('ignore')\n\nimport paddle\n\nfrom ppdet.core.workspace import create, load_config, merge_config\nfrom ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config\nfrom ppdet.utils.cli import ArgsParser, merge_args\nfrom ppdet.engine import Trainer, Trainer_ARSL, init_parallel_env\nfrom ppdet.metrics.coco_utils import json_eval_results\nfrom ppdet.slim import build_slim_model\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger('eval')\n\n\ndef parse_args():\n    parser = ArgsParser()\n    parser.add_argument(\n        \"--output_eval\",\n        default=None,\n        type=str,\n        help=\"Evaluation directory, default is current directory.\")\n\n    parser.add_argument(\n        '--json_eval',\n        action='store_true',\n        default=False,\n        help='Whether to re eval with already exists bbox.json or mask.json')\n\n    parser.add_argument(\n        \"--slim_config\",\n        default=None,\n        type=str,\n        help=\"Configuration file of slim method.\")\n\n    # TODO: bias should be unified\n    parser.add_argument(\n        \"--bias\",\n        action=\"store_true\",\n        help=\"whether add bias or not while getting w and h\")\n\n    parser.add_argument(\n        \"--classwise\",\n        action=\"store_true\",\n        help=\"whether per-category AP and draw P-R Curve or not.\")\n\n    parser.add_argument(\n        '--save_prediction_only',\n        action='store_true',\n        default=False,\n        help='Whether to save the evaluation results only')\n\n    parser.add_argument(\n        \"--amp\",\n        action='store_true',\n        default=False,\n        help=\"Enable auto mixed precision eval.\")\n\n    # for smalldet slice_infer\n    parser.add_argument(\n        \"--slice_infer\",\n        action='store_true',\n        help=\"Whether to slice the image and merge the inference results for small object detection.\"\n    )\n    parser.add_argument(\n        '--slice_size',\n        nargs='+',\n        type=int,\n        default=[640, 640],\n        help=\"Height of the sliced image.\")\n    parser.add_argument(\n        \"--overlap_ratio\",\n        nargs='+',\n        type=float,\n        default=[0.25, 0.25],\n        help=\"Overlap height ratio of the sliced image.\")\n    parser.add_argument(\n        \"--combine_method\",\n        type=str,\n        default='nms',\n        help=\"Combine method of the sliced images' detection results, choose in ['nms', 'nmm', 'concat'].\"\n    )\n    parser.add_argument(\n     
   \"--match_threshold\",\n        type=float,\n        default=0.6,\n        help=\"Combine method matching threshold.\")\n    parser.add_argument(\n        \"--match_metric\",\n        type=str,\n        default='ios',\n        help=\"Combine method matching metric, choose in ['iou', 'ios'].\")\n    args = parser.parse_args()\n    return args\n\n\ndef run(FLAGS, cfg):\n    if FLAGS.json_eval:\n        logger.info(\n            \"In json_eval mode, PaddleDetection will evaluate json files in \"\n            \"output_eval directly. And proposal.json, bbox.json and mask.json \"\n            \"will be detected by default.\")\n        json_eval_results(\n            cfg.metric,\n            json_directory=FLAGS.output_eval,\n            dataset=create('EvalDataset')())\n        return\n\n    # init parallel environment if nranks > 1\n    init_parallel_env()\n    ssod_method = cfg.get('ssod_method', None)\n    if ssod_method == 'ARSL':\n        # build ARSL_trainer\n        trainer = Trainer_ARSL(cfg, mode='eval')\n        # load ARSL_weights\n        trainer.load_weights(cfg.weights, ARSL_eval=True)\n    else:\n        # build trainer\n        trainer = Trainer(cfg, mode='eval')\n        #load weights\n        trainer.load_weights(cfg.weights)\n\n    # training\n    if FLAGS.slice_infer:\n        trainer.evaluate_slice(\n            slice_size=FLAGS.slice_size,\n            overlap_ratio=FLAGS.overlap_ratio,\n            combine_method=FLAGS.combine_method,\n            match_threshold=FLAGS.match_threshold,\n            match_metric=FLAGS.match_metric)\n    else:\n        trainer.evaluate()\n\n\ndef main():\n    FLAGS = parse_args()\n    cfg = load_config(FLAGS.config)\n    merge_args(cfg, FLAGS)\n    merge_config(FLAGS.opt)\n\n    # disable npu in config by default\n    if 'use_npu' not in cfg:\n        cfg.use_npu = False\n\n    # disable xpu in config by default\n    if 'use_xpu' not in cfg:\n        cfg.use_xpu = False\n\n    if 'use_gpu' not in cfg:\n        cfg.use_gpu = False\n\n    # disable mlu in config by default\n    if 'use_mlu' not in cfg:\n        cfg.use_mlu = False\n\n    if cfg.use_gpu:\n        place = paddle.set_device('gpu')\n    elif cfg.use_npu:\n        place = paddle.set_device('npu')\n    elif cfg.use_xpu:\n        place = paddle.set_device('xpu')\n    elif cfg.use_mlu:\n        place = paddle.set_device('mlu')\n    else:\n        place = paddle.set_device('cpu')\n\n    if FLAGS.slim_config:\n        cfg = build_slim_model(cfg, FLAGS.slim_config, mode='eval')\n\n    check_config(cfg)\n    check_gpu(cfg.use_gpu)\n    check_npu(cfg.use_npu)\n    check_xpu(cfg.use_xpu)\n    check_mlu(cfg.use_mlu)\n    check_version()\n\n    run(FLAGS, cfg)\n\n\nif __name__ == '__main__':\n    main()\n"
  },
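The `--slice_infer` options describe a SAHI-style tiling scheme: the image is cut into `slice_size` windows with `overlap_ratio` overlap, each window is run through the detector, and the per-window results are merged with `--combine_method` under `--match_metric`/`--match_threshold`. A rough sketch of how such window origins can be derived (my own illustration of the flag semantics, not the engine code):

```python
# Illustration: top-left corners of overlapping tiles that cover an image,
# mirroring the meaning of --slice_size and --overlap_ratio.
def slice_origins(im_w, im_h, slice_size=(640, 640), overlap_ratio=(0.25, 0.25)):
    def axis(length, win, ov):
        step = max(int(win * (1 - ov)), 1)
        starts = list(range(0, max(length - win, 0) + 1, step))
        if starts[-1] + win < length:  # keep the last tile flush with the edge
            starts.append(length - win)
        return starts

    xs = axis(im_w, slice_size[0], overlap_ratio[0])
    ys = axis(im_h, slice_size[1], overlap_ratio[1])
    return [(x, y) for y in ys for x in xs]


print(slice_origins(1280, 720))  # [(0, 0), (480, 0), (640, 0), (0, 80), ...]
```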
  {
    "path": "tools/eval_mot.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\n\n# add python path of PaddleDetection to sys.path\nparent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2)))\nsys.path.insert(0, parent_path)\n\n# ignore warning log\nimport warnings\nwarnings.filterwarnings('ignore')\n\nimport paddle\n\nfrom ppdet.core.workspace import load_config, merge_config\nfrom ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config\nfrom ppdet.utils.cli import ArgsParser\nfrom ppdet.engine import Tracker\n\n\ndef parse_args():\n    parser = ArgsParser()\n    parser.add_argument(\n        \"--det_results_dir\",\n        type=str,\n        default='',\n        help=\"Directory name for detection results.\")\n    parser.add_argument(\n        '--output_dir',\n        type=str,\n        default='output',\n        help='Directory name for output tracking results.')\n    parser.add_argument(\n        '--save_images',\n        action='store_true',\n        help='Save tracking results (image).')\n    parser.add_argument(\n        '--save_videos',\n        action='store_true',\n        help='Save tracking results (video).')\n    parser.add_argument(\n        '--show_image',\n        action='store_true',\n        help='Show tracking results (image).')\n    parser.add_argument(\n        '--scaled',\n        type=bool,\n        default=False,\n        help=\"Whether coords after detector outputs are scaled, False in JDE YOLOv3 \"\n        \"True in general detector.\")\n    args = parser.parse_args()\n    return args\n\n\ndef run(FLAGS, cfg):\n    dataset_dir = cfg['EvalMOTDataset'].dataset_dir\n    data_root = cfg['EvalMOTDataset'].data_root\n    data_root = '{}/{}'.format(dataset_dir, data_root)\n    seqs = os.listdir(data_root)\n    seqs.sort()\n\n    # build Tracker\n    tracker = Tracker(cfg, mode='eval')\n\n    # load weights\n    if cfg.architecture in ['DeepSORT', 'ByteTrack']:\n        tracker.load_weights_sde(cfg.det_weights, cfg.reid_weights)\n    else:\n        tracker.load_weights_jde(cfg.weights)\n\n    # inference\n    tracker.mot_evaluate(\n        data_root=data_root,\n        seqs=seqs,\n        data_type=cfg.metric.lower(),\n        model_type=cfg.architecture,\n        output_dir=FLAGS.output_dir,\n        save_images=FLAGS.save_images,\n        save_videos=FLAGS.save_videos,\n        show_image=FLAGS.show_image,\n        scaled=FLAGS.scaled,\n        det_results_dir=FLAGS.det_results_dir)\n\n\ndef main():\n    FLAGS = parse_args()\n    cfg = load_config(FLAGS.config)\n    merge_config(FLAGS.opt)\n\n    # disable npu in config by default\n    if 'use_npu' not in cfg:\n        cfg.use_npu = False\n\n    # disable xpu in config by default\n    if 'use_xpu' not in cfg:\n        cfg.use_xpu = False\n\n    if 'use_gpu' not in cfg:\n        
cfg.use_gpu = False\n\n    # disable mlu in config by default\n    if 'use_mlu' not in cfg:\n        cfg.use_mlu = False\n\n    if cfg.use_gpu:\n        place = paddle.set_device('gpu')\n    elif cfg.use_npu:\n        place = paddle.set_device('npu')\n    elif cfg.use_xpu:\n        place = paddle.set_device('xpu')\n    elif cfg.use_mlu:\n        place = paddle.set_device('mlu')\n    else:\n        place = paddle.set_device('cpu')\n\n    check_config(cfg)\n    check_gpu(cfg.use_gpu)\n    check_npu(cfg.use_npu)\n    check_xpu(cfg.use_xpu)\n    check_mlu(cfg.use_mlu)\n    check_version()\n\n    run(FLAGS, cfg)\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "tools/export_model.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\n\n# add python path of PaddleDetection to sys.path\nparent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2)))\nsys.path.insert(0, parent_path)\n\n# ignore warning log\nimport warnings\nwarnings.filterwarnings('ignore')\n\nimport paddle\nfrom ppdet.core.workspace import load_config, merge_config\nfrom ppdet.utils.check import check_gpu, check_version, check_config\nfrom ppdet.utils.cli import ArgsParser\nfrom ppdet.engine import Trainer\nfrom ppdet.engine.trainer_ssod import Trainer_ARSL\nfrom ppdet.slim import build_slim_model\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger('export_model')\n\n\ndef parse_args():\n    parser = ArgsParser()\n    parser.add_argument(\n        \"--output_dir\",\n        type=str,\n        default=\"output_inference\",\n        help=\"Directory for storing the output model files.\")\n    parser.add_argument(\n        \"--export_serving_model\",\n        type=bool,\n        default=False,\n        help=\"Whether to export serving model or not.\")\n    parser.add_argument(\n        \"--slim_config\",\n        default=None,\n        type=str,\n        help=\"Configuration file of slim method.\")\n    parser.add_argument(\"--for_fd\", action='store_true')\n    args = parser.parse_args()\n    return args\n\n\ndef run(FLAGS, cfg):\n    ssod_method = cfg.get('ssod_method', None)\n    if ssod_method is not None and ssod_method == 'ARSL':\n        trainer = Trainer_ARSL(cfg, mode='test')\n        trainer.load_weights(cfg.weights, ARSL_eval=True)\n    # build detector\n    else:\n        trainer = Trainer(cfg, mode='test')\n\n        # load weights\n        if cfg.architecture in ['DeepSORT', 'ByteTrack']:\n            trainer.load_weights_sde(cfg.det_weights, cfg.reid_weights)\n        else:\n            trainer.load_weights(cfg.weights)\n\n    # export model\n    trainer.export(FLAGS.output_dir, for_fd=FLAGS.for_fd)\n\n    if FLAGS.export_serving_model:\n        assert not FLAGS.for_fd\n        from paddle_serving_client.io import inference_model_to_serving\n        model_name = os.path.splitext(os.path.split(cfg.filename)[-1])[0]\n\n        inference_model_to_serving(\n            dirname=\"{}/{}\".format(FLAGS.output_dir, model_name),\n            serving_server=\"{}/{}/serving_server\".format(FLAGS.output_dir,\n                                                         model_name),\n            serving_client=\"{}/{}/serving_client\".format(FLAGS.output_dir,\n                                                         model_name),\n            model_filename=\"model.pdmodel\",\n            params_filename=\"model.pdiparams\")\n\n\ndef main():\n    paddle.set_device(\"cpu\")\n    FLAGS = parse_args()\n    cfg = load_config(FLAGS.config)\n    merge_config(FLAGS.opt)\n\n    
if FLAGS.slim_config:\n        cfg = build_slim_model(cfg, FLAGS.slim_config, mode='test')\n\n    # FIXME: Temporarily solve the priority problem of FLAGS.opt\n    merge_config(FLAGS.opt)\n    check_config(cfg)\n    if 'use_gpu' not in cfg:\n        cfg.use_gpu = False\n    check_gpu(cfg.use_gpu)\n    check_version()\n\n    run(FLAGS, cfg)\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "tools/gen_semi_coco.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport json\nimport argparse\nimport numpy as np\n\n\ndef save_json(path, images, annotations, categories):\n    new_json = {\n        'images': images,\n        'annotations': annotations,\n        'categories': categories,\n    }\n    with open(path, 'w') as f:\n        json.dump(new_json, f)\n    print('{} saved, with {} images and {} annotations.'.format(\n        path, len(images), len(annotations)))\n\n\ndef gen_semi_data(data_dir,\n                  json_file,\n                  percent=10.0,\n                  seed=1,\n                  seed_offset=0,\n                  txt_file=None):\n    json_name = json_file.split('/')[-1].split('.')[0]\n    json_file = os.path.join(data_dir, json_file)\n    anno = json.load(open(json_file, 'r'))\n    categories = anno['categories']\n    all_images = anno['images']\n    all_anns = anno['annotations']\n    print(\n        'Totally {} images and {} annotations, about {} gts per image.'.format(\n            len(all_images), len(all_anns), len(all_anns) / len(all_images)))\n\n    if txt_file:\n        print('Using percent {} and seed {}.'.format(percent, seed))\n        txt_file = os.path.join(data_dir, txt_file)\n        sup_idx = json.load(open(txt_file, 'r'))[str(percent)][str(seed)]\n        # max(sup_idx) = 117262 # 10%, sup_idx is not image_id\n    else:\n        np.random.seed(seed + seed_offset)\n        sup_len = int(percent / 100.0 * len(all_images))\n        sup_idx = np.random.choice(\n            range(len(all_images)), size=sup_len, replace=False)\n    labeled_images, labeled_anns = [], []\n    labeled_im_ids = []\n    unlabeled_images, unlabeled_anns = [], []\n\n    for i in range(len(all_images)):\n        if i in sup_idx:\n            labeled_im_ids.append(all_images[i]['id'])\n            labeled_images.append(all_images[i])\n        else:\n            unlabeled_images.append(all_images[i])\n\n    for an in all_anns:\n        im_id = an['image_id']\n        if im_id in labeled_im_ids:\n            labeled_anns.append(an)\n        else:\n            continue\n\n    save_path = '{}/{}'.format(data_dir, 'semi_annotations')\n    if not os.path.exists(save_path):\n        os.mkdir(save_path)\n\n    sup_name = '{}.{}@{}.json'.format(json_name, seed, int(percent))\n    sup_path = os.path.join(save_path, sup_name)\n    save_json(sup_path, labeled_images, labeled_anns, categories)\n\n    unsup_name = '{}.{}@{}-unlabeled.json'.format(json_name, seed, int(percent))\n    unsup_path = os.path.join(save_path, unsup_name)\n    save_json(unsup_path, unlabeled_images, unlabeled_anns, categories)\n\n\nif __name__ == '__main__':\n    parser = argparse.ArgumentParser()\n    parser.add_argument('--data_dir', type=str, default='./dataset/coco')\n    parser.add_argument(\n        '--json_file', type=str, default='annotations/instances_train2017.json')\n    parser.add_argument('--percent', type=float, 
default=10.0)\n    parser.add_argument('--seed', type=int, default=1)\n    parser.add_argument('--seed_offset', type=int, default=0)\n    parser.add_argument('--txt_file', type=str, default='COCO_supervision.txt')\n    args = parser.parse_args()\n    print(args)\n    gen_semi_data(args.data_dir, args.json_file, args.percent, args.seed,\n                  args.seed_offset, args.txt_file)\n"
  },
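`gen_semi_data` checks `if i in sup_idx` against a NumPy array and `im_id in labeled_im_ids` against a list, which makes the split quadratic on COCO-scale inputs. A behavior-preserving sketch of the same split with set lookups (a drop-in for the loop bodies above, reusing the function's own `sup_idx`, `all_images`, and `all_anns`):

```python
# Sketch: identical labeled/unlabeled split, but with O(1) membership tests.
sup_idx_set = set(int(i) for i in sup_idx)
labeled_images = [im for i, im in enumerate(all_images) if i in sup_idx_set]
unlabeled_images = [im for i, im in enumerate(all_images) if i not in sup_idx_set]

labeled_im_ids = {im['id'] for im in labeled_images}
labeled_anns = [an for an in all_anns if an['image_id'] in labeled_im_ids]
```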
  {
    "path": "tools/infer.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\n\n# add python path of PaddleDetection to sys.path\nparent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2)))\nsys.path.insert(0, parent_path)\n\n# ignore warning log\nimport warnings\nwarnings.filterwarnings('ignore')\nimport glob\nimport ast\n\nimport paddle\nfrom ppdet.core.workspace import create, load_config, merge_config\nfrom ppdet.engine import Trainer, Trainer_ARSL\nfrom ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config\nfrom ppdet.utils.cli import ArgsParser, merge_args\nfrom ppdet.slim import build_slim_model\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger('train')\n\n\ndef parse_args():\n    parser = ArgsParser()\n    parser.add_argument(\n        \"--infer_dir\",\n        type=str,\n        default=None,\n        help=\"Directory for images to perform inference on.\")\n    parser.add_argument(\n        \"--infer_list\",\n        type=str,\n        default=None,\n        help=\"The file path containing path of image to be infered. 
Valid only when --infer_dir is given.\"\n    )\n    parser.add_argument(\n        \"--infer_img\",\n        type=str,\n        default=None,\n        help=\"Image path, has higher priority over --infer_dir\")\n    parser.add_argument(\n        \"--output_dir\",\n        type=str,\n        default=\"output\",\n        help=\"Directory for storing the output visualization files.\")\n    parser.add_argument(\n        \"--draw_threshold\",\n        type=float,\n        default=0.5,\n        help=\"Threshold to reserve the result for visualization.\")\n    parser.add_argument(\n        \"--save_threshold\",\n        type=float,\n        default=0.5,\n        help=\"Threshold to reserve the result for saving.\")\n    parser.add_argument(\n        \"--slim_config\",\n        default=None,\n        type=str,\n        help=\"Configuration file of slim method.\")\n    parser.add_argument(\n        \"--use_vdl\",\n        type=ast.literal_eval,\n        default=False,\n        help=\"Whether to record the data to VisualDL.\")\n    parser.add_argument(\n        \"--do_eval\",\n        type=ast.literal_eval,\n        default=False,\n        help=\"Whether to run evaluation after inference.\")\n    parser.add_argument(\n        '--vdl_log_dir',\n        type=str,\n        default=\"vdl_log_dir/image\",\n        help='VisualDL logging directory for image.')\n    parser.add_argument(\n        \"--save_results\",\n        type=ast.literal_eval,\n        default=False,\n        help=\"Whether to save inference results to output_dir.\")\n    parser.add_argument(\n        \"--slice_infer\",\n        action='store_true',\n        help=\"Whether to slice the image and merge the inference results for small object detection.\"\n    )\n    parser.add_argument(\n        '--slice_size',\n        nargs='+',\n        type=int,\n        default=[640, 640],\n        help=\"Height and width of the sliced image.\")\n    parser.add_argument(\n        \"--overlap_ratio\",\n        nargs='+',\n        type=float,\n        default=[0.25, 0.25],\n        help=\"Overlap height and width ratio of the sliced image.\")\n    parser.add_argument(\n        \"--combine_method\",\n        type=str,\n        default='nms',\n        help=\"Combine method of the sliced images' detection results, chosen from ['nms', 'nmm', 'concat'].\"\n    )\n    parser.add_argument(\n        \"--match_threshold\",\n        type=float,\n        default=0.6,\n        help=\"Combine method matching threshold.\")\n    parser.add_argument(\n        \"--match_metric\",\n        type=str,\n        default='ios',\n        help=\"Combine method matching metric, chosen from ['iou', 'ios'].\")\n    parser.add_argument(\n        \"--visualize\",\n        type=ast.literal_eval,\n        default=True,\n        help=\"Whether to save visualized results to output_dir.\")\n    parser.add_argument(\n        \"--rtn_im_file\",\n        type=ast.literal_eval,\n        default=False,\n        help=\"Whether to return image file path in Dataloader.\")\n    args = parser.parse_args()\n    return args\n\n\ndef get_test_images(infer_dir, infer_img, infer_list=None):\n    \"\"\"\n    Get image path list in TEST mode\n    \"\"\"\n    assert infer_img is not None or infer_dir is not None, \\\n        \"--infer_img or --infer_dir should be set\"\n    assert infer_img is None or os.path.isfile(infer_img), \\\n            \"{} is not a file\".format(infer_img)\n    assert infer_dir is None or os.path.isdir(infer_dir), \\\n            \"{} is not a directory\".format(infer_dir)\n\n    # infer_img has a higher priority\n    if infer_img and os.path.isfile(infer_img):\n        return [infer_img]\n\n    images = set()\n    infer_dir = os.path.abspath(infer_dir)\n    assert os.path.isdir(infer_dir), \\\n        \"infer_dir {} is not a directory\".format(infer_dir)\n    if infer_list:\n        assert os.path.isfile(\n            infer_list), f\"infer_list {infer_list} is not a valid file path.\"\n        with open(infer_list, 'r') as f:\n            lines = f.readlines()\n        for line in lines:\n            images.update([os.path.join(infer_dir, line.strip())])\n    else:\n        exts = ['jpg', 'jpeg', 'png', 'bmp']\n        exts += [ext.upper() for ext in exts]\n        for ext in exts:\n            images.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))\n    images = list(images)\n    assert len(images) > 0, \"no image found in {}\".format(infer_dir)\n    logger.info(\"Found {} inference images in total.\".format(len(images)))\n\n    return images\n\n\ndef run(FLAGS, cfg):\n    if FLAGS.rtn_im_file:\n        cfg['TestReader']['sample_transforms'][0]['Decode'][\n            'rtn_im_file'] = FLAGS.rtn_im_file\n    ssod_method = cfg.get('ssod_method', None)\n    if ssod_method == 'ARSL':\n        trainer = Trainer_ARSL(cfg, mode='test')\n        trainer.load_weights(cfg.weights, ARSL_eval=True)\n    else:\n        trainer = Trainer(cfg, mode='test')\n        trainer.load_weights(cfg.weights)\n    # get inference images\n    if FLAGS.do_eval:\n        dataset = create('TestDataset')()\n        images = dataset.get_images()\n    else:\n        images = get_test_images(FLAGS.infer_dir, FLAGS.infer_img, FLAGS.infer_list)\n\n    # inference\n    if FLAGS.slice_infer:\n        trainer.slice_predict(\n            images,\n            slice_size=FLAGS.slice_size,\n            overlap_ratio=FLAGS.overlap_ratio,\n            combine_method=FLAGS.combine_method,\n            match_threshold=FLAGS.match_threshold,\n            match_metric=FLAGS.match_metric,\n            draw_threshold=FLAGS.draw_threshold,\n            output_dir=FLAGS.output_dir,\n            save_results=FLAGS.save_results,\n            visualize=FLAGS.visualize)\n    else:\n        trainer.predict(\n            images,\n            draw_threshold=FLAGS.draw_threshold,\n            output_dir=FLAGS.output_dir,\n            save_results=FLAGS.save_results,\n            visualize=FLAGS.visualize,\n            save_threshold=FLAGS.save_threshold,\n            do_eval=FLAGS.do_eval)\n\n\ndef main():\n    FLAGS = parse_args()\n    cfg = load_config(FLAGS.config)\n    merge_args(cfg, FLAGS)\n    merge_config(FLAGS.opt)\n\n    # disable npu in config by default\n    if 'use_npu' not in cfg:\n        cfg.use_npu = False\n\n    # disable xpu in config by default\n    if 'use_xpu' not in cfg:\n        cfg.use_xpu = False\n\n    if 'use_gpu' not in cfg:\n        cfg.use_gpu = False\n\n    # disable mlu in config by default\n    if 'use_mlu' not in cfg:\n        cfg.use_mlu = False\n\n    if cfg.use_gpu:\n        place = paddle.set_device('gpu')\n    elif cfg.use_npu:\n        place = paddle.set_device('npu')\n    elif cfg.use_xpu:\n        place = paddle.set_device('xpu')\n    elif cfg.use_mlu:\n        place = paddle.set_device('mlu')\n    else:\n        place = paddle.set_device('cpu')\n\n    if FLAGS.slim_config:\n        cfg = build_slim_model(cfg, FLAGS.slim_config, mode='test')\n\n    check_config(cfg)\n    check_gpu(cfg.use_gpu)\n    check_npu(cfg.use_npu)\n    check_xpu(cfg.use_xpu)\n    check_mlu(cfg.use_mlu)\n    check_version()\n    run(FLAGS, cfg)\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "tools/infer_culane.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\n\n# add python path of PaddleDetection to sys.path\nparent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2)))\nsys.path.insert(0, parent_path)\n\n# ignore warning log\nimport warnings\nwarnings.filterwarnings('ignore')\nimport glob\nimport ast\n\nimport paddle\nfrom ppdet.core.workspace import load_config, merge_config\nfrom ppdet.engine import Trainer\nfrom ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config\nfrom ppdet.utils.cli import ArgsParser, merge_args\nfrom ppdet.slim import build_slim_model\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger('train')\n\n\ndef parse_args():\n    parser = ArgsParser()\n    parser.add_argument(\n        \"--infer_dir\",\n        type=str,\n        default=None,\n        help=\"Directory for images to perform inference on.\")\n    parser.add_argument(\n        \"--infer_img\",\n        type=str,\n        default=None,\n        help=\"Image path, has higher priority over --infer_dir\")\n    parser.add_argument(\n        \"--output_dir\",\n        type=str,\n        default=\"output\",\n        help=\"Directory for storing the output visualization files.\")\n    parser.add_argument(\n        \"--save_results\",\n        type=bool,\n        default=False,\n        help=\"Whether to save inference results to output_dir.\")\n    parser.add_argument(\n        \"--visualize\",\n        type=ast.literal_eval,\n        default=True,\n        help=\"Whether to save visualize results to output_dir.\")\n    args = parser.parse_args()\n    return args\n\n\ndef get_test_images(infer_dir, infer_img):\n    \"\"\"\n    Get image path list in TEST mode\n    \"\"\"\n    assert infer_img is not None or infer_dir is not None, \\\n        \"--infer_img or --infer_dir should be set\"\n    assert infer_img is None or os.path.isfile(infer_img), \\\n            \"{} is not a file\".format(infer_img)\n    assert infer_dir is None or os.path.isdir(infer_dir), \\\n            \"{} is not a directory\".format(infer_dir)\n\n    # infer_img has a higher priority\n    if infer_img and os.path.isfile(infer_img):\n        return [infer_img]\n\n    images = set()\n    infer_dir = os.path.abspath(infer_dir)\n    assert os.path.isdir(infer_dir), \\\n        \"infer_dir {} is not a directory\".format(infer_dir)\n    exts = ['jpg', 'jpeg', 'png', 'bmp']\n    exts += [ext.upper() for ext in exts]\n    for ext in exts:\n        images.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))\n    images = list(images)\n\n    assert len(images) > 0, \"no image found in {}\".format(infer_dir)\n    logger.info(\"Found {} inference images in total.\".format(len(images)))\n\n    return images\n\n\ndef run(FLAGS, cfg):\n    # build trainer\n    trainer = 
Trainer(cfg, mode='test')\n\n    # load weights\n    trainer.load_weights(cfg.weights)\n\n    # get inference images\n    images = get_test_images(FLAGS.infer_dir, FLAGS.infer_img)\n\n    trainer.predict_culane(\n        images,\n        output_dir=FLAGS.output_dir,\n        save_results=FLAGS.save_results,\n        visualize=FLAGS.visualize)\n\n\ndef main():\n    FLAGS = parse_args()\n    cfg = load_config(FLAGS.config)\n    merge_args(cfg, FLAGS)\n    merge_config(FLAGS.opt)\n\n    # disable npu in config by default\n    if 'use_npu' not in cfg:\n        cfg.use_npu = False\n\n    # disable xpu in config by default\n    if 'use_xpu' not in cfg:\n        cfg.use_xpu = False\n\n    if 'use_gpu' not in cfg:\n        cfg.use_gpu = False\n\n    # disable mlu in config by default\n    if 'use_mlu' not in cfg:\n        cfg.use_mlu = False\n\n    if cfg.use_gpu:\n        place = paddle.set_device('gpu')\n    elif cfg.use_npu:\n        place = paddle.set_device('npu')\n    elif cfg.use_xpu:\n        place = paddle.set_device('xpu')\n    elif cfg.use_mlu:\n        place = paddle.set_device('mlu')\n    else:\n        place = paddle.set_device('cpu')\n\n    check_config(cfg)\n    check_gpu(cfg.use_gpu)\n    check_npu(cfg.use_npu)\n    check_xpu(cfg.use_xpu)\n    check_mlu(cfg.use_mlu)\n    check_version()\n\n    run(FLAGS, cfg)\n\n\nif __name__ == '__main__':\n    main()"
  },
  {
    "path": "tools/infer_mot.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\n\n# add python path of PaddleDetection to sys.path\nparent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2)))\nsys.path.insert(0, parent_path)\n\n# ignore warning log\nimport warnings\nwarnings.filterwarnings('ignore')\n\nimport paddle\nfrom ppdet.core.workspace import load_config, merge_config\nfrom ppdet.engine import Tracker\nfrom ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config\nfrom ppdet.utils.cli import ArgsParser\n\n\ndef parse_args():\n    parser = ArgsParser()\n    parser.add_argument(\n        '--video_file', type=str, default=None, help='Video name for tracking.')\n    parser.add_argument(\n        '--frame_rate',\n        type=int,\n        default=-1,\n        help='Video frame rate for tracking.')\n    parser.add_argument(\n        \"--image_dir\",\n        type=str,\n        default=None,\n        help=\"Directory for images to perform inference on.\")\n    parser.add_argument(\n        \"--det_results_dir\",\n        type=str,\n        default='',\n        help=\"Directory name for detection results.\")\n    parser.add_argument(\n        '--output_dir',\n        type=str,\n        default='output',\n        help='Directory name for output tracking results.')\n    parser.add_argument(\n        '--save_images',\n        action='store_true',\n        help='Save tracking results (image).')\n    parser.add_argument(\n        '--save_videos',\n        action='store_true',\n        help='Save tracking results (video).')\n    parser.add_argument(\n        '--show_image',\n        action='store_true',\n        help='Show tracking results (image).')\n    parser.add_argument(\n        '--scaled',\n        type=bool,\n        default=False,\n        help=\"Whether coords after detector outputs are scaled, False in JDE YOLOv3 \"\n        \"True in general detector.\")\n    parser.add_argument(\n        \"--draw_threshold\",\n        type=float,\n        default=0.5,\n        help=\"Threshold to reserve the result for visualization.\")\n    args = parser.parse_args()\n    return args\n\n\ndef run(FLAGS, cfg):\n    # build Tracker\n    tracker = Tracker(cfg, mode='test')\n\n    # load weights\n    if cfg.architecture in ['DeepSORT', 'ByteTrack']:\n        tracker.load_weights_sde(cfg.det_weights, cfg.reid_weights)\n    else:\n        tracker.load_weights_jde(cfg.weights)\n\n    # inference\n    tracker.mot_predict_seq(\n        video_file=FLAGS.video_file,\n        frame_rate=FLAGS.frame_rate,\n        image_dir=FLAGS.image_dir,\n        data_type=cfg.metric.lower(),\n        model_type=cfg.architecture,\n        output_dir=FLAGS.output_dir,\n        save_images=FLAGS.save_images,\n        save_videos=FLAGS.save_videos,\n        show_image=FLAGS.show_image,\n     
   scaled=FLAGS.scaled,\n        det_results_dir=FLAGS.det_results_dir,\n        draw_threshold=FLAGS.draw_threshold)\n\n\ndef main():\n    FLAGS = parse_args()\n    cfg = load_config(FLAGS.config)\n    merge_config(FLAGS.opt)\n\n    # disable npu in config by default\n    if 'use_npu' not in cfg:\n        cfg.use_npu = False\n\n    # disable xpu in config by default\n    if 'use_xpu' not in cfg:\n        cfg.use_xpu = False\n\n    if 'use_gpu' not in cfg:\n        cfg.use_gpu = False\n\n    # disable mlu in config by default\n    if 'use_mlu' not in cfg:\n        cfg.use_mlu = False\n\n    if cfg.use_gpu:\n        place = paddle.set_device('gpu')\n    elif cfg.use_npu:\n        place = paddle.set_device('npu')\n    elif cfg.use_xpu:\n        place = paddle.set_device('xpu')\n    elif cfg.use_mlu:\n        place = paddle.set_device('mlu')\n    else:\n        place = paddle.set_device('cpu')\n\n    check_config(cfg)\n    check_gpu(cfg.use_gpu)\n    check_npu(cfg.use_npu)\n    check_xpu(cfg.use_xpu)\n    check_mlu(cfg.use_mlu)\n    check_version()\n\n    run(FLAGS, cfg)\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "tools/post_quant.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\n\n# add python path of PaddleDetection to sys.path\nparent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2)))\nsys.path.insert(0, parent_path)\n\n# ignore warning log\nimport warnings\nwarnings.filterwarnings('ignore')\n\nimport paddle\n\nfrom ppdet.core.workspace import load_config, merge_config\nfrom ppdet.utils.check import check_gpu, check_version, check_config\nfrom ppdet.utils.cli import ArgsParser\nfrom ppdet.engine import Trainer\nfrom ppdet.slim import build_slim_model\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger('post_quant')\n\n\ndef parse_args():\n    parser = ArgsParser()\n    parser.add_argument(\n        \"--output_dir\",\n        type=str,\n        default=\"output_inference\",\n        help=\"Directory for storing the output model files.\")\n    parser.add_argument(\n        \"--slim_config\",\n        default=None,\n        type=str,\n        help=\"Configuration file of slim method.\")\n    args = parser.parse_args()\n    return args\n\n\ndef run(FLAGS, cfg):\n    # build detector\n    trainer = Trainer(cfg, mode='eval')\n\n    # load weights\n    if cfg.architecture in ['DeepSORT']:\n        if cfg.det_weights != 'None':\n            trainer.load_weights_sde(cfg.det_weights, cfg.reid_weights)\n        else:\n            trainer.load_weights_sde(None, cfg.reid_weights)\n    else:\n        trainer.load_weights(cfg.weights)\n\n    # post quant model\n    trainer.post_quant(FLAGS.output_dir)\n\n\ndef main():\n    FLAGS = parse_args()\n    cfg = load_config(FLAGS.config)\n    # TODO: to be refined in the future\n    if 'norm_type' in cfg and cfg['norm_type'] == 'sync_bn':\n        FLAGS.opt['norm_type'] = 'bn'\n    merge_config(FLAGS.opt)\n\n    if FLAGS.slim_config:\n        cfg = build_slim_model(cfg, FLAGS.slim_config, mode='test')\n\n    # FIXME: Temporarily solve the priority problem of FLAGS.opt\n    merge_config(FLAGS.opt)\n    check_config(cfg)\n    if 'use_gpu' not in cfg:\n        cfg.use_gpu = False\n    check_gpu(cfg.use_gpu)\n    check_version()\n\n    run(FLAGS, cfg)\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "tools/slice_image.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport argparse\nfrom tqdm import tqdm\n\n\ndef slice_data(image_dir, dataset_json_path, output_dir, slice_size,\n               overlap_ratio):\n    try:\n        from sahi.scripts.slice_coco import slice\n    except Exception as e:\n        raise RuntimeError(\n            'Unable to use sahi to slice images, please install sahi, for example: `pip install sahi`, see https://github.com/obss/sahi'\n        )\n    tqdm.write(\n        f\" slicing for slice_size={slice_size}, overlap_ratio={overlap_ratio}\")\n    slice(\n        image_dir=image_dir,\n        dataset_json_path=dataset_json_path,\n        output_dir=output_dir,\n        slice_size=slice_size,\n        overlap_ratio=overlap_ratio, )\n\n\ndef main():\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        '--image_dir', type=str, default=None, help=\"The image folder path.\")\n    parser.add_argument(\n        '--json_path', type=str, default=None, help=\"Dataset json path.\")\n    parser.add_argument(\n        '--output_dir', type=str, default=None, help=\"Output dir.\")\n    parser.add_argument(\n        '--slice_size', type=int, default=500, help=\"slice_size\")\n    parser.add_argument(\n        '--overlap_ratio', type=float, default=0.25, help=\"overlap_ratio\")\n    args = parser.parse_args()\n\n    slice_data(args.image_dir, args.json_path, args.output_dir, args.slice_size,\n               args.overlap_ratio)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "tools/sniper_params_stats.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport sys\nimport json\nimport logging\nimport numpy as np\n\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger('sniper_params_stats')\n\ndef get_default_params(architecture):\n    \"\"\"get_default_params\"\"\"\n    if architecture == \"FasterRCNN\":\n        anchor_range = np.array([64., 512.])  # for frcnn-fpn\n        # anchor_range = np.array([16., 373.])  # for yolov3\n        # anchor_range = np.array([32., 373.])  # for yolov3\n        default_crop_size = 1536  # mod 32 for frcnn-fpn\n        default_max_bbox_size = 352\n    elif architecture == \"YOLOv3\":\n        anchor_range = np.array([32., 373.])  # for yolov3\n        default_crop_size = 800  # mod 32 for yolov3\n        default_max_bbox_size = 352\n    else:\n        raise NotImplementedError\n\n    return anchor_range, default_crop_size, default_max_bbox_size\n\n\ndef get_box_ratios(anno_file):\n    \"\"\"\n    get_size_ratios\n    :param anno_file: coco anno flile\n    :return: size_ratio: (box_long_size / pic_long_size)\n    \"\"\"\n    coco_dict = json.load(open(anno_file))\n    image_list = coco_dict['images']\n    anno_list = coco_dict['annotations']\n\n    image_id2hw = {}\n    for im_dict in image_list:\n        im_id = im_dict['id']\n        h, w = im_dict['height'], im_dict['width']\n        image_id2hw[im_id] = (h, w)\n\n    box_ratios = []\n    for a_dict in anno_list:\n        im_id = a_dict['image_id']\n        im_h, im_w = image_id2hw[im_id]\n        bbox = a_dict['bbox']\n        x1, y1, w, h = bbox\n        pic_long = max(im_h, im_w)\n        box_long = max(w, h)\n        box_ratios.append(box_long / pic_long)\n\n    return np.array(box_ratios)\n\n\ndef get_target_size_and_valid_box_ratios(anchor_range, box_ratio_p2, box_ratio_p98):\n    \"\"\"get_scale_and_ratios\"\"\"\n    anchor_better_low, anchor_better_high = anchor_range  # (60., 512.)\n    anchor_center = np.sqrt(anchor_better_high * anchor_better_low)\n\n    anchor_log_range = np.log10(anchor_better_high) - np.log10(anchor_better_low)\n    box_ratio_log_range = np.log10(box_ratio_p98) - np.log10(box_ratio_p2)\n    logger.info(\"anchor_log_range:{}, box_ratio_log_range:{}\".format(anchor_log_range, box_ratio_log_range))\n\n    box_cut_num = int(np.ceil(box_ratio_log_range / anchor_log_range))\n    box_ratio_log_window = box_ratio_log_range / box_cut_num\n    logger.info(\"box_cut_num:{}, box_ratio_log_window:{}\".format(box_cut_num, box_ratio_log_window))\n\n    image_target_sizes = []\n    valid_ratios = []\n    for i in range(box_cut_num):\n        # # method1: align center\n        # box_ratio_log_center = np.log10(p2) + 0.5 * box_ratio_log_window + i * box_ratio_log_window\n        # box_ratio_center = np.power(10, box_ratio_log_center)\n        # scale = anchor_center / box_ratio_center\n        # method2: align left low\n        box_ratio_low = np.power(10, np.log10(box_ratio_p2) + i * 
box_ratio_log_window)\n        image_target_size = anchor_better_low / box_ratio_low\n\n        image_target_sizes.append(int(image_target_size))\n        valid_ratio = anchor_range / image_target_size\n        valid_ratios.append(valid_ratio.tolist())\n\n        logger.info(\"Box cut {}\".format(i))\n        logger.info(\"box_ratio_low: {}\".format(box_ratio_low))\n        logger.info(\"image_target_size: {}\".format(image_target_size))\n        logger.info(\"valid_ratio: {}\".format(valid_ratio))\n\n    return image_target_sizes, valid_ratios\n\n\ndef get_valid_ranges(valid_ratios):\n    \"\"\"\n    get_valid_box_ratios_range\n    :param valid_ratios:\n    :return:\n    \"\"\"\n    valid_ranges = []\n    if len(valid_ratios) == 1:\n        valid_ranges.append([-1, -1])\n    else:\n        for i, vratio in enumerate(valid_ratios):\n            if i == 0:\n                valid_ranges.append([-1, vratio[1]])\n            elif i == len(valid_ratios) - 1:\n                valid_ranges.append([vratio[0], -1])\n            else:\n                valid_ranges.append(vratio)\n    return valid_ranges\n\n\ndef get_percentile(a_array, low_percent, high_percent):\n    \"\"\"\n    get_percentile\n    :param low_percent:\n    :param high_percent:\n    :return:\n    \"\"\"\n    array_p0 = min(a_array)\n    array_p100 = max(a_array)\n    array_plow = np.percentile(a_array, low_percent)\n    array_phigh = np.percentile(a_array, high_percent)\n    logger.info(\n        \"array_percentile(0): {},array_percentile low({}): {}, \"\n        \"array_percentile high({}): {}, array_percentile 100: {}\".format(\n            array_p0, low_percent, array_plow, high_percent, array_phigh, array_p100))\n    return array_plow, array_phigh\n\n\ndef sniper_anno_stats(architecture, anno_file):\n    \"\"\"\n    sniper_anno_stats\n    :param anno_file:\n    :return:\n    \"\"\"\n\n    anchor_range, default_crop_size, default_max_bbox_size = get_default_params(architecture)\n\n    box_ratios = get_box_ratios(anno_file)\n\n    box_ratio_p8, box_ratio_p92 = get_percentile(box_ratios, 8, 92)\n\n    image_target_sizes, valid_box_ratios = get_target_size_and_valid_box_ratios(anchor_range, box_ratio_p8, box_ratio_p92)\n\n    valid_ranges = get_valid_ranges(valid_box_ratios)\n\n    crop_size = min(default_crop_size, min([item for item in image_target_sizes]))\n    crop_size = int(np.ceil(crop_size / 32.) * 32.)\n    crop_stride = max(min(default_max_bbox_size, crop_size), crop_size - default_max_bbox_size)\n    logger.info(\"Result\".center(100, '-'))\n    logger.info(\"image_target_sizes: {}\".format(image_target_sizes))\n    logger.info(\"valid_box_ratio_ranges: {}\".format(valid_ranges))\n    logger.info(\"chip_target_size: {}, chip_target_stride: {}\".format(crop_size, crop_stride))\n\n    return {\n        \"image_target_sizes\": image_target_sizes,\n        \"valid_box_ratio_ranges\": valid_ranges,\n        \"chip_target_size\": crop_size,\n        \"chip_target_stride\": crop_stride\n    }\n\nif __name__==\"__main__\":\n    architecture, anno_file = sys.argv[1], sys.argv[2]\n    sniper_anno_stats(architecture, anno_file)\n"
  },
  {
    "path": "tools/train.py",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\n\n# add python path of PaddleDetection to sys.path\nparent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2)))\nsys.path.insert(0, parent_path)\n\n# ignore warning log\nimport warnings\nwarnings.filterwarnings('ignore')\n\nimport cv2\ncv2.setNumThreads(0)\ncv2.ocl.setUseOpenCL(False)\n\nimport paddle\nfrom ppdet.core.workspace import load_config, merge_config\n\nfrom ppdet.engine import Trainer, TrainerCot, init_parallel_env, set_random_seed, init_fleet_env\nfrom ppdet.engine.trainer_ssod import Trainer_DenseTeacher, Trainer_ARSL, Trainer_Semi_RTDETR\n\nfrom ppdet.slim import build_slim_model\n\nfrom ppdet.utils.cli import ArgsParser, merge_args\nimport ppdet.utils.check as check\nfrom ppdet.utils.logger import setup_logger\nlogger = setup_logger('train')\n\n\ndef parse_args():\n    parser = ArgsParser()\n    parser.add_argument(\n        \"--eval\",\n        action='store_true',\n        default=False,\n        help=\"Whether to perform evaluation in train\")\n    parser.add_argument(\n        \"-r\", \"--resume\", default=None, help=\"weights path for resume\")\n    parser.add_argument(\n        \"--slim_config\",\n        default=None,\n        type=str,\n        help=\"Configuration file of slim method.\")\n    parser.add_argument(\n        \"--enable_ce\",\n        type=bool,\n        default=False,\n        help=\"If set True, enable continuous evaluation job.\"\n        \"This flag is only used for internal test.\")\n    parser.add_argument(\n        \"--amp\",\n        action='store_true',\n        default=False,\n        help=\"Enable auto mixed precision training.\")\n    parser.add_argument(\n        \"--fleet\", action='store_true', default=False, help=\"Use fleet or not\")\n    parser.add_argument(\n        \"--use_vdl\",\n        type=bool,\n        default=False,\n        help=\"whether to record the data to VisualDL.\")\n    parser.add_argument(\n        '--vdl_log_dir',\n        type=str,\n        default=\"vdl_log_dir/scalar\",\n        help='VisualDL logging directory for scalar.')\n    parser.add_argument(\n        \"--use_wandb\",\n        type=bool,\n        default=False,\n        help=\"whether to record the data to wandb.\")\n    parser.add_argument(\n        '--save_prediction_only',\n        action='store_true',\n        default=False,\n        help='Whether to save the evaluation results only')\n    parser.add_argument(\n        '--profiler_options',\n        type=str,\n        default=None,\n        help=\"The option of profiler, which should be in \"\n        \"format \\\"key1=value1;key2=value2;key3=value3\\\".\"\n        \"please see ppdet/utils/profiler.py for detail.\")\n    parser.add_argument(\n        '--save_proposals',\n        action='store_true',\n        default=False,\n  
      help='Whether to save the train proposals')\n    parser.add_argument(\n        '--proposals_path',\n        type=str,\n        default=\"sniper/proposals.json\",\n        help='Train proposals directory')\n    parser.add_argument(\n        \"--to_static\",\n        action='store_true',\n        default=False,\n        help=\"Enable dy2st to train.\")\n\n    args = parser.parse_args()\n    return args\n\n\ndef run(FLAGS, cfg):\n    # init fleet environment\n    if cfg.fleet:\n        init_fleet_env(cfg.get('find_unused_parameters', False))\n    else:\n        # init parallel environment if nranks > 1\n        init_parallel_env()\n\n    if FLAGS.enable_ce:\n        set_random_seed(0)\n\n    # build trainer\n    ssod_method = cfg.get('ssod_method', None)\n    if ssod_method is not None:\n        if ssod_method == 'DenseTeacher':\n            trainer = Trainer_DenseTeacher(cfg, mode='train')\n        elif ssod_method == 'ARSL':\n            trainer = Trainer_ARSL(cfg, mode='train')\n        elif ssod_method == 'Semi_RTDETR':\n            trainer = Trainer_Semi_RTDETR(cfg, mode='train')\n        else:\n            raise ValueError(\n                \"Semi-Supervised Object Detection only no support this method.\")\n    elif cfg.get('use_cot', False):\n        trainer = TrainerCot(cfg, mode='train')\n    else:\n        trainer = Trainer(cfg, mode='train')\n\n    # load weights\n    if FLAGS.resume is not None:\n        trainer.resume_weights(FLAGS.resume)\n    elif 'pretrain_student_weights' in cfg and 'pretrain_teacher_weights' in cfg \\\n            and cfg.pretrain_teacher_weights and cfg.pretrain_student_weights:\n        trainer.load_semi_weights(cfg.pretrain_teacher_weights,\n                                  cfg.pretrain_student_weights)\n    elif 'pretrain_weights' in cfg and cfg.pretrain_weights:\n        trainer.load_weights(cfg.pretrain_weights)\n\n    # training\n    trainer.train(FLAGS.eval)\n\n\ndef main():\n    FLAGS = parse_args()\n    cfg = load_config(FLAGS.config)\n    merge_args(cfg, FLAGS)\n    merge_config(FLAGS.opt)\n\n    # disable npu in config by default\n    if 'use_npu' not in cfg:\n        cfg.use_npu = False\n\n    # disable xpu in config by default\n    if 'use_xpu' not in cfg:\n        cfg.use_xpu = False\n\n    if 'use_gpu' not in cfg:\n        cfg.use_gpu = False\n\n    # disable mlu in config by default\n    if 'use_mlu' not in cfg:\n        cfg.use_mlu = False\n\n    if cfg.use_gpu:\n        place = paddle.set_device('gpu')\n    elif cfg.use_npu:\n        place = paddle.set_device('npu')\n    elif cfg.use_xpu:\n        place = paddle.set_device('xpu')\n    elif cfg.use_mlu:\n        place = paddle.set_device('mlu')\n    else:\n        place = paddle.set_device('cpu')\n\n    if FLAGS.slim_config:\n        cfg = build_slim_model(cfg, FLAGS.slim_config)\n\n    # FIXME: Temporarily solve the priority problem of FLAGS.opt\n    merge_config(FLAGS.opt)\n    check.check_config(cfg)\n    check.check_gpu(cfg.use_gpu)\n    check.check_npu(cfg.use_npu)\n    check.check_xpu(cfg.use_xpu)\n    check.check_mlu(cfg.use_mlu)\n    check.check_version()\n\n    run(FLAGS, cfg)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "tools/x2coco.py",
    "content": "#!/usr/bin/env python\n# coding: utf-8\n# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport argparse\nimport glob\nimport json\nimport os\nimport os.path as osp\nimport shutil\nimport xml.etree.ElementTree as ET\n\nimport numpy as np\nimport PIL.ImageDraw\nfrom tqdm import tqdm\nimport cv2\n\nlabel_to_num = {}\ncategories_list = []\nlabels_list = []\n\n\nclass MyEncoder(json.JSONEncoder):\n    def default(self, obj):\n        if isinstance(obj, np.integer):\n            return int(obj)\n        elif isinstance(obj, np.floating):\n            return float(obj)\n        elif isinstance(obj, np.ndarray):\n            return obj.tolist()\n        else:\n            return super(MyEncoder, self).default(obj)\n\n\ndef images_labelme(data, num):\n    image = {}\n    image['height'] = data['imageHeight']\n    image['width'] = data['imageWidth']\n    image['id'] = num + 1\n    if '\\\\' in data['imagePath']:\n        image['file_name'] = data['imagePath'].split('\\\\')[-1]\n    else:\n        image['file_name'] = data['imagePath'].split('/')[-1]\n    return image\n\n\ndef images_cityscape(data, num, img_file):\n    image = {}\n    image['height'] = data['imgHeight']\n    image['width'] = data['imgWidth']\n    image['id'] = num + 1\n    image['file_name'] = img_file\n    return image\n\n\ndef categories(label, labels_list):\n    category = {}\n    category['supercategory'] = 'component'\n    category['id'] = len(labels_list) + 1\n    category['name'] = label\n    return category\n\n\ndef annotations_rectangle(points, label, image_num, object_num, label_to_num):\n    annotation = {}\n    seg_points = np.asarray(points).copy()\n    seg_points[1, :] = np.asarray(points)[2, :]\n    seg_points[2, :] = np.asarray(points)[1, :]\n    annotation['segmentation'] = [list(seg_points.flatten())]\n    annotation['iscrowd'] = 0\n    annotation['image_id'] = image_num + 1\n    annotation['bbox'] = list(\n        map(float, [\n            points[0][0], points[0][1], points[1][0] - points[0][0], points[1][\n                1] - points[0][1]\n        ]))\n    annotation['area'] = annotation['bbox'][2] * annotation['bbox'][3]\n    annotation['category_id'] = label_to_num[label]\n    annotation['id'] = object_num + 1\n    return annotation\n\n\ndef annotations_polygon(height, width, points, label, image_num, object_num,\n                        label_to_num):\n    annotation = {}\n    annotation['segmentation'] = [list(np.asarray(points).flatten())]\n    annotation['iscrowd'] = 0\n    annotation['image_id'] = image_num + 1\n    annotation['bbox'] = list(map(float, get_bbox(height, width, points)))\n    annotation['area'] = annotation['bbox'][2] * annotation['bbox'][3]\n    annotation['category_id'] = label_to_num[label]\n    annotation['id'] = object_num + 1\n    return annotation\n\n\ndef get_bbox(height, width, points):\n    polygons = points\n    mask = np.zeros([height, width], dtype=np.uint8)\n    mask = 
PIL.Image.fromarray(mask)\n    xy = list(map(tuple, polygons))\n    PIL.ImageDraw.Draw(mask).polygon(xy=xy, outline=1, fill=1)\n    mask = np.array(mask, dtype=bool)\n    index = np.argwhere(mask == 1)\n    rows = index[:, 0]\n    clos = index[:, 1]\n    left_top_r = np.min(rows)\n    left_top_c = np.min(clos)\n    right_bottom_r = np.max(rows)\n    right_bottom_c = np.max(clos)\n    return [\n        left_top_c, left_top_r, right_bottom_c - left_top_c,\n        right_bottom_r - left_top_r\n    ]\n\n\ndef deal_json(ds_type, img_path, json_path):\n    data_coco = {}\n    images_list = []\n    annotations_list = []\n    image_num = -1\n    object_num = -1\n    for img_file in os.listdir(img_path):\n        img_label = os.path.splitext(img_file)[0]\n        if img_file.split('.')[\n                -1] not in ['bmp', 'jpg', 'jpeg', 'png', 'JPEG', 'JPG', 'PNG']:\n            continue\n        label_file = osp.join(json_path, img_label + '.json')\n        print('Generating dataset from:', label_file)\n        image_num = image_num + 1\n        with open(label_file) as f:\n            data = json.load(f)\n            if ds_type == 'labelme':\n                images_list.append(images_labelme(data, image_num))\n            elif ds_type == 'cityscape':\n                images_list.append(images_cityscape(data, image_num, img_file))\n            if ds_type == 'labelme':\n                for shapes in data['shapes']:\n                    object_num = object_num + 1\n                    label = shapes['label']\n                    if label not in labels_list:\n                        categories_list.append(categories(label, labels_list))\n                        labels_list.append(label)\n                        label_to_num[label] = len(labels_list)\n                    p_type = shapes['shape_type']\n                    if p_type == 'polygon':\n                        points = shapes['points']\n                        annotations_list.append(\n                            annotations_polygon(data['imageHeight'], data[\n                                'imageWidth'], points, label, image_num,\n                                                object_num, label_to_num))\n\n                    if p_type == 'rectangle':\n                        (x1, y1), (x2, y2) = shapes['points']\n                        x1, x2 = sorted([x1, x2])\n                        y1, y2 = sorted([y1, y2])\n                        points = [[x1, y1], [x2, y2], [x1, y2], [x2, y1]]\n                        annotations_list.append(\n                            annotations_rectangle(points, label, image_num,\n                                                  object_num, label_to_num))\n            elif ds_type == 'cityscape':\n                for shapes in data['objects']:\n                    object_num = object_num + 1\n                    label = shapes['label']\n                    if label not in labels_list:\n                        categories_list.append(categories(label, labels_list))\n                        labels_list.append(label)\n                        label_to_num[label] = len(labels_list)\n                    points = shapes['polygon']\n                    annotations_list.append(\n                        annotations_polygon(data['imgHeight'], data[\n                            'imgWidth'], points, label, image_num, object_num,\n                                            label_to_num))\n    data_coco['images'] = images_list\n    data_coco['categories'] = categories_list\n    data_coco['annotations'] = 
annotations_list\n    return data_coco\n\n\ndef voc_get_label_anno(ann_dir_path, ann_ids_path, labels_path):\n    with open(labels_path, 'r') as f:\n        labels_str = f.read().split()\n    labels_ids = list(range(1, len(labels_str) + 1))\n\n    with open(ann_ids_path, 'r') as f:\n        ann_ids = [lin.strip().split(' ')[-1] for lin in f.readlines()]\n\n    ann_paths = []\n    for aid in ann_ids:\n        if aid.endswith('xml'):\n            ann_path = os.path.join(ann_dir_path, aid)\n        else:\n            ann_path = os.path.join(ann_dir_path, aid + '.xml')\n        ann_paths.append(ann_path)\n\n    return dict(zip(labels_str, labels_ids)), ann_paths\n\n\ndef voc_get_image_info(annotation_root, im_id):\n    filename = annotation_root.findtext('filename')\n    assert filename is not None\n    img_name = os.path.basename(filename)\n\n    size = annotation_root.find('size')\n    width = float(size.findtext('width'))\n    height = float(size.findtext('height'))\n\n    image_info = {\n        'file_name': filename,\n        'height': height,\n        'width': width,\n        'id': im_id\n    }\n    return image_info\n\n\ndef voc_get_coco_annotation(obj, label2id):\n    label = obj.findtext('name')\n    assert label in label2id, \"label is not in label2id.\"\n    category_id = label2id[label]\n    bndbox = obj.find('bndbox')\n    xmin = float(bndbox.findtext('xmin'))\n    ymin = float(bndbox.findtext('ymin'))\n    xmax = float(bndbox.findtext('xmax'))\n    ymax = float(bndbox.findtext('ymax'))\n    assert xmax > xmin and ymax > ymin, \"Box size error.\"\n    o_width = xmax - xmin\n    o_height = ymax - ymin\n    anno = {\n        'area': o_width * o_height,\n        'iscrowd': 0,\n        'bbox': [xmin, ymin, o_width, o_height],\n        'category_id': category_id,\n        'ignore': 0,\n    }\n    return anno\n\n\ndef voc_xmls_to_cocojson(annotation_paths, label2id, output_dir, output_file):\n    output_json_dict = {\n        \"images\": [],\n        \"type\": \"instances\",\n        \"annotations\": [],\n        \"categories\": []\n    }\n    bnd_id = 1  # bounding box start id\n    im_id = 0\n    print('Start converting !')\n    for a_path in tqdm(annotation_paths):\n        # Read annotation xml\n        ann_tree = ET.parse(a_path)\n        ann_root = ann_tree.getroot()\n\n        img_info = voc_get_image_info(ann_root, im_id)\n        output_json_dict['images'].append(img_info)\n\n        for obj in ann_root.findall('object'):\n            ann = voc_get_coco_annotation(obj=obj, label2id=label2id)\n            ann.update({'image_id': im_id, 'id': bnd_id})\n            output_json_dict['annotations'].append(ann)\n            bnd_id = bnd_id + 1\n        im_id += 1\n\n    for label, label_id in label2id.items():\n        category_info = {'supercategory': 'none', 'id': label_id, 'name': label}\n        output_json_dict['categories'].append(category_info)\n    output_file = os.path.join(output_dir, output_file)\n    with open(output_file, 'w') as f:\n        output_json = json.dumps(output_json_dict)\n        f.write(output_json)\n\n\ndef widerface_to_cocojson(root_path):\n    train_gt_txt = os.path.join(root_path, \"wider_face_split\", \"wider_face_train_bbx_gt.txt\")\n    val_gt_txt = os.path.join(root_path, \"wider_face_split\", \"wider_face_val_bbx_gt.txt\")\n    train_img_dir = os.path.join(root_path, \"WIDER_train\", \"images\")\n    val_img_dir = os.path.join(root_path, \"WIDER_val\", \"images\")\n    assert train_gt_txt\n    assert val_gt_txt\n    assert train_img_dir\n    assert 
val_img_dir\n    save_path = os.path.join(root_path, \"widerface_train.json\")\n    widerface_convert(train_gt_txt, train_img_dir, save_path)\n    print(\"Wider Face train dataset converted successfully, the json path: {}\".format(save_path))\n    save_path = os.path.join(root_path, \"widerface_val.json\")\n    widerface_convert(val_gt_txt, val_img_dir, save_path)\n    print(\"Wider Face val dataset converted successfully, the json path: {}\".format(save_path))\n\n\ndef widerface_convert(gt_txt, img_dir, save_path):\n    output_json_dict = {\n        \"images\": [],\n        \"type\": \"instances\",\n        \"annotations\": [],\n        \"categories\": [{'supercategory': 'none', 'id': 0, 'name': \"human_face\"}]\n    }\n    bnd_id = 1  # bounding box start id\n    im_id = 0\n    print('Start converting !')\n    with open(gt_txt) as fd:\n        lines = fd.readlines()\n\n    i = 0\n    while i < len(lines):\n        image_name = lines[i].strip()\n        bbox_num = int(lines[i + 1].strip())\n        i += 2\n        img_info = get_widerface_image_info(img_dir, image_name, im_id)\n        if img_info:\n            output_json_dict[\"images\"].append(img_info)\n            for j in range(i, i + bbox_num):\n                anno = get_widerface_ann_info(lines[j])\n                anno.update({'image_id': im_id, 'id': bnd_id})\n                output_json_dict['annotations'].append(anno)\n                bnd_id += 1\n        else:\n            print(\"The image does not exist: {}\".format(os.path.join(img_dir, image_name)))\n        bbox_num = 1 if bbox_num == 0 else bbox_num\n        i += bbox_num\n        im_id += 1\n    with open(save_path, 'w') as f:\n        output_json = json.dumps(output_json_dict)\n        f.write(output_json)\n\n\ndef get_widerface_image_info(img_root, img_relative_path, img_id):\n    image_info = {}\n    save_path = os.path.join(img_root, img_relative_path)\n    if os.path.exists(save_path):\n        img = cv2.imread(save_path)\n        image_info[\"file_name\"] = os.path.join(os.path.basename(\n            os.path.dirname(img_root)), os.path.basename(img_root),\n            img_relative_path)\n        image_info[\"height\"] = img.shape[0]\n        image_info[\"width\"] = img.shape[1]\n        image_info[\"id\"] = img_id\n    return image_info\n\n\ndef get_widerface_ann_info(info):\n    info = [int(x) for x in info.strip().split()]\n    anno = {\n        'area': info[2] * info[3],\n        'iscrowd': 0,\n        'bbox': [info[0], info[1], info[2], info[3]],\n        'category_id': 0,\n        'ignore': 0,\n        'blur': info[4],\n        'expression': info[5],\n        'illumination': info[6],\n        'invalid': info[7],\n        'occlusion': info[8],\n        'pose': info[9]\n    }\n    return anno\n\n\ndef main():\n    parser = argparse.ArgumentParser(\n        formatter_class=argparse.ArgumentDefaultsHelpFormatter)\n    parser.add_argument(\n        '--dataset_type',\n        help='the type of dataset, can be `voc`, `widerface`, `labelme` or `cityscape`')\n    parser.add_argument('--json_input_dir', help='input annotated directory')\n    parser.add_argument('--image_input_dir', help='image directory')\n    parser.add_argument(\n        '--output_dir', help='output dataset directory', default='./')\n    parser.add_argument(\n        '--train_proportion',\n        help='the proportion of train dataset',\n        type=float,\n        default=1.0)\n    parser.add_argument(\n        '--val_proportion',\n        help='the proportion of validation dataset',\n        type=float,\n        default=0.0)\n    parser.add_argument(\n        '--test_proportion',\n        help='the proportion of test dataset',\n        type=float,\n        default=0.0)\n    parser.add_argument(\n        '--voc_anno_dir',\n        help='In Voc format dataset, path to annotation files directory.',\n        type=str,\n        default=None)\n    parser.add_argument(\n        '--voc_anno_list',\n        help='In Voc format dataset, path to annotation files ids list.',\n        type=str,\n        default=None)\n    parser.add_argument(\n        '--voc_label_list',\n        help='In Voc format dataset, path to label list. The content of each line is a category.',\n        type=str,\n        default=None)\n    parser.add_argument(\n        '--voc_out_name',\n        type=str,\n        default='voc.json',\n        help='In Voc format dataset, path to output json file')\n    parser.add_argument(\n        '--widerface_root_dir',\n        help='The root_path for wider face dataset, which contains `wider_face_split`, `WIDER_train` and `WIDER_val`. And the json file will be saved in this path',\n        type=str,\n        default=None)\n    args = parser.parse_args()\n    try:\n        assert args.dataset_type in ['voc', 'labelme', 'cityscape', 'widerface']\n    except AssertionError as e:\n        print(\n            'Only voc, labelme, cityscape and widerface datasets are supported!')\n        os._exit(0)\n\n    if args.dataset_type == 'voc':\n        assert args.voc_anno_dir and args.voc_anno_list and args.voc_label_list\n        label2id, ann_paths = voc_get_label_anno(\n            args.voc_anno_dir, args.voc_anno_list, args.voc_label_list)\n        voc_xmls_to_cocojson(\n            annotation_paths=ann_paths,\n            label2id=label2id,\n            output_dir=args.output_dir,\n            output_file=args.voc_out_name)\n    elif args.dataset_type == \"widerface\":\n        assert args.widerface_root_dir\n        widerface_to_cocojson(args.widerface_root_dir)\n    else:\n        try:\n            assert os.path.exists(args.json_input_dir)\n        except AssertionError as e:\n            print('The json folder does not exist!')\n            os._exit(0)\n        try:\n            assert os.path.exists(args.image_input_dir)\n        except AssertionError as e:\n            print('The image folder does not exist!')\n            os._exit(0)\n        try:\n            assert abs(args.train_proportion + args.val_proportion \\\n                    + args.test_proportion - 1.0) < 1e-5\n        except AssertionError as e:\n            print(\n                'The sum of proportions of training, validation and test datasets must be 1!'\n            )\n            os._exit(0)\n\n        # Allocate the dataset.\n        total_num = len(glob.glob(osp.join(args.json_input_dir, '*.json')))\n        if args.train_proportion != 0:\n            train_num = int(total_num * args.train_proportion)\n            out_dir = args.output_dir + '/train'\n            if not os.path.exists(out_dir):\n                os.makedirs(out_dir)\n        else:\n            train_num = 0\n        if args.val_proportion == 0.0:\n            val_num = 0\n            test_num = total_num - train_num\n            out_dir = args.output_dir + '/test'\n            if args.test_proportion != 0.0 and not os.path.exists(out_dir):\n                os.makedirs(out_dir)\n        else:\n            val_num = int(total_num * args.val_proportion)\n            test_num = total_num - train_num - val_num\n            val_out_dir = args.output_dir + '/val'\n            if not os.path.exists(val_out_dir):\n                os.makedirs(val_out_dir)\n            test_out_dir = args.output_dir + '/test'\n            if args.test_proportion != 0.0 and not os.path.exists(test_out_dir):\n                os.makedirs(test_out_dir)\n        count = 1\n        for img_name in os.listdir(args.image_input_dir):\n            if count <= train_num:\n                if osp.exists(args.output_dir + '/train/'):\n                    shutil.copyfile(\n                        osp.join(args.image_input_dir, img_name),\n                        osp.join(args.output_dir + '/train/', img_name))\n            else:\n                if count <= train_num + val_num:\n                    if osp.exists(args.output_dir + '/val/'):\n                        shutil.copyfile(\n                            osp.join(args.image_input_dir, img_name),\n                            osp.join(args.output_dir + '/val/', img_name))\n                else:\n                    if osp.exists(args.output_dir + '/test/'):\n                        shutil.copyfile(\n                            osp.join(args.image_input_dir, img_name),\n                            osp.join(args.output_dir + '/test/', img_name))\n            count = count + 1\n\n        # Deal with the json files.\n        if not os.path.exists(args.output_dir + '/annotations'):\n            os.makedirs(args.output_dir + '/annotations')\n        if args.train_proportion != 0:\n            train_data_coco = deal_json(args.dataset_type,\n                                        args.output_dir + '/train',\n                                        args.json_input_dir)\n            train_json_path = osp.join(args.output_dir + '/annotations',\n                                       'instance_train.json')\n            json.dump(\n                train_data_coco,\n                open(train_json_path, 'w'),\n                indent=4,\n                cls=MyEncoder)\n        if args.val_proportion != 0:\n            val_data_coco = deal_json(args.dataset_type,\n                                      args.output_dir + '/val',\n                                      args.json_input_dir)\n            val_json_path = osp.join(args.output_dir + '/annotations',\n                                     'instance_val.json')\n            json.dump(\n                val_data_coco,\n                open(val_json_path, 'w'),\n                indent=4,\n                cls=MyEncoder)\n        if args.test_proportion != 0:\n            test_data_coco = deal_json(args.dataset_type,\n                                       args.output_dir + '/test',\n                                       args.json_input_dir)\n            test_json_path = osp.join(args.output_dir + '/annotations',\n                                      'instance_test.json')\n            json.dump(\n                test_data_coco,\n                open(test_json_path, 'w'),\n                indent=4,\n                cls=MyEncoder)\n\n\nif __name__ == '__main__':\n    main()\n"
  }
]