Repository: open-mmlab/mmcv
Branch: main
Commit: a8073c74bf83
Files: 857
Total size: 4.7 MB

Directory structure:
gitextract_pswz5bz1/

├── .dev_scripts/
│   └── check_installation.py
├── .dockerignore
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── 1-bug-report.yml
│   │   ├── 2-feature_request.yml
│   │   ├── 3-documentation.yml
│   │   └── config.yml
│   ├── pull_request_template.md
│   └── workflows/
│       ├── build_macos_wheel.yml
│       ├── lint.yml
│       ├── merge_stage_test.yml
│       ├── pr_stage_test.yml
│       └── publish-to-pypi.yml
├── .gitignore
├── .pre-commit-config-zh-cn.yaml
├── .pre-commit-config.yaml
├── .readthedocs.yml
├── CITATION.cff
├── CONTRIBUTING.md
├── CONTRIBUTING_zh-CN.md
├── LICENSE
├── LICENSES.md
├── MANIFEST.in
├── README.md
├── README_zh-CN.md
├── TERMINOLOGY.md
├── docker/
│   ├── README.md
│   ├── dev/
│   │   └── Dockerfile
│   └── release/
│       └── Dockerfile
├── docs/
│   ├── en/
│   │   ├── Makefile
│   │   ├── _static/
│   │   │   ├── css/
│   │   │   │   └── readthedocs.css
│   │   │   └── version.json
│   │   ├── _templates/
│   │   │   └── classtemplate.rst
│   │   ├── api/
│   │   │   ├── arraymisc.rst
│   │   │   ├── cnn.rst
│   │   │   ├── image.rst
│   │   │   ├── ops.rst
│   │   │   ├── transforms.rst
│   │   │   ├── utils.rst
│   │   │   ├── video.rst
│   │   │   └── visualization.rst
│   │   ├── community/
│   │   │   ├── contributing.md
│   │   │   └── pr.md
│   │   ├── compatibility.md
│   │   ├── conf.py
│   │   ├── deployment/
│   │   │   └── mmcv_ops_definition.md
│   │   ├── docutils.conf
│   │   ├── faq.md
│   │   ├── get_started/
│   │   │   ├── api_reference.md
│   │   │   ├── build.md
│   │   │   ├── installation.md
│   │   │   ├── introduction.md
│   │   │   └── previous_versions.md
│   │   ├── index.rst
│   │   ├── make.bat
│   │   ├── switch_language.md
│   │   └── understand_mmcv/
│   │       ├── cnn.md
│   │       ├── data_process.md
│   │       ├── data_transform.md
│   │       ├── ops.md
│   │       └── visualization.md
│   └── zh_cn/
│       ├── Makefile
│       ├── _static/
│       │   ├── css/
│       │   │   └── readthedocs.css
│       │   └── version.json
│       ├── _templates/
│       │   └── classtemplate.rst
│       ├── api/
│       │   ├── arraymisc.rst
│       │   ├── cnn.rst
│       │   ├── image.rst
│       │   ├── ops.rst
│       │   ├── transforms.rst
│       │   ├── utils.rst
│       │   ├── video.rst
│       │   └── visualization.rst
│       ├── community/
│       │   ├── code_style.md
│       │   ├── contributing.md
│       │   └── pr.md
│       ├── compatibility.md
│       ├── conf.py
│       ├── docutils.conf
│       ├── faq.md
│       ├── get_started/
│       │   ├── api_reference.md
│       │   ├── article.md
│       │   ├── build.md
│       │   ├── installation.md
│       │   ├── introduction.md
│       │   └── previous_versions.md
│       ├── index.rst
│       ├── make.bat
│       ├── switch_language.md
│       └── understand_mmcv/
│           ├── cnn.md
│           ├── data_process.md
│           ├── data_transform.md
│           ├── ops.md
│           └── visualization.md
├── mmcv/
│   ├── __init__.py
│   ├── arraymisc/
│   │   ├── __init__.py
│   │   └── quantization.py
│   ├── cnn/
│   │   ├── __init__.py
│   │   ├── alexnet.py
│   │   ├── bricks/
│   │   │   ├── __init__.py
│   │   │   ├── activation.py
│   │   │   ├── context_block.py
│   │   │   ├── conv.py
│   │   │   ├── conv2d_adaptive_padding.py
│   │   │   ├── conv_module.py
│   │   │   ├── conv_ws.py
│   │   │   ├── depthwise_separable_conv_module.py
│   │   │   ├── drop.py
│   │   │   ├── generalized_attention.py
│   │   │   ├── hsigmoid.py
│   │   │   ├── hswish.py
│   │   │   ├── non_local.py
│   │   │   ├── norm.py
│   │   │   ├── padding.py
│   │   │   ├── plugin.py
│   │   │   ├── scale.py
│   │   │   ├── swish.py
│   │   │   ├── transformer.py
│   │   │   ├── upsample.py
│   │   │   └── wrappers.py
│   │   ├── resnet.py
│   │   ├── rfsearch/
│   │   │   ├── __init__.py
│   │   │   ├── operator.py
│   │   │   ├── search.py
│   │   │   └── utils.py
│   │   ├── utils/
│   │   │   ├── __init__.py
│   │   │   ├── flops_counter.py
│   │   │   └── fuse_conv_bn.py
│   │   └── vgg.py
│   ├── image/
│   │   ├── __init__.py
│   │   ├── colorspace.py
│   │   ├── geometric.py
│   │   ├── io.py
│   │   ├── misc.py
│   │   └── photometric.py
│   ├── ops/
│   │   ├── __init__.py
│   │   ├── active_rotated_filter.py
│   │   ├── assign_score_withk.py
│   │   ├── ball_query.py
│   │   ├── bbox.py
│   │   ├── bezier_align.py
│   │   ├── bias_act.py
│   │   ├── border_align.py
│   │   ├── box_iou_quadri.py
│   │   ├── box_iou_rotated.py
│   │   ├── carafe.py
│   │   ├── cc_attention.py
│   │   ├── chamfer_distance.py
│   │   ├── contour_expand.py
│   │   ├── conv2d_gradfix.py
│   │   ├── convex_iou.py
│   │   ├── corner_pool.py
│   │   ├── correlation.py
│   │   ├── csrc/
│   │   │   ├── README.md
│   │   │   ├── common/
│   │   │   │   ├── box_iou_rotated_utils.hpp
│   │   │   │   ├── cuda/
│   │   │   │   │   ├── active_rotated_filter_cuda_kernel.cuh
│   │   │   │   │   ├── assign_score_withk_cuda_kernel.cuh
│   │   │   │   │   ├── ball_query_cuda_kernel.cuh
│   │   │   │   │   ├── bbox_overlaps_cuda_kernel.cuh
│   │   │   │   │   ├── bezier_align_cuda_kernel.cuh
│   │   │   │   │   ├── border_align_cuda_kernel.cuh
│   │   │   │   │   ├── box_iou_quadri_cuda.cuh
│   │   │   │   │   ├── box_iou_rotated_cuda.cuh
│   │   │   │   │   ├── carafe_cuda_kernel.cuh
│   │   │   │   │   ├── carafe_naive_cuda_kernel.cuh
│   │   │   │   │   ├── chamfer_distance_cuda_kernel.cuh
│   │   │   │   │   ├── common_cuda_helper.hpp
│   │   │   │   │   ├── convex_iou_cuda_kernel.cuh
│   │   │   │   │   ├── correlation_cuda.cuh
│   │   │   │   │   ├── deform_conv_cuda_kernel.cuh
│   │   │   │   │   ├── deform_roi_pool_cuda_kernel.cuh
│   │   │   │   │   ├── diff_iou_rotated_cuda_kernel.cuh
│   │   │   │   │   ├── furthest_point_sample_cuda_kernel.cuh
│   │   │   │   │   ├── gather_points_cuda_kernel.cuh
│   │   │   │   │   ├── group_points_cuda_kernel.cuh
│   │   │   │   │   ├── iou3d_cuda_kernel.cuh
│   │   │   │   │   ├── knn_cuda_kernel.cuh
│   │   │   │   │   ├── masked_conv2d_cuda_kernel.cuh
│   │   │   │   │   ├── min_area_polygons_cuda.cuh
│   │   │   │   │   ├── modulated_deform_conv_cuda_kernel.cuh
│   │   │   │   │   ├── ms_deform_attn_cuda_kernel.cuh
│   │   │   │   │   ├── nms_cuda_kernel.cuh
│   │   │   │   │   ├── nms_quadri_cuda.cuh
│   │   │   │   │   ├── nms_rotated_cuda.cuh
│   │   │   │   │   ├── parrots_cudawarpfunction.cuh
│   │   │   │   │   ├── points_in_boxes_cuda_kernel.cuh
│   │   │   │   │   ├── points_in_polygons_cuda_kernel.cuh
│   │   │   │   │   ├── prroi_pool_cuda_kernel.cuh
│   │   │   │   │   ├── psamask_cuda_kernel.cuh
│   │   │   │   │   ├── riroi_align_rotated_cuda_kernel.cuh
│   │   │   │   │   ├── roi_align_cuda_kernel.cuh
│   │   │   │   │   ├── roi_align_rotated_cuda_kernel.cuh
│   │   │   │   │   ├── roi_pool_cuda_kernel.cuh
│   │   │   │   │   ├── roiaware_pool3d_cuda_kernel.cuh
│   │   │   │   │   ├── roipoint_pool3d_cuda_kernel.cuh
│   │   │   │   │   ├── rotated_feature_align_cuda_kernel.cuh
│   │   │   │   │   ├── scatter_points_cuda_kernel.cuh
│   │   │   │   │   ├── sigmoid_focal_loss_cuda_kernel.cuh
│   │   │   │   │   ├── softmax_focal_loss_cuda_kernel.cuh
│   │   │   │   │   ├── spconv/
│   │   │   │   │   │   ├── indice.cuh
│   │   │   │   │   │   └── reordering.cuh
│   │   │   │   │   ├── stack_ball_query_cuda_kernel.cuh
│   │   │   │   │   ├── stack_group_points_cuda_kernel.cuh
│   │   │   │   │   ├── sync_bn_cuda_kernel.cuh
│   │   │   │   │   ├── three_interpolate_cuda_kernel.cuh
│   │   │   │   │   ├── three_nn_cuda_kernel.cuh
│   │   │   │   │   ├── tin_shift_cuda_kernel.cuh
│   │   │   │   │   └── voxelization_cuda_kernel.cuh
│   │   │   │   ├── mlu/
│   │   │   │   │   ├── common_mlu_helper.hpp
│   │   │   │   │   ├── masked_conv2d_mlu_kernel.mlu
│   │   │   │   │   └── roi_pool_mlu_kernel.mlu
│   │   │   │   ├── mps/
│   │   │   │   │   ├── MPSDevice.h
│   │   │   │   │   ├── MPSLibrary.h
│   │   │   │   │   ├── MPSLibrary.mm
│   │   │   │   │   ├── MPSStream.h
│   │   │   │   │   └── MPSUtils.h
│   │   │   │   ├── musa/
│   │   │   │   │   ├── active_rotated_filter_musa_kernel.muh
│   │   │   │   │   ├── assign_score_withk_musa_kernel.muh
│   │   │   │   │   ├── ball_query_musa_kernel.muh
│   │   │   │   │   ├── bbox_overlaps_musa_kernel.muh
│   │   │   │   │   ├── bezier_align_musa_kernel.muh
│   │   │   │   │   ├── border_align_musa_kernel.muh
│   │   │   │   │   ├── box_iou_quadri_musa.muh
│   │   │   │   │   ├── box_iou_rotated_musa.muh
│   │   │   │   │   ├── carafe_musa_kernel.muh
│   │   │   │   │   ├── carafe_naive_musa_kernel.muh
│   │   │   │   │   ├── chamfer_distance_musa_kernel.muh
│   │   │   │   │   ├── common_musa_helper.hpp
│   │   │   │   │   ├── convex_iou_musa_kernel.muh
│   │   │   │   │   ├── correlation_musa.muh
│   │   │   │   │   ├── deform_conv_musa_kernel.muh
│   │   │   │   │   ├── deform_roi_pool_musa_kernel.muh
│   │   │   │   │   ├── diff_iou_rotated_musa_kernel.muh
│   │   │   │   │   ├── furthest_point_sample_musa_kernel.muh
│   │   │   │   │   ├── gather_points_musa_kernel.muh
│   │   │   │   │   ├── group_points_musa_kernel.muh
│   │   │   │   │   ├── iou3d_musa_kernel.muh
│   │   │   │   │   ├── knn_musa_kernel.muh
│   │   │   │   │   ├── masked_conv2d_musa_kernel.muh
│   │   │   │   │   ├── min_area_polygons_musa.muh
│   │   │   │   │   ├── modulated_deform_conv_musa_kernel.muh
│   │   │   │   │   ├── ms_deform_attn_musa_kernel.muh
│   │   │   │   │   ├── nms_musa_kernel.muh
│   │   │   │   │   ├── nms_quadri_musa.muh
│   │   │   │   │   ├── nms_rotated_musa.muh
│   │   │   │   │   ├── points_in_boxes_musa_kernel.muh
│   │   │   │   │   ├── points_in_polygons_musa_kernel.muh
│   │   │   │   │   ├── prroi_pool_musa_kernel.muh
│   │   │   │   │   ├── psamask_musa_kernel.muh
│   │   │   │   │   ├── riroi_align_rotated_musa_kernel.muh
│   │   │   │   │   ├── roi_align_musa_kernel.muh
│   │   │   │   │   ├── roi_align_rotated_musa_kernel.muh
│   │   │   │   │   ├── roi_pool_musa_kernel.muh
│   │   │   │   │   ├── roiaware_pool3d_musa_kernel.muh
│   │   │   │   │   ├── roipoint_pool3d_musa_kernel.muh
│   │   │   │   │   ├── rotated_feature_align_musa_kernel.muh
│   │   │   │   │   ├── scatter_points_musa_kernel.muh
│   │   │   │   │   ├── sigmoid_focal_loss_musa_kernel.muh
│   │   │   │   │   ├── softmax_focal_loss_musa_kernel.muh
│   │   │   │   │   ├── spconv/
│   │   │   │   │   │   ├── indice.muh
│   │   │   │   │   │   └── reordering.muh
│   │   │   │   │   ├── stack_ball_query_musa_kernel.muh
│   │   │   │   │   ├── stack_group_points_musa_kernel.muh
│   │   │   │   │   ├── sync_bn_musa_kernel.muh
│   │   │   │   │   ├── three_interpolate_musa_kernel.muh
│   │   │   │   │   ├── three_nn_musa_kernel.muh
│   │   │   │   │   ├── tin_shift_musa_kernel.muh
│   │   │   │   │   └── voxelization_musa_kernel.muh
│   │   │   │   ├── parrots_cpp_helper.hpp
│   │   │   │   ├── parrots_cuda_helper.hpp
│   │   │   │   ├── pytorch_cpp_helper.hpp
│   │   │   │   ├── pytorch_cuda_helper.hpp
│   │   │   │   ├── pytorch_device_registry.hpp
│   │   │   │   ├── pytorch_mlu_helper.hpp
│   │   │   │   ├── pytorch_musa_helper.hpp
│   │   │   │   ├── pytorch_npu_helper.hpp
│   │   │   │   ├── pytorch_npu_util.hpp
│   │   │   │   └── utils/
│   │   │   │       └── spconv/
│   │   │   │           ├── paramsgrid.h
│   │   │   │           ├── prettyprint.h
│   │   │   │           ├── pybind11_utils.h
│   │   │   │           ├── spconv/
│   │   │   │           │   ├── geometry.h
│   │   │   │           │   ├── indice.h
│   │   │   │           │   ├── maxpool.h
│   │   │   │           │   ├── mp_helper.h
│   │   │   │           │   ├── point2voxel.h
│   │   │   │           │   └── reordering.h
│   │   │   │           └── tensorview/
│   │   │   │               ├── helper_kernel.cuh
│   │   │   │               ├── helper_kernel.muh
│   │   │   │               ├── helper_launch.h
│   │   │   │               └── tensorview.h
│   │   │   ├── parrots/
│   │   │   │   ├── active_rotated_filter.cpp
│   │   │   │   ├── active_rotated_filter_parrots.cpp
│   │   │   │   ├── active_rotated_filter_pytorch.h
│   │   │   │   ├── assign_score_withk.cpp
│   │   │   │   ├── assign_score_withk_parrots.cpp
│   │   │   │   ├── assign_score_withk_pytorch.h
│   │   │   │   ├── ball_query._parrots.cpp
│   │   │   │   ├── ball_query.cpp
│   │   │   │   ├── ball_query_pytorch.h
│   │   │   │   ├── bbox_overlaps.cpp
│   │   │   │   ├── bbox_overlaps_parrots.cpp
│   │   │   │   ├── bbox_overlaps_pytorch.h
│   │   │   │   ├── border_align.cpp
│   │   │   │   ├── border_align_parrots.cpp
│   │   │   │   ├── border_align_pytorch.h
│   │   │   │   ├── box_iou_rotated.cpp
│   │   │   │   ├── box_iou_rotated_parrots.cpp
│   │   │   │   ├── box_iou_rotated_pytorch.h
│   │   │   │   ├── carafe.cpp
│   │   │   │   ├── carafe_naive.cpp
│   │   │   │   ├── carafe_naive_parrots.cpp
│   │   │   │   ├── carafe_naive_pytorch.h
│   │   │   │   ├── carafe_parrots.cpp
│   │   │   │   ├── carafe_pytorch.h
│   │   │   │   ├── chamfer_distance.cpp
│   │   │   │   ├── chamfer_distance_parrots.cpp
│   │   │   │   ├── chamfer_distance_pytorch.h
│   │   │   │   ├── contour_expand.cpp
│   │   │   │   ├── contour_expand_parrots.cpp
│   │   │   │   ├── contour_expand_pytorch.h
│   │   │   │   ├── convex_iou.cpp
│   │   │   │   ├── convex_iou_parrots.cpp
│   │   │   │   ├── convex_iou_pytorch.h
│   │   │   │   ├── correlation.cpp
│   │   │   │   ├── correlation_parrots.cpp
│   │   │   │   ├── correlation_pytorch.h
│   │   │   │   ├── cudabind.cpp
│   │   │   │   ├── deform_conv.cpp
│   │   │   │   ├── deform_conv_parrots.cpp
│   │   │   │   ├── deform_conv_pytorch.h
│   │   │   │   ├── deform_roi_pool.cpp
│   │   │   │   ├── deform_roi_pool_parrots.cpp
│   │   │   │   ├── deform_roi_pool_pytorch.h
│   │   │   │   ├── diff_iou_rotated.cpp
│   │   │   │   ├── diff_iou_rotated_parrots.cpp
│   │   │   │   ├── diff_iou_rotated_pytorch.h
│   │   │   │   ├── focal_loss.cpp
│   │   │   │   ├── focal_loss_parrots.cpp
│   │   │   │   ├── focal_loss_pytorch.h
│   │   │   │   ├── furthest_point_sample.cpp
│   │   │   │   ├── furthest_point_sample_parrots.cpp
│   │   │   │   ├── furthest_point_sample_pytorch.h
│   │   │   │   ├── fused_bias_leakyrelu.cpp
│   │   │   │   ├── fused_bias_parrots.cpp
│   │   │   │   ├── gather_points.cpp
│   │   │   │   ├── gather_points_parrots.cpp
│   │   │   │   ├── gather_points_pytorch.h
│   │   │   │   ├── group_points.cpp
│   │   │   │   ├── group_points_parrots.cpp
│   │   │   │   ├── group_points_pytorch.h
│   │   │   │   ├── info.cpp
│   │   │   │   ├── iou3d.cpp
│   │   │   │   ├── iou3d_parrots.cpp
│   │   │   │   ├── iou3d_pytorch.h
│   │   │   │   ├── knn.cpp
│   │   │   │   ├── knn_parrots.cpp
│   │   │   │   ├── knn_pytorch.h
│   │   │   │   ├── masked_conv2d.cpp
│   │   │   │   ├── masked_conv2d_parrots.cpp
│   │   │   │   ├── masked_conv2d_pytorch.h
│   │   │   │   ├── min_area_polygons.cpp
│   │   │   │   ├── min_area_polygons_parrots.cpp
│   │   │   │   ├── min_area_polygons_pytorch.h
│   │   │   │   ├── modulated_deform_conv.cpp
│   │   │   │   ├── modulated_deform_conv_parrots.cpp
│   │   │   │   ├── modulated_deform_conv_pytorch.h
│   │   │   │   ├── ms_deform_attn.cpp
│   │   │   │   ├── ms_deform_attn_parrots.cpp
│   │   │   │   ├── nms.cpp
│   │   │   │   ├── nms_parrots.cpp
│   │   │   │   ├── nms_pytorch.h
│   │   │   │   ├── nms_rotated.cpp
│   │   │   │   ├── pixel_group.cpp
│   │   │   │   ├── pixel_group_parrots.cpp
│   │   │   │   ├── pixel_group_pytorch.h
│   │   │   │   ├── points_in_boxes.cpp
│   │   │   │   ├── points_in_boxes_parrots.cpp
│   │   │   │   ├── points_in_boxes_pytorch.h
│   │   │   │   ├── points_in_polygons.cpp
│   │   │   │   ├── points_in_polygons_parrots.cpp
│   │   │   │   ├── points_in_polygons_pytorch.h
│   │   │   │   ├── prroi_pool.cpp
│   │   │   │   ├── prroi_pool_parrots.cpp
│   │   │   │   ├── prroi_pool_pytorch.h
│   │   │   │   ├── psamask.cpp
│   │   │   │   ├── psamask_parrots.cpp
│   │   │   │   ├── psamask_pytorch.h
│   │   │   │   ├── riroi_align_rotated.cpp
│   │   │   │   ├── riroi_align_rotated_parrots.cpp
│   │   │   │   ├── riroi_align_rotated_pytorch.h
│   │   │   │   ├── roi_align.cpp
│   │   │   │   ├── roi_align_parrots.cpp
│   │   │   │   ├── roi_align_pytorch.h
│   │   │   │   ├── roi_align_rotated.cpp
│   │   │   │   ├── roi_align_rotated_parrots.cpp
│   │   │   │   ├── roi_align_rotated_pytorch.h
│   │   │   │   ├── roi_pool.cpp
│   │   │   │   ├── roi_pool_parrots.cpp
│   │   │   │   ├── roi_pool_pytorch.h
│   │   │   │   ├── roiaware_pool3d.cpp
│   │   │   │   ├── roiaware_pool3d_parrots.cpp
│   │   │   │   ├── roiaware_pool3d_pytorch.h
│   │   │   │   ├── roipoint_pool3d.cpp
│   │   │   │   ├── roipoint_pool3d_parrots.cpp
│   │   │   │   ├── roipoint_pool3d_pytorch.h
│   │   │   │   ├── rotated_feature_align.cpp
│   │   │   │   ├── rotated_feature_align_parrots.cpp
│   │   │   │   ├── rotated_feature_align_pytorch.h
│   │   │   │   ├── sync_bn.cpp
│   │   │   │   ├── sync_bn_parrots.cpp
│   │   │   │   ├── sync_bn_pytorch.h
│   │   │   │   ├── three_interpolate.cpp
│   │   │   │   ├── three_interpolate_parrots.cpp
│   │   │   │   ├── three_interpolate_pytorch.h
│   │   │   │   ├── three_nn.cpp
│   │   │   │   ├── three_nn_parrots.cpp
│   │   │   │   ├── three_nn_pytorch.h
│   │   │   │   ├── tin_shift.cpp
│   │   │   │   ├── tin_shift_parrots.cpp
│   │   │   │   ├── tin_shift_pytorch.h
│   │   │   │   ├── upfirdn2d.cpp
│   │   │   │   ├── upfirdn2d_parrots.cpp
│   │   │   │   ├── voxelization.cpp
│   │   │   │   ├── voxelization_parrots.cpp
│   │   │   │   └── voxelization_pytorch.h
│   │   │   └── pytorch/
│   │   │       ├── active_rotated_filter.cpp
│   │   │       ├── assign_score_withk.cpp
│   │   │       ├── ball_query.cpp
│   │   │       ├── bbox_overlaps.cpp
│   │   │       ├── bezier_align.cpp
│   │   │       ├── bias_act.cpp
│   │   │       ├── border_align.cpp
│   │   │       ├── box_iou_quadri.cpp
│   │   │       ├── box_iou_rotated.cpp
│   │   │       ├── carafe.cpp
│   │   │       ├── carafe_naive.cpp
│   │   │       ├── chamfer_distance.cpp
│   │   │       ├── contour_expand.cpp
│   │   │       ├── convex_iou.cpp
│   │   │       ├── correlation.cpp
│   │   │       ├── cpu/
│   │   │       │   ├── active_rotated_filter.cpp
│   │   │       │   ├── bbox_overlaps_cpu.cpp
│   │   │       │   ├── bezier_align.cpp
│   │   │       │   ├── box_iou_quadri.cpp
│   │   │       │   ├── box_iou_rotated.cpp
│   │   │       │   ├── deform_conv.cpp
│   │   │       │   ├── modulated_deform_conv.cpp
│   │   │       │   ├── nms.cpp
│   │   │       │   ├── nms_quadri.cpp
│   │   │       │   ├── nms_rotated.cpp
│   │   │       │   ├── pixel_group.cpp
│   │   │       │   ├── points_in_boxes.cpp
│   │   │       │   ├── psamask.cpp
│   │   │       │   ├── roi_align.cpp
│   │   │       │   ├── roi_align_rotated.cpp
│   │   │       │   ├── rotated_feature_align.cpp
│   │   │       │   ├── sparse_indice.cpp
│   │   │       │   ├── sparse_maxpool.cpp
│   │   │       │   ├── sparse_reordering.cpp
│   │   │       │   └── voxelization.cpp
│   │   │       ├── cuda/
│   │   │       │   ├── active_rotated_filter_cuda.cu
│   │   │       │   ├── assign_score_withk_cuda.cu
│   │   │       │   ├── ball_query_cuda.cu
│   │   │       │   ├── bbox_overlaps_cuda.cu
│   │   │       │   ├── bezier_align_cuda.cu
│   │   │       │   ├── bias_act_cuda.cu
│   │   │       │   ├── border_align_cuda.cu
│   │   │       │   ├── box_iou_quadri_cuda.cu
│   │   │       │   ├── box_iou_rotated_cuda.cu
│   │   │       │   ├── carafe_cuda.cu
│   │   │       │   ├── carafe_naive_cuda.cu
│   │   │       │   ├── chamfer_distance_cuda.cu
│   │   │       │   ├── convex_iou.cu
│   │   │       │   ├── correlation_cuda.cu
│   │   │       │   ├── cudabind.cpp
│   │   │       │   ├── deform_conv_cuda.cu
│   │   │       │   ├── deform_roi_pool_cuda.cu
│   │   │       │   ├── diff_iou_rotated_cuda.cu
│   │   │       │   ├── filtered_lrelu.cu
│   │   │       │   ├── focal_loss_cuda.cu
│   │   │       │   ├── furthest_point_sample_cuda.cu
│   │   │       │   ├── fused_bias_leakyrelu_cuda.cu
│   │   │       │   ├── fused_spconv_ops_cuda.cu
│   │   │       │   ├── gather_points_cuda.cu
│   │   │       │   ├── group_points_cuda.cu
│   │   │       │   ├── iou3d_cuda.cu
│   │   │       │   ├── knn_cuda.cu
│   │   │       │   ├── masked_conv2d_cuda.cu
│   │   │       │   ├── min_area_polygons.cu
│   │   │       │   ├── modulated_deform_conv_cuda.cu
│   │   │       │   ├── ms_deform_attn_cuda.cu
│   │   │       │   ├── nms_cuda.cu
│   │   │       │   ├── nms_quadri_cuda.cu
│   │   │       │   ├── nms_rotated_cuda.cu
│   │   │       │   ├── points_in_boxes_cuda.cu
│   │   │       │   ├── points_in_polygons_cuda.cu
│   │   │       │   ├── prroi_pool_cuda.cu
│   │   │       │   ├── psamask_cuda.cu
│   │   │       │   ├── riroi_align_rotated_cuda.cu
│   │   │       │   ├── roi_align_cuda.cu
│   │   │       │   ├── roi_align_rotated_cuda.cu
│   │   │       │   ├── roi_pool_cuda.cu
│   │   │       │   ├── roiaware_pool3d_cuda.cu
│   │   │       │   ├── roipoint_pool3d_cuda.cu
│   │   │       │   ├── rotated_feature_align_cuda.cu
│   │   │       │   ├── scatter_points_cuda.cu
│   │   │       │   ├── sparse_indice.cu
│   │   │       │   ├── sparse_maxpool.cu
│   │   │       │   ├── sparse_pool_ops_cuda.cu
│   │   │       │   ├── sparse_reordering.cu
│   │   │       │   ├── spconv_ops_cuda.cu
│   │   │       │   ├── stack_ball_query_cuda.cu
│   │   │       │   ├── stack_group_points_cuda.cu
│   │   │       │   ├── sync_bn_cuda.cu
│   │   │       │   ├── three_interpolate_cuda.cu
│   │   │       │   ├── three_nn_cuda.cu
│   │   │       │   ├── tin_shift_cuda.cu
│   │   │       │   ├── upfirdn2d_kernel.cu
│   │   │       │   └── voxelization_cuda.cu
│   │   │       ├── deform_conv.cpp
│   │   │       ├── deform_roi_pool.cpp
│   │   │       ├── diff_iou_rotated.cpp
│   │   │       ├── filtered_lrelu.cpp
│   │   │       ├── focal_loss.cpp
│   │   │       ├── furthest_point_sample.cpp
│   │   │       ├── fused_bias_leakyrelu.cpp
│   │   │       ├── fused_spconv_ops.cpp
│   │   │       ├── gather_points.cpp
│   │   │       ├── group_points.cpp
│   │   │       ├── info.cpp
│   │   │       ├── iou3d.cpp
│   │   │       ├── knn.cpp
│   │   │       ├── masked_conv2d.cpp
│   │   │       ├── min_area_polygons.cpp
│   │   │       ├── mlu/
│   │   │       │   ├── ball_query_mlu.cpp
│   │   │       │   ├── bbox_overlaps_mlu.cpp
│   │   │       │   ├── box_iou_rotated.cpp
│   │   │       │   ├── carafe_mlu.cpp
│   │   │       │   ├── deform_roi_pool_mlu.cpp
│   │   │       │   ├── diff_iou_rotated_mlu.cpp
│   │   │       │   ├── focal_loss_sigmoid_mlu.cpp
│   │   │       │   ├── iou3d_mlu.cpp
│   │   │       │   ├── masked_conv2d_mlu.cpp
│   │   │       │   ├── mlu_common_helper.cpp
│   │   │       │   ├── mlu_common_helper.h
│   │   │       │   ├── ms_deform_attn_mlu.cpp
│   │   │       │   ├── nms_mlu.cpp
│   │   │       │   ├── nms_rotated_mlu.cpp
│   │   │       │   ├── psamask_mlu.cpp
│   │   │       │   ├── roi_align_mlu.cpp
│   │   │       │   ├── roi_align_rotated_mlu.cpp
│   │   │       │   ├── roi_pool_mlu.cpp
│   │   │       │   ├── roiaware_pool3d_mlu.cpp
│   │   │       │   ├── roipoint_pool3d_mlu.cpp
│   │   │       │   ├── rotated_feature_align_mlu.cpp
│   │   │       │   ├── scatter_points_mlu.cpp
│   │   │       │   ├── sparse_conv_mlu.cpp
│   │   │       │   ├── three_nn_mlu.cpp
│   │   │       │   ├── tin_shift_mlu.cpp
│   │   │       │   └── voxelization_mlu.cpp
│   │   │       ├── modulated_deform_conv.cpp
│   │   │       ├── mps/
│   │   │       │   └── bbox_overlaps_mps.mm
│   │   │       ├── ms_deform_attn.cpp
│   │   │       ├── musa/
│   │   │       │   ├── active_rotated_filter_musa.mu
│   │   │       │   ├── assign_score_withk_musa.mu
│   │   │       │   ├── ball_query_musa.mu
│   │   │       │   ├── bbox_overlaps_musa.mu
│   │   │       │   ├── bezier_align_musa.mu
│   │   │       │   ├── bias_act_musa.mu
│   │   │       │   ├── border_align_musa.mu
│   │   │       │   ├── box_iou_quadri_musa.mu
│   │   │       │   ├── box_iou_rotated_musa.mu
│   │   │       │   ├── carafe_musa.mu
│   │   │       │   ├── carafe_naive_musa.mu
│   │   │       │   ├── chamfer_distance_musa.mu
│   │   │       │   ├── convex_iou.mu
│   │   │       │   ├── correlation_musa.mu
│   │   │       │   ├── deform_conv_musa.mu
│   │   │       │   ├── deform_roi_pool_musa.mu
│   │   │       │   ├── diff_iou_rotated_musa.mu
│   │   │       │   ├── filtered_lrelu.mu
│   │   │       │   ├── focal_loss_musa.mu
│   │   │       │   ├── furthest_point_sample_musa.mu
│   │   │       │   ├── fused_bias_leakyrelu_musa.mu
│   │   │       │   ├── fused_spconv_ops_musa.mu
│   │   │       │   ├── gather_points_musa.mu
│   │   │       │   ├── group_points_musa.mu
│   │   │       │   ├── iou3d_musa.mu
│   │   │       │   ├── knn_musa.mu
│   │   │       │   ├── masked_conv2d_musa.mu
│   │   │       │   ├── min_area_polygons.mu
│   │   │       │   ├── modulated_deform_conv_musa.mu
│   │   │       │   ├── ms_deform_attn_musa.mu
│   │   │       │   ├── musabind.cpp
│   │   │       │   ├── nms_musa.mu
│   │   │       │   ├── nms_quadri_musa.mu
│   │   │       │   ├── nms_rotated_musa.mu
│   │   │       │   ├── points_in_boxes_musa.mu
│   │   │       │   ├── points_in_polygons_musa.mu
│   │   │       │   ├── prroi_pool_musa.mu
│   │   │       │   ├── psamask_musa.mu
│   │   │       │   ├── riroi_align_rotated_musa.mu
│   │   │       │   ├── roi_align_musa.mu
│   │   │       │   ├── roi_align_rotated_musa.mu
│   │   │       │   ├── roi_pool_musa.mu
│   │   │       │   ├── roiaware_pool3d_musa.mu
│   │   │       │   ├── roipoint_pool3d_musa.mu
│   │   │       │   ├── rotated_feature_align_musa.mu
│   │   │       │   ├── scatter_points_musa.mu
│   │   │       │   ├── sparse_indice.mu
│   │   │       │   ├── sparse_maxpool.mu
│   │   │       │   ├── sparse_pool_ops_musa.mu
│   │   │       │   ├── sparse_reordering.mu
│   │   │       │   ├── spconv_ops_musa.mu
│   │   │       │   ├── stack_ball_query_musa.mu
│   │   │       │   ├── stack_group_points_musa.mu
│   │   │       │   ├── sync_bn_musa.mu
│   │   │       │   ├── three_interpolate_musa.mu
│   │   │       │   ├── three_nn_musa.mu
│   │   │       │   ├── tin_shift_musa.mu
│   │   │       │   ├── upfirdn2d_kernel.mu
│   │   │       │   └── voxelization_musa.mu
│   │   │       ├── nms.cpp
│   │   │       ├── nms_quadri.cpp
│   │   │       ├── nms_rotated.cpp
│   │   │       ├── npu/
│   │   │       │   ├── active_rotated_filter_npu.cpp
│   │   │       │   ├── assign_score_withk_npu.cpp
│   │   │       │   ├── ball_query_npu.cpp
│   │   │       │   ├── bbox_overlaps_npu.cpp
│   │   │       │   ├── border_align_npu.cpp
│   │   │       │   ├── box_iou_quadri_npu.cpp
│   │   │       │   ├── box_iou_rotated_npu.cpp
│   │   │       │   ├── boxes_overlap_bev_npu.cpp
│   │   │       │   ├── chamfer_distance_npu.cpp
│   │   │       │   ├── common_util.h
│   │   │       │   ├── deform_roi_pool.cpp
│   │   │       │   ├── diff_iou_rotated_npu.cpp
│   │   │       │   ├── focal_loss_npu.cpp
│   │   │       │   ├── furthest_point_sample_npu.cpp
│   │   │       │   ├── furthest_point_sampling_with_dist_npu.cpp
│   │   │       │   ├── fused_bias_leakyrelu_npu.cpp
│   │   │       │   ├── gather_points_npu.cpp
│   │   │       │   ├── group_points_npu.cpp
│   │   │       │   ├── knn_npu.cpp
│   │   │       │   ├── ms_deform_attn_npu.cpp
│   │   │       │   ├── nms3d_normal_npu.cpp
│   │   │       │   ├── nms3d_npu.cpp
│   │   │       │   ├── nms_npu.cpp
│   │   │       │   ├── nms_rotated_npu.cpp
│   │   │       │   ├── points_in_box_npu.cpp
│   │   │       │   ├── points_in_box_npu_all.cpp
│   │   │       │   ├── points_in_polygons_npu.cpp
│   │   │       │   ├── psa_mask_npu.cpp
│   │   │       │   ├── roi_align_npu.cpp
│   │   │       │   ├── roi_align_rotated_npu.cpp
│   │   │       │   ├── roi_pool_npu.cpp
│   │   │       │   ├── roiaware_pool3d_npu.cpp
│   │   │       │   ├── roipoint_pool3d_forward.cpp
│   │   │       │   ├── rotated_feature_align_npu.cpp
│   │   │       │   ├── stack_ball_query_npu.cpp
│   │   │       │   ├── stack_group_points_npu.cpp
│   │   │       │   ├── three_interpolate_npu.cpp
│   │   │       │   ├── three_nn_npu.cpp
│   │   │       │   └── voxelization_npu.cpp
│   │   │       ├── pixel_group.cpp
│   │   │       ├── points_in_boxes.cpp
│   │   │       ├── points_in_polygons.cpp
│   │   │       ├── prroi_pool.cpp
│   │   │       ├── psamask.cpp
│   │   │       ├── pybind.cpp
│   │   │       ├── riroi_align_rotated.cpp
│   │   │       ├── roi_align.cpp
│   │   │       ├── roi_align_rotated.cpp
│   │   │       ├── roi_pool.cpp
│   │   │       ├── roiaware_pool3d.cpp
│   │   │       ├── roipoint_pool3d.cpp
│   │   │       ├── rotated_feature_align.cpp
│   │   │       ├── scatter_points.cpp
│   │   │       ├── sparse_pool_ops.cpp
│   │   │       ├── spconv_ops.cpp
│   │   │       ├── spconv_utils.h
│   │   │       ├── sync_bn.cpp
│   │   │       ├── three_interpolate.cpp
│   │   │       ├── three_nn.cpp
│   │   │       ├── tin_shift.cpp
│   │   │       ├── upfirdn2d.cpp
│   │   │       └── voxelization.cpp
│   │   ├── deform_conv.py
│   │   ├── deform_roi_pool.py
│   │   ├── deprecated_wrappers.py
│   │   ├── diff_iou_rotated.py
│   │   ├── filtered_lrelu.py
│   │   ├── focal_loss.py
│   │   ├── furthest_point_sample.py
│   │   ├── fused_bias_leakyrelu.py
│   │   ├── gather_points.py
│   │   ├── group_points.py
│   │   ├── info.py
│   │   ├── iou3d.py
│   │   ├── knn.py
│   │   ├── masked_conv.py
│   │   ├── merge_cells.py
│   │   ├── min_area_polygons.py
│   │   ├── modulated_deform_conv.py
│   │   ├── multi_scale_deform_attn.py
│   │   ├── nms.py
│   │   ├── pixel_group.py
│   │   ├── point_sample.py
│   │   ├── points_in_boxes.py
│   │   ├── points_in_polygons.py
│   │   ├── points_sampler.py
│   │   ├── prroi_pool.py
│   │   ├── psa_mask.py
│   │   ├── riroi_align_rotated.py
│   │   ├── roi_align.py
│   │   ├── roi_align_rotated.py
│   │   ├── roi_pool.py
│   │   ├── roiaware_pool3d.py
│   │   ├── roipoint_pool3d.py
│   │   ├── rotated_feature_align.py
│   │   ├── saconv.py
│   │   ├── scatter_points.py
│   │   ├── sparse_conv.py
│   │   ├── sparse_functional.py
│   │   ├── sparse_modules.py
│   │   ├── sparse_ops.py
│   │   ├── sparse_pool.py
│   │   ├── sparse_structure.py
│   │   ├── sync_bn.py
│   │   ├── three_interpolate.py
│   │   ├── three_nn.py
│   │   ├── tin_shift.py
│   │   ├── upfirdn2d.py
│   │   └── voxelize.py
│   ├── transforms/
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── builder.py
│   │   ├── formatting.py
│   │   ├── loading.py
│   │   ├── processing.py
│   │   ├── utils.py
│   │   └── wrappers.py
│   ├── utils/
│   │   ├── __init__.py
│   │   ├── device_type.py
│   │   ├── env.py
│   │   ├── ext_loader.py
│   │   └── parrots_jit.py
│   ├── version.py
│   ├── video/
│   │   ├── __init__.py
│   │   ├── io.py
│   │   ├── optflow.py
│   │   └── processing.py
│   └── visualization/
│       ├── __init__.py
│       ├── color.py
│       ├── image.py
│       └── optflow.py
├── requirements/
│   ├── build.txt
│   ├── docs.txt
│   ├── optional.txt
│   ├── runtime.txt
│   └── test.txt
├── requirements.txt
├── setup.cfg
├── setup.py
└── tests/
    ├── test_arraymisc.py
    ├── test_cnn/
    │   ├── test_build_layers.py
    │   ├── test_context_block.py
    │   ├── test_conv2d_adaptive_padding.py
    │   ├── test_conv_module.py
    │   ├── test_depthwise_seperable_conv_module.py
    │   ├── test_flops_counter.py
    │   ├── test_fuse_conv_bn.py
    │   ├── test_generalized_attention.py
    │   ├── test_hsigmoid.py
    │   ├── test_hswish.py
    │   ├── test_non_local.py
    │   ├── test_rfsearch/
    │   │   ├── test_operator.py
    │   │   └── test_search.py
    │   ├── test_scale.py
    │   ├── test_silu.py
    │   ├── test_swish.py
    │   ├── test_transformer.py
    │   └── test_wrappers.py
    ├── test_image/
    │   ├── test_colorspace.py
    │   ├── test_geometric.py
    │   ├── test_image_misc.py
    │   ├── test_io.py
    │   └── test_photometric.py
    ├── test_ops/
    │   ├── output.pkl
    │   ├── test_active_rotated_filter.py
    │   ├── test_assign_score_withk.py
    │   ├── test_ball_query.py
    │   ├── test_bbox.py
    │   ├── test_bezier_align.py
    │   ├── test_bias_act.py
    │   ├── test_bilinear_grid_sample.py
    │   ├── test_border_align.py
    │   ├── test_box_iou_quadri.py
    │   ├── test_box_iou_rotated.py
    │   ├── test_carafe.py
    │   ├── test_cc_attention.py
    │   ├── test_chamfer_distance.py
    │   ├── test_contour_expand.py
    │   ├── test_conv_gradfix.py
    │   ├── test_convex_iou.py
    │   ├── test_corner_pool.py
    │   ├── test_correlation.py
    │   ├── test_deform_conv.py
    │   ├── test_deform_roi_pool.py
    │   ├── test_diff_iou_rotated.py
    │   ├── test_filtered_lrelu.py
    │   ├── test_focal_loss.py
    │   ├── test_furthest_point_sample.py
    │   ├── test_fused_bias_leakyrelu.py
    │   ├── test_gather_points.py
    │   ├── test_group_points.py
    │   ├── test_info.py
    │   ├── test_iou3d.py
    │   ├── test_knn.py
    │   ├── test_masked_conv2d.py
    │   ├── test_merge_cells.py
    │   ├── test_min_area_polygons.py
    │   ├── test_modulated_deform_conv.py
    │   ├── test_ms_deformable_attn.py
    │   ├── test_nms.py
    │   ├── test_nms_quadri.py
    │   ├── test_nms_rotated.py
    │   ├── test_onnx.py
    │   ├── test_pixel_group.py
    │   ├── test_points_in_polygons.py
    │   ├── test_prroi_pool.py
    │   ├── test_psa_mask.py
    │   ├── test_riroi_align_rotated.py
    │   ├── test_roi_align.py
    │   ├── test_roi_align_rotated.py
    │   ├── test_roi_pool.py
    │   ├── test_roiaware_pool3d.py
    │   ├── test_roipoint_pool3d.py
    │   ├── test_rotated_feature_align.py
    │   ├── test_saconv.py
    │   ├── test_scatter_points.py
    │   ├── test_spconv.py
    │   ├── test_syncbn.py
    │   ├── test_three_interpolate.py
    │   ├── test_three_nn.py
    │   ├── test_tin_shift.py
    │   ├── test_upfirdn2d.py
    │   └── test_voxelization.py
    ├── test_transforms/
    │   ├── test_transforms_formatting.py
    │   ├── test_transforms_loading.py
    │   ├── test_transforms_processing.py
    │   └── test_transforms_wrapper.py
    ├── test_utils/
    │   ├── test_env.py
    │   └── test_parrots_jit.py
    ├── test_video/
    │   ├── test_optflow.py
    │   ├── test_processing.py
    │   └── test_reader.py
    └── test_visualization.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .dev_scripts/check_installation.py
================================================
import numpy as np
import torch

from mmcv.ops import box_iou_rotated
from mmcv.utils import collect_env


def check_installation():
    """Smoke-test the compiled mmcv ops on CPU and, if possible, on CUDA.

    Two small batches of 5-value float32 boxes are passed to
    ``box_iou_rotated``. The call is made once with CPU tensors and, when
    ``torch.cuda.is_available()`` is true, once more with the same tensors
    moved to the GPU. A message is printed after each successful call; any
    exception raised by the op propagates to the caller.
    """
    first_boxes = torch.from_numpy(
        np.asarray(
            [[1.0, 1.0, 3.0, 4.0, 0.5], [2.0, 2.0, 3.0, 4.0, 0.6],
             [7.0, 7.0, 8.0, 8.0, 0.4]],
            dtype=np.float32))
    second_boxes = torch.from_numpy(
        np.asarray(
            [[0.0, 2.0, 2.0, 5.0, 0.3], [2.0, 1.0, 3.0, 3.0, 0.5],
             [5.0, 5.0, 6.0, 7.0, 0.4]],
            dtype=np.float32))

    # The CPU implementation is always exercised.
    box_iou_rotated(first_boxes, second_boxes)
    print('CPU ops were compiled successfully.')

    # Without a CUDA runtime there is nothing more to verify.
    if not torch.cuda.is_available():
        print('No CUDA runtime is found, skipping the checking of CUDA ops.')
        return

    # Repeat the same call with the tensors moved to the GPU.
    box_iou_rotated(first_boxes.cuda(), second_boxes.cuda())
    print('CUDA ops were compiled successfully.')


if __name__ == '__main__':
    # Run the op smoke test first; it raises if the ops are not usable.
    print('Start checking the installation of mmcv ...')
    check_installation()
    print('mmcv has been installed successfully.\n')

    # Print the collected environment as "key: value" lines framed by
    # 60-character dashed rules.
    report_lines = [
        f'{key}: {value}' for key, value in collect_env().items()
    ]
    separator = '-' * 60 + '\n'
    print('Environment information:')
    print(separator + '\n'.join(report_lines) + '\n' + separator)


================================================
FILE: .dockerignore
================================================
.git
.gitignore
*.egg-info
.eggs/
.mypy-cache
pip-wheel-metadata


================================================
FILE: .github/ISSUE_TEMPLATE/1-bug-report.yml
================================================
name: "🐞 Bug report"
description: "Create a report to help us reproduce and fix the bug"
labels: bug
title: "[Bug] "

body:
  - type: markdown
    attributes:
      value: |
        ## Note
        For general usage questions or idea discussions, please post it to our [**Forum**](https://github.com/open-mmlab/mmcv/discussions)
        Please fill in as **much** of the following form as you're able to. **The clearer the description, the shorter it will take to solve it.**

  - type: checkboxes
    attributes:
      label: Prerequisite
      description: Please check the following items before creating a new issue.
      options:
      - label: I have searched [Issues](https://github.com/open-mmlab/mmcv/issues) and [Discussions](https://github.com/open-mmlab/mmcv/discussions) but cannot get the expected help.
        required: true
      - label: The bug has not been fixed in the latest version (https://github.com/open-mmlab/mmcv).
        required: true

  - type: textarea
    attributes:
      label: Environment
      description: |
        Please run `python -c "from mmcv.utils import collect_env; print(collect_env())"` to collect necessary environment information and copy-paste it here.
        You may add additional information that may be helpful for locating the problem, such as
          - How you installed PyTorch \[e.g., pip, conda, source\]
          - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.)
    validations:
      required: true

  - type: textarea
    attributes:
      label: Reproduces the problem - code sample
      description: |
        Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
      placeholder: |
        ```python
        # Sample code to reproduce the problem
        ```
    validations:
      required: true

  - type: textarea
    attributes:
      label: Reproduces the problem - command or script
      description: |
        What command or script did you run?
      placeholder: |
        ```shell
        The command or script you run.
        ```
    validations:
      required: true

  - type: textarea
    attributes:
      label: Reproduces the problem - error message
      description: |
        Please provide the error message or logs you got, with the full traceback.

        Tip: You can attach images or log files by dragging them into the text area.
      placeholder: |
        ```
        The error message or logs you got, with the full traceback.
        ```
    validations:
      required: true

  - type: textarea
    attributes:
      label: Additional information
      description: |
        Tell us anything else you think we should know.

        Tip: You can attach images or log files by dragging them into the text area.
      placeholder: |
        1. What's your expected result?
        2. What dataset did you use?
        3. What do you think might be the reason?

  - type: markdown
    attributes:
      value: |
        ## Acknowledgement
        Thanks for taking the time to fill out this report.

        If you have already identified the reason, we strongly appreciate you creating a new PR to fix it [**Here**](https://github.com/open-mmlab/mmcv/pulls)!
        Please refer to [**Contribution Guide**](https://mmcv.readthedocs.io/en/latest/community/contributing.html) for contributing.

        Welcome to join our [**Community (TODO)**](https://mmcv.readthedocs.io/en/latest/contact.html) to discuss together. 👬


================================================
FILE: .github/ISSUE_TEMPLATE/2-feature_request.yml
================================================
name: 🚀 Feature request
description: Suggest an idea for this project
labels: [feature-request]
title: "[Feature] "

body:
  - type: markdown
    attributes:
      value: |
        ## Note
        For general usage questions or idea discussions, please post it to our [**Forum**](https://github.com/open-mmlab/mmcv/discussions)

        Please fill in as **much** of the following form as you're able to. **The clearer the description, the shorter it will take to solve it.**

  - type: textarea
    attributes:
      label: What is the feature?
      description: Tell us more about the feature and how this feature can help.
      placeholder: |
        E.g., It is inconvenient when \[....\].
    validations:
      required: true

  - type: textarea
    attributes:
      label: Any other context?
      description: |
        Have you considered any alternative solutions or features? If so, what are they? Also, feel free to add any other context or screenshots about the feature request here.

  - type: markdown
    attributes:
      value: |
        ## Acknowledgement
        Thanks for taking the time to fill out this report.

        We strongly appreciate you creating a new PR to implement it [**Here**](https://github.com/open-mmlab/mmcv/pulls)!
        Please refer to [**Contribution Guide**](https://mmcv.readthedocs.io/en/latest/community/contributing.html) for contributing.

        Welcome to join our [**Community (TODO)**](https://mmcv.readthedocs.io/en/latest/contact.html) to discuss together. 👬


================================================
FILE: .github/ISSUE_TEMPLATE/3-documentation.yml
================================================
name: 📚 Documentation
description: Report an issue related to the documentation.
labels: "docs"
title: "[Docs] "

body:
  - type: markdown
    attributes:
      value: |
        ## Note
        For general usage questions or idea discussions, please post it to our [**Forum**](https://github.com/open-mmlab/mmcv/discussions)
        Please fill in as **much** of the following form as you're able to. **The clearer the description, the shorter it will take to solve it.**

  - type: textarea
    attributes:
      label: 📚 The doc issue
      description: >
        A clear and concise description of the issue.
    validations:
      required: true

  - type: textarea
    attributes:
      label: Suggest a potential alternative/fix
      description: >
        Tell us how we could improve the documentation in this regard.

  - type: markdown
    attributes:
      value: |
        ## Acknowledgement
        Thanks for taking the time to fill out this report.

        If you have already identified the reason, we strongly appreciate you creating a new PR to fix it [**here**](https://github.com/open-mmlab/mmcv/pulls)!
        Please refer to [**Contribution Guide**](https://mmcv.readthedocs.io/en/latest/community/contributing.html) for contributing.

        Welcome to join our [**Community(TODO)**](https://mmcv.readthedocs.io/en/latest/contact.html) to discuss together. 👬


================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: false

contact_links:
  - name: 💬 Forum
    url: https://github.com/open-mmlab/mmcv/discussions
    about: Ask general usage questions and discuss with other mmcv community members
  - name: MMCV Documentation
    url: https://mmcv.readthedocs.io/en/latest/
    about: Check if your question is answered in docs
  - name: 🌐 Explore OpenMMLab
    url: https://openmmlab.com/
    about: Get to know more about OpenMMLab


================================================
FILE: .github/pull_request_template.md
================================================
Thanks for your contribution; we appreciate it a lot. Following the instructions below will make your pull request healthier and help it get feedback more easily. If you do not understand some items, don't worry: just make the pull request and seek help from the maintainers.

## Motivation

Please describe the motivation of this PR and the goal you want to achieve through this PR.

## Modification

Please briefly describe what modification is made in this PR.

## BC-breaking (Optional)

Does the modification introduce changes that break the backward-compatibility of the downstream repositories?
If so, please describe how it breaks the compatibility and how the downstream projects should modify their code to keep compatibility with this PR.

## Use cases (Optional)

If this PR introduces a new feature, it is better to list some use cases here, and update the documentation.

## Checklist

**Before PR**:

- [ ] I have read and followed the workflow indicated in the [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/master/CONTRIBUTING.md) to create this PR.
- [ ] Pre-commit or linting tools indicated in [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/master/CONTRIBUTING.md) are used to fix the potential lint issues.
- [ ] Bug fixes are covered by unit tests, the case that causes the bug should be added in the unit tests.
- [ ] New functionalities are covered by complete unit tests. If not, please add more unit test to ensure the correctness.
- [ ] The documentation has been modified accordingly, including docstring or example tutorials.

**After PR**:

- [ ] If the modification has potential influence on downstream or other related projects, this PR should be tested with some of those projects, like MMDet or MMCls.
- [ ] CLA has been signed and all committers have signed the CLA in this PR.


================================================
FILE: .github/workflows/build_macos_wheel.yml
================================================
name: build macos wheel

on: push

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  build_macos10_wheel:
    runs-on: macos-latest
    if: contains(github.event.head_commit.message, 'Bump version to')
    strategy:
      matrix:
        torch: [1.8.0, 1.9.0, 1.10.0, 1.11.0, 1.12.0, 1.13.0, 2.0.0, 2.1.0]
        python-version: [3.7, 3.8, 3.9, '3.10', '3.11']
        include:
          - torch: 1.8.0
            torchvision: 0.9.0
          - torch: 1.9.0
            torchvision: 0.10.0
          - torch: 1.10.0
            torchvision: 0.11.0
          - torch: 1.11.0
            torchvision: 0.12.0
          - torch: 1.12.0
            torchvision: 0.13.0
          - torch: 1.13.0
            torchvision: 0.14.0
          - torch: 2.0.0
            torchvision: 0.15.1
          - torch: 2.1.0
            torchvision: 0.16.0
        exclude:
          - torch: 1.8.0
            python-version: '3.10'
          - torch: 1.9.0
            python-version: '3.10'
          - torch: 1.10.0
            python-version: '3.10'
          - torch: 1.8.0
            python-version: '3.11'
          - torch: 1.9.0
            python-version: '3.11'
          - torch: 1.10.0
            python-version: '3.11'
          - torch: 1.10.0
            python-version: '3.11'
          - torch: 1.11.0
            python-version: '3.11'
          - torch: 1.12.0
            python-version: '3.11'
          - torch: 1.13.0
            python-version: '3.11'
          - torch: 2.0.0
            python-version: 3.7
          - torch: 2.1.0
            python-version: 3.7
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install psutil
        run: pip install psutil
      - name: Install PyTorch
        run: pip install torch==${{matrix.torch}} torchvision==${{matrix.torchvision}} --no-cache-dir
      - name: Build and install
        run: |
          pip install wheel
          python setup.py bdist_wheel
      - uses: actions/upload-artifact@v3
        with:
          name: ${{matrix.torch}}
          path: dist/


================================================
FILE: .github/workflows/lint.yml
================================================
name: lint

on: [push, pull_request]

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  lint:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.10.15
        uses: actions/setup-python@v2
        with:
          python-version: '3.10.15'
      - name: Install pre-commit hook
        run: |
          pip install pre-commit
          pre-commit install
      - name: Linting
        run: pre-commit run --all-files
      - name: Format c/cuda codes with clang-format
        uses: DoozyX/clang-format-lint-action@v0.18
        with:
          source: mmcv/ops/csrc
          extensions: h,c,cpp,hpp,cu,cuh
          style: google


================================================
FILE: .github/workflows/merge_stage_test.yml
================================================
name: merge_stage_test

on:
  push:
    paths-ignore:
      - ".github/**.md"
      - "docker/**"
      - "docs/**"
      - 'examples/**'
      - '.dev_scripts/**'
      - "README.md"
      - "README_zh-CN.md"
      - "CONTRIBUTING.md"
      - ".pre-commit-config.yaml"
      - ".pre-commit-config-zh-cn.yaml"
    branches:
      - main

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  build_without_torch:
    runs-on: ubuntu-22.04
    env:
      MMCV_WITH_OPS: 0
    strategy:
      matrix:
        python-version: [3.7]
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install system dependencies
        run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg
      - name: Install MMEngine from main branch
        run: pip install git+https://github.com/open-mmlab/mmengine.git@main
      - name: Build MMCV from source
        run: pip install -e . -v
      - name: Install unit tests dependencies
        run: pip install -r requirements/test.txt
      - name: Run unit tests
        run: pytest tests/test_image tests/test_transforms tests/test_video tests/test_arraymisc.py tests/test_visualization.py tests/test_utils/test_env.py --ignore=tests/test_image/test_io.py
  build_without_ops:
    runs-on: ubuntu-22.04
    env:
      MMCV_WITH_OPS: 0
    strategy:
      matrix:
        python-version: [3.7]
        torch: [1.8.1, 1.9.1]
        include:
          - torch: 1.8.1
            torchvision: 0.9.1
          - torch: 1.9.1
            torchvision: 0.10.1
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install system dependencies
        run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg
      - name: Install PyTorch
        run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html
      - name: Install MMEngine from main branch
        run: pip install git+https://github.com/open-mmlab/mmengine.git@main
      - name: Build MMCV from source
        run: pip install -e . -v
      - name: Install unit tests dependencies
        run: pip install -r requirements/test.txt
      - name: Run unit tests
        run: pytest tests --ignore=tests/test_ops
  build_cpu_py:
    runs-on: ubuntu-22.04
    strategy:
      matrix:
        python-version: [3.8, 3.9, '3.10']
        torch: [1.13.0]
        include:
          - torch: 1.13.0
            torchvision: 0.14.0
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install system dependencies
        run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg
      - name: Upgrade pip and wheel
        run: pip install pip wheel --upgrade
      - name: Install PyTorch
        run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html
      - name: Install MMEngine from main branch
        run: pip install git+https://github.com/open-mmlab/mmengine.git@main
      - name: Install ninja to speed the compilation
        run: pip install ninja psutil
      - name: Build MMCV from source
        run: pip install -e . -v
      - name: Install unit tests dependencies
        run: pip install -r requirements/test.txt
      - name: Run unit tests and generate coverage report
        run: |
          coverage run --branch --source mmcv -m pytest tests/
          coverage xml
          coverage report -m
  build_cpu_pt:
    runs-on: ubuntu-22.04
    strategy:
      matrix:
        python-version: [3.7]
        torch: [1.8.1, 1.9.1, 1.10.1, 1.11.0, 1.12.0, 1.13.0, 2.0.0, 2.1.0]
        include:
          - torch: 1.8.1
            torchvision: 0.9.1
          - torch: 1.9.1
            torchvision: 0.10.1
          - torch: 1.10.1
            torchvision: 0.11.2
          - torch: 1.11.0
            torchvision: 0.12.0
          - torch: 1.12.0
            torchvision: 0.13.0
          - torch: 1.13.0
            torchvision: 0.14.0
          - torch: 2.0.0
            torchvision: 0.15.1
            python-version: 3.8
          - torch: 2.1.0
            torchvision: 0.16.0
            python-version: 3.8
        exclude:
          - torch: 2.0.0
            python-version: 3.7
          - torch: 2.1.0
            python-version: 3.7
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install system dependencies
        run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg
      - name: Upgrade pip and wheel
        run: pip install pip wheel --upgrade
      - name: Install PyTorch
        run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html
      - name: Install MMEngine from main branch
        run: pip install git+https://github.com/open-mmlab/mmengine.git@main
      - name: Install ninja to speed the compilation
        run: pip install ninja psutil
      - name: Build MMCV from source
        run: pip install -e . -v
      - name: Install unit tests dependencies
        run: pip install -r requirements/test.txt
      - name: Run unit tests and generate coverage report
        run: |
          coverage run --branch --source mmcv -m pytest tests/
          coverage xml
          coverage report -m
      # Only upload coverage report for python3.7 && pytorch1.8.1 cpu.
      # In this job's matrix torch 1.8.1 is only paired with python 3.7, so
      # the condition must test for 3.7; testing for 3.8 never matches and
      # the report would never be uploaded.
      - name: Upload coverage to Codecov
        if: ${{matrix.torch == '1.8.1' && matrix.python-version == '3.7'}}
        uses: codecov/codecov-action@v1.0.14
        with:
          file: ./coverage.xml
          flags: unittests
          env_vars: OS,PYTHON
          name: codecov-umbrella
          fail_ci_if_error: false
  build_cu102:
    runs-on: ubuntu-22.04
    container:
      image: pytorch/pytorch:1.8.1-cuda10.2-cudnn7-devel
    env:
      FORCE_CUDA: 1
      MMCV_CUDA_ARGS: -gencode=arch=compute_61,code=sm_61
    strategy:
      matrix:
        python-version: [3.7]
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Upgrade pip and wheel
        run: pip install pip wheel --upgrade
      - name: Fetch GPG keys
        run: |
          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
      - name: Install system dependencies
        run: apt-get update && apt-get install -y git ffmpeg libturbojpeg
      - name: Install MMEngine from main branch
        run: pip install git+https://github.com/open-mmlab/mmengine.git@main
      - name: Install ninja to speed the compilation
        run: pip install ninja psutil
      - name: Build MMCV from source
        run: pip install -e . -v
      - name: Install unit tests dependencies
        run: pip install -r requirements/test.txt
      - name: Run unit tests and generate coverage report
        run: |
          coverage run --branch --source mmcv -m pytest tests/
          coverage xml
          coverage report -m
  build_cu111:
    runs-on: ubuntu-22.04
    container:
      image: pytorch/pytorch:1.8.1-cuda11.1-cudnn8-devel
    env:
      FORCE_CUDA: 1
      MMCV_CUDA_ARGS: -gencode=arch=compute_61,code=sm_61
    strategy:
      matrix:
        python-version: [3.7]
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Upgrade pip and wheel
        run: pip install pip wheel --upgrade
      - name: Fetch GPG keys
        run: |
          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
      - name: Install system dependencies
        run: apt-get update && apt-get install -y git ffmpeg libturbojpeg
      - name: Install MMEngine from main branch
        run: pip install git+https://github.com/open-mmlab/mmengine.git@main
      - name: Install ninja to speed the compilation
        run: pip install ninja psutil
      - name: Build MMCV from source
        run: pip install -e . -v
      - name: Install unit tests dependencies
        run: pip install -r requirements/test.txt
      - name: Run unit tests and generate coverage report
        run: |
          coverage run --branch --source mmcv -m pytest tests/
          coverage xml
          coverage report -m
  build_cu116:
    runs-on: ubuntu-22.04
    container:
      image: pytorch/pytorch:1.13.0-cuda11.6-cudnn8-devel
    env:
      FORCE_CUDA: 1
      MMCV_CUDA_ARGS: -gencode=arch=compute_61,code=sm_61
    strategy:
      matrix:
        python-version: [3.7]
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Upgrade pip and wheel
        run: pip install pip wheel --upgrade
      - name: Fetch GPG keys
        run: |
          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
      - name: Install system dependencies
        run: apt-get update && apt-get install -y git ffmpeg libturbojpeg
      - name: Install MMEngine from main branch
        run: pip install git+https://github.com/open-mmlab/mmengine.git@main
      - name: Install ninja to speed the compilation
        run: pip install ninja psutil
      - name: Build MMCV from source
        run: pip install -e . -v
      - name: Install unit tests dependencies
        run: pip install -r requirements/test.txt
      - name: Run unit tests and generate coverage report
        run: |
          coverage run --branch --source mmcv -m pytest tests
          coverage xml
          coverage report -m
  build_windows_without_ops:
    runs-on: windows-2019
    env:
      MMCV_WITH_OPS: 0
    strategy:
      matrix:
        python-version: [3.7]
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Upgrade pip and wheel
        run: python -m pip install pip wheel --upgrade
      - name: Install PyTorch
        run: pip install torch==1.8.1+cpu torchvision==0.9.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
      - name: Install MMEngine from main branch
        run: pip install git+https://github.com/open-mmlab/mmengine.git@main
      - name: Install ninja to speed the compilation
        run: pip install ninja psutil
      - name: Build MMCV from source
        run: pip install -e . -v
      - name: Install unit tests dependencies
        run: pip install -r requirements/test.txt
      - name: Run unit tests
        run: pytest tests --ignore=tests/test_ops --ignore tests/test_image/test_io.py
  build_windows:
    runs-on: windows-2019
    strategy:
      matrix:
        torch: [1.8.1, 2.1.0]
        include:
          - torch: 1.8.1
            torchvision: 0.9.1
            python-version: 3.7
          - torch: 2.1.0
            torchvision: 0.16.0
            python-version: 3.8
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Upgrade pip and wheel
        run: python -m pip install pip wheel --upgrade
      - name: Install PyTorch
        run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html
      - name: Install MMEngine from main branch
        run: pip install git+https://github.com/open-mmlab/mmengine.git@main
      - name: Install ninja to speed the compilation
        run: pip install ninja psutil
      - name: Build MMCV from source
        run: pip install -e . -v
      - name: Install unit tests dependencies
        run: pip install -r requirements/test.txt
      - name: Run unit tests
        run: pytest tests/ --ignore tests/test_image/test_io.py
  build_macos:
    runs-on: macos-latest
    strategy:
      matrix:
        torch: [1.8.1, 2.1.0]
        include:
          - torch: 1.8.1
            torchvision: 0.9.1
            python-version: 3.7
          - torch: 2.1.0
            torchvision: 0.16.0
            python-version: 3.8
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install system dependencies
        run: brew install ffmpeg jpeg-turbo
      - name: Upgrade pip and wheel
        run: pip install pip wheel --upgrade
      - name: Install PyTorch
        run: pip install torch==${{ matrix.torch }} torchvision==${{ matrix.torchvision }}
      - name: Install MMEngine from main branch
        run: pip install git+https://github.com/open-mmlab/mmengine.git@main
      - name: Install ninja to speed the compilation
        run: pip install ninja psutil
      - name: Build MMCV from source
        run: pip install -e . -v
      - name: Install unit tests dependencies
        run: pip install -r requirements/test.txt
      - name: Run unit tests
        run: pytest tests/


================================================
FILE: .github/workflows/pr_stage_test.yml
================================================
name: pr_stage_test

on:
  pull_request:
    paths-ignore:
      - ".github/**.md"
      - "docker/**"
      - "docs/**"
      - 'examples/**'
      - '.dev_scripts/**'
      - "README.md"
      - "README_zh-CN.md"
      - "CONTRIBUTING.md"
      - ".pre-commit-config.yaml"
      - ".pre-commit-config-zh-cn.yaml"

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # Build MMCV inside a CUDA 12.1 container and run the unit tests with
  # coverage measured on the mmcv package.
  build_cu121:
    runs-on: ubuntu-22.04
    container:
      image: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04
    strategy:
      matrix:
        python-version: ["3.10"]
        torch: ["2.5.1"]
    steps:
      - uses: actions/checkout@v3
      - name: Install basic tools
        run: |
          apt-get update
          apt-get install -y wget build-essential git software-properties-common

      # Install the requested Python version from the deadsnakes PPA and make
      # it the default `python` / `python3`.
      - name: Install Python ${{ matrix.python-version }}
        run: |
          add-apt-repository ppa:deadsnakes/ppa -y
          apt-get update
          apt-get install -y python${{ matrix.python-version }} python${{ matrix.python-version }}-dev python${{ matrix.python-version }}-distutils
          update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${{ matrix.python-version }} 1
          update-alternatives --install /usr/bin/python python /usr/bin/python${{ matrix.python-version }} 1
          wget https://bootstrap.pypa.io/get-pip.py
          python get-pip.py

      # Install PyTorch
      - name: Install PyTorch
        run: |
          pip install torch==${{ matrix.torch }} torchvision --index-url https://download.pytorch.org/whl/cu121
      - name: Fetch GPG keys
        run: |
          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
      - name: Install system dependencies
        run: apt-get update && apt-get install -y git ffmpeg libturbojpeg
      - name: Install MMEngine from main branch
        run: pip install git+https://github.com/open-mmlab/mmengine.git@main
      - name: Install ninja to speed the compilation
        run: pip install ninja psutil
      # `pip install -e .` builds this repository (MMCV), not MMEngine.
      - name: Build MMCV from source
        run: pip install -e . -v
      - name: Install unit tests dependencies
        run: |
          pip install -r requirements/test.txt
      # Distributed related unit test may randomly error in PyTorch 1.13.0
      # NOTE(review): the tests/test_dist/ ignore looks carried over from
      # MMEngine - confirm whether it is still needed here.
      - name: Run unittests and generate coverage report
        run: |
          coverage run --branch --source mmcv -m pytest tests/  --ignore tests/test_dist/
          coverage xml
          coverage report -m

  # Builds MMCV from source inside a CUDA 11.8 / cuDNN 8 devel container and
  # runs the unit tests with coverage for every PyTorch version in the matrix.
  build_cu118:
    runs-on: ubuntu-22.04
    container:
      image: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
    strategy:
      matrix:
        python-version: ["3.10"]
        torch: ["2.0.0", "2.1.0", "2.3.1"]
    steps:
      - uses: actions/checkout@v3
      - name: Install basic tools
        run: |
          apt-get update
          apt-get install -y wget build-essential git software-properties-common

      # Install the Python version selected by the matrix from the deadsnakes
      # PPA, make it the default `python`/`python3`, then bootstrap pip for it.
      - name: Install Python ${{ matrix.python-version }}
        run: |
          add-apt-repository ppa:deadsnakes/ppa -y
          apt-get update
          apt-get install -y python${{ matrix.python-version }} python${{ matrix.python-version }}-dev python${{ matrix.python-version }}-distutils
          update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${{ matrix.python-version }} 1
          update-alternatives --install /usr/bin/python python /usr/bin/python${{ matrix.python-version }} 1
          wget https://bootstrap.pypa.io/get-pip.py
          python get-pip.py

      # Install PyTorch from the cu118 wheel index so it matches the container's CUDA version.
      - name: Install PyTorch
        run: |
          pip install torch==${{ matrix.torch }} torchvision --index-url https://download.pytorch.org/whl/cu118
      # NOTE(review): these keys come from the ubuntu1804 repositories although
      # the container image is ubuntu22.04 - confirm this step is still needed.
      - name: Fetch GPG keys
        run: |
          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
      - name: Install system dependencies
        run: apt-get update && apt-get install -y git ffmpeg libturbojpeg
      - name: Install MMEngine from main branch
        run: pip install git+https://github.com/open-mmlab/mmengine.git@main
      - name: Install ninja to speed the compilation
        run: pip install ninja psutil
      # `pip install -e .` installs this repository (MMCV); MMEngine was
      # already installed from its own repository above.
      - name: Build MMCV from source
        run: pip install -e . -v
      # NOTE(review): numpy is pinned after the test requirements so the pin
      # wins; presumably needed by the older torch builds in the matrix - confirm.
      - name: Install unit tests dependencies
        run: |
          pip install -r requirements/test.txt
          pip install numpy==1.24.3
        # Distributed related unit test may randomly error in PyTorch 1.13.0
      # Coverage is measured on the `mmcv` package, which is the package under
      # test in this repository.
      - name: Run unittests and generate coverage report
        run: |
          coverage run --branch --source mmcv -m pytest tests/  --ignore tests/test_dist/
          coverage xml
          coverage report -m

  # Windows job that builds MMCV with MMCV_WITH_OPS=0 on Python 3.7 and a
  # CPU-only PyTorch 1.8.1, then runs the unit tests except the ops tests.
  build_windows_without_ops:
    runs-on: windows-2019
    env:
      # NOTE(review): presumably read by setup.py to skip compiling the ops - confirm.
      MMCV_WITH_OPS: 0
    strategy:
      matrix:
        python-version: [3.7]
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Upgrade pip and wheel
        run: python -m pip install pip wheel --upgrade
      # The torch/torchvision versions are hard-coded here rather than taken from the matrix.
      - name: Install PyTorch
        run: pip install torch==1.8.1+cpu torchvision==0.9.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
      - name: Install MMEngine from main branch
        run: pip install git+https://github.com/open-mmlab/mmengine.git@main
      - name: Install ninja to speed the compilation
        run: pip install ninja psutil
      - name: Build MMCV from source
        run: pip install -e . -v
      - name: Install unit tests dependencies
        run: pip install -r requirements/test.txt
      # tests/test_ops and tests/test_image/test_io.py are excluded from this run.
      - name: Run unit tests
        run: pytest tests --ignore=tests/test_ops --ignore tests/test_image/test_io.py
  # Windows job that builds MMCV from source against CPU-only PyTorch wheels
  # and runs the unit tests for each torch version in the matrix.
  build_windows:
    runs-on: windows-2019
    strategy:
      matrix:
        torch: [1.8.1, 2.1.0]
        # Each `include` entry adds the torchvision and Python versions that
        # go with the matching `torch` value above.
        include:
          - torch: 1.8.1
            torchvision: 0.9.1
            python-version: 3.7
          - torch: 2.1.0
            torchvision: 0.16.0
            python-version: 3.8
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Upgrade pip and wheel
        run: python -m pip install pip wheel --upgrade
      # Installs the `+cpu` builds of torch and torchvision from the torch_stable index.
      - name: Install PyTorch
        run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html
      - name: Install MMEngine from main branch
        run: pip install git+https://github.com/open-mmlab/mmengine.git@main
      - name: Install ninja to speed the compilation
        run: pip install ninja psutil
      - name: Build MMCV from source
        run: pip install -e . -v
      - name: Install unit tests dependencies
        run: pip install -r requirements/test.txt
      # tests/test_image/test_io.py is excluded from this run.
      - name: Run unit tests
        run: pytest tests/ --ignore tests/test_image/test_io.py
  # build_macos:
  #   runs-on: macos-latest
  #   strategy:
  #     matrix:
  #       torch: [1.8.1, 2.1.0]
  #       include:
  #         - torch: 1.8.1
  #           torchvision: 0.9.1
  #           python-version: 3.7.1
  #         - torch: 2.1.0
  #           torchvision: 0.16.0
  #           python-version: 3.8.1
  #   steps:
  #     - uses: actions/checkout@v2
  #     - name: Set up Python ${{ matrix.python-version }}
  #       uses: actions/setup-python@v2
  #       with:
  #         python-version: ${{ matrix.python-version }}
  #     - name: Install system dependencies
  #       run: brew install ffmpeg jpeg-turbo
  #     - name: Upgrade pip and wheel
  #       run: pip install pip wheel --upgrade
  #     - name: Install PyTorch
  #       run: pip install torch==${{ matrix.torch }} torchvision==${{ matrix.torchvision }}
  #     - name: Install MMEngine from main branch
  #       run: pip install git+https://github.com/open-mmlab/mmengine.git@main
  #     - name: Install ninja to speed the compilation
  #       run: pip install ninja psutil
  #     - name: Build MMCV from source
  #       run: pip install -e . -v
  #     - name: Install unit tests dependencies
  #       run: pip install -r requirements/test.txt
  #     - name: Run unit tests
  #       run: pytest tests/


================================================
FILE: .github/workflows/publish-to-pypi.yml
================================================
name: deploy

on: push

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # Builds and uploads the distribution whose MMCV_WITH_OPS default is '0'.
  # The workflow triggers on every push, but this job only runs for tag refs.
  build-n-publish_without_ops:
    runs-on: ubuntu-22.04
    if: startsWith(github.event.ref, 'refs/tags')
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.7
        uses: actions/setup-python@v1
        with:
          python-version: 3.7
      - name: Upgrade Setuptools
        run: pip install setuptools wheel --upgrade
      # The sed call rewrites setup.py so the default of MMCV_WITH_OPS becomes
      # '0' before the sdist and wheel are built.
      - name: Build MMCV
        run: |
          sed -i "s/os.getenv('MMCV_WITH_OPS', '1')/os.getenv('MMCV_WITH_OPS', '0')/g" setup.py
          python setup.py sdist bdist_wheel
      # Uploads everything in dist/ with the token stored in the `pypi_password` secret.
      - name: Publish distribution to PyPI
        run: |
          pip install twine
          twine upload dist/* -u __token__ -p ${{ secrets.pypi_password }}

  # Builds and uploads the source distribution with setup.py left unmodified.
  # The workflow triggers on every push, but this job only runs for tag refs.
  build-n-publish_with_ops:
    runs-on: ubuntu-22.04
    if: startsWith(github.event.ref, 'refs/tags')
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.7
        uses: actions/setup-python@v1
        with:
          python-version: 3.7
      - name: Upgrade Setuptools
        run: pip install setuptools --upgrade
      # Only an sdist is produced here; no wheel is built in this job.
      - name: Build MMCV with ops
        run: python setup.py sdist
      # Uploads everything in dist/ with the token stored in the `pypi_password` secret.
      - name: Publish distribution to PyPI
        run: |
          pip install twine
          twine upload dist/* -u __token__ -p ${{ secrets.pypi_password }}


================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# PyTorch checkpoint
*.pth

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
mlu-ops/
mlu-ops.*

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/en/_build/
docs/en/api/generated/
docs/zh_cn/_build/
docs/zh_cn/api/generated/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# editors and IDEs
.idea/
.vscode/

# custom
.DS_Store

# datasets and logs and checkpoints
data/
work_dir/

src/


================================================
FILE: .pre-commit-config-zh-cn.yaml
================================================
exclude: ^tests/data/
repos:
  - repo: https://github.com/pre-commit/pre-commit
    rev: v4.0.0
    hooks:
      - id: validate_manifest
  - repo: https://github.com/PyCQA/flake8
    rev: 7.1.1
    hooks:
      - id: flake8
  - repo: https://gitee.com/openmmlab/mirrors-isort
    rev: 5.11.5
    hooks:
      - id: isort
  - repo: https://gitee.com/openmmlab/mirrors-yapf
    rev: v0.32.0
    hooks:
      - id: yapf
  - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: trailing-whitespace
      - id: check-yaml
      - id: end-of-file-fixer
      - id: requirements-txt-fixer
      - id: double-quote-string-fixer
      - id: check-merge-conflict
      - id: fix-encoding-pragma
        args: ["--remove"]
      - id: mixed-line-ending
        args: ["--fix=lf"]
  - repo: https://gitee.com/openmmlab/mirrors-codespell
    rev: v2.2.1
    hooks:
      - id: codespell
  - repo: https://gitee.com/openmmlab/mirrors-mdformat
    rev: 0.7.9
    hooks:
      - id: mdformat
        args: ["--number"]
        additional_dependencies:
          - mdformat-openmmlab
          - mdformat_frontmatter
          - linkify-it-py
  - repo: https://gitee.com/openmmlab/mirrors-docformatter
    # TODO:https://github.com/PyCQA/docformatter/issues/289
    rev: v1.3.1
    hooks:
      - id: docformatter
        args: ["--in-place", "--wrap-descriptions", "79"]
  - repo: https://github.com/asottile/pyupgrade
    rev: v3.0.0
    hooks:
      - id: pyupgrade
        args: ["--py36-plus"]
  - repo: https://gitee.com/openmmlab/pre-commit-hooks
    rev: v0.2.0  # Use the ref you want to point at
    hooks:
      - id: check-copyright
        args: ["mmcv", "tests", "--excludes", "mmcv/ops"]
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.2.0
    hooks:
      - id: mypy
        exclude: |-
          (?x)(
              ^test
              | ^docs
          )
        additional_dependencies: ["types-setuptools", "types-requests"]
  # - repo: local
  #   hooks:
  #     - id: clang-format
  #       name: clang-format
  #       description: Format files with ClangFormat
  #       entry: clang-format -style=google -i
  #       language: system
  #       files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$


================================================
FILE: .pre-commit-config.yaml
================================================
exclude: ^tests/data/
repos:
  - repo: https://github.com/pre-commit/pre-commit
    rev: v4.0.0
    hooks:
      - id: validate_manifest
  - repo: https://github.com/PyCQA/flake8
    rev: 7.1.1
    hooks:
      - id: flake8
  - repo: https://github.com/PyCQA/isort
    rev: 5.11.5
    hooks:
      - id: isort
  - repo: https://github.com/pre-commit/mirrors-yapf
    rev: v0.32.0
    hooks:
      - id: yapf
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: trailing-whitespace
      - id: check-yaml
      - id: end-of-file-fixer
      - id: requirements-txt-fixer
      - id: double-quote-string-fixer
      - id: check-merge-conflict
      - id: fix-encoding-pragma
        args: ["--remove"]
      - id: mixed-line-ending
        args: ["--fix=lf"]
  - repo: https://github.com/codespell-project/codespell
    rev: v2.2.1
    hooks:
      - id: codespell
  - repo: https://github.com/executablebooks/mdformat
    rev: 0.7.9
    hooks:
      - id: mdformat
        args: ["--number"]
        additional_dependencies:
          - mdformat-openmmlab
          - mdformat_frontmatter
          - linkify-it-py
  - repo: https://github.com/myint/docformatter
    rev: 06907d0
    hooks:
      - id: docformatter
        args: ["--in-place", "--wrap-descriptions", "79"]
  - repo: https://github.com/asottile/pyupgrade
    rev: v3.0.0
    hooks:
      - id: pyupgrade
        args: ["--py36-plus"]
  - repo: https://github.com/open-mmlab/pre-commit-hooks
    rev: v0.2.0  # Use the ref you want to point at
    hooks:
      - id: check-copyright
        args: ["mmcv", "tests", "--excludes", "mmcv/ops"]
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.2.0
    hooks:
      - id: mypy
        exclude: |-
          (?x)(
              ^test
              | ^docs
          )
        additional_dependencies: ["types-setuptools", "types-requests"]
  # - repo: local
  #   hooks:
  #     - id: clang-format
  #       name: clang-format
  #       description: Format files with ClangFormat
  #       entry: clang-format -style=google -i
  #       language: system
  #       files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$


================================================
FILE: .readthedocs.yml
================================================
version: 2

formats: all

build:
  os: ubuntu-22.04
  tools:
    python: "3.7"

python:
  install:
    - requirements: requirements/runtime.txt
    - requirements: requirements/docs.txt


================================================
FILE: CITATION.cff
================================================
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
  - name: "MMCV Contributors"
title: "OpenMMLab Computer Vision Foundation"
date-released: 2018-08-22
url: "https://github.com/open-mmlab/mmcv"
license: Apache-2.0


================================================
FILE: CONTRIBUTING.md
================================================
## Contributing to OpenMMLab

Welcome to the MMCV community, we are committed to building a cutting-edge computer vision foundational library and all kinds of contributions are welcomed, including but not limited to

**Fix bug**

You can directly post a Pull Request to fix typos in code or documents.

The steps to fix the bug of code implementation are as follows.

1. If the modification involves significant changes, you should create an issue first and describe the error information and how to trigger the bug. Other developers will discuss with you and propose a proper solution.

2. Post a pull request after fixing the bug and adding the corresponding unit test.

**New Feature or Enhancement**

1. If the modification involves significant changes, you should create an issue to discuss with our developers to propose a proper design.
2. Post a Pull Request after implementing the new feature or enhancement and adding the corresponding unit test.

**Document**

You can directly post a pull request to fix documents. If you want to add a document, you should first create an issue to check if it is reasonable.

### Pull Request Workflow

If you're not familiar with Pull Request, don't worry! The following guidance will tell you how to create a Pull Request step by step. If you want to dive into the develop mode of Pull Request, you can refer to the [official documents](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests)

#### 1. Fork and clone

If you are posting a pull request for the first time, you should fork the OpenMMLab repositories by clicking the **Fork** button in the top right corner of the GitHub page, and the forked repositories will appear under your GitHub profile.

<img src="https://user-images.githubusercontent.com/57566630/167305749-43c7f4e9-449b-4e98-ade5-0c9276d5c9ce.png" width="1200">

Then, you can clone the repositories to local:

```shell
git clone git@github.com:{username}/mmcv.git
```

After that, you should add the official repository as the upstream repository

```bash
git remote add upstream git@github.com:open-mmlab/mmcv
```

Check whether remote repository has been added successfully by `git remote -v`

```bash
origin	git@github.com:{username}/mmcv.git (fetch)
origin	git@github.com:{username}/mmcv.git (push)
upstream	git@github.com:open-mmlab/mmcv (fetch)
upstream	git@github.com:open-mmlab/mmcv (push)
```

> Here's a brief introduction to origin and upstream. When we use "git clone", we create an "origin" remote by default, which points to the repository cloned from. As for "upstream", we add it ourselves to point to the target repository. Of course, if you don't like the name "upstream", you could name it as you wish. Usually, we'll push the code to "origin". If the pushed code conflicts with the latest code in official("upstream"), we should pull the latest code from upstream to resolve the conflicts, and then push to "origin" again. The posted Pull Request will be updated automatically.

#### 2. Configure pre-commit

You should configure [pre-commit](https://pre-commit.com/#intro) in the local development environment to make sure the code style matches that of OpenMMLab. **Note**: The following code should be executed under the MMCV directory.

```shell
pip install -U pre-commit
pre-commit install
```

Check that pre-commit is configured successfully, and install the hooks defined in `.pre-commit-config.yaml`.

```shell
pre-commit run --all-files
```

<img src="https://user-images.githubusercontent.com/57566630/173660750-3df20a63-cb66-4d33-a986-1f643f1d8aaf.png" width="1200">

<img src="https://user-images.githubusercontent.com/57566630/202368856-0465a90d-8fce-4345-918e-67b8b9c82614.png" width="1200">

If the installation process is interrupted, you can repeatedly run `pre-commit run ... ` to continue the installation.

If the code does not conform to the code style specification, pre-commit will raise a warning and fix some of the errors automatically.

<img src="https://user-images.githubusercontent.com/57566630/202369176-67642454-0025-4023-a095-263529107aa3.png" width="1200">

If we want to commit our code bypassing the pre-commit hook, we can use the `--no-verify` option (**only for temporary commits**).

```shell
git commit -m "xxx" --no-verify
```

#### 3. Create a development branch

After configuring the pre-commit, we should create a branch based on the master branch to develop the new feature or fix the bug. The proposed branch name is `username/pr_name`

```shell
git checkout -b yhc/refactor_contributing_doc
```

In subsequent development, if the master branch of the local repository is behind the master branch of "upstream", we need to pull the upstream for synchronization, and then execute the above command:

```shell
git pull upstream master
```

#### 4. Commit the code and pass the unit test

- MMCV introduces mypy to do static type checking to increase the robustness of the code. Therefore, we need to add Type Hints to our code and pass the mypy check. If you are not familiar with Type Hints, you can refer to [this tutorial](https://docs.python.org/3/library/typing.html).

- The committed code should pass through the unit test

  ```shell
  # Pass all unit tests
  pytest tests

  # Pass the unit test of runner
  pytest tests/test_runner/test_runner.py
  ```

  If the unit test fails for lack of dependencies, you can install the dependencies referring to the [guidance](#unit-test)

- If the documents are modified/added, we should check the rendering result referring to [guidance](#document-rendering)

#### 5. Push the code to remote

We could push the local commits to remote after passing through the check of unit test and pre-commit. You can associate the local branch with remote branch by adding `-u` option.

```shell
git push -u origin {branch_name}
```

This will allow you to use the `git push` command to push code directly next time, without having to specify a branch or the remote repository.

#### 6. Create a Pull Request

(1) Create a pull request in GitHub's Pull request interface

<img src="https://user-images.githubusercontent.com/57566630/201533288-516f7ac4-0b14-4dc8-afbd-912475c368b5.png" width="1200">

(2) Modify the PR description according to the guidelines so that other developers can better understand your changes

<img src="https://user-images.githubusercontent.com/57566630/202242953-c91a18ff-e388-4ff9-8591-5fae0ead6c1e.png" width="1200">

Find more details about Pull Request description in [pull request guidelines](#pr-specs).

**note**

(a) The Pull Request description should contain the reason for the change, the content of the change, and the impact of the change, and be associated with the relevant Issue (see [documentation](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)).

(b) If it is your first contribution, please sign the CLA

<img src="https://user-images.githubusercontent.com/57566630/167307569-a794b967-6e28-4eac-a942-00deb657815f.png" width="1200">

(c) Check whether the Pull Request passes the CI

<img src="https://user-images.githubusercontent.com/57566630/167307490-f9ebf9fa-63c0-4d83-8ba1-081ea169eb3a.png" width="1200">

MMCV will run unit tests for the posted Pull Request on different platforms (Linux, Windows, Mac), based on different versions of Python, PyTorch, CUDA to make sure the code is correct. We can see the specific test information by clicking `Details` in the above image so that we can modify the code.

(3) If the Pull Request passes the CI, then you can wait for the review from other developers. You'll modify the code based on the reviewer's comments, and repeat the steps [4](#4-commit-the-code-and-pass-the-unit-test)-[5](#5-push-the-code-to-remote) until all reviewers approve it. Then, we will merge it ASAP.

<img src="https://user-images.githubusercontent.com/57566630/202145400-cc2cd8c4-10b0-472f-ba37-07e6f50acc67.png" width="1200">

#### 7. Resolve conflicts

If your local branch conflicts with the latest master branch of "upstream", you'll need to resolve them. There are two ways to do this:

```shell
git fetch --all --prune
git rebase upstream/master
```

or

```shell
git fetch --all --prune
git merge upstream/master
```

If you are very good at handling conflicts, then you can use rebase to resolve conflicts, as this will keep your commit logs tidy. If you are not familiar with `rebase`, then you can use `merge` to resolve conflicts.

### Guidance

#### Unit test

If you cannot run the unit test of some modules due to missing dependencies, such as the [video](https://github.com/open-mmlab/mmcv/tree/master/mmcv/video) module, you can try to install the following dependencies:

```shell
# Linux
sudo apt-get update -y
sudo apt-get install -y libturbojpeg
sudo apt-get install -y ffmpeg

# Windows
conda install ffmpeg
```

We should also make sure the committed code will not decrease the coverage of unit test, we could run the following command to check the coverage of unit test:

```shell
python -m coverage run -m pytest /path/to/test_file
python -m coverage html
# check file in htmlcov/index.html
```

#### Document rendering

If the documents are modified/added, we should check the rendering result. We could install the dependencies and run the following command to render the documents and check the results:

```shell
pip install -r requirements/docs.txt
cd docs/zh_cn/
# or docs/en
make html
# check file in ./docs/zh_cn/_build/html/index.html
```

### Code style

#### Python

We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style.

We use the following tools for linting and formatting:

- [flake8](https://github.com/PyCQA/flake8): A wrapper around some linter tools.
- [isort](https://github.com/timothycrosley/isort): A Python utility to sort imports.
- [yapf](https://github.com/google/yapf): A formatter for Python files.
- [codespell](https://github.com/codespell-project/codespell): A Python utility to fix common misspellings in text files.
- [mdformat](https://github.com/executablebooks/mdformat): Mdformat is an opinionated Markdown formatter that can be used to enforce a consistent style in Markdown files.
- [docformatter](https://github.com/myint/docformatter): A formatter to format docstring.

Style configurations of yapf and isort can be found in [setup.cfg](./setup.cfg).

We use [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`,
fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, sorts `requirements.txt` automatically on every commit.
The config for a pre-commit hook is stored in [.pre-commit-config](./.pre-commit-config.yaml).

#### C++ and CUDA

We follow the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).

### PR Specs

1. Use [pre-commit](https://pre-commit.com) hook to avoid issues of code style

2. One short-time branch should be matched with only one PR

3. Accomplish a detailed change in one PR. Avoid large PR

   - Bad: Support Faster R-CNN
   - Acceptable: Add a box head to Faster R-CNN
   - Good: Add a parameter to box head to support custom conv-layer number

4. Provide clear and significant commit message

5. Provide clear and meaningful PR description

   - Task name should be clarified in title. The general format is: \[Prefix\] Short description of the PR (Suffix)
   - Prefix: add new feature \[Feature\], fix bug \[Fix\], related to documents \[Docs\], in developing \[WIP\] (which will not be reviewed temporarily)
   - Introduce main changes, results and influences on other modules in short description
   - Associate related issues and pull requests with a milestone


================================================
FILE: CONTRIBUTING_zh-CN.md
================================================
## 贡献代码

欢迎加入 MMCV 社区，我们致力于打造最前沿的计算机视觉基础库，我们欢迎任何类型的贡献，包括但不限于

**修复错误**

修复代码实现错误的步骤如下：

1. 如果提交的代码改动较大，建议先提交 issue，并正确描述 issue 的现象、原因和复现方式，讨论后确认修复方案。
2. 修复错误并补充相应的单元测试，提交拉取请求。

**新增功能或组件**

1. 如果新功能或模块涉及较大的代码改动，建议先提交 issue，确认功能的必要性。
2. 实现新增功能并添加单元测试，提交拉取请求。

**文档补充**

修复文档可以直接提交拉取请求

添加文档或将文档翻译成其他语言步骤如下

1. 提交 issue，确认添加文档的必要性。
2. 添加文档，提交拉取请求。

### 拉取请求工作流

如果你对拉取请求不了解，没关系，接下来的内容将会从零开始，一步一步地指引你如何创建一个拉取请求。如果你想深入了解拉取请求的开发模式，可以参考 github [官方文档](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests)

#### 1. 复刻仓库

当你第一次提交拉取请求时，先复刻 OpenMMLab 原代码库，点击 GitHub 页面右上角的 **Fork** 按钮，复刻后的代码库将会出现在你的 GitHub 个人主页下。

<img src="https://user-images.githubusercontent.com/57566630/167305749-43c7f4e9-449b-4e98-ade5-0c9276d5c9ce.png" width="1200">

将代码克隆到本地

```shell
git clone git@github.com:{username}/mmcv.git
```

添加原代码库为上游代码库

```bash
git remote add upstream git@github.com:open-mmlab/mmcv
```

检查 remote 是否添加成功，在终端输入 `git remote -v`

```bash
origin	git@github.com:{username}/mmcv.git (fetch)
origin	git@github.com:{username}/mmcv.git (push)
upstream	git@github.com:open-mmlab/mmcv (fetch)
upstream	git@github.com:open-mmlab/mmcv (push)
```

> 这里对 origin 和 upstream 进行一个简单的介绍，当我们使用 git clone 来克隆代码时，会默认创建一个 origin 的 remote，它指向我们克隆的代码库地址，而 upstream 则是我们自己添加的，用来指向原始代码库地址。当然如果你不喜欢他叫 upstream，也可以自己修改，比如叫 open-mmlab。我们通常向 origin 提交代码（即 fork 下来的远程仓库），然后向 upstream 提交一个 pull request。如果提交的代码和最新的代码发生冲突，再从 upstream 拉取最新的代码，和本地分支解决冲突，再提交到 origin。

#### 2. 配置 pre-commit

在本地开发环境中，我们使用 [pre-commit](https://pre-commit.com/#intro) 来检查代码风格，以确保代码风格的统一。在提交代码前，需要先安装 pre-commit（需要在 MMCV 目录下执行）:

```shell
pip install -U pre-commit
pre-commit install
```

检查 pre-commit 是否配置成功，并安装 `.pre-commit-config.yaml` 中的钩子：

```shell
pre-commit run --all-files
```

<img src="https://user-images.githubusercontent.com/57566630/173660750-3df20a63-cb66-4d33-a986-1f643f1d8aaf.png" width="1200">

<img src="https://user-images.githubusercontent.com/57566630/202368856-0465a90d-8fce-4345-918e-67b8b9c82614.png" width="1200">

> 如果你是中国用户，由于网络原因，可能会出现安装失败的情况，这时可以使用国内源

> pre-commit install -c .pre-commit-config-zh-cn.yaml

> pre-commit run --all-files -c .pre-commit-config-zh-cn.yaml

如果安装过程被中断，可以重复执行 `pre-commit run ...` 继续安装。

如果提交的代码不符合代码风格规范，pre-commit 会发出警告，并自动修复部分错误。

<img src="https://user-images.githubusercontent.com/57566630/202369176-67642454-0025-4023-a095-263529107aa3.png" width="1200">

如果我们想临时绕开 pre-commit 的检查提交一次代码，可以在 `git commit` 时加上 `--no-verify`（需要保证最后推送至远程仓库的代码能够通过 pre-commit 检查）。

```shell
git commit -m "xxx" --no-verify
```

#### 3. 创建开发分支

安装完 pre-commit 之后，我们需要基于 master 创建开发分支，建议的分支命名规则为 `username/pr_name`。

```shell
git checkout -b yhc/refactor_contributing_doc
```

在后续的开发中，如果本地仓库的 master 分支落后于 upstream 的 master 分支，我们需要先拉取 upstream 的代码进行同步，再执行上面的命令

```shell
git pull upstream master
```

#### 4. 提交代码并在本地通过单元测试

- MMCV 引入了 mypy 来做静态类型检查，以增加代码的鲁棒性。因此我们在提交代码时，需要补充 Type Hints。具体规则可以参考[教程](https://zhuanlan.zhihu.com/p/519335398)。

- 提交的代码同样需要通过单元测试

  ```shell
  # 通过全量单元测试
  pytest tests

  # 我们需要保证提交的代码能够通过修改模块的单元测试，以 runner 为例
  pytest tests/test_runner/test_runner.py
  ```

  如果你由于缺少依赖无法运行修改模块的单元测试，可以参考[指引-单元测试](#单元测试)

- 如果修改/添加了文档，参考[指引](#文档渲染)确认文档渲染正常。

#### 5. 推送代码到远程

代码通过单元测试和 pre-commit 检查后，将代码推送到远程仓库，如果是第一次推送，可以在 `git push` 后加上 `-u` 参数以关联远程分支

```shell
git push -u origin {branch_name}
```

这样下次就可以直接使用 `git push` 命令推送代码了，而无需指定分支和远程仓库。

#### 6. 提交拉取请求（PR）

(1) 在 GitHub 的 Pull request 界面创建拉取请求
<img src="https://user-images.githubusercontent.com/57566630/201533288-516f7ac4-0b14-4dc8-afbd-912475c368b5.png" width="1200">

(2) 根据指引修改 PR 描述，以便于其他开发者更好地理解你的修改

<img src="https://user-images.githubusercontent.com/57566630/202242953-c91a18ff-e388-4ff9-8591-5fae0ead6c1e.png" width="1200">

描述规范详见[拉取请求规范](#拉取请求规范)

&#160;

**注意事项**

(a) PR 描述应该包含修改理由、修改内容以及修改后带来的影响，并关联相关 Issue（具体方式见[文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)）

(b) 如果是第一次为 OpenMMLab 做贡献，需要签署 CLA

<img src="https://user-images.githubusercontent.com/57566630/167307569-a794b967-6e28-4eac-a942-00deb657815f.png" width="1200">

(c) 检查提交的 PR 是否通过 CI（集成测试）

<img src="https://user-images.githubusercontent.com/57566630/167307490-f9ebf9fa-63c0-4d83-8ba1-081ea169eb3a.png" width="1200">

MMCV 会在不同的平台（Linux、Windows、Mac），基于不同版本的 Python、PyTorch、CUDA 对提交的代码进行单元测试，以保证代码的正确性，如果有任何一个没有通过，我们可点击上图中的 `Details` 来查看具体的测试信息，以便于我们修改代码。

(3) 如果 PR 通过了 CI，那么就可以等待其他开发者的 review，并根据 reviewer 的意见，修改代码，并重复 [4](#4-提交代码并在本地通过单元测试)-[5](#5-推送代码到远程) 步骤，直到 reviewer 同意合入 PR。

<img src="https://user-images.githubusercontent.com/57566630/202145400-cc2cd8c4-10b0-472f-ba37-07e6f50acc67.png" width="1200">

所有 reviewer 同意合入 PR 后，我们会尽快将 PR 合并到主分支。

#### 7. 解决冲突

随着时间的推移，我们的代码库会不断更新，这时候，如果你的 PR 与主分支存在冲突，你需要解决冲突，解决冲突的方式有两种：

```shell
git fetch --all --prune
git rebase upstream/master
```

或者

```shell
git fetch --all --prune
git merge upstream/master
```

如果你非常善于处理冲突，那么可以使用 rebase 的方式来解决冲突，因为这能够保证你的 commit log 的整洁。如果你不太熟悉 `rebase` 的使用，那么可以使用 `merge` 的方式来解决冲突。

### 指引

#### 单元测试

如果你无法正常执行部分模块的单元测试，例如 [video](https://github.com/open-mmlab/mmcv/tree/master/mmcv/video) 模块，可能是你的当前环境没有安装以下依赖

```shell
# Linux
sudo apt-get update -y
sudo apt-get install -y libturbojpeg
sudo apt-get install -y ffmpeg

# Windows
conda install ffmpeg
```

在提交修复代码错误或新增特性的拉取请求时，我们应该尽可能的让单元测试覆盖所有提交的代码，计算单元测试覆盖率的方法如下

```shell
python -m coverage run -m pytest /path/to/test_file
python -m coverage html
# check file in htmlcov/index.html
```

#### 文档渲染

在提交修复代码错误或新增特性的拉取请求时，可能会需要修改/新增模块的 docstring。我们需要确认渲染后的文档样式是正确的。
本地生成渲染后的文档的方法如下

```shell
pip install -r requirements/docs.txt
cd docs/zh_cn/
# or docs/en
make html
# check file in ./docs/zh_cn/_build/html/index.html
```

### 代码风格

#### Python

[PEP8](https://www.python.org/dev/peps/pep-0008/) 作为 OpenMMLab 算法库首选的代码规范，我们使用以下工具检查和格式化代码

- [flake8](https://github.com/PyCQA/flake8): Python 官方发布的代码规范检查工具，是多个检查工具的封装
- [isort](https://github.com/timothycrosley/isort): 自动调整模块导入顺序的工具
- [yapf](https://github.com/google/yapf): Google 发布的代码规范检查工具
- [codespell](https://github.com/codespell-project/codespell): 检查单词拼写是否有误
- [mdformat](https://github.com/executablebooks/mdformat): 检查 markdown 文件的工具
- [docformatter](https://github.com/myint/docformatter): 格式化 docstring 的工具

yapf 和 isort 的配置可以在 [setup.cfg](./setup.cfg) 找到

通过配置 [pre-commit hook](https://pre-commit.com/) ，我们可以在提交代码时自动检查和格式化 `flake8`、`yapf`、`isort`、`trailing whitespaces`、`markdown files`，
修复 `end-of-files`、`double-quoted-strings`、`python-encoding-pragma`、`mixed-line-ending`，调整 `requirements.txt` 的包顺序。
pre-commit 钩子的配置可以在 [.pre-commit-config](./.pre-commit-config.yaml) 找到。

pre-commit 具体的安装使用方式见[拉取请求](#2-配置-pre-commit)。

更具体的规范请参考 [OpenMMLab 代码规范](code_style.md)。

#### C++ and CUDA

C++ 和 CUDA 的代码规范遵从 [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html)

### 拉取请求规范

1. 使用 [pre-commit hook](https://pre-commit.com)，尽量减少代码风格相关问题

2. 一个`拉取请求`对应一个短期分支

3. 粒度要细，一个`拉取请求`只做一件事情，避免超大的`拉取请求`

   - Bad：实现 Faster R-CNN
   - Acceptable：给 Faster R-CNN 添加一个 box head
   - Good：给 box head 增加一个参数来支持自定义的 conv 层数

4. 每次 Commit 时需要提供清晰且有意义的 commit 信息

5. 提供清晰且有意义的`拉取请求`描述

   - 标题写明白任务名称，一般格式:\[Prefix\] Short description of the pull request (Suffix)
   - prefix: 新增功能 \[Feature\], 修 bug \[Fix\], 文档相关 \[Docs\], 开发中 \[WIP\] (暂时不会被review)
   - 描述里介绍`拉取请求`的主要修改内容，结果，以及对其他部分的影响, 参考`拉取请求`模板
   - 关联相关的`议题` (issue) 和其他`拉取请求`

6. 如果引入了其他三方库，或借鉴了三方库的代码，请确认他们的许可证和 mmcv 兼容，并在借鉴的代码上补充 `This code is inspired from http://`


================================================
FILE: LICENSE
================================================
Copyright (c) OpenMMLab. All rights reserved

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2018-2020 Open-MMLab. All rights reserved.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: LICENSES.md
================================================
# Licenses for special operations

In this file, we list the operations with other licenses instead of Apache 2.0. Users should be careful about adopting these operations in any commercial matters.

|    Operation     |                                                                             Files                                                                              |    License     |
| :--------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------: |
|    upfirdn2d     |          [mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu)          | NVIDIA License |
| fused_leaky_relu | [mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu) | NVIDIA License |
|     bias_act     |             [mmcv/ops/csrc/pytorch/cuda/bias_act_cuda.cu](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/csrc/pytorch/cuda/bias_act_cuda.cu)             | NVIDIA License |
|  filtered_lrelu  |            [mmcv/ops/csrc/pytorch/cuda/filtered_lrelu.cu](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/csrc/pytorch/cuda/filtered_lrelu.cu)            | NVIDIA License |
|  conv2d_gradfix  |                              [mmcv/ops/conv2d_gradfix.py](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/conv2d_gradfix.py)                              | NVIDIA License |


================================================
FILE: MANIFEST.in
================================================
include requirements/runtime.txt
include mmcv/ops/csrc/common/cuda/*.cuh mmcv/ops/csrc/common/cuda/*.hpp mmcv/ops/csrc/common/*.hpp
include mmcv/ops/csrc/pytorch/*.cpp mmcv/ops/csrc/pytorch/cuda/*.cu mmcv/ops/csrc/pytorch/cuda/*.cpp mmcv/ops/csrc/pytorch/cpu/*.cpp
include mmcv/ops/csrc/parrots/*.h mmcv/ops/csrc/parrots/*.cpp
include mmcv/ops/csrc/pytorch/mps/*.mm mmcv/ops/csrc/common/mps/*.h mmcv/ops/csrc/common/mps/*.mm
recursive-include mmcv/ops/csrc/ *.h *.hpp *.cpp *.cuh *.cu *.mm


================================================
FILE: README.md
================================================
<div align="center">
  <img src="https://raw.githubusercontent.com/open-mmlab/mmcv/main/docs/en/mmcv-logo.png" width="300"/>
  <div>&nbsp;</div>
  <div align="center">
    <b><font size="5">OpenMMLab website</font></b>
    <sup>
      <a href="https://openmmlab.com">
        <i><font size="4">HOT</font></i>
      </a>
    </sup>
    &nbsp;&nbsp;&nbsp;&nbsp;
    <b><font size="5">OpenMMLab platform</font></b>
    <sup>
      <a href="https://platform.openmmlab.com">
        <i><font size="4">TRY IT OUT</font></i>
      </a>
    </sup>
  </div>
  <div>&nbsp;</div>

[![platform](https://img.shields.io/badge/platform-Linux%7CWindows%7CmacOS-blue)](https://mmcv.readthedocs.io/en/latest/get_started/installation.html)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/)
[![pytorch](https://img.shields.io/badge/pytorch-1.8~2.0-orange)](https://pytorch.org/get-started/previous-versions/)
[![cuda](https://img.shields.io/badge/cuda-10.1~11.8-green)](https://developer.nvidia.com/cuda-downloads)
[![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv)
[![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions)
[![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv)
[![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE)

[📘Documentation](https://mmcv.readthedocs.io/en/latest/) |
[🛠️Installation](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) |
[🤔Reporting Issues](https://github.com/open-mmlab/mmcv/issues/new/choose)

</div>

<div align="center">

English | [简体中文](README_zh-CN.md)

</div>

## Highlights

The OpenMMLab team released a new generation of training engine [MMEngine](https://github.com/open-mmlab/mmengine) at the World Artificial Intelligence Conference on September 1, 2022. It is a foundational library for training deep learning models. Compared with MMCV, it provides a universal and powerful runner, an open architecture with a more unified interface, and a more customizable training process.

The official MMCV v2.0.0 release was published on April 6, 2023. Version 2.x removed the components related to the training process and added a data transformation module. Also, starting from 2.x, the packages were renamed: **mmcv** became **mmcv-lite** and **mmcv-full** became **mmcv**. For details, see the [Compatibility Documentation](docs/en/compatibility.md).

MMCV will maintain both [1.x](https://github.com/open-mmlab/mmcv/tree/1.x) (corresponding to the original [master](https://github.com/open-mmlab/mmcv/tree/master) branch) and **2.x** (corresponding to the **main** branch, now the default branch) versions simultaneously. For details, see [Branch Maintenance Plan](README.md#branch-maintenance-plan).

## Introduction

MMCV is a foundational library for computer vision research and it provides the following functionalities:

- [Image/Video processing](https://mmcv.readthedocs.io/en/latest/understand_mmcv/data_process.html)
- [Image and annotation visualization](https://mmcv.readthedocs.io/en/latest/understand_mmcv/visualization.html)
- [Image transformation](https://mmcv.readthedocs.io/en/latest/understand_mmcv/data_transform.html)
- [Various CNN architectures](https://mmcv.readthedocs.io/en/latest/understand_mmcv/cnn.html)
- [High-quality implementation of common CPU and CUDA ops](https://mmcv.readthedocs.io/en/latest/understand_mmcv/ops.html)

It supports the following systems:

- Linux
- Windows
- macOS

See the [documentation](http://mmcv.readthedocs.io/en/latest) for more features and usage.

Note: MMCV requires Python 3.7+.

## Installation

There are two versions of MMCV:

- **mmcv**: comprehensive, with full features and various CUDA ops out of the box. It takes longer to build.
- **mmcv-lite**: lite, without CUDA ops but with all other features, similar to mmcv\<1.0.0. It is useful when you do not need those CUDA ops.

**Note**: Do not install both versions in the same environment, otherwise you may encounter errors like `ModuleNotFound`. You need to uninstall one before installing the other. `Installing the full version is highly recommended if CUDA is available`.

### Install mmcv

Before installing mmcv, make sure that PyTorch has been successfully installed following the [PyTorch official installation guide](https://github.com/pytorch/pytorch#installation). For Apple silicon users, please use PyTorch 1.13+.

The command to install mmcv:

```bash
pip install -U openmim
mim install mmcv
```

If you need to specify the version of mmcv, you can use the following command:

```bash
mim install mmcv==2.0.0
```

If you find that the above installation command does not use a pre-built package ending with `.whl` but a source package ending with `.tar.gz`, there may be no pre-built package corresponding to your PyTorch, CUDA, or mmcv version. In that case, you can [build mmcv from source](https://mmcv.readthedocs.io/en/latest/get_started/build.html).

<details>
<summary>Installation log using pre-built packages</summary>

Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html<br />
Collecting mmcv<br />
<b>Downloading https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/mmcv-2.0.0-cp38-cp38-manylinux1_x86_64.whl</b>

</details>

<details>
<summary>Installation log using source packages</summary>

Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html<br />
Collecting mmcv==2.0.0<br />
<b>Downloading mmcv-2.0.0.tar.gz</b>

</details>

For more installation methods, please refer to the [Installation documentation](https://mmcv.readthedocs.io/en/latest/get_started/installation.html).

### Install mmcv-lite

If you need to use PyTorch-related modules, make sure PyTorch has been successfully installed in your environment by referring to the [PyTorch official installation guide](https://github.com/pytorch/pytorch#installation).

```bash
pip install -U openmim
mim install mmcv-lite
```

## FAQ

If you face installation problems, CUDA-related issues, or RuntimeErrors, you may first refer to the [Frequently Asked Questions](https://mmcv.readthedocs.io/en/latest/faq.html) to see if there is a solution. If the problem is still not solved, feel free to open an [issue](https://github.com/open-mmlab/mmcv/issues).

## Citation

If you find this project useful in your research, please consider citing:

```latex
@misc{mmcv,
    title={{MMCV: OpenMMLab} Computer Vision Foundation},
    author={MMCV Contributors},
    howpublished = {\url{https://github.com/open-mmlab/mmcv}},
    year={2018}
}
```

## Contributing

We appreciate all contributions to improve MMCV. Please refer to [CONTRIBUTING.md](CONTRIBUTING.md) for the contributing guideline.

## License

MMCV is released under the Apache 2.0 license, while some specific operations in this library are with other licenses. Please refer to [LICENSES.md](LICENSES.md) for the careful check, if you are using our code for commercial matters.

## Branch Maintenance Plan

MMCV currently has four branches, namely main, 1.x, master, and 2.x, where 2.x is an alias for the main branch, and master is an alias for the 1.x branch. The 2.x and master branches will be deleted in the future. MMCV's branches go through the following three stages:

| Phase                | Time                  | Branch                                                                                                                              | description                                                                                                                                            |
| -------------------- | --------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ |
| RC Period            | 2022.9.1 - 2023.4.5   | Release candidate code (2.x version) will be released on 2.x branch. Default master branch is still 1.x version                     | Master and 2.x branches iterate normally                                                                                                               |
| Compatibility Period | 2023.4.6 - 2023.12.31 | **The 2.x branch has been renamed to the main branch and set as the default branch**, and 1.x branch will correspond to 1.x version | We still maintain the old version 1.x, respond to user needs, but try not to introduce changes that break compatibility; main branch iterates normally |
| Maintenance Period   | From 2024/1/1         | Default main branch corresponds to 2.x version and 1.x branch is 1.x version                                                        | 1.x branch is in maintenance phase, no more new feature support; main branch is iterating normally                                                     |

## Projects in OpenMMLab

- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models.
- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision.
- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages.
- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark.
- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark.
- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection.
- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark.
- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark.
- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark.
- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox.
- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark.
- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark.
- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark.
- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark.
- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark.
- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark.
- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark.
- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark.
- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox.
- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox.
- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework.


================================================
FILE: README_zh-CN.md
================================================
<div align="center">
  <img src="https://raw.githubusercontent.com/open-mmlab/mmcv/main/docs/en/mmcv-logo.png" width="300"/>
  <div>&nbsp;</div>
  <div align="center">
    <b><font size="5">OpenMMLab 官网</font></b>
    <sup>
      <a href="https://openmmlab.com">
        <i><font size="4">HOT</font></i>
      </a>
    </sup>
    &nbsp;&nbsp;&nbsp;&nbsp;
    <b><font size="5">OpenMMLab 开放平台</font></b>
    <sup>
      <a href="https://platform.openmmlab.com">
        <i><font size="4">TRY IT OUT</font></i>
      </a>
    </sup>
  </div>
  <div>&nbsp;</div>

[![platform](https://img.shields.io/badge/platform-Linux%7CWindows%7CmacOS-blue)](https://mmcv.readthedocs.io/zh_CN/latest/get_started/installation.html)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/)
[![pytorch](https://img.shields.io/badge/pytorch-1.8~2.0-orange)](https://pytorch.org/get-started/previous-versions/)
[![cuda](https://img.shields.io/badge/cuda-10.1~11.8-green)](https://developer.nvidia.com/cuda-downloads)
[![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv)
[![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions)
[![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv)
[![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE)

[📘使用文档](https://mmcv.readthedocs.io/zh_CN/latest/) |
[🛠️安装教程](https://mmcv.readthedocs.io/zh_CN/latest/get_started/installation.html) |
[🤔报告问题](https://github.com/open-mmlab/mmcv/issues/new/choose)

</div>

<div align="center">

[English](README.md) | 简体中文

</div>

<div align="center">
  <a href="https://openmmlab.medium.com/" style="text-decoration:none;">
    <img src="https://user-images.githubusercontent.com/25839884/219255827-67c1a27f-f8c5-46a9-811d-5e57448c61d1.png" width="3%" alt="" /></a>
  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
  <a href="https://discord.gg/raweFPmdzG" style="text-decoration:none;">
    <img src="https://user-images.githubusercontent.com/25839884/218347213-c080267f-cbb6-443e-8532-8e1ed9a58ea9.png" width="3%" alt="" /></a>
  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
  <a href="https://twitter.com/OpenMMLab" style="text-decoration:none;">
    <img src="https://user-images.githubusercontent.com/25839884/218346637-d30c8a0f-3eba-4699-8131-512fb06d46db.png" width="3%" alt="" /></a>
  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
  <a href="https://www.youtube.com/openmmlab" style="text-decoration:none;">
    <img src="https://user-images.githubusercontent.com/25839884/218346691-ceb2116a-465a-40af-8424-9f30d2348ca9.png" width="3%" alt="" /></a>
  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
  <a href="https://space.bilibili.com/1293512903" style="text-decoration:none;">
    <img src="https://user-images.githubusercontent.com/25839884/219026751-d7d14cce-a7c9-4e82-9942-8375fca65b99.png" width="3%" alt="" /></a>
  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
  <a href="https://www.zhihu.com/people/openmmlab" style="text-decoration:none;">
    <img src="https://user-images.githubusercontent.com/25839884/219026120-ba71e48b-6e94-4bd4-b4e9-b7d175b5e362.png" width="3%" alt="" /></a>
</div>

## Highlights

OpenMMLab 团队于 2022 年 9 月 1 日在世界人工智能大会发布了新一代训练引擎 [MMEngine](https://github.com/open-mmlab/mmengine)，它是一个用于训练深度学习模型的基础库。相比于 MMCV，它提供了更高级且通用的训练器、接口更加统一的开放架构以及可定制化程度更高的训练流程。

MMCV v2.0.0 正式版本于 2023 年 4 月 6 日发布。在 2.x 版本中，它删除了和训练流程相关的组件，并新增了数据变换模块。另外，从 2.x 版本开始，重命名包名 **mmcv** 为 **mmcv-lite** 以及 **mmcv-full** 为 **mmcv**。详情见[兼容性文档](docs/zh_cn/compatibility.md)。

MMCV 会同时维护 [1.x](https://github.com/open-mmlab/mmcv/tree/1.x) (对应原 [master](https://github.com/open-mmlab/mmcv/tree/master) 分支) 和 **2.x**（对应 **main** 分支，现为默认分支）版本，详情见[分支维护计划](README_zh-CN.md#分支维护计划)。

## 简介

MMCV 是一个面向计算机视觉的基础库，它提供了以下功能：

- [图像和视频处理](https://mmcv.readthedocs.io/zh_CN/latest/understand_mmcv/data_process.html)
- [图像和标注结果可视化](https://mmcv.readthedocs.io/zh_CN/latest/understand_mmcv/visualization.html)
- [图像变换](https://mmcv.readthedocs.io/zh_CN/latest/understand_mmcv/data_transform.html)
- [多种 CNN 网络结构](https://mmcv.readthedocs.io/zh_CN/latest/understand_mmcv/cnn.html)
- [高质量实现的常见 CUDA 算子](https://mmcv.readthedocs.io/zh_CN/latest/understand_mmcv/ops.html)

MMCV 支持多种平台，包括：

- Linux
- Windows
- macOS

如想了解更多特性和使用，请参考[文档](http://mmcv.readthedocs.io/zh_CN/latest)。

提示: MMCV 需要 Python 3.7 以上版本。

## 安装

MMCV 有两个版本：

- **mmcv**: 完整版，包含所有的特性以及丰富的开箱即用的 CUDA 算子。注意完整版本可能需要更长时间来编译。
- **mmcv-lite**: 精简版，不包含 CUDA 算子但包含其余所有特性和功能，类似 MMCV 1.0 之前的版本。如果你不需要使用 CUDA 算子的话，精简版可以作为一个考虑选项。

**注意**: 请不要在同一个环境中安装两个版本，否则可能会遇到类似 `ModuleNotFound` 的错误。在安装一个版本之前，需要先卸载另一个。`如果 CUDA 可用，强烈推荐安装 mmcv`。

### 安装 mmcv

在安装 mmcv 之前，请确保 PyTorch 已经成功安装在环境中，可以参考 [PyTorch 官方安装文档](https://github.com/pytorch/pytorch#installation)。如果你使用的是搭载 apple silicon 的 mac 设备，请安装 PyTorch 1.13+ 的版本。

安装 mmcv 的命令如下：

```bash
pip install -U openmim
mim install mmcv
```

如果需要指定 mmcv 的版本，可以使用以下命令：

```bash
mim install mmcv==2.0.0
```

如果发现上述的安装命令没有使用预编译包（以 `.whl` 结尾）而是使用源码包（以 `.tar.gz` 结尾）安装，则有可能是我们没有提供和当前环境的 PyTorch 版本、CUDA 版本相匹配的 mmcv 预编译包，此时，你可以[源码安装 mmcv](https://mmcv.readthedocs.io/zh_CN/latest/get_started/build.html)。

<details>
<summary>使用预编译包的安装日志</summary>

Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html<br />
Collecting mmcv<br />
<b>Downloading https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/mmcv-2.0.0-cp38-cp38-manylinux1_x86_64.whl</b>

</details>

<details>
<summary>使用源码包的安装日志</summary>

Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html<br />
Collecting mmcv==2.0.0<br />
<b>Downloading mmcv-2.0.0.tar.gz</b>

</details>

更多安装方式请参考[安装文档](https://mmcv.readthedocs.io/zh_CN/latest/get_started/installation.html)。

### 安装 mmcv-lite

如果你需要使用和 PyTorch 相关的模块，请确保 PyTorch 已经成功安装在环境中，可以参考 [PyTorch 官方安装文档](https://github.com/pytorch/pytorch#installation)。

```bash
pip install -U openmim
mim install mmcv-lite
```

## FAQ

如果你遇到了安装问题或者运行时问题，请查看[问题解决页面](https://mmcv.readthedocs.io/zh_CN/latest/faq.html)是否已有解决方案。如果问题仍然没有解决，欢迎提 [issue](https://github.com/open-mmlab/mmcv/issues)。

## 贡献指南

我们感谢所有的贡献者为改进和提升 MMCV 所作出的努力。请参考[贡献指南](CONTRIBUTING.md)来了解参与项目贡献的相关指引。

## 许可证

`MMCV` 目前以 Apache 2.0 的许可证发布，但是其中有一部分功能并不是使用的 Apache2.0 许可证，我们在 [许可证](LICENSES.md) 中详细地列出了这些功能以及他们对应的许可证，如果您正在从事盈利性活动，请谨慎参考此文档。

## 分支维护计划

MMCV 目前有四个分支，分别是 main、1.x、master 和 2.x，其中 2.x 为 main 分支的别名，master 为 1.x 分支的别名，2.x 和 master 这两个分支在将来会被删除。MMCV 的分支经历以下三个阶段：

| 阶段   | 时间                  | 分支                                                                  | 说明                                                                                                   |
| ------ | --------------------- | --------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------ |
| 公测期 | 2022.9.1 - 2023.4.5   | 公测版代码发布在 2.x 分支；默认主分支 master 仍对应 1.x 版本          | master 和 2.x 分支正常进行迭代                                                                         |
| 兼容期 | 2023.4.6 - 2023.12.31 | **2.x 分支重命名为 main 分支并设置为默认分支**；1.x 分支对应 1.x 版本 | 保持对旧版本 1.x 的维护和开发，响应用户需求，但尽量不引进破坏旧版本兼容性的改动；main 分支正常进行迭代 |
| 维护期 | 2024.1.1 - 待定       | 默认主分支 main 为 2.x 版本；1.x 分支对应 1.x 版本                    | 1.x 分支进入维护阶段，不再进行新功能支持；main 分支正常进行迭代                                        |

## OpenMMLab 的其他项目

- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab 深度学习模型训练基础库
- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab 计算机视觉基础库
- [MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMLab 项目、算法、模型的统一入口
- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱
- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱
- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台
- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准
- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO 系列工具箱与测试基准
- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱
- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具箱
- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱
- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准
- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准
- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准
- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准
- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱
- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台
- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab 光流估计工具箱与测试基准
- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱
- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱
- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架

## 欢迎加入 OpenMMLab 社区

扫描下方的二维码可关注 OpenMMLab 团队的 [知乎官方账号](https://www.zhihu.com/people/openmmlab)，加入 OpenMMLab 团队的 [官方交流 QQ 群](https://jq.qq.com/?_wv=1027&k=K0QI8ByU)，或添加微信小助手”OpenMMLabwx“加入官方交流微信群。

<div align="center">
<img src="https://user-images.githubusercontent.com/25839884/205870927-39f4946d-8751-4219-a4c0-740117558fd7.jpg" height="400" />  <img src="https://user-images.githubusercontent.com/25839884/203904835-62392033-02d4-4c73-a68c-c9e4c1e2b07f.jpg" height="400" /> <img src="https://user-images.githubusercontent.com/25839884/205872898-e2e6009d-c6bb-4d27-8d07-117e697a3da8.jpg" height="400" />
</div>

我们会在 OpenMMLab 社区为大家

- 📢 分享 AI 框架的前沿核心技术
- 💻 解读 PyTorch 常用模块源码
- 📰 发布 OpenMMLab 的相关新闻
- 🚀 介绍 OpenMMLab 开发的前沿算法
- 🏃 获取更高效的问题答疑和意见反馈
- 🔥 提供与各行各业开发者充分交流的平台

干货满满 📘，等你来撩 💗，OpenMMLab 社区期待您的加入 👬


================================================
FILE: TERMINOLOGY.md
================================================
# English-Chinese terminology comparison (英汉术语对照)

This document is used as a reference for English-Chinese terminology translation.

该文档用作中英文翻译对照参考。

|      English      |     中文     |
| :---------------: | :----------: |
|    annotation     |     标注     |
|     backbone      |   主干网络   |
|     benchmark     |   基准测试   |
|    checkpoint     | 模型权重文件 |
|    classifier     |    分类器    |
|     cls_head      |    分类头    |
|      decoder      |    解码器    |
|     detector      |    检测器    |
|      encoder      |    编码器    |
|     finetune      |     微调     |
|   ground truth    |   真实标签   |
|       hook        |     钩子     |
|     localizer     |    定位器    |
|       neck        |   模型颈部   |
|     pipeline      |    流水线    |
|    recognizer     |    识别器    |
|     register      |    注册器    |
|     schedule      |     调整     |
|     scheduler     |    调度器    |
|     segmentor     |    分割器    |
|      tensor       |     张量     |
| training schedule |   训练策略   |


================================================
FILE: docker/README.md
================================================
# Docker images

There are two `Dockerfile` files to build docker images, one to build an image with the mmcv pre-built package and the other with the mmcv development environment.

```text
.
|-- README.md
|-- dev  # build with mmcv development environment
|   `-- Dockerfile
`-- release  # build with mmcv pre-built package
    `-- Dockerfile
```

## Build docker images

### Build with mmcv pre-built package

Build with local repository

```bash
git clone https://github.com/open-mmlab/mmcv.git && cd mmcv
docker build -t mmcv -f docker/release/Dockerfile .
```

Or build with remote repository

```bash
docker build -t mmcv https://github.com/open-mmlab/mmcv.git#main:docker/release
```

The [Dockerfile](release/Dockerfile) installs the latest released version of mmcv by default, but you can specify a particular mmcv version to install instead.

```bash
docker image build -t mmcv -f docker/release/Dockerfile --build-arg MMCV=2.0.0rc1 .
```

If you also want to use other versions of PyTorch and CUDA, you can also pass them when building docker images.

An example to build an image with PyTorch 1.9.0 and CUDA 11.1.

```bash
docker build -t mmcv -f docker/release/Dockerfile \
    --build-arg PYTORCH=1.9.0 \
    --build-arg CUDA=11.1 \
    --build-arg CUDNN=8 \
    --build-arg MMCV=2.0.0rc1 .
```

More available versions of PyTorch and CUDA can be found at [dockerhub/pytorch](https://hub.docker.com/r/pytorch/pytorch/tags).

### Build with mmcv development environment

If you want to build a docker image with the mmcv development environment, you can use the following command:

```bash
git clone https://github.com/open-mmlab/mmcv.git && cd mmcv
docker build -t mmcv -f docker/dev/Dockerfile --build-arg CUDA_ARCH=7.5 .
```

Note that `CUDA_ARCH` is the compute capability of your GPU and you can find it at [Compute Capability](https://developer.nvidia.com/cuda-gpus#compute).

The building process may take 10 minutes or more.

## Run images

```bash
docker run --gpus all --shm-size=8g -it mmcv
```

See [docker run](https://docs.docker.com/engine/reference/commandline/run/) for more usages.


================================================
FILE: docker/dev/Dockerfile
================================================
# Development image: clones mmcv and installs it from source in editable mode.
# PYTORCH, CUDA and CUDNN select the tag of the pytorch/pytorch base image.
ARG PYTORCH="1.8.1"
ARG CUDA="10.2"
ARG CUDNN="7"

FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel

# To fix GPG key error when running apt-get update.
# `rm -f` is used so the build does not abort when the selected base image
# does not ship these source lists (other PYTORCH/CUDA build args may be used).
RUN rm -f /etc/apt/sources.list.d/cuda.list \
    && rm -f /etc/apt/sources.list.d/nvidia-ml.list \
    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub

# Install git, the system dependencies for opencv-python (libgl1, libglib2.0-0)
# and the system dependencies for unit tests (ffmpeg, libturbojpeg).
# update + install + cleanup live in a single layer so that the install never
# runs against apt lists cached in an older layer.
RUN apt-get update \
    && apt-get install -y git libgl1 libglib2.0-0 ffmpeg libturbojpeg \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# build mmcv from source with develop mode
# HTTPS_PROXY: optional proxy, exported as https_proxy for the steps below.
ARG HTTPS_PROXY=""
ENV https_proxy=${HTTPS_PROXY}
ENV FORCE_CUDA="1"
# CUDA_ARCH: compute capability of the target GPU (e.g. 7.5), exported as
# TORCH_CUDA_ARCH_LIST.
ARG CUDA_ARCH=""
ENV TORCH_CUDA_ARCH_LIST=${CUDA_ARCH}
RUN git clone https://github.com/open-mmlab/mmcv.git /mmcv
WORKDIR /mmcv
# Check out `main`: the `2.x` branch is only an alias of `main` and is planned
# to be deleted. The short commit hash is printed to record what was built.
RUN git checkout main && git rev-parse --short HEAD
RUN pip install --no-cache-dir -e .[all] -v && pip install pre-commit && pre-commit install


================================================
FILE: docker/release/Dockerfile
================================================
# Release image: installs a pre-built mmcv package through openmim.
# PYTORCH, CUDA and CUDNN select the tag of the pytorch/pytorch base image.
ARG PYTORCH="1.8.1"
ARG CUDA="10.2"
ARG CUDNN="7"

FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel

# To fix GPG key error when running apt-get update.
# `rm -f` is used so the build does not abort when the selected base image
# does not ship these source lists (other PYTORCH/CUDA build args may be used).
RUN rm -f /etc/apt/sources.list.d/cuda.list \
    && rm -f /etc/apt/sources.list.d/nvidia-ml.list \
    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub

# Install system dependencies for opencv-python
RUN apt-get update && apt-get install -y libgl1 libglib2.0-0 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install mmcv
# MMCV: exact mmcv version to install; when empty, any release satisfying
# 'mmcv>=2.0.0rc1' is installed instead.
ARG MMCV=""
RUN pip install -U openmim \
    && if [ -z "${MMCV}" ]; then mim install 'mmcv>=2.0.0rc1'; else mim install "mmcv==${MMCV}"; fi

# Verify the installation
RUN python -c 'import mmcv;print(mmcv.__version__)'


================================================
FILE: docs/en/Makefile
================================================
# Minimal makefile for Sphinx documentation
#
# Every target is forwarded to `sphinx-build -M <target>`, so the set of
# available targets is whatever Sphinx's "make mode" provides.

# You can set these variables from the command line.
# SPHINXOPTS:  extra options appended to every sphinx-build invocation.
# SPHINXBUILD: the sphinx-build executable to run.
# SOURCEDIR:   directory holding the documentation sources.
# BUILDDIR:    directory that receives the build output.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

# `help` and `Makefile` are declared phony: they do not name files to build.
.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
# $@ expands to the requested target name (e.g. `html`).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


================================================
FILE: docs/en/_static/css/readthedocs.css
================================================
/* Elements with the `header-logo` class: an 85x40 px box whose background is
   the mmcv logo scaled to exactly fill it. */
.header-logo {
    width: 85px;
    height: 40px;
    background-size: 85px 40px;
    background-image: url("../image/mmcv-logo.png");
}

/* Cells of tables with the `colwidths-auto` class each take half the width. */
table.colwidths-auto td {
    width: 50%;
}


================================================
FILE: docs/en/_static/version.json
================================================
{
    "Linux": [
        {
            "cuda": "12.1",
            "torch": "2.4.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "12.1",
            "torch": "2.3.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "12.1",
            "torch": "2.2.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "12.1",
            "torch": "2.1.x",
            "mmcv": [
                "2.2.0",
                "2.1.0"
            ]
        },
        {
            "cuda": "11.8",
            "torch": "2.4.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "11.8",
            "torch": "2.3.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "11.8",
            "torch": "2.2.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "11.8",
            "torch": "2.1.x",
            "mmcv": [
                "2.2.0",
                "2.1.0"
            ]
        },
        {
            "cuda": "11.8",
            "torch": "2.0.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.7",
            "torch": "2.0.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.7",
            "torch": "1.13.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.6",
            "torch": "1.13.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.6",
            "torch": "1.12.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.5",
            "torch": "1.11.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.3",
            "torch": "1.12.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.3",
            "torch": "1.11.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.3",
            "torch": "1.10.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.1",
            "torch": "1.10.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.1",
            "torch": "1.9.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.1",
            "torch": "1.8.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.0",
            "torch": "1.7.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.12.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.11.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.10.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.9.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.8.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.7.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.6.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.5.x",
            "mmcv": [
                "2.0.0rc3"
            ]
        },
        {
            "cuda": "10.1",
            "torch": "1.8.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.1",
            "torch": "1.7.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "10.1",
            "torch": "1.6.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "10.1",
            "torch": "1.5.x",
            "mmcv": [
                "2.0.0rc3"
            ]
        },
        {
            "cuda": "9.2",
            "torch": "1.7.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "9.2",
            "torch": "1.6.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "9.2",
            "torch": "1.5.x",
            "mmcv": [
                "2.0.0rc3",
                "2.0.0rc2"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "2.4.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "2.3.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "2.2.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "2.1.x",
            "mmcv": [
                "2.2.0",
                "2.1.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "2.0.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.13.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.12.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.11.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.10.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.9.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.8.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.7.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.6.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.5.x",
            "mmcv": [
                "2.0.0rc3",
                "2.0.0rc2"
            ]
        }
    ],
    "Windows": [
        {
            "cuda": "12.1",
            "torch": "2.3.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "12.1",
            "torch": "2.2.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "12.1",
            "torch": "2.1.x",
            "mmcv": [
                "2.2.0",
                "2.1.0"
            ]
        },
        {
            "cuda": "11.8",
            "torch": "2.3.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "11.8",
            "torch": "2.2.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "11.8",
            "torch": "2.1.x",
            "mmcv": [
                "2.2.0",
                "2.1.0"
            ]
        },
        {
            "cuda": "11.8",
            "torch": "2.0.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.7",
            "torch": "2.0.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.7",
            "torch": "1.13.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.6",
            "torch": "1.13.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.6",
            "torch": "1.12.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.5",
            "torch": "1.11.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.3",
            "torch": "1.12.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.3",
            "torch": "1.11.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.3",
            "torch": "1.10.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.1",
            "torch": "1.10.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.1",
            "torch": "1.9.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.1",
            "torch": "1.8.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.10.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.9.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.8.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.7.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.6.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "10.1",
            "torch": "1.8.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.1",
            "torch": "1.7.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3"
            ]
        },
        {
            "cuda": "10.1",
            "torch": "1.6.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "2.3.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "2.2.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "2.1.x",
            "mmcv": [
                "2.2.0",
                "2.1.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "2.0.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.13.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.12.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.11.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.10.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.9.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.8.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.7.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.6.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        }
    ],
    "macOS": [
        {
            "cuda": "cpu",
            "torch": "2.1.x",
            "mmcv": [
                "2.1.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "2.0.x",
            "mmcv": [
                "2.1.0",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.13.x",
            "mmcv": [
                "2.1.0",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0"
            ]
        },
        {
            "cuda": "mps",
            "torch": "1.13.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.12.x",
            "mmcv": [
                "2.1.0",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.11.x",
            "mmcv": [
                "2.1.0",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.10.x",
            "mmcv": [
                "2.1.0",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.9.x",
            "mmcv": [
                "2.1.0",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.8.x",
            "mmcv": [
                "2.1.0",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.7.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.6.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2"
            ]
        }
    ]
}


================================================
FILE: docs/en/_templates/classtemplate.rst
================================================
.. role:: hidden
    :class: hidden-section
.. currentmodule:: {{ module }}


{{ name | underline}}

.. autoclass:: {{ name }}
    :members:


..
  autogenerated from source/_templates/classtemplate.rst
  note it does not have :inherited-members:


================================================
FILE: docs/en/api/arraymisc.rst
================================================
.. role:: hidden
    :class: hidden-section

mmcv.arraymisc
===================================

.. contents:: mmcv.arraymisc
   :depth: 2
   :local:
   :backlinks: top

.. currentmodule:: mmcv.arraymisc

.. autosummary::
   :toctree: generated
   :nosignatures:

   quantize
   dequantize


================================================
FILE: docs/en/api/cnn.rst
================================================
.. role:: hidden
    :class: hidden-section

mmcv.cnn
===================================

.. contents:: mmcv.cnn
   :depth: 2
   :local:
   :backlinks: top

.. currentmodule:: mmcv.cnn

Module
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:
   :template: classtemplate.rst

   ContextBlock
   Conv2d
   Conv3d
   ConvAWS2d
   ConvModule
   ConvTranspose2d
   ConvTranspose3d
   ConvWS2d
   DepthwiseSeparableConvModule
   GeneralizedAttention
   HSigmoid
   HSwish
   LayerScale
   Linear
   MaxPool2d
   MaxPool3d
   NonLocal1d
   NonLocal2d
   NonLocal3d
   Scale
   Swish
   Conv2dRFSearchOp

Build Function
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   build_activation_layer
   build_conv_layer
   build_norm_layer
   build_padding_layer
   build_plugin_layer
   build_upsample_layer

Miscellaneous
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   fuse_conv_bn
   conv_ws_2d
   is_norm
   make_res_layer
   make_vgg_layer
   get_model_complexity_info


================================================
FILE: docs/en/api/image.rst
================================================
.. role:: hidden
    :class: hidden-section

mmcv.image
===================================

.. contents:: mmcv.image
   :depth: 2
   :local:
   :backlinks: top

.. currentmodule:: mmcv.image

IO
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   imfrombytes
   imread
   imwrite
   use_backend

Color Space
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   bgr2gray
   bgr2hls
   bgr2hsv
   bgr2rgb
   bgr2ycbcr
   gray2bgr
   gray2rgb
   hls2bgr
   hsv2bgr
   imconvert
   rgb2bgr
   rgb2gray
   rgb2ycbcr
   ycbcr2bgr
   ycbcr2rgb

Geometric
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   cutout
   imcrop
   imflip
   impad
   impad_to_multiple
   imrescale
   imresize
   imresize_like
   imresize_to_multiple
   imrotate
   imshear
   imtranslate
   rescale_size

Photometric
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   adjust_brightness
   adjust_color
   adjust_contrast
   adjust_hue
   adjust_lighting
   adjust_sharpness
   auto_contrast
   clahe
   imdenormalize
   imequalize
   iminvert
   imnormalize
   lut_transform
   posterize
   solarize

Miscellaneous
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   tensor2imgs


================================================
FILE: docs/en/api/ops.rst
================================================
.. role:: hidden
    :class: hidden-section

mmcv.ops
===================================

.. contents:: mmcv.ops
   :depth: 2
   :local:
   :backlinks: top

.. currentmodule:: mmcv.ops

.. autosummary::
   :toctree: generated
   :nosignatures:
   :template: classtemplate.rst

   BorderAlign
   CARAFE
   CARAFENaive
   CARAFEPack
   Conv2d
   ConvTranspose2d
   CornerPool
   Correlation
   CrissCrossAttention
   DeformConv2d
   DeformConv2dPack
   DeformRoIPool
   DeformRoIPoolPack
   DynamicScatter
   FusedBiasLeakyReLU
   GroupAll
   Linear
   MaskedConv2d
   MaxPool2d
   ModulatedDeformConv2d
   ModulatedDeformConv2dPack
   ModulatedDeformRoIPoolPack
   MultiScaleDeformableAttention
   PSAMask
   PointsSampler
   PrRoIPool
   QueryAndGroup
   RiRoIAlignRotated
   RoIAlign
   RoIAlignRotated
   RoIAwarePool3d
   RoIPointPool3d
   RoIPool
   SAConv2d
   SigmoidFocalLoss
   SimpleRoIAlign
   SoftmaxFocalLoss
   SparseConv2d
   SparseConv3d
   SparseConvTensor
   SparseConvTranspose2d
   SparseConvTranspose3d
   SparseInverseConv2d
   SparseInverseConv3d
   SparseMaxPool2d
   SparseMaxPool3d
   SparseModule
   SparseSequential
   SubMConv2d
   SubMConv3d
   SyncBatchNorm
   TINShift
   Voxelization

.. autosummary::
   :toctree: generated
   :nosignatures:

   active_rotated_filter
   assign_score_withk
   ball_query
   batched_nms
   bbox_overlaps
   border_align
   box_iou_rotated
   boxes_iou3d
   boxes_iou_bev
   boxes_overlap_bev
   carafe
   carafe_naive
   chamfer_distance
   contour_expand
   convex_giou
   convex_iou
   deform_conv2d
   deform_roi_pool
   diff_iou_rotated_2d
   diff_iou_rotated_3d
   dynamic_scatter
   furthest_point_sample
   furthest_point_sample_with_dist
   fused_bias_leakyrelu
   gather_points
   grouping_operation
   knn
   masked_conv2d
   min_area_polygons
   modulated_deform_conv2d
   nms
   nms3d
   nms3d_normal
   nms_bev
   nms_match
   nms_normal_bev
   nms_rotated
   pixel_group
   point_sample
   points_in_boxes_all
   points_in_boxes_cpu
   points_in_boxes_part
   points_in_polygons
   prroi_pool
   rel_roi_point_to_rel_img_point
   riroi_align_rotated
   roi_align
   roi_align_rotated
   roi_pool
   rotated_feature_align
   scatter_nd
   sigmoid_focal_loss
   soft_nms
   softmax_focal_loss
   three_interpolate
   three_nn
   tin_shift
   upfirdn2d
   voxelization


================================================
FILE: docs/en/api/transforms.rst
================================================
.. role:: hidden
    :class: hidden-section

mmcv.transforms
===================================

.. currentmodule:: mmcv.transforms

.. autosummary::
   :toctree: generated
   :nosignatures:
   :template: classtemplate.rst

   BaseTransform
   TestTimeAug

Loading
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:
   :template: classtemplate.rst

   LoadAnnotations
   LoadImageFromFile

Processing
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:
   :template: classtemplate.rst

   CenterCrop
   MultiScaleFlipAug
   Normalize
   Pad
   RandomChoiceResize
   RandomFlip
   RandomGrayscale
   RandomResize
   Resize
   ToTensor
   ImageToTensor

Wrapper
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:
   :template: classtemplate.rst

   Compose
   KeyMapper
   RandomApply
   RandomChoice
   TransformBroadcaster


================================================
FILE: docs/en/api/utils.rst
================================================
.. role:: hidden
    :class: hidden-section

mmcv.utils
===================================

.. contents:: mmcv.utils
   :depth: 2
   :local:
   :backlinks: top

.. currentmodule:: mmcv.utils

.. autosummary::
   :toctree: generated
   :nosignatures:

   IS_CUDA_AVAILABLE
   IS_MLU_AVAILABLE
   IS_MPS_AVAILABLE
   collect_env
   jit
   skip_no_elena


================================================
FILE: docs/en/api/video.rst
================================================
.. role:: hidden
    :class: hidden-section

mmcv.video
===================================

.. contents:: mmcv.video
   :depth: 2
   :local:
   :backlinks: top

.. currentmodule:: mmcv.video

IO
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:
   :template: classtemplate.rst

   VideoReader
   Cache

.. autosummary::
   :toctree: generated
   :nosignatures:

   frames2video

Optical Flow
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   dequantize_flow
   flow_from_bytes
   flow_warp
   flowread
   flowwrite
   quantize_flow
   sparse_flow_from_bytes

Video Processing
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   concat_video
   convert_video
   cut_video
   resize_video


================================================
FILE: docs/en/api/visualization.rst
================================================
.. role:: hidden
    :class: hidden-section

mmcv.visualization
===================================

.. contents:: mmcv.visualization
   :depth: 2
   :local:
   :backlinks: top

.. currentmodule:: mmcv.visualization

Color
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:
   :template: classtemplate.rst

   Color

.. autosummary::
   :toctree: generated
   :nosignatures:

   color_val

Image
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   imshow
   imshow_bboxes
   imshow_det_bboxes

Optical Flow
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   flow2rgb
   flowshow
   make_color_wheel


================================================
FILE: docs/en/community/contributing.md
================================================
## Contributing to OpenMMLab

Welcome to the MMCV community, we are committed to building a cutting-edge computer vision foundational library and all kinds of contributions are welcomed, including but not limited to

**Fix bug**

You can directly post a Pull Request to fix typos in code or documents.

The steps to fix the bug of code implementation are as follows.

1. If the modification involves significant changes, you should create an issue first and describe the error information and how to trigger the bug. Other developers will discuss with you and propose a proper solution.

2. Post a pull request after fixing the bug and adding the corresponding unit test.

**New Feature or Enhancement**

1. If the modification involves significant changes, you should create an issue to discuss with our developers and propose a proper design.
2. Post a Pull Request after implementing the new feature or enhancement and add corresponding unit test.

**Document**

You can directly post a pull request to fix documents. If you want to add a document, you should first create an issue to check if it is reasonable.

### Pull Request Workflow

If you're not familiar with Pull Request, don't worry! The following guidance will tell you how to create a Pull Request step by step. If you want to dive into the develop mode of Pull Request, you can refer to the [official documents](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests)

#### 1. Fork and clone

If you are posting a pull request for the first time, you should fork the OpenMMLab repositories by clicking the **Fork** button in the top right corner of the GitHub page, and the forked repositories will appear under your GitHub profile.

<img src="https://user-images.githubusercontent.com/57566630/167305749-43c7f4e9-449b-4e98-ade5-0c9276d5c9ce.png" width="1200">

Then, you can clone the repositories to local:

```shell
git clone git@github.com:{username}/mmcv.git
```

After that, you should add the official repository as the upstream repository

```bash
git remote add upstream git@github.com:open-mmlab/mmcv
```

Check whether remote repository has been added successfully by `git remote -v`

```bash
origin	git@github.com:{username}/mmcv.git (fetch)
origin	git@github.com:{username}/mmcv.git (push)
upstream	git@github.com:open-mmlab/mmcv (fetch)
upstream	git@github.com:open-mmlab/mmcv (push)
```

```{note}
Here's a brief introduction to origin and upstream. When we use "git clone", we create an "origin" remote by default, which points to the repository cloned from. As for "upstream", we add it ourselves to point to the target repository. Of course, if you don't like the name "upstream", you could name it as you wish. Usually, we'll push the code to "origin". If the pushed code conflicts with the latest code in official("upstream"), we should pull the latest code from upstream to resolve the conflicts, and then push to "origin" again. The posted Pull Request will be updated automatically.
```

#### 2. Configure pre-commit

You should configure [pre-commit](https://pre-commit.com/#intro) in the local development environment to make sure the code style matches that of OpenMMLab. **Note**: The following code should be executed under the MMCV directory.

```shell
pip install -U pre-commit
pre-commit install
```

Check that pre-commit is configured successfully, and install the hooks defined in `.pre-commit-config.yaml`.

```shell
pre-commit run --all-files
```

<img src="https://user-images.githubusercontent.com/57566630/173660750-3df20a63-cb66-4d33-a986-1f643f1d8aaf.png" width="1200">

<img src="https://user-images.githubusercontent.com/57566630/202368856-0465a90d-8fce-4345-918e-67b8b9c82614.png" width="1200">

```{note}
Chinese users may fail to download the pre-commit hooks due to the network issue. In this case, you could download these hooks from gitee by setting the .pre-commit-config-zh-cn.yaml

pre-commit install -c .pre-commit-config-zh-cn.yaml
pre-commit run --all-files -c .pre-commit-config-zh-cn.yaml
```

If the installation process is interrupted, you can repeatedly run `pre-commit run ... ` to continue the installation.

If the code does not conform to the code style specification, pre-commit will raise a warning and fix some of the errors automatically.

<img src="https://user-images.githubusercontent.com/57566630/202369176-67642454-0025-4023-a095-263529107aa3.png" width="1200">

If we want to commit our code bypassing the pre-commit hook, we can use the `--no-verify` option (**only for temporary commits**).

```shell
git commit -m "xxx" --no-verify
```

#### 3. Create a development branch

After configuring the pre-commit, we should create a branch based on the main branch to develop the new feature or fix the bug. The proposed branch name is `username/pr_name`

```shell
git checkout -b yhc/refactor_contributing_doc
```

In subsequent development, if the main branch of the local repository is behind the main branch of "upstream", we need to pull the upstream for synchronization, and then execute the above command:

```shell
git pull upstream main
```

#### 4. Commit the code and pass the unit test

- MMCV introduces mypy to do static type checking to increase the robustness of the code. Therefore, we need to add Type Hints to our code and pass the mypy check. If you are not familiar with Type Hints, you can refer to [this tutorial](https://docs.python.org/3/library/typing.html).

- The committed code should pass through the unit test

  ```shell
  # Pass all unit tests
  pytest tests

  # Pass the unit test of runner
  pytest tests/test_runner/test_runner.py
  ```

  If the unit test fails for lack of dependencies, you can install the dependencies referring to the [guidance](#unit-test)

- If the documents are modified/added, we should check the rendering result referring to [guidance](#document-rendering)

#### 5. Push the code to remote

We could push the local commits to remote after passing through the check of unit test and pre-commit. You can associate the local branch with remote branch by adding `-u` option.

```shell
git push -u origin {branch_name}
```

This will allow you to use the `git push` command to push code directly next time, without having to specify a branch or the remote repository.

#### 6. Create a Pull Request

(1) Create a pull request in GitHub's Pull request interface

<img src="https://user-images.githubusercontent.com/57566630/201533288-516f7ac4-0b14-4dc8-afbd-912475c368b5.png" width="1200">

(2) Modify the PR description according to the guidelines so that other developers can better understand your changes

<img src="https://user-images.githubusercontent.com/57566630/202242953-c91a18ff-e388-4ff9-8591-5fae0ead6c1e.png" width="1200">

Find more details about Pull Request description in [pull request guidelines](#pr-specs).

**note**

(a) The Pull Request description should contain the reason for the change, the content of the change, and the impact of the change, and be associated with the relevant Issue (see [documentation](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)).

(b) If it is your first contribution, please sign the CLA

<img src="https://user-images.githubusercontent.com/57566630/167307569-a794b967-6e28-4eac-a942-00deb657815f.png" width="1200">

(c) Check whether the Pull Request passes the CI

<img src="https://user-images.githubusercontent.com/57566630/167307490-f9ebf9fa-63c0-4d83-8ba1-081ea169eb3a.png" width="1200">

MMCV will run unit tests for the posted Pull Request on different platforms (Linux, Windows, Mac), based on different versions of Python, PyTorch, CUDA to make sure the code is correct. We can see the specific test information by clicking `Details` in the above image so that we can modify the code.

(3) If the Pull Request passes the CI, then you can wait for the review from other developers. You'll modify the code based on the reviewer's comments, and repeat the steps [4](#4-commit-the-code-and-pass-the-unit-test)-[5](#5-push-the-code-to-remote) until all reviewers approve it. Then, we will merge it ASAP.

<img src="https://user-images.githubusercontent.com/57566630/202145400-cc2cd8c4-10b0-472f-ba37-07e6f50acc67.png" width="1200">

#### 7. Resolve conflicts

If your local branch conflicts with the latest main branch of "upstream", you'll need to resolve them. There are two ways to do this:

```shell
git fetch --all --prune
git rebase upstream/main
```

or

```shell
git fetch --all --prune
git merge upstream/main
```

If you are very good at handling conflicts, then you can use rebase to resolve conflicts, as this will keep your commit logs tidy. If you are not familiar with `rebase`, then you can use `merge` to resolve conflicts.

### Guidance

#### Unit test

If you cannot run the unit tests of some modules because some dependencies are missing, such as the [video](https://github.com/open-mmlab/mmcv/tree/main/mmcv/video) module, you can try to install the following dependencies:

```shell
# Linux
sudo apt-get update -y
sudo apt-get install -y libturbojpeg
sudo apt-get install -y ffmpeg

# Windows
conda install ffmpeg
```

We should also make sure the committed code will not decrease the unit test coverage. We can run the following commands to check the coverage:

```shell
python -m coverage run -m pytest /path/to/test_file
python -m coverage html
# check file in htmlcov/index.html
```

#### Document rendering

If the documents are modified/added, we should check the rendering result. We could install the dependencies and run the following command to render the documents and check the results:

```shell
pip install -r requirements/docs.txt
cd docs/zh_cn/
# or docs/en
make html
# check file in ./docs/zh_cn/_build/html/index.html
```

### Code style

#### Python

We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style.

We use the following tools for linting and formatting:

- [flake8](https://github.com/PyCQA/flake8): A wrapper around some linter tools.
- [isort](https://github.com/timothycrosley/isort): A Python utility to sort imports.
- [yapf](https://github.com/google/yapf): A formatter for Python files.
- [codespell](https://github.com/codespell-project/codespell): A Python utility to fix common misspellings in text files.
- [mdformat](https://github.com/executablebooks/mdformat): Mdformat is an opinionated Markdown formatter that can be used to enforce a consistent style in Markdown files.
- [docformatter](https://github.com/myint/docformatter): A formatter to format docstring.

Style configurations of yapf and isort can be found in [setup.cfg](./setup.cfg).

We use [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`,
fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, sorts `requirements.txt` automatically on every commit.
The config for a pre-commit hook is stored in [.pre-commit-config](./.pre-commit-config.yaml).

#### C++ and CUDA

We follow the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).

### PR Specs

1. Use [pre-commit](https://pre-commit.com) hook to avoid issues of code style

2. One short-time branch should be matched with only one PR

3. Accomplish a detailed change in one PR. Avoid large PR

   - Bad: Support Faster R-CNN
   - Acceptable: Add a box head to Faster R-CNN
   - Good: Add a parameter to box head to support custom conv-layer number

4. Provide clear and meaningful commit messages

5. Provide clear and meaningful PR description

   - Task name should be clarified in title. The general format is: \[Prefix\] Short description of the PR (Suffix)
   - Prefix: add new feature \[Feature\], fix bug \[Fix\], related to documents \[Docs\], in developing \[WIP\] (which will not be reviewed temporarily)
   - Introduce main changes, results and influences on other modules in short description
   - Associate related issues and pull requests with a milestone


================================================
FILE: docs/en/community/pr.md
================================================
## Pull Request (PR)

Content has been migrated to [contributing guidance](contributing.md).


================================================
FILE: docs/en/compatibility.md
================================================
### v2.0.0

The OpenMMLab team released a new generation of training engine [MMEngine](https://github.com/open-mmlab/mmengine) at the World Artificial Intelligence Conference on September 1, 2022. It is a foundational library for training deep learning models. Compared with MMCV, it provides a universal and powerful runner, an open architecture with a more unified interface, and a more customizable training process.

The OpenMMLab team released MMCV v2.0.0 on April 6, 2023. In the 2.x version, it has the following significant changes:

(1) It removed the following components:

- `mmcv.fileio` module, removed in PR [#2179](https://github.com/open-mmlab/mmcv/pull/2179). FileIO module from mmengine will be used wherever required.
- `mmcv.runner`, `mmcv.parallel`, `mmcv.engine` and `mmcv.device`, removed in PR [#2216](https://github.com/open-mmlab/mmcv/pull/2216).
- All classes in `mmcv.utils` (e.g., `Config` and `Registry`) and many functions, removed in PR [#2217](https://github.com/open-mmlab/mmcv/pull/2217). Only a few functions related to mmcv are reserved.
- `mmcv.onnx`, `mmcv.tensorrt` modules and related functions, removed in PR [#2225](https://github.com/open-mmlab/mmcv/pull/2225).
- Removed all root registrars in MMCV and registered classes or functions to the [root registrar](https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py) in MMEngine.

(2) It added the [`mmcv.transforms`](https://github.com/open-mmlab/mmcv/tree/main/mmcv/transforms) data transformation module.

(3) It renamed the package name **mmcv** to **mmcv-lite** and **mmcv-full** to **mmcv** in PR [#2235](https://github.com/open-mmlab/mmcv/pull/2235). It also changed the default value of the environment variable `MMCV_WITH_OPS` from 0 to 1.

<table class="docutils">
<thead>
  <tr>
    <th align="center">MMCV < 2.0</th>
    <th align="center">MMCV >= 2.0 </th>
  </tr>
</thead>
<tbody>
  <tr>
  <td valign="top">

```bash
# Contains ops, because the highest version of mmcv-full is less than 2.0.0, so there is no need to add version restrictions
pip install openmim
mim install mmcv-full

# do not contain ops
pip install openmim
mim install "mmcv < 2.0.0"
```

</td>
  <td valign="top">

```bash
# Contains ops
pip install openmim
mim install mmcv

# Ops are not included, because the starting version of mmcv-lite is 2.0.0rc1, so there is no need to add version restrictions
pip install openmim
mim install mmcv-lite
```

</td>
</tr>
</tbody>
</table>

### v1.3.18

Some ops have different implementations on different devices. Lots of macros and type checks are scattered in several files, which makes the code hard to maintain. For example:

```c++
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(rois);
    CHECK_CUDA_INPUT(output);
    CHECK_CUDA_INPUT(argmax_y);
    CHECK_CUDA_INPUT(argmax_x);

    roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x,
                           aligned_height, aligned_width, spatial_scale,
                           sampling_ratio, pool_mode, aligned);
#else
    AT_ERROR("RoIAlign is not compiled with GPU support");
#endif
  } else {
    CHECK_CPU_INPUT(input);
    CHECK_CPU_INPUT(rois);
    CHECK_CPU_INPUT(output);
    CHECK_CPU_INPUT(argmax_y);
    CHECK_CPU_INPUT(argmax_x);
    roi_align_forward_cpu(input, rois, output, argmax_y, argmax_x,
                          aligned_height, aligned_width, spatial_scale,
                          sampling_ratio, pool_mode, aligned);
  }
```

Registry and dispatcher are added to manage these implementations.

```c++

void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                       Tensor argmax_y, Tensor argmax_x,
                                       int aligned_height, int aligned_width,
                                       float spatial_scale, int sampling_ratio,
                                       int pool_mode, bool aligned);

void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned) {
  ROIAlignForwardCUDAKernelLauncher(
      input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,
      spatial_scale, sampling_ratio, pool_mode, aligned);
}

// register cuda implementation
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned);
REGISTER_DEVICE_IMPL(roi_align_forward_impl, CUDA, roi_align_forward_cuda);

// roi_align.cpp
// use the dispatcher to invoke different implementation depending on device type of input tensors.
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned) {
  DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y,
                       argmax_x, aligned_height, aligned_width, spatial_scale,
                       sampling_ratio, pool_mode, aligned);
}

```

### v1.3.11

In order to flexibly support more backends and hardware like `NVIDIA GPUs` and `AMD GPUs`, the directory of `mmcv/ops/csrc` is refactored. Note that this refactoring will not affect the usage in API. For related information, please refer to [PR1206](https://github.com/open-mmlab/mmcv/pull/1206).

The original directory was organized as follows.

```
.
├── common_cuda_helper.hpp
├── ops_cuda_kernel.cuh
├── pytorch_cpp_helper.hpp
├── pytorch_cuda_helper.hpp
├── parrots_cpp_helper.hpp
├── parrots_cuda_helper.hpp
├── parrots_cudawarpfunction.cuh
├── onnxruntime
│   ├── onnxruntime_register.h
│   ├── onnxruntime_session_options_config_keys.h
│   ├── ort_mmcv_utils.h
│   ├── ...
│   ├── onnx_ops.h
│   └── cpu
│       ├── onnxruntime_register.cpp
│       ├── ...
│       └── onnx_ops_impl.cpp
├── parrots
│   ├── ...
│   ├── ops.cpp
│   ├── ops_cuda.cu
│   ├── ops_parrots.cpp
│   └── ops_pytorch.h
├── pytorch
│   ├── ...
│   ├── ops.cpp
│   ├── ops_cuda.cu
│   ├── pybind.cpp
└── tensorrt
    ├── trt_cuda_helper.cuh
    ├── trt_plugin_helper.hpp
    ├── trt_plugin.hpp
    ├── trt_serialize.hpp
    ├── ...
    ├── trt_ops.hpp
    └── plugins
        ├── trt_cuda_helper.cu
        ├── trt_plugin.cpp
        ├── ...
        ├── trt_ops.cpp
        └── trt_ops_kernel.cu
```

After refactoring, it is organized as follows.

```
.
├── common
│   ├── box_iou_rotated_utils.hpp
│   ├── parrots_cpp_helper.hpp
│   ├── parrots_cuda_helper.hpp
│   ├── pytorch_cpp_helper.hpp
│   ├── pytorch_cuda_helper.hpp
│   └── cuda
│       ├── common_cuda_helper.hpp
│       ├── parrots_cudawarpfunction.cuh
│       ├── ...
│       └── ops_cuda_kernel.cuh
├── onnxruntime
│   ├── onnxruntime_register.h
│   ├── onnxruntime_session_options_config_keys.h
│   ├── ort_mmcv_utils.h
│   ├── ...
│   ├── onnx_ops.h
│   └── cpu
│       ├── onnxruntime_register.cpp
│       ├── ...
│       └── onnx_ops_impl.cpp
├── parrots
│   ├── ...
│   ├── ops.cpp
│   ├── ops_parrots.cpp
│   └── ops_pytorch.h
├── pytorch
│   ├── info.cpp
│   ├── pybind.cpp
│   ├── ...
│   ├── ops.cpp
│   └── cuda
│       ├── ...
│       └── ops_cuda.cu
└── tensorrt
    ├── trt_cuda_helper.cuh
    ├── trt_plugin_helper.hpp
    ├── trt_plugin.hpp
    ├── trt_serialize.hpp
    ├── ...
    ├── trt_ops.hpp
    └── plugins
        ├── trt_cuda_helper.cu
        ├── trt_plugin.cpp
        ├── ...
        ├── trt_ops.cpp
        └── trt_ops_kernel.cu
```


================================================
FILE: docs/en/conf.py
================================================
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys

import pytorch_sphinx_theme
from sphinx.builders.html import StandaloneHTMLBuilder

# Put the repository root (two levels above this directory) first on
# sys.path so that modules documented with autodoc can be found.
sys.path.insert(0, os.path.abspath('../..'))

# Execute mmcv/version.py in this module's namespace to obtain
# ``__version__`` instead of importing the ``mmcv`` package.
# The relative path is resolved against the current working directory.
version_file = '../../mmcv/version.py'
# Read with an explicit encoding so the result does not depend on the
# platform's locale default.
with open(version_file, encoding='utf-8') as f:
    exec(compile(f.read(), version_file, 'exec'))
# At module level ``locals()`` is the namespace that ``exec`` just wrote to.
__version__ = locals()['__version__']

# -- Project information -----------------------------------------------------

project = 'mmcv'
author = 'MMCV Authors'
copyright = '2018-2022, OpenMMLab'

# Both the short version and the full release string (including
# alpha/beta/rc tags) are taken unchanged from the package version.
version = release = __version__

# -- General configuration ---------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.

extensions = [
    # Extensions coming with Sphinx.
    'sphinx.ext.autodoc',
    'sphinx.ext.autosummary',
    'sphinx.ext.intersphinx',
    'sphinx.ext.napoleon',
    'sphinx.ext.viewcode',
    # Additional extensions.
    'sphinx_markdown_tables',
    'myst_parser',
    'sphinx_copybutton',
]  # yapf: disable

# Options for the ``myst_parser`` extension listed above.
myst_enable_extensions = ['colon_fence']
myst_heading_anchors = 4

# Configuration for intersphinx: every project maps to its documentation
# URL paired with ``None``.
intersphinx_mapping = {
    name: (url, None)
    for name, url in (
        ('python', 'https://docs.python.org/3'),
        ('numpy', 'https://numpy.org/doc/stable'),
        ('torch', 'https://pytorch.org/docs/stable/'),
        ('mmengine', 'https://mmengine.readthedocs.io/en/latest'),
    )
}

autodoc_mock_imports = [
    'mmcv._ext',
    'mmcv.utils.ext_loader',
    'torchvision',
]

# Paths that contain templates, relative to this directory.
templates_path = ['_templates']

# Source filename suffixes and the markup each one is written in.
source_suffix = {'.rst': 'restructuredtext', '.md': 'markdown'}

# The master toctree document.
master_doc = 'index'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
#
# An explicit language code is given because recent Sphinx releases (5.0+)
# report ``language = None`` as an invalid configuration value and fall
# back to 'en' with a warning.
language = 'en'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
# The theme comes from the ``pytorch_sphinx_theme`` package imported at the
# top of this file; the theme search path is obtained from the package.
# html_theme = 'sphinx_rtd_theme'
html_theme = 'pytorch_sphinx_theme'
html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]

# Theme-specific options that customize the look and feel of the theme
# further.  For a list of options available for each theme, see the
# documentation.
html_theme_options = dict(
    # Menu entries, each given as a name and the URL it links to.
    menu=[dict(name='GitHub', url='https://github.com/open-mmlab/mmcv')],
    # Specify the language of shared menu
    menu_lang='en',
)

# Paths with custom static files (such as style sheets), relative to this
# directory. They are copied after the builtin static files, so a file named
# "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_css_files = ['css/readthedocs.css']

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself.  Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}

# -- Options for HTMLHelp output ---------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'mmcvdoc'

# -- Options for LaTeX output ------------------------------------------------

# No LaTeX overrides are active; the available keys are kept for reference.
latex_elements = {
    # 'papersize': 'letterpaper',  # paper size: 'letterpaper' or 'a4paper'
    # 'pointsize': '10pt',  # font size: '10pt', '11pt' or '12pt'
    # 'preamble': '',  # additional stuff for the LaTeX preamble
    # 'figure_align': 'htbp',  # LaTeX figure (float) alignment
}

# Grouping the document tree into LaTeX files, one tuple per file.
latex_documents = [
    (
        master_doc,  # source start file
        'mmcv.tex',  # target name
        'mmcv Documentation',  # title
        'MMCV Contributors',  # author
        'manual',  # documentclass [howto, manual, or own class]
    ),
]

# -- Options for manual page output ------------------------------------------

# One entry per manual page.
man_pages = [
    (
        master_doc,  # source start file
        'mmcv',  # name
        'mmcv Documentation',  # description
        [author],  # authors
        1,  # manual section
    ),
]

# -- Options for Texinfo output ----------------------------------------------

# Grouping the document tree into Texinfo files, one tuple per file.
texinfo_documents = [
    (
        master_doc,  # source start file
        'mmcv',  # target name
        'mmcv Documentation',  # title
        author,  # author
        'mmcv',  # dir menu entry
        'One line description of project.',  # description
        'Miscellaneous',  # category
    ),
]

# -- Options for Epub output -------------------------------------------------

# Bibliographic Dublin Core info.
# The EPUB title reuses the ``project`` name set earlier in this file.
epub_title = project

# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
#
# epub_identifier = ''

# A unique identification for the text.
#
# epub_uid = ''

# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']

# Set the image format priority when building html by overriding the
# class-level list on Sphinx's standalone HTML builder.
StandaloneHTMLBuilder.supported_image_types = [
    'image/svg+xml', 'image/gif', 'image/png', 'image/jpeg'
]
# -- Extension configuration -------------------------------------------------
# Ignore the prompts when copying code. The pattern below is a regular
# expression (see ``copybutton_prompt_is_regexp``) that matches either
# ``>>> `` or ``... ``.
copybutton_prompt_text = r'>>> |\.\.\. '
copybutton_prompt_is_regexp = True


================================================
FILE: docs/en/deployment/mmcv_ops_definition.md
================================================
# MMCV Operators

To make custom operators in MMCV more standard, precise definitions of each operator are listed in this document.

<!-- TOC -->

- [MMCV Operators](#mmcv-operators)
  - [MMCVBorderAlign](#mmcvborderalign)
    - [Description](#description)
    - [Parameters](#parameters)
    - [Inputs](#inputs)
    - [Outputs](#outputs)
    - [Type Constraints](#type-constraints)
  - [MMCVCARAFE](#mmcvcarafe)
    - [Description](#description-1)
    - [Parameters](#parameters-1)
    - [Inputs](#inputs-1)
    - [Outputs](#outputs-1)
    - [Type Constraints](#type-constraints-1)
  - [MMCVCAWeight](#mmcvcaweight)
    - [Description](#description-2)
    - [Parameters](#parameters-2)
    - [Inputs](#inputs-2)
    - [Outputs](#outputs-2)
    - [Type Constraints](#type-constraints-2)
  - [MMCVCAMap](#mmcvcamap)
    - [Description](#description-3)
    - [Parameters](#parameters-3)
    - [Inputs](#inputs-3)
    - [Outputs](#outputs-3)
    - [Type Constraints](#type-constraints-3)
  - [MMCVCornerPool](#mmcvcornerpool)
    - [Description](#description-4)
    - [Parameters](#parameters-4)
    - [Inputs](#inputs-4)
    - [Outputs](#outputs-4)
    - [Type Constraints](#type-constraints-4)
  - [MMCVDeformConv2d](#mmcvdeformconv2d)
    - [Description](#description-5)
    - [Parameters](#parameters-5)
    - [Inputs](#inputs-5)
    - [Outputs](#outputs-5)
    - [Type Constraints](#type-constraints-5)
  - [MMCVModulatedDeformConv2d](#mmcvmodulateddeformconv2d)
    - [Description](#description-6)
    - [Parameters](#parameters-6)
    - [Inputs](#inputs-6)
    - [Outputs](#outputs-6)
    - [Type Constraints](#type-constraints-6)
  - [MMCVDeformRoIPool](#mmcvdeformroipool)
    - [Description](#description-7)
    - [Parameters](#parameters-7)
    - [Inputs](#inputs-7)
    - [Outputs](#outputs-7)
    - [Type Constraints](#type-constraints-7)
  - [MMCVMaskedConv2d](#mmcvmaskedconv2d)
    - [Description](#description-8)
    - [Parameters](#parameters-8)
    - [Inputs](#inputs-8)
    - [Outputs](#outputs-8)
    - [Type Constraints](#type-constraints-8)
  - [MMCVPSAMask](#mmcvpsamask)
    - [Description](#description-9)
    - [Parameters](#parameters-9)
    - [Inputs](#inputs-9)
    - [Outputs](#outputs-9)
    - [Type Constraints](#type-constraints-9)
  - [NonMaxSuppression](#nonmaxsuppression)
    - [Description](#description-10)
    - [Parameters](#parameters-10)
    - [Inputs](#inputs-10)
    - [Outputs](#outputs-10)
    - [Type Constraints](#type-constraints-10)
  - [MMCVRoIAlign](#mmcvroialign)
    - [Description](#description-11)
    - [Parameters](#parameters-11)
    - [Inputs](#inputs-11)
    - [Outputs](#outputs-11)
    - [Type Constraints](#type-constraints-11)
  - [MMCVRoIAlignRotated](#mmcvroialignrotated)
    - [Description](#description-12)
    - [Parameters](#parameters-12)
    - [Inputs](#inputs-12)
    - [Outputs](#outputs-12)
    - [Type Constraints](#type-constraints-12)
  - [grid_sampler\*](#grid_sampler)
    - [Description](#description-13)
    - [Parameters](#parameters-13)
    - [Inputs](#inputs-13)
    - [Outputs](#outputs-13)
    - [Type Constraints](#type-constraints-13)
  - [cummax\*](#cummax)
    - [Description](#description-14)
    - [Parameters](#parameters-14)
    - [Inputs](#inputs-14)
    - [Outputs](#outputs-14)
    - [Type Constraints](#type-constraints-14)
  - [cummin\*](#cummin)
    - [Description](#description-15)
    - [Parameters](#parameters-15)
    - [Inputs](#inputs-15)
    - [Outputs](#outputs-15)
    - [Type Constraints](#type-constraints-15)
  - [Reminders](#reminders)

<!-- TOC -->

## MMCVBorderAlign

### Description

Applies `border_align` over the input feature based on predicted bboxes.

For each border line (e.g. top, left, bottom or right) of each box,
border_align does the following:

- uniformly samples `pool_size`+1 positions on this line, involving the start and end points.
- the corresponding features on these points are computed by bilinear interpolation.
- max pooling over all the `pool_size`+1 positions is used for computing pooled feature.

Read [BorderDet: Border Feature for Dense Object Detection](https://arxiv.org/abs/2007.11056) for more detailed information.

### Parameters

| Type  | Parameter   | Description                                                                         |
| ----- | ----------- | ----------------------------------------------------------------------------------- |
| `int` | `pool_size` | number of positions sampled over the boxes' borders(e.g. top, bottom, left, right). |

### Inputs

<dl>
<dt><tt>input</tt>: T</dt>
<dd>Features with shape [N,4C,H,W]. Channels ranged in [0,C), [C,2C), [2C,3C), [3C,4C) represent the top, left, bottom, right features respectively</dd>
<dt><tt>boxes</tt>: T</dt>
<dd>Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2).</dd>
</dl>

### Outputs

<dl>
<dt><tt>output</tt>: T</dt>
<dd>Pooled features with shape [N,C,H*W,4]. The order is (top, left, bottom, right) for the last dimension.</dd>
</dl>

### Type Constraints

- T:tensor(float32)

## MMCVCARAFE

### Description

CARAFE operator performs feature upsampling.

Read [CARAFE: Content-Aware ReAssembly of FEatures](https://arxiv.org/abs/1905.02188) for more detailed information.

### Parameters

| Type    | Parameter      | Description                                   |
| ------- | -------------- | --------------------------------------------- |
| `int`   | `kernel_size`  | reassemble kernel size, should be odd integer |
| `int`   | `group_size`   | reassemble group size                         |
| `float` | `scale_factor` | upsample ratio(>=1)                           |

### Inputs

<dl>
<dt><tt>features</tt>: T</dt>
<dd>Input features. 4-D tensor of shape (N, C, H, W). N is the batch size.</dd>
<dt><tt>masks</tt>: T</dt>
<dd>The input mask</dd>
</dl>

### Outputs

<dl>
<dt><tt>output</tt>: T</dt>
<dd>The upsampled features. 4-D tensor of shape (N, C, H * scale_factor, W * scale_factor). N is the batch size.</dd>
</dl>

### Type Constraints

- T:tensor(float32)

## MMCVCAWeight

### Description

Operator for Criss-Cross Attention.
Read [CCNet: Criss-Cross Attention for Semantic Segmentation](https://arxiv.org/pdf/1811.11721.pdf) for more detailed information.

### Parameters

None

### Inputs

<dl>
<dt><tt>t</tt>: T</dt>
<dd>The query matrix of shape (N, C', H, W).</dd>
<dt><tt>f</tt>: T</dt>
<dd>The key matrix of shape (N, C', H, W).</dd>
</dl>

### Outputs

<dl>
<dt><tt>weight</tt>: T</dt>
<dd>The attention map of shape (N, H+W-1, H, W).</dd>
</dl>

### Type Constraints

- T:tensor(float32)

## MMCVCAMap

### Description

Operator for Criss-Cross Attention.
Read [CCNet: Criss-Cross Attention for Semantic Segmentation](https://arxiv.org/pdf/1811.11721.pdf) for more detailed information.

### Parameters

None

### Inputs

<dl>
<dt><tt>weight</tt>: T</dt>
<dd>Output from the operator MMCVCAWeight.</dd>
<dt><tt>value</tt>: T</dt>
<dd>The value matrix of shape (N, C, H, W).</dd>
</dl>

### Outputs

<dl>
<dt><tt>output</tt>: T</dt>
<dd>Output tensor of aggregated contextual information</dd>
</dl>

### Type Constraints

- T:tensor(float32)

## MMCVCornerPool

### Description

Perform CornerPool on `input` features. Read [CornerNet -- Detecting Objects as Paired Keypoints](https://arxiv.org/abs/1808.01244) for more details.

### Parameters

| Type  | Parameter | Description                                                      |
| ----- | --------- | ---------------------------------------------------------------- |
| `int` | `mode`    | corner pool mode, (0: `top`, 1: `bottom`, 2: `left`, 3: `right`) |

### Inputs

<dl>
<dt><tt>input</tt>: T</dt>
<dd>Input features. 4-D tensor of shape (N, C, H, W). N is the batch size.</dd>
</dl>

### Outputs

<dl>
<dt><tt>output</tt>: T</dt>
<dd>The pooled features. 4-D tensor of shape (N, C, H, W).</dd>
</dl>

### Type Constraints

- T:tensor(float32)

## MMCVDeformConv2d

### Description

Applies a deformable 2D convolution over an input signal composed of several input planes.

Read [Deformable Convolutional Networks](https://arxiv.org/pdf/1703.06211.pdf) for detail.

### Parameters

| Type           | Parameter           | Description                                                                                                       |
| -------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------- |
| `list of ints` | `stride`            | The stride of the convolving kernel, (sH, sW). Defaults to `(1, 1)`.                                              |
| `list of ints` | `padding`           | Paddings on both sides of the input, (padH, padW).  Defaults to `(0, 0)`.                                         |
| `list of ints` | `dilation`          | The spacing between kernel elements (dH, dW). Defaults to `(1, 1)`.                                               |
| `int`          | `groups`            | Split input into groups. `input_channel` should be divisible by the number of groups. Defaults to `1`.            |
| `int`          | `deformable_groups` | Groups of deformable offset. Defaults to `1`.                                                                     |
| `int`          | `bias`              | Whether to add a learnable bias to the output. `0` stands for `False` and `1` stands for `True`. Defaults to `0`. |
| `int`          | `im2col_step`       | Number of samples processed by im2col per call. Defaults to `32`.                                                 |

### Inputs

<dl>
<dt><tt>input</tt>: T</dt>
<dd>Input feature; 4-D tensor of shape (N, C, inH, inW), where N is the batch size, C is the number of channels, inH and inW are the height and width of the data.</dd>
<dt><tt>offset</tt>: T</dt>
<dd>Input offset; 4-D tensor of shape (N, deformable_group* 2* kH* kW, outH, outW), where kH and kW are the height and width of weight, outH and outW are the height and width of offset and output.</dd>
<dt><tt>weight</tt>: T</dt>
<dd>Input weight; 4-D tensor of shape (output_channel, input_channel, kH, kW).</dd>
</dl>

### Outputs

<dl>
<dt><tt>output</tt>: T</dt>
<dd>Output feature; 4-D tensor of shape (N, output_channel, outH, outW).</dd>
</dl>

### Type Constraints

- T:tensor(float32, Linear)

## MMCVModulatedDeformConv2d

### Description

Perform Modulated Deformable Convolution on input feature, read [Deformable ConvNets v2: More Deformable, Better Results](https://arxiv.org/abs/1811.11168?from=timeline) for detail.

### Parameters

| Type           | Parameter           | Description                                                                           |
| -------------- | ------------------- | ------------------------------------------------------------------------------------- |
| `list of ints` | `stride`            | The stride of the convolving kernel. (sH, sW)                                         |
| `list of ints` | `padding`           | Paddings on both sides of the input. (padH, padW)                                     |
| `list of ints` | `dilation`          | The spacing between kernel elements. (dH, dW)                                         |
| `int`          | `deformable_groups` | Groups of deformable offset.                                                          |
| `int`          | `groups`            | Split input into groups. `input_channel` should be divisible by the number of groups. |

### Inputs

<dl>
<dt><tt>feature</tt>: T</dt>
<dd>Input feature; 4-D tensor of shape (N, C, inH, inW), where N is the batch size, C is the number of channels, inH and inW are the height and width of the data.</dd>
<dt><tt>offset</tt>: T</dt>
<dd>Input offset; 4-D tensor of shape (N, deformable_group* 2* kH* kW, outH, outW), where kH and kW are the height and width of weight, outH and outW are the height and width of offset and output.</dd>
<dt><tt>mask</tt>: T</dt>
<dd>Input mask; 4-D tensor of shape (N, deformable_group* kH* kW, outH, outW), where kH and kW are the height and width of weight, outH and outW are the height and width of offset and output.</dd>
<dt><tt>weight</tt>: T</dt>
<dd>Input weight; 4-D tensor of shape (output_channel, input_channel, kH, kW).</dd>
<dt><tt>bias</tt>: T, optional</dt>
<dd>Input bias; 1-D tensor of shape (output_channel).</dd>
</dl>

### Outputs

<dl>
<dt><tt>output</tt>: T</dt>
<dd>Output feature; 4-D tensor of shape (N, output_channel, outH, outW).</dd>
</dl>

### Type Constraints

- T:tensor(float32, Linear)

## MMCVDeformRoIPool

### Description

Deformable roi pooling layer

### Parameters

| Type    | Parameter        | Description                                                                                                   |
| ------- | ---------------- | ------------------------------------------------------------------------------------------------------------- |
| `int`   | `output_height`  | height of output roi                                                                                          |
| `int`   | `output_width`   | width of output roi                                                                                           |
| `float` | `spatial_scale`  | used to scale the input boxes                                                                                 |
| `int`   | `sampling_ratio` | number of input samples to take for each output sample. `0` means to take samples densely for current models. |
| `float` | `gamma`          | gamma                                                                                                         |

### Inputs

<dl>
<dt><tt>input</tt>: T</dt>
<dd>Input feature map; 4D tensor of shape (N, C, H, W), where N is the batch size, C is the number of channels, H and W are the height and width of the data.</dd>
<dt><tt>rois</tt>: T</dt>
<dd>RoIs (Regions of Interest) to pool over; 2-D tensor of shape (num_rois, 5) given as [[batch_index, x1, y1, x2, y2], ...]. The RoIs' coordinates are the coordinate system of input.</dd>
<dt><tt>offset</tt>: T</dt>
<dd>offset of height and width. Defaults to a tensor of zero</dd>
</dl>

### Outputs

<dl>
<dt><tt>feat</tt>: T</dt>
<dd>RoI pooled output, 4-D tensor of shape (num_rois, C, output_height, output_width). The r-th batch element feat[r-1] is a pooled feature map corresponding to the r-th RoI RoIs[r-1].</dd>
</dl>

### Type Constraints

- T:tensor(float32)

## MMCVMaskedConv2d

### Description

Performs a masked 2D convolution from PixelRNN.
Read [Pixel Recurrent Neural Networks](https://arxiv.org/abs/1601.06759) for more detailed information.

### Parameters

| Type           | Parameter | Description                                                                      |
| -------------- | --------- | -------------------------------------------------------------------------------- |
| `list of ints` | `stride`  | The stride of the convolving kernel. (sH, sW). **Only support stride=1 in mmcv** |
| `list of ints` | `padding` | Paddings on both sides of the input. (padH, padW). Defaults to `(0, 0)`.         |

### Inputs

<dl>
<dt><tt>features</tt>: T</dt>
<dd>Input features; 4D tensor of shape (N, C, H, W), where N is the batch size, C is the number of channels, H and W are the height and width of the data.</dd>
<dt><tt>mask</tt>: T</dt>
<dd>Input mask; 3D tensor of shape (N, H, W)</dd>
<dt><tt>weight</tt>: T</dt>
<dd>The learnable weights of the module</dd>
<dt><tt>bias</tt>: T</dt>
<dd>The learnable bias of the module</dd>
</dl>

### Outputs

<dl>
<dt><tt>output</tt>: T</dt>
<dd>The output convolved feature</dd>
</dl>

### Type Constraints

- T:tensor(float32)

## MMCVPSAMask

### Description

An operator from PSANet.

Read [PSANet: Point-wise Spatial Attention Network for Scene Parsing](https://hszhao.github.io/papers/eccv18_psanet.pdf) for more detailed information.

### Parameters

| Type           | Parameter   | Description                                  |
| -------------- | ----------- | -------------------------------------------- |
| `int`          | `psa_type`  | `0` means collect and `1` means `distribute` |
| `list of ints` | `mask_size` | The size of mask                             |

### Inputs

<dl>
<dt><tt>input</tt>: T</dt>
<dd>Input feature; 4D tensor of shape (N, C, H, W), where N is the batch size, C is the number of channels, H and W are the height and width of the data.</dd>
</dl>

### Outputs

<dl>
<dt><tt>output</tt>: T</dt>
<dd>Output tensor of shape (N, H * W, H, W)</dd>
</dl>

### Type Constraints

- T:tensor(float32)

## NonMaxSuppression

### Description

Filter out boxes that have a high IoU overlap with previously selected boxes or that have a low score. Output the indices of valid boxes.

Note that this definition is slightly different from [onnx: NonMaxSuppression](https://github.com/onnx/onnx/blob/main/docs/Operators.md#nonmaxsuppression).

### Parameters

| Type    | Parameter                    | Description                                                                                                                          |
| ------- | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ |
| `int`   | `center_point_box`           | 0 - the box data is supplied as \[y1, x1, y2, x2\], 1-the box data is supplied as \[x_center, y_center, width, height\].             |
| `int`   | `max_output_boxes_per_class` | The maximum number of boxes to be selected per batch per class. Default to 0, number of output boxes equal to number of input boxes. |
| `float` | `iou_threshold`              | The threshold for deciding whether boxes overlap too much with respect to IoU. Value range \[0, 1\]. Default to 0.                   |
| `float` | `score_threshold`            | The threshold for deciding when to remove boxes based on score.                                                                      |
| `int`   | `offset`                     | 0 or 1, boxes' width or height is (x2 - x1 + offset).                                                                                |

### Inputs

<dl>
<dt><tt>boxes</tt>: T</dt>
<dd>Input boxes. 3-D tensor of shape (num_batches, spatial_dimension, 4).</dd>
<dt><tt>scores</tt>: T</dt>
<dd>Input scores. 3-D tensor of shape (num_batches, num_classes, spatial_dimension).</dd>
</dl>

### Outputs

<dl>
<dt><tt>indices</tt>: tensor(int32, Linear)</dt>
<dd>Selected indices. 2-D tensor of shape (num_selected_indices, 3) as [[batch_index, class_index, box_index], ...].</dd>
<dd>num_selected_indices=num_batches* num_classes* min(max_output_boxes_per_class, spatial_dimension).</dd>
<dd>All invalid indices will be filled with -1.</dd>
</dl>

### Type Constraints

- T:tensor(float32, Linear)

## MMCVRoIAlign

### Description

Perform RoIAlign on output feature, used in bbox_head of most two-stage detectors.

### Parameters

| Type    | Parameter        | Description                                                                                                   |
| ------- | ---------------- | ------------------------------------------------------------------------------------------------------------- |
| `int`   | `output_height`  | height of output roi                                                                                          |
| `int`   | `output_width`   | width of output roi                                                                                           |
| `float` | `spatial_scale`  | used to scale the input boxes                                                                                 |
| `int`   | `sampling_ratio` | number of input samples to take for each output sample. `0` means to take samples densely for current models. |
| `str`   | `mode`           | pooling mode in each bin. `avg` or `max`                                                                      |
| `int`   | `aligned`        | If `aligned=0`, use the legacy implementation in MMDetection. Else, align the results more perfectly.         |

### Inputs

<dl>
<dt><tt>input</tt>: T</dt>
<dd>Input feature map; 4D tensor of shape (N, C, H, W), where N is the batch size, C is the number of channels, H and W are the height and width of the data.</dd>
<dt><tt>rois</tt>: T</dt>
<dd>RoIs (Regions of Interest) to pool over; 2-D tensor of shape (num_rois, 5) given as [[batch_index, x1, y1, x2, y2], ...]. The RoIs' coordinates are the coordinate system of input.</dd>
</dl>

### Outputs

<dl>
<dt><tt>feat</tt>: T</dt>
<dd>RoI pooled output, 4-D tensor of shape (num_rois, C, output_height, output_width). The r-th batch element feat[r-1] is a pooled feature map corresponding to the r-th RoI RoIs[r-1].</dd>
</dl>

### Type Constraints

- T:tensor(float32)

## MMCVRoIAlignRotated

### Description

Perform RoI align pooling for rotated proposals

### Parameters

| Type    | Parameter        | Description                                                                                                   |
| ------- | ---------------- | ------------------------------------------------------------------------------------------------------------- |
| `int`   | `output_height`  | height of output roi                                                                                          |
| `int`   | `output_width`   | width of output roi                                                                                           |
| `float` | `spatial_scale`  | used to scale the input boxes                                                                                 |
| `int`   | `sampling_ratio` | number of input samples to take for each output sample. `0` means to take samples densely for current models. |
| `str`   | `mode`           | pooling mode in each bin. `avg` or `max`                                                                      |
| `int`   | `aligned`        | If `aligned=0`, use the legacy implementation in MMDetection. Else, align the results more perfectly.         |
| `int`   | `clockwise`      | If `clockwise=1`, the angle in each proposal is clockwise in image space. Else, it is counterclockwise.       |

### Inputs

<dl>
<dt><tt>features</tt>: T</dt>
<dd>Input feature map; 4D tensor of shape (N, C, H, W)</dd>
<dt><tt>rois</tt>: T</dt>
<dd>RoIs (Regions of Interest) to pool over; 2-D tensor of shape (num_rois, 6) given as [[batch_index, center_x, center_y, width, height, angle], ...]. The RoIs' coordinates are the coordinate system of input.</dd>
</dl>

### Outputs

<dl>
<dt><tt>feat</tt>: T</dt>
<dd>RoI pooled output, 4-D tensor of shape (num_rois, C, output_height, output_width). The r-th batch element feat[r-1] is a pooled feature map corresponding to the r-th RoI RoIs[r-1].</dd>
</dl>

### Type Constraints

- T:tensor(float32)

## grid_sampler\*

### Description

Perform sample from `input` with pixel locations from `grid`.

Check [torch.nn.functional.grid_sample](https://pytorch.org/docs/stable/generated/torch.nn.functional.grid_sample.html?highlight=grid_sample#torch.nn.functional.grid_sample) for more information.

### Parameters

| Type  | Parameter            | Description                                                                                                                                                                                                                                                                                     |
| ----- | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `int` | `interpolation_mode` | Interpolation mode to calculate output values. (0: `bilinear` , 1: `nearest`)                                                                                                                                                                                                                   |
| `int` | `padding_mode`       | Padding mode for outside grid values. (0: `zeros`, 1: `border`, 2: `reflection`)                                                                                                                                                                                                                |
| `int` | `align_corners`      | If `align_corners=1`, the extrema (`-1` and `1`) are considered as referring to the center points of the input's corner pixels. If `align_corners=0`, they are instead considered as referring to the corner points of the input's corner pixels, making the sampling more resolution agnostic. |

### Inputs

<dl>
<dt><tt>input</tt>: T</dt>
<dd>Input feature; 4-D tensor of shape (N, C, inH, inW), where N is the batch size, C is the number of channels, inH and inW are the height and width of the data.</dd>
<dt><tt>grid</tt>: T</dt>
<dd>Input offset; 4-D tensor of shape (N, outH, outW, 2), where outH and outW are the height and width of offset and output. </dd>
</dl>

### Outputs

<dl>
<dt><tt>output</tt>: T</dt>
<dd>Output feature; 4-D tensor of shape (N, C, outH, outW).</dd>
</dl>

### Type Constraints

- T:tensor(float32, Linear)

## cummax\*

### Description

Returns a tuple (`values`, `indices`) where `values` is the cumulative maximum elements of `input` in the dimension `dim`. And `indices` is the index location of each maximum value found in the dimension `dim`. Read [torch.cummax](https://pytorch.org/docs/stable/generated/torch.cummax.html) for more details.

### Parameters

| Type  | Parameter | Description                            |
| ----- | --------- | -------------------------------------- |
| `int` | `dim`     | the dimension to do the operation over |

### Inputs

<dl>
<dt><tt>input</tt>: T</dt>
<dd>The input tensor with various shapes. Tensor with empty element is also supported.</dd>
</dl>

### Outputs

<dl>
<dt><tt>output</tt>: T</dt>
<dd>Output the cumulative maximum elements of `input` in the dimension `dim`, with the same shape and dtype as `input`.</dd>
<dt><tt>indices</tt>: tensor(int64)</dt>
<dd>Output the index location of each cumulative maximum value found in the dimension `dim`, with the same shape as `input`.</dd>
</dl>

### Type Constraints

- T:tensor(float32)

## cummin\*

### Description

Returns a tuple (`values`, `indices`) where `values` is the cumulative minimum elements of `input` in the dimension `dim`. And `indices` is the index location of each minimum value found in the dimension `dim`. Read [torch.cummin](https://pytorch.org/docs/stable/generated/torch.cummin.html) for more details.

### Parameters

| Type  | Parameter | Description                            |
| ----- | --------- | -------------------------------------- |
| `int` | `dim`     | the dimension to do the operation over |

### Inputs

<dl>
<dt><tt>input</tt>: T</dt>
<dd>The input tensor with various shapes. Tensor with empty element is also supported.</dd>
</dl>

### Outputs

<dl>
<dt><tt>output</tt>: T</dt>
<dd>Output the cumulative minimum elements of `input` in the dimension `dim`, with the same shape and dtype as `input`.</dd>
<dt><tt>indices</tt>: tensor(int64)</dt>
<dd>Output the index location of each cumulative minimum value found in the dimension `dim`, with the same shape as `input`.</dd>
</dl>

### Type Constraints

- T:tensor(float32)

## Reminders

- Operators ending with `*` are defined in Torch and are included here for the conversion to ONNX.


================================================
FILE: docs/en/docutils.conf
================================================
[html writers]
table_style: colwidths-auto


================================================
FILE: docs/en/faq.md
================================================
## Frequently Asked Questions

We list some common troubles faced by many users and their corresponding solutions here.
Feel free to enrich the list if you find any frequent issues and have ways to help others to solve them.

### Installation

- KeyError: "xxx: 'yyy is not in the zzz registry'"

  The registry mechanism will be triggered only when the file of the module is imported.
  So you need to import that file somewhere. More details can be found at [KeyError: "MaskRCNN: 'RefineRoIHead is not in the models registry'"](https://github.com/open-mmlab/mmdetection/issues/5974).

- "No module named 'mmcv.ops'"; "No module named 'mmcv.\_ext'"

  1. Uninstall existing mmcv in the environment using `pip uninstall mmcv`
  2. Install mmcv-full following the [installation instruction](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) or [Build MMCV from source](https://mmcv.readthedocs.io/en/latest/get_started/build.html)

- "invalid device function" or "no kernel image is available for execution"

  1. Check the CUDA compute capability of your GPU
  2. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, and MMCV are built for the correct GPU architecture. You may need to set `TORCH_CUDA_ARCH_LIST` to reinstall MMCV. The compatibility issue could happen when using old GPUs, e.g., Tesla K80 (3.7) on Colab.
  3. Check whether the running environment is the same as that when mmcv/mmdet is compiled. For example, you may compile mmcv using CUDA 10.0 but run it on CUDA 9.0 environments

- "undefined symbol" or "cannot open xxx.so"

  1. If those symbols are CUDA/C++ symbols (e.g., libcudart.so or GLIBCXX), check
     whether the CUDA/GCC runtimes are the same as those used for compiling mmcv
  2. If those symbols are Pytorch symbols (e.g., symbols containing caffe, aten, and TH), check whether the Pytorch version is the same as that used for compiling mmcv
  3. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, and MMCV are built by and running on the same environment

- "RuntimeError: CUDA error: invalid configuration argument"

  This error may be caused by the poor performance of GPU. Try to decrease the value of [THREADS_PER_BLOCK](https://github.com/open-mmlab/mmcv/blob/cac22f8cf5a904477e3b5461b1cc36856c2793da/mmcv/ops/csrc/common_cuda_helper.hpp#L10)
  and recompile mmcv.

- "RuntimeError: nms is not compiled with GPU support"

  This error is because your CUDA environment is not installed correctly.
  You may try to re-install your CUDA environment and then delete the build/ folder before re-compiling mmcv.

- "Segmentation fault"

  1. Check your GCC version and use GCC >= 5.4. This is usually caused by the incompatibility between PyTorch and the environment (e.g., GCC \< 4.9 for PyTorch). We also recommend that users avoid using GCC 5.5 because many users have reported that GCC 5.5 will cause "segmentation fault" and simply changing it to GCC 5.4 could solve the problem
  2. Check whether PyTorch is correctly installed and could use CUDA op, e.g. type the following command in your terminal and see whether they could correctly output results
     ```shell
     python -c 'import torch; print(torch.cuda.is_available())'
     ```
  3. If PyTorch is correctly installed, check whether MMCV is correctly installed. If MMCV is correctly installed, the following command will run without any error
     ```shell
     python -c 'import mmcv; import mmcv.ops'
     ```
  4. If MMCV and PyTorch are correctly installed, you can use `ipdb` to set breakpoints or directly add `print` to debug and see which part leads to the `segmentation fault`

- "libtorch_cuda_cu.so: cannot open shared object file"

  `mmcv-full` depends on this shared object, but it cannot be found. Check whether the object exists in `~/miniconda3/envs/{environment-name}/lib/python3.7/site-packages/torch/lib` or try re-installing PyTorch.

- "fatal error C1189: #error:  -- unsupported Microsoft Visual Studio version!"

  If you are building mmcv-full on Windows and the version of CUDA is 9.2, you will probably encounter the error `"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include\crt/host_config.h(133): fatal error C1189: #error:  -- unsupported Microsoft Visual Studio version! Only the versions 2012, 2013, 2015 and 2017 are supported!"`, in which case you can use a lower version of Microsoft Visual Studio like vs2017.

- "error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized"

  If your version of PyTorch is 1.5.0 and you are building mmcv-full on Windows, you will probably encounter the error `- torch/csrc/jit/api/module.h(474): error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized`. The way to solve the error is to replace all the `static constexpr bool all_slots = false;` with `static bool all_slots = false;` at this file `https://github.com/pytorch/pytorch/blob/v1.5.0/torch/csrc/jit/api/module.h`. More details can be found at [member "torch::jit::detail::AttributePolicy::all_slots" may not be initialized](https://github.com/pytorch/pytorch/issues/39394).

- "error: a member with an in-class initializer must be const"

  If your version of PyTorch is 1.6.0 and you are building mmcv-full on Windows, you will probably encounter the error `"- torch/include\torch/csrc/jit/api/module.h(483): error: a member with an in-class initializer must be const"`. The way to solve the error is to replace all the `CONSTEXPR_EXCEPT_WIN_CUDA ` with `const` at `torch/include\torch/csrc/jit/api/module.h`. More details can be found at [Ninja: build stopped: subcommand failed](https://github.com/open-mmlab/mmcv/issues/575).

- "error: member "torch::jit::ProfileOptionalOp::Kind" may not be initialized"

  If your version of PyTorch is 1.7.0 and you are building mmcv-full on Windows, you will probably encounter the error `torch/include\torch/csrc/jit/ir/ir.h(1347): error: member "torch::jit::ProfileOptionalOp::Kind" may not be initialized`. The way to solve the error needs to modify several local files of PyTorch:

  - delete `static constexpr Symbol Kind = ::c10::prim::profile;` and `static constexpr Symbol Kind = ::c10::prim::profile_optional;` at `torch/include\torch/csrc/jit/ir/ir.h`
  - replace `explicit operator type&() { return *(this->value); }` with `explicit operator type&() { return *((type*)this->value); }` at `torch\include\pybind11\cast.h`
  - replace all the `CONSTEXPR_EXCEPT_WIN_CUDA` with `const` at `torch/include\torch/csrc/jit/api/module.h`

  More details can be found at [Ensure default extra_compile_args](https://github.com/pytorch/pytorch/pull/45956).

- Compatibility issue between MMCV and MMDetection; "ConvWS is already registered in conv layer"

  Please install the correct version of MMCV for the version of your MMDetection following the [installation instruction](https://mmdetection.readthedocs.io/en/latest/get_started.html#installation).

### Usage

- "RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one"

  1. This error indicates that your module has parameters that were not used in producing loss. This phenomenon may be caused by running different branches in your code in DDP mode. More details at [Expected to have finished reduction in the prior iteration before starting a new one](https://github.com/pytorch/pytorch/issues/55582).
  2. You can set `find_unused_parameters = True` in the config to solve the above problems or find those unused parameters manually

- "RuntimeError: Trying to backward through the graph a second time"

  `GradientCumulativeOptimizerHook` and `OptimizerHook` are both set, which causes `loss.backward()` to be called twice, so a `RuntimeError` is raised. Only one of them can be used at a time. More details at [Trying to backward through the graph a second time](https://github.com/open-mmlab/mmcv/issues/1379).


================================================
FILE: docs/en/get_started/api_reference.md
================================================
# API reference table

During the upgrade from MMCV v1.x to MMCV v2.x, the `mmcv.fileio`, `mmcv.runner`, `mmcv.parallel`, `mmcv.engine` and `mmcv.device` modules, as well as all classes and most of the functions in the `mmcv.utils` module, were removed in PR [#2179](https://github.com/open-mmlab/mmcv/pull/2179), PR [#2216](https://github.com/open-mmlab/mmcv/pull/2216) and PR [#2217](https://github.com/open-mmlab/mmcv/pull/2217). Therefore, we provide the following API reference table to make it easier to quickly find the migrated interfaces.

## Related issues, PRs and discussions

- [Remove runner, parallel, engine and device](https://github.com/open-mmlab/mmcv/pull/2216)
- [ImportError: cannot import name 'is_list_of' from 'mmcv.utils'](https://github.com/open-mmlab/mmcv/issues/2282)
- [Could not find the files in MMengine which are removed in MMCV_v2x parallel. example, for DataContainer](https://github.com/open-mmlab/mmcv/issues/2934)
- [mmcv.cnn.bricks.registry](https://github.com/open-mmlab/mmengine/discussions/1356)
- [Replace mmcv's function and modules imported with mmengine's](https://github.com/open-mmlab/mmdetection/pull/8594)

## `mmcv.fileio`

| MMCV                                              | MMCV URL                                                                              | MMEngine                                                    | MMEngine URL                                                                                   |
| ------------------------------------------------- | ------------------------------------------------------------------------------------- | ----------------------------------------------------------- | ---------------------------------------------------------------------------------------------- |
| mmcv.fileio.file_client.BaseStorageBackend        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.backends.base.BaseStorageBackend            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/base.py              |
| mmcv.fileio.file_client.CephBackend               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             |                                                             |                                                                                                |
| mmcv.fileio.file_client.PetrelBackend             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.backends.petrel_backend.PetrelBackend       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/petrel_backend.py    |
| mmcv.fileio.file_client.MemcachedBackend          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.backends.memcached_backend.MemcachedBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/memcached_backend.py |
| mmcv.fileio.file_client.LmdbBackend               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.backends.lmdb_backend.LmdbBackend           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/lmdb_backend.py      |
| mmcv.fileio.file_client.HardDiskBackend           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.file_client.HardDiskBackend                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/file_client.py                |
| mmcv.fileio.file_client.HTTPBackend               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.backends.http_backend.HTTPBackend           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/http_backend.py      |
| mmcv.fileio.file_client.FileClient                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.file_client.FileClient                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/file_client.py                |
| mmcv.fileio.io.load                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py                      | mmengine.fileio.io.load                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/io.py                         |
| mmcv.fileio.io.dump                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py                      | mmengine.fileio.io.dump                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/io.py                         |
| mmcv.fileio.io.\_register_handler                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py                      | mmengine.fileio.handlers.\_register_handler                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/registry_utils.py    |
| mmcv.fileio.io.register_handler                   | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py                      | mmengine.fileio.handlers.register_handler                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/registry_utils.py    |
| mmcv.fileio.parse.list_from_file                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/parse.py                   | mmengine.fileio.parse.list_from_file                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/parse.py                      |
| mmcv.fileio.parse.dict_from_file                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/parse.py                   | mmengine.fileio.parse.dict_from_file                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/parse.py                      |
| mmcv.fileio.handlers.base.BaseFileHandler         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/base.py           | mmengine.fileio.handlers.base.BaseFileHandler               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/base.py              |
| mmcv.fileio.handlers.json_handler.set_default     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/json_handler.py   | mmengine.fileio.handlers.json_handler.set_default           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/json_handler.py      |
| mmcv.fileio.handlers.json_handler.JsonHandler     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/json_handler.py   | mmengine.fileio.handlers.json_handler.JsonHandler           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/json_handler.py      |
| mmcv.fileio.handlers.pickle_handler.PickleHandler | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/pickle_handler.py | mmengine.fileio.handlers.pickle_handler.PickleHandler       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/pickle_handler.py    |
| mmcv.fileio.handlers.yaml_handler.YamlHandler     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/yaml_handler.py   | mmengine.fileio.handlers.yaml_handler.YamlHandler           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/yaml_handler.py      |

## `mmcv.runner`

| MMCV                                                                  | MMCV URL                                                                                    | MMEngine                                                                                                                                       | MMEngine URL                                                                                                                                                                                           |
| --------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| mmcv.runner.hooks.logger.base.LoggerHook                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/base.py             | mmengine.hooks.logger_hook.LoggerHook                                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/logger_hook.py                                                                                                                         |
| mmcv.runner.hooks.logger.clearml.ClearMLLoggerHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/clearml.py          | Similar: mmengine.visualization.vis_backend.ClearMLVisBackend                                                                                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |
| mmcv.runner.hooks.logger.dvclive.DvcliveLoggerHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/dvclive.py          | Similar: mmengine.visualization.vis_backend.DVCLiveVisBackend                                                                                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |
| mmcv.runner.hooks.logger.mlflow.MlflowLoggerHook                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/mlflow.py           | Similar: mmengine.visualization.vis_backend.MLflowVisBackend                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |
| mmcv.runner.hooks.logger.neptune.NeptuneLoggerHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/neptune.py          | Similar: mmengine.visualization.vis_backend.NeptuneVisBackend                                                                                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |
| mmcv.runner.hooks.logger.pavi.PaviLoggerHook                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/pavi.py             |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.hooks.logger.segmind.SegmindLoggerHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/segmind.py          |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.hooks.logger.tensorboard.TensorboardLoggerHook            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/tensorboard.py      | Similar: mmengine.visualization.vis_backend.TensorboardVisBackend                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |
| mmcv.runner.hooks.logger.text.TextLoggerHook                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/text.py             |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.hooks.logger.wandb.WandbLoggerHook                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/wandb.py            | Similar: mmengine.visualization.vis_backend.WandbVisBackend                                                                                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |
| mmcv.runner.hooks.checkpoint.CheckpointHook                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/checkpoint.py              | mmengine.hooks.checkpoint_hook.CheckpointHook                                                                                                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py                                                                                                                     |
| mmcv.runner.hooks.closure.ClosureHook                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/closure.py                 |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.hooks.ema.EMAHook                                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/ema.py                     | mmengine.hooks.ema_hook.EMAHook                                                                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/ema_hook.py                                                                                                                            |
| mmcv.runner.hooks.evaluation.EvalHook                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/evaluation.py              | Some features have been moved to: mmengine.hooks.checkpoint_hook.CheckpointHook                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py                                                                                                                     |
| mmcv.runner.hooks.evaluation.DistEvalHook                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/evaluation.py              | Some features have been moved to: mmengine.hooks.checkpoint_hook.CheckpointHook                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py                                                                                                                     |
| mmcv.runner.hooks.hook.HOOKS                                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/hook.py                    | mmengine.registry.root.HOOKS                                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py                                                                                                                             |
| mmcv.runner.hooks.hook.Hook                                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/hook.py                    | mmengine.hooks.hook.Hook                                                                                                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/hook.py                                                                                                                                |
| mmcv.runner.hooks.iter_timer.IterTimerHook                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/iter_timer.py              | mmengine.hooks.iter_timer_hook.IterTimerHook                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/iter_timer_hook.py                                                                                                                     |
| mmcv.runner.hooks.lr_updater.LrUpdaterHook                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.LRSchedulerMixin                                                                                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.FixedLrUpdaterHook                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.ConstantLR                                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.StepLrUpdaterHook                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.StepLR                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.ExpLrUpdaterHook                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.ExponentialLR                                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.PolyLrUpdaterHook                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.PolyLR                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.InvLrUpdaterHook                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.hooks.lr_updater.CosineAnnealingLrUpdaterHook             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.CosineAnnealingLR                                                                                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.FlatCosineAnnealingLrUpdaterHook         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.hooks.lr_updater.CosineRestartLrUpdaterHook               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.CosineRestartLR                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.get_position_from_periods                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.CosineRestartLR.get_position_from_periods                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.CyclicLrUpdaterHook                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.hooks.lr_updater.OneCycleLrUpdaterHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.OneCycleLR                                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.LinearAnnealingLrUpdaterHook             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.hooks.lr_updater.annealing_cos                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.OneCycleLR.\_annealing_cos                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.annealing_linear                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.OneCycleLR.\_annealing_linear                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.format_param                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.OneCycleLR.\_format_param                                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.memory.EmptyCacheHook                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/memory.py                  | mmengine.hooks.empty_cache_hook.EmptyCacheHook                                                                                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/empty_cache_hook.py                                                                                                                    |
| mmcv.runner.hooks.momentum_updater.MomentumUpdaterHook                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        | mmengine.optim.scheduler.momentum_scheduler.MomentumSchedulerMixin                                                                             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py                                                                                                        |
| mmcv.runner.hooks.momentum_updater.StepMomentumUpdaterHook            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        | mmengine.optim.scheduler.momentum_scheduler.StepMomentum                                                                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py                                                                                                        |
| mmcv.runner.hooks.momentum_updater.CosineAnnealingMomentumUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        | mmengine.optim.scheduler.momentum_scheduler.CosineAnnealingMomentum                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py                                                                                                        |
| mmcv.runner.hooks.momentum_updater.LinearAnnealingMomentumUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.hooks.momentum_updater.CyclicMomentumUpdaterHook          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.hooks.momentum_updater.OneCycleMomentumUpdaterHook        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.hooks.optimizer.OptimizerHook                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               | mmengine.optim.optimizer.optimizer_wrapper.OptimWrapper                                                                                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/optimizer_wrapper.py                                                                                                         |
| mmcv.runner.hooks.optimizer.GradientCumulativeOptimizerHook           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.hooks.optimizer.Fp16OptimizerHook                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               | Moved to: mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |
| mmcv.runner.hooks.optimizer.GradientCumulativeFp16OptimizerHook       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.hooks.optimizer.Fp16OptimizerHook                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               | Moved to: mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |
| mmcv.runner.hooks.optimizer.GradientCumulativeFp16OptimizerHook       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.hooks.profiler.ProfilerHook                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/profiler.py                | mmengine.hooks.profiler_hook.ProfilerHook                                                                                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/profiler_hook.py                                                                                                                       |
| mmcv.runner.hooks.sampler_seed.DistSamplerSeedHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/sampler_seed.py            | mmengine.hooks.sampler_seed_hook.DistSamplerSeedHook                                                                                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/sampler_seed_hook.py                                                                                                                   |
| mmcv.runner.hooks.sync_buffer.SyncBuffersHook                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/sync_buffer.py             | mmengine.hooks.sync_buffer_hook.SyncBuffersHook                                                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/sync_buffer_hook.py                                                                                                                    |
| mmcv.runner.optimizer.builder.OPTIMIZERS                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             | mmengine.registry.root.OPTIMIZERS                                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py                                                                                                                             |
| mmcv.runner.optimizer.builder.OPTIMIZER_BUILDERS                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.optimizer.builder.register_torch_optimizers               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             | mmengine.optim.optimizer.builder.register_torch_optimizers                                                                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/builder.py                                                                                                                   |
| mmcv.runner.optimizer.builder.TORCH_OPTIMIZERS                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.optimizer.builder.build_optimizer_constructor             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.optimizer.builder.build_optimizer                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.optimizer.default_constructor.DefaultOptimizerConstructor | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/default_constructor.py |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.base_module.BaseModule                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py                   | mmengine.model.base_module.BaseModule                                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py                                                                                                                         |
| mmcv.runner.base_module.Sequential                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py                   | mmengine.model.base_module.Sequential                                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py                                                                                                                         |
| mmcv.runner.base_module.ModuleList                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py                   | mmengine.model.base_module.ModuleList                                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py                                                                                                                         |
| mmcv.runner.base_module.ModuleDict                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py                   | mmengine.model.base_module.ModuleDict                                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py                                                                                                                         |
| mmcv.runner.base_runner.BaseRunner                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_runner.py                   | mmengine.runner.runner.Runner                                                                                                                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/runner.py                                                                                                                             |
| mmcv.runner.builder.RUNNERS                                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py                       | mmengine.registry.root.RUNNERS                                                                                                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py                                                                                                                             |
| mmcv.runner.builder.RUNNER_BUILDERS                                   | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py                       | mmengine.registry.root.RUNNER_CONSTRUCTORS                                                                                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py                                                                                                                             |
| mmcv.runner.builder.build_runner_constructor                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py                       |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.builder.build_runner                                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py                       |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.checkpoint.ENV_MMCV_HOME                                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.ENV_MMENGINE_HOME                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.ENV_XDG_CACHE_HOME                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.ENV_XDG_CACHE_HOME                                                                                                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.DEFAULT_CACHE_HOME                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.DEFAULT_CACHE_DIR                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.\_get_mmcv_home                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.\_get_mmengine_home                                                                                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.load_state_dict                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_state_dict                                                                                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.get_torchvision_models                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.get_torchvision_models                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.get_external_models                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.get_external_models                                                                                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.get_mmcls_models                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.get_mmcls_models                                                                                                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.get_deprecated_model_names                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.get_deprecated_model_names                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.\_process_mmcls_checkpoint                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.\_process_mmcls_checkpoint                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.CheckpointLoader                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.CheckpointLoader                                                                                                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.load_from_local                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_local                                                                                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.load_from_http                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_http                                                                                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.load_from_pavi                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_pavi                                                                                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.load_from_ceph                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_ceph                                                                                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.load_from_torchvision                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_torchvision                                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.load_from_openmmlab                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_openmmlab                                                                                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.load_from_mmcls                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_mmcls                                                                                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.\_load_checkpoint                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.\_load_checkpoint                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.\_load_checkpoint_with_prefix                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.\_load_checkpoint_with_prefix                                                                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.load_checkpoint                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_checkpoint                                                                                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.weights_to_cpu                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.weights_to_cpu                                                                                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.\_save_to_state_dict                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.\_save_to_state_dict                                                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.get_state_dict                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.get_state_dict                                                                                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.save_checkpoint                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.save_checkpoint                                                                                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.default_constructor.DefaultRunnerConstructor              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/default_constructor.py           |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.dist_utils.\_find_free_port                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.dist_utils.\_is_free_port                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.dist_utils.init_dist                                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.init_dist                                                                                                                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |
| mmcv.runner.dist_utils.\_init_dist_pytorch                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.\_init_dist_pytorch                                                                                                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |
| mmcv.runner.dist_utils.\_init_dist_mpi                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.\_init_dist_mpi                                                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |
| mmcv.runner.dist_utils.\_init_dist_slurm                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.\_init_dist_slurm                                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |
| mmcv.runner.dist_utils.get_dist_info                                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.get_dist_info                                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |
| mmcv.runner.dist_utils.master_only                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.master_only                                                                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |
| mmcv.runner.dist_utils.allreduce_params                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.dist_utils.allreduce_grads                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.dist_utils.\_allreduce_coalesced                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.epoch_based_runner.EpochBasedRunner                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/epoch_based_runner.py            | mmengine.runner.loops.EpochBasedTrainLoop                                                                                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py                                                                                                                              |
| mmcv.runner.epoch_based_runner.Runner                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/epoch_based_runner.py            |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.fp16_utils.cast_tensor_type                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | Moved fp16-related to: mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |
| mmcv.runner.fp16_utils.auto_fp16                                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | Moved fp16-related to: mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |
| mmcv.runner.fp16_utils.force_fp32                                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | Moved fp16-related to: mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |
| mmcv.runner.fp16_utils.allreduce_grads                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | Moved fp16-related to: mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |
| mmcv.runner.fp16_utils.wrap_fp16_model                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | Moved fp16-related to: mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |
| mmcv.runner.fp16_utils.patch_norm_fp32                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | Moved fp16-related to: mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |
| mmcv.runner.fp16_utils.patch_forward_method                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | Moved fp16-related to: mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |
| mmcv.runner.fp16_utils.LossScaler                                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | Moved fp16-related to: mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |
| mmcv.runner.iter_based_runner.IterLoader                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/iter_based_runner.py             |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.iter_based_runner.IterBasedRunner                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/iter_based_runner.py             | mmengine.runner.loops.IterBasedTrainLoop                                                                                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py                                                                                                                              |
| mmcv.runner.log_buffer.LogBuffer                                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/log_buffer.py                    |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.priority.Priority                                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/priority.py                      | mmengine.runner.priority.Priority                                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/priority.py                                                                                                                           |
| mmcv.runner.priority.get_priority                                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/priority.py                      | mmengine.runner.priority.get_priority                                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/priority.py                                                                                                                           |
| mmcv.runner.utils.get_host_info                                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py                         |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.utils.get_time_str                                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py                         |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.utils.obj_from_dict                                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py                         |                                                                                                                                                |                                                                                                                                                                                                        |
| mmcv.runner.utils.set_random_seed                                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py                         | mmengine.runner.utils.set_random_seed                                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/utils.py                                                                                                                              |

## `mmcv.parallel`

| MMCV                                                           | MMCV URL                                                                       | MMEngine                                                       | MMEngine URL                                                                              |
| -------------------------------------------------------------- | ------------------------------------------------------------------------------ | -------------------------------------------------------------- | ----------------------------------------------------------------------------------------- |
| mmcv.parallel.\_functions.scatter                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py     |                                                                |                                                                                           |
| mmcv.parallel.\_functions.synchronize_stream                   | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py     |                                                                |                                                                                           |
| mmcv.parallel.\_functions.get_input_device                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py     |                                                                |                                                                                           |
| mmcv.parallel.\_functions.Scatter                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py     |                                                                |                                                                                           |
| mmcv.parallel.collate.collate                                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/collate.py        |                                                                |                                                                                           |
| mmcv.parallel.data_container.assert_tensor_type                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/data_container.py |                                                                |                                                                                           |
| mmcv.parallel.data_container.DataContainer                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/data_container.py | Similar: mmengine.structures.base_data_element.BaseDataElement | https://github.com/open-mmlab/mmengine/blob/main/mmengine/structures/base_data_element.py |
| mmcv.parallel.data_parallel.MMDataParallel                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/data_parallel.py  |                                                                |                                                                                           |
| mmcv.parallel.distributed.MMDistributedDataParallel            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/distributed.py    | mmengine.model.wrappers.distributed.MMDistributedDataParallel  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/wrappers/distributed.py   |
| mmcv.parallel.distributed_deprecated.MMDistributedDataParallel | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/distributed_deprecated.py | mmengine.model.wrappers.distributed.MMDistributedDataParallel  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/wrappers/distributed.py   |
| mmcv.parallel.registry.MODULE_WRAPPERS                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/registry.py       | mmengine.registry.root.MODEL_WRAPPERS                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py                |
| mmcv.parallel.scatter_gather.scatter                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/scatter_gather.py |                                                                |                                                                                           |
| mmcv.parallel.scatter_gather.scatter_kwargs                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/scatter_gather.py |                                                                |                                                                                           |
| mmcv.parallel.utils.is_module_wrapper                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/utils.py          | mmengine.model.wrappers.utils.is_model_wrapper                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/wrappers/utils.py         |

## `mmcv.engine`

| MMCV                                 | MMCV URL                                                           | MMEngine | MMEngine URL |
| ------------------------------------ | ------------------------------------------------------------------ | -------- | ------------ |
| mmcv.engine.test.single_gpu_test     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py |          |              |
| mmcv.engine.test.multi_gpu_test      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py |          |              |
| mmcv.engine.test.collect_results_cpu | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py |          |              |
| mmcv.engine.test.collect_results_gpu | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py |          |              |

## `mmcv.device`

| MMCV                                      | MMCV URL                                                                     | MMEngine                         | MMEngine URL                                                              |
| ----------------------------------------- | ---------------------------------------------------------------------------- | -------------------------------- | ------------------------------------------------------------------------- |
| mmcv.device.ipu                           | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/ipu               |                                  |                                                                           |
| mmcv.device.mlu                           | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/mlu               |                                  |                                                                           |
| mmcv.device.mps                           | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/mps               |                                  |                                                                           |
| mmcv.device.npu                           | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/npu               |                                  |                                                                           |
| mmcv.device.\_functions.scatter           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/_functions.py     |                                  |                                                                           |
| mmcv.device.\_functions.Scatter           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/_functions.py     |                                  |                                                                           |
| mmcv.device.scatter_gather.scatter        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/scatter_gather.py |                                  |                                                                           |
| mmcv.device.scatter_gather.scatter_kwargs | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/scatter_gather.py |                                  |                                                                           |
| mmcv.device.utils.get_device              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/utils.py          | mmengine.device.utils.get_device | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py |

## `mmcv.utils`

| MMCV                                                   | MMCV URL                                                                     | MMEngine                                                            | MMEngine URL                                                                                |
| ------------------------------------------------------ | ---------------------------------------------------------------------------- | ------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- |
| mmcv.utils.config.BASE_KEY                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.BASE_KEY                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |
| mmcv.utils.config.DELETE_KEY                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.DELETE_KEY                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |
| mmcv.utils.config.DEPRECATION_KEY                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.DEPRECATION_KEY                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |
| mmcv.utils.config.ConfigDict                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.ConfigDict                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |
| mmcv.utils.config.add_args                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.add_args                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |
| mmcv.utils.config.Config                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.Config                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |
| mmcv.utils.config.DictAction                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.DictAction                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |
| mmcv.utils.device_type.is_ipu_available                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py     |                                                                     |                                                                                             |
| mmcv.utils.device_type.IS_IPU_AVAILABLE                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py     |                                                                     |                                                                                             |
| mmcv.utils.device_type.is_mlu_available                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py     | mmengine.device.utils.is_mlu_available                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py                   |
| mmcv.utils.device_type.is_mps_available                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py     | mmengine.device.utils.is_mps_available                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py                   |
| mmcv.utils.device_type.is_npu_available                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py     | mmengine.device.utils.is_npu_available                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py                   |
| mmcv.utils.hub.\_is_legacy_zip_format                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/hub.py             | mmengine.utils.dl_utils.hub.\_is_legacy_zip_format                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/hub.py             |
| mmcv.utils.hub.\_legacy_zip_load                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/hub.py             | mmengine.utils.dl_utils.hub.\_legacy_zip_load                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/hub.py             |
| mmcv.utils.hub.load_url                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/hub.py             | mmengine.utils.dl_utils.hub.load_url                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/hub.py             |
| mmcv.utils.logging.logger_initialized                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/logging.py         |                                                                     |                                                                                             |
| mmcv.utils.logging.get_logger                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/logging.py         |                                                                     |                                                                                             |
| mmcv.utils.logging.print_log                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/logging.py         |                                                                     |                                                                                             |
| mmcv.utils.misc.\_ntuple                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.\_ntuple                                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.to_1tuple                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.to_1tuple                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.to_2tuple                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.to_2tuple                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.to_3tuple                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.to_3tuple                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.to_4tuple                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.to_4tuple                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.to_ntuple                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.to_ntuple                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.is_str                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.is_str                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.import_modules_from_strings            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.import_modules_from_strings                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.iter_cast                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.iter_cast                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.list_cast                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.list_cast                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.tuple_cast                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.tuple_cast                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.is_seq_of                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.is_seq_of                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.is_list_of                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.is_list_of                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.is_tuple_of                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.is_tuple_of                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.slice_list                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.slice_list                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.concat_list                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.concat_list                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.check_prerequisites                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.check_prerequisites                             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.\_check_py_package                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.\_check_py_package                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.\_check_executable                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.\_check_executable                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.requires_package                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.requires_package                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.requires_executable                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.requires_executable                             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.deprecated_api_warning                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.deprecated_api_warning                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.is_method_overridden                   | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.is_method_overridden                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.has_method                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.has_method                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.parrots_wrapper.TORCH_VERSION               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.TORCH_VERSION               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |
| mmcv.utils.parrots_wrapper.is_cuda_available           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.device.utils.is_cuda_available                             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py                   |
| mmcv.utils.parrots_wrapper.IS_CUDA_AVAILABLE           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py |                                                                     |                                                                                             |
| mmcv.utils.parrots_wrapper.is_rocm_pytorch             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.is_rocm_pytorch             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |
| mmcv.utils.parrots_wrapper.\_get_cuda_home             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_cuda_home             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |
| mmcv.utils.parrots_wrapper.get_build_config            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.get_build_config            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |
| mmcv.utils.parrots_wrapper.\_get_conv                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_conv                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |
| mmcv.utils.parrots_wrapper.\_get_dataloader            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_dataloader            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |
| mmcv.utils.parrots_wrapper.\_get_extension             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_extension             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |
| mmcv.utils.parrots_wrapper.\_get_pool                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_pool                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |
| mmcv.utils.parrots_wrapper.\_get_norm                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_norm                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |
| mmcv.utils.parrots_wrapper.SyncBatchNorm               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.SyncBatchNorm               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |
| mmcv.utils.path.is_filepath                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.is_filepath                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |
| mmcv.utils.path.fopen                                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.fopen                                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |
| mmcv.utils.path.check_file_exist                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.check_file_exist                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |
| mmcv.utils.path.mkdir_or_exist                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.mkdir_or_exist                                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |
| mmcv.utils.path.symlink                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.symlink                                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |
| mmcv.utils.path.scandir                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.scandir                                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |
| mmcv.utils.path.find_vcs_root                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.find_vcs_root                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |
| mmcv.utils.progressbar.ProgressBar                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py     | mmengine.utils.progressbar.ProgressBar                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py              |
| mmcv.utils.progressbar.track_progress                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py     | mmengine.utils.progressbar.track_progress                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py              |
| mmcv.utils.progressbar.init_pool                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py     | mmengine.utils.progressbar.init_pool                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py              |
| mmcv.utils.progressbar.track_parallel_progress         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py     | mmengine.utils.progressbar.track_parallel_progress                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py              |
| mmcv.utils.progressbar.track_iter_progress             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py     | mmengine.utils.progressbar.track_iter_progress                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py              |
| mmcv.utils.registry.build_from_cfg                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/registry.py        | mmengine.registry.build_functions.build_from_cfg                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/build_functions.py       |
| mmcv.utils.registry.Registry                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/registry.py        | mmengine.registry.registry.Registry                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/registry.py              |
| mmcv.utils.seed.worker_init_fn                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/seed.py            | mmengine.dataset.utils.worker_init_fn                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/utils.py                  |
| mmcv.utils.testing.check_python_script                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.check_python_script                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |
| mmcv.utils.testing.\_any                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.\_any                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |
| mmcv.utils.testing.assert_dict_contains_subset         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_dict_contains_subset                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |
| mmcv.utils.testing.assert_attrs_equal                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_attrs_equal                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |
| mmcv.utils.testing.assert_dict_has_keys                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_dict_has_keys                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |
| mmcv.utils.testing.assert_keys_equal                   | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_keys_equal                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |
| mmcv.utils.testing.assert_is_norm_layer                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_is_norm_layer                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |
| mmcv.utils.testing.assert_params_all_zeros             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_params_all_zeros                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |
| mmcv.utils.timer.TimerError                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py           | mmengine.utils.timer.TimerError                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py                    |
| mmcv.utils.timer.Timer                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py           | mmengine.utils.timer.Timer                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py                    |
| mmcv.utils.timer.\_g_timers                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py           | mmengine.utils.timer.\_g_timers                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py                    |
| mmcv.utils.timer.check_time                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py           | mmengine.utils.timer.check_time                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py                    |
| mmcv.utils.torch_ops.\_torch_version_meshgrid_indexing | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/torch_ops.py       | mmengine.utils.dl_utils.torch_ops.\_torch_version_meshgrid_indexing | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/torch_ops.py       |
| mmcv.utils.torch_ops.torch_meshgrid                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/torch_ops.py       | mmengine.utils.dl_utils.torch_ops.torch_meshgrid                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/torch_ops.py       |
| mmcv.utils.trace.is_jit_tracing                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/trace.py           | mmengine.utils.dl_utils.trace.is_jit_tracing                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/trace.py           |
| mmcv.utils.version_utils.digit_version                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/version_utils.py   | mmengine.utils.version_utils.digit_version                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/version_utils.py            |
| mmcv.utils.version_utils.\_minimal_ext_cmd             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/version_utils.py   | mmengine.utils.version_utils.\_minimal_ext_cmd                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/version_utils.py            |
| mmcv.utils.version_utils.get_git_hash                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/version_utils.py   | mmengine.utils.version_utils.get_git_hash                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/version_utils.py            |

## `mmcv.cnn`

| MMCV                                         | MMCV URL                                                                 | MMEngine                                   | MMEngine URL                                                             |
| -------------------------------------------- | ------------------------------------------------------------------------ | ------------------------------------------ | ------------------------------------------------------------------------ |
| mmcv.cnn.utils.sync_bn.\_BatchNormXd         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/cnn/utils/sync_bn.py | mmengine.model.utils.\_BatchNormXd         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/utils.py |
| mmcv.cnn.utils.sync_bn.revert_sync_batchnorm | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/cnn/utils/sync_bn.py | mmengine.model.utils.revert_sync_batchnorm | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/utils.py |

## `mmcv.model_zoo`

| MMCV                                 | MMCV URL                                                                            | MMEngine                           | MMEngine URL                                                                        |
| ------------------------------------ | ----------------------------------------------------------------------------------- | ---------------------------------- | ----------------------------------------------------------------------------------- |
| mmcv.model_zoo.deprecated.json       | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/deprecated.json       | mmengine.hub.deprecated.json       | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/deprecated.json       |
| mmcv.model_zoo.mmcls.json            | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/mmcls.json            | mmengine.hub.mmcls.json            | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/mmcls.json            |
| mmcv.model_zoo.open_mmlab.json       | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/open_mmlab.json       | mmengine.hub.openmmlab.json        | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/openmmlab.json        |
| mmcv.model_zoo.torchvision_0.12.json | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/torchvision_0.12.json | mmengine.hub.torchvision_0.12.json | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/torchvision_0.12.json |


================================================
FILE: docs/en/get_started/build.md
================================================
## Build MMCV from source

### Build mmcv

Before installing mmcv, make sure that PyTorch has been successfully installed following the [PyTorch official installation guide](https://pytorch.org/get-started/locally/#start-locally). This can be verified using the following command

```bash
python -c 'import torch;print(torch.__version__)'
```

If version information is output, then PyTorch is installed.

```{note}
If you would like to use `opencv-python-headless` instead of `opencv-python`,
e.g., in a minimum container environment or servers without GUI,
you can first install it before installing MMCV to skip the installation of `opencv-python`.
```

#### Build on Linux

1. Clone the repo

   ```bash
   git clone https://github.com/open-mmlab/mmcv.git
   cd mmcv
   ```

2. Install `ninja` and `psutil` to speed up the compilation

   ```bash
   pip install -r requirements/optional.txt
   ```

3. Check the nvcc version (requires 9.2+. Skip if no GPU available.)

   ```bash
   nvcc --version
   ```

   If the above command outputs the following message, it means that the nvcc setting is OK, otherwise you need to set CUDA_HOME.

   ```
   nvcc: NVIDIA (R) Cuda compiler driver
   Copyright (c) 2005-2020 NVIDIA Corporation
   Built on Mon_Nov_30_19:08:53_PST_2020
   Cuda compilation tools, release 11.2, V11.2.67
   Build cuda_11.2.r11.2/compiler.29373293_0
   ```

   :::{note}
   If you want to support ROCm, you can refer to [AMD ROCm](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html) to install ROCm.
   :::

4. Check the gcc version (requires 5.4+)

   ```bash
   gcc --version
   ```

5. Start building (takes 10+ min)

   ```bash
   pip install -e . -v
   ```

6. Validate the installation

   ```bash
   python .dev_scripts/check_installation.py
   ```

   If no error is reported by the above command, the installation is successful. If there is an error reported, please check [Frequently Asked Questions](../faq.md) to see if there is already a solution.

   If no solution is found, please feel free to open an [issue](https://github.com/open-mmlab/mmcv/issues).

#### Build on macOS

```{note}
If you are using a Mac with an Apple silicon chip, install PyTorch 1.13+, otherwise you will encounter the problem described in [issue #2218](https://github.com/open-mmlab/mmcv/issues/2218).
```

1. Clone the repo

   ```bash
   git clone https://github.com/open-mmlab/mmcv.git
   cd mmcv
   ```

2. Install `ninja` and `psutil` to speed up the compilation

   ```bash
   pip install -r requirements/optional.txt
   ```

3. Start building

   ```bash
   MMCV_WITH_OPS=1 pip install -e .
   ```

4. Validate the installation

   ```bash
   python .dev_scripts/check_installation.py
   ```

   If no error is reported by the above command, the installation is successful. If there is an error reported, please check [Frequently Asked Questions](../faq.md) to see if there is already a solution.

   If no solution is found, please feel free to open an [issue](https://github.com/open-mmlab/mmcv/issues).

#### Build on Windows

Building MMCV on Windows is a bit more complicated than that on Linux.
The following instructions show how to get this accomplished.

##### Prerequisite

The following software is required for building MMCV on Windows.
Install them first.

- [Git](https://git-scm.com/download/win)
  - During installation, tick **add git to Path**.
- [Visual Studio Community 2019](https://visualstudio.microsoft.com)
  - A compiler for C++ and CUDA codes.
- [Miniconda](https://docs.conda.io/en/latest/miniconda.html)
  - Official distributions of Python should work too.
- [CUDA 10.2](https://developer.nvidia.com/cuda-10.2-download-archive)
  - Not required for building CPU version.
  - Customize the installation if necessary. As a recommendation, skip the driver installation if a newer version is already installed.

```{note}
You should know how to set up environment variables, especially `Path`, on Windows. The following instruction relies heavily on this skill.
```

##### Common steps

1. Launch Anaconda prompt from Windows Start menu

   Do not use raw `cmd.exe`, because the instructions below are based on PowerShell syntax.

2. Create a new conda environment

   ```powershell
   (base) PS C:\Users\xxx> conda create --name mmcv python=3.7
   (base) PS C:\Users\xxx> conda activate mmcv  # make sure to activate environment before any operation
   ```

3. Install PyTorch. Choose a version based on your need.

   ```powershell
   # CUDA version
   (mmcv) PS C:\Users\xxx> conda install pytorch torchvision cudatoolkit=10.2 -c pytorch
   # CPU version
   (mmcv) PS C:\Users\xxx> conda install pytorch torchvision cpuonly -c pytorch
   ```

4. Clone the repo

   ```powershell
   (mmcv) PS C:\Users\xxx> git clone https://github.com/open-mmlab/mmcv.git
   (mmcv) PS C:\Users\xxx> cd mmcv
   ```

5. Install `ninja` and `psutil` to speed up the compilation

   ```powershell
   (mmcv) PS C:\Users\xxx\mmcv> pip install -r requirements/optional.txt
   ```

6. Set up MSVC compiler

   Set Environment variable, add `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\Hostx86\x64` to `PATH`, so that `cl.exe` will be available in prompt, as shown below.

   ```powershell
   (mmcv) PS C:\Users\xxx\mmcv> cl
   Microsoft (R) C/C++ Optimizing  Compiler Version 19.27.29111 for x64
   Copyright (C) Microsoft Corporation.   All rights reserved.

   usage: cl [ option... ] filename... [ / link linkoption... ]
   ```

   For compatibility, we use the x86-hosted and x64-targeted compiler. Note `Hostx86\x64` in the path.

   You may want to change the system language to English because pytorch will parse text output from `cl.exe` to check its version. However only utf-8 is recognized. Navigate to Control Panel -> Region -> Administrative -> Language for Non-Unicode programs and change it to English.

##### Build and install MMCV

mmcv can be built in two ways:

1. Full version (CPU ops)

   Module `ops` will be compiled as a pytorch extension, but only x86 code will be compiled. The compiled ops can be executed on CPU only.

2. Full version (CUDA ops)

   Both x86 and CUDA codes of `ops` module will be compiled. The compiled version can be run on both CPU and CUDA-enabled GPU (if implemented).

###### CPU version

Build and install

```powershell
(mmcv) PS C:\Users\xxx\mmcv> python setup.py build_ext
(mmcv) PS C:\Users\xxx\mmcv> python setup.py develop
```

###### GPU version

1. Make sure `CUDA_PATH` or `CUDA_HOME` is already set in `envs` via `ls env:`, desired output is shown as below:

   ```powershell
   (mmcv) PS C:\Users\xxx\mmcv> ls env:

   Name                           Value
   ----                           -----
   CUDA_PATH                      C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2
   CUDA_PATH_V10_1                C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1
   CUDA_PATH_V10_2                C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2
   ```

   This should already be done by CUDA installer. If not, or you have multiple versions of CUDA toolkit installed, set it with

   ```powershell
   (mmcv) PS C:\Users\xxx\mmcv> $env:CUDA_HOME = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2"
   # OR
   (mmcv) PS C:\Users\xxx\mmcv> $env:CUDA_HOME = $env:CUDA_PATH_V10_2 # if CUDA_PATH_V10_2 is in envs:
   ```

2. Set CUDA target arch

   ```shell
   # Here you need to change to the target architecture corresponding to your GPU
   (mmcv) PS C:\Users\xxx\mmcv> $env:TORCH_CUDA_ARCH_LIST="7.5"
   ```

   :::{note}
   Check the compute capability of your GPU from [here](https://developer.nvidia.com/cuda-gpus).

   ```powershell
   (mmcv) PS C:\Users\xxx\mmcv> &"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\extras\demo_suite\deviceQuery.exe"
   Device 0: "NVIDIA GeForce GTX 1660 SUPER"
   CUDA Driver Version / Runtime Version          11.7 / 11.1
   CUDA Capability Major/Minor version number:    7.5
   ```

   The 7.5 above indicates the target architecture. Note: You need to replace v10.2 with your CUDA version in the above command.
   :::

3. Build and install

   ```powershell
   # build
   python setup.py build_ext # if success, cl will be launched to compile ops
   # install
   python setup.py develop
   ```

   ```{note}
   If you are compiling against PyTorch 1.6.0, you might meet some errors from PyTorch as described in [this issue](https://github.com/pytorch/pytorch/issues/42467). Follow [this pull request](https://github.com/pytorch/pytorch/pull/43380/files) to modify the source code in your local PyTorch installation.
   ```

##### Validate installation

```powershell
(mmcv) PS C:\Users\xxx\mmcv> python .dev_scripts/check_installation.py
```

If no error is reported by the above command, the installation is successful. If there is an error reported, please check [Frequently Asked Questions](../faq.md) to see if there is already a solution.
If no solution is found, please feel free to open an [issue](https://github.com/open-mmlab/mmcv/issues).

### Build mmcv-lite

If you need to use PyTorch-related modules, make sure PyTorch has been successfully installed in your environment by referring to the [PyTorch official installation guide](https://github.com/pytorch/pytorch#installation).

1. Clone the repo

   ```bash
   git clone https://github.com/open-mmlab/mmcv.git
   cd mmcv
   ```

2. Start building

   ```bash
   MMCV_WITH_OPS=0 pip install -e . -v
   ```

3. Validate installation

   ```bash
   python -c 'import mmcv;print(mmcv.__version__)'
   ```

### Build mmcv-full on Cambricon MLU Devices

#### Install torch_mlu

##### Option1: Install mmcv-full based on Cambricon docker image

Firstly, install and pull Cambricon docker image (please email service@cambricon.com for the latest release docker):

```bash
docker pull ${docker image}
```

Run and attach to the docker, [Install mmcv-full on MLU device](#install-mmcv\-full-on-cambricon-mlu-device) and [make sure you've installed mmcv-full on MLU device successfully](#test-code)

##### Option2: Install mmcv-full from compiling Cambricon PyTorch source code

Please email service@cambricon.com or contact Cambricon engineers for a suitable version of CATCH package. After you get the suitable version of CATCH package, please follow the steps in ${CATCH-path}/CONTRIBUTING.md to install Cambricon PyTorch.

#### Install mmcv-full on Cambricon MLU device

Clone the repo

```bash
git clone https://github.com/open-mmlab/mmcv.git
```

The mlu-ops library will be downloaded to the default directory (mmcv/mlu-ops) while building MMCV. You can also set `MMCV_MLU_OPS_PATH` to an existing mlu-ops library before building as follows:

```bash
export MMCV_MLU_OPS_PATH=/xxx/xxx/mlu-ops
```

Install mmcv-full

```bash
cd mmcv
export MMCV_WITH_OPS=1
export FORCE_MLU=1
python setup.py install
```

#### Test Code

After finishing previous steps, you can run the following python code to make sure that you've installed mmcv-full on MLU device successfully

```python
import torch
import torch_mlu
from mmcv.ops import sigmoid_focal_loss
x = torch.randn(3, 10).mlu()
x.requires_grad = True
y = torch.tensor([1, 5, 3]).mlu()
w = torch.ones(10).float().mlu()
output = sigmoid_focal_loss(x, y, 2.0, 0.25, w, 'none')
print(output)
```


================================================
FILE: docs/en/get_started/installation.md
================================================
## Installation

There are two versions of MMCV:

- **mmcv**: comprehensive, with full features and various CUDA ops out of box. It takes longer time to build.
- **mmcv-lite**: lite, without CUDA ops but all other features, similar to mmcv\<1.0.0. It is useful when you do not need those CUDA ops.

```{warning}
Do not install both versions in the same environment, otherwise you may encounter errors like `ModuleNotFound`. You need to uninstall one before installing the other. `Installing the full version is highly recommended if CUDA is available`.
```

### Install mmcv

Before installing mmcv, make sure that PyTorch has been successfully installed following the [PyTorch official installation guide](https://pytorch.org/get-started/locally/#start-locally). This can be verified using the following command

```bash
python -c 'import torch;print(torch.__version__)'
```

If version information is output, then PyTorch is installed.

#### Install with mim (recommended)

[mim](https://github.com/open-mmlab/mim) is the package management tool for the OpenMMLab projects, which makes it easy to install mmcv

```bash
pip install -U openmim
mim install mmcv
```

If you find that the above installation command does not use a pre-built package ending with `.whl` but a source package ending with `.tar.gz`, you may not have a pre-built package corresponding to the PyTorch or CUDA or mmcv version, in which case you can [build mmcv from source](build.md).

<details>
<summary>Installation log using pre-built packages</summary>

Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html<br />
Collecting mmcv<br />
<b>Downloading https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/mmcv-2.0.0-cp38-cp38-manylinux1_x86_64.whl</b>

</details>

<details>
<summary>Installation log using source packages</summary>

Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html<br />
Collecting mmcv==2.0.0<br />
<b>Downloading mmcv-2.0.0.tar.gz</b>

</details>

To install a specific version of mmcv, for example, mmcv version 2.0.0, you can use the following command

```bash
mim install mmcv==2.0.0
```

:::{note}
If you would like to use `opencv-python-headless` instead of `opencv-python`,
e.g., in a minimum container environment or servers without GUI,
you can first install it before installing MMCV to skip the installation of `opencv-python`.

Alternatively, if it takes too long to install a dependency library, you can specify the pypi source

```bash
mim install mmcv -i https://pypi.tuna.tsinghua.edu.cn/simple
```

:::

You can run [check_installation.py](https://github.com/open-mmlab/mmcv/blob/main/.dev_scripts/check_installation.py) to check the installation of mmcv after running the installation commands.

#### Install with pip

Use the following command to check the version of CUDA and PyTorch

```bash
python -c 'import torch;print(torch.__version__);print(torch.version.cuda)'
```

Select the appropriate installation command depending on the type of system, CUDA version, PyTorch version, and MMCV version

<html>
<body>
<style>
    select {
        /*z-index: 1000;*/
        position: absolute;
        top: 10px;
        width: 6.7rem;
    }
    #select-container {
        position: relative;
        height: 30px;
    }
    #select-cmd {
        background-color: #f5f6f7;
        font-size: 14px;
        margin-top: 20px;
    }
    /* Keep a 1.3rem gap between adjacent dropdowns (6.7rem wide, placed every 8rem) */
    #select-os {
        /* left: 1.375rem; */
        left: 0;
    }
    #select-cuda {
        /* left: 9.375rem;    9.375 = 1.375 + 6.7 + 1.3 */
        left: 8rem;
    }
    #select-torch {
        /* left: 17.375rem;    17.375 = 9.375 + 6.7 + 1.3 */
        left: 16rem;
    }
    #select-mmcv {
        /* left: 25.375rem;    25.375 = 17.375 + 6.7 + 1.3 */
        left: 24rem;
    }
</style>
<!-- One dropdown each for OS, CUDA, PyTorch and MMCV; every onclick passes the element's own id to handleSelectBlur. -->
<div id="select-container">
    <select
            size="1"
            onmousedown="handleSelectMouseDown(this.id)"
            onclick="clickOutside(this, () => handleSelectBlur(this.id))"
            onchange="changeOS(this.value)"
            id="select-os">
    </select>
    <select
            size="1"
            onmousedown="handleSelectMouseDown(this.id)"
            onclick="clickOutside(this, () => handleSelectBlur(this.id))"
            onchange="changeCUDA(this.value)"
            id="select-cuda">
    </select>
    <select
            size="1"
            onmousedown="handleSelectMouseDown(this.id)"
            onclick="clickOutside(this, () => handleSelectBlur(this.id))"
            onchange="changeTorch(this.value)"
            id="select-torch">
    </select>
    <select
            size="1"
            onmousedown="handleSelectMouseDown(this.id)"
            onclick="clickOutside(this, () => handleSelectBlur(this.id))"
            onchange="changeMMCV(this.value)"
            id="select-mmcv">
    </select>
</div>
<pre id="select-cmd"></pre>
</body>
<script>
    // Currently selected value of each <select> (OS, CUDA, PyTorch, MMCV)
    let osVal, cudaVal, torchVal, mmcvVal;
    /**
     * Invoke `handler` on the first click that lands outside `targetDom`.
     *
     * A click listener is registered on `document`. Clicks inside `targetDom`
     * (and every click when `targetDom` is falsy) are ignored and leave the
     * listener registered; the first other click calls `handler`, if one was
     * given, and then unregisters the listener.
     */
    function clickOutside(targetDom, handler) {
        function onDocumentClick(event) {
            if (!targetDom) return;
            if (targetDom.contains(event.target)) return;
            if (handler !== undefined && handler !== null) {
                handler();
            }
            document.removeEventListener('click', onDocumentClick, false);
        }
        document.addEventListener('click', onDocumentClick, false);
    }
    function changeMMCV(val) {
        mmcvVal = val;
        change("select-mmcv");
    }
    function changeTorch(val) {
        torchVal = val;
        change("select-torch");
    }
    function changeCUDA(val) {
        cudaVal = val;
        change("select-cuda");
    }
    function changeOS(val) {
        osVal = val;
        change("select-os");
    }
    // Helpers that control the `size` (number of visible rows) of the selects.

    /**
     * Expand the select with the given id into a 10-row list box.
     *
     * Only selects holding at least 10 options are expanded; they are also
     * given z-index 100. Unknown ids are ignored.
     */
    function handleSelectMouseDown(id) {
        const select = document.getElementById(id);
        if (!select) return;
        const optionCount = select.options?.length;
        if (!(optionCount >= 10)) return;
        select.size = 10;
        select.style.zIndex = 100;
    }
    /** Reset every <select> element in the document to size = 1. */
    function handleSelectClick() {
        for (const select of Array.from(document.getElementsByTagName("select"))) {
            select.size = 1;
        }
    }
    /**
     * Collapse the select with the given id (size = 1, z-index = 1).
     *
     * When no element matches `id`, every select in the document is reset to
     * size = 1 instead.
     */
    function handleSelectBlur(id) {
        const select = document.getElementById(id);
        if (select) {
            select.size = 1;
            select.style.zIndex = 1;
        } else {
            // No specific element found: fall back to collapsing all selects.
            handleSelectClick();
        }
    }
    /**
     * Render the `pip install` command for the current selection into the
     * `#select-cmd` element.
     *
     * Reads `cudaVal`, `torchVal` and `mmcvVal`. "cpu" and "mps" both map to
     * the `cpu` index; any other CUDA value has its dots removed and is
     * prefixed with `cu`. The last two characters of the torch value are
     * dropped (presumably a trailing ".x" - verify against version.json).
     */
    function changeCmd() {
        const cmd = document.getElementById("select-cmd");
        // e.g: pip install mmcv==2.0.0rc1 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9/index.html
        const template = "pip install mmcv=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html";
        const usesCpuIndex = cudaVal === "cpu" || cudaVal === "mps";
        const cudaVersion = usesCpuIndex ? "cpu" : `cu${cudaVal.split(".").join("")}`;
        const torchVersion = `torch${torchVal.slice(0, -2)}`;
        const withCuda = template.replace("{cu_version}", cudaVersion);
        const withMmcv = withCuda.replace("{mmcv_version}", mmcvVal);
        cmd.textContent = withMmcv.replace("{torch_version}", torchVersion);
    }
    /**
     * Return a new array with the duplicates of `arr` removed, keeping the
     * first occurrence of each value. Anything that is not an array yields [].
     */
    function unique(arr) {
        return Array.isArray(arr) ? Array.from(new Set(arr)) : [];
    }
    /**
     * Build a DocumentFragment holding one <option> per string in `data`.
     *
     * The label prefix is the part of `id` after its first "-" (the whole id
     * when it has no "-"). Options of the "os" select, and the values CPU/MPS
     * (case-insensitive), are labelled with the bare value; every other option
     * is labelled "<prefix> <value>". Each option's `value` is the raw string
     * and each option gets a click listener that calls handleSelectClick.
     */
    function genOptionFragment(data, id) {
        let name = id;
        if (id.includes("-")) {
            name = id.split("-")[1];
        }
        const fragment = new DocumentFragment();
        data.forEach(option => {
            const ele = document.createElement("option");
            const showBareValue = name === "os" || ["CPU", "MPS"].includes(option.toUpperCase());
            ele.textContent = showBareValue ? `${option}` : `${name} ${option}`;
            // Keep the raw value on the element so the dropdown's onchange can read it directly.
            ele.value = option;
            ele.addEventListener('click', handleSelectClick);
            fragment.appendChild(ele);
        });
        return fragment;
    }
    /**
     * Replace the children of the element with the given id (a select) by
     * options generated from `data`. Does nothing when no element matches.
     */
    function findAndAppend(data, id) {
        const options = genOptionFragment(data, id);
        const select = document.getElementById(id);
        if (!select) return;
        select.replaceChildren(options);
    }
    /**
     * Keep the dropdowns in sync after one of them changed, then re-render
     * the install command.
     *
     * 1. Dropdown data is chained: OS ==> cuda ==> torch ==> mmcv. Changing a
     *    dropdown repopulates every dropdown after it in that chain and resets
     *    each of their current values to the first available option.
     * 2. The command line is rebuilt via changeCmd().
     *
     * Ids other than the four select ids are ignored.
     */
    function change(id) {
        // The index in this list equals the number of dependent dropdowns to refresh.
        const dependents = ["select-mmcv", "select-torch", "select-cuda", "select-os"].indexOf(id);
        if (dependents === -1) return;
        const entries = version[osVal];
        if (dependents >= 3) {
            // OS changed: rebuild the cuda options
            const cudaOptions = unique(entries.map(entry => entry.cuda));
            cudaVal = cudaOptions[0];
            findAndAppend(cudaOptions, "select-cuda");
        }
        if (dependents >= 2) {
            // cuda changed: rebuild the torch options
            const torchOptions = entries
                .filter(entry => entry.cuda === cudaVal)
                .map(entry => entry.torch);
            torchVal = torchOptions[0];
            findAndAppend(torchOptions, "select-torch");
        }
        if (dependents >= 1) {
            // torch changed: rebuild the mmcv options (the last matching entry wins)
            let mmcvOptions = [];
            for (const entry of entries) {
                if (entry.cuda === cudaVal && entry.torch === torchVal) {
                    mmcvOptions = entry.mmcv;
                }
            }
            mmcvVal = mmcvOptions[0];
            findAndAppend(mmcvOptions, "select-mmcv");
        }
        changeCmd();
    }
    /**
     * Initialise the selector from `window.version`.
     *
     * The OS dropdown is filled with the keys of the version table and the
     * first key becomes the current OS. `change("select-os")` then fills the
     * cuda/torch/mmcv dropdowns and finishes by calling `changeCmd()` itself,
     * so the command does not need to be rendered a second time here.
     */
    function init() {
        // A document-wide click listener was considered as a fallback for an
        // unreliable select blur event; it is currently disabled:
        // document.addEventListener("click", handleSelectBlur);
        const osNames = Object.keys(window.version);
        osVal = osNames[0];
        findAndAppend(osNames, "select-os");
        change("select-os");
    }
    // Load the version table with XHR once the page has loaded. When this page
    // is opened directly as a local HTML file it has to be served by a local
    // web server, otherwise the request fails as cross-origin.
    // addEventListener is used instead of assigning window.onload so that load
    // handlers installed by other scripts are neither replaced nor able to
    // replace this one.
    window.addEventListener("load", function () {
        const url = "../_static/version.json";
        const request = new XMLHttpRequest();
        // Register the response handler before the request is sent.
        request.onload = function () {
            // Only a 200 response carries the version data.
            if (request.status !== 200) return;
            let data;
            try {
                data = JSON.parse(request.responseText);
            } catch (err) {
                // Malformed JSON: report it and leave the selector empty.
                console.error(`Failed to parse ${url}`, err);
                return;
            }
            window.version = data;
            init();
        };
        request.open("get", url);
        // GET request: no body is sent to the server.
        request.send(null);
    });
</script>
</html>

If you do not find a corresponding version in the dropdown box above, you probably do not have a pre-built package corresponding to the PyTorch or CUDA or mmcv version, at which point you can [build mmcv from source](build.md).

:::{note}
mmcv is only compiled on PyTorch 1.x.0 because the compatibility
usually holds between 1.x.0 and 1.x.1. If your PyTorch version is 1.x.1, you
can install mmcv compiled with PyTorch 1.x.0 and it usually works well.
For example, if your PyTorch version is 1.8.1, you can feel free to choose 1.8.x.
:::

:::{note}
If you would like to use `opencv-python-headless` instead of `opencv-python`,
e.g., in a minimum container environment or servers without GUI,
you can first install it before installing MMCV to skip the installation of `opencv-python`.

Alternatively, if it takes too long to install a dependency library, you can specify the pypi source

```bash
mim install mmcv -i https://pypi.tuna.tsinghua.edu.cn/simple
```

:::

You can run [check_installation.py](https://github.com/open-mmlab/mmcv/blob/main/.dev_scripts/check_installation.py) to check the installation of mmcv after running the installation commands.

#### Using mmcv with Docker

Build with local repository

```bash
git clone https://github.com/open-mmlab/mmcv.git && cd mmcv
docker build -t mmcv -f docker/release/Dockerfile .
```

Or build with remote repository

```bash
docker build -t mmcv https://github.com/open-mmlab/mmcv.git#main:docker/release
```

The [Dockerfile](https://github.com/open-mmlab/mmcv/blob/main/docker/release/Dockerfile) installs the latest released version of mmcv by default, but you can specify mmcv versions to install expected versions.

```bash
docker image build -t mmcv -f docker/release/Dockerfile --build-arg MMCV=2.0.0 .
```

If you also want to use other versions of PyTorch and CUDA, you can also pass them when building docker images.

An example to build an image with PyTorch 1.11 and CUDA 11.3.

```bash
docker build -t mmcv -f docker/release/Dockerfile \
    --build-arg PYTORCH=1.11.0 \
    --build-arg CUDA=11.3 \
    --build-arg CUDNN=8 \
    --build-arg MMCV=2.0.0 .
```

More available versions of PyTorch and CUDA can be found at [dockerhub/pytorch](https://hub.docker.com/r/pytorch/pytorch/tags).

### Install mmcv-lite

If you need to use PyTorch-related modules, make sure PyTorch has been successfully installed in your environment by referring to the [PyTorch official installation guide](https://github.com/pytorch/pytorch#installation).

```bash
pip install mmcv-lite
```


================================================
FILE: docs/en/get_started/introduction.md
================================================
## Introduction

MMCV is a foundational library for computer vision research and provides the following functionalities.

- [Image/Video processing](../understand_mmcv/data_process.md)
- [Image and annotation visualization](../understand_mmcv/visualization.md)
- [Image transformation](../understand_mmcv/data_transform.md)
- [Various CNN architectures](../understand_mmcv/cnn.md)
- [High-quality implementation of common CUDA ops](../understand_mmcv/ops.md)

It supports the following systems:

- Linux
- Windows
- macOS

It supports many research projects as below:

- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark.
- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark.
- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection.
- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark.
- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark.
- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark.
- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox.
- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark.
- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark.
- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark.
- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark.
- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark.
- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark.
- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark.
- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark.
- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox.
- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox.
- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework.


================================================
FILE: docs/en/get_started/previous_versions.md
================================================
## OTHER VERSIONS OF PYTORCH BUILT FOR MMCV-FULL

We no longer provide `mmcv-full` packages compiled under lower versions of `PyTorch`, but for your convenience, you can find them below.

### PyTorch 1.4

| 1.0.0 \<= mmcv_version \<= 1.2.1

#### CUDA 10.1

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.4.0/index.html
```

#### CUDA 9.2

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.4.0/index.html
```

#### CPU

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.4.0/index.html
```

### PyTorch v1.3

| 1.0.0 \<= mmcv_version \<= 1.3.16

#### CUDA 10.1

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.3.0/index.html
```

#### CUDA 9.2

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.3.0/index.html
```

#### CPU

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.3.0/index.html
```


================================================
FILE: docs/en/index.rst
================================================
Welcome to MMCV's documentation!
================================

You can switch between Chinese and English documents in the lower-left corner of the layout.

.. toctree::
   :maxdepth: 2
   :caption: Get Started

   get_started/introduction.md
   get_started/installation.md
   get_started/build.md
   get_started/api_reference.md

.. toctree::
   :maxdepth: 2
   :caption: Understand MMCV

   understand_mmcv/data_process.md
   understand_mmcv/data_transform.md
   understand_mmcv/visualization.md
   understand_mmcv/cnn.md
   understand_mmcv/ops.md

.. toctree::
   :maxdepth: 2
   :caption: Deployment

   deployment/mmcv_ops_definition.md

.. toctree::
   :caption: Switch Language

   switch_language.md

.. toctree::
   :maxdepth: 2
   :caption: Compatibility

   compatibility.md

.. toctree::

   faq.md

.. toctree::
   :maxdepth: 2
   :caption: Community

   community/contributing.md
   community/pr.md

.. toctree::
   :maxdepth: 1
   :caption: API Reference

   mmcv.image <api/image>
   mmcv.video <api/video>
   mmcv.visualization <api/visualization>
   mmcv.cnn <api/cnn>
   mmcv.ops <api/ops>
   mmcv.transforms <api/transforms>
   mmcv.arraymisc <api/arraymisc>
   mmcv.utils <api/utils>

Indices and tables
==================

* :ref:`genindex`
* :ref:`search`


================================================
FILE: docs/en/make.bat
================================================
@ECHO OFF

REM Run from the directory that contains this script (%~dp0 expands to the
REM drive and path of the batch file); the matching popd is at the :end label.
pushd %~dp0

REM Command file for Sphinx documentation

REM Fall back to the "sphinx-build" command unless the caller has already set
REM the SPHINXBUILD environment variable.
if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
REM Documentation sources live next to this script; output goes to _build.
set SOURCEDIR=.
set BUILDDIR=_build

REM With no target argument, just print the Sphinx help text.
if "%1" == "" goto help

REM Probe that the Sphinx command can be launched at all; its output is
REM discarded. "if errorlevel 9009" is true when the exit code is 9009 or
REM higher (9009 is presumably cmd.exe's "command not found" code).
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

REM Forward the first argument as the Sphinx make-mode (-M) target, together
REM with any extra options supplied through SPHINXOPTS.
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%

:end
popd


================================================
FILE: docs/en/switch_language.md
================================================
## <a href='https://mmcv.readthedocs.io/en/latest/'>English</a>

## <a href='https://mmcv.readthedocs.io/zh_CN/latest/'>简体中文</a>


================================================
FILE: docs/en/understand_mmcv/cnn.md
================================================
## CNN

We provide some building bricks for CNNs, including layer building, module bundles and weight initialization.

### Layer building

We may need to try different layers of the same type when running experiments,
but do not want to modify the code from time to time.
Here we provide some layer building methods to construct layers from a dict,
which can be written in configs or specified via command line arguments.

#### Usage

The simplest example is

```python
from mmcv.cnn import build_conv_layer

cfg = dict(type='Conv3d')
layer = build_conv_layer(cfg, in_channels=3, out_channels=8, kernel_size=3)
```

- `build_conv_layer`: Supported types are Conv1d, Conv2d, Conv3d, Conv (alias for Conv2d).
- `build_norm_layer`: Supported types are BN1d, BN2d, BN3d, BN (alias for BN2d), SyncBN, GN, LN, IN1d, IN2d, IN3d, IN (alias for IN2d).
- `build_activation_layer`: Supported types are ReLU, LeakyReLU, PReLU, RReLU, ReLU6, ELU, Sigmoid, Tanh, GELU.
- `build_upsample_layer`: Supported types are nearest, bilinear, deconv, pixel_shuffle.
- `build_padding_layer`: Supported types are zero, reflect, replicate.

#### Extension

We also allow extending the building methods with custom layers and operators.

1. Write and register your own module.

   ```python
   from mmengine.registry import MODELS

   @MODELS.register_module()
   class MyUpsample:

       def __init__(self, scale_factor):
           pass

       def forward(self, x):
           pass
   ```

2. Import `MyUpsample` somewhere (e.g., in `__init__.py`) and then use it.

   ```python
   from mmcv.cnn import build_upsample_layer

   cfg = dict(type='MyUpsample', scale_factor=2)
   layer = build_upsample_layer(cfg)
   ```

### Module bundles

We also provide common module bundles to facilitate the network construction.
`ConvModule` is a bundle of convolution, normalization and activation layers,
please refer to the [api](api.html#mmcv.cnn.ConvModule) for details.

```python
from mmcv.cnn import ConvModule

# conv + bn + relu
conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN'))
# conv + gn + relu
conv = ConvModule(3, 8, 2, norm_cfg=dict(type='GN', num_groups=2))
# conv + relu
conv = ConvModule(3, 8, 2)
# conv
conv = ConvModule(3, 8, 2, act_cfg=None)
# conv + leaky relu
conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='LeakyReLU'))
# bn + conv + relu
conv = ConvModule(
    3, 8, 2, norm_cfg=dict(type='BN'), order=('norm', 'conv', 'act'))
```


================================================
FILE: docs/en/understand_mmcv/data_process.md
================================================
## Data Process

### Image

This module provides some image processing methods, which require `opencv` to be installed first.

#### Read/Write/Show

To read or write image files, use `imread` or `imwrite`.

```python
import mmcv

img = mmcv.imread('test.jpg')
img = mmcv.imread('test.jpg', flag='grayscale')
img_ = mmcv.imread(img)  # nothing will happen, img_ = img
mmcv.imwrite(img, 'out.jpg')
```

To read images from bytes

```python
with open('test.jpg', 'rb') as f:
    data = f.read()
img = mmcv.imfrombytes(data)
```

To show an image file or a loaded image

```python
mmcv.imshow('tests/data/color.jpg')
# this is equivalent to
img = mmcv.imread('tests/data/color.jpg')
mmcv.imshow(img)

for i in range(10):
    img = np.random.randint(256, size=(100, 100, 3), dtype=np.uint8)
    mmcv.imshow(img, win_name='test image', wait_time=200)
```

#### Color space conversion

Supported conversion methods:

- bgr2gray
- gray2bgr
- bgr2rgb
- rgb2bgr
- bgr2hsv
- hsv2bgr

```python
img = mmcv.imread('tests/data/color.jpg')
img1 = mmcv.bgr2rgb(img)
img2 = mmcv.rgb2gray(img1)
img3 = mmcv.bgr2hsv(img)
```

#### Resize

There are three resize methods. All `imresize_*` methods have an argument `return_scale`.
If this argument is `False`, the return value is merely the resized image; otherwise,
it is a tuple `(resized_img, scale)`.

```python
# resize to a given size
mmcv.imresize(img, (1000, 600), return_scale=True)

# resize to the same size of another image
mmcv.imresize_like(img, dst_img, return_scale=False)

# resize by a ratio
mmcv.imrescale(img, 0.5)

# resize so that the long edge is no longer than 1000 and the short edge is no longer than 800
# without changing the aspect ratio
mmcv.imrescale(img, (1000, 800))
```

#### Rotate

To rotate an image by some angle, use `imrotate`. The center can be specified,
which is the center of original image by default. There are two modes of rotating,
one is to keep the image size unchanged so that some parts of the image will be
cropped after rotating, the other is to extend the image size to fit the rotated
image.

```python
img = mmcv.imread('tests/data/color.jpg')

# rotate the image clockwise by 30 degrees.
img_ = mmcv.imrotate(img, 30)

# rotate the image counterclockwise by 90 degrees.
img_ = mmcv.imrotate(img, -90)

# rotate the image clockwise by 30 degrees, and rescale it by 1.5x at the same time.
img_ = mmcv.imrotate(img, 30, scale=1.5)

# rotate the image clockwise by 30 degrees, with (100, 100) as the center.
img_ = mmcv.imrotate(img, 30, center=(100, 100))

# rotate the image clockwise by 30 degrees, and extend the image size.
img_ = mmcv.imrotate(img, 30, auto_bound=True)
```

#### Flip

To flip an image, use `imflip`.

```python
img = mmcv.imread('tests/data/color.jpg')

# flip the image horizontally
mmcv.imflip(img)

# flip the image vertically
mmcv.imflip(img, direction='vertical')
```

#### Crop

`imcrop` can crop the image with one or more regions. Each region is represented by the upper left and lower right coordinates as (x1, y1, x2, y2).

```python
import mmcv
import numpy as np

img = mmcv.imread('tests/data/color.jpg')

# crop the region (10, 10, 100, 120)
bboxes = np.array([10, 10, 100, 120])
patch = mmcv.imcrop(img, bboxes)

# crop two regions (10, 10, 100, 120) and (0, 0, 50, 50)
bboxes = np.array([[10, 10, 100, 120], [0, 0, 50, 50]])
patches = mmcv.imcrop(img, bboxes)

# crop two regions, and rescale the patches by 1.2x
patches = mmcv.imcrop(img, bboxes, scale=1.2)
```

#### Padding

There are two methods, `impad` and `impad_to_multiple`, to pad an image to the
specific size with given values.

```python
img = mmcv.imread('tests/data/color.jpg')

# pad the image to (1000, 1200) with all zeros
img_ = mmcv.impad(img, shape=(1000, 1200), pad_val=0)

# pad the image to (1000, 1200) with different values for three channels.
img_ = mmcv.impad(img, shape=(1000, 1200), pad_val=(100, 50, 200))

# pad the image on left, top, right, bottom borders with all zeros
img_ = mmcv.impad(img, padding=(10, 20, 30, 40), pad_val=0)

# pad the image on left, top, right, bottom borders with different values
# for three channels.
img_ = mmcv.impad(img, padding=(10, 20, 30, 40), pad_val=(100, 50, 200))

# pad an image so that each edge is a multiple of some value.
img_ = mmcv.impad_to_multiple(img, 32)
```

### Video

This module provides the following functionalities:

- A `VideoReader` class with friendly apis to read and convert videos.
- Some methods for editing (cut, concat, resize) videos.
- Optical flow read/write/warp.

#### VideoReader

The `VideoReader` class provides sequence like apis to access video frames.
It will internally cache the frames which have been visited.

```python
video = mmcv.VideoReader('test.mp4')

# obtain basic information
print(len(video))
print(video.width, video.height, video.resolution, video.fps)

# iterate over all frames
for frame in video:
    print(frame.shape)

# read the next frame
img = video.read()

# read a frame by index
img = video[100]

# read some frames
img = video[5:10]
```

To convert a video to images or generate a video from an image directory.

```python
# split a video into frames and save to a folder
video = mmcv.VideoReader('test.mp4')
video.cvt2frames('out_dir')

# generate video from frames
mmcv.frames2video('out_dir', 'test.avi')
```

#### Editing utils

There are also some methods for editing videos, which wrap the commands of ffmpeg.

```python
# cut a video clip
mmcv.cut_video('test.mp4', 'clip1.mp4', start=3, end=10, vcodec='h264')

# join a list of video clips
mmcv.concat_video(['clip1.mp4', 'clip2.mp4'], 'joined.mp4', log_level='quiet')

# resize a video with the specified size
mmcv.resize_video('test.mp4', 'resized1.mp4', (360, 240))

# resize a video with a scaling ratio of 2
mmcv.resize_video('test.mp4', 'resized2.mp4', ratio=2)
```

#### Optical flow

`mmcv` provides the following methods to operate on optical flows.

- IO
- Visualization
- Flow warping

We provide two options to dump optical flow files: uncompressed and compressed.
The uncompressed way just dumps the floating numbers to a binary file. It is
lossless but the dumped file has a larger size.
The compressed way quantizes the optical flow to 0-255 and dumps it as a
jpeg image. The flow of x-dim and y-dim will be concatenated into a single image.

1. IO

```python
flow = np.random.rand(800, 600, 2).astype(np.float32)
# dump the flow to a flo file (~3.7M)
mmcv.flowwrite(flow, 'uncompressed.flo')
# dump the flow to a jpeg file (~230K)
# the shape of the dumped image is (800, 1200)
mmcv.flowwrite(flow, 'compressed.jpg', quantize=True, concat_axis=1)

# read the flow file, the shape of loaded flow is (800, 600, 2) for both ways
flow = mmcv.flowread('uncompressed.flo')
flow = mmcv.flowread('compressed.jpg', quantize=True, concat_axis=1)
```

2. Visualization

It is possible to visualize optical flows with `mmcv.flowshow()`.

```python
mmcv.flowshow(flow)
```

![progress](../_static/flow_visualization.png)

3. Flow warping

```python
img1 = mmcv.imread('img1.jpg')
flow = mmcv.flowread('flow.flo')
warped_img2 = mmcv.flow_warp(img1, flow)
```

img1 (left) and img2 (right)

![raw images](../_static/flow_raw_images.png)

optical flow (img2 -> img1)

![optical flow](../_static/flow_img2toimg1.png)

warped image and difference with ground truth

![warped image](../_static/flow_warp_diff.png)


================================================
FILE: docs/en/understand_mmcv/data_transform.md
================================================
# Data Transformation

In the OpenMMLab algorithm library, dataset construction and data preparation are decoupled. Usually, the construction of the dataset only parses the dataset and records the basic information of each sample, while the data preparation is a series of data transformations including data loading, preprocessing, formatting, and other operations performed according to the basic information of the sample.

## Design of data transformation

In MMCV, we use various callable data transformation classes to manipulate data. These data transformation classes can accept several configuration parameters for the instantiation and then process the input data dictionary by `__call__` method. All data transformation methods accept a dictionary as the input and produce the output as a dictionary as well. A simple example is as follows:

```python
>>> import numpy as np
>>> from mmcv.transforms import Resize
>>>
>>> transform = Resize(scale=(224, 224))
>>> data_dict = {'img': np.random.rand(256, 256, 3)}
>>> data_dict = transform(data_dict)
>>> print(data_dict['img'].shape)
(224, 224, 3)
```

The data transformation class reads some fields of the input dictionary and may add or update some fields. The keys of these fields are mostly fixed. For example, `Resize` will always read fields such as `"img"` in the input dictionary. More information about the conventions for input and output fields could be found in the documentation of the corresponding class.

```{note}
By convention, the order of image shape which is used as **initialization parameters** in data transformation (such as Resize, Pad) is (width, height). In the dictionary returned by the data transformation, the image related shape, such as `img_shape`, `ori_shape`, `pad_shape`, etc., is (height, width).
```

MMCV provides a unified base class called `BaseTransform` for all data transformation classes:

```python
class BaseTransform(metaclass=ABCMeta):

    def __call__(self, results: dict) -> dict:

        return self.transform(results)

    @abstractmethod
    def transform(self, results: dict) -> dict:
        pass
```

All data transformation classes must inherit `BaseTransform` and implement the `transform` method. Both the input and output of the `transform` method are dictionaries. In the **Customize data transformation classes** section, we will describe how to implement a data transformation class in more detail.

## Data pipeline

As mentioned above, the inputs and outputs of all data transformations are dictionaries. Moreover, according to the \[Convention on Datasets\] (TODO) in OpenMMLab, the basic information of each sample in the dataset is also a dictionary. This way, we can connect all data transformation operations end to end and combine them into a data pipeline. This pipeline inputs the information dictionary of the samples in the dataset and outputs the information dictionary after a series of processing.

Taking the classification task as an example, we show a typical data pipeline in the figure below. For each sample, the information stored in the dataset is a dictionary, as shown on the far left in the figure. After each data transformation operation represented by the blue block, a new field (marked in green) will be added to the data dictionary or an existing field (marked in orange) will be updated.

<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/154197953-bf0b1a16-3f41-4bc7-9e67-b2b9b323d895.png" width="90%"/>
</div>

The data pipeline is a list of several data transformation configuration dictionaries in the configuration file. Each dataset needs to set the parameter `pipeline` to define the data preparation operations the dataset needs to perform. The configuration of the above data pipeline in the configuration file is as follows:

```python
pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='Resize', size=256, keep_ratio=True),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]),
    dict(type='ClsFormatBundle')
]

dataset = dict(
    ...
    pipeline=pipeline,
    ...
)
```

## Common data transformation classes

The commonly used data transformation classes can be roughly divided into data loading, data preprocessing and augmentation, and data formatting. In MMCV, we provide some commonly used classes as follows:

### Data loading

To support the loading of large-scale datasets, data is usually not loaded when `Dataset` is initialized. Only the corresponding path is loaded. Therefore, it is necessary to load specific data in the data pipeline.

|            Class            |                    Feature                     |
| :-------------------------: | :--------------------------------------------: |
| [`LoadImageFromFile`](TODO) |              Load from file path               |
|  [`LoadAnnotations`](TODO)  | Load and organize the annotations (bbox, etc.) |

### Data preprocessing and augmentation

Data preprocessing and augmentation usually involve transforming the image itself, such as cropping, padding, scaling, etc.

|              Class               |                        Feature                         |
| :------------------------------: | :----------------------------------------------------: |
|          [`Pad`](TODO)           |                        Padding                         |
|       [`CenterCrop`](TODO)       |                      Center crop                       |
|       [`Normalize`](TODO)        |                  Image normalization                   |
|         [`Resize`](TODO)         |         Resize to the specified size or ratio          |
|      [`RandomResize`](TODO)      |  Scale the image randomly within the specified range   |
| [`RandomMultiscaleResize`](TODO) | Scale the image to a random size from multiple options |
|    [`RandomGrayscale`](TODO)     |                    Random grayscale                    |
|       [`RandomFlip`](TODO)       |                      Random flip                       |
|   [`MultiScaleFlipAug`](TODO)    |    Support scaling and flipping during the testing     |

### Data formatting

Data formatting operations are type conversions performed on the data.

|          Class          |                   Feature                    |
| :---------------------: | :------------------------------------------: |
|   [`ToTensor`](TODO)    | Convert the specified data to `torch.Tensor` |
| [`ImageToTensor`](TODO) |     Convert the image to `torch.Tensor`      |

## Customize data transformation classes

To implement a new data transformation class, you must inherit `BaseTransform` and implement the `transform` method. Here, we use a simple flip transform (`MyFlip`) as an example:

```python
import random
import mmcv
from mmcv.transforms import BaseTransform, TRANSFORMS

@TRANSFORMS.register_module()
class MyFlip(BaseTransform):
    def __init__(self, direction: str):
        super().__init__()
        self.direction = direction

    def transform(self, results: dict) -> dict:
        img = results['img']
        results['img'] = mmcv.imflip(img, direction=self.direction)
        return results
```

Now, we can instantiate `MyFlip` as a callable object to handle our data dictionary.

```python
import numpy as np

transform = MyFlip(direction='horizontal')
data_dict = {'img': np.random.rand(224, 224, 3)}
data_dict = transform(data_dict)
processed_img = data_dict['img']
```

Alternatively, use `MyFlip` transform in the `pipeline` of the config file.

```python
pipeline = [
    ...
    dict(type='MyFlip', direction='horizontal'),
    ...
]
```

It should be noted that if you want to use it in the configuration file, you must ensure that the file where the `MyFlip` class is located can be imported at the runtime.

## Transform wrapper

Transform wrappers are a special class of data transformations. They do not operate on images, labels or other information in the data dictionary by themselves. Instead, they enhance the behavior of data transformations defined in them.

### KeyMapper

`KeyMapper` is used to map fields in the data dictionary. For example, image processing transforms usually get their values from the `"img"` field in the data dictionary. But sometimes we want these transforms to handle images in other fields in the data dictionary, such as the `"gt_img"` field.

When used with registry and configuration file, the field map wrapper should be used as follows:

```python
pipeline = [
    ...
    dict(type='KeyMapper',
        mapping={
            'img': 'gt_img',  # map "gt_img" to "img"
            'mask': ...,  # The "mask" field in the raw data is not used. That is, for wrapped data transformations, the "mask" field is not included in the data
        },
        auto_remap=True,  # remap "img" back to "gt_img" after the transformation
        transforms=[
            # only need to specify "img" in `RandomFlip`
            dict(type='RandomFlip'),
        ]),
    ...
]
```

With `KeyMapper`, we don't need to consider various possible input field names in the `transform` method when we implement the data transformation class. We only need to deal with the default fields.

### RandomChoice and RandomApply

`RandomChoice` is used to randomly select a data transformation pipeline from the given choices. With this wrapper, we can easily implement some data augmentation functions, such as AutoAugment.

In a configuration file, you can use `RandomChoice` as follows:

```python
pipeline = [
    ...
    dict(type='RandomChoice',
        transforms=[
            [
                dict(type='Posterize', bits=4),
                dict(type='Rotate', angle=30.)
            ],  # the first combo option
            [
                dict(type='Equalize'),
                dict(type='Rotate', angle=30)
            ],  # the second combo option
        ],
        prob=[0.4, 0.6]  # the prob of each combo
        ),
    ...
]
```

`RandomApply` is used to randomly perform a combination of data transformations with a specified probability. For example:

```python
pipeline = [
    ...
    dict(type='RandomApply',
        transforms=[dict(type='Rotate', angle=30.)],
        prob=0.3),  # perform the transformation with prob as 0.3
    ...
]
```

### TransformBroadcaster

Usually, a data transformation class only reads the target of an operation from one field. While we can also use `KeyMapper` to change the fields read, there is no way to apply transformations to the data of multiple fields at once. To achieve this, we need to use the multi-target extension wrapper `TransformBroadcaster`.

`TransformBroadcaster` has two uses, one is to apply data transformation to multiple specified fields, and the other is to apply data transformation to a group of targets under a field.

1. Apply to multiple fields

   Suppose we need to apply a data transformation to images in two fields `"lq"` (low-quality) and `"gt"` (ground-truth).

   ```python
   pipeline = [
       dict(type='TransformBroadcaster',
           # apply to the "lq" and "gt" fields respectively, and set the "img" field to both
           mapping={'img': ['lq', 'gt']},
           # remap the "img" field back to the original field after the transformation
           auto_remap=True,
           # whether to share random variables in the transformation of each target
           # see the section on random variable sharing below for more details
           share_random_params=True,
           transforms=[
               # only need to manipulate the "img" field in the `RandomFlip` class
               dict(type='RandomFlip'),
           ])
   ]
   ```

   In the `mapping` setting of the multi-target extension, we can also use `...` to ignore the specified original field. As shown in the following example, the wrapped `RandomCrop` will crop the image in the field `"img"` and update the size of the cropped image if the field `"img_shape"` exists. If we want to do the same random cropping for both image fields `"lq"` and `"gt"` at the same time but update the `"img_shape"` field only once, we can do it as in the example:

   ```python
   pipeline = [
       dict(type='TransformBroadcaster',
           mapping={
               'img': ['lq', 'gt'],
               'img_shape': ['img_shape', ...],
            },
           # remap the "img" and "img_shape" fields back to their original fields after the transformation
           auto_remap=True,
           # whether to share random variables in the transformation of each target
           # see the section on random variable sharing below for more details
           share_random_params=True,
           transforms=[
               # "img" and "img_shape" fields are manipulated in the `RandomCrop` class
               # if "img_shape" is missing, only operate on "img"
               dict(type='RandomCrop'),
           ])
   ]
   ```

2. Apply to a set of targets under a field

   Suppose we need to apply a data transformation to the `"images"` field, which is a list of images.

   ```python
   pipeline = [
       dict(type='TransformBroadcaster',
           # map each image under the "images" field to the "img" field
           mapping={'img': 'images'},
           # remap the images under the "img" field back to the list in the "images" field after the transformation
           auto_remap=True,
           # whether to share random variables in the transformation of each target
           share_random_params=True,
           transforms=[
               # in the `RandomFlip` transformation class, we only need to manipulate the "img" field
               dict(type='RandomFlip'),
           ])
   ]
   ```

#### Decorator `cache_randomness`

In `TransformBroadcaster`, we provide the `share_random_params` option to support sharing random states across multiple data transformations. For example, in a super-resolution task, we want to apply **the same** random transformations **simultaneously** to the low-resolution image and the original image. If we use this function in a custom data transformation class, we need to mark which random variables support sharing in the class. This can be achieved with the decorator `cache_randomness`.

Taking `MyFlip` from the above example, we want to perform flipping randomly with a certain probability:

```python
from mmcv.transforms.utils import cache_randomness

@TRANSFORMS.register_module()
class MyRandomFlip(BaseTransform):
    def __init__(self, prob: float, direction: str):
        super().__init__()
        self.prob = prob
        self.direction = direction

    @cache_randomness  # label the output of the method as a shareable random variable
    def do_flip(self):
        flip = True if random.random() < self.prob else False
        return flip

    def transform(self, results: dict) -> dict:
        img = results['img']
        if self.do_flip():
            results['img'] = mmcv.imflip(img, direction=self.direction)
        return results
```

In the above example, we decorate the `do_flip` method with `cache_randomness`, marking the method return value `flip` as a random variable that supports sharing. Therefore, in the transformation of `TransformBroadcaster` to multiple targets, the value of this variable will remain the same.

#### Decorator `avoid_cache_randomness`

In some cases, we cannot separate the process of generating random variables in data transformation into a class method. For example, modules from third-party libraries used in data transformation encapsulate the relevant parts of random variables inside, making them impossible to be extracted as class methods for data transformation. Such data transformations cannot support shared random variables through the decorator `cache_randomness` annotation, and thus cannot share random variables during multi-target extension.

To avoid misuse of such data transformations in multi-target extensions, we provide another decorator, `avoid_cache_randomness`, to mark such data transformations:

```python
from mmcv.transforms.utils import avoid_cache_randomness

@TRANSFORMS.register_module()
@avoid_cache_randomness
class MyRandomTransform(BaseTransform):

    def transform(self, results: dict) -> dict:
        ...
```

Data transformation classes marked with `avoid_cache_randomness` will throw an exception when their instance is wrapped by `TransformBroadcaster` and the parameter `share_random_params` is set to True. This reminds the user not to use it in this way.

There are a few things to keep in mind when using `avoid_cache_randomness`:

1. `avoid_cache_randomness` is only used to decorate data transformation classes (subclasses of `BaseTransform`) and cannot be used to decorate other general classes, class methods, or functions.
2. When a data transformation decorated with `avoid_cache_randomness` is used as a base class, its subclasses **will not inherit** its feature. If the subclass is still unable to share random variables, `avoid_cache_randomness` should be used again.
3. A data transformation needs to be decorated with `avoid_cache_randomness` only when it is random and cannot share its random parameters. Data transformations without randomness require no decoration.


================================================
FILE: docs/en/understand_mmcv/ops.md
================================================
## ops

We implement common ops used in detection, segmentation, etc.

| Device                       | CPU | CUDA | MLU | MPS | Ascend |
| ---------------------------- | --- | ---- | --- | --- | ------ |
| ActiveRotatedFilter          | √   | √    |     |     | √      |
| AssignScoreWithK             |     | √    |     |     |        |
| BallQuery                    |     | √    | √   |     | √      |
| BBoxOverlaps                 |     | √    | √   | √   | √      |
| BorderAlign                  |     | √    |     |     |        |
| BoxIouRotated                | √   | √    | √   |     | √      |
| BoxIouQuadri                 | √   | √    |     |     |        |
| CARAFE                       |     | √    | √   |     |        |
| ChamferDistance              |     | √    |     |     | √      |
| CrissCrossAttention          |     | √    |     |     |        |
| ContourExpand                | √   |      |     |     |        |
| ConvexIoU                    |     | √    |     |     |        |
| CornerPool                   |     | √    |     |     |        |
| Correlation                  |     | √    |     |     |        |
| Deformable Convolution v1/v2 | √   | √    | √   |     | √      |
| Deformable RoIPool           |     | √    | √   |     | √      |
| DiffIoURotated               |     | √    | √   |     |        |
| DynamicScatter               |     | √    | √   |     |        |
| FurthestPointSample          |     | √    |     |     |        |
| FurthestPointSampleWithDist  |     | √    |     |     |        |
| FusedBiasLeakyrelu           |     | √    |     |     | √      |
| GatherPoints                 |     | √    |     |     | √      |
| GroupPoints                  |     | √    |     |     |        |
| Iou3d                        |     | √    | √   |     |        |
| KNN                          |     | √    |     |     |        |
| MaskedConv                   |     | √    | √   |     | √      |
| MergeCells                   |     | √    |     |     |        |
| MinAreaPolygon               |     | √    |     |     |        |
| ModulatedDeformConv2d        | √   | √    | √   |     | √      |
| MultiScaleDeformableAttn     |     | √    | √   |     | √      |
| NMS                          | √   | √    | √   |     | √      |
| NMSRotated                   | √   | √    | √   |     | √      |
| NMSQuadri                    | √   | √    |     |     |        |
| PixelGroup                   | √   |      |     |     |        |
| PointsInBoxes                | √   | √    |     |     |        |
| PointsInPolygons             |     | √    |     |     | √      |
| PSAMask                      | √   | √    | √   |     | √      |
| RotatedFeatureAlign          | √   | √    | √   |     | √      |
| RoIPointPool3d               |     | √    | √   |     |        |
| RoIPool                      |     | √    | √   |     | √      |
| RoIAlignRotated              | √   | √    | √   |     | √      |
| RiRoIAlignRotated            |     | √    |     |     |        |
| RoIAlign                     | √   | √    | √   |     | √      |
| RoIAwarePool3d               |     | √    | √   |     |        |
| SAConv2d                     |     | √    |     |     |        |
| SigmoidFocalLoss             |     | √    | √   |     | √      |
| SoftmaxFocalLoss             |     | √    |     |     | √      |
| SoftNMS                      |     | √    |     |     |        |
| Sparse Convolution           |     | √    | √   |     |        |
| Synchronized BatchNorm       |     | √    |     |     |        |
| ThreeInterpolate             |     | √    |     |     |        |
| ThreeNN                      |     | √    | √   |     |        |
| TINShift                     |     | √    | √   |     |        |
| UpFirDn2d                    |     | √    |     |     |        |
| Voxelization                 | √   | √    | √   |     | √      |
| PrRoIPool                    |     | √    |     |     |        |
| BezierAlign                  | √   | √    |     |     |        |
| BiasAct                      |     | √    |     |     |        |
| FilteredLrelu                |     | √    |     |     |        |
| Conv2dGradfix                |     | √    |     |     |        |


================================================
FILE: docs/en/understand_mmcv/visualization.md
================================================
## Visualization

`mmcv` can show images and annotations (currently supported types include bounding boxes).

```python
# show an image file
mmcv.imshow('a.jpg')

# show a loaded image
img = np.random.rand(100, 100, 3)
mmcv.imshow(img)

# show image with bounding boxes
img = np.random.rand(100, 100, 3)
bboxes = np.array([[0, 0, 50, 50], [20, 20, 60, 60]])
mmcv.imshow_bboxes(img, bboxes)
```

`mmcv` can also visualize special images such as optical flows.

```python
flow = mmcv.flowread('test.flo')
mmcv.flowshow(flow)
```


================================================
FILE: docs/zh_cn/Makefile
================================================
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


================================================
FILE: docs/zh_cn/_static/css/readthedocs.css
================================================
.header-logo {
    background-image: url("../image/mmcv-logo.png");
    background-size: 85px 40px;
    height: 40px;
    width: 85px;
}

table.colwidths-auto td {
    width: 50%
}


================================================
FILE: docs/zh_cn/_static/version.json
================================================
{
    "Linux": [
        {
            "cuda": "12.1",
            "torch": "2.4.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "12.1",
            "torch": "2.3.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "12.1",
            "torch": "2.2.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "12.1",
            "torch": "2.1.x",
            "mmcv": [
                "2.2.0",
                "2.1.0"
            ]
        },
        {
            "cuda": "11.8",
            "torch": "2.4.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "11.8",
            "torch": "2.3.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "11.8",
            "torch": "2.2.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "11.8",
            "torch": "2.1.x",
            "mmcv": [
                "2.2.0",
                "2.1.0"
            ]
        },
        {
            "cuda": "11.8",
            "torch": "2.0.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.7",
            "torch": "2.0.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.7",
            "torch": "1.13.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.6",
            "torch": "1.13.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.6",
            "torch": "1.12.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.5",
            "torch": "1.11.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.3",
            "torch": "1.12.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.3",
            "torch": "1.11.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.3",
            "torch": "1.10.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.1",
            "torch": "1.10.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.1",
            "torch": "1.9.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.1",
            "torch": "1.8.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.0",
            "torch": "1.7.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.12.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.11.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.10.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.9.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.8.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.7.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.6.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.5.x",
            "mmcv": [
                "2.0.0rc3"
            ]
        },
        {
            "cuda": "10.1",
            "torch": "1.8.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.1",
            "torch": "1.7.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "10.1",
            "torch": "1.6.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "10.1",
            "torch": "1.5.x",
            "mmcv": [
                "2.0.0rc3"
            ]
        },
        {
            "cuda": "9.2",
            "torch": "1.7.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "9.2",
            "torch": "1.6.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "9.2",
            "torch": "1.5.x",
            "mmcv": [
                "2.0.0rc3",
                "2.0.0rc2"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "2.4.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "2.3.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "2.2.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "2.1.x",
            "mmcv": [
                "2.2.0",
                "2.1.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "2.0.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.13.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.12.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.11.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.10.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.9.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.8.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.7.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.6.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.5.x",
            "mmcv": [
                "2.0.0rc3",
                "2.0.0rc2"
            ]
        }
    ],
    "Windows": [
        {
            "cuda": "12.1",
            "torch": "2.3.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "12.1",
            "torch": "2.2.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "12.1",
            "torch": "2.1.x",
            "mmcv": [
                "2.2.0",
                "2.1.0"
            ]
        },
        {
            "cuda": "11.8",
            "torch": "2.3.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "11.8",
            "torch": "2.2.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "11.8",
            "torch": "2.1.x",
            "mmcv": [
                "2.2.0",
                "2.1.0"
            ]
        },
        {
            "cuda": "11.8",
            "torch": "2.0.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.7",
            "torch": "2.0.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.7",
            "torch": "1.13.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.6",
            "torch": "1.13.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.6",
            "torch": "1.12.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.5",
            "torch": "1.11.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.3",
            "torch": "1.12.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.3",
            "torch": "1.11.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.3",
            "torch": "1.10.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.1",
            "torch": "1.10.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.1",
            "torch": "1.9.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "11.1",
            "torch": "1.8.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.10.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.9.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.8.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.7.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3"
            ]
        },
        {
            "cuda": "10.2",
            "torch": "1.6.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "10.1",
            "torch": "1.8.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "10.1",
            "torch": "1.7.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3"
            ]
        },
        {
            "cuda": "10.1",
            "torch": "1.6.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "2.3.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "2.2.x",
            "mmcv": [
                "2.2.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "2.1.x",
            "mmcv": [
                "2.2.0",
                "2.1.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "2.0.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.13.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.12.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.11.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.10.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.9.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.8.x",
            "mmcv": [
                "2.2.0",
                "2.1.0",
                "2.0.1",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.7.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.6.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0rc1"
            ]
        }
    ],
    "macOS": [
        {
            "cuda": "cpu",
            "torch": "2.1.x",
            "mmcv": [
                "2.1.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "2.0.x",
            "mmcv": [
                "2.1.0",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.13.x",
            "mmcv": [
                "2.1.0",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0"
            ]
        },
        {
            "cuda": "mps",
            "torch": "1.13.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.12.x",
            "mmcv": [
                "2.1.0",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.11.x",
            "mmcv": [
                "2.1.0",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.10.x",
            "mmcv": [
                "2.1.0",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.9.x",
            "mmcv": [
                "2.1.0",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.8.x",
            "mmcv": [
                "2.1.0",
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2",
                "2.0.0"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.7.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2"
            ]
        },
        {
            "cuda": "cpu",
            "torch": "1.6.x",
            "mmcv": [
                "2.0.0rc4",
                "2.0.0rc3",
                "2.0.0rc2"
            ]
        }
    ]
}


================================================
FILE: docs/zh_cn/_templates/classtemplate.rst
================================================
.. role:: hidden
    :class: hidden-section
.. currentmodule:: {{ module }}


{{ name | underline}}

.. autoclass:: {{ name }}
    :members:


..
  autogenerated from source/_templates/classtemplate.rst
  note it does not have :inherited-members:


================================================
FILE: docs/zh_cn/api/arraymisc.rst
================================================
.. role:: hidden
    :class: hidden-section

mmcv.arraymisc
===================================

.. contents:: mmcv.arraymisc
   :depth: 2
   :local:
   :backlinks: top

.. currentmodule:: mmcv.arraymisc

.. autosummary::
   :toctree: generated
   :nosignatures:

   quantize
   dequantize


================================================
FILE: docs/zh_cn/api/cnn.rst
================================================
.. role:: hidden
    :class: hidden-section

mmcv.cnn
===================================

.. contents:: mmcv.cnn
   :depth: 2
   :local:
   :backlinks: top

.. currentmodule:: mmcv.cnn

Module
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:
   :template: classtemplate.rst

   ContextBlock
   Conv2d
   Conv3d
   ConvAWS2d
   ConvModule
   ConvTranspose2d
   ConvTranspose3d
   ConvWS2d
   DepthwiseSeparableConvModule
   GeneralizedAttention
   HSigmoid
   HSwish
   LayerScale
   Linear
   MaxPool2d
   MaxPool3d
   NonLocal1d
   NonLocal2d
   NonLocal3d
   Scale
   Swish
   Conv2dRFSearchOp

Build Function
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   build_activation_layer
   build_conv_layer
   build_norm_layer
   build_padding_layer
   build_plugin_layer
   build_upsample_layer

Miscellaneous
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   fuse_conv_bn
   conv_ws_2d
   is_norm
   make_res_layer
   make_vgg_layer
   get_model_complexity_info


================================================
FILE: docs/zh_cn/api/image.rst
================================================
.. role:: hidden
    :class: hidden-section

mmcv.image
===================================

.. contents:: mmcv.image
   :depth: 2
   :local:
   :backlinks: top

.. currentmodule:: mmcv.image

IO
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   imfrombytes
   imread
   imwrite
   use_backend

Color Space
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   bgr2gray
   bgr2hls
   bgr2hsv
   bgr2rgb
   bgr2ycbcr
   gray2bgr
   gray2rgb
   hls2bgr
   hsv2bgr
   imconvert
   rgb2bgr
   rgb2gray
   rgb2ycbcr
   ycbcr2bgr
   ycbcr2rgb

Geometric
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   cutout
   imcrop
   imflip
   impad
   impad_to_multiple
   imrescale
   imresize
   imresize_like
   imresize_to_multiple
   imrotate
   imshear
   imtranslate
   rescale_size

Photometric
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   adjust_brightness
   adjust_color
   adjust_contrast
   adjust_hue
   adjust_lighting
   adjust_sharpness
   auto_contrast
   clahe
   imdenormalize
   imequalize
   iminvert
   imnormalize
   lut_transform
   posterize
   solarize

Miscellaneous
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   tensor2imgs


================================================
FILE: docs/zh_cn/api/ops.rst
================================================
.. role:: hidden
    :class: hidden-section

mmcv.ops
===================================

.. contents:: mmcv.ops
   :depth: 2
   :local:
   :backlinks: top

.. currentmodule:: mmcv.ops

.. autosummary::
   :toctree: generated
   :nosignatures:
   :template: classtemplate.rst

   BorderAlign
   CARAFE
   CARAFENaive
   CARAFEPack
   Conv2d
   ConvTranspose2d
   CornerPool
   Correlation
   CrissCrossAttention
   DeformConv2d
   DeformConv2dPack
   DeformRoIPool
   DeformRoIPoolPack
   DynamicScatter
   FusedBiasLeakyReLU
   GroupAll
   Linear
   MaskedConv2d
   MaxPool2d
   ModulatedDeformConv2d
   ModulatedDeformConv2dPack
   ModulatedDeformRoIPoolPack
   MultiScaleDeformableAttention
   PSAMask
   PointsSampler
   PrRoIPool
   QueryAndGroup
   RiRoIAlignRotated
   RoIAlign
   RoIAlignRotated
   RoIAwarePool3d
   RoIPointPool3d
   RoIPool
   SAConv2d
   SigmoidFocalLoss
   SimpleRoIAlign
   SoftmaxFocalLoss
   SparseConv2d
   SparseConv3d
   SparseConvTensor
   SparseConvTranspose2d
   SparseConvTranspose3d
   SparseInverseConv2d
   SparseInverseConv3d
   SparseMaxPool2d
   SparseMaxPool3d
   SparseModule
   SparseSequential
   SubMConv2d
   SubMConv3d
   SyncBatchNorm
   TINShift
   Voxelization

.. autosummary::
   :toctree: generated
   :nosignatures:

   active_rotated_filter
   assign_score_withk
   ball_query
   batched_nms
   bbox_overlaps
   border_align
   box_iou_rotated
   boxes_iou3d
   boxes_iou_bev
   boxes_overlap_bev
   carafe
   carafe_naive
   chamfer_distance
   contour_expand
   convex_giou
   convex_iou
   deform_conv2d
   deform_roi_pool
   diff_iou_rotated_2d
   diff_iou_rotated_3d
   dynamic_scatter
   furthest_point_sample
   furthest_point_sample_with_dist
   fused_bias_leakyrelu
   gather_points
   grouping_operation
   knn
   masked_conv2d
   min_area_polygons
   modulated_deform_conv2d
   nms
   nms3d
   nms3d_normal
   nms_bev
   nms_match
   nms_normal_bev
   nms_rotated
   pixel_group
   point_sample
   points_in_boxes_all
   points_in_boxes_cpu
   points_in_boxes_part
   points_in_polygons
   prroi_pool
   rel_roi_point_to_rel_img_point
   riroi_align_rotated
   roi_align
   roi_align_rotated
   roi_pool
   rotated_feature_align
   scatter_nd
   sigmoid_focal_loss
   soft_nms
   softmax_focal_loss
   three_interpolate
   three_nn
   tin_shift
   upfirdn2d
   voxelization


================================================
FILE: docs/zh_cn/api/transforms.rst
================================================
.. role:: hidden
    :class: hidden-section

mmcv.transforms
===================================

.. currentmodule:: mmcv.transforms

.. autosummary::
   :toctree: generated
   :nosignatures:
   :template: classtemplate.rst

   BaseTransform
   TestTimeAug

Loading
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:
   :template: classtemplate.rst

   LoadAnnotations
   LoadImageFromFile

Processing
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:
   :template: classtemplate.rst

   CenterCrop
   MultiScaleFlipAug
   Normalize
   Pad
   RandomChoiceResize
   RandomFlip
   RandomGrayscale
   RandomResize
   Resize
   ToTensor
   ImageToTensor

Wrapper
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:
   :template: classtemplate.rst

   Compose
   KeyMapper
   RandomApply
   RandomChoice
   TransformBroadcaster


================================================
FILE: docs/zh_cn/api/utils.rst
================================================
.. role:: hidden
    :class: hidden-section

mmcv.utils
===================================

.. contents:: mmcv.utils
   :depth: 2
   :local:
   :backlinks: top

.. currentmodule:: mmcv.utils

.. autosummary::
   :toctree: generated
   :nosignatures:

   IS_CUDA_AVAILABLE
   IS_MLU_AVAILABLE
   IS_MPS_AVAILABLE
   collect_env
   jit
   skip_no_elena


================================================
FILE: docs/zh_cn/api/video.rst
================================================
.. role:: hidden
    :class: hidden-section

mmcv.video
===================================

.. contents:: mmcv.video
   :depth: 2
   :local:
   :backlinks: top

.. currentmodule:: mmcv.video

IO
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:
   :template: classtemplate.rst

   VideoReader
   Cache

.. autosummary::
   :toctree: generated
   :nosignatures:

   frames2video

Optical Flow
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   dequantize_flow
   flow_from_bytes
   flow_warp
   flowread
   flowwrite
   quantize_flow
   sparse_flow_from_bytes

Video Processing
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   concat_video
   convert_video
   cut_video
   resize_video


================================================
FILE: docs/zh_cn/api/visualization.rst
================================================
.. role:: hidden
    :class: hidden-section

mmcv.visualization
===================================

.. contents:: mmcv.visualization
   :depth: 2
   :local:
   :backlinks: top

.. currentmodule:: mmcv.visualization

Color
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:
   :template: classtemplate.rst

   Color

.. autosummary::
   :toctree: generated
   :nosignatures:

   color_val

Image
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   imshow
   imshow_bboxes
   imshow_det_bboxes

Optical Flow
----------------

.. autosummary::
   :toctree: generated
   :nosignatures:

   flow2rgb
   flowshow
   make_color_wheel


================================================
FILE: docs/zh_cn/community/code_style.md
================================================
## 代码规范

### 代码规范标准

#### PEP 8 —— Python 官方代码规范

[Python 官方的代码风格指南](https://www.python.org/dev/peps/pep-0008/)，包含了以下几个方面的内容：

- 代码布局，介绍了 Python 中空行、断行以及导入相关的代码风格规范。比如一个常见的问题：当我的代码较长，无法在一行写下时，何处可以断行？

- 表达式，介绍了 Python 中表达式空格相关的一些风格规范。

- 尾随逗号相关的规范。当列表较长，无法一行写下而写成如下逐行列表时，推荐在末项后加逗号，从而便于追加选项、版本控制等。

  ```python
  # Correct:
  FILES = ['setup.cfg', 'tox.ini']
  # Correct:
  FILES = [
      'setup.cfg',
      'tox.ini',
  ]
  # Wrong:
  FILES = ['setup.cfg', 'tox.ini',]
  # Wrong:
  FILES = [
      'setup.cfg',
      'tox.ini'
  ]
  ```

- 命名相关规范、注释相关规范、类型注解相关规范，我们将在后续章节中做详细介绍。

  "A style guide is about consistency. Consistency with this style guide is important. Consistency within a project is more important. Consistency within one module or function is the most important." PEP 8 -- Style Guide for Python Code

:::{note}
PEP 8 的代码规范并不是绝对的，项目内的一致性要优先于 PEP 8 的规范。OpenMMLab 各个项目都在 setup.cfg 设定了一些代码规范的设置，请遵照这些设置。一个例子是在 PEP 8 中有如下一个例子：

```python
# Correct:
hypot2 = x*x + y*y
# Wrong:
hypot2 = x * x + y * y
```

这一规范是为了指示不同优先级，但 OpenMMLab 的设置中通常没有启用 yapf 的 `ARITHMETIC_PRECEDENCE_INDICATION` 选项，因而格式规范工具不会按照推荐样式格式化，以设置为准。
:::

#### Google 开源项目风格指南

[Google 使用的编程风格指南](https://google.github.io/styleguide/pyguide.html)，包括了 Python 相关的章节。相较于 PEP 8，该指南提供了更为详尽的代码指南。该指南包括了语言规范和风格规范两个部分。

其中，语言规范对 Python 中很多语言特性进行了优缺点的分析，并给出了使用指导意见，如异常、Lambda 表达式、列表推导式、metaclass 等。

风格规范的内容与 PEP 8 较为接近，大部分约定建立在 PEP 8 的基础上，也有一些更为详细的约定，如函数长度、TODO 注释、文件与 socket 对象的访问等。

推荐将该指南作为参考进行开发，但不必严格遵照，一来该指南存在一些 Python 2 兼容需求，例如指南中要求所有无基类的类应当显式地继承 `object`，而在仅使用 Python 3 的环境中，这一要求是不必要的，依本项目中的惯例即可。二来 OpenMMLab 的项目作为框架级的开源软件，不必对一些高级技巧过于避讳，尤其是 MMCV。但尝试使用这些技巧前应当认真考虑是否真的有必要，并寻求其他开发人员的广泛评估。

另外需要注意的一处规范是关于包的导入，在该指南中，要求导入本地包时必须使用路径全称，且导入的每一个模块都应当单独成行，通常这是不必要的，而且也不符合目前项目的开发惯例，此处进行如下约定：

```python
# Correct
from mmcv.cnn.bricks import (Conv2d, build_norm_layer, DropPath, MaxPool2d,
                             Linear)
from ..utils import ext_loader

# Wrong
from mmcv.cnn.bricks import Conv2d, build_norm_layer, DropPath, MaxPool2d, \
                            Linear  # 使用括号进行连接，而不是反斜杠
from ...utils import is_str  # 最多向上回溯一层，过多的回溯容易导致结构混乱
```

OpenMMLab 项目使用 pre-commit 工具自动格式化代码，详情见[贡献代码](./contributing.md#代码风格)。

### 命名规范

#### 命名规范的重要性

优秀的命名是良好代码可读的基础。基础的命名规范对各类变量的命名做了要求，使读者可以方便地根据代码名了解变量是一个类 / 局部变量 / 全局变量等。而优秀的命名则需要代码作者对于变量的功能有清晰的认识，以及良好的表达能力，从而使读者根据名称就能了解其含义，甚至帮助了解该段代码的功能。

#### 基础命名规范

| 类型            | 公有             | 私有               |
| --------------- | ---------------- | ------------------ |
| 模块            | lower_with_under | \_lower_with_under |
| 包              | lower_with_under |                    |
| 类              | CapWords         | \_CapWords         |
| 异常            | CapWordsError    |                    |
| 函数（方法）    | lower_with_under | \_lower_with_under |
| 函数 / 方法参数 | lower_with_under |                    |
| 全局 / 类内常量 | CAPS_WITH_UNDER  | \_CAPS_WITH_UNDER  |
| 全局 / 类内变量 | lower_with_under | \_lower_with_under |
| 变量            | lower_with_under | \_lower_with_under |
| 局部变量        | lower_with_under |                    |

注意：

- 尽量避免变量名与保留字冲突，特殊情况下如不可避免，可使用一个后置下划线，如 class\_
- 尽量不要使用过于简单的命名，除了约定俗成的循环变量 i，文件变量 f，错误变量 e 等。
- 不会被用到的变量可以命名为 \_，逻辑检查器会将其忽略。

#### 命名技巧

良好的变量命名需要保证三点：

1. 含义准确，没有歧义
2. 长短适中
3. 前后统一

```python
# Wrong
class Masks(metaclass=ABCMeta):  # 命名无法表现基类；Instance or Semantic？
    pass

# Correct
class BaseInstanceMasks(metaclass=ABCMeta):
    pass

# Wrong，不同地方含义相同的变量尽量用统一的命名
def __init__(self, inplanes, planes):
    pass

def __init__(self, in_channels, out_channels):
    pass
```

常见的函数命名方法：

- 动宾命名法：crop_img, init_weights
- 动宾倒置命名法：imread, bbox_flip

注意函数命名与参数的顺序，保证主语在前，符合语言习惯：

- check_keys_exist(key, container)
- check_keys_contain(container, key)

注意避免非常规或统一约定的缩写，如 nb -> num_blocks，in_nc -> in_channels

### docstring 规范

#### 为什么要写 docstring

docstring 是对一个类、一个函数功能与 API 接口的详细描述，有两个功能，一是帮助其他开发者了解代码功能，方便 debug 和复用代码；二是在 Readthedocs 文档中自动生成相关的 API reference 文档，帮助不了解源代码的社区用户使用相关功能。

#### 如何写 docstring

与注释不同，一份规范的 docstring 有着严格的格式要求，以便于 Python 解释器以及 sphinx 进行文档解析，详细的 docstring 约定参见 [PEP 257](https://www.python.org/dev/peps/pep-0257/)。此处以例子的形式介绍各种文档的标准格式，参考格式为 [Google 风格](https://zh-google-styleguide.readthedocs.io/en/latest/google-python-styleguide/python_style_rules/#comments)。

1. 模块文档

   代码风格规范推荐为每一个模块（即 Python 文件）编写一个 docstring，但目前 OpenMMLab 项目大部分没有此类 docstring，因此不做硬性要求。

   ```python
   """A one line summary of the module or program, terminated by a period.

   Leave one blank line. The rest of this docstring should contain an
   overall description of the module or program. Optionally, it may also
   contain a brief description of exported classes and functions and/or usage
   examples.

   Typical usage example:

   foo = ClassFoo()
   bar = foo.FunctionBar()
   """
   ```

2. 类文档

   类文档是我们最常需要编写的，此处，按照 OpenMMLab 的惯例，我们使用了与 Google 风格不同的写法。如下例所示，文档中没有使用 Attributes 描述类属性，而是使用 Args 描述 `__init__` 函数的参数。

   在 Args 中，遵照 `parameter (type): Description.` 的格式，描述每一个参数类型和功能。其中，多种类型可使用 `(float or str)` 的写法，可以为 None 的参数可以写为 `(int, optional)`。

   ```python
   class BaseRunner(metaclass=ABCMeta):
       """The base class of Runner, a training helper for PyTorch.

       All subclasses should implement the following APIs:

       - ``run()``
       - ``train()``
       - ``val()``
       - ``save_checkpoint()``

       Args:
           model (:obj:`torch.nn.Module`): The model to be run.
           batch_processor (callable, optional): A callable method that process
               a data batch. The interface of this method should be
               ``batch_processor(model, data, train_mode) -> dict``.
               Defaults to None.
           optimizer (dict or :obj:`torch.optim.Optimizer`, optional): It can be
               either an optimizer (in most cases) or a dict of optimizers
               (in models that requires more than one optimizer, e.g., GAN).
               Defaults to None.
           work_dir (str, optional): The working directory to save checkpoints
               and logs. Defaults to None.
           logger (:obj:`logging.Logger`): Logger used during training.
                Defaults to None. (The default value is just for backward
                compatibility)
           meta (dict, optional): A dict records some import information such as
               environment info and seed, which will be logged in logger hook.
               Defaults to None.
           max_epochs (int, optional): Total training epochs. Defaults to None.
           max_iters (int, optional): Total training iterations. Defaults to None.
       """

       def __init__(self,
                    model,
                    batch_processor=None,
                    optimizer=None,
                    work_dir=None,
                    logger=None,
                    meta=None,
                    max_iters=None,
                    max_epochs=None):
           ...
   ```

   另外，在一些算法实现的主体类中，建议加入原论文的链接；如果参考了其他开源代码的实现，则应加入 modified from，而如果是直接复制了其他代码库的实现，则应加入 copied from ，并注意源码的 License。如有必要，也可以通过 .. math:: 来加入数学公式

   ```python
   # 参考实现
   # This func is modified from `detectron2
   # <https://github.com/facebookresearch/detectron2/blob/ffff8acc35ea88ad1cb1806ab0f00b4c1c5dbfd9/detectron2/structures/masks.py#L387>`_.

   # 复制代码
   # This code was copied from the `ubelt
   # library <https://github.com/Erotemic/ubelt>`_.

   # 引用论文 & 添加公式
   class LabelSmoothLoss(nn.Module):
       r"""Initializer for the label smoothed cross entropy loss.

       Refers to `Rethinking the Inception Architecture for Computer Vision
       <https://arxiv.org/abs/1512.00567>`_.

       This decreases gap between output scores and encourages generalization.
       Labels provided to forward can be one-hot like vectors (NxC) or class
       indices (Nx1).
       And this accepts linear combination of one-hot like labels from mixup or
       cutmix except multi-label task.

       Args:
           label_smooth_val (float): The degree of label smoothing.
           num_classes (int, optional): Number of classes. Defaults to None.
           mode (str): Refers to notes, Options are "original", "classy_vision",
               "multi_label". Defaults to "classy_vision".
           reduction (str): The method used to reduce the loss.
               Options are "none", "mean" and "sum". Defaults to 'mean'.
           loss_weight (float):  Weight of the loss. Defaults to 1.0.

       Note:
           if the ``mode`` is "original", this will use the same label smooth
           method as the original paper as:

           .. math::
               (1-\epsilon)\delta_{k, y} + \frac{\epsilon}{K}

           where :math:`\epsilon` is the ``label_smooth_val``, :math:`K` is
           the ``num_classes`` and :math:`\delta_{k,y}` is Dirac delta,
           which equals 1 for k=y and 0 otherwise.

           if the ``mode`` is "classy_vision", this will use the same label
           smooth method as the `facebookresearch/ClassyVision
           <https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/losses/label_smoothing_loss.py>`_ repo as:

           .. math::
               \frac{\delta_{k, y} + \epsilon/K}{1+\epsilon}

           if the ``mode`` is "multi_label", this will accept labels from
           multi-label task and smoothing them as:

           .. math::
               (1-2\epsilon)\delta_{k, y} + \epsilon
   ```

```{note}
注意 \`\`here\`\`、\`here\`、"here" 三种引号功能是不同的。

在 reStructuredText 语法中，\`\`here\`\` 表示一段代码；\`here\` 表示斜体；"here" 无特殊含义，一般可用来表示字符串。其中 \`here\` 的用法与 Markdown 中不同，需要多加留意。
另外还有 :obj:\`type\` 这种更规范的表示类的写法，但鉴于长度，不做特别要求，一般仅用于表示非常用类型。
```

3. 方法（函数）文档

   函数文档与类文档的结构基本一致，但需要加入返回值文档。对于较为复杂的函数和类，可以使用 Examples 字段加入示例；如果需要对参数加入一些较长的备注，可以加入 Note 字段进行说明。

   对于使用较为复杂的类或函数，比起看大段大段的说明文字和参数文档，添加合适的示例更能帮助用户迅速了解其用法。需要注意的是，这些示例最好是能够直接在 Python 交互式环境中运行的，并给出一些相对应的结果。如果存在多个示例，可以使用注释简单说明每段示例，也能起到分隔作用。

   ```python
   def import_modules_from_strings(imports, allow_failed_imports=False):
       """Import modules from the given list of strings.

       Args:
           imports (list | str | None): The given module names to be imported.
           allow_failed_imports (bool): If True, the failed imports will return
               None. Otherwise, an ImportError is raised. Defaults to False.

       Returns:
           List[module] | module | None: The imported modules.
           All these three lines in docstring will be compiled into the same
           line in readthedocs.

       Examples:
           >>> osp, sys = import_modules_from_strings(
           ...     ['os.path', 'sys'])
           >>> import os.path as osp_
           >>> import sys as sys_
           >>> assert osp == osp_
           >>> assert sys == sys_
       """
       ...
   ```

   如果函数接口在某个版本发生了变化，需要在 docstring 中加入相关的说明，必要时添加 Note 或者 Warning 进行说明，例如：

   ```python
   class CheckpointHook(Hook):
       """Save checkpoints periodically.

       Args:
           out_dir (str, optional): The root directory to save checkpoints. If
               not specified, ``runner.work_dir`` will be used by default. If
               specified, the ``out_dir`` will be the concatenation of
               ``out_dir`` and the last level directory of ``runner.work_dir``.
               Defaults to None. `Changed in version 1.3.15.`
           file_client_args (dict, optional): Arguments to instantiate a
               FileClient. See :class:`mmcv.fileio.FileClient` for details.
               Defaults to None. `New in version 1.3.15.`

       Warning:
           Before v1.3.15, the ``out_dir`` argument indicates the path where the
           checkpoint is stored. However, in v1.3.15 and later, ``out_dir``
           indicates the root directory and the final path to save checkpoint is
           the concatenation of out_dir and the last level directory of
           ``runner.work_dir``. Suppose the value of ``out_dir`` is
           "/path/of/A" and the value of ``runner.work_dir`` is "/path/of/B",
           then the final path will be "/path/of/A/B".
   ```

   如果参数或返回值里带有需要展开描述字段的 dict，则应该采用如下格式：

   ```python
   def func(x):
       r"""
       Args:
           x (dict): A dict with 2 keys, ``padded_targets``, and ``targets``.

               - ``targets`` (list[Tensor]): A list of tensors.
                 Each tensor has the shape of :math:`(T_i)`. Each
                 element is the index of a character.
               - ``padded_targets`` (Tensor): A tensor of shape :math:`(N)`.
                 Each item is the length of a word.

       Returns:
           dict: A dict with 2 keys, ``padded_targets``, and ``targets``.

           - ``targets`` (list[Tensor]): A list of tensors.
             Each tensor has the shape of :math:`(T_i)`. Each
             element is the index of a character.
           - ``padded_targets`` (Tensor): A tensor of shape :math:`(N)`.
             Each item is the length of a word.
       """
       return x
   ```

```{important}
为了生成 readthedocs 文档，文档的编写需要按照 reStructuredText 文档格式，否则会产生文档渲染错误，在提交 PR 前，最好生成并预览一下文档效果。
语法规范参考：

- [reStructuredText Primer - Sphinx documentation](https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#)
- [Example Google Style Python Docstrings ‒ napoleon 0.7 documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html#example-google)
```

### 注释规范

#### 为什么要写注释

对于一个开源项目，团队合作以及社区之间的合作是必不可少的，因而尤其要重视合理的注释。不写注释的代码，很有可能过几个月自己也难以理解，造成额外的阅读和修改成本。

#### 如何写注释

最需要写注释的是代码中那些技巧性的部分。如果你在下次代码审查的时候必须解释一下，那么你应该现在就给它写注释。对于复杂的操作，应该在其操作开始前写上若干行注释。对于不是一目了然的代码，应在其行尾添加注释。
—— Google 开源项目风格指南

```python
# We use a weighted dictionary search to find out where i is in
# the array. We extrapolate position based on the largest num
# in the array and the array size and then do binary search to
# get the exact number.
if i & (i-1) == 0:  # True if i is 0 or a power of 2.
```

为了提高可读性, 注释应该至少离开代码2个空格.
另一方面, 绝不要描述代码. 假设阅读代码的人比你更懂Python, 他只是不知道你的代码要做什么.
—— Google 开源项目风格指南

```python
# Wrong:
# Now go through the b array and make sure whenever i occurs
# the next element is i+1

# Wrong:
if i & (i-1) == 0:  # True if i bitwise and i-1 is 0.
```

在注释中，可以使用 Markdown 语法，因为开发人员通常熟悉 Markdown 语法，这样可以便于交流理解，如可使用单反引号表示代码和变量（注意不要和 docstring 中的 reStructuredText 语法混淆）

```python
# `_reversed_padding_repeated_twice` is the padding to be passed to
# `F.pad` if needed (e.g., for non-zero padding types that are
# implemented as two ops: padding + conv). `F.pad` accepts paddings in
# reverse order than the dimension.
self._reversed_padding_repeated_twice = _reverse_repeat_tuple(self.padding, 2)
```

#### 注释示例

1. 出自 `mmcv/utils/registry.py`，对于较为复杂的逻辑结构，通过注释，明确了优先级关系。

   ```python
   # self.build_func will be set with the following priority:
   # 1. build_func
   # 2. parent.build_func
   # 3. build_from_cfg
   if build_func is None:
       if parent is not None:
           self.build_func = parent.build_func
       else:
           self.build_func = build_from_cfg
   else:
       self.build_func = build_func
   ```

2. 出自 `mmcv/runner/checkpoint.py`，对于 bug 修复中的一些特殊处理，可以附带相关的 issue 链接，帮助其他人了解 bug 背景。

   ```python
   def _save_ckpt(checkpoint, file):
       # The 1.6 release of PyTorch switched torch.save to use a new
       # zipfile-based file format. It will cause RuntimeError when a
       # checkpoint was saved in high version (PyTorch version>=1.6.0) but
       # loaded in low version (PyTorch version<1.6.0). More details at
       # https://github.com/open-mmlab/mmpose/issues/904
       if digit_version(TORCH_VERSION) >= digit_version('1.6.0'):
           torch.save(checkpoint, file, _use_new_zipfile_serialization=False)
       else:
           torch.save(checkpoint, file)
   ```

### 类型注解

#### 为什么要写类型注解

类型注解是对函数中变量的类型做限定或提示，为代码的安全性提供保障、增强代码的可读性、避免出现类型相关的错误。
Python 没有对类型做强制限制，类型注解只起到一个提示作用，通常你的 IDE 会解析这些类型注解，然后在你调用相关代码时对类型做提示。另外也有类型注解检查工具，这些工具会根据类型注解，对代码中可能出现的问题进行检查，减少 bug 的出现。
需要注意的是，通常我们不需要注释模块中的所有函数：

1. 公共的 API 需要注释
2. 在代码的安全性，清晰性和灵活性上进行权衡是否注释
3. 对于容易出现类型相关的错误的代码进行注释
4. 难以理解的代码请进行注释
5. 若代码中的类型已经稳定，可以进行注释. 对于一份成熟的代码，多数情况下，即使注释了所有的函数，也不会丧失太多的灵活性.

#### 如何写类型注解

1. 函数 / 方法类型注解，通常不对 self 和 cls 注释。

   ```python
   from typing import Optional, List, Tuple

   # 全部位于一行
   def my_method(self, first_var: int) -> int:
       pass

   # 另起一行
   def my_method(
           self, first_var: int,
           second_var: float) -> Tuple[MyLongType1, MyLongType1, MyLongType1]:
       pass

   # 单独成行（具体的应用场合与行宽有关，建议结合 yapf 自动化格式使用）
   def my_method(
       self, first_var: int, second_var: float
   ) -> Tuple[MyLongType1, MyLongType1, MyLongType1]:
       pass

   # 引用尚未被定义的类型
   class MyClass:
       def __init__(self,
                    stack: List["MyClass"]) -> None:
           pass
   ```

   注：类型注解中的类型可以是 Python 内置类型，也可以是自定义类，还可以使用 Python 提供的 wrapper 类对类型注解进行装饰，一些常见的注解如下：

   ```python
   # 数值类型
   from numbers import Number

   # 可选类型，指参数可以为 None
   from typing import Optional
   def foo(var: Optional[int] = None):
       pass

   # 联合类型，指同时接受多种类型
   from typing import Union
   def foo(var: Union[float, str]):
       pass

   from typing import Sequence  # 序列类型
   from typing import Iterable  # 可迭代类型
   from typing import Any  # 任意类型
   from typing import Callable  # 可调用类型

   from typing import List, Dict  # 列表和字典的泛型类型
   from typing import Tuple  # 元组的特殊格式
   # 虽然在 Python 3.9 中，list, tuple 和 dict 本身已支持泛型，但为了支持之前的版本
   # 我们在进行类型注解时还是需要使用 List, Tuple, Dict 类型
   # 另外，在对参数类型进行注解时，尽量使用 Sequence & Iterable & Mapping
   # List, Tuple, Dict 主要用于返回值类型注解
   # 参见 https://docs.python.org/3/library/typing.html#typing.List
   ```

2. 变量类型注解，一般用于难以直接推断其类型时

   ```python
   # Recommend: 带类型注解的赋值
   a: Foo = SomeUndecoratedFunction()
   a: List[int] = [1, 2, 3]        # List 只支持单一类型泛型，可使用 Union
   b: Tuple[int, int] = (1, 2)     # 长度固定为 2
   c: Tuple[int, ...] = (1, 2, 3)  # 变长
   d: Dict[str, int] = {'a': 1, 'b': 2}

   # Not Recommend：行尾类型注释
   # 虽然这种方式被写在了 Google 开源指南中，但这是一种为了支持 Python 2.7 版本
   # 而补充的注释方式，鉴于我们只支持 Python 3, 为了风格统一，不推荐使用这种方式。
   a = SomeUndecoratedFunction()  # type: Foo
   a = [1, 2, 3]  # type: List[int]
   b = (1, 2, 3)  # type: Tuple[int, ...]
   c = (1, "2", 3.5)  # type: Tuple[int, Text, float]
   ```

3. 泛型

   上文中我们知道，typing 中提供了 list 和 dict 的泛型类型，那么我们自己是否可以定义类似的泛型呢？

   ```python
   from typing import TypeVar, Generic

   KT = TypeVar('KT')
   VT = TypeVar('VT')

   class Mapping(Generic[KT, VT]):
       def __init__(self, data: Dict[KT, VT]):
           self._data = data

       def __getitem__(self, key: KT) -> VT:
           return self._data[key]
   ```

   使用上述方法，我们定义了一个拥有泛型能力的映射类，实际用法如下：

   ```python
   mapping = Mapping[str, float]({'a': 0.5})
   value: float = mapping['a']
   ```

   另外，我们也可以利用 TypeVar 在函数签名中指定联动的多个类型：

   ```python
   from typing import TypeVar, List

   T = TypeVar('T')  # Can be anything
   A = TypeVar('A', str, bytes)  # Must be str or bytes


   def repeat(x: T, n: int) -> List[T]:
       """Return a list containing n references to x."""
       return [x]*n


   def longest(x: A, y: A) -> A:
       """Return the longest of two strings."""
       return x if len(x) >= len(y) else y
   ```

更多关于类型注解的写法请参考 [typing](https://docs.python.org/3/library/typing.html)。

#### 类型注解检查工具

[mypy](https://mypy.readthedocs.io/en/stable/) 是一个 Python 静态类型检查工具。根据你的类型注解，mypy 会检查传参、赋值等操作是否符合类型注解，从而避免可能出现的 bug。

例如如下的一个  Python 脚本文件 test.py:

```python
def foo(var: int) -> float:
    return float(var)

a: str = foo('2.0')
b: int = foo('3.0')  # type: ignore
```

运行 mypy test.py 可以得到如下检查结果，分别指出了第 4 行在函数调用和返回值赋值两处类型错误。而第 5 行同样存在两个类型错误，由于使用了 type: ignore 而被忽略了，只有部分特殊情况可能需要此类忽略。

```
test.py:4: error: Incompatible types in assignment (expression has type "float", variable has type "str")
test.py:4: error: Argument 1 to "foo" has incompatible type "str"; expected "int"
Found 2 errors in 1 file (checked 1 source file)
```


================================================
FILE: docs/zh_cn/community/contributing.md
================================================
## 贡献代码

欢迎加入 MMCV 社区，我们致力于打造最前沿的计算机视觉基础库，我们欢迎任何类型的贡献，包括但不限于

**修复错误**

修复代码实现错误的步骤如下：

1. 如果提交的代码改动较大，建议先提交 issue，并正确描述 issue 的现象、原因和复现方式，讨论后确认修复方案。
2. 修复错误并补充相应的单元测试，提交拉取请求。

**新增功能或组件**

1. 如果新功能或模块涉及较大的代码改动，建议先提交 issue，确认功能的必要性。
2. 实现新增功能并添加单元测试，提交拉取请求。

**文档补充**

修复文档可以直接提交拉取请求

添加文档或将文档翻译成其他语言步骤如下

1. 提交 issue，确认添加文档的必要性。
2. 添加文档，提交拉取请求。

### 拉取请求工作流

如果你对拉取请求不了解，没关系，接下来的内容将会从零开始，一步一步地指引你如何创建一个拉取请求。如果你想深入了解拉取请求的开发模式，可以参考 GitHub [官方文档](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests)。

#### 1. 复刻仓库

当你第一次提交拉取请求时，先复刻 OpenMMLab 原代码库，点击 GitHub 页面右上角的 **Fork** 按钮，复刻后的代码库将会出现在你的 GitHub 个人主页下。

<img src="https://user-images.githubusercontent.com/57566630/167305749-43c7f4e9-449b-4e98-ade5-0c9276d5c9ce.png" width="1200">

将代码克隆到本地

```shell
git clone git@github.com:{username}/mmcv.git
```

添加原代码库为上游代码库

```bash
git remote add upstream git@github.com:open-mmlab/mmcv
```

检查 remote 是否添加成功，在终端输入 `git remote -v`

```bash
origin	git@github.com:{username}/mmcv.git (fetch)
origin	git@github.com:{username}/mmcv.git (push)
upstream	git@github.com:open-mmlab/mmcv (fetch)
upstream	git@github.com:open-mmlab/mmcv (push)
```

```{note}
这里对 origin 和 upstream 进行一个简单的介绍，当我们使用 git clone 来克隆代码时，会默认创建一个 origin 的 remote，它指向我们克隆的代码库地址，而 upstream 则是我们自己添加的，用来指向原始代码库地址。当然如果你不喜欢它叫 upstream，也可以自己修改，比如叫 open-mmlab。我们通常向 origin 提交代码（即 fork 下来的远程仓库），然后向 upstream 提交一个 pull request。如果提交的代码和最新的代码发生冲突，再从 upstream 拉取最新的代码，和本地分支解决冲突，再提交到 origin。
```

#### 2. 配置 pre-commit

在本地开发环境中，我们使用 [pre-commit](https://pre-commit.com/#intro) 来检查代码风格，以确保代码风格的统一。在提交代码前，需要先安装 pre-commit（需要在 MMCV 目录下执行）：

```shell
pip install -U pre-commit
pre-commit install
```

检查 pre-commit 是否配置成功，并安装 `.pre-commit-config.yaml` 中的钩子：

```shell
pre-commit run --all-files
```

<img src="https://user-images.githubusercontent.com/57566630/173660750-3df20a63-cb66-4d33-a986-1f643f1d8aaf.png" width="1200">

<img src="https://user-images.githubusercontent.com/57566630/202368856-0465a90d-8fce-4345-918e-67b8b9c82614.png" width="1200">

```{note}
如果你是中国用户，由于网络原因，可能会出现安装失败的情况，这时可以使用国内源

pre-commit install -c .pre-commit-config-zh-cn.yaml

pre-commit run --all-files -c .pre-commit-config-zh-cn.yaml
```

如果安装过程被中断，可以重复执行 `pre-commit run ...` 继续安装。

如果提交的代码不符合代码风格规范，pre-commit 会发出警告，并自动修复部分错误。

<img src="https://user-images.githubusercontent.com/57566630/202369176-67642454-0025-4023-a095-263529107aa3.png" width="1200">

如果我们想临时绕开 pre-commit 的检查提交一次代码，可以在 `git commit` 时加上 `--no-verify`（需要保证最后推送至远程仓库的代码能够通过 pre-commit 检查）。

```shell
git commit -m "xxx" --no-verify
```

#### 3. 创建开发分支

安装完 pre-commit 之后，我们需要基于 main 创建开发分支，建议的分支命名规则为 `username/pr_name`。

```shell
git checkout -b yhc/refactor_contributing_doc
```

在后续的开发中，如果本地仓库的 main 分支落后于 upstream 的 main 分支，我们需要先拉取 upstream 的代码进行同步，再执行上面的命令

```shell
git pull upstream main
```

#### 4. 提交代码并在本地通过单元测试

- MMCV 引入了 mypy 来做静态类型检查，以增加代码的鲁棒性。因此我们在提交代码时，需要补充 Type Hints。具体规则可以参考[教程](https://zhuanlan.zhihu.com/p/519335398)。

- 提交的代码同样需要通过单元测试

  ```shell
  # 通过全量单元测试
  pytest tests

  # 我们需要保证提交的代码能够通过修改模块的单元测试，以 runner 为例
  pytest tests/test_runner/test_runner.py
  ```

  如果你由于缺少依赖无法运行修改模块的单元测试，可以参考[指引-单元测试](#单元测试)

- 如果修改/添加了文档，参考[指引](#文档渲染)确认文档渲染正常。

#### 5. 推送代码到远程

代码通过单元测试和 pre-commit 检查后，将代码推送到远程仓库，如果是第一次推送，可以在 `git push` 后加上 `-u` 参数以关联远程分支

```shell
git push -u origin {branch_name}
```

这样下次就可以直接使用 `git push` 命令推送代码了，而无需指定分支和远程仓库。

#### 6. 提交拉取请求（PR）

(1) 在 GitHub 的 Pull request 界面创建拉取请求
<img src="https://user-images.githubusercontent.com/57566630/201533288-516f7ac4-0b14-4dc8-afbd-912475c368b5.png" width="1200">

(2) 根据指引修改 PR 描述，以便于其他开发者更好地理解你的修改

<img src="https://user-images.githubusercontent.com/57566630/202242953-c91a18ff-e388-4ff9-8591-5fae0ead6c1e.png" width="1200">

描述规范详见[拉取请求规范](#拉取请求规范)

&#160;

**注意事项**

(a) PR 描述应该包含修改理由、修改内容以及修改后带来的影响，并关联相关 Issue（具体方式见[文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)）

(b) 如果是第一次为 OpenMMLab 做贡献，需要签署 CLA

<img src="https://user-images.githubusercontent.com/57566630/167307569-a794b967-6e28-4eac-a942-00deb657815f.png" width="1200">

(c) 检查提交的 PR 是否通过 CI（集成测试）

<img src="https://user-images.githubusercontent.com/57566630/167307490-f9ebf9fa-63c0-4d83-8ba1-081ea169eb3a.png" width="1200">

MMCV 会在不同的平台（Linux、Windows、Mac），基于不同版本的 Python、PyTorch、CUDA 对提交的代码进行单元测试，以保证代码的正确性，如果有任何一个没有通过，我们可点击上图中的 `Details` 来查看具体的测试信息，以便于我们修改代码。

(3) 如果 PR 通过了 CI，那么就可以等待其他开发者的 review，并根据 reviewer 的意见，修改代码，并重复 [4](#4-提交代码并在本地通过单元测试)-[5](#5-推送代码到远程) 步骤，直到 reviewer 同意合入 PR。

<img src="https://user-images.githubusercontent.com/57566630/202145400-cc2cd8c4-10b0-472f-ba37-07e6f50acc67.png" width="1200">

所有 reviewer 同意合入 PR 后，我们会尽快将 PR 合并到主分支。

#### 7. 解决冲突

随着时间的推移，我们的代码库会不断更新，这时候，如果你的 PR 与主分支存在冲突，你需要解决冲突，解决冲突的方式有两种：

```shell
git fetch --all --prune
git rebase upstream/main
```

或者

```shell
git fetch --all --prune
git merge upstream/main
```

如果你非常善于处理冲突，那么可以使用 rebase 的方式来解决冲突，因为这能够保证你的 commit log 的整洁。如果你不太熟悉 `rebase` 的使用，那么可以使用 `merge` 的方式来解决冲突。

### 指引

#### 单元测试

如果你无法正常执行部分模块的单元测试，例如 [video](https://github.com/open-mmlab/mmcv/tree/main/mmcv/video) 模块，可能是你的当前环境没有安装以下依赖

```shell
# Linux
sudo apt-get update -y
sudo apt-get install -y libturbojpeg
sudo apt-get install -y ffmpeg

# Windows
conda install ffmpeg
```

在提交修复代码错误或新增特性的拉取请求时，我们应该尽可能的让单元测试覆盖所有提交的代码，计算单元测试覆盖率的方法如下

```shell
python -m coverage run -m pytest /path/to/test_file
python -m coverage html
# check file in htmlcov/index.html
```

#### 文档渲染

在提交修复代码错误或新增特性的拉取请求时，可能会需要修改/新增模块的 docstring。我们需要确认渲染后的文档样式是正确的。
本地生成渲染后的文档的方法如下

```shell
pip install -r requirements/docs.txt
cd docs/zh_cn/
# or docs/en
make html
# check file in ./docs/zh_cn/_build/html/index.html
```

### 代码风格

#### Python

[PEP8](https://www.python.org/dev/peps/pep-0008/) 作为 OpenMMLab 算法库首选的代码规范，我们使用以下工具检查和格式化代码

- [flake8](https://github.com/PyCQA/flake8): Python 官方发布的代码规范检查工具，是多个检查工具的封装
- [isort](https://github.com/timothycrosley/isort): 自动调整模块导入顺序的工具
- [yapf](https://github.com/google/yapf): Google 发布的代码规范检查工具
- [codespell](https://github.com/codespell-project/codespell): 检查单词拼写是否有误
- [mdformat](https://github.com/executablebooks/mdformat): 检查 markdown 文件的工具
- [docformatter](https://github.com/myint/docformatter): 格式化 docstring 的工具

yapf 和 isort 的配置可以在 [setup.cfg](./setup.cfg) 找到

通过配置 [pre-commit hook](https://pre-commit.com/) ，我们可以在提交代码时自动检查和格式化 `flake8`、`yapf`、`isort`、`trailing whitespaces`、`markdown files`，
修复 `end-of-files`、`double-quoted-strings`、`python-encoding-pragma`、`mixed-line-ending`，调整 `requirements.txt` 的包顺序。
pre-commit 钩子的配置可以在 [.pre-commit-config](./.pre-commit-config.yaml) 找到。

pre-commit 具体的安装使用方式见[拉取请求](#2-配置-pre-commit)。

更具体的规范请参考 [OpenMMLab 代码规范](code_style.md)。

#### C++ and CUDA

C++ 和 CUDA 的代码规范遵从 [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html)

### 拉取请求规范

1. 使用 [pre-commit hook](https://pre-commit.com)，尽量减少代码风格相关问题

2. 一个`拉取请求`对应一个短期分支

3. 粒度要细，一个`拉取请求`只做一件事情，避免超大的`拉取请求`

   - Bad：实现 Faster R-CNN
   - Acceptable：给 Faster R-CNN 添加一个 box head
   - Good：给 box head 增加一个参数来支持自定义的 conv 层数

4. 每次 Commit 时需要提供清晰且有意义的 commit 信息

5. 提供清晰且有意义的`拉取请求`描述

   - 标题写明白任务名称，一般格式:\[Prefix\] Short description of the pull request (Suffix)
   - prefix: 新增功能 \[Feature\], 修 bug \[Fix\], 文档相关 \[Docs\], 开发中 \[WIP\] (暂时不会被review)
   - 描述里介绍`拉取请求`的主要修改内容，结果，以及对其他部分的影响, 参考`拉取请求`模板
   - 关联相关的`议题` (issue) 和其他`拉取请求`

6. 如果引入了其他三方库，或借鉴了三方库的代码，请确认他们的许可证和 mmcv 兼容，并在借鉴的代码上补充 `This code is inspired from http://`


================================================
FILE: docs/zh_cn/community/pr.md
================================================
## 拉取请求

本文档的内容已迁移到[贡献指南](contributing.md)。


================================================
FILE: docs/zh_cn/compatibility.md
================================================
### v2.0.0

OpenMMLab 团队于 2022 年 9 月 1 日在世界人工智能大会发布了新一代训练引擎 [MMEngine](https://github.com/open-mmlab/mmengine)，它是一个用于训练深度学习模型的基础库。相比于 MMCV，它提供了更高级且通用的训练器、接口更加统一的开放架构以及可定制化程度更高的训练流程。

OpenMMLab 团队于 2023 年 4 月 6 日发布 MMCV [v2.0.0](https://github.com/open-mmlab/mmcv/releases/tag/v2.0.0)。在 2.x 版本中，它有以下重大变化：

（1）删除了以下组件：

- `mmcv.fileio` 模块，删除于 PR [#2179](https://github.com/open-mmlab/mmcv/pull/2179)。在需要使用 FileIO 的地方使用 mmengine 中的 FileIO 模块
- `mmcv.runner`、`mmcv.parallel`、`mmcv.engine` 和 `mmcv.device`，删除于 PR [#2216](https://github.com/open-mmlab/mmcv/pull/2216)
- `mmcv.utils` 的所有类（例如 `Config` 和 `Registry`）和大部分函数，删除于 PR [#2217](https://github.com/open-mmlab/mmcv/pull/2217)，只保留少数和 mmcv 相关的函数
- `mmcv.onnx`、`mmcv.tensorrt` 模块以及相关的函数，删除于 PR [#2225](https://github.com/open-mmlab/mmcv/pull/2225)
- 删除 MMCV 所有的根注册器并将类或者函数注册到 MMEngine 的[根注册器](https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py)

（2）新增了 [`mmcv.transforms`](https://github.com/open-mmlab/mmcv/tree/main/mmcv/transforms) 数据变换模块

（3）在 PR [#2235](https://github.com/open-mmlab/mmcv/pull/2235) 中将包名 **mmcv** 重命名为 **mmcv-lite**、 **mmcv-full** 重命名为 **mmcv**。此外，将环境变量 `MMCV_WITH_OPS` 的默认值从 0 改为 1

<table class="docutils">
<thead>
  <tr>
    <th align="center">MMCV &lt; 2.0</th>
    <th align="center">MMCV &gt;= 2.0 </th>
  </tr>
</thead>
<tbody>
  <tr>
  <td valign="top">

```bash
# 包含算子，因为 mmcv-full 的最高版本小于 2.0.0，所以无需加版本限制
pip install openmim
mim install mmcv-full

# 不包含算子
pip install openmim
mim install "mmcv < 2.0.0"
```

</td>
  <td valign="top">

```bash
# 包含算子
pip install openmim
mim install mmcv

# 不包含算子，因为 mmcv-lite 的起始版本为 2.0.0，所以无需加版本限制
pip install openmim
mim install mmcv-lite
```

</td>
</tr>
</tbody>
</table>

### v1.3.18

部分自定义算子对于不同的设备有不同实现，为此添加的大量宏命令与类型检查使得代码变得难以维护。例如：

```c++
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(rois);
    CHECK_CUDA_INPUT(output);
    CHECK_CUDA_INPUT(argmax_y);
    CHECK_CUDA_INPUT(argmax_x);

    roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x,
                           aligned_height, aligned_width, spatial_scale,
                           sampling_ratio, pool_mode, aligned);
#else
    AT_ERROR("RoIAlign is not compiled with GPU support");
#endif
  } else {
    CHECK_CPU_INPUT(input);
    CHECK_CPU_INPUT(rois);
    CHECK_CPU_INPUT(output);
    CHECK_CPU_INPUT(argmax_y);
    CHECK_CPU_INPUT(argmax_x);
    roi_align_forward_cpu(input, rois, output, argmax_y, argmax_x,
                          aligned_height, aligned_width, spatial_scale,
                          sampling_ratio, pool_mode, aligned);
  }
```

为此我们设计了注册与分发的机制以更好的管理这些算子实现。

```c++

void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                       Tensor argmax_y, Tensor argmax_x,
                                       int aligned_height, int aligned_width,
                                       float spatial_scale, int sampling_ratio,
                                       int pool_mode, bool aligned);

void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned) {
  ROIAlignForwardCUDAKernelLauncher(
      input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,
      spatial_scale, sampling_ratio, pool_mode, aligned);
}

// 注册算子的cuda实现
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned);
REGISTER_DEVICE_IMPL(roi_align_forward_impl, CUDA, roi_align_forward_cuda);

// roi_align.cpp
// 使用dispatcher根据参数中的Tensor device类型对实现进行分发
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned) {
  DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y,
                       argmax_x, aligned_height, aligned_width, spatial_scale,
                       sampling_ratio, pool_mode, aligned);
}

```

### v1.3.11

为了灵活地支持更多的后端和硬件，例如 `NVIDIA GPUs` 、`AMD GPUs`，我们重构了 `mmcv/ops/csrc` 目录。注意，这次重构不会影响 API 的使用。更多相关信息，请参考 [PR1206](https://github.com/open-mmlab/mmcv/pull/1206)。

原始的目录结构如下所示

```
.
├── common_cuda_helper.hpp
├── ops_cuda_kernel.cuh
├── pytorch_cpp_helper.hpp
├── pytorch_cuda_helper.hpp
├── parrots_cpp_helper.hpp
├── parrots_cuda_helper.hpp
├── parrots_cudawarpfunction.cuh
├── onnxruntime
│   ├── onnxruntime_register.h
│   ├── onnxruntime_session_options_config_keys.h
│   ├── ort_mmcv_utils.h
│   ├── ...
│   ├── onnx_ops.h
│   └── cpu
│       ├── onnxruntime_register.cpp
│       ├── ...
│       └── onnx_ops_impl.cpp
├── parrots
│   ├── ...
│   ├── ops.cpp
│   ├── ops_cuda.cu
│   ├── ops_parrots.cpp
│   └── ops_pytorch.h
├── pytorch
│   ├── ...
│   ├── ops.cpp
│   ├── ops_cuda.cu
│   ├── pybind.cpp
└── tensorrt
    ├── trt_cuda_helper.cuh
    ├── trt_plugin_helper.hpp
    ├── trt_plugin.hpp
    ├── trt_serialize.hpp
    ├── ...
    ├── trt_ops.hpp
    └── plugins
        ├── trt_cuda_helper.cu
        ├── trt_plugin.cpp
        ├── ...
        ├── trt_ops.cpp
        └── trt_ops_kernel.cu
```

重构之后，它的结构如下所示

```
.
├── common
│   ├── box_iou_rotated_utils.hpp
│   ├── parrots_cpp_helper.hpp
│   ├── parrots_cuda_helper.hpp
│   ├── pytorch_cpp_helper.hpp
│   ├── pytorch_cuda_helper.hpp
│   └── cuda
│       ├── common_cuda_helper.hpp
│       ├── parrots_cudawarpfunction.cuh
│       ├── ...
│       └── ops_cuda_kernel.cuh
├── onnxruntime
│   ├── onnxruntime_register.h
│   ├── onnxruntime_session_options_config_keys.h
│   ├── ort_mmcv_utils.h
│   ├── ...
│   ├── onnx_ops.h
│   └── cpu
│       ├── onnxruntime_register.cpp
│       ├── ...
│       └── onnx_ops_impl.cpp
├── parrots
│   ├── ...
│   ├── ops.cpp
│   ├── ops_parrots.cpp
│   └── ops_pytorch.h
├── pytorch
│   ├── info.cpp
│   ├── pybind.cpp
│   ├── ...
│   ├── ops.cpp
│   └── cuda
│       ├── ...
│       └── ops_cuda.cu
└── tensorrt
    ├── trt_cuda_helper.cuh
    ├── trt_plugin_helper.hpp
    ├── trt_plugin.hpp
    ├── trt_serialize.hpp
    ├── ...
    ├── trt_ops.hpp
    └── plugins
        ├── trt_cuda_helper.cu
        ├── trt_plugin.cpp
        ├── ...
        ├── trt_ops.cpp
        └── trt_ops_kernel.cu
```


================================================
FILE: docs/zh_cn/conf.py
================================================
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys

import pytorch_sphinx_theme
from sphinx.builders.html import StandaloneHTMLBuilder

# Put the repository root (two levels above this directory) at the front of
# sys.path so the in-tree ``mmcv`` package is the one that gets imported.
sys.path.insert(0, os.path.abspath('../..'))

# Obtain the package version by executing mmcv/version.py in this namespace
# rather than importing ``mmcv`` itself. The file is decoded explicitly as
# UTF-8 so the result does not depend on the build machine's locale encoding.
version_file = '../../mmcv/version.py'
with open(version_file, encoding='utf-8') as f:
    exec(compile(f.read(), version_file, 'exec'))
# At module level ``locals()`` is the namespace the exec above wrote into.
__version__ = locals()['__version__']

# -- Project information -----------------------------------------------------

project = 'mmcv'
author = 'MMCV Authors'
copyright = '2018-2022, OpenMMLab'

# Sphinx has separate settings for the short version and the full release
# string (including alpha/beta/rc tags); both are set to the package version.
release = __version__
version = release

# -- General configuration ---------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Sphinx extensions to load, listed by importable module name.
extensions = [
    # bundled with Sphinx ('sphinx.ext.*')
    'sphinx.ext.autodoc',
    'sphinx.ext.autosummary',
    'sphinx.ext.intersphinx',
    'sphinx.ext.napoleon',
    'sphinx.ext.viewcode',
    'sphinx.ext.autosectionlabel',
    # custom / separately installed extensions
    'sphinx_markdown_tables',
    'myst_parser',
    'sphinx_copybutton',
]  # yapf: disable

# Options for autosectionlabel and autodoc
autosectionlabel_prefix_document = True
autodoc_mock_imports = ['mmcv._ext', 'mmcv.utils.ext_loader', 'torchvision']

# Options for the MyST Markdown parser
myst_enable_extensions = ['colon_fence']
myst_heading_anchors = 4

# Configuration for intersphinx: external projects that can be cross-linked
intersphinx_mapping = dict(
    python=('https://docs.python.org/3', None),
    numpy=('https://numpy.org/doc/stable', None),
    torch=('https://pytorch.org/docs/stable/', None),
    mmengine=('https://mmengine.readthedocs.io/en/latest', None),
)

# Template directories, given relative to this directory.
templates_path = ['_templates']

# The master toctree document.
master_doc = 'index'

# Source filename suffixes and the parser used for each of them.
source_suffix = {'.rst': 'restructuredtext', '.md': 'markdown'}

# Language of the content Sphinx generates automatically. It is also used
# for content translation via gettext catalogs, in which case "language" is
# usually set from the command line instead.
language = 'zh_CN'

# Patterns (relative to the source directory) for files and directories to
# skip when collecting sources. They also affect html_static_path and
# html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

# Name of the Pygments style used for syntax highlighting.
pygments_style = 'sphinx'

# -- Options for HTML output -------------------------------------------------

# Theme for HTML and HTML Help pages; its location is reported by the
# pytorch_sphinx_theme package itself.
html_theme = 'pytorch_sphinx_theme'
html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]

# Theme-specific options that customize the look and feel.
html_theme_options = {
    # Entries of the top menu
    'menu': [
        {'name': 'GitHub', 'url': 'https://github.com/open-mmlab/mmcv'},
    ],
    # Specify the language of shared menu
    'menu_lang': 'cn',
}

# Directories with custom static files (such as style sheets), relative to
# this directory. They are copied after the builtin static files, so a file
# named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_css_files = ['css/readthedocs.css']

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself.  Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}

# -- Options for HTMLHelp output ---------------------------------------------

# Base name of the output file produced by the HTML help builder.
htmlhelp_basename = 'mmcvdoc'

# -- Options for LaTeX output ------------------------------------------------

# No LaTeX settings are overridden. Keys that can be set here include
# 'papersize' ('letterpaper' or 'a4paper'), 'pointsize' ('10pt', '11pt' or
# '12pt'), 'preamble' (additional stuff for the LaTeX preamble) and
# 'figure_align' (figure float alignment, e.g. 'htbp').
latex_elements = {}

# How the document tree is grouped into LaTeX files. Each tuple is
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (
        master_doc,
        'mmcv.tex',
        'mmcv Documentation',
        'MMCV Contributors',
        'manual',
    ),
]

# -- Options for manual page output ------------------------------------------

# One tuple per manual page:
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'mmcv', 'mmcv Documentation', [author], 1),
]

# -- Options for Texinfo output ----------------------------------------------

# How the document tree is grouped into Texinfo files. Each tuple is
# (source start file, target name, title, author,
#  dir menu entry, description, category).
texinfo_documents = [
    (
        master_doc,
        'mmcv',
        'mmcv Documentation',
        author,
        'mmcv',
        'One line description of project.',
        'Miscellaneous',
    ),
]

# -- Options for Epub output -------------------------------------------------

# Bibliographic Dublin Core info: the epub title is the project name.
epub_title = project

# Optional identifiers that are left unset here:
#   epub_identifier - unique identifier of the text (an ISBN number or the
#                     project homepage)
#   epub_uid        - a unique identification for the text

# Files that should not be packed into the epub file.
epub_exclude_files = ['search.html']

# Image type priority used when building html.
StandaloneHTMLBuilder.supported_image_types = [
    'image/svg+xml',
    'image/gif',
    'image/png',
    'image/jpeg',
]
# -- Extension configuration -------------------------------------------------
# sphinx_copybutton: treat the prompt text as a regular expression so that
# ">>> " and "... " prompts are ignored when copying code.
copybutton_prompt_is_regexp = True
copybutton_prompt_text = r'>>> |\.\.\. '


================================================
FILE: docs/zh_cn/docutils.conf
================================================
[html writers]
table_style: colwidths-auto


================================================
FILE: docs/zh_cn/faq.md
================================================
## 常见问题

在这里我们列出了用户经常遇到的问题以及对应的解决方法。如果您遇到了其他常见的问题，并且知道可以帮到大家的解决办法，
欢迎随时丰富这个列表。

### 安装问题

- KeyError: "xxx: 'yyy is not in the zzz registry'"

  只有模块所在的文件被导入时，注册机制才会被触发，所以您需要在某处导入该文件，更多详情请查看 [KeyError: "MaskRCNN: 'RefineRoIHead is not in the models registry'"](https://github.com/open-mmlab/mmdetection/issues/5974)。

- "No module named 'mmcv.ops'"; "No module named 'mmcv.\_ext'"

  1. 使用 `pip uninstall mmcv` 卸载您环境中的 mmcv
  2. 参考 [installation instruction](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) 或者 [Build MMCV from source](https://mmcv.readthedocs.io/en/latest/get_started/build.html) 安装 mmcv-full

- "invalid device function" 或者 "no kernel image is available for execution"

  1. 检查 GPU 的 CUDA 计算能力
  2. 运行 `python mmdet/utils/collect_env.py` 来检查 PyTorch、torchvision 和 MMCV 是否是针对正确的 GPU 架构构建的，您可能需要去设置 `TORCH_CUDA_ARCH_LIST` 来重新安装 MMCV。兼容性问题可能会出现在使用旧版的 GPUs，如：colab 上的 Tesla K80 (3.7)
  3. 检查运行环境是否和 mmcv/mmdet 编译时的环境相同。例如，您可能使用 CUDA 10.0 编译 mmcv，但在 CUDA 9.0 的环境中运行它

- "undefined symbol" 或者 "cannot open xxx.so"

  1. 如果符号和 CUDA/C++ 相关（例如：libcudart.so 或者 GLIBCXX），请检查 CUDA/GCC 运行时的版本是否和编译 mmcv 的一致
  2. 如果符号和 PyTorch 相关（例如：符号包含 caffe、aten 和 TH），请检查 PyTorch 运行时的版本是否和编译 mmcv 的一致
  3. 运行 `python mmdet/utils/collect_env.py` 以检查 PyTorch、torchvision 和 MMCV 构建和运行的环境是否相同

- "RuntimeError: CUDA error: invalid configuration argument"

  这个错误可能是由于您的 GPU 性能不佳造成的。尝试降低 [THREADS_PER_BLOCK](https://github.com/open-mmlab/mmcv/blob/cac22f8cf5a904477e3b5461b1cc36856c2793da/mmcv/ops/csrc/common_cuda_helper.hpp#L10)
  的值并重新编译 mmcv。

- "RuntimeError: nms is not compiled with GPU support"

  这个错误是由于您的 CUDA 环境没有正确安装。
  您可以尝试重新安装您的 CUDA 环境，然后删除 mmcv/build 文件夹并重新编译 mmcv。

- "Segmentation fault"

  1. 检查 GCC 的版本，通常是因为 PyTorch 版本与 GCC 版本不匹配 （例如 GCC \< 4.9 )，我们推荐用户使用 GCC 5.4，我们也不推荐使用 GCC 5.5， 因为有反馈 GCC 5.5 会导致 "segmentation fault" 并且切换到 GCC 5.4 就可以解决问题
  2. 检查是否正确安装 CUDA 版本的 PyTorch。输入以下命令并检查是否返回 True
     ```shell
     python -c 'import torch; print(torch.cuda.is_available())'
     ```
  3. 如果 `torch` 安装成功，那么检查 MMCV 是否安装成功。输入以下命令，如果没有报错说明 mmcv-full 安装成功。
     ```shell
     python -c 'import mmcv; import mmcv.ops'
     ```
  4. 如果 MMCV 与 PyTorch 都安装成功了，则可以使用 `ipdb` 设置断点或者使用 `print` 函数，分析是哪一部分的代码导致了 `segmentation fault`

- "libtorch_cuda_cu.so: cannot open shared object file"

  `mmcv-full` 依赖 `libtorch_cuda_cu.so` 文件，但程序运行时没能找到该文件。我们可以检查该文件是否存在于 `~/miniconda3/envs/{environment-name}/lib/python3.7/site-packages/torch/lib`，也可以尝试重装 PyTorch。

- "fatal error C1189: #error:  -- unsupported Microsoft Visual Studio version!"

  如果您在 Windows 上编译 mmcv-full 并且 CUDA 的版本是 9.2，您很可能会遇到这个问题 `"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include\crt/host_config.h(133): fatal error C1189: #error:  -- unsupported Microsoft Visual Studio version! Only the versions 2012, 2013, 2015 and 2017 are supported!"`，您可以尝试使用低版本的 Microsoft Visual Studio，例如 vs2017。

- "error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized"

  如果您在 Windows 上编译 mmcv-full 并且 PyTorch 的版本是 1.5.0，您很可能会遇到这个问题 `- torch/csrc/jit/api/module.h(474): error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized`。解决这个问题的方法是将 `torch/csrc/jit/api/module.h` 文件中所有 `static constexpr bool all_slots = false;` 替换为 `static bool all_slots = false;`。更多细节可以查看 [member "torch::jit::detail::AttributePolicy::all_slots" may not be initialized](https://github.com/pytorch/pytorch/issues/39394)。

- "error: a member with an in-class initializer must be const"

  如果您在 Windows 上编译 mmcv-full 并且 PyTorch 的版本是 1.6.0，您很可能会遇到这个问题 `"- torch/include\torch/csrc/jit/api/module.h(483): error: a member with an in-class initializer must be const"`. 解决这个问题的方法是将 `torch/include\torch/csrc/jit/api/module.h` 文件中的所有 `CONSTEXPR_EXCEPT_WIN_CUDA ` 替换为 `const`。更多细节可以查看 [Ninja: build stopped: subcommand failed](https://github.com/open-mmlab/mmcv/issues/575)。

- "error: member "torch::jit::ProfileOptionalOp::Kind" may not be initialized"

  如果您在 Windows 上编译 mmcv-full 并且 PyTorch 的版本是 1.7.0，您很可能会遇到这个问题 `torch/include\torch/csrc/jit/ir/ir.h(1347): error: member "torch::jit::ProfileOptionalOp::Kind" may not be initialized`. 解决这个问题的方法是修改 PyTorch 中的几个文件：

  - 删除 `torch/include\torch/csrc/jit/ir/ir.h` 文件中的 `static constexpr Symbol Kind = ::c10::prim::profile;` 和 `static constexpr Symbol Kind = ::c10::prim::profile_optional;`
  - 将 `torch\include\pybind11\cast.h` 文件中的 `explicit operator type&() { return *(this->value); }` 替换为 `explicit operator type&() { return *((type*)this->value); }`
  - 将 `torch/include\torch/csrc/jit/api/module.h` 文件中的 所有 `CONSTEXPR_EXCEPT_WIN_CUDA` 替换为 `const`

  更多细节可以查看 [Ensure default extra_compile_args](https://github.com/pytorch/pytorch/pull/45956)。

- MMCV 和 MMDetection 的兼容性问题；"ConvWS is already registered in conv layer"

  请参考 [installation instruction](https://mmdetection.readthedocs.io/en/latest/get_started.html#installation) 为您的 MMDetection 版本安装正确版本的 MMCV。

### 使用问题

- "RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one"

  1. 这个错误是因为有些参数没有参与 loss 的计算，可能是代码中存在多个分支，导致有些分支没有参与 loss 的计算。更多细节见 [Expected to have finished reduction in the prior iteration before starting a new one](https://github.com/pytorch/pytorch/issues/55582)。
  2. 你可以设置 DDP 中的 `find_unused_parameters` 为 `True`，或者手动查找哪些参数没有用到。

- "RuntimeError: Trying to backward through the graph a second time"

  不能同时设置 `GradientCumulativeOptimizerHook` 和 `OptimizerHook`，这会导致 `loss.backward()` 被调用两次，于是程序抛出 `RuntimeError`。我们只需设置其中的一个。更多细节见 [Trying to backward through the graph a second time](https://github.com/open-mmlab/mmcv/issues/1379)。


================================================
FILE: docs/zh_cn/get_started/api_reference.md
================================================
# 接口对照表

由于 MMCV v1.x 升级到 MMCV v2.x 时移除了 `mmcv.fileio`，`mmcv.runner`，`mmcv.parallel`，`mmcv.engine`，`mmcv.device` 模块，以及 `mmcv.utils` 中的所有类和大部分函数，分别删除于 PR [#2179](https://github.com/open-mmlab/mmcv/pull/2179)，PR [#2216](https://github.com/open-mmlab/mmcv/pull/2216)，PR [#2217](https://github.com/open-mmlab/mmcv/pull/2217)。因此我们提供了如下的接口对照表，以便于大家快速查找迁移后的接口。

## 相关讨论

- [Remove runner, parallel, engine and device](https://github.com/open-mmlab/mmcv/pull/2216)
- [ImportError: cannot import name 'is_list_of' from 'mmcv.utils'](https://github.com/open-mmlab/mmcv/issues/2282)
- [Could not find the files in MMengine which are removed in MMCV_v2x parallel. example, for DataContainer](https://github.com/open-mmlab/mmcv/issues/2934)
- [mmcv.cnn.bricks.registry](https://github.com/open-mmlab/mmengine/discussions/1356)
- [Replace mmcv's function and modules imported with mmengine's](https://github.com/open-mmlab/mmdetection/pull/8594)

## `mmcv.fileio`

| MMCV                                              | MMCV URL                                                                              | MMEngine                                                    | MMEngine URL                                                                                   |
| ------------------------------------------------- | ------------------------------------------------------------------------------------- | ----------------------------------------------------------- | ---------------------------------------------------------------------------------------------- |
| mmcv.fileio.file_client.BaseStorageBackend        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.backends.base.BaseStorageBackend            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/base.py              |
| mmcv.fileio.file_client.CephBackend               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             |                                                             |                                                                                                |
| mmcv.fileio.file_client.PetrelBackend             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.backends.petrel_backend.PetrelBackend       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/petrel_backend.py    |
| mmcv.fileio.file_client.MemcachedBackend          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.backends.memcached_backend.MemcachedBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/memcached_backend.py |
| mmcv.fileio.file_client.LmdbBackend               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.backends.lmdb_backend.LmdbBackend           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/lmdb_backend.py      |
| mmcv.fileio.file_client.HardDiskBackend           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.file_client.HardDiskBackend                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/file_client.py                |
| mmcv.fileio.file_client.HTTPBackend               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.backends.http_backend.HTTPBackend           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/http_backend.py      |
| mmcv.fileio.file_client.FileClient                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.file_client.FileClient                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/file_client.py                |
| mmcv.fileio.io.load                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py                      | mmengine.fileio.io.load                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/io.py                         |
| mmcv.fileio.io.dump                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py                      | mmengine.fileio.io.dump                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/io.py                         |
| mmcv.fileio.io.\_register_handler                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py                      | mmengine.fileio.handlers.\_register_handler                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/registry_utils.py    |
| mmcv.fileio.io.register_handler                   | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py                      | mmengine.fileio.handlers.register_handler                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/registry_utils.py    |
| mmcv.fileio.parse.list_from_file                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/parse.py                   | mmengine.fileio.parse.list_from_file                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/parse.py                      |
| mmcv.fileio.parse.dict_from_file                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/parse.py                   | mmengine.fileio.parse.dict_from_file                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/parse.py                      |
| mmcv.fileio.handlers.base.BaseFileHandler         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/base.py           | mmengine.fileio.handlers.base.BaseFileHandler               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/base.py              |
| mmcv.fileio.handlers.json_handler.set_default     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/json_handler.py   | mmengine.fileio.handlers.json_handler.set_default           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/json_handler.py      |
| mmcv.fileio.handlers.json_handler.JsonHandler     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/json_handler.py   | mmengine.fileio.handlers.json_handler.JsonHandler           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/json_handler.py      |
| mmcv.fileio.handlers.pickle_handler.PickleHandler | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/pickle_handler.py | mmengine.fileio.handlers.pickle_handler.PickleHandler       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/pickle_handler.py    |
| mmcv.fileio.handlers.yaml_handler.YamlHandler     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/yaml_handler.py   | mmengine.fileio.handlers.yaml_handler.YamlHandler           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/yaml_handler.py      |

## `mmcv.runner`

| MMCV                                                                  | MMCV URL                                                                                    | MMEngine                                                                                                                                | MMEngine URL                                                                                                                                                                                           |
| --------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| mmcv.runner.hooks.logger.base.LoggerHook                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/base.py             | mmengine.hooks.logger_hook.LoggerHook                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/logger_hook.py                                                                                                                         |
| mmcv.runner.hooks.logger.clearml.ClearMLLoggerHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/clearml.py          | 相似功能：mmengine.visualization.vis_backend.ClearMLVisBackend                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |
| mmcv.runner.hooks.logger.dvclive.DvcliveLoggerHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/dvclive.py          | 相似功能：mmengine.visualization.vis_backend.DVCLiveVisBackend                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |
| mmcv.runner.hooks.logger.mlflow.MlflowLoggerHook                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/mlflow.py           | 相似功能：mmengine.visualization.vis_backend.MLflowVisBackend                                                                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |
| mmcv.runner.hooks.logger.neptune.NeptuneLoggerHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/neptune.py          | 相似功能：mmengine.visualization.vis_backend.NeptuneVisBackend                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |
| mmcv.runner.hooks.logger.pavi.PaviLoggerHook                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/pavi.py             |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.hooks.logger.segmind.SegmindLoggerHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/segmind.py          |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.hooks.logger.tensorboard.TensorboardLoggerHook            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/tensorboard.py      | 相似功能：mmengine.visualization.vis_backend.TensorboardVisBackend                                                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |
| mmcv.runner.hooks.logger.text.TextLoggerHook                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/text.py             |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.hooks.logger.wandb.WandbLoggerHook                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/wandb.py            | 相似功能：mmengine.visualization.vis_backend.WandbVisBackend                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |
| mmcv.runner.hooks.checkpoint.CheckpointHook                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/checkpoint.py              | mmengine.hooks.checkpoint_hook.CheckpointHook                                                                                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py                                                                                                                     |
| mmcv.runner.hooks.closure.ClosureHook                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/closure.py                 |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.hooks.ema.EMAHook                                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/ema.py                     | mmengine.hooks.ema_hook.EMAHook                                                                                                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/ema_hook.py                                                                                                                            |
| mmcv.runner.hooks.evaluation.EvalHook                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/evaluation.py              | 部分功能被移至 mmengine.hooks.checkpoint_hook.CheckpointHook                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py                                                                                                                     |
| mmcv.runner.hooks.evaluation.DistEvalHook                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/evaluation.py              | 部分功能被移至 mmengine.hooks.checkpoint_hook.CheckpointHook                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py                                                                                                                     |
| mmcv.runner.hooks.hook.HOOKS                                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/hook.py                    | mmengine.registry.root.HOOKS                                                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py                                                                                                                             |
| mmcv.runner.hooks.hook.Hook                                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/hook.py                    | mmengine.hooks.hook.Hook                                                                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/hook.py                                                                                                                                |
| mmcv.runner.hooks.iter_timer.IterTimerHook                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/iter_timer.py              | mmengine.hooks.iter_timer_hook.IterTimerHook                                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/iter_timer_hook.py                                                                                                                     |
| mmcv.runner.hooks.lr_updater.LrUpdaterHook                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.LRSchedulerMixin                                                                                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.FixedLrUpdaterHook                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.ConstantLR                                                                                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.StepLrUpdaterHook                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.StepLR                                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.ExpLrUpdaterHook                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.ExponentialLR                                                                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.PolyLrUpdaterHook                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.PolyLR                                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.InvLrUpdaterHook                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.hooks.lr_updater.CosineAnnealingLrUpdaterHook             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.CosineAnnealingLR                                                                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.FlatCosineAnnealingLrUpdaterHook         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.hooks.lr_updater.CosineRestartLrUpdaterHook               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.CosineRestartLR                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.get_position_from_periods                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.CosineRestartLR.get_position_from_periods                                                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.CyclicLrUpdaterHook                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.hooks.lr_updater.OneCycleLrUpdaterHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.OneCycleLR                                                                                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.LinearAnnealingLrUpdaterHook             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.hooks.lr_updater.annealing_cos                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.OneCycleLR.\_annealing_cos                                                                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.annealing_linear                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.OneCycleLR.\_annealing_linear                                                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.lr_updater.format_param                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.OneCycleLR.\_format_param                                                                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |
| mmcv.runner.hooks.memory.EmptyCacheHook                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/memory.py                  | mmengine.hooks.empty_cache_hook.EmptyCacheHook                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/empty_cache_hook.py                                                                                                                    |
| mmcv.runner.hooks.momentum_updater.MomentumUpdaterHook                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        | mmengine.optim.scheduler.momentum_scheduler.MomentumSchedulerMixin                                                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py                                                                                                        |
| mmcv.runner.hooks.momentum_updater.StepMomentumUpdaterHook            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        | mmengine.optim.scheduler.momentum_scheduler.StepMomentum                                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py                                                                                                        |
| mmcv.runner.hooks.momentum_updater.CosineAnnealingMomentumUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        | mmengine.optim.scheduler.momentum_scheduler.CosineAnnealingMomentum                                                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py                                                                                                        |
| mmcv.runner.hooks.momentum_updater.LinearAnnealingMomentumUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.hooks.momentum_updater.CyclicMomentumUpdaterHook          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.hooks.momentum_updater.OneCycleMomentumUpdaterHook        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.hooks.optimizer.OptimizerHook                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               | mmengine.optim.optimizer.optimizer_wrapper.OptimWrapper                                                                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/optimizer_wrapper.py                                                                                                         |
| mmcv.runner.hooks.optimizer.GradientCumulativeOptimizerHook           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.hooks.optimizer.Fp16OptimizerHook                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               | 相关功能被移至 mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |
| mmcv.runner.hooks.optimizer.GradientCumulativeFp16OptimizerHook       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.hooks.optimizer.Fp16OptimizerHook                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               | 相关功能被移至 mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |
| mmcv.runner.hooks.optimizer.GradientCumulativeFp16OptimizerHook       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.hooks.profiler.ProfilerHook                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/profiler.py                | mmengine.hooks.profiler_hook.ProfilerHook                                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/profiler_hook.py                                                                                                                       |
| mmcv.runner.hooks.sampler_seed.DistSamplerSeedHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/sampler_seed.py            | mmengine.hooks.sampler_seed_hook.DistSamplerSeedHook                                                                                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/sampler_seed_hook.py                                                                                                                   |
| mmcv.runner.hooks.sync_buffer.SyncBuffersHook                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/sync_buffer.py             | mmengine.hooks.sync_buffer_hook.SyncBuffersHook                                                                                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/sync_buffer_hook.py                                                                                                                    |
| mmcv.runner.optimizer.builder.OPTIMIZERS                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             | mmengine.registry.root.OPTIMIZERS                                                                                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py                                                                                                                             |
| mmcv.runner.optimizer.builder.OPTIMIZER_BUILDERS                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.optimizer.builder.register_torch_optimizers               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             | mmengine.optim.optimizer.builder.register_torch_optimizers                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/builder.py                                                                                                                   |
| mmcv.runner.optimizer.builder.TORCH_OPTIMIZERS                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.optimizer.builder.build_optimizer_constructor             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.optimizer.builder.build_optimizer                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.optimizer.default_constructor.DefaultOptimizerConstructor | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/default_constructor.py |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.base_module.BaseModule                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py                   | mmengine.model.base_module.BaseModule                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py                                                                                                                         |
| mmcv.runner.base_module.Sequential                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py                   | mmengine.model.base_module.Sequential                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py                                                                                                                         |
| mmcv.runner.base_module.ModuleList                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py                   | mmengine.model.base_module.ModuleList                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py                                                                                                                         |
| mmcv.runner.base_module.ModuleDict                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py                   | mmengine.model.base_module.ModuleDict                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py                                                                                                                         |
| mmcv.runner.base_runner.BaseRunner                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_runner.py                   | mmengine.runner.runner.Runner                                                                                                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/runner.py                                                                                                                             |
| mmcv.runner.builder.RUNNERS                                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py                       | mmengine.registry.root.RUNNERS                                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py                                                                                                                             |
| mmcv.runner.builder.RUNNER_BUILDERS                                   | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py                       | mmengine.registry.root.RUNNER_CONSTRUCTORS                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py                                                                                                                             |
| mmcv.runner.builder.build_runner_constructor                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py                       |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.builder.build_runner                                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py                       |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.checkpoint.ENV_MMCV_HOME                                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.ENV_MMENGINE_HOME                                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.ENV_XDG_CACHE_HOME                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.ENV_XDG_CACHE_HOME                                                                                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.DEFAULT_CACHE_DIR                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.DEFAULT_CACHE_DIR                                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.\_get_mmcv_home                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.\_get_mmengine_home                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.load_state_dict                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_state_dict                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.get_torchvision_models                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.get_torchvision_models                                                                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.get_external_models                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.get_external_models                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.get_mmcls_models                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.get_mmcls_models                                                                                             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.get_deprecated_model_names                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.get_deprecated_model_names                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.\_process_mmcls_checkpoint                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.\_process_mmcls_checkpoint                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.CheckpointLoader                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.CheckpointLoader                                                                                             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.load_from_local                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_local                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.load_from_http                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_http                                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.load_from_pavi                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_pavi                                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.load_from_ceph                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_ceph                                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.load_from_torchvision                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_torchvision                                                                                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.load_from_openmmlab                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_openmmlab                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.load_from_mmcls                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_mmcls                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.\_load_checkpoint                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.\_load_checkpoint                                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.\_load_checkpoint_with_prefix                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.\_load_checkpoint_with_prefix                                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.load_checkpoint                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_checkpoint                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.weights_to_cpu                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.weights_to_cpu                                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.\_save_to_state_dict                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.\_save_to_state_dict                                                                                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.get_state_dict                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.get_state_dict                                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.checkpoint.save_checkpoint                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.save_checkpoint                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |
| mmcv.runner.default_constructor.DefaultRunnerConstructor              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/default_constructor.py           |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.dist_utils.\_find_free_port                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.dist_utils.\_is_free_port                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.dist_utils.init_dist                                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.init_dist                                                                                                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |
| mmcv.runner.dist_utils.\_init_dist_pytorch                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.\_init_dist_pytorch                                                                                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |
| mmcv.runner.dist_utils.\_init_dist_mpi                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.\_init_dist_mpi                                                                                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |
| mmcv.runner.dist_utils.\_init_dist_slurm                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.\_init_dist_slurm                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |
| mmcv.runner.dist_utils.get_dist_info                                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.get_dist_info                                                                                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |
| mmcv.runner.dist_utils.master_only                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.master_only                                                                                                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |
| mmcv.runner.dist_utils.allreduce_params                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.dist_utils.allreduce_grads                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.dist_utils.\_allreduce_coalesced                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.epoch_based_runner.EpochBasedRunner                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/epoch_based_runner.py            | mmengine.runner.loops.EpochBasedTrainLoop                                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py                                                                                                                              |
| mmcv.runner.epoch_based_runner.Runner                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/epoch_based_runner.py            |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.fp16_utils.cast_tensor_type                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | fp16 相关移至 mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |
| mmcv.runner.fp16_utils.auto_fp16                                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | fp16 相关移至 mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |
| mmcv.runner.fp16_utils.force_fp32                                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | fp16 相关移至 mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |
| mmcv.runner.fp16_utils.allreduce_grads                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | fp16 相关移至 mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |
| mmcv.runner.fp16_utils.wrap_fp16_model                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | fp16 相关移至 mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |
| mmcv.runner.fp16_utils.patch_norm_fp32                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | fp16 相关移至 mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |
| mmcv.runner.fp16_utils.patch_forward_method                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | fp16 相关移至 mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |
| mmcv.runner.fp16_utils.LossScaler                                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | fp16 相关移至 mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |
| mmcv.runner.iter_based_runner.IterLoader                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/iter_based_runner.py             |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.iter_based_runner.IterBasedRunner                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/iter_based_runner.py             | mmengine.runner.loops.IterBasedTrainLoop                                                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py                                                                                                                              |
| mmcv.runner.log_buffer.LogBuffer                                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/log_buffer.py                    |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.priority.Priority                                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/priority.py                      | mmengine.runner.priority.Priority                                                                                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/priority.py                                                                                                                           |
| mmcv.runner.priority.get_priority                                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/priority.py                      | mmengine.runner.priority.get_priority                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/priority.py                                                                                                                           |
| mmcv.runner.utils.get_host_info                                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py                         |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.utils.get_time_str                                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py                         |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.utils.obj_from_dict                                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py                         |                                                                                                                                         |                                                                                                                                                                                                        |
| mmcv.runner.utils.set_random_seed                                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py                         | mmengine.runner.utils.set_random_seed                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/utils.py                                                                                                                              |

## `mmcv.parallel`

| MMCV                                                           | MMCV URL                                                                       | MMEngine                                                        | MMEngine URL                                                                              |
| -------------------------------------------------------------- | ------------------------------------------------------------------------------ | --------------------------------------------------------------- | ----------------------------------------------------------------------------------------- |
| mmcv.parallel.\_functions.scatter                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py     |                                                                 |                                                                                           |
| mmcv.parallel.\_functions.synchronize_stream                   | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py     |                                                                 |                                                                                           |
| mmcv.parallel.\_functions.get_input_device                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py     |                                                                 |                                                                                           |
| mmcv.parallel.\_functions.Scatter                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py     |                                                                 |                                                                                           |
| mmcv.parallel.collate.collate                                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/collate.py        |                                                                 |                                                                                           |
| mmcv.parallel.data_container.assert_tensor_type                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/data_container.py |                                                                 |                                                                                           |
| mmcv.parallel.data_container.DataContainer                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/data_container.py | 相似功能：mmengine.structures.base_data_element.BaseDataElement | https://github.com/open-mmlab/mmengine/blob/main/mmengine/structures/base_data_element.py |
| mmcv.parallel.data_parallel.MMDataParallel                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/data_parallel.py  |                                                                 |                                                                                           |
| mmcv.parallel.distributed.MMDistributedDataParallel            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/distributed.py    | mmengine.model.wrappers.distributed.MMDistributedDataParallel   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/wrappers/distributed.py   |
| mmcv.parallel.distributed_deprecated.MMDistributedDataParallel | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/distributed_deprecated.py | mmengine.model.wrappers.distributed.MMDistributedDataParallel   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/wrappers/distributed.py   |
| mmcv.parallel.registry.MODULE_WRAPPERS                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/registry.py       | mmengine.registry.root.MODEL_WRAPPERS                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py                |
| mmcv.parallel.scatter_gather.scatter                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/scatter_gather.py |                                                                 |                                                                                           |
| mmcv.parallel.scatter_gather.scatter_kwargs                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/scatter_gather.py |                                                                 |                                                                                           |
| mmcv.parallel.utils.is_module_wrapper                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/utils.py          | mmengine.model.wrappers.utils.is_model_wrapper                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/wrappers/utils.py         |

## `mmcv.engine`

| MMCV                                 | MMCV URL                                                           | MMEngine | MMEngine URL |
| ------------------------------------ | ------------------------------------------------------------------ | -------- | ------------ |
| mmcv.engine.test.single_gpu_test     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py |          |              |
| mmcv.engine.test.multi_gpu_test      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py |          |              |
| mmcv.engine.test.collect_results_cpu | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py |          |              |
| mmcv.engine.test.collect_results_gpu | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py |          |              |

## `mmcv.device`

| MMCV                                      | MMCV URL                                                                     | MMEngine                         | MMEngine URL                                                              |
| ----------------------------------------- | ---------------------------------------------------------------------------- | -------------------------------- | ------------------------------------------------------------------------- |
| mmcv.device.ipu                           | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/ipu               |                                  |                                                                           |
| mmcv.device.mlu                           | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/mlu               |                                  |                                                                           |
| mmcv.device.mps                           | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/mps               |                                  |                                                                           |
| mmcv.device.npu                           | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/npu               |                                  |                                                                           |
| mmcv.device.\_functions.scatter           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/_functions.py     |                                  |                                                                           |
| mmcv.device.\_functions.Scatter           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/_functions.py     |                                  |                                                                           |
| mmcv.device.scatter_gather.scatter        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/scatter_gather.py |                                  |                                                                           |
| mmcv.device.scatter_gather.scatter_kwargs | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/scatter_gather.py |                                  |                                                                           |
| mmcv.device.utils.get_device              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/utils.py          | mmengine.device.utils.get_device | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py |

## `mmcv.utils`

| MMCV                                                   | MMCV URL                                                                     | MMEngine                                                            | MMEngine URL                                                                                |
| ------------------------------------------------------ | ---------------------------------------------------------------------------- | ------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- |
| mmcv.utils.config.BASE_KEY                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.BASE_KEY                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |
| mmcv.utils.config.DELETE_KEY                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.DELETE_KEY                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |
| mmcv.utils.config.DEPRECATION_KEY                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.DEPRECATION_KEY                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |
| mmcv.utils.config.ConfigDict                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.ConfigDict                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |
| mmcv.utils.config.add_args                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.add_args                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |
| mmcv.utils.config.Config                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.Config                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |
| mmcv.utils.config.DictAction                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.DictAction                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |
| mmcv.utils.device_type.is_ipu_available                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py     |                                                                     |                                                                                             |
| mmcv.utils.device_type.IS_IPU_AVAILABLE                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py     |                                                                     |                                                                                             |
| mmcv.utils.device_type.is_mlu_available                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py     | mmengine.device.utils.is_mlu_available                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py                   |
| mmcv.utils.device_type.is_mps_available                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py     | mmengine.device.utils.is_mps_available                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py                   |
| mmcv.utils.device_type.is_npu_available                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py     | mmengine.device.utils.is_npu_available                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py                   |
| mmcv.utils.hub.\_is_legacy_zip_format                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/hub.py             | mmengine.utils.dl_utils.hub.\_is_legacy_zip_format                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/hub.py             |
| mmcv.utils.hub.\_legacy_zip_load                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/hub.py             | mmengine.utils.dl_utils.hub.\_legacy_zip_load                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/hub.py             |
| mmcv.utils.hub.load_url                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/hub.py             | mmengine.utils.dl_utils.hub.load_url                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/hub.py             |
| mmcv.utils.logging.logger_initialized                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/logging.py         |                                                                     |                                                                                             |
| mmcv.utils.logging.get_logger                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/logging.py         |                                                                     |                                                                                             |
| mmcv.utils.logging.print_log                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/logging.py         |                                                                     |                                                                                             |
| mmcv.utils.misc.\_ntuple                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.\_ntuple                                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.to_1tuple                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.to_1tuple                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.to_2tuple                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.to_2tuple                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.to_3tuple                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.to_3tuple                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.to_4tuple                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.to_4tuple                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.to_ntuple                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.to_ntuple                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.is_str                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.is_str                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.import_modules_from_strings            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.import_modules_from_strings                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.iter_cast                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.iter_cast                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.list_cast                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.list_cast                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.tuple_cast                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.tuple_cast                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.is_seq_of                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.is_seq_of                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.is_list_of                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.is_list_of                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.is_tuple_of                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.is_tuple_of                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.slice_list                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.slice_list                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.concat_list                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.concat_list                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.check_prerequisites                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.check_prerequisites                             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.\_check_py_package                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.\_check_py_package                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.\_check_executable                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.\_check_executable                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.requires_package                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.requires_package                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.requires_executable                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.requires_executable                             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.deprecated_api_warning                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.deprecated_api_warning                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.is_method_overridden                   | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.is_method_overridden                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.misc.has_method                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.has_method                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |
| mmcv.utils.parrots_wrapper.TORCH_VERSION               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.TORCH_VERSION               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |
| mmcv.utils.parrots_wrapper.is_cuda_available           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.device.utils.is_cuda_available                             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py                   |
| mmcv.utils.parrots_wrapper.IS_CUDA_AVAILABLE           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py |                                                                     |                                                                                             |
| mmcv.utils.parrots_wrapper.is_rocm_pytorch             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.is_rocm_pytorch             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |
| mmcv.utils.parrots_wrapper.\_get_cuda_home             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_cuda_home             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |
| mmcv.utils.parrots_wrapper.get_build_config            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.get_build_config            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |
| mmcv.utils.parrots_wrapper.\_get_conv                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_conv                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |
| mmcv.utils.parrots_wrapper.\_get_dataloader            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_dataloader            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |
| mmcv.utils.parrots_wrapper.\_get_extension             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_extension             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |
| mmcv.utils.parrots_wrapper.\_get_pool                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_pool                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |
| mmcv.utils.parrots_wrapper.\_get_norm                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_norm                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |
| mmcv.utils.parrots_wrapper.SyncBatchNorm               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.SyncBatchNorm               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |
| mmcv.utils.path.is_filepath                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.is_filepath                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |
| mmcv.utils.path.fopen                                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.fopen                                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |
| mmcv.utils.path.check_file_exist                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.check_file_exist                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |
| mmcv.utils.path.mkdir_or_exist                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.mkdir_or_exist                                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |
| mmcv.utils.path.symlink                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.symlink                                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |
| mmcv.utils.path.scandir                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.scandir                                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |
| mmcv.utils.path.find_vcs_root                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.find_vcs_root                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |
| mmcv.utils.progressbar.ProgressBar                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py     | mmengine.utils.progressbar.ProgressBar                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py              |
| mmcv.utils.progressbar.track_progress                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py     | mmengine.utils.progressbar.track_progress                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py              |
| mmcv.utils.progressbar.init_pool                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py     | mmengine.utils.progressbar.init_pool                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py              |
| mmcv.utils.progressbar.track_parallel_progress         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py     | mmengine.utils.progressbar.track_parallel_progress                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py              |
| mmcv.utils.progressbar.track_iter_progress             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py     | mmengine.utils.progressbar.track_iter_progress                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py              |
| mmcv.utils.registry.build_from_cfg                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/registry.py        | mmengine.registry.build_functions.build_from_cfg                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/build_functions.py       |
| mmcv.utils.registry.Registry                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/registry.py        | mmengine.registry.registry.Registry                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/registry.py              |
| mmcv.utils.seed.worker_init_fn                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/seed.py            | mmengine.dataset.utils.worker_init_fn                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/utils.py                  |
| mmcv.utils.testing.check_python_script                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.check_python_script                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |
| mmcv.utils.testing.\_any                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.\_any                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |
| mmcv.utils.testing.assert_dict_contains_subset         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_dict_contains_subset                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |
| mmcv.utils.testing.assert_attrs_equal                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_attrs_equal                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |
| mmcv.utils.testing.assert_dict_has_keys                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_dict_has_keys                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |
| mmcv.utils.testing.assert_keys_equal                   | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_keys_equal                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |
| mmcv.utils.testing.assert_is_norm_layer                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_is_norm_layer                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |
| mmcv.utils.testing.assert_params_all_zeros             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_params_all_zeros                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |
| mmcv.utils.timer.TimerError                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py           | mmengine.utils.timer.TimerError                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py                    |
| mmcv.utils.timer.Timer                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py           | mmengine.utils.timer.Timer                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py                    |
| mmcv.utils.timer.\_g_timers                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py           | mmengine.utils.timer.\_g_timers                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py                    |
| mmcv.utils.timer.check_time                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py           | mmengine.utils.timer.check_time                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py                    |
| mmcv.utils.torch_ops.\_torch_version_meshgrid_indexing | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/torch_ops.py       | mmengine.utils.dl_utils.torch_ops.\_torch_version_meshgrid_indexing | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/torch_ops.py       |
| mmcv.utils.torch_ops.torch_meshgrid                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/torch_ops.py       | mmengine.utils.dl_utils.torch_ops.torch_meshgrid                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/torch_ops.py       |
| mmcv.utils.trace.is_jit_tracing                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/trace.py           | mmengine.utils.dl_utils.trace.is_jit_tracing                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/trace.py           |
| mmcv.utils.version_utils.digit_version                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/version_utils.py   | mmengine.utils.version_utils.digit_version                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/version_utils.py            |
| mmcv.utils.version_utils.\_minimal_ext_cmd             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/version_utils.py   | mmengine.utils.version_utils.\_minimal_ext_cmd                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/version_utils.py            |
| mmcv.utils.version_utils.get_git_hash                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/version_utils.py   | mmengine.utils.version_utils.get_git_hash                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/version_utils.py            |

## `mmcv.cnn`

| MMCV                                         | MMCV URL                                                                 | MMEngine                                   | MMEngine URL                                                             |
| -------------------------------------------- | ------------------------------------------------------------------------ | ------------------------------------------ | ------------------------------------------------------------------------ |
| mmcv.cnn.utils.sync_bn.\_BatchNormXd         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/cnn/utils/sync_bn.py | mmengine.model.utils.\_BatchNormXd         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/utils.py |
| mmcv.cnn.utils.sync_bn.revert_sync_batchnorm | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/cnn/utils/sync_bn.py | mmengine.model.utils.revert_sync_batchnorm | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/utils.py |

## `mmcv.model_zoo`

| MMCV                                 | MMCV URL                                                                            | MMEngine                           | MMEngine URL                                                                        |
| ------------------------------------ | ----------------------------------------------------------------------------------- | ---------------------------------- | ----------------------------------------------------------------------------------- |
| mmcv.model_zoo.deprecated.json       | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/deprecated.json       | mmengine.hub.deprecated.json       | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/deprecated.json       |
| mmcv.model_zoo.mmcls.json            | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/mmcls.json            | mmengine.hub.mmcls.json            | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/mmcls.json            |
| mmcv.model_zoo.open_mmlab.json       | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/open_mmlab.json       | mmengine.hub.openmmlab.json        | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/openmmlab.json        |
| mmcv.model_zoo.torchvision_0.12.json | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/torchvision_0.12.json | mmengine.hub.torchvision_0.12.json | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/torchvision_0.12.json |


================================================
FILE: docs/zh_cn/get_started/article.md
================================================
## 解读文章汇总

这篇文章汇总了 [OpenMMLab](https://www.zhihu.com/people/openmmlab) 解读的部分文章（更多文章和视频见 [OpenMMLabCourse](https://github.com/open-mmlab/OpenMMLabCourse)），如果您有推荐的文章（不一定是 OpenMMLab 发布的文章，可以是自己写的文章），非常欢迎提 [Pull Request](../community/pr.md) 添加到这里。

### MMCV 解读文章

#### 框架解读

- [MMCV 核心组件分析(一)：整体概述](https://zhuanlan.zhihu.com/p/336081587)
- [MMCV 核心组件分析(二)：FileHandler](https://zhuanlan.zhihu.com/p/336097883)
- [MMCV 核心组件分析(三): FileClient](https://zhuanlan.zhihu.com/p/339190576)
- [MMCV 核心组件分析(四): Config](https://zhuanlan.zhihu.com/p/346203167)
- [MMCV 核心组件分析(五): Registry](https://zhuanlan.zhihu.com/p/355271993)
- [MMCV 核心组件分析(六): Hook](https://zhuanlan.zhihu.com/p/355272220)
- [MMCV 核心组件分析(七): Runner](https://zhuanlan.zhihu.com/p/355272459)
- [MMCV Hook 食用指南](https://zhuanlan.zhihu.com/p/448600739)
- [PyTorch & MMCV Dispatcher 机制解析](https://zhuanlan.zhihu.com/p/451671838)

#### 工具解读

- [训练可视化工具哪款是你的菜？MMCV一行代码随你挑](https://zhuanlan.zhihu.com/p/387078211)

#### 安装指南

- [久等了！Windows 平台 MMCV 的预编译包终于来了！](https://zhuanlan.zhihu.com/p/441653536)
- [Windows 环境从零安装 mmcv-full](https://zhuanlan.zhihu.com/p/434491590)

#### 知乎问答

- [深度学习科研，如何高效进行代码和实验管理？](https://www.zhihu.com/question/269707221/answer/2480772257)
- [深度学习方面的科研工作中的实验代码有什么规范和写作技巧？如何妥善管理实验数据？](https://www.zhihu.com/question/268193800/answer/2586000037)

### 下游算法库解读文章

- [MMDetection](https://mmdetection.readthedocs.io/zh_CN/latest/article.html)

### PyTorch 解读文章

- [PyTorch1.11 亮点一览：TorchData、functorch、DDP 静态图](https://zhuanlan.zhihu.com/p/486222256)
- [PyTorch1.12 亮点一览：DataPipe + TorchArrow 新的数据加载与处理范式](https://zhuanlan.zhihu.com/p/537868554)
- [PyTorch 源码解读之 nn.Module：核心网络模块接口详解](https://zhuanlan.zhihu.com/p/340453841)
- [PyTorch 源码解读之 torch.autograd：梯度计算详解](https://zhuanlan.zhihu.com/p/321449610)
- [PyTorch 源码解读之 torch.utils.data：解析数据处理全流程](https://zhuanlan.zhihu.com/p/337850513)
- [PyTorch 源码解读之 torch.optim：优化算法接口详解](https://zhuanlan.zhihu.com/p/346205754)
- [PyTorch 源码解读之 DP & DDP：模型并行和分布式训练解析](https://zhuanlan.zhihu.com/p/343951042)
- [PyTorch 源码解读之 BN & SyncBN：BN 与 多卡同步 BN 详解](https://zhuanlan.zhihu.com/p/337732517)
- [PyTorch 源码解读之 torch.cuda.amp: 自动混合精度详解](https://zhuanlan.zhihu.com/p/348554267)
- [PyTorch 源码解读之 cpp_extension：揭秘 C++/CUDA 算子实现和调用全流程](https://zhuanlan.zhihu.com/p/348555597)
- [PyTorch 源码解读之即时编译篇](https://zhuanlan.zhihu.com/p/361101354)
- [PyTorch 源码解读之分布式训练了解一下？](https://zhuanlan.zhihu.com/p/361314953)
- [PyTorch 源码解读之 torch.serialization & torch.hub](https://zhuanlan.zhihu.com/p/364239544)

### 其他

- [困扰我 48 小时的深拷贝，今天终于...](https://zhuanlan.zhihu.com/p/470892209)
- [拿什么拯救我的 4G 显卡](https://zhuanlan.zhihu.com/p/430123077)
- [是谁偷偷动了我的 logger](https://zhuanlan.zhihu.com/p/481383590)
- [三句话，让 logger 言听计从](https://zhuanlan.zhihu.com/p/487524917)
- [Logging 不为人知的二三事](https://zhuanlan.zhihu.com/p/502610682)
- [Type Hints 入门教程，让代码更加规范整洁](https://zhuanlan.zhihu.com/p/519335398)
- [手把手教你如何高效地在 MMCV 中贡献算子](https://zhuanlan.zhihu.com/p/464492627)
- [OpenMMLab 支持 IPU 训练芯片](https://zhuanlan.zhihu.com/p/517527926)
- [基于 MMCV 走上开源大佬之路？](https://zhuanlan.zhihu.com/p/391144979)


================================================
FILE: docs/zh_cn/get_started/build.md
================================================
## 从源码编译 MMCV

### 编译 mmcv

在编译 mmcv 之前，请确保 PyTorch 已经成功安装在环境中，可以参考 [PyTorch 官方安装文档](https://pytorch.org/get-started/locally/#start-locally)。可使用以下命令验证

```bash
python -c 'import torch;print(torch.__version__)'
```

:::{note}

- 如果克隆代码仓库的速度过慢，可以使用以下命令克隆（注意：gitee 的 mmcv 不一定和 github 的保持一致，因为每天只同步一次）

```bash
git clone https://gitee.com/open-mmlab/mmcv.git
```

- 如果打算使用 `opencv-python-headless` 而不是 `opencv-python`，例如在一个很小的容器环境或者没有图形用户界面的服务器中，你可以先安装 `opencv-python-headless`，这样在安装 mmcv 依赖的过程中会跳过 `opencv-python`。

- 如果编译过程安装依赖库的时间过长，可以[设置 pypi 源](https://mirrors.tuna.tsinghua.edu.cn/help/pypi/)

```bash
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
```

:::

#### 在 Linux 上编译 mmcv

| TODO: 视频教程

1. 克隆代码仓库

   ```bash
   git clone https://github.com/open-mmlab/mmcv.git
   cd mmcv
   ```

2. 安装 `ninja` 和 `psutil` 以加快编译速度

   ```bash
   pip install -r requirements/optional.txt
   ```

3. 检查 nvcc 的版本（要求大于等于 9.2，如果没有 GPU，可以跳过）

   ```bash
   nvcc --version
   ```

   上述命令如果输出以下信息，表示 nvcc 的设置没有问题，否则需要设置 CUDA_HOME

   ```
   nvcc: NVIDIA (R) Cuda compiler driver
   Copyright (c) 2005-2020 NVIDIA Corporation
   Built on Mon_Nov_30_19:08:53_PST_2020
   Cuda compilation tools, release 11.2, V11.2.67
   Build cuda_11.2.r11.2/compiler.29373293_0
   ```

   :::{note}
   如果想要支持 ROCm，可以参考 [AMD ROCm](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html) 安装 ROCm。
   :::

4. 检查 gcc 的版本（要求大于等于**5.4**）

   ```bash
   gcc --version
   ```

5. 开始编译（预估耗时 10 分钟）

   ```bash
   pip install -e . -v
   ```

6. 验证安装

   ```bash
   python .dev_scripts/check_installation.py
   ```

   如果上述命令没有报错，说明安装成功。如有报错，请查看[问题解决页面](../faq.md)是否已经有解决方案。

   如果没有找到解决方案，欢迎提 [issue](https://github.com/open-mmlab/mmcv/issues)。

#### 在 macOS 上编译 mmcv

| TODO: 视频教程

```{note}
如果你使用的是搭载 apple silicon 的 mac 设备，请安装 PyTorch 1.13+ 的版本，否则会遇到 [issues#2218](https://github.com/open-mmlab/mmcv/issues/2218) 中的问题。
```

1. 克隆代码仓库

   ```bash
   git clone https://github.com/open-mmlab/mmcv.git
   cd mmcv
   ```

2. 安装 `ninja` 和 `psutil` 以加快编译速度

   ```bash
   pip install -r requirements/optional.txt
   ```

3. 开始编译

   ```bash
   pip install -e .
   ```

4. 验证安装

   ```bash
   python .dev_scripts/check_installation.py
   ```

   如果上述命令没有报错，说明安装成功。如有报错，请查看[问题解决页面](../faq.md)是否已经有解决方案。

   如果没有找到解决方案，欢迎提 [issue](https://github.com/open-mmlab/mmcv/issues)。

#### 在 Windows 上编译 mmcv

| TODO: 视频教程

在 Windows 上编译 mmcv 比 Linux 复杂，本节将一步步介绍如何在 Windows 上编译 mmcv。

##### 依赖项

请先安装以下的依赖项：

- [Git](https://git-scm.com/download/win)：安装期间，请选择 **add git to Path**
- [Visual Studio Community 2019](https://visualstudio.microsoft.com)：用于编译 C++ 和 CUDA 代码
- [Miniconda](https://docs.conda.io/en/latest/miniconda.html)：包管理工具
- [CUDA 10.2](https://developer.nvidia.com/cuda-10.2-download-archive)：如果只需要 CPU 版本可以不安装 CUDA，安装 CUDA 时，可根据需要进行自定义安装。如果已经安装新版本的显卡驱动，建议取消驱动程序的安装

```{note}
如果不清楚如何安装以上依赖，请参考[Windows 环境从零安装 mmcv](https://zhuanlan.zhihu.com/p/434491590)。
另外，你需要知道如何在 Windows 上设置环境变量，尤其是 "PATH" 的设置，以下安装过程都会用到。
```

##### 通用步骤

1. 从 Windows 菜单启动 Anaconda 命令行

   如 Miniconda 安装程序建议，不要使用原始的 `cmd.exe` 或是 `powershell.exe`。命令行有两个版本，一个基于 PowerShell，一个基于传统的 `cmd.exe`。请注意以下说明都基于 PowerShell 版本。

2. 创建一个新的 Conda 环境

   ```powershell
   (base) PS C:\Users\xxx> conda create --name mmcv python=3.7
   (base) PS C:\Users\xxx> conda activate mmcv  # 确保做任何操作前先激活环境
   ```

3. 安装 PyTorch 时，可以根据需要安装支持 CUDA 或不支持 CUDA 的版本

   ```powershell
   # CUDA version
   (mmcv) PS C:\Users\xxx> conda install pytorch torchvision cudatoolkit=10.2 -c pytorch
   # CPU version
   (mmcv) PS C:\Users\xxx> conda install pytorch torchvision cpuonly -c pytorch
   ```

4. 克隆代码仓库

   ```powershell
   (mmcv) PS C:\Users\xxx> git clone https://github.com/open-mmlab/mmcv.git
   (mmcv) PS C:\Users\xxx> cd mmcv
   ```

5. 安装 `ninja` 和 `psutil` 以加快编译速度

   ```powershell
   (mmcv) PS C:\Users\xxx\mmcv> pip install -r requirements/optional.txt
   ```

6. 设置 MSVC 编译器

   设置环境变量。添加 `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\Hostx86\x64` 到 `PATH`，则 `cl.exe` 可以在命令行中运行，如下所示。

   ```powershell
   (mmcv) PS C:\Users\xxx\mmcv> cl
   Microsoft (R) C/C++ Optimizing  Compiler Version 19.27.29111 for x64
   Copyright (C) Microsoft Corporation.   All rights reserved.

   usage: cl [ option... ] filename... [ / link linkoption... ]
   ```

   为了兼容性，我们使用 x86-hosted 以及 x64-targeted 版本，即路径中的 `Hostx86\x64` 。

   因为 PyTorch 将解析 `cl.exe` 的输出以检查其版本，只有 utf-8 将会被识别，你可能需要将系统语言更改为英语。控制面板 -> 地区-> 管理-> 非 Unicode 来进行语言转换。

##### 编译与安装 mmcv

mmcv 有两个版本：

- 只包含 CPU 算子的版本

  编译 CPU 算子，但只有 x86 将会被编译，并且编译版本只能在 CPU only 情况下运行

- 既包含 CPU 算子，又包含 CUDA 算子的版本

  同时编译 CPU 和 CUDA 算子，`ops` 模块的 x86 与 CUDA 的代码都可以被编译。同时编译的版本可以在 CUDA 上调用 GPU

###### CPU 版本

编译安装

```powershell
(mmcv) PS C:\Users\xxx\mmcv> python setup.py build_ext  # 如果成功, cl 将被启动用于编译算子
(mmcv) PS C:\Users\xxx\mmcv> python setup.py develop  # 安装
```

###### GPU 版本

1. 检查 `CUDA_PATH` 或者 `CUDA_HOME` 环境变量是否已经存在于 `envs` 之中

   ```powershell
   (mmcv) PS C:\Users\xxx\mmcv> ls env:

   Name                           Value
   ----                           -----
   CUDA_PATH                      C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2
   CUDA_PATH_V10_1                C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1
   CUDA_PATH_V10_2                C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2
   ```

   如果没有，你可以按照下面的步骤设置

   ```powershell
   (mmcv) PS C:\Users\xxx\mmcv> $env:CUDA_HOME = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2"
   # 或者
   (mmcv) PS C:\Users\xxx\mmcv> $env:CUDA_HOME = $env:CUDA_PATH_V10_2  # CUDA_PATH_V10_2 已经在环境变量中
   ```

2. 设置 CUDA 的目标架构

   ```powershell
   # 这里需要改成你的显卡对应的目标架构
   (mmcv) PS C:\Users\xxx\mmcv> $env:TORCH_CUDA_ARCH_LIST="7.5"
   ```

   :::{note}
   可以点击 [cuda-gpus](https://developer.nvidia.com/cuda-gpus) 查看 GPU 的计算能力，也可以通过 CUDA 目录下的 deviceQuery.exe 工具查看

   ```powershell
   (mmcv) PS C:\Users\xxx\mmcv> &"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\extras\demo_suite\deviceQuery.exe"
   Device 0: "NVIDIA GeForce GTX 1660 SUPER"
   CUDA Driver Version / Runtime Version          11.7 / 11.1
   CUDA Capability Major/Minor version number:    7.5
   ```

   上面的 7.5 表示目标架构。注意：需把上面命令的 v10.2 换成你的 CUDA 版本。
   :::

3. 编译安装

   ```powershell
   (mmcv) PS C:\Users\xxx\mmcv> python setup.py build_ext  # 如果成功, cl 将被启动用于编译算子
   (mmcv) PS C:\Users\xxx\mmcv> python setup.py develop # 安装
   ```

   ```{note}
   如果你的 PyTorch 版本是 1.6.0，你可能会遇到一些 [issue](https://github.com/pytorch/pytorch/issues/42467) 提到的错误，你可以参考这个 [pull request](https://github.com/pytorch/pytorch/pull/43380/files) 修改本地环境的 PyTorch 源代码
   ```

##### 验证安装

```powershell
(mmcv) PS C:\Users\xxx\mmcv> python .dev_scripts/check_installation.py
```

如果上述命令没有报错，说明安装成功。如有报错，请查看[问题解决页面](../faq.md)是否已经有解决方案。
如果没有找到解决方案，欢迎提 [issue](https://github.com/open-mmlab/mmcv/issues)。

### 编译 mmcv-lite

如果你需要使用和 PyTorch 相关的模块，请确保 PyTorch 已经成功安装在环境中，可以参考 [PyTorch 官方安装文档](https://pytorch.org/get-started/locally/#start-locally)。

1. 克隆代码仓库

   ```bash
   git clone https://github.com/open-mmlab/mmcv.git
   cd mmcv
   ```

2. 开始编译

   ```bash
   MMCV_WITH_OPS=0 pip install -e . -v
   ```

3. 验证安装

   ```bash
   python -c 'import mmcv;print(mmcv.__version__)'
   ```

### 在寒武纪 MLU 机器编译 mmcv-full

#### 安装 torch_mlu

##### 选项1: 基于寒武纪 docker image 安装

首先请下载并且拉取寒武纪 docker (请向 service@cambricon.com 发邮件以获得最新的寒武纪 pytorch 发布 docker)。

```
docker pull ${docker image}
```

进入 docker, [编译 MMCV MLU](#编译mmcv-mlu) 并[进行验证](#验证是否成功安装)。

##### 选项2：基于 cambricon pytorch 源码编译安装

请向 service@cambricon.com 发送邮件或联系 Cambricon 工程师以获取合适版本的 CATCH 软件包，在您获得合适版本的 CATCH 软件包后，请参照 ${CATCH-path}/CONTRIBUTING.md 中的步骤安装 CATCH。

#### 编译 MMCV

克隆代码仓库

```bash
git clone https://github.com/open-mmlab/mmcv.git
```

算子库 mlu-ops 在编译 MMCV 时自动下载到默认路径(mmcv/mlu-ops)，你也可以在编译前设置环境变量 MMCV_MLU_OPS_PATH 指向已经存在的 mlu-ops 算子库路径。

```bash
export MMCV_MLU_OPS_PATH=/xxx/xxx/mlu-ops
```

开始编译

```bash
cd mmcv
export MMCV_WITH_OPS=1
export FORCE_MLU=1
python setup.py install
```

#### 验证是否成功安装

完成上述安装步骤之后，您可以尝试运行下面的 Python 代码以测试您是否成功在 MLU 设备上安装了 mmcv-full

```python
import torch
import torch_mlu  # presumably provides the .mlu() tensor method used below — confirm
from mmcv.ops import sigmoid_focal_loss
# Build the inputs and call .mlu() on each one (presumably moves it to the MLU device — confirm)
x = torch.randn(3, 10).mlu()
x.requires_grad = True
y = torch.tensor([1, 5, 3]).mlu()
w = torch.ones(10).float().mlu()
# NOTE(review): positional args look like gamma=2.0, alpha=0.25, weight=w,
# reduction='none' — confirm against the mmcv.ops.sigmoid_focal_loss signature
output = sigmoid_focal_loss(x, y, 2.0, 0.25, w, 'none')
```

### 在昇腾 NPU 机器编译 mmcv

在编译 mmcv 前，需要安装 torch_npu，完整安装教程详见 [PyTorch 安装指南](https://gitee.com/ascend/pytorch/blob/master/docs/zh/PyTorch%E5%AE%89%E8%A3%85%E6%8C%87%E5%8D%97/PyTorch%E5%AE%89%E8%A3%85%E6%8C%87%E5%8D%97.md#pytorch%E5%AE%89%E8%A3%85%E6%8C%87%E5%8D%97)

#### 选项 1: 使用 NPU 设备源码编译安装 mmcv (推荐方式)

- 拉取 [MMCV 源码](https://github.com/open-mmlab/mmcv.git)

```bash
git clone https://github.com/open-mmlab/mmcv.git
```

- 编译

```bash
MMCV_WITH_OPS=1 MAX_JOBS=8 FORCE_NPU=1 python setup.py build_ext
```

- 安装

```bash
MMCV_WITH_OPS=1 FORCE_NPU=1 python setup.py develop
```

#### 选项 2: 使用 pip 安装 Ascend 编译版本的 mmcv

Ascend 编译版本的 mmcv 在 mmcv >= 1.7.0 时已经支持直接 pip 安装

```bash
pip install mmcv -f https://download.openmmlab.com/mmcv/dist/ascend/torch1.8.0/index.html
```

#### 验证

```python
import torch
import torch_npu  # presumably provides the .npu() tensor method used below — confirm
from mmcv.ops import softmax_focal_loss

# Init tensor to the NPU
x = torch.randn(3, 10).npu()
y = torch.tensor([1, 5, 3]).npu()
w = torch.ones(10).float().npu()

# NOTE(review): positional args look like gamma=2.0, alpha=0.25, weight=w,
# reduction='none' — confirm against the mmcv.ops.softmax_focal_loss signature
output = softmax_focal_loss(x, y, 2.0, 0.25, w, 'none')
# Printing the result shows that the op ran without raising
print(output)
```


================================================
FILE: docs/zh_cn/get_started/installation.md
================================================
## 安装 MMCV

MMCV 有两个版本：

- **mmcv**: 完整版，包含所有的特性以及丰富的开箱即用的 CPU 和 CUDA 算子。注意，完整版本可能需要更长时间来编译。
- **mmcv-lite**: 精简版，不包含 CPU 和 CUDA 算子但包含其余所有特性和功能，类似 MMCV 1.0 之前的版本。如果你不需要使用算子的话，精简版可以作为一个考虑选项。

```{warning}
请不要在同一个环境中安装两个版本，否则可能会遇到类似 `ModuleNotFound` 的错误。在安装一个版本之前，需要先卸载另一个。`如果 CUDA 可用，强烈推荐安装 mmcv`。
```

### 安装 mmcv

在安装 mmcv 之前，请确保 PyTorch 已经成功安装在环境中，可以参考 [PyTorch 官方安装文档](https://pytorch.org/get-started/locally/#start-locally)。可使用以下命令验证

```bash
python -c 'import torch;print(torch.__version__)'
```

如果输出版本信息，则表示 PyTorch 已安装。

#### 使用 mim 安装（推荐）

[mim](https://github.com/open-mmlab/mim) 是 OpenMMLab 项目的包管理工具，使用它可以很方便地安装 mmcv。

```bash
pip install -U openmim
mim install mmcv
```

如果发现上述的安装命令没有使用预编译包（以 `.whl` 结尾）而是使用源码包（以 `.tar.gz` 结尾）安装，则有可能是我们没有提供和当前环境的 PyTorch 版本、CUDA 版本相匹配的 mmcv 预编译包，此时，你可以[源码安装 mmcv](build.md)。

<details>
<summary>使用预编译包的安装日志</summary>

Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html<br />
Collecting mmcv<br />
<b>Downloading https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/mmcv-2.0.0-cp38-cp38-manylinux1_x86_64.whl</b>

</details>

<details>
<summary>使用源码包的安装日志</summary>

Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html<br />
Collecting mmcv==2.0.0<br />
<b>Downloading mmcv-2.0.0.tar.gz</b>

</details>

如需安装指定版本的 mmcv，例如安装 2.0.0 版本的 mmcv，可使用以下命令

```bash
mim install mmcv==2.0.0
```

:::{note}
如果你打算使用 `opencv-python-headless` 而不是 `opencv-python`，例如在一个很小的容器环境或者没有图形用户界面的服务器中，你可以先安装 `opencv-python-headless`，这样在安装 mmcv 依赖的过程中会跳过 `opencv-python`。

另外，如果安装依赖库的时间过长，可以指定 pypi 源

```bash
mim install "mmcv>=2.0.0rc1" -i https://pypi.tuna.tsinghua.edu.cn/simple
```

:::

安装完成后可以运行 [check_installation.py](https://github.com/open-mmlab/mmcv/blob/main/.dev_scripts/check_installation.py) 脚本检查 mmcv 是否安装成功。

#### 使用 pip 安装

使用以下命令查看 CUDA 和 PyTorch 的版本

```bash
python -c 'import torch;print(torch.__version__);print(torch.version.cuda)'
```

根据系统的类型、CUDA 版本、PyTorch 版本以及 MMCV 版本选择相应的安装命令

<html>
<body>
<style>
    select {
        /*z-index: 1000;*/
        position: absolute;
        top: 10px;
        width: 6.7rem;
    }
    #select-container {
        position: relative;
        height: 30px;
    }
    #select-cmd {
        background-color: #f5f6f7;
        font-size: 14px;
        margin-top: 20px;
    }
    /* Space the selects 1.3rem apart from each other */
    #select-os {
        /* left: 1.375rem; */
        left: 0;
    }
    #select-cuda {
        /* left: 9.375rem;    9.375 = 1.375 + 6.7 + 1.3 */
        left: 8rem;
    }
    #select-torch {
        /* left: 17.375rem;    17.375 = 9.375 + 6.7 + 1.3 */
        left: 16rem;
    }
    #select-mmcv {
        /* left: 25.375rem;    25.375 = 17.375 + 6.7 + 1.3 */
        left: 24rem;
    }
</style>
<!-- Four linked selects (OS, CUDA, torch, mmcv). Each one expands on mousedown, collapses itself on the next click outside it (handleSelectBlur receives the select's own id), and pushes its new value into the script on change. -->
<div id="select-container">
    <select
            size="1"
            onmousedown="handleSelectMouseDown(this.id)"
            onclick="clickOutside(this, () => handleSelectBlur(this.id))"
            onchange="changeOS(this.value)"
            id="select-os">
    </select>
    <select
            size="1"
            onmousedown="handleSelectMouseDown(this.id)"
            onclick="clickOutside(this, () => handleSelectBlur(this.id))"
            onchange="changeCUDA(this.value)"
            id="select-cuda">
    </select>
    <select
            size="1"
            onmousedown="handleSelectMouseDown(this.id)"
            onclick="clickOutside(this, () => handleSelectBlur(this.id))"
            onchange="changeTorch(this.value)"
            id="select-torch">
    </select>
    <select
            size="1"
            onmousedown="handleSelectMouseDown(this.id)"
            onclick="clickOutside(this, () => handleSelectBlur(this.id))"
            onchange="changeMMCV(this.value)"
            id="select-mmcv">
    </select>
</div>
<pre id="select-cmd"></pre>
</body>
<script>
    // Currently selected value of each <select>
    let osVal, cudaVal, torchVal, mmcvVal;
    /**
     * Register a document-level click listener that, on the first click landing
     * outside `targetDom`, calls `handler` (when one is given) and then removes
     * itself. Clicks inside `targetDom` are ignored and leave the listener in
     * place; with a falsy `targetDom` the listener never does anything.
     */
    function clickOutside(targetDom, handler) {
        function onDocumentClick(event) {
            const clickedOutside = targetDom && !targetDom.contains(event.target);
            if (!clickedOutside) {
                return;
            }
            if (handler != null) {
                handler();
            }
            document.removeEventListener('click', onDocumentClick, false);
        }
        document.addEventListener('click', onDocumentClick, false);
    }
    // onchange handler of #select-mmcv: store the chosen value, then let change() refresh the page.
    function changeMMCV(val) {
        mmcvVal = val;
        change("select-mmcv");
    }
    // onchange handler of #select-torch: store the chosen value, then let change() refresh the page.
    function changeTorch(val) {
        torchVal = val;
        change("select-torch");
    }
    // onchange handler of #select-cuda: store the chosen value, then let change() refresh the page.
    function changeCUDA(val) {
        cudaVal = val;
        change("select-cuda");
    }
    // onchange handler of #select-os: store the chosen value, then let change() refresh the page.
    function changeOS(val) {
        osVal = val;
        change("select-os");
    }
    // Helpers that control the `size` of the selects
    /**
     * Expand the select with the given id to 10 visible rows and set its
     * z-index to 100, but only when it holds at least 10 options.
     * Does nothing when no element has that id.
     */
    function handleSelectMouseDown(id) {
        const select = document.getElementById(id);
        if (!select) return;
        const optionCount = select.options?.length;
        // Written as a negated >= so that an undefined count also bails out.
        if (!(optionCount >= 10)) return;
        select.size = 10;
        select.style.zIndex = 100;
    }
    /** Collapse every <select> in the document back to a single visible row. */
    function handleSelectClick() {
        for (const select of Array.from(document.getElementsByTagName("select"))) {
            select.size = 1;
        }
    }
    /**
     * Collapse the select with the given id to one row and reset its z-index
     * to 1. When no element matches the id, collapse every select instead.
     */
    function handleSelectBlur(id) {
        const select = document.getElementById(id);
        if (select) {
            select.size = 1;
            select.style.zIndex = 1;
        } else {
            // No specific id given: fall back to setting size = 1 on all selects.
            handleSelectClick();
        }
    }
    /**
     * Rebuild the pip install command from the current selections
     * (cudaVal, torchVal, mmcvVal) and display it in #select-cmd.
     */
    function changeCmd() {
        const output = document.getElementById("select-cmd");
        const template = "pip install mmcv=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html";
        // e.g: pip install mmcv==2.0.0rc1 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9/index.html
        // "cpu" and "mps" both use the "cpu" directory; any other value has its
        // dots removed and gets a "cu" prefix.
        const usesCpuBuild = cudaVal === "cpu" || cudaVal === "mps";
        const cudaVersion = usesCpuBuild ? "cpu" : `cu${cudaVal.split(".").join("")}`;
        // The directory name is the torch value without its last two characters.
        const torchVersion = `torch${torchVal.slice(0, -2)}`;
        const substitutions = [
            ["{cu_version}", cudaVersion],
            ["{mmcv_version}", mmcvVal],
            ["{torch_version}", torchVersion],
        ];
        output.textContent = substitutions.reduce(
            (text, [placeholder, value]) => text.replace(placeholder, value),
            template
        );
    }
    // De-duplicate an array, keeping first occurrences in order; any non-array input yields [].
    function unique(arr) {
        return Array.isArray(arr) ? Array.from(new Set(arr)) : [];
    }
    /**
     * Build a DocumentFragment with one <option> per string in `data`.
     * The label is "<name> <option>", where <name> is the part of `id` after
     * its first "-"; options of the "os" select and the CPU / MPS entries are
     * labelled with the bare option text.
     */
    function genOptionFragment(data, id) {
        // "select-cuda" -> "cuda"; an id without "-" is used unchanged.
        const name = id.split("-")[1] ?? id;
        const fragment = new DocumentFragment();
        data.forEach(option => {
            const optionEl = document.createElement("option");
            const showBare = name === "os" || ["CPU", "MPS"].includes(option.toUpperCase());
            optionEl.textContent = showBare ? `${option}` : `${name} ${option}`;
            // Keep the raw value on the element so the select's change handler can read it directly.
            optionEl.value = option;
            // Clicking an option collapses the selects again.
            optionEl.addEventListener('click', handleSelectClick);
            fragment.appendChild(optionEl);
        });
        return fragment;
    }
    // Generate the options for `data` and install them as the only children of
    // the element (a select) with the given id; does nothing if it is missing.
    function findAndAppend(data, id) {
        const options = genOptionFragment(data, id);
        document.getElementById(id)?.replaceChildren(options);
    }
    /**
     * React to a change of the select with the given id. Two things happen:
     * 1. The dependent selects are refreshed, following the chain
     *      OS ==> cuda ==> torch ==> mmcv
     * 2. The displayed command line is rebuilt.
     * Unknown ids are ignored.
     */
    function change(id) {
        // The index in this list equals the number of downstream selects to rebuild.
        const cascade = ["select-mmcv", "select-torch", "select-cuda", "select-os"];
        const depth = cascade.indexOf(id);
        if (depth === -1) return;
        const builds = version[osVal];
        if (depth >= 3) {
            // The OS changed: rebuild the cuda choices.
            const cudaOptions = unique(builds.map(build => build.cuda));
            cudaVal = cudaOptions[0];
            findAndAppend(cudaOptions, "select-cuda");
        }
        if (depth >= 2) {
            // cuda changed: rebuild the torch choices.
            const torchOptions = builds
                .filter(build => build.cuda === cudaVal)
                .map(build => build.torch);
            torchVal = torchOptions[0];
            findAndAppend(torchOptions, "select-torch");
        }
        if (depth >= 1) {
            // torch changed: rebuild the mmcv choices from the last matching entry.
            const matching = builds.filter(
                build => build.cuda === cudaVal && build.torch === torchVal
            );
            const mmcvOptions = matching.length > 0 ? matching[matching.length - 1].mmcv : [];
            mmcvVal = mmcvOptions[0];
            findAndAppend(mmcvOptions, "select-mmcv");
        }
        changeCmd();
    }
    // Initialise the widget from window.version: fill the OS select with its
    // keys, select the first one, and cascade to the other selects.
    function init() {
        // Disabled: a global click listener as a fallback for when the selects' blur handling fails.
        // document.addEventListener("click", handleSelectBlur);
        const osNames = Object.keys(window.version);
        osVal = osNames[0];
        findAndAppend(osNames, "select-os");
        change("select-os");
        changeCmd();
    }
    // Load the version data with XHR. When this page is opened directly as an
    // HTML file it must be served by a local server, otherwise the request
    // runs into cross-origin restrictions.
    window.onload = function () {
        const xhr = new XMLHttpRequest();
        xhr.open("get", "../_static/version.json");
        // No request body is sent.
        xhr.send(null);
        // Runs once the XHR object has received the response.
        xhr.onload = function () {
            // Only an HTTP 200 response counts as success; anything else is ignored.
            if (xhr.status === 200) {
                window.version = JSON.parse(xhr.responseText);
                init();
            }
        };
    };
</script>
</html>

如果在上面的下拉框中没有找到对应的版本，则可能是没有对应 PyTorch 或者 CUDA 或者 mmcv 版本的预编译包，此时，你可以[源码安装 mmcv](build.md)。

:::{note}
PyTorch 在 1.x.0 和 1.x.1 之间通常是兼容的，故 mmcv 只提供 1.x.0 的编译包。如果你
的 PyTorch 版本是 1.x.1，你可以放心地安装在 1.x.0 版本编译的 mmcv。例如，如果你的
PyTorch 版本是 1.8.1，你可以放心选择 1.8.x。
:::

:::{note}
如果你打算使用 `opencv-python-headless` 而不是 `opencv-python`，例如在一个很小的容器环境或者没有图形用户界面的服务器中，你可以先安装 `opencv-python-headless`，这样在安装 mmcv 依赖的过程中会跳过 `opencv-python`。

另外，如果安装依赖库的时间过长，可以指定 pypi 源

```bash
pip install mmcv -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html -i https://pypi.tuna.tsinghua.edu.cn/simple
```

:::

安装完成后可以运行 [check_installation.py](https://github.com/open-mmlab/mmcv/blob/main/.dev_scripts/check_installation.py) 脚本检查 mmcv 是否安装成功。

#### 使用 docker 镜像

先将算法库克隆到本地再构建镜像

```bash
git clone https://github.com/open-mmlab/mmcv.git && cd mmcv
docker build -t mmcv -f docker/release/Dockerfile .
```

也可以直接使用下面的命令构建镜像

```bash
docker build -t mmcv https://github.com/open-mmlab/mmcv.git#main:docker/release
```

[Dockerfile](https://github.com/open-mmlab/mmcv/blob/main/docker/release/Dockerfile) 默认安装最新的 mmcv，如果你想要指定版本，可以使用下面的命令

```bash
docker image build -t mmcv -f docker/release/Dockerfile --build-arg MMCV=2.0.0 .
```

如果你想要使用其他版本的 PyTorch 和 CUDA，你可以在构建镜像时指定它们的版本。

例如指定 PyTorch 的版本是 1.11，CUDA 的版本是 11.3

```bash
docker build -t mmcv -f docker/release/Dockerfile \
    --build-arg PYTORCH=1.11.0 \
    --build-arg CUDA=11.3 \
    --build-arg CUDNN=8 \
    --build-arg MMCV=2.0.0 .
```

更多 PyTorch 和 CUDA 镜像可以点击 [dockerhub/pytorch](https://hub.docker.com/r/pytorch/pytorch/tags) 查看。

### 安装 mmcv-lite

如果你需要使用和 PyTorch 相关的模块，请确保 PyTorch 已经成功安装在环境中，可以参考 [PyTorch 官方安装文档](https://pytorch.org/get-started/locally/#start-locally)。

```bash
pip install mmcv-lite
```


================================================
FILE: docs/zh_cn/get_started/introduction.md
================================================
## 介绍 MMCV

MMCV 是一个面向计算机视觉的基础库，它提供了以下功能：

- [图像和视频处理](../understand_mmcv/data_process.md)
- [图像和标注结果可视化](../understand_mmcv/visualization.md)
- [图像变换](../understand_mmcv/data_transform.md)
- [多种 CNN 网络结构](../understand_mmcv/cnn.md)
- [高质量实现的常见 CUDA 算子](../understand_mmcv/ops.md)

MMCV 支持多种平台，包括：

- Linux
- Windows
- macOS

它支持的 OpenMMLab 项目：

- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱
- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱
- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台
- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准
- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO 系列工具箱与测试基准
- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱
- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具箱
- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱
- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准
- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准
- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准
- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准
- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱
- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台
- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab 光流估计工具箱与测试基准
- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱
- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱
- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架


================================================
FILE: docs/zh_cn/get_started/previous_versions.md
================================================
## 其他版本的 PyTorch

我们不再提供在较低的 `PyTorch` 版本下编译的 `mmcv-full` 包，但为了您的方便，您可以在下面找到它们。

### PyTorch 1.4

| 1.0.0 \<= mmcv_version \<= 1.2.1

#### CUDA 10.1

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.4.0/index.html
```

#### CUDA 9.2

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.4.0/index.html
```

#### CPU

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.4.0/index.html
```

### PyTorch v1.3

| 1.0.0 \<= mmcv_version \<= 1.3.16

#### CUDA 10.1

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.3.0/index.html
```

#### CUDA 9.2

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.3.0/index.html
```

#### CPU

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.3.0/index.html
```


================================================
FILE: docs/zh_cn/index.rst
================================================
欢迎来到 MMCV 的中文文档！
=============================

您可以在页面左下角切换中英文文档。

.. toctree::
   :maxdepth: 2
   :caption: 介绍与安装

   get_started/introduction.md
   get_started/installation.md
   get_started/build.md
   get_started/article.md
   get_started/api_reference.md

.. toctree::
   :maxdepth: 2
   :caption: 深入理解 MMCV

   understand_mmcv/data_process.md
   understand_mmcv/data_transform.md
   understand_mmcv/visualization.md
   understand_mmcv/cnn.md
   understand_mmcv/ops.md

.. toctree::
   :caption: 语言切换

   switch_language.md

.. toctree::
   :maxdepth: 2
   :caption: 兼容性

   compatibility.md

.. toctree::

   faq.md

.. toctree::
   :maxdepth: 2
   :caption: 社区

   community/contributing.md
   community/pr.md
   community/code_style.md

.. toctree::
   :maxdepth: 1
   :caption: API 文档

   mmcv.image <api/image>
   mmcv.video <api/video>
   mmcv.visualization <api/visualization>
   mmcv.cnn <api/cnn>
   mmcv.ops <api/ops>
   mmcv.transforms <api/transforms>
   mmcv.arraymisc <api/arraymisc>
   mmcv.utils <api/utils>


Indices and tables
==================

* :ref:`genindex`
* :ref:`search`


================================================
FILE: docs/zh_cn/make.bat
================================================
@ECHO OFF

REM Run from the directory that contains this script; restored by popd at :end.
pushd %~dp0

REM Command file for Sphinx documentation

REM Use "sphinx-build" unless the caller already set SPHINXBUILD.
if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build

REM Without a target argument, just print the Sphinx help.
if "%1" == "" goto help

REM Probe the Sphinx executable; the branch below reports it as not found.
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

REM Forward the requested target (first argument) to Sphinx via the -M option.
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%

:end
popd


================================================
FILE: docs/zh_cn/switch_language.md
================================================
## <a href='https://mmcv.readthedocs.io/en/2.x/'>English</a>

## <a href='https://mmcv.readthedocs.io/zh_CN/2.x/'>简体中文</a>


================================================
FILE: docs/zh_cn/understand_mmcv/cnn.md
================================================
## 卷积神经网络

我们为卷积神经网络提供了一些构建模块，包括层构建、模块组件和权重初始化。

### 网络层的构建

在运行实验时，我们可能需要尝试同属一种类型但不同配置的层，但又不希望每次都修改代码。于是我们提供一些层构建方法，可以从字典构建层，字典可以在配置文件中配置，也可以通过命令行参数指定。

#### 用法

一个简单的例子：

```python
from mmcv.cnn import build_conv_layer

cfg = dict(type='Conv3d')
layer = build_conv_layer(cfg, in_channels=3, out_channels=8, kernel_size=3)
```

- `build_conv_layer`: 支持的类型包括 Conv1d、Conv2d、Conv3d、Conv（Conv是Conv2d的别名）
- `build_norm_layer`: 支持的类型包括 BN1d、BN2d、BN3d、BN（BN是BN2d的别名）、SyncBN、GN、LN、IN1d、IN2d、IN3d、IN（IN是IN2d的别名）
- `build_activation_layer`：支持的类型包括 ReLU、LeakyReLU、PReLU、RReLU、ReLU6、ELU、Sigmoid、Tanh、GELU
- `build_upsample_layer`: 支持的类型包括 nearest、bilinear、deconv、pixel_shuffle
- `build_padding_layer`: 支持的类型包括 zero、reflect、replicate

#### 拓展

我们还允许自定义层和算子来扩展构建方法。

1. 编写和注册自己的模块：

   ```python
   from mmengine.registry import MODELS

   @MODELS.register_module()
   class MyUpsample:

       def __init__(self, scale_factor):
           pass

       def forward(self, x):
           pass
   ```

2. 在某处导入 `MyUpsample` （例如 `__init__.py` ）然后使用它：

   ```python
   from mmcv.cnn import build_upsample_layer

   cfg = dict(type='MyUpsample', scale_factor=2)
   layer = build_upsample_layer(cfg)
   ```

### 模块组件

我们还提供了常用的模块组件，以方便网络构建。
卷积组件 `ConvModule` 由 convolution、normalization以及activation layers 组成，更多细节请参考 [ConvModule api](api.html#mmcv.cnn.ConvModule)。

```python
from mmcv.cnn import ConvModule

# conv + bn + relu
conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN'))
# conv + gn + relu
conv = ConvModule(3, 8, 2, norm_cfg=dict(type='GN', num_groups=2))
# conv + relu
conv = ConvModule(3, 8, 2)
# conv
conv = ConvModule(3, 8, 2, act_cfg=None)
# conv + leaky relu
conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='LeakyReLU'))
# bn + conv + relu
conv = ConvModule(
    3, 8, 2, norm_cfg=dict(type='BN'), order=('norm', 'conv', 'act'))
```


================================================
FILE: docs/zh_cn/understand_mmcv/data_process.md
================================================
## 数据处理

### 图像

图像模块提供了一些图像预处理的函数，该模块依赖 `opencv` 。

#### 读取/保存/显示

使用 `imread` 和 `imwrite` 函数可以读取和保存图像。

```python
import mmcv

img = mmcv.imread('test.jpg')
img = mmcv.imread('test.jpg', flag='grayscale')
img_ = mmcv.imread(img)  # 相当于什么也没做
mmcv.imwrite(img, 'out.jpg')
```

从二进制中读取图像

```python
with open('test.jpg', 'rb') as f:
    data = f.read()
img = mmcv.imfrombytes(data)
```

显示图像文件或已读取的图像

```python
mmcv.imshow('tests/data/color.jpg')

for i in range(10):
    img = np.random.randint(256, size=(100, 100, 3), dtype=np.uint8)
    mmcv.imshow(img, win_name='test image', wait_time=200)
```

#### 色彩空间转换

支持的转换函数：

- bgr2gray
- gray2bgr
- bgr2rgb
- rgb2bgr
- bgr2hsv
- hsv2bgr

```python
img = mmcv.imread('tests/data/color.jpg')
img1 = mmcv.bgr2rgb(img)
img2 = mmcv.rgb2gray(img1)
img3 = mmcv.bgr2hsv(img)
```

#### 缩放

有三种缩放图像的方法。所有以 `imresize_*` 开头的函数都有一个 `return_scale` 参数，如果
该参数为 `False` ，函数的返回值只有调整之后的图像，否则是一个元组 `(resized_img, scale)` 。

```python
# 缩放图像至给定的尺寸
mmcv.imresize(img, (1000, 600), return_scale=True)

# 缩放图像至与给定的图像同样的尺寸
mmcv.imresize_like(img, dst_img, return_scale=False)

# 以一定的比例缩放图像
mmcv.imrescale(img, 0.5)

# 缩放图像至最长的边不大于1000、最短的边不大于800并且没有改变图像的长宽比
mmcv.imrescale(img, (1000, 800))
```

#### 旋转

我们可以使用 `imrotate` 旋转图像一定的角度。旋转的中心需要指定，默认值是原始图像的中心。有
两种旋转的模式，一种保持图像的尺寸不变，因此旋转后原始图像中的某些部分会被裁剪，另一种是扩大
图像的尺寸进而保留完整的原始图像。

```python
img = mmcv.imread('tests/data/color.jpg')

# 顺时针旋转图像30度
img_ = mmcv.imrotate(img, 30)

# 逆时针旋转图像90度
img_ = mmcv.imrotate(img, -90)

# 顺时针旋转图像30度并且缩放图像为原始图像的1.5倍
img_ = mmcv.imrotate(img, 30, scale=1.5)

# 以坐标(100, 100)为中心顺时针旋转图像30度
img_ = mmcv.imrotate(img, 30, center=(100, 100))

# 顺时针旋转图像30度并扩大图像的尺寸
img_ = mmcv.imrotate(img, 30, auto_bound=True)
```

#### 翻转

我们可以使用 `imflip` 翻转图像。

```python
img = mmcv.imread('tests/data/color.jpg')

# 水平翻转图像
mmcv.imflip(img)

# 垂直翻转图像
mmcv.imflip(img, direction='vertical')
```

#### 裁剪

`imcrop` 可以裁剪图像的一个或多个区域，每个区域用左上角和右下角坐标表示，形如(x1, y1, x2, y2)

```python
import mmcv
import numpy as np

img = mmcv.imread('tests/data/color.jpg')

# 裁剪区域 (10, 10, 100, 120)
bboxes = np.array([10, 10, 100, 120])
patch = mmcv.imcrop(img, bboxes)

# 裁剪两个区域，分别是 (10, 10, 100, 120) 和 (0, 0, 50, 50)
bboxes = np.array([[10, 10, 100, 120], [0, 0, 50, 50]])
patches = mmcv.imcrop(img, bboxes)

# 裁剪两个区域并且缩放区域1.2倍
patches = mmcv.imcrop(img, bboxes, scale=1.2)
```

#### 填充

`impad` 和 `impad_to_multiple` 可以用给定的值将图像填充至给定的尺寸。

```python
img = mmcv.imread('tests/data/color.jpg')

# 用给定值将图像填充至 (1000, 1200)
img_ = mmcv.impad(img, shape=(1000, 1200), pad_val=0)

# 用给定值分别填充图像的3个通道至 (1000, 1200)
img_ = mmcv.impad(img, shape=(1000, 1200), pad_val=(100, 50, 200))

# 用给定值填充图像的左、上、右、下四条边
img_ = mmcv.impad(img, padding=(10, 20, 30, 40), pad_val=0)

# 用3个值分别填充图像的左、上、右、下四条边的3个通道
img_ = mmcv.impad(img, padding=(10, 20, 30, 40), pad_val=(100, 50, 200))

# 将图像的四条边填充至能够被给定值整除
img_ = mmcv.impad_to_multiple(img, 32)
```

### 视频

视频模块提供了以下的功能：

- 一个 `VideoReader` 类，具有友好的 API 接口可以读取和转换视频
- 一些编辑视频的方法，包括 `cut` ， `concat` ， `resize`
- 光流的读取/保存/变换

#### VideoReader

`VideoReader` 类提供了和序列一样的接口去获取视频帧。该类会缓存所有被访问过的帧。

```python
video = mmcv.VideoReader('test.mp4')

# 获取基本的信息
print(len(video))
print(video.width, video.height, video.resolution, video.fps)

# 遍历所有的帧
for frame in video:
    print(frame.shape)

# 读取下一帧
img = video.read()

# 使用索引获取帧
img = video[100]

# 获取指定范围的帧
img = video[5:10]
```

将视频切成帧并保存至给定目录或者从给定目录中生成视频。

```python
# 将视频切成帧并保存至目录
video = mmcv.VideoReader('test.mp4')
video.cvt2frames('out_dir')

# 从给定目录中生成视频
mmcv.frames2video('out_dir', 'test.avi')
```

#### 编辑函数

有几个用于编辑视频的函数，这些函数是对 `ffmpeg` 的封装。

```python
# 裁剪视频
mmcv.cut_video('test.mp4', 'clip1.mp4', start=3, end=10, vcodec='h264')

# 将多个视频拼接成一个视频
mmcv.concat_video(['clip1.mp4', 'clip2.mp4'], 'joined.mp4', log_level='quiet')

# 将视频缩放至给定的尺寸
mmcv.resize_video('test.mp4', 'resized1.mp4', (360, 240))

# 将视频缩放至给定的倍率
mmcv.resize_video('test.mp4', 'resized2.mp4', ratio=2)
```

#### 光流

`mmcv` 提供了以下用于操作光流的函数：

- 读取/保存
- 可视化
- 流变换

我们提供了两种将光流dump到文件的方法，分别是非压缩和压缩的方法。非压缩的方法直接将浮点数值的光流
保存至二进制文件，虽然光流无损但文件会比较大。而压缩的方法先量化光流至 0-255 整型数值再保存为
jpeg图像。光流的x维度和y维度会被拼接到图像中。

1. 读取/保存

```python
flow = np.random.rand(800, 600, 2).astype(np.float32)
# 保存光流到flo文件 (~3.7M)
mmcv.flowwrite(flow, 'uncompressed.flo')
# 保存光流为jpeg图像 (~230K)，图像的尺寸为 (800, 1200)
mmcv.flowwrite(flow, 'compressed.jpg', quantize=True, concat_axis=1)

# 读取光流文件，以下两种方式读取的光流尺寸均为 (800, 600, 2)
flow = mmcv.flowread('uncompressed.flo')
flow = mmcv.flowread('compressed.jpg', quantize=True, concat_axis=1)
```

2. 可视化

使用 `mmcv.flowshow()` 可视化光流

```python
mmcv.flowshow(flow)
```

![progress](../../en/_static/flow_visualization.png)

3. 流变换

```python
img1 = mmcv.imread('img1.jpg')
flow = mmcv.flowread('flow.flo')
warped_img2 = mmcv.flow_warp(img1, flow)
```

img1（左）和 img2（右）

![raw images](../../en/_static/flow_raw_images.png)

光流 (img2 -> img1)

![optical flow](../../en/_static/flow_img2toimg1.png)

变换后的图像和真实图像的差异

![warped image](../../en/_static/flow_warp_diff.png)


================================================
FILE: docs/zh_cn/understand_mmcv/data_transform.md
================================================
# 数据变换

在 OpenMMLab 算法库中，数据集的构建和数据的准备是相互解耦的。通常，数据集的构建只对数据集进行解析，记录每个样本的基本信息；而数据的准备则是通过一系列的数据变换，根据样本的基本信息进行数据加载、预处理、格式化等操作。

## 数据变换的设计

在 MMCV 中，我们使用各种可调用的数据变换类来进行数据的操作。这些数据变换类可以接受若干配置参数进行实例化，之后通过调用的方式对输入的数据字典进行处理。同时，我们约定所有数据变换都接受一个字典作为输入，并将处理后的数据输出为一个字典。一个简单的例子如下：

```python
>>> import numpy as np
>>> from mmcv.transforms import Resize
>>>
>>> transform = Resize(scale=(224, 224))
>>> data_dict = {'img': np.random.rand(256, 256, 3)}
>>> data_dict = transform(data_dict)
>>> print(data_dict['img'].shape)
(224, 224, 3)
```

数据变换类会读取输入字典的某些字段，并且可能添加、或者更新某些字段。这些字段的键大部分情况下是固定的，如 `Resize` 会固定地读取输入字典中的 `"img"` 等字段。我们可以在对应类的文档中了解对输入输出字段的约定。

```{note}
默认情况下，在需要图像尺寸作为**初始化参数**的数据变换 (如Resize, Pad) 中，图像尺寸的顺序均为 (width, height)。在数据变换**返回的字典**中，图像相关的尺寸， 如 `img_shape`、`ori_shape`、`pad_shape` 等，均为 (height, width)。
```

MMCV 为所有的数据变换类提供了一个统一的基类 (`BaseTransform`)：

```python
class BaseTransform(metaclass=ABCMeta):

    def __call__(self, results: dict) -> dict:

        return self.transform(results)

    @abstractmethod
    def transform(self, results: dict) -> dict:
        pass
```

所有的数据变换类都需要继承 `BaseTransform`，并实现 `transform` 方法。`transform` 方法的输入和输出均为一个字典。在**自定义数据变换类**一节中，我们会更详细地介绍如何实现一个数据变换类。

## 数据流水线

如上所述，所有数据变换的输入和输出都是一个字典，而且根据 OpenMMLab 中 [有关数据集的约定](TODO)，数据集中每个样本的基本信息都是一个字典。这样一来，我们可以将所有的数据变换操作首尾相接，组合成为一条数据流水线（data pipeline），输入数据集中样本的信息字典，输出完成一系列处理后的信息字典。

以分类任务为例，我们在下图展示了一个典型的数据流水线。对每个样本，数据集中保存的基本信息是一个如图中最左侧所示的字典，之后每经过一个由蓝色块代表的数据变换操作，数据字典中都会加入新的字段（标记为绿色）或更新现有的字段（标记为橙色）。

<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/154197953-bf0b1a16-3f41-4bc7-9e67-b2b9b323d895.png" width="90%"/>
</div>

在配置文件中，数据流水线是一个若干数据变换配置字典组成的列表，每个数据集都需要设置参数 `pipeline` 来定义该数据集需要进行的数据准备操作。如上数据流水线在配置文件中的配置如下：

```python
pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='Resize', size=256, keep_ratio=True),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]),
    dict(type='ClsFormatBundle')
]

dataset = dict(
    ...
    pipeline=pipeline,
    ...
)
```

## 常用的数据变换类

按照功能，常用的数据变换类可以大致分为数据加载、数据预处理与增强、数据格式化。在 MMCV 中，我们提供了一些常用的数据变换类如下：

### 数据加载

为了支持大规模数据集的加载，通常在 `Dataset` 初始化时不加载数据，只加载相应的路径。因此需要在数据流水线中进行具体数据的加载。

|            class            |                   功能                    |
| :-------------------------: | :---------------------------------------: |
| [`LoadImageFromFile`](TODO) |             根据路径加载图像              |
|  [`LoadAnnotations`](TODO)  | 加载和组织标注信息，如 bbox、语义分割图等 |

### 数据预处理及增强

数据预处理和增强通常是对图像本身进行变换，如裁剪、填充、缩放等。

|              class               |                功能                |
| :------------------------------: | :--------------------------------: |
|          [`Pad`](TODO)           |            填充图像边缘            |
|       [`CenterCrop`](TODO)       |              居中裁剪              |
|       [`Normalize`](TODO)        |          对图像进行归一化          |
|         [`Resize`](TODO)         |     按照指定尺寸或比例缩放图像     |
|      [`RandomResize`](TODO)      |    缩放图像至指定范围的随机尺寸    |
| [`RandomMultiscaleResize`](TODO) | 缩放图像至多个尺寸中的随机一个尺寸 |
|    [`RandomGrayscale`](TODO)     |             随机灰度化             |
|       [`RandomFlip`](TODO)       |            图像随机翻转            |
|   [`MultiScaleFlipAug`](TODO)    |   支持缩放和翻转的测试时数据增强   |

### 数据格式化

数据格式化操作通常是对数据进行的类型转换。

|          class          |               功能                |
| :---------------------: | :-------------------------------: |
|   [`ToTensor`](TODO)    | 将指定的数据转换为 `torch.Tensor` |
| [`ImageToTensor`](TODO) |    将图像转换为 `torch.Tensor`    |

## 自定义数据变换类

要实现一个新的数据变换类，需要继承 `BaseTransform`，并实现 `transform` 方法。这里，我们使用一个简单的翻转变换（`MyFlip`）作为示例：

```python
import random
import mmcv
from mmcv.transforms import BaseTransform, TRANSFORMS

@TRANSFORMS.register_module()
class MyFlip(BaseTransform):
    def __init__(self, direction: str):
        super().__init__()
        self.direction = direction

    def transform(self, results: dict) -> dict:
        img = results['img']
        results['img'] = mmcv.imflip(img, direction=self.direction)
        return results
```

从而，我们可以实例化一个 `MyFlip` 对象，并将之作为一个可调用对象，来处理我们的数据字典。

```python
import numpy as np

transform = MyFlip(direction='horizontal')
data_dict = {'img': np.random.rand(224, 224, 3)}
data_dict = transform(data_dict)
processed_img = data_dict['img']
```

又或者，在配置文件的 pipeline 中使用 `MyFlip` 变换

```python
pipeline = [
    ...
    dict(type='MyFlip', direction='horizontal'),
    ...
]
```

需要注意的是，如需在配置文件中使用，需要保证 `MyFlip` 类所在的文件在运行时能够被导入。

## 变换包装

变换包装是一种特殊的数据变换类，他们本身并不操作数据字典中的图像、标签等信息，而是对其中定义的数据变换的行为进行增强。

### 字段映射（KeyMapper）

字段映射包装（`KeyMapper`）用于对数据字典中的字段进行映射。例如，一般的图像处理变换都从数据字典中的 `"img"` 字段获得值。但有些时候，我们希望这些变换处理数据字典中其他字段中的图像，比如 `"gt_img"` 字段。

如果配合注册器和配置文件使用的话，在配置文件中数据集的 `pipeline` 中如下例使用字段映射包装：

```python
pipeline = [
    ...
    dict(type='KeyMapper',
        mapping={
            'img': 'gt_img',  # 将 "gt_img" 字段映射至 "img" 字段
            'mask': ...,  # 不使用原始数据中的 "mask" 字段。即对于被包装的数据变换，数据中不包含 "mask" 字段
        },
        auto_remap=True,  # 在完成变换后，将 "img" 重映射回 "gt_img" 字段
        transforms=[
            # 在 `RandomFlip` 变换类中，我们只需要操作 "img" 字段即可
            dict(type='RandomFlip'),
        ]),
    ...
]
```

利用字段映射包装，我们在实现数据变换类时，不需要在 `transform` 方法中考虑各种可能的输入字段名，只需要处理默认的字段即可。

### 随机选择（RandomChoice）和随机执行（RandomApply）

随机选择包装（`RandomChoice`）用于从一系列数据变换组合中随机应用一个数据变换组合。利用这一包装，我们可以简单地实现一些数据增强功能，比如 AutoAugment。

如果配合注册器和配置文件使用的话，在配置文件中数据集的 `pipeline` 中如下例使用随机选择包装：

```python
pipeline = [
    ...
    dict(type='RandomChoice',
        transforms=[
            [
                dict(type='Posterize', bits=4),
                dict(type='Rotate', angle=30.)
            ],  # 第一种随机变换组合
            [
                dict(type='Equalize'),
                dict(type='Rotate', angle=30)
            ],  # 第二种随机变换组合
        ],
        prob=[0.4, 0.6]  # 两种随机变换组合各自的选用概率
        ),
    ...
]
```

随机执行包装（`RandomApply`）用于以指定概率随机执行数据变换组合。例如：

```python
pipeline = [
    ...
    dict(type='RandomApply',
        transforms=[dict(type='Rotate', angle=30.)],
        prob=0.3),  # 以 0.3 的概率执行被包装的数据变换
    ...
]
```

### 多目标扩展（TransformBroadcaster）

通常，一个数据变换类只会从一个固定的字段读取操作目标。虽然我们也可以使用 `KeyMapper` 来改变读取的字段，但无法将变换一次性应用于多个字段的数据。为了实现这一功能，我们需要借助多目标扩展包装（`TransformBroadcaster`）。

多目标扩展包装（`TransformBroadcaster`）有两个用法，一是将数据变换作用于指定的多个字段，二是将数据变换作用于某个字段下的一组目标中。

1. 应用于多个字段

   假设我们需要将数据变换应用于 `"lq"` (low-quality) 和 `"gt"` (ground-truth) 两个字段中的图像上。

   ```python
   pipeline = [
       dict(type='TransformBroadcaster',
           # 分别应用于 "lq" 和 "gt" 两个字段，并将二者映射至 "img" 字段
           mapping={'img': ['lq', 'gt']},
           # 在完成变换后，将 "img" 字段重映射回原先的字段
           auto_remap=True,
           # 是否在对各目标的变换中共享随机变量
           # 更多介绍参见后续章节（随机变量共享）
           share_random_params=True,
           transforms=[
               # 在 `RandomFlip` 变换类中，我们只需要操作 "img" 字段即可
               dict(type='RandomFlip'),
           ])
   ]
   ```

   在多目标扩展的 `mapping` 设置中，我们同样可以使用 `...` 来忽略指定的原始字段。如以下例子中，被包裹的 `RandomCrop` 会对字段 `"img"` 中的图像进行裁剪，并且在字段 `"img_shape"` 存在时更新剪裁后的图像大小。如果我们希望同时对两个图像字段 `"lq"` 和 `"gt"` 进行相同的随机裁剪，但只更新一次 `"img_shape"` 字段，可以通过例子中的方式实现：

   ```python
   pipeline = [
       dict(type='TransformBroadcaster',
           mapping={
               'img': ['lq', 'gt'],
               'img_shape': ['img_shape', ...],
            },
           # 在完成变换后，将 "img" 和 "img_shape" 字段重映射回原先的字段
           auto_remap=True,
           # 是否在对各目标的变换中共享随机变量
           # 更多介绍参见后续章节（随机变量共享）
           share_random_params=True,
           transforms=[
               # `RandomCrop` 类中会操作 "img" 和 "img_shape" 字段。若 "img_shape" 空缺，
               # 则只操作 "img"
               dict(type='RandomCrop'),
           ])
   ]
   ```

2. 应用于一个字段的一组目标

   假设我们需要将数据变换应用于 `"images"` 字段，该字段为一个图像组成的 list。

   ```python
   pipeline = [
       dict(type='TransformBroadcaster',
           # 将 "images" 字段下的每张图片映射至 "img" 字段
           mapping={'img': 'images'},
           # 在完成变换后，将 "img" 字段下的图片重映射回 "images" 字段的列表中
           auto_remap=True,
           # 是否在对各目标的变换中共享随机变量
           share_random_params=True,
           transforms=[
               # 在 `RandomFlip` 变换类中，我们只需要操作 "img" 字段即可
               dict(type='RandomFlip'),
           ])
   ]
   ```

#### 装饰器 `cache_randomness`

在 `TransformBroadcaster` 中，我们提供了 `share_random_params` 选项来支持在多次数据变换中共享随机状态。例如，在超分辨率任务中，我们希望将随机变换**同步**作用于低分辨率图像和原始图像。如果我们希望在自定义的数据变换类中使用这一功能，需要在类中标注哪些随机变量是支持共享的。这可以通过装饰器 `cache_randomness` 来实现。

以上文中的 `MyFlip` 为例，我们希望以一定的概率随机执行翻转：

```python
from mmcv.transforms.utils import cache_randomness

@TRANSFORMS.register_module()
class MyRandomFlip(BaseTransform):
    def __init__(self, prob: float, direction: str):
        super().__init__()
        self.prob = prob
        self.direction = direction

    @cache_randomness  # 标注该方法的输出为可共享的随机变量
    def do_flip(self):
        flip = True if random.random() < self.prob else False
        return flip

    def transform(self, results: dict) -> dict:
        img = results['img']
        if self.do_flip():
            results['img'] = mmcv.imflip(img, direction=self.direction)
        return results
```

在上面的例子中，我们用`cache_randomness` 装饰 `do_flip`方法，即将该方法返回值 `flip` 标注为一个支持共享的随机变量。进而，在 `TransformBroadcaster` 对多个目标的变换中，这一变量的值都会保持一致。

#### 装饰器 `avoid_cache_randomness`

在一些情况下，我们无法将数据变换中产生随机变量的过程单独放在类方法中。例如数据变换中使用的来自第三方库的模块，这些模块将随机变量相关的部分封装在了内部，导致无法将其抽出为数据变换的类方法。这样的数据变换无法通过装饰器 `cache_randomness` 标注支持共享的随机变量，进而无法在多目标扩展时共享随机变量。

为了避免在多目标扩展中误用此类数据变换，我们提供了另一个装饰器 `avoid_cache_randomness`，用来对此类数据变换进行标记：

```python
from mmcv.transforms.utils import avoid_cache_randomness

@TRANSFORMS.register_module()
@avoid_cache_randomness
class MyRandomTransform(BaseTransform):

    def transform(self, results: dict) -> dict:
        ...
```

用 `avoid_cache_randomness` 标记的数据变换类，当其实例被 `TransformBroadcaster` 包装且将参数 `share_random_params` 设置为 True 时，会抛出异常，以此提醒用户不能这样使用。

在使用 `avoid_cache_randomness` 时需要注意以下几点：

1. `avoid_cache_randomness` 只用于装饰数据变换类（BaseTransform 的子类），而不能用于装饰其他一般的类、类方法或函数
2. 被 `avoid_cache_randomness` 修饰的数据变换作为基类时，其子类将**不会继承**这一特性。如果子类仍无法共享随机变量，则应再次使用 `avoid_cache_randomness` 修饰
3. 只有当一个数据变换具有随机性，且无法共享随机参数时，才需要以 `avoid_cache_randomness` 修饰。无随机性的数据变换不需要修饰


================================================
FILE: docs/zh_cn/understand_mmcv/ops.md
================================================
## 算子

MMCV 提供了检测、分割等任务中常用的算子

| Device                       | CPU | CUDA | MLU | MPS | Ascend |
| ---------------------------- | --- | ---- | --- | --- | ------ |
| ActiveRotatedFilter          | √   | √    |     |     | √      |
| AssignScoreWithK             |     | √    |     |     |        |
| BallQuery                    |     | √    | √   |     | √      |
| BBoxOverlaps                 |     | √    | √   | √   | √      |
| BorderAlign                  |     | √    |     |     |        |
| BoxIouRotated                | √   | √    | √   |     | √      |
| BoxIouQuadri                 | √   | √    |     |     |        |
| CARAFE                       |     | √    | √   |     |        |
| ChamferDistance              |     | √    |     |     | √      |
| CrissCrossAttention          |     | √    |     |     |        |
| ContourExpand                | √   |      |     |     |        |
| ConvexIoU                    |     | √    |     |     |        |
| CornerPool                   |     | √    |     |     |        |
| Correlation                  |     | √    |     |     |        |
| Deformable Convolution v1/v2 | √   | √    | √   |     | √      |
| Deformable RoIPool           |     | √    | √   |     | √      |
| DiffIoURotated               |     | √    | √   |     |        |
| DynamicScatter               |     | √    | √   |     |        |
| FurthestPointSample          |     | √    |     |     | √      |
| FurthestPointSampleWithDist  |     | √    |     |     | √      |
| FusedBiasLeakyrelu           |     | √    |     |     | √      |
| GatherPoints                 |     | √    |     |     | √      |
| GroupPoints                  |     | √    |     |     |        |
| Iou3d                        |     | √    | √   |     |        |
| KNN                          |     | √    |     |     |        |
| MaskedConv                   |     | √    | √   |     | √      |
| MergeCells                   |     | √    |     |     |        |
| MinAreaPolygon               |     | √    |     |     |        |
| ModulatedDeformConv2d        | √   | √    | √   |     | √      |
| MultiScaleDeformableAttn     |     | √    | √   |     | √      |
| NMS                          | √   | √    | √   |     | √      |
| NMSRotated                   | √   | √    | √   |     | √      |
| NMSQuadri                    | √   | √    |     |     |        |
| PixelGroup                   | √   |      |     |     |        |
| PointsInBoxes                | √   | √    |     |     |        |
| PointsInPolygons             |     | √    |     |     |        |
| PSAMask                      | √   | √    | √   |     | √      |
| RotatedFeatureAlign          | √   | √    | √   |     | √      |
| RoIPointPool3d               |     | √    | √   |     |        |
| RoIPool                      |     | √    | √   |     | √      |
| RoIAlignRotated              | √   | √    | √   |     | √      |
| RiRoIAlignRotated            |     | √    |     |     |        |
| RoIAlign                     | √   | √    | √   |     | √      |
| RoIAwarePool3d               |     | √    | √   |     |        |
| SAConv2d                     |     | √    |     |     |        |
| SigmoidFocalLoss             |     | √    | √   |     | √      |
| SoftmaxFocalLoss             |     | √    |     |     | √      |
| SoftNMS                      |     | √    |     |     |        |
| Sparse Convolution           |     | √    | √   |     |        |
| Synchronized BatchNorm       |     | √    |     |     |        |
| ThreeInterpolate             |     | √    |     |     |        |
| ThreeNN                      |     | √    | √   |     |        |
| TINShift                     |     | √    | √   |     |        |
| UpFirDn2d                    |     | √    |     |     |        |
| Voxelization                 | √   | √    | √   |     | √      |
| PrRoIPool                    |     | √    |     |     |        |
| BezierAlign                  | √   | √    |     |     |        |
| BiasAct                      |     | √    |     |     |        |
| FilteredLrelu                |     | √    |     |     |        |
| Conv2dGradfix                |     | √    |     |     |        |


================================================
FILE: docs/zh_cn/understand_mmcv/visualization.md
================================================
## 可视化

`mmcv` 可以展示图像以及标注（目前只支持标注框）

```python
# 展示图像文件
mmcv.imshow('a.jpg')

# 展示已加载的图像
img = np.random.rand(100, 100, 3)
mmcv.imshow(img)

# 展示带有标注框的图像
img = np.random.rand(100, 100, 3)
bboxes = np.array([[0, 0, 50, 50], [20, 20, 60, 60]])
mmcv.imshow_bboxes(img, bboxes)
```

`mmcv` 也可以展示特殊的图像，例如光流

```python
flow = mmcv.flowread('test.flo')
mmcv.flowshow(flow)
```


================================================
FILE: mmcv/__init__.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
# flake8: noqa
from .arraymisc import *
from .image import *
from .transforms import *
from .version import *
from .video import *
from .visualization import *

# The following modules are not imported to this level, so mmcv may be used
# without PyTorch.
# - ops
# - utils


================================================
FILE: mmcv/arraymisc/__init__.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from .quantization import dequantize, quantize

__all__ = ['quantize', 'dequantize']


================================================
FILE: mmcv/arraymisc/quantization.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Union

import numpy as np


def quantize(arr: np.ndarray,
             min_val: Union[int, float],
             max_val: Union[int, float],
             levels: int,
             dtype=np.int64) -> np.ndarray:
    """Quantize an array of (-inf, inf) to [0, levels-1].

    Values are clipped to ``[min_val, max_val]`` and then mapped onto
    ``levels`` equally sized bins. Values equal to ``max_val`` are assigned
    to the last bin (``levels - 1``).

    Args:
        arr (ndarray): Input array.
        min_val (int or float): Minimum value to be clipped.
        max_val (int or float): Maximum value to be clipped.
        levels (int): Quantization levels. Must be an int greater than 1.
        dtype (np.type): The type of the quantized array.

    Returns:
        ndarray: Quantized array.

    Raises:
        ValueError: If ``levels`` is not an int greater than 1, or if
            ``min_val`` is not smaller than ``max_val``.
    """
    if not (isinstance(levels, int) and levels > 1):
        raise ValueError(
            f'levels must be an integer greater than 1, but got {levels}')
    if min_val >= max_val:
        raise ValueError(
            f'min_val ({min_val}) must be smaller than max_val ({max_val})')

    # Shift the clipped values so that the valid range starts at zero.
    arr = np.clip(arr, min_val, max_val) - min_val
    # ``max_val`` itself would land in bin ``levels``; cap it at the last bin.
    quantized_arr = np.minimum(
        np.floor(levels * arr / (max_val - min_val)).astype(dtype), levels - 1)

    return quantized_arr


def dequantize(arr: np.ndarray,
               min_val: Union[int, float],
               max_val: Union[int, float],
               levels: int,
               dtype=np.float64) -> np.ndarray:
    """Dequantize an array.

    Each bin index is mapped to the center of its bin (``index + 0.5``) in
    the range ``[min_val, max_val]`` split into ``levels`` equal bins.

    Args:
        arr (ndarray): Input array.
        min_val (int or float): Lower bound of the dequantized value range.
        max_val (int or float): Upper bound of the dequantized value range.
        levels (int): Quantization levels. Must be an int greater than 1.
        dtype (np.type): The type of the dequantized array.

    Returns:
        ndarray: Dequantized array.

    Raises:
        ValueError: If ``levels`` is not an int greater than 1, or if
            ``min_val`` is not smaller than ``max_val``.
    """
    if not (isinstance(levels, int) and levels > 1):
        raise ValueError(
            f'levels must be an integer greater than 1, but got {levels}')
    if min_val >= max_val:
        raise ValueError(
            f'min_val ({min_val}) must be smaller than max_val ({max_val})')

    # Adding 0.5 selects the middle of each bin before rescaling.
    dequantized_arr = (arr + 0.5).astype(dtype) * (max_val -
                                                   min_val) / levels + min_val

    return dequantized_arr


================================================
FILE: mmcv/cnn/__init__.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from .alexnet import AlexNet
# yapf: disable
from .bricks import (ContextBlock, Conv2d, Conv3d, ConvAWS2d, ConvModule,
                     ConvTranspose2d, ConvTranspose3d, ConvWS2d,
                     DepthwiseSeparableConvModule, GeneralizedAttention,
                     HSigmoid, HSwish, Linear, MaxPool2d, MaxPool3d,
                     NonLocal1d, NonLocal2d, NonLocal3d, Scale, Swish,
                     build_activation_layer, build_conv_layer,
                     build_norm_layer, build_padding_layer, build_plugin_layer,
                     build_upsample_layer, conv_ws_2d, is_norm)
# yapf: enable
from .resnet import ResNet, make_res_layer
from .rfsearch import Conv2dRFSearchOp, RFSearchHook
from .utils import fuse_conv_bn, get_model_complexity_info
from .vgg import VGG, make_vgg_layer

__all__ = [
    'AlexNet', 'VGG', 'make_vgg_layer', 'ResNet', 'make_res_layer',
    'ConvModule', 'build_activation_layer', 'build_conv_layer',
    'build_norm_layer', 'build_padding_layer', 'build_upsample_layer',
    'build_plugin_layer', 'is_norm', 'NonLocal1d', 'NonLocal2d', 'NonLocal3d',
    'ContextBlock', 'HSigmoid', 'Swish', 'HSwish', 'GeneralizedAttention',
    'Scale', 'conv_ws_2d', 'ConvAWS2d', 'ConvWS2d',
    'DepthwiseSeparableConvModule', 'Linear', 'Conv2d', 'ConvTranspose2d',
    'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d', 'Conv3d', 'fuse_conv_bn',
    'get_model_complexity_info', 'Conv2dRFSearchOp', 'RFSearchHook'
]


================================================
FILE: mmcv/cnn/alexnet.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import logging
from typing import Optional

import torch
import torch.nn as nn
from mmengine.runner import load_checkpoint


class AlexNet(nn.Module):
    """AlexNet backbone.

    Args:
        num_classes (int): Number of classes for classification. The
            classifier head is only built when this value is positive;
            otherwise ``forward`` returns the output of the convolutional
            feature extractor. Defaults to -1.
    """

    def __init__(self, num_classes: int = -1):
        super().__init__()
        self.num_classes = num_classes

        # Two conv/ReLU/pool stages followed by three 3x3 conv/ReLU pairs
        # and a final pooling layer.
        conv_stack = [
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        ]
        for in_ch, out_ch in ((192, 384), (384, 256), (256, 256)):
            conv_stack.append(
                nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1))
            conv_stack.append(nn.ReLU(inplace=True))
        conv_stack.append(nn.MaxPool2d(kernel_size=3, stride=2))
        self.features = nn.Sequential(*conv_stack)

        if self.num_classes > 0:
            head = [
                nn.Dropout(),
                nn.Linear(256 * 6 * 6, 4096),
                nn.ReLU(inplace=True),
                nn.Dropout(),
                nn.Linear(4096, 4096),
                nn.ReLU(inplace=True),
                nn.Linear(4096, num_classes),
            ]
            self.classifier = nn.Sequential(*head)

    def init_weights(self, pretrained: Optional[str] = None) -> None:
        """Optionally load weights from a checkpoint.

        Args:
            pretrained (str, optional): Checkpoint passed to
                ``load_checkpoint`` (non-strict). If None, the default
                initialization of the layers is kept.

        Raises:
            TypeError: If ``pretrained`` is neither a str nor None.
        """
        if pretrained is None:
            # use default initializer
            return
        if not isinstance(pretrained, str):
            raise TypeError('pretrained must be a str or None')
        root_logger = logging.getLogger()
        load_checkpoint(self, pretrained, strict=False, logger=root_logger)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the feature extractor and, if present, the classifier."""
        feats = self.features(x)
        if self.num_classes > 0:
            # Flatten to the fixed 256 * 6 * 6 input size of the classifier.
            flat = feats.view(feats.size(0), 256 * 6 * 6)
            return self.classifier(flat)

        return feats


================================================
FILE: mmcv/cnn/bricks/__init__.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from .activation import build_activation_layer
from .context_block import ContextBlock
from .conv import build_conv_layer
from .conv2d_adaptive_padding import Conv2dAdaptivePadding
from .conv_module import ConvModule
from .conv_ws import ConvAWS2d, ConvWS2d, conv_ws_2d
from .depthwise_separable_conv_module import DepthwiseSeparableConvModule
from .drop import Dropout, DropPath
from .generalized_attention import GeneralizedAttention
from .hsigmoid import HSigmoid
from .hswish import HSwish
from .non_local import NonLocal1d, NonLocal2d, NonLocal3d
from .norm import build_norm_layer, is_norm
from .padding import build_padding_layer
from .plugin import build_plugin_layer
from .scale import LayerScale, Scale
from .swish import Swish
from .upsample import build_upsample_layer
from .wrappers import (Conv2d, Conv3d, ConvTranspose2d, ConvTranspose3d,
                       Linear, MaxPool2d, MaxPool3d)

__all__ = [
    'ConvModule', 'build_activation_layer', 'build_conv_layer',
    'build_norm_layer', 'build_padding_layer', 'build_upsample_layer',
    'build_plugin_layer', 'is_norm', 'HSigmoid', 'HSwish', 'NonLocal1d',
    'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'GeneralizedAttention',
    'Scale', 'ConvAWS2d', 'ConvWS2d', 'conv_ws_2d',
    'DepthwiseSeparableConvModule', 'Swish', 'Linear', 'Conv2dAdaptivePadding',
    'Conv2d', 'ConvTranspose2d', 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d',
    'Conv3d', 'Dropout', 'DropPath', 'LayerScale'
]


================================================
FILE: mmcv/cnn/bricks/activation.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmengine.registry import MODELS
from mmengine.utils import digit_version
from mmengine.utils.dl_utils import TORCH_VERSION

# Make the stock PyTorch activation classes available through the ``MODELS``
# registry so that ``build_activation_layer`` can construct them from a config.
for module in (nn.ReLU, nn.LeakyReLU, nn.PReLU, nn.RReLU, nn.ReLU6, nn.ELU,
               nn.Sigmoid, nn.Tanh):
    MODELS.register_module(module=module)

# Register ``SiLU``: the built-in ``nn.SiLU`` is used when the installed
# PyTorch is at least 1.7.0, otherwise the small fallback class defined below
# is registered under the same name.
if digit_version(torch.__version__) >= digit_version('1.7.0'):
    MODELS.register_module(module=nn.SiLU, name='SiLU')
else:

    class SiLU(nn.Module):
        """Sigmoid Weighted Linear Unit.

        Computes ``inputs * sigmoid(inputs)`` element-wise.

        Args:
            inplace (bool): If True, the result is written into ``inputs``
                instead of a new tensor. Default: False.
        """

        def __init__(self, inplace=False):
            super().__init__()
            self.inplace = inplace

        def forward(self, inputs) -> torch.Tensor:
            if self.inplace:
                # ``mul_`` overwrites ``inputs`` with the activated values.
                return inputs.mul_(torch.sigmoid(inputs))
            else:
                return inputs * torch.sigmoid(inputs)

    MODELS.register_module(module=SiLU, name='SiLU')


@MODELS.register_module(name='Clip')
@MODELS.register_module()
class Clamp(nn.Module):
    """Activation layer that limits every element to a fixed interval.

    Elements of the feature map are clamped into :math:`[min, max]` by
    ``torch.clamp()``. The class is registered twice: once without an
    explicit name and once under the alias ``Clip``.

    Args:
        min (Number | optional): Lower-bound of the range to be clamped to.
            Default to -1.
        max (Number | optional): Upper-bound of the range to be clamped to.
            Default to 1.
    """

    def __init__(self, min: float = -1., max: float = 1.):
        super().__init__()
        self.min, self.max = min, max

    def forward(self, x) -> torch.Tensor:
        """Clamp ``x`` into the configured interval.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            torch.Tensor: Clamped tensor.
        """
        lower, upper = self.min, self.max
        return torch.clamp(x, min=lower, max=upper)


class GELU(nn.Module):
    r"""Gaussian Error Linear Unit activation.

    The layer evaluates

    .. math::
        \text{GELU}(x) = x * \Phi(x)

    where :math:`\Phi(x)` is the Cumulative Distribution Function for
    Gaussian Distribution. The computation is delegated to
    ``torch.nn.functional.gelu``.

    Shape:
        - Input: :math:`(N, *)` where `*` means, any number of additional
          dimensions
        - Output: :math:`(N, *)`, same shape as the input

    .. image:: scripts/activation_images/GELU.png

    Examples::

        >>> m = nn.GELU()
        >>> input = torch.randn(2)
        >>> output = m(input)
    """

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        activated = F.gelu(input)
        return activated


# Register the built-in ``nn.GELU`` when running on regular PyTorch >= 1.4;
# under parrots or an older PyTorch the local ``GELU`` class is used instead.
# The 'parrots' comparison comes first so that ``digit_version`` is never
# called on that value.
if (TORCH_VERSION != 'parrots'
        and digit_version(TORCH_VERSION) >= digit_version('1.4')):
    MODELS.register_module(module=nn.GELU)
else:
    MODELS.register_module(module=GELU)


def build_activation_layer(cfg: Dict) -> nn.Module:
    """Instantiate an activation layer from its config.

    The config is handed unchanged to ``MODELS.build``.

    Args:
        cfg (dict): The activation layer config, which should contain:

            - type (str): Layer type.
            - layer args: Args needed to instantiate an activation layer.

    Returns:
        nn.Module: Created activation layer.
    """
    activation = MODELS.build(cfg)
    return activation


================================================
FILE: mmcv/cnn/bricks/context_block.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Union

import torch
from mmengine.model import constant_init, kaiming_init
from mmengine.registry import MODELS
from torch import nn


def last_zero_init(m: Union[nn.Module, nn.Sequential]) -> None:
    """Apply ``constant_init(..., val=0)`` to ``m`` or to its last layer.

    Args:
        m (nn.Module | nn.Sequential): Module to initialize. For an
            ``nn.Sequential`` only the final sub-module is initialized;
            any other module is initialized directly.
    """
    target = m[-1] if isinstance(m, nn.Sequential) else m
    constant_init(target, val=0)


@MODELS.register_module()
class ContextBlock(nn.Module):
    """ContextBlock module in GCNet.

    See 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond'
    (https://arxiv.org/abs/1904.11492) for details.

    Args:
        in_channels (int): Channels of the input feature map.
        ratio (float): Ratio of channels of transform bottleneck. The
            bottleneck width is ``int(in_channels * ratio)``.
        pooling_type (str): Pooling method for context modeling.
            Options are 'att' and 'avg', stand for attention pooling and
            average pooling respectively. Default: 'att'.
        fusion_types (Sequence[str]): Fusion method for feature fusion,
            Options are 'channel_add', 'channel_mul', stand for channelwise
            addition and multiplication respectively. Must be a non-empty
            list or tuple. Default: ('channel_add',)
    """

    _abbr_ = 'context_block'

    def __init__(self,
                 in_channels: int,
                 ratio: float,
                 pooling_type: str = 'att',
                 fusion_types: tuple = ('channel_add', )):
        super().__init__()
        assert pooling_type in ['avg', 'att']
        assert isinstance(fusion_types, (list, tuple))
        valid_fusion_types = ['channel_add', 'channel_mul']
        assert all(f in valid_fusion_types for f in fusion_types)
        assert len(fusion_types) > 0, 'at least one fusion should be used'
        self.in_channels = in_channels
        self.ratio = ratio
        self.planes = int(in_channels * ratio)
        self.pooling_type = pooling_type
        self.fusion_types = fusion_types
        if pooling_type == 'att':
            # 1x1 conv producing one attention logit per spatial position;
            # the softmax runs over the flattened H * W axis.
            self.conv_mask = nn.Conv2d(in_channels, 1, kernel_size=1)
            self.softmax = nn.Softmax(dim=2)
        else:
            self.avg_pool = nn.AdaptiveAvgPool2d(1)
        # Both fusion branches share the same bottleneck architecture; a
        # branch that is not requested is stored as None.
        if 'channel_add' in fusion_types:
            self.channel_add_conv = self._build_channel_transform()
        else:
            self.channel_add_conv = None
        if 'channel_mul' in fusion_types:
            self.channel_mul_conv = self._build_channel_transform()
        else:
            self.channel_mul_conv = None
        self.reset_parameters()

    def _build_channel_transform(self) -> nn.Sequential:
        """Build one conv -> LayerNorm -> ReLU -> conv bottleneck branch."""
        return nn.Sequential(
            nn.Conv2d(self.in_channels, self.planes, kernel_size=1),
            nn.LayerNorm([self.planes, 1, 1]),
            nn.ReLU(inplace=True),  # yapf: disable
            nn.Conv2d(self.planes, self.in_channels, kernel_size=1))

    def reset_parameters(self):
        """Initialize the attention conv and the last conv of each branch."""
        if self.pooling_type == 'att':
            kaiming_init(self.conv_mask, mode='fan_in')
            self.conv_mask.inited = True

        # The final layer of each existing fusion branch goes through
        # ``last_zero_init``.
        if self.channel_add_conv is not None:
            last_zero_init(self.channel_add_conv)
        if self.channel_mul_conv is not None:
            last_zero_init(self.channel_mul_conv)

    def spatial_pool(self, x: torch.Tensor) -> torch.Tensor:
        """Pool ``x`` over its spatial dimensions into a context vector.

        Args:
            x (torch.Tensor): Input of shape [N, C, H, W].

        Returns:
            torch.Tensor: Context of shape [N, C, 1, 1].
        """
        batch, channel, height, width = x.size()
        if self.pooling_type == 'att':
            input_x = x
            # [N, C, H * W]
            input_x = input_x.view(batch, channel, height * width)
            # [N, 1, C, H * W]
            input_x = input_x.unsqueeze(1)
            # [N, 1, H, W]
            context_mask = self.conv_mask(x)
            # [N, 1, H * W]
            context_mask = context_mask.view(batch, 1, height * width)
            # [N, 1, H * W]
            context_mask = self.softmax(context_mask)
            # [N, 1, H * W, 1]
            context_mask = context_mask.unsqueeze(-1)
            # [N, 1, C, 1]
            context = torch.matmul(input_x, context_mask)
            # [N, C, 1, 1]
            context = context.view(batch, channel, 1, 1)
        else:
            # [N, C, 1, 1]
            context = self.avg_pool(x)

        return context

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Fuse the pooled context back into ``x``.

        The multiplicative term (if enabled) is applied before the additive
        term (if enabled).

        Args:
            x (torch.Tensor): Input of shape [N, C, H, W].

        Returns:
            torch.Tensor: Output with the same shape as ``x``.
        """
        # [N, C, 1, 1]
        context = self.spatial_pool(x)

        out = x
        if self.channel_mul_conv is not None:
            # [N, C, 1, 1]
            channel_mul_term = torch.sigmoid(self.channel_mul_conv(context))
            out = out * channel_mul_term
        if self.channel_add_conv is not None:
            # [N, C, 1, 1]
            channel_add_term = self.channel_add_conv(context)
            out = out + channel_add_term

        return out


================================================
FILE: mmcv/cnn/bricks/conv.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import inspect
from typing import Dict, Optional

from mmengine.registry import MODELS
from torch import nn

# Register PyTorch's convolution classes under explicit names so they can be
# looked up by ``build_conv_layer``. ``'Conv'`` is an alias for ``nn.Conv2d``.
MODELS.register_module('Conv1d', module=nn.Conv1d)
MODELS.register_module('Conv2d', module=nn.Conv2d)
MODELS.register_module('Conv3d', module=nn.Conv3d)
MODELS.register_module('Conv', module=nn.Conv2d)


def build_conv_layer(cfg: Optional[Dict], *args, **kwargs) -> nn.Module:
    """Build convolution layer.

    Args:
        cfg (None or dict): The conv layer config, which should contain:
            - type (str or class): Layer type. A class is instantiated
              directly; a string is looked up in the registry.
            - layer args: Args needed to instantiate an conv layer.
            ``None`` is treated as ``dict(type='Conv2d')``. The passed dict
            is not modified.
        args (argument list): Arguments passed to the `__init__`
            method of the corresponding conv layer.
        kwargs (keyword arguments): Keyword arguments passed to the `__init__`
            method of the corresponding conv layer.

    Returns:
        nn.Module: Created conv layer.

    Raises:
        TypeError: If ``cfg`` is neither ``None`` nor a dict.
        KeyError: If ``cfg`` has no ``"type"`` key, or the requested type
            cannot be found in the registry.
    """
    if cfg is None:
        cfg_ = dict(type='Conv2d')
    else:
        if not isinstance(cfg, dict):
            raise TypeError('cfg must be a dict')
        if 'type' not in cfg:
            raise KeyError('the cfg dict must contain the key "type"')
        # Work on a shallow copy so that popping 'type' does not alter the
        # caller's config.
        cfg_ = cfg.copy()

    layer_type = cfg_.pop('type')
    if inspect.isclass(layer_type):
        return layer_type(*args, **kwargs, **cfg_)  # type: ignore
    # Switch registry to the target scope. If `conv_layer` cannot be found
    # in the registry, fallback to search `conv_layer` in the
    # mmengine.MODELS.
    with MODELS.switch_scope_and_registry(None) as registry:
        conv_layer = registry.get(layer_type)
    if conv_layer is None:
        # Report the requested type name; ``conv_layer`` is always None here.
        raise KeyError(f'Cannot find {layer_type} in registry under scope '
                       f'name {registry.scope}')
    layer = conv_layer(*args, **kwargs, **cfg_)

    return layer


================================================
FILE: mmcv/cnn/bricks/conv2d_adaptive_padding.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import math
from typing import Tuple, Union

import torch
from mmengine.registry import MODELS
from torch import nn
from torch.nn import functional as F


@MODELS.register_module()
class Conv2dAdaptivePadding(nn.Conv2d):
    """2D convolution with TensorFlow-style "same" padding.

    The amount of padding is derived from the input size on every forward
    pass, so that the filter fully covers the input for the configured
    stride and dilation. With stride 1 the output has the same spatial size
    as the input; with stride 2 it is halved (rounded up), and so on.

    Args:
        in_channels (int): Number of channels in the input image
        out_channels (int): Number of channels produced by the convolution
        kernel_size (int or tuple): Size of the convolving kernel
        stride (int or tuple, optional): Stride of the convolution. Default: 1
        padding (int or tuple, optional): Accepted for signature
            compatibility only; the value is ignored and the underlying
            convolution is always created with padding 0. Default: 0
        dilation (int or tuple, optional): Spacing between kernel elements.
            Default: 1
        groups (int, optional): Number of blocked connections from input
            channels to output channels. Default: 1
        bias (bool, optional): If ``True``, adds a learnable bias to the
            output. Default: ``True``
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: Union[int, Tuple[int, int]],
                 stride: Union[int, Tuple[int, int]] = 1,
                 padding: Union[int, Tuple[int, int]] = 0,
                 dilation: Union[int, Tuple[int, int]] = 1,
                 groups: int = 1,
                 bias: bool = True):
        # Padding is applied explicitly in ``forward``, hence 0 here.
        super().__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=0,
            dilation=dilation,
            groups=groups,
            bias=bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pad ``x`` as needed and apply the convolution.

        Args:
            x (torch.Tensor): Input whose last two dimensions are the
                spatial height and width.

        Returns:
            torch.Tensor: Convolution output.
        """
        img_h, img_w = x.size()[-2:]
        kernel_h, kernel_w = self.weight.size()[-2:]
        stride_h, stride_w = self.stride
        dilation_h, dilation_w = self.dilation

        # Target output size of "same" padding.
        output_h = math.ceil(img_h / stride_h)
        output_w = math.ceil(img_w / stride_w)

        # Total padding required along each axis, never negative.
        pad_h = max(
            (output_h - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 -
            img_h, 0)
        pad_w = max(
            (output_w - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 -
            img_w, 0)

        if pad_h > 0 or pad_w > 0:
            # When the total is odd, the extra pixel goes to the right/bottom.
            pad_left, pad_top = pad_w // 2, pad_h // 2
            x = F.pad(
                x, [pad_left, pad_w - pad_left, pad_top, pad_h - pad_top])
        return F.conv2d(x, self.weight, self.bias, self.stride, self.padding,
                        self.dilation, self.groups)


================================================
FILE: mmcv/cnn/bricks/conv_module.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from functools import partial
from typing import Dict, Optional, Tuple, Union

import torch
import torch.nn as nn
from mmengine.model import constant_init, kaiming_init
from mmengine.registry import MODELS
from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm

from .activation import build_activation_layer
from .conv import build_conv_layer
from .norm import build_norm_layer
from .padding import build_padding_layer


def efficient_conv_bn_eval_forward(bn: _BatchNorm,
                                   conv: nn.modules.conv._ConvNd,
                                   x: torch.Tensor):
    """Run ``conv`` followed by eval-mode ``bn`` as a single convolution.

    Implementation based on https://arxiv.org/abs/2305.11624
    "Tune-Mode ConvBN Blocks For Efficient Transfer Learning".
    The BatchNorm affine transform (built from the running statistics) is
    folded into the convolution weight and bias on the fly, i.e.
    normalize (weight conv feature) = (normalize weight) conv feature, and
    the folded parameters are passed to ``conv._conv_forward``.

    Args:
        bn (_BatchNorm): a BatchNorm module whose running statistics are
            used.
        conv (nn._ConvNd): a conv module
        x (torch.Tensor): Input feature map.

    Returns:
        torch.Tensor: Result of the fused convolution.
    """
    running_var = bn.running_var

    # Substitute neutral values for a conv without bias and for a bn without
    # affine parameters.
    conv_bias = (
        conv.bias if conv.bias is not None else torch.zeros_like(running_var))
    bn_scale = (
        bn.weight if bn.weight is not None else torch.ones_like(running_var))
    bn_shift = (
        bn.bias if bn.bias is not None else torch.zeros_like(running_var))

    # One entry per output channel, broadcastable against the conv weight:
    # shape of [C_out, 1, 1, 1] in Conv2d
    broadcast_shape = [-1] + [1] * (len(conv.weight.shape) - 1)
    inv_std = torch.rsqrt(running_var + bn.eps).reshape(broadcast_shape)
    # shape of [C_out, 1, 1, 1] in Conv2d
    scale = bn_scale.view_as(inv_std) * inv_std

    # shape of [C_out, C_in, k, k] in Conv2d
    fused_weight = conv.weight * scale
    # shape of [C_out] in Conv2d
    fused_bias = bn_shift + scale.flatten() * (conv_bias - bn.running_mean)

    return conv._conv_forward(x, fused_weight, fused_bias)


@MODELS.register_module()
class ConvModule(nn.Module):
    """A conv block that bundles conv/norm/activation layers.

    This block simplifies the usage of convolution layers, which are commonly
    used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
    It is based upon three build methods: `build_conv_layer()`,
    `build_norm_layer()` and `build_activation_layer()`.

    Besides, we add some additional features in this module.
    1. Automatically set `bias` of the conv layer.
    2. Spectral norm is supported.
    3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only
    supports zero and circular padding, and we add "reflect" padding mode.

    Args:
        in_channels (int): Number of channels in the input feature map.
            Same as that in ``nn._ConvNd``.
        out_channels (int): Number of channels produced by the convolution.
            Same as that in ``nn._ConvNd``.
        kernel_size (int | tuple[int]): Size of the convolving kernel.
            Same as that in ``nn._ConvNd``.
        stride (int | tuple[int]): Stride of the convolution.
            Same as that in ``nn._ConvNd``.
        padding (int | tuple[int]): Zero-padding added to both sides of
            the input. Same as that in ``nn._ConvNd``.
        dilation (int | tuple[int]): Spacing between kernel elements.
            Same as that in ``nn._ConvNd``.
        groups (int): Number of blocked connections from input channels to
            output channels. Same as that in ``nn._ConvNd``.
        bias (bool | str): If specified as `auto`, it will be decided by the
            norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise
            False. Default: "auto".
        conv_cfg (dict): Config dict for convolution layer. Default: None,
            which means using conv2d.
        norm_cfg (dict): Config dict for normalization layer. Default: None.
        act_cfg (dict): Config dict for activation layer.
            Default: dict(type='ReLU').
        inplace (bool): Whether to use inplace mode for activation.
            Default: True.
        with_spectral_norm (bool): Whether use spectral norm in conv module.
            Default: False.
        padding_mode (str): If the `padding_mode` has not been supported by
            current `Conv2d` in PyTorch, we will use our own padding layer
            instead. Currently, we support ['zeros', 'circular'] with official
            implementation and ['reflect'] with our own implementation.
            Default: 'zeros'.
        order (tuple[str]): The order of conv/norm/activation layers. It is a
            sequence of "conv", "norm" and "act". Common examples are
            ("conv", "norm", "act") and ("act", "conv", "norm").
            Default: ('conv', 'norm', 'act').
        efficient_conv_bn_eval (bool): Whether use efficient conv when the
            consecutive bn is in eval mode (either training or testing), as
            proposed in https://arxiv.org/abs/2305.11624 . Default: `False`.
    """

    _abbr_ = 'conv_block'

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: Union[int, Tuple[int, int]],
                 stride: Union[int, Tuple[int, int]] = 1,
                 padding: Union[int, Tuple[int, int]] = 0,
                 dilation: Union[int, Tuple[int, int]] = 1,
                 groups: int = 1,
                 bias: Union[bool, str] = 'auto',
                 conv_cfg: Optional[Dict] = None,
                 norm_cfg: Optional[Dict] = None,
                 act_cfg: Optional[Dict] = dict(type='ReLU'),
                 inplace: bool = True,
                 with_spectral_norm: bool = False,
                 padding_mode: str = 'zeros',
                 order: tuple = ('conv', 'norm', 'act'),
                 efficient_conv_bn_eval: bool = False):
        super().__init__()
        assert conv_cfg is None or isinstance(conv_cfg, dict)
        assert norm_cfg is None or isinstance(norm_cfg, dict)
        assert act_cfg is None or isinstance(act_cfg, dict)
        official_padding_mode = ['zeros', 'circular']
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.inplace = inplace
        self.with_spectral_norm = with_spectral_norm
        self.with_explicit_padding = padding_mode not in official_padding_mode
        self.order = order
        assert isinstance(self.order, tuple) and len(self.order) == 3
        assert set(order) == {'conv', 'norm', 'act'}

        self.with_norm = norm_cfg is not None
        self.with_activation = act_cfg is not None
        # if the conv layer is before a norm layer, bias is unnecessary.
        if bias == 'auto':
            bias = not self.with_norm
        self.with_bias = bias

        if self.with_explicit_padding:
            pad_cfg = dict(type=padding_mode)
            self.padding_layer = build_padding_layer(pad_cfg, padding)

        # reset padding to 0 for conv module
        conv_padding = 0 if self.with_explicit_padding else padding
        # build convolution layer
        self.conv = build_conv_layer(
            conv_cfg,
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=conv_padding,
            dilation=dilation,
            groups=groups,
            bias=bias)
        # export the attributes of self.conv to a higher level for convenience
        self.in_channels = self.conv.in_channels
        self.out_channels = self.conv.out_channels
        self.kernel_size = self.conv.kernel_size
        self.stride = self.conv.stride
        # ``padding`` keeps the value requested by the caller, even when the
        # conv itself was built with padding 0 because of explicit padding.
        self.padding = padding
        self.dilation = self.conv.dilation
        self.transposed = self.conv.transposed
        self.output_padding = self.conv.output_padding
        self.groups = self.conv.groups

        if self.with_spectral_norm:
            self.conv = nn.utils.spectral_norm(self.conv)

        # build normalization layers
        if self.with_norm:
            # norm layer is after conv layer
            if order.index('norm') > order.index('conv'):
                norm_channels = out_channels
            else:
                norm_channels = in_channels
            self.norm_name, norm = build_norm_layer(
                norm_cfg, norm_channels)  # type: ignore
            self.add_module(self.norm_name, norm)
            if self.with_bias:
                if isinstance(norm, (_BatchNorm, _InstanceNorm)):
                    warnings.warn(
                        'Unnecessary conv bias before batch/instance norm')
        else:
            self.norm_name = None  # type: ignore

        self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval)

        # build activation layer
        if self.with_activation:
            act_cfg_ = act_cfg.copy()  # type: ignore
            # nn.Tanh has no 'inplace' argument
            if act_cfg_['type'] not in [
                    'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish', 'GELU'
            ]:
                act_cfg_.setdefault('inplace', inplace)
            self.activate = build_activation_layer(act_cfg_)

        # Use msra init by default
        self.init_weights()

    @property
    def norm(self):
        """The norm layer registered under ``self.norm_name``, or None."""
        if self.norm_name:
            return getattr(self, self.norm_name)
        else:
            return None

    def init_weights(self):
        """Initialize the conv layer and, if present, the norm layer."""
        # 1. It is mainly for customized conv layers with their own
        #    initialization manners by calling their own ``init_weights()``,
        #    and we do not want ConvModule to override the initialization.
        # 2. For customized conv layers without their own initialization
        #    manners (that is, they don't have their own ``init_weights()``)
        #    and PyTorch's conv layers, they will be initialized by
        #    this method with default ``kaiming_init``.
        # Note: For PyTorch's conv layers, they will be overwritten by our
        #    initialization implementation using default ``kaiming_init``.
        if not hasattr(self.conv, 'init_weights'):
            if self.with_activation and self.act_cfg['type'] == 'LeakyReLU':
                nonlinearity = 'leaky_relu'
                a = self.act_cfg.get('negative_slope', 0.01)
            else:
                nonlinearity = 'relu'
                a = 0
            kaiming_init(self.conv, a=a, nonlinearity=nonlinearity)
        if self.with_norm:
            constant_init(self.norm, 1, bias=0)

    def forward(self,
                x: torch.Tensor,
                activate: bool = True,
                norm: bool = True) -> torch.Tensor:
        """Apply the layers in ``self.order``.

        Args:
            x (torch.Tensor): Input feature map.
            activate (bool): If False, the activation layer is skipped.
                Default: True.
            norm (bool): If False, the norm layer is skipped. Default: True.

        Returns:
            torch.Tensor: Output feature map.
        """
        layer_index = 0
        while layer_index < len(self.order):
            layer = self.order[layer_index]
            if layer == 'conv':
                if self.with_explicit_padding:
                    x = self.padding_layer(x)
                # if the next operation is norm and we have a norm layer in
                # eval mode and we have enabled `efficient_conv_bn_eval` for
                # the conv operator, then activate the optimized forward and
                # skip the next norm operator since it has been fused
                if layer_index + 1 < len(self.order) and \
                        self.order[layer_index + 1] == 'norm' and norm and \
                        self.with_norm and not self.norm.training and \
                        self.efficient_conv_bn_eval_forward is not None:
                    self.conv.forward = partial(
                        self.efficient_conv_bn_eval_forward, self.norm,
                        self.conv)
                    layer_index += 1
                    try:
                        x = self.conv(x)
                    finally:
                        # Always remove the temporary override, even when the
                        # call raises; otherwise later calls (including ones
                        # in training mode) would keep using the fused
                        # forward.
                        del self.conv.forward
                else:
                    x = self.conv(x)
            elif layer == 'norm' and norm and self.with_norm:
                x = self.norm(x)
            elif layer == 'act' and activate and self.with_activation:
                x = self.activate(x)
            layer_index += 1
        return x

    def turn_on_efficient_conv_bn_eval(self, efficient_conv_bn_eval=True):
        """Enable or disable the fused conv+bn forward used in eval mode.

        The fused forward is only enabled when the norm layer is a
        ``_BatchNorm`` with ``track_running_stats``; in every other case it
        is disabled.

        Args:
            efficient_conv_bn_eval (bool): Whether to request the fused
                forward. Default: True.
        """
        # efficient_conv_bn_eval works for conv + bn
        # with `track_running_stats` option
        if efficient_conv_bn_eval and self.norm \
                            and isinstance(self.norm, _BatchNorm) \
                            and self.norm.track_running_stats:
            self.efficient_conv_bn_eval_forward = efficient_conv_bn_eval_forward  # noqa: E501
        else:
            self.efficient_conv_bn_eval_forward = None  # type: ignore

    @staticmethod
    def create_from_conv_bn(conv: torch.nn.modules.conv._ConvNd,
                            bn: torch.nn.modules.batchnorm._BatchNorm,
                            efficient_conv_bn_eval=True) -> 'ConvModule':
        """Create a ConvModule from a conv and a bn module.

        The given modules are reused as-is (no copy, no re-initialization);
        ``__init__`` is bypassed and the resulting module has no activation.

        Args:
            conv (nn._ConvNd): Existing convolution module.
            bn (_BatchNorm): Existing BatchNorm module, registered under the
                name ``'bn'``.
            efficient_conv_bn_eval (bool): Forwarded to
                ``turn_on_efficient_conv_bn_eval``. Default: True.

        Returns:
            ConvModule: The wrapping module.
        """
        self = ConvModule.__new__(ConvModule)
        super(ConvModule, self).__init__()

        self.conv_cfg = None
        self.norm_cfg = None
        self.act_cfg = None
        self.inplace = False
        self.with_spectral_norm = False
        self.with_explicit_padding = False
        self.order = ('conv', 'norm', 'act')

        self.with_norm = True
        self.with_activation = False
        self.with_bias = conv.bias is not None

        # build convolution layer
        self.conv = conv
        # export the attributes of self.conv to a higher level for convenience
        self.in_channels = self.conv.in_channels
        self.out_channels = self.conv.out_channels
        self.kernel_size = self.conv.kernel_size
        self.stride = self.conv.stride
        self.padding = self.conv.padding
        self.dilation = self.conv.dilation
        self.transposed = self.conv.transposed
        self.output_padding = self.conv.output_padding
        self.groups = self.conv.groups

        # build normalization layers
        self.norm_name, norm = 'bn', bn
        self.add_module(self.norm_name, norm)

        self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval)

        return self


================================================
FILE: mmcv/cnn/bricks/conv_ws.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from collections import OrderedDict
from typing import Dict, List, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmengine.registry import MODELS


def conv_ws_2d(input: torch.Tensor,
               weight: torch.Tensor,
               bias: Optional[torch.Tensor] = None,
               stride: Union[int, Tuple[int, int]] = 1,
               padding: Union[int, Tuple[int, int]] = 0,
               dilation: Union[int, Tuple[int, int]] = 1,
               groups: int = 1,
               eps: float = 1e-5) -> torch.Tensor:
    """2D convolution with a standardized weight.

    Each filter (slice along the first weight dimension) is shifted by its
    mean and divided by ``std + eps`` before ``F.conv2d`` is applied.

    Args:
        input (torch.Tensor): Input passed to ``F.conv2d``.
        weight (torch.Tensor): 4D convolution weight to standardize.
        bias (torch.Tensor, optional): Convolution bias. Default: None.
        stride (int | tuple[int]): Stride of the convolution. Default: 1.
        padding (int | tuple[int]): Padding of the convolution. Default: 0.
        dilation (int | tuple[int]): Dilation of the convolution. Default: 1.
        groups (int): Number of groups. Default: 1.
        eps (float): Value added to the standard deviation to avoid a
            division by zero. Default: 1e-5.

    Returns:
        torch.Tensor: Convolution output.
    """
    num_filters = weight.size(0)
    stat_shape = (num_filters, 1, 1, 1)
    # One row per filter so that the statistics are taken per filter.
    per_filter = weight.view(num_filters, -1)
    filter_mean = per_filter.mean(dim=1, keepdim=True).view(stat_shape)
    filter_std = per_filter.std(dim=1, keepdim=True).view(stat_shape)
    standardized = (weight - filter_mean) / (filter_std + eps)
    return F.conv2d(input, standardized, bias, stride, padding, dilation,
                    groups)


@MODELS.register_module('ConvWS')
class ConvWS2d(nn.Conv2d):
    """``nn.Conv2d`` whose weight is standardized on every forward pass.

    The forward computation is delegated to ``conv_ws_2d``.

    Args:
        in_channels (int): Number of channels in the input image
        out_channels (int): Number of channels produced by the convolution
        kernel_size (int or tuple): Size of the conv kernel
        stride (int or tuple, optional): Stride of the convolution. Default: 1
        padding (int or tuple, optional): Zero-padding added to both sides of
            the input. Default: 0
        dilation (int or tuple, optional): Spacing between kernel elements.
            Default: 1
        groups (int, optional): Number of blocked connections from input
            channels to output channels. Default: 1
        bias (bool, optional): If set True, adds a learnable bias to the
            output. Default: True
        eps (float, optional): Value passed to ``conv_ws_2d`` as ``eps``.
            Default: 1e-5
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: Union[int, Tuple[int, int]],
                 stride: Union[int, Tuple[int, int]] = 1,
                 padding: Union[int, Tuple[int, int]] = 0,
                 dilation: Union[int, Tuple[int, int]] = 1,
                 groups: int = 1,
                 bias: bool = True,
                 eps: float = 1e-5):
        conv_kwargs = dict(
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias)
        super().__init__(in_channels, out_channels, kernel_size,
                         **conv_kwargs)
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Convolve ``x`` with the standardized weight."""
        return conv_ws_2d(
            x,
            self.weight,
            bias=self.bias,
            stride=self.stride,
            padding=self.padding,
            dilation=self.dilation,
            groups=self.groups,
            eps=self.eps)


@MODELS.register_module(name='ConvAWS')
class ConvAWS2d(nn.Conv2d):
    """AWS (Adaptive Weight Standardization)

    This is a variant of Weight Standardization
    (https://arxiv.org/pdf/1903.10520.pdf)
    It is used in DetectoRS to avoid NaN
    (https://arxiv.org/pdf/2006.02334.pdf)

    The standardized weight is rescaled by the ``weight_gamma`` buffer and
    shifted by the ``weight_beta`` buffer (one value per output channel).

    Args:
        in_channels (int): Number of channels in the input image
        out_channels (int): Number of channels produced by the convolution
        kernel_size (int or tuple): Size of the conv kernel
        stride (int or tuple, optional): Stride of the convolution. Default: 1
        padding (int or tuple, optional): Zero-padding added to both sides of
            the input. Default: 0
        dilation (int or tuple, optional): Spacing between kernel elements.
            Default: 1
        groups (int, optional): Number of blocked connections from input
            channels to output channels. Default: 1
        bias (bool, optional): If set True, adds a learnable bias to the
            output. Default: True
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: Union[int, Tuple[int, int]],
                 stride: Union[int, Tuple[int, int]] = 1,
                 padding: Union[int, Tuple[int, int]] = 0,
                 dilation: Union[int, Tuple[int, int]] = 1,
                 groups: int = 1,
                 bias: bool = True):
        conv_kwargs = dict(
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias)
        super().__init__(in_channels, out_channels, kernel_size,
                         **conv_kwargs)
        per_channel_shape = (self.out_channels, 1, 1, 1)
        self.register_buffer('weight_gamma', torch.ones(per_channel_shape))
        self.register_buffer('weight_beta', torch.zeros(per_channel_shape))

    @staticmethod
    def _mean_and_std(weight: torch.Tensor
                      ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return per-filter mean and std, each reshaped to (-1, 1, 1, 1)."""
        per_filter = weight.view(weight.size(0), -1)
        mean = per_filter.mean(dim=1).view(-1, 1, 1, 1)
        # 1e-5 is added to the variance before the square root.
        std = torch.sqrt(per_filter.var(dim=1) + 1e-5).view(-1, 1, 1, 1)
        return mean, std

    def _get_weight(self, weight: torch.Tensor) -> torch.Tensor:
        """Standardize ``weight`` and apply ``weight_gamma``/``weight_beta``."""
        mean, std = self._mean_and_std(weight)
        standardized = (weight - mean) / std
        return self.weight_gamma * standardized + self.weight_beta

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Convolve ``x`` with the adaptively standardized weight."""
        return F.conv2d(x, self._get_weight(self.weight), self.bias,
                        self.stride, self.padding, self.dilation, self.groups)

    def _load_from_state_dict(self, state_dict: OrderedDict, prefix: str,
                              local_metadata: Dict, strict: bool,
                              missing_keys: List[str],
                              unexpected_keys: List[str],
                              error_msgs: List[str]) -> None:
        """Override default load function.

        AWS overrides the function _load_from_state_dict to recover
        weight_gamma and weight_beta if they are missing. If weight_gamma and
        weight_beta are found in the checkpoint, this function will return
        after super()._load_from_state_dict. Otherwise, it will compute the
        mean and std of the pretrained weights and store them in weight_beta
        and weight_gamma.
        """
        # Sentinel: weight_gamma is set to -1 before loading, and a
        # non-positive mean afterwards is taken to mean that the checkpoint
        # did not provide it.
        self.weight_gamma.data.fill_(-1)
        local_missing_keys: List = []
        super()._load_from_state_dict(state_dict, prefix, local_metadata,
                                      strict, local_missing_keys,
                                      unexpected_keys, error_msgs)
        if self.weight_gamma.data.mean() > 0:
            missing_keys.extend(local_missing_keys)
            return

        # Recover the buffers from the statistics of the loaded weight.
        mean, std = self._mean_and_std(self.weight.data)
        self.weight_beta.data.copy_(mean)
        self.weight_gamma.data.copy_(std)
        # The recovered buffers are no longer reported as missing; every
        # other missing key is still reported.
        recovered_suffixes = ('weight_gamma', 'weight_beta')
        missing_keys.extend(k for k in local_missing_keys
                            if not k.endswith(recovered_suffixes))


================================================
FILE: mmcv/cnn/bricks/depthwise_separable_conv_module.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, Optional, Tuple, Union

import torch
import torch.nn as nn

from .conv_module import ConvModule


class DepthwiseSeparableConvModule(nn.Module):
    """Depthwise separable convolution built from two ``ConvModule`` blocks.

    See https://arxiv.org/pdf/1704.04861.pdf for details.

    The module is a drop-in replacement for a single ConvModule: the input
    first goes through a depthwise block (a ConvModule with
    ``groups=in_channels`` that keeps the channel count) and then through a
    pointwise block (a 1x1 ConvModule mapping ``in_channels`` to
    ``out_channels``). Each block gets its own norm/activation config, so a
    norm/activation layer is also present in the depthwise block whenever
    `norm_cfg` and `act_cfg` are specified.

    Args:
        in_channels (int): Number of channels in the input feature map.
            Same as that in ``nn._ConvNd``.
        out_channels (int): Number of channels produced by the convolution.
            Same as that in ``nn._ConvNd``.
        kernel_size (int | tuple[int]): Size of the depthwise kernel.
            Same as that in ``nn._ConvNd``.
        stride (int | tuple[int]): Stride of the depthwise convolution.
            Same as that in ``nn._ConvNd``. Default: 1.
        padding (int | tuple[int]): Zero-padding added to both sides of
            the input of the depthwise convolution. Same as that in
            ``nn._ConvNd``. Default: 0.
        dilation (int | tuple[int]): Spacing between kernel elements of the
            depthwise convolution. Same as that in ``nn._ConvNd``.
            Default: 1.
        norm_cfg (dict): Fallback norm config used by both blocks.
            Default: None.
        act_cfg (dict): Fallback activation config used by both blocks.
            Default: dict(type='ReLU').
        dw_norm_cfg (dict): Norm config of the depthwise block. The string
            'default' means "use `norm_cfg`". Default: 'default'.
        dw_act_cfg (dict): Activation config of the depthwise block. The
            string 'default' means "use `act_cfg`". Default: 'default'.
        pw_norm_cfg (dict): Norm config of the pointwise block. The string
            'default' means "use `norm_cfg`". Default: 'default'.
        pw_act_cfg (dict): Activation config of the pointwise block. The
            string 'default' means "use `act_cfg`". Default: 'default'.
        kwargs (optional): Extra arguments forwarded to both ConvModule
            instances. ``groups`` is rejected because the depthwise block
            sets it itself. See ConvModule for ref.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: Union[int, Tuple[int, int]],
                 stride: Union[int, Tuple[int, int]] = 1,
                 padding: Union[int, Tuple[int, int]] = 0,
                 dilation: Union[int, Tuple[int, int]] = 1,
                 norm_cfg: Optional[Dict] = None,
                 act_cfg: Dict = dict(type='ReLU'),
                 dw_norm_cfg: Union[Dict, str] = 'default',
                 dw_act_cfg: Union[Dict, str] = 'default',
                 pw_norm_cfg: Union[Dict, str] = 'default',
                 pw_act_cfg: Union[Dict, str] = 'default',
                 **kwargs):
        super().__init__()
        assert 'groups' not in kwargs, 'groups should not be specified'

        # Replace every 'default' placeholder by the shared fallback config.
        if dw_norm_cfg == 'default':
            dw_norm_cfg = norm_cfg  # type: ignore
        if dw_act_cfg == 'default':
            dw_act_cfg = act_cfg
        if pw_norm_cfg == 'default':
            pw_norm_cfg = norm_cfg  # type: ignore
        if pw_act_cfg == 'default':
            pw_act_cfg = act_cfg

        # Depthwise stage: groups == in_channels, channel count unchanged.
        # It is the only stage that receives the spatial hyper-parameters.
        depthwise = ConvModule(
            in_channels,
            in_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=in_channels,
            norm_cfg=dw_norm_cfg,  # type: ignore
            act_cfg=dw_act_cfg,  # type: ignore
            **kwargs)
        self.depthwise_conv = depthwise

        # Pointwise stage: 1x1 convolution that changes the channel count.
        pointwise = ConvModule(
            in_channels,
            out_channels,
            1,
            norm_cfg=pw_norm_cfg,  # type: ignore
            act_cfg=pw_act_cfg,  # type: ignore
            **kwargs)
        self.pointwise_conv = pointwise

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the depthwise block followed by the pointwise block."""
        return self.pointwise_conv(self.depthwise_conv(x))


================================================
FILE: mmcv/cnn/bricks/drop.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Any, Dict, Optional

import torch
import torch.nn as nn
from mmengine.registry import MODELS


def drop_path(x: torch.Tensor,
              drop_prob: float = 0.,
              training: bool = False) -> torch.Tensor:
    """Drop paths (Stochastic Depth) per sample (when applied in main path of
    residual blocks).

    We follow the implementation
    https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py  # noqa: E501

    Args:
        x (torch.Tensor): Input tensor. One keep/drop decision is drawn per
            index along dim 0 and broadcast over all remaining dims.
        drop_prob (float): Probability of zeroing a sample, expected to lie
            in [0, 1]. Default: 0.
        training (bool): The input is returned untouched unless this is
            True. Default: False.

    Returns:
        torch.Tensor: ``x`` itself if ``drop_prob == 0`` or ``training`` is
        False. Otherwise a tensor where each kept sample is scaled by
        ``1 / (1 - drop_prob)`` and each dropped sample is zero.
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    # handle tensors with different dimensions, not just 4D tensors.
    shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
    random_tensor = keep_prob + torch.rand(
        shape, dtype=x.dtype, device=x.device)
    # floor() turns ``keep_prob + U[0, 1)`` into a 0/1 keep mask.
    keep_mask = random_tensor.floor()
    if keep_prob > 0.:
        # Rescale the kept samples. The rescale is skipped when keep_prob is
        # 0 (drop_prob == 1): dividing by zero there would turn the all-zero
        # output into NaN (inf * 0).
        x = x.div(keep_prob)
    output = x * keep_mask
    return output


@MODELS.register_module()
class DropPath(nn.Module):
    """Module form of ``drop_path``: Stochastic Depth applied per sample
    (when used in the main path of residual blocks).

    The implementation follows
    https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py  # noqa: E501

    Args:
        drop_prob (float): Probability of the path to be zeroed. Default: 0.1
    """

    def __init__(self, drop_prob: float = 0.1):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply ``drop_path`` using the stored probability and the module's
        current ``training`` flag."""
        return drop_path(x, drop_prob=self.drop_prob, training=self.training)


@MODELS.register_module()
class Dropout(nn.Dropout):
    """Thin subclass of ``torch.nn.Dropout`` whose probability argument is
    called ``drop_prob`` instead of ``p``, matching the argument name used
    by ``DropPath``.

    Args:
        drop_prob (float): Probability of the elements to be
            zeroed. Default: 0.5.
        inplace (bool):  Do the operation inplace or not. Default: False.
    """

    def __init__(self, drop_prob: float = 0.5, inplace: bool = False):
        # ``drop_prob`` is passed as nn.Dropout's first argument ``p``.
        super().__init__(drop_prob, inplace)


def build_dropout(cfg: Dict, default_args: Optional[Dict] = None) -> Any:
    """Build a dropout layer from ``cfg`` through the ``MODELS`` registry.

    Args:
        cfg (dict): Config passed to ``MODELS.build``.
        default_args (dict, optional): Forwarded to ``MODELS.build`` as
            ``default_args``. Default: None.

    Returns:
        Any: Whatever ``MODELS.build`` returns for ``cfg``.
    """
    dropout_layer = MODELS.build(cfg, default_args=default_args)
    return dropout_layer


================================================
FILE: mmcv/cnn/bricks/generalized_attention.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmengine.model import kaiming_init
from mmengine.registry import MODELS


@MODELS.register_module()
class GeneralizedAttention(nn.Module):
    """GeneralizedAttention module.

    See 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks'
    (https://arxiv.org/abs/1904.05873) for details.

    The block computes multi-head attention between a (optionally strided)
    query map and a (optionally strided) key/value map of the same input,
    projects the attended values back to ``in_channels`` and adds them to
    the input scaled by a learnable ``gamma`` that is initialised to zero.

    Args:
        in_channels (int): Channels of the input feature map.
        spatial_range (int): The spatial range. -1 indicates no spatial range
            constraint. A value >= 0 is only supported when ``in_channels``
            is 256 or 512. Default: -1.
        num_heads (int): The head number of empirical_attention module.
            Default: 9.
        position_embedding_dim (int): The position embedding dimension.
            Non-positive values fall back to ``in_channels``. Default: -1.
        position_magnitude (int): A multiplier acting on coord difference.
            Default: 1.
        kv_stride (int): The feature stride acting on key/value feature map.
            Default: 2.
        q_stride (int): The feature stride acting on query feature map.
            Default: 1.
        attention_type (str): A binary indicator string for indicating which
            items in generalized empirical_attention module are used.
            Default: '1111'.

            - '1000' indicates 'query and key content' (appr - appr) item,
            - '0100' indicates 'query content and relative position'
              (appr - position) item,
            - '0010' indicates 'key content only' (bias - appr) item,
            - '0001' indicates 'relative position only' (bias - position) item.

    Raises:
        ValueError: If ``spatial_range >= 0`` and ``in_channels`` is neither
            256 nor 512.
    """

    _abbr_ = 'gen_attention_block'

    def __init__(self,
                 in_channels: int,
                 spatial_range: int = -1,
                 num_heads: int = 9,
                 position_embedding_dim: int = -1,
                 position_magnitude: int = 1,
                 kv_stride: int = 2,
                 q_stride: int = 1,
                 attention_type: str = '1111'):

        super().__init__()

        # hard range means local range for non-local operation
        self.position_embedding_dim = (
            position_embedding_dim
            if position_embedding_dim > 0 else in_channels)

        self.position_magnitude = position_magnitude
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.spatial_range = spatial_range
        self.kv_stride = kv_stride
        self.q_stride = q_stride
        # One boolean per attention term, in the order documented above.
        self.attention_type = [bool(int(_)) for _ in attention_type]
        self.qk_embed_dim = in_channels // num_heads
        # Total query/key embedding width; may be smaller than in_channels
        # when in_channels is not divisible by num_heads.
        out_c = self.qk_embed_dim * num_heads

        # The query projection is needed by both query-dependent terms.
        if self.attention_type[0] or self.attention_type[1]:
            self.query_conv = nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_c,
                kernel_size=1,
                bias=False)
            # Flag read by ``init_weights``.
            self.query_conv.kaiming_init = True

        # The key projection is needed by both key-content terms.
        if self.attention_type[0] or self.attention_type[2]:
            self.key_conv = nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_c,
                kernel_size=1,
                bias=False)
            self.key_conv.kaiming_init = True

        self.v_dim = in_channels // num_heads
        self.value_conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=self.v_dim * num_heads,
            kernel_size=1,
            bias=False)
        self.value_conv.kaiming_init = True

        # Linear maps from the sinusoidal position embedding (x and y halves)
        # to the query/key embedding space.
        if self.attention_type[1] or self.attention_type[3]:
            self.appr_geom_fc_x = nn.Linear(
                self.position_embedding_dim // 2, out_c, bias=False)
            self.appr_geom_fc_x.kaiming_init = True

            self.appr_geom_fc_y = nn.Linear(
                self.position_embedding_dim // 2, out_c, bias=False)
            self.appr_geom_fc_y.kaiming_init = True

        # Learnable biases, initialised uniformly in (-stdv, stdv].
        if self.attention_type[2]:
            stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
            appr_bias_value = -2 * stdv * torch.rand(out_c) + stdv
            self.appr_bias = nn.Parameter(appr_bias_value)

        if self.attention_type[3]:
            stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
            geom_bias_value = -2 * stdv * torch.rand(out_c) + stdv
            self.geom_bias = nn.Parameter(geom_bias_value)

        self.proj_conv = nn.Conv2d(
            in_channels=self.v_dim * num_heads,
            out_channels=in_channels,
            kernel_size=1,
            bias=True)
        self.proj_conv.kaiming_init = True
        # Residual scale; zero means the block starts as an identity mapping.
        self.gamma = nn.Parameter(torch.zeros(1))

        if self.spatial_range >= 0:
            # only works when non local is after 3*3 conv
            if in_channels == 256:
                max_len = 84
            elif in_channels == 512:
                max_len = 42
            else:
                # Without this branch ``max_len`` would be unbound below and
                # the constructor would die with an opaque UnboundLocalError.
                raise ValueError(
                    'spatial_range >= 0 is only supported when in_channels '
                    f'is 256 or 512, but got in_channels={in_channels}.')

            max_len_kv = int((max_len - 1.0) / self.kv_stride + 1)
            # 1 marks a (query, key) pair that is masked out; the loop below
            # clears the entries inside the local window of each query.
            local_constraint_map = np.ones(
                (max_len, max_len, max_len_kv, max_len_kv), dtype=int)
            for iy in range(max_len):
                for ix in range(max_len):
                    local_constraint_map[
                        iy, ix,
                        max((iy - self.spatial_range) //
                            self.kv_stride, 0):min((iy + self.spatial_range +
                                                    1) // self.kv_stride +
                                                   1, max_len),
                        max((ix - self.spatial_range) //
                            self.kv_stride, 0):min((ix + self.spatial_range +
                                                    1) // self.kv_stride +
                                                   1, max_len)] = 0

            # Stored as a non-trainable parameter so it follows the module
            # across devices and is part of the state dict.
            self.local_constraint_map = nn.Parameter(
                torch.from_numpy(local_constraint_map).byte(),
                requires_grad=False)

        # A kernel-size-1 average pool with stride s is plain subsampling.
        if self.q_stride > 1:
            self.q_downsample = nn.AvgPool2d(
                kernel_size=1, stride=self.q_stride)
        else:
            self.q_downsample = None

        if self.kv_stride > 1:
            self.kv_downsample = nn.AvgPool2d(
                kernel_size=1, stride=self.kv_stride)
        else:
            self.kv_downsample = None

        self.init_weights()

    def get_position_embedding(self,
                               h: int,
                               w: int,
                               h_kv: int,
                               w_kv: int,
                               q_stride: int,
                               kv_stride: int,
                               device: torch.device,
                               dtype: torch.dtype,
                               feat_dim: int,
                               wave_length: int = 1000) -> tuple:
        """Build sinusoidal embeddings of query/key coordinate differences.

        Args:
            h (int): Height of the query map.
            w (int): Width of the query map.
            h_kv (int): Height of the key/value map.
            w_kv (int): Width of the key/value map.
            q_stride (int): Multiplier applied to query indices.
            kv_stride (int): Multiplier applied to key/value indices.
            device (torch.device): Device of the returned tensors.
            dtype (torch.dtype): Dtype of the returned tensors.
            feat_dim (int): Embedding dimension; each returned tensor has
                ``2 * ceil(feat_dim / 4)`` features (sin and cos parts).
            wave_length (int): Base of the geometric frequency progression.
                Default: 1000.

        Returns:
            tuple: ``(embedding_x, embedding_y)`` with shapes
            ``(w, w_kv, C)`` and ``(h, h_kv, C)`` respectively.
        """
        # the default type of Tensor is float32, leading to type mismatch
        # in fp16 mode. Cast it to support fp16 mode.
        h_idxs = torch.linspace(0, h - 1, h).to(device=device, dtype=dtype)
        h_idxs = h_idxs.view((h, 1)) * q_stride

        w_idxs = torch.linspace(0, w - 1, w).to(device=device, dtype=dtype)
        w_idxs = w_idxs.view((w, 1)) * q_stride

        h_kv_idxs = torch.linspace(0, h_kv - 1, h_kv).to(
            device=device, dtype=dtype)
        h_kv_idxs = h_kv_idxs.view((h_kv, 1)) * kv_stride

        w_kv_idxs = torch.linspace(0, w_kv - 1, w_kv).to(
            device=device, dtype=dtype)
        w_kv_idxs = w_kv_idxs.view((w_kv, 1)) * kv_stride

        # (h, h_kv, 1)
        h_diff = h_idxs.unsqueeze(1) - h_kv_idxs.unsqueeze(0)
        h_diff *= self.position_magnitude

        # (w, w_kv, 1)
        w_diff = w_idxs.unsqueeze(1) - w_kv_idxs.unsqueeze(0)
        w_diff *= self.position_magnitude

        feat_range = torch.arange(0, feat_dim / 4).to(
            device=device, dtype=dtype)

        # Per-feature divisors: wave_length ** (4 * i / feat_dim).
        dim_mat = torch.Tensor([wave_length]).to(device=device, dtype=dtype)
        dim_mat = dim_mat**((4. / feat_dim) * feat_range)
        dim_mat = dim_mat.view((1, 1, -1))

        embedding_x = torch.cat(
            ((w_diff / dim_mat).sin(), (w_diff / dim_mat).cos()), dim=2)

        embedding_y = torch.cat(
            ((h_diff / dim_mat).sin(), (h_diff / dim_mat).cos()), dim=2)

        return embedding_x, embedding_y

    def forward(self, x_input: torch.Tensor) -> torch.Tensor:
        """Apply generalized attention with a residual connection.

        Args:
            x_input (torch.Tensor): 4-D input feature map; it is unpacked as
                ``(n, c, h, w)``.

        Returns:
            torch.Tensor: ``gamma * attention_output + x_input``, with the
            same shape as ``x_input``.
        """
        num_heads = self.num_heads

        # use empirical_attention
        if self.q_downsample is not None:
            x_q = self.q_downsample(x_input)
        else:
            x_q = x_input
        n, _, h, w = x_q.shape

        if self.kv_downsample is not None:
            x_kv = self.kv_downsample(x_input)
        else:
            x_kv = x_input
        _, _, h_kv, w_kv = x_kv.shape

        if self.attention_type[0] or self.attention_type[1]:
            # (n, num_heads, h*w, dim)
            proj_query = self.query_conv(x_q).view(
                (n, num_heads, self.qk_embed_dim, h * w))
            proj_query = proj_query.permute(0, 1, 3, 2)

        if self.attention_type[0] or self.attention_type[2]:
            # (n, num_heads, dim, h_kv*w_kv)
            proj_key = self.key_conv(x_kv).view(
                (n, num_heads, self.qk_embed_dim, h_kv * w_kv))

        if self.attention_type[1] or self.attention_type[3]:
            position_embed_x, position_embed_y = self.get_position_embedding(
                h, w, h_kv, w_kv, self.q_stride, self.kv_stride,
                x_input.device, x_input.dtype, self.position_embedding_dim)
            # (n, num_heads, w, w_kv, dim)
            position_feat_x = self.appr_geom_fc_x(position_embed_x).\
                view(1, w, w_kv, num_heads, self.qk_embed_dim).\
                permute(0, 3, 1, 2, 4).\
                repeat(n, 1, 1, 1, 1)

            # (n, num_heads, h, h_kv, dim)
            position_feat_y = self.appr_geom_fc_y(position_embed_y).\
                view(1, h, h_kv, num_heads, self.qk_embed_dim).\
                permute(0, 3, 1, 2, 4).\
                repeat(n, 1, 1, 1, 1)

            position_feat_x /= math.sqrt(2)
            position_feat_y /= math.sqrt(2)

        # accelerate for saliency only
        if (np.sum(self.attention_type) == 1) and self.attention_type[2]:
            appr_bias = self.appr_bias.\
                view(1, num_heads, 1, self.qk_embed_dim).\
                repeat(n, 1, 1, 1)

            energy = torch.matmul(appr_bias, proj_key).\
                view(n, num_heads, 1, h_kv * w_kv)

            # The energy does not depend on the query position, so a single
            # query location is used and broadcast in the residual sum.
            h = 1
            w = 1
        else:
            # (n, num_heads, h*w, h_kv*w_kv), query before key
            if not self.attention_type[0]:
                # No content-content term: start from zeros so the remaining
                # terms can be accumulated with ``+=``.
                energy = torch.zeros(
                    n,
                    num_heads,
                    h,
                    w,
                    h_kv,
                    w_kv,
                    dtype=x_input.dtype,
                    device=x_input.device)

            # attention_type[0]: appr - appr
            # attention_type[1]: appr - position
            # attention_type[2]: bias - appr
            # attention_type[3]: bias - position
            if self.attention_type[0] or self.attention_type[2]:
                if self.attention_type[0] and self.attention_type[2]:
                    appr_bias = self.appr_bias.\
                        view(1, num_heads, 1, self.qk_embed_dim)
                    energy = torch.matmul(proj_query + appr_bias, proj_key).\
                        view(n, num_heads, h, w, h_kv, w_kv)

                elif self.attention_type[0]:
                    energy = torch.matmul(proj_query, proj_key).\
                        view(n, num_heads, h, w, h_kv, w_kv)

                elif self.attention_type[2]:
                    appr_bias = self.appr_bias.\
                        view(1, num_heads, 1, self.qk_embed_dim).\
                        repeat(n, 1, 1, 1)

                    energy += torch.matmul(appr_bias, proj_key).\
                        view(n, num_heads, 1, 1, h_kv, w_kv)

            if self.attention_type[1] or self.attention_type[3]:
                if self.attention_type[1] and self.attention_type[3]:
                    geom_bias = self.geom_bias.\
                        view(1, num_heads, 1, self.qk_embed_dim)

                    proj_query_reshape = (proj_query + geom_bias).\
                        view(n, num_heads, h, w, self.qk_embed_dim)

                    energy_x = torch.matmul(
                        proj_query_reshape.permute(0, 1, 3, 2, 4),
                        position_feat_x.permute(0, 1, 2, 4, 3))
                    energy_x = energy_x.\
                        permute(0, 1, 3, 2, 4).unsqueeze(4)

                    energy_y = torch.matmul(
                        proj_query_reshape,
                        position_feat_y.permute(0, 1, 2, 4, 3))
                    energy_y = energy_y.unsqueeze(5)

                    energy += energy_x + energy_y

                elif self.attention_type[1]:
                    proj_query_reshape = proj_query.\
                        view(n, num_heads, h, w, self.qk_embed_dim)
                    proj_query_reshape = proj_query_reshape.\
                        permute(0, 1, 3, 2, 4)
                    position_feat_x_reshape = position_feat_x.\
                        permute(0, 1, 2, 4, 3)
                    position_feat_y_reshape = position_feat_y.\
                        permute(0, 1, 2, 4, 3)

                    energy_x = torch.matmul(proj_query_reshape,
                                            position_feat_x_reshape)
                    energy_x = energy_x.permute(0, 1, 3, 2, 4).unsqueeze(4)

                    energy_y = torch.matmul(proj_query_reshape,
                                            position_feat_y_reshape)
                    energy_y = energy_y.unsqueeze(5)

                    energy += energy_x + energy_y

                elif self.attention_type[3]:
                    geom_bias = self.geom_bias.\
                        view(1, num_heads, self.qk_embed_dim, 1).\
                        repeat(n, 1, 1, 1)

                    position_feat_x_reshape = position_feat_x.\
                        view(n, num_heads, w * w_kv, self.qk_embed_dim)

                    position_feat_y_reshape = position_feat_y.\
                        view(n, num_heads, h * h_kv, self.qk_embed_dim)

                    energy_x = torch.matmul(position_feat_x_reshape, geom_bias)
                    energy_x = energy_x.view(n, num_heads, 1, w, 1, w_kv)

                    energy_y = torch.matmul(position_feat_y_reshape, geom_bias)
                    energy_y = energy_y.view(n, num_heads, h, 1, h_kv, 1)

                    energy += energy_x + energy_y

            energy = energy.view(n, num_heads, h * w, h_kv * w_kv)

        if self.spatial_range >= 0:
            # Crop the precomputed mask to the current feature-map sizes.
            cur_local_constraint_map = \
                self.local_constraint_map[:h, :w, :h_kv, :w_kv].\
                contiguous().\
                view(1, 1, h*w, h_kv*w_kv)

            # -inf energies receive zero weight after the softmax.
            energy = energy.masked_fill_(cur_local_constraint_map.bool(),
                                         float('-inf'))

        # Normalise over the key/value positions.
        attention = F.softmax(energy, 3)

        proj_value = self.value_conv(x_kv)
        proj_value_reshape = proj_value.\
            view((n, num_heads, self.v_dim, h_kv * w_kv)).\
            permute(0, 1, 3, 2)

        out = torch.matmul(attention, proj_value_reshape).\
            permute(0, 1, 3, 2).\
            contiguous().\
            view(n, self.v_dim * self.num_heads, h, w)

        out = self.proj_conv(out)

        # output is downsampled, upsample back to input size
        if self.q_downsample is not None:
            out = F.interpolate(
                out,
                size=x_input.shape[2:],
                mode='bilinear',
                align_corners=False)

        out = self.gamma * out + x_input
        return out

    def init_weights(self):
        """Apply ``kaiming_init`` to every submodule flagged with a truthy
        ``kaiming_init`` attribute in ``__init__``."""
        for m in self.modules():
            if hasattr(m, 'kaiming_init') and m.kaiming_init:
                kaiming_init(
                    m,
                    mode='fan_in',
                    nonlinearity='leaky_relu',
                    bias=0,
                    distribution='uniform',
                    a=1)


================================================
FILE: mmcv/cnn/bricks/hsigmoid.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import warnings

import torch
import torch.nn as nn
from mmengine.registry import MODELS


@MODELS.register_module()
class HSigmoid(nn.Module):
    """Hard Sigmoid Module. Apply the hard sigmoid function:
    Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value)
    Default: Hsigmoid(x) = min(max((x + 3) / 6, 0), 1)

    Note:
        In MMCV v1.4.4, we modified the default value of args to align with
        PyTorch official. A warning describing this change is emitted every
        time the module is constructed.

    Args:
        bias (float): Value added to the input before the division.
            Default: 3.0.
        divisor (float): Value the shifted input is divided by; must be
            non-zero. Default: 6.0.
        min_value (float): Lower clamp bound. Default: 0.0.
        max_value (float): Upper clamp bound. Default: 1.0.

    Returns:
        Tensor: The output tensor.
    """

    def __init__(self,
                 bias: float = 3.0,
                 divisor: float = 6.0,
                 min_value: float = 0.0,
                 max_value: float = 1.0):
        super().__init__()
        warnings.warn(
            'In MMCV v1.4.4, we modified the default value of args to align '
            'with PyTorch official. Previous Implementation: '
            'Hsigmoid(x) = min(max((x + 1) / 2, 0), 1). '
            'Current Implementation: '
            'Hsigmoid(x) = min(max((x + 3) / 6, 0), 1).')
        # A zero divisor would make forward() divide by zero.
        assert divisor != 0
        self.bias, self.divisor = bias, divisor
        self.min_value, self.max_value = min_value, max_value

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Shift, scale and clamp ``x``; the input tensor is not modified."""
        scaled = (x + self.bias) / self.divisor
        # In-place clamp is safe here: ``scaled`` is a fresh tensor.
        return scaled.clamp_(self.min_value, self.max_value)


================================================
FILE: mmcv/cnn/bricks/hswish.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
from mmengine.registry import MODELS
from mmengine.utils import digit_version
from mmengine.utils.dl_utils import TORCH_VERSION


class HSwish(nn.Module):
    """Hard Swish activation.

    The module evaluates

    .. math::
        Hswish(x) = x * ReLU6(x + 3) / 6

    Args:
        inplace (bool): Passed to the internal ``nn.ReLU6``, which acts on
            the temporary ``x + 3``. Default: False.

    Returns:
        Tensor: The output tensor.
    """

    def __init__(self, inplace: bool = False):
        super().__init__()
        self.act = nn.ReLU6(inplace=inplace)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x * ReLU6(x + 3) / 6``."""
        gate = self.act(x + 3)
        return x * gate / 6


if (TORCH_VERSION != 'parrots'
        and digit_version(TORCH_VERSION) >= digit_version('1.7')):
    # Register torch's own Hardswish under the name 'HSwish'.
    MODELS.register_module(module=nn.Hardswish, name='HSwish')
else:
    # Fall back to the HSwish class defined in this file:
    # Hardswish is not supported when PyTorch version < 1.6.
    # And Hardswish in PyTorch 1.6 does not support inplace.
    MODELS.register_module(module=HSwish)


================================================
FILE: mmcv/cnn/bricks/non_local.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from abc import ABCMeta
from typing import Dict, Optional

import torch
import torch.nn as nn
from mmengine.model import constant_init, normal_init
from mmengine.registry import MODELS

from .conv_module import ConvModule


class _NonLocalNd(nn.Module, metaclass=ABCMeta):
    """Basic Non-local module.

    This module is proposed in
    "Non-local Neural Networks"
    Paper reference: https://arxiv.org/abs/1711.07971
    Code reference: https://github.com/AlexHex7/Non-local_pytorch

    Note:
        ``forward`` reads ``self.sub_sample`` in ``gaussian`` mode. This base
        class does not set that attribute; subclasses are expected to assign
        it after calling ``super().__init__()``.

    Args:
        in_channels (int): Channels of the input feature map.
        reduction (int): Channel reduction ratio. Default: 2.
        use_scale (bool): Whether to scale pairwise_weight by
            `1/sqrt(inter_channels)` when the mode is `embedded_gaussian`.
            Default: True.
        conv_cfg (None | dict): The config dict for convolution layers.
            If not specified, it will use `nn.Conv2d` for convolution layers.
            Default: None.
        norm_cfg (None | dict): The config dict for normalization layers.
            Default: None. (This parameter is only applicable to conv_out.)
        mode (str): Options are `gaussian`, `concatenation`,
            `embedded_gaussian` and `dot_product`. Default: embedded_gaussian.
        kwargs (optional): Forwarded to :meth:`init_weights` (``std``,
            ``zeros_init``).

    Raises:
        ValueError: If ``mode`` is not one of the supported options.
    """

    def __init__(self,
                 in_channels: int,
                 reduction: int = 2,
                 use_scale: bool = True,
                 conv_cfg: Optional[Dict] = None,
                 norm_cfg: Optional[Dict] = None,
                 mode: str = 'embedded_gaussian',
                 **kwargs):
        super().__init__()
        self.in_channels = in_channels
        self.reduction = reduction
        self.use_scale = use_scale
        # Never let the reduced width drop to zero channels.
        self.inter_channels = max(in_channels // reduction, 1)
        self.mode = mode

        if mode not in [
                'gaussian', 'embedded_gaussian', 'dot_product', 'concatenation'
        ]:
            raise ValueError("Mode should be in 'gaussian', 'concatenation', "
                             f"'embedded_gaussian' or 'dot_product', but got "
                             f'{mode} instead.')

        # g, theta, phi are defaulted as `nn.ConvNd`.
        # Here we use ConvModule for potential usage.
        self.g = ConvModule(
            self.in_channels,
            self.inter_channels,
            kernel_size=1,
            conv_cfg=conv_cfg,
            act_cfg=None)  # type: ignore
        self.conv_out = ConvModule(
            self.inter_channels,
            self.in_channels,
            kernel_size=1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None)

        # ``gaussian`` mode compares raw features, so it needs no embeddings.
        if self.mode != 'gaussian':
            self.theta = ConvModule(
                self.in_channels,
                self.inter_channels,
                kernel_size=1,
                conv_cfg=conv_cfg,
                act_cfg=None)
            self.phi = ConvModule(
                self.in_channels,
                self.inter_channels,
                kernel_size=1,
                conv_cfg=conv_cfg,
                act_cfg=None)

        if self.mode == 'concatenation':
            # Maps each concatenated (theta, phi) pair to a scalar weight.
            self.concat_project = ConvModule(
                self.inter_channels * 2,
                1,
                kernel_size=1,
                stride=1,
                padding=0,
                bias=False,
                act_cfg=dict(type='ReLU'))

        self.init_weights(**kwargs)

    def init_weights(self, std: float = 0.01, zeros_init: bool = True) -> None:
        """Initialise the embedding convs and the output layer.

        Args:
            std (float): Std of the normal initialisation. Default: 0.01.
            zeros_init (bool): If True, the last layer of ``conv_out`` (its
                norm layer when ``norm_cfg`` is set, otherwise its conv) is
                initialised with constant 0; otherwise it is initialised
                from a normal distribution with ``std``. Default: True.
        """
        if self.mode != 'gaussian':
            for m in [self.g, self.theta, self.phi]:
                normal_init(m.conv, std=std)
        else:
            normal_init(self.g.conv, std=std)
        if zeros_init:
            if self.conv_out.norm_cfg is None:
                constant_init(self.conv_out.conv, 0)
            else:
                constant_init(self.conv_out.norm, 0)
        else:
            if self.conv_out.norm_cfg is None:
                normal_init(self.conv_out.conv, std=std)
            else:
                normal_init(self.conv_out.norm, std=std)

    def gaussian(self, theta_x: torch.Tensor,
                 phi_x: torch.Tensor) -> torch.Tensor:
        """Softmax over ``theta_x @ phi_x`` along the last dimension."""
        # NonLocal1d pairwise_weight: [N, H, H]
        # NonLocal2d pairwise_weight: [N, HxW, HxW]
        # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
        pairwise_weight = torch.matmul(theta_x, phi_x)
        pairwise_weight = pairwise_weight.softmax(dim=-1)
        return pairwise_weight

    def embedded_gaussian(self, theta_x: torch.Tensor,
                          phi_x: torch.Tensor) -> torch.Tensor:
        """Softmax over ``theta_x @ phi_x``, optionally scaled by
        ``1 / sqrt(theta_x.shape[-1])`` when ``use_scale`` is set."""
        # NonLocal1d pairwise_weight: [N, H, H]
        # NonLocal2d pairwise_weight: [N, HxW, HxW]
        # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
        pairwise_weight = torch.matmul(theta_x, phi_x)
        if self.use_scale:
            # theta_x.shape[-1] is `self.inter_channels`
            pairwise_weight /= theta_x.shape[-1]**0.5
        pairwise_weight = pairwise_weight.softmax(dim=-1)
        return pairwise_weight

    def dot_product(self, theta_x: torch.Tensor,
                    phi_x: torch.Tensor) -> torch.Tensor:
        """``theta_x @ phi_x`` divided by the size of its last dimension."""
        # NonLocal1d pairwise_weight: [N, H, H]
        # NonLocal2d pairwise_weight: [N, HxW, HxW]
        # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
        pairwise_weight = torch.matmul(theta_x, phi_x)
        pairwise_weight /= pairwise_weight.shape[-1]
        return pairwise_weight

    def concatenation(self, theta_x: torch.Tensor,
                      phi_x: torch.Tensor) -> torch.Tensor:
        """Pairwise weights from ``concat_project`` applied to every
        concatenated (theta, phi) pair, divided by the size of the last
        dimension."""
        # NonLocal1d pairwise_weight: [N, H, H]
        # NonLocal2d pairwise_weight: [N, HxW, HxW]
        # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
        h = theta_x.size(2)
        w = phi_x.size(3)
        # Tile both embeddings so every (query, key) pair is materialised.
        theta_x = theta_x.repeat(1, 1, 1, w)
        phi_x = phi_x.repeat(1, 1, h, 1)

        concat_feature = torch.cat([theta_x, phi_x], dim=1)
        pairwise_weight = self.concat_project(concat_feature)
        n, _, h, w = pairwise_weight.size()
        pairwise_weight = pairwise_weight.view(n, h, w)
        # Out-of-place on purpose: ``pairwise_weight`` is a view of the ReLU
        # output of ``concat_project``, which autograd keeps for the backward
        # pass; modifying it in place would invalidate that saved tensor.
        pairwise_weight = pairwise_weight / pairwise_weight.shape[-1]

        return pairwise_weight

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute the non-local response and add it to the input.

        Args:
            x (torch.Tensor): Input of shape ``[N, C, *spatial]``.

        Returns:
            torch.Tensor: ``x + conv_out(y)`` where ``y`` aggregates ``g(x)``
            with the pairwise weights of the configured ``mode``.
        """
        # Assume `reduction = 1`, then `inter_channels = C`
        # or `inter_channels = C` when `mode="gaussian"`

        # NonLocal1d x: [N, C, H]
        # NonLocal2d x: [N, C, H, W]
        # NonLocal3d x: [N, C, T, H, W]
        n = x.size(0)

        # NonLocal1d g_x: [N, H, C]
        # NonLocal2d g_x: [N, HxW, C]
        # NonLocal3d g_x: [N, TxHxW, C]
        g_x = self.g(x).view(n, self.inter_channels, -1)
        g_x = g_x.permute(0, 2, 1)

        # NonLocal1d theta_x: [N, H, C], phi_x: [N, C, H]
        # NonLocal2d theta_x: [N, HxW, C], phi_x: [N, C, HxW]
        # NonLocal3d theta_x: [N, TxHxW, C], phi_x: [N, C, TxHxW]
        if self.mode == 'gaussian':
            theta_x = x.view(n, self.in_channels, -1)
            theta_x = theta_x.permute(0, 2, 1)
            # ``sub_sample`` is provided by the subclass (see class Note).
            if self.sub_sample:
                phi_x = self.phi(x).view(n, self.in_channels, -1)
            else:
                phi_x = x.view(n, self.in_channels, -1)
        elif self.mode == 'concatenation':
            theta_x = self.theta(x).view(n, self.inter_channels, -1, 1)
            phi_x = self.phi(x).view(n, self.inter_channels, 1, -1)
        else:
            theta_x = self.theta(x).view(n, self.inter_channels, -1)
            theta_x = theta_x.permute(0, 2, 1)
            phi_x = self.phi(x).view(n, self.inter_channels, -1)

        # The mode name doubles as the name of the pairwise method.
        pairwise_func = getattr(self, self.mode)
        # NonLocal1d pairwise_weight: [N, H, H]
        # NonLocal2d pairwise_weight: [N, HxW, HxW]
        # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
        pairwise_weight = pairwise_func(theta_x, phi_x)

        # NonLocal1d y: [N, H, C]
        # NonLocal2d y: [N, HxW, C]
        # NonLocal3d y: [N, TxHxW, C]
        y = torch.matmul(pairwise_weight, g_x)
        # NonLocal1d y: [N, C, H]
        # NonLocal2d y: [N, C, H, W]
        # NonLocal3d y: [N, C, T, H, W]
        y = y.permute(0, 2, 1).contiguous().reshape(n, self.inter_channels,
                                                    *x.size()[2:])

        output = x + self.conv_out(y)

        return output


class NonLocal1d(_NonLocalNd):
    """Non-local block for 1D inputs.

    Args:
        in_channels (int): Forwarded to ``_NonLocalNd``.
        sub_sample (bool): If True, a ``MaxPool1d(kernel_size=2)`` layer is
            appended after ``g`` and ``phi``. In ``'gaussian'`` mode ``phi``
            is set to the pooling layer alone. Default: False.
        conv_cfg (None | dict): Forwarded to ``_NonLocalNd``.
            Default: dict(type='Conv1d').
        **kwargs: Remaining keyword arguments forwarded to ``_NonLocalNd``.
    """

    def __init__(self,
                 in_channels: int,
                 sub_sample: bool = False,
                 conv_cfg: Dict = dict(type='Conv1d'),
                 **kwargs):
        super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs)
        self.sub_sample = sub_sample
        if not sub_sample:
            return

        # The same (parameter-free) pooling module is shared by both branches.
        pool = nn.MaxPool1d(kernel_size=2)
        self.g = nn.Sequential(self.g, pool)
        if self.mode == 'gaussian':
            self.phi = pool
        else:
            self.phi = nn.Sequential(self.phi, pool)


@MODELS.register_module()
class NonLocal2d(_NonLocalNd):
    """Non-local block for 2D inputs.

    Args:
        in_channels (int): Forwarded to ``_NonLocalNd``.
        sub_sample (bool): If True, a ``MaxPool2d(kernel_size=(2, 2))`` layer
            is appended after ``g`` and ``phi``. In ``'gaussian'`` mode
            ``phi`` is set to the pooling layer alone. Default: False.
        conv_cfg (None | dict): Forwarded to ``_NonLocalNd``.
            Default: dict(type='Conv2d').
        **kwargs: Remaining keyword arguments forwarded to ``_NonLocalNd``.
    """

    # Abbreviation picked up by the plugin-layer ``infer_abbr`` helper.
    _abbr_ = 'nonlocal_block'

    def __init__(self,
                 in_channels: int,
                 sub_sample: bool = False,
                 conv_cfg: Dict = dict(type='Conv2d'),
                 **kwargs):
        super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs)
        self.sub_sample = sub_sample
        if not sub_sample:
            return

        # The same (parameter-free) pooling module is shared by both branches.
        pool = nn.MaxPool2d(kernel_size=(2, 2))
        self.g = nn.Sequential(self.g, pool)
        if self.mode == 'gaussian':
            self.phi = pool
        else:
            self.phi = nn.Sequential(self.phi, pool)


class NonLocal3d(_NonLocalNd):
    """Non-local block for 3D inputs.

    Args:
        in_channels (int): Forwarded to ``_NonLocalNd``.
        sub_sample (bool): If True, a ``MaxPool3d(kernel_size=(1, 2, 2))``
            layer is appended after ``g`` and ``phi``; the kernel size of 1
            on the first pooled axis leaves that axis unpooled. In
            ``'gaussian'`` mode ``phi`` is set to the pooling layer alone.
            Default: False.
        conv_cfg (None | dict): Forwarded to ``_NonLocalNd``.
            Default: dict(type='Conv3d').
        **kwargs: Remaining keyword arguments forwarded to ``_NonLocalNd``.
    """

    def __init__(self,
                 in_channels: int,
                 sub_sample: bool = False,
                 conv_cfg: Dict = dict(type='Conv3d'),
                 **kwargs):
        super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs)
        self.sub_sample = sub_sample
        if not sub_sample:
            return

        # The same (parameter-free) pooling module is shared by both branches.
        pool = nn.MaxPool3d(kernel_size=(1, 2, 2))
        self.g = nn.Sequential(self.g, pool)
        if self.mode == 'gaussian':
            self.phi = pool
        else:
            self.phi = nn.Sequential(self.phi, pool)


================================================
FILE: mmcv/cnn/bricks/norm.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import inspect
from typing import Dict, Tuple, Union

import torch.nn as nn
from mmengine.registry import MODELS
from mmengine.utils import is_tuple_of
from mmengine.utils.dl_utils.parrots_wrapper import (SyncBatchNorm, _BatchNorm,
                                                     _InstanceNorm)

# Register PyTorch's built-in normalization layers in ``MODELS`` under short
# names so they can be looked up by string.
# The dimension-less names 'BN' and 'IN' map to the 2D variants, i.e. they
# register the same classes as 'BN2d' and 'IN2d'.
MODELS.register_module('BN', module=nn.BatchNorm2d)
MODELS.register_module('BN1d', module=nn.BatchNorm1d)
MODELS.register_module('BN2d', module=nn.BatchNorm2d)
MODELS.register_module('BN3d', module=nn.BatchNorm3d)
MODELS.register_module('SyncBN', module=SyncBatchNorm)
MODELS.register_module('GN', module=nn.GroupNorm)
MODELS.register_module('LN', module=nn.LayerNorm)
MODELS.register_module('IN', module=nn.InstanceNorm2d)
MODELS.register_module('IN1d', module=nn.InstanceNorm1d)
MODELS.register_module('IN2d', module=nn.InstanceNorm2d)
MODELS.register_module('IN3d', module=nn.InstanceNorm3d)


def infer_abbr(class_type):
    """Map a norm layer class to a short abbreviation.

    The abbreviation is used as the prefix of the layer name, so that the
    norm type stays visible in attribute names such as ``bn1`` or ``gn``.

    The rules are tried in order:

    1. A class attribute ``_abbr_`` is returned as is.
    2. Subclasses of ``_InstanceNorm``, ``_BatchNorm``, ``GroupNorm`` and
       ``LayerNorm`` give "in", "bn", "gn" and "ln" respectively.
    3. A lower-cased class name containing "batch", "group", "layer" or
       "instance" gives "bn", "gn", "ln" and "in" respectively.
    4. Anything else gives "norm_layer".

    Args:
        class_type (type): The norm layer type.

    Returns:
        str: The inferred abbreviation.

    Raises:
        TypeError: If ``class_type`` is not a class.
    """
    if not inspect.isclass(class_type):
        raise TypeError(
            f'class_type must be a type, but got {type(class_type)}')
    if hasattr(class_type, '_abbr_'):
        return class_type._abbr_

    # Instance norm must be tested before batch norm: IN is a subclass of BN.
    by_base_class = (
        (_InstanceNorm, 'in'),
        (_BatchNorm, 'bn'),
        (nn.GroupNorm, 'gn'),
        (nn.LayerNorm, 'ln'),
    )
    for base_class, abbr in by_base_class:
        if issubclass(class_type, base_class):
            return abbr

    by_keyword = (
        ('batch', 'bn'),
        ('group', 'gn'),
        ('layer', 'ln'),
        ('instance', 'in'),
    )
    lowered_name = class_type.__name__.lower()
    for keyword, abbr in by_keyword:
        if keyword in lowered_name:
            return abbr
    return 'norm_layer'


def build_norm_layer(cfg: Dict,
                     num_features: int,
                     postfix: Union[int, str] = '') -> Tuple[str, nn.Module]:
    """Build normalization layer.

    Args:
        cfg (dict): The norm layer config, which should contain:

            - type (str | type): Layer type, either a registered name or
              the layer class itself.
            - layer args: Args needed to instantiate a norm layer. ``eps``
              defaults to 1e-5 when it is not given.
            - requires_grad (bool, optional): Value assigned to
              ``requires_grad`` of every parameter of the layer.
              Default: True.
        num_features (int): Number of input channels.
        postfix (int | str): The postfix to be appended into norm abbreviation
            to create named layer.

    Returns:
        tuple[str, nn.Module]: The first element is the layer name consisting
        of abbreviation and postfix, e.g., bn1, gn. The second element is the
        created norm layer.

    Raises:
        TypeError: If ``cfg`` is not a dict.
        KeyError: If ``cfg`` has no "type" key, or the type name cannot be
            found in the registry.
    """
    if not isinstance(cfg, dict):
        raise TypeError('cfg must be a dict')
    if 'type' not in cfg:
        raise KeyError('the cfg dict must contain the key "type"')
    # Work on a copy so that the caller's config is left untouched.
    cfg_ = cfg.copy()

    layer_type = cfg_.pop('type')

    if inspect.isclass(layer_type):
        norm_layer = layer_type
    else:
        # Switch registry to the target scope. If `norm_layer` cannot be found
        # in the registry, fallback to search `norm_layer` in the
        # mmengine.MODELS.
        with MODELS.switch_scope_and_registry(None) as registry:
            norm_layer = registry.get(layer_type)
        if norm_layer is None:
            # Report the requested name; `norm_layer` is always None here.
            raise KeyError(f'Cannot find {layer_type} in registry under '
                           f'scope name {registry.scope}')
    abbr = infer_abbr(norm_layer)

    assert isinstance(postfix, (int, str))
    name = abbr + str(postfix)

    requires_grad = cfg_.pop('requires_grad', True)
    cfg_.setdefault('eps', 1e-5)
    if norm_layer is not nn.GroupNorm:
        layer = norm_layer(num_features, **cfg_)
        if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'):
            layer._specify_ddp_gpu_num(1)
    else:
        # GroupNorm names its channel argument `num_channels` and also
        # requires `num_groups`.
        assert 'num_groups' in cfg_
        layer = norm_layer(num_channels=num_features, **cfg_)

    for param in layer.parameters():
        param.requires_grad = requires_grad

    return name, layer


def is_norm(layer: nn.Module,
            exclude: Union[type, tuple, None] = None) -> bool:
    """Tell whether a layer is a normalization layer.

    A layer counts as a norm layer when it is an instance of ``_BatchNorm``,
    ``_InstanceNorm``, ``nn.GroupNorm`` or ``nn.LayerNorm`` and is not an
    instance of any excluded type.

    Args:
        layer (nn.Module): The layer to be checked.
        exclude (type | tuple[type]): Types to be excluded.

    Returns:
        bool: Whether the layer is a norm layer.

    Raises:
        TypeError: If ``exclude`` is neither None, a type, nor a tuple of
            types.
    """
    if exclude is not None:
        # Normalize a single type into a one-element tuple.
        exclude = exclude if isinstance(exclude, tuple) else (exclude, )
        if not is_tuple_of(exclude, type):
            raise TypeError(
                f'"exclude" must be either None or type or a tuple of types, '
                f'but got {type(exclude)}: {exclude}')

    if exclude and isinstance(layer, exclude):
        return False

    return isinstance(
        layer, (_BatchNorm, _InstanceNorm, nn.GroupNorm, nn.LayerNorm))


================================================
FILE: mmcv/cnn/bricks/padding.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import inspect
from typing import Dict

import torch.nn as nn
from mmengine.registry import MODELS

# Register PyTorch's 2D padding layers in ``MODELS`` under short names so
# they can be looked up by string.
MODELS.register_module('zero', module=nn.ZeroPad2d)
MODELS.register_module('reflect', module=nn.ReflectionPad2d)
MODELS.register_module('replicate', module=nn.ReplicationPad2d)


def build_padding_layer(cfg: Dict, *args, **kwargs) -> nn.Module:
    """Build padding layer.

    Args:
        cfg (dict): The padding layer config, which should contain:

            - type (str | type): Layer type, either a registered name or
              the layer class itself.
            - layer args: Args needed to instantiate a padding layer.
        *args: Positional arguments passed to the padding layer.
        **kwargs: Keyword arguments passed to the padding layer, together
            with the remaining entries of ``cfg``.

    Returns:
        nn.Module: Created padding layer.

    Raises:
        TypeError: If ``cfg`` is not a dict.
        KeyError: If ``cfg`` has no "type" key, or the type name cannot be
            found in the registry.
    """
    if not isinstance(cfg, dict):
        raise TypeError('cfg must be a dict')
    if 'type' not in cfg:
        raise KeyError('the cfg dict must contain the key "type"')

    # Work on a copy so that the caller's config is left untouched.
    cfg_ = cfg.copy()
    padding_type = cfg_.pop('type')
    if inspect.isclass(padding_type):
        return padding_type(*args, **kwargs, **cfg_)
    # Switch registry to the target scope. If `padding_layer` cannot be found
    # in the registry, fallback to search `padding_layer` in the
    # mmengine.MODELS.
    with MODELS.switch_scope_and_registry(None) as registry:
        padding_layer = registry.get(padding_type)
    if padding_layer is None:
        # Report the requested name; `padding_layer` is always None here.
        raise KeyError(f'Cannot find {padding_type} in registry under scope '
                       f'name {registry.scope}')
    layer = padding_layer(*args, **kwargs, **cfg_)

    return layer


================================================
FILE: mmcv/cnn/bricks/plugin.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import inspect
import platform
from typing import Dict, Tuple, Union

import torch.nn as nn
from mmengine.registry import MODELS

if platform.system() == 'Windows':
    import regex as re  # type: ignore
else:
    import re  # type: ignore


def infer_abbr(class_type: type) -> str:
    """Map a plugin layer class to a short abbreviation.

    Rule 1: If the class has the attribute ``_abbr_``, return it.
    Rule 2: Otherwise, return the snake case form of the class name, e.g.
    ``FancyBlock`` becomes ``fancy_block``. The conversion is modified from
    `inflection lib
    <https://inflection.readthedocs.io/en/latest/#inflection.underscore>`_.

    Args:
        class_type (type): The plugin layer type.

    Returns:
        str: The inferred abbreviation.

    Raises:
        TypeError: If ``class_type`` is not a class.
    """
    if not inspect.isclass(class_type):
        raise TypeError(
            f'class_type must be a type, but got {type(class_type)}')

    missing = object()
    explicit_abbr = getattr(class_type, '_abbr_', missing)
    if explicit_abbr is not missing:
        return explicit_abbr  # type: ignore

    # Split an acronym from a following capitalized word: "HTTPServer" ->
    # "HTTP_Server".
    snake = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', class_type.__name__)
    # Split a lower-case letter or digit from a following capital:
    # "FancyBlock" -> "Fancy_Block".
    snake = re.sub(r'([a-z\d])([A-Z])', r'\1_\2', snake)
    return snake.replace('-', '_').lower()


def build_plugin_layer(cfg: Dict,
                       postfix: Union[int, str] = '',
                       **kwargs) -> Tuple[str, nn.Module]:
    """Build plugin layer.

    Args:
        cfg (dict): cfg should contain:

            - type (str | type): identify plugin layer type, either a
              registered name or the layer class itself.
            - layer args: args needed to instantiate a plugin layer.
        postfix (int, str): appended into plugin abbreviation to
            create named layer. Default: ''.
        **kwargs: Keyword arguments passed to the plugin layer, together
            with the remaining entries of ``cfg``.

    Returns:
        tuple[str, nn.Module]: The first one is the concatenation of
        abbreviation and postfix. The second is the created plugin layer.

    Raises:
        TypeError: If ``cfg`` is not a dict.
        KeyError: If ``cfg`` has no "type" key, or the type name cannot be
            found in the registry.
    """
    if not isinstance(cfg, dict):
        raise TypeError('cfg must be a dict')
    if 'type' not in cfg:
        raise KeyError('the cfg dict must contain the key "type"')
    # Work on a copy so that the caller's config is left untouched.
    cfg_ = cfg.copy()

    layer_type = cfg_.pop('type')
    if inspect.isclass(layer_type):
        plugin_layer = layer_type
    else:
        # Switch registry to the target scope. If `plugin_layer` cannot be
        # found in the registry, fallback to search `plugin_layer` in the
        # mmengine.MODELS.
        with MODELS.switch_scope_and_registry(None) as registry:
            plugin_layer = registry.get(layer_type)
        if plugin_layer is None:
            # Report the requested name; `plugin_layer` is always None here.
            raise KeyError(
                f'Cannot find {layer_type} in registry under scope '
                f'name {registry.scope}')
    abbr = infer_abbr(plugin_layer)

    assert isinstance(postfix, (int, str))
    name = abbr + str(postfix)

    layer = plugin_layer(**kwargs, **cfg_)

    return name, layer


================================================
FILE: mmcv/cnn/bricks/scale.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn


class Scale(nn.Module):
    """A learnable scale factor.

    The input, of any shape, is multiplied by a single learnable parameter
    that is created from ``scale`` as a float tensor.

    Args:
        scale (float): Initial value of scale factor. Default: 1.0
    """

    def __init__(self, scale: float = 1.0):
        super().__init__()
        initial_value = torch.tensor(scale, dtype=torch.float)
        self.scale = nn.Parameter(initial_value)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` multiplied by the learnable scale factor."""
        scaled = x * self.scale
        return scaled


class LayerScale(nn.Module):
    """LayerScale layer.

    Multiplies the input by a learnable per-channel weight of shape
    ``(dim,)``.

    Args:
        dim (int): Dimension of input features.
        inplace (bool): Whether performs operation in-place.
            Default: `False`.
        data_format (str): The input data format, could be 'channels_last'
            or 'channels_first'. With 'channels_last' the weight is
            broadcast along the last axis, e.g. (B, N, C); with
            'channels_first' along the second axis, e.g. (B, C, H, W).
            Default: 'channels_last'.
        scale (float): Initial value of scale factor. Default: 1e-5
    """

    def __init__(self,
                 dim: int,
                 inplace: bool = False,
                 data_format: str = 'channels_last',
                 scale: float = 1e-5):
        super().__init__()
        assert data_format in ('channels_last', 'channels_first'), \
            "'data_format' could only be channels_last or channels_first."
        self.inplace = inplace
        self.data_format = data_format
        self.weight = nn.Parameter(torch.ones(dim) * scale)

    def forward(self, x) -> torch.Tensor:
        """Scale ``x`` channel-wise, in place if ``self.inplace`` is set."""
        # Build a view shape that is 1 everywhere except the channel axis.
        if self.data_format == 'channels_first':
            broadcast_shape = (1, -1) + (1, ) * (x.dim() - 2)
        else:
            broadcast_shape = (1, ) * (x.dim() - 1) + (-1, )
        weight = self.weight.view(broadcast_shape)
        if not self.inplace:
            return x * weight
        return x.mul_(weight)


================================================
FILE: mmcv/cnn/bricks/swish.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
from mmengine.registry import MODELS


@MODELS.register_module()
class Swish(nn.Module):
    """Swish activation module.

    Applies the swish function element-wise:

    .. math::
        Swish(x) = x * Sigmoid(x)

    Returns:
        Tensor: The output tensor.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` gated by its own sigmoid."""
        gate = torch.sigmoid(x)
        return x * gate


================================================
FILE: mmcv/cnn/bricks/transformer.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import math
import warnings
from typing import Sequence

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmengine.config import ConfigDict
from mmengine.model import BaseModule, ModuleList, Sequential
from mmengine.registry import MODELS
from mmengine.utils import deprecated_api_warning, to_2tuple

from mmcv.cnn import (Linear, build_activation_layer, build_conv_layer,
                      build_norm_layer)
from .drop import build_dropout
from .scale import LayerScale

# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file:
# the class is re-exported here when ``mmcv.ops`` is importable, and an
# ``ImportWarning`` points users at its new location.
try:
    from mmcv.ops.multi_scale_deform_attn import \
        MultiScaleDeformableAttention  # noqa F401
    warnings.warn(
        ImportWarning(
            '``MultiScaleDeformableAttention`` has been moved to '
            '``mmcv.ops.multi_scale_deform_attn``, please change original path '  # noqa E501
            '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` '  # noqa E501
            'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` '  # noqa E501
        ))

except ImportError:
    # The import failure is only reported as a warning, so the rest of this
    # module stays usable without the ops package.
    warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from '
                  '``mmcv.ops.multi_scale_deform_attn``, '
                  'You should install ``mmcv`` rather than ``mmcv-lite`` '
                  'if you need this module. ')


def build_positional_encoding(cfg, default_args=None):
    """Build a positional encoding module from ``cfg`` via ``MODELS``.

    Args:
        cfg: Config passed to ``MODELS.build``.
        default_args: Default arguments passed to ``MODELS.build``.
            Default: None.

    Returns:
        The object returned by ``MODELS.build``.
    """
    positional_encoding = MODELS.build(cfg, default_args=default_args)
    return positional_encoding


def build_attention(cfg, default_args=None):
    """Build an attention module from ``cfg`` via ``MODELS``.

    Args:
        cfg: Config passed to ``MODELS.build``.
        default_args: Default arguments passed to ``MODELS.build``.
            Default: None.

    Returns:
        The object returned by ``MODELS.build``.
    """
    attention = MODELS.build(cfg, default_args=default_args)
    return attention


def build_feedforward_network(cfg, default_args=None):
    """Build a feed-forward network (FFN) from ``cfg`` via ``MODELS``.

    Args:
        cfg: Config passed to ``MODELS.build``.
        default_args: Default arguments passed to ``MODELS.build``.
            Default: None.

    Returns:
        The object returned by ``MODELS.build``.
    """
    ffn = MODELS.build(cfg, default_args=default_args)
    return ffn


def build_transformer_layer(cfg, default_args=None):
    """Build a transformer layer from ``cfg`` via ``MODELS``.

    Args:
        cfg: Config passed to ``MODELS.build``.
        default_args: Default arguments passed to ``MODELS.build``.
            Default: None.

    Returns:
        The object returned by ``MODELS.build``.
    """
    transformer_layer = MODELS.build(cfg, default_args=default_args)
    return transformer_layer


def build_transformer_layer_sequence(cfg, default_args=None):
    """Build a transformer encoder or decoder from ``cfg`` via ``MODELS``.

    Args:
        cfg: Config passed to ``MODELS.build``.
        default_args: Default arguments passed to ``MODELS.build``.
            Default: None.

    Returns:
        The object returned by ``MODELS.build``.
    """
    layer_sequence = MODELS.build(cfg, default_args=default_args)
    return layer_sequence


class AdaptivePadding(nn.Module):
    """Pad the input so that the given filter covers it completely.

    Two modes are supported. "same" behaves like the "SAME" padding mode in
    TensorFlow and distributes the zero padding around the input. "corner"
    puts all of the zero padding at the bottom and right.

    Args:
        kernel_size (int | tuple): Size of the kernel. Default: 1.
        stride (int | tuple): Stride of the filter. Default: 1.
        dilation (int | tuple): Spacing between kernel elements.
            Default: 1.
        padding (str): Support "same" and "corner", "corner" mode
            would pad zero to bottom right, and "same" mode would
            pad zero around input. Default: "corner".

    Example:
        >>> kernel_size = 16
        >>> stride = 16
        >>> dilation = 1
        >>> input = torch.rand(1, 1, 15, 17)
        >>> adap_pad = AdaptivePadding(
        >>>     kernel_size=kernel_size,
        >>>     stride=stride,
        >>>     dilation=dilation,
        >>>     padding="corner")
        >>> out = adap_pad(input)
        >>> assert (out.shape[2], out.shape[3]) == (16, 32)
        >>> input = torch.rand(1, 1, 16, 17)
        >>> out = adap_pad(input)
        >>> assert (out.shape[2], out.shape[3]) == (16, 32)
    """

    def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):
        super().__init__()
        assert padding in ('same', 'corner')

        self.padding = padding
        # Every geometry argument is stored as an (h, w) pair.
        self.kernel_size = to_2tuple(kernel_size)
        self.stride = to_2tuple(stride)
        self.dilation = to_2tuple(dilation)

    def get_pad_shape(self, input_shape):
        """Calculate the padding size of input.

        Args:
            input_shape (:obj:`torch.Size`): arrange as (H, W).

        Returns:
            Tuple[int]: The padding size along the
            original H and W directions
        """

        def axis_padding(size, kernel, stride, dilation):
            # Number of windows needed to cover `size` with this stride.
            num_windows = math.ceil(size / stride)
            # Extent reached by the last window, dilation included.
            covered = (num_windows - 1) * stride + (kernel - 1) * dilation + 1
            return max(covered - size, 0)

        in_h, in_w = input_shape
        k_h, k_w = self.kernel_size
        s_h, s_w = self.stride
        pad_h = axis_padding(in_h, k_h, s_h, self.dilation[0])
        pad_w = axis_padding(in_w, k_w, s_w, self.dilation[1])
        return pad_h, pad_w

    def forward(self, x):
        """Add padding to `x`

        Args:
            x (Tensor): Input tensor has shape (B, C, H, W).

        Returns:
            Tensor: The tensor with adaptive padding
        """
        pad_h, pad_w = self.get_pad_shape(x.size()[-2:])
        if pad_h <= 0 and pad_w <= 0:
            return x

        # F.pad takes the pads as [left, right, top, bottom].
        if self.padding == 'corner':
            return F.pad(x, [0, pad_w, 0, pad_h])
        if self.padding == 'same':
            left, top = pad_w // 2, pad_h // 2
            return F.pad(x, [left, pad_w - left, top, pad_h - top])
        return x


class PatchEmbed(BaseModule):
    """Image to Patch Embedding.

    The embedding is implemented with a single conv layer, optionally
    preceded by adaptive padding and followed by a norm layer.

    Args:
        in_channels (int): The num of input channels. Default: 3
        embed_dims (int): The dimensions of embedding. Default: 768
        conv_type (str): The type of convolution
            to generate patch embedding. Default: "Conv2d".
        kernel_size (int): The kernel_size of embedding conv. Default: 16.
        stride (int): The slide stride of embedding conv. ``None`` means
            the same as ``kernel_size``. Default: 16.
        padding (int | tuple | string): The padding length of
            embedding conv. When it is a string, it means the mode
            of adaptive padding, support "same" and "corner" now.
            Default: "corner".
        dilation (int): The dilation rate of embedding conv. Default: 1.
        bias (bool): Bias of embed conv. Default: True.
        norm_cfg (dict, optional): Config dict for normalization layer.
            Default: None.
        input_size (int | tuple | None): The size of input, which will be
            used to calculate ``init_out_size``. Default: None.
        init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization.
            Default: None.
    """

    def __init__(self,
                 in_channels=3,
                 embed_dims=768,
                 conv_type='Conv2d',
                 kernel_size=16,
                 stride=16,
                 padding='corner',
                 dilation=1,
                 bias=True,
                 norm_cfg=None,
                 input_size=None,
                 init_cfg=None):
        super().__init__(init_cfg=init_cfg)

        self.embed_dims = embed_dims

        stride = kernel_size if stride is None else stride
        kernel_size, stride, dilation = (
            to_2tuple(value) for value in (kernel_size, stride, dilation))

        # A string selects adaptive padding; the conv itself then gets no
        # padding of its own.
        use_adaptive_padding = isinstance(padding, str)
        if use_adaptive_padding:
            self.adaptive_padding = AdaptivePadding(
                kernel_size=kernel_size,
                stride=stride,
                dilation=dilation,
                padding=padding)
        else:
            self.adaptive_padding = None
        padding = to_2tuple(0 if use_adaptive_padding else padding)

        self.projection = build_conv_layer(
            dict(type=conv_type),
            in_channels=in_channels,
            out_channels=embed_dims,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias)

        self.norm = (
            build_norm_layer(norm_cfg, embed_dims)[1]
            if norm_cfg is not None else None)

        if not input_size:
            self.init_input_size = None
            self.init_out_size = None
            return

        input_size = to_2tuple(input_size)
        # `init_out_size` would be used outside to calculate the
        # num_patches, e.g. when `use_abs_pos_embed` outside
        self.init_input_size = input_size
        if self.adaptive_padding:
            pad_h, pad_w = self.adaptive_padding.get_pad_shape(input_size)
            in_h, in_w = input_size
            input_size = (in_h + pad_h, in_w + pad_w)

        def conv_out_len(axis):
            # Output length formula from
            # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
            return (input_size[axis] + 2 * padding[axis] - dilation[axis] *
                    (kernel_size[axis] - 1) - 1) // stride[axis] + 1

        self.init_out_size = (conv_out_len(0), conv_out_len(1))

    def forward(self, x):
        """
        Args:
            x (Tensor): Has shape (B, C, H, W). In most case, C is 3.

        Returns:
            tuple: Contains merged results and its spatial shape.

            - x (Tensor): Has shape (B, out_h * out_w, embed_dims)
            - out_size (tuple[int]): Spatial shape of x, arrange as
              (out_h, out_w).
        """
        if self.adaptive_padding:
            x = self.adaptive_padding(x)

        projected = self.projection(x)
        out_size = (projected.shape[2], projected.shape[3])
        # (B, C, out_h, out_w) -> (B, out_h * out_w, C)
        tokens = projected.flatten(2).transpose(1, 2)
        if self.norm is not None:
            tokens = self.norm(tokens)
        return tokens, out_size


class PatchMerging(BaseModule):
    """Merge patch feature map.

    This layer groups feature map by kernel_size, and applies norm and linear
    layers to the grouped feature map (used in Swin Transformer).
    Our implementation uses `nn.Unfold` to
    merge patches, which is about 25% faster than the original
    implementation. However, we need to modify pretrained
    models for compatibility.

    Args:
        in_channels (int): The num of input channels.
        out_channels (int): The num of output channels.
        kernel_size (int | tuple, optional): the kernel size in the unfold
            layer. Defaults to 2.
        stride (int | tuple, optional): the stride of the sliding blocks in the
            unfold layer. Default: None. (Would be set as `kernel_size`)
        padding (int | tuple | string ): The padding length of the unfold
            layer. When it is a string, it means the mode
            of adaptive padding, support "same" and "corner" now.
            Default: "corner".
        dilation (int | tuple, optional): dilation parameter in the unfold
            layer. Default: 1.
        bias (bool, optional): Whether to add bias in linear layer or not.
            Defaults: False.
        norm_cfg (dict, optional): Config dict for normalization layer.
            Default: dict(type='LN').
        init_cfg (dict, optional): The extra config for initialization.
            Default: None.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=2,
                 stride=None,
                 padding='corner',
                 dilation=1,
                 bias=False,
                 norm_cfg=dict(type='LN'),
                 init_cfg=None):
        super().__init__(init_cfg=init_cfg)
        self.in_channels = in_channels
        self.out_channels = out_channels

        # A falsy stride (e.g. None) falls back to the kernel size.
        stride = stride or kernel_size
        kernel_size, stride, dilation = (
            to_2tuple(value) for value in (kernel_size, stride, dilation))

        # A string selects adaptive padding; the unfold layer itself then
        # gets no padding of its own.
        use_adaptive_padding = isinstance(padding, str)
        if use_adaptive_padding:
            self.adaptive_padding = AdaptivePadding(
                kernel_size=kernel_size,
                stride=stride,
                dilation=dilation,
                padding=padding)
        else:
            self.adaptive_padding = None
        padding = to_2tuple(0 if use_adaptive_padding else padding)

        self.sampler = nn.Unfold(
            kernel_size=kernel_size,
            dilation=dilation,
            padding=padding,
            stride=stride)

        # Each unfolded column holds one kernel window of every channel.
        sample_dim = kernel_size[0] * kernel_size[1] * in_channels

        self.norm = (
            build_norm_layer(norm_cfg, sample_dim)[1]
            if norm_cfg is not None else None)

        self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)

    def forward(self, x, input_size):
        """
        Args:
            x (Tensor): Has shape (B, H*W, C_in).
            input_size (tuple[int]): The spatial shape of x, arrange as (H, W).

        Returns:
            tuple: Contains merged results and its spatial shape.

            - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
            - out_size (tuple[int]): Spatial shape of x, arrange as
              (Merged_H, Merged_W).
        """
        batch, length, channels = x.shape
        assert isinstance(input_size, Sequence), \
            f'Expect input_size is `Sequence` but get {input_size}'

        height, width = input_size
        assert length == height * width, 'input feature has wrong size'

        # (B, H*W, C) -> (B, C, H, W)
        x = x.view(batch, height, width, channels).permute([0, 3, 1, 2])

        if self.adaptive_padding:
            x = self.adaptive_padding(x)
            height, width = x.shape[-2:]

        # Use nn.Unfold to merge patch. About 25% faster than original method,
        # but need to modify pretrained model for compatibility
        # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2)
        x = self.sampler(x)

        sampler = self.sampler

        def unfold_out_len(size, axis):
            # Number of sliding-block positions along one spatial axis.
            return (size + 2 * sampler.padding[axis] -
                    sampler.dilation[axis] *
                    (sampler.kernel_size[axis] - 1) -
                    1) // sampler.stride[axis] + 1

        output_size = (unfold_out_len(height, 0), unfold_out_len(width, 1))
        x = x.transpose(1, 2)  # B, H/2*W/2, 4*C
        if self.norm:
            x = self.norm(x)
        x = self.reduction(x)
        return x, output_size


@MODELS.register_module()
class MultiheadAttention(BaseModule):
    """A wrapper for ``torch.nn.MultiheadAttention``.

    This module implements MultiheadAttention with identity connection,
    and positional encoding is also passed as input.

    Args:
        embed_dims (int): The embedding dimension.
        num_heads (int): Parallel attention heads.
        attn_drop (float): A Dropout layer on attn_output_weights.
            Default: 0.0.
        proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
            Default: 0.0.
        dropout_layer (obj:`ConfigDict`): The dropout_layer used
            when adding the shortcut. The config passed in is never
            modified.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        batch_first (bool): When it is True, Key, Query and Value are shape of
            (batch, n, embed_dim), otherwise (n, batch, embed_dim).
            Default to False.
        **kwargs: Forwarded to ``torch.nn.MultiheadAttention``. The
            deprecated ``dropout`` key is consumed here and used both as
            ``attn_drop`` and as ``drop_prob`` of ``dropout_layer``.
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 attn_drop=0.,
                 proj_drop=0.,
                 dropout_layer=dict(type='Dropout', drop_prob=0.),
                 init_cfg=None,
                 batch_first=False,
                 **kwargs):
        super().__init__(init_cfg)
        if 'dropout' in kwargs:
            warnings.warn(
                'The arguments `dropout` in MultiheadAttention '
                'has been deprecated, now you can separately '
                'set `attn_drop`(float), proj_drop(float), '
                'and `dropout_layer`(dict) ', DeprecationWarning)
            attn_drop = kwargs.pop('dropout')
            # Build a new dict instead of assigning into ``dropout_layer``:
            # the default value is a single dict shared by every call, so an
            # in-place write would leak this drop_prob into all instances
            # created afterwards (and would also alter a caller's config).
            dropout_layer = {**dropout_layer, 'drop_prob': attn_drop}

        self.embed_dims = embed_dims
        self.num_heads = num_heads
        self.batch_first = batch_first

        self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop,
                                          **kwargs)

        self.proj_drop = nn.Dropout(proj_drop)
        self.dropout_layer = build_dropout(
            dropout_layer) if dropout_layer else nn.Identity()

    @deprecated_api_warning({'residual': 'identity'},
                            cls_name='MultiheadAttention')
    def forward(self,
                query,
                key=None,
                value=None,
                identity=None,
                query_pos=None,
                key_pos=None,
                attn_mask=None,
                key_padding_mask=None,
                **kwargs):
        """Forward function for `MultiheadAttention`.

        **kwargs allow passing a more general data flow when combining
        with other operations in `transformerlayer`.

        Args:
            query (Tensor): The input query with shape [num_queries, bs,
                embed_dims] if self.batch_first is False, else
                [bs, num_queries, embed_dims].
            key (Tensor): The key tensor with shape [num_keys, bs,
                embed_dims] if self.batch_first is False, else
                [bs, num_keys, embed_dims].
                If None, the ``query`` will be used. Defaults to None.
            value (Tensor): The value tensor with same shape as `key`.
                Same in `nn.MultiheadAttention.forward`. Defaults to None.
                If None, the `key` will be used.
            identity (Tensor): This tensor, with the same shape as `query`,
                will be used for the identity link.
                If None, `query` will be used. Defaults to None.
            query_pos (Tensor): The positional encoding for query, with
                the same shape as `query`. If not None, it will
                be added to `query` before forward function.
                Defaults to None.
            key_pos (Tensor): The positional encoding for `key`, with the
                same shape as `key`. If not None, it will
                be added to `key` before forward function. If None, and
                `query_pos` has the same shape as `key`, then `query_pos`
                will be used for `key_pos`. Defaults to None.
            attn_mask (Tensor): ByteTensor mask with shape [num_queries,
                num_keys]. Same in `nn.MultiheadAttention.forward`.
                Defaults to None.
            key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
                Defaults to None.

        Returns:
            Tensor: forwarded results with shape
            [num_queries, bs, embed_dims]
            if self.batch_first is False, else
            [bs, num_queries, embed_dims].
        """

        if key is None:
            key = query
        if value is None:
            value = key
        if identity is None:
            # The shortcut uses the query as it was before any positional
            # encoding is added below.
            identity = query
        if key_pos is None:
            if query_pos is not None:
                # use query_pos if key_pos is not available
                if query_pos.shape == key.shape:
                    key_pos = query_pos
                else:
                    warnings.warn(f'position encoding of key is '
                                  f'missing in {self.__class__.__name__}.')
        if query_pos is not None:
            query = query + query_pos
        if key_pos is not None:
            key = key + key_pos

        # Because the dataflow('key', 'query', 'value') of
        # ``torch.nn.MultiheadAttention`` is (num_query, batch,
        # embed_dims), We should adjust the shape of dataflow from
        # batch_first (batch, num_query, embed_dims) to num_query_first
        # (num_query ,batch, embed_dims), and recover ``attn_output``
        # from num_query_first to batch_first.
        if self.batch_first:
            query = query.transpose(0, 1)
            key = key.transpose(0, 1)
            value = value.transpose(0, 1)

        # Index 0 of the returned tuple is the attention output; the
        # attention weights are discarded.
        out = self.attn(
            query=query,
            key=key,
            value=value,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask)[0]

        if self.batch_first:
            out = out.transpose(0, 1)

        return identity + self.dropout_layer(self.proj_drop(out))


@MODELS.register_module()
class FFN(BaseModule):
    """Feed-forward network (FFN) with an optional identity connection.

    Args:
        embed_dims (int): The feature dimension. Same as
            `MultiheadAttention`. Defaults: 256.
        feedforward_channels (int): The hidden dimension of FFNs.
            Defaults: 1024.
        num_fcs (int, optional): The number of fully-connected layers in
            FFNs. Must be at least 2. Default: 2.
        act_cfg (dict, optional): The activation config for FFNs.
            Default: dict(type='ReLU', inplace=True)
        ffn_drop (float, optional): Probability of an element to be
            zeroed in FFN. Default 0.0.
        dropout_layer (obj:`ConfigDict`): The dropout_layer used
            when adding the shortcut. Default: None.
        add_identity (bool, optional): Whether to add the
            identity connection. Default: `True`.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        layer_scale_init_value (float): Initial value of scale factor in
            LayerScale. LayerScale is only used when this value is
            positive. Default: 0.
    """

    @deprecated_api_warning(
        {
            'dropout': 'ffn_drop',
            'add_residual': 'add_identity'
        },
        cls_name='FFN')
    def __init__(self,
                 embed_dims=256,
                 feedforward_channels=1024,
                 num_fcs=2,
                 act_cfg=dict(type='ReLU', inplace=True),
                 ffn_drop=0.,
                 dropout_layer=None,
                 add_identity=True,
                 init_cfg=None,
                 layer_scale_init_value=0.):
        super().__init__(init_cfg)
        assert num_fcs >= 2, \
            f'num_fcs should be no less than 2. got {num_fcs}.'
        self.embed_dims = embed_dims
        self.feedforward_channels = feedforward_channels
        self.num_fcs = num_fcs

        # Input width of each hidden block: the first one reads the
        # embedding, every later one reads the hidden representation.
        hidden_inputs = [embed_dims] + [feedforward_channels] * (num_fcs - 2)
        stages = [
            Sequential(
                Linear(width, feedforward_channels),
                build_activation_layer(act_cfg), nn.Dropout(ffn_drop))
            for width in hidden_inputs
        ]
        # Final projection back to the embedding width, followed by dropout.
        stages.append(Linear(feedforward_channels, embed_dims))
        stages.append(nn.Dropout(ffn_drop))
        self.layers = Sequential(*stages)

        if dropout_layer:
            self.dropout_layer = build_dropout(dropout_layer)
        else:
            self.dropout_layer = torch.nn.Identity()
        self.add_identity = add_identity

        if layer_scale_init_value > 0:
            self.gamma2 = LayerScale(embed_dims, scale=layer_scale_init_value)
        else:
            self.gamma2 = nn.Identity()

    @deprecated_api_warning({'residual': 'identity'}, cls_name='FFN')
    def forward(self, x, identity=None):
        """Forward function for `FFN`.

        When ``add_identity`` is set, the shortcut tensor is added to the
        output; ``x`` itself is the shortcut if ``identity`` is None.
        """
        out = self.gamma2(self.layers(x))
        if self.add_identity:
            shortcut = x if identity is None else identity
            return shortcut + self.dropout_layer(out)
        return self.dropout_layer(out)


@MODELS.register_module()
class BaseTransformerLayer(BaseModule):
    """Base `TransformerLayer` for vision transformer.

    It can be built from `mmcv.ConfigDict` and support more flexible
    customization, for example, using any number of `FFN or LN ` and
    use different kinds of `attention` by specifying a list of `ConfigDict`
    named `attn_cfgs`. It is worth mentioning that it supports `prenorm`
    when you specifying `norm` as the first element of `operation_order`.
    More details about the `prenorm`: `On Layer Normalization in the
    Transformer Architecture <https://arxiv.org/abs/2002.04745>`_ .

    Args:
        attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
            Configs for `self_attention` or `cross_attention` modules,
            The order of the configs in the list should be consistent with
            corresponding attentions in operation_order.
            If it is a dict, all of the attention modules in operation_order
            will be built with this config. Default: None.
        ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
            Configs for FFN, The order of the configs in the list should be
            consistent with corresponding ffn in operation_order.
            If it is a dict, all of the ffn modules in operation_order
            will be built with this config. A dict passed here is copied
            and never modified.
        operation_order (tuple[str]): The execution order of operation
            in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
            Support `prenorm` when you specifying first element as `norm`.
            Default: None.
        norm_cfg (dict): Config dict for normalization layer.
            Default: dict(type='LN').
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        batch_first (bool): Key, Query and Value are shape
            of (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to False.
    """

    def __init__(self,
                 attn_cfgs=None,
                 ffn_cfgs=dict(
                     type='FFN',
                     embed_dims=256,
                     feedforward_channels=1024,
                     num_fcs=2,
                     ffn_drop=0.,
                     act_cfg=dict(type='ReLU', inplace=True),
                 ),
                 operation_order=None,
                 norm_cfg=dict(type='LN'),
                 init_cfg=None,
                 batch_first=False,
                 **kwargs):

        # Deprecated FFN keyword arguments are merged into ``ffn_cfgs``
        # below. Work on a private copy so that neither the default dict
        # (one object shared by every call) nor a caller-owned config is
        # modified.
        if isinstance(ffn_cfgs, dict):
            ffn_cfgs = copy.deepcopy(ffn_cfgs)

        deprecated_args = dict(
            feedforward_channels='feedforward_channels',
            ffn_dropout='ffn_drop',
            ffn_num_fcs='num_fcs')
        for ori_name, new_name in deprecated_args.items():
            if ori_name in kwargs:
                warnings.warn(
                    f'The arguments `{ori_name}` in BaseTransformerLayer '
                    f'has been deprecated, now you should set `{new_name}` '
                    f'and other FFN related arguments '
                    f'to a dict named `ffn_cfgs`. ', DeprecationWarning)
                ffn_cfgs[new_name] = kwargs[ori_name]

        super().__init__(init_cfg)

        self.batch_first = batch_first

        # Every entry of ``operation_order`` must be one of the four
        # supported operation names.
        assert set(operation_order) & {
            'self_attn', 'norm', 'ffn', 'cross_attn'} == \
            set(operation_order), f'The operation_order of' \
            f' {self.__class__.__name__} should ' \
            f'contains all four operation type ' \
            f"{['self_attn', 'norm', 'ffn', 'cross_attn']}"

        num_attn = operation_order.count('self_attn') + operation_order.count(
            'cross_attn')
        if isinstance(attn_cfgs, dict):
            attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]
        else:
            assert num_attn == len(attn_cfgs), f'The length ' \
                f'of attn_cfg {num_attn} is ' \
                f'not consistent with the number of attention ' \
                f'in operation_order {operation_order}.'

        self.num_attn = num_attn
        self.operation_order = operation_order
        self.norm_cfg = norm_cfg
        self.pre_norm = operation_order[0] == 'norm'
        self.attentions = ModuleList()

        index = 0
        for operation_name in operation_order:
            if operation_name in ['self_attn', 'cross_attn']:
                # ``batch_first`` of each attention must agree with the
                # layer; it is filled in when the config omits it.
                if 'batch_first' in attn_cfgs[index]:
                    assert self.batch_first == attn_cfgs[index]['batch_first']
                else:
                    attn_cfgs[index]['batch_first'] = self.batch_first
                attention = build_attention(attn_cfgs[index])
                # Some custom attentions used as `self_attn`
                # or `cross_attn` can have different behavior.
                attention.operation_name = operation_name
                self.attentions.append(attention)
                index += 1

        self.embed_dims = self.attentions[0].embed_dims

        self.ffns = ModuleList()
        num_ffns = operation_order.count('ffn')
        if isinstance(ffn_cfgs, dict):
            # A single config is replicated, one independent copy per FFN.
            ffn_cfgs = ConfigDict(ffn_cfgs)
            ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)]
        assert len(ffn_cfgs) == num_ffns
        for ffn_index in range(num_ffns):
            if 'embed_dims' not in ffn_cfgs[ffn_index]:
                ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims
            else:
                assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims
            self.ffns.append(
                build_feedforward_network(ffn_cfgs[ffn_index],
                                          dict(type='FFN')))

        self.norms = ModuleList()
        num_norms = operation_order.count('norm')
        for _ in range(num_norms):
            self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1])

    def forward(self,
                query,
                key=None,
                value=None,
                query_pos=None,
                key_pos=None,
                attn_masks=None,
                query_key_padding_mask=None,
                key_padding_mask=None,
                **kwargs):
        """Forward function for `BaseTransformerLayer`.

        **kwargs contains some specific arguments of attentions.

        Args:
            query (Tensor): The input query with shape
                [num_queries, bs, embed_dims] if
                self.batch_first is False, else
                [bs, num_queries, embed_dims].
            key (Tensor): The key tensor with shape [num_keys, bs,
                embed_dims] if self.batch_first is False, else
                [bs, num_keys, embed_dims].
            value (Tensor): The value tensor with same shape as `key`.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_pos (Tensor): The positional encoding for `key`.
                Default: None.
            attn_masks (List[Tensor] | Tensor | None): 2D Tensor used in
                calculation of corresponding attention. The length of
                it should equal to the number of `attention` in
                `operation_order`. A single Tensor is copied for every
                attention. Default: None.
            query_key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_queries]. Only used in `self_attn` layer.
                Defaults to None.
            key_padding_mask (Tensor): ByteTensor for `key`, with
                shape [bs, num_keys]. Only used in `cross_attn` layer.
                Default: None.

        Returns:
            Tensor: forwarded results, with the same layout as `query`.
        """

        norm_index = 0
        attn_index = 0
        ffn_index = 0
        identity = query
        if attn_masks is None:
            attn_masks = [None for _ in range(self.num_attn)]
        elif isinstance(attn_masks, torch.Tensor):
            attn_masks = [
                copy.deepcopy(attn_masks) for _ in range(self.num_attn)
            ]
            warnings.warn(f'Use same attn_mask in all attentions in '
                          f'{self.__class__.__name__} ')
        else:
            assert len(attn_masks) == self.num_attn, f'The length of ' \
                        f'attn_masks {len(attn_masks)} must be equal ' \
                        f'to the number of attention in ' \
                        f'operation_order {self.num_attn}'

        # With pre-norm, the shortcut handed to attention/FFN is the tensor
        # from before the preceding norm; otherwise the sub-module falls
        # back to its own input (identity=None).
        for layer in self.operation_order:
            if layer == 'self_attn':
                temp_key = temp_value = query
                query = self.attentions[attn_index](
                    query,
                    temp_key,
                    temp_value,
                    identity if self.pre_norm else None,
                    query_pos=query_pos,
                    key_pos=query_pos,
                    attn_mask=attn_masks[attn_index],
                    key_padding_mask=query_key_padding_mask,
                    **kwargs)
                attn_index += 1
                identity = query

            elif layer == 'norm':
                query = self.norms[norm_index](query)
                norm_index += 1

            elif layer == 'cross_attn':
                query = self.attentions[attn_index](
                    query,
                    key,
                    value,
                    identity if self.pre_norm else None,
                    query_pos=query_pos,
                    key_pos=key_pos,
                    attn_mask=attn_masks[attn_index],
                    key_padding_mask=key_padding_mask,
                    **kwargs)
                attn_index += 1
                identity = query

            elif layer == 'ffn':
                query = self.ffns[ffn_index](
                    query, identity if self.pre_norm else None)
                ffn_index += 1

        return query


@MODELS.register_module()
class TransformerLayerSequence(BaseModule):
    """Base class for TransformerEncoder and TransformerDecoder in vision
    transformer.

    It stacks `num_layers` transformer layers and applies them one after
    another. Different kinds of `transformer_layer` can be mixed by passing
    a list of configs.

    Args:
        transformerlayers (list[obj:`mmcv.ConfigDict`] |
            obj:`mmcv.ConfigDict`): Config of transformerlayer
            in TransformerCoder. If it is obj:`mmcv.ConfigDict`,
            it would be repeated `num_layers` times to a
            list[`mmcv.ConfigDict`]. Default: None.
        num_layers (int): The number of `TransformerLayer`. Default: None.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
    """

    def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None):
        super().__init__(init_cfg)
        if not isinstance(transformerlayers, dict):
            assert isinstance(transformerlayers, list) and \
                   len(transformerlayers) == num_layers
        else:
            # One independent copy of the shared config per layer.
            transformerlayers = [
                copy.deepcopy(transformerlayers) for _ in range(num_layers)
            ]
        self.num_layers = num_layers
        self.layers = ModuleList()
        for layer_idx in range(num_layers):
            layer_cfg = transformerlayers[layer_idx]
            self.layers.append(build_transformer_layer(layer_cfg))
        # The first layer is taken as representative for these attributes.
        first_layer = self.layers[0]
        self.embed_dims = first_layer.embed_dims
        self.pre_norm = first_layer.pre_norm

    def forward(self,
                query,
                key,
                value,
                query_pos=None,
                key_pos=None,
                attn_masks=None,
                query_key_padding_mask=None,
                key_padding_mask=None,
                **kwargs):
        """Forward function for `TransformerCoder`.

        Every layer receives the output of the previous layer as `query`;
        all other arguments are passed unchanged to each layer.

        Args:
            query (Tensor): Input query with shape
                `(num_queries, bs, embed_dims)`.
            key (Tensor): The key tensor with shape
                `(num_keys, bs, embed_dims)`.
            value (Tensor): The value tensor with shape
                `(num_keys, bs, embed_dims)`.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_pos (Tensor): The positional encoding for `key`.
                Default: None.
            attn_masks (List[Tensor], optional): Each element is 2D Tensor
                which is used in calculation of corresponding attention in
                operation_order. Default: None.
            query_key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_queries]. Only used in self-attention
                Default: None.
            key_padding_mask (Tensor): ByteTensor for `key`, with
                shape [bs, num_keys]. Default: None.

        Returns:
            Tensor:  results with shape [num_queries, bs, embed_dims].
        """
        shared_kwargs = dict(
            query_pos=query_pos,
            key_pos=key_pos,
            attn_masks=attn_masks,
            query_key_padding_mask=query_key_padding_mask,
            key_padding_mask=key_padding_mask)
        for transformer_layer in self.layers:
            query = transformer_layer(query, key, value, **shared_kwargs,
                                      **kwargs)
        return query


================================================
FILE: mmcv/cnn/bricks/upsample.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import inspect
from typing import Dict

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmengine.model import xavier_init
from mmengine.registry import MODELS

# Both names map to the same class, ``nn.Upsample``. ``build_upsample_layer``
# passes the config ``type`` on as the ``mode`` argument when the resolved
# class is ``nn.Upsample``.
MODELS.register_module('nearest', module=nn.Upsample)
MODELS.register_module('bilinear', module=nn.Upsample)


@MODELS.register_module(name='pixel_shuffle')
class PixelShufflePack(nn.Module):
    """Pixel Shuffle upsample layer.

    A ``nn.Conv2d`` first expands the channels to
    ``out_channels * scale_factor ** 2``; ``F.pixel_shuffle()`` then
    rearranges the result.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        scale_factor (int): Upsample ratio.
        upsample_kernel (int): Kernel size of the conv layer to expand the
            channels.
    """

    def __init__(self, in_channels: int, out_channels: int, scale_factor: int,
                 upsample_kernel: int):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.scale_factor = scale_factor
        self.upsample_kernel = upsample_kernel

        expanded_channels = out_channels * scale_factor * scale_factor
        conv_padding = (upsample_kernel - 1) // 2
        self.upsample_conv = nn.Conv2d(
            in_channels,
            expanded_channels,
            upsample_kernel,
            padding=conv_padding)
        self.init_weights()

    def init_weights(self):
        """Initialize the expansion conv with uniform Xavier init."""
        xavier_init(self.upsample_conv, distribution='uniform')

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the expansion conv, then pixel-shuffle its output."""
        expanded = self.upsample_conv(x)
        return F.pixel_shuffle(expanded, self.scale_factor)


def build_upsample_layer(cfg: Dict, *args, **kwargs) -> nn.Module:
    """Build upsample layer.

    Args:
        cfg (dict): The upsample layer config, which should contain:

            - type (str | type): Layer type, either a registered name or
              the layer class itself.
            - scale_factor (int): Upsample ratio, which is not applicable to
              deconv.
            - layer args: Args needed to instantiate a upsample layer.
        args (argument list): Arguments passed to the ``__init__``
            method of the corresponding conv layer.
        kwargs (keyword arguments): Keyword arguments passed to the
            ``__init__`` method of the corresponding conv layer.

    Returns:
        nn.Module: Created upsample layer.

    Raises:
        TypeError: If ``cfg`` is not a dict.
        KeyError: If ``cfg`` has no ``type`` key, or the requested type is
            not found in the registry.
    """
    if not isinstance(cfg, dict):
        raise TypeError(f'cfg must be a dict, but got {type(cfg)}')
    if 'type' not in cfg:
        raise KeyError(
            f'the cfg dict must contain the key "type", but got {cfg}')
    # Work on a copy so the caller's config keeps its ``type`` key.
    cfg_ = cfg.copy()

    layer_type = cfg_.pop('type')

    if inspect.isclass(layer_type):
        upsample = layer_type
    # Switch registry to the target scope. If `upsample` cannot be found
    # in the registry, fallback to search `upsample` in the
    # mmengine.MODELS.
    else:
        with MODELS.switch_scope_and_registry(None) as registry:
            upsample = registry.get(layer_type)
        if upsample is None:
            # Report the requested name; ``upsample`` is always None here.
            raise KeyError(f'Cannot find {layer_type} in registry under '
                           f'scope name {registry.scope}')
        if upsample is nn.Upsample:
            # The registered name doubles as the ``mode`` of nn.Upsample.
            cfg_['mode'] = layer_type
    layer = upsample(*args, **kwargs, **cfg_)
    return layer


================================================
FILE: mmcv/cnn/bricks/wrappers.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
r"""Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/wrappers.py  # noqa: E501

Wrap some nn modules to support empty tensor input. Currently, these wrappers
are mainly used in mask heads like fcn_mask_head and maskiou_heads since mask
heads are trained on only positive RoIs.
"""
import math

import torch
import torch.nn as nn
from mmengine.registry import MODELS
from torch.nn.modules.utils import _pair, _triple

# On parrots the version string is kept as is. Otherwise only the leading
# (major, minor) pair is kept as ints for comparison, since
# torch.__version__ could be something like 1.3.1+cu92.
TORCH_VERSION = (
    torch.__version__ if torch.__version__ == 'parrots' else tuple(
        int(part) for part in torch.__version__.split('.')[:2]))


def obsolete_torch_version(torch_version, version_threshold) -> bool:
    """Return whether ``torch_version`` counts as obsolete.

    The value ``'parrots'`` is always obsolete; any other version is
    obsolete when it is less than or equal to ``version_threshold``.
    """
    if torch_version == 'parrots':
        return True
    return torch_version <= version_threshold


class NewEmptyTensorOp(torch.autograd.Function):
    """Autograd function that creates an uninitialized tensor of a new shape.

    The backward pass applies the same operation to the incoming gradient,
    using the shape of the forward input.
    """

    @staticmethod
    def forward(ctx, x: torch.Tensor, new_shape: tuple) -> torch.Tensor:
        # Remember the input shape; backward must return a gradient of it.
        ctx.shape = x.shape
        result = x.new_empty(new_shape)
        return result

    @staticmethod
    def backward(ctx, grad: torch.Tensor) -> tuple:
        # ``new_shape`` is not a tensor, hence the trailing ``None``.
        input_grad = NewEmptyTensorOp.apply(grad, ctx.shape)
        return input_grad, None


@MODELS.register_module('Conv', force=True)
class Conv2d(nn.Conv2d):
    """``nn.Conv2d`` with a fallback for empty input on old torch versions."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the convolution, or build an empty output of the right shape.

        The fallback is taken only when the torch version is obsolete
        (parrots or <= 1.4) and ``x`` has no elements.
        """
        needs_fallback = obsolete_torch_version(TORCH_VERSION,
                                                (1, 4)) and x.numel() == 0
        if not needs_fallback:
            return super().forward(x)

        # Standard convolution output size along the last two dimensions.
        spatial = [(size + 2 * pad - (dil * (kernel - 1) + 1)) // stride + 1
                   for size, kernel, pad, stride, dil in zip(
                       x.shape[-2:], self.kernel_size, self.padding,
                       self.stride, self.dilation)]
        out_shape = [x.shape[0], self.out_channels] + spatial
        empty = NewEmptyTensorOp.apply(x, out_shape)
        if not self.training:
            return empty
        # produce dummy gradient to avoid DDP warning.
        dummy = sum(param.view(-1)[0] for param in self.parameters()) * 0.0
        return empty + dummy


@MODELS.register_module('Conv3d', force=True)
class Conv3d(nn.Conv3d):
    """``nn.Conv3d`` with a fallback for empty input on old torch versions."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the convolution, or build an empty output of the right shape.

        The fallback is taken only when the torch version is obsolete
        (parrots or <= 1.4) and ``x`` has no elements.
        """
        needs_fallback = obsolete_torch_version(TORCH_VERSION,
                                                (1, 4)) and x.numel() == 0
        if not needs_fallback:
            return super().forward(x)

        # Standard convolution output size along the last three dimensions.
        spatial = [(size + 2 * pad - (dil * (kernel - 1) + 1)) // stride + 1
                   for size, kernel, pad, stride, dil in zip(
                       x.shape[-3:], self.kernel_size, self.padding,
                       self.stride, self.dilation)]
        out_shape = [x.shape[0], self.out_channels] + spatial
        empty = NewEmptyTensorOp.apply(x, out_shape)
        if not self.training:
            return empty
        # produce dummy gradient to avoid DDP warning.
        dummy = sum(param.view(-1)[0] for param in self.parameters()) * 0.0
        return empty + dummy


@MODELS.register_module()
@MODELS.register_module('deconv')
class ConvTranspose2d(nn.ConvTranspose2d):
    """``nn.ConvTranspose2d`` with a fallback for empty input on old torch."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the deconvolution, or build an empty output of the right shape.

        The fallback is taken only when the torch version is obsolete
        (parrots or <= 1.4) and ``x`` has no elements.
        """
        needs_fallback = obsolete_torch_version(TORCH_VERSION,
                                                (1, 4)) and x.numel() == 0
        if not needs_fallback:
            return super().forward(x)

        # Transposed-convolution output size along the last two dimensions.
        spatial = [
            (size - 1) * stride - 2 * pad + (dil * (kernel - 1) + 1) + out_pad
            for size, kernel, pad, stride, dil, out_pad in zip(
                x.shape[-2:], self.kernel_size, self.padding, self.stride,
                self.dilation, self.output_padding)
        ]
        out_shape = [x.shape[0], self.out_channels] + spatial
        empty = NewEmptyTensorOp.apply(x, out_shape)
        if not self.training:
            return empty
        # produce dummy gradient to avoid DDP warning.
        dummy = sum(param.view(-1)[0] for param in self.parameters()) * 0.0
        return empty + dummy


@MODELS.register_module()
@MODELS.register_module('deconv3d')
class ConvTranspose3d(nn.ConvTranspose3d):
    """``nn.ConvTranspose3d`` with a fallback for empty input on old torch."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the deconvolution, or build an empty output of the right shape.

        The fallback is taken only when the torch version is obsolete
        (parrots or <= 1.4) and ``x`` has no elements.
        """
        needs_fallback = obsolete_torch_version(TORCH_VERSION,
                                                (1, 4)) and x.numel() == 0
        if not needs_fallback:
            return super().forward(x)

        # Transposed-convolution output size along the last three dimensions.
        spatial = [
            (size - 1) * stride - 2 * pad + (dil * (kernel - 1) + 1) + out_pad
            for size, kernel, pad, stride, dil, out_pad in zip(
                x.shape[-3:], self.kernel_size, self.padding, self.stride,
                self.dilation, self.output_padding)
        ]
        out_shape = [x.shape[0], self.out_channels] + spatial
        empty = NewEmptyTensorOp.apply(x, out_shape)
        if not self.training:
            return empty
        # produce dummy gradient to avoid DDP warning.
        dummy = sum(param.view(-1)[0] for param in self.parameters()) * 0.0
        return empty + dummy


class MaxPool2d(nn.MaxPool2d):
    """``nn.MaxPool2d`` with a fallback for empty input on old torch."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pool ``x``, or build an empty output of the right shape.

        The fallback is taken only when the torch version is obsolete
        (parrots or <= 1.9) and ``x`` has no elements.
        """
        # PyTorch 1.9 does not support empty tensor inference yet
        if not (obsolete_torch_version(TORCH_VERSION, (1, 9))
                and x.numel() == 0):
            return super().forward(x)

        # ``ceil_mode`` decides how the fractional output size is rounded.
        rounding = math.ceil if self.ceil_mode else math.floor
        out_shape = list(x.shape[:2])
        pool_args = zip(x.shape[-2:], _pair(self.kernel_size),
                        _pair(self.padding), _pair(self.stride),
                        _pair(self.dilation))
        for size, kernel, pad, stride, dil in pool_args:
            extent = (size + 2 * pad - (dil * (kernel - 1) + 1)) / stride + 1
            out_shape.append(rounding(extent))
        return NewEmptyTensorOp.apply(x, out_shape)


class MaxPool3d(nn.MaxPool3d):
    """``nn.MaxPool3d`` with a fallback path for zero-element inputs.

    The fallback is taken only when
    ``obsolete_torch_version(TORCH_VERSION, (1, 9))`` is true and the input
    has no elements; every other call is forwarded to the parent class.
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # PyTorch 1.9 does not support empty tensor inference yet
        if not (obsolete_torch_version(TORCH_VERSION, (1, 9))
                and x.numel() == 0):
            return super().forward(x)

        leading_dims = list(x.shape[:2])
        # ceil_mode selects how the fractional window count is rounded.
        rounding = math.ceil if self.ceil_mode else math.floor
        pooled_dims = []
        for size, kernel, pad, stride, dil in zip(
                x.shape[-3:], _triple(self.kernel_size),
                _triple(self.padding), _triple(self.stride),
                _triple(self.dilation)):
            windows = (size + 2 * pad - (dil * (kernel - 1) + 1)) / stride + 1
            pooled_dims.append(rounding(windows))
        return NewEmptyTensorOp.apply(x, leading_dims + pooled_dims)


class Linear(torch.nn.Linear):
    """``torch.nn.Linear`` with a fallback path for zero-element inputs.

    The fallback is taken only when
    ``obsolete_torch_version(TORCH_VERSION, (1, 5))`` is true and the input
    has no elements; every other call is forwarded to the parent class.
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the linear layer, emulating it for empty inputs.

        Args:
            x (torch.Tensor): Input whose last dimension holds the features.

        Returns:
            torch.Tensor: Output with the same leading dimensions as ``x``
            and ``self.out_features`` as the last dimension.
        """
        # empty tensor forward of Linear layer is supported in Pytorch 1.6
        if obsolete_torch_version(TORCH_VERSION, (1, 5)) and x.numel() == 0:
            # nn.Linear maps (*, in_features) -> (*, out_features), so every
            # leading dimension is kept, not only the first one. For 2-D
            # inputs this is the same shape as before.
            out_shape = [*x.shape[:-1], self.out_features]
            empty = NewEmptyTensorOp.apply(x, out_shape)
            if self.training:
                # produce dummy gradient to avoid DDP warning.
                dummy = sum(p.view(-1)[0] for p in self.parameters()) * 0.0
                return empty + dummy
            else:
                return empty

        return super().forward(x)


================================================
FILE: mmcv/cnn/resnet.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import logging
from typing import Optional, Sequence, Tuple, Union

import torch.nn as nn
import torch.utils.checkpoint as cp
from mmengine.model import constant_init, kaiming_init
from mmengine.runner import load_checkpoint
from torch import Tensor


def conv3x3(in_planes: int,
            out_planes: int,
            stride: int = 1,
            dilation: int = 1):
    """Build a bias-free 3x3 ``nn.Conv2d`` whose padding equals its dilation.

    Args:
        in_planes (int): Number of input channels.
        out_planes (int): Number of output channels.
        stride (int): Convolution stride. Defaults to 1.
        dilation (int): Dilation rate, also used as the padding.
            Defaults to 1.
    """
    conv_kwargs = dict(
        kernel_size=3,
        stride=stride,
        padding=dilation,
        dilation=dilation,
        bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)


class BasicBlock(nn.Module):
    """Residual block made of two ``conv3x3`` + BatchNorm pairs.

    Only the first convolution receives ``stride`` and ``dilation``.
    ``style`` is validated but not otherwise used by this block, and
    ``with_cp`` must be falsy.
    """
    expansion = 1

    def __init__(self,
                 inplanes: int,
                 planes: int,
                 stride: int = 1,
                 dilation: int = 1,
                 downsample: Optional[nn.Module] = None,
                 style: str = 'pytorch',
                 with_cp: bool = False):
        super().__init__()
        assert style in ('pytorch', 'caffe')
        # Sub-modules are created in a fixed order so that parameter
        # registration (and random initialisation) stays the same.
        self.conv1 = conv3x3(inplanes, planes, stride, dilation)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride
        self.dilation = dilation
        assert not with_cp

    def forward(self, x: Tensor) -> Tensor:
        """Run both conv stages, add the shortcut and apply the final ReLU."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # The shortcut is the raw input unless a projection was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)


class Bottleneck(nn.Module):
    """Residual bottleneck block: 1x1 -> 3x3 -> 1x1 convolutions.

    The last convolution outputs ``planes * expansion`` channels.
    """
    expansion = 4

    def __init__(self,
                 inplanes: int,
                 planes: int,
                 stride: int = 1,
                 dilation: int = 1,
                 downsample: Optional[nn.Module] = None,
                 style: str = 'pytorch',
                 with_cp: bool = False):
        """Bottleneck block.

        If style is "pytorch", the stride-two layer is the 3x3 conv layer, if
        it is "caffe", the stride-two layer is the first 1x1 conv layer.
        """
        super().__init__()
        assert style in ('pytorch', 'caffe')
        # Decide which convolution carries the block stride.
        conv1_stride, conv2_stride = (
            (1, stride) if style == 'pytorch' else (stride, 1))
        out_planes = planes * self.expansion

        # Sub-modules are created in a fixed order so that parameter
        # registration (and random initialisation) stays the same.
        self.conv1 = nn.Conv2d(
            inplanes, planes, kernel_size=1, stride=conv1_stride, bias=False)
        self.conv2 = nn.Conv2d(
            planes,
            planes,
            kernel_size=3,
            stride=conv2_stride,
            padding=dilation,
            dilation=dilation,
            bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)
        self.relu = nn.ReLU(inplace=True)

        self.downsample = downsample
        self.stride = stride
        self.dilation = dilation
        self.with_cp = with_cp

    def forward(self, x: Tensor) -> Tensor:
        """Run the block, going through ``cp.checkpoint`` when enabled."""

        def _residual_branch(inp):
            # Everything up to (and including) the shortcut addition; the
            # final ReLU is applied outside of this function.
            feat = self.relu(self.bn1(self.conv1(inp)))
            feat = self.relu(self.bn2(self.conv2(feat)))
            feat = self.bn3(self.conv3(feat))

            shortcut = inp if self.downsample is None else self.downsample(inp)
            feat += shortcut
            return feat

        use_checkpoint = self.with_cp and x.requires_grad
        if use_checkpoint:
            out = cp.checkpoint(_residual_branch, x)
        else:
            out = _residual_branch(x)

        return self.relu(out)


def make_res_layer(block: nn.Module,
                   inplanes: int,
                   planes: int,
                   blocks: int,
                   stride: int = 1,
                   dilation: int = 1,
                   style: str = 'pytorch',
                   with_cp: bool = False) -> nn.Module:
    """Stack ``blocks`` instances of ``block`` into one ``nn.Sequential``.

    Only the first block receives ``stride`` and the optional 1x1
    conv + BatchNorm projection; the rest use stride 1 and take
    ``planes * block.expansion`` input channels.

    Args:
        block: Block class exposing an ``expansion`` attribute.
        inplanes (int): Input channels of the first block.
        planes (int): Base channel count passed to every block.
        blocks (int): Number of blocks in the stage.
        stride (int): Stride of the first block. Defaults to 1.
        dilation (int): Dilation passed to every block. Defaults to 1.
        style (str): Forwarded to every block. Defaults to 'pytorch'.
        with_cp (bool): Forwarded to every block. Defaults to False.

    Returns:
        nn.Module: The assembled stage.
    """
    out_planes = planes * block.expansion

    # A projection shortcut is needed whenever the first block changes the
    # resolution or the channel count.
    downsample = None
    if stride != 1 or inplanes != out_planes:
        projection = nn.Conv2d(
            inplanes, out_planes, kernel_size=1, stride=stride, bias=False)
        downsample = nn.Sequential(projection, nn.BatchNorm2d(out_planes))

    first_block = block(
        inplanes,
        planes,
        stride,
        dilation,
        downsample,
        style=style,
        with_cp=with_cp)
    remaining_blocks = [
        block(out_planes, planes, 1, dilation, style=style, with_cp=with_cp)
        for _ in range(1, blocks)
    ]

    return nn.Sequential(first_block, *remaining_blocks)


class ResNet(nn.Module):
    """ResNet backbone.

    Args:
        depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
        num_stages (int): Resnet stages, normally 4.
        strides (Sequence[int]): Strides of the first block of each stage.
        dilations (Sequence[int]): Dilation of each stage.
        out_indices (Sequence[int]): Output from which stages.
        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
            layer is the 3x3 conv layer, otherwise the stride-two layer is
            the first 1x1 conv layer.
        frozen_stages (int): Stages to be frozen (all param fixed). -1 means
            not freezing any parameters.
        bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze
            running stats (mean and var).
        bn_frozen (bool): Whether to freeze weight and bias of BN layers.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed.
    """

    # depth -> (block class, number of blocks in each of the four stages)
    arch_settings = {
        18: (BasicBlock, (2, 2, 2, 2)),
        34: (BasicBlock, (3, 4, 6, 3)),
        50: (Bottleneck, (3, 4, 6, 3)),
        101: (Bottleneck, (3, 4, 23, 3)),
        152: (Bottleneck, (3, 8, 36, 3))
    }

    def __init__(self,
                 depth: int,
                 num_stages: int = 4,
                 strides: Sequence[int] = (1, 2, 2, 2),
                 dilations: Sequence[int] = (1, 1, 1, 1),
                 out_indices: Sequence[int] = (0, 1, 2, 3),
                 style: str = 'pytorch',
                 frozen_stages: int = -1,
                 bn_eval: bool = True,
                 bn_frozen: bool = False,
                 with_cp: bool = False):
        super().__init__()
        if depth not in self.arch_settings:
            raise KeyError(f'invalid depth {depth} for resnet')
        assert num_stages >= 1 and num_stages <= 4
        block, stage_blocks = self.arch_settings[depth]
        stage_blocks = stage_blocks[:num_stages]  # type: ignore
        assert len(strides) == len(dilations) == num_stages
        assert max(out_indices) < num_stages

        self.out_indices = out_indices
        self.style = style
        self.frozen_stages = frozen_stages
        self.bn_eval = bn_eval
        self.bn_frozen = bn_frozen
        self.with_cp = with_cp

        # Stem: 7x7 stride-2 conv, BN, ReLU and a 3x3 stride-2 max pool.
        self.inplanes: int = 64
        self.conv1 = nn.Conv2d(
            3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Stages are registered as ``layer1`` ... ``layerN``; only their
        # names are stored here and resolved with ``getattr`` in forward().
        self.res_layers = []
        for i, num_blocks in enumerate(stage_blocks):
            stride = strides[i]
            dilation = dilations[i]
            planes = 64 * 2**i
            res_layer = make_res_layer(
                block,
                self.inplanes,
                planes,
                num_blocks,
                stride=stride,
                dilation=dilation,
                style=self.style,
                with_cp=with_cp)
            self.inplanes = planes * block.expansion  # type: ignore
            layer_name = f'layer{i + 1}'
            self.add_module(layer_name, res_layer)
            self.res_layers.append(layer_name)

        # Channel count produced by the last built stage.
        self.feat_dim = block.expansion * 64 * 2**(  # type: ignore
            len(stage_blocks) - 1)

    def init_weights(self, pretrained: Optional[str] = None) -> None:
        """Load a checkpoint or initialise conv and BN layers.

        Args:
            pretrained (str, optional): Checkpoint passed to
                ``load_checkpoint`` (non-strict). When None, every
                ``nn.Conv2d`` goes through ``kaiming_init`` and every
                ``nn.BatchNorm2d`` through ``constant_init(m, 1)``.

        Raises:
            TypeError: If ``pretrained`` is neither a str nor None.
        """
        if isinstance(pretrained, str):
            logger = logging.getLogger()
            load_checkpoint(self, pretrained, strict=False, logger=logger)
        elif pretrained is None:
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    kaiming_init(m)
                elif isinstance(m, nn.BatchNorm2d):
                    constant_init(m, 1)
        else:
            raise TypeError('pretrained must be a str or None')

    def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor]]:
        """Run the stem and all stages.

        Returns:
            A single tensor when exactly one stage is selected by
            ``out_indices``, otherwise a tuple of the selected stage outputs.
        """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        outs = []
        for i, layer_name in enumerate(self.res_layers):
            res_layer = getattr(self, layer_name)
            x = res_layer(x)
            if i in self.out_indices:
                outs.append(x)
        if len(outs) == 1:
            return outs[0]
        else:
            return tuple(outs)

    def train(self, mode: bool = True) -> 'ResNet':
        """Switch train/eval mode while honouring the freezing options.

        Args:
            mode (bool): True for training mode, False for evaluation mode.

        Returns:
            ResNet: ``self``, matching ``nn.Module.train`` so that
            ``model.train()`` and ``model.eval()`` can be chained or assigned.
        """
        super().train(mode)
        if self.bn_eval:
            # Keep BN running statistics fixed regardless of ``mode``.
            for m in self.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eval()
                    if self.bn_frozen:
                        for params in m.parameters():
                            params.requires_grad = False
        if mode and self.frozen_stages >= 0:
            # Freeze the stem; bn1.parameters() already covers its weight
            # and bias.
            for param in self.conv1.parameters():
                param.requires_grad = False
            for param in self.bn1.parameters():
                param.requires_grad = False
            self.bn1.eval()
            for i in range(1, self.frozen_stages + 1):
                mod = getattr(self, f'layer{i}')
                mod.eval()
                for param in mod.parameters():
                    param.requires_grad = False
        # nn.Module.eval() returns the result of train(False), so returning
        # self here is what makes ``model.eval()`` return the module.
        return self


================================================
FILE: mmcv/cnn/rfsearch/__init__.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from .operator import BaseConvRFSearchOp, Conv2dRFSearchOp
from .search import RFSearchHook

__all__ = ['BaseConvRFSearchOp', 'Conv2dRFSearchOp', 'RFSearchHook']


================================================
FILE: mmcv/cnn/rfsearch/operator.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import copy

import numpy as np
import torch
import torch.nn as nn
from mmengine.logging import print_log
from mmengine.model import BaseModule
from torch import Tensor

from .utils import expand_rates, get_single_padding


class BaseConvRFSearchOp(BaseModule):
    """Base class of ConvRFSearchOp.

    Stores the wrapped layer and the search configuration, and provides the
    branch-weight normalization used by subclasses.

    Args:
        op_layer (nn.Module): pytorch module, e.g. Conv2d
        global_config (dict): config dict.
    """

    def __init__(self, op_layer: nn.Module, global_config: dict):
        super().__init__()
        self.op_layer = op_layer
        self.global_config = global_config

    def normlize(self, weights: nn.Parameter) -> nn.Parameter:
        """Scale the absolute values of ``weights`` by their total.

        Args:
            weights (nn.Parameter): Weights to be normalized.

        Returns:
            nn.Parameter: ``|weights|`` divided by ``sum(|weights|)``.
        """
        magnitudes = weights.abs()
        return magnitudes / magnitudes.sum()


class Conv2dRFSearchOp(BaseConvRFSearchOp):
    """Enable Conv2d with receptive field searching ability.

    Args:
        op_layer (nn.Module): pytorch module, e.g. Conv2d
        global_config (dict): config dict. Must not be None.
            By default this must include:

            - "init_alphas": The value for initializing weights of each branch.
            - "num_branches": The controller of the size of
              search space (the number of branches).
            - "exp_rate": The controller of the sparsity of search space.
            - "mmin": The minimum dilation rate.
            - "mmax": The maximum dilation rate.

            Extra keys may exist, but are used by RFSearchHook, e.g., "step",
            "max_step", "search_interval", and "skip_layer".
        verbose (bool): Determines whether to print rf-next
            related logging messages.
            Defaults to True.
    """

    def __init__(self,
                 op_layer: nn.Module,
                 global_config: dict,
                 verbose: bool = True):
        super().__init__(op_layer, global_config)
        assert global_config is not None, 'global_config is None'
        self.num_branches = global_config['num_branches']
        assert self.num_branches in [2, 3]
        self.verbose = verbose

        base_dilation = op_layer.dilation
        candidates = expand_rates(base_dilation, global_config)
        self.dilation_rates = self._pin_fixed_axes(candidates, base_dilation)

        # One learnable weight per branch, set to the configured constant
        # right after the log message.
        self.branch_weights = nn.Parameter(torch.Tensor(self.num_branches))
        if self.verbose:
            print_log(f'Expand as {self.dilation_rates}', 'current')
        nn.init.constant_(self.branch_weights, global_config['init_alphas'])

    def _pin_fixed_axes(self, rates, base_dilation):
        """Keep the base dilation on axes whose kernel size is 1 or even."""
        kernel_size = self.op_layer.kernel_size
        if kernel_size[0] == 1 or kernel_size[0] % 2 == 0:
            rates = [(base_dilation[0], rate[1]) for rate in rates]
        if kernel_size[1] == 1 or kernel_size[1] % 2 == 0:
            rates = [(rate[0], base_dilation[1]) for rate in rates]
        return rates

    def forward(self, input: Tensor) -> Tensor:
        """Convolve ``input`` once per candidate rate and merge the results.

        With a single candidate the plain convolution is returned; otherwise
        every branch is scaled by its normalized weight and the branches are
        summed.
        """
        rates = self.dilation_rates
        norm_w = self.normlize(self.branch_weights[:len(rates)])
        layer = self.op_layer

        def _branch(rate):
            # Same weights for every branch; only dilation and the matching
            # padding differ.
            return nn.functional.conv2d(
                input,
                weight=layer.weight,
                bias=layer.bias,
                stride=layer.stride,
                padding=self.get_padding(rate),
                dilation=rate,
                groups=layer.groups,
            )

        if len(rates) == 1:
            return _branch(rates[0])

        branches = [_branch(rate) * norm_w[idx] for idx, rate in enumerate(rates)]
        merged = branches[0]
        for branch in branches[1:]:
            merged += branch
        return merged

    def estimate_rates(self) -> None:
        """Estimate new dilation rate based on trained branch_weights."""
        rates = self.dilation_rates
        norm_w = self.normlize(self.branch_weights[:len(rates)])
        if self.verbose:
            print_log(
                'Estimate dilation {} with weight {}.'.format(
                    rates,
                    norm_w.detach().cpu().numpy().tolist()), 'current')

        # Weighted average of the candidate rates, computed per axis.
        weighted_h, weighted_w, total_weight = 0, 0, 0
        for idx, rate in enumerate(rates):
            weight = norm_w[idx].item()
            weighted_h += weight * rate[0]
            weighted_w += weight * rate[1]
            total_weight += weight

        # Round to an integer rate and clamp it into [mmin, mmax].
        estimated = tuple(
            np.clip(
                int(round(axis_total / total_weight)),
                self.global_config['mmin'],
                self.global_config['mmax']).item()
            for axis_total in (weighted_h, weighted_w))

        self.op_layer.dilation = estimated
        self.op_layer.padding = self.get_padding(self.op_layer.dilation)
        self.dilation_rates = [estimated]
        if self.verbose:
            print_log(f'Estimate as {estimated}', 'current')

    def expand_rates(self) -> None:
        """Expand dilation rate."""
        base_dilation = self.op_layer.dilation
        candidates = expand_rates(base_dilation, self.global_config)
        candidates = self._pin_fixed_axes(candidates, base_dilation)

        self.dilation_rates = copy.deepcopy(candidates)
        if self.verbose:
            print_log(f'Expand as {self.dilation_rates}', 'current')
        # Every new search round starts from the configured constant.
        nn.init.constant_(self.branch_weights,
                          self.global_config['init_alphas'])

    def get_padding(self, dilation) -> tuple:
        """Return the per-axis padding for ``dilation``."""
        kernel_size = self.op_layer.kernel_size
        stride = self.op_layer.stride
        pad_h = get_single_padding(kernel_size[0], stride[0], dilation[0])
        pad_w = get_single_padding(kernel_size[1], stride[1], dilation[1])
        return (pad_h, pad_w)


================================================
FILE: mmcv/cnn/rfsearch/search.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import os
from typing import Dict, Optional

import mmengine
import torch  # noqa
import torch.nn as nn
from mmengine.hooks import Hook
from mmengine.logging import print_log
from mmengine.registry import HOOKS

from .operator import BaseConvRFSearchOp, Conv2dRFSearchOp  # noqa
from .utils import get_single_padding, write_to_json


@HOOKS.register_module()
class RFSearchHook(Hook):
    """Receptive field search via dilation rates.

    Please refer to `RF-Next: Efficient Receptive Field
    Search for Convolutional Neural Networks
    <https://arxiv.org/abs/2206.06637>`_ for more details.


    Args:
        mode (str, optional): It can be set to the following types:
            'search', 'fixed_single_branch', or 'fixed_multi_branch'.
            Defaults to 'search'.
        config (Dict, optional): config dict of search. The dict is used
            as-is (not copied) and a "structure" entry is written into it.
            By default this config contains "search",
            and config["search"] must include:

            - "step": recording the current searching step.
            - "max_step": The maximum number of searching steps
              to update the structures.
            - "search_interval": The interval (epoch/iteration)
              between two updates.
            - "exp_rate": The controller of the sparsity of search space.
            - "init_alphas": The value for initializing weights of each branch.
            - "mmin": The minimum dilation rate.
            - "mmax": The maximum dilation rate.
            - "num_branches": The controller of the size of
              search space (the number of branches).
            - "skip_layer": The modules in skip_layer will be ignored
              during the receptive field search.
        rfstructure_file (str, optional): Path to load searched receptive
            fields of the model. Defaults to None.
        by_epoch (bool, optional): Determine to perform step by epoch or
            by iteration. If set to True, it will step by epoch. Otherwise, by
            iteration. Defaults to True.
        verbose (bool): Determines whether to print rf-next related logging
            messages. Defaults to True.
    """

    def __init__(self,
                 mode: str = 'search',
                 config: Optional[Dict] = None,
                 rfstructure_file: Optional[str] = None,
                 by_epoch: bool = True,
                 verbose: bool = True):
        assert mode in ['search', 'fixed_single_branch', 'fixed_multi_branch']
        # Use a fresh dict per instance: a shared ``{}`` default would be
        # mutated by the "structure" assignment below and leak between hooks.
        if config is None:
            config = {}
        self.config = config
        self.config['structure'] = {}
        self.verbose = verbose
        if rfstructure_file is not None:
            rfstructure = mmengine.load(rfstructure_file)['structure']
            self.config['structure'] = rfstructure
        self.mode = mode
        self.num_branches = self.config['search']['num_branches']
        self.by_epoch = by_epoch

    @staticmethod
    def _resolve_search_op(search_op: str):
        """Return the ``torch.nn`` attribute named by ``search_op``."""
        target = nn
        for attr in search_op.split('.'):
            target = getattr(target, attr)
        return target

    @staticmethod
    def _resolve_wrapper(search_op: str):
        """Return the ``<search_op>RFSearchOp`` class from this module."""
        wrapper_name = search_op + 'RFSearchOp'
        try:
            return globals()[wrapper_name]
        except KeyError:
            raise NameError(f"name '{wrapper_name}' is not defined") from None

    @staticmethod
    def _has_searchable_kernel(module: nn.Module) -> bool:
        """Whether at least one kernel axis is odd and larger than 1."""
        kernel_size = module.kernel_size
        return (1 < kernel_size[0] and 0 != kernel_size[0] % 2) or \
            (1 < kernel_size[1] and 0 != kernel_size[1] % 2)

    def init_model(self, model: nn.Module):
        """Init model with search ability.

        Args:
            model (nn.Module): pytorch model

        Raises:
            NotImplementedError: only support three modes:
                search/fixed_single_branch/fixed_multi_branch
        """
        if self.verbose:
            print_log('RFSearch init begin.', 'current')
        if self.mode == 'search':
            # A loaded structure is applied first, then the convs are wrapped.
            if self.config['structure']:
                self.set_model(model, search_op='Conv2d')
            self.wrap_model(model, search_op='Conv2d')
        elif self.mode == 'fixed_single_branch':
            self.set_model(model, search_op='Conv2d')
        elif self.mode == 'fixed_multi_branch':
            self.set_model(model, search_op='Conv2d')
            self.wrap_model(model, search_op='Conv2d')
        else:
            raise NotImplementedError
        if self.verbose:
            print_log('RFSearch init end.', 'current')

    def after_train_epoch(self, runner):
        """Performs a dilation searching step after one training epoch."""
        if self.by_epoch and self.mode == 'search':
            self.step(runner.model, runner.work_dir)

    def after_train_iter(self, runner, batch_idx, data_batch, outputs):
        """Performs a dilation searching step after one training iteration."""
        if not self.by_epoch and self.mode == 'search':
            self.step(runner.model, runner.work_dir)

    def step(self, model: nn.Module, work_dir: str) -> None:
        """Performs a dilation searching step.

        The step counter is always incremented; the structure is updated and
        dumped to ``work_dir`` only when the counter is a multiple of
        "search_interval" and still below "max_step".

        Args:
            model (nn.Module): pytorch model
            work_dir (str): Directory to save the searching results.
        """
        search_cfg = self.config['search']
        search_cfg['step'] += 1
        current_step = search_cfg['step']
        if current_step % search_cfg['search_interval'] == 0 and \
                current_step < search_cfg['max_step']:
            self.estimate_and_expand(model)
            # NOTE(review): keys here come from ``named_modules()`` while
            # ``set_model`` looks names up with a leading 'module.' --
            # confirm both agree for the wrapped model in use.
            for name, module in model.named_modules():
                if isinstance(module, BaseConvRFSearchOp):
                    self.config['structure'][name] = module.op_layer.dilation

            write_to_json(
                self.config,
                os.path.join(
                    work_dir,
                    'local_search_config_step%d.json' %
                    self.config['search']['step'],
                ),
            )

    def estimate_and_expand(self, model: nn.Module) -> None:
        """Estimate and search for RFConvOp.

        Args:
            model (nn.Module): pytorch model
        """
        for module in model.modules():
            if isinstance(module, BaseConvRFSearchOp):
                module.estimate_rates()
                module.expand_rates()

    def wrap_model(self,
                   model: nn.Module,
                   search_op: str = 'Conv2d',
                   prefix: str = '') -> None:
        """Wrap model to support searchable conv op.

        Children are visited recursively. Any child whose full name contains
        an entry of ``config['search']['skip_layer']`` is skipped together
        with its sub-modules.

        Args:
            model (nn.Module): pytorch model
            search_op (str): The module that uses RF search.
                Defaults to 'Conv2d'.
            prefix (str): Prefix for function recursion. Defaults to ''.
        """
        op_cls = self._resolve_search_op(search_op)
        for name, module in model.named_children():
            if prefix == '':
                fullname = 'module.' + name
            else:
                fullname = prefix + '.' + name
            if self.config['search']['skip_layer'] is not None:
                if any(layer in fullname
                       for layer in self.config['search']['skip_layer']):
                    continue
            if isinstance(module, op_cls):
                if self._has_searchable_kernel(module):
                    wrapper_cls = self._resolve_wrapper(search_op)
                    moduleWrap = wrapper_cls(module, self.config['search'],
                                             self.verbose)
                    moduleWrap = moduleWrap.to(module.weight.device)
                    if self.verbose:
                        print_log(
                            'Wrap model %s to %s.' %
                            (str(module), str(moduleWrap)), 'current')
                    setattr(model, name, moduleWrap)
            elif not isinstance(module, BaseConvRFSearchOp):
                self.wrap_model(module, search_op, fullname)

    def set_model(self,
                  model: nn.Module,
                  search_op: str = 'Conv2d',
                  init_rates: Optional[int] = None,
                  prefix: str = '') -> None:
        """Set model based on config.

        Dilation and padding of every matching layer are overwritten with the
        entry stored under its full name in ``config['structure']``. An int
        entry is expanded to a two-element list in the config.

        Args:
            model (nn.Module): pytorch model
            search_op (str): The module that uses RF search.
                Defaults to 'Conv2d'.
            init_rates (int, optional): Only passed on to recursive calls;
                it is not otherwise used here. Defaults to None.
            prefix (str): Prefix for function recursion. Defaults to ''.
        """
        op_cls = self._resolve_search_op(search_op)
        for name, module in model.named_children():
            if prefix == '':
                fullname = 'module.' + name
            else:
                fullname = prefix + '.' + name
            if self.config['search']['skip_layer'] is not None:
                if any(layer in fullname
                       for layer in self.config['search']['skip_layer']):
                    continue
            if isinstance(module, op_cls):
                if self._has_searchable_kernel(module):
                    rate = self.config['structure'][fullname]
                    if isinstance(rate, int):
                        rate = [rate, rate]
                        self.config['structure'][fullname] = rate
                    module.dilation = (rate[0], rate[1])
                    module.padding = (
                        get_single_padding(module.kernel_size[0],
                                           module.stride[0], rate[0]),
                        get_single_padding(module.kernel_size[1],
                                           module.stride[1], rate[1]))
                    if self.verbose:
                        print_log(
                            'Set module %s dilation as: [%d %d]' %
                            (fullname, module.dilation[0], module.dilation[1]),
                            'current')
            elif not isinstance(module, BaseConvRFSearchOp):
                self.set_model(module, search_op, init_rates, fullname)


================================================
FILE: mmcv/cnn/rfsearch/utils.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import mmengine
import numpy as np


def write_to_json(config: dict, filename: str):
    """Dump ``config`` as JSON into ``filename``.

    The file is opened for writing with UTF-8 encoding and serialized through
    ``mmengine.dump``.

    Args:
        config (dict): Config to be saved.
        filename (str): Path to save config.
    """
    with open(filename, 'w', encoding='utf-8') as json_file:
        mmengine.dump(config, json_file, file_format='json')


def expand_rates(dilation: tuple, config: dict) -> list:
    """Expand dilation rate according to config.

    Candidates are the per-axis dilation scaled by ``1 - exp_rate`` and
    ``1 + exp_rate``, rounded and clamped into ``[mmin, mmax]``. The
    unchanged ``dilation`` is added when ``num_branches`` is odd. Duplicates
    are dropped, keeping first-occurrence order.

    Args:
        dilation (tuple): Current dilation; elements 0 and 1 are used.
        config (dict): config dict providing "exp_rate", "num_branches",
            "mmin" and "mmax".

    Returns:
        list: list of expanded dilation rates
    """
    exp_rate = config['exp_rate']
    num_branches = config['num_branches']

    def _scaled(factor):
        # Scale both axes, round to an int, then clamp into [mmin, mmax].
        return tuple(
            np.clip(
                int(round(factor * axis_rate)), config['mmin'],
                config['mmax']).item()
            for axis_rate in (dilation[0], dilation[1]))

    pairs = num_branches // 2
    larger = [_scaled(1 + exp_rate) for _ in range(pairs)]
    smaller = [_scaled(1 - exp_rate) for _ in range(pairs)]
    smaller.reverse()

    centre = [] if num_branches % 2 == 0 else [dilation]
    candidates = smaller + centre + larger

    # dict keys keep insertion order, so this dedupes by first occurrence.
    return list(dict.fromkeys(candidates))


def get_single_padding(kernel_size: int,
                       stride: int = 1,
                       dilation: int = 1) -> int:
    """Return ``((stride - 1) + dilation * (kernel_size - 1)) // 2``.

    Args:
        kernel_size (int): Kernel size along one axis.
        stride (int): Stride along the same axis. Defaults to 1.
        dilation (int): Dilation along the same axis. Defaults to 1.

    Returns:
        int: Padding for that axis.
    """
    dilated_span = dilation * (kernel_size - 1)
    return ((stride - 1) + dilated_span) // 2


================================================
FILE: mmcv/cnn/utils/__init__.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from .flops_counter import get_model_complexity_info
from .fuse_conv_bn import fuse_conv_bn

__all__ = ['get_model_complexity_info', 'fuse_conv_bn']


================================================
FILE: mmcv/cnn/utils/flops_counter.py
================================================
# Modified from flops-counter.pytorch by Vladislav Sovrasov
# original repo: https://github.com/sovrasov/flops-counter.pytorch

# MIT License

# Copyright (c) 2018 Vladislav Sovrasov

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import sys
import warnings
from functools import partial
from typing import Any, Callable, Dict, Optional, TextIO, Tuple

import numpy as np
import torch
import torch.nn as nn

from mmcv.cnn.bricks import (Conv2d, Conv3d, ConvTranspose2d, Linear,
                             MaxPool2d, MaxPool3d)


def get_model_complexity_info(model: nn.Module,
                              input_shape: tuple,
                              print_per_layer_stat: bool = True,
                              as_strings: bool = True,
                              input_constructor: Optional[Callable] = None,
                              flush: bool = False,
                              ost: TextIO = sys.stdout) -> tuple:
    """Get complexity information of a model.

    This method can calculate FLOPs and parameter counts of a model with
    corresponding input shape. It can also print complexity information for
    each layer in a model.

    Note that the model is switched to eval mode by this function and is not
    switched back afterwards.

    Supported layers are listed as below:
        - Convolutions: ``nn.Conv1d``, ``nn.Conv2d``, ``nn.Conv3d``.
        - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``,
          ``nn.LeakyReLU``, ``nn.ReLU6``.
        - Poolings: ``nn.MaxPool1d``, ``nn.MaxPool2d``, ``nn.MaxPool3d``,
          ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``,
          ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``,
          ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``,
          ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``.
        - BatchNorms: ``nn.BatchNorm1d``, ``nn.BatchNorm2d``,
          ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``,
          ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``.
        - Linear: ``nn.Linear``.
        - Deconvolution: ``nn.ConvTranspose2d``.
        - Upsample: ``nn.Upsample``.

    Args:
        model (nn.Module): The model for complexity calculation.
        input_shape (tuple): Input shape used for calculation, without the
            batch dimension (a batch dimension of 1 is prepended).
        print_per_layer_stat (bool): Whether to print complexity information
            for each layer in a model. Default: True.
        as_strings (bool): Output FLOPs and params counts in a string form.
            Default: True.
        input_constructor (None | callable): If specified, it is called with
            ``input_shape`` and must return a dict, which is passed to the
            model as keyword arguments. Otherwise, an uninitialized tensor
            of shape ``(1, *input_shape)`` is used to calculate FLOPs.
            Default: None.
        flush (bool): same as that in :func:`print`. Default: False.
        ost (stream): same as ``file`` param in :func:`print`.
            Default: sys.stdout.

    Returns:
        tuple[float | str]: If ``as_strings`` is set to True, it will return
        FLOPs and parameter counts in a string format. otherwise, it will
        return those in a float number format.
    """
    assert type(input_shape) is tuple
    assert len(input_shape) >= 1
    assert isinstance(model, nn.Module)
    flops_model = add_flops_counting_methods(model)
    flops_model.eval()
    flops_model.start_flops_count()
    try:
        if input_constructor:
            constructed_input = input_constructor(input_shape)
            _ = flops_model(**constructed_input)
        else:
            # Models without parameters (``nn.ReLU()``, ``nn.AvgPool2d``,
            # etc.) have no dtype/device to copy, so the defaults are used.
            first_param = next(flops_model.parameters(), None)
            if first_param is None:
                batch = torch.ones(()).new_empty((1, *input_shape))
            else:
                batch = torch.ones(()).new_empty(
                    (1, *input_shape),
                    dtype=first_param.dtype,
                    device=first_param.device)

            _ = flops_model(batch)

        flops_count, params_count = flops_model.compute_average_flops_cost()
        if print_per_layer_stat:
            print_model_with_flops(
                flops_model, flops_count, params_count, ost=ost, flush=flush)
    finally:
        # Always detach the counting hooks, even when the forward pass or
        # the printing fails, so that they do not stay on the user's model.
        flops_model.stop_flops_count()

    if as_strings:
        return flops_to_string(flops_count), params_to_string(params_count)

    return flops_count, params_count


def flops_to_string(flops: float,
                    units: Optional[str] = 'GFLOPs',
                    precision: int = 2) -> str:
    """Format a FLOPs number as a human-readable string.

    Note that here a multiply-add is counted as one FLOP.

    Args:
        flops (float): FLOPs number to be converted.
        units (str | None): Target unit. Options are None, 'GFLOPs',
            'MFLOPs', 'KFLOPs', 'FLOPs'. With None the largest unit whose
            integer part is positive is picked automatically. Any other
            value yields the unscaled number followed by ' FLOPs'.
            Default: 'GFLOPs'.
        precision (int): Digit number after the decimal point. It is not
            applied when the number is printed unscaled. Default: 2.

    Returns:
        str: The converted FLOPs number with units.

    Examples:
        >>> flops_to_string(1e9)
        '1.0 GFLOPs'
        >>> flops_to_string(2e5, 'MFLOPs')
        '0.2 MFLOPs'
        >>> flops_to_string(3e-9, None)
        '3e-09 FLOPs'
    """
    # (unit name, power of ten), from the largest unit to the smallest.
    scaled_units = (('GFLOPs', 9), ('MFLOPs', 6), ('KFLOPs', 3))
    unscaled = str(flops) + ' FLOPs'

    if units is None:
        for unit_name, exponent in scaled_units:
            if flops // 10**exponent > 0:
                scaled = round(flops / 10.**exponent, precision)
                return str(scaled) + ' ' + unit_name
        return unscaled

    for unit_name, exponent in scaled_units:
        if units == unit_name:
            scaled = round(flops / 10.**exponent, precision)
            return str(scaled) + ' ' + units
    return unscaled


def params_to_string(num_params: float,
                     units: Optional[str] = None,
                     precision: int = 2) -> str:
    """Convert parameter number into a string.

    Args:
        num_params (float): Parameter number to be converted.
        units (str | None): Converted parameter units. Options are None,
            'M', 'K' and ''. If set to None, it will automatically choose
            the most suitable unit for Parameter number (the automatically
            chosen thousands suffix is the lower-case ' k'). Any value other
            than None, 'M' and 'K' yields the unscaled number.
            Default: None.
        precision (int): Digit number after the decimal point. It is not
            applied when the number is printed unscaled. Default: 2.

    Returns:
        str: The converted parameter number with units.

    Examples:
        >>> params_to_string(1e9)
        '1000.0 M'
        >>> params_to_string(2e5)
        '200.0 k'
        >>> params_to_string(3e-9)
        '3e-09'
    """
    if units is None:
        if num_params // 10**6 > 0:
            return str(round(num_params / 10**6, precision)) + ' M'
        # Compare against 0 like the branch above; a bare truthiness test
        # would also match negative numbers (floor division yields -1).
        if num_params // 10**3 > 0:
            return str(round(num_params / 10**3, precision)) + ' k'
        return str(num_params)

    if units == 'M':
        return str(round(num_params / 10.**6, precision)) + ' ' + units
    if units == 'K':
        return str(round(num_params / 10.**3, precision)) + ' ' + units
    return str(num_params)


def print_model_with_flops(model: nn.Module,
                           total_flops: float,
                           total_params: float,
                           units: Optional[str] = 'GFLOPs',
                           precision: int = 3,
                           ost: TextIO = sys.stdout,
                           flush: bool = False) -> None:
    """Print a model with FLOPs for each layer.

    The ``extra_repr`` of every submodule is temporarily replaced so that
    ``print(model)`` shows the parameter count, FLOPs and their share of the
    totals next to each layer. The temporary attributes are removed again
    after printing.

    Args:
        model (nn.Module): The model to be printed. Its modules are expected
            to carry the ``__flops__``/``__params__``/``__batch_counter__``
            attributes read below.
        total_flops (float): Total FLOPs of the model.
        total_params (float): Total parameter counts of the model.
        units (str | None): Converted FLOPs units. Default: 'GFLOPs'.
        precision (int): Digit number after the decimal point. Default: 3.
        ost (stream): same as `file` param in :func:`print`.
            Default: sys.stdout.
        flush (bool): same as that in :func:`print`. Default: False.

    Example:
        >>> class ExampleModel(nn.Module):

        >>> def __init__(self):
        >>>     super().__init__()
        >>>     self.conv1 = nn.Conv2d(3, 8, 3)
        >>>     self.conv2 = nn.Conv2d(8, 256, 3)
        >>>     self.conv3 = nn.Conv2d(256, 8, 3)
        >>>     self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        >>>     self.flatten = nn.Flatten()
        >>>     self.fc = nn.Linear(8, 1)

        >>> def forward(self, x):
        >>>     x = self.conv1(x)
        >>>     x = self.conv2(x)
        >>>     x = self.conv3(x)
        >>>     x = self.avg_pool(x)
        >>>     x = self.flatten(x)
        >>>     x = self.fc(x)
        >>>     return x

        >>> model = ExampleModel()
        >>> x = (3, 16, 16)
        to print the complexity information state for each layer, you can use
        >>> get_model_complexity_info(model, x)
        or directly use
        >>> print_model_with_flops(model, 4579784.0, 37361)
        ExampleModel(
          0.037 M, 100.000% Params, 0.005 GFLOPs, 100.000% FLOPs,
          (conv1): Conv2d(0.0 M, 0.600% Params, 0.0 GFLOPs, 0.959% FLOPs, 3, 8, kernel_size=(3, 3), stride=(1, 1))  # noqa: E501
          (conv2): Conv2d(0.019 M, 50.020% Params, 0.003 GFLOPs, 58.760% FLOPs, 8, 256, kernel_size=(3, 3), stride=(1, 1))
          (conv3): Conv2d(0.018 M, 49.356% Params, 0.002 GFLOPs, 40.264% FLOPs, 256, 8, kernel_size=(3, 3), stride=(1, 1))
          (avg_pool): AdaptiveAvgPool2d(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.017% FLOPs, output_size=(1, 1))
          (flatten): Flatten(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, )
          (fc): Linear(0.0 M, 0.024% Params, 0.0 GFLOPs, 0.000% FLOPs, in_features=8, out_features=1, bias=True)
        )
    """

    def share_of_total(part, total):
        # A model without parameters (or without counted FLOPs) has a total
        # of 0; report a 0% share instead of raising ZeroDivisionError.
        return part / total if total else 0.0

    def accumulate_params(self):
        if is_supported_instance(self):
            return self.__params__
        # Unsupported container: sum over the direct children recursively.
        return sum(child.accumulate_params() for child in self.children())

    def accumulate_flops(self):
        if is_supported_instance(self):
            return self.__flops__ / model.__batch_counter__
        return sum(child.accumulate_flops() for child in self.children())

    def flops_repr(self):
        accumulated_num_params = self.accumulate_params()
        accumulated_flops_cost = self.accumulate_flops()
        params_share = share_of_total(accumulated_num_params, total_params)
        flops_share = share_of_total(accumulated_flops_cost, total_flops)
        return ', '.join([
            params_to_string(
                accumulated_num_params, units='M', precision=precision),
            f'{params_share:.3%} Params',
            flops_to_string(
                accumulated_flops_cost, units=units, precision=precision),
            f'{flops_share:.3%} FLOPs',
            self.original_extra_repr()
        ])

    def add_extra_repr(m):
        # Bind the helpers to the module so they behave like methods.
        m.accumulate_flops = accumulate_flops.__get__(m)
        m.accumulate_params = accumulate_params.__get__(m)
        flops_extra_repr = flops_repr.__get__(m)
        if m.extra_repr != flops_extra_repr:
            m.original_extra_repr = m.extra_repr
            m.extra_repr = flops_extra_repr
            assert m.extra_repr != m.original_extra_repr

    def del_extra_repr(m):
        if hasattr(m, 'original_extra_repr'):
            m.extra_repr = m.original_extra_repr
            del m.original_extra_repr
        if hasattr(m, 'accumulate_flops'):
            del m.accumulate_flops
        # Also drop the second helper bound in ``add_extra_repr`` so that no
        # temporary attribute is left on the module.
        if hasattr(m, 'accumulate_params'):
            del m.accumulate_params

    model.apply(add_extra_repr)
    print(model, file=ost, flush=flush)
    model.apply(del_extra_repr)


def get_model_parameters_number(model: nn.Module) -> float:
    """Count the trainable parameters of a model.

    Only parameters with ``requires_grad`` set are counted.

    Args:
        model (nn.module): The model for parameter number calculation.

    Returns:
        float: Number of elements over all parameters that require grad.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


def add_flops_counting_methods(net_main_module: nn.Module) -> nn.Module:
    """Attach the FLOPs counting methods to a module instance.

    ``start_flops_count``, ``stop_flops_count``, ``reset_flops_count`` and
    ``compute_average_flops_cost`` are bound to ``net_main_module`` and the
    counters are reset once.

    Args:
        net_main_module (nn.Module): The module to be extended in place.

    Returns:
        nn.Module: The same module object that was passed in.
    """
    # The functions are bound to the existing object (instead of subclassing)
    # so that each of them receives the module as ``self``.
    counting_methods = (start_flops_count, stop_flops_count,
                        reset_flops_count, compute_average_flops_cost)
    for method in counting_methods:
        setattr(net_main_module, method.__name__,
                method.__get__(net_main_module))

    net_main_module.reset_flops_count()

    return net_main_module


def compute_average_flops_cost(self) -> Tuple[float, float]:
    """Compute the average FLOPs cost and the parameter count.

    This method becomes available on a net object after
    `add_flops_counting_methods()` has been called on it.

    Returns:
        tuple[float, float]: The FLOPs accumulated over all supported
        submodules divided by the batch counter, and the number of
        parameters of the model.
    """
    total_flops = sum(module.__flops__ for module in self.modules()
                      if is_supported_instance(module))
    num_params = get_model_parameters_number(self)
    return total_flops / self.__batch_counter__, num_params


def start_flops_count(self) -> None:
    """Activate the computation of mean flops consumption per image.

    This method becomes available on a net object after
    ``add_flops_counting_methods()`` has been called on it, and should be
    called before running the network. It registers the batch counter hook
    on the net and one FLOPs counting forward hook on every supported
    submodule that does not have one yet.
    """
    add_batch_counter_hook_function(self)

    def attach_flops_hook(module: nn.Module) -> None:
        if not is_supported_instance(module):
            return
        # A stored handle means the hook is already registered.
        if hasattr(module, '__flops_handle__'):
            return
        counter_hook = get_modules_mapping()[type(module)]
        module.__flops_handle__ = module.register_forward_hook(counter_hook)

    self.apply(attach_flops_hook)


def stop_flops_count(self) -> None:
    """Stop computing the mean flops consumption per image.

    This method becomes available on a net object after
    ``add_flops_counting_methods()`` has been called on it. It removes the
    batch counter hook from the net and the FLOPs counting hooks from all
    of its submodules; the accumulated counters are left untouched.
    """
    remove_batch_counter_hook_function(self)
    self.apply(remove_flops_counter_hook_function)


def reset_flops_count(self) -> None:
    """Reset the statistics computed so far.

    This method becomes available on a net object after
    `add_flops_counting_methods()` has been called on it. It resets the
    batch counter of the net and the FLOPs/params variables of every
    supported submodule.
    """
    add_batch_counter_variables_or_reset(self)
    self.apply(add_flops_counter_variable_or_reset)


# ---- Internal functions
def empty_flops_counter_hook(module: nn.Module, input: tuple,
                             output: Any) -> None:
    """Forward hook that contributes no FLOPs.

    ``module.__flops__`` must already exist; its value is left unchanged.
    """
    module.__flops__ += 0


def upsample_flops_counter_hook(module: nn.Module, input: tuple,
                                output: torch.Tensor) -> None:
    """Forward hook counting one FLOP per output element of an upsampling.

    All elements of ``output`` are counted, including the batch dimension.
    The accumulated FLOPs are later divided by the batch counter, so
    counting only a single sample here would under-report the per-image
    cost for batches larger than one.

    Args:
        module (nn.Module): The hooked module; ``__flops__`` is increased.
        input (tuple): Positional inputs of the module (unused).
        output (torch.Tensor): Output of the module.
    """
    module.__flops__ += int(output.numel())


def relu_flops_counter_hook(module: nn.Module, input: tuple,
                            output: torch.Tensor) -> None:
    """Forward hook counting one FLOP per output element."""
    module.__flops__ += int(output.numel())


def linear_flops_counter_hook(module: nn.Module, input: tuple,
                              output: torch.Tensor) -> None:
    """Forward hook counting the FLOPs of a linear layer.

    The count is the number of elements of the first input multiplied by
    the size of the last output dimension.
    """
    # pytorch checks dimensions, so here we don't care much
    out_features = output.shape[-1]
    input_elements = np.prod(input[0].shape)
    module.__flops__ += int(input_elements * out_features)


def pool_flops_counter_hook(module: nn.Module, input: tuple,
                            output: torch.Tensor) -> None:
    """Forward hook counting one FLOP per element of the first input."""
    input_elements = np.prod(input[0].shape)
    module.__flops__ += int(input_elements)


def norm_flops_counter_hook(module: nn.Module, input: tuple,
                            output: torch.Tensor) -> None:
    """Forward hook counting the FLOPs of a normalization layer.

    One FLOP per element of the first input is counted, and twice as many
    when the module has a truthy ``affine`` or ``elementwise_affine``
    attribute.
    """
    has_affine = (
        getattr(module, 'affine', False)
        or getattr(module, 'elementwise_affine', False))
    flops_per_element = 2 if has_affine else 1
    input_elements = np.prod(input[0].shape)
    module.__flops__ += int(input_elements * flops_per_element)


def deconv_flops_counter_hook(conv_module: nn.Module, input: tuple,
                              output: torch.Tensor) -> None:
    """Forward hook counting the FLOPs of a 2D transposed convolution.

    The convolution cost is evaluated per input position; when the module
    has a bias, one FLOP per output element is added.
    """
    # Can have multiple inputs, getting the first one
    first_input = input[0]
    batch_size = first_input.shape[0]
    input_height, input_width = first_input.shape[2:]

    kernel_height, kernel_width = conv_module.kernel_size
    out_channels = conv_module.out_channels
    filters_per_channel = out_channels // conv_module.groups

    flops_per_position = (
        kernel_height * kernel_width * conv_module.in_channels *
        filters_per_channel)
    input_positions = batch_size * input_height * input_width
    total_flops = flops_per_position * input_positions

    if conv_module.bias is not None:
        output_height, output_width = output.shape[2:]
        total_flops += (
            out_channels * batch_size * output_height * output_width)

    conv_module.__flops__ += int(total_flops)


def conv_flops_counter_hook(conv_module: nn.Module, input: tuple,
                            output: torch.Tensor) -> None:
    """Forward hook counting the FLOPs of an N-d convolution.

    The convolution cost is evaluated per output position; when the module
    has a bias, one FLOP per output channel and position is added.
    """
    # Can have multiple inputs, getting the first one
    batch_size = input[0].shape[0]
    spatial_output_dims = list(output.shape[2:])
    kernel_dims = list(conv_module.kernel_size)

    out_channels = conv_module.out_channels
    filters_per_channel = out_channels // conv_module.groups

    flops_per_position = (
        int(np.prod(kernel_dims)) * conv_module.in_channels *
        filters_per_channel)
    output_positions = batch_size * int(np.prod(spatial_output_dims))

    total_flops = flops_per_position * output_positions
    if conv_module.bias is not None:
        total_flops += out_channels * output_positions

    conv_module.__flops__ += int(total_flops)


def batch_counter_hook(module: nn.Module, input: tuple, output: Any) -> None:
    """Forward hook adding the batch size to ``module.__batch_counter__``.

    The batch size is the length of the first positional input. Without
    positional inputs a warning is emitted and a batch size of 1 is used.
    """
    if len(input) > 0:
        # Can have multiple inputs, getting the first one
        seen_samples = len(input[0])
    else:
        warnings.warn('No positional inputs found for a module, '
                      'assuming batch size is 1.')
        seen_samples = 1
    module.__batch_counter__ += seen_samples


def add_batch_counter_variables_or_reset(module: nn.Module) -> None:
    """Create ``module.__batch_counter__`` or reset it to zero."""
    module.__batch_counter__ = 0


def add_batch_counter_hook_function(module: nn.Module) -> None:
    """Register the batch counter forward hook on ``module`` once.

    The hook handle is stored as ``module.__batch_counter_handle__``; when
    that attribute already exists nothing is done.
    """
    if not hasattr(module, '__batch_counter_handle__'):
        module.__batch_counter_handle__ = module.register_forward_hook(
            batch_counter_hook)


def remove_batch_counter_hook_function(module: nn.Module) -> None:
    """Remove the batch counter hook from ``module`` if it is registered.

    The stored handle is removed and the ``__batch_counter_handle__``
    attribute is deleted; without that attribute nothing is done.
    """
    if not hasattr(module, '__batch_counter_handle__'):
        return
    module.__batch_counter_handle__.remove()
    del module.__batch_counter_handle__


def add_flops_counter_variable_or_reset(module: nn.Module) -> None:
    """Create or reset the FLOPs/params variables of a supported module.

    For a supported module, ``__flops__`` is set to 0 and ``__params__`` to
    the number of its parameters. A warning is emitted when one of these
    attributes already exists, because it is overwritten. Unsupported
    modules are left untouched.

    Args:
        module (nn.Module): The module to be initialized.
    """
    if not is_supported_instance(module):
        return
    if hasattr(module, '__flops__') or hasattr(module, '__params__'):
        # The module name is separated from the surrounding text by spaces
        # so that the message stays readable.
        warnings.warn('variables __flops__ or __params__ are already '
                      f'defined for the module {type(module).__name__}'
                      ' ptflops can affect your code!')
    module.__flops__ = 0
    module.__params__ = get_model_parameters_number(module)


def is_supported_instance(module: nn.Module) -> bool:
    """Return whether the exact type of ``module`` has a FLOPs hook.

    The lookup uses ``type(module)``, so subclasses of supported layers are
    not matched unless they are listed in the mapping themselves.
    """
    return type(module) in get_modules_mapping()


def remove_flops_counter_hook_function(module: nn.Module) -> None:
    """Remove the FLOPs counting hook from a supported module.

    The stored handle is removed and the ``__flops_handle__`` attribute is
    deleted. Unsupported modules and modules without a handle are ignored.
    """
    if is_supported_instance(module) and hasattr(module, '__flops_handle__'):
        module.__flops_handle__.remove()
        del module.__flops_handle__


def get_modules_mapping() -> Dict:
    """Build the mapping from supported layer types to their FLOPs hooks.

    Returns:
        dict: A new dict on every call, mapping a layer class to the forward
        hook function that counts its FLOPs.
    """
    # Each entry is (hook, layer types using that hook); the order of the
    # groups and of the types inside them defines the key order of the dict.
    hook_groups = (
        # convolutions
        (conv_flops_counter_hook,
         (nn.Conv1d, nn.Conv2d, Conv2d, nn.Conv3d, Conv3d)),
        # activations
        (relu_flops_counter_hook,
         (nn.ReLU, nn.PReLU, nn.ELU, nn.LeakyReLU, nn.ReLU6)),
        # poolings
        (pool_flops_counter_hook,
         (nn.MaxPool1d, nn.AvgPool1d, nn.AvgPool2d, nn.MaxPool2d, MaxPool2d,
          nn.MaxPool3d, MaxPool3d, nn.AvgPool3d, nn.AdaptiveMaxPool1d,
          nn.AdaptiveAvgPool1d, nn.AdaptiveMaxPool2d, nn.AdaptiveAvgPool2d,
          nn.AdaptiveMaxPool3d, nn.AdaptiveAvgPool3d)),
        # normalizations
        (norm_flops_counter_hook,
         (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.GroupNorm,
          nn.InstanceNorm1d, nn.InstanceNorm2d, nn.InstanceNorm3d,
          nn.LayerNorm)),
        # FC
        (linear_flops_counter_hook, (nn.Linear, Linear)),
        # Upscale
        (upsample_flops_counter_hook, (nn.Upsample, )),
        # Deconvolution
        (deconv_flops_counter_hook, (nn.ConvTranspose2d, ConvTranspose2d)),
    )

    mapping = {}
    for counter_hook, layer_types in hook_groups:
        for layer_type in layer_types:
            mapping[layer_type] = counter_hook
    return mapping


================================================
FILE: mmcv/cnn/utils/fuse_conv_bn.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn


def _fuse_conv_bn(conv: nn.Module, bn: nn.Module) -> nn.Module:
    """Fuse conv and bn into one module.

    The running statistics of ``bn`` (and its affine parameters, when it has
    any) are folded into the weight and bias of ``conv``. ``conv`` is
    modified in place and always has a bias afterwards.

    Args:
        conv (nn.Module): Conv to be fused. Its weight is reshaped against
            a ``[out_channels, 1, 1, 1]`` factor, i.e. a 4-D weight is
            expected.
        bn (nn.Module): BN to be fused. ``running_mean`` and ``running_var``
            must not be None.

    Returns:
        nn.Module: Fused module (the same object as ``conv``).
    """
    conv_w = conv.weight
    conv_b = conv.bias if conv.bias is not None else torch.zeros_like(
        bn.running_mean)

    std = torch.sqrt(bn.running_var + bn.eps)
    # ``bn.weight``/``bn.bias`` are None for a BN layer built with
    # ``affine=False``; such a layer only normalizes, i.e. scale 1, shift 0.
    factor = bn.weight / std if bn.weight is not None else 1.0 / std
    fused_bias = (conv_b - bn.running_mean) * factor
    if bn.bias is not None:
        fused_bias = fused_bias + bn.bias

    conv.weight = nn.Parameter(conv_w *
                               factor.reshape([conv.out_channels, 1, 1, 1]))
    conv.bias = nn.Parameter(fused_bias)
    return conv


def fuse_conv_bn(module: nn.Module) -> nn.Module:
    """Recursively fuse conv and bn in a module.

    During inference, the functionary of batch norm layers is turned off
    but only the mean and var alone channels are used, which exposes the
    chance to fuse it with the preceding conv layers to save computations and
    simplify network structures.

    Children are visited in registration order. A BN child is fused into the
    most recently seen ``nn.Conv2d`` child of the same parent and replaced by
    ``nn.Identity``. BN layers without running statistics are left as they
    are.

    Args:
        module (nn.Module): Module to be fused. It is modified in place.

    Returns:
        nn.Module: Fused module.
    """
    last_conv = None
    last_conv_name = None

    for name, child in module.named_children():
        if isinstance(child,
                      (nn.modules.batchnorm._BatchNorm, nn.SyncBatchNorm)):
            if last_conv is None:  # only fuse BN that is after Conv
                continue
            if child.running_mean is None or child.running_var is None:
                # Without tracked statistics there is nothing constant to
                # fold into the conv, so keep both layers untouched.
                last_conv = None
                continue
            fused_conv = _fuse_conv_bn(last_conv, child)
            module._modules[last_conv_name] = fused_conv
            # To reduce changes, set BN as Identity instead of deleting it.
            module._modules[name] = nn.Identity()
            last_conv = None
        elif isinstance(child, nn.Conv2d):
            last_conv = child
            last_conv_name = name
        else:
            fuse_conv_bn(child)
    return module


================================================
FILE: mmcv/cnn/vgg.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import logging
from typing import List, Optional, Sequence, Tuple, Union

import torch.nn as nn
from mmengine.model import constant_init, kaiming_init, normal_init
from mmengine.runner import load_checkpoint
from torch import Tensor


def conv3x3(in_planes: int, out_planes: int, dilation: int = 1) -> nn.Module:
    """Build a 3x3 convolution whose padding equals its dilation.

    Args:
        in_planes (int): Number of input channels.
        out_planes (int): Number of output channels.
        dilation (int): Dilation of the convolution, also used as padding.
            Default: 1.

    Returns:
        nn.Module: The ``nn.Conv2d`` layer.
    """
    conv_kwargs = dict(kernel_size=3, padding=dilation, dilation=dilation)
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)


def make_vgg_layer(inplanes: int,
                   planes: int,
                   num_blocks: int,
                   dilation: int = 1,
                   with_bn: bool = False,
                   ceil_mode: bool = False) -> List[nn.Module]:
    """Build the layers of one VGG stage.

    The stage consists of ``num_blocks`` blocks of 3x3 conv, optional
    ``nn.BatchNorm2d`` and in-place ReLU, followed by one 2x2 max pooling
    with stride 2.

    Args:
        inplanes (int): Input channels of the first conv.
        planes (int): Output channels of every conv in the stage.
        num_blocks (int): Number of conv blocks.
        dilation (int): Dilation of the convs. Default: 1.
        with_bn (bool): Whether to add BN after each conv. Default: False.
        ceil_mode (bool): ``ceil_mode`` of the max pooling. Default: False.

    Returns:
        list[nn.Module]: The layers in forward order.
    """
    stage_layers = []
    in_channels = inplanes
    for _ in range(num_blocks):
        block = [conv3x3(in_channels, planes, dilation)]
        if with_bn:
            block.append(nn.BatchNorm2d(planes))
        block.append(nn.ReLU(inplace=True))
        stage_layers.extend(block)
        # Every conv after the first one maps planes -> planes.
        in_channels = planes
    stage_layers.append(
        nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=ceil_mode))

    return stage_layers


class VGG(nn.Module):
    """VGG backbone.

    Args:
        depth (int): Depth of vgg, from {11, 13, 16, 19}.
        with_bn (bool): Use BatchNorm or not.
        num_classes (int): number of classes for classification. The
            classifier head is only built when it is larger than 0.
        num_stages (int): VGG stages, normally 5.
        dilations (Sequence[int]): Dilation of each stage.
        out_indices (Sequence[int]): Output from which stages.
        frozen_stages (int): Stages to be frozen (all param fixed). -1 means
            not freezing any parameters.
        bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze
            running stats (mean and var).
        bn_frozen (bool): Whether to freeze weight and bias of BN layers.
            Only takes effect when ``bn_eval`` is True.
        ceil_mode (bool): ``ceil_mode`` of the max pooling layers.
        with_last_pool (bool): Whether to keep the max pooling layer of the
            last stage.
    """

    # depth -> number of conv blocks in each of the five stages
    arch_settings = {
        11: (1, 1, 2, 2, 2),
        13: (2, 2, 2, 2, 2),
        16: (2, 2, 3, 3, 3),
        19: (2, 2, 4, 4, 4)
    }

    def __init__(self,
                 depth: int,
                 with_bn: bool = False,
                 num_classes: int = -1,
                 num_stages: int = 5,
                 dilations: Sequence[int] = (1, 1, 1, 1, 1),
                 out_indices: Sequence[int] = (0, 1, 2, 3, 4),
                 frozen_stages: int = -1,
                 bn_eval: bool = True,
                 bn_frozen: bool = False,
                 ceil_mode: bool = False,
                 with_last_pool: bool = True):
        super().__init__()
        if depth not in self.arch_settings:
            raise KeyError(f'invalid depth {depth} for vgg')
        assert num_stages >= 1 and num_stages <= 5
        stage_blocks = self.arch_settings[depth]
        self.stage_blocks = stage_blocks[:num_stages]
        assert len(dilations) == num_stages
        # NOTE(review): an index equal to ``num_stages`` passes this check
        # but never matches a stage in ``forward``. Kept as is because the
        # default ``out_indices`` relies on it when ``num_stages`` < 5.
        assert max(out_indices) <= num_stages

        self.num_classes = num_classes
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages
        self.bn_eval = bn_eval
        self.bn_frozen = bn_frozen

        self.inplanes = 3
        start_idx = 0
        vgg_layers = []
        # [start, end) positions of each stage inside the flat layer list
        self.range_sub_modules = []
        for i, num_blocks in enumerate(self.stage_blocks):
            # conv + relu (+ bn) per block, plus one pooling layer per stage
            num_modules = num_blocks * (2 + with_bn) + 1
            end_idx = start_idx + num_modules
            dilation = dilations[i]
            planes = 64 * 2**i if i < 4 else 512
            vgg_layer = make_vgg_layer(
                self.inplanes,
                planes,
                num_blocks,
                dilation=dilation,
                with_bn=with_bn,
                ceil_mode=ceil_mode)
            vgg_layers.extend(vgg_layer)
            self.inplanes = planes
            self.range_sub_modules.append([start_idx, end_idx])
            start_idx = end_idx
        if not with_last_pool:
            # Drop the final pooling layer and shrink the last stage range.
            vgg_layers.pop(-1)
            self.range_sub_modules[-1][1] -= 1
        self.module_name = 'features'
        self.add_module(self.module_name, nn.Sequential(*vgg_layers))

        if self.num_classes > 0:
            self.classifier = nn.Sequential(
                nn.Linear(512 * 7 * 7, 4096),
                nn.ReLU(True),
                nn.Dropout(),
                nn.Linear(4096, 4096),
                nn.ReLU(True),
                nn.Dropout(),
                nn.Linear(4096, num_classes),
            )

    def init_weights(self, pretrained: Optional[str] = None) -> None:
        """Initialize the weights.

        Args:
            pretrained (str | None): Checkpoint to load (non-strictly). With
                None, conv, BN and linear layers are initialized with
                kaiming, constant and normal initialization respectively.

        Raises:
            TypeError: If ``pretrained`` is neither a str nor None.
        """
        if isinstance(pretrained, str):
            logger = logging.getLogger()
            load_checkpoint(self, pretrained, strict=False, logger=logger)
        elif pretrained is None:
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    kaiming_init(m)
                elif isinstance(m, nn.BatchNorm2d):
                    constant_init(m, 1)
                elif isinstance(m, nn.Linear):
                    normal_init(m, std=0.01)
        else:
            raise TypeError('pretrained must be a str or None')

    def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor, ...]]:
        """Run the stages and collect the requested outputs.

        Returns:
            Tensor | tuple[Tensor]: The outputs of the stages listed in
            ``out_indices``, followed by the classifier output when
            ``num_classes`` > 0. A single output is returned as a tensor
            instead of a tuple.
        """
        outs = []
        vgg_layers = getattr(self, self.module_name)
        for i in range(len(self.stage_blocks)):
            for j in range(*self.range_sub_modules[i]):
                vgg_layer = vgg_layers[j]
                x = vgg_layer(x)
            if i in self.out_indices:
                outs.append(x)
        if self.num_classes > 0:
            x = x.view(x.size(0), -1)
            x = self.classifier(x)
            outs.append(x)
        if len(outs) == 1:
            return outs[0]
        else:
            return tuple(outs)

    def train(self, mode: bool = True) -> 'VGG':
        """Set the training mode while honoring the freezing options.

        BN layers are kept in eval mode when ``bn_eval`` is set (and their
        parameters frozen when ``bn_frozen`` is set as well). In training
        mode the first ``frozen_stages`` stages are put in eval mode and
        their parameters are frozen.

        Args:
            mode (bool): Whether to set training mode. Default: True.

        Returns:
            VGG: The module itself, matching ``nn.Module.train`` so that
            ``model.train()`` and ``model.eval()`` can be chained.
        """
        super().train(mode)
        if self.bn_eval:
            for m in self.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eval()
                    if self.bn_frozen:
                        for params in m.parameters():
                            params.requires_grad = False
        vgg_layers = getattr(self, self.module_name)
        if mode and self.frozen_stages >= 0:
            for i in range(self.frozen_stages):
                for j in range(*self.range_sub_modules[i]):
                    mod = vgg_layers[j]
                    mod.eval()
                    for param in mod.parameters():
                        param.requires_grad = False
        return self


================================================
FILE: mmcv/image/__init__.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from .colorspace import (bgr2gray, bgr2hls, bgr2hsv, bgr2rgb, bgr2ycbcr,
                         gray2bgr, gray2rgb, hls2bgr, hsv2bgr, imconvert,
                         rgb2bgr, rgb2gray, rgb2ycbcr, ycbcr2bgr, ycbcr2rgb)
from .geometric import (cutout, imcrop, imflip, imflip_, impad,
                        impad_to_multiple, imrescale, imresize, imresize_like,
                        imresize_to_multiple, imrotate, imshear, imtranslate,
                        rescale_size)
from .io import imfrombytes, imread, imwrite, supported_backends, use_backend
from .misc import tensor2imgs
from .photometric import (adjust_brightness, adjust_color, adjust_contrast,
                          adjust_hue, adjust_lighting, adjust_sharpness,
                          auto_contrast, clahe, imdenormalize, imequalize,
                          iminvert, imnormalize, imnormalize_, lut_transform,
                          posterize, solarize)

# Public names of the ``mmcv.image`` package, i.e. what
# ``from mmcv.image import *`` exposes. Every entry is imported from one of
# the submodules above.
__all__ = [
    'bgr2gray', 'bgr2hls', 'bgr2hsv', 'bgr2rgb', 'gray2bgr', 'gray2rgb',
    'hls2bgr', 'hsv2bgr', 'imconvert', 'rgb2bgr', 'rgb2gray', 'imrescale',
    'imresize', 'imresize_like', 'imresize_to_multiple', 'rescale_size',
    'imcrop', 'imflip', 'imflip_', 'impad', 'impad_to_multiple', 'imrotate',
    'imfrombytes', 'imread', 'imwrite', 'supported_backends', 'use_backend',
    'imdenormalize', 'imnormalize', 'imnormalize_', 'iminvert', 'posterize',
    'solarize', 'rgb2ycbcr', 'bgr2ycbcr', 'ycbcr2rgb', 'ycbcr2bgr',
    'tensor2imgs', 'imshear', 'imtranslate', 'adjust_color', 'imequalize',
    'adjust_brightness', 'adjust_contrast', 'lut_transform', 'clahe',
    'adjust_sharpness', 'auto_contrast', 'cutout', 'adjust_lighting',
    'adjust_hue'
]


================================================
FILE: mmcv/image/colorspace.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Callable, Union

import cv2
import numpy as np


def imconvert(img: np.ndarray, src: str, dst: str) -> np.ndarray:
    """Convert an image between two colorspaces with OpenCV.

    The conversion code is looked up on ``cv2`` under the name
    ``COLOR_<SRC>2<DST>`` (both names upper-cased), so only conversions that
    ``cv2`` exposes under that naming are available.

    Args:
        img (ndarray): The input image.
        src (str): The source colorspace, e.g., 'rgb', 'hsv'.
        dst (str): The destination colorspace, e.g., 'rgb', 'hsv'.

    Returns:
        ndarray: The converted image.
    """
    conversion_name = f'COLOR_{src.upper()}2{dst.upper()}'
    return cv2.cvtColor(img, getattr(cv2, conversion_name))


def bgr2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray:
    """Convert a BGR image to a grayscale image.

    Args:
        img (ndarray): The input image.
        keepdim (bool): If True, a trailing axis is appended to the
            grayscale result; if False (default) the result of the OpenCV
            conversion is returned unchanged.

    Returns:
        ndarray: The converted grayscale image.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return gray[..., None] if keepdim else gray


def rgb2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray:
    """Convert a RGB image to a grayscale image.

    Args:
        img (ndarray): The input image.
        keepdim (bool): If True, a trailing axis is appended to the
            grayscale result; if False (default) the result of the OpenCV
            conversion is returned unchanged.

    Returns:
        ndarray: The converted grayscale image.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    return gray[..., None] if keepdim else gray


def gray2bgr(img: np.ndarray) -> np.ndarray:
    """Convert a grayscale image to a BGR image.

    A 2-dim input gets a trailing axis appended before the conversion.

    Args:
        img (ndarray): The input image.

    Returns:
        ndarray: The converted BGR image.
    """
    if img.ndim == 2:
        img = img[..., None]
    return cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)


def gray2rgb(img: np.ndarray) -> np.ndarray:
    """Convert a grayscale image to a RGB image.

    A 2-dim input gets a trailing axis appended before the conversion.

    Args:
        img (ndarray): The input image.

    Returns:
        ndarray: The converted RGB image.
    """
    if img.ndim == 2:
        img = img[..., None]
    return cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)


def _convert_input_type_range(img: np.ndarray) -> np.ndarray:
    """Convert the type and range of the input image.

    It converts the input image to np.float32 type and range of [0, 1].
    It is mainly used for pre-processing the input image in colorspace
    conversion functions such as rgb2ycbcr and ycbcr2rgb.

    Args:
        img (ndarray): The input image. It accepts:
            1. np.uint8 type with range [0, 255];
            2. np.float32 type with range [0, 1].

    Returns:
        (ndarray): The converted image with type of np.float32 and range of
            [0, 1].
    """
    img_type = img.dtype
    img = img.astype(np.float32)
    if img_type == np.float32:
        pass
    elif img_type == np.uint8:
        img /= 255.
    else:
        raise TypeError('The img type should be np.float32 or np.uint8, '
                        f'but got {img_type}')
    return img


def _convert_output_type_range(
        img: np.ndarray, dst_type: Union[np.uint8, np.float32]) -> np.ndarray:
    """Convert the type and range of the image according to dst_type.

    It converts the image to desired type and range. If `dst_type` is np.uint8,
    images will be converted to np.uint8 type with range [0, 255]. If
    `dst_type` is np.float32, it converts the image to np.float32 type with
    range [0, 1].
    It is mainly used for post-processing images in colorspace conversion
    functions such as rgb2ycbcr and ycbcr2rgb.

    Args:
        img (ndarray): The image to be converted with np.float32 type and
            range [0, 255].
        dst_type (np.uint8 | np.float32): If dst_type is np.uint8, it
            converts the image to np.uint8 type with range [0, 255]. If
            dst_type is np.float32, it converts the image to np.float32 type
            with range [0, 1].

    Returns:
        (ndarray): The converted image with desired type and range.
    """
    if dst_type not in (np.uint8, np.float32):
        raise TypeError('The dst_type should be np.float32 or np.uint8, '
                        f'but got {dst_type}')
    if dst_type == np.uint8:
        img = img.round()
    else:
        img /= 255.
    return img.astype(dst_type)


def rgb2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray:
    """Convert a RGB image to YCbCr image.

    This function produces the same results as Matlab's `rgb2ycbcr` function.
    It implements the ITU-R BT.601 conversion for standard-definition
    television. See more details in
    https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.

    It differs from a similar function in cv2.cvtColor: `RGB <-> YCrCb`.
    In OpenCV, it implements a JPEG conversion. See more details in
    https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.

    Args:
        img (ndarray): The input image. It accepts:
            1. np.uint8 type with range [0, 255];
            2. np.float32 type with range [0, 1].
        y_only (bool): Whether to only return Y channel. Default: False.

    Returns:
        ndarray: The converted YCbCr image. The output image has the same type
        and range as input image.
    """
    src_dtype = img.dtype
    rgb = _convert_input_type_range(img)
    if y_only:
        luma_weights = [65.481, 128.553, 24.966]
        ycbcr = np.dot(rgb, luma_weights) + 16.0
    else:
        # Rows follow the input channel order (R, G, B); columns are the
        # Y, Cb and Cr outputs.
        weights = [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786],
                   [24.966, 112.0, -18.214]]
        offsets = [16, 128, 128]
        ycbcr = np.matmul(rgb, weights) + offsets
    return _convert_output_type_range(ycbcr, src_dtype)


def bgr2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray:
    """Convert a BGR image to YCbCr image.

    The bgr version of rgb2ycbcr.
    It implements the ITU-R BT.601 conversion for standard-definition
    television. See more details in
    https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.

    It differs from a similar function in cv2.cvtColor: `BGR <-> YCrCb`.
    In OpenCV, it implements a JPEG conversion. See more details in
    https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.

    Args:
        img (ndarray): The input image. It accepts:
            1. np.uint8 type with range [0, 255];
            2. np.float32 type with range [0, 1].
        y_only (bool): Whether to only return Y channel. Default: False.

    Returns:
        ndarray: The converted YCbCr image. The output image has the same type
        and range as input image.
    """
    src_dtype = img.dtype
    bgr = _convert_input_type_range(img)
    if y_only:
        luma_weights = [24.966, 128.553, 65.481]
        ycbcr = np.dot(bgr, luma_weights) + 16.0
    else:
        # Rows follow the input channel order (B, G, R); columns are the
        # Y, Cb and Cr outputs.
        weights = [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786],
                   [65.481, -37.797, 112.0]]
        offsets = [16, 128, 128]
        ycbcr = np.matmul(bgr, weights) + offsets
    return _convert_output_type_range(ycbcr, src_dtype)


def ycbcr2rgb(img: np.ndarray) -> np.ndarray:
    """Convert a YCbCr image to RGB image.

    This function produces the same results as Matlab's ycbcr2rgb function.
    It implements the ITU-R BT.601 conversion for standard-definition
    television. See more details in
    https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.

    It differs from a similar function in cv2.cvtColor: `YCrCb <-> RGB`.
    In OpenCV, it implements a JPEG conversion. See more details in
    https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.

    Args:
        img (ndarray): The input image. It accepts:
            1. np.uint8 type with range [0, 255];
            2. np.float32 type with range [0, 1].

    Returns:
        ndarray: The converted RGB image. The output image has the same type
        and range as input image.
    """
    src_dtype = img.dtype
    # The coefficients below operate on values in the [0, 255] range.
    ycbcr = _convert_input_type_range(img) * 255
    # Rows follow the input channel order (Y, Cb, Cr); columns are the
    # R, G and B outputs.
    weights = [[0.00456621, 0.00456621, 0.00456621],
               [0, -0.00153632, 0.00791071],
               [0.00625893, -0.00318811, 0]]
    offsets = [-222.921, 135.576, -276.836]
    rgb = np.matmul(ycbcr, weights) * 255.0 + offsets
    return _convert_output_type_range(rgb, src_dtype)


def ycbcr2bgr(img: np.ndarray) -> np.ndarray:
    """Convert a YCbCr image to BGR image.

    The bgr version of ycbcr2rgb.
    It implements the ITU-R BT.601 conversion for standard-definition
    television. See more details in
    https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.

    It differs from a similar function in cv2.cvtColor: `YCrCb <-> BGR`.
    In OpenCV, it implements a JPEG conversion. See more details in
    https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.

    Args:
        img (ndarray): The input image. It accepts:
            1. np.uint8 type with range [0, 255];
            2. np.float32 type with range [0, 1].

    Returns:
        ndarray: The converted BGR image. The output image has the same type
        and range as input image.
    """
    src_dtype = img.dtype
    # The coefficients below operate on values in the [0, 255] range.
    ycbcr = _convert_input_type_range(img) * 255
    # Rows follow the input channel order (Y, Cb, Cr); columns are the
    # B, G and R outputs.
    weights = [[0.00456621, 0.00456621, 0.00456621],
               [0.00791071, -0.00153632, 0],
               [0, -0.00318811, 0.00625893]]
    offsets = [-276.836, 135.576, -222.921]
    bgr = np.matmul(ycbcr, weights) * 255.0 + offsets
    return _convert_output_type_range(bgr, src_dtype)


def convert_color_factory(src: str, dst: str) -> Callable:
    """Create a function that converts images from ``src`` to ``dst``.

    The OpenCV conversion code ``cv2.COLOR_<SRC>2<DST>`` (names upper-cased)
    is resolved once, when the factory is called, so an unsupported pair
    fails here with ``AttributeError`` rather than on the first conversion.

    Args:
        src (str): The source colorspace, e.g., 'bgr'.
        dst (str): The destination colorspace, e.g., 'rgb'.

    Returns:
        Callable: A function taking an ndarray image and returning the
        converted ndarray. Its docstring names the two colorspaces.
    """
    code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}')
    src_name, dst_name = src.upper(), dst.upper()

    def convert_color(img: np.ndarray) -> np.ndarray:
        return cv2.cvtColor(img, code)

    # Only ndarray input is documented: the image is handed directly to
    # cv2.cvtColor and no file loading happens here.
    convert_color.__doc__ = f"""Convert a {src_name} image to {dst_name} image.

    Args:
        img (ndarray): The input image.

    Returns:
        ndarray: The converted {dst_name} image.
    """

    return convert_color


# Ready-made converters for the BGR/RGB, BGR/HSV and BGR/HLS pairs, each
# built by convert_color_factory.
bgr2rgb = convert_color_factory('bgr', 'rgb')

rgb2bgr = convert_color_factory('rgb', 'bgr')

bgr2hsv = convert_color_factory('bgr', 'hsv')

hsv2bgr = convert_color_factory('hsv', 'bgr')

bgr2hls = convert_color_factory('bgr', 'hls')

hls2bgr = convert_color_factory('hls', 'bgr')


================================================
FILE: mmcv/image/geometric.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numbers
from typing import List, Optional, Tuple, Union, no_type_check

import cv2
import numpy as np
from mmengine.utils import to_2tuple

from .io import imread_backend

try:
    from PIL import Image
except ImportError:
    Image = None


def _scale_size(
    size: Tuple[int, int],
    scale: Union[float, int, Tuple[float, float], Tuple[int, int]],
) -> Tuple[int, int]:
    """Rescale a size by a ratio.

    Args:
        size (tuple[int]): (w, h).
        scale (float | int | tuple(float) | tuple(int)): Scaling factor.

    Returns:
        tuple[int]: scaled size.
    """
    if isinstance(scale, (float, int)):
        scale = (scale, scale)
    w, h = size
    return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5)


# Maps interpolation names to the corresponding OpenCV interpolation flags;
# looked up with the ``interpolation`` argument by the cv2-based functions
# in this module (e.g. imresize, imrotate).
cv2_interp_codes = {
    'nearest': cv2.INTER_NEAREST,
    'bilinear': cv2.INTER_LINEAR,
    'bicubic': cv2.INTER_CUBIC,
    'area': cv2.INTER_AREA,
    'lanczos': cv2.INTER_LANCZOS4
}

# Maps border mode names to the corresponding OpenCV border types; looked
# up with the ``border_mode`` argument (e.g. by imrotate).
cv2_border_modes = {
    'constant': cv2.BORDER_CONSTANT,
    'replicate': cv2.BORDER_REPLICATE,
    'reflect': cv2.BORDER_REFLECT,
    'wrap': cv2.BORDER_WRAP,
    'reflect_101': cv2.BORDER_REFLECT_101,
    'transparent': cv2.BORDER_TRANSPARENT,
    'isolated': cv2.BORDER_ISOLATED
}

# Pillow >= v9.1.0 uses a slightly different naming scheme for filters
# (``Image.Resampling.*`` instead of ``Image.*``).
# Set pillow_interp_codes according to the naming scheme that is available.
# NOTE: when Pillow is not installed (``Image is None``) this table is not
# defined at all.
if Image is not None:
    if hasattr(Image, 'Resampling'):
        pillow_interp_codes = {
            'nearest': Image.Resampling.NEAREST,
            'bilinear': Image.Resampling.BILINEAR,
            'bicubic': Image.Resampling.BICUBIC,
            'box': Image.Resampling.BOX,
            'lanczos': Image.Resampling.LANCZOS,
            'hamming': Image.Resampling.HAMMING
        }
    else:
        pillow_interp_codes = {
            'nearest': Image.NEAREST,
            'bilinear': Image.BILINEAR,
            'bicubic': Image.BICUBIC,
            'box': Image.BOX,
            'lanczos': Image.LANCZOS,
            'hamming': Image.HAMMING
        }


def imresize(
    img: np.ndarray,
    size: Tuple[int, int],
    return_scale: bool = False,
    interpolation: str = 'bilinear',
    out: Optional[np.ndarray] = None,
    backend: Optional[str] = None
) -> Union[Tuple[np.ndarray, float, float], np.ndarray]:
    """Resize image to a given size.

    Args:
        img (ndarray): The input image.
        size (tuple[int]): Target size (w, h).
        return_scale (bool): Whether to return `w_scale` and `h_scale`.
        interpolation (str): Interpolation method, accepted values are
            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
            backend, "nearest", "bilinear", "bicubic", "box", "lanczos",
            "hamming" for 'pillow' backend.
        out (ndarray): The output destination. Only passed on to the 'cv2'
            backend.
        backend (str | None): The image resize backend type. Options are `cv2`,
            `pillow`, `None`. If backend is None, the global imread_backend
            specified by ``mmcv.use_backend()`` will be used. Default: None.

    Returns:
        tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or
        `resized_img`.

    Raises:
        ValueError: If the backend is neither 'cv2' nor 'pillow'.
        ImportError: If the 'pillow' backend is requested but Pillow could
            not be imported.
    """
    h, w = img.shape[:2]
    if backend is None:
        backend = imread_backend
    if backend not in ['cv2', 'pillow']:
        raise ValueError(f'backend: {backend} is not supported for resize. '
                         f"Supported backends are 'cv2', 'pillow'")

    if backend == 'pillow':
        # ``Image`` is None when the optional Pillow import failed; fail
        # with a clear message instead of an AttributeError on None.
        if Image is None:
            raise ImportError(
                "`Pillow` is not installed, the 'pillow' backend of "
                'imresize is unavailable')
        assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'
        pil_image = Image.fromarray(img)
        pil_image = pil_image.resize(size, pillow_interp_codes[interpolation])
        resized_img = np.array(pil_image)
    else:
        resized_img = cv2.resize(
            img, size, dst=out, interpolation=cv2_interp_codes[interpolation])
    if not return_scale:
        return resized_img
    # Scales are the ratio of the requested size to the input size.
    w_scale = size[0] / w
    h_scale = size[1] / h
    return resized_img, w_scale, h_scale


@no_type_check
def imresize_to_multiple(
    img: np.ndarray,
    divisor: Union[int, Tuple[int, int]],
    size: Union[int, Tuple[int, int], None] = None,
    scale_factor: Union[float, int, Tuple[float, float], Tuple[int, int],
                        None] = None,
    keep_ratio: bool = False,
    return_scale: bool = False,
    interpolation: str = 'bilinear',
    out: Optional[np.ndarray] = None,
    backend: Optional[str] = None
) -> Union[Tuple[np.ndarray, float, float], np.ndarray]:
    """Resize an image to a size that is a multiple of ``divisor``.

    The target size is first derived from ``size`` or ``scale_factor`` and
    then each edge is rounded up to the nearest value that can be divided
    by the divisor.

    Args:
        img (ndarray): The input image.
        divisor (int | tuple): Resized image size will be a multiple of
            divisor. If divisor is a tuple, divisor should be
            (w_divisor, h_divisor).
        size (None | int | tuple[int]): Target size (w, h). Default: None.
        scale_factor (None | float | int | tuple[float] | tuple[int]):
            Multiplier for spatial size. Should match input size if it is a
            tuple and the 2D style is (w_scale_factor, h_scale_factor).
            Default: None.
        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
            image. Only consulted when ``size`` is given. Default: False.
        return_scale (bool): Whether to return `w_scale` and `h_scale`.
        interpolation (str): Interpolation method, passed on to
            :func:`imresize`.
        out (ndarray): The output destination, passed on to
            :func:`imresize`.
        backend (str | None): The image resize backend type, passed on to
            :func:`imresize`. Default: None.

    Returns:
        tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or
        `resized_img`.

    Raises:
        ValueError: If both or neither of ``size`` and ``scale_factor`` are
            given.
    """
    h, w = img.shape[:2]
    has_size = size is not None
    has_factor = scale_factor is not None
    if has_size and has_factor:
        raise ValueError('only one of size or scale_factor should be defined')
    if not has_size and not has_factor:
        raise ValueError('one of size or scale_factor should be defined')

    if has_size:
        size = to_2tuple(size)
        if keep_ratio:
            size = rescale_size((w, h), size, return_scale=False)
    else:
        size = _scale_size((w, h), scale_factor)

    # Round every edge up to the next multiple of its divisor.
    divisors = to_2tuple(divisor)
    size = tuple(
        int(np.ceil(edge / div)) * div for edge, div in zip(size, divisors))
    resized_img, w_scale, h_scale = imresize(
        img,
        size,
        return_scale=True,
        interpolation=interpolation,
        out=out,
        backend=backend)
    if not return_scale:
        return resized_img
    return resized_img, w_scale, h_scale


def imresize_like(
    img: np.ndarray,
    dst_img: np.ndarray,
    return_scale: bool = False,
    interpolation: str = 'bilinear',
    backend: Optional[str] = None
) -> Union[Tuple[np.ndarray, float, float], np.ndarray]:
    """Resize image to the same height and width as a given image.

    Args:
        img (ndarray): The input image.
        dst_img (ndarray): The target image; only its first two shape
            entries (h, w) are used.
        return_scale (bool): Whether to return `w_scale` and `h_scale`.
        interpolation (str): Same as :func:`imresize`.
        backend (str | None): Same as :func:`imresize`.

    Returns:
        tuple or ndarray: (`resized_img`, `w_scale`, `h_scale`) or
        `resized_img`.
    """
    target_h, target_w = dst_img.shape[:2]
    target_size = (target_w, target_h)
    return imresize(
        img, target_size, return_scale, interpolation, backend=backend)


def rescale_size(old_size: tuple,
                 scale: Union[float, int, Tuple[int, int]],
                 return_scale: bool = False) -> tuple:
    """Calculate the new size to be rescaled to.

    Args:
        old_size (tuple[int]): The old size (w, h) of image.
        scale (float | int | tuple[int]): The scaling factor or maximum size.
            If it is a float number or an integer, then the image will be
            rescaled by this factor, else if it is a tuple of 2 integers, then
            the image will be rescaled as large as possible within the scale.
        return_scale (bool): Whether to return the scaling factor besides the
            rescaled image size.

    Returns:
        tuple: The new rescaled image size, or (`new_size`, `scale_factor`)
        if ``return_scale`` is True.

    Raises:
        ValueError: If ``scale`` is a number that is not positive.
        TypeError: If ``scale`` is neither a number nor a tuple.
    """
    w, h = old_size
    if isinstance(scale, (float, int)):
        if scale <= 0:
            raise ValueError(f'Invalid scale {scale}, must be positive.')
        scale_factor = scale
    elif isinstance(scale, tuple):
        # The larger limit bounds the long edge and the smaller limit
        # bounds the short edge; take the tighter of the two ratios.
        long_limit, short_limit = max(scale), min(scale)
        long_edge, short_edge = max(h, w), min(h, w)
        scale_factor = min(long_limit / long_edge, short_limit / short_edge)
    else:
        raise TypeError(
            f'Scale must be a number or tuple of int, but got {type(scale)}')

    new_size = _scale_size((w, h), scale_factor)
    return (new_size, scale_factor) if return_scale else new_size


def imrescale(
    img: np.ndarray,
    scale: Union[float, int, Tuple[int, int]],
    return_scale: bool = False,
    interpolation: str = 'bilinear',
    backend: Optional[str] = None
) -> Union[np.ndarray, Tuple[np.ndarray, float]]:
    """Resize image while keeping the aspect ratio.

    The target size is computed by :func:`rescale_size` and the resize
    itself is done by :func:`imresize`.

    Args:
        img (ndarray): The input image.
        scale (float | int | tuple[int]): The scaling factor or maximum size.
            If it is a float number or an integer, then the image will be
            rescaled by this factor, else if it is a tuple of 2 integers, then
            the image will be rescaled as large as possible within the scale.
        return_scale (bool): Whether to return the scaling factor besides the
            rescaled image.
        interpolation (str): Same as :func:`imresize`.
        backend (str | None): Same as :func:`imresize`.

    Returns:
        ndarray | tuple: The rescaled image, or (`rescaled_img`,
        `scale_factor`) if ``return_scale`` is True.
    """
    img_h, img_w = img.shape[:2]
    target_size, factor = rescale_size((img_w, img_h),
                                       scale,
                                       return_scale=True)
    resized = imresize(
        img, target_size, interpolation=interpolation, backend=backend)
    return (resized, factor) if return_scale else resized


def imflip(img: np.ndarray, direction: str = 'horizontal') -> np.ndarray:
    """Flip an image horizontally, vertically or diagonally.

    Args:
        img (ndarray): Image to be flipped.
        direction (str): The flip direction, either "horizontal" (flip along
            axis 1), "vertical" (flip along axis 0) or "diagonal" (flip
            along both axes).

    Returns:
        ndarray: The flipped image.
    """
    assert direction in ['horizontal', 'vertical', 'diagonal']
    flip_axes = {'horizontal': 1, 'vertical': 0, 'diagonal': (0, 1)}
    return np.flip(img, axis=flip_axes[direction])


def imflip_(img: np.ndarray, direction: str = 'horizontal') -> np.ndarray:
    """Inplace flip an image horizontally, vertically or diagonally.

    The input array is passed to ``cv2.flip`` as both source and
    destination.

    Args:
        img (ndarray): Image to be flipped.
        direction (str): The flip direction, either "horizontal" or
            "vertical" or "diagonal".

    Returns:
        ndarray: The flipped image (inplace).
    """
    assert direction in ['horizontal', 'vertical', 'diagonal']
    flip_codes = {'horizontal': 1, 'vertical': 0, 'diagonal': -1}
    return cv2.flip(img, flip_codes[direction], img)


def imrotate(img: np.ndarray,
             angle: float,
             center: Optional[Tuple[float, float]] = None,
             scale: float = 1.0,
             border_value: int = 0,
             interpolation: str = 'bilinear',
             auto_bound: bool = False,
             border_mode: str = 'constant') -> np.ndarray:
    """Rotate an image.

    Args:
        img (np.ndarray): Image to be rotated.
        angle (float): Rotation angle in degrees, positive values mean
            clockwise rotation.
        center (tuple[float], optional): Center point (w, h) of the rotation in
            the source image. If not specified, the center of the image will be
            used.
        scale (float): Isotropic scale factor.
        border_value (int): Border value used in case of a constant border.
            Defaults to 0.
        interpolation (str): Same as :func:`imresize`.
        auto_bound (bool): Whether to adjust the image size to cover the whole
            rotated image.
        border_mode (str): Pixel extrapolation method. Defaults to 'constant'.

    Returns:
        np.ndarray: The rotated image.

    Raises:
        ValueError: If ``center`` is given together with ``auto_bound``.
    """
    if center is not None and auto_bound:
        raise ValueError('`auto_bound` conflicts with `center`')
    height, width = img.shape[:2]
    if center is None:
        center = ((width - 1) * 0.5, (height - 1) * 0.5)
    assert isinstance(center, tuple)

    # The angle is negated so that positive values rotate clockwise.
    rot_mat = cv2.getRotationMatrix2D(center, -angle, scale)
    if auto_bound:
        abs_cos = np.abs(rot_mat[0, 0])
        abs_sin = np.abs(rot_mat[0, 1])
        bound_w = height * abs_sin + width * abs_cos
        bound_h = height * abs_cos + width * abs_sin
        # Shift the result so it is centred in the enlarged canvas.
        rot_mat[0, 2] += (bound_w - width) * 0.5
        rot_mat[1, 2] += (bound_h - height) * 0.5
        width = int(np.round(bound_w))
        height = int(np.round(bound_h))
    return cv2.warpAffine(
        img,
        rot_mat, (width, height),
        flags=cv2_interp_codes[interpolation],
        borderMode=cv2_border_modes[border_mode],
        borderValue=border_value)


def bbox_clip(bboxes: np.ndarray, img_shape: Tuple[int, int]) -> np.ndarray:
    """Clip bboxes to fit the image shape.

    Coordinates at even positions of the last axis are limited to
    ``[0, width - 1]`` and those at odd positions to ``[0, height - 1]``.

    Args:
        bboxes (ndarray): Shape (..., 4*k)
        img_shape (tuple[int]): (height, width) of the image. Only the first
            two entries are read.

    Returns:
        ndarray: Clipped bboxes.
    """
    num_coords = bboxes.shape[-1]
    assert num_coords % 4 == 0
    img_h, img_w = img_shape[0], img_shape[1]
    # Per-coordinate upper bound: x entries first, then y entries.
    upper = np.empty(num_coords, dtype=bboxes.dtype)
    upper[0::2] = img_w - 1
    upper[1::2] = img_h - 1
    capped = np.minimum(bboxes, upper)
    return np.maximum(capped, 0)


def bbox_scaling(bboxes: np.ndarray,
                 scale: float,
                 clip_shape: Optional[Tuple[int, int]] = None) -> np.ndarray:
    """Scaling bboxes w.r.t the box center.

    Box width and height are measured inclusively (``x2 - x1 + 1``).

    Args:
        bboxes (ndarray): Shape(..., 4).
        scale (float): Scaling factor. A factor of 1.0 returns a copy of the
            input boxes.
        clip_shape (tuple[int], optional): If specified, bboxes that exceed the
            boundary will be clipped according to the given shape (h, w).

    Returns:
        ndarray: Scaled bboxes.
    """
    if float(scale) == 1.0:
        scaled = bboxes.copy()
    else:
        box_w = bboxes[..., 2] - bboxes[..., 0] + 1
        box_h = bboxes[..., 3] - bboxes[..., 1] + 1
        # Half of the size change, applied symmetrically on both sides.
        half_dw = (box_w * (scale - 1)) * 0.5
        half_dh = (box_h * (scale - 1)) * 0.5
        shift = np.stack((-half_dw, -half_dh, half_dw, half_dh), axis=-1)
        scaled = bboxes + shift
    return scaled if clip_shape is None else bbox_clip(scaled, clip_shape)


def imcrop(
    img: np.ndarray,
    bboxes: np.ndarray,
    scale: float = 1.0,
    pad_fill: Union[float, list, None] = None
) -> Union[np.ndarray, List[np.ndarray]]:
    """Crop image patches.

    3 steps: scale the bboxes -> clip bboxes -> crop and pad.

    Args:
        img (ndarray): Image to be cropped.
        bboxes (ndarray): Shape (k, 4) or (4, ), location of cropped bboxes.
        scale (float, optional): Scale ratio of bboxes, the default value
            1.0 means no scaling.
        pad_fill (Number | list[Number]): Value to be filled for padding.
            A single number is repeated for every channel. Default: None,
            which means no padding.

    Returns:
        list[ndarray] | ndarray: The cropped image patches. A single patch
        is returned when ``bboxes`` is 1-dim, otherwise a list.
    """
    num_channels = 1 if img.ndim == 2 else img.shape[2]
    if pad_fill is not None:
        if isinstance(pad_fill, (int, float)):
            pad_fill = [pad_fill] * num_channels
        assert len(pad_fill) == num_channels

    single_box = bboxes.ndim == 1
    boxes = bboxes[None, ...] if single_box else bboxes
    scaled_boxes = bbox_scaling(boxes, scale).astype(np.int32)
    clipped_boxes = bbox_clip(scaled_boxes, img.shape)

    patches = []
    for scaled_box, clipped_box in zip(scaled_boxes, clipped_boxes):
        x1, y1, x2, y2 = tuple(clipped_box)
        if pad_fill is None:
            # No padding: the patch is just the clipped region.
            patches.append(img[y1:y2 + 1, x1:x2 + 1, ...])
            continue

        # With padding the patch has the size of the scaled (unclipped)
        # box and is pre-filled with ``pad_fill``.
        sx1, sy1, sx2, sy2 = tuple(scaled_box)
        patch_shape = (sy2 - sy1 + 1, sx2 - sx1 + 1)
        if num_channels != 1:
            patch_shape = patch_shape + (num_channels, )  # type: ignore
        fill = np.array(pad_fill, dtype=img.dtype)
        patch = fill * np.ones(patch_shape, dtype=img.dtype)
        # Offset of the visible region inside the patch when the scaled
        # box starts left of / above the image.
        x_off = 0 if sx1 >= 0 else -sx1
        y_off = 0 if sy1 >= 0 else -sy1
        vis_w = x2 - x1 + 1
        vis_h = y2 - y1 + 1
        patch[y_off:y_off + vis_h, x_off:x_off + vis_w,
              ...] = img[y1:y1 + vis_h, x1:x1 + vis_w, ...]
        patches.append(patch)

    return patches[0] if single_box else patches


def impad(img: np.ndarray,
          *,
          shape: Optional[Tuple[int, int]] = None,
          padding: Union[int, tuple, None] = None,
          pad_val: Union[float, List] = 0,
          padding_mode: str = 'constant') -> np.ndarray:
    """Pad the given image to a certain shape or pad on all sides with
    specified padding mode and padding value.

    Args:
        img (ndarray): Image to be padded.
        shape (tuple[int]): Expected padding shape (h, w). The image is
            padded on the right and bottom only; an axis that is already
            at least as large as requested is left unpadded. Default: None.
        padding (int or tuple[int]): Padding on each border. If a single int is
            provided this is used to pad all borders. If tuple of length 2 is
            provided this is the padding on left/right and top/bottom
            respectively. If a tuple of length 4 is provided this is the
            padding for the left, top, right and bottom borders respectively.
            Default: None. Note that exactly one of `shape` and `padding`
            must be set.
        pad_val (Number | tuple[Number]): Values to be filled in padding
            areas when padding_mode is 'constant'. Per-channel values must
            be given as a tuple whose length equals ``img.shape[-1]``; other
            non-number types are rejected. Default: 0.
        padding_mode (str): Type of padding. Should be: constant, edge,
            reflect or symmetric. Default: constant.

            - constant: pads with a constant value, this value is specified
              with pad_val.
            - edge: pads with the last value at the edge of the image.
            - reflect: pads with reflection of image without repeating the last
              value on the edge. For example, padding [1, 2, 3, 4] with 2
              elements on both sides in reflect mode will result in
              [3, 2, 1, 2, 3, 4, 3, 2].
            - symmetric: pads with reflection of image repeating the last value
              on the edge. For example, padding [1, 2, 3, 4] with 2 elements on
              both sides in symmetric mode will result in
              [2, 1, 1, 2, 3, 4, 4, 3]

    Returns:
        ndarray: The padded image.

    Raises:
        TypeError: If ``pad_val`` is neither a number nor a tuple.
        ValueError: If ``padding`` is not a number or a tuple of length 2
            or 4.
    """

    assert (shape is not None) ^ (padding is not None)
    if shape is not None:
        # Express the target shape as (left, top, right, bottom) padding.
        width = max(shape[1] - img.shape[1], 0)
        height = max(shape[0] - img.shape[0], 0)
        padding = (0, 0, width, height)

    # check pad_val
    if isinstance(pad_val, tuple):
        assert len(pad_val) == img.shape[-1]
    elif not isinstance(pad_val, numbers.Number):
        raise TypeError('pad_val must be a int or a tuple. '
                        f'But received {type(pad_val)}')

    # check padding
    if isinstance(padding, tuple) and len(padding) in [2, 4]:
        if len(padding) == 2:
            padding = (padding[0], padding[1], padding[0], padding[1])
    elif isinstance(padding, numbers.Number):
        padding = (padding, padding, padding, padding)
    else:
        raise ValueError('Padding must be a int or a 2, or 4 element tuple. '
                         f'But received {padding}')

    # check padding mode
    assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']

    border_type = {
        'constant': cv2.BORDER_CONSTANT,
        'edge': cv2.BORDER_REPLICATE,
        'reflect': cv2.BORDER_REFLECT_101,
        'symmetric': cv2.BORDER_REFLECT
    }
    # cv2.copyMakeBorder takes the borders as top, bottom, left, right.
    img = cv2.copyMakeBorder(
        img,
        padding[1],
        padding[3],
        padding[0],
        padding[2],
        border_type[padding_mode],
        value=pad_val)

    return img


def impad_to_multiple(img: np.ndarray,
                      divisor: int,
                      pad_val: Union[float, List] = 0) -> np.ndarray:
    """Pad an image so that both edges become multiples of a number.

    Each edge length is rounded up to the nearest multiple of ``divisor``
    and the resulting target shape is handed to :func:`impad`.

    Args:
        img (ndarray): Image to be padded.
        divisor (int): Both edges of the padded image will be multiples of
            this value.
        pad_val (Number | Sequence[Number]): Same as :func:`impad`.

    Returns:
        ndarray: The padded image.
    """
    # Round height and width up to the next multiple of ``divisor``.
    target_h = int(np.ceil(img.shape[0] / divisor)) * divisor
    target_w = int(np.ceil(img.shape[1] / divisor)) * divisor
    target_shape = (target_h, target_w)
    return impad(img, shape=target_shape, pad_val=pad_val)


def cutout(img: np.ndarray,
           shape: Union[int, Tuple[int, int]],
           pad_val: Union[int, float, tuple] = 0) -> np.ndarray:
    """Fill a randomly placed rectangle of the image with a constant value.

    The input image is not modified; a copy with the rectangle filled is
    returned. The rectangle is clipped to the image borders, so the filled
    area may be smaller than ``shape``.

    Args:
        img (ndarray): Image to be cutout, with shape (h, w) or (h, w, c).
        shape (int | tuple[int]): Expected cutout shape (h, w). If given as a
            int, the value will be used for both h and w.
        pad_val (int | float | tuple[int | float]): Values to be filled in the
            cut area. A tuple must have one element per image channel.
            Defaults to 0.

    Returns:
        ndarray: The cutout image.

    Raises:
        AssertionError: If ``shape`` is neither an int nor a 2-tuple, or if
            the length of a tuple ``pad_val`` differs from the channel count.
        TypeError: If ``pad_val`` is not an int, a float or a tuple.
    """

    num_channels = img.shape[2] if img.ndim != 2 else 1

    # Normalise ``shape`` to a (height, width) pair.
    if not isinstance(shape, int):
        assert isinstance(shape, tuple) and len(shape) == 2, \
            f'shape must be a int or a tuple with length 2, but got type ' \
            f'{type(shape)} instead.'
        cut_h, cut_w = shape
    else:
        cut_h = cut_w = shape

    # Normalise ``pad_val`` to one fill value per channel.
    if isinstance(pad_val, tuple):
        assert len(pad_val) == num_channels, \
            'Expected the num of elements in tuple equals the channels' \
            'of input image. Found {} vs {}'.format(
                len(pad_val), num_channels)
    elif isinstance(pad_val, (int, float)):
        pad_val = tuple([pad_val] * num_channels)
    else:
        raise TypeError(f'Invalid type {type(pad_val)} for `pad_val`')

    img_h, img_w = img.shape[:2]
    # NOTE(review): the single positional argument of ``np.random.uniform``
    # is ``low`` (``high`` keeps its default of 1.0), so the centre is drawn
    # between 1.0 and the image size rather than starting from 0. Kept as is
    # because seeded results depend on it.
    center_y = np.random.uniform(img_h)
    center_x = np.random.uniform(img_w)

    # Top-left corner is clamped at 0, bottom-right at the image border.
    top = int(max(0, center_y - cut_h / 2.))
    left = int(max(0, center_x - cut_w / 2.))
    bottom = min(img_h, top + cut_h)
    right = min(img_w, left + cut_w)

    patch_shape = (bottom - top, right - left)
    if img.ndim != 2:
        patch_shape = patch_shape + (num_channels, )  # type: ignore

    img_cutout = img.copy()
    fill_values = np.array(pad_val, dtype=img.dtype)
    # Broadcasting the per-channel values over a block of ones builds the
    # constant patch in the image dtype.
    patch = fill_values * np.ones(patch_shape, dtype=img.dtype)
    img_cutout[top:bottom, left:right, ...] = patch

    return img_cutout


def _get_shear_matrix(magnitude: Union[int, float],
                      direction: str = 'horizontal') -> np.ndarray:
    """Generate the 2x3 affine matrix for a shear transformation.

    Args:
        magnitude (int | float): The magnitude used for shear.
        direction (str): The shear direction, either "horizontal"
            or "vertical".

    Returns:
        ndarray: The shear matrix with shape (2, 3) and dtype float32.

    Raises:
        ValueError: If ``direction`` is neither "horizontal" nor "vertical".
    """
    if direction == 'horizontal':
        # x' = x + magnitude * y, y' = y
        rows = [[1, magnitude, 0], [0, 1, 0]]
    elif direction == 'vertical':
        # x' = x, y' = magnitude * x + y
        rows = [[1, 0, 0], [magnitude, 1, 0]]
    else:
        # Fail with a clear message instead of an UnboundLocalError on return.
        raise ValueError(f'Invalid direction: {direction}')
    return np.array(rows, dtype=np.float32)


def imshear(img: np.ndarray,
            magnitude: Union[int, float],
            direction: str = 'horizontal',
            border_value: Union[int, Tuple[int, int]] = 0,
            interpolation: str = 'bilinear') -> np.ndarray:
    """Shear an image along one axis, keeping the output size unchanged.

    Args:
        img (ndarray): Image to be sheared with format (h, w)
            or (h, w, c).
        magnitude (int | float): The magnitude used for shear.
        direction (str): The shear direction, either "horizontal"
            or "vertical".
        border_value (int | tuple[int]): Value used in case of a
            constant border. A tuple must have one element per channel.
        interpolation (str): Same as :func:`resize`.

    Returns:
        ndarray: The sheared image.

    Raises:
        AssertionError: If ``direction`` is invalid or the length of a tuple
            ``border_value`` differs from the number of channels.
        ValueError: If ``border_value`` is neither an int nor a tuple.
    """
    assert direction in ['horizontal',
                         'vertical'], f'Invalid direction: {direction}'
    img_h, img_w = img.shape[:2]
    if img.ndim == 3:
        num_channels = img.shape[-1]
    elif img.ndim == 2:
        num_channels = 1

    # Expand a scalar border value to one entry per channel.
    if isinstance(border_value, tuple):
        assert len(border_value) == num_channels, \
            'Expected the num of elements in tuple equals the channels' \
            'of input image. Found {} vs {}'.format(
                len(border_value), num_channels)
    elif isinstance(border_value, int):
        border_value = tuple([border_value] * num_channels)  # type: ignore
    else:
        raise ValueError(
            f'Invalid type {type(border_value)} for `border_value`')

    affine = _get_shear_matrix(magnitude, direction)
    # Note case when the number elements in `border_value`
    # greater than 3 (e.g. shearing masks whose channels large
    # than 3) will raise TypeError in `cv2.warpAffine`.
    # Here simply slice the first 3 values in `border_value`.
    border_for_cv2 = border_value[:3]  # type: ignore
    interp_flag = cv2_interp_codes[interpolation]
    return cv2.warpAffine(
        img,
        affine,
        (img_w, img_h),
        borderValue=border_for_cv2,
        flags=interp_flag)


def _get_translate_matrix(offset: Union[int, float],
                          direction: str = 'horizontal') -> np.ndarray:
    """Generate the 2x3 affine matrix for a translation.

    Args:
        offset (int | float): The offset used for translate.
        direction (str): The translate direction, either
            "horizontal" or "vertical".

    Returns:
        ndarray: The translate matrix with shape (2, 3) and dtype float32.

    Raises:
        ValueError: If ``direction`` is neither "horizontal" nor "vertical".
    """
    if direction == 'horizontal':
        # The offset goes into the x component of the translation column.
        rows = [[1, 0, offset], [0, 1, 0]]
    elif direction == 'vertical':
        # The offset goes into the y component of the translation column.
        rows = [[1, 0, 0], [0, 1, offset]]
    else:
        # Fail with a clear message instead of an UnboundLocalError on return.
        raise ValueError(f'Invalid direction: {direction}')
    return np.array(rows, dtype=np.float32)


def imtranslate(img: np.ndarray,
                offset: Union[int, float],
                direction: str = 'horizontal',
                border_value: Union[int, tuple] = 0,
                interpolation: str = 'bilinear') -> np.ndarray:
    """Translate an image along one axis, keeping the output size unchanged.

    Args:
        img (ndarray): Image to be translated with format
            (h, w) or (h, w, c).
        offset (int | float): The offset used for translate.
        direction (str): The translate direction, either "horizontal"
            or "vertical".
        border_value (int | tuple[int]): Value used in case of a
            constant border. A tuple must have one element per channel.
        interpolation (str): Same as :func:`resize`.

    Returns:
        ndarray: The translated image.

    Raises:
        AssertionError: If ``direction`` is invalid or the length of a tuple
            ``border_value`` differs from the number of channels.
        ValueError: If ``border_value`` is neither an int nor a tuple.
    """
    assert direction in ['horizontal',
                         'vertical'], f'Invalid direction: {direction}'
    img_h, img_w = img.shape[:2]
    if img.ndim == 3:
        num_channels = img.shape[-1]
    elif img.ndim == 2:
        num_channels = 1

    # Expand a scalar border value to one entry per channel.
    if isinstance(border_value, tuple):
        assert len(border_value) == num_channels, \
            'Expected the num of elements in tuple equals the channels' \
            'of input image. Found {} vs {}'.format(
                len(border_value), num_channels)
    elif isinstance(border_value, int):
        border_value = tuple([border_value] * num_channels)
    else:
        raise ValueError(
            f'Invalid type {type(border_value)} for `border_value`.')

    affine = _get_translate_matrix(offset, direction)
    # Note case when the number elements in `border_value`
    # greater than 3 (e.g. translating masks whose channels
    # large than 3) will raise TypeError in `cv2.warpAffine`.
    # Here simply slice the first 3 values in `border_value`.
    border_for_cv2 = border_value[:3]
    interp_flag = cv2_interp_codes[interpolation]
    return cv2.warpAffine(
        img,
        affine,
        (img_w, img_h),
        borderValue=border_for_cv2,
        flags=interp_flag)


================================================
FILE: mmcv/image/io.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import io
import os.path as osp
import warnings
from pathlib import Path
from typing import Optional, Union

import cv2
import mmengine.fileio as fileio
import numpy as np
from cv2 import (IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_IGNORE_ORIENTATION,
                 IMREAD_UNCHANGED)
from mmengine.utils import is_filepath, is_str

try:
    from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG
except ImportError:
    TJCS_RGB = TJPF_GRAY = TJPF_BGR = TurboJPEG = None

try:
    from PIL import Image, ImageOps
except ImportError:
    Image = None

try:
    import tifffile
except ImportError:
    tifffile = None

# Shared TurboJPEG decoder instance. It stays ``None`` until
# ``use_backend('turbojpeg')`` creates it.
jpeg = None
# Names of the decoding backends accepted by ``use_backend`` and
# ``imfrombytes``.
supported_backends = ['cv2', 'turbojpeg', 'pillow', 'tifffile']

# Maps the string flags accepted by ``imread``/``imfrombytes`` to the
# corresponding OpenCV ``IMREAD_*`` constants used by the cv2 backend.
imread_flags = {
    'color': IMREAD_COLOR,
    'grayscale': IMREAD_GRAYSCALE,
    'unchanged': IMREAD_UNCHANGED,
    'color_ignore_orientation': IMREAD_IGNORE_ORIENTATION | IMREAD_COLOR,
    'grayscale_ignore_orientation':
    IMREAD_IGNORE_ORIENTATION | IMREAD_GRAYSCALE
}

# Backend used when a caller does not pass ``backend`` explicitly; it is
# changed through ``use_backend``.
imread_backend = 'cv2'


def use_backend(backend: str) -> None:
    """Select a backend for image decoding.

    The optional dependency of the requested backend is checked before the
    module-level default is changed, so a failed call leaves the previously
    selected backend in place.

    Args:
        backend (str): The image decoding backend type. Options are `cv2`,
            `pillow`, `turbojpeg` (see
            https://github.com/lilohuang/PyTurboJPEG) and `tifffile`.
            `turbojpeg` is faster but it only supports `.jpeg` file format.

    Raises:
        AssertionError: If ``backend`` is not one of the supported backends.
        ImportError: If the package required by ``backend`` is not installed.
    """
    global imread_backend, jpeg

    assert backend in supported_backends
    if backend == 'turbojpeg':
        if TurboJPEG is None:
            raise ImportError('`PyTurboJPEG` is not installed')
        # Create the shared decoder once and reuse it afterwards.
        if jpeg is None:
            jpeg = TurboJPEG()
    elif backend == 'pillow':
        if Image is None:
            raise ImportError('`Pillow` is not installed')
    elif backend == 'tifffile':
        if tifffile is None:
            raise ImportError('`tifffile` is not installed')

    # Commit the new default only after every check above has passed.
    imread_backend = backend


def _jpegflag(flag: str = 'color', channel_order: str = 'bgr'):
    """Pick the TurboJPEG constant matching a flag and channel order.

    Args:
        flag (str): Either 'color' or 'grayscale'.
        channel_order (str): Either 'bgr' or 'rgb' (case-insensitive).

    Returns:
        The TurboJPEG constant passed on to ``TurboJPEG.decode``.

    Raises:
        ValueError: If ``channel_order`` or ``flag`` is not supported.
    """
    channel_order = channel_order.lower()
    if channel_order not in ['rgb', 'bgr']:
        raise ValueError('channel order must be either "rgb" or "bgr"')

    if flag == 'grayscale':
        return TJPF_GRAY
    if flag != 'color':
        raise ValueError('flag must be "color" or "grayscale"')

    # NOTE(review): ``TJCS_RGB`` is named like a colorspace constant while
    # the other return values are pixel formats - presumably it has the
    # same numeric value as the RGB pixel format; confirm against
    # PyTurboJPEG.
    return TJPF_BGR if channel_order == 'bgr' else TJCS_RGB


def _pillow2array(img,
                  flag: str = 'color',
                  channel_order: str = 'bgr') -> np.ndarray:
    """Convert a pillow image to numpy array.

    Args:
        img (:obj:`PIL.Image.Image`): The image loaded using PIL
        flag (str): Flags specifying the color type of a loaded image,
            candidates are 'color', 'grayscale', 'unchanged',
            'color_ignore_orientation' and 'grayscale_ignore_orientation'.
            Default to 'color'.
        channel_order (str): The channel order of the output image array,
            candidates are 'bgr' and 'rgb'. It is not applied when ``flag``
            is 'unchanged', where the first three channels are always
            reversed. Default to 'bgr'.

    Returns:
        np.ndarray: The converted numpy array

    Raises:
        ValueError: If ``channel_order`` or ``flag`` is not supported.
    """
    channel_order = channel_order.lower()
    if channel_order not in ['rgb', 'bgr']:
        raise ValueError('channel order must be either "rgb" or "bgr"')

    if flag == 'unchanged':
        array = np.array(img)
        has_color_channels = array.ndim >= 3 and array.shape[2] >= 3
        if has_color_channels:
            # RGB to BGR; any channel beyond the third is left untouched.
            array[:, :, :3] = array[:, :, (2, 1, 0)]
        return array

    # Handle exif orientation tag
    if flag in ['color', 'grayscale']:
        img = ImageOps.exif_transpose(img)

    # Bring every mode to 'RGB' before the final conversion.
    if img.mode == 'LA':
        # When the mode is 'LA', the default conversion will fill in
        #  the canvas with black, which sometimes shadows black objects
        #  in the foreground.
        #
        # Therefore, a random color (124, 117, 104) is used for canvas
        rgba = img.convert('RGBA')
        img = Image.new('RGB', rgba.size, (124, 117, 104))
        img.paste(rgba, mask=rgba.split()[3])  # 3 is alpha
    elif img.mode != 'RGB':
        # Most formats except 'LA' can be directly converted to RGB
        img = img.convert('RGB')

    if flag in ['color', 'color_ignore_orientation']:
        array = np.array(img)
        if channel_order != 'rgb':
            array = array[:, :, ::-1]  # RGB to BGR
    elif flag in ['grayscale', 'grayscale_ignore_orientation']:
        array = np.array(img.convert('L'))
    else:
        raise ValueError(
            'flag must be "color", "grayscale", "unchanged", '
            f'"color_ignore_orientation" or "grayscale_ignore_orientation"'
            f' but got {flag}')
    return array


def imread(img_or_path: Union[np.ndarray, str, Path],
           flag: str = 'color',
           channel_order: str = 'bgr',
           backend: Optional[str] = None,
           file_client_args: Optional[dict] = None,
           *,
           backend_args: Optional[dict] = None) -> np.ndarray:
    """Read an image.

    An array input is returned unchanged. A path is fetched through the file
    backend and the resulting bytes are decoded with :func:`imfrombytes`.

    Args:
        img_or_path (ndarray or str or Path): Either a numpy array or str or
            pathlib.Path. If it is a numpy array (loaded image), then
            it will be returned as is.
        flag (str): Flags specifying the color type of a loaded image,
            candidates are `color`, `grayscale`, `unchanged`,
            `color_ignore_orientation` and `grayscale_ignore_orientation`.
            By default, `cv2` and `pillow` backend would rotate the image
            according to its EXIF info unless called with `unchanged` or
            `*_ignore_orientation` flags. `turbojpeg` and `tifffile` backend
            always ignore image's EXIF info regardless of the flag.
            The `turbojpeg` backend only supports `color` and `grayscale`.
        channel_order (str): Order of channel, candidates are `bgr` and `rgb`.
        backend (str | None): The image decoding backend type. Options are
            `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`.
            If backend is None, the global imread_backend specified by
            ``mmcv.use_backend()`` will be used. Default: None.
        file_client_args (dict, optional): Arguments to instantiate a
            FileClient. See :class:`mmengine.fileio.FileClient` for details.
            Default: None. It will be deprecated in future. Please use
            ``backend_args`` instead.
            Deprecated in version 2.0.0rc4.
        backend_args (dict, optional): Instantiates the corresponding file
            backend. It may contain `backend` key to specify the file
            backend. If it contains, the file backend corresponding to this
            value will be used and initialized with the remaining values,
            otherwise the corresponding file backend will be selected
            based on the prefix of the file path. Defaults to None.
            New in version 2.0.0rc4.

    Returns:
        ndarray: Loaded image array.

    Raises:
        ValueError: If both ``file_client_args`` and ``backend_args`` are
            given.
        TypeError: If ``img_or_path`` is not an array, a str or a Path.

    Examples:
        >>> import mmcv
        >>> img_path = '/path/to/img.jpg'
        >>> img = mmcv.imread(img_path)
        >>> img = mmcv.imread(img_path, flag='color', channel_order='rgb',
        ...     backend='cv2')
        >>> img = mmcv.imread(img_path, flag='color', channel_order='bgr',
        ...     backend='pillow')
        >>> s3_img_path = 's3://bucket/img.jpg'
        >>> # infer the file backend by the prefix s3
        >>> img = mmcv.imread(s3_img_path)
        >>> # manually set the file backend petrel
        >>> img = mmcv.imread(s3_img_path, backend_args={
        ...     'backend': 'petrel'})
        >>> http_img_path = 'http://path/to/img.jpg'
        >>> img = mmcv.imread(http_img_path)
        >>> img = mmcv.imread(http_img_path, backend_args={
        ...     'backend': 'http'})
    """
    use_file_client = file_client_args is not None
    if use_file_client:
        warnings.warn(
            '"file_client_args" will be deprecated in future. '
            'Please use "backend_args" instead', DeprecationWarning)
        if backend_args is not None:
            raise ValueError(
                '"file_client_args" and "backend_args" cannot be set at the '
                'same time.')

    if isinstance(img_or_path, Path):
        img_or_path = str(img_or_path)

    # Already-loaded images pass straight through.
    if isinstance(img_or_path, np.ndarray):
        return img_or_path
    if not is_str(img_or_path):
        raise TypeError('"img" must be a numpy array or a str or '
                        'a pathlib.Path object')

    if use_file_client:
        # Deprecated path: let FileClient infer the client from the args.
        client = fileio.FileClient.infer_client(file_client_args, img_or_path)
        raw_bytes = client.get(img_or_path)
    else:
        raw_bytes = fileio.get(img_or_path, backend_args=backend_args)
    return imfrombytes(raw_bytes, flag, channel_order, backend)


def imfrombytes(content: bytes,
                flag: str = 'color',
                channel_order: str = 'bgr',
                backend: Optional[str] = None) -> np.ndarray:
    """Read an image from bytes.

    Args:
        content (bytes): Image bytes got from files or other streams.
        flag (str): Same as :func:`imread`.
        channel_order (str): The channel order of the output, candidates
            are 'bgr' and 'rgb'. Default to 'bgr'.
        backend (str | None): The image decoding backend type. Options are
            `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. If backend is
            None, the global imread_backend specified by ``mmcv.use_backend()``
            will be used. Default: None.

    Returns:
        ndarray: Loaded image array.

    Raises:
        ValueError: If ``backend`` is not a supported backend.
        ImportError: If ``backend`` is `turbojpeg` and `PyTurboJPEG` is not
            installed.

    Examples:
        >>> img_path = '/path/to/img.jpg'
        >>> with open(img_path, 'rb') as f:
        ...     img_buff = f.read()
        >>> img = mmcv.imfrombytes(img_buff)
        >>> img = mmcv.imfrombytes(img_buff, flag='color', channel_order='rgb')
        >>> img = mmcv.imfrombytes(img_buff, backend='pillow')
        >>> img = mmcv.imfrombytes(img_buff, backend='cv2')
    """
    global jpeg

    if backend is None:
        backend = imread_backend
    if backend not in supported_backends:
        raise ValueError(
            f'backend: {backend} is not supported. Supported '
            "backends are 'cv2', 'turbojpeg', 'pillow', 'tifffile'")
    if backend == 'turbojpeg':
        # The shared decoder is normally created by ``use_backend``; create
        # it lazily so that passing ``backend='turbojpeg'`` explicitly works
        # without a prior ``use_backend('turbojpeg')`` call.
        if jpeg is None:
            if TurboJPEG is None:
                raise ImportError('`PyTurboJPEG` is not installed')
            jpeg = TurboJPEG()
        img = jpeg.decode(content, _jpegflag(flag, channel_order))
        # Drop the trailing singleton channel of grayscale results.
        if img.shape[-1] == 1:
            img = img[:, :, 0]
        return img
    elif backend == 'pillow':
        with io.BytesIO(content) as buff:
            img = Image.open(buff)
            img = _pillow2array(img, flag, channel_order)
        return img
    elif backend == 'tifffile':
        # ``flag`` and ``channel_order`` are not used by this backend.
        with io.BytesIO(content) as buff:
            img = tifffile.imread(buff)
        return img
    else:
        img_np = np.frombuffer(content, np.uint8)
        flag = imread_flags[flag] if is_str(flag) else flag
        img = cv2.imdecode(img_np, flag)
        # Both color flags decode to 3-channel BGR, so honour
        # ``channel_order`` for 'color_ignore_orientation' as well. The
        # flags are compared explicitly because IMREAD_UNCHANGED is -1 and
        # would match a bit test.
        color_flags = (IMREAD_COLOR, IMREAD_IGNORE_ORIENTATION | IMREAD_COLOR)
        if flag in color_flags and channel_order == 'rgb':
            cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)  # inplace
        return img


def imwrite(img: np.ndarray,
            file_path: str,
            params: Optional[list] = None,
            auto_mkdir: Optional[bool] = None,
            file_client_args: Optional[dict] = None,
            *,
            backend_args: Optional[dict] = None) -> bool:
    """Write image to file.

    The image is encoded in memory with OpenCV according to the suffix of
    ``file_path`` and the encoded bytes are stored through the file backend.

    Warning:
        The parameter `auto_mkdir` will be deprecated in the future and every
        file clients will make directory automatically.

    Args:
        img (ndarray): Image array to be written.
        file_path (str): Image file path.
        params (None or list): Same as opencv :func:`imwrite` interface.
        auto_mkdir (bool): If the parent folder of `file_path` does not exist,
            whether to create it automatically. It will be deprecated.
        file_client_args (dict, optional): Arguments to instantiate a
            FileClient. See :class:`mmengine.fileio.FileClient` for details.
            Default: None. It will be deprecated in future. Please use
            ``backend_args`` instead.
            Deprecated in version 2.0.0rc4.
        backend_args (dict, optional): Instantiates the corresponding file
            backend. It may contain `backend` key to specify the file
            backend. If it contains, the file backend corresponding to this
            value will be used and initialized with the remaining values,
            otherwise the corresponding file backend will be selected
            based on the prefix of the file path. Defaults to None.
            New in version 2.0.0rc4.

    Returns:
        bool: Successful or not.

    Raises:
        ValueError: If both ``file_client_args`` and ``backend_args`` are
            given.

    Examples:
        >>> # write to hard disk client
        >>> ret = mmcv.imwrite(img, '/path/to/img.jpg')
        >>> # infer the file backend by the prefix s3
        >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg')
        >>> # manually set the file backend petrel
        >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg', backend_args={
        ...     'backend': 'petrel'})
    """
    use_file_client = file_client_args is not None
    if use_file_client:
        warnings.warn(
            '"file_client_args" will be deprecated in future. '
            'Please use "backend_args" instead', DeprecationWarning)
        if backend_args is not None:
            raise ValueError(
                '"file_client_args" and "backend_args" cannot be set at the '
                'same time.')

    assert is_filepath(file_path)
    file_path = str(file_path)
    # ``auto_mkdir`` only triggers the deprecation warning here.
    if auto_mkdir is not None:
        warnings.warn(
            'The parameter `auto_mkdir` will be deprecated in the future and '
            'every file clients will make directory automatically.')

    # Encode image according to image suffix.
    # For example, if image path is '/path/your/img.jpg', the encode
    # format is '.jpg'.
    suffix = osp.splitext(file_path)[-1]
    success, encoded = cv2.imencode(suffix, img, params)

    if use_file_client:
        # Deprecated path: let FileClient infer the client from the args.
        client = fileio.FileClient.infer_client(file_client_args, file_path)
        client.put(encoded.tobytes(), file_path)
    else:
        fileio.put(encoded.tobytes(), file_path, backend_args=backend_args)

    return success


================================================
FILE: mmcv/image/misc.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional

import numpy as np

import mmcv

try:
    import torch
except ImportError:
    torch = None


def tensor2imgs(tensor,
                mean: Optional[tuple] = None,
                std: Optional[tuple] = None,
                to_rgb: bool = True) -> list:
    """Convert tensor to 3-channel images or 1-channel gray images.

    Every image in the batch is moved to CPU, rearranged from (C, H, W) to
    (H, W, C), denormalized with ``mean`` and ``std`` and cast to uint8.

    Args:
        tensor (torch.Tensor): Tensor that contains multiple images, shape (
            N, C, H, W). :math:`C` can be either 3 or 1.
        mean (tuple[float], optional): Mean of images. If None,
            (0, 0, 0) will be used for tensor with 3-channel,
            while (0, ) for tensor with 1-channel. Defaults to None.
        std (tuple[float], optional): Standard deviation of images. If None,
            (1, 1, 1) will be used for tensor with 3-channel,
            while (1, ) for tensor with 1-channel. Defaults to None.
        to_rgb (bool, optional): Whether the tensor was converted to RGB
            format in the first place. If so, convert it back to BGR.
            For the tensor with 1 channel, it must be False. Defaults to True.

    Returns:
        list[np.ndarray]: A list that contains multiple images.

    Raises:
        RuntimeError: If pytorch is not installed.
        AssertionError: If the tensor is not 4-dimensional, has a channel
            count other than 1 or 3, or ``mean``/``std``/``to_rgb`` do not
            match the channel count.
    """

    if torch is None:
        raise RuntimeError('pytorch is not installed')
    assert torch.is_tensor(tensor) and tensor.ndim == 4
    channels = tensor.size(1)
    assert channels in [1, 3]

    # Identity statistics are used when none are supplied.
    mean = (0, ) * channels if mean is None else mean
    std = (1, ) * channels if std is None else std
    is_valid_color = channels == len(mean) == len(std) == 3
    is_valid_gray = channels == len(mean) == len(std) == 1 and not to_rgb
    assert is_valid_color or is_valid_gray

    batch_size = tensor.size(0)
    mean = np.array(mean, dtype=np.float32)
    std = np.array(std, dtype=np.float32)

    def _to_image(index):
        """Turn one (C, H, W) slice of the batch into an (H, W, C) image."""
        chw = tensor[index, ...].cpu().numpy()
        hwc = chw.transpose(1, 2, 0)
        restored = mmcv.imdenormalize(hwc, mean, std, to_bgr=to_rgb)
        return np.ascontiguousarray(restored.astype(np.uint8))

    return [_to_image(index) for index in range(batch_size)]


================================================
FILE: mmcv/image/photometric.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from typing import Optional

import cv2
import numpy as np
from mmengine.utils import is_tuple_of
from PIL import Image, ImageEnhance

from .colorspace import bgr2gray, gray2bgr
from .io import imread_backend


def imnormalize(img, mean, std, to_rgb=True):
    """Normalize an image with mean and std.

    The input is left untouched: a float32 copy is created and then
    normalized in place by :func:`imnormalize_`.

    Args:
        img (ndarray): Image to be normalized.
        mean (ndarray): The mean to be used for normalize.
        std (ndarray): The std to be used for normalize.
        to_rgb (bool): Whether to convert to rgb.

    Returns:
        ndarray: The normalized image.
    """
    float_img = img.copy().astype(np.float32)
    return imnormalize_(float_img, mean, std, to_rgb)


def imnormalize_(img, mean, std, to_rgb=True):
    """Inplace normalize an image with mean and std.

    The image is optionally converted from BGR to RGB, then the mean is
    subtracted and the result is multiplied by the reciprocal of std. Every
    step writes back into ``img``.

    Args:
        img (ndarray): Image to be normalized. Must not be of dtype uint8.
        mean (ndarray): The mean to be used for normalize.
        std (ndarray): The std to be used for normalize.
        to_rgb (bool): Whether to convert to rgb.

    Returns:
        ndarray: The normalized image (the same object as ``img``).
    """
    # cv2 inplace normalization does not accept uint8
    assert img.dtype != np.uint8
    mean_row = np.float64(mean.reshape(1, -1))
    # Multiply by the reciprocal instead of dividing by std.
    inv_std_row = 1 / np.float64(std.reshape(1, -1))
    if to_rgb:
        cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)  # inplace
    cv2.subtract(img, mean_row, img)  # inplace
    cv2.multiply(img, inv_std_row, img)  # inplace
    return img


def imdenormalize(img, mean, std, to_bgr=True):
    """Denormalize an image with mean and std.

    The image is multiplied by std, the mean is added and the result is
    optionally converted from RGB to BGR. The input array is not modified.

    Args:
        img (ndarray): Image to be denormalized. Must not be of dtype uint8.
        mean (ndarray): The mean that is added back.
        std (ndarray): The std the image is multiplied by.
        to_bgr (bool): Whether to convert the result from RGB to BGR.

    Returns:
        ndarray: The denormalized image.
    """
    assert img.dtype != np.uint8
    mean_row = mean.reshape(1, -1).astype(np.float64)
    std_row = std.reshape(1, -1).astype(np.float64)
    restored = cv2.multiply(img, std_row)  # make a copy
    cv2.add(restored, mean_row, restored)  # inplace
    if to_bgr:
        cv2.cvtColor(restored, cv2.COLOR_RGB2BGR, restored)  # inplace
    return restored


def iminvert(img):
    """Invert (negate) an image.

    Every value ``v`` is replaced by ``255 - v``, computed in the dtype of
    the input.

    Args:
        img (ndarray): Image to be inverted.

    Returns:
        ndarray: The inverted image.
    """
    white = np.full_like(img, 255)
    return white - img


def solarize(img, thr=128):
    """Solarize an image (invert all pixel values above a threshold)

    Values strictly below ``thr`` are kept; all other values ``v`` are
    replaced by ``255 - v``.

    Args:
        img (ndarray): Image to be solarized.
        thr (int): Threshold for solarizing (0 - 255).

    Returns:
        ndarray: The solarized image.
    """
    keep_mask = img < thr
    return np.where(keep_mask, img, 255 - img)


def posterize(img, bits):
    """Posterize an image (reduce the number of bits for each color channel)

    The lowest ``8 - bits`` bits of every value are cleared by shifting
    right and then left again.

    Args:
        img (ndarray): Image to be posterized.
        bits (int): Number of bits (1 to 8) to use for posterizing.

    Returns:
        ndarray: The posterized image.
    """
    dropped_bits = 8 - bits
    truncated = np.right_shift(img, dropped_bits)
    return np.left_shift(truncated, dropped_bits)


def adjust_color(img, alpha=1, beta=None, gamma=0, backend=None):
    r"""It blends the source image and its gray image:

    .. math::
        output = img * alpha + gray\_img * beta + gamma

    With the `pillow` backend only ``alpha`` is used; it is passed to
    ``PIL.ImageEnhance.Color`` as the enhancement factor.

    Args:
        img (ndarray): The input source image.
        alpha (int | float): Weight for the source image. Default 1.
        beta (int | float): Weight for the converted gray image.
            If None, it's assigned the value (1 - `alpha`).
        gamma (int | float): Scalar added to each sum.
            Same as :func:`cv2.addWeighted`. Default 0.
        backend (str | None): The image processing backend type. Options are
            `cv2`, `pillow`, `None`. If backend is None, the global
            ``imread_backend`` specified by ``mmcv.use_backend()`` will be
            used. Defaults to None.

    Returns:
        ndarray: Colored image which has the same size and dtype as input.

    Raises:
        ValueError: If the resolved backend is neither `cv2` nor `pillow`.
    """
    backend = imread_backend if backend is None else backend
    if backend not in ['cv2', 'pillow']:
        raise ValueError(f'backend: {backend} is not supported.'
                         f"Supported backends are 'cv2', 'pillow'")

    if backend != 'pillow':
        # Build a 3-channel gray image to blend with the source.
        gray = bgr2gray(img)
        gray_3ch = np.tile(gray[..., None], [1, 1, 3])
        if beta is None:
            beta = 1 - alpha
        blended = cv2.addWeighted(img, alpha, gray_3ch, beta, gamma)
        if blended.dtype != np.uint8:
            # Note when the dtype of `img` is not the default `np.uint8`
            # (e.g. np.float32), the value in `colored_img` got from cv2
            # is not guaranteed to be in range [0, 255], so here clip
            # is needed.
            blended = np.clip(blended, 0, 255)
        return blended.astype(img.dtype)

    assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'
    warnings.warn("Only use 'alpha' for pillow backend.")
    # Image.fromarray defaultly supports RGB, not BGR.
    rgb_image = Image.fromarray(img[..., ::-1], mode='RGB')
    enhanced = ImageEnhance.Color(rgb_image).enhance(alpha)
    # Flip the channels back to BGR before returning.
    return np.array(enhanced, dtype=img.dtype)[..., ::-1]


def imequalize(img):
    """Equalize the image histogram.

    This function applies a non-linear mapping to the input image,
    in order to create a uniform distribution of grayscale values
    in the output image. The first three channels are equalized
    independently of each other.

    Args:
        img (ndarray): Image to be equalized.

    Returns:
        ndarray: The equalized image.
    """

    def _equalize_channel(image, index):
        """Equalize the channel ``index`` of ``image``."""
        plane = image[:, :, index]
        # Compute the histogram of the image channel.
        hist = np.histogram(plane, 256, (0, 255))[0]
        # For computing the step, filter out the nonzeros.
        occupied = hist[hist > 0]
        step = (np.sum(occupied) - occupied[-1]) // 255
        if step:
            # Compute the cumulative sum, shifted by step // 2
            # and then normalized by step.
            lut = (np.cumsum(hist) + (step // 2)) // step
            # Shift lut, prepending with 0.
            lut = np.concatenate([[0], lut[:-1]], 0)
            # handle potential integer overflow
            lut[lut > 255] = 255
        else:
            # Identity table; the channel is returned unchanged below.
            lut = np.arange(256)
        # If step is zero, return the original image.
        # Otherwise, index from lut.
        return np.where(np.equal(step, 0), plane, lut[plane])

    # Scales each channel independently and then stacks
    # the result.
    planes = [_equalize_channel(img, index) for index in range(3)]
    return np.stack(planes, axis=-1).astype(img.dtype)


def adjust_brightness(img, factor=1., backend=None):
    """Adjust image brightness.

    This function controls the brightness of an image. An
    enhancement factor of 0.0 gives a black image.
    A factor of 1.0 gives the original image. This function
    blends the source image and the degenerated black image:

    .. math::
        output = img * factor + degenerated * (1 - factor)

    Args:
        img (ndarray): Image to be brightened. The pillow backend only
            accepts ``np.uint8`` data.
        factor (float): A value controls the enhancement.
            Factor 1.0 returns the original image, lower
            factors mean less color (brightness, contrast,
            etc), and higher values more. Default 1.
        backend (str | None): The image processing backend type. Options are
            `cv2`, `pillow`, `None`. If backend is None, the global
            ``imread_backend`` specified by ``mmcv.use_backend()`` will be
            used. Defaults to None.

    Returns:
        ndarray: The brightened image, with the same dtype as ``img``.

    Raises:
        ValueError: If ``backend`` is neither ``'cv2'`` nor ``'pillow'``.
    """
    if backend is None:
        backend = imread_backend
    if backend not in ['cv2', 'pillow']:
        # Keep a space between the two sentences of the message.
        raise ValueError(f'backend: {backend} is not supported. '
                         f"Supported backends are 'cv2', 'pillow'")

    if backend == 'pillow':
        assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'
        # Image.fromarray expects RGB by default, not BGR, so the channel
        # axis is reversed before and after the PIL call.
        pil_image = Image.fromarray(img[..., ::-1], mode='RGB')
        enhancer = ImageEnhance.Brightness(pil_image)
        pil_image = enhancer.enhance(factor)
        return np.array(pil_image, dtype=img.dtype)[..., ::-1]
    else:
        degenerated = np.zeros_like(img)
        # Note manually convert the dtype to np.float32, to
        # achieve as close results as PIL.ImageEnhance.Brightness.
        # Set beta=1-factor, and gamma=0
        brightened_img = cv2.addWeighted(
            img.astype(np.float32), factor, degenerated.astype(np.float32),
            1 - factor, 0)
        # The blended values are limited to [0, 255] before casting back.
        brightened_img = np.clip(brightened_img, 0, 255)
        return brightened_img.astype(img.dtype)


def adjust_contrast(img, factor=1., backend=None):
    """Adjust image contrast.

    This function controls the contrast of an image. An
    enhancement factor of 0.0 gives a solid grey
    image. A factor of 1.0 gives the original image. It
    blends the source image and the degenerated mean image:

    .. math::
        output = img * factor + degenerated * (1 - factor)

    Args:
        img (ndarray): Image to be contrasted. BGR order. The pillow
            backend only accepts ``np.uint8`` data.
        factor (float): Same as :func:`mmcv.adjust_brightness`.
        backend (str | None): The image processing backend type. Options are
            `cv2`, `pillow`, `None`. If backend is None, the global
            ``imread_backend`` specified by ``mmcv.use_backend()`` will be
            used. Defaults to None.

    Returns:
        ndarray: The contrasted image, with the same dtype as ``img``.

    Raises:
        ValueError: If ``backend`` is neither ``'cv2'`` nor ``'pillow'``.
    """
    if backend is None:
        backend = imread_backend
    if backend not in ['cv2', 'pillow']:
        # Keep a space between the two sentences of the message.
        raise ValueError(f'backend: {backend} is not supported. '
                         f"Supported backends are 'cv2', 'pillow'")

    if backend == 'pillow':
        assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'
        # Image.fromarray expects RGB by default, not BGR, so the channel
        # axis is reversed before and after the PIL call.
        pil_image = Image.fromarray(img[..., ::-1], mode='RGB')
        enhancer = ImageEnhance.Contrast(pil_image)
        pil_image = enhancer.enhance(factor)
        return np.array(pil_image, dtype=img.dtype)[..., ::-1]
    else:
        gray_img = bgr2gray(img)
        # Mean gray level: sum of the gray values divided by the number of
        # pixels counted by the [0, 255] histogram.
        hist = np.histogram(gray_img, 256, (0, 255))[0]
        mean = round(np.sum(gray_img) / np.sum(hist))
        # The degenerated image is a constant image filled with that mean.
        degenerated = (np.ones_like(img[..., 0]) * mean).astype(img.dtype)
        degenerated = gray2bgr(degenerated)
        contrasted_img = cv2.addWeighted(
            img.astype(np.float32), factor, degenerated.astype(np.float32),
            1 - factor, 0)
        # The blended values are limited to [0, 255] before casting back.
        contrasted_img = np.clip(contrasted_img, 0, 255)
        return contrasted_img.astype(img.dtype)


def auto_contrast(img, cutoff=0):
    """Auto adjust image contrast.

    This function maximize (normalize) image contrast by first removing cutoff
    percent of the lightest and darkest pixels from the histogram and remapping
    the image so that the darkest pixel becomes black (0), and the lightest
    becomes white (255).

    Args:
        img (ndarray): Image to be contrasted. BGR order.
        cutoff (int | float | tuple): The cutoff percent of the lightest and
            darkest pixels to be removed. If given as tuple, it shall be
            (low, high). Otherwise, the single value will be used for both.
            Defaults to 0.

    Returns:
        ndarray: The contrasted image, with the same dtype as ``img``. A
        channel whose pixels are all removed by the cut-off, or which is
        left with a single intensity, is returned unchanged.
    """

    def _auto_contrast_channel(im, c, cutoff):
        """Stretch the channel ``c`` of ``im`` to the full [0, 255] range."""
        im = im[:, :, c]
        # Compute the histogram of the image channel.
        histo = np.histogram(im, 256, (0, 255))[0]
        # Remove cut-off percent pixels from histo
        histo_sum = np.cumsum(histo)
        cut_low = histo_sum[-1] * cutoff[0] // 100
        cut_high = histo_sum[-1] - histo_sum[-1] * cutoff[1] // 100
        histo_sum = np.clip(histo_sum, cut_low, cut_high) - cut_low
        histo = np.concatenate([[histo_sum[0]], np.diff(histo_sum)], 0)

        # Compute mapping
        occupied = np.nonzero(histo)[0]
        # The cut-off may empty the histogram entirely (e.g. cutoff=50);
        # there is nothing to stretch then, so return the origin img
        # instead of failing on an empty index array.
        if occupied.size == 0:
            return im
        low, high = occupied[0], occupied[-1]
        # If all the values have been cut off, return the origin img
        if low >= high:
            return im
        scale = 255.0 / (high - low)
        offset = -low * scale
        lut = np.array(range(256))
        lut = lut * scale + offset
        lut = np.clip(lut, 0, 255)
        return lut[im]

    if isinstance(cutoff, (int, float)):
        cutoff = (cutoff, cutoff)
    else:
        assert isinstance(cutoff, tuple), 'cutoff must be of type int, ' \
            f'float or tuple, but got {type(cutoff)} instead.'
    # Auto adjusts contrast for each channel independently and then stacks
    # the result.
    s1 = _auto_contrast_channel(img, 0, cutoff)
    s2 = _auto_contrast_channel(img, 1, cutoff)
    s3 = _auto_contrast_channel(img, 2, cutoff)
    contrasted_img = np.stack([s1, s2, s3], axis=-1)
    return contrasted_img.astype(img.dtype)


def adjust_sharpness(img, factor=1., kernel=None):
    """Adjust image sharpness.

    The image is blended with a filtered (degenerated) version of itself.
    With the default smoothing kernel, a factor of 0.0 gives a blurred
    image, 1.0 gives the original image and 2.0 gives a sharpened image:

    .. math::
        output = img * factor + degenerated * (1 - factor)

    Args:
        img (ndarray): Image to be sharpened. BGR order.
        factor (float): Same as :func:`mmcv.adjust_brightness`.
        kernel (np.ndarray, optional): 2-D filter kernel applied on the img
            to obtain the degenerated img. Defaults to None, in which case
            the kernel adopted from ``PIL.ImageFilter.SMOOTH`` is used.

    Note:
        Only the type and the number of dimensions of a user kernel are
        checked, not its values. With an inappropriate kernel the result
        is whatever transform the kernel determines rather than a
        sharpness adjustment.

    Returns:
        ndarray: The sharpened image, with the same dtype as ``img``.
    """

    if kernel is None:
        # adopted from PIL.ImageFilter.SMOOTH: ones with a center of 5,
        # normalized by the sum of the entries.
        kernel = np.ones((3, 3))
        kernel[1, 1] = 5.
        kernel = kernel / 13
    assert isinstance(kernel, np.ndarray), \
        f'kernel must be of type np.ndarray, but got {type(kernel)} instead.'
    assert kernel.ndim == 2, \
        f'kernel must have a dimension of 2, but got {kernel.ndim} instead.'

    # ddepth=-1 keeps the depth of the source image for the filtered one.
    filtered = cv2.filter2D(img, -1, kernel)
    source_f32 = img.astype(np.float32)
    filtered_f32 = filtered.astype(np.float32)
    blended = cv2.addWeighted(source_f32, factor, filtered_f32, 1 - factor, 0)
    # Limit the values to [0, 255] before casting back.
    return np.clip(blended, 0, 255).astype(img.dtype)


def adjust_lighting(img, eigval, eigvec, alphastd=0.1, to_rgb=True):
    """AlexNet-style PCA jitter.

    This data augmentation is proposed in `ImageNet Classification with Deep
    Convolutional Neural Networks
    <https://dl.acm.org/doi/pdf/10.1145/3065386>`_.

    A random coefficient is drawn for every eigenvalue, and the same
    per-channel offset ``sum_j(eigvec[:, j] * alpha[j] * eigval[j])`` is
    added to all pixels.

    Args:
        img (ndarray): Image to be adjusted lighting. BGR order. The input
            is not modified.
        eigval (ndarray): 1-D array with the eigenvalues of the covariance
            matrix of pixel values.
        eigvec (ndarray): Array of shape ``(3, len(eigval))`` with the
            eigenvectors of the covariance matrix of pixel values.
        alphastd (float): The standard deviation for distribution of alpha.
            Defaults to 0.1
        to_rgb (bool): Whether to convert img to rgb.

    Returns:
        ndarray: The adjusted image.
    """
    assert isinstance(eigval, np.ndarray) and isinstance(eigvec, np.ndarray), \
        f'eigval and eigvec should both be of type np.ndarray, got ' \
        f'{type(eigval)} and {type(eigvec)} instead.'

    assert eigval.ndim == 1
    assert eigvec.ndim == 2
    num_components = eigval.shape[0]
    assert eigvec.shape == (3, num_components)
    assert isinstance(alphastd, float), 'alphastd should be of type float, ' \
        f'got {type(alphastd)} instead.'

    # astype returns a new float32 array, the caller's image is untouched.
    work_img = img.astype(np.float32)
    if to_rgb:
        cv2.cvtColor(work_img, cv2.COLOR_BGR2RGB, work_img)  # inplace

    alpha = np.random.normal(0, alphastd, num_components)
    # Row vectors of shape (1, n) broadcast against the (3, n) eigenvectors.
    weighted = eigvec \
        * alpha.reshape(1, num_components) \
        * eigval.reshape(1, num_components)
    channel_shift = weighted.sum(axis=1).reshape(1, 1, 3)
    return work_img + np.broadcast_to(channel_shift, work_img.shape)


def lut_transform(img, lut_table):
    """Transform array by look-up table.

    Every value of the input array is used as an index into the look-up
    table, and the output array is filled with the entries found there.

    Args:
        img (ndarray): Image to be transformed. Its values must lie in
            [0, 255]; it is converted to ``np.uint8`` before the lookup.
        lut_table (ndarray): Look-up table of shape ``(256, )``. The same
            table is applied to every channel of the input.

    Returns:
        ndarray: The transformed image.
    """
    assert isinstance(img, np.ndarray)
    assert 0 <= np.min(img)
    assert np.max(img) <= 255
    assert isinstance(lut_table, np.ndarray)
    assert lut_table.shape == (256, )

    # cv2.LUT is fed with an 8-bit copy of the input.
    indices = np.array(img, dtype=np.uint8)
    return cv2.LUT(indices, lut_table)


def clahe(img, clip_limit=40.0, tile_grid_size=(8, 8)):
    """Use CLAHE method to process the image.

    See `ZUIDERVELD,K. Contrast Limited Adaptive Histogram Equalization[J].
    Graphics Gems, 1994:474-485.` for more information.

    Args:
        img (ndarray): 2-D image to be processed. It is converted to
            ``np.uint8`` before being processed.
        clip_limit (float): Threshold for contrast limiting. Default: 40.0.
        tile_grid_size (tuple[int]): Size of grid for histogram equalization.
            Input image will be divided into equally sized rectangular tiles.
            It defines the number of tiles in row and column. Default: (8, 8).

    Returns:
        ndarray: The processed image.
    """
    assert isinstance(img, np.ndarray)
    assert img.ndim == 2
    assert isinstance(clip_limit, (float, int))
    assert is_tuple_of(tile_grid_size, int)
    assert len(tile_grid_size) == 2

    # Build the OpenCV CLAHE operator and run it on an 8-bit copy.
    equalizer = cv2.createCLAHE(clip_limit, tile_grid_size)
    img_u8 = np.array(img, dtype=np.uint8)
    return equalizer.apply(img_u8)


def adjust_hue(img: np.ndarray,
               hue_factor: float,
               backend: Optional[str] = None) -> np.ndarray:
    """Adjust hue of an image.

    The image hue is adjusted by converting the image to HSV and cyclically
    shifting the intensities in the hue channel (H). The image is then
    converted back to original image mode.

    `hue_factor` is the amount of shift in H channel and must be in the
    interval `[-0.5, 0.5]`.

    Modified from
    https://github.com/pytorch/vision/blob/main/torchvision/
    transforms/functional.py

    Args:
        img (ndarray): Image to be adjusted. The pillow backend only
            accepts ``np.uint8`` data; the cv2 backend converts the image
            to ``np.uint8`` internally.
        hue_factor (float):  How much to shift the hue channel. Should be in
            [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in
            HSV space in positive and negative direction respectively.
            0 means no shift. Therefore, both -0.5 and 0.5 will give an image
            with complementary colors while 0 gives the original image.
        backend (str | None): The image processing backend type. Options are
            `cv2`, `pillow`, `None`. If backend is None, the global
            ``imread_backend`` specified by ``mmcv.use_backend()`` will be
            used. Defaults to None.

    Returns:
        ndarray: Hue adjusted image, with the same dtype as ``img``.

    Raises:
        ValueError: If ``backend`` is not supported or ``hue_factor`` is
            outside ``[-0.5, 0.5]``.
        TypeError: If ``img`` is not an ndarray with 2 or 3 dimensions.
    """
    if backend is None:
        backend = imread_backend
    if backend not in ['cv2', 'pillow']:
        # Keep a space between the two sentences of the message.
        raise ValueError(f'backend: {backend} is not supported. '
                         f"Supported backends are 'cv2', 'pillow'")

    if not (-0.5 <= hue_factor <= 0.5):
        raise ValueError(f'hue_factor:{hue_factor} is not in [-0.5, 0.5].')
    if not (isinstance(img, np.ndarray) and (img.ndim in {2, 3})):
        raise TypeError('img should be ndarray with dim=[2 or 3].')

    # Shift of the 8-bit hue channel. The value is truncated towards zero
    # and wrapped into [0, 255] with integer arithmetic first, because
    # casting a negative float directly to an unsigned integer type is
    # platform dependent.
    hue_shift = np.uint8(int(hue_factor * 255) % 256)

    if backend == 'pillow':
        assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'
        # Image.fromarray expects RGB by default, not BGR, so the channel
        # axis is reversed before and after the PIL calls. The image is
        # always created in 'RGB' mode, so single-channel PIL modes never
        # occur here.
        pil_image = Image.fromarray(img[..., ::-1], mode='RGB')

        h, s, v = pil_image.convert('HSV').split()

        np_h = np.array(h, dtype=np.uint8)
        # uint8 addition takes care of rotation across boundaries
        with np.errstate(over='ignore'):
            np_h += hue_shift
        h = Image.fromarray(np_h, 'L')

        pil_image = Image.merge('HSV', (h, s, v)).convert('RGB')
        return np.array(pil_image, dtype=img.dtype)[..., ::-1]
    else:
        dtype = img.dtype
        img = img.astype(np.uint8)
        # The *_FULL conversions use the whole 8-bit range for the hue.
        hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV_FULL)
        h, s, v = cv2.split(hsv_img)
        h = h.astype(np.uint8)
        # uint8 addition takes care of rotation across boundaries
        with np.errstate(over='ignore'):
            h += hue_shift
        hsv_img = cv2.merge([h, s, v])
        return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype)


================================================
FILE: mmcv/ops/__init__.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from mmcv.utils import IS_MLU_AVAILABLE
from .active_rotated_filter import active_rotated_filter
from .assign_score_withk import assign_score_withk
from .ball_query import ball_query
from .bbox import bbox_overlaps
from .bezier_align import BezierAlign, bezier_align
from .bias_act import bias_act
from .border_align import BorderAlign, border_align
from .box_iou_quadri import box_iou_quadri
from .box_iou_rotated import box_iou_rotated
from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive
from .cc_attention import CrissCrossAttention
from .chamfer_distance import chamfer_distance
from .contour_expand import contour_expand
from .conv2d_gradfix import conv2d, conv_transpose2d
from .convex_iou import convex_giou, convex_iou
from .corner_pool import CornerPool
from .correlation import Correlation
from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d
from .deform_roi_pool import (DeformRoIPool, DeformRoIPoolPack,
                              ModulatedDeformRoIPoolPack, deform_roi_pool)
from .deprecated_wrappers import Conv2d_deprecated as Conv2d
from .deprecated_wrappers import ConvTranspose2d_deprecated as ConvTranspose2d
from .deprecated_wrappers import Linear_deprecated as Linear
from .deprecated_wrappers import MaxPool2d_deprecated as MaxPool2d
from .diff_iou_rotated import diff_iou_rotated_2d, diff_iou_rotated_3d
from .filtered_lrelu import filtered_lrelu
from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss,
                         sigmoid_focal_loss, softmax_focal_loss)
from .furthest_point_sample import (furthest_point_sample,
                                    furthest_point_sample_with_dist)
from .fused_bias_leakyrelu import FusedBiasLeakyReLU, fused_bias_leakyrelu
from .gather_points import gather_points
from .group_points import GroupAll, QueryAndGroup, grouping_operation
from .info import get_compiler_version, get_compiling_cuda_version
from .iou3d import (boxes_iou3d, boxes_iou_bev, boxes_overlap_bev, nms3d,
                    nms3d_normal, nms_bev, nms_normal_bev)
from .knn import knn
from .masked_conv import MaskedConv2d, masked_conv2d
from .min_area_polygons import min_area_polygons
from .modulated_deform_conv import (ModulatedDeformConv2d,
                                    ModulatedDeformConv2dPack,
                                    modulated_deform_conv2d)
from .multi_scale_deform_attn import MultiScaleDeformableAttention
from .nms import batched_nms, nms, nms_match, nms_quadri, nms_rotated, soft_nms
from .pixel_group import pixel_group
from .point_sample import (SimpleRoIAlign, point_sample,
                           rel_roi_point_to_rel_img_point)
from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu,
                              points_in_boxes_part)
from .points_in_polygons import points_in_polygons
from .points_sampler import PointsSampler
from .prroi_pool import PrRoIPool, prroi_pool
from .psa_mask import PSAMask
from .riroi_align_rotated import RiRoIAlignRotated, riroi_align_rotated
from .roi_align import RoIAlign, roi_align
from .roi_align_rotated import RoIAlignRotated, roi_align_rotated
from .roi_pool import RoIPool, roi_pool
from .roiaware_pool3d import RoIAwarePool3d
from .roipoint_pool3d import RoIPointPool3d
from .rotated_feature_align import rotated_feature_align
from .saconv import SAConv2d
from .scatter_points import DynamicScatter, dynamic_scatter
from .sparse_conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d,
                          SparseConvTranspose3d, SparseInverseConv2d,
                          SparseInverseConv3d, SubMConv2d, SubMConv3d)
from .sparse_modules import SparseModule, SparseSequential
from .sparse_pool import SparseMaxPool2d, SparseMaxPool3d
from .sparse_structure import SparseConvTensor, scatter_nd
from .sync_bn import SyncBatchNorm
from .three_interpolate import three_interpolate
from .three_nn import three_nn
from .tin_shift import TINShift, tin_shift
from .upfirdn2d import filter2d, upfirdn2d, upsample2d
from .voxelize import Voxelization, voxelization

# Public names of the ``mmcv.ops`` package; every entry is imported above.
__all__ = [
    'bbox_overlaps', 'CARAFE', 'CARAFENaive', 'CARAFEPack', 'carafe',
    'carafe_naive', 'CornerPool', 'DeformConv2d', 'DeformConv2dPack',
    'deform_conv2d', 'DeformRoIPool', 'DeformRoIPoolPack',
    'ModulatedDeformRoIPoolPack', 'deform_roi_pool', 'SigmoidFocalLoss',
    'SoftmaxFocalLoss', 'sigmoid_focal_loss', 'softmax_focal_loss',
    'get_compiler_version', 'get_compiling_cuda_version', 'MaskedConv2d',
    'masked_conv2d', 'ModulatedDeformConv2d', 'ModulatedDeformConv2dPack',
    'modulated_deform_conv2d', 'batched_nms', 'nms', 'soft_nms', 'nms_match',
    'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool', 'SyncBatchNorm', 'Conv2d',
    'ConvTranspose2d', 'Linear', 'MaxPool2d', 'CrissCrossAttention', 'PSAMask',
    'point_sample', 'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign',
    'SAConv2d', 'TINShift', 'tin_shift', 'assign_score_withk',
    'box_iou_rotated', 'box_iou_quadri', 'RoIPointPool3d', 'nms_rotated',
    'knn', 'ball_query', 'upfirdn2d', 'FusedBiasLeakyReLU',
    'fused_bias_leakyrelu', 'rotated_feature_align', 'RiRoIAlignRotated',
    'riroi_align_rotated', 'RoIAlignRotated', 'roi_align_rotated',
    'pixel_group', 'QueryAndGroup', 'GroupAll', 'grouping_operation',
    'contour_expand', 'three_nn', 'three_interpolate',
    'MultiScaleDeformableAttention', 'BorderAlign', 'border_align',
    'gather_points', 'furthest_point_sample', 'nms_quadri',
    'furthest_point_sample_with_dist', 'PointsSampler', 'Correlation',
    'boxes_iou3d', 'boxes_iou_bev', 'boxes_overlap_bev', 'nms_bev',
    'nms_normal_bev', 'nms3d', 'nms3d_normal', 'Voxelization', 'voxelization',
    'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', 'SparseConv2d',
    'SparseConv3d', 'SparseConvTranspose2d', 'SparseConvTranspose3d',
    'SparseInverseConv2d', 'SparseInverseConv3d', 'SubMConv2d', 'SubMConv3d',
    'SparseModule', 'SparseSequential', 'SparseMaxPool2d', 'SparseMaxPool3d',
    'SparseConvTensor', 'scatter_nd', 'points_in_boxes_part',
    'points_in_boxes_cpu', 'points_in_boxes_all', 'points_in_polygons',
    'min_area_polygons', 'active_rotated_filter', 'convex_iou', 'convex_giou',
    'diff_iou_rotated_2d', 'diff_iou_rotated_3d', 'chamfer_distance',
    'PrRoIPool', 'prroi_pool', 'bias_act', 'filtered_lrelu', 'conv2d',
    'conv_transpose2d', 'filter2d', 'upsample2d', 'BezierAlign', 'bezier_align'
]

# The MLU variants of the deformable convolution packs are only imported
# and exported when ``IS_MLU_AVAILABLE`` is set.
if IS_MLU_AVAILABLE:
    from .deform_conv import DeformConv2dPack_MLU  # noqa:F401
    from .modulated_deform_conv import \
        ModulatedDeformConv2dPack_MLU  # noqa:F401
    __all__.extend(['ModulatedDeformConv2dPack_MLU', 'DeformConv2dPack_MLU'])


================================================
FILE: mmcv/ops/active_rotated_filter.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple

import torch
from torch.autograd import Function
from torch.autograd.function import once_differentiable

from ..utils import ext_loader

# Handle to the '_ext' extension; the listed names are the two ops this
# module calls (forward and backward of the active rotated filter).
ext_module = ext_loader.load_ext(
    '_ext',
    ['active_rotated_filter_forward', 'active_rotated_filter_backward'])


class ActiveRotatedFilterFunction(Function):
    """Encode orientation information and generate orientation-sensitive
    features.

    The details are described in the paper
    `Align Deep Features for Oriented Object Detection <https://arxiv.org/abs/2008.09397>`_.
    """  # noqa: E501

    @staticmethod
    def forward(ctx, input: torch.Tensor,
                indices: torch.Tensor) -> torch.Tensor:
        """Run the forward op of the extension on ``input``.

        Args:
            input (torch.Tensor): Input features with shape
                [num_output_planes, num_input_planes, num_orientations, H, W].
            indices (torch.Tensor): Indices with shape
                [num_orientations, H, W, num_rotations].

        Returns:
            torch.Tensor: Refined features with shape [num_output_planes *
            num_rotations, num_input_planes * num_orientations, H, W].
        """
        ctx.save_for_backward(input, indices)
        # ``input`` has to be 5-D; only its two leading sizes are used.
        num_out_planes, num_in_planes, _, _, _ = input.size()
        # The orientation and spatial sizes are taken from ``indices``.
        num_orientations, height, width, num_rotations = indices.size()
        output_shape = (num_out_planes * num_rotations,
                        num_in_planes * num_orientations, height, width)
        output = input.new_zeros(output_shape)
        # The extension fills the pre-allocated ``output``.
        ext_module.active_rotated_filter_forward(input, indices, output)

        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, None]:
        """Run the backward op of the extension on ``grad_out``.

        Args:
            grad_out (torch.Tensor): The gradient of output features
                with shape [num_output_planes * num_rotations,
                num_input_planes * num_orientations, H, W].

        Returns:
            tuple: The gradient of input features with shape
            [num_output_planes, num_input_planes, num_orientations, H, W],
            followed by ``None`` for ``indices``.
        """
        saved_input, saved_indices = ctx.saved_tensors
        grad_input = torch.zeros_like(saved_input)
        # The extension fills the pre-allocated ``grad_input``.
        ext_module.active_rotated_filter_backward(grad_out, saved_indices,
                                                  grad_input)
        return grad_input, None


# Functional entry point: calls go through ``Function.apply``.
active_rotated_filter = ActiveRotatedFilterFunction.apply


================================================
FILE: mmcv/ops/assign_score_withk.py
================================================
from typing import Tuple

import torch
from torch.autograd import Function

from ..utils import ext_loader

# Handle to the '_ext' extension; the listed names are the two ops this
# module calls (forward and backward of assign_score_withk).
ext_module = ext_loader.load_ext(
    '_ext', ['assign_score_withk_forward', 'assign_score_withk_backward'])


class AssignScoreWithK(Function):
    r"""Perform weighted sum to generate output features according to scores.
    Modified from `PAConv <https://github.com/CVMI-Lab/PAConv/tree/main/
    scene_seg/lib/paconv_lib/src/gpu>`_.

    This is a memory-efficient CUDA implementation of assign_scores operation,
    which first transforms all point features with weight bank, then assembles
    neighbor features with ``knn_idx`` and performs weighted sum of
    ``scores``.

    See the `paper <https://arxiv.org/pdf/2103.14635.pdf>`_ appendix Sec. D for
        more detailed descriptions.

    Note:
        This implementation assumes using ``neighbor`` kernel input, which is
            (point_features - center_features, point_features).
        See https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/
        pointnet2/paconv.py#L128 for more details.
    """

    @staticmethod
    def forward(ctx,
                scores: torch.Tensor,
                point_features: torch.Tensor,
                center_features: torch.Tensor,
                knn_idx: torch.Tensor,
                aggregate: str = 'sum') -> torch.Tensor:
        """Aggregate the features with the forward op of the extension.

        Args:
            scores (torch.Tensor): (B, npoint, K, M), predicted scores to
                aggregate weight matrices in the weight bank.
                ``npoint`` is the number of sampled centers.
                ``K`` is the number of queried neighbors.
                ``M`` is the number of weight matrices in the weight bank.
            point_features (torch.Tensor): (B, N, M, out_dim)
                Pre-computed point features to be aggregated.
            center_features (torch.Tensor): (B, N, M, out_dim)
                Pre-computed center features to be aggregated.
            knn_idx (torch.Tensor): (B, npoint, K), index of sampled kNN.
                We assume the first idx in each row is the idx of the center.
            aggregate (str, optional): Aggregation method.
                Can be 'sum', 'avg' or 'max'. Defaults: 'sum'.

        Returns:
            torch.Tensor: (B, out_dim, npoint, K), the aggregated features.
        """
        # Integer codes handed to the extension for each aggregation method.
        aggregate_codes = {'sum': 0, 'avg': 1, 'max': 2}

        batch_size, num_points, num_matrices, out_dim = point_features.size()
        _, num_centers, num_neighbors, _ = scores.size()

        output_shape = (batch_size, out_dim, num_centers, num_neighbors)
        output = point_features.new_zeros(output_shape)
        aggregate_code = aggregate_codes[aggregate]
        # The extension fills the pre-allocated ``output``.
        ext_module.assign_score_withk_forward(
            point_features.contiguous(),
            center_features.contiguous(),
            scores.contiguous(),
            knn_idx.contiguous(),
            output,
            B=batch_size,
            N0=num_points,
            N1=num_centers,
            M=num_matrices,
            K=num_neighbors,
            O=out_dim,
            aggregate=aggregate_code)

        ctx.save_for_backward(output, point_features, center_features, scores,
                              knn_idx)
        # The aggregation code is needed again by the backward op.
        ctx.agg = aggregate_code

        return output

    @staticmethod
    def backward(
        ctx, grad_out: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, None, None]:
        """Compute the gradients with the backward op of the extension.

        Args:
            grad_out (torch.Tensor): (B, out_dim, npoint, K)

        Returns:
            tuple[torch.Tensor]: A tuple contains five elements. The first one
            is the gradient of ``scores`` whose shape is (B, npoint, K, M). The
            second is the gradient of ``point_features`` whose shape is
            (B, N, M, out_dim). The third is the gradient of
            ``center_features`` with the shape of (B, N, M, out_dim). The last
            two are ``None``.
        """
        # The saved forward output is not needed for the gradients.
        (_, point_features, center_features, scores,
         knn_idx) = ctx.saved_tensors

        batch_size, num_points, num_matrices, out_dim = point_features.size()
        _, num_centers, num_neighbors, _ = scores.size()

        # Zero-initialized buffers filled by the extension.
        grad_point_features = point_features.new_zeros(point_features.shape)
        grad_center_features = center_features.new_zeros(center_features.shape)
        grad_scores = scores.new_zeros(scores.shape)

        ext_module.assign_score_withk_backward(
            grad_out.contiguous(),
            point_features.contiguous(),
            center_features.contiguous(),
            scores.contiguous(),
            knn_idx.contiguous(),
            grad_point_features,
            grad_center_features,
            grad_scores,
            B=batch_size,
            N0=num_points,
            N1=num_centers,
            M=num_matrices,
            K=num_neighbors,
            O=out_dim,
            aggregate=ctx.agg)

        # One entry per forward input; ``knn_idx`` and ``aggregate`` get
        # no gradient.
        return (grad_scores, grad_point_features, grad_center_features, None,
                None)


# Functional entry point: calls go through ``Function.apply``.
assign_score_withk = AssignScoreWithK.apply


================================================
FILE: mmcv/ops/ball_query.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional, Tuple

import torch
from torch.autograd import Function

from ..utils import ext_loader

# Handle to the '_ext' extension; the listed names are the two ops this
# module calls (batched and stacked ball query).
ext_module = ext_loader.load_ext(
    '_ext', ['ball_query_forward', 'stack_ball_query_forward'])


class BallQuery(Function):
    """Find nearby points in spherical space."""

    @staticmethod
    def forward(
            ctx,
            min_radius: float,
            max_radius: float,
            sample_num: int,
            xyz: torch.Tensor,
            center_xyz: torch.Tensor,
            xyz_batch_cnt: Optional[torch.Tensor] = None,
            center_xyz_batch_cnt: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """Query the indices of the features lying in the balls.

        The stacked op is used when both ``xyz_batch_cnt`` and
        ``center_xyz_batch_cnt`` are given, the batched op otherwise.

        Args:
            min_radius (float): minimum radius of the balls.
            max_radius (float): maximum radius of the balls.
            sample_num (int): maximum number of features in the balls.
            xyz (torch.Tensor): (B, N, 3) xyz coordinates of the features,
                or stacked input (N1 + N2 ..., 3). Must be contiguous.
            center_xyz (torch.Tensor): (B, npoint, 3) centers of the ball
                query, or stacked input (M1 + M2 ..., 3). Must be contiguous.
            xyz_batch_cnt: (batch_size): Stacked input xyz coordinates nums in
                each batch, just like (N1, N2, ...). Must be of dtype
                ``torch.int``. Defaults to None.
                New in version 1.7.0.
            center_xyz_batch_cnt: (batch_size): Stacked centers coordinates
                nums in each batch, just like (M1, M2, ...). Must be of dtype
                ``torch.int``. Defaults to None.
                New in version 1.7.0.

        Returns:
            torch.Tensor: (B, npoint, nsample) tensor with the indices of the
            features that form the query balls. For stacked input the shape
            is (M1 + M2 ..., nsample).
        """
        assert center_xyz.is_contiguous()
        assert xyz.is_contiguous()
        assert min_radius < max_radius
        use_stacked_input = (
            xyz_batch_cnt is not None and center_xyz_batch_cnt is not None)
        if not use_stacked_input:
            batch_size, num_points, _ = xyz.size()
            num_centers = center_xyz.size(1)
            idx = xyz.new_zeros((batch_size, num_centers, sample_num),
                                dtype=torch.int32)
            # The extension fills the pre-allocated ``idx``.
            ext_module.ball_query_forward(
                center_xyz,
                xyz,
                idx,
                b=batch_size,
                n=num_points,
                m=num_centers,
                min_radius=min_radius,
                max_radius=max_radius,
                nsample=sample_num)
        else:
            assert xyz_batch_cnt.dtype == torch.int
            assert center_xyz_batch_cnt.dtype == torch.int
            # One row of indices per stacked center.
            idx = center_xyz.new_zeros((center_xyz.shape[0], sample_num),
                                       dtype=torch.int32)
            # Only ``max_radius`` is handed to the stacked op.
            ext_module.stack_ball_query_forward(
                center_xyz,
                center_xyz_batch_cnt,
                xyz,
                xyz_batch_cnt,
                idx,
                max_radius=max_radius,
                nsample=sample_num,
            )
        # The indices carry no gradient; the marking is skipped on parrots.
        if torch.__version__ != 'parrots':
            ctx.mark_non_differentiable(idx)
        return idx

    @staticmethod
    def backward(ctx, a=None) -> Tuple[None, None, None, None]:
        """Return no gradient, i.e. a tuple of four ``None``."""
        return (None, ) * 4


# Functional entry point: calls go through ``Function.apply``.
ball_query = BallQuery.apply


================================================
FILE: mmcv/ops/bbox.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from ..utils import ext_loader

# Handle to the compiled `_ext` extension providing the `bbox_overlaps` op.
# NOTE(review): presumably `load_ext` also checks that the listed op is
# present in the extension -- confirm against `ext_loader`.
ext_module = ext_loader.load_ext('_ext', ['bbox_overlaps'])


def _bbox_overlaps_cpu(bboxes1: torch.Tensor,
                       bboxes2: torch.Tensor,
                       mode: str = 'iou',
                       aligned: bool = False,
                       offset: int = 0) -> torch.Tensor:
    assert mode in ['iou', 'iof']

    if aligned:
        lt = torch.max(bboxes1[:, :2], bboxes2[:, :2])  # [rows, 2]
        rb = torch.min(bboxes1[:, 2:], bboxes2[:, 2:])  # [rows, 2]

        wh = (rb - lt + offset).clamp(min=0)  # [rows, 2]
        overlap = wh[:, 0] * wh[:, 1]
        area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * (
            bboxes1[:, 3] - bboxes1[:, 1] + offset)

        if mode == 'iou':
            area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * (
                bboxes2[:, 3] - bboxes2[:, 1] + offset)
            ious = overlap / (area1 + area2 - overlap)
        else:
            ious = overlap / area1
    else:
        lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2])  # [rows, cols, 2]
        rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:])  # [rows, cols, 2]

        wh = (rb - lt + offset).clamp(min=0)  # [rows, cols, 2]
        overlap = wh[:, :, 0] * wh[:, :, 1]
        area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * (
            bboxes1[:, 3] - bboxes1[:, 1] + offset)

        if mode == 'iou':
            area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * (
                bboxes2[:, 3] - bboxes2[:, 1] + offset)
            ious = overlap / (area1[:, None] + area2 - overlap)
        else:
            ious = overlap / (area1[:, None])

    return ious


def bbox_overlaps(bboxes1: torch.Tensor,
                  bboxes2: torch.Tensor,
                  mode: str = 'iou',
                  aligned: bool = False,
                  offset: int = 0) -> torch.Tensor:
    """Calculate overlap between two sets of bboxes.

    With ``aligned=False`` every box of ``bboxes1`` is compared with every
    box of ``bboxes2``; with ``aligned=True`` only the boxes sharing the same
    index are compared.

    Args:
        bboxes1 (torch.Tensor): shape (m, 4) in <x1, y1, x2, y2> format or
            empty.
        bboxes2 (torch.Tensor): shape (n, 4) in <x1, y1, x2, y2> format or
            empty. If aligned is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union) or "iof" (intersection
            over foreground). Defaults to "iou".
        aligned (bool): Whether to compare boxes pairwise by index instead of
            building the full (m, n) matrix. Defaults to False.
        offset (int): Must be 0 or 1; forwarded unchanged to the overlap
            computation. Defaults to 0.

    Returns:
        torch.Tensor: The overlaps between boxes. If ``aligned`` is
        ``False``, the shape is (m, n), otherwise (m,).

    Example:
        >>> bboxes1 = torch.FloatTensor([
        >>>     [0, 0, 10, 10],
        >>>     [10, 10, 20, 20],
        >>>     [32, 32, 38, 42],
        >>> ])
        >>> bboxes2 = torch.FloatTensor([
        >>>     [0, 0, 10, 20],
        >>>     [0, 10, 10, 19],
        >>>     [10, 10, 20, 20],
        >>> ])
        >>> bbox_overlaps(bboxes1, bboxes2)
        tensor([[0.5000, 0.0000, 0.0000],
                [0.0000, 0.0000, 1.0000],
                [0.0000, 0.0000, 0.0000]])

    Example:
        >>> empty = torch.FloatTensor([])
        >>> nonempty = torch.FloatTensor([
        >>>     [0, 0, 10, 9],
        >>> ])
        >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
        >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
        >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
    """
    # Integer code handed to the extension for each mode.
    mode_codes = {'iou': 0, 'iof': 1}
    assert mode in mode_codes
    mode_flag = mode_codes[mode]
    # Either the boxes are empty or the length of boxes' last dimension is 4
    assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
    assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)
    assert offset == 1 or offset == 0

    num_first, num_second = bboxes1.size(0), bboxes2.size(0)

    if aligned:
        assert num_first == num_second
        result_shape = (num_first, )
    else:
        result_shape = (num_first, num_second)
    ious = bboxes1.new_zeros(result_shape)

    # Nothing to compute when either side is empty.
    if num_first == 0 or num_second == 0:
        return ious

    # CPU tensors under parrots take the pure-PyTorch path.
    if torch.__version__ == 'parrots' and bboxes1.device.type == 'cpu':
        return _bbox_overlaps_cpu(
            bboxes1, bboxes2, mode=mode, aligned=aligned, offset=offset)

    # The extension writes its result into the pre-allocated `ious`.
    ext_module.bbox_overlaps(
        bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset)

    return ious


================================================
FILE: mmcv/ops/bezier_align.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple, Union

import torch
import torch.nn as nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair

from ..utils import ext_loader

# Handle to the compiled `_ext` extension providing the forward and backward
# bezier-align kernels.
# NOTE(review): presumably `load_ext` also checks that the listed ops are
# present in the extension -- confirm against `ext_loader`.
ext_module = ext_loader.load_ext(
    '_ext', ['bezier_align_forward', 'bezier_align_backward'])


class BezierAlignFunction(Function):
    """Autograd function wrapping the compiled bezier-align kernels."""

    @staticmethod
    def forward(ctx,
                input: torch.Tensor,
                beziers: torch.Tensor,
                output_size: Union[int, Tuple[int, int]],
                spatial_scale: Union[int, float] = 1.0,
                sampling_ratio: int = 0,
                aligned: bool = True) -> torch.Tensor:
        """Pool ``input`` along the given beziers.

        Args:
            input (torch.Tensor): Input features.
            beziers (torch.Tensor): Beziers for align; the second dimension
                must have size 17.
            output_size (int | tuple): Pooled height and width.
            spatial_scale (int | float): Forwarded to the kernel.
                Defaults to 1.0.
            sampling_ratio (int): Forwarded to the kernel. Defaults to 0.
            aligned (bool): Forwarded to the kernel. Defaults to True.

        Returns:
            torch.Tensor: Tensor of shape (number of beziers, input channels,
            pooled height, pooled width).
        """
        # Record the configuration so backward() can replay it.
        ctx.output_size = _pair(output_size)
        ctx.spatial_scale = spatial_scale
        ctx.input_shape = input.size()
        ctx.sampling_ratio = sampling_ratio
        ctx.aligned = aligned

        assert beziers.size(1) == 17
        pooled_h = ctx.output_size[0]
        pooled_w = ctx.output_size[1]
        pooled = input.new_zeros(
            (beziers.size(0), input.size(1), pooled_h, pooled_w))
        # The kernel fills `pooled` in place.
        ext_module.bezier_align_forward(
            input,
            beziers,
            pooled,
            aligned_height=pooled_h,
            aligned_width=pooled_w,
            spatial_scale=ctx.spatial_scale,
            sampling_ratio=ctx.sampling_ratio,
            aligned=ctx.aligned)

        ctx.save_for_backward(beziers)
        return pooled

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output: torch.Tensor):
        """Compute the gradient w.r.t. ``input``.

        Only the feature map receives a gradient; the other five forward
        arguments get ``None``.
        """
        (beziers, ) = ctx.saved_tensors
        grad_input = grad_output.new_zeros(ctx.input_shape)
        kernel_config = dict(
            aligned_height=ctx.output_size[0],
            aligned_width=ctx.output_size[1],
            spatial_scale=ctx.spatial_scale,
            sampling_ratio=ctx.sampling_ratio,
            aligned=ctx.aligned)
        # The kernel receives a contiguous gradient and fills `grad_input`.
        ext_module.bezier_align_backward(grad_output.contiguous(), beziers,
                                         grad_input, **kernel_config)
        return grad_input, None, None, None, None, None


# Functional entry point: calling ``bezier_align(...)`` runs
# ``BezierAlignFunction.apply``.
bezier_align = BezierAlignFunction.apply


class BezierAlign(nn.Module):
    """Bezier align pooling layer.

    Args:
        output_size (tuple): h, w
        spatial_scale (float): scale the input boxes by this number
        sampling_ratio (int): number of inputs samples to take for each
            output sample. 0 to take samples densely for current models.
        aligned (bool): if False, use the legacy implementation in
            MMDetection. If True, align the results more perfectly.

    Note:
        The implementation of BezierAlign is modified from
        https://github.com/aim-uofa/AdelaiDet

        The meaning of aligned=True:

        Given a continuous coordinate c, its two neighboring pixel
        indices (in our pixel model) are computed by floor(c - 0.5) and
        ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete
        indices [0] and [1] (which are sampled from the underlying signal
        at continuous coordinates 0.5 and 1.5). But the original roi_align
        (aligned=False) does not subtract the 0.5 when computing
        neighboring pixel indices and therefore it uses pixels with a
        slightly incorrect alignment (relative to our pixel model) when
        performing bilinear interpolation.

        With `aligned=True`,
        we first appropriately scale the ROI and then shift it by -0.5
        prior to calling roi_align. This produces the correct neighbors;

        The difference does not make a difference to the model's
        performance if ROIAlign is used together with conv layers.
    """

    def __init__(
        self,
        output_size: Tuple,
        spatial_scale: Union[int, float],
        sampling_ratio: int,
        aligned: bool = True,
    ) -> None:
        super().__init__()

        # Normalise the configuration once so forward() can pass it through.
        self.output_size = _pair(output_size)
        self.spatial_scale = float(spatial_scale)
        self.sampling_ratio = int(sampling_ratio)
        self.aligned = aligned

    def forward(self, input: torch.Tensor,
                beziers: torch.Tensor) -> torch.Tensor:
        """BezierAlign forward.

        Args:
            input (Tensor): input features.
            beziers (Tensor): beziers for align.

        Returns:
            Tensor: the pooled features produced by ``bezier_align``.
        """
        return bezier_align(input, beziers, self.output_size,
                            self.spatial_scale, self.sampling_ratio,
                            self.aligned)

    def __repr__(self):
        # Emit one balanced, comma-separated argument list.
        return (f'{self.__class__.__name__}('
                f'output_size={self.output_size}, '
                f'spatial_scale={self.spatial_scale}, '
                f'sampling_ratio={self.sampling_ratio}, '
                f'aligned={self.aligned})')


================================================
FILE: mmcv/ops/bias_act.py
================================================
# Modified from
# https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/bias_act.py

# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

# source: https://github.com/open-mmlab/mmediting/blob/dev-1.x/mmedit/models/editors/stylegan3/stylegan3_ops/ops/bias_act.py # noqa
"""Custom PyTorch ops for efficient bias and activation."""

from typing import Any, Dict, Optional, Union

import numpy as np
import torch

from ..utils import ext_loader

# Handle to the compiled `_ext` extension providing the fused `bias_act` op.
# NOTE(review): presumably `load_ext` also checks that the listed op is
# present in the extension -- confirm against `ext_loader`.
ext_module = ext_loader.load_ext('_ext', ['bias_act'])


class EasyDict(dict):
    """Dictionary whose items can also be read, written and deleted through
    attribute syntax (``d.key`` is the same as ``d['key']``)."""

    def __getattr__(self, name: str) -> Any:
        # Only reached when normal attribute lookup fails; translate a
        # missing key into the exception attribute access is expected to
        # raise, so `hasattr`/`getattr` with a default keep working.
        try:
            value = self[name]
        except KeyError:
            raise AttributeError(name)
        return value

    def __setattr__(self, name: str, value: Any) -> None:
        # Attribute assignment stores an item instead of an instance field.
        self.__setitem__(name, value)

    def __delattr__(self, name: str) -> None:
        # Attribute deletion removes the item with that key.
        self.__delitem__(name)


# Registry of the activations supported by `bias_act`, keyed by name.
# Fields of each entry, as used by the functions in this file:
#   func:         reference implementation, called as `func(x, alpha=alpha)`.
#   def_alpha:    value used for `alpha` when the caller passes None.
#   def_gain:     value used for `gain` when the caller passes None.
#   cuda_idx:     integer passed to `ext_module.bias_act` (presumably the
#                 kernel's activation selector -- confirm against the op).
#   ref:          which tensors the CUDA forward keeps for backward:
#                 contains 'x' for the input, 'y' for the output, '' for none.
#   has_2nd_grad: when True, the input/bias are also saved and the
#                 second-order gradient call is enabled.
activation_funcs = {
    'linear':
    EasyDict(
        func=lambda x, **_: x,
        def_alpha=0,
        def_gain=1,
        cuda_idx=1,
        ref='',
        has_2nd_grad=False),
    'relu':
    EasyDict(
        func=lambda x, **_: torch.nn.functional.relu(x),
        def_alpha=0,
        def_gain=np.sqrt(2),
        cuda_idx=2,
        ref='y',
        has_2nd_grad=False),
    'lrelu':
    EasyDict(
        func=lambda x, alpha, **_: torch.nn.functional.leaky_relu(x, alpha),
        def_alpha=0.2,
        def_gain=np.sqrt(2),
        cuda_idx=3,
        ref='y',
        has_2nd_grad=False),
    'tanh':
    EasyDict(
        func=lambda x, **_: torch.tanh(x),
        def_alpha=0,
        def_gain=1,
        cuda_idx=4,
        ref='y',
        has_2nd_grad=True),
    'sigmoid':
    EasyDict(
        func=lambda x, **_: torch.sigmoid(x),
        def_alpha=0,
        def_gain=1,
        cuda_idx=5,
        ref='y',
        has_2nd_grad=True),
    'elu':
    EasyDict(
        func=lambda x, **_: torch.nn.functional.elu(x),
        def_alpha=0,
        def_gain=1,
        cuda_idx=6,
        ref='y',
        has_2nd_grad=True),
    'selu':
    EasyDict(
        func=lambda x, **_: torch.nn.functional.selu(x),
        def_alpha=0,
        def_gain=1,
        cuda_idx=7,
        ref='y',
        has_2nd_grad=True),
    'softplus':
    EasyDict(
        func=lambda x, **_: torch.nn.functional.softplus(x),
        def_alpha=0,
        def_gain=1,
        cuda_idx=8,
        ref='y',
        has_2nd_grad=True),
    'swish':
    EasyDict(
        func=lambda x, **_: torch.sigmoid(x) * x,
        def_alpha=0,
        def_gain=np.sqrt(2),
        cuda_idx=9,
        ref='x',
        has_2nd_grad=True),
}

# MUSA counterpart of the activation registry, keyed by activation name.
# Fields of each entry, as used by `_bias_act_musa`:
#   func:         reference implementation taking `x` and keyword arguments.
#   def_alpha:    value used for `alpha` when the caller passes None.
#   def_gain:     value used for `gain` when the caller passes None.
#   musa_idx:     integer passed to `ext_module.bias_act` (presumably the
#                 kernel's activation selector -- confirm against the op).
#   ref:          which tensors the MUSA forward keeps for backward:
#                 contains 'x' for the input, 'y' for the output, '' for none.
#   has_2nd_grad: when True, the input/bias are also saved and the
#                 second-order gradient call is enabled.
activation_funcs_musa = {
    'linear':
    EasyDict(
        func=lambda x, **_: x,
        def_alpha=0,
        def_gain=1,
        musa_idx=1,
        ref='',
        has_2nd_grad=False),
    'relu':
    EasyDict(
        func=lambda x, **_: torch.nn.functional.relu(x),
        def_alpha=0,
        def_gain=np.sqrt(2),
        musa_idx=2,
        ref='y',
        has_2nd_grad=False),
    'lrelu':
    EasyDict(
        func=lambda x, alpha, **_: torch.nn.functional.leaky_relu(x, alpha),
        def_alpha=0.2,
        def_gain=np.sqrt(2),
        musa_idx=3,
        ref='y',
        has_2nd_grad=False),
    'tanh':
    EasyDict(
        func=lambda x, **_: torch.tanh(x),
        def_alpha=0,
        def_gain=1,
        musa_idx=4,
        ref='y',
        has_2nd_grad=True),
    'sigmoid':
    EasyDict(
        func=lambda x, **_: torch.sigmoid(x),
        def_alpha=0,
        def_gain=1,
        musa_idx=5,
        ref='y',
        has_2nd_grad=True),
    'elu':
    EasyDict(
        func=lambda x, **_: torch.nn.functional.elu(x),
        def_alpha=0,
        def_gain=1,
        musa_idx=6,
        ref='y',
        has_2nd_grad=True),
    'selu':
    EasyDict(
        func=lambda x, **_: torch.nn.functional.selu(x),
        def_alpha=0,
        def_gain=1,
        musa_idx=7,
        ref='y',
        has_2nd_grad=True),
    'softplus':
    EasyDict(
        func=lambda x, **_: torch.nn.functional.softplus(x),
        def_alpha=0,
        def_gain=1,
        musa_idx=8,
        ref='y',
        has_2nd_grad=True),
    'swish':
    EasyDict(
        func=lambda x, **_: torch.sigmoid(x) * x,
        def_alpha=0,
        def_gain=np.sqrt(2),
        musa_idx=9,
        ref='x',
        has_2nd_grad=True),
}

# Zero-element placeholder handed to the fused op (after `.to(device)`) in
# place of tensors that are absent or not needed.
_null_tensor = torch.empty([0])


def bias_act(input: torch.Tensor,
             bias: Optional[torch.Tensor] = None,
             dim: int = 1,
             act: str = 'linear',
             alpha: Optional[Union[float, int]] = None,
             gain: Optional[float] = None,
             clamp: Optional[float] = None,
             use_custom_op: bool = True):
    r"""Fused bias and activation function.

    Adds `bias` to activation tensor `input`, and evaluates activation
    function `act`, and scales the result by `gain`. Each of the steps is
    optional.

    In most cases, the fused op is considerably more efficient than performing
    the same calculation using standard PyTorch ops. It supports first and
    second order gradients, but not third order gradients.

    The fused op is used for CUDA tensors and for MUSA tensors when
    `use_custom_op` is True; every other case goes through the reference
    implementation built from standard PyTorch ops.

    Args:
        input (torch.Tensor): Input activation tensor. Can be of any shape.
        bias (torch.Tensor): Bias vector, or `None` to disable.
            Must be a 1D tensor of the same type as `input`. The shape must
            be known, and it must match the dimension of `input` corresponding
            to `dim`. Defaults to None.
        dim (int): The dimension in `input` corresponding to the elements of
            `bias`. The value of `dim` is ignored if `bias` is not specified.
            Defaults to 1.
        act (str): Name of the activation function to evaluate, or `"linear"`
            to disable. Can be e.g. "relu", "lrelu", "tanh", "sigmoid",
            "swish", etc. See `activation_funcs` for a full list. `None` is not
            allowed. Defaults to `linear`.
        alpha (float or int): Shape parameter for the activation
            function, or `None` to use the default. Defaults to None.
        gain (float): Scaling factor for the output tensor, or `None`
            to use default. See `activation_funcs` for the default scaling of
            each activation function. If unsure, consider specifying 1.
            Defaults to None.
        clamp (float):  Clamp the output values to `[-clamp, +clamp]`,
            or `None` to disable the clamping (default). Defaults to None.
        use_custom_op (bool): Whether to use customized op.
            Defaults to True.

    Returns:
        torch.Tensor: Tensor of the same shape and datatype as `input`.
    """
    assert isinstance(input, torch.Tensor)
    if use_custom_op:
        if input.is_cuda:
            return _bias_act_cuda(
                dim=dim, act=act, alpha=alpha, gain=gain,
                clamp=clamp).apply(input, bias)
        # Tensors may not have an `is_musa` attribute at all. Probe for it
        # without a surrounding try/except so that an AttributeError raised
        # inside the MUSA op itself is reported instead of silently
        # triggering the reference fallback.
        if getattr(input, 'is_musa', False):
            return _bias_act_musa(
                dim=dim, act=act, alpha=alpha, gain=gain,
                clamp=clamp).apply(input, bias)
    return _bias_act_ref(
        input=input,
        bias=bias,
        dim=dim,
        act=act,
        alpha=alpha,
        gain=gain,
        clamp=clamp)


def _bias_act_ref(input: torch.Tensor,
                  bias: Optional[torch.Tensor] = None,
                  dim: int = 1,
                  act: str = 'linear',
                  alpha: Optional[Union[float, int]] = None,
                  gain: Optional[float] = None,
                  clamp: Optional[float] = None):
    """Reference implementation of `bias_act()` built from standard PyTorch
    ops.

    Applies, in order: bias addition, the activation `act`, scaling by
    `gain`, and clamping. Each step is skipped when it is disabled.

    Args:
        input (torch.Tensor): Input activation tensor. Can be of any shape.
        bias (torch.Tensor): Bias vector, or `None` to disable.
            Must be a 1D tensor whose length matches the size of `input`
            along `dim`. Defaults to None.
        dim (int): The dimension in `input` corresponding to the elements of
            `bias`. Ignored if `bias` is not specified. Defaults to 1.
        act (str): Name of the activation function to evaluate, or `"linear"`
            to disable. Must be a key of `activation_funcs`.
            Defaults to `linear`.
        alpha (float or int): Shape parameter for the activation
            function, or `None` to use the activation's default.
            Defaults to None.
        gain (float): Scaling factor for the output tensor, or `None`
            to use the activation's default. Defaults to None.
        clamp (float): Clamp the output values to `[-clamp, +clamp]`, or
            `None` to disable the clamping. Defaults to None.

    Returns:
        torch.Tensor: Tensor of the same shape and datatype as `input`.
    """
    assert isinstance(input, torch.Tensor)
    assert clamp is None or clamp >= 0
    spec = activation_funcs[act]
    # Resolve the defaults; a negative limit means "no clamping".
    slope = float(spec.def_alpha if alpha is None else alpha)
    scale = float(spec.def_gain if gain is None else gain)
    limit = float(-1 if clamp is None else clamp)

    # Add bias.
    if bias is not None:
        assert isinstance(bias, torch.Tensor) and bias.ndim == 1
        assert 0 <= dim < input.ndim
        assert bias.shape[0] == input.shape[dim]
        # Shape that broadcasts the bias along every axis except `dim`.
        broadcast_shape = [
            -1 if axis == dim else 1 for axis in range(input.ndim)
        ]
        input = input + bias.reshape(broadcast_shape)

    # Evaluate activation function.
    output = spec.func(input, alpha=slope)

    # Scale by gain.
    if scale != 1:
        output = output * scale

    # Clamp.
    if limit >= 0:
        # pylint: disable=invalid-unary-operand-type
        output = output.clamp(-limit, limit)
    return output


# Memoises the Function classes built by `_bias_act_cuda`, keyed by
# `(dim, act, alpha, gain, clamp)` after defaults have been resolved.
_bias_act_cuda_cache: Dict = dict()


def _bias_act_cuda(dim: int = 1,
                   act: str = 'linear',
                   alpha: Optional[Union[float, int]] = None,
                   gain: Optional[float] = None,
                   clamp: Optional[float] = None):
    """Fast CUDA implementation of `bias_act()` using custom ops.

    Builds a ``torch.autograd.Function`` subclass specialised for the given
    arguments. Classes are cached per argument combination, so repeated calls
    with the same arguments return the same class.

    Args:
        dim (int): The dimension in `x` corresponding to the elements of `b`.
            The value of `dim` is ignored if `b` is not specified.
            Defaults to 1.
        act (str): Name of the activation function to evaluate, or `"linear"`
            to disable. Can be e.g. "relu", "lrelu", "tanh", "sigmoid",
            "swish", etc. See `activation_funcs` for a full list. `None` is not
            allowed. Defaults to `linear`.
        alpha (float | int): Shape parameter for the activation
            function, or `None` to use the default. Defaults to None.
        gain (float): Scaling factor for the output tensor, or `None`
            to use default. See `activation_funcs` for the default scaling of
            each activation function. If unsure, consider specifying 1.
            Defaults to None.
        clamp (float): Clamp the output values to `[-clamp, +clamp]`,
            or `None` to disable the clamping (default). Defaults to None.

    Returns:
        type: A ``torch.autograd.Function`` subclass. Calling its
        ``apply(x, b)`` (``b`` may be ``None``) returns a tensor of the same
        shape and datatype as `x`.
    """
    # Parse arguments.
    assert clamp is None or clamp >= 0
    spec = activation_funcs[act]
    alpha = float(alpha if alpha is not None else spec.def_alpha)
    gain = float(gain if gain is not None else spec.def_gain)
    clamp = float(clamp if clamp is not None else -1)

    # Lookup from cache.
    key = (dim, act, alpha, gain, clamp)
    if key in _bias_act_cuda_cache:
        return _bias_act_cuda_cache[key]

    # Forward op.
    class BiasActCuda(torch.autograd.Function):

        @staticmethod
        def forward(ctx, x, b):  # pylint: disable=arguments-differ
            ctx.memory_format = torch.channels_last if x.ndim > 2 and x.stride(
                1) == 1 else torch.contiguous_format
            x = x.contiguous(memory_format=ctx.memory_format)
            # One placeholder on the input's device, reused for every
            # absent argument below.
            null = _null_tensor.to(x.device)
            # Remember whether a bias was supplied *before* substituting the
            # placeholder. An identity comparison against
            # `_null_tensor.to(x.device)` cannot be used for this, because
            # `.to()` returns a new tensor object whenever the device
            # differs, which would make the comparison always true.
            has_bias = b is not None
            b = b.contiguous() if has_bias else null
            y = x
            # Skip the kernel entirely when the op reduces to the identity.
            if act != 'linear' or gain != 1 or clamp >= 0 or has_bias:
                y = ext_module.bias_act(x, b, null, null, null, 0, dim,
                                        spec.cuda_idx, alpha, gain, clamp)
            # Keep only the tensors the backward pass will actually read.
            keep_input = 'x' in spec.ref or spec.has_2nd_grad
            ctx.save_for_backward(x if keep_input else null,
                                  b if keep_input else null,
                                  y if 'y' in spec.ref else null)
            return y

        @staticmethod
        def backward(ctx, dy):  # pylint: disable=arguments-differ
            dy = dy.contiguous(memory_format=ctx.memory_format)
            x, b, y = ctx.saved_tensors
            dx = None
            db = None

            if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
                dx = dy
                if act != 'linear' or gain != 1 or clamp >= 0:
                    dx = BiasActCudaGrad.apply(dy, x, b, y)

            if ctx.needs_input_grad[1]:
                # The bias gradient is dx reduced over all axes except `dim`.
                db = dx.sum([i for i in range(dx.ndim) if i != dim])

            return dx, db

    # Backward op.
    class BiasActCudaGrad(torch.autograd.Function):

        @staticmethod
        def forward(ctx, dy, x, b, y):  # pylint: disable=arguments-differ
            ctx.memory_format = torch.channels_last if dy.ndim > 2 and (
                dy.stride(1) == 1) else torch.contiguous_format
            null = _null_tensor.to(x.device)
            dx = ext_module.bias_act(dy, b, x, y, null, 1, dim, spec.cuda_idx,
                                     alpha, gain, clamp)
            ctx.save_for_backward(dy if spec.has_2nd_grad else null, x, b, y)
            return dx

        @staticmethod
        def backward(ctx, d_dx):  # pylint: disable=arguments-differ
            d_dx = d_dx.contiguous(memory_format=ctx.memory_format)
            dy, x, b, y = ctx.saved_tensors
            d_dy = None
            d_x = None
            d_b = None
            d_y = None

            if ctx.needs_input_grad[0]:
                d_dy = BiasActCudaGrad.apply(d_dx, x, b, y)

            if spec.has_2nd_grad and (ctx.needs_input_grad[1]
                                      or ctx.needs_input_grad[2]):
                d_x = ext_module.bias_act(d_dx, b, x, y, dy, 2, dim,
                                          spec.cuda_idx, alpha, gain, clamp)

            if spec.has_2nd_grad and ctx.needs_input_grad[2]:
                d_b = d_x.sum([i for i in range(d_x.ndim) if i != dim])

            return d_dy, d_x, d_b, d_y

    # Add to cache.
    _bias_act_cuda_cache[key] = BiasActCuda
    return BiasActCuda


# Memoises the Function classes built by `_bias_act_musa`, keyed by
# `(dim, act, alpha, gain, clamp)` after defaults have been resolved.
_bias_act_musa_cache: Dict = dict()


def _bias_act_musa(dim: int = 1,
                   act: str = 'linear',
                   alpha: Optional[Union[float, int]] = None,
                   gain: Optional[float] = None,
                   clamp: Optional[float] = None):
    """Fast MUSA implementation of `bias_act()` using custom ops.

    Builds a ``torch.autograd.Function`` subclass specialised for the given
    arguments. Classes are cached per argument combination, so repeated calls
    with the same arguments return the same class.

    Args:
        dim (int): The dimension in `x` corresponding to the elements of `b`.
            The value of `dim` is ignored if `b` is not specified.
            Defaults to 1.
        act (str): Name of the activation function to evaluate, or `"linear"`
            to disable. Can be e.g. "relu", "lrelu", "tanh", "sigmoid",
            "swish", etc. See `activation_funcs_musa` for a full list. `None`
            is not allowed. Defaults to `linear`.
        alpha (float | int): Shape parameter for the activation
            function, or `None` to use the default. Defaults to None.
        gain (float): Scaling factor for the output tensor, or `None`
            to use default. See `activation_funcs_musa` for the default scaling
            of each activation function. If unsure, consider specifying 1.
            Defaults to None.
        clamp (float): Clamp the output values to `[-clamp, +clamp]`,
            or `None` to disable the clamping (default). Defaults to None.

    Returns:
        type: A ``torch.autograd.Function`` subclass. Calling its
        ``apply(x, b)`` (``b`` may be ``None``) returns a tensor of the same
        shape and datatype as `x`.
    """
    # Parse arguments.
    assert clamp is None or clamp >= 0
    spec = activation_funcs_musa[act]
    alpha = float(alpha if alpha is not None else spec.def_alpha)
    gain = float(gain if gain is not None else spec.def_gain)
    clamp = float(clamp if clamp is not None else -1)

    # Lookup from cache.
    key = (dim, act, alpha, gain, clamp)
    if key in _bias_act_musa_cache:
        return _bias_act_musa_cache[key]

    # Forward op.
    class BiasActMusa(torch.autograd.Function):

        @staticmethod
        def forward(ctx, x, b):  # pylint: disable=arguments-differ
            ctx.memory_format = torch.channels_last if x.ndim > 2 and x.stride(
                1) == 1 else torch.contiguous_format
            x = x.contiguous(memory_format=ctx.memory_format)
            # One placeholder on the input's device, reused for every
            # absent argument below.
            null = _null_tensor.to(x.device)
            # Remember whether a bias was supplied *before* substituting the
            # placeholder. An identity comparison against
            # `_null_tensor.to(x.device)` cannot be used for this, because
            # `.to()` returns a new tensor object whenever the device
            # differs, which would make the comparison always true.
            has_bias = b is not None
            b = b.contiguous() if has_bias else null
            y = x
            # Skip the kernel entirely when the op reduces to the identity.
            if act != 'linear' or gain != 1 or clamp >= 0 or has_bias:
                y = ext_module.bias_act(x, b, null, null, null, 0, dim,
                                        spec.musa_idx, alpha, gain, clamp)
            # Keep only the tensors the backward pass will actually read.
            keep_input = 'x' in spec.ref or spec.has_2nd_grad
            ctx.save_for_backward(x if keep_input else null,
                                  b if keep_input else null,
                                  y if 'y' in spec.ref else null)
            return y

        @staticmethod
        def backward(ctx, dy):  # pylint: disable=arguments-differ
            dy = dy.contiguous(memory_format=ctx.memory_format)
            x, b, y = ctx.saved_tensors
            dx = None
            db = None

            if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
                dx = dy
                if act != 'linear' or gain != 1 or clamp >= 0:
                    dx = BiasActMusaGrad.apply(dy, x, b, y)

            if ctx.needs_input_grad[1]:
                # The bias gradient is dx reduced over all axes except `dim`.
                db = dx.sum([i for i in range(dx.ndim) if i != dim])

            return dx, db

    # Backward op.
    class BiasActMusaGrad(torch.autograd.Function):

        @staticmethod
        def forward(ctx, dy, x, b, y):  # pylint: disable=arguments-differ
            ctx.memory_format = torch.channels_last if dy.ndim > 2 and (
                dy.stride(1) == 1) else torch.contiguous_format
            null = _null_tensor.to(x.device)
            dx = ext_module.bias_act(dy, b, x, y, null, 1, dim, spec.musa_idx,
                                     alpha, gain, clamp)
            ctx.save_for_backward(dy if spec.has_2nd_grad else null, x, b, y)
            return dx

        @staticmethod
        def backward(ctx, d_dx):  # pylint: disable=arguments-differ
            d_dx = d_dx.contiguous(memory_format=ctx.memory_format)
            dy, x, b, y = ctx.saved_tensors
            d_dy = None
            d_x = None
            d_b = None
            d_y = None

            if ctx.needs_input_grad[0]:
                d_dy = BiasActMusaGrad.apply(d_dx, x, b, y)

            if spec.has_2nd_grad and (ctx.needs_input_grad[1]
                                      or ctx.needs_input_grad[2]):
                d_x = ext_module.bias_act(d_dx, b, x, y, dy, 2, dim,
                                          spec.musa_idx, alpha, gain, clamp)

            if spec.has_2nd_grad and ctx.needs_input_grad[2]:
                d_b = d_x.sum([i for i in range(d_x.ndim) if i != dim])

            return d_dy, d_x, d_b, d_y

    # Add to cache.
    _bias_act_musa_cache[key] = BiasActMusa
    return BiasActMusa


================================================
FILE: mmcv/ops/border_align.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
# modified from
# https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py

from typing import Tuple

import torch
import torch.nn as nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable

from ..utils import ext_loader

# Handle to the compiled `_ext` extension providing the forward and backward
# border-align kernels.
# NOTE(review): presumably `load_ext` also checks that the listed ops are
# present in the extension -- confirm against `ext_loader`.
ext_module = ext_loader.load_ext(
    '_ext', ['border_align_forward', 'border_align_backward'])


class BorderAlignFunction(Function):
    """Autograd function that dispatches border-align pooling to the
    ``border_align_forward`` / ``border_align_backward`` extension ops."""

    @staticmethod
    def symbolic(g, input, boxes, pool_size):
        """Emit the ``mmcv::MMCVBorderAlign`` graph node for this op."""
        return g.op(
            'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size)

    @staticmethod
    def forward(ctx, input: torch.Tensor, boxes: torch.Tensor,
                pool_size: int) -> torch.Tensor:
        """Pool border features of ``input`` for every box.

        Args:
            input (torch.Tensor): Feature map whose channel count (dim 1)
                must be divisible by 4.
            boxes (torch.Tensor): Boxes with shape [B, H*W, 4] in
                (x1, y1, x2, y2) format.
            pool_size (int): Forwarded unchanged to the extension op.

        Returns:
            torch.Tensor: Pooled features with shape
            [input.size(0), input.size(1) // 4, boxes.size(1), 4].

        Raises:
            AssertionError: If ``boxes`` is not 3-dimensional with a last
                dimension of 4, or the channel count is not divisible by 4.
        """
        ctx.pool_size = pool_size
        ctx.input_shape = input.size()

        assert boxes.ndim == 3, 'boxes must be with shape [B, H*W, 4]'
        assert boxes.size(2) == 4, \
            'the last dimension of boxes must be (x1, y1, x2, y2)'
        assert input.size(1) % 4 == 0, \
            'the channel for input feature must be divisible by factor 4'

        # [B, C//4, H*W, 4]
        output_shape = (input.size(0), input.size(1) // 4, boxes.size(1), 4)
        output = input.new_zeros(output_shape)
        # `argmax_idx` is only used for backward. Allocate it directly as an
        # int tensor instead of creating a float buffer of the same size and
        # casting it afterwards.
        argmax_idx = input.new_zeros(output_shape, dtype=torch.int)

        ext_module.border_align_forward(
            input, boxes, output, argmax_idx, pool_size=ctx.pool_size)

        ctx.save_for_backward(boxes, argmax_idx)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx,
                 grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]:
        """Scatter ``grad_output`` back to the input feature map.

        Only ``input`` receives a gradient; ``boxes`` and ``pool_size`` get
        ``None``.
        """
        boxes, argmax_idx = ctx.saved_tensors
        grad_input = grad_output.new_zeros(ctx.input_shape)
        # complex head architecture may cause grad_output uncontiguous
        grad_output = grad_output.contiguous()
        ext_module.border_align_backward(
            grad_output,
            boxes,
            argmax_idx,
            grad_input,
            pool_size=ctx.pool_size)
        return grad_input, None, None


# Functional entry point: border_align(input, boxes, pool_size).
border_align = BorderAlignFunction.apply


class BorderAlign(nn.Module):
    r"""Module wrapper around the ``border_align`` pooling op.

    Pools features of the input along the borders of predicted bboxes, as
    described in `BorderDet: Border Feature for Dense Object Detection
    <https://arxiv.org/abs/2007.11056>`_.

    For every border line of every box (top, left, bottom and right),
    border_align proceeds as follows:

    1. ``pool_size`` +1 positions are sampled uniformly on the line, with the
       start and end points included.
    2. the feature at each sampled position is obtained by bilinear
       interpolation.
    3. the pooled feature is the maximum over all ``pool_size`` +1
       positions.

    Args:
        pool_size (int): number of positions sampled over the boxes' borders
            (e.g. top, bottom, left, right).
    """

    def __init__(self, pool_size: int):
        super().__init__()
        self.pool_size = pool_size

    def forward(self, input: torch.Tensor,
                boxes: torch.Tensor) -> torch.Tensor:
        """Pool border features for every box.

        Args:
            input: Features with shape [N,4C,H,W]. Channels ranged in [0,C),
                [C,2C), [2C,3C), [3C,4C) represent the top, left, bottom,
                right features respectively.
            boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2).

        Returns:
            torch.Tensor: Pooled features with shape [N,C,H*W,4]. The order is
            (top,left,bottom,right) for the last dimension.
        """
        pooled = border_align(input, boxes, self.pool_size)
        return pooled

    def __repr__(self):
        return f'{self.__class__.__name__}(pool_size={self.pool_size})'


================================================
FILE: mmcv/ops/box_iou_quadri.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from ..utils import ext_loader

# Handle to the `_ext` extension module; `box_iou_quadri` is the op this file
# calls on it.
ext_module = ext_loader.load_ext('_ext', ['box_iou_quadri'])


def box_iou_quadri(bboxes1: torch.Tensor,
                   bboxes2: torch.Tensor,
                   mode: str = 'iou',
                   aligned: bool = False) -> torch.Tensor:
    """Compute intersection-over-union (Jaccard index) of quadrilateral boxes.

    Both sets of boxes are expected to be in
    (x1, y1, ..., x4, y4) format.

    With ``aligned=False`` the ious between every bbox of bboxes1 and every
    bbox of bboxes2 are calculated; with ``aligned=True`` only the ious
    between each aligned pair of bboxes1 and bboxes2 are calculated.

    Args:
        bboxes1 (torch.Tensor): quadrilateral bboxes 1. It has shape (N, 8),
            indicating (x1, y1, ..., x4, y4) for each row.
        bboxes2 (torch.Tensor): quadrilateral bboxes 2. It has shape (M, 8),
            indicating (x1, y1, ..., x4, y4) for each row.
        mode (str): "iou" (intersection over union) or iof (intersection over
            foreground).
        aligned (bool): whether to compute only the ious of aligned pairs.
            Defaults to False.

    Returns:
        torch.Tensor: Return the ious betweens boxes. If ``aligned`` is
        ``False``, the shape of ious is (N, M) else (N,).
    """
    assert mode in ['iou', 'iof']
    mode_flag = {'iou': 0, 'iof': 1}[mode]
    num_boxes1 = bboxes1.size(0)
    num_boxes2 = bboxes2.size(0)
    # The extension op fills a flat buffer: one slot per aligned pair, or one
    # slot per (bboxes1, bboxes2) combination.
    num_ious = num_boxes1 if aligned else num_boxes1 * num_boxes2
    ious = bboxes1.new_zeros(num_ious)
    ext_module.box_iou_quadri(
        bboxes1.contiguous(),
        bboxes2.contiguous(),
        ious,
        mode_flag=mode_flag,
        aligned=aligned)
    if aligned:
        return ious
    return ious.view(num_boxes1, num_boxes2)


================================================
FILE: mmcv/ops/box_iou_rotated.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from ..utils import ext_loader

# Handle to the `_ext` extension module; `box_iou_rotated` is the op this file
# calls on it.
ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated'])


def box_iou_rotated(bboxes1: torch.Tensor,
                    bboxes2: torch.Tensor,
                    mode: str = 'iou',
                    aligned: bool = False,
                    clockwise: bool = True) -> torch.Tensor:
    """Compute intersection-over-union (Jaccard index) of rotated boxes.

    Both sets of boxes are expected to be in
    (x_center, y_center, width, height, angle) format.

    With ``aligned=False`` the ious between every bbox of bboxes1 and every
    bbox of bboxes2 are calculated; with ``aligned=True`` only the ious
    between each aligned pair of bboxes1 and bboxes2 are calculated.

    .. note::
        The operator assumes:

        1) The positive direction along x axis is left -> right.

        2) The positive direction along y axis is top -> down.

        3) The w border is in parallel with x axis when angle = 0.

        However, there are 2 opposite definitions of the positive angular
        direction, clockwise (CW) and counter-clockwise (CCW). MMCV supports
        both definitions and uses CW by default.

        Please set ``clockwise=False`` if you are using the CCW definition.

        The coordinate system when ``clockwise`` is ``True`` (default)

            .. code-block:: none

                0-------------------> x (0 rad)
                |  A-------------B
                |  |             |
                |  |     box     h
                |  |   angle=0   |
                |  D------w------C
                v
                y (pi/2 rad)

            In such coordination system the rotation matrix is

            .. math::
                \\begin{pmatrix}
                \\cos\\alpha & -\\sin\\alpha \\\\
                \\sin\\alpha & \\cos\\alpha
                \\end{pmatrix}

            The coordinates of the corner point A can be calculated as:

            .. math::
                P_A=
                \\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix}
                =
                \\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} +
                \\begin{pmatrix}\\cos\\alpha & -\\sin\\alpha \\\\
                \\sin\\alpha & \\cos\\alpha\\end{pmatrix}
                \\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\
                =
                \\begin{pmatrix} x_{center}-0.5w\\cos\\alpha+0.5h\\sin\\alpha
                \\\\
                y_{center}-0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix}


        The coordinate system when ``clockwise`` is ``False``

            .. code-block:: none

                0-------------------> x (0 rad)
                |  A-------------B
                |  |             |
                |  |     box     h
                |  |   angle=0   |
                |  D------w------C
                v
                y (-pi/2 rad)

            In such coordination system the rotation matrix is

            .. math::
                \\begin{pmatrix}
                \\cos\\alpha & \\sin\\alpha \\\\
                -\\sin\\alpha & \\cos\\alpha
                \\end{pmatrix}

            The coordinates of the corner point A can be calculated as:

            .. math::
                P_A=
                \\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix}
                =
                \\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} +
                \\begin{pmatrix}\\cos\\alpha & \\sin\\alpha \\\\
                -\\sin\\alpha & \\cos\\alpha\\end{pmatrix}
                \\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\
                =
                \\begin{pmatrix} x_{center}-0.5w\\cos\\alpha-0.5h\\sin\\alpha
                \\\\
                y_{center}+0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix}

    Args:
        bboxes1 (torch.Tensor): rotated bboxes 1. It has shape (N, 5),
            indicating (x, y, w, h, theta) for each row. Note that theta is in
            radian.
        bboxes2 (torch.Tensor): rotated bboxes 2. It has shape (M, 5),
            indicating (x, y, w, h, theta) for each row. Note that theta is in
            radian.
        mode (str): "iou" (intersection over union) or iof (intersection over
            foreground).
        aligned (bool): whether to compute only the ious of aligned pairs.
            Defaults to False.
        clockwise (bool): flag indicating whether the positive angular
            orientation is clockwise. default True.
            `New in version 1.4.3.`

    Returns:
        torch.Tensor: Return the ious betweens boxes. If ``aligned`` is
        ``False``, the shape of ious is (N, M) else (N,).
    """
    assert mode in ['iou', 'iof']
    mode_flag = {'iou': 0, 'iof': 1}[mode]
    num_boxes1 = bboxes1.size(0)
    num_boxes2 = bboxes2.size(0)

    # Shape of the buffer handed to the extension op: one slot per aligned
    # pair, a 2-D buffer on 'mlu' devices, and a flat buffer otherwise.
    if aligned:
        out_shape = [num_boxes1]
    elif bboxes1.device.type == 'mlu':
        out_shape = [num_boxes1, num_boxes2]
    else:
        out_shape = [num_boxes1 * num_boxes2]
    ious = bboxes1.new_zeros(out_shape)

    if not clockwise:
        # Negate the last column (the angle) of both box sets; every other
        # column is multiplied by one and so left untouched.
        angle_sign = bboxes1.new_ones(bboxes1.shape[-1])
        angle_sign[-1] = -1
        bboxes1 = bboxes1 * angle_sign
        bboxes2 = bboxes2 * angle_sign

    ext_module.box_iou_rotated(
        bboxes1.contiguous(),
        bboxes2.contiguous(),
        ious,
        mode_flag=mode_flag,
        aligned=aligned)
    if aligned:
        return ious
    return ious.view(num_boxes1, num_boxes2)


================================================
FILE: mmcv/ops/carafe.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmengine.model import normal_init, xavier_init
from mmengine.registry import MODELS
from torch import Tensor
from torch.autograd import Function
from torch.nn.modules.module import Module

from ..utils import ext_loader

# Handle to the `_ext` extension module; the list names the forward/backward
# ops of both CARAFE variants that this file calls on it.
ext_module = ext_loader.load_ext('_ext', [
    'carafe_naive_forward', 'carafe_naive_backward', 'carafe_forward',
    'carafe_backward'
])


class CARAFENaiveFunction(Function):
    """Autograd function that dispatches to the ``carafe_naive_forward`` /
    ``carafe_naive_backward`` extension ops."""

    @staticmethod
    def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int,
                 group_size: int, scale_factor: int) -> Tensor:
        """Emit the ``mmcv::MMCVCARAFENaive`` graph node for this op."""
        return g.op(
            'mmcv::MMCVCARAFENaive',
            features,
            masks,
            kernel_size_i=kernel_size,
            group_size_i=group_size,
            scale_factor_f=scale_factor)

    @staticmethod
    def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int,
                group_size: int, scale_factor: int) -> Tensor:
        """Upsample ``features`` by ``scale_factor`` using ``masks``.

        Args:
            features (Tensor): 4-D input; its channel count must be divisible
                by ``group_size``.
            masks (Tensor): Must have ``kernel_size**2 * group_size``
                channels and the last two dims of ``features`` multiplied by
                ``scale_factor``.
            kernel_size (int): Odd, positive kernel size.
            group_size (int): Group size.
            scale_factor (int): Upsample ratio, at least 1.

        Returns:
            Tensor: Output whose last two dims are those of ``features``
            multiplied by ``scale_factor``.

        Raises:
            AssertionError: If any of the constraints above is violated.
        """
        assert scale_factor >= 1
        assert masks.size(1) == kernel_size * kernel_size * group_size
        assert masks.size(-1) == features.size(-1) * scale_factor
        assert masks.size(-2) == features.size(-2) * scale_factor
        assert features.size(1) % group_size == 0
        assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1
        ctx.kernel_size = kernel_size
        ctx.group_size = group_size
        ctx.scale_factor = scale_factor
        ctx.feature_size = features.size()
        ctx.mask_size = masks.size()

        n, c, h, w = features.size()
        output = features.new_zeros((n, c, h * scale_factor, w * scale_factor))
        ext_module.carafe_naive_forward(
            features,
            masks,
            output,
            kernel_size=kernel_size,
            group_size=group_size,
            scale_factor=scale_factor)

        # Inputs are only stashed when a backward pass can actually happen
        # (or when running under parrots).
        if features.requires_grad or masks.requires_grad or \
                torch.__version__ == 'parrots':
            ctx.save_for_backward(features, masks)
        return output

    @staticmethod
    def backward(
            ctx,
            grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]:
        """Compute gradients w.r.t. ``features`` and ``masks``.

        Raises:
            AssertionError: If ``grad_output`` is neither a CUDA nor a MUSA
                tensor.
        """
        # `is_musa` is not an attribute of stock PyTorch tensors, so a plain
        # attribute access would raise AttributeError for any non-CUDA tensor
        # before the assertion could report the real problem.
        assert grad_output.is_cuda or getattr(grad_output, 'is_musa', False)

        features, masks = ctx.saved_tensors
        kernel_size = ctx.kernel_size
        group_size = ctx.group_size
        scale_factor = ctx.scale_factor

        grad_input = torch.zeros_like(features)
        grad_masks = torch.zeros_like(masks)
        ext_module.carafe_naive_backward(
            grad_output.contiguous(),
            features,
            masks,
            grad_input,
            grad_masks,
            kernel_size=kernel_size,
            group_size=group_size,
            scale_factor=scale_factor)

        return grad_input, grad_masks, None, None, None


# Functional entry point:
# carafe_naive(features, masks, kernel_size, group_size, scale_factor).
carafe_naive = CARAFENaiveFunction.apply


class CARAFENaive(Module):
    """Module wrapper that stores the hyper-parameters of ``carafe_naive``.

    Args:
        kernel_size (int): kernel size passed to ``carafe_naive``.
        group_size (int): group size passed to ``carafe_naive``.
        scale_factor (int): scale factor passed to ``carafe_naive``.
    """

    def __init__(self, kernel_size: int, group_size: int, scale_factor: int):
        super().__init__()

        # All three hyper-parameters have to be plain ints.
        assert all(
            isinstance(value, int)
            for value in (kernel_size, group_size, scale_factor))
        self.kernel_size = kernel_size
        self.group_size = group_size
        self.scale_factor = scale_factor

    def forward(self, features: Tensor, masks: Tensor) -> Tensor:
        """Apply ``carafe_naive`` with the stored hyper-parameters."""
        upsampled = carafe_naive(features, masks, self.kernel_size,
                                 self.group_size, self.scale_factor)
        return upsampled


class CARAFEFunction(Function):
    """Autograd function that dispatches to the ``carafe_forward`` /
    ``carafe_backward`` extension ops."""

    @staticmethod
    def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int,
                 group_size: int, scale_factor: int) -> Tensor:
        """Emit the ``mmcv::MMCVCARAFE`` graph node for this op."""
        attrs = dict(
            kernel_size_i=kernel_size,
            group_size_i=group_size,
            scale_factor_f=scale_factor)
        return g.op('mmcv::MMCVCARAFE', features, masks, **attrs)

    @staticmethod
    def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int,
                group_size: int, scale_factor: int) -> Tensor:
        """Validate the arguments, run ``carafe_forward`` and return the
        upsampled output."""
        assert scale_factor >= 1
        assert masks.size(1) == kernel_size * kernel_size * group_size
        assert masks.size(-1) == features.size(-1) * scale_factor
        assert masks.size(-2) == features.size(-2) * scale_factor
        assert features.size(1) % group_size == 0
        assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1
        ctx.kernel_size = kernel_size
        ctx.group_size = group_size
        ctx.scale_factor = scale_factor
        ctx.feature_size = features.size()
        ctx.mask_size = masks.size()

        batch, channels, height, width = features.size()
        upsampled_size = (batch, channels, height * scale_factor,
                          width * scale_factor)
        output = features.new_zeros(upsampled_size)
        # Extra zero buffers that the extension op receives next to the real
        # inputs and output.
        routput = features.new_zeros(output.size(), requires_grad=False)
        rfeatures = features.new_zeros(features.size(), requires_grad=False)
        rmasks = masks.new_zeros(masks.size(), requires_grad=False)
        ext_module.carafe_forward(
            features,
            masks,
            rfeatures,
            routput,
            rmasks,
            output,
            kernel_size=kernel_size,
            group_size=group_size,
            scale_factor=scale_factor)

        # Only stash tensors when a backward pass can happen (or when running
        # under parrots).
        needs_backward = (
            features.requires_grad or masks.requires_grad
            or torch.__version__ == 'parrots')
        if needs_backward:
            ctx.save_for_backward(features, masks, rfeatures)
        return output

    @staticmethod
    def backward(
            ctx,
            grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]:
        """Run ``carafe_backward`` and return the gradients w.r.t.
        ``features`` and ``masks``; the int arguments get ``None``."""
        features, masks, rfeatures = ctx.saved_tensors

        # Zero buffers shaped like the output gradient ...
        rgrad_output = torch.zeros_like(grad_output, requires_grad=False)
        rgrad_input_hs = torch.zeros_like(grad_output, requires_grad=False)
        # ... like the features ...
        rgrad_input = torch.zeros_like(features, requires_grad=False)
        grad_input = torch.zeros_like(features, requires_grad=False)
        # ... and like the masks.
        rgrad_masks = torch.zeros_like(masks, requires_grad=False)
        grad_masks = torch.zeros_like(masks, requires_grad=False)

        ext_module.carafe_backward(
            grad_output.contiguous(),
            rfeatures,
            masks,
            rgrad_output,
            rgrad_input_hs,
            rgrad_input,
            rgrad_masks,
            grad_input,
            grad_masks,
            kernel_size=ctx.kernel_size,
            group_size=ctx.group_size,
            scale_factor=ctx.scale_factor)
        return grad_input, grad_masks, None, None, None


# Functional entry point:
# carafe(features, masks, kernel_size, group_size, scale_factor).
carafe = CARAFEFunction.apply


class CARAFE(Module):
    """ CARAFE: Content-Aware ReAssembly of FEatures

    See `CARAFE: Content-Aware ReAssembly of FEatures
    <https://arxiv.org/abs/1905.02188>`_ for the details of the operator.

    Args:
        kernel_size (int): reassemble kernel size
        group_size (int): reassemble group size
        scale_factor (int): upsample ratio

    Returns:
        upsampled feature map
    """

    def __init__(self, kernel_size: int, group_size: int, scale_factor: int):
        super().__init__()

        # All three hyper-parameters have to be plain ints.
        assert all(
            isinstance(value, int)
            for value in (kernel_size, group_size, scale_factor))
        self.kernel_size = kernel_size
        self.group_size = group_size
        self.scale_factor = scale_factor

    def forward(self, features: Tensor, masks: Tensor) -> Tensor:
        """Apply ``carafe`` with the stored hyper-parameters."""
        upsampled = carafe(features, masks, self.kernel_size, self.group_size,
                           self.scale_factor)
        return upsampled


@MODELS.register_module(name='carafe')
class CARAFEPack(nn.Module):
    """CARAFE upsampler bundled into one module: 1) channel compressor
    2) content encoder 3) CARAFE op.

    Official implementation of ICCV 2019 paper
    `CARAFE: Content-Aware ReAssembly of FEatures
    <https://arxiv.org/abs/1905.02188>`_.

    Args:
        channels (int): input feature channels
        scale_factor (int): upsample ratio
        up_kernel (int): kernel size of CARAFE op
        up_group (int): group size of CARAFE op
        encoder_kernel (int): kernel size of content encoder
        encoder_dilation (int): dilation of content encoder
        compressed_channels (int): output channels of channels compressor

    Returns:
        upsampled feature map
    """

    def __init__(self,
                 channels: int,
                 scale_factor: int,
                 up_kernel: int = 5,
                 up_group: int = 1,
                 encoder_kernel: int = 3,
                 encoder_dilation: int = 1,
                 compressed_channels: int = 64):
        super().__init__()
        self.channels = channels
        self.scale_factor = scale_factor
        self.up_kernel = up_kernel
        self.up_group = up_group
        self.encoder_kernel = encoder_kernel
        self.encoder_dilation = encoder_dilation
        self.compressed_channels = compressed_channels

        # 1x1 conv that reduces the channel count before mask prediction.
        self.channel_compressor = nn.Conv2d(channels, self.compressed_channels,
                                            1)

        # The encoder predicts up_kernel^2 * up_group mask values for each of
        # the scale_factor^2 sub-positions of every input pixel.
        mask_channels = (
            self.up_kernel * self.up_kernel * self.up_group *
            self.scale_factor * self.scale_factor)
        encoder_padding = int(
            (self.encoder_kernel - 1) * self.encoder_dilation / 2)
        self.content_encoder = nn.Conv2d(
            self.compressed_channels,
            mask_channels,
            self.encoder_kernel,
            padding=encoder_padding,
            dilation=self.encoder_dilation,
            groups=1)
        self.init_weights()

    def init_weights(self):
        """Xavier-initialize every conv, then re-initialize the content
        encoder from a normal distribution with ``std=0.001``."""
        conv_layers = (m for m in self.modules() if isinstance(m, nn.Conv2d))
        for conv in conv_layers:
            xavier_init(conv, distribution='uniform')
        normal_init(self.content_encoder, std=0.001)

    def kernel_normalizer(self, mask: Tensor) -> Tensor:
        """Pixel-shuffle the raw mask and softmax-normalize every kernel."""
        mask = F.pixel_shuffle(mask, self.scale_factor)
        batch, mask_channels, height, width = mask.size()
        # use float division explicitly,
        # to void inconsistency while exporting to onnx
        num_groups = int(mask_channels / float(self.up_kernel**2))
        grouped = mask.view(batch, num_groups, -1, height, width)

        # Softmax over the kernel positions of each group.
        grouped = F.softmax(grouped, dim=2, dtype=grouped.dtype)
        return grouped.view(batch, mask_channels, height, width).contiguous()

    def feature_reassemble(self, x: Tensor, mask: Tensor) -> Tensor:
        """Run the CARAFE op on ``x`` with the normalized ``mask``."""
        return carafe(x, mask, self.up_kernel, self.up_group,
                      self.scale_factor)

    def forward(self, x: Tensor) -> Tensor:
        """Compress, predict and normalize the mask, then reassemble ``x``."""
        raw_mask = self.content_encoder(self.channel_compressor(x))
        normalized_mask = self.kernel_normalizer(raw_mask)
        return self.feature_reassemble(x, normalized_mask)


================================================
FILE: mmcv/ops/cc_attention.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmengine.registry import MODELS

from mmcv.cnn import Scale


def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor:
    """Returns a diagonal matrix of size [n, n].

    The diagonal are all "-inf". This is for avoiding calculating the
    overlapped element in the Criss-Cross twice.

    Args:
        n (int): Number of rows and columns of the matrix.
        device (torch.device): Device on which the matrix is created.

    Returns:
        torch.Tensor: Matrix of shape [n, n] in the default floating dtype
        with "-inf" on the diagonal and zeros elsewhere.
    """
    # Create the diagonal directly on the target device rather than building
    # a host scalar and copying it over on every call.
    diagonal = torch.full((n, ), float('-inf'), device=device)
    return torch.diag(diagonal, 0)


@MODELS.register_module()
class CrissCrossAttention(nn.Module):
    """Criss-Cross Attention Module.

    .. note::
        Before v1.3.13, we use a CUDA op. Since v1.3.13, we switch
        to a pure PyTorch and equivalent implementation. For more
        details, please refer to https://github.com/open-mmlab/mmcv/pull/1201.

        Speed comparison for one forward pass

        - Input size: [2,512,97,97]
        - Device: 1 NVIDIA GeForce RTX 2080 Ti

        +-----------------------+---------------+------------+---------------+
        |                       |PyTorch version|CUDA version|Relative speed |
        +=======================+===============+============+===============+
        |with torch.no_grad()   |0.00554402 s   |0.0299619 s |5.4x           |
        +-----------------------+---------------+------------+---------------+
        |no with torch.no_grad()|0.00562803 s   |0.0301349 s |5.4x           |
        +-----------------------+---------------+------------+---------------+

    Args:
        in_channels (int): Channels of the input feature map.
    """

    def __init__(self, in_channels: int) -> None:
        super().__init__()
        # Query and key use an 8x reduced channel count; value keeps all
        # channels.
        self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
        self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
        self.value_conv = nn.Conv2d(in_channels, in_channels, 1)
        self.gamma = Scale(0.)
        self.in_channels = in_channels

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward function of Criss-Cross Attention.

        Args:
            x (torch.Tensor): Input feature with the shape of
                (batch_size, in_channels, height, width).

        Returns:
            torch.Tensor: Output of the layer, with the shape of
            (batch_size, in_channels, height, width)
        """
        _, _, height, _ = x.size()
        query = self.query_conv(x)
        key = self.key_conv(x)
        value = self.value_conv(x)

        # Energies between positions sharing a column (same w). The -inf
        # diagonal masks the position itself, which the row energies below
        # already cover.
        energy_col = torch.einsum('bchw,bciw->bwhi', query, key)
        energy_col = energy_col + NEG_INF_DIAG(height, query.device)
        energy_col = energy_col.transpose(1, 2)
        # Energies between positions sharing a row (same h).
        energy_row = torch.einsum('bchw,bchj->bhwj', query, key)

        # Joint softmax over both directions.
        energy = torch.cat([energy_col, energy_row], dim=-1)
        attn = F.softmax(energy, dim=-1)  # [B,H,W,(H+W)]
        attn_col = attn[..., :height]
        attn_row = attn[..., height:]

        out = torch.einsum('bciw,bhwi->bchw', value, attn_col)
        out += torch.einsum('bchj,bhwj->bchw', value, attn_row)

        # Scaled residual connection.
        return (self.gamma(out) + x).contiguous()

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}(in_channels={self.in_channels})'


================================================
FILE: mmcv/ops/chamfer_distance.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Sequence, Tuple

import torch
from torch import Tensor
from torch.autograd import Function
from torch.autograd.function import once_differentiable

from ..utils import ext_loader

# Handle to the `_ext` extension module; the list names the chamfer-distance
# ops that this file calls on it.
ext_module = ext_loader.load_ext(
    '_ext', ['chamfer_distance_forward', 'chamfer_distance_backward'])


class ChamferDistanceFunction(Function):
    """This is an implementation of the 2D Chamfer Distance.

    It has been used in the paper `Oriented RepPoints for Aerial Object
    Detection (CVPR 2022) <https://arxiv.org/abs/2105.11111>`_.
    """

    @staticmethod
    def forward(ctx, xyz1: Tensor, xyz2: Tensor) -> Sequence[Tensor]:
        """
        Args:
            xyz1 (Tensor): Point set with shape (B, N, 2).
            xyz2 (Tensor): Point set with shape (B, M, 2).

        Returns:
            Sequence[Tensor]:

                - dist1 (Tensor): Chamfer distance (xyz1 to xyz2) with
                    shape (B, N).
                - dist2 (Tensor): Chamfer distance (xyz2 to xyz1) with
                    shape (B, M).
                - idx1 (Tensor): Index of chamfer distance (xyz1 to xyz2)
                    with shape (B, N), which be used in compute gradient.
                - idx2 (Tensor): Index of chamfer distance (xyz2 to xyz1)
                    with shape (B, M), which be used in compute gradient.
        """
        batch_size, n, _ = xyz1.size()
        _, m, _ = xyz2.size()
        device = xyz1.device
        xyz1 = xyz1.contiguous()
        xyz2 = xyz2.contiguous()

        # Allocate the output buffers directly with their final dtype on the
        # target device instead of creating them on the CPU and then casting
        # and moving them.
        dist1 = torch.zeros(batch_size, n, dtype=xyz1.dtype, device=device)
        dist2 = torch.zeros(batch_size, m, dtype=xyz2.dtype, device=device)
        idx1 = torch.zeros(batch_size, n, dtype=torch.int32, device=device)
        idx2 = torch.zeros(batch_size, m, dtype=torch.int32, device=device)

        ext_module.chamfer_distance_forward(xyz1, xyz2, dist1, dist2, idx1,
                                            idx2)
        ctx.save_for_backward(xyz1, xyz2, idx1, idx2)
        return dist1, dist2, idx1, idx2

    @staticmethod
    @once_differentiable
    def backward(ctx,
                 grad_dist1: Tensor,
                 grad_dist2: Tensor,
                 grad_idx1=None,
                 grad_idx2=None) -> Tuple[Tensor, Tensor]:
        """

        Args:
            grad_dist1 (Tensor): Gradient of chamfer distance
                (xyz1 to xyz2) with shape (B, N).
            grad_dist2 (Tensor): Gradient of chamfer distance
                (xyz2 to xyz1) with shape (B, M).
            grad_idx1: Unused; accepted because ``forward`` also returns
                ``idx1``.
            grad_idx2: Unused; accepted because ``forward`` also returns
                ``idx2``.

        Returns:
            Tuple[Tensor, Tensor]:

            - grad_xyz1 (Tensor): Gradient of the point set with shape \
                (B, N, 2).
            - grad_xyz2 (Tensor): Gradient of the point set with shape \
                (B, M, 2).
        """
        xyz1, xyz2, idx1, idx2 = ctx.saved_tensors
        device = grad_dist1.device
        grad_dist1 = grad_dist1.contiguous()
        grad_dist2 = grad_dist2.contiguous()
        # Same as in forward: create the gradient buffers in place on the
        # device, without a CPU round trip.
        grad_xyz1 = torch.zeros(xyz1.size(), dtype=xyz1.dtype, device=device)
        grad_xyz2 = torch.zeros(xyz2.size(), dtype=xyz2.dtype, device=device)

        ext_module.chamfer_distance_backward(xyz1, xyz2, idx1, idx2,
                                             grad_dist1, grad_dist2, grad_xyz1,
                                             grad_xyz2)
        return grad_xyz1, grad_xyz2


# Functional entry point: chamfer_distance(xyz1, xyz2).
chamfer_distance = ChamferDistanceFunction.apply


================================================
FILE: mmcv/ops/contour_expand.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Union

import numpy as np
import torch

from ..utils import ext_loader

# Handle to the `_ext` extension module; `contour_expand` is the op this file
# calls on it.
ext_module = ext_loader.load_ext('_ext', ['contour_expand'])


def contour_expand(kernel_mask: Union[np.ndarray, torch.Tensor],
                   internal_kernel_label: Union[np.ndarray, torch.Tensor],
                   min_kernel_area: int, kernel_num: int) -> list:
    """Expand kernel contours so that foreground pixels are assigned into
    instances.

    Args:
        kernel_mask (np.ndarray or torch.Tensor): The instance kernel mask
            with size hxw.
        internal_kernel_label (np.ndarray or torch.Tensor): The instance
            internal kernel label with size hxw.
        min_kernel_area (int): The minimum kernel area.
        kernel_num (int): The instance kernel number.

    Returns:
        list: The instance index map with size hxw.

    Raises:
        AssertionError: If an argument does not have one of the types listed
            above.
    """
    assert isinstance(kernel_mask, (torch.Tensor, np.ndarray))
    assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray))
    assert isinstance(min_kernel_area, int)
    assert isinstance(kernel_num, int)

    # The extension op is called with tensors, so numpy inputs are wrapped
    # first.
    if isinstance(kernel_mask, np.ndarray):
        kernel_mask = torch.from_numpy(kernel_mask)
    if isinstance(internal_kernel_label, np.ndarray):
        internal_kernel_label = torch.from_numpy(internal_kernel_label)

    if torch.__version__ == 'parrots':
        # Under parrots, empty inputs short-circuit to an empty list, the op
        # takes keyword arguments, and its result is converted with
        # `tolist()`.
        if kernel_mask.shape[0] == 0 or internal_kernel_label.shape[0] == 0:
            label = []
        else:
            label = ext_module.contour_expand(
                kernel_mask,
                internal_kernel_label,
                min_kernel_area=min_kernel_area,
                kernel_num=kernel_num)
            label = label.tolist()  # type: ignore
    else:
        label = ext_module.contour_expand(kernel_mask, internal_kernel_label,
                                          min_kernel_area, kernel_num)
    return label


================================================
FILE: mmcv/ops/conv2d_gradfix.py
================================================
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

# source: https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/conv2d_gradfix.py # noqa
"""Custom replacement for `torch.nn.functional.conv2d` that supports
arbitrarily high order gradients with zero performance penalty."""

import contextlib
import warnings
from typing import Dict, Optional, Tuple, Union

import torch
from mmengine.device import is_musa_available
from mmengine.utils import digit_version
from mmengine.utils.dl_utils.parrots_wrapper import is_rocm_pytorch

# Module-level switches. Neither is read inside this block other than by
# `no_weight_gradients`; presumably `enabled` is consulted by the dispatch
# helpers further down the file -- confirm there.
enabled = True
weight_gradients_disabled = False


@contextlib.contextmanager
def no_weight_gradients(disable=True):
    """Context manager that sets ``weight_gradients_disabled`` while active.

    Args:
        disable (bool): When true, the module-level flag
            ``weight_gradients_disabled`` is set to ``True`` for the duration
            of the ``with`` block. When false, the flag is left untouched.

    On exit the flag is restored to the value it had on entry, also when the
    body of the ``with`` block raises.
    """
    global weight_gradients_disabled
    old = weight_gradients_disabled
    if disable:
        weight_gradients_disabled = True
    try:
        yield
    finally:
        # Without the `finally`, an exception inside the `with` body would
        # leave the global flag stuck at True for the rest of the process.
        weight_gradients_disabled = old


def conv2d(input: torch.Tensor,
           weight: torch.Tensor,
           bias: Optional[torch.Tensor] = None,
           stride: Union[int, Tuple[int, ...]] = 1,
           padding: Union[int, Tuple[int, ...]] = 0,
           dilation: Union[int, Tuple[int, ...]] = 1,
           groups: int = 1):
    """Drop-in replacement for ``torch.nn.functional.conv2d``.

    Uses the custom autograd op built by ``_conv2d_gradfix`` when
    ``_should_use_custom_op`` allows it and torch is older than 1.10.0.
    Otherwise the call is forwarded to ``torch.nn.functional.conv2d``.

    Args:
        input (torch.Tensor): Input tensor.
        weight (torch.Tensor): Convolution filters.
        bias (torch.Tensor, optional): Optional bias. Defaults to None.
        stride (int | tuple[int]): Convolution stride. Defaults to 1.
        padding (int | tuple[int]): Implicit padding. Defaults to 0.
        dilation (int | tuple[int]): Kernel dilation. Defaults to 1.
        groups (int): Number of groups. Defaults to 1.

    Returns:
        torch.Tensor: The convolution result.
    """
    if _should_use_custom_op(input):
        if digit_version(torch.__version__) < digit_version('1.10.0'):
            return _conv2d_gradfix(
                transpose=False,
                weight_shape=weight.shape,
                stride=stride,
                padding=padding,
                output_padding=0,
                dilation=dilation,
                groups=groups).apply(input, weight, bias)
        # Warn only when the custom op would otherwise have been used, so
        # that e.g. CPU inputs do not produce an unrelated cuDNN warning.
        warnings.warn('Since '
                      'aten:cudnn_convolution_backward_weight is '
                      f'not supported in torch=={torch.__version__},'
                      ' rolling back to `torch.nn.functional.conv2d`')
    return torch.nn.functional.conv2d(
        input=input,
        weight=weight,
        bias=bias,
        stride=stride,
        padding=padding,
        dilation=dilation,
        groups=groups)


def conv_transpose2d(input: torch.Tensor,
                     weight: torch.Tensor,
                     bias: Optional[torch.Tensor] = None,
                     stride: Union[int, Tuple[int, ...]] = 1,
                     padding: Union[int, Tuple[int, ...]] = 0,
                     output_padding: Union[int, Tuple[int, ...]] = 0,
                     groups: int = 1,
                     dilation: Union[int, Tuple[int, ...]] = 1):
    """Drop-in replacement for ``torch.nn.functional.conv_transpose2d``.

    Dispatches to the custom autograd op from ``_conv2d_gradfix`` whenever
    ``_should_use_custom_op`` approves the input, and to the stock PyTorch
    implementation otherwise. All arguments carry the same meaning as in
    ``torch.nn.functional.conv_transpose2d``.
    """
    if not _should_use_custom_op(input):
        # Stock implementation.
        return torch.nn.functional.conv_transpose2d(
            input=input,
            weight=weight,
            bias=bias,
            stride=stride,
            padding=padding,
            output_padding=output_padding,
            groups=groups,
            dilation=dilation)

    gradfix_op = _conv2d_gradfix(
        transpose=True,
        weight_shape=weight.shape,
        stride=stride,
        padding=padding,
        output_padding=output_padding,
        groups=groups,
        dilation=dilation)
    return gradfix_op.apply(input, weight, bias)


def _should_use_custom_op(input):
    """Tell whether the custom gradfix op should handle ``input``.

    The op is used when the module-level ``enabled`` flag is set and either
    MUSA is available, or cuDNN is enabled and ``input`` lives on a CUDA
    device.
    """
    assert isinstance(input, torch.Tensor)
    if not enabled:
        return False
    if is_musa_available():
        return True
    if not torch.backends.cudnn.enabled:
        return False
    return input.device.type == 'cuda'


def _to_tuple(x, ndim):
    """Normalize ``x`` to a tuple of ``ndim`` ints.

    A tuple or list is converted as-is; any other value is repeated ``ndim``
    times. The length and the element types are then checked with asserts.
    """
    if isinstance(x, (tuple, list)):
        values = tuple(x)
    else:
        values = (x, ) * ndim
    assert len(values) == ndim
    assert all(isinstance(item, int) for item in values)
    return values


# Cache of the autograd classes generated by `_conv2d_gradfix`, keyed by the
# normalized argument tuple built there.
_conv2d_gradfix_cache: Dict = dict()
# Empty placeholder passed to `ctx.save_for_backward` in place of a tensor
# that the backward pass will not need (presumably so the real tensor is not
# retained).
_null_tensor = torch.empty([0])


def _conv2d_gradfix(
    transpose: bool,
    weight_shape: Tuple[int, ...],
    stride: Union[int, Tuple[int, ...]],
    padding: Union[int, Tuple[int, ...]],
    output_padding: Union[int, Tuple[int, ...]],
    dilation: Union[int, Tuple[int, ...]],
    groups: int,
):
    """Return the autograd function class for one convolution configuration.

    The returned ``Conv2d`` class (a ``torch.autograd.Function``) is
    specialized through closure on the given arguments and is cached in
    ``_conv2d_gradfix_cache``, so repeated calls with the same configuration
    return the same class. Its backward pass is itself expressed with
    autograd functions (``Conv2d`` of the opposite direction and
    ``Conv2dGradWeight``), which is what allows differentiating it again.

    Args:
        transpose (bool): True for a transposed convolution, False for a
            regular one.
        weight_shape (tuple[int]): Shape of the weight tensor; must have
            4 entries.
        stride (int | tuple[int]): Stride; an int is repeated for both dims.
        padding (int | tuple[int]): Padding; an int is repeated for both
            dims.
        output_padding (int | tuple[int]): Output padding; must be zero for
            a non-transposed convolution.
        dilation (int | tuple[int]): Dilation; an int is repeated for both
            dims.
        groups (int): Number of groups, at least 1.

    Returns:
        type: The ``Conv2d`` autograd function class; call ``.apply(input,
        weight, bias)`` on it.

    Raises:
        AssertionError: If an argument fails the validation below (only
            checked the first time a configuration is seen).
    """
    # Parse arguments.
    ndim = 2
    weight_shape = tuple(weight_shape)
    stride = _to_tuple(stride, ndim)
    padding = _to_tuple(padding, ndim)
    output_padding = _to_tuple(output_padding, ndim)
    dilation = _to_tuple(dilation, ndim)

    # Lookup from cache.
    key = (transpose, weight_shape, stride, padding, output_padding, dilation,
           groups)
    if key in _conv2d_gradfix_cache:
        return _conv2d_gradfix_cache[key]

    # Validate arguments.

    assert groups >= 1
    assert len(weight_shape) == ndim + 2
    assert all(stride[i] >= 1 for i in range(ndim))  # type: ignore
    assert all(padding[i] >= 0 for i in range(ndim))  # type: ignore
    assert all(dilation[i] >= 0 for i in range(ndim))  # type: ignore
    if not transpose:
        assert all(output_padding[i] == 0 for i in range(ndim))  # type: ignore
    else:  # transpose
        for i in range(ndim):
            assert 0 <= output_padding[i] < max(  # type: ignore
                stride[i],  # type: ignore
                dilation[i])  # type: ignore

    # Helpers.
    # Keyword arguments shared by every convolution call issued below.
    common_kwargs = dict(
        stride=stride, padding=padding, dilation=dilation, groups=groups)

    def calc_output_padding(input_shape, output_shape):
        """Output padding for the opposite-direction convolution.

        Returns ``[0, 0]`` when this op is transposed (its opposite is a
        regular convolution). Otherwise returns, per spatial dim, the
        padding the transposed convolution needs so that mapping
        ``output_shape`` back yields exactly ``input_shape``.
        """
        if transpose:
            return [0, 0]
        return [
            input_shape[i + 2] - (output_shape[i + 2] - 1) * stride[i] -
            (1 - 2 * padding[i]) - dilation[i] * (weight_shape[i + 2] - 1)
            for i in range(ndim)
        ]

    # Forward & backward.
    class Conv2d(torch.autograd.Function):
        """(Transposed) convolution for the enclosing configuration."""

        @staticmethod
        def forward(ctx, input, weight, bias):
            assert weight.shape == weight_shape
            # Only keep what backward will need: `input` is used for the
            # weight gradient, `weight` for the input gradient.
            ctx.save_for_backward(
                input if weight.requires_grad else _null_tensor,
                weight if input.requires_grad else _null_tensor,
            )
            ctx.input_shape = input.shape

            # Simple 1x1 convolution => cuBLAS (only on Volta, not on Ampere).
            if (not is_musa_available()
                ) and weight_shape[2:] == stride == dilation == (
                    1, 1) and padding == (
                        0, 0) and torch.cuda.get_device_capability(
                            input.device) < (8, 0):
                # Express the 1x1 convolution as a per-group batched matmul.
                a = weight.reshape(groups, weight_shape[0] // groups,
                                   weight_shape[1])
                b = input.reshape(input.shape[0], groups,
                                  input.shape[1] // groups, -1)
                c = (a.transpose(1, 2) if transpose else a) @ b.permute(
                    1, 2, 0, 3).flatten(2)
                c = c.reshape(-1, input.shape[0],
                              *input.shape[2:]).transpose(0, 1)
                c = c if bias is None else c + bias.unsqueeze(0).unsqueeze(
                    2).unsqueeze(3)
                # Return the result in the same memory format as the input.
                return c.contiguous(
                    memory_format=(torch.channels_last if input.stride(1) ==
                                   1 else torch.contiguous_format))

            # General case => cuDNN.
            if transpose:
                return torch.nn.functional.conv_transpose2d(
                    input=input,
                    weight=weight,
                    bias=bias,
                    output_padding=output_padding,
                    **common_kwargs)
            return torch.nn.functional.conv2d(
                input=input, weight=weight, bias=bias, **common_kwargs)

        @staticmethod
        def backward(ctx, grad_output):
            input, weight = ctx.saved_tensors
            input_shape = ctx.input_shape
            grad_input = None
            grad_weight = None
            grad_bias = None

            if ctx.needs_input_grad[0]:
                # The input gradient is the opposite-direction convolution of
                # grad_output with the same weight.
                p = calc_output_padding(
                    input_shape=input_shape, output_shape=grad_output.shape)
                op = _conv2d_gradfix(
                    transpose=(not transpose),
                    weight_shape=weight_shape,
                    output_padding=p,
                    **common_kwargs)
                grad_input = op.apply(grad_output, weight, None)
                assert grad_input.shape == input_shape

            # Skipped (grad_weight stays None) inside `no_weight_gradients`.
            if ctx.needs_input_grad[1] and not weight_gradients_disabled:
                grad_weight = Conv2dGradWeight.apply(grad_output, input)
                assert grad_weight.shape == weight_shape

            if ctx.needs_input_grad[2]:
                # Sum over every dim except the channel dim.
                grad_bias = grad_output.sum([0, 2, 3])

            return grad_input, grad_weight, grad_bias

    # Gradient with respect to the weights.
    class Conv2dGradWeight(torch.autograd.Function):
        """Weight gradient of ``Conv2d``, itself differentiable."""

        @staticmethod
        def forward(ctx, grad_output, input):
            ctx.save_for_backward(
                grad_output if input.requires_grad else _null_tensor,
                input if grad_output.requires_grad else _null_tensor,
            )
            ctx.grad_output_shape = grad_output.shape
            ctx.input_shape = input.shape

            # Simple 1x1 convolution => cuBLAS (on both Volta and Ampere).
            if weight_shape[2:] == stride == dilation == (
                    1, 1) and padding == (0, 0):
                a = grad_output.reshape(grad_output.shape[0], groups,
                                        grad_output.shape[1] // groups,
                                        -1).permute(1, 2, 0, 3).flatten(2)
                b = input.reshape(input.shape[0], groups,
                                  input.shape[1] // groups,
                                  -1).permute(1, 2, 0, 3).flatten(2)
                c = (b @ a.transpose(1, 2) if transpose else
                     a @ b.transpose(1, 2)).reshape(weight_shape)
                return c.contiguous(
                    memory_format=(torch.channels_last if input.stride(1) ==
                                   1 else torch.contiguous_format))

            # PyTorch consolidated convolution backward API in PR:
            # https://github.com/pytorch/pytorch/commit/3dc3651e0ee3623f669c3a2c096408dbc476d122  # noqa: E501
            # Enhance the code referring to the discussion:
            # https://github.com/pytorch/pytorch/issues/74437
            if digit_version(torch.__version__) >= digit_version('1.11.0'):
                # Only the shape/dtype/device of the weight argument matter
                # here, so a zero scalar expanded to `weight_shape` is used.
                empty_weight = torch.tensor(
                    0.0, dtype=input.dtype,
                    device=input.device).expand(weight_shape)
                output_padding = calc_output_padding(input.shape,
                                                     grad_output.shape)
                # `output_mask=[0, 1, 0]` requests only the weight gradient,
                # which is element 1 of the returned tuple.
                return torch.ops.aten.convolution_backward(
                    grad_output,
                    input,
                    empty_weight,
                    None,
                    stride=stride,
                    dilation=dilation,
                    transposed=transpose,
                    padding=padding,
                    groups=groups,
                    output_padding=output_padding,
                    output_mask=[0, 1, 0])[1]
            else:
                # Older torch: look up the backend-specific weight-gradient
                # operator by name.
                if is_rocm_pytorch():
                    name = 'aten::miopen_convolution_transpose_backward_weight'
                    if not transpose:
                        name = 'aten::miopen_convolution_backward_weight'
                    flags = [
                        torch.backends.cudnn.benchmark,
                        torch.backends.cudnn.deterministic
                    ]
                else:
                    # General case => cuDNN.
                    name = ('aten::cudnn_convolution_transpose_backward_weight'
                            if transpose else
                            'aten::cudnn_convolution_backward_weight')
                    flags = [
                        torch.backends.cudnn.benchmark,
                        torch.backends.cudnn.deterministic,
                        torch.backends.cudnn.allow_tf32
                    ]
                return torch._C._jit_get_operation(name)(weight_shape,
                                                         grad_output, input,
                                                         padding, stride,
                                                         dilation, groups,
                                                         *flags)

        @staticmethod
        def backward(ctx, grad2_grad_weight):
            grad_output, input = ctx.saved_tensors
            grad_output_shape = ctx.grad_output_shape
            input_shape = ctx.input_shape
            grad2_grad_output = None
            grad2_input = None

            if ctx.needs_input_grad[0]:
                # Gradient w.r.t. grad_output: convolve the input with the
                # incoming gradient in the role of the weight.
                grad2_grad_output = Conv2d.apply(input, grad2_grad_weight,
                                                 None)
                assert grad2_grad_output.shape == grad_output_shape

            if ctx.needs_input_grad[1]:
                # Gradient w.r.t. input: opposite-direction convolution of
                # grad_output with the incoming gradient as the weight.
                p = calc_output_padding(
                    input_shape=input_shape, output_shape=grad_output_shape)
                op = _conv2d_gradfix(
                    transpose=(not transpose),
                    weight_shape=weight_shape,
                    output_padding=p,
                    **common_kwargs)
                grad2_input = op.apply(grad_output, grad2_grad_weight, None)
                assert grad2_input.shape == input_shape

            return grad2_grad_output, grad2_input

    _conv2d_gradfix_cache[key] = Conv2d
    return Conv2d


================================================
FILE: mmcv/ops/convex_iou.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple

import torch

from ..utils import ext_loader

ext_module = ext_loader.load_ext('_ext', ['convex_iou', 'convex_giou'])


def convex_giou(pointsets: torch.Tensor,
                polygons: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """Compute the generalized IoU between point sets and polygons.

    Args:
        pointsets (torch.Tensor): Point sets of shape (N, 18), each row
            being (x1, y1, x2, y2, ..., x9, y9).
        polygons (torch.Tensor): Polygons of shape (N, 8), each row being
            (x1, y1, x2, y2, x3, y3, x4, y4).

    Returns:
        tuple[torch.Tensor, torch.Tensor]: The GIoU of every pair, with
        shape (N,), followed by the gradient with respect to the point
        sets, with shape (N, 18).
    """
    num_sets = pointsets.size(0)
    # The kernel fills one row per point set; the last column is split off
    # below as the GIoU value and the preceding 18 as the gradient.
    packed = pointsets.new_zeros((num_sets, 19))
    ext_module.convex_giou(pointsets, polygons, packed)
    gious = packed[:, -1]
    grads = packed[:, :-1]
    return gious, grads


def convex_iou(pointsets: torch.Tensor,
               polygons: torch.Tensor) -> torch.Tensor:
    """Compute the pairwise IoU between point sets and polygons.

    Args:
        pointsets (torch.Tensor): Point sets of shape (N, 18), each row
            being (x1, y1, x2, y2, ..., x9, y9).
        polygons (torch.Tensor): Polygons of shape (K, 8), each row being
            (x1, y1, x2, y2, x3, y3, x4, y4).

    Returns:
        torch.Tensor: IoU matrix of shape (N, K).
    """
    out_shape = (pointsets.size(0), polygons.size(0))
    # Result buffer that the extension kernel writes into.
    iou_matrix = pointsets.new_zeros(out_shape)
    ext_module.convex_iou(pointsets, polygons, iou_matrix)
    return iou_matrix


================================================
FILE: mmcv/ops/corner_pool.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmengine.utils import digit_version
from torch import Tensor, nn

# Maps each pooling mode name to an integer code. Not referenced by the rest
# of this module. NOTE(review): presumably kept for backward compatibility;
# confirm before removing.
_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3}


def _corner_pool(x: Tensor, dim: int, flip: bool) -> Tensor:
    """Running maximum of ``x`` along ``dim`` computed by doubling steps.

    Args:
        x (Tensor): Input tensor; it is not modified.
        dim (int): Dimension along which the maximum is accumulated.
        flip (bool): If False the maximum runs from index 0 upwards; if
            True it runs from the last index downwards.

    Returns:
        Tensor: A tensor of the same shape as ``x``.
    """
    length = x.size(dim)
    pooled = x.clone()

    step = 1
    while step < length:
        span = length - step
        # Destination window and the window it is compared against, shifted
        # by `step` in the direction selected by `flip`.
        dst_start, src_start = (0, step) if flip else (step, 0)

        # A detached copy of the destination is needed for the comparison so
        # that the in-place write below stays valid for backward.
        snapshot = pooled.narrow(dim, dst_start, span).clone()
        dst = pooled.narrow(dim, dst_start, span)
        src = pooled.narrow(dim, src_start, span)

        dst[...] = torch.where(snapshot > src, snapshot, src)

        step <<= 1

    return pooled


class CornerPool(nn.Module):
    """Corner Pooling layer.

    Corner pooling helps a convolutional network localize the corners of
    bounding boxes. See `CornerNet: Detecting Objects as Paired Keypoints
    <https://arxiv.org/abs/1808.01244>`_ for details.

    Code is modified from https://github.com/princeton-vl/CornerNet-Lite.

    Args:
        mode (str): Pooling orientation, one of:

            - 'bottom': Bottom Pooling
            - 'left': Left Pooling
            - 'right': Right Pooling
            - 'top': Top Pooling

    Returns:
        Feature map after pooling.
    """

    # mode -> (dimension to accumulate over, whether to accumulate from the
    # end of that dimension).
    cummax_dim_flip = {
        'bottom': (2, False),
        'left': (3, True),
        'right': (3, False),
        'top': (2, True),
    }

    def __init__(self, mode: str):
        super().__init__()
        assert mode in self.cummax_dim_flip
        self.mode = mode

    def forward(self, x: Tensor) -> Tensor:
        # `torch.cummax` is used on PyTorch >= 1.5.0; parrots and older
        # PyTorch go through the pure-Python fallback.
        use_cummax = (
            torch.__version__ != 'parrots'
            and digit_version(torch.__version__) >= digit_version('1.5.0'))
        dim, flip = self.cummax_dim_flip[self.mode]
        if not use_cummax:
            return _corner_pool(x, dim, flip)

        # cummax only runs forwards, so reverse before and after if needed.
        if flip:
            x = x.flip(dim)
        pool_tensor, _ = torch.cummax(x, dim=dim)
        return pool_tensor.flip(dim) if flip else pool_tensor


================================================
FILE: mmcv/ops/correlation.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple

import torch
from torch import Tensor, nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair

from ..utils import ext_loader

ext_module = ext_loader.load_ext(
    '_ext', ['correlation_forward', 'correlation_backward'])


class CorrelationFunction(Function):
    """Autograd function wrapping the compiled correlation kernels."""

    @staticmethod
    def forward(ctx,
                input1: Tensor,
                input2: Tensor,
                kernel_size: int = 1,
                max_displacement: int = 1,
                stride: int = 1,
                padding: int = 1,
                dilation: int = 1,
                dilation_patch: int = 1) -> Tensor:
        """Run ``correlation_forward`` and return the correlation volume.

        The geometry arguments are expanded to pairs and stored on ``ctx``
        for the backward pass.
        """
        ctx.save_for_backward(input1, input2)

        ctx.kernel_size = _pair(kernel_size)
        ctx.patch_size = max_displacement * 2 + 1
        ctx.stride = _pair(stride)
        ctx.padding = _pair(padding)
        ctx.dilation = _pair(dilation)
        ctx.dilation_patch = _pair(dilation_patch)

        out_shape = CorrelationFunction._output_size(ctx, input1)
        output = input1.new_zeros(out_shape)

        ext_module.correlation_forward(
            input1, input2, output,
            **CorrelationFunction._kernel_kwargs(ctx))

        return output

    @staticmethod
    @once_differentiable
    def backward(
        ctx, grad_output: Tensor
    ) -> Tuple[Tensor, Tensor, None, None, None, None, None, None]:
        """Run ``correlation_backward`` and return both input gradients.

        The six geometry arguments of ``forward`` receive no gradient.
        """
        input1, input2 = ctx.saved_tensors

        # Buffers the kernel writes the gradients into.
        grad_input1 = torch.zeros_like(input1)
        grad_input2 = torch.zeros_like(input2)

        ext_module.correlation_backward(
            grad_output, input1, input2, grad_input1, grad_input2,
            **CorrelationFunction._kernel_kwargs(ctx))

        return (grad_input1, grad_input2) + (None, ) * 6

    @staticmethod
    def _kernel_kwargs(ctx):
        """Geometry keyword arguments shared by both extension kernels."""
        kernel_h, kernel_w = ctx.kernel_size
        side = ctx.patch_size
        pad_h, pad_w = ctx.padding
        dil_h, dil_w = ctx.dilation
        patch_dil_h, patch_dil_w = ctx.dilation_patch
        stride_h, stride_w = ctx.stride
        return dict(
            kH=kernel_h,
            kW=kernel_w,
            patchH=side,
            patchW=side,
            padH=pad_h,
            padW=pad_w,
            dilationH=dil_h,
            dilationW=dil_w,
            dilation_patchH=patch_dil_h,
            dilation_patchW=patch_dil_w,
            dH=stride_h,
            dW=stride_w)

    @staticmethod
    def _output_size(ctx, input1):
        """Shape ``(N, patch, patch, oH, oW)`` of the correlation volume."""
        kernel_h, kernel_w = ctx.kernel_size
        stride_h, stride_w = ctx.stride
        pad_h, pad_w = ctx.padding
        dil_h, dil_w = ctx.dilation
        side = ctx.patch_size

        def extent(size, pad, kernel, dil, step):
            # Footprint of the dilated kernel, then the usual output size.
            dilated = (kernel - 1) * dil + 1
            return int((size + 2 * pad - dilated) / step + 1)

        out_h = extent(input1.size(2), pad_h, kernel_h, dil_h, stride_h)
        out_w = extent(input1.size(3), pad_w, kernel_w, dil_w, stride_w)
        return (input1.size(0), side, side, out_h, out_w)


class Correlation(nn.Module):
    r"""Correlation operator.

    Computes the correlation volume used for optical flow estimation.

    Given two batched tensors of shape :math:`(N, C, H, W)`, the output has
    shape :math:`(N, max\_displacement \times 2 + 1, max\_displacement
    \times 2 + 1, H_{out}, W_{out})`

    where

    .. math::
        H_{out} = \left\lfloor\frac{H_{in}  + 2 \times padding -
            dilation \times (kernel\_size - 1) - 1}
            {stride} + 1\right\rfloor

    .. math::
        W_{out} = \left\lfloor\frac{W_{in}  + 2 \times padding - dilation
            \times (kernel\_size - 1) - 1}
            {stride} + 1\right\rfloor

    The correlation item :math:`(N_i, dy, dx)` is the sliding window
    convolution between input1 and a shifted copy of input2,

    .. math::
        Corr(N_i, dy, dx) =
        \sum_{c=0}^{C-1}
        input1(N_i, c) \star
        \mathcal{S}(input2(N_i, c), dy, dx)

    where :math:`\star` is the valid 2d sliding window convolution operator,
    :math:`\mathcal{S}` shifts the input features (filling the margin with
    zeros), and :math:`dx, dy` are the shifting distances, :math:`dx, dy \in
    [-max\_displacement \times dilation\_patch, max\_displacement \times
    dilation\_patch]`.

    Args:
        kernel_size (int): Size of the sliding window, i.e. the local
            neighborhood around each center point that takes part in the
            correlation. Defaults to 1.
        max_displacement (int): Radius of the correlation volume; the
            actual search range can be enlarged with dilation_patch.
            Defaults to 1.
        stride (int): Stride of the sliding blocks in the input spatial
            dimensions. Defaults to 1.
        padding (int): Zero padding added to all four sides of input1.
            Defaults to 0.
        dilation (int): Spacing between the elements of the local
            neighborhood. Defaults to 1.
        dilation_patch (int): Spacing between the positions for which the
            correlation is computed. Defaults to 1.
    """

    def __init__(self,
                 kernel_size: int = 1,
                 max_displacement: int = 1,
                 stride: int = 1,
                 padding: int = 0,
                 dilation: int = 1,
                 dilation_patch: int = 1) -> None:
        super().__init__()
        self.kernel_size = kernel_size
        self.max_displacement = max_displacement
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.dilation_patch = dilation_patch

    def forward(self, input1: Tensor, input2: Tensor) -> Tensor:
        settings = (self.kernel_size, self.max_displacement, self.stride,
                    self.padding, self.dilation, self.dilation_patch)
        return CorrelationFunction.apply(input1, input2, *settings)

    def __repr__(self) -> str:
        names = ('kernel_size', 'max_displacement', 'stride', 'padding',
                 'dilation', 'dilation_patch')
        body = ', '.join(f'{name}={getattr(self, name)}' for name in names)
        return f'{self.__class__.__name__}({body})'


================================================
FILE: mmcv/ops/csrc/README.md
================================================
# Code Structure of CUDA operators

This folder contains all non-python code for MMCV custom ops. Please follow the same architecture if you want to add new ops.

## Directories Tree

```folder
.
├── common
│   ├── box_iou_rotated_utils.hpp
│   ├── parrots_cpp_helper.hpp
│   ├── parrots_cuda_helper.hpp
│   ├── pytorch_cpp_helper.hpp
│   ├── pytorch_cuda_helper.hpp
│   ├── pytorch_device_registry.hpp
│   ├── cuda
│   │   ├── common_cuda_helper.hpp
│   │   ├── parrots_cudawarpfunction.cuh
│   │   ├── ...
│   │   └── ops_cuda_kernel.cuh
│   ├── mps
│   │   ├── MPSLibrary.h
│   │   ├── ...
│   │   └── MPSUtils.h
│   ├── mlu
│   │   └── ...
│   └── utils
│       └── ...
├── parrots
│   ├── ...
│   ├── ops.cpp
│   ├── ops_parrots.cpp
│   └── ops_pytorch.h
└── pytorch
    ├── info.cpp
    ├── pybind.cpp
    ├── ...
    ├── ops.cpp
    ├── cuda
    │   ├── ...
    │   └── ops_cuda.cu
    ├── cpu
    │   ├── ...
    │   └── ops.cpp
    ├── mps
    │   ├── ...
    │   └── op_mps.mm
    └── mlu
        ├── ...
        └── op_mlu.cpp
```

## Components

- `common`: This directory contains all tools and shared code.
  - `cuda`: The cuda kernels which can be shared by all backends. **HIP** kernel is also here since they have similar syntax.
  - `mps`: The tools used to support MPS ops. **NOTE** that MPS support is **experimental**.
  - `mlu`: The MLU kernels used to support [Cambricon](https://www.cambricon.com/) device.
  - `utils`: The kernels and utils of spconv.
- `parrots`: **Parrots** is a deep learning framework for model training and inference. Parrots custom ops are placed in this directory.
- `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementation and binding codes are placed in this directory.
  - `cuda`: This directory contains cuda kernel launchers, which feed memory pointers of tensor to the cuda kernel in `common/cuda`. The launchers provide c++ interface of cuda implementation of corresponding custom ops.
  - `cpu`: This directory contains the CPU implementations of the corresponding custom ops.
  - `mlu`: This directory contains the launchers of the MLU kernels.
  - `mps`: MPS ops implementation and launchers.

## How to add new PyTorch ops?

1. (Optional) Add shared kernel in `common` to support special hardware platform.

   ```c++
   // src/common/cuda/new_ops_cuda_kernel.cuh

   template <typename T>
   __global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) {
       // forward here
   }

   ```

   Add cuda kernel launcher in `pytorch/cuda`.

   ```c++
   // src/pytorch/cuda
   #include <new_ops_cuda_kernel.cuh>

   void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){
       // initialize
       at::cuda::CUDAGuard device_guard(input.device());
       cudaStream_t stream = at::cuda::getCurrentCUDAStream();
       ...
       AT_DISPATCH_FLOATING_TYPES_AND_HALF(
           input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] {
               new_ops_forward_cuda_kernel<scalar_t>
                   <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                       input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),...);
           }));
       AT_CUDA_CHECK(cudaGetLastError());
   }
   ```

2. Register implementation for different devices.

   ```c++
   // src/pytorch/cuda/cudabind.cpp
   ...

   Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){
       // implement cuda forward here
       // use `NewOpsForwardCUDAKernelLauncher` here
   }
   // declare interface here.
   Tensor new_ops_forward_impl(Tensor input, Tensor output, ...);
   // register the implementation for given device (CUDA here).
   REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda);
   ```

3. Add ops implementation in `pytorch` directory. Select different implementations according to device type.

   ```c++
   // src/pytorch/new_ops.cpp
   Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){
       // dispatch the implementation according to the device type of input.
       DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...);
   }
   ...

   Tensor new_ops_forward(Tensor input, Tensor output, ...){
       return new_ops_forward_impl(input, output, ...);
   }
   ```

4. Bind the implementation in `pytorch/pybind.cpp`.

   ```c++
   // src/pytorch/pybind.cpp

   ...

   Tensor new_ops_forward(Tensor input, Tensor output, ...);

   ...

   // bind with pybind11
   m.def("new_ops_forward", &new_ops_forward, "new_ops_forward",
           py::arg("input"), py::arg("output"), ...);

   ...

   ```

5. Build MMCV again and enjoy the new ops in Python.

   ```python
   from ..utils import ext_loader
   ext_module = ext_loader.load_ext('_ext', ['new_ops_forward'])

   ...

   ext_module.new_ops_forward(input, output, ...)

   ```


================================================
FILE: mmcv/ops/csrc/common/box_iou_rotated_utils.hpp
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
#pragma once
#include <cassert>
#include <cmath>

#if defined(__CUDACC__) || defined(__MUSACC__)
// Designates functions callable from the host (CPU) and the device (GPU)
#define HOST_DEVICE __host__ __device__
#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__
#else
#include <algorithm>
#define HOST_DEVICE
#define HOST_DEVICE_INLINE HOST_DEVICE inline
#endif

namespace {

// Rotated rectangle described by its center, side lengths and angle.
template <typename T>
struct RotatedBox {
  // Center of the box.
  T x_ctr;
  T y_ctr;
  // Side lengths of the box.
  T w;
  T h;
  // Rotation angle; get_rotated_vertices passes it to cos/sin unchanged,
  // i.e. it is interpreted in radians.
  T a;
};

// Minimal 2D point / vector type providing the arithmetic used by the
// geometry helpers in this header.
template <typename T>
struct Point {
  T x, y;
  // Both coordinates default to 0, which also makes the type
  // default-constructible (needed for arrays such as `Point<T> vec1[4]`).
  HOST_DEVICE_INLINE Point(const T& px = 0, const T& py = 0) : x(px), y(py) {}
  // Component-wise sum.
  HOST_DEVICE_INLINE Point operator+(const Point& p) const {
    return Point(x + p.x, y + p.y);
  }
  // In-place component-wise sum.
  HOST_DEVICE_INLINE Point& operator+=(const Point& p) {
    x += p.x;
    y += p.y;
    return *this;
  }
  // Component-wise difference.
  HOST_DEVICE_INLINE Point operator-(const Point& p) const {
    return Point(x - p.x, y - p.y);
  }
  // Scales both components by `coeff`.
  HOST_DEVICE_INLINE Point operator*(const T coeff) const {
    return Point(x * coeff, y * coeff);
  }
};

// Dot product of the 2D vectors A and B.
template <typename T>
HOST_DEVICE_INLINE T dot_2d(const Point<T>& A, const Point<T>& B) {
  return A.x * B.x + A.y * B.y;
}

// Scalar (z-component) cross product of the 2D vectors A and B:
// A.x * B.y - B.x * A.y. It is zero when A and B are parallel.
template <typename T>
HOST_DEVICE_INLINE T cross_2d(const Point<T>& A, const Point<T>& B) {
  return A.x * B.y - B.x * A.y;
}

// Writes the four corner points of `box` into `pts`.
// The angle `box.a` is used directly as the argument of cos/sin; the
// degree-to-radian conversion of the upstream code is disabled below.
template <typename T>
HOST_DEVICE_INLINE void get_rotated_vertices(const RotatedBox<T>& box,
                                             Point<T> (&pts)[4]) {
  // M_PI / 180. == 0.01745329251
  // double theta = box.a * 0.01745329251;
  // MODIFIED
  double theta = box.a;
  // Half of cos/sin, so multiplying by w or h yields half-extent offsets.
  T cosTheta2 = (T)cos(theta) * 0.5f;
  T sinTheta2 = (T)sin(theta) * 0.5f;

  // y: top --> down; x: left --> right
  pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
  pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
  pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
  pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
  // The remaining two corners are the first two reflected through the
  // center of the box.
  pts[2].x = 2 * box.x_ctr - pts[0].x;
  pts[2].y = 2 * box.y_ctr - pts[0].y;
  pts[3].x = 2 * box.x_ctr - pts[1].x;
  pts[3].y = 2 * box.y_ctr - pts[1].y;
}

// Collects candidate vertices of the intersection of two quadrilaterals.
//   pts1, pts2:    the four corners of each shape, in order around the shape.
//   intersections: output buffer; at most 4 x 4 edge crossings plus 4 + 4
//                  contained corners (24 entries, duplicates possible).
// Returns the number of points written to `intersections`.
// The containment tests project onto the edges AB and DA only, i.e. they treat
// the containing shape as the rectangle ABCD.
template <typename T>
HOST_DEVICE_INLINE int get_intersection_points(const Point<T> (&pts1)[4],
                                               const Point<T> (&pts2)[4],
                                               Point<T> (&intersections)[24]) {
  // Edge vectors. Edge k runs from vertex k to vertex (k + 1) % 4 and is
  // parameterised as p_k + edge_k * t with t in [0, 1].
  Point<T> edges1[4], edges2[4];
  for (int k = 0; k < 4; k++) {
    const int next = (k + 1) % 4;
    edges1[k] = pts1[next] - pts1[k];
    edges2[k] = pts2[next] - pts2[k];
  }

  int count = 0;  // number of points emitted so far

  // Pass 1: crossings between every edge of shape 1 and every edge of shape 2.
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      // Determinant of the 2x2 system solved for the two edge parameters.
      const T det = cross_2d<T>(edges2[j], edges1[i]);

      // (Nearly) parallel edges have no unique crossing.
      if (fabs(det) <= 1e-14) {
        continue;
      }

      const auto start_delta = pts2[j] - pts1[i];

      const T t1 = cross_2d<T>(edges2[j], start_delta) / det;
      const T t2 = cross_2d<T>(edges1[i], start_delta) / det;

      // Keep the crossing only if it lies on both segments.
      const bool on_edge1 = (t1 >= 0.0f && t1 <= 1.0f);
      const bool on_edge2 = (t2 >= 0.0f && t2 <= 1.0f);
      if (on_edge1 && on_edge2) {
        intersections[count++] = pts1[i] + edges1[i] * t1;
      }
    }
  }

  // Pass 2: corners of shape 1 lying inside shape 2.
  // With ABCD the containing rectangle and P the tested point, P is inside
  // iff its projection on AB lies within AB and its projection on AD lies
  // within AD.
  {
    const auto& side_ab = edges2[0];
    const auto& side_da = edges2[3];
    const auto len2_ab = dot_2d<T>(side_ab, side_ab);
    const auto len2_ad = dot_2d<T>(side_da, side_da);
    for (int i = 0; i < 4; i++) {
      const auto ap = pts1[i] - pts2[0];

      const auto proj_ab = dot_2d<T>(ap, side_ab);
      const auto proj_ad = -dot_2d<T>(ap, side_da);  // DA points opposite AD

      const bool within_ab = (proj_ab >= 0) && (proj_ab <= len2_ab);
      const bool within_ad = (proj_ad >= 0) && (proj_ad <= len2_ad);
      if (within_ab && within_ad) {
        intersections[count++] = pts1[i];
      }
    }
  }

  // Pass 3: the reverse check - corners of shape 2 lying inside shape 1.
  {
    const auto& side_ab = edges1[0];
    const auto& side_da = edges1[3];
    const auto len2_ab = dot_2d<T>(side_ab, side_ab);
    const auto len2_ad = dot_2d<T>(side_da, side_da);
    for (int i = 0; i < 4; i++) {
      const auto ap = pts2[i] - pts1[0];

      const auto proj_ab = dot_2d<T>(ap, side_ab);
      const auto proj_ad = -dot_2d<T>(ap, side_da);

      const bool within_ab = (proj_ab >= 0) && (proj_ab <= len2_ab);
      const bool within_ad = (proj_ad >= 0) && (proj_ad <= len2_ad);
      if (within_ab && within_ad) {
        intersections[count++] = pts2[i];
      }
    }
  }

  return count;
}

// Graham-scan convex hull over the first `num_in` points of `p`.
//   p:             input points (only indices [0, num_in) are read).
//   num_in:        number of valid input points; asserted to be >= 2.
//   q:             output buffer; on return its first <return value> entries
//                  hold the hull vertices in scan order.
//   shift_to_zero: when true the hull is left translated so that the starting
//                  point (minimum y, then minimum x) is the origin; when false
//                  the starting point is added back to every hull vertex.
// Returns the number of hull vertices, or 1 when every input point lies within
// the 1e-8 squared-distance threshold of the starting point (in which case
// q[0] is the starting point in original coordinates).
template <typename T>
HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24],
                                          const int& num_in, Point<T> (&q)[24],
                                          bool shift_to_zero = false) {
  assert(num_in >= 2);

  // Step 1:
  // Find point with minimum y
  // if more than 1 points have the same minimum y,
  // pick the one with the minimum x.
  int t = 0;
  for (int i = 1; i < num_in; i++) {
    if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
      t = i;
    }
  }
  auto& start = p[t];  // starting point

  // Step 2:
  // Subtract starting point from every points (for sorting in the next step)
  for (int i = 0; i < num_in; i++) {
    q[i] = p[i] - start;
  }

  // Swap the starting point to position 0
  auto tmp = q[0];
  q[0] = q[t];
  q[t] = tmp;

  // Step 3:
  // Sort point 1 ~ num_in according to their relative cross-product values
  // (essentially sorting according to angles)
  // If the angles are the same, sort according to their distance to origin
  T dist[24];
  for (int i = 0; i < num_in; i++) {
    dist[i] = dot_2d<T>(q[i], q[i]);
  }

#if defined(__CUDACC__) || defined(__MUSACC__)
  // CUDA version
  // In the future, we can potentially use thrust
  // for sorting here to improve speed (though not guaranteed)
  // Hand-written in-place exchange sort; dist[] is permuted together with q[]
  // so that it stays aligned with the points.
  for (int i = 1; i < num_in - 1; i++) {
    for (int j = i + 1; j < num_in; j++) {
      T crossProduct = cross_2d<T>(q[i], q[j]);
      if ((crossProduct < -1e-6) ||
          (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
        auto q_tmp = q[i];
        q[i] = q[j];
        q[j] = q_tmp;
        auto dist_tmp = dist[i];
        dist[i] = dist[j];
        dist[j] = dist_tmp;
      }
    }
  }
#else
  // CPU version
  // Same ordering criterion as the device path: cross products within 1e-6 of
  // zero are treated as equal angles and broken by distance to the origin.
  std::sort(q + 1, q + num_in,
            [](const Point<T>& A, const Point<T>& B) -> bool {
              T temp = cross_2d<T>(A, B);
              if (fabs(temp) < 1e-6) {
                return dot_2d<T>(A, A) < dot_2d<T>(B, B);
              } else {
                return temp > 0;
              }
            });
  // compute distance to origin after sort, since the points are now different.
  for (int i = 0; i < num_in; i++) {
    dist[i] = dot_2d<T>(q[i], q[i]);
  }
#endif

  // Step 4:
  // Make sure there are at least 2 points (that don't overlap with each other)
  // in the stack
  int k;  // index of the non-overlapped second point
  for (k = 1; k < num_in; k++) {
    if (dist[k] > 1e-8) {
      break;
    }
  }
  if (k == num_in) {
    // We reach the end, which means the convex hull is just one point
    q[0] = p[t];
    return 1;
  }
  q[1] = q[k];
  int m = 2;  // 2 points in the stack
  // Step 5:
  // Finally we can start the scanning process.
  // When a non-convex relationship between the 3 points is found
  // (either concave shape or duplicated points),
  // we pop the previous point from the stack
  // until the 3-point relationship is convex again, or
  // until the stack only contains two points
  // q[0..m) doubles as the stack: entries are overwritten in place, which is
  // safe because the read index i is always >= the write index m.
  for (int i = k + 1; i < num_in; i++) {
    while (m > 1 && cross_2d<T>(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) {
      m--;
    }
    q[m++] = q[i];
  }

  // Step 6 (Optional):
  // In general sense we need the original coordinates, so we
  // need to shift the points back (reverting Step 2)
  // But if we're only interested in getting the area/perimeter of the shape
  // We can simply return.
  if (!shift_to_zero) {
    for (int i = 0; i < m; i++) {
      q[i] += start;
    }
  }

  return m;
}

// Area of the quadrilateral q[0..3], computed as the sum of the absolute areas
// of the two fan triangles (q0, q1, q2) and (q0, q2, q3).
template <typename T>
HOST_DEVICE_INLINE T quadri_box_area(const Point<T> (&q)[4]) {
  T twice_area = 0;
#pragma unroll
  for (int k = 1; k < 3; k++) {
    const Point<T> first = q[k] - q[0];
    const Point<T> second = q[k + 1] - q[0];
    // |cross| is twice the area of the triangle spanned by the two vectors.
    twice_area += fabs(cross_2d<T>(first, second));
  }

  return twice_area / 2.0;
}

// Area of the polygon formed by the first `m` points of `q`, computed as the
// sum of the absolute areas of the fan triangles (q0, q_k, q_{k+1}).
// Returns 0 when fewer than three points are given.
template <typename T>
HOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int& m) {
  // Fewer than three vertices cannot enclose any area.
  if (m <= 2) {
    return 0;
  }

  T twice_area = 0;
  for (int k = 1; k < m - 1; k++) {
    const Point<T> first = q[k] - q[0];
    const Point<T> second = q[k + 1] - q[0];
    twice_area += fabs(cross_2d<T>(first, second));
  }

  return twice_area / 2.0;
}

// Intersection area of two rotated boxes.
// Pipeline: box -> corner points -> candidate intersection vertices ->
// convex hull -> polygon area. Returns 0 when at most two candidate vertices
// are found.
template <typename T>
HOST_DEVICE_INLINE T rotated_boxes_intersection(const RotatedBox<T>& box1,
                                                const RotatedBox<T>& box2) {
  Point<T> corners1[4];
  Point<T> corners2[4];
  get_rotated_vertices<T>(box1, corners1);
  get_rotated_vertices<T>(box2, corners2);

  // get_intersection_points() emits up to 4 x 4 + 4 + 4 = 24 points
  // (duplicates included), hence the buffer size.
  Point<T> candidates[24], hull[24];
  const int num_candidates =
      get_intersection_points<T>(corners1, corners2, candidates);

  if (num_candidates <= 2) {
    return 0.0;
  }

  // The hull orders the candidate vertices; only the area is needed, so the
  // hull may stay translated to the origin (shift_to_zero = true).
  const int hull_size =
      convex_hull_graham<T>(candidates, num_candidates, hull, true);
  return polygon_area<T>(hull, hull_size);
}

// Intersection area of two quadrilaterals given by their corner points.
// Pipeline: candidate intersection vertices -> convex hull -> polygon area.
// Returns 0 when at most two candidate vertices are found.
template <typename T>
HOST_DEVICE_INLINE T quadri_boxes_intersection(const Point<T> (&pts1)[4],
                                               const Point<T> (&pts2)[4]) {
  // get_intersection_points() emits up to 4 x 4 + 4 + 4 = 24 points
  // (duplicates included), hence the buffer size.
  Point<T> candidates[24], hull[24];
  const int num_candidates = get_intersection_points<T>(pts1, pts2, candidates);

  if (num_candidates <= 2) {
    return 0.0;
  }

  // The hull orders the candidate vertices; only the area is needed, so the
  // hull may stay translated to the origin (shift_to_zero = true).
  const int hull_size =
      convex_hull_graham<T>(candidates, num_candidates, hull, true);
  return polygon_area<T>(hull, hull_size);
}

}  // namespace

// Overlap ratio of two rotated boxes.
//   box1_raw, box2_raw: five values each, read as
//                       (x_ctr, y_ctr, w, h, angle).
//   mode_flag:          0 -> intersection / union,
//                       1 -> intersection / area of box1,
//                       anything else -> the raw intersection area
//                       (denominator stays 1).
// Returns 0 when either box has an area below 1e-14.
template <typename T>
HOST_DEVICE_INLINE T single_box_iou_rotated(T const* const box1_raw,
                                            T const* const box2_raw,
                                            const int mode_flag) {
  // Translate both boxes so the midpoint of their centres becomes the origin;
  // working near zero gives higher precision in the result.
  const auto mid_x = (box1_raw[0] + box2_raw[0]) / 2.0;
  const auto mid_y = (box1_raw[1] + box2_raw[1]) / 2.0;

  RotatedBox<T> box1, box2;

  box1.x_ctr = box1_raw[0] - mid_x;
  box1.y_ctr = box1_raw[1] - mid_y;
  box1.w = box1_raw[2];
  box1.h = box1_raw[3];
  box1.a = box1_raw[4];

  box2.x_ctr = box2_raw[0] - mid_x;
  box2.y_ctr = box2_raw[1] - mid_y;
  box2.w = box2_raw[2];
  box2.h = box2_raw[3];
  box2.a = box2_raw[4];

  const T area1 = box1.w * box1.h;
  const T area2 = box2.w * box2.h;
  // Degenerate boxes cannot overlap meaningfully.
  if (area1 < 1e-14 || area2 < 1e-14) {
    return 0.f;
  }

  const T intersection = rotated_boxes_intersection<T>(box1, box2);

  T denominator;
  switch (mode_flag) {
    case 0:
      denominator = (area1 + area2 - intersection);
      break;
    case 1:
      denominator = area1;
      break;
    default:
      denominator = 1.0;
      break;
  }
  return intersection / denominator;
}

// Overlap ratio of two quadrilaterals.
//   pts1_raw, pts2_raw: eight values each, read as (x0, y0, x1, y1, x2, y2,
//                       x3, y3).
//   mode_flag:          0 -> intersection / union,
//                       1 -> intersection / area of the first shape,
//                       anything else -> the raw intersection area
//                       (denominator stays 1).
// Returns 0 when either shape has an area below 1e-14.
template <typename T>
HOST_DEVICE_INLINE T single_box_iou_quadri(T const* const pts1_raw,
                                           T const* const pts2_raw,
                                           const int mode_flag) {
  // Mean of all eight corners. The terms are accumulated in the interleaved
  // order pts1[k], pts2[k] for k = 0..3.
  auto sum_x = pts1_raw[0] + pts2_raw[0];
  auto sum_y = pts1_raw[1] + pts2_raw[1];
  for (int k = 1; k < 4; k++) {
    sum_x = sum_x + pts1_raw[2 * k] + pts2_raw[2 * k];
    sum_y = sum_y + pts1_raw[2 * k + 1] + pts2_raw[2 * k + 1];
  }
  const auto center_shift_x = sum_x / 8.0;
  const auto center_shift_y = sum_y / 8.0;

  // Translate both shapes so that mean becomes the origin; working near zero
  // gives higher precision in the result.
  Point<T> pts1[4], pts2[4];
  for (int k = 0; k < 4; k++) {
    pts1[k].x = pts1_raw[2 * k] - center_shift_x;
    pts1[k].y = pts1_raw[2 * k + 1] - center_shift_y;
  }
  for (int k = 0; k < 4; k++) {
    pts2[k].x = pts2_raw[2 * k] - center_shift_x;
    pts2[k].y = pts2_raw[2 * k + 1] - center_shift_y;
  }

  const T area1 = quadri_box_area<T>(pts1);
  const T area2 = quadri_box_area<T>(pts2);
  // Degenerate shapes cannot overlap meaningfully.
  if (area1 < 1e-14 || area2 < 1e-14) {
    return 0.f;
  }

  const T intersection = quadri_boxes_intersection<T>(pts1, pts2);

  T denominator;
  switch (mode_flag) {
    case 0:
      denominator = (area1 + area2 - intersection);
      break;
    case 1:
      denominator = area1;
      break;
    default:
      denominator = 1.0;
      break;
  }
  return intersection / denominator;
}


================================================
FILE: mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu
#ifndef ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
#define ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// Forward pass: scatters every weight element into `num_rotations` positions
// of the output, using a 1-based lookup table to pick the destination entry.
//   nthreads:     number of weight elements processed.
//   weight_data:  source values, addressed by the flat loop index.
//   indices_data: table read at [entry * num_rotations + rot]; stored values
//                 are 1-based destination entries.
//   output_data:  destination, addressed as
//                 [out_plane][rot][in_plane][destination entry].
// num_output_planes and num_orientations are not used by this kernel.
template <typename scalar_t>
__global__ void active_rotated_filter_forward_cuda_kernel(
    const int nthreads, const scalar_t* weight_data, const int* indices_data,
    const int num_input_planes, const int num_output_planes,
    const int num_orientations, const int num_rotations, const int nEntry,
    scalar_t* output_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Decompose the flat index into (out_plane, in_plane, entry).
    const int entry = index % nEntry;
    const int in_plane = (index / nEntry) % num_input_planes;
    const int out_plane = index / nEntry / num_input_planes;

    const scalar_t value = weight_data[index];
    const int* rotation_table = indices_data + entry * num_rotations;

    for (int rot = 0; rot < num_rotations; rot++) {
      // Table entries are 1-based.
      const int dst_entry = (int)(rotation_table[rot]) - 1;
      scalar_t* dst = output_data +
                      out_plane * (num_rotations * num_input_planes * nEntry) +
                      rot * (num_input_planes * nEntry) + in_plane * (nEntry) +
                      dst_entry;
      *dst = value;
    }
  }
}

// Backward pass: for every weight element, gathers and sums the gradient from
// the `num_rotations` output positions the forward pass scattered it to.
//   nthreads:        number of weight elements processed.
//   gradWeight_data: incoming gradient, addressed as
//                    [out_plane][rot][in_plane][source entry].
//   indices_data:    table read at [entry * num_rotations + rot]; stored
//                    values are 1-based source entries.
//   weight_data:     receives the summed gradient at the flat loop index.
// num_output_planes and num_orientations are not used by this kernel.
template <typename scalar_t>
__global__ void active_rotated_filter_backward_cuda_kernel(
    const int nthreads, const scalar_t* gradWeight_data,
    const int* indices_data, const int num_input_planes,
    const int num_output_planes, const int num_orientations,
    const int num_rotations, const int nEntry, scalar_t* weight_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Decompose the flat index into (out_plane, in_plane, entry).
    const int entry = index % nEntry;
    const int in_plane = (index / nEntry) % num_input_planes;
    const int out_plane = index / nEntry / num_input_planes;

    // The destination is cleared first, then overwritten with the sum below.
    weight_data[index] = 0;

    const int* rotation_table = indices_data + entry * num_rotations;
    scalar_t grad_sum = 0;
    for (int rot = 0; rot < num_rotations; rot++) {
      // Table entries are 1-based.
      const int src_entry = (int)(rotation_table[rot]) - 1;
      const scalar_t contribution =
          *(gradWeight_data +
            out_plane * (num_rotations * num_input_planes * nEntry) +
            rot * (num_input_planes * nEntry) + in_plane * (nEntry) +
            src_entry);
      grad_sum = grad_sum + contribution;
    }
    weight_data[index] = grad_sum;
  }
}
#endif  // ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH
#define ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// input: points(B,N0,M,O), centers(B,N0,M,O), scores(B,N1,K,M), knn_idx(B,N1,K)
// output: fout(B,O,N)
// algo: fout(b,i,k,j) = s(b,i,k,m)*p(b,c(i),k,m,j) =  s(b,i,k,m)*p(b,i(k),m,j)
//       i(k) = idx(b,i,k)
//      sum: fout(b,i,j) = fout(b,i,j) + s(b,i,k,m)*p(b,i,k,m,j)
//      avg: fout(b,i,j) = sum(fout(b,i,k,j)) / k
//      max: fout(b,i,j) = max(fout(b,i,k,j), sum(s(b,i,k,m)*p(b,i,k,m,j)))

// Forward pass of assign_score_withk.
// One loop element per (b, o, n, k); for each it accumulates over m
//   points[b, kn, m, o] * scores[b, n, k, m]
//     - centers[b, cn, m, o] * scores[b, n, k, m]
// where kn = knn_idx[b, n, k] and cn = knn_idx[b, n, 0] (the first neighbour
// is the centre point).
//   points, centers: indexed as (B, N0, M, O)
//   scores:          indexed as (B, N1, K, M)
//   knn_idx:         indexed as (B, N1, K)
//   output:          indexed as (B, O, N1, K); the result is ADDED to the value
//                    already stored there, so the caller must initialise it.
// `aggregate` is accepted for interface compatibility but not read here.
template <typename T>
__global__ void assign_score_withk_forward_cuda_kernel(
    const int B, const int N0, const int N1, const int M, const int K,
    const int O, const int aggregate, const T* points, const T* centers,
    const T* scores, const int64_t* knn_idx, T* output) {
  // ----- parallel loop for B, N1, K and O ---------
  CUDA_1D_KERNEL_LOOP(i, B * O * N1 * K) {
    // ------- loop for M ----------
    const int b = (int)(i / (O * N1 * K));
    const int o = (int)(i % (O * N1 * K) / (N1 * K));
    const int n = (int)(i % (N1 * K) / K);
    const int k = (int)(i % K);
    const int cn = (int)knn_idx[b * K * N1 + n * K +
                                0];  // The first neighbor is the center point
    const int kn = (int)knn_idx[b * K * N1 + n * K + k];
    if (kn >= N0 ||
        kn < 0) {  // if index overflows, it is out of the neighborhood range
      // Skip only this element. A `return` here would also abandon every
      // later element this thread is responsible for when the kernel loop
      // runs more than one iteration per thread.
      continue;
    }
    assert(b < B);
    assert(kn < N0);
    assert(cn < N0);
    assert(o < O);
    assert(n < N1);
    const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k;
    T val = output[out_idx];
    for (int m = 0; m < M; m++) {
      val += points[b * N0 * M * O + kn * M * O + m * O + o] *
                 scores[b * N1 * K * M + n * K * M + k * M + m] -
             centers[b * N0 * M * O + cn * M * O + m * O + o] *
                 scores[b * N1 * K * M + n * K * M + k * M + m];
    }
    output[out_idx] = val;
  }
}

// Backward pass of assign_score_withk with respect to points and centers.
// One loop element per (b, m, o); it walks all (n, k) pairs and atomically
// adds scores[b, n, k, m] * grad_out[b, o, n, k] to
// grad_points[b, kn, m, o] and the negated product to grad_centers[b, cn, m, o],
// where kn = knn_idx[b, n, k] and cn = knn_idx[b, n, 0].
// Pairs whose kn lies outside [0, N0) are skipped.
// `aggregate` is accepted for interface compatibility but not read here.
template <typename T>
__global__ void assign_score_withk_points_backward_cuda_kernel(
    const int B, const int N0, const int N, const int M, const int K,
    const int O, const int aggregate, const T* grad_out, const T* scores,
    const int64_t* knn_idx, T* grad_points, T* grad_centers) {
  // ----- parallel loop for B, M, O ---------
  CUDA_1D_KERNEL_LOOP(i, B * M * O) {
    const int b = (int)(i / (M * O));
    const int m = (int)(i % (M * O) / O);
    const int o = (int)(i % O);

    // ----- loop for N,K ---------
    for (int n = 0; n < N; n++) {
      for (int k = 0; k < K; k++) {
        const int kn = knn_idx[b * N * K + n * K + k];
        // An out-of-range index means "outside the neighborhood".
        if (kn < 0 || kn >= N0) {
          continue;
        }
        // The first neighbor of each query is its center point.
        const int cn = knn_idx[b * N * K + n * K + 0];

        // Shared magnitude of both updates; the center receives its negation.
        const T contribution = scores[b * N * K * M + n * K * M + k * M + m] *
                               grad_out[b * O * N * K + o * N * K + n * K + k];

        atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o,
                  contribution);
        atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o,
                  -contribution);
      }
    }
  }
}

// Backward pass of assign_score_withk with respect to scores.
// One loop element per (b, n, k, m); for each it accumulates over o
//   (points[b, kn, m, o] - centers[b, cn, m, o]) * grad_out[b, o, n, k]
// where kn = knn_idx[b, n, k] and cn = knn_idx[b, n, 0].
//   grad_scores: indexed as (B, N, K, M); the result is ADDED to the value
//                already stored there, so the caller must initialise it.
// `aggregate` is accepted for interface compatibility but not read here.
template <typename T>
__global__ void assign_score_withk_scores_backward_cuda_kernel(
    const int B, const int N0, const int N, const int M, const int K,
    const int O, const int aggregate, const T* grad_out, const T* points,
    const T* centers, const int64_t* knn_idx, T* grad_scores) {
  // ----- parallel loop for B, N, K, M ---------
  CUDA_1D_KERNEL_LOOP(i, B * N * K * M) {
    const int b = (int)(i / (N * M * K));
    const int n = (int)(i % (N * M * K) / M / K);
    const int k = (int)(i % (M * K) / M);
    const int m = (int)(i % M);
    const int cn = knn_idx[b * N * K + n * K + 0];
    const int kn = knn_idx[b * N * K + n * K + k];
    if (kn >= N0 ||
        kn < 0) {  // if index overflows, it is out of the neighborhood range
      // Skip only this element. A `return` here would also abandon every
      // later element this thread is responsible for when the kernel loop
      // runs more than one iteration per thread.
      continue;
    }

    // -------------- loop for O ------------------------
    const int out_idx = b * N * K * M + n * K * M + k * M + m;
    T val = grad_scores[out_idx];
    for (int o = 0; o < O; o++) {
      val += (points[b * N0 * M * O + kn * M * O + m * O + o] -
              centers[b * N0 * M * O + cn * M * O + m * O + o]) *
             grad_out[b * O * N * K + o * N * K + n * K + k];
    }
    grad_scores[out_idx] = val;
  }
}

#endif  // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu
#ifndef BALL_QUERY_CUDA_KERNEL_CUH
#define BALL_QUERY_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// Ball query: for every query point, collects up to `nsample` indices of
// candidate points whose squared distance d2 to the query satisfies
// d2 == 0 or min_radius^2 <= d2 < max_radius^2.
//   new_xyz: (B, M, 3) query points
//   xyz:     (B, N, 3) candidate points
//   idx:     (B, M, nsample) output indices. On the first hit every slot of
//            the row is filled with that index, so slots that receive no
//            later hit repeat the first neighbour. A row with no hit at all is
//            left untouched.
// The batch element is selected by blockIdx.y; the query point by the 1-D
// kernel loop over m.
template <typename T>
__global__ void ball_query_forward_cuda_kernel(int b, int n, int m,
                                               float min_radius,
                                               float max_radius, int nsample,
                                               const T* new_xyz, const T* xyz,
                                               int* idx) {
  const int bs_idx = blockIdx.y;
  if (bs_idx >= b) return;

  const float max_radius2 = max_radius * max_radius;
  const float min_radius2 = min_radius * min_radius;

  // Candidate points of this batch element (independent of the query point).
  const T* batch_xyz = xyz + bs_idx * n * 3;

  CUDA_1D_KERNEL_LOOP(pt_idx, m) {
    // Per-iteration pointers are derived from the untouched kernel arguments.
    // Advancing the arguments themselves in place would accumulate the
    // offsets whenever a thread executes more than one loop iteration.
    const T* query = new_xyz + bs_idx * m * 3 + pt_idx * 3;
    int* out_idx = idx + bs_idx * m * nsample + pt_idx * nsample;

    const T new_x = query[0];
    const T new_y = query[1];
    const T new_z = query[2];

    int cnt = 0;
    for (int k = 0; k < n; ++k) {
      const T x = batch_xyz[k * 3 + 0];
      const T y = batch_xyz[k * 3 + 1];
      const T z = batch_xyz[k * 3 + 2];
      const T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
                   (new_z - z) * (new_z - z);
      if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) {
        if (cnt == 0) {
          // Pre-fill the whole row with the first hit so that rows with
          // fewer than nsample neighbours still contain valid indices.
          for (int l = 0; l < nsample; ++l) {
            out_idx[l] = k;
          }
        }
        out_idx[cnt] = k;
        ++cnt;
        if (cnt >= nsample) break;
      }
    }
  }
}

#endif  // BALL_QUERY_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef BBOX_OVERLAPS_CUDA_KERNEL_CUH
#define BBOX_OVERLAPS_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// Reads the four consecutive values starting at bbox[base] into
// (x1, y1, x2, y2). Generic element-by-element version.
template <typename T>
__device__ __forceinline__ void load_bbox(const T* bbox, const int base, T& x1,
                                          T& y1, T& x2, T& y2) {
  const T* box = bbox + base;
  x1 = box[0];
  y1 = box[1];
  x2 = box[2];
  y2 = box[3];
}

// float specialisation: fetches all four coordinates with a single float4
// load instead of four scalar loads.
// NOTE(review): a float4 access presumably requires bbox + base to be
// 16-byte aligned - confirm against the callers.
template <>
__device__ __forceinline__ void load_bbox<float>(const float* bbox,
                                                 const int base, float& x1,
                                                 float& y1, float& x2,
                                                 float& y2) {
  const float4 packed = *reinterpret_cast<const float4*>(bbox + base);
  x1 = packed.x;
  y1 = packed.y;
  x2 = packed.z;
  y2 = packed.w;
}

// Overlap ratio between axis-aligned boxes stored as (x1, y1, x2, y2).
//   bbox1, bbox2: box arrays, four values per box.
//   ious:         output; num_bbox1 values when `aligned`, otherwise a
//                 row-major num_bbox1 x num_bbox2 matrix.
//   mode:         0 -> intersection / union, otherwise intersection / area of
//                 the bbox1 box.
//   aligned:      true pairs box i of bbox1 with box i of bbox2; false
//                 evaluates every (bbox1, bbox2) combination.
//   offset:       added to every width / height, and used as the lower bound
//                 of the denominator.
// Note: fmaxf / fminf are the single-precision functions, so their arguments
// are converted to float whatever T is.
template <typename T>
__global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2,
                                          T* ious, const int num_bbox1,
                                          const int num_bbox2, const int mode,
                                          const bool aligned,
                                          const int offset) {
  // Both pairing modes share the same per-pair computation; they differ only
  // in the number of outputs and in how an output index maps to a box pair.
  const int num_outputs = aligned ? num_bbox1 : num_bbox1 * num_bbox2;

  CUDA_1D_KERNEL_LOOP(index, num_outputs) {
    const int b1 = aligned ? index : index / num_bbox2;
    const int b2 = aligned ? index : index % num_bbox2;

    const int base1 = b1 << 2;  // b1 * 4
    T b1_x1, b1_y1, b1_x2, b1_y2;
    load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);
    const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);

    const int base2 = b2 << 2;  // b2 * 4
    T b2_x1, b2_y1, b2_x2, b2_y2;
    load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);
    const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);

    // Intersection rectangle, clamped to zero extent when the boxes are
    // disjoint.
    const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
    const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
    const T width = fmaxf(right - left + offset, 0.f);
    const T height = fmaxf(bottom - top + offset, 0.f);
    const T interS = width * height;

    const T baseS =
        fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset));
    ious[index] = interS / baseS;
  }
}

#if __CUDA_ARCH__ >= 530
// Half-precision box area: (x2 - x1 + offset) * (y2 - y1 + offset).
__device__ __forceinline__ __half __half_area(const __half x1, const __half y1,
                                              const __half x2, const __half y2,
                                              const __half offset) {
  return __hmul(__hadd(__hsub(x2, x1), offset),
                __hadd(__hsub(y2, y1), offset));
}

// Returns a when a >= b (as decided by __hge), otherwise b.
__device__ __forceinline__ __half __half_max(const __half a, const __half b) {
  if (__hge(a, b)) {
    return a;
  }
  return b;
}

// Returns a when a <= b (as decided by __hle), otherwise b.
__device__ __forceinline__ __half __half_min(const __half a, const __half b) {
  if (__hle(a, b)) {
    return a;
  }
  return b;
}

// fp16 won't provide much increase when aligned==true. It is useful when
// aligned==false, which would give you ~40% bonus.
// Half-precision body of the box-overlap computation, built entirely from
// __half intrinsics. Parameters have the same meaning as in
// bbox_overlaps_cuda_kernel:
//   ious holds num_bbox1 values when `aligned`, otherwise a row-major
//   num_bbox1 x num_bbox2 matrix; mode 0 divides by the union, any other mode
//   by the area of the bbox1 box; `offset` is added to every width / height
//   and bounds the denominator from below.
__device__ void bbox_overlaps_cuda_kernel_half(
    const __half* bbox1, const __half* bbox2, __half* ious, const int num_bbox1,
    const int num_bbox2, const int mode, const bool aligned, const int offset) {
  const int num_output = aligned ? num_bbox1 : num_bbox1 * num_bbox2;
  const __half h_offset = __int2half_rn(offset);
  const __half h_zero = __float2half(0.f);

  CUDA_1D_KERNEL_LOOP(index, num_output) {
    // Map the output index to the pair of boxes it compares.
    const int b1 = aligned ? index : index / num_bbox2;
    const int b2 = aligned ? index : index % num_bbox2;

    __half b1_x1, b1_y1, b1_x2, b1_y2;
    load_bbox<__half>(bbox1, b1 << 2, b1_x1, b1_y1, b1_x2, b1_y2);
    const __half b1_area = __half_area(b1_x1, b1_y1, b1_x2, b1_y2, h_offset);

    __half b2_x1, b2_y1, b2_x2, b2_y2;
    load_bbox<__half>(bbox2, b2 << 2, b2_x1, b2_y1, b2_x2, b2_y2);
    const __half b2_area = __half_area(b2_x1, b2_y1, b2_x2, b2_y2, h_offset);

    // Intersection rectangle, clamped to zero extent for disjoint boxes.
    const __half left = __half_max(b1_x1, b2_x1);
    const __half right = __half_min(b1_x2, b2_x2);
    const __half top = __half_max(b1_y1, b2_y1);
    const __half bottom = __half_min(b1_y2, b2_y2);
    const __half width =
        __half_max(__hadd(__hsub(right, left), h_offset), h_zero);
    const __half height =
        __half_max(__hadd(__hsub(bottom, top), h_offset), h_zero);
    const __half interS = __hmul(width, height);

    const __half baseS = __half_max(
        mode == 0 ? __hsub(__hadd(b1_area, b2_area), interS) : b1_area,
        h_offset);
    ious[index] = __hdiv(interS, baseS);
  }
}
#endif  // __CUDA_ARCH__ >= 530

#endif  // BBOX_OVERLAPS_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/bezier_align_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/aim-uofa/AdelaiDet/blob/master/adet/layers/csrc/BezierAlign/BezierAlign_cuda.cu
#ifndef BEZIER_ALIGN_CUDA_KERNEL_CUH
#define BEZIER_ALIGN_CUDA_KERNEL_CUH

#include <float.h>
#ifdef MMCV_WITH_TRT
#include "common_cuda_helper.hpp"
#else  // MMCV_WITH_TRT
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else  // MMCV_USE_PARROTS
#include "pytorch_cuda_helper.hpp"
#endif  // MMCV_USE_PARROTS
#endif  // MMCV_WITH_TRT

// Evaluates one coordinate of a cubic Bezier curve at parameter u:
//   (1-u)^3 * p0 + 3u(1-u)^2 * p1 + 3u^2(1-u) * p2 + u^3 * p3
// p0..p3 are the control-point values for that coordinate. The literals are
// double constants, so the expression is evaluated in double before being
// converted back to T on return.
template <typename T>
__device__ T bezier_curve(const T p0, const T p1, const T p2, const T p3,
                          const T u) {
  return ((1. - u) * (1. - u) * (1. - u) * p0 +
          3. * u * (1. - u) * (1. - u) * p1 + 3. * u * u * (1. - u) * p2 +
          u * u * u * p3);
}

// Forward pass of BezierAlign: one loop element per pooled output value
// (n, c, ph, pw).
//   nthreads:       number of output elements.
//   bottom_data:    input feature map, addressed as
//                   [(roi_batch_ind * channels + c) * height * width + ...].
//   bottom_rois:    17 values per roi: the batch index followed by 16
//                   control-point coordinates of two cubic Bezier curves.
//   top_data:       pooled output, written at the flat loop index.
//   spatial_scale:  factor applied to every control-point coordinate.
//   sampling_ratio: samples per bin along each axis when > 0; otherwise
//                   derived from the roi size.
//   aligned:        when true a 0.5 offset is subtracted from the sample
//                   centre; when false the roi size is clamped to at least 1.
// Each output is the average of bilinearly interpolated samples taken on a
// grid around a centre obtained by interpolating between the two curves.
template <typename T>
__global__ void bezier_align_forward_cuda_kernel(
    const int nthreads,
    const T *bottom_data,  // inputs
    const T *bottom_rois,  // bottom rois contains the bezier curve
    T *top_data,           // outputs
    const int pooled_height, const int pooled_width, const T spatial_scale,
    const int sampling_ratio, bool aligned, const int channels,
    const int height, const int width) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    // beziers have size Nx(1+8*2) = Nx17
    const T *offset_bottom_rois = bottom_rois + n * 17;
    int roi_batch_ind = offset_bottom_rois[0];

    // Do not use rounding; this implementation detail is critical
    T offset = aligned ? (T)0.5 : (T)0.0;

    // TODO: avoid this by using parallel annotation, for good
    // First curve: p0..p3 from roi values 1..8 in stored order.
    T p0_x = offset_bottom_rois[1] * spatial_scale;
    T p0_y = offset_bottom_rois[2] * spatial_scale;
    T p1_x = offset_bottom_rois[3] * spatial_scale;
    T p1_y = offset_bottom_rois[4] * spatial_scale;
    T p2_x = offset_bottom_rois[5] * spatial_scale;
    T p2_y = offset_bottom_rois[6] * spatial_scale;
    T p3_x = offset_bottom_rois[7] * spatial_scale;
    T p3_y = offset_bottom_rois[8] * spatial_scale;
    // Second curve: p4..p7 are read from roi values 9..16 in reverse point
    // order (p4 from 15/16 down to p7 from 9/10).
    T p4_x = offset_bottom_rois[15] * spatial_scale;
    T p4_y = offset_bottom_rois[16] * spatial_scale;
    T p5_x = offset_bottom_rois[13] * spatial_scale;
    T p5_y = offset_bottom_rois[14] * spatial_scale;
    T p6_x = offset_bottom_rois[11] * spatial_scale;
    T p6_y = offset_bottom_rois[12] * spatial_scale;
    T p7_x = offset_bottom_rois[9] * spatial_scale;
    T p7_y = offset_bottom_rois[10] * spatial_scale;

    // compute the coords
    // u runs along the curves, v blends between the first curve (v = 0) and
    // the second curve (v = 1).
    const T u = pw / static_cast<T>(pooled_width);
    const T v = ph / static_cast<T>(pooled_height);
    const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u);
    const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u);
    const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u);
    const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u);
    const T x_center = x1 * v + x0 * (1. - v) - offset;
    const T y_center = y1 * v + y0 * (1. - v) - offset;

    // Roi extent: the larger end-point span of the two curves per axis.
    T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x));
    T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y));
    if (!aligned) {  // for backward-compatibility only
      roi_width = max(roi_width, (T)1.);
      roi_height = max(roi_height, (T)1.);
    }
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    const T *offset_bottom_data =
        bottom_data + (roi_batch_ind * channels + c) * height * width;

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
                             ? sampling_ratio
                             : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // We do average (integral) pooling inside a bin
    // When the grid is empty, output zeros == 0/1, instead of NaN.
    const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4

    T output_val = 0.;
    for (int iy = 0; iy < roi_bin_grid_h; iy++)  // e.g., iy = 0, 1
    {
      // Sample positions are spread evenly over one bin centred on
      // (x_center, y_center).
      const T y = y_center - (T)0.5 * bin_size_h +
                  static_cast<T>(iy + .5f) * bin_size_h /
                      static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T x = x_center - (T)0.5 * bin_size_w +
                    static_cast<T>(ix + .5f) * bin_size_w /
                        static_cast<T>(roi_bin_grid_w);

        T val = bilinear_interpolate(offset_bottom_data, height, width, y, x,
                                     index);
        output_val += val;
      }
    }
    output_val /= count;

    top_data[index] = output_val;
  }
}

// Backward pass of BezierAlign: distributes the gradient of every pooled
// output element over the feature-map pixels that were sampled for it.
//
// Parameters
//   nthreads       loop bound of CUDA_1D_KERNEL_LOOP; each index is decoded
//                  below as one pooled element (n, c, ph, pw).
//   top_diff       gradient w.r.t. the pooled output, read at
//                  ((n * channels + c) * pooled_height + ph) * pooled_width + pw.
//   bottom_rois    one row of 17 values per RoI: the batch index followed by
//                  16 control-point coordinates.
//   bottom_diff    gradient w.r.t. the input feature map, addressed as
//                  (batch, c, y, x). Values are accumulated with atomicAdd, so
//                  the buffer is presumably zero-initialised by the caller --
//                  TODO confirm.
//   pooled_height, pooled_width   size of the pooled output grid.
//   spatial_scale  factor applied to every control-point coordinate.
//   sampling_ratio samples per bin along each axis; when <= 0 it is derived
//                  as ceil(roi_size / pooled_size).
//   aligned        when true, coordinates are shifted by -0.5; when false the
//                  RoI width/height are clamped to at least 1.
//   channels, height, width   dimensions of the input feature map.
template <typename T>
__global__ void bezier_align_backward_cuda_kernel(
    const int nthreads, const T *top_diff, const T *bottom_rois, T *bottom_diff,
    const int pooled_height, const int pooled_width, const T spatial_scale,
    const int sampling_ratio, bool aligned, const int channels,
    const int height, const int width) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    // beziers have size Nx(1+8*2) = Nx17
    const T *offset_bottom_rois = bottom_rois + n * 17;
    // The first value of the row selects the image inside the batch.
    int roi_batch_ind = offset_bottom_rois[0];

    // Do not use rounding; this implementation detail is critical
    T offset = aligned ? (T)0.5 : (T)0.0;
    // Control points p0..p3 are read in storage order (values 1..8).
    T p0_x = offset_bottom_rois[1] * spatial_scale;
    T p0_y = offset_bottom_rois[2] * spatial_scale;
    T p1_x = offset_bottom_rois[3] * spatial_scale;
    T p1_y = offset_bottom_rois[4] * spatial_scale;
    T p2_x = offset_bottom_rois[5] * spatial_scale;
    T p2_y = offset_bottom_rois[6] * spatial_scale;
    T p3_x = offset_bottom_rois[7] * spatial_scale;
    T p3_y = offset_bottom_rois[8] * spatial_scale;
    // Control points p4..p7 are read in reverse storage order (values 15/16
    // down to 9/10), so p4 pairs with p0 and p7 pairs with p3.
    T p4_x = offset_bottom_rois[15] * spatial_scale;
    T p4_y = offset_bottom_rois[16] * spatial_scale;
    T p5_x = offset_bottom_rois[13] * spatial_scale;
    T p5_y = offset_bottom_rois[14] * spatial_scale;
    T p6_x = offset_bottom_rois[11] * spatial_scale;
    T p6_y = offset_bottom_rois[12] * spatial_scale;
    T p7_x = offset_bottom_rois[9] * spatial_scale;
    T p7_y = offset_bottom_rois[10] * spatial_scale;

    // compute the coords
    // u and v are the fractional positions of this bin inside the pooled
    // grid. `bezier_curve` is defined outside this view; it is evaluated at
    // u on both curves and the two results are blended linearly with v.
    const T u = pw / static_cast<T>(pooled_width);
    const T v = ph / static_cast<T>(pooled_height);
    const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u);
    const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u);
    const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u);
    const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u);
    const T x_center = x1 * v + x0 * (1. - v) - offset;
    const T y_center = y1 * v + y0 * (1. - v) - offset;

    // RoI extent: the larger end-point distance of the two curves per axis.
    T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x));
    T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y));
    if (!aligned) {  // for backward-compatibility only
      roi_width = max(roi_width, (T)1.);
      roi_height = max(roi_height, (T)1.);
    }
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    // Start of the (roi_batch_ind, c) plane of the feature-map gradient.
    T *offset_bottom_diff =
        bottom_diff + (roi_batch_ind * channels + c) * height * width;

    // Incoming gradient of the pooled element handled by this iteration.
    int top_offset = (n * channels + c) * pooled_height * pooled_width;
    const T *offset_top_diff = top_diff + top_offset;
    const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
                             ? sampling_ratio
                             : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // We do average (integral) pooling inside a bin
    // `count` may be 0 when the grid is empty; the loops below then do not
    // execute, so it is never used as a divisor in that case.
    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4

    for (int iy = 0; iy < roi_bin_grid_h; iy++)  // e.g., iy = 0, 1
    {
      // Sample positions are the centres of the sub-cells of a bin that is
      // centred on (x_center, y_center).
      const T y = y_center - (T)0.5 * bin_size_h +
                  static_cast<T>(iy + .5f) * bin_size_h /
                      static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T x = x_center - (T)0.5 * bin_size_w +
                    static_cast<T>(ix + .5f) * bin_size_w /
                        static_cast<T>(roi_bin_grid_w);

        // Bilinear weights and the four neighbouring pixel indices of (y, x);
        // filled in by a helper defined outside this view.
        T w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;

        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
                                      x_low, x_high, y_low, y_high, index);

        // Each neighbour receives its weighted share of the averaged gradient.
        T g1 = top_diff_this_bin * w1 / count;
        T g2 = top_diff_this_bin * w2 / count;
        T g3 = top_diff_this_bin * w3 / count;
        T g4 = top_diff_this_bin * w4 / count;

        // Only scatter when all neighbour indices are non-negative; atomics
        // are used because several pooled elements can touch the same pixel.
        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          atomicAdd(offset_bottom_diff + y_low * width + x_low,
                    static_cast<T>(g1));
          atomicAdd(offset_bottom_diff + y_low * width + x_high,
                    static_cast<T>(g2));
          atomicAdd(offset_bottom_diff + y_high * width + x_low,
                    static_cast<T>(g3));
          atomicAdd(offset_bottom_diff + y_high * width + x_high,
                    static_cast<T>(g4));
        }  // if
      }  // ix
    }  // iy
  }  // CUDA_1D_KERNEL_LOOP
}  // BezierAlignBackward

#endif  // BEZIER_ALIGN_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/border_align_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/csrc/border_align/border_align_kernel.cu.
// Main differences: (1) `argmax_idx` is stored during the forward pass to
// speed up the gradient computation in the backward pass; (2) `wh` is computed
// directly from `boxes` instead of being passed to the forward/backward functions.

#ifndef BORDER_ALIGN_CUDA_KERNEL_CUH
#define BORDER_ALIGN_CUDA_KERNEL_CUH

#include <float.h>
#ifdef MMCV_WITH_TRT
#include "common_cuda_helper.hpp"
#else  // MMCV_WITH_TRT
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else  // MMCV_USE_PARROTS
#include "pytorch_cuda_helper.hpp"
#endif  // MMCV_USE_PARROTS
#endif  // MMCV_WITH_TRT

// Border of a box handled by one thread. The values coincide with the
// `extreme_idx` read from threadIdx.y in the kernels below and with the order
// of the four channel groups of the input: top, left, bottom, right.
enum BorderMode { Top = 0, Left = 1, Bottom = 2, Right = 3 };

/*** Forward ***/
// Forward pass of BorderAlign: for one (batch, channel, box) element and one
// border (chosen by threadIdx.y) the kernel samples `pool_size + 1` points
// along that border with bilinear interpolation and keeps the maximum.
//
//   input       features addressed as (N, 4 * channels, height, width); the
//               four channel groups feed the top, left, bottom and right
//               borders respectively.
//   boxes       4 values per box, read as (x1, y1, x2, y2).
//   output      maximum sampled value, written at index * 4 + border.
//   argmax_idx  step (0..pool_size) at which the maximum was found, written
//               at the same position as `output`.
template <typename T>
__global__ void border_align_forward_cuda_kernel(
    const int nthreads, const T* input, const T* boxes, T* output,
    int* argmax_idx, const int channels, const int box_size, const int height,
    const int width, const int pool_size) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Border handled by this thread, expected to lie in [0, 3].
    const int border = threadIdx.y;

    // Decode `index` into batch / channel / box positions.
    const int batch = index / channels / box_size;
    const int channel = (index / box_size) % channels;
    const int box = index % box_size + batch * box_size;

    const T* box_ptr = boxes + box * 4;
    const T box_w = box_ptr[2] - box_ptr[0];
    const T box_h = box_ptr[3] - box_ptr[1];

    // Feature plane that belongs to this border's channel group.
    const T* plane =
        input +
        (batch * channels * 4 + border * channels + channel) * height * width;

    // Borders 0 and 1 start from (x1, y1); borders 2 and 3 from (x2, y2).
    const T* start = box_ptr + border / 2 * 2;
    T x = start[0];
    T y = start[1];

    // Per-step displacement along the border. Top/left walk forwards from
    // (x1, y1); bottom/right walk backwards from (x2, y2).
    T step_x, step_y;
    switch (border) {
      case BorderMode::Top:
        step_x = box_w / pool_size;
        step_y = 0;
        break;
      case BorderMode::Left:
        step_x = 0;
        step_y = box_h / pool_size;
        break;
      case BorderMode::Bottom:
        step_x = -(box_w / pool_size);
        step_y = 0;
        break;
      case BorderMode::Right:
        step_x = 0;
        step_y = -(box_h / pool_size);
        break;
    }

    // The starting corner seeds the running maximum.
    T best = bilinear_interpolate(plane, height, width, y, x, index);
    int best_step = 0;

    // Max-pool over the remaining `pool_size` samples along the border.
    for (int step = 1; step <= pool_size; step++) {
      x += step_x;
      y += step_y;
      const T sample = bilinear_interpolate(plane, height, width, y, x, index);
      if (sample > best) {
        best = sample;
        best_step = step;
      }
    }

    // Publish the maximum and where it was found.
    output[index * 4 + border] = best;
    argmax_idx[index * 4 + border] = best_step;
  }
}

/*** Backward ***/
// Backward pass of BorderAlign: routes the gradient of every output value to
// the single sample position that produced the maximum in the forward pass.
//
//   grad_output  gradient w.r.t. the forward output, read at
//                index * 4 + extreme_idx.
//   boxes        4 values per box, read as (x1, y1, x2, y2).
//   argmax_idx   step index recorded by the forward kernel, same layout as
//                grad_output.
//   grad_input   gradient w.r.t. the input features, addressed as
//                (N, 4 * channels, height, width); accumulated with atomicAdd.
//   pool_size    number of steps a border is divided into.
template <typename T>
__global__ void border_align_backward_cuda_kernel(
    const int nthreads, const T* grad_output, const T* boxes,
    const int* argmax_idx, T* grad_input, const int channels,
    const int box_size, const int height, const int width,
    const int pool_size) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (batch_idx, c_idx, box_idx) is an element paralleled for computing
    // output, and `extreme_idx` is in range [0,3]
    int batch_idx, c_idx, box_idx, extreme_idx;
    const int* offset_argmax_idx;
    const T *offset_grad_output, *offset_box, *offset_box_x;
    T *offset_grad_input, box_width, box_height, stride, x_stride, y_stride, x,
        y;

    extreme_idx = threadIdx.y;
    batch_idx = index / channels / box_size;
    box_idx = index % box_size + batch_idx * box_size;
    c_idx = (index / box_size) % channels;

    offset_box = boxes + box_idx * 4;
    box_width = *(offset_box + 2) - *offset_box;
    box_height = *(offset_box + 3) - *(offset_box + 1);
    offset_grad_output = grad_output + index * 4 + extreme_idx;
    offset_argmax_idx = argmax_idx + index * 4 + extreme_idx;
    // [0,C) for top feature grad, [C,2C) for left feature grad,
    // [2C,3C) for bottom feature grad, [3C,4C) for right feature grad
    offset_grad_input = grad_input + (batch_idx * channels * 4 +
                                      extreme_idx * channels + c_idx) *
                                         height * width;

    // extreme_idx in [0,1] -> offset_box_x indexed at x1
    // extreme_idx in [2,3] -> offset_box_x indexed at x2
    offset_box_x = offset_box + extreme_idx / 2 * 2;

    // Per-step displacement along the border; must mirror the forward kernel.
    switch (extreme_idx) {
      // top
      case BorderMode::Top:
        stride = box_width / pool_size;
        x_stride = stride;
        y_stride = 0;
        break;
      // left
      case BorderMode::Left:
        stride = box_height / pool_size;
        x_stride = 0;
        y_stride = stride;
        break;
      // bottom
      case BorderMode::Bottom:
        stride = box_width / pool_size;
        x_stride = -stride;
        y_stride = 0;
        break;
      // right
      case BorderMode::Right:
        stride = box_height / pool_size;
        x_stride = 0;
        y_stride = -stride;
        break;
    }

    // get position (x,y) which has maximum value during forward
    x = *offset_box_x;
    y = *(offset_box_x + 1);
    x += x_stride * (T)(*offset_argmax_idx);
    y += y_stride * (T)(*offset_argmax_idx);

    T w1, w2, w3, w4;
    int x_low, x_high, y_low, y_high;
    bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low,
                                  x_high, y_low, y_high, index);

    // bilinear_interpolate_gradient (defined outside this header) is expected
    // to flag a sample that falls outside the feature map with negative
    // neighbour indices. Scattering with such indices would make atomicAdd
    // address memory in front of the feature plane, so the update is skipped;
    // this is the same guard the RoI-style align backward kernels apply.
    if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
      // accumulate the weighted gradient into the four neighbouring pixels
      atomicAdd(offset_grad_input + y_low * width + x_low,
                *offset_grad_output * w1);
      atomicAdd(offset_grad_input + y_low * width + x_high,
                *offset_grad_output * w2);
      atomicAdd(offset_grad_input + y_high * width + x_low,
                *offset_grad_output * w3);
      atomicAdd(offset_grad_input + y_high * width + x_high,
                *offset_grad_output * w4);
    }
  }
}

#endif  // BORDER_ALIGN_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/box_iou_quadri_cuda.cuh
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#ifndef BOX_IOU_QUADRI_CUDA_CUH
#define BOX_IOU_QUADRI_CUDA_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
#include "box_iou_rotated_utils.hpp"

// Thread-block shape: a 2D block of 32 * 16 = 512 threads.
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 16;

// Integer division rounded up: for x >= 0 and y > 0 this is the number of
// chunks of size `y` needed to cover `x` items.
inline int divideUP(const int x, const int y) {
  const int padded = x + y - 1;
  return padded / y;
}

// Computes IoU values between quadrilateral boxes, each stored as 8
// consecutive values.
//
//   aligned == true   dev_ious[i] is computed from box i of dev_boxes1 and
//                     box i of dev_boxes2 (n_boxes1 results).
//   aligned == false  dev_ious[i * n_boxes2 + j] is computed from box i of
//                     dev_boxes1 and box j of dev_boxes2.
//
// `mode_flag` is forwarded unchanged to single_box_iou_quadri, which is
// defined outside this header.
template <typename T>
__global__ void box_iou_quadri_cuda_kernel(
    const int n_boxes1, const int n_boxes2, const T* dev_boxes1,
    const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) {
  if (aligned) {
    CUDA_1D_KERNEL_LOOP(index, n_boxes1) {
      // Element-wise pairing: both boxes share the loop index.
      const int first = index;
      const int second = index;
      const int offset1 = first * 8;
      const int offset2 = second * 8;

      // Thread-local float copies of the two boxes.
      float quad1[8];
      float quad2[8];
#pragma unroll
      for (int k = 0; k < 8; ++k) {
        quad1[k] = dev_boxes1[offset1 + k];
        quad2[k] = dev_boxes2[offset2 + k];
      }

      dev_ious[index] = single_box_iou_quadri<T>(quad1, quad2, mode_flag);
    }
  } else {
    CUDA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) {
      // All-pairs: the loop index is a row-major (first, second) pair.
      const int first = index / n_boxes2;
      const int second = index % n_boxes2;
      const int offset1 = first * 8;
      const int offset2 = second * 8;

      // Thread-local float copies of the two boxes.
      float quad1[8];
      float quad2[8];
#pragma unroll
      for (int k = 0; k < 8; ++k) {
        quad1[k] = dev_boxes1[offset1 + k];
        quad2[k] = dev_boxes2[offset2 + k];
      }

      dev_ious[index] = single_box_iou_quadri<T>(quad1, quad2, mode_flag);
    }
  }
}

#endif


================================================
FILE: mmcv/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
#ifndef BOX_IOU_ROTATED_CUDA_CUH
#define BOX_IOU_ROTATED_CUDA_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
#include "box_iou_rotated_utils.hpp"

// Dimensions of the 2D thread block: 32 * 16 = 512 threads in total.
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 16;

// Ceiling of x / y for non-negative x and positive y.
inline int divideUP(const int x, const int y) {
  return (x + (y - 1)) / y;
}

// Computes IoU values between rotated boxes, each stored as 5 consecutive
// values (presumably centre x/y, width, height, angle -- TODO confirm against
// single_box_iou_rotated, which is defined outside this header).
//
//   aligned == true   dev_ious[i] pairs box i of dev_boxes1 with box i of
//                     dev_boxes2 (n_boxes1 results).
//   aligned == false  dev_ious[i * n_boxes2 + j] pairs box i of dev_boxes1
//                     with box j of dev_boxes2.
//
// `mode_flag` is passed through to single_box_iou_rotated unchanged.
template <typename T>
__global__ void box_iou_rotated_cuda_kernel(
    const int n_boxes1, const int n_boxes2, const T* dev_boxes1,
    const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) {
  if (aligned) {
    CUDA_1D_KERNEL_LOOP(index, n_boxes1) {
      // One-to-one pairing: the same row is used in both inputs.
      const int row1 = index;
      const int row2 = index;
      const int start1 = row1 * 5;
      const int start2 = row2 * 5;

      // Thread-local float copies of the two boxes.
      float lhs[5];
      float rhs[5];
#pragma unroll
      for (int k = 0; k < 5; ++k) {
        lhs[k] = dev_boxes1[start1 + k];
        rhs[k] = dev_boxes2[start2 + k];
      }

      dev_ious[index] = single_box_iou_rotated<T>(lhs, rhs, mode_flag);
    }
  } else {
    CUDA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) {
      // All-pairs: decode the flat index into a (row1, row2) pair.
      const int row1 = index / n_boxes2;
      const int row2 = index % n_boxes2;
      const int start1 = row1 * 5;
      const int start2 = row2 * 5;

      // Thread-local float copies of the two boxes.
      float lhs[5];
      float rhs[5];
#pragma unroll
      for (int k = 0; k < 5; ++k) {
        lhs[k] = dev_boxes1[start1 + k];
        rhs[k] = dev_boxes2[start2 + k];
      }

      dev_ious[index] = single_box_iou_rotated<T>(lhs, rhs, mode_flag);
    }
  }
}

#endif


================================================
FILE: mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef CARAFE_CUDA_KERNEL_CUH
#define CARAFE_CUDA_KERNEL_CUH

#include <ATen/cuda/DeviceUtils.cuh>

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// Number of lanes per warp assumed by the kernels in this header: 64 when
// built with MMCV_WITH_HIP, 32 otherwise. Also used as the row stride of the
// `shared_mask` arrays.
#ifdef MMCV_WITH_HIP
#define WARP_SIZE 64
#else
#define WARP_SIZE 32
#endif
// Threads of a block that cooperate on one pixel; they divide the channel
// loops between them.
#define THREADS_PER_PIXEL 32
// Presumably the per-block shared-memory budget in bytes (48 KiB) -- not
// referenced by the kernels in this header; TODO confirm.
#define MAX_SHARED_MEMORY 49152
// Element count used to size the `shared_mask` arrays.
#define MAX_SHARED_SCALAR_T 6144  // 49152 / 8 = 6144
// When true, `shared_mask` is declared as `float` with
// 2 * MAX_SHARED_SCALAR_T elements instead of `scalar_t` with
// MAX_SHARED_SCALAR_T elements.
#define MAXIMIZE_KERNEL_SIZE true
// Tile edge length and per-iteration row step of BatchTranspose2DCUDAKernel.
#define kTileDim 32
#define kBlockRows 8
// Lane mask passed to __shfl_down_sync (all 32 bits set).
#define FULL_MASK 0xffffffff

// Rounds the quotient x / y up to the next integer (x >= 0, y > 0), e.g. to
// turn an element count into a number of fixed-size groups.
inline int divideUP(const int x, const int y) {
  const int rounded_up = x + y - 1;
  return rounded_up / y;
}

// Flattens a 4-D coordinate into a linear offset:
//   ((n * channel_num + c) * height + h) * width + w
// The parameter names describe an (n, c, h, w) layout, but several kernels in
// this header pass their arguments in (n, h, w, c) order to address
// channel-last buffers with the same formula.
__device__ inline int Loc2Index(const int n, const int c, const int h,
                                const int w, const int channel_num,
                                const int height, const int width) {
  const int plane = c + n * channel_num;
  const int row = h + plane * height;
  return w + row * width;
}
#ifndef MMCV_WITH_HIP
/* TODO: move this to a common place */
// Returns `a` when it compares strictly less than `b`, otherwise `b`.
template <typename scalar_t>
__device__ inline scalar_t min(scalar_t a, scalar_t b) {
  if (a < b) {
    return a;
  }
  return b;
}

// Returns `a` when it compares strictly greater than `b`, otherwise `b`.
template <typename scalar_t>
__device__ inline scalar_t max(scalar_t a, scalar_t b) {
  if (a > b) {
    return a;
  }
  return b;
}
#endif
// Adds up `val` across the lanes of a warp using shuffle-down steps: the
// offset halves from WARP_SIZE / 2 down to 1, and on every step each lane
// adds the value held by the lane `offset` positions above it.
// The caller in this header only consumes the result on lane 0.
// The HIP build uses __shfl_down; otherwise __shfl_down_sync is called with
// FULL_MASK, i.e. every lane of the warp is named as a participant.
template <typename scalar_t>
__device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) {
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
#ifdef MMCV_WITH_HIP
    val += __shfl_down(val, offset);
#else
    val += __shfl_down_sync(FULL_MASK, val, offset);
#endif
  return val;
}

// Specialisation of warpReduceSum for `phalf`. It follows the same
// halving-offset shuffle loop as the generic version, but reads and
// accumulates the value through the __PHALF(...) accessor.
// `phalf` and `__PHALF` are defined outside this header.
template <>
__device__ __forceinline__ phalf warpReduceSum(phalf val) {
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
#ifdef MMCV_WITH_HIP
    // Using PyTorch's macro for half support
    __PHALF(val) += WARP_SHFL_DOWN(val, offset);
#else
    // The shuffle operates on the value converted to __half.
    __PHALF(val) +=
        __shfl_down_sync(FULL_MASK, __PHALF(val).operator __half(), offset);
#endif
  return val;
}

// Batched 2-D transpose: every H x W matrix of X is written to Y as a W x H
// matrix. The matrix is cut into kTileDim x kTileDim tiles; each block stages
// one tile in shared memory and writes it back transposed.
//   N       number of matrices (not read by the kernel itself)
//   dh, dw  number of tiles along H and W; blockIdx.x enumerates
//           (matrix, tile row, tile column).
// Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/
template <typename scalar_t>
__global__ void BatchTranspose2DCUDAKernel(const int N, const int H,
                                           const int W, const int dh,
                                           const int dw,
                                           const scalar_t *__restrict__ X,
                                           scalar_t *__restrict__ Y) {
  // One extra column of padding per row (the usual way to avoid shared-memory
  // bank conflicts on the transposed read).
  __shared__ scalar_t tile[kTileDim][kTileDim + 1];

  // Locate this block's matrix and tile.
  const int tiles_per_matrix = dh * dw;
  const int matrix_id = blockIdx.x / tiles_per_matrix;
  const int tile_id = blockIdx.x % tiles_per_matrix;
  const int tile_row = tile_id / dw;
  const int tile_col = tile_id % dw;
  const int base = matrix_id * H * W;

  // Phase 1: copy the source tile into shared memory, kBlockRows rows apart
  // per iteration.
  const int src_x = tile_col * kTileDim + threadIdx.x;
  const int src_y = tile_row * kTileDim + threadIdx.y;
  if (src_x < W) {
    for (int i = 0; threadIdx.y + i < kTileDim && src_y + i < H;
         i += kBlockRows) {
      tile[threadIdx.y + i][threadIdx.x] = X[base + (src_y + i) * W + src_x];
    }
  }

  // Every thread of the block reaches this barrier before the tile is read.
  __syncthreads();

  // Phase 2: write the tile back with rows and columns swapped.
  const int dst_x = tile_row * kTileDim + threadIdx.x;
  const int dst_y = tile_col * kTileDim + threadIdx.y;
  if (dst_x < H) {
    for (int i = 0; threadIdx.y + i < kTileDim && dst_y + i < W;
         i += kBlockRows) {
      Y[base + (dst_y + i) * H + dst_x] = tile[threadIdx.x][threadIdx.y + i];
    }
  }
}
// CARAFE forward pass: every output pixel is a weighted sum of a
// kernel_size x kernel_size window of the low-resolution features, with
// per-pixel weights taken from `bottom_masks`.
//
// All three buffers are addressed channel-last here: Loc2Index is called with
// its arguments in (n, y, x, c) order.
//   num_kernels   number of launched work items; THREADS_PER_PIXEL
//                 consecutive items share one output pixel.
//   bottom_data   low-resolution features, (n, down_height, down_width, c).
//   bottom_masks  reassembly weights, (n, height, width, mask_c).
//   top_data      upsampled output, (n, height, width, c).
//   scale_factor  an output pixel (ph, pw) maps to the low-resolution pixel
//                 (ph / scale_factor, pw / scale_factor).
template <typename scalar_t>
__global__ void CARAFEForward(
    const int num_kernels, const scalar_t *__restrict__ bottom_data,
    const scalar_t *__restrict__ bottom_masks, const int kernel_size,
    const int group_size, const int scale_factor, const int channels,
    const int down_height, const int down_width, const int height,
    const int width, const int mask_channels, scalar_t *__restrict__ top_data) {
  // Staging area for the mask values of the pixels handled by this block.
#if MAXIMIZE_KERNEL_SIZE
  __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2];
#else
  __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T];
#endif

  int index = threadIdx.x + blockIdx.x * blockDim.x;
  // NOTE(review): threads leaving here do not take part in the
  // __syncthreads() below.
  if (index > num_kernels - 1) {
    return;
  }
  // pixel_id: slot of this thread's pixel inside the block;
  // split_id: position of the thread among the threads sharing that pixel.
  const int pixel_id = threadIdx.x / THREADS_PER_PIXEL;
  const int split_id = threadIdx.x % THREADS_PER_PIXEL;
  // From here on `index` identifies the output pixel (n, ph, pw).
  index = index / THREADS_PER_PIXEL;
  const int pw = index % width;
  const int ph = (index / width) % height;
  const int n = index / width / height;

  // Corresponding pixel in the low-resolution feature map.
  const int down_pw = pw / scale_factor;
  const int down_ph = ph / scale_factor;

  // Half-open window [start, end) centred on the low-resolution pixel.
  const int start_w = down_pw - (kernel_size - 1) / 2;
  const int end_w = down_pw + (kernel_size - 1) / 2 + 1;
  const int start_h = down_ph - (kernel_size - 1) / 2;
  const int end_h = down_ph + (kernel_size - 1) / 2 + 1;
  // The threads of a pixel cooperatively copy all of its mask channels into
  // shared memory; entry (c, pixel_id) lives at c * WARP_SIZE + pixel_id.
  for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) {
    int mask_index = Loc2Index(n, ph, pw, c, height, width, mask_channels);
    shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index];
  }
  __syncthreads();

  // Consecutive runs of `channels_per_group` channels share one mask group.
  const int channels_per_group = ceilf(channels / (float)group_size);
#pragma unroll
  for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
    int mask_group = c / channels_per_group;
    scalar_t output_val = 0;
#pragma unroll
    for (int iy = start_h; iy < end_h; iy++) {
#pragma unroll
      for (int ix = start_w; ix < end_w; ix++) {
        // Window positions outside the feature map contribute nothing.
        if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {
          continue;
        }
        // Position inside the kernel window, then the mask channel of this
        // group that holds the weight for that position.
        int mask_iy = iy - down_ph + (kernel_size - 1) / 2;
        int mask_ix = ix - down_pw + (kernel_size - 1) / 2;
        int mask_c =
            (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
        int feat_index =
            Loc2Index(n, iy, ix, c, down_height, down_width, channels);

        output_val += bottom_data[feat_index] *
                      shared_mask[mask_c * WARP_SIZE + pixel_id];
      }
    }

    int top_index = Loc2Index(n, ph, pw, c, height, width, channels);
    top_data[top_index] = output_val;
  }
}

// CARAFE backward pass w.r.t. the features, evaluated on the high-resolution
// grid: for every pixel (n, ph, pw) and channel it sums `top_diff` over a
// window sampled every `scale_factor` pixels, weighted by mask values staged
// in shared memory.
//
//   top_diff      output gradient, addressed as (n, height, width, c).
//   bottom_masks  reassembly weights, addressed here as
//                 (n, mask_c, height, width).
//   bottom_diff   result, addressed as (n, height, width, c); each element is
//                 written once (plain store, no atomics).
// down_height / down_width are accepted but not read by this kernel.
template <typename scalar_t>
__global__ void CARAFEBackward_Feature(
    const int num_kernels, const scalar_t *__restrict__ top_diff,
    const scalar_t *__restrict__ bottom_masks, const int kernel_size,
    const int group_size, const int scale_factor, const int channels,
    const int down_height, const int down_width, const int height,
    const int width, const int mask_channels,
    scalar_t *__restrict__ bottom_diff) {
  // Staging area for the mask values needed by the pixels of this block.
#if MAXIMIZE_KERNEL_SIZE
  __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2];
#else
  __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T];
#endif

  int index = threadIdx.x + blockIdx.x * blockDim.x;
  // NOTE(review): threads leaving here do not take part in the
  // __syncthreads() below.
  if (index > num_kernels - 1) {
    return;
  }

  // pixel_id: slot of this thread's pixel inside the block;
  // split_id: position of the thread among the threads sharing that pixel.
  const int pixel_id = threadIdx.x / THREADS_PER_PIXEL;
  const int split_id = threadIdx.x % THREADS_PER_PIXEL;
  // (n, ph, pw) is the high-resolution pixel handled by this thread group
  index = index / THREADS_PER_PIXEL;
  const int pw = index % width;
  const int ph = (index / width) % height;
  const int n = index / width / height;

  // Half-open window [start, end) around the pixel; it is walked in steps of
  // `scale_factor` below.
  const int start_w = pw - (kernel_size - 1) * scale_factor / 2;
  const int end_w = pw + (kernel_size - 1) * scale_factor / 2 + 1;
  const int start_h = ph - (kernel_size - 1) * scale_factor / 2;
  const int end_h = ph + (kernel_size - 1) * scale_factor / 2 + 1;
  // Stage one mask value per mask channel c for this pixel.
  for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) {
    // Window position that belongs to kernel slot c.
    const int mask_w = (c % kernel_size) * scale_factor;
    const int mask_h = (c / kernel_size % kernel_size) * scale_factor;
    const int mask_x = start_w + mask_w;
    const int mask_y = start_h + mask_h;
    // Positions outside the map get a zero weight.
    if (mask_y < 0 || mask_y > height - 1 || mask_x < 0 || mask_x > width - 1) {
      shared_mask[c * WARP_SIZE + pixel_id] = 0;
      continue;
    }
    // mask_c is c mirrored inside its group: slot `loc` of a group maps to
    // slot kernel_size * kernel_size - 1 - loc of the same group.
    const int mask_group = c / (kernel_size * kernel_size);
    const int mask_c = (2 * mask_group + 1) * kernel_size * kernel_size - c - 1;
    int mask_index =
        Loc2Index(n, mask_c, mask_y, mask_x, mask_channels, height, width);
    shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index];
  }
  __syncthreads();
  // Consecutive runs of `channels_per_group` channels share one mask group.
  const int channels_per_group = ceilf(channels / (float)group_size);
#pragma unroll
  for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
    int mask_group = c / channels_per_group;
    int top_index = Loc2Index(n, ph, pw, c, height, width, channels);
    scalar_t output_val = 0;
#pragma unroll
    for (int iy = start_h; iy < end_h; iy += scale_factor) {
#pragma unroll
      for (int ix = start_w; ix < end_w; ix += scale_factor) {
        // Window positions outside the map contribute nothing.
        if (iy < 0 || iy > height - 1 || ix < 0 || ix > width - 1) {
          continue;
        }
        // Kernel slot of (iy, ix) and the staged mask entry for it.
        int mask_iy =
            (iy - ph + (kernel_size - 1) * scale_factor / 2) / scale_factor;
        int mask_ix =
            (ix - pw + (kernel_size - 1) * scale_factor / 2) / scale_factor;
        int mask_c =
            (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
        int feat_index = Loc2Index(n, iy, ix, c, height, width, channels);
        output_val +=
            shared_mask[mask_c * WARP_SIZE + pixel_id] * top_diff[feat_index];
      }
    }
    bottom_diff[top_index] = output_val;
  }
}

// Sum-pools a channel-last tensor by `scale_factor` in both spatial
// directions: output pixel (n, ph, pw, c) is the sum of the
// scale_factor x scale_factor input pixels it covers.
//   input_data   addressed as (n, height * scale_factor,
//                width * scale_factor, c)
//   output_data  addressed as (n, height, width, c)
// THREADS_PER_PIXEL consecutive work items share one output pixel and divide
// its channels between them.
template <typename scalar_t>
__global__ void FeatureSum(const int num_kernels,
                           const scalar_t *__restrict__ input_data,
                           const int scale_factor, const int channels,
                           const int height, const int width,
                           scalar_t *__restrict__ output_data) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid >= num_kernels) {
    return;
  }

  // Which of the threads sharing a pixel this is, and which pixel it serves.
  const int lane = threadIdx.x % THREADS_PER_PIXEL;
  const int pixel = tid / THREADS_PER_PIXEL;
  const int out_x = pixel % width;
  const int out_y = (pixel / width) % height;
  const int batch = pixel / width / height;

  // Geometry of the input and the top-left corner of the covered patch.
  const int in_height = height * scale_factor;
  const int in_width = width * scale_factor;
  const int y_begin = out_y * scale_factor;
  const int x_begin = out_x * scale_factor;

  for (int c = lane; c < channels; c += THREADS_PER_PIXEL) {
    scalar_t acc = 0;
    // Rows first, then columns: same accumulation order as a row-major scan.
    for (int dy = 0; dy < scale_factor; dy++) {
      for (int dx = 0; dx < scale_factor; dx++) {
        const int src = Loc2Index(batch, y_begin + dy, x_begin + dx, c,
                                  in_height, in_width, channels);
        acc += input_data[src];
      }
    }
    const int dst = Loc2Index(batch, out_y, out_x, c, height, width, channels);
    output_data[dst] = acc;
  }
}

// CARAFE backward pass w.r.t. the masks. One group of WARP_SIZE consecutive
// work items computes a single mask gradient (n, ph, pw, mask_c): each lane
// accumulates top_diff * bottom_data over a strided subset of the channels of
// the mask's group, the partial sums are combined with warpReduceSum, and
// lane 0 stores the result.
//
//   top_diff     output gradient, addressed as (n, height, width, c).
//   bottom_data  low-resolution features, addressed as
//                (n, down_height, down_width, c).
//   mask_diff    result, addressed as (n, height, width, mask_c).
template <typename scalar_t>
__global__ void CARAFEBackward_Mask(const int num_kernels,
                                    const scalar_t *__restrict__ top_diff,
                                    const scalar_t *__restrict__ bottom_data,
                                    const int kernel_size, const int group_size,
                                    const int scale_factor, const int channels,
                                    const int down_height, const int down_width,
                                    const int height, const int width,
                                    const int mask_channels,
                                    scalar_t *__restrict__ mask_diff) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  // NOTE(review): threads leaving here do not take part in the warp
  // synchronisation / shuffle below.
  if (index > num_kernels - 1) {
    return;
  }

  // Decode the work item: lane within the group, then mask channel, then
  // pixel.
  const int lane_id = index % WARP_SIZE;
  index = index / WARP_SIZE;
  const int mask_c = index % mask_channels;
  // (n, ph, pw) is the high-resolution pixel whose mask gradient is computed
  index = index / mask_channels;
  const int pw = index % width;
  const int ph = (index / width) % height;
  const int n = index / width / height;

  // Corresponding pixel in the low-resolution feature map.
  const int down_pw = pw / scale_factor;
  const int down_ph = ph / scale_factor;

  // Split the mask channel into its group and its slot inside the kernel.
  const int mask_group = mask_c / (kernel_size * kernel_size);
  const int mask_loc = mask_c % (kernel_size * kernel_size);

  // Offset of that kernel slot relative to the window centre.
  const int offset_x = mask_loc % kernel_size - (kernel_size - 1) / 2;
  const int offset_y =
      mask_loc / kernel_size % kernel_size - (kernel_size - 1) / 2;

  // Low-resolution pixel that this mask entry weights.
  const int down_x = down_pw + offset_x;
  const int down_y = down_ph + offset_y;

  scalar_t output_val = 0;

  // Lanes whose source pixel lies outside the map keep a zero partial sum.
  if (down_y >= 0 && down_y <= down_height - 1 && down_x >= 0 &&
      down_x <= down_width - 1) {
    // Channel range [start, end) served by this mask group; the last group is
    // clipped to `channels`.
    const int channels_per_mask = ceilf(channels / (float)group_size);
    const int start = channels_per_mask * mask_group;
    const int end = min(channels_per_mask * (mask_group + 1), channels);
    for (int c = start + lane_id; c < end; c += WARP_SIZE) {
      int bottom_id =
          Loc2Index(n, down_y, down_x, c, down_height, down_width, channels);
      int top_id = Loc2Index(n, ph, pw, c, height, width, channels);
      output_val += top_diff[top_id] * bottom_data[bottom_id];
    }
  }
  // Synchronise before the shuffle-based reduction.
#ifdef MMCV_WITH_HIP
  __syncthreads();
#else
  __syncwarp();
#endif
  output_val = warpReduceSum(output_val);
  // Only lane 0 writes the reduced value.
  if (lane_id == 0) {
    const int mask_id =
        Loc2Index(n, ph, pw, mask_c, height, width, mask_channels);
    mask_diff[mask_id] = output_val;
  }
}

#endif  // CARAFE_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef CARAFE_NAIVE_CUDA_KERNEL_CUH
#define CARAFE_NAIVE_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// Linear offset of element (n, c, h, w) in a contiguous tensor whose last
// three dimensions are (channel_num, height, width).
__device__ inline int Loc2Index(const int n, const int c, const int h,
                                const int w, const int channel_num,
                                const int height, const int width) {
  return ((n * channel_num + c) * height + h) * width + w;
}

// Naive CARAFE forward pass: one output element per loop iteration.
//
// For the output location (n, c, ph, pw) the kernel visits a
// kernel_size x kernel_size window of bottom_data centred on
// (ph / scale_factor, pw / scale_factor), multiplies each in-bounds sample by
// the matching entry of bottom_masks taken at (ph, pw), and stores the sum in
// top_data[index].
//
//   nthreads      number of output elements to process
//   bottom_data   input features, indexed at the down-sampled resolution
//                 (height / scale_factor, width / scale_factor)
//   bottom_masks  reassembly weights, indexed at the output resolution with
//                 kernel_size * kernel_size * group_size channels
//   top_data      output buffer, indexed by the flat loop index
//   channels/height/width  dimensions used to decompose the flat index
template <typename scalar_t>
__global__ void carafe_naive_forward_cuda_kernel(
    const int nthreads, const scalar_t *bottom_data,
    const scalar_t *bottom_masks, scalar_t *top_data, const int kernel_size,
    const int group_size, const int scale_factor, const int channels,
    const int height, const int width) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Decompose the flat index into an (n, c, ph, pw) output coordinate.
    const int pw = index % width;
    const int ph = (index / width) % height;
    const int c = (index / width / height) % channels;
    const int n = index / width / height / channels;

    const int mask_channels = kernel_size * kernel_size * group_size;
    const int mask_group = c / (channels / group_size);
    const int radius = (kernel_size - 1) / 2;

    // Window centre and extent in the down-sampled feature map.
    const int center_x = pw / scale_factor;
    const int center_y = ph / scale_factor;
    const int down_width = width / scale_factor;
    const int down_height = height / scale_factor;

    scalar_t acc = 0;
    for (int dy = -radius; dy <= radius; ++dy) {
      const int src_y = center_y + dy;
      for (int dx = -radius; dx <= radius; ++dx) {
        const int src_x = center_x + dx;
        const bool inside = src_y >= 0 && src_y <= down_height - 1 &&
                            src_x >= 0 && src_x <= down_width - 1;
        if (!inside) {
          continue;
        }
        // Position of this tap inside the kernel window.
        const int tap_y = dy + radius;
        const int tap_x = dx + radius;
        const int mask_c =
            (mask_group * kernel_size + tap_y) * kernel_size + tap_x;
        const int feat_index =
            Loc2Index(n, c, src_y, src_x, channels, down_height, down_width);
        const int mask_index =
            Loc2Index(n, mask_c, ph, pw, mask_channels, height, width);
        acc += bottom_data[feat_index] * bottom_masks[mask_index];
      }
    }
    top_data[index] = acc;
  }
}

// Naive CARAFE backward pass: one output-gradient element per loop iteration.
//
// For the output location (n, c, ph, pw) the kernel walks the same
// kernel_size x kernel_size window as the forward pass and, for every
// in-bounds tap, atomically accumulates
//   bottom_diff[feat] += bottom_masks[mask] * top_diff[index]
//   mask_diff[mask]   += bottom_data[feat]  * top_diff[index]
// atomicAdd is used because several iterations can target the same
// bottom_diff / mask_diff entry.
template <typename scalar_t>
__global__ void carafe_naive_backward_cuda_kernel(
    const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_data,
    const scalar_t *bottom_masks, scalar_t *bottom_diff, scalar_t *mask_diff,
    const int kernel_size, const int group_size, const int scale_factor,
    const int channels, const int height, const int width) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Decompose the flat index into an (n, c, ph, pw) output coordinate.
    const int pw = index % width;
    const int ph = (index / width) % height;
    const int c = (index / width / height) % channels;
    const int n = index / width / height / channels;

    const int mask_channels = kernel_size * kernel_size * group_size;
    const int mask_group = c / (channels / group_size);
    const int radius = (kernel_size - 1) / 2;

    // Window centre and extent in the down-sampled feature map.
    const int center_x = pw / scale_factor;
    const int center_y = ph / scale_factor;
    const int down_width = width / scale_factor;
    const int down_height = height / scale_factor;

    for (int dy = -radius; dy <= radius; ++dy) {
      const int src_y = center_y + dy;
      for (int dx = -radius; dx <= radius; ++dx) {
        const int src_x = center_x + dx;
        const bool inside = src_y >= 0 && src_y <= down_height - 1 &&
                            src_x >= 0 && src_x <= down_width - 1;
        if (!inside) {
          continue;
        }
        // Position of this tap inside the kernel window.
        const int tap_y = dy + radius;
        const int tap_x = dx + radius;
        const int mask_c =
            (mask_group * kernel_size + tap_y) * kernel_size + tap_x;
        const int feat_index =
            Loc2Index(n, c, src_y, src_x, channels, down_height, down_width);
        const int mask_index =
            Loc2Index(n, mask_c, ph, pw, mask_channels, height, width);
        atomicAdd(bottom_diff + feat_index,
                  bottom_masks[mask_index] * top_diff[index]);
        atomicAdd(mask_diff + mask_index,
                  bottom_data[feat_index] * top_diff[index]);
      }
    }
  }
}

#endif  // CARAFE_NAIVE_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cu
#ifndef CHAMFER_DISTANCE_CUDA_KERNEL_CUH
#define CHAMFER_DISTANCE_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

#define MAX_SHARED_SCALAR_T 6144  // 49152 / 8 = 6144

// For every 2-D point in xyz, finds the nearest 2-D point in xyz2 of the same
// batch and stores the squared distance in result and the matching index in
// result_i.
//
//   b         number of batches (walked with a stride of gridDim.x)
//   n, xyz    query points, read as xyz[(batch * n + j) * 2 + {0, 1}]
//   m, xyz2   candidate points, read as xyz2[(batch * m + k) * 2 + {0, 1}]
//   result    per-query best squared distance, written at batch * n + j
//   result_i  per-query index into the batch's xyz2 points
//
// Candidates are processed in tiles of THREADS_PER_BLOCK points that are first
// staged in shared memory; the running best in result/result_i is refined
// tile by tile.
template <typename scalar_t>
__global__ void chamfer_distance_forward_cuda_kernel(int b, int n,
                                                     const scalar_t* xyz, int m,
                                                     const scalar_t* xyz2,
                                                     scalar_t* result,
                                                     int* result_i) {
  __shared__ scalar_t buf[MAX_SHARED_SCALAR_T];
  for (int batch = blockIdx.x; batch < b; batch += gridDim.x) {
    for (int tile = 0; tile < m; tile += THREADS_PER_BLOCK) {
      const int tile_len = min(m, tile + THREADS_PER_BLOCK) - tile;
      // Stage this tile's (x, y) pairs into shared memory.
      for (int t = threadIdx.x; t < tile_len * 2; t += blockDim.x) {
        buf[t] = xyz2[(batch * m + tile) * 2 + t];
      }
      __syncthreads();
      for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) {
        const int query = batch * n + j;
        const scalar_t x1 = xyz[query * 2 + 0];
        const scalar_t y1 = xyz[query * 2 + 1];
        int best_i = 0;
        scalar_t best = 1e10;
        // Largest multiple of four that fits in the tile; handled in groups
        // of four so the inner loop can be unrolled.
        const int quads_end = tile_len & (~3);
        if (quads_end == THREADS_PER_BLOCK) {
          // Full tile: the trip count is a compile-time constant.
          for (int k = 0; k < THREADS_PER_BLOCK; k += 4) {
#pragma unroll
            for (int u = 0; u < 4; ++u) {
              const scalar_t dx = buf[(k + u) * 2] - x1;
              const scalar_t dy = buf[(k + u) * 2 + 1] - y1;
              const scalar_t d = dx * dx + dy * dy;
              if (d < best) {
                best = d;
                best_i = k + tile + u;
              }
            }
          }
        } else {
          // Partial tile: same work with a run-time bound.
          for (int k = 0; k < quads_end; k += 4) {
#pragma unroll
            for (int u = 0; u < 4; ++u) {
              const scalar_t dx = buf[(k + u) * 2] - x1;
              const scalar_t dy = buf[(k + u) * 2 + 1] - y1;
              const scalar_t d = dx * dx + dy * dy;
              if (d < best) {
                best = d;
                best_i = k + tile + u;
              }
            }
          }
        }
        // Remaining (tile_len % 4) candidates.
        for (int k = quads_end; k < tile_len; k++) {
          const scalar_t dx = buf[k * 2 + 0] - x1;
          const scalar_t dy = buf[k * 2 + 1] - y1;
          const scalar_t d = dx * dx + dy * dy;
          if (k == 0 || d < best) {
            best = d;
            best_i = k + tile;
          }
        }
        // The first tile initialises the output; later tiles only improve it.
        if (tile == 0 || result[query] > best) {
          result[query] = best;
          result_i[query] = best_i;
        }
      }
      // Do not overwrite buf until every thread has finished with this tile.
      __syncthreads();
    }
  }
}

// Back-propagates the squared nearest-neighbour distance computed by the
// forward kernel.
//
// For each query point j of batch i, with matched candidate j2 = idx1[i*n+j]
// and g = 2 * grad_dist1[i*n+j], the kernel atomically adds g * (p1 - p2) to
// grad_xyz1 at the query point and -(g * (p1 - p2)) to grad_xyz2 at the
// matched candidate, per coordinate.
template <typename scalar_t>
__global__ void chamfer_distance_backward_cuda_kernel(
    int b, int n, const scalar_t* xyz1, int m, const scalar_t* xyz2,
    const scalar_t* grad_dist1, const int* idx1, scalar_t* grad_xyz1,
    scalar_t* grad_xyz2) {
  for (int batch = blockIdx.x; batch < b; batch += gridDim.x) {
    for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) {
      const int query = batch * n + j;
      const scalar_t x1 = xyz1[query * 2 + 0];
      const scalar_t y1 = xyz1[query * 2 + 1];

      const int j2 = idx1[query];
      const int match = batch * m + j2;
      const scalar_t x2 = xyz2[match * 2 + 0];
      const scalar_t y2 = xyz2[match * 2 + 1];

      const scalar_t g = grad_dist1[query] * 2;

      atomicAdd(&(grad_xyz1[query * 2 + 0]), g * (x1 - x2));
      atomicAdd(&(grad_xyz1[query * 2 + 1]), g * (y1 - y2));
      atomicAdd(&(grad_xyz2[match * 2 + 0]), -(g * (x1 - x2)));
      atomicAdd(&(grad_xyz2[match * 2 + 1]), -(g * (y1 - y2)));
    }
  }
}
#endif  // CHAMFER_DISTANCE_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp
================================================
#ifndef COMMON_CUDA_HELPER
#define COMMON_CUDA_HELPER

#include <cuda.h>

// Loop header for a 1-D kernel: `i` starts at this thread's global x index
// (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total number of
// threads in the grid's x dimension until it reaches `n`. Note that `i` is a
// plain int.
#define CUDA_1D_KERNEL_LOOP(i, n)                              \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

// Nested loop header for a 2-D kernel: `i` walks [0, n) using the x dimension
// of the grid and `j` walks [0, m) using the y dimension, each starting at the
// thread's global index and advancing by the total thread count of that
// dimension. Both indices are size_t.
#define CUDA_2D_KERNEL_LOOP(i, n, j, m)                             \
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);   \
       i += blockDim.x * gridDim.x)                                 \
    for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
         j += blockDim.y * gridDim.y)

// Nested loop header at block granularity: `i` starts at blockIdx.x and
// advances by gridDim.x, `j` starts at blockIdx.y and advances by gridDim.y.
// Every thread of a block sees the same (i, j) sequence.
#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m)          \
  for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \
    for (size_t j = blockIdx.y; j < (m); j += gridDim.y)

// Default number of threads per block; also the default divisor used by
// GET_BLOCKS below.
#define THREADS_PER_BLOCK 512

// Returns the number of blocks needed to cover N work items with num_threads
// threads per block (rounded up), capped at 4096 blocks.
inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {
  const int block_cap = 4096;
  const int blocks_needed = (N + num_threads - 1) / num_threads;
  return min(blocks_needed, block_cap);
}

// Bilinearly samples a single-channel height x width map at (y, x).
//
//   input   pointer to the map, read as input[row * width + col]
//   y, x    sampling position in pixel units
//   index   unused; kept for debugging
//
// Returns 0 when the position lies more than one pixel before the first
// row/column or beyond height/width. Otherwise negative coordinates are
// clamped to 0 and positions at or past the last row/column snap to it.
template <typename T>
__device__ T bilinear_interpolate(const T* input, const int height,
                                  const int width, T y, T x,
                                  const int index /* index for debug only*/) {
  const bool out_of_range = y < -1.0 || y > height || x < -1.0 || x > width;
  if (out_of_range) return 0;

  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  int y_low = (int)y;
  int x_low = (int)x;
  int y_high;
  int x_high;

  // Snap to the last row when there is no row below to blend with.
  if (y_low >= height - 1) {
    y_low = height - 1;
    y_high = y_low;
    y = (T)y_low;
  } else {
    y_high = y_low + 1;
  }

  // Snap to the last column when there is no column to the right.
  if (x_low >= width - 1) {
    x_low = width - 1;
    x_high = x_low;
    x = (T)x_low;
  } else {
    x_high = x_low + 1;
  }

  // Fractional offsets from the low corner and their complements.
  T ly = y - y_low;
  T lx = x - x_low;
  T hy = 1. - ly;
  T hx = 1. - lx;

  // The four neighbouring samples.
  T top_left = input[y_low * width + x_low];
  T top_right = input[y_low * width + x_high];
  T bottom_left = input[y_high * width + x_low];
  T bottom_right = input[y_high * width + x_high];

  T w1 = hy * hx;
  T w2 = hy * lx;
  T w3 = ly * hx;
  T w4 = ly * lx;

  return (w1 * top_left + w2 * top_right + w3 * bottom_left +
          w4 * bottom_right);
}

// Computes the four bilinear weights and the corner indices that
// bilinear_interpolate would use at (y, x), so a caller can scatter a
// gradient to those corners.
//
//   w1..w4                  out: weights for (y_low, x_low), (y_low, x_high),
//                           (y_high, x_low), (y_high, x_high)
//   x_low/x_high/y_low/y_high  out: corner indices
//   index                   unused; kept for debugging
//
// When (y, x) is out of range all weights are set to 0 and all indices to -1.
template <typename T>
__device__ void bilinear_interpolate_gradient(
    const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4,
    int& x_low, int& x_high, int& y_low, int& y_high,
    const int index /* index for debug only*/) {
  const bool out_of_range = y < -1.0 || y > height || x < -1.0 || x > width;
  if (out_of_range) {
    w4 = 0.;
    w3 = w4;
    w2 = w3;
    w1 = w2;
    y_high = -1;
    y_low = y_high;
    x_high = y_low;
    x_low = x_high;
    return;
  }

  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  y_low = (int)y;
  x_low = (int)x;

  // Snap to the last row when there is no row below to blend with.
  if (y_low >= height - 1) {
    y_low = height - 1;
    y_high = y_low;
    y = (T)y_low;
  } else {
    y_high = y_low + 1;
  }

  // Snap to the last column when there is no column to the right.
  if (x_low >= width - 1) {
    x_low = width - 1;
    x_high = x_low;
    x = (T)x_low;
  } else {
    x_high = x_low + 1;
  }

  // Fractional offsets from the low corner and their complements.
  T ly = y - y_low;
  T lx = x - x_low;
  T hy = 1. - ly;
  T hx = 1. - lx;

  // Same weights as the forward interpolation:
  //   val = w1 * v(y_low, x_low) + w2 * v(y_low, x_high)
  //       + w3 * v(y_high, x_low) + w4 * v(y_high, x_high)
  w1 = hy * hx;
  w2 = hy * lx;
  w3 = ly * hx;
  w4 = ly * lx;
}
#endif  // COMMON_CUDA_HELPER


================================================
FILE: mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef CONVEX_IOU_CUDA_KERNEL_CUH
#define CONVEX_IOU_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// Capacity of the Point scratch arrays and of the flattened gradient buffers
// (cut_grad / ccur_grad) used by the polygon routines in this header.
#define MAXN 100
// Capacity of the index stack used by Jarvis().
#define NMAX 512
// Absolute tolerance used by sig() to treat a value as zero.
__device__ const double EPS = 1E-8;

// Three-way sign with a dead zone: 1 if d > EPS, -1 if d < -EPS, else 0.
__device__ inline int sig(double d) {
  if (d > EPS) return 1;
  if (d < -EPS) return -1;
  return 0;
}

// 2-D point with double-precision coordinates.
struct Point {
  double x;
  double y;

  // Leaves both coordinates uninitialised.
  __device__ Point() {}

  // Builds the point (px, py).
  __device__ Point(double px, double py) : x(px), y(py) {}
};

// True when a and b coincide within the sig() tolerance on both coordinates.
__device__ inline bool point_same(Point& a, Point& b) {
  if (sig(a.x - b.x) != 0) return false;
  return sig(a.y - b.y) == 0;
}

// Exchanges the coordinates of the two points that a and b refer to.
__device__ inline void swap1(Point* a, Point* b) {
  const double held_x = a->x;
  const double held_y = a->y;

  a->x = b->x;
  a->y = b->y;

  b->x = held_x;
  b->y = held_y;
}

// Reverses the first n points of a in place by swapping from both ends
// towards the middle.
__device__ inline void reverse1(Point* a, const int n) {
  for (int lo = 0, hi = n - 1; lo < hi; ++lo, --hi) {
    swap1(&(a[lo]), &(a[hi]));
  }
}

// 2-D cross product of the vectors (a - o) and (b - o).
__device__ inline double cross(Point o, Point a, Point b) {
  const double oa_x = a.x - o.x;
  const double oa_y = a.y - o.y;
  const double ob_x = b.x - o.x;
  const double ob_y = b.y - o.y;
  return oa_x * ob_y - ob_x * oa_y;
}

// Squared Euclidean distance between a and b (no square root is taken).
__device__ inline double dis(Point a, Point b) {
  const double dx = a.x - b.x;
  const double dy = a.y - b.y;
  return dx * dx + dy * dy;
}
// Signed area of the polygon ps[0..n-1] by the shoelace formula.
// Writes ps[n] = ps[0] to close the ring, so ps must have room for n + 1
// points.
__device__ inline double area(Point* ps, int n) {
  ps[n] = ps[0];
  double twice_area = 0;
  for (int i = 0; i < n; i++) {
    const Point& cur = ps[i];
    const Point& nxt = ps[i + 1];
    twice_area += cur.x * nxt.y - cur.y * nxt.x;
  }
  return twice_area / 2.0;
}
// Signed shoelace area of ps[0..n-1] together with its gradient with respect
// to selected vertices.
//
//   ps                     polygon vertices; ps[n] is overwritten with ps[0]
//   polygon_to_pred_index  two tables of n_pred entries each: entry j is a
//                          polygon vertex index, entry j + n_pred is the
//                          destination slot for that vertex in grad_C
//   grad_C                 out: (d area / dx, d area / dy) written at
//                          [2 * slot, 2 * slot + 1] for every polygon vertex
//                          found in the first table; other entries untouched
//
// Returns the signed area.
__device__ inline double polygon_area_grad(Point* ps, int n,
                                           int* polygon_to_pred_index,
                                           int n_pred, double* grad_C) {
  ps[n] = ps[0];
  // Per vertex i: slots [4i, 4i+1] hold the contribution of the edge arriving
  // from the previous vertex, slots [4i+2, 4i+3] that of the edge leaving
  // towards the next one.
  double partial_grad[4 * 30 + 2];
  double twice_area = 0;
  for (int i = 0; i < n; i++) {
    const Point& cur = ps[i];
    const Point& nxt = ps[i + 1];
    twice_area += cur.x * nxt.y - cur.y * nxt.x;
    partial_grad[i * 4 + 2] = nxt.y;
    partial_grad[i * 4 + 3] = -nxt.x;
    // The last edge closes the ring, so its end vertex is vertex 0.
    const int incoming = (i != n - 1) ? i * 4 + 4 : 0;
    partial_grad[incoming] = -cur.y;
    partial_grad[incoming + 1] = cur.x;
  }

  for (int i = 0; i < n; i++) {
    for (int j = 0; j < n_pred; j++) {
      if (i != polygon_to_pred_index[j]) continue;
      // Only the first matching table entry is used.
      const int slot = polygon_to_pred_index[j + n_pred];
      grad_C[2 * slot] = (partial_grad[i * 4] + partial_grad[i * 4 + 2]) / 2;
      grad_C[2 * slot + 1] =
          (partial_grad[i * 4 + 1] + partial_grad[i * 4 + 1 + 2]) / 2;
      break;
    }
  }

  return twice_area / 2.0;
}

// Intersects the line through a and b with the line through c and d and
// records how the intersection point depends on c and d.
//
//   p         out: intersection point (written only when 1 is returned)
//   cut_grad  out: flattened Jacobian buffer; this call writes into the row
//             that starts at offset 4 * n * m
//   m         index of the output vertex being produced (selects the row)
//   n         number of vertices of the polygon being cut (row width is 4 * n)
//   i         index of c in that polygon; d is vertex i + 1, wrapping to
//             vertex 0 when i == n - 1
//
// Within a row, the four entries for a source vertex v are stored at
// [4v .. 4v+3] as (dxp/dxv, dyp/dxv, dxp/dyv, dyp/dyv).
//
// Returns 2 when both c and d lie on line ab (within EPS), 0 when
// s2 - s1 is zero within EPS (no unique intersection), and 1 otherwise.
__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p,
                                double* cut_grad, int m, int n, int i) {
  double s1, s2;
  double s2_s1_2;
  double ds1_dxc, ds1_dyc, ds2_dxd, ds2_dyd;
  double dxp_dxc, dxp_dyc, dxp_dxd, dxp_dyd, dyp_dxc, dyp_dyc, dyp_dxd, dyp_dyd;
  // s1, s2: signed cross products of c and d relative to the directed line ab.
  s1 = cross(a, b, c);
  s2 = cross(a, b, d);

  // Partial derivatives of s1 w.r.t. c and of s2 w.r.t. d; they are equal
  // because cross() is linear in its last argument.
  ds1_dxc = -(b.y - a.y);
  ds1_dyc = b.x - a.x;
  ds2_dxd = ds1_dxc;
  ds2_dyd = ds1_dyc;
  // Squared denominator used by the quotient rule below.
  s2_s1_2 = (s2 - s1) * (s2 - s1);

  if (sig(s1) == 0 && sig(s2) == 0) return 2;
  if (sig(s2 - s1) == 0) return 0;

  // Quotient-rule derivatives of p.x = (c.x * s2 - d.x * s1) / (s2 - s1).
  dxp_dxc =
      ((s2 - d.x * ds1_dxc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dxc)) /
      (s2_s1_2);
  dxp_dyc =
      ((0 - d.x * ds1_dyc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dyc)) /
      (s2_s1_2);
  dxp_dxd =
      ((c.x * ds2_dxd - s1) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dxd)) /
      (s2_s1_2);
  dxp_dyd =
      ((c.x * ds2_dyd - 0) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dyd)) /
      (s2_s1_2);

  // Quotient-rule derivatives of p.y = (c.y * s2 - d.y * s1) / (s2 - s1).
  dyp_dxc =
      ((0 - d.y * ds1_dxc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dxc)) /
      (s2_s1_2);
  dyp_dyc =
      ((s2 - d.y * ds1_dyc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dyc)) /
      (s2_s1_2);
  dyp_dxd =
      ((c.y * ds2_dxd - 0) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dxd)) /
      (s2_s1_2);
  dyp_dyd =
      ((c.y * ds2_dyd - s1) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dyd)) /
      (s2_s1_2);

  p.x = (c.x * s2 - d.x * s1) / (s2 - s1);
  p.y = (c.y * s2 - d.y * s1) / (s2 - s1);
  if (i == n - 1) {
    // Closing edge: c is vertex i, d wraps around to vertex 0.
    cut_grad[4 * n * m + 4 * i] = dxp_dxc;  // + dyp_dxc;
    cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;
    cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc;  // + dyp_dyc;
    cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;
    cut_grad[4 * n * m + 0] = dxp_dxd;  // + dyp_dxd;
    cut_grad[4 * n * m + 1] = dyp_dxd;
    cut_grad[4 * n * m + 2] = dxp_dyd;  // + dyp_dyd;
    cut_grad[4 * n * m + 3] = dyp_dyd;
  } else {
    // Interior edge: c is vertex i, d is vertex i + 1.
    cut_grad[4 * n * m + 4 * i] = dxp_dxc;  // + dyp_dxc;
    cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;
    cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc;  // + dyp_dyc;
    cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;
    cut_grad[4 * n * m + 4 * (i + 1)] = dxp_dxd;  // + dyp_dxd;
    cut_grad[4 * n * m + 4 * (i + 1) + 1] = dyp_dxd;
    cut_grad[4 * n * m + 4 * (i + 1) + 2] = dxp_dyd;  // + dyp_dyd;
    cut_grad[4 * n * m + 4 * (i + 1) + 3] = dyp_dyd;
  }

  return 1;
}
// Clips the polygon p[0..n-1] in place against the directed line a -> b,
// keeping the vertices for which cross(a, b, v) is positive, and records the
// Jacobian of the resulting vertices with respect to the input vertices.
//
//   p         in/out: polygon vertices; p[n] is overwritten with p[0]
//   n         in/out: vertex count before / after clipping
//   cut_grad  out: one row of 4 * k entries per surviving vertex, where k is
//             the vertex count on entry; row layout as written by lineCross
//
// NOTE(review): ccur_grad holds MAXN doubles and is indexed up to
// 4 * n * m + 4 * i + 3, so it relies on n and m staying small — confirm the
// bound against the callers.
__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b,
                                   double* cut_grad) {
  Point pp[MAXN];
  double ccur_grad[MAXN] = {};
  int m = 0;
  p[n] = p[0];
  // Remember the input vertex count; n is reused for the output count below.
  int k = n;
  for (int i = 0; i < n; i++) {
    if (sig(cross(a, b, p[i])) > 0) {
      // Vertex survives unchanged: identity entries for dx/dx and dy/dy.
      pp[m] = p[i];
      ccur_grad[4 * n * m + 4 * i] = 1.0;
      ccur_grad[4 * n * m + 4 * i + 3] = 1.0;
      m++;
    }
    if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {
      // Edge (i, i + 1) changes side: emit its intersection with line ab.
      // The return value of lineCross is not checked; m advances regardless.
      lineCross(a, b, p[i], p[i + 1], pp[m], ccur_grad, m, n, i);
      m++;
    }
  }

  // Copy back, dropping points equal (within EPS) to their predecessor, and
  // carry each kept point's Jacobian row along.
  n = 0;
  for (int i = 0; i < m; i++) {
    if (!i || !(point_same(pp[i], pp[i - 1]))) {
      p[n] = pp[i];
      for (int j = 0; j < 4 * k; j++) {
        cut_grad[4 * k * n + j] = ccur_grad[4 * k * i + j];
      }
      n++;
    }
  }

  // Drop trailing points that coincide with the first one.
  while (n > 1 && point_same(p[n - 1], p[0])) n--;
}

// Signed area of the intersection of triangle (o, a, b) with triangle
// (o, c, d), where o is the origin, plus the gradient of that area with
// respect to a and b accumulated into grad_AB.
//
//   a, b      one edge of the first polygon
//   c, d      one edge of the second polygon
//   grad_AB   in/out: gradient accumulator; the (x, y) pair for vertex v
//             lives at [2v, 2v + 1]
//   order     index of a in the first polygon; b is vertex order + 1,
//             wrapping to vertex 0 when order == convex_n - 1
//   convex_n  number of vertices in the first polygon
//
// Returns 0 without touching grad_AB when either triangle is degenerate
// (cross product zero within EPS). The result is negated when the two
// triangles have opposite orientation.
__device__ inline double intersectArea(Point a, Point b, Point c, Point d,
                                       double* grad_AB, int order,
                                       int convex_n) {
  Point o(0, 0);
  // Set when a and b were swapped, so the gradients must be swapped back.
  int res_flag = 0;
  int s1 = sig(cross(o, a, b));
  int s2 = sig(cross(o, c, d));
  if (s1 == 0 || s2 == 0) return 0.0;
  // Normalise both triangles to positive orientation.
  if (s1 == -1) {
    Point* i = &a;
    Point* j = &b;
    swap1(i, j);
    res_flag = 1;
  }
  if (s2 == -1) {
    Point* i = &c;
    Point* j = &d;
    swap1(i, j);
  }
  Point p[10] = {o, a, b};
  // n0..n3: vertex counts before the first cut and after each of the three.
  int n = 3, n0 = 3, n1, n2, n3;
  // Flattened Jacobians produced by each polygon_cut call.
  double cut_grad1[MAXN] = {};
  double cut_grad2[MAXN] = {};
  double cut_grad3[MAXN] = {};
  // The same Jacobians as 2-D matrices: rows are output coordinates
  // (x0, y0, x1, y1, ...), columns are input coordinates.
  double p1_p_grad[10][10] = {};
  double p2_p1_grad[10][10] = {};
  double p3_p2_grad[10][10] = {};

  // Chained products of the matrices above.
  double p3_p1_grad[10][10] = {};
  double p3_p_grad[10][10] = {};

  // 1
  // Clip against line o -> c and unpack the Jacobian d(p1) / d(p).
  polygon_cut(p, n, o, c, cut_grad1);
  n1 = n;
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < 4 * n0; j++) {
      if (!(j % 2)) {
        p1_p_grad[2 * i][j / 2] = cut_grad1[4 * n0 * i + j];
      } else {
        p1_p_grad[2 * i + 1][j / 2] = cut_grad1[4 * n0 * i + j];
      }
    }
  }

  // 2
  // Clip against line c -> d and unpack the Jacobian d(p2) / d(p1).
  polygon_cut(p, n, c, d, cut_grad2);
  n2 = n;
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < 4 * n1; j++) {
      if (!(j % 2)) {
        p2_p1_grad[2 * i][j / 2] = cut_grad2[4 * n1 * i + j];
      } else {
        p2_p1_grad[2 * i + 1][j / 2] = cut_grad2[4 * n1 * i + j];
      }
    }
  }
  // 3
  // Clip against line d -> o and unpack the Jacobian d(p3) / d(p2).
  polygon_cut(p, n, d, o, cut_grad3);
  n3 = n;
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < 4 * n2; j++) {
      if (!(j % 2)) {
        p3_p2_grad[2 * i][j / 2] = cut_grad3[4 * n2 * i + j];
      } else {
        p3_p2_grad[2 * i + 1][j / 2] = cut_grad3[4 * n2 * i + j];
      }
    }
  }

  // mul
  //  p3_p2(n3 * n2) * p2_p1(n2 * n1) = p3_p1 (n3 * n1)
  for (int i = 0; i < 2 * n3; i++) {
    for (int j = 0; j < 2 * n1; j++) {
      double sum = 0.0;
      for (int m = 0; m < 2 * n2; m++) {
        sum = sum + p3_p2_grad[i][m] * p2_p1_grad[m][j];
      }
      p3_p1_grad[i][j] = sum;
    }
  }

  // p3_p1 (n3 * n1) * p1_p (n1 * n0) = p3_p (n3 * n0)
  for (int i = 0; i < 2 * n3; i++) {
    for (int j = 0; j < 2 * n0; j++) {
      double sum = 0.0;
      for (int m = 0; m < 2 * n1; m++) {
        sum = sum + p3_p1_grad[i][m] * p1_p_grad[m][j];
      }
      p3_p_grad[i][j] = sum;
    }
  }

  // calculate S_grad
  // Identity mapping: every vertex of the clipped polygon gets its own slot
  // in grad_polygon.
  int polygon_index_box_index[20];
  double grad_polygon[20];
  // Gradient of the area w.r.t. (o.x, o.y, a.x, a.y, b.x, b.y), using the
  // possibly swapped a and b.
  double S_grad[6];

  for (int i = 0; i < n3; i++) {
    polygon_index_box_index[i] = i;
    polygon_index_box_index[i + n3] = i;
  }

  double res =
      polygon_area_grad(p, n3, polygon_index_box_index, n3, grad_polygon);

  if (s1 * s2 == -1) {
    // Opposite orientations: the contribution and its gradient are negated.
    for (int j = 0; j < 2 * 3; j++) {
      double sum = 0.0;
      for (int m = 0; m < 2 * n3; m++) {
        sum = sum - grad_polygon[m] * p3_p_grad[m][j];
      }
      S_grad[j] = sum;
    }

    if (order != convex_n - 1) {
      // b is vertex order + 1. If a and b were swapped above, S_grad[2..3]
      // belongs to the original b and S_grad[4..5] to the original a.
      if (res_flag) {
        grad_AB[2 * order] += S_grad[4];
        grad_AB[2 * order + 1] += S_grad[5];
        grad_AB[2 * order + 2] += S_grad[2];
        grad_AB[2 * order + 3] += S_grad[3];

      } else {
        grad_AB[2 * order] += S_grad[2];
        grad_AB[2 * order + 1] += S_grad[3];
        grad_AB[2 * order + 2] += S_grad[4];
        grad_AB[2 * order + 3] += S_grad[5];
      }
    } else {
      // Closing edge: b wraps around to vertex 0.
      if (res_flag) {
        grad_AB[2 * order] += S_grad[4];
        grad_AB[2 * order + 1] += S_grad[5];
        grad_AB[0] += S_grad[2];
        grad_AB[1] += S_grad[3];

      } else {
        grad_AB[2 * order] += S_grad[2];
        grad_AB[2 * order + 1] += S_grad[3];
        grad_AB[0] += S_grad[4];
        grad_AB[1] += S_grad[5];
      }
    }
    res = -res;
  } else {
    // Same orientation: the contribution and its gradient keep their sign.
    for (int j = 0; j < 2 * 3; j++) {
      double sum = 0.0;
      for (int m = 0; m < 2 * n3; m++) {
        sum = sum + grad_polygon[m] * p3_p_grad[m][j];
      }
      S_grad[j] = sum;
    }

    if (order != convex_n - 1) {
      // b is vertex order + 1; same swap handling as in the branch above.
      if (res_flag) {
        grad_AB[2 * order] += S_grad[4];
        grad_AB[2 * order + 1] += S_grad[5];
        grad_AB[2 * order + 2] += S_grad[2];
        grad_AB[2 * order + 3] += S_grad[3];
      } else {
        grad_AB[2 * order] += S_grad[2];
        grad_AB[2 * order + 1] += S_grad[3];
        grad_AB[2 * order + 2] += S_grad[4];
        grad_AB[2 * order + 3] += S_grad[5];
      }
    } else {
      // Closing edge: b wraps around to vertex 0.
      if (res_flag) {
        grad_AB[2 * order] += S_grad[4];
        grad_AB[2 * order + 1] += S_grad[5];
        grad_AB[0] += S_grad[2];
        grad_AB[1] += S_grad[3];
      } else {
        grad_AB[2 * order] += S_grad[2];
        grad_AB[2 * order + 1] += S_grad[3];
        grad_AB[0] += S_grad[4];
        grad_AB[1] += S_grad[5];
      }
    }
  }
  return res;
}

// Intersection area of polygons ps1 (n1 vertices) and ps2 (n2 vertices),
// accumulated edge pair by edge pair, with the gradient with respect to the
// vertices of ps1 accumulated into grad_AB.
//
// Both polygons are reversed in place if their signed area is negative, and
// ps1[n1] / ps2[n2] are overwritten with the first vertex, so each array must
// have room for one extra point.
__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, int n2,
                                        double* grad_AB) {
  // Bring both polygons to non-negative orientation.
  if (area(ps1, n1) < 0) {
    reverse1(ps1, n1);
  }
  if (area(ps2, n2) < 0) {
    reverse1(ps2, n2);
  }

  // Close both rings so edge (k, k + 1) is valid for the last vertex too.
  ps1[n1] = ps1[0];
  ps2[n2] = ps2[0];

  double total = 0;
  for (int e1 = 0; e1 < n1; e1++) {
    for (int e2 = 0; e2 < n2; e2++) {
      total += intersectArea(ps1[e1], ps1[e1 + 1], ps2[e2], ps2[e2 + 1],
                             grad_AB, e1, n1);
    }
  }
  return total;
}

// Replaces in_poly[0..n_poly-1] with the vertices of its convex hull and
// updates n_poly to the hull size. Judging by the name this is a Jarvis-march
// (gift-wrapping) style construction; the hull is built as two chains from
// the lowest point to the highest point and then stitched together.
//
// NOTE(review): right_point / left_point hold 10 points each, so each chain
// is assumed to stay within that size — confirm against the callers.
__device__ inline void Jarvis(Point* in_poly, int& n_poly) {
  Point p_max, p_k;
  int max_index, k_index;
  int Stack[NMAX] = {}, top1, top2;
  double sign;
  Point right_point[10], left_point[10];

  // Move the lowest point (smallest y, then smallest x) to in_poly[0] and
  // track the highest point (largest y, then largest x) in p_max/max_index.
  for (int i = 0; i < n_poly; i++) {
    if (in_poly[i].y < in_poly[0].y ||
        in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
      Point* j = &(in_poly[0]);
      Point* k = &(in_poly[i]);
      swap1(j, k);
    }
    if (i == 0) {
      p_max = in_poly[0];
      max_index = 0;
    }
    if (in_poly[i].y > p_max.y ||
        in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
      p_max = in_poly[i];
      max_index = i;
    }
  }

  // If the highest point ended up at index 0, use index 1 as the target so
  // the loops below have a distinct end point.
  if (max_index == 0) {
    max_index = 1;
    p_max = in_poly[max_index];
  }

  // First chain: from in_poly[0] towards p_max, at each step picking the
  // point with a positive cross product relative to the current candidate
  // (ties on collinear points go to the farther one).
  k_index = 0, Stack[0] = 0, top1 = 0;
  while (k_index != max_index) {
    p_k = p_max;
    k_index = max_index;
    for (int i = 1; i < n_poly; i++) {
      sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
      if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
                                         dis(in_poly[Stack[top1]], p_k)))) {
        p_k = in_poly[i];
        k_index = i;
      }
    }
    top1++;
    Stack[top1] = k_index;
  }
  for (int i = 0; i <= top1; i++) right_point[i] = in_poly[Stack[i]];

  // Second chain: same walk with the opposite turn direction.
  k_index = 0, Stack[0] = 0, top2 = 0;

  while (k_index != max_index) {
    p_k = p_max;
    k_index = max_index;
    for (int i = 1; i < n_poly; i++) {
      sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
      if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
                                        dis(in_poly[Stack[top2]], p_k))) {
        p_k = in_poly[i];
        k_index = i;
      }
    }
    top2++;
    Stack[top2] = k_index;
  }
  // The last entry of the second chain (p_max) is skipped because the first
  // chain already ends with it.
  for (int i = top2 - 1; i >= 0; i--) left_point[i] = in_poly[Stack[i]];

  // Stitch: first chain in order, then the second chain walked backwards,
  // stopping before it returns to the starting point.
  for (int i = 0; i < top1 + top2; i++) {
    if (i <= top1) {
      in_poly[i] = right_point[i];
    } else {
      in_poly[i] = left_point[top2 - (i - top1)];
    }
  }
  n_poly = top1 + top2;
}

// Area of the convex hull of the union of ps1 and ps2, and its gradient with
// respect to the vertices of ps1.
//
//   ps1, n1  first polygon (the one gradients are taken with respect to)
//   ps2, n2  second polygon; modified in place: points that coincide (within
//            EPS) with a point of ps1 are removed before the hull is built
//   grad_C   out: (d area / dx, d area / dy) for ps1 vertex j at
//            [2j, 2j + 1]; sign-flipped over the first 18 entries when the
//            hull's signed area is negative, and zeroed over the first 18
//            entries when no hull vertex comes from ps1
//
// Returns the absolute hull area.
__device__ inline double intersectAreaPoly(Point* ps1, int n1, Point* ps2,
                                           int n2, double* grad_C) {
  Point polygon[MAXN];
  int n_poly = 0;
  // Remove from ps2 every point that duplicates a point of ps1. The scan and
  // the shift are bounded by the *current* n2: slots at or beyond n2 hold
  // stale copies left behind by earlier removals, and matching them would
  // decrement n2 a second time for a point that is already gone.
  for (int i = 0; i < n1; i++) {
    for (int j = 0; j < n2; j++) {
      if (point_same(ps1[i], ps2[j])) {
        for (int k = j; k < n2 - 1; k++) {
          ps2[k] = ps2[k + 1];
        }
        n2--;
        break;
      }
    }
  }
  // Concatenate ps1 and the remaining ps2 points.
  n_poly = n1 + n2;
  for (int i = 0; i < n_poly; i++) {
    if (i < n1) {
      polygon[i] = ps1[i];
    } else {
      polygon[i] = ps2[i - n1];
    }
  }

  Jarvis(polygon, n_poly);

  // First n1 entries: hull vertex index; entries n1.. : matching ps1 index.
  int polygon_to_pred_index[18] = {-1, -1, -1, -1, -1, -1, -1, -1, -1,
                                   -1, -1, -1, -1, -1, -1, -1, -1, -1};
  int n_pred = 0;
  for (int i = 0; i < n_poly; i++) {
    for (int j = 0; j < n1; j++) {
      // Exact comparison: hull vertices are copies of the input points.
      if (polygon[i].x == ps1[j].x && polygon[i].y == ps1[j].y) {
        polygon_to_pred_index[n_pred] = i;
        polygon_to_pred_index[n_pred + n1] = j;
        n_pred += 1;
        break;
      }
    }
  }
  if (n_pred == 0) {
    // No hull vertex belongs to ps1, so the area does not depend on it.
    double polygon_area = fabs(area(polygon, n_poly));
    for (int i = 0; i < 18; i++) {
      grad_C[i] = 0.0;
    }
    return polygon_area;
  } else {
    // n1 (not n_pred) is passed as the table stride because the second half
    // of polygon_to_pred_index was filled at offset n1 above.
    double polygon_area =
        polygon_area_grad(polygon, n_poly, polygon_to_pred_index, n1, grad_C);
    if (polygon_area < 0) {
      for (int i = 0; i < 18; i++) {
        grad_C[i] = -grad_C[i];
      }
    }
    return fabs(polygon_area);
  }
}

// Convex hull of in_poly[0..n_poly-1], computed in place like Jarvis(), that
// additionally reports where each hull vertex came from.
//
//   in_poly               in/out: input points, replaced by the hull vertices
//   n_poly                in/out: number of input points / hull vertices
//   points_to_convex_ind  out: for hull vertex i, the index j of the first
//                         original input point equal to it (within EPS);
//                         entries for which no match is found are untouched
//
// NOTE(review): input_poly / Stack hold 20 entries and right_point /
// left_point hold 10, so the input size is assumed to stay within those
// limits — confirm against the callers.
__device__ inline void Jarvis_and_index(Point* in_poly, int& n_poly,
                                        int* points_to_convex_ind) {
  // Keep a copy of the input in its original order for the index lookup at
  // the end; in_poly itself is reordered below.
  int n_input = n_poly;
  Point input_poly[20];
  for (int i = 0; i < n_input; i++) {
    input_poly[i].x = in_poly[i].x;
    input_poly[i].y = in_poly[i].y;
  }
  Point p_max, p_k;
  int max_index, k_index;
  int Stack[20], top1, top2;
  double sign;
  Point right_point[10], left_point[10];

  // Move the lowest point (smallest y, then smallest x) to in_poly[0] and
  // track the highest point (largest y, then largest x) in p_max/max_index.
  for (int i = 0; i < n_poly; i++) {
    if (in_poly[i].y < in_poly[0].y ||
        in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
      Point* j = &(in_poly[0]);
      Point* k = &(in_poly[i]);
      swap1(j, k);
    }
    if (i == 0) {
      p_max = in_poly[0];
      max_index = 0;
    }
    if (in_poly[i].y > p_max.y ||
        in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
      p_max = in_poly[i];
      max_index = i;
    }
  }
  // If the highest point ended up at index 0, use index 1 as the target so
  // the loops below have a distinct end point.
  if (max_index == 0) {
    max_index = 1;
    p_max = in_poly[max_index];
  }

  // First chain: from in_poly[0] towards p_max, at each step picking the
  // point with a positive cross product relative to the current candidate
  // (ties on collinear points go to the farther one).
  k_index = 0, Stack[0] = 0, top1 = 0;
  while (k_index != max_index) {
    p_k = p_max;
    k_index = max_index;
    for (int i = 1; i < n_poly; i++) {
      sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
      if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
                                         dis(in_poly[Stack[top1]], p_k)))) {
        p_k = in_poly[i];
        k_index = i;
      }
    }
    top1++;
    Stack[top1] = k_index;
  }
  for (int i = 0; i <= top1; i++) {
    right_point[i] = in_poly[Stack[i]];
  }

  // Second chain: same walk with the opposite turn direction.
  k_index = 0, Stack[0] = 0, top2 = 0;

  while (k_index != max_index) {
    p_k = p_max;
    k_index = max_index;
    for (int i = 1; i < n_poly; i++) {
      sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
      if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
                                        dis(in_poly[Stack[top2]], p_k))) {
        p_k = in_poly[i];
        k_index = i;
      }
    }
    top2++;
    Stack[top2] = k_index;
  }

  // The last entry of the second chain (p_max) is skipped because the first
  // chain already ends with it.
  for (int i = top2 - 1; i >= 0; i--) {
    left_point[i] = in_poly[Stack[i]];
  }

  // Stitch: first chain in order, then the second chain walked backwards,
  // stopping before it returns to the starting point.
  for (int i = 0; i < top1 + top2; i++) {
    if (i <= top1) {
      in_poly[i] = right_point[i];
    } else {
      in_poly[i] = left_point[top2 - (i - top1)];
    }
  }
  n_poly = top1 + top2;
  // Map every hull vertex back to its position in the original input.
  for (int i = 0; i < n_poly; i++) {
    for (int j = 0; j < n_input; j++) {
      if (point_same(in_poly[i], input_poly[j])) {
        points_to_convex_ind[i] = j;
        break;
      }
    }
  }
}

// Computes a GIoU-style score between the convex hull of a 9-point set and a
// 4-point polygon, together with the gradient of that score with respect to
// the 9 input points.
//
//   p           18 values: (x, y) for each of the 9 points
//   q           8 values: (x, y) for each of the 4 polygon vertices
//   point_grad  out: 18 values, (d score / dx, d score / dy) per input point,
//               in the original point order; points that are not hull
//               vertices receive 0
//   idx         unused apart from a commented-out debug print
//
// Returns iou - (hull_area - union_area) / hull_area, where hull_area is the
// area of the convex hull enclosing both shapes.
template <typename T>
__device__ inline float devrIoU(T const* const p, T const* const q,
                                T* point_grad, const int idx) {
  Point ps1[MAXN], ps2[MAXN];

  // Convex hull of the 9 input points, plus the mapping from hull vertex
  // back to input point index.
  Point convex[MAXN];
  for (int i = 0; i < 9; i++) {
    convex[i].x = (double)p[i * 2];
    convex[i].y = (double)p[i * 2 + 1];
  }
  int n_convex = 9;
  int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};
  Jarvis_and_index(convex, n_convex, points_to_convex_ind);

  int n1 = n_convex;
  int n2 = 4;

  for (int i = 0; i < n1; i++) {
    ps1[i].x = (double)convex[i].x;
    ps1[i].y = (double)convex[i].y;
  }

  for (int i = 0; i < n2; i++) {
    ps2[i].x = (double)q[i * 2];
    ps2[i].y = (double)q[i * 2 + 1];
  }

  // Identity mapping: hull vertex i writes its area gradient to slot i.
  int polygon_index_box_index[18];
  for (int i = 0; i < n1; i++) {
    polygon_index_box_index[i] = i;
    polygon_index_box_index[i + n1] = i;
  }

  // Per hull vertex gradients of: the hull's own area (grad_A), the
  // intersection area (grad_AB) and the enclosing hull area (grad_C).
  double grad_A[18] = {};
  double grad_AB[18] = {};
  double grad_C[18] = {};

  double inter_area = intersectAreaO(ps1, n1, ps2, n2, grad_AB);
  double S_pred =
      polygon_area_grad(ps1, n1, polygon_index_box_index, n1, grad_A);
  // The absolute area is used below, so flip the gradient when the signed
  // area is negative.
  if (S_pred < 0) {
    for (int i = 0; i < n_convex * 2; i++) {
      grad_A[i] = -grad_A[i];
    }
  }
  double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;

  double iou = inter_area / union_area;
  // Area of the convex hull enclosing both shapes (may drop points from ps2).
  double polygon_area = intersectAreaPoly(ps1, n1, ps2, n2, grad_C);

  //    printf("%d:live\n", idx);
  double rot_giou = iou - (polygon_area - union_area) / polygon_area;

  float grad_point_temp[18] = {};

  // Combine the three area gradients into the score gradient and scatter it
  // from hull-vertex order back to the original point order.
  for (int i = 0; i < n_convex; i++) {
    int grad_point = points_to_convex_ind[i];
    grad_point_temp[2 * grad_point] =
        (float)((union_area + inter_area) / (union_area * union_area) *
                    grad_AB[2 * i] -
                iou / union_area * grad_A[2 * i] -
                1 / polygon_area * (grad_AB[2 * i] - grad_A[2 * i]) -
                (union_area) / polygon_area / polygon_area * grad_C[2 * i]);
    grad_point_temp[2 * grad_point + 1] =
        (float)((union_area + inter_area) / (union_area * union_area) *
                    grad_AB[2 * i + 1] -
                iou / union_area * grad_A[2 * i + 1] -
                1 / polygon_area * (grad_AB[2 * i + 1] - grad_A[2 * i + 1]) -
                (union_area) / polygon_area / polygon_area * grad_C[2 * i + 1]);
  }

  for (int i = 0; i < 9; i++) {
    point_grad[2 * i] = grad_point_temp[2 * i];
    point_grad[2 * i + 1] = grad_point_temp[2 * i + 1];
  }
  return (float)rot_giou;
}

// For every index below ex_n_boxes, runs the four-argument devrIoU on the
// 18-value point set and the 8-value ground-truth box that share that index.
// Each output row of point_grad has 19 slots: devrIoU receives the row pointer
// as its gradient buffer and its return value is stored in slot 18.
// gt_n_boxes is accepted but not read here.
template <typename T>
__global__ void convex_giou_cuda_kernel(const int ex_n_boxes,
                                        const int gt_n_boxes, const T* ex_boxes,
                                        const T* gt_boxes, T* point_grad) {
  CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
    const T* points = &ex_boxes[index * 18];
    const T* gt_box = &gt_boxes[index * 8];
    T* out_row = &point_grad[index * 19];

    const T giou_value = devrIoU(points, gt_box, out_row, threadIdx.x);
    out_row[18] = giou_value;
  }
}

// Intersects line (a, b) with line (c, d) using the cross(a, b, .) values of
// c and d.
// Returns 2 when both values are zero according to sig() (presumably the
// lines coincide), 0 when their difference is zero according to sig()
// (presumably parallel lines), and 1 otherwise. Only in the last case is the
// intersection written to p; p is left untouched for return values 0 and 2.
__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p) {
  const double side_c = cross(a, b, c);
  const double side_d = cross(a, b, d);

  if (sig(side_c) == 0 && sig(side_d) == 0) {
    return 2;
  }
  if (sig(side_d - side_c) == 0) {
    return 0;
  }

  p.x = (c.x * side_d - d.x * side_c) / (side_d - side_c);
  p.y = (c.y * side_d - d.y * side_c) / (side_d - side_c);
  return 1;
}

// Clips the polygon p[0..n) against the line through a and b, in place.
// Vertices v with sig(cross(a, b, v)) > 0 are kept, and for every edge whose
// two end points have different sig() values the point produced by lineCross
// is inserted. Consecutive duplicates (per point_same) and trailing vertices
// equal to the first one are removed. n is updated to the new vertex count.
// p needs one spare slot at index n: it is overwritten with a copy of p[0].
__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b) {
  Point clipped[MAXN];
  int num_clipped = 0;

  // Close the ring so that p[k + 1] is valid for the last edge as well.
  p[n] = p[0];
  for (int k = 0; k < n; ++k) {
    const int side_here = sig(cross(a, b, p[k]));
    const int side_next = sig(cross(a, b, p[k + 1]));

    if (side_here > 0) {
      clipped[num_clipped] = p[k];
      ++num_clipped;
    }
    if (side_here != side_next) {
      // The slot is consumed even if lineCross does not report a unique
      // intersection (its return value is ignored).
      lineCross(a, b, p[k], p[k + 1], clipped[num_clipped]);
      ++num_clipped;
    }
  }

  // Copy back, skipping any vertex identical to its predecessor.
  n = 0;
  for (int k = 0; k < num_clipped; ++k) {
    if (k > 0 && point_same(clipped[k], clipped[k - 1])) {
      continue;
    }
    p[n] = clipped[k];
    ++n;
  }

  // Drop trailing vertices that coincide with the first one.
  while (n > 1 && point_same(p[n - 1], p[0])) {
    --n;
  }
}

// Signed area of the overlap between triangle (origin, a, b) and triangle
// (origin, c, d), where origin is (0, 0).
// Returns 0 when either triangle has a zero sig(cross(...)) value. Otherwise
// each pair whose sign is -1 is passed to swap1, triangle (origin, a, b) is
// clipped against the three edges of (origin, c, d) with polygon_cut, and the
// area of the result is negated when the two original signs were opposite.
__device__ inline double intersectArea(Point a, Point b, Point c, Point d) {
  Point origin(0, 0);
  const int sign_ab = sig(cross(origin, a, b));
  const int sign_cd = sig(cross(origin, c, d));
  if (sign_ab == 0 || sign_cd == 0) {
    return 0.0;
  }

  if (sign_ab == -1) {
    Point* first = &a;
    Point* second = &b;
    swap1(first, second);
  }
  if (sign_cd == -1) {
    Point* first = &c;
    Point* second = &d;
    swap1(first, second);
  }

  // Start from triangle (origin, a, b); the extra slots leave room for the
  // vertices polygon_cut adds while clipping.
  Point clip_poly[10] = {origin, a, b};
  int num_vertices = 3;
  polygon_cut(clip_poly, num_vertices, origin, c);
  polygon_cut(clip_poly, num_vertices, c, d);
  polygon_cut(clip_poly, num_vertices, d, origin);

  const double overlap = area(clip_poly, num_vertices);
  if (sign_ab * sign_cd == -1) {
    return -overlap;
  }
  return overlap;
}
// Intersection area of polygons ps1[0..n1) and ps2[0..n2), obtained by summing
// intersectArea over every pair of edges (one edge from each polygon).
// Both inputs are modified: a polygon whose area() is negative is passed to
// reverse1, and slot n1 / n2 of each array is overwritten with a copy of its
// first vertex, so each array needs one spare element.
__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2,
                                        int n2) {
  if (area(ps1, n1) < 0) {
    reverse1(ps1, n1);
  }
  if (area(ps2, n2) < 0) {
    reverse1(ps2, n2);
  }

  // Close both rings so that edge k is always (ps[k], ps[k + 1]).
  ps1[n1] = ps1[0];
  ps2[n2] = ps2[0];

  double total = 0;
  for (int e1 = 0; e1 < n1; ++e1) {
    for (int e2 = 0; e2 < n2; ++e2) {
      total += intersectArea(ps1[e1], ps1[e1 + 1], ps2[e2], ps2[e2 + 1]);
    }
  }
  return total;
}

// IoU between the polygon that Jarvis_and_index derives from the 9 points in
// p (18 values, x/y interleaved) and the 4-vertex polygon in q (8 values,
// x/y interleaved). The result is intersection / (|A| + |B| - intersection),
// computed in double and returned as float.
template <typename T>
__device__ inline float devrIoU(T const* const p, T const* const q) {
  Point pred_poly[MAXN], gt_poly[MAXN];
  Point hull[MAXN];

  // Load the 9 input points.
  for (int k = 0; k < 9; ++k) {
    hull[k].x = (double)p[2 * k];
    hull[k].y = (double)p[2 * k + 1];
  }

  // Jarvis_and_index rewrites hull and num_hull; the index map it fills in is
  // not used by this overload.
  int num_hull = 9;
  int hull_index_map[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};
  Jarvis_and_index(hull, num_hull, hull_index_map);

  const int num_pred = num_hull;
  for (int k = 0; k < num_pred; ++k) {
    pred_poly[k].x = (double)hull[k].x;
    pred_poly[k].y = (double)hull[k].y;
  }

  const int num_gt = 4;
  for (int k = 0; k < num_gt; ++k) {
    gt_poly[k].x = (double)q[2 * k];
    gt_poly[k].y = (double)q[2 * k + 1];
  }

  const double inter_area =
      intersectAreaO(pred_poly, num_pred, gt_poly, num_gt);
  const double pred_area = area(pred_poly, num_pred);
  const double gt_area = area(gt_poly, num_gt);
  const double union_area = fabs(pred_area) + fabs(gt_area) - inter_area;

  return (float)(inter_area / union_area);
}

// Fills the ex_n_boxes x gt_n_boxes matrix iou (row-major): entry
// (index, g) is devrIoU of point set `index` (18 values) and ground-truth
// box g (8 values).
template <typename T>
__global__ void convex_iou_cuda_kernel(const int ex_n_boxes,
                                       const int gt_n_boxes, const T* ex_boxes,
                                       const T* gt_boxes, T* iou) {
  CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
    const T* points = &ex_boxes[index * 18];
    for (int g = 0; g < gt_n_boxes; ++g) {
      const T* gt_box = &gt_boxes[g * 8];
      iou[index * gt_n_boxes + g] = devrIoU(points, gt_box);
    }
  }
}
#endif  // CONVEX_IOU_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/correlation_cuda.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/ClementPinard/Pytorch-Correlation-extension/blob/master/Correlation_Module/correlation_cuda_kernel.cu
// Original licence: Under MIT License

#ifndef CORRELATION_CUDA
#define CORRELATION_CUDA

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

#include <cuda.h>
#include <cuda_runtime.h>
// Using <torch/extension.h> is recommended in the official documentation in
// https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-the-c-op.
// However, we use <torch/types.h> for compatibility with CUDA 9.0
// Read https://github.com/pytorch/extension-cpp/issues/35 for more details.
#include <torch/types.h>

#include <iostream>
#include <vector>

using namespace torch;

#define TensorAcc4R PackedTensorAccessor32<scalar_t, 4, RestrictPtrTraits>
#define TensorAcc5R PackedTensorAccessor32<scalar_t, 5, RestrictPtrTraits>
#define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W)

#define WARP_SIZE 32
#define FULL_MASK 0xffffffff

// Forward pass of the correlation op.
//
// For every output location (n, h, w) and every patch displacement (ph, pw)
// this accumulates rInput1[n][i1][j1][c] * rInput2[n][i2][j2][c] over the
// kH x kW kernel window and over all channels, and stores the sum in
// output[n][ph][pw][h][w]. Both inputs are indexed as [n][row][col][channel].
//
// Thread mapping (from the index expressions below): blockIdx.x selects the
// batch item, the y and z block/thread indices select h and w, and
// threadIdx.x selects the channels a thread handles (starting at threadIdx.x
// and stepping by WARP_SIZE).
// NOTE(review): the channel stride and the shuffle reduction only visit every
// channel exactly once if blockDim.x == WARP_SIZE -- confirm against the
// launch configuration.
template <typename scalar_t>
__global__ void correlation_forward_cuda_kernel(
    const TensorAcc4R rInput1, const TensorAcc4R rInput2, TensorAcc5R output,
    int kH, int kW, int patchH, int patchW, int padH, int padW, int dilationH,
    int dilationW, int dilation_patchH, int dilation_patchW, int dH, int dW,
    int oH, int oW) {
  const int iH = rInput1.size(1);
  const int iW = rInput1.size(2);
  const int C = rInput1.size(3);

  const int n = blockIdx.x;
  const int h = blockIdx.y * blockDim.y + threadIdx.y;
  const int w = blockIdx.z * blockDim.z + threadIdx.z;

  // Threads mapped past the output extent have nothing to do.
  if (h >= oH || w >= oW) return;

  const int thread = threadIdx.x;

  // Top-left input coordinate of the kernel window for this output location.
  const int start_i = -padH + h * dH;
  const int start_j = -padW + w * dW;

  // Offset that centres the dilated patch; the product is formed before the
  // integer division by 2.
  const int patchRadH = dilation_patchH * (patchH - 1) / 2;
  const int patchRadW = dilation_patchW * (patchW - 1) / 2;

  for (int ph = 0; ph < patchH; ++ph) {
    int ph_dilated = ph * dilation_patchH - patchRadH;
    for (int pw = 0; pw < patchW; ++pw) {
      int pw_dilated = pw * dilation_patchW - patchRadW;
      scalar_t prod_sum = 0.0f;
      for (int i = 0; i < kH; ++i) {
        // i1 is the row read from rInput1, i2 the displaced row in rInput2.
        int i1 = start_i + i * dilationH;
        int i2 = i1 + ph_dilated;
        // Both rows must lie in [0, iH); otherwise the term is skipped.
        if (WITHIN_BOUNDS(i1, i2, iH, iH)) {
          for (int j = 0; j < kW; ++j) {
            int j1 = start_j + j * dilationW;
            int j2 = j1 + pw_dilated;
            // Both columns must lie in [0, iW).
            if (WITHIN_BOUNDS(j1, j2, iW, iW)) {
              for (int c = thread; c < C; c += WARP_SIZE) {
                scalar_t v1 = rInput1[n][i1][j1][c];
                scalar_t v2 = rInput2[n][i2][j2][c];
                prod_sum += v1 * v2;
              }
            }
          }
        }
      }
      // accumulate
      // Adds up the per-thread partial sums with shuffle-down steps at
      // offsets 16, 8, 4, 2, 1. The exchanged value is converted to float, so
      // for a scalar_t wider than float the partial sums received from other
      // threads are narrowed to float precision.
      for (int offset = 16; offset > 0; offset /= 2)
#ifdef MMCV_WITH_HIP
        prod_sum += __shfl_down(float(prod_sum), offset);
#else
        prod_sum += __shfl_down_sync(FULL_MASK, float(prod_sum), offset);
#endif
      // Only the thread with threadIdx.x == 0 writes the reduced value.
      if (thread == 0) {
        output[n][ph][pw][h][w] = prod_sum;
      }
    }
  }
}

// Backward pass of the correlation op with respect to the first input.
//
// One block handles one (n, h, w) location of grad_input1 (blockIdx.x/y/z).
// Phase 1: the threads of the block fill grad_cache[ph * patchW + pw] with the
//   sum of grad_output[n][ph][pw][i2][j2] over all output locations (i2, j2)
//   whose kernel window covers (h, w).
// Phase 2: each thread handles the channels c = threadIdx.x, threadIdx.x +
//   blockDim.x, ... and writes
//   grad_input1[n][c][h][w] = sum_{ph,pw} input2[n][i1][j1][c] * grad_cache.
//
// input2 is indexed as [n][row][col][channel]; grad_input1 as
// [n][channel][row][col]. The dynamic shared memory must hold at least
// patchH * patchW values of scalar_t.
//
// The patch displacement uses the same expression as
// correlation_forward_cuda_kernel (ph * dilation_patchH - patchRadH with
// patchRadH = dilation_patchH * (patchH - 1) / 2). Computing it as
// dilation_patchH * (ph - (patchH - 1) / 2) gives a different displacement
// for even patch sizes combined with a patch dilation > 1, which would make
// the gradient sample other pixels than the forward pass did.
template <typename scalar_t>
__global__ void correlation_backward_cuda_kernel_input1(
    const TensorAcc5R grad_output, const TensorAcc4R input2,
    TensorAcc4R grad_input1, const int kH, const int kW, const int patchH,
    const int patchW, const int padH, const int padW, const int dilationH,
    const int dilationW, const int dilation_patchH, const int dilation_patchW,
    const int dH, const int dW) {
  const int iH = input2.size(1);
  const int iW = input2.size(2);
  const int C = input2.size(3);

  const int H = grad_output.size(3);
  const int W = grad_output.size(4);

  // Centring offset of the dilated patch, identical to the forward kernel.
  const int patchRadH = dilation_patchH * (patchH - 1) / 2;
  const int patchRadW = dilation_patchW * (patchW - 1) / 2;

  const int n = blockIdx.x;
  const int h = blockIdx.y;
  const int w = blockIdx.z;

  // (h_2, w_2) is (h, w) in padded coordinates; kernel taps reach back from
  // there down to (but excluding) min_h / min_w.
  const int h_2 = h + padH;
  const int w_2 = w + padW;
  const int min_h = h_2 - kH * dilationH;
  const int min_w = w_2 - kW * dilationW;

  // NOTE(review): sizeof(4) is the size of an int literal, not
  // sizeof(scalar_t); confirm the resulting alignment is sufficient when
  // scalar_t is double.
  extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];
  scalar_t *grad_cache = reinterpret_cast<scalar_t *>(grad_cache_char);
  for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {
    const int ph = i / patchW;
    const int pw = i % patchW;
    // Position in input2 that was paired with (h, w) for this displacement.
    int i1 = h + dilation_patchH * ph - patchRadH;
    int j1 = w + dilation_patchW * pw - patchRadW;

    // Cache entries for out-of-range positions are left unwritten; phase 2
    // applies the same test and never reads them.
    if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
      scalar_t grad_val = 0.0f;
      for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
        // Only padded rows that are exact multiples of the stride correspond
        // to an output row.
        int i2 = (h_3) / dH;
        if (i2 * dH != h_3) continue;
        for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
          int j2 = (w_3) / dW;
          if (j2 * dW != w_3) continue;
          if (WITHIN_BOUNDS(i2, j2, H, W)) {
            grad_val += grad_output[n][ph][pw][i2][j2];
          }
        }
      }
      grad_cache[i] = grad_val;
    }
  }
  // All cache entries must be complete before any thread consumes them.
  __syncthreads();

  for (int c = threadIdx.x; c < C; c += blockDim.x) {
    scalar_t grad_input_val = 0.0f;
    for (int ph = 0; ph < patchH; ++ph) {
      int i1 = h + dilation_patchH * ph - patchRadH;
      for (int pw = 0; pw < patchW; ++pw) {
        int j1 = w + dilation_patchW * pw - patchRadW;
        if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
          grad_input_val += input2[n][i1][j1][c] * grad_cache[ph * patchW + pw];
        }
      }
    }
    grad_input1[n][c][h][w] = grad_input_val;
  }
}

// Backward pass of the correlation op with respect to the second input.
//
// One block handles one (n, h, w) location of grad_input2 (blockIdx.x/y/z).
// Phase 1: for every patch displacement (ph, pw) the position (i1, j1) of the
//   first input that was paired with (h, w) is computed, and
//   grad_cache[ph * patchW + pw] receives the sum of
//   grad_output[n][ph][pw][i2][j2] over all output locations (i2, j2) whose
//   kernel window covers (i1, j1).
// Phase 2: each thread handles the channels c = threadIdx.x, threadIdx.x +
//   blockDim.x, ... and writes
//   grad_input2[n][c][h][w] = sum_{ph,pw} input1[n][i1][j1][c] * grad_cache.
//
// input1 is indexed as [n][row][col][channel]; grad_input2 as
// [n][channel][row][col]. The dynamic shared memory must hold at least
// patchH * patchW values of scalar_t.
//
// The patch displacement uses the same expression as
// correlation_forward_cuda_kernel (ph * dilation_patchH - patchRadH with
// patchRadH = dilation_patchH * (patchH - 1) / 2), applied with the opposite
// sign. Computing it as dilation_patchH * (ph - (patchH - 1) / 2) gives a
// different displacement for even patch sizes combined with a patch dilation
// > 1, which would make the gradient sample other pixels than the forward
// pass did.
template <typename scalar_t>
__global__ void correlation_backward_cuda_kernel_input2(
    const TensorAcc5R grad_output, const TensorAcc4R input1,
    TensorAcc4R grad_input2, int kH, int kW, int patchH, int patchW, int padH,
    int padW, int dilationH, int dilationW, int dilation_patchH,
    int dilation_patchW, int dH, int dW) {
  const int iH = input1.size(1);
  const int iW = input1.size(2);
  const int C = input1.size(3);

  // Centring offset of the dilated patch, identical to the forward kernel.
  const int patchRadH = dilation_patchH * (patchH - 1) / 2;
  const int patchRadW = dilation_patchW * (patchW - 1) / 2;

  const int H = grad_output.size(3);
  const int W = grad_output.size(4);

  const int dilatedKH = kH * dilationH;
  const int dilatedKW = kW * dilationW;

  const int n = blockIdx.x;
  const int h = blockIdx.y;
  const int w = blockIdx.z;

  // NOTE(review): sizeof(4) is the size of an int literal, not
  // sizeof(scalar_t); confirm the resulting alignment is sufficient when
  // scalar_t is double.
  extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];
  scalar_t *grad_cache = reinterpret_cast<scalar_t *>(grad_cache_char);
  for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {
    const int ph = i / patchW;
    const int pw = i % patchW;
    // Position in input1 whose displaced partner is (h, w).
    int i1 = h - (dilation_patchH * ph - patchRadH);
    int j1 = w - (dilation_patchW * pw - patchRadW);

    // Cache entries for out-of-range positions are left unwritten; phase 2
    // applies the same test and never reads them.
    if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
      scalar_t grad_val = 0.0f;

      // (h_2, w_2) is (i1, j1) in padded coordinates; kernel taps reach back
      // from there down to (but excluding) min_h / min_w.
      const int h_2 = i1 + padH;
      const int w_2 = j1 + padW;
      const int min_h = h_2 - dilatedKH;
      const int min_w = w_2 - dilatedKW;

      for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
        // Only padded rows that are exact multiples of the stride correspond
        // to an output row.
        int i2 = (h_3) / dH;
        if (i2 * dH != h_3) continue;
        for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
          int j2 = (w_3) / dW;
          if (j2 * dW != w_3) continue;
          if (WITHIN_BOUNDS(i2, j2, H, W)) {
            grad_val += grad_output[n][ph][pw][i2][j2];
          }
        }
      }
      grad_cache[i] = grad_val;
    }
  }
  // All cache entries must be complete before any thread consumes them.
  __syncthreads();

  for (int c = threadIdx.x; c < C; c += blockDim.x) {
    scalar_t grad_input_val = 0.0f;
    for (int ph = 0; ph < patchH; ++ph) {
      int i1 = h - (dilation_patchH * ph - patchRadH);
      for (int pw = 0; pw < patchW; ++pw) {
        int j1 = w - (dilation_patchW * pw - patchRadW);
        if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
          grad_input_val += input1[n][i1][j1][c] * grad_cache[ph * patchW + pw];
        }
      }
    }
    grad_input2[n][c][h][w] = grad_input_val;
  }
}
#endif


================================================
FILE: mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh
================================================
/*!
 ******************* BEGIN Caffe Copyright Notice and Disclaimer
 *****************
 *
 * COPYRIGHT
 *
 * All contributions by the University of California:
 * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
 * All rights reserved.
 *
 * All other contributions:
 * Copyright (c) 2014-2017, the respective contributors
 * All rights reserved.
 *
 * Caffe uses a shared copyright model: each contributor holds copyright over
 * their contributions to Caffe. The project versioning records all such
 * contribution and copyright details. If a contributor wants to further mark
 * their specific copyright on a particular contribution, they should indicate
 * their copyright solely in the commit message of the change when it is
 * committed.
 *
 * LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
 *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * CONTRIBUTION AGREEMENT
 *
 * By contributing to the BVLC/caffe repository through pull-request, comment,
 * or otherwise, the contributor releases their content to the
 * license and copyright terms herein.
 *
 ***************** END Caffe Copyright Notice and Disclaimer
 *********************
 *
 * Copyright (c) 2018 Microsoft
 * Licensed under The MIT License [see LICENSE for details]
 * \file modulated_deformable_im2col.cuh
 * \brief Function definitions of converting an image to
 * column matrix based on kernel, padding, dilation, and offset.
 * These functions are mainly used in deformable convolution operators.
 * \ref: https://arxiv.org/abs/1703.06211
 * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
 */

// modified from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu

#ifndef DEFORM_CONV_CUDA_KERNEL_CUH
#define DEFORM_CONV_CUDA_KERNEL_CUH

#include <float.h>
#ifdef MMCV_WITH_TRT
#include "common_cuda_helper.hpp"
#else  // MMCV_WITH_TRT
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else  // MMCV_USE_PARROTS
#include "pytorch_cuda_helper.hpp"
#endif  // MMCV_USE_PARROTS
#endif  // MMCV_WITH_TRT

// Bilinearly samples a height x width image (row pitch data_width) at the
// fractional position (h, w). Returns 0 when the position is at or beyond one
// pixel outside the image; neighbours that fall outside the image contribute
// 0 to the interpolation.
template <typename T>
__device__ T deformable_im2col_bilinear(const T *input, const int data_width,
                                        const int height, const int width, T h,
                                        T w) {
  if (h <= -1 || h >= height || w <= -1 || w >= width) {
    return 0;
  }

  // Integer corners surrounding the sample position.
  const int top = floorf(h);
  const int left = floorf(w);
  const int bottom = top + 1;
  const int right = left + 1;

  // Fractional distances to the top/left corner and their complements.
  const T frac_h = h - top;
  const T frac_w = w - left;
  const T rem_h = 1 - frac_h;
  const T rem_w = 1 - frac_w;

  const bool top_ok = top >= 0;
  const bool left_ok = left >= 0;
  const bool bottom_ok = bottom <= height - 1;
  const bool right_ok = right <= width - 1;

  T top_left = 0, top_right = 0, bottom_left = 0, bottom_right = 0;
  if (top_ok) {
    if (left_ok) top_left = input[top * data_width + left];
    if (right_ok) top_right = input[top * data_width + right];
  }
  if (bottom_ok) {
    if (left_ok) bottom_left = input[bottom * data_width + left];
    if (right_ok) bottom_right = input[bottom * data_width + right];
  }

  const T wt_tl = rem_h * rem_w;
  const T wt_tr = rem_h * frac_w;
  const T wt_bl = frac_h * rem_w;
  const T wt_br = frac_h * frac_w;

  return (wt_tl * top_left + wt_tr * top_right + wt_bl * bottom_left +
          wt_br * bottom_right);
}

// Bilinear weight that the integer pixel (h, w) receives from a sample taken
// at the fractional position (argmax_h, argmax_w). Returns 0 when the sample
// position is at or beyond one pixel outside the height x width image, or
// when (h, w) is not one of the four corners surrounding the sample.
template <typename T>
__device__ T get_gradient_weight(T argmax_h, T argmax_w, const int h,
                                 const int w, const int height,
                                 const int width) {
  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
      argmax_w >= width) {
    return 0;
  }

  const int row_low = floorf(argmax_h);
  const int col_low = floorf(argmax_w);
  const int row_high = row_low + 1;
  const int col_high = col_low + 1;

  // The four corner tests are mutually exclusive because low != high.
  T weight = 0;
  if (h == row_low && w == col_low) {
    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
  } else if (h == row_low && w == col_high) {
    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
  } else if (h == row_high && w == col_low) {
    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
  } else if (h == row_high && w == col_high) {
    weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
  }
  return weight;
}

// Derivative of the bilinear sample of im_data (height x width, row pitch
// data_width) at (argmax_h, argmax_w) with respect to one coordinate:
// bp_dir == 0 differentiates along h, bp_dir == 1 along w; any other bp_dir
// yields 0. Returns 0 when the position is at or beyond one pixel outside the
// image; corners outside the image are skipped.
template <typename T>
__device__ T get_coordinate_weight(T argmax_h, T argmax_w, const int height,
                                   const int width, const T *im_data,
                                   const int data_width, const int bp_dir) {
  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
      argmax_w >= width) {
    return 0;
  }

  const int argmax_h_low = floorf(argmax_h);
  const int argmax_w_low = floorf(argmax_w);
  const int argmax_h_high = argmax_h_low + 1;
  const int argmax_w_high = argmax_w_low + 1;

  // Which of the four surrounding corners lie inside the image.
  const bool row_low_ok = argmax_h_low >= 0;
  const bool row_high_ok = argmax_h_high <= height - 1;
  const bool col_low_ok = argmax_w_low >= 0;
  const bool col_high_ok = argmax_w_high <= width - 1;

  // Flat indices of the corners; only dereferenced when the corner is valid.
  const int idx_ll = argmax_h_low * data_width + argmax_w_low;
  const int idx_lh = argmax_h_low * data_width + argmax_w_high;
  const int idx_hl = argmax_h_high * data_width + argmax_w_low;
  const int idx_hh = argmax_h_high * data_width + argmax_w_high;

  T weight = 0;
  switch (bp_dir) {
    case 0:
      // d/dh: low-row corners enter negatively, high-row corners positively.
      if (row_low_ok && col_low_ok)
        weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[idx_ll];
      if (row_low_ok && col_high_ok)
        weight += -1 * (argmax_w - argmax_w_low) * im_data[idx_lh];
      if (row_high_ok && col_low_ok)
        weight += (argmax_w_low + 1 - argmax_w) * im_data[idx_hl];
      if (row_high_ok && col_high_ok)
        weight += (argmax_w - argmax_w_low) * im_data[idx_hh];
      break;
    case 1:
      // d/dw: low-column corners enter negatively, high-column positively.
      if (row_low_ok && col_low_ok)
        weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[idx_ll];
      if (row_low_ok && col_high_ok)
        weight += (argmax_h_low + 1 - argmax_h) * im_data[idx_lh];
      if (row_high_ok && col_low_ok)
        weight += -1 * (argmax_h - argmax_h_low) * im_data[idx_hl];
      if (row_high_ok && col_high_ok)
        weight += (argmax_h - argmax_h_low) * im_data[idx_hh];
      break;
    default:
      break;
  }

  return weight;
}

// Deformable im2col: for each index in [0, n), decoded as
// (c_im, b_col, h_col, w_col) with w_col varying fastest, samples the input
// channel c_im of batch item b_col at the kernel_h x kernel_w tap positions,
// each shifted by its learned (offset_h, offset_w), and writes the sampled
// values to data_col. Taps that fall at or beyond one pixel outside the image
// produce 0.
template <typename T>
__global__ void deformable_im2col_gpu_kernel(
    const int n, const T *data_im, const T *data_offset, const int height,
    const int width, const int kernel_h, const int kernel_w, const int pad_h,
    const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int num_channels, const int deformable_group, const int height_col,
    const int width_col, T *data_col) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    // Peel the coordinates off the flat index, fastest dimension first.
    const int w_col = index % width_col;
    int rest = index / width_col;
    const int h_col = rest % height_col;
    rest /= height_col;
    const int b_col = rest % batch_size;
    const int c_im = rest / batch_size;

    // Each input channel expands into kernel_h * kernel_w column channels.
    const int c_col = c_im * kernel_h * kernel_w;
    const int group = c_im / channel_per_deformable_group;

    // Top-left input coordinate of the (undeformed) kernel window.
    const int h_in = h_col * stride_h - pad_h;
    const int w_in = w_col * stride_w - pad_w;

    const T *im_plane =
        data_im + (b_col * num_channels + c_im) * height * width;
    const T *offsets =
        data_offset + (b_col * deformable_group + group) * 2 * kernel_h *
                          kernel_w * height_col * width_col;
    T *col_out =
        data_col +
        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
    // Distance in data_col between two consecutive kernel taps.
    const int tap_stride = batch_size * height_col * width_col;

    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        // Offsets are stored as interleaved (h, w) channel pairs per tap.
        const int tap = i * kernel_w + j;
        const int off_h_idx =
            ((2 * tap) * height_col + h_col) * width_col + w_col;
        const int off_w_idx =
            ((2 * tap + 1) * height_col + h_col) * width_col + w_col;
        const T offset_h = offsets[off_h_idx];
        const T offset_w = offsets[off_w_idx];

        const T h_im = h_in + i * dilation_h + offset_h;
        const T w_im = w_in + j * dilation_w + offset_w;

        T sampled = static_cast<T>(0);
        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
          sampled = deformable_im2col_bilinear(im_plane, width, height, width,
                                               h_im, w_im);
        }
        *col_out = sampled;
        col_out += tap_stride;
      }
    }
  }
}

// Deformable col2im: scatters the column gradient data_col back onto the
// image gradient grad_im.
//
// Each index in [0, n) is decoded as (c, i, j, b, h_out, w_out) with w_out
// varying fastest, i.e. one kernel tap (i, j) of channel c at one output
// location. The tap's deformed sampling position is recomputed from
// data_offset and data_col[index] is distributed, weighted by
// get_gradient_weight, to the image pixels within a distance < 1 of that
// position. grad_im is accumulated into with atomicAdd, so it is expected to
// be initialised by the caller.
template <typename T>
__global__ void deformable_col2im_gpu_kernel(
    const int n, const T *data_col, const T *data_offset, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int deformable_group, const int height_col, const int width_col,
    T *grad_im) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    // Kernel tap (i, j) and image channel c encoded in the upper part of the
    // flat index.
    const int j = (index / width_col / height_col / batch_size) % kernel_w;
    const int i =
        (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
    const int c =
        index / width_col / height_col / batch_size / kernel_w / kernel_h;
    // compute the start and end of the output

    const int deformable_group_index = c / channel_per_deformable_group;

    // Output location and batch item encoded in the lower part of the index.
    int w_out = index % width_col;
    int h_out = (index / width_col) % height_col;
    int b = (index / width_col / height_col) % batch_size;
    // Top-left input coordinate of the (undeformed) kernel window.
    int w_in = w_out * stride_w - pad_w;
    int h_in = h_out * stride_h - pad_h;

    // Offsets are stored as interleaved (h, w) channel pairs per kernel tap.
    const T *data_offset_ptr =
        data_offset + (b * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;
    const int data_offset_h_ptr =
        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
    const int data_offset_w_ptr =
        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
    const T offset_h = data_offset_ptr[data_offset_h_ptr];
    const T offset_w = data_offset_ptr[data_offset_w_ptr];
    // Fractional image position this tap sampled from.
    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;

    const T cur_top_grad = data_col[index];
    // The casts truncate toward zero, so for negative positions cur_h/cur_w
    // are not the floor; the +/-2 window below is wide enough to still reach
    // every pixel within distance < 1.
    const int cur_h = (int)cur_inv_h_data;
    const int cur_w = (int)cur_inv_w_data;
    for (int dy = -2; dy <= 2; dy++) {
      for (int dx = -2; dx <= 2; dx++) {
        // Keep only in-image pixels closer than 1 to the sample position in
        // both directions.
        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
            abs(cur_inv_w_data - (cur_w + dx)) < 1) {
          int cur_bottom_grad_pos =
              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
          T weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data,
                                         cur_h + dy, cur_w + dx, height, width);
          // Accumulated atomically (presumably because different indices can
          // target the same grad_im element).
          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
        }
      }
    }
  }
}

// Gradient of the deformable convolution with respect to the sampling
// offsets.
//
// Each index in [0, n) is decoded as (b, c, h, w) with w varying fastest,
// where c runs over offset_channels; grad_offset[index] is the gradient of
// that single offset value. It is the sum, over the column channels that use
// this offset, of data_col[col_pos] times the derivative of the bilinear
// sample with respect to the offset (get_coordinate_weight).
template <typename T>
__global__ void deformable_col2im_coord_gpu_kernel(
    const int n, const T *data_col, const T *data_im, const T *data_offset,
    const int channels, const int height, const int width, const int kernel_h,
    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int offset_channels, const int deformable_group, const int height_col,
    const int width_col, T *grad_offset) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    T val = 0;
    int w = index % width_col;
    int h = (index / width_col) % height_col;
    int c = (index / width_col / height_col) % offset_channels;
    int b = (index / width_col / height_col) / offset_channels;
    // compute the start and end of the output

    // Every deformable group owns 2 * kernel_h * kernel_w offset channels.
    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
    // Column channels that share one kernel tap are col_step apart.
    const int col_step = kernel_h * kernel_w;
    // Counts loop iterations; used to advance data_im_ptr by one
    // height * width plane per iteration.
    int cnt = 0;
    // Start of this deformable group's channels in data_col.
    const T *data_col_ptr = data_col + deformable_group_index *
                                           channel_per_deformable_group *
                                           batch_size * width_col * height_col;
    // NOTE(review): the division by kernel_h and kernel_w implies that
    // channel_per_deformable_group counts column channels here (image
    // channels times kernel size) -- confirm against the caller.
    const T *data_im_ptr =
        data_im + (b * deformable_group + deformable_group_index) *
                      channel_per_deformable_group / kernel_h / kernel_w *
                      height * width;
    const T *data_offset_ptr =
        data_offset + (b * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;

    // Offset channel relative to the start of its deformable group.
    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;

    // offset_c / 2 is the kernel tap this offset belongs to; visit every
    // column channel of the group that corresponds to that tap.
    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
         col_c += col_step) {
      const int col_pos =
          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
      // Even offset channels hold the h offset (bp_dir 0), odd ones the w
      // offset (bp_dir 1).
      const int bp_dir = offset_c % 2;

      // Recover the kernel tap (i, j) and output location from col_pos.
      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
      int i =
          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
      int w_out = col_pos % width_col;
      int h_out = (col_pos / width_col) % height_col;
      int w_in = w_out * stride_w - pad_w;
      int h_in = h_out * stride_h - pad_h;
      const int data_offset_h_ptr =
          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
      const int data_offset_w_ptr =
          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
           w_out);
      const T offset_h = data_offset_ptr[data_offset_h_ptr];
      const T offset_w = data_offset_ptr[data_offset_w_ptr];
      // Fractional image position this tap sampled from.
      T inv_h = h_in + i * dilation_h + offset_h;
      T inv_w = w_in + j * dilation_w + offset_w;
      // Positions outside the image are replaced by (-2, -2), for which
      // get_coordinate_weight returns 0.
      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
        inv_h = inv_w = -2;
      const T weight = get_coordinate_weight(inv_h, inv_w, height, width,
                                             data_im_ptr + cnt * height * width,
                                             width, bp_dir);
      val += weight * data_col_ptr[col_pos];
      cnt += 1;
    }

    grad_offset[index] = val;
  }
}

#endif  // DEFORM_CONV_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/deform_roi_pool_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef DEFORM_ROI_POOL_CUDA_KERNEL_CUH
#define DEFORM_ROI_POOL_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// Forward pass of deformable RoI pooling.
//
// Each index in [0, nthreads) is decoded as (n, c, ph, pw) with pw varying
// fastest: one bin (ph, pw) of channel c for RoI n. A RoI is stored as 5
// values (batch index, x1, y1, x2, y2). The bin is average-pooled over a
// roi_bin_grid_h x roi_bin_grid_w grid of bilinear samples. When offset is
// not NULL, the RoI start is shifted by gamma * roi size * offset, where the
// w offsets occupy the first pooled_height * pooled_width plane of RoI n and
// the h offsets the second.
template <typename T>
__global__ void deform_roi_pool_forward_cuda_kernel(
    const int nthreads, const T* input, const T* rois, const T* offset,
    T* output, const int pooled_height, const int pooled_width,
    const T spatial_scale, const int sampling_ratio, const T gamma,
    const int channels, const int height, const int width) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Peel the coordinates off the flat index, fastest dimension first.
    int rest = index;
    int pw = rest % pooled_width;
    rest = index / pooled_width;
    int ph = rest % pooled_height;
    rest = rest / pooled_height;
    int c = rest % channels;
    int n = rest / channels;

    const T* roi = rois + n * 5;
    int roi_batch_ind = roi[0];

    // Do not round the coordinates; this implementation detail is critical.
    T roi_start_w = roi[1] * spatial_scale - 0.5;
    T roi_start_h = roi[2] * spatial_scale - 0.5;
    T roi_end_w = roi[3] * spatial_scale - 0.5;
    T roi_end_h = roi[4] * spatial_scale - 0.5;

    T roi_width = roi_end_w - roi_start_w;
    T roi_height = roi_end_h - roi_start_h;

    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    // Feature plane of the batch item and channel this bin reads from.
    const T* feature_plane =
        input + (roi_batch_ind * channels + c) * height * width;

    // Number of samples per bin and direction: the fixed sampling_ratio when
    // it is positive, otherwise derived from the RoI size.
    int roi_bin_grid_h;
    int roi_bin_grid_w;
    if (sampling_ratio > 0) {
      roi_bin_grid_h = sampling_ratio;
      roi_bin_grid_w = sampling_ratio;
    } else {
      roi_bin_grid_h = static_cast<int>(ceilf(roi_height / pooled_height));
      roi_bin_grid_w = static_cast<int>(ceilf(roi_width / pooled_width));
    }

    // Shift the RoI by the learned offset of this bin, if offsets are given.
    if (offset != NULL) {
      const T* bin_offset = offset + n * pooled_width * pooled_height * 2 +
                            ph * pooled_width + pw;
      T shift_w = gamma * roi_width * bin_offset[0];
      T shift_h =
          gamma * roi_height * bin_offset[pooled_width * pooled_height];
      roi_start_w += shift_w;
      roi_start_h += shift_h;
    }

    // Average pooling inside the bin; the divisor is clamped to at least 1 so
    // an empty sampling grid yields 0 instead of a division by zero.
    const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);
    T accum = 0.;
    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
      const T y = roi_start_h + ph * bin_size_h +
                  static_cast<T>(iy + .5f) * bin_size_h /
                      static_cast<T>(roi_bin_grid_h);
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T x = roi_start_w + pw * bin_size_w +
                    static_cast<T>(ix + .5f) * bin_size_w /
                        static_cast<T>(roi_bin_grid_w);
        accum += bilinear_interpolate(feature_plane, height, width, y, x, index);
      }
    }
    output[index] = accum / count;
  }
}

// Backward kernel of deformable RoI pooling.
//
// Every loop index is one element (n, c, ph, pw) of the pooled output. Its
// incoming gradient is split evenly over the sampling points of the bin and
// accumulated with atomicAdd into grad_input, using the weights and corner
// indices returned by bilinear_interpolate_gradient. When `offset` is not
// NULL, the gradient with respect to the bin's offsets is accumulated into
// grad_offset as well.
//
// rois is read as 5 values per RoI: [batch_index, x1, y1, x2, y2].
// offset and grad_offset are indexed as (n, 2, pooled_height, pooled_width):
// plane 0 holds the offset along w, plane 1 the offset along h.
template <typename T>
__global__ void deform_roi_pool_backward_cuda_kernel(
    const int nthreads, const T* grad_output, const T* input, const T* rois,
    const T* offset, T* grad_input, T* grad_offset, const int pooled_height,
    const int pooled_width, const T spatial_scale, const int sampling_ratio,
    const T gamma, const int channels, const int height, const int width) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T* offset_rois = rois + n * 5;
    int roi_batch_ind = offset_rois[0];
    // Feature-map plane (image roi_batch_ind, channel c) of the input and of
    // the gradient that is being accumulated.
    const T* offset_input =
        input + ((roi_batch_ind * channels + c) * height * width);
    T* offset_grad_input =
        grad_input + ((roi_batch_ind * channels + c) * height * width);

    // Do not use rounding; this implementation detail is critical
    T roi_start_w = offset_rois[1] * spatial_scale - 0.5;
    T roi_start_h = offset_rois[2] * spatial_scale - 0.5;
    T roi_end_w = offset_rois[3] * spatial_scale - 0.5;
    T roi_end_h = offset_rois[4] * spatial_scale - 0.5;

    T roi_width = roi_end_w - roi_start_w;
    T roi_height = roi_end_h - roi_start_h;

    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    // We use roi_bin_grid to sample the grid and mimic integral.
    // A positive sampling_ratio fixes the number of samples per bin side;
    // otherwise it is derived from the RoI size.
    int roi_bin_grid_h =
        (sampling_ratio > 0)
            ? sampling_ratio
            : static_cast<int>(ceilf(roi_height / pooled_height));
    int roi_bin_grid_w =
        (sampling_ratio > 0)
            ? sampling_ratio
            : static_cast<int>(ceilf(roi_width / pooled_width));

    // Compute roi offset: shift the RoI origin by the learned offset of this
    // bin, scaled by gamma and the RoI size.
    if (offset != NULL) {
      const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 +
                              ph * pooled_width + pw;
      T offset_roi_w = gamma * roi_width * offset_cur_w[0];
      T offset_roi_h =
          gamma * roi_height * offset_cur_w[pooled_width * pooled_height];
      roi_start_w += offset_roi_w;
      roi_start_h += offset_roi_h;
    }

    // We do average (integral) pooling inside a bin
    // NOTE(review): unlike the forward kernel, count is not clamped to >= 1.
    // It can only be <= 0 when one of the grid sizes is <= 0, and then at
    // least one of the sampling loops below runs zero times, so
    // grad_output_this_bin is never used in that case.
    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
    const T grad_output_this_bin = grad_output[index] / count;

    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
      const T y = roi_start_h + ph * bin_size_h +
                  static_cast<T>(iy + .5f) * bin_size_h /
                      static_cast<T>(roi_bin_grid_h);
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T x = roi_start_w + pw * bin_size_w +
                    static_cast<T>(ix + .5f) * bin_size_w /
                        static_cast<T>(roi_bin_grid_w);

        T w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;
        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
                                      x_low, x_high, y_low, y_high, index);

        // Presumably the helper reports samples outside the feature map with
        // negative indices; such samples contribute nothing.
        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          // Several output elements can touch the same input pixel, hence the
          // atomic accumulation.
          atomicAdd(offset_grad_input + y_low * width + x_low,
                    grad_output_this_bin * w1);
          atomicAdd(offset_grad_input + y_low * width + x_high,
                    grad_output_this_bin * w2);
          atomicAdd(offset_grad_input + y_high * width + x_low,
                    grad_output_this_bin * w3);
          atomicAdd(offset_grad_input + y_high * width + x_high,
                    grad_output_this_bin * w4);
          if (offset != NULL) {
            // Values at the four corners surrounding the sample; the first
            // digit of the suffix refers to x (0 = low, 1 = high), the
            // second to y.
            T input_00 = offset_input[y_low * width + x_low];
            T input_10 = offset_input[y_low * width + x_high];
            T input_01 = offset_input[y_high * width + x_low];
            T input_11 = offset_input[y_high * width + x_high];
            // Gradient of the sampled value w.r.t. the w / h offset of this
            // bin, scaled the same way the offset was applied above.
            T ogx = gamma * roi_width * grad_output_this_bin *
                    (input_11 * (y - y_low) + input_10 * (y_high - y) +
                     input_01 * (y_low - y) + input_00 * (y - y_high));
            T ogy = gamma * roi_height * grad_output_this_bin *
                    (input_11 * (x - x_low) + input_01 * (x_high - x) +
                     input_10 * (x_low - x) + input_00 * (x - x_high));
            // All channels of the same bin share one offset, so these
            // accumulations must be atomic as well.
            atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 +
                          ph * pooled_width + pw,
                      ogx);
            atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 +
                          pooled_width * pooled_height + ph * pooled_width + pw,
                      ogy);
          }
        }
      }
    }
  }
}

#endif  // DEFORM_ROI_POOL_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Adapted from
// https://github.com/lilanxiao/Rotated_IoU/cuda_op/sort_vert_kernel.cu  # noqa
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

#define MAX_NUM_VERT_IDX 9
#define INTERSECTION_OFFSET 8
#define EPSILON 1e-8

// Pick a thread count for `work_size` work items: the largest power of two
// that does not exceed work_size, clamped to the range [1, THREADS_PER_BLOCK].
//
// Integer doubling replaces the former log(work_size) / log(2) computation:
// the floating-point quotient may land just below an exact integer and
// truncate to the next lower exponent, and for work_size <= 0 the logarithm
// is -inf / NaN, whose conversion to int (and the shift that followed) is
// undefined. Non-positive work sizes now simply yield 1.
inline int opt_n_thread(int work_size) {
  int n_thread = 1;
  // n_thread <= work_size / 2 is equivalent to 2 * n_thread <= work_size but
  // cannot overflow.
  while (n_thread < THREADS_PER_BLOCK && n_thread <= work_size / 2) {
    n_thread <<= 1;
  }
  return max(min(n_thread, THREADS_PER_BLOCK), 1);
}

/*
Ordering predicate for normalized vertices (vertices around (0,0)).
Returns true when vertex 1 sorts strictly before vertex 2.
Order: minimum at the x-axis, becoming larger in anti-clockwise direction.
Two vertices that coincide within EPSILON never compare as "less".
*/
__device__ bool compare_vertices(float x1, float y1, float x2, float y2) {
  const bool same_vertex =
      fabs(x1 - x2) < EPSILON && fabs(y2 - y1) < EPSILON;
  if (same_vertex) return false;

  // Vertices on opposite sides of the x-axis: the one with y > 0 comes first.
  if (y1 > 0 && y2 < 0) return true;
  if (y1 < 0 && y2 > 0) return false;

  // sign(x) * x^2 / (x^2 + y^2) for each vertex; EPSILON keeps the
  // denominator away from zero.
  float norm1 = x1 * x1 + y1 * y1 + EPSILON;
  float norm2 = x2 * x2 + y2 * y2 + EPSILON;
  float delta = fabs(x1) * x1 / norm1 - fabs(x2) * x2 / norm2;

  // Same side of the x-axis: decide by the difference computed above.
  if (y1 > 0 && y2 > 0) return delta > EPSILON;
  if (y1 < 0 && y2 < 0) return delta < EPSILON;

  // Neither pair of conditions matched (e.g. one of the y values is 0).
  return false;
}

// Sorts the valid vertices of every polygon into the order defined by
// compare_vertices and writes the resulting vertex indices to idx.
//
// Layout, as implied by the pointer arithmetic below:
//   vertices:  (b, n, m, 2)  x / y of the candidate vertices
//   mask:      (b, n, m)     true where a candidate vertex is valid
//   num_valid: (b, n)        number of valid vertices of each polygon
//   idx:       (b, n, MAX_NUM_VERT_IDX)  output indices into the m candidates
//
// One block handles one batch entry (blockIdx.x) and its threads stride over
// the n polygons. For a polygon with at least 3 valid vertices the output is
// the sorted indices, followed by the first index once more, followed by
// `pad`; polygons with fewer valid vertices are filled with `pad` only.
// The parameter b is not used inside the kernel.
__global__ void diff_iou_rotated_sort_vertices_forward_cuda_kernel(
    int b, int n, int m, const float *__restrict__ vertices,
    const bool *__restrict__ mask, const int *__restrict__ num_valid,
    int *__restrict__ idx) {
  int batch_idx = blockIdx.x;
  vertices += batch_idx * n * m * 2;
  mask += batch_idx * n * m;
  num_valid += batch_idx * n;
  idx += batch_idx * n * MAX_NUM_VERT_IDX;

  int index = threadIdx.x;  // index of polygon
  int stride = blockDim.x;
  for (int i = index; i < n; i += stride) {
    // Index of an arbitrary invalid intersection point (not box corner!).
    // It used to be left uninitialized when every slot from
    // INTERSECTION_OFFSET on was valid (or m <= INTERSECTION_OFFSET), so an
    // indeterminate value could be written to idx. Defaulting to the first
    // intersection slot keeps the output deterministic in that case.
    int pad = INTERSECTION_OFFSET;
    for (int j = INTERSECTION_OFFSET; j < m; ++j) {
      if (!mask[i * m + j]) {
        pad = j;
        break;
      }
    }

    const int n_valid = num_valid[i];
    if (n_valid < 3) {
      // not enough vertices, take an invalid intersection point
      // (zero padding)
      for (int j = 0; j < MAX_NUM_VERT_IDX; ++j) {
        idx[i * MAX_NUM_VERT_IDX + j] = pad;
      }
    } else {
      // sort the valid vertices
      // note the number of valid vertices is known
      // note: check that num_valid[i] < MAX_NUM_VERT_IDX
      // NOTE(review): that bound is not enforced here; a larger count would
      // write past this polygon's MAX_NUM_VERT_IDX output slots.
      for (int j = 0; j < n_valid; ++j) {
        // initialize with a "big" value
        float x_min = 1;
        float y_min = -EPSILON;
        int i_take = 0;
        int i2;
        float x2, y2;
        if (j != 0) {
          // vertex selected in the previous round
          i2 = idx[i * MAX_NUM_VERT_IDX + j - 1];
          x2 = vertices[i * m * 2 + i2 * 2 + 0];
          y2 = vertices[i * m * 2 + i2 * 2 + 1];
        }
        // Selection step: take the smallest valid vertex that is still
        // larger than the previously selected one.
        for (int k = 0; k < m; ++k) {
          float x = vertices[i * m * 2 + k * 2 + 0];
          float y = vertices[i * m * 2 + k * 2 + 1];
          if (mask[i * m + k] && compare_vertices(x, y, x_min, y_min)) {
            // x2 / y2 are only read when j != 0, i.e. after they were set.
            if (j == 0 || compare_vertices(x2, y2, x, y)) {
              x_min = x;
              y_min = y;
              i_take = k;
            }
          }
        }
        idx[i * MAX_NUM_VERT_IDX + j] = i_take;
      }
      // duplicate the first idx
      idx[i * MAX_NUM_VERT_IDX + n_valid] = idx[i * MAX_NUM_VERT_IDX + 0];

      // pad zeros
      for (int j = n_valid + 1; j < MAX_NUM_VERT_IDX; ++j) {
        idx[i * MAX_NUM_VERT_IDX + j] = pad;
      }

      // for corner case: the two boxes are exactly the same.
      // in this case, idx would have duplicate elements, which makes the
      // shoelace formula broken because of the definition, the duplicate
      // elements only appear in the first 8 positions (they are "corners in
      // box", not "intersection of edges")
      if (n_valid == 8) {
        int counter = 0;
        for (int j = 0; j < 4; ++j) {
          int check = idx[i * MAX_NUM_VERT_IDX + j];
          for (int k = 4; k < INTERSECTION_OFFSET; ++k) {
            if (idx[i * MAX_NUM_VERT_IDX + k] == check) counter++;
          }
        }
        if (counter == 4) {
          // keep the first four indices, close the polygon, pad the rest
          idx[i * MAX_NUM_VERT_IDX + 4] = idx[i * MAX_NUM_VERT_IDX + 0];
          for (int j = 5; j < MAX_NUM_VERT_IDX; ++j) {
            idx[i * MAX_NUM_VERT_IDX + j] = pad;
          }
        }
      }

      // TODO: still might need to cover some other corner cases :(
    }
  }
}


================================================
FILE: mmcv/ops/csrc/common/cuda/furthest_point_sample_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH
#define FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// Reduction step: fold slot idx2 into slot idx1. Slot idx1 receives the
// larger of the two distances, and takes over the point index of idx2 only
// when idx2's distance is strictly greater.
__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i,
                         int idx1, int idx2) {
  const float dist_a = dists[idx1];
  const float dist_b = dists[idx2];
  const int point_a = dists_i[idx1];
  const int point_b = dists_i[idx2];

  int winner = point_a;
  if (dist_b > dist_a) {
    winner = point_b;
  }

  dists[idx1] = max(dist_a, dist_b);
  dists_i[idx1] = winner;
}

// Iterative furthest point sampling on 3-D points.
//
//   dataset: (B, N, 3)  input points
//   temp:    (B, N)     running minimum squared distance of every point to
//                       the points selected so far; read and updated in
//                       place (presumably pre-filled by the caller with a
//                       large value)
//   idxs:    (B, M)     output, indices of the selected points
//
// One block processes one batch entry (blockIdx.x); the parameter b is not
// used. Point 0 is always selected first. Each further round lets every
// thread scan a strided subset of the points and then reduces the per-thread
// candidates in shared memory. The reduction indexes shared memory with
// tid + tid_thres, so it assumes block_size equals the number of threads the
// block was launched with.
template <unsigned int block_size>
__global__ void furthest_point_sampling_forward_cuda_kernel(
    int b, int n, int m, const float *__restrict__ dataset,
    float *__restrict__ temp, int *__restrict__ idxs) {
  if (m <= 0) return;
  __shared__ float dists[block_size];
  __shared__ int dists_i[block_size];

  int batch_index = blockIdx.x;
  dataset += batch_index * n * 3;
  temp += batch_index * n;
  idxs += batch_index * m;

  int tid = threadIdx.x;
  const int stride = block_size;

  int old = 0;
  if (threadIdx.x == 0) idxs[0] = old;

  __syncthreads();
  for (int j = 1; j < m; j++) {
    int besti = 0;
    float best = -1;
    // coordinates of the most recently selected point
    float x1 = dataset[old * 3 + 0];
    float y1 = dataset[old * 3 + 1];
    float z1 = dataset[old * 3 + 2];
    for (int k = tid; k < n; k += stride) {
      float x2, y2, z2;
      x2 = dataset[k * 3 + 0];
      y2 = dataset[k * 3 + 1];
      z2 = dataset[k * 3 + 2];

      float d =
          (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
      // distance of point k to the selected set
      float d2 = min(d, temp[k]);
      temp[k] = d2;
      besti = d2 > best ? k : besti;
      best = d2 > best ? d2 : best;
    }
    dists[tid] = best;
    dists_i[tid] = besti;
    __syncthreads();

    // Tree reduction over the per-thread candidates; steps wider than
    // block_size are skipped by the block_size >= block_size_thres test.
#pragma unroll
    for (int block_size_thres = 1024; block_size_thres >= 2;
         block_size_thres >>= 1) {
      const int tid_thres = block_size_thres / 2;
      if (block_size >= block_size_thres && tid < tid_thres) {
        __update(dists, dists_i, tid, tid + tid_thres);
      }
      __syncthreads();
    }

    old = dists_i[0];
    if (tid == 0) idxs[j] = old;
    // Every thread must have read dists_i[0] before thread 0 overwrites it
    // in the next round; without this barrier the read above races with
    // that write.
    __syncthreads();
  }
}

// Modified from
// https://github.com/qiqihaer/3DSSD-pytorch/blob/master/lib/pointnet2/src/sampling_gpu.cu
// Iterative furthest point sampling driven by a precomputed distance matrix.
//
//   dataset: (B, N, N)  dataset[old * n + k] is used as the distance between
//                       the points old and k
//   temp:    (B, N)     running minimum distance of every point to the
//                       points selected so far; read and updated in place
//                       (presumably pre-filled by the caller with a large
//                       value)
//   idxs:    (B, M)     output, indices of the selected points
//
// One block processes one batch entry (blockIdx.x); the parameter b is not
// used. Point 0 is always selected first. Each further round lets every
// thread scan a strided subset of the points and then reduces the per-thread
// candidates in shared memory. The reduction indexes shared memory with
// tid + tid_thres, so it assumes block_size equals the number of threads the
// block was launched with.
template <unsigned int block_size>
__global__ void furthest_point_sampling_with_dist_forward_cuda_kernel(
    int b, int n, int m, const float *__restrict__ dataset,
    float *__restrict__ temp, int *__restrict__ idxs) {
  if (m <= 0) return;
  __shared__ float dists[block_size];
  __shared__ int dists_i[block_size];

  int batch_index = blockIdx.x;
  dataset += batch_index * n * n;
  temp += batch_index * n;
  idxs += batch_index * m;

  int tid = threadIdx.x;
  const int stride = block_size;

  int old = 0;
  if (threadIdx.x == 0) idxs[0] = old;

  __syncthreads();
  for (int j = 1; j < m; j++) {
    int besti = 0;
    float best = -1;
    for (int k = tid; k < n; k += stride) {
      // distance between the most recently selected point and point k
      float d = dataset[old * n + k];

      // distance of point k to the selected set
      float d2 = min(d, temp[k]);
      temp[k] = d2;
      besti = d2 > best ? k : besti;
      best = d2 > best ? d2 : best;
    }
    dists[tid] = best;
    dists_i[tid] = besti;
    __syncthreads();

    // Tree reduction over the per-thread candidates; steps wider than
    // block_size are skipped by the block_size >= block_size_thres test.
#pragma unroll
    for (int block_size_thres = 1024; block_size_thres >= 2;
         block_size_thres >>= 1) {
      const int tid_thres = block_size_thres / 2;
      if (block_size >= block_size_thres && tid < tid_thres) {
        __update(dists, dists_i, tid, tid + tid_thres);
      }
      __syncthreads();
    }

    old = dists_i[0];
    if (tid == 0) idxs[j] = old;
    // Every thread must have read dists_i[0] before thread 0 overwrites it
    // in the next round; without this barrier the read above races with
    // that write.
    __syncthreads();
  }
}

#endif  // FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef GATHER_POINTS_CUDA_KERNEL_CUH
#define GATHER_POINTS_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

#define TOTAL_THREADS 1024

// Gathers points by index: out[b, c, j] = points[b, c, idx[b, j]].
//
//   points: (B, C, N)
//   idx:    (B, M)
//   out:    (B, C, M)   output
//
// blockIdx.z selects the batch entry and blockIdx.y the channel; the loop
// index runs over the M gathered points.
template <typename T>
__global__ void gather_points_forward_cuda_kernel(int b, int c, int n, int m,
                                                  const T *points,
                                                  const int *__restrict__ idx,
                                                  T *out) {
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  CUDA_1D_KERNEL_LOOP(pt_idx, m) {
    if (bs_idx >= b || c_idx >= c) return;

    // Use per-iteration local pointers. Advancing the parameters themselves
    // (as was done before) accumulates the offsets, so any iteration after
    // the first one executed by a thread addressed the wrong elements.
    const T *cur_points = points + (bs_idx * c * n + c_idx * n);
    const int *cur_idx = idx + (bs_idx * m + pt_idx);
    T *cur_out = out + (bs_idx * c * m + c_idx * m + pt_idx);

    cur_out[0] = cur_points[cur_idx[0]];
  }
}

// Backward of the point gather: scatters grad_out back to the gathered
// positions, grad_points[b, c, idx[b, j]] += grad_out[b, c, j].
//
//   grad_out:    (B, C, M)
//   idx:         (B, M)
//   grad_points: (B, C, N)   output, accumulated with atomicAdd because
//                            several j may refer to the same point
//
// blockIdx.z selects the batch entry and blockIdx.y the channel; the loop
// index runs over the M gathered points.
template <typename T>
__global__ void gather_points_backward_cuda_kernel(int b, int c, int n, int m,
                                                   const T *grad_out,
                                                   const int *__restrict__ idx,
                                                   T *grad_points) {
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  CUDA_1D_KERNEL_LOOP(pt_idx, m) {
    if (bs_idx >= b || c_idx >= c) return;

    // Use per-iteration local pointers. Advancing the parameters themselves
    // (as was done before) accumulates the offsets, so any iteration after
    // the first one executed by a thread addressed the wrong elements.
    const T *cur_grad_out = grad_out + (bs_idx * c * m + c_idx * m + pt_idx);
    const int *cur_idx = idx + (bs_idx * m + pt_idx);
    T *cur_grad_points = grad_points + (bs_idx * c * n + c_idx * n);

    atomicAdd(cur_grad_points + cur_idx[0], cur_grad_out[0]);
  }
}

#endif  // GATHER_POINTS_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu
#ifndef GROUP_POINTS_CUDA_KERNEL_CUH
#define GROUP_POINTS_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// Groups points by index:
//   out[b, c, p, s] = points[b, c, idx[b, p, s]].
//
//   points: (B, C, N)
//   idx:    (B, npoints, nsample)
//   out:    (B, C, npoints, nsample)   output
//
// blockIdx.z selects the batch entry and blockIdx.y the channel; the loop
// index runs over the npoints * nsample (point, sample) pairs.
template <typename T>
__global__ void group_points_forward_cuda_kernel(int b, int c, int n,
                                                 int npoints, int nsample,
                                                 const T *points,
                                                 const int *__restrict__ idx,
                                                 T *out) {
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  CUDA_1D_KERNEL_LOOP(index, npoints * nsample) {
    if (bs_idx >= b || c_idx >= c) return;

    int pt_idx = index / nsample;
    int sample_idx = index % nsample;

    // Use a per-iteration local pointer. Advancing the idx parameter itself
    // (as was done before) accumulates the offsets, so any iteration after
    // the first one executed by a thread read the wrong index.
    const int *cur_idx =
        idx + (bs_idx * npoints * nsample + pt_idx * nsample + sample_idx);
    int in_idx = bs_idx * c * n + c_idx * n + cur_idx[0];
    int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
                  pt_idx * nsample + sample_idx;

    out[out_idx] = points[in_idx];
  }
}

// Backward of the point grouping: scatters grad_out back to the grouped
// positions, grad_points[b, c, idx[b, p, s]] += grad_out[b, c, p, s].
//
//   grad_out:    (B, C, npoints, nsample)
//   idx:         (B, npoints, nsample)
//   grad_points: (B, C, N)   output, accumulated with atomicAdd because
//                            several (p, s) pairs may refer to the same point
//
// blockIdx.z selects the batch entry and blockIdx.y the channel; the loop
// index runs over the npoints * nsample (point, sample) pairs.
template <typename T>
__global__ void group_points_backward_cuda_kernel(int b, int c, int n,
                                                  int npoints, int nsample,
                                                  const T *grad_out,
                                                  const int *__restrict__ idx,
                                                  T *grad_points) {
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  CUDA_1D_KERNEL_LOOP(index, npoints * nsample) {
    if (bs_idx >= b || c_idx >= c) return;

    int pt_idx = index / nsample;
    int sample_idx = index % nsample;

    // Use per-iteration local pointers. Advancing the parameters themselves
    // (as was done before) accumulates the offsets, so any iteration after
    // the first one executed by a thread addressed the wrong elements.
    const T *cur_grad_out =
        grad_out + (bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
                    pt_idx * nsample + sample_idx);
    const int *cur_idx =
        idx + (bs_idx * npoints * nsample + pt_idx * nsample + sample_idx);

    atomicAdd(grad_points + bs_idx * c * n + c_idx * n + cur_idx[0],
              cur_grad_out[0]);
  }
}

#endif  // GROUP_POINTS_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef IOU3D_CUDA_KERNEL_CUH
#define IOU3D_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

const int THREADS_PER_BLOCK_IOU3D = 16;
const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
__device__ const float EPS = 1e-8;

// 2-D point / vector with single-precision coordinates.
struct Point {
  float x, y;

  // Default construction leaves x and y uninitialized.
  __device__ Point() {}

  // The arguments are doubles and are narrowed to float when stored.
  __device__ Point(double _x, double _y)
      : x(static_cast<float>(_x)), y(static_cast<float>(_y)) {}

  // Overwrite both coordinates.
  __device__ void set(float _x, float _y) {
    x = _x;
    y = _y;
  }

  // Component-wise sum.
  __device__ Point operator+(const Point &b) const {
    Point sum;
    sum.set(x + b.x, y + b.y);
    return sum;
  }

  // Component-wise difference.
  __device__ Point operator-(const Point &b) const {
    Point diff;
    diff.set(x - b.x, y - b.y);
    return diff;
  }
};

// 2-D cross product of the vectors a and b (a.x * b.y - a.y * b.x).
__device__ inline float cross(const Point &a, const Point &b) {
  const float z = a.x * b.y - a.y * b.x;
  return z;
}

// 2-D cross product of the vectors (p1 - p0) and (p2 - p0).
__device__ inline float cross(const Point &p1, const Point &p2,
                              const Point &p0) {
  const float ux = p1.x - p0.x;
  const float uy = p1.y - p0.y;
  const float vx = p2.x - p0.x;
  const float vy = p2.y - p0.y;
  return ux * vy - vx * uy;
}

// Returns 1 when the axis-aligned bounding rectangle of segment (p1, p2)
// overlaps the one of segment (q1, q2) (touching counts), otherwise 0.
__device__ int check_rect_cross(const Point &p1, const Point &p2,
                                const Point &q1, const Point &q2) {
  const bool overlap_x = min(p1.x, p2.x) <= max(q1.x, q2.x) &&
                         min(q1.x, q2.x) <= max(p1.x, p2.x);
  const bool overlap_y = min(p1.y, p2.y) <= max(q1.y, q2.y) &&
                         min(q1.y, q2.y) <= max(p1.y, p2.y);
  return overlap_x && overlap_y;
}

// Returns 1 when point p lies inside the rotated 2-D footprint of `box`
// (enlarged by a small margin on every side), otherwise 0.
// params: box (7) [x, y, z, dx, dy, dz, heading]; only x, y, dx, dy and
// heading are read.
__device__ inline int check_in_box2d(const float *box, const Point &p) {
  const float MARGIN = 1e-2;

  // offset of the point from the box centre
  const float off_x = p.x - box[0];
  const float off_y = p.y - box[1];

  // rotate the point in the opposite direction of box
  float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]);
  float rot_x = off_x * angle_cos + off_y * (-angle_sin);
  float rot_y = off_x * angle_sin + off_y * angle_cos;

  const bool inside_x = fabs(rot_x) < box[3] / 2 + MARGIN;
  const bool inside_y = fabs(rot_y) < box[4] / 2 + MARGIN;
  return (inside_x && inside_y);
}

// Intersects segment (p0, p1) with segment (q0, q1).
// Returns 1 and stores the intersection in ans_point when both cross-product
// sign tests below succeed; returns 0 (ans_point untouched) otherwise.
__device__ inline int intersection(const Point &p1, const Point &p0,
                                   const Point &q1, const Point &q0,
                                   Point &ans_point) {
  // fast exclusion: the bounding rectangles do not even overlap
  if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;

  // check cross standing
  float s1 = cross(q0, p1, p0);
  float s2 = cross(p1, q1, p0);
  float s3 = cross(p0, q1, q0);
  float s4 = cross(q1, p1, q0);

  const bool straddles = s1 * s2 > 0 && s3 * s4 > 0;
  if (!straddles) return 0;

  // calculate intersection of two lines
  float s5 = cross(q1, p1, p0);
  const float denom = s5 - s1;
  if (fabs(denom) > EPS) {
    ans_point.x = (s5 * q0.x - s1 * q1.x) / denom;
    ans_point.y = (s5 * q0.y - s1 * q1.y) / denom;
    return 1;
  }

  // The denominator is too small: solve the two line equations
  // a * x + b * y + c = 0 instead.
  float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
  float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
  float D = a0 * b1 - a1 * b0;

  ans_point.x = (b0 * c1 - b1 * c0) / D;
  ans_point.y = (a1 * c0 - a0 * c1) / D;
  return 1;
}

// Rotates p in place around `center`; the rotation is given by the cosine
// and sine of its angle.
__device__ inline void rotate_around_center(const Point &center,
                                            const float angle_cos,
                                            const float angle_sin, Point &p) {
  const float rel_x = p.x - center.x;
  const float rel_y = p.y - center.y;
  const float rotated_x = rel_x * angle_cos - rel_y * angle_sin + center.x;
  const float rotated_y = rel_x * angle_sin + rel_y * angle_cos + center.y;
  p.set(rotated_x, rotated_y);
}

// Returns non-zero when the atan2 angle of a, measured around `center`, is
// strictly greater than the one of b.
__device__ inline int point_cmp(const Point &a, const Point &b,
                                const Point &center) {
  const float a_dy = a.y - center.y;
  const float a_dx = a.x - center.x;
  const float b_dy = b.y - center.y;
  const float b_dx = b.x - center.x;
  return atan2(a_dy, a_dx) > atan2(b_dy, b_dx);
}

// Area of the intersection of the 2-D footprints of two rotated boxes.
//
// Only x, y (centre), dx, dy (size) and heading of each box are read; z and
// dz are ignored. The intersection polygon is assembled from the crossing
// points of the box edges plus the corners of each box that lie inside the
// other box, sorted by angle around their mean, and its area is then
// accumulated as a triangle fan.
__device__ inline float box_overlap(const float *box_a, const float *box_b) {
  // params box_a: [x, y, z, dx, dy, dz, heading]
  // params box_b: [x, y, z, dx, dy, dz, heading]

  float a_angle = box_a[6], b_angle = box_b[6];
  float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2,
        a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2;
  // axis-aligned extents before the heading rotation is applied
  float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half;
  float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half;
  float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half;
  float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half;

  Point center_a(box_a[0], box_a[1]);
  Point center_b(box_b[0], box_b[1]);

  // Four corners each; the fifth slot later repeats corner 0 so that edge k
  // can always be addressed as (corners[k], corners[k + 1]).
  Point box_a_corners[5];
  box_a_corners[0].set(a_x1, a_y1);
  box_a_corners[1].set(a_x2, a_y1);
  box_a_corners[2].set(a_x2, a_y2);
  box_a_corners[3].set(a_x1, a_y2);

  Point box_b_corners[5];
  box_b_corners[0].set(b_x1, b_y1);
  box_b_corners[1].set(b_x2, b_y1);
  box_b_corners[2].set(b_x2, b_y2);
  box_b_corners[3].set(b_x1, b_y2);

  // get oriented corners
  float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
  float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);

  for (int k = 0; k < 4; k++) {
    rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
    rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
  }

  box_a_corners[4] = box_a_corners[0];
  box_b_corners[4] = box_b_corners[0];

  // get intersection of lines
  // NOTE(review): cnt is never checked against the capacity of cross_points;
  // the code relies on at most 16 vertices being collected in total.
  Point cross_points[16];
  Point poly_center;
  int cnt = 0, flag = 0;

  poly_center.set(0, 0);
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      // intersection() writes into cross_points[cnt] only when it returns 1
      flag = intersection(box_a_corners[i + 1], box_a_corners[i],
                          box_b_corners[j + 1], box_b_corners[j],
                          cross_points[cnt]);
      if (flag) {
        poly_center = poly_center + cross_points[cnt];
        cnt++;
      }
    }
  }

  // check corners: a corner of one box that lies inside the other box is a
  // vertex of the intersection polygon as well
  for (int k = 0; k < 4; k++) {
    if (check_in_box2d(box_a, box_b_corners[k])) {
      poly_center = poly_center + box_b_corners[k];
      cross_points[cnt] = box_b_corners[k];
      cnt++;
    }
    if (check_in_box2d(box_b, box_a_corners[k])) {
      poly_center = poly_center + box_a_corners[k];
      cross_points[cnt] = box_a_corners[k];
      cnt++;
    }
  }

  // Mean of the collected vertices. With cnt == 0 the division yields a
  // non-finite centre, but then neither loop below runs and the area is 0.
  poly_center.x /= cnt;
  poly_center.y /= cnt;

  // sort the points of polygon (bubble sort, ascending angle around
  // poly_center as defined by point_cmp)
  Point temp;
  for (int j = 0; j < cnt - 1; j++) {
    for (int i = 0; i < cnt - j - 1; i++) {
      if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) {
        temp = cross_points[i];
        cross_points[i] = cross_points[i + 1];
        cross_points[i + 1] = temp;
      }
    }
  }

  // get the overlap areas: sum of cross products of the triangle fan that is
  // anchored at cross_points[0]
  float area = 0;
  for (int k = 0; k < cnt - 1; k++) {
    area += cross(cross_points[k] - cross_points[0],
                  cross_points[k + 1] - cross_points[0]);
  }

  return fabs(area) / 2.0;
}

// Intersection over union of the rotated 2-D footprints of two boxes.
// params box_a / box_b: [x, y, z, dx, dy, dz, heading]
// The union is clamped to at least EPS before dividing.
__device__ inline float iou_bev(const float *box_a, const float *box_b) {
  const float area_a = box_a[3] * box_a[4];
  const float area_b = box_b[3] * box_b[4];
  const float inter = box_overlap(box_a, box_b);
  const float uni = area_a + area_b - inter;
  return inter / fmaxf(uni, EPS);
}

// Pairwise overlap area between two box sets.
// params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
// params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
// ans_overlap is written as a row-major (num_a, num_b) matrix:
// ans_overlap[a * num_b + b] = box_overlap(boxes_a[a], boxes_b[b]).
__global__ void iou3d_boxes_overlap_bev_forward_cuda_kernel(
    const int num_a, const float *boxes_a, const int num_b,
    const float *boxes_b, float *ans_overlap) {
  CUDA_2D_KERNEL_LOOP(b_idx, num_b, a_idx, num_a) {
    if (a_idx >= num_a || b_idx >= num_b) {
      return;
    }

    const float *box_from_a = boxes_a + a_idx * 7;
    const float *box_from_b = boxes_b + b_idx * 7;
    ans_overlap[a_idx * num_b + b_idx] = box_overlap(box_from_a, box_from_b);
  }
}

// Builds the pairwise suppression bit mask used by rotated 3-D NMS.
//
// params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
// params: mask (N, N/THREADS_PER_BLOCK_NMS)
//
// The boxes are processed in tiles of THREADS_PER_BLOCK_NMS rows by
// THREADS_PER_BLOCK_NMS columns. For every row box, bit i of
// mask[row * blocks + col_start] is set when its iou_bev with column box i
// of the tile exceeds nms_overlap_thresh. On the diagonal tile only columns
// after the row box itself are tested.
__global__ void iou3d_nms3d_forward_cuda_kernel(const int boxes_num,
                                                const float nms_overlap_thresh,
                                                const float *boxes,
                                                unsigned long long *mask) {
  const int blocks =
      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
  CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
    // number of boxes actually present in this row / column tile
    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
                               THREADS_PER_BLOCK_NMS);
    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
                               THREADS_PER_BLOCK_NMS);

    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];

    // stage the column boxes of this tile in shared memory, one per thread
    if (threadIdx.x < col_size) {
      for (int d = 0; d < 7; ++d) {
        block_boxes[threadIdx.x * 7 + d] =
            boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + d];
      }
    }
    __syncthreads();

    if (threadIdx.x < row_size) {
      const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
      const float *cur_box = boxes + cur_box_idx * 7;

      int i = 0;
      unsigned long long t = 0;
      int start = 0;
      if (row_start == col_start) {
        // diagonal tile: skip the box itself and all earlier boxes
        start = threadIdx.x + 1;
      }
      for (i = start; i < col_size; i++) {
        if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh) {
          t |= 1ULL << i;
        }
      }
      mask[cur_box_idx * blocks + col_start] = t;
    }
    // block_boxes is reused by the next tile handled by this block. Without
    // this barrier a thread that finishes early could start overwriting it
    // while slower threads are still reading it in the loop above.
    __syncthreads();
  }
}

// Intersection over union of two axis-aligned 2-D footprints; the heading of
// both boxes is ignored.
// params: a, b: [x, y, z, dx, dy, dz, heading]
// The union is clamped to at least EPS before dividing.
__device__ inline float iou_normal(float const *const a, float const *const b) {
  // extents of the overlap rectangle
  const float x_lo = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2);
  const float x_hi = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2);
  const float y_lo = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2);
  const float y_hi = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2);

  // an empty overlap collapses to zero width / height
  const float overlap_w = fmaxf(x_hi - x_lo, 0.f);
  const float overlap_h = fmaxf(y_hi - y_lo, 0.f);
  const float inter_area = overlap_w * overlap_h;

  const float area_a = a[3] * a[4];
  const float area_b = b[3] * b[4];
  return inter_area / fmaxf(area_a + area_b - inter_area, EPS);
}

// Builds the pairwise suppression bit mask used by axis-aligned 3-D NMS.
//
// params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
// params: mask (N, N/THREADS_PER_BLOCK_NMS)
//
// The boxes are processed in tiles of THREADS_PER_BLOCK_NMS rows by
// THREADS_PER_BLOCK_NMS columns. For every row box, bit i of
// mask[row * blocks + col_start] is set when its iou_normal with column box
// i of the tile exceeds nms_overlap_thresh. On the diagonal tile only
// columns after the row box itself are tested.
__global__ void iou3d_nms3d_normal_forward_cuda_kernel(
    const int boxes_num, const float nms_overlap_thresh, const float *boxes,
    unsigned long long *mask) {
  const int blocks =
      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
  CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
    // number of boxes actually present in this row / column tile
    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
                               THREADS_PER_BLOCK_NMS);
    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
                               THREADS_PER_BLOCK_NMS);

    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];

    // stage the column boxes of this tile in shared memory, one per thread
    if (threadIdx.x < col_size) {
      for (int d = 0; d < 7; ++d) {
        block_boxes[threadIdx.x * 7 + d] =
            boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + d];
      }
    }
    __syncthreads();

    if (threadIdx.x < row_size) {
      const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
      const float *cur_box = boxes + cur_box_idx * 7;

      int i = 0;
      unsigned long long t = 0;
      int start = 0;
      if (row_start == col_start) {
        // diagonal tile: skip the box itself and all earlier boxes
        start = threadIdx.x + 1;
      }
      for (i = start; i < col_size; i++) {
        if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh) {
          t |= 1ULL << i;
        }
      }
      mask[cur_box_idx * blocks + col_start] = t;
    }
    // block_boxes is reused by the next tile handled by this block. Without
    // this barrier a thread that finishes early could start overwriting it
    // while slower threads are still reading it in the loop above.
    __syncthreads();
  }
}

#endif  // IOU3D_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap
#ifndef KNN_CUDA_KERNEL_CUH
#define KNN_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// Exchanges the two floats that x and y point to.
inline __device__ void swap_float(float *x, float *y) {
  const float held = *y;
  *y = *x;
  *x = held;
}

// Exchanges the two ints that x and y point to.
inline __device__ void swap_int(int *x, int *y) {
  const int held = *y;
  *y = *x;
  *x = held;
}

// Sifts the root of the k-element max-heap dist[0..k) down to its place.
//
// At every level the larger of the two children is picked; the walk stops as
// soon as the current entry is strictly greater than that child or no child
// is left. idx is permuted in lockstep so every distance keeps its index.
__device__ void reheap(float *dist, int *idx, int k) {
  int parent = 0;
  for (int kid = 1; kid < k; kid = 2 * parent + 1) {
    // Move to the right child when it exists and holds the larger distance.
    if (kid + 1 < k && dist[kid + 1] > dist[kid]) ++kid;
    // Parent already dominates: nothing further to fix below this point.
    if (dist[parent] > dist[kid]) return;
    swap_float(dist + parent, dist + kid);
    swap_int(idx + parent, idx + kid);
    parent = kid;
  }
}

// In-place heap sort of dist[0..k), with idx permuted alongside.
//
// Repeatedly moves the current root to the end of the still-unsorted prefix
// and re-heapifies that prefix with reheap(). Presumably the caller passes an
// array that already is a max-heap, in which case the result is ascending.
__device__ void heap_sort(float *dist, int *idx, int k) {
  int last = k - 1;
  while (last > 0) {
    swap_float(dist, dist + last);
    swap_int(idx, idx + last);
    // Only the first `last` entries still belong to the heap.
    reheap(dist, idx, last);
    --last;
  }
}

// input: xyz (b, n, 3) new_xyz (b, m, 3)
// output: idx (b, m, nsample) dist2 (b, m, nsample)
//
// Brute-force k-nearest-neighbour search: for the query point pt_idx of batch
// blockIdx.y, every one of the n points in xyz is visited and the nsample
// smallest squared distances are kept in a per-thread max-heap (the root is
// the largest distance kept so far, so a candidate only has to beat the root).
// The heap is then sorted and written out together with the point indices.
//
// nsample must not exceed 100, the capacity of the per-thread scratch arrays.
// Distances are held as float in the heap regardless of T.
template <typename T>
__global__ void knn_forward_cuda_kernel(int b, int n, int m, int nsample,
                                        const T *xyz, const T *new_xyz,
                                        int *__restrict__ idx, T *dist2) {
  int bs_idx = blockIdx.y;
  CUDA_1D_KERNEL_LOOP(pt_idx, m) {
    if (bs_idx >= b) return;

    // Per-query views into the tensors. These are deliberately locals:
    // advancing the kernel's pointer parameters in place would make the
    // offsets accumulate whenever this loop body runs more than once in the
    // same thread, so later queries would read and write the wrong memory.
    const T *cur_new_xyz = new_xyz + bs_idx * m * 3 + pt_idx * 3;
    const T *cur_xyz = xyz + bs_idx * n * 3;
    int *cur_idx = idx + bs_idx * m * nsample + pt_idx * nsample;
    T *cur_dist2 = dist2 + bs_idx * m * nsample + pt_idx * nsample;

    T new_x = cur_new_xyz[0];
    T new_y = cur_new_xyz[1];
    T new_z = cur_new_xyz[2];

    float best_dist[100];
    int best_idx[100];
    // Seed the heap with a huge distance so any real point replaces it.
    for (int i = 0; i < nsample; i++) {
      best_dist[i] = 1e10;
      best_idx[i] = 0;
    }
    for (int i = 0; i < n; i++) {
      T x = cur_xyz[i * 3 + 0];
      T y = cur_xyz[i * 3 + 1];
      T z = cur_xyz[i * 3 + 2];
      T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
             (new_z - z) * (new_z - z);
      // Closer than the worst neighbour kept so far: replace the heap root.
      if (d2 < best_dist[0]) {
        best_dist[0] = d2;
        best_idx[0] = i;
        reheap(best_dist, best_idx, nsample);
      }
    }
    heap_sort(best_dist, best_idx, nsample);
    for (int i = 0; i < nsample; i++) {
      cur_idx[i] = best_idx[i];
      cur_dist2[i] = best_dist[i];
    }
  }
}

#endif  // KNN_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/masked_conv2d_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef MASKED_CONV2D_CUDA_KERNEL_CUH
#define MASKED_CONV2D_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// Gathers kernel_h x kernel_w image patches at the masked locations into a
// column buffer.
//
// Each work item `index` in [0, n) handles one (channel, masked location)
// pair: channel = index / mask_cnt, location = index % mask_cnt. The patch
// whose top-left corner is (mask_h_idx - pad_h, mask_w_idx - pad_w) is copied
// from data_im; taps that fall outside the height x width image are written
// as zero. Successive taps of one patch are mask_cnt elements apart in
// data_col.
template <typename scalar_t>
__global__ void MaskedIm2colForward(const int n, const scalar_t *data_im,
                                    const int height, const int width,
                                    const int kernel_h, const int kernel_w,
                                    const int pad_h, const int pad_w,
                                    const int64_t *mask_h_idx,
                                    const int64_t *mask_w_idx,
                                    const int mask_cnt, scalar_t *data_col) {
  // n is expected to be mask_cnt * channels
  CUDA_1D_KERNEL_LOOP(index, n) {
    const int loc = index % mask_cnt;
    const int channel = index / mask_cnt;
    // Top-left corner of the patch in image coordinates (may be negative).
    const int row0 = static_cast<int>(mask_h_idx[loc]) - pad_h;
    const int col0 = static_cast<int>(mask_w_idx[loc]) - pad_w;

    scalar_t *dst = data_col + channel * kernel_h * kernel_w * mask_cnt + loc;
    for (int ky = 0; ky < kernel_h; ++ky) {
      const int row = row0 + ky;
      for (int kx = 0; kx < kernel_w; ++kx, dst += mask_cnt) {
        const int col = col0 + kx;
        if (row < 0 || col < 0 || row >= height || col >= width) {
          // Tap lies in the padding region.
          *dst = 0.0;
        } else {
          *dst = (scalar_t)data_im[(channel * height + row) * width + col];
        }
      }
    }
  }
}

// Scatters column data back to the masked image locations.
//
// Work item `index` in [0, n) maps to channel index / mask_cnt and masked
// location index % mask_cnt; data_col[index] is stored at that channel's
// (mask_h_idx, mask_w_idx) pixel of data_im. The `channels` argument is not
// used inside the kernel.
template <typename scalar_t>
__global__ void MaskedCol2imForward(const int n, const scalar_t *data_col,
                                    const int height, const int width,
                                    const int channels,
                                    const int64_t *mask_h_idx,
                                    const int64_t *mask_w_idx,
                                    const int mask_cnt, scalar_t *data_im) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    const int loc = index % mask_cnt;
    const int channel = index / mask_cnt;
    const int row = mask_h_idx[loc];
    const int col = mask_w_idx[loc];
    scalar_t *dst = data_im + (channel * height + row) * width + col;
    *dst = data_col[index];
  }
}

#endif  // MASKED_CONV2D_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef MIN_AREA_POLYGONS_CUDA_KERNEL_CUH
#define MIN_AREA_POLYGONS_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// Capacity of the fixed-size per-thread point/angle arrays declared in
// minBoundingRect and Findminbox.
#define MAXN 20
// Approximation of pi; minBoundingRect uses PI / 2 to fold edge angles.
__device__ const float PI = 3.1415926;

// 2-D point with single-precision coordinates, constructible in device code.
struct Point {
  float x;
  float y;
  // Leaves both coordinates uninitialised.
  __device__ Point() {}
  // Builds the point (px, py).
  __device__ Point(float px, float py) : x(px), y(py) {}
};

// Exchanges the coordinates of the two points a and b refer to.
__device__ inline void swap1(Point *a, Point *b) {
  const float ax = a->x;
  const float ay = a->y;

  a->x = b->x;
  a->y = b->y;

  b->x = ax;
  b->y = ay;
}
// Returns the z-component of the cross product (a - o) x (b - o).
__device__ inline float cross(Point o, Point a, Point b) {
  const float oa_x = a.x - o.x;
  const float oa_y = a.y - o.y;
  const float ob_x = b.x - o.x;
  const float ob_y = b.y - o.y;
  return oa_x * ob_y - ob_x * oa_y;
}

// Returns the squared Euclidean distance between a and b (no square root).
__device__ inline float dis(Point a, Point b) {
  const float dx = a.x - b.x;
  const float dy = a.y - b.y;
  return dx * dx + dy * dy;
}
// Searches for the smallest-area bounding rectangle among the rectangles
// aligned with an edge direction of the polyline ps[0..n_points).
//
// ps       : n_points vertices; each pair of consecutive vertices forms one
//            of the n_points - 1 edges that are examined.
// n_points : number of entries read from ps. The local buffers hold MAXN
//            entries, so n_points must not exceed MAXN.
// minbox   : receives {angle, xmin, ymin, xmax, ymax} of the best candidate,
//            with the bounds expressed in the frame rotated by `angle`. It is
//            written only when a candidate with area below 1e12 is found.
__device__ inline void minBoundingRect(Point *ps, int n_points, float *minbox) {
  // Row 0 holds the x coordinates, row 1 the y coordinates.
  float convex_points[2][MAXN];
  for (int j = 0; j < n_points; j++) {
    convex_points[0][j] = ps[j].x;
  }
  for (int j = 0; j < n_points; j++) {
    convex_points[1][j] = ps[j].y;
  }

  Point edges[MAXN];
  float edges_angles[MAXN];
  float unique_angles[MAXN];
  int n_edges = n_points - 1;
  int n_unique = 0;
  int unique_flag = 0;

  // Edge vectors between consecutive vertices.
  for (int i = 0; i < n_edges; i++) {
    edges[i].x = ps[i + 1].x - ps[i].x;
    edges[i].y = ps[i + 1].y - ps[i].y;
  }
  // Direction of every edge, folded by multiples of PI / 2: a rectangle
  // aligned with an edge is also aligned with that edge rotated a quarter
  // turn, so only the angle modulo PI / 2 matters.
  for (int i = 0; i < n_edges; i++) {
    edges_angles[i] = atan2((double)edges[i].y, (double)edges[i].x);
    if (edges_angles[i] >= 0) {
      edges_angles[i] = fmod((double)edges_angles[i], (double)PI / 2);
    } else {
      // Negative angle: shift it up by a whole number of quarter turns.
      edges_angles[i] =
          edges_angles[i] - (int)(edges_angles[i] / (PI / 2) - 1) * (PI / 2);
    }
  }
  // Collect the distinct folded angles (compared with exact float equality).
  // NOTE(review): edges_angles[0] is read even when n_edges is 0 - confirm
  // callers always pass at least two points.
  unique_angles[0] = edges_angles[0];
  n_unique += 1;
  for (int i = 1; i < n_edges; i++) {
    // unique_flag counts how many already-stored angles equal this one.
    for (int j = 0; j < n_unique; j++) {
      if (edges_angles[i] == unique_angles[j]) {
        unique_flag += 1;
      }
    }
    if (unique_flag == 0) {
      unique_angles[n_unique] = edges_angles[i];
      n_unique += 1;
      unique_flag = 0;
    } else {
      unique_flag = 0;
    }
  }

  // Try every candidate orientation and keep the one with the smallest area.
  float minarea = 1e12;
  for (int i = 0; i < n_unique; i++) {
    float R[2][2];
    float rot_points[2][MAXN];
    // Rotation matrix for the candidate angle.
    R[0][0] = cos(unique_angles[i]);
    R[0][1] = sin(unique_angles[i]);
    R[1][0] = -sin(unique_angles[i]);
    R[1][1] = cos(unique_angles[i]);
    // rot_points = R x Points
    for (int m = 0; m < 2; m++) {
      for (int n = 0; n < n_points; n++) {
        float sum = 0.0;
        for (int k = 0; k < 2; k++) {
          sum = sum + R[m][k] * convex_points[k][n];
        }
        rot_points[m][n] = sum;
      }
    }

    // Axis-aligned bounds of the rotated points; inf / NaN coordinates are
    // skipped in each of the four scans below.
    // xmin
    float xmin, ymin, xmax, ymax;
    xmin = 1e12;
    for (int j = 0; j < n_points; j++) {
      if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) {
        continue;
      } else {
        if (rot_points[0][j] < xmin) {
          xmin = rot_points[0][j];
        }
      }
    }
    // ymin
    ymin = 1e12;
    for (int j = 0; j < n_points; j++) {
      if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) {
        continue;
      } else {
        if (rot_points[1][j] < ymin) {
          ymin = rot_points[1][j];
        }
      }
    }
    // xmax
    xmax = -1e12;
    for (int j = 0; j < n_points; j++) {
      if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) {
        continue;
      } else {
        if (rot_points[0][j] > xmax) {
          xmax = rot_points[0][j];
        }
      }
    }
    // ymax
    ymax = -1e12;
    for (int j = 0; j < n_points; j++) {
      if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) {
        continue;
      } else {
        if (rot_points[1][j] > ymax) {
          ymax = rot_points[1][j];
        }
      }
    }
    float area = (xmax - xmin) * (ymax - ymin);
    // Strictly smaller area wins, so the first of several equal candidates
    // is the one that is kept.
    if (area < minarea) {
      minarea = area;
      minbox[0] = unique_angles[i];
      minbox[1] = xmin;
      minbox[2] = ymin;
      minbox[3] = xmax;
      minbox[4] = ymax;
    }
  }
}

// Convex-hull search (gift-wrapping style) performed in place.
//
// in_poly : on entry, n_poly candidate points; on exit, its first n_poly
//           entries hold the points selected by the two chain walks below.
//           The array is also reordered during the initial scan.
// n_poly  : in/out point count; set to top1 + top2 on exit.
//
// NOTE(review): right_point / left_point hold 10 entries and Stack holds 20;
// nothing here checks the chain lengths against those capacities - confirm
// that callers keep n_poly small enough.
__device__ inline void Jarvis(Point *in_poly, int &n_poly) {
  int n_input = n_poly;
  // NOTE(review): input_poly is filled here but never read again in this
  // function.
  Point input_poly[20];
  for (int i = 0; i < n_input; i++) {
    input_poly[i].x = in_poly[i].x;
    input_poly[i].y = in_poly[i].y;
  }
  Point p_max, p_k;
  int max_index, k_index;
  int Stack[20], top1, top2;
  // sign is kept as double (a float declaration was left commented out here).
  double sign;
  Point right_point[10], left_point[10];

  // Single pass that (a) swaps the lowest point (smallest y, ties broken by
  // smallest x) into in_poly[0] and (b) tracks the highest point (largest y,
  // ties broken by largest x) in p_max / max_index.
  for (int i = 0; i < n_poly; i++) {
    if (in_poly[i].y < in_poly[0].y ||
        in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
      Point *j = &(in_poly[0]);
      Point *k = &(in_poly[i]);
      swap1(j, k);
    }
    if (i == 0) {
      p_max = in_poly[0];
      max_index = 0;
    }
    if (in_poly[i].y > p_max.y ||
        in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
      p_max = in_poly[i];
      max_index = i;
    }
  }
  // The walks below start at index 0 and stop at max_index, so the two must
  // differ; fall back to index 1 when the scan left max_index at 0.
  if (max_index == 0) {
    max_index = 1;
    p_max = in_poly[max_index];
  }

  // First chain: starting from in_poly[0], repeatedly pick the point for
  // which cross(current, candidate, best) is positive (or zero and the
  // candidate is farther from the current point), until max_index is reached.
  // Stack records the visited indices.
  k_index = 0, Stack[0] = 0, top1 = 0;
  while (k_index != max_index) {
    p_k = p_max;
    k_index = max_index;
    for (int i = 1; i < n_poly; i++) {
      sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
      if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
                                         dis(in_poly[Stack[top1]], p_k)))) {
        p_k = in_poly[i];
        k_index = i;
      }
    }
    top1++;
    Stack[top1] = k_index;
  }

  // Save the first chain (top1 + 1 points) before Stack is reused.
  for (int i = 0; i <= top1; i++) {
    right_point[i] = in_poly[Stack[i]];
  }

  k_index = 0, Stack[0] = 0, top2 = 0;

  // Second chain: same walk with the opposite orientation test (sign < 0).
  while (k_index != max_index) {
    p_k = p_max;
    k_index = max_index;
    for (int i = 1; i < n_poly; i++) {
      sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
      if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
                                        dis(in_poly[Stack[top2]], p_k))) {
        p_k = in_poly[i];
        k_index = i;
      }
    }
    top2++;
    Stack[top2] = k_index;
  }

  // Save the second chain without its last entry (Stack[top2] == max_index,
  // which the first chain already ends with).
  for (int i = top2 - 1; i >= 0; i--) {
    left_point[i] = in_poly[Stack[i]];
  }

  // Output: the first chain in order, followed by the second chain walked
  // backwards from left_point[top2 - 1] down to left_point[1]; left_point[0]
  // (the shared start point) is not repeated.
  for (int i = 0; i < top1 + top2; i++) {
    if (i <= top1) {
      in_poly[i] = right_point[i];
    } else {
      in_poly[i] = left_point[top2 - (i - top1)];
    }
  }
  n_poly = top1 + top2;
}

// Computes the four corners of the minimum-area rectangle around 9 points.
//
// p         : 18 values read as 9 interleaved (x, y) pairs.
// minpoints : receives 8 values, the (x, y) coordinates of the 4 corners.
//
// The points are passed through Jarvis(), the resulting polygon is closed by
// repeating its first vertex, minBoundingRect() picks the best orientation
// and bounds, and the bounds are finally combined with that angle to produce
// the corner coordinates.
template <typename T>
__device__ inline void Findminbox(T const *const p, T *minpoints) {
  Point hull[MAXN];
  int hull_size = 9;
  for (int i = 0; i < hull_size; i++) {
    hull[i].x = p[2 * i];
    hull[i].y = p[2 * i + 1];
  }
  // Jarvis rewrites hull in place and updates hull_size.
  Jarvis(hull, hull_size);

  // Closed copy of the polygon: the first vertex is appended again so the
  // last edge (back to the start) is examined as well.
  Point ring[MAXN];
  for (int i = 0; i < hull_size; i++) {
    ring[i] = hull[i];
  }
  ring[hull_size] = hull[0];

  // rect = {angle, xmin, ymin, xmax, ymax}
  float rect[5] = {0};
  minBoundingRect(ring, hull_size + 1, rect);
  const float angle = rect[0];
  const float xmin = rect[1];
  const float ymin = rect[2];
  const float xmax = rect[3];
  const float ymax = rect[4];

  const float cos_a = cos(angle);
  const float sin_a = sin(angle);
  const float neg_sin_a = -sin(angle);

  // Corner order: (xmax, ymin), (xmin, ymin), (xmin, ymax), (xmax, ymax).
  minpoints[0] = xmax * cos_a + ymin * neg_sin_a;
  minpoints[1] = xmax * sin_a + ymin * cos_a;
  minpoints[2] = xmin * cos_a + ymin * neg_sin_a;
  minpoints[3] = xmin * sin_a + ymin * cos_a;
  minpoints[4] = xmin * cos_a + ymax * neg_sin_a;
  minpoints[5] = xmin * sin_a + ymax * cos_a;
  minpoints[6] = xmax * cos_a + ymax * neg_sin_a;
  minpoints[7] = xmax * sin_a + ymax * cos_a;
}

// Runs Findminbox once per box: box `index` reads 18 values from ex_boxes
// and writes 8 values to minbox.
template <typename T>
__global__ void min_area_polygons_cuda_kernel(const int ex_n_boxes,
                                              const T *ex_boxes, T *minbox) {
  CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
    const T *points_in = ex_boxes + index * 18;
    T *corners_out = minbox + index * 8;
    Findminbox(points_in, corners_out);
  }
}

#endif  // MIN_AREA_POLYGONS_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/modulated_deform_conv_cuda_kernel.cuh
================================================
/*!
 ******************* BEGIN Caffe Copyright Notice and Disclaimer
 *****************
 *
 * COPYRIGHT
 *
 * All contributions by the University of California:
 * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
 * All rights reserved.
 *
 * All other contributions:
 * Copyright (c) 2014-2017, the respective contributors
 * All rights reserved.
 *
 * Caffe uses a shared copyright model: each contributor holds copyright over
 * their contributions to Caffe. The project versioning records all such
 * contribution and copyright details. If a contributor wants to further mark
 * their specific copyright on a particular contribution, they should indicate
 * their copyright solely in the commit message of the change when it is
 * committed.
 *
 * LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
 *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * CONTRIBUTION AGREEMENT
 *
 * By contributing to the BVLC/caffe repository through pull-request, comment,
 * or otherwise, the contributor releases their content to the
 * license and copyright terms herein.
 *
 ***************** END Caffe Copyright Notice and Disclaimer
 *********************
 *
 * Copyright (c) 2018 Microsoft
 * Licensed under The MIT License [see LICENSE for details]
 * \file modulated_deformable_im2col.cuh
 * \brief Function definitions of converting an image to
 * column matrix based on kernel, padding, dilation, and offset.
 * These functions are mainly used in deformable convolution operators.
 * \ref: https://arxiv.org/abs/1703.06211
 * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
 */

// modified from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu

#ifndef MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH
#define MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH

#include <float.h>
#ifdef MMCV_WITH_TRT
#include "common_cuda_helper.hpp"
#else  // MMCV_WITH_TRT
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else  // MMCV_USE_PARROTS
#include "pytorch_cuda_helper.hpp"
#endif  // MMCV_USE_PARROTS
#endif  // MMCV_WITH_TRT

// Bilinearly samples a single-channel image at the fractional position (h, w).
//
// input      : image data; element (row, col) is input[row * data_width + col].
// data_width : row pitch of input, in elements.
// height     : image height, used to bound the lower neighbour row.
// width      : image width, used to bound the right neighbour column.
//
// The four integer neighbours of (h, w) are blended with the usual bilinear
// weights; a neighbour that lies outside the image contributes zero.
template <typename T>
__device__ T dmcn_im2col_bilinear(const T *input, const int data_width,
                                  const int height, const int width, T h, T w) {
  const int top = floorf(h);
  const int left = floorf(w);
  const int bottom = top + 1;
  const int right = left + 1;

  // Fractional distance from the top-left neighbour and its complement.
  const T frac_h = h - top;
  const T frac_w = w - left;
  const T rest_h = 1 - frac_h;
  const T rest_w = 1 - frac_w;

  const bool top_in = top >= 0;
  const bool left_in = left >= 0;
  const bool bottom_in = bottom <= height - 1;
  const bool right_in = right <= width - 1;

  T top_left = 0;
  T top_right = 0;
  T bottom_left = 0;
  T bottom_right = 0;
  if (top_in) {
    const int row = top * data_width;
    if (left_in) top_left = input[row + left];
    if (right_in) top_right = input[row + right];
  }
  if (bottom_in) {
    const int row = bottom * data_width;
    if (left_in) bottom_left = input[row + left];
    if (right_in) bottom_right = input[row + right];
  }

  const T w_tl = rest_h * rest_w;
  const T w_tr = rest_h * frac_w;
  const T w_bl = frac_h * rest_w;
  const T w_br = frac_h * frac_w;

  return w_tl * top_left + w_tr * top_right + w_bl * bottom_left +
         w_br * bottom_right;
}

// Returns the bilinear weight with which the integer pixel (h, w) contributes
// to a sample taken at the fractional position (argmax_h, argmax_w).
//
// The result is 0 when the sampling position is at or beyond the one-pixel
// margin around the height x width image, or when (h, w) is not one of the
// four integer neighbours of the sampling position.
template <typename T>
__device__ T dmcn_get_gradient_weight(T argmax_h, T argmax_w, const int h,
                                      const int w, const int height,
                                      const int width) {
  const bool outside = argmax_h <= -1 || argmax_h >= height ||
                       argmax_w <= -1 || argmax_w >= width;
  if (outside) {
    return 0;
  }

  const int low_h = floorf(argmax_h);
  const int low_w = floorf(argmax_w);
  const int high_h = low_h + 1;
  const int high_w = low_w + 1;

  // (h, w) can match at most one of the four neighbours because the low and
  // high coordinates differ by one.
  if (h == low_h) {
    if (w == low_w) return (h + 1 - argmax_h) * (w + 1 - argmax_w);
    if (w == high_w) return (h + 1 - argmax_h) * (argmax_w + 1 - w);
  }
  if (h == high_h) {
    if (w == low_w) return (argmax_h + 1 - h) * (w + 1 - argmax_w);
    if (w == high_w) return (argmax_h + 1 - h) * (argmax_w + 1 - w);
  }
  return 0;
}

// Returns the derivative of the bilinear sample of im_data at
// (argmax_h, argmax_w) with respect to one sampling coordinate.
//
// bp_dir == 0 differentiates with respect to the row coordinate, bp_dir == 1
// with respect to the column coordinate; any other value yields 0. Element
// (row, col) of the image is im_data[row * data_width + col]. Neighbours
// outside the height x width image are skipped, and a sampling position at or
// beyond the one-pixel margin around the image yields 0.
template <typename T>
__device__ T dmcn_get_coordinate_weight(T argmax_h, T argmax_w,
                                        const int height, const int width,
                                        const T *im_data, const int data_width,
                                        const int bp_dir) {
  const bool outside = argmax_h <= -1 || argmax_h >= height ||
                       argmax_w <= -1 || argmax_w >= width;
  if (outside) {
    return 0;
  }

  const int low_h = floorf(argmax_h);
  const int low_w = floorf(argmax_w);
  const int high_h = low_h + 1;
  const int high_w = low_w + 1;

  const bool low_h_in = low_h >= 0;
  const bool low_w_in = low_w >= 0;
  const bool high_h_in = high_h <= height - 1;
  const bool high_w_in = high_w <= width - 1;

  T weight = 0;

  switch (bp_dir) {
    case 0: {
      // d/dh: the low row enters negatively and the high row positively,
      // each scaled by the column interpolation factor.
      const T to_high_w = low_w + 1 - argmax_w;
      const T from_low_w = argmax_w - low_w;
      if (low_h_in && low_w_in)
        weight += -1 * to_high_w * im_data[low_h * data_width + low_w];
      if (low_h_in && high_w_in)
        weight += -1 * from_low_w * im_data[low_h * data_width + high_w];
      if (high_h_in && low_w_in)
        weight += to_high_w * im_data[high_h * data_width + low_w];
      if (high_h_in && high_w_in)
        weight += from_low_w * im_data[high_h * data_width + high_w];
      break;
    }
    case 1: {
      // d/dw: the low column enters negatively and the high column
      // positively, each scaled by the row interpolation factor.
      const T to_high_h = low_h + 1 - argmax_h;
      const T from_low_h = argmax_h - low_h;
      if (low_h_in && low_w_in)
        weight += -1 * to_high_h * im_data[low_h * data_width + low_w];
      if (low_h_in && high_w_in)
        weight += to_high_h * im_data[low_h * data_width + high_w];
      if (high_h_in && low_w_in)
        weight += -1 * from_low_h * im_data[high_h * data_width + low_w];
      if (high_h_in && high_w_in)
        weight += from_low_h * im_data[high_h * data_width + high_w];
      break;
    }
    default:
      break;
  }

  return weight;
}

// Modulated deformable im2col.
//
// Work item `index` in [0, n) decodes to (input channel, batch, output row,
// output column), with the output column varying fastest. For each of the
// kernel_h x kernel_w taps the kernel reads that tap's learned (offset_h,
// offset_w) and mask value, bilinearly samples the input channel at the
// displaced position (zero when the position is outside the one-pixel margin
// around the image), multiplies by the mask and stores the result in
// data_col. Consecutive taps of one work item are
// batch_size * height_col * width_col elements apart in data_col.
template <typename T>
__global__ void modulated_deformable_im2col_gpu_kernel(
    const int n, const T *data_im, const T *data_offset, const T *data_mask,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int num_channels, const int deformable_group, const int height_col,
    const int width_col, T *data_col) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    // Decode the flat output index.
    const int out_x = index % width_col;
    const int out_y = (index / width_col) % height_col;
    const int plane = index / width_col / height_col;
    const int batch = plane % batch_size;
    const int channel = plane / batch_size;

    // Offsets and masks are shared by all channels of a deformable group.
    const int group = channel / channel_per_deformable_group;

    // Top-left corner of the undeformed receptive field.
    const int base_y = out_y * stride_h - pad_h;
    const int base_x = out_x * stride_w - pad_w;

    const T *image =
        data_im + (batch * num_channels + channel) * height * width;
    const T *offsets =
        data_offset + (batch * deformable_group + group) * 2 * kernel_h *
                          kernel_w * height_col * width_col;
    const T *masks = data_mask + (batch * deformable_group + group) *
                                     kernel_h * kernel_w * height_col *
                                     width_col;

    T *out = data_col +
             ((channel * kernel_h * kernel_w * batch_size + batch) *
                  height_col +
              out_y) *
                 width_col +
             out_x;
    const int tap_stride = batch_size * height_col * width_col;

    for (int ky = 0; ky < kernel_h; ++ky) {
      for (int kx = 0; kx < kernel_w; ++kx, out += tap_stride) {
        const int tap = ky * kernel_w + kx;
        // Each tap owns two offset planes (h then w) and one mask plane.
        const int off_h_at =
            ((2 * tap) * height_col + out_y) * width_col + out_x;
        const int off_w_at =
            ((2 * tap + 1) * height_col + out_y) * width_col + out_x;
        const int mask_at = (tap * height_col + out_y) * width_col + out_x;

        const T sample_y = base_y + ky * dilation_h + offsets[off_h_at];
        const T sample_x = base_x + kx * dilation_w + offsets[off_w_at];

        T sampled = static_cast<T>(0);
        if (sample_y > -1 && sample_x > -1 && sample_y < height &&
            sample_x < width)
          sampled = dmcn_im2col_bilinear(image, width, height, width, sample_y,
                                         sample_x);
        *out = sampled * masks[mask_at];
      }
    }
  }
}

// Modulated deformable col2im: accumulates the column gradient into grad_im.
//
// Work item `index` in [0, n) decodes to (channel, kernel row i, kernel
// column j, batch, output row, output column), with the output column varying
// fastest. The column gradient, scaled by the tap's mask value, is spread
// over the integer pixels lying less than one pixel from the tap's displaced
// sampling position, each weighted by dmcn_get_gradient_weight, and added to
// grad_im with atomicAdd.
template <typename T>
__global__ void modulated_deformable_col2im_gpu_kernel(
    const int n, const T *data_col, const T *data_offset, const T *data_mask,
    const int channels, const int height, const int width, const int kernel_h,
    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int deformable_group, const int height_col, const int width_col,
    T *grad_im) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    // Decode the flat column index.
    const int w_out = index % width_col;
    const int h_out = (index / width_col) % height_col;
    const int plane = index / width_col / height_col;
    const int b = plane % batch_size;
    const int tap_and_channel = plane / batch_size;
    const int j = tap_and_channel % kernel_w;
    const int i = (tap_and_channel / kernel_w) % kernel_h;
    const int c = tap_and_channel / kernel_w / kernel_h;

    const int group = c / channel_per_deformable_group;

    const int w_in = w_out * stride_w - pad_w;
    const int h_in = h_out * stride_h - pad_h;

    const T *offsets =
        data_offset + (b * deformable_group + group) * 2 * kernel_h *
                          kernel_w * height_col * width_col;
    const T *masks = data_mask + (b * deformable_group + group) * kernel_h *
                                     kernel_w * height_col * width_col;

    // Each tap owns two offset planes (h then w) and one mask plane.
    const int tap = i * kernel_w + j;
    const int off_h_at = ((2 * tap) * height_col + h_out) * width_col + w_out;
    const int off_w_at =
        ((2 * tap + 1) * height_col + h_out) * width_col + w_out;
    const int mask_at = (tap * height_col + h_out) * width_col + w_out;

    // Displaced sampling position of this tap.
    const T sample_y = h_in + i * dilation_h + offsets[off_h_at];
    const T sample_x = w_in + j * dilation_w + offsets[off_w_at];

    const T top_grad = data_col[index] * masks[mask_at];
    const int near_y = (int)sample_y;
    const int near_x = (int)sample_x;

    // Scan a 5 x 5 window around the truncated position and keep only the
    // in-image pixels that are closer than one pixel on both axes.
    for (int dy = -2; dy <= 2; dy++) {
      const int y = near_y + dy;
      if (y < 0 || y >= height || !(abs(sample_y - y) < 1)) continue;
      for (int dx = -2; dx <= 2; dx++) {
        const int x = near_x + dx;
        if (x < 0 || x >= width || !(abs(sample_x - x) < 1)) continue;
        const int grad_pos = ((b * channels + c) * height + y) * width + x;
        const T weight =
            dmcn_get_gradient_weight(sample_y, sample_x, y, x, height, width);
        atomicAdd(grad_im + grad_pos, weight * top_grad);
      }
    }
  }
}

// Computes the gradients with respect to the learned sampling offsets
// (grad_offset) and the modulation masks (grad_mask).
//
// Work item `index` in [0, n) decodes to (batch b, offset channel c, output
// row h, output column w), with w varying fastest; it is also the position
// written in grad_offset. Within its deformable group an offset channel
// encodes a kernel tap (offset_c / 2) and a direction (offset_c % 2: 0 for
// the h offset, 1 for the w offset). The kernel sums, over every column
// channel of the group that belongs to that tap, the column gradient times
// the derivative of the bilinear sample with respect to the offset, times the
// mask. The mask gradient is accumulated in the same loop and written once
// per tap, by the work item with the even offset channel.
//
// NOTE(review): col_c is bounded by channel_per_deformable_group and
// data_im_ptr divides that value by kernel_h * kernel_w, so the caller
// presumably passes (image channels per group) * kernel_h * kernel_w here -
// confirm against the launch site.
template <typename T>
__global__ void modulated_deformable_col2im_coord_gpu_kernel(
    const int n, const T *data_col, const T *data_im, const T *data_offset,
    const T *data_mask, const int channels, const int height, const int width,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int channel_per_deformable_group,
    const int batch_size, const int offset_channels, const int deformable_group,
    const int height_col, const int width_col, T *grad_offset, T *grad_mask) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    // val accumulates the offset gradient, mval the mask gradient.
    T val = 0, mval = 0;
    int w = index % width_col;
    int h = (index / width_col) % height_col;
    int c = (index / width_col / height_col) % offset_channels;
    int b = (index / width_col / height_col) / offset_channels;

    // Every deformable group owns 2 * kernel_h * kernel_w offset channels.
    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
    // Column channels of the same tap are kernel_h * kernel_w apart.
    const int col_step = kernel_h * kernel_w;
    // cnt counts loop iterations; it selects the image channel plane that
    // matches the current column channel.
    int cnt = 0;
    const T *data_col_ptr = data_col + deformable_group_index *
                                           channel_per_deformable_group *
                                           batch_size * width_col * height_col;
    const T *data_im_ptr =
        data_im + (b * deformable_group + deformable_group_index) *
                      channel_per_deformable_group / kernel_h / kernel_w *
                      height * width;
    const T *data_offset_ptr =
        data_offset + (b * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;
    const T *data_mask_ptr =
        data_mask + (b * deformable_group + deformable_group_index) * kernel_h *
                        kernel_w * height_col * width_col;

    // Offset channel index relative to the start of this deformable group.
    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;

    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
         col_c += col_step) {
      const int col_pos =
          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
      // 0: gradient w.r.t. the h offset, 1: w.r.t. the w offset.
      const int bp_dir = offset_c % 2;

      // Recover the kernel tap (i, j) and output position from col_pos.
      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
      int i =
          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
      int w_out = col_pos % width_col;
      int h_out = (col_pos / width_col) % height_col;
      int w_in = w_out * stride_w - pad_w;
      int h_in = h_out * stride_h - pad_h;
      const int data_offset_h_ptr =
          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
      const int data_offset_w_ptr =
          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
           w_out);
      const int data_mask_hw_ptr =
          (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
      const T offset_h = data_offset_ptr[data_offset_h_ptr];
      const T offset_w = data_offset_ptr[data_offset_w_ptr];
      const T mask = data_mask_ptr[data_mask_hw_ptr];
      // Displaced sampling position of this tap.
      T inv_h = h_in + i * dilation_h + offset_h;
      T inv_w = w_in + j * dilation_w + offset_w;
      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
        // Out of range: (-2, -2) makes dmcn_get_coordinate_weight return 0
        // below, and the mask gradient receives no contribution.
        inv_h = inv_w = -2;
      else
        // d(output)/d(mask) is the unmodulated bilinear sample.
        mval += data_col_ptr[col_pos] *
                dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width,
                                     height, width, inv_h, inv_w);
      const T weight = dmcn_get_coordinate_weight(
          inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
          width, bp_dir);
      val += weight * data_col_ptr[col_pos] * mask;
      cnt += 1;
    }
    grad_offset[index] = val;
    if (offset_c % 2 == 0)
      // The two offset channels of a tap share one mask element; only the
      // even (h-offset) work item writes it, at
      // (b, group, tap = offset_c / 2, h, w) of grad_mask.
      grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
                      kernel_w +
                  offset_c / 2) *
                     height_col +
                 h) *
                    width_col +
                w] = mval;
  }
}

#endif  // MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#ifndef DEFORM_ATTN_CUDA_KERNEL
#define DEFORM_ATTN_CUDA_KERNEL

#include "common_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"

// Bilinearly samples one channel of one attention head at the fractional
// position (h, w).
//
// Element (row, col, head, channel) of bottom_data is addressed as
// ((row * width + col) * nheads + head) * channels + channel; m selects the
// head and c the channel. The four integer neighbours of (h, w) are blended
// with the usual bilinear weights, and a neighbour outside the
// height x width map contributes zero.
template <typename scalar_t>
__device__ scalar_t ms_deform_attn_im2col_bilinear(
    const scalar_t *&bottom_data, const int &height, const int &width,
    const int &nheads, const int &channels, const scalar_t &h,
    const scalar_t &w, const int &m, const int &c) {
  const int top = floorf(h);
  const int left = floorf(w);
  const int bottom = top + 1;
  const int right = left + 1;

  // Fractional distance from the top-left neighbour and its complement.
  const scalar_t frac_h = h - top;
  const scalar_t frac_w = w - left;
  const scalar_t rest_h = 1 - frac_h;
  const scalar_t rest_w = 1 - frac_w;

  // Element strides between neighbouring columns and rows.
  const int col_stride = nheads * channels;
  const int row_stride = width * col_stride;
  const int top_off = top * row_stride;
  const int bottom_off = top_off + row_stride;
  const int left_off = left * col_stride;
  const int right_off = left_off + col_stride;
  const int lane = m * channels + c;

  const bool top_in = top >= 0;
  const bool left_in = left >= 0;
  const bool bottom_in = bottom <= height - 1;
  const bool right_in = right <= width - 1;

  scalar_t top_left = 0;
  scalar_t top_right = 0;
  scalar_t bottom_left = 0;
  scalar_t bottom_right = 0;
  if (top_in && left_in) {
    top_left = bottom_data[top_off + left_off + lane];
  }
  if (top_in && right_in) {
    top_right = bottom_data[top_off + right_off + lane];
  }
  if (bottom_in && left_in) {
    bottom_left = bottom_data[bottom_off + left_off + lane];
  }
  if (bottom_in && right_in) {
    bottom_right = bottom_data[bottom_off + right_off + lane];
  }

  const scalar_t w_tl = rest_h * rest_w;
  const scalar_t w_tr = rest_h * frac_w;
  const scalar_t w_bl = frac_h * rest_w;
  const scalar_t w_br = frac_h * frac_w;

  return w_tl * top_left + w_tr * top_right + w_bl * bottom_left +
         w_br * bottom_right;
}

// Backward counterpart of the bilinear sampler for a single sampling point.
//
// For the fractional pixel position (h, w) of head m / channel c this
//   * atomically accumulates w_k * top_grad * attn_weight into grad_value for
//     each of the four neighbouring pixels that passes the bounds checks,
//   * overwrites grad_attn_weight[0] with top_grad * (interpolated value),
//   * overwrites grad_sampling_loc[0] / grad_sampling_loc[1] with the x / y
//     location gradients, scaled by width / height respectively.
//
// The three trailing outputs are plain stores (not atomics), so each caller
// must hand in slots that no other thread writes concurrently. As in the
// forward sampler, the low corners are only tested against 0 and the high
// corners only against height - 1 / width - 1; the caller must pass
// -1 < h < height and -1 < w < width.
template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear(
    const scalar_t *&bottom_data, const int &height, const int &width,
    const int &nheads, const int &channels, const scalar_t &h,
    const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad,
    const scalar_t &attn_weight, scalar_t *&grad_value,
    scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) {
  // floorf() operates in single precision. When scalar_t is double, a
  // coordinate lying just below an integer k can round up to k during the
  // implicit double -> float conversion. That would yield a negative
  // interpolation fraction and, for k == height (or width), reads and
  // atomicAdd writes past the end of the level. Stepping back by one whenever
  // the floored value exceeds the input gives the exact floor; for single
  // precision inputs the correction never triggers.
  int h_low = floorf(h);
  int w_low = floorf(w);
  if (h < h_low) --h_low;
  if (w < w_low) --w_low;
  const int h_high = h_low + 1;
  const int w_high = w_low + 1;

  // Fractional distances to the low corner and their complements.
  const scalar_t lh = h - h_low;
  const scalar_t lw = w - w_low;
  const scalar_t hh = 1 - lh, hw = 1 - lw;

  // Element strides of one pixel step in x (w_stride) and in y (h_stride).
  const int w_stride = nheads * channels;
  const int h_stride = width * w_stride;
  const int h_low_ptr_offset = h_low * h_stride;
  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
  const int w_low_ptr_offset = w_low * w_stride;
  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
  const int base_ptr = m * channels + c;

  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
  const scalar_t top_grad_value = top_grad * attn_weight;
  // Partial derivatives of the interpolated value w.r.t. h and w, built up
  // corner by corner.
  scalar_t grad_h_weight = 0, grad_w_weight = 0;

  scalar_t v1 = 0;
  if (h_low >= 0 && w_low >= 0) {
    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
    v1 = bottom_data[ptr1];
    grad_h_weight -= hw * v1;
    grad_w_weight -= hh * v1;
    atomicAdd(grad_value + ptr1, w1 * top_grad_value);
  }
  scalar_t v2 = 0;
  if (h_low >= 0 && w_high <= width - 1) {
    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
    v2 = bottom_data[ptr2];
    grad_h_weight -= lw * v2;
    grad_w_weight += hh * v2;
    atomicAdd(grad_value + ptr2, w2 * top_grad_value);
  }
  scalar_t v3 = 0;
  if (h_high <= height - 1 && w_low >= 0) {
    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
    v3 = bottom_data[ptr3];
    grad_h_weight += hw * v3;
    grad_w_weight -= lh * v3;
    atomicAdd(grad_value + ptr3, w3 * top_grad_value);
  }
  scalar_t v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1) {
    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
    v4 = bottom_data[ptr4];
    grad_h_weight += lw * v4;
    grad_w_weight += lh * v4;
    atomicAdd(grad_value + ptr4, w4 * top_grad_value);
  }

  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  *grad_attn_weight = top_grad * val;
  // The width / height factors match the scaling the calling kernels apply
  // when they turn a sampling location into pixel coordinates
  // (w_im = loc_w * spatial_w - 0.5, h_im = loc_h * spatial_h - 0.5).
  *grad_sampling_loc = width * grad_w_weight * top_grad_value;
  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
}

// Backward bilinear step for a single sampling point that accumulates every
// output with atomicAdd.
//
// For the fractional pixel position (h, w) of head m / channel c this
//   * atomically accumulates w_k * top_grad * attn_weight into grad_value for
//     each of the four neighbouring pixels that passes the bounds checks,
//   * atomically adds top_grad * (interpolated value) to grad_attn_weight[0],
//   * atomically adds the x / y location gradients, scaled by width / height,
//     to grad_sampling_loc[0] / grad_sampling_loc[1].
//
// The low corners are only tested against 0 and the high corners only
// against height - 1 / width - 1; the caller must pass -1 < h < height and
// -1 < w < width.
template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear_gm(
    const scalar_t *&bottom_data, const int &height, const int &width,
    const int &nheads, const int &channels, const scalar_t &h,
    const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad,
    const scalar_t &attn_weight, scalar_t *&grad_value,
    scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) {
  // floorf() operates in single precision. When scalar_t is double, a
  // coordinate lying just below an integer k can round up to k during the
  // implicit double -> float conversion. That would yield a negative
  // interpolation fraction and, for k == height (or width), reads and
  // atomicAdd writes past the end of the level. Stepping back by one whenever
  // the floored value exceeds the input gives the exact floor; for single
  // precision inputs the correction never triggers.
  int h_low = floorf(h);
  int w_low = floorf(w);
  if (h < h_low) --h_low;
  if (w < w_low) --w_low;
  const int h_high = h_low + 1;
  const int w_high = w_low + 1;

  // Fractional distances to the low corner and their complements.
  const scalar_t lh = h - h_low;
  const scalar_t lw = w - w_low;
  const scalar_t hh = 1 - lh, hw = 1 - lw;

  // Element strides of one pixel step in x (w_stride) and in y (h_stride).
  const int w_stride = nheads * channels;
  const int h_stride = width * w_stride;
  const int h_low_ptr_offset = h_low * h_stride;
  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
  const int w_low_ptr_offset = w_low * w_stride;
  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
  const int base_ptr = m * channels + c;

  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
  const scalar_t top_grad_value = top_grad * attn_weight;
  // Partial derivatives of the interpolated value w.r.t. h and w, built up
  // corner by corner.
  scalar_t grad_h_weight = 0, grad_w_weight = 0;

  scalar_t v1 = 0;
  if (h_low >= 0 && w_low >= 0) {
    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
    v1 = bottom_data[ptr1];
    grad_h_weight -= hw * v1;
    grad_w_weight -= hh * v1;
    atomicAdd(grad_value + ptr1, w1 * top_grad_value);
  }
  scalar_t v2 = 0;
  if (h_low >= 0 && w_high <= width - 1) {
    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
    v2 = bottom_data[ptr2];
    grad_h_weight -= lw * v2;
    grad_w_weight += hh * v2;
    atomicAdd(grad_value + ptr2, w2 * top_grad_value);
  }
  scalar_t v3 = 0;
  if (h_high <= height - 1 && w_low >= 0) {
    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
    v3 = bottom_data[ptr3];
    grad_h_weight += hw * v3;
    grad_w_weight -= lh * v3;
    atomicAdd(grad_value + ptr3, w3 * top_grad_value);
  }
  scalar_t v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1) {
    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
    v4 = bottom_data[ptr4];
    grad_h_weight += lw * v4;
    grad_w_weight += lh * v4;
    atomicAdd(grad_value + ptr4, w4 * top_grad_value);
  }

  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  // Unlike the non-"gm" variant, these outputs are accumulated atomically
  // straight into the destination buffers.
  atomicAdd(grad_attn_weight, top_grad * val);
  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
}

// Forward kernel of multi-scale deformable attention.
//
// Each loop index addresses one output element: the flat index decomposes
// (fastest to slowest) into channel, head, query and batch. For that element
// the kernel walks every (level, point) sample, bilinearly samples the value
// tensor at the sample position and accumulates sample * attention weight.
// Samples whose pixel position falls outside (-1, H) x (-1, W) are skipped.
template <typename scalar_t>
__global__ void ms_deformable_im2col_gpu_kernel(
    const int n, const scalar_t *data_value, const int64_t *data_spatial_shapes,
    const int64_t *data_level_start_index, const scalar_t *data_sampling_loc,
    const scalar_t *data_attn_weight, const int batch_size,
    const int spatial_size, const int num_heads, const int channels,
    const int num_levels, const int num_query, const int num_point,
    scalar_t *data_col) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    // Split the flat index into its channel / head / batch components.
    const int channel_id = index % channels;
    const int head_flat = index / channels;  // (batch, query, head) flattened
    const int head_id = head_flat % num_heads;
    const int batch_id = head_flat / num_heads / num_query;

    // Number of value elements per spatial position, and the start of this
    // batch item inside data_value.
    const int value_row_stride = num_heads * channels;
    const int batch_value_offset = batch_id * spatial_size * value_row_stride;

    // Running index of the current sample in data_attn_weight; the matching
    // (x, y) pair sits at twice that index in data_sampling_loc.
    int sample_idx = head_flat * num_levels * num_point;
    scalar_t acc = 0;

    for (int level = 0; level < num_levels; ++level) {
      const int level_start = data_level_start_index[level];
      const int level_h = data_spatial_shapes[2 * level];
      const int level_w = data_spatial_shapes[2 * level + 1];
      const scalar_t *level_value =
          data_value + (batch_value_offset + level_start * value_row_stride);

      for (int point = 0; point < num_point; ++point, ++sample_idx) {
        const int loc_idx = sample_idx << 1;
        const scalar_t loc_w = data_sampling_loc[loc_idx];
        const scalar_t loc_h = data_sampling_loc[loc_idx + 1];
        const scalar_t attn = data_attn_weight[sample_idx];

        // Scale the sampling location by the level size and shift by half a
        // pixel to obtain pixel coordinates.
        const scalar_t h_im = loc_h * level_h - 0.5;
        const scalar_t w_im = loc_w * level_w - 0.5;

        const bool inside =
            h_im > -1 && w_im > -1 && h_im < level_h && w_im < level_w;
        if (!inside) continue;

        acc += ms_deform_attn_im2col_bilinear(level_value, level_h, level_w,
                                              num_heads, channels, h_im, w_im,
                                              head_id, channel_id) *
               attn;
      }
    }
    data_col[index] = acc;
  }
}

// Backward kernel of multi-scale deformable attention using statically sized
// shared memory and a serial reduction performed by thread 0.
//
// Each loop index addresses one element of grad_col; the flat index
// decomposes (fastest to slowest) into channel, head, query and batch. For
// every (level, point) sample each thread stores its own location / weight
// gradient in shared memory, then thread 0 sums all blockSize entries and
// stores the totals to grad_sampling_loc / grad_attn_weight with plain
// (non-atomic) writes. grad_value is accumulated with atomicAdd inside
// ms_deform_attn_col2im_bilinear.
//
// NOTE(review): the output slot is taken from thread 0's own index, so this
// appears to assume that all threads of a block share one
// (batch, query, head) triple (e.g. blockSize == channels) and that every
// thread of a block runs the same number of loop iterations, since
// __syncthreads() is called inside the loop — confirm against the launcher.
template <typename scalar_t, unsigned int blockSize>
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
    const int n, const scalar_t *grad_col, const scalar_t *data_value,
    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
    const int batch_size, const int spatial_size, const int num_heads,
    const int channels, const int num_levels, const int num_query,
    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight) {
  // Per-thread scratch slots: two location gradients and one weight gradient
  // for each thread of the block.
  __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
  __shared__ scalar_t cache_grad_attn_weight[blockSize];
  unsigned int tid = threadIdx.x;
  const int qid_stride = num_heads * channels;
  CUDA_1D_KERNEL_LOOP(index, n) {
    // Split the flat index into channel, flattened (batch, query, head),
    // head and batch components.
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    // Index of the first sample of this (batch, query, head) in the attention
    // weights; sampling locations hold two values per sample, hence << 1.
    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    scalar_t *grad_sampling_loc_out =
        grad_sampling_loc + (grad_sampling_ptr << 1);
    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col = 0; l_col < num_levels; ++l_col) {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      // Start of this batch item's data for the current level, used for both
      // the value tensor and its gradient.
      const int value_ptr_offset =
          data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col = 0; p_col < num_point; ++p_col) {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        // Scale the sampling location by the level size and shift by half a
        // pixel to obtain pixel coordinates.
        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        // Clear this thread's slots so a skipped sample contributes zero.
        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
        *(cache_grad_attn_weight + threadIdx.x) = 0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
          ms_deform_attn_col2im_bilinear(
              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
              cache_grad_sampling_loc + (threadIdx.x << 1),
              cache_grad_attn_weight + threadIdx.x);
        }

        // All per-thread slots must be written before thread 0 reads them.
        __syncthreads();
        if (tid == 0) {
          // Serial sum over the slots of all blockSize threads.
          scalar_t _grad_w = cache_grad_sampling_loc[0],
                   _grad_h = cache_grad_sampling_loc[1],
                   _grad_a = cache_grad_attn_weight[0];
          int sid = 2;
          for (unsigned int _tid = 1; _tid < blockSize; ++_tid) {
            _grad_w += cache_grad_sampling_loc[sid];
            _grad_h += cache_grad_sampling_loc[sid + 1];
            _grad_a += cache_grad_attn_weight[_tid];
            sid += 2;
          }

          *grad_sampling_loc_out = _grad_w;
          *(grad_sampling_loc_out + 1) = _grad_h;
          *grad_attn_weight_out = _grad_a;
        }
        // Keep the other threads from overwriting their slots for the next
        // sample while thread 0 is still summing.
        __syncthreads();

        // Advance to the next sample.
        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight_out += grad_weight_stride;
        grad_sampling_loc_out += grad_loc_stride;
      }
    }
  }
}

// Backward kernel of multi-scale deformable attention using statically sized
// shared memory and a pairwise (halving) reduction across the block.
//
// Each loop index addresses one element of grad_col; the flat index
// decomposes (fastest to slowest) into channel, head, query and batch. For
// every (level, point) sample each thread stores its own location / weight
// gradient in shared memory, the block reduces the slots in place, and
// thread 0 stores the totals to grad_sampling_loc / grad_attn_weight with
// plain (non-atomic) writes. grad_value is accumulated with atomicAdd inside
// ms_deform_attn_col2im_bilinear.
//
// NOTE(review): the halving loop has no handling for odd counts, so it
// appears to require blockSize to be a power of two. It also appears to
// assume that all threads of a block share one (batch, query, head) triple
// and run the same number of loop iterations, since __syncthreads() is
// called inside the loop — confirm against the launcher.
template <typename scalar_t, unsigned int blockSize>
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
    const int n, const scalar_t *grad_col, const scalar_t *data_value,
    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
    const int batch_size, const int spatial_size, const int num_heads,
    const int channels, const int num_levels, const int num_query,
    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight) {
  // Per-thread scratch slots: two location gradients and one weight gradient
  // for each thread of the block.
  __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
  __shared__ scalar_t cache_grad_attn_weight[blockSize];
  unsigned int tid = threadIdx.x;
  CUDA_1D_KERNEL_LOOP(index, n) {
    // Split the flat index into channel, flattened (batch, query, head),
    // head and batch components.
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    // Index of the first sample of this (batch, query, head) in the attention
    // weights; sampling locations hold two values per sample, hence << 1.
    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    scalar_t *grad_sampling_loc_out =
        grad_sampling_loc + (grad_sampling_ptr << 1);
    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col = 0; l_col < num_levels; ++l_col) {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      // Start of this batch item's data for the current level, used for both
      // the value tensor and its gradient.
      const int value_ptr_offset =
          data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col = 0; p_col < num_point; ++p_col) {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        // Scale the sampling location by the level size and shift by half a
        // pixel to obtain pixel coordinates.
        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        // Clear this thread's slots so a skipped sample contributes zero.
        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
        *(cache_grad_attn_weight + threadIdx.x) = 0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
          ms_deform_attn_col2im_bilinear(
              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
              cache_grad_sampling_loc + (threadIdx.x << 1),
              cache_grad_attn_weight + threadIdx.x);
        }

        // All per-thread slots must be written before the reduction starts.
        __syncthreads();

        // In-place pairwise reduction: at each step the first s threads add
        // the slot that lies s positions above their own.
        for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
          if (tid < s) {
            const unsigned int xid1 = tid << 1;
            const unsigned int xid2 = (tid + s) << 1;
            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
            cache_grad_sampling_loc[xid1 + 1] +=
                cache_grad_sampling_loc[xid2 + 1];
          }
          __syncthreads();
        }

        // Slot 0 now holds the block-wide totals.
        if (tid == 0) {
          *grad_sampling_loc_out = cache_grad_sampling_loc[0];
          *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1];
          *grad_attn_weight_out = cache_grad_attn_weight[0];
        }
        // Barrier before the scratch slots are reused for the next sample.
        __syncthreads();

        // Advance to the next sample.
        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight_out += grad_weight_stride;
        grad_sampling_loc_out += grad_loc_stride;
      }
    }
  }
}

// Backward kernel of multi-scale deformable attention using dynamically
// sized shared memory and a serial reduction performed by thread 0.
//
// Each loop index addresses one element of grad_col; the flat index
// decomposes (fastest to slowest) into channel, head, query and batch. For
// every (level, point) sample each thread stores its own location / weight
// gradient in shared memory, then thread 0 sums all blockDim.x entries and
// stores the totals to grad_sampling_loc / grad_attn_weight with plain
// (non-atomic) writes. grad_value is accumulated with atomicAdd inside
// ms_deform_attn_col2im_bilinear.
//
// The kernel indexes 3 * blockDim.x scalar_t elements of the extern shared
// array, so the launch must provide at least that much dynamic shared memory.
//
// NOTE(review): appears to assume that all threads of a block share one
// (batch, query, head) triple and run the same number of loop iterations,
// since __syncthreads() is called inside the loop — confirm against the
// launcher.
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(
    const int n, const scalar_t *grad_col, const scalar_t *data_value,
    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
    const int batch_size, const int spatial_size, const int num_heads,
    const int channels, const int num_levels, const int num_query,
    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight) {
  // Dynamic shared memory, viewed as scalar_t: the first 2 * blockDim.x
  // elements hold location gradients, the following blockDim.x elements hold
  // attention-weight gradients.
  extern __shared__ int _s[];
  scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
  scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
  unsigned int tid = threadIdx.x;
  CUDA_1D_KERNEL_LOOP(index, n) {
    // Split the flat index into channel, flattened (batch, query, head),
    // head and batch components.
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    // Index of the first sample of this (batch, query, head) in the attention
    // weights; sampling locations hold two values per sample, hence << 1.
    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    scalar_t *grad_sampling_loc_out =
        grad_sampling_loc + (grad_sampling_ptr << 1);
    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col = 0; l_col < num_levels; ++l_col) {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      // Start of this batch item's data for the current level, used for both
      // the value tensor and its gradient.
      const int value_ptr_offset =
          data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col = 0; p_col < num_point; ++p_col) {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        // Scale the sampling location by the level size and shift by half a
        // pixel to obtain pixel coordinates.
        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        // Clear this thread's slots so a skipped sample contributes zero.
        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
        *(cache_grad_attn_weight + threadIdx.x) = 0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
          ms_deform_attn_col2im_bilinear(
              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
              cache_grad_sampling_loc + (threadIdx.x << 1),
              cache_grad_attn_weight + threadIdx.x);
        }

        // All per-thread slots must be written before thread 0 reads them.
        __syncthreads();
        if (tid == 0) {
          // Serial sum over the slots of all blockDim.x threads.
          scalar_t _grad_w = cache_grad_sampling_loc[0],
                   _grad_h = cache_grad_sampling_loc[1],
                   _grad_a = cache_grad_attn_weight[0];
          int sid = 2;
          for (unsigned int _tid = 1; _tid < blockDim.x; ++_tid) {
            _grad_w += cache_grad_sampling_loc[sid];
            _grad_h += cache_grad_sampling_loc[sid + 1];
            _grad_a += cache_grad_attn_weight[_tid];
            sid += 2;
          }

          *grad_sampling_loc_out = _grad_w;
          *(grad_sampling_loc_out + 1) = _grad_h;
          *grad_attn_weight_out = _grad_a;
        }
        // Keep the other threads from overwriting their slots for the next
        // sample while thread 0 is still summing.
        __syncthreads();

        // Advance to the next sample.
        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight_out += grad_weight_stride;
        grad_sampling_loc_out += grad_loc_stride;
      }
    }
  }
}

// Backward kernel of multi-scale deformable attention using dynamically
// sized shared memory and a pairwise (halving) reduction that also handles
// odd element counts.
//
// Each loop index addresses one element of grad_col; the flat index
// decomposes (fastest to slowest) into channel, head, query and batch. For
// every (level, point) sample each thread stores its own location / weight
// gradient in shared memory, the block reduces the slots in place, and
// thread 0 stores the totals to grad_sampling_loc / grad_attn_weight with
// plain (non-atomic) writes. grad_value is accumulated with atomicAdd inside
// ms_deform_attn_col2im_bilinear.
//
// The kernel indexes 3 * blockDim.x scalar_t elements of the extern shared
// array, so the launch must provide at least that much dynamic shared memory.
//
// NOTE(review): appears to assume that all threads of a block share one
// (batch, query, head) triple and run the same number of loop iterations,
// since __syncthreads() is called inside the loop — confirm against the
// launcher.
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(
    const int n, const scalar_t *grad_col, const scalar_t *data_value,
    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
    const int batch_size, const int spatial_size, const int num_heads,
    const int channels, const int num_levels, const int num_query,
    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight) {
  // Dynamic shared memory, viewed as scalar_t: the first 2 * blockDim.x
  // elements hold location gradients, the following blockDim.x elements hold
  // attention-weight gradients.
  extern __shared__ int _s[];
  scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
  scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
  unsigned int tid = threadIdx.x;
  CUDA_1D_KERNEL_LOOP(index, n) {
    // Split the flat index into channel, flattened (batch, query, head),
    // head and batch components.
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    // Index of the first sample of this (batch, query, head) in the attention
    // weights; sampling locations hold two values per sample, hence << 1.
    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    scalar_t *grad_sampling_loc_out =
        grad_sampling_loc + (grad_sampling_ptr << 1);
    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col = 0; l_col < num_levels; ++l_col) {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      // Start of this batch item's data for the current level, used for both
      // the value tensor and its gradient.
      const int value_ptr_offset =
          data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col = 0; p_col < num_point; ++p_col) {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        // Scale the sampling location by the level size and shift by half a
        // pixel to obtain pixel coordinates.
        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        // Clear this thread's slots so a skipped sample contributes zero.
        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
        *(cache_grad_attn_weight + threadIdx.x) = 0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
          ms_deform_attn_col2im_bilinear(
              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
              cache_grad_sampling_loc + (threadIdx.x << 1),
              cache_grad_attn_weight + threadIdx.x);
        }

        // All per-thread slots must be written before the reduction starts.
        __syncthreads();

        // In-place pairwise reduction. s is the number of active threads in
        // this step and spre the number of live slots left by the previous
        // step (initially blockDim.x).
        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
             s >>= 1, spre >>= 1) {
          if (tid < s) {
            const unsigned int xid1 = tid << 1;
            const unsigned int xid2 = (tid + s) << 1;
            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
            cache_grad_sampling_loc[xid1 + 1] +=
                cache_grad_sampling_loc[xid2 + 1];
            // When spre is odd, 2 * s == spre - 1 and one slot is left
            // unpaired; this condition then holds only for tid == 0, which
            // folds that leftover slot into its own.
            if (tid + (s << 1) < spre) {
              cache_grad_attn_weight[tid] +=
                  cache_grad_attn_weight[tid + (s << 1)];
              cache_grad_sampling_loc[xid1] +=
                  cache_grad_sampling_loc[xid2 + (s << 1)];
              cache_grad_sampling_loc[xid1 + 1] +=
                  cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
            }
          }
          __syncthreads();
        }

        // Slot 0 now holds the block-wide totals.
        if (tid == 0) {
          *grad_sampling_loc_out = cache_grad_sampling_loc[0];
          *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1];
          *grad_attn_weight_out = cache_grad_attn_weight[0];
        }
        // Barrier before the scratch slots are reused for the next sample.
        __syncthreads();

        // Advance to the next sample.
        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight_out += grad_weight_stride;
        grad_sampling_loc_out += grad_loc_stride;
      }
    }
  }
}

// Backward kernel of multi-scale deformable attention using dynamically
// sized shared memory, a pairwise (halving) reduction that also handles odd
// element counts, and atomic accumulation of the per-block totals.
//
// Each loop index addresses one element of grad_col; the flat index
// decomposes (fastest to slowest) into channel, head, query and batch. For
// every (level, point) sample each thread stores its own location / weight
// gradient in shared memory, the block reduces the slots in place, and
// thread 0 adds the totals to grad_sampling_loc / grad_attn_weight with
// atomicAdd. grad_value is accumulated with atomicAdd inside
// ms_deform_attn_col2im_bilinear.
//
// The kernel indexes 3 * blockDim.x scalar_t elements of the extern shared
// array, so the launch must provide at least that much dynamic shared memory.
//
// NOTE(review): the atomic output presumably exists because several blocks
// contribute to the same (batch, query, head) output slot; that in turn
// would require the outputs to start zeroed. The kernel also appears to
// assume that every thread of a block runs the same number of loop
// iterations, since __syncthreads() is called inside the loop — confirm
// against the launcher.
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
    const int n, const scalar_t *grad_col, const scalar_t *data_value,
    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
    const int batch_size, const int spatial_size, const int num_heads,
    const int channels, const int num_levels, const int num_query,
    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight) {
  // Dynamic shared memory, viewed as scalar_t: the first 2 * blockDim.x
  // elements hold location gradients, the following blockDim.x elements hold
  // attention-weight gradients.
  extern __shared__ int _s[];
  scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
  scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
  unsigned int tid = threadIdx.x;
  CUDA_1D_KERNEL_LOOP(index, n) {
    // Split the flat index into channel, flattened (batch, query, head),
    // head and batch components.
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    // Index of the first sample of this (batch, query, head) in the attention
    // weights; sampling locations hold two values per sample, hence << 1.
    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    scalar_t *grad_sampling_loc_out =
        grad_sampling_loc + (grad_sampling_ptr << 1);
    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col = 0; l_col < num_levels; ++l_col) {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      // Start of this batch item's data for the current level, used for both
      // the value tensor and its gradient.
      const int value_ptr_offset =
          data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col = 0; p_col < num_point; ++p_col) {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        // Scale the sampling location by the level size and shift by half a
        // pixel to obtain pixel coordinates.
        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        // Clear this thread's slots so a skipped sample contributes zero.
        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
        *(cache_grad_attn_weight + threadIdx.x) = 0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
          ms_deform_attn_col2im_bilinear(
              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
              cache_grad_sampling_loc + (threadIdx.x << 1),
              cache_grad_attn_weight + threadIdx.x);
        }

        // All per-thread slots must be written before the reduction starts.
        __syncthreads();

        // In-place pairwise reduction. s is the number of active threads in
        // this step and spre the number of live slots left by the previous
        // step (initially blockDim.x).
        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
             s >>= 1, spre >>= 1) {
          if (tid < s) {
            const unsigned int xid1 = tid << 1;
            const unsigned int xid2 = (tid + s) << 1;
            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
            cache_grad_sampling_loc[xid1 + 1] +=
                cache_grad_sampling_loc[xid2 + 1];
            // When spre is odd, 2 * s == spre - 1 and one slot is left
            // unpaired; this condition then holds only for tid == 0, which
            // folds that leftover slot into its own.
            if (tid + (s << 1) < spre) {
              cache_grad_attn_weight[tid] +=
                  cache_grad_attn_weight[tid + (s << 1)];
              cache_grad_sampling_loc[xid1] +=
                  cache_grad_sampling_loc[xid2 + (s << 1)];
              cache_grad_sampling_loc[xid1 + 1] +=
                  cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
            }
          }
          __syncthreads();
        }

        // Slot 0 now holds this block's totals; accumulate them atomically
        // into the outputs instead of overwriting.
        if (tid == 0) {
          atomicAdd(grad_sampling_loc_out, cache_grad_sampling_loc[0]);
          atomicAdd(grad_sampling_loc_out + 1, cache_grad_sampling_loc[1]);
          atomicAdd(grad_attn_weight_out, cache_grad_attn_weight[0]);
        }
        // Barrier before the scratch slots are reused for the next sample.
        __syncthreads();

        // Advance to the next sample.
        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight_out += grad_weight_stride;
        grad_sampling_loc_out += grad_loc_stride;
      }
    }
  }
}

// Backward kernel of multi-scale deformable attention that uses no shared
// memory: every gradient contribution is accumulated with atomicAdd inside
// ms_deform_attn_col2im_bilinear_gm.
//
// Each loop index addresses one element of grad_col; the flat index
// decomposes (fastest to slowest) into channel, head, query and batch. For
// every (level, point) sample whose pixel position lies inside
// (-1, H) x (-1, W) the helper is invoked with the output slots belonging to
// that sample; samples outside that range contribute nothing.
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_gm(
    const int n, const scalar_t *grad_col, const scalar_t *data_value,
    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
    const int batch_size, const int spatial_size, const int num_heads,
    const int channels, const int num_levels, const int num_query,
    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    // Split the flat index into its channel / head / batch components.
    const int channel_id = index % channels;
    const int head_flat = index / channels;  // (batch, query, head) flattened
    const int head_id = head_flat % num_heads;
    const int batch_id = head_flat / num_heads / num_query;

    const scalar_t out_grad = grad_col[index];

    // Number of value elements per spatial position, and the start of this
    // batch item inside data_value / grad_value.
    const int value_row_stride = num_heads * channels;
    const int batch_value_offset = batch_id * spatial_size * value_row_stride;

    // Running index of the current sample in the attention-weight buffers;
    // the matching (x, y) pair sits at twice that index in the location
    // buffers.
    int sample_idx = head_flat * num_levels * num_point;

    for (int level = 0; level < num_levels; ++level) {
      const int level_start = data_level_start_index[level];
      const int level_h = data_spatial_shapes[2 * level];
      const int level_w = data_spatial_shapes[2 * level + 1];
      const int level_offset =
          batch_value_offset + level_start * value_row_stride;
      const scalar_t *level_value = data_value + level_offset;
      scalar_t *level_grad_value = grad_value + level_offset;

      for (int point = 0; point < num_point; ++point, ++sample_idx) {
        const int loc_idx = sample_idx << 1;
        const scalar_t loc_w = data_sampling_loc[loc_idx];
        const scalar_t loc_h = data_sampling_loc[loc_idx + 1];
        const scalar_t attn = data_attn_weight[sample_idx];

        // Scale the sampling location by the level size and shift by half a
        // pixel to obtain pixel coordinates.
        const scalar_t h_im = loc_h * level_h - 0.5;
        const scalar_t w_im = loc_w * level_w - 0.5;

        const bool inside =
            h_im > -1 && w_im > -1 && h_im < level_h && w_im < level_w;
        if (!inside) continue;

        ms_deform_attn_col2im_bilinear_gm(
            level_value, level_h, level_w, num_heads, channels, h_im, w_im,
            head_id, channel_id, out_grad, attn, level_grad_value,
            grad_sampling_loc + loc_idx, grad_attn_weight + sample_idx);
      }
    }
  }
}
#endif  // DEFORM_ATTN_CUDA_KERNEL


================================================
FILE: mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef NMS_CUDA_KERNEL_CUH
#define NMS_CUDA_KERNEL_CUH

#include <float.h>
#ifdef MMCV_WITH_TRT
#include "common_cuda_helper.hpp"
#else  // MMCV_WITH_TRT
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else  // MMCV_USE_PARROTS
#include "pytorch_cuda_helper.hpp"
#endif  // MMCV_USE_PARROTS
#endif  // MMCV_WITH_TRT

// Number of boxes handled per tile: one per bit of an unsigned long long
// mask word.
const int threadsPerBlock = 8 * sizeof(unsigned long long int);

// Returns true when the intersection-over-union of boxes `a` and `b` is
// strictly greater than `threshold`.
//
// Each box is read as four floats; elements 0/2 bound one axis and elements
// 1/3 bound the other. `offset` is added to every width and height before the
// areas are formed.
__device__ inline bool devIoU(float const *const a, float const *const b,
                              const int offset, const float threshold) {
  // Bounds of the overlapping rectangle.
  const float inter_x1 = fmaxf(a[0], b[0]);
  const float inter_x2 = fminf(a[2], b[2]);
  const float inter_y1 = fmaxf(a[1], b[1]);
  const float inter_y2 = fminf(a[3], b[3]);

  // Clamp at zero so disjoint boxes produce an empty intersection.
  const float inter_w = fmaxf(inter_x2 - inter_x1 + offset, 0.f);
  const float inter_h = fmaxf(inter_y2 - inter_y1 + offset, 0.f);
  const float inter_area = inter_w * inter_h;

  const float area_a = (a[2] - a[0] + offset) * (a[3] - a[1] + offset);
  const float area_b = (b[2] - b[0] + offset) * (b[3] - b[1] + offset);
  const float union_area = area_a + area_b - inter_area;

  // Compare against threshold * union instead of dividing by the union.
  return inter_area > threshold * union_area;
}

// Computes the pairwise suppression bitmask used by axis-aligned NMS.
//
// The boxes are split into tiles of `threadsPerBlock` boxes. For every
// (row tile, column tile) pair on or above the diagonal, thread `tid` compares
// row box `row_start * threadsPerBlock + tid` against each box of the column
// tile and sets bit i of its mask word when devIoU reports an overlap with the
// i-th box of that tile. On the diagonal tile only boxes with a larger index
// than the row box are tested.
//
// Args:
//   n_boxes:       number of boxes.
//   iou_threshold: overlap threshold forwarded to devIoU.
//   offset:        added to each box width/height inside devIoU.
//   dev_boxes:     n_boxes * 4 floats, four per box.
//   dev_mask:      output; the words of box `b` start at `b * blocks`, where
//                  blocks = ceil(n_boxes / threadsPerBlock). This is the
//                  layout gather_keep_from_mask reads. Words of tiles below
//                  the diagonal are not written.
//
// NOTE: CUDA_2D_KERNEL_BLOCK_LOOP comes from the CUDA helper header; it is
// assumed to walk the tiles with a grid stride, so a thread block may process
// more than one tile. `row_start` / `col_start` are then identical for all
// threads of a block, which keeps the barriers below uniform.
__global__ static void nms_cuda(const int n_boxes, const float iou_threshold,
                                const int offset, const float *dev_boxes,
                                unsigned long long *dev_mask) {
  const int blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;
  CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
    const int tid = threadIdx.x;

    // Skip only this tile. A `return` here would also drop every later tile
    // assigned to this thread block.
    if (row_start > col_start) continue;

    const int row_size =
        fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
    const int col_size =
        fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

    // Stage the column tile in shared memory so every row box can read it.
    __shared__ float block_boxes[threadsPerBlock * 4];
    if (tid < col_size) {
      for (int k = 0; k < 4; ++k) {
        block_boxes[tid * 4 + k] =
            dev_boxes[(threadsPerBlock * col_start + tid) * 4 + k];
      }
    }
    __syncthreads();

    if (tid < row_size) {
      const int cur_box_idx = threadsPerBlock * row_start + tid;
      const float *cur_box = dev_boxes + cur_box_idx * 4;
      unsigned long long int t = 0;
      // On the diagonal tile a box is only compared with later boxes.
      const int start = (row_start == col_start) ? tid + 1 : 0;
      for (int i = start; i < col_size; i++) {
        if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) {
          t |= 1ULL << i;
        }
      }
      // The row stride must be the number of mask words per box (`blocks`),
      // not gridDim.y: the two differ whenever the grid is smaller than the
      // tile count, and gather_keep_from_mask indexes with `col_blocks`.
      dev_mask[static_cast<size_t>(cur_box_idx) * blocks + col_start] = t;
    }

    // Do not refill block_boxes for the next tile while other threads of the
    // block are still reading the current one.
    __syncthreads();
  }
}

// Walks the boxes in index order and marks the ones that survive NMS.
//
// Args:
//   keep:     output flags; keep[i] is set to true for every surviving box.
//             Entries of suppressed boxes are never written here, so the
//             buffer is presumably zero-initialised by the caller - TODO
//             confirm.
//   dev_mask: suppression bitmask with `col_blocks` words per box; bit b of
//             word j in row i marks box `j * threadsPerBlock + b` as
//             overlapping box i.
//   n_boxes:  number of boxes.
//
// `removed` is dynamically sized shared memory and is indexed up to
// col_blocks - 1, so the launch has to provide at least
// col_blocks * sizeof(unsigned long long) bytes. Only threadIdx/blockDim are
// used, i.e. the work is shared among the threads of one block.
// NOTE(review): looks like this expects a single-block launch - confirm.
__global__ static void gather_keep_from_mask(bool *keep,
                                             const unsigned long long *dev_mask,
                                             const int n_boxes) {
  const int col_blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;
  const int tid = threadIdx.x;

  // Bitmap of the boxes that have been removed so far (one bit per box).
  extern __shared__ unsigned long long removed[];

  // Clear the bitmap; the threads of the block split the words among them.
  for (int i = tid; i < col_blocks; i += blockDim.x) {
    removed[i] = 0;
  }
  __syncthreads();

  for (int nblock = 0; nblock < col_blocks; ++nblock) {
    // Every thread keeps its own copy of the word covering the current tile.
    auto removed_val = removed[nblock];
    __syncthreads();
    const int i_offset = nblock * threadsPerBlock;
#pragma unroll
    for (int inblock = 0; inblock < threadsPerBlock; ++inblock) {
      const int i = i_offset + inblock;
      if (i >= n_boxes) break;
      // Select a candidate and check whether it should be kept.
      if (!(removed_val & (1ULL << inblock))) {
        if (tid == 0) {
          // Mark the output (a single thread writes the flag).
          keep[i] = true;
        }
        auto p = dev_mask + i * col_blocks;
        // Remove all boxes which overlap the candidate. Words before `nblock`
        // belong to boxes that were already decided and are left untouched.
        for (int j = tid; j < col_blocks; j += blockDim.x) {
          if (j >= nblock) removed[j] |= p[j];
        }
        // Wait for all updates, then refresh the local copy of this tile's
        // word so the following candidates see the new removals.
        __syncthreads();
        removed_val = removed[nblock];
      }
    }
  }
}

#endif  // NMS_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/nms_quadri_cuda.cuh
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#ifndef NMS_QUADRI_CUDA_CUH
#define NMS_QUADRI_CUDA_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
#include "box_iou_rotated_utils.hpp"

// Integer division of x by y, rounded up (for non-negative x and positive y).
__host__ __device__ inline int divideUP(const int x, const int y) {
  const int padded = x + y - 1;
  return padded / y;
}

namespace {
// Tile width of the NMS kernel: one box per bit of an unsigned long long
// mask word.
const int threadsPerBlock = 8 * sizeof(unsigned long long);
}  // namespace

// Builds the pairwise suppression bitmask for NMS on quadrilateral boxes.
//
// blockIdx.y selects a row tile and blockIdx.x a column tile, each holding up
// to `threadsPerBlock` boxes. Thread t of the block compares row box t with
// every box of the column tile and stores the result as one mask word: bit i
// is set when single_box_iou_quadri exceeds `iou_threshold` for the i-th box
// of the column tile. On the diagonal tile only boxes with a larger index are
// tested. Tiles below the diagonal are processed as well (no early exit).
//
// Args:
//   n_boxes:       number of boxes.
//   iou_threshold: IoU above which a pair counts as overlapping.
//   dev_boxes:     box data, eight corner coordinates (x1, y1, ..., x4, y4)
//                  per box. With multi_label == 1 every box occupies nine
//                  values and the ninth is not read here; otherwise eight.
//   dev_mask:      output, divideUP(n_boxes, threadsPerBlock) words per box.
//   multi_label:   selects the input layout described above.
template <typename T>
__global__ void nms_quadri_cuda_kernel(const int n_boxes,
                                       const float iou_threshold,
                                       const T* dev_boxes,
                                       unsigned long long* dev_mask,
                                       const int multi_label) {
  // The two layouts differ only in how many values separate two boxes.
  const int box_stride = (multi_label == 1) ? 9 : 8;

  const int tile_row = blockIdx.y;
  const int tile_col = blockIdx.x;
  const unsigned int lane = threadIdx.x;

  const int rows_in_tile =
      min(n_boxes - tile_row * threadsPerBlock, threadsPerBlock);
  const int cols_in_tile =
      min(n_boxes - tile_col * threadsPerBlock, threadsPerBlock);

  // Stage the eight coordinates of every column-tile box in shared memory.
  __shared__ T block_boxes[threadsPerBlock * 8];
  if (lane < cols_in_tile) {
    const T* src = dev_boxes + (threadsPerBlock * tile_col + lane) * box_stride;
    T* dst = block_boxes + lane * 8;
    for (int k = 0; k < 8; ++k) {
      dst[k] = src[k];
    }
  }
  __syncthreads();

  // Threads beyond the end of the row tile have nothing left to do.
  if (!(lane < rows_in_tile)) return;

  const int cur_box_idx = threadsPerBlock * tile_row + lane;
  const T* cur_box = dev_boxes + cur_box_idx * box_stride;

  int first = 0;
  if (tile_row == tile_col) {
    first = lane + 1;
  }

  unsigned long long overlap_bits = 0;
  for (int k = first; k < cols_in_tile; k++) {
    // The axis-aligned devIoU is replaced by single_box_iou_quadri from
    // box_iou_rotated_utils.hpp.
    const bool overlaps =
        single_box_iou_quadri<T>(cur_box, block_boxes + k * 8, 0) >
        iou_threshold;
    if (overlaps) {
      overlap_bits |= 1ULL << k;
    }
  }

  const int words_per_box = divideUP(n_boxes, threadsPerBlock);
  dev_mask[cur_box_idx * words_per_box + tile_col] = overlap_bits;
}

#endif


================================================
FILE: mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu
#ifndef NMS_ROTATED_CUDA_CUH
#define NMS_ROTATED_CUDA_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
#include "box_iou_rotated_utils.hpp"

// Integer division of x by y, rounded up (for non-negative x and positive y).
__host__ __device__ inline int divideUP(const int x, const int y) {
  const int padded = x + y - 1;
  return padded / y;
}

namespace {
// Tile width of the NMS kernel: one box per bit of an unsigned long long
// mask word.
const int threadsPerBlock = 8 * sizeof(unsigned long long);
}  // namespace

// Builds the pairwise suppression bitmask for NMS on rotated boxes
// (adapted from torchvision's nms_cuda_kernel).
//
// blockIdx.y selects a row tile and blockIdx.x a column tile, each holding up
// to `threadsPerBlock` boxes. Thread t of the block compares row box t with
// every box of the column tile and stores the result as one mask word: bit i
// is set when single_box_iou_rotated exceeds `iou_threshold` for the i-th box
// of the column tile. On the diagonal tile only boxes with a larger index are
// tested. Tiles below the diagonal are processed as well (no early exit).
//
// Args:
//   n_boxes:       number of boxes.
//   iou_threshold: IoU above which a pair counts as overlapping.
//   dev_boxes:     box data, five values per box
//                  (x_center, y_center, width, height, angle_degrees). With
//                  multi_label == 1 every box occupies six values and the
//                  sixth is not read here; otherwise five.
//   dev_mask:      output, divideUP(n_boxes, threadsPerBlock) words per box.
//   multi_label:   selects the input layout described above.
template <typename T>
__global__ void nms_rotated_cuda_kernel(const int n_boxes,
                                        const float iou_threshold,
                                        const T* dev_boxes,
                                        unsigned long long* dev_mask,
                                        const int multi_label) {
  // The two layouts differ only in how many values separate two boxes.
  const int box_stride = (multi_label == 1) ? 6 : 5;

  const int tile_row = blockIdx.y;
  const int tile_col = blockIdx.x;
  const unsigned int lane = threadIdx.x;

  const int rows_in_tile =
      min(n_boxes - tile_row * threadsPerBlock, threadsPerBlock);
  const int cols_in_tile =
      min(n_boxes - tile_col * threadsPerBlock, threadsPerBlock);

  // Stage the five parameters of every column-tile box in shared memory.
  __shared__ T block_boxes[threadsPerBlock * 5];
  if (lane < cols_in_tile) {
    const T* src = dev_boxes + (threadsPerBlock * tile_col + lane) * box_stride;
    T* dst = block_boxes + lane * 5;
    for (int k = 0; k < 5; ++k) {
      dst[k] = src[k];
    }
  }
  __syncthreads();

  // Threads beyond the end of the row tile have nothing left to do.
  if (!(lane < rows_in_tile)) return;

  const int cur_box_idx = threadsPerBlock * tile_row + lane;
  const T* cur_box = dev_boxes + cur_box_idx * box_stride;

  int first = 0;
  if (tile_row == tile_col) {
    first = lane + 1;
  }

  unsigned long long overlap_bits = 0;
  for (int k = first; k < cols_in_tile; k++) {
    // The axis-aligned devIoU is replaced by single_box_iou_rotated from
    // box_iou_rotated_utils.hpp.
    const bool overlaps =
        single_box_iou_rotated<T>(cur_box, block_boxes + k * 5, 0) >
        iou_threshold;
    if (overlaps) {
      overlap_bits |= 1ULL << k;
    }
  }

  const int words_per_box = divideUP(n_boxes, threadsPerBlock);
  dev_mask[cur_box_idx * words_per_box + tile_col] = overlap_bits;
}

#endif


================================================
FILE: mmcv/ops/csrc/common/cuda/parrots_cudawarpfunction.cuh
================================================
/*
 * Copyright (c) 2019, SenseTime.
 */

#ifndef INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_
#define INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_

#ifndef __CUDACC__
#error cudawarpfunction.cuh should only be included by .cu files
#endif
#include <cuda.h>

#include <parrots/foundation/common.hpp>

#ifdef PARROTS_USE_HALF
#include <cuda_fp16.h>
#endif
// CUDA_INTRINSIC_FUNC(Expr) expands to `Expr` when __CUDA_ARCH__ is defined
// and to nothing otherwise, so the wrapped statements are dropped whenever
// __CUDA_ARCH__ is not defined. Functions whose whole body is wrapped by this
// macro are therefore empty in that case.
#ifdef __CUDA_ARCH__
#define CUDA_INTRINSIC_FUNC(Expr) Expr
#else
#define CUDA_INTRINSIC_FUNC(Expr)
#endif

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#ifdef PARROTS_USE_HALF

#if CUDA_VERSION < 9000

// float16 overload of the legacy __shfl (CUDA < 9): shuffles the `y` member
// of `var`. The body is only emitted where CUDA_INTRINSIC_FUNC keeps it.
// NOTE(review): the shuffled value is returned directly as float16, whereas
// the CUDA >= 9 overloads store it into `r.y` - confirm the conversion.
__device__ inline float16 __shfl(float16 var, int srcLane, int width) {
  CUDA_INTRINSIC_FUNC(return __shfl(var.y, srcLane, width););
}

// float16 overload of the legacy __shfl_up (CUDA < 9): shuffles the `y`
// member of `var`. The body is only emitted where CUDA_INTRINSIC_FUNC keeps
// it.
// NOTE(review): the shuffled value is returned directly as float16, whereas
// the CUDA >= 9 overloads store it into `r.y` - confirm the conversion.
__device__ inline float16 __shfl_up(float16 var, unsigned delta, int width) {
  CUDA_INTRINSIC_FUNC(return __shfl_up(var.y, delta, width););
}

// float16 overload of the legacy __shfl_down (CUDA < 9): shuffles the `y`
// member of `var`. The body is only emitted where CUDA_INTRINSIC_FUNC keeps
// it.
// NOTE(review): the shuffled value is returned directly as float16, whereas
// the CUDA >= 9 overloads store it into `r.y` - confirm the conversion.
__device__ inline float16 __shfl_down(float16 var, unsigned delta, int width) {
  CUDA_INTRINSIC_FUNC(return __shfl_down(var.y, delta, width););
}

// float16 overload of the legacy __shfl_xor (CUDA < 9): shuffles the `y`
// member of `var`. The body is only emitted where CUDA_INTRINSIC_FUNC keeps
// it.
// NOTE(review): the shuffled value is returned directly as float16, whereas
// the CUDA >= 9 overloads store it into `r.y` - confirm the conversion.
__device__ inline float16 __shfl_xor(float16 var, int laneMask, int width) {
  CUDA_INTRINSIC_FUNC(return __shfl_xor(var.y, laneMask, width););
}

#else  // CUDA_VERSION >= 9000

// float16 overload of __shfl_sync (CUDA >= 9): shuffles the `y` member of
// `var` with the given mask, source lane and width, and returns the result in
// the `y` member of a new float16. `width` defaults to warpSize. The body is
// only emitted where CUDA_INTRINSIC_FUNC keeps it.
__device__ inline float16 __shfl_sync(unsigned mask, float16 var, int srcLane,
                                      int width = warpSize) {
  CUDA_INTRINSIC_FUNC(float16 r; r.y = __shfl_sync(mask, var.y, srcLane, width);
                      return r;);
}

// float16 overload of __shfl_up_sync (CUDA >= 9): shuffles the `y` member of
// `var` with the given mask, delta and width, and returns the result in the
// `y` member of a new float16. `width` defaults to warpSize. The body is only
// emitted where CUDA_INTRINSIC_FUNC keeps it.
__device__ inline float16 __shfl_up_sync(unsigned mask, float16 var,
                                         unsigned delta, int width = warpSize) {
  CUDA_INTRINSIC_FUNC(
      float16 r; r.y = __shfl_up_sync(mask, var.y, delta, width); return r;);
}

// float16 overload of __shfl_down_sync (CUDA >= 9): shuffles the `y` member
// of `var` with the given mask, delta and width, and returns the result in the
// `y` member of a new float16. `width` defaults to warpSize. The body is only
// emitted where CUDA_INTRINSIC_FUNC keeps it.
__device__ inline float16 __shfl_down_sync(unsigned mask, float16 var,
                                           unsigned delta,
                                           int width = warpSize) {
  CUDA_INTRINSIC_FUNC(
      float16 r; r.y = __shfl_down_sync(mask, var.y, delta, width); return r;);
}

// float16 overload of __shfl_xor_sync (CUDA >= 9): shuffles the `y` member of
// `var` with the given mask, lane mask and width, and returns the result in
// the `y` member of a new float16. The body is only emitted where
// CUDA_INTRINSIC_FUNC keeps it.
//
// `width` defaults to warpSize, matching the other float16 *_sync overloads
// in this header, so the three-argument call form works for this overload as
// well. Existing four-argument callers are unaffected.
__device__ inline float16 __shfl_xor_sync(unsigned mask, float16 var,
                                          int laneMask, int width = warpSize) {
  CUDA_INTRINSIC_FUNC(float16 r;
                      r.y = __shfl_xor_sync(mask, var.y, laneMask, width);
                      return r;);
}

#endif  // CUDA_VERSION < 9000

#endif  // PARROTS_USE_HALF

// warp shuffle interface with a dummy mask
#if CUDA_VERSION < 9000

// CUDA < 9 shim that offers the __shfl_sync name: `mask` is accepted but
// ignored and the call is forwarded to the legacy __shfl. The body is only
// emitted where CUDA_INTRINSIC_FUNC keeps it.
template <typename T>
__device__ inline T __shfl_sync(unsigned mask, T var, int srcLane,
                                int width = warpSize) {
  CUDA_INTRINSIC_FUNC(return __shfl(var, srcLane, width););
}

// CUDA < 9 shim that offers the __shfl_up_sync name: `mask` is accepted but
// ignored and the call is forwarded to the legacy __shfl_up. The body is only
// emitted where CUDA_INTRINSIC_FUNC keeps it.
template <typename T>
__device__ inline T __shfl_up_sync(unsigned mask, T var, unsigned delta,
                                   int width = warpSize) {
  CUDA_INTRINSIC_FUNC(return __shfl_up(var, delta, width););
}

// CUDA < 9 shim that offers the __shfl_down_sync name: `mask` is accepted but
// ignored and the call is forwarded to the legacy __shfl_down. The body is
// only emitted where CUDA_INTRINSIC_FUNC keeps it.
template <typename T>
__device__ inline T __shfl_down_sync(unsigned mask, T var, unsigned delta,
                                     int width = warpSize) {
  CUDA_INTRINSIC_FUNC(return __shfl_down(var, delta, width););
}

// CUDA < 9 shim that offers the __shfl_xor_sync name: `mask` is accepted but
// ignored and the call is forwarded to the legacy __shfl_xor. The body is only
// emitted where CUDA_INTRINSIC_FUNC keeps it.
template <typename T>
__device__ inline T __shfl_xor_sync(unsigned mask, T var, int laneMask,
                                    int width = warpSize) {
  CUDA_INTRINSIC_FUNC(return __shfl_xor(var, laneMask, width););
}

#endif  // CUDA_VERSION < 9000

#endif  // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#endif  // INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_


================================================
FILE: mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef POINT_IN_BOXES_CUDA_KERNEL_CUH
#define POINT_IN_BOXES_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// Rotates the offset (shift_x, shift_y) by the angle -rz and writes the
// rotated coordinates to local_x / local_y.
template <typename T>
__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz,
                                             T &local_x, T &local_y) {
  const T angle = -rz;
  const T cos_a = cos(angle);
  const T sin_a = sin(angle);

  local_x = shift_x * cos_a + shift_y * (-sin_a);
  local_y = shift_x * sin_a + shift_y * cos_a;
}

// Tests whether a point lies strictly inside a 3D box.
//
// Args:
//   pt:      point as (x, y, z).
//   box3d:   box as (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR
//            coordinates, where cz is the z of the bottom face centre.
//   local_x: receives the point's x coordinate in the box frame.
//   local_y: receives the point's y coordinate in the box frame.
//
// Returns 1 when the point is inside the box and 0 otherwise. When the point
// is rejected by the height test, local_x / local_y are left untouched.
template <typename T>
__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x,
                                        T &local_y) {
  const T x_size = box3d[3];
  const T y_size = box3d[4];
  const T z_size = box3d[5];
  const T rz = box3d[6];

  // box3d[2] is the bottom of the box; move up by half the height to get the
  // vertical centre.
  T center_z = box3d[2];
  center_z += z_size / 2.0;

  // Cheap vertical rejection before any rotation is computed.
  if (fabsf(pt[2] - center_z) > z_size / 2.0) return 0;

  lidar_to_local_coords(pt[0] - box3d[0], pt[1] - box3d[1], rz, local_x,
                        local_y);

  const bool inside_x =
      (local_x > -x_size / 2.0) && (local_x < x_size / 2.0);
  const bool inside_y =
      (local_y > -y_size / 2.0) && (local_y < y_size / 2.0);
  return (inside_x && inside_y) ? 1 : 0;
}

// For every point, finds the first box that contains it.
//
// Args:
//   batch_size:        number of batches; blockIdx.y selects the batch.
//   boxes_num:         boxes per batch.
//   pts_num:           points per batch.
//   boxes:             (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in
//                      LiDAR coordinates, z being the bottom centre. The boxes
//                      are expected not to overlap each other.
//   pts:               (B, npoints, 3) [x, y, z] in LiDAR coordinates.
//   box_idx_of_points: (B, npoints) output; receives the index of the first
//                      containing box. Entries of points that are in no box
//                      are not written (the caller pre-fills them, -1 by
//                      default).
//
// The per-point pointers are derived from the unmodified base pointers on
// every iteration. Advancing the parameters themselves with `+=` accumulated
// the offsets as soon as a thread handled a second point.
template <typename T>
__global__ void points_in_boxes_part_forward_cuda_kernel(
    int batch_size, int boxes_num, int pts_num, const T *boxes, const T *pts,
    int *box_idx_of_points) {
  const int bs_idx = blockIdx.y;
  if (bs_idx >= batch_size) return;

  // All points of this batch are tested against the same set of boxes.
  const T *batch_boxes = boxes + bs_idx * boxes_num * 7;

  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
    const T *cur_pt = pts + bs_idx * pts_num * 3 + pt_idx * 3;
    int *cur_box_idx = box_idx_of_points + bs_idx * pts_num + pt_idx;

    T local_x = 0, local_y = 0;
    for (int k = 0; k < boxes_num; k++) {
      const int cur_in_flag =
          check_pt_in_box3d(cur_pt, batch_boxes + k * 7, local_x, local_y);
      if (cur_in_flag) {
        // Only the first hit is recorded.
        cur_box_idx[0] = k;
        break;
      }
    }
  }
}

// For every point, flags all boxes that contain it.
//
// Args:
//   batch_size:        number of batches; blockIdx.y selects the batch.
//   boxes_num:         boxes per batch.
//   pts_num:           points per batch.
//   boxes:             (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in
//                      LiDAR coordinates, z being the bottom centre.
//   pts:               (B, npoints, 3) [x, y, z] in LiDAR coordinates.
//   box_idx_of_points: output indexed as [batch][point][box], i.e.
//                      (B, npoints, N); entry k of a point is set to 1 when
//                      box k contains the point. Other entries are not
//                      written, so the caller must initialise the buffer.
//
// The per-point pointers are derived from the unmodified base pointers on
// every iteration. Advancing the parameters themselves with `+=` accumulated
// the offsets as soon as a thread handled a second point.
template <typename T>
__global__ void points_in_boxes_all_forward_cuda_kernel(
    int batch_size, int boxes_num, int pts_num, const T *boxes, const T *pts,
    int *box_idx_of_points) {
  const int bs_idx = blockIdx.y;
  if (bs_idx >= batch_size) return;

  // All points of this batch are tested against the same set of boxes.
  const T *batch_boxes = boxes + bs_idx * boxes_num * 7;

  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
    const T *cur_pt = pts + bs_idx * pts_num * 3 + pt_idx * 3;
    int *cur_flags =
        box_idx_of_points + bs_idx * pts_num * boxes_num + pt_idx * boxes_num;

    T local_x = 0, local_y = 0;
    for (int k = 0; k < boxes_num; k++) {
      const int cur_in_flag =
          check_pt_in_box3d(cur_pt, batch_boxes + k * 7, local_x, local_y);
      if (cur_in_flag) {
        cur_flags[k] = 1;
      }
    }
  }
}

#endif  // POINT_IN_BOXES_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef POINTS_IN_POLYGONS_CUDA_KERNEL_CUH
#define POINTS_IN_POLYGONS_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// 2D point with single-precision coordinates.
struct point {
  float x;
  float y;
};

// Tests every (point, quadrilateral) pair with a crossing-number test.
//
// Args:
//   nthreads:    number of (point, polygon) pairs to process.
//   vertex1:     points, two values (x, y) per row.
//   vertex2:     polygons, eight values (x0, y0, ..., x3, y3) per column.
//   rows:        not used by the kernel.
//   cols:        number of polygons; pair `index` maps to point index / cols
//                and polygon index % cols.
//   inside_flag: output, one value per pair: 1 when the number of counted
//                edge crossings is odd, 0 otherwise.
//
// Coordinates are converted to float before the test. When the point
// coincides with a vertex or lies exactly on an edge, the scan stops and the
// result is decided by the crossings counted up to that edge.
template <typename scalar_t>
__global__ void points_in_polygons_forward_cuda_kernel(
    const int nthreads, const scalar_t *vertex1, const scalar_t *vertex2,
    const int rows, const int cols, scalar_t *inside_flag) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    const int row = index / cols;
    const int col = index % cols;

    const scalar_t *offset_vertex1 = vertex1 + row * 2;
    const scalar_t *offset_vertex2 = vertex2 + col * 8;

    // Query point; it does not change while the edges are scanned.
    const float px = offset_vertex1[0];
    const float py = offset_vertex1[1];

    point polygon[4];
    for (int v = 0; v < 4; ++v) {
      polygon[v].x = offset_vertex2[2 * v];
      polygon[v].y = offset_vertex2[2 * v + 1];
    }

    int nCross = 0;
    // Edge (j -> i) runs over (3,0), (0,1), (1,2), (2,3).
    for (int i = 0, j = 3; i < 4; j = i, i++) {
      const float sx = polygon[i].x;
      const float sy = polygon[i].y;
      const float tx = polygon[j].x;
      const float ty = polygon[j].y;

      // The edge cannot be crossed when the point is above or below it.
      if (py < min(sy, ty)) continue;
      if (py > max(sy, ty)) continue;

      // The point coincides with one end of the edge.
      if ((sx == px && sy == py) || (tx == px && ty == py)) {
        break;
      }

      // This condition implies sy != ty, so the division below is safe.
      if ((sy < py && ty >= py) || (sy >= py && ty < py)) {
        const float x = sx + (py - sy) * (tx - sx) / (ty - sy);
        if (x == px) {
          // The point lies exactly on the edge.
          break;
        }
        if (x > px) {
          nCross++;
        }
      }
    }

    inside_flag[index] = (nCross % 2 == 1) ? 1.0 : 0.0;
    // No `return` here: leaving the loop after the first pair would skip all
    // remaining pairs assigned to this thread.
  }
}

#endif  // POINTS_IN_POLYGONS_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/vacancy/PreciseRoIPooling/blob/master/src/prroi_pooling_gpu_impl.cu
// Distributed under terms of the MIT license.
#ifndef PRROI_POOL_CUDA_KERNEL_CUH
#define PRROI_POOL_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// Reads data[h * width + w], treating every position outside the
// height x width plane as zero.
template <typename T>
__device__ static __forceinline__ T PrRoIPoolingGetData(const T *data,
                                                        const int h,
                                                        const int w,
                                                        const int height,
                                                        const int width) {
  const bool inside = (h >= 0) && (w >= 0) && (h < height) && (w < width);
  T value = 0.0f;
  if (inside) {
    value = data[h * width + w];
  }
  return value;
}

// Interpolation weight for a sample at offset (dh, dw) from a grid point:
// the product (1 - |dh|) * (1 - |dw|).
template <typename T>
__device__ static __forceinline__ T PrRoIPoolingGetCoeff(T dh, T dw) {
  const auto weight_h = 1.0f - abs(dh);
  const auto weight_w = 1.0f - abs(dw);
  return weight_h * weight_w;
}

// Evaluates 0.5 * (t^2 - s^2) * (c2 - c1) + (t - s) * c1 for the given
// limits s, t and coefficients c1, c2.
template <typename T>
__device__ static __forceinline__ T PrRoIPoolingSingleCoorIntegral(T s, T t,
                                                                   T c1, T c2) {
  const auto square_diff = t * t - s * s;
  const auto coeff_diff = c2 - c1;
  const auto constant_part = (t - s) * c1;
  return 0.5 * square_diff * coeff_diff + constant_part;
}

// Bilinearly interpolates `data` at the fractional position (h, w). The four
// surrounding grid points are weighted with PrRoIPoolingGetCoeff; grid points
// outside the height x width plane contribute zero.
template <typename T>
__device__ static T PrRoIPoolingInterpolation(const T *data, const T h,
                                              const T w, const int height,
                                              const int width) {
  // Integer grid lines enclosing the sample position.
  const int h_lo = floorf(h);
  const int w_lo = floorf(w);
  const int h_hi = floorf(h) + 1;
  const int w_hi = floorf(w) + 1;

  T acc = 0.0f;
  acc += PrRoIPoolingGetData(data, h_lo, w_lo, height, width) *
         PrRoIPoolingGetCoeff(h - T(h_lo), w - T(w_lo));
  acc += PrRoIPoolingGetData(data, h_hi, w_lo, height, width) *
         PrRoIPoolingGetCoeff(h - T(h_hi), w - T(w_lo));
  acc += PrRoIPoolingGetData(data, h_lo, w_hi, height, width) *
         PrRoIPoolingGetCoeff(h - T(h_lo), w - T(w_hi));
  acc += PrRoIPoolingGetData(data, h_hi, w_hi, height, width) *
         PrRoIPoolingGetCoeff(h - T(h_hi), w - T(w_hi));
  return acc;
}

// Weighted sum of the four grid values at the corners (s_h, s_w), (s_h, e_w),
// (e_h, s_w) and (e_h, e_w) for the region [x0, x1] x [y0, y1].
//
// Each corner weight is the product of an x factor and a y factor. A factor
// has the form (hi - hi^2 / 2) - (lo - lo^2 / 2), i.e. the antiderivative of
// (1 - t) evaluated between the offsets `lo` and `hi` of the region limits
// from the corner's grid line. Corners outside the h0 x w0 plane read as
// zero.
template <typename T>
__device__ static T PrRoIPoolingMatCalculation(const T *this_data,
                                               const int s_h, const int s_w,
                                               const int e_h, const int e_w,
                                               const T y0, const T x0,
                                               const T y1, const T x1,
                                               const int h0, const int w0) {
  // Offsets of the region limits from the s_w / e_w grid lines.
  const T sx_lo = x0 - T(s_w);
  const T sx_hi = x1 - T(s_w);
  const T ex_lo = T(e_w) - x1;
  const T ex_hi = T(e_w) - x0;
  // Offsets of the region limits from the s_h / e_h grid lines.
  const T sy_lo = y0 - T(s_h);
  const T sy_hi = y1 - T(s_h);
  const T ey_lo = T(e_h) - y1;
  const T ey_hi = T(e_h) - y0;

  const auto wx_s =
      sx_hi - 0.5f * sx_hi * sx_hi - sx_lo + 0.5f * sx_lo * sx_lo;
  const auto wx_e =
      ex_hi - 0.5f * ex_hi * ex_hi - ex_lo + 0.5f * ex_lo * ex_lo;
  const auto wy_s =
      sy_hi - 0.5f * sy_hi * sy_hi - sy_lo + 0.5f * sy_lo * sy_lo;
  const auto wy_e =
      ey_hi - 0.5f * ey_hi * ey_hi - ey_lo + 0.5f * ey_lo * ey_lo;

  const T w_ss = wx_s * wy_s;
  const T w_se = wx_e * wy_s;
  const T w_es = wx_s * wy_e;
  const T w_ee = wx_e * wy_e;

  T total = 0;
  total += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * w_ss;
  total += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * w_se;
  total += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * w_es;
  total += PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * w_ee;
  return total;
}

// Atomically adds top_diff * coeff to diff[h * width + w]. Positions outside
// the height x width plane are ignored.
template <typename T>
__device__ static void PrRoIPoolingDistributeDiff(T *diff, const T top_diff,
                                                  const int h, const int w,
                                                  const int height,
                                                  const int width,
                                                  const T coeff) {
  const bool inside = (h >= 0) && (w >= 0) && (h < height) && (w < width);
  if (inside) {
    atomicAdd(diff + h * width + w, top_diff * coeff);
  }
}

// Scatters `top_diff` to the four grid positions (s_h, s_w), (s_h, e_w),
// (e_h, s_w) and (e_h, e_w) for the region [x0, x1] x [y0, y1].
//
// Each corner weight is the product of an x factor and a y factor. A factor
// has the form (hi - hi^2 / 2) - (lo - lo^2 / 2), i.e. the antiderivative of
// (1 - t) evaluated between the offsets `lo` and `hi` of the region limits
// from the corner's grid line. The weighted value is accumulated through
// PrRoIPoolingDistributeDiff, which skips positions outside the h0 x w0 plane.
template <typename T>
__device__ static void PrRoIPoolingMatDistributeDiff(
    T *diff, const T top_diff, const int s_h, const int s_w, const int e_h,
    const int e_w, const T y0, const T x0, const T y1, const T x1, const int h0,
    const int w0) {
  // Offsets of the region limits from the s_w / e_w grid lines.
  const T sx_lo = x0 - T(s_w);
  const T sx_hi = x1 - T(s_w);
  const T ex_lo = T(e_w) - x1;
  const T ex_hi = T(e_w) - x0;
  // Offsets of the region limits from the s_h / e_h grid lines.
  const T sy_lo = y0 - T(s_h);
  const T sy_hi = y1 - T(s_h);
  const T ey_lo = T(e_h) - y1;
  const T ey_hi = T(e_h) - y0;

  const auto wx_s =
      sx_hi - 0.5f * sx_hi * sx_hi - sx_lo + 0.5f * sx_lo * sx_lo;
  const auto wx_e =
      ex_hi - 0.5f * ex_hi * ex_hi - ex_lo + 0.5f * ex_lo * ex_lo;
  const auto wy_s =
      sy_hi - 0.5f * sy_hi * sy_hi - sy_lo + 0.5f * sy_lo * sy_lo;
  const auto wy_e =
      ey_hi - 0.5f * ey_hi * ey_hi - ey_lo + 0.5f * ey_lo * ey_lo;

  const T w_ss = wx_s * wy_s;
  const T w_se = wx_e * wy_s;
  const T w_es = wx_s * wy_e;
  const T w_ee = wx_e * wy_e;

  PrRoIPoolingDistributeDiff(diff, top_diff, s_h, s_w, h0, w0, w_ss);
  PrRoIPoolingDistributeDiff(diff, top_diff, s_h, e_w, h0, w0, w_se);
  PrRoIPoolingDistributeDiff(diff, top_diff, e_h, s_w, h0, w0, w_es);
  PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, w_ee);
}

// Forward pass of PrRoI pooling: one output element per loop iteration.
//
// Args:
//   nthreads:      number of output elements.
//   input:         feature maps, indexed as [batch][channel][height][width].
//   rois:          five values per RoI: batch index, x1, y1, x2, y2.
//   output:        pooled result, indexed as [roi][channel][ph][pw].
//   pooled_height: number of bins along y.
//   pooled_width:  number of bins along x.
//   spatial_scale: factor applied to the RoI coordinates.
//   channels, height, width: dimensions of `input`.
//
// Each output is the sum of PrRoIPoolingMatCalculation over all unit grid
// cells touched by the bin, divided by the bin area. Bins with zero area
// produce 0.
template <typename T>
__global__ void prroi_pool_forward_cuda_kernel(
    const int nthreads, const T *input, const T *rois, T *output,
    const int pooled_height, const int pooled_width, const T spatial_scale,
    const int channels, const int height, const int width) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Unflatten the output index into (n, c, ph, pw).
    auto rest = index;
    const int pw = rest % pooled_width;
    rest /= pooled_width;
    const int ph = rest % pooled_height;
    rest /= pooled_height;
    const int c = rest % channels;
    const int n = rest / channels;

    const T *roi = rois + n * 5;
    const int batch_idx = roi[0];

    // RoI corners in feature-map coordinates.
    const T x_lo = roi[1] * spatial_scale;
    const T y_lo = roi[2] * spatial_scale;
    const T x_hi = roi[3] * spatial_scale;
    const T y_hi = roi[4] * spatial_scale;

    const T roi_w = max(x_hi - x_lo, ((T)0.0));
    const T roi_h = max(y_hi - y_lo, ((T)0.0));
    const T cell_h = roi_h / static_cast<T>(pooled_height);
    const T cell_w = roi_w / static_cast<T>(pooled_width);

    // Feature plane of the RoI's batch element and this channel.
    const T *plane = input + (batch_idx * channels + c) * height * width;

    // Extent of the bin (ph, pw).
    const T cell_x1 = x_lo + cell_w * pw;
    const T cell_y1 = y_lo + cell_h * ph;
    const T cell_x2 = cell_x1 + cell_w;
    const T cell_y2 = cell_y1 + cell_h;

    const T cell_area = max(T(0.0), cell_w * cell_h);
    if (cell_area == 0) {
      // Degenerate bin: nothing to integrate.
      output[index] = 0;
      continue;
    }

    // Range of unit grid cells that overlap the bin.
    const int col_begin = floorf(cell_x1);
    const int col_end = ceilf(cell_x2);
    const int row_begin = floorf(cell_y1);
    const int row_end = ceilf(cell_y2);

    T acc = 0;
    for (int gx = col_begin; gx < col_end; ++gx) {
      for (int gy = row_begin; gy < row_end; ++gy) {
        // The integration limits are the bin clipped to the grid cell.
        acc += PrRoIPoolingMatCalculation(
            plane, gy, gx, gy + 1, gx + 1, max(cell_y1, T(gy)),
            max(cell_x1, T(gx)), min(cell_y2, T(gy) + 1.0f),
            min(cell_x2, T(gx + 1.0f)), height, width);
      }
    }
    output[index] = acc / cell_area;
  }
}

// Backward pass of PrRoI pooling with respect to the input feature map.
//
// Every loop iteration handles one pooled output element `index`, decoded as
// (n, c, ph, pw). Each RoI occupies 5 consecutive values in `rois`:
// (batch index, x1, y1, x2, y2); the coordinates are multiplied by
// spatial_scale before use. The incoming gradient of the element is divided
// by the bin area and handed, unit cell by unit cell, to
// PrRoIPoolingMatDistributeDiff together with the part of the cell covered by
// the bin.
//
// NOTE(review): PrRoIPoolingMatDistributeDiff and CUDA_1D_KERNEL_LOOP are
// defined outside this block; the helper presumably accumulates into
// grad_input -- confirm against its definition.
template <typename T>
__global__ void prroi_pool_backward_cuda_kernel(
    const int nthreads, const T *grad_output, const T *rois, T *grad_input,
    const int pooled_height, const int pooled_width, const T spatial_scale,
    const int channels, const int height, const int width) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Decode the flat output index into (n, c, ph, pw).
    const int pw = index % pooled_width;
    const int ph = (index / pooled_width) % pooled_height;
    const int c = (index / pooled_width / pooled_height) % channels;
    const int n = index / pooled_width / pooled_height / channels;

    // RoI record: (batch index, x1, y1, x2, y2).
    const T *roi = rois + n * 5;
    const int batch_idx = roi[0];
    const T x1 = roi[1] * spatial_scale;
    const T y1 = roi[2] * spatial_scale;
    const T x2 = roi[3] * spatial_scale;
    const T y2 = roi[4] * spatial_scale;

    // Negative extents are clamped to zero.
    const T roi_w = max(x2 - x1, (T)0);
    const T roi_h = max(y2 - y1, (T)0);
    const T bin_h = roi_h / static_cast<T>(pooled_height);
    const T bin_w = roi_w / static_cast<T>(pooled_width);

    // Gradient plane of channel c in image batch_idx.
    T *grad_plane = grad_input + (batch_idx * channels + c) * height * width;

    // Continuous extent of bin (ph, pw).
    const T bin_x1 = x1 + bin_w * pw;
    const T bin_y1 = y1 + bin_h * ph;
    const T bin_x2 = bin_x1 + bin_w;
    const T bin_y2 = bin_y1 + bin_h;

    const T bin_area = max(T(0.0), bin_w * bin_h);

    // Gradient per unit of bin area; an empty bin contributes nothing and the
    // output gradient is not even read in that case.
    T grad_per_area;
    if (bin_area == T(0)) {
      grad_per_area = T(0);
    } else {
      grad_per_area = grad_output[index] / bin_area;
    }

    // Integer cell range overlapped by the bin.
    const int x_begin = floorf(bin_x1);
    const int x_end = ceilf(bin_x2);
    const int y_begin = floorf(bin_y1);
    const int y_end = ceilf(bin_y2);

    for (int cell_x = x_begin; cell_x < x_end; ++cell_x) {
      for (int cell_y = y_begin; cell_y < y_end; ++cell_y) {
        // Cell corners followed by the cell/bin intersection.
        PrRoIPoolingMatDistributeDiff(
            grad_plane, grad_per_area, cell_y, cell_x, cell_y + 1, cell_x + 1,
            max(bin_y1, T(cell_y)), max(bin_x1, T(cell_x)),
            min(bin_y2, T(cell_y) + 1.0f), min(bin_x2, T(cell_x + 1.0f)),
            height, width);
      }
    }
  }
}

// Backward pass of PrRoI pooling with respect to the RoI coordinates.
//
// Every loop iteration handles one pooled output element `index`, decoded as
// (n, c, ph, pw). `rois` and `grad_rois` hold 5 values per RoI:
// (batch index, x1, y1, x2, y2); the coordinates are multiplied by
// spatial_scale before use. The contributions of all (c, ph, pw) elements of
// one RoI are combined with atomicAdd on grad_rois[n * 5 + 1 .. n * 5 + 4];
// slot 0 (the batch index) is overwritten with 0.
//
// NOTE(review): coordinate slots are only ever added to here, so grad_rois
// presumably has to be zero-initialised by the caller -- confirm.
// NOTE(review): PrRoIPoolingInterpolation, PrRoIPoolingSingleCoorIntegral and
// CUDA_1D_KERNEL_LOOP are defined outside this block; the comments below
// describe only how they are called here.
template <typename T>
__global__ void prroi_pool_coor_backward_cuda_kernel(
    const int nthreads, const T *output, const T *grad_output, const T *input,
    const T *rois, T *grad_rois, const int pooled_height,
    const int pooled_width, const T spatial_scale, const int channels,
    const int height, const int width) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;
    auto rois_cur = rois + n * 5;

    // RoI record: (batch index, x1, y1, x2, y2), scaled to feature-map units.
    int roi_batch_ind = rois_cur[0];
    T roi_x1 = rois_cur[1] * spatial_scale;
    T roi_y1 = rois_cur[2] * spatial_scale;
    T roi_x2 = rois_cur[3] * spatial_scale;
    T roi_y2 = rois_cur[4] * spatial_scale;

    // Negative extents are clamped to zero.
    T roi_width = max(roi_x2 - roi_x1, (T)0);
    T roi_height = max(roi_y2 - roi_y1, (T)0);
    T bin_size_h = roi_height / static_cast<T>(pooled_height);
    T bin_size_w = roi_width / static_cast<T>(pooled_width);

    const T output_grad_val = grad_output[index];
    // Feature plane of channel c in image roi_batch_ind.
    const T *this_input_data =
        input + (roi_batch_ind * channels + c) * height * width;
    const T output_val = output[index];
    T *this_rois_grad = grad_rois + n * 5;

    // Continuous extent of bin (ph, pw).
    T bin_x1 = roi_x1 + bin_size_w * pw;
    T bin_y1 = roi_y1 + bin_size_h * ph;
    T bin_x2 = bin_x1 + bin_size_w;
    T bin_y2 = bin_y1 + bin_size_h;

    T bin_size = max(T(0.0), bin_size_w * bin_size_h);

    // sum_out is used only for the early-exit test below; it is zero when the
    // bin is empty or when the incoming gradient is zero.
    T sum_out = bin_size == T(0) ? T(0) : output_grad_val / bin_size;

    // WARNING: to be discussed
    // Skipping here also guarantees bin_size != 0 for the divisions further
    // down. As a side effect this_rois_grad[0] is not written by this
    // element.
    if (sum_out == 0) continue;

    int start_x, start_y, end_x, end_y;

    // Integer cell range overlapped by the bin.
    start_x = floorf(bin_x1);
    end_x = ceilf(bin_x2);
    start_y = floorf(bin_y1);
    end_y = ceilf(bin_y2);

    // Accumulators for the four bin edges: x = bin_x1, x = bin_x2,
    // y = bin_y1 and y = bin_y2 respectively.
    T grad_x1_y = 0, grad_x2_y = 0, grad_x_y1 = 0, grad_x_y2 = 0;
    // Walk the vertical edges one unit cell at a time. The first two
    // arguments are the covered interval relative to the cell origin; the
    // last two are input samples taken at the cell's two ends on that edge.
    for (int bin_y = start_y; bin_y < end_y; ++bin_y) {
      grad_x1_y += PrRoIPoolingSingleCoorIntegral(
          max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y,
          PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x1,
                                    height, width),
          PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x1,
                                    height, width));

      grad_x2_y += PrRoIPoolingSingleCoorIntegral(
          max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y,
          PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x2,
                                    height, width),
          PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x2,
                                    height, width));
    }

    // Same for the horizontal edges.
    for (int bin_x = start_x; bin_x < end_x; ++bin_x) {
      grad_x_y1 += PrRoIPoolingSingleCoorIntegral(
          max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x,
          PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x),
                                    height, width),
          PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x + 1),
                                    height, width));

      grad_x_y2 += PrRoIPoolingSingleCoorIntegral(
          max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x,
          PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x),
                                    height, width),
          PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x + 1),
                                    height, width));
    }

    // Combine each edge term with (edge length * pooled value); the signs are
    // opposite for the start edges (x1, y1) and the end edges (x2, y2).
    T partial_x1 = -grad_x1_y + (bin_y2 - bin_y1) * output_val;
    T partial_y1 = -grad_x_y1 + (bin_x2 - bin_x1) * output_val;
    T partial_x2 = grad_x2_y - (bin_y2 - bin_y1) * output_val;
    T partial_y2 = grad_x_y2 - (bin_x2 - bin_x1) * output_val;

    // Normalise by the bin area and undo the spatial_scale applied to the RoI
    // coordinates above.
    partial_x1 = partial_x1 / bin_size * spatial_scale;
    partial_x2 = partial_x2 / bin_size * spatial_scale;
    partial_y1 = partial_y1 / bin_size * spatial_scale;
    partial_y2 = partial_y2 / bin_size * spatial_scale;

    // (index, x1, y1, x2, y2)
    // Plain (non-atomic) store: every thread that reaches this point writes
    // the same value 0 to the batch-index slot.
    this_rois_grad[0] = 0;
    // Chain rule from bin edges to RoI corners. With
    //   bin_x1 = roi_x1 + roi_width * pw / pooled_width
    //   bin_x2 = roi_x1 + roi_width * (pw + 1) / pooled_width
    // the weight of roi_x1 is (1 - k / pooled_width) and the weight of roi_x2
    // is (k / pooled_width) for k = pw and k = pw + 1; likewise for y.
    atomicAdd(this_rois_grad + 1,
              (partial_x1 * (1.0f - T(pw) / pooled_width) +
               partial_x2 * (1.0f - T(pw + 1) / pooled_width)) *
                  output_grad_val);
    atomicAdd(this_rois_grad + 2,
              (partial_y1 * (1.0f - T(ph) / pooled_height) +
               partial_y2 * (1.0f - T(ph + 1) / pooled_height)) *
                  output_grad_val);
    atomicAdd(this_rois_grad + 3, (partial_x2 * T(pw + 1) / pooled_width +
                                   partial_x1 * T(pw) / pooled_width) *
                                      output_grad_val);
    atomicAdd(this_rois_grad + 4, (partial_y2 * T(ph + 1) / pooled_height +
                                   partial_y1 * T(ph) / pooled_height) *
                                      output_grad_val);
  }
}

#endif  // ROI_POOL_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/psamask_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef PSAMASK_CUDA_KERNEL_CUH
#define PSAMASK_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// CUDA: grid stride looping.
// Each thread starts at its global 1-D index
// (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total number of
// threads in the grid (blockDim.x * gridDim.x) until it reaches n.
// The definition is skipped when an earlier header already provides
// CUDA_KERNEL_LOOP.
#ifndef CUDA_KERNEL_LOOP
#define CUDA_KERNEL_LOOP(i, n)                                 \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)
#endif

// PSA mask forward pass, "collect" mode.
//
// Every loop iteration handles one feature position (n, h, w) decoded from
// the flat index. For each mask cell (mh, mw) whose associated feature
// position (mh + h - half_h_mask, mw + w - half_w_mask) lies inside the
// feature map, the value
//   mask_data[n][mh * w_mask + mw][h][w]
// is copied to
//   buffer_data[n][associated position][h][w].
// Buffer entries that no mask cell maps to are not written by this kernel.
template <typename T>
__global__ void psamask_collect_forward_cuda(
    const int nthreads, const int h_feature, const int w_feature,
    const int h_mask, const int w_mask, const int half_h_mask,
    const int half_w_mask, const T* mask_data, T* buffer_data) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // Unpack the flat index into (batch, row, col) on the feature map.
    const int col = index % w_feature;
    const int row = (index / w_feature) % h_feature;
    const int batch = index / w_feature / h_feature;

    // Clip the mask window, in mask coordinates, so that the associated
    // feature position stays inside the feature map.
    const int mh_begin = max(0, half_h_mask - row);
    const int mh_end = min(h_mask, h_feature + half_h_mask - row);
    const int mw_begin = max(0, half_w_mask - col);
    const int mw_end = min(w_mask, w_feature + half_w_mask - col);

    for (int mh = mh_begin; mh < mh_end; ++mh) {
      // Feature row associated with mask row mh.
      const int feat_row = mh + row - half_h_mask;
      for (int mw = mw_begin; mw < mw_end; ++mw) {
        // Feature column associated with mask column mw.
        const int feat_col = mw + col - half_w_mask;

        const int src =
            ((batch * h_mask * w_mask + mh * w_mask + mw) * h_feature + row) *
                w_feature +
            col;
        const int dst = (batch * h_feature * w_feature + feat_row * w_feature +
                         feat_col) *
                            h_feature * w_feature +
                        row * w_feature + col;

        buffer_data[dst] = mask_data[src];
      }
    }
  }
}

// PSA mask forward pass, "distribute" mode.
//
// Every loop iteration handles one feature position (n, h, w) decoded from
// the flat index. For each mask cell (mh, mw) whose associated feature
// position (mh + h - half_h_mask, mw + w - half_w_mask) lies inside the
// feature map, the value
//   mask_data[n][mh * w_mask + mw][h][w]
// is copied to
//   buffer_data[n][h * w_feature + w][associated position].
// Buffer entries that no mask cell maps to are not written by this kernel.
template <typename T>
__global__ void psamask_distribute_forward_cuda(
    const int nthreads, const int h_feature, const int w_feature,
    const int h_mask, const int w_mask, const int half_h_mask,
    const int half_w_mask, const T* mask_data, T* buffer_data) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // Unpack the flat index into (batch, row, col) on the feature map.
    const int col = index % w_feature;
    const int row = (index / w_feature) % h_feature;
    const int batch = index / w_feature / h_feature;

    // Clip the mask window, in mask coordinates, so that the associated
    // feature position stays inside the feature map.
    const int mh_begin = max(0, half_h_mask - row);
    const int mh_end = min(h_mask, h_feature + half_h_mask - row);
    const int mw_begin = max(0, half_w_mask - col);
    const int mw_end = min(w_mask, w_feature + half_w_mask - col);

    for (int mh = mh_begin; mh < mh_end; ++mh) {
      // Feature row associated with mask row mh.
      const int feat_row = mh + row - half_h_mask;
      for (int mw = mw_begin; mw < mw_end; ++mw) {
        // Feature column associated with mask column mw.
        const int feat_col = mw + col - half_w_mask;

        const int src =
            ((batch * h_mask * w_mask + mh * w_mask + mw) * h_feature + row) *
                w_feature +
            col;
        const int dst =
            (batch * h_feature * w_feature + row * w_feature + col) *
                h_feature * w_feature +
            feat_row * w_feature + feat_col;

        buffer_data[dst] = mask_data[src];
      }
    }
  }
}

// PSA mask backward pass, "collect" mode.
//
// Every loop iteration handles one feature position (n, h, w) decoded from
// the flat index. For each mask cell (mh, mw) whose associated feature
// position (mh + h - half_h_mask, mw + w - half_w_mask) lies inside the
// feature map, the gradient
//   buffer_diff[n][associated position][h][w]
// is copied to
//   mask_diff[n][mh * w_mask + mw][h][w].
// mask_diff entries outside the clipped window are not written by this
// kernel.
template <typename T>
__global__ void psamask_collect_backward_cuda(
    const int nthreads, const int h_feature, const int w_feature,
    const int h_mask, const int w_mask, const int half_h_mask,
    const int half_w_mask, const T* buffer_diff, T* mask_diff) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // Unpack the flat index into (batch, row, col) on the feature map.
    const int col = index % w_feature;
    const int row = (index / w_feature) % h_feature;
    const int batch = index / w_feature / h_feature;

    // Clip the mask window, in mask coordinates, so that the associated
    // feature position stays inside the feature map.
    const int mh_begin = max(0, half_h_mask - row);
    const int mh_end = min(h_mask, h_feature + half_h_mask - row);
    const int mw_begin = max(0, half_w_mask - col);
    const int mw_end = min(w_mask, w_feature + half_w_mask - col);

    for (int mh = mh_begin; mh < mh_end; ++mh) {
      // Feature row associated with mask row mh.
      const int feat_row = mh + row - half_h_mask;
      for (int mw = mw_begin; mw < mw_end; ++mw) {
        // Feature column associated with mask column mw.
        const int feat_col = mw + col - half_w_mask;

        const int dst =
            ((batch * h_mask * w_mask + mh * w_mask + mw) * h_feature + row) *
                w_feature +
            col;
        const int src = (batch * h_feature * w_feature + feat_row * w_feature +
                         feat_col) *
                            h_feature * w_feature +
                        row * w_feature + col;

        mask_diff[dst] = buffer_diff[src];
      }
    }
  }
}

// PSA mask backward pass, "distribute" mode.
//
// Every loop iteration handles one feature position (n, h, w) decoded from
// the flat index. For each mask cell (mh, mw) whose associated feature
// position (mh + h - half_h_mask, mw + w - half_w_mask) lies inside the
// feature map, the gradient
//   buffer_diff[n][h * w_feature + w][associated position]
// is copied to
//   mask_diff[n][mh * w_mask + mw][h][w].
// mask_diff entries outside the clipped window are not written by this
// kernel.
template <typename T>
__global__ void psamask_distribute_backward_cuda(
    const int nthreads, const int h_feature, const int w_feature,
    const int h_mask, const int w_mask, const int half_h_mask,
    const int half_w_mask, const T* buffer_diff, T* mask_diff) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // Unpack the flat index into (batch, row, col) on the feature map.
    const int col = index % w_feature;
    const int row = (index / w_feature) % h_feature;
    const int batch = index / w_feature / h_feature;

    // Clip the mask window, in mask coordinates, so that the associated
    // feature position stays inside the feature map.
    const int mh_begin = max(0, half_h_mask - row);
    const int mh_end = min(h_mask, h_feature + half_h_mask - row);
    const int mw_begin = max(0, half_w_mask - col);
    const int mw_end = min(w_mask, w_feature + half_w_mask - col);

    for (int mh = mh_begin; mh < mh_end; ++mh) {
      // Feature row associated with mask row mh.
      const int feat_row = mh + row - half_h_mask;
      for (int mw = mw_begin; mw < mw_end; ++mw) {
        // Feature column associated with mask column mw.
        const int feat_col = mw + col - half_w_mask;

        const int dst =
            ((batch * h_mask * w_mask + mh * w_mask + mw) * h_feature + row) *
                w_feature +
            col;
        const int src =
            (batch * h_feature * w_feature + row * w_feature + col) *
                h_feature * w_feature +
            feat_row * w_feature + feat_col;

        mask_diff[dst] = buffer_diff[src];
      }
    }
  }
}

#endif  // PSAMASK_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh
================================================
// Modified from
// https://github.com/csuhan/ReDet/blob/master/mmdet/ops/riroi_align/src/riroi_align_kernel.cu
#ifndef RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
#define RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH

#include <float.h>
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else  // MMCV_USE_PARROTS
#include "pytorch_cuda_helper.hpp"
#endif  // MMCV_USE_PARROTS

/*** Forward ***/
// Forward pass of rotation-invariant RoI Align on rotated boxes.
//
// Every loop iteration produces one element top_data[index], where index
// decodes to (n, c, o, ph, pw) with o an orientation channel. Each RoI
// occupies 6 consecutive values in bottom_rois:
// (batch index, center_w, center_h, width, height, theta); theta is used in
// radians. The output is the average, over a grid of sample points inside the
// rotated bin, of a blend of two neighbouring orientation channels of
// bottom_data, weighted by the fractional part of
// theta * num_orientations / (2 * pi).
//
// num_samples > 0 fixes the sampling grid to num_samples x num_samples per
// bin; otherwise the grid is ceil(roi_size / pooled_size) per axis.
//
// NOTE(review): bilinear_interpolate and CUDA_1D_KERNEL_LOOP are defined
// outside this block.
template <typename scalar_t>
__global__ void riroi_align_rotated_forward_cuda_kernel(
    const int nthreads, const scalar_t *bottom_data,
    const scalar_t *bottom_rois, const scalar_t spatial_scale,
    const int num_samples, const bool clockwise, const int channels,
    const int height, const int width, const int pooled_height,
    const int pooled_width, const int num_orientations, scalar_t *top_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, o, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int o = (index / pooled_width / pooled_height) % num_orientations;
    int c =
        (index / pooled_width / pooled_height / num_orientations) % channels;
    int n = index / pooled_width / pooled_height / num_orientations / channels;

    const scalar_t *offset_bottom_rois = bottom_rois + n * 6;
    int roi_batch_ind = offset_bottom_rois[0];

    // Do not use rounding; this implementation detail is critical
    scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale;
    scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale;
    scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;
    scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;
    // theta is used directly (radians); the degree conversion below is
    // commented out.
    // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;
    scalar_t theta = offset_bottom_rois[5];
    // Force malformed ROIs to be 1x1
    roi_width = max(roi_width, (scalar_t)1.);
    roi_height = max(roi_height, (scalar_t)1.);
    scalar_t bin_size_h = static_cast<scalar_t>(roi_height) /
                          static_cast<scalar_t>(pooled_height);
    scalar_t bin_size_w =
        static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);

    // find aligned index
    // ind_float expresses theta in units of one orientation step
    // (2 * pi / num_orientations); l_var is its fractional part and r_var
    // the complement. They are the blend weights used in the sampling loop.
    scalar_t ind_float = theta * num_orientations / (2 * M_PI);
    int ind = floorf(ind_float);
    scalar_t l_var = ind_float - (scalar_t)ind;
    scalar_t r_var = 1.0 - l_var;
    // correct start channel
    ind = (ind + num_orientations) % num_orientations;
    // rotated channel
    // ind_rot is output orientation o shifted back by ind (with wrap-around);
    // ind_rot_plus is the next orientation channel after it.
    int ind_rot = (o - ind + num_orientations) % num_orientations;
    int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations;
    // Feature planes of the two orientation channels that get blended.
    const scalar_t *offset_bottom_data =
        bottom_data + (roi_batch_ind * channels * num_orientations +
                       c * num_orientations + ind_rot) *
                          height * width;

    const scalar_t *offset_bottom_data_plus =
        bottom_data + (roi_batch_ind * channels * num_orientations +
                       c * num_orientations + ind_rot_plus) *
                          height * width;
    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (num_samples > 0)
                             ? num_samples
                             : ceilf(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width);

    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    if (clockwise) {
      theta = -theta;  // If clockwise, the angle needs to be reversed.
    }
    scalar_t roi_start_h = -roi_height / 2.0;
    scalar_t roi_start_w = -roi_width / 2.0;
    scalar_t cosscalar_theta = cos(theta);
    scalar_t sinscalar_theta = sin(theta);

    // We do average (integral) pooling inside a bin
    // max(..., 1) keeps the divisor non-zero.
    const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4

    scalar_t output_val = 0.;
    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1
      // Sample offset from the RoI center along the RoI's own height axis
      // (before rotation); samples sit at sub-cell centers (iy + 0.5).
      const scalar_t yy =
          roi_start_h + ph * bin_size_h +
          static_cast<scalar_t>(iy + .5f) * bin_size_h /
              static_cast<scalar_t>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const scalar_t xx = roi_start_w + pw * bin_size_w +
                            static_cast<scalar_t>(ix + .5f) * bin_size_w /
                                static_cast<scalar_t>(roi_bin_grid_w);

        // Rotate by theta (counterclockwise) around the center and translate
        scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h;
        scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w;

        // Sample both orientation channels at the same location and blend.
        scalar_t val = bilinear_interpolate<scalar_t>(
            offset_bottom_data, height, width, y, x, index);
        scalar_t val_plus = bilinear_interpolate<scalar_t>(
            offset_bottom_data_plus, height, width, y, x, index);
        output_val += r_var * val + l_var * val_plus;
      }
    }
    output_val /= count;

    top_data[index] = output_val;
  }
}

/*** Backward ***/
// Backward pass of rotation-invariant RoI Align on rotated boxes.
//
// Every loop iteration takes the gradient of one pooled output element,
// decoded from index as (n, c, o, ph, pw), and scatters it with atomicAdd
// into bottom_diff. The sample locations and the two orientation channels
// (ind_rot, ind_rot_plus) with weights (r_var, l_var) are recomputed with the
// same formulas as the forward kernel. Each RoI occupies 6 consecutive values
// in bottom_rois: (batch index, center_w, center_h, width, height, theta).
//
// NOTE(review): bottom_diff is only ever added to here, so it presumably has
// to be zero-initialised by the caller -- confirm.
// NOTE(review): bilinear_interpolate_gradient and CUDA_1D_KERNEL_LOOP are
// defined outside this block.
template <typename scalar_t>
__global__ void riroi_align_rotated_backward_cuda_kernel(
    const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,
    const scalar_t spatial_scale, const int num_samples, const bool clockwise,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width, const int num_orientations,
    scalar_t *bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, o, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int o = (index / pooled_width / pooled_height) % num_orientations;
    int c =
        (index / pooled_width / pooled_height / num_orientations) % channels;
    int n = index / pooled_width / pooled_height / num_orientations / channels;

    const scalar_t *offset_bottom_rois = bottom_rois + n * 6;
    int roi_batch_ind = offset_bottom_rois[0];

    // Do not round
    scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale;
    scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale;
    scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;
    scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;
    // theta is used directly (radians); the degree conversion below is
    // commented out.
    // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;
    scalar_t theta = offset_bottom_rois[5];
    // Force malformed ROIs to be 1x1
    roi_width = max(roi_width, (scalar_t)1.);
    roi_height = max(roi_height, (scalar_t)1.);

    scalar_t bin_size_h = static_cast<scalar_t>(roi_height) /
                          static_cast<scalar_t>(pooled_height);
    scalar_t bin_size_w =
        static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);

    // find aligned index
    // ind_float expresses theta in units of one orientation step
    // (2 * pi / num_orientations); l_var is its fractional part and r_var
    // the complement. They weight the two orientation channels below.
    scalar_t ind_float = theta * num_orientations / (2 * M_PI);
    int ind = floorf(ind_float);
    scalar_t l_var = ind_float - (scalar_t)ind;
    scalar_t r_var = 1.0 - l_var;
    // correct start channel
    ind = (ind + num_orientations) % num_orientations;
    // rotated channel
    int ind_rot = (o - ind + num_orientations) % num_orientations;
    int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations;
    // Gradient planes of the two orientation channels that receive the
    // gradient.
    scalar_t *offset_bottom_diff =
        bottom_diff + (roi_batch_ind * channels * num_orientations +
                       c * num_orientations + ind_rot) *
                          height * width;
    scalar_t *offset_bottom_diff_plus =
        bottom_diff + (roi_batch_ind * channels * num_orientations +
                       c * num_orientations + ind_rot_plus) *
                          height * width;
    // top_offset + ph * pooled_width + pw addresses the (n, c, o, ph, pw)
    // element of top_diff.
    int top_offset =
        (n * channels * num_orientations + c * num_orientations + o) *
        pooled_height * pooled_width;
    const scalar_t *offset_top_diff = top_diff + top_offset;
    const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (num_samples > 0)
                             ? num_samples
                             : ceilf(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width);

    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    if (clockwise) {
      theta = -theta;  // If clockwise, the angle needs to be reversed.
    }
    scalar_t roi_start_h = -roi_height / 2.0;
    scalar_t roi_start_w = -roi_width / 2.0;
    scalar_t cosTheta = cos(theta);
    scalar_t sinTheta = sin(theta);

    // We do average (integral) pooling inside a bin
    // count is only used as a divisor inside the loops below, which do not
    // run when either grid dimension is <= 0.
    const scalar_t count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4

    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1
      const scalar_t yy =
          roi_start_h + ph * bin_size_h +
          static_cast<scalar_t>(iy + .5f) * bin_size_h /
              static_cast<scalar_t>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const scalar_t xx = roi_start_w + pw * bin_size_w +
                            static_cast<scalar_t>(ix + .5f) * bin_size_w /
                                static_cast<scalar_t>(roi_bin_grid_w);

        // Rotate by theta around the center and translate
        scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h;
        scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w;

        // Filled in by bilinear_interpolate_gradient (presumably through
        // reference parameters -- confirm against its definition).
        scalar_t w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;

        bilinear_interpolate_gradient<scalar_t>(height, width, y, x, w1, w2, w3,
                                                w4, x_low, x_high, y_low,
                                                y_high, index);

        // Share of this bin's gradient per corner for one sample point.
        scalar_t g1 = top_diff_this_bin * w1 / count;
        scalar_t g2 = top_diff_this_bin * w2 / count;
        scalar_t g3 = top_diff_this_bin * w3 / count;
        scalar_t g4 = top_diff_this_bin * w4 / count;

        // Negative indices are skipped (presumably the helper's marker for a
        // sample outside the feature map -- confirm).
        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          // Channel ind_rot receives weight r_var ...
          atomicAdd(offset_bottom_diff + y_low * width + x_low, g1 * r_var);
          atomicAdd(offset_bottom_diff + y_low * width + x_high, g2 * r_var);
          atomicAdd(offset_bottom_diff + y_high * width + x_low, g3 * r_var);
          atomicAdd(offset_bottom_diff + y_high * width + x_high, g4 * r_var);

          // ... and channel ind_rot_plus receives weight l_var.
          atomicAdd(offset_bottom_diff_plus + y_low * width + x_low,
                    g1 * l_var);
          atomicAdd(offset_bottom_diff_plus + y_low * width + x_high,
                    g2 * l_var);
          atomicAdd(offset_bottom_diff_plus + y_high * width + x_low,
                    g3 * l_var);
          atomicAdd(offset_bottom_diff_plus + y_high * width + x_high,
                    g4 * l_var);

        }  // if
      }  // ix
    }  // iy
  }  // CUDA_1D_KERNEL_LOOP
}  // RiRoIAlignBackward

#endif  // RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/roi_align_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ROI_ALIGN_CUDA_KERNEL_CUH
#define ROI_ALIGN_CUDA_KERNEL_CUH

#include <float.h>
#ifdef MMCV_WITH_TRT
#include "common_cuda_helper.hpp"
#else  // MMCV_WITH_TRT
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else  // MMCV_USE_PARROTS
#include "pytorch_cuda_helper.hpp"
#endif  // MMCV_USE_PARROTS
#endif  // MMCV_WITH_TRT

/*** Forward ***/
// Forward pass of RoI Align.
//
// Every loop iteration produces one pooled element output[index], where index
// decodes to (n, c, ph, pw). Each RoI occupies 5 consecutive values in rois:
// (batch index, x1, y1, x2, y2), multiplied by spatial_scale before use.
//
//   pool_mode == 0: max over the sample points of the bin; the coordinates of
//                   the winning sample are stored in argmax_y / argmax_x.
//   pool_mode == 1: average over the sample points; argmax_* are not touched.
//   any other value: nothing is written for this element.
//
// aligned == true shifts the scaled coordinates by -0.5; aligned == false
// keeps them as they are and forces the RoI to be at least 1x1.
// sampling_ratio > 0 fixes the sampling grid per bin; otherwise the grid is
// ceil(roi_size / pooled_size) per axis.
//
// NOTE(review): bilinear_interpolate and CUDA_1D_KERNEL_LOOP are defined
// outside this block.
template <typename T>
__global__ void roi_align_forward_cuda_kernel(
    const int nthreads, const T* input, const T* rois, T* output, T* argmax_y,
    T* argmax_x, const int pooled_height, const int pooled_width,
    const T spatial_scale, const int sampling_ratio,
    const int pool_mode,  // 0 - max pool, 1 - avg pool
    const bool aligned, const int channels, const int height, const int width) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T* offset_rois = rois + n * 5;
    int roi_batch_ind = offset_rois[0];

    // Do not use rounding; this implementation detail is critical
    T offset = aligned ? (T)0.5 : (T)0.0;
    T roi_start_w = offset_rois[1] * spatial_scale - offset;
    T roi_start_h = offset_rois[2] * spatial_scale - offset;
    T roi_end_w = offset_rois[3] * spatial_scale - offset;
    T roi_end_h = offset_rois[4] * spatial_scale - offset;

    T roi_width = roi_end_w - roi_start_w;
    T roi_height = roi_end_h - roi_start_h;
    if (!aligned) {  // for backward-compatibility only
      roi_width = max(roi_width, (T)1.);
      roi_height = max(roi_height, (T)1.);
    }

    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    // Feature plane of channel c in image roi_batch_ind.
    const T* offset_input =
        input + (roi_batch_ind * channels + c) * height * width;

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h =
        (sampling_ratio > 0)
            ? sampling_ratio
            : static_cast<int>(ceilf(roi_height / pooled_height));
    int roi_bin_grid_w =
        (sampling_ratio > 0)
            ? sampling_ratio
            : static_cast<int>(ceilf(roi_width / pooled_width));

    if (pool_mode == 0) {
      // We do max pooling inside a bin
      // If no sample point is visited (empty grid) the sentinels survive:
      // output becomes -FLT_MAX and argmax_y / argmax_x become -1.
      T maxval = -FLT_MAX;
      T maxidx_y = -1.f, maxidx_x = -1.f;
      for (int iy = 0; iy < roi_bin_grid_h; iy++) {
        // Samples sit at the centers (iy + 0.5) of the bin's sub-cells.
        const T y = roi_start_h + ph * bin_size_h +
                    static_cast<T>(iy + .5f) * bin_size_h /
                        static_cast<T>(roi_bin_grid_h);
        for (int ix = 0; ix < roi_bin_grid_w; ix++) {
          const T x = roi_start_w + pw * bin_size_w +
                      static_cast<T>(ix + .5f) * bin_size_w /
                          static_cast<T>(roi_bin_grid_w);
          T val =
              bilinear_interpolate(offset_input, height, width, y, x, index);
          // Strict '>' keeps the first sample among equal maxima.
          if (val > maxval) {
            maxval = val;
            maxidx_y = y;
            maxidx_x = x;
          }
        }
      }
      output[index] = maxval;
      argmax_y[index] = maxidx_y;
      argmax_x[index] = maxidx_x;
    } else if (pool_mode == 1) {
      // We do average pooling inside a bin
      // max(..., 1) keeps the divisor non-zero when the grid is empty.
      const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);
      T output_val = 0.;
      for (int iy = 0; iy < roi_bin_grid_h; iy++) {
        const T y = roi_start_h + ph * bin_size_h +
                    static_cast<T>(iy + .5f) * bin_size_h /
                        static_cast<T>(roi_bin_grid_h);
        for (int ix = 0; ix < roi_bin_grid_w; ix++) {
          const T x = roi_start_w + pw * bin_size_w +
                      static_cast<T>(ix + .5f) * bin_size_w /
                          static_cast<T>(roi_bin_grid_w);
          T val =
              bilinear_interpolate(offset_input, height, width, y, x, index);
          output_val += val;
        }
      }
      output[index] = output_val / count;
    }
  }
}

/*** Backward ***/
// Backward pass of RoI Align with respect to the input feature map.
//
// Every loop iteration takes grad_output[index], where index decodes to
// (n, c, ph, pw), and scatters it with atomicAdd into grad_input.
//
//   pool_mode == 0: the gradient goes to the four corners around the sample
//                   stored in argmax_y / argmax_x; elements whose argmax_y is
//                   -1 are skipped.
//   pool_mode == 1: the sample grid is recomputed with the same formulas as
//                   the forward kernel and each sample receives 1 / count of
//                   the gradient.
//
// NOTE(review): grad_input is only ever added to here, so it presumably has
// to be zero-initialised by the caller -- confirm.
// NOTE(review): bilinear_interpolate_gradient and CUDA_1D_KERNEL_LOOP are
// defined outside this block.
template <typename T>
__global__ void roi_align_backward_cuda_kernel(
    const int nthreads, const T* grad_output, const T* rois, const T* argmax_y,
    const T* argmax_x, T* grad_input, const int pooled_height,
    const int pooled_width, const T spatial_scale, const int sampling_ratio,
    const int pool_mode,  // 0 - max pool, 1 - avg pool
    const bool aligned, const int channels, const int height, const int width) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T grad_output_this_bin = grad_output[index];

    const T* offset_rois = rois + n * 5;
    int roi_batch_ind = offset_rois[0];
    // Gradient plane of channel c in image roi_batch_ind.
    T* offset_grad_input =
        grad_input + ((roi_batch_ind * channels + c) * height * width);

    if (pool_mode == 0) {
      T y = argmax_y[index], x = argmax_x[index];
      // -1 is the value the forward kernel stores when no sample was taken.
      if (y != -1.f) {
        // Filled in by bilinear_interpolate_gradient (presumably through
        // reference parameters -- confirm against its definition).
        T w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;
        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
                                      x_low, x_high, y_low, y_high, index);

        // Negative indices are skipped (presumably the helper's marker for a
        // sample outside the feature map -- confirm).
        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          atomicAdd(offset_grad_input + y_low * width + x_low,
                    grad_output_this_bin * w1);
          atomicAdd(offset_grad_input + y_low * width + x_high,
                    grad_output_this_bin * w2);
          atomicAdd(offset_grad_input + y_high * width + x_low,
                    grad_output_this_bin * w3);
          atomicAdd(offset_grad_input + y_high * width + x_high,
                    grad_output_this_bin * w4);
        }
      }
    } else if (pool_mode == 1) {
      // Do not use rounding; this implementation detail is critical
      T offset = aligned ? (T)0.5 : (T)0.0;
      T roi_start_w = offset_rois[1] * spatial_scale - offset;
      T roi_start_h = offset_rois[2] * spatial_scale - offset;
      T roi_end_w = offset_rois[3] * spatial_scale - offset;
      T roi_end_h = offset_rois[4] * spatial_scale - offset;

      T roi_width = roi_end_w - roi_start_w;
      T roi_height = roi_end_h - roi_start_h;
      if (!aligned) {  // for backward-compatibility only
        roi_width = max(roi_width, (T)1.);
        roi_height = max(roi_height, (T)1.);
      }

      T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
      T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

      // We use roi_bin_grid to sample the grid and mimic integral
      int roi_bin_grid_h =
          (sampling_ratio > 0)
              ? sampling_ratio
              : static_cast<int>(ceilf(roi_height / pooled_height));
      int roi_bin_grid_w =
          (sampling_ratio > 0)
              ? sampling_ratio
              : static_cast<int>(ceilf(roi_width / pooled_width));

      // We do average (integral) pooling inside a bin
      // count is only used as a divisor inside the loops below, which do not
      // run when either grid dimension is <= 0.
      const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4

      for (int iy = 0; iy < roi_bin_grid_h; iy++) {
        // Samples sit at the centers (iy + 0.5) of the bin's sub-cells.
        const T y = roi_start_h + ph * bin_size_h +
                    static_cast<T>(iy + .5f) * bin_size_h /
                        static_cast<T>(roi_bin_grid_h);
        for (int ix = 0; ix < roi_bin_grid_w; ix++) {
          const T x = roi_start_w + pw * bin_size_w +
                      static_cast<T>(ix + .5f) * bin_size_w /
                          static_cast<T>(roi_bin_grid_w);

          T w1, w2, w3, w4;
          int x_low, x_high, y_low, y_high;
          bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
                                        x_low, x_high, y_low, y_high, index);

          if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
            atomicAdd(offset_grad_input + y_low * width + x_low,
                      grad_output_this_bin * w1 / count);
            atomicAdd(offset_grad_input + y_low * width + x_high,
                      grad_output_this_bin * w2 / count);
            atomicAdd(offset_grad_input + y_high * width + x_low,
                      grad_output_this_bin * w3 / count);
            atomicAdd(offset_grad_input + y_high * width + x_high,
                      grad_output_this_bin * w4 / count);
          }
        }
      }
    }
  }
}

#endif  // ROI_ALIGN_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh
================================================
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#ifndef ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
#define ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH

#include <float.h>
#ifdef MMCV_WITH_TRT
#include "common_cuda_helper.hpp"
#else  // MMCV_WITH_TRT
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else  // MMCV_USE_PARROTS
#include "pytorch_cuda_helper.hpp"
#endif  // MMCV_USE_PARROTS
#endif  // MMCV_WITH_TRT

/*** Forward ***/
template <typename scalar_t>
__global__ void roi_align_rotated_forward_cuda_kernel(
    const int nthreads, const scalar_t *bottom_data,
    const scalar_t *bottom_rois, const scalar_t spatial_scale,
    const int sampling_ratio, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width, scalar_t *top_data) {
  // Forward pass of rotated RoIAlign. Every loop index owns one pooled output
  // element and averages the bilinear samples taken on a rotated grid that
  // covers its bin.
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Unflatten index into (n, c, ph, pw) of the pooled output.
    const int pw = index % pooled_width;
    const int ph = (index / pooled_width) % pooled_height;
    const int c = (index / pooled_width / pooled_height) % channels;
    const int n = index / pooled_width / pooled_height / channels;

    // A RoI record has 6 entries:
    // batch index, center x, center y, width, height, angle.
    const scalar_t *roi = bottom_rois + n * 6;
    const int batch_idx = roi[0];

    // Coordinates are scaled but never rounded. With `aligned` the center is
    // shifted by half a pixel.
    const scalar_t half_pixel = aligned ? (scalar_t)0.5 : (scalar_t)0.0;
    const scalar_t center_x = roi[1] * spatial_scale - half_pixel;
    const scalar_t center_y = roi[2] * spatial_scale - half_pixel;
    scalar_t roi_w = roi[3] * spatial_scale;
    scalar_t roi_h = roi[4] * spatial_scale;
    // The angle is passed to cos()/sin() as stored (no degree conversion).
    scalar_t angle = roi[5];
    if (clockwise) {
      angle = -angle;  // clockwise angles are negated before rotating
    }
    if (!aligned) {
      // Legacy (non-aligned) mode: clamp the RoI to at least 1x1.
      roi_w = max(roi_w, (scalar_t)1.);
      roi_h = max(roi_h, (scalar_t)1.);
    }
    const scalar_t bin_h = roi_h / static_cast<scalar_t>(pooled_height);
    const scalar_t bin_w = roi_w / static_cast<scalar_t>(pooled_width);

    // Feature plane of this (batch, channel) pair.
    const scalar_t *feat =
        bottom_data + (batch_idx * channels + c) * height * width;

    // Samples per bin along each axis: `sampling_ratio` when positive,
    // otherwise ceil(RoI size / pooled size).
    const int grid_h = (sampling_ratio > 0)
                           ? sampling_ratio
                           : ceilf(roi_h / pooled_height);
    const int grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_w / pooled_width);

    // Start of the RoI relative to its center; the translation to feature-map
    // coordinates is applied after the rotation.
    const scalar_t start_y = -roi_h / 2.0;
    const scalar_t start_x = -roi_w / 2.0;
    const scalar_t cos_a = cos(angle);
    const scalar_t sin_a = sin(angle);

    // Divisor of the average; clamped to 1 so an empty grid never divides by 0.
    const scalar_t num_samples = max(grid_h * grid_w, 1);

    scalar_t acc = 0.;
    for (int iy = 0; iy < grid_h; iy++) {
      const scalar_t local_y = start_y + ph * bin_h +
                               static_cast<scalar_t>(iy + .5f) * bin_h /
                                   static_cast<scalar_t>(grid_h);
      for (int ix = 0; ix < grid_w; ix++) {
        const scalar_t local_x = start_x + pw * bin_w +
                                 static_cast<scalar_t>(ix + .5f) * bin_w /
                                     static_cast<scalar_t>(grid_w);

        // Rotate the RoI-local sample by the angle, then translate it to the
        // RoI center.
        scalar_t y = local_y * cos_a - local_x * sin_a + center_y;
        scalar_t x = local_y * sin_a + local_x * cos_a + center_x;

        acc += bilinear_interpolate<scalar_t>(feat, height, width, y, x,
                                              index);
      }
    }

    top_data[index] = acc / num_samples;
  }
}

/*** Backward ***/
template <typename scalar_t>
__global__ void roi_align_rotated_backward_cuda_kernel(
    const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,
    const scalar_t spatial_scale, const int sampling_ratio, const bool aligned,
    const bool clockwise, const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width, scalar_t *bottom_diff) {
  // Backward pass of rotated RoIAlign. Every loop index owns one pooled
  // gradient value and scatters it, weighted by the bilinear coefficients,
  // onto the four neighbours of each sample point via atomicAdd.
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Unflatten index into (n, c, ph, pw) of the pooled output.
    const int pw = index % pooled_width;
    const int ph = (index / pooled_width) % pooled_height;
    const int c = (index / pooled_width / pooled_height) % channels;
    const int n = index / pooled_width / pooled_height / channels;

    // A RoI record has 6 entries:
    // batch index, center x, center y, width, height, angle.
    const scalar_t *roi = bottom_rois + n * 6;
    const int batch_idx = roi[0];

    // Coordinates are scaled but never rounded. With `aligned` the center is
    // shifted by half a pixel.
    const scalar_t half_pixel = aligned ? (scalar_t)0.5 : (scalar_t)0.0;
    const scalar_t center_x = roi[1] * spatial_scale - half_pixel;
    const scalar_t center_y = roi[2] * spatial_scale - half_pixel;
    scalar_t roi_w = roi[3] * spatial_scale;
    scalar_t roi_h = roi[4] * spatial_scale;
    // The angle is passed to cos()/sin() as stored (no degree conversion).
    scalar_t angle = roi[5];
    if (clockwise) {
      angle = -angle;  // clockwise angles are negated before rotating
    }
    if (!aligned) {
      // Legacy (non-aligned) mode: clamp the RoI to at least 1x1.
      roi_w = max(roi_w, (scalar_t)1.);
      roi_h = max(roi_h, (scalar_t)1.);
    }
    const scalar_t bin_h = roi_h / static_cast<scalar_t>(pooled_height);
    const scalar_t bin_w = roi_w / static_cast<scalar_t>(pooled_width);

    // Gradient plane of this (batch, channel) pair.
    scalar_t *grad_plane =
        bottom_diff + (batch_idx * channels + c) * height * width;

    // Incoming gradient of the pooled element handled by this iteration.
    const int pooled_base = (n * channels + c) * pooled_height * pooled_width;
    const scalar_t *pooled_grad = top_diff + pooled_base;
    const scalar_t grad_in = pooled_grad[ph * pooled_width + pw];

    // Samples per bin along each axis: `sampling_ratio` when positive,
    // otherwise ceil(RoI size / pooled size).
    const int grid_h = (sampling_ratio > 0)
                           ? sampling_ratio
                           : ceilf(roi_h / pooled_height);
    const int grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_w / pooled_width);

    // Start of the RoI relative to its center; the translation to feature-map
    // coordinates is applied after the rotation.
    const scalar_t start_y = -roi_h / 2.0;
    const scalar_t start_x = -roi_w / 2.0;
    const scalar_t cos_a = cos(angle);
    const scalar_t sin_a = sin(angle);

    // Number of samples per bin; only used as a divisor inside the sample
    // loops, which do not execute when the grid is empty.
    const scalar_t num_samples = grid_h * grid_w;

    for (int iy = 0; iy < grid_h; iy++) {
      const scalar_t local_y = start_y + ph * bin_h +
                               static_cast<scalar_t>(iy + .5f) * bin_h /
                                   static_cast<scalar_t>(grid_h);
      for (int ix = 0; ix < grid_w; ix++) {
        const scalar_t local_x = start_x + pw * bin_w +
                                 static_cast<scalar_t>(ix + .5f) * bin_w /
                                     static_cast<scalar_t>(grid_w);

        // Rotate the RoI-local sample by the angle, then translate it to the
        // RoI center.
        scalar_t y = local_y * cos_a - local_x * sin_a + center_y;
        scalar_t x = local_y * sin_a + local_x * cos_a + center_x;

        scalar_t w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;
        bilinear_interpolate_gradient<scalar_t>(height, width, y, x, w1, w2, w3,
                                                w4, x_low, x_high, y_low,
                                                y_high, index);

        // Negative neighbour indices mean there is nothing to accumulate for
        // this sample.
        if (x_low < 0 || x_high < 0 || y_low < 0 || y_high < 0) {
          continue;
        }

        atomicAdd(grad_plane + y_low * width + x_low,
                  grad_in * w1 / num_samples);
        atomicAdd(grad_plane + y_low * width + x_high,
                  grad_in * w2 / num_samples);
        atomicAdd(grad_plane + y_high * width + x_low,
                  grad_in * w3 / num_samples);
        atomicAdd(grad_plane + y_high * width + x_high,
                  grad_in * w4 / num_samples);
      }  // ix
    }  // iy
  }  // CUDA_1D_KERNEL_LOOP
}  // roi_align_rotated_backward_cuda_kernel

#endif  // ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/roi_pool_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ROI_POOL_CUDA_KERNEL_CUH
#define ROI_POOL_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

template <typename T>
__global__ void roi_pool_forward_cuda_kernel(
    const int nthreads, const T* input, const T* rois, T* output, int* argmax,
    const int pooled_height, const int pooled_width, const T spatial_scale,
    const int channels, const int height, const int width) {
  // RoI max pooling, forward pass. Every loop index owns one pooled output
  // element: it takes the maximum of the input over its bin and records the
  // flat (h * width + w) position of that maximum in `argmax` (when `argmax`
  // is not NULL).
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    // A RoI record has 5 entries: batch index, x1, y1, x2, y2.
    const T* offset_rois = rois + n * 5;
    int roi_batch_ind = offset_rois[0];
    // calculate the roi region on feature maps
    T roi_x1 = offset_rois[1] * spatial_scale;
    T roi_y1 = offset_rois[2] * spatial_scale;
    T roi_x2 = (offset_rois[3] + 1) * spatial_scale;
    T roi_y2 = (offset_rois[4] + 1) * spatial_scale;

    // Malformed RoIs (non-positive width or height) pool nothing. Emit an
    // explicit empty result instead of leaving the slots untouched, so the
    // outcome does not depend on how the caller initialized the buffers:
    // argmax = -1 is the value the backward kernel treats as "no gradient".
    T roi_w = roi_x2 - roi_x1;
    T roi_h = roi_y2 - roi_y1;
    if (roi_w <= 0 || roi_h <= 0) {
      output[index] = static_cast<T>(0);
      if (argmax != NULL) argmax[index] = -1;
      continue;
    }

    T bin_size_w = roi_w / static_cast<T>(pooled_width);
    T bin_size_h = roi_h / static_cast<T>(pooled_height);

    // the corresponding bin region
    int bin_x1 = floorf(static_cast<T>(pw) * bin_size_w + roi_x1);
    int bin_y1 = floorf(static_cast<T>(ph) * bin_size_h + roi_y1);
    int bin_x2 = ceilf(static_cast<T>(pw + 1) * bin_size_w + roi_x1);
    int bin_y2 = ceilf(static_cast<T>(ph + 1) * bin_size_h + roi_y1);

    // clip the bin to the input boundaries
    bin_x1 = min(max(bin_x1, 0), width);
    bin_y1 = min(max(bin_y1, 0), height);
    bin_x2 = min(max(bin_x2, 0), width);
    bin_y2 = min(max(bin_y2, 0), height);
    bool is_empty = (bin_y2 <= bin_y1) || (bin_x2 <= bin_x1);

    const T* offset_input =
        input + (roi_batch_ind * channels + c) * height * width;
    // Define an empty pooling region to be zero
    // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
    T max_val = is_empty ? 0 : -FLT_MAX;
    int max_idx = -1;
    for (int h = bin_y1; h < bin_y2; ++h) {
      for (int w = bin_x1; w < bin_x2; ++w) {
        int offset = h * width + w;
        if (offset_input[offset] > max_val) {
          max_val = offset_input[offset];
          max_idx = offset;
        }
      }
    }
    output[index] = max_val;
    if (argmax != NULL) argmax[index] = max_idx;
  }
}

template <typename T>
__global__ void roi_pool_backward_cuda_kernel(
    const int nthreads, const T* grad_output, const T* rois, const int* argmax,
    T* grad_input, const int pooled_height, const int pooled_width,
    const int channels, const int height, const int width) {
  // RoI max pooling, backward pass. Every loop index owns one pooled element
  // and adds its gradient to the input position recorded in `argmax`.
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // -1 marks a pooled element that selected no input value.
    const int selected = argmax[index];
    if (selected == -1) continue;

    // Only the channel and RoI index are needed to locate the input plane.
    const int c = (index / pooled_width / pooled_height) % channels;
    const int n = index / pooled_width / pooled_height / channels;

    // First entry of each 5-value RoI record is the batch index.
    const int batch_idx = rois[n * 5];
    T* grad_plane = grad_input + ((batch_idx * channels + c) * height * width);

    // atomicAdd: several pooled elements may select the same input position.
    atomicAdd(grad_plane + selected, grad_output[index]);
  }
}

#endif  // ROI_POOL_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ROIAWARE_POOL3D_CUDA_KERNEL_CUH
#define ROIAWARE_POOL3D_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

template <typename T>
__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz,
                                             T &local_x, T &local_y) {
  // Rotate the offset (shift_x, shift_y) by the angle -rz and store the
  // result in (local_x, local_y).
  const T cos_neg_rz = cos(-rz);
  const T sin_neg_rz = sin(-rz);
  local_x = shift_x * cos_neg_rz + shift_y * (-sin_neg_rz);
  local_y = shift_x * sin_neg_rz + shift_y * cos_neg_rz;
}

template <typename T>
__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x,
                                        T &local_y) {
  // Returns 1 when `pt` lies inside `box3d`, 0 otherwise.
  // param pt: (x, y, z)
  // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate,
  // cz in the bottom center
  // local_x / local_y receive the point's box-local coordinates; they are
  // only written when the height test passes.
  const T px = pt[0], py = pt[1], pz = pt[2];
  const T cx = box3d[0], cy = box3d[1];
  const T x_size = box3d[3], y_size = box3d[4], z_size = box3d[5];
  const T rz = box3d[6];

  // cz is stored at the bottom face; move it up to the box center.
  T cz = box3d[2];
  cz += z_size / 2.0;

  // Height test first: points above or below the box are rejected early.
  if (fabsf(pz - cz) > z_size / 2.0) return 0;

  lidar_to_local_coords(px - cx, py - cy, rz, local_x, local_y);

  // Strict inequalities: points exactly on a side face count as outside.
  const bool inside = (local_x > -x_size / 2.0) && (local_x < x_size / 2.0) &&
                      (local_y > -y_size / 2.0) && (local_y < y_size / 2.0);
  return inside;
}

template <typename T>
__global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num,
                                            int out_x, int out_y, int out_z,
                                            const T *rois, const T *pts,
                                            int *pts_mask) {
  // For every (box, point) pair, writes -1 to pts_mask when the point is
  // outside the box, otherwise the voxel the point falls into, encoded as
  // (x_idx << 16) + (y_idx << 8) + z_idx.
  // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate
  // params pts: (npoints, 3) [x, y, z]
  // params pts_mask: (N, npoints)
  // The box is selected by blockIdx.y, the point by the loop index.
  int box_idx = blockIdx.y;
  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
    if (box_idx >= boxes_num) return;

    // Derive per-iteration pointers from the untouched kernel arguments.
    // Advancing the arguments themselves with += would compound the offsets
    // if a thread executes this loop body more than once.
    const T *cur_pt = pts + pt_idx * 3;
    const T *cur_roi = rois + box_idx * 7;
    int *cur_mask = pts_mask + box_idx * pts_num + pt_idx;

    T local_x = 0, local_y = 0;
    int cur_in_flag = check_pt_in_box3d(cur_pt, cur_roi, local_x, local_y);

    cur_mask[0] = -1;
    if (cur_in_flag > 0) {
      // Height above the box bottom (the stored z is the bottom center).
      T local_z = cur_pt[2] - cur_roi[2];
      T x_size = cur_roi[3], y_size = cur_roi[4], z_size = cur_roi[5];

      // Voxel edge length along each axis.
      T x_res = x_size / out_x;
      T y_res = y_size / out_y;
      T z_res = z_size / out_z;

      // local_x / local_y are centered on the box, so shift them by half the
      // box size before dividing by the voxel size.
      unsigned int x_idx = int((local_x + x_size / 2) / x_res);
      unsigned int y_idx = int((local_y + y_size / 2) / y_res);
      unsigned int z_idx = int(local_z / z_res);

      x_idx = min(max(x_idx, 0), out_x - 1);
      y_idx = min(max(y_idx, 0), out_y - 1);
      z_idx = min(max(z_idx, 0), out_z - 1);

      // y and z each get an 8-bit field below x, so indices of 256 or more
      // would spill into the neighbouring field.
      unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx;

      cur_mask[0] = idx_encoding;
    }
  }
}

template <typename T>
__global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num,
                                             int max_pts_each_voxel, int out_x,
                                             int out_y, int out_z,
                                             const int *pts_mask,
                                             T *pts_idx_of_voxels) {
  // For every box, walks all points and appends the index of each point that
  // lies inside the box to the list of the voxel it was assigned to.
  // params pts_mask: (N, npoints), -1 for points outside the box, otherwise
  // the voxel index packed as (x_idx << 16) + (y_idx << 8) + z_idx
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
  // slot 0 of each voxel is the point counter, the remaining slots hold point
  // indices
  CUDA_1D_KERNEL_LOOP(box_idx, boxes_num) {
    int max_num_pts = max_pts_each_voxel - 1;  // index 0 is the counter

    // Per-iteration view of this box's voxels. The kernel argument is left
    // untouched: advancing it with += would compound the offset if a thread
    // executes this loop body more than once.
    T *box_voxels =
        pts_idx_of_voxels + box_idx * out_x * out_y * out_z * max_pts_each_voxel;

    for (int k = 0; k < pts_num; k++) {
      if (pts_mask[box_idx * pts_num + k] != -1) {
        // Unpack the three 8-bit voxel indices.
        unsigned int idx_encoding = pts_mask[box_idx * pts_num + k];
        unsigned int x_idx = (idx_encoding >> 16) & 0xFF;
        unsigned int y_idx = (idx_encoding >> 8) & 0xFF;
        unsigned int z_idx = idx_encoding & 0xFF;
        unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel +
                                   y_idx * out_z * max_pts_each_voxel +
                                   z_idx * max_pts_each_voxel;
        unsigned int cnt = box_voxels[base_offset];
        // Points beyond the voxel's capacity are dropped.
        if (cnt < max_num_pts) {
          box_voxels[base_offset + cnt + 1] = k;
          box_voxels[base_offset]++;
        }
      }
    }
  }
}

template <typename T>
__global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels,
                                   int max_pts_each_voxel, int out_x, int out_y,
                                   int out_z, const T *pts_feature,
                                   const int *pts_idx_of_voxels,
                                   T *pooled_features, int *argmax) {
  // Max-pools one feature channel over the points of each voxel.
  // params pts_feature: (npoints, C)
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
  // index 0 is the counter
  // params pooled_features: (N, out_x, out_y, out_z, C)
  // params argmax: (N, out_x, out_y, out_z, C)
  // The box is selected by blockIdx.z, the channel by blockIdx.y and the
  // voxel by the loop index.

  int box_idx = blockIdx.z;
  int channel_idx = blockIdx.y;
  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
    int x_idx = voxel_idx_flat / (out_y * out_z);
    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
    int z_idx = voxel_idx_flat % out_z;
    if (box_idx >= boxes_num || channel_idx >= channels) return;

    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;

    // Per-iteration views of this voxel. The kernel arguments are left
    // untouched: advancing them with += would compound the offsets if a
    // thread executes this loop body more than once.
    const int *voxel_pts = pts_idx_of_voxels +
                           box_idx * out_x * out_y * out_z * max_pts_each_voxel +
                           offset_base * max_pts_each_voxel;
    T *voxel_out = pooled_features + box_idx * out_x * out_y * out_z * channels +
                   offset_base * channels + channel_idx;
    int *voxel_argmax = argmax + box_idx * out_x * out_y * out_z * channels +
                        offset_base * channels + channel_idx;

    int argmax_idx = -1;
    // NOTE(review): the running maximum is held in a float regardless of T.
    float max_val = -1e50;

    int total_pts = voxel_pts[0];

    for (int k = 1; k <= total_pts; k++) {
      if (pts_feature[voxel_pts[k] * channels + channel_idx] > max_val) {
        max_val = pts_feature[voxel_pts[k] * channels + channel_idx];
        argmax_idx = voxel_pts[k];
      }
    }

    // The pooled value is written only when a point was selected; argmax is
    // always written (-1 when nothing was selected).
    if (argmax_idx != -1) {
      voxel_out[0] = max_val;
    }
    voxel_argmax[0] = argmax_idx;
  }
}

template <typename T>
__global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels,
                                   int max_pts_each_voxel, int out_x, int out_y,
                                   int out_z, const T *pts_feature,
                                   const int *pts_idx_of_voxels,
                                   T *pooled_features) {
  // Average-pools one feature channel over the points of each voxel.
  // params pts_feature: (npoints, C)
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
  // index 0 is the counter
  // params pooled_features: (N, out_x, out_y, out_z, C)
  // The box is selected by blockIdx.z, the channel by blockIdx.y and the
  // voxel by the loop index.

  int box_idx = blockIdx.z;
  int channel_idx = blockIdx.y;
  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
    int x_idx = voxel_idx_flat / (out_y * out_z);
    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
    int z_idx = voxel_idx_flat % out_z;
    if (box_idx >= boxes_num || channel_idx >= channels) return;

    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;

    // Per-iteration views of this voxel. The kernel arguments are left
    // untouched: advancing them with += would compound the offsets if a
    // thread executes this loop body more than once.
    const int *voxel_pts = pts_idx_of_voxels +
                           box_idx * out_x * out_y * out_z * max_pts_each_voxel +
                           offset_base * max_pts_each_voxel;
    T *voxel_out = pooled_features + box_idx * out_x * out_y * out_z * channels +
                   offset_base * channels + channel_idx;

    // NOTE(review): the sum is accumulated in a float regardless of T.
    float sum_val = 0;
    int total_pts = voxel_pts[0];

    for (int k = 1; k <= total_pts; k++) {
      sum_val += pts_feature[voxel_pts[k] * channels + channel_idx];
    }

    // Empty voxels are left unwritten.
    if (total_pts > 0) {
      voxel_out[0] = sum_val / total_pts;
    }
  }
}

template <typename T>
__global__ void roiaware_maxpool3d_backward(int boxes_num, int channels,
                                            int out_x, int out_y, int out_z,
                                            const int *argmax,
                                            const T *grad_out, T *grad_in) {
  // Routes each pooled gradient back to the point recorded in argmax.
  // params argmax: (N, out_x, out_y, out_z, C)
  // params grad_out: (N, out_x, out_y, out_z, C)
  // params grad_in: (npoints, C), return value
  // The box is selected by blockIdx.z, the channel by blockIdx.y and the
  // voxel by the loop index.

  int box_idx = blockIdx.z;
  int channel_idx = blockIdx.y;
  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
    int x_idx = voxel_idx_flat / (out_y * out_z);
    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
    int z_idx = voxel_idx_flat % out_z;
    if (box_idx >= boxes_num || channel_idx >= channels) return;

    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;

    // Per-iteration views of this voxel. The kernel arguments are left
    // untouched: advancing them with += would compound the offsets if a
    // thread executes this loop body more than once.
    const int *voxel_argmax = argmax +
                              box_idx * out_x * out_y * out_z * channels +
                              offset_base * channels + channel_idx;
    const T *voxel_grad = grad_out + box_idx * out_x * out_y * out_z * channels +
                          offset_base * channels + channel_idx;

    // -1 marks a voxel with no selected point. Skip only this voxel; any
    // further voxels handled by the same thread must still be processed.
    if (voxel_argmax[0] == -1) continue;

    // atomicAdd: the same point may be the argmax of several voxels/boxes.
    atomicAdd(grad_in + voxel_argmax[0] * channels + channel_idx,
              voxel_grad[0] * 1);
  }
}

template <typename T>
__global__ void roiaware_avgpool3d_backward(int boxes_num, int channels,
                                            int out_x, int out_y, int out_z,
                                            int max_pts_each_voxel,
                                            const int *pts_idx_of_voxels,
                                            const T *grad_out, T *grad_in) {
  // Spreads each pooled gradient evenly over the points of its voxel.
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
  // index 0 is the counter
  // params grad_out: (N, out_x, out_y, out_z, C)
  // params grad_in: (npoints, C), return value
  // The box is selected by blockIdx.z, the channel by blockIdx.y and the
  // voxel by the loop index.

  int box_idx = blockIdx.z;
  int channel_idx = blockIdx.y;
  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
    int x_idx = voxel_idx_flat / (out_y * out_z);
    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
    int z_idx = voxel_idx_flat % out_z;
    if (box_idx >= boxes_num || channel_idx >= channels) return;

    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;

    // Per-iteration views of this voxel. The kernel arguments are left
    // untouched: advancing them with += would compound the offsets if a
    // thread executes this loop body more than once.
    const int *voxel_pts = pts_idx_of_voxels +
                           box_idx * out_x * out_y * out_z * max_pts_each_voxel +
                           offset_base * max_pts_each_voxel;
    const T *voxel_grad = grad_out + box_idx * out_x * out_y * out_z * channels +
                          offset_base * channels + channel_idx;

    int total_pts = voxel_pts[0];
    // Each point receives 1 / total_pts of the gradient; the fmaxf keeps the
    // divisor at 1 or more for empty voxels.
    float cur_grad = 1 / fmaxf(float(total_pts), 1.0);
    for (int k = 1; k <= total_pts; k++) {
      // atomicAdd: the same point may belong to voxels of several boxes.
      atomicAdd(grad_in + voxel_pts[k] * channels + channel_idx,
                voxel_grad[0] * cur_grad);
    }
  }
}

#endif  // ROIAWARE_POOL3D_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ROIPOINT_POOL3D_CUDA_KERNEL_CUH
#define ROIPOINT_POOL3D_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

template <typename T>
__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz,
                                             T &local_x, T &local_y) {
  // Rotate the offset (shift_x, shift_y) by the angle -rz and store the
  // result in (local_x, local_y).
  const T cos_neg_rz = cos(-rz);
  const T sin_neg_rz = sin(-rz);
  local_x = shift_x * cos_neg_rz + shift_y * (-sin_neg_rz);
  local_y = shift_x * sin_neg_rz + shift_y * cos_neg_rz;
}

template <typename T>
__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x,
                                        T &local_y) {
  // Returns 1 when `pt` lies inside `box3d`, 0 otherwise.
  // param pt: (x, y, z)
  // param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate, cz in the
  // bottom center
  // local_x / local_y receive the point's box-local coordinates; they are
  // only written when the height test passes.
  const T px = pt[0], py = pt[1], pz = pt[2];
  const T cx = box3d[0], cy = box3d[1];
  const T dx = box3d[3], dy = box3d[4], dz = box3d[5];
  const T rz = box3d[6];

  // cz is stored at the bottom face; move it up to the box center.
  T cz = box3d[2];
  cz += dz / 2.0;

  // Height test first: points above or below the box are rejected early.
  if (fabsf(pz - cz) > dz / 2.0) return 0;

  lidar_to_local_coords(px - cx, py - cy, rz, local_x, local_y);

  // Strict inequalities: points exactly on a side face count as outside.
  const bool inside = (local_x > -dx / 2.0) && (local_x < dx / 2.0) &&
                      (local_y > -dy / 2.0) && (local_y < dy / 2.0);
  return inside;
}

template <typename T>
__global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num,
                                    const T *xyz, const T *boxes3d,
                                    int *pts_assign) {
  // Records, for every (batch, point, box) triple, whether the point lies
  // inside the box.
  // params xyz: (B, N, 3)
  // params boxes3d: (B, M, 7)
  // params pts_assign: (B, N, M), receives the result of check_pt_in_box3d
  // (1 when the point is inside the box, 0 otherwise)
  // The box is selected by blockIdx.y, the batch by blockIdx.z and the point
  // by the loop index.
  const int box_idx = blockIdx.y;
  const int bs_idx = blockIdx.z;
  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
    if (box_idx >= boxes_num || bs_idx >= batch_size) return;

    const int out_pos =
        bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
    pts_assign[out_pos] = 0;

    // Locate this point and this box inside their batch.
    const T *cur_pt = xyz + (bs_idx * pts_num * 3 + pt_idx * 3);
    const T *cur_box = boxes3d + (bs_idx * boxes_num * 7 + box_idx * 7);

    // The box-local coordinates are required by the helper's signature but
    // are not used afterwards.
    T local_x = 0, local_y = 0;
    const int inside = check_pt_in_box3d(cur_pt, cur_box, local_x, local_y);
    pts_assign[out_pos] = inside;
  }
}

__global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num,
                               int sampled_pts_num, const int *pts_assign,
                               int *pts_idx, int *pooled_empty_flag) {
  // Builds, for every (batch, box) pair, a list of `sampled_pts_num` point
  // indices taken from the points assigned to that box.
  // params pts_assign: indexed as (batch, point, box), non-zero when the
  // point belongs to the box
  // params pts_idx: indexed as (batch, box, sample)
  // params pooled_empty_flag: indexed as (batch, box), set to 1 for boxes
  // without any point (other entries are not written here)
  // The batch is selected by blockIdx.y, the box by the loop index.
  CUDA_1D_KERNEL_LOOP(boxes_idx, boxes_num) {
    const int bs_idx = blockIdx.y;

    // Start of this box's sample list inside pts_idx.
    const int list_base =
        bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num;

    // Collect assigned points in index order until the list is full.
    int found = 0;
    for (int k = 0; k < pts_num; k++) {
      if (!pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num +
                      boxes_idx]) {
        continue;
      }
      if (found >= sampled_pts_num) break;
      pts_idx[list_base + found] = k;
      found++;
    }

    if (found == 0) {
      pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;
    } else if (found < sampled_pts_num) {
      // Too few points: fill the remaining slots by cycling through the
      // points that were found.
      for (int k = found; k < sampled_pts_num; k++) {
        pts_idx[list_base + k] = pts_idx[list_base + k % found];
      }
    }
  }
}

template <typename T>
__global__ void roipoint_pool3d_forward(
    int batch_size, int pts_num, int boxes_num, int feature_in_len,
    int sampled_pts_num, const T *xyz, const int *pts_idx, const T *pts_feature,
    T *pooled_features, int *pooled_empty_flag) {
  // Gathers, for every sampled point of every box, the point coordinates
  // followed by the point's feature vector.
  // params xyz: (B, N, 3)
  // params pts_idx: (B, M, sampled_pts_num)
  // params pts_feature: (B, N, C) with C = feature_in_len
  // params pooled_features: (B, M, sampled_pts_num, 3+C)
  // params pooled_empty_flag: (B, M)
  // The box is selected by blockIdx.y, the batch by blockIdx.z and the sample
  // slot by the loop index.
  const int box_idx = blockIdx.y;
  const int bs_idx = blockIdx.z;
  CUDA_1D_KERNEL_LOOP(sample_pt_idx, sampled_pts_num) {
    if (box_idx >= boxes_num || bs_idx >= batch_size) return;
    // Boxes flagged as empty produce no output.
    if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) return;

    // Flat position of this sample slot; it indexes pts_idx directly and,
    // scaled by the record length, pooled_features.
    const int slot = bs_idx * boxes_num * sampled_pts_num +
                     box_idx * sampled_pts_num + sample_pt_idx;
    const int src_pt = pts_idx[slot];
    T *dst = pooled_features + slot * (3 + feature_in_len);

    // First three entries: the source point's coordinates.
    const T *src_xyz = xyz + (bs_idx * pts_num * 3 + src_pt * 3);
    dst[0] = src_xyz[0];
    dst[1] = src_xyz[1];
    dst[2] = src_xyz[2];

    // Remaining entries: the source point's feature vector.
    const T *src_feat =
        pts_feature +
        (bs_idx * pts_num * feature_in_len + src_pt * feature_in_len);
    memcpy(dst + 3, src_feat, feature_in_len * sizeof(T));
  }
}

#endif  // ROIPOINT_POOL3D_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu
#ifndef ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH
#define ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

template <typename scalar_t>
__global__ void rotated_feature_align_forward_kernel(
    const int nthreads, const int points, const scalar_t* bottom_data,
    const scalar_t* best_bboxes, const scalar_t spatial_scale,
    const int channels, const int height, const int width, scalar_t* top_data) {
  // Every loop index owns one feature-map element (n, c, h, w). Its output is
  // the input value at that element plus the bilinear samples of the same
  // channel taken at `points` locations derived from the box stored for
  // (n, h, w): the box center and, when points > 1, its four corners.
  // NOTE(review): the sample arrays hold 5 entries, so `points` is presumably
  // expected to be at most 5 - confirm against the caller.
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    const int w = index % width;
    const int h = (index / width) % height;
    const int c = (index / width / height) % channels;
    const int n = index / width / height / channels;

    // Box record of this location: 5 values, read as
    // (center y, center x, width, height, angle).
    const scalar_t* box = best_bboxes + ((n * height + h) * width + w) * 5;
    const scalar_t center_y = box[0] * spatial_scale;
    const scalar_t center_x = box[1] * spatial_scale;

    // Sample 0 is always the box center.
    scalar_t sample_x[5] = {center_x, 0, 0, 0, 0};
    scalar_t sample_y[5] = {center_y, 0, 0, 0, 0};

    if (points > 1) {
      const scalar_t box_w = box[2] * spatial_scale;
      const scalar_t box_h = box[3] * spatial_scale;
      const scalar_t box_a = box[4];

      const scalar_t half_w = box_w / 2, half_h = box_h / 2;
      const scalar_t cos_a = cosf(box_a), sin_a = sinf(box_a);
      // Half-extent vectors of the rotated box along its width and height.
      const scalar_t wx = cos_a * half_w, wy = sin_a * half_w;
      const scalar_t hx = -sin_a * half_h, hy = cos_a * half_h;

      // Samples 1-4 are the four corners: center +/- both half-extents.
      sample_x[1] = center_x + wx + hx;
      sample_y[1] = center_y + wy + hy;
      sample_x[2] = center_x - wx + hx;
      sample_y[2] = center_y - wy + hy;
      sample_x[3] = center_x - wx - hx;
      sample_y[3] = center_y - wy - hy;
      sample_x[4] = center_x + wx - hx;
      sample_y[4] = center_y + wy - hy;
    }

    // Feature plane of this (batch, channel) pair.
    const scalar_t* plane = bottom_data + (n * channels + c) * height * width;

    // Start from the input value and add every sampled value on top.
    scalar_t acc = bottom_data[index];
    for (int i = 0; i < points; i++) {
      acc += bilinear_interpolate<scalar_t>(plane, height, width, sample_y[i],
                                            sample_x[i], i);
    }
    top_data[index] = acc;
  }
}

// Backward pass of rotated feature alignment.
//
// For each element of top_diff (flat index unravelled as (n, c, h, w)) the
// gradient is added to bottom_diff at the same position and additionally
// distributed, weighted by the bilinear coefficients, to the four
// neighbouring pixels of each of the `points` sample locations. Sample
// locations are derived from best_bboxes exactly as in the forward kernel.
// All writes use atomicAdd because different output elements can target the
// same bottom_diff entry.
//
// NOTE(review): the sample arrays hold 5 entries, so `points` must not exceed
// 5 -- presumably the caller only passes 1 or 5; confirm.
template <typename scalar_t>
__global__ void rotated_feature_align_backward_kernel(
    const int nthreads, const int points, const scalar_t* top_diff,
    const scalar_t* best_bboxes, const scalar_t spatial_scale,
    const int channels, const int height, const int width,
    scalar_t* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Unravel the flat element index.
    const int col = index % width;
    const int row = (index / width) % height;
    const int chn = (index / width / height) % channels;
    const int batch = index / width / height / channels;

    // Box attached to this spatial location, scaled to feature-map units.
    const scalar_t* box =
        best_bboxes + ((batch * height + row) * width + col) * 5;
    const scalar_t center_y = box[0] * spatial_scale;
    const scalar_t center_x = box[1] * spatial_scale;

    // Slot 0 is always the centre; slots 1..4 stay zero unless filled below.
    scalar_t sample_x[5] = {center_x, 0, 0, 0, 0};
    scalar_t sample_y[5] = {center_y, 0, 0, 0, 0};

    if (points > 1) {
      const scalar_t box_w = box[2] * spatial_scale;
      const scalar_t box_h = box[3] * spatial_scale;
      const scalar_t angle = box[4];

      const scalar_t half_w = box_w / 2;
      const scalar_t half_h = box_h / 2;
      const scalar_t cos_a = cosf(angle);
      const scalar_t sin_a = sinf(angle);
      // Half-extent vectors of the box rotated by `angle`.
      const scalar_t wx = cos_a * half_w;
      const scalar_t wy = sin_a * half_w;
      const scalar_t hx = -sin_a * half_h;
      const scalar_t hy = cos_a * half_h;

      sample_x[1] = center_x + wx + hx;
      sample_y[1] = center_y + wy + hy;
      sample_x[2] = center_x - wx + hx;
      sample_y[2] = center_y - wy + hy;
      sample_x[3] = center_x - wx - hx;
      sample_y[3] = center_y - wy - hy;
      sample_x[4] = center_x + wx - hx;
      sample_y[4] = center_y + wy - hy;
    }

    // Start of the (n, c) plane that receives the sampled gradients.
    scalar_t* plane_diff = bottom_diff + (batch * channels + chn) * height * width;
    const scalar_t upstream = top_diff[index];

    // Identity term of the forward pass.
    atomicAdd(bottom_diff + index, upstream);

    for (int k = 0; k < points; k++) {
      scalar_t w1, w2, w3, w4;
      int x_low, x_high, y_low, y_high;
      bilinear_interpolate_gradient<scalar_t>(height, width, sample_y[k],
                                              sample_x[k], w1, w2, w3, w4,
                                              x_low, x_high, y_low, y_high, k);

      const scalar_t grad_ll = upstream * w1;
      const scalar_t grad_lh = upstream * w2;
      const scalar_t grad_hl = upstream * w3;
      const scalar_t grad_hh = upstream * w4;

      // Skip samples for which the helper reported a negative neighbour index.
      const bool neighbours_valid =
          x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0;
      if (neighbours_valid) {
        atomicAdd(plane_diff + y_low * width + x_low, grad_ll);
        atomicAdd(plane_diff + y_low * width + x_high, grad_lh);
        atomicAdd(plane_diff + y_high * width + x_low, grad_hl);
        atomicAdd(plane_diff + y_high * width + x_high, grad_hh);
      }
    }
  }
}
#endif  // ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/scatter_points_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef SCATTER_POINTS_CUDA_KERNEL_CUH
#define SCATTER_POINTS_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// Reduction mode selected by the scatter kernels in this header: feature
// sum, feature mean, or element-wise maximum.
typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;
// NOTE(review): not referenced by any kernel in this header; presumably an
// upper bound on the grid size used by the host-side launcher -- confirm.
int const maxGridDim = 50000;

// Atomically raises *address to at least val (float overload).
// The float is accessed through an int pointer so that atomicCAS can be
// used; each attempt tries to swap in fmaxf(val, value last observed).
__device__ __forceinline__ static void reduceMax(float *address, float val) {
  int *address_as_i = reinterpret_cast<int *>(address);
  int old = *address_as_i, assumed;
  do {
    assumed = old;
    // atomicCAS returns the word that was in memory before the attempt.
    old = atomicCAS(address_as_i, assumed,
                    __float_as_int(fmaxf(val, __int_as_float(assumed))));
    // Retry when the word changed between the read and the CAS
    // (assumed != old), or while the value just seen is still below val.
  } while (assumed != old || __int_as_float(old) < val);
}

// Atomically raises *address to at least val (double overload).
// The double is accessed through an unsigned long long pointer so that the
// 64-bit atomicCAS can be used; each attempt tries to swap in
// fmax(val, value last observed).
__device__ __forceinline__ static void reduceMax(double *address, double val) {
  unsigned long long *address_as_ull =
      reinterpret_cast<unsigned long long *>(address);
  unsigned long long old = *address_as_ull, assumed;
  do {
    assumed = old;
    // atomicCAS returns the word that was in memory before the attempt.
    old = atomicCAS(
        address_as_ull, assumed,
        __double_as_longlong(fmax(val, __longlong_as_double(assumed))));
    // Retry when the word changed between the read and the CAS
    // (assumed != old), or while the value just seen is still below val.
  } while (assumed != old || __longlong_as_double(old) < val);
}

// get rid of meaningless warnings when compiling host code
//
// reduceAdd(address, val): atomically adds val to *address.
//  - HIP builds (MMCV_WITH_HIP): forward directly to atomicAdd.
//  - CUDA builds: only defined while compiling device code (__CUDA_ARCH__
//    defined). Uses the native atomicAdd where the architecture check below
//    allows it, otherwise emulates the addition with an atomicCAS retry loop
//    and emits a compile-time message.
#ifdef MMCV_WITH_HIP
__device__ __forceinline__ static void reduceAdd(float *address, float val) {
  atomicAdd(address, val);
}
__device__ __forceinline__ static void reduceAdd(double *address, double val) {
  atomicAdd(address, val);
}
#else
#ifdef __CUDA_ARCH__
// float overload: CAS fallback when __CUDA_ARCH__ < 200.
__device__ __forceinline__ static void reduceAdd(float *address, float val) {
#if (__CUDA_ARCH__ < 200)
#ifdef _MSC_VER
#pragma message( \
    "compute capability lower than 2.x. fall back to use CAS version of atomicAdd for float32")
#else
#warning \
    "compute capability lower than 2.x. fall back to use CAS version of atomicAdd for float32"
#endif
  // Reinterpret the float as an int so atomicCAS can operate on it.
  int *address_as_i = reinterpret_cast<int *>(address);
  int old = *address_as_i, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_i, assumed,
                    __float_as_int(val + __int_as_float(assumed)));
    // Repeat until the swap happened against the value we based the sum on.
  } while (assumed != old);
#else
  atomicAdd(address, val);
#endif
}

// double overload: CAS fallback when __CUDA_ARCH__ < 600.
__device__ __forceinline__ static void reduceAdd(double *address, double val) {
#if (__CUDA_ARCH__ < 600)
#ifdef _MSC_VER
#pragma message( \
    "compute capability lower than 6.x. fall back to use CAS version of atomicAdd for float64")
#else
#warning \
    "compute capability lower than 6.x. fall back to use CAS version of atomicAdd for float64"
#endif
  // Reinterpret the double as a 64-bit integer so atomicCAS can operate on it.
  unsigned long long *address_as_ull =
      reinterpret_cast<unsigned long long *>(address);
  unsigned long long old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
    // Repeat until the swap happened against the value we based the sum on.
  } while (assumed != old);
#else
  atomicAdd(address, val);
#endif
}
#endif  // __CUDA_ARCH__
#endif  // MMCV_WITH_HIP

// Reduces input feature rows into their destination rows.
//
// Row x of `feats` (num_feats values) is combined into row coors_map[x] of
// `reduced_feats`; rows mapped to -1 are skipped. MAX uses reduceMax, every
// other reduce_type uses reduceAdd, so concurrent threads targeting the same
// destination row are combined atomically.
template <typename T>
__global__ void feats_reduce_kernel(
    const T *feats, const int32_t *coors_map,
    T *reduced_feats,  // shall be 0 at initialization
    const int num_input, const int num_feats, const reduce_t reduce_type) {
  CUDA_1D_KERNEL_LOOP(x, num_input) {
    const int32_t dst_row = coors_map[x];
    if (dst_row != -1) {
      const T *src = feats + x * num_feats;
      T *dst = reduced_feats + dst_row * num_feats;
      const bool take_max = (reduce_type == reduce_t::MAX);

      for (int f = 0; f < num_feats; ++f) {
        if (take_max) {
          reduceMax(dst + f, src[f]);
        } else {
          reduceAdd(dst + f, src[f]);
        }
      }
    }
  }
}

// Propagates gradients of a SUM or MEAN reduction back to the inputs.
//
// Input row x receives the gradient of the reduced row coors_map[x]; for MEAN
// it is divided by reduce_count of that reduced row. Rows mapped to -1 are
// left untouched, and nothing is written for other reduce types.
template <typename T>
__global__ void add_reduce_traceback_grad_kernel(
    T *grad_feats, const T *grad_reduced_feats, const int32_t *coors_map,
    const int32_t *reduce_count, const int num_input, const int num_feats,
    const reduce_t reduce_type) {
  CUDA_1D_KERNEL_LOOP(x, num_input) {
    const int32_t src_row = coors_map[x];
    if (src_row == -1) continue;

    T *grad_in = grad_feats + x * num_feats;
    const T *grad_out = grad_reduced_feats + src_row * num_feats;

    switch (reduce_type) {
      case reduce_t::SUM:
        // Sum: every contributor gets the full upstream gradient.
        for (int f = 0; f < num_feats; ++f) {
          grad_in[f] = grad_out[f];
        }
        break;
      case reduce_t::MEAN: {
        // Mean: split by the number of contributors of the reduced row.
        const T contributors = static_cast<T>(reduce_count[src_row]);
        for (int f = 0; f < num_feats; ++f) {
          grad_in[f] = grad_out[f] / contributors;
        }
        break;
      }
      default:
        break;
    }
  }
}

// Finds, for a MAX reduction, which input row produced each reduced value.
//
// For every input row x mapped to a reduced row, each feature equal to the
// reduced maximum records x in reduce_from via atomicMin, so the smallest
// matching input index is kept. Rows mapped to -1 are skipped.
// NOTE(review): reduce_from presumably has to be pre-filled with a value
// larger than any input index for atomicMin to work -- confirm with caller.
template <typename T>
__global__ void max_reduce_traceback_scatter_idx_kernel(
    const T *feats, const T *reduced_feats, int32_t *reduce_from,
    const int32_t *coors_map, const int num_input, const int num_feats) {
  CUDA_1D_KERNEL_LOOP(x, num_input) {
    const int32_t dst_row = coors_map[x];
    if (dst_row != -1) {
      const T *in_row = feats + x * num_feats;
      const int dst_base = dst_row * num_feats;
      const T *max_row = reduced_feats + dst_base;
      int32_t *winner_row = reduce_from + dst_base;

      for (int f = 0; f < num_feats; ++f) {
        const bool is_max = (in_row[f] == max_row[f]);
        if (is_max) {
          atomicMin(winner_row + f, static_cast<int32_t>(x));
        }
      }
    }
  }
}

// Routes gradients of a MAX reduction back to the winning inputs.
//
// For reduced row x and feature i, reduce_from[x * num_feats + i] names the
// input row that receives grad_reduced_feats[x * num_feats + i]; the value is
// written to feature i of that input row in grad_feats.
template <typename T>
__global__ void max_reduce_scatter_grad_kernel(T *grad_feats,
                                               const T *grad_reduced_feats,
                                               const int32_t *reduce_from,
                                               const int num_reduced,
                                               const int num_feats) {
  CUDA_1D_KERNEL_LOOP(x, num_reduced) {
    const int row_base = x * num_feats;
    const int32_t *winners = reduce_from + row_base;
    const T *grad_out = grad_reduced_feats + row_base;

    for (int f = 0; f < num_feats; ++f) {
      const int32_t winner_row = winners[f];
      grad_feats[winner_row * num_feats + f] = grad_out[f];
    }
  }
}

#endif  // SCATTER_POINTS_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/sigmoid_focal_loss_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH
#define SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// Forward pass of sigmoid focal loss.
//
// One loop iteration handles one (sample n, class c) logit:
//   p      = sigmoid(input)
//   loss   = -alpha       * (1 - p)^gamma * log(p)      if target[n] == c
//            -(1 - alpha) * p^gamma       * log(1 - p)  otherwise
// and, when `weight` is given, the loss is scaled by weight[target[n]].
// log arguments are clamped to FLT_MIN to avoid log(0).
//
// The sigmoid uses the overloaded exp() (as the backward kernel does) rather
// than expf(), so T = double is evaluated in double precision and the
// forward value stays consistent with its gradient.
template <typename T>
__global__ void sigmoid_focal_loss_forward_cuda_kernel(
    const int nthreads, const T* input, const int64_t* target, const T* weight,
    T* output, const T gamma, const T alpha, const int num_classes) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int n = index / num_classes;
    int c = index % num_classes;

    int64_t t = target[n];
    // Indicator values (1 or 0) selecting the positive / negative term.
    T flag_p = (t == c);
    T flag_n = (t != c);

    // p = sigmoid(x) = 1 / (1 + exp(-x))
    T p = (T)1. / ((T)1. + exp(-input[index]));

    // (1 - p)**gamma * log(p)
    T term_p = pow(((T)1. - p), gamma) * log(max(p, (T)FLT_MIN));
    // p**gamma * log(1 - p)
    T term_n = pow(p, gamma) * log(max((T)1. - p, (T)FLT_MIN));

    output[index] = (T)0.;
    output[index] += -flag_p * alpha * term_p;
    output[index] += -flag_n * ((T)1. - alpha) * term_n;
    if (weight != NULL) {
      output[index] *= weight[t];
    }
  }
}

// Backward pass of sigmoid focal loss.
//
// One loop iteration computes the derivative of the loss with respect to one
// (sample, class) logit:
//   p    = sigmoid(input)
//   grad = -alpha       * (1 - p)^gamma * (1 - p - gamma * p * log(p))
//                                                   if target == class
//          -(1 - alpha) * p^gamma * (gamma * (1 - p) * log(1 - p) - p)
//                                                   otherwise
// scaled by weight[target] when `weight` is given. log arguments are clamped
// to FLT_MIN to avoid log(0).
template <typename T>
__global__ void sigmoid_focal_loss_backward_cuda_kernel(
    const int nthreads, const T* input, const int64_t* target, const T* weight,
    T* grad_input, const T gamma, const T alpha, const int num_classes) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    const int sample = index / num_classes;
    const int cls = index % num_classes;
    const int64_t label = target[sample];

    // Indicator values (1 or 0) selecting the positive / negative term.
    const T is_pos = (label == cls);
    const T is_neg = (label != cls);

    // p = sigmoid(x) = 1 / (1 + exp(-x))
    const T prob = (T)1. / ((T)1. + exp(-input[index]));

    // (1 - p)**gamma * (1 - p - gamma*p*log(p))
    const T pos_term = pow(((T)1. - prob), gamma) *
                       ((T)1. - prob - (gamma * prob * log(max(prob, (T)FLT_MIN))));
    // p**gamma * (gamma * (1 - p) * log(1 - p) - p)
    const T neg_term = pow(prob, gamma) *
                       (gamma * ((T)1. - prob) * log(max((T)1. - prob, (T)FLT_MIN)) - prob);

    // Accumulate locally and store once.
    T grad = (T)0.;
    grad += -is_pos * alpha * pos_term;
    grad += -is_neg * ((T)1. - alpha) * neg_term;
    if (weight != NULL) {
      grad *= weight[label];
    }
    grad_input[index] = grad;
  }
}

#endif  // SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/softmax_focal_loss_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH
#define SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// Forward pass of softmax focal loss.
//
// One loop iteration handles one sample:
//   loss = -alpha * (1 - p)^gamma * log(p),  p = softmax[index, label]
// scaled by weight[label] when `weight` is given. Samples with a negative
// label produce a loss of 0.
//
// `softmax` and `weight` are only indexed when label >= 0: indexing them with
// a negative label would read outside the arrays, and the stray value could
// turn the intended 0 into NaN through the weight multiplication.
template <typename T>
__global__ void softmax_focal_loss_forward_cuda_kernel(
    const int nthreads, const T* softmax, const int64_t* target,
    const T* weight, T* output, const T gamma, const T alpha,
    const int num_classes) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int64_t label = target[index];

    T loss = (T)0.;
    if (label >= 0) {
      T pred = softmax[index * num_classes + label];
      // log argument is clamped to FLT_MIN to avoid log(0).
      loss = -alpha * pow((T)1. - pred, gamma) * log(max(pred, (T)FLT_MIN));
      if (weight != NULL) {
        loss *= weight[label];
      }
    }
    output[index] = loss;
  }
}

// First stage of the softmax focal loss backward pass.
//
// For each sample, stores in `buff` the per-sample factor
//   alpha * (-(1 - p)^gamma + gamma * (1 - p)^(gamma - 1) * p * log(p)),
//   p = softmax[index, label]
// scaled by weight[label] when `weight` is given. Samples with a negative
// label store 0.
//
// `softmax` and `weight` are only indexed when label >= 0: indexing them with
// a negative label would read outside the arrays, and the stray value could
// turn the intended 0 into NaN through the weight multiplication.
template <typename T>
__global__ void softmax_focal_loss_backward_cuda1_kernel(
    const int nthreads, const T* softmax, const int64_t* target,
    const T* weight, T* buff, const T gamma, const T alpha,
    const int num_classes) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int64_t label = target[index];

    T factor = (T)0.;
    if (label >= 0) {
      T pred = softmax[index * num_classes + label];
      // log argument is clamped to FLT_MIN to avoid log(0).
      factor = alpha * (-pow((T)1. - pred, gamma) +
                        gamma * pow((T)1. - pred, gamma - 1) * pred *
                            log(max(pred, (T)FLT_MIN)));
      if (weight != NULL) {
        factor *= weight[label];
      }
    }
    buff[index] = factor;
  }
}

// Second stage of the softmax focal loss backward pass.
//
// For each (sample, class) element:
//   grad_input = buff[sample] * (1[label == class] - softmax[index])
// where buff holds the per-sample factor from the first stage. Samples with
// a negative label get a zero gradient.
template <typename T>
__global__ void softmax_focal_loss_backward_cuda2_kernel(
    const int nthreads, const T* softmax, const int64_t* target, const T* buff,
    T* grad_input, const int num_classes) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    const int sample = index / num_classes;
    const int cls = index % num_classes;
    const int64_t label = target[sample];

    if (label < 0) {
      grad_input[index] = 0;
      continue;
    }

    const T is_target = (label == cls ? (T)1. : (T)0.);
    grad_input[index] = buff[sample] * (is_target - softmax[index]);
  }
}

#endif  // SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/spconv/indice.cuh
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef INDICE_CU_H_
#define INDICE_CU_H_
#include <utils/spconv/spconv/geometry.h>
#include <utils/spconv/tensorview/tensorview.h>

#include <utils/spconv/tensorview/helper_kernel.cuh>

// Builds the (input, output) index pairs for a sparse convolution.
//
// For every active input row ix of indicesIn, getValidOutPos fills
// validPoints with numValidPoints records of NDim + 1 values each; the last
// value of a record is used as the kernel offset. For each record a slot is
// reserved in indiceNum[offset] with atomicAdd, and the pair
// (ix, flat output index) is stored in indicePairs(offset, 0/1, slot). The
// flat output index is also written to indicePairUnique.
//
// indicesOut and gridsOut are not used in this kernel body.
// NOTE(review): validPoints holds KernelMaxVolume records; this assumes
// getValidOutPos never produces more than that -- confirm.
template <typename Index, typename IndexGrid, unsigned NDim,
          int KernelMaxVolume = 256>
__global__ void prepareIndicePairsKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,
    tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  // Number of output positions per batch element (product of the shape).
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  // Product of the kernel sizes; computed but not read below.
  Index kernelVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    kernelVolume *= kernelSize[i];
  }
  Index numValidPoints = 0;
  // Per-thread scratch: records of NDim coordinates followed by one offset.
  Index validPoints[KernelMaxVolume * (NDim + 1)];
  Index *pointPtr = nullptr;
  auto indicePairsDim2 = indicePairs.dim(2);
  Index index;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    // Column 0 of an indicesIn row is skipped here and used further below as
    // the multiplier of spatialVolume (presumably the batch index).
    numValidPoints = getValidOutPos<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
        validPoints);
    for (Index i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
      // Reserve the next free slot for this kernel offset.
      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
      indicePairs(offset, 0, oldNum) = ix;
      // Flat output index across the whole batch.
      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
              spatialVolume * indicesIn(ix, 0);
      indicePairs(offset, 1, oldNum) = index;
      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
    }
  }
}

// Builds the (input, output) index pairs for a sparse transposed
// convolution.
//
// Same procedure as prepareIndicePairsKernel, except that the candidate
// output positions come from getValidOutPosTranspose. For each record a slot
// is reserved in indiceNum[offset] with atomicAdd, and the pair
// (ix, flat output index) is stored in indicePairs(offset, 0/1, slot). The
// flat output index is also written to indicePairUnique.
//
// indicesOut and gridsOut are not used in this kernel body.
// NOTE(review): validPoints holds KernelMaxVolume records; this assumes
// getValidOutPosTranspose never produces more than that -- confirm.
template <typename Index, typename IndexGrid, unsigned NDim,
          int KernelMaxVolume = 256>
__global__ void prepareDeConvIndicePairsKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,
    tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  // Number of output positions per batch element (product of the shape).
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  // Product of the kernel sizes; computed but not read below.
  Index kernelVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    kernelVolume *= kernelSize[i];
  }
  Index numValidPoints = 0;
  // Per-thread scratch: records of NDim coordinates followed by one offset.
  Index validPoints[KernelMaxVolume * (NDim + 1)];
  Index *pointPtr = nullptr;
  auto indicePairsDim2 = indicePairs.dim(2);
  Index index;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    // Column 0 of an indicesIn row is skipped here and used further below as
    // the multiplier of spatialVolume (presumably the batch index).
    numValidPoints = getValidOutPosTranspose<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
        validPoints);
    for (Index i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
      // Reserve the next free slot for this kernel offset.
      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
      indicePairs(offset, 0, oldNum) = ix;
      // Flat output index across the whole batch.
      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
              spatialVolume * indicesIn(ix, 0);
      indicePairs(offset, 1, oldNum) = index;
      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
    }
  }
}

// Assigns a dense output row number to every unique flat output index.
//
// For output row ix: gridsOut[flat] = ix records the mapping from flat index
// to row; tv::rowArrayIdxInv writes the spatial coordinates of the flat index
// into columns 1..NDim of indicesOut, and its return value modulo batchSize
// is stored in column 0. indicePairs is not used in this kernel body.
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void assignGridAndIndiceOutKernel(
    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
    int numAct, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {
  auto outBase = indicesOut.data();
  for (int ix : tv::KernelLoopX<int>(numAct)) {
    Index flatIdx = indicePairUnique[ix];
    gridsOut[flatIdx] = ix;

    // Decompose the flat index; coordinates land after the first column.
    Index remainder = tv::rowArrayIdxInv<Index, NDim>(
        flatIdx, outBase + ix * (NDim + 1) + 1, outSpatialShape.data());
    indicesOut[ix * (NDim + 1)] = remainder % batchSize;
  }
}

// Rewrites the output side of every index pair from a flat output index to
// the dense output row stored in gridsOut.
//
// Entries of indicePairs(i, 1, ix) that are not greater than -1 are left
// unchanged. indicesOut, indicePairUnique and outSpatialShape are not used in
// this kernel body.
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void assignIndicePairsKernel(
    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
    int numActIn, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  const int numOffsets = indicePairs.dim(0);
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    for (int k = 0; k < numOffsets; ++k) {
      Index flatIdx = indicePairs(k, 1, ix);
      if (!(flatIdx > -1)) continue;  // unused slot
      indicePairs(k, 1, ix) = gridsOut[flatIdx];
    }
  }
}

// Fills the dense grid used by the submanifold index-pair kernel.
//
// For every active input row ix, the flat index of its coordinates
// (columns 1..NDim of indicesIn, offset by spatialVolume times column 0) is
// computed and gridsOut[flat] is set to ix.
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void prepareSubMGridKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto activeCount = indicesIn.dim(0);

  // Number of positions per batch element (product of the spatial shape).
  Index spatialVolume = 1;
#pragma unroll
  for (int d = 0; d < NDim; ++d) {
    spatialVolume *= outSpatialShape[d];
  }

  for (int ix : tv::KernelLoopX<int>(activeCount)) {
    Index flatIdx =
        tv::rowArrayIdx<Index, NDim>(indicesIn.data() + ix * (NDim + 1) + 1,
                                     outSpatialShape.data()) +
        spatialVolume * indicesIn(ix, 0);
    gridsOut[flatIdx] = ix;
  }
}

// Builds the (input, output) index pairs for a submanifold sparse
// convolution.
//
// For every active input row ix, getValidOutPos fills validPoints with
// numValidPoints records of NDim + 1 values each; the last value of a record
// is used as the kernel offset. A pair is only emitted when gridsOut holds a
// value greater than -1 at the candidate's flat index; in that case a slot is
// reserved in indiceNum[offset] with atomicAdd and
// (ix, gridsOut[flat index]) is stored in indicePairs(offset, 0/1, slot).
//
// NOTE(review): validPoints holds KernelMaxVolume records; this assumes
// getValidOutPos never produces more than that -- confirm.
template <typename Index, typename IndexGrid, unsigned NDim,
          int KernelMaxVolume = 256>
__global__ void getSubMIndicePairsKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  // Number of positions per batch element (product of the spatial shape).
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index numValidPoints = 0;
  // Per-thread scratch: records of NDim coordinates followed by one offset.
  Index validPoints[KernelMaxVolume * (NDim + 1)];
  Index *pointPtr = nullptr;
  Index index = 0;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    numValidPoints = getValidOutPos<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
        validPoints);
    for (int i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
      // Flat index of the candidate position across the whole batch.
      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
              spatialVolume * indicesIn(ix, 0);
      // Only positions marked in the grid produce a pair.
      if (gridsOut[index] > -1) {
        // Reserve the next free slot for this kernel offset.
        auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
        indicePairs(offset, 1, oldNum) = gridsOut[index];
        indicePairs(offset, 0, oldNum) = ix;
      }
    }
  }
}

// Resets to -1 every gridsOut entry addressed by the first numAct values of
// indicePairUnique.
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void resetGridKernel(const Index *indicePairUnique,
                                tv::TensorView<IndexGrid> gridsOut,
                                int numAct) {
  for (int ix : tv::KernelLoopX<int>(numAct)) {
    const Index slot = indicePairUnique[ix];
    gridsOut[slot] = -1;
  }
}

// Resets to -1 the gridsOut entries of the given index rows.
//
// Each row of `indices` has NDim + 1 values: element 0 multiplies
// spatialVolume and elements 1..NDim are flattened with tv::rowArrayIdx
// against a local int copy of the spatial shape.
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void resetGridSubMKernel(
    const Index *indices, tv::TensorView<IndexGrid> gridsOut,
    const tv::SimpleVector<Index, NDim> outSpatialShape, int numAct) {
  // Local copy of the shape plus its product, gathered in a single pass.
  int shapeLocal[NDim];
  Index spatialVolume = 1;
#pragma unroll
  for (int d = 0; d < NDim; ++d) {
    shapeLocal[d] = outSpatialShape[d];
    spatialVolume *= outSpatialShape[d];
  }

  for (int ix : tv::KernelLoopX<int>(numAct)) {
    const Index *row = indices + ix * (NDim + 1);
    Index spatialIdx = tv::rowArrayIdx<Index, NDim>(row + 1, shapeLocal);
    gridsOut[spatialIdx + spatialVolume * row[0]] = -1;
  }
}

#endif


================================================
FILE: mmcv/ops/csrc/common/cuda/spconv/reordering.cuh
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef REORDERING_CU_H_
#define REORDERING_CU_H_
#include <utils/spconv/tensorview/helper_kernel.cuh>

// Gathers feature rows into a contiguous buffer:
//   buffer[r * numPlanes + p] = features[indices[r] * numPlanes + p]
// for r < size and p < numPlanes.
//
// Each X-loop step handles NumILP rows spaced gridDim.x * blockDim.x apart;
// rows at or beyond `size` are skipped.
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void gatherGenericKernel(scalar_t *buffer, const scalar_t *features,
                                    const Index *indices, int size,
                                    int numPlanes) {
  int rowStep[NumILP];  // row offset of each ILP lane
  Index srcBase[NumILP];  // start of the source row of each ILP lane
#pragma unroll
  for (int lane = 0; lane < NumILP; lane++)
    rowStep[lane] = lane * gridDim.x * blockDim.x;

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
    // Resolve the source rows once per X step.
#pragma unroll
    for (int lane = 0; lane < NumILP; lane++) {
      const int row = ix + rowStep[lane];
      if (row < size) {
        srcBase[lane] = indices[row] * numPlanes;
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int lane = 0; lane < NumILP; ++lane) {
        const int row = ix + rowStep[lane];
        if (row < size) {
          buffer[row * numPlanes + iy] = features[srcBase[lane] + iy];
        }
      }
    }
  }
}

// Vectorised variant of gatherGenericKernel: the same gather, but every
// element is loaded and stored through VecType pointers, so all offsets
// below are in units of VecType.
// NOTE(review): this presumably requires the caller to pass numPlanes already
// divided by sizeof(VecType) / sizeof(scalar_t) and suitably aligned
// pointers -- confirm against the launcher.
template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType>
__global__ void gatherVecKernel(scalar_t *buffer, const scalar_t *features,
                                const Index *indices, int size, int numPlanes) {
  // Row offset of each ILP lane.
  int ILPStrideX[NumILP];
  // Start of the source row of each ILP lane.
  Index inds[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
    // Resolve the source rows once per X step; rows >= size are skipped.
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size)
        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size)
          reinterpret_cast<VecType *>(
              buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
              reinterpret_cast<const VecType *>(features)[inds[ilp] + iy];
      }
    }
  }
}

// Block-tiled vectorised gather.
//
// blockIdx.x selects a tile of NumTLP scalar elements along the feature
// dimension (both pointers are advanced by blockIdx.x * NumTLP), threadIdx.x
// selects the VecType element within the tile, and the Y loop walks the rows,
// NumILP rows per step spaced gridDim.y * blockDim.y apart.
//
// NOTE(review): unlike gatherVecKernel there is no `< size` check on
// iy + ILPStrideY[ilp]; presumably the launcher only uses this kernel for a
// row count that divides evenly -- confirm.
template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void gatherVecBlockKernel(scalar_t *buffer, const scalar_t *features,
                                     const Index *indices, int size,
                                     int numPlanes) {
  // Row offset of each ILP lane.
  int ILPStrideY[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
  // Move both pointers to the start of this block's feature tile.
  features += blockIdx.x * NumTLP;
  buffer += blockIdx.x * NumTLP;

  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      // Offsets are in VecType units because of the pointer casts.
      reinterpret_cast<VecType *>(
          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] =
          reinterpret_cast<const VecType *>(
              features)[indices[iy + ILPStrideY[ilp]] * numPlanes +
                        threadIdx.x];
    }
  }
}

// Scatter-adds buffer rows into the output features:
//   outFeatures[indices[r] * numPlanes + p] += buffer[r * numPlanes + p]
// for r < size and p < numPlanes.
//
// Each X-loop step handles NumILP rows spaced gridDim.x * blockDim.x apart;
// rows at or beyond `size` are skipped. The addition is a plain
// read-modify-write, not an atomic operation.
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddGenericKernel(scalar_t *outFeatures,
                                        const scalar_t *buffer,
                                        const Index *indices, int size,
                                        int numPlanes) {
  int rowStep[NumILP];  // row offset of each ILP lane
  Index dstBase[NumILP];  // start of the destination row of each ILP lane
#pragma unroll
  for (int lane = 0; lane < NumILP; lane++)
    rowStep[lane] = lane * gridDim.x * blockDim.x;

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
    // Resolve the destination rows once per X step.
#pragma unroll
    for (int lane = 0; lane < NumILP; lane++) {
      const int row = ix + rowStep[lane];
      if (row < size) {
        dstBase[lane] = indices[row] * numPlanes;
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int lane = 0; lane < NumILP; ++lane) {
        const int row = ix + rowStep[lane];
        if (row < size) {
          outFeatures[dstBase[lane] + iy] += buffer[row * numPlanes + iy];
        }
      }
    }
  }
}

// Block-tiled vectorised scatter-add.
//
// blockIdx.x selects a tile of NumTLP scalar elements along the feature
// dimension, threadIdx.x selects the VecType element within the tile, and the
// Y loop walks the rows, NumILP rows per step. For every row one VecType is
// loaded from outFeatures and one from buffer, the contained scalars are
// added element-wise, and the result is stored back to outFeatures.
//
// The update is a plain load/add/store, not an atomic operation.
// NOTE(review): there is no `< size` check on iy + ILPStrideY[ilp];
// presumably the launcher only uses this kernel for a row count that divides
// evenly -- confirm.
template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void scatterAddVecBlockKernel(scalar_t *outFeatures,
                                         const scalar_t *buffer,
                                         const Index *indices, int size,
                                         int numPlanes) {
  // Row offset of each ILP lane.
  int ILPStrideY[NumILP];
  // Number of scalars packed in one VecType.
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
  // Move both pointers to the start of this block's feature tile.
  outFeatures += blockIdx.x * NumTLP;
  buffer += blockIdx.x * NumTLP;
  // Staging arrays for one vector of the destination and of the source.
  scalar_t buf[vecloadFactor];
  scalar_t buf2[vecloadFactor];
  Index idx;
  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      // Destination offset, in VecType units because of the casts below.
      idx = indices[iy + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      reinterpret_cast<VecType *>(buf)[0] =
          reinterpret_cast<VecType *>(outFeatures)[idx];
      reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>(
          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x];
      // Element-wise accumulate on the unpacked scalars.
#pragma unroll
      for (int i = 0; i < vecloadFactor; i++) {
        buf[i] += buf2[i];
      }
      reinterpret_cast<VecType *>(outFeatures)[idx] =
          reinterpret_cast<VecType *>(buf)[0];
    }
  }
}

#endif


================================================
FILE: mmcv/ops/csrc/common/cuda/stack_ball_query_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu
#ifndef STACK_BALL_QUERY_CUDA_KERNEL_CUH
#define STACK_BALL_QUERY_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// Ball query over stacked (variable-length) batches.
//
// :param xyz: (N1 + N2 ..., 3) xyz coordinates of the features
// :param xyz_batch_cnt: (batch_size), [N1, N2, ...]
// :param new_xyz: (M1 + M2 ..., 3) centers of the ball query
// :param new_xyz_batch_cnt: (batch_size), [M1, M2, ...]
// output:
//      idx: (M, nsample)
//
// For every query centre pt_idx the kernel scans the points of the same batch
// element and records up to nsample indices (relative to the start of that
// batch element) whose squared distance is below radius^2. On the first hit
// all nsample slots are filled with that index so unused slots repeat it. If
// no point is inside the ball, slot 0 is set to -1.
//
// The per-point base pointers are recomputed from `xyz` / `idx` on every loop
// iteration. Keeping them outside the loop and advancing them with `+=`
// would carry the offset of the previous point into the next iteration of
// the same thread and corrupt its reads and writes.
//
// NOTE(review): assumes M equals the sum of new_xyz_batch_cnt so the batch
// search always terminates with bs_idx < B -- confirm with caller.
template <typename T>
__global__ void stack_ball_query_forward_cuda_kernel(
    int B, int M, float radius, int nsample, const T *new_xyz,
    const int *new_xyz_batch_cnt, const T *xyz, const int *xyz_batch_cnt,
    int *idx) {
  const float radius2 = radius * radius;
  CUDA_1D_KERNEL_LOOP(pt_idx, M) {
    // Find the batch element this query centre belongs to.
    int bs_idx = 0;
    for (int pt_cnt = 0; bs_idx < B; bs_idx++) {
      pt_cnt += new_xyz_batch_cnt[bs_idx];
      if (pt_idx < pt_cnt) break;
    }

    // First point of that batch element inside the stacked xyz array.
    int xyz_batch_start_idx = 0;
    for (int k = 0; k < bs_idx; k++) xyz_batch_start_idx += xyz_batch_cnt[k];

    const T *new_xyz_p = new_xyz + pt_idx * 3;
    // Fresh base pointers for this query centre (see header comment).
    const T *cur_xyz = xyz + xyz_batch_start_idx * 3;
    int *cur_idx = idx + pt_idx * nsample;

    T new_x = new_xyz_p[0];
    T new_y = new_xyz_p[1];
    T new_z = new_xyz_p[2];
    int n = xyz_batch_cnt[bs_idx];

    int cnt = 0;
    for (int k = 0; k < n; ++k) {
      T x = cur_xyz[k * 3 + 0];
      T y = cur_xyz[k * 3 + 1];
      T z = cur_xyz[k * 3 + 2];
      T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
             (new_z - z) * (new_z - z);
      if (d2 < radius2) {
        if (cnt == 0) {
          // First hit: pre-fill every slot so unused ones repeat it.
          for (int l = 0; l < nsample; ++l) {
            cur_idx[l] = k;
          }
        }
        cur_idx[cnt] = k;
        ++cnt;
        if (cnt >= nsample) break;
      }
    }
    // Mark query centres without any neighbour.
    if (cnt == 0) cur_idx[0] = -1;
  }
}

#endif  // STACK_BALL_QUERY_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/stack_group_points_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu
#ifndef STACK_GROUP_POINTS_CUDA_KERNEL_CUH
#define STACK_GROUP_POINTS_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
#include <stdio.h>
template <typename T>
__global__ void stack_group_points_forward_cuda_kernel(
    int b, int c, int m, int nsample, const T *features,
    const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt,
    T *out) {
  // Gathers the feature rows selected by `idx` into a grouped output tensor.
  // :param b: number of batches (length of both *_batch_cnt arrays)
  // :param c: number of feature channels
  // :param m: total number of query points, M1 + M2 + ...
  // :param nsample: number of gathered samples per query point
  // :param features: (N1 + N2 ..., C) tensor of features to group
  // :param features_batch_cnt: (batch_size) [N1, N2, ...] number of feature
  //     rows that belong to each batch
  // :param idx: (M1 + M2 ..., nsample) row indices of the features to group;
  //     each index is relative to the first feature row of its own batch
  // :param idx_batch_cnt: (batch_size) [M1, M2, ...] number of query points
  //     that belong to each batch
  // :return:
  //     out: (M1 + M2 ..., C, nsample) tensor; entries whose index is not a
  //     valid row of the owning batch are left untouched
  CUDA_1D_KERNEL_LOOP(index, m * c * nsample) {
    // index < m * c * nsample, so the three coordinates below are in range.
    const int sample_idx = index % nsample;
    const int c_idx = (index / nsample) % c;
    const int pt_idx = index / nsample / c;

    // Find the batch that owns query point pt_idx.
    int bs_idx = 0, pt_cnt = idx_batch_cnt[0];
    for (int k = 1; k < b; k++) {
      if (pt_idx < pt_cnt) break;
      pt_cnt += idx_batch_cnt[k];
      bs_idx = k;
    }

    // First feature row of that batch and the number of rows it owns.
    int features_batch_start_idx = 0;
    for (int k = 0; k < bs_idx; k++) {
      features_batch_start_idx += features_batch_cnt[k];
    }
    const int features_in_batch = features_batch_cnt[bs_idx];

    // The index is batch-local, so it must be validated against the size of
    // the batch itself. Comparing it against the absolute end offset (as the
    // previous check did) lets rows past the batch through and can read
    // beyond the features buffer; negative indices were not rejected at all.
    const int local_row = idx[pt_idx * nsample + sample_idx];
    if (local_row < 0 || local_row >= features_in_batch) continue;

    const T *cur_features = features + features_batch_start_idx * c;
    const int out_idx = pt_idx * c * nsample + c_idx * nsample + sample_idx;
    out[out_idx] = cur_features[local_row * c + c_idx];
  }
}

template <typename T>
__global__ void stack_group_points_backward_cuda_kernel(
    int b, int c, int m, int n, int nsample, const T *grad_out, const int *idx,
    const int *idx_batch_cnt, const int *features_batch_cnt, T *grad_features) {
  // Scatters the gradient of every grouped output element back onto the
  // feature row it was gathered from.
  // :param b: number of batches (length of both *_batch_cnt arrays)
  // :param c: number of feature channels
  // :param m: total number of query points, M1 + M2 + ...
  // :param n: not read by this kernel
  // :param nsample: number of samples per query point
  // :param grad_out: (M1 + M2 ..., C, nsample) gradients of the forward output
  // :param idx: (M1 + M2 ..., nsample) row indices of the grouped features,
  //     relative to the first feature row of the owning batch
  // :param idx_batch_cnt: (batch_size) [M1, M2, ...] query points per batch
  // :param features_batch_cnt: (batch_size) [N1, N2, ...] feature rows per
  //     batch
  // :return:
  //     grad_features: (N1 + N2 ..., C) gradient of the features, accumulated
  //     with atomicAdd
  CUDA_1D_KERNEL_LOOP(index, m * c * nsample) {
    const int sample_idx = index % nsample;
    const int c_idx = (index / nsample) % c;
    const int pt_idx = index / nsample / c;

    if (!(pt_idx < m && c_idx < c && sample_idx < nsample)) return;

    // Walk the per-batch query counts until the batch owning pt_idx is found.
    int bs_idx = 0;
    int pt_cnt = idx_batch_cnt[0];
    for (int k = 1; k < b && pt_idx >= pt_cnt; ++k) {
      pt_cnt += idx_batch_cnt[k];
      bs_idx = k;
    }

    // First feature row of that batch.
    int features_batch_start_idx = 0;
    for (int k = 0; k < bs_idx; ++k) {
      features_batch_start_idx += features_batch_cnt[k];
    }

    const int grad_offset =
        pt_idx * c * nsample + c_idx * nsample + sample_idx;
    const int local_row = idx[pt_idx * nsample + sample_idx];
    T *target =
        grad_features + (features_batch_start_idx + local_row) * c + c_idx;

    // Several samples may reference the same feature row, hence the atomic.
    atomicAdd(target, grad_out[grad_offset]);
  }
}

#endif  // STACK_GROUP_POINTS_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/sync_bn_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef SYNCBN_CUDA_KERNEL_CUH
#define SYNCBN_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

template <typename T>
__global__ void sync_bn_forward_mean_cuda_kernel(const T *input, float *mean,
                                                 int num, int channels,
                                                 int spatial) {
  // Computes the mean of one channel per block: block blockIdx.x averages the
  // num * spatial elements of channel c, with `input` indexed as
  // (num, channels, spatial), and writes the result to mean[c].
  // The shared buffer holds THREADS_PER_BLOCK partial sums, and the halving
  // reduction below only covers every slot when blockDim.x is a power of two.
  __shared__ float buffer[THREADS_PER_BLOCK];
  const int tid = threadIdx.x;
  const int c = blockIdx.x;
  const int total = num * spatial;

  // Each thread accumulates a strided share of the channel.
  float partial = 0;
  for (int i = tid; i < total; i += blockDim.x) {
    const int index =
        (i / spatial) * channels * spatial + c * spatial + i % spatial;
    partial += input[index];
  }
  buffer[tid] = partial;
  __syncthreads();

  // Pairwise reduction of the per-thread partial sums in shared memory.
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s) buffer[tid] += buffer[tid + s];
    __syncthreads();
  }

  if (tid == 0) mean[c] = buffer[0] / total;
}

template <>
__global__ void sync_bn_forward_mean_cuda_kernel(const phalf *input,
                                                 float *mean, int num,
                                                 int channels, int spatial) {
  // phalf specialization of the per-channel mean: identical to the generic
  // kernel except that every input element is converted to float before it is
  // accumulated. Block blockIdx.x handles channel c of an input indexed as
  // (num, channels, spatial).
  __shared__ float buffer[THREADS_PER_BLOCK];
  const int tid = threadIdx.x;
  const int c = blockIdx.x;
  const int total = num * spatial;

  // Each thread accumulates a strided share of the channel.
  float partial = 0;
  for (int i = tid; i < total; i += blockDim.x) {
    const int index =
        (i / spatial) * channels * spatial + c * spatial + i % spatial;
    partial += static_cast<float>(input[index]);
  }
  buffer[tid] = partial;
  __syncthreads();

  // Pairwise reduction of the per-thread partial sums in shared memory.
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s) buffer[tid] += buffer[tid + s];
    __syncthreads();
  }

  if (tid == 0) mean[c] = buffer[0] / total;
}

template <typename T>
__global__ void sync_bn_forward_var_cuda_kernel(const T *input,
                                                const float *mean, float *var,
                                                int num, int channels,
                                                int spatial) {
  // Computes the (biased) variance of one channel per block: block blockIdx.x
  // averages the squared deviation from mean[c] over the num * spatial
  // elements of channel c and stores it in var[c]. `input` is indexed as
  // (num, channels, spatial).
  __shared__ float buffer[THREADS_PER_BLOCK];
  const int tid = threadIdx.x;
  const int c = blockIdx.x;
  const int total = num * spatial;

  // Per-thread sum of squared deviations over a strided share of the channel.
  float partial = 0;
  for (int i = tid; i < total; i += blockDim.x) {
    const int index =
        (i / spatial) * channels * spatial + c * spatial + i % spatial;
    const float td = input[index] - mean[c];
    partial += td * td;
  }
  buffer[tid] = partial;
  __syncthreads();

  // Pairwise reduction of the per-thread partial sums in shared memory.
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s) buffer[tid] += buffer[tid + s];
    __syncthreads();
  }

  if (tid == 0) var[c] = buffer[0] / total;
}

template <>
__global__ void sync_bn_forward_var_cuda_kernel(const phalf *input,
                                                const float *mean, float *var,
                                                int num, int channels,
                                                int spatial) {
  // phalf specialization of the per-channel (biased) variance: identical to
  // the generic kernel except that every input element is converted to float
  // before the mean is subtracted.
  __shared__ float buffer[THREADS_PER_BLOCK];
  const int tid = threadIdx.x;
  const int c = blockIdx.x;
  const int total = num * spatial;

  // Per-thread sum of squared deviations over a strided share of the channel.
  float partial = 0;
  for (int i = tid; i < total; i += blockDim.x) {
    const int index =
        (i / spatial) * channels * spatial + c * spatial + i % spatial;
    const float td = static_cast<float>(input[index]) - mean[c];
    partial += td * td;
  }
  buffer[tid] = partial;
  __syncthreads();

  // Pairwise reduction of the per-thread partial sums in shared memory.
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s) buffer[tid] += buffer[tid + s];
    __syncthreads();
  }

  if (tid == 0) var[c] = buffer[0] / total;
}

template <typename T>
__global__ void sync_bn_forward_output_cuda_kernel(
    const T *input, const float *mean, const float *var, float *running_mean,
    float *running_var, const float *weight, const float *bias, float *norm,
    float *std, T *output, int num, int channels, int spatial, float eps,
    float momentum, int group_size) {
  // Normalizes one channel per block (channel c = blockIdx.x) of an input
  // indexed as (num, channels, spatial):
  //   normalized = (input - mean[c]) / sqrt(var[c] + eps)
  //   output     = normalized * weight[c] + bias[c]   when weight is given
  // Optional outputs, each skipped when its pointer is null:
  //   norm         - the normalized value of every element
  //   std          - sqrt(var[c] + eps) for the channel
  //   running_mean - momentum blend of the running statistics; running_var is
  //                  updated together with it using the unbiased variance over
  //                  num * spatial * group_size samples
  const int tid = threadIdx.x;
  const int c = blockIdx.x;
  const int total = num * spatial;
  const float mean_value = mean[c];
  const float std_value = sqrt(var[c] + eps);

  const bool has_affine = weight != nullptr;
  const bool keep_norm = norm != nullptr;
  // bias is only dereferenced when weight is present.
  float weight_value = 0;
  float bias_value = 0;
  if (has_affine) {
    weight_value = weight[c];
    bias_value = bias[c];
  }

  for (int i = tid; i < total; i += blockDim.x) {
    const int index =
        (i / spatial) * channels * spatial + c * spatial + i % spatial;
    if (keep_norm) {
      // The output is derived from the value stored in norm.
      norm[index] = (input[index] - mean_value) / std_value;
      if (has_affine) {
        output[index] = norm[index] * weight_value + bias_value;
      } else {
        output[index] = norm[index];
      }
    } else if (has_affine) {
      output[index] =
          (input[index] - mean_value) / std_value * weight_value + bias_value;
    } else {
      output[index] = (input[index] - mean_value) / std_value;
    }
  }

  // One thread per block publishes the per-channel statistics.
  if (tid != 0) return;
  if (std != nullptr) std[c] = std_value;
  if (running_mean != nullptr) {
    running_mean[c] = momentum * mean_value + (1 - momentum) * running_mean[c];
    int count = num * spatial * group_size;
    float var_unbias = count > 1 ? var[c] * count / (count - 1) : var[c];
    running_var[c] = momentum * var_unbias + (1 - momentum) * running_var[c];
  }
}

template <>
__global__ void sync_bn_forward_output_cuda_kernel(
    const phalf *input, const float *mean, const float *var,
    float *running_mean, float *running_var, const float *weight,
    const float *bias, float *norm, float *std, phalf *output, int num,
    int channels, int spatial, float eps, float momentum, int group_size) {
  // phalf specialization of the normalization kernel. The arithmetic is done
  // in float: every input element is converted to float first and the result
  // is converted back to phalf when it is stored in `output`.
  // Block blockIdx.x handles channel c; norm, std and running_mean/running_var
  // are optional and skipped when the corresponding pointer is null.
  const int tid = threadIdx.x;
  const int c = blockIdx.x;
  const int total = num * spatial;
  const float mean_value = mean[c];
  const float std_value = sqrt(var[c] + eps);

  const bool has_affine = weight != nullptr;
  const bool keep_norm = norm != nullptr;
  // bias is only dereferenced when weight is present.
  float weight_value = 0;
  float bias_value = 0;
  if (has_affine) {
    weight_value = weight[c];
    bias_value = bias[c];
  }

  for (int i = tid; i < total; i += blockDim.x) {
    const int index =
        (i / spatial) * channels * spatial + c * spatial + i % spatial;
    if (keep_norm) {
      // The output is derived from the value stored in norm.
      norm[index] =
          (static_cast<float>(input[index]) - mean_value) / std_value;
      if (has_affine) {
        output[index] =
            static_cast<phalf>(norm[index] * weight_value + bias_value);
      } else {
        output[index] = static_cast<phalf>(norm[index]);
      }
    } else if (has_affine) {
      output[index] =
          static_cast<phalf>((static_cast<float>(input[index]) - mean_value) /
                                 std_value * weight_value +
                             bias_value);
    } else {
      output[index] = static_cast<phalf>(
          (static_cast<float>(input[index]) - mean_value) / std_value);
    }
  }

  // One thread per block publishes the per-channel statistics.
  if (tid != 0) return;
  if (std != nullptr) std[c] = std_value;
  if (running_mean != nullptr) {
    running_mean[c] = momentum * mean_value + (1 - momentum) * running_mean[c];
    int count = num * spatial * group_size;
    float var_unbias = count > 1 ? var[c] * count / (count - 1) : var[c];
    running_var[c] = momentum * var_unbias + (1 - momentum) * running_var[c];
  }
}

template <typename T>
__global__ void sync_bn_backward_param_cuda_kernel(const T *grad_output,
                                                   const float *norm,
                                                   float *grad_weight,
                                                   float *grad_bias, int num,
                                                   int channels, int spatial) {
  // Reduces the parameter gradients of one channel per block (c = blockIdx.x)
  // over its num * spatial elements, with tensors indexed as
  // (num, channels, spatial):
  //   grad_weight[c] = sum(grad_output * norm)
  //   grad_bias[c]   = sum(grad_output)
  __shared__ float buffer1[THREADS_PER_BLOCK];
  __shared__ float buffer2[THREADS_PER_BLOCK];

  const int tid = threadIdx.x;
  const int c = blockIdx.x;
  const int total = num * spatial;

  // Per-thread partial sums over a strided share of the channel.
  float weight_sum = 0;
  float bias_sum = 0;
  for (int i = tid; i < total; i += blockDim.x) {
    const int index =
        (i / spatial) * channels * spatial + c * spatial + i % spatial;
    weight_sum += grad_output[index] * norm[index];
    bias_sum += grad_output[index];
  }
  buffer1[tid] = weight_sum;
  buffer2[tid] = bias_sum;
  __syncthreads();

  // Pairwise reduction of both partial-sum arrays in shared memory.
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s) {
      buffer1[tid] += buffer1[tid + s];
      buffer2[tid] += buffer2[tid + s];
    }
    __syncthreads();
  }

  if (tid != 0) return;
  grad_weight[c] = buffer1[0];
  grad_bias[c] = buffer2[0];
}

template <>
__global__ void sync_bn_backward_param_cuda_kernel(const phalf *grad_output,
                                                   const float *norm,
                                                   float *grad_weight,
                                                   float *grad_bias, int num,
                                                   int channels, int spatial) {
  // phalf specialization of the parameter-gradient reduction: identical to the
  // generic kernel except that grad_output is converted to float before use.
  //   grad_weight[c] = sum(grad_output * norm)
  //   grad_bias[c]   = sum(grad_output)
  __shared__ float buffer1[THREADS_PER_BLOCK];
  __shared__ float buffer2[THREADS_PER_BLOCK];

  const int tid = threadIdx.x;
  const int c = blockIdx.x;
  const int total = num * spatial;

  // Per-thread partial sums over a strided share of the channel.
  float weight_sum = 0;
  float bias_sum = 0;
  for (int i = tid; i < total; i += blockDim.x) {
    const int index =
        (i / spatial) * channels * spatial + c * spatial + i % spatial;
    weight_sum += static_cast<float>(grad_output[index]) * norm[index];
    bias_sum += static_cast<float>(grad_output[index]);
  }
  buffer1[tid] = weight_sum;
  buffer2[tid] = bias_sum;
  __syncthreads();

  // Pairwise reduction of both partial-sum arrays in shared memory.
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s) {
      buffer1[tid] += buffer1[tid + s];
      buffer2[tid] += buffer2[tid + s];
    }
    __syncthreads();
  }

  if (tid != 0) return;
  grad_weight[c] = buffer1[0];
  grad_bias[c] = buffer2[0];
}

template <typename T>
__global__ void sync_bn_backward_data_cuda_kernel(
    int output_size, const T *grad_output, const float *weight,
    const float *grad_weight, const float *grad_bias, const float *norm,
    const float *std, T *grad_input, int num, int channels, int spatial) {
  // Computes the input gradient element by element:
  //   grad_input = weight[c] *
  //                (grad_output -
  //                 (grad_weight[c] * norm + grad_bias[c]) / (num * spatial)) /
  //                std[c]
  // where c is the channel of the element in a (num, channels, spatial)
  // indexing of the flat `index`.
  const int factor = num * spatial;
  CUDA_1D_KERNEL_LOOP(index, output_size) {
    const int c = (index / spatial) % channels;
    const float scale = weight[c];
    const float channel_grad_weight = grad_weight[c];
    const float channel_grad_bias = grad_bias[c];
    grad_input[index] =
        scale *
        (grad_output[index] -
         (channel_grad_weight * norm[index] + channel_grad_bias) / factor) /
        std[c];
  }
}

template <>
__global__ void sync_bn_backward_data_cuda_kernel(
    int output_size, const phalf *grad_output, const float *weight,
    const float *grad_weight, const float *grad_bias, const float *norm,
    const float *std, phalf *grad_input, int num, int channels, int spatial) {
  // phalf specialization of the input-gradient kernel: grad_output is
  // converted to float, the formula of the generic kernel is evaluated in
  // float, and the result is converted back to phalf when stored.
  const int factor = num * spatial;
  CUDA_1D_KERNEL_LOOP(index, output_size) {
    const int c = (index / spatial) % channels;
    const float scale = weight[c];
    const float channel_grad_weight = grad_weight[c];
    const float channel_grad_bias = grad_bias[c];
    grad_input[index] = static_cast<phalf>(
        scale *
        (static_cast<float>(grad_output[index]) -
         (channel_grad_weight * norm[index] + channel_grad_bias) / factor) /
        std[c]);
  }
}

#endif  // SYNCBN_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef THREE_INTERPOLATE_CUDA_KERNEL_CUH
#define THREE_INTERPOLATE_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

template <typename T>
__global__ void three_interpolate_forward_cuda_kernel(
    int b, int c, int m, int n, const T *points, const int *__restrict__ idx,
    const T *weight, T *out) {
  // Interpolates every output point as the weighted sum of three source
  // points.
  // points: (B, C, M)
  // idx: (B, N, 3)
  // weight: (B, N, 3)
  // output:
  //      out: (B, C, N)
  // The batch is taken from blockIdx.z and the channel from blockIdx.y.

  const int bs_idx = blockIdx.z;
  const int c_idx = blockIdx.y;
  if (bs_idx >= b || c_idx >= c) return;

  // Offsets that do not depend on the point are applied once, outside the
  // loop.
  const T *cur_points = points + bs_idx * c * m + c_idx * m;
  T *cur_out = out + bs_idx * c * n + c_idx * n;

  CUDA_1D_KERNEL_LOOP(pt_idx, n) {
    // Per-point pointers are derived from the untouched kernel arguments on
    // every iteration. Advancing the arguments themselves with `+=` would
    // accumulate the offsets if this loop runs more than once per thread
    // (i.e. when fewer threads than points are launched).
    const int triple_offset = bs_idx * n * 3 + pt_idx * 3;
    const T *cur_weight = weight + triple_offset;
    const int *cur_idx = idx + triple_offset;

    cur_out[pt_idx] = cur_weight[0] * cur_points[cur_idx[0]] +
                      cur_weight[1] * cur_points[cur_idx[1]] +
                      cur_weight[2] * cur_points[cur_idx[2]];
  }
}

template <typename T>
__global__ void three_interpolate_backward_cuda_kernel(
    int b, int c, int n, int m, const T *grad_out, const int *__restrict__ idx,
    const T *weight, T *grad_points) {
  // Scatters the gradient of every interpolated point back onto the three
  // source points it was built from, scaled by the interpolation weights.
  // grad_out: (B, C, N)
  // idx: (B, N, 3)
  // weight: (B, N, 3)
  // output:
  //      grad_points: (B, C, M), accumulated with atomicAdd
  // The batch is taken from blockIdx.z and the channel from blockIdx.y.

  const int bs_idx = blockIdx.z;
  const int c_idx = blockIdx.y;
  if (bs_idx >= b || c_idx >= c) return;

  // Offsets that do not depend on the point are applied once, outside the
  // loop.
  T *cur_grad_points = grad_points + bs_idx * c * m + c_idx * m;

  CUDA_1D_KERNEL_LOOP(pt_idx, n) {
    // Per-point pointers are derived from the untouched kernel arguments on
    // every iteration. Advancing the arguments themselves with `+=` would
    // accumulate the offsets if this loop runs more than once per thread
    // (i.e. when fewer threads than points are launched).
    const T grad = grad_out[bs_idx * c * n + c_idx * n + pt_idx];
    const int triple_offset = bs_idx * n * 3 + pt_idx * 3;
    const T *cur_weight = weight + triple_offset;
    const int *cur_idx = idx + triple_offset;

    // Different output points may share a source point, hence the atomics.
    atomicAdd(cur_grad_points + cur_idx[0], grad * cur_weight[0]);
    atomicAdd(cur_grad_points + cur_idx[1], grad * cur_weight[1]);
    atomicAdd(cur_grad_points + cur_idx[2], grad * cur_weight[2]);
  }
}

#endif  // THREE_INTERPOLATE_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef THREE_NN_CUDA_KERNEL_CUH
#define THREE_NN_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

template <typename T>
__global__ void three_nn_forward_cuda_kernel(int b, int n, int m,
                                             const T *unknown, const T *known,
                                             T *dist2, int *__restrict__ idx) {
  // For every unknown point, finds the three closest known points of the same
  // batch by brute force.
  // unknown: (B, N, 3)
  // known: (B, M, 3)
  // output:
  //      dist2: (B, N, 3) squared distances, in ascending order
  //      idx: (B, N, 3) indices of the matching known points
  // The batch is taken from blockIdx.y. When M < 3 the unused slots keep the
  // initial sentinel distance (1e40) and index 0.

  const int bs_idx = blockIdx.y;
  if (bs_idx >= b) return;

  // The known points of the batch do not depend on the query point.
  const T *cur_known = known + bs_idx * m * 3;

  CUDA_1D_KERNEL_LOOP(pt_idx, n) {
    // Per-point pointers are derived from the untouched kernel arguments on
    // every iteration. Advancing the arguments themselves with `+=` would
    // accumulate the offsets if this loop runs more than once per thread
    // (i.e. when fewer threads than points are launched).
    const int triple_offset = bs_idx * n * 3 + pt_idx * 3;
    const T *cur_unknown = unknown + triple_offset;
    T *cur_dist2 = dist2 + triple_offset;
    int *cur_idx = idx + triple_offset;

    T ux = cur_unknown[0];
    T uy = cur_unknown[1];
    T uz = cur_unknown[2];

    // Running top-3 (smallest squared distance first).
    double best1 = 1e40, best2 = 1e40, best3 = 1e40;
    int besti1 = 0, besti2 = 0, besti3 = 0;
    for (int k = 0; k < m; ++k) {
      T x = cur_known[k * 3 + 0];
      T y = cur_known[k * 3 + 1];
      T z = cur_known[k * 3 + 2];
      T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
      if (d < best1) {
        // New closest point: shift the previous best two down.
        best3 = best2;
        besti3 = besti2;
        best2 = best1;
        besti2 = besti1;
        best1 = d;
        besti1 = k;
      } else if (d < best2) {
        best3 = best2;
        besti3 = besti2;
        best2 = d;
        besti2 = k;
      } else if (d < best3) {
        best3 = d;
        besti3 = k;
      }
    }
    cur_dist2[0] = best1;
    cur_dist2[1] = best2;
    cur_dist2[2] = best3;
    cur_idx[0] = besti1;
    cur_idx[1] = besti2;
    cur_idx[2] = besti3;
  }
}

#endif  // THREE_NN_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/tin_shift_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef TIN_SHIFT_CUDA_KERNEL_CUH
#define TIN_SHIFT_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

template <typename T>
__global__ void tin_shift_forward_cuda_kernel(
    const int nthreads, const T* input, const int* shift, T* output,
    const int batch_size, const int channels, const int t_size,
    const int hw_size, const int group_size, const int group_channel) {
  // Shifts the data along the temporal axis. Each loop index addresses one
  // (batch, channel, hw) position; its t_size time steps are copied from
  // `input` at step t to `output` at step t + shift, where the shift is looked
  // up per batch and channel group. Steps that would land outside
  // [0, t_size) are skipped, so the corresponding output entries are not
  // written. Offsets are computed for a (batch, t, channels, hw) indexing.
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    const int hw_index = index % hw_size;
    const int channel = (index / hw_size) % channels;
    const int n_index = (index / hw_size / channels) % batch_size;

    const int group_id = channel / group_channel;
    const int t_shift = shift[n_index * group_size + group_id];

    // Offset of this (batch, channel, hw) position at time step 0.
    const int offset =
        n_index * t_size * hw_size * channels + hw_size * channel + hw_index;

    for (int src_t = 0; src_t < t_size; ++src_t) {
      const int dst_t = src_t + t_shift;
      if (dst_t >= 0 && dst_t < t_size) {
        const int data_id = src_t * hw_size * channels + offset;
        const int out_id = dst_t * hw_size * channels + offset;
        output[out_id] = input[data_id];
      }
    }
  }
}

template <typename T>
__global__ void tin_shift_backward_cuda_kernel(
    const int nthreads, const T* input, const int* shift, T* output,
    const int batch_size, const int channels, const int t_size,
    const int hw_size, const int group_size, const int group_channel) {
  // Backward counterpart of the temporal shift. The body performs the same
  // copy as the forward kernel: for each (batch, channel, hw) position the
  // value of `input` at step t is written to `output` at step t + shift, and
  // steps that would land outside [0, t_size) are skipped.
  // NOTE(review): presumably `input` is the output gradient and `output` the
  // input gradient, with the caller supplying the appropriate shift - confirm
  // against the launcher.
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    const int hw_index = index % hw_size;
    const int channel = (index / hw_size) % channels;
    const int n_index = (index / hw_size / channels) % batch_size;

    const int group_id = channel / group_channel;
    const int t_shift = shift[n_index * group_size + group_id];

    // Offset of this (batch, channel, hw) position at time step 0.
    const int offset =
        n_index * t_size * hw_size * channels + hw_size * channel + hw_index;

    for (int src_t = 0; src_t < t_size; ++src_t) {
      const int dst_t = src_t + t_shift;
      if (dst_t >= 0 && dst_t < t_size) {
        const int data_id = src_t * hw_size * channels + offset;
        const int out_id = dst_t * hw_size * channels + offset;
        output[out_id] = input[data_id];
      }
    }
  }
}

#endif  // TIN_SHIFT_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh
================================================
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef VOXELIZATION_CUDA_KERNEL_CUH
#define VOXELIZATION_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

// Reduction mode selector with three values (SUM, MEAN, MAX). It is not
// referenced by the kernels in this header; presumably it is consumed by the
// code that includes it - TODO confirm.
typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;

template <typename T, typename T_int>
__global__ void dynamic_voxelize_kernel(
    const T* points, T_int* coors, const float voxel_x, const float voxel_y,
    const float voxel_z, const float coors_x_min, const float coors_y_min,
    const float coors_z_min, const float coors_x_max, const float coors_y_max,
    const float coors_z_max, const int grid_x, const int grid_y,
    const int grid_z, const int num_points, const int num_features,
    const int NDim) {
  // Maps every point to the integer coordinates of the voxel containing it.
  // The first three features of a point are read as x, y, z; each is turned
  // into a grid index with floor((value - min) / voxel_size). A valid point
  // gets coors = (z, y, x). As soon as one axis falls outside its grid, the
  // entries up to and including that axis are set to -1 (so entry 0 is always
  // -1 for an invalid point) and the remaining entries are left untouched.
  // The coors_*_max arguments are not read here.
  CUDA_1D_KERNEL_LOOP(index, num_points) {
    const T* point = points + index * num_features;
    T_int* coor = coors + index * NDim;

    const int c_x = floorf((point[0] - coors_x_min) / voxel_x);
    if (!(c_x >= 0 && c_x < grid_x)) {
      coor[0] = -1;
      continue;
    }

    const int c_y = floorf((point[1] - coors_y_min) / voxel_y);
    if (!(c_y >= 0 && c_y < grid_y)) {
      coor[0] = -1;
      coor[1] = -1;
      continue;
    }

    const int c_z = floorf((point[2] - coors_z_min) / voxel_z);
    if (!(c_z >= 0 && c_z < grid_z)) {
      coor[0] = -1;
      coor[1] = -1;
      coor[2] = -1;
      continue;
    }

    // Valid point: coordinates are stored in (z, y, x) order.
    coor[0] = c_z;
    coor[1] = c_y;
    coor[2] = c_x;
  }
}

template <typename T, typename T_int>
__global__ void assign_point_to_voxel(const int nthreads, const T* points,
                                      T_int* point_to_voxelidx,
                                      T_int* coor_to_voxelidx, T* voxels,
                                      const int max_points,
                                      const int num_features,
                                      const int num_points, const int NDim) {
  // Copies point features into their voxel slot, one feature value per loop
  // index. thread_idx addresses element (point, feature) of `points`; the
  // value is written to voxels[voxel][slot][feature], where `slot` comes from
  // point_to_voxelidx and `voxel` from coor_to_voxelidx. Points whose slot or
  // voxel is negative are skipped. num_points and NDim are not read here.
  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
    const int point = thread_idx / num_features;

    const int slot = point_to_voxelidx[point];
    const int voxel = coor_to_voxelidx[point];
    if (slot < 0 || voxel < 0) continue;

    const int feature = thread_idx % num_features;
    T* voxel_slot =
        voxels + voxel * max_points * num_features + slot * num_features;
    voxel_slot[feature] = points[thread_idx];
  }
}

template <typename T_int>
__global__ void assign_voxel_coors(const int nthreads, T_int* coor,
                                   T_int* point_to_voxelidx,
                                   T_int* coor_to_voxelidx, T_int* voxel_coors,
                                   const int num_points, const int NDim) {
  // Writes the coordinates of every voxel, one coordinate value per loop
  // index. thread_idx addresses element (point, dim) of `coor`; only the point
  // stored in slot 0 of a voxel (point_to_voxelidx == 0) with a non-negative
  // voxel index copies its coordinate into voxel_coors[voxel][dim].
  // num_points is not read here.
  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
    const int point = thread_idx / NDim;

    const int slot = point_to_voxelidx[point];
    const int voxel = coor_to_voxelidx[point];
    if (slot != 0 || voxel < 0) continue;

    const int dim = thread_idx % NDim;
    T_int* voxel_coor = voxel_coors + voxel * NDim;
    voxel_coor[dim] = coor[thread_idx];
  }
}

template <typename T_int>
__global__ void point_to_voxelidx_kernel(const T_int* coor,
                                         T_int* point_to_voxelidx,
                                         T_int* point_to_pointidx,
                                         const int max_points,
                                         const int max_voxels,
                                         const int num_points, const int NDim) {
  // For every valid point, counts how many earlier points share its voxel
  // coordinates.
  //   point_to_pointidx[index]: index of the first point with the same
  //       coordinates (the point itself when it is the first one)
  //   point_to_voxelidx[index]: number of earlier points with the same
  //       coordinates, i.e. the slot of this point inside its voxel; written
  //       only when that number is below max_points
  // Points whose first coordinate is -1 are invalid and skipped entirely.
  // max_voxels is not read here.
  CUDA_1D_KERNEL_LOOP(index, num_points) {
    const T_int* current = coor + index * NDim;
    // skip invalid points
    if (current[0] == -1) continue;

    const int coor_0 = current[0];
    const int coor_1 = current[1];
    const int coor_2 = current[2];

    // Only the points stored before this one are inspected.
    int num = 0;
    for (int i = 0; i < index; ++i) {
      const T_int* previous = coor + i * NDim;
      if (previous[0] == -1) continue;

      const bool same_voxel = (previous[0] == coor_0) &&
                              (previous[1] == coor_1) &&
                              (previous[2] == coor_2);
      if (!same_voxel) continue;

      ++num;
      if (num == 1) {
        // remember the first point that fell into this voxel
        point_to_pointidx[index] = i;
      } else if (num >= max_points) {
        // the voxel is already full
        break;
      }
    }

    if (num == 0) {
      point_to_pointidx[index] = index;
    }
    if (num < max_points) {
      point_to_voxelidx[index] = num;
    }
  }
}

template <typename T_int>
__global__ void determin_voxel_num(
    // const T_int* coor,
    T_int* num_points_per_voxel, T_int* point_to_voxelidx,
    T_int* point_to_pointidx, T_int* coor_to_voxelidx, T_int* voxel_num,
    const int max_points, const int max_voxels, const int num_points) {
  // Assigns voxel indices to points by walking over all points in order.
  // There is no thread indexing here: every launched thread runs the whole
  // loop (presumably the kernel is launched with a single thread - TODO
  // confirm).
  //   - slot -1: point is invalid or its voxel is full, skipped
  //   - slot 0:  point opens a new voxel, unless max_voxels is already reached
  //   - other:   point joins the voxel of the first point with the same
  //              coordinates, if that point received a voxel
  // max_points is not read here.
  for (int i = 0; i < num_points; ++i) {
    const int point_pos_in_voxel = point_to_voxelidx[i];
    if (point_pos_in_voxel == -1) {
      // out of max_points or invalid point
      continue;
    }

    if (point_pos_in_voxel == 0) {
      // record new voxel
      const int voxelidx = voxel_num[0];
      if (voxel_num[0] >= max_voxels) continue;
      voxel_num[0] += 1;
      coor_to_voxelidx[i] = voxelidx;
      num_points_per_voxel[voxelidx] = 1;
      continue;
    }

    // Follow the link to the first point of the voxel.
    const int point_idx = point_to_pointidx[i];
    const int voxelidx = coor_to_voxelidx[point_idx];
    if (voxelidx == -1) continue;
    coor_to_voxelidx[i] = voxelidx;
    num_points_per_voxel[voxelidx] += 1;
  }
}

__global__ void nondeterministic_get_assign_pos(
    const int nthreads, const int32_t* coors_map, int32_t* pts_id,
    int32_t* coors_count, int32_t* reduce_count, int32_t* coors_order) {
  // Gives every point a slot inside its voxel and every voxel an output
  // position, both handed out with atomic counters in arrival order.
  //   pts_id[point]      = previous value of reduce_count[coors_map[point]]
  //   coors_order[voxel] = previous value of *coors_count, set by the point
  //                        that received slot 0 of the voxel
  // Points whose coors_map entry is negative are skipped.
  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
    const int coors_idx = coors_map[thread_idx];
    if (coors_idx < 0) continue;

    const int32_t coors_pts_pos = atomicAdd(&reduce_count[coors_idx], 1);
    pts_id[thread_idx] = coors_pts_pos;
    if (coors_pts_pos != 0) continue;

    // The first point of a voxel claims the next free voxel position.
    coors_order[coors_idx] = atomicAdd(coors_count, 1);
  }
}

template <typename T>
__global__ void nondeterministic_assign_point_voxel(
    const int nthreads, const T* points, const int32_t* coors_map,
    const int32_t* pts_id, const int32_t* coors_in, const int32_t* reduce_count,
    const int32_t* coors_order, T* voxels, int32_t* coors, int32_t* pts_count,
    const int max_voxels, const int max_points, const int num_features,
    const int NDim) {
  // Copies every accepted point into its voxel, one point per loop index.
  // A point is accepted when its coors_map entry is non-negative, its slot
  // (pts_id) is below max_points and the position of its voxel (coors_order)
  // is below max_voxels. The point holding slot 0 additionally writes the
  // point count of the voxel (capped at max_points) and its coordinates.
  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
    const int coors_idx = coors_map[thread_idx];
    const int coors_pts_pos = pts_id[thread_idx];
    if (coors_idx < 0 || coors_pts_pos >= max_points) continue;

    const int coors_pos = coors_order[coors_idx];
    if (coors_pos >= max_voxels) continue;

    // Copy all features of the point into voxels[coors_pos][coors_pts_pos].
    T* voxel_slot =
        voxels + (coors_pos * max_points + coors_pts_pos) * num_features;
    const T* point = points + thread_idx * num_features;
    for (int k = 0; k < num_features; k++) {
      voxel_slot[k] = point[k];
    }

    if (coors_pts_pos != 0) continue;

    // Slot 0 publishes the per-voxel metadata.
    pts_count[coors_pos] = min(reduce_count[coors_idx], max_points);
    int32_t* voxel_coor = coors + coors_pos * NDim;
    const int32_t* source_coor = coors_in + coors_idx * NDim;
    for (int k = 0; k < NDim; k++) {
      voxel_coor[k] = source_coor[k];
    }
  }
}

#endif  // VOXELIZATION_CUDA_KERNEL_CUH


================================================
FILE: mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp
================================================
/*************************************************************************
 * Copyright (C) 2021 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#ifndef COMMON_MLU_HELPER_HPP_
#define COMMON_MLU_HELPER_HPP_

// Alignment unit, in bytes, used by the helpers in this header.
#define NFU_ALIGN_SIZE 128          // Byte
#define REM_FOR_STACK (128 * 1024)  // 128KB reserved for cncc

// Usable NRAM / SRAM sizes in bytes. When __BANG_ARCH__ is defined they are
// derived from the __MLU_NRAM_SIZE__ / __MLU_SRAM_SIZE__ values (multiplied by
// 1024) minus the reserved stack space; otherwise fixed fallback values are
// used.
#ifdef __BANG_ARCH__
#define MAX_NRAM_SIZE \
  (__MLU_NRAM_SIZE__ * 1024 - REM_FOR_STACK)  // 128KB reserved for cncc
#define MAX_SRAM_SIZE \
  (__MLU_SRAM_SIZE__ * 1024 - REM_FOR_STACK)  // 128KB reserved for cncc
#else
#define MAX_NRAM_SIZE (384 * 1024)   // 384KB,  initialization value
#define MAX_SRAM_SIZE (1920 * 1024)  // 1920KB, initialization value
#endif

// PAD_UP(x, y): x rounded up to the next multiple of y (integer arithmetic;
// x unchanged when it is already a multiple). Defined only if not provided
// elsewhere.
#ifndef PAD_UP
#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y))
#endif

// PAD_DOWN(x, y): x rounded down to a multiple of y (integer arithmetic).
// Defined only if not provided elsewhere.
#ifndef PAD_DOWN
#define PAD_DOWN(x, y) (((x) / (y)) * (y))
#endif

// CEIL_ALIGN(x, y): x rounded up to a multiple of y, computed with the
// add-then-truncate formula ((x + y - 1) / y) * y.
#define CEIL_ALIGN(x, y) (((x) + (y) - 1) / (y) * (y))

template <typename scalar_t>
__mlu_func__ inline scalar_t min(scalar_t a, scalar_t b) {
  // Returns `a` only when it compares strictly less than `b`; otherwise `b`.
  if (a < b) {
    return a;
  }
  return b;
}

template <typename scalar_t>
__mlu_func__ inline scalar_t max(scalar_t a, scalar_t b) {
  // Returns `a` only when it compares strictly greater than `b`; otherwise
  // `b`.
  if (a > b) {
    return a;
  }
  return b;
}

/*!
 * @brief Converts int32 to float32 data type.
 *
 * @param[out] dst
 *   Pointer to NRAM that stores float32 type data.
 * @param[in,out] dst_addition
 *   Pointer to NRAM as the workspace of dst, which has the same size as dst.
 *   It allows empty pointer on MLU300 series.
 * @param[in] src
 *   Pointer to NRAM that stores int32 type data.
 * @param[in,out] src_addition
 *   Pointer to NRAM as the workspace of src, which has a size of 128 Bytes.
 *   It allows empty pointer on MLU300 series.
 * @param[in] src_count
 *   The count of elements in src.
 */
__mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src,
                                   float *src_addition, const int src_count) {
#if __BANG_ARCH__ >= 300
  __bang_int2float((float *)dst, (int32_t *)src, src_count, 0);
#else
  // get sign bit
  const float move_23bit = 8388608.0;
  // 0x80000000 = 1,000000000,0000000000000000000000000000
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x80000000);
  __bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition,
                    src_count * sizeof(float), NFU_ALIGN_SIZE);
  // get 1 or 0 from sign bit
  // judg is Odd
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x00000001);
  __bang_cycle_bor((char *)dst_addition, (char *)dst_addition,
                   (char *)src_addition, src_count * sizeof(float),
                   NFU_ALIGN_SIZE);
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x80000001);
  __bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count,
                  NFU_ALIGN_SIZE / sizeof(float));
  // minus xor, positive num invariant
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0xffffffff);
  __bang_cycle_mul(dst, dst_addition, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));
  __bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * sizeof(float));
  // convert int32 to float32
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x7fffff);
  __bang_cycle_band((char *)dst, (char *)dst, (char *)src_addition,
                    src_count * sizeof(float), NFU_ALIGN_SIZE);
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x4b000000);
  __bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition,
                   src_count * sizeof(float), NFU_ALIGN_SIZE);
  __bang_sub_scalar(dst, dst, move_23bit, src_count);
  // add one
  __bang_add(dst, dst, dst_addition, src_count);
  // set sign for float32
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0xffffffff);
  __bang_cycle_mul(dst_addition, dst_addition, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));

  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x00000001);
  __bang_cycle_add(dst_addition, dst_addition, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));

  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x80000000);
  __bang_cycle_band((char *)dst_addition, (char *)dst_addition,
                    (char *)src_addition, src_count * 4, 128);
  __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * 4);
#endif  // __BANG_ARCH__ >= 300
}

/*!
 * @brief Converts float32 to int32 data type with to_zero round mode.
 *
 * On __BANG_ARCH__ >= 300 the conversion is a single __bang_float2int_tz call
 * and neither workspace is touched. On older architectures the conversion is
 * emulated with vector operations that use dst itself and both workspaces as
 * scratch memory.
 *
 * @param[out] dst
 *   Pointer to NRAM that receives the converted int32 data.
 * @param[in,out] dst_addition
 *   Pointer to NRAM as the workspace of dst, which has the same size as dst.
 *   It allows empty pointer on MLU300 series.
 * @param[in] src
 *   Pointer to NRAM that stores the float32 input data.
 * @param[in,out] src_addition
 *   Pointer to NRAM as the workspace of src, which has a size of 128 Bytes.
 *   It allows empty pointer on MLU300 series.
 * @param[in] src_count
 *   The count of elements in src.
 */
__mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
                                   float *src_addition, const int src_count) {
#if __BANG_ARCH__ >= 300
  __bang_float2int_tz((int32_t *)dst, (float *)src, src_count, 0);
#else
  // sign ===> dst
  // dst=-1.0 : when src[i] is a negative number
  // dst=+1.0 : when src[i] is a positive number
  // Number of bytes per float, used to turn element counts into byte counts.
  const int floatDchar = sizeof(float) / sizeof(char);
  __bang_active_sign((float *)dst, src, src_count);
  // dst_addition = abs(src)
  __bang_mul(dst_addition, src, (float *)dst, src_count);
  // if dst_addition < 1.0 , then src_addition + 1, to fix add error.
  __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     1.0f);
  __bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count,
                  NFU_ALIGN_SIZE / sizeof(float));
  __bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count);
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0xbf800000);
  // set negative flag -1.0 = 0xbf800000
  __bang_cycle_eq(
      (float *)dst, (float *)dst, (float *)src_addition, src_count,
      NFU_ALIGN_SIZE / sizeof(float));  //  to mark all src in [x<-1.0]
  __bang_active_abs(dst_addition, src, src_count);
  __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     8388608.0f);
  // mask shift move 23
  __bang_cycle_add_tz(
      dst_addition, dst_addition, src_addition, src_count,
      NFU_ALIGN_SIZE / sizeof(float));  // right shift move 23bit
  // two's complement for negative values
  // dst=1.0 , when src <-1.0
  // dst=0.0 , when src >=-1.0
  __bang_sub(dst_addition, dst_addition, (float *)dst, src_count);
  // to fix max value
  // 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0x4b7fffff <=> 16777215.0,
  // means max value.
  __bang_mul_scalar((float *)dst, (float *)dst, 16777215.0, src_count);
  __bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst,
              src_count * floatDchar);
  // get low 23bit
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     (unsigned)0x007fffff);
  // mask low 23bit is 1
  __bang_cycle_band((char *)dst_addition, (char *)dst_addition,
                    (char *)src_addition, src_count * floatDchar,
                    NFU_ALIGN_SIZE / sizeof(char));
  // set 9 high bit ===> dst
  // -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000
  //  1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000
  // NOTE(review): unlike the other __bang_write_value calls in this function,
  // this one passes the float pointer without an (unsigned *) cast, so the
  // integer literal may be stored as a converted float value rather than as
  // the raw bit pattern of 1.0 - confirm this is intended.
  __bang_write_value(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000);
  __bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));
  // src or dst_addition
  __bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition,
             src_count * floatDchar);
  __bang_mul_scalar((float *)dst, (float *)dst, -2.0, src_count);
  __bang_bor((char *)dst, (char *)dst, (char *)dst_addition,
             src_count * floatDchar);
#endif  // __BANG_ARCH__ >= 300
}

/*!
 * @brief Narrows a float32 vector to half precision.
 *
 * The rounding mode depends on the target architecture: builds with
 * __BANG_ARCH__ below 300 (MLU200) use the "rd" conversion, all other builds
 * (MLU300) use the "rn" conversion.
 *
 * @param[out] dst
 *   Pointer to NRAM that receives the half type data.
 * @param[in] src
 *   Pointer to NRAM that stores the float32 type data.
 * @param[in] src_count
 *   The count of elements in src.
 */
__mlu_func__ inline void convertFloat2half(half *dst, float *src,
                                           int src_count) {
#if __BANG_ARCH__ < 300
  __bang_float2half_rd(dst, src, src_count);
#else
  __bang_float2half_rn(dst, src, src_count);
#endif  // __BANG_ARCH__ < 300
}

/*!
 * @brief Repeatedly sum-pools dst in place along its high dimension until at
 * most one slice of low_dim elements is left.
 *
 * Each pass splits high_dim into a leading remainder (high_dim % kernel_limit
 * slices) and full groups of kernel_limit slices, issues one __bang_sumpool
 * call for each non-empty part, and continues with one slice per part.
 *
 * @param[in,out] dst
 *     Pointer to NRAM that stores the input and output data.
 * @param[in] low_dim
 *     Which is the number of low dim.
 * @param[in] high_dim
 *     Which is the number of high dim.
 * @param[in] kernel_limit
 *     Which is the high_dim of sumpool per time. With kernel_limit == 1 a pass
 *     does not shrink high_dim, so callers must pass a value greater than 1.
 */
template <typename T>
__mlu_func__ void recursiveSumPool(T *dst, int low_dim, int high_dim,
                                   int kernel_limit) {
  while (high_dim > 1) {
    const int full_groups = high_dim / kernel_limit;
    const int leftover = high_dim % kernel_limit;
    const bool has_leftover = leftover != 0;

    // The leading `leftover` slices are pooled first, into slot 0.
    if (has_leftover) {
      __bang_sumpool(dst, dst, low_dim, 1, leftover, 1, leftover, 1, 1);
    }

    // The full groups follow; their results land right after the leftover
    // result (or at slot 0 when there was no leftover).
    if (full_groups != 0) {
      T *pooled_out = has_leftover ? dst + low_dim : dst;
      T *pooled_in = dst + leftover * low_dim;
      __bang_sumpool(pooled_out, pooled_in, low_dim,
                     kernel_limit * full_groups, 1, kernel_limit, 1, 1,
                     kernel_limit);
    }

    high_dim = full_groups + (has_leftover ? 1 : 0);
  }
}

#endif  // COMMON_MLU_HELPER_HPP_


================================================
FILE: mmcv/ops/csrc/common/mlu/masked_conv2d_mlu_kernel.mlu
================================================
/*************************************************************************
 * Copyright (C) 2022 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "common_mlu_helper.hpp"

// Scratch buffer declared with the __nram__ qualifier, MAX_NRAM_SIZE bytes
// long. The kernels in this file reinterpret it as an array of the element
// type when staging data.
__nram__ char nram_buffer[MAX_NRAM_SIZE];

// Masked im2col: for every masked position assigned to this task, copies the
// in-bounds part of its kernel_h x kernel_w window from `feature` into the
// matching row of `data_col`.
//
// feature    : source image, indexed as (h * width + w) * channels.
// mask_h_idx : row coordinate of each masked position (mask_cnt entries).
// mask_w_idx : column coordinate of each masked position (mask_cnt entries).
// data_col   : destination; position `index` owns the range starting at
//              index * kernel_h * kernel_w * channels.
//
// Positions are distributed with a stride of taskDim starting at taskId.
// Window parts that fall outside the image are not written, and a window that
// does not overlap the image at all is skipped entirely.
template <typename T>
__mlu_func__ void MLUUnion1MaskedIm2colForward(
    const T *feature, const int height, const int width, const int channels,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int32_t *mask_h_idx, const int32_t *mask_w_idx, const int mask_cnt,
    T *data_col) {
  for (int index = taskId; index < mask_cnt; index += taskDim) {
    // Unclipped window corners in image coordinates (inclusive).
    const int win_top = mask_h_idx[index] - pad_h;
    const int win_left = mask_w_idx[index] - pad_w;
    const int win_bottom = win_top + kernel_h - 1;
    const int win_right = win_left + kernel_w - 1;

    const bool overlaps_image = win_top < height && win_left < width &&
                                win_bottom >= 0 && win_right >= 0;
    if (!overlaps_image) {
      continue;
    }

    // Window clipped to the image bounds.
    const int top = max(0, win_top);
    const int bottom = min(height - 1, win_bottom);
    const int left = max(0, win_left);
    const int right = min(width - 1, win_right);

    // Destination starts at the first valid cell inside this position's
    // kernel_h x kernel_w x channels slot.
    T *dst = data_col + index * kernel_h * kernel_w * channels +
             ((top - win_top) * kernel_w + (left - win_left)) * channels;
    const T *src = feature + top * width * channels + left * channels;

    // One strided copy: (bottom - top + 1) rows of (right - left + 1) pixels.
    __memcpy(dst, src, (right - left + 1) * channels * sizeof(T), GDRAM2GDRAM,
             kernel_w * channels * sizeof(T), width * channels * sizeof(T),
             bottom - top);
  }
}

// Stages `count` consecutive rows of `col`, starting at row `first_row`, in
// `nram_col` and then scatters each row to im[(h, w)] given by the mask index
// arrays. No bounds check is applied to (h, w).
template <typename T>
__mlu_func__ void maskedCol2imScatterRows(const T *col, T *nram_col,
                                          const int first_row, const int count,
                                          const int width, const int channels,
                                          const int32_t *mask_h_idx,
                                          const int32_t *mask_w_idx, T *im) {
  __memcpy(nram_col, col + first_row * channels, count * channels * sizeof(T),
           GDRAM2NRAM);
  for (int i = 0; i < count; ++i) {
    const int row = first_row + i;
    const int h_im = mask_h_idx[row];
    const int w_im = mask_w_idx[row];
    __memcpy(im + (h_im * width + w_im) * channels, nram_col + i * channels,
             channels * sizeof(T), NRAM2GDRAM);
  }
}

// Masked col2im: writes row i of `col` (channels elements) to the pixel
// (mask_h_idx[i], mask_w_idx[i]) of `im`, for all mask_cnt rows.
//
// When one row does not fit in nram_buffer, rows are copied directly
// GDRAM->GDRAM and distributed over tasks with a stride of taskDim. Otherwise
// each task takes a contiguous range of rows (the first mask_cnt % taskDim
// tasks take one extra row) and moves them through nram_buffer in batches.
//
// `height` is unused, and the mask coordinates are not range-checked.
template <typename T>
__mlu_func__ void MLUUnion1MaskedCol2imForward(const T *col, const int height,
                                               const int width,
                                               const int channels,
                                               const int32_t *mask_h_idx,
                                               const int32_t *mask_w_idx,
                                               const int mask_cnt, T *im) {
  const int nram_capacity = MAX_NRAM_SIZE / sizeof(T);

  if (channels > nram_capacity) {
    // A single row is larger than the staging buffer: bypass NRAM.
    for (int index = taskId; index < mask_cnt; index += taskDim) {
      const int h_im = mask_h_idx[index];
      const int w_im = mask_w_idx[index];
      __memcpy(im + (h_im * width + w_im) * channels, col + index * channels,
               channels * sizeof(T), GDRAM2GDRAM);
    }
    return;
  }

  // Rows that fit in the staging buffer at once.
  const int rows_per_stage = nram_capacity / channels;

  // Contiguous share of rows owned by this task.
  const int extra_rows = mask_cnt % taskDim;
  const bool takes_extra = taskId < extra_rows;
  int my_rows = mask_cnt / taskDim;
  if (takes_extra) {
    my_rows += 1;
  }
  int first_row = taskId * my_rows;
  if (!takes_extra) {
    first_row += extra_rows;
  }

  const int full_stages = my_rows / rows_per_stage;
  const int tail_rows = my_rows % rows_per_stage;
  T *nram_col = (T *)nram_buffer;

  for (int stage = 0; stage < full_stages; ++stage) {
    maskedCol2imScatterRows(col, nram_col, first_row + stage * rows_per_stage,
                            rows_per_stage, width, channels, mask_h_idx,
                            mask_w_idx, im);
  }
  if (tail_rows > 0) {
    maskedCol2imScatterRows(col, nram_col,
                            first_row + full_stages * rows_per_stage,
                            tail_rows, width, channels, mask_h_idx, mask_w_idx,
                            im);
  }
}

// Device entry point for masked im2col. Dispatches on `data_dtype` to the
// half or float instantiation of MLUUnion1MaskedIm2colForward; any other
// dtype is a no-op. Cores whose coreId equals 0x80 return immediately
// (presumably the memory core - confirm against the BANG C documentation).
__mlu_global__ void MLUKernelMaskedIm2colForward(
    const void *feature, const int height, const int width, const int channels,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const void *mask_h_idx, const void *mask_w_idx, const int mask_cnt,
    void *data_col, const cnrtDataType_t data_dtype) {
  if (coreId == 0x80) {
    return;
  }

  const int32_t *h_idx = (const int32_t *)mask_h_idx;
  const int32_t *w_idx = (const int32_t *)mask_w_idx;

  if (data_dtype == CNRT_FLOAT16) {
    MLUUnion1MaskedIm2colForward((const half *)feature, height, width,
                                 channels, kernel_h, kernel_w, pad_h, pad_w,
                                 h_idx, w_idx, mask_cnt, (half *)data_col);
  } else if (data_dtype == CNRT_FLOAT32) {
    MLUUnion1MaskedIm2colForward((const float *)feature, height, width,
                                 channels, kernel_h, kernel_w, pad_h, pad_w,
                                 h_idx, w_idx, mask_cnt, (float *)data_col);
  }
}

// Device entry point for masked col2im. Dispatches on `data_dtype` to the
// half or float instantiation of MLUUnion1MaskedCol2imForward; any other
// dtype is a no-op. Cores whose coreId equals 0x80 return immediately
// (presumably the memory core - confirm against the BANG C documentation).
__mlu_global__ void MLUKernelMaskedCol2imForward(
    const void *col, const int height, const int width, const int channels,
    const void *mask_h_idx, const void *mask_w_idx, const int mask_cnt,
    void *im, const cnrtDataType_t data_dtype) {
  if (coreId == 0x80) {
    return;
  }

  const int32_t *h_idx = (const int32_t *)mask_h_idx;
  const int32_t *w_idx = (const int32_t *)mask_w_idx;

  if (data_dtype == CNRT_FLOAT16) {
    MLUUnion1MaskedCol2imForward((const half *)col, height, width, channels,
                                 h_idx, w_idx, mask_cnt, (half *)im);
  } else if (data_dtype == CNRT_FLOAT32) {
    MLUUnion1MaskedCol2imForward((const float *)col, height, width, channels,
                                 h_idx, w_idx, mask_cnt, (float *)im);
  }
}

// Host-side launcher: enqueues MLUKernelMaskedIm2colForward on `queue` with
// launch dimensions `k_dim` and function type `k_type`.
//
// k_dtype selects the element type handled by the kernel (CNRT_FLOAT16 or
// CNRT_FLOAT32; the kernel does nothing for other values). All pointer
// arguments are forwarded unchanged; note that the kernel takes the dtype as
// its last parameter, while this wrapper takes it fourth.
void KernelMaskedIm2colForward(
    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
    cnrtDataType_t k_dtype, const void *im_ptr, const int height,
    const int width, const int channels, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const void *mask_h_idx_ptr,
    const void *mask_w_idx_ptr, const int mask_cnt, void *col_ptr) {
  MLUKernelMaskedIm2colForward<<<k_dim, k_type, queue>>>(
      im_ptr, height, width, channels, kernel_h, kernel_w, pad_h, pad_w,
      mask_h_idx_ptr, mask_w_idx_ptr, mask_cnt, col_ptr, k_dtype);
}

// Host-side launcher: enqueues MLUKernelMaskedCol2imForward on `queue` with
// launch dimensions `k_dim` and function type `k_type`.
//
// k_dtype selects the element type handled by the kernel (CNRT_FLOAT16 or
// CNRT_FLOAT32; the kernel does nothing for other values). All pointer
// arguments are forwarded unchanged; note that the kernel takes the dtype as
// its last parameter, while this wrapper takes it fourth.
void KernelMaskedCol2imForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                               cnrtQueue_t queue, cnrtDataType_t k_dtype,
                               const void *col_ptr, const int height,
                               const int width, const int channels,
                               const void *mask_h_idx_ptr,
                               const void *mask_w_idx_ptr, const int mask_cnt,
                               void *im_ptr) {
  MLUKernelMaskedCol2imForward<<<k_dim, k_type, queue>>>(
      col_ptr, height, width, channels, mask_h_idx_ptr, mask_w_idx_ptr,
      mask_cnt, im_ptr, k_dtype);
}


================================================
FILE: mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu
================================================
/*************************************************************************
 * Copyright (C) 2022 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "common_mlu_helper.hpp"

// Element-count granularity the backward pass pads `channels` and its NRAM
// data size to.
#define ALIGN_SIZE 64
// Number of NRAM regions shared by both pipeline stages in the backward pass
// (grads and argmax).
#define PIPELINE_COMMON_NUM 2
// Number of NRAM regions accounted per ping/pong stage in the backward pass.
#define PIPELINE_PINGPONG_NUM 10

// Scratch buffer declared with the __nram__ qualifier, MAX_NRAM_SIZE bytes
// long; the kernels in this file partition it manually.
__nram__ char nram_buffer[MAX_NRAM_SIZE];

namespace forward {
// Computes the feature-map rectangle covered by one pooling bin.
//
// bin_i is a flat index over (roi, ph, pw) with pw varying fastest. The RoI
// record {batch, x1, y1, x2, y2} is read from rois_v, scaled by spatial_scale
// and rounded; degenerate RoIs are forced to at least 1x1. The bin bounds are
// clipped to [0, width] / [0, height].
//
// Outputs:
//   bin_x1, bin_y1 : inclusive start of the bin.
//   bin_x2, bin_y2 : exclusive end of the bin.
//   bin_wdim/hdim  : bin width / height; bin_dims is their product.
//   input_base     : start of the RoI's batch image inside input_v.
//   is_empty       : true when the clipped bin covers no pixel.
template <typename T>
__mlu_func__ void getRoiBinInfo(T *input_v, T *rois_v, int bin_i, int height,
                                int width, int channels, int p_height,
                                int p_width, T spatial_scale, int *bin_x1,
                                int *bin_y1, int *bin_x2, int *bin_y2,
                                int *bin_wdim, int *bin_hdim, int *bin_dims,
                                T **input_base, bool *is_empty) {
  // Split the flat bin index into (roi, row, column).
  const int bin_col = bin_i % p_width;
  const int bin_row = (bin_i / p_width) % p_height;
  const int roi_idx = bin_i / p_width / p_height;

  // RoI record layout: {batch, x1, y1, x2, y2}.
  const T *roi = rois_v + roi_idx * 5;
  const int batch_index = (int)roi[0];
  const int roi_x1 = round(roi[1] * spatial_scale);
  const int roi_y1 = round(roi[2] * spatial_scale);
  const int roi_x2 = round(roi[3] * spatial_scale);
  const int roi_y2 = round(roi[4] * spatial_scale);

  // Malformed RoIs are treated as 1x1.
  int roi_w = roi_x2 - roi_x1 + 1;
  if (roi_w < 1) {
    roi_w = 1;
  }
  int roi_h = roi_y2 - roi_y1 + 1;
  if (roi_h < 1) {
    roi_h = 1;
  }

  // Fractional size of one bin.
  T bin_w = (T)roi_w / (T)p_width;
  T bin_h = (T)roi_h / (T)p_height;

  // Unclipped bin bounds: floor for the start, ceil for the end.
  int x1 = (int)floor((T)bin_col * bin_w) + roi_x1;
  int y1 = (int)floor((T)bin_row * bin_h) + roi_y1;
  int x2 = (int)ceil((T)(bin_col + 1) * bin_w) + roi_x1;
  int y2 = (int)ceil((T)(bin_row + 1) * bin_h) + roi_y1;

  // Clip every bound into the feature map.
  if (x1 < 0) {
    x1 = 0;
  }
  if (x1 > width) {
    x1 = width;
  }
  if (y1 < 0) {
    y1 = 0;
  }
  if (y1 > height) {
    y1 = height;
  }
  if (x2 < 0) {
    x2 = 0;
  }
  if (x2 > width) {
    x2 = width;
  }
  if (y2 < 0) {
    y2 = 0;
  }
  if (y2 > height) {
    y2 = height;
  }

  const int w_extent = x2 - x1;
  const int h_extent = y2 - y1;

  *bin_x1 = x1;
  *bin_y1 = y1;
  *bin_x2 = x2;
  *bin_y2 = y2;
  *input_base = input_v + batch_index * height * width * channels;
  *bin_wdim = w_extent;
  *bin_hdim = h_extent;
  *bin_dims = h_extent * w_extent;
  *is_empty = (y2 <= y1) || (x2 <= x1);
}

// Forward RoI max pooling for the bins assigned to this task.
//
// The rois_num * p_height * p_width output bins are split into contiguous
// ranges over taskDim tasks (the first `rem_bins` tasks take one extra bin).
// For every bin the channels are processed in slices that fit into one NRAM
// stage; while one slice is being pooled from `ping`, the next slice (or the
// first slice of the next bin) is loaded asynchronously into `pong`.
//
// input_v       : input feature map, indexed as (h * width + w) * channels
//                 inside each batch image.
// rois_v        : RoI records {batch, x1, y1, x2, y2}.
// batch         : unused.
// spatial_scale : factor applied to the RoI coordinates.
// output_v      : pooled output, `channels` values per bin.
// argmax        : optional (may be NULL) flat h * width + w index of the
//                 maximum per output value; -1 is written for empty bins.
//
// Empty bins (see getRoiBinInfo) produce zeros. Their bin_dims is 0, so the
// slice size must not be derived from a division by bin_dims for them.
//
// NOTE(review): for a non-empty bin whose bin_dims * float_div exceeds
// nram_limit, c_slice becomes 0 and the channel loop would not make progress.
// Confirm that callers bound the bin size.
template <typename T>
__mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,
                                   int channels, int height, int width,
                                   int p_height, int p_width, int rois_num,
                                   T spatial_scale, T *output_v, int *argmax) {
  /*
   * NRAM partition
   *  |---------------------------------------------------|
   *  |                        ping                       |
   *  |---------------------------------------------------|
   *  |                        pong                       |
   *  |---------------------------------------------------|
   *  |                        out                        |
   *  |---------------------------------------------------|
   *  |                        argmax                     |
   *  |---------------------------------------------------|
   *  |                        a                          |
   *  |---------------------------------------------------|
   *  |                        b                          |
   *  |---------------------------------------------------|
   */
  uint32_t is_half = sizeof(T) == sizeof(half) ? true : false;
  uint32_t t_size = sizeof(T);
  uint32_t float_div = NFU_ALIGN_SIZE / sizeof(float);
  uint32_t half_div = NFU_ALIGN_SIZE / sizeof(half);

  uint32_t channels_align = PAD_UP(channels, float_div);
  // Elements per ping/pong stage: what is left after reserving the four
  // channels_align-sized regions (out, argmax, a, b), split in two.
  uint32_t nram_limit = PAD_DOWN(
      (MAX_NRAM_SIZE / sizeof(float) - 4 * channels_align) / 2, half_div);

  // nram PING/PONG, output, argmax, a, b
  float *nram_ping = (float *)nram_buffer;
  float *nram_pong = (float *)nram_buffer + nram_limit;
  float *nram_out = (float *)nram_buffer + 2 * nram_limit;
  float *nram_argmax = nram_out + channels_align;
  float *nram_a = nram_out + 2 * channels_align;
  float *nram_b = nram_out + 3 * channels_align;

  // Contiguous range of bins [bin_first, bins_loop) owned by this task.
  uint32_t c_bins_num = rois_num * p_height * p_width;
  uint32_t task_bins = c_bins_num / taskDim;
  uint32_t rem_bins = c_bins_num % taskDim;
  if (taskId < rem_bins) {
    task_bins += 1;
  }
  int bin_first =
      (c_bins_num / taskDim) * taskId + (taskId > rem_bins ? rem_bins : taskId);
  int bins_loop = bin_first + task_bins;

  T *input_base = NULL;
  T *output_base = output_v + bin_first * channels;
  int *argmax_base = NULL != argmax ? argmax + bin_first * channels : NULL;
  int bin_x1, bin_y1, bin_x2, bin_y2, bin_wdim, bin_hdim, bin_dims;
  int pbin_x1, pbin_y1, pbin_x2, pbin_y2, pbin_wdim, pbin_hdim, pbin_dims;
  bool is_empty = false;
  bool pong_is_empty = false;
  bool is_first_bin = true;
  uint32_t src_offset = 0;
  uint32_t dst_offset = 0;
  uint32_t nram_offset = 0;
  // Half inputs are loaded into the upper part of a stage so that the
  // half->float expansion can be written from the start of the same stage.
  uint32_t half_offset =
      is_half ? (nram_limit / 2 / half_div * half_div) * 2 : 0;
  float *nram_tmp = NULL;

  uint32_t c_slice = 0;
  uint32_t c_slice_align = 0;
  uint32_t pongc_slice = 0;
  uint32_t pongc_slice_align = 0;
  for (int bin_i = bin_first; bin_i < bins_loop; bin_i++) {
    getRoiBinInfo((T *)input_v, (T *)rois_v, bin_i, height, width, channels,
                  p_height, p_width, (T)spatial_scale, &bin_x1, &bin_y1,
                  &bin_x2, &bin_y2, &bin_wdim, &bin_hdim, &bin_dims,
                  &input_base, &is_empty);
    uint32_t c_rem = channels;
    // An empty bin has bin_dims == 0, so dividing by it would be undefined.
    // Nothing is loaded for an empty bin and only the channels_align-sized
    // `out` region is used, so all channels can be handled in one slice.
    c_slice = is_empty ? (uint32_t)channels
                       : nram_limit / bin_dims / float_div * float_div;

    // Only the very first bin of the task is loaded synchronously; every
    // later bin has been prefetched while the previous one was processed.
    if (is_first_bin && !is_empty) {
      c_slice = c_slice > c_rem ? c_rem : c_slice;
      c_slice_align = PAD_UP(c_slice, float_div);
      for (int h = bin_y1; h < bin_y2; h++) {
        src_offset = (h * width + bin_x1) * channels;
        nram_offset = (h - bin_y1) * bin_wdim * c_slice_align + half_offset;
        if (c_slice_align == channels) {
          __memcpy((T *)nram_ping + nram_offset, (T *)input_base + src_offset,
                   bin_wdim * c_slice * t_size, GDRAM2NRAM);
        } else {
          __memcpy((T *)nram_ping + nram_offset, (T *)input_base + src_offset,
                   c_slice * t_size, GDRAM2NRAM, c_slice_align * t_size,
                   channels * t_size, bin_wdim - 1);
        }
      }
    }
    uint32_t c_offset = 0;
    while (c_rem > 0) {
      c_slice = c_slice > c_rem ? c_rem : c_slice;
      c_slice_align = PAD_UP(c_slice, float_div);

      /*__memcpy_async*/
      if (c_rem - c_slice > 0 && !is_empty) {
        // Prefetch the next channel slice of the current bin.
        pongc_slice = c_rem - c_slice > c_slice ? c_slice : c_rem - c_slice;
        pongc_slice_align = PAD_UP(pongc_slice, float_div);
        for (int h = bin_y1; h < bin_y2; h++) {
          src_offset = (h * width + bin_x1) * channels + c_offset;
          nram_offset =
              (h - bin_y1) * bin_wdim * pongc_slice_align + half_offset;
          __memcpy_async((T *)nram_pong + nram_offset,
                         (T *)input_base + src_offset + c_slice,
                         pongc_slice * t_size, GDRAM2NRAM,
                         pongc_slice_align * t_size, channels * t_size,
                         bin_wdim - 1);
        }
      } else if (bin_i + 1 < bins_loop) {
        // Prefetch the first channel slice of the next bin.
        getRoiBinInfo((T *)input_v, (T *)rois_v, bin_i + 1, height, width,
                      channels, p_height, p_width, (T)spatial_scale, &pbin_x1,
                      &pbin_y1, &pbin_x2, &pbin_y2, &pbin_wdim, &pbin_hdim,
                      &pbin_dims, &input_base, &pong_is_empty);
        if (!pong_is_empty) {
          // The slice size is only needed (and only well defined, because
          // pbin_dims is 0 for an empty bin) when there is data to load.
          pongc_slice = PAD_DOWN(nram_limit / pbin_dims, float_div);
          pongc_slice = pongc_slice > channels ? channels : pongc_slice;
          pongc_slice_align = PAD_UP(pongc_slice, float_div);
          for (int h = pbin_y1; h < pbin_y2; h++) {
            src_offset = (h * width + pbin_x1) * channels;
            nram_offset =
                (h - pbin_y1) * pbin_wdim * pongc_slice_align + half_offset;
            if (pongc_slice_align == channels) {
              __memcpy_async((T *)nram_pong + nram_offset,
                             (T *)input_base + src_offset,
                             pbin_wdim * pongc_slice * t_size, GDRAM2NRAM);
            } else {
              __memcpy_async((T *)nram_pong + nram_offset,
                             (T *)input_base + src_offset, pongc_slice * t_size,
                             GDRAM2NRAM, pongc_slice_align * t_size,
                             channels * t_size, pbin_wdim - 1);
            }
          }
        }
      }

      if (is_empty) {
        // Empty bin: output zeros and, if requested, argmax = -1.
        __bang_write_value((T *)nram_out, c_slice_align, (T)0);
        __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out,
                 c_slice * t_size, NRAM2GDRAM);
        if (NULL != argmax) {
          __bang_write_value((int32_t *)nram_out, c_slice_align, (int32_t)(-1));
          __memcpy((int32_t *)argmax_base + dst_offset + c_offset,
                   (int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM);
        }
      } else {
        if (is_half) {
          uint32_t bin_align64 = PAD_UP(bin_dims * c_slice_align, half_div);
          __bang_half2float((float *)nram_ping, (half *)nram_ping + half_offset,
                            bin_align64);
        }
        __bang_maxpool((float *)nram_out, (float *)nram_ping, c_slice_align,
                       bin_hdim, bin_wdim, bin_hdim, bin_wdim, 1, 1);
        if (is_half) {
          uint32_t c_align64 = PAD_UP(c_slice_align, half_div);
          __bang_float2half_rd((half *)nram_out, (float *)nram_out, c_align64);
        }
        __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out,
                 c_slice * t_size, NRAM2GDRAM);
        if (NULL != argmax) {
          /*compute max_index*/
          __bang_maxpool_index((uint32_t *)nram_out, (float *)nram_ping,
                               c_slice_align, bin_hdim, bin_wdim, bin_hdim,
                               bin_wdim, 1, 1);
          convertInt2Float((float *)nram_argmax, (float *)nram_a,
                           (int32_t *)nram_out, (float *)nram_b, c_slice_align);

          /*compute input_h*/
          // Scalar integer division of the in-bin index by the bin width
          // gives the row inside the bin.
          for (int i = 0; i < c_slice; i++) {
            nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim);
          }
          __bang_add_scalar((float *)nram_a, (float *)nram_out, (float)bin_y1,
                            c_slice_align);
          __bang_mul_scalar((float *)nram_ping, (float *)nram_a, (float)width,
                            c_slice_align);

          /*compute input_w*/
          __bang_mul_scalar((float *)nram_a, (float *)nram_out, (float)bin_wdim,
                            c_slice_align);
          __bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a,
                     c_slice_align);
          __bang_add_scalar((float *)nram_a, (float *)nram_a, (float)bin_x1,
                            c_slice_align);
          // Flat index in the feature map: input_h * width + input_w.
          __bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a,
                     c_slice_align);
          convertFloat2Int((int32_t *)nram_argmax, (float *)nram_a,
                           (float *)nram_out, (float *)nram_b, c_slice_align);
          __memcpy((int32_t *)argmax_base + dst_offset + c_offset,
                   (int32_t *)nram_argmax, c_slice * sizeof(int32_t),
                   NRAM2GDRAM);
        }
      }
      // Swap the stages: the prefetched data becomes the next compute input.
      nram_tmp = nram_ping;
      nram_ping = nram_pong;
      nram_pong = nram_tmp;
      c_offset += c_slice;
      c_rem -= c_slice;
      // Wait for the outstanding asynchronous copies before the next round.
      __asm__ volatile("sync;");
    }
    dst_offset += channels;
    is_first_bin = false;
  }
}

// Device entry point for forward RoI pooling. Dispatches on `data_type` to
// the half or float instantiation of MLUUnion1Roipool; any other data type is
// a no-op. spatial_scale is converted to the element type before the call.
__mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type,
                                     const void *input_data,
                                     const void *input_rois, int batch,
                                     int channels, int height, int width,
                                     int pooled_height, int pooled_width,
                                     int rois_num, float spatial_scale,
                                     void *output_data, int *argmax) {
  if (data_type == CNRT_FLOAT16) {
    MLUUnion1Roipool((half *)input_data, (half *)input_rois, batch, channels,
                     height, width, pooled_height, pooled_width, rois_num,
                     (half)spatial_scale, (half *)output_data, argmax);
  } else if (data_type == CNRT_FLOAT32) {
    MLUUnion1Roipool((float *)input_data, (float *)input_rois, batch, channels,
                     height, width, pooled_height, pooled_width, rois_num,
                     (float)spatial_scale, (float *)output_data, argmax);
  }
}
}  // namespace forward

namespace backward {
// Convert index of argmax from global grads_image to local bin in RoI. Vector
// operations do not support int type, so conversion from int to float is
// performed here.
//
// With g = nram_argmax[i] (a flat h * width + w index), the result written to
// nram_argmax_int is
//   (g / width - hstart) * w_compute + (g - (g / width) * width - wstart)
// and, when loop_flag == 1, additionally minus loop_id * true_limit.
//
// nram_argmax            : input flat indices (int32).
// nram_argmax_fp         : float scratch for the converted indices.
// nram_argmax_fp_bk1/bk2 : workspaces passed to convertInt2Float and
//                          convertFloat2Int.
// nram_argmax_int        : output, local index inside the bin (int32).
// nram_argmax_int_h      : scratch holding the integer row g / width.
// nram_argmax_fp_h/_fp_w : float scratch for the row and column terms.
// align_c                : number of elements processed by vector operations.
// channels               : number of elements processed by the scalar division.
//
// nram_argmax_int_w, nram_atomic_add, nram_grads_image, height and h_compute
// are not referenced in this function.
__mlu_func__ void convertIndex(
    int32_t *nram_argmax, int32_t *nram_argmax_fp, int32_t *nram_argmax_fp_bk1,
    int32_t *nram_argmax_fp_bk2, int32_t *nram_argmax_int,
    int32_t *nram_argmax_int_h, int32_t *nram_argmax_int_w,
    int32_t *nram_argmax_fp_h, int32_t *nram_argmax_fp_w,
    float *nram_atomic_add, float *nram_grads_image, int width, int height,
    int wstart, int hstart, int w_compute, int h_compute, int align_c,
    int channels, int loop_flag, int loop_id, int true_limit) {
  convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,
                   (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c);

  // This step uses scalar division, because vector division causes a
  // rounding accuracy problem.
  for (int i = 0; i < channels; ++i) {
    *((float *)nram_argmax_fp + i) = *((float *)nram_argmax_fp + i) / width;
  }

  // Use 'float2int_tz' to perform '*((int32_t*)nram_argmax + i) / width'
  // operation.
  convertFloat2Int((int *)nram_argmax_int_h, (float *)nram_argmax_fp_bk1,
                   (float *)nram_argmax_fp, (float *)nram_argmax_fp_bk2,
                   align_c);
  convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,
                   (int *)nram_argmax_int_h, (float *)nram_argmax_fp_bk2,
                   align_c);

  // Perform 'temp_result - hstart' operation
  __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart,
                    align_c);

  // Perform 'temp_result1 - temp_result2 * width' operation
  __bang_mul_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width,
                    align_c);
  convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,
                   (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c);
  __bang_sub((float *)nram_argmax_fp_w, (float *)nram_argmax_fp,
             (float *)nram_argmax_fp_w, align_c);

  // Perform 'temp_result - wstart' operation
  __bang_sub_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w,
                    wstart, align_c);

  // Perform 'temp_result = h * w_compute + w' operation
  __bang_mul_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
                    w_compute, align_c);
  __bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
             (float *)nram_argmax_fp_w, align_c);

  // When the bin is processed in several chunks, rebase the index onto the
  // current chunk.
  if (loop_flag == 1) {
    __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
                      (loop_id * true_limit), align_c);
  }
  convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1,
                   (float *)nram_argmax_fp_h, (float *)nram_argmax_fp_bk2,
                   align_c);
}

// Backward pass of RoI max pooling for the bins assigned to the calling task.
//
// The rois_num * pooled_height * pooled_width output bins are split into
// contiguous ranges of `loop` bins per task (taskId selects the range). For
// every bin, the gradient of all `channels` values is scattered to the input
// position recorded in `argmax` and accumulated into `grads_image` with
// __bang_atomic_add.
//
// Indexing used by this function (as read from the offset computations below):
//   rois:        5 values per roi: [batch_index, x1, y1, x2, y2]
//   grads/argmax: ((n * pooled_height + ph) * pooled_width + pw) * channels + c
//   grads_image: ((batch * height + h) * width + w) * channels + c
//
// high_precision is passed as 1 by the half-precision caller and 0 by the
// float caller; when it is 1 the gradients are widened to float on NRAM before
// __bang_maxpool_bp and narrowed back to half afterwards.
//
// Three NRAM strategies are used, depending on how many pixels of the bin fit
// in NRAM at once (hw_limit):
//   1. the whole bin fits                -> one pass;
//   2. only part of the bin fits         -> the bin is processed in chunks of
//                                           true_limit pixels;
//   3. not even one pixel fits (limit 0) -> channels are split in chunks of
//                                           c_limit and pixels are visited one
//                                           by one.
template <typename T>
__mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
                                   const int32_t *argmax, T *grads_image,
                                   int channels, int height, int width,
                                   int pooled_height, int pooled_width,
                                   int rois_num, const T spatial_scale,
                                   int high_precision) {
  // Calculate the number of rois processed by each core
  int bin_num = rois_num * pooled_height * pooled_width;
  int loop =
      (bin_num % taskDim) ? (bin_num / taskDim + 1) : (bin_num / taskDim);
  int tid = taskId * loop;
  if (bin_num % taskDim != 0) {
    if (tid >= bin_num) {
      return;
    } else {
      // last part is (bin_num - tid).
      loop = bin_num - tid < loop ? bin_num - tid : loop;
    }
  }
  int align_c = PAD_UP(channels, ALIGN_SIZE);
  // Common part has 2: grads, argmax; ping-pong each is PIPELINE_PINGPONG_NUM.
  int data_size =
      PAD_DOWN(((MAX_NRAM_SIZE / sizeof(float) - PIPELINE_COMMON_NUM * align_c -
                 (PIPELINE_PINGPONG_NUM - 1) * align_c * 2) /
                2),
               ALIGN_SIZE);
  // Number of bin pixels (each align_c wide) that fit into the data area.
  // This is 0 when a single align_c row is already larger than data_size.
  int hw_limit = data_size / align_c;
  float *nram_grads = (float *)nram_buffer;
  for (int idx = tid; idx < tid + loop; ++idx) {
    // (n, ph, pw) identifies one bin (all channels) in the pooled output
    int pw = idx % pooled_width;
    int ph = (idx / pooled_width) % pooled_height;
    int n = idx / pooled_width / pooled_height;

    const T *offset_rois = (const T *)(rois + n * 5);
    int roi_batch_ind = int(offset_rois[0]);
    // Calculate the roi region on feature maps
    int roi_start_w = round(offset_rois[1] * spatial_scale);
    int roi_start_h = round(offset_rois[2] * spatial_scale);
    int roi_end_w = round(offset_rois[3] * spatial_scale);
    int roi_end_h = round(offset_rois[4] * spatial_scale);
    // Force malformed rois to 1x1
    int roi_width =
        roi_end_w - roi_start_w + 1 > 1 ? roi_end_w - roi_start_w + 1 : 1;
    int roi_height =
        roi_end_h - roi_start_h + 1 > 1 ? roi_end_h - roi_start_h + 1 : 1;
    T bin_size_h = (T)roi_height / (T)pooled_height;
    T bin_size_w = (T)roi_width / (T)pooled_width;

    // The corresponding bin region
    int hstart = int(floor((T)ph * bin_size_h));
    int wstart = int(floor((T)pw * bin_size_w));
    int hend = int(ceil((T)(ph + 1) * bin_size_h));
    int wend = int(ceil((T)(pw + 1) * bin_size_w));

    // Add roi offsets and clip to input boundaries, min(max(A, B), C);
    hstart = hstart + roi_start_h > 0 ? hstart + roi_start_h : 0;
    hstart = hstart < height ? hstart : height;
    hend = hend + roi_start_h > 0 ? hend + roi_start_h : 0;
    hend = hend < height ? hend : height;
    wstart = wstart + roi_start_w > 0 ? wstart + roi_start_w : 0;
    wstart = wstart < width ? wstart : width;
    wend = wend + roi_start_w > 0 ? wend + roi_start_w : 0;
    wend = wend < width ? wend : width;

    bool is_empty = (hend <= hstart) || (wend <= wstart);
    if (!is_empty) {
      int h_compute = hend - hstart;
      int w_compute = wend - wstart;
      // Pixels handled per pass: the whole bin if it fits, otherwise hw_limit
      // (which may be 0, see the last branch below).
      int true_limit =
          hw_limit < h_compute * w_compute ? hw_limit : h_compute * w_compute;
      // Consecutive align_c-sized NRAM regions following the grads buffer.
      int32_t *nram_argmax = (int32_t *)nram_grads + align_c;
      int32_t *nram_argmax_fp = (int32_t *)nram_argmax + align_c;
      int32_t *nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + align_c;
      int32_t *nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + align_c;
      int32_t *nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + align_c;
      int32_t *nram_argmax_int_h = (int32_t *)nram_argmax_int + align_c;
      int32_t *nram_argmax_int_w = (int32_t *)nram_argmax_int_h + align_c;
      int32_t *nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + align_c;
      int32_t *nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + align_c;
      float *nram_atomic_add = (float *)nram_argmax_fp_w + align_c;
      float *nram_grads_image = (float *)nram_atomic_add + align_c;
      if (true_limit == h_compute * w_compute) {
        /*
         * NRAM partition
         *  |---------------------------------------------------|
         *  |                     grads                         |
         *  |---------------------------------------------------|
         *  |                     argmax                        |
         *  |---------------------------------------------------|
         *  |                     argmax_temp                   |
         *  |---------------------------------------------------|
         *  |                     atomic_add                    |
         *  |---------------------------------------------------|
         *  |                     grads_image                   |
         *  |---------------------------------------------------|
         */

        // Load the data from GDRAM to NRAM.
        // With high_precision the half data is loaded align_c elements into
        // the buffer and then widened to float starting at the buffer head.
        __memcpy(
            (T *)nram_grads + align_c * high_precision,
            (const T *)grads +
                (n * pooled_height * pooled_width + ph * pooled_width + pw) *
                    channels,
            channels * sizeof(T), GDRAM2NRAM);
        if (high_precision) {
          __bang_half2float((float *)nram_grads,
                            (half *)nram_grads + align_c * high_precision,
                            align_c);
        }

        __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax +
                                             (n * pooled_height * pooled_width +
                                              ph * pooled_width + pw) *
                                                 channels,
                 channels * sizeof(int32_t), GDRAM2NRAM);

        // Perform pooling operation on NRAM.
        convertIndex(nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1,
                     nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h,
                     nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w,
                     nram_atomic_add, nram_grads_image, width, height, wstart,
                     hstart, w_compute, h_compute, align_c, channels, 0, 0, 0);
        __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads,
                          (int32_t *)nram_argmax_int, align_c, h_compute,
                          w_compute, h_compute, w_compute, h_compute,
                          w_compute);
        if (high_precision) {
          __bang_float2half_rd((half *)nram_grads_image,
                               (float *)nram_grads_image,
                               h_compute * w_compute * align_c);
        }

        // Store the result on NRAM back to GDRAM.
        for (int hc = 0; hc < h_compute; ++hc) {
          for (int wc = 0; wc < w_compute; ++wc) {
            T *dst = (T *)nram_atomic_add;
            int grad_image_offset = (roi_batch_ind * height * width +
                                     (hc + hstart) * width + wc + wstart) *
                                    channels;
            T *src1 = (T *)grads_image + grad_image_offset;
            int nram_grads_image_offset = (hc * w_compute + wc) * align_c;
            T *src2 = (T *)nram_grads_image + nram_grads_image_offset;
            __bang_atomic_add(dst, src1, src2, channels);
          }
        }
      } else if (true_limit > 0) {
        /*
         * NRAM partition
         *  |---------------------------------------------------|
         *  |                     grads                         |
         *  |---------------------------------------------------|
         *  |                     argmax                        |
         *  |--------------------ping_pong----------------------|
         *  |       argmax_temp      |       argmax_temp        |
         *  |------------------------|--------------------------|
         *  |       atomic_add       |       atomic_add         |
         *  |------------------------|--------------------------|
         *  |       grads_image      |       grads_image        |
         *  |---------------------------------------------------|
         */

        // Load the data from GDRAM to NRAM.
        __memcpy(
            (T *)nram_grads + align_c * high_precision,
            (const T *)grads +
                (n * pooled_height * pooled_width + ph * pooled_width + pw) *
                    channels,
            channels * sizeof(T), GDRAM2NRAM);
        if (high_precision) {
          __bang_half2float((float *)nram_grads,
                            (half *)nram_grads + align_c * high_precision,
                            align_c);
        }
        __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax +
                                             (n * pooled_height * pooled_width +
                                              ph * pooled_width + pw) *
                                                 channels,
                 channels * sizeof(int32_t), GDRAM2NRAM);

        // Number of full chunks and size of the trailing chunk. These are
        // computed inside this branch because true_limit may be 0 otherwise,
        // and dividing by it would be undefined behaviour.
        int loop_int = (h_compute * w_compute) / true_limit;
        int rem = (h_compute * w_compute) % true_limit;
        int ping_pong = 0;
        int ping_pong_offset =
            (MAX_NRAM_SIZE / sizeof(float) - align_c * PIPELINE_COMMON_NUM) / 2;
        for (int loop_id = 0; loop_id <= loop_int; ++loop_id) {
          int size = (loop_id == loop_int) ? rem : true_limit;
          if (size == 0) {
            break;
          }
          // Perform pooling operation on NRAM.
          nram_argmax_fp =
              (int32_t *)nram_argmax + align_c + ping_pong * ping_pong_offset;
          nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + align_c;
          nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + align_c;
          nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + align_c;
          nram_argmax_int_h = (int32_t *)nram_argmax_int + align_c;
          nram_argmax_int_w = (int32_t *)nram_argmax_int_h + align_c;
          nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + align_c;
          nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + align_c;
          nram_atomic_add = (float *)nram_argmax_fp_w + align_c;
          nram_grads_image = (float *)nram_atomic_add + align_c;
          int loop_id_1 = loop_id;
          int size_1 = ((loop_id_1) == loop_int) ? rem : true_limit;
          if (size_1 == 0) {
            break;
          }
          convertIndex(nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1,
                       nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h,
                       nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w,
                       nram_atomic_add, nram_grads_image, width, height, wstart,
                       hstart, w_compute, h_compute, align_c, channels, 1,
                       loop_id_1, true_limit);
          __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads,
                            (int32_t *)nram_argmax_int, align_c, size_1, 1,
                            size_1, 1, size_1, 1);
          if (high_precision) {
            __bang_float2half_rd((half *)nram_grads_image,
                                 (float *)nram_grads_image, size_1 * align_c);
          }

          // Store the result on NRAM back to GDRAM.
          for (int index_size = 0; index_size < size; ++index_size) {
            // Recover the (h, w) position inside the bin from the flat index.
            int h = (loop_id * true_limit + index_size) / w_compute;
            int w = (loop_id * true_limit + index_size) % w_compute;
            T *dst = (T *)nram_atomic_add;
            T *grads_image_n =
                (T *)grads_image + roi_batch_ind * height * width * channels;
            T *src1 = (T *)grads_image_n +
                      ((h + hstart) * width + (w + wstart)) * channels;
            T *src2 = (T *)nram_grads_image + index_size * align_c;
            __bang_atomic_add(dst, src1, src2, channels);
          }
          ping_pong = 1 - ping_pong;
        }
      } else {
        /*
         * NRAM partition
         *  |---------------------------------------------------|
         *  |                     grads                         |
         *  |---------------------------------------------------|
         *  |                     argmax                        |
         *  |--------------------ping_pong----------------------|
         *  |       argmax_temp      |       argmax_temp        |
         *  |------------------------|--------------------------|
         *  |       atomic_add       |       atomic_add         |
         *  |------------------------|--------------------------|
         *  |       grads_image      |       grads_image        |
         *  |---------------------------------------------------|
         */

        // Reached only when true_limit == 0, i.e. not even one pixel with all
        // (aligned) channels fits: split the channel dimension instead.
        int c_limit =
            PAD_DOWN(MAX_NRAM_SIZE / sizeof(float) /
                         (PIPELINE_COMMON_NUM + PIPELINE_PINGPONG_NUM * 2),
                     ALIGN_SIZE);
        int loop_int = channels / c_limit;
        int rem = channels % c_limit;
        int ping_pong = 0;
        int ping_pong_offset =
            (MAX_NRAM_SIZE / sizeof(float) - c_limit * PIPELINE_COMMON_NUM) / 2;
        for (int loop_id = 0; loop_id <= loop_int; ++loop_id) {
          int size = (loop_id == loop_int) ? rem : c_limit;
          if (size == 0) {
            break;
          }
          nram_argmax_fp =
              (int32_t *)nram_argmax + c_limit + ping_pong * ping_pong_offset;
          nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + c_limit;
          nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + c_limit;
          nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + c_limit;
          nram_argmax_int_h = (int32_t *)nram_argmax_int + c_limit;
          nram_argmax_int_w = (int32_t *)nram_argmax_int_h + c_limit;
          nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + c_limit;
          nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + c_limit;
          nram_atomic_add = (float *)nram_argmax_fp_w + c_limit;
          nram_grads_image = (float *)nram_atomic_add + c_limit;

          // This pipeline loads the data from GDRAM to NRAM.
          __memcpy((T *)nram_grads + c_limit * high_precision,
                   (const T *)grads +
                       n * pooled_height * pooled_width * channels +
                       ph * pooled_width * channels + pw * channels +
                       loop_id * c_limit,
                   size * sizeof(T), GDRAM2NRAM);
          if (high_precision) {
            __bang_half2float((float *)nram_grads,
                              (half *)nram_grads + c_limit * high_precision,
                              c_limit);
          }
          __memcpy((int32_t *)nram_argmax,
                   (const int32_t *)argmax +
                       n * pooled_height * pooled_width * channels +
                       ph * pooled_width * channels + pw * channels +
                       loop_id * c_limit,
                   size * sizeof(int32_t), GDRAM2NRAM);

          for (int hc = 0; hc < h_compute; ++hc) {
            for (int wc = 0; wc < w_compute; ++wc) {
              // This pipeline performs pooling operation on NRAM.
              // NOTE(review): the two other call sites pass
              // (w_compute, h_compute) in this position, while this one
              // passes (h_compute, w_compute) - confirm against the
              // convertIndex signature that the order here is intended.
              convertIndex(
                  nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1,
                  nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h,
                  nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w,
                  nram_atomic_add, nram_grads_image, width, height, wstart + wc,
                  hstart + hc, h_compute, w_compute, c_limit, size, 0, 0, 0);
              __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads,
                                (int32_t *)nram_argmax_int, c_limit, 1, 1, 1, 1,
                                1, 1);
              if (high_precision) {
                __bang_float2half_rd((half *)nram_grads_image,
                                     (float *)nram_grads_image, c_limit);
              }
              // This pipeline stores the result on NRAM back to GDRAM.
              T *dst = (T *)nram_atomic_add;
              T *grads_image_n =
                  (T *)grads_image + roi_batch_ind * height * width * channels;
              T *src1 = (T *)grads_image_n +
                        ((hc + hstart) * width + (wc + wstart)) * channels +
                        loop_id * c_limit;
              T *src2 = (T *)nram_grads_image;
              __bang_atomic_add(dst, src1, src2, size);
            }
          }
          ping_pong = 1 - ping_pong;
        }
      }
    }
  }
}

// Kernel entry point for the RoI pooling backward pass. Selects the element
// type from k_dtype and forwards to MLUUnion1Roipool; any other dtype is a
// no-op. The `no` parameter is not used by this function.
__mlu_global__ void MLUKernelRoiPoolBackward(
    const void *grads, const void *rois, const int *argmax, void *grads_image,
    int rois_num, int pooled_height, int pooled_width, int channels, int no,
    int height, int width, const float spatial_scale,
    const cnrtDataType_t k_dtype) {
  // The memory core (coreId 0x80) must not take part in the computation.
  if (coreId == 0x80) {
    return;
  }

  if (k_dtype == CNRT_FLOAT16) {
    // Half inputs are computed with the float '__bang_max_pool_bp'
    // instruction to increase the bit width.
    const int widen_to_float = 1;
    MLUUnion1Roipool((const half *)rois, (const half *)grads,
                     (const int32_t *)argmax, (half *)grads_image, channels,
                     height, width, pooled_height, pooled_width, rois_num,
                     (const half)spatial_scale, widen_to_float);
  } else if (k_dtype == CNRT_FLOAT32) {
    const int widen_to_float = 0;
    MLUUnion1Roipool((const float *)rois, (const float *)grads,
                     (const int32_t *)argmax, (float *)grads_image, channels,
                     height, width, pooled_height, pooled_width, rois_num,
                     (const float)spatial_scale, widen_to_float);
  }
  // Every other data type is silently ignored.
}
}  // namespace backward

// Host-side launcher for the RoI pooling forward kernel.
//
// Enqueues forward::MLUKernelRoiPool on `queue` with launch dimensions `k_dim`
// and function type `k_type`, forwarding all remaining arguments. Note that
// the kernel takes `data_type` as its first argument, whereas this wrapper
// receives it after `queue`.
void KernelRoiPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                          cnrtQueue_t queue, cnrtDataType_t data_type,
                          const void *input_data, const void *input_rois,
                          const int batch, const int channels, const int height,
                          const int width, const int pooled_height,
                          const int pooled_width, const int rois_num,
                          const float spatial_scale, void *output_data,
                          int *argmax) {
  forward::MLUKernelRoiPool<<<k_dim, k_type, queue>>>(
      data_type, input_data, input_rois, batch, channels, height, width,
      pooled_height, pooled_width, rois_num, spatial_scale, output_data,
      argmax);
}

// Host-side launcher for the RoI pooling backward kernel.
//
// Enqueues backward::MLUKernelRoiPoolBackward on `queue` with launch
// dimensions `k_dim` and function type `k_type`. Argument mapping to the
// kernel parameters: grad_output_ptr -> grads, rois_ptr -> rois,
// argmax_ptr -> argmax, grad_input_ptr -> grads_image, box_num -> rois_num,
// batch -> no (which the kernel does not read), and k_dtype is passed last.
void KernelRoiPoolBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                           cnrtQueue_t queue, cnrtDataType_t k_dtype,
                           const void *grad_output_ptr, const void *rois_ptr,
                           const int *argmax_ptr, void *grad_input_ptr,
                           const int box_num, const int pooled_height,
                           const int pooled_width, const int channels,
                           const int batch, const int height, const int width,
                           const float spatial_scale) {
  backward::MLUKernelRoiPoolBackward<<<k_dim, k_type, queue>>>(
      grad_output_ptr, rois_ptr, argmax_ptr, grad_input_ptr, box_num,
      pooled_height, pooled_width, channels, batch, height, width,
      spatial_scale, k_dtype);
}


================================================
FILE: mmcv/ops/csrc/common/mps/MPSDevice.h
================================================
//  Copyright © 2022 Apple Inc.

// This file is modified from:
// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSDevice.h

#pragma once
#include <ATen/ATen.h>
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>

#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
typedef id<MTLDevice> MTLDevice_t;
#else
typedef void* MTLDevice;
typedef void* MTLDevice_t;
#endif

using namespace std;

namespace at {
namespace mps {

//-----------------------------------------------------------------
//  MPSDevice
//
// MPSDevice is a singleton class that returns the default device
//-----------------------------------------------------------------

// Singleton wrapper around a single Metal device handle (MTLDevice_t).
// Only declarations live here; getInstance(), the constructor and the
// destructor are defined elsewhere.
class TORCH_API MPSDevice {
 public:
  /**
   * MPSDevice should not be cloneable.
   * NOTE(review): the parameter is a non-const reference rather than the
   * conventional `const MPSDevice&`.
   */
  MPSDevice(MPSDevice& other) = delete;
  /**
   * MPSDevice should not be assignable.
   */
  void operator=(const MPSDevice&) = delete;
  /**
   * Gets single instance of the Device.
   */
  static MPSDevice* getInstance();
  /**
   * Returns the single device.
   */
  MTLDevice_t device() { return _mtl_device; }

  ~MPSDevice();

 private:
  // Storage for the singleton pointer returned by getInstance().
  static MPSDevice* _device;
  // Metal device handle exposed through device().
  MTLDevice_t _mtl_device;
  // Private so that instances are only created through getInstance().
  MPSDevice();
};

TORCH_API bool is_available();

TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false);

}  // namespace mps
}  // namespace at


================================================
FILE: mmcv/ops/csrc/common/mps/MPSLibrary.h
================================================
#ifndef _MPS_LIBRARY_H_
#define _MPS_LIBRARY_H_

#include <string>
#include <unordered_map>

#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>

typedef id<MTLComputePipelineState> MTLComputePipelineState_t;
typedef id<MTLLibrary> MTLLibrary_t;
#else
typedef void* MTLComputePipelineState;
typedef void* MTLComputePipelineState_t;
typedef void* MTLLibrary;
typedef void* MTLLibrary_t;
#endif

// Wraps one Metal library (MTLLibrary_t) together with a cache of the compute
// pipeline states created from its functions.
class MPSLibrary {
 public:
  // Factory functions. Each allocates a new MPSLibrary with `new` and returns
  // the raw pointer; MPSLibraryManager stores the result in a
  // std::unique_ptr. `createFromUrl` loads a library from a file path,
  // `createFromSource` compiles one from source text.
  static MPSLibrary* createFromUrl(const std::string& library_url);
  static MPSLibrary* createFromSource(const std::string& source);
  ~MPSLibrary();

  // Returns the wrapped Metal library handle.
  MTLLibrary_t library() { return _library; }

  // Returns the compute pipeline state for `function_name`, creating and
  // caching it in `_pso_map` on first use.
  MTLComputePipelineState_t getComputePipelineState(
      const std::string& function_name);

 private:
  // Handle of the Metal library, released in the destructor.
  MTLLibrary_t _library;
  // Cache: function name -> compute pipeline state.
  std::unordered_map<std::string, MTLComputePipelineState_t> _pso_map;
};

// Singleton registry that owns MPSLibrary objects, keyed by a string (either
// the library URL or a caller-chosen name).
// NOTE(review): this class uses std::unique_ptr, but the header does not
// include <memory> itself - confirm every includer provides it.
class MPSLibraryManager {
 public:
  // Copying and moving are disabled because the manager is a singleton.
  MPSLibraryManager(const MPSLibraryManager&) = delete;
  MPSLibraryManager& operator=(const MPSLibraryManager&) = delete;
  MPSLibraryManager(MPSLibraryManager&&) = delete;
  MPSLibraryManager& operator=(MPSLibraryManager&&) = delete;

  // Returns the process-wide manager, creating it on first use.
  static MPSLibraryManager* getInstance();

  // True if a library is registered under `name`.
  bool hasLibrary(const std::string& name);

  // Returns the library registered under `library_url`, loading it from that
  // URL first if it is not registered yet.
  MPSLibrary* getLibrary(const std::string& library_url);

  // Compiles `sources` and registers the result under `name`. Returns nullptr
  // if `name` is already registered. (The misspelling "Souce" is part of the
  // existing interface.)
  MPSLibrary* createLibraryFromSouce(const std::string& name,
                                     const std::string& sources);

  ~MPSLibraryManager();

 private:
  // Private so that instances are only created through getInstance().
  MPSLibraryManager();
  // Owning map: key -> library.
  std::unordered_map<std::string, std::unique_ptr<MPSLibrary>> _library_map;
};
#endif


================================================
FILE: mmcv/ops/csrc/common/mps/MPSLibrary.mm
================================================
#include "MPSLibrary.h"
#include "MPSDevice.h"

// Holder of the process-wide manager; stays empty until getInstance() is
// called for the first time.
static std::unique_ptr<MPSLibraryManager> mps_library_manager;

// Returns the process-wide manager, constructing it lazily on first use.
MPSLibraryManager* MPSLibraryManager::getInstance() {
  if (mps_library_manager == nullptr) {
    mps_library_manager.reset(new MPSLibraryManager());
  }
  return mps_library_manager.get();
}

// Nothing to do explicitly: destroying _library_map destroys the owned
// MPSLibrary objects through their std::unique_ptr.
MPSLibraryManager::~MPSLibraryManager() {}

// The manager starts with an empty library map.
MPSLibraryManager::MPSLibraryManager() {}

// Reports whether a library has been registered under `name`.
bool MPSLibraryManager::hasLibrary(const std::string& name) {
  const bool is_registered = _library_map.count(name) > 0;
  return is_registered;
}

// Returns the library registered under `library_url`. On a miss the library
// is loaded with MPSLibrary::createFromUrl and registered before returning.
MPSLibrary* MPSLibraryManager::getLibrary(const std::string& library_url) {
  auto slot = _library_map.find(library_url);
  if (slot == _library_map.end()) {
    std::unique_ptr<MPSLibrary> loaded(MPSLibrary::createFromUrl(library_url));
    slot = _library_map.emplace(std::make_pair(library_url, std::move(loaded))).first;
  }
  return slot->second.get();
}

// Compiles `source` into a new library and registers it under `name`.
//
// Returns the newly registered library, or nullptr (after logging) when a
// library with the same name already exists; the existing entry is left
// untouched in that case.
MPSLibrary* MPSLibraryManager::createLibraryFromSouce(const std::string& name,
                                                      const std::string& source) {
  if (_library_map.find(name) != _library_map.end()) {
    // Build the NSString only on this path; it is needed for logging alone.
    // stringWithUTF8String: replaces the deprecated stringWithCString:.
    NSLog(@"Library %@ already exist.", [NSString stringWithUTF8String:name.c_str()]);
    return nullptr;
  }

  // emplace returns an iterator to the inserted element, which avoids a
  // second hash lookup through operator[].
  auto inserted = _library_map.emplace(
      std::make_pair(name, std::unique_ptr<MPSLibrary>(MPSLibrary::createFromSource(source))));
  return inserted.first->second.get();
}

// Loads a Metal library from the file at `library_url` using the device from
// at::mps::MPSDevice.
//
// Returns a newly allocated MPSLibrary owned by the caller. If the library
// cannot be loaded, the error is logged and the process exits with status 1.
MPSLibrary* MPSLibrary::createFromUrl(const std::string& library_url) {
  MPSLibrary* library = new MPSLibrary();
  @autoreleasepool {
    NSError* error = nil;

    // load library and func
    // stringWithUTF8String: replaces the deprecated stringWithCString:, which
    // decoded the bytes with the platform's default C-string encoding.
    NSString* url_str = [NSString stringWithUTF8String:library_url.c_str()];
    NSURL* metal_url = [NSURL fileURLWithPath:url_str];
    library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithURL:metal_url
                                                                                 error:&error];
    if (library->_library == nil) {
      NSLog(@"Failed to find library, error %@.", error);
      exit(1);
    }
  }

  return library;
}

// Compiles Metal source text into a library using the device from
// at::mps::MPSDevice (default compile options).
//
// Returns a newly allocated MPSLibrary owned by the caller. If compilation
// fails, the error is logged and the process exits with status 1.
MPSLibrary* MPSLibrary::createFromSource(const std::string& sources) {
  MPSLibrary* library = new MPSLibrary();
  @autoreleasepool {
    NSError* error = nil;

    // compile library
    // stringWithUTF8String: replaces the deprecated stringWithCString:, which
    // decoded the bytes with the platform's default C-string encoding.
    NSString* code_str = [NSString stringWithUTF8String:sources.c_str()];
    library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithSource:code_str
                                                                                  options:nil
                                                                                    error:&error];
    if (library->_library == nil) {
      // Nothing is looked up here, so report a compile failure.
      NSLog(@"Failed to compile library from source, error %@.", error);
      exit(1);
    }
  }

  return library;
}

// Releases everything this object owns: the cached compute pipeline states
// and the Metal library itself. This file uses manual retain/release, and
// both kinds of object are obtained from `new...` methods, so each must be
// released exactly once here.
MPSLibrary::~MPSLibrary() {
  for (auto& cached : _pso_map) {
    // Messaging nil is a no-op, so empty entries are harmless.
    [cached.second release];
    cached.second = nil;
  }
  _pso_map.clear();

  [_library release];
  _library = nil;
}

// Returns the compute pipeline state for the function `function_name` of this
// library, creating it on first use and caching it in `_pso_map`.
//
// If the function does not exist in the library, or the pipeline state cannot
// be created, the failure is logged and the process exits with status 1
// (matching the failure handling of createFromUrl / createFromSource).
MTLComputePipelineState_t MPSLibrary::getComputePipelineState(const std::string& function_name) {
  // Fast path: already created.
  auto cached = _pso_map.find(function_name);
  if (cached != _pso_map.end()) {
    return cached->second;
  }

  MTLComputePipelineState_t pso = nil;
  @autoreleasepool {
    NSError* error = nil;

    // create function
    // stringWithUTF8String: replaces the deprecated stringWithCString:.
    NSString* function_name_str = [NSString stringWithUTF8String:function_name.c_str()];
    id<MTLFunction> func = [_library newFunctionWithName:function_name_str];
    if (func == nil) {
      // newFunctionWithName: reports no NSError, so log the name instead.
      NSLog(@"Failed to find function %@ in library.", function_name_str);
      exit(1);
    }
    // create pipeline
    pso = [at::mps::MPSDevice::getInstance()->device() newComputePipelineStateWithFunction:func
                                                                                     error:&error];
    // `func` came from a `new...` method and is no longer needed once the
    // pipeline state exists; release it (manual retain/release).
    [func release];
    if (pso == nil) {
      // Do not cache a nil pipeline state; report the real error.
      NSLog(@"Failed to created pipeline state object, error %@.", error);
      exit(1);
    }
  }
  _pso_map.emplace(std::make_pair(function_name, pso));
  return pso;
}


================================================
FILE: mmcv/ops/csrc/common/mps/MPSStream.h
================================================
//  Copyright © 2022 Apple Inc.

// This file is modified from:
// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSStream.h

#pragma once

#include <cstdint>
#include <utility>

#include <c10/core/DeviceGuard.h>
#include <c10/core/Stream.h>
#include <c10/util/Exception.h>
#include "MPSDevice.h"

#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
typedef id<MTLCommandQueue> MTLCommandQueue_t;
typedef id<MTLCommandBuffer> MTLCommandBuffer_t;
typedef id<MTLSharedEvent> MTLSharedEvent_t;
typedef id<MTLDevice> MTLDevice_t;
#else
typedef void* MTLCommandQueue_t;
typedef void* MTLCommandQueue;
typedef void* MTLCommandBuffer_t;
typedef void* MTLCommandBuffer;
typedef void* MTLSharedEvent_t;
typedef void* dispatch_queue_t;
typedef void* MTLDevice_t;
#define nil NULL;
#endif

namespace at {
namespace mps {

//-----------------------------------------------------------------
//  MPSStream
//-----------------------------------------------------------------

// Associates a c10 Stream with a Metal command queue, the command buffer
// currently in use, and a serial dispatch queue. Only the inline accessors
// are defined here; the remaining members are implemented elsewhere.
class TORCH_API MPSStream {
 public:
  enum Unchecked { UNCHECKED };
  /// Construct a MPSStream from a Stream.  This construction is checked,
  /// and will raise an error if the Stream is not, in fact, a MPS stream.
  explicit MPSStream(Stream stream);

  ~MPSStream();
  /// Returns the Metal command queue held by this stream.
  MTLCommandQueue_t commandQueue() const { return _commandQueue; };
  /// Returns the serial dispatch queue held by this stream.
  dispatch_queue_t queue() const { return _serialQueue; }

  MTLCommandBuffer_t commandBuffer();
  void commit(bool flush);
  void commitAndWait();
  void synchronize();

  void flush();

  /// Get the MPS device index that this stream is associated with.
  c10::DeviceIndex device_index() const { return _stream.device_index(); }

  /// Same handle as commandQueue().
  MTLCommandQueue_t stream() const { return _commandQueue; };

  /// Returns the device the command queue belongs to.
  /// NOTE(review): this body uses Objective-C message syntax but is not
  /// guarded by __OBJC__ - confirm the header is only included from
  /// Objective-C++ translation units.
  MTLDevice_t device() const { return [_commandQueue device]; }

  /// Explicit conversion to Stream.
  Stream unwrap() const { return _stream; }

 private:
  Stream _stream;
  MTLCommandQueue_t _commandQueue = nil;
  MTLCommandBuffer_t _commandBuffer = nil;
  void _flush(bool commitAndWait) const;

  dispatch_queue_t _serialQueue = nullptr;
};

/**
 * Get the current MPS stream
 */
TORCH_API MPSStream* getCurrentMPSStream();

/**
 * Get the default MPS stream
 */
TORCH_API MPSStream* getDefaultMPSStream();

//-----------------------------------------------------------------
//  MPSStreamImpl
//-----------------------------------------------------------------

// Holder for the single MPSStream instance. It cannot be instantiated from
// outside (private constructor); use the static getInstance() accessor.
class TORCH_API MPSStreamImpl {
 public:
  /**
   * Gets single instance of the MPSStream.
   */
  static MPSStream* getInstance();

 private:
  // Storage for the stream pointer returned by getInstance().
  static MPSStream* _stream;
  MPSStreamImpl();
};

//-----------------------------------------------------------------
//  MPSEvent
//-----------------------------------------------------------------

// Wraps a Metal shared event (MTLSharedEvent_t) together with a 64-bit
// counter value and a "recorded" flag. Only the trivial accessors are defined
// here; recordEvent / waitForEvent / queryEvent are implemented elsewhere.
struct TORCH_API MPSEvent {
  MPSEvent();
  // MPSEvent(id<MTLDevice> device);

  ~MPSEvent();
  // Returns the wrapped shared-event handle.
  MTLSharedEvent_t event() const { return _event; }

  void recordEvent(MPSStream* stream);
  void waitForEvent(MPSStream* queue);  // waits on the cpu
  bool queryEvent();
  // Plain getter/setter for the stored counter value.
  uint64_t getCurrentValue() { return _currentValue; }
  void setCurrentValue(uint64_t currValue) { _currentValue = currValue; }

 private:
  bool _isRecorded = false;
  uint64_t _currentValue = 0;
  MTLSharedEvent_t _event;
};

typedef MPSEvent* mpsEvent_t;

}  // namespace mps
}  // namespace at


================================================
FILE: mmcv/ops/csrc/common/mps/MPSUtils.h
================================================
#ifndef _MPS_UTILS_H_
#define _MPS_UTILS_H_
#include <torch/extension.h>
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>

typedef id<MTLBuffer> MTLBuffer_t;
typedef id<MTLComputeCommandEncoder> MTLComputeCommandEncoder_t;
#else
typedef void* MTLBuffer;
typedef void* MTLBuffer_t;
typedef void* MTLComputeCommandEncoder;
typedef void* MTLComputeCommandEncoder_t;
#endif

// utils
// Reinterprets the data pointer of the tensor's storage as a Metal buffer
// handle (a bit-for-bit cast, no conversion or copy takes place).
static inline MTLBuffer_t getMTLBufferStorage(const at::Tensor& tensor) {
  auto storage_ptr = tensor.storage().data();
  return __builtin_bit_cast(MTLBuffer_t, storage_ptr);
}

// setMTLArg binds one kernel argument at slot `index` of the compute encoder.
// Two overloads are selected with SFINAE on the decayed argument type:
//   - at::Tensor: binds the tensor's storage as a buffer;
//   - anything else: copies the bytes of the value into the encoder.

// Forward declaration of the non-tensor overload (defined below).
template <typename T,
          std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value, bool> = true>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t);

// Tensor overload. The buffer handle refers to the start of the storage, so
// the byte offset of the tensor inside that storage must be supplied as well;
// otherwise a view with a non-zero storage offset would expose the wrong
// elements to the kernel. For tensors that start at the beginning of their
// storage the offset is 0, as before.
template <typename T,
          std::enable_if_t<std::is_same<std::decay_t<T>, at::Tensor>::value, bool> = true>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) {
  const NSUInteger byte_offset = static_cast<NSUInteger>(t.storage_offset() * t.element_size());
  [encoder setBuffer:getMTLBufferStorage(t) offset:byte_offset atIndex:index];
}

// Non-tensor overload: passes the value by copy (sizeof(t) bytes).
template <typename T, std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value, bool>>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) {
  [encoder setBytes:&t length:sizeof(t) atIndex:index];
}

// Recursion terminator: no arguments left to bind.
inline void setMTLArgsImpl(MTLComputeCommandEncoder_t, int) {}

// Binds `t` at slot `index` via setMTLArg, then recurses on the remaining
// arguments with the next slot number, so arguments occupy consecutive slots
// in the order they were passed.
template <typename T, typename... Args>
void setMTLArgsImpl(MTLComputeCommandEncoder_t encoder, int index, T&& t, Args&&... args) {
  setMTLArg(encoder, index, std::forward<T>(t));
  setMTLArgsImpl(encoder, index + 1, std::forward<Args>(args)...);
}

// Sets `pso` as the encoder's compute pipeline state and binds `args` to
// argument slots 0, 1, 2, ... in order (tensors as buffers, other values as
// raw bytes; see setMTLArg).
// NOTE(review): MTLComputePipelineState_t is not declared in this header -
// confirm includers pull in MPSLibrary.h (where it is typedef'd) first.
template <typename... Args>
void setMTLArgs(MTLComputeCommandEncoder_t encoder, MTLComputePipelineState_t pso, Args&&... args) {
  [encoder setComputePipelineState:pso];
  setMTLArgsImpl(encoder, 0, std::forward<Args>(args)...);
}
#endif


================================================
FILE: mmcv/ops/csrc/common/musa/active_rotated_filter_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu
#ifndef ACTIVE_ROTATED_FILTER_MUSA_KERNEL_MUH
#define ACTIVE_ROTATED_FILTER_MUSA_KERNEL_MUH


#include "pytorch_musa_helper.hpp"

// Forward pass: copies every weight element into one slot of each rotated
// filter copy.
//
// The flat `index` into `weight_data` is split into
//   entry     = index % nEntry
//   in_plane  = (index / nEntry) % num_input_planes
//   out_plane = index / nEntry / num_input_planes
// For each rotation `rot`, `indices_data[entry * num_rotations + rot]` minus
// one is the destination entry inside the output slab addressed by
// (out_plane, rot, in_plane).
// `num_output_planes` and `num_orientations` are accepted but not read.
template <typename scalar_t>
__global__ void active_rotated_filter_forward_musa_kernel(
    const int nthreads, const scalar_t* weight_data, const int* indices_data,
    const int num_input_planes, const int num_output_planes,
    const int num_orientations, const int num_rotations, const int nEntry,
    scalar_t* output_data) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    const int entry = index % nEntry;
    const int in_plane = (index / nEntry) % num_input_planes;
    const int out_plane = index / nEntry / num_input_planes;

    const scalar_t source = weight_data[index];
    // Start of the output region owned by this output plane.
    scalar_t* plane_base =
        output_data + out_plane * (num_rotations * num_input_planes * nEntry);

    for (int rot = 0; rot < num_rotations; ++rot) {
      const int dst_entry = (int)(indices_data[entry * num_rotations + rot]) - 1;
      scalar_t* slot = plane_base + rot * (num_input_planes * nEntry) +
                       in_plane * (nEntry) + dst_entry;
      *slot = source;
    }
  }
}

// Backward pass: gathers, for each weight element, the gradients of all its
// rotated copies and writes their sum to `weight_data[index]`.
//
// `index` is decomposed exactly as in the forward kernel; the source position
// of each rotated copy is `indices_data[entry * num_rotations + rot] - 1`
// inside the slab addressed by (out_plane, rot, in_plane).
// `num_output_planes` and `num_orientations` are accepted but not read.
template <typename scalar_t>
__global__ void active_rotated_filter_backward_musa_kernel(
    const int nthreads, const scalar_t* gradWeight_data,
    const int* indices_data, const int num_input_planes,
    const int num_output_planes, const int num_orientations,
    const int num_rotations, const int nEntry, scalar_t* weight_data) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    const int entry = index % nEntry;
    const int in_plane = (index / nEntry) % num_input_planes;
    const int out_plane = index / nEntry / num_input_planes;

    // The destination is cleared first and overwritten with the sum below.
    weight_data[index] = 0;

    const scalar_t* plane_base =
        gradWeight_data +
        out_plane * (num_rotations * num_input_planes * nEntry);

    scalar_t acc = 0;
    for (int rot = 0; rot < num_rotations; ++rot) {
      const int src_entry = (int)(indices_data[entry * num_rotations + rot]) - 1;
      const scalar_t grad = *(plane_base + rot * (num_input_planes * nEntry) +
                              in_plane * (nEntry) + src_entry);
      acc = acc + grad;
    }
    weight_data[index] = acc;
  }
}
#endif  // ACTIVE_ROTATED_FILTER_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/assign_score_withk_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ASSIGN_SCORE_WITHK_MUSA_KERNEL_MUH
#define ASSIGN_SCORE_WITHK_MUSA_KERNEL_MUH


#include "pytorch_musa_helper.hpp"

// input: points(B,N0,M,O), centers(B,N0,M,O), scores(B,N1,K,M), knn_idx(B,N1,K)
// output: fout(B,O,N1,K) as written by the forward kernel (one value per neighbor k)
// algo: fout(b,i,k,j) = s(b,i,k,m)*p(b,c(i),k,m,j) =  s(b,i,k,m)*p(b,i(k),m,j)
//       i(k) = idx(b,i,k)
//      sum: fout(b,i,j) = fout(b,i,j) + s(b,i,k,m)*p(b,i,k,m,j)
//      avg: fout(b,i,j) = sum(fout(b,i,k,j)) / k
//      max: fout(b,i,j) = max(fout(b,i,k,j), sum(s(b,i,k,m)*p(b,i,k,m,j)))

// Forward pass of assign_score_withk.
//
// One loop iteration handles one output element (b, o, n, k) of `output`,
// which is indexed as (B, O, N1, K). It adds to the value already stored
// there:
//   sum_m scores[b, n, k, m] * (points[b, kn, m, o] - centers[b, cn, m, o])
// evaluated as two separate products per m, where kn = knn_idx[b, n, k] and
// cn = knn_idx[b, n, 0] (the first neighbor is the center point).
// Elements whose neighbor index kn lies outside [0, N0) are left untouched.
// `aggregate` is accepted but not read by this kernel.
template <typename T>
__global__ void assign_score_withk_forward_musa_kernel(
    const int B, const int N0, const int N1, const int M, const int K,
    const int O, const int aggregate, const T* points, const T* centers,
    const T* scores, const int64_t* knn_idx, T* output) {
  // ----- parallel loop for B, N1, K and O ---------
  MUSA_1D_KERNEL_LOOP(i, B * O * N1 * K) {
    // ------- loop for M ----------
    const int b = (int)(i / (O * N1 * K));
    const int o = (int)(i % (O * N1 * K) / (N1 * K));
    const int n = (int)(i % (N1 * K) / K);
    const int k = (int)(i % K);
    const int cn = (int)knn_idx[b * K * N1 + n * K +
                                0];  // The first neighbor is the center point
    const int kn = (int)knn_idx[b * K * N1 + n * K + k];
    if (kn >= N0 ||
        kn < 0) {  // if index overflows, it is out of the neighborhood range
      // Skip only this element. A `return` here would also abandon every
      // later element this thread is responsible for whenever the kernel loop
      // runs more than one iteration per thread.
      continue;
    }
    assert(b < B);
    assert(kn < N0);
    assert(cn < N0);
    assert(o < O);
    assert(n < N1);
    const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k;
    T val = output[out_idx];
    for (int m = 0; m < M; m++) {
      val += points[b * N0 * M * O + kn * M * O + m * O + o] *
                 scores[b * N1 * K * M + n * K * M + k * M + m] -
             centers[b * N0 * M * O + cn * M * O + m * O + o] *
                 scores[b * N1 * K * M + n * K * M + k * M + m];
    }
    output[out_idx] = val;
  }
}

// Backward pass of assign_score_withk with respect to `points` and `centers`.
//
// One loop iteration owns one (b, m, o) triple and walks over every (n, k)
// pair. For each pair whose neighbor index kn = knn_idx[b, n, k] lies in
// [0, N0) it atomically adds
//   +scores[b, n, k, m] * grad_out[b, o, n, k]  to grad_points[b, kn, m, o]
//   -scores[b, n, k, m] * grad_out[b, o, n, k]  to grad_centers[b, cn, m, o]
// with cn = knn_idx[b, n, 0]. `aggregate` is accepted but not read.
template <typename T>
__global__ void assign_score_withk_points_backward_musa_kernel(
    const int B, const int N0, const int N, const int M, const int K,
    const int O, const int aggregate, const T* grad_out, const T* scores,
    const int64_t* knn_idx, T* grad_points, T* grad_centers) {
  // ----- parallel loop for B, M, O ---------
  MUSA_1D_KERNEL_LOOP(i, B * M * O) {
    const int b = (int)(i / (M * O));
    const int m = (int)(i % (M * O) / O);
    const int o = (int)(i % O);

    // ----- loop for N,K ---------
    for (int n = 0; n < N; ++n) {
      for (int k = 0; k < K; ++k) {
        const int neighbor = knn_idx[b * N * K + n * K + k];
        const int center = knn_idx[b * N * K + n * K + 0];
        // An index outside [0, N0) is out of the neighborhood range.
        const bool in_range = (neighbor >= 0) && (neighbor < N0);
        if (!in_range) {
          continue;
        }

        const T score = scores[b * N * K * M + n * K * M + k * M + m];
        const T upstream = grad_out[b * O * N * K + o * N * K + n * K + k];

        T* points_slot =
            grad_points + b * N0 * M * O + neighbor * M * O + m * O + o;
        T* centers_slot =
            grad_centers + b * N0 * M * O + center * M * O + m * O + o;

        atomicAdd(points_slot, score * upstream);
        atomicAdd(centers_slot, -score * upstream);
      }
    }
  }
}

// Backward pass of assign_score_withk with respect to `scores`.
//
// One loop iteration handles one element (b, n, k, m) of `grad_scores`,
// indexed as (B, N, K, M). It adds to the value already stored there:
//   sum_o (points[b, kn, m, o] - centers[b, cn, m, o]) * grad_out[b, o, n, k]
// where kn = knn_idx[b, n, k] and cn = knn_idx[b, n, 0].
// Elements whose neighbor index kn lies outside [0, N0) are left untouched.
// `aggregate` is accepted but not read by this kernel.
template <typename T>
__global__ void assign_score_withk_scores_backward_musa_kernel(
    const int B, const int N0, const int N, const int M, const int K,
    const int O, const int aggregate, const T* grad_out, const T* points,
    const T* centers, const int64_t* knn_idx, T* grad_scores) {
  // ----- parallel loop for B, N, K, M ---------
  MUSA_1D_KERNEL_LOOP(i, B * N * K * M) {
    const int b = (int)(i / (N * M * K));
    const int n = (int)(i % (N * M * K) / M / K);
    const int k = (int)(i % (M * K) / M);
    const int m = (int)(i % M);
    const int cn = knn_idx[b * N * K + n * K + 0];
    const int kn = knn_idx[b * N * K + n * K + k];
    if (kn >= N0 ||
        kn < 0) {  // if index overflows, it is out of the neighborhood range
      // Skip only this element. A `return` here would also abandon every
      // later element this thread is responsible for whenever the kernel loop
      // runs more than one iteration per thread.
      continue;
    }

    // -------------- loop for O ------------------------
    const int out_idx = b * N * K * M + n * K * M + k * M + m;
    T val = grad_scores[out_idx];
    for (int o = 0; o < O; o++) {
      val += (points[b * N0 * M * O + kn * M * O + m * O + o] -
              centers[b * N0 * M * O + cn * M * O + m * O + o]) *
             grad_out[b * O * N * K + o * N * K + n * K + k];
    }
    grad_scores[out_idx] = val;
  }
}

#endif  // ASSIGN_SCORE_WITHK_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/ball_query_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu
#ifndef BALL_QUERY_MUSA_KERNEL_MUH
#define BALL_QUERY_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

// Ball query: for every query point collects up to `nsample` indices of
// points whose squared distance d2 to the query satisfies
//   d2 == 0  or  min_radius^2 <= d2 < max_radius^2.
//
//   new_xyz: (B, M, 3)  query points
//   xyz:     (B, N, 3)  candidate points
//   idx:     (B, M, nsample)  output indices
//
// The batch element is taken from blockIdx.y; the kernel loop runs over the
// m query points. When the first match is found, all `nsample` slots are
// pre-filled with it, so unused slots repeat the first match. If no candidate
// matches, the slots of that query are not written.
template <typename T>
__global__ void ball_query_forward_musa_kernel(int b, int n, int m,
                                               float min_radius,
                                               float max_radius, int nsample,
                                               const T* new_xyz, const T* xyz,
                                               int* idx) {
  const int bs_idx = blockIdx.y;
  if (bs_idx >= b) return;

  const float max_radius2 = max_radius * max_radius;
  const float min_radius2 = min_radius * min_radius;

  // Candidates of this batch element; identical for every query point.
  const T* batch_xyz = xyz + bs_idx * n * 3;

  MUSA_1D_KERNEL_LOOP(pt_idx, m) {
    // Derive per-query pointers from the untouched kernel arguments. Advancing
    // the arguments themselves with `+=` would accumulate the offsets and
    // address the wrong memory from the second loop iteration of a thread on.
    const T* query = new_xyz + bs_idx * m * 3 + pt_idx * 3;
    int* query_idx = idx + bs_idx * m * nsample + pt_idx * nsample;

    T new_x = query[0];
    T new_y = query[1];
    T new_z = query[2];

    int cnt = 0;
    for (int k = 0; k < n; ++k) {
      T x = batch_xyz[k * 3 + 0];
      T y = batch_xyz[k * 3 + 1];
      T z = batch_xyz[k * 3 + 2];
      T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
             (new_z - z) * (new_z - z);
      if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) {
        if (cnt == 0) {
          // First hit: pre-fill every slot so short result lists are padded.
          for (int l = 0; l < nsample; ++l) {
            query_idx[l] = k;
          }
        }
        query_idx[cnt] = k;
        ++cnt;
        if (cnt >= nsample) break;
      }
    }
  }
}

#endif  // BALL_QUERY_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/bbox_overlaps_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef BBOX_OVERLAPS_MUSA_KERNEL_MUH
#define BBOX_OVERLAPS_MUSA_KERNEL_MUH


#include "pytorch_musa_helper.hpp"

// Reads the four consecutive values starting at `bbox[base]` into
// x1, y1, x2, y2 (in that order), one scalar load each.
template <typename T>
__device__ __forceinline__ void load_bbox(const T* bbox, const int base, T& x1,
                                          T& y1, T& x2, T& y2) {
  const T* box = bbox + base;
  x1 = box[0];
  y1 = box[1];
  x2 = box[2];
  y2 = box[3];
}

// float specialization: fetches the four coordinates with a single float4
// read of `bbox + base` and unpacks .x/.y/.z/.w into x1, y1, x2, y2.
// NOTE(review): the float4 access presumably requires `bbox + base` to be
// suitably aligned for float4 -- confirm against the callers.
template <>
__device__ __forceinline__ void load_bbox<float>(const float* bbox,
                                                 const int base, float& x1,
                                                 float& y1, float& x2,
                                                 float& y2) {
  const float4 packed = *reinterpret_cast<const float4*>(bbox + base);
  x1 = packed.x;
  y1 = packed.y;
  x2 = packed.z;
  y2 = packed.w;
}

template <typename T>
__global__ void bbox_overlaps_musa_kernel(const T* bbox1, const T* bbox2,
                                          T* ious, const int num_bbox1,
                                          const int num_bbox2, const int mode,
                                          const bool aligned,
                                          const int offset) {
  if (aligned) {
    MUSA_1D_KERNEL_LOOP(index, num_bbox1) {
      const int b1 = index;
      const int b2 = index;

      const int base1 = b1 << 2;  // b1 * 4
      T b1_x1, b1_y1, b1_x2, b1_y2;
      load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);
      const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);

      const int base2 = b2 << 2;  // b2 * 4
      T b2_x1, b2_y1, b2_x2, b2_y2;
      load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);
      const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);

      const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
      const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
      const T width = fmaxf(right - left + offset, 0.f);
      const T height = fmaxf(bottom - top + offset, 0.f);
      const T interS = width * height;

      const T baseS =
          fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset));
      ious[index] = interS / baseS;
    }
  } else {
    MUSA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) {
      const int b1 = index / num_bbox2;
      const int b2 = index % num_bbox2;

      const int base1 = b1 << 2;  // b1 * 4
      T b1_x1, b1_y1, b1_x2, b1_y2;
      load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);
      const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);

      const int base2 = b2 << 2;  // b2 * 4
      T b2_x1, b2_y1, b2_x2, b2_y2;
      load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);
      const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);

      const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
      const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
      const T width = fmaxf(right - left + offset, 0.f);
      const T height = fmaxf(bottom - top + offset, 0.f);
      const T interS = width * height;

      const T baseS =
          fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset));
      ious[index] = interS / baseS;
    }
  }
}

// Half-precision box area: (x2 - x1 + offset) * (y2 - y1 + offset), built
// from the __hsub / __hadd / __hmul intrinsics.
__device__ __forceinline__ __half __half_area(const __half x1, const __half y1,
                                              const __half x2, const __half y2,
                                              const __half offset) {
  return __hmul(__hadd(__hsub(x2, x1), offset),
                __hadd(__hsub(y2, y1), offset));
}

// Returns `a` when __hge(a, b) holds, otherwise `b`.
__device__ __forceinline__ __half __half_max(const __half a, const __half b) {
  if (__hge(a, b)) {
    return a;
  }
  return b;
}

// Returns `a` when __hle(a, b) holds, otherwise `b`.
__device__ __forceinline__ __half __half_min(const __half a, const __half b) {
  if (__hle(a, b)) {
    return a;
  }
  return b;
}

// fp16 won't provide much increase when aligned==true. It is useful when
// aligned==false, which would give you ~40% bonus.
//
// Half-precision counterpart of the generic overlap computation, implemented
// entirely with __half intrinsics. This is a __device__ function, so it is
// meant to be called from a kernel rather than launched directly.
//   aligned == true : element i pairs bbox1[i] with bbox2[i].
//   aligned == false: element i pairs bbox1[i / num_bbox2] with
//                     bbox2[i % num_bbox2].
// Result: intersection / base, with base = area1 + area2 - intersection for
// mode == 0 and area1 otherwise, clamped from below by `offset`.
__device__ void bbox_overlaps_musa_kernel_half(
    const __half* bbox1, const __half* bbox2, __half* ious, const int num_bbox1,
    const int num_bbox2, const int mode, const bool aligned, const int offset) {
  const __half h_offset = __int2half_rn(offset);
  const int num_output = aligned ? num_bbox1 : num_bbox1 * num_bbox2;

  MUSA_1D_KERNEL_LOOP(index, num_output) {
    int b1 = index;
    int b2 = index;
    if (!aligned) {
      b1 = index / num_bbox2;
      b2 = index % num_bbox2;
    }

    __half ax1, ay1, ax2, ay2;
    load_bbox<__half>(bbox1, b1 << 2, ax1, ay1, ax2, ay2);
    const __half area_a = __half_area(ax1, ay1, ax2, ay2, h_offset);

    __half bx1, by1, bx2, by2;
    load_bbox<__half>(bbox2, b2 << 2, bx1, by1, bx2, by2);
    const __half area_b = __half_area(bx1, by1, bx2, by2, h_offset);

    const __half left = __half_max(ax1, bx1);
    const __half right = __half_min(ax2, bx2);
    const __half top = __half_max(ay1, by1);
    const __half bottom = __half_min(ay2, by2);

    // Extents are clamped at zero so disjoint boxes give an empty intersection.
    const __half width =
        __half_max(__hadd(__hsub(right, left), h_offset), __float2half(0.f));
    const __half height =
        __half_max(__hadd(__hsub(bottom, top), h_offset), __float2half(0.f));
    const __half inter = __hmul(width, height);

    const __half base = __half_max(
        mode == 0 ? __hsub(__hadd(area_a, area_b), inter) : area_a, h_offset);
    ious[index] = __hdiv(inter, base);
  }
}

#endif  // BBOX_OVERLAPS_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/bezier_align_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/aim-uofa/AdelaiDet/blob/master/adet/layers/csrc/BezierAlign/BezierAlign_cuda.cu
#ifndef BEZIER_ALIGN_MUSA_KERNEL_MUH
#define BEZIER_ALIGN_MUSA_KERNEL_MUH

#include <float.h>
#include "pytorch_musa_helper.hpp"

// Evaluates the cubic Bezier polynomial with control values p0..p3 at
// parameter u:
//   (1-u)^3 * p0 + 3u(1-u)^2 * p1 + 3u^2(1-u) * p2 + u^3 * p3
// The terms involving the double literals are evaluated in the type those
// literals promote to; the result is converted back to T on return.
template <typename T>
__device__ T bezier_curve(const T p0, const T p1, const T p2, const T p3,
                          const T u) {
  const auto w = 1. - u;
  const auto term0 = w * w * w * p0;
  const auto term1 = 3. * u * w * w * p1;
  const auto term2 = 3. * u * u * w * p2;
  const auto term3 = u * u * u * p3;
  return (term0 + term1 + term2 + term3);
}

// BezierAlign forward.
//
// Each loop iteration produces one pooled output element (n, c, ph, pw) of
// `top_data`. Every ROI row in `bottom_rois` holds 17 values: the batch index
// followed by 8 (x, y) control points describing two cubic Bezier curves. The
// sampling centre is obtained by evaluating both curves at u = pw /
// pooled_width and blending them linearly with v = ph / pooled_height; the
// output is the average of bilinear samples taken on a
// roi_bin_grid_h x roi_bin_grid_w grid around that centre.
template <typename T>
__global__ void bezier_align_forward_musa_kernel(
    const int nthreads,
    const T *bottom_data,  // inputs
    const T *bottom_rois,  // bottom rois contains the bezier curve
    T *top_data,           // outputs
    const int pooled_height, const int pooled_width, const T spatial_scale,
    const int sampling_ratio, bool aligned, const int channels,
    const int height, const int width) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // Unravel the flat index into (roi, channel, row, column).
    const int col = index % pooled_width;
    const int row = (index / pooled_width) % pooled_height;
    const int ch = (index / pooled_width / pooled_height) % channels;
    const int roi_idx = index / pooled_width / pooled_height / channels;

    // beziers have size Nx(1+8*2) = Nx17
    const T *roi = bottom_rois + roi_idx * 17;
    const int batch = roi[0];

    // Do not use rounding; this implementation detail is critical
    const T half_pixel = aligned ? (T)0.5 : (T)0.0;

    // First curve: control points stored at positions 1..8, in order.
    const T a0_x = roi[1] * spatial_scale;
    const T a0_y = roi[2] * spatial_scale;
    const T a1_x = roi[3] * spatial_scale;
    const T a1_y = roi[4] * spatial_scale;
    const T a2_x = roi[5] * spatial_scale;
    const T a2_y = roi[6] * spatial_scale;
    const T a3_x = roi[7] * spatial_scale;
    const T a3_y = roi[8] * spatial_scale;
    // Second curve: control points stored at positions 9..16, read in
    // reverse point order (15/16 first, 9/10 last).
    const T b0_x = roi[15] * spatial_scale;
    const T b0_y = roi[16] * spatial_scale;
    const T b1_x = roi[13] * spatial_scale;
    const T b1_y = roi[14] * spatial_scale;
    const T b2_x = roi[11] * spatial_scale;
    const T b2_y = roi[12] * spatial_scale;
    const T b3_x = roi[9] * spatial_scale;
    const T b3_y = roi[10] * spatial_scale;

    // Curve parameter along the width, blend factor along the height.
    const T u = col / static_cast<T>(pooled_width);
    const T v = row / static_cast<T>(pooled_height);
    const T x0 = bezier_curve(a0_x, a1_x, a2_x, a3_x, u);
    const T y0 = bezier_curve(a0_y, a1_y, a2_y, a3_y, u);
    const T x1 = bezier_curve(b0_x, b1_x, b2_x, b3_x, u);
    const T y1 = bezier_curve(b0_y, b1_y, b2_y, b3_y, u);
    const T x_center = x1 * v + x0 * (1. - v) - half_pixel;
    const T y_center = y1 * v + y0 * (1. - v) - half_pixel;

    // ROI extent: the larger end-point distance of the two curves.
    T roi_width = max(abs(a0_x - a3_x), abs(b0_x - b3_x));
    T roi_height = max(abs(a0_y - a3_y), abs(b0_y - b3_y));
    if (!aligned) {  // for backward-compatibility only
      roi_width = max(roi_width, (T)1.);
      roi_height = max(roi_height, (T)1.);
    }
    const T bin_size_h =
        static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    const T bin_size_w =
        static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    // Feature plane of this (batch, channel).
    const T *plane = bottom_data + (batch * channels + ch) * height * width;

    // Sampling grid per bin: fixed by sampling_ratio when positive, otherwise
    // derived from the ROI size.
    int grid_h;
    int grid_w;
    if (sampling_ratio > 0) {
      grid_h = sampling_ratio;
      grid_w = sampling_ratio;
    } else {
      grid_h = ceil(roi_height / pooled_height);
      grid_w = ceil(roi_width / pooled_width);
    }

    // Average pooling inside a bin. An empty grid yields 0/1 == 0, not NaN.
    const T count = max(grid_h * grid_w, 1);

    T accum = 0.;
    for (int iy = 0; iy < grid_h; ++iy) {
      const T y = y_center - (T)0.5 * bin_size_h +
                  static_cast<T>(iy + .5f) * bin_size_h /
                      static_cast<T>(grid_h);
      for (int ix = 0; ix < grid_w; ++ix) {
        const T x = x_center - (T)0.5 * bin_size_w +
                    static_cast<T>(ix + .5f) * bin_size_w /
                        static_cast<T>(grid_w);

        accum += bilinear_interpolate(plane, height, width, y, x, index);
      }
    }
    accum /= count;

    top_data[index] = accum;
  }
}

// BezierAlign backward.
//
// Each loop iteration takes the upstream gradient of one pooled element
// (n, c, ph, pw), recomputes the sampling positions used by the forward pass
// and atomically scatters gradient * bilinear weight / sample count onto the
// four neighbouring pixels of every sample in `bottom_diff`. Samples whose
// corner indices come back negative are skipped.
template <typename T>
__global__ void bezier_align_backward_musa_kernel(
    const int nthreads, const T *top_diff, const T *bottom_rois, T *bottom_diff,
    const int pooled_height, const int pooled_width, const T spatial_scale,
    const int sampling_ratio, bool aligned, const int channels,
    const int height, const int width) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // Unravel the flat index into (roi, channel, row, column).
    const int col = index % pooled_width;
    const int row = (index / pooled_width) % pooled_height;
    const int ch = (index / pooled_width / pooled_height) % channels;
    const int roi_idx = index / pooled_width / pooled_height / channels;

    // beziers have size Nx(1+8*2) = Nx17
    const T *roi = bottom_rois + roi_idx * 17;
    const int batch = roi[0];

    // Do not use rounding; this implementation detail is critical
    const T half_pixel = aligned ? (T)0.5 : (T)0.0;

    // First curve: control points stored at positions 1..8, in order.
    const T a0_x = roi[1] * spatial_scale;
    const T a0_y = roi[2] * spatial_scale;
    const T a1_x = roi[3] * spatial_scale;
    const T a1_y = roi[4] * spatial_scale;
    const T a2_x = roi[5] * spatial_scale;
    const T a2_y = roi[6] * spatial_scale;
    const T a3_x = roi[7] * spatial_scale;
    const T a3_y = roi[8] * spatial_scale;
    // Second curve: control points stored at positions 9..16, read in
    // reverse point order (15/16 first, 9/10 last).
    const T b0_x = roi[15] * spatial_scale;
    const T b0_y = roi[16] * spatial_scale;
    const T b1_x = roi[13] * spatial_scale;
    const T b1_y = roi[14] * spatial_scale;
    const T b2_x = roi[11] * spatial_scale;
    const T b2_y = roi[12] * spatial_scale;
    const T b3_x = roi[9] * spatial_scale;
    const T b3_y = roi[10] * spatial_scale;

    // Curve parameter along the width, blend factor along the height.
    const T u = col / static_cast<T>(pooled_width);
    const T v = row / static_cast<T>(pooled_height);
    const T x0 = bezier_curve(a0_x, a1_x, a2_x, a3_x, u);
    const T y0 = bezier_curve(a0_y, a1_y, a2_y, a3_y, u);
    const T x1 = bezier_curve(b0_x, b1_x, b2_x, b3_x, u);
    const T y1 = bezier_curve(b0_y, b1_y, b2_y, b3_y, u);
    const T x_center = x1 * v + x0 * (1. - v) - half_pixel;
    const T y_center = y1 * v + y0 * (1. - v) - half_pixel;

    // ROI extent: the larger end-point distance of the two curves.
    T roi_width = max(abs(a0_x - a3_x), abs(b0_x - b3_x));
    T roi_height = max(abs(a0_y - a3_y), abs(b0_y - b3_y));
    if (!aligned) {  // for backward-compatibility only
      roi_width = max(roi_width, (T)1.);
      roi_height = max(roi_height, (T)1.);
    }
    const T bin_size_h =
        static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    const T bin_size_w =
        static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    // Gradient plane of this (batch, channel).
    T *grad_plane = bottom_diff + (batch * channels + ch) * height * width;

    // Upstream gradient of the pooled element handled by this iteration.
    const int top_offset = (roi_idx * channels + ch) * pooled_height * pooled_width;
    const T *roi_top_diff = top_diff + top_offset;
    const T upstream = roi_top_diff[row * pooled_width + col];

    // Sampling grid per bin: fixed by sampling_ratio when positive, otherwise
    // derived from the ROI size.
    int grid_h;
    int grid_w;
    if (sampling_ratio > 0) {
      grid_h = sampling_ratio;
      grid_w = sampling_ratio;
    } else {
      grid_h = ceil(roi_height / pooled_height);
      grid_w = ceil(roi_width / pooled_width);
    }

    // Number of samples averaged inside a bin. It is only used as a divisor
    // inside the loops, which do not execute when the grid is empty.
    const T count = grid_h * grid_w;

    for (int iy = 0; iy < grid_h; ++iy) {
      const T y = y_center - (T)0.5 * bin_size_h +
                  static_cast<T>(iy + .5f) * bin_size_h /
                      static_cast<T>(grid_h);
      for (int ix = 0; ix < grid_w; ++ix) {
        const T x = x_center - (T)0.5 * bin_size_w +
                    static_cast<T>(ix + .5f) * bin_size_w /
                        static_cast<T>(grid_w);

        T w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;
        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
                                      x_low, x_high, y_low, y_high, index);

        // Skip samples for which any corner index is negative.
        const bool corners_valid =
            x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0;
        if (!corners_valid) {
          continue;
        }

        atomicAdd(grad_plane + y_low * width + x_low,
                  static_cast<T>(upstream * w1 / count));
        atomicAdd(grad_plane + y_low * width + x_high,
                  static_cast<T>(upstream * w2 / count));
        atomicAdd(grad_plane + y_high * width + x_low,
                  static_cast<T>(upstream * w3 / count));
        atomicAdd(grad_plane + y_high * width + x_high,
                  static_cast<T>(upstream * w4 / count));
      }  // ix
    }    // iy
  }      // MUSA_1D_KERNEL_LOOP
}  // BezierAlignBackward

#endif  // BEZIER_ALIGN_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/border_align_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/csrc/border_align/border_align_kernel.cu.
// the main difference: (1) use `argmax_idx` for fast computing of gradient
// during the backward. (2) `wh` is directly computed by `boxes`, rather than
// passing it as argument to forward or backward functions.

#ifndef BORDER_ALIGN_MUSA_KERNEL_MUH
#define BORDER_ALIGN_MUSA_KERNEL_MUH

#include <float.h>
#include "pytorch_musa_helper.hpp"

// Border selector. The border_align kernels compare these values against
// threadIdx.y to pick which border (and which quarter of the input channels)
// a thread works on.
enum BorderMode { Top = 0, Left = 1, Bottom = 2, Right = 3 };

/*** Forward ***/
// BorderAlign forward.
//
// `index` addresses one (batch, channel, box) triple and threadIdx.y selects
// the border (`extreme_idx`, see BorderMode). The thread samples the input
// with bilinear interpolation at pool_size + 1 positions along that border,
// starting from (x1, y1) for Top/Left and (x2, y2) for Bottom/Right, and
// stores the maximum in `output` and the position index of the maximum in
// `argmax_idx`.
//   input:  (N, 4C, h, w); channels [0,C) top, [C,2C) left, [2C,3C) bottom,
//           [3C,4C) right
//   boxes:  (N, box_size, 4)
//   output / argmax_idx: (N, C, box_size, 4)
template <typename T>
__global__ void border_align_forward_musa_kernel(
    const int nthreads, const T* input, const T* boxes, T* output,
    int* argmax_idx, const int channels, const int box_size, const int height,
    const int width, const int pool_size) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    const int extreme_idx = threadIdx.y;
    // shape (N, C, box_size, 4) for output
    const int batch_idx = index / channels / box_size;
    // shape (N, box_size, 4) for boxes
    const int box_idx = index % box_size + batch_idx * box_size;
    const int c_idx = (index / box_size) % channels;

    const T* offset_box = boxes + box_idx * 4;
    const T box_width = offset_box[2] - offset_box[0];
    const T box_height = offset_box[3] - offset_box[1];

    // Feature plane holding this border's channel.
    const T* offset_input =
        input + (batch_idx * channels * 4 + extreme_idx * channels + c_idx) *
                    height * width;

    // extreme_idx in [0,1] -> start corner is (x1, y1)
    // extreme_idx in [2,3] -> start corner is (x2, y2)
    const T* start_corner = offset_box + extreme_idx / 2 * 2;
    T x = start_corner[0];
    T y = start_corner[1];

    // Step between consecutive samples; only the four BorderMode values
    // assign it.
    T x_stride, y_stride;
    if (extreme_idx == BorderMode::Top) {
      x_stride = box_width / pool_size;
      y_stride = 0;
    } else if (extreme_idx == BorderMode::Left) {
      x_stride = 0;
      y_stride = box_height / pool_size;
    } else if (extreme_idx == BorderMode::Bottom) {
      x_stride = -(box_width / pool_size);
      y_stride = 0;
    } else if (extreme_idx == BorderMode::Right) {
      x_stride = 0;
      y_stride = -(box_height / pool_size);
    }

    // The start position seeds the running maximum.
    T maxval = bilinear_interpolate(offset_input, height, width, y, x, index);
    int maxidx = 0;

    // Max-pool along the border.
    for (int step = 1; step <= pool_size; ++step) {
      x += x_stride;
      y += y_stride;
      const T val =
          bilinear_interpolate(offset_input, height, width, y, x, index);
      if (val > maxval) {
        maxval = val;
        maxidx = step;
      }
    }

    // Publish the maximum and where it was found.
    output[index * 4 + extreme_idx] = maxval;
    argmax_idx[index * 4 + extreme_idx] = maxidx;
  }
}

/*** Backward ***/
// BorderAlign backward.
//
// `index` addresses one (batch, channel, box) triple and threadIdx.y selects
// the border (`extreme_idx`, see BorderMode). The thread reconstructs the
// position that produced the forward maximum from `argmax_idx`, obtains its
// bilinear weights and atomically adds grad_output * weight to the four
// neighbouring pixels of `grad_input`.
//   grad_output / argmax_idx: (N, C, box_size, 4)
//   boxes:      (N, box_size, 4)
//   grad_input: (N, 4C, h, w); channels [0,C) top, [C,2C) left,
//               [2C,3C) bottom, [3C,4C) right
template <typename T>
__global__ void border_align_backward_musa_kernel(
    const int nthreads, const T* grad_output, const T* boxes,
    const int* argmax_idx, T* grad_input, const int channels,
    const int box_size, const int height, const int width,
    const int pool_size) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // (batch_idx, c_idx, box_idx) is an element paralleled for computing
    // output, and `extreme_idx` is in range [0,3]
    int batch_idx, c_idx, box_idx, extreme_idx;
    const int* offset_argmax_idx;
    const T *offset_grad_output, *offset_box, *offset_box_x;
    T *offset_grad_input, box_width, box_height, stride, x_stride, y_stride, x,
        y;

    extreme_idx = threadIdx.y;
    batch_idx = index / channels / box_size;
    box_idx = index % box_size + batch_idx * box_size;
    c_idx = (index / box_size) % channels;

    offset_box = boxes + box_idx * 4;
    box_width = *(offset_box + 2) - *offset_box;
    box_height = *(offset_box + 3) - *(offset_box + 1);
    offset_grad_output = grad_output + index * 4 + extreme_idx;
    offset_argmax_idx = argmax_idx + index * 4 + extreme_idx;
    // [0,C) for top feature grad, [C,2C) for left feature grad,
    // [2C,3C) for bottom feature grad, [3C,4C) for right feature grad
    offset_grad_input = grad_input + (batch_idx * channels * 4 +
                                      extreme_idx * channels + c_idx) *
                                         height * width;

    // extreme_idx in [0,1] -> offset_box_x indexed at x1
    // extreme_idx in [2,3] -> offset_box_x indexed at x2
    offset_box_x = offset_box + extreme_idx / 2 * 2;

    switch (extreme_idx) {
      // top
      case BorderMode::Top:
        stride = box_width / pool_size;
        x_stride = stride;
        y_stride = 0;
        break;
      // left
      case BorderMode::Left:
        stride = box_height / pool_size;
        x_stride = 0;
        y_stride = stride;
        break;
      // bottom
      case BorderMode::Bottom:
        stride = box_width / pool_size;
        x_stride = -stride;
        y_stride = 0;
        break;
      // right
      case BorderMode::Right:
        stride = box_height / pool_size;
        x_stride = 0;
        y_stride = -stride;
        break;
    }

    // get position (x,y) which has maximum value during forward
    x = *offset_box_x;
    y = *(offset_box_x + 1);
    x += x_stride * (T)(*offset_argmax_idx);
    y += y_stride * (T)(*offset_argmax_idx);

    T w1, w2, w3, w4;
    int x_low, x_high, y_low, y_high;
    bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low,
                                  x_high, y_low, y_high, index);

    // update grad_input
    // A negative corner index would turn `y * width + x` into an address in
    // front of this gradient plane, so such samples are skipped instead of
    // being accumulated.
    // NOTE(review): the helper presumably reports a sample outside the
    // feature map through negative indices with zero weights -- confirm
    // against bilinear_interpolate_gradient.
    if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
      atomicAdd(offset_grad_input + y_low * width + x_low,
                *offset_grad_output * w1);
      atomicAdd(offset_grad_input + y_low * width + x_high,
                *offset_grad_output * w2);
      atomicAdd(offset_grad_input + y_high * width + x_low,
                *offset_grad_output * w3);
      atomicAdd(offset_grad_input + y_high * width + x_high,
                *offset_grad_output * w4);
    }
  }
}

#endif  // BORDER_ALIGN_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/box_iou_quadri_musa.muh
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#ifndef BOX_IOU_QUADRI_MUSA_MUH
#define BOX_IOU_QUADRI_MUSA_MUH


#include "pytorch_musa_helper.hpp"
#include "box_iou_rotated_utils.hpp"

// 2D block with 32 * 16 = 512 threads per block
// (BLOCK_DIM_X is the extent along x, BLOCK_DIM_Y the extent along y).
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 16;

// Integer division of x by y rounded up, computed as (x + y - 1) / y.
inline int divideUP(const int x, const int y) {
  const int padded = x + y - 1;
  return padded / y;
}

// Computes IoU values for quadrilateral boxes stored as 8 values per box.
//   aligned == true : output[i] pairs dev_boxes1[i] with dev_boxes2[i]
//                     (n_boxes1 outputs).
//   aligned == false: every box of dev_boxes1 is paired with every box of
//                     dev_boxes2; output index is b1 * n_boxes2 + b2.
// `mode_flag` is forwarded unchanged to single_box_iou_quadri.
template <typename T>
__global__ void box_iou_quadri_musa_kernel(
    const int n_boxes1, const int n_boxes2, const T* dev_boxes1,
    const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) {
  const int kValuesPerBox = 8;
  const int num_outputs = aligned ? n_boxes1 : n_boxes1 * n_boxes2;

  MUSA_1D_KERNEL_LOOP(index, num_outputs) {
    // Resolve which pair of boxes this output element refers to.
    const int first = aligned ? index : index / n_boxes2;
    const int second = aligned ? index : index % n_boxes2;

    const T* src1 = dev_boxes1 + first * kValuesPerBox;
    const T* src2 = dev_boxes2 + second * kValuesPerBox;

    // Stage both boxes in thread-local float arrays.
    float quad1[kValuesPerBox];
    float quad2[kValuesPerBox];
#pragma unroll
    for (int k = 0; k < kValuesPerBox; ++k) {
      quad1[k] = src1[k];
    }
#pragma unroll
    for (int k = 0; k < kValuesPerBox; ++k) {
      quad2[k] = src2[k];
    }

    dev_ious[index] = single_box_iou_quadri<T>(quad1, quad2, mode_flag);
  }
}

#endif


================================================
FILE: mmcv/ops/csrc/common/musa/box_iou_rotated_musa.muh
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
#ifndef BOX_IOU_ROTATED_MUSA_MUH
#define BOX_IOU_ROTATED_MUSA_MUH

#include "pytorch_musa_helper.hpp"
#include "box_iou_rotated_utils.hpp"

// 2D block with 32 * 16 = 512 threads per block
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 16;

// Ceiling division: number of chunks of size `y` needed to cover `x`.
// Intended for non-negative `x` and positive `y`. The sum is formed in
// 64-bit so that `x + y - 1` cannot overflow `int` when `x` is close to
// INT_MAX; for inputs that did not overflow before, the result is unchanged.
inline int divideUP(const int x, const int y) {
  const long long padded = static_cast<long long>(x) + y - 1;
  return static_cast<int>(padded / y);
}

// Computes IoU values for rotated boxes stored as 5 values per box.
//   aligned == true : output[i] pairs dev_boxes1[i] with dev_boxes2[i]
//                     (n_boxes1 outputs).
//   aligned == false: every box of dev_boxes1 is paired with every box of
//                     dev_boxes2; output index is b1 * n_boxes2 + b2.
// `mode_flag` is forwarded unchanged to single_box_iou_rotated.
template <typename T>
__global__ void box_iou_rotated_musa_kernel(
    const int n_boxes1, const int n_boxes2, const T* dev_boxes1,
    const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) {
  const int kValuesPerBox = 5;
  const int num_outputs = aligned ? n_boxes1 : n_boxes1 * n_boxes2;

  MUSA_1D_KERNEL_LOOP(index, num_outputs) {
    // Resolve which pair of boxes this output element refers to.
    const int first = aligned ? index : index / n_boxes2;
    const int second = aligned ? index : index % n_boxes2;

    const T* src1 = dev_boxes1 + first * kValuesPerBox;
    const T* src2 = dev_boxes2 + second * kValuesPerBox;

    // Stage both boxes in thread-local float arrays.
    float box1[kValuesPerBox];
    float box2[kValuesPerBox];
#pragma unroll
    for (int k = 0; k < kValuesPerBox; ++k) {
      box1[k] = src1[k];
    }
#pragma unroll
    for (int k = 0; k < kValuesPerBox; ++k) {
      box2[k] = src2[k];
    }

    dev_ious[index] = single_box_iou_rotated<T>(box1, box2, mode_flag);
  }
}

#endif


================================================
FILE: mmcv/ops/csrc/common/musa/carafe_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef CARAFE_MUSA_KERNEL_MUH
#define CARAFE_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

// Warp width assumed by the kernels below: 64 when MMCV_WITH_HIP is defined,
// 32 otherwise.
#ifdef MMCV_WITH_HIP
#define WARP_SIZE 64
#else
#define WARP_SIZE 32
#endif
// Number of consecutive threads that share one output pixel; the kernels
// derive the pixel slot as threadIdx.x / THREADS_PER_PIXEL.
#define THREADS_PER_PIXEL 32
// Presumably the shared-memory budget per block in bytes (48 KiB) - confirm
// against the target device.
#define MAX_SHARED_MEMORY 49152
// Element count used to size the shared mask caches in this file.
#define MAX_SHARED_SCALAR_T 6144  // 49152 / 8 = 6144
// When true, the mask caches are declared as float[MAX_SHARED_SCALAR_T * 2]
// instead of scalar_t[MAX_SHARED_SCALAR_T].
#define MAXIMIZE_KERNEL_SIZE true
// Tile edge length and row step used by BatchTranspose2DMUSAKernel.
#define kTileDim 32
#define kBlockRows 8
// Lane mask passed to __shfl_down_sync (all 32 bits set).
#define FULL_MASK 0xffffffff

// Ceiling division: number of chunks of size `y` needed to cover `x`.
// Intended for non-negative `x` and positive `y`. The sum is formed in
// 64-bit so that `x + y - 1` cannot overflow `int` when `x` is close to
// INT_MAX; for inputs that did not overflow before, the result is unchanged.
inline int divideUP(const int x, const int y) {
  const long long padded = static_cast<long long>(x) + y - 1;
  return static_cast<int>(padded / y);
}

// Flattens a 4-D coordinate (n, c, h, w) into a linear offset for a dense
// array whose three inner extents are (channel_num, height, width).
// Callers in this file also pass permuted arguments to address arrays whose
// innermost dimension is the channel.
__device__ inline int Loc2Index(const int n, const int c, const int h,
                                const int w, const int channel_num,
                                const int height, const int width) {
  const int plane = n * channel_num + c;
  const int row = plane * height + h;
  return row * width + w;
}
#ifndef MMCV_WITH_HIP
/* TODO: move this to a common place */
// Device-side minimum: returns `a` only when `a < b`, otherwise `b`.
template <typename scalar_t>
__device__ inline scalar_t min(scalar_t a, scalar_t b) {
  if (a < b) {
    return a;
  }
  return b;
}

// Device-side maximum: returns `a` only when `a > b`, otherwise `b`.
template <typename scalar_t>
__device__ inline scalar_t max(scalar_t a, scalar_t b) {
  if (a > b) {
    return a;
  }
  return b;
}
#endif
// Sums `val` across a warp with a halving shuffle-down pattern: at every step
// each lane adds the value fetched from the lane `offset` positions away.
// Callers in this file consume the result from lane 0.
template <typename scalar_t>
__device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) {
  int offset = WARP_SIZE / 2;
  while (offset > 0) {
#ifdef MMCV_WITH_HIP
    val += __shfl_down(val, offset);
#else
    val += __shfl_down_sync(FULL_MASK, val, offset);
#endif
    offset /= 2;
  }
  return val;
}

// Half-precision specialization of warpReduceSum: same halving shuffle-down
// pattern, with the value accessed through the __PHALF wrapper macro.
template <>
__device__ __forceinline__ phalf warpReduceSum(phalf val) {
  int offset = WARP_SIZE / 2;
  while (offset > 0) {
#ifdef MMCV_WITH_HIP
    // Using PyTorch's macro for half support
    __PHALF(val) += WARP_SHFL_DOWN(val, offset);
#else
    __PHALF(val) +=
        __shfl_down_sync(FULL_MASK, __PHALF(val).operator __half(), offset);
#endif
    offset /= 2;
  }
  return val;
}

// Batched 2-D transpose: X holds N matrices of H x W elements (row-major),
// Y receives N matrices of W x H elements.
// Each matrix is split into kTileDim x kTileDim tiles; one block transposes
// one tile by staging it in shared memory.
// blockIdx.x encodes (matrix n, tile row r, tile column c), with dh * dw
// tiles per matrix and dw tiles per tile row.
// Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/
template <typename scalar_t>
__global__ void BatchTranspose2DMUSAKernel(const int N, const int H,
                                           const int W, const int dh,
                                           const int dw,
                                           const scalar_t *__restrict__ X,
                                           scalar_t *__restrict__ Y) {
  // One extra column per row; presumably padding against shared-memory bank
  // conflicts as in the referenced article.
  __shared__ scalar_t tile[kTileDim][kTileDim + 1];
  const int n = blockIdx.x / (dh * dw);
  const int k = blockIdx.x % (dh * dw);
  const int r = k / dw;
  const int c = k % dw;
  // Start of matrix n in both X and Y (both hold H * W elements per matrix).
  const int offset = n * H * W;
  // Load phase: (x, y) is a column/row position inside the source matrix.
  int x = c * kTileDim + threadIdx.x;
  int y = r * kTileDim + threadIdx.y;
  if (x < W) {
    // Each thread copies rows threadIdx.y, threadIdx.y + kBlockRows, ...
    // of the tile, stopping at the tile edge or the matrix edge.
    for (int i = 0; threadIdx.y + i < kTileDim && y + i < H; i += kBlockRows) {
      tile[threadIdx.y + i][threadIdx.x] = X[offset + (y + i) * W + x];
    }
  }
  // All loads must be visible before any thread reads the tile transposed.
  __syncthreads();
  // Store phase: (x, y) is now a column/row position inside the W x H output.
  x = r * kTileDim + threadIdx.x;
  y = c * kTileDim + threadIdx.y;
  if (x < H) {
    for (int i = 0; threadIdx.y + i < kTileDim && y + i < W; i += kBlockRows) {
      // Swapped tile indices perform the actual transpose.
      Y[offset + (y + i) * H + x] = tile[threadIdx.x][threadIdx.y + i];
    }
  }
}

// CARAFE forward pass: each output pixel (n, ph, pw) is a weighted sum of a
// kernel_size x kernel_size neighbourhood of the low-resolution feature map,
// centred on (ph / scale_factor, pw / scale_factor), with weights taken from
// bottom_masks at that output pixel.
//
// All three arrays are addressed channel-last here: Loc2Index is called with
// (n, row, col, channel, rows, cols, channels).
//
// Thread mapping: THREADS_PER_PIXEL consecutive threads share one pixel;
// split_id selects the channels a thread handles (c = split_id,
// split_id + THREADS_PER_PIXEL, ...), pixel_id is the pixel's slot inside
// the block's shared mask cache.
template <typename scalar_t>
__global__ void CARAFEForward(
    const int num_kernels, const scalar_t *__restrict__ bottom_data,
    const scalar_t *__restrict__ bottom_masks, const int kernel_size,
    const int group_size, const int scale_factor, const int channels,
    const int down_height, const int down_width, const int height,
    const int width, const int mask_channels, scalar_t *__restrict__ top_data) {
  // Block-local cache of the mask weights, laid out as
  // [mask channel][pixel slot] with a row stride of WARP_SIZE.
#if MAXIMIZE_KERNEL_SIZE
  __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2];
#else
  __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T];
#endif


  int index = threadIdx.x + blockIdx.x * blockDim.x;
  // NOTE(review): threads past num_kernels leave before the __syncthreads()
  // below - confirm this is safe for the launch configurations in use.
  if (index > num_kernels - 1) {
    return;
  }
  const int pixel_id = threadIdx.x / THREADS_PER_PIXEL;
  const int split_id = threadIdx.x % THREADS_PER_PIXEL;
  // From here on `index` identifies the output pixel, not the thread.
  index = index / THREADS_PER_PIXEL;
  const int pw = index % width;
  const int ph = (index / width) % height;
  const int n = index / width / height;

  // Corresponding position in the low-resolution feature map.
  const int down_pw = pw / scale_factor;
  const int down_ph = ph / scale_factor;

  // Half-open neighbourhood [start, end) around the low-resolution position.
  const int start_w = down_pw - (kernel_size - 1) / 2;
  const int end_w = down_pw + (kernel_size - 1) / 2 + 1;
  const int start_h = down_ph - (kernel_size - 1) / 2;
  const int end_h = down_ph + (kernel_size - 1) / 2 + 1;
  // Cooperatively load this pixel's mask weights into shared memory.
  for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) {
    int mask_index = Loc2Index(n, ph, pw, c, height, width, mask_channels);
    shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index];
  }
  __syncthreads();


  // Channels are divided into group_size groups that each share one mask.
  const int channels_per_group = ceilf(channels / (float)group_size);
#pragma unroll
  for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
    int mask_group = c / channels_per_group;
    scalar_t output_val = 0;
#pragma unroll
    for (int iy = start_h; iy < end_h; iy++) {
#pragma unroll
      for (int ix = start_w; ix < end_w; ix++) {
        // Neighbours outside the low-resolution map contribute nothing.
        if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {
          continue;
        }
        // Position of this neighbour inside the kernel window.
        int mask_iy = iy - down_ph + (kernel_size - 1) / 2;
        int mask_ix = ix - down_pw + (kernel_size - 1) / 2;
        int mask_c =
            (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
        int feat_index =
            Loc2Index(n, iy, ix, c, down_height, down_width, channels);

        output_val += bottom_data[feat_index] *
                      shared_mask[mask_c * WARP_SIZE + pixel_id];
      }
    }

    int top_index = Loc2Index(n, ph, pw, c, height, width, channels);
    top_data[top_index] = output_val;
  }
}

// CARAFE backward pass w.r.t. the features, evaluated on the high-resolution
// grid: for each pixel (n, ph, pw) and channel c, accumulates top_diff over a
// window of kernel_size x kernel_size taps spaced scale_factor apart,
// weighted by mask values gathered from the tapped positions.
//
// Layouts as addressed here: bottom_masks is read channel-first
// (n, mask_c, y, x); top_diff and bottom_diff are addressed channel-last
// (n, y, x, c).
//
// Thread mapping: THREADS_PER_PIXEL consecutive threads share one pixel;
// split_id strides over channels, pixel_id is the pixel's slot in the shared
// mask cache.
template <typename scalar_t>
__global__ void CARAFEBackward_Feature(
    const int num_kernels, const scalar_t *__restrict__ top_diff,
    const scalar_t *__restrict__ bottom_masks, const int kernel_size,
    const int group_size, const int scale_factor, const int channels,
    const int down_height, const int down_width, const int height,
    const int width, const int mask_channels,
    scalar_t *__restrict__ bottom_diff) {
  // Block-local cache of mask weights, [mask channel][pixel slot] with a row
  // stride of WARP_SIZE.
#if MAXIMIZE_KERNEL_SIZE
  __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2];
#else
  __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T];
#endif

  int index = threadIdx.x + blockIdx.x * blockDim.x;
  // NOTE(review): threads past num_kernels leave before the __syncthreads()
  // below - confirm this is safe for the launch configurations in use.
  if (index > num_kernels - 1) {
    return;
  }

  const int pixel_id = threadIdx.x / THREADS_PER_PIXEL;
  const int split_id = threadIdx.x % THREADS_PER_PIXEL;
  // (n, c, ph, pw) is an element in the bottom_data
  index = index / THREADS_PER_PIXEL;
  const int pw = index % width;
  const int ph = (index / width) % height;
  const int n = index / width / height;

  // Half-open window [start, end) on the high-resolution grid; taps are
  // visited every scale_factor pixels.
  const int start_w = pw - (kernel_size - 1) * scale_factor / 2;
  const int end_w = pw + (kernel_size - 1) * scale_factor / 2 + 1;
  const int start_h = ph - (kernel_size - 1) * scale_factor / 2;
  const int end_h = ph + (kernel_size - 1) * scale_factor / 2 + 1;
  // Gather the mask weight for every (group, tap) into shared memory.
  for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) {
    // Tap offset encoded by mask channel c, scaled to the high-res grid.
    const int mask_w = (c % kernel_size) * scale_factor;
    const int mask_h = (c / kernel_size % kernel_size) * scale_factor;
    const int mask_x = start_w + mask_w;
    const int mask_y = start_h + mask_h;
    // Taps that fall outside the image get a zero weight.
    if (mask_y < 0 || mask_y > height - 1 || mask_x < 0 || mask_x > width - 1) {
      shared_mask[c * WARP_SIZE + pixel_id] = 0;
      continue;
    }
    const int mask_group = c / (kernel_size * kernel_size);
    // Mirror the tap index inside its group: tap r of a group maps to tap
    // (kernel_size^2 - 1 - r) of the same group.
    const int mask_c = (2 * mask_group + 1) * kernel_size * kernel_size - c - 1;
    int mask_index =
        Loc2Index(n, mask_c, mask_y, mask_x, mask_channels, height, width);
    shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index];
  }
  __syncthreads();
  // Channels are divided into group_size groups that each share one mask.
  const int channels_per_group = ceilf(channels / (float)group_size);
#pragma unroll
  for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
    int mask_group = c / channels_per_group;
    int top_index = Loc2Index(n, ph, pw, c, height, width, channels);
    scalar_t output_val = 0;
#pragma unroll
    for (int iy = start_h; iy < end_h; iy += scale_factor) {
#pragma unroll
      for (int ix = start_w; ix < end_w; ix += scale_factor) {
        if (iy < 0 || iy > height - 1 || ix < 0 || ix > width - 1) {
          continue;
        }
        // Convert the tap position back to a kernel-window index.
        int mask_iy =
            (iy - ph + (kernel_size - 1) * scale_factor / 2) / scale_factor;
        int mask_ix =
            (ix - pw + (kernel_size - 1) * scale_factor / 2) / scale_factor;
        int mask_c =
            (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
        int feat_index = Loc2Index(n, iy, ix, c, height, width, channels);
        output_val +=
            shared_mask[mask_c * WARP_SIZE + pixel_id] * top_diff[feat_index];
      }
    }
    // Written at high resolution; FeatureSum can fold scale_factor x
    // scale_factor cells into one (presumably applied by the caller).
    bottom_diff[top_index] = output_val;
  }
}

// Sum-pools a channel-last array by scale_factor in both spatial directions:
// every output element (n, ph, pw, c) is the sum of the matching
// scale_factor x scale_factor cell of input_data, whose spatial extent is
// (height * scale_factor, width * scale_factor).
// THREADS_PER_PIXEL consecutive threads share one output pixel and stride
// over its channels.
template <typename scalar_t>
__global__ void FeatureSum(const int num_kernels,
                           const scalar_t *__restrict__ input_data,
                           const int scale_factor, const int channels,
                           const int height, const int width,
                           scalar_t *__restrict__ output_data) {
  const int thread_id = threadIdx.x + blockIdx.x * blockDim.x;
  if (thread_id > num_kernels - 1) {
    return;
  }

  const int first_channel = threadIdx.x % THREADS_PER_PIXEL;
  const int pixel = thread_id / THREADS_PER_PIXEL;
  const int pw = pixel % width;
  const int ph = (pixel / width) % height;
  const int n = pixel / width / height;

  // Extent of the input and the cell covered by this output pixel.
  const int in_height = height * scale_factor;
  const int in_width = width * scale_factor;
  const int y_begin = ph * scale_factor;
  const int y_end = (ph + 1) * scale_factor;
  const int x_begin = pw * scale_factor;
  const int x_end = (pw + 1) * scale_factor;

  for (int c = first_channel; c < channels; c += THREADS_PER_PIXEL) {
    scalar_t cell_sum = 0;
    for (int iy = y_begin; iy < y_end; ++iy) {
      for (int ix = x_begin; ix < x_end; ++ix) {
        cell_sum +=
            input_data[Loc2Index(n, iy, ix, c, in_height, in_width, channels)];
      }
    }
    output_data[Loc2Index(n, ph, pw, c, height, width, channels)] = cell_sum;
  }
}

// CARAFE backward pass w.r.t. the masks: for every output pixel (n, ph, pw)
// and mask channel mask_c, computes the dot product over the channels of the
// mask's group between top_diff at the pixel and bottom_data at the
// low-resolution neighbour that mask_c refers to.
//
// Thread mapping: WARP_SIZE consecutive global thread indices share one
// (pixel, mask channel) pair; each lane handles a strided subset of the
// group's channels and the partial sums are combined with warpReduceSum.
// All arrays are addressed channel-last (n, y, x, c) here.
template <typename scalar_t>
__global__ void CARAFEBackward_Mask(const int num_kernels,
                                    const scalar_t *__restrict__ top_diff,
                                    const scalar_t *__restrict__ bottom_data,
                                    const int kernel_size, const int group_size,
                                    const int scale_factor, const int channels,
                                    const int down_height, const int down_width,
                                    const int height, const int width,
                                    const int mask_channels,
                                    scalar_t *__restrict__ mask_diff) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  // NOTE(review): threads past num_kernels leave before the warp-level
  // synchronisation and shuffle below - confirm num_kernels is always a
  // multiple of WARP_SIZE for the launches in use.
  if (index > num_kernels - 1) {
    return;
  }

  // Peel off, in order: lane, mask channel, then the pixel coordinates.
  const int lane_id = index % WARP_SIZE;
  index = index / WARP_SIZE;
  const int mask_c = index % mask_channels;
  // (n, c, ph, pw) is an element in the bottom_data
  index = index / mask_channels;
  const int pw = index % width;
  const int ph = (index / width) % height;
  const int n = index / width / height;

  // Corresponding position in the low-resolution feature map.
  const int down_pw = pw / scale_factor;
  const int down_ph = ph / scale_factor;

  // Split the mask channel into (group, tap inside the kernel window).
  const int mask_group = mask_c / (kernel_size * kernel_size);
  const int mask_loc = mask_c % (kernel_size * kernel_size);

  // Tap offset relative to the window centre.
  const int offset_x = mask_loc % kernel_size - (kernel_size - 1) / 2;
  const int offset_y =
      mask_loc / kernel_size % kernel_size - (kernel_size - 1) / 2;

  const int down_x = down_pw + offset_x;
  const int down_y = down_ph + offset_y;

  scalar_t output_val = 0;

  // Taps outside the low-resolution map keep a zero partial sum.
  if (down_y >= 0 && down_y <= down_height - 1 && down_x >= 0 &&
      down_x <= down_width - 1) {
    // Channel range [start, end) belonging to this mask group.
    const int channels_per_mask = ceilf(channels / (float)group_size);
    const int start = channels_per_mask * mask_group;
    const int end = min(channels_per_mask * (mask_group + 1), channels);
    for (int c = start + lane_id; c < end; c += WARP_SIZE) {
      int bottom_id =
          Loc2Index(n, down_y, down_x, c, down_height, down_width, channels);
      int top_id = Loc2Index(n, ph, pw, c, height, width, channels);
      output_val += top_diff[top_id] * bottom_data[bottom_id];
    }
  }
#ifdef MMCV_WITH_HIP
  __syncthreads();
#else
  __syncwarp();
#endif
  // Combine the per-lane partial sums; lane 0 writes the result.
  output_val = warpReduceSum(output_val);
  if (lane_id == 0) {
    const int mask_id =
        Loc2Index(n, ph, pw, mask_c, height, width, mask_channels);
    mask_diff[mask_id] = output_val;
  }
}

#endif  // CARAFE_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/carafe_naive_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef CARAFE_NAIVE_MUSA_KERNEL_MUH
#define CARAFE_NAIVE_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

// Linear offset of element (n, c, h, w) in a dense array whose three inner
// extents are (channel_num, height, width), with w varying fastest.
__device__ inline int Loc2Index(const int n, const int c, const int h,
                                const int w, const int channel_num,
                                const int height, const int width) {
  return ((n * channel_num + c) * height + h) * width + w;
}

// Naive CARAFE forward pass, one output element per loop iteration.
// `index` enumerates the output in (n, c, ph, pw) order; the result is the
// sum over a kernel_size x kernel_size window of the low-resolution feature
// map (height / scale_factor by width / scale_factor), weighted by the mask
// values stored for this output pixel. Window positions outside the
// low-resolution map are skipped.
template <typename scalar_t>
__global__ void carafe_naive_forward_musa_kernel(
    const int nthreads, const scalar_t *bottom_data,
    const scalar_t *bottom_masks, scalar_t *top_data, const int kernel_size,
    const int group_size, const int scale_factor, const int channels,
    const int height, const int width) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // Decode the flat output index.
    const int pw = index % width;
    const int ph = (index / width) % height;
    const int c = (index / width / height) % channels;
    const int n = index / width / height / channels;

    const int mask_channels = kernel_size * kernel_size * group_size;
    const int mask_group = c / (channels / group_size);
    const int radius = (kernel_size - 1) / 2;

    // Low-resolution geometry and window centre.
    const int down_width = width / scale_factor;
    const int down_height = height / scale_factor;
    const int center_x = pw / scale_factor;
    const int center_y = ph / scale_factor;

    scalar_t acc = 0;
    for (int mask_iy = 0; mask_iy <= 2 * radius; ++mask_iy) {
      const int iy = center_y - radius + mask_iy;
      if (iy < 0 || iy > down_height - 1) {
        continue;
      }
      for (int mask_ix = 0; mask_ix <= 2 * radius; ++mask_ix) {
        const int ix = center_x - radius + mask_ix;
        if (ix < 0 || ix > down_width - 1) {
          continue;
        }
        const int mask_c =
            (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
        const int feat_index =
            Loc2Index(n, c, iy, ix, channels, down_height, down_width);
        const int mask_index =
            Loc2Index(n, mask_c, ph, pw, mask_channels, height, width);
        acc += bottom_data[feat_index] * bottom_masks[mask_index];
      }
    }
    top_data[index] = acc;
  }
}

// Naive CARAFE backward pass, one upstream-gradient element per loop
// iteration. `index` enumerates top_diff in (n, c, ph, pw) order. For every
// in-bounds position of the kernel_size x kernel_size window on the
// low-resolution map, the gradient is scattered with atomicAdd into
// bottom_diff (scaled by the mask value) and into mask_diff (scaled by the
// feature value).
template <typename scalar_t>
__global__ void carafe_naive_backward_musa_kernel(
    const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_data,
    const scalar_t *bottom_masks, scalar_t *bottom_diff, scalar_t *mask_diff,
    const int kernel_size, const int group_size, const int scale_factor,
    const int channels, const int height, const int width) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // Decode the flat gradient index.
    const int pw = index % width;
    const int ph = (index / width) % height;
    const int c = (index / width / height) % channels;
    const int n = index / width / height / channels;

    const int mask_channels = kernel_size * kernel_size * group_size;
    const int mask_group = c / (channels / group_size);
    const int radius = (kernel_size - 1) / 2;

    // Low-resolution geometry and window centre.
    const int down_width = width / scale_factor;
    const int down_height = height / scale_factor;
    const int center_x = pw / scale_factor;
    const int center_y = ph / scale_factor;

    for (int mask_iy = 0; mask_iy <= 2 * radius; ++mask_iy) {
      const int iy = center_y - radius + mask_iy;
      if (iy < 0 || iy > down_height - 1) {
        continue;
      }
      for (int mask_ix = 0; mask_ix <= 2 * radius; ++mask_ix) {
        const int ix = center_x - radius + mask_ix;
        if (ix < 0 || ix > down_width - 1) {
          continue;
        }
        const int mask_c =
            (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
        const int feat_index =
            Loc2Index(n, c, iy, ix, channels, down_height, down_width);
        const int mask_index =
            Loc2Index(n, mask_c, ph, pw, mask_channels, height, width);
        // Several outputs map to the same feature element, hence atomics.
        atomicAdd(bottom_diff + feat_index,
                  bottom_masks[mask_index] * top_diff[index]);
        atomicAdd(mask_diff + mask_index,
                  bottom_data[feat_index] * top_diff[index]);
      }
    }
  }
}

#endif  // CARAFE_NAIVE_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/chamfer_distance_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cu
#ifndef CHAMFER_DISTANCE_MUSA_KERNEL_MUH
#define CHAMFER_DISTANCE_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"
#define MAX_SHARED_SCALAR_T 6144  // 49152 / 8 = 6144

#if MUSA_ARCH > 21
// Nearest-neighbour search for the chamfer distance on 2-D points.
// For every batch i and every point j of `xyz` (n points, 2 values each),
// finds the point of `xyz2` (m points, 2 values each) with the smallest
// squared Euclidean distance and stores that distance in result[i * n + j]
// and the index of the point within its batch in result_i[i * n + j].
//
// xyz2 is processed in chunks of up to THREADS_PER_BLOCK points that are
// staged in shared memory; result/result_i carry the running best across
// chunks.
template <typename scalar_t>
__global__ void chamfer_distance_forward_musa_kernel(int b, int n,
                                                     const scalar_t* xyz, int m,
                                                     const scalar_t* xyz2,
                                                     scalar_t* result,
                                                     int* result_i) {
  // Staging buffer for one chunk of xyz2 (2 scalars per point).
  __shared__ scalar_t buf[MAX_SHARED_SCALAR_T];
  // Blocks stride over the batch dimension.
  for (int i = blockIdx.x; i < b; i += gridDim.x) {
    for (int k2 = 0; k2 < m; k2 += THREADS_PER_BLOCK) {
      // Number of points in this chunk (the last chunk may be shorter).
      int end_k = min(m, k2 + THREADS_PER_BLOCK) - k2;
      // Cooperative copy of the chunk's coordinates into shared memory.
      for (int j = threadIdx.x; j < end_k * 2; j += blockDim.x) {
        buf[j] = xyz2[(i * m + k2) * 2 + j];
      }
      __syncthreads();
      // NOTE(review): j starts at threadIdx.x without a blockIdx.y term while
      // the stride includes gridDim.y - presumably launched with
      // gridDim.y == 1; confirm against the launcher.
      for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) {
        scalar_t x1 = xyz[(i * n + j) * 2 + 0];
        scalar_t y1 = xyz[(i * n + j) * 2 + 1];
        int best_i = 0;
        scalar_t best = 1e10;
        // Largest multiple of 4 not exceeding end_k; the chunk is scanned in
        // groups of 4 up to here and one point at a time afterwards.
        int end_ka = end_k & (~3);
        // Both branches run the same body; the first one has a compile-time
        // trip count (presumably to help the compiler unroll full chunks).
        if (end_ka == THREADS_PER_BLOCK) {
          for (int k = 0; k < THREADS_PER_BLOCK; k += 4) {
#pragma unroll
            // This inner `j` shadows the point index `j` of the outer loop.
            for (int j = 0; j < 4; ++j) {
              scalar_t x2 = buf[(k + j) * 2] - x1;
              scalar_t y2 = buf[(k + j) * 2 + 1] - y1;
              scalar_t d = x2 * x2 + y2 * y2;
              if (d < best) {
                best = d;
                // Index of the candidate within the batch's xyz2 points.
                best_i = k + k2 + j;
              }
            }
          }
        } else {
          for (int k = 0; k < end_ka; k += 4) {
#pragma unroll
            // This inner `j` shadows the point index `j` of the outer loop.
            for (int j = 0; j < 4; ++j) {
              scalar_t x2 = buf[(k + j) * 2] - x1;
              scalar_t y2 = buf[(k + j) * 2 + 1] - y1;
              scalar_t d = x2 * x2 + y2 * y2;
              if (d < best) {
                best = d;
                best_i = k + k2 + j;
              }
            }
          }
        }
        // Remaining (end_k % 4) points of the chunk.
        for (int k = end_ka; k < end_k; k++) {
          scalar_t x2 = buf[k * 2 + 0] - x1;
          scalar_t y2 = buf[k * 2 + 1] - y1;
          scalar_t d = x2 * x2 + y2 * y2;
          // k == 0 only occurs here when the chunk has fewer than 4 points;
          // the first candidate is then accepted unconditionally.
          if (k == 0 || d < best) {
            best = d;
            best_i = k + k2;
          }
        }
        // First chunk initialises the output; later chunks only improve it.
        if (k2 == 0 || result[(i * n + j)] > best) {
          result[(i * n + j)] = best;
          result_i[(i * n + j)] = best_i;
        }
      }
      // Keep buf intact until every thread has finished reading this chunk.
      __syncthreads();
    }
  }
}

// Backward pass of the 2-D chamfer distance. For each point j of xyz1 in
// batch i and its matched point idx1[i * n + j] of xyz2, the gradient of the
// squared distance, 2 * grad_dist1 * (p1 - p2), is added to grad_xyz1 and
// its negation to grad_xyz2 (atomically, since several points may share a
// match).
template <typename scalar_t>
__global__ void chamfer_distance_backward_musa_kernel(
    int b, int n, const scalar_t* xyz1, int m, const scalar_t* xyz2,
    const scalar_t* grad_dist1, const int* idx1, scalar_t* grad_xyz1,
    scalar_t* grad_xyz2) {
  for (int batch = blockIdx.x; batch < b; batch += gridDim.x) {
    for (int pt = threadIdx.x; pt < n; pt += blockDim.x * gridDim.y) {
      const int src = batch * n + pt;
      const scalar_t src_x = xyz1[src * 2 + 0];
      const scalar_t src_y = xyz1[src * 2 + 1];

      const int match = idx1[src];
      const int dst = batch * m + match;
      const scalar_t dst_x = xyz2[dst * 2 + 0];
      const scalar_t dst_y = xyz2[dst * 2 + 1];

      const scalar_t scale = grad_dist1[src] * 2;

      atomicAdd(&(grad_xyz1[src * 2 + 0]), scale * (src_x - dst_x));
      atomicAdd(&(grad_xyz1[src * 2 + 1]), scale * (src_y - dst_y));
      atomicAdd(&(grad_xyz2[dst * 2 + 0]), -(scale * (src_x - dst_x)));
      atomicAdd(&(grad_xyz2[dst * 2 + 1]), -(scale * (src_y - dst_y)));
    }
  }
}
#else
#warning "chamfer_distance is supported when MUSA_ARCH > 21"
#endif  //MUSA_ARCH

#endif  // CHAMFER_DISTANCE_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/common_musa_helper.hpp
================================================
#ifndef COMMON_MUSA_HELPER
#define COMMON_MUSA_HELPER

#include <musa.h>

// Loop header: `i` starts at this thread's global x index and advances by the
// total number of threads along x (blockDim.x * gridDim.x) while i < n.
// `i` is declared as int, so `n` must fit in an int.
#define MUSA_1D_KERNEL_LOOP(i, n)                              \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

// Nested loop header: `i` covers [0, n) along the x dimension and `j` covers
// [0, m) along the y dimension, each advancing by the number of threads
// launched in that dimension. Both indices are size_t.
#define MUSA_2D_KERNEL_LOOP(i, n, j, m)                             \
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);   \
       i += blockDim.x * gridDim.x)                                 \
    for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
         j += blockDim.y * gridDim.y)

// Nested loop header at block granularity: `i` and `j` start at the block
// indices and advance by the grid extents, so every thread of a block sees
// the same (i, j) pairs.
#define MUSA_2D_KERNEL_BLOCK_LOOP(i, n, j, m)          \
  for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \
    for (size_t j = blockIdx.y; j < (m); j += gridDim.y)

// Default block size; used as the default `num_threads` of GET_BLOCKS.
#define THREADS_PER_BLOCK 512

// Returns the number of blocks to launch for N work items with `num_threads`
// threads per block: ceil(N / num_threads), capped at 4096.
//
// The ceiling division is carried out in 64-bit so that N close to INT_MAX no
// longer overflows `N + num_threads - 1` (which previously could yield a
// negative block count). Results for all non-overflowing inputs are unchanged.
inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {
  const long long max_block_num = 4096;
  const long long optimal_block_num =
      (static_cast<long long>(N) + num_threads - 1) / num_threads;
  return static_cast<int>(optimal_block_num < max_block_num ? optimal_block_num
                                                            : max_block_num);
}

// Bilinearly samples a height x width row-major map at the real-valued
// position (y, x).
//   - Positions more than one pixel before the first row/column, or beyond
//     `height` / `width`, yield 0.
//   - Non-positive coordinates are clamped to 0; coordinates at or past the
//     last row/column are snapped onto it.
// `index` is unused (kept for debugging).
template <typename T>
__device__ T bilinear_interpolate(const T* input, const int height,
                                  const int width, T y, T x,
                                  const int index /* index for debug only*/) {
  const bool outside = y < -1.0 || y > height || x < -1.0 || x > width;
  if (outside) return 0;

  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  // Integer corners surrounding the sample point.
  int row_lo = (int)y;
  int col_lo = (int)x;
  int row_hi;
  int col_hi;

  if (row_lo < height - 1) {
    row_hi = row_lo + 1;
  } else {
    row_hi = row_lo = height - 1;
    y = (T)row_lo;
  }

  if (col_lo < width - 1) {
    col_hi = col_lo + 1;
  } else {
    col_hi = col_lo = width - 1;
    x = (T)col_lo;
  }

  // Fractional offsets from the low corner and their complements.
  const T ly = y - row_lo;
  const T lx = x - col_lo;
  const T hy = 1. - ly;
  const T hx = 1. - lx;

  const T top_left = input[row_lo * width + col_lo];
  const T top_right = input[row_lo * width + col_hi];
  const T bottom_left = input[row_hi * width + col_lo];
  const T bottom_right = input[row_hi * width + col_hi];

  const T w1 = hy * hx;
  const T w2 = hy * lx;
  const T w3 = ly * hx;
  const T w4 = ly * lx;

  return (w1 * top_left + w2 * top_right + w3 * bottom_left +
          w4 * bottom_right);
}

// Computes the four bilinear weights and the integer corner coordinates for
// a sample at (y, x) on a height x width map, using the same clamping rules
// as bilinear_interpolate.
// Outputs: w1..w4 are the weights of (y_low, x_low), (y_low, x_high),
// (y_high, x_low) and (y_high, x_high). For positions treated as out of
// range all weights are 0 and all four indices are -1.
// `index` is unused (kept for debugging).
template <typename T>
__device__ void bilinear_interpolate_gradient(
    const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4,
    int& x_low, int& x_high, int& y_low, int& y_high,
    const int index /* index for debug only*/) {
  const bool outside = y < -1.0 || y > height || x < -1.0 || x > width;
  if (outside) {
    w1 = w2 = w3 = w4 = 0.;
    x_low = x_high = y_low = y_high = -1;
    return;
  }

  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  y_low = (int)y;
  x_low = (int)x;

  if (y_low < height - 1) {
    y_high = y_low + 1;
  } else {
    y_high = y_low = height - 1;
    y = (T)y_low;
  }

  if (x_low < width - 1) {
    x_high = x_low + 1;
  } else {
    x_high = x_low = width - 1;
    x = (T)x_low;
  }

  // Fractional offsets from the low corner and their complements.
  const T ly = y - y_low;
  const T lx = x - x_low;
  const T hy = 1. - ly;
  const T hx = 1. - lx;

  // Same weights the forward interpolation applies to the four corners.
  w1 = hy * hx;
  w2 = hy * lx;
  w3 = ly * hx;
  w4 = ly * lx;
}
#endif  // COMMON_MUSA_HELPER


================================================
FILE: mmcv/ops/csrc/common/musa/convex_iou_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef CONVEX_IOU_MUSA_KERNEL_MUH
#define CONVEX_IOU_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

#define MAXN 100
#define NMAX 512
__device__ const double EPS = 1E-8;

// Sign of `d` with tolerance EPS: 1 if d > EPS, -1 if d < -EPS, 0 otherwise.
__device__ inline int sig(double d) {
  if (d > EPS) {
    return 1;
  }
  if (d < -EPS) {
    return -1;
  }
  return 0;
}

// 2-D point with double-precision coordinates.
struct Point {
  double x;
  double y;

  // Default construction leaves both coordinates uninitialised.
  __device__ Point() {}

  // Builds a point from explicit coordinates.
  __device__ Point(double px, double py) : x(px), y(py) {}
};

// True when a and b coincide within the EPS tolerance on both axes.
__device__ inline bool point_same(Point& a, Point& b) {
  if (sig(a.x - b.x) != 0) return false;
  return sig(a.y - b.y) == 0;
}

// Exchanges the coordinates stored in *a and *b.
__device__ inline void swap1(Point* a, Point* b) {
  const double saved_x = a->x;
  const double saved_y = a->y;

  a->x = b->x;
  a->y = b->y;

  b->x = saved_x;
  b->y = saved_y;
}

// Reverses the first n points of a in place.
__device__ inline void reverse1(Point* a, const int n) {
  // Walk inwards from both ends, swapping until the cursors meet.
  int lo = 0;
  int hi = n - 1;
  while (lo < hi) {
    swap1(a + lo, a + hi);
    ++lo;
    --hi;
  }
}

// z-component of the cross product (a - o) x (b - o).
__device__ inline double cross(Point o, Point a, Point b) {
  const double ax = a.x - o.x;
  const double ay = a.y - o.y;
  const double bx = b.x - o.x;
  const double by = b.y - o.y;
  return ax * by - bx * ay;
}

// Squared Euclidean distance between a and b (no square root is taken).
__device__ inline double dis(Point a, Point b) {
  const double dx = a.x - b.x;
  const double dy = a.y - b.y;
  return dx * dx + dy * dy;
}
// Signed area of the polygon ps[0..n-1] computed with the shoelace formula.
// ps[n] is overwritten with ps[0] to close the ring, so the buffer must have
// room for n + 1 points.
__device__ inline double area(Point* ps, int n) {
  ps[n] = ps[0];
  double shoelace = 0;
  const Point* cur = ps;
  for (int k = 0; k < n; ++k, ++cur) {
    const Point* nxt = cur + 1;
    shoelace += cur->x * nxt->y - cur->y * nxt->x;
  }
  return shoelace / 2.0;
}
// Computes the signed area of the polygon ps[0..n-1] (shoelace formula) and
// the derivative of that area with respect to selected vertices.
//
//   ps                     polygon vertices; ps[n] is overwritten with ps[0]
//                          to close the ring, so room for n + 1 points is
//                          required.
//   n                      number of polygon vertices.  The local scratch
//                          buffer is indexed up to 4 * n - 1, so n must not
//                          exceed 30.
//   polygon_to_pred_index  two consecutive tables of n_pred entries each:
//                          entry j is a polygon vertex index, entry
//                          j + n_pred is the slot in grad_C that receives the
//                          gradient of that vertex.
//   n_pred                 length of each of the two tables.
//   grad_C                 output; for every matched vertex, the x derivative
//                          is assigned to grad_C[2 * slot] and the y
//                          derivative to grad_C[2 * slot + 1] (assigned, not
//                          accumulated).
//
// Returns the signed area (half of the shoelace sum).
__device__ inline double polygon_area_grad(Point* ps, int n,
                                           int* polygon_to_pred_index,
                                           int n_pred, double* grad_C) {
  ps[n] = ps[0];
  // Four entries per vertex v:
  //   [4v], [4v+1]     x / y partials of the edge term ending at v,
  //   [4v+2], [4v+3]   x / y partials of the edge term starting at v.
  double partion_grad[4 * 30 + 2];
  double res = 0;
  for (int i = 0; i < n; i++) {
    // Edge term: x_i * y_{i+1} - y_i * x_{i+1}.
    res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x;
    // Partials of the edge term with respect to vertex i.
    partion_grad[i * 4 + 2] = ps[i + 1].y;
    partion_grad[i * 4 + 3] = -ps[i + 1].x;
    // Partials with respect to vertex i + 1; the last edge wraps to vertex 0.
    if (i != n - 1) {
      partion_grad[i * 4 + 4] = -ps[i].y;
      partion_grad[i * 4 + 5] = ps[i].x;
    } else {
      partion_grad[0] = -ps[i].y;
      partion_grad[1] = ps[i].x;
    }
  }
  for (int i = 0; i < n; i++) {
    // x derivative: sum of both adjacent edge terms, halved like the area.
    for (int j = 0; j < n_pred; j++) {
      if (i == polygon_to_pred_index[j]) {
        grad_C[2 * polygon_to_pred_index[j + n_pred]] =
            (partion_grad[i * 4] + partion_grad[i * 4 + 2]) / 2;
        break;
      }
    }
    // y derivative, same lookup.
    for (int j = 0; j < n_pred; j++) {
      if (i == polygon_to_pred_index[j]) {
        grad_C[2 * polygon_to_pred_index[j + n_pred] + 1] =
            (partion_grad[i * 4 + 1] + partion_grad[i * 4 + 1 + 2]) / 2;
        break;
      }
    }
  }

  return res / 2.0;
}

// Intersects the line through a and b with the line through c and d and
// records how the intersection point depends on c and d.
//
//   p         output intersection point (written only when 1 is returned).
//   cut_grad  flattened gradient table with rows of 4 * n doubles; row m is
//             written.  Within a row, each input vertex owns four entries:
//             [d p.x / d x, d p.y / d x, d p.x / d y, d p.y / d y].
//   m         row (index of the output point) to fill.
//   n         number of vertices of the polygon being cut (row stride / 4).
//   i         index of vertex c in that polygon; d is vertex i + 1, wrapping
//             to vertex 0 when i == n - 1.
//
// Returns 2 when both c and d lie on line ab (within EPS), 0 when the signed
// offsets of c and d from line ab are equal within EPS (no unique
// intersection), and 1 otherwise.  a and b are treated as constants: no
// derivative with respect to them is produced.
__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p,
                                double* cut_grad, int m, int n, int i) {
  double s1, s2;
  double s2_s1_2;
  double ds1_dxc, ds1_dyc, ds2_dxd, ds2_dyd;
  double dxp_dxc, dxp_dyc, dxp_dxd, dxp_dyd, dyp_dxc, dyp_dyc, dyp_dxd, dyp_dyd;
  // Signed offsets of c and d relative to the directed line a -> b.
  s1 = cross(a, b, c);
  s2 = cross(a, b, d);

  // s1 is linear in c and s2 is linear in d with the same coefficients.
  ds1_dxc = -(b.y - a.y);
  ds1_dyc = b.x - a.x;
  ds2_dxd = ds1_dxc;
  ds2_dyd = ds1_dyc;
  s2_s1_2 = (s2 - s1) * (s2 - s1);

  if (sig(s1) == 0 && sig(s2) == 0) return 2;
  if (sig(s2 - s1) == 0) return 0;

  // Quotient rule applied to p.x = (c.x * s2 - d.x * s1) / (s2 - s1).
  dxp_dxc =
      ((s2 - d.x * ds1_dxc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dxc)) /
      (s2_s1_2);
  dxp_dyc =
      ((0 - d.x * ds1_dyc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dyc)) /
      (s2_s1_2);
  dxp_dxd =
      ((c.x * ds2_dxd - s1) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dxd)) /
      (s2_s1_2);
  dxp_dyd =
      ((c.x * ds2_dyd - 0) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dyd)) /
      (s2_s1_2);

  // Quotient rule applied to p.y = (c.y * s2 - d.y * s1) / (s2 - s1).
  dyp_dxc =
      ((0 - d.y * ds1_dxc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dxc)) /
      (s2_s1_2);
  dyp_dyc =
      ((s2 - d.y * ds1_dyc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dyc)) /
      (s2_s1_2);
  dyp_dxd =
      ((c.y * ds2_dxd - 0) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dxd)) /
      (s2_s1_2);
  dyp_dyd =
      ((c.y * ds2_dyd - s1) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dyd)) /
      (s2_s1_2);

  p.x = (c.x * s2 - d.x * s1) / (s2 - s1);
  p.y = (c.y * s2 - d.y * s1) / (s2 - s1);
  if (i == n - 1) {
    // Last edge: c is vertex i, d wraps around to vertex 0.
    cut_grad[4 * n * m + 4 * i] = dxp_dxc;  // + dyp_dxc;
    cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;
    cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc;  // + dyp_dyc;
    cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;
    cut_grad[4 * n * m + 0] = dxp_dxd;  // + dyp_dxd;
    cut_grad[4 * n * m + 1] = dyp_dxd;
    cut_grad[4 * n * m + 2] = dxp_dyd;  // + dyp_dyd;
    cut_grad[4 * n * m + 3] = dyp_dyd;
  } else {
    // Interior edge: c is vertex i, d is vertex i + 1.
    cut_grad[4 * n * m + 4 * i] = dxp_dxc;  // + dyp_dxc;
    cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;
    cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc;  // + dyp_dyc;
    cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;
    cut_grad[4 * n * m + 4 * (i + 1)] = dxp_dxd;  // + dyp_dxd;
    cut_grad[4 * n * m + 4 * (i + 1) + 1] = dyp_dxd;
    cut_grad[4 * n * m + 4 * (i + 1) + 2] = dxp_dyd;  // + dyp_dyd;
    cut_grad[4 * n * m + 4 * (i + 1) + 3] = dyp_dyd;
  }

  return 1;
}
// Clips the polygon p[0..n-1] against the half-plane strictly to the left of
// the directed line a -> b (points with sig(cross(a, b, point)) > 0 are kept)
// and records how every output vertex depends on the input vertices.
//
//   p         polygon, updated in place; p[n] is used as a closing slot.
//   n         in: number of input vertices; out: number of output vertices.
//   cut_grad  output gradient table.  Row r (one per output vertex) has
//             4 * k doubles, where k is the input vertex count; the four
//             entries per input vertex follow the layout written by
//             lineCross().
//
// The scratch tables hold MAXN entries, which bounds the supported polygon
// size.
__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b,
                                   double* cut_grad) {
  Point pp[MAXN];
  double ccur_grad[MAXN] = {};
  int m = 0;
  p[n] = p[0];
  // Remember the input vertex count; n is reused for the output count below.
  int k = n;
  for (int i = 0; i < n; i++) {
    if (sig(cross(a, b, p[i])) > 0) {
      // Kept vertex: the output point equals input vertex i (identity
      // derivative for x w.r.t. x and y w.r.t. y).
      pp[m] = p[i];
      ccur_grad[4 * n * m + 4 * i] = 1.0;
      ccur_grad[4 * n * m + 4 * i + 3] = 1.0;
      m++;
    }
    if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {
      // The edge i -> i + 1 changes side: append its intersection with line
      // ab.  The return code of lineCross() is not inspected.
      lineCross(a, b, p[i], p[i + 1], pp[m], ccur_grad, m, n, i);
      m++;
    }
  }

  n = 0;
  for (int i = 0; i < m; i++) {
    // Drop points that repeat their predecessor, keeping gradient rows
    // aligned with the surviving points.
    if (!i || !(point_same(pp[i], pp[i - 1]))) {
      p[n] = pp[i];
      for (int j = 0; j < 4 * k; j++) {
        cut_grad[4 * k * n + j] = ccur_grad[4 * k * i + j];
      }
      n++;
    }
  }

  // Drop trailing points that coincide with the first one.
  while (n > 1 && point_same(p[n - 1], p[0])) n--;
}

// Signed area of the intersection of triangle (o, a, b) with triangle
// (o, c, d), where o is the origin, plus the gradient of that area with
// respect to a and b.
//
//   a, b       one edge of the first polygon (vertex `order` and its
//              successor).
//   c, d       one edge of the second polygon.
//   grad_AB    gradient accumulator, two doubles per vertex of the first
//              polygon; contributions are added with +=.
//   order      index of a in the first polygon.
//   convex_n   vertex count of the first polygon; when order is the last
//              vertex, b's contribution wraps to slot 0.
//
// Returns 0 when either triangle is degenerate (zero cross product within
// EPS).  The result is negated when the two triangles have opposite
// orientation.
__device__ inline double intersectArea(Point a, Point b, Point c, Point d,
                                       double* grad_AB, int order,
                                       int convex_n) {
  Point o(0, 0);
  // Set when a and b are swapped, so gradients can be routed back correctly.
  int res_flag = 0;
  int s1 = sig(cross(o, a, b));
  int s2 = sig(cross(o, c, d));
  if (s1 == 0 || s2 == 0) return 0.0;
  // Reorder each triangle so that its cross product is positive.
  if (s1 == -1) {
    Point* i = &a;
    Point* j = &b;
    swap1(i, j);
    res_flag = 1;
  }
  if (s2 == -1) {
    Point* i = &c;
    Point* j = &d;
    swap1(i, j);
  }
  Point p[10] = {o, a, b};
  int n = 3, n0 = 3, n1, n2, n3;
  // Flattened gradient tables produced by the three clipping steps.
  double cut_grad1[MAXN] = {};
  double cut_grad2[MAXN] = {};
  double cut_grad3[MAXN] = {};
  // Jacobians between successive polygons: rows are output coordinates
  // (x0, y0, x1, y1, ...), columns are input coordinates in the same order.
  double p1_p_grad[10][10] = {};
  double p2_p1_grad[10][10] = {};
  double p3_p2_grad[10][10] = {};

  double p3_p1_grad[10][10] = {};
  double p3_p_grad[10][10] = {};

  // 1: clip against edge o -> c and unpack the gradient table into a matrix.
  polygon_cut(p, n, o, c, cut_grad1);
  n1 = n;
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < 4 * n0; j++) {
      // Even entries are derivatives of the x output, odd ones of the y
      // output; j / 2 is the input coordinate column.
      if (!(j % 2)) {
        p1_p_grad[2 * i][j / 2] = cut_grad1[4 * n0 * i + j];
      } else {
        p1_p_grad[2 * i + 1][j / 2] = cut_grad1[4 * n0 * i + j];
      }
    }
  }

  // 2: clip against edge c -> d.
  polygon_cut(p, n, c, d, cut_grad2);
  n2 = n;
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < 4 * n1; j++) {
      if (!(j % 2)) {
        p2_p1_grad[2 * i][j / 2] = cut_grad2[4 * n1 * i + j];
      } else {
        p2_p1_grad[2 * i + 1][j / 2] = cut_grad2[4 * n1 * i + j];
      }
    }
  }
  // 3: clip against edge d -> o.
  polygon_cut(p, n, d, o, cut_grad3);
  n3 = n;
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < 4 * n2; j++) {
      if (!(j % 2)) {
        p3_p2_grad[2 * i][j / 2] = cut_grad3[4 * n2 * i + j];
      } else {
        p3_p2_grad[2 * i + 1][j / 2] = cut_grad3[4 * n2 * i + j];
      }
    }
  }

  // Chain rule via matrix products:
  //  p3_p2(n3 * n2) * p2_p1(n2 * n1) = p3_p1 (n3 * n1)
  for (int i = 0; i < 2 * n3; i++) {
    for (int j = 0; j < 2 * n1; j++) {
      double sum = 0.0;
      for (int m = 0; m < 2 * n2; m++) {
        sum = sum + p3_p2_grad[i][m] * p2_p1_grad[m][j];
      }
      p3_p1_grad[i][j] = sum;
    }
  }

  // p3_p1 (n3 * n1) * p1_p (n1 * n0) = p3_p (n3 * n0)
  for (int i = 0; i < 2 * n3; i++) {
    for (int j = 0; j < 2 * n0; j++) {
      double sum = 0.0;
      for (int m = 0; m < 2 * n1; m++) {
        sum = sum + p3_p1_grad[i][m] * p1_p_grad[m][j];
      }
      p3_p_grad[i][j] = sum;
    }
  }

  // calculate S_grad: derivative of the clipped area with respect to the six
  // coordinates of the initial triangle {o, a, b} (after the optional swap).
  int polygon_index_box_index[20];
  double grad_polygon[20];
  double S_grad[6];

  // Identity mapping: every clipped vertex writes to its own gradient slot.
  for (int i = 0; i < n3; i++) {
    polygon_index_box_index[i] = i;
    polygon_index_box_index[i + n3] = i;
  }

  double res =
      polygon_area_grad(p, n3, polygon_index_box_index, n3, grad_polygon);

  if (s1 * s2 == -1) {
    // Opposite orientations: the contribution (and its gradient) is negated.
    for (int j = 0; j < 2 * 3; j++) {
      double sum = 0.0;
      for (int m = 0; m < 2 * n3; m++) {
        sum = sum - grad_polygon[m] * p3_p_grad[m][j];
      }
      S_grad[j] = sum;
    }

    // S_grad[2..3] belong to the second triangle vertex and S_grad[4..5] to
    // the third; when a and b were swapped these map back in reverse.
    if (order != convex_n - 1) {
      if (res_flag) {
        grad_AB[2 * order] += S_grad[4];
        grad_AB[2 * order + 1] += S_grad[5];
        grad_AB[2 * order + 2] += S_grad[2];
        grad_AB[2 * order + 3] += S_grad[3];

      } else {
        grad_AB[2 * order] += S_grad[2];
        grad_AB[2 * order + 1] += S_grad[3];
        grad_AB[2 * order + 2] += S_grad[4];
        grad_AB[2 * order + 3] += S_grad[5];
      }
    } else {
      // Last edge of the polygon: the successor vertex is vertex 0.
      if (res_flag) {
        grad_AB[2 * order] += S_grad[4];
        grad_AB[2 * order + 1] += S_grad[5];
        grad_AB[0] += S_grad[2];
        grad_AB[1] += S_grad[3];

      } else {
        grad_AB[2 * order] += S_grad[2];
        grad_AB[2 * order + 1] += S_grad[3];
        grad_AB[0] += S_grad[4];
        grad_AB[1] += S_grad[5];
      }
    }
    res = -res;
  } else {
    // Same orientation: positive contribution.
    for (int j = 0; j < 2 * 3; j++) {
      double sum = 0.0;
      for (int m = 0; m < 2 * n3; m++) {
        sum = sum + grad_polygon[m] * p3_p_grad[m][j];
      }
      S_grad[j] = sum;
    }

    if (order != convex_n - 1) {
      if (res_flag) {
        grad_AB[2 * order] += S_grad[4];
        grad_AB[2 * order + 1] += S_grad[5];
        grad_AB[2 * order + 2] += S_grad[2];
        grad_AB[2 * order + 3] += S_grad[3];
      } else {
        grad_AB[2 * order] += S_grad[2];
        grad_AB[2 * order + 1] += S_grad[3];
        grad_AB[2 * order + 2] += S_grad[4];
        grad_AB[2 * order + 3] += S_grad[5];
      }
    } else {
      // Last edge of the polygon: the successor vertex is vertex 0.
      if (res_flag) {
        grad_AB[2 * order] += S_grad[4];
        grad_AB[2 * order + 1] += S_grad[5];
        grad_AB[0] += S_grad[2];
        grad_AB[1] += S_grad[3];
      } else {
        grad_AB[2 * order] += S_grad[2];
        grad_AB[2 * order + 1] += S_grad[3];
        grad_AB[0] += S_grad[4];
        grad_AB[1] += S_grad[5];
      }
    }
  }
  return res;
}

// Intersection area of polygons ps1 (n1 vertices) and ps2 (n2 vertices),
// obtained by summing the signed origin-triangle intersections of every edge
// pair.  Gradients with respect to the vertices of ps1 are accumulated into
// grad_AB.  Both polygons may be reversed in place so that their signed area
// is non-negative, and slot [n] of each buffer is overwritten with vertex 0.
__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, int n2,
                                        double* grad_AB) {
  // Normalise the orientation of both rings.
  if (area(ps1, n1) < 0) {
    reverse1(ps1, n1);
  }
  if (area(ps2, n2) < 0) {
    reverse1(ps2, n2);
  }

  // Close both rings so that edge k runs from vertex k to vertex k + 1.
  ps1[n1] = ps1[0];
  ps2[n2] = ps2[0];

  double total = 0;
  for (int e1 = 0; e1 < n1; ++e1) {
    for (int e2 = 0; e2 < n2; ++e2) {
      total += intersectArea(ps1[e1], ps1[e1 + 1], ps2[e2], ps2[e2 + 1],
                             grad_AB, e1, n1);
    }
  }
  return total;
}

// Replaces in_poly[0..n_poly-1] with the vertices of its convex hull, found
// with a gift-wrapping (Jarvis march) scan, and stores the hull size in
// n_poly.
//
// The scan first moves the lowest point (ties broken by smaller x) to index 0
// and locates the highest point (ties broken by larger x).  Two chains are
// then walked from the lowest to the highest point, one on each side, and
// written back as: first chain from lowest to highest, followed by the second
// chain walked back down (both end points excluded).
//
// The chain buffers are sized by MAXN rather than a small fixed count: a
// single chain can contain almost every input point, and callers in this file
// pass more than ten points (a 9-point hull merged with a 4-point box).
__device__ inline void Jarvis(Point* in_poly, int& n_poly) {
  Point p_max, p_k;
  int max_index, k_index;
  int Stack[NMAX] = {}, top1, top2;
  double sign;
  Point right_point[MAXN], left_point[MAXN];

  for (int i = 0; i < n_poly; i++) {
    // Keep the lowest (then left-most) point at index 0.
    if (in_poly[i].y < in_poly[0].y ||
        (in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x)) {
      Point* j = &(in_poly[0]);
      Point* k = &(in_poly[i]);
      swap1(j, k);
    }
    if (i == 0) {
      p_max = in_poly[0];
      max_index = 0;
    }
    // Track the highest (then right-most) point.
    if (in_poly[i].y > p_max.y ||
        (in_poly[i].y == p_max.y && in_poly[i].x > p_max.x)) {
      p_max = in_poly[i];
      max_index = i;
    }
  }

  // The walk needs a target different from the start point.
  if (max_index == 0) {
    max_index = 1;
    p_max = in_poly[max_index];
  }

  // First chain: from in_poly[0] to p_max, always taking the candidate for
  // which cross(current, candidate, best) is positive (farthest on ties).
  k_index = 0, Stack[0] = 0, top1 = 0;
  while (k_index != max_index) {
    p_k = p_max;
    k_index = max_index;
    for (int i = 1; i < n_poly; i++) {
      sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
      if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
                                         dis(in_poly[Stack[top1]], p_k)))) {
        p_k = in_poly[i];
        k_index = i;
      }
    }
    top1++;
    Stack[top1] = k_index;
  }
  for (int i = 0; i <= top1; i++) right_point[i] = in_poly[Stack[i]];

  // Second chain: same walk with the opposite turn direction.
  k_index = 0, Stack[0] = 0, top2 = 0;

  while (k_index != max_index) {
    p_k = p_max;
    k_index = max_index;
    for (int i = 1; i < n_poly; i++) {
      sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
      if ((sign < 0) || ((sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
                                         dis(in_poly[Stack[top2]], p_k)))) {
        p_k = in_poly[i];
        k_index = i;
      }
    }
    top2++;
    Stack[top2] = k_index;
  }
  // The last stack entry (p_max) already ends the first chain, so skip it.
  for (int i = top2 - 1; i >= 0; i--) left_point[i] = in_poly[Stack[i]];

  // Stitch: first chain upwards, then the second chain downwards.
  for (int i = 0; i < top1 + top2; i++) {
    if (i <= top1) {
      in_poly[i] = right_point[i];
    } else {
      in_poly[i] = left_point[top2 - (i - top1)];
    }
  }
  n_poly = top1 + top2;
}

// Area of the convex hull enclosing polygons ps1 (n1 vertices) and ps2
// (n2 vertices), together with the gradient of that area with respect to the
// vertices of ps1.
//
//   ps2     is modified in place: vertices that coincide with a vertex of
//           ps1 are removed before the hull is built.
//   grad_C  output, 18 doubles (x, y per ps1 vertex).  Entries of ps1
//           vertices that are on the hull receive the area derivative; all
//           18 entries are zeroed when no ps1 vertex is on the hull.
//
// Returns the absolute hull area.
__device__ inline double intersectAreaPoly(Point* ps1, int n1, Point* ps2,
                                           int n2, double* grad_C) {
  Point polygon[MAXN];
  int n = n1 + n2, n_poly = 0;
  // Remove from ps2 every point already present in ps1.
  // NOTE(review): the loop bounds use the original ps2 count (n - n1) rather
  // than the shrinking n2, so stale tail slots are still scanned after a
  // removal -- confirm this is intended.
  for (int i = 0; i < n1; i++) {
    for (int j = 0; j < n - n1; j++) {
      if (point_same(ps1[i], ps2[j])) {
        for (int k = j; k < n - n1 - 1; k++) {
          ps2[k] = ps2[k + 1];
        }
        n2--;
        break;
      }
    }
  }
  // Concatenate ps1 and the remaining ps2 points.
  n_poly = n1 + n2;
  for (int i = 0; i < n_poly; i++) {
    if (i < n1) {
      polygon[i] = ps1[i];
    } else {
      polygon[i] = ps2[i - n1];
    }
  }

  // Reduce the combined point set to its convex hull (in place).
  Jarvis(polygon, n_poly);

  // Two tables with stride n1: entry k is a hull vertex index, entry k + n1
  // is the index of the matching ps1 vertex.  Unused entries stay at -1.
  int polygon_to_pred_index[18] = {-1, -1, -1, -1, -1, -1, -1, -1, -1,
                                   -1, -1, -1, -1, -1, -1, -1, -1, -1};
  int n_pred = 0;
  for (int i = 0; i < n_poly; i++) {
    for (int j = 0; j < n1; j++) {
      // Exact comparison: hull vertices are copies of the input points.
      if (polygon[i].x == ps1[j].x && polygon[i].y == ps1[j].y) {
        polygon_to_pred_index[n_pred] = i;
        polygon_to_pred_index[n_pred + n1] = j;
        n_pred += 1;
        break;
      }
    }
  }
  if (n_pred == 0) {
    // No ps1 vertex lies on the hull: the area does not depend on ps1.
    double polygon_area = fabs(area(polygon, n_poly));
    for (int i = 0; i < 18; i++) {
      grad_C[i] = 0.0;
    }
    return polygon_area;
  } else {
    // n1 (not n_pred) is passed as the table length because the second table
    // was stored at offset n1 above.
    double polygon_area =
        polygon_area_grad(polygon, n_poly, polygon_to_pred_index, n1, grad_C);
    // Flip the gradient so that it matches the absolute area returned below.
    if (polygon_area < 0) {
      for (int i = 0; i < 18; i++) {
        grad_C[i] = -grad_C[i];
      }
    }
    return fabs(polygon_area);
  }
}

// Convex hull of in_poly[0..n_poly-1] (same scan as Jarvis()) that also
// reports, for every hull vertex, which input point it came from.
//
//   in_poly               in: input points; out: hull vertices.
//   n_poly                in: number of input points; out: hull size.
//   points_to_convex_ind  out: entry i is the index, in the original input
//                         order, of the first input point that matches hull
//                         vertex i according to point_same().
//
// The local buffers are fixed-size (20 input points, chains of 10 points);
// the callers in this file pass 9 points.
__device__ inline void Jarvis_and_index(Point* in_poly, int& n_poly,
                                        int* points_to_convex_ind) {
  // Snapshot of the input in its original order; in_poly is reordered below.
  int n_input = n_poly;
  Point input_poly[20];
  for (int i = 0; i < n_input; i++) {
    input_poly[i].x = in_poly[i].x;
    input_poly[i].y = in_poly[i].y;
  }
  Point p_max, p_k;
  int max_index, k_index;
  int Stack[20], top1, top2;
  double sign;
  Point right_point[10], left_point[10];

  for (int i = 0; i < n_poly; i++) {
    // Keep the lowest (then left-most) point at index 0.
    if (in_poly[i].y < in_poly[0].y ||
        in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
      Point* j = &(in_poly[0]);
      Point* k = &(in_poly[i]);
      swap1(j, k);
    }
    if (i == 0) {
      p_max = in_poly[0];
      max_index = 0;
    }
    // Track the highest (then right-most) point.
    if (in_poly[i].y > p_max.y ||
        in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
      p_max = in_poly[i];
      max_index = i;
    }
  }
  // The walk needs a target different from the start point.
  if (max_index == 0) {
    max_index = 1;
    p_max = in_poly[max_index];
  }

  // First chain from in_poly[0] to p_max: prefer candidates with a positive
  // cross product, and the farther one when collinear.
  k_index = 0, Stack[0] = 0, top1 = 0;
  while (k_index != max_index) {
    p_k = p_max;
    k_index = max_index;
    for (int i = 1; i < n_poly; i++) {
      sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
      if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
                                         dis(in_poly[Stack[top1]], p_k)))) {
        p_k = in_poly[i];
        k_index = i;
      }
    }
    top1++;
    Stack[top1] = k_index;
  }
  for (int i = 0; i <= top1; i++) {
    right_point[i] = in_poly[Stack[i]];
  }

  // Second chain: same walk with the opposite turn direction.
  k_index = 0, Stack[0] = 0, top2 = 0;

  while (k_index != max_index) {
    p_k = p_max;
    k_index = max_index;
    for (int i = 1; i < n_poly; i++) {
      sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
      if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
                                        dis(in_poly[Stack[top2]], p_k))) {
        p_k = in_poly[i];
        k_index = i;
      }
    }
    top2++;
    Stack[top2] = k_index;
  }

  // The last stack entry (p_max) already ends the first chain, so skip it.
  for (int i = top2 - 1; i >= 0; i--) {
    left_point[i] = in_poly[Stack[i]];
  }

  // Stitch: first chain upwards, then the second chain downwards.
  for (int i = 0; i < top1 + top2; i++) {
    if (i <= top1) {
      in_poly[i] = right_point[i];
    } else {
      in_poly[i] = left_point[top2 - (i - top1)];
    }
  }
  n_poly = top1 + top2;
  // Map every hull vertex back to the first matching original input point.
  for (int i = 0; i < n_poly; i++) {
    for (int j = 0; j < n_input; j++) {
      if (point_same(in_poly[i], input_poly[j])) {
        points_to_convex_ind[i] = j;
        break;
      }
    }
  }
}

// Computes a GIoU-style score between the convex hull of 9 predicted points
// and a 4-point box, together with its gradient with respect to the
// predicted points.
//
//   p           18 values: (x, y) for each of the 9 predicted points.
//   q           8 values: (x, y) for each of the 4 box corners.
//   point_grad  output, 18 values: gradient of the returned score with
//               respect to p, in the original point order.  Points that are
//               not hull vertices receive 0.
//   idx         not used by the computation (only referenced by a
//               commented-out debug print).
//
// Returns iou - (hull_area - union_area) / hull_area, where hull_area is the
// area of the convex hull enclosing both shapes.
template <typename T>
__device__ inline float devrIoU(T const* const p, T const* const q,
                                T* point_grad, const int idx) {
  Point ps1[MAXN], ps2[MAXN];

  Point convex[MAXN];
  for (int i = 0; i < 9; i++) {
    convex[i].x = (double)p[i * 2];
    convex[i].y = (double)p[i * 2 + 1];
  }
  int n_convex = 9;
  // Hull vertex index -> index of the originating input point.
  int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};
  Jarvis_and_index(convex, n_convex, points_to_convex_ind);

  int n1 = n_convex;
  int n2 = 4;

  for (int i = 0; i < n1; i++) {
    ps1[i].x = (double)convex[i].x;
    ps1[i].y = (double)convex[i].y;
  }

  for (int i = 0; i < n2; i++) {
    ps2[i].x = (double)q[i * 2];
    ps2[i].y = (double)q[i * 2 + 1];
  }

  // Identity mapping: hull vertex i writes to gradient slot i.
  int polygon_index_box_index[18];
  for (int i = 0; i < n1; i++) {
    polygon_index_box_index[i] = i;
    polygon_index_box_index[i + n1] = i;
  }

  // Gradients, per hull vertex, of: the hull area (grad_A), the intersection
  // area (grad_AB) and the enclosing hull area (grad_C).
  double grad_A[18] = {};
  double grad_AB[18] = {};
  double grad_C[18] = {};

  double inter_area = intersectAreaO(ps1, n1, ps2, n2, grad_AB);
  double S_pred =
      polygon_area_grad(ps1, n1, polygon_index_box_index, n1, grad_A);
  // fabs(S_pred) is used below, so flip the gradient for a negative area.
  if (S_pred < 0) {
    for (int i = 0; i < n_convex * 2; i++) {
      grad_A[i] = -grad_A[i];
    }
  }
  double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;

  double iou = inter_area / union_area;
  double polygon_area = intersectAreaPoly(ps1, n1, ps2, n2, grad_C);

  //    printf("%d:live\n", idx);
  double rot_giou = iou - (polygon_area - union_area) / polygon_area;

  float grad_point_temp[18] = {};

  // Combine the three gradients with the chain rule and scatter the result
  // back to the original point order.
  for (int i = 0; i < n_convex; i++) {
    int grad_point = points_to_convex_ind[i];
    grad_point_temp[2 * grad_point] =
        (float)((union_area + inter_area) / (union_area * union_area) *
                    grad_AB[2 * i] -
                iou / union_area * grad_A[2 * i] -
                1 / polygon_area * (grad_AB[2 * i] - grad_A[2 * i]) -
                (union_area) / polygon_area / polygon_area * grad_C[2 * i]);
    grad_point_temp[2 * grad_point + 1] =
        (float)((union_area + inter_area) / (union_area * union_area) *
                    grad_AB[2 * i + 1] -
                iou / union_area * grad_A[2 * i + 1] -
                1 / polygon_area * (grad_AB[2 * i + 1] - grad_A[2 * i + 1]) -
                (union_area) / polygon_area / polygon_area * grad_C[2 * i + 1]);
  }

  for (int i = 0; i < 9; i++) {
    point_grad[2 * i] = grad_point_temp[2 * i];
    point_grad[2 * i + 1] = grad_point_temp[2 * i + 1];
  }
  return (float)rot_giou;
}

// For every predicted point set `index` (18 values) and the box at the same
// index (8 values), writes 19 values to point_grad: the 18 gradient entries
// produced by devrIoU() followed by the score itself.
// gt_n_boxes is not read by this kernel.
template <typename T>
__global__ void convex_giou_musa_kernel(const int ex_n_boxes,
                                        const int gt_n_boxes, const T* ex_boxes,
                                        const T* gt_boxes, T* point_grad) {
  MUSA_1D_KERNEL_LOOP(index, ex_n_boxes) {
    T* out_row = point_grad + index * 19;
    const T* pred_points = ex_boxes + index * 18;
    const T* gt_corners = gt_boxes + index * 8;
    const T score = devrIoU(pred_points, gt_corners, out_row, threadIdx.x);
    // The score is stored right after the 18 gradient values.
    out_row[18] = score;
  }
}

// Intersects the line through a and b with the line through c and d.
// Returns 2 when both c and d lie on line ab (within EPS), 0 when the signed
// offsets of c and d from line ab are equal within EPS, and 1 otherwise; p is
// written only in the last case.
__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p) {
  const double s1 = cross(a, b, c);
  const double s2 = cross(a, b, d);

  if (sig(s1) == 0 && sig(s2) == 0) {
    return 2;
  }
  if (sig(s2 - s1) == 0) {
    return 0;
  }

  p.x = (c.x * s2 - d.x * s1) / (s2 - s1);
  p.y = (c.y * s2 - d.y * s1) / (s2 - s1);
  return 1;
}

// Clips the polygon p[0..n-1] in place against the half-plane strictly to
// the left of the directed line a -> b; n receives the new vertex count.
// p[n] is used as a closing slot.
__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b) {
  Point clipped[MAXN];
  int count = 0;
  p[n] = p[0];

  for (int v = 0; v < n; ++v) {
    const int side_here = sig(cross(a, b, p[v]));
    const int side_next = sig(cross(a, b, p[v + 1]));
    // Vertices strictly inside the half-plane survive.
    if (side_here > 0) {
      clipped[count] = p[v];
      ++count;
    }
    // An edge that changes side contributes its intersection with line ab.
    if (side_here != side_next) {
      lineCross(a, b, p[v], p[v + 1], clipped[count]);
      ++count;
    }
  }

  // Copy back, skipping points that repeat their predecessor.
  n = 0;
  for (int k = 0; k < count; ++k) {
    if (k != 0 && point_same(clipped[k], clipped[k - 1])) continue;
    p[n] = clipped[k];
    ++n;
  }

  // Drop trailing points that coincide with the first one.
  while (n > 1 && point_same(p[n - 1], p[0])) {
    --n;
  }
}

// Signed area of the intersection of triangle (o, a, b) with triangle
// (o, c, d), where o is the origin.  Returns 0 when either triangle is
// degenerate; the result is negated when the triangles have opposite
// orientation.
__device__ inline double intersectArea(Point a, Point b, Point c, Point d) {
  Point origin(0, 0);
  const int orient_ab = sig(cross(origin, a, b));
  const int orient_cd = sig(cross(origin, c, d));
  if (orient_ab == 0 || orient_cd == 0) return 0.0;

  // Reorder each triangle so that its cross product is positive.
  if (orient_ab == -1) swap1(&a, &b);
  if (orient_cd == -1) swap1(&c, &d);

  // Clip triangle (o, a, b) against the three edges of triangle (o, c, d).
  Point p[10] = {origin, a, b};
  int count = 3;
  polygon_cut(p, count, origin, c);
  polygon_cut(p, count, c, d);
  polygon_cut(p, count, d, origin);

  const double clipped_area = area(p, count);
  return (orient_ab * orient_cd == -1) ? -clipped_area : clipped_area;
}
// Intersection area of polygons ps1 (n1 vertices) and ps2 (n2 vertices),
// obtained by summing the signed origin-triangle intersections of every edge
// pair.  Both polygons may be reversed in place so that their signed area is
// non-negative, and slot [n] of each buffer is overwritten with vertex 0.
__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2,
                                        int n2) {
  // Normalise the orientation of both rings.
  if (area(ps1, n1) < 0) {
    reverse1(ps1, n1);
  }
  if (area(ps2, n2) < 0) {
    reverse1(ps2, n2);
  }

  // Close both rings so that edge k runs from vertex k to vertex k + 1.
  ps1[n1] = ps1[0];
  ps2[n2] = ps2[0];

  double total = 0;
  for (int e1 = 0; e1 < n1; ++e1) {
    for (int e2 = 0; e2 < n2; ++e2) {
      total += intersectArea(ps1[e1], ps1[e1 + 1], ps2[e2], ps2[e2 + 1]);
    }
  }
  return total;
}

// IoU between the convex hull of 9 points (p: 18 values, x/y interleaved)
// and a 4-point box (q: 8 values, x/y interleaved).
template <typename T>
__device__ inline float devrIoU(T const* const p, T const* const q) {
  // Load the 9 input points and reduce them to their convex hull.
  Point hull[MAXN];
  for (int k = 0; k < 9; ++k) {
    hull[k].x = (double)p[2 * k];
    hull[k].y = (double)p[2 * k + 1];
  }
  int hull_size = 9;
  int hull_to_input[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};
  Jarvis_and_index(hull, hull_size, hull_to_input);

  // Working copies; the area helpers need one spare slot past the last
  // vertex and may reorder the points.
  Point pred[MAXN], box[MAXN];
  const int pred_count = hull_size;
  for (int k = 0; k < pred_count; ++k) {
    pred[k].x = (double)hull[k].x;
    pred[k].y = (double)hull[k].y;
  }
  const int box_count = 4;
  for (int k = 0; k < box_count; ++k) {
    box[k].x = (double)q[2 * k];
    box[k].y = (double)q[2 * k + 1];
  }

  const double inter_area = intersectAreaO(pred, pred_count, box, box_count);
  const double pred_area = area(pred, pred_count);
  const double box_area = area(box, box_count);
  const double union_area = fabs(pred_area) + fabs(box_area) - inter_area;
  const double iou = inter_area / union_area;
  return (float)iou;
}

// Fills iou (ex_n_boxes x gt_n_boxes, row-major) with the IoU of every
// predicted point set (18 values each) against every box (8 values each).
template <typename T>
__global__ void convex_iou_musa_kernel(const int ex_n_boxes,
                                       const int gt_n_boxes, const T* ex_boxes,
                                       const T* gt_boxes, T* iou) {
  MUSA_1D_KERNEL_LOOP(index, ex_n_boxes) {
    const T* pred_points = ex_boxes + index * 18;
    T* out_row = iou + index * gt_n_boxes;
    const T* gt_corners = gt_boxes;
    for (int g = 0; g < gt_n_boxes; ++g) {
      out_row[g] = devrIoU(pred_points, gt_corners);
      gt_corners += 8;
    }
  }
}
#endif  // CONVEX_IOU_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/correlation_musa.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/ClementPinard/Pytorch-Correlation-extension/blob/master/Correlation_Module/correlation_cuda_kernel.cu
// Original licence: Under MIT License

#ifndef CORRELATION_MUSA
#define CORRELATION_MUSA

#include "pytorch_musa_helper.hpp"

#include <musa.h>
#include <musa_runtime.h>
// Using <torch/extension.h> is recommended in the official documentation in
// https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-the-c-op.
// However, we use <torch/types.h> for compatibility with MUSA 9.0
// Read https://github.com/pytorch/extension-cpp/issues/35 for more details.
#include <torch/types.h>

#include <iostream>
#include <vector>

using namespace torch;

// Shorthand for 4-D / 5-D packed tensor accessors with 32-bit indexing and
// restrict pointer traits; `scalar_t` must be in scope where they are used.
#define TensorAcc4R PackedTensorAccessor32<scalar_t, 4, RestrictPtrTraits>
#define TensorAcc5R PackedTensorAccessor32<scalar_t, 5, RestrictPtrTraits>
// True when 0 <= x < H and 0 <= y < W.  The arguments are not parenthesised
// in the expansion, so pass simple expressions only.
#define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W)

// Stride of the per-thread channel loop in the forward kernel.
#define WARP_SIZE 32
// Participation mask passed to __shfl_down_sync (all 32 lanes).
#define FULL_MASK 0xffffffff

// Forward correlation kernel.
//
// For batch element n (blockIdx.x) and output location (h, w) (taken from the
// y / z block and thread indices), and for every patch displacement
// (ph, pw), accumulates rInput1[n][i1][j1][c] * rInput2[n][i2][j2][c] over
// the kH x kW kernel window and all channels c, and stores the sum in
// output[n][ph][pw][h][w].
//
// Both inputs are indexed as [n][row][col][channel].  Threads along x split
// the channel loop with stride WARP_SIZE and their partial sums are combined
// with warp shuffles, so the kernel presumably expects blockDim.x ==
// WARP_SIZE -- TODO confirm against the launcher.
template <typename scalar_t>
__global__ void correlation_forward_musa_kernel(
    const TensorAcc4R rInput1, const TensorAcc4R rInput2, TensorAcc5R output,
    int kH, int kW, int patchH, int patchW, int padH, int padW, int dilationH,
    int dilationW, int dilation_patchH, int dilation_patchW, int dH, int dW,
    int oH, int oW) {
  const int iH = rInput1.size(1);
  const int iW = rInput1.size(2);
  const int C = rInput1.size(3);

  const int n = blockIdx.x;
  const int h = blockIdx.y * blockDim.y + threadIdx.y;
  const int w = blockIdx.z * blockDim.z + threadIdx.z;

  // Threads mapped past the output extent have nothing to do.
  if (h >= oH || w >= oW) return;

  const int thread = threadIdx.x;

  // Top-left corner of the kernel window in input1 coordinates.
  const int start_i = -padH + h * dH;
  const int start_j = -padW + w * dW;

  // Half extent of the (dilated) displacement patch.
  const int patchRadH = dilation_patchH * (patchH - 1) / 2;
  const int patchRadW = dilation_patchW * (patchW - 1) / 2;

  for (int ph = 0; ph < patchH; ++ph) {
    int ph_dilated = ph * dilation_patchH - patchRadH;
    for (int pw = 0; pw < patchW; ++pw) {
      int pw_dilated = pw * dilation_patchW - patchRadW;
      scalar_t prod_sum = 0.0f;
      for (int i = 0; i < kH; ++i) {
        // i1 indexes input1, i2 the displaced row in input2; both must be
        // valid rows.
        int i1 = start_i + i * dilationH;
        int i2 = i1 + ph_dilated;
        if (WITHIN_BOUNDS(i1, i2, iH, iH)) {
          for (int j = 0; j < kW; ++j) {
            // Same for the columns.
            int j1 = start_j + j * dilationW;
            int j2 = j1 + pw_dilated;
            if (WITHIN_BOUNDS(j1, j2, iW, iW)) {
              // Each thread handles channels thread, thread + WARP_SIZE, ...
              for (int c = thread; c < C; c += WARP_SIZE) {
                scalar_t v1 = rInput1[n][i1][j1][c];
                scalar_t v2 = rInput2[n][i2][j2][c];
                prod_sum += v1 * v2;
              }
            }
          }
        }
      }
      // Reduce the per-thread partial sums with shuffle-down steps of
      // 16, 8, 4, 2, 1; the value is passed through float for the shuffle.
      for (int offset = 16; offset > 0; offset /= 2)
#ifdef MMCV_WITH_HIP
        prod_sum += __shfl_down(float(prod_sum), offset);
#else
        prod_sum += __shfl_down_sync(FULL_MASK, float(prod_sum), offset);
#endif
      // Thread 0 holds the complete sum after the reduction.
      if (thread == 0) {
        output[n][ph][pw][h][w] = prod_sum;
      }
    }
  }
}

// Backward correlation kernel for the first input.
//
// One block handles one (n, h, w) location of grad_input1 (blockIdx.x/y/z).
// Phase 1: for every patch displacement (ph, pw), the threads sum grad_output
// over all output positions whose kernel window covers (h, w) and cache the
// result in dynamic shared memory.
// Phase 2: for every channel c, grad_input1[n][c][h][w] is the sum over the
// patch of input2[n][i1][j1][c] * cached gradient.
//
// input2 is indexed [n][row][col][channel]; grad_input1 is indexed
// [n][channel][row][col].  The dynamic shared buffer is indexed up to
// patchH * patchW - 1 as scalar_t, so the launch must provide at least that
// much shared memory -- presumably set by the host launcher, TODO confirm.
template <typename scalar_t>
__global__ void correlation_backward_musa_kernel_input1(
    const TensorAcc5R grad_output, const TensorAcc4R input2,
    TensorAcc4R grad_input1, const int kH, const int kW, const int patchH,
    const int patchW, const int padH, const int padW, const int dilationH,
    const int dilationW, const int dilation_patchH, const int dilation_patchW,
    const int dH, const int dW) {
  const int iH = input2.size(1);
  const int iW = input2.size(2);
  const int C = input2.size(3);

  const int H = grad_output.size(3);
  const int W = grad_output.size(4);

  // Patch radius in patch steps; the patch dilation is applied below.
  const int patchRadH = (patchH - 1) / 2;
  const int patchRadW = (patchW - 1) / 2;

  const int n = blockIdx.x;
  const int h = blockIdx.y;
  const int w = blockIdx.z;

  // Position in padded coordinates and the (exclusive) lower end of the
  // range of window origins that can reach it.
  const int h_2 = h + padH;
  const int w_2 = w + padW;
  const int min_h = h_2 - kH * dilationH;
  const int min_w = w_2 - kW * dilationW;

  // Dynamic shared memory viewed as scalar_t: one cached gradient per patch
  // displacement.
  extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];
  scalar_t *grad_cache = reinterpret_cast<scalar_t *>(grad_cache_char);
  for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {
    const int ph = i / patchW;
    const int pw = i % patchW;
    // Location in input2 paired with (h, w) for this displacement.
    int i1 = h + dilation_patchH * (ph - patchRadH);
    int j1 = w + dilation_patchW * (pw - patchRadW);

    // Entries for out-of-range displacements are left unwritten; phase 2
    // skips them with the same test.
    if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
      scalar_t grad_val = 0.0f;
      for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
        // Only window origins that are exact multiples of the stride map to
        // an output row.
        int i2 = (h_3) / dH;
        if (i2 * dH != h_3) continue;
        for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
          int j2 = (w_3) / dW;
          if (j2 * dW != w_3) continue;
          if (WITHIN_BOUNDS(i2, j2, H, W)) {
            grad_val += grad_output[n][ph][pw][i2][j2];
          }
        }
      }
      grad_cache[i] = grad_val;
    }
  }
  // All cache entries must be written before any thread reads them.
  __syncthreads();

  for (int c = threadIdx.x; c < C; c += blockDim.x) {
    scalar_t grad_input_val = 0.0f;
    for (int ph = 0; ph < patchH; ++ph) {
      int i1 = h + dilation_patchH * (ph - patchRadH);
      for (int pw = 0; pw < patchW; ++pw) {
        int j1 = w + dilation_patchW * (pw - patchRadW);
        if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
          grad_input_val += input2[n][i1][j1][c] * grad_cache[ph * patchW + pw];
        }
      }
    }
    grad_input1[n][c][h][w] = grad_input_val;
  }
}

// Backward correlation kernel for the second input.
//
// One block handles one (n, h, w) location of grad_input2 (blockIdx.x/y/z).
// Phase 1: for every patch displacement (ph, pw), the matching location
// (i1, j1) in input1 is found by subtracting the displacement from (h, w);
// the threads sum grad_output over all output positions whose kernel window
// covers (i1, j1) and cache the result in dynamic shared memory.
// Phase 2: for every channel c, grad_input2[n][c][h][w] is the sum over the
// patch of input1[n][i1][j1][c] * cached gradient.
//
// input1 is indexed [n][row][col][channel]; grad_input2 is indexed
// [n][channel][row][col].  The dynamic shared buffer is indexed up to
// patchH * patchW - 1 as scalar_t, so the launch must provide at least that
// much shared memory -- presumably set by the host launcher, TODO confirm.
template <typename scalar_t>
__global__ void correlation_backward_musa_kernel_input2(
    const TensorAcc5R grad_output, const TensorAcc4R input1,
    TensorAcc4R grad_input2, int kH, int kW, int patchH, int patchW, int padH,
    int padW, int dilationH, int dilationW, int dilation_patchH,
    int dilation_patchW, int dH, int dW) {
  const int iH = input1.size(1);
  const int iW = input1.size(2);
  const int C = input1.size(3);

  // Patch radius in patch steps; the patch dilation is applied below.
  const int patchRadH = (patchH - 1) / 2;
  const int patchRadW = (patchW - 1) / 2;

  const int H = grad_output.size(3);
  const int W = grad_output.size(4);

  // Extent of the dilated kernel window.
  const int dilatedKH = kH * dilationH;
  const int dilatedKW = kW * dilationW;

  const int n = blockIdx.x;
  const int h = blockIdx.y;
  const int w = blockIdx.z;

  // Dynamic shared memory viewed as scalar_t: one cached gradient per patch
  // displacement.
  extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];
  scalar_t *grad_cache = reinterpret_cast<scalar_t *>(grad_cache_char);
  for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {
    const int ph = i / patchW;
    const int pw = i % patchW;
    // Location in input1 that pairs with (h, w) for this displacement.
    int i1 = h - dilation_patchH * (ph - patchRadH);
    int j1 = w - dilation_patchW * (pw - patchRadW);

    // Entries for out-of-range displacements are left unwritten; phase 2
    // skips them with the same test.
    if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
      scalar_t grad_val = 0.0f;

      // (i1, j1) in padded coordinates and the (exclusive) lower end of the
      // range of window origins that can reach it.
      const int h_2 = i1 + padH;
      const int w_2 = j1 + padW;
      const int min_h = h_2 - dilatedKH;
      const int min_w = w_2 - dilatedKW;

      for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
        // Only window origins that are exact multiples of the stride map to
        // an output row.
        int i2 = (h_3) / dH;
        if (i2 * dH != h_3) continue;
        for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
          int j2 = (w_3) / dW;
          if (j2 * dW != w_3) continue;
          if (WITHIN_BOUNDS(i2, j2, H, W)) {
            grad_val += grad_output[n][ph][pw][i2][j2];
          }
        }
      }
      grad_cache[i] = grad_val;
    }
  }
  // All cache entries must be written before any thread reads them.
  __syncthreads();

  for (int c = threadIdx.x; c < C; c += blockDim.x) {
    scalar_t grad_input_val = 0.0f;
    for (int ph = 0; ph < patchH; ++ph) {
      int i1 = h - dilation_patchH * (ph - patchRadH);
      for (int pw = 0; pw < patchW; ++pw) {
        int j1 = w - dilation_patchW * (pw - patchRadW);
        if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
          grad_input_val += input1[n][i1][j1][c] * grad_cache[ph * patchW + pw];
        }
      }
    }
    grad_input2[n][c][h][w] = grad_input_val;
  }
}
#endif


================================================
FILE: mmcv/ops/csrc/common/musa/deform_conv_musa_kernel.muh
================================================
/*!
 ******************* BEGIN Caffe Copyright Notice and Disclaimer
 *****************
 *
 * COPYRIGHT
 *
 * All contributions by the University of California:
 * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
 * All rights reserved.
 *
 * All other contributions:
 * Copyright (c) 2014-2017, the respective contributors
 * All rights reserved.
 *
 * Caffe uses a shared copyright model: each contributor holds copyright over
 * their contributions to Caffe. The project versioning records all such
 * contribution and copyright details. If a contributor wants to further mark
 * their specific copyright on a particular contribution, they should indicate
 * their copyright solely in the commit message of the change when it is
 * committed.
 *
 * LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
 *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * CONTRIBUTION AGREEMENT
 *
 * By contributing to the BVLC/caffe repository through pull-request, comment,
 * or otherwise, the contributor releases their content to the
 * license and copyright terms herein.
 *
 ***************** END Caffe Copyright Notice and Disclaimer
 *********************
 *
 * Copyright (c) 2018 Microsoft
 * Licensed under The MIT License [see LICENSE for details]
 * \file modulated_deformable_im2col.muh
 * \brief Function definitions of converting an image to
 * column matrix based on kernel, padding, dilation, and offset.
 * These functions are mainly used in deformable convolution operators.
 * \ref: https://arxiv.org/abs/1703.06211
 * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
 */

// modified from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu

#ifndef DEFORM_CONV_MUSA_KERNEL_MUH
#define DEFORM_CONV_MUSA_KERNEL_MUH

#include <float.h>
#include "pytorch_musa_helper.hpp"


// Bilinearly samples one image plane at the fractional position (h, w).
//   input      : first element of the plane being sampled
//   data_width : row stride of `input`, in elements
//   height     : number of readable rows
//   width      : number of readable columns
// Returns 0 when the position is a full pixel or more outside the plane;
// corners that fall outside the plane contribute 0 to the blend.
template <typename T>
__device__ T deformable_im2col_bilinear(const T *input, const int data_width,
                                        const int height, const int width, T h,
                                        T w) {
  const bool out_of_range = h <= -1 || height <= h || w <= -1 || width <= w;
  if (out_of_range) return 0;

  // Integer corners surrounding the sample.
  const int row_lo = floorf(h);
  const int col_lo = floorf(w);
  const int row_hi = row_lo + 1;
  const int col_hi = col_lo + 1;

  // Fractional distance from the low corner, and its complement.
  const T frac_h = h - row_lo;
  const T frac_w = w - col_lo;
  const T rest_h = 1 - frac_h;
  const T rest_w = 1 - frac_w;

  const bool row_lo_ok = row_lo >= 0;
  const bool col_lo_ok = col_lo >= 0;
  const bool row_hi_ok = row_hi <= height - 1;
  const bool col_hi_ok = col_hi <= width - 1;

  // Only corners inside the plane are read; the others stay 0.
  T top_left = 0;
  if (row_lo_ok && col_lo_ok) top_left = input[row_lo * data_width + col_lo];
  T top_right = 0;
  if (row_lo_ok && col_hi_ok) top_right = input[row_lo * data_width + col_hi];
  T bottom_left = 0;
  if (row_hi_ok && col_lo_ok)
    bottom_left = input[row_hi * data_width + col_lo];
  T bottom_right = 0;
  if (row_hi_ok && col_hi_ok)
    bottom_right = input[row_hi * data_width + col_hi];

  const T wt_tl = rest_h * rest_w;
  const T wt_tr = rest_h * frac_w;
  const T wt_bl = frac_h * rest_w;
  const T wt_br = frac_h * frac_w;

  const T blended = (wt_tl * top_left + wt_tr * top_right +
                     wt_bl * bottom_left + wt_br * bottom_right);
  return blended;
}

// Bilinear weight that the integer pixel (h, w) receives from a sample taken
// at the fractional position (argmax_h, argmax_w).
// Returns 0 when the sample lies a full pixel or more outside the
// height x width plane, or when (h, w) is not one of the four integer corners
// surrounding the sample.
template <typename T>
__device__ T get_gradient_weight(T argmax_h, T argmax_w, const int h,
                                 const int w, const int height,
                                 const int width) {
  const bool sample_outside = argmax_h <= -1 || argmax_h >= height ||
                              argmax_w <= -1 || argmax_w >= width;
  if (sample_outside) return 0;

  const int row_lo = floorf(argmax_h);
  const int col_lo = floorf(argmax_w);
  const int row_hi = row_lo + 1;
  const int col_hi = col_lo + 1;

  const bool on_row_lo = (h == row_lo);
  const bool on_row_hi = (h == row_hi);
  const bool on_col_lo = (w == col_lo);
  const bool on_col_hi = (w == col_hi);

  // The four corner cases exclude one another because hi == lo + 1.
  T weight = 0;
  if (on_row_lo && on_col_lo) {
    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
  } else if (on_row_lo && on_col_hi) {
    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
  } else if (on_row_hi && on_col_lo) {
    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
  } else if (on_row_hi && on_col_hi) {
    weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
  }
  return weight;
}

// Derivative of the bilinear sample of `im_data` taken at
// (argmax_h, argmax_w) with respect to one of the two coordinates.
//   bp_dir == 0 : derivative along h (high-row corners minus low-row corners)
//   bp_dir == 1 : derivative along w (high-col corners minus low-col corners)
// Any other bp_dir, or a sample a full pixel or more outside the plane,
// yields 0. Corners outside the plane are skipped.
template <typename T>
__device__ T get_coordinate_weight(T argmax_h, T argmax_w, const int height,
                                   const int width, const T *im_data,
                                   const int data_width, const int bp_dir) {
  const bool sample_outside = argmax_h <= -1 || argmax_h >= height ||
                              argmax_w <= -1 || argmax_w >= width;
  if (sample_outside) return 0;

  const int row_lo = floorf(argmax_h);
  const int col_lo = floorf(argmax_w);
  const int row_hi = row_lo + 1;
  const int col_hi = col_lo + 1;

  // Which of the four surrounding corners may be read.
  const bool has_tl = row_lo >= 0 && col_lo >= 0;
  const bool has_tr = row_lo >= 0 && col_hi <= width - 1;
  const bool has_bl = row_hi <= height - 1 && col_lo >= 0;
  const bool has_br = row_hi <= height - 1 && col_hi <= width - 1;

  // Flat positions of the corners (only dereferenced when valid).
  const int at_tl = row_lo * data_width + col_lo;
  const int at_tr = row_lo * data_width + col_hi;
  const int at_bl = row_hi * data_width + col_lo;
  const int at_br = row_hi * data_width + col_hi;

  T weight = 0;
  switch (bp_dir) {
    case 0:
      if (has_tl) weight += -1 * (col_lo + 1 - argmax_w) * im_data[at_tl];
      if (has_tr) weight += -1 * (argmax_w - col_lo) * im_data[at_tr];
      if (has_bl) weight += (col_lo + 1 - argmax_w) * im_data[at_bl];
      if (has_br) weight += (argmax_w - col_lo) * im_data[at_br];
      break;
    case 1:
      if (has_tl) weight += -1 * (row_lo + 1 - argmax_h) * im_data[at_tl];
      if (has_tr) weight += (row_lo + 1 - argmax_h) * im_data[at_tr];
      if (has_bl) weight += -1 * (argmax_h - row_lo) * im_data[at_bl];
      if (has_br) weight += (argmax_h - row_lo) * im_data[at_br];
      break;
    default:
      break;
  }

  return weight;
}

// Deformable im2col: for every (input channel, batch item, output row, output
// column) handled by the loop, writes kernel_h * kernel_w bilinearly sampled
// input values into data_col, one per kernel tap. Each tap's sampling position
// is the regular convolution position shifted by the (h, w) offsets read from
// data_offset; taps that land outside the image are written as 0.
template <typename T>
__global__ void deformable_im2col_gpu_kernel(
    const int n, const T *data_im, const T *data_offset, const int height,
    const int width, const int kernel_h, const int kernel_w, const int pad_h,
    const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int num_channels, const int deformable_group, const int height_col,
    const int width_col, T *data_col) {
  MUSA_1D_KERNEL_LOOP(index, n) {
    // Unravel the flat index as (c_im, b_col, h_col, w_col), w_col fastest.
    const int w_col = index % width_col;
    const int rows_done = index / width_col;
    const int h_col = rows_done % height_col;
    const int planes_done = rows_done / height_col;
    const int b_col = planes_done % batch_size;
    const int c_im = planes_done / batch_size;

    // Offsets are shared by all channels of one deformable group.
    const int group = c_im / channel_per_deformable_group;

    // Top-left input position of the undeformed receptive field.
    const int h_in = h_col * stride_h - pad_h;
    const int w_in = w_col * stride_w - pad_w;

    // First column channel written by this index, and the step between taps.
    const int c_col = c_im * kernel_h * kernel_w;
    const int col_plane = batch_size * height_col * width_col;
    T *col_out =
        data_col +
        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;

    const T *im_plane =
        data_im + (b_col * num_channels + c_im) * height * width;
    const T *offsets =
        data_offset + (b_col * deformable_group + group) * 2 * kernel_h *
                          kernel_w * height_col * width_col;

    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j, col_out += col_plane) {
        // Each tap owns two consecutive offset channels: h first, then w.
        const int tap = i * kernel_w + j;
        const int off_h_at =
            ((2 * tap) * height_col + h_col) * width_col + w_col;
        const int off_w_at =
            ((2 * tap + 1) * height_col + h_col) * width_col + w_col;
        const T offset_h = offsets[off_h_at];
        const T offset_w = offsets[off_w_at];

        const T h_im = h_in + i * dilation_h + offset_h;
        const T w_im = w_in + j * dilation_w + offset_w;
        const bool inside =
            h_im > -1 && w_im > -1 && h_im < height && w_im < width;

        T val = static_cast<T>(0);
        if (inside) {
          val = deformable_im2col_bilinear(im_plane, width, height, width,
                                           h_im, w_im);
        }
        *col_out = val;
      }
    }
  }
}

// Deformable col2im (gradient w.r.t. the input image): every column-buffer
// element is scattered back onto the integer pixels that lie less than one
// pixel from its deformed sampling position, weighted by
// get_gradient_weight. Accumulation into grad_im uses atomicAdd because
// several column elements can hit the same pixel.
template <typename T>
__global__ void deformable_col2im_gpu_kernel(
    const int n, const T *data_col, const T *data_offset, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int deformable_group, const int height_col, const int width_col,
    T *grad_im) {
  MUSA_1D_KERNEL_LOOP(index, n) {
    // Unravel the flat index as (c, i, j, b, h_out, w_out), w_out fastest.
    const int w_out = index % width_col;
    const int h_out = (index / width_col) % height_col;
    const int per_plane = index / width_col / height_col;
    const int b = per_plane % batch_size;
    const int per_batch = per_plane / batch_size;
    const int j = per_batch % kernel_w;
    const int i = (per_batch / kernel_w) % kernel_h;
    const int c = per_batch / kernel_w / kernel_h;

    const int group = c / channel_per_deformable_group;

    const int w_in = w_out * stride_w - pad_w;
    const int h_in = h_out * stride_h - pad_h;

    // Offsets of this kernel tap: channel 2*tap is h, channel 2*tap+1 is w.
    const T *offsets =
        data_offset + (b * deformable_group + group) * 2 * kernel_h *
                          kernel_w * height_col * width_col;
    const int tap = i * kernel_w + j;
    const int off_h_at = ((2 * tap) * height_col + h_out) * width_col + w_out;
    const int off_w_at =
        ((2 * tap + 1) * height_col + h_out) * width_col + w_out;
    const T offset_h = offsets[off_h_at];
    const T offset_w = offsets[off_w_at];

    // Deformed sampling position this column element was read from.
    const T sample_h = h_in + i * dilation_h + offset_h;
    const T sample_w = w_in + j * dilation_w + offset_w;

    const T top_grad = data_col[index];
    const int base_h = (int)sample_h;
    const int base_w = (int)sample_w;

    // Visit the 5x5 neighbourhood of the truncated position and keep only
    // in-image pixels closer than one pixel to the sample on both axes.
    for (int dy = -2; dy <= 2; dy++) {
      const int y = base_h + dy;
      if (y < 0 || y >= height || !(abs(sample_h - y) < 1)) continue;
      for (int dx = -2; dx <= 2; dx++) {
        const int x = base_w + dx;
        if (x < 0 || x >= width || !(abs(sample_w - x) < 1)) continue;

        const int grad_pos = ((b * channels + c) * height + y) * width + x;
        const T weight =
            get_gradient_weight(sample_h, sample_w, y, x, height, width);
        atomicAdd(grad_im + grad_pos, weight * top_grad);
      }
    }
  }
}

// Deformable col2im for the offsets: computes one element of grad_offset per
// loop index by summing, over the column channels that use this offset,
// (column gradient) * (derivative of the bilinear sample w.r.t. the offset).
// Each index writes its own grad_offset element, so no atomics are used.
template <typename T>
__global__ void deformable_col2im_coord_gpu_kernel(
    const int n, const T *data_col, const T *data_im, const T *data_offset,
    const int channels, const int height, const int width, const int kernel_h,
    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int offset_channels, const int deformable_group, const int height_col,
    const int width_col, T *grad_offset) {
  MUSA_1D_KERNEL_LOOP(index, n) {
    T val = 0;
    // Unravel the flat index as (b, c, h, w) over the offset tensor,
    // w fastest; c ranges over offset_channels.
    int w = index % width_col;
    int h = (index / width_col) % height_col;
    int c = (index / width_col / height_col) % offset_channels;
    int b = (index / width_col / height_col) / offset_channels;

    // Every deformable group owns 2 * kernel_h * kernel_w offset channels.
    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
    // Distance, in column channels, between the same kernel tap of two
    // consecutive channels.
    const int col_step = kernel_h * kernel_w;
    // Counts loop iterations; used below to step through image planes.
    int cnt = 0;
    // First column channel belonging to this deformable group.
    const T *data_col_ptr = data_col + deformable_group_index *
                                           channel_per_deformable_group *
                                           batch_size * width_col * height_col;
    // First image plane of this (batch item, deformable group).
    // NOTE(review): the division by kernel_h and kernel_w suggests
    // channel_per_deformable_group counts column channels (image channels
    // times kernel taps) here - confirm against the launcher.
    const T *data_im_ptr =
        data_im + (b * deformable_group + deformable_group_index) *
                      channel_per_deformable_group / kernel_h / kernel_w *
                      height * width;
    const T *data_offset_ptr =
        data_offset + (b * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;

    // Offset channel index inside the group: offset_c / 2 is the kernel tap,
    // offset_c % 2 selects the h (0) or w (1) component.
    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;

    // Walk over the column channels of this group that share the kernel tap.
    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
         col_c += col_step) {
      const int col_pos =
          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
      const int bp_dir = offset_c % 2;

      // Recover the kernel tap (i, j) and the output position from col_pos.
      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
      int i =
          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
      int w_out = col_pos % width_col;
      int h_out = (col_pos / width_col) % height_col;
      int w_in = w_out * stride_w - pad_w;
      int h_in = h_out * stride_h - pad_h;
      const int data_offset_h_ptr =
          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
      const int data_offset_w_ptr =
          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
           w_out);
      const T offset_h = data_offset_ptr[data_offset_h_ptr];
      const T offset_w = data_offset_ptr[data_offset_w_ptr];
      // Deformed sampling position of this tap.
      T inv_h = h_in + i * dilation_h + offset_h;
      T inv_w = w_in + j * dilation_w + offset_w;
      // Out-of-image samples are moved to (-2, -2), for which
      // get_coordinate_weight returns 0.
      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
        inv_h = inv_w = -2;
      // The image plane advances by one per iteration (cnt), in step with
      // col_c advancing by one channel's worth of kernel taps.
      const T weight = get_coordinate_weight(inv_h, inv_w, height, width,
                                             data_im_ptr + cnt * height * width,
                                             width, bp_dir);
      val += weight * data_col_ptr[col_pos];
      cnt += 1;
    }

    grad_offset[index] = val;
  }
}

#endif  // DEFORM_CONV_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/deform_roi_pool_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef DEFORM_ROI_POOL_MUSA_KERNEL_MUH
#define DEFORM_ROI_POOL_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"
// Deformable RoI pooling, forward pass. Every loop index produces one output
// element (n, c, ph, pw): the average of bilinear samples taken on a regular
// grid inside the RoI bin. When `offset` is non-NULL the RoI start is shifted
// by gamma * roi_size * offset before sampling.
// Each RoI is read as 5 values: batch index followed by four box coordinates.
template <typename T>
__global__ void deform_roi_pool_forward_musa_kernel(
    const int nthreads, const T* input, const T* rois, const T* offset,
    T* output, const int pooled_height, const int pooled_width,
    const T spatial_scale, const int sampling_ratio, const T gamma,
    const int channels, const int height, const int width) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // Unravel the flat index as (n, c, ph, pw), pw fastest.
    const int pw = index % pooled_width;
    const int ph = (index / pooled_width) % pooled_height;
    const int c = (index / pooled_width / pooled_height) % channels;
    const int n = index / pooled_width / pooled_height / channels;

    const T* roi = rois + n * 5;
    const int batch_ind = roi[0];

    // Do not use rounding; this implementation detail is critical
    T start_w = roi[1] * spatial_scale - 0.5;
    T start_h = roi[2] * spatial_scale - 0.5;
    const T end_w = roi[3] * spatial_scale - 0.5;
    const T end_h = roi[4] * spatial_scale - 0.5;

    const T roi_w = end_w - start_w;
    const T roi_h = end_h - start_h;

    const T bin_h = static_cast<T>(roi_h) / static_cast<T>(pooled_height);
    const T bin_w = static_cast<T>(roi_w) / static_cast<T>(pooled_width);

    const T* plane = input + (batch_ind * channels + c) * height * width;

    // Samples per bin along each axis: fixed by sampling_ratio when it is
    // positive, otherwise derived from the bin size.
    int grid_h = sampling_ratio;
    int grid_w = sampling_ratio;
    if (!(sampling_ratio > 0)) {
      grid_h = static_cast<int>(ceilf(roi_h / pooled_height));
      grid_w = static_cast<int>(ceilf(roi_w / pooled_width));
    }

    // Shift the RoI by the learned per-bin offset (w plane first, h second).
    if (offset != NULL) {
      const T* bin_offset = offset + n * pooled_width * pooled_height * 2 +
                            ph * pooled_width + pw;
      const T shift_w = gamma * roi_w * bin_offset[0];
      const T shift_h = gamma * roi_h * bin_offset[pooled_width * pooled_height];
      start_w += shift_w;
      start_h += shift_h;
    }

    // Average pooling over the sampling grid; the divisor is clamped to 1 so
    // an empty grid yields 0.
    const T count = max(grid_h * grid_w, 1);
    T acc = 0.;
    for (int iy = 0; iy < grid_h; iy++) {
      const T y = start_h + ph * bin_h +
                  static_cast<T>(iy + .5f) * bin_h / static_cast<T>(grid_h);
      for (int ix = 0; ix < grid_w; ix++) {
        const T x = start_w + pw * bin_w +
                    static_cast<T>(ix + .5f) * bin_w / static_cast<T>(grid_w);
        const T sample = bilinear_interpolate(plane, height, width, y, x, index);
        acc += sample;
      }
    }
    output[index] = acc / count;
  }
}

// Deformable RoI pooling, backward pass. For every pooled output element
// (n, c, ph, pw) the incoming gradient is split evenly over the bin's sampling
// grid and scattered (atomicAdd) onto the four input pixels around each
// sample. When `offset` is non-NULL the gradient w.r.t. the per-bin offsets is
// accumulated into grad_offset as well.
template <typename T>
__global__ void deform_roi_pool_backward_musa_kernel(
    const int nthreads, const T* grad_output, const T* input, const T* rois,
    const T* offset, T* grad_input, T* grad_offset, const int pooled_height,
    const int pooled_width, const T spatial_scale, const int sampling_ratio,
    const T gamma, const int channels, const int height, const int width) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    // Each RoI is read as 5 values: batch index, then four box coordinates.
    const T* offset_rois = rois + n * 5;
    int roi_batch_ind = offset_rois[0];
    // Input / gradient planes of the (batch item, channel) this RoI refers to.
    const T* offset_input =
        input + ((roi_batch_ind * channels + c) * height * width);
    T* offset_grad_input =
        grad_input + ((roi_batch_ind * channels + c) * height * width);

    // Do not use rounding; this implementation detail is critical
    T roi_start_w = offset_rois[1] * spatial_scale - 0.5;
    T roi_start_h = offset_rois[2] * spatial_scale - 0.5;
    T roi_end_w = offset_rois[3] * spatial_scale - 0.5;
    T roi_end_h = offset_rois[4] * spatial_scale - 0.5;

    T roi_width = roi_end_w - roi_start_w;
    T roi_height = roi_end_h - roi_start_h;

    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h =
        (sampling_ratio > 0)
            ? sampling_ratio
            : static_cast<int>(ceilf(roi_height / pooled_height));
    int roi_bin_grid_w =
        (sampling_ratio > 0)
            ? sampling_ratio
            : static_cast<int>(ceilf(roi_width / pooled_width));

    // Compute roi offset: the first pooled_height * pooled_width plane holds
    // the w offsets, the second plane the h offsets.
    if (offset != NULL) {
      const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 +
                              ph * pooled_width + pw;
      T offset_roi_w = gamma * roi_width * offset_cur_w[0];
      T offset_roi_h =
          gamma * roi_height * offset_cur_w[pooled_width * pooled_height];
      roi_start_w += offset_roi_w;
      roi_start_h += offset_roi_h;
    }

    // We do average (integral) pooling inside a bin
    // NOTE(review): unlike the forward kernel, count is not clamped to 1. A
    // zero count makes grad_output_this_bin non-finite, but the sampling loops
    // below do not run when either grid size is 0, so the value is unused.
    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
    const T grad_output_this_bin = grad_output[index] / count;

    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
      const T y = roi_start_h + ph * bin_size_h +
                  static_cast<T>(iy + .5f) * bin_size_h /
                      static_cast<T>(roi_bin_grid_h);
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T x = roi_start_w + pw * bin_size_w +
                    static_cast<T>(ix + .5f) * bin_size_w /
                        static_cast<T>(roi_bin_grid_w);

        // Bilinear weights and corner indices of the sample (y, x).
        // NOTE(review): bilinear_interpolate_gradient is defined elsewhere;
        // presumably it reports an out-of-range sample through negative
        // indices, which is what the check below relies on - confirm.
        T w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;
        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
                                      x_low, x_high, y_low, y_high, index);

        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          // Gradient w.r.t. the input: one contribution per corner.
          atomicAdd(offset_grad_input + y_low * width + x_low,
                    grad_output_this_bin * w1);
          atomicAdd(offset_grad_input + y_low * width + x_high,
                    grad_output_this_bin * w2);
          atomicAdd(offset_grad_input + y_high * width + x_low,
                    grad_output_this_bin * w3);
          atomicAdd(offset_grad_input + y_high * width + x_high,
                    grad_output_this_bin * w4);
          if (offset != NULL) {
            // Corner values; the first digit of the suffix is the x corner
            // (0 = low, 1 = high), the second digit the y corner.
            T input_00 = offset_input[y_low * width + x_low];
            T input_10 = offset_input[y_low * width + x_high];
            T input_01 = offset_input[y_high * width + x_low];
            T input_11 = offset_input[y_high * width + x_high];
            // ogx: derivative of the bilinear sample w.r.t. x, times the
            // gamma * roi_width factor applied to the w offset above.
            T ogx = gamma * roi_width * grad_output_this_bin *
                    (input_11 * (y - y_low) + input_10 * (y_high - y) +
                     input_01 * (y_low - y) + input_00 * (y - y_high));
            // ogy: derivative of the bilinear sample w.r.t. y, times the
            // gamma * roi_height factor applied to the h offset above.
            T ogy = gamma * roi_height * grad_output_this_bin *
                    (input_11 * (x - x_low) + input_01 * (x_high - x) +
                     input_10 * (x_low - x) + input_00 * (x - x_high));
            // Same layout as `offset`: w plane first, h plane second. Atomics
            // are needed because every channel c adds into the same bin.
            atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 +
                          ph * pooled_width + pw,
                      ogx);
            atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 +
                          pooled_width * pooled_height + ph * pooled_width + pw,
                      ogy);
          }
        }
      }
    }
  }
}

#endif  // DEFORM_ROI_POOL_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/diff_iou_rotated_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Adapted from
// https://github.com/lilanxiao/Rotated_IoU/cuda_op/sort_vert_kernel.cu  # noqa
#include "pytorch_musa_helper.hpp"

#define MAX_NUM_VERT_IDX 9
#define INTERSECTION_OFFSET 8
#define EPSILON 1e-8

// Picks a thread count for `work_size` items: the largest power of two that
// does not exceed work_size, capped at THREADS_PER_BLOCK and never below 1.
//
// The power of two is found with integer arithmetic only. The earlier
// log()-based form converted log(work_size) / log(2) to int, which is
// undefined for work_size <= 0 (the quotient is -inf or NaN) and depends on
// floating-point rounding for exact powers of two. Non-positive work sizes
// now return 1.
inline int opt_n_thread(int work_size) {
  int n_thread = 1;
  // `n_thread <= work_size / 2` is `2 * n_thread <= work_size` without the
  // risk of overflowing the doubled value.
  while (n_thread <= work_size / 2) {
    n_thread <<= 1;
  }
  if (n_thread > THREADS_PER_BLOCK) {
    n_thread = THREADS_PER_BLOCK;
  }
  return n_thread > 1 ? n_thread : 1;
}

/*
Ordering predicate for normalized vertices (vertices around (0,0)).
Returns true if vertex1 < vertex2.
Order: the minimum is at the x-axis and vertices become larger in the
anti-clockwise direction.
*/
__device__ bool compare_vertices(float x1, float y1, float x2, float y2) {
  // Vertices that coincide (within EPSILON) are never "less than" each other.
  const bool same_point = fabs(x1 - x2) < EPSILON && fabs(y2 - y1) < EPSILON;
  if (same_point) return false;

  // Different half-planes: a vertex with y > 0 precedes one with y < 0.
  if (y1 > 0 && y2 < 0) return true;
  if (y1 < 0 && y2 > 0) return false;

  // x * |x| / (x^2 + y^2): the squared cosine of the vertex angle carrying
  // the sign of x; EPSILON keeps the denominator non-zero.
  const float norm1 = x1 * x1 + y1 * y1 + EPSILON;
  const float norm2 = x2 * x2 + y2 * y2 + EPSILON;
  const float diff = fabs(x1) * x1 / norm1 - fabs(x2) * x2 / norm2;

  // Same half-plane: decided by the sign of the difference.
  if (y1 > 0 && y2 > 0) return diff > EPSILON;
  if (y1 < 0 && y2 < 0) return diff < EPSILON;

  // At least one vertex has y == 0 (or is NaN): not ordered.
  return false;
}

// Sorts the valid vertices of every intersection polygon into the order
// defined by compare_vertices and writes their indices to `idx`.
//   vertices  : read as (b, n, m, 2) - x and y of m candidate vertices
//   mask      : read as (b, n, m)    - true where the candidate is valid
//   num_valid : read as (b, n)       - number of valid candidates
//   idx       : written as (b, n, MAX_NUM_VERT_IDX)
// One block handles one batch item (blockIdx.x); the threads of the block
// stride over the n polygons.
__global__ void diff_iou_rotated_sort_vertices_forward_musa_kernel(
    int b, int n, int m, const float *__restrict__ vertices,
    const bool *__restrict__ mask, const int *__restrict__ num_valid,
    int *__restrict__ idx) {
  // Move every pointer to the slice of this block's batch item.
  int batch_idx = blockIdx.x;
  vertices += batch_idx * n * m * 2;
  mask += batch_idx * n * m;
  num_valid += batch_idx * n;
  idx += batch_idx * n * MAX_NUM_VERT_IDX;

  int index = threadIdx.x;  // index of polygon
  int stride = blockDim.x;
  for (int i = index; i < n; i += stride) {
    int pad;  // index of arbitrary invalid intersection point (not box corner!)
    // NOTE(review): pad stays uninitialised when every candidate from
    // INTERSECTION_OFFSET onwards is valid (or m <= INTERSECTION_OFFSET); it
    // is then read below. Confirm callers always leave one slot invalid.
    for (int j = INTERSECTION_OFFSET; j < m; ++j) {
      if (!mask[i * m + j]) {
        pad = j;
        break;
      }
    }
    if (num_valid[i] < 3) {
      // not enough vertices, take an invalid intersection point
      // (zero padding)
      for (int j = 0; j < MAX_NUM_VERT_IDX; ++j) {
        idx[i * MAX_NUM_VERT_IDX + j] = pad;
      }
    } else {
      // sort the valid vertices
      // note the number of valid vertices is known
      // note: check that num_valid[i] < MAX_NUM_VERT_IDX
      // NOTE(review): that bound is not enforced here; a larger num_valid[i]
      // would write past this polygon's MAX_NUM_VERT_IDX slots.
      //
      // Selection sort: output slot j receives the smallest valid vertex that
      // is strictly greater than the vertex stored in slot j - 1.
      for (int j = 0; j < num_valid[i]; ++j) {
        // initialize with a "big" value
        float x_min = 1;
        float y_min = -EPSILON;
        int i_take = 0;
        int i2;
        float x2, y2;
        if (j != 0) {
          // Vertex chosen for the previous slot; (x2, y2) is only read when
          // j != 0.
          i2 = idx[i * MAX_NUM_VERT_IDX + j - 1];
          x2 = vertices[i * m * 2 + i2 * 2 + 0];
          y2 = vertices[i * m * 2 + i2 * 2 + 1];
        }
        for (int k = 0; k < m; ++k) {
          float x = vertices[i * m * 2 + k * 2 + 0];
          float y = vertices[i * m * 2 + k * 2 + 1];
          // Candidate must be valid and smaller than the current minimum ...
          if (mask[i * m + k] && compare_vertices(x, y, x_min, y_min)) {
            // ... and, after the first slot, greater than the previous pick.
            if ((j == 0) || (j != 0 && compare_vertices(x2, y2, x, y))) {
              x_min = x;
              y_min = y;
              i_take = k;
            }
          }
        }
        idx[i * MAX_NUM_VERT_IDX + j] = i_take;
      }
      // duplicate the first idx
      idx[i * MAX_NUM_VERT_IDX + num_valid[i]] = idx[i * MAX_NUM_VERT_IDX + 0];

      // pad zeros
      for (int j = num_valid[i] + 1; j < MAX_NUM_VERT_IDX; ++j) {
        idx[i * MAX_NUM_VERT_IDX + j] = pad;
      }

      // for corner case: the two boxes are exactly the same.
      // in this case, idx would have duplicate elements, which makes the
      // shoelace formula broken because of the definition, the duplicate
      // elements only appear in the first 8 positions (they are "corners in
      // box", not "intersection of edges")
      if (num_valid[i] == 8) {
        // Count how many of the first four sorted entries reappear among
        // entries 4..7.
        int counter = 0;
        for (int j = 0; j < 4; ++j) {
          int check = idx[i * MAX_NUM_VERT_IDX + j];
          for (int k = 4; k < INTERSECTION_OFFSET; ++k) {
            if (idx[i * MAX_NUM_VERT_IDX + k] == check) counter++;
          }
        }
        if (counter == 4) {
          // Keep the first four entries, repeat the first one in slot 4 and
          // pad the remaining slots.
          idx[i * MAX_NUM_VERT_IDX + 4] = idx[i * MAX_NUM_VERT_IDX + 0];
          for (int j = 5; j < MAX_NUM_VERT_IDX; ++j) {
            idx[i * MAX_NUM_VERT_IDX + j] = pad;
          }
        }
      }

      // TODO: still might need to cover some other corner cases :(
    }
  }
}


================================================
FILE: mmcv/ops/csrc/common/musa/furthest_point_sample_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef FURTHEST_POINT_SAMPLE_MUSA_KERNEL_MUH
#define FURTHEST_POINT_SAMPLE_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

// One step of the max reduction: slot idx1 keeps the larger of the two
// distances at idx1 / idx2 together with the index that belongs to it.
// On a tie (or when the comparison is false) the entry of idx1 is kept.
__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i,
                         int idx1, int idx2) {
  const float dist_a = dists[idx1];
  const float dist_b = dists[idx2];
  const int index_a = dists_i[idx1];
  const int index_b = dists_i[idx2];

  dists_i[idx1] = (dist_b > dist_a) ? index_b : index_a;
  dists[idx1] = max(dist_a, dist_b);
}

// Furthest point sampling over 3-D points. One block processes one batch item
// (blockIdx.x) and selects m indices: index 0 first, then repeatedly the point
// whose distance to the already selected set is largest.
//
//   dataset: (B, N, 3)  point coordinates
//   temp:    (B, N)     running squared distance of every point to the
//                       selected set; read and updated in place
//   output:
//        idxs: (B, M)   selected point indices
//
// The threads stride over the points by block_size and reduce through shared
// arrays of block_size entries.
// NOTE(review): this presumes the kernel is launched with
// blockDim.x == block_size - confirm against the launcher.
template <unsigned int block_size>
__global__ void furthest_point_sampling_forward_musa_kernel(
    int b, int n, int m, const float *__restrict__ dataset,
    float *__restrict__ temp, int *__restrict__ idxs) {
  if (m <= 0) return;
  __shared__ float dists[block_size];
  __shared__ int dists_i[block_size];

  int batch_index = blockIdx.x;
  dataset += batch_index * n * 3;
  temp += batch_index * n;
  idxs += batch_index * m;

  int tid = threadIdx.x;
  const int stride = block_size;

  // The first sample is always point 0.
  int old = 0;
  if (threadIdx.x == 0) idxs[0] = old;

  __syncthreads();
  for (int j = 1; j < m; j++) {
    int besti = 0;
    float best = -1;
    // Coordinates of the most recently selected point.
    float x1 = dataset[old * 3 + 0];
    float y1 = dataset[old * 3 + 1];
    float z1 = dataset[old * 3 + 2];
    for (int k = tid; k < n; k += stride) {
      float x2, y2, z2;
      x2 = dataset[k * 3 + 0];
      y2 = dataset[k * 3 + 1];
      z2 = dataset[k * 3 + 2];

      // Squared distance to the new sample, folded into the running minimum.
      float d =
          (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
      float d2 = min(d, temp[k]);
      temp[k] = d2;
      besti = d2 > best ? k : besti;
      best = d2 > best ? d2 : best;
    }
    // Publish this thread's best candidate for the block-wide reduction.
    dists[tid] = best;
    dists_i[tid] = besti;
    __syncthreads();

    // Tree reduction; halvings wider than block_size are skipped by the
    // compile-time comparison.
#pragma unroll
    for (int block_size_thres = 1024; block_size_thres >= 2;
         block_size_thres >>= 1) {
      const int tid_thres = block_size_thres / 2;
      if (block_size >= block_size_thres && tid < tid_thres) {
        __update(dists, dists_i, tid, tid + tid_thres);
      }
      __syncthreads();
    }

    old = dists_i[0];
    if (tid == 0) idxs[j] = old;

    // Every thread must have read dists_i[0] before thread 0 overwrites it
    // with its candidate of the next iteration; without this barrier a thread
    // that runs ahead could corrupt the winner seen by slower threads.
    __syncthreads();
  }
}

// Modified from
// https://github.com/qiqihaer/3DSSD-pytorch/blob/master/lib/pointnet2/src/sampling_gpu.cu
// Furthest point sampling driven by a precomputed pairwise distance matrix.
// One block processes one batch item (blockIdx.x) and selects m indices:
// index 0 first, then repeatedly the point whose distance to the already
// selected set is largest.
//
//   dataset: (B, N, N)  dataset[i * n + k] is the distance between i and k
//   temp:    (B, N)     running distance of every point to the selected set;
//                       read and updated in place
//   output:
//        idxs: (B, M)   selected point indices
//
// The threads stride over the points by block_size and reduce through shared
// arrays of block_size entries.
// NOTE(review): this presumes the kernel is launched with
// blockDim.x == block_size - confirm against the launcher.
template <unsigned int block_size>
__global__ void furthest_point_sampling_with_dist_forward_musa_kernel(
    int b, int n, int m, const float *__restrict__ dataset,
    float *__restrict__ temp, int *__restrict__ idxs) {
  if (m <= 0) return;
  __shared__ float dists[block_size];
  __shared__ int dists_i[block_size];

  int batch_index = blockIdx.x;
  dataset += batch_index * n * n;
  temp += batch_index * n;
  idxs += batch_index * m;

  int tid = threadIdx.x;
  const int stride = block_size;

  // The first sample is always point 0.
  int old = 0;
  if (threadIdx.x == 0) idxs[0] = old;

  __syncthreads();
  for (int j = 1; j < m; j++) {
    int besti = 0;
    float best = -1;
    for (int k = tid; k < n; k += stride) {
      // Distance from the most recently selected point to k, folded into the
      // running minimum.
      float d = dataset[old * n + k];

      float d2 = min(d, temp[k]);
      temp[k] = d2;
      besti = d2 > best ? k : besti;
      best = d2 > best ? d2 : best;
    }
    // Publish this thread's best candidate for the block-wide reduction.
    dists[tid] = best;
    dists_i[tid] = besti;
    __syncthreads();

    // Tree reduction; halvings wider than block_size are skipped by the
    // compile-time comparison.
#pragma unroll
    for (int block_size_thres = 1024; block_size_thres >= 2;
         block_size_thres >>= 1) {
      const int tid_thres = block_size_thres / 2;
      if (block_size >= block_size_thres && tid < tid_thres) {
        __update(dists, dists_i, tid, tid + tid_thres);
      }
      __syncthreads();
    }

    old = dists_i[0];
    if (tid == 0) idxs[j] = old;

    // Every thread must have read dists_i[0] before thread 0 overwrites it
    // with its candidate of the next iteration; without this barrier a thread
    // that runs ahead could corrupt the winner seen by slower threads.
    __syncthreads();
  }
}

#endif  // FURTHEST_POINT_SAMPLE_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/gather_points_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef GATHER_POINTS_MUSA_KERNEL_MUH
#define GATHER_POINTS_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

// Not referenced by the kernels in this header; presumably consumed by the
// launch code that includes it -- verify before changing or removing.
#define TOTAL_THREADS 1024

template <typename T>
__global__ void gather_points_forward_musa_kernel(int b, int c, int n, int m,
                                                  const T *points,
                                                  const int *__restrict__ idx,
                                                  T *out) {
  // points: (B, C, N)
  // idx: (B, M)
  // output:
  //      out: (B, C, M)
  //
  // out[bs, ch, pt] = points[bs, ch, idx[bs, pt]], with bs taken from
  // blockIdx.z and ch from blockIdx.y.

  const int bs_idx = blockIdx.z;
  const int c_idx = blockIdx.y;
  // Neither index changes inside the loop, so the range check is done once.
  if (bs_idx >= b || c_idx >= c) return;

  // Base pointers for this (batch, channel) slice. They are derived once
  // and indexed per iteration; advancing the argument pointers inside the
  // loop body would accumulate offsets if the loop ran more than once for a
  // thread.
  const T *points_row = points + bs_idx * c * n + c_idx * n;
  const int *idx_row = idx + bs_idx * m;
  T *out_row = out + bs_idx * c * m + c_idx * m;

  MUSA_1D_KERNEL_LOOP(pt_idx, m) {
    out_row[pt_idx] = points_row[idx_row[pt_idx]];
  }
}

template <typename T>
__global__ void gather_points_backward_musa_kernel(int b, int c, int n, int m,
                                                   const T *grad_out,
                                                   const int *__restrict__ idx,
                                                   T *grad_points) {
  // grad_out: (B, C, M)
  // idx: (B, M)
  // output:
  //      grad_points: (B, C, N)
  //
  // grad_points[bs, ch, idx[bs, pt]] += grad_out[bs, ch, pt], with bs taken
  // from blockIdx.z and ch from blockIdx.y.

  const int bs_idx = blockIdx.z;
  const int c_idx = blockIdx.y;
  // Neither index changes inside the loop, so the range check is done once.
  if (bs_idx >= b || c_idx >= c) return;

  // Base pointers for this (batch, channel) slice. They are derived once
  // and indexed per iteration; advancing the argument pointers inside the
  // loop body would accumulate offsets if the loop ran more than once for a
  // thread.
  const T *grad_out_row = grad_out + bs_idx * c * m + c_idx * m;
  const int *idx_row = idx + bs_idx * m;
  T *grad_points_row = grad_points + bs_idx * c * n + c_idx * n;

  MUSA_1D_KERNEL_LOOP(pt_idx, m) {
    // Several pt_idx values may map to the same source point, hence the
    // atomic accumulation.
    atomicAdd(grad_points_row + idx_row[pt_idx], grad_out_row[pt_idx]);
  }
}

#endif  // GATHER_POINTS_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/group_points_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu
#ifndef GROUP_POINTS_MUSA_KERNEL_MUH
#define GROUP_POINTS_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

template <typename T>
__global__ void group_points_forward_musa_kernel(int b, int c, int n,
                                                 int npoints, int nsample,
                                                 const T *points,
                                                 const int *__restrict__ idx,
                                                 T *out) {
  // points: (B, C, N)
  // idx: (B, npoints, nsample)
  // output:
  //      out: (B, C, npoints, nsample)
  //
  // out[bs, ch, pt, s] = points[bs, ch, idx[bs, pt, s]], with bs taken from
  // blockIdx.z and ch from blockIdx.y.
  const int bs_idx = blockIdx.z;
  const int c_idx = blockIdx.y;
  // Neither index changes inside the loop, so the range check is done once.
  if (bs_idx >= b || c_idx >= c) return;

  MUSA_1D_KERNEL_LOOP(index, npoints * nsample) {
    const int pt_idx = index / nsample;
    const int sample_idx = index % nsample;

    // Read the index through a per-iteration offset; advancing `idx` itself
    // inside the loop body would accumulate offsets if the loop ran more
    // than once for a thread.
    const int src =
        idx[bs_idx * npoints * nsample + pt_idx * nsample + sample_idx];
    const int in_idx = bs_idx * c * n + c_idx * n + src;
    const int out_idx = bs_idx * c * npoints * nsample +
                        c_idx * npoints * nsample + pt_idx * nsample +
                        sample_idx;

    out[out_idx] = points[in_idx];
  }
}

template <typename T>
__global__ void group_points_backward_musa_kernel(int b, int c, int n,
                                                  int npoints, int nsample,
                                                  const T *grad_out,
                                                  const int *__restrict__ idx,
                                                  T *grad_points) {
  // grad_out: (B, C, npoints, nsample)
  // idx: (B, npoints, nsample)
  // output:
  //      grad_points: (B, C, N)
  //
  // grad_points[bs, ch, idx[bs, pt, s]] += grad_out[bs, ch, pt, s], with bs
  // taken from blockIdx.z and ch from blockIdx.y.
  const int bs_idx = blockIdx.z;
  const int c_idx = blockIdx.y;
  // Neither index changes inside the loop, so the range check is done once.
  if (bs_idx >= b || c_idx >= c) return;

  MUSA_1D_KERNEL_LOOP(index, npoints * nsample) {
    const int pt_idx = index / nsample;
    const int sample_idx = index % nsample;

    // Per-iteration offsets; advancing `grad_out` / `idx` themselves inside
    // the loop body would accumulate offsets if the loop ran more than once
    // for a thread.
    const int grad_offset = bs_idx * c * npoints * nsample +
                            c_idx * npoints * nsample + pt_idx * nsample +
                            sample_idx;
    const int src =
        idx[bs_idx * npoints * nsample + pt_idx * nsample + sample_idx];

    // Several samples may reference the same source point, hence the atomic
    // accumulation.
    atomicAdd(grad_points + bs_idx * c * n + c_idx * n + src,
              grad_out[grad_offset]);
  }
}

#endif  // GROUP_POINTS_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/iou3d_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef IOU3D_MUSA_KERNEL_MUH
#define IOU3D_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

// Not referenced by the kernels in this header; presumably the block size
// used by the launch code for the overlap kernel -- verify.
const int THREADS_PER_BLOCK_IOU3D = 16;
// Number of bits in one `unsigned long long` mask word. The NMS kernels use
// it as the tile size so that one mask word holds one bit per box of a tile.
const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
// Tolerance used by intersection() to detect a near-zero denominator and as
// the lower bound of the union area in iou_bev() / iou_normal().
__device__ const float EPS = 1e-8;

// 2D point / vector with float coordinates used by the BEV geometry helpers.
struct Point {
  float x, y;

  // Leaves both coordinates uninitialized.
  __device__ Point() {}

  // Accepts doubles and narrows them to the float members.
  __device__ Point(double _x, double _y) : x(_x), y(_y) {}

  // Overwrites both coordinates.
  __device__ void set(float _x, float _y) {
    x = _x;
    y = _y;
  }

  // Component-wise sum.
  __device__ Point operator+(const Point &b) const {
    Point sum;
    sum.set(x + b.x, y + b.y);
    return sum;
  }

  // Component-wise difference.
  __device__ Point operator-(const Point &b) const {
    Point diff;
    diff.set(x - b.x, y - b.y);
    return diff;
  }
};

// 2D cross product (z component) of vectors a and b.
__device__ inline float cross(const Point &a, const Point &b) {
  const float z = a.x * b.y - a.y * b.x;
  return z;
}

// 2D cross product of (p1 - p0) and (p2 - p0).
__device__ inline float cross(const Point &p1, const Point &p2,
                              const Point &p0) {
  const float ux = p1.x - p0.x;
  const float uy = p1.y - p0.y;
  const float vx = p2.x - p0.x;
  const float vy = p2.y - p0.y;
  return ux * vy - vx * uy;
}

// Returns 1 when the axis-aligned bounding box of segment p1-p2 overlaps the
// one of segment q1-q2 (touching counts), otherwise 0.
__device__ int check_rect_cross(const Point &p1, const Point &p2,
                                const Point &q1, const Point &q2) {
  // Each test is written as !(a <= b) so that a NaN operand rejects the
  // pair, exactly like a failed `<=` would.
  if (!(min(p1.x, p2.x) <= max(q1.x, q2.x))) return 0;
  if (!(min(q1.x, q2.x) <= max(p1.x, p2.x))) return 0;
  if (!(min(p1.y, p2.y) <= max(q1.y, q2.y))) return 0;
  if (!(min(q1.y, q2.y) <= max(p1.y, p2.y))) return 0;
  return 1;
}

// Returns 1 when point p lies inside the x/y footprint of `box`, enlarged by
// a small margin on every side, otherwise 0.
__device__ inline int check_in_box2d(const float *box, const Point &p) {
  // params: box (7) [x, y, z, dx, dy, dz, heading]
  const float margin = 1e-2;

  // Offset of the point from the box center.
  const float off_x = p.x - box[0];
  const float off_y = p.y - box[1];

  // Rotate by the negated heading to express the offset in the box frame.
  const float c = cos(-box[6]);
  const float s = sin(-box[6]);
  const float local_x = off_x * c + off_y * (-s);
  const float local_y = off_x * s + off_y * c;

  if (!(fabs(local_x) < box[3] / 2 + margin)) return 0;
  return fabs(local_y) < box[4] / 2 + margin;
}

// Intersects segment p0-p1 with segment q0-q1. Returns 1 and stores the
// crossing point in ans_point when the segments properly cross; returns 0
// (ans_point untouched) otherwise.
__device__ inline int intersection(const Point &p1, const Point &p0,
                                   const Point &q1, const Point &q0,
                                   Point &ans_point) {
  // Cheap rejection on the segments' bounding boxes.
  if (!check_rect_cross(p0, p1, q0, q1)) return 0;

  // Straddle test: both sign products must be strictly positive.
  const float s1 = cross(q0, p1, p0);
  const float s2 = cross(p1, q1, p0);
  const float s3 = cross(p0, q1, q0);
  const float s4 = cross(q1, p1, q0);
  if (!(s1 * s2 > 0) || !(s3 * s4 > 0)) return 0;

  // Preferred formula: weight q0 and q1 by the two cross products.
  const float s5 = cross(q1, p1, p0);
  const float denom = s5 - s1;
  if (fabs(denom) > EPS) {
    ans_point.x = (s5 * q0.x - s1 * q1.x) / denom;
    ans_point.y = (s5 * q0.y - s1 * q1.y) / denom;
    return 1;
  }

  // Fallback when the denominator is within EPS of zero: solve the two line
  // equations a*x + b*y + c = 0 directly.
  const float a0 = p0.y - p1.y;
  const float b0 = p1.x - p0.x;
  const float c0 = p0.x * p1.y - p1.x * p0.y;
  const float a1 = q0.y - q1.y;
  const float b1 = q1.x - q0.x;
  const float c1 = q0.x * q1.y - q1.x * q0.y;
  const float D = a0 * b1 - a1 * b0;

  ans_point.x = (b0 * c1 - b1 * c0) / D;
  ans_point.y = (a1 * c0 - a0 * c1) / D;
  return 1;
}

// Rotates p in place around `center`, given the cosine and sine of the
// rotation angle.
__device__ inline void rotate_around_center(const Point &center,
                                            const float angle_cos,
                                            const float angle_sin, Point &p) {
  const float off_x = p.x - center.x;
  const float off_y = p.y - center.y;
  p.set(off_x * angle_cos - off_y * angle_sin + center.x,
        off_x * angle_sin + off_y * angle_cos + center.y);
}

// Returns 1 when a's polar angle around `center` is strictly greater than
// b's, otherwise 0.
__device__ inline int point_cmp(const Point &a, const Point &b,
                                const Point &center) {
  const auto angle_a = atan2(a.y - center.y, a.x - center.x);
  const auto angle_b = atan2(b.y - center.y, b.x - center.x);
  return angle_a > angle_b;
}

// Returns the overlap area of the x/y footprints of two rotated boxes.
//
// The footprint corners are built from the center and half extents, rotated
// by the heading, and then the intersection polygon is assembled from
// (a) edge-edge crossings and (b) corners of one box lying inside the other.
// Its vertices are ordered by angle around their centroid and the area is
// accumulated as a triangle fan. z, dz (indices 2 and 5) are not read here.
__device__ inline float box_overlap(const float *box_a, const float *box_b) {
  // params box_a: [x, y, z, dx, dy, dz, heading]
  // params box_b: [x, y, z, dx, dy, dz, heading]

  float a_angle = box_a[6], b_angle = box_b[6];
  float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2,
        a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2;
  // Axis-aligned extents before the heading rotation is applied.
  float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half;
  float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half;
  float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half;
  float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half;

  Point center_a(box_a[0], box_a[1]);
  Point center_b(box_b[0], box_b[1]);

  // Five slots: four corners plus a copy of corner 0 that closes the loop,
  // so edge i is always (corners[i], corners[i + 1]).
  Point box_a_corners[5];
  box_a_corners[0].set(a_x1, a_y1);
  box_a_corners[1].set(a_x2, a_y1);
  box_a_corners[2].set(a_x2, a_y2);
  box_a_corners[3].set(a_x1, a_y2);

  Point box_b_corners[5];
  box_b_corners[0].set(b_x1, b_y1);
  box_b_corners[1].set(b_x2, b_y1);
  box_b_corners[2].set(b_x2, b_y2);
  box_b_corners[3].set(b_x1, b_y2);

  // get oriented corners
  float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
  float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);

  for (int k = 0; k < 4; k++) {
    rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
    rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
  }

  box_a_corners[4] = box_a_corners[0];
  box_b_corners[4] = box_b_corners[0];

  // get intersection of lines
  // NOTE(review): cross_points has 16 slots and nothing in this function
  // bounds cnt (edge crossings here plus corner hits below) -- confirm the
  // geometric maximum stays within 16.
  Point cross_points[16];
  Point poly_center;
  int cnt = 0, flag = 0;

  poly_center.set(0, 0);
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      // intersection() writes into cross_points[cnt] only when it returns 1.
      flag = intersection(box_a_corners[i + 1], box_a_corners[i],
                          box_b_corners[j + 1], box_b_corners[j],
                          cross_points[cnt]);
      if (flag) {
        poly_center = poly_center + cross_points[cnt];
        cnt++;
      }
    }
  }

  // check corners
  for (int k = 0; k < 4; k++) {
    if (check_in_box2d(box_a, box_b_corners[k])) {
      poly_center = poly_center + box_b_corners[k];
      cross_points[cnt] = box_b_corners[k];
      cnt++;
    }
    if (check_in_box2d(box_b, box_a_corners[k])) {
      poly_center = poly_center + box_a_corners[k];
      cross_points[cnt] = box_a_corners[k];
      cnt++;
    }
  }

  // Centroid of the collected vertices. When cnt == 0 this divides by zero,
  // but the loops below then do not execute and the returned area is 0.
  poly_center.x /= cnt;
  poly_center.y /= cnt;

  // sort the points of polygon
  // Bubble sort by polar angle around poly_center (see point_cmp).
  Point temp;
  for (int j = 0; j < cnt - 1; j++) {
    for (int i = 0; i < cnt - j - 1; i++) {
      if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) {
        temp = cross_points[i];
        cross_points[i] = cross_points[i + 1];
        cross_points[i + 1] = temp;
      }
    }
  }

  // get the overlap areas
  // Triangle fan anchored at cross_points[0]; each cross product is twice
  // the signed area of one triangle, hence the final division by 2.
  float area = 0;
  for (int k = 0; k < cnt - 1; k++) {
    area += cross(cross_points[k] - cross_points[0],
                  cross_points[k + 1] - cross_points[0]);
  }

  return fabs(area) / 2.0;
}

// IoU of the x/y footprints of two rotated boxes: overlap area divided by
// the union area, with the union clamped to at least EPS.
__device__ inline float iou_bev(const float *box_a, const float *box_b) {
  // params box_a: [x, y, z, dx, dy, dz, heading]
  // params box_b: [x, y, z, dx, dy, dz, heading]
  const float area_a = box_a[3] * box_a[4];
  const float area_b = box_b[3] * box_b[4];
  const float inter = box_overlap(box_a, box_b);
  const float uni = fmaxf(area_a + area_b - inter, EPS);
  return inter / uni;
}

// Fills ans_overlap[a_idx * num_b + b_idx] with the footprint overlap area
// of box a_idx from boxes_a and box b_idx from boxes_b.
__global__ void iou3d_boxes_overlap_bev_forward_musa_kernel(
    const int num_a, const float *boxes_a, const int num_b,
    const float *boxes_b, float *ans_overlap) {
  // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
  // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
  MUSA_2D_KERNEL_LOOP(b_idx, num_b, a_idx, num_a) {
    const bool in_range = a_idx < num_a && b_idx < num_b;
    if (!in_range) return;

    ans_overlap[a_idx * num_b + b_idx] =
        box_overlap(boxes_a + a_idx * 7, boxes_b + b_idx * 7);
  }
}

// Builds the pairwise suppression mask for rotated-box NMS.
//
// Boxes are processed in tiles of THREADS_PER_BLOCK_NMS. For each
// (row tile, column tile) pair, thread t of the row tile writes one mask word
// whose bit i is set when iou_bev(row box t, column box i) exceeds
// nms_overlap_thresh.
__global__ void iou3d_nms3d_forward_musa_kernel(const int boxes_num,
                                                const float nms_overlap_thresh,
                                                const float *boxes,
                                                unsigned long long *mask) {
  // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
  // params: mask (N, N/THREADS_PER_BLOCK_NMS)
  const int blocks =
      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
  MUSA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
    // if (row_start > col_start) return;

    // The last tile in each direction may be only partially filled.
    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
                               THREADS_PER_BLOCK_NMS);
    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
                               THREADS_PER_BLOCK_NMS);

    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];

    // Stage the column tile in shared memory, one box per thread.
    if (threadIdx.x < col_size) {
      const float *src =
          boxes + (THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7;
      float *dst = block_boxes + threadIdx.x * 7;
      for (int f = 0; f < 7; f++) {
        dst[f] = src[f];
      }
    }
    __syncthreads();

    if (threadIdx.x < row_size) {
      const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
      const float *cur_box = boxes + cur_box_idx * 7;

      unsigned long long t = 0;
      // On the diagonal tile only later boxes are compared, which skips the
      // box itself and the pairs already covered by earlier rows.
      int start = 0;
      if (row_start == col_start) {
        start = threadIdx.x + 1;
      }
      for (int i = start; i < col_size; i++) {
        if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh) {
          t |= 1ULL << i;
        }
      }
      // One mask word per (box, column tile); `blocks` is the row length.
      mask[cur_box_idx * blocks + col_start] = t;
    }

    // block_boxes is overwritten at the top of the next loop iteration, so
    // every thread must be done reading it first. Without this barrier a
    // thread that finishes early could overwrite boxes that slower threads
    // are still comparing against.
    __syncthreads();
  }
}

// IoU of the axis-aligned x/y footprints of two boxes (heading is not read):
// intersection area divided by the union area, with the union clamped to at
// least EPS.
__device__ inline float iou_normal(float const *const a, float const *const b) {
  // params: a: [x, y, z, dx, dy, dz, heading]
  // params: b: [x, y, z, dx, dy, dz, heading]

  // Intersection interval along x.
  const float x_lo = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2);
  const float x_hi = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2);
  // Intersection interval along y.
  const float y_lo = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2);
  const float y_hi = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2);

  // Empty intervals are clamped to zero length.
  const float inter_w = fmaxf(x_hi - x_lo, 0.f);
  const float inter_h = fmaxf(y_hi - y_lo, 0.f);
  const float inter_area = inter_w * inter_h;

  const float area_a = a[3] * a[4];
  const float area_b = b[3] * b[4];
  return inter_area / fmaxf(area_a + area_b - inter_area, EPS);
}

// Builds the pairwise suppression mask for axis-aligned box NMS.
//
// Boxes are processed in tiles of THREADS_PER_BLOCK_NMS. For each
// (row tile, column tile) pair, thread t of the row tile writes one mask word
// whose bit i is set when iou_normal(row box t, column box i) exceeds
// nms_overlap_thresh.
__global__ void iou3d_nms3d_normal_forward_musa_kernel(
    const int boxes_num, const float nms_overlap_thresh, const float *boxes,
    unsigned long long *mask) {
  // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
  // params: mask (N, N/THREADS_PER_BLOCK_NMS)

  const int blocks =
      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
  MUSA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
    // if (row_start > col_start) return;

    // The last tile in each direction may be only partially filled.
    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
                               THREADS_PER_BLOCK_NMS);
    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
                               THREADS_PER_BLOCK_NMS);

    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];

    // Stage the column tile in shared memory, one box per thread.
    if (threadIdx.x < col_size) {
      const float *src =
          boxes + (THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7;
      float *dst = block_boxes + threadIdx.x * 7;
      for (int f = 0; f < 7; f++) {
        dst[f] = src[f];
      }
    }
    __syncthreads();

    if (threadIdx.x < row_size) {
      const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
      const float *cur_box = boxes + cur_box_idx * 7;

      unsigned long long t = 0;
      // On the diagonal tile only later boxes are compared, which skips the
      // box itself and the pairs already covered by earlier rows.
      int start = 0;
      if (row_start == col_start) {
        start = threadIdx.x + 1;
      }
      for (int i = start; i < col_size; i++) {
        if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh) {
          t |= 1ULL << i;
        }
      }
      // One mask word per (box, column tile); `blocks` is the row length.
      mask[cur_box_idx * blocks + col_start] = t;
    }

    // block_boxes is overwritten at the top of the next loop iteration, so
    // every thread must be done reading it first. Without this barrier a
    // thread that finishes early could overwrite boxes that slower threads
    // are still comparing against.
    __syncthreads();
  }
}

#endif  // IOU3D_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/knn_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap
#ifndef KNN_MUSA_KERNEL_MUH
#define KNN_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"
// Exchanges the two floats pointed to by x and y.
inline __device__ void swap_float(float *x, float *y) {
  const float first = *x;
  const float second = *y;
  *x = second;
  *y = first;
}

// Exchanges the two ints pointed to by x and y.
inline __device__ void swap_int(int *x, int *y) {
  const int first = *x;
  const int second = *y;
  *x = second;
  *y = first;
}

// Sifts the root of a max-heap stored in dist[0..k) down to its place,
// moving the matching entries of idx along with it.
__device__ void reheap(float *dist, int *idx, int k) {
  int parent = 0;
  for (;;) {
    int largest = parent * 2 + 1;  // left child
    if (largest >= k) break;

    // Prefer the right child only when it is strictly larger.
    const int right = largest + 1;
    if (right < k && dist[right] > dist[largest]) largest = right;

    // Stop once the parent is strictly larger than both children.
    if (dist[parent] > dist[largest]) break;

    swap_float(&dist[parent], &dist[largest]);
    swap_int(&idx[parent], &idx[largest]);
    parent = largest;
  }
}

// Sorts a max-heap in place: the root is repeatedly moved to the end of the
// shrinking heap and the remainder is re-heapified. idx follows dist.
__device__ void heap_sort(float *dist, int *idx, int k) {
  int last = k - 1;
  while (last > 0) {
    swap_float(&dist[0], &dist[last]);
    swap_int(&idx[0], &idx[last]);
    reheap(dist, idx, last);
    --last;
  }
}

// input: xyz (b, n, 3) new_xyz (b, m, 3)
// output: idx (b, m, nsample) dist2 (b, m, nsample)
// For each query point, finds the nsample nearest points of the same batch
// (squared Euclidean distance) and writes their indices and distances in
// ascending distance order. The batch is selected by blockIdx.y.
template <typename T>
__global__ void knn_forward_musa_kernel(int b, int n, int m, int nsample,
                                        const T *xyz, const T *new_xyz,
                                        int *__restrict__ idx, T *dist2) {
  const int bs_idx = blockIdx.y;
  // The batch index does not change inside the loop; check it once.
  if (bs_idx >= b) return;

  const T *batch_xyz = xyz + bs_idx * n * 3;

  MUSA_1D_KERNEL_LOOP(pt_idx, m) {
    // Per-iteration pointers. Advancing the argument pointers themselves
    // inside the loop body would accumulate offsets if the loop ran more
    // than once for a thread.
    const T *query = new_xyz + bs_idx * m * 3 + pt_idx * 3;
    int *out_idx = idx + bs_idx * m * nsample + pt_idx * nsample;
    T *out_dist2 = dist2 + bs_idx * m * nsample + pt_idx * nsample;

    T new_x = query[0];
    T new_y = query[1];
    T new_z = query[2];

    // Fixed-size scratch heap: nsample is not checked here, so values above
    // 100 would index past these arrays.
    float best_dist[100];
    int best_idx[100];
    for (int i = 0; i < nsample; i++) {
      best_dist[i] = 1e10;
      best_idx[i] = 0;
    }
    for (int i = 0; i < n; i++) {
      T x = batch_xyz[i * 3 + 0];
      T y = batch_xyz[i * 3 + 1];
      T z = batch_xyz[i * 3 + 2];
      T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
             (new_z - z) * (new_z - z);
      // The heap root holds the largest distance kept so far; a closer
      // point replaces it and the heap is repaired.
      if (d2 < best_dist[0]) {
        best_dist[0] = d2;
        best_idx[0] = i;
        reheap(best_dist, best_idx, nsample);
      }
    }
    heap_sort(best_dist, best_idx, nsample);
    for (int i = 0; i < nsample; i++) {
      out_idx[i] = best_idx[i];
      out_dist2[i] = best_dist[i];
    }
  }
}

#endif  // KNN_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/masked_conv2d_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef MASKED_CONV2D_MUSA_KERNEL_MUH
#define MASKED_CONV2D_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

// Gathers kernel_h x kernel_w patches around the masked positions into a
// column buffer. Each loop index handles one (channel, masked position)
// pair; taps that fall outside the image are written as 0.
template <typename scalar_t>
__global__ void MaskedIm2colForward(const int n, const scalar_t *data_im,
                                    const int height, const int width,
                                    const int kernel_h, const int kernel_w,
                                    const int pad_h, const int pad_w,
                                    const int64_t *mask_h_idx,
                                    const int64_t *mask_w_idx,
                                    const int mask_cnt, scalar_t *data_col) {
  // mask_cnt * channels
  MUSA_1D_KERNEL_LOOP(index, n) {
    const int mask_pos = index % mask_cnt;
    const int channel = index / mask_cnt;

    // Top-left corner of the patch in image coordinates.
    const int h_col = mask_h_idx[mask_pos];
    const int w_col = mask_w_idx[mask_pos];
    const int patch_top = h_col - pad_h;
    const int patch_left = w_col - pad_w;

    // First output slot for this pair; consecutive taps are mask_cnt apart.
    const int first_row = channel * kernel_h * kernel_w;
    scalar_t *dst = data_col + first_row * mask_cnt + mask_pos;

    for (int kh = 0; kh < kernel_h; ++kh) {
      const int h_im = patch_top + kh;
      const bool row_inside = h_im >= 0 && h_im < height;
      for (int kw = 0; kw < kernel_w; ++kw) {
        const int w_im = patch_left + kw;
        if (row_inside && w_im >= 0 && w_im < width) {
          *dst = (scalar_t)data_im[(channel * height + h_im) * width + w_im];
        } else {
          *dst = 0.0;
        }
        dst += mask_cnt;
      }
    }
  }
}

// Scatters column values back to the image: element `index` of data_col is
// written to the masked position of its channel. `channels` is not read.
template <typename scalar_t>
__global__ void MaskedCol2imForward(const int n, const scalar_t *data_col,
                                    const int height, const int width,
                                    const int channels,
                                    const int64_t *mask_h_idx,
                                    const int64_t *mask_w_idx,
                                    const int mask_cnt, scalar_t *data_im) {
  MUSA_1D_KERNEL_LOOP(index, n) {
    const int mask_pos = index % mask_cnt;
    const int channel = index / mask_cnt;
    const int row = mask_h_idx[mask_pos];
    const int col = mask_w_idx[mask_pos];
    data_im[(channel * height + row) * width + col] = data_col[index];
  }
}

#endif  // MASKED_CONV2D_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/min_area_polygons_musa.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef MIN_AREA_POLYGONS_MUSA_KERNEL_MUH
#define MIN_AREA_POLYGONS_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

// Capacity of the per-thread scratch arrays in minBoundingRect() and
// Findminbox(); point counts passed to them must not exceed it.
#define MAXN 20
// Used by minBoundingRect() to reduce edge angles by multiples of PI / 2.
__device__ const float PI = 3.1415926;

// 2D point with float coordinates.
struct Point {
  float x, y;

  // Leaves both coordinates uninitialized.
  __device__ Point() {}

  // Builds a point from explicit coordinates.
  __device__ Point(float px, float py) : x(px), y(py) {}
};

// Exchanges the coordinates of the two points.
__device__ inline void swap1(Point *a, Point *b) {
  const Point saved = *a;
  *a = *b;
  *b = saved;
}
// 2D cross product of (a - o) and (b - o).
__device__ inline float cross(Point o, Point a, Point b) {
  const float ax = a.x - o.x;
  const float ay = a.y - o.y;
  const float bx = b.x - o.x;
  const float by = b.y - o.y;
  return ax * by - bx * ay;
}

// Squared Euclidean distance between a and b (no square root is taken).
__device__ inline float dis(Point a, Point b) {
  const float dx = a.x - b.x;
  const float dy = a.y - b.y;
  return dx * dx + dy * dy;
}
// Searches for the smallest-area bounding rectangle of the points in `ps`
// among the orientations given by the polygon's edges.
//
// ps:       n_points points; consecutive points form the edges, so the
//           caller decides whether the polygon is closed (Findminbox
//           repeats the first point at the end)
// n_points: number of entries in ps; must not exceed MAXN
// minbox:   output, [angle, xmin, ymin, xmax, ymax], where the bounds are
//           expressed in the frame rotated by `angle`. It is left untouched
//           if no candidate area is below 1e12.
__device__ inline void minBoundingRect(Point *ps, int n_points, float *minbox) {
  // Coordinates laid out as a 2 x n matrix: row 0 holds x, row 1 holds y.
  float convex_points[2][MAXN];
  for (int j = 0; j < n_points; j++) {
    convex_points[0][j] = ps[j].x;
  }
  for (int j = 0; j < n_points; j++) {
    convex_points[1][j] = ps[j].y;
  }

  Point edges[MAXN];
  float edges_angles[MAXN];
  float unique_angles[MAXN];
  int n_edges = n_points - 1;
  int n_unique = 0;
  int unique_flag = 0;

  // Edge vectors between consecutive points.
  for (int i = 0; i < n_edges; i++) {
    edges[i].x = ps[i + 1].x - ps[i].x;
    edges[i].y = ps[i + 1].y - ps[i].y;
  }
  // Edge directions reduced by multiples of PI / 2: non-negative angles are
  // taken modulo PI / 2, negative ones are shifted up by a whole multiple of
  // PI / 2.
  for (int i = 0; i < n_edges; i++) {
    edges_angles[i] = atan2((double)edges[i].y, (double)edges[i].x);
    if (edges_angles[i] >= 0) {
      edges_angles[i] = fmod((double)edges_angles[i], (double)PI / 2);
    } else {
      edges_angles[i] =
          edges_angles[i] - (int)(edges_angles[i] / (PI / 2) - 1) * (PI / 2);
    }
  }
  // Collect distinct angles; duplicates are detected with exact float
  // equality. edges_angles[0] is read even when n_edges is 0.
  unique_angles[0] = edges_angles[0];
  n_unique += 1;
  for (int i = 1; i < n_edges; i++) {
    for (int j = 0; j < n_unique; j++) {
      if (edges_angles[i] == unique_angles[j]) {
        unique_flag += 1;
      }
    }
    if (unique_flag == 0) {
      unique_angles[n_unique] = edges_angles[i];
      n_unique += 1;
      unique_flag = 0;
    } else {
      unique_flag = 0;
    }
  }

  float minarea = 1e12;
  for (int i = 0; i < n_unique; i++) {
    // Rotation matrix for the candidate angle.
    float R[2][2];
    float rot_points[2][MAXN];
    R[0][0] = cos(unique_angles[i]);
    R[0][1] = sin(unique_angles[i]);
    R[1][0] = -sin(unique_angles[i]);
    R[1][1] = cos(unique_angles[i]);
    // R x Points
    for (int m = 0; m < 2; m++) {
      for (int n = 0; n < n_points; n++) {
        float sum = 0.0;
        for (int k = 0; k < 2; k++) {
          sum = sum + R[m][k] * convex_points[k][n];
        }
        rot_points[m][n] = sum;
      }
    }

    // Axis-aligned bounds of the rotated points; inf / NaN coordinates are
    // skipped in each of the four scans below.
    // xmin;
    float xmin, ymin, xmax, ymax;
    xmin = 1e12;
    for (int j = 0; j < n_points; j++) {
      if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) {
        continue;
      } else {
        if (rot_points[0][j] < xmin) {
          xmin = rot_points[0][j];
        }
      }
    }
    // ymin
    ymin = 1e12;
    for (int j = 0; j < n_points; j++) {
      if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) {
        continue;
      } else {
        if (rot_points[1][j] < ymin) {
          ymin = rot_points[1][j];
        }
      }
    }
    // xmax
    xmax = -1e12;
    for (int j = 0; j < n_points; j++) {
      if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) {
        continue;
      } else {
        if (rot_points[0][j] > xmax) {
          xmax = rot_points[0][j];
        }
      }
    }
    // ymax
    ymax = -1e12;
    for (int j = 0; j < n_points; j++) {
      if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) {
        continue;
      } else {
        if (rot_points[1][j] > ymax) {
          ymax = rot_points[1][j];
        }
      }
    }
    // Keep the orientation with the smallest rectangle area seen so far.
    float area = (xmax - xmin) * (ymax - ymin);
    if (area < minarea) {
      minarea = area;
      minbox[0] = unique_angles[i];
      minbox[1] = xmin;
      minbox[2] = ymin;
      minbox[3] = xmax;
      minbox[4] = ymax;
    }
  }
}

// convex_find
// Rewrites in_poly in place with a subset of its points and stores the new
// count in n_poly. The structure appears to be a Jarvis-march (gift
// wrapping) convex hull built as two chains between the lowest and the
// highest point -- confirm against the upstream reference.
//
// in_poly: in/out point array (reordered by the first loop, then overwritten)
// n_poly:  in: number of input points; out: top1 + top2
//
// Fixed capacities: Stack holds 20 indices, right_point / left_point hold 10
// points each; nothing in this function checks them against n_poly.
__device__ inline void Jarvis(Point *in_poly, int &n_poly) {
  int n_input = n_poly;
  // NOTE(review): input_poly is filled here but never read afterwards in
  // this function.
  Point input_poly[20];
  for (int i = 0; i < n_input; i++) {
    input_poly[i].x = in_poly[i].x;
    input_poly[i].y = in_poly[i].y;
  }
  Point p_max, p_k;
  int max_index, k_index;
  int Stack[20], top1, top2;
  // float sign;
  double sign;
  Point right_point[10], left_point[10];

  // Single pass that (a) swaps the lowest point (smallest y, ties broken by
  // smallest x) into in_poly[0] and (b) tracks the highest point (largest y,
  // ties broken by largest x) in p_max / max_index. In both conditions `&&`
  // binds tighter than `||`.
  for (int i = 0; i < n_poly; i++) {
    if (in_poly[i].y < in_poly[0].y ||
        in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
      Point *j = &(in_poly[0]);
      Point *k = &(in_poly[i]);
      swap1(j, k);
    }
    if (i == 0) {
      p_max = in_poly[0];
      max_index = 0;
    }
    if (in_poly[i].y > p_max.y ||
        in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
      p_max = in_poly[i];
      max_index = i;
    }
  }
  // If the highest point is still recorded at index 0, index 1 is used as
  // the end of both chains instead.
  if (max_index == 0) {
    max_index = 1;
    p_max = in_poly[max_index];
  }

  // First chain: starting at index 0, repeatedly pick the candidate with a
  // positive cross product relative to the current pick (or a collinear one
  // that is farther away) until max_index is reached. Stack records the
  // chosen indices.
  k_index = 0, Stack[0] = 0, top1 = 0;
  while (k_index != max_index) {
    p_k = p_max;
    k_index = max_index;
    for (int i = 1; i < n_poly; i++) {
      sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
      if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
                                         dis(in_poly[Stack[top1]], p_k)))) {
        p_k = in_poly[i];
        k_index = i;
      }
    }
    top1++;
    Stack[top1] = k_index;
  }

  for (int i = 0; i <= top1; i++) {
    right_point[i] = in_poly[Stack[i]];
  }

  // Second chain: same walk with the sign test mirrored (negative cross
  // product). Stack is reused from the start.
  k_index = 0, Stack[0] = 0, top2 = 0;

  while (k_index != max_index) {
    p_k = p_max;
    k_index = max_index;
    for (int i = 1; i < n_poly; i++) {
      sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
      if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
                                        dis(in_poly[Stack[top2]], p_k))) {
        p_k = in_poly[i];
        k_index = i;
      }
    }
    top2++;
    Stack[top2] = k_index;
  }

  // The final entry Stack[top2] (the shared end point) is not copied.
  for (int i = top2 - 1; i >= 0; i--) {
    left_point[i] = in_poly[Stack[i]];
  }

  // Output order: the whole first chain, followed by the second chain in
  // reverse from left_point[top2 - 1] down to left_point[1]; left_point[0]
  // (the shared start point) is not emitted again.
  for (int i = 0; i < top1 + top2; i++) {
    if (i <= top1) {
      in_poly[i] = right_point[i];
    } else {
      in_poly[i] = left_point[top2 - (i - top1)];
    }
  }
  n_poly = top1 + top2;
}

// Computes the four corners of the minimum-area rectangle around nine 2D
// points.
//
// p:         18 values, read as 9 interleaved (x, y) pairs
// minpoints: 8 values written as 4 interleaved (x, y) corners
template <typename T>
__device__ inline void Findminbox(T const *const p, T *minpoints) {
  // Load the nine input points and reduce them with Jarvis().
  Point hull[MAXN];
  const T *coord = p;
  for (int i = 0; i < 9; ++i, coord += 2) {
    hull[i].x = coord[0];
    hull[i].y = coord[1];
  }
  int hull_size = 9;
  Jarvis(hull, hull_size);

  // Close the polygon by repeating the first vertex at the end.
  Point closed[MAXN];
  for (int i = 0; i < hull_size; ++i) {
    closed[i] = hull[i];
  }
  closed[hull_size] = hull[0];

  // rect = [angle, xmin, ymin, xmax, ymax] in the rotated frame.
  float rect[5] = {0};
  minBoundingRect(closed, hull_size + 1, rect);
  const float angle = rect[0];
  const float xmin = rect[1];
  const float ymin = rect[2];
  const float xmax = rect[3];
  const float ymax = rect[4];

  // Entries of the rotation matrix [[cos, sin], [-sin, cos]].
  const float cos_a = cos(angle);
  const float sin_a = sin(angle);
  const float neg_sin_a = -sin_a;

  // Map the rectangle corners back from the rotated frame.
  minpoints[0] = xmax * cos_a + ymin * neg_sin_a;
  minpoints[1] = xmax * sin_a + ymin * cos_a;
  minpoints[2] = xmin * cos_a + ymin * neg_sin_a;
  minpoints[3] = xmin * sin_a + ymin * cos_a;
  minpoints[4] = xmin * cos_a + ymax * neg_sin_a;
  minpoints[5] = xmin * sin_a + ymax * cos_a;
  minpoints[6] = xmax * cos_a + ymax * neg_sin_a;
  minpoints[7] = xmax * sin_a + ymax * cos_a;
}

// Runs Findminbox() once per input: record `index` reads 18 values from
// ex_boxes and writes 8 values to minbox.
template <typename T>
__global__ void min_area_polygons_musa_kernel(const int ex_n_boxes,
                                              const T *ex_boxes, T *minbox) {
  MUSA_1D_KERNEL_LOOP(index, ex_n_boxes) {
    Findminbox(ex_boxes + index * 18, minbox + index * 8);
  }
}

#endif  // MIN_AREA_POLYGONS_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/modulated_deform_conv_musa_kernel.muh
================================================
/*!
 ******************* BEGIN Caffe Copyright Notice and Disclaimer
 *****************
 *
 * COPYRIGHT
 *
 * All contributions by the University of California:
 * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
 * All rights reserved.
 *
 * All other contributions:
 * Copyright (c) 2014-2017, the respective contributors
 * All rights reserved.
 *
 * Caffe uses a shared copyright model: each contributor holds copyright over
 * their contributions to Caffe. The project versioning records all such
 * contribution and copyright details. If a contributor wants to further mark
 * their specific copyright on a particular contribution, they should indicate
 * their copyright solely in the commit message of the change when it is
 * committed.
 *
 * LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
 *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * CONTRIBUTION AGREEMENT
 *
 * By contributing to the BVLC/caffe repository through pull-request, comment,
 * or otherwise, the contributor releases their content to the
 * license and copyright terms herein.
 *
 ***************** END Caffe Copyright Notice and Disclaimer
 *********************
 *
 * Copyright (c) 2018 Microsoft
 * Licensed under The MIT License [see LICENSE for details]
 * \file modulated_deformable_im2col.muh
 * \brief Function definitions of converting an image to
 * column matrix based on kernel, padding, dilation, and offset.
 * These functions are mainly used in deformable convolution operators.
 * \ref: https://arxiv.org/abs/1703.06211
 * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
 */

// modified from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu

#ifndef MODULATED_DEFORM_CONV_MUSA_KERNEL_MUH
#define MODULATED_DEFORM_CONV_MUSA_KERNEL_MUH

#include <float.h>
#include "pytorch_musa_helper.hpp"


// Bilinearly interpolates `input` at the fractional position (h, w).
//
// `input` is indexed as input[row * data_width + col].  A corner sample is
// read only when it passes the bound checked for it below (lower bound for
// the top/left side, upper bound for the bottom/right side); every other
// corner contributes zero to the result.
template <typename T>
__device__ T dmcn_im2col_bilinear(const T *input, const int data_width,
                                  const int height, const int width, T h, T w) {
  // Integer corners enclosing the sample position.
  const int top = floorf(h);
  const int left = floorf(w);
  const int bottom = top + 1;
  const int right = left + 1;

  // Fractional distance from the top/left corner, and its complement.
  const T frac_h = h - top;
  const T frac_w = w - left;
  const T rest_h = 1 - frac_h;
  const T rest_w = 1 - frac_w;

  const bool top_in = top >= 0;
  const bool left_in = left >= 0;
  const bool bottom_in = bottom <= height - 1;
  const bool right_in = right <= width - 1;

  T top_left = 0;
  T top_right = 0;
  T bottom_left = 0;
  T bottom_right = 0;
  if (top_in && left_in) top_left = input[top * data_width + left];
  if (top_in && right_in) top_right = input[top * data_width + right];
  if (bottom_in && left_in) bottom_left = input[bottom * data_width + left];
  if (bottom_in && right_in) bottom_right = input[bottom * data_width + right];

  // Each corner is weighted by the area of the opposite sub-rectangle.
  const T wt_top_left = rest_h * rest_w;
  const T wt_top_right = rest_h * frac_w;
  const T wt_bottom_left = frac_h * rest_w;
  const T wt_bottom_right = frac_h * frac_w;

  const T result = (wt_top_left * top_left + wt_top_right * top_right +
                    wt_bottom_left * bottom_left +
                    wt_bottom_right * bottom_right);
  return result;
}

// Returns the bilinear weight that the integer pixel (h, w) receives from a
// sample taken at the fractional position (argmax_h, argmax_w).
//
// The result is zero when the sample position is outside
// (-1, height) x (-1, width) or when (h, w) is not one of the four integer
// corners surrounding the sample position.
template <typename T>
__device__ T dmcn_get_gradient_weight(T argmax_h, T argmax_w, const int h,
                                      const int w, const int height,
                                      const int width) {
  const bool out_of_range = argmax_h <= -1 || argmax_h >= height ||
                            argmax_w <= -1 || argmax_w >= width;
  if (out_of_range) {
    return 0;
  }

  const int row_lo = floorf(argmax_h);
  const int col_lo = floorf(argmax_w);
  const int row_hi = row_lo + 1;
  const int col_hi = col_lo + 1;

  // row_lo != row_hi and col_lo != col_hi, so at most one branch applies.
  T weight = 0;
  if (h == row_lo) {
    if (w == col_lo) {
      weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
    } else if (w == col_hi) {
      weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
    }
  } else if (h == row_hi) {
    if (w == col_lo) {
      weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
    } else if (w == col_hi) {
      weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
    }
  }
  return weight;
}

// Returns the derivative of the bilinearly interpolated value of `im_data`
// at (argmax_h, argmax_w) with respect to one sampling coordinate.
//
// bp_dir == 0 differentiates along h, bp_dir == 1 along w; any other value
// yields zero.  `im_data` is indexed as im_data[row * data_width + col].
// Positions outside (-1, height) x (-1, width) yield zero, and corners that
// fail their bound check are skipped.
template <typename T>
__device__ T dmcn_get_coordinate_weight(T argmax_h, T argmax_w,
                                        const int height, const int width,
                                        const T *im_data, const int data_width,
                                        const int bp_dir) {
  const bool out_of_range = argmax_h <= -1 || argmax_h >= height ||
                            argmax_w <= -1 || argmax_w >= width;
  if (out_of_range) {
    return 0;
  }

  const int row_lo = floorf(argmax_h);
  const int col_lo = floorf(argmax_w);
  const int row_hi = row_lo + 1;
  const int col_hi = col_lo + 1;

  // Which of the four surrounding corners may be read.
  const bool has_lo_lo = row_lo >= 0 && col_lo >= 0;
  const bool has_lo_hi = row_lo >= 0 && col_hi <= width - 1;
  const bool has_hi_lo = row_hi <= height - 1 && col_lo >= 0;
  const bool has_hi_hi = row_hi <= height - 1 && col_hi <= width - 1;

  // Flat positions of the corners; only dereferenced when the flag is set.
  const int at_lo_lo = row_lo * data_width + col_lo;
  const int at_lo_hi = row_lo * data_width + col_hi;
  const int at_hi_lo = row_hi * data_width + col_lo;
  const int at_hi_hi = row_hi * data_width + col_hi;

  T weight = 0;

  switch (bp_dir) {
    case 0:
      // d/dh: lower-row corners enter negatively, upper-row ones positively.
      if (has_lo_lo)
        weight += -1 * (col_lo + 1 - argmax_w) * im_data[at_lo_lo];
      if (has_lo_hi) weight += -1 * (argmax_w - col_lo) * im_data[at_lo_hi];
      if (has_hi_lo) weight += (col_lo + 1 - argmax_w) * im_data[at_hi_lo];
      if (has_hi_hi) weight += (argmax_w - col_lo) * im_data[at_hi_hi];
      break;
    case 1:
      // d/dw: left-column corners enter negatively, right-column positively.
      if (has_lo_lo)
        weight += -1 * (row_lo + 1 - argmax_h) * im_data[at_lo_lo];
      if (has_lo_hi) weight += (row_lo + 1 - argmax_h) * im_data[at_lo_hi];
      if (has_hi_lo) weight += -1 * (argmax_h - row_lo) * im_data[at_hi_lo];
      if (has_hi_hi) weight += (argmax_h - row_lo) * im_data[at_hi_hi];
      break;
    default:
      break;
  }

  return weight;
}

// Forward im2col for modulated deformable convolution.
//
// Each loop index handles one (input channel, batch, output row, output
// column) tuple and writes kernel_h * kernel_w entries of `data_col`: for
// every kernel tap the input is bilinearly sampled at the tap position
// shifted by the learned offset, then multiplied by the tap's mask value.
// Sampling positions outside (-1, height) x (-1, width) produce zero.
template <typename T>
__global__ void modulated_deformable_im2col_gpu_kernel(
    const int n, const T *data_im, const T *data_offset, const T *data_mask,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int num_channels, const int deformable_group, const int height_col,
    const int width_col, T *data_col) {
  MUSA_1D_KERNEL_LOOP(index, n) {
    // Decode the flat index; the output column varies fastest.
    const int out_x = index % width_col;
    const int out_y = (index / width_col) % height_col;
    const int batch = (index / width_col / height_col) % batch_size;
    const int chan = (index / width_col / height_col) / batch_size;

    // First column-matrix channel produced from this input channel.
    const int first_col_chan = chan * kernel_h * kernel_w;

    // Offsets and masks are shared by all channels of a deformable group.
    const int group = chan / channel_per_deformable_group;
    const int group_slot = batch * deformable_group + group;

    // Top-left input position of the (undeformed) receptive field.
    const int base_y = out_y * stride_h - pad_h;
    const int base_x = out_x * stride_w - pad_w;

    const T *image = data_im + (batch * num_channels + chan) * height * width;
    const T *offsets = data_offset + group_slot * 2 * kernel_h * kernel_w *
                                         height_col * width_col;
    const T *masks =
        data_mask + group_slot * kernel_h * kernel_w * height_col * width_col;

    T *out = data_col +
             ((first_col_chan * batch_size + batch) * height_col + out_y) *
                 width_col +
             out_x;
    // Distance in `data_col` between consecutive kernel taps.
    const int out_step = batch_size * height_col * width_col;

    for (int ky = 0; ky < kernel_h; ++ky) {
      for (int kx = 0; kx < kernel_w; ++kx) {
        const int tap = ky * kernel_w + kx;
        // Offset channels are interleaved per tap: 2 * tap is the h offset,
        // 2 * tap + 1 the w offset.
        const int offset_y_at =
            ((2 * tap) * height_col + out_y) * width_col + out_x;
        const int offset_x_at =
            ((2 * tap + 1) * height_col + out_y) * width_col + out_x;
        const int mask_at = (tap * height_col + out_y) * width_col + out_x;

        const T offset_y = offsets[offset_y_at];
        const T offset_x = offsets[offset_x_at];
        const T mask = masks[mask_at];

        const T sample_y = base_y + ky * dilation_h + offset_y;
        const T sample_x = base_x + kx * dilation_w + offset_x;

        T val = static_cast<T>(0);
        const bool inside = sample_y > -1 && sample_x > -1 &&
                            sample_y < height && sample_x < width;
        if (inside) {
          val = dmcn_im2col_bilinear(image, width, height, width, sample_y,
                                     sample_x);
        }
        *out = val * mask;
        out += out_step;
      }
    }
  }
}

// Backward pass of modulated deformable convolution w.r.t. the input image.
//
// Each loop index addresses one element of `data_col`.  The flat index is
// decoded as (c, i, j, b, h_out, w_out) with w_out varying fastest, where c
// is the input channel and (i, j) the kernel tap.  The element's gradient is
// multiplied by the tap's mask value and scattered with atomicAdd into the
// `grad_im` pixels that lie less than one pixel (along both axes) from the
// deformed sampling position.
// NOTE(review): `grad_im` is only accumulated into here, so it presumably has
// to be zero-initialised by the caller -- confirm.
template <typename T>
__global__ void modulated_deformable_col2im_gpu_kernel(
    const int n, const T *data_col, const T *data_offset, const T *data_mask,
    const int channels, const int height, const int width, const int kernel_h,
    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int deformable_group, const int height_col, const int width_col,
    T *grad_im) {
  MUSA_1D_KERNEL_LOOP(index, n) {
    // Kernel tap (i, j) and input channel c encoded in the high part of the
    // flat index.
    const int j = (index / width_col / height_col / batch_size) % kernel_w;
    const int i =
        (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
    const int c =
        index / width_col / height_col / batch_size / kernel_w / kernel_h;

    // Offsets and masks are shared by all channels of a deformable group.
    const int deformable_group_index = c / channel_per_deformable_group;

    // Output position and batch encoded in the low part of the flat index,
    // plus the top-left input position of the undeformed receptive field.
    int w_out = index % width_col;
    int h_out = (index / width_col) % height_col;
    int b = (index / width_col / height_col) % batch_size;
    int w_in = w_out * stride_w - pad_w;
    int h_in = h_out * stride_h - pad_h;

    const T *data_offset_ptr =
        data_offset + (b * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;
    const T *data_mask_ptr =
        data_mask + (b * deformable_group + deformable_group_index) * kernel_h *
                        kernel_w * height_col * width_col;
    // Offset channels are interleaved per tap: 2 * tap holds the h offset and
    // 2 * tap + 1 the w offset; the mask has one channel per tap.
    const int data_offset_h_ptr =
        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
    const int data_offset_w_ptr =
        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
    const int data_mask_hw_ptr =
        ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
    const T offset_h = data_offset_ptr[data_offset_h_ptr];
    const T offset_w = data_offset_ptr[data_offset_w_ptr];
    const T mask = data_mask_ptr[data_mask_hw_ptr];
    // Fractional input position that this tap sampled in the forward pass.
    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;

    const T cur_top_grad = data_col[index] * mask;
    // The (int) cast truncates toward zero rather than flooring; the 5x5
    // window scanned below is filtered by distance, so only pixels closer
    // than one pixel to the sampling position receive a contribution.
    const int cur_h = (int)cur_inv_h_data;
    const int cur_w = (int)cur_inv_w_data;
    for (int dy = -2; dy <= 2; dy++) {
      for (int dx = -2; dx <= 2; dx++) {
        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
            abs(cur_inv_w_data - (cur_w + dx)) < 1) {
          int cur_bottom_grad_pos =
              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
          T weight =
              dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data,
                                       cur_h + dy, cur_w + dx, height, width);
          // Several loop indices can hit the same pixel, hence the atomic.
          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
        }
      }
    }
  }
}

// Backward pass of modulated deformable convolution w.r.t. the offsets and
// the modulation mask.
//
// Each loop index addresses one element of `grad_offset`, decoded as
// (b, c, h, w) with w varying fastest and c ranging over `offset_channels`.
// The loop below accumulates, over the column channels visited for this
// offset channel, the product of the incoming column gradient, the mask and
// the derivative of the bilinear sample w.r.t. the selected coordinate.
// The mask gradient for the same kernel tap is accumulated alongside and is
// stored only by the loop index that handles the even (h) offset channel.
template <typename T>
__global__ void modulated_deformable_col2im_coord_gpu_kernel(
    const int n, const T *data_col, const T *data_im, const T *data_offset,
    const T *data_mask, const int channels, const int height, const int width,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int channel_per_deformable_group,
    const int batch_size, const int offset_channels, const int deformable_group,
    const int height_col, const int width_col, T *grad_offset, T *grad_mask) {
  MUSA_1D_KERNEL_LOOP(index, n) {
    // val accumulates the offset gradient, mval the mask gradient.
    T val = 0, mval = 0;
    int w = index % width_col;
    int h = (index / width_col) % height_col;
    int c = (index / width_col / height_col) % offset_channels;
    int b = (index / width_col / height_col) / offset_channels;

    // Each deformable group owns 2 * kernel_h * kernel_w offset channels.
    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
    // Column channels that belong to the same kernel tap are col_step apart.
    const int col_step = kernel_h * kernel_w;
    // Counts loop iterations; selects the matching image plane below.
    int cnt = 0;
    // NOTE(review): the pointer arithmetic here and the loop bound below treat
    // `channel_per_deformable_group` as a count of column channels (image
    // channels * kernel_h * kernel_w) -- confirm against the launcher.
    const T *data_col_ptr = data_col + deformable_group_index *
                                           channel_per_deformable_group *
                                           batch_size * width_col * height_col;
    const T *data_im_ptr =
        data_im + (b * deformable_group + deformable_group_index) *
                      channel_per_deformable_group / kernel_h / kernel_w *
                      height * width;
    const T *data_offset_ptr =
        data_offset + (b * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;
    const T *data_mask_ptr =
        data_mask + (b * deformable_group + deformable_group_index) * kernel_h *
                        kernel_w * height_col * width_col;

    // Offset channel relative to the start of its deformable group.
    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;

    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
         col_c += col_step) {
      const int col_pos =
          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
      // Even offset channels hold h offsets, odd ones w offsets.
      const int bp_dir = offset_c % 2;

      // Recover the kernel tap and output position from the column position.
      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
      int i =
          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
      int w_out = col_pos % width_col;
      int h_out = (col_pos / width_col) % height_col;
      int w_in = w_out * stride_w - pad_w;
      int h_in = h_out * stride_h - pad_h;
      const int data_offset_h_ptr =
          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
      const int data_offset_w_ptr =
          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
           w_out);
      const int data_mask_hw_ptr =
          (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
      const T offset_h = data_offset_ptr[data_offset_h_ptr];
      const T offset_w = data_offset_ptr[data_offset_w_ptr];
      const T mask = data_mask_ptr[data_mask_hw_ptr];
      // Fractional input position sampled by this tap in the forward pass.
      T inv_h = h_in + i * dilation_h + offset_h;
      T inv_w = w_in + j * dilation_w + offset_w;
      // Out-of-range samples are moved to (-2, -2), for which
      // dmcn_get_coordinate_weight returns 0; they add nothing to mval.
      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
        inv_h = inv_w = -2;
      else
        mval += data_col_ptr[col_pos] *
                dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width,
                                     height, width, inv_h, inv_w);
      const T weight = dmcn_get_coordinate_weight(
          inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
          width, bp_dir);
      val += weight * data_col_ptr[col_pos] * mask;
      cnt += 1;
    }
    // KERNEL_ASSIGN(grad_offset[index], offset_req, val);
    grad_offset[index] = val;
    // mval does not depend on bp_dir, so it is stored once per h/w channel
    // pair, by the loop index that handles the even channel.
    if (offset_c % 2 == 0)
      // KERNEL_ASSIGN(grad_mask[(((b * deformable_group +
      // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) *
      // height_col + h) * width_col + w], mask_req, mval);
      grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
                      kernel_w +
                  offset_c / 2) *
                     height_col +
                 h) *
                    width_col +
                w] = mval;
  }
}

#endif  // MODULATED_DEFORM_CONV_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/ms_deform_attn_musa_kernel.muh
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#ifndef DEFORM_ATTN_MUSA_KERNEL
#define DEFORM_ATTN_MUSA_KERNEL

#include "common_musa_helper.hpp"
#include "pytorch_musa_helper.hpp"

// Bilinearly interpolates one channel of one attention head of
// `bottom_data` at the fractional position (h, w).
//
// `bottom_data` is indexed as
//   ((row * width + col) * nheads + m) * channels + c.
// A corner sample is read only when it passes the bound checked for it
// below; every other corner contributes zero.
template <typename scalar_t>
__device__ scalar_t ms_deform_attn_im2col_bilinear(
    const scalar_t *&bottom_data, const int &height, const int &width,
    const int &nheads, const int &channels, const scalar_t &h,
    const scalar_t &w, const int &m, const int &c) {
  // Integer corners enclosing the sample position.
  const int row_lo = floorf(h);
  const int col_lo = floorf(w);
  const int row_hi = row_lo + 1;
  const int col_hi = col_lo + 1;

  // Fractional distance from the low corner, and its complement.
  const scalar_t frac_h = h - row_lo;
  const scalar_t frac_w = w - col_lo;
  const scalar_t rest_h = 1 - frac_h;
  const scalar_t rest_w = 1 - frac_w;

  // Element strides of one column step and one row step.
  const int col_stride = nheads * channels;
  const int row_stride = width * col_stride;
  const int row_lo_off = row_lo * row_stride;
  const int row_hi_off = row_lo_off + row_stride;
  const int col_lo_off = col_lo * col_stride;
  const int col_hi_off = col_lo_off + col_stride;
  const int head_chan_off = m * channels + c;

  const bool row_lo_in = row_lo >= 0;
  const bool col_lo_in = col_lo >= 0;
  const bool row_hi_in = row_hi <= height - 1;
  const bool col_hi_in = col_hi <= width - 1;

  scalar_t val_lo_lo = 0;
  scalar_t val_lo_hi = 0;
  scalar_t val_hi_lo = 0;
  scalar_t val_hi_hi = 0;
  if (row_lo_in && col_lo_in) {
    val_lo_lo = bottom_data[row_lo_off + col_lo_off + head_chan_off];
  }
  if (row_lo_in && col_hi_in) {
    val_lo_hi = bottom_data[row_lo_off + col_hi_off + head_chan_off];
  }
  if (row_hi_in && col_lo_in) {
    val_hi_lo = bottom_data[row_hi_off + col_lo_off + head_chan_off];
  }
  if (row_hi_in && col_hi_in) {
    val_hi_hi = bottom_data[row_hi_off + col_hi_off + head_chan_off];
  }

  // Each corner is weighted by the area of the opposite sub-rectangle.
  const scalar_t wt_lo_lo = rest_h * rest_w;
  const scalar_t wt_lo_hi = rest_h * frac_w;
  const scalar_t wt_hi_lo = frac_h * rest_w;
  const scalar_t wt_hi_hi = frac_h * frac_w;

  const scalar_t result = (wt_lo_lo * val_lo_lo + wt_lo_hi * val_lo_hi +
                           wt_hi_lo * val_hi_lo + wt_hi_hi * val_hi_hi);
  return result;
}

// Backward of one bilinear sample for multi-scale deformable attention.
//
// For the sample at (h, w) of head `m`, channel `c`:
//   * adds w_k * top_grad * attn_weight to `grad_value` at each of the four
//     surrounding corners that passes its bound check (via atomicAdd);
//   * stores the attention-weight gradient in *grad_attn_weight;
//   * stores the sampling-location gradient in grad_sampling_loc[0] (w) and
//     grad_sampling_loc[1] (h).
// The last three outputs are written with plain stores, so the destination
// must be private to the calling thread (the visible callers pass per-thread
// shared-memory slots).
template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear(
    const scalar_t *&bottom_data, const int &height, const int &width,
    const int &nheads, const int &channels, const scalar_t &h,
    const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad,
    const scalar_t &attn_weight, scalar_t *&grad_value,
    scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) {
  // Integer corners enclosing the sample position.
  const int h_low = floorf(h);
  const int w_low = floorf(w);
  const int h_high = h_low + 1;
  const int w_high = w_low + 1;

  // Fractional parts (lh, lw) and their complements (hh, hw).
  const scalar_t lh = h - h_low;
  const scalar_t lw = w - w_low;
  const scalar_t hh = 1 - lh, hw = 1 - lw;

  // Element offsets: one column step spans nheads * channels elements, one
  // row step spans width such columns.
  const int w_stride = nheads * channels;
  const int h_stride = width * w_stride;
  const int h_low_ptr_offset = h_low * h_stride;
  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
  const int w_low_ptr_offset = w_low * w_stride;
  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
  const int base_ptr = m * channels + c;

  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
  const scalar_t top_grad_value = top_grad * attn_weight;
  // Accumulate d(val)/dh and d(val)/dw of the interpolated value below.
  scalar_t grad_h_weight = 0, grad_w_weight = 0;

  // Corner (h_low, w_low).
  scalar_t v1 = 0;
  if (h_low >= 0 && w_low >= 0) {
    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
    v1 = bottom_data[ptr1];
    grad_h_weight -= hw * v1;
    grad_w_weight -= hh * v1;
    atomicAdd(grad_value + ptr1, w1 * top_grad_value);
  }
  // Corner (h_low, w_high).
  scalar_t v2 = 0;
  if (h_low >= 0 && w_high <= width - 1) {
    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
    v2 = bottom_data[ptr2];
    grad_h_weight -= lw * v2;
    grad_w_weight += hh * v2;
    atomicAdd(grad_value + ptr2, w2 * top_grad_value);
  }
  // Corner (h_high, w_low).
  scalar_t v3 = 0;
  if (h_high <= height - 1 && w_low >= 0) {
    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
    v3 = bottom_data[ptr3];
    grad_h_weight += hw * v3;
    grad_w_weight -= lh * v3;
    atomicAdd(grad_value + ptr3, w3 * top_grad_value);
  }
  // Corner (h_high, w_high).
  scalar_t v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1) {
    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
    v4 = bottom_data[ptr4];
    grad_h_weight += lw * v4;
    grad_w_weight += lh * v4;
    atomicAdd(grad_value + ptr4, w4 * top_grad_value);
  }

  // Interpolated value, needed for the attention-weight gradient.
  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  *grad_attn_weight = top_grad * val;
  // The factors width / height are presumably the chain-rule terms for
  // sampling locations that are normalised by the level size -- confirm
  // against the caller.
  *grad_sampling_loc = width * grad_w_weight * top_grad_value;
  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
}

// Backward of one bilinear sample for multi-scale deformable attention,
// variant that accumulates every output with atomicAdd.
//
// For the sample at (h, w) of head `m`, channel `c`:
//   * adds w_k * top_grad * attn_weight to `grad_value` at each of the four
//     surrounding corners that passes its bound check;
//   * adds the attention-weight gradient to *grad_attn_weight;
//   * adds the sampling-location gradient to grad_sampling_loc[0] (w) and
//     grad_sampling_loc[1] (h).
// Because all outputs are accumulated atomically, several threads may target
// the same destination.
// NOTE(review): the destinations are only added to, so they presumably have
// to be zero-initialised by the caller -- confirm.
template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear_gm(
    const scalar_t *&bottom_data, const int &height, const int &width,
    const int &nheads, const int &channels, const scalar_t &h,
    const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad,
    const scalar_t &attn_weight, scalar_t *&grad_value,
    scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) {
  // Integer corners enclosing the sample position.
  const int h_low = floorf(h);
  const int w_low = floorf(w);
  const int h_high = h_low + 1;
  const int w_high = w_low + 1;

  // Fractional parts (lh, lw) and their complements (hh, hw).
  const scalar_t lh = h - h_low;
  const scalar_t lw = w - w_low;
  const scalar_t hh = 1 - lh, hw = 1 - lw;

  // Element offsets: one column step spans nheads * channels elements, one
  // row step spans width such columns.
  const int w_stride = nheads * channels;
  const int h_stride = width * w_stride;
  const int h_low_ptr_offset = h_low * h_stride;
  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
  const int w_low_ptr_offset = w_low * w_stride;
  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
  const int base_ptr = m * channels + c;

  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
  const scalar_t top_grad_value = top_grad * attn_weight;
  // Accumulate d(val)/dh and d(val)/dw of the interpolated value below.
  scalar_t grad_h_weight = 0, grad_w_weight = 0;

  // Corner (h_low, w_low).
  scalar_t v1 = 0;
  if (h_low >= 0 && w_low >= 0) {
    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
    v1 = bottom_data[ptr1];
    grad_h_weight -= hw * v1;
    grad_w_weight -= hh * v1;
    atomicAdd(grad_value + ptr1, w1 * top_grad_value);
  }
  // Corner (h_low, w_high).
  scalar_t v2 = 0;
  if (h_low >= 0 && w_high <= width - 1) {
    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
    v2 = bottom_data[ptr2];
    grad_h_weight -= lw * v2;
    grad_w_weight += hh * v2;
    atomicAdd(grad_value + ptr2, w2 * top_grad_value);
  }
  // Corner (h_high, w_low).
  scalar_t v3 = 0;
  if (h_high <= height - 1 && w_low >= 0) {
    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
    v3 = bottom_data[ptr3];
    grad_h_weight += hw * v3;
    grad_w_weight -= lh * v3;
    atomicAdd(grad_value + ptr3, w3 * top_grad_value);
  }
  // Corner (h_high, w_high).
  scalar_t v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1) {
    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
    v4 = bottom_data[ptr4];
    grad_h_weight += lw * v4;
    grad_w_weight += lh * v4;
    atomicAdd(grad_value + ptr4, w4 * top_grad_value);
  }

  // Interpolated value, needed for the attention-weight gradient.
  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  atomicAdd(grad_attn_weight, top_grad * val);
  // The factors width / height are presumably the chain-rule terms for
  // sampling locations that are normalised by the level size -- confirm
  // against the caller.
  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
}

// Forward pass of multi-scale deformable attention.
//
// Each loop index produces one element of `data_col`, decoded as
// (batch, query, head, channel) with the channel varying fastest.  For every
// level and sampling point the value tensor is bilinearly sampled at the
// location read from `data_sampling_loc`, multiplied by the matching entry of
// `data_attn_weight`, and the products are summed.
template <typename scalar_t>
__global__ void ms_deformable_im2col_gpu_kernel(
    const int n, const scalar_t *data_value, const int64_t *data_spatial_shapes,
    const int64_t *data_level_start_index, const scalar_t *data_sampling_loc,
    const scalar_t *data_attn_weight, const int batch_size,
    const int spatial_size, const int num_heads, const int channels,
    const int num_levels, const int num_query, const int num_point,
    scalar_t *data_col) {
  MUSA_1D_KERNEL_LOOP(index, n) {
    const int flat = index;
    const int channel = flat % channels;
    // Index over (batch, query, head); also addresses the sampling tensors.
    const int sampling_index = flat / channels;
    const int head = sampling_index % num_heads;
    const int batch = sampling_index / num_heads / num_query;

    // Elements per spatial position of the value tensor.
    const int position_stride = num_heads * channels;
    const int batch_value_offset = batch * spatial_size * position_stride;

    // Running positions in the attention-weight and sampling-location
    // tensors; locations are stored as consecutive (w, h) pairs.
    int weight_at = sampling_index * num_levels * num_point;
    int loc_at = weight_at << 1;

    scalar_t acc = 0;
    for (int level = 0; level < num_levels; ++level) {
      const int level_start = data_level_start_index[level];
      const int shape_at = level << 1;
      const int level_h = data_spatial_shapes[shape_at];
      const int level_w = data_spatial_shapes[shape_at + 1];
      const scalar_t *level_value =
          data_value + (batch_value_offset + level_start * position_stride);

      for (int point = 0; point < num_point;
           ++point, weight_at += 1, loc_at += 2) {
        const scalar_t loc_w = data_sampling_loc[loc_at];
        const scalar_t loc_h = data_sampling_loc[loc_at + 1];
        const scalar_t weight = data_attn_weight[weight_at];

        // Scale the stored location by the level size and shift by half a
        // pixel to obtain the sampling position.
        const scalar_t h_im = loc_h * level_h - 0.5;
        const scalar_t w_im = loc_w * level_w - 0.5;

        const bool inside =
            h_im > -1 && w_im > -1 && h_im < level_h && w_im < level_w;
        if (inside) {
          acc += ms_deform_attn_im2col_bilinear(level_value, level_h, level_w,
                                                num_heads, channels, h_im,
                                                w_im, head, channel) *
                 weight;
        }
      }
    }
    data_col[index] = acc;
  }
}

// Backward pass of multi-scale deformable attention; shared-memory variant
// with a serial reduction performed by thread 0.
//
// Each loop index handles one element of `grad_col`, decoded as
// (batch, query, head, channel) with the channel varying fastest.  For every
// (level, point) pair each thread writes its contribution to the sampling
// location / attention weight gradients into its own shared-memory slot,
// then thread 0 sums all `blockSize` slots and stores the totals to
// `grad_sampling_loc` and `grad_attn_weight`.  `grad_value` is accumulated
// with atomicAdd inside ms_deform_attn_col2im_bilinear.
// NOTE(review): thread 0 sums exactly `blockSize` slots and writes through its
// own output pointers, so this presumably requires blockDim.x == blockSize
// and all threads of a block to share one (batch, query, head), i.e. one
// block per `channels` consecutive indices -- confirm against the launcher.
// NOTE(review): __syncthreads() sits inside the index loop, so every thread
// of a block has to run the same number of iterations -- confirm that n is a
// multiple of the block size.
template <typename scalar_t, unsigned int blockSize>
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
    const int n, const scalar_t *grad_col, const scalar_t *data_value,
    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
    const int batch_size, const int spatial_size, const int num_heads,
    const int channels, const int num_levels, const int num_query,
    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight) {
  // Per-thread scratch: two location-gradient slots (w, h) and one
  // attention-weight-gradient slot per thread.
  __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
  __shared__ scalar_t cache_grad_attn_weight[blockSize];
  unsigned int tid = threadIdx.x;
  // Elements per spatial position of the value tensor.
  const int qid_stride = num_heads * channels;
  MUSA_1D_KERNEL_LOOP(index, n) {
    // Decode the flat index; sampling_index covers (batch, query, head).
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    // Running positions in the attention-weight and sampling-location
    // tensors; locations are stored as consecutive (w, h) pairs.
    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    scalar_t *grad_sampling_loc_out =
        grad_sampling_loc + (grad_sampling_ptr << 1);
    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col = 0; l_col < num_levels; ++l_col) {
      const int level_start_id = data_level_start_index[l_col];
      // Spatial shapes are stored as consecutive (h, w) pairs per level.
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const int value_ptr_offset =
          data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col = 0; p_col < num_point; ++p_col) {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        // Scale the stored location by the level size and shift by half a
        // pixel to obtain the sampling position.
        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        // Clear this thread's slots so that out-of-range samples add zero.
        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
        *(cache_grad_attn_weight + threadIdx.x) = 0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
          ms_deform_attn_col2im_bilinear(
              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
              cache_grad_sampling_loc + (threadIdx.x << 1),
              cache_grad_attn_weight + threadIdx.x);
        }

        // Make every thread's slot visible before thread 0 reads them.
        __syncthreads();
        if (tid == 0) {
          // Serial sum over all blockSize slots.
          scalar_t _grad_w = cache_grad_sampling_loc[0],
                   _grad_h = cache_grad_sampling_loc[1],
                   _grad_a = cache_grad_attn_weight[0];
          int sid = 2;
          for (unsigned int _tid = 1; _tid < blockSize; ++_tid) {
            _grad_w += cache_grad_sampling_loc[sid];
            _grad_h += cache_grad_sampling_loc[sid + 1];
            _grad_a += cache_grad_attn_weight[_tid];
            sid += 2;
          }

          *grad_sampling_loc_out = _grad_w;
          *(grad_sampling_loc_out + 1) = _grad_h;
          *grad_attn_weight_out = _grad_a;
        }
        // Keep the slots intact until thread 0 has finished reading them.
        __syncthreads();

        // Advance to the next sampling point.
        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight_out += grad_weight_stride;
        grad_sampling_loc_out += grad_loc_stride;
      }
    }
  }
}

// Backward pass of multi-scale deformable attention; shared-memory variant
// with a pairwise (halving) reduction across the block.
//
// Each loop index handles one element of `grad_col`, decoded as
// (batch, query, head, channel) with the channel varying fastest.  For every
// (level, point) pair each thread writes its contribution to the sampling
// location / attention weight gradients into its own shared-memory slot; the
// slots are then folded in halves until slot 0 holds the total, which thread
// 0 stores to `grad_sampling_loc` and `grad_attn_weight`.  `grad_value` is
// accumulated with atomicAdd inside ms_deform_attn_col2im_bilinear.
// The halving loop folds every slot into slot 0 only when `blockSize` is a
// power of two.
// NOTE(review): the reduction covers `blockSize` slots and thread 0 writes
// through its own output pointers, so this presumably requires
// blockDim.x == blockSize and all threads of a block to share one
// (batch, query, head) -- confirm against the launcher.
// NOTE(review): __syncthreads() sits inside the index loop, so every thread
// of a block has to run the same number of iterations -- confirm that n is a
// multiple of the block size.
template <typename scalar_t, unsigned int blockSize>
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
    const int n, const scalar_t *grad_col, const scalar_t *data_value,
    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
    const int batch_size, const int spatial_size, const int num_heads,
    const int channels, const int num_levels, const int num_query,
    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight) {
  // Per-thread scratch: two location-gradient slots (w, h) and one
  // attention-weight-gradient slot per thread.
  __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
  __shared__ scalar_t cache_grad_attn_weight[blockSize];
  unsigned int tid = threadIdx.x;
  MUSA_1D_KERNEL_LOOP(index, n) {
    // Decode the flat index; sampling_index covers (batch, query, head).
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    // Running positions in the attention-weight and sampling-location
    // tensors; locations are stored as consecutive (w, h) pairs.
    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    scalar_t *grad_sampling_loc_out =
        grad_sampling_loc + (grad_sampling_ptr << 1);
    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    // Elements per spatial position of the value tensor.
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col = 0; l_col < num_levels; ++l_col) {
      const int level_start_id = data_level_start_index[l_col];
      // Spatial shapes are stored as consecutive (h, w) pairs per level.
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const int value_ptr_offset =
          data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col = 0; p_col < num_point; ++p_col) {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        // Scale the stored location by the level size and shift by half a
        // pixel to obtain the sampling position.
        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        // Clear this thread's slots so that out-of-range samples add zero.
        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
        *(cache_grad_attn_weight + threadIdx.x) = 0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
          ms_deform_attn_col2im_bilinear(
              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
              cache_grad_sampling_loc + (threadIdx.x << 1),
              cache_grad_attn_weight + threadIdx.x);
        }

        // Make every thread's slot visible before the reduction starts.
        __syncthreads();

        // Pairwise reduction: in each round the lower half of the active
        // slots absorbs the upper half, then the active range is halved.
        for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
          if (tid < s) {
            const unsigned int xid1 = tid << 1;
            const unsigned int xid2 = (tid + s) << 1;
            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
            cache_grad_sampling_loc[xid1 + 1] +=
                cache_grad_sampling_loc[xid2 + 1];
          }
          __syncthreads();
        }

        // Slot 0 now holds the block total.
        if (tid == 0) {
          *grad_sampling_loc_out = cache_grad_sampling_loc[0];
          *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1];
          *grad_attn_weight_out = cache_grad_attn_weight[0];
        }
        // Keep the slots intact until thread 0 has finished reading them.
        __syncthreads();

        // Advance to the next sampling point.
        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight_out += grad_weight_stride;
        grad_sampling_loc_out += grad_loc_stride;
      }
    }
  }
}

// Backward (col2im) kernel for multi-scale deformable attention that stages
// each thread's sampling-location and attention-weight gradients in dynamic
// shared memory and lets thread 0 sum all slots serially before storing the
// result.
//
// Dynamic shared memory must hold at least 3 * blockDim.x scalar_t values:
// two sampling-location slots plus one attention-weight slot per thread.
//
// NOTE(review): thread 0 stores the block-wide sum through its own output
// pointers with plain assignments, so this presumably requires every thread of
// a block to share the same sampling_index (e.g. blockDim.x == channels) and
// only one block to own each output element - confirm against the launcher.
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(
    const int n, const scalar_t *grad_col, const scalar_t *data_value,
    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
    const int batch_size, const int spatial_size, const int num_heads,
    const int channels, const int num_levels, const int num_query,
    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight) {
  // Shared layout: [2 * blockDim.x sampling-location grads | blockDim.x
  // attention-weight grads], reinterpreted from the int-typed extern array.
  extern __shared__ int _s[];
  scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
  scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
  unsigned int tid = threadIdx.x;
  MUSA_1D_KERNEL_LOOP(index, n) {
    // Decompose the flat index: channel varies fastest, then head, then
    // query, then batch. sampling_index keeps the (batch, query, head) part.
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    // One attention weight and one (w, h) location pair per
    // (sampling_index, level, point); the location offset is twice the weight
    // offset.
    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    scalar_t *grad_sampling_loc_out =
        grad_sampling_loc + (grad_sampling_ptr << 1);
    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col = 0; l_col < num_levels; ++l_col) {
      // data_spatial_shapes stores (h, w) pairs per level.
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const int value_ptr_offset =
          data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col = 0; p_col < num_point; ++p_col) {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        // Scale the stored location by the level size and shift by -0.5.
        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        // Clear this thread's slots so that skipped samples contribute zero.
        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
        *(cache_grad_attn_weight + threadIdx.x) = 0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
          // Helper is defined outside this view; it is handed this thread's
          // shared-memory slots and the grad_value base for this level.
          ms_deform_attn_col2im_bilinear(
              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
              cache_grad_sampling_loc + (threadIdx.x << 1),
              cache_grad_attn_weight + threadIdx.x);
        }

        // All slots must be written before thread 0 reads them.
        __syncthreads();
        if (tid == 0) {
          // Serial sum over every thread's slots, in increasing thread order.
          scalar_t _grad_w = cache_grad_sampling_loc[0],
                   _grad_h = cache_grad_sampling_loc[1],
                   _grad_a = cache_grad_attn_weight[0];
          int sid = 2;
          for (unsigned int _tid = 1; _tid < blockDim.x; ++_tid) {
            _grad_w += cache_grad_sampling_loc[sid];
            _grad_h += cache_grad_sampling_loc[sid + 1];
            _grad_a += cache_grad_attn_weight[_tid];
            sid += 2;
          }

          *grad_sampling_loc_out = _grad_w;
          *(grad_sampling_loc_out + 1) = _grad_h;
          *grad_attn_weight_out = _grad_a;
        }
        // Keep the slots intact until thread 0 has finished reading them.
        __syncthreads();

        // Advance to the next sampling point.
        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight_out += grad_weight_stride;
        grad_sampling_loc_out += grad_loc_stride;
      }
    }
  }
}

// Backward (col2im) kernel for multi-scale deformable attention that stages
// each thread's sampling-location and attention-weight gradients in dynamic
// shared memory and combines them with a pairwise (halving) reduction that
// also works when blockDim.x is not a power of two.
//
// Dynamic shared memory must hold at least 3 * blockDim.x scalar_t values:
// two sampling-location slots plus one attention-weight slot per thread.
//
// NOTE(review): thread 0 stores the block-wide sum through its own output
// pointers with plain assignments, so this presumably requires every thread of
// a block to share the same sampling_index (e.g. blockDim.x == channels) and
// only one block to own each output element - confirm against the launcher.
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(
    const int n, const scalar_t *grad_col, const scalar_t *data_value,
    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
    const int batch_size, const int spatial_size, const int num_heads,
    const int channels, const int num_levels, const int num_query,
    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight) {
  // Shared layout: [2 * blockDim.x sampling-location grads | blockDim.x
  // attention-weight grads], reinterpreted from the int-typed extern array.
  extern __shared__ int _s[];
  scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
  scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
  unsigned int tid = threadIdx.x;
  MUSA_1D_KERNEL_LOOP(index, n) {
    // Decompose the flat index: channel varies fastest, then head, then
    // query, then batch. sampling_index keeps the (batch, query, head) part.
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    // One attention weight and one (w, h) location pair per
    // (sampling_index, level, point); the location offset is twice the weight
    // offset.
    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    scalar_t *grad_sampling_loc_out =
        grad_sampling_loc + (grad_sampling_ptr << 1);
    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col = 0; l_col < num_levels; ++l_col) {
      // data_spatial_shapes stores (h, w) pairs per level.
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const int value_ptr_offset =
          data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col = 0; p_col < num_point; ++p_col) {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        // Scale the stored location by the level size and shift by -0.5.
        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        // Clear this thread's slots so that skipped samples contribute zero.
        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
        *(cache_grad_attn_weight + threadIdx.x) = 0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
          // Helper is defined outside this view; it is handed this thread's
          // shared-memory slots and the grad_value base for this level.
          ms_deform_attn_col2im_bilinear(
              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
              cache_grad_sampling_loc + (threadIdx.x << 1),
              cache_grad_attn_weight + threadIdx.x);
        }

        // All slots must be written before the reduction starts.
        __syncthreads();

        // Halving reduction: `spre` is the number of live slots before the
        // step and `s` is half of it (rounded down).
        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
             s >>= 1, spre >>= 1) {
          if (tid < s) {
            const unsigned int xid1 = tid << 1;
            const unsigned int xid2 = (tid + s) << 1;
            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
            cache_grad_sampling_loc[xid1 + 1] +=
                cache_grad_sampling_loc[xid2 + 1];
            // When `spre` is odd, slot 2 * s has no partner; this condition
            // only holds for tid == 0, which folds that leftover slot in.
            if (tid + (s << 1) < spre) {
              cache_grad_attn_weight[tid] +=
                  cache_grad_attn_weight[tid + (s << 1)];
              cache_grad_sampling_loc[xid1] +=
                  cache_grad_sampling_loc[xid2 + (s << 1)];
              cache_grad_sampling_loc[xid1 + 1] +=
                  cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
            }
          }
          // Finish this step everywhere before the next one reads the slots.
          __syncthreads();
        }

        // Slot 0 now holds the block-wide sums; store them (non-atomically).
        if (tid == 0) {
          *grad_sampling_loc_out = cache_grad_sampling_loc[0];
          *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1];
          *grad_attn_weight_out = cache_grad_attn_weight[0];
        }
        // Keep slot 0 intact until thread 0 has stored it.
        __syncthreads();

        // Advance to the next sampling point.
        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight_out += grad_weight_stride;
        grad_sampling_loc_out += grad_loc_stride;
      }
    }
  }
}

// Backward (col2im) kernel for multi-scale deformable attention that stages
// each thread's sampling-location and attention-weight gradients in dynamic
// shared memory, combines them with a pairwise (halving) reduction, and then
// accumulates the block's sum into the outputs with atomicAdd.
//
// Dynamic shared memory must hold at least 3 * blockDim.x scalar_t values:
// two sampling-location slots plus one attention-weight slot per thread.
//
// NOTE(review): because the final store is an atomicAdd, the outputs are
// accumulated into rather than overwritten; presumably the caller zeroes
// grad_sampling_loc / grad_attn_weight beforehand and several blocks may
// contribute to the same element - confirm against the launcher.
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
    const int n, const scalar_t *grad_col, const scalar_t *data_value,
    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
    const int batch_size, const int spatial_size, const int num_heads,
    const int channels, const int num_levels, const int num_query,
    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight) {
  // Shared layout: [2 * blockDim.x sampling-location grads | blockDim.x
  // attention-weight grads], reinterpreted from the int-typed extern array.
  extern __shared__ int _s[];
  scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
  scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
  unsigned int tid = threadIdx.x;
  MUSA_1D_KERNEL_LOOP(index, n) {
    // Decompose the flat index: channel varies fastest, then head, then
    // query, then batch. sampling_index keeps the (batch, query, head) part.
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    // One attention weight and one (w, h) location pair per
    // (sampling_index, level, point); the location offset is twice the weight
    // offset.
    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    scalar_t *grad_sampling_loc_out =
        grad_sampling_loc + (grad_sampling_ptr << 1);
    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col = 0; l_col < num_levels; ++l_col) {
      // data_spatial_shapes stores (h, w) pairs per level.
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const int value_ptr_offset =
          data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col = 0; p_col < num_point; ++p_col) {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        // Scale the stored location by the level size and shift by -0.5.
        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        // Clear this thread's slots so that skipped samples contribute zero.
        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
        *(cache_grad_attn_weight + threadIdx.x) = 0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
          // Helper is defined outside this view; it is handed this thread's
          // shared-memory slots and the grad_value base for this level.
          ms_deform_attn_col2im_bilinear(
              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
              cache_grad_sampling_loc + (threadIdx.x << 1),
              cache_grad_attn_weight + threadIdx.x);
        }

        // All slots must be written before the reduction starts.
        __syncthreads();

        // Halving reduction: `spre` is the number of live slots before the
        // step and `s` is half of it (rounded down).
        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
             s >>= 1, spre >>= 1) {
          if (tid < s) {
            const unsigned int xid1 = tid << 1;
            const unsigned int xid2 = (tid + s) << 1;
            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
            cache_grad_sampling_loc[xid1 + 1] +=
                cache_grad_sampling_loc[xid2 + 1];
            // When `spre` is odd, slot 2 * s has no partner; this condition
            // only holds for tid == 0, which folds that leftover slot in.
            if (tid + (s << 1) < spre) {
              cache_grad_attn_weight[tid] +=
                  cache_grad_attn_weight[tid + (s << 1)];
              cache_grad_sampling_loc[xid1] +=
                  cache_grad_sampling_loc[xid2 + (s << 1)];
              cache_grad_sampling_loc[xid1 + 1] +=
                  cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
            }
          }
          // Finish this step everywhere before the next one reads the slots.
          __syncthreads();
        }

        // Slot 0 now holds the block-wide sums; add them to the outputs
        // atomically instead of overwriting.
        if (tid == 0) {
          atomicAdd(grad_sampling_loc_out, cache_grad_sampling_loc[0]);
          atomicAdd(grad_sampling_loc_out + 1, cache_grad_sampling_loc[1]);
          atomicAdd(grad_attn_weight_out, cache_grad_attn_weight[0]);
        }
        // Keep slot 0 intact until thread 0 has consumed it.
        __syncthreads();

        // Advance to the next sampling point.
        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight_out += grad_weight_stride;
        grad_sampling_loc_out += grad_loc_stride;
      }
    }
  }
}

// Backward (col2im) kernel for multi-scale deformable attention that uses no
// shared-memory staging: the output gradient pointers are handed straight to
// ms_deform_attn_col2im_bilinear_gm (defined outside this view), which
// presumably accumulates into global memory - confirm in that helper.
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_gm(
    const int n, const scalar_t *grad_col, const scalar_t *data_value,
    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
    const int batch_size, const int spatial_size, const int num_heads,
    const int channels, const int num_levels, const int num_query,
    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight) {
  MUSA_1D_KERNEL_LOOP(index, n) {
    // Split the flat index: channel varies fastest, then head, then query,
    // then batch. sampling_index keeps the (batch, query, head) part.
    const int flat_index = index;
    const int c_col = flat_index % channels;
    const int sampling_index = flat_index / channels;
    const int m_col = sampling_index % num_heads;
    const int b_col = sampling_index / num_heads / num_query;

    const scalar_t top_grad = grad_col[index];

    // Running offsets over the (level, point) samples of this sampling_index:
    // one weight and one (w, h) location pair per sample.
    int weight_idx = sampling_index * num_levels * num_point;
    int loc_idx = weight_idx << 1;
    scalar_t *loc_grad_out = grad_sampling_loc + (weight_idx << 1);
    scalar_t *weight_grad_out = grad_attn_weight + weight_idx;

    const int head_channel_stride = num_heads * channels;
    const int batch_value_offset = b_col * spatial_size * head_channel_stride;

    for (int l_col = 0; l_col < num_levels; ++l_col) {
      // data_spatial_shapes stores (h, w) pairs per level.
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h = data_spatial_shapes[l_col << 1];
      const int spatial_w = data_spatial_shapes[(l_col << 1) + 1];
      const int level_value_offset =
          batch_value_offset + level_start_id * head_channel_stride;
      // Kept as named pointer variables because they are passed on to the
      // helper exactly as in the original call.
      const scalar_t *level_value = data_value + level_value_offset;
      scalar_t *level_grad_value = grad_value + level_value_offset;

      for (int p_col = 0; p_col < num_point; ++p_col) {
        const scalar_t loc_w = data_sampling_loc[loc_idx];
        const scalar_t loc_h = data_sampling_loc[loc_idx + 1];
        const scalar_t weight = data_attn_weight[weight_idx];

        // Scale the stored location by the level size and shift by -0.5.
        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        const bool in_range =
            h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w;
        if (in_range) {
          ms_deform_attn_col2im_bilinear_gm(
              level_value, spatial_h, spatial_w, num_heads, channels, h_im,
              w_im, m_col, c_col, top_grad, weight, level_grad_value,
              loc_grad_out, weight_grad_out);
        }

        // Step to the next sample.
        ++weight_idx;
        loc_idx += 2;
        weight_grad_out += 1;
        loc_grad_out += 2;
      }
    }
  }
}
#endif  // DEFORM_ATTN_MUSA_KERNEL


================================================
FILE: mmcv/ops/csrc/common/musa/nms_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef NMS_MUSA_KERNEL_MUH
#define NMS_MUSA_KERNEL_MUH

#include <float.h>
#include "pytorch_musa_helper.hpp"


// Number of bits in one `unsigned long long` mask word. The NMS kernels below
// use it as the tile size: one box per bit of a word.
int const threadsPerBlock = sizeof(unsigned long long int) * 8;

// Returns true when the IoU of boxes `a` and `b` exceeds `threshold`.
// Boxes are read as (x1, y1, x2, y2); `offset` is added to every width and
// height. The test is written as inter > threshold * union, so no division is
// performed.
__device__ inline bool devIoU(float const *const a, float const *const b,
                              const int offset, const float threshold) {
  // Corners of the intersection rectangle.
  const float inter_x1 = fmaxf(a[0], b[0]);
  const float inter_x2 = fminf(a[2], b[2]);
  const float inter_y1 = fmaxf(a[1], b[1]);
  const float inter_y2 = fminf(a[3], b[3]);

  // Clamp to zero when the boxes do not overlap along an axis.
  const float inter_w = fmaxf(inter_x2 - inter_x1 + offset, 0.f);
  const float inter_h = fmaxf(inter_y2 - inter_y1 + offset, 0.f);
  const float inter_area = inter_w * inter_h;

  const float area_a = (a[2] - a[0] + offset) * (a[3] - a[1] + offset);
  const float area_b = (b[2] - b[0] + offset) * (b[3] - b[1] + offset);
  return inter_area > threshold * (area_a + area_b - inter_area);
}

// Builds the pairwise suppression bit mask used by NMS.
//
// The boxes are split into tiles of `threadsPerBlock` boxes. For every tile
// pair (row_start, col_start) with row_start <= col_start, thread `tid`
// compares box `row_start * threadsPerBlock + tid` with each box of the column
// tile and sets bit `i` of the mask word when devIoU() reports an overlap
// above `iou_threshold` with the i-th box of that tile. On the diagonal tile
// only later boxes (i > tid) are compared.
//
// Params:
//   n_boxes        number of boxes.
//   iou_threshold  overlap threshold forwarded to devIoU().
//   offset         width/height offset forwarded to devIoU().
//   dev_boxes      boxes, 4 floats each (x1, y1, x2, y2).
//   dev_mask       output; word for (box, column tile) is stored at
//                  box * blocks + col_start, where
//                  blocks = ceil(n_boxes / threadsPerBlock). This is the same
//                  row stride gather_keep_from_mask() reads with.
//
// Fixes relative to the previous version (all only matter when the launch
// grid is smaller than blocks x blocks):
//   * the mask row stride was gridDim.y, which disagrees with the reader's
//     stride whenever gridDim.y != blocks;
//   * `return` on a lower-triangle tile made the block skip its remaining
//     tiles; it now moves on to the next tile instead;
//   * a barrier now separates the last read of block_boxes from its refill in
//     the next iteration.
// NOTE(review): the last two assume MUSA_2D_KERNEL_BLOCK_LOOP expands to
// nested block-stride `for` loops - confirm in the helper header.
__global__ static void nms_musa(const int n_boxes, const float iou_threshold,
                                const int offset, const float *dev_boxes,
                                unsigned long long *dev_mask) {
  int blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;
  MUSA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
    const int tid = threadIdx.x;

    // Lower-triangle tiles carry no information; the condition is uniform
    // across the block, so skipping does not split the barriers below.
    if (row_start > col_start) continue;

    const int row_size =
        fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
    const int col_size =
        fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

    // Stage the column tile's boxes in shared memory, one box per thread.
    __shared__ float block_boxes[threadsPerBlock * 4];
    if (tid < col_size) {
      block_boxes[tid * 4 + 0] =
          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0];
      block_boxes[tid * 4 + 1] =
          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1];
      block_boxes[tid * 4 + 2] =
          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2];
      block_boxes[tid * 4 + 3] =
          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3];
    }
    __syncthreads();

    if (tid < row_size) {
      const int cur_box_idx = threadsPerBlock * row_start + tid;
      const float *cur_box = dev_boxes + cur_box_idx * 4;
      int i = 0;
      unsigned long long int t = 0;
      int start = 0;
      if (row_start == col_start) {
        // Within the same tile only compare against later boxes.
        start = tid + 1;
      }
      for (i = start; i < col_size; i++) {
        if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) {
          t |= 1ULL << i;
        }
      }
      // Row stride is the number of column tiles, independent of the grid
      // size; widen before multiplying so the index cannot wrap in int.
      dev_mask[static_cast<unsigned long long>(cur_box_idx) * blocks +
               col_start] = t;
    }

    // Everyone must be done reading block_boxes before a following iteration
    // overwrites it.
    __syncthreads();
  }
}

// Walks the boxes in index order and marks the ones that survive NMS.
//
// `dev_mask` holds, per box, `col_blocks` 64-bit words whose set bits name the
// boxes that box suppresses. `removed` (dynamic shared memory, at least
// `col_blocks` words) accumulates the suppressed boxes seen so far. Only
// `keep[i] = true` is ever written, so the caller has to clear `keep` first.
// Threads cooperate through __syncthreads(), so the kernel is meant to run as
// a single block.
__global__ static void gather_keep_from_mask(bool *keep,
                                             const unsigned long long *dev_mask,
                                             const int n_boxes) {
  const int col_blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;
  const int tid = threadIdx.x;

  // Bit set of boxes suppressed so far, one word per tile of boxes.
  extern __shared__ unsigned long long removed[];

  // Start with nothing suppressed.
  for (int word = tid; word < col_blocks; word += blockDim.x) {
    removed[word] = 0;
  }
  __syncthreads();

  for (int nblock = 0; nblock < col_blocks; ++nblock) {
    unsigned long long removed_val = removed[nblock];
    __syncthreads();
    const int first_box = nblock * threadsPerBlock;
#pragma unroll
    for (int bit = 0; bit < threadsPerBlock; ++bit) {
      const int box = first_box + bit;
      if (box >= n_boxes) break;

      // Already suppressed by an earlier kept box: nothing to do.
      if (removed_val & (1ULL << bit)) continue;

      // The box survives; a single thread records it.
      if (tid == 0) {
        keep[box] = true;
      }

      // Merge everything this box suppresses into `removed`. Words before the
      // current tile are never consulted again, so they are skipped.
      const unsigned long long *suppressed = dev_mask + box * col_blocks;
      for (int word = tid; word < col_blocks; word += blockDim.x) {
        if (word >= nblock) removed[word] |= suppressed[word];
      }
      __syncthreads();

      // Refresh the local copy of the current tile's word.
      removed_val = removed[nblock];
    }
  }
}

#endif  // NMS_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/nms_quadri_musa.muh
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#ifndef NMS_QUADRI_MUSA_MUH
#define NMS_QUADRI_MUSA_MUH

#include "pytorch_musa_helper.hpp"
#include "box_iou_rotated_utils.hpp"

// Integer division of x by y, rounded up (for positive operands).
__host__ __device__ inline int divideUP(const int x, const int y) {
  const int rounded_numerator = x + y - 1;
  return rounded_numerator / y;
}

namespace {
// Number of bits in one `unsigned long long` mask word; used as the tile size
// (boxes per block) by the kernel below.
int const threadsPerBlock = sizeof(unsigned long long) * 8;
}

// Builds the pairwise suppression bit mask for quadrilateral-box NMS.
//
// Block (blockIdx.x, blockIdx.y) handles the tile pair (column tile, row
// tile), each tile holding up to `threadsPerBlock` boxes. Thread t of the row
// tile compares its box with every box of the column tile via
// single_box_iou_quadri() and sets bit i of its mask word when the IoU with
// the i-th column box exceeds `iou_threshold`. On the diagonal tile only later
// boxes (i > t) are compared.
//
// Params:
//   n_boxes        number of boxes.
//   iou_threshold  IoU above which a box is marked as suppressed.
//   dev_boxes      box records; each record starts with the 8 corner
//                  coordinates (x1, y1, ..., x4, y4). Records are 9 values
//                  apart when multi_label == 1 and 8 values apart otherwise
//                  (the extra value is presumably a label - confirm with the
//                  caller).
//   dev_mask       output, indexed as box * ceil(n_boxes / threadsPerBlock)
//                  + column tile.
//   multi_label    selects the record stride as described above.
template <typename T>
__global__ void nms_quadri_musa_kernel(const int n_boxes,
                                       const float iou_threshold,
                                       const T* dev_boxes,
                                       unsigned long long* dev_mask,
                                       const int multi_label) {
  // The two record layouts differ only in the distance between records.
  const int box_stride = (multi_label == 1) ? 9 : 8;

  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  const int row_size =
      min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
  const int col_size =
      min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

  // Stage the 8 coordinates of every column-tile box in shared memory,
  // one box per thread.
  __shared__ T block_boxes[threadsPerBlock * 8];
  if (threadIdx.x < col_size) {
    const T* src =
        dev_boxes + (threadsPerBlock * col_start + threadIdx.x) * box_stride;
    T* dst = block_boxes + threadIdx.x * 8;
    for (int k = 0; k < 8; ++k) {
      dst[k] = src[k];
    }
  }
  __syncthreads();

  // No barrier follows, so threads without a row box can leave here.
  if (!(threadIdx.x < row_size)) return;

  const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
  const T* cur_box = dev_boxes + cur_box_idx * box_stride;

  // Within the diagonal tile only compare against later boxes.
  int first = 0;
  if (row_start == col_start) {
    first = threadIdx.x + 1;
  }

  unsigned long long overlap_bits = 0;
  for (int i = first; i < col_size; i++) {
    if (single_box_iou_quadri<T>(cur_box, block_boxes + i * 8, 0) >
        iou_threshold) {
      overlap_bits |= 1ULL << i;
    }
  }

  const int col_blocks = divideUP(n_boxes, threadsPerBlock);
  dev_mask[cur_box_idx * col_blocks + col_start] = overlap_bits;
}

#endif


================================================
FILE: mmcv/ops/csrc/common/musa/nms_rotated_musa.muh
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu
#ifndef NMS_ROTATED_MUSA_MUH
#define NMS_ROTATED_MUSA_MUH

#include "pytorch_musa_helper.hpp"
#include "box_iou_rotated_utils.hpp"

// Integer division of x by y, rounded up (for positive operands).
__host__ __device__ inline int divideUP(const int x, const int y) {
  const int rounded_numerator = x + y - 1;
  return rounded_numerator / y;
}

namespace {
// Number of bits in one `unsigned long long` mask word; used as the tile size
// (boxes per block) by the kernel below.
int const threadsPerBlock = sizeof(unsigned long long) * 8;
}

// Builds the pairwise suppression bit mask for rotated-box NMS.
//
// Block (blockIdx.x, blockIdx.y) handles the tile pair (column tile, row
// tile), each tile holding up to `threadsPerBlock` boxes. Thread t of the row
// tile compares its box with every box of the column tile via
// single_box_iou_rotated() and sets bit i of its mask word when the IoU with
// the i-th column box exceeds `iou_threshold`. On the diagonal tile only later
// boxes (i > t) are compared.
//
// Params:
//   n_boxes        number of boxes.
//   iou_threshold  IoU above which a box is marked as suppressed.
//   dev_boxes      box records; each record starts with 5 values
//                  (x_center, y_center, width, height, angle). Records are 6
//                  values apart when multi_label == 1 and 5 values apart
//                  otherwise (the extra value is presumably a label - confirm
//                  with the caller).
//   dev_mask       output, indexed as box * ceil(n_boxes / threadsPerBlock)
//                  + column tile.
//   multi_label    selects the record stride as described above.
template <typename T>
__global__ void nms_rotated_musa_kernel(const int n_boxes,
                                        const float iou_threshold,
                                        const T* dev_boxes,
                                        unsigned long long* dev_mask,
                                        const int multi_label) {
  // The two record layouts differ only in the distance between records.
  const int box_stride = (multi_label == 1) ? 6 : 5;

  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  const int row_size =
      min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
  const int col_size =
      min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

  // Stage the 5 parameters of every column-tile box in shared memory,
  // one box per thread.
  __shared__ T block_boxes[threadsPerBlock * 5];
  if (threadIdx.x < col_size) {
    const T* src =
        dev_boxes + (threadsPerBlock * col_start + threadIdx.x) * box_stride;
    T* dst = block_boxes + threadIdx.x * 5;
    for (int k = 0; k < 5; ++k) {
      dst[k] = src[k];
    }
  }
  __syncthreads();

  // No barrier follows, so threads without a row box can leave here.
  if (!(threadIdx.x < row_size)) return;

  const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
  const T* cur_box = dev_boxes + cur_box_idx * box_stride;

  // Within the diagonal tile only compare against later boxes.
  int first = 0;
  if (row_start == col_start) {
    first = threadIdx.x + 1;
  }

  unsigned long long overlap_bits = 0;
  for (int i = first; i < col_size; i++) {
    if (single_box_iou_rotated<T>(cur_box, block_boxes + i * 5, 0) >
        iou_threshold) {
      overlap_bits |= 1ULL << i;
    }
  }

  const int col_blocks = divideUP(n_boxes, threadsPerBlock);
  dev_mask[cur_box_idx * col_blocks + col_start] = overlap_bits;
}

#endif


================================================
FILE: mmcv/ops/csrc/common/musa/points_in_boxes_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef POINT_IN_BOXES_MUSA_KERNEL_MUH
#define POINT_IN_BOXES_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

template <typename T>
__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz,
                                             T &local_x, T &local_y) {
  T cosa = cos(-rz), sina = sin(-rz);
  local_x = shift_x * cosa + shift_y * (-sina);
  local_y = shift_x * sina + shift_y * cosa;
}

// Tests whether point `pt` lies strictly inside the 3D box `box3d`.
//
//   pt     (x, y, z)
//   box3d  (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinates,
//          where cz is the bottom centre of the box.
//
// Returns 1 when the point is inside and 0 otherwise. local_x / local_y
// receive the point's offset from the box centre rotated into the box frame;
// they are left untouched when the point is rejected on height alone.
template <typename T>
__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x,
                                        T &local_y) {
  const T x = pt[0];
  const T y = pt[1];
  const T z = pt[2];

  const T cx = box3d[0];
  const T cy = box3d[1];
  T cz = box3d[2];
  const T x_size = box3d[3];
  const T y_size = box3d[4];
  const T z_size = box3d[5];
  const T rz = box3d[6];

  // cz is given at the bottom face; move it to the vertical centre.
  cz += z_size / 2.0;

  // Cheap rejection on height before doing the rotation.
  if (fabsf(z - cz) > z_size / 2.0) return 0;

  lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);

  const bool inside_x = (local_x > -x_size / 2.0) && (local_x < x_size / 2.0);
  const bool inside_y = (local_y > -y_size / 2.0) && (local_y < y_size / 2.0);
  return (inside_x && inside_y) ? 1 : 0;
}

// For every point, records the index of the first box that contains it.
//   boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
//          coordinate, z is the bottom center; boxes are expected not to
//          overlap each other
//   pts:   (B, npoints, 3) [x, y, z] in LiDAR coordinate
//   box_idx_of_points: (B, npoints); only entries of points that fall inside a
//          box are written, so the caller is expected to pre-fill the buffer
//          (the original note states a default of -1)
// The batch index is taken from blockIdx.y; points are distributed over the
// x dimension through MUSA_1D_KERNEL_LOOP.
template <typename T>
__global__ void points_in_boxes_part_forward_musa_kernel(
    int batch_size, int boxes_num, int pts_num, const T *boxes, const T *pts,
    int *box_idx_of_points) {
  int bs_idx = blockIdx.y;
  if (bs_idx >= batch_size) return;

  // The box base of this batch does not depend on the point index.
  const T *batch_boxes = boxes + bs_idx * boxes_num * 7;

  MUSA_1D_KERNEL_LOOP(pt_idx, pts_num) {
    // Derive per-iteration pointers from the untouched kernel arguments.
    // Advancing the arguments themselves with `+=` would accumulate offsets
    // whenever a thread executes more than one loop iteration.
    const T *cur_pt = pts + bs_idx * pts_num * 3 + pt_idx * 3;
    int *cur_box_idx = box_idx_of_points + bs_idx * pts_num + pt_idx;

    T local_x = 0, local_y = 0;
    for (int k = 0; k < boxes_num; k++) {
      const int cur_in_flag =
          check_pt_in_box3d(cur_pt, batch_boxes + k * 7, local_x, local_y);
      if (cur_in_flag) {
        // Keep the first matching box only.
        cur_box_idx[0] = k;
        break;
      }
    }
  }
}

// For every (point, box) pair, marks whether the point lies inside the box.
//   boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
//          coordinate, z is the bottom center
//   pts:   (B, npoints, 3) [x, y, z] in LiDAR coordinate
//   box_idx_of_points: indexed as (B, npoints, N); entry k of a point is set
//          to 1 when box k contains it. Other entries are left untouched, so
//          the caller is expected to pre-fill the buffer.
// The batch index is taken from blockIdx.y; points are distributed over the
// x dimension through MUSA_1D_KERNEL_LOOP.
template <typename T>
__global__ void points_in_boxes_all_forward_musa_kernel(
    int batch_size, int boxes_num, int pts_num, const T *boxes, const T *pts,
    int *box_idx_of_points) {
  int bs_idx = blockIdx.y;
  if (bs_idx >= batch_size) return;

  // The box base of this batch does not depend on the point index.
  const T *batch_boxes = boxes + bs_idx * boxes_num * 7;

  MUSA_1D_KERNEL_LOOP(pt_idx, pts_num) {
    // Derive per-iteration pointers from the untouched kernel arguments.
    // Advancing the arguments themselves with `+=` would accumulate offsets
    // whenever a thread executes more than one loop iteration.
    const T *cur_pt = pts + bs_idx * pts_num * 3 + pt_idx * 3;
    int *cur_flags =
        box_idx_of_points + bs_idx * pts_num * boxes_num + pt_idx * boxes_num;

    T local_x = 0, local_y = 0;
    for (int k = 0; k < boxes_num; k++) {
      const int cur_in_flag =
          check_pt_in_box3d(cur_pt, batch_boxes + k * 7, local_x, local_y);
      if (cur_in_flag) {
        cur_flags[k] = 1;
      }
    }
  }
}

#endif  // POINT_IN_BOXES_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/points_in_polygons_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef POINTS_IN_POLYGONS_MUSA_KERNEL_MUH
#define POINTS_IN_POLYGONS_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

// Plain 2D point with single-precision coordinates.
struct point {
  float x;
  float y;
};

// Ray-crossing test of points against quadrilaterals.
//   nthreads:    number of (point, polygon) pairs to evaluate
//   vertex1:     point coordinates, 2 values (x, y) per row
//   vertex2:     polygon coordinates, 8 values (4 x/y vertices) per column
//   rows:        not used by the kernel body
//   cols:        number of polygons; index = row * cols + col
//   inside_flag: receives 1.0 when the crossing count is odd, otherwise 0.0
// All geometry is evaluated in float regardless of scalar_t.
template <typename scalar_t>
__global__ void points_in_polygons_forward_musa_kernel(
    const int nthreads, const scalar_t *vertex1, const scalar_t *vertex2,
    const int rows, const int cols, scalar_t *inside_flag) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    int row = index / cols;
    int col = index % cols;

    const scalar_t *offset_vertex1 = vertex1 + row * 2;
    const scalar_t *offset_vertex2 = vertex2 + col * 8;

    point point_[1];
    point polygon[4];

    point_[0].x = offset_vertex1[0];
    point_[0].y = offset_vertex1[1];

    polygon[0].x = offset_vertex2[0];
    polygon[0].y = offset_vertex2[1];
    polygon[1].x = offset_vertex2[2];
    polygon[1].y = offset_vertex2[3];
    polygon[2].x = offset_vertex2[4];
    polygon[2].y = offset_vertex2[5];
    polygon[3].x = offset_vertex2[6];
    polygon[3].y = offset_vertex2[7];

    int nCross = 0;
    int i, j;
    float sx, sy, tx, ty, px, py, x;
    // Walk the four edges (j -> i), starting with the closing edge 3 -> 0.
    for (i = 0, j = 3; i < 4; j = i, i++) {
      sx = polygon[i].x;
      sy = polygon[i].y;
      tx = polygon[j].x;
      ty = polygon[j].y;

      px = point_[0].x;
      py = point_[0].y;

      // Edges entirely above or below the point cannot be crossed.
      if (py < min(sy, ty)) continue;
      if (py > max(sy, ty)) continue;

      if ((sx == px && sy == py) || (tx == px && ty == py)) {
        // The point coincides with a vertex: stop scanning; the flag is
        // decided by the crossings counted so far.
        break;
      } else {
        // The condition guarantees sy != ty, so the division below is safe.
        if ((sy < py && ty >= py) || (sy >= py && ty < py)) {
          // x coordinate where the edge meets the horizontal line y == py.
          x = sx + (py - sy) * (tx - sx) / (ty - sy);
          if (x == px) {
            // The point lies exactly on this edge: stop scanning.
            break;
          }
          if (x > px) {
            nCross++;
          }
        }
      }
    }
    if (nCross % 2 == 1) {
      inside_flag[index] = 1.0;
    } else {
      inside_flag[index] = 0.0;
    }
    // No `return` here: leaving the kernel inside the loop body would skip
    // every further element this thread is responsible for.
  }
}

#endif  // POINTS_IN_POLYGONS_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/prroi_pool_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/vacancy/PreciseRoIPooling/blob/master/src/prroi_pooling_gpu_impl.cu
// Distributed under terms of the MIT license.
#ifndef PRROI_POOL_MUSA_KERNEL_MUH
#define PRROI_POOL_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

// Reads data[h * width + w], returning 0 for coordinates outside
// [0, height) x [0, width).
template <typename T>
__device__ static __forceinline__ T PrRoIPoolingGetData(const T *data,
                                                        const int h,
                                                        const int w,
                                                        const int height,
                                                        const int width) {
  const bool inside = (h >= 0) && (w >= 0) && (h < height) && (w < width);
  if (!inside) return 0.0f;
  return data[h * width + w];
}

// Product of the two 1-D weights (1 - |dh|) and (1 - |dw|) for the given
// vertical and horizontal offsets.
template <typename T>
__device__ static __forceinline__ T PrRoIPoolingGetCoeff(T dh, T dw) {
  const auto weight_h = 1.0f - abs(dh);
  const auto weight_w = 1.0f - abs(dw);
  return weight_h * weight_w;
}

// Closed-form integral over u in [s, t] of the linear function
// c1 + u * (c2 - c1), i.e. 0.5 * (t^2 - s^2) * (c2 - c1) + (t - s) * c1.
// Note: the 0.5 literal is a double, so the first term is evaluated in double
// precision before the result is converted back to T.
template <typename T>
__device__ static __forceinline__ T PrRoIPoolingSingleCoorIntegral(T s, T t,
                                                                   T c1, T c2) {
  return 0.5 * (t * t - s * s) * (c2 - c1) + (t - s) * c1;
}

// Interpolates `data` at the fractional location (h, w) from the four integer
// neighbours around it. Each neighbour is weighted by PrRoIPoolingGetCoeff of
// its offset from (h, w); neighbours outside the map contribute 0 through
// PrRoIPoolingGetData.
template <typename T>
__device__ static T PrRoIPoolingInterpolation(const T *data, const T h,
                                              const T w, const int height,
                                              const int width) {
  T acc = 0.0f;
  // Visiting order: (h0, w0), (h0 + 1, w0), (h0, w0 + 1), (h0 + 1, w0 + 1).
  for (int dw = 0; dw < 2; ++dw) {
    for (int dh = 0; dh < 2; ++dh) {
      const int nh = floorf(h) + dh;
      const int nw = floorf(w) + dw;
      acc += PrRoIPoolingGetData(data, nh, nw, height, width) *
             PrRoIPoolingGetCoeff(h - T(nh), w - T(nw));
    }
  }
  return acc;
}

// Accumulates the weighted contribution of the four grid corners
// (s_h, s_w), (s_h, e_w), (e_h, s_w), (e_h, e_w) for the sub-rectangle
// [y0, y1] x [x0, x1].
// Each corner weight `tmp` is a product of two 1-D factors of the form
//   (lim - 0.5 * lim^2) - (a - 0.5 * a^2),
// i.e. the antiderivative u - u^2 / 2 of (1 - u) evaluated between a and lim,
// where a / lim are the distances of the rectangle edges from that corner.
// h0 / w0 are forwarded to PrRoIPoolingGetData as the map height / width used
// for bounds checking.
template <typename T>
__device__ static T PrRoIPoolingMatCalculation(const T *this_data,
                                               const int s_h, const int s_w,
                                               const int e_h, const int e_w,
                                               const T y0, const T x0,
                                               const T y1, const T x1,
                                               const int h0, const int w0) {
  T alpha, beta, lim_alpha, lim_beta, tmp;
  T sum_out = 0;

  // Corner (s_h, s_w): distances measured from the start row / start column.
  alpha = x0 - T(s_w);
  beta = y0 - T(s_h);
  lim_alpha = x1 - T(s_w);
  lim_beta = y1 - T(s_h);
  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
         0.5f * alpha * alpha) *
        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
  sum_out += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp;

  // Corner (s_h, e_w): horizontal distances now measured from the end column;
  // beta / lim_beta are reused from the previous corner (same row).
  alpha = T(e_w) - x1;
  lim_alpha = T(e_w) - x0;
  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
         0.5f * alpha * alpha) *
        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
  sum_out += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp;

  // Corner (e_h, s_w): vertical distances measured from the end row.
  alpha = x0 - T(s_w);
  beta = T(e_h) - y1;
  lim_alpha = x1 - T(s_w);
  lim_beta = T(e_h) - y0;
  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
         0.5f * alpha * alpha) *
        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
  sum_out += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp;

  // Corner (e_h, e_w): beta / lim_beta are reused from the previous corner.
  alpha = T(e_w) - x1;
  lim_alpha = T(e_w) - x0;
  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
         0.5f * alpha * alpha) *
        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
  sum_out += PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp;

  return sum_out;
}

// Atomically adds top_diff * coeff to diff[h * width + w]; coordinates outside
// [0, height) x [0, width) are ignored.
template <typename T>
__device__ static void PrRoIPoolingDistributeDiff(T *diff, const T top_diff,
                                                  const int h, const int w,
                                                  const int height,
                                                  const int width,
                                                  const T coeff) {
  if ((h < 0) || (w < 0) || (h >= height) || (w >= width)) return;
  atomicAdd(diff + h * width + w, top_diff * coeff);
}

// Scatters `top_diff` to the four grid corners (s_h, s_w), (s_h, e_w),
// (e_h, s_w), (e_h, e_w) for the sub-rectangle [y0, y1] x [x0, x1].
// The per-corner weight `tmp` uses the same expression as
// PrRoIPoolingMatCalculation: a product of two 1-D factors
//   (lim - 0.5 * lim^2) - (a - 0.5 * a^2).
// h0 / w0 are forwarded to PrRoIPoolingDistributeDiff as the map height /
// width used for bounds checking; the accumulation itself is atomic.
template <typename T>
__device__ static void PrRoIPoolingMatDistributeDiff(
    T *diff, const T top_diff, const int s_h, const int s_w, const int e_h,
    const int e_w, const T y0, const T x0, const T y1, const T x1, const int h0,
    const int w0) {
  T alpha, beta, lim_alpha, lim_beta, tmp;

  // Corner (s_h, s_w): distances measured from the start row / start column.
  alpha = x0 - T(s_w);
  beta = y0 - T(s_h);
  lim_alpha = x1 - T(s_w);
  lim_beta = y1 - T(s_h);
  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
         0.5f * alpha * alpha) *
        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
  PrRoIPoolingDistributeDiff(diff, top_diff, s_h, s_w, h0, w0, tmp);

  // Corner (s_h, e_w): beta / lim_beta are reused from the previous corner.
  alpha = T(e_w) - x1;
  lim_alpha = T(e_w) - x0;
  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
         0.5f * alpha * alpha) *
        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
  PrRoIPoolingDistributeDiff(diff, top_diff, s_h, e_w, h0, w0, tmp);

  // Corner (e_h, s_w): vertical distances measured from the end row.
  alpha = x0 - T(s_w);
  beta = T(e_h) - y1;
  lim_alpha = x1 - T(s_w);
  lim_beta = T(e_h) - y0;
  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
         0.5f * alpha * alpha) *
        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
  PrRoIPoolingDistributeDiff(diff, top_diff, e_h, s_w, h0, w0, tmp);

  // Corner (e_h, e_w): beta / lim_beta are reused from the previous corner.
  alpha = T(e_w) - x1;
  lim_alpha = T(e_w) - x0;
  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
         0.5f * alpha * alpha) *
        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
  PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp);
}

// Forward pass of precise RoI pooling.
//   nthreads: number of output elements; each one is indexed as (n, c, ph, pw)
//   input:    feature maps, addressed as (batch, channels, height, width)
//   rois:     5 values per RoI: (batch index, x1, y1, x2, y2)
//   output:   one pooled value per index
// Every output element is the sum of PrRoIPoolingMatCalculation over the unit
// cells overlapped by its bin, divided by the bin area; empty bins yield 0.
template <typename T>
__global__ void prroi_pool_forward_musa_kernel(
    const int nthreads, const T *input, const T *rois, T *output,
    const int pooled_height, const int pooled_width, const T spatial_scale,
    const int channels, const int height, const int width) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // Unravel the flat output index into (n, c, ph, pw).
    const int pw = index % pooled_width;
    const int ph = (index / pooled_width) % pooled_height;
    const int c = (index / pooled_width / pooled_height) % channels;
    const int n = index / pooled_width / pooled_height / channels;

    const T *roi = rois + n * 5;
    const int batch_id = roi[0];

    // RoI corners scaled into feature-map coordinates.
    const T x_lo = roi[1] * spatial_scale;
    const T y_lo = roi[2] * spatial_scale;
    const T x_hi = roi[3] * spatial_scale;
    const T y_hi = roi[4] * spatial_scale;

    const T roi_w = max(x_hi - x_lo, ((T)0.0));
    const T roi_h = max(y_hi - y_lo, ((T)0.0));
    const T cell_h = roi_h / static_cast<T>(pooled_height);
    const T cell_w = roi_w / static_cast<T>(pooled_width);

    const T *plane = input + (batch_id * channels + c) * height * width;

    // Extent of the bin handled by this output element.
    const T bin_x1 = x_lo + cell_w * pw;
    const T bin_y1 = y_lo + cell_h * ph;
    const T bin_x2 = bin_x1 + cell_w;
    const T bin_y2 = bin_y1 + cell_h;

    const T area = max(T(0.0), cell_w * cell_h);
    if (area == 0) {
      output[index] = 0;
      continue;
    }

    // Integer cell range overlapped by the bin.
    const int first_x = floorf(bin_x1);
    const int last_x = ceilf(bin_x2);
    const int first_y = floorf(bin_y1);
    const int last_y = ceilf(bin_y2);

    T acc = 0;
    for (int cx = first_x; cx < last_x; ++cx) {
      for (int cy = first_y; cy < last_y; ++cy) {
        // Clip the bin to the unit cell [cy, cy + 1] x [cx, cx + 1].
        acc += PrRoIPoolingMatCalculation(
            plane, cy, cx, cy + 1, cx + 1, max(bin_y1, T(cy)),
            max(bin_x1, T(cx)), min(bin_y2, T(cy) + 1.0f),
            min(bin_x2, T(cx + 1.0f)), height, width);
      }
    }
    output[index] = acc / area;
  }
}

// Backward pass of precise RoI pooling with respect to the input features.
//   nthreads:    number of pooled elements; each is indexed as (n, c, ph, pw)
//   grad_output: gradient of every pooled element
//   rois:        5 values per RoI: (batch index, x1, y1, x2, y2)
//   grad_input:  accumulated atomically, addressed as
//                (batch, channels, height, width)
// The incoming gradient is divided by the bin area (0 for empty bins) and
// scattered over the unit cells overlapped by the bin.
template <typename T>
__global__ void prroi_pool_backward_musa_kernel(
    const int nthreads, const T *grad_output, const T *rois, T *grad_input,
    const int pooled_height, const int pooled_width, const T spatial_scale,
    const int channels, const int height, const int width) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // Unravel the flat output index into (n, c, ph, pw).
    const int pw = index % pooled_width;
    const int ph = (index / pooled_width) % pooled_height;
    const int c = (index / pooled_width / pooled_height) % channels;
    const int n = index / pooled_width / pooled_height / channels;

    const T *roi = rois + n * 5;
    const int batch_id = roi[0];

    // RoI corners scaled into feature-map coordinates.
    const T x_lo = roi[1] * spatial_scale;
    const T y_lo = roi[2] * spatial_scale;
    const T x_hi = roi[3] * spatial_scale;
    const T y_hi = roi[4] * spatial_scale;

    const T roi_w = max(x_hi - x_lo, (T)0);
    const T roi_h = max(y_hi - y_lo, (T)0);
    const T cell_h = roi_h / static_cast<T>(pooled_height);
    const T cell_w = roi_w / static_cast<T>(pooled_width);

    T *grad_plane = grad_input + (batch_id * channels + c) * height * width;

    // Extent of the bin handled by this pooled element.
    const T bin_x1 = x_lo + cell_w * pw;
    const T bin_y1 = y_lo + cell_h * ph;
    const T bin_x2 = bin_x1 + cell_w;
    const T bin_y2 = bin_y1 + cell_h;

    const T area = max(T(0.0), cell_w * cell_h);

    // Gradient per unit area; empty bins propagate nothing.
    const T grad_per_area = area == T(0) ? T(0) : grad_output[index] / area;

    // Integer cell range overlapped by the bin.
    const int first_x = floorf(bin_x1);
    const int last_x = ceilf(bin_x2);
    const int first_y = floorf(bin_y1);
    const int last_y = ceilf(bin_y2);

    for (int cx = first_x; cx < last_x; ++cx) {
      for (int cy = first_y; cy < last_y; ++cy) {
        // Clip the bin to the unit cell [cy, cy + 1] x [cx, cx + 1].
        PrRoIPoolingMatDistributeDiff(
            grad_plane, grad_per_area, cy, cx, cy + 1, cx + 1,
            max(bin_y1, T(cy)), max(bin_x1, T(cx)), min(bin_y2, T(cy) + 1.0f),
            min(bin_x2, T(cx + 1.0f)), height, width);
      }
    }
  }
}

// Backward pass of precise RoI pooling with respect to the RoI coordinates.
//   nthreads:    number of pooled elements; each is indexed as (n, c, ph, pw)
//   output:      pooled values produced by the forward pass
//   grad_output: gradient of every pooled element
//   input:       feature maps, addressed as (batch, channels, height, width)
//   rois:        5 values per RoI: (batch index, x1, y1, x2, y2)
//   grad_rois:   5 values per RoI; entry 0 is written as 0 and entries 1..4
//                (x1, y1, x2, y2) are accumulated atomically
// Integer grid coordinates are converted with T(...) before they are handed
// to PrRoIPoolingInterpolation, so that all of its T-typed arguments agree and
// template deduction also succeeds when T is not float.
template <typename T>
__global__ void prroi_pool_coor_backward_musa_kernel(
    const int nthreads, const T *output, const T *grad_output, const T *input,
    const T *rois, T *grad_rois, const int pooled_height,
    const int pooled_width, const T spatial_scale, const int channels,
    const int height, const int width) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;
    auto rois_cur = rois + n * 5;

    int roi_batch_ind = rois_cur[0];
    T roi_x1 = rois_cur[1] * spatial_scale;
    T roi_y1 = rois_cur[2] * spatial_scale;
    T roi_x2 = rois_cur[3] * spatial_scale;
    T roi_y2 = rois_cur[4] * spatial_scale;

    T roi_width = max(roi_x2 - roi_x1, (T)0);
    T roi_height = max(roi_y2 - roi_y1, (T)0);
    T bin_size_h = roi_height / static_cast<T>(pooled_height);
    T bin_size_w = roi_width / static_cast<T>(pooled_width);

    const T output_grad_val = grad_output[index];
    const T *this_input_data =
        input + (roi_batch_ind * channels + c) * height * width;
    const T output_val = output[index];
    T *this_rois_grad = grad_rois + n * 5;

    T bin_x1 = roi_x1 + bin_size_w * pw;
    T bin_y1 = roi_y1 + bin_size_h * ph;
    T bin_x2 = bin_x1 + bin_size_w;
    T bin_y2 = bin_y1 + bin_size_h;

    T bin_size = max(T(0.0), bin_size_w * bin_size_h);

    T sum_out = bin_size == T(0) ? T(0) : output_grad_val / bin_size;

    // WARNING: to be discussed
    // Skipping here also guarantees bin_size != 0 for the divisions below.
    if (sum_out == 0) continue;

    int start_x, start_y, end_x, end_y;

    start_x = floorf(bin_x1);
    end_x = ceilf(bin_x2);
    start_y = floorf(bin_y1);
    end_y = ceilf(bin_y2);

    // Line integrals of the interpolated feature along the four bin edges.
    T grad_x1_y = 0, grad_x2_y = 0, grad_x_y1 = 0, grad_x_y2 = 0;
    for (int bin_y = start_y; bin_y < end_y; ++bin_y) {
      // Left edge (x == bin_x1), integrated over the part inside this row.
      grad_x1_y += PrRoIPoolingSingleCoorIntegral(
          max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y,
          PrRoIPoolingInterpolation(this_input_data, T(bin_y), bin_x1, height,
                                    width),
          PrRoIPoolingInterpolation(this_input_data, T(bin_y + 1), bin_x1,
                                    height, width));

      // Right edge (x == bin_x2).
      grad_x2_y += PrRoIPoolingSingleCoorIntegral(
          max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y,
          PrRoIPoolingInterpolation(this_input_data, T(bin_y), bin_x2, height,
                                    width),
          PrRoIPoolingInterpolation(this_input_data, T(bin_y + 1), bin_x2,
                                    height, width));
    }

    for (int bin_x = start_x; bin_x < end_x; ++bin_x) {
      // Top edge (y == bin_y1), integrated over the part inside this column.
      grad_x_y1 += PrRoIPoolingSingleCoorIntegral(
          max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x,
          PrRoIPoolingInterpolation(this_input_data, bin_y1, T(bin_x), height,
                                    width),
          PrRoIPoolingInterpolation(this_input_data, bin_y1, T(bin_x + 1),
                                    height, width));

      // Bottom edge (y == bin_y2).
      grad_x_y2 += PrRoIPoolingSingleCoorIntegral(
          max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x,
          PrRoIPoolingInterpolation(this_input_data, bin_y2, T(bin_x), height,
                                    width),
          PrRoIPoolingInterpolation(this_input_data, bin_y2, T(bin_x + 1),
                                    height, width));
    }

    T partial_x1 = -grad_x1_y + (bin_y2 - bin_y1) * output_val;
    T partial_y1 = -grad_x_y1 + (bin_x2 - bin_x1) * output_val;
    T partial_x2 = grad_x2_y - (bin_y2 - bin_y1) * output_val;
    T partial_y2 = grad_x_y2 - (bin_x2 - bin_x1) * output_val;

    partial_x1 = partial_x1 / bin_size * spatial_scale;
    partial_x2 = partial_x2 / bin_size * spatial_scale;
    partial_y1 = partial_y1 / bin_size * spatial_scale;
    partial_y2 = partial_y2 / bin_size * spatial_scale;

    // (index, x1, y1, x2, y2)
    // The batch-index slot receives a plain (non-atomic) store of 0.
    this_rois_grad[0] = 0;
    atomicAdd(this_rois_grad + 1,
              (partial_x1 * (1.0f - T(pw) / pooled_width) +
               partial_x2 * (1.0f - T(pw + 1) / pooled_width)) *
                  output_grad_val);
    atomicAdd(this_rois_grad + 2,
              (partial_y1 * (1.0f - T(ph) / pooled_height) +
               partial_y2 * (1.0f - T(ph + 1) / pooled_height)) *
                  output_grad_val);
    atomicAdd(this_rois_grad + 3, (partial_x2 * T(pw + 1) / pooled_width +
                                   partial_x1 * T(pw) / pooled_width) *
                                      output_grad_val);
    atomicAdd(this_rois_grad + 4, (partial_y2 * T(ph + 1) / pooled_height +
                                   partial_y1 * T(ph) / pooled_height) *
                                      output_grad_val);
  }
}

#endif  // PRROI_POOL_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/psamask_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef PSAMASK_MUSA_KERNEL_MUH
#define PSAMASK_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

// MUSA: grid stride looping
// Iterates `i` over [0, n): every thread starts at its global 1-D index
// (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total number of
// threads along x (blockDim.x * gridDim.x), so a thread may handle several
// elements. The macro is only defined here if nothing defined it earlier.
#ifndef MUSA_KERNEL_LOOP
#define MUSA_KERNEL_LOOP(i, n)                                 \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)
#endif

// PSA mask forward, "collect" layout.
// Each index is one feature position (n, h, w). For every mask cell that maps
// onto a valid feature position (fh, fw), the mask value stored at
// (n, mask cell, h, w) is copied to buffer entry (n, (fh, fw), (h, w)).
template <typename T>
__global__ void psamask_collect_forward_musa(
    const int nthreads, const int h_feature, const int w_feature,
    const int h_mask, const int w_mask, const int half_h_mask,
    const int half_w_mask, const T* mask_data, T* buffer_data) {
  MUSA_KERNEL_LOOP(index, nthreads) {
    const int w = index % w_feature;
    const int h = (index / w_feature) % h_feature;
    const int n = index / w_feature / h_feature;
    // Mask rows / columns whose feature-space counterpart stays in bounds:
    // [row_begin, row_end) x [col_begin, col_end), in mask coordinates.
    const int row_begin = max(0, half_h_mask - h);
    const int row_end = min(h_mask, h_feature + half_h_mask - h);
    const int col_begin = max(0, half_w_mask - w);
    const int col_end = min(w_mask, w_feature + half_w_mask - w);
    for (int mh = row_begin; mh < row_end; mh++) {
      // Feature-space row addressed by mask row mh.
      const int fh = mh + h - half_h_mask;
      for (int mw = col_begin; mw < col_end; mw++) {
        // Feature-space column addressed by mask column mw.
        const int fw = mw + w - half_w_mask;
        const int src =
            ((n * h_mask * w_mask + mh * w_mask + mw) * h_feature + h) *
                w_feature +
            w;
        const int dst = (n * h_feature * w_feature + fh * w_feature + fw) *
                            h_feature * w_feature +
                        h * w_feature + w;
        buffer_data[dst] = mask_data[src];
      }
    }
  }
}

// PSA mask forward, "distribute" layout.
// Each index is one feature position (n, h, w). For every mask cell that maps
// onto a valid feature position (fh, fw), the mask value stored at
// (n, mask cell, h, w) is copied to buffer entry (n, (h, w), (fh, fw)).
template <typename T>
__global__ void psamask_distribute_forward_musa(
    const int nthreads, const int h_feature, const int w_feature,
    const int h_mask, const int w_mask, const int half_h_mask,
    const int half_w_mask, const T* mask_data, T* buffer_data) {
  MUSA_KERNEL_LOOP(index, nthreads) {
    const int w = index % w_feature;
    const int h = (index / w_feature) % h_feature;
    const int n = index / w_feature / h_feature;
    // Mask rows / columns whose feature-space counterpart stays in bounds:
    // [row_begin, row_end) x [col_begin, col_end), in mask coordinates.
    const int row_begin = max(0, half_h_mask - h);
    const int row_end = min(h_mask, h_feature + half_h_mask - h);
    const int col_begin = max(0, half_w_mask - w);
    const int col_end = min(w_mask, w_feature + half_w_mask - w);
    for (int mh = row_begin; mh < row_end; mh++) {
      // Feature-space row addressed by mask row mh.
      const int fh = mh + h - half_h_mask;
      for (int mw = col_begin; mw < col_end; mw++) {
        // Feature-space column addressed by mask column mw.
        const int fw = mw + w - half_w_mask;
        const int src =
            ((n * h_mask * w_mask + mh * w_mask + mw) * h_feature + h) *
                w_feature +
            w;
        const int dst = (n * h_feature * w_feature + h * w_feature + w) *
                            h_feature * w_feature +
                        fh * w_feature + fw;
        buffer_data[dst] = mask_data[src];
      }
    }
  }
}

// PSA mask backward, "collect" layout.
// Mirror of the collect forward copy: for every valid mask cell of feature
// position (n, h, w), the buffer gradient at (n, (fh, fw), (h, w)) is copied
// to the mask gradient at (n, mask cell, h, w).
template <typename T>
__global__ void psamask_collect_backward_musa(
    const int nthreads, const int h_feature, const int w_feature,
    const int h_mask, const int w_mask, const int half_h_mask,
    const int half_w_mask, const T* buffer_diff, T* mask_diff) {
  MUSA_KERNEL_LOOP(index, nthreads) {
    const int w = index % w_feature;
    const int h = (index / w_feature) % h_feature;
    const int n = index / w_feature / h_feature;
    // Mask rows / columns whose feature-space counterpart stays in bounds:
    // [row_begin, row_end) x [col_begin, col_end), in mask coordinates.
    const int row_begin = max(0, half_h_mask - h);
    const int row_end = min(h_mask, h_feature + half_h_mask - h);
    const int col_begin = max(0, half_w_mask - w);
    const int col_end = min(w_mask, w_feature + half_w_mask - w);
    for (int mh = row_begin; mh < row_end; mh++) {
      // Feature-space row addressed by mask row mh.
      const int fh = mh + h - half_h_mask;
      for (int mw = col_begin; mw < col_end; mw++) {
        // Feature-space column addressed by mask column mw.
        const int fw = mw + w - half_w_mask;
        const int mask_pos =
            ((n * h_mask * w_mask + mh * w_mask + mw) * h_feature + h) *
                w_feature +
            w;
        const int buffer_pos =
            (n * h_feature * w_feature + fh * w_feature + fw) * h_feature *
                w_feature +
            h * w_feature + w;
        mask_diff[mask_pos] = buffer_diff[buffer_pos];
      }
    }
  }
}

// PSA mask backward, "distribute" layout.
// Mirror of the distribute forward copy: for every valid mask cell of feature
// position (n, h, w), the buffer gradient at (n, (h, w), (fh, fw)) is copied
// to the mask gradient at (n, mask cell, h, w).
template <typename T>
__global__ void psamask_distribute_backward_musa(
    const int nthreads, const int h_feature, const int w_feature,
    const int h_mask, const int w_mask, const int half_h_mask,
    const int half_w_mask, const T* buffer_diff, T* mask_diff) {
  MUSA_KERNEL_LOOP(index, nthreads) {
    const int w = index % w_feature;
    const int h = (index / w_feature) % h_feature;
    const int n = index / w_feature / h_feature;
    // Mask rows / columns whose feature-space counterpart stays in bounds:
    // [row_begin, row_end) x [col_begin, col_end), in mask coordinates.
    const int row_begin = max(0, half_h_mask - h);
    const int row_end = min(h_mask, h_feature + half_h_mask - h);
    const int col_begin = max(0, half_w_mask - w);
    const int col_end = min(w_mask, w_feature + half_w_mask - w);
    for (int mh = row_begin; mh < row_end; mh++) {
      // Feature-space row addressed by mask row mh.
      const int fh = mh + h - half_h_mask;
      for (int mw = col_begin; mw < col_end; mw++) {
        // Feature-space column addressed by mask column mw.
        const int fw = mw + w - half_w_mask;
        const int mask_pos =
            ((n * h_mask * w_mask + mh * w_mask + mw) * h_feature + h) *
                w_feature +
            w;
        const int buffer_pos =
            (n * h_feature * w_feature + h * w_feature + w) * h_feature *
                w_feature +
            fh * w_feature + fw;
        mask_diff[mask_pos] = buffer_diff[buffer_pos];
      }
    }
  }
}

#endif  // PSAMASK_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/riroi_align_rotated_musa_kernel.muh
================================================
// Modified from
// https://github.com/csuhan/ReDet/blob/master/mmdet/ops/riroi_align/src/riroi_align_kernel.cu
#ifndef RIROI_ALIGN_ROTATED_MUSA_KERNEL_MUH
#define RIROI_ALIGN_ROTATED_MUSA_KERNEL_MUH

#include <float.h>
#include "pytorch_musa_helper.hpp"

/*** Forward ***/
// Forward pass of rotation-invariant rotated RoI align.
//   nthreads:    number of output elements; each is indexed as
//                (n, c, o, ph, pw) with o the orientation channel
//   bottom_data: input features, addressed below as
//                (batch, channels * num_orientations, height, width)
//   bottom_rois: 6 values per RoI:
//                (batch index, center_w, center_h, width, height, theta)
//   num_samples: samples per bin along each axis; when <= 0 the count is
//                derived from the RoI size
//   clockwise:   when true, theta is negated before the sampling rotation
//   top_data:    one pooled value per index
// Each output blends the two neighbouring orientation channels selected by
// theta with weights r_var / l_var, averaged over the sampling grid.
template <typename scalar_t>
__global__ void riroi_align_rotated_forward_musa_kernel(
    const int nthreads, const scalar_t *bottom_data,
    const scalar_t *bottom_rois, const scalar_t spatial_scale,
    const int num_samples, const bool clockwise, const int channels,
    const int height, const int width, const int pooled_height,
    const int pooled_width, const int num_orientations, scalar_t *top_data) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, o, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int o = (index / pooled_width / pooled_height) % num_orientations;
    int c =
        (index / pooled_width / pooled_height / num_orientations) % channels;
    int n = index / pooled_width / pooled_height / num_orientations / channels;

    const scalar_t *offset_bottom_rois = bottom_rois + n * 6;
    int roi_batch_ind = offset_bottom_rois[0];

    // Do not round here; this implementation detail is critical
    scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale;
    scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale;
    scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;
    scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;
    // theta is used exactly as stored (no M_PI / 180 conversion is applied)
    // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;
    scalar_t theta = offset_bottom_rois[5];
    // Force malformed ROIs to be 1x1
    roi_width = max(roi_width, (scalar_t)1.);
    roi_height = max(roi_height, (scalar_t)1.);
    scalar_t bin_size_h = static_cast<scalar_t>(roi_height) /
                          static_cast<scalar_t>(pooled_height);
    scalar_t bin_size_w =
        static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);

    // find aligned index: theta expressed in units of one orientation step
    // (2 * pi / num_orientations); the fractional part l_var and its
    // complement r_var are the blend weights of the two adjacent channels
    scalar_t ind_float = theta * num_orientations / (2 * M_PI);
    int ind = floorf(ind_float);
    scalar_t l_var = ind_float - (scalar_t)ind;
    scalar_t r_var = 1.0 - l_var;
    // correct start channel
    ind = (ind + num_orientations) % num_orientations;
    // rotated channel: orientation o shifted back by ind, and its successor
    int ind_rot = (o - ind + num_orientations) % num_orientations;
    int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations;
    const scalar_t *offset_bottom_data =
        bottom_data + (roi_batch_ind * channels * num_orientations +
                       c * num_orientations + ind_rot) *
                          height * width;

    const scalar_t *offset_bottom_data_plus =
        bottom_data + (roi_batch_ind * channels * num_orientations +
                       c * num_orientations + ind_rot_plus) *
                          height * width;
    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (num_samples > 0)
                             ? num_samples
                             : ceilf(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width);

    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    if (clockwise) {
      theta = -theta;  // If clockwise, the angle needs to be reversed.
    }
    scalar_t roi_start_h = -roi_height / 2.0;
    scalar_t roi_start_w = -roi_width / 2.0;
    scalar_t cosscalar_theta = cos(theta);
    scalar_t sinscalar_theta = sin(theta);

    // We do average (integral) pooling inside a bin
    const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4

    scalar_t output_val = 0.;
    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1
      // Sample row position relative to the RoI center (before rotation).
      const scalar_t yy =
          roi_start_h + ph * bin_size_h +
          static_cast<scalar_t>(iy + .5f) * bin_size_h /
              static_cast<scalar_t>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        // Sample column position relative to the RoI center.
        const scalar_t xx = roi_start_w + pw * bin_size_w +
                            static_cast<scalar_t>(ix + .5f) * bin_size_w /
                                static_cast<scalar_t>(roi_bin_grid_w);

        // Rotate by theta (counterclockwise) around the center and translate
        scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h;
        scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w;

        // bilinear_interpolate is defined outside this file; presumably it
        // samples the given channel plane at (y, x) -- confirm in the helper.
        scalar_t val = bilinear_interpolate<scalar_t>(
            offset_bottom_data, height, width, y, x, index);
        scalar_t val_plus = bilinear_interpolate<scalar_t>(
            offset_bottom_data_plus, height, width, y, x, index);
        output_val += r_var * val + l_var * val_plus;
      }
    }
    output_val /= count;

    top_data[index] = output_val;
  }
}

/*** Backward ***/
// Backward kernel of the rotation-invariant rotated RoI Align operator.
//
// Each loop iteration takes one element of the pooled gradient `top_diff`,
// identified by (n, c, o, ph, pw), and scatters it into `bottom_diff` with
// atomicAdd. The gradient is split between two neighbouring orientation
// channels of the input, weighted by r_var and l_var, which are derived from
// the RoI angle.
//
// Each RoI occupies 6 consecutive values in `bottom_rois`:
// (batch index, center x, center y, width, height, theta). theta is passed
// to cos/sin as-is (the degree conversion is commented out).
template <typename scalar_t>
__global__ void riroi_align_rotated_backward_musa_kernel(
    const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,
    const scalar_t spatial_scale, const int num_samples, const bool clockwise,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width, const int num_orientations,
    scalar_t *bottom_diff) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, o, ph, pw) is an element in the pooled output; the flat index
    // is unravelled with pw varying fastest and n slowest.
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int o = (index / pooled_width / pooled_height) % num_orientations;
    int c =
        (index / pooled_width / pooled_height / num_orientations) % channels;
    int n = index / pooled_width / pooled_height / num_orientations / channels;

    const scalar_t *offset_bottom_rois = bottom_rois + n * 6;
    int roi_batch_ind = offset_bottom_rois[0];

    // Do not round
    scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale;
    scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale;
    scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;
    scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;
    // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;
    scalar_t theta = offset_bottom_rois[5];
    // Force malformed ROIs to be 1x1
    roi_width = max(roi_width, (scalar_t)1.);
    roi_height = max(roi_height, (scalar_t)1.);

    scalar_t bin_size_h = static_cast<scalar_t>(roi_height) /
                          static_cast<scalar_t>(pooled_height);
    scalar_t bin_size_w =
        static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);

    // find aligned index
    // ind_float expresses theta in units of one orientation bin
    // (2*pi / num_orientations). Its floor selects the base bin and its
    // fractional part l_var is the weight of the following bin.
    // Note: this uses theta before the `clockwise` sign flip further below.
    scalar_t ind_float = theta * num_orientations / (2 * M_PI);
    int ind = floorf(ind_float);
    scalar_t l_var = ind_float - (scalar_t)ind;
    scalar_t r_var = 1.0 - l_var;
    // correct start channel
    // NOTE(review): a single `+ num_orientations` only makes the result
    // non-negative when ind >= -num_orientations, i.e. theta >= -2*pi;
    // confirm callers keep theta in that range.
    ind = (ind + num_orientations) % num_orientations;
    // rotated channel
    // ind_rot is the orientation channel paired with output orientation o;
    // ind_rot_plus is the next channel, wrapping around.
    int ind_rot = (o - ind + num_orientations) % num_orientations;
    int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations;
    // Start of the (batch, c, ind_rot) and (batch, c, ind_rot_plus) planes of
    // height * width elements in bottom_diff.
    scalar_t *offset_bottom_diff =
        bottom_diff + (roi_batch_ind * channels * num_orientations +
                       c * num_orientations + ind_rot) *
                          height * width;
    scalar_t *offset_bottom_diff_plus =
        bottom_diff + (roi_batch_ind * channels * num_orientations +
                       c * num_orientations + ind_rot_plus) *
                          height * width;
    // Gradient of the pooled element handled by this iteration.
    int top_offset =
        (n * channels * num_orientations + c * num_orientations + o) *
        pooled_height * pooled_width;
    const scalar_t *offset_top_diff = top_diff + top_offset;
    const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];

    // We use roi_bin_grid to sample the grid and mimic integral
    // A positive num_samples fixes the grid size per bin; otherwise it is
    // ceil(bin size) in each direction.
    int roi_bin_grid_h = (num_samples > 0)
                             ? num_samples
                             : ceilf(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width);

    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    if (clockwise) {
      theta = -theta;  // If clockwise, the angle needs to be reversed.
    }
    scalar_t roi_start_h = -roi_height / 2.0;
    scalar_t roi_start_w = -roi_width / 2.0;
    scalar_t cosTheta = cos(theta);
    scalar_t sinTheta = sin(theta);

    // We do average (integral) pooling inside a bin
    // count is only used as a divisor inside the loops below, which do not
    // execute when either grid dimension is 0.
    const scalar_t count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4

    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1
      // Sample position inside the bin, in the RoI's own (unrotated) frame.
      const scalar_t yy =
          roi_start_h + ph * bin_size_h +
          static_cast<scalar_t>(iy + .5f) * bin_size_h /
              static_cast<scalar_t>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const scalar_t xx = roi_start_w + pw * bin_size_w +
                            static_cast<scalar_t>(ix + .5f) * bin_size_w /
                                static_cast<scalar_t>(roi_bin_grid_w);

        // Rotate by theta around the center and translate
        scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h;
        scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w;

        // Filled in by bilinear_interpolate_gradient: the four interpolation
        // weights and the integer coordinates of the four neighbours.
        scalar_t w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;

        bilinear_interpolate_gradient<scalar_t>(height, width, y, x, w1, w2, w3,
                                                w4, x_low, x_high, y_low,
                                                y_high, index);

        // Share of the bin gradient that goes to each neighbour.
        scalar_t g1 = top_diff_this_bin * w1 / count;
        scalar_t g2 = top_diff_this_bin * w2 / count;
        scalar_t g3 = top_diff_this_bin * w3 / count;
        scalar_t g4 = top_diff_this_bin * w4 / count;

        // Presumably the helper reports negative indices for samples it
        // rejects (e.g. outside the feature map) -- confirm in its definition.
        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          // Base orientation channel, weighted by r_var.
          atomicAdd(offset_bottom_diff + y_low * width + x_low, g1 * r_var);
          atomicAdd(offset_bottom_diff + y_low * width + x_high, g2 * r_var);
          atomicAdd(offset_bottom_diff + y_high * width + x_low, g3 * r_var);
          atomicAdd(offset_bottom_diff + y_high * width + x_high, g4 * r_var);

          // Following orientation channel, weighted by l_var.
          atomicAdd(offset_bottom_diff_plus + y_low * width + x_low,
                    g1 * l_var);
          atomicAdd(offset_bottom_diff_plus + y_low * width + x_high,
                    g2 * l_var);
          atomicAdd(offset_bottom_diff_plus + y_high * width + x_low,
                    g3 * l_var);
          atomicAdd(offset_bottom_diff_plus + y_high * width + x_high,
                    g4 * l_var);

        }  // if
      }    // ix
    }      // iy
  }        // MUSA_1D_KERNEL_LOOP
}  // RiRoIAlignBackward

#endif  // RIROI_ALIGN_ROTATED_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/roi_align_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ROI_ALIGN_MUSA_KERNEL_MUH
#define ROI_ALIGN_MUSA_KERNEL_MUH

#include <float.h>
#include "pytorch_musa_helper.hpp"


/*** Forward ***/
// RoI Align forward kernel.
//
// Every iteration of the kernel loop produces one pooled element
// (n, c, ph, pw). The bin of that element is sampled on a regular
// grid with bilinear_interpolate and the samples are reduced with
// max (pool_mode == 0) or mean (pool_mode == 1). In max mode the winning
// sample position is written to argmax_y / argmax_x; both stay at -1 when the
// sampling grid is empty. Any other pool_mode writes nothing.
//
// Each RoI is 5 consecutive values in `rois`: (batch index, x1, y1, x2, y2).
template <typename T>
__global__ void roi_align_forward_musa_kernel(
    const int nthreads, const T* input, const T* rois, T* output, T* argmax_y,
    T* argmax_x, const int pooled_height, const int pooled_width,
    const T spatial_scale, const int sampling_ratio,
    const int pool_mode,  // 0 - max pool, 1 - avg pool
    const bool aligned, const int channels, const int height, const int width) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // Unravel the flat output index; pw varies fastest, n slowest.
    int remaining = index;
    const int pw = remaining % pooled_width;
    remaining /= pooled_width;
    const int ph = remaining % pooled_height;
    remaining /= pooled_height;
    const int c = remaining % channels;
    const int n = remaining / channels;

    const T* roi = rois + n * 5;
    const int batch_idx = roi[0];

    // RoI corners on the feature map. No rounding is applied here; this
    // implementation detail is critical.
    const T shift = aligned ? (T)0.5 : (T)0.0;
    const T x_begin = roi[1] * spatial_scale - shift;
    const T y_begin = roi[2] * spatial_scale - shift;
    const T x_end = roi[3] * spatial_scale - shift;
    const T y_end = roi[4] * spatial_scale - shift;

    T roi_w = x_end - x_begin;
    T roi_h = y_end - y_begin;
    if (!aligned) {
      // Backward-compatibility path: RoIs are at least 1x1.
      roi_w = max(roi_w, (T)1.);
      roi_h = max(roi_h, (T)1.);
    }

    const T bin_h = roi_h / static_cast<T>(pooled_height);
    const T bin_w = roi_w / static_cast<T>(pooled_width);

    // Feature plane (batch_idx, c) of height * width elements.
    const T* plane = input + (batch_idx * channels + c) * height * width;

    // Number of samples per bin in each direction: fixed when sampling_ratio
    // is positive, otherwise ceil(bin size).
    const int grid_h = (sampling_ratio > 0)
                           ? sampling_ratio
                           : static_cast<int>(ceilf(roi_h / pooled_height));
    const int grid_w = (sampling_ratio > 0)
                           ? sampling_ratio
                           : static_cast<int>(ceilf(roi_w / pooled_width));

    switch (pool_mode) {
      case 0: {
        // Max pooling over the samples of the bin.
        T best_val = -FLT_MAX;
        T best_y = -1.f, best_x = -1.f;
        for (int iy = 0; iy < grid_h; iy++) {
          const T y = y_begin + ph * bin_h +
                      static_cast<T>(iy + .5f) * bin_h /
                          static_cast<T>(grid_h);
          for (int ix = 0; ix < grid_w; ix++) {
            const T x = x_begin + pw * bin_w +
                        static_cast<T>(ix + .5f) * bin_w /
                            static_cast<T>(grid_w);
            const T sample =
                bilinear_interpolate(plane, height, width, y, x, index);
            if (sample > best_val) {
              best_val = sample;
              best_y = y;
              best_x = x;
            }
          }
        }
        output[index] = best_val;
        argmax_y[index] = best_y;
        argmax_x[index] = best_x;
        break;
      }
      case 1: {
        // Average pooling; the divisor is clamped to at least 1.
        const T num_samples = max(grid_h * grid_w, 1);
        T sum = 0.;
        for (int iy = 0; iy < grid_h; iy++) {
          const T y = y_begin + ph * bin_h +
                      static_cast<T>(iy + .5f) * bin_h /
                          static_cast<T>(grid_h);
          for (int ix = 0; ix < grid_w; ix++) {
            const T x = x_begin + pw * bin_w +
                        static_cast<T>(ix + .5f) * bin_w /
                            static_cast<T>(grid_w);
            const T sample =
                bilinear_interpolate(plane, height, width, y, x, index);
            sum += sample;
          }
        }
        output[index] = sum / num_samples;
        break;
      }
      default:
        break;
    }
  }
}

/*** Backward ***/
// RoI Align backward kernel.
//
// Every iteration of the kernel loop takes the gradient of one pooled element
// (n, c, ph, pw) and accumulates it into grad_input with atomicAdd.
//  - pool_mode == 0 (max): the gradient goes to the four neighbours of the
//    sample position stored in argmax_y / argmax_x; an argmax_y of -1 means
//    there is nothing to propagate.
//  - pool_mode == 1 (avg): the sampling grid of the forward pass is rebuilt
//    and every sample receives 1 / count of the gradient.
// Any other pool_mode does nothing.
//
// Each RoI is 5 consecutive values in `rois`: (batch index, x1, y1, x2, y2).
template <typename T>
__global__ void roi_align_backward_musa_kernel(
    const int nthreads, const T* grad_output, const T* rois, const T* argmax_y,
    const T* argmax_x, T* grad_input, const int pooled_height,
    const int pooled_width, const T spatial_scale, const int sampling_ratio,
    const int pool_mode,  // 0 - max pool, 1 - avg pool
    const bool aligned, const int channels, const int height, const int width) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // Unravel the flat output index; pw varies fastest, n slowest.
    int remaining = index;
    const int pw = remaining % pooled_width;
    remaining /= pooled_width;
    const int ph = remaining % pooled_height;
    remaining /= pooled_height;
    const int c = remaining % channels;
    const int n = remaining / channels;

    const T grad_bin = grad_output[index];

    const T* roi = rois + n * 5;
    const int batch_idx = roi[0];
    // Gradient plane (batch_idx, c) of height * width elements.
    T* grad_plane = grad_input + ((batch_idx * channels + c) * height * width);

    switch (pool_mode) {
      case 0: {
        T y = argmax_y[index], x = argmax_x[index];
        if (y != -1.f) {
          T w1, w2, w3, w4;
          int x_low, x_high, y_low, y_high;
          bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
                                        x_low, x_high, y_low, y_high, index);

          const bool valid =
              x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0;
          if (valid) {
            atomicAdd(grad_plane + y_low * width + x_low, grad_bin * w1);
            atomicAdd(grad_plane + y_low * width + x_high, grad_bin * w2);
            atomicAdd(grad_plane + y_high * width + x_low, grad_bin * w3);
            atomicAdd(grad_plane + y_high * width + x_high, grad_bin * w4);
          }
        }
        break;
      }
      case 1: {
        // RoI corners on the feature map. No rounding is applied here; this
        // implementation detail is critical.
        const T shift = aligned ? (T)0.5 : (T)0.0;
        const T x_begin = roi[1] * spatial_scale - shift;
        const T y_begin = roi[2] * spatial_scale - shift;
        const T x_end = roi[3] * spatial_scale - shift;
        const T y_end = roi[4] * spatial_scale - shift;

        T roi_w = x_end - x_begin;
        T roi_h = y_end - y_begin;
        if (!aligned) {
          // Backward-compatibility path: RoIs are at least 1x1.
          roi_w = max(roi_w, (T)1.);
          roi_h = max(roi_h, (T)1.);
        }

        const T bin_h = roi_h / static_cast<T>(pooled_height);
        const T bin_w = roi_w / static_cast<T>(pooled_width);

        // Samples per bin in each direction: fixed when sampling_ratio is
        // positive, otherwise ceil(bin size).
        const int grid_h =
            (sampling_ratio > 0)
                ? sampling_ratio
                : static_cast<int>(ceilf(roi_h / pooled_height));
        const int grid_w =
            (sampling_ratio > 0)
                ? sampling_ratio
                : static_cast<int>(ceilf(roi_w / pooled_width));

        // Divisor of the average; only used inside the loops, which do not
        // run when either grid dimension is 0.
        const T count = grid_h * grid_w;  // e.g. = 4

        for (int iy = 0; iy < grid_h; iy++) {
          const T y = y_begin + ph * bin_h +
                      static_cast<T>(iy + .5f) * bin_h /
                          static_cast<T>(grid_h);
          for (int ix = 0; ix < grid_w; ix++) {
            const T x = x_begin + pw * bin_w +
                        static_cast<T>(ix + .5f) * bin_w /
                            static_cast<T>(grid_w);

            T w1, w2, w3, w4;
            int x_low, x_high, y_low, y_high;
            bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
                                          x_low, x_high, y_low, y_high, index);

            const bool valid =
                x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0;
            if (valid) {
              atomicAdd(grad_plane + y_low * width + x_low,
                        grad_bin * w1 / count);
              atomicAdd(grad_plane + y_low * width + x_high,
                        grad_bin * w2 / count);
              atomicAdd(grad_plane + y_high * width + x_low,
                        grad_bin * w3 / count);
              atomicAdd(grad_plane + y_high * width + x_high,
                        grad_bin * w4 / count);
            }
          }
        }
        break;
      }
      default:
        break;
    }
  }
}

#endif  // ROI_ALIGN_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/roi_align_rotated_musa_kernel.muh
================================================
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#ifndef ROI_ALIGN_ROTATED_MUSA_KERNEL_MUH
#define ROI_ALIGN_ROTATED_MUSA_KERNEL_MUH

#include <float.h>
#include "pytorch_musa_helper.hpp"

/*** Forward ***/
// Rotated RoI Align forward kernel.
//
// Every iteration of the kernel loop produces one pooled element
// (n, c, ph, pw): the bin is sampled on a regular grid laid out in the RoI's
// own frame, each sample is rotated by theta around the RoI center, read with
// bilinear_interpolate and the samples are averaged.
//
// Each RoI is 6 consecutive values in `bottom_rois`:
// (batch index, center x, center y, width, height, theta). theta is passed to
// cos/sin as-is and negated first when `clockwise` is set.
template <typename scalar_t>
__global__ void roi_align_rotated_forward_musa_kernel(
    const int nthreads, const scalar_t *bottom_data,
    const scalar_t *bottom_rois, const scalar_t spatial_scale,
    const int sampling_ratio, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width, scalar_t *top_data) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // Unravel the flat output index; pw varies fastest, n slowest.
    int remaining = index;
    const int pw = remaining % pooled_width;
    remaining /= pooled_width;
    const int ph = remaining % pooled_height;
    remaining /= pooled_height;
    const int c = remaining % channels;
    const int n = remaining / channels;

    const scalar_t *roi = bottom_rois + n * 6;
    const int batch_idx = roi[0];

    // RoI geometry on the feature map. No rounding is applied here; this
    // implementation detail is critical.
    const scalar_t shift = aligned ? (scalar_t)0.5 : (scalar_t)0.0;
    const scalar_t center_w = roi[1] * spatial_scale - shift;
    const scalar_t center_h = roi[2] * spatial_scale - shift;
    scalar_t box_w = roi[3] * spatial_scale;
    scalar_t box_h = roi[4] * spatial_scale;
    // scalar_t angle = roi[5] * M_PI / 180.0;
    scalar_t angle = roi[5];
    if (clockwise) {
      angle = -angle;  // a clockwise angle is applied with the opposite sign
    }
    if (!aligned) {
      // Backward-compatibility path: malformed RoIs become at least 1x1.
      box_w = max(box_w, (scalar_t)1.);
      box_h = max(box_h, (scalar_t)1.);
    }
    const scalar_t bin_h = box_h / static_cast<scalar_t>(pooled_height);
    const scalar_t bin_w = box_w / static_cast<scalar_t>(pooled_width);

    // Feature plane (batch_idx, c) of height * width elements.
    const scalar_t *plane =
        bottom_data + (batch_idx * channels + c) * height * width;

    // Samples per bin in each direction: fixed when sampling_ratio is
    // positive, otherwise ceil(bin size).
    const int grid_h = (sampling_ratio > 0)
                           ? sampling_ratio
                           : ceilf(box_h / pooled_height);  // e.g., = 2
    const int grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceilf(box_w / pooled_width);

    // Top-left corner of the RoI relative to its center; the translation to
    // the center is added after the rotation.
    const scalar_t start_h = -box_h / 2.0;
    const scalar_t start_w = -box_w / 2.0;
    const scalar_t cos_angle = cos(angle);
    const scalar_t sin_angle = sin(angle);

    // Divisor of the average, clamped to at least 1.
    const scalar_t count = max(grid_h * grid_w, 1);  // e.g. = 4

    scalar_t acc = 0.;
    for (int iy = 0; iy < grid_h; iy++) {  // e.g., iy = 0, 1
      const scalar_t yy = start_h + ph * bin_h +
                          static_cast<scalar_t>(iy + .5f) * bin_h /
                              static_cast<scalar_t>(grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < grid_w; ix++) {
        const scalar_t xx = start_w + pw * bin_w +
                            static_cast<scalar_t>(ix + .5f) * bin_w /
                                static_cast<scalar_t>(grid_w);

        // Rotate the sample around the center, then translate.
        scalar_t y = yy * cos_angle - xx * sin_angle + center_h;
        scalar_t x = yy * sin_angle + xx * cos_angle + center_w;

        const scalar_t sample = bilinear_interpolate<scalar_t>(
            plane, height, width, y, x, index);
        acc += sample;
      }
    }
    acc /= count;

    top_data[index] = acc;
  }
}

/*** Backward ***/
// Rotated RoI Align backward kernel.
//
// Every iteration of the kernel loop takes the gradient of one pooled element
// (n, c, ph, pw), rebuilds the rotated sampling grid of that bin and adds
// 1 / count of the gradient, weighted by the bilinear coefficients, to the
// four neighbours of each sample in bottom_diff using atomicAdd.
//
// Each RoI is 6 consecutive values in `bottom_rois`:
// (batch index, center x, center y, width, height, theta).
template <typename scalar_t>
__global__ void roi_align_rotated_backward_musa_kernel(
    const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,
    const scalar_t spatial_scale, const int sampling_ratio, const bool aligned,
    const bool clockwise, const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width, scalar_t *bottom_diff) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // Unravel the flat output index; pw varies fastest, n slowest.
    int remaining = index;
    const int pw = remaining % pooled_width;
    remaining /= pooled_width;
    const int ph = remaining % pooled_height;
    remaining /= pooled_height;
    const int c = remaining % channels;
    const int n = remaining / channels;

    const scalar_t *roi = bottom_rois + n * 6;
    const int batch_idx = roi[0];

    // RoI geometry on the feature map; no rounding.
    const scalar_t shift = aligned ? (scalar_t)0.5 : (scalar_t)0.0;
    const scalar_t center_w = roi[1] * spatial_scale - shift;
    const scalar_t center_h = roi[2] * spatial_scale - shift;
    scalar_t box_w = roi[3] * spatial_scale;
    scalar_t box_h = roi[4] * spatial_scale;
    // scalar_t angle = roi[5] * M_PI / 180.0;
    scalar_t angle = roi[5];
    if (clockwise) {
      angle = -angle;  // a clockwise angle is applied with the opposite sign
    }
    if (!aligned) {
      // Backward-compatibility path: malformed RoIs become at least 1x1.
      box_w = max(box_w, (scalar_t)1.);
      box_h = max(box_h, (scalar_t)1.);
    }
    const scalar_t bin_h = box_h / static_cast<scalar_t>(pooled_height);
    const scalar_t bin_w = box_w / static_cast<scalar_t>(pooled_width);

    // Gradient plane (batch_idx, c) of height * width elements.
    scalar_t *grad_plane =
        bottom_diff + (batch_idx * channels + c) * height * width;

    // Gradient of the pooled element handled by this iteration.
    const int pooled_plane = (n * channels + c) * pooled_height * pooled_width;
    const scalar_t grad_bin = top_diff[pooled_plane + ph * pooled_width + pw];

    // Samples per bin in each direction: fixed when sampling_ratio is
    // positive, otherwise ceil(bin size).
    const int grid_h = (sampling_ratio > 0)
                           ? sampling_ratio
                           : ceilf(box_h / pooled_height);  // e.g., = 2
    const int grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceilf(box_w / pooled_width);

    // Top-left corner of the RoI relative to its center; the translation to
    // the center is added after the rotation.
    const scalar_t start_h = -box_h / 2.0;
    const scalar_t start_w = -box_w / 2.0;
    const scalar_t cos_angle = cos(angle);
    const scalar_t sin_angle = sin(angle);

    // Divisor of the average; only used inside the loops, which do not run
    // when either grid dimension is 0.
    const scalar_t count = grid_h * grid_w;  // e.g. = 4

    for (int iy = 0; iy < grid_h; iy++) {  // e.g., iy = 0, 1
      const scalar_t yy = start_h + ph * bin_h +
                          static_cast<scalar_t>(iy + .5f) * bin_h /
                              static_cast<scalar_t>(grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < grid_w; ix++) {
        const scalar_t xx = start_w + pw * bin_w +
                            static_cast<scalar_t>(ix + .5f) * bin_w /
                                static_cast<scalar_t>(grid_w);

        // Rotate the sample around the center, then translate.
        scalar_t y = yy * cos_angle - xx * sin_angle + center_h;
        scalar_t x = yy * sin_angle + xx * cos_angle + center_w;

        scalar_t w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;

        bilinear_interpolate_gradient<scalar_t>(height, width, y, x, w1, w2, w3,
                                                w4, x_low, x_high, y_low,
                                                y_high, index);

        // Share of the bin gradient for each of the four neighbours.
        const scalar_t g1 = grad_bin * w1 / count;
        const scalar_t g2 = grad_bin * w2 / count;
        const scalar_t g3 = grad_bin * w3 / count;
        const scalar_t g4 = grad_bin * w4 / count;

        const bool valid =
            x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0;
        if (valid) {
          atomicAdd(grad_plane + y_low * width + x_low, g1);
          atomicAdd(grad_plane + y_low * width + x_high, g2);
          atomicAdd(grad_plane + y_high * width + x_low, g3);
          atomicAdd(grad_plane + y_high * width + x_high, g4);
        }
      }  // ix
    }    // iy
  }      // MUSA_1D_KERNEL_LOOP
}  // RoIAlignBackward

#endif  // ROI_ALIGN_ROTATED_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/roi_pool_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ROI_POOL_MUSA_KERNEL_MUH
#define ROI_POOL_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

// RoI max-pooling forward kernel.
//
// Every iteration of the kernel loop produces one pooled element
// (n, c, ph, pw): the maximum of the input over the integer pixel range of
// the bin, plus (when `argmax` is not NULL) the flat h * width + w offset of
// that maximum inside the (batch, c) plane. Bins that cover no pixel produce
// value 0 and argmax -1.
//
// Each RoI is 5 consecutive values in `rois`: (batch index, x1, y1, x2, y2);
// x2 / y2 are treated as inclusive (1 is added before scaling).
template <typename T>
__global__ void roi_pool_forward_musa_kernel(
    const int nthreads, const T* input, const T* rois, T* output, int* argmax,
    const int pooled_height, const int pooled_width, const T spatial_scale,
    const int channels, const int height, const int width) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T* offset_rois = rois + n * 5;
    int roi_batch_ind = offset_rois[0];
    // calculate the roi region on feature maps
    T roi_x1 = offset_rois[1] * spatial_scale;
    T roi_y1 = offset_rois[2] * spatial_scale;
    T roi_x2 = (offset_rois[3] + 1) * spatial_scale;
    T roi_y2 = (offset_rois[4] + 1) * spatial_scale;

    T roi_w = roi_x2 - roi_x1;
    T roi_h = roi_y2 - roi_y1;
    // A RoI without positive extent has nothing to pool. Store the same
    // result an empty bin gets (value 0, argmax -1) instead of leaving the
    // element untouched, so the outcome does not depend on how the caller
    // initialised the buffers and the backward kernel skips this element.
    if (roi_w <= 0 || roi_h <= 0) {
      output[index] = static_cast<T>(0.f);
      if (argmax != NULL) argmax[index] = -1;
      continue;
    }

    T bin_size_w = roi_w / static_cast<T>(pooled_width);
    T bin_size_h = roi_h / static_cast<T>(pooled_height);

    // the corresponding bin region
    int bin_x1 = floorf(static_cast<T>(pw) * bin_size_w + roi_x1);
    int bin_y1 = floorf(static_cast<T>(ph) * bin_size_h + roi_y1);
    int bin_x2 = ceilf(static_cast<T>(pw + 1) * bin_size_w + roi_x1);
    int bin_y2 = ceilf(static_cast<T>(ph + 1) * bin_size_h + roi_y1);

    // add roi offsets and clip to input boundaries
    bin_x1 = min(max(bin_x1, 0), width);
    bin_y1 = min(max(bin_y1, 0), height);
    bin_x2 = min(max(bin_x2, 0), width);
    bin_y2 = min(max(bin_y2, 0), height);
    bool is_empty = (bin_y2 <= bin_y1) || (bin_x2 <= bin_x1);

    const T* offset_input =
        input + (roi_batch_ind * channels + c) * height * width;
    // Define an empty pooling region to be zero
    // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
    T max_val = is_empty ? 0 : -FLT_MAX;
    int max_idx = -1;
    for (int h = bin_y1; h < bin_y2; ++h) {
      for (int w = bin_x1; w < bin_x2; ++w) {
        int offset = h * width + w;
        if (offset_input[offset] > max_val) {
          max_val = offset_input[offset];
          max_idx = offset;
        }
      }
    }
    output[index] = max_val;
    if (argmax != NULL) argmax[index] = max_idx;
  }
}

// RoI max-pooling backward kernel.
//
// Every iteration of the kernel loop routes the gradient of one pooled
// element to the input position recorded in `argmax` (an offset inside the
// (batch, c) plane), using atomicAdd. Elements whose argmax is -1 contribute
// nothing. The batch index is the first of the 5 values of the RoI.
template <typename T>
__global__ void roi_pool_backward_musa_kernel(
    const int nthreads, const T* grad_output, const T* rois, const int* argmax,
    T* grad_input, const int pooled_height, const int pooled_width,
    const int channels, const int height, const int width) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    const int target = argmax[index];
    if (target == -1) {
      continue;  // nothing was pooled for this element
    }

    // Only (n, c) of the pooled element are needed here.
    const int plane = index / pooled_width / pooled_height;
    const int c = plane % channels;
    const int n = plane / channels;

    const int batch_idx = rois[n * 5];
    T* grad_plane = grad_input + ((batch_idx * channels + c) * height * width);
    atomicAdd(grad_plane + target, grad_output[index]);
  }
}

#endif  // ROI_POOL_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/roiaware_pool3d_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ROIAWARE_POOL3D_MUSA_KERNEL_MUH
#define ROIAWARE_POOL3D_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

// Rotates the offset (shift_x, shift_y) by the angle -rz and stores the
// rotated coordinates in local_x and local_y.
template <typename T>
__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz,
                                             T &local_x, T &local_y) {
  const T cos_neg_rz = cos(-rz);
  const T sin_neg_rz = sin(-rz);
  local_x = shift_x * cos_neg_rz + shift_y * (-sin_neg_rz);
  local_y = shift_x * sin_neg_rz + shift_y * cos_neg_rz;
}

// Tests whether a point lies strictly inside a rotated 3D box.
//
// param pt: (x, y, z)
// param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate,
// cz is the bottom center of the box
// Returns 1 when the point is inside and 0 otherwise. local_x / local_y
// receive the point's coordinates in the box frame; they are left untouched
// when the point is rejected by the height test.
template <typename T>
__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x,
                                        T &local_y) {
  const T px = pt[0], py = pt[1], pz = pt[2];
  const T box_cx = box3d[0], box_cy = box3d[1];
  const T size_x = box3d[3], size_y = box3d[4], size_z = box3d[5];
  const T yaw = box3d[6];

  // box3d[2] is the bottom of the box; move it up to the vertical center.
  T center_z = box3d[2];
  center_z += size_z / 2.0;

  // Height test first: it needs no rotation.
  if (fabsf(pz - center_z) > size_z / 2.0) {
    return 0;
  }

  lidar_to_local_coords(px - box_cx, py - box_cy, yaw, local_x, local_y);

  const bool inside_x = (local_x > -size_x / 2.0) && (local_x < size_x / 2.0);
  const bool inside_y = (local_y > -size_y / 2.0) && (local_y < size_y / 2.0);
  return (inside_x && inside_y) ? 1 : 0;
}

// For every (box, point) pair, records which voxel of the box the point falls
// into, or -1 when the point is outside the box.
//
// params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
// coordinate
// params pts: (npoints, 3) [x, y, z]
// params pts_mask: (N, npoints): -1 means the point is not in this box,
// otherwise the voxel index packed as (x_idx << 16) + (y_idx << 8) + z_idx
//
// The box is selected by blockIdx.y; the kernel loop runs over the points.
template <typename T>
__global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num,
                                            int out_x, int out_y, int out_z,
                                            const T *rois, const T *pts,
                                            int *pts_mask) {
  int box_idx = blockIdx.y;
  MUSA_1D_KERNEL_LOOP(pt_idx, pts_num) {
    if (box_idx >= boxes_num) return;

    // Per-iteration pointers. The kernel arguments themselves must not be
    // advanced here: if the loop body runs more than once in a thread, the
    // offsets would add up and later iterations would address wrong elements.
    const T *cur_pt = pts + pt_idx * 3;
    const T *cur_roi = rois + box_idx * 7;
    int *cur_mask = pts_mask + box_idx * pts_num + pt_idx;

    T local_x = 0, local_y = 0;
    int cur_in_flag = check_pt_in_box3d(cur_pt, cur_roi, local_x, local_y);

    cur_mask[0] = -1;
    if (cur_in_flag > 0) {
      // Height above the box bottom (the roi z value).
      T local_z = cur_pt[2] - cur_roi[2];
      T x_size = cur_roi[3], y_size = cur_roi[4], z_size = cur_roi[5];

      // Edge lengths of one voxel.
      T x_res = x_size / out_x;
      T y_res = y_size / out_y;
      T z_res = z_size / out_z;

      // local_x / local_y are relative to the box center, so shift them by
      // half the box size before dividing by the voxel size.
      unsigned int x_idx = int((local_x + x_size / 2) / x_res);
      unsigned int y_idx = int((local_y + y_size / 2) / y_res);
      unsigned int z_idx = int(local_z / z_res);

      x_idx = min(max(x_idx, 0), out_x - 1);
      y_idx = min(max(y_idx, 0), out_y - 1);
      z_idx = min(max(z_idx, 0), out_z - 1);

      // y_idx and z_idx each get 8 bits below x_idx, so the packing is only
      // unambiguous while out_y and out_z do not exceed 256.
      unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx;

      cur_mask[0] = idx_encoding;
    }
  }
}

// Builds, for every voxel of every box, the list of point indices that fall
// into it, from the packed voxel indices stored in pts_mask.
//
// params pts_mask: (N, npoints): -1 when the point is outside the box,
// otherwise the voxel index packed as (x_idx << 16) + (y_idx << 8) + z_idx
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel);
// slot 0 of every voxel is the number of stored points and the point indices
// follow. Points beyond max_pts_each_voxel - 1 per voxel are dropped.
//
// The kernel loop runs over the boxes; the points of a box are visited
// sequentially, so the counters of a box are only touched by one iteration.
template <typename T>
__global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num,
                                             int max_pts_each_voxel, int out_x,
                                             int out_y, int out_z,
                                             const int *pts_mask,
                                             T *pts_idx_of_voxels) {
  MUSA_1D_KERNEL_LOOP(box_idx, boxes_num) {
    int max_num_pts = max_pts_each_voxel - 1;  // index 0 is the counter
    // Per-iteration pointer. The kernel argument itself must not be advanced
    // here: if the loop body runs more than once in a thread, the offsets
    // would add up and later boxes would be written to the wrong place.
    T *box_voxels =
        pts_idx_of_voxels + box_idx * out_x * out_y * out_z * max_pts_each_voxel;

    for (int k = 0; k < pts_num; k++) {
      const int mask_val = pts_mask[box_idx * pts_num + k];
      if (mask_val != -1) {
        // Unpack the three 8-bit voxel coordinates.
        unsigned int idx_encoding = mask_val;
        unsigned int x_idx = (idx_encoding >> 16) & 0xFF;
        unsigned int y_idx = (idx_encoding >> 8) & 0xFF;
        unsigned int z_idx = idx_encoding & 0xFF;
        unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel +
                                   y_idx * out_z * max_pts_each_voxel +
                                   z_idx * max_pts_each_voxel;
        unsigned int cnt = box_voxels[base_offset];
        if (cnt < max_num_pts) {
          box_voxels[base_offset + cnt + 1] = k;
          box_voxels[base_offset]++;
        }
      }
    }
  }
}

template <typename T>
__global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels,
                                   int max_pts_each_voxel, int out_x, int out_y,
                                   int out_z, const T *pts_feature,
                                   const int *pts_idx_of_voxels,
                                   T *pooled_features, int *argmax) {
  // Max-pools one feature channel over the points listed for each voxel and
  // records which point supplied the maximum.
  // params pts_feature: (npoints, C)
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
  // index 0 is the counter
  // params pooled_features: (N, out_x, out_y, out_z, C)
  // params argmax: (N, out_x, out_y, out_z, C), -1 for voxels without points

  const int box_idx = blockIdx.z;
  const int channel_idx = blockIdx.y;
  if (box_idx >= boxes_num || channel_idx >= channels) return;

  MUSA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
    const int x_idx = voxel_idx_flat / (out_y * out_z);
    const int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
    const int z_idx = voxel_idx_flat % out_z;
    const int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;

    // Offsets are computed from the untouched kernel arguments on every pass.
    // Advancing the arguments in place would compound the offset whenever one
    // thread processes more than one voxel.
    const int *voxel_pts = pts_idx_of_voxels +
                           box_idx * out_x * out_y * out_z * max_pts_each_voxel +
                           offset_base * max_pts_each_voxel;
    const int out_offset = box_idx * out_x * out_y * out_z * channels +
                           offset_base * channels + channel_idx;

    int argmax_idx = -1;
    float max_val = -1e50;

    const int total_pts = voxel_pts[0];
    for (int k = 1; k <= total_pts; k++) {
      const int pt_idx = voxel_pts[k];
      if (pts_feature[pt_idx * channels + channel_idx] > max_val) {
        max_val = pts_feature[pt_idx * channels + channel_idx];
        argmax_idx = pt_idx;
      }
    }

    // pooled_features is left untouched for voxels that received no point.
    if (argmax_idx != -1) {
      pooled_features[out_offset] = max_val;
    }
    argmax[out_offset] = argmax_idx;
  }
}

template <typename T>
__global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels,
                                   int max_pts_each_voxel, int out_x, int out_y,
                                   int out_z, const T *pts_feature,
                                   const int *pts_idx_of_voxels,
                                   T *pooled_features) {
  // Average-pools one feature channel over the points listed for each voxel.
  // params pts_feature: (npoints, C)
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
  // index 0 is the counter
  // params pooled_features: (N, out_x, out_y, out_z, C)

  const int box_idx = blockIdx.z;
  const int channel_idx = blockIdx.y;
  if (box_idx >= boxes_num || channel_idx >= channels) return;

  MUSA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
    const int x_idx = voxel_idx_flat / (out_y * out_z);
    const int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
    const int z_idx = voxel_idx_flat % out_z;
    const int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;

    // Offsets are computed from the untouched kernel arguments on every pass.
    // Advancing the arguments in place would compound the offset whenever one
    // thread processes more than one voxel.
    const int *voxel_pts = pts_idx_of_voxels +
                           box_idx * out_x * out_y * out_z * max_pts_each_voxel +
                           offset_base * max_pts_each_voxel;
    const int out_offset = box_idx * out_x * out_y * out_z * channels +
                           offset_base * channels + channel_idx;

    float sum_val = 0;
    const int total_pts = voxel_pts[0];

    for (int k = 1; k <= total_pts; k++) {
      sum_val += pts_feature[voxel_pts[k] * channels + channel_idx];
    }

    // pooled_features is left untouched for voxels that received no point.
    if (total_pts > 0) {
      pooled_features[out_offset] = sum_val / total_pts;
    }
  }
}

template <typename T>
__global__ void roiaware_maxpool3d_backward(int boxes_num, int channels,
                                            int out_x, int out_y, int out_z,
                                            const int *argmax,
                                            const T *grad_out, T *grad_in) {
  // Routes each pooled gradient back to the point recorded in argmax.
  // params argmax: (N, out_x, out_y, out_z, C), -1 means no contributing point
  // params grad_out: (N, out_x, out_y, out_z, C)
  // params grad_in: (npoints, C), return value

  const int box_idx = blockIdx.z;
  const int channel_idx = blockIdx.y;
  if (box_idx >= boxes_num || channel_idx >= channels) return;

  MUSA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
    const int x_idx = voxel_idx_flat / (out_y * out_z);
    const int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
    const int z_idx = voxel_idx_flat % out_z;
    const int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;

    // The offset is computed from the untouched kernel arguments on every
    // pass. Advancing the arguments in place would compound the offset
    // whenever one thread processes more than one voxel.
    const int out_offset = box_idx * out_x * out_y * out_z * channels +
                           offset_base * channels + channel_idx;

    const int src_pt = argmax[out_offset];
    // Skip only this voxel: returning here would also drop any further
    // voxels assigned to the same thread.
    if (src_pt == -1) continue;

    // Several voxels may share a source point, hence the atomic accumulation.
    atomicAdd(grad_in + src_pt * channels + channel_idx,
              grad_out[out_offset] * 1);
  }
}

template <typename T>
__global__ void roiaware_avgpool3d_backward(int boxes_num, int channels,
                                            int out_x, int out_y, int out_z,
                                            int max_pts_each_voxel,
                                            const int *pts_idx_of_voxels,
                                            const T *grad_out, T *grad_in) {
  // Spreads each pooled gradient evenly over the points listed for the voxel.
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
  // index 0 is the counter
  // params grad_out: (N, out_x, out_y, out_z, C)
  // params grad_in: (npoints, C), return value

  const int box_idx = blockIdx.z;
  const int channel_idx = blockIdx.y;
  if (box_idx >= boxes_num || channel_idx >= channels) return;

  MUSA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
    const int x_idx = voxel_idx_flat / (out_y * out_z);
    const int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
    const int z_idx = voxel_idx_flat % out_z;
    const int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;

    // Offsets are computed from the untouched kernel arguments on every pass.
    // Advancing the arguments in place would compound the offset whenever one
    // thread processes more than one voxel.
    const int *voxel_pts = pts_idx_of_voxels +
                           box_idx * out_x * out_y * out_z * max_pts_each_voxel +
                           offset_base * max_pts_each_voxel;
    const T *voxel_grad = grad_out +
                          box_idx * out_x * out_y * out_z * channels +
                          offset_base * channels + channel_idx;

    const int total_pts = voxel_pts[0];
    // fmaxf keeps the divisor at 1 for empty voxels (the loop below is then
    // skipped anyway).
    float cur_grad = 1 / fmaxf(float(total_pts), 1.0);
    for (int k = 1; k <= total_pts; k++) {
      atomicAdd(grad_in + voxel_pts[k] * channels + channel_idx,
                voxel_grad[0] * cur_grad);
    }
  }
}

#endif  // ROIAWARE_POOL3D_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/roipoint_pool3d_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ROIPOINT_POOL3D_MUSA_KERNEL_MUH
#define ROIPOINT_POOL3D_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

template <typename T>
__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz,
                                             T &local_x, T &local_y) {
  // Rotate the offset (shift_x, shift_y) by the angle -rz and write the
  // result to local_x / local_y.
  T cos_neg = cos(-rz);
  T sin_neg = sin(-rz);
  local_x = shift_x * cos_neg + shift_y * (-sin_neg);
  local_y = shift_x * sin_neg + shift_y * cos_neg;
}

template <typename T>
__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x,
                                        T &local_y) {
  // Returns 1 when pt lies strictly inside box3d in x/y and within the box
  // height, 0 otherwise. local_x / local_y receive the point's box-frame
  // offset, but only when the height test passes.
  // param pt: (x, y, z)
  // param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate, cz in the
  // bottom center
  T dz = box3d[5];
  T cz = box3d[2];
  cz += dz / 2.0;  // move from the bottom face to the geometric center

  // Height test first: it needs no rotation.
  if (fabsf(pt[2] - cz) > dz / 2.0) return 0;

  T dx = box3d[3], dy = box3d[4];
  lidar_to_local_coords(pt[0] - box3d[0], pt[1] - box3d[1], box3d[6], local_x,
                        local_y);

  const bool within_x = (local_x > -dx / 2.0) & (local_x < dx / 2.0);
  const bool within_y = (local_y > -dy / 2.0) & (local_y < dy / 2.0);
  return within_x & within_y;
}

template <typename T>
__global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num,
                                    const T *xyz, const T *boxes3d,
                                    int *pts_assign) {
  // Writes, for every (batch, point, box) triple, whether the point lies in
  // the box.
  // params xyz: (B, N, 3)
  // params boxes3d: (B, M, 7)
  // params pts_assign: (B, N, M): result of check_pt_in_box3d for point n and
  // box m
  const int box_idx = blockIdx.y;
  const int bs_idx = blockIdx.z;
  MUSA_1D_KERNEL_LOOP(pt_idx, pts_num) {
    if (box_idx >= boxes_num || bs_idx >= batch_size) return;

    int *assign_slot =
        pts_assign + bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
    assign_slot[0] = 0;

    const T *pt = xyz + bs_idx * pts_num * 3 + pt_idx * 3;
    const T *box = boxes3d + bs_idx * boxes_num * 7 + box_idx * 7;

    // The local coordinates are required by the helper but not used here.
    T local_x = 0, local_y = 0;
    int cur_in_flag = check_pt_in_box3d(pt, box, local_x, local_y);
    assign_slot[0] = cur_in_flag;
  }
}

__global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num,
                               int sampled_pts_num, const int *pts_assign,
                               int *pts_idx, int *pooled_empty_flag) {
  // Collects, per box, the indices of the first sampled_pts_num points
  // assigned to it; boxes with fewer points repeat their points cyclically.
  // params pts_assign: (B, N, M), non-zero when point n is assigned to box m
  // params pts_idx: (B, M, sampled_pts_num)
  // params pooled_empty_flag: (B, M), set to 1 for boxes without any point
  MUSA_1D_KERNEL_LOOP(boxes_idx, boxes_num) {
    const int bs_idx = blockIdx.y;

    // Column of pts_assign for this box (stride boxes_num between points).
    const int *assign_col =
        pts_assign + bs_idx * pts_num * boxes_num + boxes_idx;
    int *box_pts_idx = pts_idx + bs_idx * boxes_num * sampled_pts_num +
                       boxes_idx * sampled_pts_num;

    // Take assigned points in index order until the quota is filled.
    int cnt = 0;
    for (int k = 0; k < pts_num && cnt < sampled_pts_num; k++) {
      if (assign_col[k * boxes_num]) {
        box_pts_idx[cnt] = k;
        cnt++;
      }
    }

    if (cnt == 0) {
      pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;
      continue;
    }

    // duplicate same points for sampling
    for (int k = cnt; k < sampled_pts_num; k++) {
      box_pts_idx[k] = box_pts_idx[k % cnt];
    }
  }
}

template <typename T>
__global__ void roipoint_pool3d_forward(
    int batch_size, int pts_num, int boxes_num, int feature_in_len,
    int sampled_pts_num, const T *xyz, const int *pts_idx, const T *pts_feature,
    T *pooled_features, int *pooled_empty_flag) {
  // Gathers coordinates and features of the sampled points of every box.
  // params xyz: (B, N, 3)
  // params pts_idx: (B, M, sampled_pts_num)
  // params pts_feature: (B, N, C) with C = feature_in_len
  // params pooled_features: (B, M, sampled_pts_num, 3+C)
  // params pooled_empty_flag: (B, M), non-zero boxes are skipped
  const int box_idx = blockIdx.y;
  const int bs_idx = blockIdx.z;
  MUSA_1D_KERNEL_LOOP(sample_pt_idx, sampled_pts_num) {
    if (box_idx >= boxes_num || bs_idx >= batch_size) return;
    if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) return;

    const int slot = bs_idx * boxes_num * sampled_pts_num +
                     box_idx * sampled_pts_num + sample_pt_idx;
    const int src_pt_idx = pts_idx[slot];

    T *dst = pooled_features + slot * (3 + feature_in_len);
    const T *src_xyz = xyz + bs_idx * pts_num * 3 + src_pt_idx * 3;
    const T *src_feat = pts_feature + bs_idx * pts_num * feature_in_len +
                        src_pt_idx * feature_in_len;

    // Output row layout: [x, y, z, feature_0 .. feature_{C-1}].
    for (int j = 0; j < 3; j++) {
      dst[j] = src_xyz[j];
    }
    memcpy(dst + 3, src_feat, feature_in_len * sizeof(T));
  }
}

#endif  // ROIPOINT_POOL3D_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/rotated_feature_align_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu
#ifndef ROTATED_FEATURE_ALIGN_MUSA_KERNEL_MUH
#define ROTATED_FEATURE_ALIGN_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

template <typename scalar_t>
__global__ void rotated_feature_align_forward_kernel(
    const int nthreads, const int points, const scalar_t* bottom_data,
    const scalar_t* best_bboxes, const scalar_t spatial_scale,
    const int channels, const int height, const int width, scalar_t* top_data) {
  // For every output element, adds to the input value the features
  // bilinearly sampled at the box center (and, when points > 1, at the four
  // rotated box corners).
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // Unflatten index into (n, c, h, w).
    const int w = index % width;
    int rest = index / width;
    const int h = rest % height;
    rest = rest / height;
    const int c = rest % channels;
    const int n = rest / channels;

    // Five values per location: (y, x, w, h, angle) as read below.
    const scalar_t* box = best_bboxes + ((n * height + h) * width + w) * 5;
    const scalar_t roi_y = box[0] * spatial_scale;
    const scalar_t roi_x = box[1] * spatial_scale;

    // Sample 0 is the center; samples 1..4 are filled only if requested.
    scalar_t px[5] = {roi_x, 0, 0, 0, 0};
    scalar_t py[5] = {roi_y, 0, 0, 0, 0};

    if (points > 1) {
      const scalar_t roi_w = box[2] * spatial_scale;
      const scalar_t roi_h = box[3] * spatial_scale;
      const scalar_t roi_a = box[4];

      const scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;
      const scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);
      // Half-extent vectors along the rotated width and height axes.
      const scalar_t wx = cosa * w_2, wy = sina * w_2;
      const scalar_t hx = -sina * h_2, hy = cosa * h_2;

      px[1] = roi_x + wx + hx;
      py[1] = roi_y + wy + hy;
      px[2] = roi_x - wx + hx;
      py[2] = roi_y - wy + hy;
      px[3] = roi_x - wx - hx;
      py[3] = roi_y - wy - hy;
      px[4] = roi_x + wx - hx;
      py[4] = roi_y + wy - hy;
    }

    const scalar_t* plane = bottom_data + (n * channels + c) * height * width;

    scalar_t acc = bottom_data[index];
    for (int i = 0; i < points; i++) {
      acc += bilinear_interpolate<scalar_t>(plane, height, width, py[i], px[i],
                                            i);
    }
    top_data[index] = acc;
  }
}

template <typename scalar_t>
__global__ void rotated_feature_align_backward_kernel(
    const int nthreads, const int points, const scalar_t* top_diff,
    const scalar_t* best_bboxes, const scalar_t spatial_scale,
    const int channels, const int height, const int width,
    scalar_t* bottom_diff) {
  // Scatters each output gradient to its own input location and to the
  // four bilinear neighbours of every sample point.
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // Unflatten index into (n, c, h, w).
    const int w = index % width;
    int rest = index / width;
    const int h = rest % height;
    rest = rest / height;
    const int c = rest % channels;
    const int n = rest / channels;

    // Five values per location: (y, x, w, h, angle) as read below.
    const scalar_t* box = best_bboxes + ((n * height + h) * width + w) * 5;
    const scalar_t roi_y = box[0] * spatial_scale;
    const scalar_t roi_x = box[1] * spatial_scale;

    // Sample 0 is the center; samples 1..4 are filled only if requested.
    scalar_t px[5] = {roi_x, 0, 0, 0, 0};
    scalar_t py[5] = {roi_y, 0, 0, 0, 0};

    if (points > 1) {
      const scalar_t roi_w = box[2] * spatial_scale;
      const scalar_t roi_h = box[3] * spatial_scale;
      const scalar_t roi_a = box[4];

      const scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;
      const scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);
      // Half-extent vectors along the rotated width and height axes.
      const scalar_t wx = cosa * w_2, wy = sina * w_2;
      const scalar_t hx = -sina * h_2, hy = cosa * h_2;

      px[1] = roi_x + wx + hx;
      py[1] = roi_y + wy + hy;
      px[2] = roi_x - wx + hx;
      py[2] = roi_y - wy + hy;
      px[3] = roi_x - wx - hx;
      py[3] = roi_y - wy - hy;
      px[4] = roi_x + wx - hx;
      py[4] = roi_y + wy - hy;
    }

    scalar_t* plane_diff = bottom_diff + (n * channels + c) * height * width;
    const scalar_t grad = top_diff[index];

    // Gradient of the pass-through term.
    atomicAdd(bottom_diff + index, grad);

    for (int i = 0; i < points; i++) {
      scalar_t w1, w2, w3, w4;
      int x_low, x_high, y_low, y_high;

      bilinear_interpolate_gradient<scalar_t>(height, width, py[i], px[i], w1,
                                              w2, w3, w4, x_low, x_high, y_low,
                                              y_high, i);
      const scalar_t g1 = grad * w1;
      const scalar_t g2 = grad * w2;
      const scalar_t g3 = grad * w3;
      const scalar_t g4 = grad * w4;

      // Any negative neighbour index means this sample contributes nothing.
      if (x_low < 0 || x_high < 0 || y_low < 0 || y_high < 0) continue;

      atomicAdd(plane_diff + y_low * width + x_low, g1);
      atomicAdd(plane_diff + y_low * width + x_high, g2);
      atomicAdd(plane_diff + y_high * width + x_low, g3);
      atomicAdd(plane_diff + y_high * width + x_high, g4);
    }
  }
}
#endif  // ROTATED_FEATURE_ALIGN_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/scatter_points_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef SCATTER_POINTS_MUSA_KERNEL_MUH
#define SCATTER_POINTS_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

// Reduction mode passed to the reduce / traceback kernels in this header.
typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;
// Not referenced by the kernels in this header; presumably a cap on the launch
// grid size used by the host-side launcher (TODO confirm against the caller).
int const maxGridDim = 50000;

// Atomic max for float: repeatedly compare-and-swaps the 32-bit pattern at
// `address` with the bit pattern of fmaxf(val, current value).
__device__ __forceinline__ static void reduceMax(float *address, float val) {
  // atomicCAS is applied to the value reinterpreted as int.
  int *address_as_i = reinterpret_cast<int *>(address);
  int old = *address_as_i, assumed;
  do {
    assumed = old;
    // The swap only takes effect if memory still holds `assumed`; the value
    // atomicCAS returns is stored in `old` for the next comparison.
    old = atomicCAS(address_as_i, assumed,
                    __float_as_int(fmaxf(val, __int_as_float(assumed))));
    // Retry while the swap lost a race, or while the value last observed is
    // still below val.
  } while (assumed != old || __int_as_float(old) < val);
}

// Atomic max for double: repeatedly compare-and-swaps the 64-bit pattern at
// `address` with the bit pattern of fmax(val, current value).
__device__ __forceinline__ static void reduceMax(double *address, double val) {
  // atomicCAS is applied to the value reinterpreted as unsigned long long.
  unsigned long long *address_as_ull =
      reinterpret_cast<unsigned long long *>(address);
  unsigned long long old = *address_as_ull, assumed;
  do {
    assumed = old;
    // The swap only takes effect if memory still holds `assumed`; the value
    // atomicCAS returns is stored in `old` for the next comparison.
    old = atomicCAS(
        address_as_ull, assumed,
        __double_as_longlong(fmax(val, __longlong_as_double(assumed))));
    // Retry while the swap lost a race, or while the value last observed is
    // still below val.
  } while (assumed != old || __longlong_as_double(old) < val);
}

// Atomic accumulate for float; overload counterpart of reduceMax so callers
// can pick the reduction by function name for either element type.
__device__ __forceinline__ static void reduceAdd(float *address, float val) {
  atomicAdd(address, val);
}

// Atomic accumulate for double; forwards directly to atomicAdd.
__device__ __forceinline__ static void reduceAdd(double *address, double val) {
  atomicAdd(address, val);

}

template <typename T>
__global__ void feats_reduce_kernel(
    const T *feats, const int32_t *coors_map,
    T *reduced_feats,  // shall be 0 at initialization
    const int num_input, const int num_feats, const reduce_t reduce_type) {
  // Folds every input row into the output row named by coors_map: element-wise
  // max for reduce_t::MAX, element-wise sum for every other mode.
  const bool use_max = (reduce_type == reduce_t::MAX);
  MUSA_1D_KERNEL_LOOP(x, num_input) {
    const int32_t dst_row = coors_map[x];
    if (dst_row == -1) continue;  // -1 marks an input row to be ignored

    const T *src = feats + x * num_feats;
    T *dst = reduced_feats + dst_row * num_feats;
    for (int i = 0; i < num_feats; i++) {
      if (use_max) {
        reduceMax(&dst[i], src[i]);
      } else {
        reduceAdd(&dst[i], src[i]);
      }
    }
  }
}

template <typename T>
__global__ void add_reduce_traceback_grad_kernel(
    T *grad_feats, const T *grad_reduced_feats, const int32_t *coors_map,
    const int32_t *reduce_count, const int num_input, const int num_feats,
    const reduce_t reduce_type) {
  // Copies the gradient of each reduced row back to every input row mapped to
  // it: unchanged for SUM, divided by reduce_count for MEAN. Other modes
  // write nothing.
  MUSA_1D_KERNEL_LOOP(x, num_input) {
    const int32_t src_row = coors_map[x];
    if (src_row == -1) continue;  // -1 marks an input row to be ignored

    T *dst = grad_feats + x * num_feats;
    const T *src = grad_reduced_feats + src_row * num_feats;

    switch (reduce_type) {
      case reduce_t::SUM:
        for (int i = 0; i < num_feats; i++) {
          dst[i] = src[i];
        }
        break;
      case reduce_t::MEAN:
        for (int i = 0; i < num_feats; i++) {
          dst[i] = src[i] / static_cast<T>(reduce_count[src_row]);
        }
        break;
      default:
        break;
    }
  }
}

template <typename T>
__global__ void max_reduce_traceback_scatter_idx_kernel(
    const T *feats, const T *reduced_feats, int32_t *reduce_from,
    const int32_t *coors_map, const int num_input, const int num_feats) {
  // For every reduced element, records the smallest input row index whose
  // value equals the reduced value.
  MUSA_1D_KERNEL_LOOP(x, num_input) {
    const int32_t dst_row = coors_map[x];
    if (dst_row == -1) continue;  // -1 marks an input row to be ignored

    const T *src = feats + x * num_feats;
    const T *reduced = reduced_feats + dst_row * num_feats;
    int32_t *winner = reduce_from + dst_row * num_feats;

    for (int i = 0; i < num_feats; i++) {
      if (src[i] == reduced[i]) {
        // atomicMin resolves ties between rows in favour of the lowest index.
        atomicMin(&winner[i], static_cast<int32_t>(x));
      }
    }
  }
}

template <typename T>
__global__ void max_reduce_scatter_grad_kernel(T *grad_feats,
                                               const T *grad_reduced_feats,
                                               const int32_t *reduce_from,
                                               const int num_reduced,
                                               const int num_feats) {
  // Writes each reduced-row gradient element into the input row recorded for
  // it in reduce_from.
  MUSA_1D_KERNEL_LOOP(x, num_reduced) {
    const int row_offset = x * num_feats;
    const int32_t *src_rows = reduce_from + row_offset;
    const T *grad_row = grad_reduced_feats + row_offset;

    for (int i = 0; i < num_feats; i++) {
      const int32_t target_row = src_rows[i];
      grad_feats[target_row * num_feats + i] = grad_row[i];
    }
  }
}

#endif  // SCATTER_POINTS_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/sigmoid_focal_loss_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef SIGMOID_FOCAL_LOSS_MUSA_KERNEL_MUH
#define SIGMOID_FOCAL_LOSS_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

template <typename T>
__global__ void sigmoid_focal_loss_forward_musa_kernel(
    const int nthreads, const T* input, const int64_t* target, const T* weight,
    T* output, const T gamma, const T alpha, const int num_classes) {
  // Element-wise sigmoid focal loss. `index` enumerates (sample n, class c)
  // pairs; `weight`, when non-NULL, is indexed by the sample's target class.
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    const int n = index / num_classes;
    const int c = index % num_classes;

    const int64_t t = target[n];
    const T flag_p = (t == c);
    const T flag_n = (t != c);

    // p = sigmoid(x) = 1. / (1. + exp(-x))
    // exp (not expf) keeps the computation in T, matching the backward kernel
    // and avoiding a float round-trip when T is double.
    const T p = (T)1. / ((T)1. + exp(-input[index]));

    // (1 - p)**gamma * log(p); FLT_MIN keeps the log argument positive
    const T term_p = pow(((T)1. - p), gamma) * log(max(p, (T)FLT_MIN));
    // p**gamma * log(1 - p)
    const T term_n = pow(p, gamma) * log(max((T)1. - p, (T)FLT_MIN));

    T loss = (T)0.;
    loss += -flag_p * alpha * term_p;
    loss += -flag_n * ((T)1. - alpha) * term_n;
    if (weight != NULL) {
      loss *= weight[t];
    }
    output[index] = loss;
  }
}

template <typename T>
__global__ void sigmoid_focal_loss_backward_musa_kernel(
    const int nthreads, const T* input, const int64_t* target, const T* weight,
    T* grad_input, const T gamma, const T alpha, const int num_classes) {
  // Element-wise gradient of the sigmoid focal loss with respect to `input`.
  // `index` enumerates (sample n, class c) pairs; `weight`, when non-NULL, is
  // indexed by the sample's target class.
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    const int n = index / num_classes;
    const int c = index % num_classes;

    const int64_t t = target[n];
    const T flag_p = (t == c);
    const T flag_n = (t != c);

    // p = sigmoid(x) = 1. / (1. + exp(-x))
    const T p = (T)1. / ((T)1. + exp(-input[index]));

    // (1 - p)**gamma * (1 - p - gamma*p*log(p))
    const T term_p = pow(((T)1. - p), gamma) *
                     ((T)1. - p - (gamma * p * log(max(p, (T)FLT_MIN))));
    // p**gamma * (gamma * (1 - p) * log(1 - p) - p)
    const T term_n =
        pow(p, gamma) *
        (gamma * ((T)1. - p) * log(max((T)1. - p, (T)FLT_MIN)) - p);

    T grad = (T)0.;
    grad += -flag_p * alpha * term_p;
    grad += -flag_n * ((T)1. - alpha) * term_n;
    if (weight != NULL) {
      grad *= weight[t];
    }
    grad_input[index] = grad;
  }
}

#endif  // SIGMOID_FOCAL_LOSS_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/softmax_focal_loss_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef SOFTMAX_FOCAL_LOSS_MUSA_KERNEL_MUH
#define SOFTMAX_FOCAL_LOSS_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

template <typename T>
__global__ void softmax_focal_loss_forward_musa_kernel(
    const int nthreads, const T* softmax, const int64_t* target,
    const T* weight, T* output, const T gamma, const T alpha,
    const int num_classes) {
  // Per-sample softmax focal loss. Samples with a negative label produce a
  // loss of 0. `weight`, when non-NULL, is indexed by the label.
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    const int64_t label = target[index];

    T loss = 0;
    // softmax and weight are only indexed with a non-negative label; a
    // negative label would otherwise address memory before the row / array.
    if (label >= 0) {
      const T pred = softmax[index * num_classes + label];
      // FLT_MIN keeps the log argument positive
      loss = -alpha * pow((T)1. - pred, gamma) * log(max(pred, (T)FLT_MIN));
      if (weight != NULL) {
        loss *= weight[label];
      }
    }
    output[index] = loss;
  }
}

template <typename T>
__global__ void softmax_focal_loss_backward_musa1_kernel(
    const int nthreads, const T* softmax, const int64_t* target,
    const T* weight, T* buff, const T gamma, const T alpha,
    const int num_classes) {
  // Computes one scalar factor per sample into `buff`. Samples with a
  // negative label get 0. `weight`, when non-NULL, is indexed by the label.
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    const int64_t label = target[index];

    T factor = 0;
    // softmax and weight are only indexed with a non-negative label; a
    // negative label would otherwise address memory before the row / array.
    if (label >= 0) {
      const T pred = softmax[index * num_classes + label];
      // FLT_MIN keeps the log argument positive
      factor = alpha * (-pow((T)1. - pred, gamma) +
                        gamma * pow((T)1. - pred, gamma - 1) * pred *
                            log(max(pred, (T)FLT_MIN)));
      if (weight != NULL) {
        factor *= weight[label];
      }
    }
    buff[index] = factor;
  }
}

template <typename T>
__global__ void softmax_focal_loss_backward_musa2_kernel(
    const int nthreads, const T* softmax, const int64_t* target, const T* buff,
    T* grad_input, const int num_classes) {
  // Expands the per-sample factor in `buff` to a per-(sample, class)
  // gradient: buff[n] * (one_hot(label)[c] - softmax[n, c]). Samples with a
  // negative label get a zero gradient.
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    const int n = index / num_classes;
    const int64_t label = target[n];

    if (label < 0) {
      grad_input[index] = 0;
      continue;
    }

    const int c = index % num_classes;
    const T flag = (label == c ? (T)1. : (T)0.);
    grad_input[index] = buff[n] * (flag - softmax[index]);
  }
}

#endif  // SOFTMAX_FOCAL_LOSS_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/spconv/indice.muh
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef INDICE_MU_H_
#define INDICE_MU_H_
#include <utils/spconv/spconv/geometry.h>
#include <utils/spconv/tensorview/tensorview.h>

#include <utils/spconv/tensorview/helper_kernel.muh>

// For every active input row, enumerates the output positions returned by
// getValidOutPos and appends an (input row, flat output index) pair under the
// kernel offset reported for that position. The same flat output index is also
// written to indicePairUnique. indicesOut and gridsOut are not used here.
template <typename Index, typename IndexGrid, unsigned NDim,
          int KernelMaxVolume = 256>
__global__ void prepareIndicePairsKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,
    tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  // Scratch buffer: (NDim + 1) entries per candidate; the first NDim are read
  // as the output coordinate and the last one as the kernel offset.
  Index validPoints[KernelMaxVolume * (NDim + 1)];
  auto indicePairsDim2 = indicePairs.dim(2);
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    // Row layout of indicesIn: entry 0 is used as the batch index below, the
    // remaining NDim entries are passed on as the coordinate.
    Index numValidPoints = getValidOutPos<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
        validPoints);
    for (Index i = 0; i < numValidPoints; ++i) {
      Index *pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
      // atomicAdd hands out a unique slot per kernel offset.
      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
      indicePairs(offset, 0, oldNum) = ix;
      // Flat output index: spatial index plus a per-batch stride.
      Index index =
          tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
          spatialVolume * indicesIn(ix, 0);
      indicePairs(offset, 1, oldNum) = index;
      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
    }
  }
}

// Same pairing scheme as the forward variant, but candidate output positions
// come from getValidOutPosTranspose. For every active input row, appends an
// (input row, flat output index) pair under the kernel offset reported for the
// position and mirrors the flat output index into indicePairUnique.
// indicesOut and gridsOut are not used here.
template <typename Index, typename IndexGrid, unsigned NDim,
          int KernelMaxVolume = 256>
__global__ void prepareDeConvIndicePairsKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,
    tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  // Scratch buffer: (NDim + 1) entries per candidate; the first NDim are read
  // as the output coordinate and the last one as the kernel offset.
  Index validPoints[KernelMaxVolume * (NDim + 1)];
  auto indicePairsDim2 = indicePairs.dim(2);
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    // Row layout of indicesIn: entry 0 is used as the batch index below, the
    // remaining NDim entries are passed on as the coordinate.
    Index numValidPoints = getValidOutPosTranspose<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
        validPoints);
    for (Index i = 0; i < numValidPoints; ++i) {
      Index *pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
      // atomicAdd hands out a unique slot per kernel offset.
      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
      indicePairs(offset, 0, oldNum) = ix;
      // Flat output index: spatial index plus a per-batch stride.
      Index index =
          tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
          spatialVolume * indicesIn(ix, 0);
      indicePairs(offset, 1, oldNum) = index;
      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
    }
  }
}

// For each of the first numAct entries of indicePairUnique: records the
// entry's position in gridsOut and expands the flat index into one row of
// indicesOut (entry 0 from the remainder modulo batchSize, the next NDim
// entries filled by tv::rowArrayIdxInv).
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void assignGridAndIndiceOutKernel(
    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
    int numAct, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {
  auto outRows = indicesOut.data();
  for (int ix : tv::KernelLoopX<int>(numAct)) {
    Index flatIdx = indicePairUnique[ix];
    gridsOut[flatIdx] = ix;
    // rowArrayIdxInv writes into the row past its first entry and returns
    // the value used for that first entry.
    Index remainder = tv::rowArrayIdxInv<Index, NDim>(
        flatIdx, outRows + ix * (NDim + 1) + 1, outSpatialShape.data());
    indicesOut[ix * (NDim + 1)] = remainder % batchSize;
  }
}

// Rewrites the output half of every index pair: a stored value greater than
// -1 is replaced by gridsOut[value]; other entries are left as they are.
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void assignIndicePairsKernel(
    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
    int numActIn, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  int numOffsets = indicePairs.dim(0);
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    for (int k = 0; k < numOffsets; ++k) {
      Index flatIdx = indicePairs(k, 1, ix);
      if (flatIdx > -1) {
        indicePairs(k, 1, ix) = gridsOut[flatIdx];
      }
    }
  }
}

// Fills gridsOut so that the flat index of every active input position maps
// to that position's row number in indicesIn.
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void prepareSubMGridKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    // Flat index: spatial index of the row's coordinate plus a per-batch
    // stride taken from the row's first entry.
    Index flatIdx =
        tv::rowArrayIdx<Index, NDim>(indicesIn.data() + ix * (NDim + 1) + 1,
                                     outSpatialShape.data()) +
        spatialVolume * indicesIn(ix, 0);
    gridsOut[flatIdx] = ix;
  }
}

// Builds the (kernel offset, input row, output row) pairs for each input row.
// getValidOutPos writes candidate positions into a per-thread scratch array
// (NDim coordinates followed by a kernel-offset id per candidate). A candidate
// is kept only when gridsOut holds a value greater than -1 at its flat
// position; the pair is appended to indicePairs at a slot reserved with
// atomicAdd on indiceNum[offset].
template <typename Index, typename IndexGrid, unsigned NDim,
          int KernelMaxVolume = 256>
__global__ void getSubMIndicePairsKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numRows = indicesIn.dim(0);
  Index cellsPerGrid = 1;
#pragma unroll
  for (int d = 0; d < NDim; ++d) {
    cellsPerGrid *= outSpatialShape[d];
  }
  // Scratch space for the candidates of one input row.
  Index candidates[KernelMaxVolume * (NDim + 1)];
  for (int row : tv::KernelLoopX<int>(numRows)) {
    const Index numCandidates = getValidOutPos<Index, NDim>(
        indicesIn.data() + row * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
        candidates);
    for (int k = 0; k < numCandidates; ++k) {
      Index *candidate = candidates + k * (NDim + 1);
      // The element after the coordinates is the kernel-offset id.
      auto offset = candidate[NDim];
      const Index flatIndex =
          tv::rowArrayIdx<Index, NDim>(candidate, outSpatialShape.data()) +
          cellsPerGrid * indicesIn(row, 0);
      if (gridsOut[flatIndex] > -1) {
        auto slot = atomicAdd(indiceNum.data() + offset, Index(1));
        indicePairs(offset, 1, slot) = gridsOut[flatIndex];
        indicePairs(offset, 0, slot) = row;
      }
    }
  }
}

// Writes -1 into gridsOut at each of the first numAct positions listed in
// indicePairUnique.
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void resetGridKernel(const Index *indicePairUnique,
                                tv::TensorView<IndexGrid> gridsOut,
                                int numAct) {
  for (int i : tv::KernelLoopX<int>(numAct)) {
    const Index slot = indicePairUnique[i];
    gridsOut[slot] = -1;
  }
}

// Writes -1 into gridsOut at the flat position of each of the first numAct
// rows of `indices`. Each row has NDim + 1 entries: column 0 followed by NDim
// coordinates. The flat position is tv::rowArrayIdx of the coordinates plus
// (product of outSpatialShape) * column 0.
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void resetGridSubMKernel(
    const Index *indices, tv::TensorView<IndexGrid> gridsOut,
    const tv::SimpleVector<Index, NDim> outSpatialShape, int numAct) {
  // Local int copy of the shape, passed to rowArrayIdx below.
  int shape[NDim];
  Index cellsPerGrid = 1;
#pragma unroll
  for (int d = 0; d < NDim; ++d) {
    shape[d] = outSpatialShape[d];
    cellsPerGrid *= outSpatialShape[d];
  }
  for (int row : tv::KernelLoopX<int>(numAct)) {
    const Index *entry = indices + row * (NDim + 1);
    const Index spatialIndex = tv::rowArrayIdx<Index, NDim>(entry + 1, shape);
    gridsOut[spatialIndex + cellsPerGrid * entry[0]] = -1;
  }
}

#endif


================================================
FILE: mmcv/ops/csrc/common/musa/spconv/reordering.muh
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef REORDERING_MU_H_
#define REORDERING_MU_H_
#include <utils/spconv/tensorview/helper_kernel.muh>

// Copies rows of `features` selected by `indices` into consecutive rows of
// `buffer`: buffer[r, :] = features[indices[r], :] for r < size, with rows of
// numPlanes elements. Each thread handles up to NumILP rows per X step,
// spaced gridDim.x * blockDim.x apart.
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void gatherGenericKernel(scalar_t *buffer, const scalar_t *features,
                                    const Index *indices, int size,
                                    int numPlanes) {
  int laneOffset[NumILP];
  Index srcBase[NumILP];
#pragma unroll
  for (int lane = 0; lane < NumILP; lane++)
    laneOffset[lane] = lane * gridDim.x * blockDim.x;

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
    // Resolve the source row start once per lane before walking the planes.
#pragma unroll
    for (int lane = 0; lane < NumILP; lane++) {
      const int row = ix + laneOffset[lane];
      if (row < size) srcBase[lane] = indices[row] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int lane = 0; lane < NumILP; ++lane) {
        const int row = ix + laneOffset[lane];
        if (row < size)
          buffer[row * numPlanes + iy] = features[srcBase[lane] + iy];
      }
    }
  }
}

// Same gather as gatherGenericKernel, but buffer and features are accessed as
// arrays of VecType, so `numPlanes` and the plane index count VecType
// elements rather than scalar_t elements.
template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType>
__global__ void gatherVecKernel(scalar_t *buffer, const scalar_t *features,
                                const Index *indices, int size, int numPlanes) {
  int laneOffset[NumILP];
  Index srcBase[NumILP];
#pragma unroll
  for (int lane = 0; lane < NumILP; lane++)
    laneOffset[lane] = lane * gridDim.x * blockDim.x;

  VecType *dst = reinterpret_cast<VecType *>(buffer);
  const VecType *src = reinterpret_cast<const VecType *>(features);

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int lane = 0; lane < NumILP; lane++) {
      const int row = ix + laneOffset[lane];
      if (row < size) srcBase[lane] = indices[row] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int lane = 0; lane < NumILP; ++lane) {
        const int row = ix + laneOffset[lane];
        if (row < size) dst[row * numPlanes + iy] = src[srcBase[lane] + iy];
      }
    }
  }
}

// Block-tiled vector gather. Each block works on the slice starting
// blockIdx.x * NumTLP scalars into both arrays, and each thread copies the
// VecType element at column threadIdx.x for NumILP rows per Y step.
// NOTE(review): there is no `row < size` check here, unlike the generic
// kernels; presumably the caller only launches this on a row count the
// launch grid covers exactly. Confirm against the launch site.
template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void gatherVecBlockKernel(scalar_t *buffer, const scalar_t *features,
                                     const Index *indices, int size,
                                     int numPlanes) {
  int laneOffset[NumILP];
#pragma unroll
  for (int lane = 0; lane < NumILP; lane++)
    laneOffset[lane] = lane * gridDim.y * blockDim.y;

  VecType *dst = reinterpret_cast<VecType *>(buffer + blockIdx.x * NumTLP);
  const VecType *src =
      reinterpret_cast<const VecType *>(features + blockIdx.x * NumTLP);

  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
#pragma unroll
    for (int lane = 0; lane < NumILP; ++lane) {
      const int row = iy + laneOffset[lane];
      dst[row * numPlanes + threadIdx.x] =
          src[indices[row] * numPlanes + threadIdx.x];
    }
  }
}

// Accumulates consecutive rows of `buffer` into rows of `outFeatures`
// selected by `indices`: outFeatures[indices[r], :] += buffer[r, :] for
// r < size. The addition is a plain (non-atomic) read-modify-write.
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddGenericKernel(scalar_t *outFeatures,
                                        const scalar_t *buffer,
                                        const Index *indices, int size,
                                        int numPlanes) {
  int laneOffset[NumILP];
  Index dstBase[NumILP];
#pragma unroll
  for (int lane = 0; lane < NumILP; lane++)
    laneOffset[lane] = lane * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int lane = 0; lane < NumILP; lane++) {
      const int row = ix + laneOffset[lane];
      if (row < size) dstBase[lane] = indices[row] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int lane = 0; lane < NumILP; ++lane) {
        const int row = ix + laneOffset[lane];
        if (row < size) {
          outFeatures[dstBase[lane] + iy] += buffer[row * numPlanes + iy];
        }
      }
    }
  }
}

// Block-tiled vector scatter-add: adds rows of `buffer` into the rows of
// `outFeatures` selected by `indices`, moving one VecType per load/store and
// adding the scalar_t lanes it contains one by one.
// The update is a plain read-modify-write (no atomics).
// NOTE(review): there is no `row < size` check here; presumably the caller
// only launches this on a row count the launch grid covers exactly. Confirm
// against the launch site.
template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void scatterAddVecBlockKernel(scalar_t *outFeatures,
                                         const scalar_t *buffer,
                                         const Index *indices, int size,
                                         int numPlanes) {
  int ILPStrideY[NumILP];
  // Number of scalar_t values packed into one VecType.
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
  // Each block works on the slice starting blockIdx.x * NumTLP scalars in.
  outFeatures += blockIdx.x * NumTLP;
  buffer += blockIdx.x * NumTLP;
  // Scalar views of one VecType: buf = current output, buf2 = addend.
  scalar_t buf[vecloadFactor];
  scalar_t buf2[vecloadFactor];
  Index idx;
  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      // Destination element, in VecType units.
      idx = indices[iy + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      reinterpret_cast<VecType *>(buf)[0] =
          reinterpret_cast<VecType *>(outFeatures)[idx];
      reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>(
          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x];
#pragma unroll
      for (int i = 0; i < vecloadFactor; i++) {
        buf[i] += buf2[i];
      }
      // Write the summed vector back in one store.
      reinterpret_cast<VecType *>(outFeatures)[idx] =
          reinterpret_cast<VecType *>(buf)[0];
    }
  }
}

#endif


================================================
FILE: mmcv/ops/csrc/common/musa/stack_ball_query_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu
#ifndef STACK_BALL_QUERY_MUSA_KERNEL_MUH
#define STACK_BALL_QUERY_MUSA_KERNEL_MUH


#include "pytorch_musa_helper.hpp"

// Stacked-batch ball query.
//
// :param B: number of batches
// :param M: total number of query centers (M1 + M2 + ...)
// :param radius: ball radius
// :param nsample: number of neighbor slots per query center
// :param xyz: (N1 + N2 ..., 3) xyz coordinates of the features
// :param xyz_batch_cnt: (batch_size), [N1, N2, ...]
// :param new_xyz: (M1 + M2 ..., 3) centers of the ball query
// :param new_xyz_batch_cnt: (batch_size), [M1, M2, ...]
// output:
//      idx: (M, nsample). Row i holds batch-relative indices of points closer
//      than `radius` to center i; unused slots repeat the first hit, and
//      idx[i, 0] is -1 when no point is found.
template <typename T>
__global__ void stack_ball_query_forward_musa_kernel(
    int B, int M, float radius, int nsample, const T *new_xyz,
    const int *new_xyz_batch_cnt, const T *xyz, const int *xyz_batch_cnt,
    int *idx) {
  const float radius2 = radius * radius;
  MUSA_1D_KERNEL_LOOP(pt_idx, M) {
    // Pointers are derived from the base arguments on every iteration. The
    // previous code advanced shared cursors with `+=`, so the offsets piled
    // up whenever a thread ran the loop body more than once.
    int *cur_idx = idx + pt_idx * nsample;

    // Find the batch that owns this query center.
    int bs_idx = 0;
    for (int pt_cnt = 0; bs_idx < B; bs_idx++) {
      pt_cnt += new_xyz_batch_cnt[bs_idx];
      if (pt_idx < pt_cnt) break;
    }
    if (bs_idx >= B) {
      // M exceeds the sum of new_xyz_batch_cnt: report "no neighbor" rather
      // than reading xyz_batch_cnt out of bounds.
      cur_idx[0] = -1;
      continue;
    }

    int xyz_batch_start_idx = 0;
    for (int k = 0; k < bs_idx; k++) xyz_batch_start_idx += xyz_batch_cnt[k];

    const T *new_xyz_p = new_xyz + pt_idx * 3;
    const T *cur_xyz = xyz + xyz_batch_start_idx * 3;

    T new_x = new_xyz_p[0];
    T new_y = new_xyz_p[1];
    T new_z = new_xyz_p[2];
    int n = xyz_batch_cnt[bs_idx];

    int cnt = 0;
    for (int k = 0; k < n; ++k) {
      T x = cur_xyz[k * 3 + 0];
      T y = cur_xyz[k * 3 + 1];
      T z = cur_xyz[k * 3 + 2];
      T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
             (new_z - z) * (new_z - z);
      if (d2 < radius2) {
        if (cnt == 0) {
          // Pre-fill every slot with the first hit so short balls are padded.
          for (int l = 0; l < nsample; ++l) {
            cur_idx[l] = k;
          }
        }
        cur_idx[cnt] = k;
        ++cnt;
        if (cnt >= nsample) break;
      }
    }
    if (cnt == 0) cur_idx[0] = -1;
  }
}

#endif  // STACK_BALL_QUERY_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/stack_group_points_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu
#ifndef STACK_GROUP_POINTS_MUSA_KERNEL_MUH
#define STACK_GROUP_POINTS_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"
#include <stdio.h>
// Stacked-batch feature grouping (forward).
//
// :param b: number of batches
// :param c: number of feature channels
// :param m: total number of grouped points (M1 + M2 + ...)
// :param nsample: number of samples per grouped point
// :param features: (N1 + N2 ..., C) tensor of features to group
// :param features_batch_cnt: (batch_size) [N1, N2, ...] features per batch
// :param idx: (M1 + M2 ..., nsample) batch-relative indices of the features
//     to group with
// :param idx_batch_cnt: (batch_size) [M1, M2, ...] grouped points per batch
// :return:
//     out: (M1 + M2 ..., C, nsample) tensor. Entries whose index falls
//     outside the owning batch are left unwritten.
template <typename T>
__global__ void stack_group_points_forward_musa_kernel(
    int b, int c, int m, int nsample, const T *features,
    const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt,
    T *out) {
  MUSA_1D_KERNEL_LOOP(index, m * c * nsample) {
    int sample_idx = index % nsample;
    int c_idx = (index / nsample) % c;
    int pt_idx = (index / nsample / c);

    if (pt_idx >= m || c_idx >= c || sample_idx >= nsample) return;

    // Find the batch that owns this grouped point.
    int bs_idx = 0, pt_cnt = idx_batch_cnt[0];
    for (int k = 1; k < b; k++) {
      if (pt_idx < pt_cnt) break;
      pt_cnt += idx_batch_cnt[k];
      bs_idx = k;
    }

    int features_batch_start_idx = 0;
    for (int k = 0; k < bs_idx; k++) {
      features_batch_start_idx += features_batch_cnt[k];
    }
    const int features_in_batch = features_batch_cnt[bs_idx];

    // idx is relative to the start of the owning batch, so it must be checked
    // against that batch's own size. The previous check compared it with the
    // absolute end offset, which let indices run past the batch (and past the
    // whole array for the last batch), and it accepted negative indices.
    const int feature_idx = idx[pt_idx * nsample + sample_idx];
    if (feature_idx < 0 || feature_idx >= features_in_batch) continue;

    const T *cur_features = features + features_batch_start_idx * c;
    int in_idx = feature_idx * c + c_idx;
    int out_idx = pt_idx * c * nsample + c_idx * nsample + sample_idx;
    out[out_idx] = cur_features[in_idx];
  }
}

// Stacked-batch feature grouping (backward).
//
// :param b: number of batches
// :param c: number of feature channels
// :param m: total number of grouped points (M1 + M2 + ...)
// :param n: not used by this kernel
// :param nsample: number of samples per grouped point
// :param grad_out: (M1 + M2 ..., C, nsample) gradients of the forward output
// :param idx: (M1 + M2 ..., nsample) batch-relative indices of the features
//     that were grouped
// :param idx_batch_cnt: (batch_size) [M1, M2, ...] grouped points per batch
// :param features_batch_cnt: (batch_size) [N1, N2, ...] features per batch
// :return:
//     grad_features: (N1 + N2 ..., C) gradient of the features, accumulated
//     with atomicAdd.
template <typename T>
__global__ void stack_group_points_backward_musa_kernel(
    int b, int c, int m, int n, int nsample, const T *grad_out, const int *idx,
    const int *idx_batch_cnt, const int *features_batch_cnt, T *grad_features) {
  MUSA_1D_KERNEL_LOOP(index, m * c * nsample) {
    int sample_idx = index % nsample;
    int c_idx = (index / nsample) % c;
    int pt_idx = (index / nsample / c);

    if (pt_idx >= m || c_idx >= c || sample_idx >= nsample) return;

    // Find the batch that owns this grouped point.
    int bs_idx = 0, pt_cnt = idx_batch_cnt[0];
    for (int k = 1; k < b; k++) {
      if (pt_idx < pt_cnt) break;
      pt_cnt += idx_batch_cnt[k];
      bs_idx = k;
    }

    int features_batch_start_idx = 0;
    for (int k = 0; k < bs_idx; k++)
      features_batch_start_idx += features_batch_cnt[k];

    // idx is relative to the owning batch. An index outside that batch
    // (e.g. a negative marker) has no feature to receive the gradient, and
    // accumulating through it would write outside grad_features.
    const int feature_idx = idx[pt_idx * nsample + sample_idx];
    if (feature_idx < 0 || feature_idx >= features_batch_cnt[bs_idx]) continue;

    const T *cur_grad_out =
        grad_out + pt_idx * c * nsample + c_idx * nsample + sample_idx;
    T *cur_grad_features =
        grad_features + (features_batch_start_idx + feature_idx) * c + c_idx;

    atomicAdd(cur_grad_features, cur_grad_out[0]);
  }
}

#endif  // STACK_GROUP_POINTS_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/sync_bn_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef SYNCBN_MUSA_KERNEL_MUH
#define SYNCBN_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

// Per-channel mean. Block blockIdx.x handles one channel: its threads sum the
// num * spatial values of that channel (input indexed as
// [n][channel][spatial]), the partial sums are combined in shared memory, and
// thread 0 writes sum / (num * spatial) to mean[channel].
// The halving reduction starts at blockDim.x / 2, so it only covers every
// partial sum when blockDim.x is a power of two.
template <typename T>
__global__ void sync_bn_forward_mean_musa_kernel(const T *input, float *mean,
                                                 int num, int channels,
                                                 int spatial) {
  __shared__ float buffer[THREADS_PER_BLOCK];
  const int lane = threadIdx.x;
  const int channel = blockIdx.x;
  const int total = num * spatial;

  float partial = 0;
  for (int i = lane; i < total; i += blockDim.x) {
    int index =
        (i / spatial) * channels * spatial + channel * spatial + i % spatial;
    partial += input[index];
  }
  buffer[lane] = partial;
  __syncthreads();

  for (int half = blockDim.x / 2; half > 0; half >>= 1) {
    if (lane < half) {
      buffer[lane] += buffer[lane + half];
    }
    __syncthreads();
  }
  if (lane == 0) {
    mean[channel] = buffer[0] / total;
  }
}

// phalf specialization of the per-channel mean: each input value is converted
// to float before it is accumulated; otherwise identical to the generic
// kernel (one block per channel, shared-memory halving reduction, thread 0
// writes mean[channel]).
template <>
__global__ void sync_bn_forward_mean_musa_kernel(const phalf *input,
                                                 float *mean, int num,
                                                 int channels, int spatial) {
  __shared__ float buffer[THREADS_PER_BLOCK];
  const int lane = threadIdx.x;
  const int channel = blockIdx.x;
  const int total = num * spatial;

  float partial = 0;
  for (int i = lane; i < total; i += blockDim.x) {
    int index =
        (i / spatial) * channels * spatial + channel * spatial + i % spatial;
    partial += static_cast<float>(input[index]);
  }
  buffer[lane] = partial;
  __syncthreads();

  for (int half = blockDim.x / 2; half > 0; half >>= 1) {
    if (lane < half) {
      buffer[lane] += buffer[lane + half];
    }
    __syncthreads();
  }
  if (lane == 0) {
    mean[channel] = buffer[0] / total;
  }
}

// Per-channel biased variance. Block blockIdx.x handles one channel: threads
// sum (x - mean[channel])^2 over the num * spatial values of that channel,
// combine the partial sums in shared memory, and thread 0 writes
// sum / (num * spatial) to var[channel].
template <typename T>
__global__ void sync_bn_forward_var_musa_kernel(const T *input,
                                                const float *mean, float *var,
                                                int num, int channels,
                                                int spatial) {
  __shared__ float buffer[THREADS_PER_BLOCK];
  const int lane = threadIdx.x;
  const int channel = blockIdx.x;
  const int total = num * spatial;

  float partial = 0;
  for (int i = lane; i < total; i += blockDim.x) {
    int index =
        (i / spatial) * channels * spatial + channel * spatial + i % spatial;
    float centered = input[index] - mean[channel];
    partial += centered * centered;
  }
  buffer[lane] = partial;
  __syncthreads();

  for (int half = blockDim.x / 2; half > 0; half >>= 1) {
    if (lane < half) {
      buffer[lane] += buffer[lane + half];
    }
    __syncthreads();
  }
  if (lane == 0) {
    var[channel] = buffer[0] / total;
  }
}

// phalf specialization of the per-channel biased variance: each input value
// is converted to float before the mean is subtracted; otherwise identical to
// the generic kernel.
template <>
__global__ void sync_bn_forward_var_musa_kernel(const phalf *input,
                                                const float *mean, float *var,
                                                int num, int channels,
                                                int spatial) {
  __shared__ float buffer[THREADS_PER_BLOCK];
  const int lane = threadIdx.x;
  const int channel = blockIdx.x;
  const int total = num * spatial;

  float partial = 0;
  for (int i = lane; i < total; i += blockDim.x) {
    int index =
        (i / spatial) * channels * spatial + channel * spatial + i % spatial;
    float centered = static_cast<float>(input[index]) - mean[channel];
    partial += centered * centered;
  }
  buffer[lane] = partial;
  __syncthreads();

  for (int half = blockDim.x / 2; half > 0; half >>= 1) {
    if (lane < half) {
      buffer[lane] += buffer[lane + half];
    }
    __syncthreads();
  }
  if (lane == 0) {
    var[channel] = buffer[0] / total;
  }
}

// Normalizes one channel per block: output = (input - mean) / sqrt(var + eps),
// optionally scaled and shifted by weight/bias.
//   - weight == nullptr: no affine transform is applied (bias is not read).
//   - norm   != nullptr: the normalized value is also stored in norm.
//   - std    != nullptr: thread 0 stores sqrt(var + eps) in std[channel].
//   - running_mean != nullptr: thread 0 updates running_mean/running_var with
//     `momentum`; the variance is rescaled by count / (count - 1) with
//     count = num * spatial * group_size when count > 1.
template <typename T>
__global__ void sync_bn_forward_output_musa_kernel(
    const T *input, const float *mean, const float *var, float *running_mean,
    float *running_var, const float *weight, const float *bias, float *norm,
    float *std, T *output, int num, int channels, int spatial, float eps,
    float momentum, int group_size) {
  const int lane = threadIdx.x;
  const int channel = blockIdx.x;
  const int total = num * spatial;
  float mean_value = mean[channel];
  float std_value = sqrt(var[channel] + eps);

  if (weight == nullptr) {
    if (norm == nullptr) {
      for (int i = lane; i < total; i += blockDim.x) {
        int index = (i / spatial) * channels * spatial + channel * spatial +
                    i % spatial;
        output[index] = (input[index] - mean_value) / std_value;
      }
    } else {
      for (int i = lane; i < total; i += blockDim.x) {
        int index = (i / spatial) * channels * spatial + channel * spatial +
                    i % spatial;
        const float normalized = (input[index] - mean_value) / std_value;
        norm[index] = normalized;
        output[index] = normalized;
      }
    }
  } else {
    float weight_value = weight[channel];
    float bias_value = bias[channel];
    if (norm == nullptr) {
      for (int i = lane; i < total; i += blockDim.x) {
        int index = (i / spatial) * channels * spatial + channel * spatial +
                    i % spatial;
        output[index] =
            (input[index] - mean_value) / std_value * weight_value + bias_value;
      }
    } else {
      for (int i = lane; i < total; i += blockDim.x) {
        int index = (i / spatial) * channels * spatial + channel * spatial +
                    i % spatial;
        const float normalized = (input[index] - mean_value) / std_value;
        norm[index] = normalized;
        output[index] = normalized * weight_value + bias_value;
      }
    }
  }

  // Per-channel statistics are written once, by the first thread of the block.
  if (lane == 0) {
    if (std != nullptr) std[channel] = std_value;
    if (running_mean != nullptr) {
      running_mean[channel] =
          momentum * mean_value + (1 - momentum) * running_mean[channel];
      int count = num * spatial * group_size;
      float var_unbias =
          count > 1 ? var[channel] * count / (count - 1) : var[channel];
      running_var[channel] =
          momentum * var_unbias + (1 - momentum) * running_var[channel];
    }
  }
}

// phalf specialization of the per-channel normalization. Inputs are converted
// to float, all arithmetic is done in float, and results are converted back
// to phalf on store. The handling of weight/bias, norm, std and the running
// statistics matches the generic kernel.
template <>
__global__ void sync_bn_forward_output_musa_kernel(
    const phalf *input, const float *mean, const float *var,
    float *running_mean, float *running_var, const float *weight,
    const float *bias, float *norm, float *std, phalf *output, int num,
    int channels, int spatial, float eps, float momentum, int group_size) {
  const int lane = threadIdx.x;
  const int channel = blockIdx.x;
  const int total = num * spatial;
  float mean_value = mean[channel];
  float std_value = sqrt(var[channel] + eps);

  if (weight == nullptr) {
    if (norm == nullptr) {
      for (int i = lane; i < total; i += blockDim.x) {
        int index = (i / spatial) * channels * spatial + channel * spatial +
                    i % spatial;
        output[index] = static_cast<phalf>(
            (static_cast<float>(input[index]) - mean_value) / std_value);
      }
    } else {
      for (int i = lane; i < total; i += blockDim.x) {
        int index = (i / spatial) * channels * spatial + channel * spatial +
                    i % spatial;
        const float normalized =
            (static_cast<float>(input[index]) - mean_value) / std_value;
        norm[index] = normalized;
        output[index] = static_cast<phalf>(normalized);
      }
    }
  } else {
    float weight_value = weight[channel];
    float bias_value = bias[channel];
    if (norm == nullptr) {
      for (int i = lane; i < total; i += blockDim.x) {
        int index = (i / spatial) * channels * spatial + channel * spatial +
                    i % spatial;
        output[index] =
            static_cast<phalf>((static_cast<float>(input[index]) - mean_value) /
                                   std_value * weight_value +
                               bias_value);
      }
    } else {
      for (int i = lane; i < total; i += blockDim.x) {
        int index = (i / spatial) * channels * spatial + channel * spatial +
                    i % spatial;
        const float normalized =
            (static_cast<float>(input[index]) - mean_value) / std_value;
        norm[index] = normalized;
        output[index] =
            static_cast<phalf>(normalized * weight_value + bias_value);
      }
    }
  }

  // Per-channel statistics are written once, by the first thread of the block.
  if (lane == 0) {
    if (std != nullptr) std[channel] = std_value;
    if (running_mean != nullptr) {
      running_mean[channel] =
          momentum * mean_value + (1 - momentum) * running_mean[channel];
      int count = num * spatial * group_size;
      float var_unbias =
          count > 1 ? var[channel] * count / (count - 1) : var[channel];
      running_var[channel] =
          momentum * var_unbias + (1 - momentum) * running_var[channel];
    }
  }
}

// Per-channel parameter gradients. Block blockIdx.x handles one channel and
// reduces two sums over its num * spatial values:
//   grad_weight[channel] = sum(grad_output * norm)
//   grad_bias[channel]   = sum(grad_output)
// Partial sums are combined with a shared-memory halving reduction and
// thread 0 writes both results.
template <typename T>
__global__ void sync_bn_backward_param_musa_kernel(const T *grad_output,
                                                   const float *norm,
                                                   float *grad_weight,
                                                   float *grad_bias, int num,
                                                   int channels, int spatial) {
  __shared__ float buffer1[THREADS_PER_BLOCK];
  __shared__ float buffer2[THREADS_PER_BLOCK];

  const int lane = threadIdx.x;
  const int channel = blockIdx.x;
  const int total = num * spatial;

  float weight_partial = 0;
  float bias_partial = 0;
  for (int i = lane; i < total; i += blockDim.x) {
    int index =
        (i / spatial) * channels * spatial + channel * spatial + i % spatial;
    weight_partial += grad_output[index] * norm[index];
    bias_partial += grad_output[index];
  }
  buffer1[lane] = weight_partial;
  buffer2[lane] = bias_partial;
  __syncthreads();

  for (int half = blockDim.x / 2; half > 0; half >>= 1) {
    if (lane < half) {
      buffer1[lane] += buffer1[lane + half];
      buffer2[lane] += buffer2[lane + half];
    }
    __syncthreads();
  }
  if (lane == 0) {
    grad_weight[channel] = buffer1[0];
    grad_bias[channel] = buffer2[0];
  }
}

// phalf specialization of the per-channel parameter gradients: grad_output
// values are converted to float before use; otherwise identical to the
// generic kernel.
template <>
__global__ void sync_bn_backward_param_musa_kernel(const phalf *grad_output,
                                                   const float *norm,
                                                   float *grad_weight,
                                                   float *grad_bias, int num,
                                                   int channels, int spatial) {
  __shared__ float buffer1[THREADS_PER_BLOCK];
  __shared__ float buffer2[THREADS_PER_BLOCK];

  const int lane = threadIdx.x;
  const int channel = blockIdx.x;
  const int total = num * spatial;

  float weight_partial = 0;
  float bias_partial = 0;
  for (int i = lane; i < total; i += blockDim.x) {
    int index =
        (i / spatial) * channels * spatial + channel * spatial + i % spatial;
    weight_partial += static_cast<float>(grad_output[index]) * norm[index];
    bias_partial += static_cast<float>(grad_output[index]);
  }
  buffer1[lane] = weight_partial;
  buffer2[lane] = bias_partial;
  __syncthreads();

  for (int half = blockDim.x / 2; half > 0; half >>= 1) {
    if (lane < half) {
      buffer1[lane] += buffer1[lane + half];
      buffer2[lane] += buffer2[lane + half];
    }
    __syncthreads();
  }
  if (lane == 0) {
    grad_weight[channel] = buffer1[0];
    grad_bias[channel] = buffer2[0];
  }
}

// Input gradient, one element per loop index:
//   grad_input = weight[c] *
//                (grad_output - (grad_weight[c] * norm + grad_bias[c]) /
//                               (num * spatial)) / std[c]
// where c = (index / spatial) % channels.
template <typename T>
__global__ void sync_bn_backward_data_musa_kernel(
    int output_size, const T *grad_output, const float *weight,
    const float *grad_weight, const float *grad_bias, const float *norm,
    const float *std, T *grad_input, int num, int channels, int spatial) {
  const int elems_per_channel = num * spatial;
  MUSA_1D_KERNEL_LOOP(index, output_size) {
    const int channel = (index / spatial) % channels;
    // Share of the reduced parameter gradients attributed to this element.
    const float correction =
        (grad_weight[channel] * norm[index] + grad_bias[channel]) /
        elems_per_channel;
    grad_input[index] =
        weight[channel] * (grad_output[index] - correction) / std[channel];
  }
}

// phalf specialization of the input gradient: grad_output is converted to
// float, the formula of the generic kernel is evaluated in float, and the
// result is converted back to phalf.
template <>
__global__ void sync_bn_backward_data_musa_kernel(
    int output_size, const phalf *grad_output, const float *weight,
    const float *grad_weight, const float *grad_bias, const float *norm,
    const float *std, phalf *grad_input, int num, int channels, int spatial) {
  const int elems_per_channel = num * spatial;
  MUSA_1D_KERNEL_LOOP(index, output_size) {
    const int channel = (index / spatial) % channels;
    // Share of the reduced parameter gradients attributed to this element.
    const float correction =
        (grad_weight[channel] * norm[index] + grad_bias[channel]) /
        elems_per_channel;
    grad_input[index] = static_cast<phalf>(
        weight[channel] *
        (static_cast<float>(grad_output[index]) - correction) / std[channel]);
  }
}

#endif  // SYNCBN_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/three_interpolate_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef THREE_INTERPOLATE_MUSA_KERNEL_MUH
#define THREE_INTERPOLATE_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

// Weighted interpolation from three source points.
//
// points: (B, C, M)
// idx: (B, N, 3)
// weight: (B, N, 3)
// output:
//      out: (B, C, N)
//
// The batch comes from blockIdx.z, the channel from blockIdx.y, and the
// target point from the 1D kernel loop.
template <typename T>
__global__ void three_interpolate_forward_musa_kernel(
    int b, int c, int m, int n, const T *points, const int *__restrict__ idx,
    const T *weight, T *out) {
  const int bs_idx = blockIdx.z;
  const int c_idx = blockIdx.y;
  if (bs_idx >= b || c_idx >= c) return;

  // Offsets are computed from the base pointers on every iteration. The
  // previous code advanced the parameters themselves with `+=`, so the
  // offsets piled up whenever a thread ran the loop body more than once.
  const T *cur_points = points + bs_idx * c * m + c_idx * m;
  T *cur_out = out + bs_idx * c * n + c_idx * n;
  MUSA_1D_KERNEL_LOOP(pt_idx, n) {
    const int nn_offset = bs_idx * n * 3 + pt_idx * 3;
    const T *cur_weight = weight + nn_offset;
    const int *cur_idx = idx + nn_offset;

    cur_out[pt_idx] = cur_weight[0] * cur_points[cur_idx[0]] +
                      cur_weight[1] * cur_points[cur_idx[1]] +
                      cur_weight[2] * cur_points[cur_idx[2]];
  }
}

// Backward pass of the three-point interpolation.
//
// grad_out: (B, C, N)
// idx: (B, N, 3)
// weight: (B, N, 3)
// output:
//      grad_points: (B, C, M), accumulated with atomicAdd
//
// The batch comes from blockIdx.z, the channel from blockIdx.y, and the
// target point from the 1D kernel loop.
template <typename T>
__global__ void three_interpolate_backward_musa_kernel(
    int b, int c, int n, int m, const T *grad_out, const int *__restrict__ idx,
    const T *weight, T *grad_points) {
  const int bs_idx = blockIdx.z;
  const int c_idx = blockIdx.y;
  if (bs_idx >= b || c_idx >= c) return;

  // Offsets are computed from the base pointers on every iteration. The
  // previous code advanced the parameters themselves with `+=`, so the
  // offsets piled up whenever a thread ran the loop body more than once.
  T *cur_grad_points = grad_points + bs_idx * c * m + c_idx * m;
  MUSA_1D_KERNEL_LOOP(pt_idx, n) {
    const int nn_offset = bs_idx * n * 3 + pt_idx * 3;
    const T *cur_weight = weight + nn_offset;
    const int *cur_idx = idx + nn_offset;
    const T grad = grad_out[bs_idx * c * n + c_idx * n + pt_idx];

    atomicAdd(cur_grad_points + cur_idx[0], grad * cur_weight[0]);
    atomicAdd(cur_grad_points + cur_idx[1], grad * cur_weight[1]);
    atomicAdd(cur_grad_points + cur_idx[2], grad * cur_weight[2]);
  }
}

#endif  // THREE_INTERPOLATE_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/three_nn_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef THREE_NN_MUSA_KERNEL_MUH
#define THREE_NN_MUSA_KERNEL_MUH


#include "pytorch_musa_helper.hpp"
// For every query point, finds the three closest known points of the same
// batch (by squared Euclidean distance).
//
// unknown: (B, N, 3)
// known: (B, M, 3)
// output:
//      dist2: (B, N, 3) squared distances, ascending
//      idx: (B, N, 3) indices into the batch's known points
//
// The batch comes from blockIdx.y and the query point from the 1D kernel
// loop. With fewer than three known points the unused slots keep their
// initial values (distance 1e40 converted to T, index 0).
template <typename T>
__global__ void three_nn_forward_musa_kernel(int b, int n, int m,
                                             const T *unknown, const T *known,
                                             T *dist2, int *__restrict__ idx) {
  const int bs_idx = blockIdx.y;
  if (bs_idx >= b) return;

  // Offsets are computed from the base pointers on every iteration. The
  // previous code advanced the parameters themselves with `+=`, so the
  // offsets piled up whenever a thread ran the loop body more than once.
  const T *cur_known = known + bs_idx * m * 3;
  MUSA_1D_KERNEL_LOOP(pt_idx, n) {
    const int pt_offset = bs_idx * n * 3 + pt_idx * 3;
    const T *cur_unknown = unknown + pt_offset;
    T *cur_dist2 = dist2 + pt_offset;
    int *cur_idx = idx + pt_offset;

    T ux = cur_unknown[0];
    T uy = cur_unknown[1];
    T uz = cur_unknown[2];

    double best1 = 1e40, best2 = 1e40, best3 = 1e40;
    int besti1 = 0, besti2 = 0, besti3 = 0;
    for (int k = 0; k < m; ++k) {
      T x = cur_known[k * 3 + 0];
      T y = cur_known[k * 3 + 1];
      T z = cur_known[k * 3 + 2];
      T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
      // Insert d into the sorted top-three list, shifting worse entries down.
      if (d < best1) {
        best3 = best2;
        besti3 = besti2;
        best2 = best1;
        besti2 = besti1;
        best1 = d;
        besti1 = k;
      } else if (d < best2) {
        best3 = best2;
        besti3 = besti2;
        best2 = d;
        besti2 = k;
      } else if (d < best3) {
        best3 = d;
        besti3 = k;
      }
    }
    cur_dist2[0] = best1;
    cur_dist2[1] = best2;
    cur_dist2[2] = best3;
    cur_idx[0] = besti1;
    cur_idx[1] = besti2;
    cur_idx[2] = besti3;
  }
}

#endif  // THREE_NN_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/tin_shift_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef TIN_SHIFT_MUSA_KERNEL_MUH
#define TIN_SHIFT_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

// Temporal shift (forward). Each loop index selects one (batch, channel, hw)
// position; its t_size frames are copied from `input` to `output` displaced
// by shift[batch * group_size + channel / group_channel] frames. Frames whose
// destination falls outside [0, t_size) are skipped, so the corresponding
// output entries are not written by this kernel.
template <typename T>
__global__ void tin_shift_forward_musa_kernel(
    const int nthreads, const T* input, const int* shift, T* output,
    const int batch_size, const int channels, const int t_size,
    const int hw_size, const int group_size, const int group_channel) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    const int hw_index = index % hw_size;
    const int channel = (index / hw_size) % channels;
    const int batch = (index / hw_size / channels) % batch_size;

    const int group = channel / group_channel;
    const int t_shift = shift[batch * group_size + group];
    // Offset of frame 0 for this (batch, channel, hw) position.
    const int base =
        batch * t_size * hw_size * channels + hw_size * channel + hw_index;
    for (int src_t = 0; src_t < t_size; src_t++) {
      const int dst_t = src_t + t_shift;
      if (dst_t >= 0 && dst_t < t_size) {
        output[dst_t * hw_size * channels + base] =
            input[src_t * hw_size * channels + base];
      }
    }
  }
}

// Temporal shift (backward). The body performs the same displaced copy as the
// forward kernel: frames of `input` are written to `output` displaced by
// shift[batch * group_size + channel / group_channel], and frames whose
// destination falls outside [0, t_size) are skipped.
// NOTE(review): presumably the caller passes the negated shift to undo the
// forward displacement. Confirm against the launch site.
template <typename T>
__global__ void tin_shift_backward_musa_kernel(
    const int nthreads, const T* input, const int* shift, T* output,
    const int batch_size, const int channels, const int t_size,
    const int hw_size, const int group_size, const int group_channel) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    const int hw_index = index % hw_size;
    const int channel = (index / hw_size) % channels;
    const int batch = (index / hw_size / channels) % batch_size;

    const int group = channel / group_channel;
    const int t_shift = shift[batch * group_size + group];
    // Offset of frame 0 for this (batch, channel, hw) position.
    const int base =
        batch * t_size * hw_size * channels + hw_size * channel + hw_index;
    for (int src_t = 0; src_t < t_size; src_t++) {
      const int dst_t = src_t + t_shift;
      if (dst_t >= 0 && dst_t < t_size) {
        output[dst_t * hw_size * channels + base] =
            input[src_t * hw_size * channels + base];
      }
    }
  }
}

#endif  // TIN_SHIFT_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/musa/voxelization_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef VOXELIZATION_MUSA_KERNEL_MUH
#define VOXELIZATION_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

// Identifiers for a reduction mode (sum / mean / max). None of the kernels
// defined in this header reference it.
typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;

// Computes the integer voxel coordinate of every point.
//
// For point `index`, features 0, 1 and 2 are read as x, y and z and mapped to
// floor((value - coors_*_min) / voxel_*). Axes are tested in x, y, z order
// and the first out-of-range axis ends processing of that point:
//   x out of range -> coors[0] = -1
//   y out of range -> coors[0..1] = -1
//   z out of range -> coors[0..2] = -1
// A point inside the grid is stored as (z, y, x). Later features are not
// read before an earlier axis has passed its range test. The coors_*_max
// arguments are accepted but not used by this kernel.
template <typename T, typename T_int>
__global__ void dynamic_voxelize_kernel(
    const T* points, T_int* coors, const float voxel_x, const float voxel_y,
    const float voxel_z, const float coors_x_min, const float coors_y_min,
    const float coors_z_min, const float coors_x_max, const float coors_y_max,
    const float coors_z_max, const int grid_x, const int grid_y,
    const int grid_z, const int num_points, const int num_features,
    const int NDim) {
  MUSA_1D_KERNEL_LOOP(index, num_points) {
    const T* pt = points + index * num_features;
    T_int* out = coors + index * NDim;

    const int cx = floorf((pt[0] - coors_x_min) / voxel_x);
    if (cx < 0 || cx >= grid_x) {
      out[0] = -1;
    } else {
      const int cy = floorf((pt[1] - coors_y_min) / voxel_y);
      if (cy < 0 || cy >= grid_y) {
        out[0] = -1;
        out[1] = -1;
      } else {
        const int cz = floorf((pt[2] - coors_z_min) / voxel_z);
        const bool inside = (cz >= 0) && (cz < grid_z);
        out[0] = inside ? cz : -1;
        out[1] = inside ? cy : -1;
        out[2] = inside ? cx : -1;
      }
    }
  }
}

// Copies point features into their voxel slot, one feature per iteration.
//
// thread_idx enumerates (point, feature) pairs: point = thread_idx /
// num_features, feature = thread_idx % num_features. A point is copied only
// when both point_to_voxelidx[point] (its slot inside the voxel) and
// coor_to_voxelidx[point] (its voxel) are non-negative. The destination is
// voxels[(voxel * max_points + slot) * num_features + feature].
template <typename T, typename T_int>
__global__ void assign_point_to_voxel(const int nthreads, const T* points,
                                      T_int* point_to_voxelidx,
                                      T_int* coor_to_voxelidx, T* voxels,
                                      const int max_points,
                                      const int num_features,
                                      const int num_points, const int NDim) {
  MUSA_1D_KERNEL_LOOP(thread_idx, nthreads) {
    const int point = thread_idx / num_features;
    const int slot = point_to_voxelidx[point];
    const int voxel = coor_to_voxelidx[point];
    if (slot < 0 || voxel < 0) continue;

    const int feature = thread_idx % num_features;
    T* dst = voxels + voxel * max_points * num_features + slot * num_features;
    dst[feature] = points[thread_idx];
  }
}

// Writes the coordinate of each voxel, one component per iteration.
//
// thread_idx enumerates (point, component) pairs: point = thread_idx / NDim,
// component = thread_idx % NDim. Only a point that sits in slot 0 of its
// voxel (point_to_voxelidx[point] == 0) and has a non-negative voxel index
// contributes; its coordinate component is copied to
// voxel_coors[voxel * NDim + component].
template <typename T, typename T_int>
__global__ void assign_voxel_coors(const int nthreads, T_int* coor,
                                   T_int* point_to_voxelidx,
                                   T_int* coor_to_voxelidx, T_int* voxel_coors,
                                   const int num_points, const int NDim) {
  MUSA_1D_KERNEL_LOOP(thread_idx, nthreads) {
    const int point = thread_idx / NDim;
    const int slot = point_to_voxelidx[point];
    const int voxel = coor_to_voxelidx[point];
    if (slot != 0 || voxel < 0) continue;

    const int component = thread_idx % NDim;
    T_int* dst = voxel_coors + voxel * NDim;
    dst[component] = coor[thread_idx];
  }
}

// For every valid point, counts how many earlier points share its coordinate.
//
// A point whose first coordinate component is -1 is treated as invalid and
// skipped entirely (nothing is written for it). For a valid point `index`
// the kernel scans points 0 .. index-1 and counts those with identical first
// three coordinate components:
//   * point_to_pointidx[index] receives the index of the first such earlier
//     point, or `index` itself when there is none;
//   * point_to_voxelidx[index] receives the count, but only while the count
//     is below max_points; otherwise the entry is left untouched.
// The scan stops early once the count reaches max_points. max_voxels is
// accepted but not used here.
// NOTE(review): entries that are not written presumably keep a -1 fill set
// by the caller -- confirm against the launcher.
template <typename T_int>
__global__ void point_to_voxelidx_kernel(const T_int* coor,
                                         T_int* point_to_voxelidx,
                                         T_int* point_to_pointidx,
                                         const int max_points,
                                         const int max_voxels,
                                         const int num_points, const int NDim) {
  MUSA_1D_KERNEL_LOOP(index, num_points) {
    auto coor_offset = coor + index * NDim;
    // skip invalid points
    if (coor_offset[0] == -1) continue;

    int num = 0;
    // The names follow component order only: coor_x is component 0,
    // coor_y component 1 and coor_z component 2 of the stored coordinate.
    int coor_x = coor_offset[0];
    int coor_y = coor_offset[1];
    int coor_z = coor_offset[2];
    // only look at the points that come before coor[index]
    for (int i = 0; i < index; ++i) {
      auto prev_coor = coor + i * NDim;
      if (prev_coor[0] == -1) continue;

      // Find all previous points that have the same coors
      // if find the same coor, record it
      if ((prev_coor[0] == coor_x) && (prev_coor[1] == coor_y) &&
          (prev_coor[2] == coor_z)) {
        num++;
        if (num == 1) {
          // point to the same coor that first show up
          point_to_pointidx[index] = i;
        } else if (num >= max_points) {
          // out of boundary
          break;
        }
      }
    }
    if (num == 0) {
      // no earlier point shares this coordinate: the point refers to itself
      point_to_pointidx[index] = index;
    }
    if (num < max_points) {
      // position of this point among the points with the same coordinate
      point_to_voxelidx[index] = num;
    }
  }
}

// Assigns voxel indices to points and counts the points in each voxel.
//
// The body is a plain loop over all points with no thread-index dependence,
// so every thread that runs it performs the whole scan.
// NOTE(review): presumably launched with a single thread -- confirm against
// the launcher.
//
// For each point i, based on point_to_voxelidx[i]:
//   * -1: the point is skipped;
//   *  0: the point opens a new voxel, unless voxel_num[0] has already
//         reached max_voxels. The new voxel takes index voxel_num[0], which
//         is then incremented; num_points_per_voxel for it is set to 1;
//   * >0: the point inherits the voxel index of point_to_pointidx[i] when
//         that index is not -1, and the voxel's point count is incremented.
// max_points is accepted but not used here.
template <typename T_int>
__global__ void determin_voxel_num(
    // const T_int* coor,
    T_int* num_points_per_voxel, T_int* point_to_voxelidx,
    T_int* point_to_pointidx, T_int* coor_to_voxelidx, T_int* voxel_num,
    const int max_points, const int max_voxels, const int num_points) {
  // visit the points in index order; a later point may rely on the voxel
  // index stored for an earlier one
  for (int i = 0; i < num_points; ++i) {
    int point_pos_in_voxel = point_to_voxelidx[i];
    // record voxel
    if (point_pos_in_voxel == -1) {
      // out of max_points or invalid point
      continue;
    } else if (point_pos_in_voxel == 0) {
      // record new voxel
      int voxelidx = voxel_num[0];
      if (voxel_num[0] >= max_voxels) continue;
      voxel_num[0] += 1;
      coor_to_voxelidx[i] = voxelidx;
      num_points_per_voxel[voxelidx] = 1;
    } else {
      // reuse the voxel of the first point with the same coordinate
      int point_idx = point_to_pointidx[i];
      int voxelidx = coor_to_voxelidx[point_idx];
      if (voxelidx != -1) {
        coor_to_voxelidx[i] = voxelidx;
        num_points_per_voxel[voxelidx] += 1;
      }
    }
  }
}

// Hands out, via atomics, a slot for each point inside its voxel and an
// output position for each voxel.
//
// For a point whose coors_map entry is non-negative, the entry selects a
// counter in reduce_count; the value returned by atomicAdd on that counter
// is stored in pts_id. The point that receives slot 0 also takes the next
// value of the counter *coors_count and stores it in coors_order for that
// voxel. Points with a negative coors_map entry are ignored.
__global__ void nondeterministic_get_assign_pos(
    const int nthreads, const int32_t* coors_map, int32_t* pts_id,
    int32_t* coors_count, int32_t* reduce_count, int32_t* coors_order) {
  MUSA_1D_KERNEL_LOOP(thread_idx, nthreads) {
    const int voxel_key = coors_map[thread_idx];
    if (voxel_key < 0) continue;

    const int32_t slot = atomicAdd(&reduce_count[voxel_key], 1);
    pts_id[thread_idx] = slot;
    if (slot == 0) {
      coors_order[voxel_key] = atomicAdd(coors_count, 1);
    }
  }
}

// Copies points into the voxel slots chosen through pts_id / coors_order.
//
// A point is written only when its coors_map entry is non-negative, its slot
// (pts_id) is below max_points and the output position of its voxel
// (coors_order) is below max_voxels. All num_features features are copied to
// voxels[(position * max_points + slot) * num_features ...]. The point in
// slot 0 additionally records the voxel's point count, clamped to
// max_points, and copies the NDim coordinate components from coors_in.
template <typename T>
__global__ void nondeterministic_assign_point_voxel(
    const int nthreads, const T* points, const int32_t* coors_map,
    const int32_t* pts_id, const int32_t* coors_in, const int32_t* reduce_count,
    const int32_t* coors_order, T* voxels, int32_t* coors, int32_t* pts_count,
    const int max_voxels, const int max_points, const int num_features,
    const int NDim) {
  MUSA_1D_KERNEL_LOOP(thread_idx, nthreads) {
    const int voxel_key = coors_map[thread_idx];
    const int slot = pts_id[thread_idx];
    if (voxel_key < 0 || slot >= max_points) continue;

    const int position = coors_order[voxel_key];
    if (position >= max_voxels) continue;

    T* dst = voxels + (position * max_points + slot) * num_features;
    const T* src = points + thread_idx * num_features;
    for (int f = 0; f < num_features; ++f) {
      dst[f] = src[f];
    }

    if (slot != 0) continue;

    // Slot 0 is responsible for the per-voxel metadata.
    pts_count[position] = min(reduce_count[voxel_key], max_points);
    int32_t* coor_dst = coors + position * NDim;
    const int32_t* coor_src = coors_in + voxel_key * NDim;
    for (int d = 0; d < NDim; ++d) {
      coor_dst[d] = coor_src[d];
    }
  }
}

#endif  // VOXELIZATION_MUSA_KERNEL_MUH


================================================
FILE: mmcv/ops/csrc/common/parrots_cpp_helper.hpp
================================================
#ifndef PARROTS_CPP_HELPER
#define PARROTS_CPP_HELPER
#include <parrots/darray/darraymath.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/darraylite.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include <vector>

using namespace parrots;

// One `case` of a dispatch switch: for `prim_type`, aliases `type` as
// `scalar_t` and returns the result of calling the remaining macro argument
// (a callable taking no arguments).
#define PARROTS_PRIVATE_CASE_TYPE(prim_type, type, ...) \
  case prim_type: {                                     \
    using scalar_t = type;                              \
    return __VA_ARGS__();                               \
  }

// Expands to an immediately invoked lambda that switches on TYPE and runs
// the given callable with `scalar_t` bound to double (Prim::Float64) or
// float (Prim::Float32). Any other value reaches PARROTS_NOTSUPPORTED.
#define PARROTS_DISPATCH_FLOATING_TYPES(TYPE, ...)                  \
  [&] {                                                             \
    const auto& the_type = TYPE;                                    \
    switch (the_type) {                                             \
      PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__) \
      PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__)  \
      default:                                                      \
        PARROTS_NOTSUPPORTED;                                       \
    }                                                               \
  }()

// Same as PARROTS_DISPATCH_FLOATING_TYPES, with an additional case binding
// `scalar_t` to float16 for Prim::Float16.
#define PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(TYPE, ...)          \
  [&] {                                                              \
    const auto& the_type = TYPE;                                     \
    switch (the_type) {                                              \
      PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__)  \
      PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__)   \
      PARROTS_PRIVATE_CASE_TYPE(Prim::Float16, float16, __VA_ARGS__) \
      default:                                                       \
        PARROTS_NOTSUPPORTED;                                        \
    }                                                                \
  }()

#endif  // PARROTS_CPP_HELPER


================================================
FILE: mmcv/ops/csrc/common/parrots_cuda_helper.hpp
================================================
#ifndef PARROTS_CUDA_HELPER
#define PARROTS_CUDA_HELPER

#include <cuda.h>
#include <float.h>

#include <parrots/darray/darraymath.hpp>
#include <parrots/darray/mathfunctions.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/darrayutil.hpp>
#include <parrots/foundation/exceptions.hpp>
#include <parrots/foundation/float16.hpp>
#include <parrots/foundation/mathfunction.hpp>

#include "common_cuda_helper.hpp"
#include "parrots_cudawarpfunction.cuh"

using namespace parrots;
using phalf = float16;

// Expands to member `y` of its argument.
// NOTE(review): presumably the native half value wrapped by parrots'
// float16 -- confirm against the parrots float16 definition.
#define __PHALF(x) (x.y)

// Evaluates `exp` once as a cudaError_t. On anything other than cudaSuccess
// it prints the CUDA error string to stderr and terminates the process with
// exit(-1). Wrapped in do { } while (0) so it behaves as a single statement.
#define PARROTS_CUDA_CHECK(exp)                         \
  do {                                                  \
    cudaError_t err = exp;                              \
    if (err != cudaSuccess) {                           \
      fprintf(stderr, "cudaCheckError() failed : %s\n", \
              cudaGetErrorString(err));                 \
      exit(-1);                                         \
    }                                                   \
  } while (0)

// One `case` of a dispatch switch: for `prim_type`, aliases `type` as
// `scalar_t` and returns the result of calling the remaining macro argument
// (a callable taking no arguments).
#define PARROTS_PRIVATE_CASE_TYPE(prim_type, type, ...) \
  case prim_type: {                                     \
    using scalar_t = type;                              \
    return __VA_ARGS__();                               \
  }

// Expands to an immediately invoked lambda that switches on TYPE and runs
// the given callable with `scalar_t` bound to double (Prim::Float64) or
// float (Prim::Float32). Any other value reaches PARROTS_NOTSUPPORTED.
#define PARROTS_DISPATCH_FLOATING_TYPES(TYPE, ...)                  \
  [&] {                                                             \
    const auto& the_type = TYPE;                                    \
    switch (the_type) {                                             \
      PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__) \
      PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__)  \
      default:                                                      \
        PARROTS_NOTSUPPORTED;                                       \
    }                                                               \
  }()

// Same as PARROTS_DISPATCH_FLOATING_TYPES, with an additional case binding
// `scalar_t` to float16 for Prim::Float16.
#define PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(TYPE, ...)          \
  [&] {                                                              \
    const auto& the_type = TYPE;                                     \
    switch (the_type) {                                              \
      PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__)  \
      PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__)   \
      PARROTS_PRIVATE_CASE_TYPE(Prim::Float16, float16, __VA_ARGS__) \
      default:                                                       \
        PARROTS_NOTSUPPORTED;                                        \
    }                                                                \
  }()

/** atomicAdd **/
// Software atomicAdd for double, compiled only for device code targeting
// architectures below 600 (see the guard).
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600

// Adds `val` to *address with a compare-and-swap loop on the 64-bit pattern
// of the double and returns the value that was stored before the addition.
static __inline__ __device__ double atomicAdd(double* address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  // Adding zero: return the value just read without entering the CAS loop.
  if (val == 0.0) return __longlong_as_double(old);
  do {
    assumed = old;
    // Store assumed + val only if the word still equals `assumed`;
    // atomicCAS returns the word that was actually found there.
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);  // retry if the word changed in between
  return __longlong_as_double(old);
}

#endif

// atomicAdd for float16, built on a 32-bit atomicCAS.
//
// The 16-bit target is addressed through the 4-byte-aligned word containing
// it: bit 1 of the address selects the upper (set) or lower (clear) 16 bits
// of that word. Each attempt extracts the current 16 bits, adds `val`,
// splices the sum back into the word and tries to swap it in; the loop
// repeats until the swap succeeds. Returns the 16-bit pattern seen by the
// last attempt, reinterpreted as float16.
static __inline__ __device__ float16 atomicAdd(float16* address, float16 val) {
  // Round the address down to the enclosing 32-bit word.
  unsigned int* aligned =
      (unsigned int*)((size_t)address - ((size_t)address & 2));
  unsigned int old = *aligned;
  unsigned int assumed;
  unsigned short old_as_us;
  do {
    assumed = old;
    // Pick the half of the word that holds the target value.
    old_as_us =
        (unsigned short)((size_t)address & 2 ? old >> 16 : old & 0xffff);

#if __CUDACC_VER_MAJOR__ >= 9
    // Rebuild a float16 from the raw bits (member x) and add with float16
    // arithmetic.
    float16 tmp;
    tmp.x = old_as_us;
    float16 sum = tmp + val;
    unsigned short sum_as_us = sum.x;
//         half sum = __float2half_rn(__half2float(__ushort_as_half(old_as_us))
//         + (float)(val)); unsigned short sum_as_us = __half_as_ushort(sum);
#else
    unsigned short sum_as_us =
        __float2half_rn(__half2float(old_as_us) + (float)(val));
#endif

    // Splice the new 16 bits into the word, keeping the other half.
    unsigned int sum_as_ui = (size_t)address & 2
                                 ? (sum_as_us << 16) | (old & 0xffff)
                                 : (old & 0xffff0000) | sum_as_us;
    old = atomicCAS(aligned, assumed, sum_as_ui);
  } while (assumed != old);  // retry if the word changed in between
  //__half_raw raw = {old_as_us};
  // return float16(raw);
  return *reinterpret_cast<float16*>(&old_as_us);
}
#endif  // PARROTS_CUDA_HELPER


================================================
FILE: mmcv/ops/csrc/common/pytorch_cpp_helper.hpp
================================================
#ifndef PYTORCH_CPP_HELPER
#define PYTORCH_CPP_HELPER
#include <torch/types.h>

#include <vector>

using namespace at;

// Device checks: each raises through TORCH_CHECK, with a message naming the
// argument expression, unless the tensor's device matches.
#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_MLU(x) \
  TORCH_CHECK(x.device().type() == at::kMLU, #x " must be a MLU tensor")
// MUSA tensors are identified by the PrivateUse1 device type.
#define CHECK_MUSA(x) \
  TORCH_CHECK(x.device().is_privateuseone(), #x " must be a MUSA tensor")
#define CHECK_CPU(x) \
  TORCH_CHECK(x.device().type() == at::kCPU, #x " must be a CPU tensor")
// Layout check: raises unless x.is_contiguous().
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
// Combined device + contiguity checks.
// NOTE: each *_INPUT macro expands to TWO statements. Used as the body of
// an unbraced `if`/`else`/loop, only the device check would be governed by
// the condition, so always use them inside a braced block.
#define CHECK_CUDA_INPUT(x) \
  CHECK_CUDA(x);            \
  CHECK_CONTIGUOUS(x)
#define CHECK_MLU_INPUT(x) \
  CHECK_MLU(x);            \
  CHECK_CONTIGUOUS(x)
#define CHECK_MUSA_INPUT(x) \
  CHECK_MUSA(x);            \
  CHECK_CONTIGUOUS(x)
#define CHECK_CPU_INPUT(x) \
  CHECK_CPU(x);            \
  CHECK_CONTIGUOUS(x)

#endif  // PYTORCH_CPP_HELPER


================================================
FILE: mmcv/ops/csrc/common/pytorch_cuda_helper.hpp
================================================
#ifndef PYTORCH_CUDA_HELPER
#define PYTORCH_CUDA_HELPER

#include <ATen/ATen.h>
#ifdef MMCV_WITH_MUSA
#include "common_musa_helper.hpp"
#include "torch_musa/csrc/aten/musa/MUSAContext.h"
#include "torch_musa/csrc/core/MUSAGuard.h"
#include "torch_musa/share/generated_cuda_compatible/aten/src/THC/THCAtomics.muh"
#include "torch_musa/share/generated_cuda_compatible/include/ATen/musa/MUSA_PORT_ApplyUtils.muh"
#else
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <THC/THCAtomics.cuh>

#include "common_cuda_helper.hpp"
#endif

using at::Half;
using at::Tensor;
using phalf = at::Half;

// With at::Half as phalf the value is used directly, so this is the
// identity.
#define __PHALF(x) (x)
// Integer division of m by n, plus one when the remainder is positive
// (i.e. rounded up for positive operands). Both arguments are evaluated
// twice, so avoid passing expressions with side effects.
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

#endif  // PYTORCH_CUDA_HELPER


================================================
FILE: mmcv/ops/csrc/common/pytorch_device_registry.hpp
================================================
#ifndef PYTORCH_DEVICE_REGISTRY_H
#define PYTORCH_DEVICE_REGISTRY_H

// Using <torch/extension.h> is recommended in the official documentation in
// https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-the-c-op.
// However, we use <torch/types.h> for compatibility with CUDA 9.0
// Read https://github.com/pytorch/extension-cpp/issues/35 for more details.
#include <torch/types.h>

#include <cassert>
#include <functional>
#include <map>
#include <type_traits>

#ifdef MMCV_WITH_MUSA
#include "torch_musa/csrc/aten/utils/Utils.h"
#endif

// Formats a device for error messages: the device type name, followed by
// ":<index>" when the device carries an index.
inline std::string GetDeviceStr(const at::Device& device) {
  std::string text = DeviceTypeName(device.type(), true);
  if (!device.has_index()) {
    return text;
  }
  text += ':';
  text += std::to_string(device.index());
  return text;
}

// Registry
//
// DeviceRegistry<decltype(&op), op> is a per-operator singleton holding one
// function pointer per at::DeviceType. Register() stores the implementation
// for a device type, Find() returns it, or nullptr when nothing has been
// registered for that type.
template <typename F, F f>
class DeviceRegistry;

template <typename Ret, typename... Args, Ret (*f)(Args...)>
class DeviceRegistry<Ret (*)(Args...), f> {
 public:
  using FunctionType = Ret (*)(Args...);
  static const int MAX_DEVICE_TYPES =
      int8_t(at::DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES);

  // Stores `function` as the implementation for `device`, replacing any
  // previous registration for the same device type.
  void Register(at::DeviceType device, FunctionType function) {
    funcs_[SlotOf(device)] = function;
  }

  // Returns the implementation registered for `device`, or nullptr.
  FunctionType Find(at::DeviceType device) const {
    return funcs_[SlotOf(device)];
  }

  // Returns the single registry instance for this operator.
  static DeviceRegistry& instance() {
    static DeviceRegistry inst;
    return inst;
  }

 private:
  DeviceRegistry() {
    // Signed index: MAX_DEVICE_TYPES is an int, so this avoids a
    // signed/unsigned comparison.
    for (int i = 0; i < MAX_DEVICE_TYPES; ++i) {
      funcs_[i] = nullptr;
    }
  }

  // Maps a device type to its table slot. In builds with assertions enabled
  // this catches a value that would index outside funcs_.
  static int SlotOf(at::DeviceType device) {
    const int slot = int8_t(device);
    assert(slot >= 0 && slot < MAX_DEVICE_TYPES);
    return slot;
  }

  FunctionType funcs_[MAX_DEVICE_TYPES];
};

// get device of first tensor param
//
// Two overloads selected by whether the first argument (after decay) is an
// at::Tensor: if it is, its device is returned; otherwise the first argument
// is dropped and the search continues with the rest. There is no overload
// for an empty argument list, so a call with no tensor argument does not
// compile.

template <typename T, typename... Args,
          std::enable_if_t<std::is_same<std::decay_t<T>, at::Tensor>::value,
                           bool> = true>
at::Device GetFirstTensorDevice(T&& t, Args&&... args) {
  return std::forward<T>(t).device();
}
template <typename T, typename... Args,
          std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value,
                           bool> = true>
at::Device GetFirstTensorDevice(T&& t, Args&&... args) {
  return GetFirstTensorDevice(std::forward<Args>(args)...);
}

// check device consistency
//
// Walks the argument list from left to right, counting positions in `index`.
// Returns {position, device} of the first at::Tensor argument whose device
// type or device index differs from `device`. When every tensor matches, the
// base case returns {number of arguments walked, device}, so a result whose
// first member is at least the argument count means "consistent".

// Base case: all arguments have been consumed without a mismatch.
inline std::pair<int, at::Device> CheckDeviceConsistency(
    const at::Device& device, int index) {
  return {index, device};
}

// Forward declaration of the non-tensor overload, so that the tensor
// overload below can call it for the following argument.
template <typename T, typename... Args,
          std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value,
                           bool> = true>
std::pair<int, at::Device> CheckDeviceConsistency(const at::Device& device,
                                                  int index, T&& t,
                                                  Args&&... args);

// Tensor argument: compare its device, then continue with the rest.
template <typename T, typename... Args,
          std::enable_if_t<std::is_same<std::decay_t<T>, at::Tensor>::value,
                           bool> = true>
std::pair<int, at::Device> CheckDeviceConsistency(const at::Device& device,
                                                  int index, T&& t,
                                                  Args&&... args) {
  auto new_device = std::forward<T>(t).device();
  if (new_device.type() != device.type() ||
      new_device.index() != device.index()) {
    return {index, new_device};
  }
  return CheckDeviceConsistency(device, index + 1, std::forward<Args>(args)...);
}

// Non-tensor argument: skipped, but still counted in `index`.
template <
    typename T, typename... Args,
    std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value, bool>>
std::pair<int, at::Device> CheckDeviceConsistency(const at::Device& device,
                                                  int index, T&& t,
                                                  Args&&... args) {
  return CheckDeviceConsistency(device, index + 1, std::forward<Args>(args)...);
}

// dispatch
//
// Selects and calls the implementation of an operator for the device of its
// tensor arguments.
//   registry: registry to look the implementation up in (see Find()).
//   name:     operator name, used only in error messages.
//   args:     operator arguments; the first at::Tensor among them decides
//             the device.
// Raises through TORCH_CHECK when a tensor argument lives on a different
// device than the first one, or when no implementation is registered for
// that device type. Returns whatever the implementation returns.

template <typename R, typename... Args>
auto Dispatch(const R& registry, const char* name, Args&&... args) {
  // The two inspection passes only read the arguments, so they receive them
  // as lvalues. The pack is forwarded exactly once, in the final call.
  auto device = GetFirstTensorDevice(args...);
  auto inconsist = CheckDeviceConsistency(device, 0, args...);
  TORCH_CHECK(inconsist.first >= int(sizeof...(Args)), name, ": at param ",
              inconsist.first,
              ", inconsistent device: ", GetDeviceStr(inconsist.second).c_str(),
              " vs ", GetDeviceStr(device).c_str(), "\n");
  auto f_ptr = registry.Find(device.type());
  TORCH_CHECK(f_ptr != nullptr, name, ": implementation for device ",
              GetDeviceStr(device).c_str(), " not found.\n");
  return f_ptr(std::forward<Args>(args)...);
}

// helper macro

// The registry singleton belonging to operator function `key`.
#define DEVICE_REGISTRY(key) DeviceRegistry<decltype(&(key)), key>::instance()

// Registers `value` as the implementation of `key` for device type
// at::k<device>. It defines a struct whose constructor performs the
// registration, plus one static instance of it, so the registration runs
// during static initialization of the translation unit that uses the macro.
#define REGISTER_DEVICE_IMPL(key, device, value)           \
  struct key##_##device##_registerer {                     \
    key##_##device##_registerer() {                        \
      DEVICE_REGISTRY(key).Register(at::k##device, value); \
    }                                                      \
  };                                                       \
  static key##_##device##_registerer _##key##_##device##_registerer;

// Calls the implementation of `key` registered for the device of the tensor
// arguments; the stringized `key` is used in error messages.
#define DISPATCH_DEVICE_IMPL(key, ...) \
  Dispatch(DEVICE_REGISTRY(key), #key, __VA_ARGS__)

#endif  // PYTORCH_DEVICE_REGISTRY_H


================================================
FILE: mmcv/ops/csrc/common/pytorch_mlu_helper.hpp
================================================
/*************************************************************************
 * Copyright (C) 2021 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#ifndef PYTORCH_MLU_HELPER_HPP_
#define PYTORCH_MLU_HELPER_HPP_

#ifdef MMCV_WITH_MLU
#include "aten.h"

// NOTE(review): presumably the NFU alignment requirement in bytes --
// confirm against the MLU documentation.
#define NFU_ALIGN_SIZE 128

// Rounds x up to the next multiple of y (for positive integers).
#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y))

// Rounds x down to a multiple of y (integer division).
#define PAD_DOWN(x, y) (((x) / (y)) * (y))

// Ceiling of x / y (for positive integers).
#define CEIL_DIV(x, y) (((x) + (y) - 1) / (y))

// Rounds x up to the next multiple of y; same result as PAD_UP for positive
// integers, computed via (x + y - 1) / y.
#define CEIL_ALIGN(x, y) (((x) + (y) - 1) / (y) * (y))

inline int32_t getJobLimitCapability() {
  CNcontext drv_ctx;
  TORCH_CHECK(CN_SUCCESS == cnCtxGetCurrent(&drv_ctx), "cnCtxGetCurrent fails");
  CNctxConfigParam ctx_conf_param;
  TORCH_CHECK(
      CN_SUCCESS == cnGetCtxConfigParam(drv_ctx, CN_CTX_CONFIG_UNION_LIMIT,
                                        &ctx_conf_param),
      "cnGetCtxConfigParam fails.");
  return (int32_t)ctx_conf_param.unionLimit;
}

// Returns the number of cores available under the current job limit.
//
// The job limit is queried once. BLOCK maps to a single core; UNION,
// UNION2, UNION4, UNION8 and UNION16 map to 1, 2, 4, 8 and 16 times the
// cnrtAttrMcorePerCluster device attribute. Any other value is used
// directly as the multiplier of that attribute.
inline int32_t getCoreNumOfJobLimitCapability() {
  // Query the driver once so the value switched on and the value used as
  // multiplier in the default branch are the same.
  const int32_t job_limit = getJobLimitCapability();
  switch (job_limit) {
    case CN_KERNEL_CLASS_BLOCK:
      return 1;
    case CN_KERNEL_CLASS_UNION:
      return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
    case CN_KERNEL_CLASS_UNION2:
      return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 2;
    case CN_KERNEL_CLASS_UNION4:
      return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 4;
    case CN_KERNEL_CLASS_UNION8:
      return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 8;
    case CN_KERNEL_CLASS_UNION16:
      return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 16;
    default:
      return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * job_limit;
  }
}

#endif  // MMCV_WITH_MLU

#endif  // PYTORCH_MLU_HELPER_HPP_


================================================
FILE: mmcv/ops/csrc/common/pytorch_musa_helper.hpp
================================================
#ifndef PYTORCH_MUSA_HELPER
#define PYTORCH_MUSA_HELPER

#include <ATen/ATen.h>

#include <ATen/musa/MUSA_PORT_ApplyUtils.muh>
#include <THC/THCAtomics.muh>

#include "common_musa_helper.hpp"
#include "torch_musa/csrc/aten/musa/Exceptions.h"
#include "torch_musa/csrc/aten/musa/MUSAContext.h"
#include "torch_musa/csrc/core/MUSAGuard.h"

using at::Half;
using at::Tensor;
using phalf = at::Half;

// With at::Half as phalf the value is used directly, so this is the
// identity.
#define __PHALF(x) (x)
// Integer division of m by n, plus one when the remainder is positive
// (i.e. rounded up for positive operands). Both arguments are evaluated
// twice, so avoid passing expressions with side effects.
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

#endif  // PYTORCH_MUSA_HELPER


================================================
FILE: mmcv/ops/csrc/common/pytorch_npu_helper.hpp
================================================
/******************************************************************************
 * Copyright (c) 2022 Huawei Technologies Co., Ltd
 * All rights reserved.
 *
 * Licensed under the BSD 3-Clause License  (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * https://opensource.org/licenses/BSD-3-Clause
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/

#ifndef PYTORCH_NPU_HELPER_HPP_
#define PYTORCH_NPU_HELPER_HPP_

#include <torch_npu/csrc/framework/utils/CalcuOpUtil.h>
#include <torch_npu/csrc/framework/utils/OpAdapter.h>

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#include "pytorch_npu_util.hpp"

// Shorthand for the torch_npu native namespace.
#define NPU_NAME_SPACE at_npu::native

// Registers `value` as the NPU implementation of operator `key`. The device
// type the NPU is registered under depends on the build: XLA when
// MMCV_WITH_XLA is defined, PrivateUse1 otherwise.
#ifdef MMCV_WITH_XLA
#define REGISTER_NPU_IMPL(key, value) REGISTER_DEVICE_IMPL(key, XLA, value)
#else
#define REGISTER_NPU_IMPL(key, value) \
  REGISTER_DEVICE_IMPL(key, PrivateUse1, value)
#endif

// Raises through TORCH_CHECK unless `x` lives on the device type used for
// the NPU in this build (same XLA / PrivateUse1 split as above).
#ifdef MMCV_WITH_XLA
#define CHECK_NPU(x) \
  TORCH_CHECK(x.device().type() == at::kXLA, #x " must be a NPU tensor")
#else
#define CHECK_NPU(x)                                    \
  TORCH_CHECK(x.device().type() == at::kPrivateUse1, #x \
              " must be a NPU "                         \
              "tensor")

#endif
#endif  // PYTORCH_NPU_HELPER_HPP_


================================================
FILE: mmcv/ops/csrc/common/pytorch_npu_util.hpp
================================================
/******************************************************************************
 * Copyright (c) 2022 Huawei Technologies Co., Ltd
 * All rights reserved.
 *
 * Licensed under the BSD 3-Clause License  (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * https://opensource.org/licenses/BSD-3-Clause
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/

#ifndef MMCV_OPS_CSRC_COMMON_PYTORCH_NPU_UTIL_HPP_
#define MMCV_OPS_CSRC_COMMON_PYTORCH_NPU_UTIL_HPP_

#include <ATen/Tensor.h>
#include <acl/acl_base.h>
#include <acl/acl_rt.h>
#include <c10/util/Exception.h>
#include <dlfcn.h>
#include <torch_npu/csrc/framework/utils/CalcuOpUtil.h>
#include <torch_npu/csrc/framework/utils/OpAdapter.h>

#include <functional>
#include <type_traits>
#include <vector>

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
#include "torch_npu/csrc/core/npu/NPUStream.h"
#include "torch_npu/csrc/framework/OpCommand.h"
#include "torch_npu/csrc/framework/interface/EnvVariables.h"
#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
#include "torch_npu/csrc/framework/utils/OpPreparation.h"

#define NPU_NAME_SPACE at_npu::native

// Opaque ACL handle types; only pointers to them are used in this header.
typedef struct aclOpExecutor aclOpExecutor;
typedef struct aclTensor aclTensor;
typedef struct aclScalar aclScalar;
typedef struct aclIntArray aclIntArray;
typedef struct aclFloatArray aclFloatArray;
typedef struct aclBoolArray aclBoolArray;
typedef struct aclTensorList aclTensorList;

// Function-pointer types for the ACL object constructors. Each is named
// "_" + <API name>, which is the type GET_OP_API_FUNC casts the looked-up
// symbol of that name to.
typedef aclTensor *(*_aclCreateTensor)(
    const int64_t *view_dims, uint64_t view_dims_num, aclDataType data_type,
    const int64_t *stride, int64_t offset, aclFormat format,
    const int64_t *storage_dims, uint64_t storage_dims_num, void *tensor_data);
typedef aclScalar *(*_aclCreateScalar)(void *value, aclDataType data_type);
typedef aclIntArray *(*_aclCreateIntArray)(const int64_t *value, uint64_t size);
typedef aclFloatArray *(*_aclCreateFloatArray)(const float *value,
                                               uint64_t size);
typedef aclBoolArray *(*_aclCreateBoolArray)(const bool *value, uint64_t size);
typedef aclTensorList *(*_aclCreateTensorList)(const aclTensor *const *value,
                                               uint64_t size);

// Function-pointer types for the matching destructors; each returns an int.
// NOTE(review): presumably an ACL status code -- confirm against the ACL
// API reference.
typedef int (*_aclDestroyTensor)(const aclTensor *tensor);
typedef int (*_aclDestroyScalar)(const aclScalar *scalar);
typedef int (*_aclDestroyIntArray)(const aclIntArray *array);
typedef int (*_aclDestroyFloatArray)(const aclFloatArray *array);
typedef int (*_aclDestroyBoolArray)(const aclBoolArray *array);
typedef int (*_aclDestroyTensorList)(const aclTensorList *array);

// Capacity, in bytes, of the per-thread buffer g_hashBuf.
constexpr int kHashBufSize = 8192;
// Value above the capacity; MEMCPY_TO_BUF stores it in g_hashOffset when an
// append would not fit, so it doubles as an "overflowed" marker.
constexpr int kHashBufMaxSize = kHashBufSize + 1024;
// Per-thread buffer and current write offset; declared here, defined in
// another translation unit.
extern thread_local char g_hashBuf[kHashBufSize];
extern thread_local int g_hashOffset;

// Device type used for the NPU in this build: the torch_npu native device
// type when MMCV_WITH_XLA is defined, PrivateUse1 otherwise.
#ifdef MMCV_WITH_XLA
#define DEVICE_TYPE at_npu::key::NativeDeviceType
#else
#define DEVICE_TYPE c10::DeviceType::PrivateUse1
#endif

// X-macro listing (at::ScalarType, aclDataType) pairs. The argument `_` is
// applied to every pair in the order written here. Scalar types without an
// ACL counterpart map to ACL_DT_UNDEFINED.
#define AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(_) \
  _(at::ScalarType::Byte, ACL_UINT8)                \
  _(at::ScalarType::Char, ACL_INT8)                 \
  _(at::ScalarType::Short, ACL_INT16)               \
  _(at::ScalarType::Int, ACL_INT32)                 \
  _(at::ScalarType::Long, ACL_INT64)                \
  _(at::ScalarType::Half, ACL_FLOAT16)              \
  _(at::ScalarType::Float, ACL_FLOAT)               \
  _(at::ScalarType::Double, ACL_DOUBLE)             \
  _(at::ScalarType::ComplexHalf, ACL_DT_UNDEFINED)  \
  _(at::ScalarType::ComplexFloat, ACL_COMPLEX64)    \
  _(at::ScalarType::ComplexDouble, ACL_COMPLEX128)  \
  _(at::ScalarType::Bool, ACL_BOOL)                 \
  _(at::ScalarType::QInt8, ACL_DT_UNDEFINED)        \
  _(at::ScalarType::QUInt8, ACL_DT_UNDEFINED)       \
  _(at::ScalarType::QInt32, ACL_DT_UNDEFINED)       \
  _(at::ScalarType::BFloat16, ACL_BF16)             \
  _(at::ScalarType::QUInt4x2, ACL_DT_UNDEFINED)     \
  _(at::ScalarType::QUInt2x4, ACL_DT_UNDEFINED)     \
  _(at::ScalarType::Undefined, ACL_DT_UNDEFINED)    \
  _(at::ScalarType::NumOptions, ACL_DT_UNDEFINED)

// Lookup table from scalar type to ACL data type. Only the second column of
// the list above is used, and entries are filled positionally: element i is
// the i-th ACL type listed, regardless of the scalar type named beside it.
// NOTE(review): this presumably relies on the list order matching the
// numeric values of at::ScalarType in the PyTorch version being built
// against -- confirm when upgrading PyTorch.
constexpr aclDataType kATenScalarTypeToAclDataTypeTable
    [static_cast<int64_t>(at::ScalarType::NumOptions) + 1] = {
#define DEFINE_ENUM(_1, n) n,
        AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(DEFINE_ENUM)
#undef DEFINE_ENUM
};

// Looks up the symbol named `apiName` via GetOpApiFuncAddr and casts the
// address to the function-pointer type `_<apiName>`. The result is null
// when the symbol could not be resolved.
#define GET_OP_API_FUNC(apiName) \
  reinterpret_cast<_##apiName>(GetOpApiFuncAddr(#apiName))

// Appends `size_expression` bytes starting at `data_expression` to g_hashBuf
// and advances g_hashOffset.
//
// If the bytes would not fit in kHashBufSize, g_hashOffset is set to
// kHashBufMaxSize and the macro executes `return;` - so it may only be used
// inside functions returning void, and it exits the *enclosing* function.
// The macro expands to several statements (it is not wrapped in do/while) and
// evaluates `size_expression` more than once.
#define MEMCPY_TO_BUF(data_expression, size_expression)               \
  if (g_hashOffset + (size_expression) > kHashBufSize) {              \
    g_hashOffset = kHashBufMaxSize;                                   \
    return;                                                           \
  }                                                                   \
  memcpy(g_hashBuf + g_hashOffset, data_expression, size_expression); \
  g_hashOffset += size_expression;

// Returns the file name of the default op API shared library.
inline const char *GetOpApiLibName(void) {
  const char *const lib_name = "libopapi.so";
  return lib_name;
}

// Returns the file name of the custom op API shared library, which
// GetOpApiFuncAddr searches before the default one.
inline const char *GetCustOpApiLibName(void) {
  const char *const lib_name = "libcust_opapi.so";
  return lib_name;
}

// Resolves the symbol `apiName` in the already opened library `handler`.
//
// `libName` is used only for the warning that is logged when the lookup
// fails. Returns the symbol address, or nullptr if dlsym cannot find it.
inline void *GetOpApiFuncAddrInLib(void *handler, const char *libName,
                                   const char *apiName) {
  void *symbol = dlsym(handler, apiName);
  if (symbol != nullptr) {
    return symbol;
  }
  ASCEND_LOGW("dlsym %s from %s failed, error:%s.", apiName, libName,
              dlerror());
  return nullptr;
}

// Opens the shared library `libName` with lazy symbol binding.
//
// Returns the dlopen handle, or nullptr (after logging a warning with the
// dlerror text) when the library cannot be loaded.
inline void *GetOpApiLibHandler(const char *libName) {
  void *library = dlopen(libName, RTLD_LAZY);
  if (library != nullptr) {
    return library;
  }
  ASCEND_LOGW("dlopen %s failed, error:%s.", libName, dlerror());
  return nullptr;
}

inline void *GetOpApiFuncAddr(const char *apiName) {
  static auto custOpApiHandler = GetOpApiLibHandler(GetCustOpApiLibName());
  if (custOpApiHandler != nullptr) {
    auto funcAddr =
        GetOpApiFuncAddrInLib(custOpApiHandler, GetCustOpApiLibName(), apiName);
    if (funcAddr != nullptr) {
      return funcAddr;
    }
  }

  static auto opApiHandler = GetOpApiLibHandler(GetOpApiLibName());
  if (opApiHandler == nullptr) {
    return nullptr;
  }
  return GetOpApiFuncAddrInLib(opApiHandler, GetOpApiLibName(), apiName);
}

// Reads the first element of `tensor` through its raw data pointer and wraps
// it in a c10::Scalar.
//
// Handled element types: Double, Long, Float, Int, Half, Bool, ComplexDouble,
// ComplexFloat and BFloat16. Bool is read as an int8_t, so the resulting
// Scalar is built from an integer. Any other element type yields a
// default-constructed Scalar. The data pointer is dereferenced on the host.
inline c10::Scalar ConvertTensorToScalar(const at::Tensor &tensor) {
  switch (tensor.scalar_type()) {
    case at::ScalarType::Double: {
      const double value = *static_cast<const double *>(tensor.data_ptr());
      return c10::Scalar(value);
    }
    case at::ScalarType::Long: {
      const int64_t value = *static_cast<const int64_t *>(tensor.data_ptr());
      return c10::Scalar(value);
    }
    case at::ScalarType::Float: {
      const float value = *static_cast<const float *>(tensor.data_ptr());
      return c10::Scalar(value);
    }
    case at::ScalarType::Int: {
      const int value = *static_cast<const int *>(tensor.data_ptr());
      return c10::Scalar(value);
    }
    case at::ScalarType::Half: {
      const c10::Half value =
          *static_cast<const c10::Half *>(tensor.data_ptr());
      return c10::Scalar(value);
    }
    case at::ScalarType::Bool: {
      const int8_t value = *static_cast<const int8_t *>(tensor.data_ptr());
      return c10::Scalar(value);
    }
    case at::ScalarType::ComplexDouble: {
      const c10::complex<double> value =
          *static_cast<const c10::complex<double> *>(tensor.data_ptr());
      return c10::Scalar(value);
    }
    case at::ScalarType::ComplexFloat: {
      const c10::complex<float> value =
          *static_cast<const c10::complex<float> *>(tensor.data_ptr());
      return c10::Scalar(value);
    }
    case at::ScalarType::BFloat16: {
      const c10::BFloat16 value =
          *static_cast<const c10::BFloat16 *>(tensor.data_ptr());
      return c10::Scalar(value);
    }
    default:
      return c10::Scalar();
  }
}

// Copies `cpu_tensor` to device index 0 of DEVICE_TYPE.
//
// The tensor is first moved to pinned host memory, then transferred with its
// element type unchanged; both boolean flags of Tensor::to are passed as true.
inline at::Tensor CopyTensorHostToDevice(const at::Tensor &cpu_tensor) {
  const at::Tensor pinned = cpu_tensor.pin_memory();
  const int device_index = 0;
  const bool non_blocking = true;
  const bool force_copy = true;
  const c10::Device target(DEVICE_TYPE, device_index);
  return pinned.to(target, pinned.scalar_type(), non_blocking, force_copy);
}

// Turns `cpu_scalar` into a tensor of type `scalar_data_type` and copies it
// to the device with CopyTensorHostToDevice.
inline at::Tensor CopyScalarToDevice(const c10::Scalar &cpu_scalar,
                                     at::ScalarType scalar_data_type) {
  const at::Tensor host_tensor =
      scalar_to_tensor(cpu_scalar).to(scalar_data_type);
  return CopyTensorHostToDevice(host_tensor);
}

// Builds an aclTensor describing `at_tensor`.
//
// Returns nullptr when aclCreateTensor cannot be resolved or when the tensor
// is undefined. Raises via TORCH_CHECK when the element type maps to
// ACL_DT_UNDEFINED, and via AT_ERROR when the item size is zero.
// The caller owns the returned handle (see Release(aclTensor *)).
inline aclTensor *ConvertType(const at::Tensor &at_tensor) {
  // Resolved once per process; stays null if the symbol is unavailable.
  static const auto aclCreateTensor = GET_OP_API_FUNC(aclCreateTensor);
  if (aclCreateTensor == nullptr) {
    return nullptr;
  }

  if (!at_tensor.defined()) {
    return nullptr;
  }
  at::ScalarType scalar_data_type = at_tensor.scalar_type();
  aclDataType acl_data_type =
      kATenScalarTypeToAclDataTypeTable[static_cast<int64_t>(scalar_data_type)];
  TORCH_CHECK(
      acl_data_type != ACL_DT_UNDEFINED,
      std::string(c10::toString(scalar_data_type)) + " has not been supported")
  c10::SmallVector<int64_t, 5> storageDims;
  // if acl_data_type is ACL_STRING, storageDims is empty.
  auto itemsize = at_tensor.itemsize();
  if (itemsize == 0) {
    // NOTE(review): AT_ERROR presumably throws, which would make the return
    // below unreachable - confirm.
    AT_ERROR("When ConvertType, tensor item size of cannot be zero.");
    return nullptr;
  }
  if (acl_data_type != ACL_STRING) {
    // One-dimensional storage shape: number of elements the whole underlying
    // storage holds (storage bytes divided by item size).
    storageDims.push_back(at_tensor.storage().nbytes() / itemsize);
  }

  // Pick the ACL layout from the number of dimensions; anything other than
  // 3, 4 or 5 dimensions is described as ACL_FORMAT_ND.
  const auto dimNum = at_tensor.sizes().size();
  aclFormat format = ACL_FORMAT_ND;
  switch (dimNum) {
    case 3:
      format = ACL_FORMAT_NCL;
      break;
    case 4:
      format = ACL_FORMAT_NCHW;
      break;
    case 5:
      format = ACL_FORMAT_NCDHW;
      break;
    default:
      format = ACL_FORMAT_ND;
  }

  if (at_tensor.unsafeGetTensorImpl()->is_wrapped_number()) {
    // Wrapped numbers are read on the host, copied to the device, and the
    // device copy is what gets described to ACL.
    // NOTE(review): `aclInput` is a local tensor while the returned aclTensor
    // keeps its storage pointer, and `storageDims` was computed from
    // `at_tensor` rather than `aclInput` - confirm both are intended.
    c10::Scalar expScalar = ConvertTensorToScalar(at_tensor);
    at::Tensor aclInput = CopyScalarToDevice(expScalar, scalar_data_type);
    return aclCreateTensor(aclInput.sizes().data(), aclInput.sizes().size(),
                           acl_data_type, aclInput.strides().data(),
                           aclInput.storage_offset(), format,
                           storageDims.data(), storageDims.size(),
                           const_cast<void *>(aclInput.storage().data()));
  }

  // Regular tensors are described in place: view sizes/strides/offset plus
  // the base pointer of the underlying storage.
  auto acl_tensor = aclCreateTensor(
      at_tensor.sizes().data(), at_tensor.sizes().size(), acl_data_type,
      at_tensor.strides().data(), at_tensor.storage_offset(), format,
      storageDims.data(), storageDims.size(),
      const_cast<void *>(at_tensor.storage().data()));
  return acl_tensor;
}

// Builds an aclScalar from `at_scalar`.
//
// Returns nullptr when aclCreateScalar cannot be resolved, or when the
// scalar's type is not one of Double, Long, Bool or ComplexDouble. Raises via
// TORCH_CHECK when the scalar type maps to ACL_DT_UNDEFINED.
inline aclScalar *ConvertType(const at::Scalar &at_scalar) {
  static const auto create_scalar = GET_OP_API_FUNC(aclCreateScalar);
  if (create_scalar == nullptr) {
    return nullptr;
  }

  const at::ScalarType scalar_kind = at_scalar.type();
  const aclDataType acl_kind =
      kATenScalarTypeToAclDataTypeTable[static_cast<int64_t>(scalar_kind)];
  TORCH_CHECK(
      acl_kind != ACL_DT_UNDEFINED,
      std::string(c10::toString(scalar_kind)) + " has not been supported");

  // The value is passed by address, so each case needs a local of the exact
  // type that matches `acl_kind`.
  switch (scalar_kind) {
    case at::ScalarType::Double: {
      double as_double = at_scalar.toDouble();
      return create_scalar(&as_double, acl_kind);
    }
    case at::ScalarType::Long: {
      int64_t as_long = at_scalar.toLong();
      return create_scalar(&as_long, acl_kind);
    }
    case at::ScalarType::Bool: {
      bool as_bool = at_scalar.toBool();
      return create_scalar(&as_bool, acl_kind);
    }
    case at::ScalarType::ComplexDouble: {
      auto as_complex = at_scalar.toComplexDouble();
      return create_scalar(&as_complex, acl_kind);
    }
    default:
      return nullptr;
  }
}

// Builds an aclIntArray from the elements of `at_array`.
// Returns nullptr when aclCreateIntArray cannot be resolved.
inline aclIntArray *ConvertType(const at::IntArrayRef &at_array) {
  static const auto create_int_array = GET_OP_API_FUNC(aclCreateIntArray);
  if (create_int_array != nullptr) {
    return create_int_array(at_array.data(), at_array.size());
  }
  return nullptr;
}

// Builds an aclBoolArray from a fixed-size std::array of bools.
// Returns nullptr when aclCreateBoolArray cannot be resolved.
template <std::size_t N>
inline aclBoolArray *ConvertType(const std::array<bool, N> &value) {
  static const auto create_bool_array = GET_OP_API_FUNC(aclCreateBoolArray);
  if (create_bool_array != nullptr) {
    return create_bool_array(value.data(), value.size());
  }
  return nullptr;
}

// Builds an aclBoolArray from an at::ArrayRef of bools.
// Returns nullptr when aclCreateBoolArray cannot be resolved.
inline aclBoolArray *ConvertType(const at::ArrayRef<bool> &value) {
  static const auto create_bool_array = GET_OP_API_FUNC(aclCreateBoolArray);
  if (create_bool_array != nullptr) {
    return create_bool_array(value.data(), value.size());
  }
  return nullptr;
}

// Builds an aclTensorList by converting every tensor of `at_tensor_list` with
// ConvertType(const at::Tensor &), in order.
// Returns nullptr when aclCreateTensorList cannot be resolved.
inline aclTensorList *ConvertType(const at::TensorList &at_tensor_list) {
  static const auto create_tensor_list = GET_OP_API_FUNC(aclCreateTensorList);
  if (create_tensor_list == nullptr) {
    return nullptr;
  }

  std::vector<const aclTensor *> converted;
  converted.reserve(at_tensor_list.size());
  for (const at::Tensor &tensor : at_tensor_list) {
    converted.push_back(ConvertType(tensor));
  }
  return create_tensor_list(converted.data(), converted.size());
}

// Converts an optional tensor; an empty optional or an undefined tensor is
// represented as nullptr.
inline aclTensor *ConvertType(const c10::optional<at::Tensor> &opt_tensor) {
  if (!opt_tensor.has_value() || !opt_tensor.value().defined()) {
    return nullptr;
  }
  return ConvertType(opt_tensor.value());
}

// Converts an optional integer array; an empty optional is represented as
// nullptr.
inline aclIntArray *ConvertType(
    const c10::optional<at::IntArrayRef> &opt_array) {
  if (!opt_array.has_value()) {
    return nullptr;
  }
  return ConvertType(opt_array.value());
}

// Converts an optional scalar; an empty optional is represented as nullptr.
inline aclScalar *ConvertType(const c10::optional<at::Scalar> &opt_scalar) {
  if (!opt_scalar.has_value()) {
    return nullptr;
  }
  return ConvertType(opt_scalar.value());
}

// Maps an at::ScalarType to its aclDataType through the lookup table.
// Unsupported types come back as ACL_DT_UNDEFINED; no check is performed here.
inline aclDataType ConvertType(const at::ScalarType scalarType) {
  const auto table_index = static_cast<int64_t>(scalarType);
  return kATenScalarTypeToAclDataTypeTable[table_index];
}

// Fallback conversion: any argument type without a dedicated ConvertType
// overload is passed through unchanged (by value).
template <typename T>
T ConvertType(T value) {
  return value;
}

// Casts `opApiAddr` to a function pointer returning int whose parameter list
// is the decayed element types of `params`, in tuple order.
//
// `params` is used only for its type; its values are never read. The caller
// must make sure the address really refers to a function with that signature.
template <typename Tuple, size_t... I>
auto ConvertToOpApiFunc(const Tuple &params, void *opApiAddr,
                        std::index_sequence<I...>) {
  using OpApiFunc = int (*)(std::decay_t<decltype(std::get<I>(params))>...);
  return reinterpret_cast<OpApiFunc>(opApiAddr);
}

// Convenience overload: derives the index sequence from the tuple size and
// forwards to the three-argument ConvertToOpApiFunc.
template <typename Tuple>
auto ConvertToOpApiFunc(const Tuple &params, void *opApiAddr) {
  using Indices = std::make_index_sequence<std::tuple_size<Tuple>::value>;
  return ConvertToOpApiFunc(params, opApiAddr, Indices{});
}

// Destroys an aclTensor handle with aclDestroyTensor.
// Does nothing when that function cannot be resolved.
inline void Release(aclTensor *p) {
  static const auto destroy_tensor = GET_OP_API_FUNC(aclDestroyTensor);
  if (destroy_tensor != nullptr) {
    destroy_tensor(p);
  }
}

// Destroys an aclScalar handle with aclDestroyScalar.
// Does nothing when that function cannot be resolved.
inline void Release(aclScalar *p) {
  static const auto destroy_scalar = GET_OP_API_FUNC(aclDestroyScalar);
  if (destroy_scalar != nullptr) {
    destroy_scalar(p);
  }
}

// Destroys an aclIntArray handle with aclDestroyIntArray.
// Does nothing when that function cannot be resolved.
inline void Release(aclIntArray *p) {
  static const auto destroy_int_array = GET_OP_API_FUNC(aclDestroyIntArray);
  if (destroy_int_array != nullptr) {
    destroy_int_array(p);
  }
}

// Destroys an aclBoolArray handle with aclDestroyBoolArray.
// Does nothing when that function cannot be resolved.
inline void Release(aclBoolArray *p) {
  static const auto destroy_bool_array = GET_OP_API_FUNC(aclDestroyBoolArray);
  if (destroy_bool_array != nullptr) {
    destroy_bool_array(p);
  }
}

// Destroys an aclTensorList handle with aclDestroyTensorList.
// Does nothing when that function cannot be resolved.
inline void Release(aclTensorList *p) {
  static const auto destroy_tensor_list =
      GET_OP_API_FUNC(aclDestroyTensorList);
  if (destroy_tensor_list != nullptr) {
    destroy_tensor_list(p);
  }
}

// Fallback Release: values of any type without a dedicated overload need no
// cleanup, so this is a no-op. The cast only silences unused-parameter
// warnings.
template <typename T>
void Release(T value) {
  (void)value;
}

// Calls Release on every element of `t`, from first to last.
//
// The pack is expanded inside a braced initializer so the calls are evaluated
// left to right; the array itself is a throw-away.
template <typename Tuple, size_t... I>
void CallRelease(Tuple t, std::index_sequence<I...>) {
  const int expand[] = {0, (Release(std::get<I>(t)), 0)...};
  (void)expand;
}

// Releases every element of a tuple produced by ConvertTypes by forwarding
// it, together with a matching index sequence, to CallRelease.
template <typename Tuple>
void ReleaseConvertTypes(Tuple &t) {
  using Indices = std::make_index_sequence<std::tuple_size<Tuple>::value>;
  CallRelease(t, Indices{});
}

// Applies the matching ConvertType overload to each argument and returns the
// results as a std::tuple, in argument order. Arguments are taken by lvalue
// reference, so every argument must be a named object.
template <typename... Ts>
constexpr auto ConvertTypes(Ts &...args) {
  return std::make_tuple(ConvertType(args)...);
}

// Invokes `f` with the elements of tuple `t` as positional arguments, using
// the supplied index sequence to unpack them. Returns whatever `f` returns.
template <typename Function, typename Tuple, size_t... I>
auto call(Function f, Tuple t, std::index_sequence<I...>) {
  return f(std::get<I>(t)...);
}

// Invokes `f` with every element of tuple `t` as an argument. Builds the
// index sequence for the tuple size and delegates to the overload above.
template <typename Function, typename Tuple>
auto call(Function f, Tuple t) {
  static constexpr auto size = std::tuple_size<Tuple>::value;
  return call(f, t, std::make_index_sequence<size>{});
}

// Appends the N bools of `value` to the per-thread hash buffer as raw bytes.
// MEMCPY_TO_BUF returns early (marking the buffer as overflowed) when the
// bytes do not fit.
template <std::size_t N>
void AddParamToBuf(const std::array<bool, N> &value) {
  MEMCPY_TO_BUF(value.data(), value.size() * sizeof(bool));
}

// Generic fallback: appends the sizeof(T) bytes of `value`'s object
// representation to the per-thread hash buffer. MEMCPY_TO_BUF returns early
// (marking the buffer as overflowed) when the bytes do not fit.
template <typename T>
void AddParamToBuf(const T &value) {
  MEMCPY_TO_BUF(&value, sizeof(T));
}

// AddParamToBuf overloads for specific argument kinds. They are only declared
// here; their definitions live in another translation unit.
void AddParamToBuf(const at::Tensor &);
void AddParamToBuf(const at::Scalar &);
void AddParamToBuf(const at::IntArrayRef &);
void AddParamToBuf(const at::ArrayRef<bool> &);
void AddParamToBuf(const at::TensorList &);
void AddParamToBuf(const c10::optional<at::Tensor> &);
void AddParamToBuf(const c10::optional<at::IntArrayRef> &);
void AddParamToBuf(const c10::optional<at::Scalar> &);
void AddParamToBuf(const at::ScalarType);
void AddParamToBuf(const string &);
// Zero-argument overload: what `AddParamToBuf(args...)` resolves to when the
// parameter pack of the variadic template is empty.
void AddParamToBuf();

// Variadic entry point: appends `arg` through the matching single-argument
// overload, then recurses on the remaining arguments, so parameters reach the
// buffer in call order.
template <typename T, typename... Args>
void AddParamToBuf(const T &arg, Args &...args) {
  AddParamToBuf(arg);
  AddParamToBuf(args...);
}

// Declared here, defined elsewhere.
// NOTE(review): presumably hashes the bytes accumulated in g_hashBuf by
// AddParamToBuf - confirm against the definition.
uint64_t CalcHashId();
// Signatures of the optional huge-memory hooks that EXEC_NPU_CMD resolves by
// name ("InitHugeMemThreadLocal", "UnInitHugeMemThreadLocal",
// "ReleaseHugeMem") and calls with (nullptr, false) when present.
typedef int (*InitHugeMemThreadLocal)(void *, bool);
typedef void (*UnInitHugeMemThreadLocal)(void *, bool);
typedef void (*ReleaseHugeMem)(void *, bool);

// Runs the two-phase aclnn operator `aclnn_api` with the given arguments on
// the current NPU stream.
//
// Steps, in order:
//   1. Resolve `<aclnn_api>GetWorkspaceSize`, `<aclnn_api>` and the optional
//      huge-memory hooks once (function-local statics) via GetOpApiFuncAddr.
//   2. Convert the arguments with ConvertTypes and call the workspace-size
//      function, which fills `workspace_size` and `executor`.
//   3. Allocate a byte tensor of `workspace_size` when it is non-zero.
//   4. Hand a custom handler to an at_npu::native::OpCommand and Run() it; the
//      handler invokes the operator, releases the converted arguments and
//      calls ReleaseHugeMem when available.
// Raises via TORCH_CHECK when either entry point cannot be resolved or when
// either call returns a non-zero status. Arguments must be lvalues because
// ConvertTypes takes them by reference.
// NOTE(review): `workspace_tensor` is destroyed at the end of its if-block,
// before cmd.Run(); `workspace_addr` presumably stays valid because of how the
// device allocator recycles memory - confirm.
#define EXEC_NPU_CMD(aclnn_api, ...)                                          \
  do {                                                                        \
    static const auto getWorkspaceSizeFuncAddr =                              \
        GetOpApiFuncAddr(#aclnn_api "GetWorkspaceSize");                      \
    static const auto opApiFuncAddr = GetOpApiFuncAddr(#aclnn_api);           \
    static const auto initMemAddr =                                           \
        GetOpApiFuncAddr("InitHugeMemThreadLocal");                           \
    static const auto unInitMemAddr =                                         \
        GetOpApiFuncAddr("UnInitHugeMemThreadLocal");                         \
    static const auto releaseMemAddr = GetOpApiFuncAddr("ReleaseHugeMem");    \
    TORCH_CHECK(                                                              \
        getWorkspaceSizeFuncAddr != nullptr && opApiFuncAddr != nullptr,      \
        #aclnn_api, " or ", #aclnn_api "GetWorkspaceSize", " not in ",        \
        GetOpApiLibName(), ", or ", GetOpApiLibName(), " not found.");        \
    auto acl_stream = c10_npu::getCurrentNPUStream().stream(false);           \
    uint64_t workspace_size = 0;                                              \
    uint64_t *workspace_size_addr = &workspace_size;                          \
    aclOpExecutor *executor = nullptr;                                        \
    aclOpExecutor **executor_addr = &executor;                                \
    InitHugeMemThreadLocal initMemFunc =                                      \
        reinterpret_cast<InitHugeMemThreadLocal>(initMemAddr);                \
    UnInitHugeMemThreadLocal unInitMemFunc =                                  \
        reinterpret_cast<UnInitHugeMemThreadLocal>(unInitMemAddr);            \
    if (initMemFunc) {                                                        \
      initMemFunc(nullptr, false);                                            \
    }                                                                         \
    auto converted_params =                                                   \
        ConvertTypes(__VA_ARGS__, workspace_size_addr, executor_addr);        \
    static auto getWorkspaceSizeFunc =                                        \
        ConvertToOpApiFunc(converted_params, getWorkspaceSizeFuncAddr);       \
    auto workspace_status = call(getWorkspaceSizeFunc, converted_params);     \
    TORCH_CHECK(workspace_status == 0,                                        \
                "call " #aclnn_api " failed, detail:", aclGetRecentErrMsg()); \
    void *workspace_addr = nullptr;                                           \
    if (workspace_size != 0) {                                                \
      at::TensorOptions options =                                             \
          at::TensorOptions(torch_npu::utils::get_npu_device_type());         \
      auto workspace_tensor =                                                 \
          at::empty({workspace_size}, options.dtype(kByte));                  \
      workspace_addr = const_cast<void *>(workspace_tensor.storage().data()); \
    }                                                                         \
    auto acl_call = [converted_params, workspace_addr, workspace_size,        \
                     acl_stream, executor]() -> int {                         \
      typedef int (*OpApiFunc)(void *, uint64_t, aclOpExecutor *,             \
                               const aclrtStream);                            \
      OpApiFunc opApiFunc = reinterpret_cast<OpApiFunc>(opApiFuncAddr);       \
      auto api_ret =                                                          \
          opApiFunc(workspace_addr, workspace_size, executor, acl_stream);    \
      TORCH_CHECK(api_ret == 0, "call " #aclnn_api " failed, detail:",        \
                  aclGetRecentErrMsg());                                      \
      ReleaseConvertTypes(converted_params);                                  \
      ReleaseHugeMem releaseMemFunc =                                         \
          reinterpret_cast<ReleaseHugeMem>(releaseMemAddr);                   \
      if (releaseMemFunc) {                                                   \
        releaseMemFunc(nullptr, false);                                       \
      }                                                                       \
      return api_ret;                                                         \
    };                                                                        \
    at_npu::native::OpCommand cmd;                                            \
    cmd.Name(#aclnn_api);                                                     \
    cmd.SetCustomHandler(acl_call);                                           \
    cmd.Run();                                                                \
    if (unInitMemFunc) {                                                      \
      unInitMemFunc(nullptr, false);                                          \
    }                                                                         \
  } while (false)

#endif  // MMCV_OPS_CSRC_COMMON_PYTORCH_NPU_UTIL_HPP_


================================================
FILE: mmcv/ops/csrc/common/utils/spconv/paramsgrid.h
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef PARAMS_GRID_H_
#define PARAMS_GRID_H_
#include <tuple>
#include <vector>

namespace detail {
// Base case: the number of combinations of one list is its length.
template <class scalar_t>
int getTotalSize(const std::vector<scalar_t> &arg) {
  return arg.size();
}

// Product of the lengths of all lists. Lists are taken by const reference so
// the recursion does not deep-copy every remaining vector at each level.
template <class scalar_t, class... TArgs>
int getTotalSize(const std::vector<scalar_t> &arg,
                 const std::vector<TArgs> &...args) {
  return arg.size() * getTotalSize(args...);
}

// Length of a single list, as int.
template <typename scalar_t>
int getSize(const std::vector<scalar_t> &arg) {
  return arg.size();
}

// Base case: writes element `counter[Idx]` of the last list into slot `Idx`
// of the tuple `src`.
template <int Idx, class TT, class scalar_t>
void assigner(TT &src, const std::vector<int> &counter,
              const std::vector<scalar_t> &arg) {
  std::get<Idx>(src) = arg[counter[Idx]];
}

// Writes element `counter[Idx]` of `arg` into slot `Idx` of `src`, then
// handles the remaining lists with Idx + 1. `counter` is shared by reference
// instead of being copied once per list per grid row.
template <int Idx, class TT, class scalar_t, class... TArgs>
void assigner(TT &src, const std::vector<int> &counter,
              const std::vector<scalar_t> &arg,
              const std::vector<TArgs> &...args) {
  std::get<Idx>(src) = arg[counter[Idx]];
  assigner<Idx + 1>(src, counter, args...);
}
}  // namespace detail

// Returns the Cartesian product of the given lists as a vector of tuples.
//
// Tuples are ordered with the last list varying fastest, e.g. {1, 2} x
// {'x', 'y'} yields (1,x), (1,y), (2,x), (2,y). If any list is empty the
// result is empty. At least one list must be supplied.
template <class... TArgs>
std::vector<std::tuple<TArgs...>> paramsGrid(std::vector<TArgs>... args) {
  int length = detail::getTotalSize(args...);
  std::vector<int> sizes = {detail::getSize(args)...};
  int size = sizes.size();

  std::vector<std::tuple<TArgs...>> params(length);
  // counter[k] is the index currently selected in list k; it is advanced like
  // a mixed-radix odometer whose least significant digit is the last list.
  std::vector<int> counter(size);
  for (int i = 0; i < length; ++i) {
    detail::assigner<0>(params[i], counter, args...);
    counter[size - 1] += 1;
    // Propagate carries from the last digit towards the first. Digit 0 is
    // never wrapped; it reaches sizes[0] only after the final row.
    for (int c = size - 1; c >= 0; --c) {
      if (counter[c] == sizes[c] && c > 0) {
        counter[c - 1] += 1;
        counter[c] = 0;
      }
    }
  }
  return params;
}

#endif


================================================
FILE: mmcv/ops/csrc/common/utils/spconv/prettyprint.h
================================================
//          Copyright Louis Delacroix 2010 - 2014.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)
//
// A pretty printing library for C++
//
// Usage:
// Include this header, and operator<< will "just work".

#ifndef H_PRETTY_PRINT
#define H_PRETTY_PRINT

#include <cstddef>
#include <iterator>
#include <memory>
#include <ostream>
#include <set>
#include <tuple>
#include <type_traits>
#include <unordered_set>
#include <utility>
#include <valarray>

namespace pretty_print {
namespace detail {
// SFINAE type trait to detect whether T::const_iterator exists.

// Shared return types for the overload-based detection below: `yes` and `no`
// have different sizes, so sizeof on the chosen overload's return type tells
// which overload was selected.
struct sfinae_base {
  using yes = char;
  using no = yes[2];
};

// value is true when T declares a nested type `const_iterator`.
template <typename T>
struct has_const_iterator : private sfinae_base {
 private:
  // Viable only if C::const_iterator names a type.
  template <typename C>
  static yes &test(typename C::const_iterator *);
  // Fallback, chosen when the overload above is removed by SFINAE.
  template <typename C>
  static no &test(...);

 public:
  static const bool value = sizeof(test<T>(nullptr)) == sizeof(yes);
  using type = T;
};

// beg_value / end_value are true when &T::begin / &T::end can be converted to
// a const member function taking no arguments and returning
// T::const_iterator.
template <typename T>
struct has_begin_end : private sfinae_base {
 private:
  // Viable only if &C::begin casts to `C::const_iterator (C::*)() const`.
  template <typename C>
  static yes &
  f(typename std::enable_if<
      std::is_same<decltype(static_cast<typename C::const_iterator (C::*)()
                                            const>(&C::begin)),
                   typename C::const_iterator (C::*)() const>::value>::type *);

  template <typename C>
  static no &f(...);

  // Same check as f, for &C::end.
  template <typename C>
  static yes &g(typename std::enable_if<
                std::is_same<decltype(static_cast<typename C::const_iterator (
                                          C::*)() const>(&C::end)),
                             typename C::const_iterator (C::*)() const>::value,
                void>::type *);

  template <typename C>
  static no &g(...);

 public:
  static bool const beg_value = sizeof(f<T>(nullptr)) == sizeof(yes);
  static bool const end_value = sizeof(g<T>(nullptr)) == sizeof(yes);
};

}  // namespace detail

// Holds the delimiter values for a specific character type.
// Each pointer may be null, in which case the printing code skips that piece.

template <typename TChar>
struct delimiters_values {
  using char_type = TChar;
  const char_type *prefix;     // written before the first element
  const char_type *delimiter;  // written between consecutive elements
  const char_type *postfix;    // written after the last element
};

// Defines the delimiter values for a specific container and character type.
// The primary template only declares `values`; definitions are supplied by
// the specializations for char and wchar_t.

template <typename T, typename TChar>
struct delimiters {
  using type = delimiters_values<TChar>;
  static const type values;
};

// Callable wrapper that writes a container to a stream as
// prefix, elements separated by the delimiter, postfix.
// Use it directly to print with a non-default delimiters type. How the body
// is written can be customized by specializing the nested `printer` template.

template <typename T, typename TChar = char,
          typename TCharTraits = ::std::char_traits<TChar>,
          typename TDelimiters = delimiters<T, TChar>>
struct print_container_helper {
  using delimiters_type = TDelimiters;
  using ostream_type = std::basic_ostream<TChar, TCharTraits>;

  // Default body printer: streams each element of an iterable, inserting the
  // delimiter (when non-null) between neighbours.
  template <typename U>
  struct printer {
    static void print_body(const U &c, ostream_type &stream) {
      using std::begin;
      using std::end;

      auto cursor = begin(c);
      const auto last = end(c);

      bool is_first = true;
      for (; cursor != last; ++cursor) {
        if (!is_first && delimiters_type::values.delimiter != NULL)
          stream << delimiters_type::values.delimiter;
        stream << *cursor;
        is_first = false;
      }
    }
  };

  // Keeps a reference to `container`; the container must outlive the helper.
  print_container_helper(const T &container) : container_(container) {}

  // Writes prefix, body and postfix to `stream`; null prefix/postfix pointers
  // are skipped.
  inline void operator()(ostream_type &stream) const {
    const auto &marks = delimiters_type::values;

    if (marks.prefix != NULL) stream << marks.prefix;

    printer<T>::print_body(container_, stream);

    if (marks.postfix != NULL) stream << marks.postfix;
  }

 private:
  const T &container_;
};

// Body printer for std::pair: writes first, the delimiter (when non-null),
// then second.

template <typename T, typename TChar, typename TCharTraits,
          typename TDelimiters>
template <typename T1, typename T2>
struct print_container_helper<T, TChar, TCharTraits,
                              TDelimiters>::printer<std::pair<T1, T2>> {
  using ostream_type =
      typename print_container_helper<T, TChar, TCharTraits,
                                      TDelimiters>::ostream_type;

  static void print_body(const std::pair<T1, T2> &c, ostream_type &stream) {
    using delims =
        typename print_container_helper<T, TChar, TCharTraits,
                                        TDelimiters>::delimiters_type;
    const auto *separator = delims::values.delimiter;

    stream << c.first;
    if (separator != NULL) stream << separator;
    stream << c.second;
  }
};

// Specialization for tuples: prints the elements in index order, separated by
// the delimiter. The recursion over indices is driven by overloads of
// tuple_print tagged with Int<I>.

template <typename T, typename TChar, typename TCharTraits,
          typename TDelimiters>
template <typename... Args>
struct print_container_helper<T, TChar, TCharTraits,
                              TDelimiters>::printer<std::tuple<Args...>> {
  using ostream_type =
      typename print_container_helper<T, TChar, TCharTraits,
                                      TDelimiters>::ostream_type;
  using element_type = std::tuple<Args...>;

  // Tag type carrying the index of the element to print next.
  template <std::size_t I>
  struct Int {};

  static void print_body(const element_type &c, ostream_type &stream) {
    tuple_print(c, stream, Int<0>());
  }

  // End of recursion: the index equals the tuple size, nothing left to print.
  static void tuple_print(const element_type &, ostream_type &,
                          Int<sizeof...(Args)>) {}

  // First element: printed without a leading delimiter. For an empty tuple
  // the tag parameter becomes std::nullptr_t, so this overload does not
  // collide with the end-of-recursion overload above.
  static void tuple_print(
      const element_type &c, ostream_type &stream,
      typename std::conditional<sizeof...(Args) != 0, Int<0>,
                                std::nullptr_t>::type) {
    stream << std::get<0>(c);
    tuple_print(c, stream, Int<1>());
  }

  // Remaining elements: delimiter (when non-null), then element N, then
  // recurse on N + 1.
  template <std::size_t N>
  static void tuple_print(const element_type &c, ostream_type &stream, Int<N>) {
    if (print_container_helper<T, TChar, TCharTraits,
                               TDelimiters>::delimiters_type::values
            .delimiter != NULL)
      stream << print_container_helper<T, TChar, TCharTraits,
                                       TDelimiters>::delimiters_type::values
                    .delimiter;

    stream << std::get<N>(c);

    tuple_print(c, stream, Int<N + 1>());
  }
};

// Prints a print_container_helper to the specified stream by invoking its
// call operator, and returns the stream so insertions can be chained.

template <typename T, typename TChar, typename TCharTraits,
          typename TDelimiters>
inline std::basic_ostream<TChar, TCharTraits> &operator<<(
    std::basic_ostream<TChar, TCharTraits> &stream,
    const print_container_helper<T, TChar, TCharTraits, TDelimiters> &helper) {
  helper(stream);
  return stream;
}

// Basic is_container template; specialize to derive from std::true_type for all
// desired container types.
// The primary template is true when T has a nested const_iterator type and
// const begin()/end() members returning it (see the detail traits).

template <typename T>
struct is_container
    : public std::integral_constant<bool,
                                    detail::has_const_iterator<T>::value &&
                                        detail::has_begin_end<T>::beg_value &&
                                        detail::has_begin_end<T>::end_value> {};

// Built-in arrays count as containers...
template <typename T, std::size_t N>
struct is_container<T[N]> : std::true_type {};

// ...except arrays of char, which are excluded.
template <std::size_t N>
struct is_container<char[N]> : std::false_type {};

// Types explicitly opted in: valarray, pair and tuple.
template <typename T>
struct is_container<std::valarray<T>> : std::true_type {};

template <typename T1, typename T2>
struct is_container<std::pair<T1, T2>> : std::true_type {};

template <typename... Args>
struct is_container<std::tuple<Args...>> : std::true_type {};

// Default delimiters: any type without a more specific specialization is
// printed as "[a, b, c]", for both char and wchar_t streams.

template <typename T>
struct delimiters<T, char> {
  static const delimiters_values<char> values;
};
template <typename T>
const delimiters_values<char> delimiters<T, char>::values = {"[", ", ", "]"};
template <typename T>
struct delimiters<T, wchar_t> {
  static const delimiters_values<wchar_t> values;
};
template <typename T>
const delimiters_values<wchar_t> delimiters<T, wchar_t>::values = {L"[", L", ",
                                                                   L"]"};

// Delimiters for (multi)set and unordered_(multi)set

// std::set is printed as "{a, b, c}" (char and wchar_t variants).
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::set<T, TComp, TAllocator>, char> {
  static const delimiters_values<char> values;
};

template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
    delimiters<::std::set<T, TComp, TAllocator>, char>::values = {"{", ", ",
                                                                  "}"};

template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::set<T, TComp, TAllocator>, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
    delimiters<::std::set<T, TComp, TAllocator>, wchar_t>::values = {
        L"{", L", ", L"}"};

// std::multiset is printed as "{a, b, c}" (char and wchar_t variants).
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::multiset<T, TComp, TAllocator>, char> {
  static const delimiters_values<char> values;
};

template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
    delimiters<::std::multiset<T, TComp, TAllocator>, char>::values = {
        "{", ", ", "}"};

template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
    delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t>::values = {
        L"{", L", ", L"}"};

// std::unordered_set is printed as "{a, b, c}" (char and wchar_t variants).
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, char> {
  static const delimiters_values<char> values;
};

template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters<
    ::std::unordered_set<T, THash, TEqual, TAllocator>, char>::values = {
    "{", ", ", "}"};

template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t> delimiters<
    ::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t>::values = {
    L"{", L", ", L"}"};

// std::unordered_multiset is printed as "{a, b, c}" (char and wchar_t
// variants).
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
                  char> {
  static const delimiters_values<char> values;
};

template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters<
    ::std::unordered_multiset<T, THash, TEqual, TAllocator>, char>::values = {
    "{", ", ", "}"};

template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
                  wchar_t> {
  static const delimiters_values<wchar_t> values;
};

template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t>
    delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
               wchar_t>::values = {L"{", L", ", L"}"};

// Delimiters for pair and tuple

// Narrow-character delimiter strings used for std::pair: "(", ", ", ")".
template <typename First, typename Second>
struct delimiters<::std::pair<First, Second>, char> {
  static const delimiters_values<char> values;
};

// Out-of-class definition of the static member declared above.
template <typename First, typename Second>
const delimiters_values<char>
    delimiters<::std::pair<First, Second>, char>::values = {"(", ", ", ")"};
// Wide-character delimiter strings used for std::pair: L"(", L", ", L")".
template <typename First, typename Second>
struct delimiters<::std::pair<First, Second>, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

// Out-of-class definition of the static member declared above.
template <typename First, typename Second>
const delimiters_values<wchar_t>
    delimiters<::std::pair<First, Second>, wchar_t>::values = {
        L"(", L", ", L")"};

// Narrow-character delimiter strings used for std::tuple: "(", ", ", ")".
template <typename... Elems>
struct delimiters<::std::tuple<Elems...>, char> {
  static const delimiters_values<char> values;
};

// Out-of-class definition of the static member declared above.
template <typename... Elems>
const delimiters_values<char>
    delimiters<::std::tuple<Elems...>, char>::values = {"(", ", ", ")"};
// Wide-character delimiter strings used for std::tuple: L"(", L", ", L")".
template <typename... Elems>
struct delimiters<::std::tuple<Elems...>, wchar_t> {
  static const delimiters_values<wchar_t> values;
};

// Out-of-class definition of the static member declared above.
template <typename... Elems>
const delimiters_values<wchar_t>
    delimiters<::std::tuple<Elems...>, wchar_t>::values = {
        L"(", L", ", L")"};

// Type-erasing helper class for easy use of custom delimiters.
// Requires TCharTraits = std::char_traits<TChar> and TChar = char or wchar_t,
// and MyDelims needs to be defined for TChar. Usage: "cout <<
// pretty_print::custom_delims<MyDelims>(x)".

// Abstract interface with one streaming entry point per supported stream
// type (narrow and wide). Concrete printers derive from it and are used
// through a base pointer, hence the virtual destructor.
struct custom_delims_base {
  virtual ~custom_delims_base() = default;

  // Writes to a narrow stream and returns a narrow stream reference.
  virtual ::std::ostream &stream(::std::ostream &) = 0;

  // Writes to a wide stream and returns a wide stream reference.
  virtual ::std::wostream &stream(::std::wostream &) = 0;
};

template <typename T, typename Delims>
struct custom_delims_wrapper : custom_delims_base {
  custom_delims_wrapper(const T &t_) : t(t_) {}

  std::ostream &stream(std::ostream &s) {
    return s << print_container_helper<T, char, std::char_traits<char>, Delims>(
               t);
  }

  std::wostream &stream(std::wostream &s) {
    return s << print_container_helper<T, wchar_t, std::char_traits<wchar_t>,
                                       Delims>(t);
  }

 private:
  const T &t;
};

// Owning handle around a heap-allocated custom_delims_wrapper. The constructor
// is a template so the wrapped type is fixed at the call site while the handle
// itself depends only on Delims.
template <typename Delims>
struct custom_delims {
  // Allocates a wrapper bound to `source`; the wrapper is constructed from a
  // const reference to it.
  template <typename Source>
  custom_delims(const Source &source)
      : base(new custom_delims_wrapper<Source, Delims>(source)) {}

  // Type-erased printer, read by the streaming operator.
  std::unique_ptr<custom_delims_base> base;
};

template <typename TChar, typename TCharTraits, typename Delims>
inline std::basic_ostream<TChar, TCharTraits> &operator<<(
    std::basic_ostream<TChar, TCharTraits> &s, const custom_delims<Delims> &p) {
  return p.base->stream(s);
}

// A wrapper for a C-style array given as pointer-plus-size.
// Usage: std::cout << pretty_print_array(arr, n) << std::endl;

// Non-owning view over `n` consecutive elements starting at a raw pointer,
// exposing begin()/end() plus the const_iterator / value_type member types.
template <typename T>
struct array_wrapper_n {
  using value_type = T;
  using const_iterator = const T *;

  array_wrapper_n(const T *const a, size_t n) : first_(a), count_(n) {}

  // Pointer to the first element.
  const_iterator begin() const { return first_; }

  // Pointer one past the last element.
  const_iterator end() const { return first_ + count_; }

 private:
  const T *const first_;
  size_t count_;
};

// A wrapper for hash-table based containers that offer local iterators to each
// bucket. Usage: std::cout << bucket_print(m, 4) << std::endl;  (Prints the
// bucket with index 4, i.e. the fifth bucket, of container m.)

// Range adaptor over a single bucket of container T: begin()/end() return the
// container's local iterators (cbegin(n)/cend(n)) for the stored bucket index.
// Holds a reference to the container, which therefore must outlive the
// wrapper.
template <typename T>
struct bucket_print_wrapper {
  using size_type = typename T::size_type;
  using const_iterator = typename T::const_local_iterator;

  bucket_print_wrapper(const T &m, size_type bucket)
      : table_(m), bucket_(bucket) {}

  const_iterator begin() const { return table_.cbegin(bucket_); }

  const_iterator end() const { return table_.cend(bucket_); }

 private:
  const T &table_;
  const size_type bucket_;
};

}  // namespace pretty_print

// Global accessor functions for the convenience wrappers

// Convenience factory: wraps the pointer/length pair in a
// pretty_print::array_wrapper_n<T> so the element type is deduced.
template <typename T>
inline pretty_print::array_wrapper_n<T> pretty_print_array(const T *const a,
                                                           size_t n) {
  typedef pretty_print::array_wrapper_n<T> wrapper_type;
  return wrapper_type(a, n);
}

// Convenience factory: builds a pretty_print::bucket_print_wrapper<T> for
// bucket index `n` of container `m`, deducing the container type.
template <typename T>
pretty_print::bucket_print_wrapper<T> bucket_print(const T &m,
                                                   typename T::size_type n) {
  typedef pretty_print::bucket_print_wrapper<T> wrapper_type;
  return wrapper_type(m, n);
}

// Main magic entry point: An overload snuck into namespace std.
// Can we do better?

namespace std {
// Stream-insertion overload that participates in overload resolution only
// when ::pretty_print::is_container<TContainer>::value is true. It wraps the
// argument in a print_container_helper (default delimiters) and streams that.

template <typename TContainer, typename TChar, typename TCharTraits>
inline typename enable_if<::pretty_print::is_container<TContainer>::value,
                          basic_ostream<TChar, TCharTraits> &>::type
operator<<(basic_ostream<TChar, TCharTraits> &stream,
           const TContainer &container) {
  typedef ::pretty_print::print_container_helper<TContainer, TChar,
                                                 TCharTraits>
      helper_type;
  return stream << helper_type(container);
}
}  // namespace std

#endif  // H_PRETTY_PRINT


================================================
FILE: mmcv/ops/csrc/common/utils/spconv/pybind11_utils.h
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <pybind11/embed.h>
#include <pybind11/functional.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <spconv/tensorview/tensorview.h>

#include <algorithm>
#include <iostream>

namespace py = pybind11;

template <typename scalar_t, typename TPyObject>
std::vector<scalar_t> array2Vector(TPyObject arr) {
  py::array arr_np = arr;
  size_t size = arr.attr("size").template cast<size_t>();
  py::array_t<scalar_t> arr_cc = arr_np;
  std::vector<scalar_t> data(arr_cc.data(), arr_cc.data() + size);
  return data;
}

// Copies arr.size() elements, starting at arr.data(), into a new std::vector.
template <typename scalar_t>
std::vector<scalar_t> arrayT2Vector(py::array_t<scalar_t> arr) {
  return std::vector<scalar_t>(arr.data(), arr.data() + arr.size());
}

// Builds a tv::TensorView over the data of a Python array-like object. The
// view's shape is collected dimension by dimension from the typed array.
// NOTE(review): the returned view points at the buffer of the local
// py::array_t; if that conversion ever yields a fresh array rather than
// aliasing the caller's, the view would outlive its storage -- confirm callers
// always pass arrays that already have dtype scalar_t.
template <typename scalar_t, typename TPyObject>
tv::TensorView<scalar_t> array2TensorView(TPyObject arr) {
  py::array untyped = arr;
  py::array_t<scalar_t> typed = untyped;
  tv::Shape dims;
  for (int axis = 0; axis < typed.ndim(); ++axis) {
    dims.push_back(typed.shape(axis));
  }
  scalar_t *storage = typed.mutable_data();
  return tv::TensorView<scalar_t>(storage, dims);
}
// Builds a tv::TensorView over the mutable data of an already typed
// py::array_t, copying its per-dimension extents into a tv::Shape.
template <typename scalar_t>
tv::TensorView<scalar_t> arrayT2TensorView(py::array_t<scalar_t> arr) {
  tv::Shape dims;
  for (int axis = 0; axis < arr.ndim(); ++axis) {
    dims.push_back(arr.shape(axis));
  }
  scalar_t *storage = arr.mutable_data();
  return tv::TensorView<scalar_t>(storage, dims);
}


================================================
FILE: mmcv/ops/csrc/common/utils/spconv/spconv/geometry.h
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef SPCONV_GEOMETRY_H_
#define SPCONV_GEOMETRY_H_

#include <utils/spconv/tensorview/tensorview.h>

#include <iostream>
#include <limits>

// Enumerates the candidate output positions for one input position and writes
// the in-bounds ones to `out`.
//
// For every dimension a [lo, hi] range of output coordinates is derived from
// the input coordinate, kernel size, stride, padding and dilation. The
// Cartesian product of those ranges is walked from `hi` downwards in steps of
// `dilation`, with the last dimension varying fastest.
//
// Each candidate is written as a row of NDim coordinates followed by one
// offset value. The offset is a mixed-radix number built with kernelSize as
// the radices (presumably the flat kernel element index -- confirm against
// callers). A row is kept only if every coordinate lies in
// [0, outSpatialShape[d] - 1]; a rejected row is overwritten by the next
// candidate.
//
// Returns the number of rows kept. `out` must have room for every candidate
// row, since rejected rows are still written before being overwritten.
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
                                    const Index *kernelSize,
                                    const Index *stride, const Index *padding,
                                    const Index *dilation,
                                    const Index *outSpatialShape, Index *out) {
  Index lo[NDim];
  Index hi[NDim];
  Index step[NDim];
  Index extent[NDim];
  Index total = 1;
#pragma unroll
  for (unsigned d = 0; d < NDim; ++d) {
    lo[d] = (input_pos[d] - (kernelSize[d] - 1) * dilation[d] - 1 + stride[d] +
             padding[d]) /
            stride[d];
    hi[d] = (input_pos[d] + padding[d]) / stride[d];
    extent[d] = ((hi[d] - lo[d]) / dilation[d] + 1);
    total *= extent[d];
    step[d] = 0;
  }

  Index kept = 0;
  for (int n = 0; n < total; ++n) {
    // Row currently being filled; reused if the candidate is rejected.
    Index *row = out + kept * (NDim + 1);
    bool inside = true;
    Index radix = 1;
    Index kernelOffset = 0;
#pragma unroll
    for (int d = NDim - 1; d >= 0; --d) {
      const Index pos = hi[d] - step[d] * dilation[d];
      row[d] = pos;
      inside = inside && pos >= 0 && pos <= outSpatialShape[d] - 1;
      kernelOffset +=
          radix * (input_pos[d] - pos * stride[d] + padding[d]) / dilation[d];
      radix *= kernelSize[d];
    }
    row[NDim] = kernelOffset;
    if (inside) {
      ++kept;
    }

    // Advance the per-dimension counters like an odometer; the leading
    // dimension never wraps.
    step[NDim - 1] += 1;
#pragma unroll
    for (int d = NDim - 1; d >= 0; --d) {
      if (d > 0 && step[d] == extent[d]) {
        step[d - 1] += 1;
        step[d] = 0;
      }
    }
  }
  return kept;
}

// Transposed counterpart of the output-position enumeration.
//
// For every dimension the candidate range starts at
// lo = input_pos * stride - padding and ends at
// hi = lo + (kernelSize - 1) * dilation. The Cartesian product is walked from
// `hi` downwards in steps of `dilation`, with the last dimension varying
// fastest.
//
// Each candidate is written as a row of NDim coordinates followed by one
// offset value. The offset is a mixed-radix number with kernelSize as the
// radices and (pos - lo) / dilation as the digits. A row is kept only if
// every coordinate lies in [0, outSpatialShape[d] - 1]; a rejected row is
// overwritten by the next candidate.
//
// Returns the number of rows kept.
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPosTranspose(
    const Index *input_pos, const Index *kernelSize, const Index *stride,
    const Index *padding, const Index *dilation, const Index *outSpatialShape,
    Index *out) {
  Index lo[NDim];
  Index hi[NDim];
  Index step[NDim];
  Index extent[NDim];
  Index total = 1;
#pragma unroll
  for (unsigned d = 0; d < NDim; ++d) {
    lo[d] = input_pos[d] * stride[d] - padding[d];
    hi[d] = lo[d] + (kernelSize[d] - 1) * dilation[d];
    extent[d] = ((hi[d] - lo[d]) / dilation[d] + 1);
    total *= extent[d];
    step[d] = 0;
  }

  Index kept = 0;
  for (int n = 0; n < total; ++n) {
    // Row currently being filled; reused if the candidate is rejected.
    Index *row = out + kept * (NDim + 1);
    bool inside = true;
    Index radix = 1;
    Index kernelOffset = 0;
#pragma unroll
    for (int d = NDim - 1; d >= 0; --d) {
      const Index pos = hi[d] - step[d] * dilation[d];
      row[d] = pos;
      inside = inside && pos >= 0 && pos <= outSpatialShape[d] - 1;
      kernelOffset += radix * (pos - lo[d]) / dilation[d];
      radix *= kernelSize[d];
    }
    row[NDim] = kernelOffset;
    if (inside) {
      ++kept;
    }

    // Advance the per-dimension counters like an odometer; the leading
    // dimension never wraps.
    step[NDim - 1] += 1;
#pragma unroll
    for (int d = NDim - 1; d >= 0; --d) {
      if (d > 0 && step[d] == extent[d]) {
        step[d - 1] += 1;
        step[d] = 0;
      }
    }
  }
  return kept;
}

// Builds the input/output index pairs for every active input row.
//
// Each row of `indicesIn` is read as (batch, coord_0 .. coord_{NDim-1}). Its
// in-bounds output positions are obtained from getValidOutPos. A position
// whose `gridsOut` cell still holds -1 is appended to `indicesOut` and given
// the next output index, which is recorded in that cell. For every position,
// the (input row, output index) pair is stored in
// `indicePairs(offset, 0/1, slot)` and `indiceNum[offset]` is incremented.
//
// Returns the number of output rows appended to `indicesOut`.
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
                         tv::TensorView<Index> indicesOut,
                         tv::TensorView<IndexGrid> gridsOut,
                         tv::TensorView<Index> indicePairs,
                         tv::TensorView<Index> indiceNum,
                         const Index *kernelSize, const Index *stride,
                         const Index *padding, const Index *dilation,
                         const Index *outSpatialShape) {
  auto inputCount = indicesIn.dim(0);
  Index gridVolume = 1;
  Index kernelVolume = 1;
#pragma unroll
  for (int d = 0; d < NDim; ++d) {
    gridVolume *= outSpatialShape[d];
    kernelVolume *= kernelSize[d];
  }
  // Scratch rows of (NDim coordinates + offset), one per kernel element.
  std::vector<Index> scratch(kernelVolume * (NDim + 1));
  Index *const rows = scratch.data();

  Index outputCount = 0;
  for (int src = 0; src < inputCount; ++src) {
    const Index batch = indicesIn(src, 0);
    const Index hits = getValidOutPos<Index, NDim>(
        indicesIn.data() + src * (NDim + 1) + 1, kernelSize, stride, padding,
        dilation, outSpatialShape, rows);
    for (Index h = 0; h < hits; ++h) {
      Index *row = rows + h * (NDim + 1);
      auto kernelOffset = row[NDim];
      auto cell = tv::rowArrayIdx<Index, NDim>(row, outSpatialShape) +
                  gridVolume * batch;
      if (gridsOut[cell] == -1) {
        // First visit of this output cell: register a new output row.
        indicesOut(outputCount, 0) = batch;
        for (unsigned k = 1; k < NDim + 1; ++k) {
          indicesOut(outputCount, k) = row[k - 1];
        }
        gridsOut[cell] = outputCount++;
      }
      // indicePairs: [K, 2, L]
      const Index slot = indiceNum[kernelOffset];
      indicePairs(kernelOffset, 0, slot) = src;
      indicePairs(kernelOffset, 1, slot) = gridsOut[cell];
      indiceNum[kernelOffset]++;
    }
  }
  return outputCount;
}

// Transposed-convolution variant of the index-pair builder.
//
// Same bookkeeping as the forward variant, except that candidate output
// positions come from getValidOutPosTranspose. A position whose `gridsOut`
// cell still holds -1 is appended to `indicesOut` and given the next output
// index. For every position, the (input row, output index) pair is stored in
// `indicePairs(offset, 0/1, slot)` and `indiceNum[offset]` is incremented.
//
// Returns the number of output rows appended to `indicesOut`.
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
                           tv::TensorView<Index> indicesOut,
                           tv::TensorView<IndexGrid> gridsOut,
                           tv::TensorView<Index> indicePairs,
                           tv::TensorView<Index> indiceNum,
                           const Index *kernelSize, const Index *stride,
                           const Index *padding, const Index *dilation,
                           const Index *outSpatialShape) {
  auto inputCount = indicesIn.dim(0);
  Index gridVolume = 1;
  Index kernelVolume = 1;
#pragma unroll
  for (int d = 0; d < NDim; ++d) {
    gridVolume *= outSpatialShape[d];
    kernelVolume *= kernelSize[d];
  }
  // Scratch rows of (NDim coordinates + offset), one per kernel element.
  std::vector<Index> scratch(kernelVolume * (NDim + 1));
  Index *const rows = scratch.data();

  Index outputCount = 0;
  for (int src = 0; src < inputCount; ++src) {
    const Index batch = indicesIn(src, 0);
    const Index hits = getValidOutPosTranspose<Index, NDim>(
        indicesIn.data() + src * (NDim + 1) + 1, kernelSize, stride, padding,
        dilation, outSpatialShape, rows);
    for (Index h = 0; h < hits; ++h) {
      Index *row = rows + h * (NDim + 1);
      auto kernelOffset = row[NDim];
      auto cell = tv::rowArrayIdx<Index, NDim>(row, outSpatialShape) +
                  gridVolume * batch;
      if (gridsOut[cell] == -1) {
        // First visit of this output cell: register a new output row.
        indicesOut(outputCount, 0) = batch;
        for (unsigned k = 1; k < NDim + 1; ++k) {
          indicesOut(outputCount, k) = row[k - 1];
        }
        gridsOut[cell] = outputCount++;
      }
      // indicePairs: [K, 2, L]
      const Index slot = indiceNum[kernelOffset];
      indicePairs(kernelOffset, 0, slot) = src;
      indicePairs(kernelOffset, 1, slot) = gridsOut[cell];
      indiceNum[kernelOffset]++;
    }
  }
  return outputCount;
}

// Index-pair builder in which the set of output rows equals the set of input
// rows.
//
// Pass 1 records, for every input row, its row number in the `gridsOut` cell
// addressed by its batch and coordinates. Pass 2 enumerates the in-bounds
// positions from getValidOutPos for each input row and stores a pair only
// when the addressed cell holds a value greater than -1.
//
// Returns the number of input rows.
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
                         tv::TensorView<IndexGrid> gridsOut,
                         tv::TensorView<Index> indicePairs,
                         tv::TensorView<Index> indiceNum,
                         const Index *const kernelSize,
                         const Index *const stride, const Index *const padding,
                         const Index *dilation,
                         const Index *const outSpatialShape) {
  auto inputCount = indicesIn.dim(0);
  Index gridVolume = 1;
  Index kernelVolume = 1;
#pragma unroll
  for (int d = 0; d < NDim; ++d) {
    gridVolume *= outSpatialShape[d];
    kernelVolume *= kernelSize[d];
  }
  // Scratch rows of (NDim coordinates + offset), one per kernel element.
  std::vector<Index> scratch(kernelVolume * (NDim + 1));
  Index *const rows = scratch.data();

  // Pass 1: mark every active input cell with its row number.
  for (int src = 0; src < inputCount; ++src) {
    const Index cell =
        tv::rowArrayIdx<Index, NDim>(indicesIn.data() + src * (NDim + 1) + 1,
                                     outSpatialShape) +
        gridVolume * indicesIn(src, 0);
    gridsOut[cell] = src;
  }

  // Pass 2: pair each input row with the active cells among its candidates.
  for (int src = 0; src < inputCount; ++src) {
    const Index hits = getValidOutPos<Index, NDim>(
        indicesIn.data() + src * (NDim + 1) + 1, kernelSize, stride, padding,
        dilation, outSpatialShape, rows);
    for (Index h = 0; h < hits; ++h) {
      Index *row = rows + h * (NDim + 1);
      auto kernelOffset = row[NDim];
      const Index cell = tv::rowArrayIdx<Index, NDim>(row, outSpatialShape) +
                         gridVolume * indicesIn(src, 0);
      if (gridsOut[cell] > -1) {
        const Index slot = indiceNum[kernelOffset];
        indicePairs(kernelOffset, 0, slot) = src;
        indicePairs(kernelOffset, 1, slot) = gridsOut[cell];
        indiceNum[kernelOffset]++;
      }
    }
  }
  return inputCount;
}

#endif


================================================
FILE: mmcv/ops/csrc/common/utils/spconv/spconv/indice.h
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_
#define SPARSE_CONV_INDICE_FUNCTOR_H_
#include <utils/spconv/tensorview/tensorview.h>

// Functor templates for building convolution indice pairs. Only the call
// signatures are declared in this header; no operator() body is defined here
// (presumably they are defined per Device in other translation units --
// confirm).
namespace functor {
// Declares the "P1" call signature. Compared with CreateConvIndicePairFunctor
// it takes an extra `indicePairUnique` view and has no `resetGrid` flag.
// NOTE(review): presumably the first of two phases -- confirm against the
// definition.
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1 {
  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose);
};

// Declares the "P2" call signature. It takes the same views as P1 but, of the
// geometry parameters, only `outSpatialShape`; `resetGrid` defaults to false.
// NOTE(review): presumably the second of two phases -- confirm against the
// definition.
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2 {
  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid = false);
};

// Declares the single-call signature: all views plus the full set of geometry
// parameters, a `transpose` flag and an optional `resetGrid` flag (default
// false).
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor {
  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid = false);
};

// Declares the "SubM" call signature. Unlike the functors above it has no
// `indicesOut` parameter.
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor {
  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid = false);
};
}  // namespace functor

#endif


================================================
FILE: mmcv/ops/csrc/common/utils/spconv/spconv/maxpool.h
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef SPARSE_MAXPOOL_FUNCTOR_H_
#define SPARSE_MAXPOOL_FUNCTOR_H_
#include <utils/spconv/tensorview/tensorview.h>

// Functor templates for sparse max pooling. Only the call signatures are
// declared in this header; no operator() body is defined here (presumably
// they are defined per Device in other translation units -- confirm).
namespace functor {
// Forward call signature: `outFeatures` is the only non-const view.
template <typename Device, typename scalar_t, typename Index>
struct SparseMaxPoolForwardFunctor {
  void operator()(const Device& d, tv::TensorView<scalar_t> outFeatures,
                  tv::TensorView<const scalar_t> inFeatures,
                  tv::TensorView<const Index> indices, int size);
};

// Backward call signature: `fin` is the only non-const view.
// NOTE(review): `fout` / `fin` are presumably the output-side and input-side
// gradients -- confirm against the definition.
template <typename Device, typename scalar_t, typename Index>
struct SparseMaxPoolBackwardFunctor {
  void operator()(const Device& d, tv::TensorView<const scalar_t> outFeatures,
                  tv::TensorView<const scalar_t> inFeatures,
                  tv::TensorView<const scalar_t> fout,
                  tv::TensorView<scalar_t> fin,
                  tv::TensorView<const Index> indices, int size);
};
}  // namespace functor

#endif


================================================
FILE: mmcv/ops/csrc/common/utils/spconv/spconv/mp_helper.h
================================================
#ifndef MP_HELPER_H_
#define MP_HELPER_H_
#include <type_traits>
#include <utility>

// Empty tag type whose only purpose is to carry a pack of types.
template <class... Ts>
struct mp_list {};

// List of compile-time constants: every value in Values is wrapped in a
// std::integral_constant<Int, value>.
template <class Int, Int... Values>
using mp_list_c = mp_list<std::integral_constant<Int, Values>...>;

namespace detail {

template <class... T, class F>
constexpr F mp_for_each_impl(mp_list<T...>, F &&f) {
  return std::initializer_list<int>{(f(T()), 0)...}, std::forward<F>(f);
}

template <class F>
constexpr F mp_for_each_impl(mp_list<>, F &&f) {
  return std::forward<F>(f);
}

}  // namespace detail

namespace detail {

// Primary template, deliberately without a nested `type`.
template <class From, template <class...> class To>
struct mp_rename_impl {
  // An error "no type named 'type'" here means that the first argument to
  // mp_rename is not a list
};

// Matches any instantiation Src<Args...> and re-applies the same argument
// pack to the template `To`.
template <template <class...> class Src, class... Args,
          template <class...> class To>
struct mp_rename_impl<Src<Args...>, To> {
  using type = To<Args...>;
};

}  // namespace detail

// Takes the template arguments of the list-like type `From` and applies them
// to the template `To`.
template <class From, template <class...> class To>
using mp_rename = typename ::detail::mp_rename_impl<From, To>::type;

// Invokes `fn` with a default-constructed value of every element type of the
// list-like type `List` (first converted to an mp_list) and returns the
// functor.
template <class List, class Fn>
constexpr Fn mp_for_each(Fn &&fn) {
  return ::detail::mp_for_each_impl(mp_rename<List, mp_list>(),
                                    std::forward<Fn>(fn));
}

#endif


================================================
FILE: mmcv/ops/csrc/common/utils/spconv/spconv/point2voxel.h
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <math.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include <algorithm>
#include <iostream>

namespace py = pybind11;
using namespace pybind11::literals;

// Groups points into voxels on a regular grid.
//
// The first NDim features of a point are mapped to integer grid cells using
// coors_range (lower bounds at [0, NDim), upper bounds at [NDim, 2 * NDim))
// and voxel_size; the cell coordinates are stored in reverse axis order.
// Points outside the grid are skipped. A cell seen for the first time gets
// the next voxel index, unless max_voxels has been reached. Up to max_points
// points are copied into each voxel.
//
// Before returning, the lookup entries of all created voxels are reset to -1.
// Although NDim is a template parameter, the lookup is addressed with exactly
// three indices (coor[0..2]).
//
// Returns the number of voxels created.
template <typename DType, int NDim>
int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
                          py::array_t<int> coors,
                          py::array_t<int> num_points_per_voxel,
                          py::array_t<int> coor_to_voxelidx,
                          std::vector<DType> voxel_size,
                          std::vector<DType> coors_range, int max_points,
                          int max_voxels) {
  auto pts = points.template mutable_unchecked<2>();
  auto vox = voxels.template mutable_unchecked<3>();
  auto voxel_coors = coors.mutable_unchecked<2>();
  auto fill = num_points_per_voxel.mutable_unchecked<1>();
  auto lookup = coor_to_voxelidx.mutable_unchecked<NDim>();
  auto num_points = pts.shape(0);
  auto num_features = pts.shape(1);
  constexpr int last_axis = NDim - 1;

  int grid_size[NDim];
  for (int axis = 0; axis < NDim; ++axis) {
    grid_size[axis] = round((coors_range[NDim + axis] - coors_range[axis]) /
                            voxel_size[axis]);
  }

  int voxel_num = 0;
  int coor[NDim];
  for (int p = 0; p < num_points; ++p) {
    // Quantise the point; abandon it as soon as one axis is out of range.
    bool inside = true;
    for (int axis = 0; axis < NDim; ++axis) {
      const int cell =
          floor((pts(p, axis) - coors_range[axis]) / voxel_size[axis]);
      if (cell < 0 || cell >= grid_size[axis]) {
        inside = false;
        break;
      }
      coor[last_axis - axis] = cell;
    }
    if (!inside) continue;

    int voxelidx = lookup(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      // New cell: claim the next voxel slot if any are left.
      if (voxel_num >= max_voxels) continue;
      voxelidx = voxel_num;
      voxel_num += 1;
      lookup(coor[0], coor[1], coor[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
        voxel_coors(voxelidx, k) = coor[k];
      }
    }

    const int stored = fill(voxelidx);
    if (stored < max_points) {
      for (int k = 0; k < num_features; ++k) {
        vox(voxelidx, stored, k) = pts(p, k);
      }
      fill(voxelidx) += 1;
    }
  }

  // Restore the lookup entries touched above to -1.
  for (int v = 0; v < voxel_num; ++v) {
    lookup(voxel_coors(v, 0), voxel_coors(v, 1), voxel_coors(v, 2)) = -1;
  }
  return voxel_num;
}

// Groups points into voxels on a regular grid and keeps a running per-voxel
// mean of every feature.
//
// The first NDim features of a point are mapped to integer grid cells using
// coors_range and voxel_size; the cell coordinates are stored in reverse axis
// order. Points outside the grid are skipped. A cell seen for the first time
// gets the next voxel index, unless max_voxels has been reached. Up to
// max_points points are copied into each voxel, and `means` is updated
// incrementally for each stored point.
//
// Before returning, the lookup entries of all created voxels are reset to -1
// and the unused point slots of every voxel are filled with that voxel's
// mean. Although NDim is a template parameter, the lookup is addressed with
// exactly three indices (coor[0..2]).
//
// Returns the number of voxels created.
template <typename DType, int NDim>
int points_to_voxel_3d_np_mean(py::array_t<DType> points,
                               py::array_t<DType> voxels,
                               py::array_t<DType> means, py::array_t<int> coors,
                               py::array_t<int> num_points_per_voxel,
                               py::array_t<int> coor_to_voxelidx,
                               std::vector<DType> voxel_size,
                               std::vector<DType> coors_range, int max_points,
                               int max_voxels) {
  auto pts = points.template mutable_unchecked<2>();
  auto mean_acc = means.template mutable_unchecked<2>();
  auto vox = voxels.template mutable_unchecked<3>();
  auto voxel_coors = coors.mutable_unchecked<2>();
  auto fill = num_points_per_voxel.mutable_unchecked<1>();
  auto lookup = coor_to_voxelidx.mutable_unchecked<NDim>();
  auto num_points = pts.shape(0);
  auto num_features = pts.shape(1);
  constexpr int last_axis = NDim - 1;

  int grid_size[NDim];
  for (int axis = 0; axis < NDim; ++axis) {
    grid_size[axis] = round((coors_range[NDim + axis] - coors_range[axis]) /
                            voxel_size[axis]);
  }

  int voxel_num = 0;
  int coor[NDim];
  for (int p = 0; p < num_points; ++p) {
    // Quantise the point; abandon it as soon as one axis is out of range.
    bool inside = true;
    for (int axis = 0; axis < NDim; ++axis) {
      const int cell =
          floor((pts(p, axis) - coors_range[axis]) / voxel_size[axis]);
      if (cell < 0 || cell >= grid_size[axis]) {
        inside = false;
        break;
      }
      coor[last_axis - axis] = cell;
    }
    if (!inside) continue;

    int voxelidx = lookup(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      // New cell: claim the next voxel slot if any are left.
      if (voxel_num >= max_voxels) continue;
      voxelidx = voxel_num;
      voxel_num += 1;
      lookup(coor[0], coor[1], coor[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
        voxel_coors(voxelidx, k) = coor[k];
      }
    }

    const int stored = fill(voxelidx);
    if (stored < max_points) {
      for (int k = 0; k < num_features; ++k) {
        vox(voxelidx, stored, k) = pts(p, k);
      }
      fill(voxelidx) += 1;
      // Incremental mean: the divisor is the point count after this insert.
      for (int k = 0; k < num_features; ++k) {
        mean_acc(voxelidx, k) +=
            (pts(p, k) - mean_acc(voxelidx, k)) / DType(stored + 1);
      }
    }
  }

  for (int v = 0; v < voxel_num; ++v) {
    // Restore the lookup entry, then pad the unused slots with the mean.
    lookup(voxel_coors(v, 0), voxel_coors(v, 1), voxel_coors(v, 2)) = -1;
    const int used = fill(v);
    for (int slot = used; slot < max_points; ++slot) {
      for (int k = 0; k < num_features; ++k) {
        vox(v, slot, k) = mean_acc(v, k);
      }
    }
  }
  return voxel_num;
}

// Groups points into voxels on a regular grid and tracks the per-voxel,
// per-feature minimum (in `height`) and maximum (in `maxs`) of stored points.
//
// The first NDim features of a point are mapped to integer grid cells using
// coors_range and voxel_size; the cell coordinates are stored in reverse axis
// order. Points outside the grid are skipped. A cell seen for the first time
// gets the next voxel index, unless max_voxels has been reached. Up to
// max_points points are copied into each voxel.
//
// Before returning, the lookup entries of all created voxels are reset to -1
// and `height` is overwritten with maxs - height for every created voxel.
// Although NDim is a template parameter, the lookup is addressed with exactly
// three indices (coor[0..2]).
//
// Returns the number of voxels created.
template <typename DType, int NDim>
int points_to_voxel_3d_np_height(
    py::array_t<DType> points, py::array_t<DType> voxels,
    py::array_t<DType> height, py::array_t<DType> maxs, py::array_t<int> coors,
    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
    std::vector<DType> voxel_size, std::vector<DType> coors_range,
    int max_points, int max_voxels) {
  auto pts = points.template mutable_unchecked<2>();
  auto lowest = height.template mutable_unchecked<2>();
  auto highest = maxs.template mutable_unchecked<2>();
  auto vox = voxels.template mutable_unchecked<3>();
  auto voxel_coors = coors.mutable_unchecked<2>();
  auto fill = num_points_per_voxel.mutable_unchecked<1>();
  auto lookup = coor_to_voxelidx.mutable_unchecked<NDim>();
  auto num_points = pts.shape(0);
  auto num_features = pts.shape(1);
  constexpr int last_axis = NDim - 1;

  int grid_size[NDim];
  for (int axis = 0; axis < NDim; ++axis) {
    grid_size[axis] = round((coors_range[NDim + axis] - coors_range[axis]) /
                            voxel_size[axis]);
  }

  int voxel_num = 0;
  int coor[NDim];
  for (int p = 0; p < num_points; ++p) {
    // Quantise the point; abandon it as soon as one axis is out of range.
    bool inside = true;
    for (int axis = 0; axis < NDim; ++axis) {
      const int cell =
          floor((pts(p, axis) - coors_range[axis]) / voxel_size[axis]);
      if (cell < 0 || cell >= grid_size[axis]) {
        inside = false;
        break;
      }
      coor[last_axis - axis] = cell;
    }
    if (!inside) continue;

    int voxelidx = lookup(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      // New cell: claim the next voxel slot if any are left.
      if (voxel_num >= max_voxels) continue;
      voxelidx = voxel_num;
      voxel_num += 1;
      lookup(coor[0], coor[1], coor[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
        voxel_coors(voxelidx, k) = coor[k];
      }
    }

    const int stored = fill(voxelidx);
    if (stored < max_points) {
      for (int k = 0; k < num_features; ++k) {
        vox(voxelidx, stored, k) = pts(p, k);
        lowest(voxelidx, k) = std::min(pts(p, k), lowest(voxelidx, k));
        highest(voxelidx, k) = std::max(pts(p, k), highest(voxelidx, k));
      }
      fill(voxelidx) += 1;
    }
  }

  for (int v = 0; v < voxel_num; ++v) {
    // Restore the lookup entry, then turn the stored minimum into a range.
    lookup(voxel_coors(v, 0), voxel_coors(v, 1), voxel_coors(v, 2)) = -1;
    for (int k = 0; k < num_features; ++k) {
      lowest(v, k) = highest(v, k) - lowest(v, k);
    }
  }
  return voxel_num;
}

// Clears mask entries of points that lie in voxels with a small spread of
// their third feature.
//
// Pass 1 assigns every in-range point to a voxel (allocating voxel ids on first
// use through `coor_to_voxelidx`, where -1 means "no voxel yet") and tracks per
// voxel the minimum (`height`) and maximum (`maxs`) of points(i, 2).
// Pass 2 revisits every in-range point and writes mask(i) = 0 when
// maxs - height of its voxel is below `eps`. Other mask entries are untouched.
//
// Notes:
//   * `height` and `maxs` are accessed as 1-D arrays indexed by voxel id; the
//     caller presumably pre-fills them with suitable extreme values and sizes
//     them for the number of voxels -- TODO confirm against callers.
//   * `coor_to_voxelidx` is indexed with exactly three coordinates, so NDim is
//     expected to be 3. It is not reset to -1 before returning.
//   * `max_voxels` is kept for interface compatibility and is not used.
//
// Returns the number of voxels that received at least one point.
template <typename DType, int NDim>
int block_filtering(py::array_t<DType> points, py::array_t<int> mask,
                    py::array_t<DType> height, py::array_t<DType> maxs,
                    py::array_t<int> coor_to_voxelidx,
                    std::vector<DType> voxel_size,
                    std::vector<DType> coors_range, int max_voxels, DType eps) {
  auto points_rw = points.template mutable_unchecked<2>();
  // `mask` is written through an unchecked accessor like every other array
  // here (the previous `mask(i) = 0` did not use one).
  auto mask_rw = mask.mutable_unchecked<1>();
  auto height_rw = height.template mutable_unchecked<1>();
  auto maxs_rw = maxs.template mutable_unchecked<1>();
  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
  auto N = points_rw.shape(0);
  constexpr int ndim_minus_1 = NDim - 1;
  int voxel_num = 0;
  int coor[NDim];
  int grid_size[NDim];
  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
  }

  // Computes the (reversed-axis) grid coordinate of point `i` into `coor`.
  // Returns false when the point falls outside the grid on any axis.
  auto locate = [&](int i) -> bool {
    for (int j = 0; j < NDim; ++j) {
      int c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
      if (c < 0 || c >= grid_size[j]) return false;
      coor[ndim_minus_1 - j] = c;
    }
    return true;
  };

  // Pass 1: allocate voxel ids and accumulate per-voxel min / max.
  for (int i = 0; i < N; ++i) {
    if (!locate(i)) continue;
    int voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      voxel_num += 1;
      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
    }
    height_rw(voxelidx) = std::min(points_rw(i, 2), height_rw(voxelidx));
    maxs_rw(voxelidx) = std::max(points_rw(i, 2), maxs_rw(voxelidx));
  }

  // Pass 2: every in-range point now has a voxel id; drop the ones whose voxel
  // spread is below the threshold.
  for (int i = 0; i < N; ++i) {
    if (!locate(i)) continue;
    const int voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    // `height` is 1-D, so it takes a single index (was `height_rw(voxelidx, 2)`).
    if ((maxs_rw(voxelidx) - height_rw(voxelidx)) < eps) {
      mask_rw(i) = 0;
    }
  }
  // The function is declared to return int; falling off the end was undefined
  // behaviour.
  return voxel_num;
}

// Voxelizes `points` and additionally flags voxels by a block-wise spread of
// the third point feature.
//
// Stage 1: each in-range point is assigned to a voxel (ids are handed out on
// first use via `coor_to_voxelidx`, where -1 means unused; at most `max_voxels`
// voxels are created). Up to `max_points` points are copied into `voxels` per
// voxel, and for each stored point the 2-D `mins` / `maxs` arrays are updated
// at (coor[1] / block_factor, coor[2] / block_factor) with points(i, 2).
//
// Stage 2: for every created voxel the lookup cell in `coor_to_voxelidx` is
// reset to -1, the min / max over a window of `block_size` blocks around the
// voxel's block is gathered, and voxel_mask(i) is set to whether
// (max - min) > height_threshold.
//
// `coor_to_voxelidx` is indexed with exactly three coordinates, so NDim is
// expected to be 3. Returns the number of voxels created.
template <typename DType, int NDim>
int points_to_voxel_3d_with_filtering(
    py::array_t<DType> points, py::array_t<DType> voxels,
    py::array_t<int> voxel_mask, py::array_t<DType> mins,
    py::array_t<DType> maxs, py::array_t<int> coors,
    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
    std::vector<DType> voxel_size, std::vector<DType> coors_range,
    int max_points, int max_voxels, int block_factor, int block_size,
    DType height_threshold) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto mins_rw = mins.template mutable_unchecked<2>();
  auto maxs_rw = maxs.template mutable_unchecked<2>();
  auto voxels_rw = voxels.template mutable_unchecked<3>();
  auto voxel_mask_rw = voxel_mask.template mutable_unchecked<1>();
  auto coors_rw = coors.mutable_unchecked<2>();
  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
  auto N = points_rw.shape(0);
  auto num_features = points_rw.shape(1);
  constexpr int ndim_minus_1 = NDim - 1;

  // Number of grid cells along each axis.
  int grid_size[NDim];
  for (int axis = 0; axis < NDim; ++axis) {
    grid_size[axis] = round((coors_range[NDim + axis] - coors_range[axis]) /
                            voxel_size[axis]);
  }
  const int block_shape_H = grid_size[1] / block_factor;
  const int block_shape_W = grid_size[0] / block_factor;

  int voxel_num = 0;
  int coor[NDim];

  // Stage 1: scatter points into voxels and accumulate block statistics.
  for (int i = 0; i < N; ++i) {
    bool inside = true;
    for (int j = 0; j < NDim; ++j) {
      const int c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
      if (!(c >= 0 && c < grid_size[j])) {
        inside = false;
        break;
      }
      // Coordinates are stored with the axis order reversed.
      coor[ndim_minus_1 - j] = c;
    }
    if (!inside) continue;

    int voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      // New cell: skip the point entirely once the voxel budget is used up.
      if (voxel_num >= max_voxels) continue;
      voxelidx = voxel_num;
      voxel_num += 1;
      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
        coors_rw(voxelidx, k) = coor[k];
      }
    }

    const int num = num_points_per_voxel_rw(voxelidx);
    if (num >= max_points) continue;

    for (int k = 0; k < num_features; ++k) {
      voxels_rw(voxelidx, num, k) = points_rw(i, k);
    }
    const int block_row = coor[1] / block_factor;
    const int block_col = coor[2] / block_factor;
    mins_rw(block_row, block_col) =
        std::min(points_rw(i, 2), mins_rw(block_row, block_col));
    maxs_rw(block_row, block_col) =
        std::max(points_rw(i, 2), maxs_rw(block_row, block_col));
    num_points_per_voxel_rw(voxelidx) += 1;
  }

  // Stage 2: reset the lookup table and evaluate the neighbourhood spread.
  const int half_window = block_size / 2;
  for (int v = 0; v < voxel_num; ++v) {
    const int c1 = coors_rw(v, 1);
    const int c2 = coors_rw(v, 2);
    coor_to_voxelidx_rw(coors_rw(v, 0), c1, c2) = -1;

    const int block_row = c1 / block_factor;
    const int block_col = c2 / block_factor;
    DType min_value = mins_rw(block_row, block_col);
    DType max_value = maxs_rw(block_row, block_col);

    // Window bounds, clamped to the block grid.
    const int row_begin = std::max(0, block_row - half_window);
    const int row_end =
        std::min(block_shape_H, block_row + block_size - half_window);
    const int col_begin = std::max(0, block_col - half_window);
    const int col_end =
        std::min(block_shape_W, block_col + block_size - half_window);

    for (int r = row_begin; r < row_end; ++r) {
      for (int c = col_begin; c < col_end; ++c) {
        min_value = std::min(min_value, mins_rw(r, c));
        max_value = std::max(max_value, maxs_rw(r, c));
      }
    }
    voxel_mask_rw(v) = (max_value - min_value) > height_threshold;
  }
  return voxel_num;
}


================================================
FILE: mmcv/ops/csrc/common/utils/spconv/spconv/reordering.h
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef SPARSE_REORDERING_FUNCTOR_H_
#define SPARSE_REORDERING_FUNCTOR_H_
#include <utils/spconv/tensorview/tensorview.h>

namespace functor {
// Functor interface parameterised on a device tag type, the feature scalar
// type and the index type. Only the call operator is declared here; its
// definition is not part of this header.
// NOTE(review): judging by the parameter names this presumably copies the
// entries of `features` selected by `indices` into `buffer`, with `size` the
// number of indices -- confirm against the per-device definitions.
template <typename Device, typename scalar_t, typename Index>
struct SparseGatherFunctor {
  void operator()(const Device& d, tv::TensorView<scalar_t> buffer,
                  tv::TensorView<const scalar_t> features,
                  tv::TensorView<const Index> indices, int size);
};

// Functor interface parameterised on a device tag type, the feature scalar
// type and the index type. Only the call operator is declared here; its
// definition is not part of this header.
// NOTE(review): judging by the names this presumably accumulates `buffer`
// into the entries of `out_features` addressed by `indices`; the meaning of
// `stable` (defaults to false) is not visible here -- confirm against the
// per-device definitions.
template <typename Device, typename scalar_t, typename Index>
struct SparseScatterAddFunctor {
  void operator()(const Device& d, tv::TensorView<scalar_t> out_features,
                  tv::TensorView<const scalar_t> buffer,
                  tv::TensorView<const Index> indices, int size,
                  bool stable = false);
};
}  // namespace functor

#endif


================================================
FILE: mmcv/ops/csrc/common/utils/spconv/tensorview/helper_kernel.cuh
================================================
#pragma once
namespace tv {
namespace detail {

// Device-side range object usable in a range-based for loop. Iteration starts
// at `begin`, advances by `delta` and stops once the index is no longer below
// `end`.
template <typename scalar_t>
class KernelLoop {
  // Iterator over the strided index sequence. An iterator whose delta is zero
  // acts as the end sentinel.
  struct Iterator {
    __forceinline__ __device__ Iterator(scalar_t index, scalar_t delta)
        : index_(index), delta_(delta) {}

    __forceinline__ __device__ scalar_t operator*() const { return index_; }

    __forceinline__ __device__ Iterator &operator++() {
      index_ += delta_;
      return *this;
    }

    // Comparing against the sentinel is a "still below end" test, so a stride
    // that steps over `end` terminates the loop as well.
    __forceinline__ __device__ bool operator!=(const Iterator &other) const {
      if (!other.delta_) {
        return index_ < other.index_;
      }
      if (!delta_) {
        return index_ > other.index_;
      }
      return index_ < other.index_ || index_ > other.index_;
    }

   private:
    scalar_t index_;
    const scalar_t delta_;
  };

 public:
  __forceinline__ __device__ KernelLoop(scalar_t begin, scalar_t delta,
                                        scalar_t end)
      : begin_(begin), delta_(delta), end_(end) {}

  // First index, carrying the loop stride.
  __forceinline__ __device__ Iterator begin() const {
    return Iterator(begin_, delta_);
  }
  // End sentinel (zero delta).
  __forceinline__ __device__ Iterator end() const { return Iterator(end_, 0); }

 private:
  scalar_t begin_;
  scalar_t delta_;
  scalar_t end_;
};

}  // namespace detail

// Helper to visit indices in the range 0 <= i < count using the x-coordinate.
// Each thread starts at blockIdx.x * blockDim.x + threadIdx.x and advances by
// gridDim.x * blockDim.x * NumILP.
// Usage: for(int i : KernelLoopX(count)) { visit(i); }
template <typename scalar_t, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopX(
    scalar_t count) {
  const scalar_t first = blockIdx.x * blockDim.x + threadIdx.x;
  const scalar_t step = gridDim.x * blockDim.x * NumILP;
  return detail::KernelLoop<scalar_t>(first, step, count);
}

// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
// Each thread starts at blockIdx.y * blockDim.y + threadIdx.y and advances by
// gridDim.y * blockDim.y * NumILP.
// Usage: for(int i : KernelLoopY(count)) { visit(i); }
template <typename scalar_t, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopY(
    scalar_t count) {
  const scalar_t first = blockIdx.y * blockDim.y + threadIdx.y;
  const scalar_t step = gridDim.y * blockDim.y * NumILP;
  return detail::KernelLoop<scalar_t>(first, step, count);
}

// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
// Each thread starts at blockIdx.z * blockDim.z + threadIdx.z and advances by
// gridDim.z * blockDim.z * NumILP.
// Usage: for(int i : KernelLoopZ(count)) { visit(i); }
template <typename scalar_t, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopZ(
    scalar_t count) {
  const scalar_t first = blockIdx.z * blockDim.z + threadIdx.z;
  const scalar_t step = gridDim.z * blockDim.z * NumILP;
  return detail::KernelLoop<scalar_t>(first, step, count);
}

}  // namespace tv


================================================
FILE: mmcv/ops/csrc/common/utils/spconv/tensorview/helper_kernel.muh
================================================
#pragma once
namespace tv {
namespace detail {

// Device-side range object usable in a range-based for loop. Iteration starts
// at `begin`, advances by `delta` and stops once the index is no longer below
// `end`.
template <typename scalar_t>
class KernelLoop {
  // Iterator over the strided index sequence. An iterator whose delta is zero
  // acts as the end sentinel.
  struct Iterator {
    __forceinline__ __device__ Iterator(scalar_t index, scalar_t delta)
        : index_(index), delta_(delta) {}

    __forceinline__ __device__ scalar_t operator*() const { return index_; }

    __forceinline__ __device__ Iterator &operator++() {
      index_ += delta_;
      return *this;
    }

    // Comparing against the sentinel is a "still below end" test, so a stride
    // that steps over `end` terminates the loop as well.
    __forceinline__ __device__ bool operator!=(const Iterator &other) const {
      if (!other.delta_) {
        return index_ < other.index_;
      }
      if (!delta_) {
        return index_ > other.index_;
      }
      return index_ < other.index_ || index_ > other.index_;
    }

   private:
    scalar_t index_;
    const scalar_t delta_;
  };

 public:
  __forceinline__ __device__ KernelLoop(scalar_t begin, scalar_t delta,
                                        scalar_t end)
      : begin_(begin), delta_(delta), end_(end) {}

  // First index, carrying the loop stride.
  __forceinline__ __device__ Iterator begin() const {
    return Iterator(begin_, delta_);
  }
  // End sentinel (zero delta).
  __forceinline__ __device__ Iterator end() const { return Iterator(end_, 0); }

 private:
  scalar_t begin_;
  scalar_t delta_;
  scalar_t end_;
};

}  // namespace detail

// Helper to visit indices in the range 0 <= i < count using the x-coordinate.
// Each thread starts at blockIdx.x * blockDim.x + threadIdx.x and advances by
// gridDim.x * blockDim.x * NumILP.
// Usage: for(int i : KernelLoopX(count)) { visit(i); }
template <typename scalar_t, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopX(
    scalar_t count) {
  const scalar_t first = blockIdx.x * blockDim.x + threadIdx.x;
  const scalar_t step = gridDim.x * blockDim.x * NumILP;
  return detail::KernelLoop<scalar_t>(first, step, count);
}

// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
// Each thread starts at blockIdx.y * blockDim.y + threadIdx.y and advances by
// gridDim.y * blockDim.y * NumILP.
// Usage: for(int i : KernelLoopY(count)) { visit(i); }
template <typename scalar_t, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopY(
    scalar_t count) {
  const scalar_t first = blockIdx.y * blockDim.y + threadIdx.y;
  const scalar_t step = gridDim.y * blockDim.y * NumILP;
  return detail::KernelLoop<scalar_t>(first, step, count);
}

// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
// Each thread starts at blockIdx.z * blockDim.z + threadIdx.z and advances by
// gridDim.z * blockDim.z * NumILP.
// Usage: for(int i : KernelLoopZ(count)) { visit(i); }
template <typename scalar_t, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopZ(
    scalar_t count) {
  const scalar_t first = blockIdx.z * blockDim.z + threadIdx.z;
  const scalar_t step = gridDim.z * blockDim.z * NumILP;
  return detail::KernelLoop<scalar_t>(first, step, count);
}

}  // namespace tv


================================================
FILE: mmcv/ops/csrc/common/utils/spconv/tensorview/helper_launch.h
================================================
#pragma once
// from pytorch.aten
#include "tensorview.h"
namespace tv {
namespace launch {

// Ceiling division for positive operands: the smallest integer q such that
// q * b >= a. The result is converted to int.
template <typename T1, typename T2>
inline int DivUp(const T1 a, const T2 b) {
  const auto padded = a + b - 1;
  return padded / b;
}

// Threads-per-block constants. getBlocks() below divides by CUDA_NUM_THREADS;
// MUSA_NUM_THREADS is not referenced in this header.
constexpr int CUDA_NUM_THREADS = 1024;
constexpr int MUSA_NUM_THREADS = 1024;
// Returns how many blocks of CUDA_NUM_THREADS threads are needed to cover N
// work items. Throws (via TV_ASSERT_RT_ERR) when N is not positive.
inline int getBlocks(const int N) {
  TV_ASSERT_RT_ERR(N > 0,
                   "CUDA kernel launch blocks must be positive, but got N=", N);
  const int num_blocks = DivUp(N, CUDA_NUM_THREADS);
  return num_blocks;
}
}  // namespace launch
}  // namespace tv


================================================
FILE: mmcv/ops/csrc/common/utils/spconv/tensorview/tensorview.h
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <sstream>
#include <type_traits>
#include <vector>

#include "pytorch_cpp_helper.hpp"

namespace tv {

// Function-qualifier and assertion macros, selected by compiler:
//   * __NVCC__ / __HIP__ / __MUSA__: host+device qualifiers.
//   * __CUDACC_RTC__: TV_HOST_DEVICE_INLINE is device-only in this branch.
//   * otherwise (plain host compiler): TV_HOST_DEVICE_INLINE is `inline`,
//     TV_HOST_DEVICE expands to nothing, and TV_DEVICE_INLINE is NOT defined.
// TV_ASSERT maps to assert() in every branch.
#if defined(__NVCC__) || defined(__HIP__) || defined(__MUSA__)
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#define TV_ASSERT(expr) assert(expr)
#elif defined(__CUDACC_RTC__)
#define TV_ASSERT(expr) assert(expr)
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#else
#define TV_ASSERT(x) assert(x)
#define TV_HOST_DEVICE_INLINE inline
#define TV_HOST_DEVICE
#endif

// If `expr` is false, prints the printf-style message given by the variadic
// arguments and then asserts `expr`. `expr` is evaluated a second time inside
// assert(); when assert() is compiled out (NDEBUG) only the message is printed.
#define TV_REQUIRE(expr, ...) \
  {                           \
    if (!(expr)) {            \
      printf(__VA_ARGS__);    \
      assert(expr);           \
    }                         \
  }

// Device-side variant of TV_REQUIRE: the printf-style message is printed only
// by threads with threadIdx.x == 0, while assert(expr) runs unconditionally in
// every thread that executes the macro.
#define TV_DEVICE_REQUIRE(expr, ...)                      \
  {                                                       \
    if (!(expr) && threadIdx.x == 0) printf(__VA_ARGS__); \
    assert(expr);                                         \
  }

// Writes a single value to `ss`; this overload ends the variadic recursion.
template <class SStream, class T>
void sstream_print(SStream &ss, T val) {
  ss << val;
}

// Writes `val`, one space, and then the remaining arguments in order, so the
// values end up space-separated with no trailing space.
template <class SStream, class T, class... TArgs>
void sstream_print(SStream &ss, T val, TArgs... args) {
  ss << val;
  ss << " ";
  sstream_print(ss, args...);
}

// Throws std::runtime_error when `expr` is false. The message contains
// __FILE__, __LINE__, the stringified expression, and the variadic arguments
// joined by spaces via tv::sstream_print (so at least one extra argument is
// required).
#define TV_ASSERT_RT_ERR(expr, ...)                     \
  {                                                     \
    if (!(expr)) {                                      \
      std::stringstream __macro_s;                      \
      __macro_s << __FILE__ << " " << __LINE__ << "\n"; \
      __macro_s << #expr << " assert failed. ";         \
      tv::sstream_print(__macro_s, __VA_ARGS__);        \
      throw std::runtime_error(__macro_s.str());        \
    }                                                   \
  }

// Throws std::invalid_argument when `expr` is false. The message contains
// __FILE__, __LINE__, the stringified expression, and the variadic arguments
// joined by spaces via tv::sstream_print (so at least one extra argument is
// required).
#define TV_ASSERT_INVALID_ARG(expr, ...)                \
  {                                                     \
    if (!(expr)) {                                      \
      std::stringstream __macro_s;                      \
      __macro_s << __FILE__ << " " << __LINE__ << "\n"; \
      __macro_s << #expr << " assert failed. ";         \
      tv::sstream_print(__macro_s, __VA_ARGS__);        \
      throw std::invalid_argument(__macro_s.str());     \
    }                                                   \
  }

// Queries cudaGetLastError() and throws std::runtime_error (with __FILE__,
// __LINE__ and the error value) when the result is not cudaSuccess.
#define TV_CHECK_CUDA_ERR()                                    \
  {                                                            \
    auto err = cudaGetLastError();                             \
    if (err != cudaSuccess) {                                  \
      std::stringstream __macro_s;                             \
      __macro_s << __FILE__ << " " << __LINE__ << "\n";        \
      __macro_s << "cuda execution failed with error " << err; \
      throw std::runtime_error(__macro_s.str());               \
    }                                                          \
  }

// Queries musaGetLastError() and throws std::runtime_error (with __FILE__,
// __LINE__ and the error value) when the result is not musaSuccess.
#define TV_CHECK_MUSA_ERR()                                    \
  {                                                            \
    auto err = musaGetLastError();                             \
    if (err != musaSuccess) {                                  \
      std::stringstream __macro_s;                             \
      __macro_s << __FILE__ << " " << __LINE__ << "\n";        \
      __macro_s << "musa execution failed with error " << err; \
      throw std::runtime_error(__macro_s.str());               \
    }                                                          \
  }

// Empty tag type.
// NOTE(review): presumably passed as the `Device` template argument of
// device-dispatched functors -- confirm against its users.
struct CPU {};

// Default capacity (MaxDim) of SimpleVector and ShapeBase below.
#define TV_MAX_DIM 6

// Fixed-capacity vector with inline storage for up to `MaxDim` elements.
//
// Capacity is only checked through TV_ASSERT (constructors) or, for element
// access / push_back / pop_back, when TV_DEBUG is defined; exceeding it
// otherwise is undefined behaviour.
template <typename scalar_t, size_t MaxDim = TV_MAX_DIM>
struct SimpleVector {
 public:
  TV_HOST_DEVICE_INLINE SimpleVector() {}
  // Copies the elements of `q` in order; `q` must hold at most MaxDim values.
  TV_HOST_DEVICE_INLINE SimpleVector(std::initializer_list<scalar_t> q) {
    TV_ASSERT(q.size() <= MaxDim);
    mSize = 0;
    for (scalar_t s : q) {
      mArray[mSize++] = s;
    }
  }
  // Host-only: copies the elements of `arr`; it must hold at most MaxDim
  // values.
  SimpleVector(const std::vector<scalar_t> &arr) {
    TV_ASSERT(arr.size() <= MaxDim);
    for (size_t i = 0; i < arr.size(); ++i) {
      mArray[i] = arr[i];
    }
    mSize = arr.size();
  }
  TV_HOST_DEVICE_INLINE SimpleVector(
      const SimpleVector<scalar_t, MaxDim> &arr) {
    TV_ASSERT(arr.size() <= MaxDim);
    for (size_t i = 0; i < arr.size(); ++i) {
      mArray[i] = arr[i];
    }
    mSize = arr.size();
  }
  // Unchecked element access (range-checked only when TV_DEBUG is defined).
  TV_HOST_DEVICE_INLINE scalar_t &operator[](int idx) {
#ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && idx < mSize);
#endif
    return mArray[idx];
  }
  TV_HOST_DEVICE_INLINE const scalar_t &operator[](int idx) const {
#ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && idx < mSize);
#endif
    return mArray[idx];
  }
  // Appends `s`; the vector must not already be full.
  TV_HOST_DEVICE_INLINE void push_back(scalar_t s) {
#ifdef TV_DEBUG
    TV_ASSERT(mSize < MaxDim);
#endif
    mArray[mSize] = s;
    mSize++;
  }
  // Drops the last element; the vector must not be empty.
  TV_HOST_DEVICE_INLINE void pop_back() {
#ifdef TV_DEBUG
    TV_ASSERT(mSize > 0);
#endif
    mSize--;
  }

  TV_HOST_DEVICE_INLINE size_t size() const { return mSize; }
  TV_HOST_DEVICE_INLINE const scalar_t *data() const { return mArray; }
  // Returns bool (was size_t); existing integral uses still convert.
  TV_HOST_DEVICE_INLINE bool empty() const { return mSize == 0; }

  typedef size_t size_type;

  // Forward iterator over the stored elements (thin pointer wrapper).
  class iterator {
   public:
    typedef iterator self_type;
    typedef scalar_t value_type;
    typedef scalar_t &reference;
    typedef scalar_t *pointer;
    typedef std::forward_iterator_tag iterator_category;
    typedef std::ptrdiff_t difference_type;
    TV_HOST_DEVICE_INLINE iterator(pointer ptr) : ptr_(ptr) {}
    // Post-increment: returns the previous position.
    TV_HOST_DEVICE_INLINE self_type operator++(int) {
      self_type i = *this;
      ptr_++;
      return i;
    }
    // Pre-increment returns a reference to this iterator, as the iterator
    // requirements demand; returning a copy made `++(++it)` advance only once.
    TV_HOST_DEVICE_INLINE self_type &operator++() {
      ptr_++;
      return *this;
    }
    TV_HOST_DEVICE_INLINE reference operator*() const { return *ptr_; }
    TV_HOST_DEVICE_INLINE pointer operator->() const { return ptr_; }
    // Comparisons are const so that const iterators can be compared.
    TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) const {
      return ptr_ == rhs.ptr_;
    }
    TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) const {
      return ptr_ != rhs.ptr_;
    }

   private:
    pointer ptr_;
  };

  // Read-only counterpart of `iterator`.
  class const_iterator {
   public:
    typedef const_iterator self_type;
    typedef scalar_t value_type;
    typedef const scalar_t &reference;
    typedef const scalar_t *pointer;
    typedef std::ptrdiff_t difference_type;
    typedef std::forward_iterator_tag iterator_category;
    TV_HOST_DEVICE_INLINE const_iterator(pointer ptr) : ptr_(ptr) {}
    // Post-increment: returns the previous position.
    TV_HOST_DEVICE_INLINE self_type operator++(int) {
      self_type i = *this;
      ptr_++;
      return i;
    }
    // Pre-increment returns a reference to this iterator (see `iterator`).
    TV_HOST_DEVICE_INLINE self_type &operator++() {
      ptr_++;
      return *this;
    }
    TV_HOST_DEVICE_INLINE reference operator*() const { return *ptr_; }
    TV_HOST_DEVICE_INLINE pointer operator->() const { return ptr_; }
    TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) const {
      return ptr_ == rhs.ptr_;
    }
    TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) const {
      return ptr_ != rhs.ptr_;
    }

   private:
    pointer ptr_;
  };

  TV_HOST_DEVICE_INLINE iterator begin() { return iterator(mArray); }

  TV_HOST_DEVICE_INLINE iterator end() { return iterator(mArray + mSize); }

  TV_HOST_DEVICE_INLINE const_iterator begin() const {
    return const_iterator(mArray);
  }

  TV_HOST_DEVICE_INLINE const_iterator end() const {
    return const_iterator(mArray + mSize);
  }
  TV_HOST_DEVICE_INLINE const_iterator cbegin() const {
    return const_iterator(mArray);
  }

  TV_HOST_DEVICE_INLINE const_iterator cend() const {
    return const_iterator(mArray + mSize);
  }

 protected:
  scalar_t mArray[MaxDim];  // inline storage; only the first mSize are valid
  size_t mSize = 0;         // number of valid elements
};

// Two vectors are equal when they hold the same number of elements and every
// pair of corresponding elements compares equal.
template <typename scalar_t, size_t MaxDim>
bool operator==(const SimpleVector<scalar_t, MaxDim> &lfs,
                const SimpleVector<scalar_t, MaxDim> &rfs) {
  const size_t count = lfs.size();
  if (count != rfs.size()) {
    return false;
  }
  size_t pos = 0;
  while (pos < count) {
    if (lfs[pos] != rfs[pos]) {
      return false;
    }
    ++pos;
  }
  return true;
}

// Negation of operator== for SimpleVector.
template <typename scalar_t, size_t MaxDim>
bool operator!=(const SimpleVector<scalar_t, MaxDim> &lfs,
                const SimpleVector<scalar_t, MaxDim> &rfs) {
  const bool same = (lfs == rfs);
  return !same;
}

// Holds up to three integers; entries that are not supplied are set to -1.
// NOTE(review): the three slots presumably mean (start, end, step) -- confirm
// against the code that consumes Slice.
struct Slice {
  // Builds a slice from zero to three integer-convertible values.
  template <class... Integers>
  TV_HOST_DEVICE_INLINE Slice(Integers... ints) {
    static_assert(sizeof...(ints) <= 3, "slice init must smaller than 3");
    for (int k = 0; k < 3; ++k) {
      mSlices[k] = -1;
    }
    SimpleVector<int, 3> given{int(ints)...};
    for (size_t k = 0; k < given.size(); ++k) {
      mSlices[k] = given[k];
    }
  }

  // All three entries default to -1.
  TV_HOST_DEVICE_INLINE Slice() {
    for (int k = 0; k < 3; ++k) {
      mSlices[k] = -1;
    }
  }

  // Builds a slice from a braced list of at most three values, each converted
  // to int.
  template <typename scalar_t>
  TV_HOST_DEVICE_INLINE Slice(std::initializer_list<scalar_t> slice) {
    for (int k = 0; k < 3; ++k) {
      mSlices[k] = -1;
    }
    TV_ASSERT(slice.size() <= 3);
    int *dst = mSlices;
    for (scalar_t s : slice) {
      *dst = int(s);
      ++dst;
    }
  }

  // Element access; the index is range-checked only when TV_DEBUG is defined.
  TV_HOST_DEVICE_INLINE int &operator[](int idx) {
#ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && idx < 3);
#endif
    return mSlices[idx];
  }
  TV_HOST_DEVICE_INLINE const int &operator[](int idx) const {
#ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && idx < 3);
#endif
    return mSlices[idx];
  }

 protected:
  int mSlices[3];
};

// Tensor shape: a SimpleVector<int, MaxDim> of per-dimension extents.
//
// Note that size() here is the product of all extents (the element count) and
// hides SimpleVector::size(); use ndim() for the number of dimensions.
template <size_t MaxDim = TV_MAX_DIM>
struct ShapeBase : public SimpleVector<int, MaxDim> {
  TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<int, MaxDim>() {}
  TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<int> shape)
      : SimpleVector<int, MaxDim>(shape) {}

  // TODO: find out why this template cannot be used on Windows
  // template <typename scalar_t, template <class...> class Container>
  // ShapeBase(Container<scalar_t> shape) : SimpleVector<int, MaxDim>(shape) {}
  TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim> &shape)
      : SimpleVector<int, MaxDim>(shape) {}
  ShapeBase(const std::vector<int> &arr) : SimpleVector<int, MaxDim>(arr) {}

  ShapeBase<MaxDim> &operator=(const ShapeBase<MaxDim> &shape) = default;

  // Returns the extents of dimensions [start, end).
  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start, int end) const {
#ifdef TV_DEBUG
    // `end` is exclusive, so end == ndim() is a valid bound; the previous
    // check (`end < mSize`) rejected it.
    TV_ASSERT(start >= 0 && end <= this->mSize && end > start);
#endif
    ShapeBase<MaxDim> shape;
    for (int i = start; i < end; ++i) {
      shape.push_back(this->mArray[i]);
    }
    return shape;
  }
  // Returns the extents of dimensions [start, ndim()).
  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start) const {
#ifdef TV_DEBUG
    TV_ASSERT(start >= 0 && start <= this->mSize);
#endif
    ShapeBase<MaxDim> shape;
    for (int i = start; i < this->mSize; ++i) {
      shape.push_back(this->mArray[i]);
    }
    return shape;
  }

  // Total number of elements: the product of all extents, or 0 for a shape
  // with no dimensions.
  TV_HOST_DEVICE_INLINE size_t size() const {
    if (this->mSize == 0) return 0;
    size_t s = 1;
    for (int i = 0; i < int(this->mSize); ++i) {
      s *= this->mArray[i];
    }
    return s;
  }
  // Number of dimensions.
  TV_HOST_DEVICE_INLINE size_t ndim() const { return this->mSize; }
  // Returns a copy with every extent equal to 1 removed.
  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze() const {
    ShapeBase<MaxDim> shape;
    for (int i = 0; i < this->mSize; ++i) {
      if (this->mArray[i] != 1) shape.push_back(this->mArray[i]);
    }
    return shape;
  }
  // Returns a copy with dimension `dim` removed if its extent is 1; all other
  // dimensions are kept as they are.
  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze(int dim) const {
    ShapeBase<MaxDim> shape;
    for (int i = 0; i < this->mSize; ++i) {
      if (i != dim || this->mArray[i] != 1) shape.push_back(this->mArray[i]);
    }
    return shape;
  }
};

// Default shape type, with capacity for TV_MAX_DIM dimensions.
using Shape = ShapeBase<TV_MAX_DIM>;

// Row-major flat offset of the given per-dimension indices for an array with
// extents `shape` (the last index varies fastest). The number of indices is
// checked against shape.size() only when TV_DEBUG is defined.
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
                                           Inds... indexes) {
  int idx_list[sizeof...(indexes)] = {indexes...};
#ifdef TV_DEBUG
  TV_ASSERT(sizeof...(indexes) == shape.size());
#endif
  unsigned linear = 0;
  unsigned stride = 1;
#pragma unroll
  for (int axis = sizeof...(indexes) - 1; axis >= 0; --axis) {
    linear += stride * idx_list[axis];
    stride *= shape[axis];
  }
  return linear;
}

// Row-major flat offset of `indexes_vec` for an array with extents `shape`
// (the last index varies fastest). Iterates over shape.size() dimensions.
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
                                           std::vector<int> &indexes_vec) {
  unsigned linear = 0;
  unsigned stride = 1;
  for (int axis = shape.size() - 1; axis >= 0; --axis) {
    linear += stride * indexes_vec[axis];
    stride *= shape[axis];
  }
  return linear;
}

// Row-major flat offset of the given per-dimension indices for a tensor of
// shape `shape` (the last index varies fastest).
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
                                           Inds... indexes) {
  int idx_list[sizeof...(indexes)] = {indexes...};
  unsigned linear = 0;
  unsigned stride = 1;
#pragma unroll
  for (int axis = sizeof...(indexes) - 1; axis >= 0; --axis) {
    linear += stride * idx_list[axis];
    stride *= shape[axis];
  }
  return linear;
}

// Row-major flat offset of `indexes_vec` for a tensor of shape `shape` (the
// last index varies fastest). Iterates over indexes_vec.ndim() dimensions.
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
                                           const Shape &indexes_vec) {
  unsigned linear = 0;
  unsigned stride = 1;
  for (int axis = indexes_vec.ndim() - 1; axis >= 0; --axis) {
    linear += stride * indexes_vec[axis];
    stride *= shape[axis];
  }
  return linear;
}

// Row-major flat offset for raw arrays: `indexes` and `shape` each hold NDim
// entries, and the last index varies fastest.
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Index *indexes,
                                           const Index *shape) {
  unsigned linear = 0;
  unsigned stride = 1;
#pragma unroll
  for (int axis = NDim - 1; axis >= 0; --axis) {
    linear += stride * indexes[axis];
    stride *= shape[axis];
  }
  return linear;
}

// Inverse of the row-major offset: splits the flat `index` into NDim
// per-dimension indices written to `output`, working from the last dimension
// to the first. Returns what remains of `index` after all dimensions have been
// divided out.
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,
                                           const Index *shape) {
#pragma unroll
  for (int axis = NDim - 1; axis >= 0; --axis) {
    output[axis] = index % shape[axis];
    index = (index - output[axis]) / shape[axis];
  }
  return index;
}

// Compile-time recursive row-major offset over the first N dimensions:
//   run<N> = indexes[N - 1] + shape[N - 1] * run<N - 1>
template <int N>
struct ArrayIndexRowMajor {
  TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
                                            const Shape &indexes) {
    const unsigned outer = ArrayIndexRowMajor<N - 1>::run(shape, indexes);
    return indexes[N - 1] + shape[N - 1] * outer;
  }
};

// Base case of the recursion: zero dimensions contribute offset 0.
template <>
struct ArrayIndexRowMajor<0> {
  TV_HOST_DEVICE_INLINE static unsigned run(const Shape & /*shape*/,
                                            const Shape & /*indexes*/) {
    return 0u;
  }
};

namespace detail {
// Maps a scalar type to a short dtype-style name ("float32", "int64", ...).
// Call as simpleTypeName<T>(); the argument exists only to select the
// specialization. The primary template is declared but not defined, so only
// the types specialized below are supported.
template <typename scalar_t>
constexpr const char *simpleTypeName(scalar_t val = scalar_t());
template <>
constexpr const char *simpleTypeName(float) {
  return "float32";
}
template <>
constexpr const char *simpleTypeName(double) {
  return "float64";
}
template <>
constexpr const char *simpleTypeName(int) {
  return "int32";
}
template <>
constexpr const char *simpleTypeName(unsigned) {
  return "uint32";
}
// `long` is 64 bits wide on LP64 targets but 32 bits on LLP64 ones (e.g.
// Windows), so the reported width follows sizeof instead of being hard-coded.
template <>
constexpr const char *simpleTypeName(long) {
  return sizeof(long) == 8 ? "int64" : "int32";
}
template <>
constexpr const char *simpleTypeName(unsigned long) {
  return sizeof(unsigned long) == 8 ? "uint64" : "uint32";
}
// 64-bit integers are `long long` where `long` is only 32 bits wide.
template <>
constexpr const char *simpleTypeName(long long) {
  return "int64";
}
template <>
constexpr const char *simpleTypeName(unsigned long long) {
  return "uint64";
}
}  // namespace detail

template <typename scalar_t, int Rank = -1>
struct TensorView {
  TV_HOST_DEVICE_INLINE TensorView() {}
  explicit TV_HOST_DEVICE_INLINE TensorView(scalar_t *ptr, Shape shape)
      : mPtr(ptr), mShape(shape) {}
  template <class... Integers>
  explicit TV_HOST_DEVICE_INLINE TensorView(scalar_t *ptr, Integers... shapes)
      : mPtr(ptr) {
    mShape = {int(shapes)...};
  }

  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &assign(
      const TensorView<scalar_t, Rank> &tensor) {
    TV_REQUIRE(tensor.shape() == shape(), "you must provide same input size%s",
               "\n");
    scalar_t *ptr = mPtr;
    const scalar_t *other_ptr = tensor.data();
    for (size_t i = 0; i < size(); ++i) *(ptr++) = *(other_ptr++);
    return *this;
  }

  template <typename T1>
  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &assign(
      std::initializer_list<T1> seq) {
    TV_REQUIRE(seq.size() == size(), "you must provide same input size%s",
               "\n");
    scalar_t *ptr = mPtr;
    for (const T1 &s : seq) *(ptr++) = scalar_t(s);
    return *this;
  }

  template <class... Inds>
  TV_HOST_DEVICE_INLINE scalar_t &operator()(Inds... inds) {
#ifdef TV_DEBUG
    int idxes[sizeof...(Inds)]{int(inds)...};
    TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
               "you provide %d indexes, but dim is %d\n", sizeof...(inds),
               mShape.ndim());
    for (int i = 0; i < sizeof...(inds); ++i) {
      TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
                 "index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
                 mShape[i]);
    }
#endif
    return mPtr[rowArrayIdx(mShape, int(inds)...)];
  }
  template <class... Inds>
  TV_HOST_DEVICE_INLINE const scalar_t &operator()(Inds... inds) const {
#ifdef TV_DEBUG
    int idxes[sizeof...(Inds)]{int(inds)...};
    TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
               "you provide %d indexes, but dim is %d\n", sizeof...(inds),
               mShape.ndim());
    for (int i = 0; i < sizeof...(inds); ++i) {
      TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
                 "index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
                 mShape[i]);
    }
#endif
    return mPtr[rowArrayIdx(mShape, int(inds)...)];
  }
  TV_HOST_DEVICE_INLINE scalar_t &operator()() {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mPtr != nullptr,
                      "you want get value but the view is empty.%s", "\n");
    TV_DEVICE_REQUIRE(mShape.ndim() == 0,
                      "you provide 0 indexes, but dim is %ld\n", mShape.ndim());
#else
    TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
               "\n");
    TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
               mShape.ndim());
#endif
#endif
    return mPtr[0];
  }
  // Read-only access to the single value of a 0-dimensional view. With
  // TV_DEBUG defined, a null pointer or a non-zero rank is reported through
  // the device or host check macro (selected by __CUDA_ARCH__).
  TV_HOST_DEVICE_INLINE const scalar_t &operator()() const {
#ifdef TV_DEBUG
#ifdef __CUDA_ARCH__
    TV_DEVICE_REQUIRE(mPtr != nullptr,
                      "you want get value but the view is empty.%s", "\n");
    TV_DEVICE_REQUIRE(mShape.ndim() == 0,
                      "you provide 0 indexes, but dim is %ld\n", mShape.ndim());
#else
    TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
               "\n");
    TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
               mShape.ndim());
#endif
#endif
    return *mPtr;
  }

  // Mutable access to element `i1` of a 1-D view.
  //
  // With TV_DEBUG defined, the rank must be 1 and `i1` must lie in
  // [0, mShape[0]); violations are reported through the device or host check
  // macro (selected by __CUDA_ARCH__).
  //
  // The index is cast to int before being handed to the "%d" conversion, as
  // the const overload and the higher-rank overloads do. Passing a wider or
  // unsigned T1 straight to "%d" is a format/argument type mismatch.
  template <class T1>
  TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mShape.ndim() == 1,
                      "you provide 1 indexes, but dim is %ld\n", mShape.ndim());
    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
                      mShape[0]);
#else
    TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
               mShape.ndim());
    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
#endif
#endif
    return mPtr[i1];
  }
  // Mutable access to element (i1, i2) of a 2-D view using row-major
  // addressing. With TV_DEBUG defined, the rank and both indices are
  // validated first.
  template <class T1, class T2>
  TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1, T2 i2) {
#if defined(TV_DEBUG)
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mShape.ndim() == 2,
                      "you provide 2 indexes, but dim is %ld\n", mShape.ndim());
    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
                      mShape[0]);
    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
                      mShape[1]);
#else
    TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
               mShape.ndim());
    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
#endif
#endif
    // Row-major flattening: i1 selects the row, i2 the column within it.
    const auto offset = i1 * mShape[1] + i2;
    return mPtr[offset];
  }
  // Mutable access to element (i1, i2, i3) of a 3-D view using row-major
  // addressing. With TV_DEBUG defined, the rank and all three indices are
  // validated first.
  template <class T1, class T2, class T3>
  TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1, T2 i2, T3 i3) {
#if defined(TV_DEBUG)
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mShape.ndim() == 3,
                      "you provide 3 indexes, but dim is %ld\n", mShape.ndim());
    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
                      mShape[0]);
    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
                      mShape[1]);
    TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
                      "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
                      mShape[2]);
#else
    TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
               mShape.ndim());
    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
    TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
#endif
#endif
    // Row-major flattening, innermost dimension last.
    const auto offset = (i1 * mShape[1] + i2) * mShape[2] + i3;
    return mPtr[offset];
  }
  // Mutable access to element (i1, i2, i3, i4) of a 4-D view using row-major
  // addressing. With TV_DEBUG defined, the rank and all four indices are
  // validated first.
  template <class T1, class T2, class T3, class T4>
  TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1, T2 i2, T3 i3, T4 i4) {
#if defined(TV_DEBUG)
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mShape.ndim() == 4,
                      "you provide 4 indexes, but dim is %ld\n", mShape.ndim());
    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
                      mShape[0]);
    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
                      mShape[1]);
    TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
                      "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
                      mShape[2]);
    TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
                      "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
                      mShape[3]);
#else
    TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
               mShape.ndim());
    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
    TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
    TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
               "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
#endif
#endif
    // Row-major flattening, innermost dimension last.
    const auto offset =
        ((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4;
    return mPtr[offset];
  }

  // Read-only access to element `i1` of a 1-D view. With TV_DEBUG defined,
  // the rank and the index are validated first.
  template <class T1>
  TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1) const {
#if defined(TV_DEBUG)
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mShape.ndim() == 1,
                      "you provide 1 indexes, but dim is %ld\n", mShape.ndim());
    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
                      mShape[0]);
#else
    TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
               mShape.ndim());
    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
#endif
#endif
    const scalar_t &element = mPtr[i1];
    return element;
  }
  // Read-only access to element (i1, i2) of a 2-D view using row-major
  // addressing. With TV_DEBUG defined, the rank and both indices are
  // validated first.
  template <class T1, class T2>
  TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1, T2 i2) const {
#if defined(TV_DEBUG)
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mShape.ndim() == 2,
                      "you provide 2 indexes, but dim is %ld\n", mShape.ndim());
    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
                      mShape[0]);
    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
                      mShape[1]);
#else
    TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
               mShape.ndim());
    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
#endif
#endif
    // Row-major flattening: i1 selects the row, i2 the column within it.
    const auto offset = i1 * mShape[1] + i2;
    return mPtr[offset];
  }
  // Read-only access to element (i1, i2, i3) of a 3-D view using row-major
  // addressing. With TV_DEBUG defined, the rank and all three indices are
  // validated first.
  template <class T1, class T2, class T3>
  TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1, T2 i2, T3 i3) const {
#if defined(TV_DEBUG)
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mShape.ndim() == 3,
                      "you provide 3 indexes, but dim is %ld\n", mShape.ndim());
    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
                      mShape[0]);
    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
                      mShape[1]);
    TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
                      "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
                      mShape[2]);
#else
    TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
               mShape.ndim());
    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
    TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
#endif
#endif
    // Row-major flattening, innermost dimension last.
    const auto offset = (i1 * mShape[1] + i2) * mShape[2] + i3;
    return mPtr[offset];
  }
  // Read-only access to element (i1, i2, i3, i4) of a 4-D view using
  // row-major addressing. With TV_DEBUG defined, the rank and all four
  // indices are validated first.
  template <class T1, class T2, class T3, class T4>
  TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1, T2 i2, T3 i3,
                                                   T4 i4) const {
#if defined(TV_DEBUG)
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mShape.ndim() == 4,
                      "you provide 4 indexes, but dim is %ld\n", mShape.ndim());
    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
                      mShape[0]);
    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
                      mShape[1]);
    TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
                      "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
                      mShape[2]);
    TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
                      "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
                      mShape[3]);
#else
    TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
               mShape.ndim());
    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
    TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
    TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
               "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
#endif
#endif
    // Row-major flattening, innermost dimension last.
    const auto offset =
        ((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4;
    return mPtr[offset];
  }

  // Mutable access by flat element index, ignoring the view's shape. With
  // TV_DEBUG defined, `idx` must lie in [0, size()).
  TV_HOST_DEVICE_INLINE scalar_t &operator[](int idx) {
#if defined(TV_DEBUG)
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(idx >= 0 && idx < size(),
                      "index(%d) out-of-range: [0, %ld)\n", int(idx), size());
#else
    TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
               int(idx), size());
#endif
#endif
    return *(mPtr + idx);
  }
  // Slice-based indexing: returns the sub-view described by `slice_vec`.
  // All work is delegated to _subview().
  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> operator[](
      SimpleVector<Slice> slice_vec) {
    return _subview(slice_vec);
  }
  // Slice-based indexing on a const view: returns the sub-view described by
  // `slice_vec`, delegating to _subview().
  // NOTE(review): this overload requires _subview() to be callable on a const
  // object; otherwise it fails to compile once instantiated.
  TV_HOST_DEVICE_INLINE const TensorView<scalar_t, Rank> operator[](
      SimpleVector<Slice> slice_vec) const {
    return _subview(slice_vec);
  }
  // True when the view has no backing pointer (mPtr is null).
  TV_HOST_DEVICE_INLINE bool empty() const { return mPtr == nullptr; }
  // Raw pointer to the viewed memory (mutable variant).
  TV_HOST_DEVICE_INLINE scalar_t *data() { return mPtr; }
  // Raw pointer to the viewed memory (read-only variant).
  TV_HOST_DEVICE_INLINE const scalar_t *data() const { return mPtr; }
  // Shape currently associated with the view.
  TV_HOST_DEVICE_INLINE const Shape &shape() const { return mShape; }
  // Extent of dimension `idx`; no bounds check is performed here.
  TV_HOST_DEVICE_INLINE int dim(int idx) const { return mShape[idx]; }
  // Number of dimensions of the view.
  TV_HOST_DEVICE_INLINE int ndim() const { return mShape.ndim(); }
  // Reshapes this view in place to the dimensions given as separate
  // arguments (each converted to int) and returns *this. The data pointer is
  // untouched; TV_ASSERT requires the new shape's size() to equal the current
  // size().
  template <class... Inds>
  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &reshape(Inds... newShapes) {
    Shape shapes{int(newShapes)...};
    TV_ASSERT(shapes.size() == size());
    mShape = shapes;
    return *this;
  }
  // Reshapes this view in place to `shapes` and returns *this. TV_ASSERT
  // requires shapes.size() to equal the current size().
  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &reshape(Shape shapes) {
    TV_ASSERT(shapes.size() == size());
    mShape = shapes;
    return *this;
  }
  // Returns a new view over the same memory with the given dimensions. The
  // first dimension equal to -1 (if any) is inferred from the remaining ones;
  // only one -1 is resolved because the loop stops after the first match.
  template <class... Inds>
  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> view(
      Inds... newShapes) const {
    Shape shapes{int(newShapes)...};
    for (size_t i = 0; i < shapes.ndim(); ++i) {
      if (shapes[i] == -1) {
        // Neutralise the placeholder first so that shapes.size() reflects
        // only the explicitly given dimensions (presumably their product),
        // then derive the missing extent from the total element count.
        shapes[i] = 1;
        shapes[i] = size() / shapes.size();
        break;
      }
    }
    TV_ASSERT(shapes.size() == size());
    return TensorView<scalar_t, Rank>(mPtr, shapes);
  }
  // Returns a new view over the same memory with shape `shapes`. No -1
  // inference is done here; TV_ASSERT requires shapes.size() == size().
  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> view(Shape shapes) const {
    TV_ASSERT(shapes.size() == size());
    return TensorView<scalar_t, Rank>(mPtr, shapes);
  }
  // Returns a view over the same memory whose shape is mShape.squeeze().
  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> squeeze() const {
    return TensorView<scalar_t, Rank>(mPtr, mShape.squeeze());
  }
  // Returns a view over the same memory whose shape is mShape.squeeze(dim).
  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> squeeze(int dim) const {
    return TensorView<scalar_t, Rank>(mPtr, mShape.squeeze(dim));
  }
  // Size reported by the shape (mShape.size()).
  TV_HOST_DEVICE_INLINE size_t size() const { return mShape.size(); }

  // Overload selected when the first argument is a Slice. It forwards to the
  // generic subview<T2, Slices...> overload with T2 = float spelled out
  // explicitly.
  // NOTE(review): T2 is not used by the generic overload's body; it appears
  // to exist only to tell the two overloads apart — confirm.
  template <class... Slices>
  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> subview(
      Slice slice, Slices... slices) const {
    return subview<float, Slice, Slices...>(slice, slices...);
  }
  // Builds a sub-view from one slice/index per leading dimension.
  //
  // Each argument is normalised with to_slice(): a Slice is used as is, while
  // any other value becomes Slice{value, -1, -1}. Entry [0] of a slice is the
  // start position and entry [1] the end; an end of -1 marks a plain index,
  // whose dimension is dropped from the resulting shape. Dimensions beyond the
  // supplied arguments are kept whole. The returned view shares memory with
  // this one, offset to the start position.
  template <class T2 = float, class... Slices>
  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> subview(
      Slices... slices) const {
    Slice slice_vec[sizeof...(Slices)] = {to_slice(slices)...};
    // Both shapes are seeded with the start positions; new_shape is
    // overwritten with real extents below, start keeps the positions.
    Shape new_shape{to_slice(slices)[0]...};
    Shape start{to_slice(slices)[0]...};
    TV_ASSERT(new_shape.ndim() <= mShape.ndim());
    TV_ASSERT(new_shape.ndim() != 0);
    size_t idxsize = new_shape.ndim();
    // Pad to the full rank: unspecified trailing dimensions start at 0.
    for (size_t i = idxsize; i < mShape.ndim(); ++i) {
      new_shape.push_back(0);
      start.push_back(0);
    }
#pragma unroll
    for (size_t i = 0; i < sizeof...(Slices); ++i) {
      if (slice_vec[i][1] != -1) {
        // Ranged slice: extent is end - start.
        new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
        TV_ASSERT(new_shape[i] >= 0);
      } else {
        // Plain index: extent 1 (dropped from reduced_shape later).
        new_shape[i] = 1;
      }
    }
    // Flat offset of the first selected element within this view.
    auto offset = rowArrayIdx(mShape, start);
#pragma unroll
    for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
      new_shape[i] = mShape[i];
      TV_ASSERT(new_shape[i] >= 0);
    }
    // Keep only the dimensions that were sliced by range, followed by the
    // untouched trailing dimensions.
    Shape reduced_shape;
#pragma unroll
    for (size_t i = 0; i < sizeof...(Slices); ++i) {
      if (slice_vec[i][1] != -1) {
        reduced_shape.push_back(new_shape[i]);
      }
    }
#pragma unroll
    for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
      reduced_shape.push_back(new_shape[i]);
    }
    return TensorView<scalar_t, Rank>(mPtr + offset, reduced_shape);
  }

  // Sub-view selected by plain integer indices on the leading dimensions.
  // The given indices are padded with zeros up to the full rank to compute
  // the flat offset; the result's shape is
  // mShape.subshape(<number of indices>) and it shares memory with this view.
  template <class... Integers>
  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> subview(int id,
                                                           Integers... ints) {
    Shape start = {id, ints...};
    // Remaining dimensions start at position 0.
    for (int i = 1 + sizeof...(ints); i < ndim(); ++i) {
      start.push_back(0);
    }
    return TensorView<scalar_t, Rank>(mPtr + rowArrayIdx(mShape, start),
                                      mShape.subshape(sizeof...(ints) + 1));
  }

  std::string repr() const {
    std::ostringstream ss;
    if (empty()) return "";
    if (mShape.ndim() == 0) {
      ss << *mPtr;
      ss << "Tensor: dtype=" << detail::simpleTypeName<scalar_t>();
      return ss.str();
    }
    Shape counter = mShape;
    auto tensor_flat = this->view(-1);
    for (int i = 0; i < counter.ndim(); ++i) {
      counter[i] = 0;
      ss << "[";
    }
    for (size_t i = 0; i < this->size(); ++i) {
      ss << tensor_flat(rowArrayIdx(mShape, counter));
      counter[counter.ndim() - 1] += 1;
      int inc_count = 0;
      bool print_comma = true;
      for (int c = counter.ndim() - 1; c >= 0; --c) {
        if (counter[c] == this->dim(c) && c > 0) {
          ++inc_count;
          counter[c - 1] += 1;
          counter[c] = 0;
          print_comma = false;
        }
      }
      if (print_comma && i != this->size() - 1) ss << ", ";
      for (int j = 0; j < inc_count; ++j) {
        ss << "]";
      }
      if (i != this->size() - 1) {
        if (inc_count != 0) ss << "\n";
        for (int j = 0; j < inc_count; ++j) {
          ss << "[";
        }
      }
    }
    ss << "]";
    ss << "Tensor: dtype=" << detail::simpleTypeName<scalar_t>();
    return ss.str();
  }

 protected:
  // TODO: make this function public.
  // currently this function is called unexpectedly when using subview({0, 0}).
  // Builds a sub-view from a runtime list of slices, one per leading
  // dimension.
  //
  // Entry [0] of each slice is the start position and entry [1] the end; an
  // end of -1 marks a plain index, whose dimension is dropped from the
  // resulting shape. Dimensions beyond slice_vec.size() are kept whole. The
  // returned view shares memory with this one, offset to the start position.
  //
  // The function only reads mPtr and mShape, so it is const-qualified. This
  // lets the const operator[](SimpleVector<Slice>) overload call it, while
  // non-const callers are unaffected.
  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> _subview(
      SimpleVector<Slice> slice_vec) const {
    Shape new_shape;
    // size_t index, matching the other loops over slice_vec below.
    for (size_t i = 0; i < slice_vec.size(); ++i) {
      new_shape.push_back(slice_vec[i][0]);
    }
    // `start` keeps the start positions; new_shape is overwritten with the
    // real extents further down.
    Shape start = new_shape;
    TV_ASSERT(new_shape.ndim() <= mShape.ndim());
    TV_ASSERT(new_shape.ndim() != 0);
    size_t idxsize = new_shape.ndim();
    // Pad to the full rank: unspecified trailing dimensions start at 0.
    for (size_t i = idxsize; i < mShape.ndim(); ++i) {
      new_shape.push_back(0);
      start.push_back(0);
    }
    for (size_t i = 0; i < slice_vec.size(); ++i) {
      if (slice_vec[i][1] != -1) {
        // Ranged slice: extent is end - start.
        new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
        TV_ASSERT(new_shape[i] >= 0);
      } else {
        new_shape[i] = 1;  // reduce dim
      }
    }
    // Flat offset of the first selected element within this view.
    auto offset = rowArrayIdx(mShape, start);
    for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
      new_shape[i] = mShape[i];
      TV_ASSERT(new_shape[i] >= 0);
    }
    // Keep only the range-sliced dimensions, then the untouched trailing ones.
    Shape reduced_shape;
    for (size_t i = 0; i < slice_vec.size(); ++i) {
      if (slice_vec[i][1] != -1) {
        reduced_shape.push_back(new_shape[i]);
      }
    }
    for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
      reduced_shape.push_back(new_shape[i]);
    }
    return TensorView<scalar_t, Rank>(mPtr + offset, reduced_shape);
  }
  // Converts a plain index into a Slice of the form {index, -1, -1}; the -1
  // end marker is what the subview code treats as "single index, drop this
  // dimension".
  template <typename T1>
  TV_HOST_DEVICE_INLINE Slice to_slice(T1 s) const {
    return Slice{int(s), -1, -1};
  }

  // Overload for arguments that already are a Slice: returns a copy.
  TV_HOST_DEVICE_INLINE Slice to_slice(Slice s) const { return Slice(s); }

  scalar_t *mPtr = nullptr;
  Shape mShape;
};

// Streams the textual representation produced by TensorView::repr() into
// `os` and returns the same stream object.
template <typename Os, typename scalar_t, int Rank>
Os &operator<<(Os &os, const TensorView<scalar_t, Rank> &dt) {
  os << dt.repr();
  return os;
}

// Same as the overload above, for views over const elements
// (TensorView<const scalar_t, Rank>).
template <typename Os, typename scalar_t, int Rank>
Os &operator<<(Os &os, const TensorView<const scalar_t, Rank> &dt) {
  os << dt.repr();
  return os;
}

namespace detail {
// Maps an element type to the printf conversion used when printing tensors.
// Only the explicit specialisations below are defined. The parameter exists
// solely so the type can also be deduced from a value.
template <typename scalar_t>
constexpr const char *printfTypeFormat(scalar_t val = scalar_t());

// Floating-point values are printed with two decimals.
template <>
constexpr const char *printfTypeFormat(float) {
  return "%.2f";
}
template <>
constexpr const char *printfTypeFormat(double) {
  return "%.2f";
}

// Integral values use the matching signed/unsigned conversion.
template <>
constexpr const char *printfTypeFormat(int) {
  return "%d";
}
template <>
constexpr const char *printfTypeFormat(unsigned) {
  return "%u";
}
template <>
constexpr const char *printfTypeFormat(long) {
  return "%ld";
}
template <>
constexpr const char *printfTypeFormat(unsigned long) {
  return "%lu";
}
}  // namespace detail

// Prints `tensor` with printf as nested bracketed text, formatting every
// element with `format`. Nothing is printed for an empty view; a
// 0-dimensional view prints its single value followed by a newline. When
// trailing dimensions wrap, the matching brackets are closed, a newline is
// emitted and they are reopened.
template <typename scalar_t>
TV_HOST_DEVICE void printTensorView(const TensorView<scalar_t> tensor,
                                    const char *format) {
  if (tensor.empty()) return;
  if (tensor.ndim() == 0) {
    printf(format, tensor());
    printf("\n");
    return;
  }
  // Multi-dimensional cursor over the elements, starting at the origin.
  Shape cursor = tensor.shape();
  auto flat = tensor.view(-1);
  for (int d = 0; d < cursor.ndim(); ++d) {
    cursor[d] = 0;
    printf("[");
  }
  const size_t total = tensor.size();
  for (size_t n = 0; n < total; ++n) {
    const bool is_last = (n == total - 1);
    printf(format, flat(rowArrayIdx(tensor.shape(), cursor)));
    cursor[cursor.ndim() - 1] += 1;
    // Carry overflowing trailing dimensions into the preceding one and count
    // how many wrapped. Dimension 0 never wraps.
    int wrapped = 0;
    for (int c = cursor.ndim() - 1; c >= 1; --c) {
      if (cursor[c] == tensor.dim(c)) {
        ++wrapped;
        cursor[c - 1] += 1;
        cursor[c] = 0;
      }
    }
    // A separator is only needed between elements of the same innermost row.
    if (wrapped == 0 && !is_last) printf(", ");
    for (int j = 0; j < wrapped; ++j) {
      printf("]");
    }
    if (!is_last) {
      if (wrapped != 0) printf("\n");
      for (int j = 0; j < wrapped; ++j) {
        printf("[");
      }
    }
  }
  printf("]\n");
}

// Prints `tensor` using the default printf conversion for its element type
// (const qualification is stripped before looking the format up).
template <typename scalar_t>
TV_HOST_DEVICE void printTensorView(TensorView<scalar_t> tensor) {
  using Traw = typename std::remove_const<scalar_t>::type;
  return printTensorView(tensor, detail::printfTypeFormat<Traw>());
}
// Prints the memory at `ptr`, interpreted with `shape`, using the default
// printf conversion for the element type. A read-only view is created over
// the pointer; no data is copied here.
template <typename scalar_t>
TV_HOST_DEVICE void printTensorView(const scalar_t *ptr, Shape shape) {
  using Traw = typename std::remove_const<scalar_t>::type;
  return printTensorView(TensorView<const scalar_t>(ptr, shape),
                         detail::printfTypeFormat<Traw>());
}
// Prints the memory at `ptr`, interpreted with `shape`, formatting every
// element with the caller-supplied printf `format`.
template <typename scalar_t>
TV_HOST_DEVICE void printTensorView(const scalar_t *ptr, Shape shape,
                                    const char *format) {
  return printTensorView(TensorView<const scalar_t>(ptr, shape), format);
}

}  // namespace tv


================================================
FILE: mmcv/ops/csrc/parrots/active_rotated_filter.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/ActiveRotatingFilter.h

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Forward pass dispatcher: hands the arguments to DISPATCH_DEVICE_IMPL.
// NOTE(review): the macro presumably selects the implementation registered
// for the tensors' device — see pytorch_device_registry.hpp.
void active_rotated_filter_forward_impl(const Tensor input,
                                        const Tensor indices, Tensor output) {
  DISPATCH_DEVICE_IMPL(active_rotated_filter_forward_impl, input, indices,
                       output);
}

// Backward pass dispatcher: hands the arguments to DISPATCH_DEVICE_IMPL.
// NOTE(review): the macro presumably selects the implementation registered
// for the tensors' device — see pytorch_device_registry.hpp.
void active_rotated_filter_backward_impl(const Tensor grad_out,
                                         const Tensor indices, Tensor grad_in) {
  DISPATCH_DEVICE_IMPL(active_rotated_filter_backward_impl, grad_out, indices,
                       grad_in);
}

// Public forward entry point; forwards unchanged to
// active_rotated_filter_forward_impl.
void active_rotated_filter_forward(const Tensor input, const Tensor indices,
                                   Tensor output) {
  active_rotated_filter_forward_impl(input, indices, output);
}

// Public backward entry point; forwards unchanged to
// active_rotated_filter_backward_impl.
void active_rotated_filter_backward(const Tensor grad_out, const Tensor indices,
                                    Tensor grad_in) {
  active_rotated_filter_backward_impl(grad_out, indices, grad_in);
}


================================================
FILE: mmcv/ops/csrc/parrots/active_rotated_filter_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "active_rotated_filter_pytorch.h"
using namespace parrots;

#ifdef MMCV_WITH_CUDA
// CUDA adapter for the forward op: converts the two parrots inputs and the
// single output with buildATensor and calls active_rotated_filter_forward.
// `attr` is not read.
void active_rotated_filter_forward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  auto in_tensor = buildATensor(ctx, ins[0]);
  auto index_tensor = buildATensor(ctx, ins[1]);
  auto out_tensor = buildATensor(ctx, outs[0]);
  active_rotated_filter_forward(in_tensor, index_tensor, out_tensor);
}

// CUDA adapter for the backward op: converts the two parrots inputs and the
// single output with buildATensor and calls active_rotated_filter_backward.
// `attr` is not read.
void active_rotated_filter_backward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  auto grad_output = buildATensor(ctx, ins[0]);
  auto index_tensor = buildATensor(ctx, ins[1]);
  auto grad_input = buildATensor(ctx, outs[0]);
  active_rotated_filter_backward(grad_output, index_tensor, grad_input);
}
#endif

// Host adapter for the forward op: converts the two parrots inputs and the
// single output with buildATensor and calls active_rotated_filter_forward.
// `attr` is not read.
void active_rotated_filter_forward_cpu_parrots(
    HostContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  auto in_tensor = buildATensor(ctx, ins[0]);
  auto index_tensor = buildATensor(ctx, ins[1]);
  auto out_tensor = buildATensor(ctx, outs[0]);
  active_rotated_filter_forward(in_tensor, index_tensor, out_tensor);
}

// Host adapter for the backward op: converts the two parrots inputs and the
// single output with buildATensor and calls active_rotated_filter_backward.
// `attr` is not read.
void active_rotated_filter_backward_cpu_parrots(
    HostContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  auto grad_output = buildATensor(ctx, ins[0]);
  auto index_tensor = buildATensor(ctx, ins[1]);
  auto grad_input = buildATensor(ctx, outs[0]);
  active_rotated_filter_backward(grad_output, index_tensor, grad_input);
}

// Registers the forward op with parrots: 2 inputs, 1 output, the host
// adapter, and the CUDA adapter when built with MMCV_WITH_CUDA.
PARROTS_EXTENSION_REGISTER(active_rotated_filter_forward)
    .input(2)
    .output(1)
    .apply(active_rotated_filter_forward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(active_rotated_filter_forward_cuda_parrots)
#endif
    .done();

// Registers the backward op with parrots: 2 inputs, 1 output, the host
// adapter, and the CUDA adapter when built with MMCV_WITH_CUDA.
PARROTS_EXTENSION_REGISTER(active_rotated_filter_backward)
    .input(2)
    .output(1)
    .apply(active_rotated_filter_backward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(active_rotated_filter_backward_cuda_parrots)
#endif
    .done();


================================================
FILE: mmcv/ops/csrc/parrots/active_rotated_filter_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ACTIVE_ROTATED_FILTER_PYTORCH_H
#define ACTIVE_ROTATED_FILTER_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Forward pass of the active rotated filter op; the result is written into
// `output`.
void active_rotated_filter_forward(const Tensor input, const Tensor indices,
                                   Tensor output);

// Backward pass of the active rotated filter op; the result is written into
// `grad_in`.
void active_rotated_filter_backward(const Tensor grad_out, const Tensor indices,
                                    Tensor grad_in);

#endif  // ACTIVE_ROTATED_FILTER_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/assign_score_withk.cpp
================================================
// Modified from
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Forward pass dispatcher: hands all arguments to DISPATCH_DEVICE_IMPL.
// NOTE(review): the macro presumably selects the implementation registered
// for the tensors' device — see pytorch_device_registry.hpp.
void assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,
                                     int aggregate, const Tensor& points,
                                     const Tensor& centers,
                                     const Tensor& scores,
                                     const Tensor& knn_idx, Tensor& output) {
  DISPATCH_DEVICE_IMPL(assign_score_withk_forward_impl, B, N0, N1, M, K, O,
                       aggregate, points, centers, scores, knn_idx, output);
}

// Backward pass dispatcher: hands all arguments to DISPATCH_DEVICE_IMPL.
// NOTE(review): the macro presumably selects the implementation registered
// for the tensors' device — see pytorch_device_registry.hpp.
void assign_score_withk_backward_impl(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor& grad_out, const Tensor& points, const Tensor& centers,
    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
    Tensor& grad_centers, Tensor& grad_scores) {
  DISPATCH_DEVICE_IMPL(assign_score_withk_backward_impl, B, N0, N1, M, K, O,
                       aggregate, grad_out, points, centers, scores, knn_idx,
                       grad_points, grad_centers, grad_scores);
}

// Public forward entry point. Reorders the arguments (scalars first) and
// forwards them to assign_score_withk_forward_impl.
void assign_score_withk_forward(const Tensor& points, const Tensor& centers,
                                const Tensor& scores, const Tensor& knn_idx,
                                Tensor& output, int B, int N0, int N1, int M,
                                int K, int O, int aggregate) {
  assign_score_withk_forward_impl(B, N0, N1, M, K, O, aggregate, points,
                                  centers, scores, knn_idx, output);
}

// Public backward entry point. Reorders the arguments (scalars first) and
// forwards them to assign_score_withk_backward_impl.
void assign_score_withk_backward(const Tensor& grad_out, const Tensor& points,
                                 const Tensor& centers, const Tensor& scores,
                                 const Tensor& knn_idx, Tensor& grad_points,
                                 Tensor& grad_centers, Tensor& grad_scores,
                                 int B, int N0, int N1, int M, int K, int O,
                                 int aggregate) {
  assign_score_withk_backward_impl(B, N0, N1, M, K, O, aggregate, grad_out,
                                   points, centers, scores, knn_idx,
                                   grad_points, grad_centers, grad_scores);
}


================================================
FILE: mmcv/ops/csrc/parrots/assign_score_withk_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "assign_score_withk_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// CUDA adapter for the forward op: reads the seven integer attributes,
// converts the four inputs and the single output with buildATensor, and
// calls assign_score_withk_forward.
void assign_score_withk_forward_cuda_parrots(CudaContext& ctx,
                                             const SSElement& attr,
                                             const OperatorBase::in_list_t& ins,
                                             OperatorBase::out_list_t& outs) {
  // Filled in by the attribute reader below.
  int B, N0, N1, M, K, O, aggregate;
  SSAttrs(attr)
      .get<int>("B", B)
      .get<int>("N0", N0)
      .get<int>("N1", N1)
      .get<int>("M", M)
      .get<int>("K", K)
      .get<int>("O", O)
      .get<int>("aggregate", aggregate)
      .done();

  const auto& points_t = buildATensor(ctx, ins[0]);
  const auto& centers_t = buildATensor(ctx, ins[1]);
  const auto& scores_t = buildATensor(ctx, ins[2]);
  const auto& knn_idx_t = buildATensor(ctx, ins[3]);

  // Passed by non-const reference, so it must be a named object.
  auto output_t = buildATensor(ctx, outs[0]);
  assign_score_withk_forward(points_t, centers_t, scores_t, knn_idx_t,
                             output_t, B, N0, N1, M, K, O, aggregate);
}

// CUDA adapter for the backward op: reads the seven integer attributes,
// converts the five inputs and the three outputs with buildATensor, and
// calls assign_score_withk_backward.
void assign_score_withk_backward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  // Filled in by the attribute reader below.
  int B, N0, N1, M, K, O, aggregate;
  SSAttrs(attr)
      .get<int>("B", B)
      .get<int>("N0", N0)
      .get<int>("N1", N1)
      .get<int>("M", M)
      .get<int>("K", K)
      .get<int>("O", O)
      .get<int>("aggregate", aggregate)
      .done();

  const auto& grad_out_t = buildATensor(ctx, ins[0]);
  const auto& points_t = buildATensor(ctx, ins[1]);
  const auto& centers_t = buildATensor(ctx, ins[2]);
  const auto& scores_t = buildATensor(ctx, ins[3]);
  const auto& knn_idx_t = buildATensor(ctx, ins[4]);

  // Passed by non-const reference, so they must be named objects.
  auto grad_points_t = buildATensor(ctx, outs[0]);
  auto grad_centers_t = buildATensor(ctx, outs[1]);
  auto grad_scores_t = buildATensor(ctx, outs[2]);
  assign_score_withk_backward(grad_out_t, points_t, centers_t, scores_t,
                              knn_idx_t, grad_points_t, grad_centers_t,
                              grad_scores_t, B, N0, N1, M, K, O, aggregate);
}

// Registers the forward op with parrots: seven attributes, 4 inputs,
// 1 output, CUDA adapter only (this code sits inside MMCV_WITH_CUDA).
PARROTS_EXTENSION_REGISTER(assign_score_withk_forward)
    .attr("B")
    .attr("N0")
    .attr("N1")
    .attr("M")
    .attr("K")
    .attr("O")
    .attr("aggregate")
    .input(4)
    .output(1)
    .apply(assign_score_withk_forward_cuda_parrots)
    .done();

// Registers the backward op with parrots: seven attributes, 5 inputs,
// 3 outputs, CUDA adapter only (this code sits inside MMCV_WITH_CUDA).
PARROTS_EXTENSION_REGISTER(assign_score_withk_backward)
    .attr("B")
    .attr("N0")
    .attr("N1")
    .attr("M")
    .attr("K")
    .attr("O")
    .attr("aggregate")
    .input(5)
    .output(3)
    .apply(assign_score_withk_backward_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/assign_score_withk_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ASSIGN_SCORE_WITHK_PYTORCH_H
#define ASSIGN_SCORE_WITHK_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Forward pass of assign_score_withk; the result is written into `output`.
void assign_score_withk_forward(const Tensor& points, const Tensor& centers,
                                const Tensor& scores, const Tensor& knn_idx,
                                Tensor& output, int B, int N0, int N1, int M,
                                int K, int O, int aggregate);

// Backward pass of assign_score_withk; results are written into
// `grad_points`, `grad_centers` and `grad_scores`.
void assign_score_withk_backward(const Tensor& grad_out, const Tensor& points,
                                 const Tensor& centers, const Tensor& scores,
                                 const Tensor& knn_idx, Tensor& grad_points,
                                 Tensor& grad_centers, Tensor& grad_scores,
                                 int B, int N0, int N1, int M, int K, int O,
                                 int aggregate);

#endif  // ASSIGN_SCORE_WITHK_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/ball_query._parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "ball_query_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// CUDA adapter for ball_query_forward: reads the four integer and two float
// attributes, converts the two inputs and the single output with
// buildATensor, and calls ball_query_forward.
void ball_query_parrots(CudaContext& ctx, const SSElement& attr,
                        const OperatorBase::in_list_t& ins,
                        OperatorBase::out_list_t& outs) {
  // Filled in by the attribute reader below.
  int b, n, m, nsample;
  float min_radius, max_radius;
  SSAttrs(attr)
      .get<int>("b", b)
      .get<int>("n", n)
      .get<int>("m", m)
      .get<int>("nsample", nsample)
      .get<float>("min_radius", min_radius)
      .get<float>("max_radius", max_radius)
      .done();

  const auto& centers_t = buildATensor(ctx, ins[0]);
  const auto& points_t = buildATensor(ctx, ins[1]);
  auto idx_t = buildATensor(ctx, outs[0]);
  // Note the argument order: the radii come before nsample.
  ball_query_forward(centers_t, points_t, idx_t, b, n, m, min_radius,
                     max_radius, nsample);
}

// Registers ball_query_forward with parrots: six attributes, 2 inputs,
// 1 output, CUDA adapter only (this code sits inside MMCV_WITH_CUDA).
PARROTS_EXTENSION_REGISTER(ball_query_forward)
    .attr("b")
    .attr("n")
    .attr("m")
    .attr("nsample")
    .attr("min_radius")
    .attr("max_radius")
    .input(2)
    .output(1)
    .apply(ball_query_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/ball_query.cpp
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query.cpp

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatcher for ball_query_forward: hands all arguments unchanged to
// DISPATCH_DEVICE_IMPL under the key `ball_query_forward_impl`.
// Presumably the macro selects the implementation registered for the tensors'
// device — see pytorch_device_registry.hpp to confirm.
void ball_query_forward_impl(int b, int n, int m, float min_radius,
                             float max_radius, int nsample,
                             const Tensor new_xyz, const Tensor xyz,
                             Tensor idx) {
  DISPATCH_DEVICE_IMPL(ball_query_forward_impl, b, n, m, min_radius, max_radius,
                       nsample, new_xyz, xyz, idx);
}

// Public entry point of the ball query op.
// Only reorders the arguments (tensors first here, scalars first in the
// dispatcher) and forwards them to ball_query_forward_impl.
void ball_query_forward(Tensor new_xyz_tensor, Tensor xyz_tensor,
                        Tensor idx_tensor, int b, int n, int m,
                        float min_radius, float max_radius, int nsample) {
  ball_query_forward_impl(b, n, m, min_radius, max_radius, nsample,
                          new_xyz_tensor, xyz_tensor, idx_tensor);
}


================================================
FILE: mmcv/ops/csrc/parrots/ball_query_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef BALL_QUERY_PYTORCH_H
#define BALL_QUERY_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Ball query entry point called by the parrots adapter.
// Takes two read-only tensors (new_xyz, xyz), the tensor idx that is
// presumably written with the result, and six scalar parameters.
void ball_query_forward(const Tensor new_xyz, const Tensor xyz, Tensor idx,
                        int b, int n, int m, float min_radius, float max_radius,
                        int nsample);

#endif  // BALL_QUERY_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/bbox_overlaps.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatcher for bbox_overlaps: passes all arguments unchanged to
// DISPATCH_DEVICE_IMPL under the key `bbox_overlaps_impl`.
// Presumably the macro selects the implementation registered for the tensors'
// device — see pytorch_device_registry.hpp to confirm.
void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                        const int mode, const bool aligned, const int offset) {
  DISPATCH_DEVICE_IMPL(bbox_overlaps_impl, bboxes1, bboxes2, ious, mode,
                       aligned, offset);
}

// Public entry point of the bbox_overlaps op; forwards every argument, in the
// same order, to bbox_overlaps_impl.
void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                   const int mode, const bool aligned, const int offset) {
  bbox_overlaps_impl(bboxes1, bboxes2, ious, mode, aligned, offset);
}


================================================
FILE: mmcv/ops/csrc/parrots/bbox_overlaps_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "bbox_overlaps_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
/*
 * void bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor
 * ious, const int mode, const bool aligned, const int offset);
 */
// Parrots CUDA adapter for the `bbox_overlaps` op.
// Reads the `mode`, `aligned` and `offset` attributes, wraps ins[0], ins[1]
// and outs[0] as ATen tensors and calls bbox_overlaps_cuda().
void bbox_overlaps_parrots(CudaContext& ctx, const SSElement& attr,
                           const OperatorBase::in_list_t& ins,
                           OperatorBase::out_list_t& outs) {
  // Scalar attributes, filled in by the SSAttrs chain below.
  int mode_attr;
  bool aligned_attr;
  int offset_attr;
  SSAttrs(attr)
      .get<int>("mode", mode_attr)
      .get<bool>("aligned", aligned_attr)
      .get<int>("offset", offset_attr)
      .done();

  // ins[0] -> bboxes1, ins[1] -> bboxes2, outs[0] -> ious.
  const auto& first_boxes = buildATensor(ctx, ins[0]);
  const auto& second_boxes = buildATensor(ctx, ins[1]);
  auto ious_out = buildATensor(ctx, outs[0]);

  bbox_overlaps_cuda(first_boxes, second_boxes, ious_out, mode_attr,
                     aligned_attr, offset_attr);
}

// Registers the parrots op `bbox_overlaps`: three attributes, two inputs and
// one output, handled by bbox_overlaps_parrots (CUDA builds only, because of
// the surrounding MMCV_WITH_CUDA guard).
PARROTS_EXTENSION_REGISTER(bbox_overlaps)
    .attr("mode")
    .attr("aligned")
    .attr("offset")
    .input(2)
    .output(1)
    .apply(bbox_overlaps_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/bbox_overlaps_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef BBOX_OVERLAPS_PYTORCH_H
#define BBOX_OVERLAPS_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// CUDA entry point called by bbox_overlaps_parrots.
// NOTE(review): the accompanying bbox_overlaps.cpp defines `bbox_overlaps` /
// `bbox_overlaps_impl`, not `bbox_overlaps_cuda` — confirm where this symbol
// is defined.
void bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                        const int mode, const bool aligned, const int offset);

#endif  // BBOX_OVERLAPS_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/border_align.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatcher for the border_align forward pass: passes all arguments unchanged
// to DISPATCH_DEVICE_IMPL under the key `border_align_forward_impl`.
// Presumably the macro selects the implementation registered for the tensors'
// device — see pytorch_device_registry.hpp to confirm.
void border_align_forward_impl(const Tensor &input, const Tensor &boxes,
                               Tensor output, Tensor argmax_idx,
                               const int pool_size) {
  DISPATCH_DEVICE_IMPL(border_align_forward_impl, input, boxes, output,
                       argmax_idx, pool_size);
}

// Dispatcher for the border_align backward pass: passes all arguments
// unchanged to DISPATCH_DEVICE_IMPL under the key
// `border_align_backward_impl`.
// Presumably the macro selects the implementation registered for the tensors'
// device — see pytorch_device_registry.hpp to confirm.
void border_align_backward_impl(const Tensor &grad_output, const Tensor &boxes,
                                const Tensor &argmax_idx, Tensor grad_input,
                                const int pool_size) {
  DISPATCH_DEVICE_IMPL(border_align_backward_impl, grad_output, boxes,
                       argmax_idx, grad_input, pool_size);
}

// Public entry point of the border_align forward pass; forwards every
// argument, in the same order, to border_align_forward_impl.
void border_align_forward(const Tensor &input, const Tensor &boxes,
                          Tensor output, Tensor argmax_idx,
                          const int pool_size) {
  border_align_forward_impl(input, boxes, output, argmax_idx, pool_size);
}

// Public entry point of the border_align backward pass; forwards every
// argument, in the same order, to border_align_backward_impl.
void border_align_backward(const Tensor &grad_output, const Tensor &boxes,
                           const Tensor &argmax_idx, Tensor grad_input,
                           const int pool_size) {
  border_align_backward_impl(grad_output, boxes, argmax_idx, grad_input,
                             pool_size);
}


================================================
FILE: mmcv/ops/csrc/parrots/border_align_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "border_align_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA adapter for `border_align_forward`.
// Reads the `pool_size` attribute, wraps two inputs and two outputs as ATen
// tensors and calls border_align_forward_cuda().
void border_align_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                       const OperatorBase::in_list_t& ins,
                                       OperatorBase::out_list_t& outs) {
  int pool_size_attr;
  SSAttrs(attr).get<int>("pool_size", pool_size_attr).done();

  // ins[0] -> input, ins[1] -> boxes.
  const auto& in_tensor = buildATensor(ctx, ins[0]);
  const auto& box_tensor = buildATensor(ctx, ins[1]);

  // outs[0] -> output, outs[1] -> argmax_idx.
  auto out_tensor = buildATensor(ctx, outs[0]);
  auto argmax_tensor = buildATensor(ctx, outs[1]);

  border_align_forward_cuda(in_tensor, box_tensor, out_tensor, argmax_tensor,
                            pool_size_attr);
}

// Parrots CUDA adapter for `border_align_backward`.
// Reads the `pool_size` attribute, wraps three inputs and one output as ATen
// tensors and calls border_align_backward_cuda().
void border_align_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                        const OperatorBase::in_list_t& ins,
                                        OperatorBase::out_list_t& outs) {
  int pool_size_attr;
  SSAttrs(attr).get<int>("pool_size", pool_size_attr).done();

  // ins[0] -> grad_output, ins[1] -> boxes, ins[2] -> argmax_idx.
  const auto& grad_out = buildATensor(ctx, ins[0]);
  const auto& box_tensor = buildATensor(ctx, ins[1]);
  const auto& argmax_tensor = buildATensor(ctx, ins[2]);

  // outs[0] -> grad_input.
  auto grad_in = buildATensor(ctx, outs[0]);

  border_align_backward_cuda(grad_out, box_tensor, argmax_tensor, grad_in,
                             pool_size_attr);
}

// Registers the parrots op `border_align_forward`: one attribute, two inputs
// and two outputs, handled by border_align_forward_cuda_parrots.
PARROTS_EXTENSION_REGISTER(border_align_forward)
    .attr("pool_size")
    .input(2)
    .output(2)
    .apply(border_align_forward_cuda_parrots)
    .done();

// Registers the parrots op `border_align_backward`: one attribute, three
// inputs and one output, handled by border_align_backward_cuda_parrots.
PARROTS_EXTENSION_REGISTER(border_align_backward)
    .attr("pool_size")
    .input(3)
    .output(1)
    .apply(border_align_backward_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/border_align_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef BORDER_ALIGN_PYTORCH_H
#define BORDER_ALIGN_PYTORCH_H
#include <torch/extension.h>
using namespace at;

#ifdef MMCV_WITH_CUDA
// CUDA forward entry point called by border_align_forward_cuda_parrots.
// NOTE(review): the accompanying border_align.cpp defines
// `border_align_forward`, not `border_align_forward_cuda` — confirm where this
// symbol is defined.
void border_align_forward_cuda(const Tensor &input, const Tensor &boxes,
                               Tensor output, Tensor argmax_idx,
                               const int pool_size);

// CUDA backward entry point called by border_align_backward_cuda_parrots.
// NOTE(review): the accompanying border_align.cpp defines
// `border_align_backward`, not `border_align_backward_cuda` — confirm where
// this symbol is defined.
void border_align_backward_cuda(const Tensor &grad_output, const Tensor &boxes,
                                const Tensor &argmax_idx, Tensor grad_input,
                                const int pool_size);
#endif

#endif  // BORDER_ALIGN_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/box_iou_rotated.cpp
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatcher for box_iou_rotated: passes all arguments unchanged to
// DISPATCH_DEVICE_IMPL under the key `box_iou_rotated_impl`.
// Presumably the macro selects the implementation registered for the tensors'
// device — see pytorch_device_registry.hpp to confirm.
void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                          const int mode_flag, const bool aligned) {
  DISPATCH_DEVICE_IMPL(box_iou_rotated_impl, boxes1, boxes2, ious, mode_flag,
                       aligned);
}

// Interface for Python: forwards every argument, in the same order, to
// box_iou_rotated_impl.
// NOTE: the comment inherited from the upstream header said `inline` was
// needed to avoid multiple definitions; this definition is not inline and
// lives in a .cpp file, so that remark no longer applies here.
void box_iou_rotated(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                     const int mode_flag, const bool aligned) {
  box_iou_rotated_impl(boxes1, boxes2, ious, mode_flag, aligned);
}


================================================
FILE: mmcv/ops/csrc/parrots/box_iou_rotated_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "box_iou_rotated_pytorch.h"

using namespace parrots;

/*
 * void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor
 * ious, const int mode_flag, const bool aligned);
 */
// Parrots host (CPU) adapter for the `box_iou_rotated` op.
// Reads the `aligned` and `mode_flag` attributes, wraps ins[0], ins[1] and
// outs[0] as ATen tensors and calls box_iou_rotated_cpu().
void box_iou_rotated_cpu_parrots(HostContext& ctx, const SSElement& attr,
                                 const OperatorBase::in_list_t& ins,
                                 OperatorBase::out_list_t& outs) {
  // Scalar attributes, filled in by the SSAttrs chain below.
  bool aligned_attr;
  int mode_attr;
  SSAttrs(attr)
      .get<bool>("aligned", aligned_attr)
      .get<int>("mode_flag", mode_attr)
      .done();

  // ins[0] -> boxes1, ins[1] -> boxes2, outs[0] -> ious.
  const auto& first_boxes = buildATensor(ctx, ins[0]);
  const auto& second_boxes = buildATensor(ctx, ins[1]);
  auto ious_out = buildATensor(ctx, outs[0]);

  box_iou_rotated_cpu(first_boxes, second_boxes, ious_out, mode_attr,
                      aligned_attr);
}

#ifdef MMCV_WITH_CUDA
/*
 * void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor
 * ious, const int mode_flag, const bool aligned);
 */
// Parrots CUDA adapter for the `box_iou_rotated` op.
// Reads the `aligned` and `mode_flag` attributes, wraps ins[0], ins[1] and
// outs[0] as ATen tensors and calls box_iou_rotated_cuda().
void box_iou_rotated_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                  const OperatorBase::in_list_t& ins,
                                  OperatorBase::out_list_t& outs) {
  // Scalar attributes, filled in by the SSAttrs chain below.
  bool aligned_attr;
  int mode_attr;
  SSAttrs(attr)
      .get<bool>("aligned", aligned_attr)
      .get<int>("mode_flag", mode_attr)
      .done();

  // ins[0] -> boxes1, ins[1] -> boxes2, outs[0] -> ious.
  const auto& first_boxes = buildATensor(ctx, ins[0]);
  const auto& second_boxes = buildATensor(ctx, ins[1]);
  auto ious_out = buildATensor(ctx, outs[0]);

  box_iou_rotated_cuda(first_boxes, second_boxes, ious_out, mode_attr,
                       aligned_attr);
}
#endif

// Registers the parrots op `box_iou_rotated`: two attributes, two inputs and
// one output. The host handler is always attached; the CUDA handler is added
// only when MMCV_WITH_CUDA is defined.
PARROTS_EXTENSION_REGISTER(box_iou_rotated)
    .attr("aligned")
    .attr("mode_flag")
    .input(2)
    .output(1)
    .apply(box_iou_rotated_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(box_iou_rotated_cuda_parrots)
#endif
    .done();


================================================
FILE: mmcv/ops/csrc/parrots/box_iou_rotated_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef BOX_IOU_ROTATED_PYTORCH_H
#define BOX_IOU_ROTATED_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// CPU entry point called by box_iou_rotated_cpu_parrots.
// NOTE(review): the accompanying box_iou_rotated.cpp defines
// `box_iou_rotated`, not `box_iou_rotated_cpu` — confirm where this symbol is
// defined.
void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                         const int mode_flag, const bool aligned);

#ifdef MMCV_WITH_CUDA
// CUDA entry point called by box_iou_rotated_cuda_parrots; same parameter
// list as the CPU variant.
// NOTE(review): no definition of this symbol is visible in the accompanying
// box_iou_rotated.cpp — confirm where it is defined.
void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                          const int mode_flag, const bool aligned);
#endif

#endif  // BOX_IOU_ROTATED_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/carafe.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatcher for the CARAFE forward pass: passes all arguments unchanged to
// DISPATCH_DEVICE_IMPL under the key `carafe_forward_impl`.
// Presumably the macro selects the implementation registered for the tensors'
// device — see pytorch_device_registry.hpp to confirm.
void carafe_forward_impl(Tensor features, Tensor masks, Tensor rfeatures,
                         Tensor routput, Tensor rmasks, Tensor output,
                         int kernel_size, int group_size, int scale_factor) {
  DISPATCH_DEVICE_IMPL(carafe_forward_impl, features, masks, rfeatures, routput,
                       rmasks, output, kernel_size, group_size, scale_factor);
}

// Dispatcher for the CARAFE backward pass: passes all arguments unchanged to
// DISPATCH_DEVICE_IMPL under the key `carafe_backward_impl`.
// Presumably the macro selects the implementation registered for the tensors'
// device — see pytorch_device_registry.hpp to confirm.
void carafe_backward_impl(Tensor top_grad, Tensor rfeatures, Tensor masks,
                          Tensor rtop_grad, Tensor rbottom_grad_hs,
                          Tensor rbottom_grad, Tensor rmask_grad,
                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,
                          int group_size, int scale_factor) {
  DISPATCH_DEVICE_IMPL(carafe_backward_impl, top_grad, rfeatures, masks,
                       rtop_grad, rbottom_grad_hs, rbottom_grad, rmask_grad,
                       bottom_grad, mask_grad, kernel_size, group_size,
                       scale_factor);
}

// Public entry point of the CARAFE forward pass; forwards every argument, in
// the same order, to carafe_forward_impl.
void carafe_forward(Tensor features, Tensor masks, Tensor rfeatures,
                    Tensor routput, Tensor rmasks, Tensor output,
                    int kernel_size, int group_size, int scale_factor) {
  carafe_forward_impl(features, masks, rfeatures, routput, rmasks, output,
                      kernel_size, group_size, scale_factor);
}

// Public entry point of the CARAFE backward pass; forwards every argument, in
// the same order, to carafe_backward_impl.
void carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,
                     Tensor rtop_grad, Tensor rbottom_grad_hs,
                     Tensor rbottom_grad, Tensor rmask_grad, Tensor bottom_grad,
                     Tensor mask_grad, int kernel_size, int group_size,
                     int scale_factor) {
  carafe_backward_impl(top_grad, rfeatures, masks, rtop_grad, rbottom_grad_hs,
                       rbottom_grad, rmask_grad, bottom_grad, mask_grad,
                       kernel_size, group_size, scale_factor);
}


================================================
FILE: mmcv/ops/csrc/parrots/carafe_naive.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatcher for the naive CARAFE forward pass: passes all arguments unchanged
// to DISPATCH_DEVICE_IMPL under the key `carafe_naive_forward_impl`.
// Presumably the macro selects the implementation registered for the tensors'
// device — see pytorch_device_registry.hpp to confirm.
void carafe_naive_forward_impl(Tensor features, Tensor masks, Tensor output,
                               int kernel_size, int group_size,
                               int scale_factor) {
  DISPATCH_DEVICE_IMPL(carafe_naive_forward_impl, features, masks, output,
                       kernel_size, group_size, scale_factor);
}

// Dispatcher for the naive CARAFE backward pass: passes all arguments
// unchanged to DISPATCH_DEVICE_IMPL under the key
// `carafe_naive_backward_impl`.
// Presumably the macro selects the implementation registered for the tensors'
// device — see pytorch_device_registry.hpp to confirm.
void carafe_naive_backward_impl(Tensor top_grad, Tensor features, Tensor masks,
                                Tensor bottom_grad, Tensor mask_grad,
                                int kernel_size, int group_size,
                                int scale_factor) {
  DISPATCH_DEVICE_IMPL(carafe_naive_backward_impl, top_grad, features, masks,
                       bottom_grad, mask_grad, kernel_size, group_size,
                       scale_factor);
}

// Public entry point of the naive CARAFE forward pass; forwards every
// argument, in the same order, to carafe_naive_forward_impl.
void carafe_naive_forward(Tensor features, Tensor masks, Tensor output,
                          int kernel_size, int group_size, int scale_factor) {
  carafe_naive_forward_impl(features, masks, output, kernel_size, group_size,
                            scale_factor);
}

// Public entry point of the naive CARAFE backward pass; forwards every
// argument, in the same order, to carafe_naive_backward_impl.
void carafe_naive_backward(Tensor top_grad, Tensor features, Tensor masks,
                           Tensor bottom_grad, Tensor mask_grad,
                           int kernel_size, int group_size, int scale_factor) {
  carafe_naive_backward_impl(top_grad, features, masks, bottom_grad, mask_grad,
                             kernel_size, group_size, scale_factor);
}


================================================
FILE: mmcv/ops/csrc/parrots/carafe_naive_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "carafe_naive_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
/*void carafe_naive_forward_cuda(Tensor features, Tensor masks, Tensor output,
 *                                int kernel_size, int group_size,
 *                                int scale_factor)
 */
// Parrots CUDA adapter for `carafe_naive_forward`.
// Reads the three integer attributes, wraps two inputs and one output as ATen
// tensors and calls carafe_naive_forward_cuda().
void carafe_naive_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                       const OperatorBase::in_list_t& ins,
                                       OperatorBase::out_list_t& outs) {
  // Scalar attributes, filled in by the SSAttrs chain below.
  int kernel_attr;
  int group_attr;
  int scale_attr;
  SSAttrs(attr)
      .get<int>("kernel_size", kernel_attr)
      .get<int>("group_size", group_attr)
      .get<int>("scale_factor", scale_attr)
      .done();

  // ins[0] -> features, ins[1] -> masks.
  const auto& feat_in = buildATensor(ctx, ins[0]);
  const auto& mask_in = buildATensor(ctx, ins[1]);

  // outs[0] -> output.
  auto result = buildATensor(ctx, outs[0]);

  carafe_naive_forward_cuda(feat_in, mask_in, result, kernel_attr, group_attr,
                            scale_attr);
}

/*void carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor
 * masks, Tensor bottom_grad, Tensor mask_grad, int kernel_size, int group_size,
 *                                int scale_factor);
 */
// Parrots CUDA adapter for `carafe_naive_backward`.
// Reads the three integer attributes, wraps three inputs and two outputs as
// ATen tensors and calls carafe_naive_backward_cuda().
void carafe_naive_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                        const OperatorBase::in_list_t& ins,
                                        OperatorBase::out_list_t& outs) {
  // Scalar attributes, filled in by the SSAttrs chain below.
  int kernel_attr;
  int group_attr;
  int scale_attr;
  SSAttrs(attr)
      .get<int>("kernel_size", kernel_attr)
      .get<int>("group_size", group_attr)
      .get<int>("scale_factor", scale_attr)
      .done();

  // ins[0] -> top_grad, ins[1] -> features, ins[2] -> masks.
  const auto& grad_top = buildATensor(ctx, ins[0]);
  const auto& feat_in = buildATensor(ctx, ins[1]);
  const auto& mask_in = buildATensor(ctx, ins[2]);

  // outs[0] -> bottom_grad, outs[1] -> mask_grad.
  auto grad_bottom = buildATensor(ctx, outs[0]);
  auto grad_mask = buildATensor(ctx, outs[1]);

  carafe_naive_backward_cuda(grad_top, feat_in, mask_in, grad_bottom, grad_mask,
                             kernel_attr, group_attr, scale_attr);
}

// Registers the parrots op `carafe_naive_forward`: three attributes, two
// inputs and one output, handled by carafe_naive_forward_cuda_parrots.
PARROTS_EXTENSION_REGISTER(carafe_naive_forward)
    .attr("kernel_size")
    .attr("group_size")
    .attr("scale_factor")
    .input(2)
    .output(1)
    .apply(carafe_naive_forward_cuda_parrots)
    .done();

// Registers the parrots op `carafe_naive_backward`: three attributes, three
// inputs and two outputs, handled by carafe_naive_backward_cuda_parrots.
PARROTS_EXTENSION_REGISTER(carafe_naive_backward)
    .attr("kernel_size")
    .attr("group_size")
    .attr("scale_factor")
    .input(3)
    .output(2)
    .apply(carafe_naive_backward_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/carafe_naive_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef CARAFE_NAIVE_PYTORCH_H
#define CARAFE_NAIVE_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// CUDA forward entry point called by carafe_naive_forward_cuda_parrots.
// NOTE(review): the accompanying carafe_naive.cpp defines
// `carafe_naive_forward`, not `carafe_naive_forward_cuda` — confirm where this
// symbol is defined.
void carafe_naive_forward_cuda(Tensor features, Tensor masks, Tensor output,
                               int kernel_size, int group_size,
                               int scale_factor);

// CUDA backward entry point called by carafe_naive_backward_cuda_parrots.
// NOTE(review): the accompanying carafe_naive.cpp defines
// `carafe_naive_backward`, not `carafe_naive_backward_cuda` — confirm where
// this symbol is defined.
void carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor masks,
                                Tensor bottom_grad, Tensor mask_grad,
                                int kernel_size, int group_size,
                                int scale_factor);
#endif  // CARAFE_NAIVE_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/carafe_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "carafe_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
/*
 * void carafe_forward_cuda(Tensor features, Tensor masks, Tensor rfeatures,
 *                          Tensor routput, Tensor rmasks, Tensor output,
 *                          int kernel_size, int group_size, int scale_factor);
 */
// Parrots CUDA adapter for `carafe_forward`.
// Reads the three integer attributes, wraps two inputs and four outputs as
// ATen tensors and calls carafe_forward_cuda().
void carafe_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                 const OperatorBase::in_list_t& ins,
                                 OperatorBase::out_list_t& outs) {
  // Scalar attributes, filled in by the SSAttrs chain below.
  int kernel_attr;
  int group_attr;
  int scale_attr;
  SSAttrs(attr)
      .get<int>("kernel_size", kernel_attr)
      .get<int>("group_size", group_attr)
      .get<int>("scale_factor", scale_attr)
      .done();

  // ins[0] -> features, ins[1] -> masks.
  const auto& feat_in = buildATensor(ctx, ins[0]);
  const auto& mask_in = buildATensor(ctx, ins[1]);

  // outs[0..3] -> rfeatures, routput, rmasks, output (in that order).
  auto r_feat = buildATensor(ctx, outs[0]);
  auto r_out = buildATensor(ctx, outs[1]);
  auto r_mask = buildATensor(ctx, outs[2]);
  auto result = buildATensor(ctx, outs[3]);

  carafe_forward_cuda(feat_in, mask_in, r_feat, r_out, r_mask, result,
                      kernel_attr, group_attr, scale_attr);
}

/*
 * void carafe_backward_cuda(Tensor top_grad, Tensor rfeatures, Tensor masks,
 *                           Tensor rtop_grad, Tensor rbottom_grad_hs,
 *                           Tensor rbottom_grad, Tensor rmask_grad,
 *                           Tensor bottom_grad, Tensor mask_grad, int
 * kernel_size, int group_size, int scale_factor);
 */
// Parrots CUDA adapter for `carafe_backward`.
// Reads the three integer attributes, wraps three inputs and six outputs as
// ATen tensors and calls carafe_backward_cuda().
void carafe_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                  const OperatorBase::in_list_t& ins,
                                  OperatorBase::out_list_t& outs) {
  // Scalar attributes, filled in by the SSAttrs chain below.
  int kernel_attr;
  int group_attr;
  int scale_attr;
  SSAttrs(attr)
      .get<int>("kernel_size", kernel_attr)
      .get<int>("group_size", group_attr)
      .get<int>("scale_factor", scale_attr)
      .done();

  // ins[0] -> top_grad, ins[1] -> rfeatures, ins[2] -> masks.
  const auto& grad_top = buildATensor(ctx, ins[0]);
  const auto& r_feat = buildATensor(ctx, ins[1]);
  const auto& mask_in = buildATensor(ctx, ins[2]);

  // outs[0..5] -> rtop_grad, rbottom_grad_hs, rbottom_grad, rmask_grad,
  // bottom_grad, mask_grad (in that order).
  auto r_grad_top = buildATensor(ctx, outs[0]);
  auto r_grad_bottom_hs = buildATensor(ctx, outs[1]);
  auto r_grad_bottom = buildATensor(ctx, outs[2]);
  auto r_grad_mask = buildATensor(ctx, outs[3]);
  auto grad_bottom = buildATensor(ctx, outs[4]);
  auto grad_mask = buildATensor(ctx, outs[5]);

  carafe_backward_cuda(grad_top, r_feat, mask_in, r_grad_top, r_grad_bottom_hs,
                       r_grad_bottom, r_grad_mask, grad_bottom, grad_mask,
                       kernel_attr, group_attr, scale_attr);
}

// Registers the parrots op `carafe_forward`: three attributes, two inputs and
// four outputs, handled by carafe_forward_cuda_parrots.
PARROTS_EXTENSION_REGISTER(carafe_forward)
    .attr("kernel_size")
    .attr("group_size")
    .attr("scale_factor")
    .input(2)
    .output(4)
    .apply(carafe_forward_cuda_parrots)
    .done();

// Registers the parrots op `carafe_backward`: three attributes, three inputs
// and six outputs, handled by carafe_backward_cuda_parrots.
PARROTS_EXTENSION_REGISTER(carafe_backward)
    .attr("kernel_size")
    .attr("group_size")
    .attr("scale_factor")
    .input(3)
    .output(6)
    .apply(carafe_backward_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/carafe_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef CARAFE_PYTORCH_H
#define CARAFE_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// CUDA forward entry point called by carafe_forward_cuda_parrots.
// NOTE(review): the accompanying carafe.cpp defines `carafe_forward`, not
// `carafe_forward_cuda` — confirm where this symbol is defined.
void carafe_forward_cuda(Tensor features, Tensor masks, Tensor rfeatures,
                         Tensor routput, Tensor rmasks, Tensor output,
                         int kernel_size, int group_size, int scale_factor);

// CUDA backward entry point called by carafe_backward_cuda_parrots.
// NOTE(review): the accompanying carafe.cpp defines `carafe_backward`, not
// `carafe_backward_cuda` — confirm where this symbol is defined.
void carafe_backward_cuda(Tensor top_grad, Tensor rfeatures, Tensor masks,
                          Tensor rtop_grad, Tensor rbottom_grad_hs,
                          Tensor rbottom_grad, Tensor rmask_grad,
                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,
                          int group_size, int scale_factor);
#endif  // CARAFE_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/chamfer_distance.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cpp

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatcher for the chamfer distance forward pass: passes all arguments
// unchanged to DISPATCH_DEVICE_IMPL under the key
// `chamfer_distance_forward_impl`.
// Presumably the macro selects the implementation registered for the tensors'
// device — see pytorch_device_registry.hpp to confirm.
void chamfer_distance_forward_impl(const Tensor xyz1, const Tensor xyz2,
                                   const Tensor dist1, const Tensor dist2,
                                   const Tensor idx1, const Tensor idx2) {
  DISPATCH_DEVICE_IMPL(chamfer_distance_forward_impl, xyz1, xyz2, dist1, dist2,
                       idx1, idx2);
}

// Dispatcher for the chamfer distance backward pass: passes all arguments
// unchanged to DISPATCH_DEVICE_IMPL under the key
// `chamfer_distance_backward_impl`.
// Presumably the macro selects the implementation registered for the tensors'
// device — see pytorch_device_registry.hpp to confirm.
void chamfer_distance_backward_impl(const Tensor xyz1, const Tensor xyz2,
                                    Tensor idx1, Tensor idx2, Tensor graddist1,
                                    Tensor graddist2, Tensor gradxyz1,
                                    Tensor gradxyz2) {
  DISPATCH_DEVICE_IMPL(chamfer_distance_backward_impl, xyz1, xyz2, idx1, idx2,
                       graddist1, graddist2, gradxyz1, gradxyz2);
}

// Public entry point of the chamfer distance forward pass; forwards every
// argument, in the same order, to chamfer_distance_forward_impl.
void chamfer_distance_forward(const Tensor xyz1, const Tensor xyz2,
                              const Tensor dist1, const Tensor dist2,
                              const Tensor idx1, const Tensor idx2) {
  chamfer_distance_forward_impl(xyz1, xyz2, dist1, dist2, idx1, idx2);
}

// Public entry point of the chamfer distance backward pass; forwards every
// argument, in the same order, to chamfer_distance_backward_impl.
void chamfer_distance_backward(const Tensor xyz1, const Tensor xyz2,
                               Tensor idx1, Tensor idx2, Tensor graddist1,
                               Tensor graddist2, Tensor gradxyz1,
                               Tensor gradxyz2) {
  chamfer_distance_backward_impl(xyz1, xyz2, idx1, idx2, graddist1, graddist2,
                                 gradxyz1, gradxyz2);
}


================================================
FILE: mmcv/ops/csrc/parrots/chamfer_distance_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "chamfer_distance_pytorch.h"
using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA adapter for `chamfer_distance_forward`.
// The op has no attributes: two inputs and four outputs are wrapped as ATen
// tensors and passed straight to chamfer_distance_forward().
void chamfer_distance_forward_cuda_parrots(CudaContext& ctx,
                                           const SSElement& attr,
                                           const OperatorBase::in_list_t& ins,
                                           OperatorBase::out_list_t& outs) {
  // ins[0] -> xyz1, ins[1] -> xyz2.
  auto first_xyz = buildATensor(ctx, ins[0]);
  auto second_xyz = buildATensor(ctx, ins[1]);

  // outs[0..3] -> dist1, dist2, idx1, idx2 (in that order).
  auto first_dist = buildATensor(ctx, outs[0]);
  auto second_dist = buildATensor(ctx, outs[1]);
  auto first_idx = buildATensor(ctx, outs[2]);
  auto second_idx = buildATensor(ctx, outs[3]);

  chamfer_distance_forward(first_xyz, second_xyz, first_dist, second_dist,
                           first_idx, second_idx);
}

// Parrots CUDA adapter for `chamfer_distance_backward`.
// The op has no attributes: six inputs and two outputs are wrapped as ATen
// tensors and passed straight to chamfer_distance_backward().
void chamfer_distance_backward_cuda_parrots(CudaContext& ctx,
                                            const SSElement& attr,
                                            const OperatorBase::in_list_t& ins,
                                            OperatorBase::out_list_t& outs) {
  // ins[0..5] -> xyz1, xyz2, idx1, idx2, graddist1, graddist2 (in that order).
  auto first_xyz = buildATensor(ctx, ins[0]);
  auto second_xyz = buildATensor(ctx, ins[1]);
  auto first_idx = buildATensor(ctx, ins[2]);
  auto second_idx = buildATensor(ctx, ins[3]);
  auto first_grad_dist = buildATensor(ctx, ins[4]);
  auto second_grad_dist = buildATensor(ctx, ins[5]);

  // outs[0] -> gradxyz1, outs[1] -> gradxyz2.
  auto first_grad_xyz = buildATensor(ctx, outs[0]);
  auto second_grad_xyz = buildATensor(ctx, outs[1]);

  chamfer_distance_backward(first_xyz, second_xyz, first_idx, second_idx,
                            first_grad_dist, second_grad_dist, first_grad_xyz,
                            second_grad_xyz);
}

// Registers the parrots op `chamfer_distance_forward`: no attributes, two
// inputs and four outputs, handled by chamfer_distance_forward_cuda_parrots.
PARROTS_EXTENSION_REGISTER(chamfer_distance_forward)
    .input(2)
    .output(4)
    .apply(chamfer_distance_forward_cuda_parrots)
    .done();

// Registers the parrots op `chamfer_distance_backward`: no attributes, six
// inputs and two outputs, handled by chamfer_distance_backward_cuda_parrots.
PARROTS_EXTENSION_REGISTER(chamfer_distance_backward)
    .input(6)
    .output(2)
    .apply(chamfer_distance_backward_cuda_parrots)
    .done();

#endif


================================================
FILE: mmcv/ops/csrc/parrots/chamfer_distance_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ACTIVE_CHAMFER_DISTANCE_PYTORCH_H
#define ACTIVE_CHAMFER_DISTANCE_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Chamfer distance forward entry point called by the parrots adapter.
// Takes the two xyz tensors followed by the dist1/dist2 and idx1/idx2 tensors
// (which the adapter binds to the op's four outputs).
// The last parameter is named idx2 to match the definition and the
// backward declaration (it was previously misspelled `idx`).
void chamfer_distance_forward(const Tensor xyz1, const Tensor xyz2,
                              const Tensor dist1, const Tensor dist2,
                              const Tensor idx1, const Tensor idx2);

// Chamfer distance backward entry point called by the parrots adapter.
// The adapter binds xyz1, xyz2, idx1, idx2, graddist1 and graddist2 to the
// op's six inputs and gradxyz1, gradxyz2 to its two outputs.
void chamfer_distance_backward(const Tensor xyz1, const Tensor xyz2,
                               Tensor idx1, Tensor idx2, Tensor graddist1,
                               Tensor graddist2, Tensor gradxyz1,
                               Tensor gradxyz2);

#endif  // ACTIVE_CHAMFER_DISTANCE_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/contour_expand.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
// It is modified from https://github.com/whai362/PSENet
#include <iostream>
#include <queue>

#include "pytorch_cpp_helper.hpp"

using namespace std;

// Plain pair of integer coordinates; kernel_dilate stores these in its queues.
// A default-constructed point is (0, 0).
class Point2d {
 public:
  int x = 0;
  int y = 0;

  Point2d() = default;
  Point2d(int _x, int _y) : x(_x), y(_y) {}
};

void kernel_dilate(const uint8_t *data, IntArrayRef data_shape,
                   const int *label_map, int &label_num, int &min_area,
                   vector<vector<int>> &text_line) {
  std::vector<int> area(label_num + 1);
  int kernel_num = data_shape[0];
  int height = data_shape[1];
  int width = data_shape[2];

  for (int x = 0; x < height; ++x) {
    for (int y = 0; y < width; ++y) {
      int label = label_map[x * width + y];
      if (label == 0) continue;
      area[label] += 1;
    }
  }

  queue<Point2d> queue, next_queue;
  for (int x = 0; x < height; ++x) {
    vector<int> row(width);
    for (int y = 0; y < width; ++y) {
      int label = label_map[x * width + y];
      if (label == 0) continue;
      if (area[label] < min_area) continue;

      Point2d point(x, y);
      queue.push(point);
      row[y] = label;
    }
    text_line.emplace_back(row);
  }

  int dx[] = {-1, 1, 0, 0};
  int dy[] = {0, 0, -1, 1};
  vector<int> kernel_step(kernel_num);
  std::for_each(kernel_step.begin(), kernel_step.end(),
                [=](int &k) { return k * height * width; });

  for (int kernel_id = kernel_num - 2; kernel_id >= 0; --kernel_id) {
    while (!queue.empty()) {
      Point2d point = queue.front();
      queue.pop();
      int x = point.x;
      int y = point.y;
      int label = text_line[x][y];

      bool is_edge = true;
      for (int d = 0; d < 4; ++d) {
        int tmp_x = x + dx[d];
        int tmp_y = y + dy[d];

        if (tmp_x < 0 || tmp_x >= height) continue;
        if (tmp_y < 0 || tmp_y >= width) continue;
        int kernel_value = data[kernel_step[kernel_id] + tmp_x * width + tmp_y];
        if (kernel_value == 0) continue;
        if (text_line[tmp_x][tmp_y] > 0) continue;

        Point2d point(tmp_x, tmp_y);
        queue.push(point);
        text_line[tmp_x][tmp_y] = label;
        is_edge = false;
      }

      if (is_edge) {
        next_queue.push(point);
      }
    }
    swap(queue, next_queue);
  }
}

// Runs kernel_dilate on a stack of kernel masks and a seed label map and
// returns the expanded label image, one inner vector per image row.
//
// kernel_mask            3-D uint8 CPU tensor (read through data_ptr<uint8_t>).
// internal_kernel_label  2-D int32 CPU tensor whose two sizes equal
//                        kernel_mask.size(1) and kernel_mask.size(2).
// min_kernel_area        forwarded to kernel_dilate as `min_area`.
// kernel_num             forwarded to kernel_dilate as `label_num`.
//
// NOTE(review): the shape checks use assert(), so they disappear in builds
// that define NDEBUG.
std::vector<std::vector<int>> contour_expand(Tensor kernel_mask,
                                             Tensor internal_kernel_label,
                                             int min_kernel_area,
                                             int kernel_num) {
  // Both tensors are read through raw pointers below, so make them dense.
  kernel_mask = kernel_mask.contiguous();
  internal_kernel_label = internal_kernel_label.contiguous();

  assert(kernel_mask.dim() == 3);
  assert(internal_kernel_label.dim() == 2);
  assert(kernel_mask.size(1) == internal_kernel_label.size(0));
  assert(kernel_mask.size(2) == internal_kernel_label.size(1));
  CHECK_CPU_INPUT(kernel_mask);
  CHECK_CPU_INPUT(internal_kernel_label);

  auto mask_ptr = kernel_mask.data_ptr<uint8_t>();
  IntArrayRef mask_shape = kernel_mask.sizes();
  auto label_ptr = internal_kernel_label.data_ptr<int32_t>();

  vector<vector<int>> expanded;
  kernel_dilate(mask_ptr, mask_shape, label_ptr, kernel_num, min_kernel_area,
                expanded);
  return expanded;
}


================================================
FILE: mmcv/ops/csrc/parrots/contour_expand_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "contour_expand_pytorch.h"

using namespace parrots;
using namespace std;

// Parrots adapter for `contour_expand`.
//
// Reads the `min_kernel_area` and `kernel_num` attributes, wraps ins[0] and
// ins[1] as ATen tensors, runs `contour_expand`, and packs the returned
// vector-of-rows into an int32 tensor of shape {rows, longest row} that is
// published through outs[0]. Rows shorter than the longest one keep the
// trailing zeros of the zero-initialised buffer.
template <typename T>
void contour_expand_parrots(T& ctx, const SSElement& attr,
                            const OperatorBase::in_list_t& ins,
                            OperatorBase::out_list_t& outs) {
  int min_kernel_area, kernel_num;
  SSAttrs(attr)
      .get<int>("min_kernel_area", min_kernel_area)
      .get<int>("kernel_num", kernel_num)
      .done();
  at::Tensor kernel_mask;
  at::Tensor internal_kernel_label;
  kernel_mask = buildATensor(ctx, ins[0]);
  internal_kernel_label = buildATensor(ctx, ins[1]);
  auto out = contour_expand(kernel_mask, internal_kernel_label, min_kernel_area,
                            kernel_num);

  // Work in int64_t throughout: tensor sizes are int64_t, and converting the
  // size_t row lengths explicitly avoids both the signed/unsigned comparison
  // and the narrowing conversion inside a braced size list.
  const int64_t n = static_cast<int64_t>(out.size());
  int64_t m = 0;
  for (const auto& row : out) {
    const int64_t len = static_cast<int64_t>(row.size());
    if (m < len) m = len;
  }

  auto options = torch::TensorOptions().dtype(at::kInt);
  auto tensor = torch::zeros({n, m}, options);
  for (int64_t i = 0; i < n; ++i) {
    const int64_t len = static_cast<int64_t>(out[i].size());
    if (len == 0) continue;  // nothing to copy; the row stays all zeros
    // Copy only the first `len` columns so a row shorter than `m` is
    // zero-padded instead of failing a shape/broadcast check.
    tensor.select(0, i).narrow(0, 0, len).copy_(
        torch::from_blob(out[i].data(), {len}, options));
  }
  updateDArray(ctx, tensor, outs[0]);
}

// Registers the `contour_expand` parrots op: two attributes
// (`min_kernel_area`, `kernel_num`), two inputs and one output, handled by
// the HostContext instantiation of `contour_expand_parrots`.
PARROTS_EXTENSION_REGISTER(contour_expand)
    .attr("min_kernel_area")
    .attr("kernel_num")
    .input(2)
    .output(1)
    .apply(contour_expand_parrots<HostContext>)
    .done();


================================================
FILE: mmcv/ops/csrc/parrots/contour_expand_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef CONTOUR_EXPAND_PYTORCH_H
#define CONTOUR_EXPAND_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Contour expansion entry point shared with the parrots adapter.
// Takes a kernel mask tensor, an internal kernel label tensor and two integer
// settings, and returns a vector of integer rows.
// NOTE(review): expected tensor ranks/dtypes are enforced in the definition,
// not here - confirm against the implementation before calling.
std::vector<std::vector<int>> contour_expand(Tensor kernel_mask,
                                             Tensor internal_kernel_label,
                                             int min_kernel_area,
                                             int kernel_num);

#endif  // CONTOUR_EXPAND_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/convex_iou.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/SDL-GuoZonghao/BeyondBoundingBox/tree/main/mmdet/ops/iou/src
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatch stub for convex IoU.
// Hands all three tensors to DISPATCH_DEVICE_IMPL under the key
// `convex_iou_impl`; presumably this selects the backend registered for the
// tensors' device (see pytorch_device_registry.hpp - TODO confirm).
void convex_iou_impl(const Tensor pointsets, const Tensor polygons,
                     Tensor ious) {
  DISPATCH_DEVICE_IMPL(convex_iou_impl, pointsets, polygons, ious);
}

// Public convex IoU entry point; forwards its arguments unchanged to
// `convex_iou_impl`. `ious` is the non-const tensor argument and is
// presumably where the result is written.
void convex_iou(const Tensor pointsets, const Tensor polygons, Tensor ious) {
  convex_iou_impl(pointsets, polygons, ious);
}

// Device-dispatch stub for convex GIoU.
// Hands all three tensors to DISPATCH_DEVICE_IMPL under the key
// `convex_giou_impl`; presumably this selects the backend registered for the
// tensors' device (see pytorch_device_registry.hpp - TODO confirm).
void convex_giou_impl(const Tensor pointsets, const Tensor polygons,
                      Tensor output) {
  DISPATCH_DEVICE_IMPL(convex_giou_impl, pointsets, polygons, output);
}

// Public convex GIoU entry point; forwards its arguments unchanged to
// `convex_giou_impl`. `output` is the non-const tensor argument and is
// presumably where the result is written.
void convex_giou(const Tensor pointsets, const Tensor polygons, Tensor output) {
  convex_giou_impl(pointsets, polygons, output);
}


================================================
FILE: mmcv/ops/csrc/parrots/convex_iou_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "convex_iou_pytorch.h"
using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA adapter for `convex_iou`.
// Wraps ins[0], ins[1] and outs[0] as ATen tensors (in that order) and passes
// them to `convex_iou`. No attributes are read from `attr`.
void convex_iou_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                     const OperatorBase::in_list_t& ins,
                                     OperatorBase::out_list_t& outs) {
  // Inputs.
  auto point_sets = buildATensor(ctx, ins[0]);
  auto polys = buildATensor(ctx, ins[1]);

  // Output buffer supplied by parrots.
  auto iou_out = buildATensor(ctx, outs[0]);

  convex_iou(point_sets, polys, iou_out);
}

// Parrots CUDA adapter for `convex_giou`.
// Wraps ins[0], ins[1] and outs[0] as ATen tensors (in that order) and passes
// them to `convex_giou`. No attributes are read from `attr`.
void convex_giou_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                      const OperatorBase::in_list_t& ins,
                                      OperatorBase::out_list_t& outs) {
  // Inputs.
  auto point_sets = buildATensor(ctx, ins[0]);
  auto polys = buildATensor(ctx, ins[1]);

  // Output buffer supplied by parrots.
  auto giou_out = buildATensor(ctx, outs[0]);

  convex_giou(point_sets, polys, giou_out);
}

// Registers the `convex_iou` parrots op: two inputs, one output, CUDA handler
// only (this registration sits inside the MMCV_WITH_CUDA guard).
PARROTS_EXTENSION_REGISTER(convex_iou)
    .input(2)
    .output(1)
    .apply(convex_iou_forward_cuda_parrots)
    .done();

// Registers the `convex_giou` parrots op: two inputs, one output, CUDA handler
// only (this registration sits inside the MMCV_WITH_CUDA guard).
PARROTS_EXTENSION_REGISTER(convex_giou)
    .input(2)
    .output(1)
    .apply(convex_giou_forward_cuda_parrots)
    .done();

#endif


================================================
FILE: mmcv/ops/csrc/parrots/convex_iou_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef CONVEX_IOU_PYTORCH_H
#define CONVEX_IOU_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Convex IoU between `pointsets` and `polygons`; `ious` is the non-const
// tensor argument (presumably the destination for the result).
void convex_iou(const Tensor pointsets, const Tensor polygons, Tensor ious);

// Convex GIoU between `pointsets` and `polygons`; `output` is the non-const
// tensor argument (presumably the destination for the result).
void convex_giou(const Tensor pointsets, const Tensor polygons, Tensor output);

#endif  // CONVEX_IOU_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/correlation.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved.
#include <iostream>

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatch stub for the correlation forward pass.
// Hands every argument to DISPATCH_DEVICE_IMPL under the key
// `correlation_forward_impl`; presumably this selects the backend registered
// for the tensors' device (see pytorch_device_registry.hpp - TODO confirm).
void correlation_forward_impl(Tensor input1, Tensor input2, Tensor output,
                              int kH, int kW, int patchH, int patchW, int padH,
                              int padW, int dilationH, int dilationW,
                              int dilation_patchH, int dilation_patchW, int dH,
                              int dW) {
  DISPATCH_DEVICE_IMPL(correlation_forward_impl, input1, input2, output, kH, kW,
                       patchH, patchW, padH, padW, dilationH, dilationW,
                       dilation_patchH, dilation_patchW, dH, dW);
}

// Device-dispatch stub for the correlation backward pass.
// Hands every argument to DISPATCH_DEVICE_IMPL under the key
// `correlation_backward_impl`; presumably this selects the backend registered
// for the tensors' device (see pytorch_device_registry.hpp - TODO confirm).
void correlation_backward_impl(Tensor grad_output, Tensor input1, Tensor input2,
                               Tensor grad_input1, Tensor grad_input2, int kH,
                               int kW, int patchH, int patchW, int padH,
                               int padW, int dilationH, int dilationW,
                               int dilation_patchH, int dilation_patchW, int dH,
                               int dW) {
  DISPATCH_DEVICE_IMPL(correlation_backward_impl, grad_output, input1, input2,
                       grad_input1, grad_input2, kH, kW, patchH, patchW, padH,
                       padW, dilationH, dilationW, dilation_patchH,
                       dilation_patchW, dH, dW);
}

// Public correlation forward entry point; forwards all arguments, in the same
// order, to `correlation_forward_impl`.
void correlation_forward(Tensor input1, Tensor input2, Tensor output, int kH,
                         int kW, int patchH, int patchW, int padH, int padW,
                         int dilationH, int dilationW, int dilation_patchH,
                         int dilation_patchW, int dH, int dW) {
  correlation_forward_impl(input1, input2, output, kH, kW, patchH, patchW, padH,
                           padW, dilationH, dilationW, dilation_patchH,
                           dilation_patchW, dH, dW);
}

// Public correlation backward entry point; forwards all arguments, in the
// same order, to `correlation_backward_impl`.
void correlation_backward(Tensor grad_output, Tensor input1, Tensor input2,
                          Tensor grad_input1, Tensor grad_input2, int kH,
                          int kW, int patchH, int patchW, int padH, int padW,
                          int dilationH, int dilationW, int dilation_patchH,
                          int dilation_patchW, int dH, int dW) {
  correlation_backward_impl(grad_output, input1, input2, grad_input1,
                            grad_input2, kH, kW, patchH, patchW, padH, padW,
                            dilationH, dilationW, dilation_patchH,
                            dilation_patchW, dH, dW);
}


================================================
FILE: mmcv/ops/csrc/parrots/correlation_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "correlation_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA adapter for the correlation forward pass.
// Reads the twelve integer attributes, wraps ins[0], ins[1] and outs[0] as
// ATen tensors, and hands everything to `correlation_forward`.
void correlation_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                      const OperatorBase::in_list_t& ins,
                                      OperatorBase::out_list_t& outs) {
  // One local per registered attribute, grouped by H/W pair.
  int k_h, k_w;
  int patch_h, patch_w;
  int pad_h, pad_w;
  int dil_h, dil_w;
  int dil_patch_h, dil_patch_w;
  int d_h, d_w;
  SSAttrs(attr)
      .get<int>("kH", k_h)
      .get<int>("kW", k_w)
      .get<int>("patchH", patch_h)
      .get<int>("patchW", patch_w)
      .get<int>("padH", pad_h)
      .get<int>("padW", pad_w)
      .get<int>("dilationH", dil_h)
      .get<int>("dilationW", dil_w)
      .get<int>("dilation_patchH", dil_patch_h)
      .get<int>("dilation_patchW", dil_patch_w)
      .get<int>("dH", d_h)
      .get<int>("dW", d_w)
      .done();

  auto first = buildATensor(ctx, ins[0]);
  auto second = buildATensor(ctx, ins[1]);
  auto result = buildATensor(ctx, outs[0]);

  correlation_forward(first, second, result, k_h, k_w, patch_h, patch_w, pad_h,
                      pad_w, dil_h, dil_w, dil_patch_h, dil_patch_w, d_h, d_w);
}

// Parrots CUDA adapter for the correlation backward pass.
// Reads the twelve integer attributes, wraps ins[0..2] and outs[0..1] as ATen
// tensors, and hands everything to `correlation_backward`.
void correlation_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                       const OperatorBase::in_list_t& ins,
                                       OperatorBase::out_list_t& outs) {
  // One local per registered attribute, grouped by H/W pair.
  int k_h, k_w;
  int patch_h, patch_w;
  int pad_h, pad_w;
  int dil_h, dil_w;
  int dil_patch_h, dil_patch_w;
  int d_h, d_w;
  SSAttrs(attr)
      .get<int>("kH", k_h)
      .get<int>("kW", k_w)
      .get<int>("patchH", patch_h)
      .get<int>("patchW", patch_w)
      .get<int>("padH", pad_h)
      .get<int>("padW", pad_w)
      .get<int>("dilationH", dil_h)
      .get<int>("dilationW", dil_w)
      .get<int>("dilation_patchH", dil_patch_h)
      .get<int>("dilation_patchW", dil_patch_w)
      .get<int>("dH", d_h)
      .get<int>("dW", d_w)
      .done();

  // Inputs: upstream gradient followed by the two forward inputs.
  auto grad_out = buildATensor(ctx, ins[0]);
  auto first = buildATensor(ctx, ins[1]);
  auto second = buildATensor(ctx, ins[2]);

  // Outputs: one gradient buffer per forward input.
  auto grad_first = buildATensor(ctx, outs[0]);
  auto grad_second = buildATensor(ctx, outs[1]);

  correlation_backward(grad_out, first, second, grad_first, grad_second, k_h,
                       k_w, patch_h, patch_w, pad_h, pad_w, dil_h, dil_w,
                       dil_patch_h, dil_patch_w, d_h, d_w);
}
#endif

// Parrots host (CPU) adapter for the correlation forward pass.
// Reads the twelve integer attributes, wraps ins[0], ins[1] and outs[0] as
// ATen tensors, and hands everything to `correlation_forward`.
void correlation_forward_cpu_parrots(HostContext& ctx, const SSElement& attr,
                                     const OperatorBase::in_list_t& ins,
                                     OperatorBase::out_list_t& outs) {
  // One local per registered attribute, grouped by H/W pair.
  int k_h, k_w;
  int patch_h, patch_w;
  int pad_h, pad_w;
  int dil_h, dil_w;
  int dil_patch_h, dil_patch_w;
  int d_h, d_w;
  SSAttrs(attr)
      .get<int>("kH", k_h)
      .get<int>("kW", k_w)
      .get<int>("patchH", patch_h)
      .get<int>("patchW", patch_w)
      .get<int>("padH", pad_h)
      .get<int>("padW", pad_w)
      .get<int>("dilationH", dil_h)
      .get<int>("dilationW", dil_w)
      .get<int>("dilation_patchH", dil_patch_h)
      .get<int>("dilation_patchW", dil_patch_w)
      .get<int>("dH", d_h)
      .get<int>("dW", d_w)
      .done();

  auto first = buildATensor(ctx, ins[0]);
  auto second = buildATensor(ctx, ins[1]);
  auto result = buildATensor(ctx, outs[0]);

  correlation_forward(first, second, result, k_h, k_w, patch_h, patch_w, pad_h,
                      pad_w, dil_h, dil_w, dil_patch_h, dil_patch_w, d_h, d_w);
}

// Parrots host (CPU) adapter for the correlation backward pass.
// Reads the twelve integer attributes, wraps ins[0..2] and outs[0..1] as ATen
// tensors, and hands everything to `correlation_backward`.
void correlation_backward_cpu_parrots(HostContext& ctx, const SSElement& attr,
                                      const OperatorBase::in_list_t& ins,
                                      OperatorBase::out_list_t& outs) {
  // One local per registered attribute, grouped by H/W pair.
  int k_h, k_w;
  int patch_h, patch_w;
  int pad_h, pad_w;
  int dil_h, dil_w;
  int dil_patch_h, dil_patch_w;
  int d_h, d_w;
  SSAttrs(attr)
      .get<int>("kH", k_h)
      .get<int>("kW", k_w)
      .get<int>("patchH", patch_h)
      .get<int>("patchW", patch_w)
      .get<int>("padH", pad_h)
      .get<int>("padW", pad_w)
      .get<int>("dilationH", dil_h)
      .get<int>("dilationW", dil_w)
      .get<int>("dilation_patchH", dil_patch_h)
      .get<int>("dilation_patchW", dil_patch_w)
      .get<int>("dH", d_h)
      .get<int>("dW", d_w)
      .done();

  // Inputs: upstream gradient followed by the two forward inputs.
  auto grad_out = buildATensor(ctx, ins[0]);
  auto first = buildATensor(ctx, ins[1]);
  auto second = buildATensor(ctx, ins[2]);

  // Outputs: one gradient buffer per forward input.
  auto grad_first = buildATensor(ctx, outs[0]);
  auto grad_second = buildATensor(ctx, outs[1]);

  correlation_backward(grad_out, first, second, grad_first, grad_second, k_h,
                       k_w, patch_h, patch_w, pad_h, pad_w, dil_h, dil_w,
                       dil_patch_h, dil_patch_w, d_h, d_w);
}

// Registers the `correlation_forward` parrots op: twelve attributes, two
// inputs, one output. The CPU handler is always attached; the CUDA handler is
// attached only when MMCV_WITH_CUDA is defined.
PARROTS_EXTENSION_REGISTER(correlation_forward)
    .attr("kH")
    .attr("kW")
    .attr("patchH")
    .attr("patchW")
    .attr("padH")
    .attr("padW")
    .attr("dilationH")
    .attr("dilationW")
    .attr("dilation_patchH")
    .attr("dilation_patchW")
    .attr("dH")
    .attr("dW")
    .input(2)
    .output(1)
    .apply(correlation_forward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(correlation_forward_cuda_parrots)
#endif
    .done();

// Registers the `correlation_backward` parrots op: twelve attributes, three
// inputs, two outputs. The CPU handler is always attached; the CUDA handler
// is attached only when MMCV_WITH_CUDA is defined.
PARROTS_EXTENSION_REGISTER(correlation_backward)
    .attr("kH")
    .attr("kW")
    .attr("patchH")
    .attr("patchW")
    .attr("padH")
    .attr("padW")
    .attr("dilationH")
    .attr("dilationW")
    .attr("dilation_patchH")
    .attr("dilation_patchW")
    .attr("dH")
    .attr("dW")
    .input(3)
    .output(2)
    .apply(correlation_backward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(correlation_backward_cuda_parrots)
#endif
    .done();


================================================
FILE: mmcv/ops/csrc/parrots/correlation_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef CORRELATION_PYTORCH_H
#define CORRELATION_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Correlation forward pass over `input1` and `input2`; `output` is the third
// tensor argument, followed by twelve integer settings.
// NOTE(review): the H/W suffixed names look like per-axis kernel, patch,
// padding, dilation and stride values - confirm against the kernels.
void correlation_forward(Tensor input1, Tensor input2, Tensor output, int kH,
                         int kW, int patchH, int patchW, int padH, int padW,
                         int dilationH, int dilationW, int dilation_patchH,
                         int dilation_patchW, int dH, int dW);

// Correlation backward pass: takes the upstream gradient, the two forward
// inputs, one gradient tensor per input, and the same twelve integer settings
// as the forward pass.
void correlation_backward(Tensor grad_output, Tensor input1, Tensor input2,
                          Tensor grad_input1, Tensor grad_input2, int kH,
                          int kW, int patchH, int patchW, int padH, int padW,
                          int dilationH, int dilationW, int dilation_patchH,
                          int dilation_patchW, int dH, int dW);

#endif  // CORRELATION_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/cudabind.cpp
================================================
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Forward declarations of the assign-score-with-k CUDA kernel launchers.
// They are only declared here; the wrappers below call them.
void AssignScoreWithKForwardCUDAKernelLauncher(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor& points, const Tensor& centers, const Tensor& scores,
    const Tensor& knn_idx, Tensor& output);

void AssignScoreWithKBackwardCUDAKernelLauncher(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor& grad_out, const Tensor& points, const Tensor& centers,
    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
    Tensor& grad_centers, Tensor& grad_scores);

// CUDA backend for assign-score-with-k forward: forwards every argument, in
// order, to AssignScoreWithKForwardCUDAKernelLauncher.
void assign_score_withk_forward_cuda(int B, int N0, int N1, int M, int K, int O,
                                     int aggregate, const Tensor& points,
                                     const Tensor& centers,
                                     const Tensor& scores,
                                     const Tensor& knn_idx, Tensor& output) {
  AssignScoreWithKForwardCUDAKernelLauncher(
      B, N0, N1, M, K, O, aggregate, points, centers, scores, knn_idx, output);
}

// CUDA backend for assign-score-with-k backward: forwards every argument, in
// order, to AssignScoreWithKBackwardCUDAKernelLauncher.
void assign_score_withk_backward_cuda(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor& grad_out, const Tensor& points, const Tensor& centers,
    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
    Tensor& grad_centers, Tensor& grad_scores) {
  AssignScoreWithKBackwardCUDAKernelLauncher(
      B, N0, N1, M, K, O, aggregate, grad_out, points, centers, scores, knn_idx,
      grad_points, grad_centers, grad_scores);
}

// Declarations of the device-dispatch entry points for assign-score-with-k,
// needed so the registrations below can name them.
void assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,
                                     int aggregate, const Tensor& points,
                                     const Tensor& centers,
                                     const Tensor& scores,
                                     const Tensor& knn_idx, Tensor& output);

void assign_score_withk_backward_impl(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor& grad_out, const Tensor& points, const Tensor& centers,
    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
    Tensor& grad_centers, Tensor& grad_scores);

// Attach the CUDA wrappers to their dispatch entry points.
REGISTER_DEVICE_IMPL(assign_score_withk_forward_impl, CUDA,
                     assign_score_withk_forward_cuda);
REGISTER_DEVICE_IMPL(assign_score_withk_backward_impl, CUDA,
                     assign_score_withk_backward_cuda);

// Forward declaration of the ball-query CUDA kernel launcher; only declared
// here and called by ball_query_forward_cuda.
void BallQueryForwardCUDAKernelLauncher(int b, int n, int m, float min_radius,
                                        float max_radius, int nsample,
                                        const Tensor new_xyz, const Tensor xyz,
                                        Tensor idx);

// CUDA backend for ball query: forwards every argument, in order, to
// BallQueryForwardCUDAKernelLauncher.
void ball_query_forward_cuda(int b, int n, int m, float min_radius,
                             float max_radius, int nsample,
                             const Tensor new_xyz, const Tensor xyz,
                             Tensor idx) {
  BallQueryForwardCUDAKernelLauncher(b, n, m, min_radius, max_radius, nsample,
                                     new_xyz, xyz, idx);
}

// Declaration of the ball-query dispatch entry point, followed by the
// registration that attaches the CUDA wrapper to it.
void ball_query_forward_impl(int b, int n, int m, float min_radius,
                             float max_radius, int nsample,
                             const Tensor new_xyz, const Tensor xyz,
                             Tensor idx);
REGISTER_DEVICE_IMPL(ball_query_forward_impl, CUDA, ball_query_forward_cuda);

// Forward declaration of the bbox-overlaps CUDA kernel launcher; only
// declared here and called by bbox_overlaps_cuda.
void BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
                                    Tensor ious, const int mode,
                                    const bool aligned, const int offset);

// CUDA backend for bbox overlaps: forwards every argument, in order, to
// BBoxOverlapsCUDAKernelLauncher.
void bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                        const int mode, const bool aligned, const int offset) {
  BBoxOverlapsCUDAKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset);
}

// Declaration of the bbox-overlaps dispatch entry point, followed by the
// registration that attaches the CUDA wrapper to it.
void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                        const int mode, const bool aligned, const int offset);
REGISTER_DEVICE_IMPL(bbox_overlaps_impl, CUDA, bbox_overlaps_cuda);

// Forward declarations of the border-align CUDA kernel launchers; only
// declared here and called by the wrappers below.
void BorderAlignForwardCUDAKernelLauncher(const Tensor& input,
                                          const Tensor& boxes, Tensor output,
                                          Tensor argmax_idx,
                                          const int pool_size);

void BorderAlignBackwardCUDAKernelLauncher(const Tensor& grad_output,
                                           const Tensor& boxes,
                                           const Tensor& argmax_idx,
                                           Tensor grad_input,
                                           const int pool_size);

// CUDA backend for border-align forward: forwards every argument, in order,
// to BorderAlignForwardCUDAKernelLauncher.
void border_align_forward_cuda(const Tensor& input, const Tensor& boxes,
                               Tensor output, Tensor argmax_idx,
                               const int pool_size) {
  BorderAlignForwardCUDAKernelLauncher(input, boxes, output, argmax_idx,
                                       pool_size);
}

// CUDA backend for border-align backward: forwards every argument, in order,
// to BorderAlignBackwardCUDAKernelLauncher.
void border_align_backward_cuda(const Tensor& grad_output, const Tensor& boxes,
                                const Tensor& argmax_idx, Tensor grad_input,
                                const int pool_size) {
  BorderAlignBackwardCUDAKernelLauncher(grad_output, boxes, argmax_idx,
                                        grad_input, pool_size);
}

// Declarations of the border-align dispatch entry points, needed so the
// registrations below can name them.
void border_align_forward_impl(const Tensor& input, const Tensor& boxes,
                               Tensor output, Tensor argmax_idx,
                               const int pool_size);

void border_align_backward_impl(const Tensor& grad_output, const Tensor& boxes,
                                const Tensor& argmax_idx, Tensor grad_input,
                                const int pool_size);

// Attach the CUDA wrappers to their dispatch entry points.
REGISTER_DEVICE_IMPL(border_align_forward_impl, CUDA,
                     border_align_forward_cuda);
REGISTER_DEVICE_IMPL(border_align_backward_impl, CUDA,
                     border_align_backward_cuda);

// Rotated-box IoU: the CUDA function is only declared here (no wrapper body
// in this part of the file) and is registered directly as the CUDA backend of
// box_iou_rotated_impl.
void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                          const int mode_flag, const bool aligned);

void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                          const int mode_flag, const bool aligned);
REGISTER_DEVICE_IMPL(box_iou_rotated_impl, CUDA, box_iou_rotated_cuda);

// Forward declarations of the CARAFE CUDA kernel launchers; only declared
// here and called by the wrappers below.
void CARAFEForwardCUDAKernelLauncher(const Tensor features, const Tensor masks,
                                     Tensor rfeatures, Tensor routput,
                                     Tensor rmasks, Tensor output,
                                     const int kernel_size,
                                     const int group_size,
                                     const int scale_factor);

void CARAFEBackwardCUDAKernelLauncher(
    const Tensor top_grad, const Tensor rfeatures, const Tensor masks,
    Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad,
    Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad,
    const int kernel_size, const int group_size, const int scale_factor);

// CUDA backend for CARAFE forward: forwards every argument, in order, to
// CARAFEForwardCUDAKernelLauncher.
void carafe_forward_cuda(Tensor features, Tensor masks, Tensor rfeatures,
                         Tensor routput, Tensor rmasks, Tensor output,
                         int kernel_size, int group_size, int scale_factor) {
  CARAFEForwardCUDAKernelLauncher(features, masks, rfeatures, routput, rmasks,
                                  output, kernel_size, group_size,
                                  scale_factor);
}

// CUDA backend for CARAFE backward: forwards every argument, in order, to
// CARAFEBackwardCUDAKernelLauncher.
void carafe_backward_cuda(Tensor top_grad, Tensor rfeatures, Tensor masks,
                          Tensor rtop_grad, Tensor rbottom_grad_hs,
                          Tensor rbottom_grad, Tensor rmask_grad,
                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,
                          int group_size, int scale_factor) {
  CARAFEBackwardCUDAKernelLauncher(top_grad, rfeatures, masks, rtop_grad,
                                   rbottom_grad_hs, rbottom_grad, rmask_grad,
                                   bottom_grad, mask_grad, kernel_size,
                                   group_size, scale_factor);
}

// Declarations of the CARAFE dispatch entry points, needed so the
// registrations below can name them.
void carafe_forward_impl(Tensor features, Tensor masks, Tensor rfeatures,
                         Tensor routput, Tensor rmasks, Tensor output,
                         int kernel_size, int group_size, int scale_factor);

void carafe_backward_impl(Tensor top_grad, Tensor rfeatures, Tensor masks,
                          Tensor rtop_grad, Tensor rbottom_grad_hs,
                          Tensor rbottom_grad, Tensor rmask_grad,
                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,
                          int group_size, int scale_factor);

// Attach the CUDA wrappers to their dispatch entry points.
REGISTER_DEVICE_IMPL(carafe_forward_impl, CUDA, carafe_forward_cuda);
REGISTER_DEVICE_IMPL(carafe_backward_impl, CUDA, carafe_backward_cuda);

// Forward declarations of the naive CARAFE CUDA kernel launchers; only
// declared here and called by the wrappers below.
void CARAFENAIVEForwardCUDAKernelLauncher(const Tensor features,
                                          const Tensor masks, Tensor output,
                                          const int kernel_size,
                                          const int group_size,
                                          const int scale_factor);

void CARAFENAIVEBackwardCUDAKernelLauncher(
    const Tensor top_grad, const Tensor features, const Tensor masks,
    Tensor bottom_grad, Tensor mask_grad, const int kernel_size,
    const int group_size, const int scale_factor);

// CUDA backend for naive CARAFE forward: forwards every argument, in order,
// to CARAFENAIVEForwardCUDAKernelLauncher.
void carafe_naive_forward_cuda(Tensor features, Tensor masks, Tensor output,
                               int kernel_size, int group_size,
                               int scale_factor) {
  CARAFENAIVEForwardCUDAKernelLauncher(features, masks, output, kernel_size,
                                       group_size, scale_factor);
}

// CUDA backend for naive CARAFE backward: forwards every argument, in order,
// to CARAFENAIVEBackwardCUDAKernelLauncher.
void carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor masks,
                                Tensor bottom_grad, Tensor mask_grad,
                                int kernel_size, int group_size,
                                int scale_factor) {
  CARAFENAIVEBackwardCUDAKernelLauncher(top_grad, features, masks, bottom_grad,
                                        mask_grad, kernel_size, group_size,
                                        scale_factor);
}
// Declarations of the naive CARAFE dispatch entry points, needed so the
// registrations below can name them.
void carafe_naive_forward_impl(Tensor features, Tensor masks, Tensor output,
                               int kernel_size, int group_size,
                               int scale_factor);

void carafe_naive_backward_impl(Tensor top_grad, Tensor features, Tensor masks,
                                Tensor bottom_grad, Tensor mask_grad,
                                int kernel_size, int group_size,
                                int scale_factor);

// Attach the CUDA wrappers to their dispatch entry points.
REGISTER_DEVICE_IMPL(carafe_naive_forward_impl, CUDA,
                     carafe_naive_forward_cuda);
REGISTER_DEVICE_IMPL(carafe_naive_backward_impl, CUDA,
                     carafe_naive_backward_cuda);

// Forward declarations of the correlation CUDA kernel launchers; only
// declared here and called by the wrappers below.
void CorrelationForwardCUDAKernelLauncher(Tensor input1, Tensor input2,
                                          Tensor output, int kH, int kW,
                                          int patchH, int patchW, int padH,
                                          int padW, int dilationH,
                                          int dilationW, int dilation_patchH,
                                          int dilation_patchW, int dH, int dW);

void CorrelationBackwardCUDAKernelLauncher(Tensor grad_output, Tensor input1,
                                           Tensor input2, Tensor grad_input1,
                                           Tensor grad_input2, int kH, int kW,
                                           int patchH, int patchW, int padH,
                                           int padW, int dilationH,
                                           int dilationW, int dilation_patchH,
                                           int dilation_patchW, int dH, int dW);

// CUDA backend for correlation forward: forwards every argument, in order,
// to CorrelationForwardCUDAKernelLauncher.
void correlation_forward_cuda(Tensor input1, Tensor input2, Tensor output,
                              int kH, int kW, int patchH, int patchW, int padH,
                              int padW, int dilationH, int dilationW,
                              int dilation_patchH, int dilation_patchW, int dH,
                              int dW) {
  CorrelationForwardCUDAKernelLauncher(
      input1, input2, output, kH, kW, patchH, patchW, padH, padW, dilationH,
      dilationW, dilation_patchH, dilation_patchW, dH, dW);
}

// CUDA backend for correlation backward: forwards every argument, in order,
// to CorrelationBackwardCUDAKernelLauncher.
void correlation_backward_cuda(Tensor grad_output, Tensor input1, Tensor input2,
                               Tensor grad_input1, Tensor grad_input2, int kH,
                               int kW, int patchH, int patchW, int padH,
                               int padW, int dilationH, int dilationW,
                               int dilation_patchH, int dilation_patchW, int dH,
                               int dW) {
  CorrelationBackwardCUDAKernelLauncher(
      grad_output, input1, input2, grad_input1, grad_input2, kH, kW, patchH,
      patchW, padH, padW, dilationH, dilationW, dilation_patchH,
      dilation_patchW, dH, dW);
}

// Declarations of the correlation dispatch entry points, needed so the
// registrations below can name them.
void correlation_forward_impl(Tensor input1, Tensor input2, Tensor output,
                              int kH, int kW, int patchH, int patchW, int padH,
                              int padW, int dilationH, int dilationW,
                              int dilation_patchH, int dilation_patchW, int dH,
                              int dW);

void correlation_backward_impl(Tensor grad_output, Tensor input1, Tensor input2,
                               Tensor grad_input1, Tensor grad_input2, int kH,
                               int kW, int patchH, int patchW, int padH,
                               int padW, int dilationH, int dilationW,
                               int dilation_patchH, int dilation_patchW, int dH,
                               int dW);

// Attach the CUDA wrappers to their dispatch entry points.
REGISTER_DEVICE_IMPL(correlation_forward_impl, CUDA, correlation_forward_cuda);
REGISTER_DEVICE_IMPL(correlation_backward_impl, CUDA,
                     correlation_backward_cuda);

// Declarations of the deformable-convolution CUDA routines. No wrapper bodies
// appear in this part of the file; these functions are registered directly as
// CUDA backends further down.
void deformable_im2col_cuda(Tensor data_im, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor data_col);

void deformable_col2im_cuda(Tensor data_col, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor grad_im);

void deformable_col2im_coord_cuda(
    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
    const int height, const int width, const int ksize_h, const int ksize_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int parallel_imgs,
    const int deformable_group, Tensor grad_offset);

// Declarations of the deformable-convolution dispatch entry points, needed so
// the registrations below can name them. Their signatures mirror the *_cuda
// declarations above.
void deformable_im2col_impl(Tensor data_im, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor data_col);

void deformable_col2im_impl(Tensor data_col, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor grad_im);

void deformable_col2im_coord_impl(
    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
    const int height, const int width, const int ksize_h, const int ksize_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int parallel_imgs,
    const int deformable_group, Tensor grad_offset);

// Attach the CUDA routines to their dispatch entry points.
REGISTER_DEVICE_IMPL(deformable_im2col_impl, CUDA, deformable_im2col_cuda);
REGISTER_DEVICE_IMPL(deformable_col2im_impl, CUDA, deformable_col2im_cuda);
REGISTER_DEVICE_IMPL(deformable_col2im_coord_impl, CUDA,
                     deformable_col2im_coord_cuda);

// Prototypes of the deformable RoI pooling kernel launchers. Their bodies are
// not in this part of the file; the deform_roi_pool_*_cuda wrappers call them.

// Forward launcher: reads input, rois and offset, writes output.
void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
                                            Tensor offset, Tensor output,
                                            int pooled_height, int pooled_width,
                                            float spatial_scale,
                                            int sampling_ratio, float gamma);

// Backward launcher: additionally receives grad_output and two gradient
// tensors (grad_input, grad_offset).
void DeformRoIPoolBackwardCUDAKernelLauncher(
    Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
    Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
    float spatial_scale, int sampling_ratio, float gamma);

// CUDA entry for the deformable RoI pooling forward pass.
// Pure pass-through: every argument is handed, in the same order, to
// DeformRoIPoolForwardCUDAKernelLauncher.
void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
                                  Tensor output, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int sampling_ratio, float gamma) {
  DeformRoIPoolForwardCUDAKernelLauncher(
      input, rois, offset, output,
      pooled_height, pooled_width,
      spatial_scale, sampling_ratio, gamma);
}

// CUDA entry for the deformable RoI pooling backward pass.
// Pure pass-through: every argument is handed, in the same order, to
// DeformRoIPoolBackwardCUDAKernelLauncher.
void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,
                                   Tensor rois, Tensor offset,
                                   Tensor grad_input, Tensor grad_offset,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale, int sampling_ratio,
                                   float gamma) {
  DeformRoIPoolBackwardCUDAKernelLauncher(
      grad_output, input, rois, offset,
      grad_input, grad_offset,
      pooled_height, pooled_width,
      spatial_scale, sampling_ratio, gamma);
}

// Prototypes of the deformable RoI pooling dispatch keys (defined elsewhere),
// with the same parameter lists as the *_cuda wrappers registered below.
void deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,
                                  Tensor output, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int sampling_ratio, float gamma);

void deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,
                                   Tensor rois, Tensor offset,
                                   Tensor grad_input, Tensor grad_offset,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale, int sampling_ratio,
                                   float gamma);

// Associate the CUDA wrappers with their *_impl keys.
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk - confirm
// its exact effect in the macro's header.
REGISTER_DEVICE_IMPL(deform_roi_pool_forward_impl, CUDA,
                     deform_roi_pool_forward_cuda);
REGISTER_DEVICE_IMPL(deform_roi_pool_backward_impl, CUDA,
                     deform_roi_pool_backward_cuda);

// Prototypes of the four focal-loss kernel launchers (sigmoid and softmax
// variants, forward and backward). Their bodies are not in this part of the
// file.

// Sigmoid variant, forward: input/target/weight in, output out.
void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
                                               Tensor weight, Tensor output,
                                               const float gamma,
                                               const float alpha);

// Sigmoid variant, backward: writes grad_input.
void SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
                                                Tensor weight,
                                                Tensor grad_input,
                                                const float gamma,
                                                const float alpha);

// Softmax variant, forward: the first tensor is named `softmax` here.
void SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor softmax, Tensor target,
                                               Tensor weight, Tensor output,
                                               const float gamma,
                                               const float alpha);

// Softmax variant, backward: takes an extra `buff` tensor in addition to
// grad_input.
void SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor softmax, Tensor target,
                                                Tensor weight, Tensor buff,
                                                Tensor grad_input,
                                                const float gamma,
                                                const float alpha);

// CUDA entry for the sigmoid focal loss forward pass.
// Passes all six arguments, unchanged and in order, to
// SigmoidFocalLossForwardCUDAKernelLauncher.
void sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha) {
  SigmoidFocalLossForwardCUDAKernelLauncher(
      input, target, weight, output, gamma, alpha);
}

// CUDA entry for the sigmoid focal loss backward pass.
// Passes all six arguments, unchanged and in order, to
// SigmoidFocalLossBackwardCUDAKernelLauncher.
void sigmoid_focal_loss_backward_cuda(Tensor input, Tensor target,
                                      Tensor weight, Tensor grad_input,
                                      float gamma, float alpha) {
  SigmoidFocalLossBackwardCUDAKernelLauncher(
      input, target, weight, grad_input, gamma, alpha);
}

// CUDA entry for the softmax focal loss forward pass.
// Passes all six arguments, unchanged and in order, to
// SoftmaxFocalLossForwardCUDAKernelLauncher. Note that `input` lands in the
// launcher parameter named `softmax`.
// NOTE(review): presumably callers pass already-softmaxed values - confirm.
void softmax_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha) {
  SoftmaxFocalLossForwardCUDAKernelLauncher(
      input, target, weight, output, gamma, alpha);
}

// CUDA entry for the softmax focal loss backward pass.
// Passes all seven arguments, unchanged and in order, to
// SoftmaxFocalLossBackwardCUDAKernelLauncher. `input` lands in the launcher
// parameter named `softmax`.
void softmax_focal_loss_backward_cuda(Tensor input, Tensor target,
                                      Tensor weight, Tensor buff,
                                      Tensor grad_input, float gamma,
                                      float alpha) {
  SoftmaxFocalLossBackwardCUDAKernelLauncher(
      input, target, weight, buff, grad_input, gamma, alpha);
}

// Prototypes of the focal-loss dispatch keys (defined elsewhere). Each has the
// same parameter list as the *_cuda wrapper registered for it below.
void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha);

void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
                                      Tensor weight, Tensor grad_input,
                                      float gamma, float alpha);

void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha);

void softmax_focal_loss_backward_impl(Tensor input, Tensor target,
                                      Tensor weight, Tensor buff,
                                      Tensor grad_input, float gamma,
                                      float alpha);

// Associate the four CUDA wrappers with their *_impl keys.
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk - confirm
// its exact effect in the macro's header.
REGISTER_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, CUDA,
                     sigmoid_focal_loss_forward_cuda);
REGISTER_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, CUDA,
                     sigmoid_focal_loss_backward_cuda);
REGISTER_DEVICE_IMPL(softmax_focal_loss_forward_impl, CUDA,
                     softmax_focal_loss_forward_cuda);
REGISTER_DEVICE_IMPL(softmax_focal_loss_backward_impl, CUDA,
                     softmax_focal_loss_backward_cuda);

// Prototypes of the furthest-point-sampling kernel launchers. Unlike most
// launchers in this file they take raw pointers rather than Tensors: a
// read-only float buffer, a writable float scratch buffer and an int output
// buffer, plus three sizes (b, n, m). Bodies are not in this part of the file.
void FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m,
                                                    const float* dataset,
                                                    float* temp, int* idxs);

// "WithDist" variant; same signature as above.
void FurthestPointSamplingWithDistForwardCUDAKernelLauncher(
    int b, int n, int m, const float* dataset, float* temp, int* idxs);

// CUDA entry for furthest point sampling.
// Extracts typed raw pointers from the three tensors (float, float, int) and
// forwards them with the sizes b, n, m to
// FurthestPointSamplingForwardCUDAKernelLauncher.
void furthest_point_sampling_forward_cuda(Tensor points_tensor,
                                          Tensor temp_tensor, Tensor idx_tensor,
                                          int b, int n, int m) {
  // Pointers are taken in the same order as the launcher consumes them.
  const float* points_ptr = points_tensor.data_ptr<float>();
  float* scratch_ptr = temp_tensor.data_ptr<float>();
  int* index_ptr = idx_tensor.data_ptr<int>();

  FurthestPointSamplingForwardCUDAKernelLauncher(b, n, m, points_ptr,
                                                 scratch_ptr, index_ptr);
}

// CUDA entry for the "with dist" furthest point sampling variant.
// Extracts typed raw pointers from the three tensors (float, float, int) and
// forwards them with the sizes b, n, m to
// FurthestPointSamplingWithDistForwardCUDAKernelLauncher.
void furthest_point_sampling_with_dist_forward_cuda(Tensor points_tensor,
                                                    Tensor temp_tensor,
                                                    Tensor idx_tensor, int b,
                                                    int n, int m) {
  // Pointers are taken in the same order as the launcher consumes them.
  const float* points_ptr = points_tensor.data_ptr<float>();
  float* scratch_ptr = temp_tensor.data_ptr<float>();
  int* index_ptr = idx_tensor.data_ptr<int>();

  FurthestPointSamplingWithDistForwardCUDAKernelLauncher(
      b, n, m, points_ptr, scratch_ptr, index_ptr);
}

// Prototypes of the furthest-point-sampling dispatch keys (defined elsewhere),
// with the same parameter lists as the *_cuda wrappers registered below.
void furthest_point_sampling_forward_impl(Tensor points_tensor,
                                          Tensor temp_tensor, Tensor idx_tensor,
                                          int b, int n, int m);

void furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor,
                                                    Tensor temp_tensor,
                                                    Tensor idx_tensor, int b,
                                                    int n, int m);

// Associate the CUDA wrappers with their *_impl keys.
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk - confirm
// its exact effect in the macro's header.
REGISTER_DEVICE_IMPL(furthest_point_sampling_forward_impl, CUDA,
                     furthest_point_sampling_forward_cuda);
REGISTER_DEVICE_IMPL(furthest_point_sampling_with_dist_forward_impl, CUDA,
                     furthest_point_sampling_with_dist_forward_cuda);

// Prototype of the function that is registered for CUDA below. No wrapper is
// defined here: fused_bias_leakyrelu_op is registered directly, and its body
// is not in this part of the file.
torch::Tensor fused_bias_leakyrelu_op(const torch::Tensor& input,
                                      const torch::Tensor& bias,
                                      const torch::Tensor& refer, int act,
                                      int grad, float alpha, float scale);

// Prototype of the dispatch key (defined elsewhere); identical parameter list
// and return type.
torch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor& input,
                                           const torch::Tensor& bias,
                                           const torch::Tensor& refer, int act,
                                           int grad, float alpha, float scale);
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk - confirm
// its exact effect in the macro's header.
REGISTER_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, CUDA,
                     fused_bias_leakyrelu_op);

// Prototypes of the gather-points kernel launchers; bodies are not in this
// part of the file.

// Forward: reads points and idx, writes out. b, c, n, npoints are sizes.
void GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
                                           const Tensor points,
                                           const Tensor idx, Tensor out);

// Backward: reads grad_out and idx, writes grad_points.
void GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
                                            const Tensor grad_out,
                                            const Tensor idx,
                                            Tensor grad_points);

// CUDA entry for the gather-points forward pass.
// Passes all arguments, unchanged and in order, to
// GatherPointsForwardCUDAKernelLauncher.
void gather_points_forward_cuda(int b, int c, int n, int npoints,
                                const Tensor points, const Tensor idx,
                                Tensor out) {
  GatherPointsForwardCUDAKernelLauncher(b, c, n, npoints, points, idx, out);
}

// CUDA entry for the gather-points backward pass.
// Passes all arguments, unchanged and in order, to
// GatherPointsBackwardCUDAKernelLauncher.
void gather_points_backward_cuda(int b, int c, int n, int npoints,
                                 const Tensor grad_out, const Tensor idx,
                                 Tensor grad_points) {
  GatherPointsBackwardCUDAKernelLauncher(b, c, n, npoints, grad_out, idx,
                                         grad_points);
}

// Prototypes of the gather-points dispatch keys (defined elsewhere), with the
// same parameter lists as the *_cuda wrappers registered below.
void gather_points_forward_impl(int b, int c, int n, int npoints,
                                const Tensor points, const Tensor idx,
                                Tensor out);

void gather_points_backward_impl(int b, int c, int n, int npoints,
                                 const Tensor grad_out, const Tensor idx,
                                 Tensor grad_points);

// Associate the CUDA wrappers with their *_impl keys.
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk - confirm
// its exact effect in the macro's header.
REGISTER_DEVICE_IMPL(gather_points_forward_impl, CUDA,
                     gather_points_forward_cuda);
REGISTER_DEVICE_IMPL(gather_points_backward_impl, CUDA,
                     gather_points_backward_cuda);

// Prototypes of the group-points kernel launchers; bodies are not in this
// part of the file. Compared with the gather-points launchers they take one
// extra size argument, nsample.

// Forward: reads points and idx, writes out.
void GroupPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
                                          int nsample, const Tensor points,
                                          const Tensor idx, Tensor out);

// Backward: reads grad_out and idx, writes grad_points.
void GroupPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
                                           int nsample, const Tensor grad_out,
                                           const Tensor idx,
                                           Tensor grad_points);

// CUDA entry for the group-points forward pass.
// Passes all arguments, unchanged and in order, to
// GroupPointsForwardCUDAKernelLauncher.
void group_points_forward_cuda(int b, int c, int n, int npoints, int nsample,
                               const Tensor points, const Tensor idx,
                               Tensor out) {
  GroupPointsForwardCUDAKernelLauncher(b, c, n, npoints, nsample, points, idx,
                                       out);
}

// CUDA entry for the group-points backward pass.
// Passes all arguments, unchanged and in order, to
// GroupPointsBackwardCUDAKernelLauncher.
void group_points_backward_cuda(int b, int c, int n, int npoints, int nsample,
                                const Tensor grad_out, const Tensor idx,
                                Tensor grad_points) {
  GroupPointsBackwardCUDAKernelLauncher(b, c, n, npoints, nsample, grad_out,
                                        idx, grad_points);
}

// Prototypes of the group-points dispatch keys (defined elsewhere), with the
// same parameter lists as the *_cuda wrappers registered below.
void group_points_forward_impl(int b, int c, int n, int npoints, int nsample,
                               const Tensor points, const Tensor idx,
                               Tensor out);

void group_points_backward_impl(int b, int c, int n, int npoints, int nsample,
                                const Tensor grad_out, const Tensor idx,
                                Tensor grad_points);

// Associate the CUDA wrappers with their *_impl keys.
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk - confirm
// its exact effect in the macro's header.
REGISTER_DEVICE_IMPL(group_points_forward_impl, CUDA,
                     group_points_forward_cuda);
REGISTER_DEVICE_IMPL(group_points_backward_impl, CUDA,
                     group_points_backward_cuda);

// Prototype of the KNN kernel launcher; body is not in this part of the file.
// Reads xyz and new_xyz; idx and dist2 are the non-const (output) tensors.
void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,
                                  const Tensor xyz, const Tensor new_xyz,
                                  Tensor idx, Tensor dist2);

// CUDA entry for the KNN forward pass.
// Passes all eight arguments, unchanged and in order, to
// KNNForwardCUDAKernelLauncher.
void knn_forward_cuda(int b, int n, int m, int nsample, const Tensor xyz,
                      const Tensor new_xyz, Tensor idx, Tensor dist2) {
  KNNForwardCUDAKernelLauncher(
      b, n, m, nsample,
      xyz, new_xyz,
      idx, dist2);
}

// Prototype of the KNN dispatch key (defined elsewhere) and its CUDA
// registration.
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk - confirm
// its exact effect in the macro's header.
void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
                      const Tensor new_xyz, Tensor idx, Tensor dist2);
REGISTER_DEVICE_IMPL(knn_forward_impl, CUDA, knn_forward_cuda);

// Prototypes of the masked im2col / col2im kernel launchers; bodies are not in
// this part of the file. Both read bottom_data and two mask index tensors
// and write top_data.

// im2col direction: additionally takes kernel size and padding.
void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
                                           const Tensor mask_h_idx,
                                           const Tensor mask_w_idx,
                                           Tensor top_data, const int kernel_h,
                                           const int kernel_w, const int pad_h,
                                           const int pad_w);

// col2im direction: additionally takes the image height, width and channels.
void MaskedCol2imForwardCUDAKernelLauncher(const Tensor bottom_data,
                                           const Tensor mask_h_idx,
                                           const Tensor mask_w_idx,
                                           Tensor top_data, const int height,
                                           const int width, const int channels);

// CUDA entry for masked im2col.
// Passes all arguments, unchanged and in order, to
// MaskedIm2colForwardCUDAKernelLauncher (`im` -> bottom_data, `col` ->
// top_data).
//
// Layouts:
//   im:     (n, ic, h, w), kernel size (kh, kw)
//   kernel: (oc, ic * kh * kw)
//   col:    (kh * kw * ic, ow * oh)
void masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor col,
                                const int kernel_h, const int kernel_w,
                                const int pad_h, const int pad_w) {
  MaskedIm2colForwardCUDAKernelLauncher(
      im, mask_h_idx, mask_w_idx, col,
      kernel_h, kernel_w,
      pad_h, pad_w);
}

// CUDA entry for masked col2im.
// Passes all arguments, unchanged and in order, to
// MaskedCol2imForwardCUDAKernelLauncher (`col` -> bottom_data, `im` ->
// top_data).
//
// Layouts:
//   im:     (n, ic, h, w), kernel size (kh, kw)
//   kernel: (oc, ic * kh * kw)
//   col:    (kh * kw * ic, ow * oh)
void masked_col2im_forward_cuda(const Tensor col, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor im, int height,
                                int width, int channels) {
  MaskedCol2imForwardCUDAKernelLauncher(
      col, mask_h_idx, mask_w_idx, im,
      height, width, channels);
}

// Prototypes of the masked im2col / col2im dispatch keys (defined elsewhere),
// with the same parameter lists as the *_cuda wrappers registered below.
void masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor col,
                                const int kernel_h, const int kernel_w,
                                const int pad_h, const int pad_w);

void masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor im, int height,
                                int width, int channels);

// Associate the CUDA wrappers with their *_impl keys.
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk - confirm
// its exact effect in the macro's header.
REGISTER_DEVICE_IMPL(masked_im2col_forward_impl, CUDA,
                     masked_im2col_forward_cuda);
REGISTER_DEVICE_IMPL(masked_col2im_forward_impl, CUDA,
                     masked_col2im_forward_cuda);

// Prototypes of the modulated deformable-conv CUDA functions. No wrapper is
// defined in this file section: these names are registered directly further
// down, and their bodies are not in this part of the file. Compared with the
// non-modulated deformable functions they take an extra data_mask tensor and
// explicit batch_size / height_col / width_col sizes.

// Reads image, offsets and mask; writes data_col.
void modulated_deformable_im2col_cuda(
    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor data_col);

// Reads the column buffer, offsets and mask; writes grad_im.
void modulated_deformable_col2im_cuda(
    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor grad_im);

// Reads the column buffer, image, offsets and mask; writes two tensors,
// grad_offset and grad_mask.
void modulated_deformable_col2im_coord_cuda(
    const Tensor data_col, const Tensor data_im, const Tensor data_offset,
    const Tensor data_mask, const int batch_size, const int channels,
    const int height_im, const int width_im, const int height_col,
    const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int deformable_group,
    Tensor grad_offset, Tensor grad_mask);

// Prototypes of the modulated deformable-conv dispatch keys (defined
// elsewhere). Each parameter list matches the modulated_deformable_*_cuda
// function registered for it below.
void modulated_deformable_im2col_impl(
    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor data_col);

void modulated_deformable_col2im_impl(
    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor grad_im);

void modulated_deformable_col2im_coord_impl(
    const Tensor data_col, const Tensor data_im, const Tensor data_offset,
    const Tensor data_mask, const int batch_size, const int channels,
    const int height_im, const int width_im, const int height_col,
    const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int deformable_group,
    Tensor grad_offset, Tensor grad_mask);

// Associate the CUDA functions with their *_impl keys.
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk - confirm
// its exact effect in the macro's header.
REGISTER_DEVICE_IMPL(modulated_deformable_im2col_impl, CUDA,
                     modulated_deformable_im2col_cuda);
REGISTER_DEVICE_IMPL(modulated_deformable_col2im_impl, CUDA,
                     modulated_deformable_col2im_cuda);
REGISTER_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, CUDA,
                     modulated_deformable_col2im_coord_cuda);

// Prototypes of the multi-scale deformable attention CUDA functions. They are
// registered directly (no wrapper in this file section); bodies are not in
// this part of the file.

// Forward: all tensors are read-only references; the result is returned.
Tensor ms_deform_attn_cuda_forward(const Tensor& value,
                                   const Tensor& spatial_shapes,
                                   const Tensor& level_start_index,
                                   const Tensor& sampling_loc,
                                   const Tensor& attn_weight,
                                   const int im2col_step);

// Backward: same inputs plus grad_output; the three gradients are written
// through non-const references.
void ms_deform_attn_cuda_backward(
    const Tensor& value, const Tensor& spatial_shapes,
    const Tensor& level_start_index, const Tensor& sampling_loc,
    const Tensor& attn_weight, const Tensor& grad_output, Tensor& grad_value,
    Tensor& grad_sampling_loc, Tensor& grad_attn_weight, const int im2col_step);

// Prototypes of the dispatch keys (defined elsewhere); signatures match the
// CUDA functions above.
Tensor ms_deform_attn_impl_forward(const Tensor& value,
                                   const Tensor& spatial_shapes,
                                   const Tensor& level_start_index,
                                   const Tensor& sampling_loc,
                                   const Tensor& attn_weight,
                                   const int im2col_step);

void ms_deform_attn_impl_backward(
    const Tensor& value, const Tensor& spatial_shapes,
    const Tensor& level_start_index, const Tensor& sampling_loc,
    const Tensor& attn_weight, const Tensor& grad_output, Tensor& grad_value,
    Tensor& grad_sampling_loc, Tensor& grad_attn_weight, const int im2col_step);

// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk - confirm
// its exact effect in the macro's header.
REGISTER_DEVICE_IMPL(ms_deform_attn_impl_forward, CUDA,
                     ms_deform_attn_cuda_forward);
REGISTER_DEVICE_IMPL(ms_deform_attn_impl_backward, CUDA,
                     ms_deform_attn_cuda_backward);

// Prototype of the NMS kernel launcher; body is not in this part of the file.
// Takes boxes, scores, an IoU threshold and an integer offset, and returns a
// Tensor.
Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
                             int offset);

// CUDA entry for NMS.
// Passes all four arguments, unchanged and in order, to NMSCUDAKernelLauncher
// and returns whatever tensor the launcher produces.
Tensor nms_cuda(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  return NMSCUDAKernelLauncher(
      boxes, scores,
      iou_threshold, offset);
}

// Prototype of the NMS dispatch key (defined elsewhere) and its CUDA
// registration.
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk - confirm
// its exact effect in the macro's header.
Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset);
REGISTER_DEVICE_IMPL(nms_impl, CUDA, nms_cuda);

// Prototypes of the points-in-boxes kernel launchers ("part" and "all"
// variants); bodies are not in this part of the file. Both read boxes and pts
// and write box_idx_of_points.
void PointsInBoxesPartForwardCUDAKernelLauncher(int batch_size, int boxes_num,
                                                int pts_num, const Tensor boxes,
                                                const Tensor pts,
                                                Tensor box_idx_of_points);

void PointsInBoxesAllForwardCUDAKernelLauncher(int batch_size, int boxes_num,
                                               int pts_num, const Tensor boxes,
                                               const Tensor pts,
                                               Tensor box_idx_of_points);

// CUDA entry for the "part" points-in-boxes variant.
// Passes all arguments, unchanged and in order, to
// PointsInBoxesPartForwardCUDAKernelLauncher.
void points_in_boxes_part_forward_cuda(int batch_size, int boxes_num,
                                       int pts_num, const Tensor boxes,
                                       const Tensor pts,
                                       Tensor box_idx_of_points) {
  PointsInBoxesPartForwardCUDAKernelLauncher(batch_size, boxes_num, pts_num,
                                             boxes, pts, box_idx_of_points);
}

// CUDA entry for the "all" points-in-boxes variant.
// Passes all arguments, unchanged and in order, to
// PointsInBoxesAllForwardCUDAKernelLauncher.
void points_in_boxes_all_forward_cuda(int batch_size, int boxes_num,
                                      int pts_num, const Tensor boxes,
                                      const Tensor pts,
                                      Tensor box_idx_of_points) {
  PointsInBoxesAllForwardCUDAKernelLauncher(batch_size, boxes_num, pts_num,
                                            boxes, pts, box_idx_of_points);
}

// Prototypes of the points-in-boxes dispatch keys (defined elsewhere), with
// the same parameter lists as the *_cuda wrappers registered below.
void points_in_boxes_part_forward_impl(int batch_size, int boxes_num,
                                       int pts_num, const Tensor boxes,
                                       const Tensor pts,
                                       Tensor box_idx_of_points);

void points_in_boxes_all_forward_impl(int batch_size, int boxes_num,
                                      int pts_num, const Tensor boxes,
                                      const Tensor pts,
                                      Tensor box_idx_of_points);
// Associate the CUDA wrappers with their *_impl keys.
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk - confirm
// its exact effect in the macro's header.
REGISTER_DEVICE_IMPL(points_in_boxes_part_forward_impl, CUDA,
                     points_in_boxes_part_forward_cuda);
REGISTER_DEVICE_IMPL(points_in_boxes_all_forward_impl, CUDA,
                     points_in_boxes_all_forward_cuda);

// Prototypes of the PSA mask kernel launchers; bodies are not in this part of
// the file. Both take an integer psa_type selector, one read-only tensor, one
// writable tensor and the feature / mask sizes.

// Forward: reads input, writes output.
void PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input,
                                      Tensor output, const int num_,
                                      const int h_feature, const int w_feature,
                                      const int h_mask, const int w_mask,
                                      const int half_h_mask,
                                      const int half_w_mask);

// Backward: reads grad_output, writes grad_input.
void PSAMaskBackwardCUDAKernelLauncher(
    const int psa_type, const Tensor grad_output, Tensor grad_input,
    const int num_, const int h_feature, const int w_feature, const int h_mask,
    const int w_mask, const int half_h_mask, const int half_w_mask);

// CUDA entry for the PSA mask forward pass.
// Passes all ten arguments, unchanged and in order, to
// PSAMaskForwardCUDAKernelLauncher.
void psamask_forward_cuda(const int psa_type, const Tensor input, Tensor output,
                          const int num_, const int h_feature,
                          const int w_feature, const int h_mask,
                          const int w_mask, const int half_h_mask,
                          const int half_w_mask) {
  PSAMaskForwardCUDAKernelLauncher(
      psa_type, input, output, num_,
      h_feature, w_feature,
      h_mask, w_mask,
      half_h_mask, half_w_mask);
}

// CUDA entry for the PSA mask backward pass.
// Passes all ten arguments, unchanged and in order, to
// PSAMaskBackwardCUDAKernelLauncher.
void psamask_backward_cuda(const int psa_type, const Tensor grad_output,
                           Tensor grad_input, const int num_,
                           const int h_feature, const int w_feature,
                           const int h_mask, const int w_mask,
                           const int half_h_mask, const int half_w_mask) {
  PSAMaskBackwardCUDAKernelLauncher(
      psa_type, grad_output, grad_input, num_,
      h_feature, w_feature,
      h_mask, w_mask,
      half_h_mask, half_w_mask);
}

// Prototypes of the PSA mask dispatch keys (defined elsewhere), with the same
// parameter lists as the *_cuda wrappers registered below.
void psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,
                          const int num_, const int h_feature,
                          const int w_feature, const int h_mask,
                          const int w_mask, const int half_h_mask,
                          const int half_w_mask);

void psamask_backward_impl(const int psa_type, const Tensor grad_output,
                           Tensor grad_input, const int num_,
                           const int h_feature, const int w_feature,
                           const int h_mask, const int w_mask,
                           const int half_h_mask, const int half_w_mask);
// Associate the CUDA wrappers with their *_impl keys.
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk - confirm
// its exact effect in the macro's header.
REGISTER_DEVICE_IMPL(psamask_forward_impl, CUDA, psamask_forward_cuda);
REGISTER_DEVICE_IMPL(psamask_backward_impl, CUDA, psamask_backward_cuda);

// Prototypes of the RoI Align kernel launchers; bodies are not in this part
// of the file. Both carry argmax_y / argmax_x tensors and an integer
// pool_mode selector.
// NOTE(review): presumably argmax_* are only meaningful for one pool_mode -
// confirm in the kernel.

// Forward: reads input and rois; output, argmax_y and argmax_x are passed
// alongside.
void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                       Tensor argmax_y, Tensor argmax_x,
                                       int aligned_height, int aligned_width,
                                       float spatial_scale, int sampling_ratio,
                                       int pool_mode, bool aligned);

// Backward: takes grad_output, rois and the argmax tensors; grad_input is
// the gradient tensor.
void ROIAlignBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
                                        Tensor argmax_y, Tensor argmax_x,
                                        Tensor grad_input, int aligned_height,
                                        int aligned_width, float spatial_scale,
                                        int sampling_ratio, int pool_mode,
                                        bool aligned);

// CUDA entry for the RoI Align forward pass.
// Passes all eleven arguments, unchanged and in order, to
// ROIAlignForwardCUDAKernelLauncher.
void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned) {
  ROIAlignForwardCUDAKernelLauncher(input, rois, output,
                                    argmax_y, argmax_x,
                                    aligned_height, aligned_width,
                                    spatial_scale, sampling_ratio,
                                    pool_mode, aligned);
}

// CUDA entry for the RoI Align backward pass.
// Passes all eleven arguments, unchanged and in order, to
// ROIAlignBackwardCUDAKernelLauncher.
void roi_align_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax_y,
                             Tensor argmax_x, Tensor grad_input,
                             int aligned_height, int aligned_width,
                             float spatial_scale, int sampling_ratio,
                             int pool_mode, bool aligned) {
  ROIAlignBackwardCUDAKernelLauncher(grad_output, rois,
                                     argmax_y, argmax_x,
                                     grad_input,
                                     aligned_height, aligned_width,
                                     spatial_scale, sampling_ratio,
                                     pool_mode, aligned);
}

// Prototypes of the RoI Align dispatch keys (defined elsewhere), with the
// same parameter lists as the *_cuda wrappers registered below.
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned);

void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,
                             Tensor argmax_x, Tensor grad_input,
                             int aligned_height, int aligned_width,
                             float spatial_scale, int sampling_ratio,
                             int pool_mode, bool aligned);

// Associate the CUDA wrappers with their *_impl keys.
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk - confirm
// its exact effect in the macro's header.
REGISTER_DEVICE_IMPL(roi_align_forward_impl, CUDA, roi_align_forward_cuda);
REGISTER_DEVICE_IMPL(roi_align_backward_impl, CUDA, roi_align_backward_cuda);

// Prototypes of the rotated RoI Align kernel launchers; bodies are not in
// this part of the file. Unlike the wrapper-facing signatures, these take the
// feature-map sizes (channels, height, width) and num_rois as explicit ints,
// and the writable tensor comes last.

// Forward: reads input and rois, writes output.
void ROIAlignRotatedForwardCUDAKernelLauncher(
    const at::Tensor input, const at::Tensor rois, const float spatial_scale,
    const int sampling_ratio, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, at::Tensor output);

// Backward: reads top_grad and rois, writes bottom_grad.
void ROIAlignRotatedBackwardCUDAKernelLauncher(
    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
    const int sampling_ratio, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, at::Tensor bottom_grad);

// CUDA entry for the rotated RoI Align forward pass.
//
// Derives the sizes the launcher needs from the tensors themselves:
//   - rois.size(0) is forwarded as num_rois, and rois.size(1) must equal 6;
//   - input.size(1..3) are forwarded as channels, height and width.
// aligned_height / aligned_width are forwarded as the launcher's
// pooled_height / pooled_width.
//
// Raises (via AT_ERROR) "wrong roi size" when rois.size(1) != 6.
void roi_align_rotated_forward_cuda(Tensor input, Tensor rois, Tensor output,
                                    int aligned_height, int aligned_width,
                                    float spatial_scale, int sampling_ratio,
                                    bool aligned, bool clockwise) {
  // RoI count and per-RoI field count.
  int roi_count = rois.size(0);
  int roi_fields = rois.size(1);
  if (roi_fields != 6) {
    AT_ERROR("wrong roi size");
  }

  // Feature-map geometry, read from dims 1..3 of the input.
  int channels = input.size(1);
  int height = input.size(2);
  int width = input.size(3);

  ROIAlignRotatedForwardCUDAKernelLauncher(
      input, rois, spatial_scale, sampling_ratio, aligned, clockwise,
      channels, height, width, roi_count,
      aligned_height, aligned_width, output);
}

// CUDA entry for the rotated RoI Align backward pass.
//
// Derives the sizes the launcher needs from the tensors themselves:
//   - rois.size(0) is forwarded as num_rois, and rois.size(1) must equal 6;
//   - bottom_grad.size(1..3) are forwarded as channels, height and width.
// aligned_height / aligned_width are forwarded as the launcher's
// pooled_height / pooled_width.
//
// Raises (via AT_ERROR) "wrong roi size" when rois.size(1) != 6.
void roi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,
                                     Tensor bottom_grad, int aligned_height,
                                     int aligned_width, float spatial_scale,
                                     int sampling_ratio, bool aligned,
                                     bool clockwise) {
  // RoI count and per-RoI field count.
  int roi_count = rois.size(0);
  int roi_fields = rois.size(1);
  if (roi_fields != 6) {
    AT_ERROR("wrong roi size");
  }

  // Feature-map geometry comes from the gradient buffer being filled.
  int channels = bottom_grad.size(1);
  int height = bottom_grad.size(2);
  int width = bottom_grad.size(3);

  ROIAlignRotatedBackwardCUDAKernelLauncher(
      top_grad, rois, spatial_scale, sampling_ratio, aligned, clockwise,
      channels, height, width, roi_count,
      aligned_height, aligned_width, bottom_grad);
}

// Prototypes of the rotated RoI Align dispatch keys (defined elsewhere), with
// the same parameter lists as the *_cuda wrappers registered below.
void roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,
                                    int aligned_height, int aligned_width,
                                    float spatial_scale, int sampling_ratio,
                                    bool aligned, bool clockwise);

void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
                                     Tensor bottom_grad, int aligned_height,
                                     int aligned_width, float spatial_scale,
                                     int sampling_ratio, bool aligned,
                                     bool clockwise);
// Associate the CUDA wrappers with their *_impl keys.
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk - confirm
// its exact effect in the macro's header.
REGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, CUDA,
                     roi_align_rotated_forward_cuda);
REGISTER_DEVICE_IMPL(roi_align_rotated_backward_impl, CUDA,
                     roi_align_rotated_backward_cuda);

void RiROIAlignRotatedForwardCUDAKernelLauncher(
    const at::Tensor features, const at::Tensor rois, const float spatial_scale,
    const int num_samples, const bool clockwise, const int channels,
    const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, const int num_orientations,
    at::Tensor output);

void RiROIAlignRotatedBackwardCUDAKernelLauncher(
    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
    const int num_samples, const bool clockwise, const int channels,
    const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, const int num_orientations,
    at::Tensor bottom_grad);

// CUDA wrapper for the forward pass of RiROI Align (rotated).
//
// Validates the inputs, derives the per-orientation channel count as
// features.size(1) / num_orientations, reads height and width from
// dimensions 2 and 3 of `features`, and calls
// RiROIAlignRotatedForwardCUDAKernelLauncher.
//
// Raises a c10::Error when rois.size(1) != 6 ("wrong roi size"), when
// `features` or `rois` is not contiguous, or when num_orientations is not
// positive.
void riroi_align_rotated_forward_cuda(Tensor features, Tensor rois,
                                      Tensor output, int pooled_height,
                                      int pooled_width, float spatial_scale,
                                      int num_samples, int num_orientations,
                                      bool clockwise) {
  // Number of ROIs
  int num_rois = rois.size(0);
  // TORCH_CHECK is the supported spelling of the deprecated AT_ERROR alias;
  // the error text is kept identical for callers that match on it.
  TORCH_CHECK(rois.size(1) == 6, "wrong roi size");
  CHECK_CONTIGUOUS(features);
  CHECK_CONTIGUOUS(rois);
  // Guard the integer division below: dividing by zero is undefined
  // behaviour and would otherwise abort the process instead of raising.
  TORCH_CHECK(num_orientations > 0,
              "num_orientations must be positive, but got ", num_orientations);
  int num_channels = features.size(1) / num_orientations;
  int data_height = features.size(2);
  int data_width = features.size(3);
  RiROIAlignRotatedForwardCUDAKernelLauncher(
      features, rois, spatial_scale, num_samples, clockwise, num_channels,
      data_height, data_width, num_rois, pooled_height, pooled_width,
      num_orientations, output);
}

// CUDA wrapper for the backward pass of RiROI Align (rotated).
//
// Validates the inputs, derives the per-orientation channel count as
// bottom_grad.size(1) / num_orientations, reads height and width from
// dimensions 2 and 3 of `bottom_grad`, and calls
// RiROIAlignRotatedBackwardCUDAKernelLauncher.
//
// Raises a c10::Error when rois.size(1) != 6 ("wrong roi size"), when
// `top_grad` or `rois` is not contiguous, or when num_orientations is not
// positive.
void riroi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,
                                       Tensor bottom_grad, int pooled_height,
                                       int pooled_width, float spatial_scale,
                                       int num_samples, int num_orientations,
                                       bool clockwise) {
  // Number of ROIs
  int num_rois = rois.size(0);
  // TORCH_CHECK is the supported spelling of the deprecated AT_ERROR alias;
  // the error text is kept identical for callers that match on it.
  TORCH_CHECK(rois.size(1) == 6, "wrong roi size");
  CHECK_CONTIGUOUS(top_grad);
  CHECK_CONTIGUOUS(rois);
  // Guard the integer division below: dividing by zero is undefined
  // behaviour and would otherwise abort the process instead of raising.
  TORCH_CHECK(num_orientations > 0,
              "num_orientations must be positive, but got ", num_orientations);
  int num_channels = bottom_grad.size(1) / num_orientations;
  int data_height = bottom_grad.size(2);
  int data_width = bottom_grad.size(3);
  RiROIAlignRotatedBackwardCUDAKernelLauncher(
      top_grad, rois, spatial_scale, num_samples, clockwise, num_channels,
      data_height, data_width, num_rois, pooled_height, pooled_width,
      num_orientations, bottom_grad);
}

// Prototypes of the RiROI Align `*_impl` entry points. They are only declared
// here so the registrations below can refer to them; their definitions are
// not visible in this part of the file.
void riroi_align_rotated_forward_impl(Tensor features, Tensor rois,
                                      Tensor output, int pooled_height,
                                      int pooled_width, float spatial_scale,
                                      int num_samples, int num_orientations,
                                      bool clockwise);

void riroi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
                                       Tensor bottom_grad, int pooled_height,
                                       int pooled_width, float spatial_scale,
                                       int num_samples, int num_orientations,
                                       bool clockwise);

// Register the `*_cuda` wrappers for the CUDA device of each `*_impl` symbol.
REGISTER_DEVICE_IMPL(riroi_align_rotated_forward_impl, CUDA,
                     riroi_align_rotated_forward_cuda);
REGISTER_DEVICE_IMPL(riroi_align_rotated_backward_impl, CUDA,
                     riroi_align_rotated_backward_cuda);

// Launcher prototypes used by the roiaware_pool3d_*_cuda wrappers that
// follow. Definitions live outside this section (presumably in the CUDA
// kernel sources -- confirm).
void RoiawarePool3dForwardCUDAKernelLauncher(
    int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,
    int out_y, int out_z, const Tensor rois, const Tensor pts,
    const Tensor pts_feature, Tensor argmax, Tensor pts_idx_of_voxels,
    Tensor pooled_features, int pool_method);

void RoiawarePool3dBackwardCUDAKernelLauncher(
    int boxes_num, int out_x, int out_y, int out_z, int channels,
    int max_pts_each_voxel, const Tensor pts_idx_of_voxels, const Tensor argmax,
    const Tensor grad_out, Tensor grad_in, int pool_method);

// CUDA wrapper for roiaware_pool3d forward: relays every argument, in the
// same order, to RoiawarePool3dForwardCUDAKernelLauncher.
void roiaware_pool3d_forward_cuda(int boxes_num, int pts_num, int channels,
                                  int max_pts_each_voxel, int out_x, int out_y,
                                  int out_z, const Tensor rois,
                                  const Tensor pts, const Tensor pts_feature,
                                  Tensor argmax, Tensor pts_idx_of_voxels,
                                  Tensor pooled_features, int pool_method) {
  RoiawarePool3dForwardCUDAKernelLauncher(
      boxes_num, pts_num, channels, max_pts_each_voxel,
      out_x, out_y, out_z,
      rois, pts, pts_feature,
      argmax, pts_idx_of_voxels, pooled_features,
      pool_method);
}

// CUDA wrapper for roiaware_pool3d backward: relays every argument, in the
// same order, to RoiawarePool3dBackwardCUDAKernelLauncher.
void roiaware_pool3d_backward_cuda(int boxes_num, int out_x, int out_y,
                                   int out_z, int channels,
                                   int max_pts_each_voxel,
                                   const Tensor pts_idx_of_voxels,
                                   const Tensor argmax, const Tensor grad_out,
                                   Tensor grad_in, int pool_method) {
  RoiawarePool3dBackwardCUDAKernelLauncher(
      boxes_num,
      out_x, out_y, out_z,
      channels, max_pts_each_voxel,
      pts_idx_of_voxels, argmax,
      grad_out, grad_in,
      pool_method);
}

// Prototypes of the roiaware_pool3d `*_impl` entry points. They are only
// declared here so the registrations below can refer to them; their
// definitions are not visible in this part of the file.
void roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels,
                                  int max_pts_each_voxel, int out_x, int out_y,
                                  int out_z, const Tensor rois,
                                  const Tensor pts, const Tensor pts_feature,
                                  Tensor argmax, Tensor pts_idx_of_voxels,
                                  Tensor pooled_features, int pool_method);

void roiaware_pool3d_backward_impl(int boxes_num, int out_x, int out_y,
                                   int out_z, int channels,
                                   int max_pts_each_voxel,
                                   const Tensor pts_idx_of_voxels,
                                   const Tensor argmax, const Tensor grad_out,
                                   Tensor grad_in, int pool_method);

// Register the `*_cuda` wrappers for the CUDA device of each `*_impl` symbol.
REGISTER_DEVICE_IMPL(roiaware_pool3d_forward_impl, CUDA,
                     roiaware_pool3d_forward_cuda);
REGISTER_DEVICE_IMPL(roiaware_pool3d_backward_impl, CUDA,
                     roiaware_pool3d_backward_cuda);

// Launcher prototype used by roipoint_pool3d_forward_cuda below. The
// definition lives outside this section (presumably in the CUDA kernel
// sources -- confirm).
void RoIPointPool3dForwardCUDAKernelLauncher(
    int batch_size, int pts_num, int boxes_num, int feature_in_len,
    int sampled_pts_num, const Tensor xyz, const Tensor boxes3d,
    const Tensor pts_feature, Tensor pooled_features, Tensor pooled_empty_flag);

// CUDA wrapper for roipoint_pool3d forward: relays every argument, in the
// same order, to RoIPointPool3dForwardCUDAKernelLauncher.
void roipoint_pool3d_forward_cuda(int batch_size, int pts_num, int boxes_num,
                                  int feature_in_len, int sampled_pts_num,
                                  const Tensor xyz, const Tensor boxes3d,
                                  const Tensor pts_feature,
                                  Tensor pooled_features,
                                  Tensor pooled_empty_flag) {
  RoIPointPool3dForwardCUDAKernelLauncher(
      batch_size, pts_num, boxes_num,
      feature_in_len, sampled_pts_num,
      xyz, boxes3d, pts_feature,
      pooled_features, pooled_empty_flag);
}

// Prototype of the roipoint_pool3d `*_impl` entry point. It is only declared
// here so the registration below can refer to it; its definition is not
// visible in this part of the file.
void roipoint_pool3d_forward_impl(int batch_size, int pts_num, int boxes_num,
                                  int feature_in_len, int sampled_pts_num,
                                  const Tensor xyz, const Tensor boxes3d,
                                  const Tensor pts_feature,
                                  Tensor pooled_features,
                                  Tensor pooled_empty_flag);
// Register the `*_cuda` wrapper for the CUDA device of the `*_impl` symbol.
REGISTER_DEVICE_IMPL(roipoint_pool3d_forward_impl, CUDA,
                     roipoint_pool3d_forward_cuda);

// Launcher prototypes used by the roi_pool_*_cuda wrappers that follow.
// Definitions live outside this section (presumably in the CUDA kernel
// sources -- confirm).
void ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                      Tensor argmax, int pooled_height,
                                      int pooled_width, float spatial_scale);

void ROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
                                       Tensor argmax, Tensor grad_input,
                                       int pooled_height, int pooled_width,
                                       float spatial_scale);

// CUDA wrapper for roi_pool forward: relays every argument, in the same
// order, to ROIPoolForwardCUDAKernelLauncher.
void roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax, int pooled_height, int pooled_width,
                           float spatial_scale) {
  ROIPoolForwardCUDAKernelLauncher(
      input, rois, output, argmax,
      pooled_height, pooled_width, spatial_scale);
}

// CUDA wrapper for roi_pool backward: relays every argument, in the same
// order, to ROIPoolBackwardCUDAKernelLauncher.
void roi_pool_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax,
                            Tensor grad_input, int pooled_height,
                            int pooled_width, float spatial_scale) {
  ROIPoolBackwardCUDAKernelLauncher(
      grad_output, rois, argmax, grad_input,
      pooled_height, pooled_width, spatial_scale);
}

// Prototypes of the roi_pool `*_impl` entry points. They are only declared
// here so the registrations below can refer to them; their definitions are
// not visible in this part of the file.
void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax, int pooled_height, int pooled_width,
                           float spatial_scale);
void roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,
                            Tensor grad_input, int pooled_height,
                            int pooled_width, float spatial_scale);
// Register the `*_cuda` wrappers for the CUDA device of each `*_impl` symbol.
REGISTER_DEVICE_IMPL(roi_pool_forward_impl, CUDA, roi_pool_forward_cuda);
REGISTER_DEVICE_IMPL(roi_pool_backward_impl, CUDA, roi_pool_backward_cuda);

// Reduction selector passed to the dynamic point-to-voxel functions below:
// SUM = 0, MEAN = 1, MAX = 2.
typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;

// Launcher prototypes used by the dynamic_point_to_voxel_*_cuda wrappers that
// follow. Definitions live outside this section (presumably in the CUDA
// kernel sources -- confirm).
std::vector<at::Tensor> DynamicPointToVoxelForwardCUDAKernelLauncher(
    const at::Tensor& feats, const at::Tensor& coors,
    const reduce_t reduce_type);

void DynamicPointToVoxelBackwardCUDAKernelLauncher(
    at::Tensor& grad_feats, const at::Tensor& grad_reduced_feats,
    const at::Tensor& feats, const at::Tensor& reduced_feats,
    const at::Tensor& coors_map, const at::Tensor& reduce_count,
    const reduce_t reduce_type);

// CUDA wrapper for dynamic_point_to_voxel forward: calls
// DynamicPointToVoxelForwardCUDAKernelLauncher with the same arguments and
// returns the tensor list it produces.
std::vector<torch::Tensor> dynamic_point_to_voxel_forward_cuda(
    const torch::Tensor& feats, const torch::Tensor& coors,
    const reduce_t reduce_type) {
  std::vector<torch::Tensor> launcher_outputs =
      DynamicPointToVoxelForwardCUDAKernelLauncher(feats, coors, reduce_type);
  return launcher_outputs;
}

// CUDA wrapper for dynamic_point_to_voxel backward: relays every argument, in
// the same order, to DynamicPointToVoxelBackwardCUDAKernelLauncher (which
// names the `coors_idx` argument `coors_map` in its prototype).
void dynamic_point_to_voxel_backward_cuda(
    torch::Tensor& grad_feats, const torch::Tensor& grad_reduced_feats,
    const torch::Tensor& feats, const torch::Tensor& reduced_feats,
    const torch::Tensor& coors_idx, const torch::Tensor& reduce_count,
    const reduce_t reduce_type) {
  DynamicPointToVoxelBackwardCUDAKernelLauncher(
      grad_feats, grad_reduced_feats,
      feats, reduced_feats,
      coors_idx, reduce_count,
      reduce_type);
}

// Prototypes of the dynamic_point_to_voxel `*_impl` entry points. They are
// only declared here so the registrations below can refer to them; their
// definitions are not visible in this part of the file.
std::vector<torch::Tensor> dynamic_point_to_voxel_forward_impl(
    const torch::Tensor& feats, const torch::Tensor& coors,
    const reduce_t reduce_type);

void dynamic_point_to_voxel_backward_impl(
    torch::Tensor& grad_feats, const torch::Tensor& grad_reduced_feats,
    const torch::Tensor& feats, const torch::Tensor& reduced_feats,
    const torch::Tensor& coors_idx, const torch::Tensor& reduce_count,
    const reduce_t reduce_type);

// Register the `*_cuda` wrappers for the CUDA device of each `*_impl` symbol.
REGISTER_DEVICE_IMPL(dynamic_point_to_voxel_forward_impl, CUDA,
                     dynamic_point_to_voxel_forward_cuda);
REGISTER_DEVICE_IMPL(dynamic_point_to_voxel_backward_impl, CUDA,
                     dynamic_point_to_voxel_backward_cuda);

// Launcher prototypes used by the sync_bn_*_cuda wrappers that follow.
// Definitions live outside this section (presumably in the CUDA kernel
// sources -- confirm).
void SyncBNForwardMeanCUDAKernelLauncher(const Tensor input, Tensor mean);

void SyncBNForwardVarCUDAKernelLauncher(const Tensor input, const Tensor mean,
                                        Tensor var);

void SyncBNForwardOutputCUDAKernelLauncher(
    const Tensor input, const Tensor mean, const Tensor var,
    Tensor running_mean, Tensor running_var, const Tensor weight,
    const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps,
    float momentum, int group_size);

void SyncBNBackwardParamCUDAKernelLauncher(const Tensor grad_output,
                                           const Tensor norm,
                                           Tensor grad_weight,
                                           Tensor grad_bias);

void SyncBNBackwardDataCUDAKernelLauncher(const Tensor grad_output,
                                          const Tensor weight,
                                          const Tensor grad_weight,
                                          const Tensor grad_bias,
                                          const Tensor norm, const Tensor std,
                                          Tensor grad_input);

// CUDA wrapper for sync_bn forward-mean: relays both arguments, in the same
// order, to SyncBNForwardMeanCUDAKernelLauncher.
void sync_bn_forward_mean_cuda(const Tensor input, Tensor mean) {
  SyncBNForwardMeanCUDAKernelLauncher(
      input,
      mean);
}

// CUDA wrapper for sync_bn forward-var: relays every argument, in the same
// order, to SyncBNForwardVarCUDAKernelLauncher.
void sync_bn_forward_var_cuda(const Tensor input, const Tensor mean,
                              Tensor var) {
  SyncBNForwardVarCUDAKernelLauncher(
      input,
      mean,
      var);
}

// CUDA wrapper for sync_bn forward-output: relays every argument, in the same
// order, to SyncBNForwardOutputCUDAKernelLauncher.
void sync_bn_forward_output_cuda(const Tensor input, const Tensor mean,
                                 const Tensor var, Tensor running_mean,
                                 Tensor running_var, const Tensor weight,
                                 const Tensor bias, Tensor norm, Tensor std,
                                 Tensor output, float eps, float momentum,
                                 int group_size) {
  SyncBNForwardOutputCUDAKernelLauncher(
      input, mean, var,
      running_mean, running_var,
      weight, bias,
      norm, std, output,
      eps, momentum, group_size);
}

// CUDA wrapper for sync_bn backward-param: relays every argument, in the same
// order, to SyncBNBackwardParamCUDAKernelLauncher.
void sync_bn_backward_param_cuda(const Tensor grad_output, const Tensor norm,
                                 Tensor grad_weight, Tensor grad_bias) {
  SyncBNBackwardParamCUDAKernelLauncher(
      grad_output, norm,
      grad_weight, grad_bias);
}

// CUDA wrapper for sync_bn backward-data: relays every argument, in the same
// order, to SyncBNBackwardDataCUDAKernelLauncher.
void sync_bn_backward_data_cuda(const Tensor grad_output, const Tensor weight,
                                const Tensor grad_weight,
                                const Tensor grad_bias, const Tensor norm,
                                const Tensor std, Tensor grad_input) {
  SyncBNBackwardDataCUDAKernelLauncher(
      grad_output, weight,
      grad_weight, grad_bias,
      norm, std,
      grad_input);
}

// Prototypes of the sync_bn `*_impl` entry points. They are only declared
// here so the registrations below can refer to them; their definitions are
// not visible in this part of the file.
void sync_bn_forward_mean_impl(const Tensor input, Tensor mean);

void sync_bn_forward_var_impl(const Tensor input, const Tensor mean,
                              Tensor var);

void sync_bn_forward_output_impl(const Tensor input, const Tensor mean,
                                 const Tensor var, Tensor running_mean,
                                 Tensor running_var, const Tensor weight,
                                 const Tensor bias, Tensor norm, Tensor std,
                                 Tensor output, float eps, float momentum,
                                 int group_size);

void sync_bn_backward_param_impl(const Tensor grad_output, const Tensor norm,
                                 Tensor grad_weight, Tensor grad_bias);

void sync_bn_backward_data_impl(const Tensor grad_output, const Tensor weight,
                                const Tensor grad_weight,
                                const Tensor grad_bias, const Tensor norm,
                                const Tensor std, Tensor grad_input);

// Register the `*_cuda` wrappers for the CUDA device of each `*_impl` symbol.
REGISTER_DEVICE_IMPL(sync_bn_forward_mean_impl, CUDA,
                     sync_bn_forward_mean_cuda);
REGISTER_DEVICE_IMPL(sync_bn_forward_var_impl, CUDA, sync_bn_forward_var_cuda);
REGISTER_DEVICE_IMPL(sync_bn_forward_output_impl, CUDA,
                     sync_bn_forward_output_cuda);
REGISTER_DEVICE_IMPL(sync_bn_backward_param_impl, CUDA,
                     sync_bn_backward_param_cuda);
REGISTER_DEVICE_IMPL(sync_bn_backward_data_impl, CUDA,
                     sync_bn_backward_data_cuda);

// Launcher prototypes used by the three_interpolate_*_cuda wrappers that
// follow. Note the integer parameters are (b, c, m, n) for forward but
// (b, c, n, m) for backward. Definitions live outside this section
// (presumably in the CUDA kernel sources -- confirm).
void ThreeInterpolateForwardCUDAKernelLauncher(int b, int c, int m, int n,
                                               const Tensor points,
                                               const Tensor idx,
                                               const Tensor weight, Tensor out);

void ThreeInterpolateBackwardCUDAKernelLauncher(int b, int c, int n, int m,
                                                const Tensor grad_out,
                                                const Tensor idx,
                                                const Tensor weight,
                                                Tensor grad_points);

// CUDA wrapper for three_interpolate forward: relays every argument, in the
// same order, to ThreeInterpolateForwardCUDAKernelLauncher.
void three_interpolate_forward_cuda(int b, int c, int m, int n,
                                    const Tensor points, const Tensor idx,
                                    const Tensor weight, Tensor out) {
  ThreeInterpolateForwardCUDAKernelLauncher(
      b, c, m, n,
      points, idx, weight,
      out);
}

// CUDA wrapper for three_interpolate backward: relays every argument, in the
// same order, to ThreeInterpolateBackwardCUDAKernelLauncher.
void three_interpolate_backward_cuda(int b, int c, int n, int m,
                                     const Tensor grad_out, const Tensor idx,
                                     const Tensor weight, Tensor grad_points) {
  ThreeInterpolateBackwardCUDAKernelLauncher(
      b, c, n, m,
      grad_out, idx, weight,
      grad_points);
}

// Prototypes of the three_interpolate `*_impl` entry points. They are only
// declared here so the registrations below can refer to them; their
// definitions are not visible in this part of the file.
void three_interpolate_forward_impl(int b, int c, int m, int n,
                                    const Tensor points, const Tensor idx,
                                    const Tensor weight, Tensor out);

void three_interpolate_backward_impl(int b, int c, int n, int m,
                                     const Tensor grad_out, const Tensor idx,
                                     const Tensor weight, Tensor grad_points);
// Register the `*_cuda` wrappers for the CUDA device of each `*_impl` symbol.
REGISTER_DEVICE_IMPL(three_interpolate_forward_impl, CUDA,
                     three_interpolate_forward_cuda);
REGISTER_DEVICE_IMPL(three_interpolate_backward_impl, CUDA,
                     three_interpolate_backward_cuda);

// Launcher prototype used by three_nn_forward_cuda below. The definition
// lives outside this section (presumably in the CUDA kernel sources --
// confirm).
void ThreeNNForwardCUDAKernelLauncher(int b, int n, int m, const Tensor unknown,
                                      const Tensor known, Tensor dist2,
                                      Tensor idx);

// CUDA wrapper for three_nn forward: relays every argument, in the same
// order, to ThreeNNForwardCUDAKernelLauncher.
void three_nn_forward_cuda(int b, int n, int m, const Tensor unknown,
                           const Tensor known, Tensor dist2, Tensor idx) {
  ThreeNNForwardCUDAKernelLauncher(
      b, n, m,
      unknown, known,
      dist2, idx);
}

// Prototype of the three_nn `*_impl` entry point. It is only declared here so
// the registration below can refer to it; its definition is not visible in
// this part of the file.
void three_nn_forward_impl(int b, int n, int m, const Tensor unknown,
                           const Tensor known, Tensor dist2, Tensor idx);
// Register the `*_cuda` wrapper for the CUDA device of the `*_impl` symbol.
REGISTER_DEVICE_IMPL(three_nn_forward_impl, CUDA, three_nn_forward_cuda);

// Launcher prototypes used by the tin_shift_*_cuda wrappers that follow.
// Definitions live outside this section (presumably in the CUDA kernel
// sources -- confirm).
void TINShiftForwardCUDAKernelLauncher(Tensor input, Tensor shift,
                                       Tensor output);

void TINShiftBackwardCUDAKernelLauncher(Tensor grad_output, Tensor shift,
                                        Tensor grad_input);

// CUDA wrapper for tin_shift forward: relays every argument, in the same
// order, to TINShiftForwardCUDAKernelLauncher.
void tin_shift_forward_cuda(Tensor input, Tensor shift, Tensor output) {
  TINShiftForwardCUDAKernelLauncher(
      input,
      shift,
      output);
}

// CUDA wrapper for tin_shift backward: relays every argument, in the same
// order, to TINShiftBackwardCUDAKernelLauncher.
void tin_shift_backward_cuda(Tensor grad_output, Tensor shift,
                             Tensor grad_input) {
  TINShiftBackwardCUDAKernelLauncher(
      grad_output,
      shift,
      grad_input);
}

// Prototypes of the tin_shift `*_impl` entry points. They are only declared
// here so the registrations below can refer to them; their definitions are
// not visible in this part of the file.
void tin_shift_forward_impl(Tensor input, Tensor shift, Tensor output);
void tin_shift_backward_impl(Tensor grad_output, Tensor shift,
                             Tensor grad_input);
// Register the `*_cuda` wrappers for the CUDA device of each `*_impl` symbol.
REGISTER_DEVICE_IMPL(tin_shift_forward_impl, CUDA, tin_shift_forward_cuda);
REGISTER_DEVICE_IMPL(tin_shift_backward_impl, CUDA, tin_shift_backward_cuda);

// upfirdn2d has no separate `*_cuda` wrapper in this section: `upfirdn2d_op`
// itself (declared here, defined elsewhere) is registered directly as the
// CUDA implementation of `upfirdn2d_op_impl`.
torch::Tensor upfirdn2d_op(const torch::Tensor& input,
                           const torch::Tensor& kernel, int up_x, int up_y,
                           int down_x, int down_y, int pad_x0, int pad_x1,
                           int pad_y0, int pad_y1);

torch::Tensor upfirdn2d_op_impl(const torch::Tensor& input,
                                const torch::Tensor& kernel, int up_x, int up_y,
                                int down_x, int down_y, int pad_x0, int pad_x1,
                                int pad_y0, int pad_y1);
REGISTER_DEVICE_IMPL(upfirdn2d_op_impl, CUDA, upfirdn2d_op);

// Launcher prototypes used by the *_voxelize_forward_cuda wrappers that
// follow. `NDim` defaults to 3 in each prototype. Definitions live outside
// this section (presumably in the CUDA kernel sources -- confirm).
int HardVoxelizeForwardCUDAKernelLauncher(
    const at::Tensor& points, at::Tensor& voxels, at::Tensor& coors,
    at::Tensor& num_points_per_voxel, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const int max_points,
    const int max_voxels, const int NDim = 3);

int NondeterministicHardVoxelizeForwardCUDAKernelLauncher(
    const at::Tensor& points, at::Tensor& voxels, at::Tensor& coors,
    at::Tensor& num_points_per_voxel, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const int max_points,
    const int max_voxels, const int NDim = 3);

void DynamicVoxelizeForwardCUDAKernelLauncher(
    const at::Tensor& points, at::Tensor& coors,
    const std::vector<float> voxel_size, const std::vector<float> coors_range,
    const int NDim = 3);

// CUDA wrapper for hard_voxelize forward: calls
// HardVoxelizeForwardCUDAKernelLauncher with the same arguments, in the same
// order, and returns the launcher's int result unchanged.
int hard_voxelize_forward_cuda(const at::Tensor& points, at::Tensor& voxels,
                               at::Tensor& coors,
                               at::Tensor& num_points_per_voxel,
                               const std::vector<float> voxel_size,
                               const std::vector<float> coors_range,
                               const int max_points, const int max_voxels,
                               const int NDim) {
  const int launcher_result = HardVoxelizeForwardCUDAKernelLauncher(
      points, voxels, coors, num_points_per_voxel,
      voxel_size, coors_range,
      max_points, max_voxels, NDim);
  return launcher_result;
}

// CUDA wrapper for nondeterministic_hard_voxelize forward: calls
// NondeterministicHardVoxelizeForwardCUDAKernelLauncher with the same
// arguments, in the same order, and returns the launcher's int result
// unchanged.
int nondeterministic_hard_voxelize_forward_cuda(
    const at::Tensor& points, at::Tensor& voxels, at::Tensor& coors,
    at::Tensor& num_points_per_voxel, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const int max_points,
    const int max_voxels, const int NDim) {
  const int launcher_result =
      NondeterministicHardVoxelizeForwardCUDAKernelLauncher(
          points, voxels, coors, num_points_per_voxel,
          voxel_size, coors_range,
          max_points, max_voxels, NDim);
  return launcher_result;
}

// CUDA wrapper for dynamic_voxelize forward: relays every argument, in the
// same order, to DynamicVoxelizeForwardCUDAKernelLauncher.
void dynamic_voxelize_forward_cuda(const at::Tensor& points, at::Tensor& coors,
                                   const std::vector<float> voxel_size,
                                   const std::vector<float> coors_range,
                                   const int NDim) {
  DynamicVoxelizeForwardCUDAKernelLauncher(
      points, coors,
      voxel_size, coors_range,
      NDim);
}

// Prototypes of the voxelize `*_impl` entry points. They are only declared
// here so the registrations below can refer to them; their definitions are
// not visible in this part of the file.
int hard_voxelize_forward_impl(const at::Tensor& points, at::Tensor& voxels,
                               at::Tensor& coors,
                               at::Tensor& num_points_per_voxel,
                               const std::vector<float> voxel_size,
                               const std::vector<float> coors_range,
                               const int max_points, const int max_voxels,
                               const int NDim);

int nondeterministic_hard_voxelize_forward_impl(
    const at::Tensor& points, at::Tensor& voxels, at::Tensor& coors,
    at::Tensor& num_points_per_voxel, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const int max_points,
    const int max_voxels, const int NDim);

void dynamic_voxelize_forward_impl(const at::Tensor& points, at::Tensor& coors,
                                   const std::vector<float> voxel_size,
                                   const std::vector<float> coors_range,
                                   const int NDim);

// Register the `*_cuda` wrappers for the CUDA device of each `*_impl` symbol.
REGISTER_DEVICE_IMPL(hard_voxelize_forward_impl, CUDA,
                     hard_voxelize_forward_cuda);
REGISTER_DEVICE_IMPL(nondeterministic_hard_voxelize_forward_impl, CUDA,
                     nondeterministic_hard_voxelize_forward_cuda);
REGISTER_DEVICE_IMPL(dynamic_voxelize_forward_impl, CUDA,
                     dynamic_voxelize_forward_cuda);

// Launcher prototypes used by the rotated_feature_align_*_cuda wrappers that
// follow. Definitions live outside this section (presumably in the CUDA
// kernel sources -- confirm).
void RotatedFeatureAlignForwardCUDAKernelLauncher(const Tensor features,
                                                  const Tensor best_bboxes,
                                                  const float spatial_scale,
                                                  const int points,
                                                  Tensor output);

void RotatedFeatureAlignBackwardCUDAKernelLauncher(const Tensor top_grad,
                                                   const Tensor best_bboxes,
                                                   const float spatial_scale,
                                                   const int points,
                                                   Tensor bottom_grad);

// CUDA wrapper for rotated_feature_align forward: relays every argument, in
// the same order, to RotatedFeatureAlignForwardCUDAKernelLauncher.
void rotated_feature_align_forward_cuda(const Tensor features,
                                        const Tensor best_bboxes,
                                        const float spatial_scale,
                                        const int points, Tensor output) {
  RotatedFeatureAlignForwardCUDAKernelLauncher(
      features, best_bboxes,
      spatial_scale, points,
      output);
}

// CUDA wrapper for rotated_feature_align backward: relays every argument, in
// the same order, to RotatedFeatureAlignBackwardCUDAKernelLauncher.
void rotated_feature_align_backward_cuda(const Tensor top_grad,
                                         const Tensor best_bboxes,
                                         const float spatial_scale,
                                         const int points, Tensor bottom_grad) {
  RotatedFeatureAlignBackwardCUDAKernelLauncher(
      top_grad, best_bboxes,
      spatial_scale, points,
      bottom_grad);
}

// Prototypes of the rotated_feature_align `*_impl` entry points. They are
// only declared here so the registrations below can refer to them; their
// definitions are not visible in this part of the file.
void rotated_feature_align_forward_impl(const Tensor features,
                                        const Tensor best_bboxes,
                                        const float spatial_scale,
                                        const int points, Tensor output);

void rotated_feature_align_backward_impl(const Tensor top_grad,
                                         const Tensor best_bboxes,
                                         const float spatial_scale,
                                         const int points, Tensor bottom_grad);

// Register the `*_cuda` wrappers for the CUDA device of each `*_impl` symbol.
REGISTER_DEVICE_IMPL(rotated_feature_align_forward_impl, CUDA,
                     rotated_feature_align_forward_cuda);
REGISTER_DEVICE_IMPL(rotated_feature_align_backward_impl, CUDA,
                     rotated_feature_align_backward_cuda);

// Launcher prototype used by points_in_polygons_forward_cuda below. Unlike
// that wrapper, the launcher takes `output` last, after `rows` and `cols`.
// The definition lives outside this section (presumably in the CUDA kernel
// sources -- confirm).
void PointsInPolygonsForwardCUDAKernelLauncher(const at::Tensor points,
                                               const at::Tensor polygons,
                                               const int rows, const int cols,
                                               at::Tensor output);

// CUDA wrapper for points_in_polygons forward. Calls
// PointsInPolygonsForwardCUDAKernelLauncher, which expects `output` as its
// last argument, so the arguments are reordered relative to this signature.
void points_in_polygons_forward_cuda(const Tensor points, const Tensor polygons,
                                     Tensor output, const int rows,
                                     const int cols) {
  PointsInPolygonsForwardCUDAKernelLauncher(
      points, polygons,
      rows, cols,
      output);
}

// Prototype of the points_in_polygons `*_impl` entry point. It is only
// declared here so the registration below can refer to it; its definition is
// not visible in this part of the file.
void points_in_polygons_forward_impl(const Tensor points, const Tensor polygons,
                                     Tensor output, const int rows,
                                     const int cols);

// Register the `*_cuda` wrapper for the CUDA device of the `*_impl` symbol.
REGISTER_DEVICE_IMPL(points_in_polygons_forward_impl, CUDA,
                     points_in_polygons_forward_cuda);

// Launcher prototype used by min_area_polygons_cuda below. The definition
// lives outside this section (presumably in the CUDA kernel sources --
// confirm).
void MinAreaPolygonsCUDAKernelLauncher(const Tensor pointsets, Tensor polygons);

// CUDA wrapper for min_area_polygons: relays both arguments, in the same
// order, to MinAreaPolygonsCUDAKernelLauncher.
void min_area_polygons_cuda(const Tensor pointsets, Tensor polygons) {
  MinAreaPolygonsCUDAKernelLauncher(
      pointsets,
      polygons);
}

// Prototype of the min_area_polygons `*_impl` entry point. It is only
// declared here so the registration below can refer to it; its definition is
// not visible in this part of the file.
void min_area_polygons_impl(const Tensor pointsets, Tensor polygons);

// Register the `*_cuda` wrapper for the CUDA device of the `*_impl` symbol.
REGISTER_DEVICE_IMPL(min_area_polygons_impl, CUDA, min_area_polygons_cuda);

// Launcher prototypes used by the active_rotated_filter_*_cuda wrappers that
// follow. Definitions live outside this section (presumably in the CUDA
// kernel sources -- confirm).
void ActiveRotatedFilterForwardCUDAKernelLauncher(const Tensor input,
                                                  const Tensor indices,
                                                  Tensor output);

void ActiveRotatedFilterBackwardCUDAKernelLauncher(const Tensor grad_out,
                                                   const Tensor indices,
                                                   Tensor grad_in);

// CUDA wrapper for active_rotated_filter forward: relays every argument, in
// the same order, to ActiveRotatedFilterForwardCUDAKernelLauncher.
void active_rotated_filter_forward_cuda(const Tensor input,
                                        const Tensor indices, Tensor output) {
  ActiveRotatedFilterForwardCUDAKernelLauncher(
      input,
      indices,
      output);
}

// CUDA wrapper for active_rotated_filter backward: relays every argument, in
// the same order, to ActiveRotatedFilterBackwardCUDAKernelLauncher.
void active_rotated_filter_backward_cuda(const Tensor grad_out,
                                         const Tensor indices, Tensor grad_in) {
  ActiveRotatedFilterBackwardCUDAKernelLauncher(
      grad_out,
      indices,
      grad_in);
}

// Prototypes of the active_rotated_filter `*_impl` entry points. They are
// only declared here so the registrations below can refer to them; their
// definitions are not visible in this part of the file.
void active_rotated_filter_forward_impl(const Tensor input,
                                        const Tensor indices, Tensor output);

void active_rotated_filter_backward_impl(const Tensor grad_out,
                                         const Tensor indices, Tensor grad_in);

// Register the `*_cuda` wrappers for the CUDA device of each `*_impl` symbol.
REGISTER_DEVICE_IMPL(active_rotated_filter_forward_impl, CUDA,
                     active_rotated_filter_forward_cuda);
REGISTER_DEVICE_IMPL(active_rotated_filter_backward_impl, CUDA,
                     active_rotated_filter_backward_cuda);

// Launcher prototypes used by convex_iou_cuda and convex_giou_cuda below.
// Definitions live outside this section (presumably in the CUDA kernel
// sources -- confirm).
void ConvexIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,
                                 Tensor ious);

void ConvexGIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,
                                  Tensor output);

// CUDA wrapper for convex_iou: relays every argument, in the same order, to
// ConvexIoUCUDAKernelLauncher.
void convex_iou_cuda(const Tensor pointsets, const Tensor polygons,
                     Tensor ious) {
  ConvexIoUCUDAKernelLauncher(
      pointsets,
      polygons,
      ious);
}

// CUDA wrapper for convex_giou: relays every argument, in the same order, to
// ConvexGIoUCUDAKernelLauncher.
void convex_giou_cuda(const Tensor pointsets, const Tensor polygons,
                      Tensor output) {
  ConvexGIoUCUDAKernelLauncher(
      pointsets,
      polygons,
      output);
}

// Prototypes of the convex IoU / GIoU `*_impl` entry points. They are only
// declared here so the registrations below can refer to them; their
// definitions are not visible in this part of the file.
void convex_iou_impl(const Tensor pointsets, const Tensor polygons,
                     Tensor ious);

void convex_giou_impl(const Tensor pointsets, const Tensor polygons,
                      Tensor output);

// Register the `*_cuda` wrappers for the CUDA device of each `*_impl` symbol.
REGISTER_DEVICE_IMPL(convex_iou_impl, CUDA, convex_iou_cuda);
REGISTER_DEVICE_IMPL(convex_giou_impl, CUDA, convex_giou_cuda);

// Launcher prototype used by diff_iou_rotated_sort_vertices_forward_cuda
// below; unlike most launchers here it returns a Tensor. The definition lives
// outside this section (presumably in the CUDA kernel sources -- confirm).
Tensor DiffIoURotatedSortVerticesCUDAKernelLauncher(Tensor vertices,
                                                    Tensor mask,
                                                    Tensor num_valid);

// CUDA wrapper for diff_iou_rotated_sort_vertices forward: calls
// DiffIoURotatedSortVerticesCUDAKernelLauncher with the same arguments and
// returns the Tensor it produces.
Tensor diff_iou_rotated_sort_vertices_forward_cuda(Tensor vertices, Tensor mask,
                                                   Tensor num_valid) {
  Tensor launcher_result =
      DiffIoURotatedSortVerticesCUDAKernelLauncher(vertices, mask, num_valid);
  return launcher_result;
}

// Prototype of the diff_iou_rotated_sort_vertices `*_impl` entry point. It is
// only declared here so the registration below can refer to it; its
// definition is not visible in this part of the file.
Tensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,
                                                   Tensor num_valid);

// Register the `*_cuda` wrapper for the CUDA device of the `*_impl` symbol.
REGISTER_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl, CUDA,
                     diff_iou_rotated_sort_vertices_forward_cuda);

// Launcher prototypes used by the chamfer_distance_*_cuda wrappers that
// follow. Definitions live outside this section (presumably in the CUDA
// kernel sources -- confirm).
void ChamferDistanceForwardCUDAKernelLauncher(
    const Tensor xyz1, const Tensor xyz2, const Tensor dist1,
    const Tensor dist2, const Tensor idx1, const Tensor idx2);

void ChamferDistanceBackwardCUDAKernelLauncher(
    const Tensor xyz1, const Tensor xyz2, Tensor idx1, Tensor idx2,
    Tensor grad_dist1, Tensor grad_dist2, Tensor grad_xyz1, Tensor grad_xyz2);

// CUDA wrapper for chamfer_distance forward: relays every argument, in the
// same order, to ChamferDistanceForwardCUDAKernelLauncher.
void chamfer_distance_forward_cuda(const Tensor xyz1, const Tensor xyz2,
                                   const Tensor dist1, const Tensor dist2,
                                   const Tensor idx1, const Tensor idx2) {
  ChamferDistanceForwardCUDAKernelLauncher(
      xyz1, xyz2,
      dist1, dist2,
      idx1, idx2);
}

// CUDA wrapper for chamfer_distance backward: relays every argument, in the
// same order, to ChamferDistanceBackwardCUDAKernelLauncher.
void chamfer_distance_backward_cuda(const Tensor xyz1, const Tensor xyz2,
                                    Tensor idx1, Tensor idx2, Tensor graddist1,
                                    Tensor graddist2, Tensor gradxyz1,
                                    Tensor gradxyz2) {
  ChamferDistanceBackwardCUDAKernelLauncher(
      xyz1, xyz2,
      idx1, idx2,
      graddist1, graddist2,
      gradxyz1, gradxyz2);
}

// Forward declarations of the chamfer-distance entry points that the
// registrations below are keyed on; they are defined outside this file.
void chamfer_distance_forward_impl(const Tensor xyz1, const Tensor xyz2,
                                   const Tensor dist1, const Tensor dist2,
                                   const Tensor idx1, const Tensor idx2);

void chamfer_distance_backward_impl(const Tensor xyz1, const Tensor xyz2,
                                    Tensor idx1, Tensor idx2, Tensor graddist1,
                                    Tensor graddist2, Tensor gradxyz1,
                                    Tensor gradxyz2);

// Associate the CUDA wrappers with the entry points above for the CUDA device
// key (NOTE(review): REGISTER_DEVICE_IMPL is defined outside this view).
REGISTER_DEVICE_IMPL(chamfer_distance_forward_impl, CUDA,
                     chamfer_distance_forward_cuda);
REGISTER_DEVICE_IMPL(chamfer_distance_backward_impl, CUDA,
                     chamfer_distance_backward_cuda);

// Forward declarations of the PrROIPool kernel launchers. They are defined
// outside this file and are only called by the prroi_pool_*_cuda wrappers
// below.

// Forward launcher.
void PrROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
                                        Tensor output, int pooled_height,
                                        int pooled_width, float spatial_scale);

// Backward launcher taking `grad_output` and `grad_input`.
void PrROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
                                         Tensor grad_input, int pooled_height,
                                         int pooled_width, float spatial_scale);

// Backward launcher taking `grad_rois` (the "coor" variant).
void PrROIPoolCoorBackwardCUDAKernelLauncher(
    Tensor output, Tensor grad_output, Tensor input, Tensor rois,
    Tensor grad_rois, int pooled_height, int pooled_width, float spatial_scale);

// CUDA wrapper for the PrROIPool forward pass: forwards all arguments, in
// order, to PrROIPoolForwardCUDAKernelLauncher.
void prroi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,
                             int pooled_height, int pooled_width,
                             float spatial_scale) {
  PrROIPoolForwardCUDAKernelLauncher(
      input, rois, output,          // tensors
      pooled_height, pooled_width,  // pooled size
      spatial_scale);
}

// CUDA wrapper for the PrROIPool backward pass: forwards all arguments, in
// order, to PrROIPoolBackwardCUDAKernelLauncher.
void prroi_pool_backward_cuda(Tensor grad_output, Tensor rois,
                              Tensor grad_input, int pooled_height,
                              int pooled_width, float spatial_scale) {
  PrROIPoolBackwardCUDAKernelLauncher(
      grad_output, rois, grad_input,  // tensors
      pooled_height, pooled_width,    // pooled size
      spatial_scale);
}

// CUDA wrapper for the PrROIPool "coor" backward pass (the variant that takes
// `grad_rois`): forwards all arguments, in order, to
// PrROIPoolCoorBackwardCUDAKernelLauncher.
void prroi_pool_coor_backward_cuda(Tensor output, Tensor grad_output,
                                   Tensor input, Tensor rois, Tensor grad_rois,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale) {
  PrROIPoolCoorBackwardCUDAKernelLauncher(
      output, grad_output, input, rois, grad_rois,  // tensors
      pooled_height, pooled_width,                  // pooled size
      spatial_scale);
}

// Forward declarations of the PrROIPool entry points that the registrations
// below are keyed on; they are defined outside this file.
void prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
                             int pooled_height, int pooled_width,
                             float spatial_scale);
void prroi_pool_backward_impl(Tensor grad_output, Tensor rois,
                              Tensor grad_input, int pooled_height,
                              int pooled_width, float spatial_scale);
void prroi_pool_coor_backward_impl(Tensor output, Tensor grad_output,
                                   Tensor input, Tensor rois, Tensor grad_rois,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale);
// Associate the CUDA wrappers with the entry points above for the CUDA device
// key (NOTE(review): REGISTER_DEVICE_IMPL is defined outside this view).
REGISTER_DEVICE_IMPL(prroi_pool_forward_impl, CUDA, prroi_pool_forward_cuda);
REGISTER_DEVICE_IMPL(prroi_pool_backward_impl, CUDA, prroi_pool_backward_cuda);
REGISTER_DEVICE_IMPL(prroi_pool_coor_backward_impl, CUDA,
                     prroi_pool_coor_backward_cuda);


================================================
FILE: mmcv/ops/csrc/parrots/deform_conv.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub for deformable im2col: hands all arguments, in declaration
// order, to DISPATCH_DEVICE_IMPL under the key `deformable_im2col_impl`.
// Called by deform_conv_forward and deform_conv_backward_parameters with
// `data_col` as the column workspace.
void deformable_im2col_impl(Tensor data_im, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor data_col) {
  DISPATCH_DEVICE_IMPL(deformable_im2col_impl,
                       data_im, data_offset,
                       channels, height, width,
                       ksize_h, ksize_w,
                       pad_h, pad_w,
                       stride_h, stride_w,
                       dilation_h, dilation_w,
                       parallel_imgs, deformable_group,
                       data_col);
}

// Dispatch stub for deformable col2im: hands all arguments, in declaration
// order, to DISPATCH_DEVICE_IMPL under the key `deformable_col2im_impl`.
// Called by deform_conv_backward_input with a slice of gradInput as `grad_im`.
void deformable_col2im_impl(Tensor data_col, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor grad_im) {
  DISPATCH_DEVICE_IMPL(deformable_col2im_impl,
                       data_col, data_offset,
                       channels, height, width,
                       ksize_h, ksize_w,
                       pad_h, pad_w,
                       stride_h, stride_w,
                       dilation_h, dilation_w,
                       parallel_imgs, deformable_group,
                       grad_im);
}

// Dispatch stub for the offset-gradient variant of col2im: hands all
// arguments, in declaration order, to DISPATCH_DEVICE_IMPL under the key
// `deformable_col2im_coord_impl`. Called by deform_conv_backward_input with a
// slice of gradOffset as `grad_offset`.
void deformable_col2im_coord_impl(
    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
    const int height, const int width, const int ksize_h, const int ksize_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int parallel_imgs,
    const int deformable_group, Tensor grad_offset) {
  DISPATCH_DEVICE_IMPL(deformable_col2im_coord_impl,
                       data_col, data_im, data_offset,
                       channels, height, width,
                       ksize_h, ksize_w,
                       pad_h, pad_w,
                       stride_h, stride_w,
                       dilation_h, dilation_w,
                       parallel_imgs, deformable_group,
                       grad_offset);
}

// Validates the arguments of a deformable convolution and raises (through
// TORCH_CHECK) on the first violated constraint.
//
// Arguments:
//   input            - 3D (C, H, W) or 4D (N, C, H, W) input tensor.
//   offset           - offset tensor; must have the same rank as `input`,
//                      deformable_group * 2 * kH * kW channels and the spatial
//                      size of the convolution output.
//   gradOutput       - optional (may be NULL); when given, its channel count
//                      and spatial size must match the convolution output.
//   weight           - contiguous 4D weight tensor whose last two dimensions
//                      equal (kH, kW).
//   kH, kW           - kernel size, > 0.
//   dH, dW           - stride, > 0.
//   padH, padW       - padding used in the output-size formula.
//   dilationH/W      - dilation, > 0.
//   group            - weight.size(1) * group is the expected number of input
//                      channels.
//   deformable_group - must be > 0 and divide the number of input channels.
//
// Messages are built by passing the values as separate TORCH_CHECK arguments:
// the standard PyTorch macro concatenates its arguments and does not expand
// printf-style conversions such as "%d".
void deform_conv_shape_check(at::Tensor input, at::Tensor offset,
                             at::Tensor *gradOutput, at::Tensor weight, int kH,
                             int kW, int dH, int dW, int padH, int padW,
                             int dilationH, int dilationW, int group,
                             int deformable_group) {
  TORCH_CHECK(
      weight.ndimension() == 4,
      "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, but got: ",
      weight.ndimension());

  TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");

  TORCH_CHECK(kW > 0 && kH > 0,
              "kernel size should be greater than zero, but got kH: ", kH,
              " kW: ", kW);

  TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW),
              "kernel size should be consistent with weight, but got kH: ", kH,
              " kW: ", kW, " weight.size(2): ", weight.size(2),
              ", weight.size(3): ", weight.size(3));

  TORCH_CHECK(dW > 0 && dH > 0,
              "stride should be greater than zero, but got dH: ", dH,
              " dW: ", dW);

  TORCH_CHECK(dilationW > 0 && dilationH > 0,
              "dilation should be greater than 0, but got dilationH: ",
              dilationH, " dilationW: ", dilationW);

  int ndim = input.ndimension();
  TORCH_CHECK(ndim == 3 || ndim == 4,
              "3D or 4D input tensor expected but got: ", ndim);

  // A leading batch dimension (4D case) shifts the channel/height/width axes
  // by one.
  const int batchDims = (ndim == 4) ? 1 : 0;
  const int dimf = batchDims;
  const int dimh = batchDims + 1;
  const int dimw = batchDims + 2;

  long nInputPlane = weight.size(1) * group;
  long inputHeight = input.size(dimh);
  long inputWidth = input.size(dimw);
  long nOutputPlane = weight.size(0);
  long outputHeight =
      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
  long outputWidth =
      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;

  // Guard the modulo below against a zero (or negative) divisor.
  TORCH_CHECK(deformable_group > 0,
              "deformable_group should be greater than zero, but got: ",
              deformable_group);

  TORCH_CHECK(nInputPlane % deformable_group == 0,
              "input channels must be divisible by deformable group size");

  TORCH_CHECK(outputWidth >= 1 && outputHeight >= 1, "Given input size: (",
              nInputPlane, " x ", inputHeight, " x ", inputWidth,
              "). Calculated output size: (", nOutputPlane, " x ",
              outputHeight, " x ", outputWidth,
              "). Output size is too small");

  // Use the channel axis (dimf) rather than a fixed index so that unbatched
  // 3D inputs are checked against their channel count, not their height.
  TORCH_CHECK(input.size(dimf) == nInputPlane,
              "invalid number of input planes, expected: ", nInputPlane,
              ", but got: ", input.size(dimf));

  TORCH_CHECK((inputHeight >= kH && inputWidth >= kW),
              "input image is smaller than kernel");

  // The callers add a batch dimension to `offset` exactly when they add one to
  // `input`, so both tensors are expected to share the same rank.
  TORCH_CHECK(offset.ndimension() == ndim,
              "offset must have the same number of dimensions as input, "
              "expected: ",
              ndim, ", but got: ", offset.ndimension());

  TORCH_CHECK(
      (offset.size(dimh) == outputHeight && offset.size(dimw) == outputWidth),
      "invalid spatial size of offset, expected height: ", outputHeight,
      " width: ", outputWidth, ", but got height: ", offset.size(dimh),
      " width: ", offset.size(dimw));

  TORCH_CHECK((offset.size(dimf) == deformable_group * 2 * kH * kW),
              "invalid number of channels of offset");

  if (gradOutput != NULL) {
    TORCH_CHECK(gradOutput->size(dimf) == nOutputPlane,
                "invalid number of gradOutput planes, expected: ",
                nOutputPlane, ", but got: ", gradOutput->size(dimf));

    TORCH_CHECK((gradOutput->size(dimh) == outputHeight &&
                 gradOutput->size(dimw) == outputWidth),
                "invalid size of gradOutput, expected height: ", outputHeight,
                " width: ", outputWidth,
                " , but got height: ", gradOutput->size(dimh),
                " width: ", gradOutput->size(dimw));
  }
}

// Forward pass of deformable convolution, shared by the CPU and CUDA paths.
//
// The batch is processed in chunks of `im2col_step` samples. Each chunk is
// unfolded by deformable_im2col_impl into a column workspace, multiplied with
// the weight one group at a time (addmm_), and the accumulated result is
// copied into `output`.
//
// Arguments:
//   input        - 3D (unbatched) or 4D input tensor.
//   weight       - 4D weight tensor.
//   offset       - offset tensor with the same rank as `input`.
//   output       - destination; written through copy_ on a view of it.
//   columns,ones - only device/contiguity-checked. The function allocates its
//                  own column workspace and never reads `ones`, so the
//                  caller's buffers are left untouched.
//   kW .. dilationH, group, deformable_group - convolution hyper-parameters
//                  (validated by deform_conv_shape_check).
//   im2col_step  - chunk size; must be positive and divide the batch size.
//
// Raises through TORCH_CHECK / AT_ERROR when a check fails or when a CUDA
// tensor is passed to a build without MMCV_WITH_CUDA.
void deform_conv_forward(Tensor input, Tensor weight, Tensor offset,
                         Tensor output, Tensor columns, Tensor ones, int kW,
                         int kH, int dW, int dH, int padW, int padH,
                         int dilationW, int dilationH, int group,
                         int deformable_group, int im2col_step) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(offset);
    CHECK_CUDA_INPUT(weight);
    CHECK_CUDA_INPUT(output);
    CHECK_CUDA_INPUT(columns);
    CHECK_CUDA_INPUT(ones);
#else
    AT_ERROR("DeformConv is not compiled with GPU support");
#endif
  } else {
    CHECK_CPU_INPUT(input);
    CHECK_CPU_INPUT(offset);
    CHECK_CPU_INPUT(weight);
    CHECK_CPU_INPUT(output);
    CHECK_CPU_INPUT(columns);
    CHECK_CPU_INPUT(ones);
  }

  deform_conv_shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH,
                          padW, dilationH, dilationW, group, deformable_group);
  at::DeviceGuard guard(input.device());

  if (input.ndimension() == 3) {
    // Force a batch dimension of size 1. view() returns a new tensor handle,
    // so unlike the in-place unsqueeze_() it leaves the shape of the caller's
    // tensors alone (the by-value arguments share state with them).
    input = input.view({1, input.size(0), input.size(1), input.size(2)});
    offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
  }

  long batchSize = input.size(0);
  long nInputPlane = input.size(1);
  long inputHeight = input.size(2);
  long inputWidth = input.size(3);

  long nOutputPlane = weight.size(0);

  long outputWidth =
      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
  long outputHeight =
      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;

  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");

  // The batch is split into batchSize / im2col_step chunks below; reject
  // values that would divide by zero or leave a partial chunk.
  TORCH_CHECK(im2col_step > 0,
              "im2col_step should be greater than zero, but got: ",
              im2col_step);
  TORCH_CHECK(batchSize % im2col_step == 0, "batch size (", batchSize,
              ") must be divisible by im2col_step (", im2col_step, ")");
  const long numChunks = batchSize / im2col_step;

  output = output.view(
      {numChunks, im2col_step, nOutputPlane, outputHeight, outputWidth});
  columns = at::zeros(
      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
      input.options());

  if (ones.ndimension() != 2 ||
      ones.size(0) * ones.size(1) < outputHeight * outputWidth) {
    ones = at::ones({outputHeight, outputWidth}, input.options());
  }

  input = input.view(
      {numChunks, im2col_step, nInputPlane, inputHeight, inputWidth});
  offset = offset.view({numChunks, im2col_step, deformable_group * 2 * kH * kW,
                        outputHeight, outputWidth});

  Tensor output_buffer = at::zeros(
      {numChunks, nOutputPlane, im2col_step * outputHeight, outputWidth},
      output.options());

  // Split the output channels into groups for the per-group matmul.
  output_buffer = output_buffer.view(
      {output_buffer.size(0), group, output_buffer.size(1) / group,
       output_buffer.size(2), output_buffer.size(3)});

  for (int elt = 0; elt < numChunks; elt++) {
    deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,
                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
                           dilationW, im2col_step, deformable_group, columns);

    // Regroup the workspace and the weight, multiply per group, then restore
    // the flat layouts for the next chunk.
    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
    weight = weight.view({group, weight.size(0) / group, weight.size(1),
                          weight.size(2), weight.size(3)});

    for (int g = 0; g < group; g++) {
      output_buffer[elt][g] = output_buffer[elt][g]
                                  .flatten(1)
                                  .addmm_(weight[g].flatten(1), columns[g])
                                  .view_as(output_buffer[elt][g]);
    }
    columns =
        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
                          weight.size(3), weight.size(4)});
  }

  // Merge the group dimension back into the channel dimension.
  output_buffer = output_buffer.view(
      {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2),
       output_buffer.size(3), output_buffer.size(4)});

  // Reorder (chunk, channel, step, H, W) -> (chunk, step, channel, H, W) so
  // the buffer lines up with the view of `output`, then write the result.
  output_buffer = output_buffer.view(
      {numChunks, nOutputPlane, im2col_step, outputHeight, outputWidth});
  output_buffer.transpose_(1, 2);
  output.copy_(output_buffer);

  // No shape restoration is needed: every reshape above only rebound the
  // local handles, and the data was written through the shared storage.
}

// Backward pass of deformable convolution w.r.t. the input and the offsets,
// shared by the CPU and CUDA paths.
//
// The batch is processed in chunks of `im2col_step` samples. For each chunk
// the column workspace is filled with weight^T * gradOutput (per group), then
// deformable_col2im_coord_impl writes the matching slice of `gradOffset` and
// deformable_col2im_impl writes the matching slice of `gradInput`.
//
// Arguments:
//   input, offset - forward inputs; 3D (unbatched) or 4D.
//   gradOutput    - gradient of the forward output.
//   gradInput     - destination for the input gradient (slices of a view of
//                   it are passed to deformable_col2im_impl).
//   gradOffset    - destination for the offset gradient (slices of a view of
//                   it are passed to deformable_col2im_coord_impl).
//   weight        - 4D weight tensor.
//   columns       - only device/contiguity-checked; the function allocates
//                   its own workspace.
//   kW .. dilationH, group, deformable_group - convolution hyper-parameters
//                   (validated by deform_conv_shape_check).
//   im2col_step   - chunk size; must be positive and divide the batch size.
//
// Raises through TORCH_CHECK / AT_ERROR when a check fails or when a CUDA
// tensor is passed to a build without MMCV_WITH_CUDA.
void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,
                                Tensor gradInput, Tensor gradOffset,
                                Tensor weight, Tensor columns, int kW, int kH,
                                int dW, int dH, int padW, int padH,
                                int dilationW, int dilationH, int group,
                                int deformable_group, int im2col_step) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(offset);
    CHECK_CUDA_INPUT(gradOutput);
    CHECK_CUDA_INPUT(gradInput);
    CHECK_CUDA_INPUT(gradOffset);
    CHECK_CUDA_INPUT(weight);
    CHECK_CUDA_INPUT(columns);
#else
    AT_ERROR("DeformConv is not compiled with GPU support");
#endif
  } else {
    CHECK_CPU_INPUT(input);
    CHECK_CPU_INPUT(offset);
    CHECK_CPU_INPUT(gradOutput);
    CHECK_CPU_INPUT(gradInput);
    CHECK_CPU_INPUT(gradOffset);
    CHECK_CPU_INPUT(weight);
    CHECK_CPU_INPUT(columns);
  }
  deform_conv_shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW,
                          padH, padW, dilationH, dilationW, group,
                          deformable_group);

  at::DeviceGuard guard(input.device());

  if (input.ndimension() == 3) {
    // Force a batch dimension of size 1 on the unbatched tensors.
    input = input.view({1, input.size(0), input.size(1), input.size(2)});
    offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
    gradOutput = gradOutput.view(
        {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
  }

  long batchSize = input.size(0);
  long nInputPlane = input.size(1);
  long inputHeight = input.size(2);
  long inputWidth = input.size(3);

  long nOutputPlane = weight.size(0);

  long outputWidth =
      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
  long outputHeight =
      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;

  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");

  // The batch is split into batchSize / im2col_step chunks below; reject
  // values that would divide by zero or leave a partial chunk.
  TORCH_CHECK(im2col_step > 0,
              "im2col_step should be greater than zero, but got: ",
              im2col_step);
  TORCH_CHECK(batchSize % im2col_step == 0, "batch size (", batchSize,
              ") must be divisible by im2col_step (", im2col_step, ")");
  const long numChunks = batchSize / im2col_step;

  gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
  columns = at::zeros(
      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
      input.options());

  // change order of grad output:
  // (chunk, step, channel, H, W) -> (chunk, channel, step, H, W)
  gradOutput = gradOutput.view(
      {numChunks, im2col_step, nOutputPlane, outputHeight, outputWidth});
  gradOutput.transpose_(1, 2);

  gradInput = gradInput.view(
      {numChunks, im2col_step, nInputPlane, inputHeight, inputWidth});
  input = input.view(
      {numChunks, im2col_step, nInputPlane, inputHeight, inputWidth});
  gradOffset =
      gradOffset.view({numChunks, im2col_step, deformable_group * 2 * kH * kW,
                       outputHeight, outputWidth});
  offset = offset.view({numChunks, im2col_step, deformable_group * 2 * kH * kW,
                        outputHeight, outputWidth});

  for (int elt = 0; elt < numChunks; elt++) {
    // divide into groups
    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
    weight = weight.view({group, weight.size(0) / group, weight.size(1),
                          weight.size(2), weight.size(3)});
    gradOutput = gradOutput.view(
        {gradOutput.size(0), group, gradOutput.size(1) / group,
         gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)});

    for (int g = 0; g < group; g++) {
      // beta = 0 discards the previous workspace contents, so each chunk
      // starts from weight^T * gradOutput only.
      columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
                                     gradOutput[elt][g].flatten(1), 0.0f, 1.0f);
    }

    // Restore the flat layouts expected by the col2im kernels and by the next
    // iteration.
    columns =
        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
    gradOutput = gradOutput.view(
        {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),
         gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});

    deformable_col2im_coord_impl(columns, input[elt], offset[elt], nInputPlane,
                                 inputHeight, inputWidth, kH, kW, padH, padW,
                                 dH, dW, dilationH, dilationW, im2col_step,
                                 deformable_group, gradOffset[elt]);

    deformable_col2im_impl(columns, offset[elt], nInputPlane, inputHeight,
                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
                           dilationW, im2col_step, deformable_group,
                           gradInput[elt]);

    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
                          weight.size(3), weight.size(4)});
  }

  // No shape restoration is needed: every reshape above only rebound the
  // local handles (the in-place transpose_ was applied to a local view), and
  // the gradients were written through storage shared with the caller.
}

// Backward pass of deformable convolution w.r.t. the weight, shared by the
// CPU and CUDA paths.
//
// The batch is processed in chunks of `im2col_step` samples. Each chunk is
// unfolded by deformable_im2col_impl, and gradWeight is updated per group as
//   gradWeight[g] += scale * gradOutput_chunk[g] * columns[g]^T
// (addmm_ with beta = 1.0 and alpha = scale).
//
// Arguments:
//   input, offset - forward inputs; 3D (unbatched) or 4D.
//   gradOutput    - gradient of the forward output.
//   gradWeight    - 4D tensor accumulated into in place (through views of it).
//   columns, ones - only device/contiguity-checked; the function allocates
//                   its own column workspace and never reads `ones`.
//   kW .. dilationH, group, deformable_group - convolution hyper-parameters
//                   (validated by deform_conv_shape_check against gradWeight).
//   scale         - factor applied to each chunk's contribution.
//   im2col_step   - chunk size; must be positive and divide the batch size.
//
// Raises through TORCH_CHECK / AT_ERROR when a check fails or when a CUDA
// tensor is passed to a build without MMCV_WITH_CUDA.
void deform_conv_backward_parameters(Tensor input, Tensor offset,
                                     Tensor gradOutput, Tensor gradWeight,
                                     Tensor columns, Tensor ones, int kW,
                                     int kH, int dW, int dH, int padW, int padH,
                                     int dilationW, int dilationH, int group,
                                     int deformable_group, float scale,
                                     int im2col_step) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(offset);
    CHECK_CUDA_INPUT(gradOutput);
    CHECK_CUDA_INPUT(gradWeight);
    CHECK_CUDA_INPUT(columns);
    CHECK_CUDA_INPUT(ones);
#else
    AT_ERROR("DeformConv is not compiled with GPU support");
#endif
  } else {
    CHECK_CPU_INPUT(input);
    CHECK_CPU_INPUT(offset);
    CHECK_CPU_INPUT(gradOutput);
    CHECK_CPU_INPUT(gradWeight);
    CHECK_CPU_INPUT(columns);
    CHECK_CPU_INPUT(ones);
  }

  deform_conv_shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH,
                          dW, padH, padW, dilationH, dilationW, group,
                          deformable_group);
  at::DeviceGuard guard(input.device());

  if (input.ndimension() == 3) {
    // Force a batch dimension of size 1 on the unbatched tensors. `offset` is
    // batched as well so that the batch-size check and the chunked view below
    // see the same layout as in the 4D case.
    input = input.view({1, input.size(0), input.size(1), input.size(2)});
    offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
    gradOutput = gradOutput.view(
        {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
  }

  long batchSize = input.size(0);
  long nInputPlane = input.size(1);
  long inputHeight = input.size(2);
  long inputWidth = input.size(3);

  long nOutputPlane = gradWeight.size(0);

  long outputWidth =
      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
  long outputHeight =
      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;

  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");

  // The batch is split into batchSize / im2col_step chunks below; reject
  // values that would divide by zero or leave a partial chunk.
  TORCH_CHECK(im2col_step > 0,
              "im2col_step should be greater than zero, but got: ",
              im2col_step);
  TORCH_CHECK(batchSize % im2col_step == 0, "batch size (", batchSize,
              ") must be divisible by im2col_step (", im2col_step, ")");
  const long numChunks = batchSize / im2col_step;

  columns = at::zeros(
      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
      input.options());

  // (chunk, step, channel, H, W) -> (chunk, channel, step, H, W)
  gradOutput = gradOutput.view(
      {numChunks, im2col_step, nOutputPlane, outputHeight, outputWidth});
  gradOutput.transpose_(1, 2);

  // Materialise the reordered gradient in a contiguous buffer so that it can
  // be viewed as (chunk, channel, step * H, W) for the matmul.
  Tensor gradOutputBuffer = at::zeros_like(gradOutput);
  gradOutputBuffer = gradOutputBuffer.view(
      {numChunks, nOutputPlane, im2col_step, outputHeight, outputWidth});
  gradOutputBuffer = gradOutputBuffer.contiguous();
  gradOutputBuffer.copy_(gradOutput);
  gradOutputBuffer = gradOutputBuffer.view(
      {numChunks, nOutputPlane, im2col_step * outputHeight, outputWidth});

  input = input.view(
      {numChunks, im2col_step, nInputPlane, inputHeight, inputWidth});
  offset = offset.view({numChunks, im2col_step, deformable_group * 2 * kH * kW,
                        outputHeight, outputWidth});

  for (int elt = 0; elt < numChunks; elt++) {
    deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,
                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
                           dilationW, im2col_step, deformable_group, columns);

    // divide into group
    gradOutputBuffer = gradOutputBuffer.view(
        {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group,
         gradOutputBuffer.size(2), gradOutputBuffer.size(3)});
    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
    gradWeight =
        gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1),
                         gradWeight.size(2), gradWeight.size(3)});

    for (int g = 0; g < group; g++) {
      gradWeight[g] = gradWeight[g]
                          .flatten(1)
                          .addmm_(gradOutputBuffer[elt][g].flatten(1),
                                  columns[g].transpose(1, 0), 1.0, scale)
                          .view_as(gradWeight[g]);
    }

    // Restore the flat layouts for the next chunk.
    gradOutputBuffer = gradOutputBuffer.view(
        {gradOutputBuffer.size(0),
         gradOutputBuffer.size(1) * gradOutputBuffer.size(2),
         gradOutputBuffer.size(3), gradOutputBuffer.size(4)});
    columns =
        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
    gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1),
                                  gradWeight.size(2), gradWeight.size(3),
                                  gradWeight.size(4)});
  }

  // No shape restoration is needed: every reshape above only rebound the
  // local handles (the in-place transpose_ was applied to a local view), and
  // gradWeight was updated through storage shared with the caller.
}


================================================
FILE: mmcv/ops/csrc/parrots/deform_conv_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "deform_conv_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA adapter for deform_conv_forward: reads the integer attributes
// from `attr`, converts the parrots inputs/outputs with buildATensor and
// forwards everything to deform_conv_forward.
void deform_conv_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                      const OperatorBase::in_list_t& ins,
                                      OperatorBase::out_list_t& outs) {
  // Convolution hyper-parameters, filled in from the attributes below.
  int kW, kH;
  int dW, dH;
  int padW, padH;
  int dilationW, dilationH;
  int group, deformable_group;
  int im2col_step;

  SSAttrs(attr)
      .get<int>("kW", kW)
      .get<int>("kH", kH)
      .get<int>("dW", dW)
      .get<int>("dH", dH)
      .get<int>("padW", padW)
      .get<int>("padH", padH)
      .get<int>("dilationW", dilationW)
      .get<int>("dilationH", dilationH)
      .get<int>("group", group)
      .get<int>("deformable_group", deformable_group)
      .get<int>("im2col_step", im2col_step)
      .done();

  // ins: [input, weight, offset]
  const auto& input = buildATensor(ctx, ins[0]);
  const auto& weight = buildATensor(ctx, ins[1]);
  const auto& offset = buildATensor(ctx, ins[2]);

  // outs: [output, columns, ones]
  auto output = buildATensor(ctx, outs[0]);
  auto columns = buildATensor(ctx, outs[1]);
  auto ones = buildATensor(ctx, outs[2]);

  deform_conv_forward(input, weight, offset, output, columns, ones,
                      kW, kH, dW, dH, padW, padH, dilationW, dilationH,
                      group, deformable_group, im2col_step);
}

// Parrots CUDA adapter for deform_conv_backward_input: reads the integer
// attributes from `attr`, converts the parrots inputs/outputs with
// buildATensor and forwards everything to deform_conv_backward_input.
void deform_conv_backward_input_cuda_parrots(CudaContext& ctx,
                                             const SSElement& attr,
                                             const OperatorBase::in_list_t& ins,
                                             OperatorBase::out_list_t& outs) {
  // Convolution hyper-parameters, filled in from the attributes below.
  int kW, kH;
  int dW, dH;
  int padW, padH;
  int dilationW, dilationH;
  int group, deformable_group;
  int im2col_step;

  SSAttrs(attr)
      .get<int>("kW", kW)
      .get<int>("kH", kH)
      .get<int>("dW", dW)
      .get<int>("dH", dH)
      .get<int>("padW", padW)
      .get<int>("padH", padH)
      .get<int>("dilationW", dilationW)
      .get<int>("dilationH", dilationH)
      .get<int>("group", group)
      .get<int>("deformable_group", deformable_group)
      .get<int>("im2col_step", im2col_step)
      .done();

  // ins: [input, offset, gradOutput]
  const auto& input = buildATensor(ctx, ins[0]);
  const auto& offset = buildATensor(ctx, ins[1]);
  const auto& gradOutput = buildATensor(ctx, ins[2]);

  // outs: [gradInput, gradOffset, weight, columns]
  auto gradInput = buildATensor(ctx, outs[0]);
  auto gradOffset = buildATensor(ctx, outs[1]);
  auto weight = buildATensor(ctx, outs[2]);
  auto columns = buildATensor(ctx, outs[3]);

  deform_conv_backward_input(input, offset, gradOutput, gradInput, gradOffset,
                             weight, columns,
                             kW, kH, dW, dH, padW, padH, dilationW, dilationH,
                             group, deformable_group, im2col_step);
}

// Parrots CUDA adapter for deform_conv_backward_parameters: reads the integer
// attributes and the float `scale` from `attr`, converts the parrots
// inputs/outputs with buildATensor and forwards everything to
// deform_conv_backward_parameters.
void deform_conv_backward_parameters_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  // Convolution hyper-parameters, filled in from the attributes below.
  int kW, kH;
  int dW, dH;
  int padW, padH;
  int dilationW, dilationH;
  int group, deformable_group;
  int im2col_step;
  float scale;

  SSAttrs(attr)
      .get<int>("kW", kW)
      .get<int>("kH", kH)
      .get<int>("dW", dW)
      .get<int>("dH", dH)
      .get<int>("padW", padW)
      .get<int>("padH", padH)
      .get<int>("dilationW", dilationW)
      .get<int>("dilationH", dilationH)
      .get<int>("group", group)
      .get<int>("deformable_group", deformable_group)
      .get<float>("scale", scale)
      .get<int>("im2col_step", im2col_step)
      .done();

  // ins: [input, offset, gradOutput]
  const auto& input = buildATensor(ctx, ins[0]);
  const auto& offset = buildATensor(ctx, ins[1]);
  const auto& gradOutput = buildATensor(ctx, ins[2]);

  // outs: [gradWeight, columns, ones]
  auto gradWeight = buildATensor(ctx, outs[0]);
  auto columns = buildATensor(ctx, outs[1]);
  auto ones = buildATensor(ctx, outs[2]);

  deform_conv_backward_parameters(input, offset, gradOutput, gradWeight,
                                  columns, ones,
                                  kW, kH, dW, dH, padW, padH,
                                  dilationW, dilationH,
                                  group, deformable_group, scale, im2col_step);
}
#endif

// Parrots host (CPU) adapter for deform_conv_forward: reads the integer
// attributes from `attr`, converts the parrots inputs/outputs with
// buildATensor and forwards everything to deform_conv_forward.
void deform_conv_forward_cpu_parrots(HostContext& ctx, const SSElement& attr,
                                     const OperatorBase::in_list_t& ins,
                                     OperatorBase::out_list_t& outs) {
  // Convolution hyper-parameters, filled in from the attributes below.
  int kW, kH;
  int dW, dH;
  int padW, padH;
  int dilationW, dilationH;
  int group, deformable_group;
  int im2col_step;

  SSAttrs(attr)
      .get<int>("kW", kW)
      .get<int>("kH", kH)
      .get<int>("dW", dW)
      .get<int>("dH", dH)
      .get<int>("padW", padW)
      .get<int>("padH", padH)
      .get<int>("dilationW", dilationW)
      .get<int>("dilationH", dilationH)
      .get<int>("group", group)
      .get<int>("deformable_group", deformable_group)
      .get<int>("im2col_step", im2col_step)
      .done();

  // ins: [input, weight, offset]
  const auto& input = buildATensor(ctx, ins[0]);
  const auto& weight = buildATensor(ctx, ins[1]);
  const auto& offset = buildATensor(ctx, ins[2]);

  // outs: [output, columns, ones]
  auto output = buildATensor(ctx, outs[0]);
  auto columns = buildATensor(ctx, outs[1]);
  auto ones = buildATensor(ctx, outs[2]);

  deform_conv_forward(input, weight, offset, output, columns, ones,
                      kW, kH, dW, dH, padW, padH, dilationW, dilationH,
                      group, deformable_group, im2col_step);
}

// Parrots host (CPU) adapter for deform_conv_backward_input: reads the
// integer attributes from `attr`, converts the parrots inputs/outputs with
// buildATensor and forwards everything to deform_conv_backward_input.
void deform_conv_backward_input_cpu_parrots(HostContext& ctx,
                                            const SSElement& attr,
                                            const OperatorBase::in_list_t& ins,
                                            OperatorBase::out_list_t& outs) {
  // Convolution hyper-parameters, filled in from the attributes below.
  int kW, kH;
  int dW, dH;
  int padW, padH;
  int dilationW, dilationH;
  int group, deformable_group;
  int im2col_step;

  SSAttrs(attr)
      .get<int>("kW", kW)
      .get<int>("kH", kH)
      .get<int>("dW", dW)
      .get<int>("dH", dH)
      .get<int>("padW", padW)
      .get<int>("padH", padH)
      .get<int>("dilationW", dilationW)
      .get<int>("dilationH", dilationH)
      .get<int>("group", group)
      .get<int>("deformable_group", deformable_group)
      .get<int>("im2col_step", im2col_step)
      .done();

  // ins: [input, offset, gradOutput]
  const auto& input = buildATensor(ctx, ins[0]);
  const auto& offset = buildATensor(ctx, ins[1]);
  const auto& gradOutput = buildATensor(ctx, ins[2]);

  // outs: [gradInput, gradOffset, weight, columns]
  auto gradInput = buildATensor(ctx, outs[0]);
  auto gradOffset = buildATensor(ctx, outs[1]);
  auto weight = buildATensor(ctx, outs[2]);
  auto columns = buildATensor(ctx, outs[3]);

  deform_conv_backward_input(input, offset, gradOutput, gradInput, gradOffset,
                             weight, columns,
                             kW, kH, dW, dH, padW, padH, dilationW, dilationH,
                             group, deformable_group, im2col_step);
}

// Parrots host (CPU) entry point for deform_conv_backward_parameters.
//
// Reads eleven integer attributes plus the float `scale` from `attr`, passes
// the three inputs and three outputs through buildATensor, and forwards
// everything to deform_conv_backward_parameters().
void deform_conv_backward_parameters_cpu_parrots(
    HostContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  // Attribute values; each one is filled in by the SSAttrs chain below.
  int kW, kH;
  int dW, dH;
  int padW, padH;
  int dilationW, dilationH;
  int group, deformable_group;
  int im2col_step;
  float scale;
  SSAttrs(attr)
      .get<int>("kW", kW)
      .get<int>("kH", kH)
      .get<int>("dW", dW)
      .get<int>("dH", dH)
      .get<int>("padW", padW)
      .get<int>("padH", padH)
      .get<int>("dilationW", dilationW)
      .get<int>("dilationH", dilationH)
      .get<int>("group", group)
      .get<int>("deformable_group", deformable_group)
      .get<float>("scale", scale)
      .get<int>("im2col_step", im2col_step)
      .done();

  // ins: [input, offset, gradOutput]
  const auto& inputTensor = buildATensor(ctx, ins[0]);
  const auto& offsetTensor = buildATensor(ctx, ins[1]);
  const auto& gradOutputTensor = buildATensor(ctx, ins[2]);

  // outs: [gradWeight, columns, ones]
  auto gradWeightTensor = buildATensor(ctx, outs[0]);
  auto columnsTensor = buildATensor(ctx, outs[1]);
  auto onesTensor = buildATensor(ctx, outs[2]);

  deform_conv_backward_parameters(
      inputTensor, offsetTensor, gradOutputTensor, gradWeightTensor,
      columnsTensor, onesTensor, kW, kH, dW, dH, padW, padH, dilationW,
      dilationH, group, deformable_group, scale, im2col_step);
}

// Registers deform_conv_forward with parrots: eleven attributes, 3 inputs and
// 3 outputs. The CPU wrapper is always attached; the CUDA wrapper is attached
// only when MMCV_WITH_CUDA is defined.
PARROTS_EXTENSION_REGISTER(deform_conv_forward)
    .attr("kW")
    .attr("kH")
    .attr("dW")
    .attr("dH")
    .attr("padW")
    .attr("padH")
    .attr("dilationW")
    .attr("dilationH")
    .attr("group")
    .attr("deformable_group")
    .attr("im2col_step")
    .input(3)
    .output(3)
    .apply(deform_conv_forward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(deform_conv_forward_cuda_parrots)
#endif
    .done();

// Registers deform_conv_backward_input with parrots: the same eleven
// attributes, 3 inputs and 4 outputs (the wrapper reads outs[0..3]).
PARROTS_EXTENSION_REGISTER(deform_conv_backward_input)
    .attr("kW")
    .attr("kH")
    .attr("dW")
    .attr("dH")
    .attr("padW")
    .attr("padH")
    .attr("dilationW")
    .attr("dilationH")
    .attr("group")
    .attr("deformable_group")
    .attr("im2col_step")
    .input(3)
    .output(4)
    .apply(deform_conv_backward_input_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(deform_conv_backward_input_cuda_parrots)
#endif
    .done();

// Registers deform_conv_backward_parameters with parrots: the eleven integer
// attributes plus "scale", 3 inputs and 3 outputs.
PARROTS_EXTENSION_REGISTER(deform_conv_backward_parameters)
    .attr("kW")
    .attr("kH")
    .attr("dW")
    .attr("dH")
    .attr("padW")
    .attr("padH")
    .attr("dilationW")
    .attr("dilationH")
    .attr("group")
    .attr("deformable_group")
    .attr("scale")
    .attr("im2col_step")
    .input(3)
    .output(3)
    .apply(deform_conv_backward_parameters_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(deform_conv_backward_parameters_cuda_parrots)
#endif
    .done();


================================================
FILE: mmcv/ops/csrc/parrots/deform_conv_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef DEFORM_CONV_PYTORCH_H
#define DEFORM_CONV_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Declaration of the deformable-convolution forward entry point. All tensors
// are passed by value; `output`, `columns` and `ones` are supplied by the
// caller. The definition lives outside this header.
void deform_conv_forward(Tensor input, Tensor weight, Tensor offset,
                         Tensor output, Tensor columns, Tensor ones, int kW,
                         int kH, int dW, int dH, int padW, int padH,
                         int dilationW, int dilationH, int group,
                         int deformable_group, int im2col_step);

// Declaration of the backward entry point that takes `gradInput` and
// `gradOffset` tensors supplied by the caller. Same integer parameters as the
// forward declaration.
void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,
                                Tensor gradInput, Tensor gradOffset,
                                Tensor weight, Tensor columns, int kW, int kH,
                                int dW, int dH, int padW, int padH,
                                int dilationW, int dilationH, int group,
                                int deformable_group, int im2col_step);

// Declaration of the backward entry point that takes a `gradWeight` tensor
// supplied by the caller. Takes an extra float `scale` compared with the other
// two declarations.
void deform_conv_backward_parameters(Tensor input, Tensor offset,
                                     Tensor gradOutput, Tensor gradWeight,
                                     Tensor columns, Tensor ones, int kW,
                                     int kH, int dW, int dH, int padW, int padH,
                                     int dilationW, int dilationH, int group,
                                     int deformable_group, float scale,
                                     int im2col_step);

#endif  // DEFORM_CONV_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/deform_roi_pool.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub for the deformable RoI pooling forward op: hands every
// argument unchanged to DISPATCH_DEVICE_IMPL under the key
// deform_roi_pool_forward_impl.
// NOTE(review): the macro comes from pytorch_device_registry.hpp; presumably
// it picks the implementation registered for the tensors' device -- confirm.
void deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,
                                  Tensor output, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int sampling_ratio, float gamma) {
  DISPATCH_DEVICE_IMPL(deform_roi_pool_forward_impl, input, rois, offset,
                       output, pooled_height, pooled_width, spatial_scale,
                       sampling_ratio, gamma);
}

// Dispatch stub for the deformable RoI pooling backward op: hands every
// argument unchanged to DISPATCH_DEVICE_IMPL under the key
// deform_roi_pool_backward_impl.
// NOTE(review): the macro comes from pytorch_device_registry.hpp; presumably
// it picks the implementation registered for the tensors' device -- confirm.
void deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,
                                   Tensor rois, Tensor offset,
                                   Tensor grad_input, Tensor grad_offset,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale, int sampling_ratio,
                                   float gamma) {
  DISPATCH_DEVICE_IMPL(deform_roi_pool_backward_impl, grad_output, input, rois,
                       offset, grad_input, grad_offset, pooled_height,
                       pooled_width, spatial_scale, sampling_ratio, gamma);
}

// Public forward entry point; a thin forwarder that passes all arguments, in
// the same order, to deform_roi_pool_forward_impl().
void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,
                             Tensor output, int pooled_height, int pooled_width,
                             float spatial_scale, int sampling_ratio,
                             float gamma) {
  deform_roi_pool_forward_impl(input, rois, offset, output, pooled_height,
                               pooled_width, spatial_scale, sampling_ratio,
                               gamma);
}

// Public backward entry point; a thin forwarder that passes all arguments, in
// the same order, to deform_roi_pool_backward_impl().
void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
                              Tensor offset, Tensor grad_input,
                              Tensor grad_offset, int pooled_height,
                              int pooled_width, float spatial_scale,
                              int sampling_ratio, float gamma) {
  deform_roi_pool_backward_impl(grad_output, input, rois, offset, grad_input,
                                grad_offset, pooled_height, pooled_width,
                                spatial_scale, sampling_ratio, gamma);
}


================================================
FILE: mmcv/ops/csrc/parrots/deform_roi_pool_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "deform_roi_pool_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
/*void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
 *                                  Tensor output, int pooled_height,
 *                                  int pooled_width, float spatial_scale,
 *                                  int sampling_ratio, float gamma);
 */
// Parrots CUDA entry point for deform_roi_pool_forward.
//
// Reads the five attributes from `attr`, passes the three inputs and the
// single output through buildATensor, and calls
// deform_roi_pool_forward_cuda().
void deform_roi_pool_forward_cuda_parrots(CudaContext& ctx,
                                          const SSElement& attr,
                                          const OperatorBase::in_list_t& ins,
                                          OperatorBase::out_list_t& outs) {
  // Attribute values; each one is filled in by the SSAttrs chain below.
  int pooled_height, pooled_width, sampling_ratio;
  float spatial_scale, gamma;
  SSAttrs(attr)
      .get<int>("pooled_height", pooled_height)
      .get<int>("pooled_width", pooled_width)
      .get<float>("spatial_scale", spatial_scale)
      .get<int>("sampling_ratio", sampling_ratio)
      .get<float>("gamma", gamma)
      .done();

  // ins: [input, rois, offset]
  const auto& inputTensor = buildATensor(ctx, ins[0]);
  const auto& roisTensor = buildATensor(ctx, ins[1]);
  const auto& offsetTensor = buildATensor(ctx, ins[2]);

  // outs: [output]
  auto outputTensor = buildATensor(ctx, outs[0]);

  deform_roi_pool_forward_cuda(inputTensor, roisTensor, offsetTensor,
                               outputTensor, pooled_height, pooled_width,
                               spatial_scale, sampling_ratio, gamma);
}

/*void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,
 *                                   Tensor rois, Tensor offset,
 *                                   Tensor grad_input, Tensor grad_offset,
 *                                   int pooled_height, int pooled_width,
 *                                   float spatial_scale, int sampling_ratio,
 *                                   float gamma);
 */
// Parrots CUDA entry point for deform_roi_pool_backward.
//
// Reads the five attributes from `attr`, passes the four inputs and two
// outputs through buildATensor, and calls deform_roi_pool_backward_cuda().
void deform_roi_pool_backward_cuda_parrots(CudaContext& ctx,
                                           const SSElement& attr,
                                           const OperatorBase::in_list_t& ins,
                                           OperatorBase::out_list_t& outs) {
  // Attribute values; each one is filled in by the SSAttrs chain below.
  int pooled_height, pooled_width, sampling_ratio;
  float spatial_scale, gamma;
  SSAttrs(attr)
      .get<int>("pooled_height", pooled_height)
      .get<int>("pooled_width", pooled_width)
      .get<float>("spatial_scale", spatial_scale)
      .get<int>("sampling_ratio", sampling_ratio)
      .get<float>("gamma", gamma)
      .done();

  // ins: [grad_output, input, rois, offset]
  const auto& gradOutputTensor = buildATensor(ctx, ins[0]);
  const auto& inputTensor = buildATensor(ctx, ins[1]);
  const auto& roisTensor = buildATensor(ctx, ins[2]);
  const auto& offsetTensor = buildATensor(ctx, ins[3]);

  // outs: [grad_input, grad_offset]
  auto gradInputTensor = buildATensor(ctx, outs[0]);
  auto gradOffsetTensor = buildATensor(ctx, outs[1]);

  deform_roi_pool_backward_cuda(gradOutputTensor, inputTensor, roisTensor,
                                offsetTensor, gradInputTensor,
                                gradOffsetTensor, pooled_height, pooled_width,
                                spatial_scale, sampling_ratio, gamma);
}

// Registers deform_roi_pool_forward with parrots: five attributes, 3 inputs,
// 1 output. Only a CUDA wrapper is attached; this registration sits inside
// the file's MMCV_WITH_CUDA guard.
PARROTS_EXTENSION_REGISTER(deform_roi_pool_forward)
    .attr("pooled_height")
    .attr("pooled_width")
    .attr("spatial_scale")
    .attr("sampling_ratio")
    .attr("gamma")
    .input(3)
    .output(1)
    .apply(deform_roi_pool_forward_cuda_parrots)
    .done();

// Registers deform_roi_pool_backward with parrots: the same five attributes,
// 4 inputs, 2 outputs, CUDA wrapper only.
PARROTS_EXTENSION_REGISTER(deform_roi_pool_backward)
    .attr("pooled_height")
    .attr("pooled_width")
    .attr("spatial_scale")
    .attr("sampling_ratio")
    .attr("gamma")
    .input(4)
    .output(2)
    .apply(deform_roi_pool_backward_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/deform_roi_pool_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef DEFORM_ROI_POOL_PYTORCH_H
#define DEFORM_ROI_POOL_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Declaration of the CUDA forward entry point for deformable RoI pooling.
// `output` is supplied by the caller; the definition lives outside this
// header.
void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
                                  Tensor output, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int sampling_ratio, float gamma);

// Declaration of the CUDA backward entry point for deformable RoI pooling.
// `grad_input` and `grad_offset` are supplied by the caller; the definition
// lives outside this header.
void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,
                                   Tensor rois, Tensor offset,
                                   Tensor grad_input, Tensor grad_offset,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale, int sampling_ratio,
                                   float gamma);
#endif  // DEFORM_ROI_POOL_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/diff_iou_rotated.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub: returns whatever DISPATCH_DEVICE_IMPL yields for the key
// diff_iou_rotated_sort_vertices_forward_impl with the three tensors passed
// through unchanged.
// NOTE(review): the macro comes from pytorch_device_registry.hpp; presumably
// it picks the implementation registered for the tensors' device -- confirm.
Tensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,
                                                   Tensor num_valid) {
  return DISPATCH_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl,
                              vertices, mask, num_valid);
}

// Public entry point; forwards the three tensors to
// diff_iou_rotated_sort_vertices_forward_impl() and returns its result.
Tensor diff_iou_rotated_sort_vertices_forward(Tensor vertices, Tensor mask,
                                              Tensor num_valid) {
  return diff_iou_rotated_sort_vertices_forward_impl(vertices, mask, num_valid);
}


================================================
FILE: mmcv/ops/csrc/parrots/diff_iou_rotated_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "diff_iou_rotated_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA entry point for diff_iou_rotated_sort_vertices_forward.
//
// Passes the three inputs through buildATensor, calls
// diff_iou_rotated_sort_vertices_forward_cuda(), and hands the returned tensor
// to updateDArray for outs[0]. This op takes no attributes, so `attr` is not
// read.
void diff_iou_rotated_sort_vertices_forward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  // ins: [vertices, mask, num_valid]
  auto vertices = buildATensor(ctx, ins[0]);
  auto mask = buildATensor(ctx, ins[1]);
  auto num_valid = buildATensor(ctx, ins[2]);
  auto out =
      diff_iou_rotated_sort_vertices_forward_cuda(vertices, mask, num_valid);
  // The result is returned by value rather than written into a caller-supplied
  // tensor, so it is passed to updateDArray together with outs[0].
  updateDArray(ctx, out, outs[0]);
}

// Registers diff_iou_rotated_sort_vertices_forward with parrots: no
// attributes, 3 inputs, 1 output, CUDA wrapper only (this registration sits
// inside the file's MMCV_WITH_CUDA guard).
PARROTS_EXTENSION_REGISTER(diff_iou_rotated_sort_vertices_forward)
    .input(3)
    .output(1)
    .apply(diff_iou_rotated_sort_vertices_forward_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/diff_iou_rotated_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef DIFF_IOU_ROTATED_PYTORCH_H
#define DIFF_IOU_ROTATED_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Declaration of the CUDA entry point; takes three tensors by value and
// returns a tensor. The definition lives outside this header.
Tensor diff_iou_rotated_sort_vertices_forward_cuda(Tensor vertices, Tensor mask,
                                                   Tensor num_valid);

#endif  // DIFF_IOU_ROTATED_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/focal_loss.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub for the sigmoid focal loss forward op: hands every argument
// unchanged to DISPATCH_DEVICE_IMPL under the key
// sigmoid_focal_loss_forward_impl.
// NOTE(review): the macro comes from pytorch_device_registry.hpp; presumably
// it picks the implementation registered for the tensors' device -- confirm.
void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha) {
  DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, input, target, weight,
                       output, gamma, alpha);
}

// Dispatch stub for the sigmoid focal loss backward op: hands every argument
// unchanged to DISPATCH_DEVICE_IMPL under the key
// sigmoid_focal_loss_backward_impl.
// NOTE(review): the macro comes from pytorch_device_registry.hpp; presumably
// it picks the implementation registered for the tensors' device -- confirm.
void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
                                      Tensor weight, Tensor grad_input,
                                      float gamma, float alpha) {
  DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, input, target, weight,
                       grad_input, gamma, alpha);
}

// Dispatch stub for the softmax focal loss forward op: hands every argument
// unchanged to DISPATCH_DEVICE_IMPL under the key
// softmax_focal_loss_forward_impl.
// NOTE(review): the macro comes from pytorch_device_registry.hpp; presumably
// it picks the implementation registered for the tensors' device -- confirm.
void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha) {
  DISPATCH_DEVICE_IMPL(softmax_focal_loss_forward_impl, input, target, weight,
                       output, gamma, alpha);
}

// Dispatch stub for the softmax focal loss backward op: hands every argument
// unchanged to DISPATCH_DEVICE_IMPL under the key
// softmax_focal_loss_backward_impl. Unlike the sigmoid variant it also takes a
// `buff` tensor.
// NOTE(review): the macro comes from pytorch_device_registry.hpp; presumably
// it picks the implementation registered for the tensors' device -- confirm.
void softmax_focal_loss_backward_impl(Tensor input, Tensor target,
                                      Tensor weight, Tensor buff,
                                      Tensor grad_input, float gamma,
                                      float alpha) {
  DISPATCH_DEVICE_IMPL(softmax_focal_loss_backward_impl, input, target, weight,
                       buff, grad_input, gamma, alpha);
}

// Public entry point; forwards all arguments, in the same order, to
// sigmoid_focal_loss_forward_impl().
void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
                                Tensor output, float gamma, float alpha) {
  sigmoid_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
}

// Public entry point; forwards all arguments, in the same order, to
// sigmoid_focal_loss_backward_impl().
void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
                                 Tensor grad_input, float gamma, float alpha) {
  sigmoid_focal_loss_backward_impl(input, target, weight, grad_input, gamma,
                                   alpha);
}

// Public entry point; forwards all arguments, in the same order, to
// softmax_focal_loss_forward_impl().
void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
                                Tensor output, float gamma, float alpha) {
  softmax_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
}

// Public entry point; forwards all arguments, in the same order, to
// softmax_focal_loss_backward_impl().
void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
                                 Tensor buff, Tensor grad_input, float gamma,
                                 float alpha) {
  softmax_focal_loss_backward_impl(input, target, weight, buff, grad_input,
                                   gamma, alpha);
}


================================================
FILE: mmcv/ops/csrc/parrots/focal_loss_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "focal_loss_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA entry point for sigmoid_focal_loss_forward.
//
// Reads the float attributes "gamma" and "alpha", passes the three inputs and
// the single output through buildATensor, and calls
// sigmoid_focal_loss_forward_cuda().
void sigmoid_focal_loss_forward_cuda_parrots(CudaContext& ctx,
                                             const SSElement& attr,
                                             const OperatorBase::in_list_t& ins,
                                             OperatorBase::out_list_t& outs) {
  float gamma, alpha;
  SSAttrs(attr)
      .get<float>("gamma", gamma)
      .get<float>("alpha", alpha)
      .done();

  // ins: [input, target, weight]
  const auto& inputTensor = buildATensor(ctx, ins[0]);
  const auto& targetTensor = buildATensor(ctx, ins[1]);
  const auto& weightTensor = buildATensor(ctx, ins[2]);

  // outs: [output]
  auto outputTensor = buildATensor(ctx, outs[0]);

  sigmoid_focal_loss_forward_cuda(inputTensor, targetTensor, weightTensor,
                                  outputTensor, gamma, alpha);
}

// Parrots CUDA entry point for sigmoid_focal_loss_backward.
//
// Reads the float attributes "gamma" and "alpha", passes the three inputs and
// the single output through buildATensor, and calls
// sigmoid_focal_loss_backward_cuda().
void sigmoid_focal_loss_backward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  float gamma, alpha;
  SSAttrs(attr)
      .get<float>("gamma", gamma)
      .get<float>("alpha", alpha)
      .done();

  // ins: [input, target, weight]
  const auto& inputTensor = buildATensor(ctx, ins[0]);
  const auto& targetTensor = buildATensor(ctx, ins[1]);
  const auto& weightTensor = buildATensor(ctx, ins[2]);

  // outs: [grad_input]
  auto gradInputTensor = buildATensor(ctx, outs[0]);

  sigmoid_focal_loss_backward_cuda(inputTensor, targetTensor, weightTensor,
                                   gradInputTensor, gamma, alpha);
}

// Parrots CUDA entry point for softmax_focal_loss_forward.
//
// Reads the float attributes "gamma" and "alpha", passes the three inputs and
// the single output through buildATensor, and calls
// softmax_focal_loss_forward_cuda().
void softmax_focal_loss_forward_cuda_parrots(CudaContext& ctx,
                                             const SSElement& attr,
                                             const OperatorBase::in_list_t& ins,
                                             OperatorBase::out_list_t& outs) {
  float gamma, alpha;
  SSAttrs(attr)
      .get<float>("gamma", gamma)
      .get<float>("alpha", alpha)
      .done();

  // ins: [input, target, weight]
  const auto& inputTensor = buildATensor(ctx, ins[0]);
  const auto& targetTensor = buildATensor(ctx, ins[1]);
  const auto& weightTensor = buildATensor(ctx, ins[2]);

  // outs: [output]
  auto outputTensor = buildATensor(ctx, outs[0]);

  softmax_focal_loss_forward_cuda(inputTensor, targetTensor, weightTensor,
                                  outputTensor, gamma, alpha);
}

// Parrots CUDA entry point for softmax_focal_loss_backward.
//
// Reads the float attributes "gamma" and "alpha", passes the three inputs and
// two outputs through buildATensor, and calls
// softmax_focal_loss_backward_cuda().
void softmax_focal_loss_backward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  float gamma, alpha;
  SSAttrs(attr)
      .get<float>("gamma", gamma)
      .get<float>("alpha", alpha)
      .done();

  // ins: [input, target, weight]
  const auto& inputTensor = buildATensor(ctx, ins[0]);
  const auto& targetTensor = buildATensor(ctx, ins[1]);
  const auto& weightTensor = buildATensor(ctx, ins[2]);

  // outs: [buff, grad_input]
  auto buffTensor = buildATensor(ctx, outs[0]);
  auto gradInputTensor = buildATensor(ctx, outs[1]);

  softmax_focal_loss_backward_cuda(inputTensor, targetTensor, weightTensor,
                                   buffTensor, gradInputTensor, gamma, alpha);
}

// The four focal-loss registrations below all declare the attributes "gamma"
// and "alpha" and 3 inputs, and attach only a CUDA wrapper (they sit inside
// the file's MMCV_WITH_CUDA guard).

// sigmoid forward: 1 output.
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_forward)
    .attr("gamma")
    .attr("alpha")
    .input(3)
    .output(1)
    .apply(sigmoid_focal_loss_forward_cuda_parrots)
    .done();

// sigmoid backward: 1 output.
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_backward)
    .attr("gamma")
    .attr("alpha")
    .input(3)
    .output(1)
    .apply(sigmoid_focal_loss_backward_cuda_parrots)
    .done();

// softmax forward: 1 output.
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_forward)
    .attr("gamma")
    .attr("alpha")
    .input(3)
    .output(1)
    .apply(softmax_focal_loss_forward_cuda_parrots)
    .done();

// softmax backward: 2 outputs (the wrapper reads outs[0] as buff and outs[1]
// as grad_input).
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_backward)
    .attr("gamma")
    .attr("alpha")
    .input(3)
    .output(2)
    .apply(softmax_focal_loss_backward_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/focal_loss_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef FOCAL_LOSS_PYTORCH_H
#define FOCAL_LOSS_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Declarations of the CUDA focal-loss entry points. Each takes its tensors by
// value plus the floats `gamma` and `alpha`; the definitions live outside
// this header.

// Sigmoid focal loss, forward; `output` is supplied by the caller.
void sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha);

// Sigmoid focal loss, backward; `grad_input` is supplied by the caller.
void sigmoid_focal_loss_backward_cuda(Tensor input, Tensor target,
                                      Tensor weight, Tensor grad_input,
                                      float gamma, float alpha);

// Softmax focal loss, forward; `output` is supplied by the caller.
void softmax_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha);

// Softmax focal loss, backward; takes an additional `buff` tensor alongside
// `grad_input`.
void softmax_focal_loss_backward_cuda(Tensor input, Tensor target,
                                      Tensor weight, Tensor buff,
                                      Tensor grad_input, float gamma,
                                      float alpha);
#endif  // FOCAL_LOSS_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/furthest_point_sample.cpp
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling.cpp

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub for furthest point sampling: hands every argument unchanged
// to DISPATCH_DEVICE_IMPL under the key furthest_point_sampling_forward_impl.
// NOTE(review): the macro comes from pytorch_device_registry.hpp; presumably
// it picks the implementation registered for the tensors' device -- confirm.
void furthest_point_sampling_forward_impl(Tensor points_tensor,
                                          Tensor temp_tensor, Tensor idx_tensor,
                                          int b, int n, int m) {
  DISPATCH_DEVICE_IMPL(furthest_point_sampling_forward_impl, points_tensor,
                       temp_tensor, idx_tensor, b, n, m);
}

// Dispatch stub for the "with_dist" variant of furthest point sampling: hands
// every argument unchanged to DISPATCH_DEVICE_IMPL under the key
// furthest_point_sampling_with_dist_forward_impl.
// NOTE(review): the macro comes from pytorch_device_registry.hpp; presumably
// it picks the implementation registered for the tensors' device -- confirm.
void furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor,
                                                    Tensor temp_tensor,
                                                    Tensor idx_tensor, int b,
                                                    int n, int m) {
  DISPATCH_DEVICE_IMPL(furthest_point_sampling_with_dist_forward_impl,
                       points_tensor, temp_tensor, idx_tensor, b, n, m);
}

// Public entry point; forwards all arguments, in the same order, to
// furthest_point_sampling_forward_impl().
void furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor,
                                     Tensor idx_tensor, int b, int n, int m) {
  furthest_point_sampling_forward_impl(points_tensor, temp_tensor, idx_tensor,
                                       b, n, m);
}

// Public entry point; forwards all arguments, in the same order, to
// furthest_point_sampling_with_dist_forward_impl().
void furthest_point_sampling_with_dist_forward(Tensor points_tensor,
                                               Tensor temp_tensor,
                                               Tensor idx_tensor, int b, int n,
                                               int m) {
  furthest_point_sampling_with_dist_forward_impl(points_tensor, temp_tensor,
                                                 idx_tensor, b, n, m);
}


================================================
FILE: mmcv/ops/csrc/parrots/furthest_point_sample_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "furthest_point_sample_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA entry point for furthest_point_sampling_forward.
//
// Reads the integer attributes "b", "n" and "m", passes the two inputs and
// the single output through buildATensor, and calls
// furthest_point_sampling_forward().
void furthest_point_sample_forward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  int b;
  int n;
  int m;
  SSAttrs(attr)
      .get<int>("b", b)
      .get<int>("n", n)
      .get<int>("m", m)
      .done();

  // ins: [points, temp]
  auto points = buildATensor(ctx, ins[0]);
  auto temp = buildATensor(ctx, ins[1]);

  // outs: [idx]
  auto idx = buildATensor(ctx, outs[0]);

  furthest_point_sampling_forward(points, temp, idx, b, n, m);
}

// Parrots CUDA entry point for furthest_point_sampling_with_dist_forward.
//
// Reads the integer attributes "b", "n" and "m", passes the two inputs and
// the single output through buildATensor, and calls
// furthest_point_sampling_with_dist_forward().
void furthest_point_sampling_with_dist_forward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  int b;
  int n;
  int m;
  SSAttrs(attr)
      .get<int>("b", b)
      .get<int>("n", n)
      .get<int>("m", m)
      .done();

  // ins: [points, temp]
  auto points = buildATensor(ctx, ins[0]);
  auto temp = buildATensor(ctx, ins[1]);

  // outs: [idx]
  auto idx = buildATensor(ctx, outs[0]);

  furthest_point_sampling_with_dist_forward(points, temp, idx, b, n, m);
}
// Registers furthest_point_sampling_forward with parrots: attributes "b", "n"
// and "m", 2 inputs, 1 output, CUDA wrapper only. Note that the wrapper is
// spelled furthest_point_sample_* while the op is furthest_point_sampling_*.
PARROTS_EXTENSION_REGISTER(furthest_point_sampling_forward)
    .attr("b")
    .attr("n")
    .attr("m")
    .input(2)
    .output(1)
    .apply(furthest_point_sample_forward_cuda_parrots)
    .done();

// Registers furthest_point_sampling_with_dist_forward with parrots: the same
// attributes and the same input/output counts, CUDA wrapper only.
PARROTS_EXTENSION_REGISTER(furthest_point_sampling_with_dist_forward)
    .attr("b")
    .attr("n")
    .attr("m")
    .input(2)
    .output(1)
    .apply(furthest_point_sampling_with_dist_forward_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/furthest_point_sample_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef FURTHEST_POINT_SAMPLE_PYTORCH_H
#define FURTHEST_POINT_SAMPLE_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Declaration of the furthest point sampling entry point; takes three tensors
// by value and the integers b, n and m. The definition lives outside this
// header.
void furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor,
                                     Tensor idx_tensor, int b, int n, int m);

// Declaration of the "with_dist" variant; same parameter list as above.
void furthest_point_sampling_with_dist_forward(Tensor points_tensor,
                                               Tensor temp_tensor,
                                               Tensor idx_tensor, int b, int n,
                                               int m);
#endif  // FURTHEST_POINT_SAMPLE_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/fused_bias_leakyrelu.cpp
================================================
// Modified from
// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_bias_act.cpp

/*
Copyright (c) 2021, NVIDIA Corporation. All rights reserved.

NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator
Augmentation (ADA)
=======================================================================

1. Definitions

"Licensor" means any person or entity that distributes its Work.

"Software" means the original work of authorship made available under
this License.

"Work" means the Software and any additions to or derivative works of
the Software that are made available under this License.

The terms "reproduce," "reproduction," "derivative works," and
"distribution" have the meaning as provided under U.S. copyright law;
provided, however, that for the purposes of this License, derivative
works shall not include works that remain separable from, or merely
link (or bind by name) to the interfaces of, the Work.

Works, including the Software, are "made available" under this License
by including in or with the Work either (a) a copyright notice
referencing the applicability of this License to the Work, or (b) a
copy of this License.

2. License Grants

    2.1 Copyright Grant. Subject to the terms and conditions of this
    License, each Licensor grants to you a perpetual, worldwide,
    non-exclusive, royalty-free, copyright license to reproduce,
    prepare derivative works of, publicly display, publicly perform,
    sublicense and distribute its Work and any resulting derivative
    works in any form.

3. Limitations

    3.1 Redistribution. You may reproduce or distribute the Work only
    if (a) you do so under this License, (b) you include a complete
    copy of this License with your distribution, and (c) you retain
    without modification any copyright, patent, trademark, or
    attribution notices that are present in the Work.

    3.2 Derivative Works. You may specify that additional or different
    terms apply to the use, reproduction, and distribution of your
    derivative works of the Work ("Your Terms") only if (a) Your Terms
    provide that the use limitation in Section 3.3 applies to your
    derivative works, and (b) you identify the specific derivative
    works that are subject to Your Terms. Notwithstanding Your Terms,
    this License (including the redistribution requirements in Section
    3.1) will continue to apply to the Work itself.

    3.3 Use Limitation. The Work and any derivative works thereof only
    may be used or intended for use non-commercially. Notwithstanding
    the foregoing, NVIDIA and its affiliates may use the Work and any
    derivative works commercially. As used herein, "non-commercially"
    means for research or evaluation purposes only.

    3.4 Patent Claims. If you bring or threaten to bring a patent claim
    against any Licensor (including any claim, cross-claim or
    counterclaim in a lawsuit) to enforce any patents that you allege
    are infringed by any Work, then your rights under this License from
    such Licensor (including the grant in Section 2.1) will terminate
    immediately.

    3.5 Trademarks. This License does not grant any rights to use any
    Licensor’s or its affiliates’ names, logos, or trademarks, except
    as necessary to reproduce the notices described in this License.

    3.6 Termination. If you violate any term of this License, then your
    rights under this License (including the grant in Section 2.1) will
    terminate immediately.

4. Disclaimer of Warranty.

THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
THIS LICENSE.

5. Limitation of Liability.

EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
THE POSSIBILITY OF SUCH DAMAGES.

=======================================================================
*/

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub: returns whatever DISPATCH_DEVICE_IMPL yields for the key
// fused_bias_leakyrelu_op_impl with all arguments passed through unchanged.
// NOTE(review): the macro comes from pytorch_device_registry.hpp; presumably
// it picks the implementation registered for the tensors' device -- confirm.
torch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor& input,
                                           const torch::Tensor& bias,
                                           const torch::Tensor& refer, int act,
                                           int grad, float alpha, float scale) {
  return DISPATCH_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, input, bias, refer,
                              act, grad, alpha, scale);
}

// Public entry point; forwards all arguments, in the same order, to
// fused_bias_leakyrelu_op_impl() and returns its result.
torch::Tensor fused_bias_leakyrelu(const torch::Tensor& input,
                                   const torch::Tensor& bias,
                                   const torch::Tensor& refer, int act,
                                   int grad, float alpha, float scale) {
  return fused_bias_leakyrelu_op_impl(input, bias, refer, act, grad, alpha,
                                      scale);
}


================================================
FILE: mmcv/ops/csrc/parrots/fused_bias_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <torch/extension.h>

#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
using namespace at;
using namespace parrots;

// Forward declaration of the op entry point called by the parrots wrapper
// below; the definition is not part of this translation unit.
torch::Tensor fused_bias_leakyrelu(const torch::Tensor &input,
                                   const torch::Tensor &bias,
                                   const torch::Tensor &refer, int act,
                                   int grad, float alpha, float scale);

// Parrots adapter for fused_bias_leakyrelu.
//
// Reads the four scalar attributes ("act", "grad", "alpha", "scale") from
// `attr`, wraps the three operator inputs as ATen tensors, runs the op and
// stores its result into the first operator output.
void fused_bias_leakyrelu_parrots(CudaContext &ctx, const SSElement &attr,
                                  const OperatorBase::in_list_t &ins,
                                  OperatorBase::out_list_t &outs) {
  int attr_act;
  int attr_grad;
  float attr_alpha;
  float attr_scale;
  SSAttrs(attr)
      .get<int>("act", attr_act)
      .get<int>("grad", attr_grad)
      .get<float>("alpha", attr_alpha)
      .get<float>("scale", attr_scale)
      .done();

  // Inputs are consumed positionally: input, bias, refer.
  const auto &in_tensor = buildATensor(ctx, ins[0]);
  const auto &bias_tensor = buildATensor(ctx, ins[1]);
  const auto &refer_tensor = buildATensor(ctx, ins[2]);

  // The op returns a fresh tensor, so it has to be written back explicitly.
  auto result = fused_bias_leakyrelu(in_tensor, bias_tensor, refer_tensor,
                                     attr_act, attr_grad, attr_alpha,
                                     attr_scale);
  updateDArray(ctx, result, outs[0]);
}

// Registers the parrots operator `fused_bias_leakyrelu`: four attributes
// (act, grad, alpha, scale), 3 inputs, 1 output, handled by
// fused_bias_leakyrelu_parrots. The attribute names must match the keys read
// by that handler.
PARROTS_EXTENSION_REGISTER(fused_bias_leakyrelu)
    .attr("act")
    .attr("grad")
    .attr("alpha")
    .attr("scale")
    .input(3)
    .output(1)
    .apply(fused_bias_leakyrelu_parrots)
    .done();


================================================
FILE: mmcv/ops/csrc/parrots/gather_points.cpp
================================================
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatch shim: hands all arguments, in order, to
// DISPATCH_DEVICE_IMPL under the key `gather_points_forward_impl`.
// NOTE(review): presumably selects the implementation registered for the
// tensors' device -- confirm in pytorch_device_registry.hpp.
void gather_points_forward_impl(int b, int c, int n, int npoints,
                                const Tensor points, const Tensor idx,
                                Tensor out) {
  DISPATCH_DEVICE_IMPL(gather_points_forward_impl, b, c, n, npoints, points,
                       idx, out);
}

// Device-dispatch shim: hands all arguments, in order, to
// DISPATCH_DEVICE_IMPL under the key `gather_points_backward_impl`.
// NOTE(review): presumably selects the implementation registered for the
// tensors' device -- confirm in pytorch_device_registry.hpp.
void gather_points_backward_impl(int b, int c, int n, int npoints,
                                 const Tensor grad_out, const Tensor idx,
                                 Tensor grad_points) {
  DISPATCH_DEVICE_IMPL(gather_points_backward_impl, b, c, n, npoints, grad_out,
                       idx, grad_points);
}

// Public forward entry point. Only reorders the arguments (scalars first,
// tensors last) and forwards them to gather_points_forward_impl; the result
// is written by the callee into `out_tensor`.
void gather_points_forward(Tensor points_tensor, Tensor idx_tensor,
                           Tensor out_tensor, int b, int c, int n,
                           int npoints) {
  gather_points_forward_impl(b, c, n, npoints, points_tensor, idx_tensor,
                             out_tensor);
}

// Public backward entry point. Only reorders the arguments (scalars first,
// tensors last) and forwards them to gather_points_backward_impl; the result
// is written by the callee into `grad_points_tensor`.
void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                            Tensor grad_points_tensor, int b, int c, int n,
                            int npoints) {
  gather_points_backward_impl(b, c, n, npoints, grad_out_tensor, idx_tensor,
                              grad_points_tensor);
}


================================================
FILE: mmcv/ops/csrc/parrots/gather_points_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "gather_points_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots adapter for gather_points_forward.
//
// Pulls the integer attributes "b", "c", "n" and "npoints" out of `attr`,
// wraps the two operator inputs and the single operator output as ATen
// tensors, and calls gather_points_forward, which fills the output in place.
void gather_points_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                        const OperatorBase::in_list_t& ins,
                                        OperatorBase::out_list_t& outs) {
  int attr_b;
  int attr_c;
  int attr_n;
  int attr_npoints;
  SSAttrs(attr)
      .get<int>("b", attr_b)
      .get<int>("c", attr_c)
      .get<int>("n", attr_n)
      .get<int>("npoints", attr_npoints)
      .done();

  // Positional operands: ins[0] = points, ins[1] = idx, outs[0] = out.
  auto points = buildATensor(ctx, ins[0]);
  auto indices = buildATensor(ctx, ins[1]);
  auto gathered = buildATensor(ctx, outs[0]);

  gather_points_forward(points, indices, gathered, attr_b, attr_c, attr_n,
                        attr_npoints);
}

// Parrots adapter for gather_points_backward.
//
// Pulls the integer attributes "b", "c", "n" and "npoints" out of `attr`,
// wraps the two operator inputs and the single operator output as ATen
// tensors, and calls gather_points_backward, which fills the output in place.
void gather_points_backward_cuda_parrots(CudaContext& ctx,
                                         const SSElement& attr,
                                         const OperatorBase::in_list_t& ins,
                                         OperatorBase::out_list_t& outs) {
  int attr_b;
  int attr_c;
  int attr_n;
  int attr_npoints;
  SSAttrs(attr)
      .get<int>("b", attr_b)
      .get<int>("c", attr_c)
      .get<int>("n", attr_n)
      .get<int>("npoints", attr_npoints)
      .done();

  // Positional operands: ins[0] = grad_out, ins[1] = idx,
  // outs[0] = grad_points.
  auto upstream_grad = buildATensor(ctx, ins[0]);
  auto indices = buildATensor(ctx, ins[1]);
  auto points_grad = buildATensor(ctx, outs[0]);

  gather_points_backward(upstream_grad, indices, points_grad, attr_b, attr_c,
                         attr_n, attr_npoints);
}

// Registers the parrots operator `gather_points_forward`: attributes b, c, n,
// npoints; 2 inputs, 1 output; handled by gather_points_forward_cuda_parrots.
PARROTS_EXTENSION_REGISTER(gather_points_forward)
    .attr("b")
    .attr("c")
    .attr("n")
    .attr("npoints")
    .input(2)
    .output(1)
    .apply(gather_points_forward_cuda_parrots)
    .done();

// Registers the parrots operator `gather_points_backward` with the same
// attribute/input/output layout; handled by
// gather_points_backward_cuda_parrots.
PARROTS_EXTENSION_REGISTER(gather_points_backward)
    .attr("b")
    .attr("c")
    .attr("n")
    .attr("npoints")
    .input(2)
    .output(1)
    .apply(gather_points_backward_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/gather_points_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef GATHER_POINTS_PYTORCH_H
#define GATHER_POINTS_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Forward entry point of the gather_points op; the result is delivered
// through `out_tensor` (the function returns void).
void gather_points_forward(Tensor points_tensor, Tensor idx_tensor,
                           Tensor out_tensor, int b, int c, int n, int npoints);

// Backward entry point of the gather_points op; the result is delivered
// through `grad_points_tensor` (the function returns void).
void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                            Tensor grad_points_tensor, int b, int c, int n,
                            int npoints);
#endif  // GATHER_POINTS_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/group_points.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatch shim: hands all arguments, in order, to
// DISPATCH_DEVICE_IMPL under the key `group_points_forward_impl`.
// NOTE(review): presumably selects the implementation registered for the
// tensors' device -- confirm in pytorch_device_registry.hpp.
void group_points_forward_impl(int b, int c, int n, int npoints, int nsample,
                               const Tensor points, const Tensor idx,
                               Tensor out) {
  DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
                       points, idx, out);
}

// Device-dispatch shim: hands all arguments, in order, to
// DISPATCH_DEVICE_IMPL under the key `group_points_backward_impl`.
// NOTE(review): presumably selects the implementation registered for the
// tensors' device -- confirm in pytorch_device_registry.hpp.
void group_points_backward_impl(int b, int c, int n, int npoints, int nsample,
                                const Tensor grad_out, const Tensor idx,
                                Tensor grad_points) {
  DISPATCH_DEVICE_IMPL(group_points_backward_impl, b, c, n, npoints, nsample,
                       grad_out, idx, grad_points);
}

// Public forward entry point of the group_points op.
//
// Only reorders the arguments (scalars first, tensors last) and forwards them
// to group_points_forward_impl, which performs the device dispatch; the
// result is written by the callee into `out_tensor`. Going through the impl
// helper (rather than invoking the dispatch macro here) keeps this function
// symmetric with group_points_backward and leaves a single dispatch site.
void group_points_forward(Tensor points_tensor, Tensor idx_tensor,
                          Tensor out_tensor, int b, int c, int n, int npoints,
                          int nsample) {
  group_points_forward_impl(b, c, n, npoints, nsample, points_tensor,
                            idx_tensor, out_tensor);
}

// Public backward entry point of the group_points op. Only reorders the
// arguments (scalars first, tensors last) and forwards them to
// group_points_backward_impl; the result is written by the callee into
// `grad_points_tensor`.
void group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                           Tensor grad_points_tensor, int b, int c, int n,
                           int npoints, int nsample) {
  group_points_backward_impl(b, c, n, npoints, nsample, grad_out_tensor,
                             idx_tensor, grad_points_tensor);
}


================================================
FILE: mmcv/ops/csrc/parrots/group_points_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "group_points_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots adapter for group_points_forward.
//
// Reads the integer attributes "b", "c", "n", "npoints" and "nsample" from
// `attr`, wraps the two operator inputs and the single operator output as
// ATen tensors, and calls group_points_forward, which fills the output in
// place.
void group_points_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                       const OperatorBase::in_list_t& ins,
                                       OperatorBase::out_list_t& outs) {
  int attr_b;
  int attr_c;
  int attr_n;
  int attr_npoints;
  int attr_nsample;
  SSAttrs(attr)
      .get<int>("b", attr_b)
      .get<int>("c", attr_c)
      .get<int>("n", attr_n)
      .get<int>("npoints", attr_npoints)
      .get<int>("nsample", attr_nsample)
      .done();

  // Positional operands: ins[0] = points, ins[1] = idx, outs[0] = out.
  auto points = buildATensor(ctx, ins[0]);
  auto indices = buildATensor(ctx, ins[1]);
  auto grouped = buildATensor(ctx, outs[0]);

  group_points_forward(points, indices, grouped, attr_b, attr_c, attr_n,
                       attr_npoints, attr_nsample);
}

// Parrots adapter for group_points_backward.
//
// Reads the integer attributes "b", "c", "n", "npoints" and "nsample" from
// `attr`, wraps the two operator inputs and the single operator output as
// ATen tensors, and calls group_points_backward, which fills the output in
// place.
void group_points_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                        const OperatorBase::in_list_t& ins,
                                        OperatorBase::out_list_t& outs) {
  int attr_b;
  int attr_c;
  int attr_n;
  int attr_npoints;
  int attr_nsample;
  SSAttrs(attr)
      .get<int>("b", attr_b)
      .get<int>("c", attr_c)
      .get<int>("n", attr_n)
      .get<int>("npoints", attr_npoints)
      .get<int>("nsample", attr_nsample)
      .done();

  // Positional operands: ins[0] = grad_out, ins[1] = idx,
  // outs[0] = grad_points.
  auto upstream_grad = buildATensor(ctx, ins[0]);
  auto indices = buildATensor(ctx, ins[1]);
  auto points_grad = buildATensor(ctx, outs[0]);

  group_points_backward(upstream_grad, indices, points_grad, attr_b, attr_c,
                        attr_n, attr_npoints, attr_nsample);
}

// Registers the parrots operator `group_points_forward`: attributes b, c, n,
// npoints, nsample; 2 inputs, 1 output; handled by
// group_points_forward_cuda_parrots.
PARROTS_EXTENSION_REGISTER(group_points_forward)
    .attr("b")
    .attr("c")
    .attr("n")
    .attr("npoints")
    .attr("nsample")
    .input(2)
    .output(1)
    .apply(group_points_forward_cuda_parrots)
    .done();

// Registers the parrots operator `group_points_backward` with the same
// attribute/input/output layout; handled by
// group_points_backward_cuda_parrots.
PARROTS_EXTENSION_REGISTER(group_points_backward)
    .attr("b")
    .attr("c")
    .attr("n")
    .attr("npoints")
    .attr("nsample")
    .input(2)
    .output(1)
    .apply(group_points_backward_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/group_points_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef GROUP_POINTS_PYTORCH_H
#define GROUP_POINTS_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Forward entry point of the group_points op; the result is delivered
// through `out_tensor` (the function returns void).
void group_points_forward(Tensor points_tensor, Tensor idx_tensor,
                          Tensor out_tensor, int b, int c, int n, int npoints,
                          int nsample);

// Backward entry point of the group_points op; the result is delivered
// through `grad_points_tensor` (the function returns void).
void group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                           Tensor grad_points_tensor, int b, int c, int n,
                           int npoints, int nsample);

#endif  // GROUP_POINTS_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/info.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/vision.cpp
#include "pytorch_cpp_helper.hpp"

// Runtime-version helpers, compiled only when MMCV_WITH_CUDA is defined.
// Exactly one of the two functions exists in a given build:
//   - MMCV_WITH_HIP defined:   get_hiprt_version()
//   - MMCV_WITH_HIP undefined: get_cudart_version()
#ifdef MMCV_WITH_CUDA
#ifdef MMCV_WITH_HIP
#include <hip/hip_runtime_api.h>
// Returns the value reported by hipRuntimeGetVersion, or 0 if the call does
// not write a value.
int get_hiprt_version() {
  // Start from a defined value: the status returned by hipRuntimeGetVersion
  // is not inspected, so a failed query must not leak an indeterminate int.
  int runtimeVersion = 0;
  hipRuntimeGetVersion(&runtimeVersion);
  return runtimeVersion;
}
#else
#include <cuda_runtime_api.h>
// Returns the CUDART_VERSION macro the extension was compiled against.
int get_cudart_version() { return CUDART_VERSION; }
#endif
#endif

// Describes the CUDA/HIP toolkit this extension was built with.
//
// - Built without MMCV_WITH_CUDA: returns "not available".
// - Built with MMCV_WITH_CUDA and MMCV_WITH_HIP: returns the integer from
//   get_hiprt_version() printed as-is.
// - Built with MMCV_WITH_CUDA only: formats get_cudart_version() as
//   "<major>.<minor>[.<patch>]", where major = v / 1000,
//   minor = v / 10 % 100 and the patch digit v % 10 is appended only when it
//   is non-zero (same scheme as PyTorch's CUDAHooks.cpp).
std::string get_compiling_cuda_version() {
#if !defined(MMCV_WITH_CUDA)
  return std::string("not available");
#elif defined(MMCV_WITH_HIP)
  std::ostringstream version;
  version << get_hiprt_version();
  return version.str();
#else
  const int v = get_cudart_version();
  std::ostringstream version;
  version << (v / 1000) << "." << (v / 10 % 100);
  const int patch = v % 10;
  if (patch != 0) {
    version << "." << patch;
  }
  return version.str();
#endif
}

// similar to
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp
// Builds a human-readable tag for the compiler used to build this file.
//
// Each detected compiler contributes one fragment, appended in this order
// without separators:
//   "GCC <major>.<minor>"            when __GNUC__ is set and __clang__ is not
//   "clang <major>.<minor>.<patch>"  when __clang_major__ is set
//   "MSVC <_MSC_FULL_VER>"           when _MSC_VER is set
// The result is an empty string if none of the macros is defined.
std::string get_compiler_version() {
  std::ostringstream tag;

#if defined(__GNUC__) && !defined(__clang__)
  tag << "GCC " << __GNUC__ << "." << __GNUC_MINOR__;
#endif

#if defined(__clang_major__)
  tag << "clang " << __clang_major__ << "." << __clang_minor__ << "."
      << __clang_patchlevel__;
#endif

#if defined(_MSC_VER)
  tag << "MSVC " << _MSC_FULL_VER;
#endif

  return tag.str();
}


================================================
FILE: mmcv/ops/csrc/parrots/iou3d.cpp
================================================
// Modified from
// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms.cpp

/*
3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Number of bits in an `unsigned long long` (sizeof in bytes times 8).
// NOTE(review): presumably one bitmask word per NMS thread block in the
// device kernels -- confirm against the kernel sources.
const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;

// Device-dispatch shim: hands all arguments, in order, to
// DISPATCH_DEVICE_IMPL under the key `iou3d_boxes_overlap_bev_forward_impl`.
// NOTE(review): presumably selects the implementation registered for the
// tensors' device -- confirm in pytorch_device_registry.hpp.
void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
                                          const int num_b, const Tensor boxes_b,
                                          Tensor ans_overlap) {
  DISPATCH_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, num_a, boxes_a,
                       num_b, boxes_b, ans_overlap);
}

// Device-dispatch shim: hands all arguments, in order, to
// DISPATCH_DEVICE_IMPL under the key `iou3d_nms3d_forward_impl`. `keep` and
// `keep_num` are taken by non-const reference and passed on as such.
// NOTE(review): presumably selects the implementation registered for the
// tensors' device -- confirm in pytorch_device_registry.hpp.
void iou3d_nms3d_forward_impl(const Tensor boxes, Tensor &keep,
                              Tensor &keep_num, float nms_overlap_thresh) {
  DISPATCH_DEVICE_IMPL(iou3d_nms3d_forward_impl, boxes, keep, keep_num,
                       nms_overlap_thresh);
}

// Device-dispatch shim: hands all arguments, in order, to
// DISPATCH_DEVICE_IMPL under the key `iou3d_nms3d_normal_forward_impl`.
// `keep` and `keep_num` are taken by non-const reference and passed on as
// such.
// NOTE(review): presumably selects the implementation registered for the
// tensors' device -- confirm in pytorch_device_registry.hpp.
void iou3d_nms3d_normal_forward_impl(const Tensor boxes, Tensor &keep,
                                     Tensor &keep_num,
                                     float nms_overlap_thresh) {
  DISPATCH_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, boxes, keep, keep_num,
                       nms_overlap_thresh);
}

// Public entry point for the bird's-eye-view box overlap computation.
//
// Takes the box counts from the leading dimension of each input and forwards
// everything to iou3d_boxes_overlap_bev_forward_impl, which writes into
// `ans_overlap`.
//
// Shape notes carried over from the original author (not checked here):
//   boxes_a:     (N, 7) [x, y, z, dx, dy, dz, heading]
//   boxes_b:     (M, 5)  -- NOTE(review): likely (M, 7) as well; confirm
//   ans_overlap: (N, M)
void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
                                     Tensor ans_overlap) {
  const int count_a = static_cast<int>(boxes_a.size(0));
  const int count_b = static_cast<int>(boxes_b.size(0));

  iou3d_boxes_overlap_bev_forward_impl(count_a, boxes_a, count_b, boxes_b,
                                       ans_overlap);
}

// Public entry point for 3D NMS. Applies CHECK_CONTIGUOUS to `boxes` and
// `keep` (in that order), then forwards all arguments to
// iou3d_nms3d_forward_impl. `keep_num` is passed through without a check.
void iou3d_nms3d_forward(Tensor boxes, Tensor keep, Tensor keep_num,
                         float nms_overlap_thresh) {
  // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
  // params keep: (N)
  CHECK_CONTIGUOUS(boxes);
  CHECK_CONTIGUOUS(keep);

  iou3d_nms3d_forward_impl(boxes, keep, keep_num, nms_overlap_thresh);
}

// Public entry point for the "normal" 3D NMS variant. Applies
// CHECK_CONTIGUOUS to `boxes` and `keep` (in that order), then forwards all
// arguments to iou3d_nms3d_normal_forward_impl. `keep_num` is passed through
// without a check.
void iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
                                float nms_overlap_thresh) {
  // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
  // params keep: (N)

  CHECK_CONTIGUOUS(boxes);
  CHECK_CONTIGUOUS(keep);

  iou3d_nms3d_normal_forward_impl(boxes, keep, keep_num, nms_overlap_thresh);
}


================================================
FILE: mmcv/ops/csrc/parrots/iou3d_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "iou3d_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots adapter for iou3d_boxes_overlap_bev_forward.
//
// The operator has no attributes (`attr` is unused). The two operator inputs
// and the single operator output are wrapped as ATen tensors and passed to
// iou3d_boxes_overlap_bev_forward, which fills the output in place.
void iou3d_boxes_overlap_bev_forward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  // Positional operands: ins[0] = boxes_a, ins[1] = boxes_b,
  // outs[0] = overlap result.
  auto first_boxes = buildATensor(ctx, ins[0]);
  auto second_boxes = buildATensor(ctx, ins[1]);
  auto overlap = buildATensor(ctx, outs[0]);

  iou3d_boxes_overlap_bev_forward(first_boxes, second_boxes, overlap);
}

// Parrots adapter for iou3d_nms3d_forward.
//
// Reads the float attribute "nms_overlap_thresh", wraps the single operator
// input and the two operator outputs as ATen tensors, and calls
// iou3d_nms3d_forward, which fills both outputs in place.
void iou3d_nms3d_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                      const OperatorBase::in_list_t& ins,
                                      OperatorBase::out_list_t& outs) {
  float overlap_threshold;
  SSAttrs(attr)
      .get<float>("nms_overlap_thresh", overlap_threshold)
      .done();

  // Positional operands: ins[0] = boxes, outs[0] = keep, outs[1] = keep_num.
  auto box_tensor = buildATensor(ctx, ins[0]);
  auto keep_tensor = buildATensor(ctx, outs[0]);
  auto keep_count = buildATensor(ctx, outs[1]);

  iou3d_nms3d_forward(box_tensor, keep_tensor, keep_count, overlap_threshold);
}

// Parrots adapter for iou3d_nms3d_normal_forward.
//
// Reads the float attribute "nms_overlap_thresh", wraps the single operator
// input and the two operator outputs as ATen tensors, and calls
// iou3d_nms3d_normal_forward, which fills both outputs in place.
void iou3d_nms3d_normal_forward_cuda_parrots(CudaContext& ctx,
                                             const SSElement& attr,
                                             const OperatorBase::in_list_t& ins,
                                             OperatorBase::out_list_t& outs) {
  float overlap_threshold;
  SSAttrs(attr)
      .get<float>("nms_overlap_thresh", overlap_threshold)
      .done();

  // Positional operands: ins[0] = boxes, outs[0] = keep, outs[1] = keep_num.
  auto box_tensor = buildATensor(ctx, ins[0]);
  auto keep_tensor = buildATensor(ctx, outs[0]);
  auto keep_count = buildATensor(ctx, outs[1]);

  iou3d_nms3d_normal_forward(box_tensor, keep_tensor, keep_count,
                             overlap_threshold);
}

// Registers the parrots operator `iou3d_boxes_overlap_bev_forward`: no
// attributes, 2 inputs, 1 output; handled by
// iou3d_boxes_overlap_bev_forward_cuda_parrots.
PARROTS_EXTENSION_REGISTER(iou3d_boxes_overlap_bev_forward)
    .input(2)
    .output(1)
    .apply(iou3d_boxes_overlap_bev_forward_cuda_parrots)
    .done();

// Registers the parrots operator `iou3d_nms3d_forward`: attribute
// nms_overlap_thresh, 1 input, 2 outputs; handled by
// iou3d_nms3d_forward_cuda_parrots.
PARROTS_EXTENSION_REGISTER(iou3d_nms3d_forward)
    .attr("nms_overlap_thresh")
    .input(1)
    .output(2)
    .apply(iou3d_nms3d_forward_cuda_parrots)
    .done();

// Registers the parrots operator `iou3d_nms3d_normal_forward` with the same
// layout; handled by iou3d_nms3d_normal_forward_cuda_parrots.
PARROTS_EXTENSION_REGISTER(iou3d_nms3d_normal_forward)
    .attr("nms_overlap_thresh")
    .input(1)
    .output(2)
    .apply(iou3d_nms3d_normal_forward_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/iou3d_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef IOU_3D_PYTORCH_H
#define IOU_3D_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Bird's-eye-view overlap between two box sets; the result is delivered
// through `ans_overlap` (the function returns void).
void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
                                     Tensor ans_overlap);

// 3D NMS entry point; results are delivered through `keep` and `keep_num`.
void iou3d_nms3d_forward(Tensor boxes, Tensor keep, Tensor keep_num,
                         float nms_overlap_thresh);

// "Normal" 3D NMS variant with the same signature as iou3d_nms3d_forward.
void iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
                                float nms_overlap_thresh);

#endif  // IOU_3D_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/knn.cpp
================================================
// Modified from
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatch shim: hands all arguments, in order, to
// DISPATCH_DEVICE_IMPL under the key `knn_forward_impl`.
// NOTE(review): presumably selects the implementation registered for the
// tensors' device -- confirm in pytorch_device_registry.hpp.
void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
                      const Tensor new_xyz, Tensor idx, Tensor dist2) {
  DISPATCH_DEVICE_IMPL(knn_forward_impl, b, n, m, nsample, xyz, new_xyz, idx,
                       dist2);
}

// Public entry point of the knn op. Only reorders the arguments (scalars
// first, tensors last) and forwards them to knn_forward_impl; results are
// written by the callee into `idx_tensor` and `dist2_tensor`.
void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,
                 Tensor dist2_tensor, int b, int n, int m, int nsample) {
  knn_forward_impl(b, n, m, nsample, xyz_tensor, new_xyz_tensor, idx_tensor,
                   dist2_tensor);
}


================================================
FILE: mmcv/ops/csrc/parrots/knn_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "knn_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots adapter for knn_forward.
//
// Reads the integer attributes "b", "n", "m" and "nsample" from `attr`,
// wraps the two operator inputs and the two operator outputs as ATen
// tensors, and calls knn_forward, which fills both outputs in place.
void knn_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                              const OperatorBase::in_list_t& ins,
                              OperatorBase::out_list_t& outs) {
  int attr_b;
  int attr_n;
  int attr_m;
  int attr_nsample;
  SSAttrs(attr)
      .get<int>("b", attr_b)
      .get<int>("n", attr_n)
      .get<int>("m", attr_m)
      .get<int>("nsample", attr_nsample)
      .done();

  // Positional operands: ins[0] = xyz, ins[1] = new_xyz,
  // outs[0] = idx, outs[1] = dist2.
  auto reference_xyz = buildATensor(ctx, ins[0]);
  auto query_xyz = buildATensor(ctx, ins[1]);
  auto neighbor_idx = buildATensor(ctx, outs[0]);
  auto neighbor_dist2 = buildATensor(ctx, outs[1]);

  knn_forward(reference_xyz, query_xyz, neighbor_idx, neighbor_dist2, attr_b,
              attr_n, attr_m, attr_nsample);
}

// Registers the parrots operator `knn_forward`: attributes b, n, m, nsample;
// 2 inputs, 2 outputs; handled by knn_forward_cuda_parrots.
PARROTS_EXTENSION_REGISTER(knn_forward)
    .attr("b")
    .attr("n")
    .attr("m")
    .attr("nsample")
    .input(2)
    .output(2)
    .apply(knn_forward_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/knn_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef KNN_PYTORCH_H
#define KNN_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Entry point of the knn op; results are delivered through `idx_tensor` and
// `dist2_tensor` (the function returns void).
void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,
                 Tensor dist2_tensor, int b, int n, int m, int nsample);
#endif  // KNN_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/masked_conv2d.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatch shim: hands all arguments, in order, to
// DISPATCH_DEVICE_IMPL under the key `masked_im2col_forward_impl`.
// NOTE(review): presumably selects the implementation registered for the
// tensors' device -- confirm in pytorch_device_registry.hpp.
void masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor col,
                                const int kernel_h, const int kernel_w,
                                const int pad_h, const int pad_w) {
  DISPATCH_DEVICE_IMPL(masked_im2col_forward_impl, im, mask_h_idx, mask_w_idx,
                       col, kernel_h, kernel_w, pad_h, pad_w);
}

// Device-dispatch shim: hands all arguments, in order, to
// DISPATCH_DEVICE_IMPL under the key `masked_col2im_forward_impl`.
// NOTE(review): presumably selects the implementation registered for the
// tensors' device -- confirm in pytorch_device_registry.hpp.
void masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor im, int height,
                                int width, int channels) {
  DISPATCH_DEVICE_IMPL(masked_col2im_forward_impl, col, mask_h_idx, mask_w_idx,
                       im, height, width, channels);
}

// Public entry point of masked im2col. Forwards every argument, unchanged
// and in the same order, to masked_im2col_forward_impl; the result is
// written by the callee into `col`.
void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor col,
                           const int kernel_h, const int kernel_w,
                           const int pad_h, const int pad_w) {
  masked_im2col_forward_impl(im, mask_h_idx, mask_w_idx, col, kernel_h,
                             kernel_w, pad_h, pad_w);
}

// Public entry point of masked col2im. Forwards every argument, unchanged
// and in the same order, to masked_col2im_forward_impl; the result is
// written by the callee into `im`.
void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor im, int height,
                           int width, int channels) {
  masked_col2im_forward_impl(col, mask_h_idx, mask_w_idx, im, height, width,
                             channels);
}


================================================
FILE: mmcv/ops/csrc/parrots/masked_conv2d_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "masked_conv2d_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots adapter for masked im2col.
//
// Reads the integer attributes "kernel_h", "kernel_w", "pad_h" and "pad_w"
// from `attr`, wraps the three operator inputs and the single operator
// output as ATen tensors, and calls masked_im2col_forward_cuda, which fills
// the output in place.
//
// Shape notes carried over from the original author (not checked here):
//   im: (n, ic, h, w), kernel size (kh, kw)
//   kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
void masked_im2col_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                        const OperatorBase::in_list_t& ins,
                                        OperatorBase::out_list_t& outs) {
  int attr_kernel_h;
  int attr_kernel_w;
  int attr_pad_h;
  int attr_pad_w;
  SSAttrs(attr)
      .get<int>("kernel_h", attr_kernel_h)
      .get<int>("kernel_w", attr_kernel_w)
      .get<int>("pad_h", attr_pad_h)
      .get<int>("pad_w", attr_pad_w)
      .done();

  // Positional operands: ins[0] = im, ins[1] = mask_h_idx,
  // ins[2] = mask_w_idx, outs[0] = col.
  const auto& image = buildATensor(ctx, ins[0]);
  const auto& mask_rows = buildATensor(ctx, ins[1]);
  const auto& mask_cols = buildATensor(ctx, ins[2]);
  auto columns = buildATensor(ctx, outs[0]);

  masked_im2col_forward_cuda(image, mask_rows, mask_cols, columns,
                             attr_kernel_h, attr_kernel_w, attr_pad_h,
                             attr_pad_w);
}

// Parrots adapter for masked col2im.
//
// Reads the integer attributes "height", "width" and "channels" from `attr`,
// wraps the three operator inputs and the single operator output as ATen
// tensors, and calls masked_col2im_forward_cuda, which fills the output in
// place.
//
// Shape notes carried over from the original author (not checked here):
//   im: (n, ic, h, w), kernel size (kh, kw)
//   kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
void masked_col2im_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                        const OperatorBase::in_list_t& ins,
                                        OperatorBase::out_list_t& outs) {
  int attr_height;
  int attr_width;
  int attr_channels;
  SSAttrs(attr)
      .get<int>("height", attr_height)
      .get<int>("width", attr_width)
      .get<int>("channels", attr_channels)
      .done();

  // Positional operands: ins[0] = col, ins[1] = mask_h_idx,
  // ins[2] = mask_w_idx, outs[0] = im.
  const auto& columns = buildATensor(ctx, ins[0]);
  const auto& mask_rows = buildATensor(ctx, ins[1]);
  const auto& mask_cols = buildATensor(ctx, ins[2]);
  auto image = buildATensor(ctx, outs[0]);

  masked_col2im_forward_cuda(columns, mask_rows, mask_cols, image, attr_height,
                             attr_width, attr_channels);
}

// Registers the parrots operator `masked_im2col_forward`: attributes
// kernel_h, kernel_w, pad_h, pad_w; 3 inputs, 1 output; handled by
// masked_im2col_forward_cuda_parrots.
PARROTS_EXTENSION_REGISTER(masked_im2col_forward)
    .attr("kernel_h")
    .attr("kernel_w")
    .attr("pad_h")
    .attr("pad_w")
    .input(3)
    .output(1)
    .apply(masked_im2col_forward_cuda_parrots)
    .done();

// Registers the parrots operator `masked_col2im_forward`: attributes height,
// width, channels; 3 inputs, 1 output; handled by
// masked_col2im_forward_cuda_parrots.
PARROTS_EXTENSION_REGISTER(masked_col2im_forward)
    .attr("height")
    .attr("width")
    .attr("channels")
    .input(3)
    .output(1)
    .apply(masked_col2im_forward_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/masked_conv2d_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef MASKED_CONV2D_PYTORCH_H
#define MASKED_CONV2D_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Masked im2col entry point used by the parrots wrapper; the result is
// delivered through `col` (the function returns void).
// NOTE(review): the dispatch source for this op defines
// `masked_im2col_forward` (no `_cuda` suffix) -- confirm which translation
// unit provides this symbol.
void masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor col,
                                const int kernel_h, const int kernel_w,
                                const int pad_h, const int pad_w);

// Masked col2im entry point used by the parrots wrapper; the result is
// delivered through `im` (the function returns void).
void masked_col2im_forward_cuda(const Tensor col, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor im, int height,
                                int width, int channels);
#endif  // MASKED_CONV2D_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/min_area_polygons.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatch shim: hands both arguments to DISPATCH_DEVICE_IMPL under
// the key `min_area_polygons_impl`.
// NOTE(review): presumably selects the implementation registered for the
// tensors' device -- confirm in pytorch_device_registry.hpp.
void min_area_polygons_impl(const Tensor pointsets, Tensor polygons) {
  DISPATCH_DEVICE_IMPL(min_area_polygons_impl, pointsets, polygons);
}

// Public entry point of the min_area_polygons op. Forwards both arguments
// unchanged to min_area_polygons_impl; the result is written by the callee
// into `polygons`.
void min_area_polygons(const Tensor pointsets, Tensor polygons) {
  min_area_polygons_impl(pointsets, polygons);
}


================================================
FILE: mmcv/ops/csrc/parrots/min_area_polygons_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "min_area_polygons_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots adapter for min_area_polygons.
//
// The operator has no attributes (`attr` is unused). The single operator
// input and the single operator output are wrapped as ATen tensors and
// passed to min_area_polygons, which fills the output in place.
void min_area_polygons_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                    const OperatorBase::in_list_t& ins,
                                    OperatorBase::out_list_t& outs) {
  // Positional operands: ins[0] = pointsets, outs[0] = polygons.
  auto point_sets = buildATensor(ctx, ins[0]);
  auto polygon_out = buildATensor(ctx, outs[0]);

  min_area_polygons(point_sets, polygon_out);
}

// Registers the parrots operator `min_area_polygons`: no attributes,
// 1 input, 1 output; handled by min_area_polygons_cuda_parrots.
PARROTS_EXTENSION_REGISTER(min_area_polygons)
    .input(1)
    .output(1)
    .apply(min_area_polygons_cuda_parrots)
    .done();

#endif


================================================
FILE: mmcv/ops/csrc/parrots/min_area_polygons_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef MIN_AREA_POLYGONS_PYTORCH_H
#define MIN_AREA_POLYGONS_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Entry point of the min_area_polygons op; the result is delivered through
// `polygons` (the function returns void).
void min_area_polygons(const Tensor pointsets, Tensor polygons);

#endif  // MIN_AREA_POLYGONS_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/modulated_deform_conv.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatch shim for the modulated deformable im2col step: hands all
// arguments, in order, to DISPATCH_DEVICE_IMPL under the key
// `modulated_deformable_im2col_impl`. `data_col` is the only non-const
// tensor and is where the callee is expected to write.
// NOTE(review): presumably selects the implementation registered for the
// tensors' device -- confirm in pytorch_device_registry.hpp.
void modulated_deformable_im2col_impl(
    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor data_col) {
  DISPATCH_DEVICE_IMPL(modulated_deformable_im2col_impl, data_im, data_offset,
                       data_mask, batch_size, channels, height_im, width_im,
                       height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,
                       stride_h, stride_w, dilation_h, dilation_w,
                       deformable_group, data_col);
}

// Device-dispatch shim for the modulated deformable col2im step: hands all
// arguments, in order, to DISPATCH_DEVICE_IMPL under the key
// `modulated_deformable_col2im_impl`. `grad_im` is the only non-const tensor
// and is where the callee is expected to write.
// NOTE(review): presumably selects the implementation registered for the
// tensors' device -- confirm in pytorch_device_registry.hpp.
void modulated_deformable_col2im_impl(
    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor grad_im) {
  DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_impl, data_col, data_offset,
                       data_mask, batch_size, channels, height_im, width_im,
                       height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,
                       stride_h, stride_w, dilation_h, dilation_w,
                       deformable_group, grad_im);
}

// Device-dispatch shim for the coordinate-gradient step: hands all
// arguments, in order, to DISPATCH_DEVICE_IMPL under the key
// `modulated_deformable_col2im_coord_impl`. `grad_offset` and `grad_mask`
// are the non-const tensors and are where the callee is expected to write.
// NOTE(review): presumably selects the implementation registered for the
// tensors' device -- confirm in pytorch_device_registry.hpp.
void modulated_deformable_col2im_coord_impl(
    const Tensor data_col, const Tensor data_im, const Tensor data_offset,
    const Tensor data_mask, const int batch_size, const int channels,
    const int height_im, const int width_im, const int height_col,
    const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int deformable_group,
    Tensor grad_offset, Tensor grad_mask) {
  DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, data_col,
                       data_im, data_offset, data_mask, batch_size, channels,
                       height_im, width_im, height_col, width_col, kernel_h,
                       kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
                       dilation_w, deformable_group, grad_offset, grad_mask);
}

// Forward pass of modulated deformable convolution.
//
// For every batch element the input is unfolded into `columns` with
// modulated_deformable_im2col_impl, then each group's flattened weight is
// multiplied with its slice of the columns and accumulated into `output`.
// The bias is added at the end when `with_bias` is true.
//
// Parameters (as indexed by the code below):
//   input   - indexed as (batch, channels, height, width).
//   weight  - indexed as (channels_out, channels / group, kernel_h, kernel_w).
//   bias    - 1-D, broadcast over the channel dimension of `output`.
//   ones    - re-created locally when too small; not read afterwards here.
//   offset, mask - indexed per batch element and passed to im2col.
//   output  - viewed as (batch, channels_out, height_out, width_out), zeroed
//             and written through the shared storage.
//   columns - scratch buffer; re-allocated here, so the caller's tensor is
//             not touched.
//   kernel_* / stride_* / pad_* / dilation_* / group / deformable_group -
//             convolution hyper-parameters.
//
// Raises (via AT_ERROR) when the requested kernel size differs from the
// weight's spatial size, or when `channels != weight.size(1) * group`.
void modulated_deform_conv_forward(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias) {
  at::DeviceGuard guard(input.device());

  const int batch = input.size(0);
  const int channels = input.size(1);
  const int height = input.size(2);
  const int width = input.size(3);

  const int channels_out = weight.size(0);
  const int channels_kernel = weight.size(1);
  const int kernel_h_ = weight.size(2);
  const int kernel_w_ = weight.size(3);

  // Report the requested kernel size first and the weight's size second.
  // (The first argument used to be kernel_h_, which printed the weight's
  // height on both sides of "vs".)
  // NOTE(review): upstream PyTorch's AT_ERROR concatenates its arguments
  // instead of printf-formatting them, so the %d placeholders are probably
  // emitted literally -- confirm against the torch compat header in use.
  if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
    AT_ERROR("Input shape and kernel shape won't match: (%d x %d vs %d x %d).",
             kernel_h, kernel_w, kernel_h_, kernel_w_);
  if (channels != channels_kernel * group)
    AT_ERROR("Input shape and kernel channels won't match: (%d vs %d).",
             channels, channels_kernel * group);

  // Standard convolution output-size formula.
  const int height_out =
      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int width_out =
      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;

  if (ones.ndimension() != 2 ||
      ones.size(0) * ones.size(1) < height_out * width_out) {
    // Resize plane and fill with ones...
    ones = at::ones({height_out, width_out}, input.options());
  }

  // resize output
  output = output.view({batch, channels_out, height_out, width_out}).zero_();
  // resize temporary columns
  columns =
      at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out},
                input.options());

  // Split the output channels into groups: (batch, group, C_out / group, H, W).
  output = output.view({output.size(0), group, output.size(1) / group,
                        output.size(2), output.size(3)});

  for (int b = 0; b < batch; b++) {
    // Unfold one batch element (batch_size argument is 1) into `columns`.
    modulated_deformable_im2col_impl(
        input[b], offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, columns);

    // divide into group
    weight = weight.view({group, weight.size(0) / group, weight.size(1),
                          weight.size(2), weight.size(3)});
    columns = columns.view({group, columns.size(0) / group, columns.size(1)});

    for (int g = 0; g < group; g++) {
      // output[b][g] += weight[g] (flattened) x columns[g]
      output[b][g] = output[b][g]
                         .flatten(1)
                         .addmm_(weight[g].flatten(1), columns[g])
                         .view_as(output[b][g]);
    }

    // Restore the ungrouped views so the next iteration starts from the
    // same layout.
    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
                          weight.size(3), weight.size(4)});
    columns =
        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
  }

  // Merge the group dimension back into the channel dimension.
  output = output.view({output.size(0), output.size(1) * output.size(2),
                        output.size(3), output.size(4)});

  if (with_bias) {
    output += bias.view({1, bias.size(0), 1, 1});
  }
}

// Backward pass of modulated deformable convolution.
//
// For every sample in the batch this routine:
//   1. builds the column gradient from `weight` and `grad_output`,
//   2. calls modulated_deformable_col2im_coord_impl with grad_offset[b] and
//      grad_mask[b] (gradient w.r.t. the sampling coordinates),
//   3. calls modulated_deformable_col2im_impl with grad_input[b]
//      (gradient w.r.t. the input data),
//   4. recomputes the im2col columns and accumulates them into `grad_weight`
//      and, when `with_bias` is set, into `grad_bias`.
//
// `grad_weight` and `grad_bias` are accumulated with addmm_ across batch and
// group, so they are presumably expected to be zero-initialised by the
// caller -- TODO confirm against the Python wrapper.
//
// Raises (via AT_ERROR) when the kernel size arguments disagree with the
// spatial size of `weight`, or when the input channel count does not equal
// weight.size(1) * group.
void modulated_deform_conv_backward(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
    const bool with_bias) {
  at::DeviceGuard guard(input.device());

  const int batch = input.size(0);
  const int channels = input.size(1);
  const int height = input.size(2);
  const int width = input.size(3);

  const int channels_kernel = weight.size(1);
  const int kernel_h_ = weight.size(2);
  const int kernel_w_ = weight.size(3);
  // Report the caller-supplied kernel size first, then the size found in
  // `weight`. (The first argument used to be kernel_h_, which hid the
  // offending value.)
  // NOTE(review): AT_ERROR may concatenate its arguments instead of applying
  // printf-style formatting -- confirm and adjust the message if so.
  if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
    AT_ERROR("Input shape and kernel shape won't match: (%d x %d vs %d x %d).",
             kernel_h, kernel_w, kernel_h_, kernel_w_);
  if (channels != channels_kernel * group)
    AT_ERROR("Input shape and kernel channels won't match: (%d vs %d).",
             channels, channels_kernel * group);

  // Standard convolution output size for the given padding/dilation/stride.
  const int height_out =
      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int width_out =
      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;

  if (ones.ndimension() != 2 ||
      ones.size(0) * ones.size(1) < height_out * width_out) {
    // Resize plane and fill with ones...
    ones = at::ones({height_out, width_out}, input.options());
  }

  grad_input = grad_input.view({batch, channels, height, width});
  columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out},
                      input.options());

  // Expose the group dimension: (N, group, C_out / group, H_out, W_out).
  grad_output =
      grad_output.view({grad_output.size(0), group, grad_output.size(1) / group,
                        grad_output.size(2), grad_output.size(3)});

  for (int b = 0; b < batch; b++) {
    // divide into group
    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
    weight = weight.view({group, weight.size(0) / group, weight.size(1),
                          weight.size(2), weight.size(3)});

    // beta = 0 overwrites columns[g] with weight[g]^T * grad_output[b][g].
    for (int g = 0; g < group; g++) {
      columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
                        grad_output[b][g].flatten(1), 0.0f, 1.0f);
    }

    // Merge the group dimension back before calling the kernels.
    columns =
        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
                          weight.size(3), weight.size(4)});

    // gradient w.r.t. input coordinate data
    modulated_deformable_col2im_coord_impl(
        columns, input[b], offset[b], mask[b], 1, channels, height, width,
        height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
        stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
        grad_mask[b]);
    // gradient w.r.t. input data
    modulated_deformable_col2im_impl(
        columns, offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, grad_input[b]);

    // gradient w.r.t. weight, dWeight should accumulate across the batch and
    // group
    modulated_deformable_im2col_impl(
        input[b], offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, columns);

    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
    grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
                                    grad_weight.size(1), grad_weight.size(2),
                                    grad_weight.size(3)});
    if (with_bias)
      grad_bias = grad_bias.view({group, grad_bias.size(0) / group});

    for (int g = 0; g < group; g++) {
      // grad_weight[g] += grad_output[b][g] * columns[g]^T
      grad_weight[g] =
          grad_weight[g]
              .flatten(1)
              .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))
              .view_as(grad_weight[g]);
      if (with_bias) {
        // Multiplying by a column of ones sums grad_output over its
        // flattened trailing dimensions.
        grad_bias[g] =
            grad_bias[g]
                .view({-1, 1})
                .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))
                .view(-1);
      }
    }

    // Restore the ungrouped layouts for the next iteration.
    columns =
        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
    grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1),
                                    grad_weight.size(2), grad_weight.size(3),
                                    grad_weight.size(4)});
    if (with_bias)
      grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});
  }
  grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1),
                                  grad_output.size(2), grad_output.size(3),
                                  grad_output.size(4)});
}


================================================
FILE: mmcv/ops/csrc/parrots/modulated_deform_conv_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "modulated_deform_conv_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots entry point (CudaContext) for the modulated deformable convolution
// forward pass: unpacks attributes and tensors, then calls
// modulated_deform_conv_forward.
void modulated_deform_conv_forward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  // Convolution hyper-parameters, filled from the operator attributes.
  int kernel_h, kernel_w;
  int stride_h, stride_w;
  int pad_h, pad_w;
  int dilation_h, dilation_w;
  int group, deformable_group;
  int with_bias;  // read as an int, interpreted as a flag below
  SSAttrs(attr)
      .get<int>("kernel_h", kernel_h)
      .get<int>("kernel_w", kernel_w)
      .get<int>("stride_h", stride_h)
      .get<int>("stride_w", stride_w)
      .get<int>("pad_h", pad_h)
      .get<int>("pad_w", pad_w)
      .get<int>("dilation_h", dilation_h)
      .get<int>("dilation_w", dilation_w)
      .get<int>("group", group)
      .get<int>("deformable_group", deformable_group)
      .get<int>("with_bias", with_bias)
      .done();

  // Tensors built from `ins`, in positional order.
  const auto input = buildATensor(ctx, ins[0]);
  const auto weight = buildATensor(ctx, ins[1]);
  const auto bias = buildATensor(ctx, ins[2]);
  const auto ones = buildATensor(ctx, ins[3]);
  const auto offset = buildATensor(ctx, ins[4]);
  const auto mask = buildATensor(ctx, ins[5]);

  // Tensors built from `outs`.
  auto output = buildATensor(ctx, outs[0]);
  auto columns = buildATensor(ctx, outs[1]);

  const bool has_bias = with_bias != 0;
  modulated_deform_conv_forward(input, weight, bias, ones, offset, mask, output,
                                columns, kernel_h, kernel_w, stride_h, stride_w,
                                pad_h, pad_w, dilation_h, dilation_w, group,
                                deformable_group, has_bias);
}

// Parrots entry point (CudaContext) for the modulated deformable convolution
// backward pass: unpacks attributes and tensors, then calls
// modulated_deform_conv_backward.
void modulated_deform_conv_backward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  // Convolution hyper-parameters, filled from the operator attributes.
  int kernel_h, kernel_w;
  int stride_h, stride_w;
  int pad_h, pad_w;
  int dilation_h, dilation_w;
  int group, deformable_group;
  int with_bias;  // read as an int, interpreted as a flag below
  SSAttrs(attr)
      .get<int>("kernel_h", kernel_h)
      .get<int>("kernel_w", kernel_w)
      .get<int>("stride_h", stride_h)
      .get<int>("stride_w", stride_w)
      .get<int>("pad_h", pad_h)
      .get<int>("pad_w", pad_w)
      .get<int>("dilation_h", dilation_h)
      .get<int>("dilation_w", dilation_w)
      .get<int>("group", group)
      .get<int>("deformable_group", deformable_group)
      .get<int>("with_bias", with_bias)
      .done();

  // Tensors built from `ins`, in positional order.
  const auto input = buildATensor(ctx, ins[0]);
  const auto weight = buildATensor(ctx, ins[1]);
  const auto bias = buildATensor(ctx, ins[2]);
  const auto ones = buildATensor(ctx, ins[3]);
  const auto offset = buildATensor(ctx, ins[4]);
  const auto mask = buildATensor(ctx, ins[5]);

  // Tensors built from `outs`; note that grad_output is taken from outs[6].
  auto columns = buildATensor(ctx, outs[0]);
  auto grad_input = buildATensor(ctx, outs[1]);
  auto grad_weight = buildATensor(ctx, outs[2]);
  auto grad_bias = buildATensor(ctx, outs[3]);
  auto grad_offset = buildATensor(ctx, outs[4]);
  auto grad_mask = buildATensor(ctx, outs[5]);
  auto grad_output = buildATensor(ctx, outs[6]);

  const bool has_bias = with_bias != 0;
  modulated_deform_conv_backward(
      input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
      grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
      stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
      deformable_group, has_bias);
}
#endif

// Parrots entry point (HostContext) for the modulated deformable convolution
// forward pass: unpacks attributes and tensors, then calls
// modulated_deform_conv_forward.
void modulated_deform_conv_forward_cpu_parrots(
    HostContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  // Convolution hyper-parameters, filled from the operator attributes.
  int kernel_h, kernel_w;
  int stride_h, stride_w;
  int pad_h, pad_w;
  int dilation_h, dilation_w;
  int group, deformable_group;
  int with_bias;  // read as an int, interpreted as a flag below
  SSAttrs(attr)
      .get<int>("kernel_h", kernel_h)
      .get<int>("kernel_w", kernel_w)
      .get<int>("stride_h", stride_h)
      .get<int>("stride_w", stride_w)
      .get<int>("pad_h", pad_h)
      .get<int>("pad_w", pad_w)
      .get<int>("dilation_h", dilation_h)
      .get<int>("dilation_w", dilation_w)
      .get<int>("group", group)
      .get<int>("deformable_group", deformable_group)
      .get<int>("with_bias", with_bias)
      .done();

  // Tensors built from `ins`, in positional order.
  const auto input = buildATensor(ctx, ins[0]);
  const auto weight = buildATensor(ctx, ins[1]);
  const auto bias = buildATensor(ctx, ins[2]);
  const auto ones = buildATensor(ctx, ins[3]);
  const auto offset = buildATensor(ctx, ins[4]);
  const auto mask = buildATensor(ctx, ins[5]);

  // Tensors built from `outs`.
  auto output = buildATensor(ctx, outs[0]);
  auto columns = buildATensor(ctx, outs[1]);

  const bool has_bias = with_bias != 0;
  modulated_deform_conv_forward(input, weight, bias, ones, offset, mask, output,
                                columns, kernel_h, kernel_w, stride_h, stride_w,
                                pad_h, pad_w, dilation_h, dilation_w, group,
                                deformable_group, has_bias);
}

// Parrots entry point (HostContext) for the modulated deformable convolution
// backward pass: unpacks attributes and tensors, then calls
// modulated_deform_conv_backward.
void modulated_deform_conv_backward_cpu_parrots(
    HostContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  // Convolution hyper-parameters, filled from the operator attributes.
  int kernel_h, kernel_w;
  int stride_h, stride_w;
  int pad_h, pad_w;
  int dilation_h, dilation_w;
  int group, deformable_group;
  int with_bias;  // read as an int, interpreted as a flag below
  SSAttrs(attr)
      .get<int>("kernel_h", kernel_h)
      .get<int>("kernel_w", kernel_w)
      .get<int>("stride_h", stride_h)
      .get<int>("stride_w", stride_w)
      .get<int>("pad_h", pad_h)
      .get<int>("pad_w", pad_w)
      .get<int>("dilation_h", dilation_h)
      .get<int>("dilation_w", dilation_w)
      .get<int>("group", group)
      .get<int>("deformable_group", deformable_group)
      .get<int>("with_bias", with_bias)
      .done();

  // Tensors built from `ins`, in positional order.
  const auto input = buildATensor(ctx, ins[0]);
  const auto weight = buildATensor(ctx, ins[1]);
  const auto bias = buildATensor(ctx, ins[2]);
  const auto ones = buildATensor(ctx, ins[3]);
  const auto offset = buildATensor(ctx, ins[4]);
  const auto mask = buildATensor(ctx, ins[5]);

  // Tensors built from `outs`; note that grad_output is taken from outs[6].
  auto columns = buildATensor(ctx, outs[0]);
  auto grad_input = buildATensor(ctx, outs[1]);
  auto grad_weight = buildATensor(ctx, outs[2]);
  auto grad_bias = buildATensor(ctx, outs[3]);
  auto grad_offset = buildATensor(ctx, outs[4]);
  auto grad_mask = buildATensor(ctx, outs[5]);
  auto grad_output = buildATensor(ctx, outs[6]);

  const bool has_bias = with_bias != 0;
  modulated_deform_conv_backward(
      input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
      grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
      stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
      deformable_group, has_bias);
}
// Registers the forward op with Parrots. It declares the eleven attributes
// read by the *_forward_*_parrots wrappers, input(6) and output(2). The
// HostContext wrapper is always attached; the CudaContext wrapper is attached
// only when MMCV_WITH_CUDA is defined.
PARROTS_EXTENSION_REGISTER(modulated_deform_conv_forward)
    .attr("kernel_h")
    .attr("kernel_w")
    .attr("stride_h")
    .attr("stride_w")
    .attr("pad_h")
    .attr("pad_w")
    .attr("dilation_h")
    .attr("dilation_w")
    .attr("group")
    .attr("deformable_group")
    .attr("with_bias")
    .input(6)
    .output(2)
    .apply(modulated_deform_conv_forward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(modulated_deform_conv_forward_cuda_parrots)
#endif
    .done();

// Registers the backward op with Parrots. Same attribute list as the forward
// op, with input(6) and output(7); the backward wrappers read grad_output
// from outs[6].
PARROTS_EXTENSION_REGISTER(modulated_deform_conv_backward)
    .attr("kernel_h")
    .attr("kernel_w")
    .attr("stride_h")
    .attr("stride_w")
    .attr("pad_h")
    .attr("pad_w")
    .attr("dilation_h")
    .attr("dilation_w")
    .attr("group")
    .attr("deformable_group")
    .attr("with_bias")
    .input(6)
    .output(7)
    .apply(modulated_deform_conv_backward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(modulated_deform_conv_backward_cuda_parrots)
#endif
    .done();


================================================
FILE: mmcv/ops/csrc/parrots/modulated_deform_conv_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef MODULATED_DEFORM_CONV_PYTORCH_H
#define MODULATED_DEFORM_CONV_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Forward pass of modulated deformable convolution. Declared here so the
// Parrots wrappers can call it; `with_bias` selects whether `bias` is added
// to the output.
void modulated_deform_conv_forward(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias);

// Backward pass of modulated deformable convolution. The grad_* tensors are
// the gradient buffers handed in by the caller; `with_bias` selects whether
// `grad_bias` is updated.
void modulated_deform_conv_backward(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
    const bool with_bias);
#endif  // MODULATED_DEFORM_CONV_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/ms_deform_attn.cpp
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Forwards every argument through DISPATCH_DEVICE_IMPL and returns the
// tensor produced by the selected implementation.
Tensor ms_deform_attn_impl_forward(const Tensor &value,
                                   const Tensor &spatial_shapes,
                                   const Tensor &level_start_index,
                                   const Tensor &sampling_loc,
                                   const Tensor &attn_weight,
                                   const int im2col_step) {
  Tensor attn_output = DISPATCH_DEVICE_IMPL(
      ms_deform_attn_impl_forward, value, spatial_shapes, level_start_index,
      sampling_loc, attn_weight, im2col_step);
  return attn_output;
}

// Forwards every argument through DISPATCH_DEVICE_IMPL. The three non-const
// tensor references (grad_value, grad_sampling_loc, grad_attn_weight) are
// passed on to the selected implementation; nothing is returned.
void ms_deform_attn_impl_backward(
    const Tensor &value, const Tensor &spatial_shapes,
    const Tensor &level_start_index, const Tensor &sampling_loc,
    const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,
    Tensor &grad_sampling_loc, Tensor &grad_attn_weight,
    const int im2col_step) {
  DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_backward, value, spatial_shapes,
                       level_start_index, sampling_loc, attn_weight,
                       grad_output, grad_value, grad_sampling_loc,
                       grad_attn_weight, im2col_step);
}

// Public forward entry: installs a device guard for `value`'s device and
// delegates to ms_deform_attn_impl_forward.
Tensor ms_deform_attn_forward(const Tensor &value, const Tensor &spatial_shapes,
                              const Tensor &level_start_index,
                              const Tensor &sampling_loc,
                              const Tensor &attn_weight,
                              const int im2col_step) {
  // Guard stays alive until the result has been produced.
  at::DeviceGuard guard(value.device());
  Tensor attn_output = ms_deform_attn_impl_forward(
      value, spatial_shapes, level_start_index, sampling_loc, attn_weight,
      im2col_step);
  return attn_output;
}

// Public backward entry: installs a device guard for `value`'s device and
// delegates to ms_deform_attn_impl_backward, passing the three gradient
// tensor references through unchanged.
void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,
                             const Tensor &level_start_index,
                             const Tensor &sampling_loc,
                             const Tensor &attn_weight,
                             const Tensor &grad_output, Tensor &grad_value,
                             Tensor &grad_sampling_loc,
                             Tensor &grad_attn_weight, const int im2col_step) {
  at::DeviceGuard guard(value.device());
  ms_deform_attn_impl_backward(value, spatial_shapes, level_start_index,
                               sampling_loc, attn_weight, grad_output,
                               grad_value, grad_sampling_loc, grad_attn_weight,
                               im2col_step);
}


================================================
FILE: mmcv/ops/csrc/parrots/ms_deform_attn_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <torch/extension.h>

#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
using namespace at;
using namespace parrots;

// Forward declaration of the forward entry point called by the Parrots
// wrapper below; returns the op's output tensor.
Tensor ms_deform_attn_forward(const Tensor &value, const Tensor &spatial_shapes,
                              const Tensor &level_start_index,
                              const Tensor &sampling_loc,
                              const Tensor &attn_weight, const int im2col_step);

// Forward declaration of the backward entry point called by the Parrots
// wrapper below; the three non-const references receive the gradient tensors.
void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,
                             const Tensor &level_start_index,
                             const Tensor &sampling_loc,
                             const Tensor &attn_weight,
                             const Tensor &grad_output, Tensor &grad_value,
                             Tensor &grad_sampling_loc,
                             Tensor &grad_attn_weight, const int im2col_step);

// Parrots entry point (CudaContext) for the ms_deform_attn forward pass:
// reads `im2col_step`, builds the five input tensors, runs the op and writes
// the result into outs[0].
void ms_deform_attn_forward_parrots(CudaContext &ctx, const SSElement &attr,
                                    const OperatorBase::in_list_t &ins,
                                    OperatorBase::out_list_t &outs) {
  int im2col_step;
  SSAttrs(attr).get<int>("im2col_step", im2col_step).done();

  // Tensors built from `ins`, in positional order.
  const auto value = buildATensor(ctx, ins[0]);
  const auto spatial_shapes = buildATensor(ctx, ins[1]);
  const auto level_start_index = buildATensor(ctx, ins[2]);
  const auto sampling_loc = buildATensor(ctx, ins[3]);
  const auto attn_weight = buildATensor(ctx, ins[4]);

  auto result = ms_deform_attn_forward(value, spatial_shapes,
                                       level_start_index, sampling_loc,
                                       attn_weight, im2col_step);
  updateDArray(ctx, result, outs[0]);
}

// Parrots entry point (CudaContext) for the ms_deform_attn backward pass:
// reads `im2col_step`, builds six input tensors and three gradient tensors,
// then calls ms_deform_attn_backward.
void ms_deform_attn_backward_parrots(CudaContext &ctx, const SSElement &attr,
                                     const OperatorBase::in_list_t &ins,
                                     OperatorBase::out_list_t &outs) {
  int im2col_step;
  SSAttrs(attr).get<int>("im2col_step", im2col_step).done();

  // Tensors built from `ins`, in positional order.
  const auto value = buildATensor(ctx, ins[0]);
  const auto spatial_shapes = buildATensor(ctx, ins[1]);
  const auto level_start_index = buildATensor(ctx, ins[2]);
  const auto sampling_loc = buildATensor(ctx, ins[3]);
  const auto attn_weight = buildATensor(ctx, ins[4]);
  const auto grad_output = buildATensor(ctx, ins[5]);

  // Gradient tensors built from `outs`; passed by non-const reference.
  auto grad_value = buildATensor(ctx, outs[0]);
  auto grad_sampling_loc = buildATensor(ctx, outs[1]);
  auto grad_attn_weight = buildATensor(ctx, outs[2]);

  ms_deform_attn_backward(value, spatial_shapes, level_start_index,
                          sampling_loc, attn_weight, grad_output, grad_value,
                          grad_sampling_loc, grad_attn_weight, im2col_step);
}

// Registers the forward op: one attribute, input(5), output(1). Only the
// CudaContext wrapper defined in this file is attached.
PARROTS_EXTENSION_REGISTER(ms_deform_attn_forward)
    .attr("im2col_step")
    .input(5)
    .output(1)
    .apply(ms_deform_attn_forward_parrots)
    .done();

// Registers the backward op: one attribute, input(6), output(3). Only the
// CudaContext wrapper defined in this file is attached.
PARROTS_EXTENSION_REGISTER(ms_deform_attn_backward)
    .attr("im2col_step")
    .input(6)
    .output(3)
    .apply(ms_deform_attn_backward_parrots)
    .done();


================================================
FILE: mmcv/ops/csrc/parrots/nms.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Forwards all arguments through DISPATCH_DEVICE_IMPL and returns the tensor
// produced by the selected implementation.
Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  Tensor kept = DISPATCH_DEVICE_IMPL(nms_impl, boxes, scores, iou_threshold,
                                     offset);
  return kept;
}

// Forwards all arguments through DISPATCH_DEVICE_IMPL and returns the tensor
// produced by the selected implementation.
Tensor softnms_impl(Tensor boxes, Tensor scores, Tensor dets,
                    float iou_threshold, float sigma, float min_score,
                    int method, int offset) {
  Tensor kept =
      DISPATCH_DEVICE_IMPL(softnms_impl, boxes, scores, dets, iou_threshold,
                           sigma, min_score, method, offset);
  return kept;
}

// Forwards all arguments through DISPATCH_DEVICE_IMPL and returns the nested
// index lists produced by the selected implementation.
std::vector<std::vector<int> > nms_match_impl(Tensor dets,
                                              float iou_threshold) {
  std::vector<std::vector<int> > matches =
      DISPATCH_DEVICE_IMPL(nms_match_impl, dets, iou_threshold);
  return matches;
}

// Public entry point; delegates directly to nms_impl.
Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  Tensor kept = nms_impl(boxes, scores, iou_threshold, offset);
  return kept;
}

// Public entry point; delegates directly to softnms_impl.
Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,
               float sigma, float min_score, int method, int offset) {
  Tensor kept = softnms_impl(boxes, scores, dets, iou_threshold, sigma,
                             min_score, method, offset);
  return kept;
}

// Public entry point; delegates directly to nms_match_impl.
std::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold) {
  std::vector<std::vector<int> > matches = nms_match_impl(dets, iou_threshold);
  return matches;
}


================================================
FILE: mmcv/ops/csrc/parrots/nms_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "nms_pytorch.h"

using namespace parrots;

// Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset);
// Parrots wrapper for nms, templated on the context type: reads the two
// attributes, builds the two input tensors, runs nms and writes the result
// into outs[0].
template <typename T>
void nms_parrots(T& ctx, const SSElement& attr,
                 const OperatorBase::in_list_t& ins,
                 OperatorBase::out_list_t& outs) {
  float iou_threshold;
  int offset;
  SSAttrs(attr)
      .get("iou_threshold", iou_threshold)
      .get("offset", offset)
      .done();

  at::Tensor boxes = buildATensor(ctx, ins[0]);
  at::Tensor scores = buildATensor(ctx, ins[1]);

  auto kept = nms(boxes, scores, iou_threshold, offset);
  updateDArray(ctx, kept, outs[0]);
}

/*Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,
 *                float sigma, float min_score, int method, int offset);*/
// Parrots wrapper for softnms, templated on the context type: reads the five
// attributes, builds the three input tensors, runs softnms and writes the
// result into outs[0].
template <typename T>
void softnms_parrots(T& ctx, const SSElement& attr,
                     const OperatorBase::in_list_t& ins,
                     OperatorBase::out_list_t& outs) {
  float iou_threshold;
  float sigma;
  float min_score;
  int method;
  int offset;
  SSAttrs(attr)
      .get("iou_threshold", iou_threshold)
      .get("sigma", sigma)
      .get("min_score", min_score)
      .get("method", method)
      .get("offset", offset)
      .done();

  at::Tensor boxes = buildATensor(ctx, ins[0]);
  at::Tensor scores = buildATensor(ctx, ins[1]);
  at::Tensor dets = buildATensor(ctx, ins[2]);

  auto kept = softnms(boxes, scores, dets, iou_threshold, sigma, min_score,
                      method, offset);
  updateDArray(ctx, kept, outs[0]);
}

// std::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold);
// Parrots wrapper for nms_match, templated on the context type.
//
// nms_match returns one index list per group, and the lists may differ in
// length. They are packed into a single int32 tensor of shape
// (n, longest_row); rows shorter than the longest one are padded with
// trailing zeros. The packed tensor is written into outs[0].
template <typename T>
void nms_match_parrots(T& ctx, const SSElement& attr,
                       const OperatorBase::in_list_t& ins,
                       OperatorBase::out_list_t& outs) {
  float iou_threshold;
  SSAttrs(attr).get("iou_threshold", iou_threshold).done();
  at::Tensor dets;
  dets = buildATensor(ctx, ins[0]);
  auto out = nms_match(dets, iou_threshold);

  // Use int64_t throughout: tensor sizes are int64_t, and putting a size_t in
  // a braced size list is a narrowing conversion.
  const int64_t n = static_cast<int64_t>(out.size());
  int64_t m = 0;
  for (int64_t i = 0; i < n; ++i) {
    const int64_t len = static_cast<int64_t>(out[i].size());
    if (m < len) m = len;
  }

  auto options = torch::TensorOptions().dtype(at::kInt);
  auto tensor = torch::zeros({n, m}, options);
  for (int64_t i = 0; i < n; ++i) {
    const int64_t len = static_cast<int64_t>(out[i].size());
    if (len == 0) continue;  // nothing to copy; the row stays all zeros
    // Copy into the first `len` entries of row i only. Copying a length-`len`
    // vector into the whole row would fail to broadcast whenever len != m.
    tensor.select(0, i).slice(0, 0, len).copy_(
        torch::from_blob(out[i].data(), {len}, options));
  }
  updateDArray(ctx, tensor, outs[0]);
}

/*Tensor nms_rotated(const Tensor dets, const Tensor scores, const Tensor order,
 *                    const Tensor dets_sorted, const float iou_threshold,
 *                                       const int multi_label);*/
// Parrots wrapper for nms_rotated, templated on the context type: reads the
// two attributes, builds the four input tensors, runs nms_rotated and writes
// the result into outs[0].
template <typename T>
void nms_rotated_parrots(T& ctx, const SSElement& attr,
                         const OperatorBase::in_list_t& ins,
                         OperatorBase::out_list_t& outs) {
  float iou_threshold;
  int multi_label;
  SSAttrs(attr)
      .get("iou_threshold", iou_threshold)
      .get("multi_label", multi_label)
      .done();

  at::Tensor dets = buildATensor(ctx, ins[0]);
  at::Tensor scores = buildATensor(ctx, ins[1]);
  at::Tensor order = buildATensor(ctx, ins[2]);
  at::Tensor dets_sorted = buildATensor(ctx, ins[3]);

  auto kept =
      nms_rotated(dets, scores, order, dets_sorted, iou_threshold, multi_label);
  updateDArray(ctx, kept, outs[0]);
}

// Registers `nms`: two attributes, input(2), output(1). The HostContext
// instantiation is always attached; the CudaContext one only when
// MMCV_WITH_CUDA is defined.
PARROTS_EXTENSION_REGISTER(nms)
    .attr("iou_threshold")
    .attr("offset")
    .input(2)
    .output(1)
    .apply(nms_parrots<HostContext>)
#ifdef MMCV_WITH_CUDA
    .apply(nms_parrots<CudaContext>)
#endif
    .done();

// Registers `softnms`: five attributes, input(3), output(1).
PARROTS_EXTENSION_REGISTER(softnms)
    .attr("iou_threshold")
    .attr("sigma")
    .attr("min_score")
    .attr("method")
    .attr("offset")
    .input(3)
    .output(1)
    .apply(softnms_parrots<HostContext>)
#ifdef MMCV_WITH_CUDA
    .apply(softnms_parrots<CudaContext>)
#endif
    .done();

// Registers `nms_match`: one attribute, input(1), output(1).
PARROTS_EXTENSION_REGISTER(nms_match)
    .attr("iou_threshold")
    .input(1)
    .output(1)
    .apply(nms_match_parrots<HostContext>)
#ifdef MMCV_WITH_CUDA
    .apply(nms_match_parrots<CudaContext>)
#endif
    .done();

// Registers `nms_rotated`: two attributes, input(4), output(1).
PARROTS_EXTENSION_REGISTER(nms_rotated)
    .attr("multi_label")
    .attr("iou_threshold")
    .input(4)
    .output(1)
    .apply(nms_rotated_parrots<HostContext>)
#ifdef MMCV_WITH_CUDA
    .apply(nms_rotated_parrots<CudaContext>)
#endif
    .done();


================================================
FILE: mmcv/ops/csrc/parrots/nms_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef NMS_PYTORCH_H
#define NMS_PYTORCH_H
#include <torch/extension.h>

// Declarations of the NMS entry points called by the Parrots wrappers.

// Plain NMS; returns a tensor.
at::Tensor nms(at::Tensor boxes, at::Tensor scores, float iou_threshold,
               int offset);

// Soft-NMS; takes an additional `dets` tensor and returns a tensor.
at::Tensor softnms(at::Tensor boxes, at::Tensor scores, at::Tensor dets,
                   float iou_threshold, float sigma, float min_score,
                   int method, int offset);

// NMS match; returns a list of integer lists.
std::vector<std::vector<int> > nms_match(at::Tensor dets, float iou_threshold);

// Rotated-box NMS; returns a tensor.
at::Tensor nms_rotated(const at::Tensor dets, const at::Tensor scores,
                       const at::Tensor order, const at::Tensor dets_sorted,
                       const float iou_threshold, const int multi_label);
#endif  // NMS_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/nms_rotated.cpp
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated.h
#include "pytorch_cpp_helper.hpp"

// CPU implementation, defined elsewhere. It takes only dets, scores and the
// IoU threshold.
Tensor nms_rotated_cpu(const Tensor dets, const Tensor scores,
                       const float iou_threshold);

#ifdef MMCV_WITH_CUDA
// CUDA implementation, defined elsewhere; declared only when the build
// defines MMCV_WITH_CUDA. It additionally takes order, dets_sorted and
// multi_label.
Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores,
                        const Tensor order, const Tensor dets_sorted,
                        const float iou_threshold, const int multi_label);
#endif

// Rotated-box NMS entry point.
//
// Selects the implementation from the device of `dets`: CUDA tensors go to
// nms_rotated_cuda (which also consumes `order`, `dets_sorted` and
// `multi_label`), everything else goes to nms_rotated_cpu (which uses only
// `dets`, `scores` and `iou_threshold`).
//
// Raises (via AT_ERROR) when `dets` and `scores` are not both CUDA or both
// non-CUDA tensors, or when CUDA tensors are passed to a build without
// MMCV_WITH_CUDA.
Tensor nms_rotated(const Tensor dets, const Tensor scores, const Tensor order,
                   const Tensor dets_sorted, const float iou_threshold,
                   const int multi_label) {
  // Checked unconditionally: an assert() would be compiled out under NDEBUG
  // and let mismatched devices reach the kernels.
  if (dets.device().is_cuda() != scores.device().is_cuda()) {
    AT_ERROR("dets and scores must be on the same kind of device");
  }
  if (dets.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    return nms_rotated_cuda(dets, scores, order, dets_sorted, iou_threshold,
                            multi_label);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }

  return nms_rotated_cpu(dets, scores, iou_threshold);
}


================================================
FILE: mmcv/ops/csrc/parrots/pixel_group.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
// It is modified from https://github.com/WenmuZhou/PAN.pytorch

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Forwards all arguments through DISPATCH_DEVICE_IMPL and returns the nested
// float lists produced by the selected implementation.
std::vector<std::vector<float>> pixel_group_impl(
    Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,
    Tensor kernel_contour, int kernel_region_num, float dis_threshold) {
  std::vector<std::vector<float>> groups = DISPATCH_DEVICE_IMPL(
      pixel_group_impl, score, mask, embedding, kernel_label, kernel_contour,
      kernel_region_num, dis_threshold);
  return groups;
}

// Public entry point: makes every tensor argument contiguous, then delegates
// to pixel_group_impl.
std::vector<std::vector<float>> pixel_group(
    Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,
    Tensor kernel_contour, int kernel_region_num, float distance_threshold) {
  const Tensor score_c = score.contiguous();
  const Tensor mask_c = mask.contiguous();
  const Tensor embedding_c = embedding.contiguous();
  const Tensor kernel_label_c = kernel_label.contiguous();
  const Tensor kernel_contour_c = kernel_contour.contiguous();

  std::vector<std::vector<float>> groups =
      pixel_group_impl(score_c, mask_c, embedding_c, kernel_label_c,
                       kernel_contour_c, kernel_region_num, distance_threshold);
  return groups;
}


================================================
FILE: mmcv/ops/csrc/parrots/pixel_group_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "pixel_group_pytorch.h"

using namespace parrots;
using namespace std;

// Parrots wrapper for pixel_group, templated on the context type.
//
// pixel_group returns one float list per group. The lists are serialised
// into a single float tensor of shape (1, total) laid out as
//   [len(group_0), ..., len(group_{n-1}), group_0..., ..., group_{n-1}...]
// and written into outs[0].
template <typename T>
void pixel_group_parrots(T& ctx, const SSElement& attr,
                         const OperatorBase::in_list_t& ins,
                         OperatorBase::out_list_t& outs) {
  int kernel_region_num;
  float distance_threshold;
  SSAttrs(attr)
      .get<int>("kernel_region_num", kernel_region_num)
      .get<float>("distance_threshold", distance_threshold)
      .done();
  at::Tensor score;
  at::Tensor mask;
  at::Tensor embedding;
  at::Tensor kernel_label;
  at::Tensor kernel_contour;
  score = buildATensor(ctx, ins[0]);
  mask = buildATensor(ctx, ins[1]);
  embedding = buildATensor(ctx, ins[2]);
  kernel_label = buildATensor(ctx, ins[3]);
  kernel_contour = buildATensor(ctx, ins[4]);
  auto out = pixel_group(score, mask, embedding, kernel_label, kernel_contour,
                         kernel_region_num, distance_threshold);

  // Header: the length of every group, stored as floats.
  std::vector<float> out_tensor;
  for (const auto& group_values : out)
    out_tensor.push_back(static_cast<float>(group_values.size()));
  // Payload: the groups themselves, concatenated in order.
  for (const auto& group_values : out)
    out_tensor.insert(out_tensor.end(), group_values.begin(),
                      group_values.end());

  // Tensor sizes are int64_t; convert once so the braced size lists below do
  // not contain a narrowing size_t -> int64_t conversion.
  const int64_t total = static_cast<int64_t>(out_tensor.size());
  auto options = torch::TensorOptions().dtype(at::kFloat);
  auto tensor = torch::zeros({int64_t{1}, total}, options);
  if (total > 0) {
    // from_blob only wraps the vector's memory; copy_ moves the data into
    // storage owned by `tensor`, so it outlives `out_tensor`.
    tensor.slice(0, 0, 1).copy_(
        torch::from_blob(out_tensor.data(), {total}, options));
  }
  updateDArray(ctx, tensor, outs[0]);
}

// Registers `pixel_group`: two attributes, input(5), output(1). The
// HostContext instantiation is always attached; the CudaContext one only
// when MMCV_WITH_CUDA is defined.
PARROTS_EXTENSION_REGISTER(pixel_group)
    .attr("kernel_region_num")
    .attr("distance_threshold")
    .input(5)
    .output(1)
    .apply(pixel_group_parrots<HostContext>)
#ifdef MMCV_WITH_CUDA
    .apply(pixel_group_parrots<CudaContext>)
#endif
    .done();


================================================
FILE: mmcv/ops/csrc/parrots/pixel_group_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef PIXEL_GROUP_PYTORCH_H
#define PIXEL_GROUP_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Declaration of the pixel_group op (defined outside this header).
// Takes five tensors plus an int and a float parameter and returns a list of
// float vectors, one per group.
// NOTE(review): the meaning of each returned vector's elements is not visible
// from this header - confirm against the implementation.
std::vector<std::vector<float>> pixel_group(
    Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,
    Tensor kernel_contour, int kernel_region_num, float distance_threshold);

#endif  // PIXEL_GROUP_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/points_in_boxes.cpp
================================================
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Hands all arguments to DISPATCH_DEVICE_IMPL under the key
// points_in_boxes_part_forward_impl.
// NOTE(review): the macro comes from pytorch_device_registry.hpp (not visible
// here); presumably it selects the implementation registered for the tensors'
// device - confirm.
void points_in_boxes_part_forward_impl(int batch_size, int boxes_num,
                                       int pts_num, const Tensor boxes,
                                       const Tensor pts,
                                       Tensor box_idx_of_points) {
  DISPATCH_DEVICE_IMPL(points_in_boxes_part_forward_impl, batch_size, boxes_num,
                       pts_num, boxes, pts, box_idx_of_points);
}

// Hands all arguments to DISPATCH_DEVICE_IMPL under the key
// points_in_boxes_all_forward_impl.
// NOTE(review): the macro comes from pytorch_device_registry.hpp (not visible
// here); presumably it selects the implementation registered for the tensors'
// device - confirm.
void points_in_boxes_all_forward_impl(int batch_size, int boxes_num,
                                      int pts_num, const Tensor boxes,
                                      const Tensor pts,
                                      Tensor box_idx_of_points) {
  DISPATCH_DEVICE_IMPL(points_in_boxes_all_forward_impl, batch_size, boxes_num,
                       pts_num, boxes, pts, box_idx_of_points);
}

// Entry point for the "part" points-in-boxes query. Reads the batch, box and
// point counts from the tensor shapes and forwards everything to
// points_in_boxes_part_forward_impl.
//   boxes_tensor: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
//                 coordinates; z is the bottom center of each box.
//   pts_tensor: (B, npoints, 3) [x, y, z] in LiDAR coordinates.
//   box_idx_of_points_tensor: (B, npoints), default -1.
void points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor,
                                  Tensor box_idx_of_points_tensor) {
  const int num_batches = boxes_tensor.size(0);
  const int num_boxes = boxes_tensor.size(1);
  const int num_points = pts_tensor.size(1);

  points_in_boxes_part_forward_impl(num_batches, num_boxes, num_points,
                                    boxes_tensor, pts_tensor,
                                    box_idx_of_points_tensor);
}

// Entry point for the "all" points-in-boxes query. Reads the batch, box and
// point counts from the tensor shapes and forwards everything to
// points_in_boxes_all_forward_impl.
//   boxes_tensor: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
//                 coordinates; z is the bottom center.
//   pts_tensor: (B, npoints, 3) [x, y, z] in LiDAR coordinates.
//   box_idx_of_points_tensor: (B, npoints), default -1.
void points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor,
                                 Tensor box_idx_of_points_tensor) {
  const int num_batches = boxes_tensor.size(0);
  const int num_boxes = boxes_tensor.size(1);
  const int num_points = pts_tensor.size(1);

  points_in_boxes_all_forward_impl(num_batches, num_boxes, num_points,
                                   boxes_tensor, pts_tensor,
                                   box_idx_of_points_tensor);
}


================================================
FILE: mmcv/ops/csrc/parrots/points_in_boxes_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "points_in_boxes_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA binding for points_in_boxes_part_forward.
// Wraps ins[0] (boxes), ins[1] (points) and outs[0] (per-point box index) as
// ATen tensors and calls the op. `attr` is not read.
void points_in_boxes_part_forward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  auto boxes = buildATensor(ctx, ins[0]);
  auto points = buildATensor(ctx, ins[1]);
  auto box_idx_of_points = buildATensor(ctx, outs[0]);

  points_in_boxes_part_forward(boxes, points, box_idx_of_points);
}

// Parrots CUDA binding for points_in_boxes_all_forward.
// Wraps ins[0] (boxes), ins[1] (points) and outs[0] (per-point box index) as
// ATen tensors and calls the op. `attr` is not read.
void points_in_boxes_all_forward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  auto boxes = buildATensor(ctx, ins[0]);
  auto points = buildATensor(ctx, ins[1]);
  auto box_idx_of_points = buildATensor(ctx, outs[0]);

  points_in_boxes_all_forward(boxes, points, box_idx_of_points);
}

// Registers the Parrots op points_in_boxes_part_forward (two inputs, one
// output) with its CUDA binding. Sits inside the MMCV_WITH_CUDA guard, so it
// exists only in CUDA builds.
PARROTS_EXTENSION_REGISTER(points_in_boxes_part_forward)
    .input(2)
    .output(1)
    .apply(points_in_boxes_part_forward_cuda_parrots)
    .done();

// Registers the Parrots op points_in_boxes_all_forward (two inputs, one
// output) with its CUDA binding. Sits inside the MMCV_WITH_CUDA guard, so it
// exists only in CUDA builds.
PARROTS_EXTENSION_REGISTER(points_in_boxes_all_forward)
    .input(2)
    .output(1)
    .apply(points_in_boxes_all_forward_cuda_parrots)
    .done();
#endif

// Parrots host (CPU) binding for points_in_boxes_cpu_forward.
// Wraps ins[0] (boxes), ins[1] (points) and outs[0] (point indices) as ATen
// tensors and calls the op. `attr` is not read.
void points_in_boxes_forward_cpu_parrots(HostContext& ctx,
                                         const SSElement& attr,
                                         const OperatorBase::in_list_t& ins,
                                         OperatorBase::out_list_t& outs) {
  auto boxes = buildATensor(ctx, ins[0]);
  auto points = buildATensor(ctx, ins[1]);
  auto pts_indices = buildATensor(ctx, outs[0]);

  points_in_boxes_cpu_forward(boxes, points, pts_indices);
}

// Registers the Parrots op points_in_boxes_cpu_forward (two inputs, one
// output) with its host binding. Unlike the two CUDA ops in this file, this
// registration is outside the MMCV_WITH_CUDA guard.
PARROTS_EXTENSION_REGISTER(points_in_boxes_cpu_forward)
    .input(2)
    .output(1)
    .apply(points_in_boxes_forward_cpu_parrots)
    .done();


================================================
FILE: mmcv/ops/csrc/parrots/points_in_boxes_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef POINTS_IN_BOXES_PYTORCH_H
#define POINTS_IN_BOXES_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// "Part" points-in-boxes query; results are written into
// box_idx_of_points_tensor.
void points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor,
                                  Tensor box_idx_of_points_tensor);

// "All" points-in-boxes query; results are written into
// box_idx_of_points_tensor.
void points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor,
                                 Tensor box_idx_of_points_tensor);

// CPU points-in-boxes query; results are written into pts_indices_tensor.
// NOTE(review): the definition is not part of this header - confirm where it
// lives.
void points_in_boxes_cpu_forward(Tensor boxes_tensor, Tensor pts_tensor,
                                 Tensor pts_indices_tensor);

#endif  // POINTS_IN_BOXES_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/points_in_polygons.cpp
================================================
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Hands all arguments to DISPATCH_DEVICE_IMPL under the key
// points_in_polygons_forward_impl.
// NOTE(review): the macro comes from pytorch_device_registry.hpp (not visible
// here); presumably it selects the implementation registered for the tensors'
// device - confirm.
void points_in_polygons_forward_impl(const Tensor points, const Tensor polygons,
                                     Tensor output, const int rows,
                                     const int cols) {
  DISPATCH_DEVICE_IMPL(points_in_polygons_forward_impl, points, polygons,
                       output, rows, cols);
}

// Entry point for the points-in-polygons op. Takes the row count from the
// first dimension of `points` and the column count from the first dimension
// of `polygons`, then forwards to points_in_polygons_forward_impl.
void points_in_polygons_forward(Tensor points, Tensor polygons, Tensor output) {
  const int num_points = points.size(0);
  const int num_polygons = polygons.size(0);

  points_in_polygons_forward_impl(points, polygons, output, num_points,
                                  num_polygons);
}


================================================
FILE: mmcv/ops/csrc/parrots/points_in_polygons_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "points_in_polygons_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA binding for points_in_polygons_forward.
// Wraps ins[0] (points), ins[1] (polygons) and outs[0] (result) as ATen
// tensors and calls the op. `attr` is not read.
void points_in_polygons_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                     const OperatorBase::in_list_t& ins,
                                     OperatorBase::out_list_t& outs) {
  auto pts = buildATensor(ctx, ins[0]);
  auto polys = buildATensor(ctx, ins[1]);
  auto result = buildATensor(ctx, outs[0]);

  points_in_polygons_forward(pts, polys, result);
}

// Registers the Parrots op points_in_polygons_forward (two inputs, one
// output) with its CUDA binding. Sits inside the MMCV_WITH_CUDA guard, so it
// exists only in CUDA builds.
PARROTS_EXTENSION_REGISTER(points_in_polygons_forward)
    .input(2)
    .output(1)
    .apply(points_in_polygons_cuda_parrots)
    .done();

#endif


================================================
FILE: mmcv/ops/csrc/parrots/points_in_polygons_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef POINTS_IN_POLYGONS_PYTORCH_H
#define POINTS_IN_POLYGONS_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Points-in-polygons op; the result is written into `output`.
void points_in_polygons_forward(Tensor points, Tensor polygons, Tensor output);

#endif  // POINTS_IN_POLYGONS_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/prroi_pool.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Hands all arguments to DISPATCH_DEVICE_IMPL under the key
// prroi_pool_forward_impl.
// NOTE(review): the macro comes from pytorch_device_registry.hpp (not visible
// here); presumably it selects the implementation registered for the tensors'
// device - confirm.
void prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
                             int pooled_height, int pooled_width,
                             float spatial_scale) {
  DISPATCH_DEVICE_IMPL(prroi_pool_forward_impl, input, rois, output,
                       pooled_height, pooled_width, spatial_scale);
}

// Hands all arguments to DISPATCH_DEVICE_IMPL under the key
// prroi_pool_backward_impl.
// NOTE(review): the macro comes from pytorch_device_registry.hpp (not visible
// here); presumably it selects the implementation registered for the tensors'
// device - confirm.
void prroi_pool_backward_impl(Tensor grad_output, Tensor rois,
                              Tensor grad_input, int pooled_height,
                              int pooled_width, float spatial_scale) {
  DISPATCH_DEVICE_IMPL(prroi_pool_backward_impl, grad_output, rois, grad_input,
                       pooled_height, pooled_width, spatial_scale);
}

// Hands all arguments to DISPATCH_DEVICE_IMPL under the key
// prroi_pool_coor_backward_impl.
// NOTE(review): the macro comes from pytorch_device_registry.hpp (not visible
// here); presumably it selects the implementation registered for the tensors'
// device - confirm.
void prroi_pool_coor_backward_impl(Tensor output, Tensor grad_output,
                                   Tensor input, Tensor rois, Tensor grad_rois,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale) {
  DISPATCH_DEVICE_IMPL(prroi_pool_coor_backward_impl, output, grad_output,
                       input, rois, grad_rois, pooled_height, pooled_width,
                       spatial_scale);
}

// Public forward entry for PrRoI pooling: passes every argument unchanged, in
// the same order, to prroi_pool_forward_impl.
void prroi_pool_forward(Tensor input, Tensor rois, Tensor output,
                        int pooled_height, int pooled_width,
                        float spatial_scale) {
  prroi_pool_forward_impl(input, rois, output, pooled_height, pooled_width,
                          spatial_scale);
}

// Public backward entry for PrRoI pooling: passes every argument unchanged,
// in the same order, to prroi_pool_backward_impl.
void prroi_pool_backward(Tensor grad_output, Tensor rois, Tensor grad_input,
                         int pooled_height, int pooled_width,
                         float spatial_scale) {
  prroi_pool_backward_impl(grad_output, rois, grad_input, pooled_height,
                           pooled_width, spatial_scale);
}

// Public coordinate-backward entry for PrRoI pooling: passes every argument
// unchanged, in the same order, to prroi_pool_coor_backward_impl.
void prroi_pool_coor_backward(Tensor output, Tensor grad_output, Tensor input,
                              Tensor rois, Tensor grad_rois, int pooled_height,
                              int pooled_width, float spatial_scale) {
  prroi_pool_coor_backward_impl(output, grad_output, input, rois, grad_rois,
                                pooled_height, pooled_width, spatial_scale);
}


================================================
FILE: mmcv/ops/csrc/parrots/prroi_pool_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "prroi_pool_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA binding for prroi_pool_forward.
// Reads pooled_height, pooled_width and spatial_scale from `attr`, wraps
// ins[0] (input), ins[1] (rois) and outs[0] (output) as ATen tensors, then
// calls the op.
void prroi_pool_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                     const OperatorBase::in_list_t& ins,
                                     OperatorBase::out_list_t& outs) {
  int pooled_h, pooled_w;
  float scale;
  SSAttrs(attr)
      .get<int>("pooled_height", pooled_h)
      .get<int>("pooled_width", pooled_w)
      .get<float>("spatial_scale", scale)
      .done();

  const auto features = buildATensor(ctx, ins[0]);
  const auto boxes = buildATensor(ctx, ins[1]);
  auto pooled = buildATensor(ctx, outs[0]);

  prroi_pool_forward(features, boxes, pooled, pooled_h, pooled_w, scale);
}

// Parrots CUDA binding for prroi_pool_backward.
// Reads pooled_height, pooled_width and spatial_scale from `attr`, wraps
// ins[0] (grad_output), ins[1] (rois) and outs[0] (grad_input) as ATen
// tensors, then calls the op.
void prroi_pool_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                      const OperatorBase::in_list_t& ins,
                                      OperatorBase::out_list_t& outs) {
  int pooled_h, pooled_w;
  float scale;
  SSAttrs(attr)
      .get<int>("pooled_height", pooled_h)
      .get<int>("pooled_width", pooled_w)
      .get<float>("spatial_scale", scale)
      .done();

  const auto top_grad = buildATensor(ctx, ins[0]);
  const auto boxes = buildATensor(ctx, ins[1]);
  auto bottom_grad = buildATensor(ctx, outs[0]);

  prroi_pool_backward(top_grad, boxes, bottom_grad, pooled_h, pooled_w, scale);
}

// Parrots CUDA binding for prroi_pool_coor_backward.
// Reads pooled_height, pooled_width and spatial_scale from `attr`, wraps
// ins[0..3] (output, grad_output, input, rois) and outs[0] (grad_rois) as
// ATen tensors, then calls the op.
void prroi_pool_coor_backward_cuda_parrots(CudaContext& ctx,
                                           const SSElement& attr,
                                           const OperatorBase::in_list_t& ins,
                                           OperatorBase::out_list_t& outs) {
  int pooled_h, pooled_w;
  float scale;
  SSAttrs(attr)
      .get<int>("pooled_height", pooled_h)
      .get<int>("pooled_width", pooled_w)
      .get<float>("spatial_scale", scale)
      .done();

  const auto pooled = buildATensor(ctx, ins[0]);
  const auto top_grad = buildATensor(ctx, ins[1]);
  const auto features = buildATensor(ctx, ins[2]);
  const auto boxes = buildATensor(ctx, ins[3]);
  auto boxes_grad = buildATensor(ctx, outs[0]);

  prroi_pool_coor_backward(pooled, top_grad, features, boxes, boxes_grad,
                           pooled_h, pooled_w, scale);
}

// Registers the Parrots op prroi_pool_forward: three attributes, two inputs
// and one output, backed by the CUDA binding. Sits inside the MMCV_WITH_CUDA
// guard, so it exists only in CUDA builds.
PARROTS_EXTENSION_REGISTER(prroi_pool_forward)
    .attr("pooled_height")
    .attr("pooled_width")
    .attr("spatial_scale")
    .input(2)
    .output(1)
    .apply(prroi_pool_forward_cuda_parrots)
    .done();

// Registers the Parrots op prroi_pool_backward: three attributes, two inputs
// and one output, backed by the CUDA binding. Sits inside the MMCV_WITH_CUDA
// guard, so it exists only in CUDA builds.
PARROTS_EXTENSION_REGISTER(prroi_pool_backward)
    .attr("pooled_height")
    .attr("pooled_width")
    .attr("spatial_scale")
    .input(2)
    .output(1)
    .apply(prroi_pool_backward_cuda_parrots)
    .done();

// Registers the Parrots op prroi_pool_coor_backward: three attributes, four
// inputs and one output, backed by the CUDA binding. Sits inside the
// MMCV_WITH_CUDA guard, so it exists only in CUDA builds.
PARROTS_EXTENSION_REGISTER(prroi_pool_coor_backward)
    .attr("pooled_height")
    .attr("pooled_width")
    .attr("spatial_scale")
    .input(4)
    .output(1)
    .apply(prroi_pool_coor_backward_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/prroi_pool_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef PRROI_POOL_PYTORCH_H
#define PRROI_POOL_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// PrRoI pooling forward pass; the Parrots binding passes its output buffer as
// `output`.
void prroi_pool_forward(Tensor input, Tensor rois, Tensor output,
                        int pooled_height, int pooled_width,
                        float spatial_scale);

// PrRoI pooling backward pass; the Parrots binding passes its output buffer
// as `grad_input`.
void prroi_pool_backward(Tensor grad_output, Tensor rois, Tensor grad_input,
                         int pooled_height, int pooled_width,
                         float spatial_scale);

// PrRoI pooling coordinate-backward pass; the Parrots binding passes its
// output buffer as `grad_rois`.
void prroi_pool_coor_backward(Tensor output, Tensor grad_output, Tensor input,
                              Tensor rois, Tensor grad_rois, int pooled_height,
                              int pooled_width, float spatial_scale);

#endif  // PRROI_POOL_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/psamask.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Hands all arguments to DISPATCH_DEVICE_IMPL under the key
// psamask_forward_impl.
// NOTE(review): the macro comes from pytorch_device_registry.hpp (not visible
// here); presumably it selects the implementation registered for the tensors'
// device - confirm.
void psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,
                          const int num_, const int h_feature,
                          const int w_feature, const int h_mask,
                          const int w_mask, const int half_h_mask,
                          const int half_w_mask) {
  DISPATCH_DEVICE_IMPL(psamask_forward_impl, psa_type, input, output, num_,
                       h_feature, w_feature, h_mask, w_mask, half_h_mask,
                       half_w_mask);
}

// Hands all arguments to DISPATCH_DEVICE_IMPL under the key
// psamask_backward_impl.
// NOTE(review): the macro comes from pytorch_device_registry.hpp (not visible
// here); presumably it selects the implementation registered for the tensors'
// device - confirm.
void psamask_backward_impl(const int psa_type, const Tensor grad_output,
                           Tensor grad_input, const int num_,
                           const int h_feature, const int w_feature,
                           const int h_mask, const int w_mask,
                           const int half_h_mask, const int half_w_mask) {
  DISPATCH_DEVICE_IMPL(psamask_backward_impl, psa_type, grad_output, grad_input,
                       num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
                       half_w_mask);
}

// Public forward entry for PSAMask. Forwards to psamask_forward_impl; note
// the parameter order differs: here the tensors come first and psa_type
// third, while the impl takes psa_type first.
void psamask_forward(const Tensor input, Tensor output, const int psa_type,
                     const int num_, const int h_feature, const int w_feature,
                     const int h_mask, const int w_mask, const int half_h_mask,
                     const int half_w_mask) {
  psamask_forward_impl(psa_type, input, output, num_, h_feature, w_feature,
                       h_mask, w_mask, half_h_mask, half_w_mask);
}

void psamask_backward(Tensor grad_output, const Tensor grad_input,
                      const int psa_type, const int num_, const int h_feature,
                      const int w_feature, const int h_mask, const int w_mask,
                      const int half_h_mask, const int half_w_mask) {
  psamask_backward_impl(psa_type, grad_output, grad_input, num_, h_feature,
                        w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
}


================================================
FILE: mmcv/ops/csrc/parrots/psamask_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "psamask_pytorch.h"
using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA binding for the PSAMask forward pass.
// Reads eight int attributes from `attr`, wraps ins[0] (input) and outs[0]
// (output) as ATen tensors, then calls psamask_forward_cuda.
void psamask_forward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
                                  const OperatorBase::in_list_t &ins,
                                  OperatorBase::out_list_t &outs) {
  int type, batch;
  int feat_h, feat_w;
  int mask_h, mask_w;
  int half_mask_h, half_mask_w;
  SSAttrs(attr)
      .get<int>("psa_type", type)
      .get<int>("num_", batch)
      .get<int>("h_feature", feat_h)
      .get<int>("w_feature", feat_w)
      .get<int>("h_mask", mask_h)
      .get<int>("w_mask", mask_w)
      .get<int>("half_h_mask", half_mask_h)
      .get<int>("half_w_mask", half_mask_w)
      .done();

  const auto src = buildATensor(ctx, ins[0]);
  auto dst = buildATensor(ctx, outs[0]);

  psamask_forward_cuda(type, src, dst, batch, feat_h, feat_w, mask_h, mask_w,
                       half_mask_h, half_mask_w);
}

// Parrots CUDA binding for the PSAMask backward pass.
// Reads eight int attributes from `attr`, wraps ins[0] (grad_output) and
// outs[0] (grad_input) as ATen tensors, then calls psamask_backward_cuda.
void psamask_backward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
                                   const OperatorBase::in_list_t &ins,
                                   OperatorBase::out_list_t &outs) {
  int type, batch;
  int feat_h, feat_w;
  int mask_h, mask_w;
  int half_mask_h, half_mask_w;
  SSAttrs(attr)
      .get<int>("psa_type", type)
      .get<int>("num_", batch)
      .get<int>("h_feature", feat_h)
      .get<int>("w_feature", feat_w)
      .get<int>("h_mask", mask_h)
      .get<int>("w_mask", mask_w)
      .get<int>("half_h_mask", half_mask_h)
      .get<int>("half_w_mask", half_mask_w)
      .done();

  const auto top_grad = buildATensor(ctx, ins[0]);
  auto bottom_grad = buildATensor(ctx, outs[0]);

  psamask_backward_cuda(type, top_grad, bottom_grad, batch, feat_h, feat_w,
                        mask_h, mask_w, half_mask_h, half_mask_w);
}
#endif

// Parrots host (CPU) binding for the PSAMask forward pass.
// Reads eight int attributes from `attr`, wraps ins[0] (input) and outs[0]
// (output) as ATen tensors, then calls psamask_forward_cpu.
void psamask_forward_cpu_parrots(HostContext &ctx, const SSElement &attr,
                                 const OperatorBase::in_list_t &ins,
                                 OperatorBase::out_list_t &outs) {
  int type, batch;
  int feat_h, feat_w;
  int mask_h, mask_w;
  int half_mask_h, half_mask_w;
  SSAttrs(attr)
      .get<int>("psa_type", type)
      .get<int>("num_", batch)
      .get<int>("h_feature", feat_h)
      .get<int>("w_feature", feat_w)
      .get<int>("h_mask", mask_h)
      .get<int>("w_mask", mask_w)
      .get<int>("half_h_mask", half_mask_h)
      .get<int>("half_w_mask", half_mask_w)
      .done();

  const auto src = buildATensor(ctx, ins[0]);
  auto dst = buildATensor(ctx, outs[0]);

  psamask_forward_cpu(type, src, dst, batch, feat_h, feat_w, mask_h, mask_w,
                      half_mask_h, half_mask_w);
}

// Parrots host (CPU) binding for the PSAMask backward pass.
// Reads eight int attributes from `attr`, wraps ins[0] (grad_output) and
// outs[0] (grad_input) as ATen tensors, then calls psamask_backward_cpu.
void psamask_backward_cpu_parrots(HostContext &ctx, const SSElement &attr,
                                  const OperatorBase::in_list_t &ins,
                                  OperatorBase::out_list_t &outs) {
  int type, batch;
  int feat_h, feat_w;
  int mask_h, mask_w;
  int half_mask_h, half_mask_w;
  SSAttrs(attr)
      .get<int>("psa_type", type)
      .get<int>("num_", batch)
      .get<int>("h_feature", feat_h)
      .get<int>("w_feature", feat_w)
      .get<int>("h_mask", mask_h)
      .get<int>("w_mask", mask_w)
      .get<int>("half_h_mask", half_mask_h)
      .get<int>("half_w_mask", half_mask_w)
      .done();

  const auto top_grad = buildATensor(ctx, ins[0]);
  auto bottom_grad = buildATensor(ctx, outs[0]);

  psamask_backward_cpu(type, top_grad, bottom_grad, batch, feat_h, feat_w,
                       mask_h, mask_w, half_mask_h, half_mask_w);
}

// Registers the Parrots op psamask_forward: eight attributes, one input and
// one output. The host binding is always attached; the CUDA binding is
// attached only when MMCV_WITH_CUDA is defined.
PARROTS_EXTENSION_REGISTER(psamask_forward)
    .attr("psa_type")
    .attr("num_")
    .attr("h_feature")
    .attr("w_feature")
    .attr("h_mask")
    .attr("w_mask")
    .attr("half_h_mask")
    .attr("half_w_mask")
    .input(1)
    .output(1)
    .apply(psamask_forward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(psamask_forward_cuda_parrots)
#endif
    .done();

// Registers the Parrots op psamask_backward: eight attributes, one input and
// one output. The host binding is always attached; the CUDA binding is
// attached only when MMCV_WITH_CUDA is defined.
PARROTS_EXTENSION_REGISTER(psamask_backward)
    .attr("psa_type")
    .attr("num_")
    .attr("h_feature")
    .attr("w_feature")
    .attr("h_mask")
    .attr("w_mask")
    .attr("half_h_mask")
    .attr("half_w_mask")
    .input(1)
    .output(1)
    .apply(psamask_backward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(psamask_backward_cuda_parrots)
#endif
    .done();


================================================
FILE: mmcv/ops/csrc/parrots/psamask_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef PSAMASK_PYTORCH_H
#define PSAMASK_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Device-specific PSAMask entry points used by the Parrots bindings. The CUDA
// pair is declared only when MMCV_WITH_CUDA is defined; the CPU pair is
// always declared. All four share the same parameter list, with psa_type
// first.
#ifdef MMCV_WITH_CUDA
// CUDA forward pass; the Parrots binding passes its output buffer as `output`.
void psamask_forward_cuda(const int psa_type, const Tensor input, Tensor output,
                          const int num_, const int h_feature,
                          const int w_feature, const int h_mask,
                          const int w_mask, const int half_h_mask,
                          const int half_w_mask);

// CUDA backward pass; the Parrots binding passes its output buffer as
// `grad_input`.
void psamask_backward_cuda(const int psa_type, const Tensor grad_output,
                           Tensor grad_input, const int num_,
                           const int h_feature, const int w_feature,
                           const int h_mask, const int w_mask,
                           const int half_h_mask, const int half_w_mask);
#endif
// CPU forward pass; the Parrots binding passes its output buffer as `output`.
void psamask_forward_cpu(const int psa_type, const Tensor input, Tensor output,
                         const int num_, const int h_feature,
                         const int w_feature, const int h_mask,
                         const int w_mask, const int half_h_mask,
                         const int half_w_mask);

// CPU backward pass; the Parrots binding passes its output buffer as
// `grad_input`.
void psamask_backward_cpu(const int psa_type, const Tensor grad_output,
                          Tensor grad_input, const int num_,
                          const int h_feature, const int w_feature,
                          const int h_mask, const int w_mask,
                          const int half_h_mask, const int half_w_mask);
#endif  // PSAMASK_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/riroi_align_rotated.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Hands all arguments to DISPATCH_DEVICE_IMPL under the key
// riroi_align_rotated_forward_impl.
// NOTE(review): the macro comes from pytorch_device_registry.hpp (not visible
// here); presumably it selects the implementation registered for the tensors'
// device - confirm.
void riroi_align_rotated_forward_impl(Tensor features, Tensor rois,
                                      Tensor output, int pooled_height,
                                      int pooled_width, float spatial_scale,
                                      int num_samples, int num_orientations,
                                      bool clockwise) {
  DISPATCH_DEVICE_IMPL(riroi_align_rotated_forward_impl, features, rois, output,
                       pooled_height, pooled_width, spatial_scale, num_samples,
                       num_orientations, clockwise);
}

// Hands all arguments to DISPATCH_DEVICE_IMPL under the key
// riroi_align_rotated_backward_impl.
// NOTE(review): the macro comes from pytorch_device_registry.hpp (not visible
// here); presumably it selects the implementation registered for the tensors'
// device - confirm.
void riroi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
                                       Tensor bottom_grad, int pooled_height,
                                       int pooled_width, float spatial_scale,
                                       int num_samples, int num_orientations,
                                       bool clockwise) {
  DISPATCH_DEVICE_IMPL(riroi_align_rotated_backward_impl, top_grad, rois,
                       bottom_grad, pooled_height, pooled_width, spatial_scale,
                       num_samples, num_orientations, clockwise);
}

// Public forward entry for RiRoI-align-rotated: passes every argument
// unchanged, in the same order, to riroi_align_rotated_forward_impl.
void riroi_align_rotated_forward(Tensor features, Tensor rois, Tensor output,
                                 int pooled_height, int pooled_width,
                                 float spatial_scale, int num_samples,
                                 int num_orientations, bool clockwise) {
  riroi_align_rotated_forward_impl(features, rois, output, pooled_height,
                                   pooled_width, spatial_scale, num_samples,
                                   num_orientations, clockwise);
}

// Public backward entry for RiRoI-align-rotated: passes every argument
// unchanged, in the same order, to riroi_align_rotated_backward_impl.
void riroi_align_rotated_backward(Tensor top_grad, Tensor rois,
                                  Tensor bottom_grad, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int num_samples, int num_orientations,
                                  bool clockwise) {
  riroi_align_rotated_backward_impl(top_grad, rois, bottom_grad, pooled_height,
                                    pooled_width, spatial_scale, num_samples,
                                    num_orientations, clockwise);
}


================================================
FILE: mmcv/ops/csrc/parrots/riroi_align_rotated_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "riroi_align_rotated_pytorch.h"
using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA binding for riroi_align_rotated_forward.
// Reads six attributes from `attr` (pooled size, spatial scale, sample count,
// orientation count, clockwise flag), wraps ins[0] (features), ins[1] (rois)
// and outs[0] (output) as ATen tensors, then calls the op.
void riroi_align_rotated_forward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  int pooled_h, pooled_w;
  float scale;
  int num_samples, num_orientations;
  bool clockwise;
  SSAttrs(attr)
      .get<int>("pooled_height", pooled_h)
      .get<int>("pooled_width", pooled_w)
      .get<float>("spatial_scale", scale)
      .get<int>("num_samples", num_samples)
      .get<int>("num_orientations", num_orientations)
      .get<bool>("clockwise", clockwise)
      .done();

  auto features = buildATensor(ctx, ins[0]);
  auto boxes = buildATensor(ctx, ins[1]);
  auto pooled = buildATensor(ctx, outs[0]);

  riroi_align_rotated_forward(features, boxes, pooled, pooled_h, pooled_w,
                              scale, num_samples, num_orientations, clockwise);
}

// Parrots CUDA binding for riroi_align_rotated_backward.
// Reads six attributes from `attr` (pooled size, spatial scale, sample count,
// orientation count, clockwise flag), wraps ins[0] (grad_output), ins[1]
// (rois) and outs[0] (grad_input) as ATen tensors, then calls the op.
void riroi_align_rotated_backward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  int pooled_h, pooled_w;
  float scale;
  int num_samples, num_orientations;
  bool clockwise;
  SSAttrs(attr)
      .get<int>("pooled_height", pooled_h)
      .get<int>("pooled_width", pooled_w)
      .get<float>("spatial_scale", scale)
      .get<int>("num_samples", num_samples)
      .get<int>("num_orientations", num_orientations)
      .get<bool>("clockwise", clockwise)
      .done();

  auto top_grad = buildATensor(ctx, ins[0]);
  auto boxes = buildATensor(ctx, ins[1]);
  auto bottom_grad = buildATensor(ctx, outs[0]);

  riroi_align_rotated_backward(top_grad, boxes, bottom_grad, pooled_h,
                               pooled_w, scale, num_samples, num_orientations,
                               clockwise);
}

// Registers the Parrots op riroi_align_rotated_forward: six attributes, two
// inputs and one output, backed by the CUDA binding. Sits inside the
// MMCV_WITH_CUDA guard, so it exists only in CUDA builds.
PARROTS_EXTENSION_REGISTER(riroi_align_rotated_forward)
    .attr("pooled_height")
    .attr("pooled_width")
    .attr("spatial_scale")
    .attr("num_samples")
    .attr("num_orientations")
    .attr("clockwise")
    .input(2)
    .output(1)
    .apply(riroi_align_rotated_forward_cuda_parrots)
    .done();

// Registers the Parrots op riroi_align_rotated_backward: six attributes, two
// inputs and one output, backed by the CUDA binding. Sits inside the
// MMCV_WITH_CUDA guard, so it exists only in CUDA builds.
PARROTS_EXTENSION_REGISTER(riroi_align_rotated_backward)
    .attr("pooled_height")
    .attr("pooled_width")
    .attr("spatial_scale")
    .attr("num_samples")
    .attr("num_orientations")
    .attr("clockwise")
    .input(2)
    .output(1)
    .apply(riroi_align_rotated_backward_cuda_parrots)
    .done();

#endif


================================================
FILE: mmcv/ops/csrc/parrots/riroi_align_rotated_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef RIROI_ALIGN_ROTATED_PYTORCH_H
#define RIROI_ALIGN_ROTATED_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// RiRoI-align-rotated forward pass; the Parrots binding passes its output
// buffer as `output`.
void riroi_align_rotated_forward(Tensor features, Tensor rois, Tensor output,
                                 int pooled_height, int pooled_width,
                                 float spatial_scale, int num_samples,
                                 int num_orientations, bool clockwise);

// RiRoI-align-rotated backward pass; the Parrots binding passes its output
// buffer as `bottom_grad`.
void riroi_align_rotated_backward(Tensor top_grad, Tensor rois,
                                  Tensor bottom_grad, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int num_samples, int num_orientations,
                                  bool clockwise);

#endif  // RIROI_ALIGN_ROTATED_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/roi_align.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Hands all arguments to DISPATCH_DEVICE_IMPL under the key
// roi_align_forward_impl.
// NOTE(review): the macro comes from pytorch_device_registry.hpp (not visible
// here); presumably it selects the implementation registered for the tensors'
// device - confirm.
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned) {
  DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y,
                       argmax_x, aligned_height, aligned_width, spatial_scale,
                       sampling_ratio, pool_mode, aligned);
}

// Hands all arguments to DISPATCH_DEVICE_IMPL under the key
// roi_align_backward_impl.
// NOTE(review): the macro comes from pytorch_device_registry.hpp (not visible
// here); presumably it selects the implementation registered for the tensors'
// device - confirm.
void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,
                             Tensor argmax_x, Tensor grad_input,
                             int aligned_height, int aligned_width,
                             float spatial_scale, int sampling_ratio,
                             int pool_mode, bool aligned) {
  DISPATCH_DEVICE_IMPL(roi_align_backward_impl, grad_output, rois, argmax_y,
                       argmax_x, grad_input, aligned_height, aligned_width,
                       spatial_scale, sampling_ratio, pool_mode, aligned);
}

// Public forward entry for RoI align: passes every argument unchanged, in the
// same order, to roi_align_forward_impl.
void roi_align_forward(Tensor input, Tensor rois, Tensor output,
                       Tensor argmax_y, Tensor argmax_x, int aligned_height,
                       int aligned_width, float spatial_scale,
                       int sampling_ratio, int pool_mode, bool aligned) {
  roi_align_forward_impl(input, rois, output, argmax_y, argmax_x,
                         aligned_height, aligned_width, spatial_scale,
                         sampling_ratio, pool_mode, aligned);
}

// Public RoIAlign backward entry point.
// Thin wrapper: passes all arguments, in the same order, to
// roi_align_backward_impl, which performs the device dispatch.
void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,
                        Tensor argmax_x, Tensor grad_input, int aligned_height,
                        int aligned_width, float spatial_scale,
                        int sampling_ratio, int pool_mode, bool aligned) {
  roi_align_backward_impl(grad_output, rois, argmax_y, argmax_x, grad_input,
                          aligned_height, aligned_width, spatial_scale,
                          sampling_ratio, pool_mode, aligned);
}


================================================
FILE: mmcv/ops/csrc/parrots/roi_align_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "roi_align_pytorch.h"
using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA handler for the roi_align_forward op.
// Reads the six op attributes, wraps the two inputs and three outputs as
// ATen tensors and calls roi_align_forward_cuda with them.
void roi_align_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                    const OperatorBase::in_list_t& ins,
                                    OperatorBase::out_list_t& outs) {
  // Attribute values; each one is filled in by the SSAttrs chain below.
  int out_h;
  int out_w;
  float scale;
  int n_samples;
  int mode;
  bool is_aligned;
  SSAttrs(attr)
      .get<int>("aligned_height", out_h)
      .get<int>("aligned_width", out_w)
      .get<float>("spatial_scale", scale)
      .get<int>("sampling_ratio", n_samples)
      .get<int>("pool_mode", mode)
      .get<bool>("aligned", is_aligned)
      .done();

  // ins = {input, rois}; outs = {output, argmax_y, argmax_x}.
  auto feat = buildATensor(ctx, ins[0]);
  auto boxes = buildATensor(ctx, ins[1]);
  auto pooled = buildATensor(ctx, outs[0]);
  auto max_y = buildATensor(ctx, outs[1]);
  auto max_x = buildATensor(ctx, outs[2]);

  roi_align_forward_cuda(feat, boxes, pooled, max_y, max_x, out_h, out_w,
                         scale, n_samples, mode, is_aligned);
}

// Parrots CUDA handler for the roi_align_backward op.
// Reads the six op attributes, wraps the four inputs and the single output
// as ATen tensors and calls roi_align_backward_cuda with them.
void roi_align_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                     const OperatorBase::in_list_t& ins,
                                     OperatorBase::out_list_t& outs) {
  // Attribute values; each one is filled in by the SSAttrs chain below.
  int out_h;
  int out_w;
  float scale;
  int n_samples;
  int mode;
  bool is_aligned;
  SSAttrs(attr)
      .get<int>("aligned_height", out_h)
      .get<int>("aligned_width", out_w)
      .get<float>("spatial_scale", scale)
      .get<int>("sampling_ratio", n_samples)
      .get<int>("pool_mode", mode)
      .get<bool>("aligned", is_aligned)
      .done();

  // ins = {grad_output, rois, argmax_y, argmax_x}; outs = {grad_input}.
  auto grad_top = buildATensor(ctx, ins[0]);
  auto boxes = buildATensor(ctx, ins[1]);
  auto max_y = buildATensor(ctx, ins[2]);
  auto max_x = buildATensor(ctx, ins[3]);
  auto grad_bottom = buildATensor(ctx, outs[0]);

  roi_align_backward_cuda(grad_top, boxes, max_y, max_x, grad_bottom, out_h,
                          out_w, scale, n_samples, mode, is_aligned);
}
#endif

// Parrots host (CPU) handler for the roi_align_forward op.
// Reads the six op attributes, wraps the two inputs and three outputs as
// ATen tensors and calls roi_align_forward_cpu with them.
void roi_align_forward_cpu_parrots(HostContext& ctx, const SSElement& attr,
                                   const OperatorBase::in_list_t& ins,
                                   OperatorBase::out_list_t& outs) {
  // Attribute values; each one is filled in by the SSAttrs chain below.
  int out_h;
  int out_w;
  float scale;
  int n_samples;
  int mode;
  bool is_aligned;
  SSAttrs(attr)
      .get<int>("aligned_height", out_h)
      .get<int>("aligned_width", out_w)
      .get<float>("spatial_scale", scale)
      .get<int>("sampling_ratio", n_samples)
      .get<int>("pool_mode", mode)
      .get<bool>("aligned", is_aligned)
      .done();

  // ins = {input, rois}; outs = {output, argmax_y, argmax_x}.
  auto feat = buildATensor(ctx, ins[0]);
  auto boxes = buildATensor(ctx, ins[1]);
  auto pooled = buildATensor(ctx, outs[0]);
  auto max_y = buildATensor(ctx, outs[1]);
  auto max_x = buildATensor(ctx, outs[2]);

  roi_align_forward_cpu(feat, boxes, pooled, max_y, max_x, out_h, out_w, scale,
                        n_samples, mode, is_aligned);
}

// Parrots host (CPU) handler for the roi_align_backward op.
// Reads the six op attributes, wraps the four inputs and the single output
// as ATen tensors and calls roi_align_backward_cpu with them.
void roi_align_backward_cpu_parrots(HostContext& ctx, const SSElement& attr,
                                    const OperatorBase::in_list_t& ins,
                                    OperatorBase::out_list_t& outs) {
  // Attribute values; each one is filled in by the SSAttrs chain below.
  int out_h;
  int out_w;
  float scale;
  int n_samples;
  int mode;
  bool is_aligned;
  SSAttrs(attr)
      .get<int>("aligned_height", out_h)
      .get<int>("aligned_width", out_w)
      .get<float>("spatial_scale", scale)
      .get<int>("sampling_ratio", n_samples)
      .get<int>("pool_mode", mode)
      .get<bool>("aligned", is_aligned)
      .done();

  // ins = {grad_output, rois, argmax_y, argmax_x}; outs = {grad_input}.
  auto grad_top = buildATensor(ctx, ins[0]);
  auto boxes = buildATensor(ctx, ins[1]);
  auto max_y = buildATensor(ctx, ins[2]);
  auto max_x = buildATensor(ctx, ins[3]);
  auto grad_bottom = buildATensor(ctx, outs[0]);

  roi_align_backward_cpu(grad_top, boxes, max_y, max_x, grad_bottom, out_h,
                         out_w, scale, n_samples, mode, is_aligned);
}

// Registers the parrots op "roi_align_forward": six attributes, two inputs
// and three outputs. The attribute names must match the keys read by the
// *_parrots handlers above. The CPU handler is always attached; the CUDA
// handler is attached only when MMCV_WITH_CUDA is defined.
PARROTS_EXTENSION_REGISTER(roi_align_forward)
    .attr("aligned_height")
    .attr("aligned_width")
    .attr("spatial_scale")
    .attr("sampling_ratio")
    .attr("pool_mode")
    .attr("aligned")
    .input(2)
    .output(3)
    .apply(roi_align_forward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(roi_align_forward_cuda_parrots)
#endif
    .done();

// Registers the parrots op "roi_align_backward": six attributes, four inputs
// and one output. The attribute names must match the keys read by the
// *_parrots handlers above. The CPU handler is always attached; the CUDA
// handler is attached only when MMCV_WITH_CUDA is defined.
PARROTS_EXTENSION_REGISTER(roi_align_backward)
    .attr("aligned_height")
    .attr("aligned_width")
    .attr("spatial_scale")
    .attr("sampling_ratio")
    .attr("pool_mode")
    .attr("aligned")
    .input(4)
    .output(1)
    .apply(roi_align_backward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(roi_align_backward_cuda_parrots)
#endif
    .done();


================================================
FILE: mmcv/ops/csrc/parrots/roi_align_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ROI_ALIGN_PYTORCH_H
#define ROI_ALIGN_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Device-specific RoIAlign entry points called by the parrots handlers in
// roi_align_parrots.cpp. Only declarations live here; the definitions are
// provided elsewhere.

// CUDA variants; declared only when MMCV_WITH_CUDA is defined.
#ifdef MMCV_WITH_CUDA
void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned);

void roi_align_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax_y,
                             Tensor argmax_x, Tensor grad_input,
                             int aligned_height, int aligned_width,
                             float spatial_scale, int sampling_ratio,
                             int pool_mode, bool aligned);
#endif

// CPU variants; always declared.
void roi_align_forward_cpu(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax_y, Tensor argmax_x, int aligned_height,
                           int aligned_width, float spatial_scale,
                           int sampling_ratio, int pool_mode, bool aligned);

void roi_align_backward_cpu(Tensor grad_output, Tensor rois, Tensor argmax_y,
                            Tensor argmax_x, Tensor grad_input,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned);

#endif  // ROI_ALIGN_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/roi_align_rotated.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatching entry point for the rotated RoIAlign forward pass.
// Every argument is handed unchanged to DISPATCH_DEVICE_IMPL, keyed by this
// function's own name; the macro (pytorch_device_registry.hpp, not shown
// here) is expected to pick the device-specific implementation.
void roi_align_rotated_forward_impl(Tensor features, Tensor rois, Tensor output,
                                    int aligned_height, int aligned_width,
                                    float spatial_scale, int sample_ratio,
                                    bool aligned, bool clockwise) {
  DISPATCH_DEVICE_IMPL(roi_align_rotated_forward_impl, features, rois, output,
                       aligned_height, aligned_width, spatial_scale,
                       sample_ratio, aligned, clockwise);
}

// Device-dispatching entry point for the rotated RoIAlign backward pass.
// Every argument is handed unchanged to DISPATCH_DEVICE_IMPL, keyed by this
// function's own name; the macro (pytorch_device_registry.hpp, not shown
// here) is expected to pick the device-specific implementation.
void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
                                     Tensor bottom_grad, int aligned_height,
                                     int aligned_width, float spatial_scale,
                                     int sample_ratio, bool aligned,
                                     bool clockwise) {
  DISPATCH_DEVICE_IMPL(roi_align_rotated_backward_impl, top_grad, rois,
                       bottom_grad, aligned_height, aligned_width,
                       spatial_scale, sample_ratio, aligned, clockwise);
}

// Public rotated RoIAlign forward entry point.
// Thin wrapper: passes all arguments, in the same order, to
// roi_align_rotated_forward_impl (whose parameters are named `features` and
// `sample_ratio` instead of `input` and `sampling_ratio`).
void roi_align_rotated_forward(Tensor input, Tensor rois, Tensor output,
                               int aligned_height, int aligned_width,
                               float spatial_scale, int sampling_ratio,
                               bool aligned, bool clockwise) {
  roi_align_rotated_forward_impl(input, rois, output, aligned_height,
                                 aligned_width, spatial_scale, sampling_ratio,
                                 aligned, clockwise);
}

// Public rotated RoIAlign backward entry point.
// Thin wrapper: passes all arguments, in the same order, to
// roi_align_rotated_backward_impl, which performs the device dispatch.
void roi_align_rotated_backward(Tensor top_grad, Tensor rois,
                                Tensor bottom_grad, int aligned_height,
                                int aligned_width, float spatial_scale,
                                int sampling_ratio, bool aligned,
                                bool clockwise) {
  roi_align_rotated_backward_impl(top_grad, rois, bottom_grad, aligned_height,
                                  aligned_width, spatial_scale, sampling_ratio,
                                  aligned, clockwise);
}


================================================
FILE: mmcv/ops/csrc/parrots/roi_align_rotated_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "roi_align_rotated_pytorch.h"
using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA handler for the roi_align_rotated_forward op.
// Reads the six op attributes, wraps the two inputs and the single output as
// ATen tensors and calls roi_align_rotated_forward_cuda with them.
void roi_align_rotated_forward_cuda_parrots(CudaContext& ctx,
                                            const SSElement& attr,
                                            const OperatorBase::in_list_t& ins,
                                            OperatorBase::out_list_t& outs) {
  // Attribute values; each one is filled in by the SSAttrs chain below.
  int out_h;
  int out_w;
  float scale;
  int n_samples;
  bool is_aligned;
  bool is_clockwise;
  SSAttrs(attr)
      .get<int>("pooled_height", out_h)
      .get<int>("pooled_width", out_w)
      .get<float>("spatial_scale", scale)
      .get<int>("sampling_ratio", n_samples)
      .get<bool>("aligned", is_aligned)
      .get<bool>("clockwise", is_clockwise)
      .done();

  // ins = {input, rois}; outs = {output}.
  auto feat = buildATensor(ctx, ins[0]);
  auto boxes = buildATensor(ctx, ins[1]);
  auto pooled = buildATensor(ctx, outs[0]);

  roi_align_rotated_forward_cuda(feat, boxes, pooled, out_h, out_w, scale,
                                 n_samples, is_aligned, is_clockwise);
}

// Parrots CUDA handler for the roi_align_rotated_backward op.
// Reads the six op attributes, wraps the two inputs and the single output as
// ATen tensors and calls roi_align_rotated_backward_cuda with them.
void roi_align_rotated_backward_cuda_parrots(CudaContext& ctx,
                                             const SSElement& attr,
                                             const OperatorBase::in_list_t& ins,
                                             OperatorBase::out_list_t& outs) {
  // Attribute values; each one is filled in by the SSAttrs chain below.
  int out_h;
  int out_w;
  float scale;
  int n_samples;
  bool is_aligned;
  bool is_clockwise;
  SSAttrs(attr)
      .get<int>("pooled_height", out_h)
      .get<int>("pooled_width", out_w)
      .get<float>("spatial_scale", scale)
      .get<int>("sampling_ratio", n_samples)
      .get<bool>("aligned", is_aligned)
      .get<bool>("clockwise", is_clockwise)
      .done();

  // ins = {grad_output, rois}; outs = {grad_input}.
  auto grad_top = buildATensor(ctx, ins[0]);
  auto boxes = buildATensor(ctx, ins[1]);
  auto grad_bottom = buildATensor(ctx, outs[0]);

  roi_align_rotated_backward_cuda(grad_top, boxes, grad_bottom, out_h, out_w,
                                  scale, n_samples, is_aligned, is_clockwise);
}
#endif

// Parrots host (CPU) handler for the roi_align_rotated_forward op.
// Reads the six op attributes, wraps the two inputs and the single output as
// ATen tensors and calls roi_align_rotated_forward_cpu with them.
void roi_align_rotated_forward_cpu_parrots(HostContext& ctx,
                                           const SSElement& attr,
                                           const OperatorBase::in_list_t& ins,
                                           OperatorBase::out_list_t& outs) {
  // Attribute values; each one is filled in by the SSAttrs chain below.
  int out_h;
  int out_w;
  float scale;
  int n_samples;
  bool is_aligned;
  bool is_clockwise;
  SSAttrs(attr)
      .get<int>("pooled_height", out_h)
      .get<int>("pooled_width", out_w)
      .get<float>("spatial_scale", scale)
      .get<int>("sampling_ratio", n_samples)
      .get<bool>("aligned", is_aligned)
      .get<bool>("clockwise", is_clockwise)
      .done();

  // ins = {input, rois}; outs = {output}.
  auto feat = buildATensor(ctx, ins[0]);
  auto boxes = buildATensor(ctx, ins[1]);
  auto pooled = buildATensor(ctx, outs[0]);

  roi_align_rotated_forward_cpu(feat, boxes, pooled, out_h, out_w, scale,
                                n_samples, is_aligned, is_clockwise);
}

// Parrots host (CPU) handler for the roi_align_rotated_backward op.
// Reads the six op attributes, wraps the two inputs and the single output as
// ATen tensors and calls roi_align_rotated_backward_cpu with them.
void roi_align_rotated_backward_cpu_parrots(HostContext& ctx,
                                            const SSElement& attr,
                                            const OperatorBase::in_list_t& ins,
                                            OperatorBase::out_list_t& outs) {
  // Attribute values; each one is filled in by the SSAttrs chain below.
  int out_h;
  int out_w;
  float scale;
  int n_samples;
  bool is_aligned;
  bool is_clockwise;
  SSAttrs(attr)
      .get<int>("pooled_height", out_h)
      .get<int>("pooled_width", out_w)
      .get<float>("spatial_scale", scale)
      .get<int>("sampling_ratio", n_samples)
      .get<bool>("aligned", is_aligned)
      .get<bool>("clockwise", is_clockwise)
      .done();

  // ins = {grad_output, rois}; outs = {grad_input}.
  auto grad_top = buildATensor(ctx, ins[0]);
  auto boxes = buildATensor(ctx, ins[1]);
  auto grad_bottom = buildATensor(ctx, outs[0]);

  roi_align_rotated_backward_cpu(grad_top, boxes, grad_bottom, out_h, out_w,
                                 scale, n_samples, is_aligned, is_clockwise);
}

// Registers the parrots op "roi_align_rotated_forward": six attributes, two
// inputs and one output. The attribute names must match the keys read by the
// *_parrots handlers above. The CPU handler is always attached; the CUDA
// handler is attached only when MMCV_WITH_CUDA is defined.
PARROTS_EXTENSION_REGISTER(roi_align_rotated_forward)
    .attr("pooled_height")
    .attr("pooled_width")
    .attr("spatial_scale")
    .attr("sampling_ratio")
    .attr("aligned")
    .attr("clockwise")
    .input(2)
    .output(1)
    .apply(roi_align_rotated_forward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(roi_align_rotated_forward_cuda_parrots)
#endif
    .done();

// Registers the parrots op "roi_align_rotated_backward": six attributes, two
// inputs (the handlers read them as grad_output and rois) and one output.
// The CPU handler is always attached; the CUDA handler is attached only when
// MMCV_WITH_CUDA is defined.
PARROTS_EXTENSION_REGISTER(roi_align_rotated_backward)
    .attr("pooled_height")
    .attr("pooled_width")
    .attr("spatial_scale")
    .attr("sampling_ratio")
    .attr("aligned")
    .attr("clockwise")
    .input(2)
    .output(1)
    .apply(roi_align_rotated_backward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(roi_align_rotated_backward_cuda_parrots)
#endif
    .done();


================================================
FILE: mmcv/ops/csrc/parrots/roi_align_rotated_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ROI_ALIGN_ROTATED_PYTORCH_H
#define ROI_ALIGN_ROTATED_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Device-specific rotated RoIAlign entry points called by the parrots
// handlers in roi_align_rotated_parrots.cpp. Only declarations live here;
// the definitions are provided elsewhere.

// CUDA variants; declared only when MMCV_WITH_CUDA is defined.
#ifdef MMCV_WITH_CUDA
void roi_align_rotated_forward_cuda(Tensor input, Tensor rois, Tensor output,
                                    int pooled_height, int pooled_width,
                                    float spatial_scale, int sampling_ratio,
                                    bool aligned, bool clockwise);

void roi_align_rotated_backward_cuda(Tensor grad_output, Tensor rois,
                                     Tensor bottom_grad, int pooled_height,
                                     int pooled_width, float spatial_scale,
                                     int sampling_ratio, bool aligned,
                                     bool clockwise);
#endif

// CPU variants; always declared.
void roi_align_rotated_forward_cpu(Tensor input, Tensor rois, Tensor output,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale, int sampling_ratio,
                                   bool aligned, bool clockwise);

void roi_align_rotated_backward_cpu(Tensor grad_output, Tensor rois,
                                    Tensor bottom_grad, int pooled_height,
                                    int pooled_width, float spatial_scale,
                                    int sampling_ratio, bool aligned,
                                    bool clockwise);

#endif  // ROI_ALIGN_ROTATED_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/roi_pool.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatching entry point for the RoI pooling forward pass.
// Every argument is handed unchanged to DISPATCH_DEVICE_IMPL, keyed by this
// function's own name; the macro (pytorch_device_registry.hpp, not shown
// here) is expected to pick the device-specific implementation.
void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax, int pooled_height, int pooled_width,
                           float spatial_scale) {
  DISPATCH_DEVICE_IMPL(roi_pool_forward_impl, input, rois, output, argmax,
                       pooled_height, pooled_width, spatial_scale);
}

// Device-dispatching entry point for the RoI pooling backward pass.
// Every argument is handed unchanged to DISPATCH_DEVICE_IMPL, keyed by this
// function's own name; the macro (pytorch_device_registry.hpp, not shown
// here) is expected to pick the device-specific implementation.
void roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,
                            Tensor grad_input, int pooled_height,
                            int pooled_width, float spatial_scale) {
  DISPATCH_DEVICE_IMPL(roi_pool_backward_impl, grad_output, rois, argmax,
                       grad_input, pooled_height, pooled_width, spatial_scale);
}

// Public RoI pooling forward entry point.
// Thin wrapper: passes all arguments, in the same order, to
// roi_pool_forward_impl, which performs the device dispatch.
void roi_pool_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax,
                      int pooled_height, int pooled_width,
                      float spatial_scale) {
  roi_pool_forward_impl(input, rois, output, argmax, pooled_height,
                        pooled_width, spatial_scale);
}

// Public RoI pooling backward entry point.
// Thin wrapper: passes all arguments, in the same order, to
// roi_pool_backward_impl, which performs the device dispatch.
void roi_pool_backward(Tensor grad_output, Tensor rois, Tensor argmax,
                       Tensor grad_input, int pooled_height, int pooled_width,
                       float spatial_scale) {
  roi_pool_backward_impl(grad_output, rois, argmax, grad_input, pooled_height,
                         pooled_width, spatial_scale);
}


================================================
FILE: mmcv/ops/csrc/parrots/roi_pool_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "roi_pool_pytorch.h"
using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA handler for the roi_pool_forward op.
// Reads the three op attributes, wraps the two inputs and two outputs as
// ATen tensors and calls roi_pool_forward_cuda with them.
void roi_pool_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                   const OperatorBase::in_list_t& ins,
                                   OperatorBase::out_list_t& outs) {
  // Attribute values; each one is filled in by the SSAttrs chain below.
  int out_h;
  int out_w;
  float scale;
  SSAttrs(attr)
      .get<int>("pooled_height", out_h)
      .get<int>("pooled_width", out_w)
      .get<float>("spatial_scale", scale)
      .done();

  // ins = {input, rois}; outs = {output, argmax}.
  auto feat = buildATensor(ctx, ins[0]);
  auto boxes = buildATensor(ctx, ins[1]);
  auto pooled = buildATensor(ctx, outs[0]);
  auto max_idx = buildATensor(ctx, outs[1]);

  roi_pool_forward_cuda(feat, boxes, pooled, max_idx, out_h, out_w, scale);
}

// Parrots CUDA handler for the roi_pool_backward op.
// Reads the three op attributes, wraps the three inputs and the single
// output as ATen tensors and calls roi_pool_backward_cuda with them.
void roi_pool_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                    const OperatorBase::in_list_t& ins,
                                    OperatorBase::out_list_t& outs) {
  // Attribute values; each one is filled in by the SSAttrs chain below.
  int out_h;
  int out_w;
  float scale;
  SSAttrs(attr)
      .get<int>("pooled_height", out_h)
      .get<int>("pooled_width", out_w)
      .get<float>("spatial_scale", scale)
      .done();

  // ins = {grad_output, rois, argmax}; outs = {grad_input}.
  auto grad_top = buildATensor(ctx, ins[0]);
  auto boxes = buildATensor(ctx, ins[1]);
  auto max_idx = buildATensor(ctx, ins[2]);
  auto grad_bottom = buildATensor(ctx, outs[0]);

  roi_pool_backward_cuda(grad_top, boxes, max_idx, grad_bottom, out_h, out_w,
                         scale);
}

// Registers the parrots op "roi_pool_forward": three attributes, two inputs
// and two outputs. Only a CUDA handler is attached, and this registration
// sits inside the surrounding MMCV_WITH_CUDA guard.
PARROTS_EXTENSION_REGISTER(roi_pool_forward)
    .attr("pooled_height")
    .attr("pooled_width")
    .attr("spatial_scale")
    .input(2)
    .output(2)
    .apply(roi_pool_forward_cuda_parrots)
    .done();

// Registers the parrots op "roi_pool_backward": three attributes, three
// inputs and one output. Only a CUDA handler is attached, and this
// registration sits inside the surrounding MMCV_WITH_CUDA guard.
PARROTS_EXTENSION_REGISTER(roi_pool_backward)
    .attr("pooled_height")
    .attr("pooled_width")
    .attr("spatial_scale")
    .input(3)
    .output(1)
    .apply(roi_pool_backward_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/roi_pool_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ROI_POOL_PYTORCH_H
#define ROI_POOL_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// CUDA RoI pooling entry points called by the parrots handlers in
// roi_pool_parrots.cpp. Declared only when MMCV_WITH_CUDA is defined; this
// header declares no CPU variants.
#ifdef MMCV_WITH_CUDA
void roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax, int pooled_height, int pooled_width,
                           float spatial_scale);

void roi_pool_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax,
                            Tensor grad_input, int pooled_height,
                            int pooled_width, float spatial_scale);
#endif
#endif  // ROI_POOL_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/roiaware_pool3d.cpp
================================================
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatching entry point for the RoI-aware 3D pooling forward pass.
// Takes the sizes already unpacked by roiaware_pool3d_forward plus the
// tensors, and hands everything unchanged to DISPATCH_DEVICE_IMPL, keyed by
// this function's own name. The macro (pytorch_device_registry.hpp, not
// shown here) is expected to pick the device-specific implementation.
void roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels,
                                  int max_pts_each_voxel, int out_x, int out_y,
                                  int out_z, const Tensor rois,
                                  const Tensor pts, const Tensor pts_feature,
                                  Tensor argmax, Tensor pts_idx_of_voxels,
                                  Tensor pooled_features, int pool_method) {
  DISPATCH_DEVICE_IMPL(roiaware_pool3d_forward_impl, boxes_num, pts_num,
                       channels, max_pts_each_voxel, out_x, out_y, out_z, rois,
                       pts, pts_feature, argmax, pts_idx_of_voxels,
                       pooled_features, pool_method);
}

// Device-dispatching entry point for the RoI-aware 3D pooling backward pass.
// Takes the sizes already unpacked by roiaware_pool3d_backward plus the
// tensors, and hands everything unchanged to DISPATCH_DEVICE_IMPL, keyed by
// this function's own name. The macro (pytorch_device_registry.hpp, not
// shown here) is expected to pick the device-specific implementation.
void roiaware_pool3d_backward_impl(int boxes_num, int out_x, int out_y,
                                   int out_z, int channels,
                                   int max_pts_each_voxel,
                                   const Tensor pts_idx_of_voxels,
                                   const Tensor argmax, const Tensor grad_out,
                                   Tensor grad_in, int pool_method) {
  DISPATCH_DEVICE_IMPL(roiaware_pool3d_backward_impl, boxes_num, out_x, out_y,
                       out_z, channels, max_pts_each_voxel, pts_idx_of_voxels,
                       argmax, grad_out, grad_in, pool_method);
}

// Public RoI-aware 3D pooling forward entry point.
// Derives the problem sizes from the tensor shapes, validates the output grid
// and forwards everything to roiaware_pool3d_forward_impl.
//
// params rois: (N, 7) [x, y, z, x_size, y_size, z_size, ry] in LiDAR
// coordinate
// params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
// params pts_feature: (npoints, C)
// params argmax: (N, out_x, out_y, out_z, C)
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params pooled_features: (N, out_x, out_y, out_z, C)
// params pool_method: 0: max_pool 1: avg_pool
//
// Raises (via TORCH_CHECK) when any of out_x / out_y / out_z is >= 256.
void roiaware_pool3d_forward(Tensor rois, Tensor pts, Tensor pts_feature,
                             Tensor argmax, Tensor pts_idx_of_voxels,
                             Tensor pooled_features, int pool_method) {
  int boxes_num = rois.size(0);
  int pts_num = pts.size(0);
  int channels = pts_feature.size(1);
  int max_pts_each_voxel = pts_idx_of_voxels.size(4);  // index 0 is the counter
  int out_x = pts_idx_of_voxels.size(1);
  int out_y = pts_idx_of_voxels.size(2);
  int out_z = pts_idx_of_voxels.size(3);

  // Voxel indices are encoded with 8 bits each, so every grid dimension must
  // stay below 256. A plain assert() is compiled out when NDEBUG is defined
  // (the usual release configuration), which would let oversized grids
  // through unchecked; TORCH_CHECK is always active and reports the offending
  // sizes to the caller.
  TORCH_CHECK((out_x < 256) && (out_y < 256) && (out_z < 256),
              "roiaware_pool3d_forward: out_x, out_y and out_z must each be "
              "< 256 because voxel indices are encoded with 8 bits, but got (",
              out_x, ", ", out_y, ", ", out_z, ")");

  roiaware_pool3d_forward_impl(boxes_num, pts_num, channels, max_pts_each_voxel,
                               out_x, out_y, out_z, rois, pts, pts_feature,
                               argmax, pts_idx_of_voxels, pooled_features,
                               pool_method);
}

// Public RoI-aware 3D pooling backward entry point.
// Reads the problem sizes off the tensor shapes and forwards everything to
// roiaware_pool3d_backward_impl.
//
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params argmax: (N, out_x, out_y, out_z, C)
// params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value
// params pool_method: 0: max_pool 1: avg_pool
void roiaware_pool3d_backward(Tensor pts_idx_of_voxels, Tensor argmax,
                              Tensor grad_out, Tensor grad_in,
                              int pool_method) {
  // Box count and voxel-grid extents come from pts_idx_of_voxels.
  const int num_boxes = pts_idx_of_voxels.size(0);
  const int grid_x = pts_idx_of_voxels.size(1);
  const int grid_y = pts_idx_of_voxels.size(2);
  const int grid_z = pts_idx_of_voxels.size(3);
  // Last dimension of pts_idx_of_voxels; index 0 is the counter.
  const int voxel_capacity = pts_idx_of_voxels.size(4);
  // Feature channel count is the last dimension of grad_out.
  const int num_channels = grad_out.size(4);

  roiaware_pool3d_backward_impl(num_boxes, grid_x, grid_y, grid_z,
                                num_channels, voxel_capacity,
                                pts_idx_of_voxels, argmax, grad_out, grad_in,
                                pool_method);
}


================================================
FILE: mmcv/ops/csrc/parrots/roiaware_pool3d_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "roiaware_pool3d_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA handler for the roiaware_pool3d_forward op.
// Reads the pool_method attribute, wraps the three inputs and three outputs
// as ATen tensors and calls roiaware_pool3d_forward with them.
void roiaware_pool3d_forward_cuda_parrots(CudaContext& ctx,
                                          const SSElement& attr,
                                          const OperatorBase::in_list_t& ins,
                                          OperatorBase::out_list_t& outs) {
  int method;
  SSAttrs(attr).get<int>("pool_method", method).done();

  // ins = {rois, pts, pts_feature}.
  auto boxes = buildATensor(ctx, ins[0]);
  auto points = buildATensor(ctx, ins[1]);
  auto point_feats = buildATensor(ctx, ins[2]);

  // outs = {argmax, pts_idx_of_voxels, pooled_features}.
  auto max_idx = buildATensor(ctx, outs[0]);
  auto voxel_pt_idx = buildATensor(ctx, outs[1]);
  auto pooled = buildATensor(ctx, outs[2]);

  roiaware_pool3d_forward(boxes, points, point_feats, max_idx, voxel_pt_idx,
                          pooled, method);
}

// Parrots CUDA handler for the roiaware_pool3d_backward op.
// Reads the pool_method attribute, wraps the three inputs and the single
// output as ATen tensors and calls roiaware_pool3d_backward with them.
void roiaware_pool3d_backward_cuda_parrots(CudaContext& ctx,
                                           const SSElement& attr,
                                           const OperatorBase::in_list_t& ins,
                                           OperatorBase::out_list_t& outs) {
  int method;
  SSAttrs(attr).get<int>("pool_method", method).done();

  // ins = {pts_idx_of_voxels, argmax, grad_out}.
  auto voxel_pt_idx = buildATensor(ctx, ins[0]);
  auto max_idx = buildATensor(ctx, ins[1]);
  auto grad_top = buildATensor(ctx, ins[2]);

  // outs = {grad_in}.
  auto grad_bottom = buildATensor(ctx, outs[0]);

  roiaware_pool3d_backward(voxel_pt_idx, max_idx, grad_top, grad_bottom,
                           method);
}

// Registers the parrots op "roiaware_pool3d_forward": one attribute
// (pool_method), three inputs and three outputs. Only a CUDA handler is
// attached, and this registration sits inside the surrounding
// MMCV_WITH_CUDA guard.
PARROTS_EXTENSION_REGISTER(roiaware_pool3d_forward)
    .attr("pool_method")
    .input(3)
    .output(3)
    .apply(roiaware_pool3d_forward_cuda_parrots)
    .done();

// Registers the parrots op "roiaware_pool3d_backward": one attribute
// (pool_method), three inputs and one output. Only a CUDA handler is
// attached, and this registration sits inside the surrounding
// MMCV_WITH_CUDA guard.
PARROTS_EXTENSION_REGISTER(roiaware_pool3d_backward)
    .attr("pool_method")
    .input(3)
    .output(1)
    .apply(roiaware_pool3d_backward_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/roiaware_pool3d_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ROIAWARE_POOL3D_PYTORCH_H
#define ROIAWARE_POOL3D_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// RoI-aware 3D pooling entry points called by the parrots handlers in
// roiaware_pool3d_parrots.cpp; both are defined in roiaware_pool3d.cpp.

// Forward pass; argmax, pts_idx_of_voxels and pooled_features are the
// tensors the parrots handler builds from its output list.
void roiaware_pool3d_forward(Tensor rois, Tensor pts, Tensor pts_feature,
                             Tensor argmax, Tensor pts_idx_of_voxels,
                             Tensor pooled_features, int pool_method);

// Backward pass; grad_in is the tensor the parrots handler builds from its
// output list.
void roiaware_pool3d_backward(Tensor pts_idx_of_voxels, Tensor argmax,
                              Tensor grad_out, Tensor grad_in, int pool_method);

#endif  // ROIAWARE_POOL3D_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/roipoint_pool3d.cpp
================================================
/*
Modified from
https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d.cpp
Point cloud feature pooling
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatching entry point for the RoI point 3D pooling forward pass.
// Takes the sizes already unpacked by roipoint_pool3d_forward plus the
// tensors, and hands everything unchanged to DISPATCH_DEVICE_IMPL, keyed by
// this function's own name. The macro (pytorch_device_registry.hpp, not
// shown here) is expected to pick the device-specific implementation.
void roipoint_pool3d_forward_impl(int batch_size, int pts_num, int boxes_num,
                                  int feature_in_len, int sampled_pts_num,
                                  const Tensor xyz, const Tensor boxes3d,
                                  const Tensor pts_feature,
                                  Tensor pooled_features,
                                  Tensor pooled_empty_flag) {
  DISPATCH_DEVICE_IMPL(roipoint_pool3d_forward_impl, batch_size, pts_num,
                       boxes_num, feature_in_len, sampled_pts_num, xyz, boxes3d,
                       pts_feature, pooled_features, pooled_empty_flag);
}

// Public RoI point 3D pooling forward entry point.
// Reads the problem sizes off the tensor shapes and forwards everything to
// roipoint_pool3d_forward_impl.
//
// params xyz: (B, N, 3)
// params boxes3d: (B, M, 7)
// params pts_feature: (B, N, C)
// params pooled_features: (B, M, 512, 3+C)
// params pooled_empty_flag: (B, M)
void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature,
                             Tensor pooled_features, Tensor pooled_empty_flag) {
  // B and N come from the point coordinates.
  const int num_batches = xyz.size(0);
  const int num_points = xyz.size(1);
  // M: boxes per batch element.
  const int num_boxes = boxes3d.size(1);
  // C: feature length of each input point.
  const int feat_len = pts_feature.size(2);
  // Number of sampled points per box, taken from the output tensor.
  const int num_sampled = pooled_features.size(2);

  roipoint_pool3d_forward_impl(num_batches, num_points, num_boxes, feat_len,
                               num_sampled, xyz, boxes3d, pts_feature,
                               pooled_features, pooled_empty_flag);
}


================================================
FILE: mmcv/ops/csrc/parrots/roipoint_pool3d_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "roipoint_pool3d_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA handler for the roipoint_pool3d_forward op.
// The op has no attributes: the three inputs and two outputs are wrapped as
// ATen tensors and passed to roipoint_pool3d_forward.
void roipoint_pool3d_forward_cuda_parrots(CudaContext& ctx,
                                          const SSElement& attr,
                                          const OperatorBase::in_list_t& ins,
                                          OperatorBase::out_list_t& outs) {
  // ins = {xyz, boxes3d, pts_feature}.
  auto points = buildATensor(ctx, ins[0]);
  auto boxes = buildATensor(ctx, ins[1]);
  auto point_feats = buildATensor(ctx, ins[2]);

  // outs = {pooled_features, pooled_empty_flag}.
  auto pooled = buildATensor(ctx, outs[0]);
  auto empty_flag = buildATensor(ctx, outs[1]);

  roipoint_pool3d_forward(points, boxes, point_feats, pooled, empty_flag);
}

// Registers the parrots op "roipoint_pool3d_forward": no attributes, three
// inputs and two outputs. Only a CUDA handler is attached, and this
// registration sits inside the surrounding MMCV_WITH_CUDA guard.
PARROTS_EXTENSION_REGISTER(roipoint_pool3d_forward)
    .input(3)
    .output(2)
    .apply(roipoint_pool3d_forward_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/roipoint_pool3d_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ROIPOINT_POOL3D_PYTORCH_H
#define ROIPOINT_POOL3D_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Declaration of the roipoint_pool3d forward entry point called by the
// Parrots adapter. The adapter passes tensors built from its two output
// arrays as `pooled_features` and `pooled_empty_flag`; the function returns
// void.
void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature,
                             Tensor pooled_features, Tensor pooled_empty_flag);

#endif  // ROIPOINT_POOL3D_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/rotated_feature_align.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_cuda.cpp

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub for the rotated-feature-align forward pass: passes every
// argument, unchanged and in the same order, to DISPATCH_DEVICE_IMPL under
// the key `rotated_feature_align_forward_impl`.
// NOTE(review): the macro lives in pytorch_device_registry.hpp (not visible
// here); presumably it routes to the backend registered for the tensors'
// device — confirm.
void rotated_feature_align_forward_impl(const Tensor features,
                                        const Tensor best_bboxes,
                                        const float spatial_scale,
                                        const int points, Tensor output) {
  DISPATCH_DEVICE_IMPL(rotated_feature_align_forward_impl, features,
                       best_bboxes, spatial_scale, points, output);
}

// Dispatch stub for the rotated-feature-align backward pass: passes every
// argument, unchanged and in the same order, to DISPATCH_DEVICE_IMPL under
// the key `rotated_feature_align_backward_impl`.
// NOTE(review): backend selection happens inside the macro (defined in
// pytorch_device_registry.hpp, not visible here) — confirm its rules there.
void rotated_feature_align_backward_impl(const Tensor top_grad,
                                         const Tensor best_bboxes,
                                         const float spatial_scale,
                                         const int points, Tensor bottom_grad) {
  DISPATCH_DEVICE_IMPL(rotated_feature_align_backward_impl, top_grad,
                       best_bboxes, spatial_scale, points, bottom_grad);
}

// Public forward entry point. Its parameter order differs from the _impl:
// `output` is the third parameter here but is passed last when forwarding to
// rotated_feature_align_forward_impl.
void rotated_feature_align_forward(const Tensor features,
                                   const Tensor best_bboxes, Tensor output,
                                   const float spatial_scale,
                                   const int points) {
  rotated_feature_align_forward_impl(features, best_bboxes, spatial_scale,
                                     points, output);
}

// Public backward entry point. Its parameter order differs from the _impl:
// `bottom_grad` is the third parameter here but is passed last when
// forwarding to rotated_feature_align_backward_impl.
void rotated_feature_align_backward(const Tensor top_grad,
                                    const Tensor best_bboxes,
                                    Tensor bottom_grad,
                                    const float spatial_scale,
                                    const int points) {
  rotated_feature_align_backward_impl(top_grad, best_bboxes, spatial_scale,
                                      points, bottom_grad);
}


================================================
FILE: mmcv/ops/csrc/parrots/rotated_feature_align_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "rotated_feature_align_pytorch.h"
using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA adapter for rotated_feature_align_forward: reads the
// "spatial_scale" and "points" attributes, wraps ins[0], ins[1] and outs[0]
// as ATen tensors, then calls the op.
void rotated_feature_align_forward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  // Scalar attributes attached to the operator call.
  int num_points;
  float scale;
  SSAttrs(attr)
      .get<float>("spatial_scale", scale)
      .get<int>("points", num_points)
      .done();

  auto feat = buildATensor(ctx, ins[0]);
  auto bboxes = buildATensor(ctx, ins[1]);
  auto aligned = buildATensor(ctx, outs[0]);
  rotated_feature_align_forward(feat, bboxes, aligned, scale, num_points);
}

// Parrots CUDA adapter for rotated_feature_align_backward: reads the
// "spatial_scale" and "points" attributes, wraps ins[0], ins[1] and outs[0]
// as ATen tensors, then calls the op.
void rotated_feature_align_backward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  // Scalar attributes attached to the operator call.
  int num_points;
  float scale;
  SSAttrs(attr)
      .get<float>("spatial_scale", scale)
      .get<int>("points", num_points)
      .done();

  auto top_grad = buildATensor(ctx, ins[0]);
  auto bboxes = buildATensor(ctx, ins[1]);
  auto bottom_grad = buildATensor(ctx, outs[0]);
  rotated_feature_align_backward(top_grad, bboxes, bottom_grad, scale,
                                 num_points);
}
#endif

// Parrots host (CPU) adapter for rotated_feature_align_forward. Same steps
// as the CUDA adapter but takes a HostContext: read the two attributes, wrap
// ins[0], ins[1] and outs[0] as ATen tensors, call the op.
void rotated_feature_align_forward_cpu_parrots(
    HostContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  // Scalar attributes attached to the operator call.
  int num_points;
  float scale;
  SSAttrs(attr)
      .get<float>("spatial_scale", scale)
      .get<int>("points", num_points)
      .done();

  auto feat = buildATensor(ctx, ins[0]);
  auto bboxes = buildATensor(ctx, ins[1]);
  auto aligned = buildATensor(ctx, outs[0]);
  rotated_feature_align_forward(feat, bboxes, aligned, scale, num_points);
}

// Parrots host (CPU) adapter for rotated_feature_align_backward. Same steps
// as the CUDA adapter but takes a HostContext: read the two attributes, wrap
// ins[0], ins[1] and outs[0] as ATen tensors, call the op.
void rotated_feature_align_backward_cpu_parrots(
    HostContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  // Scalar attributes attached to the operator call.
  int num_points;
  float scale;
  SSAttrs(attr)
      .get<float>("spatial_scale", scale)
      .get<int>("points", num_points)
      .done();

  auto top_grad = buildATensor(ctx, ins[0]);
  auto bboxes = buildATensor(ctx, ins[1]);
  auto bottom_grad = buildATensor(ctx, outs[0]);
  rotated_feature_align_backward(top_grad, bboxes, bottom_grad, scale,
                                 num_points);
}

// Registers the `rotated_feature_align_forward` Parrots extension with the
// attributes "spatial_scale" and "points", 2 inputs and 1 output. The CPU
// adapter is always attached; the CUDA adapter is attached only when
// MMCV_WITH_CUDA is defined.
PARROTS_EXTENSION_REGISTER(rotated_feature_align_forward)
    .attr("spatial_scale")
    .attr("points")
    .input(2)
    .output(1)
    .apply(rotated_feature_align_forward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(rotated_feature_align_forward_cuda_parrots)
#endif
    .done();

// Registers the `rotated_feature_align_backward` Parrots extension with the
// same attribute/input/output layout as the forward op. The CPU adapter is
// always attached; the CUDA adapter only when MMCV_WITH_CUDA is defined.
PARROTS_EXTENSION_REGISTER(rotated_feature_align_backward)
    .attr("spatial_scale")
    .attr("points")
    .input(2)
    .output(1)
    .apply(rotated_feature_align_backward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(rotated_feature_align_backward_cuda_parrots)
#endif
    .done();


================================================
FILE: mmcv/ops/csrc/parrots/rotated_feature_align_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ROTATED_FEATURE_ALIGN_PYTORCH_H
#define ROTATED_FEATURE_ALIGN_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Forward entry point used by the Parrots adapters. The adapters pass the
// tensor built from their single output array as `output`.
void rotated_feature_align_forward(const Tensor features,
                                   const Tensor best_bboxes, Tensor output,
                                   const float spatial_scale, const int points);

// Backward entry point used by the Parrots adapters. The adapters pass the
// tensor built from their single output array as `bottom_grad`.
void rotated_feature_align_backward(const Tensor top_grad,
                                    const Tensor best_bboxes,
                                    Tensor bottom_grad,
                                    const float spatial_scale,
                                    const int points);

#endif  // ROTATED_FEATURE_ALIGN_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/sync_bn.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub: forwards (input, mean) unchanged to DISPATCH_DEVICE_IMPL
// under the key `sync_bn_forward_mean_impl`.
// NOTE(review): backend selection is done by the macro from
// pytorch_device_registry.hpp (not visible here) — confirm its rules there.
void sync_bn_forward_mean_impl(const Tensor input, Tensor mean) {
  DISPATCH_DEVICE_IMPL(sync_bn_forward_mean_impl, input, mean);
}

// Dispatch stub: forwards (input, mean, var) unchanged to
// DISPATCH_DEVICE_IMPL under the key `sync_bn_forward_var_impl`.
void sync_bn_forward_var_impl(const Tensor input, const Tensor mean,
                              Tensor var) {
  DISPATCH_DEVICE_IMPL(sync_bn_forward_var_impl, input, mean, var);
}

// Dispatch stub: forwards all thirteen arguments, unchanged and in the same
// order, to DISPATCH_DEVICE_IMPL under the key `sync_bn_forward_output_impl`.
// Note the order here is running_mean/running_var BEFORE weight/bias, which
// differs from the public sync_bn_forward_output wrapper.
void sync_bn_forward_output_impl(const Tensor input, const Tensor mean,
                                 const Tensor var, Tensor running_mean,
                                 Tensor running_var, const Tensor weight,
                                 const Tensor bias, Tensor norm, Tensor std,
                                 Tensor output, float eps, float momentum,
                                 int group_size) {
  DISPATCH_DEVICE_IMPL(sync_bn_forward_output_impl, input, mean, var,
                       running_mean, running_var, weight, bias, norm, std,
                       output, eps, momentum, group_size);
}

// Dispatch stub: forwards (grad_output, norm, grad_weight, grad_bias)
// unchanged to DISPATCH_DEVICE_IMPL under the key
// `sync_bn_backward_param_impl`.
void sync_bn_backward_param_impl(const Tensor grad_output, const Tensor norm,
                                 Tensor grad_weight, Tensor grad_bias) {
  DISPATCH_DEVICE_IMPL(sync_bn_backward_param_impl, grad_output, norm,
                       grad_weight, grad_bias);
}

// Dispatch stub: forwards all seven arguments, unchanged and in the same
// order, to DISPATCH_DEVICE_IMPL under the key `sync_bn_backward_data_impl`.
// `grad_input` is the only non-const tensor parameter.
void sync_bn_backward_data_impl(const Tensor grad_output, const Tensor weight,
                                const Tensor grad_weight,
                                const Tensor grad_bias, const Tensor norm,
                                const Tensor std, Tensor grad_input) {
  DISPATCH_DEVICE_IMPL(sync_bn_backward_data_impl, grad_output, weight,
                       grad_weight, grad_bias, norm, std, grad_input);
}

// Public wrapper: forwards (input, mean) to sync_bn_forward_mean_impl.
void sync_bn_forward_mean(const Tensor input, Tensor mean) {
  sync_bn_forward_mean_impl(input, mean);
}

// Public wrapper: forwards (input, mean, var) to sync_bn_forward_var_impl.
void sync_bn_forward_var(const Tensor input, const Tensor mean, Tensor var) {
  sync_bn_forward_var_impl(input, mean, var);
}

// Public wrapper around sync_bn_forward_output_impl. The parameter order
// here is weight/bias BEFORE running_mean/running_var; the call below
// reorders them to the _impl's order (running_mean, running_var, weight,
// bias). All other arguments are forwarded in place.
void sync_bn_forward_output(const Tensor input, const Tensor mean,
                            const Tensor var, const Tensor weight,
                            const Tensor bias, Tensor running_mean,
                            Tensor running_var, Tensor norm, Tensor std,
                            Tensor output, float eps, float momentum,
                            int group_size) {
  sync_bn_forward_output_impl(input, mean, var, running_mean, running_var,
                              weight, bias, norm, std, output, eps, momentum,
                              group_size);
}

// Public wrapper: forwards all four arguments, in order, to
// sync_bn_backward_param_impl.
void sync_bn_backward_param(const Tensor grad_output, const Tensor norm,
                            Tensor grad_weight, Tensor grad_bias) {
  sync_bn_backward_param_impl(grad_output, norm, grad_weight, grad_bias);
}

// Public wrapper: forwards all seven arguments, in order, to
// sync_bn_backward_data_impl.
void sync_bn_backward_data(const Tensor grad_output, const Tensor weight,
                           const Tensor grad_weight, const Tensor grad_bias,
                           const Tensor norm, const Tensor std,
                           Tensor grad_input) {
  sync_bn_backward_data_impl(grad_output, weight, grad_weight, grad_bias, norm,
                             std, grad_input);
}


================================================
FILE: mmcv/ops/csrc/parrots/sync_bn_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "sync_bn_pytorch.h"
using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA adapter: wraps ins[0] and outs[0] as ATen tensors and calls
// sync_bn_forward_mean_cuda on them. `attr` is not read.
void sync_bn_forward_mean_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                       const OperatorBase::in_list_t& ins,
                                       OperatorBase::out_list_t& outs) {
  const auto& x = buildATensor(ctx, ins[0]);
  auto mean_out = buildATensor(ctx, outs[0]);
  sync_bn_forward_mean_cuda(x, mean_out);
}

// Parrots CUDA adapter: wraps ins[0], ins[1] and outs[0] as ATen tensors and
// calls sync_bn_forward_var_cuda on them. `attr` is not read.
void sync_bn_forward_var_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                      const OperatorBase::in_list_t& ins,
                                      OperatorBase::out_list_t& outs) {
  const auto& x = buildATensor(ctx, ins[0]);
  const auto& batch_mean = buildATensor(ctx, ins[1]);
  auto var_out = buildATensor(ctx, outs[0]);
  sync_bn_forward_var_cuda(x, batch_mean, var_out);
}

// Parrots CUDA adapter for sync_bn_forward_output: reads the "eps",
// "momentum" and "group_size" attributes, wraps the five inputs and five
// outputs as ATen tensors, and calls sync_bn_forward_output_cuda.
void sync_bn_forward_output_cuda_parrots(CudaContext& ctx,
                                         const SSElement& attr,
                                         const OperatorBase::in_list_t& ins,
                                         OperatorBase::out_list_t& outs) {
  float eps_attr, momentum_attr;
  // Read as size_t; converted implicitly to the callee's `int group_size`.
  size_t group_size_attr;
  SSAttrs(attr)
      .get<float>("eps", eps_attr)
      .get<float>("momentum", momentum_attr)
      .get<size_t>("group_size", group_size_attr)
      .done();

  // Inputs: ins[0..4].
  const auto& x = buildATensor(ctx, ins[0]);
  const auto& batch_mean = buildATensor(ctx, ins[1]);
  const auto& batch_var = buildATensor(ctx, ins[2]);
  const auto& gamma = buildATensor(ctx, ins[3]);
  const auto& beta = buildATensor(ctx, ins[4]);

  // Outputs: outs[0..4].
  auto running_mean_out = buildATensor(ctx, outs[0]);
  auto running_var_out = buildATensor(ctx, outs[1]);
  auto norm_out = buildATensor(ctx, outs[2]);
  auto std_out = buildATensor(ctx, outs[3]);
  auto y = buildATensor(ctx, outs[4]);

  // The callee takes running_mean/running_var before weight/bias.
  sync_bn_forward_output_cuda(x, batch_mean, batch_var, running_mean_out,
                              running_var_out, gamma, beta, norm_out, std_out,
                              y, eps_attr, momentum_attr, group_size_attr);
}

// Parrots CUDA adapter: wraps ins[0], ins[1], outs[0] and outs[1] as ATen
// tensors and calls sync_bn_backward_param_cuda. `attr` is not read.
void sync_bn_backward_param_cuda_parrots(CudaContext& ctx,
                                         const SSElement& attr,
                                         const OperatorBase::in_list_t& ins,
                                         OperatorBase::out_list_t& outs) {
  const auto& dy = buildATensor(ctx, ins[0]);
  const auto& normalized = buildATensor(ctx, ins[1]);
  auto dweight = buildATensor(ctx, outs[0]);
  auto dbias = buildATensor(ctx, outs[1]);
  sync_bn_backward_param_cuda(dy, normalized, dweight, dbias);
}

// Parrots CUDA adapter: wraps the six inputs and the single output as ATen
// tensors and calls sync_bn_backward_data_cuda. `attr` is not read.
void sync_bn_backward_data_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                        const OperatorBase::in_list_t& ins,
                                        OperatorBase::out_list_t& outs) {
  // Inputs: ins[0..5].
  const auto& dy = buildATensor(ctx, ins[0]);
  const auto& gamma = buildATensor(ctx, ins[1]);
  const auto& dweight = buildATensor(ctx, ins[2]);
  const auto& dbias = buildATensor(ctx, ins[3]);
  const auto& normalized = buildATensor(ctx, ins[4]);
  const auto& std_in = buildATensor(ctx, ins[5]);

  // Output: outs[0].
  auto dx = buildATensor(ctx, outs[0]);

  sync_bn_backward_data_cuda(dy, gamma, dweight, dbias, normalized, std_in,
                             dx);
}

// Parrots registrations for the five sync_bn ops. All of them attach only a
// CUDA adapter and sit inside the enclosing MMCV_WITH_CUDA #ifdef.

// sync_bn_forward_mean: 1 input, 1 output, no attributes.
PARROTS_EXTENSION_REGISTER(sync_bn_forward_mean)
    .input(1)
    .output(1)
    .apply(sync_bn_forward_mean_cuda_parrots)
    .done();

// sync_bn_forward_var: 2 inputs, 1 output, no attributes.
PARROTS_EXTENSION_REGISTER(sync_bn_forward_var)
    .input(2)
    .output(1)
    .apply(sync_bn_forward_var_cuda_parrots)
    .done();

// sync_bn_forward_output: attributes "eps", "momentum", "group_size";
// 5 inputs, 5 outputs.
PARROTS_EXTENSION_REGISTER(sync_bn_forward_output)
    .attr("eps")
    .attr("momentum")
    .attr("group_size")
    .input(5)
    .output(5)
    .apply(sync_bn_forward_output_cuda_parrots)
    .done();

// sync_bn_backward_param: 2 inputs, 2 outputs, no attributes.
PARROTS_EXTENSION_REGISTER(sync_bn_backward_param)
    .input(2)
    .output(2)
    .apply(sync_bn_backward_param_cuda_parrots)
    .done();

// sync_bn_backward_data: 6 inputs, 1 output, no attributes.
PARROTS_EXTENSION_REGISTER(sync_bn_backward_data)
    .input(6)
    .output(1)
    .apply(sync_bn_backward_data_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/sync_bn_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef SYNC_BN_PYTORCH_H
#define SYNC_BN_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Declarations of the `*_cuda` sync_bn entry points called by the Parrots
// adapters. Their definitions are not in this header.

// Called with tensors built from (ins[0], outs[0]).
void sync_bn_forward_mean_cuda(const Tensor input, Tensor mean);

// Called with tensors built from (ins[0], ins[1], outs[0]).
void sync_bn_forward_var_cuda(const Tensor input, const Tensor mean,
                              Tensor var);

// Note the order: running_mean/running_var come before weight/bias.
// `group_size` is an int here; the Parrots adapter reads it as size_t.
void sync_bn_forward_output_cuda(const Tensor input, const Tensor mean,
                                 const Tensor var, Tensor running_mean,
                                 Tensor running_var, const Tensor weight,
                                 const Tensor bias, Tensor norm, Tensor std,
                                 Tensor output, float eps, float momentum,
                                 int group_size);

// Called with tensors built from (ins[0], ins[1], outs[0], outs[1]).
void sync_bn_backward_param_cuda(const Tensor grad_output, const Tensor norm,
                                 Tensor grad_weight, Tensor grad_bias);

// Called with tensors built from (ins[0..5], outs[0]).
void sync_bn_backward_data_cuda(const Tensor grad_output, const Tensor weight,
                                const Tensor grad_weight,
                                const Tensor grad_bias, const Tensor norm,
                                const Tensor std, Tensor grad_input);
#endif  // SYNC_BN_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/three_interpolate.cpp
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub: forwards (b, c, m, n, points, idx, weight, out) unchanged
// to DISPATCH_DEVICE_IMPL under the key `three_interpolate_forward_impl`.
// Note the integer order is (b, c, m, n) here and (b, c, n, m) in the
// backward stub.
// NOTE(review): backend selection is done by the macro from
// pytorch_device_registry.hpp (not visible here) — confirm its rules there.
void three_interpolate_forward_impl(int b, int c, int m, int n,
                                    const Tensor points, const Tensor idx,
                                    const Tensor weight, Tensor out) {
  DISPATCH_DEVICE_IMPL(three_interpolate_forward_impl, b, c, m, n, points, idx,
                       weight, out);
}

// Dispatch stub: forwards (b, c, n, m, grad_out, idx, weight, grad_points)
// unchanged to DISPATCH_DEVICE_IMPL under the key
// `three_interpolate_backward_impl`. The integer order is (b, c, n, m),
// i.e. n and m are swapped relative to the forward stub.
void three_interpolate_backward_impl(int b, int c, int n, int m,
                                     const Tensor grad_out, const Tensor idx,
                                     const Tensor weight, Tensor grad_points) {
  DISPATCH_DEVICE_IMPL(three_interpolate_backward_impl, b, c, n, m, grad_out,
                       idx, weight, grad_points);
}

// Public forward wrapper. Takes the tensors first and the integers last;
// the call below moves (b, c, m, n) to the front to match
// three_interpolate_forward_impl.
void three_interpolate_forward(Tensor points_tensor, Tensor idx_tensor,
                               Tensor weight_tensor, Tensor out_tensor, int b,
                               int c, int m, int n) {
  three_interpolate_forward_impl(b, c, m, n, points_tensor, idx_tensor,
                                 weight_tensor, out_tensor);
}

// Public backward wrapper. Takes the tensors first and the integers last;
// the call below moves (b, c, n, m) to the front to match
// three_interpolate_backward_impl.
void three_interpolate_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                                Tensor weight_tensor, Tensor grad_points_tensor,
                                int b, int c, int n, int m) {
  three_interpolate_backward_impl(b, c, n, m, grad_out_tensor, idx_tensor,
                                  weight_tensor, grad_points_tensor);
}


================================================
FILE: mmcv/ops/csrc/parrots/three_interpolate_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "three_interpolate_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA adapter for three_interpolate_forward: reads the integer
// attributes "b", "c", "m", "n", wraps ins[0..2] and outs[0] as ATen
// tensors, and calls the op.
void three_interpolate_forward_cuda_parrots(CudaContext& ctx,
                                            const SSElement& attr,
                                            const OperatorBase::in_list_t& ins,
                                            OperatorBase::out_list_t& outs) {
  int attr_b, attr_c, attr_m, attr_n;
  SSAttrs(attr)
      .get<int>("b", attr_b)
      .get<int>("c", attr_c)
      .get<int>("m", attr_m)
      .get<int>("n", attr_n)
      .done();

  auto points = buildATensor(ctx, ins[0]);
  auto indices = buildATensor(ctx, ins[1]);
  auto weights = buildATensor(ctx, ins[2]);

  auto result = buildATensor(ctx, outs[0]);

  three_interpolate_forward(points, indices, weights, result, attr_b, attr_c,
                            attr_m, attr_n);
}

// Parrots CUDA adapter for three_interpolate_backward: reads the integer
// attributes "b", "c", "n", "m", wraps ins[0..2] and outs[0] as ATen
// tensors, and calls the op.
void three_interpolate_backward_cuda_parrots(CudaContext& ctx,
                                             const SSElement& attr,
                                             const OperatorBase::in_list_t& ins,
                                             OperatorBase::out_list_t& outs) {
  int attr_b, attr_c, attr_n, attr_m;
  SSAttrs(attr)
      .get<int>("b", attr_b)
      .get<int>("c", attr_c)
      .get<int>("n", attr_n)
      .get<int>("m", attr_m)
      .done();

  auto grad_out = buildATensor(ctx, ins[0]);
  auto indices = buildATensor(ctx, ins[1]);
  auto weights = buildATensor(ctx, ins[2]);

  auto grad_points = buildATensor(ctx, outs[0]);

  // Integer order for the backward op is (b, c, n, m).
  three_interpolate_backward(grad_out, indices, weights, grad_points, attr_b,
                             attr_c, attr_n, attr_m);
}

// Registers `three_interpolate_forward`: attributes "b", "c", "m", "n"
// (declared in that order), 3 inputs, 1 output, CUDA adapter only.
// Compiled only when MMCV_WITH_CUDA is defined (enclosing #ifdef).
PARROTS_EXTENSION_REGISTER(three_interpolate_forward)
    .attr("b")
    .attr("c")
    .attr("m")
    .attr("n")
    .input(3)
    .output(1)
    .apply(three_interpolate_forward_cuda_parrots)
    .done();

// Registers `three_interpolate_backward`: attributes "b", "c", "n", "m"
// (note n before m, unlike the forward op), 3 inputs, 1 output, CUDA
// adapter only.
PARROTS_EXTENSION_REGISTER(three_interpolate_backward)
    .attr("b")
    .attr("c")
    .attr("n")
    .attr("m")
    .input(3)
    .output(1)
    .apply(three_interpolate_backward_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/three_interpolate_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef THREE_INTERPOLATE_PYTORCH_H
#define THREE_INTERPOLATE_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Forward entry point used by the Parrots adapter; `out_tensor` is built
// from the adapter's single output array. Integer order: (b, c, m, n).
void three_interpolate_forward(Tensor points_tensor, Tensor idx_tensor,
                               Tensor weight_tensor, Tensor out_tensor, int b,
                               int c, int m, int n);

// Backward entry point used by the Parrots adapter; `grad_points_tensor` is
// built from the adapter's single output array. Integer order: (b, c, n, m).
void three_interpolate_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                                Tensor weight_tensor, Tensor grad_points_tensor,
                                int b, int c, int n, int m);
#endif  // THREE_INTERPOLATE_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/three_nn.cpp
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub: forwards (b, n, m, unknown, known, dist2, idx) unchanged to
// DISPATCH_DEVICE_IMPL under the key `three_nn_forward_impl`.
// NOTE(review): backend selection is done by the macro from
// pytorch_device_registry.hpp (not visible here) — confirm its rules there.
void three_nn_forward_impl(int b, int n, int m, const Tensor unknown,
                           const Tensor known, Tensor dist2, Tensor idx) {
  DISPATCH_DEVICE_IMPL(three_nn_forward_impl, b, n, m, unknown, known, dist2,
                       idx);
}

// Public wrapper. Takes the tensors first and the integers last; the call
// below moves (b, n, m) to the front to match three_nn_forward_impl.
void three_nn_forward(Tensor unknown_tensor, Tensor known_tensor,
                      Tensor dist2_tensor, Tensor idx_tensor, int b, int n,
                      int m) {
  three_nn_forward_impl(b, n, m, unknown_tensor, known_tensor, dist2_tensor,
                        idx_tensor);
}


================================================
FILE: mmcv/ops/csrc/parrots/three_nn_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "three_nn_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA adapter for three_nn_forward: reads the integer attributes
// "b", "n", "m", wraps ins[0..1] and outs[0..1] as ATen tensors, and calls
// the op.
void three_nn_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                   const OperatorBase::in_list_t& ins,
                                   OperatorBase::out_list_t& outs) {
  int attr_b, attr_n, attr_m;
  SSAttrs(attr)
      .get<int>("b", attr_b)
      .get<int>("n", attr_n)
      .get<int>("m", attr_m)
      .done();

  auto query_points = buildATensor(ctx, ins[0]);
  auto ref_points = buildATensor(ctx, ins[1]);

  auto dist2_out = buildATensor(ctx, outs[0]);
  auto idx_out = buildATensor(ctx, outs[1]);

  three_nn_forward(query_points, ref_points, dist2_out, idx_out, attr_b,
                   attr_n, attr_m);
}

// Registers `three_nn_forward`: attributes "b", "n", "m", 2 inputs,
// 2 outputs, CUDA adapter only. Compiled only when MMCV_WITH_CUDA is
// defined (enclosing #ifdef).
PARROTS_EXTENSION_REGISTER(three_nn_forward)
    .attr("b")
    .attr("n")
    .attr("m")
    .input(2)
    .output(2)
    .apply(three_nn_forward_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/three_nn_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef THREE_NN_PYTORCH_H
#define THREE_NN_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Entry point used by the Parrots adapter; `dist2_tensor` and `idx_tensor`
// are built from the adapter's two output arrays.
void three_nn_forward(Tensor unknown_tensor, Tensor known_tensor,
                      Tensor dist2_tensor, Tensor idx_tensor, int b, int n,
                      int m);
#endif  // THREE_NN_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/tin_shift.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub: forwards (input, shift, output) unchanged to
// DISPATCH_DEVICE_IMPL under the key `tin_shift_forward_impl`.
// NOTE(review): backend selection is done by the macro from
// pytorch_device_registry.hpp (not visible here) — confirm its rules there.
void tin_shift_forward_impl(Tensor input, Tensor shift, Tensor output) {
  DISPATCH_DEVICE_IMPL(tin_shift_forward_impl, input, shift, output);
}

// Dispatch stub: forwards (grad_output, shift, grad_input) unchanged to
// DISPATCH_DEVICE_IMPL under the key `tin_shift_backward_impl`.
void tin_shift_backward_impl(Tensor grad_output, Tensor shift,
                             Tensor grad_input) {
  DISPATCH_DEVICE_IMPL(tin_shift_backward_impl, grad_output, shift, grad_input);
}

// Public wrapper: forwards (input, shift, output) to tin_shift_forward_impl.
void tin_shift_forward(Tensor input, Tensor shift, Tensor output) {
  tin_shift_forward_impl(input, shift, output);
}

// Public wrapper: forwards (grad_output, shift, grad_input) to
// tin_shift_backward_impl.
void tin_shift_backward(Tensor grad_output, Tensor shift, Tensor grad_input) {
  tin_shift_backward_impl(grad_output, shift, grad_input);
}


================================================
FILE: mmcv/ops/csrc/parrots/tin_shift_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "tin_shift_pytorch.h"
using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA adapter: wraps ins[0], ins[1] and outs[0] as ATen tensors and
// calls tin_shift_forward_cuda on them. `attr` is not read.
void tin_shift_forward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
                                    const OperatorBase::in_list_t &ins,
                                    OperatorBase::out_list_t &outs) {
  const auto &src = buildATensor(ctx, ins[0]);
  const auto &offsets = buildATensor(ctx, ins[1]);
  auto dst = buildATensor(ctx, outs[0]);
  tin_shift_forward_cuda(src, offsets, dst);
}

// Parrots CUDA adapter: wraps ins[0], ins[1] and outs[0] as ATen tensors and
// calls tin_shift_backward_cuda on them. `attr` is not read.
void tin_shift_backward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
                                     const OperatorBase::in_list_t &ins,
                                     OperatorBase::out_list_t &outs) {
  const auto &dy = buildATensor(ctx, ins[0]);
  const auto &offsets = buildATensor(ctx, ins[1]);
  auto dx = buildATensor(ctx, outs[0]);
  tin_shift_backward_cuda(dy, offsets, dx);
}

// Registers `tin_shift_forward`: 2 inputs, 1 output, no attributes, CUDA
// adapter only. Compiled only when MMCV_WITH_CUDA is defined (enclosing
// #ifdef).
PARROTS_EXTENSION_REGISTER(tin_shift_forward)
    .input(2)
    .output(1)
    .apply(tin_shift_forward_cuda_parrots)
    .done();

// Registers `tin_shift_backward`: 2 inputs, 1 output, no attributes, CUDA
// adapter only.
PARROTS_EXTENSION_REGISTER(tin_shift_backward)
    .input(2)
    .output(1)
    .apply(tin_shift_backward_cuda_parrots)
    .done();
#endif


================================================
FILE: mmcv/ops/csrc/parrots/tin_shift_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef TIN_SHIFT_PYTORCH_H
#define TIN_SHIFT_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// `*_cuda` entry points called by the Parrots adapters; their definitions
// are not in this header. The adapter passes the tensor built from its
// single output array as the last argument of each.
void tin_shift_forward_cuda(Tensor input, Tensor shift, Tensor output);

void tin_shift_backward_cuda(Tensor grad_output, Tensor shift,
                             Tensor grad_input);
#endif  // TIN_SHIFT_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/parrots/upfirdn2d.cpp
================================================
// Modified from
// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/upfirdn2d.cpp

/*
Copyright (c) 2021, NVIDIA Corporation. All rights reserved.

NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator
Augmentation (ADA)
=======================================================================

1. Definitions

"Licensor" means any person or entity that distributes its Work.

"Software" means the original work of authorship made available under
this License.

"Work" means the Software and any additions to or derivative works of
the Software that are made available under this License.

The terms "reproduce," "reproduction," "derivative works," and
"distribution" have the meaning as provided under U.S. copyright law;
provided, however, that for the purposes of this License, derivative
works shall not include works that remain separable from, or merely
link (or bind by name) to the interfaces of, the Work.

Works, including the Software, are "made available" under this License
by including in or with the Work either (a) a copyright notice
referencing the applicability of this License to the Work, or (b) a
copy of this License.

2. License Grants

    2.1 Copyright Grant. Subject to the terms and conditions of this
    License, each Licensor grants to you a perpetual, worldwide,
    non-exclusive, royalty-free, copyright license to reproduce,
    prepare derivative works of, publicly display, publicly perform,
    sublicense and distribute its Work and any resulting derivative
    works in any form.

3. Limitations

    3.1 Redistribution. You may reproduce or distribute the Work only
    if (a) you do so under this License, (b) you include a complete
    copy of this License with your distribution, and (c) you retain
    without modification any copyright, patent, trademark, or
    attribution notices that are present in the Work.

    3.2 Derivative Works. You may specify that additional or different
    terms apply to the use, reproduction, and distribution of your
    derivative works of the Work ("Your Terms") only if (a) Your Terms
    provide that the use limitation in Section 3.3 applies to your
    derivative works, and (b) you identify the specific derivative
    works that are subject to Your Terms. Notwithstanding Your Terms,
    this License (including the redistribution requirements in Section
    3.1) will continue to apply to the Work itself.

    3.3 Use Limitation. The Work and any derivative works thereof only
    may be used or intended for use non-commercially. Notwithstanding
    the foregoing, NVIDIA and its affiliates may use the Work and any
    derivative works commercially. As used herein, "non-commercially"
    means for research or evaluation purposes only.

    3.4 Patent Claims. If you bring or threaten to bring a patent claim
    against any Licensor (including any claim, cross-claim or
    counterclaim in a lawsuit) to enforce any patents that you allege
    are infringed by any Work, then your rights under this License from
    such Licensor (including the grant in Section 2.1) will terminate
    immediately.

    3.5 Trademarks. This License does not grant any rights to use any
    Licensor’s or its affiliates’ names, logos, or trademarks, except
    as necessary to reproduce the notices described in this License.

    3.6 Termination. If you violate any term of this License, then your
    rights under this License (including the grant in Section 2.1) will
    terminate immediately.

4. Disclaimer of Warranty.

THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
THIS LICENSE.

5. Limitation of Liability.

EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
THE POSSIBILITY OF SUCH DAMAGES.

=======================================================================
*/

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub for upfirdn2d: forwards all ten arguments, unchanged and in
// the same order, to DISPATCH_DEVICE_IMPL under the key `upfirdn2d_op_impl`
// and returns the tensor the macro call yields.
// NOTE(review): backend selection is done by the macro from
// pytorch_device_registry.hpp (not visible here) — confirm its rules there.
torch::Tensor upfirdn2d_op_impl(const torch::Tensor& input,
                                const torch::Tensor& kernel, int up_x, int up_y,
                                int down_x, int down_y, int pad_x0, int pad_x1,
                                int pad_y0, int pad_y1) {
  return DISPATCH_DEVICE_IMPL(upfirdn2d_op_impl, input, kernel, up_x, up_y,
                              down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1);
}

// Public upfirdn2d entry point: forwards every argument, in order, to
// upfirdn2d_op_impl and returns its result.
torch::Tensor upfirdn2d(const torch::Tensor& input, const torch::Tensor& kernel,
                        int up_x, int up_y, int down_x, int down_y, int pad_x0,
                        int pad_x1, int pad_y0, int pad_y1) {
  return upfirdn2d_op_impl(input, kernel, up_x, up_y, down_x, down_y, pad_x0,
                           pad_x1, pad_y0, pad_y1);
}


================================================
FILE: mmcv/ops/csrc/parrots/upfirdn2d_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <torch/extension.h>

#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
using namespace at;
using namespace parrots;

// Forward declaration of the upfirdn2d entry point (this file includes no
// op-specific header). Unlike most ops in this directory it returns a new
// tensor instead of filling a caller-provided output.
torch::Tensor upfirdn2d(const Tensor &input, const Tensor &kernel, int up_x,
                        int up_y, int down_x, int down_y, int pad_x0,
                        int pad_x1, int pad_y0, int pad_y1);

// Parrots adapter for upfirdn2d.
// Wraps the two operator inputs as tensors, reads the eight integer
// attributes (up/down factors and paddings), runs upfirdn2d and stores the
// result into the first operator output.
void upfirdn2d_parrots(CudaContext &ctx, const SSElement &attr,
                       const OperatorBase::in_list_t &ins,
                       OperatorBase::out_list_t &outs) {
  // Resampling factors.
  int up_x, up_y;
  int down_x, down_y;
  // Paddings along x and y.
  int pad_x0, pad_x1;
  int pad_y0, pad_y1;

  const auto &input_t = buildATensor(ctx, ins[0]);
  const auto &kernel_t = buildATensor(ctx, ins[1]);

  SSAttrs(attr)
      .get("up_x", up_x)
      .get("up_y", up_y)
      .get("down_x", down_x)
      .get("down_y", down_y)
      .get("pad_x0", pad_x0)
      .get("pad_x1", pad_x1)
      .get("pad_y0", pad_y0)
      .get("pad_y1", pad_y1)
      .done();

  auto result = upfirdn2d(input_t, kernel_t, up_x, up_y, down_x, down_y,
                          pad_x0, pad_x1, pad_y0, pad_y1);
  updateDArray(ctx, result, outs[0]);
}

// Registers the parrots operator `upfirdn2d`: eight named attributes,
// two inputs, one output, implemented by upfirdn2d_parrots.
PARROTS_EXTENSION_REGISTER(upfirdn2d)
    .attr("up_x")
    .attr("up_y")
    .attr("down_x")
    .attr("down_y")
    .attr("pad_x0")
    .attr("pad_x1")
    .attr("pad_y0")
    .attr("pad_y1")
    .input(2)
    .output(1)
    .apply(upfirdn2d_parrots)
    .done();


================================================
FILE: mmcv/ops/csrc/parrots/voxelization.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch point for deterministic hard voxelization.
// Passes all arguments to DISPATCH_DEVICE_IMPL under the key
// `hard_voxelize_forward_impl` and returns its int result (the caller in this
// file stores it as the voxel count). The macro is defined elsewhere and is
// expected to select a device-specific implementation.
int hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,
                               at::Tensor &coors,
                               at::Tensor &num_points_per_voxel,
                               const std::vector<float> voxel_size,
                               const std::vector<float> coors_range,
                               const int max_points, const int max_voxels,
                               const int NDim = 3) {
  return DISPATCH_DEVICE_IMPL(hard_voxelize_forward_impl, points, voxels, coors,
                              num_points_per_voxel, voxel_size, coors_range,
                              max_points, max_voxels, NDim);
}

// Dispatch point for the non-deterministic hard voxelization variant.
// Same signature as hard_voxelize_forward_impl; forwards all arguments to
// DISPATCH_DEVICE_IMPL under the key
// `nondeterministic_hard_voxelize_forward_impl` and returns its int result.
int nondeterministic_hard_voxelize_forward_impl(
    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const int max_points,
    const int max_voxels, const int NDim = 3) {
  return DISPATCH_DEVICE_IMPL(nondeterministic_hard_voxelize_forward_impl,
                              points, voxels, coors, num_points_per_voxel,
                              voxel_size, coors_range, max_points, max_voxels,
                              NDim);
}

// Dispatch point for dynamic voxelization.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under the key
// `dynamic_voxelize_forward_impl`; nothing is returned, so results are
// presumably written into `coors` by the selected implementation.
void dynamic_voxelize_forward_impl(const at::Tensor &points, at::Tensor &coors,
                                   const std::vector<float> voxel_size,
                                   const std::vector<float> coors_range,
                                   const int NDim = 3) {
  DISPATCH_DEVICE_IMPL(dynamic_voxelize_forward_impl, points, coors, voxel_size,
                       coors_range, NDim);
}

// Hard voxelization entry point.
// Copies `voxel_size` and `coors_range` (read as float data) into
// std::vector<float>, runs either the deterministic or the non-deterministic
// implementation, and writes the returned int into the first int64 element
// of `voxel_num`.
// NOTE(review): the raw data pointers are dereferenced on the host, so
// voxel_size, coors_range and voxel_num are assumed to be host-accessible,
// contiguous tensors -- confirm against the Python caller.
void hard_voxelize_forward(const at::Tensor &points,
                           const at::Tensor &voxel_size,
                           const at::Tensor &coors_range, at::Tensor &voxels,
                           at::Tensor &coors, at::Tensor &num_points_per_voxel,
                           at::Tensor &voxel_num, const int max_points,
                           const int max_voxels, const int NDim = 3,
                           const bool deterministic = true) {
  // Destination for the implementation's return value.
  int64_t *const voxel_count_out = voxel_num.data_ptr<int64_t>();

  const float *const size_first = voxel_size.data_ptr<float>();
  const std::vector<float> voxel_size_v(size_first,
                                        size_first + voxel_size.numel());

  const float *const range_first = coors_range.data_ptr<float>();
  const std::vector<float> coors_range_v(range_first,
                                         range_first + coors_range.numel());

  *voxel_count_out =
      deterministic
          ? hard_voxelize_forward_impl(points, voxels, coors,
                                       num_points_per_voxel, voxel_size_v,
                                       coors_range_v, max_points, max_voxels,
                                       NDim)
          : nondeterministic_hard_voxelize_forward_impl(
                points, voxels, coors, num_points_per_voxel, voxel_size_v,
                coors_range_v, max_points, max_voxels, NDim);
}

// Dynamic voxelization entry point.
// Copies `voxel_size` and `coors_range` (read as float data) into
// std::vector<float> and forwards everything to
// dynamic_voxelize_forward_impl.
// NOTE(review): the raw data pointers are read on the host, so both tensors
// are assumed to be host-accessible and contiguous -- confirm with caller.
void dynamic_voxelize_forward(const at::Tensor &points,
                              const at::Tensor &voxel_size,
                              const at::Tensor &coors_range, at::Tensor &coors,
                              const int NDim = 3) {
  const float *const size_first = voxel_size.data_ptr<float>();
  const std::vector<float> voxel_size_v(size_first,
                                        size_first + voxel_size.numel());

  const float *const range_first = coors_range.data_ptr<float>();
  const std::vector<float> coors_range_v(range_first,
                                         range_first + coors_range.numel());

  dynamic_voxelize_forward_impl(points, coors, voxel_size_v, coors_range_v,
                                NDim);
}


================================================
FILE: mmcv/ops/csrc/parrots/voxelization_parrots.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include "voxelization_pytorch.h"

using namespace parrots;

#ifdef MMCV_WITH_CUDA
// Parrots CUDA adapter for hard voxelization.
// Reads the four operator attributes, wraps the three inputs and four outputs
// as tensors and calls hard_voxelize_forward.
void hard_voxelize_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                        const OperatorBase::in_list_t& ins,
                                        OperatorBase::out_list_t& outs) {
  // Operator attributes.
  int max_points;
  int max_voxels;
  int NDim;
  bool deterministic;
  SSAttrs(attr)
      .get<int>("max_points", max_points)
      .get<int>("max_voxels", max_voxels)
      .get<int>("NDim", NDim)
      .get<bool>("deterministic", deterministic)
      .done();

  // Inputs, in registration order.
  const auto& points_in = buildATensor(ctx, ins[0]);
  const auto& voxel_size_in = buildATensor(ctx, ins[1]);
  const auto& coors_range_in = buildATensor(ctx, ins[2]);

  // Outputs, in registration order.
  auto voxels_out = buildATensor(ctx, outs[0]);
  auto coors_out = buildATensor(ctx, outs[1]);
  auto num_points_out = buildATensor(ctx, outs[2]);
  auto voxel_num_out = buildATensor(ctx, outs[3]);

  hard_voxelize_forward(points_in, voxel_size_in, coors_range_in, voxels_out,
                        coors_out, num_points_out, voxel_num_out, max_points,
                        max_voxels, NDim, deterministic);
}

// Parrots CUDA adapter for dynamic voxelization.
// Reads the `NDim` attribute, wraps the three inputs and the single output as
// tensors and calls dynamic_voxelize_forward.
void dynamic_voxelize_forward_cuda_parrots(CudaContext& ctx,
                                           const SSElement& attr,
                                           const OperatorBase::in_list_t& ins,
                                           OperatorBase::out_list_t& outs) {
  int NDim;
  SSAttrs(attr).get<int>("NDim", NDim).done();

  // Inputs, in registration order.
  const auto& points_in = buildATensor(ctx, ins[0]);
  const auto& voxel_size_in = buildATensor(ctx, ins[1]);
  const auto& coors_range_in = buildATensor(ctx, ins[2]);

  // Single output.
  auto coors_out = buildATensor(ctx, outs[0]);

  dynamic_voxelize_forward(points_in, voxel_size_in, coors_range_in, coors_out,
                           NDim);
}
#endif

// Parrots host (CPU) adapter for hard voxelization.
// Reads the four operator attributes, wraps the three inputs and four outputs
// as tensors and calls hard_voxelize_forward.
void hard_voxelize_forward_cpu_parrots(HostContext& ctx, const SSElement& attr,
                                       const OperatorBase::in_list_t& ins,
                                       OperatorBase::out_list_t& outs) {
  // Operator attributes.
  int max_points;
  int max_voxels;
  int NDim;
  bool deterministic;
  SSAttrs(attr)
      .get<int>("max_points", max_points)
      .get<int>("max_voxels", max_voxels)
      .get<int>("NDim", NDim)
      .get<bool>("deterministic", deterministic)
      .done();

  // Inputs, in registration order.
  const auto& points_in = buildATensor(ctx, ins[0]);
  const auto& voxel_size_in = buildATensor(ctx, ins[1]);
  const auto& coors_range_in = buildATensor(ctx, ins[2]);

  // Outputs, in registration order.
  auto voxels_out = buildATensor(ctx, outs[0]);
  auto coors_out = buildATensor(ctx, outs[1]);
  auto num_points_out = buildATensor(ctx, outs[2]);
  auto voxel_num_out = buildATensor(ctx, outs[3]);

  hard_voxelize_forward(points_in, voxel_size_in, coors_range_in, voxels_out,
                        coors_out, num_points_out, voxel_num_out, max_points,
                        max_voxels, NDim, deterministic);
}

// Parrots host (CPU) adapter for dynamic voxelization.
// Reads the `NDim` attribute, wraps the three inputs and the single output as
// tensors and calls dynamic_voxelize_forward.
void dynamic_voxelize_forward_cpu_parrots(HostContext& ctx,
                                          const SSElement& attr,
                                          const OperatorBase::in_list_t& ins,
                                          OperatorBase::out_list_t& outs) {
  int NDim;
  SSAttrs(attr).get<int>("NDim", NDim).done();

  // Inputs, in registration order.
  const auto& points_in = buildATensor(ctx, ins[0]);
  const auto& voxel_size_in = buildATensor(ctx, ins[1]);
  const auto& coors_range_in = buildATensor(ctx, ins[2]);

  // Single output.
  auto coors_out = buildATensor(ctx, outs[0]);

  dynamic_voxelize_forward(points_in, voxel_size_in, coors_range_in, coors_out,
                           NDim);
}

// Registers the parrots operator `hard_voxelize_forward`: four attributes,
// three inputs, four outputs. The host adapter is always attached; the CUDA
// adapter is attached only when MMCV_WITH_CUDA is defined.
PARROTS_EXTENSION_REGISTER(hard_voxelize_forward)
    .attr("max_points")
    .attr("max_voxels")
    .attr("NDim")
    .attr("deterministic")
    .input(3)
    .output(4)
    .apply(hard_voxelize_forward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(hard_voxelize_forward_cuda_parrots)
#endif
    .done();

// Registers the parrots operator `dynamic_voxelize_forward`: one attribute
// (`NDim`), three inputs, one output. The host adapter is always attached;
// the CUDA adapter is attached only when MMCV_WITH_CUDA is defined.
PARROTS_EXTENSION_REGISTER(dynamic_voxelize_forward)
    .attr("NDim")
    .input(3)
    .output(1)
    .apply(dynamic_voxelize_forward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(dynamic_voxelize_forward_cuda_parrots)
#endif
    .done();


================================================
FILE: mmcv/ops/csrc/parrots/voxelization_pytorch.h
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef VOXELIZATION_PYTORCH_H
#define VOXELIZATION_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Hard voxelization entry point (declaration only).
// `voxels`, `coors`, `num_points_per_voxel` and `voxel_num` are passed by
// non-const reference and are presumably the outputs.
// Defaults: NDim = 3, deterministic = true.
void hard_voxelize_forward(const at::Tensor &points,
                           const at::Tensor &voxel_size,
                           const at::Tensor &coors_range, at::Tensor &voxels,
                           at::Tensor &coors, at::Tensor &num_points_per_voxel,
                           at::Tensor &voxel_num, const int max_points,
                           const int max_voxels, const int NDim = 3,
                           const bool deterministic = true);

// Dynamic voxelization entry point (declaration only).
// `coors` is passed by non-const reference and is presumably the output.
// Default: NDim = 3.
void dynamic_voxelize_forward(const at::Tensor &points,
                              const at::Tensor &voxel_size,
                              const at::Tensor &coors_range, at::Tensor &coors,
                              const int NDim = 3);

#endif  // VOXELIZATION_PYTORCH_H


================================================
FILE: mmcv/ops/csrc/pytorch/active_rotated_filter.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/ActiveRotatingFilter.h

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch point for the active rotated filter forward pass.
// Forwards (input, indices, output) to DISPATCH_DEVICE_IMPL under the key
// `active_rotated_filter_forward_impl`; the macro is defined elsewhere and is
// expected to select a device-specific implementation.
void active_rotated_filter_forward_impl(const Tensor input,
                                        const Tensor indices, Tensor output) {
  DISPATCH_DEVICE_IMPL(active_rotated_filter_forward_impl, input, indices,
                       output);
}

// Dispatch point for the active rotated filter backward pass.
// Forwards (grad_out, indices, grad_in) to DISPATCH_DEVICE_IMPL under the key
// `active_rotated_filter_backward_impl`.
void active_rotated_filter_backward_impl(const Tensor grad_out,
                                         const Tensor indices, Tensor grad_in) {
  DISPATCH_DEVICE_IMPL(active_rotated_filter_backward_impl, grad_out, indices,
                       grad_in);
}

// Public forward entry point: passes its arguments straight through to
// active_rotated_filter_forward_impl.
void active_rotated_filter_forward(const Tensor input, const Tensor indices,
                                   Tensor output) {
  active_rotated_filter_forward_impl(input, indices, output);
}

// Public backward entry point: passes its arguments straight through to
// active_rotated_filter_backward_impl.
void active_rotated_filter_backward(const Tensor grad_out, const Tensor indices,
                                    Tensor grad_in) {
  active_rotated_filter_backward_impl(grad_out, indices, grad_in);
}


================================================
FILE: mmcv/ops/csrc/pytorch/assign_score_withk.cpp
================================================
// Modified from
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch point for the assign_score_withk forward pass.
// Forwards the seven integer arguments and five tensors to
// DISPATCH_DEVICE_IMPL under the key `assign_score_withk_forward_impl`.
// `output` is the only non-const tensor reference, so it is presumably the
// destination.
void assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,
                                     int aggregate, const Tensor& points,
                                     const Tensor& centers,
                                     const Tensor& scores,
                                     const Tensor& knn_idx, Tensor& output) {
  DISPATCH_DEVICE_IMPL(assign_score_withk_forward_impl, B, N0, N1, M, K, O,
                       aggregate, points, centers, scores, knn_idx, output);
}

// Dispatch point for the assign_score_withk backward pass.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under the key
// `assign_score_withk_backward_impl`. The three non-const tensor references
// (grad_points, grad_centers, grad_scores) are presumably the outputs.
void assign_score_withk_backward_impl(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor& grad_out, const Tensor& points, const Tensor& centers,
    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
    Tensor& grad_centers, Tensor& grad_scores) {
  DISPATCH_DEVICE_IMPL(assign_score_withk_backward_impl, B, N0, N1, M, K, O,
                       aggregate, grad_out, points, centers, scores, knn_idx,
                       grad_points, grad_centers, grad_scores);
}

// Public forward entry point.
// Takes tensors first and integers last, and reorders them to the
// integers-first layout expected by assign_score_withk_forward_impl.
void assign_score_withk_forward(const Tensor& points, const Tensor& centers,
                                const Tensor& scores, const Tensor& knn_idx,
                                Tensor& output, int B, int N0, int N1, int M,
                                int K, int O, int aggregate) {
  assign_score_withk_forward_impl(B, N0, N1, M, K, O, aggregate, points,
                                  centers, scores, knn_idx, output);
}

// Public backward entry point.
// Takes tensors first and integers last, and reorders them to the
// integers-first layout expected by assign_score_withk_backward_impl.
void assign_score_withk_backward(const Tensor& grad_out, const Tensor& points,
                                 const Tensor& centers, const Tensor& scores,
                                 const Tensor& knn_idx, Tensor& grad_points,
                                 Tensor& grad_centers, Tensor& grad_scores,
                                 int B, int N0, int N1, int M, int K, int O,
                                 int aggregate) {
  assign_score_withk_backward_impl(B, N0, N1, M, K, O, aggregate, grad_out,
                                   points, centers, scores, knn_idx,
                                   grad_points, grad_centers, grad_scores);
}


================================================
FILE: mmcv/ops/csrc/pytorch/ball_query.cpp
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query.cpp

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch point for ball query.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under the key
// `ball_query_forward_impl`. `idx` is the only non-const tensor and is
// presumably where results are written.
void ball_query_forward_impl(int b, int n, int m, float min_radius,
                             float max_radius, int nsample,
                             const Tensor new_xyz, const Tensor xyz,
                             Tensor idx) {
  DISPATCH_DEVICE_IMPL(ball_query_forward_impl, b, n, m, min_radius, max_radius,
                       nsample, new_xyz, xyz, idx);
}

// Public ball query entry point.
// Takes tensors first and scalars last, and reorders them to the
// scalars-first layout expected by ball_query_forward_impl.
void ball_query_forward(Tensor new_xyz_tensor, Tensor xyz_tensor,
                        Tensor idx_tensor, int b, int n, int m,
                        float min_radius, float max_radius, int nsample) {
  ball_query_forward_impl(b, n, m, min_radius, max_radius, nsample,
                          new_xyz_tensor, xyz_tensor, idx_tensor);
}

// Dispatch point for the stacked-batch ball query variant.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under the key
// `stack_ball_query_forward_impl`. Unlike ball_query_forward_impl it takes
// per-batch count tensors and has no min_radius parameter.
void stack_ball_query_forward_impl(float max_radius, int nsample,
                                   const Tensor new_xyz,
                                   const Tensor new_xyz_batch_cnt,
                                   const Tensor xyz, const Tensor xyz_batch_cnt,
                                   Tensor idx) {
  DISPATCH_DEVICE_IMPL(stack_ball_query_forward_impl, max_radius, nsample,
                       new_xyz, new_xyz_batch_cnt, xyz, xyz_batch_cnt, idx);
}

// Public stacked ball query entry point.
// Takes tensors first and scalars last, and reorders them to the
// scalars-first layout expected by stack_ball_query_forward_impl.
void stack_ball_query_forward(Tensor new_xyz_tensor, Tensor new_xyz_batch_cnt,
                              Tensor xyz_tensor, Tensor xyz_batch_cnt,
                              Tensor idx_tensor, float max_radius,
                              int nsample) {
  stack_ball_query_forward_impl(max_radius, nsample, new_xyz_tensor,
                                new_xyz_batch_cnt, xyz_tensor, xyz_batch_cnt,
                                idx_tensor);
}


================================================
FILE: mmcv/ops/csrc/pytorch/bbox_overlaps.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_DIOPI
#include <diopi/diopirt.h>
#include <diopi/functions.h>
#include <diopi/functions_mmcv.h>

#include "csrc_dipu/base/basedef.h"
#include "csrc_dipu/diopirt/diopirt_impl.h"

using dipu::diopi_helper::toDiopiScalar;
using dipu::diopi_helper::toDiopiTensorHandle;
#endif

// Dispatch point for bbox_overlaps.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under the key
// `bbox_overlaps_impl`. `ious` is the only non-const tensor and is presumably
// where results are written.
void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                        const int mode, const bool aligned, const int offset) {
  DISPATCH_DEVICE_IMPL(bbox_overlaps_impl, bboxes1, bboxes2, ious, mode,
                       aligned, offset);
}

#ifdef MMCV_WITH_DIOPI
// DIOPI-aware bbox_overlaps (compiled only with MMCV_WITH_DIOPI).
// Order of attempts:
//   1. If DIOPI reports bboxes1 on the host, use the regular dispatch path.
//   2. Otherwise try diopiBboxOverlapsMmcv and return if it succeeds.
//   3. Otherwise log a warning, compute on CPU copies and copy the result
//      back into `ious`.
void bbox_overlaps_diopi(const Tensor bboxes1, const Tensor bboxes2,
                         Tensor ious, const int mode, const bool aligned,
                         const int offset) {
  auto bboxes1_p = toDiopiTensorHandle(bboxes1);
  diopiDevice_t device;
  diopiGetTensorDevice(bboxes1_p, &device);
  if (device == diopi_host) {
    // Host tensor: no DIOPI kernel involved.
    bbox_overlaps_impl(bboxes1, bboxes2, ious, mode, aligned, offset);
    return;
  }
  // Context built from the current DIPU stream, handed to the DIOPI call.
  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
  diopiContextHandle_t ch = &ctx;
  auto bboxes2_p = toDiopiTensorHandle(bboxes2);
  auto ious_p = toDiopiTensorHandle(ious);
  bool is_mock_cuda = bboxes1.device().type() == dipu::DIPU_DEVICE_TYPE;
  // The null check on the function address suggests the symbol may be
  // optional at link time -- TODO confirm.
  if (is_mock_cuda &&
      reinterpret_cast<void *>(diopiBboxOverlapsMmcv) != nullptr) {
    // Note the argument order here: (offset, aligned), not (aligned, offset).
    auto ret = diopiBboxOverlapsMmcv(ch, ious_p, bboxes1_p, bboxes2_p, mode,
                                     offset, aligned);
    if (ret == diopiSuccess) return;
  }
  // DIOPI path unavailable or failed: compute on CPU copies.
  LOG(WARNING) << "Fallback to cpu: mmcv ext op bbox_overlaps";
  auto bboxes1_cpu = bboxes1.cpu();
  auto bboxes2_cpu = bboxes2.cpu();
  auto ious_cpu = ious.cpu();
  bbox_overlaps_impl(bboxes1_cpu, bboxes2_cpu, ious_cpu, mode, aligned, offset);
  // Write the CPU result back into the caller's tensor.
  ious.copy_(ious_cpu);
}
#endif

// Public bbox_overlaps entry point.
// Routes to bbox_overlaps_diopi when built with MMCV_WITH_DIOPI, otherwise
// directly to bbox_overlaps_impl.
void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                   const int mode, const bool aligned, const int offset) {
#ifdef MMCV_WITH_DIOPI
  bbox_overlaps_diopi(bboxes1, bboxes2, ious, mode, aligned, offset);
#else
  bbox_overlaps_impl(bboxes1, bboxes2, ious, mode, aligned, offset);
#endif
}


================================================
FILE: mmcv/ops/csrc/pytorch/bezier_align.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch point for the bezier align forward pass.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under the key
// `bezier_align_forward_impl`; the macro is defined elsewhere and is expected
// to select a device-specific implementation.
void bezier_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                               int aligned_height, int aligned_width,
                               float spatial_scale, int sampling_ratio,
                               bool aligned) {
  DISPATCH_DEVICE_IMPL(bezier_align_forward_impl, input, rois, output,
                       aligned_height, aligned_width, spatial_scale,
                       sampling_ratio, aligned);
}

// Dispatch point for the bezier align backward pass.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under the key
// `bezier_align_backward_impl`.
void bezier_align_backward_impl(Tensor grad_output, Tensor rois,
                                Tensor grad_input, int aligned_height,
                                int aligned_width, float spatial_scale,
                                int sampling_ratio, bool aligned) {
  DISPATCH_DEVICE_IMPL(bezier_align_backward_impl, grad_output, rois,
                       grad_input, aligned_height, aligned_width, spatial_scale,
                       sampling_ratio, aligned);
}

// Public forward entry point: passes its arguments straight through to
// bezier_align_forward_impl.
void bezier_align_forward(Tensor input, Tensor rois, Tensor output,
                          int aligned_height, int aligned_width,
                          float spatial_scale, int sampling_ratio,
                          bool aligned) {
  bezier_align_forward_impl(input, rois, output, aligned_height, aligned_width,
                            spatial_scale, sampling_ratio, aligned);
}

// Public backward entry point: passes its arguments straight through to
// bezier_align_backward_impl.
void bezier_align_backward(Tensor grad_output, Tensor rois, Tensor grad_input,
                           int aligned_height, int aligned_width,
                           float spatial_scale, int sampling_ratio,
                           bool aligned) {
  bezier_align_backward_impl(grad_output, rois, grad_input, aligned_height,
                             aligned_width, spatial_scale, sampling_ratio,
                             aligned);
}


================================================
FILE: mmcv/ops/csrc/pytorch/bias_act.cpp
================================================
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch point for the bias_act op.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under the key
// `bias_act_op_impl` and returns the tensor it produces.
torch::Tensor bias_act_op_impl(const torch::Tensor &input,
                               const torch::Tensor &bias,
                               const torch::Tensor &xref,
                               const torch::Tensor &yref,
                               const torch::Tensor &dy, int grad, int dim,
                               int act, float alpha, float gain, float clamp) {
  return DISPATCH_DEVICE_IMPL(bias_act_op_impl, input, bias, xref, yref, dy,
                              grad, dim, act, alpha, gain, clamp);
}

// Public bias_act entry point: passes its arguments straight through to
// bias_act_op_impl and returns its result.
torch::Tensor bias_act(const torch::Tensor &input, const torch::Tensor &bias,
                       const torch::Tensor &xref, const torch::Tensor &yref,
                       const torch::Tensor &dy, int grad, int dim, int act,
                       float alpha, float gain, float clamp) {
  return bias_act_op_impl(input, bias, xref, yref, dy, grad, dim, act, alpha,
                          gain, clamp);
}


================================================
FILE: mmcv/ops/csrc/pytorch/border_align.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch point for the border align forward pass.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under the key
// `border_align_forward_impl`. `output` and `argmax_idx` are the non-const
// tensors and are presumably filled by the selected implementation.
void border_align_forward_impl(const Tensor &input, const Tensor &boxes,
                               Tensor output, Tensor argmax_idx,
                               const int pool_size) {
  DISPATCH_DEVICE_IMPL(border_align_forward_impl, input, boxes, output,
                       argmax_idx, pool_size);
}

// Dispatch point for the border align backward pass.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under the key
// `border_align_backward_impl`. `grad_input` is the only non-const tensor and
// is presumably the destination.
void border_align_backward_impl(const Tensor &grad_output, const Tensor &boxes,
                                const Tensor &argmax_idx, Tensor grad_input,
                                const int pool_size) {
  DISPATCH_DEVICE_IMPL(border_align_backward_impl, grad_output, boxes,
                       argmax_idx, grad_input, pool_size);
}

// Public forward entry point: passes its arguments straight through to
// border_align_forward_impl.
void border_align_forward(const Tensor &input, const Tensor &boxes,
                          Tensor output, Tensor argmax_idx,
                          const int pool_size) {
  border_align_forward_impl(input, boxes, output, argmax_idx, pool_size);
}

// Public backward entry point: passes its arguments straight through to
// border_align_backward_impl.
void border_align_backward(const Tensor &grad_output, const Tensor &boxes,
                           const Tensor &argmax_idx, Tensor grad_input,
                           const int pool_size) {
  border_align_backward_impl(grad_output, boxes, argmax_idx, grad_input,
                             pool_size);
}


================================================
FILE: mmcv/ops/csrc/pytorch/box_iou_quadri.cpp
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch point for quadrilateral box IoU.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under the key
// `box_iou_quadri_impl`. `ious` is the only non-const tensor and is
// presumably where results are written.
void box_iou_quadri_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                         const int mode_flag, const bool aligned) {
  DISPATCH_DEVICE_IMPL(box_iou_quadri_impl, boxes1, boxes2, ious, mode_flag,
                       aligned);
}

// Interface for Python.
// Passes its arguments straight through to box_iou_quadri_impl.
// NOTE: an earlier comment here said `inline` was required because the
// function lived in a header; it is now a regular (non-inline) definition in
// a .cpp file, so that remark no longer applies.
void box_iou_quadri(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                    const int mode_flag, const bool aligned) {
  box_iou_quadri_impl(boxes1, boxes2, ious, mode_flag, aligned);
}


================================================
FILE: mmcv/ops/csrc/pytorch/box_iou_rotated.cpp
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch point for rotated box IoU.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under the key
// `box_iou_rotated_impl`. `ious` is the only non-const tensor and is
// presumably where results are written.
void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                          const int mode_flag, const bool aligned) {
  DISPATCH_DEVICE_IMPL(box_iou_rotated_impl, boxes1, boxes2, ious, mode_flag,
                       aligned);
}

// Interface for Python.
// Passes its arguments straight through to box_iou_rotated_impl.
// NOTE: an earlier comment here said `inline` was required because the
// function lived in a header; it is now a regular (non-inline) definition in
// a .cpp file, so that remark no longer applies.
void box_iou_rotated(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                     const int mode_flag, const bool aligned) {
  box_iou_rotated_impl(boxes1, boxes2, ious, mode_flag, aligned);
}


================================================
FILE: mmcv/ops/csrc/pytorch/carafe.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch point for the CARAFE forward pass.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under the key
// `carafe_forward_impl`. The r-prefixed tensors (rfeatures, routput, rmasks)
// look like scratch buffers for rearranged data -- TODO confirm in the
// backend implementation.
void carafe_forward_impl(Tensor features, Tensor masks, Tensor rfeatures,
                         Tensor routput, Tensor rmasks, Tensor output,
                         int kernel_size, int group_size, int scale_factor) {
  DISPATCH_DEVICE_IMPL(carafe_forward_impl, features, masks, rfeatures, routput,
                       rmasks, output, kernel_size, group_size, scale_factor);
}

// Dispatch point for the CARAFE backward pass.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under the key
// `carafe_backward_impl`.
void carafe_backward_impl(Tensor top_grad, Tensor rfeatures, Tensor masks,
                          Tensor rtop_grad, Tensor rbottom_grad_hs,
                          Tensor rbottom_grad, Tensor rmask_grad,
                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,
                          int group_size, int scale_factor) {
  DISPATCH_DEVICE_IMPL(carafe_backward_impl, top_grad, rfeatures, masks,
                       rtop_grad, rbottom_grad_hs, rbottom_grad, rmask_grad,
                       bottom_grad, mask_grad, kernel_size, group_size,
                       scale_factor);
}

// Public forward entry point: passes its arguments straight through to
// carafe_forward_impl.
void carafe_forward(Tensor features, Tensor masks, Tensor rfeatures,
                    Tensor routput, Tensor rmasks, Tensor output,
                    int kernel_size, int group_size, int scale_factor) {
  carafe_forward_impl(features, masks, rfeatures, routput, rmasks, output,
                      kernel_size, group_size, scale_factor);
}

// Public backward entry point: passes its arguments straight through to
// carafe_backward_impl.
void carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,
                     Tensor rtop_grad, Tensor rbottom_grad_hs,
                     Tensor rbottom_grad, Tensor rmask_grad, Tensor bottom_grad,
                     Tensor mask_grad, int kernel_size, int group_size,
                     int scale_factor) {
  carafe_backward_impl(top_grad, rfeatures, masks, rtop_grad, rbottom_grad_hs,
                       rbottom_grad, rmask_grad, bottom_grad, mask_grad,
                       kernel_size, group_size, scale_factor);
}


================================================
FILE: mmcv/ops/csrc/pytorch/carafe_naive.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch point for the naive CARAFE forward pass.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under the key
// `carafe_naive_forward_impl`.
void carafe_naive_forward_impl(Tensor features, Tensor masks, Tensor output,
                               int kernel_size, int group_size,
                               int scale_factor) {
  DISPATCH_DEVICE_IMPL(carafe_naive_forward_impl, features, masks, output,
                       kernel_size, group_size, scale_factor);
}

// Dispatch point for the naive CARAFE backward pass.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under the key
// `carafe_naive_backward_impl`.
void carafe_naive_backward_impl(Tensor top_grad, Tensor features, Tensor masks,
                                Tensor bottom_grad, Tensor mask_grad,
                                int kernel_size, int group_size,
                                int scale_factor) {
  DISPATCH_DEVICE_IMPL(carafe_naive_backward_impl, top_grad, features, masks,
                       bottom_grad, mask_grad, kernel_size, group_size,
                       scale_factor);
}

// Public forward entry point: passes its arguments straight through to
// carafe_naive_forward_impl.
void carafe_naive_forward(Tensor features, Tensor masks, Tensor output,
                          int kernel_size, int group_size, int scale_factor) {
  carafe_naive_forward_impl(features, masks, output, kernel_size, group_size,
                            scale_factor);
}

// Public backward entry point: passes its arguments straight through to
// carafe_naive_backward_impl.
void carafe_naive_backward(Tensor top_grad, Tensor features, Tensor masks,
                           Tensor bottom_grad, Tensor mask_grad,
                           int kernel_size, int group_size, int scale_factor) {
  carafe_naive_backward_impl(top_grad, features, masks, bottom_grad, mask_grad,
                             kernel_size, group_size, scale_factor);
}


================================================
FILE: mmcv/ops/csrc/pytorch/chamfer_distance.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cpp

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch point for the chamfer distance forward pass.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under the key
// `chamfer_distance_forward_impl`.
// NOTE(review): every tensor is `const Tensor` and the function returns void,
// so dist1/dist2/idx1/idx2 are presumably output buffers written through the
// const handle -- confirm in the backend implementation.
void chamfer_distance_forward_impl(const Tensor xyz1, const Tensor xyz2,
                                   const Tensor dist1, const Tensor dist2,
                                   const Tensor idx1, const Tensor idx2) {
  DISPATCH_DEVICE_IMPL(chamfer_distance_forward_impl, xyz1, xyz2, dist1, dist2,
                       idx1, idx2);
}

// Dispatch point for the chamfer distance backward pass.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under the key
// `chamfer_distance_backward_impl`. gradxyz1/gradxyz2 are presumably the
// gradients produced for xyz1/xyz2 -- TODO confirm.
void chamfer_distance_backward_impl(const Tensor xyz1, const Tensor xyz2,
                                    Tensor idx1, Tensor idx2, Tensor graddist1,
                                    Tensor graddist2, Tensor gradxyz1,
                                    Tensor gradxyz2) {
  DISPATCH_DEVICE_IMPL(chamfer_distance_backward_impl, xyz1, xyz2, idx1, idx2,
                       graddist1, graddist2, gradxyz1, gradxyz2);
}

// Public forward entry point: passes its arguments straight through to
// chamfer_distance_forward_impl.
void chamfer_distance_forward(const Tensor xyz1, const Tensor xyz2,
                              const Tensor dist1, const Tensor dist2,
                              const Tensor idx1, const Tensor idx2) {
  chamfer_distance_forward_impl(xyz1, xyz2, dist1, dist2, idx1, idx2);
}

// Public backward entry point: passes its arguments straight through to
// chamfer_distance_backward_impl.
void chamfer_distance_backward(const Tensor xyz1, const Tensor xyz2,
                               Tensor idx1, Tensor idx2, Tensor graddist1,
                               Tensor graddist2, Tensor gradxyz1,
                               Tensor gradxyz2) {
  chamfer_distance_backward_impl(xyz1, xyz2, idx1, idx2, graddist1, graddist2,
                                 gradxyz1, gradxyz2);
}


================================================
FILE: mmcv/ops/csrc/pytorch/contour_expand.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
// It is modified from https://github.com/whai362/PSENet
#include <iostream>
#include <queue>

#include "pytorch_cpp_helper.hpp"

using namespace std;

// Plain pair of integer coordinates with public members.
class Point2d {
 public:
  int x;
  int y;

  // Default-constructs the origin (0, 0).
  Point2d() : Point2d(0, 0) {}

  // Constructs a point from explicit coordinates.
  Point2d(int _x, int _y) : x(_x), y(_y) {}
};

void kernel_dilate(const uint8_t *data, IntArrayRef data_shape,
                   const int *label_map, int &label_num, int &min_area,
                   vector<vector<int>> &text_line) {
  std::vector<int> area(label_num + 1);
  int kernel_num = data_shape[0];
  int height = data_shape[1];
  int width = data_shape[2];

  for (int x = 0; x < height; ++x) {
    for (int y = 0; y < width; ++y) {
      int label = label_map[x * width + y];
      if (label == 0) continue;
      area[label] += 1;
    }
  }

  queue<Point2d> queue, next_queue;
  for (int x = 0; x < height; ++x) {
    vector<int> row(width);
    for (int y = 0; y < width; ++y) {
      int label = label_map[x * width + y];
      if (label == 0) continue;
      if (area[label] < min_area) continue;

      Point2d point(x, y);
      queue.push(point);
      row[y] = label;
    }
    text_line.emplace_back(row);
  }

  int dx[] = {-1, 1, 0, 0};
  int dy[] = {0, 0, -1, 1};
  vector<int> kernel_step(kernel_num);
  std::for_each(kernel_step.begin(), kernel_step.end(),
                [=](int &k) { return k * height * width; });

  for (int kernel_id = kernel_num - 2; kernel_id >= 0; --kernel_id) {
    while (!queue.empty()) {
      Point2d point = queue.front();
      queue.pop();
      int x = point.x;
      int y = point.y;
      int label = text_line[x][y];

      bool is_edge = true;
      for (int d = 0; d < 4; ++d) {
        int tmp_x = x + dx[d];
        int tmp_y = y + dy[d];

        if (tmp_x < 0 || tmp_x >= height) continue;
        if (tmp_y < 0 || tmp_y >= width) continue;
        int kernel_value = data[kernel_step[kernel_id] + tmp_x * width + tmp_y];
        if (kernel_value == 0) continue;
        if (text_line[tmp_x][tmp_y] > 0) continue;

        Point2d point(tmp_x, tmp_y);
        queue.push(point);
        text_line[tmp_x][tmp_y] = label;
        is_edge = false;
      }

      if (is_edge) {
        next_queue.push(point);
      }
    }
    swap(queue, next_queue);
  }
}

// Expands labelled kernels through a stack of kernel masks and returns the
// final label map as one vector<int> per image row.
//
//   kernel_mask           - 3-D CPU tensor read as uint8; its last two
//                           sizes must match internal_kernel_label.
//   internal_kernel_label - 2-D CPU tensor read as int32.
//   min_kernel_area       - forwarded to kernel_dilate as min_area.
//   kernel_num            - forwarded to kernel_dilate as label_num.
//
// Both tensors are made contiguous before their raw buffers are read.
// Shape violations raise through TORCH_CHECK; the earlier `assert` calls
// vanish under NDEBUG, which let release builds index the buffers unchecked.
std::vector<std::vector<int>> contour_expand(Tensor kernel_mask,
                                             Tensor internal_kernel_label,
                                             int min_kernel_area,
                                             int kernel_num) {
  kernel_mask = kernel_mask.contiguous();
  internal_kernel_label = internal_kernel_label.contiguous();
  TORCH_CHECK(kernel_mask.dim() == 3,
              "kernel_mask must be a 3-D tensor, but got ", kernel_mask.dim(),
              " dimension(s)");
  TORCH_CHECK(internal_kernel_label.dim() == 2,
              "internal_kernel_label must be a 2-D tensor, but got ",
              internal_kernel_label.dim(), " dimension(s)");
  TORCH_CHECK(kernel_mask.size(1) == internal_kernel_label.size(0),
              "kernel_mask.size(1) (", kernel_mask.size(1),
              ") must equal internal_kernel_label.size(0) (",
              internal_kernel_label.size(0), ")");
  TORCH_CHECK(kernel_mask.size(2) == internal_kernel_label.size(1),
              "kernel_mask.size(2) (", kernel_mask.size(2),
              ") must equal internal_kernel_label.size(1) (",
              internal_kernel_label.size(1), ")");
  CHECK_CPU_INPUT(kernel_mask);
  CHECK_CPU_INPUT(internal_kernel_label);
  auto ptr_data = kernel_mask.data_ptr<uint8_t>();
  IntArrayRef data_shape = kernel_mask.sizes();

  auto data_label_map = internal_kernel_label.data_ptr<int32_t>();
  vector<vector<int>> text_line;

  kernel_dilate(ptr_data, data_shape, data_label_map, kernel_num,
                min_kernel_area, text_line);

  return text_line;
}


================================================
FILE: mmcv/ops/csrc/pytorch/convex_iou.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/SDL-GuoZonghao/BeyondBoundingBox/tree/main/mmdet/ops/iou/src
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch point for convex IoU: hands pointsets, polygons and ious,
// unchanged, to DISPATCH_DEVICE_IMPL under this function's own name.
// NOTE(review): `ious` is presumably the output filled by the device
// implementation -- confirm against the registered kernels.
void convex_iou_impl(const Tensor pointsets, const Tensor polygons,
                     Tensor ious) {
  DISPATCH_DEVICE_IMPL(convex_iou_impl, pointsets, polygons, ious);
}

// Thin wrapper: forwards its three tensors, unchanged, to convex_iou_impl.
void convex_iou(const Tensor pointsets, const Tensor polygons, Tensor ious) {
  convex_iou_impl(pointsets, polygons, ious);
}

// Dispatch point for convex GIoU: hands pointsets, polygons and output,
// unchanged, to DISPATCH_DEVICE_IMPL under this function's own name.
// NOTE(review): the layout of `output` is decided by the device
// implementation and is not visible here.
void convex_giou_impl(const Tensor pointsets, const Tensor polygons,
                      Tensor output) {
  DISPATCH_DEVICE_IMPL(convex_giou_impl, pointsets, polygons, output);
}

// Thin wrapper: forwards its three tensors, unchanged, to convex_giou_impl.
void convex_giou(const Tensor pointsets, const Tensor polygons, Tensor output) {
  convex_giou_impl(pointsets, polygons, output);
}


================================================
FILE: mmcv/ops/csrc/pytorch/correlation.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved.
#include <iostream>

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch point for the correlation forward pass: hands the two inputs, the
// output tensor and all twelve integer settings, unchanged and in the same
// order, to DISPATCH_DEVICE_IMPL under this function's own name.
// NOTE(review): by their names kH/kW are the kernel size, patchH/patchW the
// patch size, padH/padW the padding, dilation* the dilations and dH/dW
// presumably the strides -- confirm against the device kernels.
void correlation_forward_impl(Tensor input1, Tensor input2, Tensor output,
                              int kH, int kW, int patchH, int patchW, int padH,
                              int padW, int dilationH, int dilationW,
                              int dilation_patchH, int dilation_patchW, int dH,
                              int dW) {
  DISPATCH_DEVICE_IMPL(correlation_forward_impl, input1, input2, output, kH, kW,
                       patchH, patchW, padH, padW, dilationH, dilationW,
                       dilation_patchH, dilation_patchW, dH, dW);
}

// Dispatch point for the correlation backward pass: hands grad_output, the two
// inputs, the two input-gradient tensors and all twelve integer settings,
// unchanged and in the same order, to DISPATCH_DEVICE_IMPL under this
// function's own name.
// NOTE(review): grad_input1 / grad_input2 are presumably the outputs filled
// by the device implementation -- confirm against the registered kernels.
void correlation_backward_impl(Tensor grad_output, Tensor input1, Tensor input2,
                               Tensor grad_input1, Tensor grad_input2, int kH,
                               int kW, int patchH, int patchW, int padH,
                               int padW, int dilationH, int dilationW,
                               int dilation_patchH, int dilation_patchW, int dH,
                               int dW) {
  DISPATCH_DEVICE_IMPL(correlation_backward_impl, grad_output, input1, input2,
                       grad_input1, grad_input2, kH, kW, patchH, patchW, padH,
                       padW, dilationH, dilationW, dilation_patchH,
                       dilation_patchW, dH, dW);
}

// Thin wrapper: forwards every argument, unchanged and in the same order, to
// correlation_forward_impl.
void correlation_forward(Tensor input1, Tensor input2, Tensor output, int kH,
                         int kW, int patchH, int patchW, int padH, int padW,
                         int dilationH, int dilationW, int dilation_patchH,
                         int dilation_patchW, int dH, int dW) {
  correlation_forward_impl(input1, input2, output, kH, kW, patchH, patchW, padH,
                           padW, dilationH, dilationW, dilation_patchH,
                           dilation_patchW, dH, dW);
}

// Thin wrapper: forwards every argument, unchanged and in the same order, to
// correlation_backward_impl.
void correlation_backward(Tensor grad_output, Tensor input1, Tensor input2,
                          Tensor grad_input1, Tensor grad_input2, int kH,
                          int kW, int patchH, int patchW, int padH, int padW,
                          int dilationH, int dilationW, int dilation_patchH,
                          int dilation_patchW, int dH, int dW) {
  correlation_backward_impl(grad_output, input1, input2, grad_input1,
                            grad_input2, kH, kW, patchH, patchW, padH, padW,
                            dilationH, dilationW, dilation_patchH,
                            dilation_patchW, dH, dW);
}


================================================
FILE: mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cpu/ActiveRotatingFilter_cpu.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Scatters every weight value into each rotated copy of the filter bank.
//
// With nEntry = num_orientations * kH * kW:
//   weightData  is read as    [num_output_planes][num_input_planes][nEntry]
//   indicesData is read as    [nEntry][num_rotations], holding 1-based
//                             destination positions inside an nEntry slice
//   outputData  is written as [num_output_planes][num_rotations]
//                             [num_input_planes][nEntry]
// The outermost loop (over output planes) is parallelised with OpenMP; all
// loop counters are declared in their loops and are therefore thread-private.
template <typename T>
void active_rotated_filter_forward_cpu_kernel(
    const T* weightData, const int* indicesData, const int num_output_planes,
    const int num_input_planes, const int num_orientations, const int kH,
    const int kW, const int num_rotations, T* outputData) {
  const int nEntry = num_orientations * kH * kW;
  // Size of one [num_input_planes][nEntry] slab.
  const int slab = num_input_planes * nEntry;

#pragma omp parallel for
  for (int out_plane = 0; out_plane < num_output_planes; out_plane++) {
    const T* src = weightData + out_plane * slab;
    T* dst = outputData + out_plane * (num_rotations * slab);
    for (int in_plane = 0; in_plane < num_input_planes; in_plane++) {
      for (int entry = 0; entry < nEntry; entry++) {
        const T val = src[in_plane * nEntry + entry];
        const int* slots = indicesData + entry * num_rotations;
        for (int rot = 0; rot < num_rotations; rot++) {
          // Stored positions are 1-based.
          const int index = slots[rot] - 1;
          dst[rot * slab + in_plane * nEntry + index] = val;
        }
      }
    }
  }
}

// Gathers the gradient of every weight from all of its rotated copies.
//
// With nEntry = num_orientations * kH * kW:
//   gradOutputData is read as    [num_output_planes][num_rotations]
//                                [num_input_planes][nEntry]
//   indicesData    is read as    [nEntry][num_rotations], holding 1-based
//                                source positions inside an nEntry slice
//   gradInputData  is written as [num_output_planes][num_input_planes][nEntry]
// Each gradInput element is reset to zero and then accumulates one
// gradOutput value per rotation. The outermost loop is parallelised with
// OpenMP; all loop counters are declared in their loops and are
// thread-private.
template <typename T>
void active_rotated_filter_backward_cpu_kernel(
    const T* gradOutputData, const int* indicesData,
    const int num_output_planes, const int num_input_planes,
    const int num_orientations, const int kH, const int kW,
    const int num_rotations, T* gradInputData) {
  const int nEntry = num_orientations * kH * kW;
  // Size of one [num_input_planes][nEntry] slab.
  const int slab = num_input_planes * nEntry;

#pragma omp parallel for
  for (int out_plane = 0; out_plane < num_output_planes; out_plane++) {
    const T* src = gradOutputData + out_plane * (num_rotations * slab);
    T* dst = gradInputData + out_plane * slab;
    for (int in_plane = 0; in_plane < num_input_planes; in_plane++) {
      for (int entry = 0; entry < nEntry; entry++) {
        T& grad = dst[in_plane * nEntry + entry];
        grad = 0;
        const int* slots = indicesData + entry * num_rotations;
        for (int rot = 0; rot < num_rotations; rot++) {
          // Stored positions are 1-based.
          const int index = slots[rot] - 1;
          grad = grad + src[rot * slab + in_plane * nEntry + index];
        }
      }
    }
  }
}

// Reads the filter geometry from `input` (sizes 0..4: output planes, input
// planes, orientations, kH, kW) and the rotation count from `indices`
// (size 3), then runs the forward CPU kernel instantiated for the floating
// or half dtype of `input`. `indices` is read as int.
void ActiveRotatedFilterForwardCPULauncher(const Tensor input,
                                           const Tensor indices,
                                           Tensor output) {
  const int n_out_planes = input.size(0);
  const int n_in_planes = input.size(1);
  const int n_orientations = input.size(2);
  const int kernel_h = input.size(3);
  const int kernel_w = input.size(4);
  const int n_rotations = indices.size(3);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "active_rotated_filter_forward_cpu_kernel", [&] {
        active_rotated_filter_forward_cpu_kernel<scalar_t>(
            input.data_ptr<scalar_t>(), indices.data_ptr<int>(), n_out_planes,
            n_in_planes, n_orientations, kernel_h, kernel_w, n_rotations,
            output.data_ptr<scalar_t>());
      });
}

// Reads orientations, kH, kW and the rotation count from `indices`
// (sizes 0..3), derives the plane counts from `grad_out`
// (size 0 / rotations, size 1 / orientations), then runs the backward CPU
// kernel instantiated for the floating or half dtype of `grad_out`.
// `indices` is read as int.
void ActiveRotatedFilterBackwardCPULauncher(const Tensor grad_out,
                                            const Tensor indices,
                                            Tensor grad_in) {
  const int n_orientations = indices.size(0);
  const int kernel_h = indices.size(1);
  const int kernel_w = indices.size(2);
  const int n_rotations = indices.size(3);
  const int n_out_planes = grad_out.size(0) / n_rotations;
  const int n_in_planes = grad_out.size(1) / n_orientations;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_out.scalar_type(), "active_rotated_filter_backward_cpu_kernel", [&] {
        active_rotated_filter_backward_cpu_kernel<scalar_t>(
            grad_out.data_ptr<scalar_t>(), indices.data_ptr<int>(),
            n_out_planes, n_in_planes, n_orientations, kernel_h, kernel_w,
            n_rotations, grad_in.data_ptr<scalar_t>());
      });
}

// CPU entry for the forward pass: forwards its three tensors, unchanged, to
// ActiveRotatedFilterForwardCPULauncher. This is the function registered for
// CPU by the REGISTER_DEVICE_IMPL call further down in this file.
void active_rotated_filter_forward_cpu(const Tensor input, const Tensor indices,
                                       Tensor output) {
  ActiveRotatedFilterForwardCPULauncher(input, indices, output);
}

// CPU entry for the backward pass: forwards its three tensors, unchanged, to
// ActiveRotatedFilterBackwardCPULauncher. This is the function registered for
// CPU by the REGISTER_DEVICE_IMPL call further down in this file.
void active_rotated_filter_backward_cpu(const Tensor grad_out,
                                        const Tensor indices, Tensor grad_in) {
  ActiveRotatedFilterBackwardCPULauncher(grad_out, indices, grad_in);
}

// Declarations of the dispatch functions; their definitions are not in this
// file. They are declared here so they can be named in the registrations
// below.
void active_rotated_filter_forward_impl(const Tensor input,
                                        const Tensor indices, Tensor output);

void active_rotated_filter_backward_impl(const Tensor grad_out,
                                         const Tensor indices, Tensor grad_in);

// Associate the CPU functions above with the dispatch functions for the CPU
// device (REGISTER_DEVICE_IMPL is defined in pytorch_device_registry.hpp,
// not shown here).
REGISTER_DEVICE_IMPL(active_rotated_filter_forward_impl, CPU,
                     active_rotated_filter_forward_cpu);
REGISTER_DEVICE_IMPL(active_rotated_filter_backward_impl, CPU,
                     active_rotated_filter_backward_cpu);


================================================
FILE: mmcv/ops/csrc/pytorch/cpu/bbox_overlaps_cpu.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

using torch::indexing::None;
using torch::indexing::Slice;

void bbox_overlaps_cpu_kernel(const Tensor boxes1, const Tensor boxes2,
                              Tensor ious, const int mode_flag,
                              const bool aligned, const int offset) {
  Tensor temp_ious;
  if (aligned) {
    Tensor lt = torch::max(boxes1.index({Slice(None), Slice({None, 2})}),
                           boxes2.index({Slice(None), Slice({None, 2})}));
    Tensor rb = torch::min(boxes1.index({Slice(None), Slice(2)}),
                           boxes2.index({Slice(None), Slice(2)}));
    Tensor wh = (rb - lt + offset).clamp(0.f, INT_MAX * 1.f);
    Tensor overlap = wh.index({Slice(None), 0}) * wh.index({Slice(None), 1});
    Tensor area1 = (boxes1.index({Slice(None), 2}) -
                    boxes1.index({Slice(None), 0}) + offset) *
                   (boxes1.index({Slice(None), 3}) -
                    boxes1.index({Slice(None), 1}) + offset);
    if (mode_flag == 0) {
      Tensor area2 = (boxes2.index({Slice(None), 2}) -
                      boxes2.index({Slice(None), 0}) + offset) *
                     (boxes2.index({Slice(None), 3}) -
                      boxes2.index({Slice(None), 1}) + offset);
      temp_ious = overlap / (area1 + area2 - overlap);
    } else {
      temp_ious = overlap / area1;
    }
  } else {
    Tensor lt = torch::max(boxes1.index({Slice(None), None, Slice({None, 2})}),
                           boxes2.index({Slice(None), Slice({None, 2})}));
    Tensor rb = torch::min(boxes1.index({Slice(None), None, Slice(2)}),
                           boxes2.index({Slice(None), Slice(2)}));
    Tensor wh = (rb - lt + offset).clamp(0.f, INT_MAX * 1.f);
    Tensor overlap = wh.index({"...", 0}) * wh.index({"...", 1});
    Tensor area1 = (boxes1.index({Slice(None), 2}) -
                    boxes1.index({Slice(None), 0}) + offset) *
                   (boxes1.index({Slice(None), 3}) -
                    boxes1.index({Slice(None), 1}) + offset);
    if (mode_flag == 0) {
      Tensor area2 = (boxes2.index({Slice(None), 2}) -
                      boxes2.index({Slice(None), 0}) + offset) *
                     (boxes2.index({Slice(None), 3}) -
                      boxes2.index({Slice(None), 1}) + offset);
      temp_ious =
          overlap / (area1.index({Slice(None), None}) + area2 - overlap);
    } else {
      temp_ious = overlap / area1.index({Slice(None), None});
    }
  }
  ious.copy_(temp_ious);
}

// CPU entry: forwards every argument, unchanged, to bbox_overlaps_cpu_kernel
// (`mode` is passed as the kernel's mode_flag).
void bbox_overlaps_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                       const int mode, const bool aligned, const int offset) {
  bbox_overlaps_cpu_kernel(boxes1, boxes2, ious, mode, aligned, offset);
}

// Declaration of the dispatch function; its definition is not in this file.
void bbox_overlaps_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                        const int mode, const bool aligned, const int offset);

// Associates bbox_overlaps_cpu with bbox_overlaps_impl for the CPU device
// (REGISTER_DEVICE_IMPL is defined in pytorch_device_registry.hpp, not shown
// here).
REGISTER_DEVICE_IMPL(bbox_overlaps_impl, CPU, bbox_overlaps_cpu);


================================================
FILE: mmcv/ops/csrc/pytorch/cpu/bezier_align.cpp
================================================
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/BezierAlign
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// implementation taken from Caffe2
// One precomputed bilinear sample: the flat offsets of the four neighbouring
// pixels inside a height * width plane and the weight applied to each.
template <typename T>
struct PreCalc {
  // pos1 = (y_low, x_low), pos2 = (y_low, x_high),
  // pos3 = (y_high, x_low), pos4 = (y_high, x_high).
  int pos1, pos2, pos3, pos4;
  // w1..w4 weight the pixels at pos1..pos4 respectively.
  T w1, w2, w3, w4;
};

// Evaluates a cubic Bezier polynomial with control values p0..p3 at
// parameter u:
//   (1-u)^3 * p0 + 3u(1-u)^2 * p1 + 3u^2(1-u) * p2 + u^3 * p3
// The sum is kept as a single expression, in the same term and factor order,
// so floating-point evaluation is unchanged.
template <typename T>
T bezier_curve(const T p0, const T p1, const T p2, const T p3, const T u) {
  const auto inv = 1. - u;
  return (inv * inv * inv * p0 + 3. * u * inv * inv * p1 +
          3. * u * u * inv * p2 + u * u * u * p3);
}

template <typename T>
void pre_calc_for_bilinear_interpolate(
    const int height, const int width, const int pooled_height,
    const int pooled_width, const int iy_upper, const int ix_upper, T p0_x,
    T p0_y, T p1_x, T p1_y, T p2_x, T p2_y, T p3_x, T p3_y, T p4_x, T p4_y,
    T p5_x, T p5_y, T p6_x, T p6_y, T p7_x, T p7_y, T bin_size_h, T bin_size_w,
    int roi_bin_grid_h, int roi_bin_grid_w, T offset,
    std::vector<PreCalc<T>> &pre_calc) {
  int pre_calc_index = 0;
  for (int ph = 0; ph < pooled_height; ph++) {
    for (int pw = 0; pw < pooled_width; pw++) {
      // compute the coords
      const T u = pw / static_cast<T>(pooled_width);
      const T v = ph / static_cast<T>(pooled_height);
      const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u);
      const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u);
      const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u);
      const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u);
      const T x_center = x1 * v + x0 * (1. - v) - offset;
      const T y_center = y1 * v + y0 * (1. - v) - offset;
      for (int iy = 0; iy < iy_upper; iy++) {
        const T yy = y_center - (T)0.5 * bin_size_h +
                     static_cast<T>(iy + .5f) * bin_size_h /
                         static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
        for (int ix = 0; ix < ix_upper; ix++) {
          const T xx = x_center - (T)0.5 * bin_size_w +
                       static_cast<T>(ix + .5f) * bin_size_w /
                           static_cast<T>(roi_bin_grid_w);

          T x = xx;
          T y = yy;
          // deal with: inverse elements are out of feature map boundary
          if (y < -1.0 || y > height || x < -1.0 || x > width) {
            // empty
            PreCalc<T> pc;
            pc.pos1 = 0;
            pc.pos2 = 0;
            pc.pos3 = 0;
            pc.pos4 = 0;
            pc.w1 = 0;
            pc.w2 = 0;
            pc.w3 = 0;
            pc.w4 = 0;
            pre_calc[pre_calc_index] = pc;
            pre_calc_index += 1;
            continue;
          }

          if (y <= 0) {
            y = 0;
          }
          if (x <= 0) {
            x = 0;
          }

          int y_low = (int)y;
          int x_low = (int)x;
          int y_high;
          int x_high;

          if (y_low >= height - 1) {
            y_high = y_low = height - 1;
            y = (T)y_low;
          } else {
            y_high = y_low + 1;
          }

          if (x_low >= width - 1) {
            x_high = x_low = width - 1;
            x = (T)x_low;
          } else {
            x_high = x_low + 1;
          }

          T ly = y - y_low;
          T lx = x - x_low;
          T hy = 1. - ly, hx = 1. - lx;
          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

          // save weights and indices
          PreCalc<T> pc;
          pc.pos1 = y_low * width + x_low;
          pc.pos2 = y_low * width + x_high;
          pc.pos3 = y_high * width + x_low;
          pc.pos4 = y_high * width + x_high;
          pc.w1 = w1;
          pc.w2 = w2;
          pc.w3 = w3;
          pc.w4 = w4;
          pre_calc[pre_calc_index] = pc;

          pre_calc_index += 1;
        }
      }
    }
  }
}

// CPU forward pass of BezierAlign.
//
//   nthreads       - total number of output elements; the RoI count is
//                    derived as nthreads / channels / pooled_width /
//                    pooled_height.
//   input          - read as [batch][channels][height][width].
//   rois           - 17 values per RoI: the batch index followed by 16
//                    control-point coordinates. Values 1..8 give p0..p3 in
//                    order; values 9..16 give p7..p4, i.e. the second curve
//                    is stored reversed.
//   output         - written as [roi][channels][pooled_height][pooled_width].
//   spatial_scale  - multiplies every control-point coordinate.
//   sampling_ratio - samples per bin along each axis when > 0; otherwise
//                    ceil(roi_size / pooled_size) is used.
//   aligned        - when true, coordinates are shifted by 0.5; when false,
//                    the RoI width and height are forced to at least 1.
//
// Each output element is the average of its bin's bilinear samples. The
// sample indices and weights are computed once per RoI and reused for every
// channel.
template <typename T>
void BezierAlignForward(const int nthreads, const T *input, const T *rois,
                        T *output, const int pooled_height,
                        const int pooled_width, const T &spatial_scale,
                        const int sampling_ratio, bool aligned,
                        const int channels, const int height, const int width) {
  int n_rois = nthreads / channels / pooled_width / pooled_height;
  // (n, c, ph, pw) is an element in the pooled output
  // can be parallelized using omp
  // #pragma omp parallel for num_threads(32)
  for (int n = 0; n < n_rois; n++) {
    int index_n = n * channels * pooled_width * pooled_height;

    // beziers have size Nx(1+8*2) = Nx17
    const T *offset_rois = rois + n * 17;
    int roi_batch_ind = offset_rois[0];

    T offset = aligned ? (T)0.5 : (T)0.0;
    // Do not use rounding; this implementation detail is critical
    T p0_x = offset_rois[1] * spatial_scale;
    T p0_y = offset_rois[2] * spatial_scale;
    T p1_x = offset_rois[3] * spatial_scale;
    T p1_y = offset_rois[4] * spatial_scale;
    T p2_x = offset_rois[5] * spatial_scale;
    T p2_y = offset_rois[6] * spatial_scale;
    T p3_x = offset_rois[7] * spatial_scale;
    T p3_y = offset_rois[8] * spatial_scale;
    T p4_x = offset_rois[15] * spatial_scale;
    T p4_y = offset_rois[16] * spatial_scale;
    T p5_x = offset_rois[13] * spatial_scale;
    T p5_y = offset_rois[14] * spatial_scale;
    T p6_x = offset_rois[11] * spatial_scale;
    T p6_y = offset_rois[12] * spatial_scale;
    T p7_x = offset_rois[9] * spatial_scale;
    T p7_y = offset_rois[10] * spatial_scale;

    T roi_width = std::max(std::abs(p0_x - p3_x), std::abs(p4_x - p7_x));
    T roi_height = std::max(std::abs(p0_y - p3_y), std::abs(p4_y - p7_y));
    if (aligned) {
      // The message used to read "cannot have non-negative size", which
      // contradicted the condition being asserted.
      AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
                 "Beziers in BezierAlign cannot have negative size!");
    } else {  // for backward-compatibility only
      roi_width = std::max(roi_width, (T)1.);
      roi_height = std::max(roi_height, (T)1.);
    }
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
                             ? sampling_ratio
                             : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // We do average (integral) pooling inside a bin
    // When the grid is empty, output zeros == 0/1, instead of NaN.
    const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4

    // we want to precalculate indices and weights shared by all channels,
    // this is the key point of optimization
    std::vector<PreCalc<T>> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
                                     pooled_width * pooled_height);
    pre_calc_for_bilinear_interpolate(
        height, width, pooled_height, pooled_width, roi_bin_grid_h,
        roi_bin_grid_w, p0_x, p0_y, p1_x, p1_y, p2_x, p2_y, p3_x, p3_y, p4_x,
        p4_y, p5_x, p5_y, p6_x, p6_y, p7_x, p7_y, bin_size_h, bin_size_w,
        roi_bin_grid_h, roi_bin_grid_w, offset, pre_calc);

    for (int c = 0; c < channels; c++) {
      int index_n_c = index_n + c * pooled_width * pooled_height;
      const T *offset_input =
          input + (roi_batch_ind * channels + c) * height * width;
      // The precomputed records are consumed in the order they were written.
      int pre_calc_index = 0;

      for (int ph = 0; ph < pooled_height; ph++) {
        for (int pw = 0; pw < pooled_width; pw++) {
          int index = index_n_c + ph * pooled_width + pw;

          T output_val = 0.;
          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
              PreCalc<T> pc = pre_calc[pre_calc_index];
              output_val += pc.w1 * offset_input[pc.pos1] +
                            pc.w2 * offset_input[pc.pos2] +
                            pc.w3 * offset_input[pc.pos3] +
                            pc.w4 * offset_input[pc.pos4];

              pre_calc_index += 1;
            }
          }
          output_val /= count;

          output[index] = output_val;
        }  // for pw
      }  // for ph
    }  // for c
  }  // for n
}

// Computes the four bilinear weights and neighbour indices for the sample
// point (y, x) inside a height x width plane.
//
// Points outside [-1, height] x [-1, width] yield zero weights and all
// indices set to -1. Otherwise the point is clamped to >= 0, the low index is
// its truncation, and at the far border both low and high collapse onto the
// last row / column. Results are returned through the reference parameters:
//   w1 -> (y_low, x_low), w2 -> (y_low, x_high),
//   w3 -> (y_high, x_low), w4 -> (y_high, x_high).
// `index` is not used by the computation.
template <typename T>
void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
                                   T &w1, T &w2, T &w3, T &w4, int &x_low,
                                   int &x_high, int &y_low, int &y_high,
                                   const int index /* index for debug only*/) {
  const bool outside = y < -1.0 || y > height || x < -1.0 || x > width;
  if (outside) {
    w1 = w2 = w3 = w4 = 0.;
    x_low = x_high = y_low = y_high = -1;
    return;
  }

  if (y <= 0) {
    y = 0;
  }
  if (x <= 0) {
    x = 0;
  }

  y_low = static_cast<int>(y);
  x_low = static_cast<int>(x);

  if (y_low < height - 1) {
    y_high = y_low + 1;
  } else {
    y_low = height - 1;
    y_high = y_low;
    y = static_cast<T>(y_low);
  }

  if (x_low < width - 1) {
    x_high = x_low + 1;
  } else {
    x_low = width - 1;
    x_high = x_low;
    x = static_cast<T>(x_low);
  }

  // Fractional distances to the low neighbour and their complements.
  const T ly = y - y_low;
  const T lx = x - x_low;
  const T hy = 1. - ly;
  const T hx = 1. - lx;

  w1 = hy * hx;
  w2 = hy * lx;
  w3 = ly * hx;
  w4 = ly * lx;
}

// Adds val to the value stored at address, in place (no atomicity).
template <class T>
inline void add(T *address, const T &val) {
  T &target = *address;
  target += val;
}

// CPU backward pass of BezierAlign: scatters every output gradient back onto
// the four input pixels of each bilinear sample it was averaged from.
//
//   nthreads        - number of output elements to process; element `index`
//                     is decomposed as (n, c, ph, pw) with pw fastest.
//   grad_output     - read through the explicit strides n_stride, c_stride,
//                     h_stride and w_stride.
//   rois            - 17 values per RoI: the batch index followed by 16
//                     control-point coordinates. Values 1..8 give p0..p3 in
//                     order; values 9..16 give p7..p4, i.e. the second curve
//                     is stored reversed.
//   grad_input      - accumulated into, indexed as
//                     [batch][channels][height][width]; this function does
//                     not clear it first.
//   spatial_scale, sampling_ratio, aligned - same meaning as in the forward
//                     pass: scale applied to the control points, samples per
//                     bin (or ceil(roi_size / pooled_size) when <= 0), and
//                     the 0.5 coordinate shift / minimum-size-1 switch.
template <typename T>
void BezierAlignBackward(const int nthreads, const T *grad_output,
                         const T *rois, T *grad_input, const int pooled_height,
                         const int pooled_width, const T &spatial_scale,
                         const int sampling_ratio, bool aligned,
                         const int channels, const int height, const int width,
                         const int n_stride, const int c_stride,
                         const int h_stride, const int w_stride) {
  for (int index = 0; index < nthreads; index++) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T *offset_rois = rois + n * 17;
    int roi_batch_ind = offset_rois[0];

    // Do not use rounding; this implementation detail is critical
    T offset = aligned ? (T)0.5 : (T)0.0;
    T p0_x = offset_rois[1] * spatial_scale;
    T p0_y = offset_rois[2] * spatial_scale;
    T p1_x = offset_rois[3] * spatial_scale;
    T p1_y = offset_rois[4] * spatial_scale;
    T p2_x = offset_rois[5] * spatial_scale;
    T p2_y = offset_rois[6] * spatial_scale;
    T p3_x = offset_rois[7] * spatial_scale;
    T p3_y = offset_rois[8] * spatial_scale;
    T p4_x = offset_rois[15] * spatial_scale;
    T p4_y = offset_rois[16] * spatial_scale;
    T p5_x = offset_rois[13] * spatial_scale;
    T p5_y = offset_rois[14] * spatial_scale;
    T p6_x = offset_rois[11] * spatial_scale;
    T p6_y = offset_rois[12] * spatial_scale;
    T p7_x = offset_rois[9] * spatial_scale;
    T p7_y = offset_rois[10] * spatial_scale;

    // compute the coords
    const T u = pw / static_cast<T>(pooled_width);
    const T v = ph / static_cast<T>(pooled_height);
    const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u);
    const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u);
    const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u);
    const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u);
    const T x_center = x1 * v + x0 * (1. - v) - offset;
    const T y_center = y1 * v + y0 * (1. - v) - offset;

    T roi_width = std::max(std::abs(p0_x - p3_x), std::abs(p4_x - p7_x));
    T roi_height = std::max(std::abs(p0_y - p3_y), std::abs(p4_y - p7_y));
    if (aligned) {
      // The message used to read "do not have non-negative size", which
      // contradicted the condition being asserted.
      AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
                 "Beziers in BezierAlign cannot have negative size!");
    } else {  // for backward-compatibility only
      roi_width = std::max(roi_width, (T)1.);
      roi_height = std::max(roi_height, (T)1.);
    }
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    T *offset_grad_input =
        grad_input + ((roi_batch_ind * channels + c) * height * width);

    int output_offset = n * n_stride + c * c_stride;
    const T *offset_grad_output = grad_output + output_offset;
    const T grad_output_this_bin =
        offset_grad_output[ph * h_stride + pw * w_stride];

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
                             ? sampling_ratio
                             : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // We do average (integral) pooling inside a bin.
    // count is only used inside the sampling loops, which do not run when
    // the grid is empty, so a zero count is never divided by.
    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4

    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
      const T y = y_center - (T)0.5 * bin_size_h +
                  static_cast<T>(iy + .5f) * bin_size_h /
                      static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T x = x_center - (T)0.5 * bin_size_w +
                    static_cast<T>(ix + .5f) * bin_size_w /
                        static_cast<T>(roi_bin_grid_w);

        T w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;

        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
                                      x_low, x_high, y_low, y_high, index);

        T g1 = grad_output_this_bin * w1 / count;
        T g2 = grad_output_this_bin * w2 / count;
        T g3 = grad_output_this_bin * w3 / count;
        T g4 = grad_output_this_bin * w4 / count;

        // Indices are -1 for samples outside the feature map; skip those.
        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          // atomic add is not needed for now since it is single threaded
          add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
          add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
          add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
          add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
        }  // if
      }  // ix
    }  // iy
  }  // for
}  // BezierAlignBackward

// Reads channels / height / width from `input` (sizes 1..3) and the element
// count from `output`, then runs BezierAlignForward instantiated for the
// floating or half dtype of `input`. `rois` and `output` are read with that
// same dtype; spatial_scale is converted to it.
void BezierAlignForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
                                   int aligned_height, int aligned_width,
                                   float spatial_scale, int sampling_ratio,
                                   bool aligned) {
  const int num_outputs = output.numel();
  const int num_channels = input.size(1);
  const int in_height = input.size(2);
  const int in_width = input.size(3);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "BezierAlign_forward", [&] {
        BezierAlignForward<scalar_t>(
            num_outputs, input.data_ptr<scalar_t>(),
            rois.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
            aligned_height, aligned_width,
            static_cast<scalar_t>(spatial_scale), sampling_ratio, aligned,
            num_channels, in_height, in_width);
      });
}

// Reads channels / height / width from `grad_input` (sizes 1..3), the element
// count and the four strides from `grad_output`, then runs
// BezierAlignBackward instantiated for the floating or half dtype of
// `grad_output`. `rois` and `grad_input` are read with that same dtype;
// spatial_scale is converted to it.
void BezierAlignBackwardCPULauncher(Tensor grad_output, Tensor rois,
                                    Tensor grad_input, int aligned_height,
                                    int aligned_width, float spatial_scale,
                                    int sampling_ratio, bool aligned) {
  const int num_outputs = grad_output.numel();
  const int num_channels = grad_input.size(1);
  const int in_height = grad_input.size(2);
  const int in_width = grad_input.size(3);

  // grad_output is addressed through its strides, so it need not be
  // contiguous.
  const int stride_n = grad_output.stride(0);
  const int stride_c = grad_output.stride(1);
  const int stride_h = grad_output.stride(2);
  const int stride_w = grad_output.stride(3);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "BezierAlign_backward", [&] {
        BezierAlignBackward<scalar_t>(
            num_outputs, grad_output.data_ptr<scalar_t>(),
            rois.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),
            aligned_height, aligned_width,
            static_cast<scalar_t>(spatial_scale), sampling_ratio, aligned,
            num_channels, in_height, in_width, stride_n, stride_c, stride_h,
            stride_w);
      });
}

// Declarations of the dispatch functions; their definitions are not in this
// file. They are declared here so they can be named in the registrations
// below.
void bezier_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                               int aligned_height, int aligned_width,
                               float spatial_scale, int sampling_ratio,
                               bool aligned);

void bezier_align_backward_impl(Tensor grad_output, Tensor rois,
                                Tensor grad_input, int aligned_height,
                                int aligned_width, float spatial_scale,
                                int sampling_ratio, bool aligned);

// Associate the CPU launchers above with the dispatch functions for the CPU
// device (REGISTER_DEVICE_IMPL is defined in pytorch_device_registry.hpp,
// not shown here).
REGISTER_DEVICE_IMPL(bezier_align_forward_impl, CPU,
                     BezierAlignForwardCPULauncher);
REGISTER_DEVICE_IMPL(bezier_align_backward_impl, CPU,
                     BezierAlignBackwardCPULauncher);


================================================
FILE: mmcv/ops/csrc/pytorch/cpu/box_iou_quadri.cpp
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "box_iou_rotated_utils.hpp"
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Fills `ious` with the results of single_box_iou_quadri<T>.
//
// Aligned mode: entry k of `ious` is computed from boxes1[k] and boxes2[k]
// for every k below ious.numel().
// Otherwise: every row i of `boxes1` is paired with every row j of `boxes2`
// and the result is stored at ious[i * boxes2.size(0) + j].
// `mode_flag` is forwarded unchanged to single_box_iou_quadri.
template <typename T>
void box_iou_quadri_cpu_kernel(const Tensor boxes1, const Tensor boxes2,
                               Tensor ious, const int mode_flag,
                               const bool aligned) {
  const int total = ious.numel();
  const auto rows = boxes1.size(0);
  const auto cols = boxes2.size(0);

  if (!aligned) {
    // Full cross product, one output entry per (row of boxes1, row of boxes2).
    for (int r = 0; r < rows; ++r) {
      for (int c = 0; c < cols; ++c) {
        ious[r * cols + c] = single_box_iou_quadri<T>(
            boxes1[r].data_ptr<T>(), boxes2[c].data_ptr<T>(), mode_flag);
      }
    }
    return;
  }

  // One-to-one pairing of the k-th box of each input.
  for (int k = 0; k < total; ++k) {
    ious[k] = single_box_iou_quadri<T>(boxes1[k].data_ptr<T>(),
                                       boxes2[k].data_ptr<T>(), mode_flag);
  }
}

// CPU entry point: forwards every argument to box_iou_quadri_cpu_kernel.
// The kernel is always instantiated with T = float here; no dispatch on the
// tensors' dtype takes place in this function.
void box_iou_quadri_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                        const int mode_flag, const bool aligned) {
  box_iou_quadri_cpu_kernel<float>(boxes1, boxes2, ious, mode_flag, aligned);
}

// Declaration of the box_iou_quadri entry point (definition not visible in
// this section), followed by the association of box_iou_quadri_cpu with it for
// the CPU device. NOTE(review): exact semantics of REGISTER_DEVICE_IMPL live
// in pytorch_device_registry.hpp - confirm there.
void box_iou_quadri_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                         const int mode_flag, const bool aligned);
REGISTER_DEVICE_IMPL(box_iou_quadri_impl, CPU, box_iou_quadri_cpu);


================================================
FILE: mmcv/ops/csrc/pytorch/cpu/box_iou_rotated.cpp
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
#include "box_iou_rotated_utils.hpp"
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Fills `ious` with the results of single_box_iou_rotated<T>.
//
// Aligned mode: entry k of `ious` is computed from boxes1[k] and boxes2[k]
// for every k below ious.numel().
// Otherwise: every row i of `boxes1` is paired with every row j of `boxes2`
// and the result is stored at ious[i * boxes2.size(0) + j].
// `mode_flag` is forwarded unchanged to single_box_iou_rotated.
template <typename T>
void box_iou_rotated_cpu_kernel(const Tensor boxes1, const Tensor boxes2,
                                Tensor ious, const int mode_flag,
                                const bool aligned) {
  const int total = ious.numel();
  const auto rows = boxes1.size(0);
  const auto cols = boxes2.size(0);

  if (!aligned) {
    // Full cross product, one output entry per (row of boxes1, row of boxes2).
    for (int r = 0; r < rows; ++r) {
      for (int c = 0; c < cols; ++c) {
        ious[r * cols + c] = single_box_iou_rotated<T>(
            boxes1[r].data_ptr<T>(), boxes2[c].data_ptr<T>(), mode_flag);
      }
    }
    return;
  }

  // One-to-one pairing of the k-th box of each input.
  for (int k = 0; k < total; ++k) {
    ious[k] = single_box_iou_rotated<T>(boxes1[k].data_ptr<T>(),
                                        boxes2[k].data_ptr<T>(), mode_flag);
  }
}

// CPU entry point: forwards every argument to box_iou_rotated_cpu_kernel.
// The kernel is always instantiated with T = float here; no dispatch on the
// tensors' dtype takes place in this function.
void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                         const int mode_flag, const bool aligned) {
  box_iou_rotated_cpu_kernel<float>(boxes1, boxes2, ious, mode_flag, aligned);
}

// Declaration of the box_iou_rotated entry point (definition not visible in
// this section), followed by the association of box_iou_rotated_cpu with it
// for the CPU device. NOTE(review): exact semantics of REGISTER_DEVICE_IMPL
// live in pytorch_device_registry.hpp - confirm there.
void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                          const int mode_flag, const bool aligned);
REGISTER_DEVICE_IMPL(box_iou_rotated_impl, CPU, box_iou_rotated_cpu);


================================================
FILE: mmcv/ops/csrc/pytorch/cpu/deform_conv.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Bilinear sample of a single-channel map at the fractional position (h, w).
//
// `input` is indexed as input[row * data_width + col]. Positions with
// h <= -1, h >= height, w <= -1 or w >= width yield 0. Of the four pixels
// surrounding (h, w), those lying outside [0, height-1] x [0, width-1] are
// treated as 0.
template <typename T>
T deformable_im2col_bilinear_cpu(const T *input, const int data_width,
                                 const int height, const int width, T h, T w) {
  if (h <= -1 || w <= -1 || height <= h || width <= w) {
    return 0;
  }

  // Integer cell that contains the sample point.
  const int top = floor(h);
  const int left = floor(w);
  const int bottom = top + 1;
  const int right = left + 1;

  // Fractional position inside the cell and its complements.
  const T frac_h = h - top;
  const T frac_w = w - left;
  const T rest_h = 1 - frac_h;
  const T rest_w = 1 - frac_w;

  // Which cell edges are still inside the map.
  const bool top_ok = top >= 0;
  const bool left_ok = left >= 0;
  const bool bottom_ok = bottom <= height - 1;
  const bool right_ok = right <= width - 1;

  const T top_left =
      (top_ok && left_ok) ? input[top * data_width + left] : static_cast<T>(0);
  const T top_right = (top_ok && right_ok) ? input[top * data_width + right]
                                           : static_cast<T>(0);
  const T bottom_left = (bottom_ok && left_ok)
                            ? input[bottom * data_width + left]
                            : static_cast<T>(0);
  const T bottom_right = (bottom_ok && right_ok)
                             ? input[bottom * data_width + right]
                             : static_cast<T>(0);

  const T k_tl = rest_h * rest_w;
  const T k_tr = rest_h * frac_w;
  const T k_bl = frac_h * rest_w;
  const T k_br = frac_h * frac_w;

  return (k_tl * top_left + k_tr * top_right + k_bl * bottom_left +
          k_br * bottom_right);
}

// Bilinear weight that the integer pixel (h, w) receives from a sample taken
// at the fractional position (argmax_h, argmax_w).
//
// Returns 0 when the sample position lies at or beyond -1 / height / width, or
// when (h, w) is not one of the four pixels of the cell that contains the
// sample position.
template <typename T>
T get_gradient_weight_cpu(T argmax_h, T argmax_w, const int h, const int w,
                          const int height, const int width) {
  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
      argmax_w >= width) {
    return 0;
  }

  const int row_lo = floor(argmax_h);
  const int col_lo = floor(argmax_w);
  const int row_hi = row_lo + 1;
  const int col_hi = col_lo + 1;

  const bool on_lo_row = (h == row_lo);
  const bool on_hi_row = (h == row_hi);
  const bool on_lo_col = (w == col_lo);
  const bool on_hi_col = (w == col_hi);

  // The pixel must sit on one of the two rows and one of the two columns of
  // the cell; lo and hi differ by one, so at most one of each pair matches.
  if (!(on_lo_row || on_hi_row) || !(on_lo_col || on_hi_col)) {
    return 0;
  }

  const T along_h = on_lo_row ? (h + 1 - argmax_h) : (argmax_h + 1 - h);
  const T along_w = on_lo_col ? (w + 1 - argmax_w) : (argmax_w + 1 - w);
  return along_h * along_w;
}

// Derivative of the bilinear sample of `im_data` at (argmax_h, argmax_w) with
// respect to one coordinate of the sample position.
//
// `bp_dir` selects the coordinate: 0 differentiates along h, 1 along w; any
// other value yields 0. `im_data` is indexed as im_data[row * data_width +
// col]. Corner pixels outside [0, height-1] x [0, width-1] are skipped, and a
// sample position at or beyond -1 / height / width yields 0.
template <typename T>
T get_coordinate_weight_cpu(T argmax_h, T argmax_w, const int height,
                            const int width, const T *im_data,
                            const int data_width, const int bp_dir) {
  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
      argmax_w >= width) {
    return 0;
  }

  const int row_lo = floor(argmax_h);
  const int col_lo = floor(argmax_w);
  const int row_hi = row_lo + 1;
  const int col_hi = col_lo + 1;

  // Validity of each cell edge; a corner is read only if both of its edges
  // are inside the map.
  const bool row_lo_ok = row_lo >= 0;
  const bool col_lo_ok = col_lo >= 0;
  const bool row_hi_ok = row_hi <= height - 1;
  const bool col_hi_ok = col_hi <= width - 1;

  T weight = 0;

  switch (bp_dir) {
    case 0:
      // d/dh: low-row corners enter negatively, high-row corners positively.
      if (row_lo_ok && col_lo_ok)
        weight += -1 * (col_lo + 1 - argmax_w) *
                  im_data[row_lo * data_width + col_lo];
      if (row_lo_ok && col_hi_ok)
        weight +=
            -1 * (argmax_w - col_lo) * im_data[row_lo * data_width + col_hi];
      if (row_hi_ok && col_lo_ok)
        weight +=
            (col_lo + 1 - argmax_w) * im_data[row_hi * data_width + col_lo];
      if (row_hi_ok && col_hi_ok)
        weight += (argmax_w - col_lo) * im_data[row_hi * data_width + col_hi];
      break;
    case 1:
      // d/dw: low-column corners enter negatively, high-column positively.
      if (row_lo_ok && col_lo_ok)
        weight += -1 * (row_lo + 1 - argmax_h) *
                  im_data[row_lo * data_width + col_lo];
      if (row_lo_ok && col_hi_ok)
        weight +=
            (row_lo + 1 - argmax_h) * im_data[row_lo * data_width + col_hi];
      if (row_hi_ok && col_lo_ok)
        weight +=
            -1 * (argmax_h - row_lo) * im_data[row_hi * data_width + col_lo];
      if (row_hi_ok && col_hi_ok)
        weight += (argmax_h - row_lo) * im_data[row_hi * data_width + col_hi];
      break;
    default:
      break;
  }

  return weight;
}

// Deformable im2col on the CPU.
//
// Each of the `n` work items is decoded from its flat index into an input
// channel, a batch entry and an output position (h_col, w_col). For every
// kernel tap (i, j) the item reads an (offset_h, offset_w) pair from
// `data_offset`, adds it to the regular sampling position, samples `data_im`
// bilinearly there and stores the value in `data_col`. Positions outside the
// open range (-1, height) x (-1, width) store 0.
template <typename T>
void deformable_im2col_cpu_kernel(
    const int n, const T *data_im, const T *data_offset, const int height,
    const int width, const int kernel_h, const int kernel_w, const int pad_h,
    const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int num_channels, const int deformable_group, const int height_col,
    const int width_col, T *data_col) {
  // Distance in data_col between two consecutive kernel taps of one item.
  const int tap_stride = batch_size * height_col * width_col;

  for (int index = 0; index < n; ++index) {
    // Unpack the flat index; w_col varies fastest, the channel slowest.
    const int w_col = index % width_col;
    const int row_index = index / width_col;
    const int h_col = row_index % height_col;
    const int plane_index = row_index / height_col;
    const int b_col = plane_index % batch_size;
    const int c_im = plane_index / batch_size;

    // First column channel written for this input channel.
    const int c_col = c_im * kernel_h * kernel_w;
    // Offset group this input channel belongs to.
    const int group = c_im / channel_per_deformable_group;

    // Top-left corner of the undeformed receptive field.
    const int h_in = h_col * stride_h - pad_h;
    const int w_in = w_col * stride_w - pad_w;

    T *col_out =
        data_col +
        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
    const T *image = data_im + (b_col * num_channels + c_im) * height * width;
    const T *offsets =
        data_offset + (b_col * deformable_group + group) * 2 * kernel_h *
                          kernel_w * height_col * width_col;

    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        // Each tap owns two consecutive offset planes: h first, then w.
        const int tap = i * kernel_w + j;
        const int off_h_idx =
            ((2 * tap) * height_col + h_col) * width_col + w_col;
        const int off_w_idx =
            ((2 * tap + 1) * height_col + h_col) * width_col + w_col;

        const T offset_h = offsets[off_h_idx];
        const T offset_w = offsets[off_w_idx];
        const T h_im = h_in + i * dilation_h + offset_h;
        const T w_im = w_in + j * dilation_w + offset_w;

        T sample = static_cast<T>(0);
        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
          sample = deformable_im2col_bilinear_cpu(image, width, height, width,
                                                  h_im, w_im);
        }

        *col_out = sample;
        col_out += tap_stride;
      }
    }
  }
}

// Scatters column gradients back onto the input image gradient.
//
// Each of the `n` work items is one entry of `data_col`, decoded from its flat
// index into (channel, kernel row i, kernel column j, batch, output row,
// output column). The deformed sampling position of that entry is rebuilt from
// `data_offset`, and the entry's gradient is accumulated into every pixel of
// `grad_im` that lies less than one pixel away from that position in both
// directions, weighted by get_gradient_weight_cpu.
template <typename T>
void deformable_col2im_cpu_kernel(
    const int n, const T *data_col, const T *data_offset, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int deformable_group, const int height_col, const int width_col,
    T *grad_im) {
  for (int index = 0; index < n; ++index) {
    // Peel the flat index apart, fastest-varying component first.
    const int w_out = index % width_col;
    const int q_row = index / width_col;
    const int h_out = q_row % height_col;
    const int q_batch = q_row / height_col;
    const int b = q_batch % batch_size;
    const int q_tap = q_batch / batch_size;
    const int j = q_tap % kernel_w;
    const int q_krow = q_tap / kernel_w;
    const int i = q_krow % kernel_h;
    const int c = q_krow / kernel_h;

    const int group = c / channel_per_deformable_group;

    const int w_in = w_out * stride_w - pad_w;
    const int h_in = h_out * stride_h - pad_h;

    // Offsets of this tap: plane 2*tap holds h, plane 2*tap + 1 holds w.
    const T *offsets =
        data_offset + (b * deformable_group + group) * 2 * kernel_h *
                          kernel_w * height_col * width_col;
    const int tap = i * kernel_w + j;
    const int off_h_idx = ((2 * tap) * height_col + h_out) * width_col + w_out;
    const int off_w_idx =
        ((2 * tap + 1) * height_col + h_out) * width_col + w_out;
    const T offset_h = offsets[off_h_idx];
    const T offset_w = offsets[off_w_idx];

    // Deformed sampling position used by the forward pass for this entry.
    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;

    const T cur_top_grad = data_col[index];
    const int cur_h = (int)cur_inv_h_data;
    const int cur_w = (int)cur_inv_w_data;

    // Visit the 5x5 integer neighbourhood of the truncated position and keep
    // only in-bounds pixels closer than one pixel to the sample point.
    for (int dy = -2; dy <= 2; ++dy) {
      const int y = cur_h + dy;
      for (int dx = -2; dx <= 2; ++dx) {
        const int x = cur_w + dx;
        if (y < 0 || y >= height || x < 0 || x >= width) continue;
        if (!(abs(cur_inv_h_data - y) < 1)) continue;
        if (!(abs(cur_inv_w_data - x) < 1)) continue;

        const int grad_pos = ((b * channels + c) * height + y) * width + x;
        const T weight = get_gradient_weight_cpu(cur_inv_h_data, cur_inv_w_data,
                                                 y, x, height, width);
        grad_im[grad_pos] += weight * cur_top_grad;
      }
    }
  }
}

// Computes the gradient with respect to the sampling offsets.
//
// Each of the `n` work items is one element of `grad_offset`, decoded from its
// flat index into (batch b, offset channel c, output row h, output column w).
// The item walks over the column channels of its offset group that use the
// same kernel tap, rebuilds the deformed sampling position of each one, and
// accumulates column gradient times get_coordinate_weight_cpu. The parity of
// the offset channel inside its group is passed as bp_dir to that helper.
template <typename T>
void deformable_col2im_coord_cpu_kernel(
    const int n, const T *data_col, const T *data_im, const T *data_offset,
    const int channels, const int height, const int width, const int kernel_h,
    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int offset_channels, const int deformable_group, const int height_col,
    const int width_col, T *grad_offset) {
  // Column channels that share a kernel tap are this many channels apart.
  const int col_step = kernel_h * kernel_w;

  for (int index = 0; index < n; ++index) {
    const int w = index % width_col;
    const int q_row = index / width_col;
    const int h = q_row % height_col;
    const int q_chan = q_row / height_col;
    const int c = q_chan % offset_channels;
    const int b = q_chan / offset_channels;

    const int group = c / (2 * kernel_h * kernel_w);

    // Base pointers of the column, image and offset data of this group.
    const T *col_base = data_col + group * channel_per_deformable_group *
                                       batch_size * width_col * height_col;
    const T *im_base = data_im + (b * deformable_group + group) *
                                     channel_per_deformable_group / kernel_h /
                                     kernel_w * height * width;
    const T *offsets =
        data_offset + (b * deformable_group + group) * 2 * kernel_h *
                          kernel_w * height_col * width_col;

    // Offset channel relative to the start of its group; its parity selects
    // the coordinate that is differentiated.
    const int offset_c = c - group * 2 * kernel_h * kernel_w;
    const int bp_dir = offset_c % 2;

    T val = 0;
    // `cnt` counts the visited column channels and selects the image plane.
    for (int col_c = offset_c / 2, cnt = 0;
         col_c < channel_per_deformable_group; col_c += col_step, ++cnt) {
      const int col_pos =
          (((col_c * batch_size + b) * height_col) + h) * width_col + w;

      // Recover kernel tap and output position from the column position.
      const int w_out = col_pos % width_col;
      const int q_out_row = col_pos / width_col;
      const int h_out = q_out_row % height_col;
      const int q_tap = q_out_row / height_col / batch_size;
      const int j = q_tap % kernel_w;
      const int i = (q_tap / kernel_w) % kernel_h;

      const int w_in = w_out * stride_w - pad_w;
      const int h_in = h_out * stride_h - pad_h;

      const int tap = i * kernel_w + j;
      const int off_h_idx =
          (((2 * tap) * height_col + h_out) * width_col + w_out);
      const int off_w_idx =
          (((2 * tap + 1) * height_col + h_out) * width_col + w_out);
      const T offset_h = offsets[off_h_idx];
      const T offset_w = offsets[off_w_idx];

      T inv_h = h_in + i * dilation_h + offset_h;
      T inv_w = w_in + j * dilation_w + offset_w;
      // Out-of-range positions are replaced by (-2, -2).
      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
        inv_h = inv_w = -2;
      }

      const T weight = get_coordinate_weight_cpu(
          inv_h, inv_w, height, width, im_base + cnt * height * width, width,
          bp_dir);
      val += weight * col_base[col_pos];
    }

    grad_offset[index] = val;
  }
}

// CPU launcher for deformable im2col.
//
// Derives the output extents from the input size, padding, dilation, kernel
// size and stride, sizes the work list as one item per (channel, output
// position, image), and dispatches on the dtype of `data_im` (floating types
// and half) to deformable_im2col_cpu_kernel, which writes into `data_col`.
void deformable_im2col_cpu(Tensor data_im, Tensor data_offset,
                           const int channels, const int height,
                           const int width, const int ksize_h,
                           const int ksize_w, const int pad_h, const int pad_w,
                           const int stride_h, const int stride_w,
                           const int dilation_h, const int dilation_w,
                           const int parallel_imgs, const int deformable_group,
                           Tensor data_col) {
  // Extent covered by the dilated kernel in each direction.
  const int kernel_extent_h = dilation_h * (ksize_h - 1) + 1;
  const int kernel_extent_w = dilation_w * (ksize_w - 1) + 1;

  const int height_col = (height + 2 * pad_h - kernel_extent_h) / stride_h + 1;
  const int width_col = (width + 2 * pad_w - kernel_extent_w) / stride_w + 1;

  const int num_kernels = channels * height_col * width_col * parallel_imgs;
  const int channel_per_deformable_group = channels / deformable_group;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_im.scalar_type(), "deformable_im2col_cpu", ([&] {
        const scalar_t *im_ptr = data_im.data_ptr<scalar_t>();
        const scalar_t *offset_ptr = data_offset.data_ptr<scalar_t>();
        scalar_t *col_ptr = data_col.data_ptr<scalar_t>();

        deformable_im2col_cpu_kernel<scalar_t>(
            num_kernels, im_ptr, offset_ptr, height, width, ksize_h, ksize_w,
            pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
            channel_per_deformable_group, parallel_imgs, channels,
            deformable_group, height_col, width_col, col_ptr);
      }));
}

// CPU launcher for deformable col2im (gradient with respect to the input).
//
// Derives the output extents from the input size, padding, dilation, kernel
// size and stride, sizes the work list as one item per entry of the column
// buffer (channels * ksize_h * ksize_w * height_col * width_col *
// parallel_imgs), and dispatches on the dtype of `data_col` (floating types
// and half) to deformable_col2im_cpu_kernel, which accumulates into `grad_im`.
void deformable_col2im_cpu(Tensor data_col, Tensor data_offset,
                           const int channels, const int height,
                           const int width, const int ksize_h,
                           const int ksize_w, const int pad_h, const int pad_w,
                           const int stride_h, const int stride_w,
                           const int dilation_h, const int dilation_w,
                           const int parallel_imgs, const int deformable_group,
                           Tensor grad_im) {
  // todo: make sure parallel_imgs is passed in correctly
  int height_col =
      (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
  int width_col =
      (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
  int num_kernels =
      channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;
  int channel_per_deformable_group = channels / deformable_group;

  // The dispatch label is the name reported when the dtype is unsupported; it
  // must name this CPU function (it previously said "..._gpu").
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_col.scalar_type(), "deformable_col2im_cpu", ([&] {
        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();

        deformable_col2im_cpu_kernel<scalar_t>(
            num_kernels, data_col_, data_offset_, channels, height, width,
            ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
            dilation_w, channel_per_deformable_group, parallel_imgs,
            deformable_group, height_col, width_col, grad_im_);
      }));
}

// CPU launcher for the gradient with respect to the sampling offsets.
//
// Derives the output extents from the input size, padding, dilation, kernel
// size and stride, sizes the work list as one item per offset element
// (height_col * width_col * 2 * ksize_h * ksize_w * deformable_group *
// parallel_imgs), and dispatches on the dtype of `data_col` (floating types
// and half) to deformable_col2im_coord_cpu_kernel, which writes `grad_offset`.
void deformable_col2im_coord_cpu(
    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
    const int height, const int width, const int ksize_h, const int ksize_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int parallel_imgs,
    const int deformable_group, Tensor grad_offset) {
  // Extent covered by the dilated kernel in each direction.
  const int kernel_extent_h = dilation_h * (ksize_h - 1) + 1;
  const int kernel_extent_w = dilation_w * (ksize_w - 1) + 1;

  const int height_col = (height + 2 * pad_h - kernel_extent_h) / stride_h + 1;
  const int width_col = (width + 2 * pad_w - kernel_extent_w) / stride_w + 1;

  const int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w *
                          deformable_group * parallel_imgs;

  // Here the per-group channel count refers to column channels, i.e. it
  // includes the ksize_h * ksize_w factor.
  const int channel_per_deformable_group =
      channels * ksize_h * ksize_w / deformable_group;

  // Number of offset channels per image: an (h, w) pair per tap and group.
  const int offset_channels = 2 * ksize_h * ksize_w * deformable_group;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_col.scalar_type(), "deformable_col2im_coord_cpu", ([&] {
        deformable_col2im_coord_cpu_kernel<scalar_t>(
            num_kernels, data_col.data_ptr<scalar_t>(),
            data_im.data_ptr<scalar_t>(), data_offset.data_ptr<scalar_t>(),
            channels, height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,
            stride_w, dilation_h, dilation_w, channel_per_deformable_group,
            parallel_imgs, offset_channels, deformable_group, height_col,
            width_col, grad_offset.data_ptr<scalar_t>());
      }));
}

// Declaration of the deformable im2col entry point; its definition is not
// visible in this section. The parameter list matches deformable_im2col_cpu.
void deformable_im2col_impl(Tensor data_im, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor data_col);

// Declaration of the deformable col2im entry point; its definition is not
// visible in this section. The parameter list matches deformable_col2im_cpu.
void deformable_col2im_impl(Tensor data_col, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor grad_im);

// Declaration of the offset-gradient entry point; its definition is not
// visible in this section. The parameter list matches
// deformable_col2im_coord_cpu.
void deformable_col2im_coord_impl(
    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
    const int height, const int width, const int ksize_h, const int ksize_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int parallel_imgs,
    const int deformable_group, Tensor grad_offset);

// Associate the three CPU launchers with the declared entry points for the
// CPU device. NOTE(review): exact semantics of REGISTER_DEVICE_IMPL live in
// pytorch_device_registry.hpp - confirm there.
REGISTER_DEVICE_IMPL(deformable_im2col_impl, CPU, deformable_im2col_cpu);
REGISTER_DEVICE_IMPL(deformable_col2im_impl, CPU, deformable_col2im_cpu);
REGISTER_DEVICE_IMPL(deformable_col2im_coord_impl, CPU,
                     deformable_col2im_coord_cpu);


================================================
FILE: mmcv/ops/csrc/pytorch/cpu/modulated_deform_conv.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Bilinear sample of a single-channel map at the fractional position (h, w).
//
// `input` is indexed as input[row * data_width + col]. Of the four pixels
// surrounding (h, w), those outside [0, height-1] x [0, width-1] are treated
// as 0. This function does not reject out-of-range positions itself; the
// caller is expected to pass -1 < h < height and -1 < w < width.
//
// The cell index is computed with floor() rather than floorf(): floorf()
// first narrows a double coordinate to float, so a value just below an
// integer (e.g. 2.99999999) rounds up to that integer. That shifts the cell
// by one and, because only the +1 neighbours are checked against the upper
// bound, can read one row/column past the map. floor() matches the
// non-modulated deformable convolution helper and is exact for float inputs.
template <typename T>
T dmcn_im2col_bilinear_cpu(const T *input, const int data_width,
                           const int height, const int width, T h, T w) {
  int h_low = floor(h);
  int w_low = floor(w);
  int h_high = h_low + 1;
  int w_high = w_low + 1;

  // Fractional position inside the cell and its complements.
  T lh = h - h_low;
  T lw = w - w_low;
  T hh = 1 - lh, hw = 1 - lw;

  // Corner values; corners outside the map stay 0.
  T v1 = 0;
  if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];
  T v2 = 0;
  if (h_low >= 0 && w_high <= width - 1)
    v2 = input[h_low * data_width + w_high];
  T v3 = 0;
  if (h_high <= height - 1 && w_low >= 0)
    v3 = input[h_high * data_width + w_low];
  T v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1)
    v4 = input[h_high * data_width + w_high];

  T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;

  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  return val;
}

// Bilinear weight that the integer pixel (h, w) receives from a sample taken
// at the fractional position (argmax_h, argmax_w).
//
// Returns 0 when the sample position lies at or beyond -1 / height / width, or
// when (h, w) is not one of the four pixels of the cell that contains the
// sample position.
//
// The cell index is computed with floor() rather than floorf(): floorf()
// narrows a double coordinate to float first, so a value just below an integer
// rounds up to it and the weights get assigned to the wrong pixels. floor()
// matches the non-modulated deformable convolution helper and is exact for
// float inputs.
template <typename T>
T dmcn_get_gradient_weight_cpu(T argmax_h, T argmax_w, const int h, const int w,
                               const int height, const int width) {
  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
      argmax_w >= width) {
    // empty
    return 0;
  }

  int argmax_h_low = floor(argmax_h);
  int argmax_w_low = floor(argmax_w);
  int argmax_h_high = argmax_h_low + 1;
  int argmax_w_high = argmax_w_low + 1;

  // At most one of the four corner tests matches, because low and high differ
  // by exactly one in each direction.
  T weight = 0;
  if (h == argmax_h_low && w == argmax_w_low)
    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
  if (h == argmax_h_low && w == argmax_w_high)
    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
  if (h == argmax_h_high && w == argmax_w_low)
    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
  if (h == argmax_h_high && w == argmax_w_high)
    weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
  return weight;
}

// Derivative of the bilinear sample of `im_data` at (argmax_h, argmax_w) with
// respect to one coordinate of the sample position.
//
// `bp_dir` selects the coordinate: 0 differentiates along h, 1 along w; any
// other value yields 0. `im_data` is indexed as im_data[row * data_width +
// col]. Corner pixels outside [0, height-1] x [0, width-1] are skipped, and a
// sample position at or beyond -1 / height / width yields 0.
//
// The cell index is computed with floor() rather than floorf(): floorf()
// narrows a double coordinate to float first, so a value just below an integer
// rounds up to it. The low row/column is only checked against 0, so that
// rounding could read one row/column past the map. floor() matches the
// non-modulated deformable convolution helper and is exact for float inputs.
template <typename T>
T dmcn_get_coordinate_weight_cpu(T argmax_h, T argmax_w, const int height,
                                 const int width, const T *im_data,
                                 const int data_width, const int bp_dir) {
  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
      argmax_w >= width) {
    // empty
    return 0;
  }

  int argmax_h_low = floor(argmax_h);
  int argmax_w_low = floor(argmax_w);
  int argmax_h_high = argmax_h_low + 1;
  int argmax_w_high = argmax_w_low + 1;

  T weight = 0;

  if (bp_dir == 0) {
    // d/dh: low-row corners enter negatively, high-row corners positively.
    if (argmax_h_low >= 0 && argmax_w_low >= 0)
      weight += -1 * (argmax_w_low + 1 - argmax_w) *
                im_data[argmax_h_low * data_width + argmax_w_low];
    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
      weight += -1 * (argmax_w - argmax_w_low) *
                im_data[argmax_h_low * data_width + argmax_w_high];
    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
      weight += (argmax_w_low + 1 - argmax_w) *
                im_data[argmax_h_high * data_width + argmax_w_low];
    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
      weight += (argmax_w - argmax_w_low) *
                im_data[argmax_h_high * data_width + argmax_w_high];
  } else if (bp_dir == 1) {
    // d/dw: low-column corners enter negatively, high-column positively.
    if (argmax_h_low >= 0 && argmax_w_low >= 0)
      weight += -1 * (argmax_h_low + 1 - argmax_h) *
                im_data[argmax_h_low * data_width + argmax_w_low];
    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
      weight += (argmax_h_low + 1 - argmax_h) *
                im_data[argmax_h_low * data_width + argmax_w_high];
    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
      weight += -1 * (argmax_h - argmax_h_low) *
                im_data[argmax_h_high * data_width + argmax_w_low];
    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
      weight += (argmax_h - argmax_h_low) *
                im_data[argmax_h_high * data_width + argmax_w_high];
  }

  return weight;
}

// Modulated deformable im2col on the CPU.
//
// Each of the `n` work items is decoded from its flat index into an input
// channel, a batch entry and an output position (h_col, w_col). For every
// kernel tap (i, j) the item reads an (offset_h, offset_w) pair from
// `data_offset` and a modulation value from `data_mask`, samples `data_im`
// bilinearly at the deformed position (0 outside the open range
// (-1, height) x (-1, width)) and stores sample * mask in `data_col`.
template <typename T>
void modulated_deformable_im2col_cpu_kernel(
    const int n, const T *data_im, const T *data_offset, const T *data_mask,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int num_channels, const int deformable_group, const int height_col,
    const int width_col, T *data_col) {
  // Distance in data_col between two consecutive kernel taps of one item.
  const int tap_stride = batch_size * height_col * width_col;

  for (int index = 0; index < n; ++index) {
    // Unpack the flat index; w_col varies fastest, the channel slowest.
    const int w_col = index % width_col;
    const int row_index = index / width_col;
    const int h_col = row_index % height_col;
    const int plane_index = row_index / height_col;
    const int b_col = plane_index % batch_size;
    const int c_im = plane_index / batch_size;

    // First column channel written for this input channel.
    const int c_col = c_im * kernel_h * kernel_w;
    // Offset/mask group this input channel belongs to.
    const int group = c_im / channel_per_deformable_group;

    // Top-left corner of the undeformed receptive field.
    const int h_in = h_col * stride_h - pad_h;
    const int w_in = w_col * stride_w - pad_w;

    T *col_out =
        data_col +
        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
    const T *image = data_im + (b_col * num_channels + c_im) * height * width;
    const T *offsets =
        data_offset + (b_col * deformable_group + group) * 2 * kernel_h *
                          kernel_w * height_col * width_col;
    const T *masks = data_mask + (b_col * deformable_group + group) *
                                     kernel_h * kernel_w * height_col *
                                     width_col;

    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        // Each tap owns two offset planes (h, then w) and one mask plane.
        const int tap = i * kernel_w + j;
        const int off_h_idx =
            ((2 * tap) * height_col + h_col) * width_col + w_col;
        const int off_w_idx =
            ((2 * tap + 1) * height_col + h_col) * width_col + w_col;
        const int mask_idx = (tap * height_col + h_col) * width_col + w_col;

        const T offset_h = offsets[off_h_idx];
        const T offset_w = offsets[off_w_idx];
        const T mask = masks[mask_idx];

        const T h_im = h_in + i * dilation_h + offset_h;
        const T w_im = w_in + j * dilation_w + offset_w;

        T sample = static_cast<T>(0);
        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
          sample =
              dmcn_im2col_bilinear_cpu(image, width, height, width, h_im, w_im);
        }

        *col_out = sample * mask;
        col_out += tap_stride;
      }
    }
  }
}

// Scatters mask-weighted column gradients back onto the input image gradient.
//
// Each of the `n` work items is one entry of `data_col`, decoded from its flat
// index into (channel, kernel row i, kernel column j, batch, output row,
// output column). The deformed sampling position of that entry is rebuilt from
// `data_offset`; the entry's gradient, multiplied by its `data_mask` value, is
// accumulated into every pixel of `grad_im` that lies less than one pixel away
// from that position in both directions, weighted by
// dmcn_get_gradient_weight_cpu.
template <typename T>
void modulated_deformable_col2im_cpu_kernel(
    const int n, const T *data_col, const T *data_offset, const T *data_mask,
    const int channels, const int height, const int width, const int kernel_h,
    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int deformable_group, const int height_col, const int width_col,
    T *grad_im) {
  for (int index = 0; index < n; ++index) {
    // Peel the flat index apart, fastest-varying component first.
    const int w_out = index % width_col;
    const int q_row = index / width_col;
    const int h_out = q_row % height_col;
    const int q_batch = q_row / height_col;
    const int b = q_batch % batch_size;
    const int q_tap = q_batch / batch_size;
    const int j = q_tap % kernel_w;
    const int q_krow = q_tap / kernel_w;
    const int i = q_krow % kernel_h;
    const int c = q_krow / kernel_h;

    const int group = c / channel_per_deformable_group;

    const int w_in = w_out * stride_w - pad_w;
    const int h_in = h_out * stride_h - pad_h;

    // Offsets of this tap live in planes 2*tap (h) and 2*tap + 1 (w); the
    // mask of this tap lives in plane `tap`.
    const T *offsets =
        data_offset + (b * deformable_group + group) * 2 * kernel_h *
                          kernel_w * height_col * width_col;
    const T *masks = data_mask + (b * deformable_group + group) * kernel_h *
                                     kernel_w * height_col * width_col;
    const int tap = i * kernel_w + j;
    const int off_h_idx = ((2 * tap) * height_col + h_out) * width_col + w_out;
    const int off_w_idx =
        ((2 * tap + 1) * height_col + h_out) * width_col + w_out;
    const int mask_idx = (tap * height_col + h_out) * width_col + w_out;
    const T offset_h = offsets[off_h_idx];
    const T offset_w = offsets[off_w_idx];
    const T mask = masks[mask_idx];

    // Deformed sampling position used by the forward pass for this entry.
    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;

    const T cur_top_grad = data_col[index] * mask;
    const int cur_h = (int)cur_inv_h_data;
    const int cur_w = (int)cur_inv_w_data;

    // Visit the 5x5 integer neighbourhood of the truncated position and keep
    // only in-bounds pixels closer than one pixel to the sample point.
    for (int dy = -2; dy <= 2; ++dy) {
      const int y = cur_h + dy;
      for (int dx = -2; dx <= 2; ++dx) {
        const int x = cur_w + dx;
        if (y < 0 || y >= height || x < 0 || x >= width) continue;
        if (!(abs(cur_inv_h_data - y) < 1)) continue;
        if (!(abs(cur_inv_w_data - x) < 1)) continue;

        const int grad_pos = ((b * channels + c) * height + y) * width + x;
        const T weight = dmcn_get_gradient_weight_cpu(
            cur_inv_h_data, cur_inv_w_data, y, x, height, width);
        grad_im[grad_pos] += weight * cur_top_grad;
      }
    }
  }
}

// Sequential CPU loop that fills `grad_offset` (and, for even offset
// channels, `grad_mask`) for a modulated deformable convolution.
//
// Each of the `n` iterations handles one element of `grad_offset`. The flat
// `index` is decomposed as (b, c, h, w) using `offset_channels`, `height_col`
// and `width_col`; `c` selects a deformable group, a kernel tap and a
// direction (`offset_c % 2`, forwarded as `bp_dir`).
//
// Inputs read: `data_col` (column-layout gradient), `data_im` (input image),
// `data_offset` and `data_mask`. Outputs written: `grad_offset[index]` and,
// when `offset_c` is even, one element of `grad_mask`.
// NOTE(review): `channel_per_deformable_group` is used here as a count of
// column channels (the loop strides over it by kernel_h * kernel_w) —
// confirm against the caller.
template <typename T>
void modulated_deformable_col2im_coord_cpu_kernel(
    const int n, const T *data_col, const T *data_im, const T *data_offset,
    const T *data_mask, const int channels, const int height, const int width,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int channel_per_deformable_group,
    const int batch_size, const int offset_channels, const int deformable_group,
    const int height_col, const int width_col, T *grad_offset, T *grad_mask) {
  for (int index = 0; index < n; index++) {
    // val accumulates the offset gradient, mval the mask gradient.
    T val = 0, mval = 0;
    // Decompose the flat index into (b, c, h, w).
    int w = index % width_col;
    int h = (index / width_col) % height_col;
    int c = (index / width_col / height_col) % offset_channels;
    int b = (index / width_col / height_col) / offset_channels;

    // Each deformable group owns 2 * kernel_h * kernel_w offset channels.
    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
    // Stride between column channels that share the same kernel tap.
    const int col_step = kernel_h * kernel_w;
    // Counts loop iterations; used to step through image planes below.
    int cnt = 0;
    // Base pointers for the current deformable group (and batch item, for the
    // image/offset/mask tensors).
    const T *data_col_ptr = data_col + deformable_group_index *
                                           channel_per_deformable_group *
                                           batch_size * width_col * height_col;
    const T *data_im_ptr =
        data_im + (b * deformable_group + deformable_group_index) *
                      channel_per_deformable_group / kernel_h / kernel_w *
                      height * width;
    const T *data_offset_ptr =
        data_offset + (b * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;
    const T *data_mask_ptr =
        data_mask + (b * deformable_group + deformable_group_index) * kernel_h *
                        kernel_w * height_col * width_col;

    // Offset channel index relative to the start of its deformable group.
    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;

    // Visit every column channel of this group that uses kernel tap
    // offset_c / 2 (one per step of col_step).
    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
         col_c += col_step) {
      const int col_pos =
          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
      // Even offset channels and odd offset channels select different
      // directions in dmcn_get_coordinate_weight_cpu.
      const int bp_dir = offset_c % 2;

      // Recover the kernel tap (i, j) and output position from col_pos.
      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
      int i =
          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
      int w_out = col_pos % width_col;
      int h_out = (col_pos / width_col) % height_col;
      // Top-left input coordinate of the receptive field (may be negative
      // because of padding).
      int w_in = w_out * stride_w - pad_w;
      int h_in = h_out * stride_h - pad_h;
      const int data_offset_h_ptr =
          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
      const int data_offset_w_ptr =
          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
           w_out);
      const int data_mask_hw_ptr =
          (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
      const T offset_h = data_offset_ptr[data_offset_h_ptr];
      const T offset_w = data_offset_ptr[data_offset_w_ptr];
      const T mask = data_mask_ptr[data_mask_hw_ptr];
      // Fractional sampling location in the input image.
      T inv_h = h_in + i * dilation_h + offset_h;
      T inv_w = w_in + j * dilation_w + offset_w;
      // Samples outside the image are redirected to (-2, -2) and contribute
      // nothing to the mask gradient; in-range samples add the bilinearly
      // interpolated image value weighted by the column gradient.
      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
        inv_h = inv_w = -2;
      else
        mval += data_col_ptr[col_pos] *
                dmcn_im2col_bilinear_cpu(data_im_ptr + cnt * height * width,
                                         width, height, width, inv_h, inv_w);
      const T weight = dmcn_get_coordinate_weight_cpu(
          inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
          width, bp_dir);
      val += weight * data_col_ptr[col_pos] * mask;
      cnt += 1;
    }
    // KERNEL_ASSIGN(grad_offset[index], offset_req, val);
    grad_offset[index] = val;
    // Only even offset channels write grad_mask, so each mask element
    // (one per kernel tap) is written once per pair of offset channels.
    if (offset_c % 2 == 0)
      // KERNEL_ASSIGN(grad_mask[(((b * deformable_group +
      // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) *
      // height_col + h) * width_col + w], mask_req, mval);
      grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
                      kernel_w +
                  offset_c / 2) *
                     height_col +
                 h) *
                    width_col +
                w] = mval;
  }
}

void modulated_deformable_im2col_cpu(
    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor data_col) {
  // num_axes should be smaller than block size
  const int channel_per_deformable_group = channels / deformable_group;
  const int num_kernels = channels * batch_size * height_col * width_col;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_im.scalar_type(), "modulated_deformable_im2col_cpu", ([&] {
        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
        scalar_t *data_col_ = data_col.data_ptr<scalar_t>();

        modulated_deformable_im2col_cpu_kernel(
            num_kernels, data_im_, data_offset_, data_mask_, height_im,
            width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
            dilation_h, dilation_w, channel_per_deformable_group, batch_size,
            channels, deformable_group, height_col, width_col, data_col_);
      }));
}

void modulated_deformable_col2im_cpu(
    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor grad_im) {
  const int channel_per_deformable_group = channels / deformable_group;
  const int num_kernels =
      channels * kernel_h * kernel_w * batch_size * height_col * width_col;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_col.scalar_type(), "modulated_deformable_col2im_cpu", ([&] {
        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
        scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();

        modulated_deformable_col2im_cpu_kernel(
            num_kernels, data_col_, data_offset_, data_mask_, channels,
            height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h,
            stride_w, dilation_h, dilation_w, channel_per_deformable_group,
            batch_size, deformable_group, height_col, width_col, grad_im_);
      }));
}

void modulated_deformable_col2im_coord_cpu(
    const Tensor data_col, const Tensor data_im, const Tensor data_offset,
    const Tensor data_mask, const int batch_size, const int channels,
    const int height_im, const int width_im, const int height_col,
    const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int deformable_group,
    Tensor grad_offset, Tensor grad_mask) {
  const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h *
                          kernel_w * deformable_group;
  const int channel_per_deformable_group =
      channels * kernel_h * kernel_w / deformable_group;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_col.scalar_type(), "modulated_deformable_col2im_coord_cpu", ([&] {
        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
        scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
        scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>();

        modulated_deformable_col2im_coord_cpu_kernel(
            num_kernels, data_col_, data_im_, data_offset_, data_mask_,
            channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w,
            stride_h, stride_w, dilation_h, dilation_w,
            channel_per_deformable_group, batch_size,
            2 * kernel_h * kernel_w * deformable_group, deformable_group,
            height_col, width_col, grad_offset_, grad_mask_);
      }));
}

// Forward declarations of the device-generic `*_impl` entry points (their
// definitions are not in this file), followed by the REGISTER_DEVICE_IMPL
// lines that associate each of them with the CPU implementation defined
// above. The signatures mirror the corresponding `*_cpu` functions.
void modulated_deformable_im2col_impl(
    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor data_col);

void modulated_deformable_col2im_impl(
    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor grad_im);

void modulated_deformable_col2im_coord_impl(
    const Tensor data_col, const Tensor data_im, const Tensor data_offset,
    const Tensor data_mask, const int batch_size, const int channels,
    const int height_im, const int width_im, const int height_col,
    const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int deformable_group,
    Tensor grad_offset, Tensor grad_mask);

// CPU registrations for the three entry points declared above.
REGISTER_DEVICE_IMPL(modulated_deformable_im2col_impl, CPU,
                     modulated_deformable_im2col_cpu);
REGISTER_DEVICE_IMPL(modulated_deformable_col2im_impl, CPU,
                     modulated_deformable_col2im_cpu);
REGISTER_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, CPU,
                     modulated_deformable_col2im_coord_cpu);


================================================
FILE: mmcv/ops/csrc/pytorch/cpu/nms.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Greedy non-maximum suppression on CPU.
//
// Boxes are visited in descending score order; every later box whose IoU with
// a surviving box is strictly greater than `iou_threshold` is dropped.
// `offset` is added to every width/height (box areas and intersections).
// Coordinates are read as float. Returns the int64 indices of the surviving
// boxes, in descending score order; an empty `boxes` yields an empty tensor.
Tensor nms_cpu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  if (boxes.numel() == 0) {
    return at::empty({0}, boxes.options().dtype(at::kLong));
  }

  // Split the coordinate columns into contiguous buffers for pointer access.
  auto left_t = boxes.select(1, 0).contiguous();
  auto top_t = boxes.select(1, 1).contiguous();
  auto right_t = boxes.select(1, 2).contiguous();
  auto bottom_t = boxes.select(1, 3).contiguous();

  Tensor area_t = (right_t - left_t + offset) * (bottom_t - top_t + offset);

  // Box indices sorted by descending score.
  auto rank_t = std::get<1>(scores.sort(0, /* descending=*/true));

  auto nboxes = boxes.size(0);
  // alive[r] refers to the box at rank r (sorted position), not to box index.
  Tensor alive_t = at::ones({nboxes}, boxes.options().dtype(at::kBool));

  auto alive = alive_t.data_ptr<bool>();
  auto rank = rank_t.data_ptr<int64_t>();
  auto left = left_t.data_ptr<float>();
  auto top = top_t.data_ptr<float>();
  auto right = right_t.data_ptr<float>();
  auto bottom = bottom_t.data_ptr<float>();
  auto area = area_t.data_ptr<float>();

  for (int64_t ri = 0; ri < nboxes; ri++) {
    if (!alive[ri]) continue;
    const auto i = rank[ri];
    const auto i_left = left[i];
    const auto i_top = top[i];
    const auto i_right = right[i];
    const auto i_bottom = bottom[i];
    const auto i_area = area[i];

    // Only lower-ranked boxes can be suppressed by box i.
    for (int64_t rj = ri + 1; rj < nboxes; rj++) {
      if (!alive[rj]) continue;
      const auto j = rank[rj];

      // Intersection rectangle, clamped to zero extent when disjoint.
      auto inter_left = std::max(i_left, left[j]);
      auto inter_top = std::max(i_top, top[j]);
      auto inter_right = std::min(i_right, right[j]);
      auto inter_bottom = std::min(i_bottom, bottom[j]);
      auto inter_w = std::max(0.f, inter_right - inter_left + offset);
      auto inter_h = std::max(0.f, inter_bottom - inter_top + offset);
      auto inter = inter_w * inter_h;

      auto iou = inter / (i_area + area[j] - inter);
      if (iou > iou_threshold) {
        alive[rj] = false;
      }
    }
  }
  return rank_t.masked_select(alive_t);
}

// Forward declaration of the device-generic NMS entry point (defined outside
// this file) and registration of nms_cpu as its CPU implementation.
Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset);
REGISTER_DEVICE_IMPL(nms_impl, CPU, nms_cpu);

// Soft-NMS on CPU.
//
// Repeatedly selects the highest-scoring remaining box, records it in `dets`
// as a row [x1, y1, x2, y2, score], and decays the scores of the remaining
// boxes according to their IoU with it. Boxes whose score drops below
// `min_score` are removed from the working set.
//
// method == 0: score is zeroed when IoU >= iou_threshold.
// method == 1: score is scaled by (1 - IoU) when IoU >= iou_threshold.
// method == 2: score is scaled by exp(-IoU^2 / sigma).
// Any other `method` leaves scores unchanged.
//
// `offset` is added to every width/height. Coordinates, scores and `dets` are
// accessed as float. `scores` itself is not modified (a clone is used).
// Returns the original indices (int64) of the kept boxes, in selection order;
// an empty `boxes` yields an empty tensor.
// NOTE(review): `dets` must have room for 5 floats per kept box — confirm the
// caller allocates it as (N, 5).
Tensor softnms_cpu(Tensor boxes, Tensor scores, Tensor dets,
                   float iou_threshold, float sigma, float min_score,
                   int method, int offset) {
  if (boxes.numel() == 0) {
    return at::empty({0}, boxes.options().dtype(at::kLong));
  }

  // Working copies of the coordinate columns and scores; they are permuted
  // in place below.
  auto x1_t = boxes.select(1, 0).contiguous();
  auto y1_t = boxes.select(1, 1).contiguous();
  auto x2_t = boxes.select(1, 2).contiguous();
  auto y2_t = boxes.select(1, 3).contiguous();
  auto scores_t = scores.clone();

  Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);

  // nboxes is the size of the active working set and shrinks as boxes are
  // discarded.
  auto nboxes = boxes.size(0);
  auto x1 = x1_t.data_ptr<float>();
  auto y1 = y1_t.data_ptr<float>();
  auto x2 = x2_t.data_ptr<float>();
  auto y2 = y2_t.data_ptr<float>();
  auto sc = scores_t.data_ptr<float>();
  auto areas = areas_t.data_ptr<float>();
  auto de = dets.data_ptr<float>();

  int64_t pos = 0;
  // inds[k] tracks the original index of the box currently stored in slot k.
  Tensor inds_t = at::arange(nboxes, boxes.options().dtype(at::kLong));
  auto inds = inds_t.data_ptr<int64_t>();

  for (int64_t i = 0; i < nboxes; i++) {
    auto max_score = sc[i];
    auto max_pos = i;

    pos = i + 1;
    // Find the highest-scoring box among slots [i, nboxes).
    while (pos < nboxes) {
      if (max_score < sc[pos]) {
        max_score = sc[pos];
        max_pos = pos;
      }
      pos = pos + 1;
    }
    // Write the selected box to dets row i and swap it into slot i (the
    // previous occupant of slot i moves to max_pos).
    auto ix1 = de[i * 5 + 0] = x1[max_pos];
    auto iy1 = de[i * 5 + 1] = y1[max_pos];
    auto ix2 = de[i * 5 + 2] = x2[max_pos];
    auto iy2 = de[i * 5 + 3] = y2[max_pos];
    auto iscore = de[i * 5 + 4] = sc[max_pos];
    auto iarea = areas[max_pos];
    auto iind = inds[max_pos];
    x1[max_pos] = x1[i];
    y1[max_pos] = y1[i];
    x2[max_pos] = x2[i];
    y2[max_pos] = y2[i];
    sc[max_pos] = sc[i];
    areas[max_pos] = areas[i];
    inds[max_pos] = inds[i];
    x1[i] = ix1;
    y1[i] = iy1;
    x2[i] = ix2;
    y2[i] = iy2;
    sc[i] = iscore;
    areas[i] = iarea;
    inds[i] = iind;

    // Decay the scores of all remaining boxes against the selected one.
    pos = i + 1;
    while (pos < nboxes) {
      auto xx1 = std::max(ix1, x1[pos]);
      auto yy1 = std::max(iy1, y1[pos]);
      auto xx2 = std::min(ix2, x2[pos]);
      auto yy2 = std::min(iy2, y2[pos]);

      // Intersection extent, clamped to zero when the boxes are disjoint.
      auto w = std::max(0.f, xx2 - xx1 + offset);
      auto h = std::max(0.f, yy2 - yy1 + offset);
      auto inter = w * h;
      auto ovr = inter / (iarea + areas[pos] - inter);

      float weight = 1.;
      if (method == 0) {
        if (ovr >= iou_threshold) weight = 0;
      } else if (method == 1) {
        if (ovr >= iou_threshold) weight = 1 - ovr;
      } else if (method == 2) {
        weight = std::exp(-(ovr * ovr) / sigma);
      }
      sc[pos] *= weight;
      // If the score falls below min_score, drop the box: overwrite its slot
      // with the last active box and shrink the active range. pos is
      // decremented so the box just moved into this slot is examined next.
      if (sc[pos] < min_score) {
        x1[pos] = x1[nboxes - 1];
        y1[pos] = y1[nboxes - 1];
        x2[pos] = x2[nboxes - 1];
        y2[pos] = y2[nboxes - 1];
        sc[pos] = sc[nboxes - 1];
        areas[pos] = areas[nboxes - 1];
        inds[pos] = inds[nboxes - 1];
        nboxes = nboxes - 1;
        pos = pos - 1;
      }
      pos = pos + 1;
    }
  }
  // The first nboxes slots hold the kept boxes in selection order.
  return inds_t.slice(0, 0, nboxes);
}

// Forward declaration of the device-generic Soft-NMS entry point (defined
// outside this file) and registration of softnms_cpu as its CPU
// implementation.
Tensor softnms_impl(Tensor boxes, Tensor scores, Tensor dets,
                    float iou_threshold, float sigma, float min_score,
                    int method, int offset);
REGISTER_DEVICE_IMPL(softnms_impl, CPU, softnms_cpu);

// NMS variant that reports which boxes each kept box suppressed.
//
// `dets` rows are read as [x1, y1, x2, y2, score] (float). Boxes are visited
// in descending score order; a later, not-yet-suppressed box is assigned to
// the current box when their IoU is >= `iou_threshold`. No offset is added to
// widths/heights.
//
// Returns one group per kept box: element 0 is the kept box index, followed
// by the indices of the boxes it suppressed, in score order.
std::vector<std::vector<int> > nms_match_cpu(Tensor dets, float iou_threshold) {
  auto left_t = dets.select(1, 0).contiguous();
  auto top_t = dets.select(1, 1).contiguous();
  auto right_t = dets.select(1, 2).contiguous();
  auto bottom_t = dets.select(1, 3).contiguous();
  auto scores = dets.select(1, 4).contiguous();

  at::Tensor area_t = (right_t - left_t) * (bottom_t - top_t);

  // Box indices sorted by descending score.
  auto rank_t = std::get<1>(scores.sort(0, /* descending=*/true));

  auto ndets = dets.size(0);
  // removed[k] is indexed by original box index.
  at::Tensor removed_t =
      at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));

  auto removed = removed_t.data_ptr<uint8_t>();
  auto rank = rank_t.data_ptr<int64_t>();
  auto left = left_t.data_ptr<float>();
  auto top = top_t.data_ptr<float>();
  auto right = right_t.data_ptr<float>();
  auto bottom = bottom_t.data_ptr<float>();
  auto area = area_t.data_ptr<float>();

  std::vector<std::vector<int> > groups;

  for (int64_t ri = 0; ri < ndets; ri++) {
    auto i = rank[ri];
    if (removed[i] == 1) continue;

    // Each group starts with the kept box itself.
    std::vector<int> group;
    group.push_back(i);

    auto i_left = left[i];
    auto i_top = top[i];
    auto i_right = right[i];
    auto i_bottom = bottom[i];
    auto i_area = area[i];

    for (int64_t rj = ri + 1; rj < ndets; rj++) {
      auto j = rank[rj];
      if (removed[j] == 1) continue;

      // Intersection rectangle, clamped to zero extent when disjoint.
      auto inter_left = std::max(i_left, left[j]);
      auto inter_top = std::max(i_top, top[j]);
      auto inter_right = std::min(i_right, right[j]);
      auto inter_bottom = std::min(i_bottom, bottom[j]);
      auto inter_w = std::max(static_cast<float>(0), inter_right - inter_left);
      auto inter_h = std::max(static_cast<float>(0), inter_bottom - inter_top);
      auto inter = inter_w * inter_h;

      auto iou = inter / (i_area + area[j] - inter);
      if (iou >= iou_threshold) {
        removed[j] = 1;
        group.push_back(j);
      }
    }
    groups.push_back(group);
  }
  return groups;
}

// Forward declaration of the device-generic nms_match entry point (defined
// outside this file) and registration of nms_match_cpu as its CPU
// implementation.
std::vector<std::vector<int> > nms_match_impl(Tensor dets, float iou_threshold);
REGISTER_DEVICE_IMPL(nms_match_impl, CPU, nms_match_cpu);


================================================
FILE: mmcv/ops/csrc/pytorch/cpu/nms_quadri.cpp
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "box_iou_rotated_utils.hpp"
#include "pytorch_cpp_helper.hpp"

// Greedy NMS for quadrilateral boxes on CPU.
//
// Adapted from torchvision's nms_cpu_kernel; the IoU of two quadrilaterals is
// delegated to single_box_iou_quadri (box_iou_rotated_utils.hpp). Boxes are
// visited in descending score order and a later box is suppressed when its
// IoU with a kept box is >= `iou_threshold`.
//
// `dets` and `scores` must be CPU tensors of the same dtype. Returns the
// int64 indices of the kept boxes in descending score order; an empty `dets`
// yields an empty tensor.
template <typename scalar_t>
Tensor nms_quadri_cpu_kernel(const Tensor dets, const Tensor scores,
                             const float iou_threshold) {
  AT_ASSERTM(!dets.is_cuda(), "dets must be a CPU tensor");
  AT_ASSERTM(!scores.is_cuda(), "scores must be a CPU tensor");
  AT_ASSERTM(dets.scalar_type() == scores.scalar_type(),
             "dets should have the same type as scores");

  if (dets.numel() == 0) {
    return at::empty({0}, dets.options().dtype(at::kLong));
  }

  // Box indices sorted by descending score.
  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));

  auto ndets = dets.size(0);
  Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
  Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));

  auto suppressed = suppressed_t.data_ptr<uint8_t>();
  auto keep = keep_t.data_ptr<int64_t>();
  auto order = order_t.data_ptr<int64_t>();

  int64_t num_to_keep = 0;

  for (int64_t _i = 0; _i < ndets; _i++) {
    auto i = order[_i];
    if (suppressed[i] == 1) {
      continue;
    }

    keep[num_to_keep++] = i;

    // Box i is fixed for the whole inner scan, so fetch its row pointer once
    // instead of building a temporary row Tensor on every comparison.
    const Tensor box_i_row = dets[i];
    const scalar_t* box_i = box_i_row.data_ptr<scalar_t>();

    for (int64_t _j = _i + 1; _j < ndets; _j++) {
      auto j = order[_j];
      if (suppressed[j] == 1) {
        continue;
      }

      auto ovr = single_box_iou_quadri<scalar_t>(
          box_i, dets[j].data_ptr<scalar_t>(), 0);
      if (ovr >= iou_threshold) {
        suppressed[j] = 1;
      }
    }
  }
  // Only the first num_to_keep slots of keep_t were filled.
  return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
}

// CPU entry point for quadrilateral NMS: dispatches on the floating dtype of
// `dets` and returns whatever nms_quadri_cpu_kernel produces for that dtype.
Tensor nms_quadri_cpu(const Tensor dets, const Tensor scores,
                      const float iou_threshold) {
  // Placeholder; overwritten inside the dispatch lambda.
  auto kept_indices = at::empty({0}, dets.options());
  AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_quadri", [&] {
    kept_indices =
        nms_quadri_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
  });
  return kept_indices;
}


================================================
FILE: mmcv/ops/csrc/pytorch/cpu/nms_rotated.cpp
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp
#include "box_iou_rotated_utils.hpp"
#include "pytorch_cpp_helper.hpp"

// Greedy NMS for rotated boxes on CPU.
//
// Adapted from torchvision's nms_cpu_kernel; the IoU of two rotated boxes is
// delegated to single_box_iou_rotated (box_iou_rotated_utils.hpp). Boxes are
// visited in descending score order and a later box is suppressed when its
// IoU with a kept box is >= `iou_threshold`.
//
// `dets` and `scores` must be CPU tensors of the same dtype. Returns the
// int64 indices of the kept boxes in descending score order; an empty `dets`
// yields an empty tensor.
template <typename scalar_t>
Tensor nms_rotated_cpu_kernel(const Tensor dets, const Tensor scores,
                              const float iou_threshold) {
  AT_ASSERTM(!dets.is_cuda(), "dets must be a CPU tensor");
  AT_ASSERTM(!scores.is_cuda(), "scores must be a CPU tensor");
  AT_ASSERTM(dets.scalar_type() == scores.scalar_type(),
             "dets should have the same type as scores");

  if (dets.numel() == 0) {
    return at::empty({0}, dets.options().dtype(at::kLong));
  }

  // Box indices sorted by descending score.
  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));

  auto ndets = dets.size(0);
  Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
  Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));

  auto suppressed = suppressed_t.data_ptr<uint8_t>();
  auto keep = keep_t.data_ptr<int64_t>();
  auto order = order_t.data_ptr<int64_t>();

  int64_t num_to_keep = 0;

  for (int64_t _i = 0; _i < ndets; _i++) {
    auto i = order[_i];
    if (suppressed[i] == 1) {
      continue;
    }

    keep[num_to_keep++] = i;

    // Box i is fixed for the whole inner scan, so fetch its row pointer once
    // instead of building a temporary row Tensor on every comparison.
    const Tensor box_i_row = dets[i];
    const scalar_t* box_i = box_i_row.data_ptr<scalar_t>();

    for (int64_t _j = _i + 1; _j < ndets; _j++) {
      auto j = order[_j];
      if (suppressed[j] == 1) {
        continue;
      }

      auto ovr = single_box_iou_rotated<scalar_t>(
          box_i, dets[j].data_ptr<scalar_t>(), 0);
      if (ovr >= iou_threshold) {
        suppressed[j] = 1;
      }
    }
  }
  // Only the first num_to_keep slots of keep_t were filled.
  return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
}

// CPU entry point for rotated-box NMS: dispatches on the floating dtype of
// `dets` and returns whatever nms_rotated_cpu_kernel produces for that dtype.
Tensor nms_rotated_cpu(const Tensor dets, const Tensor scores,
                       const float iou_threshold) {
  // Placeholder; overwritten inside the dispatch lambda.
  auto kept_indices = at::empty({0}, dets.options());
  AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_rotated", [&] {
    kept_indices =
        nms_rotated_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
  });
  return kept_indices;
}


================================================
FILE: mmcv/ops/csrc/pytorch/cpu/pixel_group.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
// It is modified from https://github.com/WenmuZhou/PAN.pytorch

#include <queue>

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Collects, for every label in [0, label_num), the mean score and the pixel
// coordinates of that label in a row-major height x width label map.
//
// Returns one vector per label laid out as
//   [mean_score, pixel_count, x0, y0, x1, y1, ...]
// with pixels listed in row-major scan order. Pixels with label <= 0 are
// ignored, so entry 0 and any label with no pixels stay [0, 0].
// Labels in `label` must be smaller than `label_num`.
std::vector<std::vector<float>> estimate_confidence(int32_t* label,
                                                    float* score, int label_num,
                                                    int height, int width) {
  std::vector<std::vector<float>> regions;
  for (int r = 0; r < label_num; r++) {
    // Slots 0 and 1 hold the running score sum and the pixel count.
    regions.push_back(std::vector<float>(2, 0));
  }

  for (int y = 0; y < height; y++) {
    const int row_base = y * width;
    for (int x = 0; x < width; x++) {
      const auto region_id = label[row_base + x];
      if (region_id <= 0) continue;

      std::vector<float>& entry = regions[region_id];
      entry.push_back(x);
      entry.push_back(y);
      entry[0] += score[row_base + x];
      entry[1] += 1;
    }
  }

  // Turn the score sums into means for labels that own at least one pixel.
  for (auto& entry : regions) {
    if (entry[1] > 0) {
      entry[0] /= entry[1];
    }
  }
  return regions;
}
// Groups text pixels around kernel regions by embedding distance.
//
// Layout used by the indexing below: `score`, `mask`, `kernel_label` and
// `kernel_contour` are row-major H x W maps; `embedding` is H x W x C.
// Element types read: float (score, embedding), bool (mask), uint8
// (kernel_contour), int32 (kernel_label).
//
// Steps:
//   1. Average the embeddings of every kernel region (label > 0) and queue
//      the kernel pixels flagged in `kernel_contour`.
//   2. Breadth-first expansion over 4-neighbours: an unlabeled pixel inside
//      `mask` joins the label of the queued pixel when its squared embedding
//      distance to that kernel's mean is below dis_threshold^2.
//   3. Summarise the resulting label map with estimate_confidence.
//
// Returns, per label in [0, kernel_region_num), the vector produced by
// estimate_confidence. Labels in `kernel_label` must be smaller than
// `kernel_region_num`.
std::vector<std::vector<float>> pixel_group_cpu(
    Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,
    Tensor kernel_contour, int kernel_region_num, float dis_threshold) {
  assert(score.dim() == 2);
  assert(mask.dim() == 2);
  assert(embedding.dim() == 3);
  int height = score.size(0);
  int width = score.size(1);
  // Compare each extent separately: a chained `a == b == c` compares the bool
  // result of `a == b` with `c` instead of checking that all are equal.
  assert(mask.size(0) == height);
  assert(embedding.size(0) == height);
  assert(kernel_label.size(0) == height);
  assert(mask.size(1) == width);
  assert(embedding.size(1) == width);
  assert(kernel_label.size(1) == width);

  auto threshold_square = dis_threshold * dis_threshold;
  auto ptr_score = score.data_ptr<float>();
  auto ptr_mask = mask.data_ptr<bool>();
  auto ptr_kernel_contour = kernel_contour.data_ptr<uint8_t>();
  auto ptr_embedding = embedding.data_ptr<float>();
  auto ptr_kernel_label = kernel_label.data_ptr<int32_t>();
  // Queue entries are (row, column, label).
  std::queue<std::tuple<int, int, int32_t>> contour_pixels;
  auto embedding_dim = embedding.size(2);
  // Per region: embedding_dim running sums followed by the pixel count.
  std::vector<std::vector<float>> kernel_vector(
      kernel_region_num, std::vector<float>(embedding_dim + 1, 0));

  // Work on a copy so the caller's kernel_label is left untouched.
  Tensor text_label;
  text_label = kernel_label.clone();
  auto ptr_text_label = text_label.data_ptr<int32_t>();

  for (int i = 0; i < height; i++) {
    auto ptr_embedding_tmp = ptr_embedding + i * width * embedding_dim;
    auto ptr_kernel_label_tmp = ptr_kernel_label + i * width;
    auto ptr_kernel_contour_tmp = ptr_kernel_contour + i * width;

    for (int j = 0, k = 0; j < width && k < width * embedding_dim;
         j++, k += embedding_dim) {
      int32_t label = ptr_kernel_label_tmp[j];
      if (label > 0) {
        // kernel_vector only has kernel_region_num rows.
        assert(label < kernel_region_num);
        for (int d = 0; d < embedding_dim; d++)
          kernel_vector[label][d] += ptr_embedding_tmp[k + d];
        // kernel pixel number
        kernel_vector[label][embedding_dim] += 1;
        if (ptr_kernel_contour_tmp[j]) {
          contour_pixels.push(std::make_tuple(i, j, label));
        }
      }
    }
  }
  // Convert the embedding sums into means. Regions without pixels divide by
  // zero here, but their vectors are never read because no pixel of theirs
  // is ever queued.
  for (int i = 0; i < kernel_region_num; i++) {
    for (int j = 0; j < embedding_dim; j++) {
      kernel_vector[i][j] /= kernel_vector[i][embedding_dim];
    }
  }
  int dx[4] = {-1, 1, 0, 0};
  int dy[4] = {0, 0, -1, 1};
  while (!contour_pixels.empty()) {
    auto query_pixel = contour_pixels.front();
    contour_pixels.pop();
    int y = std::get<0>(query_pixel);
    int x = std::get<1>(query_pixel);
    int32_t l = std::get<2>(query_pixel);
    // kernel_vector is read-only from here on, so a reference avoids copying
    // the centroid for every dequeued pixel.
    const std::vector<float> &kernel_cv = kernel_vector[l];
    for (int idx = 0; idx < 4; idx++) {
      int tmpy = y + dy[idx];
      int tmpx = x + dx[idx];
      if (tmpy < 0 || tmpy >= height || tmpx < 0 || tmpx >= width) continue;
      // Form the row pointer only after the bounds check.
      auto ptr_text_label_tmp = ptr_text_label + tmpy * width;
      if (!ptr_mask[tmpy * width + tmpx] || ptr_text_label_tmp[tmpx] > 0)
        continue;

      float dis = 0;
      auto ptr_embedding_tmp = ptr_embedding + tmpy * width * embedding_dim;
      for (size_t i = 0; i < size_t(embedding_dim); i++) {
        dis +=
            pow(kernel_cv[i] - ptr_embedding_tmp[tmpx * embedding_dim + i], 2);
        // ignore further computing if dis is big enough
        if (dis >= threshold_square) break;
      }
      if (dis >= threshold_square) continue;
      contour_pixels.push(std::make_tuple(tmpy, tmpx, l));
      ptr_text_label_tmp[tmpx] = l;
    }
  }

  return estimate_confidence(ptr_text_label, ptr_score, kernel_region_num,
                             height, width);
}
// Forward declaration of the device-generic pixel_group entry point (defined
// outside this file) and registration of pixel_group_cpu as its CPU
// implementation.
std::vector<std::vector<float>> pixel_group_impl(
    Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,
    Tensor kernel_contour, int kernel_region_num, float dis_threshold);
REGISTER_DEVICE_IMPL(pixel_group_impl, CPU, pixel_group_cpu);


================================================
FILE: mmcv/ops/csrc/pytorch/cpu/points_in_boxes.cpp
================================================
#include "pytorch_cpp_helper.hpp"

// Rotates the 2-D displacement (shift_x, shift_y) by the angle -rz and stores
// the result in (local_x, local_y).
inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz,
                                      float &local_x, float &local_y) {
  const float angle = -rz;
  const float cos_a = cos(angle);
  const float sin_a = sin(angle);
  local_x = shift_x * cos_a + shift_y * (-sin_a);
  local_y = shift_x * sin_a + shift_y * cos_a;
}

// Tests whether a 3-D point lies strictly inside a rotated 3-D box.
//
// pt:    (x, y, z)
// box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinates, where
//        cz is the z of the bottom face centre.
//
// Returns 1 when the point is inside, 0 otherwise. `local_x` / `local_y`
// receive the point's coordinates in the box frame; they are left untouched
// when the point is rejected by the height test.
inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d,
                                 float &local_x, float &local_y) {
  const float px = pt[0], py = pt[1], pz = pt[2];
  const float cx = box3d[0], cy = box3d[1];
  const float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5];
  const float rz = box3d[6];

  // Move from the bottom-face centre to the geometric centre of the box.
  float center_z = box3d[2];
  center_z += z_size / 2.0;

  // Height test first; it does not need the rotation.
  if (fabsf(pz - center_z) > z_size / 2.0) return 0;

  lidar_to_local_coords_cpu(px - cx, py - cy, rz, local_x, local_y);

  const bool inside_x = (local_x > -x_size / 2.0) && (local_x < x_size / 2.0);
  const bool inside_y = (local_y > -y_size / 2.0) && (local_y < y_size / 2.0);
  return (inside_x && inside_y) ? 1 : 0;
}

// Fills a boxes-by-points table of 0/1 flags telling whether each point lies
// inside each box.
//
// boxes_tensor:       (N, 7) rows [x, y, z, x_size, y_size, z_size, rz] in
//                     LiDAR coordinates, z being the bottom centre (float).
// pts_tensor:         (npoints, 3) rows [x, y, z] in LiDAR coordinates
//                     (float).
// pts_indices_tensor: (N, npoints) output (int); entry [i, j] is the result
//                     of check_pt_in_box3d_cpu for point j and box i.
// All three tensors must be contiguous.
void points_in_boxes_cpu_forward(Tensor boxes_tensor, Tensor pts_tensor,
                                 Tensor pts_indices_tensor) {
  CHECK_CONTIGUOUS(boxes_tensor);
  CHECK_CONTIGUOUS(pts_tensor);
  CHECK_CONTIGUOUS(pts_indices_tensor);

  const int num_boxes = boxes_tensor.size(0);
  const int num_pts = pts_tensor.size(0);

  const float *box_data = boxes_tensor.data_ptr<float>();
  const float *pt_data = pts_tensor.data_ptr<float>();
  int *flags = pts_indices_tensor.data_ptr<int>();

  // Scratch outputs of the containment test; their values are not used here.
  float local_x = 0, local_y = 0;
  for (int box = 0; box < num_boxes; ++box) {
    const float *cur_box = box_data + box * 7;
    int *flag_row = flags + box * num_pts;
    for (int pt = 0; pt < num_pts; ++pt) {
      flag_row[pt] =
          check_pt_in_box3d_cpu(pt_data + pt * 3, cur_box, local_x, local_y);
    }
  }
}


================================================
FILE: mmcv/ops/csrc/pytorch/cpu/psamask.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Fallback function-like min/max macros, defined only when no macro of the
// same name already exists. Each argument may be evaluated twice, so only
// pass expressions without side effects.
#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif

void psamask_collect_forward(const int num_, const int h_feature,
                             const int w_feature, const int h_mask,
                             const int w_mask, const int half_h_mask,
                             const int half_w_mask, const Tensor mask_data,
                             Tensor buffer_data) {
  for (int n = 0; n < num_; n++) {
    for (int h = 0; h < h_feature; h++) {
      for (int w = 0; w < w_feature; w++) {
        // effective mask region : [hstart, hend) x [wstart, wend) with
        // mask-indexed
        const int hstart = max(0, half_h_mask - h);
        const int hend = min(h_mask, h_feature + half_h_mask - h);
        const int wstart = max(0, half_w_mask - w);
        const int wend = min(w_mask, w_feature + half_w_mask - w);
        // (hidx,                    widx                   ) with mask-indexed
        // (hidx + h - half_h_mask, widx + w - half_w_mask) with
        // feature-indexed
        for (int hidx = hstart; hidx < hend; hidx++) {
          for (int widx = wstart; widx < wend; widx++) {
            buffer_data.view({-1})[(n * h_feature * w_feature +
                                    (hidx + h - half_h_mask) * w_feature +
                                    (widx + w - half_w_mask)) *
                                       h_feature * w_feature +
                                   h * w_feature + w] =
                mask_data.view(
                    {-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
                               h_feature +
                           h) *
                              w_feature +
                          w];
          }
        }
      }
    }
  }
}

void psamask_distribute_forward(const int num_, const int h_feature,
                                const int w_feature, const int h_mask,
                                const int w_mask, const int half_h_mask,
                                const int half_w_mask, const Tensor mask_data,
                                Tensor buffer_data) {
  for (int n = 0; n < num_; n++) {
    for (int h = 0; h < h_feature; h++) {
      for (int w = 0; w < w_feature; w++) {
        // effective mask region : [hstart, hend) x [wstart, wend) with
        // mask-indexed
        const int hstart = max(0, half_h_mask - h);
        const int hend = min(h_mask, h_feature + half_h_mask - h);
        const int wstart = max(0, half_w_mask - w);
        const int wend = min(w_mask, w_feature + half_w_mask - w);
        // (hidx,                    widx                   ) with mask-indexed
        // (hidx + h - half_h_mask, widx + w - half_w_mask) with
        // feature-indexed
        for (int hidx = hstart; hidx < hend; hidx++) {
          for (int widx = wstart; widx < wend; widx++) {
            buffer_data.view(
                {-1})[(n * h_feature * w_feature + h * w_feature + w) *
                          h_feature * w_feature +
                      (hidx + h - half_h_mask) * w_feature +
                      (widx + w - half_w_mask)] =
                mask_data.view(
                    {-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
                               h_feature +
                           h) *
                              w_feature +
                          w];
          }
        }
      }
    }
  }
}

void psamask_collect_backward(const int num_, const int h_feature,
                              const int w_feature, const int h_mask,
                              const int w_mask, const int half_h_mask,
                              const int half_w_mask, const Tensor buffer_diff,
                              Tensor mask_diff) {
  for (int n = 0; n < num_; n++) {
    for (int h = 0; h < h_feature; h++) {
      for (int w = 0; w < w_feature; w++) {
        // effective mask region : [hstart, hend) x [wstart, wend) with
        // mask-indexed
        const int hstart = max(0, half_h_mask - h);
        const int hend = min(h_mask, h_feature + half_h_mask - h);
        const int wstart = max(0, half_w_mask - w);
        const int wend = min(w_mask, w_feature + half_w_mask - w);
        // (hidx,                    widx                   ) with mask-indexed
        // (hidx + h - half_h_mask, widx + w - half_w_mask) with
        // feature-indexed
        for (int hidx = hstart; hidx < hend; hidx++) {
          for (int widx = wstart; widx < wend; widx++) {
            mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
                                      h_feature +
                                  h) *
                                     w_feature +
                                 w] =
                buffer_diff.view({-1})[(n * h_feature * w_feature +
                                        (hidx + h - half_h_mask) * w_feature +
                                        (widx + w - half_w_mask)) *
                                           h_feature * w_feature +
                                       h * w_feature + w];
          }
        }
      }
    }
  }
}

void psamask_distribute_backward(const int num_, const int h_feature,
                                 const int w_feature, const int h_mask,
                                 const int w_mask, const int half_h_mask,
                                 const int half_w_mask,
                                 const Tensor buffer_diff, Tensor mask_diff) {
  for (int n = 0; n < num_; n++) {
    for (int h = 0; h < h_feature; h++) {
      for (int w = 0; w < w_feature; w++) {
        // effective mask region : [hstart, hend) x [wstart, wend) with
        // mask-indexed
        const int hstart = max(0, half_h_mask - h);
        const int hend = min(h_mask, h_feature + half_h_mask - h);
        const int wstart = max(0, half_w_mask - w);
        const int wend = min(w_mask, w_feature + half_w_mask - w);
        // (hidx,                    widx                   ) with mask-indexed
        // (hidx + h - half_h_mask, widx + w - half_w_mask) with
        // feature-indexed
        for (int hidx = hstart; hidx < hend; hidx++) {
          for (int widx = wstart; widx < wend; widx++) {
            mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
                                      h_feature +
                                  h) *
                                     w_feature +
                                 w] =
                buffer_diff.view(
                    {-1})[(n * h_feature * w_feature + h * w_feature + w) *
                              h_feature * w_feature +
                          (hidx + h - half_h_mask) * w_feature +
                          (widx + w - half_w_mask)];
          }
        }
      }
    }
  }
}

// CPU forward entry point of the PSA mask op. psa_type == 0 runs the collect
// variant; any other value runs the distribute variant. All remaining
// arguments are forwarded unchanged (input is the mask, output the buffer).
void psamask_forward_cpu(const int psa_type, const Tensor input, Tensor output,
                         const int num_, const int h_feature,
                         const int w_feature, const int h_mask,
                         const int w_mask, const int half_h_mask,
                         const int half_w_mask) {
  const bool collect_mode = (psa_type == 0);
  if (!collect_mode) {
    psamask_distribute_forward(num_, h_feature, w_feature, h_mask, w_mask,
                               half_h_mask, half_w_mask, input, output);
    return;
  }
  psamask_collect_forward(num_, h_feature, w_feature, h_mask, w_mask,
                          half_h_mask, half_w_mask, input, output);
}

// CPU backward entry point of the PSA mask op. psa_type == 0 runs the collect
// variant; any other value runs the distribute variant. grad_output is the
// buffer gradient that is read, grad_input the mask gradient that is written.
void psamask_backward_cpu(const int psa_type, const Tensor grad_output,
                          Tensor grad_input, const int num_,
                          const int h_feature, const int w_feature,
                          const int h_mask, const int w_mask,
                          const int half_h_mask, const int half_w_mask) {
  const bool collect_mode = (psa_type == 0);
  if (!collect_mode) {
    psamask_distribute_backward(num_, h_feature, w_feature, h_mask, w_mask,
                                half_h_mask, half_w_mask, grad_output,
                                grad_input);
    return;
  }
  psamask_collect_backward(num_, h_feature, w_feature, h_mask, w_mask,
                           half_h_mask, half_w_mask, grad_output, grad_input);
}

// Forward declaration only; the definition is not part of this file.
void psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,
                          const int num_, const int h_feature,
                          const int w_feature, const int h_mask,
                          const int w_mask, const int half_h_mask,
                          const int half_w_mask);

// Forward declaration only; the definition is not part of this file.
void psamask_backward_impl(const int psa_type, const Tensor grad_output,
                           Tensor grad_input, const int num_,
                           const int h_feature, const int w_feature,
                           const int h_mask, const int w_mask,
                           const int half_h_mask, const int half_w_mask);
// Associate the CPU functions above with the *_impl entry points.
// NOTE(review): the exact semantics of REGISTER_DEVICE_IMPL live in
// pytorch_device_registry.hpp -- presumably a device-keyed dispatch table.
REGISTER_DEVICE_IMPL(psamask_forward_impl, CPU, psamask_forward_cpu);
REGISTER_DEVICE_IMPL(psamask_backward_impl, CPU, psamask_backward_cpu);


================================================
FILE: mmcv/ops/csrc/pytorch/cpu/roi_align.cpp
================================================
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlign
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// implementation taken from Caffe2
// Cached bilinear-interpolation data for a single sampling point: offsets of
// the four neighbouring pixels inside one feature plane and the weight that
// belongs to each of them.
template <typename T>
struct PreCalc {
  // flattened pixel offsets (pos_k is paired with w_k)
  int pos1, pos2, pos3, pos4;
  // interpolation weights
  T w1, w2, w3, w4;
};

template <typename T>
void pre_calc_for_bilinear_interpolate(
    const int height, const int width, const int pooled_height,
    const int pooled_width, const int iy_upper, const int ix_upper,
    T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w,
    int roi_bin_grid_h, int roi_bin_grid_w, std::vector<PreCalc<T>>& pre_calc) {
  int pre_calc_index = 0;
  for (int ph = 0; ph < pooled_height; ph++) {
    for (int pw = 0; pw < pooled_width; pw++) {
      for (int iy = 0; iy < iy_upper; iy++) {
        const T yy = roi_start_h + ph * bin_size_h +
                     static_cast<T>(iy + .5f) * bin_size_h /
                         static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
        for (int ix = 0; ix < ix_upper; ix++) {
          const T xx = roi_start_w + pw * bin_size_w +
                       static_cast<T>(ix + .5f) * bin_size_w /
                           static_cast<T>(roi_bin_grid_w);

          T x = xx;
          T y = yy;
          // deal with: inverse elements are out of feature map boundary
          if (y < -1.0 || y > height || x < -1.0 || x > width) {
            // empty
            PreCalc<T> pc;
            pc.pos1 = 0;
            pc.pos2 = 0;
            pc.pos3 = 0;
            pc.pos4 = 0;
            pc.w1 = 0;
            pc.w2 = 0;
            pc.w3 = 0;
            pc.w4 = 0;
            pre_calc[pre_calc_index] = pc;
            pre_calc_index += 1;
            continue;
          }

          if (y <= 0) {
            y = 0;
          }
          if (x <= 0) {
            x = 0;
          }

          int y_low = (int)y;
          int x_low = (int)x;
          int y_high;
          int x_high;

          if (y_low >= height - 1) {
            y_high = y_low = height - 1;
            y = (T)y_low;
          } else {
            y_high = y_low + 1;
          }

          if (x_low >= width - 1) {
            x_high = x_low = width - 1;
            x = (T)x_low;
          } else {
            x_high = x_low + 1;
          }

          T ly = y - y_low;
          T lx = x - x_low;
          T hy = 1. - ly, hx = 1. - lx;
          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

          // save weights and indices
          PreCalc<T> pc;
          pc.pos1 = y_low * width + x_low;
          pc.pos2 = y_low * width + x_high;
          pc.pos3 = y_high * width + x_low;
          pc.pos4 = y_high * width + x_high;
          pc.w1 = w1;
          pc.w2 = w2;
          pc.w3 = w3;
          pc.w4 = w4;
          pre_calc[pre_calc_index] = pc;

          pre_calc_index += 1;
        }
      }
    }
  }
}

// CPU forward pass of RoIAlign.
//
// nthreads        total number of output elements; the number of RoIs is
//                 derived as nthreads / channels / pooled_width /
//                 pooled_height
// input           feature data, indexed as
//                 ((batch * channels + c) * height + y) * width + x
// rois            5 values per RoI: batch index, start_w, start_h, end_w,
//                 end_h (coordinates are multiplied by spatial_scale)
// output          pooled result, indexed as
//                 ((n * channels + c) * pooled_height + ph) * pooled_width + pw
// argmax_y/x      written in max mode only: coordinates of the sampling point
//                 that produced the maximum (-1 when no sample was selected)
// sampling_ratio  samples per bin and axis; when <= 0 it is derived per RoI as
//                 ceil(roi_size / pooled_size)
// pool_mode       0 - max pool, 1 - avg pool; other values write nothing
// aligned         shifts the RoI by -0.5 and requires a non-negative RoI
//                 size; otherwise the RoI size is forced to at least 1
template <typename T>
void ROIAlignForward(const int nthreads, const T* input, const T* rois,
                     T* output, T* argmax_y, T* argmax_x,
                     const int pooled_height, const int pooled_width,
                     const T spatial_scale, const int sampling_ratio,
                     const int pool_mode,  // 0 - max pool, 1 - avg pool
                     const bool aligned, const int channels, const int height,
                     const int width) {
  int n_rois = nthreads / channels / pooled_width / pooled_height;
  // (n, c, ph, pw) is an element in the pooled output
  // can be parallelized using omp
  // #pragma omp parallel for num_threads(32)
  for (int n = 0; n < n_rois; n++) {
    int index_n = n * channels * pooled_width * pooled_height;

    const T* offset_rois = rois + n * 5;
    int roi_batch_ind = offset_rois[0];

    // Do not use rounding; this implementation detail is critical
    T offset = aligned ? (T)0.5 : (T)0.0;
    T roi_start_w = offset_rois[1] * spatial_scale - offset;
    T roi_start_h = offset_rois[2] * spatial_scale - offset;
    T roi_end_w = offset_rois[3] * spatial_scale - offset;
    T roi_end_h = offset_rois[4] * spatial_scale - offset;

    T roi_width = roi_end_w - roi_start_w;
    T roi_height = roi_end_h - roi_start_h;
    if (aligned) {
      // The message now states the violated condition; the previous wording
      // ("cannot have non-negative size") said the opposite of the check.
      AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
                 "ROIs in ROIAlign do not have non-negative size!");
    } else {  // for backward-compatibility only
      roi_width = std::max(roi_width, (T)1.);
      roi_height = std::max(roi_height, (T)1.);
    }
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
                             ? sampling_ratio
                             : ceilf(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width);

    // When the grid is empty, output zeros == 0/1, instead of NaN.
    const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4

    // we want to precalculate indices and weights shared by all channels,
    // this is the key point of optimization
    std::vector<PreCalc<T>> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
                                     pooled_width * pooled_height);
    pre_calc_for_bilinear_interpolate(
        height, width, pooled_height, pooled_width, roi_bin_grid_h,
        roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
        roi_bin_grid_h, roi_bin_grid_w, pre_calc);

    for (int c = 0; c < channels; c++) {
      int index_n_c = index_n + c * pooled_width * pooled_height;
      const T* offset_input =
          input + (roi_batch_ind * channels + c) * height * width;
      // pre_calc is consumed in the same (ph, pw, iy, ix) order it was filled
      int pre_calc_index = 0;

      for (int ph = 0; ph < pooled_height; ph++) {
        for (int pw = 0; pw < pooled_width; pw++) {
          int index = index_n_c + ph * pooled_width + pw;

          T output_val = 0.;
          // NOTE(review): -10000 acts as the "no sample yet" sentinel, so
          // samples that never exceed it leave maxval at -10000 and the
          // argmax at -1.
          T maxval = -10000;
          T maxidx_y = -1.f, maxidx_x = -1.f;
          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
            // sample coordinates are recomputed only to record the argmax
            const T y = roi_start_h + ph * bin_size_h +
                        static_cast<T>(iy + .5f) * bin_size_h /
                            static_cast<T>(roi_bin_grid_h);
            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
              const T x = roi_start_w + pw * bin_size_w +
                          static_cast<T>(ix + .5f) * bin_size_w /
                              static_cast<T>(roi_bin_grid_w);
              PreCalc<T> pc = pre_calc[pre_calc_index];
              T val = pc.w1 * offset_input[pc.pos1] +
                      pc.w2 * offset_input[pc.pos2] +
                      pc.w3 * offset_input[pc.pos3] +
                      pc.w4 * offset_input[pc.pos4];
              if (val > maxval) {
                maxval = val;
                maxidx_y = y;
                maxidx_x = x;
              }
              output_val += val;
              pre_calc_index += 1;
            }
          }
          if (pool_mode == 0) {
            // We do max pooling inside a bin
            output[index] = maxval;
            argmax_y[index] = maxidx_y;
            argmax_x[index] = maxidx_x;
          } else if (pool_mode == 1) {
            // We do average (integral) pooling inside a bin
            output[index] = output_val / count;
          }  // if
        }  // for pw
      }  // for ph
    }  // for c
  }  // for n
}

// Computes the four bilinear weights (w1..w4) and the neighbouring pixel
// indices (x_low, x_high, y_low, y_high) for the sampling point (y, x).
//
// A point outside [-1, height] x [-1, width] yields zero weights and -1 for
// every index. Otherwise the point is clamped into the feature map first.
// Weight/pixel pairing: w1 -> (y_low, x_low), w2 -> (y_low, x_high),
// w3 -> (y_high, x_low), w4 -> (y_high, x_high).
template <typename T>
void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
                                   T& w1, T& w2, T& w3, T& w4, int& x_low,
                                   int& x_high, int& y_low, int& y_high,
                                   const int index /* index for debug only*/) {
  const bool outside = y < -1.0 || y > height || x < -1.0 || x > width;
  if (outside) {
    // nothing to propagate for this sample
    w1 = w2 = w3 = w4 = 0.;
    x_low = x_high = y_low = y_high = -1;
    return;
  }

  if (y <= 0) {
    y = 0;
  }
  if (x <= 0) {
    x = 0;
  }

  y_low = (int)y;
  x_low = (int)x;

  // clamp to the last row
  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = (T)y_low;
  } else {
    y_high = y_low + 1;
  }

  // clamp to the last column
  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = (T)x_low;
  } else {
    x_high = x_low + 1;
  }

  // fractional distances to the low neighbours and their complements
  const T frac_y = y - y_low;
  const T frac_x = x - x_low;
  const T rest_y = 1. - frac_y;
  const T rest_x = 1. - frac_x;

  w1 = rest_y * rest_x;
  w2 = rest_y * frac_x;
  w3 = frac_y * rest_x;
  w4 = frac_y * frac_x;
}

// Adds val in place to the value stored at address (plain, non-atomic).
template <class T>
inline void add(T* address, const T& val) {
  T& target = *address;
  target += val;
}

// CPU backward pass of RoIAlign: scatters the gradient of every pooled output
// element back onto the input pixels that contributed to it.
//
// nthreads          number of pooled output elements to process
// grad_output       output gradient; may be non-contiguous and is therefore
//                   indexed through n_stride / c_stride / h_stride / w_stride
// rois              5 values per RoI: batch index, start_w, start_h, end_w,
//                   end_h (coordinates are multiplied by spatial_scale)
// argmax_y/x        read in max mode only, indexed by the flat output index
// grad_input        accumulated into (+=), indexed as
//                   ((batch * channels + c) * height + y) * width + x
// pool_mode         0 - max pool, 1 - avg pool; other values do nothing
template <typename T>
void ROIAlignBackward(const int nthreads, const T* grad_output, const T* rois,
                      const T* argmax_y, const T* argmax_x, T* grad_input,
                      const int pooled_height, const int pooled_width,
                      const T spatial_scale, const int sampling_ratio,
                      const int pool_mode,  // 0 - max pool, 1 - avg pool
                      const bool aligned, const int channels, const int height,
                      const int width, const int n_stride, const int c_stride,
                      const int h_stride, const int w_stride) {
  for (int index = 0; index < nthreads; index++) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T* offset_rois = rois + n * 5;
    int roi_batch_ind = offset_rois[0];

    // Do not use rounding; this implementation detail is critical
    T offset = aligned ? (T)0.5 : (T)0.0;
    T roi_start_w = offset_rois[1] * spatial_scale - offset;
    T roi_start_h = offset_rois[2] * spatial_scale - offset;
    T roi_end_w = offset_rois[3] * spatial_scale - offset;
    T roi_end_h = offset_rois[4] * spatial_scale - offset;

    T roi_width = roi_end_w - roi_start_w;
    T roi_height = roi_end_h - roi_start_h;
    if (aligned) {
      AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
                 "ROIs in ROIAlign do not have non-negative size!");
    } else {  // for backward-compatibility only
      roi_width = std::max(roi_width, (T)1.);
      roi_height = std::max(roi_height, (T)1.);
    }
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    // feature plane of grad_input that belongs to this RoI's batch item and
    // channel
    T* offset_grad_input =
        grad_input + ((roi_batch_ind * channels + c) * height * width);

    // grad_output is addressed through its strides, not through `index`
    int output_offset = n * n_stride + c * c_stride;
    const T* offset_grad_output = grad_output + output_offset;
    const T grad_output_this_bin =
        offset_grad_output[ph * h_stride + pw * w_stride];

    if (pool_mode == 0) {
      // We do max pooling inside a bin
      T y = argmax_y[index], x = argmax_x[index];
      // -1 marks "no argmax recorded"; such bins receive no gradient
      if (y != -1.f) {
        T w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;
        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
                                      x_low, x_high, y_low, y_high, index);

        T g1 = grad_output_this_bin * w1;
        T g2 = grad_output_this_bin * w2;
        T g3 = grad_output_this_bin * w3;
        T g4 = grad_output_this_bin * w4;

        // indices are -1 when the sample was outside the feature map
        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          // atomic add is not needed for now since it is single threaded
          add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
          add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
          add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
          add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
        }  // if
      }  // mode
    } else if (pool_mode == 1) {
      // We do average (integral) pooling inside a bin
      // We use roi_bin_grid to sample the grid and mimic integral
      int roi_bin_grid_h =
          (sampling_ratio > 0)
              ? sampling_ratio
              : ceilf(roi_height / pooled_height);  // e.g., = 2
      int roi_bin_grid_w = (sampling_ratio > 0)
                               ? sampling_ratio
                               : ceilf(roi_width / pooled_width);

      // count may be 0 for an empty grid; it is only used as a divisor inside
      // the sampling loops below, which do not execute in that case
      const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
      for (int iy = 0; iy < roi_bin_grid_h; iy++) {
        const T y = roi_start_h + ph * bin_size_h +
                    static_cast<T>(iy + .5f) * bin_size_h /
                        static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
        for (int ix = 0; ix < roi_bin_grid_w; ix++) {
          const T x = roi_start_w + pw * bin_size_w +
                      static_cast<T>(ix + .5f) * bin_size_w /
                          static_cast<T>(roi_bin_grid_w);

          T w1, w2, w3, w4;
          int x_low, x_high, y_low, y_high;

          bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
                                        x_low, x_high, y_low, y_high, index);

          // every sample carries an equal 1/count share of the bin gradient
          T g1 = grad_output_this_bin * w1 / count;
          T g2 = grad_output_this_bin * w2 / count;
          T g3 = grad_output_this_bin * w3 / count;
          T g4 = grad_output_this_bin * w4 / count;

          if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
            // atomic add is not needed for now since it is single threaded
            add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
            add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
            add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
            add(offset_grad_input + y_high * width + x_high,
                static_cast<T>(g4));
          }  // if
        }  // ix
      }  // iy
    }  // mode
  }  // for
}  // ROIAlignBackward

// Reads the problem sizes from the tensors and runs ROIAlignForward for the
// scalar type of `input`. Channels, height and width come from dimensions
// 1, 2 and 3 of `input`; aligned_height / aligned_width are passed on as the
// pooled output size.
void ROIAlignForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
                                Tensor argmax_y, Tensor argmax_x,
                                int aligned_height, int aligned_width,
                                float spatial_scale, int sampling_ratio,
                                int pool_mode, bool aligned) {
  const int num_outputs = output.numel();
  const int num_channels = input.size(1);
  const int feat_height = input.size(2);
  const int feat_width = input.size(3);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "ROIAlign_forward", [&] {
        const scalar_t* input_ptr = input.data_ptr<scalar_t>();
        const scalar_t* rois_ptr = rois.data_ptr<scalar_t>();
        scalar_t* output_ptr = output.data_ptr<scalar_t>();
        scalar_t* argmax_y_ptr = argmax_y.data_ptr<scalar_t>();
        scalar_t* argmax_x_ptr = argmax_x.data_ptr<scalar_t>();
        const scalar_t scale = static_cast<scalar_t>(spatial_scale);

        ROIAlignForward<scalar_t>(num_outputs, input_ptr, rois_ptr, output_ptr,
                                  argmax_y_ptr, argmax_x_ptr, aligned_height,
                                  aligned_width, scale, sampling_ratio,
                                  pool_mode, aligned, num_channels, feat_height,
                                  feat_width);
      });
}

// Reads the problem sizes from the tensors and runs ROIAlignBackward for the
// scalar type of `grad_output`. Channels, height and width come from
// dimensions 1, 2 and 3 of `grad_input`; the four strides of `grad_output`
// are forwarded so the kernel can index a non-contiguous gradient.
void ROIAlignBackwardCPULauncher(Tensor grad_output, Tensor rois,
                                 Tensor argmax_y, Tensor argmax_x,
                                 Tensor grad_input, int aligned_height,
                                 int aligned_width, float spatial_scale,
                                 int sampling_ratio, int pool_mode,
                                 bool aligned) {
  const int num_outputs = grad_output.numel();
  const int num_channels = grad_input.size(1);
  const int feat_height = grad_input.size(2);
  const int feat_width = grad_input.size(3);

  // strides of the (possibly non-contiguous) output gradient
  const int stride_n = grad_output.stride(0);
  const int stride_c = grad_output.stride(1);
  const int stride_h = grad_output.stride(2);
  const int stride_w = grad_output.stride(3);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "ROIAlign_backward", [&] {
        const scalar_t* grad_out_ptr = grad_output.data_ptr<scalar_t>();
        const scalar_t* rois_ptr = rois.data_ptr<scalar_t>();
        const scalar_t* argmax_y_ptr = argmax_y.data_ptr<scalar_t>();
        const scalar_t* argmax_x_ptr = argmax_x.data_ptr<scalar_t>();
        scalar_t* grad_in_ptr = grad_input.data_ptr<scalar_t>();
        const scalar_t scale = static_cast<scalar_t>(spatial_scale);

        ROIAlignBackward<scalar_t>(
            num_outputs, grad_out_ptr, rois_ptr, argmax_y_ptr, argmax_x_ptr,
            grad_in_ptr, aligned_height, aligned_width, scale, sampling_ratio,
            pool_mode, aligned, num_channels, feat_height, feat_width, stride_n,
            stride_c, stride_h, stride_w);
      });
}

// CPU implementation of the RoIAlign forward op: forwards every argument
// unchanged to ROIAlignForwardCPULauncher.
void roi_align_forward_cpu(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax_y, Tensor argmax_x, int aligned_height,
                           int aligned_width, float spatial_scale,
                           int sampling_ratio, int pool_mode, bool aligned) {
  return ROIAlignForwardCPULauncher(
      // tensors
      input, rois, output, argmax_y, argmax_x,
      // pooling configuration
      aligned_height, aligned_width, spatial_scale, sampling_ratio, pool_mode,
      aligned);
}

// CPU implementation of the RoIAlign backward op: forwards every argument
// unchanged to ROIAlignBackwardCPULauncher.
void roi_align_backward_cpu(Tensor grad_output, Tensor rois, Tensor argmax_y,
                            Tensor argmax_x, Tensor grad_input,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned) {
  return ROIAlignBackwardCPULauncher(
      // tensors
      grad_output, rois, argmax_y, argmax_x, grad_input,
      // pooling configuration
      aligned_height, aligned_width, spatial_scale, sampling_ratio, pool_mode,
      aligned);
}

// Forward declaration only; the definition is not part of this file.
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned);

// Forward declaration only; the definition is not part of this file.
void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,
                             Tensor argmax_x, Tensor grad_input,
                             int aligned_height, int aligned_width,
                             float spatial_scale, int sampling_ratio,
                             int pool_mode, bool aligned);

// Associate the CPU functions above with the *_impl entry points.
// NOTE(review): the exact semantics of REGISTER_DEVICE_IMPL live in
// pytorch_device_registry.hpp -- presumably a device-keyed dispatch table.
REGISTER_DEVICE_IMPL(roi_align_forward_impl, CPU, roi_align_forward_cpu);
REGISTER_DEVICE_IMPL(roi_align_backward_impl, CPU, roi_align_backward_cpu);


================================================
FILE: mmcv/ops/csrc/pytorch/cpu/roi_align_rotated.cpp
================================================
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// implementation taken from Caffe2
// Cached bilinear-interpolation data for a single sampling point: offsets of
// the four neighbouring pixels inside one feature plane and the weight that
// belongs to each of them.
template <typename T>
struct PreCalc {
  // flattened pixel offsets (pos_k is paired with w_k)
  int pos1, pos2, pos3, pos4;
  // interpolation weights
  T w1, w2, w3, w4;
};

template <typename T>
void pre_calc_for_bilinear_interpolate(
    const int height, const int width, const int pooled_height,
    const int pooled_width, const int iy_upper, const int ix_upper,
    T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w,
    int roi_bin_grid_h, int roi_bin_grid_w, T roi_center_h, T roi_center_w,
    T cos_theta, T sin_theta, std::vector<PreCalc<T>>& pre_calc) {
  int pre_calc_index = 0;
  for (int ph = 0; ph < pooled_height; ph++) {
    for (int pw = 0; pw < pooled_width; pw++) {
      for (int iy = 0; iy < iy_upper; iy++) {
        const T yy = roi_start_h + ph * bin_size_h +
                     static_cast<T>(iy + .5f) * bin_size_h /
                         static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
        for (int ix = 0; ix < ix_upper; ix++) {
          const T xx = roi_start_w + pw * bin_size_w +
                       static_cast<T>(ix + .5f) * bin_size_w /
                           static_cast<T>(roi_bin_grid_w);

          // Rotate by theta around the center and translate
          // In image space, (y, x) is the order for Right Handed System,
          // and this is essentially multiplying the point by a rotation matrix
          // to rotate it counterclockwise through angle theta.
          T y = yy * cos_theta - xx * sin_theta + roi_center_h;
          T x = yy * sin_theta + xx * cos_theta + roi_center_w;
          // deal with: inverse elements are out of feature map boundary
          if (y < -1.0 || y > height || x < -1.0 || x > width) {
            // empty
            PreCalc<T> pc;
            pc.pos1 = 0;
            pc.pos2 = 0;
            pc.pos3 = 0;
            pc.pos4 = 0;
            pc.w1 = 0;
            pc.w2 = 0;
            pc.w3 = 0;
            pc.w4 = 0;
            pre_calc[pre_calc_index] = pc;
            pre_calc_index += 1;
            continue;
          }

          if (y < 0) {
            y = 0;
          }
          if (x < 0) {
            x = 0;
          }

          int y_low = (int)y;
          int x_low = (int)x;
          int y_high;
          int x_high;

          if (y_low >= height - 1) {
            y_high = y_low = height - 1;
            y = (T)y_low;
          } else {
            y_high = y_low + 1;
          }

          if (x_low >= width - 1) {
            x_high = x_low = width - 1;
            x = (T)x_low;
          } else {
            x_high = x_low + 1;
          }

          T ly = y - y_low;
          T lx = x - x_low;
          T hy = 1. - ly, hx = 1. - lx;
          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

          // save weights and indices
          PreCalc<T> pc;
          pc.pos1 = y_low * width + x_low;
          pc.pos2 = y_low * width + x_high;
          pc.pos3 = y_high * width + x_low;
          pc.pos4 = y_high * width + x_high;
          pc.w1 = w1;
          pc.w2 = w2;
          pc.w3 = w3;
          pc.w4 = w4;
          pre_calc[pre_calc_index] = pc;

          pre_calc_index += 1;
        }
      }
    }
  }
}

template <typename T>
void ROIAlignRotatedForward(const int nthreads, const T* input,
                            const T& spatial_scale, const bool aligned,
                            const bool clockwise, const int channels,
                            const int height, const int width,
                            const int pooled_height, const int pooled_width,
                            const int sampling_ratio, const T* rois,
                            T* output) {
  int n_rois = nthreads / channels / pooled_width / pooled_height;
  // (n, c, ph, pw) is an element in the pooled output
  // can be parallelized using omp
  // #pragma omp parallel for num_threads(32)
  for (int n = 0; n < n_rois; n++) {
    int index_n = n * channels * pooled_width * pooled_height;

    const T* current_roi = rois + n * 6;
    int roi_batch_ind = current_roi[0];

    // Do not use rounding; this implementation detail is critical
    T offset = aligned ? (T)0.5 : (T)0.0;
    T roi_center_w = current_roi[1] * spatial_scale - offset;
    T roi_center_h = current_roi[2] * spatial_scale - offset;
    T roi_width = current_roi[3] * spatial_scale;
    T roi_height = current_roi[4] * spatial_scale;
    T theta = current_roi[5];
    if (clockwise) {
      theta = -theta;  // If clockwise, the angle needs to be reversed.
    }
    T cos_theta = cos(theta);
    T sin_theta = sin(theta);

    if (aligned) {
      AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
                 "ROIs in ROIAlignRotated do not have non-negative size!");
    } else {  // for backward-compatibility only
      roi_width = std::max(roi_width, (T)1.);
      roi_height = std::max(roi_height, (T)1.);
    }

    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
                             ? sampling_ratio
                             : ceilf(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width);

    // We do average (integral) pooling inside a bin
    const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4

    // we want to precalculate indices and weights shared by all channels,
    // this is the key point of optimization
    std::vector<PreCalc<T>> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
                                     pooled_width * pooled_height);

    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    T roi_start_h = -roi_height / 2.0;
    T roi_start_w = -roi_width / 2.0;

    pre_calc_for_bilinear_interpolate(
        height, width, pooled_height, pooled_width, roi_bin_grid_h,
        roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
        roi_bin_grid_h, roi_bin_grid_w, roi_center_h, roi_center_w, cos_theta,
        sin_theta, pre_calc);

    for (int c = 0; c < channels; c++) {
      int index_n_c = index_n + c * pooled_width * pooled_height;
      const T* offset_input =
          input + (roi_batch_ind * channels + c) * height * width;
      int pre_calc_index = 0;

      for (int ph = 0; ph < pooled_height; ph++) {
        for (int pw = 0; pw < pooled_width; pw++) {
          int index = index_n_c + ph * pooled_width + pw;

          T output_val = 0.;
          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
              PreCalc<T> pc = pre_calc[pre_calc_index];
              output_val += pc.w1 * offset_input[pc.pos1] +
                            pc.w2 * offset_input[pc.pos2] +
                            pc.w3 * offset_input[pc.pos3] +
                            pc.w4 * offset_input[pc.pos4];

              pre_calc_index += 1;
            }
          }
          output_val /= count;

          output[index] = output_val;
        }  // for pw
      }  // for ph
    }  // for c
  }  // for n
}

// Computes the four bilinear weights and the integer corner coordinates for
// the sampling position (y, x) on a height x width grid.
//
// A position outside [-1, height] x [-1, width] produces zero weights and
// sets every corner index to -1. Otherwise negative coordinates are clamped
// to 0, and a position on or past the last row/column collapses both corners
// of that axis onto the last row/column.
//
// w1..w4 belong to the corners (y_low, x_low), (y_low, x_high),
// (y_high, x_low) and (y_high, x_high), in that order.
template <typename T>
void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
                                   T& w1, T& w2, T& w3, T& w4, int& x_low,
                                   int& x_high, int& y_low, int& y_high) {
  const bool out_of_range = y < -1.0 || y > height || x < -1.0 || x > width;
  if (out_of_range) {
    // Nothing to sample: report an empty contribution.
    w1 = w2 = w3 = w4 = 0.;
    x_low = x_high = y_low = y_high = -1;
    return;
  }

  // Positions in the one-pixel margin above/left of the grid snap to 0.
  if (y < 0) y = 0;
  if (x < 0) x = 0;

  y_low = (int)y;
  x_low = (int)x;

  const bool on_last_row = y_low >= height - 1;
  if (on_last_row) {
    y_low = height - 1;
    y = (T)y_low;
  }
  y_high = on_last_row ? y_low : y_low + 1;

  const bool on_last_col = x_low >= width - 1;
  if (on_last_col) {
    x_low = width - 1;
    x = (T)x_low;
  }
  x_high = on_last_col ? x_low : x_low + 1;

  // Fractional distances to the low corner and their complements.
  const T ly = y - y_low;
  const T lx = x - x_low;
  const T hy = 1. - ly;
  const T hx = 1. - lx;

  // The forward pass evaluates
  //   w1 * in[y_low][x_low] + w2 * in[y_low][x_high] +
  //   w3 * in[y_high][x_low] + w4 * in[y_high][x_high]
  w1 = hy * hx;
  w2 = hy * lx;
  w3 = ly * hx;
  w4 = ly * lx;
}

// Accumulates `val` into the element `address` points to, using a plain
// (non-atomic) read-modify-write.
template <class T>
inline void add(T* address, const T& val) {
  address[0] += val;
}

// CPU backward pass of rotated RoIAlign.
//
// Every pooled-output gradient is distributed onto the input gradient through
// the bilinear sampling grid of its bin, each sample scaled by 1 / count
// where count is the number of samples per bin.
//
//   nthreads        number of pooled elements to process; each flat index is
//                   unravelled as (n, c, ph, pw)
//   grad_output     gradient of the pooled output, addressed through
//                   n_stride / c_stride / h_stride / w_stride
//   rois            6 values per RoI: [0] batch index, [1] center x,
//                   [2] center y, [3] width, [4] height, [5] angle
//   grad_input      gradient of the input, indexed as a contiguous
//                   (batch, channels, height, width) array and accumulated
//                   in place
//   sampling_ratio  samples per bin along each axis when > 0; otherwise
//                   ceil(roi_size / pooled_size) is used
template <typename T>
void ROIAlignRotatedBackward(
    const int nthreads,
    // may not be contiguous. should index using n_stride, etc
    const T* grad_output, const T& spatial_scale, const bool aligned,
    const bool clockwise, const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width, const int sampling_ratio,
    T* grad_input, const T* rois, const int n_stride, const int c_stride,
    const int h_stride, const int w_stride) {
  for (int index = 0; index < nthreads; index++) {
    // Unravel the flat index into (n, c, ph, pw).
    int rest = index;
    const int pw = rest % pooled_width;
    rest /= pooled_width;
    const int ph = rest % pooled_height;
    rest /= pooled_height;
    const int c = rest % channels;
    const int n = rest / channels;

    const T* roi = rois + n * 6;
    const int batch_idx = roi[0];

    // Do not use rounding; this implementation detail is critical
    const T half_pixel = aligned ? (T)0.5 : (T)0.0;
    const T center_w = roi[1] * spatial_scale - half_pixel;
    const T center_h = roi[2] * spatial_scale - half_pixel;
    T roi_w = roi[3] * spatial_scale;
    T roi_h = roi[4] * spatial_scale;
    T theta = roi[5];
    if (clockwise) {
      // If clockwise, the angle needs to be reversed.
      theta = -theta;
    }
    const T cos_t = cos(theta);
    const T sin_t = sin(theta);

    if (!aligned) {
      // for backward-compatibility only
      roi_w = std::max(roi_w, (T)1.);
      roi_h = std::max(roi_h, (T)1.);
    } else {
      AT_ASSERTM(roi_w >= 0 && roi_h >= 0,
                 "ROIs in ROIAlignRotated do not have non-negative size!");
    }

    const T bin_h = static_cast<T>(roi_h) / static_cast<T>(pooled_height);
    const T bin_w = static_cast<T>(roi_w) / static_cast<T>(pooled_width);

    // Gradient plane of the (batch, channel) slice this element writes to.
    T* grad_plane =
        grad_input + ((batch_idx * channels + c) * height * width);

    const T* grad_out_nc = grad_output + (n * n_stride + c * c_stride);
    const T bin_grad = grad_out_nc[ph * h_stride + pw * w_stride];

    // We use roi_bin_grid to sample the grid and mimic integral
    const int grid_h = (sampling_ratio > 0)
                           ? sampling_ratio
                           : ceilf(roi_h / pooled_height);  // e.g., = 2
    const int grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_w / pooled_width);

    // Start offsets are expressed relative to the RoI center; rotation and
    // translation are applied per sample below.
    const T start_h = -roi_h / 2.0;
    const T start_w = -roi_w / 2.0;

    // We do average (integral) pooling inside a bin
    const T count = grid_h * grid_w;  // e.g. = 4

    for (int iy = 0; iy < grid_h; iy++) {
      const T yy = start_h + ph * bin_h +
                   static_cast<T>(iy + .5f) * bin_h /
                       static_cast<T>(grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < grid_w; ix++) {
        const T xx = start_w + pw * bin_w +
                     static_cast<T>(ix + .5f) * bin_w /
                         static_cast<T>(grid_w);

        // Rotate by theta around the center and translate
        T y = yy * cos_t - xx * sin_t + center_h;
        T x = yy * sin_t + xx * cos_t + center_w;

        T w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;
        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
                                      x_low, x_high, y_low, y_high);

        // Samples that fell outside the feature map contribute nothing.
        if (x_low < 0 || x_high < 0 || y_low < 0 || y_high < 0) {
          continue;
        }

        T g1 = bin_grad * w1 / count;
        T g2 = bin_grad * w2 / count;
        T g3 = bin_grad * w3 / count;
        T g4 = bin_grad * w4 / count;

        // atomic add is not needed for now since it is single threaded
        T* row_low = grad_plane + y_low * width;
        T* row_high = grad_plane + y_high * width;
        add(row_low + x_low, static_cast<T>(g1));
        add(row_low + x_high, static_cast<T>(g2));
        add(row_high + x_low, static_cast<T>(g3));
        add(row_high + x_high, static_cast<T>(g4));
      }  // ix
    }  // iy
  }  // for
}  // ROIAlignRotatedBackward

// Host-side launcher for the rotated RoIAlign forward CPU kernel.
//
// Channels, height and width are taken from dimensions 1, 2 and 3 of `input`.
// ROIAlignRotatedForward is then instantiated for the floating-point dtype of
// `input` (half included) and run over output.numel() pooled elements.
void ROIAlignRotatedForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
                                       int aligned_height, int aligned_width,
                                       float spatial_scale, int sampling_ratio,
                                       bool aligned, bool clockwise) {
  const int num_outputs = output.numel();
  const int channels = input.size(1);
  const int height = input.size(2);
  const int width = input.size(3);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "ROIAlignRotated_forward", [&] {
        // Convert the scale once to the dispatched scalar type.
        const scalar_t scale = static_cast<scalar_t>(spatial_scale);
        ROIAlignRotatedForward<scalar_t>(
            num_outputs, input.data_ptr<scalar_t>(), scale, aligned, clockwise,
            channels, height, width, aligned_height, aligned_width,
            sampling_ratio, rois.data_ptr<scalar_t>(),
            output.data_ptr<scalar_t>());
      });
}

// Host-side launcher for the rotated RoIAlign backward CPU kernel.
//
// Channels, height and width are taken from dimensions 1, 2 and 3 of
// `grad_input`. The four strides of `grad_output` are forwarded so the kernel
// can index it even when it is not contiguous. ROIAlignRotatedBackward is
// instantiated for the floating-point dtype of `grad_output` (half included)
// and run over grad_output.numel() elements.
void ROIAlignRotatedBackwardCPULauncher(Tensor grad_output, Tensor rois,
                                        Tensor grad_input, int aligned_height,
                                        int aligned_width, float spatial_scale,
                                        int sampling_ratio, bool aligned,
                                        bool clockwise) {
  // get stride values to ensure indexing into gradients is correct.
  const int n_stride = grad_output.stride(0);
  const int c_stride = grad_output.stride(1);
  const int h_stride = grad_output.stride(2);
  const int w_stride = grad_output.stride(3);

  const int channels = grad_input.size(1);
  const int height = grad_input.size(2);
  const int width = grad_input.size(3);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "ROIAlignRotated_backward", [&] {
        // Convert the scale once to the dispatched scalar type.
        const scalar_t scale = static_cast<scalar_t>(spatial_scale);
        ROIAlignRotatedBackward<scalar_t>(
            grad_output.numel(), grad_output.data_ptr<scalar_t>(), scale,
            aligned, clockwise, channels, height, width, aligned_height,
            aligned_width, sampling_ratio, grad_input.data_ptr<scalar_t>(),
            rois.data_ptr<scalar_t>(), n_stride, c_stride, h_stride, w_stride);
      });
}

// CPU entry point of the rotated RoIAlign forward pass; registered as the CPU
// implementation of roi_align_rotated_forward_impl.
//
// Rejects `rois` whose second dimension is not 6, i.e. the same RoI layout
// that roi_align_rotated_backward_cpu enforces, so a malformed RoI tensor
// raises an error instead of being handed to the kernel. All remaining
// arguments are forwarded unchanged to ROIAlignRotatedForwardCPULauncher.
void roi_align_rotated_forward_cpu(Tensor input, Tensor rois, Tensor output,
                                   int aligned_height, int aligned_width,
                                   float spatial_scale, int sampling_ratio,
                                   bool aligned, bool clockwise) {
  int size_rois = rois.size(1);
  if (size_rois != 6) {
    AT_ERROR("wrong roi size");
  }
  ROIAlignRotatedForwardCPULauncher(input, rois, output, aligned_height,
                                    aligned_width, spatial_scale,
                                    sampling_ratio, aligned, clockwise);
}

// CPU entry point of the rotated RoIAlign backward pass; registered as the CPU
// implementation of roi_align_rotated_backward_impl.
//
// Raises "wrong roi size" unless the second dimension of `rois` is 6, then
// forwards every argument to ROIAlignRotatedBackwardCPULauncher.
void roi_align_rotated_backward_cpu(Tensor top_grad, Tensor rois,
                                    Tensor bottom_grad, int aligned_height,
                                    int aligned_width, float spatial_scale,
                                    int sampling_ratio, bool aligned,
                                    bool clockwise) {
  // Each RoI row must carry exactly 6 values.
  const int roi_columns = rois.size(1);
  const bool layout_ok = (roi_columns == 6);
  if (!layout_ok) {
    AT_ERROR("wrong roi size");
  }

  ROIAlignRotatedBackwardCPULauncher(top_grad, rois, bottom_grad,
                                     aligned_height, aligned_width,
                                     spatial_scale, sampling_ratio, aligned,
                                     clockwise);
}

// Declaration of the device-dispatched forward entry point. It is only
// declared in this file so the CPU implementation can be registered for it.
void roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,
                                    int aligned_height, int aligned_width,
                                    float spatial_scale, int sampling_ratio,
                                    bool aligned, bool clockwise);

// Declaration of the device-dispatched backward entry point (same purpose as
// the forward declaration above).
void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
                                     Tensor bottom_grad, int aligned_height,
                                     int aligned_width, float spatial_scale,
                                     int sampling_ratio, bool aligned,
                                     bool clockwise);
// Register the CPU functions defined in this file as the CPU implementations
// of the two dispatch entry points.
REGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, CPU,
                     roi_align_rotated_forward_cpu);
REGISTER_DEVICE_IMPL(roi_align_rotated_backward_impl, CPU,
                     roi_align_rotated_backward_cpu);


================================================
FILE: mmcv/ops/csrc/pytorch/cpu/rotated_feature_align.cpp
================================================
// modified from
// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Bilinearly samples the height x width row-major plane `input` at (y, x).
//
// Returns 0 for positions outside [-1, height] x [-1, width]. Coordinates
// that are <= 0 are clamped to 0, and a position on or past the last
// row/column uses the last row/column for both corners of that axis.
// `index` is accepted for debugging purposes only and is not used.
template <typename T>
T bilinear_interpolate(const T* input, const int height, const int width, T y,
                       T x, const int index /* index for debug only*/) {
  const bool out_of_range = y < -1.0 || y > height || x < -1.0 || x > width;
  if (out_of_range) return 0;

  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  int y_low = (int)y;
  int x_low = (int)x;
  int y_high = y_low + 1;
  int x_high = x_low + 1;

  // Collapse both corners onto the border when the sample sits on it.
  if (y_low >= height - 1) {
    y_low = y_high = height - 1;
    y = (T)y_low;
  }
  if (x_low >= width - 1) {
    x_low = x_high = width - 1;
    x = (T)x_low;
  }

  const T ly = y - y_low;
  const T lx = x - x_low;

  // Fetch the four neighbouring values.
  const T* row_low = input + y_low * width;
  const T* row_high = input + y_high * width;
  const T v1 = row_low[x_low];
  const T v2 = row_low[x_high];
  const T v3 = row_high[x_low];
  const T v4 = row_high[x_high];

  // Interpolate along x on both rows, then along y between the rows.
  const T v_low = fma(v2 - v1, lx, v1);
  const T v_high = fma(v4 - v3, lx, v3);
  const T val = fma(v_high - v_low, ly, v_low);
  return val;
}

// CPU forward kernel of rotated feature alignment.
//
// For each element (n, c, h, w) of the feature map, the output is the input
// value plus the bilinear samples of channel plane (n, c) taken at `points`
// positions derived from the 5-value box stored for location (n, h, w) in
// `best_bboxes`. Box values [0] and [1] are scaled into roi_y / roi_x, [2]
// and [3] into the box width / height, and [4] is the rotation angle.
// Sample 0 is (roi_y, roi_x); samples 1..4 are the four rotated box corners
// and are only computed when points > 1. The sample arrays hold 5 entries, so
// `points` must not exceed 5.
template <typename scalar_t>
void rotated_feature_align_forward_cpu_kernel(
    const int nthreads, const int points, const scalar_t* bottom_data,
    const scalar_t* best_bboxes, const scalar_t spatial_scale,
    const int channels, const int height, const int width, scalar_t* top_data) {
  for (int index = 0; index < nthreads; index++) {
    // Unravel the flat index into (n, c, h, w).
    int rest = index;
    const int w = rest % width;
    rest /= width;
    const int h = rest % height;
    rest /= height;
    const int c = rest % channels;
    const int n = rest / channels;

    const scalar_t* box = best_bboxes + ((n * height + h) * width + w) * 5;
    const scalar_t roi_y = box[0] * spatial_scale;
    const scalar_t roi_x = box[1] * spatial_scale;

    scalar_t px[5] = {roi_x, 0, 0, 0, 0};
    scalar_t py[5] = {roi_y, 0, 0, 0, 0};

    if (points > 1) {
      const scalar_t roi_w = box[2] * spatial_scale;
      const scalar_t roi_h = box[3] * spatial_scale;
      const scalar_t roi_a = box[4];

      const scalar_t w_2 = roi_w / 2;
      const scalar_t h_2 = roi_h / 2;
      const scalar_t cosa = cosf(roi_a);
      const scalar_t sina = sinf(roi_a);
      // Half-extent vectors of the box after rotation.
      const scalar_t wx = cosa * w_2;
      const scalar_t wy = sina * w_2;
      const scalar_t hx = -sina * h_2;
      const scalar_t hy = cosa * h_2;

      px[1] = roi_x + wx + hx;
      py[1] = roi_y + wy + hy;
      px[2] = roi_x - wx + hx;
      py[2] = roi_y - wy + hy;
      px[3] = roi_x - wx - hx;
      py[3] = roi_y - wy - hy;
      px[4] = roi_x + wx - hx;
      py[4] = roi_y + wy - hy;
    }

    const scalar_t* plane = bottom_data + (n * channels + c) * height * width;

    // Start from the original feature and add every sampled value.
    scalar_t acc = bottom_data[index];
    for (int i = 0; i < points; i++) {
      acc += bilinear_interpolate<scalar_t>(plane, height, width, py[i], px[i],
                                            i);
    }
    top_data[index] = acc;
  }
}

// Computes the bilinear weights w1..w4 and the corner indices for the sample
// position (y, x) on a height x width grid.
//
// A position outside [-1, height] x [-1, width] produces zero weights and
// corner indices of -1. Coordinates <= 0 are clamped to 0, and a position on
// or past the last row/column collapses both corners of that axis onto it.
// w1..w4 are computed as hy*hx, hy*lx, ly*hx and ly*lx, where l* is the
// fractional distance from the low corner and h* its complement.
// `index` is not used.
template <typename T>
void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
                                   T& w1, T& w2, T& w3, T& w4, int& x_low,
                                   int& x_high, int& y_low, int& y_high,
                                   const int index) {
  const bool out_of_range = y < -1.0 || y > height || x < -1.0 || x > width;
  if (out_of_range) {
    // empty
    w1 = w2 = w3 = w4 = 0.;
    x_low = x_high = y_low = y_high = -1;
    return;
  }

  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  y_low = (int)y;
  x_low = (int)x;
  y_high = y_low + 1;
  x_high = x_low + 1;

  // Collapse both corners onto the border when the sample sits on it.
  if (y_low >= height - 1) {
    y_low = y_high = height - 1;
    y = (T)y_low;
  }
  if (x_low >= width - 1) {
    x_low = x_high = width - 1;
    x = (T)x_low;
  }

  const T ly = y - y_low;
  const T lx = x - x_low;
  const T hy = 1. - ly;
  const T hx = 1. - lx;

  w1 = hy * hx;
  w2 = hy * lx;
  w3 = ly * hx;
  w4 = ly * lx;
}

// Adds `val` to the element `address` points to (plain, non-atomic update).
template <typename scalar_t>
inline void valueAdd(scalar_t* address, scalar_t val) {
  address[0] = (address[0] + val);
}

// CPU backward kernel of rotated feature alignment.
//
// For each element (n, c, h, w), the incoming gradient top_diff[index] is
// added to bottom_diff[index] and additionally distributed, through bilinear
// weights, onto channel plane (n, c) of `bottom_diff` at each of the `points`
// sample positions. The positions are derived from the 5-value box stored for
// location (n, h, w) in `best_bboxes`, exactly as in the forward kernel:
// sample 0 is (roi_y, roi_x) and samples 1..4 are the rotated box corners
// (only computed when points > 1). The sample arrays hold 5 entries, so
// `points` must not exceed 5.
template <typename scalar_t>
void rotated_feature_align_backward_cpu_kernel(
    const int nthreads, const int points, const scalar_t* top_diff,
    const scalar_t* best_bboxes, const scalar_t spatial_scale,
    const int channels, const int height, const int width,
    scalar_t* bottom_diff) {
  for (int index = 0; index < nthreads; index++) {
    // Unravel the flat index into (n, c, h, w).
    int rest = index;
    const int w = rest % width;
    rest /= width;
    const int h = rest % height;
    rest /= height;
    const int c = rest % channels;
    const int n = rest / channels;

    const scalar_t* box = best_bboxes + ((n * height + h) * width + w) * 5;
    const scalar_t roi_y = box[0] * spatial_scale;
    const scalar_t roi_x = box[1] * spatial_scale;

    scalar_t px[5] = {roi_x, 0, 0, 0, 0};
    scalar_t py[5] = {roi_y, 0, 0, 0, 0};

    if (points > 1) {
      const scalar_t roi_w = box[2] * spatial_scale;
      const scalar_t roi_h = box[3] * spatial_scale;
      const scalar_t roi_a = box[4];

      const scalar_t w_2 = roi_w / 2;
      const scalar_t h_2 = roi_h / 2;
      const scalar_t cosa = cosf(roi_a);
      const scalar_t sina = sinf(roi_a);
      // Half-extent vectors of the box after rotation.
      const scalar_t wx = cosa * w_2;
      const scalar_t wy = sina * w_2;
      const scalar_t hx = -sina * h_2;
      const scalar_t hy = cosa * h_2;

      px[1] = roi_x + wx + hx;
      py[1] = roi_y + wy + hy;
      px[2] = roi_x - wx + hx;
      py[2] = roi_y - wy + hy;
      px[3] = roi_x - wx - hx;
      py[3] = roi_y - wy - hy;
      px[4] = roi_x + wx - hx;
      py[4] = roi_y + wy - hy;
    }

    scalar_t* grad_plane = bottom_diff + (n * channels + c) * height * width;
    scalar_t grad = top_diff[index];

    // Identity path: the forward output contains the input element itself.
    valueAdd(bottom_diff + index, grad);

    for (int i = 0; i < points; i++) {
      scalar_t w1, w2, w3, w4;
      int x_low, x_high, y_low, y_high;

      bilinear_interpolate_gradient<scalar_t>(height, width, py[i], px[i], w1,
                                              w2, w3, w4, x_low, x_high, y_low,
                                              y_high, i);

      // Samples outside the feature map carry no gradient.
      if (x_low < 0 || x_high < 0 || y_low < 0 || y_high < 0) {
        continue;
      }

      scalar_t g1 = grad * w1;
      scalar_t g2 = grad * w2;
      scalar_t g3 = grad * w3;
      scalar_t g4 = grad * w4;

      scalar_t* row_low = grad_plane + y_low * width;
      scalar_t* row_high = grad_plane + y_high * width;
      valueAdd(row_low + x_low, g1);
      valueAdd(row_low + x_high, g2);
      valueAdd(row_high + x_low, g3);
      valueAdd(row_high + x_high, g4);
    }
  }
}

void rotated_feature_align_forward_cpu(const Tensor features,
                                       const Tensor best_bboxes,
                                       const float spatial_scale,
                                       const int points, Tensor output) {
  const int output_size = features.numel();
  AT_DISPATCH_FLOATING_TYPES(
      features.scalar_type(), "rotated_feature_align_forward_cpu_kernel", [&] {
        const scalar_t* bottom_data = features.data_ptr<scalar_t>();
        const scalar_t* bboxes_data = best_bboxes.data_ptr<scalar_t>();
        scalar_t* top_data = output.data_ptr<scalar_t>();

        rotated_feature_align_forward_cpu_kernel<scalar_t>(
            output_size, points, bottom_data, bboxes_data,
            scalar_t(spatial_scale), features.size(1), features.size(2),
            features.size(3), top_data);
      });
}

void rotated_feature_align_backward_cpu(const Tensor top_grad,
                                        const Tensor best_bboxes,
                                        const float spatial_scale,
                                        const int points, Tensor bottom_grad) {
  const int output_size = top_grad.numel();
  AT_DISPATCH_FLOATING_TYPES(
      top_grad.scalar_type(), "rotated_feature_align_backward_cpu_kernel", [&] {
        const scalar_t* top_diff = top_grad.data_ptr<scalar_t>();
        const scalar_t* bboxes_data = best_bboxes.data_ptr<scalar_t>();
        scalar_t* bottom_diff = bottom_grad.data_ptr<scalar_t>();

        rotated_feature_align_backward_cpu_kernel<scalar_t>(
            output_size, points, top_diff, bboxes_data, scalar_t(spatial_scale),
            top_grad.size(1), top_grad.size(2), top_grad.size(3), bottom_diff);
      });
}

// Declaration of the device-dispatched forward entry point. It is only
// declared in this file so the CPU implementation can be registered for it.
void rotated_feature_align_forward_impl(const Tensor features,
                                        const Tensor best_bboxes,
                                        const float spatial_scale,
                                        const int points, Tensor output);

// Declaration of the device-dispatched backward entry point.
void rotated_feature_align_backward_impl(const Tensor top_grad,
                                         const Tensor best_bboxes,
                                         const float spatial_scale,
                                         const int points, Tensor bottom_grad);

// Register the CPU forward function as the CPU implementation of the forward
// entry point.
REGISTER_DEVICE_IMPL(rotated_feature_align_forward_impl, CPU,
                     rotated_feature_align_forward_cpu);

// Register the CPU backward function as the CPU implementation of the
// backward entry point.
REGISTER_DEVICE_IMPL(rotated_feature_align_backward_impl, CPU,
                     rotated_feature_align_backward_cpu);


================================================
FILE: mmcv/ops/csrc/pytorch/cpu/sparse_indice.cpp
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <torch/script.h>
#include <utils/spconv/spconv/geometry.h>
#include <utils/spconv/spconv/indice.h>

#include "pytorch_cpp_helper.hpp"

namespace functor {

// CPU specialization that builds the indice pairs of a sparse convolution.
// Delegates to getIndicePairsDeConv when `transpose` is set and to
// getIndicePairsConv otherwise, returning the helper's result. The device tag
// `d` and `resetGrid` are not used by this specialization.
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    if (!transpose) {
      // Regular convolution.
      return getIndicePairsConv<Index, IndexGrid, NDim>(
          indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
          kernelSize.data(), stride.data(), padding.data(), dilation.data(),
          outSpatialShape.data());
    }
    // Transposed convolution.
    return getIndicePairsDeConv<Index, IndexGrid, NDim>(
        indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
        kernelSize.data(), stride.data(), padding.data(), dilation.data(),
        outSpatialShape.data());
  }
};

// CPU specialization that builds the indice pairs of a submanifold sparse
// convolution by delegating to getIndicePairsSubM. The device tag `d`,
// `transpose` and `resetGrid` are not used by this specialization.
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    return getIndicePairsSubM<Index, IndexGrid, NDim>(
        indicesIn, gridsOut, indicePairs, indiceNum, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(),
        outSpatialShape.data());
  }
};

}  // namespace functor

// Explicitly instantiates both CPU indice-pair functors for one index type
// and one dimensionality; the grid index type is fixed to int.
#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM)                           \
  template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int, \
                                                       NDIM>;               \
  template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int, \
                                                       NDIM>;

// Instantiates the functors for dimensionalities 1 through 4.
#define DECLARE_CPU_INDEX(Index)          \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1); \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 2); \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 3); \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 4);

// Index types instantiated for the CPU backend.
DECLARE_CPU_INDEX(int);
DECLARE_CPU_INDEX(long);

// The helper macros are only needed for the instantiation list above.
#undef DECLARE_CPU_INDEX
#undef DECLARE_CPU_SPECS_INDEX_NDIM


================================================
FILE: mmcv/ops/csrc/pytorch/cpu/sparse_maxpool.cpp
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <torch/script.h>
#include <utils/spconv/spconv/maxpool.h>

#include "pytorch_cpp_helper.hpp"

namespace functor {

// CPU specialization of the sparse max-pool forward pass.
// `indices` holds two rows: subview(0) lists input rows and subview(1) the
// output rows they pool into. For each of the first `size` pairs, every
// feature of the output row is raised to the corresponding input feature
// when the input is larger. The device tag `d` is not used.
template <typename scalar_t, typename Index>
struct SparseMaxPoolForwardFunctor<tv::CPU, scalar_t, Index> {
  void operator()(const tv::CPU &d, tv::TensorView<scalar_t> outFeatures,
                  tv::TensorView<const scalar_t> inFeatures,
                  tv::TensorView<const Index> indices, int size) {
    // Number of features per row.
    const int numPlanes = outFeatures.dim(1);
    auto outData = outFeatures.data();
    auto inData = inFeatures.data();
    auto inRows = indices.subview(0).data();
    auto outRows = indices.subview(1).data();
    for (int row = 0; row < size; row++) {
      // Flat offsets of the paired rows.
      const Index inBase = inRows[row] * numPlanes;
      const Index outBase = outRows[row] * numPlanes;
      for (int plane = 0; plane < numPlanes; ++plane) {
        if (outData[outBase + plane] < inData[inBase + plane]) {
          outData[outBase + plane] = inData[inBase + plane];
        }
      }
    }
  }
};

// CPU specialization of the sparse max-pool backward pass.
// For each of the first `size` (input row, output row) pairs in `indices`,
// the output gradient `fout` is added to the input gradient `fin` for every
// feature whose input value equals the pooled output value. The device tag
// `d` is not used.
template <typename scalar_t, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::CPU, scalar_t, Index> {
  void operator()(const tv::CPU &d, tv::TensorView<const scalar_t> outFeatures,
                  tv::TensorView<const scalar_t> inFeatures,
                  tv::TensorView<const scalar_t> fout,
                  tv::TensorView<scalar_t> fin,
                  tv::TensorView<const Index> indices, int size) {
    // Number of features per row.
    const int numPlanes = outFeatures.dim(1);
    auto outData = outFeatures.data();
    auto inData = inFeatures.data();
    auto gradOut = fout.data();
    auto gradIn = fin.data();
    auto inRows = indices.subview(0).data();
    auto outRows = indices.subview(1).data();
    for (int row = 0; row < size; row++) {
      // Flat offsets of the paired rows.
      const Index inBase = inRows[row] * numPlanes;
      const Index outBase = outRows[row] * numPlanes;
      for (int plane = 0; plane < numPlanes; ++plane) {
        // Only inputs that produced the maximum receive gradient.
        if (outData[outBase + plane] == inData[inBase + plane]) {
          gradIn[inBase + plane] += gradOut[outBase + plane];
        }
      }
    }
  }
};

}  // namespace functor

// Explicitly instantiates the CPU max-pool forward and backward functors for
// one (scalar type, index type) pair.
#define DECLARE_CPU_SPECS_T_INDEX(T, Index)                                \
  template struct functor::SparseMaxPoolForwardFunctor<tv::CPU, T, Index>; \
  template struct functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, Index>;

// Instantiates the functors for both supported index types (int and long).
#define DECLARE_CPU_SPECS(T)         \
  DECLARE_CPU_SPECS_T_INDEX(T, int); \
  DECLARE_CPU_SPECS_T_INDEX(T, long);

// Scalar types instantiated for the CPU backend.
DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);

// The helper macros are only needed for the instantiation list above.
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX


================================================
FILE: mmcv/ops/csrc/pytorch/cpu/sparse_reordering.cpp
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <torch/script.h>
#include <utils/spconv/spconv/reordering.h>

#include "pytorch_cpp_helper.hpp"

namespace functor {

// CPU specialization of the sparse gather: copies, for i in [0, size), the
// feature row indices[i] of `features` into row i of `buffer`. Rows are
// features.dim(1) elements wide. The device tag `d` is not used.
template <typename scalar_t, typename Index>
struct SparseGatherFunctor<tv::CPU, scalar_t, Index> {
  void operator()(const tv::CPU& d, tv::TensorView<scalar_t> buffer,
                  tv::TensorView<const scalar_t> features,
                  tv::TensorView<const Index> indices, int size) {
    const int numPlanes = features.dim(1);
    // Byte size of one feature row.
    const auto rowBytes = sizeof(scalar_t) * numPlanes;
    for (int i = 0; i < size; ++i) {
      auto dst = buffer.data() + i * numPlanes;
      auto src = features.data() + indices[i] * numPlanes;
      std::memcpy(dst, src, rowBytes);
    }
  }
};

// CPU specialization of the sparse scatter-add: adds, for i in [0, size),
// row i of `buffer` element-wise onto row indices[i] of `outFeatures`. Rows
// are outFeatures.dim(1) elements wide. The device tag `d` and `stable` are
// not used.
template <typename scalar_t, typename Index>
struct SparseScatterAddFunctor<tv::CPU, scalar_t, Index> {
  void operator()(const tv::CPU& d, tv::TensorView<scalar_t> outFeatures,
                  tv::TensorView<const scalar_t> buffer,
                  tv::TensorView<const Index> indices, int size, bool stable) {
    const int numPlanes = outFeatures.dim(1);
    for (int i = 0; i < size; ++i) {
      const scalar_t* src = buffer.data() + i * numPlanes;
      scalar_t* dst = outFeatures.data() + indices[i] * numPlanes;
      for (int j = 0; j < numPlanes; ++j) {
        dst[j] += src[j];
      }
    }
  }
};

}  // namespace functor

// Explicitly instantiates the CPU gather and scatter-add functors for one
// (scalar type, index type) pair.
#define DECLARE_CPU_SPECS_T_INDEX(scalar_t, Index)                        \
  template struct functor::SparseGatherFunctor<tv::CPU, scalar_t, Index>; \
  template struct functor::SparseScatterAddFunctor<tv::CPU, scalar_t, Index>;

// Instantiates the functors for both supported index types (int and long).
#define DECLARE_CPU_SPECS(scalar_t)         \
  DECLARE_CPU_SPECS_T_INDEX(scalar_t, int); \
  DECLARE_CPU_SPECS_T_INDEX(scalar_t, long);

// Scalar types instantiated for the CPU backend.
DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);

// The helper macros are only needed for the instantiation list above.
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX


================================================
FILE: mmcv/ops/csrc/pytorch/cpu/voxelization.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Assigns every point the integer coordinate of the voxel containing it.
//
// For point i and axis j the coordinate is
//   floor((points[i][j] - coors_range[j]) / voxel_size[j]).
// Coordinates are stored in reversed axis order: axis j of the point goes to
// column NDim - 1 - j of coors[i]. If any axis falls outside
// [0, grid_size[j]), the whole row coors[i] is filled with -1.
//
//   points        (num_points x >= NDim) point data; only the first NDim
//                 columns are read
//   coors         (num_points x NDim) output voxel coordinates
//   voxel_size    voxel edge length per axis
//   coors_range   per-axis lower bound in its first NDim entries
//   grid_size     number of voxels per axis
//   num_features  not used by this kernel
template <typename T, typename T_int>
void dynamic_voxelize_forward_cpu_kernel(
    const torch::TensorAccessor<T, 2> points,
    torch::TensorAccessor<T_int, 2> coors, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const std::vector<int> grid_size,
    const int num_points, const int num_features, const int NDim) {
  const int ndim_minus_1 = NDim - 1;
  // Scratch coordinate of the current point. A vector owns the storage, so it
  // is released automatically on every exit path.
  std::vector<int> coor(NDim, 0);

  for (int i = 0; i < num_points; ++i) {
    bool failed = false;
    for (int j = 0; j < NDim; ++j) {
      const int c = floor((points[i][j] - coors_range[j]) / voxel_size[j]);
      // necessary to rm points out of range
      if (c < 0 || c >= grid_size[j]) {
        failed = true;
        break;
      }
      coor[ndim_minus_1 - j] = c;
    }

    // memcpy and memset will cause problem because of the memory distribution
    // discontinuity of TensorAccessor, so here using loops to replace memcpy
    // or memset
    if (failed) {
      for (int k = 0; k < NDim; ++k) {
        coors[i][k] = -1;
      }
    } else {
      for (int k = 0; k < NDim; ++k) {
        coors[i][k] = coor[k];
      }
    }
  }
}

// Groups points into voxels under the max_points / max_voxels limits.
//
// Voxel coordinates are first computed for every point with
// dynamic_voxelize_forward_cpu_kernel. Points whose first coordinate is -1
// are skipped. `coor_to_voxelidx` maps a 3-component voxel coordinate to the
// voxel's row index, with -1 meaning "no voxel yet"; note that the lookup
// always uses exactly three coordinates, independent of NDim.
//
//   voxels                output point features, indexed
//                         [voxel][slot][feature]
//   coors                 output coordinate of each created voxel
//   num_points_per_voxel  number of points stored in each voxel
//   voxel_num             in/out count of voxels created so far
//   max_points            per-voxel point limit, -1 disables it
//   max_voxels            limit on created voxels, -1 disables it; points
//                         that would open a voxel beyond it are dropped
template <typename T, typename T_int>
void hard_voxelize_forward_cpu_kernel(
    const torch::TensorAccessor<T, 2> points,
    torch::TensorAccessor<T, 3> voxels, torch::TensorAccessor<T_int, 2> coors,
    torch::TensorAccessor<T_int, 1> num_points_per_voxel,
    torch::TensorAccessor<T_int, 3> coor_to_voxelidx, int& voxel_num,
    const std::vector<float> voxel_size, const std::vector<float> coors_range,
    const std::vector<int> grid_size, const int max_points,
    const int max_voxels, const int num_points, const int num_features,
    const int NDim) {
  // declare a temp coors
  at::Tensor temp_coors = at::zeros(
      {num_points, NDim}, at::TensorOptions().dtype(at::kInt).device(at::kCPU));

  // First use dynamic voxelization to get coors,
  // then check max points/voxels constraints
  dynamic_voxelize_forward_cpu_kernel<T, int>(
      points, temp_coors.accessor<int, 2>(), voxel_size, coors_range, grid_size,
      num_points, num_features, NDim);

  auto coor = temp_coors.accessor<int, 2>();

  for (int i = 0; i < num_points; ++i) {
    // Point was marked as out of range.
    if (coor[i][0] == -1) continue;

    int voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];

    // record voxel
    if (voxelidx == -1) {
      // Voxel budget exhausted: drop the point.
      if (max_voxels != -1 && voxel_num >= max_voxels) continue;
      voxelidx = voxel_num;
      voxel_num += 1;

      coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
      // memcpy will cause problem because of the memory distribution
      // discontinuity of TensorAccessor, so here using loops to replace memcpy
      for (int k = 0; k < NDim; ++k) {
        coors[voxelidx][k] = coor[i][k];
      }
    }

    // put points into voxel
    const int num = num_points_per_voxel[voxelidx];
    const bool has_room = (max_points == -1 || num < max_points);
    if (has_room) {
      // memcpy will cause problem because of the memory distribution
      // discontinuity of TensorAccessor, so here using loops to replace memcpy
      for (int k = 0; k < num_features; ++k) {
        voxels[voxelidx][num][k] = points[i][k];
      }
      num_points_per_voxel[voxelidx] += 1;
    }
  }
}

// Dynamic voxelization on CPU.
//
// Derives the voxel grid size from `coors_range` and `voxel_size`, then
// dispatches dynamic_voxelize_forward_cpu_kernel on the floating-point dtype
// of `points`; the kernel writes its result through the `coors` accessor.
//
// Args:
//   points: CPU tensor, accessed as 2-D; size(0) is used as the number of
//     points and size(1) as the number of features per point.
//   coors: output tensor, accessed as a 2-D int tensor.
//   voxel_size: voxel extent per dimension; needs at least NDim entries.
//   coors_range: needs at least 2 * NDim entries; the extent of dimension i
//     is taken as coors_range[NDim + i] - coors_range[i].
//   NDim: number of spatial dimensions.
void dynamic_voxelize_forward_cpu(const at::Tensor& points, at::Tensor& coors,
                                  const std::vector<float> voxel_size,
                                  const std::vector<float> coors_range,
                                  const int NDim = 3) {
  // check device
  AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
  // The grid-size loop below indexes voxel_size[i] and coors_range[NDim + i];
  // fail loudly on short vectors instead of reading past their end.
  AT_ASSERTM(static_cast<int>(voxel_size.size()) >= NDim,
             "voxel_size must have at least NDim elements");
  AT_ASSERTM(static_cast<int>(coors_range.size()) >= 2 * NDim,
             "coors_range must have at least 2 * NDim elements");

  std::vector<int> grid_size(NDim);
  const int num_points = points.size(0);
  const int num_features = points.size(1);

  // Number of voxels along each dimension, rounded to the nearest integer.
  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
  }

  // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      points.scalar_type(), "dynamic_voxelize_forward_cpu_kernel", [&] {
        dynamic_voxelize_forward_cpu_kernel<scalar_t, int>(
            points.accessor<scalar_t, 2>(), coors.accessor<int, 2>(),
            voxel_size, coors_range, grid_size, num_points, num_features, NDim);
      });
}

// Hard voxelization on CPU.
//
// Builds a dense lookup tensor that maps a voxel coordinate to the index of
// the voxel recorded for it (-1 while unassigned), then dispatches
// hard_voxelize_forward_cpu_kernel on the floating-point dtype of `points`.
//
// Args:
//   points: CPU tensor, accessed as 2-D; size(0) is used as the number of
//     points and size(1) as the number of features per point.
//   voxels: output, accessed as a 3-D tensor with the dtype of `points`.
//   coors: output, accessed as a 2-D int tensor.
//   num_points_per_voxel: output, accessed as a 1-D int tensor.
//   voxel_size: voxel extent per dimension; needs at least NDim entries.
//   coors_range: needs at least 2 * NDim entries; the extent of dimension i
//     is taken as coors_range[NDim + i] - coors_range[i].
//   max_points, max_voxels: limits forwarded to the kernel.
//   NDim: number of spatial dimensions; must be at least 3 because the lookup
//     tensor is built from grid_size[0..2].
//
// Returns:
//   The voxel counter accumulated by the kernel.
int hard_voxelize_forward_cpu(const at::Tensor& points, at::Tensor& voxels,
                              at::Tensor& coors,
                              at::Tensor& num_points_per_voxel,
                              const std::vector<float> voxel_size,
                              const std::vector<float> coors_range,
                              const int max_points, const int max_voxels,
                              const int NDim = 3) {
  // current version takes about 0.02s-0.03s for one frame on cpu
  // check device
  AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
  // grid_size[2] is read unconditionally below, and the grid-size loop
  // indexes voxel_size[i] and coors_range[NDim + i]; reject inputs that would
  // make those reads go out of bounds.
  AT_ASSERTM(NDim >= 3, "NDim must be at least 3");
  AT_ASSERTM(static_cast<int>(voxel_size.size()) >= NDim,
             "voxel_size must have at least NDim elements");
  AT_ASSERTM(static_cast<int>(coors_range.size()) >= 2 * NDim,
             "coors_range must have at least 2 * NDim elements");

  std::vector<int> grid_size(NDim);
  const int num_points = points.size(0);
  const int num_features = points.size(1);

  // Number of voxels along each dimension, rounded to the nearest integer.
  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
  }

  // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor.
  // The lookup tensor is laid out as [grid_size[2], grid_size[1],
  // grid_size[0]] and starts filled with -1.
  at::Tensor coor_to_voxelidx =
      -at::ones({grid_size[2], grid_size[1], grid_size[0]}, coors.options());

  int voxel_num = 0;
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      points.scalar_type(), "hard_voxelize_forward_cpu_kernel", [&] {
        hard_voxelize_forward_cpu_kernel<scalar_t, int>(
            points.accessor<scalar_t, 2>(), voxels.accessor<scalar_t, 3>(),
            coors.accessor<int, 2>(), num_points_per_voxel.accessor<int, 1>(),
            coor_to_voxelidx.accessor<int, 3>(), voxel_num, voxel_size,
            coors_range, grid_size, max_points, max_voxels, num_points,
            num_features, NDim);
      });

  return voxel_num;
}

// Prototype of the hard-voxelization entry point that the CPU function is
// registered against below.
// NOTE(review): the definition is not part of this section of the file —
// presumably it lives in the device-agnostic voxelization source; confirm.
int hard_voxelize_forward_impl(const at::Tensor& points, at::Tensor& voxels,
                               at::Tensor& coors,
                               at::Tensor& num_points_per_voxel,
                               const std::vector<float> voxel_size,
                               const std::vector<float> coors_range,
                               const int max_points, const int max_voxels,
                               const int NDim);

// Prototype of the dynamic-voxelization entry point; same remark as above.
void dynamic_voxelize_forward_impl(const at::Tensor& points, at::Tensor& coors,
                                   const std::vector<float> voxel_size,
                                   const std::vector<float> coors_range,
                                   const int NDim);
// Associate each entry point with its CPU function under the CPU device key.
REGISTER_DEVICE_IMPL(hard_voxelize_forward_impl, CPU,
                     hard_voxelize_forward_cpu);
REGISTER_DEVICE_IMPL(dynamic_voxelize_forward_impl, CPU,
                     dynamic_voxelize_forward_cpu);


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/active_rotated_filter_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu
#include "active_rotated_filter_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

void ActiveRotatedFilterForwardCUDAKernelLauncher(const Tensor input,
                                                  const Tensor indices,
                                                  Tensor output) {
  int num_output_planes = input.size(0);
  int num_input_planes = input.size(1);
  int num_orientations = input.size(2);
  int kH = input.size(3);
  int kW = input.size(4);
  int num_rotations = indices.size(3);
  int nEntry = num_orientations * kH * kW;
  int output_size = input.numel();

  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "active_rotated_filter_forward_cuda_kernel", [&] {
        active_rotated_filter_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, input.data_ptr<scalar_t>(),
                indices.data_ptr<int>(), num_input_planes, num_output_planes,
                num_orientations, num_rotations, nEntry,
                output.data_ptr<scalar_t>());
      });
  AT_CUDA_CHECK(cudaGetLastError());
}

void ActiveRotatedFilterBackwardCUDAKernelLauncher(const Tensor grad_out,
                                                   const Tensor indices,
                                                   Tensor grad_in) {
  int num_orientations = indices.size(0);
  int kH = indices.size(1);
  int kW = indices.size(2);
  int num_rotations = indices.size(3);
  int num_output_planes = grad_out.size(0) / num_rotations;
  int num_input_planes = grad_out.size(1) / num_orientations;
  int nEntry = num_orientations * kH * kW;
  int output_size = grad_in.numel();

  at::cuda::CUDAGuard device_guard(indices.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_out.scalar_type(), "active_rotated_filter_backward_cuda_kernel",
      [&] {
        active_rotated_filter_backward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, grad_out.data_ptr<scalar_t>(),
                indices.data_ptr<int>(), num_input_planes, num_output_planes,
                num_orientations, num_rotations, nEntry,
                grad_in.data_ptr<scalar_t>());
      });
  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/assign_score_withk_cuda.cu
================================================
// Modified from
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu
#include <stdio.h>
#include <stdlib.h>

#include "assign_score_withk_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

void AssignScoreWithKForwardCUDAKernelLauncher(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor& points, const Tensor& centers, const Tensor& scores,
    const Tensor& knn_idx, Tensor& output) {
  at::cuda::CUDAGuard device_guard(points.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  dim3 blocks(GET_BLOCKS(B * O * N1 * K, THREADS_PER_BLOCK));
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      points.scalar_type(), "assign_score_withk_forward_cuda_kernel", [&] {
        assign_score_withk_forward_cuda_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                B, N0, N1, M, K, O, aggregate, points.data_ptr<scalar_t>(),
                centers.data_ptr<scalar_t>(), scores.data_ptr<scalar_t>(),
                knn_idx.data_ptr<int64_t>(), output.data_ptr<scalar_t>());
      });

  AT_CUDA_CHECK(cudaGetLastError());
}

void AssignScoreWithKBackwardCUDAKernelLauncher(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor& grad_out, const Tensor& points, const Tensor& centers,
    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
    Tensor& grad_centers, Tensor& grad_scores) {
  at::cuda::CUDAGuard device_guard(grad_out.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  dim3 blocks1(GET_BLOCKS(B * M * O, THREADS_PER_BLOCK));
  dim3 threads1(THREADS_PER_BLOCK);
  dim3 blocks2(GET_BLOCKS(B * N1 * K * M, THREADS_PER_BLOCK));
  dim3 threads2(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_out.scalar_type(), "assign_score_withk_points_backward_cuda_kernel",
      [&] {
        assign_score_withk_points_backward_cuda_kernel<scalar_t>
            <<<blocks1, threads1, 0, stream>>>(
                B, N0, N1, M, K, O, aggregate, grad_out.data_ptr<scalar_t>(),
                scores.data_ptr<scalar_t>(), knn_idx.data_ptr<int64_t>(),
                grad_points.data_ptr<scalar_t>(),
                grad_centers.data_ptr<scalar_t>());
      });

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_out.scalar_type(), "assign_score_withk_scores_backward_cuda_kernel",
      [&] {
        assign_score_withk_scores_backward_cuda_kernel<scalar_t>
            <<<blocks2, threads2, 0, stream>>>(
                B, N0, N1, M, K, O, aggregate, grad_out.data_ptr<scalar_t>(),
                points.data_ptr<scalar_t>(), centers.data_ptr<scalar_t>(),
                knn_idx.data_ptr<int64_t>(), grad_scores.data_ptr<scalar_t>());
      });

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/ball_query_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "ball_query_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

void BallQueryForwardCUDAKernelLauncher(int b, int n, int m, float min_radius,
                                        float max_radius, int nsample,
                                        const Tensor new_xyz, const Tensor xyz,
                                        Tensor idx) {
  // new_xyz: (B, M, 3)
  // xyz: (B, N, 3)
  // output:
  //      idx: (B, M, nsample)

  at::cuda::CUDAGuard device_guard(new_xyz.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(m, THREADS_PER_BLOCK), b);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      new_xyz.scalar_type(), "ball_query_forward_cuda_kernel", [&] {
        ball_query_forward_cuda_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                b, n, m, min_radius, max_radius, nsample,
                new_xyz.data_ptr<scalar_t>(), xyz.data_ptr<scalar_t>(),
                idx.data_ptr<int>());
      });

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/bbox_overlaps_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "bbox_overlaps_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

// Disable fp16 on ROCm device
// The at::Half specialization below is compiled only when MMCV_WITH_HIP is
// not defined and __CUDA_ARCH__ is at least 530; otherwise the generic
// bbox_overlaps_cuda_kernel template is the only definition in effect.
#ifndef MMCV_WITH_HIP
#if __CUDA_ARCH__ >= 530
// Half-precision specialization: reinterprets the at::Half buffers as CUDA
// __half and forwards every argument unchanged to
// bbox_overlaps_cuda_kernel_half.
template <>
__global__ void bbox_overlaps_cuda_kernel<at::Half>(
    const at::Half* bbox1, const at::Half* bbox2, at::Half* ious,
    const int num_bbox1, const int num_bbox2, const int mode,
    const bool aligned, const int offset) {
  bbox_overlaps_cuda_kernel_half(reinterpret_cast<const __half*>(bbox1),
                                 reinterpret_cast<const __half*>(bbox2),
                                 reinterpret_cast<__half*>(ious), num_bbox1,
                                 num_bbox2, mode, aligned, offset);
}

#endif  // __CUDA_ARCH__ >= 530
#endif  // MMCV_WITH_HIP

// Launches bbox_overlaps_cuda_kernel on the current stream of the device that
// holds `bboxes1`. The launch is sized by the number of elements of `ious`;
// `mode`, `aligned` and `offset` are forwarded to the kernel unchanged.
void BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
                                    Tensor ious, const int mode,
                                    const bool aligned, const int offset) {
  const int total_ious = ious.numel();
  const int rows1 = bboxes1.size(0);
  const int rows2 = bboxes2.size(0);

  at::cuda::CUDAGuard device_guard(bboxes1.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      bboxes1.scalar_type(), "bbox_overlaps_cuda_kernel", ([&] {
        auto* boxes1_ptr = bboxes1.data_ptr<scalar_t>();
        auto* boxes2_ptr = bboxes2.data_ptr<scalar_t>();
        auto* ious_ptr = ious.data_ptr<scalar_t>();
        bbox_overlaps_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(total_ious), THREADS_PER_BLOCK, 0, stream>>>(
                boxes1_ptr, boxes2_ptr, ious_ptr, rows1, rows2, mode, aligned,
                offset);
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/bezier_align_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "bezier_align_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

// Launches bezier_align_forward_cuda_kernel on the current stream of the
// device that holds `input`. The launch is sized by the number of elements of
// `output`; channel/height/width are read from dimensions 1..3 of `input`.
void BezierAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois,
                                          Tensor output, int aligned_height,
                                          int aligned_width,
                                          float spatial_scale,
                                          int sampling_ratio, bool aligned) {
  const int total_outputs = output.numel();
  const int feat_channels = input.size(1);
  const int feat_height = input.size(2);
  const int feat_width = input.size(3);

  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "bezier_align_forward_cuda_kernel", [&] {
        auto* feat_ptr = input.data_ptr<scalar_t>();
        auto* rois_ptr = rois.data_ptr<scalar_t>();
        auto* out_ptr = output.data_ptr<scalar_t>();
        // The scale is converted to the dispatched dtype before the launch.
        const scalar_t scale = static_cast<scalar_t>(spatial_scale);
        bezier_align_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(total_outputs), THREADS_PER_BLOCK, 0, stream>>>(
                total_outputs, feat_ptr, rois_ptr, out_ptr, aligned_height,
                aligned_width, scale, sampling_ratio, aligned, feat_channels,
                feat_height, feat_width);
      });

  AT_CUDA_CHECK(cudaGetLastError());
}

// Launches bezier_align_backward_cuda_kernel on the current stream of the
// device that holds `grad_output`. The launch is sized by the number of
// elements of `grad_output`; channel/height/width are read from dimensions
// 1..3 of `grad_input`.
void BezierAlignBackwardCUDAKernelLauncher(
    Tensor grad_output, Tensor rois, Tensor grad_input, int aligned_height,
    int aligned_width, float spatial_scale, int sampling_ratio, bool aligned) {
  const int total_grads = grad_output.numel();
  const int feat_channels = grad_input.size(1);
  const int feat_height = grad_input.size(2);
  const int feat_width = grad_input.size(3);

  at::cuda::CUDAGuard device_guard(grad_output.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "bezier_align_backward_cuda_kernel", [&] {
        auto* upstream = grad_output.data_ptr<scalar_t>();
        auto* rois_ptr = rois.data_ptr<scalar_t>();
        auto* downstream = grad_input.data_ptr<scalar_t>();
        // The scale is converted to the dispatched dtype before the launch.
        const scalar_t scale = static_cast<scalar_t>(spatial_scale);
        bezier_align_backward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(total_grads), THREADS_PER_BLOCK, 0, stream>>>(
                total_grads, upstream, rois_ptr, downstream, aligned_height,
                aligned_width, scale, sampling_ratio, aligned, feat_channels,
                feat_height, feat_width);
      });

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/bias_act_cuda.cu
================================================
// Modified from
// https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/bias_act.cpp

// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

#include <c10/util/Half.h>
#include <cuda_runtime.h>
#include <torch/types.h>

#include "pytorch_cuda_helper.hpp"

// Parameter bundle handed by value to bias_act_kernel. The host fills it in
// bias_act_op; pointers are type-erased and cast to the element type inside
// the kernel. A NULL optional pointer means "input not provided".
struct bias_act_kernel_params {
  const void *x;     // [sizeX]
  const void *b;     // [sizeB] or NULL
  const void *xref;  // [sizeX] or NULL
  const void *yref;  // [sizeX] or NULL
  const void *dy;    // [sizeX] or NULL
  void *y;           // [sizeX]

  // grad: selects which expression the kernel evaluates (0, 1 or 2).
  int grad;
  // act: activation id, 1..9 (see choose_bias_act_kernel).
  int act;
  // alpha: slope used by the lrelu branch for non-positive inputs.
  float alpha;
  // gain: multiplier applied to the result; also used to rescale yref.
  float gain;
  // clamp: output clamp magnitude; a negative value disables clamping.
  float clamp;

  // sizeX: number of elements of x.
  int sizeX;
  // sizeB: number of elements of b.
  int sizeB;
  // stepB: divisor applied to the flat index before wrapping by sizeB.
  int stepB;
  // loopX: maximum number of elements processed by one thread.
  int loopX;
};

// CUDA kernel selection.

// Forward declaration: returns the bias_act_kernel instantiation for element
// type T and the activation id stored in p.act, or NULL when no kernel
// matches. The definition follows the kernel itself.
template <class T>
void *choose_bias_act_kernel(const bias_act_kernel_params &p);
//------------------------------------------------------------------------
// Helpers.

// Maps a storage element type T to the type used for arithmetic inside the
// kernel: double and float compute in their own type, c10::Half computes in
// float. The primary template is intentionally left undefined so that an
// unsupported T fails to compile.
template <class T>
struct InternalType;

template <>
struct InternalType<double> {
  using scalar_t = double;
};

template <>
struct InternalType<float> {
  using scalar_t = float;
};

template <>
struct InternalType<c10::Half> {
  using scalar_t = float;
};

//------------------------------------------------------------------------
// CUDA kernel.

// Fused bias + activation kernel.
//
// Template parameters:
//   T: element type of the buffers referenced by `p`.
//   A: activation id (1 linear, 2 relu, 3 lrelu, 4 tanh, 5 sigmoid, 6 elu,
//      7 selu, 8 softplus, 9 swish).
//
// All arithmetic is carried out in InternalType<T>::scalar_t and converted
// back to T on store. p.grad (G) selects which expression is evaluated:
// G == 0 applies the activation to x + b; G == 1 and G == 2 evaluate
// expressions in terms of yy (yref with the gain divided out) or xref.
// NOTE(review): G == 1 / G == 2 look like first- and second-order gradient
// formulas with x carrying the incoming gradient — confirm against the
// caller.
template <class T, int A>
__global__ void bias_act_kernel(bias_act_kernel_params p) {
  typedef typename InternalType<T>::scalar_t scalar_t;
  int G = p.grad;
  scalar_t alpha = (scalar_t)p.alpha;
  scalar_t gain = (scalar_t)p.gain;
  scalar_t clamp = (scalar_t)p.clamp;
  scalar_t one = (scalar_t)1;
  scalar_t two = (scalar_t)2;
  // Input magnitudes beyond which the exp-based formulas below are replaced
  // by their limiting values.
  scalar_t expRange = (scalar_t)80;
  scalar_t halfExpRange = (scalar_t)40;
  scalar_t seluScale = (scalar_t)1.0507009873554804934193349852946;
  scalar_t seluAlpha = (scalar_t)1.6732632423543772848170429916717;

  // Loop over elements.
  // Each block covers loopX * blockDim.x consecutive elements; a thread
  // starts at its own offset and advances by blockDim.x per iteration, for at
  // most loopX iterations or until the end of the buffer.
  int xi = blockIdx.x * p.loopX * blockDim.x + threadIdx.x;
  for (int loopIdx = 0; loopIdx < p.loopX && xi < p.sizeX;
       loopIdx++, xi += blockDim.x) {
    // Load.
    // Missing optional inputs default to 0, except dy which defaults to 1.
    scalar_t x = (scalar_t)((const T *)p.x)[xi];
    scalar_t b =
        (p.b) ? (scalar_t)((const T *)p.b)[(xi / p.stepB) % p.sizeB] : 0;
    scalar_t xref = (p.xref) ? (scalar_t)((const T *)p.xref)[xi] : 0;
    scalar_t yref = (p.yref) ? (scalar_t)((const T *)p.yref)[xi] : 0;
    scalar_t dy = (p.dy) ? (scalar_t)((const T *)p.dy)[xi] : one;
    // yref with the gain divided out (0 when gain is 0).
    scalar_t yy = (gain != 0) ? yref / gain : 0;
    scalar_t y = 0;

    // Apply bias.
    // The bias goes onto x when G == 0 and onto xref otherwise.
    ((G == 0) ? x : xref) += b;

    // linear
    if (A == 1) {
      if (G == 0) y = x;
      if (G == 1) y = x;
    }

    // relu
    if (A == 2) {
      if (G == 0) y = (x > 0) ? x : 0;
      if (G == 1) y = (yy > 0) ? x : 0;
    }

    // lrelu
    if (A == 3) {
      if (G == 0) y = (x > 0) ? x : x * alpha;
      if (G == 1) y = (yy > 0) ? x : x * alpha;
    }

    // tanh
    if (A == 4) {
      if (G == 0) {
        scalar_t c = exp(x);
        scalar_t d = one / c;
        // Saturate to -1 / +1 outside [-expRange, expRange].
        y = (x < -expRange) ? -one : (x > expRange) ? one : (c - d) / (c + d);
      }
      if (G == 1) y = x * (one - yy * yy);
      if (G == 2) y = x * (one - yy * yy) * (-two * yy);
    }

    // sigmoid
    if (A == 5) {
      if (G == 0) y = (x < -expRange) ? 0 : one / (exp(-x) + one);
      if (G == 1) y = x * yy * (one - yy);
      if (G == 2) y = x * yy * (one - yy) * (one - two * yy);
    }

    // elu
    if (A == 6) {
      if (G == 0) y = (x >= 0) ? x : exp(x) - one;
      if (G == 1) y = (yy >= 0) ? x : x * (yy + one);
      if (G == 2) y = (yy >= 0) ? 0 : x * (yy + one);
    }

    // selu
    if (A == 7) {
      if (G == 0)
        y = (x >= 0) ? seluScale * x : (seluScale * seluAlpha) * (exp(x) - one);
      if (G == 1)
        y = (yy >= 0) ? x * seluScale : x * (yy + seluScale * seluAlpha);
      if (G == 2) y = (yy >= 0) ? 0 : x * (yy + seluScale * seluAlpha);
    }

    // softplus
    if (A == 8) {
      if (G == 0) y = (x > expRange) ? x : log(exp(x) + one);
      if (G == 1) y = x * (one - exp(-yy));
      if (G == 2) {
        scalar_t c = exp(-yy);
        y = x * c * (one - c);
      }
    }

    // swish
    if (A == 9) {
      if (G == 0)
        y = (x < -expRange) ? 0 : x / (exp(-x) + one);
      else {
        // Unlike the other activations, the G != 0 formulas here are written
        // in terms of xref rather than yy.
        scalar_t c = exp(xref);
        scalar_t d = c + one;
        if (G == 1)
          y = (xref > halfExpRange) ? x : x * c * (xref + d) / (d * d);
        else
          y = (xref > halfExpRange)
                  ? 0
                  : x * c * (xref * (two - d) + two * d) / (d * d * d);
        // Recompute yref from xref; it is consumed by the clamp step below.
        yref = (xref < -expRange) ? 0 : xref / (exp(-xref) + one) * gain;
      }
    }

    // Apply gain.
    y *= gain * dy;

    // Clamp.
    // A negative clamp disables this step. For G == 0 the result is limited
    // to [-clamp, clamp]; otherwise the result is zeroed wherever yref lies
    // outside (-clamp, clamp). The single '&' combines the two comparison
    // results (comparisons bind tighter than '&').
    if (clamp >= 0) {
      if (G == 0)
        y = (y > -clamp & y < clamp) ? y : (y >= 0) ? clamp : -clamp;
      else
        y = (yref > -clamp & yref < clamp) ? y : 0;
    }

    // Store.
    ((T *)p.y)[xi] = (T)y;
  }
}

//------------------------------------------------------------------------
// CUDA kernel selection.

// Picks the bias_act_kernel instantiation for element type T that matches the
// activation id in p.act (1..9). Returns NULL for any other id so the caller
// can report the unsupported activation.
template <class T>
void *choose_bias_act_kernel(const bias_act_kernel_params &p) {
  switch (p.act) {
    case 1:
      return (void *)bias_act_kernel<T, 1>;
    case 2:
      return (void *)bias_act_kernel<T, 2>;
    case 3:
      return (void *)bias_act_kernel<T, 3>;
    case 4:
      return (void *)bias_act_kernel<T, 4>;
    case 5:
      return (void *)bias_act_kernel<T, 5>;
    case 6:
      return (void *)bias_act_kernel<T, 6>;
    case 7:
      return (void *)bias_act_kernel<T, 7>;
    case 8:
      return (void *)bias_act_kernel<T, 8>;
    case 9:
      return (void *)bias_act_kernel<T, 9>;
    default:
      return NULL;
  }
}

//------------------------------------------------------------------------

// Returns true when x and y have the same rank and the same size along every
// dimension, and additionally the same stride along every dimension whose
// size is at least 2. Strides of dimensions of size 0 or 1 are ignored.
static bool has_same_layout(torch::Tensor x, torch::Tensor y) {
  const int64_t rank = x.dim();
  if (rank != y.dim()) return false;

  int64_t axis = 0;
  while (axis < rank) {
    const int64_t extent = x.size(axis);
    const bool size_differs = extent != y.size(axis);
    if (size_differs ||
        (extent >= 2 && x.stride(axis) != y.stride(axis))) {
      return false;
    }
    ++axis;
  }
  return true;
}

//------------------------------------------------------------------------
// Host entry point of the fused bias + activation op.
//
// Validates the arguments, allocates an output with the same layout as `x`,
// fills a bias_act_kernel_params bundle, picks the kernel instantiation for
// the dtype of `x` and the activation id `act`, and launches it on the
// current CUDA stream.
//
// Args:
//   x: CUDA tensor; must be non-overlapping and dense, with at most INT_MAX
//     elements.
//   b: rank-1 contiguous bias, or an empty tensor for "no bias"; when
//     non-empty it must match the dtype/device of x and have x.size(dim)
//     elements.
//   xref, yref, dy: optional tensors (empty means "not provided"); when
//     non-empty each must match the shape, dtype, device and layout of x.
//   grad: non-negative selector forwarded to the kernel.
//   dim: dimension of x that the bias is applied along (checked only when b
//     is non-empty).
//   act: activation id; ids without a kernel are rejected.
//   alpha, gain, clamp: scalars forwarded to the kernel.
//
// Returns:
//   A new tensor with the same layout as x holding the kernel output.
//
// Raises (via TORCH_CHECK): on any violated precondition above, or when no
// kernel exists for `act`.
torch::Tensor bias_act_op(const torch::Tensor &x, const torch::Tensor &b,
                          const torch::Tensor &xref, const torch::Tensor &yref,
                          const torch::Tensor &dy, int grad, int dim, int act,
                          float alpha, float gain, float clamp) {
  // Validate arguments.
  TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device");
  TORCH_CHECK(
      b.numel() == 0 || (b.dtype() == x.dtype() && b.device() == x.device()),
      "b must have the same dtype and device as x");
  TORCH_CHECK(xref.numel() == 0 ||
                  (xref.sizes() == x.sizes() && xref.dtype() == x.dtype() &&
                   xref.device() == x.device()),
              "xref must have the same shape, dtype, and device as x");
  TORCH_CHECK(yref.numel() == 0 ||
                  (yref.sizes() == x.sizes() && yref.dtype() == x.dtype() &&
                   yref.device() == x.device()),
              "yref must have the same shape, dtype, and device as x");
  // The message now names the shape requirement that the condition enforces,
  // matching the xref/yref checks above.
  TORCH_CHECK(
      dy.numel() == 0 || (dy.sizes() == x.sizes() && dy.dtype() == x.dtype() &&
                          dy.device() == x.device()),
      "dy must have the same shape, dtype, and device as x");
  // Element indices are plain ints inside the kernel.
  TORCH_CHECK(x.numel() <= INT_MAX, "x is too large");
  TORCH_CHECK(b.dim() == 1, "b must have rank 1");
  TORCH_CHECK(b.numel() == 0 || (dim >= 0 && dim < x.dim()),
              "dim is out of bounds");
  TORCH_CHECK(b.numel() == 0 || b.numel() == x.size(dim),
              "b has wrong number of elements");
  TORCH_CHECK(grad >= 0, "grad must be non-negative");

  // Validate layout.
  // The kernel indexes every buffer with the same flat offset, so all
  // provided tensors must share the layout of x.
  TORCH_CHECK(x.is_non_overlapping_and_dense(),
              "x must be non-overlapping and dense");
  TORCH_CHECK(b.is_contiguous(), "b must be contiguous");
  TORCH_CHECK(xref.numel() == 0 || has_same_layout(xref, x),
              "xref must have the same layout as x");
  TORCH_CHECK(yref.numel() == 0 || has_same_layout(yref, x),
              "yref must have the same layout as x");
  TORCH_CHECK(dy.numel() == 0 || has_same_layout(dy, x),
              "dy must have the same layout as x");

  // Create output tensor.
  const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
  torch::Tensor y = torch::empty_like(x);
  TORCH_CHECK(has_same_layout(y, x), "y must have the same layout as x");

  // Initialize CUDA kernel parameters.
  // Empty optional tensors are passed as NULL so the kernel can detect them.
  bias_act_kernel_params p;
  p.x = x.data_ptr();
  p.b = (b.numel()) ? b.data_ptr() : NULL;
  p.xref = (xref.numel()) ? xref.data_ptr() : NULL;
  p.yref = (yref.numel()) ? yref.data_ptr() : NULL;
  p.dy = (dy.numel()) ? dy.data_ptr() : NULL;
  p.y = y.data_ptr();
  p.grad = grad;
  p.act = act;
  p.alpha = alpha;
  p.gain = gain;
  p.clamp = clamp;
  p.sizeX = (int)x.numel();
  p.sizeB = (int)b.numel();
  p.stepB = (b.numel()) ? (int)x.stride(dim) : 1;

  // Choose CUDA kernel.
  // Start from NULL so the check below never reads an indeterminate pointer,
  // and label the dispatch with this op's own name so that unsupported-dtype
  // errors point at bias_act rather than at an unrelated op.
  void *kernel = NULL;
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "bias_act_cuda", [&] {
    kernel = choose_bias_act_kernel<scalar_t>(p);
  });
  TORCH_CHECK(kernel, "no CUDA kernel found for the specified activation func");

  // Launch CUDA kernel.
  // Each thread handles up to loopX elements; the grid is the ceiling of
  // sizeX / (loopX * blockSize).
  p.loopX = 4;
  int blockSize = 4 * 32;
  int gridSize = (p.sizeX - 1) / (p.loopX * blockSize) + 1;
  void *args[] = {&p};
#ifdef MMCV_WITH_HIP
  AT_CUDA_CHECK(hipLaunchKernel(kernel, gridSize, blockSize, args, 0,
                                at::cuda::getCurrentCUDAStream()));
#else
  AT_CUDA_CHECK(cudaLaunchKernel(kernel, gridSize, blockSize, args, 0,
                                 at::cuda::getCurrentCUDAStream()));
#endif

  return y;
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/border_align_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "border_align_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

void BorderAlignForwardCUDAKernelLauncher(const Tensor &input,
                                          const Tensor &boxes, Tensor output,
                                          Tensor argmax_idx,
                                          const int pool_size) {
  // shape assertion
  AT_ASSERTM(input.ndimension() == 4,
             "non-empty 4D(batch mode) tensor expected for input feature");
  AT_ASSERTM(boxes.ndimension() == 3,
             "boxes must be 3D tensor with size of [B, H*W, 4]");

  int batch_size = input.size(0);
  int feat_channels = input.size(1);
  int channels = feat_channels / 4;
  int height = input.size(2);
  int width = input.size(3);
  // shape [N, box_size, 4] for boxes. (x1, y1, x2, y2) format
  int box_size = boxes.size(1);
  // shape [N, channels, box_size, 4] for output
  int nthreads = batch_size * channels * box_size;

  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  dim3 block(128, 4);
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "border_align_forward_cuda_kernel", [&] {
        border_align_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(nthreads), block, 0, stream>>>(
                nthreads, input.data_ptr<scalar_t>(),
                boxes.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
                argmax_idx.data_ptr<int>(), channels, box_size, height, width,
                pool_size);
      });

  AT_CUDA_CHECK(cudaGetLastError());
}

void BorderAlignBackwardCUDAKernelLauncher(const Tensor &grad_output,
                                           const Tensor &boxes,
                                           const Tensor &argmax_idx,
                                           Tensor grad_input,
                                           const int pool_size) {
  int batch_size = grad_input.size(0);
  int feat_channels = grad_input.size(1);
  int channels = feat_channels / 4;
  int height = grad_input.size(2);
  int width = grad_input.size(3);
  int box_size = boxes.size(1);
  int nthreads = batch_size * channels * box_size;

  at::cuda::CUDAGuard device_guard(grad_output.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  dim3 block(128, 4);
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "border_align_backward_cuda_kernel", [&] {
        border_align_backward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(nthreads), block, 0, stream>>>(
                nthreads, grad_output.data_ptr<scalar_t>(),
                boxes.data_ptr<scalar_t>(), argmax_idx.data_ptr<int>(),
                grad_input.data_ptr<scalar_t>(), channels, box_size, height,
                width, pool_size);
      });

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/box_iou_quadri_cuda.cu
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "box_iou_quadri_cuda.cuh"
#include "pytorch_cuda_helper.hpp"

void box_iou_quadri_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                         const int mode_flag, const bool aligned) {
  using scalar_t = float;
  AT_ASSERTM(boxes1.is_cuda(), "boxes1 must be a CUDA tensor");
  AT_ASSERTM(boxes2.is_cuda(), "boxes2 must be a CUDA tensor");

  int output_size = ious.numel();
  int num_boxes1 = boxes1.size(0);
  int num_boxes2 = boxes2.size(0);

  at::cuda::CUDAGuard device_guard(boxes1.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  box_iou_quadri_cuda_kernel<scalar_t>
      <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
          num_boxes1, num_boxes2, boxes1.data_ptr<scalar_t>(),
          boxes2.data_ptr<scalar_t>(), (scalar_t*)ious.data_ptr<scalar_t>(),
          mode_flag, aligned);
  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/box_iou_rotated_cuda.cu
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
#include "box_iou_rotated_cuda.cuh"
#include "pytorch_cuda_helper.hpp"

void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                          const int mode_flag, const bool aligned) {
  using scalar_t = float;
  AT_ASSERTM(boxes1.is_cuda(), "boxes1 must be a CUDA tensor");
  AT_ASSERTM(boxes2.is_cuda(), "boxes2 must be a CUDA tensor");

  int output_size = ious.numel();
  int num_boxes1 = boxes1.size(0);
  int num_boxes2 = boxes2.size(0);

  at::cuda::CUDAGuard device_guard(boxes1.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  box_iou_rotated_cuda_kernel<scalar_t>
      <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
          num_boxes1, num_boxes2, boxes1.data_ptr<scalar_t>(),
          boxes2.data_ptr<scalar_t>(), (scalar_t*)ious.data_ptr<scalar_t>(),
          mode_flag, aligned);
  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/carafe_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "carafe_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

// Runs the CARAFE forward pass on the current CUDA stream as four kernel
// launches issued in order: two BatchTranspose2DCUDAKernel launches that fill
// the scratch tensors `rfeatures` and `rmasks`, the CARAFEForward kernel that
// fills `routput`, and a final BatchTranspose2DCUDAKernel launch that fills
// `output`.
//
// features, masks: inputs; `features` also selects the device and the scalar
//                  type used for every dispatch below.
// rfeatures, routput, rmasks: scratch tensors, resized here.
// output:          result tensor; batch, channel and spatial sizes are read
//                  from its dimensions 0..3.
// kernel_size, group_size, scale_factor: forwarded to CARAFEForward.
void CARAFEForwardCUDAKernelLauncher(const Tensor features, const Tensor masks,
                                     Tensor rfeatures, Tensor routput,
                                     Tensor rmasks, Tensor output,
                                     const int kernel_size,
                                     const int group_size,
                                     const int scale_factor) {
  const int batch_size = output.size(0);
  const int channels = output.size(1);
  const int output_height = output.size(2);
  const int output_width = output.size(3);

  const int input_height = features.size(2);
  const int input_width = features.size(3);

  const int mask_channels = masks.size(1);

  // Scratch buffers carry the channel count in the last dimension.
  rfeatures.resize_({batch_size, input_height, input_width, channels});
  routput.resize_({batch_size, output_height, output_width, channels});
  rmasks.resize_({batch_size, output_height, output_width, mask_channels});

  // one warp per pixel
  at::cuda::CUDAGuard device_guard(features.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // Step 1: features -> rfeatures (dispatch label: NCHW2NHWC_Feature).
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      features.scalar_type(), "NCHW2NHWC_Feature", ([&] {
        const scalar_t *bottom_data = features.data_ptr<scalar_t>();
        scalar_t *top_data = rfeatures.data_ptr<scalar_t>();
        // Number of kTileDim-sized tiles along each of the two axes handed to
        // the transpose kernel; one block is launched per (batch, tile) pair.
        const int dh = divideUP(channels, kTileDim);
        const int dw = divideUP(input_height * input_width, kTileDim);
        BatchTranspose2DCUDAKernel<scalar_t>
            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
                batch_size, channels, input_height * input_width, dh, dw,
                bottom_data, top_data);
      }));
  // Step 2: masks -> rmasks (dispatch label: NCHW2NHWC_Masks). The scalar type
  // is taken from `features`, not from `masks`.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      features.scalar_type(), "NCHW2NHWC_Masks", ([&] {
        const scalar_t *bottom_data = masks.data_ptr<scalar_t>();
        scalar_t *top_data = rmasks.data_ptr<scalar_t>();
        const int dh = divideUP(mask_channels, kTileDim);
        const int dw = divideUP(output_height * output_width, kTileDim);
        BatchTranspose2DCUDAKernel<scalar_t>
            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
                batch_size, mask_channels, output_height * output_width, dh, dw,
                bottom_data, top_data);
      }));
  // Step 3: CARAFEForward reads rfeatures and rmasks and writes routput.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      features.scalar_type(), "CARAFELaucherForward", ([&] {
        // THREADS_PER_PIXEL threads are budgeted for every output pixel.
        const int num_kernels =
            batch_size * output_height * output_width * THREADS_PER_PIXEL;
        const scalar_t *bottom_data = rfeatures.data_ptr<scalar_t>();
        const scalar_t *bottom_masks = rmasks.data_ptr<scalar_t>();
        scalar_t *top_data = routput.data_ptr<scalar_t>();

        CARAFEForward<scalar_t><<<divideUP(num_kernels, THREADS_PER_BLOCK),
                                  THREADS_PER_BLOCK, 0, stream>>>(
            num_kernels, bottom_data, bottom_masks, kernel_size, group_size,
            scale_factor, channels, input_height, input_width, output_height,
            output_width, mask_channels, top_data);
      }));
  // Step 4: routput -> output (dispatch label: NHWC2NCHW).
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      features.scalar_type(), "NHWC2NCHW", ([&] {
        const scalar_t *bottom_data = routput.data_ptr<scalar_t>();
        scalar_t *top_data = output.data_ptr<scalar_t>();
        const int dh = divideUP(output_height * output_width, kTileDim);
        const int dw = divideUP(channels, kTileDim);
        BatchTranspose2DCUDAKernel<scalar_t>
            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
                batch_size, output_height * output_width, channels, dh, dw,
                bottom_data, top_data);
      }));

  // Single check after all four launches.
  AT_CUDA_CHECK(cudaGetLastError());
}

// Runs the CARAFE backward pass on the current CUDA stream as six kernel
// launches issued in order; later launches read buffers written by earlier
// ones, so the sequence must not be reordered.
//
// top_grad:   incoming gradient; supplies the batch/channel/output sizes, the
//             device and the scalar type for every dispatch.
// rfeatures:  read directly by CARAFEBackward_Mask.
// masks:      read directly by CARAFEBackward_Feature; only its channel count
//             is otherwise used here.
// rtop_grad, rbottom_grad_hs, rbottom_grad, rmask_grad: scratch tensors,
//             resized here.
// bottom_grad, mask_grad: result tensors written by the last transpose of
//             each branch; `bottom_grad` also supplies the input height/width.
// kernel_size, group_size, scale_factor: forwarded to the CARAFE kernels.
void CARAFEBackwardCUDAKernelLauncher(
    const Tensor top_grad, const Tensor rfeatures, const Tensor masks,
    Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad,
    Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad,
    const int kernel_size, const int group_size, const int scale_factor) {
  const int batch_size = top_grad.size(0);
  const int channels = top_grad.size(1);
  const int output_height = top_grad.size(2);
  const int output_width = top_grad.size(3);

  const int input_height = bottom_grad.size(2);
  const int input_width = bottom_grad.size(3);

  const int mask_channels = masks.size(1);

  // Scratch buffers carry the channel count in the last dimension. Note that
  // rbottom_grad_hs uses the output (not the input) spatial size.
  rtop_grad.resize_({batch_size, output_height, output_width, channels});
  rbottom_grad.resize_({batch_size, input_height, input_width, channels});
  rbottom_grad_hs.resize_({batch_size, output_height, output_width, channels});
  rmask_grad.resize_({batch_size, output_height, output_width, mask_channels});

  at::cuda::CUDAGuard device_guard(top_grad.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // Step 1: top_grad -> rtop_grad (dispatch label: NCHW2NHWC_Top_Grad).
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "NCHW2NHWC_Top_Grad", ([&] {
        const scalar_t *bottom_data = top_grad.data_ptr<scalar_t>();
        scalar_t *top_data = rtop_grad.data_ptr<scalar_t>();
        // Tile counts along the two axes handed to the transpose kernel.
        const int dh = divideUP(channels, kTileDim);
        const int dw = divideUP(output_height * output_width, kTileDim);
        BatchTranspose2DCUDAKernel<scalar_t>
            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
                batch_size, channels, output_height * output_width, dh, dw,
                bottom_data, top_data);
      }));

  // Step 2: CARAFEBackward_Feature reads rtop_grad and masks and writes
  // rbottom_grad_hs.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "CARAFELaucherBackward_Feature", ([&] {
        const int num_kernels =
            batch_size * output_height * output_width * THREADS_PER_PIXEL;
        const scalar_t *top_diff = rtop_grad.data_ptr<scalar_t>();
        const scalar_t *bottom_masks = masks.data_ptr<scalar_t>();
        scalar_t *bottom_diff = rbottom_grad_hs.data_ptr<scalar_t>();

        CARAFEBackward_Feature<scalar_t>
            <<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
               stream>>>(num_kernels, top_diff, bottom_masks, kernel_size,
                         group_size, scale_factor, channels, input_height,
                         input_width, output_height, output_width,
                         mask_channels, bottom_diff);
      }));
  // Step 3: FeatureSum reads rbottom_grad_hs and writes rbottom_grad; the
  // work size here is based on the input spatial size.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "FeatureSum", ([&] {
        const int num_kernels =
            batch_size * input_height * input_width * THREADS_PER_PIXEL;
        const scalar_t *bottom_diff_hs = rbottom_grad_hs.data_ptr<scalar_t>();
        scalar_t *bottom_diff = rbottom_grad.data_ptr<scalar_t>();

        FeatureSum<scalar_t>
            <<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
               stream>>>(num_kernels, bottom_diff_hs, scale_factor, channels,
                         input_height, input_width, bottom_diff);
      }));
  // Step 4: rbottom_grad -> bottom_grad (dispatch label:
  // NHWC2NCHW_Bottom_Grad).
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "NHWC2NCHW_Bottom_Grad", ([&] {
        const scalar_t *bottom_data = rbottom_grad.data_ptr<scalar_t>();
        scalar_t *top_data = bottom_grad.data_ptr<scalar_t>();
        const int dh = divideUP(input_height * input_width, kTileDim);
        const int dw = divideUP(channels, kTileDim);
        BatchTranspose2DCUDAKernel<scalar_t>
            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
                batch_size, input_height * input_width, channels, dh, dw,
                bottom_data, top_data);
      }));

  // Step 5: CARAFEBackward_Mask reads rtop_grad and rfeatures and writes
  // rmask_grad.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "CARAFELaucherBackward_Mask", ([&] {
        // WARP_SIZE threads are budgeted per (output pixel, mask channel).
        const int num_kernels = batch_size * output_height * output_width *
                                mask_channels * WARP_SIZE;
        const scalar_t *top_diff = rtop_grad.data_ptr<scalar_t>();
        const scalar_t *bottom_data = rfeatures.data_ptr<scalar_t>();
        scalar_t *mask_diff = rmask_grad.data_ptr<scalar_t>();

        CARAFEBackward_Mask<scalar_t>
            <<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
               stream>>>(num_kernels, top_diff, bottom_data, kernel_size,
                         group_size, scale_factor, channels, input_height,
                         input_width, output_height, output_width,
                         mask_channels, mask_diff);
      }));
  // Step 6: rmask_grad -> mask_grad (dispatch label: NHWC2NCHW_Mask_Grad).
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "NHWC2NCHW_Mask_Grad", ([&] {
        const scalar_t *bottom_data = rmask_grad.data_ptr<scalar_t>();
        scalar_t *top_data = mask_grad.data_ptr<scalar_t>();
        const int dh = divideUP(output_height * output_width, kTileDim);
        const int dw = divideUP(mask_channels, kTileDim);
        BatchTranspose2DCUDAKernel<scalar_t>
            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
                batch_size, output_height * output_width, mask_channels, dh, dw,
                bottom_data, top_data);
      }));

  // Single check after all six launches.
  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/carafe_naive_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "carafe_naive_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

// Launches the naive CARAFE forward kernel on the current CUDA stream.
//
// The amount of work and the channel/height/width arguments are all read
// from `output`; `features` selects the device and the scalar type.
// kernel_size, group_size and scale_factor are forwarded to the kernel.
void CARAFENAIVEForwardCUDAKernelLauncher(const Tensor features,
                                          const Tensor masks, Tensor output,
                                          const int kernel_size,
                                          const int group_size,
                                          const int scale_factor) {
  const int num_elements = output.numel();
  const int num_channels = output.size(1);
  const int out_height = output.size(2);
  const int out_width = output.size(3);

  at::cuda::CUDAGuard device_guard(features.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      features.scalar_type(), "CARAFENAIVEForward", ([&] {
        auto *features_ptr = features.data_ptr<scalar_t>();
        auto *masks_ptr = masks.data_ptr<scalar_t>();
        auto *output_ptr = output.data_ptr<scalar_t>();

        carafe_naive_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(num_elements), THREADS_PER_BLOCK, 0, stream>>>(
                num_elements, features_ptr, masks_ptr, output_ptr, kernel_size,
                group_size, scale_factor, num_channels, out_height, out_width);
      }));

  // Report a failed launch to the caller.
  AT_CUDA_CHECK(cudaGetLastError());
}

// Launches the naive CARAFE backward kernel on the current CUDA stream.
//
// The amount of work and the channel/height/width arguments are all read
// from `top_grad`, which also selects the device and the scalar type.
// `bottom_grad` and `mask_grad` are handed to the kernel as the gradient
// buffers; kernel_size, group_size and scale_factor are forwarded unchanged.
void CARAFENAIVEBackwardCUDAKernelLauncher(
    const Tensor top_grad, const Tensor features, const Tensor masks,
    Tensor bottom_grad, Tensor mask_grad, const int kernel_size,
    const int group_size, const int scale_factor) {
  const int num_elements = top_grad.numel();
  const int num_channels = top_grad.size(1);
  const int grad_height = top_grad.size(2);
  const int grad_width = top_grad.size(3);

  at::cuda::CUDAGuard device_guard(top_grad.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "CARAFENAIVEBackward", ([&] {
        auto *top_grad_ptr = top_grad.data_ptr<scalar_t>();
        auto *features_ptr = features.data_ptr<scalar_t>();
        auto *masks_ptr = masks.data_ptr<scalar_t>();
        auto *bottom_grad_ptr = bottom_grad.data_ptr<scalar_t>();
        auto *mask_grad_ptr = mask_grad.data_ptr<scalar_t>();

        carafe_naive_backward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(num_elements), THREADS_PER_BLOCK, 0, stream>>>(
                num_elements, top_grad_ptr, features_ptr, masks_ptr,
                bottom_grad_ptr, mask_grad_ptr, kernel_size, group_size,
                scale_factor, num_channels, grad_height, grad_width);
      }));

  // Report a failed launch to the caller.
  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/chamfer_distance_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cpp
#include "chamfer_distance_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

void ChamferDistanceForwardCUDAKernelLauncher(
    const Tensor xyz1, const Tensor xyz2, const Tensor dist1,
    const Tensor dist2, const Tensor idx1, const Tensor idx2) {
  int batch_size = xyz1.size(0);
  int n = xyz1.size(1);
  int m = xyz2.size(1);

  at::cuda::CUDAGuard device_guard(xyz1.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      xyz1.scalar_type(), "chamfer_distance_forward_cuda_kernel", [&] {
        chamfer_distance_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(batch_size * n), THREADS_PER_BLOCK, 0, stream>>>(
                batch_size, n, xyz1.data_ptr<scalar_t>(), m,
                xyz2.data_ptr<scalar_t>(), dist1.data_ptr<scalar_t>(),
                idx1.data_ptr<int>());
      });
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      xyz1.scalar_type(), "chamfer_distance_forward_cuda_kernel", [&] {
        chamfer_distance_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(batch_size * m), THREADS_PER_BLOCK, 0, stream>>>(
                batch_size, m, xyz2.data_ptr<scalar_t>(), n,
                xyz1.data_ptr<scalar_t>(), dist2.data_ptr<scalar_t>(),
                idx2.data_ptr<int>());
      });
  AT_CUDA_CHECK(cudaGetLastError());
}

// Launches the chamfer-distance backward kernel twice on the current CUDA
// stream: once driven by grad_dist1/idx1 and once, with the two point sets
// and the two gradient buffers swapped, driven by grad_dist2/idx2.
//
// xyz1 supplies the batch size, the device and the scalar type. Both launches
// use half of THREADS_PER_BLOCK threads per block.
//
// NOTE(review): the point-count arguments are passed in the opposite order to
// the forward launcher (xyz2's count precedes the xyz1 pointer in the first
// launch, and vice versa in the second). The kernel signature is not visible
// here -- confirm this is intended when the two sets differ in size.
void ChamferDistanceBackwardCUDAKernelLauncher(
    const Tensor xyz1, const Tensor xyz2, Tensor idx1, Tensor idx2,
    Tensor grad_dist1, Tensor grad_dist2, Tensor grad_xyz1, Tensor grad_xyz2) {
  const int num_batches = xyz1.size(0);
  const int num_points1 = xyz1.size(1);
  const int num_points2 = xyz2.size(1);
  const int block_threads = THREADS_PER_BLOCK / 2;

  at::cuda::CUDAGuard device_guard(xyz1.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // Launch 1: gradients flowing back through dist1.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      xyz1.scalar_type(), "chamfer_distance_backward_cuda_kernel", [&] {
        const int work_items = num_batches * num_points1;
        chamfer_distance_backward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(work_items), block_threads, 0, stream>>>(
                num_batches, num_points2, xyz1.data_ptr<scalar_t>(),
                num_points1, xyz2.data_ptr<scalar_t>(),
                grad_dist1.data_ptr<scalar_t>(), idx1.data_ptr<int>(),
                grad_xyz1.data_ptr<scalar_t>(),
                grad_xyz2.data_ptr<scalar_t>());
      });

  // Launch 2: gradients flowing back through dist2.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      xyz1.scalar_type(), "chamfer_distance_backward_cuda_kernel", [&] {
        const int work_items = num_batches * num_points2;
        chamfer_distance_backward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(work_items), block_threads, 0, stream>>>(
                num_batches, num_points1, xyz2.data_ptr<scalar_t>(),
                num_points2, xyz1.data_ptr<scalar_t>(),
                grad_dist2.data_ptr<scalar_t>(), idx2.data_ptr<int>(),
                grad_xyz2.data_ptr<scalar_t>(),
                grad_xyz1.data_ptr<scalar_t>());
      });

  // One check covers both launches.
  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/convex_iou.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/SDL-GuoZonghao/BeyondBoundingBox/blob/main/mmdet/ops/iou/src/convex_iou_kernel.cu
#include "convex_iou_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

// Launches convex_iou_cuda_kernel on the current CUDA stream.
//
// pointsets: supplies the device, the scalar type and (dimension 0) the
//            number of point sets.
// polygons:  dimension 0 gives the number of polygons.
// ious:      buffer handed to the kernel as its last argument; its element
//            count sizes the grid. Blocks use half of THREADS_PER_BLOCK
//            threads.
void ConvexIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,
                                 Tensor ious) {
  const int num_outputs = ious.numel();
  const int pointset_count = pointsets.size(0);
  const int polygon_count = polygons.size(0);
  const int block_threads = THREADS_PER_BLOCK / 2;

  at::cuda::CUDAGuard device_guard(pointsets.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      pointsets.scalar_type(), "convex_iou_cuda_kernel", ([&] {
        auto *pointsets_ptr = pointsets.data_ptr<scalar_t>();
        auto *polygons_ptr = polygons.data_ptr<scalar_t>();
        auto *ious_ptr = ious.data_ptr<scalar_t>();

        convex_iou_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(num_outputs), block_threads, 0, stream>>>(
                pointset_count, polygon_count, pointsets_ptr, polygons_ptr,
                ious_ptr);
      }));

  // Report a failed launch to the caller.
  AT_CUDA_CHECK(cudaGetLastError());
}

// Launches convex_giou_cuda_kernel on the current CUDA stream.
//
// pointsets: supplies the device, the scalar type and (dimension 0) the
//            number of point sets.
// polygons:  dimension 0 gives the number of polygons.
// output:    buffer handed to the kernel as its last argument; its element
//            count sizes the grid. Blocks use half of THREADS_PER_BLOCK
//            threads.
void ConvexGIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,
                                  Tensor output) {
  const int num_outputs = output.numel();
  const int pointset_count = pointsets.size(0);
  const int polygon_count = polygons.size(0);
  const int block_threads = THREADS_PER_BLOCK / 2;

  at::cuda::CUDAGuard device_guard(pointsets.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      pointsets.scalar_type(), "convex_giou_cuda_kernel", ([&] {
        auto *pointsets_ptr = pointsets.data_ptr<scalar_t>();
        auto *polygons_ptr = polygons.data_ptr<scalar_t>();
        auto *output_ptr = output.data_ptr<scalar_t>();

        convex_giou_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(num_outputs), block_threads, 0, stream>>>(
                pointset_count, polygon_count, pointsets_ptr, polygons_ptr,
                output_ptr);
      }));

  // Report a failed launch to the caller.
  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/correlation_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/ClementPinard/Pytorch-Correlation-extension/blob/master/Correlation_Module/correlation_cuda_kernel.cu
// Original licence: Under MIT License

#include "correlation_cuda.cuh"
#include "pytorch_cuda_helper.hpp"

// Launches the correlation forward kernel on the current CUDA stream.
//
// input1, input2: permuted with {0, 2, 3, 1} and made contiguous before being
//                 handed to the kernel as 4-D accessors; `input1` also
//                 supplies the device and the scalar type.
// output:         handed to the kernel as a 5-D accessor.
// kH, kW, patchH, patchW, padH, padW, dilationH, dilationW,
// dilation_patchH, dilation_patchW, dH, dW:
//                 forwarded to the kernel. kH/kW, the paddings, the dilations
//                 and dH/dW are also used here to derive the output extent
//                 oH x oW.
void CorrelationForwardCUDAKernelLauncher(Tensor input1, Tensor input2,
                                          Tensor output, int kH, int kW,
                                          int patchH, int patchW, int padH,
                                          int padW, int dilationH,
                                          int dilationW, int dilation_patchH,
                                          int dilation_patchW, int dH, int dW) {
  const int batch_size = input1.size(0);
  const int iH = input1.size(2);
  const int iW = input1.size(3);
  const int dilatedKH = (kH - 1) * dilationH + 1;
  const int dilatedKW = (kW - 1) * dilationW + 1;

  const auto oH = (iH + 2 * padH - dilatedKH) / dH + 1;
  const auto oW = (iW + 2 * padW - dilatedKW) / dW + 1;

  auto trInput1 = input1.permute({0, 2, 3, 1}).contiguous();
  auto trInput2 = input2.permute({0, 2, 3, 1}).contiguous();

  // Blocks are WARP_SIZE x 4 x 4 threads; the grid has one x-slot per batch
  // item and ceil(oH / 4) x ceil(oW / 4) slots over the output extent.
  const dim3 threads(WARP_SIZE, 4, 4);
  const dim3 blocks(batch_size, (oH + 3) >> 2, (oW + 3) >> 2);

  at::cuda::CUDAGuard device_guard(input1.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input1.scalar_type(), "correlation_forward_cuda", ([&] {
        TensorAcc4R trInput1_acc =
            trInput1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
        TensorAcc4R trInput2_acc =
            trInput2.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
        TensorAcc5R output_acc =
            output.packed_accessor32<scalar_t, 5, RestrictPtrTraits>();

        correlation_forward_cuda_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                trInput1_acc, trInput2_acc, output_acc, kH, kW, patchH, patchW,
                padH, padW, dilationH, dilationW, dilation_patchH,
                dilation_patchW, dH, dW, oH, oW);
      }));

  // Surface a failed launch to the caller instead of leaving the error
  // pending for whichever CUDA call happens to run next.
  AT_CUDA_CHECK(cudaGetLastError());
}

// Launches the two correlation backward kernels on the current CUDA stream:
// one that fills `grad_input1` (reading `input2`) and one that fills
// `grad_input2` (reading `input1`).
//
// grad_output:    handed to both kernels as a 5-D accessor.
// input1, input2: permuted with {0, 2, 3, 1} and made contiguous before being
//                 handed to the kernels as 4-D accessors; `input1` also
//                 supplies the grid extent, the device and the scalar type.
// grad_input1, grad_input2: handed to the kernels as 4-D accessors.
// kH ... dW:      forwarded unchanged to both kernels; patchH * patchW also
//                 sizes the dynamic shared memory of each launch.
void CorrelationBackwardCUDAKernelLauncher(
    Tensor grad_output, Tensor input1, Tensor input2, Tensor grad_input1,
    Tensor grad_input2, int kH, int kW, int patchH, int patchW, int padH,
    int padW, int dilationH, int dilationW, int dilation_patchH,
    int dilation_patchW, int dH, int dW) {
  const int batch_size = input1.size(0);
  const int iH = input1.size(2);
  const int iW = input1.size(3);

  auto trInput1 = input1.permute({0, 2, 3, 1}).contiguous();
  auto trInput2 = input2.permute({0, 2, 3, 1}).contiguous();
  // One block per (batch item, input row, input column).
  const dim3 blocks(batch_size, iH, iW);
  const dim3 threads(THREADS_PER_BLOCK);

  at::cuda::CUDAGuard device_guard(input1.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input1.scalar_type(), "correlation_backward_cuda", ([&] {
        // Dynamic shared memory per block, in bytes: one scalar per patch
        // position.
        const int grad_cache_size = patchH * patchW * sizeof(scalar_t);
        TensorAcc4R input1_acc =
            trInput1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
        TensorAcc4R input2_acc =
            trInput2.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
        TensorAcc4R grad_input1_acc =
            grad_input1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
        TensorAcc4R grad_input2_acc =
            grad_input2.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
        TensorAcc5R grad_output_acc =
            grad_output.packed_accessor32<scalar_t, 5, RestrictPtrTraits>();

        correlation_backward_cuda_kernel_input1<scalar_t>
            <<<blocks, threads, grad_cache_size, stream>>>(
                grad_output_acc, input2_acc, grad_input1_acc, kH, kW, patchH,
                patchW, padH, padW, dilationH, dilationW, dilation_patchH,
                dilation_patchW, dH, dW);

        correlation_backward_cuda_kernel_input2<scalar_t>
            <<<blocks, threads, grad_cache_size, stream>>>(
                grad_output_acc, input1_acc, grad_input2_acc, kH, kW, patchH,
                patchW, padH, padW, dilationH, dilationW, dilation_patchH,
                dilation_patchW, dH, dW);
      }));

  // Surface a failed launch to the caller instead of leaving the error
  // pending for whichever CUDA call happens to run next.
  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/cudabind.cpp
================================================
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Kernel launchers for assign_score_withk; only declared here, the
// definitions live in a different translation unit.
void AssignScoreWithKForwardCUDAKernelLauncher(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor &points, const Tensor &centers, const Tensor &scores,
    const Tensor &knn_idx, Tensor &output);

void AssignScoreWithKBackwardCUDAKernelLauncher(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor &grad_out, const Tensor &points, const Tensor &centers,
    const Tensor &scores, const Tensor &knn_idx, Tensor &grad_points,
    Tensor &grad_centers, Tensor &grad_scores);

// CUDA entry point for the forward pass: forwards every argument unchanged to
// the kernel launcher.
void assign_score_withk_forward_cuda(int B, int N0, int N1, int M, int K, int O,
                                     int aggregate, const Tensor &points,
                                     const Tensor &centers,
                                     const Tensor &scores,
                                     const Tensor &knn_idx, Tensor &output) {
  AssignScoreWithKForwardCUDAKernelLauncher(
      B, N0, N1, M, K, O, aggregate, points, centers, scores, knn_idx, output);
}

// CUDA entry point for the backward pass: forwards every argument unchanged
// to the kernel launcher.
void assign_score_withk_backward_cuda(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor &grad_out, const Tensor &points, const Tensor &centers,
    const Tensor &scores, const Tensor &knn_idx, Tensor &grad_points,
    Tensor &grad_centers, Tensor &grad_scores) {
  AssignScoreWithKBackwardCUDAKernelLauncher(
      B, N0, N1, M, K, O, aggregate, grad_out, points, centers, scores, knn_idx,
      grad_points, grad_centers, grad_scores);
}

// Device-independent entry points that the CUDA functions are registered
// against below.
void assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,
                                     int aggregate, const Tensor &points,
                                     const Tensor &centers,
                                     const Tensor &scores,
                                     const Tensor &knn_idx, Tensor &output);

void assign_score_withk_backward_impl(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor &grad_out, const Tensor &points, const Tensor &centers,
    const Tensor &scores, const Tensor &knn_idx, Tensor &grad_points,
    Tensor &grad_centers, Tensor &grad_scores);

REGISTER_DEVICE_IMPL(assign_score_withk_forward_impl, CUDA,
                     assign_score_withk_forward_cuda);
REGISTER_DEVICE_IMPL(assign_score_withk_backward_impl, CUDA,
                     assign_score_withk_backward_cuda);

// Kernel launcher for ball_query; only declared here, the definition lives in
// a different translation unit.
void BallQueryForwardCUDAKernelLauncher(int b, int n, int m, float min_radius,
                                        float max_radius, int nsample,
                                        const Tensor new_xyz, const Tensor xyz,
                                        Tensor idx);

// CUDA entry point: forwards every argument unchanged to the kernel launcher.
void ball_query_forward_cuda(int b, int n, int m, float min_radius,
                             float max_radius, int nsample,
                             const Tensor new_xyz, const Tensor xyz,
                             Tensor idx) {
  BallQueryForwardCUDAKernelLauncher(b, n, m, min_radius, max_radius, nsample,
                                     new_xyz, xyz, idx);
}

// Device-independent entry point that the CUDA function is registered
// against.
void ball_query_forward_impl(int b, int n, int m, float min_radius,
                             float max_radius, int nsample,
                             const Tensor new_xyz, const Tensor xyz,
                             Tensor idx);
REGISTER_DEVICE_IMPL(ball_query_forward_impl, CUDA, ball_query_forward_cuda);

// Kernel launcher for stack_ball_query; only declared here, the definition
// lives in a different translation unit.
void StackBallQueryForwardCUDAKernelLauncher(float max_radius, int nsample,
                                             const Tensor new_xyz,
                                             const Tensor new_xyz_batch_cnt,
                                             const Tensor xyz,
                                             const Tensor xyz_batch_cnt,
                                             Tensor idx);

// CUDA entry point: forwards every argument unchanged to the kernel launcher.
void stack_ball_query_forward_cuda(float max_radius, int nsample,
                                   const Tensor new_xyz,
                                   const Tensor new_xyz_batch_cnt,
                                   const Tensor xyz, const Tensor xyz_batch_cnt,
                                   Tensor idx) {
  StackBallQueryForwardCUDAKernelLauncher(
      max_radius, nsample, new_xyz, new_xyz_batch_cnt, xyz, xyz_batch_cnt, idx);
}

// Device-independent entry point that the CUDA function is registered
// against.
void stack_ball_query_forward_impl(float max_radius, int nsample,
                                   const Tensor new_xyz,
                                   const Tensor new_xyz_batch_cnt,
                                   const Tensor xyz, const Tensor xyz_batch_cnt,
                                   Tensor idx);
REGISTER_DEVICE_IMPL(stack_ball_query_forward_impl, CUDA,
                     stack_ball_query_forward_cuda);

// Kernel launcher for bbox_overlaps; only declared here, the definition lives
// in a different translation unit.
void BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
                                    Tensor ious, const int mode,
                                    const bool aligned, const int offset);

// CUDA entry point: forwards every argument unchanged to the kernel launcher.
void bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                        const int mode, const bool aligned, const int offset) {
  BBoxOverlapsCUDAKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset);
}

// Device-independent entry point that the CUDA function is registered
// against.
void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                        const int mode, const bool aligned, const int offset);
REGISTER_DEVICE_IMPL(bbox_overlaps_impl, CUDA, bbox_overlaps_cuda);

// Kernel launchers for border_align; only declared here, the definitions live
// in a different translation unit.
void BorderAlignForwardCUDAKernelLauncher(const Tensor &input,
                                          const Tensor &boxes, Tensor output,
                                          Tensor argmax_idx,
                                          const int pool_size);

void BorderAlignBackwardCUDAKernelLauncher(const Tensor &grad_output,
                                           const Tensor &boxes,
                                           const Tensor &argmax_idx,
                                           Tensor grad_input,
                                           const int pool_size);

// CUDA entry point for the forward pass: forwards every argument unchanged to
// the kernel launcher.
void border_align_forward_cuda(const Tensor &input, const Tensor &boxes,
                               Tensor output, Tensor argmax_idx,
                               const int pool_size) {
  BorderAlignForwardCUDAKernelLauncher(input, boxes, output, argmax_idx,
                                       pool_size);
}

// CUDA entry point for the backward pass: forwards every argument unchanged
// to the kernel launcher.
void border_align_backward_cuda(const Tensor &grad_output, const Tensor &boxes,
                                const Tensor &argmax_idx, Tensor grad_input,
                                const int pool_size) {
  BorderAlignBackwardCUDAKernelLauncher(grad_output, boxes, argmax_idx,
                                        grad_input, pool_size);
}

// Device-independent entry points that the CUDA functions are registered
// against below.
void border_align_forward_impl(const Tensor &input, const Tensor &boxes,
                               Tensor output, Tensor argmax_idx,
                               const int pool_size);

void border_align_backward_impl(const Tensor &grad_output, const Tensor &boxes,
                                const Tensor &argmax_idx, Tensor grad_input,
                                const int pool_size);

REGISTER_DEVICE_IMPL(border_align_forward_impl, CUDA,
                     border_align_forward_cuda);
REGISTER_DEVICE_IMPL(border_align_backward_impl, CUDA,
                     border_align_backward_cuda);

// CUDA implementation of rotated-box IoU; only declared here. Unlike most ops
// in this file there is no separate wrapper: this function is registered
// directly.
void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                          const int mode_flag, const bool aligned);

// Device-independent entry point that the CUDA function is registered
// against.
void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                          const int mode_flag, const bool aligned);
REGISTER_DEVICE_IMPL(box_iou_rotated_impl, CUDA, box_iou_rotated_cuda);

// CUDA implementation of quadrilateral-box IoU; only declared here. Unlike
// most ops in this file there is no separate wrapper: this function is
// registered directly.
void box_iou_quadri_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                         const int mode_flag, const bool aligned);

// Device-independent entry point that the CUDA function is registered
// against.
void box_iou_quadri_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                         const int mode_flag, const bool aligned);
REGISTER_DEVICE_IMPL(box_iou_quadri_impl, CUDA, box_iou_quadri_cuda);

// Kernel launchers for CARAFE; only declared here, the definitions live in a
// different translation unit.
void CARAFEForwardCUDAKernelLauncher(const Tensor features, const Tensor masks,
                                     Tensor rfeatures, Tensor routput,
                                     Tensor rmasks, Tensor output,
                                     const int kernel_size,
                                     const int group_size,
                                     const int scale_factor);

void CARAFEBackwardCUDAKernelLauncher(
    const Tensor top_grad, const Tensor rfeatures, const Tensor masks,
    Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad,
    Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad,
    const int kernel_size, const int group_size, const int scale_factor);

// CUDA entry point for the forward pass: forwards every argument unchanged to
// the kernel launcher.
void carafe_forward_cuda(Tensor features, Tensor masks, Tensor rfeatures,
                         Tensor routput, Tensor rmasks, Tensor output,
                         int kernel_size, int group_size, int scale_factor) {
  CARAFEForwardCUDAKernelLauncher(features, masks, rfeatures, routput, rmasks,
                                  output, kernel_size, group_size,
                                  scale_factor);
}

// CUDA entry point for the backward pass: forwards every argument unchanged
// to the kernel launcher.
void carafe_backward_cuda(Tensor top_grad, Tensor rfeatures, Tensor masks,
                          Tensor rtop_grad, Tensor rbottom_grad_hs,
                          Tensor rbottom_grad, Tensor rmask_grad,
                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,
                          int group_size, int scale_factor) {
  CARAFEBackwardCUDAKernelLauncher(top_grad, rfeatures, masks, rtop_grad,
                                   rbottom_grad_hs, rbottom_grad, rmask_grad,
                                   bottom_grad, mask_grad, kernel_size,
                                   group_size, scale_factor);
}

// Device-independent entry points that the CUDA functions are registered
// against below.
void carafe_forward_impl(Tensor features, Tensor masks, Tensor rfeatures,
                         Tensor routput, Tensor rmasks, Tensor output,
                         int kernel_size, int group_size, int scale_factor);

void carafe_backward_impl(Tensor top_grad, Tensor rfeatures, Tensor masks,
                          Tensor rtop_grad, Tensor rbottom_grad_hs,
                          Tensor rbottom_grad, Tensor rmask_grad,
                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,
                          int group_size, int scale_factor);

REGISTER_DEVICE_IMPL(carafe_forward_impl, CUDA, carafe_forward_cuda);
REGISTER_DEVICE_IMPL(carafe_backward_impl, CUDA, carafe_backward_cuda);

void CARAFENAIVEForwardCUDAKernelLauncher(const Tensor features,
                                          const Tensor masks, Tensor output,
                                          const int kernel_size,
                                          const int group_size,
                                          const int scale_factor);

void CARAFENAIVEBackwardCUDAKernelLauncher(
    const Tensor top_grad, const Tensor features, const Tensor masks,
    Tensor bottom_grad, Tensor mask_grad, const int kernel_size,
    const int group_size, const int scale_factor);

// CUDA backend for the naive CARAFE forward op. Passes all arguments through
// unchanged to CARAFENAIVEForwardCUDAKernelLauncher; registered for the CUDA
// device under carafe_naive_forward_impl.
void carafe_naive_forward_cuda(Tensor features, Tensor masks, Tensor output,
                               int kernel_size, int group_size,
                               int scale_factor) {
  CARAFENAIVEForwardCUDAKernelLauncher(features, masks, output, kernel_size,
                                       group_size, scale_factor);
}

// CUDA backend for the naive CARAFE backward op. Passes all arguments through
// unchanged to CARAFENAIVEBackwardCUDAKernelLauncher; registered for the CUDA
// device under carafe_naive_backward_impl.
void carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor masks,
                                Tensor bottom_grad, Tensor mask_grad,
                                int kernel_size, int group_size,
                                int scale_factor) {
  CARAFENAIVEBackwardCUDAKernelLauncher(top_grad, features, masks, bottom_grad,
                                        mask_grad, kernel_size, group_size,
                                        scale_factor);
}
void carafe_naive_forward_impl(Tensor features, Tensor masks, Tensor output,
                               int kernel_size, int group_size,
                               int scale_factor);

void carafe_naive_backward_impl(Tensor top_grad, Tensor features, Tensor masks,
                                Tensor bottom_grad, Tensor mask_grad,
                                int kernel_size, int group_size,
                                int scale_factor);

REGISTER_DEVICE_IMPL(carafe_naive_forward_impl, CUDA,
                     carafe_naive_forward_cuda);
REGISTER_DEVICE_IMPL(carafe_naive_backward_impl, CUDA,
                     carafe_naive_backward_cuda);

void CorrelationForwardCUDAKernelLauncher(Tensor input1, Tensor input2,
                                          Tensor output, int kH, int kW,
                                          int patchH, int patchW, int padH,
                                          int padW, int dilationH,
                                          int dilationW, int dilation_patchH,
                                          int dilation_patchW, int dH, int dW);

void CorrelationBackwardCUDAKernelLauncher(Tensor grad_output, Tensor input1,
                                           Tensor input2, Tensor grad_input1,
                                           Tensor grad_input2, int kH, int kW,
                                           int patchH, int patchW, int padH,
                                           int padW, int dilationH,
                                           int dilationW, int dilation_patchH,
                                           int dilation_patchW, int dH, int dW);

// CUDA backend for the correlation forward op. Forwards the two inputs, the
// output tensor and all kernel/patch/padding/dilation/stride integers, in the
// same order, to CorrelationForwardCUDAKernelLauncher. Registered for the
// CUDA device under correlation_forward_impl.
void correlation_forward_cuda(Tensor input1, Tensor input2, Tensor output,
                              int kH, int kW, int patchH, int patchW, int padH,
                              int padW, int dilationH, int dilationW,
                              int dilation_patchH, int dilation_patchW, int dH,
                              int dW) {
  CorrelationForwardCUDAKernelLauncher(
      input1, input2, output, kH, kW, patchH, patchW, padH, padW, dilationH,
      dilationW, dilation_patchH, dilation_patchW, dH, dW);
}

// CUDA backend for the correlation backward op. Forwards grad_output, both
// inputs, both gradient tensors and all integer hyper-parameters, in the same
// order, to CorrelationBackwardCUDAKernelLauncher. Registered for the CUDA
// device under correlation_backward_impl.
void correlation_backward_cuda(Tensor grad_output, Tensor input1, Tensor input2,
                               Tensor grad_input1, Tensor grad_input2, int kH,
                               int kW, int patchH, int patchW, int padH,
                               int padW, int dilationH, int dilationW,
                               int dilation_patchH, int dilation_patchW, int dH,
                               int dW) {
  CorrelationBackwardCUDAKernelLauncher(
      grad_output, input1, input2, grad_input1, grad_input2, kH, kW, patchH,
      patchW, padH, padW, dilationH, dilationW, dilation_patchH,
      dilation_patchW, dH, dW);
}

void correlation_forward_impl(Tensor input1, Tensor input2, Tensor output,
                              int kH, int kW, int patchH, int patchW, int padH,
                              int padW, int dilationH, int dilationW,
                              int dilation_patchH, int dilation_patchW, int dH,
                              int dW);

void correlation_backward_impl(Tensor grad_output, Tensor input1, Tensor input2,
                               Tensor grad_input1, Tensor grad_input2, int kH,
                               int kW, int patchH, int patchW, int padH,
                               int padW, int dilationH, int dilationW,
                               int dilation_patchH, int dilation_patchW, int dH,
                               int dW);

REGISTER_DEVICE_IMPL(correlation_forward_impl, CUDA, correlation_forward_cuda);
REGISTER_DEVICE_IMPL(correlation_backward_impl, CUDA,
                     correlation_backward_cuda);

void deformable_im2col_cuda(Tensor data_im, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor data_col);

void deformable_col2im_cuda(Tensor data_col, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor grad_im);

void deformable_col2im_coord_cuda(
    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
    const int height, const int width, const int ksize_h, const int ksize_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int parallel_imgs,
    const int deformable_group, Tensor grad_offset);

void deformable_im2col_impl(Tensor data_im, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor data_col);

void deformable_col2im_impl(Tensor data_col, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor grad_im);

void deformable_col2im_coord_impl(
    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
    const int height, const int width, const int ksize_h, const int ksize_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int parallel_imgs,
    const int deformable_group, Tensor grad_offset);

REGISTER_DEVICE_IMPL(deformable_im2col_impl, CUDA, deformable_im2col_cuda);
REGISTER_DEVICE_IMPL(deformable_col2im_impl, CUDA, deformable_col2im_cuda);
REGISTER_DEVICE_IMPL(deformable_col2im_coord_impl, CUDA,
                     deformable_col2im_coord_cuda);

void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
                                            Tensor offset, Tensor output,
                                            int pooled_height, int pooled_width,
                                            float spatial_scale,
                                            int sampling_ratio, float gamma);

void DeformRoIPoolBackwardCUDAKernelLauncher(
    Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
    Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
    float spatial_scale, int sampling_ratio, float gamma);

// CUDA backend for the deformable RoI pooling forward op. Passes all
// arguments through unchanged to DeformRoIPoolForwardCUDAKernelLauncher;
// registered for the CUDA device under deform_roi_pool_forward_impl.
void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
                                  Tensor output, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int sampling_ratio, float gamma) {
  DeformRoIPoolForwardCUDAKernelLauncher(input, rois, offset, output,
                                         pooled_height, pooled_width,
                                         spatial_scale, sampling_ratio, gamma);
}

// CUDA backend for the deformable RoI pooling backward op. Passes all
// arguments through unchanged to DeformRoIPoolBackwardCUDAKernelLauncher;
// registered for the CUDA device under deform_roi_pool_backward_impl.
void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,
                                   Tensor rois, Tensor offset,
                                   Tensor grad_input, Tensor grad_offset,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale, int sampling_ratio,
                                   float gamma) {
  DeformRoIPoolBackwardCUDAKernelLauncher(
      grad_output, input, rois, offset, grad_input, grad_offset, pooled_height,
      pooled_width, spatial_scale, sampling_ratio, gamma);
}

void deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,
                                  Tensor output, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int sampling_ratio, float gamma);

void deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,
                                   Tensor rois, Tensor offset,
                                   Tensor grad_input, Tensor grad_offset,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale, int sampling_ratio,
                                   float gamma);

REGISTER_DEVICE_IMPL(deform_roi_pool_forward_impl, CUDA,
                     deform_roi_pool_forward_cuda);
REGISTER_DEVICE_IMPL(deform_roi_pool_backward_impl, CUDA,
                     deform_roi_pool_backward_cuda);

void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
                                               Tensor weight, Tensor output,
                                               const float gamma,
                                               const float alpha);

void SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
                                                Tensor weight,
                                                Tensor grad_input,
                                                const float gamma,
                                                const float alpha);

void SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor softmax, Tensor target,
                                               Tensor weight, Tensor output,
                                               const float gamma,
                                               const float alpha);

void SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor softmax, Tensor target,
                                                Tensor weight, Tensor buff,
                                                Tensor grad_input,
                                                const float gamma,
                                                const float alpha);

// CUDA backend for the sigmoid focal loss forward op. Passes all arguments
// through unchanged to SigmoidFocalLossForwardCUDAKernelLauncher; registered
// for the CUDA device under sigmoid_focal_loss_forward_impl.
void sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha) {
  SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
                                            gamma, alpha);
}

// CUDA backend for the sigmoid focal loss backward op. Passes all arguments
// through unchanged to SigmoidFocalLossBackwardCUDAKernelLauncher; registered
// for the CUDA device under sigmoid_focal_loss_backward_impl.
void sigmoid_focal_loss_backward_cuda(Tensor input, Tensor target,
                                      Tensor weight, Tensor grad_input,
                                      float gamma, float alpha) {
  SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input,
                                             gamma, alpha);
}

// CUDA backend for the softmax focal loss forward op. Forwards to
// SoftmaxFocalLossForwardCUDAKernelLauncher; registered for the CUDA device
// under softmax_focal_loss_forward_impl.
// NOTE(review): `input` fills the launcher parameter declared as `softmax`,
// so it is presumably expected to already hold softmax outputs -- confirm
// against the caller.
void softmax_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha) {
  SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
                                            gamma, alpha);
}

// CUDA backend for the softmax focal loss backward op. Forwards to
// SoftmaxFocalLossBackwardCUDAKernelLauncher; registered for the CUDA device
// under softmax_focal_loss_backward_impl.
// NOTE(review): `input` fills the launcher parameter declared as `softmax`,
// so it is presumably expected to already hold softmax outputs -- confirm
// against the caller.
void softmax_focal_loss_backward_cuda(Tensor input, Tensor target,
                                      Tensor weight, Tensor buff,
                                      Tensor grad_input, float gamma,
                                      float alpha) {
  SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff,
                                             grad_input, gamma, alpha);
}

void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha);

void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
                                      Tensor weight, Tensor grad_input,
                                      float gamma, float alpha);

void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha);

void softmax_focal_loss_backward_impl(Tensor input, Tensor target,
                                      Tensor weight, Tensor buff,
                                      Tensor grad_input, float gamma,
                                      float alpha);

REGISTER_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, CUDA,
                     sigmoid_focal_loss_forward_cuda);
REGISTER_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, CUDA,
                     sigmoid_focal_loss_backward_cuda);
REGISTER_DEVICE_IMPL(softmax_focal_loss_forward_impl, CUDA,
                     softmax_focal_loss_forward_cuda);
REGISTER_DEVICE_IMPL(softmax_focal_loss_backward_impl, CUDA,
                     softmax_focal_loss_backward_cuda);

void FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m,
                                                    const float *dataset,
                                                    float *temp, int *idxs);

void FurthestPointSamplingWithDistForwardCUDAKernelLauncher(
    int b, int n, int m, const float *dataset, float *temp, int *idxs);

// CUDA backend for furthest point sampling. Unwraps the three tensors into
// raw typed pointers and hands them, together with b, n and m, to
// FurthestPointSamplingForwardCUDAKernelLauncher. Registered for the CUDA
// device under furthest_point_sampling_forward_impl.
// NOTE(review): only raw pointers (no strides) reach the launcher, so the
// tensors are presumably expected to be contiguous -- confirm with callers.
void furthest_point_sampling_forward_cuda(Tensor points_tensor,
                                          Tensor temp_tensor, Tensor idx_tensor,
                                          int b, int n, int m) {
  // Points and the scratch buffer are read as float, the indices as int.
  const float *const points_ptr = points_tensor.data_ptr<float>();
  float *const temp_ptr = temp_tensor.data_ptr<float>();
  int *const idx_ptr = idx_tensor.data_ptr<int>();

  FurthestPointSamplingForwardCUDAKernelLauncher(b, n, m, points_ptr, temp_ptr,
                                                 idx_ptr);
}

// CUDA backend for the "with dist" variant of furthest point sampling.
// Unwraps the three tensors into raw typed pointers and hands them, together
// with b, n and m, to FurthestPointSamplingWithDistForwardCUDAKernelLauncher.
// Registered for the CUDA device under
// furthest_point_sampling_with_dist_forward_impl.
// NOTE(review): only raw pointers (no strides) reach the launcher, so the
// tensors are presumably expected to be contiguous -- confirm with callers.
void furthest_point_sampling_with_dist_forward_cuda(Tensor points_tensor,
                                                    Tensor temp_tensor,
                                                    Tensor idx_tensor, int b,
                                                    int n, int m) {
  // The first tensor and the scratch buffer are read as float, indices as int.
  const float *const points_ptr = points_tensor.data_ptr<float>();
  float *const temp_ptr = temp_tensor.data_ptr<float>();
  int *const idx_ptr = idx_tensor.data_ptr<int>();

  FurthestPointSamplingWithDistForwardCUDAKernelLauncher(b, n, m, points_ptr,
                                                         temp_ptr, idx_ptr);
}

void furthest_point_sampling_forward_impl(Tensor points_tensor,
                                          Tensor temp_tensor, Tensor idx_tensor,
                                          int b, int n, int m);

void furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor,
                                                    Tensor temp_tensor,
                                                    Tensor idx_tensor, int b,
                                                    int n, int m);

REGISTER_DEVICE_IMPL(furthest_point_sampling_forward_impl, CUDA,
                     furthest_point_sampling_forward_cuda);
REGISTER_DEVICE_IMPL(furthest_point_sampling_with_dist_forward_impl, CUDA,
                     furthest_point_sampling_with_dist_forward_cuda);

torch::Tensor fused_bias_leakyrelu_op(const torch::Tensor &input,
                                      const torch::Tensor &bias,
                                      const torch::Tensor &refer, int act,
                                      int grad, float alpha, float scale);

torch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor &input,
                                           const torch::Tensor &bias,
                                           const torch::Tensor &refer, int act,
                                           int grad, float alpha, float scale);
REGISTER_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, CUDA,
                     fused_bias_leakyrelu_op);

torch::Tensor bias_act_op_impl(const torch::Tensor &input,
                               const torch::Tensor &bias,
                               const torch::Tensor &xref,
                               const torch::Tensor &yref,
                               const torch::Tensor &dy, int grad, int dim,
                               int act, float alpha, float gain, float clamp);

torch::Tensor bias_act_op(const torch::Tensor &input, const torch::Tensor &bias,
                          const torch::Tensor &xref, const torch::Tensor &yref,
                          const torch::Tensor &dy, int grad, int dim, int act,
                          float alpha, float gain, float clamp);

REGISTER_DEVICE_IMPL(bias_act_op_impl, CUDA, bias_act_op);

torch::Tensor filtered_lrelu_act_op_impl(torch::Tensor x, torch::Tensor si,
                                         int sx, int sy, float gain,
                                         float slope, float clamp,
                                         bool writeSigns);

torch::Tensor filtered_lrelu_act_op(torch::Tensor x, torch::Tensor si, int sx,
                                    int sy, float gain, float slope,
                                    float clamp, bool writeSigns);

REGISTER_DEVICE_IMPL(filtered_lrelu_act_op_impl, CUDA, filtered_lrelu_act_op);

void GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
                                           const Tensor points,
                                           const Tensor idx, Tensor out);

void GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
                                            const Tensor grad_out,
                                            const Tensor idx,
                                            Tensor grad_points);

// CUDA backend for the gather-points forward op. Passes all arguments through
// unchanged to GatherPointsForwardCUDAKernelLauncher; registered for the CUDA
// device under gather_points_forward_impl.
void gather_points_forward_cuda(int b, int c, int n, int npoints,
                                const Tensor points, const Tensor idx,
                                Tensor out) {
  GatherPointsForwardCUDAKernelLauncher(b, c, n, npoints, points, idx, out);
}

// CUDA backend for the gather-points backward op. Passes all arguments
// through unchanged to GatherPointsBackwardCUDAKernelLauncher; registered for
// the CUDA device under gather_points_backward_impl.
void gather_points_backward_cuda(int b, int c, int n, int npoints,
                                 const Tensor grad_out, const Tensor idx,
                                 Tensor grad_points) {
  GatherPointsBackwardCUDAKernelLauncher(b, c, n, npoints, grad_out, idx,
                                         grad_points);
}

void gather_points_forward_impl(int b, int c, int n, int npoints,
                                const Tensor points, const Tensor idx,
                                Tensor out);

void gather_points_backward_impl(int b, int c, int n, int npoints,
                                 const Tensor grad_out, const Tensor idx,
                                 Tensor grad_points);

REGISTER_DEVICE_IMPL(gather_points_forward_impl, CUDA,
                     gather_points_forward_cuda);
REGISTER_DEVICE_IMPL(gather_points_backward_impl, CUDA,
                     gather_points_backward_cuda);

void GroupPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
                                          int nsample, const Tensor points,
                                          const Tensor idx, Tensor out);

void GroupPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
                                           int nsample, const Tensor grad_out,
                                           const Tensor idx,
                                           Tensor grad_points);

// CUDA backend for the group-points forward op. Passes all arguments through
// unchanged to GroupPointsForwardCUDAKernelLauncher; registered for the CUDA
// device under group_points_forward_impl.
void group_points_forward_cuda(int b, int c, int n, int npoints, int nsample,
                               const Tensor points, const Tensor idx,
                               Tensor out) {
  GroupPointsForwardCUDAKernelLauncher(b, c, n, npoints, nsample, points, idx,
                                       out);
}

// CUDA backend for the group-points backward op. Passes all arguments through
// unchanged to GroupPointsBackwardCUDAKernelLauncher; registered for the CUDA
// device under group_points_backward_impl.
void group_points_backward_cuda(int b, int c, int n, int npoints, int nsample,
                                const Tensor grad_out, const Tensor idx,
                                Tensor grad_points) {
  GroupPointsBackwardCUDAKernelLauncher(b, c, n, npoints, nsample, grad_out,
                                        idx, grad_points);
}

void group_points_forward_impl(int b, int c, int n, int npoints, int nsample,
                               const Tensor points, const Tensor idx,
                               Tensor out);

void group_points_backward_impl(int b, int c, int n, int npoints, int nsample,
                                const Tensor grad_out, const Tensor idx,
                                Tensor grad_points);

REGISTER_DEVICE_IMPL(group_points_forward_impl, CUDA,
                     group_points_forward_cuda);
REGISTER_DEVICE_IMPL(group_points_backward_impl, CUDA,
                     group_points_backward_cuda);

void StackGroupPointsForwardCUDAKernelLauncher(
    int b, int c, int m, int nsample, const Tensor features_tensor,
    const Tensor features_batch_cnt_tensor, const Tensor idx_tensor,
    const Tensor idx_batch_cnt_tensor, Tensor out_tensor);
void StackGroupPointsBackwardCUDAKernelLauncher(
    int b, int c, int m, int n, int nsample, const Tensor grad_out_tensor,
    const Tensor idx_tensor, const Tensor idx_batch_cnt_tensor,
    const Tensor features_batch_cnt_tensor, Tensor grad_features_tensor);

// CUDA backend for the stacked group-points forward op. Passes all arguments
// through unchanged to StackGroupPointsForwardCUDAKernelLauncher; registered
// for the CUDA device under stack_group_points_forward_impl.
void stack_group_points_forward_cuda(int b, int c, int m, int nsample,
                                     const Tensor features_tensor,
                                     const Tensor features_batch_cnt_tensor,
                                     const Tensor idx_tensor,
                                     const Tensor idx_batch_cnt_tensor,
                                     Tensor out_tensor) {
  StackGroupPointsForwardCUDAKernelLauncher(
      b, c, m, nsample, features_tensor, features_batch_cnt_tensor, idx_tensor,
      idx_batch_cnt_tensor, out_tensor);
}

// CUDA backend for the stacked group-points backward op. Passes all arguments
// through unchanged to StackGroupPointsBackwardCUDAKernelLauncher; registered
// for the CUDA device under stack_group_points_backward_impl.
void stack_group_points_backward_cuda(int b, int c, int m, int n, int nsample,
                                      const Tensor grad_out_tensor,
                                      const Tensor idx_tensor,
                                      const Tensor idx_batch_cnt_tensor,
                                      const Tensor features_batch_cnt_tensor,
                                      Tensor grad_features_tensor) {
  StackGroupPointsBackwardCUDAKernelLauncher(
      b, c, m, n, nsample, grad_out_tensor, idx_tensor, idx_batch_cnt_tensor,
      features_batch_cnt_tensor, grad_features_tensor);
}

void stack_group_points_forward_impl(int b, int c, int m, int nsample,
                                     const Tensor features_tensor,
                                     const Tensor features_batch_cnt_tensor,
                                     const Tensor idx_tensor,
                                     const Tensor idx_batch_cnt_tensor,
                                     Tensor out_tensor);

void stack_group_points_backward_impl(int b, int c, int m, int n, int nsample,
                                      const Tensor grad_out_tensor,
                                      const Tensor idx_tensor,
                                      const Tensor idx_batch_cnt_tensor,
                                      const Tensor features_batch_cnt_tensor,
                                      Tensor grad_features_tensor);

REGISTER_DEVICE_IMPL(stack_group_points_forward_impl, CUDA,
                     stack_group_points_forward_cuda);
REGISTER_DEVICE_IMPL(stack_group_points_backward_impl, CUDA,
                     stack_group_points_backward_cuda);

void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
                                                   const Tensor boxes_a,
                                                   const int num_b,
                                                   const Tensor boxes_b,
                                                   Tensor ans_overlap);

void IoU3DNMS3DForwardCUDAKernelLauncher(const Tensor boxes, Tensor &keep,
                                         Tensor &keep_num,
                                         float nms_overlap_thresh);

void IoU3DNMS3DNormalForwardCUDAKernelLauncher(const Tensor boxes, Tensor &keep,
                                               Tensor &keep_num,
                                               float nms_overlap_thresh);

// CUDA backend for the IoU3D boxes-overlap-BEV forward op. Passes all
// arguments through unchanged to
// IoU3DBoxesOverlapBevForwardCUDAKernelLauncher; registered for the CUDA
// device under iou3d_boxes_overlap_bev_forward_impl.
void iou3d_boxes_overlap_bev_forward_cuda(const int num_a, const Tensor boxes_a,
                                          const int num_b, const Tensor boxes_b,
                                          Tensor ans_overlap) {
  IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(num_a, boxes_a, num_b, boxes_b,
                                                ans_overlap);
}

// CUDA backend for the IoU3D NMS3D forward op. Forwards to
// IoU3DNMS3DForwardCUDAKernelLauncher; registered for the CUDA device under
// iou3d_nms3d_forward_impl. `keep` and `keep_num` are taken by non-const
// reference and handed on to the launcher as-is.
void iou3d_nms3d_forward_cuda(const Tensor boxes, Tensor &keep,
                              Tensor &keep_num, float nms_overlap_thresh) {
  IoU3DNMS3DForwardCUDAKernelLauncher(boxes, keep, keep_num,
                                      nms_overlap_thresh);
}

// CUDA backend for the IoU3D "normal" NMS3D forward op. Forwards to
// IoU3DNMS3DNormalForwardCUDAKernelLauncher; registered for the CUDA device
// under iou3d_nms3d_normal_forward_impl. `keep` and `keep_num` are taken by
// non-const reference and handed on to the launcher as-is.
void iou3d_nms3d_normal_forward_cuda(const Tensor boxes, Tensor &keep,
                                     Tensor &keep_num,
                                     float nms_overlap_thresh) {
  IoU3DNMS3DNormalForwardCUDAKernelLauncher(boxes, keep, keep_num,
                                            nms_overlap_thresh);
}

void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
                                          const int num_b, const Tensor boxes_b,
                                          Tensor ans_overlap);

void iou3d_nms3d_forward_impl(const Tensor boxes, Tensor &keep,
                              Tensor &keep_num, float nms_overlap_thresh);

void iou3d_nms3d_normal_forward_impl(const Tensor boxes, Tensor &keep,
                                     Tensor &keep_num,
                                     float nms_overlap_thresh);

REGISTER_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, CUDA,
                     iou3d_boxes_overlap_bev_forward_cuda);
REGISTER_DEVICE_IMPL(iou3d_nms3d_forward_impl, CUDA, iou3d_nms3d_forward_cuda);
REGISTER_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, CUDA,
                     iou3d_nms3d_normal_forward_cuda);

void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,
                                  const Tensor xyz, const Tensor new_xyz,
                                  Tensor idx, Tensor dist2);

// CUDA backend for the KNN forward op. Passes all arguments through unchanged
// to KNNForwardCUDAKernelLauncher; registered for the CUDA device under
// knn_forward_impl.
void knn_forward_cuda(int b, int n, int m, int nsample, const Tensor xyz,
                      const Tensor new_xyz, Tensor idx, Tensor dist2) {
  KNNForwardCUDAKernelLauncher(b, n, m, nsample, xyz, new_xyz, idx, dist2);
}

void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
                      const Tensor new_xyz, Tensor idx, Tensor dist2);
REGISTER_DEVICE_IMPL(knn_forward_impl, CUDA, knn_forward_cuda);

void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
                                           const Tensor mask_h_idx,
                                           const Tensor mask_w_idx,
                                           Tensor top_data, const int kernel_h,
                                           const int kernel_w, const int pad_h,
                                           const int pad_w);

void MaskedCol2imForwardCUDAKernelLauncher(const Tensor bottom_data,
                                           const Tensor mask_h_idx,
                                           const Tensor mask_w_idx,
                                           Tensor top_data, const int height,
                                           const int width, const int channels);

// CUDA backend for the masked im2col forward op. Forwards to
// MaskedIm2colForwardCUDAKernelLauncher, passing `im` as the launcher's
// bottom_data and `col` as its top_data. Registered for the CUDA device under
// masked_im2col_forward_impl.
void masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor col,
                                const int kernel_h, const int kernel_w,
                                const int pad_h, const int pad_w) {
  // im: (n, ic, h, w), kernel size (kh, kw)
  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
  MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col,
                                        kernel_h, kernel_w, pad_h, pad_w);
}

// CUDA backend for the masked col2im forward op. Forwards to
// MaskedCol2imForwardCUDAKernelLauncher, passing `col` as the launcher's
// bottom_data and `im` as its top_data. Registered for the CUDA device under
// masked_col2im_forward_impl.
void masked_col2im_forward_cuda(const Tensor col, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor im, int height,
                                int width, int channels) {
  // im: (n, ic, h, w), kernel size (kh, kw)
  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
  MaskedCol2imForwardCUDAKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,
                                        width, channels);
}

void masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor col,
                                const int kernel_h, const int kernel_w,
                                const int pad_h, const int pad_w);

void masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor im, int height,
                                int width, int channels);

REGISTER_DEVICE_IMPL(masked_im2col_forward_impl, CUDA,
                     masked_im2col_forward_cuda);
REGISTER_DEVICE_IMPL(masked_col2im_forward_impl, CUDA,
                     masked_col2im_forward_cuda);

// Modulated deformable convolution helpers.
// The three *_cuda functions below are only declared here; their definitions
// are expected to be provided by another translation unit.
void modulated_deformable_im2col_cuda(
    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor data_col);

void modulated_deformable_col2im_cuda(
    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor grad_im);

void modulated_deformable_col2im_coord_cuda(
    const Tensor data_col, const Tensor data_im, const Tensor data_offset,
    const Tensor data_mask, const int batch_size, const int channels,
    const int height_im, const int width_im, const int height_col,
    const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int deformable_group,
    Tensor grad_offset, Tensor grad_mask);

// Declarations of the *_impl entry points. Each has exactly the same
// parameter list as its *_cuda counterpart above.
void modulated_deformable_im2col_impl(
    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor data_col);

void modulated_deformable_col2im_impl(
    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor grad_im);

void modulated_deformable_col2im_coord_impl(
    const Tensor data_col, const Tensor data_im, const Tensor data_offset,
    const Tensor data_mask, const int batch_size, const int channels,
    const int height_im, const int width_im, const int height_col,
    const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int deformable_group,
    Tensor grad_offset, Tensor grad_mask);

// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds each *_cuda function as the CUDA implementation of the matching
// *_impl entry point - confirm.
REGISTER_DEVICE_IMPL(modulated_deformable_im2col_impl, CUDA,
                     modulated_deformable_im2col_cuda);
REGISTER_DEVICE_IMPL(modulated_deformable_col2im_impl, CUDA,
                     modulated_deformable_col2im_cuda);
REGISTER_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, CUDA,
                     modulated_deformable_col2im_coord_cuda);

// Multi-scale deformable attention.
// The CUDA forward/backward functions are only declared here; they are
// expected to be defined in another translation unit. The forward returns a
// Tensor, while the backward writes into the three non-const reference
// arguments (grad_value, grad_sampling_loc, grad_attn_weight).
Tensor ms_deform_attn_cuda_forward(const Tensor &value,
                                   const Tensor &spatial_shapes,
                                   const Tensor &level_start_index,
                                   const Tensor &sampling_loc,
                                   const Tensor &attn_weight,
                                   const int im2col_step);

void ms_deform_attn_cuda_backward(
    const Tensor &value, const Tensor &spatial_shapes,
    const Tensor &level_start_index, const Tensor &sampling_loc,
    const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,
    Tensor &grad_sampling_loc, Tensor &grad_attn_weight, const int im2col_step);

// Declarations of the *_impl entry points, with signatures identical to the
// CUDA functions above.
Tensor ms_deform_attn_impl_forward(const Tensor &value,
                                   const Tensor &spatial_shapes,
                                   const Tensor &level_start_index,
                                   const Tensor &sampling_loc,
                                   const Tensor &attn_weight,
                                   const int im2col_step);

void ms_deform_attn_impl_backward(
    const Tensor &value, const Tensor &spatial_shapes,
    const Tensor &level_start_index, const Tensor &sampling_loc,
    const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,
    Tensor &grad_sampling_loc, Tensor &grad_attn_weight, const int im2col_step);

// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds each CUDA function as the CUDA implementation of the matching
// *_impl entry point - confirm.
REGISTER_DEVICE_IMPL(ms_deform_attn_impl_forward, CUDA,
                     ms_deform_attn_cuda_forward);
REGISTER_DEVICE_IMPL(ms_deform_attn_impl_backward, CUDA,
                     ms_deform_attn_cuda_backward);

// Declaration of the NMS CUDA kernel launcher; it is called by nms_cuda and
// is expected to be defined in another translation unit.
Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
                             int offset);

// CUDA backend for NMS: hands every argument unchanged to
// NMSCUDAKernelLauncher and returns the tensor the launcher produces.
Tensor nms_cuda(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  Tensor result = NMSCUDAKernelLauncher(boxes, scores, iou_threshold, offset);
  return result;
}

// Declaration of the nms_impl entry point (same signature as nms_cuda).
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds nms_cuda as the CUDA implementation of nms_impl - confirm.
Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset);
REGISTER_DEVICE_IMPL(nms_impl, CUDA, nms_cuda);

// Declarations of the two points-in-boxes CUDA kernel launchers ("part" and
// "all" variants). Both take the same parameter list and write into
// box_idx_of_points; they are expected to be defined in another translation
// unit.
void PointsInBoxesPartForwardCUDAKernelLauncher(int batch_size, int boxes_num,
                                                int pts_num, const Tensor boxes,
                                                const Tensor pts,
                                                Tensor box_idx_of_points);

void PointsInBoxesAllForwardCUDAKernelLauncher(int batch_size, int boxes_num,
                                               int pts_num, const Tensor boxes,
                                               const Tensor pts,
                                               Tensor box_idx_of_points);

// CUDA backend of the "part" points-in-boxes op: a pure pass-through to
// PointsInBoxesPartForwardCUDAKernelLauncher with the arguments in the same
// order.
void points_in_boxes_part_forward_cuda(int batch_size, int boxes_num,
                                       int pts_num, const Tensor boxes,
                                       const Tensor pts,
                                       Tensor box_idx_of_points) {
  return PointsInBoxesPartForwardCUDAKernelLauncher(
      batch_size, boxes_num, pts_num, boxes, pts, box_idx_of_points);
}

// CUDA backend of the "all" points-in-boxes op: a pure pass-through to
// PointsInBoxesAllForwardCUDAKernelLauncher with the arguments in the same
// order.
void points_in_boxes_all_forward_cuda(int batch_size, int boxes_num,
                                      int pts_num, const Tensor boxes,
                                      const Tensor pts,
                                      Tensor box_idx_of_points) {
  return PointsInBoxesAllForwardCUDAKernelLauncher(
      batch_size, boxes_num, pts_num, boxes, pts, box_idx_of_points);
}

// Declarations of the points-in-boxes *_impl entry points; their signatures
// match the corresponding *_cuda functions.
void points_in_boxes_part_forward_impl(int batch_size, int boxes_num,
                                       int pts_num, const Tensor boxes,
                                       const Tensor pts,
                                       Tensor box_idx_of_points);

void points_in_boxes_all_forward_impl(int batch_size, int boxes_num,
                                      int pts_num, const Tensor boxes,
                                      const Tensor pts,
                                      Tensor box_idx_of_points);
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds each *_cuda function as the CUDA implementation of the matching
// *_impl entry point - confirm.
REGISTER_DEVICE_IMPL(points_in_boxes_part_forward_impl, CUDA,
                     points_in_boxes_part_forward_cuda);
REGISTER_DEVICE_IMPL(points_in_boxes_all_forward_impl, CUDA,
                     points_in_boxes_all_forward_cuda);

// Declarations of the PSAMask forward/backward CUDA kernel launchers. They
// are called by psamask_forward_cuda / psamask_backward_cuda and are expected
// to be defined in another translation unit.
void PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input,
                                      Tensor output, const int num_,
                                      const int h_feature, const int w_feature,
                                      const int h_mask, const int w_mask,
                                      const int half_h_mask,
                                      const int half_w_mask);

void PSAMaskBackwardCUDAKernelLauncher(
    const int psa_type, const Tensor grad_output, Tensor grad_input,
    const int num_, const int h_feature, const int w_feature, const int h_mask,
    const int w_mask, const int half_h_mask, const int half_w_mask);

// CUDA backend of the PSAMask forward pass: forwards all arguments, in order,
// to PSAMaskForwardCUDAKernelLauncher.
void psamask_forward_cuda(const int psa_type, const Tensor input, Tensor output,
                          const int num_, const int h_feature,
                          const int w_feature, const int h_mask,
                          const int w_mask, const int half_h_mask,
                          const int half_w_mask) {
  return PSAMaskForwardCUDAKernelLauncher(
      psa_type, input, output, num_, h_feature, w_feature, h_mask, w_mask,
      half_h_mask, half_w_mask);
}

// CUDA backend of the PSAMask backward pass: forwards all arguments, in
// order, to PSAMaskBackwardCUDAKernelLauncher.
void psamask_backward_cuda(const int psa_type, const Tensor grad_output,
                           Tensor grad_input, const int num_,
                           const int h_feature, const int w_feature,
                           const int h_mask, const int w_mask,
                           const int half_h_mask, const int half_w_mask) {
  return PSAMaskBackwardCUDAKernelLauncher(
      psa_type, grad_output, grad_input, num_, h_feature, w_feature, h_mask,
      w_mask, half_h_mask, half_w_mask);
}

// Declarations of the PSAMask *_impl entry points; their signatures match
// psamask_forward_cuda / psamask_backward_cuda.
void psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,
                          const int num_, const int h_feature,
                          const int w_feature, const int h_mask,
                          const int w_mask, const int half_h_mask,
                          const int half_w_mask);

void psamask_backward_impl(const int psa_type, const Tensor grad_output,
                           Tensor grad_input, const int num_,
                           const int h_feature, const int w_feature,
                           const int h_mask, const int w_mask,
                           const int half_h_mask, const int half_w_mask);
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds each *_cuda function as the CUDA implementation of the matching
// *_impl entry point - confirm.
REGISTER_DEVICE_IMPL(psamask_forward_impl, CUDA, psamask_forward_cuda);
REGISTER_DEVICE_IMPL(psamask_backward_impl, CUDA, psamask_backward_cuda);

// Declarations of the RoIAlign forward/backward CUDA kernel launchers. They
// are called by roi_align_forward_cuda / roi_align_backward_cuda and are
// expected to be defined in another translation unit.
void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                       Tensor argmax_y, Tensor argmax_x,
                                       int aligned_height, int aligned_width,
                                       float spatial_scale, int sampling_ratio,
                                       int pool_mode, bool aligned);

void ROIAlignBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
                                        Tensor argmax_y, Tensor argmax_x,
                                        Tensor grad_input, int aligned_height,
                                        int aligned_width, float spatial_scale,
                                        int sampling_ratio, int pool_mode,
                                        bool aligned);

// CUDA backend of the RoIAlign forward pass: forwards all arguments, in
// order, to ROIAlignForwardCUDAKernelLauncher.
void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned) {
  return ROIAlignForwardCUDAKernelLauncher(
      input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,
      spatial_scale, sampling_ratio, pool_mode, aligned);
}

// CUDA backend of the RoIAlign backward pass: forwards all arguments, in
// order, to ROIAlignBackwardCUDAKernelLauncher.
void roi_align_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax_y,
                             Tensor argmax_x, Tensor grad_input,
                             int aligned_height, int aligned_width,
                             float spatial_scale, int sampling_ratio,
                             int pool_mode, bool aligned) {
  return ROIAlignBackwardCUDAKernelLauncher(
      grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,
      aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned);
}

// Declarations of the RoIAlign *_impl entry points; their signatures match
// roi_align_forward_cuda / roi_align_backward_cuda.
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned);

void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,
                             Tensor argmax_x, Tensor grad_input,
                             int aligned_height, int aligned_width,
                             float spatial_scale, int sampling_ratio,
                             int pool_mode, bool aligned);

// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds each *_cuda function as the CUDA implementation of the matching
// *_impl entry point - confirm.
REGISTER_DEVICE_IMPL(roi_align_forward_impl, CUDA, roi_align_forward_cuda);
REGISTER_DEVICE_IMPL(roi_align_backward_impl, CUDA, roi_align_backward_cuda);

// Declarations of the rotated RoIAlign forward/backward CUDA kernel launchers.
// Unlike the wrappers that call them, the launchers take the feature-map
// extents (channels, height, width) and the number of ROIs as explicit
// integer arguments. They are expected to be defined in another translation
// unit.
void ROIAlignRotatedForwardCUDAKernelLauncher(
    const at::Tensor input, const at::Tensor rois, const float spatial_scale,
    const int sampling_ratio, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, at::Tensor output);

void ROIAlignRotatedBackwardCUDAKernelLauncher(
    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
    const int sampling_ratio, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, at::Tensor bottom_grad);

// CUDA backend of the rotated RoIAlign forward pass.
//
// Reads the ROI count from rois.size(0) and the channel/height/width extents
// from dimensions 1..3 of `input`, then passes them together with the
// remaining arguments to ROIAlignRotatedForwardCUDAKernelLauncher.
// Raises "wrong roi size" via AT_ERROR unless rois.size(1) is 6.
void roi_align_rotated_forward_cuda(Tensor input, Tensor rois, Tensor output,
                                    int aligned_height, int aligned_width,
                                    float spatial_scale, int sampling_ratio,
                                    bool aligned, bool clockwise) {
  const int roi_count = rois.size(0);
  const int roi_width = rois.size(1);

  // Every ROI row must carry exactly six values.
  if (roi_width != 6) {
    AT_ERROR("wrong roi size");
  }

  const int channels = input.size(1);
  const int height = input.size(2);
  const int width = input.size(3);

  return ROIAlignRotatedForwardCUDAKernelLauncher(
      input, rois, spatial_scale, sampling_ratio, aligned, clockwise, channels,
      height, width, roi_count, aligned_height, aligned_width, output);
}

// CUDA backend of the rotated RoIAlign backward pass.
//
// Reads the ROI count from rois.size(0) and the channel/height/width extents
// from dimensions 1..3 of `bottom_grad`, then passes them together with the
// remaining arguments to ROIAlignRotatedBackwardCUDAKernelLauncher.
// Raises "wrong roi size" via AT_ERROR unless rois.size(1) is 6.
void roi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,
                                     Tensor bottom_grad, int aligned_height,
                                     int aligned_width, float spatial_scale,
                                     int sampling_ratio, bool aligned,
                                     bool clockwise) {
  const int roi_count = rois.size(0);
  const int roi_width = rois.size(1);

  // Every ROI row must carry exactly six values.
  if (roi_width != 6) {
    AT_ERROR("wrong roi size");
  }

  const int channels = bottom_grad.size(1);
  const int height = bottom_grad.size(2);
  const int width = bottom_grad.size(3);

  return ROIAlignRotatedBackwardCUDAKernelLauncher(
      top_grad, rois, spatial_scale, sampling_ratio, aligned, clockwise,
      channels, height, width, roi_count, aligned_height, aligned_width,
      bottom_grad);
}

// Declarations of the rotated RoIAlign *_impl entry points; their signatures
// match roi_align_rotated_forward_cuda / roi_align_rotated_backward_cuda.
void roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,
                                    int aligned_height, int aligned_width,
                                    float spatial_scale, int sampling_ratio,
                                    bool aligned, bool clockwise);

void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
                                     Tensor bottom_grad, int aligned_height,
                                     int aligned_width, float spatial_scale,
                                     int sampling_ratio, bool aligned,
                                     bool clockwise);
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds each *_cuda function as the CUDA implementation of the matching
// *_impl entry point - confirm.
REGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, CUDA,
                     roi_align_rotated_forward_cuda);
REGISTER_DEVICE_IMPL(roi_align_rotated_backward_impl, CUDA,
                     roi_align_rotated_backward_cuda);

// Declarations of the RiRoI rotated-align forward/backward CUDA kernel
// launchers. They take the feature-map extents, ROI count and number of
// orientations as explicit integer arguments and are expected to be defined
// in another translation unit.
void RiROIAlignRotatedForwardCUDAKernelLauncher(
    const at::Tensor features, const at::Tensor rois, const float spatial_scale,
    const int num_samples, const bool clockwise, const int channels,
    const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, const int num_orientations,
    at::Tensor output);

void RiROIAlignRotatedBackwardCUDAKernelLauncher(
    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
    const int num_samples, const bool clockwise, const int channels,
    const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, const int num_orientations,
    at::Tensor bottom_grad);

// CUDA backend of the RiRoI rotated-align forward pass.
//
// Validates the inputs, derives the launcher's integer arguments from the
// tensor sizes and calls RiROIAlignRotatedForwardCUDAKernelLauncher.
//
// Errors raised before the launcher is called:
//   - "wrong roi size" unless rois.size(1) is 6;
//   - CHECK_CONTIGUOUS failures for `features` and `rois`;
//   - a non-positive `num_orientations` (it is used as a divisor below).
void riroi_align_rotated_forward_cuda(Tensor features, Tensor rois,
                                      Tensor output, int pooled_height,
                                      int pooled_width, float spatial_scale,
                                      int num_samples, int num_orientations,
                                      bool clockwise) {
  // Number of ROIs
  int num_rois = rois.size(0);
  int size_rois = rois.size(1);
  if (size_rois != 6) {
    AT_ERROR("wrong roi size");
  }
  CHECK_CONTIGUOUS(features);
  CHECK_CONTIGUOUS(rois);
  // Integer division by zero is undefined behavior, so reject a bad divisor
  // with a regular error instead of crashing the process.
  if (num_orientations <= 0) {
    AT_ERROR("num_orientations must be positive, but got ", num_orientations);
  }
  // The channel dimension of `features` is split evenly across orientations.
  int num_channels = features.size(1) / num_orientations;
  int data_height = features.size(2);
  int data_width = features.size(3);
  RiROIAlignRotatedForwardCUDAKernelLauncher(
      features, rois, spatial_scale, num_samples, clockwise, num_channels,
      data_height, data_width, num_rois, pooled_height, pooled_width,
      num_orientations, output);
}

// CUDA backend of the RiRoI rotated-align backward pass.
//
// Validates the inputs, derives the launcher's integer arguments from the
// sizes of `rois` and `bottom_grad` and calls
// RiROIAlignRotatedBackwardCUDAKernelLauncher.
//
// Errors raised before the launcher is called:
//   - "wrong roi size" unless rois.size(1) is 6;
//   - CHECK_CONTIGUOUS failures for `top_grad` and `rois`;
//   - a non-positive `num_orientations` (it is used as a divisor below).
void riroi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,
                                       Tensor bottom_grad, int pooled_height,
                                       int pooled_width, float spatial_scale,
                                       int num_samples, int num_orientations,
                                       bool clockwise) {
  // Number of ROIs
  int num_rois = rois.size(0);
  int size_rois = rois.size(1);
  if (size_rois != 6) {
    AT_ERROR("wrong roi size");
  }
  CHECK_CONTIGUOUS(top_grad);
  CHECK_CONTIGUOUS(rois);
  // Integer division by zero is undefined behavior, so reject a bad divisor
  // with a regular error instead of crashing the process.
  if (num_orientations <= 0) {
    AT_ERROR("num_orientations must be positive, but got ", num_orientations);
  }
  // The channel dimension of `bottom_grad` is split evenly across
  // orientations.
  int num_channels = bottom_grad.size(1) / num_orientations;
  int data_height = bottom_grad.size(2);
  int data_width = bottom_grad.size(3);
  RiROIAlignRotatedBackwardCUDAKernelLauncher(
      top_grad, rois, spatial_scale, num_samples, clockwise, num_channels,
      data_height, data_width, num_rois, pooled_height, pooled_width,
      num_orientations, bottom_grad);
}

// Declarations of the RiRoI rotated-align *_impl entry points; their
// signatures match riroi_align_rotated_forward_cuda /
// riroi_align_rotated_backward_cuda.
void riroi_align_rotated_forward_impl(Tensor features, Tensor rois,
                                      Tensor output, int pooled_height,
                                      int pooled_width, float spatial_scale,
                                      int num_samples, int num_orientations,
                                      bool clockwise);

void riroi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
                                       Tensor bottom_grad, int pooled_height,
                                       int pooled_width, float spatial_scale,
                                       int num_samples, int num_orientations,
                                       bool clockwise);

// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds each *_cuda function as the CUDA implementation of the matching
// *_impl entry point - confirm.
REGISTER_DEVICE_IMPL(riroi_align_rotated_forward_impl, CUDA,
                     riroi_align_rotated_forward_cuda);
REGISTER_DEVICE_IMPL(riroi_align_rotated_backward_impl, CUDA,
                     riroi_align_rotated_backward_cuda);

// Declarations of the RoI-aware 3D pooling forward/backward CUDA kernel
// launchers. They are called by roiaware_pool3d_forward_cuda /
// roiaware_pool3d_backward_cuda and are expected to be defined in another
// translation unit.
void RoiawarePool3dForwardCUDAKernelLauncher(
    int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,
    int out_y, int out_z, const Tensor rois, const Tensor pts,
    const Tensor pts_feature, Tensor argmax, Tensor pts_idx_of_voxels,
    Tensor pooled_features, int pool_method);

void RoiawarePool3dBackwardCUDAKernelLauncher(
    int boxes_num, int out_x, int out_y, int out_z, int channels,
    int max_pts_each_voxel, const Tensor pts_idx_of_voxels, const Tensor argmax,
    const Tensor grad_out, Tensor grad_in, int pool_method);

// CUDA backend of the RoI-aware 3D pooling forward pass: forwards all
// arguments, in order, to RoiawarePool3dForwardCUDAKernelLauncher.
void roiaware_pool3d_forward_cuda(int boxes_num, int pts_num, int channels,
                                  int max_pts_each_voxel, int out_x, int out_y,
                                  int out_z, const Tensor rois,
                                  const Tensor pts, const Tensor pts_feature,
                                  Tensor argmax, Tensor pts_idx_of_voxels,
                                  Tensor pooled_features, int pool_method) {
  return RoiawarePool3dForwardCUDAKernelLauncher(
      boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
      rois, pts, pts_feature, argmax, pts_idx_of_voxels, pooled_features,
      pool_method);
}

// CUDA backend of the RoI-aware 3D pooling backward pass: forwards all
// arguments, in order, to RoiawarePool3dBackwardCUDAKernelLauncher.
void roiaware_pool3d_backward_cuda(int boxes_num, int out_x, int out_y,
                                   int out_z, int channels,
                                   int max_pts_each_voxel,
                                   const Tensor pts_idx_of_voxels,
                                   const Tensor argmax, const Tensor grad_out,
                                   Tensor grad_in, int pool_method) {
  return RoiawarePool3dBackwardCUDAKernelLauncher(
      boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel,
      pts_idx_of_voxels, argmax, grad_out, grad_in, pool_method);
}

// Declarations of the RoI-aware 3D pooling *_impl entry points; their
// signatures match roiaware_pool3d_forward_cuda /
// roiaware_pool3d_backward_cuda.
void roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels,
                                  int max_pts_each_voxel, int out_x, int out_y,
                                  int out_z, const Tensor rois,
                                  const Tensor pts, const Tensor pts_feature,
                                  Tensor argmax, Tensor pts_idx_of_voxels,
                                  Tensor pooled_features, int pool_method);

void roiaware_pool3d_backward_impl(int boxes_num, int out_x, int out_y,
                                   int out_z, int channels,
                                   int max_pts_each_voxel,
                                   const Tensor pts_idx_of_voxels,
                                   const Tensor argmax, const Tensor grad_out,
                                   Tensor grad_in, int pool_method);

// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds each *_cuda function as the CUDA implementation of the matching
// *_impl entry point - confirm.
REGISTER_DEVICE_IMPL(roiaware_pool3d_forward_impl, CUDA,
                     roiaware_pool3d_forward_cuda);
REGISTER_DEVICE_IMPL(roiaware_pool3d_backward_impl, CUDA,
                     roiaware_pool3d_backward_cuda);

// Declaration of the RoI point 3D pooling forward CUDA kernel launcher; it is
// called by roipoint_pool3d_forward_cuda and is expected to be defined in
// another translation unit.
void RoIPointPool3dForwardCUDAKernelLauncher(
    int batch_size, int pts_num, int boxes_num, int feature_in_len,
    int sampled_pts_num, const Tensor xyz, const Tensor boxes3d,
    const Tensor pts_feature, Tensor pooled_features, Tensor pooled_empty_flag);

// CUDA backend of the RoI point 3D pooling forward pass: forwards all
// arguments, in order, to RoIPointPool3dForwardCUDAKernelLauncher.
void roipoint_pool3d_forward_cuda(int batch_size, int pts_num, int boxes_num,
                                  int feature_in_len, int sampled_pts_num,
                                  const Tensor xyz, const Tensor boxes3d,
                                  const Tensor pts_feature,
                                  Tensor pooled_features,
                                  Tensor pooled_empty_flag) {
  return RoIPointPool3dForwardCUDAKernelLauncher(
      batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, xyz,
      boxes3d, pts_feature, pooled_features, pooled_empty_flag);
}

// Declaration of the RoI point 3D pooling *_impl entry point; its signature
// matches roipoint_pool3d_forward_cuda.
void roipoint_pool3d_forward_impl(int batch_size, int pts_num, int boxes_num,
                                  int feature_in_len, int sampled_pts_num,
                                  const Tensor xyz, const Tensor boxes3d,
                                  const Tensor pts_feature,
                                  Tensor pooled_features,
                                  Tensor pooled_empty_flag);
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds the *_cuda function as the CUDA implementation of the *_impl entry
// point - confirm.
REGISTER_DEVICE_IMPL(roipoint_pool3d_forward_impl, CUDA,
                     roipoint_pool3d_forward_cuda);

// Declarations of the RoIPool forward/backward CUDA kernel launchers. They
// are called by roi_pool_forward_cuda / roi_pool_backward_cuda and are
// expected to be defined in another translation unit.
void ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                      Tensor argmax, int pooled_height,
                                      int pooled_width, float spatial_scale);

void ROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
                                       Tensor argmax, Tensor grad_input,
                                       int pooled_height, int pooled_width,
                                       float spatial_scale);

// CUDA backend of the RoIPool forward pass: forwards all arguments, in order,
// to ROIPoolForwardCUDAKernelLauncher.
void roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax, int pooled_height, int pooled_width,
                           float spatial_scale) {
  return ROIPoolForwardCUDAKernelLauncher(
      input, rois, output, argmax, pooled_height, pooled_width, spatial_scale);
}

// CUDA backend of the RoIPool backward pass: forwards all arguments, in
// order, to ROIPoolBackwardCUDAKernelLauncher.
void roi_pool_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax,
                            Tensor grad_input, int pooled_height,
                            int pooled_width, float spatial_scale) {
  return ROIPoolBackwardCUDAKernelLauncher(
      grad_output, rois, argmax, grad_input, pooled_height, pooled_width,
      spatial_scale);
}

// Declarations of the RoIPool *_impl entry points; their signatures match
// roi_pool_forward_cuda / roi_pool_backward_cuda.
void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax, int pooled_height, int pooled_width,
                           float spatial_scale);
void roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,
                            Tensor grad_input, int pooled_height,
                            int pooled_width, float spatial_scale);
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds each *_cuda function as the CUDA implementation of the matching
// *_impl entry point - confirm.
REGISTER_DEVICE_IMPL(roi_pool_forward_impl, CUDA, roi_pool_forward_cuda);
REGISTER_DEVICE_IMPL(roi_pool_backward_impl, CUDA, roi_pool_backward_cuda);

// Reduction selector used by the dynamic point-to-voxel functions below; the
// enumerators have the fixed values SUM = 0, MEAN = 1 and MAX = 2.
typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;

// Declarations of the dynamic point-to-voxel CUDA kernel launchers; they are
// expected to be defined in another translation unit. The forward launcher
// returns a vector of tensors; the backward launcher writes into grad_feats
// (its only non-const reference argument).
std::vector<at::Tensor> DynamicPointToVoxelForwardCUDAKernelLauncher(
    const at::Tensor &feats, const at::Tensor &coors,
    const reduce_t reduce_type);

void DynamicPointToVoxelBackwardCUDAKernelLauncher(
    at::Tensor &grad_feats, const at::Tensor &grad_reduced_feats,
    const at::Tensor &feats, const at::Tensor &reduced_feats,
    const at::Tensor &coors_map, const at::Tensor &reduce_count,
    const reduce_t reduce_type);

// CUDA backend of the dynamic point-to-voxel forward pass: calls
// DynamicPointToVoxelForwardCUDAKernelLauncher with the same arguments and
// returns the vector of tensors it produces.
std::vector<torch::Tensor> dynamic_point_to_voxel_forward_cuda(
    const torch::Tensor &feats, const torch::Tensor &coors,
    const reduce_t reduce_type) {
  std::vector<torch::Tensor> outputs =
      DynamicPointToVoxelForwardCUDAKernelLauncher(feats, coors, reduce_type);
  return outputs;
}

// CUDA backend of the dynamic point-to-voxel backward pass: forwards all
// arguments, in order, to DynamicPointToVoxelBackwardCUDAKernelLauncher
// (this wrapper's `coors_idx` is the launcher's `coors_map` parameter).
void dynamic_point_to_voxel_backward_cuda(
    torch::Tensor &grad_feats, const torch::Tensor &grad_reduced_feats,
    const torch::Tensor &feats, const torch::Tensor &reduced_feats,
    const torch::Tensor &coors_idx, const torch::Tensor &reduce_count,
    const reduce_t reduce_type) {
  return DynamicPointToVoxelBackwardCUDAKernelLauncher(
      grad_feats, grad_reduced_feats, feats, reduced_feats, coors_idx,
      reduce_count, reduce_type);
}

// Declarations of the dynamic point-to-voxel *_impl entry points; their
// signatures match dynamic_point_to_voxel_forward_cuda /
// dynamic_point_to_voxel_backward_cuda.
std::vector<torch::Tensor> dynamic_point_to_voxel_forward_impl(
    const torch::Tensor &feats, const torch::Tensor &coors,
    const reduce_t reduce_type);

void dynamic_point_to_voxel_backward_impl(
    torch::Tensor &grad_feats, const torch::Tensor &grad_reduced_feats,
    const torch::Tensor &feats, const torch::Tensor &reduced_feats,
    const torch::Tensor &coors_idx, const torch::Tensor &reduce_count,
    const reduce_t reduce_type);

// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds each *_cuda function as the CUDA implementation of the matching
// *_impl entry point - confirm.
REGISTER_DEVICE_IMPL(dynamic_point_to_voxel_forward_impl, CUDA,
                     dynamic_point_to_voxel_forward_cuda);
REGISTER_DEVICE_IMPL(dynamic_point_to_voxel_backward_impl, CUDA,
                     dynamic_point_to_voxel_backward_cuda);

// Declarations of the five SyncBN CUDA kernel launchers (forward mean,
// forward var, forward output, backward param, backward data). Each is called
// by the sync_bn_*_cuda wrapper of the same stage and is expected to be
// defined in another translation unit.
void SyncBNForwardMeanCUDAKernelLauncher(const Tensor input, Tensor mean);

void SyncBNForwardVarCUDAKernelLauncher(const Tensor input, const Tensor mean,
                                        Tensor var);

void SyncBNForwardOutputCUDAKernelLauncher(
    const Tensor input, const Tensor mean, const Tensor var,
    Tensor running_mean, Tensor running_var, const Tensor weight,
    const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps,
    float momentum, int group_size);

void SyncBNBackwardParamCUDAKernelLauncher(const Tensor grad_output,
                                           const Tensor norm,
                                           Tensor grad_weight,
                                           Tensor grad_bias);

void SyncBNBackwardDataCUDAKernelLauncher(const Tensor grad_output,
                                          const Tensor weight,
                                          const Tensor grad_weight,
                                          const Tensor grad_bias,
                                          const Tensor norm, const Tensor std,
                                          Tensor grad_input);

// CUDA backend of the SyncBN forward-mean stage: forwards both arguments to
// SyncBNForwardMeanCUDAKernelLauncher.
void sync_bn_forward_mean_cuda(const Tensor input, Tensor mean) {
  return SyncBNForwardMeanCUDAKernelLauncher(input, mean);
}

// CUDA backend of the SyncBN forward-var stage: forwards all arguments, in
// order, to SyncBNForwardVarCUDAKernelLauncher.
void sync_bn_forward_var_cuda(const Tensor input, const Tensor mean,
                              Tensor var) {
  return SyncBNForwardVarCUDAKernelLauncher(input, mean, var);
}

// CUDA backend of the SyncBN forward-output stage: forwards all arguments, in
// order, to SyncBNForwardOutputCUDAKernelLauncher.
void sync_bn_forward_output_cuda(const Tensor input, const Tensor mean,
                                 const Tensor var, Tensor running_mean,
                                 Tensor running_var, const Tensor weight,
                                 const Tensor bias, Tensor norm, Tensor std,
                                 Tensor output, float eps, float momentum,
                                 int group_size) {
  return SyncBNForwardOutputCUDAKernelLauncher(
      input, mean, var, running_mean, running_var, weight, bias, norm, std,
      output, eps, momentum, group_size);
}

// CUDA backend of the SyncBN backward-param stage: forwards all arguments, in
// order, to SyncBNBackwardParamCUDAKernelLauncher.
void sync_bn_backward_param_cuda(const Tensor grad_output, const Tensor norm,
                                 Tensor grad_weight, Tensor grad_bias) {
  return SyncBNBackwardParamCUDAKernelLauncher(
      grad_output, norm, grad_weight, grad_bias);
}

// CUDA backend of the SyncBN backward-data stage: forwards all arguments, in
// order, to SyncBNBackwardDataCUDAKernelLauncher.
void sync_bn_backward_data_cuda(const Tensor grad_output, const Tensor weight,
                                const Tensor grad_weight,
                                const Tensor grad_bias, const Tensor norm,
                                const Tensor std, Tensor grad_input) {
  return SyncBNBackwardDataCUDAKernelLauncher(
      grad_output, weight, grad_weight, grad_bias, norm, std, grad_input);
}

// Declarations of the five SyncBN *_impl entry points; each has the same
// signature as the sync_bn_*_cuda wrapper of the same stage.
void sync_bn_forward_mean_impl(const Tensor input, Tensor mean);

void sync_bn_forward_var_impl(const Tensor input, const Tensor mean,
                              Tensor var);

void sync_bn_forward_output_impl(const Tensor input, const Tensor mean,
                                 const Tensor var, Tensor running_mean,
                                 Tensor running_var, const Tensor weight,
                                 const Tensor bias, Tensor norm, Tensor std,
                                 Tensor output, float eps, float momentum,
                                 int group_size);

void sync_bn_backward_param_impl(const Tensor grad_output, const Tensor norm,
                                 Tensor grad_weight, Tensor grad_bias);

void sync_bn_backward_data_impl(const Tensor grad_output, const Tensor weight,
                                const Tensor grad_weight,
                                const Tensor grad_bias, const Tensor norm,
                                const Tensor std, Tensor grad_input);

// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds each *_cuda function as the CUDA implementation of the matching
// *_impl entry point - confirm.
REGISTER_DEVICE_IMPL(sync_bn_forward_mean_impl, CUDA,
                     sync_bn_forward_mean_cuda);
REGISTER_DEVICE_IMPL(sync_bn_forward_var_impl, CUDA, sync_bn_forward_var_cuda);
REGISTER_DEVICE_IMPL(sync_bn_forward_output_impl, CUDA,
                     sync_bn_forward_output_cuda);
REGISTER_DEVICE_IMPL(sync_bn_backward_param_impl, CUDA,
                     sync_bn_backward_param_cuda);
REGISTER_DEVICE_IMPL(sync_bn_backward_data_impl, CUDA,
                     sync_bn_backward_data_cuda);

// Declarations of the three-interpolate forward/backward CUDA kernel
// launchers; they are expected to be defined in another translation unit.
// Note the integer parameter order differs between the two: the forward
// takes (b, c, m, n) while the backward takes (b, c, n, m).
void ThreeInterpolateForwardCUDAKernelLauncher(int b, int c, int m, int n,
                                               const Tensor points,
                                               const Tensor idx,
                                               const Tensor weight, Tensor out);

void ThreeInterpolateBackwardCUDAKernelLauncher(int b, int c, int n, int m,
                                                const Tensor grad_out,
                                                const Tensor idx,
                                                const Tensor weight,
                                                Tensor grad_points);

// CUDA backend of the three-interpolate forward pass: forwards all arguments,
// in order, to ThreeInterpolateForwardCUDAKernelLauncher.
void three_interpolate_forward_cuda(int b, int c, int m, int n,
                                    const Tensor points, const Tensor idx,
                                    const Tensor weight, Tensor out) {
  return ThreeInterpolateForwardCUDAKernelLauncher(
      b, c, m, n, points, idx, weight, out);
}

// CUDA backend of the three-interpolate backward pass: forwards all
// arguments, in order, to ThreeInterpolateBackwardCUDAKernelLauncher.
void three_interpolate_backward_cuda(int b, int c, int n, int m,
                                     const Tensor grad_out, const Tensor idx,
                                     const Tensor weight, Tensor grad_points) {
  return ThreeInterpolateBackwardCUDAKernelLauncher(
      b, c, n, m, grad_out, idx, weight, grad_points);
}

// Declarations of the three-interpolate *_impl entry points; their signatures
// match three_interpolate_forward_cuda / three_interpolate_backward_cuda.
void three_interpolate_forward_impl(int b, int c, int m, int n,
                                    const Tensor points, const Tensor idx,
                                    const Tensor weight, Tensor out);

void three_interpolate_backward_impl(int b, int c, int n, int m,
                                     const Tensor grad_out, const Tensor idx,
                                     const Tensor weight, Tensor grad_points);
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds each *_cuda function as the CUDA implementation of the matching
// *_impl entry point - confirm.
REGISTER_DEVICE_IMPL(three_interpolate_forward_impl, CUDA,
                     three_interpolate_forward_cuda);
REGISTER_DEVICE_IMPL(three_interpolate_backward_impl, CUDA,
                     three_interpolate_backward_cuda);

void ThreeNNForwardCUDAKernelLauncher(int b, int n, int m, const Tensor unknown,
                                      const Tensor known, Tensor dist2,
                                      Tensor idx);

// CUDA backend for three_nn_forward_impl.
//
// Forwards (b, n, m, unknown, known, dist2, idx) verbatim to
// ThreeNNForwardCUDAKernelLauncher and returns nothing.
void three_nn_forward_cuda(int b, int n, int m,
                           const Tensor unknown,
                           const Tensor known,
                           Tensor dist2,
                           Tensor idx) {
  ThreeNNForwardCUDAKernelLauncher(b, n, m, unknown, known, dist2, idx);
}

void three_nn_forward_impl(int b, int n, int m, const Tensor unknown,
                           const Tensor known, Tensor dist2, Tensor idx);
REGISTER_DEVICE_IMPL(three_nn_forward_impl, CUDA, three_nn_forward_cuda);

void TINShiftForwardCUDAKernelLauncher(Tensor input, Tensor shift,
                                       Tensor output);

void TINShiftBackwardCUDAKernelLauncher(Tensor grad_output, Tensor shift,
                                        Tensor grad_input);

// CUDA backend for tin_shift_forward_impl: delegates directly to
// TINShiftForwardCUDAKernelLauncher with the arguments in the same order.
void tin_shift_forward_cuda(Tensor input,
                            Tensor shift,
                            Tensor output) {
  TINShiftForwardCUDAKernelLauncher(input, shift, output);
}

// CUDA backend for tin_shift_backward_impl: delegates directly to
// TINShiftBackwardCUDAKernelLauncher with the arguments in the same order.
void tin_shift_backward_cuda(Tensor grad_output,
                             Tensor shift,
                             Tensor grad_input) {
  TINShiftBackwardCUDAKernelLauncher(grad_output, shift, grad_input);
}

void tin_shift_forward_impl(Tensor input, Tensor shift, Tensor output);
void tin_shift_backward_impl(Tensor grad_output, Tensor shift,
                             Tensor grad_input);
REGISTER_DEVICE_IMPL(tin_shift_forward_impl, CUDA, tin_shift_forward_cuda);
REGISTER_DEVICE_IMPL(tin_shift_backward_impl, CUDA, tin_shift_backward_cuda);

torch::Tensor upfirdn2d_op(torch::Tensor input, torch::Tensor filter, int upx,
                           int upy, int downx, int downy, int padx0, int padx1,
                           int pady0, int pady1, bool flip, float gain);

torch::Tensor upfirdn2d_op_impl(torch::Tensor input, torch::Tensor filter,
                                int upx, int upy, int downx, int downy,
                                int padx0, int padx1, int pady0, int pady1,
                                bool flip, float gain);
REGISTER_DEVICE_IMPL(upfirdn2d_op_impl, CUDA, upfirdn2d_op);

int HardVoxelizeForwardCUDAKernelLauncher(
    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const int max_points,
    const int max_voxels, const int NDim = 3);

int NondeterministicHardVoxelizeForwardCUDAKernelLauncher(
    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const int max_points,
    const int max_voxels, const int NDim = 3);

void DynamicVoxelizeForwardCUDAKernelLauncher(
    const at::Tensor &points, at::Tensor &coors,
    const std::vector<float> voxel_size, const std::vector<float> coors_range,
    const int NDim = 3);

// CUDA backend for hard_voxelize_forward_impl.
//
// Passes every argument through to HardVoxelizeForwardCUDAKernelLauncher and
// returns the launcher's int result unchanged. NDim is always forwarded
// explicitly, so the launcher's default value (3) is never relied on here.
int hard_voxelize_forward_cuda(const at::Tensor &points,
                               at::Tensor &voxels,
                               at::Tensor &coors,
                               at::Tensor &num_points_per_voxel,
                               const std::vector<float> voxel_size,
                               const std::vector<float> coors_range,
                               const int max_points,
                               const int max_voxels,
                               const int NDim) {
  const int launcher_result = HardVoxelizeForwardCUDAKernelLauncher(
      points, voxels, coors, num_points_per_voxel, voxel_size, coors_range,
      max_points, max_voxels, NDim);
  return launcher_result;
}

// CUDA backend for nondeterministic_hard_voxelize_forward_impl.
//
// Same shape as hard_voxelize_forward_cuda, but delegates to
// NondeterministicHardVoxelizeForwardCUDAKernelLauncher. The launcher's int
// result is returned unchanged and NDim is forwarded explicitly.
int nondeterministic_hard_voxelize_forward_cuda(
    const at::Tensor &points,
    at::Tensor &voxels,
    at::Tensor &coors,
    at::Tensor &num_points_per_voxel,
    const std::vector<float> voxel_size,
    const std::vector<float> coors_range,
    const int max_points,
    const int max_voxels,
    const int NDim) {
  const int launcher_result =
      NondeterministicHardVoxelizeForwardCUDAKernelLauncher(
          points, voxels, coors, num_points_per_voxel, voxel_size,
          coors_range, max_points, max_voxels, NDim);
  return launcher_result;
}

// CUDA backend for dynamic_voxelize_forward_impl.
//
// Forwards (points, coors, voxel_size, coors_range, NDim) verbatim to
// DynamicVoxelizeForwardCUDAKernelLauncher. Unlike the hard-voxelize
// wrappers this one returns nothing.
void dynamic_voxelize_forward_cuda(const at::Tensor &points,
                                   at::Tensor &coors,
                                   const std::vector<float> voxel_size,
                                   const std::vector<float> coors_range,
                                   const int NDim) {
  DynamicVoxelizeForwardCUDAKernelLauncher(
      points, coors, voxel_size, coors_range, NDim);
}

int hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,
                               at::Tensor &coors,
                               at::Tensor &num_points_per_voxel,
                               const std::vector<float> voxel_size,
                               const std::vector<float> coors_range,
                               const int max_points, const int max_voxels,
                               const int NDim);

int nondeterministic_hard_voxelize_forward_impl(
    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const int max_points,
    const int max_voxels, const int NDim);

void dynamic_voxelize_forward_impl(const at::Tensor &points, at::Tensor &coors,
                                   const std::vector<float> voxel_size,
                                   const std::vector<float> coors_range,
                                   const int NDim);

REGISTER_DEVICE_IMPL(hard_voxelize_forward_impl, CUDA,
                     hard_voxelize_forward_cuda);
REGISTER_DEVICE_IMPL(nondeterministic_hard_voxelize_forward_impl, CUDA,
                     nondeterministic_hard_voxelize_forward_cuda);
REGISTER_DEVICE_IMPL(dynamic_voxelize_forward_impl, CUDA,
                     dynamic_voxelize_forward_cuda);

void RotatedFeatureAlignForwardCUDAKernelLauncher(const Tensor features,
                                                  const Tensor best_bboxes,
                                                  const float spatial_scale,
                                                  const int points,
                                                  Tensor output);

void RotatedFeatureAlignBackwardCUDAKernelLauncher(const Tensor top_grad,
                                                   const Tensor best_bboxes,
                                                   const float spatial_scale,
                                                   const int points,
                                                   Tensor bottom_grad);

// CUDA backend for rotated_feature_align_forward_impl.
//
// Delegates to RotatedFeatureAlignForwardCUDAKernelLauncher, keeping the
// argument order (features, best_bboxes, spatial_scale, points, output).
void rotated_feature_align_forward_cuda(const Tensor features,
                                        const Tensor best_bboxes,
                                        const float spatial_scale,
                                        const int points,
                                        Tensor output) {
  RotatedFeatureAlignForwardCUDAKernelLauncher(
      features, best_bboxes, spatial_scale, points, output);
}

// CUDA backend for rotated_feature_align_backward_impl.
//
// Delegates to RotatedFeatureAlignBackwardCUDAKernelLauncher, keeping the
// argument order (top_grad, best_bboxes, spatial_scale, points, bottom_grad).
void rotated_feature_align_backward_cuda(const Tensor top_grad,
                                         const Tensor best_bboxes,
                                         const float spatial_scale,
                                         const int points,
                                         Tensor bottom_grad) {
  RotatedFeatureAlignBackwardCUDAKernelLauncher(top_grad, best_bboxes,
                                                spatial_scale, points,
                                                bottom_grad);
}

void rotated_feature_align_forward_impl(const Tensor features,
                                        const Tensor best_bboxes,
                                        const float spatial_scale,
                                        const int points, Tensor output);

void rotated_feature_align_backward_impl(const Tensor top_grad,
                                         const Tensor best_bboxes,
                                         const float spatial_scale,
                                         const int points, Tensor bottom_grad);

REGISTER_DEVICE_IMPL(rotated_feature_align_forward_impl, CUDA,
                     rotated_feature_align_forward_cuda);
REGISTER_DEVICE_IMPL(rotated_feature_align_backward_impl, CUDA,
                     rotated_feature_align_backward_cuda);

void PointsInPolygonsForwardCUDAKernelLauncher(const at::Tensor points,
                                               const at::Tensor polygons,
                                               const int rows, const int cols,
                                               at::Tensor output);

// CUDA backend for points_in_polygons_forward_impl.
//
// This wrapper receives `output` as its third parameter, but
// PointsInPolygonsForwardCUDAKernelLauncher expects it last, after `rows`
// and `cols`. The call below performs that reordering; nothing else is
// changed.
void points_in_polygons_forward_cuda(const Tensor points,
                                     const Tensor polygons,
                                     Tensor output,
                                     const int rows,
                                     const int cols) {
  PointsInPolygonsForwardCUDAKernelLauncher(
      points, polygons, rows, cols, output);
}

void points_in_polygons_forward_impl(const Tensor points, const Tensor polygons,
                                     Tensor output, const int rows,
                                     const int cols);

REGISTER_DEVICE_IMPL(points_in_polygons_forward_impl, CUDA,
                     points_in_polygons_forward_cuda);

torch::Tensor IndiceMaxpoolForwardCUDAKernelLauncher(torch::Tensor features,
                                                     torch::Tensor indicePairs,
                                                     torch::Tensor indiceNum,
                                                     int64_t numAct);

// CUDA backend for indice_maxpool_forward_impl.
//
// Hands (features, indicePairs, indiceNum, numAct) to
// IndiceMaxpoolForwardCUDAKernelLauncher and returns the tensor it produces.
torch::Tensor indice_maxpool_forward_cuda(torch::Tensor features,
                                          torch::Tensor indicePairs,
                                          torch::Tensor indiceNum,
                                          int64_t numAct) {
  torch::Tensor result = IndiceMaxpoolForwardCUDAKernelLauncher(
      features, indicePairs, indiceNum, numAct);
  return result;
}

torch::Tensor indice_maxpool_forward_impl(torch::Tensor features,
                                          torch::Tensor indicePairs,
                                          torch::Tensor indiceNum,
                                          int64_t numAct);
REGISTER_DEVICE_IMPL(indice_maxpool_forward_impl, CUDA,
                     indice_maxpool_forward_cuda);

torch::Tensor IndiceMaxpoolBackwardCUDAKernelLauncher(torch::Tensor features,
                                                      torch::Tensor outFeatures,
                                                      torch::Tensor outGrad,
                                                      torch::Tensor indicePairs,
                                                      torch::Tensor indiceNum);

// CUDA backend for indice_maxpool_backward_impl.
//
// Hands (features, outFeatures, outGrad, indicePairs, indiceNum) to
// IndiceMaxpoolBackwardCUDAKernelLauncher and returns the tensor it produces.
torch::Tensor indice_maxpool_backward_cuda(torch::Tensor features,
                                           torch::Tensor outFeatures,
                                           torch::Tensor outGrad,
                                           torch::Tensor indicePairs,
                                           torch::Tensor indiceNum) {
  torch::Tensor result = IndiceMaxpoolBackwardCUDAKernelLauncher(
      features, outFeatures, outGrad, indicePairs, indiceNum);
  return result;
}

torch::Tensor indice_maxpool_backward_impl(torch::Tensor features,
                                           torch::Tensor outFeatures,
                                           torch::Tensor outGrad,
                                           torch::Tensor indicePairs,
                                           torch::Tensor indiceNum);

// Register indice_maxpool_backward_cuda as the CUDA backend of
// indice_maxpool_backward_impl. The statement is terminated with ';' like
// the other REGISTER_DEVICE_IMPL invocations in this file, so the line no
// longer reads as an unterminated statement to formatters and readers.
REGISTER_DEVICE_IMPL(indice_maxpool_backward_impl, CUDA,
                     indice_maxpool_backward_cuda);

torch::Tensor IndiceConvForwardCUDAKernelLauncher(
    torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,
    torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse,
    int64_t _subM);

// CUDA backend for indice_conv_forward_impl.
//
// Forwards all seven arguments, in order, to
// IndiceConvForwardCUDAKernelLauncher and returns the resulting tensor.
torch::Tensor indice_conv_forward_cuda(torch::Tensor features,
                                       torch::Tensor filters,
                                       torch::Tensor indicePairs,
                                       torch::Tensor indiceNum,
                                       int64_t numActOut,
                                       int64_t _inverse,
                                       int64_t _subM) {
  torch::Tensor result = IndiceConvForwardCUDAKernelLauncher(
      features, filters, indicePairs, indiceNum, numActOut, _inverse, _subM);
  return result;
}

torch::Tensor indice_conv_forward_impl(torch::Tensor features,
                                       torch::Tensor filters,
                                       torch::Tensor indicePairs,
                                       torch::Tensor indiceNum,
                                       int64_t numActOut, int64_t _inverse,
                                       int64_t _subM);

REGISTER_DEVICE_IMPL(indice_conv_forward_impl, CUDA, indice_conv_forward_cuda);

std::vector<torch::Tensor> IndiceConvBackwardCUDAKernelLauncher(
    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
    int64_t _subM);

// CUDA backend for indice_conv_backward_impl.
//
// Forwards all seven arguments, in order, to
// IndiceConvBackwardCUDAKernelLauncher and returns the vector of tensors the
// launcher produces.
std::vector<torch::Tensor> indice_conv_backward_cuda(
    torch::Tensor features,
    torch::Tensor filters,
    torch::Tensor outGrad,
    torch::Tensor indicePairs,
    torch::Tensor indiceNum,
    int64_t _inverse,
    int64_t _subM) {
  return IndiceConvBackwardCUDAKernelLauncher(features, filters, outGrad,
                                              indicePairs, indiceNum,
                                              _inverse, _subM);
}

std::vector<torch::Tensor> indice_conv_backward_impl(
    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
    int64_t _subM);

REGISTER_DEVICE_IMPL(indice_conv_backward_impl, CUDA,
                     indice_conv_backward_cuda);

torch::Tensor FusedIndiceConvBatchnormCUDAKernelLauncher(
    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
    int64_t _inverse, int64_t _subM);

// CUDA backend for fused_indice_conv_batchnorm_forward_impl.
//
// Forwards all eight arguments, in order, to
// FusedIndiceConvBatchnormCUDAKernelLauncher and returns the resulting
// tensor.
torch::Tensor fused_indice_conv_batchnorm_forward_cuda(
    torch::Tensor features,
    torch::Tensor filters,
    torch::Tensor bias,
    torch::Tensor indicePairs,
    torch::Tensor indiceNum,
    int64_t numActOut,
    int64_t _inverse,
    int64_t _subM) {
  torch::Tensor result = FusedIndiceConvBatchnormCUDAKernelLauncher(
      features, filters, bias, indicePairs, indiceNum, numActOut, _inverse,
      _subM);
  return result;
}

torch::Tensor fused_indice_conv_batchnorm_forward_impl(
    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
    int64_t _inverse, int64_t _subM);

// Register fused_indice_conv_batchnorm_forward_cuda as the CUDA backend of
// fused_indice_conv_batchnorm_forward_impl. The statement is terminated with
// ';' like the other REGISTER_DEVICE_IMPL invocations in this file.
REGISTER_DEVICE_IMPL(fused_indice_conv_batchnorm_forward_impl, CUDA,
                     fused_indice_conv_batchnorm_forward_cuda);

void MinAreaPolygonsCUDAKernelLauncher(const Tensor pointsets, Tensor polygons);

// CUDA backend for min_area_polygons_impl: delegates directly to
// MinAreaPolygonsCUDAKernelLauncher with (pointsets, polygons).
void min_area_polygons_cuda(const Tensor pointsets,
                            Tensor polygons) {
  MinAreaPolygonsCUDAKernelLauncher(pointsets, polygons);
}

void min_area_polygons_impl(const Tensor pointsets, Tensor polygons);

REGISTER_DEVICE_IMPL(min_area_polygons_impl, CUDA, min_area_polygons_cuda);

void ActiveRotatedFilterForwardCUDAKernelLauncher(const Tensor input,
                                                  const Tensor indices,
                                                  Tensor output);

void ActiveRotatedFilterBackwardCUDAKernelLauncher(const Tensor grad_out,
                                                   const Tensor indices,
                                                   Tensor grad_in);

// CUDA backend for active_rotated_filter_forward_impl: delegates directly to
// ActiveRotatedFilterForwardCUDAKernelLauncher with (input, indices, output).
void active_rotated_filter_forward_cuda(const Tensor input,
                                        const Tensor indices,
                                        Tensor output) {
  ActiveRotatedFilterForwardCUDAKernelLauncher(input, indices, output);
}

// CUDA backend for active_rotated_filter_backward_impl: delegates directly to
// ActiveRotatedFilterBackwardCUDAKernelLauncher with
// (grad_out, indices, grad_in).
void active_rotated_filter_backward_cuda(const Tensor grad_out,
                                         const Tensor indices,
                                         Tensor grad_in) {
  ActiveRotatedFilterBackwardCUDAKernelLauncher(grad_out, indices, grad_in);
}

void active_rotated_filter_forward_impl(const Tensor input,
                                        const Tensor indices, Tensor output);

void active_rotated_filter_backward_impl(const Tensor grad_out,
                                         const Tensor indices, Tensor grad_in);

REGISTER_DEVICE_IMPL(active_rotated_filter_forward_impl, CUDA,
                     active_rotated_filter_forward_cuda);
REGISTER_DEVICE_IMPL(active_rotated_filter_backward_impl, CUDA,
                     active_rotated_filter_backward_cuda);

void ConvexIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,
                                 Tensor ious);

void ConvexGIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,
                                  Tensor output);

// CUDA backend for convex_iou_impl: delegates directly to
// ConvexIoUCUDAKernelLauncher with (pointsets, polygons, ious).
void convex_iou_cuda(const Tensor pointsets,
                     const Tensor polygons,
                     Tensor ious) {
  ConvexIoUCUDAKernelLauncher(pointsets, polygons, ious);
}

// CUDA backend for convex_giou_impl: delegates directly to
// ConvexGIoUCUDAKernelLauncher with (pointsets, polygons, output).
void convex_giou_cuda(const Tensor pointsets,
                      const Tensor polygons,
                      Tensor output) {
  ConvexGIoUCUDAKernelLauncher(pointsets, polygons, output);
}

void convex_iou_impl(const Tensor pointsets, const Tensor polygons,
                     Tensor ious);

void convex_giou_impl(const Tensor pointsets, const Tensor polygons,
                      Tensor output);

REGISTER_DEVICE_IMPL(convex_iou_impl, CUDA, convex_iou_cuda);
REGISTER_DEVICE_IMPL(convex_giou_impl, CUDA, convex_giou_cuda);

Tensor DiffIoURotatedSortVerticesCUDAKernelLauncher(Tensor vertices,
                                                    Tensor mask,
                                                    Tensor num_valid);

// CUDA backend for diff_iou_rotated_sort_vertices_forward_impl.
//
// Hands (vertices, mask, num_valid) to
// DiffIoURotatedSortVerticesCUDAKernelLauncher and returns the tensor it
// produces.
Tensor diff_iou_rotated_sort_vertices_forward_cuda(Tensor vertices,
                                                   Tensor mask,
                                                   Tensor num_valid) {
  Tensor result =
      DiffIoURotatedSortVerticesCUDAKernelLauncher(vertices, mask, num_valid);
  return result;
}

Tensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,
                                                   Tensor num_valid);

REGISTER_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl, CUDA,
                     diff_iou_rotated_sort_vertices_forward_cuda);

void ChamferDistanceForwardCUDAKernelLauncher(
    const Tensor xyz1, const Tensor xyz2, const Tensor dist1,
    const Tensor dist2, const Tensor idx1, const Tensor idx2);

void ChamferDistanceBackwardCUDAKernelLauncher(
    const Tensor xyz1, const Tensor xyz2, Tensor idx1, Tensor idx2,
    Tensor grad_dist1, Tensor grad_dist2, Tensor grad_xyz1, Tensor grad_xyz2);

// CUDA backend for chamfer_distance_forward_impl.
//
// Forwards (xyz1, xyz2, dist1, dist2, idx1, idx2) verbatim to
// ChamferDistanceForwardCUDAKernelLauncher and returns nothing.
void chamfer_distance_forward_cuda(const Tensor xyz1,
                                   const Tensor xyz2,
                                   const Tensor dist1,
                                   const Tensor dist2,
                                   const Tensor idx1,
                                   const Tensor idx2) {
  ChamferDistanceForwardCUDAKernelLauncher(
      xyz1, xyz2, dist1, dist2, idx1, idx2);
}

// CUDA backend for chamfer_distance_backward_impl.
//
// Forwards its eight tensors, in order, to
// ChamferDistanceBackwardCUDAKernelLauncher. The parameter names here
// (graddist*, gradxyz*) correspond positionally to the launcher's
// grad_dist* / grad_xyz* parameters.
void chamfer_distance_backward_cuda(const Tensor xyz1,
                                    const Tensor xyz2,
                                    Tensor idx1,
                                    Tensor idx2,
                                    Tensor graddist1,
                                    Tensor graddist2,
                                    Tensor gradxyz1,
                                    Tensor gradxyz2) {
  ChamferDistanceBackwardCUDAKernelLauncher(
      xyz1, xyz2, idx1, idx2, graddist1, graddist2, gradxyz1, gradxyz2);
}

void chamfer_distance_forward_impl(const Tensor xyz1, const Tensor xyz2,
                                   const Tensor dist1, const Tensor dist2,
                                   const Tensor idx1, const Tensor idx2);

void chamfer_distance_backward_impl(const Tensor xyz1, const Tensor xyz2,
                                    Tensor idx1, Tensor idx2, Tensor graddist1,
                                    Tensor graddist2, Tensor gradxyz1,
                                    Tensor gradxyz2);

REGISTER_DEVICE_IMPL(chamfer_distance_forward_impl, CUDA,
                     chamfer_distance_forward_cuda);
REGISTER_DEVICE_IMPL(chamfer_distance_backward_impl, CUDA,
                     chamfer_distance_backward_cuda);

void PrROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
                                        Tensor output, int pooled_height,
                                        int pooled_width, float spatial_scale);

void PrROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
                                         Tensor grad_input, int pooled_height,
                                         int pooled_width, float spatial_scale);

void PrROIPoolCoorBackwardCUDAKernelLauncher(
    Tensor output, Tensor grad_output, Tensor input, Tensor rois,
    Tensor grad_rois, int pooled_height, int pooled_width, float spatial_scale);

// CUDA backend for prroi_pool_forward_impl.
//
// Forwards (input, rois, output, pooled_height, pooled_width, spatial_scale)
// verbatim to PrROIPoolForwardCUDAKernelLauncher.
void prroi_pool_forward_cuda(Tensor input,
                             Tensor rois,
                             Tensor output,
                             int pooled_height,
                             int pooled_width,
                             float spatial_scale) {
  PrROIPoolForwardCUDAKernelLauncher(
      input, rois, output, pooled_height, pooled_width, spatial_scale);
}

// CUDA backend for prroi_pool_backward_impl.
//
// Forwards (grad_output, rois, grad_input, pooled_height, pooled_width,
// spatial_scale) verbatim to PrROIPoolBackwardCUDAKernelLauncher.
void prroi_pool_backward_cuda(Tensor grad_output,
                              Tensor rois,
                              Tensor grad_input,
                              int pooled_height,
                              int pooled_width,
                              float spatial_scale) {
  PrROIPoolBackwardCUDAKernelLauncher(
      grad_output, rois, grad_input, pooled_height, pooled_width,
      spatial_scale);
}

// CUDA backend for prroi_pool_coor_backward_impl.
//
// Forwards (output, grad_output, input, rois, grad_rois, pooled_height,
// pooled_width, spatial_scale) verbatim to
// PrROIPoolCoorBackwardCUDAKernelLauncher.
void prroi_pool_coor_backward_cuda(Tensor output,
                                   Tensor grad_output,
                                   Tensor input,
                                   Tensor rois,
                                   Tensor grad_rois,
                                   int pooled_height,
                                   int pooled_width,
                                   float spatial_scale) {
  PrROIPoolCoorBackwardCUDAKernelLauncher(
      output, grad_output, input, rois, grad_rois, pooled_height,
      pooled_width, spatial_scale);
}

void prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
                             int pooled_height, int pooled_width,
                             float spatial_scale);
void prroi_pool_backward_impl(Tensor grad_output, Tensor rois,
                              Tensor grad_input, int pooled_height,
                              int pooled_width, float spatial_scale);
void prroi_pool_coor_backward_impl(Tensor output, Tensor grad_output,
                                   Tensor input, Tensor rois, Tensor grad_rois,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale);
REGISTER_DEVICE_IMPL(prroi_pool_forward_impl, CUDA, prroi_pool_forward_cuda);
REGISTER_DEVICE_IMPL(prroi_pool_backward_impl, CUDA, prroi_pool_backward_cuda);
REGISTER_DEVICE_IMPL(prroi_pool_coor_backward_impl, CUDA,
                     prroi_pool_coor_backward_cuda);

void BezierAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois,
                                          Tensor output, int aligned_height,
                                          int aligned_width,
                                          float spatial_scale,
                                          int sampling_ratio, bool aligned);

void BezierAlignBackwardCUDAKernelLauncher(
    Tensor grad_output, Tensor rois, Tensor grad_input, int aligned_height,
    int aligned_width, float spatial_scale, int sampling_ratio, bool aligned);

void bezier_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                               int aligned_height, int aligned_width,
                               float spatial_scale, int sampling_ratio,
                               bool aligned);

void bezier_align_backward_impl(Tensor grad_output, Tensor rois,
                                Tensor grad_input, int aligned_height,
                                int aligned_width, float spatial_scale,
                                int sampling_ratio, bool aligned);

REGISTER_DEVICE_IMPL(bezier_align_forward_impl, CUDA,
                     BezierAlignForwardCUDAKernelLauncher);
REGISTER_DEVICE_IMPL(bezier_align_backward_impl, CUDA,
                     BezierAlignBackwardCUDAKernelLauncher);


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/deform_conv_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "deform_conv_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

// Launches deformable_im2col_gpu_kernel on the current CUDA stream.
//
// The column-buffer spatial size is derived from the input size, padding,
// dilation and stride as
//   out = (in + 2 * pad - (dilation * (ksize - 1) + 1)) / stride + 1
// using integer arithmetic. The work-item count handed both to GET_BLOCKS
// and to the kernel is channels * height_col * width_col * parallel_imgs.
//
// The kernel is dispatched on data_im's scalar type (floating types and
// half); data_offset and data_col are accessed with that same scalar type.
// Launch errors are surfaced through AT_CUDA_CHECK(cudaGetLastError()).
void deformable_im2col_cuda(Tensor data_im, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor data_col) {
  // num_axes should be smaller than block size
  // TODO: check parallel_imgs is correctly passed in

  // Extent covered by the kernel window once dilation is applied.
  const int dilated_ksize_h = dilation_h * (ksize_h - 1) + 1;
  const int dilated_ksize_w = dilation_w * (ksize_w - 1) + 1;
  const int height_col = (height + 2 * pad_h - dilated_ksize_h) / stride_h + 1;
  const int width_col = (width + 2 * pad_w - dilated_ksize_w) / stride_w + 1;

  const int num_kernels = channels * height_col * width_col * parallel_imgs;
  const int channel_per_deformable_group = channels / deformable_group;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_im.scalar_type(), "deformable_im2col_gpu", ([&] {
        const scalar_t *im_ptr = data_im.data_ptr<scalar_t>();
        const scalar_t *offset_ptr = data_offset.data_ptr<scalar_t>();
        scalar_t *col_ptr = data_col.data_ptr<scalar_t>();

        deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels),
                                       THREADS_PER_BLOCK, 0, stream>>>(
            num_kernels, im_ptr, offset_ptr, height, width, ksize_h, ksize_w,
            pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
            channel_per_deformable_group, parallel_imgs, channels,
            deformable_group, height_col, width_col, col_ptr);
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}

// Launches deformable_col2im_gpu_kernel on the current CUDA stream.
//
// The column-buffer spatial size is derived as
//   out = (in + 2 * pad - (dilation * (ksize - 1) + 1)) / stride + 1
// using integer arithmetic. The work-item count handed both to GET_BLOCKS
// and to the kernel is
//   channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs.
//
// The kernel is dispatched on data_col's scalar type (floating types and
// half); data_offset and grad_im are accessed with that same scalar type.
// Launch errors are surfaced through AT_CUDA_CHECK(cudaGetLastError()).
void deformable_col2im_cuda(Tensor data_col, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor grad_im) {
  // TODO: make sure parallel_imgs is passed in correctly

  // Extent covered by the kernel window once dilation is applied.
  const int dilated_ksize_h = dilation_h * (ksize_h - 1) + 1;
  const int dilated_ksize_w = dilation_w * (ksize_w - 1) + 1;
  const int height_col = (height + 2 * pad_h - dilated_ksize_h) / stride_h + 1;
  const int width_col = (width + 2 * pad_w - dilated_ksize_w) / stride_w + 1;

  const int num_kernels =
      channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;
  const int channel_per_deformable_group = channels / deformable_group;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
        const scalar_t *col_ptr = data_col.data_ptr<scalar_t>();
        const scalar_t *offset_ptr = data_offset.data_ptr<scalar_t>();
        scalar_t *grad_im_ptr = grad_im.data_ptr<scalar_t>();

        deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels),
                                       THREADS_PER_BLOCK, 0, stream>>>(
            num_kernels, col_ptr, offset_ptr, channels, height, width,
            ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
            dilation_w, channel_per_deformable_group, parallel_imgs,
            deformable_group, height_col, width_col, grad_im_ptr);
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}

// Launches deformable_col2im_coord_gpu_kernel on the current CUDA stream.
//
// The column-buffer spatial size is derived as
//   out = (in + 2 * pad - (dilation * (ksize - 1) + 1)) / stride + 1
// using integer arithmetic. The work-item count handed both to GET_BLOCKS
// and to the kernel is
//   height_col * width_col * 2 * ksize_h * ksize_w * deformable_group *
//   parallel_imgs.
// Unlike the im2col/col2im launchers, the per-group channel count passed to
// the kernel here is channels * ksize_h * ksize_w / deformable_group.
//
// The kernel is dispatched on data_col's scalar type (floating types and
// half); data_im, data_offset and grad_offset are accessed with that same
// scalar type. Launch errors are surfaced through
// AT_CUDA_CHECK(cudaGetLastError()).
void deformable_col2im_coord_cuda(
    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
    const int height, const int width, const int ksize_h, const int ksize_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int parallel_imgs,
    const int deformable_group, Tensor grad_offset) {
  // Extent covered by the kernel window once dilation is applied.
  const int dilated_ksize_h = dilation_h * (ksize_h - 1) + 1;
  const int dilated_ksize_w = dilation_w * (ksize_w - 1) + 1;
  const int height_col = (height + 2 * pad_h - dilated_ksize_h) / stride_h + 1;
  const int width_col = (width + 2 * pad_w - dilated_ksize_w) / stride_w + 1;

  const int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w *
                          deformable_group * parallel_imgs;
  const int channel_per_deformable_group =
      channels * ksize_h * ksize_w / deformable_group;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
        const scalar_t *col_ptr = data_col.data_ptr<scalar_t>();
        const scalar_t *im_ptr = data_im.data_ptr<scalar_t>();
        const scalar_t *offset_ptr = data_offset.data_ptr<scalar_t>();
        scalar_t *grad_offset_ptr = grad_offset.data_ptr<scalar_t>();

        deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels),
                                             THREADS_PER_BLOCK, 0, stream>>>(
            num_kernels, col_ptr, im_ptr, offset_ptr, channels, height, width,
            ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
            dilation_w, channel_per_deformable_group, parallel_imgs,
            2 * ksize_h * ksize_w * deformable_group, deformable_group,
            height_col, width_col, grad_offset_ptr);
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/deform_roi_pool_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "deform_roi_pool_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

// Launches deform_roi_pool_forward_cuda_kernel for the given tensors.
//
// - The work-item count is output.numel(); it sizes the grid via GET_BLOCKS
//   and is passed to the kernel as its first argument.
// - channels / height / width are read from dimensions 1, 2 and 3 of input.
// - The CUDA device is switched to input's device for the duration of the
//   call, and the kernel runs on that device's current stream.
// - The kernel is dispatched on input's scalar type (floating types and
//   half); rois, offset and output are accessed with that same type, and
//   spatial_scale / gamma are cast to it before being passed.
// Launch errors are surfaced through AT_CUDA_CHECK(cudaGetLastError()).
void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
                                            Tensor offset, Tensor output,
                                            int pooled_height, int pooled_width,
                                            float spatial_scale,
                                            int sampling_ratio, float gamma) {
  const int num_outputs = output.numel();
  const int channels = input.size(1);
  const int height = input.size(2);
  const int width = input.size(3);

  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "deform_roi_pool_forward_cuda_kernel", [&] {
        const scalar_t scale = static_cast<scalar_t>(spatial_scale);
        const scalar_t gamma_cast = static_cast<scalar_t>(gamma);

        deform_roi_pool_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(num_outputs), THREADS_PER_BLOCK, 0, stream>>>(
                num_outputs, input.data_ptr<scalar_t>(),
                rois.data_ptr<scalar_t>(), offset.data_ptr<scalar_t>(),
                output.data_ptr<scalar_t>(), pooled_height, pooled_width,
                scale, sampling_ratio, gamma_cast, channels, height, width);
      });

  AT_CUDA_CHECK(cudaGetLastError());
}

// Launches deform_roi_pool_backward_cuda_kernel on the current CUDA stream of
// `grad_output`'s device.
//
// The grid is sized from grad_output.numel(); channels/height/width are taken
// from dimensions 1, 2 and 3 of `grad_input`. All tensors are accessed with
// the scalar type dispatched from `grad_output` (float, double or half), and
// `spatial_scale` / `gamma` are converted to that type before being handed to
// the kernel. `grad_input` and `grad_offset` are the buffers passed to the
// kernel for the results.
//
// Any launch error is reported through AT_CUDA_CHECK.
void DeformRoIPoolBackwardCUDAKernelLauncher(
    Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
    Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
    float spatial_scale, int sampling_ratio, float gamma) {
  int output_size = grad_output.numel();
  // With no incoming gradient elements there is nothing to propagate, so
  // grad_input / grad_offset are left exactly as the caller provided them.
  // Returning here also guarantees we never request a grid sized from zero
  // elements; a zero-block grid is an invalid CUDA launch configuration.
  if (output_size == 0) return;

  int channels = grad_input.size(1);
  int height = grad_input.size(2);
  int width = grad_input.size(3);

  at::cuda::CUDAGuard device_guard(grad_output.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "deform_roi_pool_backward_cuda_kernel", [&] {
        deform_roi_pool_backward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, grad_output.data_ptr<scalar_t>(),
                input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(),
                offset.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),
                grad_offset.data_ptr<scalar_t>(), pooled_height, pooled_width,
                static_cast<scalar_t>(spatial_scale), sampling_ratio,
                static_cast<scalar_t>(gamma), channels, height, width);
      });

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/diff_iou_rotated_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Adapted from
// https://github.com/lilanxiao/Rotated_IoU/cuda_op/sort_vert_kernel.cu  # noqa
#include "diff_iou_rotated_cuda_kernel.cuh"
#include "pytorch_cpp_helper.hpp"
#include "pytorch_cuda_helper.hpp"

// Launches diff_iou_rotated_sort_vertices_forward_cuda_kernel and returns the
// index tensor it fills.
//
// `vertices` is read as float, `mask` as bool and `num_valid` as int; all
// three must be contiguous CUDA tensors (enforced by the CHECK_* macros).
// With b, n, m = the first three dimensions of `vertices`, the result is a
// zero-initialised int tensor of shape {b, n, MAX_NUM_VERT_IDX} on the same
// device. The kernel runs with one block per batch entry (b blocks) on the
// current stream.
at::Tensor DiffIoURotatedSortVerticesCUDAKernelLauncher(at::Tensor vertices,
                                                        at::Tensor mask,
                                                        at::Tensor num_valid) {
  at::cuda::CUDAGuard device_guard(vertices.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  CHECK_CONTIGUOUS(vertices);
  CHECK_CONTIGUOUS(mask);
  CHECK_CONTIGUOUS(num_valid);
  CHECK_CUDA(vertices);
  CHECK_CUDA(mask);
  CHECK_CUDA(num_valid);

  int b = vertices.size(0);
  int n = vertices.size(1);
  int m = vertices.size(2);
  at::Tensor idx =
      torch::zeros({b, n, MAX_NUM_VERT_IDX},
                   at::device(vertices.device()).dtype(at::ScalarType::Int));

  // An empty result means there is nothing for the kernel to fill. Skipping
  // the launch avoids requesting a zero-block grid (b == 0), which is an
  // invalid CUDA launch configuration.
  // NOTE(review): assumes the kernel's only output is `idx` - confirm.
  if (idx.numel() == 0) return idx;

  diff_iou_rotated_sort_vertices_forward_cuda_kernel<<<b, opt_n_thread(n), 0,
                                                       stream>>>(
      b, n, m, vertices.data_ptr<float>(), mask.data_ptr<bool>(),
      num_valid.data_ptr<int>(), idx.data_ptr<int>());
  AT_CUDA_CHECK(cudaGetLastError());

  return idx;
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/filtered_lrelu.cu
================================================
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include <c10/util/Half.h>
#include <cuda_runtime.h>
#include <torch/types.h>

#include <cstdint>

#include "pytorch_cuda_helper.hpp"
#include "pytorch_device_registry.hpp"

//------------------------------------------------------------------------
// CUDA kernel parameters.

// Parameter block that is passed by value to setup_filters_kernel and to
// filtered_lrelu_kernel. Field order and types define the layout seen by the
// kernels, so they must not be rearranged.
struct filtered_lrelu_kernel_params {
  // These parameters decide which kernel to use.
  int up;        // upsampling ratio (1, 2, 4)
  int down;      // downsampling ratio (1, 2, 4)
  int2 fuShape;  // Upsampling filter shape: [size, 1] | [size, size].
                 // setup_filters_kernel treats y <= 0 as a 1-D filter.
  int2 fdShape;  // Downsampling filter shape: [size, 1] | [size, size].
                 // setup_filters_kernel treats y <= 0 as a 1-D filter.

  int _dummy;  // Alignment.

  // Rest of the parameters.
  const void *x;     // Input tensor.
  void *y;           // Output tensor.
  const void *b;     // Bias tensor.
  unsigned char *s;  // Sign tensor in/out. NULL if unused.
  const float *fu;   // Upsampling filter.
  const float *fd;   // Downsampling filter.

  int2 pad0;    // Left/top padding.
  float gain;   // Additional gain factor.
  float slope;  // Leaky ReLU slope on negative side.
  float clamp;  // Clamp after nonlinearity.
  int flip;     // Filter kernel flip for gradient computation. Nonzero: the
                // setup kernel copies taps in storage order; zero: reversed.

  int tilesXdim;  // Original number of horizontal output tiles.
  int tilesXrep;  // Number of horizontal tiles per CTA.
  int blockZofs;  // Block z offset to support large minibatch, channel
                  // dimensions.

  int4 xShape;  // [width, height, channel, batch]
  int4 yShape;  // [width, height, channel, batch]
  int2 sShape;  // [width, height] - width is in bytes. Contiguous. Zeros if
                // unused.
  int2 sOfs;  // [ofs_x, ofs_y] - offset between upsampled data and sign tensor.
  int swLimit;  // Active width of sign tensor in bytes.

  longlong4 xStride;   // Strides of all tensors except signs, same component
                       // order as shapes. The kernel applies the x/b strides
                       // to char pointers, i.e. as byte offsets.
  longlong4 yStride;   // Output strides (the kernel comments its output
                       // offset as being in bytes).
  int64_t bStride;     // Bias stride, applied to a char pointer (bytes).
  longlong3 fuStride;  // Upsampling filter strides; used to index float
                       // elements of `fu` in setup_filters_kernel.
  longlong3 fdStride;  // Downsampling filter strides; used to index float
                       // elements of `fd` in setup_filters_kernel.
};

// Parameter block for the activation-only path.
// NOTE(review): presumably consumed by the kernel obtained from
// choose_filtered_lrelu_act_kernel - confirm against the launcher.
struct filtered_lrelu_act_kernel_params {
  void *x;           // Input/output, modified in-place.
  unsigned char *s;  // Sign tensor in/out. NULL if unused.

  float gain;   // Additional gain factor.
  float slope;  // Leaky ReLU slope on negative side.
  float clamp;  // Clamp after nonlinearity.

  int4 xShape;        // [width, height, channel, batch]
  longlong4 xStride;  // Input/output tensor strides, same order as in shape.
  int2 sShape;  // [width, height] - width is in elements (not bytes, unlike
                // filtered_lrelu_kernel_params::sShape). Contiguous. Zeros if
                // unused.
  int2 sOfs;  // [ofs_x, ofs_y] - offset between upsampled data and sign tensor.
};

//------------------------------------------------------------------------
// CUDA kernel specialization.

// Description of one selected kernel variant, as returned by
// choose_filtered_lrelu_kernel. The kernel entry points are stored as opaque
// void pointers.
struct filtered_lrelu_kernel_spec {
  void *setup;   // Function for filter kernel setup.
  void *exec;    // Function for main operation.
  int2 tileOut;  // Width/height of launch tile.
  int numWarps;  // Number of warps per thread block, determines launch block
                 // size.
  int xrep;      // For processing multiple horizontal tiles per thread block.
  int dynamicSharedKB;  // How much dynamic shared memory the exec kernel wants.
};

//------------------------------------------------------------------------
// CUDA kernel selection.

// Picks the kernel variant for the given parameters and returns its
// description. `sharedKB` is presumably the available shared-memory budget in
// KB (the kernel template compares its own sharedKB argument, shifted by 10,
// against a byte count). Declaration only; the definition is not in this part
// of the file.
template <class T, class index_t, bool signWrite, bool signRead>
filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(
    const filtered_lrelu_kernel_params &p, int sharedKB);
// Returns an opaque pointer for the requested <T, signWrite, signRead>
// combination - presumably the activation-only kernel entry point.
// Declaration only; the definition is not in this part of the file.
template <class T, bool signWrite, bool signRead>
void *choose_filtered_lrelu_act_kernel(void);

//------------------------------------------------------------------------
// Helpers.

// Values accepted by the `filterMode` template argument of
// filtered_lrelu_kernel. "Separable" selects the two-pass (horizontal then
// vertical) code path, "full" the single 2-D filter path.
enum  // Filter modes.
{
  MODE_SUSD = 0,  // Separable upsampling, separable downsampling.
  MODE_FUSD = 1,  // Full upsampling, separable downsampling.
  MODE_SUFD = 2,  // Separable upsampling, full downsampling.
  MODE_FUFD = 3,  // Full upsampling, full downsampling.
};

// Traits mapping a tensor element type T to the types and helpers the kernels
// compute with:
//   scalar_t          - arithmetic type used inside the kernel
//   vec2_t / vec4_t   - matching 2- and 4-component CUDA vector types
//   zero_vec2/4()     - all-zero vector of the respective type
//   clamp(x, c)       - x limited to the interval [-c, c]
// Only the specializations below exist; the primary template is left
// undefined so that unsupported element types fail to compile.
template <class T>
struct InternalType;

// double tensors are processed in double precision.
template <>
struct InternalType<double> {
  using scalar_t = double;
  using vec2_t = double2;
  using vec4_t = double4;

  __device__ __forceinline__ static vec2_t zero_vec2(void) {
    return make_double2(0, 0);
  }
  __device__ __forceinline__ static vec4_t zero_vec4(void) {
    return make_double4(0, 0, 0, 0);
  }
  __device__ __forceinline__ static double clamp(double x, double c) {
    const double raisedToLower = fmax(x, -c);
    return fmin(raisedToLower, c);
  }
};

// float tensors are processed in single precision.
template <>
struct InternalType<float> {
  using scalar_t = float;
  using vec2_t = float2;
  using vec4_t = float4;

  __device__ __forceinline__ static vec2_t zero_vec2(void) {
    return make_float2(0, 0);
  }
  __device__ __forceinline__ static vec4_t zero_vec4(void) {
    return make_float4(0, 0, 0, 0);
  }
  __device__ __forceinline__ static float clamp(float x, float c) {
    const float raisedToLower = fmaxf(x, -c);
    return fminf(raisedToLower, c);
  }
};

// Half tensors use exactly the same internal types and helpers as float
// tensors, so the float traits are reused as-is.
template <>
struct InternalType<c10::Half> : InternalType<float> {};

// Function-like minimum / maximum. Each argument may be evaluated twice, so
// do not pass expressions with side effects.
#define MIN(A, B) ((A) < (B) ? (A) : (B))
#define MAX(A, B) ((A) > (B) ? (A) : (B))
// Integer division of A by B, rounded up.
//  - B == 1: A unchanged.
//  - B == 2 / B == 4: (A + B - 1) is cast to int and right-shifted by 1 / 2.
//  - any other B: B - 1 is added only when A > 0, then A is divided by B, so
//    non-positive A relies on the division truncating toward zero.
#define CEIL_DIV(A, B)                  \
  (((B) == 1)   ? (A)                   \
   : ((B) == 2) ? ((int)((A) + 1) >> 1) \
   : ((B) == 4) ? ((int)((A) + 3) >> 2) \
                : (((A) + ((A) > 0 ? (B) - 1 : 0)) / (B)))

// Splits i into quotient and remainder with respect to the compile-time
// divisor N:  y = i / N,  x = i - y * N.
//
// When N is not a power of two and N <= 256, the quotient is obtained with a
// 24-bit fixed-point reciprocal multiply instead of a division; that shortcut
// assumes N <= 256 and i < N * 256. Every other N (powers of two, or N > 256)
// uses a plain division.
template <int N>
__device__ __forceinline__ void fast_div_mod(int &x, int &y, unsigned int i) {
  constexpr bool isPowerOfTwo = (N & (N - 1)) == 0;
  constexpr bool useReciprocal = !isPowerOfTwo && N <= 256;

  // Quotient is computed in unsigned arithmetic, then stored to the int output.
  const unsigned int quotient =
      useReciprocal ? ((i * ((1 << 24) / N + 1)) >> 24) : (i / N);
  y = quotient;

  // Remainder from the stored quotient.
  x = i - y * N;
}

// Reads a 64-bit stride through a pointer of type T and returns that value.
// The storage of `x` is reinterpreted rather than converted, so for a T
// narrower than 64 bits only the first sizeof(T) bytes of `x` are read
// (the low-order part on a little-endian target).
template <class T>
__device__ __forceinline__ T get_stride(const int64_t &x) {
  const T *strideAsT = reinterpret_cast<const T *>(&x);
  return *strideAsT;
}

//------------------------------------------------------------------------
// Filters, setup kernel, copying function.

// Largest supported filter extent per axis. Each filter table below holds
// MAX_FILTER_SIZE x MAX_FILTER_SIZE floats.
#define MAX_FILTER_SIZE 32

// Combined up/down filter buffers so that transfer can be done with one copy.
// g_fbuf: filters in global memory, written by the setup kernel.
__device__ float
    g_fbuf[2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE];
// c_fbuf: filters in constant memory, read by the main kernel.
__device__ __constant__ float
    c_fbuf[2 * MAX_FILTER_SIZE *
           MAX_FILTER_SIZE];

// Accessors to combined buffers to index up/down filters individually:
// the upsampling filter occupies the first MAX_FILTER_SIZE * MAX_FILTER_SIZE
// entries, the downsampling filter the second.
#define c_fu (c_fbuf)
#define c_fd (c_fbuf + MAX_FILTER_SIZE * MAX_FILTER_SIZE)
#define g_fu (g_fbuf)
#define g_fd (g_fbuf + MAX_FILTER_SIZE * MAX_FILTER_SIZE)

// Expands the up- and downsampling filters referenced by `p` into the
// zero-padded MAX_FILTER_SIZE x MAX_FILTER_SIZE tables g_fu and g_fd in
// global memory.
//
// Table entry (col, row) receives the filter tap at that position, or 0 when
// the position lies outside the filter. A filter whose shape.y is <= 0 is
// treated as 1-D: only row 0 is populated. Taps are taken in storage order
// when p.flip is nonzero and in reversed order otherwise.
//
// Work is distributed over threadIdx.x with a step of blockDim.x only, so
// every block of a launch writes the same values.
static __global__ void setup_filters_kernel(filtered_lrelu_kernel_params p) {
  for (int tap = threadIdx.x; tap < MAX_FILTER_SIZE * MAX_FILTER_SIZE;
       tap += blockDim.x) {
    int col, row;
    fast_div_mod<MAX_FILTER_SIZE>(col, row, tap);

    // Upsampling filter.
    float upTap = 0.0f;
    if (p.fuShape.y > 0) {
      // 2-D filter.
      if (col < p.fuShape.x && row < p.fuShape.y) {
        const int srcX = p.flip ? col : (p.fuShape.x - 1 - col);
        const int srcY = p.flip ? row : (p.fuShape.y - 1 - row);
        upTap = p.fu[srcX * p.fuStride.x + srcY * p.fuStride.y];
      }
    } else if (col < p.fuShape.x && row <= 0) {
      // 1-D filter: only the first table row carries data.
      const int srcX = p.flip ? col : (p.fuShape.x - 1 - col);
      upTap = p.fu[srcX * p.fuStride.x];
    }
    g_fu[tap] = upTap;

    // Downsampling filter.
    float downTap = 0.0f;
    if (p.fdShape.y > 0) {
      // 2-D filter.
      if (col < p.fdShape.x && row < p.fdShape.y) {
        const int srcX = p.flip ? col : (p.fdShape.x - 1 - col);
        const int srcY = p.flip ? row : (p.fdShape.y - 1 - row);
        downTap = p.fd[srcX * p.fdStride.x + srcY * p.fdStride.y];
      }
    } else if (col < p.fdShape.x && row <= 0) {
      // 1-D filter: only the first table row carries data.
      const int srcX = p.flip ? col : (p.fdShape.x - 1 - col);
      downTap = p.fd[srcX * p.fdStride.x];
    }
    g_fd[tap] = downTap;
  }
}

// Host-side helper: enqueues on `stream` a device-to-device copy of the whole
// combined filter buffer g_fbuf (filled by the setup kernel) into the
// constant-memory buffer c_fbuf used by the main kernel.
//
// Returns the error from looking up g_fbuf's device address if that fails,
// otherwise the status of enqueuing the asynchronous copy.
static cudaError_t copy_filters(cudaStream_t stream) {
  void *globalFilters = nullptr;
  const cudaError_t lookupStatus =
      cudaGetSymbolAddress(&globalFilters, g_fbuf);
  if (lookupStatus != cudaSuccess) {
    return lookupStatus;
  }

  // Both halves (up + down filter tables) are transferred in one copy.
  return cudaMemcpyToSymbolAsync(
      c_fbuf, globalFilters,
      2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE * sizeof(float), 0,
      cudaMemcpyDeviceToDevice, stream);
}

//------------------------------------------------------------------------
// Coordinate spaces:
// - Relative to input tensor:      inX, inY, tileInX, tileInY
// - Relative to input tile:        relInX, relInY, tileInW, tileInH
// - Relative to upsampled tile:    relUpX, relUpY, tileUpW, tileUpH
// - Relative to output tile:       relOutX, relOutY, tileOutW, tileOutH
// - Relative to output tensor:     outX, outY, tileOutX, tileOutY
//
// Relationships between coordinate spaces:
// - inX = tileInX + relInX
// - inY = tileInY + relInY
// - relUpX = relInX * up + phaseInX
// - relUpY = relInY * up + phaseInY
// - relUpX = relOutX * down
// - relUpY = relOutY * down
// - outX = tileOutX + relOutX
// - outY = tileOutY + relOutY

// Externally (dynamically) allocated shared-memory buffer.
// filtered_lrelu_kernel uses it only when its sharedKB template argument is
// greater than 48; when sharedKB <= 48 the kernel instead allocates its
// shared memory statically inside the kernel body.
extern __shared__ char
    s_buf_raw[];

template <class T, class index_t, int sharedKB, bool signWrite, bool signRead,
          int filterMode, int up, int fuSize, int down, int fdSize,
          int tileOutW, int tileOutH, int threadsPerBlock, bool enableXrep,
          bool enableWriteSkip>
static __global__ void filtered_lrelu_kernel(filtered_lrelu_kernel_params p) {
  // Check that we don't try to support non-existing filter modes.
  static_assert(up == 1 || up == 2 || up == 4,
                "only up=1, up=2, up=4 scales supported");
  static_assert(down == 1 || down == 2 || down == 4,
                "only down=1, down=2, down=4 scales supported");
  static_assert(fuSize >= up,
                "upsampling filter size must be at least upsampling factor");
  static_assert(
      fdSize >= down,
      "downsampling filter size must be at least downsampling factor");
  static_assert(
      fuSize % up == 0,
      "upsampling filter size must be divisible with upsampling factor");
  static_assert(
      fdSize % down == 0,
      "downsampling filter size must be divisible with downsampling factor");
  static_assert(fuSize <= MAX_FILTER_SIZE && fdSize <= MAX_FILTER_SIZE,
                "filter size greater than MAX_FILTER_SIZE");
  static_assert(up != 1 || (fuSize == 1 && (filterMode == MODE_FUFD ||
                                            filterMode == MODE_FUSD)),
                "up=1 supported only for 1x1 full filters");
  static_assert(down != 1 || (fdSize == 1 && (filterMode == MODE_FUFD ||
                                              filterMode == MODE_SUFD)),
                "down=1 supported only for 1x1 full filters");
  static_assert(
      !(up == 4 && (filterMode == MODE_FUFD || filterMode == MODE_FUSD)),
      "full filters not supported for up=4");
  static_assert(
      !(down == 4 && (filterMode == MODE_FUFD || filterMode == MODE_SUFD)),
      "full filters not supported for down=4");

  // Static definitions.
  typedef typename InternalType<T>::scalar_t scalar_t;
  typedef typename InternalType<T>::vec2_t vec2_t;
  typedef typename InternalType<T>::vec4_t vec4_t;
  const int tileUpW = (tileOutW * down + (fdSize - 1) - (down - 1) + 3) &
                      ~3;  // Upsampled tile width, rounded up to multiple of 4.
  const int tileUpH =
      tileOutH * down + (fdSize - 1) - (down - 1);  // Upsampled tile height.
  const int tileInW =
      CEIL_DIV(tileUpW + (fuSize - 1), up);  // Input tile width.
  const int tileInH =
      CEIL_DIV(tileUpH + (fuSize - 1), up);  // Input tile height.
  const int tileUpH_up =
      CEIL_DIV(tileUpH, up) *
      up;  // Upsampled tile height rounded up to a multiple of up.
  const int tileInH_up =
      CEIL_DIV(tileUpH_up + (fuSize - 1),
               up);  // For allocations only, to avoid shared memory read
                     // overruns with up=2 and up=4.

  // Merge 1x1 downsampling into last upsampling step for upf1 and ups2.
  const bool downInline =
      (down == 1) && ((up == 1 && filterMode == MODE_FUFD) ||
                      (up == 2 && filterMode == MODE_SUFD));

  // Sizes of logical buffers.
  const int szIn = tileInH_up * tileInW;
  const int szUpX = tileInH_up * tileUpW;
  const int szUpXY = downInline ? 0 : (tileUpH * tileUpW);
  const int szDownX = tileUpH * tileOutW;

  // Sizes for shared memory arrays.
  const int s_buf0_size_base = (filterMode == MODE_SUSD)   ? MAX(szIn, szUpXY)
                               : (filterMode == MODE_FUSD) ? MAX(szIn, szDownX)
                               : (filterMode == MODE_SUFD) ? MAX(szIn, szUpXY)
                               : (filterMode == MODE_FUFD) ? szIn
                                                           : -1;
  const int s_buf1_size_base = (filterMode == MODE_SUSD)   ? MAX(szUpX, szDownX)
                               : (filterMode == MODE_FUSD) ? szUpXY
                               : (filterMode == MODE_SUFD) ? szUpX
                               : (filterMode == MODE_FUFD) ? szUpXY
                                                           : -1;

  // Ensure U128 alignment.
  const int s_buf0_size = (s_buf0_size_base + 3) & ~3;
  const int s_buf1_size = (s_buf1_size_base + 3) & ~3;

  // Check at compile time that we don't use too much shared memory.
  static_assert(
      (s_buf0_size + s_buf1_size) * sizeof(scalar_t) <= (sharedKB << 10),
      "shared memory overflow");

  // Declare shared memory arrays.
  scalar_t *s_buf0;
  scalar_t *s_buf1;
  if (sharedKB <= 48) {
    // Allocate shared memory arrays here.
    __shared__ scalar_t
        s_buf0_st[(sharedKB > 48)
                      ? (1 << 24)
                      : (s_buf0_size +
                         s_buf1_size)];  // Prevent launching if this isn't
                                         // optimized away when unused.
    s_buf0 = s_buf0_st;
    s_buf1 = s_buf0 + s_buf0_size;
  } else {
    // Use the dynamically allocated shared memory array.
    s_buf0 = (scalar_t *)s_buf_raw;
    s_buf1 = s_buf0 + s_buf0_size;
  }

  // Pointers to the buffers.
  scalar_t *
      s_tileIn;  // Input tile:                      [relInX * tileInH + relInY]
  scalar_t *s_tileUpX;   // After horizontal upsampling:     [relInY * tileUpW +
                         // relUpX]
  scalar_t *s_tileUpXY;  // After upsampling:                [relUpY * tileUpW +
                         // relUpX]
  scalar_t *s_tileDownX;  // After horizontal downsampling:   [relUpY * tileOutW
                          // + relOutX]
  if (filterMode == MODE_SUSD) {
    s_tileIn = s_buf0;
    s_tileUpX = s_buf1;
    s_tileUpXY = s_buf0;
    s_tileDownX = s_buf1;
  } else if (filterMode == MODE_FUSD) {
    s_tileIn = s_buf0;
    s_tileUpXY = s_buf1;
    s_tileDownX = s_buf0;
  } else if (filterMode == MODE_SUFD) {
    s_tileIn = s_buf0;
    s_tileUpX = s_buf1;
    s_tileUpXY = s_buf0;
  } else if (filterMode == MODE_FUFD) {
    s_tileIn = s_buf0;
    s_tileUpXY = s_buf1;
  }

  // Allow large grids in z direction via per-launch offset.
  int channelIdx = blockIdx.z + p.blockZofs;
  int batchIdx = channelIdx / p.yShape.z;
  channelIdx -= batchIdx * p.yShape.z;

  // Offset to output feature map. In bytes.
  index_t mapOfsOut = channelIdx * get_stride<index_t>(p.yStride.z) +
                      batchIdx * get_stride<index_t>(p.yStride.w);

  // Sign shift amount.
  uint32_t signXo = ((threadIdx.x + p.sOfs.x) << 1) & 6;

// Inner tile loop.
#pragma unroll 1
  for (int tileIdx = 0;
       !enableXrep ||
       (tileIdx < MIN(p.tilesXrep, p.tilesXdim - p.tilesXrep * blockIdx.y));
       tileIdx++) {
    // Locate output tile.
    int tileX = enableXrep ? blockIdx.y * p.tilesXrep + tileIdx : blockIdx.x;
    int tileOutX = tileX * tileOutW;
    int tileOutY = (enableXrep ? blockIdx.x : blockIdx.y) * tileOutH;

    // Locate input tile.
    int tmpX = tileOutX * down - p.pad0.x;
    int tmpY = tileOutY * down - p.pad0.y;
    int tileInX = CEIL_DIV(tmpX, up);
    int tileInY = CEIL_DIV(tmpY, up);
    const int phaseInX = tileInX * up - tmpX;
    const int phaseInY = tileInY * up - tmpY;

    // Extra sync if input and output buffers are the same and we are not on
    // first tile.
    if (enableXrep && tileIdx > 0 &&
        (filterMode == MODE_FUSD || (filterMode == MODE_SUFD && !downInline) ||
         (filterMode == MODE_FUFD && downInline)))
      __syncthreads();

    // Load input tile & apply bias. Unrolled.
    scalar_t b =
        (scalar_t) * (const T *)((const char *)p.b +
                                 (channelIdx * get_stride<index_t>(p.bStride)));
    index_t mapOfsIn = channelIdx * get_stride<index_t>(p.xStride.z) +
                       batchIdx * get_stride<index_t>(p.xStride.w);
    int idx = threadIdx.x;
    const int loopCountIN = CEIL_DIV(tileInW * tileInH, threadsPerBlock);
#pragma unroll
    for (int loop = 0; loop < loopCountIN; loop++) {
      int relInX, relInY;
      fast_div_mod<tileInW>(relInX, relInY, idx);
      int inX = tileInX + relInX;
      int inY = tileInY + relInY;
      scalar_t v = 0;

      if ((uint32_t)inX < p.xShape.x && (uint32_t)inY < p.xShape.y)
        v = (scalar_t) * ((const T *)((const char *)p.x +
                                      (inX * get_stride<index_t>(p.xStride.x) +
                                       inY * get_stride<index_t>(p.xStride.y) +
                                       mapOfsIn))) +
            b;

      bool skip = (loop == loopCountIN - 1) && (idx >= tileInW * tileInH);
      if (!skip) s_tileIn[idx] = v;

      idx += threadsPerBlock;
    }

    if (filterMode == MODE_SUSD ||
        filterMode == MODE_SUFD)  // Separable upsampling filter.
    {
      // Horizontal upsampling.
      __syncthreads();
      if (up == 4) {
        for (int idx = threadIdx.x * up; idx < tileUpW * tileInH;
             idx += blockDim.x * up) {
          int relUpX0, relInY;
          fast_div_mod<tileUpW>(relUpX0, relInY, idx);
          int relInX0 = relUpX0 / up;
          int src0 = relInX0 + tileInW * relInY;
          int dst = relInY * tileUpW + relUpX0;
          vec4_t v = InternalType<T>::zero_vec4();
          scalar_t a = s_tileIn[src0];
          if (phaseInX == 0) {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileIn[src0 + step + 1];
              v.y += a * (scalar_t)c_fu[step * up + 3];
              v.z += a * (scalar_t)c_fu[step * up + 2];
              v.w += a * (scalar_t)c_fu[step * up + 1];
            }
          } else if (phaseInX == 1) {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 1];
              v.y += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileIn[src0 + step + 1];
              v.z += a * (scalar_t)c_fu[step * up + 3];
              v.w += a * (scalar_t)c_fu[step * up + 2];
            }
          } else if (phaseInX == 2) {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 2];
              v.y += a * (scalar_t)c_fu[step * up + 1];
              v.z += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileIn[src0 + step + 1];
              v.w += a * (scalar_t)c_fu[step * up + 3];
            }
          } else  // (phaseInX == 3)
          {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 3];
              v.y += a * (scalar_t)c_fu[step * up + 2];
              v.z += a * (scalar_t)c_fu[step * up + 1];
              v.w += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileIn[src0 + step + 1];
            }
          }
          s_tileUpX[dst + 0] = v.x;
          s_tileUpX[dst + 1] = v.y;
          s_tileUpX[dst + 2] = v.z;
          s_tileUpX[dst + 3] = v.w;
        }
      } else if (up == 2) {
        bool p0 = (phaseInX == 0);
        for (int idx = threadIdx.x * up; idx < tileUpW * tileInH;
             idx += blockDim.x * up) {
          int relUpX0, relInY;
          fast_div_mod<tileUpW>(relUpX0, relInY, idx);
          int relInX0 = relUpX0 / up;
          int src0 = relInX0 + tileInW * relInY;
          int dst = relInY * tileUpW + relUpX0;
          vec2_t v = InternalType<T>::zero_vec2();
          scalar_t a = s_tileIn[src0];
          if (p0)  // (phaseInX == 0)
          {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileIn[src0 + step + 1];
              v.y += a * (scalar_t)c_fu[step * up + 1];
            }
          } else  // (phaseInX == 1)
          {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 1];
              v.y += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileIn[src0 + step + 1];
            }
          }
          s_tileUpX[dst + 0] = v.x;
          s_tileUpX[dst + 1] = v.y;
        }
      }

      // Vertical upsampling & nonlinearity.

      __syncthreads();
      int groupMask = 15 << ((threadIdx.x & 31) & ~3);
      int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH
                          : 0;  // Skip already written signs.
      int sShapeMaxY =
          MIN(p.sShape.y,
              tileOutY * down + tileUpH);  // Avoid out-of-tile sign writes.
      if (up == 4) {
        minY -= 3;  // Adjust according to block height.
        for (int idx = threadIdx.x; idx < tileUpW * tileUpH_up / up;
             idx += blockDim.x) {
          int relUpX, relInY0;
          fast_div_mod<tileUpW>(relUpX, relInY0, idx);
          int relUpY0 = relInY0 * up;
          int src0 = relInY0 * tileUpW + relUpX;
          int dst = relUpY0 * tileUpW + relUpX;
          vec4_t v = InternalType<T>::zero_vec4();

          scalar_t a = s_tileUpX[src0];
          if (phaseInY == 0) {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileUpX[src0 + (step + 1) * tileUpW];
              v.y += a * (scalar_t)c_fu[step * up + 3];
              v.z += a * (scalar_t)c_fu[step * up + 2];
              v.w += a * (scalar_t)c_fu[step * up + 1];
            }
          } else if (phaseInY == 1) {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 1];
              v.y += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileUpX[src0 + (step + 1) * tileUpW];
              v.z += a * (scalar_t)c_fu[step * up + 3];
              v.w += a * (scalar_t)c_fu[step * up + 2];
            }
          } else if (phaseInY == 2) {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 2];
              v.y += a * (scalar_t)c_fu[step * up + 1];
              v.z += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileUpX[src0 + (step + 1) * tileUpW];
              v.w += a * (scalar_t)c_fu[step * up + 3];
            }
          } else  // (phaseInY == 3)
          {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 3];
              v.y += a * (scalar_t)c_fu[step * up + 2];
              v.z += a * (scalar_t)c_fu[step * up + 1];
              v.w += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileUpX[src0 + (step + 1) * tileUpW];
            }
          }

          int x = tileOutX * down + relUpX;
          int y = tileOutY * down + relUpY0;
          int signX = x + p.sOfs.x;
          int signY = y + p.sOfs.y;
          int signZ = blockIdx.z + p.blockZofs;
          int signXb = signX >> 2;
          index_t si0 =
              signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
          index_t si1 = si0 + p.sShape.x;
          index_t si2 = si0 + p.sShape.x * 2;
          index_t si3 = si0 + p.sShape.x * 3;

          v.x *= (scalar_t)((float)up * (float)up * p.gain);
          v.y *= (scalar_t)((float)up * (float)up * p.gain);
          v.z *= (scalar_t)((float)up * (float)up * p.gain);
          v.w *= (scalar_t)((float)up * (float)up * p.gain);

          if (signWrite) {
            if (!enableWriteSkip) {
              // Determine and write signs.
              int sx = __float_as_uint(v.x) >> 31 << 0;
              int sy = __float_as_uint(v.y) >> 31 << 8;
              int sz = __float_as_uint(v.z) >> 31 << 16;
              int sw = __float_as_uint(v.w) >> 31 << 24;
              if (sx) v.x *= p.slope;
              if (sy) v.y *= p.slope;
              if (sz) v.z *= p.slope;
              if (sw) v.w *= p.slope;
              if (fabsf(v.x) > p.clamp) {
                sx = 2 << 0;
                v.x = InternalType<T>::clamp(v.x, p.clamp);
              }
              if (fabsf(v.y) > p.clamp) {
                sy = 2 << 8;
                v.y = InternalType<T>::clamp(v.y, p.clamp);
              }
              if (fabsf(v.z) > p.clamp) {
                sz = 2 << 16;
                v.z = InternalType<T>::clamp(v.z, p.clamp);
              }
              if (fabsf(v.w) > p.clamp) {
                sw = 2 << 24;
                v.w = InternalType<T>::clamp(v.w, p.clamp);
              }

              if ((uint32_t)signXb < p.swLimit && signY >= minY) {
                // Combine signs.
                uint32_t s = sx + sy + sw + sz;
                s <<= (signX & 3) << 1;
#ifdef MMCV_WITH_HIP
                s |= __shfl_xor(s, 1);
                s |= __shfl_xor(s, 2);
#else
                s |= __shfl_xor_sync(groupMask, s, 1);
                s |= __shfl_xor_sync(groupMask, s, 2);
#endif

                // Write signs.
                if ((uint32_t)(signY + 0) < sShapeMaxY) {
                  p.s[si0] = (unsigned char)(s >> 0);
                }
                if ((uint32_t)(signY + 1) < sShapeMaxY) {
                  p.s[si1] = (unsigned char)(s >> 8);
                }
                if ((uint32_t)(signY + 2) < sShapeMaxY) {
                  p.s[si2] = (unsigned char)(s >> 16);
                }
                if ((uint32_t)(signY + 3) < sShapeMaxY) {
                  p.s[si3] = (unsigned char)(s >> 24);
                }
              }
            } else {
              // Determine and write signs.
              if ((uint32_t)signXb < p.swLimit && signY >= minY) {
                int sx = __float_as_uint(v.x) >> 31 << 0;
                int sy = __float_as_uint(v.y) >> 31 << 8;
                int sz = __float_as_uint(v.z) >> 31 << 16;
                int sw = __float_as_uint(v.w) >> 31 << 24;
                if (sx) v.x *= p.slope;
                if (sy) v.y *= p.slope;
                if (sz) v.z *= p.slope;
                if (sw) v.w *= p.slope;
                if (fabsf(v.x) > p.clamp) {
                  sx = 2 << 0;
                  v.x = InternalType<T>::clamp(v.x, p.clamp);
                }
                if (fabsf(v.y) > p.clamp) {
                  sy = 2 << 8;
                  v.y = InternalType<T>::clamp(v.y, p.clamp);
                }
                if (fabsf(v.z) > p.clamp) {
                  sz = 2 << 16;
                  v.z = InternalType<T>::clamp(v.z, p.clamp);
                }
                if (fabsf(v.w) > p.clamp) {
                  sw = 2 << 24;
                  v.w = InternalType<T>::clamp(v.w, p.clamp);
                }

                // Combine signs.
                uint32_t s = sx + sy + sw + sz;
                s <<= (signX & 3) << 1;
#ifdef MMCV_WITH_HIP
                s |= __shfl_xor(s, 1);
                s |= __shfl_xor(s, 2);
#else
                s |= __shfl_xor_sync(groupMask, s, 1);
                s |= __shfl_xor_sync(groupMask, s, 2);
#endif

                // Write signs.
                if ((uint32_t)(signY + 0) < sShapeMaxY) {
                  p.s[si0] = (unsigned char)(s >> 0);
                }
                if ((uint32_t)(signY + 1) < sShapeMaxY) {
                  p.s[si1] = (unsigned char)(s >> 8);
                }
                if ((uint32_t)(signY + 2) < sShapeMaxY) {
                  p.s[si2] = (unsigned char)(s >> 16);
                }
                if ((uint32_t)(signY + 3) < sShapeMaxY) {
                  p.s[si3] = (unsigned char)(s >> 24);
                }
              } else {
                // Just compute the values.
                if (v.x < 0.f) v.x *= p.slope;
                v.x = InternalType<T>::clamp(v.x, p.clamp);
                if (v.y < 0.f) v.y *= p.slope;
                v.y = InternalType<T>::clamp(v.y, p.clamp);
                if (v.z < 0.f) v.z *= p.slope;
                v.z = InternalType<T>::clamp(v.z, p.clamp);
                if (v.w < 0.f) v.w *= p.slope;
                v.w = InternalType<T>::clamp(v.w, p.clamp);
              }
            }
          } else if (signRead)  // Read signs and apply.
          {
            if ((uint32_t)signXb < p.swLimit) {
              int ss = (signX & 3) << 1;
              if ((uint32_t)(signY + 0) < p.sShape.y) {
                int s = p.s[si0] >> ss;
                if (s & 1) v.x *= p.slope;
                if (s & 2) v.x = 0.f;
              }
              if ((uint32_t)(signY + 1) < p.sShape.y) {
                int s = p.s[si1] >> ss;
                if (s & 1) v.y *= p.slope;
                if (s & 2) v.y = 0.f;
              }
              if ((uint32_t)(signY + 2) < p.sShape.y) {
                int s = p.s[si2] >> ss;
                if (s & 1) v.z *= p.slope;
                if (s & 2) v.z = 0.f;
              }
              if ((uint32_t)(signY + 3) < p.sShape.y) {
                int s = p.s[si3] >> ss;
                if (s & 1) v.w *= p.slope;
                if (s & 2) v.w = 0.f;
              }
            }
          } else  // Forward pass with no sign write.
          {
            if (v.x < 0.f) v.x *= p.slope;
            v.x = InternalType<T>::clamp(v.x, p.clamp);
            if (v.y < 0.f) v.y *= p.slope;
            v.y = InternalType<T>::clamp(v.y, p.clamp);
            if (v.z < 0.f) v.z *= p.slope;
            v.z = InternalType<T>::clamp(v.z, p.clamp);
            if (v.w < 0.f) v.w *= p.slope;
            v.w = InternalType<T>::clamp(v.w, p.clamp);
          }

          s_tileUpXY[dst + 0 * tileUpW] = v.x;
          if (relUpY0 + 1 < tileUpH) s_tileUpXY[dst + 1 * tileUpW] = v.y;
          if (relUpY0 + 2 < tileUpH) s_tileUpXY[dst + 2 * tileUpW] = v.z;
          if (relUpY0 + 3 < tileUpH) s_tileUpXY[dst + 3 * tileUpW] = v.w;
        }
      } else if (up == 2) {
        minY -= 1;  // Adjust according to block height.
        for (int idx = threadIdx.x; idx < tileUpW * tileUpH_up / up;
             idx += blockDim.x) {
          int relUpX, relInY0;
          fast_div_mod<tileUpW>(relUpX, relInY0, idx);
          int relUpY0 = relInY0 * up;
          int src0 = relInY0 * tileUpW + relUpX;
          int dst = relUpY0 * tileUpW + relUpX;
          vec2_t v = InternalType<T>::zero_vec2();

          scalar_t a = s_tileUpX[src0];
          if (phaseInY == 0) {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileUpX[src0 + (step + 1) * tileUpW];
              v.y += a * (scalar_t)c_fu[step * up + 1];
            }
          } else  // (phaseInY == 1)
          {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 1];
              v.y += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileUpX[src0 + (step + 1) * tileUpW];
            }
          }

          int x = tileOutX * down + relUpX;
          int y = tileOutY * down + relUpY0;
          int signX = x + p.sOfs.x;
          int signY = y + p.sOfs.y;
          int signZ = blockIdx.z + p.blockZofs;
          int signXb = signX >> 2;
          index_t si0 =
              signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
          index_t si1 = si0 + p.sShape.x;

          v.x *= (scalar_t)((float)up * (float)up * p.gain);
          v.y *= (scalar_t)((float)up * (float)up * p.gain);

          if (signWrite) {
            if (!enableWriteSkip) {
              // Determine and write signs.
              int sx = __float_as_uint(v.x) >> 31 << 0;
              int sy = __float_as_uint(v.y) >> 31 << 8;
              if (sx) v.x *= p.slope;
              if (sy) v.y *= p.slope;
              if (fabsf(v.x) > p.clamp) {
                sx = 2 << 0;
                v.x = InternalType<T>::clamp(v.x, p.clamp);
              }
              if (fabsf(v.y) > p.clamp) {
                sy = 2 << 8;
                v.y = InternalType<T>::clamp(v.y, p.clamp);
              }

              if ((uint32_t)signXb < p.swLimit && signY >= minY) {
                // Combine signs.
                int s = sx + sy;
                s <<= signXo;
#ifdef MMCV_WITH_HIP
                s |= __shfl_xor(s, 1);
                s |= __shfl_xor(s, 2);
#else
                s |= __shfl_xor_sync(groupMask, s, 1);
                s |= __shfl_xor_sync(groupMask, s, 2);
#endif

                // Write signs.
                if ((uint32_t)(signY + 0) < sShapeMaxY) {
                  p.s[si0] = (unsigned char)(s >> 0);
                }
                if ((uint32_t)(signY + 1) < sShapeMaxY) {
                  p.s[si1] = (unsigned char)(s >> 8);
                }
              }
            } else {
              // Determine and write signs.
              if ((uint32_t)signXb < p.swLimit && signY >= minY) {
                int sx = __float_as_uint(v.x) >> 31 << 0;
                int sy = __float_as_uint(v.y) >> 31 << 8;
                if (sx) v.x *= p.slope;
                if (sy) v.y *= p.slope;
                if (fabsf(v.x) > p.clamp) {
                  sx = 2 << 0;
                  v.x = InternalType<T>::clamp(v.x, p.clamp);
                }
                if (fabsf(v.y) > p.clamp) {
                  sy = 2 << 8;
                  v.y = InternalType<T>::clamp(v.y, p.clamp);
                }

                // Combine signs.
                int s = sx + sy;
                s <<= signXo;
#ifdef MMCV_WITH_HIP
                s |= __shfl_xor(s, 1);
                s |= __shfl_xor(s, 2);
#else
                s |= __shfl_xor_sync(groupMask, s, 1);
                s |= __shfl_xor_sync(groupMask, s, 2);
#endif

                // Write signs.
                if ((uint32_t)(signY + 0) < sShapeMaxY) {
                  p.s[si0] = (unsigned char)(s >> 0);
                }
                if ((uint32_t)(signY + 1) < sShapeMaxY) {
                  p.s[si1] = (unsigned char)(s >> 8);
                }
              } else {
                // Just compute the values.
                if (v.x < 0.f) v.x *= p.slope;
                v.x = InternalType<T>::clamp(v.x, p.clamp);
                if (v.y < 0.f) v.y *= p.slope;
                v.y = InternalType<T>::clamp(v.y, p.clamp);
              }
            }
          } else if (signRead)  // Read signs and apply.
          {
            if ((uint32_t)signXb < p.swLimit) {
              if ((uint32_t)(signY + 0) < p.sShape.y) {
                int s = p.s[si0] >> signXo;
                if (s & 1) v.x *= p.slope;
                if (s & 2) v.x = 0.f;
              }
              if ((uint32_t)(signY + 1) < p.sShape.y) {
                int s = p.s[si1] >> signXo;
                if (s & 1) v.y *= p.slope;
                if (s & 2) v.y = 0.f;
              }
            }
          } else  // Forward pass with no sign write.
          {
            if (v.x < 0.f) v.x *= p.slope;
            v.x = InternalType<T>::clamp(v.x, p.clamp);
            if (v.y < 0.f) v.y *= p.slope;
            v.y = InternalType<T>::clamp(v.y, p.clamp);
          }

          if (!downInline) {
            // Write into temporary buffer.
            s_tileUpXY[dst] = v.x;
            if (relUpY0 < tileUpH - 1) s_tileUpXY[dst + tileUpW] = v.y;
          } else {
            // Write directly into output buffer.
            if ((uint32_t)x < p.yShape.x) {
              int ymax = MIN(p.yShape.y, tileUpH + tileOutY * down);
              index_t ofs = x * get_stride<index_t>(p.yStride.x) +
                            y * get_stride<index_t>(p.yStride.y) + mapOfsOut;
              if ((uint32_t)y + 0 < p.yShape.y)
                *((T *)((char *)p.y + ofs)) = (T)(v.x * (scalar_t)c_fd[0]);
              if ((uint32_t)y + 1 < ymax)
                *((T *)((char *)p.y + ofs + get_stride<index_t>(p.yStride.y))) =
                    (T)(v.y * (scalar_t)c_fd[0]);
            }
          }
        }
      }
    } else if (filterMode == MODE_FUSD || filterMode == MODE_FUFD) {
      // Full upsampling filter.

      if (up == 2) {
        // 2 x 2-wide.
        __syncthreads();
        int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH + p.sOfs.y
                            : 0;  // Skip already written signs.
        for (int idx = threadIdx.x * 4; idx < tileUpW * tileUpH;
             idx += blockDim.x * 4) {
          int relUpX0, relUpY0;
          fast_div_mod<tileUpW>(relUpX0, relUpY0, idx);
          int relInX0 = CEIL_DIV(relUpX0 - phaseInX, up);
          int relInY0 = CEIL_DIV(relUpY0 - phaseInY, up);
          int src0 = relInX0 + tileInW * relInY0;
          int tap0y = (relInY0 * up + phaseInY - relUpY0);

#define X_LOOP(TAPY, PX)                                             \
  for (int sx = 0; sx < fuSize / up; sx++) {                         \
    v.x += a * (scalar_t)c_fu[(sx * up + (((PX) - 0) & (up - 1))) +  \
                              (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \
    v.z += b * (scalar_t)c_fu[(sx * up + (((PX) - 0) & (up - 1))) +  \
                              (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \
    if ((PX) == 0) {                                                 \
      a = b;                                                         \
      b = s_tileIn[src0 + 2 + sx + sy * tileInW];                    \
    }                                                                \
    v.y += a * (scalar_t)c_fu[(sx * up + (((PX) - 1) & (up - 1))) +  \
                              (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \
    v.w += b * (scalar_t)c_fu[(sx * up + (((PX) - 1) & (up - 1))) +  \
                              (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \
    if ((PX) == 1) {                                                 \
      a = b;                                                         \
      b = s_tileIn[src0 + 2 + sx + sy * tileInW];                    \
    }                                                                \
  }

          vec4_t v = InternalType<T>::zero_vec4();
          if (tap0y == 0 && phaseInX == 0)
#pragma unroll
            for (int sy = 0; sy < fuSize / up; sy++) {
              scalar_t a = s_tileIn[src0 + sy * tileInW];
              scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
#pragma unroll
              X_LOOP(0, 0)
            }
          if (tap0y == 0 && phaseInX == 1)
#pragma unroll
            for (int sy = 0; sy < fuSize / up; sy++) {
              scalar_t a = s_tileIn[src0 + sy * tileInW];
              scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
#pragma unroll
              X_LOOP(0, 1)
            }
          if (tap0y == 1 && phaseInX == 0)
#pragma unroll
            for (int sy = 0; sy < fuSize / up; sy++) {
              scalar_t a = s_tileIn[src0 + sy * tileInW];
              scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
#pragma unroll
              X_LOOP(1, 0)
            }
          if (tap0y == 1 && phaseInX == 1)
#pragma unroll
            for (int sy = 0; sy < fuSize / up; sy++) {
              scalar_t a = s_tileIn[src0 + sy * tileInW];
              scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
#pragma unroll
              X_LOOP(1, 1)
            }

#undef X_LOOP

          int x = tileOutX * down + relUpX0;
          int y = tileOutY * down + relUpY0;
          int signX = x + p.sOfs.x;
          int signY = y + p.sOfs.y;
          int signZ = blockIdx.z + p.blockZofs;
          int signXb = signX >> 2;
          index_t si =
              signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);

          v.x *= (scalar_t)((float)up * (float)up * p.gain);
          v.y *= (scalar_t)((float)up * (float)up * p.gain);
          v.z *= (scalar_t)((float)up * (float)up * p.gain);
          v.w *= (scalar_t)((float)up * (float)up * p.gain);

          if (signWrite) {
            if (!enableWriteSkip) {
              // Determine and write signs.
              int sx = __float_as_uint(v.x) >> 31;
              int sy = __float_as_uint(v.y) >> 31;
              int sz = __float_as_uint(v.z) >> 31;
              int sw = __float_as_uint(v.w) >> 31;
              if (sx) v.x *= p.slope;
              if (fabsf(v.x) > p.clamp) {
                sx = 2;
                v.x = InternalType<T>::clamp(v.x, p.clamp);
              }
              if (sy) v.y *= p.slope;
              if (fabsf(v.y) > p.clamp) {
                sy = 2;
                v.y = InternalType<T>::clamp(v.y, p.clamp);
              }
              if (sz) v.z *= p.slope;
              if (fabsf(v.z) > p.clamp) {
                sz = 2;
                v.z = InternalType<T>::clamp(v.z, p.clamp);
              }
              if (sw) v.w *= p.slope;
              if (fabsf(v.w) > p.clamp) {
                sw = 2;
                v.w = InternalType<T>::clamp(v.w, p.clamp);
              }

              if ((uint32_t)signXb < p.swLimit &&
                  (uint32_t)signY < p.sShape.y && signY >= minY) {
                p.s[si] = sx + (sy << 2) + (sz << 4) + (sw << 6);
              }
            } else {
              // Determine and write signs.
              if ((uint32_t)signXb < p.swLimit &&
                  (uint32_t)signY < p.sShape.y && signY >= minY) {
                int sx = __float_as_uint(v.x) >> 31;
                int sy = __float_as_uint(v.y) >> 31;
                int sz = __float_as_uint(v.z) >> 31;
                int sw = __float_as_uint(v.w) >> 31;
                if (sx) v.x *= p.slope;
                if (fabsf(v.x) > p.clamp) {
                  sx = 2;
                  v.x = InternalType<T>::clamp(v.x, p.clamp);
                }
                if (sy) v.y *= p.slope;
                if (fabsf(v.y) > p.clamp) {
                  sy = 2;
                  v.y = InternalType<T>::clamp(v.y, p.clamp);
                }
                if (sz) v.z *= p.slope;
                if (fabsf(v.z) > p.clamp) {
                  sz = 2;
                  v.z = InternalType<T>::clamp(v.z, p.clamp);
                }
                if (sw) v.w *= p.slope;
                if (fabsf(v.w) > p.clamp) {
                  sw = 2;
                  v.w = InternalType<T>::clamp(v.w, p.clamp);
                }

                p.s[si] = sx + (sy << 2) + (sz << 4) + (sw << 6);
              } else {
                // Just compute the values.
                if (v.x < 0.f) v.x *= p.slope;
                v.x = InternalType<T>::clamp(v.x, p.clamp);
                if (v.y < 0.f) v.y *= p.slope;
                v.y = InternalType<T>::clamp(v.y, p.clamp);
                if (v.z < 0.f) v.z *= p.slope;
                v.z = InternalType<T>::clamp(v.z, p.clamp);
                if (v.w < 0.f) v.w *= p.slope;
                v.w = InternalType<T>::clamp(v.w, p.clamp);
              }
            }
          } else if (signRead)  // Read sign and apply.
          {
            if ((uint32_t)signY < p.sShape.y) {
              int s = 0;
              if ((uint32_t)signXb < p.swLimit) s = p.s[si];
              if ((uint32_t)signXb + 1 < p.swLimit) s |= p.s[si + 1] << 8;
              s >>= (signX & 3) << 1;
              if (s & 0x01) v.x *= p.slope;
              if (s & 0x02) v.x = 0.f;
              if (s & 0x04) v.y *= p.slope;
              if (s & 0x08) v.y = 0.f;
              if (s & 0x10) v.z *= p.slope;
              if (s & 0x20) v.z = 0.f;
              if (s & 0x40) v.w *= p.slope;
              if (s & 0x80) v.w = 0.f;
            }
          } else  // Forward pass with no sign write.
          {
            if (v.x < 0.f) v.x *= p.slope;
            v.x = InternalType<T>::clamp(v.x, p.clamp);
            if (v.y < 0.f) v.y *= p.slope;
            v.y = InternalType<T>::clamp(v.y, p.clamp);
            if (v.z < 0.f) v.z *= p.slope;
            v.z = InternalType<T>::clamp(v.z, p.clamp);
            if (v.w < 0.f) v.w *= p.slope;
            v.w = InternalType<T>::clamp(v.w, p.clamp);
          }

          s_tileUpXY[idx + 0] = v.x;
          s_tileUpXY[idx + 1] = v.y;
          s_tileUpXY[idx + 2] = v.z;
          s_tileUpXY[idx + 3] = v.w;
        }
      } else if (up == 1) {
        __syncthreads();
        uint32_t groupMask = 15 << ((threadIdx.x & 31) & ~3);
        int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH
                            : 0;  // Skip already written signs.
        for (int idx = threadIdx.x; idx < tileUpW * tileUpH;
             idx += blockDim.x) {
          int relUpX0, relUpY0;
          fast_div_mod<tileUpW>(relUpX0, relUpY0, idx);
          scalar_t v = s_tileIn[idx] * (scalar_t)c_fu[0];  // 1x1 filter.

          int x = tileOutX * down + relUpX0;
          int y = tileOutY * down + relUpY0;
          int signX = x + p.sOfs.x;
          int signY = y + p.sOfs.y;
          int signZ = blockIdx.z + p.blockZofs;
          int signXb = signX >> 2;
          index_t si =
              signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
          v *= (scalar_t)((float)up * (float)up * p.gain);

          if (signWrite) {
            if (!enableWriteSkip) {
              // Determine and write sign.
              uint32_t s = 0;
              uint32_t signXbit = (1u << signXo);
              if (v < 0.f) {
                s = signXbit;
                v *= p.slope;
              }
              if (fabsf(v) > p.clamp) {
                s = signXbit * 2;
                v = InternalType<T>::clamp(v, p.clamp);
              }
              if ((uint32_t)signXb < p.swLimit &&
                  (uint32_t)signY < p.sShape.y && signY >= minY) {
#ifdef MMCV_WITH_HIP
                s += __shfl_xor(s, 1);  // Coalesce.
                s += __shfl_xor(s, 2);  // Coalesce.
#else
                s += __shfl_xor_sync(groupMask, s, 1);  // Coalesce.
                s += __shfl_xor_sync(groupMask, s, 2);  // Coalesce.
#endif
                p.s[si] = s;  // Write.
              }
            } else {
              // Determine and write sign.
              if ((uint32_t)signXb < p.swLimit &&
                  (uint32_t)signY < p.sShape.y && signY >= minY) {
                uint32_t s = 0;
                uint32_t signXbit = (1u << signXo);
                if (v < 0.f) {
                  s = signXbit;
                  v *= p.slope;
                }
                if (fabsf(v) > p.clamp) {
                  s = signXbit * 2;
                  v = InternalType<T>::clamp(v, p.clamp);
                }
#ifdef MMCV_WITH_HIP
                s += __shfl_xor(s, 1);  // Coalesce.
                s += __shfl_xor(s, 2);  // Coalesce.
#else
                s += __shfl_xor_sync(groupMask, s, 1);  // Coalesce.
                s += __shfl_xor_sync(groupMask, s, 2);  // Coalesce.
#endif
                p.s[si] = s;  // Write.
              } else {
                // Just compute the value.
                if (v < 0.f) v *= p.slope;
                v = InternalType<T>::clamp(v, p.clamp);
              }
            }
          } else if (signRead) {
            // Read sign and apply if within sign tensor bounds.
            if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y) {
              int s = p.s[si];
              s >>= signXo;
              if (s & 1) v *= p.slope;
              if (s & 2) v = 0.f;
            }
          } else  // Forward pass with no sign write.
          {
            if (v < 0.f) v *= p.slope;
            v = InternalType<T>::clamp(v, p.clamp);
          }

          if (!downInline)  // Write into temporary buffer.
            s_tileUpXY[idx] = v;
          else if ((uint32_t)x < p.yShape.x &&
                   (uint32_t)y <
                       p.yShape.y)  // Write directly into output buffer
            *((T *)((char *)p.y + (x * get_stride<index_t>(p.yStride.x) +
                                   y * get_stride<index_t>(p.yStride.y) +
                                   mapOfsOut))) = (T)(v * (scalar_t)c_fd[0]);
        }
      }
    }

    // Downsampling.
    if (filterMode == MODE_SUSD || filterMode == MODE_FUSD) {
      // Horizontal downsampling.
      __syncthreads();
      if (down == 4 && tileOutW % 4 == 0) {
        // Calculate 4 pixels at a time.
        for (int idx = threadIdx.x * 4; idx < tileOutW * tileUpH;
             idx += blockDim.x * 4) {
          int relOutX0, relUpY;
          fast_div_mod<tileOutW>(relOutX0, relUpY, idx);
          int relUpX0 = relOutX0 * down;
          int src0 = relUpY * tileUpW + relUpX0;
          vec4_t v = InternalType<T>::zero_vec4();
#pragma unroll
          for (int step = 0; step < fdSize; step++) {
            v.x += s_tileUpXY[src0 + 0 + step] * (scalar_t)c_fd[step];
            v.y += s_tileUpXY[src0 + 4 + step] * (scalar_t)c_fd[step];
            v.z += s_tileUpXY[src0 + 8 + step] * (scalar_t)c_fd[step];
            v.w += s_tileUpXY[src0 + 12 + step] * (scalar_t)c_fd[step];
          }
          s_tileDownX[idx + 0] = v.x;
          s_tileDownX[idx + 1] = v.y;
          s_tileDownX[idx + 2] = v.z;
          s_tileDownX[idx + 3] = v.w;
        }
      } else if ((down == 2 || down == 4) && (tileOutW % 2 == 0)) {
        // Calculate 2 pixels at a time.
        for (int idx = threadIdx.x * 2; idx < tileOutW * tileUpH;
             idx += blockDim.x * 2) {
          int relOutX0, relUpY;
          fast_div_mod<tileOutW>(relOutX0, relUpY, idx);
          int relUpX0 = relOutX0 * down;
          int src0 = relUpY * tileUpW + relUpX0;
          vec2_t v = InternalType<T>::zero_vec2();
#pragma unroll
          for (int step = 0; step < fdSize; step++) {
            v.x += s_tileUpXY[src0 + 0 + step] * (scalar_t)c_fd[step];
            v.y += s_tileUpXY[src0 + down + step] * (scalar_t)c_fd[step];
          }
          s_tileDownX[idx + 0] = v.x;
          s_tileDownX[idx + 1] = v.y;
        }
      } else {
        // Calculate 1 pixel at a time.
        for (int idx = threadIdx.x; idx < tileOutW * tileUpH;
             idx += blockDim.x) {
          int relOutX0, relUpY;
          fast_div_mod<tileOutW>(relOutX0, relUpY, idx);
          int relUpX0 = relOutX0 * down;
          int src = relUpY * tileUpW + relUpX0;
          scalar_t v = 0.f;
#pragma unroll
          for (int step = 0; step < fdSize; step++)
            v += s_tileUpXY[src + step] * (scalar_t)c_fd[step];
          s_tileDownX[idx] = v;
        }
      }

      // Vertical downsampling & store output tile.
      __syncthreads();
      for (int idx = threadIdx.x; idx < tileOutW * tileOutH;
           idx += blockDim.x) {
        int relOutX, relOutY0;
        fast_div_mod<tileOutW>(relOutX, relOutY0, idx);
        int relUpY0 = relOutY0 * down;
        int src0 = relUpY0 * tileOutW + relOutX;
        scalar_t v = 0;
#pragma unroll
        for (int step = 0; step < fdSize; step++)
          v += s_tileDownX[src0 + step * tileOutW] * (scalar_t)c_fd[step];

        int outX = tileOutX + relOutX;
        int outY = tileOutY + relOutY0;

        if (outX < p.yShape.x & outY < p.yShape.y)
          *((T *)((char *)p.y + (outX * get_stride<index_t>(p.yStride.x) +
                                 outY * get_stride<index_t>(p.yStride.y) +
                                 mapOfsOut))) = (T)v;
      }
    } else if (filterMode == MODE_SUFD || filterMode == MODE_FUFD) {
      // Full downsampling filter.
      if (down == 2) {
        // 2-wide.
        __syncthreads();
        for (int idx = threadIdx.x * 2; idx < tileOutW * tileOutH;
             idx += blockDim.x * 2) {
          int relOutX0, relOutY0;
          fast_div_mod<tileOutW>(relOutX0, relOutY0, idx);
          int relUpX0 = relOutX0 * down;
          int relUpY0 = relOutY0 * down;
          int src0 = relUpY0 * tileUpW + relUpX0;
          vec2_t v = InternalType<T>::zero_vec2();
#pragma unroll
          for (int sy = 0; sy < fdSize; sy++)
#pragma unroll
            for (int sx = 0; sx < fdSize; sx++) {
              v.x += s_tileUpXY[src0 + 0 + sx + sy * tileUpW] *
                     (scalar_t)c_fd[sx + sy * MAX_FILTER_SIZE];
              v.y += s_tileUpXY[src0 + 2 + sx + sy * tileUpW] *
                     (scalar_t)c_fd[sx + sy * MAX_FILTER_SIZE];
            }

          int outX = tileOutX + relOutX0;
          int outY = tileOutY + relOutY0;
          if ((uint32_t)outY < p.yShape.y) {
            index_t ofs = outX * get_stride<index_t>(p.yStride.x) +
                          outY * get_stride<index_t>(p.yStride.y) + mapOfsOut;
            if (outX + 0 < p.yShape.x) *((T *)((char *)p.y + ofs)) = (T)v.x;
            if (outX + 1 < p.yShape.x)
              *((T *)((char *)p.y + ofs + get_stride<index_t>(p.yStride.x))) =
                  (T)v.y;
          }
        }
      } else if (down == 1 && !downInline) {
        // Thread per pixel.
        __syncthreads();
        for (int idx = threadIdx.x; idx < tileOutW * tileOutH;
             idx += blockDim.x) {
          int relOutX0, relOutY0;
          fast_div_mod<tileOutW>(relOutX0, relOutY0, idx);
          scalar_t v = s_tileUpXY[idx] * (scalar_t)c_fd[0];  // 1x1 filter.

          int outX = tileOutX + relOutX0;
          int outY = tileOutY + relOutY0;
          if ((uint32_t)outX < p.yShape.x && (uint32_t)outY < p.yShape.y)
            *((T *)((char *)p.y + (outX * get_stride<index_t>(p.yStride.x) +
                                   outY * get_stride<index_t>(p.yStride.y) +
                                   mapOfsOut))) = (T)v;
        }
      }
    }

    if (!enableXrep) break;
  }
}

//------------------------------------------------------------------------
// Standalone activation pass over the upsampled data tensor p.x, which is
// modified in place. Used for accelerating the generic variant. Every element
// is first scaled by p.gain; the rest depends on the template flags:
//   signWrite - apply the leaky ReLU slope and the clamp, and record a 2-bit
//               flag per element in p.s (1 = value was negative, 2 = value
//               was clamped; a clamp replaces the negative flag).
//   signRead  - replay flags stored in p.s: flag bit 0 multiplies the value
//               by p.slope, flag bit 1 sets it to zero.
//   neither   - apply the leaky ReLU slope and the clamp, p.s is not touched.
// Sign tensor is known to be contiguous, and p.x and p.s have the same z, w
// dimensions. 64-bit indexing is always used.

template <class T, bool signWrite, bool signRead>
static __global__ void filtered_lrelu_act_kernel(
    filtered_lrelu_act_kernel_params p) {
  typedef typename InternalType<T>::scalar_t scalar_t;

  // A thread keeps the same column throughout; rows and combined
  // (channel, minibatch) slices advance by the grid size in y and z, so
  // tensors larger than the grid are still visited completely.
  const int32_t col = threadIdx.x + blockIdx.x * blockDim.x;
  const int32_t rowLimit = signWrite ? p.sShape.y : p.xShape.y;
  const int32_t sliceLimit = p.xShape.z * p.xShape.w;

  for (int32_t slice = blockIdx.z; slice < sliceLimit; slice += gridDim.z) {
    // Split the combined index into its minibatch (w) and channel (z) parts.
    const int32_t batch = slice / p.xShape.z;
    const int32_t chan = slice - batch * p.xShape.z;

    for (int32_t row = blockIdx.y; row < rowLimit; row += gridDim.y) {
      if (signWrite) {
        // Flag of this thread's element; stays 0 when the thread is outside
        // p.x (rows here range over the sign tensor height, not p.x's).
        uint32_t flag = 0;
        if (col < p.xShape.x && row < p.xShape.y) {
          int64_t elemOfs = col * p.xStride.x + row * p.xStride.y +
                            chan * p.xStride.z + batch * p.xStride.w;
          T *elem = ((T *)p.x) + elemOfs;
          scalar_t val = (scalar_t)(*elem);

          // Gain, then leaky ReLU, then clamp.
          val *= p.gain;
          if (val < 0.f) {
            flag = 1;  // Negative input.
            val *= p.slope;
          }
          if (fabsf(val) > p.clamp) {
            flag = 2;  // Clamped; overrides the negative flag.
            val = InternalType<T>::clamp(val, p.clamp);
          }

          *elem = (T)val;
        }

        // Merge the flags of each group of 16 consecutive lanes into one
        // 32-bit word: every lane moves its two bits to its own slot and the
        // xor exchange ORs the slots together. This runs for all threads,
        // whether or not they had an element to process.
        // NOTE(review): the word written below lines up with the group only
        // if blockDim.x and p.sShape.x are multiples of 16 - confirm against
        // the launch code.
        uint32_t groupMask = (threadIdx.x & 16) ? 0xffff0000u : 0x0000ffffu;
        flag <<= ((threadIdx.x & 15) << 1);
#pragma unroll
        for (int lane = 1; lane <= 8; lane <<= 1) {
#ifdef MMCV_WITH_HIP
          flag |= __shfl_xor(flag, lane);
#else
          flag |= __shfl_xor_sync(groupMask, flag, lane);
#endif
        }

        // The first lane of each group stores the packed word if the group
        // starts inside p.s. The row needs no test: it is bounded by
        // p.sShape.y in this mode.
        const bool groupLeader = (threadIdx.x & 15) == 0;
        if (groupLeader && col < p.sShape.x) {
          uint64_t signIdx =
              col + p.sShape.x *
                        (row + (int64_t)p.sShape.y * slice);  // Contiguous.
          ((uint32_t *)p.s)[signIdx >> 4] = flag;
        }
      } else if (signRead) {
        // Nothing to do outside p.x; the row is bounded by p.xShape.y here.
        if (col >= p.xShape.x) continue;

        int64_t elemOfs = col * p.xStride.x + row * p.xStride.y +
                          chan * p.xStride.z + batch * p.xStride.w;
        T *elem = ((T *)p.x) + elemOfs;
        scalar_t val = (scalar_t)(*elem);
        val *= p.gain;

        // Position inside the sign buffer after applying its offset. The
        // unsigned type makes a negative position wrap to a large value, so
        // it fails the bounds test below.
        uint32_t signCol = col + p.sOfs.x;
        uint32_t signRow = row + p.sOfs.y;

        // Replay the stored flags only inside the valid sign region.
        if (signCol < p.sShape.x && signRow < p.sShape.y) {
          // Four 2-bit flags share one byte of p.s.
          uint64_t byteIdx =
              (signCol >> 2) +
              (p.sShape.x >> 2) *
                  (signRow + (uint64_t)p.sShape.y * slice);  // Contiguous.
          unsigned char packed = p.s[byteIdx];
          packed >>= (signCol & 3) << 1;  // Bring this element's flag down.
          if (packed & 1) val *= p.slope;  // Stored sign flag.
          if (packed & 2) val = 0.f;       // Stored clamp flag.
        }

        *elem = (T)val;
      } else {
        // Plain forward pass without sign output. Nothing to do outside p.x;
        // the row is bounded by p.xShape.y here.
        if (col >= p.xShape.x) continue;

        int64_t elemOfs = col * p.xStride.x + row * p.xStride.y +
                          chan * p.xStride.z + batch * p.xStride.w;
        T *elem = ((T *)p.x) + elemOfs;
        scalar_t val = (scalar_t)(*elem);
        val *= p.gain;
        if (val < 0.f) val *= p.slope;
        if (fabsf(val) > p.clamp) val = InternalType<T>::clamp(val, p.clamp);
        *elem = (T)val;
      }
    }
  }
}

// Returns the address of the activation kernel specialized for element type
// T and the given sign write/read flags, with its type erased to void*.
template <class T, bool signWrite, bool signRead>
void *choose_filtered_lrelu_act_kernel(void) {
  // Bind the specialization to a typed function pointer first, then erase
  // the type.
  void (*kernel)(filtered_lrelu_act_kernel_params) =
      filtered_lrelu_act_kernel<T, signWrite, signRead>;
  return reinterpret_cast<void *>(kernel);
}

//------------------------------------------------------------------------
// CUDA kernel selection.

// Picks the filtered_lrelu_kernel specialization to launch for the given
// parameters and shared memory budget.
//
// The CASE table below is scanned top to bottom and the first entry that
// satisfies all of the following is returned:
//   - sharedKB is at least the entry's shared memory requirement (SH),
//   - the entry's MODE agrees with the filter layouts: p.fuShape.y == 0
//     selects the MODE_SU* entries and p.fuShape.y > 0 the MODE_FU* entries;
//     likewise p.fdShape.y selects between MODE_*SD and MODE_*FD,
//   - p.up == U and p.down == D,
//   - both extents of each filter fit within the entry's FU / FD.
//
// T is the tensor element type, index_t the integer type the kernel is
// instantiated with for indexing, and signWrite/signRead select the sign
// buffer variant; all of them are forwarded to the kernel template.
//
// Only p.up, p.down, p.fuShape and p.fdShape are read here.
//
// Returns the launch spec of the matching entry (setup/exec kernel pointers,
// output tile size, warp count, xrep, dynamic shared memory in KB, which is 0
// for the 48 KB entries). If nothing matches, a zero-initialized spec is
// returned, i.e. its exec pointer is null.
template <class T, class index_t, bool signWrite, bool signRead>
filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(
    const filtered_lrelu_kernel_params &p, int sharedKB) {
  filtered_lrelu_kernel_spec s = {0};

  // Return the first matching kernel.
#define CASE(SH, U, FU, D, FD, MODE, TW, TH, W, XR, WS)                        \
  if (sharedKB >= SH)                                                          \
    if ((p.fuShape.y == 0 && (MODE == MODE_SUSD || MODE == MODE_SUFD)) ||      \
        (p.fuShape.y > 0 && (MODE == MODE_FUSD || MODE == MODE_FUFD)))         \
      if ((p.fdShape.y == 0 && (MODE == MODE_SUSD || MODE == MODE_FUSD)) ||    \
          (p.fdShape.y > 0 && (MODE == MODE_SUFD || MODE == MODE_FUFD)))       \
        if (p.up == U && p.fuShape.x <= FU && p.fuShape.y <= FU &&             \
            p.down == D && p.fdShape.x <= FD && p.fdShape.y <= FD) {           \
          static_assert((D * TW % 4) == 0,                                     \
                        "down * tileWidth must be divisible by 4");            \
          static_assert(                                                       \
              FU % U == 0,                                                     \
              "upscaling filter size must be multiple of upscaling factor");   \
          static_assert(FD % D == 0,                                           \
                        "downscaling filter size must be multiple of "         \
                        "downscaling factor");                                 \
          s.setup = (void *)setup_filters_kernel;                              \
          s.exec = (void *)                                                    \
              filtered_lrelu_kernel<T, index_t, SH, signWrite, signRead, MODE, \
                                    U, FU, D, FD, TW, TH, W * 32, !!XR, !!WS>; \
          s.tileOut = make_int2(TW, TH);                                       \
          s.numWarps = W;                                                      \
          s.xrep = XR;                                                         \
          s.dynamicSharedKB = (SH == 48) ? 0 : SH;                             \
          return s;                                                            \
        }

  // Launch parameters for various kernel specializations.
  // Small filters must be listed before large filters, otherwise the kernel for
  // larger filter will always match first. Kernels that use more shared memory
  // must be listed before those that use less, for the same reason.

  CASE(/*sharedKB*/ 48, /*up,fu*/ 1, 1, /*down,fd*/ 1, 1, /*mode*/ MODE_FUFD,
       /*tw,th,warps,xrep,wskip*/ 64, 178, 32, 0, 0)  // 1t-upf1-downf1
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 1, 1, /*mode*/ MODE_SUFD,
       /*tw,th,warps,xrep,wskip*/ 152, 95, 16, 0, 0)  // 4t-ups2-downf1
  CASE(/*sharedKB*/ 48, /*up,fu*/ 1, 1, /*down,fd*/ 2, 8, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 56, 22, 16, 0, 0)  // 4t-upf1-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 2, 8, /*mode*/ MODE_SUSD,
       /*tw,th,warps,xrep,wskip*/ 56, 29, 16, 11, 0)  // 4t-ups2-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 2, 8, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 60, 28, 16, 0, 0)  // 4t-upf2-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 2, 8, /*mode*/ MODE_SUFD,
       /*tw,th,warps,xrep,wskip*/ 56, 28, 16, 0, 0)  // 4t-ups2-downf2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 16, /*down,fd*/ 2, 8, /*mode*/ MODE_SUSD,
       /*tw,th,warps,xrep,wskip*/ 56, 31, 16, 11, 0)  // 4t-ups4-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 16, /*down,fd*/ 2, 8, /*mode*/ MODE_SUFD,
       /*tw,th,warps,xrep,wskip*/ 56, 36, 16, 0, 0)  // 4t-ups4-downf2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 4, 16, /*mode*/ MODE_SUSD,
       /*tw,th,warps,xrep,wskip*/ 16, 22, 16, 12, 0)  // 4t-ups2-downs4
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 4, 16, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 29, 15, 16, 0, 0)  // 4t-upf2-downs4
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 1, 1, /*mode*/ MODE_SUFD,
       /*tw,th,warps,xrep,wskip*/ 96, 150, 28, 0, 0)  // 6t-ups2-downf1
  CASE(/*sharedKB*/ 48, /*up,fu*/ 1, 1, /*down,fd*/ 2, 12, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 32, 35, 24, 0, 0)  // 6t-upf1-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 2, 12, /*mode*/ MODE_SUSD,
       /*tw,th,warps,xrep,wskip*/ 32, 46, 16, 10, 0)  // 6t-ups2-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 2, 12, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 58, 28, 24, 8, 0)  // 6t-upf2-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 2, 12, /*mode*/ MODE_SUFD,
       /*tw,th,warps,xrep,wskip*/ 52, 28, 16, 0, 0)  // 6t-ups2-downf2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 24, /*down,fd*/ 2, 12, /*mode*/ MODE_SUSD,
       /*tw,th,warps,xrep,wskip*/ 32, 51, 16, 5, 0)  // 6t-ups4-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 24, /*down,fd*/ 2, 12, /*mode*/ MODE_SUFD,
       /*tw,th,warps,xrep,wskip*/ 32, 56, 16, 6, 0)  // 6t-ups4-downf2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 4, 24, /*mode*/ MODE_SUSD,
       /*tw,th,warps,xrep,wskip*/ 16, 18, 16, 12, 0)  // 6t-ups2-downs4
  CASE(/*sharedKB*/ 96, /*up,fu*/ 2, 12, /*down,fd*/ 4, 24, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 27, 31, 32, 6, 0)  // 6t-upf2-downs4 96kB
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 4, 24, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 27, 13, 24, 0, 0)  // 6t-upf2-downs4
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 1, 1, /*mode*/ MODE_SUFD,
       /*tw,th,warps,xrep,wskip*/ 148, 89, 24, 0, 0)  // 8t-ups2-downf1
  CASE(/*sharedKB*/ 48, /*up,fu*/ 1, 1, /*down,fd*/ 2, 16, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 32, 31, 16, 5, 0)  // 8t-upf1-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 2, 16, /*mode*/ MODE_SUSD,
       /*tw,th,warps,xrep,wskip*/ 32, 41, 16, 9, 0)  // 8t-ups2-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 2, 16, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 56, 26, 24, 0, 0)  // 8t-upf2-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 2, 16, /*mode*/ MODE_SUFD,
       /*tw,th,warps,xrep,wskip*/ 32, 40, 16, 0, 0)  // 8t-ups2-downf2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 32, /*down,fd*/ 2, 16, /*mode*/ MODE_SUSD,
       /*tw,th,warps,xrep,wskip*/ 32, 46, 24, 5, 0)  // 8t-ups4-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 32, /*down,fd*/ 2, 16, /*mode*/ MODE_SUFD,
       /*tw,th,warps,xrep,wskip*/ 32, 50, 16, 0, 0)  // 8t-ups4-downf2
  CASE(/*sharedKB*/ 96, /*up,fu*/ 2, 16, /*down,fd*/ 4, 32, /*mode*/ MODE_SUSD,
       /*tw,th,warps,xrep,wskip*/ 24, 24, 32, 12, 1)  // 8t-ups2-downs4 96kB
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 4, 32, /*mode*/ MODE_SUSD,
       /*tw,th,warps,xrep,wskip*/ 16, 13, 16, 10, 1)  // 8t-ups2-downs4
  CASE(/*sharedKB*/ 96, /*up,fu*/ 2, 16, /*down,fd*/ 4, 32, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 25, 28, 28, 4, 0)  // 8t-upf2-downs4 96kB
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 4, 32, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 25, 10, 24, 0, 0)  // 8t-upf2-downs4

#undef CASE
  return s;  // No kernel found.
}

//------------------------------------------------------------------------

// Build switch for filtered_lrelu_op: 1 compiles the op, 0 leaves it out.
// It starts enabled and is turned off below for unsupported toolchains.
#define BUILD_FILTERED_LRELU_OP 1

// The toolchain checks apply to non-HIP builds only.
#ifndef MMCV_WITH_HIP
#ifdef __GNUC__
// Disable the op when the GNU compiler major version is below 6.
#if __GNUC__ < 6
#undef BUILD_FILTERED_LRELU_OP
#define BUILD_FILTERED_LRELU_OP 0
#endif
#endif

// Disable the op when CUDA_VERSION is below 10020 (CUDA 10.2).
#if CUDA_VERSION < 10020
#undef BUILD_FILTERED_LRELU_OP
#define BUILD_FILTERED_LRELU_OP 0
#endif
#endif

#if BUILD_FILTERED_LRELU_OP == 1
// Host entry point of the filtered_lrelu CUDA op.
//
// Validates the inputs, selects a kernel specialization through
// choose_filtered_lrelu_kernel, allocates the output tensor (and, when
// requested, a sign tensor), then launches the filter setup kernel followed
// by the main kernel on the current CUDA stream of x's device.
//
// Args:
//   x:            non-empty rank-4 CUDA tensor, float16 or float32.
//   fu, fd:       non-empty float32 upsampling / downsampling filters of rank
//                 1 or 2, on the same device as x. A rank-1 filter is passed
//                 to the kernel with shape [n, 0], which marks it separable.
//   b:            1-D tensor with x.size(1) elements and the same dtype and
//                 device as x.
//   si:           sign tensor to read; treated as absent when it has no
//                 elements.
//   up, down:     resampling factors, both >= 1.
//   px0, px1, py0, py1: padding; the sums px0 + px1 and py0 + py1 enter the
//                 size of the upsampled buffer, and (px0, py0) is passed to
//                 the kernel as pad0.
//   sx, sy:       sign buffer offset, passed to the kernel as sOfs.
//   gain, slope, clamp: passed unchanged to the kernel.
//   flip_filters: passed to the kernel as the flip flag (1 or 0).
//   writeSigns:   when true, a new uint8 sign tensor is allocated and
//                 returned.
//
// Returns:
//   (y, so, 0) on success, where y is the output tensor and so is the newly
//   allocated sign tensor (an undefined tensor unless writeSigns is set).
//   (undefined, undefined, -1) when no kernel specialization exists for the
//   requested up/down factors, filter sizes and available shared memory.
//
// Throws (via TORCH_CHECK / AT_CUDA_CHECK) on invalid arguments, on a request
// that both reads and writes signs, and on CUDA runtime errors.
std::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu_op(
    torch::Tensor x, torch::Tensor fu, torch::Tensor fd, torch::Tensor b,
    torch::Tensor si, int up, int down, int px0, int px1, int py0, int py1,
    int sx, int sy, float gain, float slope, float clamp, bool flip_filters,
    bool writeSigns) {
  // Set CUDA device.
  TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device");
  const at::cuda::OptionalCUDAGuard device_guard(device_of(x));

  // Validate arguments.
  TORCH_CHECK(fu.device() == x.device() && fd.device() == x.device() &&
                  b.device() == x.device(),
              "all input tensors must reside on the same device");
  TORCH_CHECK(fu.dtype() == torch::kFloat && fd.dtype() == torch::kFloat,
              "fu and fd must be float32");
  TORCH_CHECK(b.dtype() == x.dtype(), "x and b must have the same dtype");
  TORCH_CHECK(x.dtype() == torch::kHalf || x.dtype() == torch::kFloat,
              "x and b must be float16 or float32");
  TORCH_CHECK(x.dim() == 4, "x must be rank 4");
  TORCH_CHECK(x.size(0) * x.size(1) <= INT_MAX && x.size(2) <= INT_MAX &&
                  x.size(3) <= INT_MAX,
              "x is too large");
  TORCH_CHECK(x.numel() > 0, "x is empty");
  TORCH_CHECK(
      (fu.dim() == 1 || fu.dim() == 2) && (fd.dim() == 1 || fd.dim() == 2),
      "fu and fd must be rank 1 or 2");
  TORCH_CHECK(fu.size(0) <= INT_MAX && fu.size(-1) <= INT_MAX,
              "fu is too large");
  TORCH_CHECK(fd.size(0) <= INT_MAX && fd.size(-1) <= INT_MAX,
              "fd is too large");
  TORCH_CHECK(fu.numel() > 0, "fu is empty");
  TORCH_CHECK(fd.numel() > 0, "fd is empty");
  TORCH_CHECK(b.dim() == 1 && b.size(0) == x.size(1),
              "b must be a vector with the same number of channels as x");
  TORCH_CHECK(up >= 1 && down >= 1, "up and down must be at least 1");

  // Figure out how much shared memory is available on the device. The query
  // result is checked on both paths so that a failed query is reported instead
  // of silently leaving maxSharedBytes at 0.
  int maxSharedBytes = 0;
#ifdef MMCV_WITH_HIP
  AT_CUDA_CHECK(cudaDeviceGetAttribute(
      &maxSharedBytes, hipDeviceAttributeMaxSharedMemoryPerBlock,
      x.device().index()));
#else
  AT_CUDA_CHECK(cudaDeviceGetAttribute(&maxSharedBytes,
                                       cudaDevAttrMaxSharedMemoryPerBlockOptin,
                                       x.device().index()));
#endif
  int sharedKB = maxSharedBytes >> 10;

  // Populate enough launch parameters to check if a CUDA kernel exists.
  filtered_lrelu_kernel_params p;
  p.up = up;
  p.down = down;
  p.fuShape =
      make_int2((int)fu.size(-1),
                fu.dim() == 2 ? (int)fu.size(0)
                              : 0);  // shape [n, 0] indicates separable filter.
  p.fdShape = make_int2((int)fd.size(-1), fd.dim() == 2 ? (int)fd.size(0) : 0);
  filtered_lrelu_kernel_spec test_spec =
      choose_filtered_lrelu_kernel<float, int32_t, false, false>(p, sharedKB);
  if (!test_spec.exec) {
    // No kernel found - return empty tensors and indicate missing kernel with
    // return code of -1.
    return std::make_tuple(torch::Tensor(), torch::Tensor(), -1);
  }

  // Input/output element size.
  int64_t sz = (x.dtype() == torch::kHalf) ? 2 : 4;

  // Input sizes.
  int64_t xw = (int)x.size(3);
  int64_t xh = (int)x.size(2);
  int64_t fut_w = (int)fu.size(-1) - 1;
  int64_t fut_h = (int)fu.size(0) - 1;
  int64_t fdt_w = (int)fd.size(-1) - 1;
  int64_t fdt_h = (int)fd.size(0) - 1;

  // Logical size of upsampled buffer.
  int64_t cw = xw * up + (px0 + px1) - fut_w;
  int64_t ch = xh * up + (py0 + py1) - fut_h;
  TORCH_CHECK(
      cw > fdt_w && ch > fdt_h,
      "upsampled buffer must be at least the size of downsampling filter");
  TORCH_CHECK(cw <= INT_MAX && ch <= INT_MAX, "upsampled buffer is too large");

  // Compute output size and allocate.
  int64_t yw = (cw - fdt_w + (down - 1)) / down;
  int64_t yh = (ch - fdt_h + (down - 1)) / down;
  TORCH_CHECK(yw > 0 && yh > 0, "output must be at least 1x1");
  TORCH_CHECK(yw <= INT_MAX && yh <= INT_MAX, "output is too large");
  torch::Tensor y = torch::empty({x.size(0), x.size(1), yh, yw}, x.options(),
                                 x.suggest_memory_format());

  // Allocate sign tensor.
  torch::Tensor so;
  torch::Tensor s = si;
  bool readSigns = !!s.numel();
  int64_t sw_active = 0;  // Active width of sign tensor.
  if (writeSigns) {
    sw_active = yw * down - (down - 1) + fdt_w;   // Active width in elements.
    int64_t sh = yh * down - (down - 1) + fdt_h;  // Height = active height.
    int64_t sw = (sw_active + 15) & ~15;  // Width  = active width in elements,
                                          // rounded up to multiple of 16.
    TORCH_CHECK(sh <= INT_MAX && (sw >> 2) <= INT_MAX, "signs is too large");
    s = so = torch::empty({x.size(0), x.size(1), sh, sw >> 2},
                          x.options().dtype(torch::kUInt8),
                          at::MemoryFormat::Contiguous);
  } else if (readSigns)
    sw_active = s.size(3) << 2;

  // Validate sign tensor if in use.
  if (readSigns || writeSigns) {
    TORCH_CHECK(s.is_contiguous(), "signs must be contiguous");
    TORCH_CHECK(s.dtype() == torch::kUInt8, "signs must be uint8");
    TORCH_CHECK(s.device() == x.device(),
                "signs must reside on the same device as x");
    TORCH_CHECK(s.dim() == 4, "signs must be rank 4");
    TORCH_CHECK(s.size(0) == x.size(0) && s.size(1) == x.size(1),
                "signs must have same batch & channels as x");
    TORCH_CHECK(s.size(2) <= INT_MAX && s.size(3) <= INT_MAX,
                "signs is too large");
  }

  // Populate rest of CUDA kernel parameters.
  p.x = x.data_ptr();
  p.y = y.data_ptr();
  p.b = b.data_ptr();
  p.s = (readSigns || writeSigns) ? s.data_ptr<unsigned char>() : 0;
  p.fu = fu.data_ptr<float>();
  p.fd = fd.data_ptr<float>();
  p.pad0 = make_int2(px0, py0);
  p.gain = gain;
  p.slope = slope;
  p.clamp = clamp;
  p.flip = (flip_filters) ? 1 : 0;
  p.xShape =
      make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));
  p.yShape =
      make_int4((int)y.size(3), (int)y.size(2), (int)y.size(1), (int)y.size(0));
  p.sShape = (readSigns || writeSigns)
                 ? make_int2((int)s.size(3), (int)s.size(2))
                 : make_int2(0, 0);  // Width is in bytes. Contiguous.
  p.sOfs = make_int2(sx, sy);
  p.swLimit = (sw_active + 3) >> 2;  // Rounded up to bytes.

  // x, y, b strides are in bytes.
  p.xStride = make_longlong4(sz * x.stride(3), sz * x.stride(2),
                             sz * x.stride(1), sz * x.stride(0));
  p.yStride = make_longlong4(sz * y.stride(3), sz * y.stride(2),
                             sz * y.stride(1), sz * y.stride(0));
  p.bStride = sz * b.stride(0);

  // fu, fd strides are in elements.
  p.fuStride =
      make_longlong3(fu.stride(-1), fu.dim() == 2 ? fu.stride(0) : 0, 0);
  p.fdStride =
      make_longlong3(fd.stride(-1), fd.dim() == 2 ? fd.stride(0) : 0, 0);

  // Determine if indices don't fit in int32. Support negative strides although
  // Torch currently never produces those.
  bool index64b = false;
  if (std::abs(p.bStride * x.size(1)) > INT_MAX) index64b = true;
  if (std::min(x.size(0) * p.xStride.w, 0ll) +
          std::min(x.size(1) * p.xStride.z, 0ll) +
          std::min(x.size(2) * p.xStride.y, 0ll) +
          std::min(x.size(3) * p.xStride.x, 0ll) <
      -INT_MAX)
    index64b = true;
  if (std::max(x.size(0) * p.xStride.w, 0ll) +
          std::max(x.size(1) * p.xStride.z, 0ll) +
          std::max(x.size(2) * p.xStride.y, 0ll) +
          std::max(x.size(3) * p.xStride.x, 0ll) >
      INT_MAX)
    index64b = true;
  if (std::min(y.size(0) * p.yStride.w, 0ll) +
          std::min(y.size(1) * p.yStride.z, 0ll) +
          std::min(y.size(2) * p.yStride.y, 0ll) +
          std::min(y.size(3) * p.yStride.x, 0ll) <
      -INT_MAX)
    index64b = true;
  if (std::max(y.size(0) * p.yStride.w, 0ll) +
          std::max(y.size(1) * p.yStride.z, 0ll) +
          std::max(y.size(2) * p.yStride.y, 0ll) +
          std::max(y.size(3) * p.yStride.x, 0ll) >
      INT_MAX)
    index64b = true;
  if (s.numel() > INT_MAX) index64b = true;

  // Choose CUDA kernel.
  filtered_lrelu_kernel_spec spec = {0};
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      x.scalar_type(), "filtered_lrelu_cuda", [&] {
        if constexpr (sizeof(scalar_t) <=
                      4)  // Exclude doubles. constexpr
                          // prevents template instantiation.
        {
          // Choose kernel based on index type, datatype and sign read/write
          // modes.
          if (!index64b && writeSigns && !readSigns)
            spec = choose_filtered_lrelu_kernel<scalar_t, int32_t, true, false>(
                p, sharedKB);
          else if (!index64b && !writeSigns && readSigns)
            spec = choose_filtered_lrelu_kernel<scalar_t, int32_t, false, true>(
                p, sharedKB);
          else if (!index64b && !writeSigns && !readSigns)
            spec =
                choose_filtered_lrelu_kernel<scalar_t, int32_t, false, false>(
                    p, sharedKB);
          else if (index64b && writeSigns && !readSigns)
            spec = choose_filtered_lrelu_kernel<scalar_t, int64_t, true, false>(
                p, sharedKB);
          else if (index64b && !writeSigns && readSigns)
            spec = choose_filtered_lrelu_kernel<scalar_t, int64_t, false, true>(
                p, sharedKB);
          else if (index64b && !writeSigns && !readSigns)
            spec =
                choose_filtered_lrelu_kernel<scalar_t, int64_t, false, false>(
                    p, sharedKB);
        }
      });
  // A missing kernel was already ruled out by the test_spec probe above, so
  // this only fires for the one combination the dispatch does not cover:
  // reading and writing signs in the same call.
  TORCH_CHECK(spec.exec, "internal error - CUDA kernel not found");

  // Launch CUDA kernel.
  // args holds a pointer to p, so the fields of p assigned below are still
  // seen by the launches.
  void *args[] = {&p};
  int bx = spec.numWarps * 32;
  int gx = (p.yShape.x - 1) / spec.tileOut.x + 1;
  int gy = (p.yShape.y - 1) / spec.tileOut.y + 1;
  int gz = p.yShape.z * p.yShape.w;

  // Repeat multiple horizontal tiles in a CTA?
  if (spec.xrep) {
    p.tilesXrep = spec.xrep;
    p.tilesXdim = gx;

    gx = (gx + p.tilesXrep - 1) / p.tilesXrep;
    std::swap(gx, gy);
  } else {
    p.tilesXrep = 0;
    p.tilesXdim = 0;
  }
#ifdef MMCV_WITH_HIP
  AT_CUDA_CHECK(hipLaunchKernel(spec.setup, 1, 1024, args, 0,
                                at::cuda::getCurrentCUDAStream()));
#else
  // Launch filter setup kernel.
  AT_CUDA_CHECK(cudaLaunchKernel(spec.setup, 1, 1024, args, 0,
                                 at::cuda::getCurrentCUDAStream()));
#endif

  // Copy kernels to constant memory. The call is the same for every sign
  // mode that can reach this point (the read-and-write combination has
  // already been rejected by the check above).
  AT_CUDA_CHECK((copy_filters(at::cuda::getCurrentCUDAStream())));

  // Set cache and shared memory configurations for main kernel.
  AT_CUDA_CHECK(cudaFuncSetCacheConfig(spec.exec, cudaFuncCachePreferShared));
  if (spec.dynamicSharedKB) {  // Need dynamically allocated shared memory?
#ifdef MMCV_WITH_HIP
    AT_CUDA_CHECK(hipFuncSetAttribute(
        spec.exec, hipFuncAttributeMaxDynamicSharedMemorySize,
        spec.dynamicSharedKB << 10));
#else
    AT_CUDA_CHECK(cudaFuncSetAttribute(
        spec.exec, cudaFuncAttributeMaxDynamicSharedMemorySize,
        spec.dynamicSharedKB << 10));
#endif
  }
  AT_CUDA_CHECK(
      cudaFuncSetSharedMemConfig(spec.exec, cudaSharedMemBankSizeFourByte));

  // Launch main kernel.
  const int maxSubGz = 65535;  // CUDA maximum for grid z dimension.
  for (int zofs = 0; zofs < gz;
       zofs += maxSubGz)  // Do multiple launches if gz is too big.
  {
    p.blockZofs = zofs;
    int subGz = std::min(maxSubGz, gz - zofs);
#ifdef MMCV_WITH_HIP
    AT_CUDA_CHECK(hipLaunchKernel(spec.exec, dim3(gx, gy, subGz), bx, args,
                                  spec.dynamicSharedKB << 10,
                                  at::cuda::getCurrentCUDAStream()));
#else
    AT_CUDA_CHECK(cudaLaunchKernel(spec.exec, dim3(gx, gy, subGz), bx, args,
                                   spec.dynamicSharedKB << 10,
                                   at::cuda::getCurrentCUDAStream()));
#endif
  }

  // Done.
  return std::make_tuple(y, so, 0);
}

// Declaration of the device-independent entry point; its parameter list
// mirrors filtered_lrelu_op. The definition is not part of this section.
std::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu_op_impl(
    torch::Tensor x, torch::Tensor fu, torch::Tensor fd, torch::Tensor b,
    torch::Tensor si, int up, int down, int px0, int px1, int py0, int py1,
    int sx, int sy, float gain, float slope, float clamp, bool flip_filters,
    bool writeSigns);

// Associates filtered_lrelu_op with filtered_lrelu_op_impl for the CUDA
// device. NOTE(review): presumably this makes calls to the _impl function
// dispatch here for CUDA tensors - confirm against the REGISTER_DEVICE_IMPL
// definition.
REGISTER_DEVICE_IMPL(filtered_lrelu_op_impl, CUDA, filtered_lrelu_op);

#else

#pragma message(                           \
    "filtered_lrelu_op is not available. " \
    "Please update your compiler and cuda version.")

#endif
#undef BUILD_FILTERED_LRELU_OP

//------------------------------------------------------------------------

// Host launcher for filtered_lrelu_act_kernel.
//
// Checks the arguments, optionally allocates a sign tensor, fills in the
// kernel parameters and launches the kernel variant matching x's dtype and
// the sign mode on the current CUDA stream. The kernel receives a pointer to
// x's storage; no separate output tensor is allocated here.
//
// Args:
//   x:          non-empty rank-4 CUDA tensor (float16, float32 or float64).
//   si:         sign tensor to read; treated as absent when it has no
//               elements.
//   sx, sy:     sign buffer offset, passed to the kernel as sOfs.
//   gain, slope, clamp: passed unchanged to the kernel.
//   writeSigns: when true, a new uint8 sign tensor is allocated, used instead
//               of si, and returned.
//
// Returns the newly allocated sign tensor, or an undefined tensor when
// writeSigns is false.
torch::Tensor filtered_lrelu_act_op(torch::Tensor x, torch::Tensor si, int sx,
                                    int sy, float gain, float slope,
                                    float clamp, bool writeSigns) {
  // Run on the device that owns x.
  TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device");
  const at::cuda::OptionalCUDAGuard device_guard(device_of(x));

  // Check the input tensor.
  TORCH_CHECK(x.dim() == 4, "x must be rank 4");
  TORCH_CHECK(x.size(0) * x.size(1) <= INT_MAX && x.size(2) <= INT_MAX &&
                  x.size(3) <= INT_MAX,
              "x is too large");
  TORCH_CHECK(x.numel() > 0, "x is empty");
  TORCH_CHECK(x.dtype() == torch::kHalf || x.dtype() == torch::kFloat ||
                  x.dtype() == torch::kDouble,
              "x must be float16, float32 or float64");

  // Decide which sign tensor (if any) the kernel works with. Whether signs
  // are read is determined from si before it may be replaced below.
  torch::Tensor signsOut;
  torch::Tensor signs = si;
  const bool readSigns = signs.numel() != 0;
  if (writeSigns) {
    // Width is rounded up to a multiple of 16 elements for coalescing; four
    // elements are stored per byte.
    const int64_t paddedWidth = (x.size(3) + 15) & ~15;
    signsOut =
        torch::empty({x.size(0), x.size(1), x.size(2), paddedWidth >> 2},
                     x.options().dtype(torch::kUInt8),
                     at::MemoryFormat::Contiguous);
    signs = signsOut;
  }
  const bool useSigns = readSigns || writeSigns;

  // Check the sign tensor when one is in use.
  if (useSigns) {
    TORCH_CHECK(signs.is_contiguous(), "signs must be contiguous");
    TORCH_CHECK(signs.dtype() == torch::kUInt8, "signs must be uint8");
    TORCH_CHECK(signs.device() == x.device(),
                "signs must reside on the same device as x");
    TORCH_CHECK(signs.dim() == 4, "signs must be rank 4");
    TORCH_CHECK(signs.size(0) == x.size(0) && signs.size(1) == x.size(1),
                "signs must have same batch & channels as x");
    TORCH_CHECK(signs.size(2) <= INT_MAX && (signs.size(3) << 2) <= INT_MAX,
                "signs tensor is too large");
  }

  // Fill in the kernel parameter block.
  filtered_lrelu_act_kernel_params p;
  p.x = x.data_ptr();
  p.s = useSigns ? signs.data_ptr<unsigned char>() : 0;
  p.gain = gain;
  p.slope = slope;
  p.clamp = clamp;
  p.xShape =
      make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));
  p.xStride =
      make_longlong4(x.stride(3), x.stride(2), x.stride(1), x.stride(0));
  if (useSigns) {
    // Width is expressed in elements (bytes * 4); the tensor is contiguous.
    p.sShape = make_int2((int)signs.size(3) << 2, (int)signs.size(2));
  } else {
    p.sShape = make_int2(0, 0);
  }
  p.sOfs = make_int2(sx, sy);

  // Select the kernel instantiation; writing signs takes precedence over
  // reading them.
  void *func = 0;
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      x.scalar_type(), "filtered_lrelu_act_cuda", [&] {
        if (writeSigns) {
          func = choose_filtered_lrelu_act_kernel<scalar_t, true, false>();
        } else if (readSigns) {
          func = choose_filtered_lrelu_act_kernel<scalar_t, false, true>();
        } else {
          func = choose_filtered_lrelu_act_kernel<scalar_t, false, false>();
        }
      });
  TORCH_CHECK(func, "internal error - CUDA kernel not found");

  // Launch configuration.
  void *args[] = {&p};
  int bx = 128;  // 4 warps per block.

  // The launch covers the sign tensor when signs are written, x otherwise.
  uint32_t gx = p.xShape.x;
  uint32_t gy = p.xShape.y;
  if (writeSigns) {
    gx = p.sShape.x;
    gy = p.sShape.y;
  }
  // Batch * channels; the same as in the sign tensor when signs are in use.
  uint32_t gz = p.xShape.z * p.xShape.w;
  gx = (gx - 1) / bx + 1;

  // Keep grid y and z within CUDA launch limits; the kernel loops internally
  // over whatever is left.
  const uint32_t gmax = 65535;
  if (gy > gmax) gy = gmax;
  if (gz > gmax) gz = gmax;

  // Launch.
#ifdef MMCV_WITH_HIP
  AT_CUDA_CHECK(hipLaunchKernel(func, dim3(gx, gy, gz), bx, args, 0,
                                at::cuda::getCurrentCUDAStream()));
#else
  AT_CUDA_CHECK(cudaLaunchKernel(func, dim3(gx, gy, gz), bx, args, 0,
                                 at::cuda::getCurrentCUDAStream()));
#endif

  return signsOut;
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/focal_loss_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cuda_helper.hpp"
#include "sigmoid_focal_loss_cuda_kernel.cuh"
#include "softmax_focal_loss_cuda_kernel.cuh"

// Launches sigmoid_focal_loss_forward_cuda_kernel with output.numel() as the
// work size, on the current CUDA stream of input's device. The class count is
// taken from input.size(1); the largest label in target may be at most equal
// to it.
void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
                                               Tensor weight, Tensor output,
                                               const float gamma,
                                               const float alpha) {
  const int num_classes = input.size(1);
  const int total = output.numel();
  AT_ASSERTM(target.max().item<int64_t>() <= (int64_t)num_classes,
             "target label should smaller or equal than num classes");

  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "sigmoid_focal_loss_forward_cuda_kernel", [&] {
        scalar_t *input_ptr = input.data_ptr<scalar_t>();
        int64_t *target_ptr = target.data_ptr<int64_t>();
        scalar_t *weight_ptr = weight.data_ptr<scalar_t>();
        scalar_t *output_ptr = output.data_ptr<scalar_t>();
        sigmoid_focal_loss_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(total), THREADS_PER_BLOCK, 0, stream>>>(
                total, input_ptr, target_ptr, weight_ptr, output_ptr, gamma,
                alpha, num_classes);
      });

  // Report launch errors.
  AT_CUDA_CHECK(cudaGetLastError());
}

// Launches sigmoid_focal_loss_backward_cuda_kernel with grad_input.numel() as
// the work size, on the current CUDA stream of grad_input's device. The class
// count is taken from input.size(1).
void SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
                                                Tensor weight,
                                                Tensor grad_input,
                                                const float gamma,
                                                const float alpha) {
  const int num_classes = input.size(1);
  const int total = grad_input.numel();

  at::cuda::CUDAGuard device_guard(grad_input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "sigmoid_focal_loss_backward_cuda_kernel", [&] {
        scalar_t *input_ptr = input.data_ptr<scalar_t>();
        int64_t *target_ptr = target.data_ptr<int64_t>();
        scalar_t *weight_ptr = weight.data_ptr<scalar_t>();
        scalar_t *grad_ptr = grad_input.data_ptr<scalar_t>();
        sigmoid_focal_loss_backward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(total), THREADS_PER_BLOCK, 0, stream>>>(
                total, input_ptr, target_ptr, weight_ptr, grad_ptr, gamma,
                alpha, num_classes);
      });

  // Report launch errors.
  AT_CUDA_CHECK(cudaGetLastError());
}

// Launches softmax_focal_loss_forward_cuda_kernel with output.numel() as the
// work size, on the current CUDA stream of softmax's device. The class count
// is taken from softmax.size(1); the largest label in target may be at most
// equal to it.
void SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor softmax, Tensor target,
                                               Tensor weight, Tensor output,
                                               const float gamma,
                                               const float alpha) {
  const int num_classes = softmax.size(1);
  const int total = output.numel();

  AT_ASSERTM(target.max().item<int64_t>() <= (int64_t)num_classes,
             "target label should smaller or equal than num classes");

  at::cuda::CUDAGuard device_guard(softmax.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      softmax.scalar_type(), "softmax_focal_loss_forward_cuda_kernel", [&] {
        scalar_t *softmax_ptr = softmax.data_ptr<scalar_t>();
        int64_t *target_ptr = target.data_ptr<int64_t>();
        scalar_t *weight_ptr = weight.data_ptr<scalar_t>();
        scalar_t *output_ptr = output.data_ptr<scalar_t>();
        softmax_focal_loss_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(total), THREADS_PER_BLOCK, 0, stream>>>(
                total, softmax_ptr, target_ptr, weight_ptr, output_ptr, gamma,
                alpha, num_classes);
      });

  // Report launch errors.
  AT_CUDA_CHECK(cudaGetLastError());
}

// Runs the two softmax focal loss backward kernels back to back on the
// current CUDA stream of grad_input's device:
//   1. softmax_focal_loss_backward_cuda1_kernel, sized by buff.numel(), which
//      is given softmax, target, weight and buff;
//   2. softmax_focal_loss_backward_cuda2_kernel, sized by grad_input.numel(),
//      which is given softmax, target, buff and grad_input.
// The class count is taken from softmax.size(1).
void SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor softmax, Tensor target,
                                                Tensor weight, Tensor buff,
                                                Tensor grad_input,
                                                const float gamma,
                                                const float alpha) {
  const int num_classes = softmax.size(1);
  const int buff_size = buff.numel();

  at::cuda::CUDAGuard device_guard(grad_input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // First pass: one work item per element of buff.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_input.scalar_type(),
      "softmax_focal_loss_backward_cuda1_"
      "kernel",
      [&] {
        scalar_t *softmax_ptr = softmax.data_ptr<scalar_t>();
        int64_t *target_ptr = target.data_ptr<int64_t>();
        scalar_t *weight_ptr = weight.data_ptr<scalar_t>();
        scalar_t *buff_ptr = buff.data_ptr<scalar_t>();
        softmax_focal_loss_backward_cuda1_kernel<scalar_t>
            <<<GET_BLOCKS(buff_size), THREADS_PER_BLOCK, 0, stream>>>(
                buff_size, softmax_ptr, target_ptr, weight_ptr, buff_ptr,
                gamma, alpha, num_classes);
      });

  AT_CUDA_CHECK(cudaGetLastError());

  // Second pass: one work item per element of grad_input.
  const int grad_size = grad_input.numel();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_input.scalar_type(),
      "softmax_focal_loss_backward_cuda2_"
      "kernel",
      [&] {
        scalar_t *softmax_ptr = softmax.data_ptr<scalar_t>();
        int64_t *target_ptr = target.data_ptr<int64_t>();
        scalar_t *buff_ptr = buff.data_ptr<scalar_t>();
        scalar_t *grad_ptr = grad_input.data_ptr<scalar_t>();
        softmax_focal_loss_backward_cuda2_kernel<scalar_t>
            <<<GET_BLOCKS(grad_size), THREADS_PER_BLOCK, 0, stream>>>(
                grad_size, softmax_ptr, target_ptr, buff_ptr, grad_ptr,
                num_classes);
      });

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/furthest_point_sample_cuda.cu
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling_gpu.cu

#include <stdio.h>
#include <stdlib.h>

#include "furthest_point_sample_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

// Returns the thread count to use for `work_size` items: the largest power
// of two that does not exceed work_size, limited to the range [1, 1024].
//
// The value is computed with integer arithmetic, so it does not depend on
// floating-point rounding of a logarithm and is well defined for
// work_size <= 0, where it returns 1.
inline int opt_n_threads(int work_size) {
  const int max_threads = 1024;
  int n_threads = 1;
  // Double while the next power of two still fits; n_threads * 2 is at most
  // 1024 here, so the multiplication cannot overflow.
  while (n_threads < max_threads && n_threads * 2 <= work_size) {
    n_threads *= 2;
  }
  return n_threads;
}

// Launches furthest_point_sampling_forward_cuda_kernel on the current CUDA
// stream with b blocks of opt_n_threads(n) threads each. The kernel's
// template argument is chosen to equal the thread count.
//
//   dataset: (B, N, 3)
//   temp:    (B, N)
//   idxs:    (B, M), output
void FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m,
                                                    const float* dataset,
                                                    float* temp, int* idxs) {
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  unsigned int n_threads = opt_n_threads(n);

// One launch of the kernel instantiated for BLOCK_SIZE; the launch
// configuration and arguments are the same for every instantiation.
#define LAUNCH_FPS_FORWARD(BLOCK_SIZE)                    \
  furthest_point_sampling_forward_cuda_kernel<BLOCK_SIZE> \
      <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs)

  switch (n_threads) {
    case 1024:
      LAUNCH_FPS_FORWARD(1024);
      break;
    case 512:
      LAUNCH_FPS_FORWARD(512);
      break;
    case 256:
      LAUNCH_FPS_FORWARD(256);
      break;
    case 128:
      LAUNCH_FPS_FORWARD(128);
      break;
    case 64:
      LAUNCH_FPS_FORWARD(64);
      break;
    case 32:
      LAUNCH_FPS_FORWARD(32);
      break;
    case 16:
      LAUNCH_FPS_FORWARD(16);
      break;
    case 8:
      LAUNCH_FPS_FORWARD(8);
      break;
    case 4:
      LAUNCH_FPS_FORWARD(4);
      break;
    case 2:
      LAUNCH_FPS_FORWARD(2);
      break;
    case 1:
      LAUNCH_FPS_FORWARD(1);
      break;
    default:
      // Any other thread count falls back to the 512 instantiation.
      LAUNCH_FPS_FORWARD(512);
  }

#undef LAUNCH_FPS_FORWARD

  // Report launch errors.
  AT_CUDA_CHECK(cudaGetLastError());
}

void FurthestPointSamplingWithDistForwardCUDAKernelLauncher(
    int b, int n, int m, const float* dataset, float* temp, int* idxs) {
  // dataset: (B, N, N)
  // temp: (B, N)
  // output:
  //      idx: (B, M)

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  unsigned int n_threads = opt_n_threads(n);

  switch (n_threads) {
    case 1024:
      furthest_point_sampling_with_dist_forward_cuda_kernel<1024>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 512:
      furthest_point_sampling_with_dist_forward_cuda_kernel<512>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 256:
      furthest_point_sampling_with_dist_forward_cuda_kernel<256>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 128:
      furthest_point_sampling_with_dist_forward_cuda_kernel<128>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 64:
      furthest_point_sampling_with_dist_forward_cuda_kernel<64>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 32:
      furthest_point_sampling_with_dist_forward_cuda_kernel<32>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 16:
      furthest_point_sampling_with_dist_forward_cuda_kernel<16>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 8:
      furthest_point_sampling_with_dist_forward_cuda_kernel<8>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 4:
      furthest_point_sampling_with_dist_forward_cuda_kernel<4>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 2:
      furthest_point_sampling_with_dist_forward_cuda_kernel<2>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 1:
      furthest_point_sampling_with_dist_forward_cuda_kernel<1>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    default:
      furthest_point_sampling_with_dist_forward_cuda_kernel<512>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
  }

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu
================================================
// Modified from
// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_bias_act_kernel.cu
// Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
//
// This work is made available under the Nvidia Source Code License-NC.
// To view a copy of this license, visit
// https://nvlabs.github.io/stylegan2/license.html

#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/types.h>

#include <ATen/cuda/CUDAApplyUtils.cuh>

// Element-wise kernel: optionally adds a bias to each input element, applies
// the transform selected by (act, grad), multiplies by `scale` and stores the
// result in `out`.
//
//   out      destination buffer, indexed like p_x
//   p_x      input values (size_x elements)
//   p_b      bias values, read only when use_bias != 0
//   p_ref    reference values, read only when use_ref != 0
//   act      1 = linear, 3 = leaky relu
//   grad     0 = forward, 1 = first-order derivative, 2 = second-order
//   alpha    multiplier applied on the non-positive side of leaky relu
//   scale    multiplier applied to every result
//   loop_x   maximum number of elements handled by one thread
//   size_x   number of elements in p_x / out
//   step_b   number of consecutive elements that share one bias entry
//   size_b   number of bias entries
template <typename scalar_t>
static __global__ void fused_bias_act_kernel(
    scalar_t* out, const scalar_t* p_x, const scalar_t* p_b,
    const scalar_t* p_ref, int act, int grad, scalar_t alpha, scalar_t scale,
    int loop_x, int size_x, int step_b, int size_b, int use_bias, int use_ref) {
  // Each block owns a span of loop_x * blockDim.x elements; a thread starts
  // at its own lane inside that span and advances by blockDim.x.
  int xi = blockIdx.x * loop_x * blockDim.x + threadIdx.x;

  scalar_t zero = 0.0;

  // Stop after loop_x elements or as soon as the index leaves the input.
  for (int loop_idx = 0; loop_idx < loop_x && xi < size_x;
       loop_idx++, xi += blockDim.x) {
    scalar_t x = p_x[xi];

    if (use_bias) {
      // Every step_b consecutive elements share a bias entry; the entry index
      // wraps around after size_b entries.
      x += p_b[(xi / step_b) % size_b];
    }

    // Without a reference tensor the reference value is treated as zero.
    scalar_t ref = use_ref ? p_ref[xi] : zero;

    scalar_t y;

    // act = 1: linear layer
    // act = 3: leaky relu layer
    // grad = 0: direct forward path
    // grad = 1: first-order derivative
    // grad = 2: second-order derivative
    // The two values are folded into one selector: act * 10 + grad.
    switch (act * 10 + grad) {
      // Any unlisted combination is handled like the linear forward case.
      default:
      case 10:
        y = x;
        break;
      case 11:
        y = x;
        break;
      case 12:
        y = 0.0;
        break;

      case 30:
        // Forward leaky relu: the sign of x itself picks the branch.
        y = (x > 0.0) ? x : x * alpha;
        break;
      case 31:
        // First-order path: the sign of the reference value picks the branch.
        y = (ref > 0.0) ? x : x * alpha;
        break;
      case 32:
        y = 0.0;
        break;
    }

    out[xi] = y * scale;
  }
}

// Host entry point for the fused bias + activation kernel.
//
//   input   tensor to transform; made contiguous before the launch
//   bias    bias tensor; an empty tensor (numel == 0) disables the bias add
//   refer   reference tensor; an empty tensor disables the reference read
//   act     activation selector forwarded to the kernel
//   grad    derivative-order selector forwarded to the kernel
//   alpha   leaky relu multiplier forwarded to the kernel
//   scale   output multiplier forwarded to the kernel
//
// Returns a new tensor created with empty_like(input.contiguous()).
// Raises (through AT_CUDA_CHECK) if querying the current device or launching
// the kernel reports a CUDA error.
torch::Tensor fused_bias_leakyrelu_op(const torch::Tensor& input,
                                      const torch::Tensor& bias,
                                      const torch::Tensor& refer, int act,
                                      int grad, float alpha, float scale) {
  int curDevice = -1;
  // Surface a failing device query instead of continuing with device -1.
  AT_CUDA_CHECK(cudaGetDevice(&curDevice));
  cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);

  auto x = input.contiguous();
  auto b = bias.contiguous();
  auto ref = refer.contiguous();

  int use_bias = b.numel() ? 1 : 0;
  int use_ref = ref.numel() ? 1 : 0;

  int size_x = x.numel();
  int size_b = b.numel();

  // Number of consecutive elements that share one bias entry: the product of
  // all dimensions after dim 1.
  int step_b = 1;
  for (int i = 1 + 1; i < x.dim(); i++) {
    step_b *= x.size(i);
  }

  // Every thread processes up to loop_x elements, so one block covers
  // loop_x * block_size elements.
  int loop_x = 4;
  int block_size = 4 * 32;
  int grid_size = (size_x - 1) / (loop_x * block_size) + 1;

  auto y = torch::empty_like(x);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      x.scalar_type(), "fused_bias_act_kernel", [&] {
        fused_bias_act_kernel<scalar_t><<<grid_size, block_size, 0, stream>>>(
            y.data_ptr<scalar_t>(), x.data_ptr<scalar_t>(),
            b.data_ptr<scalar_t>(), ref.data_ptr<scalar_t>(), act, grad, alpha,
            scale, loop_x, size_x, step_b, size_b, use_bias, use_ref);
      });

  // Report launch failures here rather than returning an unwritten tensor.
  AT_CUDA_CHECK(cudaGetLastError());

  return y;
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/fused_spconv_ops_cuda.cu
================================================
#include <cuda_runtime_api.h>
#include <torch/script.h>
// clang-format off
// TODO: make spconv_utils.h order agnostic
#include "../spconv_utils.h"
// clang-format on
#include <utils/spconv/spconv/indice.h>
#include <utils/spconv/spconv/reordering.h>

#include "pytorch_cuda_helper.hpp"

// Sparse convolution with the bias folded into the output initialisation.
//
// For every kernel offset i that has at least one index pair, the rows of
// `features` selected by indicePairs are gathered into a scratch buffer,
// multiplied with filters[i], and the product is handed to the scatter-add
// functor together with `output`.
//
//   features     2-D feature tensor; size(1) is the number of input planes
//   filters      filter tensor; viewed as (-1, numInPlanes, numOutPlanes)
//   bias         copied into every row of the freshly created output
//   indicePairs  int tensor; size(0) is the number of kernel offsets
//   indiceNum    int tensor holding the pair count of every kernel offset
//   numActOut    number of output rows
//   _inverse     non-zero swaps which half of a pair is read / written
//   _subM        non-zero enables the submanifold shortcut below
//
// Returns the (numActOut, numOutPlanes) output tensor.
torch::Tensor FusedIndiceConvBatchnormCUDAKernelLauncher(
    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
    int64_t _inverse, int64_t _subM) {
  at::cuda::CUDAGuard device_guard(features.device());
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
  auto device = features.device().type();
  auto ndim = filters.dim() - 2;
  auto kernelVolume = indicePairs.size(0);
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  // The per-offset pair counts are read on the host, so copy them to CPU.
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  // Locate the offset with the most pairs: its count sizes the scratch
  // buffers and its position is used as the subM centre offset.
  auto indicePairMaxSizeIter =
      std::max_element(indicePairNumCpu.data_ptr<int>(),
                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
  int indicePairMaxOffset =
      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
  int indicePairMaxSize = *indicePairMaxSizeIter;

  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());

  // Output starts out as the bias copied into every row.
  torch::Tensor output =
      torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
  // Scratch buffers large enough for the busiest kernel offset.
  torch::Tensor inputBuffer =
      torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
  if (subM) {  // the center index of subm conv don't need gather and scatter
               // add.
    // NOTE(review): mm_out stores its result in `output`, which appears to
    // replace the bias copied in above -- confirm this is intended.
    torch::mm_out(output, features, filters[indicePairMaxOffset]);
  }
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    // Skip offsets without pairs, and the centre offset already handled by
    // the subM shortcut.
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        features.scalar_type(), "FusedIndiceConvBatchnormKernel", [&] {
          // Views over the first nHot rows of the scratch buffers; they share
          // memory with inputBuffer / outputBuffer.
          auto outputBufferBlob = torch::from_blob(
              outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);
          auto inputBufferBlob = torch::from_blob(
              inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);

          // Gather step: the functor receives inputBuffer as destination,
          // `features` as source and the `inverse` half of pair i as indices.
          if (device == torch::kCPU) {
            functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;
            gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
                       tv::torch2tv<const scalar_t>(features),
                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                       nHot);
          } else {
            functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
                gatherFtor;
            gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
                       tv::torch2tv<const scalar_t>(features),
                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                       nHot);
            TV_CHECK_CUDA_ERR();
            /* slower than SparseGatherFunctor, may due to int->long conversion
            auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
            auto indicePairBlob =
            torch::from_blob(indicePairLong.data_ptr<long>(), {nHot},
            indicePairOptions); torch::index_select_out(inputBufferBlob,
            features, 0, indicePairBlob);*/
          }
          // Dense product of the gathered rows with the filter of offset i.
          torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);

          // Scatter step: the functor receives `output` as destination,
          // outputBuffer as source and the other half of pair i as indices.
          if (device == torch::kCPU) {
            functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>
                scatterFtor;
            scatterFtor(
                tv::CPU(), tv::torch2tv<scalar_t>(output),
                tv::torch2tv<const scalar_t>(outputBuffer),
                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
                true);
          } else {
            functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
                scatterFtor;
            scatterFtor(
                tv::TorchGPU(), tv::torch2tv<scalar_t>(output),
                tv::torch2tv<const scalar_t>(outputBuffer),
                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
                true);
            TV_CHECK_CUDA_ERR();
          }
        });
  }

  return output;
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/gather_points_cuda.cu
================================================
#include <stdio.h>
#include <stdlib.h>

#include "gather_points_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

void GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
                                           const Tensor points,
                                           const Tensor idx, Tensor out) {
  // points: (B, C, N)
  // idx: (B, npoints)
  // output:
  //      out: (B, C, npoints)

  at::cuda::CUDAGuard device_guard(points.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(npoints, THREADS_PER_BLOCK), c, b);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      points.scalar_type(), "gather_points_forward_cuda_kernel", [&] {
        gather_points_forward_cuda_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                b, c, n, npoints, points.data_ptr<scalar_t>(),
                idx.data_ptr<int>(), out.data_ptr<scalar_t>());
      });

  AT_CUDA_CHECK(cudaGetLastError());
}

void GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
                                            const Tensor grad_out,
                                            const Tensor idx,
                                            Tensor grad_points) {
  // grad_out: (B, C, npoints)
  // idx: (B, npoints)
  // output:
  //      grad_points: (B, C, N)

  at::cuda::CUDAGuard device_guard(grad_out.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(npoints, THREADS_PER_BLOCK), c, b);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_out.scalar_type(), "gather_points_backward_cuda_kernel", [&] {
        gather_points_backward_cuda_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                b, c, n, npoints, grad_out.data_ptr<scalar_t>(),
                idx.data_ptr<int>(), grad_points.data_ptr<scalar_t>());
      });

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/group_points_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu
#include <stdio.h>
#include <stdlib.h>

#include "group_points_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

void GroupPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
                                          int nsample, const Tensor points,
                                          const Tensor idx, Tensor out) {
  // points: (B, C, N)
  // idx: (B, npoints, nsample)
  // output:
  //      out: (B, C, npoints, nsample)

  at::cuda::CUDAGuard device_guard(points.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(npoints * nsample, THREADS_PER_BLOCK), c, b);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      points.scalar_type(), "group_points_forward_cuda_kernel", [&] {
        group_points_forward_cuda_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                b, c, n, npoints, nsample, points.data_ptr<scalar_t>(),
                idx.data_ptr<int>(), out.data_ptr<scalar_t>());
      });

  AT_CUDA_CHECK(cudaGetLastError());
}

void GroupPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
                                           int nsample, const Tensor grad_out,
                                           const Tensor idx,
                                           Tensor grad_points) {
  // grad_out: (B, C, npoints, nsample)
  // idx: (B, npoints, nsample)
  // output:
  //      grad_points: (B, C, N)

  at::cuda::CUDAGuard device_guard(grad_out.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(npoints * nsample, THREADS_PER_BLOCK), c, b);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_out.scalar_type(), "group_points_backward_cuda_kernel", [&] {
        group_points_backward_cuda_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                b, c, n, npoints, nsample, grad_out.data_ptr<scalar_t>(),
                idx.data_ptr<int>(), grad_points.data_ptr<scalar_t>());
      });

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/iou3d_cuda.cu
================================================
// Modified from
// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms_kernel.cu

/*
3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/

#include <stdio.h>

#include "iou3d_cuda_kernel.cuh"
#include "nms_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
                                                   const Tensor boxes_a,
                                                   const int num_b,
                                                   const Tensor boxes_b,
                                                   Tensor ans_overlap) {
  at::cuda::CUDAGuard device_guard(boxes_a.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(num_b, THREADS_PER_BLOCK_IOU3D),
              GET_BLOCKS(num_a, THREADS_PER_BLOCK_IOU3D));
  dim3 threads(THREADS_PER_BLOCK_IOU3D, THREADS_PER_BLOCK_IOU3D);

  iou3d_boxes_overlap_bev_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
      num_a, boxes_a.data_ptr<float>(), num_b, boxes_b.data_ptr<float>(),
      ans_overlap.data_ptr<float>());

  AT_CUDA_CHECK(cudaGetLastError());
}

void IoU3DNMS3DForwardCUDAKernelLauncher(const Tensor boxes, Tensor& keep,
                                         Tensor& keep_num,
                                         float nms_overlap_thresh) {
  using namespace at::indexing;
  at::cuda::CUDAGuard device_guard(boxes.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  int boxes_num = boxes.size(0);

  const int col_blocks =
      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
  Tensor mask =
      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));

  dim3 blocks(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS),
              GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS));
  dim3 threads(THREADS_PER_BLOCK_NMS);

  iou3d_nms3d_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
      boxes_num, nms_overlap_thresh, boxes.data_ptr<float>(),
      (unsigned long long*)mask.data_ptr<int64_t>());

  at::Tensor keep_t = at::zeros(
      {boxes_num}, boxes.options().dtype(at::kBool).device(at::kCUDA));
  gather_keep_from_mask<<<1, min(col_blocks, THREADS_PER_BLOCK),
                          col_blocks * sizeof(unsigned long long), stream>>>(
      keep_t.data_ptr<bool>(), (unsigned long long*)mask.data_ptr<int64_t>(),
      boxes_num);

  auto keep_data = keep_t.nonzero().index({Slice(), 0});
  keep_num.fill_(at::Scalar(keep_data.size(0)));
  keep.index_put_({Slice(0, keep_data.size(0))}, keep_data);
  AT_CUDA_CHECK(cudaGetLastError());
}

void IoU3DNMS3DNormalForwardCUDAKernelLauncher(const Tensor boxes, Tensor& keep,
                                               Tensor& keep_num,
                                               float nms_overlap_thresh) {
  using namespace at::indexing;
  at::cuda::CUDAGuard device_guard(boxes.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  int boxes_num = boxes.size(0);

  const int col_blocks =
      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
  Tensor mask =
      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));

  dim3 blocks(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS),
              GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS));
  dim3 threads(THREADS_PER_BLOCK_NMS);

  iou3d_nms3d_normal_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
      boxes_num, nms_overlap_thresh, boxes.data_ptr<float>(),
      (unsigned long long*)mask.data_ptr<int64_t>());

  at::Tensor keep_t = at::zeros(
      {boxes_num}, boxes.options().dtype(at::kBool).device(at::kCUDA));
  gather_keep_from_mask<<<1, min(col_blocks, THREADS_PER_BLOCK),
                          col_blocks * sizeof(unsigned long long), stream>>>(
      keep_t.data_ptr<bool>(), (unsigned long long*)mask.data_ptr<int64_t>(),
      boxes_num);

  auto keep_data = keep_t.nonzero().index({Slice(), 0});
  keep_num.fill_(at::Scalar(keep_data.size(0)));
  keep.index_put_({Slice(0, keep_data.size(0))}, keep_data);
  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/knn_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap

#include <cmath>
#include <cstdio>

#include "knn_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,
                                  const Tensor xyz, const Tensor new_xyz,
                                  Tensor idx, Tensor dist2) {
  // param new_xyz: (B, m, 3)
  // param xyz: (B, n, 3)
  // param idx: (B, m, nsample)

  at::cuda::CUDAGuard device_guard(new_xyz.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(m, THREADS_PER_BLOCK), b);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      new_xyz.scalar_type(), "knn_forward_cuda_kernel", [&] {
        knn_forward_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
            b, n, m, nsample, xyz.data_ptr<scalar_t>(),
            new_xyz.data_ptr<scalar_t>(), idx.data_ptr<int>(),
            dist2.data_ptr<scalar_t>());
      });

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/masked_conv2d_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "masked_conv2d_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
                                           const Tensor mask_h_idx,
                                           const Tensor mask_w_idx,
                                           Tensor top_data, const int kernel_h,
                                           const int kernel_w, const int pad_h,
                                           const int pad_w) {
  int channels = bottom_data.size(1);
  int height = bottom_data.size(2);
  int width = bottom_data.size(3);
  int mask_cnt = mask_h_idx.size(0);
  int output_size = mask_cnt * channels;

  at::cuda::CUDAGuard device_guard(bottom_data.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      bottom_data.scalar_type(), "MaskedIm2colLaucherForward", ([&] {
        const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>();
        const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>();
        const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>();
        scalar_t *top_data_ = top_data.data_ptr<scalar_t>();
        MaskedIm2colForward<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, bottom_data_, height, width, kernel_h, kernel_w,
                pad_h, pad_w, mask_h_idx_, mask_w_idx_, mask_cnt, top_data_);
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}

void MaskedCol2imForwardCUDAKernelLauncher(
    const Tensor bottom_data, const Tensor mask_h_idx, const Tensor mask_w_idx,
    Tensor top_data, const int height, const int width, const int channels) {
  int mask_cnt = mask_h_idx.size(0);
  int output_size = mask_cnt * channels;

  at::cuda::CUDAGuard device_guard(bottom_data.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      bottom_data.scalar_type(), "MaskedCol2imLaucherForward", ([&] {
        const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>();
        const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>();
        const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>();
        scalar_t *top_data_ = top_data.data_ptr<scalar_t>();

        MaskedCol2imForward<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, bottom_data_, height, width, channels, mask_h_idx_,
                mask_w_idx_, mask_cnt, top_data_);
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/min_area_polygons.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/SDL-GuoZonghao/BeyondBoundingBox/blob/main/mmdet/ops/minareabbox/src/minareabbox_kernel.cu
#include "min_area_polygons_cuda.cuh"
#include "pytorch_cuda_helper.hpp"

void MinAreaPolygonsCUDAKernelLauncher(const Tensor pointsets,
                                       Tensor polygons) {
  int num_pointsets = pointsets.size(0);
  const int output_size = polygons.numel();
  at::cuda::CUDAGuard device_guard(pointsets.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      pointsets.scalar_type(), "min_area_polygons_cuda_kernel", ([&] {
        min_area_polygons_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                num_pointsets, pointsets.data_ptr<scalar_t>(),
                polygons.data_ptr<scalar_t>());
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/modulated_deform_conv_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "modulated_deform_conv_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

void modulated_deformable_im2col_cuda(
    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor data_col) {
  // num_axes should be smaller than block size
  const int channel_per_deformable_group = channels / deformable_group;
  const int num_kernels = channels * batch_size * height_col * width_col;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] {
        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
        scalar_t *data_col_ = data_col.data_ptr<scalar_t>();

        modulated_deformable_im2col_gpu_kernel<<<
            GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
            at::cuda::getCurrentCUDAStream()>>>(
            num_kernels, data_im_, data_offset_, data_mask_, height_im,
            width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
            dilation_h, dilation_w, channel_per_deformable_group, batch_size,
            channels, deformable_group, height_col, width_col, data_col_);
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}

void modulated_deformable_col2im_cuda(
    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor grad_im) {
  const int channel_per_deformable_group = channels / deformable_group;
  const int num_kernels =
      channels * kernel_h * kernel_w * batch_size * height_col * width_col;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] {
        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
        scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();

        modulated_deformable_col2im_gpu_kernel<<<
            GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
            at::cuda::getCurrentCUDAStream()>>>(
            num_kernels, data_col_, data_offset_, data_mask_, channels,
            height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h,
            stride_w, dilation_h, dilation_w, channel_per_deformable_group,
            batch_size, deformable_group, height_col, width_col, grad_im_);
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}

void modulated_deformable_col2im_coord_cuda(
    const Tensor data_col, const Tensor data_im, const Tensor data_offset,
    const Tensor data_mask, const int batch_size, const int channels,
    const int height_im, const int width_im, const int height_col,
    const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int deformable_group,
    Tensor grad_offset, Tensor grad_mask) {
  const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h *
                          kernel_w * deformable_group;
  const int channel_per_deformable_group =
      channels * kernel_h * kernel_w / deformable_group;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] {
        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
        scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
        scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>();

        modulated_deformable_col2im_coord_gpu_kernel<<<
            GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
            at::cuda::getCurrentCUDAStream()>>>(
            num_kernels, data_col_, data_im_, data_offset_, data_mask_,
            channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w,
            stride_h, stride_w, dilation_h, dilation_w,
            channel_per_deformable_group, batch_size,
            2 * kernel_h * kernel_w * deformable_group, deformable_group,
            height_col, width_col, grad_offset_, grad_mask_);
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/ms_deform_attn_cuda.cu
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>

#include <THC/THCAtomics.cuh>
#include <vector>

#include "ms_deform_attn_cuda_kernel.cuh"

// Launches the multi-scale deformable attention im2col kernel on `stream`.
//
// One work item is created per (batch, query, head, channel) combination and
// the kernel writes its result through `data_col`. All pointer arguments are
// forwarded to the kernel unchanged.
//
// Raises a c10::Error (through TORCH_CHECK) when the launch reports a CUDA
// error, so a failed launch is not mistaken for a valid result.
template <typename scalar_t>
void ms_deformable_im2col_cuda(cudaStream_t stream, const scalar_t *data_value,
                               const int64_t *data_spatial_shapes,
                               const int64_t *data_level_start_index,
                               const scalar_t *data_sampling_loc,
                               const scalar_t *data_attn_weight,
                               const int batch_size, const int spatial_size,
                               const int num_heads, const int channels,
                               const int num_levels, const int num_query,
                               const int num_point, scalar_t *data_col) {
  // The same count sizes the grid and bounds the kernel's work-item index.
  const int num_kernels = batch_size * num_query * num_heads * channels;
  const int num_threads = THREADS_PER_BLOCK;
  ms_deformable_im2col_gpu_kernel<scalar_t>
      <<<GET_BLOCKS(num_kernels, num_threads), num_threads, 0, stream>>>(
          num_kernels, data_value, data_spatial_shapes, data_level_start_index,
          data_sampling_loc, data_attn_weight, batch_size, spatial_size,
          num_heads, channels, num_levels, num_query, num_point, data_col);

  const cudaError_t err = cudaGetLastError();
  TORCH_CHECK(err == cudaSuccess,
              "error in ms_deformable_im2col_cuda: ", cudaGetErrorString(err));
}

// Host-side launcher for the backward (col2im) kernels of multi-scale
// deformable attention.
//
// Picks one of several kernel variants based on `channels` and enqueues it on
// `stream`. Every variant receives the same argument list; gradients are
// written through `grad_value`, `grad_sampling_loc` and `grad_attn_weight`.
//
// Selection rules, as implemented below:
//   * channels >  THREADS_PER_BLOCK: blocks of THREADS_PER_BLOCK threads.
//       - (channels & (THREADS_PER_BLOCK - 1)) == 0 -> shm_reduce_v2_multi_blocks
//         with num_threads * 3 * sizeof(scalar_t) bytes of dynamic shared
//         memory.
//       - otherwise -> the `_gm` kernel with no dynamic shared memory.
//   * channels <= THREADS_PER_BLOCK: blocks of `channels` threads.
//       - channels in {1, 2, 4, 8, 16, 32}     -> blocksize_aware_reduce_v1
//       - channels in {64, 128, 256, 512}      -> blocksize_aware_reduce_v2
//         (both take the block size as a template argument and are launched
//         with 0 bytes of dynamic shared memory)
//       - any other value: shm_reduce_v1 when channels < 64, else
//         shm_reduce_v2, each with num_threads * 3 * sizeof(scalar_t) bytes of
//         dynamic shared memory.
//
// A launch failure is only printed to stdout; it is not propagated to the
// caller.
template <typename scalar_t>
void ms_deformable_col2im_cuda(
    cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_value,
    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
    const int batch_size, const int spatial_size, const int num_heads,
    const int channels, const int num_levels, const int num_query,
    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight) {
  // Threads per block: min(channels, THREADS_PER_BLOCK).
  const int num_threads =
      (channels > THREADS_PER_BLOCK) ? THREADS_PER_BLOCK : channels;
  // num_kernels is the bound handed to the kernel; num_actual_kernels sizes
  // the grid. Both are currently the same product.
  const int num_kernels = batch_size * num_query * num_heads * channels;
  const int num_actual_kernels = batch_size * num_query * num_heads * channels;
  if (channels > THREADS_PER_BLOCK) {
    // `-` binds tighter than `&`, so this tests channels & (THREADS_PER_BLOCK
    // - 1). NOTE(review): that is a "multiple of THREADS_PER_BLOCK" test only
    // if THREADS_PER_BLOCK is a power of two -- confirm against its definition.
    if ((channels & THREADS_PER_BLOCK - 1) == 0) {
      ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
             num_threads * 3 * sizeof(scalar_t), stream>>>(
              num_kernels, grad_col, data_value, data_spatial_shapes,
              data_level_start_index, data_sampling_loc, data_attn_weight,
              batch_size, spatial_size, num_heads, channels, num_levels,
              num_query, num_point, grad_value, grad_sampling_loc,
              grad_attn_weight);
    } else {
      // Fallback variant launched without any dynamic shared memory.
      ms_deformable_col2im_gpu_kernel_gm<scalar_t>
          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
             stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,
                       data_level_start_index, data_sampling_loc,
                       data_attn_weight, batch_size, spatial_size, num_heads,
                       channels, num_levels, num_query, num_point, grad_value,
                       grad_sampling_loc, grad_attn_weight);
    }
  } else {
    // Here num_threads == channels, so each case's template block size equals
    // the launched block size.
    switch (channels) {
      case 1:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,
                                                                      1>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,
                         data_level_start_index, data_sampling_loc,
                         data_attn_weight, batch_size, spatial_size, num_heads,
                         channels, num_levels, num_query, num_point, grad_value,
                         grad_sampling_loc, grad_attn_weight);
        break;
      case 2:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,
                                                                      2>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,
                         data_level_start_index, data_sampling_loc,
                         data_attn_weight, batch_size, spatial_size, num_heads,
                         channels, num_levels, num_query, num_point, grad_value,
                         grad_sampling_loc, grad_attn_weight);
        break;
      case 4:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,
                                                                      4>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,
                         data_level_start_index, data_sampling_loc,
                         data_attn_weight, batch_size, spatial_size, num_heads,
                         channels, num_levels, num_query, num_point, grad_value,
                         grad_sampling_loc, grad_attn_weight);
        break;
      case 8:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,
                                                                      8>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,
                         data_level_start_index, data_sampling_loc,
                         data_attn_weight, batch_size, spatial_size, num_heads,
                         channels, num_levels, num_query, num_point, grad_value,
                         grad_sampling_loc, grad_attn_weight);
        break;
      case 16:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,
                                                                      16>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,
                         data_level_start_index, data_sampling_loc,
                         data_attn_weight, batch_size, spatial_size, num_heads,
                         channels, num_levels, num_query, num_point, grad_value,
                         grad_sampling_loc, grad_attn_weight);
        break;
      case 32:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,
                                                                      32>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,
                         data_level_start_index, data_sampling_loc,
                         data_attn_weight, batch_size, spatial_size, num_heads,
                         channels, num_levels, num_query, num_point, grad_value,
                         grad_sampling_loc, grad_attn_weight);
        break;
      // From 64 channels upwards the v2 block-size-aware variant is used.
      case 64:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,
                                                                      64>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,
                         data_level_start_index, data_sampling_loc,
                         data_attn_weight, batch_size, spatial_size, num_heads,
                         channels, num_levels, num_query, num_point, grad_value,
                         grad_sampling_loc, grad_attn_weight);
        break;
      case 128:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,
                                                                      128>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,
                         data_level_start_index, data_sampling_loc,
                         data_attn_weight, batch_size, spatial_size, num_heads,
                         channels, num_levels, num_query, num_point, grad_value,
                         grad_sampling_loc, grad_attn_weight);
        break;
      case 256:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,
                                                                      256>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,
                         data_level_start_index, data_sampling_loc,
                         data_attn_weight, batch_size, spatial_size, num_heads,
                         channels, num_levels, num_query, num_point, grad_value,
                         grad_sampling_loc, grad_attn_weight);
        break;
      // Only reachable when THREADS_PER_BLOCK >= 512, because this switch is
      // entered only for channels <= THREADS_PER_BLOCK.
      case 512:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,
                                                                      512>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,
                         data_level_start_index, data_sampling_loc,
                         data_attn_weight, batch_size, spatial_size, num_heads,
                         channels, num_levels, num_query, num_point, grad_value,
                         grad_sampling_loc, grad_attn_weight);
        break;
      default:
        // Channel counts without a dedicated template instantiation use the
        // generic kernels, which take their scratch space as dynamic shared
        // memory sized from the launched block.
        if (channels < 64) {
          ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
              <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                 num_threads * 3 * sizeof(scalar_t), stream>>>(
                  num_kernels, grad_col, data_value, data_spatial_shapes,
                  data_level_start_index, data_sampling_loc, data_attn_weight,
                  batch_size, spatial_size, num_heads, channels, num_levels,
                  num_query, num_point, grad_value, grad_sampling_loc,
                  grad_attn_weight);
        } else {
          ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
              <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                 num_threads * 3 * sizeof(scalar_t), stream>>>(
                  num_kernels, grad_col, data_value, data_spatial_shapes,
                  data_level_start_index, data_sampling_loc, data_attn_weight,
                  batch_size, spatial_size, num_heads, channels, num_levels,
                  num_query, num_point, grad_value, grad_sampling_loc,
                  grad_attn_weight);
        }
    }
  }
  // Launch errors are reported on stdout only; the caller is not notified.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
  }
}

// Forward pass of multi-scale deformable attention on CUDA.
//
// Args:
//   value:             contiguous CUDA tensor; its sizes are read as
//                      (batch, spatial_size, num_heads, channels).
//   spatial_shapes:    contiguous int64 CUDA tensor; size(0) is num_levels.
//   level_start_index: contiguous int64 CUDA tensor, forwarded to the kernel.
//   sampling_loc:      contiguous CUDA tensor; size(1) is num_query and
//                      size(4) is num_point.
//   attn_weight:       contiguous CUDA tensor, accessed with value's scalar
//                      type.
//   im2col_step:       maximum number of batch samples handled per kernel
//                      launch. Must be positive; it is clamped to `batch`, and
//                      `batch` must be a multiple of the clamped value.
//
// Returns:
//   A tensor of shape (batch, num_query, num_heads * channels) created with
//   value's options.
//
// Raises a c10 error if any tensor check fails, if im2col_step is not
// positive, or if batch is not divisible by the effective step.
at::Tensor ms_deform_attn_cuda_forward(const at::Tensor &value,
                                       const at::Tensor &spatial_shapes,
                                       const at::Tensor &level_start_index,
                                       const at::Tensor &sampling_loc,
                                       const at::Tensor &attn_weight,
                                       const int im2col_step) {
  AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
  AT_ASSERTM(spatial_shapes.is_contiguous(),
             "spatial_shapes tensor has to be contiguous");
  AT_ASSERTM(level_start_index.is_contiguous(),
             "level_start_index tensor has to be contiguous");
  AT_ASSERTM(sampling_loc.is_contiguous(),
             "sampling_loc tensor has to be contiguous");
  AT_ASSERTM(attn_weight.is_contiguous(),
             "attn_weight tensor has to be contiguous");

  AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor");
  AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
  AT_ASSERTM(level_start_index.is_cuda(),
             "level_start_index must be a CUDA tensor");
  AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
  AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");

  const int batch = value.size(0);
  const int spatial_size = value.size(1);
  const int num_heads = value.size(2);
  const int channels = value.size(3);

  const int num_levels = spatial_shapes.size(0);

  const int num_query = sampling_loc.size(1);
  const int num_point = sampling_loc.size(4);

  // A non-positive step would make the modulo/division below undefined (or
  // produce a negative chunk count), so reject it up front.
  AT_ASSERTM(im2col_step > 0, "im2col_step must be positive, but got ",
             im2col_step);

  // With an empty batch the clamped step would be 0 and `batch % 0` would
  // trap; there is nothing to compute, so return the empty result directly.
  if (batch == 0) {
    return at::zeros({batch, num_query, num_heads * channels},
                     value.options());
  }

  const int im2col_step_ = std::min(batch, im2col_step);

  // The message arguments are concatenated, not printf-formatted, so the
  // values are passed as separate pieces.
  AT_ASSERTM(batch % im2col_step_ == 0, "batch(", batch,
             ") must be divisible by im2col_step(", im2col_step_, ")");

  auto output =
      at::zeros({batch, num_query, num_heads, channels}, value.options());

  const int batch_n = im2col_step_;
  const int num_chunks = batch / im2col_step_;
  auto output_n =
      output.view({num_chunks, batch_n, num_query, num_heads, channels});

  // Per-sample element counts, kept in 64 bits so the pointer offsets computed
  // on the host below cannot wrap around for large inputs.
  const int64_t per_value_size =
      static_cast<int64_t>(spatial_size) * num_heads * channels;
  const int64_t per_sample_loc_size =
      static_cast<int64_t>(num_query) * num_heads * num_levels * num_point * 2;
  const int64_t per_attn_weight_size =
      static_cast<int64_t>(num_query) * num_heads * num_levels * num_point;

  for (int n = 0; n < num_chunks; ++n) {
    auto columns = output_n.select(0, n);
    // Index of the first batch sample processed by this chunk.
    const int64_t first_sample = static_cast<int64_t>(n) * im2col_step_;
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        value.scalar_type(), "ms_deform_attn_forward_cuda", ([&] {
          ms_deformable_im2col_cuda(
              at::cuda::getCurrentCUDAStream(),
              value.data_ptr<scalar_t>() + first_sample * per_value_size,
              spatial_shapes.data_ptr<int64_t>(),
              level_start_index.data_ptr<int64_t>(),
              sampling_loc.data_ptr<scalar_t>() +
                  first_sample * per_sample_loc_size,
              attn_weight.data_ptr<scalar_t>() +
                  first_sample * per_attn_weight_size,
              batch_n, spatial_size, num_heads, channels, num_levels, num_query,
              num_point, columns.data_ptr<scalar_t>());
        }));
  }

  // Merge the head and channel dimensions for the caller.
  output = output.view({batch, num_query, num_heads * channels});

  return output;
}

// Backward pass of multi-scale deformable attention on CUDA.
//
// Args:
//   value:             contiguous CUDA tensor; its sizes are read as
//                      (batch, spatial_size, num_heads, channels).
//   spatial_shapes:    contiguous int64 CUDA tensor; size(0) is num_levels.
//   level_start_index: contiguous int64 CUDA tensor, forwarded to the kernel.
//   sampling_loc:      contiguous CUDA tensor; size(1) is num_query and
//                      size(4) is num_point.
//   attn_weight:       contiguous CUDA tensor, accessed with value's scalar
//                      type.
//   grad_output:       contiguous CUDA tensor, viewed as
//                      (batch / step, step, num_query, num_heads, channels).
//   grad_value, grad_sampling_loc, grad_attn_weight:
//                      output buffers written through raw pointers with the
//                      same per-sample strides as value, sampling_loc and
//                      attn_weight respectively.
//                      NOTE(review): they are not validated here; presumably
//                      the caller allocates them contiguous and on the same
//                      device -- confirm.
//   im2col_step:       maximum number of batch samples handled per kernel
//                      launch. Must be positive; it is clamped to `batch`, and
//                      `batch` must be a multiple of the clamped value.
//
// Raises a c10 error if any tensor check fails, if im2col_step is not
// positive, or if batch is not divisible by the effective step.
void ms_deform_attn_cuda_backward(
    const at::Tensor &value, const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index, const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight, const at::Tensor &grad_output,
    at::Tensor &grad_value, at::Tensor &grad_sampling_loc,
    at::Tensor &grad_attn_weight, const int im2col_step) {
  AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
  AT_ASSERTM(spatial_shapes.is_contiguous(),
             "spatial_shapes tensor has to be contiguous");
  AT_ASSERTM(level_start_index.is_contiguous(),
             "level_start_index tensor has to be contiguous");
  AT_ASSERTM(sampling_loc.is_contiguous(),
             "sampling_loc tensor has to be contiguous");
  AT_ASSERTM(attn_weight.is_contiguous(),
             "attn_weight tensor has to be contiguous");
  AT_ASSERTM(grad_output.is_contiguous(),
             "grad_output tensor has to be contiguous");

  AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor");
  AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
  AT_ASSERTM(level_start_index.is_cuda(),
             "level_start_index must be a CUDA tensor");
  AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
  AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
  AT_ASSERTM(grad_output.is_cuda(), "grad_output must be a CUDA tensor");

  const int batch = value.size(0);
  const int spatial_size = value.size(1);
  const int num_heads = value.size(2);
  const int channels = value.size(3);

  const int num_levels = spatial_shapes.size(0);

  const int num_query = sampling_loc.size(1);
  const int num_point = sampling_loc.size(4);

  // A non-positive step would make the modulo/division below undefined (or
  // produce a negative chunk count), so reject it up front.
  AT_ASSERTM(im2col_step > 0, "im2col_step must be positive, but got ",
             im2col_step);

  // With an empty batch the clamped step would be 0 and `batch % 0` would
  // trap; there are no gradients to accumulate, so stop here.
  if (batch == 0) {
    return;
  }

  const int im2col_step_ = std::min(batch, im2col_step);

  // The message arguments are concatenated, not printf-formatted, so the
  // values are passed as separate pieces.
  AT_ASSERTM(batch % im2col_step_ == 0, "batch(", batch,
             ") must be divisible by im2col_step(", im2col_step_, ")");

  const int batch_n = im2col_step_;
  const int num_chunks = batch / im2col_step_;

  // Per-sample element counts, kept in 64 bits so the pointer offsets computed
  // on the host below cannot wrap around for large inputs.
  const int64_t per_value_size =
      static_cast<int64_t>(spatial_size) * num_heads * channels;
  const int64_t per_sample_loc_size =
      static_cast<int64_t>(num_query) * num_heads * num_levels * num_point * 2;
  const int64_t per_attn_weight_size =
      static_cast<int64_t>(num_query) * num_heads * num_levels * num_point;

  auto grad_output_n =
      grad_output.view({num_chunks, batch_n, num_query, num_heads, channels});

  for (int n = 0; n < num_chunks; ++n) {
    auto grad_output_g = grad_output_n.select(0, n);
    // Index of the first batch sample processed by this chunk.
    const int64_t first_sample = static_cast<int64_t>(n) * im2col_step_;
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        value.scalar_type(), "ms_deform_attn_backward_cuda", ([&] {
          ms_deformable_col2im_cuda(
              at::cuda::getCurrentCUDAStream(),
              grad_output_g.data_ptr<scalar_t>(),
              value.data_ptr<scalar_t>() + first_sample * per_value_size,
              spatial_shapes.data_ptr<int64_t>(),
              level_start_index.data_ptr<int64_t>(),
              sampling_loc.data_ptr<scalar_t>() +
                  first_sample * per_sample_loc_size,
              attn_weight.data_ptr<scalar_t>() +
                  first_sample * per_attn_weight_size,
              batch_n, spatial_size, num_heads, channels, num_levels, num_query,
              num_point,
              grad_value.data_ptr<scalar_t>() + first_sample * per_value_size,
              grad_sampling_loc.data_ptr<scalar_t>() +
                  first_sample * per_sample_loc_size,
              grad_attn_weight.data_ptr<scalar_t>() +
                  first_sample * per_attn_weight_size);
        }));
  }
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/nms_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "nms_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

// Runs NMS on the GPU and returns the indices (into `boxes`) of the boxes that
// are kept, ordered by descending score.
//
// Steps: sort by score, let `nms_cuda` fill a suppression bitmask for the
// sorted boxes, let `gather_keep_from_mask` turn that bitmask into a boolean
// keep flag per box, then select the surviving entries of the sort order.
// The box data is read as float.
Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
                             int offset) {
  at::cuda::CUDAGuard device_guard(boxes.device());

  // No boxes: hand back an empty int64 index tensor.
  if (boxes.numel() == 0) {
    return at::empty({0}, boxes.options().dtype(at::kLong));
  }

  // Sort order (highest score first) and the boxes gathered in that order.
  auto order_t = std::get<1>(scores.sort(0, /*descending=*/true));
  auto boxes_sorted = boxes.index_select(0, order_t);

  const int num_boxes = boxes.size(0);
  // Number of mask words per box row.
  const int mask_cols = (num_boxes + threadsPerBlock - 1) / threadsPerBlock;
  // Grid side actually launched, as computed by GET_BLOCKS.
  const int grid_side = GET_BLOCKS(num_boxes, threadsPerBlock);

  Tensor mask =
      at::empty({num_boxes, mask_cols}, boxes.options().dtype(at::kLong));
  auto* mask_dev =
      reinterpret_cast<unsigned long long*>(mask.data_ptr<int64_t>());

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  const dim3 nms_grid(grid_side, grid_side);
  const dim3 nms_block(threadsPerBlock);
  nms_cuda<<<nms_grid, nms_block, 0, stream>>>(
      num_boxes, iou_threshold, offset, boxes_sorted.data_ptr<float>(),
      mask_dev);

  // Reduce the bitmask to one keep flag per (sorted) box, using a single
  // block and one shared-memory word per mask column.
  at::Tensor keep_t = at::zeros(
      {num_boxes}, boxes.options().dtype(at::kBool).device(at::kCUDA));
  const int gather_threads = min(mask_cols, THREADS_PER_BLOCK);
  const size_t gather_shared_bytes = mask_cols * sizeof(unsigned long long);
  gather_keep_from_mask<<<1, gather_threads, gather_shared_bytes, stream>>>(
      keep_t.data_ptr<bool>(), mask_dev, num_boxes);

  AT_CUDA_CHECK(cudaGetLastError());
  return order_t.masked_select(keep_t);
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/nms_quadri_cuda.cu
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "nms_quadri_cuda.cuh"
#include "pytorch_cuda_helper.hpp"

// NMS for quadrilateral boxes: the pairwise suppression mask is built on the
// GPU, then reduced on the host.
//
// Args:
//   dets:          CUDA tensor; only its size(0), device and options are used
//                  here.
//   scores:        CUDA tensor (only checked for being on CUDA).
//   order_t:       index tensor mapping sorted positions back to original
//                  detection indices.
//   dets_sorted:   detections in sorted order; this is what the kernel reads.
//   iou_threshold: forwarded to the kernel.
//   multi_label:   forwarded to the kernel.
//
// Returns:
//   The entries of `order_t` for the detections that survive suppression, or
//   an empty int64 tensor when `dets` has no rows.
Tensor nms_quadri_cuda(const Tensor dets, const Tensor scores,
                       const Tensor order_t, const Tensor dets_sorted,
                       float iou_threshold, const int multi_label) {
  AT_ASSERTM(dets.is_cuda(), "dets must be a CUDA tensor");
  AT_ASSERTM(scores.is_cuda(), "scores must be a CUDA tensor");
  at::cuda::CUDAGuard device_guard(dets.device());

  const int dets_num = dets.size(0);

  // With no detections the grid below would be 0x0, which is an invalid launch
  // configuration; return an empty result instead.
  if (dets_num == 0) {
    return at::empty({0}, dets.options().dtype(at::kLong));
  }

  // Each mask word covers threadsPerBlock detections.
  const int col_blocks = at::cuda::ATenCeilDiv(dets_num, threadsPerBlock);

  // One row of col_blocks words per detection; sized in 64 bits so the element
  // count cannot overflow int.
  Tensor mask = at::empty({static_cast<int64_t>(dets_num) * col_blocks},
                          dets.options().dtype(at::kLong));

  dim3 blocks(col_blocks, col_blocks);
  dim3 threads(threadsPerBlock);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      dets_sorted.scalar_type(), "nms_quadri_kernel_cuda", [&] {
        nms_quadri_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
            dets_num, iou_threshold, dets_sorted.data_ptr<scalar_t>(),
            (unsigned long long*)mask.data_ptr<int64_t>(), multi_label);
      });
  // Surface launch failures before the mask is copied back and consumed.
  AT_CUDA_CHECK(cudaGetLastError());

  Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long* mask_host =
      (unsigned long long*)mask_cpu.data_ptr<int64_t>();

  // Accumulated "already suppressed" bits; the vector constructor
  // value-initialises every word to zero.
  std::vector<unsigned long long> remv(col_blocks);

  Tensor keep =
      at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
  int64_t* keep_out = keep.data_ptr<int64_t>();

  int num_to_keep = 0;
  for (int i = 0; i < dets_num; i++) {
    const int nblock = i / threadsPerBlock;
    const int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      // Detection i has not been suppressed: keep it and merge its mask row
      // into the running suppression set.
      keep_out[num_to_keep++] = i;
      // 64-bit row offset: i * col_blocks can exceed INT_MAX for large inputs.
      const unsigned long long* p =
          mask_host + static_cast<int64_t>(i) * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }

  // Map kept sorted positions back to original detection indices.
  return order_t.index(
      {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
           .to(order_t.device(), keep.scalar_type())});
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/nms_rotated_cuda.cu
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu
#include "nms_rotated_cuda.cuh"
#include "pytorch_cuda_helper.hpp"

// NMS for rotated boxes: the pairwise suppression mask is built on the GPU,
// then reduced on the host.
//
// Args:
//   dets:          CUDA tensor; only its size(0), device and options are used
//                  here.
//   scores:        CUDA tensor (only checked for being on CUDA).
//   order_t:       index tensor mapping sorted positions back to original
//                  detection indices.
//   dets_sorted:   detections in sorted order; this is what the kernel reads.
//   iou_threshold: forwarded to the kernel.
//   multi_label:   forwarded to the kernel.
//
// Returns:
//   The entries of `order_t` for the detections that survive suppression, or
//   an empty int64 tensor when `dets` has no rows.
Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores,
                        const Tensor order_t, const Tensor dets_sorted,
                        float iou_threshold, const int multi_label) {
  AT_ASSERTM(dets.is_cuda(), "dets must be a CUDA tensor");
  AT_ASSERTM(scores.is_cuda(), "scores must be a CUDA tensor");
  at::cuda::CUDAGuard device_guard(dets.device());

  const int dets_num = dets.size(0);

  // With no detections the grid below would be 0x0, which is an invalid launch
  // configuration; return an empty result instead.
  if (dets_num == 0) {
    return at::empty({0}, dets.options().dtype(at::kLong));
  }

  // Each mask word covers threadsPerBlock detections.
  const int col_blocks = at::cuda::ATenCeilDiv(dets_num, threadsPerBlock);

  // One row of col_blocks words per detection; sized in 64 bits so the element
  // count cannot overflow int.
  Tensor mask = at::empty({static_cast<int64_t>(dets_num) * col_blocks},
                          dets.options().dtype(at::kLong));

  dim3 blocks(col_blocks, col_blocks);
  dim3 threads(threadsPerBlock);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      dets_sorted.scalar_type(), "nms_rotated_kernel_cuda", [&] {
        nms_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
            dets_num, iou_threshold, dets_sorted.data_ptr<scalar_t>(),
            (unsigned long long*)mask.data_ptr<int64_t>(), multi_label);
      });
  // Surface launch failures before the mask is copied back and consumed.
  AT_CUDA_CHECK(cudaGetLastError());

  Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long* mask_host =
      (unsigned long long*)mask_cpu.data_ptr<int64_t>();

  // Accumulated "already suppressed" bits; the vector constructor
  // value-initialises every word to zero.
  std::vector<unsigned long long> remv(col_blocks);

  Tensor keep =
      at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
  int64_t* keep_out = keep.data_ptr<int64_t>();

  int num_to_keep = 0;
  for (int i = 0; i < dets_num; i++) {
    const int nblock = i / threadsPerBlock;
    const int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      // Detection i has not been suppressed: keep it and merge its mask row
      // into the running suppression set.
      keep_out[num_to_keep++] = i;
      // 64-bit row offset: i * col_blocks can exceed INT_MAX for large inputs.
      const unsigned long long* p =
          mask_host + static_cast<int64_t>(i) * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }

  // Map kept sorted positions back to original detection indices.
  return order_t.index(
      {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
           .to(order_t.device(), keep.scalar_type())});
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/points_in_boxes_cuda.cu
================================================
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.

#include <stdio.h>

#include "points_in_boxes_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

// Launches points_in_boxes_part_forward_cuda_kernel.
//
//   boxes:             (B, N, 7) as [x, y, z, x_size, y_size, z_size, rz] in
//                      LiDAR coordinates, z being the bottom center; boxes are
//                      expected not to overlap each other.
//   pts:               (B, npoints, 3) as [x, y, z] in LiDAR coordinates.
//   box_idx_of_points: (B, npoints) int output, default -1.
//
// The grid is GET_BLOCKS(pts_num, THREADS_PER_BLOCK) blocks along x and
// batch_size blocks along y.
void PointsInBoxesPartForwardCUDAKernelLauncher(int batch_size, int boxes_num,
                                                int pts_num, const Tensor boxes,
                                                const Tensor pts,
                                                Tensor box_idx_of_points) {
  at::cuda::CUDAGuard device_guard(boxes.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  const dim3 block_dim(THREADS_PER_BLOCK);
  const dim3 grid_dim(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), batch_size);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      boxes.scalar_type(), "points_in_boxes_part_forward_cuda_kernel", ([&] {
        auto* boxes_data = boxes.data_ptr<scalar_t>();
        auto* pts_data = pts.data_ptr<scalar_t>();
        int* box_idx_data = box_idx_of_points.data_ptr<int>();

        points_in_boxes_part_forward_cuda_kernel<scalar_t>
            <<<grid_dim, block_dim, 0, stream>>>(batch_size, boxes_num,
                                                 pts_num, boxes_data, pts_data,
                                                 box_idx_data);
      }));

  AT_CUDA_CHECK(cudaGetLastError());
}

// Launches points_in_boxes_all_forward_cuda_kernel.
//
//   boxes:             (B, N, 7) as [x, y, z, x_size, y_size, z_size, rz] in
//                      LiDAR coordinates, z being the bottom center.
//   pts:               (B, npoints, 3) as [x, y, z] in LiDAR coordinates.
//   box_idx_of_points: int output buffer, default -1.
//
// The grid is GET_BLOCKS(pts_num, THREADS_PER_BLOCK) blocks along x and
// batch_size blocks along y.
void PointsInBoxesAllForwardCUDAKernelLauncher(int batch_size, int boxes_num,
                                               int pts_num, const Tensor boxes,
                                               const Tensor pts,
                                               Tensor box_idx_of_points) {
  at::cuda::CUDAGuard device_guard(boxes.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  const dim3 block_dim(THREADS_PER_BLOCK);
  const dim3 grid_dim(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), batch_size);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      boxes.scalar_type(), "points_in_boxes_all_forward_cuda_kernel", ([&] {
        auto* boxes_data = boxes.data_ptr<scalar_t>();
        auto* pts_data = pts.data_ptr<scalar_t>();
        int* box_idx_data = box_idx_of_points.data_ptr<int>();

        points_in_boxes_all_forward_cuda_kernel<scalar_t>
            <<<grid_dim, block_dim, 0, stream>>>(batch_size, boxes_num,
                                                 pts_num, boxes_data, pts_data,
                                                 box_idx_data);
      }));

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/points_in_polygons_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/ming71/CUDA/blob/master/point_justify/points_justify_kernel.cu

#include <stdio.h>

#include "points_in_polygons_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

// Launches points_in_polygons_forward_cuda_kernel over rows * cols work items.
//
// `points` and `polygons` are read, and `output` is written, all using the
// scalar type of `points`. The kernel runs on the current stream of the device
// that holds `points`.
void PointsInPolygonsForwardCUDAKernelLauncher(const at::Tensor points,
                                               const at::Tensor polygons,
                                               const int rows, const int cols,
                                               at::Tensor output) {
  at::cuda::CUDAGuard device_guard(points.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // One work item per (row, col) pair.
  const int num_pairs = rows * cols;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      points.scalar_type(), "points_in_polygons_forward_cuda_kernel", ([&] {
        points_in_polygons_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(num_pairs), THREADS_PER_BLOCK, 0, stream>>>(
                num_pairs, points.data_ptr<scalar_t>(),
                polygons.data_ptr<scalar_t>(), rows, cols,
                output.data_ptr<scalar_t>());
      }));

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/prroi_pool_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "prroi_pool_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

// Launches prroi_pool_forward_cuda_kernel<float> with one work item per
// element of `output`.
//
// channels / height / width are taken from dimensions 1 / 2 / 3 of `input`.
// All tensors are accessed as float.
void PrROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
                                        Tensor output, int pooled_height,
                                        int pooled_width, float spatial_scale) {
  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  const int num_outputs = output.numel();
  const int num_channels = input.size(1);
  const int in_height = input.size(2);
  const int in_width = input.size(3);

  float* input_data = input.data_ptr<float>();
  float* rois_data = rois.data_ptr<float>();
  float* output_data = output.data_ptr<float>();

  prroi_pool_forward_cuda_kernel<float>
      <<<GET_BLOCKS(num_outputs), THREADS_PER_BLOCK, 0, stream>>>(
          num_outputs, input_data, rois_data, output_data, pooled_height,
          pooled_width, spatial_scale, num_channels, in_height, in_width);

  AT_CUDA_CHECK(cudaGetLastError());
}

// Launches prroi_pool_backward_cuda_kernel<float> with one work item per
// element of `grad_output`.
//
// channels / height / width are taken from dimensions 1 / 2 / 3 of
// `grad_input`, which is the buffer the kernel writes to. All tensors are
// accessed as float.
void PrROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
                                         Tensor grad_input, int pooled_height,
                                         int pooled_width,
                                         float spatial_scale) {
  at::cuda::CUDAGuard device_guard(grad_output.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  const int num_outputs = grad_output.numel();
  const int num_channels = grad_input.size(1);
  const int in_height = grad_input.size(2);
  const int in_width = grad_input.size(3);

  float* grad_output_data = grad_output.data_ptr<float>();
  float* rois_data = rois.data_ptr<float>();
  float* grad_input_data = grad_input.data_ptr<float>();

  prroi_pool_backward_cuda_kernel<float>
      <<<GET_BLOCKS(num_outputs), THREADS_PER_BLOCK, 0, stream>>>(
          num_outputs, grad_output_data, rois_data, grad_input_data,
          pooled_height, pooled_width, spatial_scale, num_channels, in_height,
          in_width);

  AT_CUDA_CHECK(cudaGetLastError());
}

// Launches prroi_pool_coor_backward_cuda_kernel<float> with one work item per
// element of `grad_output`; the kernel writes to `grad_rois`.
//
// channels / height / width are taken from dimensions 1 / 2 / 3 of `input`.
// All tensors are accessed as float.
void PrROIPoolCoorBackwardCUDAKernelLauncher(Tensor output, Tensor grad_output,
                                             Tensor input, Tensor rois,
                                             Tensor grad_rois,
                                             int pooled_height,
                                             int pooled_width,
                                             float spatial_scale) {
  at::cuda::CUDAGuard device_guard(grad_output.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  const int num_outputs = grad_output.numel();
  const int num_channels = input.size(1);
  const int in_height = input.size(2);
  const int in_width = input.size(3);

  float* output_data = output.data_ptr<float>();
  float* grad_output_data = grad_output.data_ptr<float>();
  float* input_data = input.data_ptr<float>();
  float* rois_data = rois.data_ptr<float>();
  float* grad_rois_data = grad_rois.data_ptr<float>();

  prroi_pool_coor_backward_cuda_kernel<float>
      <<<GET_BLOCKS(num_outputs), THREADS_PER_BLOCK, 0, stream>>>(
          num_outputs, output_data, grad_output_data, input_data, rois_data,
          grad_rois_data, pooled_height, pooled_width, spatial_scale,
          num_channels, in_height, in_width);

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/psamask_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src

#include <torch/serialize/tensor.h>

#include "psamask_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

// Launches the PSA mask forward kernel.
//
// Args:
//   psa_type:  0 selects psamask_collect_forward_cuda; any other value selects
//              psamask_distribute_forward_cuda.
//   input:     source tensor (float or double).
//   output:    destination tensor, accessed with input's scalar type.
//   num_, h_feature, w_feature:
//              their product is the number of work items (and of launched
//              blocks, each with 512 threads).
//   h_mask, w_mask, half_h_mask, half_w_mask:
//              forwarded to the kernel unchanged.
//
// Raises a c10 error if the kernel launch fails.
void PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input,
                                      Tensor output, const int num_,
                                      const int h_feature, const int w_feature,
                                      const int h_mask, const int w_mask,
                                      const int half_h_mask,
                                      const int half_w_mask) {
  const int nthreads = num_ * h_feature * w_feature;
  // A grid of zero blocks is an invalid launch configuration; with no work
  // items there is nothing to do.
  if (nthreads <= 0) {
    return;
  }

  // Launch on the device that owns the data rather than on whichever device
  // happens to be current.
  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  if (psa_type == 0) {
    AT_DISPATCH_FLOATING_TYPES(
        input.scalar_type(), "psamask_collect_forward_cuda", [&] {
          psamask_collect_forward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
              nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
              half_w_mask, input.data_ptr<scalar_t>(),
              output.data_ptr<scalar_t>());
        });
  } else {
    AT_DISPATCH_FLOATING_TYPES(
        input.scalar_type(), "psamask_distribute_forward_cuda", [&] {
          psamask_distribute_forward_cuda<scalar_t>
              <<<nthreads, 512, 0, stream>>>(
                  nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
                  half_w_mask, input.data_ptr<scalar_t>(),
                  output.data_ptr<scalar_t>());
        });
  }

  // Report launch failures here instead of leaving them for an unrelated
  // later CUDA call to trip over.
  AT_CUDA_CHECK(cudaGetLastError());
}

// Launches the PSA mask backward kernel, reading `grad_output` and writing
// `grad_input`.
//
// psa_type == 0 dispatches psamask_collect_backward_cuda; any other value
// dispatches psamask_distribute_backward_cuda. The kernel receives
// num_ * h_feature * w_feature as its work size and is enqueued on the current
// CUDA stream of the device that holds `grad_output`.
void PSAMaskBackwardCUDAKernelLauncher(
    const int psa_type, const Tensor grad_output, Tensor grad_input,
    const int num_, const int h_feature, const int w_feature, const int h_mask,
    const int w_mask, const int half_h_mask, const int half_w_mask) {
  const int nthreads = num_ * h_feature * w_feature;
  // Nothing to compute. A launch with an empty grid would be rejected by CUDA
  // and leave an error pending for whichever call checks next.
  if (nthreads == 0) return;

  // Make the tensor's device current so the stream queried below belongs to
  // it, matching the other launchers in this directory.
  at::cuda::CUDAGuard device_guard(grad_output.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // NOTE(review): the grid uses `nthreads` blocks of 512 threads each, which
  // looks like more threads than work items - confirm against the kernel and
  // consider GET_BLOCKS(nthreads, 512).
  if (psa_type == 0) {
    AT_DISPATCH_FLOATING_TYPES(
        grad_input.scalar_type(), "psamask_collect_backward_cuda", [&] {
          psamask_collect_backward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
              nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
              half_w_mask, grad_output.data_ptr<scalar_t>(),
              grad_input.data_ptr<scalar_t>());
        });
  } else {
    AT_DISPATCH_FLOATING_TYPES(
        grad_input.scalar_type(), "psamask_distribute_backward_cuda", [&] {
          psamask_distribute_backward_cuda<scalar_t>
              <<<nthreads, 512, 0, stream>>>(
                  nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
                  half_w_mask, grad_output.data_ptr<scalar_t>(),
                  grad_input.data_ptr<scalar_t>());
        });
  }

  // Surface launch failures here instead of in an unrelated later call.
  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/riroi_align_rotated_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cuda_helper.hpp"
#include "riroi_align_rotated_cuda_kernel.cuh"

// Forward launcher for riroi_align_rotated_forward_cuda_kernel.
//
// Reads `features` and `rois`, writes `output`. The kernel is dispatched on
// the dtype of `features` (floating types and half) and enqueued on the
// current CUDA stream of the device holding `features`.
void RiROIAlignRotatedForwardCUDAKernelLauncher(
    const at::Tensor features, const at::Tensor rois, const float spatial_scale,
    const int num_samples, const bool clockwise, const int channels,
    const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, const int num_orientations,
    at::Tensor output) {
  // Work size handed to the kernel as its first argument.
  const int output_size =
      num_rois * pooled_height * pooled_width * channels * num_orientations;

  at::cuda::CUDAGuard device_guard(features.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  const int grid_size = GET_BLOCKS(output_size);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      features.scalar_type(), "riroi_align_rotated_forward_cuda_kernel", ([&] {
        const scalar_t *feature_ptr = features.data_ptr<scalar_t>();
        const scalar_t *roi_ptr = rois.data_ptr<scalar_t>();
        scalar_t *output_ptr = output.data_ptr<scalar_t>();
        // The scale is converted to the dispatched scalar type up front.
        const scalar_t scale = scalar_t(spatial_scale);

        riroi_align_rotated_forward_cuda_kernel<scalar_t>
            <<<grid_size, THREADS_PER_BLOCK, 0, stream>>>(
                output_size, feature_ptr, roi_ptr, scale, num_samples,
                clockwise, channels, height, width, pooled_height,
                pooled_width, num_orientations, output_ptr);
      }));

  AT_CUDA_CHECK(cudaGetLastError());
}

// Backward launcher for riroi_align_rotated_backward_cuda_kernel.
//
// Reads `top_grad` and `rois`, writes `bottom_grad`. The kernel is dispatched
// on the dtype of `top_grad` (floating types and half) and enqueued on the
// current CUDA stream of the device holding `top_grad`.
void RiROIAlignRotatedBackwardCUDAKernelLauncher(
    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
    const int num_samples, const bool clockwise, const int channels,
    const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, const int num_orientations,
    at::Tensor bottom_grad) {
  // Work size handed to the kernel as its first argument.
  const int output_size =
      num_rois * pooled_height * pooled_width * channels * num_orientations;

  at::cuda::CUDAGuard device_guard(top_grad.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  const int grid_size = GET_BLOCKS(output_size);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "riroi_align_rotated_backward_cuda_kernel", ([&] {
        const scalar_t *grad_out_ptr = top_grad.data_ptr<scalar_t>();
        const scalar_t *roi_ptr = rois.data_ptr<scalar_t>();
        scalar_t *grad_in_ptr = bottom_grad.data_ptr<scalar_t>();

        // spatial_scale is passed as a float and converted at the call site
        // to whatever the kernel parameter type is.
        riroi_align_rotated_backward_cuda_kernel<scalar_t>
            <<<grid_size, THREADS_PER_BLOCK, 0, stream>>>(
                output_size, grad_out_ptr, roi_ptr, spatial_scale, num_samples,
                clockwise, channels, height, width, pooled_height, pooled_width,
                num_orientations, grad_in_ptr);
      }));

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/roi_align_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cuda_helper.hpp"
#include "roi_align_cuda_kernel.cuh"

// Forward launcher for roi_align_forward_cuda_kernel.
//
// Reads `input` and `rois`; writes `output`, `argmax_y` and `argmax_x`.
// channels/height/width are taken from dimensions 1, 2 and 3 of `input`, and
// the kernel work size is the number of elements in `output`.
void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                       Tensor argmax_y, Tensor argmax_x,
                                       int aligned_height, int aligned_width,
                                       float spatial_scale, int sampling_ratio,
                                       int pool_mode, bool aligned) {
  const int output_size = output.numel();
  const int channels = input.size(1);
  const int height = input.size(2);
  const int width = input.size(3);

  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  const int grid_size = GET_BLOCKS(output_size);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "roi_align_forward_cuda_kernel", [&] {
        // All tensors are accessed with the dtype dispatched from `input`.
        auto *input_ptr = input.data_ptr<scalar_t>();
        auto *rois_ptr = rois.data_ptr<scalar_t>();
        auto *output_ptr = output.data_ptr<scalar_t>();
        auto *argmax_y_ptr = argmax_y.data_ptr<scalar_t>();
        auto *argmax_x_ptr = argmax_x.data_ptr<scalar_t>();
        const auto scale = static_cast<scalar_t>(spatial_scale);

        roi_align_forward_cuda_kernel<scalar_t>
            <<<grid_size, THREADS_PER_BLOCK, 0, stream>>>(
                output_size, input_ptr, rois_ptr, output_ptr, argmax_y_ptr,
                argmax_x_ptr, aligned_height, aligned_width, scale,
                sampling_ratio, pool_mode, aligned, channels, height, width);
      });

  AT_CUDA_CHECK(cudaGetLastError());
}

// Backward launcher for roi_align_backward_cuda_kernel.
//
// Reads `grad_output`, `rois`, `argmax_y` and `argmax_x`; writes `grad_input`.
// channels/height/width are taken from dimensions 1, 2 and 3 of `grad_input`,
// and the kernel work size is the number of elements in `grad_output`.
void ROIAlignBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
                                        Tensor argmax_y, Tensor argmax_x,
                                        Tensor grad_input, int aligned_height,
                                        int aligned_width, float spatial_scale,
                                        int sampling_ratio, int pool_mode,
                                        bool aligned) {
  const int output_size = grad_output.numel();
  const int channels = grad_input.size(1);
  const int height = grad_input.size(2);
  const int width = grad_input.size(3);

  at::cuda::CUDAGuard device_guard(grad_output.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  const int grid_size = GET_BLOCKS(output_size);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "roi_align_backward_cuda_kernel", [&] {
        // All tensors are accessed with the dtype dispatched from
        // `grad_output`.
        auto *grad_out_ptr = grad_output.data_ptr<scalar_t>();
        auto *rois_ptr = rois.data_ptr<scalar_t>();
        auto *argmax_y_ptr = argmax_y.data_ptr<scalar_t>();
        auto *argmax_x_ptr = argmax_x.data_ptr<scalar_t>();
        auto *grad_in_ptr = grad_input.data_ptr<scalar_t>();
        const auto scale = static_cast<scalar_t>(spatial_scale);

        roi_align_backward_cuda_kernel<scalar_t>
            <<<grid_size, THREADS_PER_BLOCK, 0, stream>>>(
                output_size, grad_out_ptr, rois_ptr, argmax_y_ptr,
                argmax_x_ptr, grad_in_ptr, aligned_height, aligned_width,
                scale, sampling_ratio, pool_mode, aligned, channels, height,
                width);
      });

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/roi_align_rotated_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cuda_helper.hpp"
#include "roi_align_rotated_cuda_kernel.cuh"

// Forward launcher for roi_align_rotated_forward_cuda_kernel.
//
// Reads `input` and `rois`, writes `output`. The kernel is dispatched on the
// dtype of `input` (floating types and half) and enqueued on the current CUDA
// stream of the device holding `input`.
void ROIAlignRotatedForwardCUDAKernelLauncher(
    const at::Tensor input, const at::Tensor rois, const float spatial_scale,
    const int sampling_ratio, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, at::Tensor output) {
  // Work size handed to the kernel as its first argument.
  const int output_size = num_rois * pooled_height * pooled_width * channels;

  // Select the tensor's device and launch on PyTorch's current stream, as the
  // other launchers in this directory do. Launching without a stream argument
  // would use the default stream and is not ordered with work queued on a
  // non-default current stream.
  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "ROIAlignRotatedLaucherForward", ([&] {
        const scalar_t *bottom_data = input.data_ptr<scalar_t>();
        const scalar_t *rois_data = rois.data_ptr<scalar_t>();
        scalar_t *top_data = output.data_ptr<scalar_t>();

        roi_align_rotated_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, bottom_data, rois_data, scalar_t(spatial_scale),
                sampling_ratio, aligned, clockwise, channels, height, width,
                pooled_height, pooled_width, top_data);
      }));

  AT_CUDA_CHECK(cudaGetLastError());
}

// Backward launcher for roi_align_rotated_backward_cuda_kernel.
//
// Reads `top_grad` and `rois`, writes `bottom_grad`. The kernel is dispatched
// on the dtype of `top_grad` (floating types and half) and enqueued on the
// current CUDA stream of the device holding `top_grad`.
void ROIAlignRotatedBackwardCUDAKernelLauncher(
    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
    const int sampling_ratio, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, at::Tensor bottom_grad) {
  // Work size handed to the kernel as its first argument.
  const int output_size = num_rois * pooled_height * pooled_width * channels;

  // Select the tensor's device and launch on PyTorch's current stream, as the
  // other launchers in this directory do. Launching without a stream argument
  // would use the default stream and is not ordered with work queued on a
  // non-default current stream.
  at::cuda::CUDAGuard device_guard(top_grad.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "ROIAlignLaucherBackward", ([&] {
        const scalar_t *top_diff = top_grad.data_ptr<scalar_t>();
        const scalar_t *rois_data = rois.data_ptr<scalar_t>();
        scalar_t *bottom_diff = bottom_grad.data_ptr<scalar_t>();
        roi_align_rotated_backward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, top_diff, rois_data, spatial_scale, sampling_ratio,
                aligned, clockwise, channels, height, width, pooled_height,
                pooled_width, bottom_diff);
      }));

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/roi_pool_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cuda_helper.hpp"
#include "roi_pool_cuda_kernel.cuh"

// Forward launcher for roi_pool_forward_cuda_kernel.
//
// Reads `input` and `rois`; writes `output` and the int tensor `argmax`.
// channels/height/width are taken from dimensions 1, 2 and 3 of `input`, and
// the kernel work size is the number of elements in `output`.
void ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                      Tensor argmax, int pooled_height,
                                      int pooled_width, float spatial_scale) {
  const int output_size = output.numel();
  const int channels = input.size(1);
  const int height = input.size(2);
  const int width = input.size(3);

  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  const int grid_size = GET_BLOCKS(output_size);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "roi_pool_forward_cuda_kernel", [&] {
        auto *input_ptr = input.data_ptr<scalar_t>();
        auto *rois_ptr = rois.data_ptr<scalar_t>();
        auto *output_ptr = output.data_ptr<scalar_t>();
        // argmax is always accessed as int, independent of the dispatched
        // floating type.
        int *argmax_ptr = argmax.data_ptr<int>();
        const auto scale = static_cast<scalar_t>(spatial_scale);

        roi_pool_forward_cuda_kernel<scalar_t>
            <<<grid_size, THREADS_PER_BLOCK, 0, stream>>>(
                output_size, input_ptr, rois_ptr, output_ptr, argmax_ptr,
                pooled_height, pooled_width, scale, channels, height, width);
      });

  AT_CUDA_CHECK(cudaGetLastError());
}

// Backward launcher for roi_pool_backward_cuda_kernel.
//
// Reads `grad_output`, `rois` and the int tensor `argmax`; writes
// `grad_input`. channels/height/width are taken from dimensions 1, 2 and 3 of
// `grad_input`, and the kernel work size is the number of elements in
// `grad_output`. `spatial_scale` is accepted but not forwarded to the kernel.
void ROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
                                       Tensor argmax, Tensor grad_input,
                                       int pooled_height, int pooled_width,
                                       float spatial_scale) {
  const int output_size = grad_output.numel();
  const int channels = grad_input.size(1);
  const int height = grad_input.size(2);
  const int width = grad_input.size(3);

  at::cuda::CUDAGuard device_guard(grad_output.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  const int grid_size = GET_BLOCKS(output_size);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "roi_pool_backward_cuda_kernel", [&] {
        auto *grad_out_ptr = grad_output.data_ptr<scalar_t>();
        auto *rois_ptr = rois.data_ptr<scalar_t>();
        // argmax is always accessed as int, independent of the dispatched
        // floating type.
        int *argmax_ptr = argmax.data_ptr<int>();
        auto *grad_in_ptr = grad_input.data_ptr<scalar_t>();

        roi_pool_backward_cuda_kernel<scalar_t>
            <<<grid_size, THREADS_PER_BLOCK, 0, stream>>>(
                output_size, grad_out_ptr, rois_ptr, argmax_ptr, grad_in_ptr,
                pooled_height, pooled_width, channels, height, width);
      });

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/roiaware_pool3d_cuda.cu
================================================
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.

#include <stdio.h>

#include "pytorch_cuda_helper.hpp"
#include "roiaware_pool3d_cuda_kernel.cuh"

// Forward launcher for RoI-aware 3D pooling. Runs three kernels in sequence on
// the current CUDA stream: generate_pts_mask_for_box3d,
// collect_inside_pts_for_box3d, then either roiaware_maxpool3d or
// roiaware_avgpool3d.
//
// Parameters (shapes as documented by the original implementation):
//   rois:              (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
//                      coordinate
//   pts:               (npoints, 3) [x, y, z] in LiDAR coordinate
//   pts_feature:       (npoints, C)
//   argmax:            (N, out_x, out_y, out_z, C)
//   pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
//   pooled_features:   (N, out_x, out_y, out_z, C)
//   pool_method:       0: max_pool, 1: avg_pool; any other value launches no
//                      pooling kernel
void RoiawarePool3dForwardCUDAKernelLauncher(
    int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,
    int out_y, int out_z, const Tensor rois, const Tensor pts,
    const Tensor pts_feature, Tensor argmax, Tensor pts_idx_of_voxels,
    Tensor pooled_features, int pool_method) {
  at::cuda::CUDAGuard device_guard(pts_feature.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // Every launch below uses the same block size.
  dim3 threads(THREADS_PER_BLOCK);

  // Step 1: int scratch tensor of shape (boxes_num, pts_num), initialised to
  // -1 and then filled by generate_pts_mask_for_box3d.
  Tensor pts_mask =
      at::ones({boxes_num, pts_num}, pts_feature.options().dtype(at::kInt))
          .neg();

  dim3 mask_grid(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), boxes_num);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      rois.scalar_type(), "generate_pts_mask_for_box3d", [&] {
        generate_pts_mask_for_box3d<scalar_t>
            <<<mask_grid, threads, 0, stream>>>(
                boxes_num, pts_num, out_x, out_y, out_z,
                rois.data_ptr<scalar_t>(), pts.data_ptr<scalar_t>(),
                pts_mask.data_ptr<int>());
      });

  AT_CUDA_CHECK(cudaGetLastError());

  // TODO: Merge the collect and pool functions, SS

  // Step 2: collect_inside_pts_for_box3d reads the mask and writes
  // pts_idx_of_voxels.
  dim3 collect_grid(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK));

  AT_DISPATCH_INTEGRAL_TYPES(
      pts_idx_of_voxels.scalar_type(), "collect_inside_pts_for_box3d", [&] {
        collect_inside_pts_for_box3d<scalar_t>
            <<<collect_grid, threads, 0, stream>>>(
                boxes_num, pts_num, max_pts_each_voxel, out_x, out_y, out_z,
                pts_mask.data_ptr<int>(),
                pts_idx_of_voxels.data_ptr<scalar_t>());
      });

  AT_CUDA_CHECK(cudaGetLastError());

  // Step 3: pool the features with the kernel selected by pool_method.
  dim3 pool_grid(GET_BLOCKS(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
                 boxes_num);

  switch (pool_method) {
    case 0:
      AT_DISPATCH_FLOATING_TYPES_AND_HALF(
          pts_feature.scalar_type(), "roiaware_maxpool3d", [&] {
            roiaware_maxpool3d<scalar_t><<<pool_grid, threads, 0, stream>>>(
                boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y,
                out_z, pts_feature.data_ptr<scalar_t>(),
                pts_idx_of_voxels.data_ptr<int>(),
                pooled_features.data_ptr<scalar_t>(), argmax.data_ptr<int>());
          });
      break;
    case 1:
      AT_DISPATCH_FLOATING_TYPES_AND_HALF(
          pts_feature.scalar_type(), "roiaware_avgpool3d", [&] {
            roiaware_avgpool3d<scalar_t><<<pool_grid, threads, 0, stream>>>(
                boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y,
                out_z, pts_feature.data_ptr<scalar_t>(),
                pts_idx_of_voxels.data_ptr<int>(),
                pooled_features.data_ptr<scalar_t>());
          });
      break;
    default:
      // Unknown pool_method: no pooling kernel is launched.
      break;
  }

  AT_CUDA_CHECK(cudaGetLastError());
}

// Backward launcher for RoI-aware 3D pooling. Launches
// roiaware_maxpool3d_backward or roiaware_avgpool3d_backward on the current
// CUDA stream of the device holding `grad_out`.
//
// Parameters (shapes as documented by the original implementation):
//   pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
//   argmax:            (N, out_x, out_y, out_z, C)
//   grad_out:          (N, out_x, out_y, out_z, C)
//   grad_in:           (npoints, C), return value
//   pool_method:       0: max_pool, 1: avg_pool; any other value launches no
//                      kernel
void RoiawarePool3dBackwardCUDAKernelLauncher(
    int boxes_num, int out_x, int out_y, int out_z, int channels,
    int max_pts_each_voxel, const Tensor pts_idx_of_voxels, const Tensor argmax,
    const Tensor grad_out, Tensor grad_in, int pool_method) {
  at::cuda::CUDAGuard device_guard(grad_out.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  dim3 threads(THREADS_PER_BLOCK);
  dim3 grid(GET_BLOCKS(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
            boxes_num);

  switch (pool_method) {
    case 0:
      // The max-pool gradient kernel is given argmax, not pts_idx_of_voxels.
      AT_DISPATCH_FLOATING_TYPES_AND_HALF(
          grad_in.scalar_type(), "roiaware_maxpool3d_backward", [&] {
            roiaware_maxpool3d_backward<scalar_t><<<grid, threads, 0, stream>>>(
                boxes_num, channels, out_x, out_y, out_z,
                argmax.data_ptr<int>(), grad_out.data_ptr<scalar_t>(),
                grad_in.data_ptr<scalar_t>());
          });
      break;
    case 1:
      // The avg-pool gradient kernel is given pts_idx_of_voxels, not argmax.
      AT_DISPATCH_FLOATING_TYPES_AND_HALF(
          grad_in.scalar_type(), "roiaware_avgpool3d_backward", [&] {
            roiaware_avgpool3d_backward<scalar_t><<<grid, threads, 0, stream>>>(
                boxes_num, channels, out_x, out_y, out_z, max_pts_each_voxel,
                pts_idx_of_voxels.data_ptr<int>(),
                grad_out.data_ptr<scalar_t>(), grad_in.data_ptr<scalar_t>());
          });
      break;
    default:
      // Unknown pool_method: no kernel is launched.
      break;
  }

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/roipoint_pool3d_cuda.cu
================================================
/*
Modified from
https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu
Point cloud feature pooling
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/

#include <math.h>
#include <stdio.h>

#include "pytorch_cuda_helper.hpp"
#include "roipoint_pool3d_cuda_kernel.cuh"

// Forward launcher for RoI point pooling. Runs three kernels in sequence on
// the current CUDA stream of the device holding `xyz`:
//   1. assign_pts_to_box3d     - fills the int scratch tensor pts_assign of
//                                shape (batch_size, pts_num, boxes_num)
//   2. get_pooled_idx          - fills the int scratch tensor pts_idx of shape
//                                (batch_size, boxes_num, sampled_pts_num) and
//                                pooled_empty_flag
//   3. roipoint_pool3d_forward - writes pooled_features
// Each launch is followed by an error check so that a failure is reported at
// the launch that caused it.
void RoIPointPool3dForwardCUDAKernelLauncher(
    int batch_size, int pts_num, int boxes_num, int feature_in_len,
    int sampled_pts_num, const Tensor xyz, const Tensor boxes3d,
    const Tensor pts_feature, Tensor pooled_features,
    Tensor pooled_empty_flag) {
  at::cuda::CUDAGuard device_guard(xyz.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // Scratch buffers are allocated uninitialised with boxes3d's options and an
  // int dtype.
  Tensor pts_assign = at::empty({batch_size, pts_num, boxes_num},
                                boxes3d.options().dtype(at::kInt));

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      xyz.scalar_type(), "assign_pts_to_box3d", [&] {
        assign_pts_to_box3d<scalar_t><<<blocks, threads, 0, stream>>>(
            batch_size, pts_num, boxes_num, xyz.data_ptr<scalar_t>(),
            boxes3d.data_ptr<scalar_t>(), pts_assign.data_ptr<int>());
      });

  AT_CUDA_CHECK(cudaGetLastError());

  Tensor pts_idx = at::empty({batch_size, boxes_num, sampled_pts_num},
                             boxes3d.options().dtype(at::kInt));

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks2(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK), batch_size);

  // This kernel is not templated: it only touches int buffers.
  get_pooled_idx<<<blocks2, threads, 0, stream>>>(
      batch_size, pts_num, boxes_num, sampled_pts_num,
      pts_assign.data_ptr<int>(), pts_idx.data_ptr<int>(),
      pooled_empty_flag.data_ptr<int>());

  AT_CUDA_CHECK(cudaGetLastError());

  dim3 blocks_pool(GET_BLOCKS(sampled_pts_num, THREADS_PER_BLOCK), boxes_num,
                   batch_size);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      xyz.scalar_type(), "roipoint_pool3d_forward", [&] {
        roipoint_pool3d_forward<scalar_t><<<blocks_pool, threads, 0, stream>>>(
            batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
            xyz.data_ptr<scalar_t>(), pts_idx.data_ptr<int>(),
            pts_feature.data_ptr<scalar_t>(),
            pooled_features.data_ptr<scalar_t>(),
            pooled_empty_flag.data_ptr<int>());
      });

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/rotated_feature_align_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu
#include "pytorch_cuda_helper.hpp"
#include "rotated_feature_align_cuda_kernel.cuh"

// Forward launcher for rotated_feature_align_forward_kernel.
//
// Reads `features` and `best_bboxes`, writes `output`. The kernel work size is
// the number of elements in `features`; sizes of dimensions 1, 2 and 3 of
// `features` are forwarded to the kernel.
void RotatedFeatureAlignForwardCUDAKernelLauncher(const Tensor features,
                                                  const Tensor best_bboxes,
                                                  const float spatial_scale,
                                                  const int points,
                                                  Tensor output) {
  at::cuda::CUDAGuard device_guard(features.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  const int output_size = features.numel();
  const int grid_size = GET_BLOCKS(output_size);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      features.scalar_type(), "rotated_feature_align_forward_cuda_kernel",
      ([&] {
        const scalar_t* feature_ptr = features.data_ptr<scalar_t>();
        const scalar_t* bbox_ptr = best_bboxes.data_ptr<scalar_t>();
        scalar_t* output_ptr = output.data_ptr<scalar_t>();
        // The scale is converted to the dispatched scalar type up front.
        const scalar_t scale = scalar_t(spatial_scale);

        rotated_feature_align_forward_kernel<scalar_t>
            <<<grid_size, THREADS_PER_BLOCK, 0, stream>>>(
                output_size, points, feature_ptr, bbox_ptr, scale,
                features.size(1), features.size(2), features.size(3),
                output_ptr);
      }));

  AT_CUDA_CHECK(cudaGetLastError());
}

// Backward launcher for rotated_feature_align_backward_kernel.
//
// Reads `top_grad` and `best_bboxes`, writes `bottom_grad`. The kernel work
// size is the number of elements in `top_grad`; sizes of dimensions 1, 2 and 3
// of `top_grad` are forwarded to the kernel.
void RotatedFeatureAlignBackwardCUDAKernelLauncher(const Tensor top_grad,
                                                   const Tensor best_bboxes,
                                                   const float spatial_scale,
                                                   const int points,
                                                   Tensor bottom_grad) {
  at::cuda::CUDAGuard device_guard(top_grad.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  const int output_size = top_grad.numel();
  const int grid_size = GET_BLOCKS(output_size);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "rotated_feature_align_backward_cuda_kernel",
      ([&] {
        const scalar_t* grad_out_ptr = top_grad.data_ptr<scalar_t>();
        const scalar_t* bbox_ptr = best_bboxes.data_ptr<scalar_t>();
        scalar_t* grad_in_ptr = bottom_grad.data_ptr<scalar_t>();
        // The scale is converted to the dispatched scalar type up front.
        const scalar_t scale = scalar_t(spatial_scale);

        rotated_feature_align_backward_kernel<scalar_t>
            <<<grid_size, THREADS_PER_BLOCK, 0, stream>>>(
                output_size, points, grad_out_ptr, bbox_ptr, scale,
                top_grad.size(1), top_grad.size(2), top_grad.size(3),
                grad_in_ptr);
      }));

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/scatter_points_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved.
#include <stdio.h>
#include <stdlib.h>
#include <torch/types.h>

#include "pytorch_cuda_helper.hpp"
#include "scatter_points_cuda_kernel.cuh"

// Reduces per-point features into per-voxel features.
//
// Rows of `coors` that contain any negative value are treated as invalid:
// they are collapsed to all -1 and the resulting group is dropped from the
// outputs. Returns {reduced_feats, out_coors, coors_map, reduce_count}, where
// coors_map and reduce_count are converted to int32. When `feats` has no
// rows, clones of the inputs and two empty int32 tensors are returned instead.
std::vector<at::Tensor> DynamicPointToVoxelForwardCUDAKernelLauncher(
    const at::Tensor &feats, const at::Tensor &coors,
    const reduce_t reduce_type) {
  const int num_input = feats.size(0);
  const int num_feats = feats.size(1);

  if (num_input == 0) {
    return {feats.clone().detach(), coors.clone().detach(),
            coors.new_empty({0}, torch::kInt32),
            coors.new_empty({0}, torch::kInt32)};
  }

  // Overwrite every row that has a negative entry with -1 in all columns so
  // that all such rows compare equal in the unique pass below.
  const auto row_has_negative = coors.lt(0).any(-1, true);
  const auto coors_clean = coors.masked_fill(row_has_negative, -1);

  // Unique rows along dim 0; the three results are bound to out_coors,
  // coors_map and reduce_count in that order.
  auto unique_result = at::unique_dim(coors_clean, 0, true, true, true);
  at::Tensor out_coors = std::get<0>(unique_result);
  at::Tensor coors_map = std::get<1>(unique_result);
  at::Tensor reduce_count = std::get<2>(unique_result);

  const bool first_row_invalid = out_coors[0][0].lt(0).item<bool>();
  if (first_row_invalid) {
    // The first row of out_coors is the all -1 group: drop it and shift the
    // map down by one, so points of that group end up mapped to -1.
    out_coors = out_coors.slice(0, 1);
    reduce_count = reduce_count.slice(0, 1);
    coors_map = coors_map - 1;
  }

  coors_map = coors_map.to(torch::kInt32);
  reduce_count = reduce_count.to(torch::kInt32);

  auto reduced_feats =
      at::empty({out_coors.size(0), num_feats}, feats.options());

  at::cuda::CUDAGuard device_guard(feats.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // The launch shape does not depend on the dispatched scalar type.
  dim3 blocks(std::min(at::cuda::ATenCeilDiv(num_input, THREADS_PER_BLOCK),
                       maxGridDim));
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES(
      feats.scalar_type(), "feats_reduce_kernel", ([&] {
        // Initial value of the accumulator: -inf for MAX, 0 otherwise.
        if (reduce_type == reduce_t::MAX) {
          reduced_feats.fill_(-std::numeric_limits<scalar_t>::infinity());
        } else {
          reduced_feats.fill_(static_cast<scalar_t>(0));
        }

        feats_reduce_kernel<<<blocks, threads, 0, stream>>>(
            feats.data_ptr<scalar_t>(), coors_map.data_ptr<int32_t>(),
            reduced_feats.data_ptr<scalar_t>(), num_input, num_feats,
            reduce_type);

        // For MEAN the kernel output is divided by the per-voxel point count.
        if (reduce_type == reduce_t::MEAN) {
          reduced_feats /= reduce_count.unsqueeze(-1).to(reduced_feats.dtype());
        }
      }));

  AT_CUDA_CHECK(cudaGetLastError());

  return {reduced_feats, out_coors, coors_map, reduce_count};
}

// Propagates voxel gradients (`grad_reduced_feats`) back to point gradients
// (`grad_feats`).
//
// `grad_feats` is always zero-filled first. If there are no input points or no
// reduced voxels, nothing else happens. For MEAN and SUM a single kernel
// (add_reduce_traceback_grad_kernel) is launched. For any other reduce type
// two kernels run: max_reduce_traceback_scatter_idx_kernel fills the int32
// scratch tensor `reduce_from`, then max_reduce_scatter_grad_kernel uses it to
// write `grad_feats`.
void DynamicPointToVoxelBackwardCUDAKernelLauncher(
    at::Tensor &grad_feats, const at::Tensor &grad_reduced_feats,
    const at::Tensor &feats, const at::Tensor &reduced_feats,
    const at::Tensor &coors_map, const at::Tensor &reduce_count,
    const reduce_t reduce_type) {
  const int num_input = feats.size(0);
  const int num_reduced = reduced_feats.size(0);
  const int num_feats = feats.size(1);

  grad_feats.fill_(0);
  // copy voxel grad to points

  if (num_input == 0 || num_reduced == 0) return;
  at::cuda::CUDAGuard device_guard(feats.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  if (reduce_type == reduce_t::MEAN || reduce_type == reduce_t::SUM) {
    AT_DISPATCH_FLOATING_TYPES(
        grad_reduced_feats.scalar_type(), "add_reduce_traceback_grad_kernel",
        ([&] {
          dim3 blocks(std::min(
              at::cuda::ATenCeilDiv(num_input, THREADS_PER_BLOCK), maxGridDim));
          dim3 threads(THREADS_PER_BLOCK);
          add_reduce_traceback_grad_kernel<<<blocks, threads, 0, stream>>>(
              grad_feats.data_ptr<scalar_t>(),
              grad_reduced_feats.data_ptr<scalar_t>(),
              coors_map.data_ptr<int32_t>(), reduce_count.data_ptr<int32_t>(),
              num_input, num_feats, reduce_type);
        }));

    AT_CUDA_CHECK(cudaGetLastError());
  } else {
    // Scratch tensor of shape (num_reduced, num_feats), initialised to
    // num_input before the first kernel writes into it.
    auto reduce_from = at::full({num_reduced, num_feats}, num_input,
                                coors_map.options().dtype(torch::kInt32));
    AT_DISPATCH_FLOATING_TYPES(
        grad_reduced_feats.scalar_type(),
        "max_reduce_traceback_scatter_idx_kernel", ([&] {
          dim3 blocks(std::min(
              at::cuda::ATenCeilDiv(num_input, THREADS_PER_BLOCK), maxGridDim));
          dim3 threads(THREADS_PER_BLOCK);
          max_reduce_traceback_scatter_idx_kernel<<<blocks, threads, 0,
                                                    stream>>>(
              feats.data_ptr<scalar_t>(), reduced_feats.data_ptr<scalar_t>(),
              reduce_from.data_ptr<int32_t>(), coors_map.data_ptr<int32_t>(),
              num_input, num_feats);
        }));

    AT_CUDA_CHECK(cudaGetLastError());

    // The dispatch name matches the kernel launched here so that an
    // unsupported-dtype error points at the right kernel.
    AT_DISPATCH_FLOATING_TYPES(
        grad_reduced_feats.scalar_type(), "max_reduce_scatter_grad_kernel",
        ([&] {
          // This pass is sized by the number of reduced voxels, not by the
          // number of input points.
          dim3 blocks(
              std::min(at::cuda::ATenCeilDiv(num_reduced, THREADS_PER_BLOCK),
                       maxGridDim));
          dim3 threads(THREADS_PER_BLOCK);
          max_reduce_scatter_grad_kernel<<<blocks, threads, 0, stream>>>(
              grad_feats.data_ptr<scalar_t>(),
              grad_reduced_feats.data_ptr<scalar_t>(),
              reduce_from.data_ptr<int32_t>(), num_reduced, num_feats);
        }));

    AT_CUDA_CHECK(cudaGetLastError());
  }
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/sparse_indice.cu
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <ATen/ATen.h>
// clang-format off
// TODO: make spconv_utils.h order agnostic
#include "../spconv_utils.h"
// clang-format on
#include <utils/spconv/spconv/indice.h>
#include <utils/spconv/spconv/mp_helper.h>
#include <utils/spconv/tensorview/helper_launch.h>
#include <utils/spconv/tensorview/tensorview.h>

#include <chrono>
#include <limits>
#include <spconv/indice.cuh>
#include <type_traits>

#include "pytorch_cuda_helper.hpp"

namespace functor {
// GPU (tv::TorchGPU) specialization of the first indice-pair pass.
//
// Launches prepareDeConvIndicePairsKernel when `transpose` is true and
// prepareIndicePairsKernel otherwise, on the stream provided by `d`.
// Returns 0 without launching anything when indicesIn has no rows, and 1
// after a launch.
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1<tv::TorchGPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::TorchGPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose) {
    // Number of rows in indicesIn; both kernels are sized by it.
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0) return 0;
    if (transpose) {
      prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096>
          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
                              indiceNum, indicePairUnique, kernelSize, stride,
                              padding, dilation, outSpatialShape);
    } else {
      prepareIndicePairsKernel<Index, IndexGrid, NDim, 4096>
          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
                              indiceNum, indicePairUnique, kernelSize, stride,
                              padding, dilation, outSpatialShape);
    }
    TV_CHECK_CUDA_ERR();
    return 1;
  }
};

// GPU (tv::TorchGPU) specialization of the second indice-pair pass.
//
// Launches assignGridAndIndiceOutKernel and assignIndicePairsKernel on the
// stream provided by `d`, followed by resetGridKernel when `resetGrid` is
// true. Returns 0 without launching anything when indicesIn has no rows,
// otherwise returns numAct (the first dimension of indicePairUnique minus 1).
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2<tv::TorchGPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::TorchGPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    Index batchSize = gridsOut.dim(0);
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0) return 0;
    // NOTE(review): presumably the last entry of indicePairUnique is a
    // sentinel rather than a real output location - confirm with the caller.
    Index numAct = indicePairUnique.dim(0) - 1;
    // First kernel is sized by the number of unique entries.
    assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
           d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
                            indicePairUnique, outSpatialShape, batchSize);
    TV_CHECK_CUDA_ERR();
    // Second kernel is sized by the number of rows in indicesIn.
    assignIndicePairsKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
           d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
                            indicePairUnique, outSpatialShape);
    TV_CHECK_CUDA_ERR();

    if (resetGrid) {
      resetGridKernel<Index, IndexGrid, NDim>
          <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);
      TV_CHECK_CUDA_ERR();
    }
    return numAct;
  }
};

// GPU specialization that builds indice pairs for submanifold sparse
// convolution.
//
// Launch sequence (all on the stream returned by `d.getStream()`, each sized
// by the number of rows in `indicesIn`):
//   1. prepareSubMGridKernel fills `gridsOut` from `indicesIn`.
//   2. getSubMIndicePairsKernel writes `indicePairs` / `indiceNum`.
//   3. resetGridSubMKernel, only when `resetGrid` is true.
//
// Returns 0 without launching anything when `indicesIn` has no rows,
// otherwise the number of rows in `indicesIn`. `transpose` is not read here.
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::TorchGPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::TorchGPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    const auto activeInputs = indicesIn.dim(0);
    if (activeInputs == 0) {
      return 0;
    }

    // Every launch below shares the same grid size, block size and stream.
    const auto gridSize = tv::launch::getBlocks(activeInputs);
    const auto blockSize = tv::launch::CUDA_NUM_THREADS;
    const auto stream = d.getStream();

    prepareSubMGridKernel<Index, IndexGrid, NDim>
        <<<gridSize, blockSize, 0, stream>>>(indicesIn, gridsOut,
                                             outSpatialShape);
    TV_CHECK_CUDA_ERR();

    getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
        <<<gridSize, blockSize, 0, stream>>>(indicesIn, gridsOut, indicePairs,
                                             indiceNum, kernelSize, stride,
                                             padding, dilation,
                                             outSpatialShape);
    TV_CHECK_CUDA_ERR();

    if (resetGrid) {
      resetGridSubMKernel<Index, IndexGrid, NDim>
          <<<gridSize, blockSize, 0, stream>>>(indicesIn.data(), gridsOut,
                                               outSpatialShape, activeInputs);
      TV_CHECK_CUDA_ERR();
    }
    return activeInputs;
  }
};
}  // namespace functor

// Explicitly instantiates the four TorchGPU indice-pair functors for one
// (Index, NDIM) combination, with `int` as the grid index type.
#define DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM)                             \
  template struct functor::CreateConvIndicePairFunctor<tv::TorchGPU, Index,   \
                                                       int, NDIM>;            \
  template struct functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, Index, \
                                                         int, NDIM>;          \
  template struct functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, Index, \
                                                         int, NDIM>;          \
  template struct functor::CreateSubMIndicePairFunctor<tv::TorchGPU, Index,   \
                                                       int, NDIM>;

// Instantiates the functors above for 1 through 4 spatial dimensions.
#define DECLARE_GPU_INDEX(Index)          \
  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 1); \
  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 2); \
  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 3); \
  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 4);

// Only `int` indices are instantiated for the GPU backend.
DECLARE_GPU_INDEX(int);

// The helper macros are local to this translation unit.
#undef DECLARE_GPU_INDEX
#undef DECLARE_GPU_SPECS_INDEX_NDIM


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/sparse_maxpool.cu
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <ATen/ATen.h>
// clang-format off
// TODO: make spconv_utils.h order agnostic
#include "../spconv_utils.h"
// clang-format on
#include <utils/spconv/spconv/maxpool.h>
#include <utils/spconv/spconv/mp_helper.h>
#include <utils/spconv/tensorview/helper_launch.h>
#include <utils/spconv/tensorview/tensorview.h>

#include <chrono>
#include <limits>
#include <type_traits>
#include <utils/spconv/tensorview/helper_kernel.cuh>

#include "pytorch_cuda_helper.hpp"

// Forward max-pool kernel: for every (input row, output row) pair, raises the
// output feature to the input feature whenever the input is larger.
//
// blockIdx.y shifts both feature pointers by NumTLP columns; threadIdx.x
// selects the column inside that slice. Each thread handles NumILP pairs per
// outer iteration, spaced blockDim.y apart. No `< numHot` guard is applied to
// the per-thread pair index inside the outer loop.
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdBlockKernel(scalar_t *outFeatures,
                                      const scalar_t *inFeatures,
                                      const Index *indicesIn,
                                      const Index *indicesOut, int numHot,
                                      int numPlanes) {
  // Pair offsets (relative to the outer loop base) handled by this thread.
  int pairOffset[NumILP];
#pragma unroll
  for (int step = 0; step < NumILP; ++step) {
    pairOffset[step] = threadIdx.y + step * blockDim.y;
  }

  const auto columnShift = blockIdx.y * NumTLP;
  outFeatures += columnShift;
  inFeatures += columnShift;

  for (int base = blockIdx.x * blockDim.x; base < numHot;
       base += blockDim.x * gridDim.x) {
#pragma unroll
    for (int step = 0; step < NumILP; ++step) {
      const int pair = base + pairOffset[step];
      const Index srcIdx = indicesIn[pair] * numPlanes + threadIdx.x;
      const Index dstIdx = indicesOut[pair] * numPlanes + threadIdx.x;
      const scalar_t candidate = inFeatures[srcIdx];
      const scalar_t current = outFeatures[dstIdx];
      if (candidate > current) {
        outFeatures[dstIdx] = candidate;
      }
    }
  }
}

// Forward max-pool kernel without vector loads: pairs are walked with
// tv::KernelLoopX and feature columns with tv::KernelLoopY.
//
// Each thread caches the row offsets of NumILP pairs, spaced
// gridDim.x * blockDim.x apart, then sweeps its columns for all of them.
// Unlike maxPoolFwdGenericKernel, the strided pair index is not checked
// against numHot here.
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericBlockKernel(scalar_t *outFeatures,
                                             const scalar_t *inFeatures,
                                             const Index *indicesIn,
                                             const Index *indicesOut,
                                             int numHot, int numPlanes) {
  int pairStride[NumILP];
  Index srcRow[NumILP];
  Index dstRow[NumILP];
#pragma unroll
  for (int step = 0; step < NumILP; ++step) {
    pairStride[step] = step * gridDim.x * blockDim.x;
  }

  for (int base : tv::KernelLoopX<int, NumILP>(numHot)) {
    // Resolve the flattened row starts for every pair owned by this thread.
#pragma unroll
    for (int step = 0; step < NumILP; ++step) {
      const int pair = base + pairStride[step];
      srcRow[step] = indicesIn[pair] * numPlanes;
      dstRow[step] = indicesOut[pair] * numPlanes;
    }
    for (int plane : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int step = 0; step < NumILP; ++step) {
        const scalar_t candidate = inFeatures[srcRow[step] + plane];
        const scalar_t current = outFeatures[dstRow[step] + plane];
        if (candidate > current) {
          outFeatures[dstRow[step] + plane] = candidate;
        }
      }
    }
  }
}

// Forward max-pool kernel that moves features in VecType-sized chunks.
//
// `numPlanes` is used as the row stride when indexing the feature buffers as
// VecType arrays, so the caller is expected to pass the row length measured
// in VecType elements. blockIdx.y shifts both feature pointers by NumTLP
// scalars; threadIdx.x selects the vector inside that slice.
template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType>
__global__ void maxPoolFwdVecBlockKernel(scalar_t *outFeatures,
                                         const scalar_t *inFeatures,
                                         const Index *indicesIn,
                                         const Index *indicesOut, int numHot,
                                         int numPlanes) {
  // Number of scalars carried by one VecType.
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);
  int pairOffset[NumILP];
  scalar_t inVals[vecloadFactor];
  scalar_t outVals[vecloadFactor];
#pragma unroll
  for (int step = 0; step < NumILP; ++step) {
    pairOffset[step] = threadIdx.y + step * blockDim.y;
  }

  outFeatures += blockIdx.y * NumTLP;
  inFeatures += blockIdx.y * NumTLP;
  VecType *outVec = reinterpret_cast<VecType *>(outFeatures);
  const VecType *inVec = reinterpret_cast<const VecType *>(inFeatures);

  for (int base = blockIdx.x * blockDim.x * vecloadFactor; base < numHot;
       base += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
    for (int step = 0; step < NumILP; ++step) {
      const int pair = base + pairOffset[step];
      const Index srcIdx = indicesIn[pair] * numPlanes + threadIdx.x;
      const Index dstIdx = indicesOut[pair] * numPlanes + threadIdx.x;
      // One wide load per buffer, then an element-wise max in registers.
      reinterpret_cast<VecType *>(outVals)[0] = outVec[dstIdx];
      reinterpret_cast<VecType *>(inVals)[0] = inVec[srcIdx];
#pragma unroll
      for (int lane = 0; lane < vecloadFactor; ++lane) {
        if (inVals[lane] > outVals[lane]) {
          outVals[lane] = inVals[lane];
        }
      }
      outVec[dstIdx] = reinterpret_cast<VecType *>(outVals)[0];
    }
  }
}

// Bounds-checked forward max-pool kernel.
//
// Same traversal as maxPoolFwdGenericBlockKernel (tv::KernelLoopX over pairs,
// tv::KernelLoopY over columns, NumILP pairs per thread), but every strided
// pair index is compared against numHot before it is used, so numHot need not
// be a multiple of the launch width.
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericKernel(scalar_t *outFeatures,
                                        const scalar_t *inFeatures,
                                        const Index *indicesIn,
                                        const Index *indicesOut, int numHot,
                                        int numPlanes) {
  int pairStride[NumILP];
  Index srcRow[NumILP];
  Index dstRow[NumILP];
#pragma unroll
  for (int step = 0; step < NumILP; ++step) {
    pairStride[step] = step * gridDim.x * blockDim.x;
  }

  for (int base : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
    for (int step = 0; step < NumILP; ++step) {
      const int pair = base + pairStride[step];
      if (pair < numHot) {
        srcRow[step] = indicesIn[pair] * numPlanes;
        dstRow[step] = indicesOut[pair] * numPlanes;
      }
    }
    for (int plane : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int step = 0; step < NumILP; ++step) {
        // Row offsets are only valid for in-range pairs.
        if (base + pairStride[step] < numHot) {
          const scalar_t candidate = inFeatures[srcRow[step] + plane];
          const scalar_t current = outFeatures[dstRow[step] + plane];
          if (candidate > current) {
            outFeatures[dstRow[step] + plane] = candidate;
          }
        }
      }
    }
  }
}

// Backward max-pool kernel: for every (input row, output row) pair, adds the
// output gradient `fout` into the input gradient `fin` wherever the input
// feature equals the pooled output feature.
//
// blockIdx.y shifts all four buffers by NumTLP columns; threadIdx.x selects
// the column inside that slice. Each thread handles NumILP pairs per outer
// iteration, spaced blockDim.y apart, with no `< numHot` guard on the
// per-thread pair index.
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdBlockKernel(const scalar_t *outFeatures,
                                      const scalar_t *inFeatures,
                                      const scalar_t *fout, scalar_t *fin,
                                      const Index *indicesIn,
                                      const Index *indicesOut, int numHot,
                                      int numPlanes) {
  int pairOffset[NumILP];
#pragma unroll
  for (int step = 0; step < NumILP; ++step) {
    pairOffset[step] = threadIdx.y + step * blockDim.y;
  }

  const auto columnShift = blockIdx.y * NumTLP;
  outFeatures += columnShift;
  inFeatures += columnShift;
  fout += columnShift;
  fin += columnShift;

  for (int base = blockIdx.x * blockDim.x; base < numHot;
       base += blockDim.x * gridDim.x) {
#pragma unroll
    for (int step = 0; step < NumILP; ++step) {
      const int pair = base + pairOffset[step];
      const Index srcIdx = indicesIn[pair] * numPlanes + threadIdx.x;
      const Index dstIdx = indicesOut[pair] * numPlanes + threadIdx.x;
      const scalar_t inValue = inFeatures[srcIdx];
      const scalar_t outValue = outFeatures[dstIdx];
      // Gradient flows only to inputs that match the pooled value.
      if (inValue == outValue) {
        fin[srcIdx] += fout[dstIdx];
      }
    }
  }
}

// Backward max-pool kernel without vector loads: pairs are walked with
// tv::KernelLoopX and feature columns with tv::KernelLoopY.
//
// For each pair and column, `fout` is added into `fin` when the input feature
// equals the pooled output feature. Unlike maxPoolBwdGenericKernel, the
// strided pair index is not checked against numHot here.
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericBlockKernel(
    const scalar_t *outFeatures, const scalar_t *inFeatures,
    const scalar_t *fout, scalar_t *fin, const Index *indicesIn,
    const Index *indicesOut, int numHot, int numPlanes) {
  int pairStride[NumILP];
  Index srcRow[NumILP];
  Index dstRow[NumILP];
#pragma unroll
  for (int step = 0; step < NumILP; ++step) {
    pairStride[step] = step * gridDim.x * blockDim.x;
  }

  for (int base : tv::KernelLoopX<int, NumILP>(numHot)) {
    // Resolve the flattened row starts for every pair owned by this thread.
#pragma unroll
    for (int step = 0; step < NumILP; ++step) {
      const int pair = base + pairStride[step];
      srcRow[step] = indicesIn[pair] * numPlanes;
      dstRow[step] = indicesOut[pair] * numPlanes;
    }
    for (int plane : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int step = 0; step < NumILP; ++step) {
        const scalar_t inValue = inFeatures[srcRow[step] + plane];
        const scalar_t outValue = outFeatures[dstRow[step] + plane];
        if (inValue == outValue) {
          fin[srcRow[step] + plane] += fout[dstRow[step] + plane];
        }
      }
    }
  }
}

// Backward max-pool kernel that moves features and gradients in
// VecType-sized chunks.
//
// For every (input row, output row) pair, the output gradient `fout` is added
// into the input gradient `fin` for each element whose input feature equals
// the pooled output feature.
//
// `numPlanes` is used as the row stride when indexing the buffers as VecType
// arrays, so the caller is expected to pass the row length measured in
// VecType elements. blockIdx.y selects a NumTLP-wide column slice and
// threadIdx.x the vector inside it. No `< numHot` guard is applied to the
// per-thread pair index.
template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType>
__global__ void maxPoolBwdVecBlockKernel(const scalar_t *outFeatures,
                                         const scalar_t *inFeatures,
                                         const scalar_t *fout, scalar_t *fin,
                                         const Index *indicesIn,
                                         const Index *indicesOut, int numHot,
                                         int numPlanes) {
  int ILPStrideY[NumILP];
  // Number of scalars carried by one VecType.
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);
  scalar_t bufi[vecloadFactor];
  scalar_t bufo[vecloadFactor];
  scalar_t bufdi[vecloadFactor];
  scalar_t bufdo[vecloadFactor];
  Index idxi, idxo;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
  // Shift ALL four buffers to this block's column slice. The gradient
  // buffers must move together with the feature buffers (as in
  // maxPoolBwdBlockKernel); otherwise blocks with blockIdx.y > 0 would compare
  // features from their own slice but read/accumulate gradients in slice 0.
  outFeatures += blockIdx.y * NumTLP;
  inFeatures += blockIdx.y * NumTLP;
  fout += blockIdx.y * NumTLP;
  fin += blockIdx.y * NumTLP;
  for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
       ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      // One wide load per buffer into local scalar arrays.
      reinterpret_cast<VecType *>(bufo)[0] =
          reinterpret_cast<const VecType *>(outFeatures)[idxo];
      reinterpret_cast<VecType *>(bufi)[0] =
          reinterpret_cast<const VecType *>(inFeatures)[idxi];
      reinterpret_cast<VecType *>(bufdo)[0] =
          reinterpret_cast<const VecType *>(fout)[idxo];
      reinterpret_cast<VecType *>(bufdi)[0] =
          reinterpret_cast<VecType *>(fin)[idxi];

#pragma unroll
      for (int i = 0; i < vecloadFactor; i++) {
        // Gradient flows only to inputs that match the pooled value.
        if (bufi[i] == bufo[i]) {
          bufdi[i] += bufdo[i];
        }
      }
      reinterpret_cast<VecType *>(fin)[idxi] =
          reinterpret_cast<VecType *>(bufdi)[0];
    }
  }
}

// Bounds-checked backward max-pool kernel.
//
// Same traversal as maxPoolBwdGenericBlockKernel (tv::KernelLoopX over pairs,
// tv::KernelLoopY over columns, NumILP pairs per thread), but every strided
// pair index is compared against numHot before it is used. `fout` is added
// into `fin` where the input feature equals the pooled output feature.
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericKernel(const scalar_t *outFeatures,
                                        const scalar_t *inFeatures,
                                        const scalar_t *fout, scalar_t *fin,
                                        const Index *indicesIn,
                                        const Index *indicesOut, int numHot,
                                        int numPlanes) {
  int pairStride[NumILP];
  Index srcRow[NumILP];
  Index dstRow[NumILP];
#pragma unroll
  for (int step = 0; step < NumILP; ++step) {
    pairStride[step] = step * gridDim.x * blockDim.x;
  }

  for (int base : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
    for (int step = 0; step < NumILP; ++step) {
      const int pair = base + pairStride[step];
      if (pair < numHot) {
        srcRow[step] = indicesIn[pair] * numPlanes;
        dstRow[step] = indicesOut[pair] * numPlanes;
      }
    }
    for (int plane : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int step = 0; step < NumILP; ++step) {
        // Row offsets are only valid for in-range pairs.
        if (base + pairStride[step] < numHot) {
          const scalar_t inValue = inFeatures[srcRow[step] + plane];
          const scalar_t outValue = outFeatures[dstRow[step] + plane];
          if (inValue == outValue) {
            fin[srcRow[step] + plane] += fout[dstRow[step] + plane];
          }
        }
      }
    }
  }
}

namespace functor {
// GPU specialization of the sparse max-pool forward pass for one set of
// (input row, output row) index pairs.
//
// Parameters:
//   d           - supplies the CUDA stream for every launch.
//   outFeatures - pooled features, updated in place.
//   inFeatures  - input features; dim(1) is taken as the number of planes.
//   indices     - subview(0) is passed to the kernels as the input indices,
//                 subview(1) as the output indices.
//   size        - number of index pairs; nothing is launched when <= 0.
//
// Kernel selection: the first NumTLP in kernel_block_t that divides the plane
// count gets the vectorised kernel for the leading multiple-of-NumTLP pairs
// plus the bounds-checked generic kernel for the remainder. If none divides
// it, the non-vectorised kernels are used with NumTLP = 64.
template <typename scalar_t, typename Index>
struct SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, Index> {
  // Vector type for wide loads/stores: int2 for at::Half, int4 otherwise.
  using vecload_type_t =
      std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
  // Candidate NumTLP values handed to mp_for_each below.
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> outFeatures,
                  tv::TensorView<const scalar_t> inFeatures,
                  tv::TensorView<const Index> indices, int size) {
    if (size <= 0) return;
    int numPlanes = inFeatures.dim(1);
    // Stays true until one candidate NumTLP has handled the work.
    bool notFound = true;
    // Number of scalars carried by one vecload_type_t.
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);
    mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices,
                                 &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;

      // Largest multiple of NumTLP that does not exceed size.
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (numHotBlock >= NumTLP) {
            // Vectorised path; grid.x is capped at 512 blocks and the plane
            // count is passed in units of vecload_type_t.
            maxPoolFwdVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
                                     vecload_type_t>
                <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(outFeatures.data(), inFeatures.data(),
                                    indices.subview(0).data(),
                                    indices.subview(1).data(), numHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }

          if (size > numHotBlock) {
            // Remaining pairs past numHotBlock go through the bounds-checked
            // kernel.
            maxPoolFwdGenericKernel<scalar_t, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
                   0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
                                       indices.subview(0).data() + numHotBlock,
                                       indices.subview(1).data() + numHotBlock,
                                       size - numHotBlock, numPlanes);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
        }
      }
    });

    // Fallback: the plane count is not a multiple of any candidate NumTLP.
    if (notFound) {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (numHotBlock >= NumTLP) {
        maxPoolFwdGenericBlockKernel<scalar_t, Index, NumTLP, NumILP>
            <<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(),
                indices.subview(0).data(), indices.subview(1).data(),
                numHotBlock, numPlanes);
        TV_CHECK_CUDA_ERR();
      }

      if (size > numHotBlock) {
        maxPoolFwdGenericKernel<scalar_t, Index, NumTLP, NumILP>
            <<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(),
                indices.subview(0).data() + numHotBlock,
                indices.subview(1).data() + numHotBlock, size - numHotBlock,
                numPlanes);
        TV_CHECK_CUDA_ERR();
      }
    }
  }
};

// GPU specialization of the sparse max-pool backward pass for one set of
// (input row, output row) index pairs.
//
// Parameters:
//   d           - supplies the CUDA stream for every launch.
//   outFeatures - pooled features produced by the forward pass.
//   inFeatures  - input features; dim(1) is taken as the number of planes.
//   fout        - gradient with respect to the pooled features.
//   fin         - gradient with respect to the input features, accumulated
//                 in place.
//   indices     - subview(0) is passed to the kernels as the input indices,
//                 subview(1) as the output indices.
//   size        - number of index pairs; nothing is launched when <= 0.
//
// Kernel selection mirrors the forward functor: the first NumTLP in
// kernel_block_t that divides the plane count gets the vectorised kernel plus
// the bounds-checked generic kernel for the remainder; otherwise the
// non-vectorised kernels are used with NumTLP = 64.
template <typename scalar_t, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::TorchGPU, scalar_t, Index> {
  // Vector type for wide loads/stores: int2 for at::Half, int4 otherwise.
  using vecload_type_t =
      std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
  // Candidate NumTLP values handed to mp_for_each below.
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::TorchGPU &d,
                  tv::TensorView<const scalar_t> outFeatures,
                  tv::TensorView<const scalar_t> inFeatures,
                  tv::TensorView<const scalar_t> fout,
                  tv::TensorView<scalar_t> fin,
                  tv::TensorView<const Index> indices, int size) {
    if (size <= 0) return;
    int numPlanes = inFeatures.dim(1);
    // Stays true until one candidate NumTLP has handled the work.
    bool notFound = true;
    // Number of scalars carried by one vecload_type_t.
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);
    mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &fout, &fin,
                                 &indices, &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;

      // Largest multiple of NumTLP that does not exceed size.
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (numHotBlock >= NumTLP) {
            // Vectorised path; grid.x is capped at 512 blocks and the plane
            // count is passed in units of vecload_type_t.
            maxPoolBwdVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
                                     vecload_type_t>
                <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(outFeatures.data(), inFeatures.data(),
                                    fout.data(), fin.data(),
                                    indices.subview(0).data(),
                                    indices.subview(1).data(), numHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }

          if (size > numHotBlock) {
            // Remaining pairs past numHotBlock go through the bounds-checked
            // kernel.
            maxPoolBwdGenericKernel<scalar_t, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
                   0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
                                       fout.data(), fin.data(),
                                       indices.subview(0).data() + numHotBlock,
                                       indices.subview(1).data() + numHotBlock,
                                       size - numHotBlock, numPlanes);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
        }
      }
    });

    // Fallback: the plane count is not a multiple of any candidate NumTLP.
    if (notFound) {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (numHotBlock >= NumTLP) {
        maxPoolBwdGenericBlockKernel<scalar_t, Index, NumTLP, NumILP>
            <<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(), fout.data(), fin.data(),
                indices.subview(0).data(), indices.subview(1).data(),
                numHotBlock, numPlanes);
        TV_CHECK_CUDA_ERR();
      }

      if (size > numHotBlock) {
        maxPoolBwdGenericKernel<scalar_t, Index, NumTLP, NumILP>
            <<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(), fout.data(), fin.data(),
                indices.subview(0).data() + numHotBlock,
                indices.subview(1).data() + numHotBlock, size - numHotBlock,
                numPlanes);
        TV_CHECK_CUDA_ERR();
      }
    }
  }
};

}  // namespace functor

// Explicitly instantiates the TorchGPU max-pool forward and backward functors
// for one (scalar_t, Index) combination.
#define DECLARE_GPU_SPECS_T_INDEX(scalar_t, Index)                             \
  template struct functor::SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, \
                                                       Index>;                 \
  template struct functor::SparseMaxPoolBackwardFunctor<tv::TorchGPU,          \
                                                        scalar_t, Index>;

// Instantiates the functors above with `int` indices.
#define DECLARE_GPU_SPECS(scalar_t) DECLARE_GPU_SPECS_T_INDEX(scalar_t, int);

// Supported feature element types.
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);

// The helper macros are local to this translation unit.
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/sparse_pool_ops_cuda.cu
================================================
#include <cuda_runtime_api.h>
#include <torch/script.h>
// clang-format off
// TODO: make spconv_utils.h order agnostic
#include "../spconv_utils.h"
// clang-format on
#include <utils/spconv/spconv/maxpool.h>

#include "pytorch_cuda_helper.hpp"

// Runs the sparse max-pool forward pass and returns the pooled features.
//
// Args:
//   features    - input features; size(1) is the number of planes.
//   indicePairs - index pairs; size(0) is the number of kernel offsets and
//                 subview(i) is handed to the functor for offset i.
//   indiceNum   - per-offset pair counts, read on the CPU as `int`.
//   numAct      - number of rows in the returned tensor.
//
// The result is created with torch::zeros and the functor only raises
// entries, so output entries never drop below zero.
torch::Tensor IndiceMaxpoolForwardCUDAKernelLauncher(torch::Tensor features,
                                                     torch::Tensor indicePairs,
                                                     torch::Tensor indiceNum,
                                                     int64_t numAct) {
  at::cuda::CUDAGuard device_guard(features.device());
  const bool runOnCpu = (features.device().type() == torch::kCPU);
  const auto kernelVolume = indicePairs.size(0);
  const auto numInPlanes = features.size(1);
  // Pair counts are read on the host to decide which offsets need a launch.
  auto pairCountsCpu = indiceNum.to({torch::kCPU});
  torch::Tensor output = torch::zeros(
      {numAct, numInPlanes},
      torch::TensorOptions().dtype(features.dtype()).device(features.device()));

  for (int offset = 0; offset < kernelVolume; ++offset) {
    const auto nHot = pairCountsCpu.data_ptr<int>()[offset];
    if (nHot > 0) {
      AT_DISPATCH_FLOATING_TYPES_AND_HALF(
          features.scalar_type(), "IndiceMaxpoolForwardKernel", [&] {
            auto outView = tv::torch2tv<scalar_t>(output);
            auto inView = tv::torch2tv<const scalar_t>(features);
            auto pairView = tv::torch2tv<const int>(indicePairs).subview(offset);
            if (runOnCpu) {
              functor::SparseMaxPoolForwardFunctor<tv::CPU, scalar_t, int>
                  poolForward;
              poolForward(tv::CPU(), outView, inView, pairView, nHot);
            } else {
              functor::SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, int>
                  poolForward;
              poolForward(tv::TorchGPU(), outView, inView, pairView, nHot);
              TV_CHECK_CUDA_ERR();
            }
          });
    }
  }
  return output;
}

// Runs the sparse max-pool backward pass and returns the gradient with
// respect to `features`.
//
// Args:
//   features    - input features of the forward pass.
//   outFeatures - pooled features produced by the forward pass.
//   outGrad     - gradient with respect to `outFeatures`.
//   indicePairs - index pairs; size(0) is the number of kernel offsets and
//                 subview(i) is handed to the functor for offset i.
//   indiceNum   - per-offset pair counts, read on the CPU as `int`.
//
// The returned tensor has the shape, dtype and device of `features` and
// starts from zeros before the functor accumulates into it.
torch::Tensor IndiceMaxpoolBackwardCUDAKernelLauncher(torch::Tensor features,
                                                      torch::Tensor outFeatures,
                                                      torch::Tensor outGrad,
                                                      torch::Tensor indicePairs,
                                                      torch::Tensor indiceNum) {
  at::cuda::CUDAGuard device_guard(features.device());
  const bool runOnCpu = (features.device().type() == torch::kCPU);
  // Not used below; the call is retained so its evaluation still happens.
  const auto numInPlanes = features.size(1);
  // Pair counts are read on the host to decide which offsets need a launch.
  auto pairCountsCpu = indiceNum.to({torch::kCPU});
  torch::Tensor inputGrad = torch::zeros(
      features.sizes(),
      torch::TensorOptions().dtype(features.dtype()).device(features.device()));
  const auto kernelVolume = indicePairs.size(0);

  for (int offset = 0; offset < kernelVolume; ++offset) {
    const auto nHot = pairCountsCpu.data_ptr<int>()[offset];
    if (nHot > 0) {
      AT_DISPATCH_FLOATING_TYPES_AND_HALF(
          features.scalar_type(), "IndiceMaxpoolBackwardKernel", [&] {
            auto pooledView = tv::torch2tv<const scalar_t>(outFeatures);
            auto inView = tv::torch2tv<const scalar_t>(features);
            auto gradOutView = tv::torch2tv<const scalar_t>(outGrad);
            auto gradInView = tv::torch2tv<scalar_t>(inputGrad);
            auto pairView = tv::torch2tv<const int>(indicePairs).subview(offset);
            if (runOnCpu) {
              functor::SparseMaxPoolBackwardFunctor<tv::CPU, scalar_t, int>
                  poolBackward;
              poolBackward(tv::CPU(), pooledView, inView, gradOutView,
                           gradInView, pairView, nHot);
            } else {
              functor::SparseMaxPoolBackwardFunctor<tv::TorchGPU, scalar_t, int>
                  poolBackward;
              poolBackward(tv::TorchGPU(), pooledView, inView, gradOutView,
                           gradInView, pairView, nHot);
              TV_CHECK_CUDA_ERR();
            }
          });
    }
  }
  return inputGrad;
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/sparse_reordering.cu
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <ATen/ATen.h>
// clang-format off
// TODO: make spconv_utils.h order agnostic
#include "../spconv_utils.h"
// clang-format on
#include <utils/spconv/spconv/mp_helper.h>
#include <utils/spconv/spconv/reordering.h>
#include <utils/spconv/tensorview/helper_launch.h>
#include <utils/spconv/tensorview/tensorview.h>

#include <chrono>
#include <limits>
#include <spconv/reordering.cuh>
#include <type_traits>
#include <utils/spconv/tensorview/helper_kernel.cuh>

#include "pytorch_cuda_helper.hpp"

namespace functor {
// GPU specialization of the sparse gather step: launches the gather kernels
// with `buffer` as destination, `features` as source and `indices` as the row
// selection.
//
// Parameters:
//   d        - supplies the CUDA stream for every launch.
//   buffer   - destination buffer.
//   features - source features; dim(1) is taken as the number of planes.
//   indices  - indices passed to the kernels.
//   size     - number of indices to process; nothing is launched when <= 0.
//
// Kernel selection: the first NumTLP in kernel_block_t that divides the plane
// count gets the vectorised block kernel for the leading multiple-of-NumTLP
// entries plus gatherVecKernel for the remainder; otherwise
// gatherGenericKernel handles everything with NumTLP = 64.
template <typename scalar_t, typename Index>
struct SparseGatherFunctor<tv::TorchGPU, scalar_t, Index> {
  // Vector type for wide loads/stores: int2 for at::Half, int4 otherwise.
  using vecload_type_t =
      std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
  // Candidate NumTLP values handed to mp_for_each below.
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> buffer,
                  tv::TensorView<const scalar_t> features,
                  tv::TensorView<const Index> indices, int size) {
    if (size <= 0) return;
    int numPlanes = features.dim(1);
    // Stays true until one candidate NumTLP has handled the work.
    bool notFound = true;
    // Number of scalars carried by one vecload_type_t.
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);
    mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
                                 &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;
      // Largest multiple of NumTLP that does not exceed size.
      int nHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (nHotBlock >= NumTLP) {
            // NOTE(review): grid.y grows with size / NumTLP here; CUDA limits
            // gridDim.y, so very large `size` may need confirming.
            gatherVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
                                 vecload_type_t>
                <<<dim3(numPlanes / NumTLP, size / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(buffer.data(), features.data(),
                                    indices.data(), nHotBlock,
                                    numPlanes / vecloadFactor);

            TV_CHECK_CUDA_ERR();
          }
          if (size - nHotBlock > 0) {
            // Remaining entries: the destination is advanced past the rows
            // already written, the indices past the entries already consumed.
            gatherVecKernel<scalar_t, Index, int(NumTLP), NumILP,
                            vecload_type_t>
                <<<dim3(1, numPlanes / NumTLP),
                   dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
                   d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
                                    features.data(), indices.data() + nHotBlock,
                                    size - nHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
        }
      }
    });

    // Fallback: the plane count is not a multiple of any candidate NumTLP.
    if (notFound) {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      gatherGenericKernel<scalar_t, Index, NumTLP, NumILP>
          <<<dim3(tv::launch::DivUp(size, NumTLP),
                  tv::launch::DivUp(numPlanes, NumTLP)),
             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
              buffer.data(), features.data(), indices.data(), size, numPlanes);
      TV_CHECK_CUDA_ERR();
    }
  }
};
// GPU specialization of the sparse scatter-add step: launches the
// scatter-add kernels with `outFeatures` as destination, `buffer` as source
// and `indices` as the row selection.
//
// Parameters:
//   d           - supplies the CUDA stream for every launch.
//   outFeatures - destination features; dim(1) is taken as the number of
//                 planes.
//   buffer      - source buffer.
//   indices     - indices passed to the kernels.
//   size        - number of indices to process; nothing is launched when <= 0.
//   stable      - not read by this specialization.
//
// Kernel selection: the first NumTLP in kernel_block_t that divides the plane
// count gets the vectorised block kernel for the leading multiple-of-NumTLP
// entries plus scatterAddGenericKernel for the remainder; otherwise
// scatterAddGenericKernel handles everything with NumTLP = 64.
template <typename scalar_t, typename Index>
struct SparseScatterAddFunctor<tv::TorchGPU, scalar_t, Index> {
  // Vector type for wide loads/stores: int2 for at::Half, int4 otherwise.
  using vecload_type_t =
      std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
  // Candidate NumTLP values handed to mp_for_each below.
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> outFeatures,
                  tv::TensorView<const scalar_t> buffer,
                  tv::TensorView<const Index> indices, int size, bool stable) {
    if (size <= 0) return;
    int numPlanes = outFeatures.dim(1);
    // Stays true until one candidate NumTLP has handled the work.
    bool notFound = true;
    constexpr int vecloadFactor =
        sizeof(vecload_type_t) / sizeof(scalar_t);  // important for half.
    mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
                                 &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;
      // Largest multiple of NumTLP that does not exceed size.
      int nHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (nHotBlock >= NumTLP) {
            // NOTE(review): grid.y grows with size / NumTLP here; CUDA limits
            // gridDim.y, so very large `size` may need confirming.
            scatterAddVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
                                     vecload_type_t>
                <<<dim3(numPlanes / NumTLP, size / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(outFeatures.data(), buffer.data(),
                                    indices.data(), nHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          if (size - nHotBlock > 0) {
            // Remaining entries: the source buffer is advanced past the rows
            // already consumed, the indices past the entries already used.
            scatterAddGenericKernel<scalar_t, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
                   0, d.getStream()>>>(
                    outFeatures.data(), buffer.data() + nHotBlock * numPlanes,
                    indices.data() + nHotBlock, size - nHotBlock, numPlanes);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
        }
      }
    });
    // Fallback: the plane count is not a multiple of any candidate NumTLP.
    if (notFound) {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      scatterAddGenericKernel<scalar_t, Index, NumTLP, NumILP>
          <<<dim3(tv::launch::DivUp(size, NumTLP),
                  tv::launch::DivUp(numPlanes, NumTLP)),
             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
              outFeatures.data(), buffer.data(), indices.data(), size,
              numPlanes);
      TV_CHECK_CUDA_ERR();
    }
  }
};

}  // namespace functor

// Explicit instantiation definitions for the GPU (tv::TorchGPU) versions of
// the gather and scatter-add functors, one pair per (scalar type, index type).
#define DECLARE_GPU_SPECS_T_INDEX(scalar_t, Index)                             \
  template struct functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, Index>; \
  template struct functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t,     \
                                                   Index>;

// Only `int` is used as the index type.
#define DECLARE_GPU_SPECS(scalar_t) DECLARE_GPU_SPECS_T_INDEX(scalar_t, int);

// Supported scalar types: float, double and half precision.
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);

// Undefine the helper macros so they do not leak past this point.
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/spconv_ops_cuda.cu
================================================
#include <cuda_runtime_api.h>
#include <torch/script.h>
// clang-format off
// TODO: make spconv_utils.h order agnostic
#include "../spconv_utils.h"
// clang-format on
#include <utils/spconv/spconv/indice.h>
#include <utils/spconv/spconv/reordering.h>

#include "pytorch_cuda_helper.hpp"

// Computes the index pairs used by the sparse convolution launchers for an
// NDim-dimensional sparse tensor.
//
// `indices` must have NDim + 1 columns, and each of kernelSize,
// outSpatialShape, stride, padding, outPadding and dilation must hold exactly
// NDim entries; the kernel volume is limited to 4096. `spatialShape` is not
// read here and `outPadding` is only length-checked. In submanifold mode
// (`_subM != 0`) the stride is forced to 1 and the padding to kernelSize / 2.
//
// Returns {output indices, indicePairs, indiceNum}. indicePairs is an int32
// tensor of shape (kernelVolume, 2, numAct) initialised to -1 and indiceNum is
// an int32 tensor of shape (kernelVolume) initialised to 0. In submanifold
// mode the output indices are `indices` itself; otherwise they are the first
// numActOut rows of a new (numAct * kernelVolume, NDim + 1) int32 tensor.
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
  at::cuda::CUDAGuard device_guard(indices.device());
  const bool subM = (_subM != 0);
  const bool transpose = (_transpose != 0);
  const auto numAct = indices.size(0);
  const auto coorDim = indices.size(1) - 1;

  // Every per-dimension argument must match the compile-time dimensionality.
  TV_ASSERT_RT_ERR(NDim == coorDim, "error");
  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");

  // Number of kernel offsets and number of output cells per batch element.
  int64_t kernelVolume = 1;
  for (const auto extent : kernelSize) {
    kernelVolume *= extent;
  }
  TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
  int64_t outputVolume = 1;
  for (const auto extent : outSpatialShape) {
    outputVolume *= extent;
  }

  // All bookkeeping tensors are int32 and live on the same device as indices.
  const auto int32Options =
      torch::dtype(torch::kInt32).device(indices.device());
  torch::Tensor pairTable =
      torch::full({kernelVolume, 2, numAct}, -1, int32Options);
  torch::Tensor pairCount = torch::zeros({kernelVolume}, int32Options);
  torch::Tensor gridOut =
      torch::full({batchSize * outputVolume}, -1, int32Options);
  auto pairUnique =
      torch::full({pairTable.numel() / 2 + 1}, std::numeric_limits<int>::max(),
                  int32Options);

  // Narrow the 64-bit geometry arguments to the int vectors the functors take.
  tv::SimpleVector<int, NDim> outShape32;
  tv::SimpleVector<int, NDim> kernel32;
  tv::SimpleVector<int, NDim> stride32;
  tv::SimpleVector<int, NDim> pad32;
  tv::SimpleVector<int, NDim> dilation32;
  for (unsigned d = 0; d < NDim; ++d) {
    outShape32.push_back(outSpatialShape[d]);
    kernel32.push_back(kernelSize[d]);
    // Submanifold mode ignores the caller's stride and padding.
    stride32.push_back(subM ? 1 : stride[d]);
    pad32.push_back(subM ? kernelSize[d] / 2 : padding[d]);
    dilation32.push_back(dilation[d]);
  }

  const bool onCpu = indices.device().type() == torch::kCPU;
  int64_t numActOut = -1;

  if (subM) {
    if (onCpu) {
      functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim> makePairs;
      numActOut = makePairs(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(pairTable),
          tv::torch2tv<int>(pairCount), kernel32, stride32, pad32, dilation32,
          outShape32, transpose);
    } else {
      functor::CreateSubMIndicePairFunctor<tv::TorchGPU, int, int, NDim>
          makePairs;
      numActOut = makePairs(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(pairTable),
          tv::torch2tv<int>(pairCount), kernel32, stride32, pad32, dilation32,
          outShape32, transpose);
    }
    // Submanifold output shares the input's active sites.
    return {indices, pairTable, pairCount};
  }

  // Regular convolution: upper bound of one output site per (input, offset).
  torch::Tensor outInds =
      torch::zeros({numAct * kernelVolume, coorDim + 1}, int32Options);
  if (onCpu) {
    functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim> makePairs;
    numActOut = makePairs(
        tv::CPU(), tv::torch2tv<const int>(indices),
        tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
        tv::torch2tv<int>(pairTable), tv::torch2tv<int>(pairCount), kernel32,
        stride32, pad32, dilation32, outShape32, transpose);
  } else {
    // The GPU path is split in two phases with a de-duplication in between.
    functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, int, int, NDim>
        phaseOne;
    functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, int, int, NDim>
        phaseTwo;
    numActOut = phaseOne(
        tv::TorchGPU(), tv::torch2tv<const int>(indices),
        tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
        tv::torch2tv<int>(pairTable), tv::torch2tv<int>(pairCount),
        tv::torch2tv<int>(pairUnique), kernel32, stride32, pad32, dilation32,
        outShape32, transpose);
    if (numActOut > 0) {
      pairUnique = std::get<0>(torch::_unique(pairUnique));
      numActOut = phaseTwo(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(pairTable), tv::torch2tv<int>(pairCount),
          tv::torch2tv<int>(pairUnique), outShape32, transpose);
    }
  }
  return {outInds.slice(0, 0, numActOut), pairTable, pairCount};
}

// Variant of the index-pair builder that works on a caller-provided `gridOut`
// scratch tensor instead of allocating one.
//
// `indices` must have NDim + 1 columns, each per-dimension argument must hold
// NDim entries, the kernel volume is limited to 4096 and `gridOut` must hold
// at least outputVolume * batchSize elements. `spatialShape` is not read here
// and `outPadding` is only length-checked. The CPU paths refill `gridOut`
// with -1 after use; the GPU submanifold and phase-two calls instead pass an
// extra trailing `true` flag to the functor (presumably requesting the same
// reset on the device -- confirm against the functor definition).
//
// Returns {output indices, indicePairs, indiceNum} with the same layout as the
// forward index-pair launcher.
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
  at::cuda::CUDAGuard device_guard(indices.device());
  const bool subM = (_subM != 0);
  const bool transpose = (_transpose != 0);
  const auto numAct = indices.size(0);
  const auto coorDim = indices.size(1) - 1;

  // Every per-dimension argument must match the compile-time dimensionality.
  TV_ASSERT_RT_ERR(NDim == coorDim, "error");
  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");

  // Number of kernel offsets and number of output cells per batch element.
  int64_t kernelVolume = 1;
  for (const auto extent : kernelSize) {
    kernelVolume *= extent;
  }
  TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
  int64_t outputVolume = 1;
  for (const auto extent : outSpatialShape) {
    outputVolume *= extent;
  }
  // The caller-provided grid must be large enough for every output cell.
  TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, "error");

  // All bookkeeping tensors are int32 and live on the same device as indices.
  const auto int32Options =
      torch::dtype(torch::kInt32).device(indices.device());
  torch::Tensor pairTable =
      torch::full({kernelVolume, 2, numAct}, -1, int32Options);
  torch::Tensor pairCount = torch::zeros({kernelVolume}, int32Options);
  auto pairUnique =
      torch::full({pairTable.numel() / 2 + 1}, std::numeric_limits<int>::max(),
                  int32Options);

  // Narrow the 64-bit geometry arguments to the int vectors the functors take.
  tv::SimpleVector<int, NDim> outShape32;
  tv::SimpleVector<int, NDim> kernel32;
  tv::SimpleVector<int, NDim> stride32;
  tv::SimpleVector<int, NDim> pad32;
  tv::SimpleVector<int, NDim> dilation32;
  for (unsigned d = 0; d < NDim; ++d) {
    outShape32.push_back(outSpatialShape[d]);
    kernel32.push_back(kernelSize[d]);
    // Submanifold mode ignores the caller's stride and padding.
    stride32.push_back(subM ? 1 : stride[d]);
    pad32.push_back(subM ? kernelSize[d] / 2 : padding[d]);
    dilation32.push_back(dilation[d]);
  }

  const bool onCpu = indices.device().type() == torch::kCPU;
  int64_t numActOut = -1;

  if (subM) {
    if (onCpu) {
      functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim> makePairs;
      numActOut = makePairs(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(pairTable),
          tv::torch2tv<int>(pairCount), kernel32, stride32, pad32, dilation32,
          outShape32, transpose);
      // Leave the shared grid clean for the next caller.
      gridOut.fill_(-1);
    } else {
      functor::CreateSubMIndicePairFunctor<tv::TorchGPU, int, int, NDim>
          makePairs;
      numActOut = makePairs(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(pairTable),
          tv::torch2tv<int>(pairCount), kernel32, stride32, pad32, dilation32,
          outShape32, transpose, true);
    }
    // Submanifold output shares the input's active sites.
    return {indices, pairTable, pairCount};
  }

  // Regular convolution: upper bound of one output site per (input, offset).
  torch::Tensor outInds =
      torch::zeros({numAct * kernelVolume, coorDim + 1}, int32Options);
  if (onCpu) {
    functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim> makePairs;
    numActOut = makePairs(
        tv::CPU(), tv::torch2tv<const int>(indices),
        tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
        tv::torch2tv<int>(pairTable), tv::torch2tv<int>(pairCount), kernel32,
        stride32, pad32, dilation32, outShape32, transpose, true);
    // Leave the shared grid clean for the next caller.
    gridOut.fill_(-1);
  } else {
    // The GPU path is split in two phases with a de-duplication in between.
    functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, int, int, NDim>
        phaseOne;
    functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, int, int, NDim>
        phaseTwo;
    numActOut = phaseOne(
        tv::TorchGPU(), tv::torch2tv<const int>(indices),
        tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
        tv::torch2tv<int>(pairTable), tv::torch2tv<int>(pairCount),
        tv::torch2tv<int>(pairUnique), kernel32, stride32, pad32, dilation32,
        outShape32, transpose);
    if (numActOut > 0) {
      pairUnique = std::get<0>(torch::_unique(pairUnique));
      numActOut = phaseTwo(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(pairTable), tv::torch2tv<int>(pairCount),
          tv::torch2tv<int>(pairUnique), outShape32, transpose, true);
    }
  }
  return {outInds.slice(0, 0, numActOut), pairTable, pairCount};
}

// Forward pass of the sparse convolution, computed per kernel offset as
// gather -> dense matrix multiply -> scatter-add.
//
// Arguments:
//   features    (numActIn, numInPlanes) input features.
//   filters     weights whose last two dimensions are
//               (numInPlanes, numOutPlanes); leading dimensions are flattened
//               into one kernel-offset axis.
//   indicePairs int32 pair table; dimension 0 indexes the kernel offset and
//               dimension 1 selects the input/output side.
//   indiceNum   int32 number of valid pairs for each kernel offset.
//   numActOut   number of rows of the returned tensor.
//   _inverse    non-zero swaps which side of a pair is gathered from and
//               which is scattered to.
//   _subM       non-zero enables the submanifold shortcut: the offset with the
//               most pairs is evaluated as one dense multiply over all
//               features and skipped in the per-offset loop.
//
// Returns a (numActOut, numOutPlanes) tensor with the dtype and device of
// `features`. If there are no kernel offsets the result is all zeros.
torch::Tensor IndiceConvForwardCUDAKernelLauncher(
    torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,
    torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse,
    int64_t _subM) {
  at::cuda::CUDAGuard device_guard(features.device());
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
  auto device = features.device().type();
  auto ndim = filters.dim() - 2;
  auto kernelVolume = indicePairs.size(0);
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);

  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
  if (kernelVolume <= 0) {
    // Nothing can contribute to the output, and std::max_element on an empty
    // range would return the end iterator, which must not be dereferenced.
    return output;
  }

  // The pair counts are needed on the host to size buffers and drive the loop.
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  const int *pairCounts = indicePairNumCpu.data_ptr<int>();
  const int *maxCountIter =
      std::max_element(pairCounts, pairCounts + kernelVolume);
  int indicePairMaxOffset = maxCountIter - pairCounts;
  int indicePairMaxSize = *maxCountIter;

  // Scratch buffers sized for the busiest kernel offset and reused by all.
  torch::Tensor inputBuffer =
      torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
  if (subM) {
    torch::mm_out(output, features, filters[indicePairMaxOffset]);
  }
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = pairCounts[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        features.scalar_type(), "IndiceConvForwardKernel", [&] {
          // Views over the first nHot rows of the scratch buffers.
          auto outputBufferBlob = torch::from_blob(
              outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);
          auto inputBufferBlob = torch::from_blob(
              inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);

          // Gather the input rows that take part in this kernel offset.
          if (device == torch::kCPU) {
            functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;
            gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
                       tv::torch2tv<const scalar_t>(features),
                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                       nHot);
          } else {
            // An index_select_out based gather was tried by the original
            // authors and found slower than SparseGatherFunctor, possibly
            // because of the int -> long index conversion it needs.
            functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
                gatherFtor;
            gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
                       tv::torch2tv<const scalar_t>(features),
                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                       nHot);
            TV_CHECK_CUDA_ERR();
          }
          torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);

          // Accumulate the products into the rows of the paired output sites.
          if (device == torch::kCPU) {
            functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>
                scatterFtor;
            scatterFtor(
                tv::CPU(), tv::torch2tv<scalar_t>(output),
                tv::torch2tv<const scalar_t>(outputBuffer),
                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
                true);
          } else {
            functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
                scatterFtor;
            scatterFtor(
                tv::TorchGPU(), tv::torch2tv<scalar_t>(output),
                tv::torch2tv<const scalar_t>(outputBuffer),
                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
                true);
            TV_CHECK_CUDA_ERR();
          }
        });
  }
  return output;
}

// Backward pass of the sparse convolution: computes the gradients with
// respect to the input features and the filters, one kernel offset at a time.
//
// Arguments:
//   features    (numActIn, numInPlanes) input features of the forward pass.
//   filters     weights whose last two dimensions are
//               (numInPlanes, numOutPlanes).
//   outGrad     gradient with respect to the forward output.
//   indicePairs int32 pair table; dimension 0 indexes the kernel offset and
//               dimension 1 selects the input/output side.
//   indiceNum   int32 number of valid pairs for each kernel offset.
//   _inverse    non-zero swaps the roles of the two sides of each pair.
//   _subM       non-zero enables the submanifold shortcut: the offset with the
//               most pairs is evaluated with dense multiplies over all rows
//               and skipped in the per-offset loop.
//
// Returns {inputGrad, filtersGrad}; inputGrad has the shape of `features` and
// filtersGrad has the original (un-flattened) shape of `filters`. If there
// are no kernel offsets both gradients are all zeros.
std::vector<torch::Tensor> IndiceConvBackwardCUDAKernelLauncher(
    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
    int64_t _subM) {
  at::cuda::CUDAGuard device_guard(features.device());
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;

  auto device = features.device().type();
  auto ndim = filters.dim() - 2;
  auto kernelVolume = indicePairs.size(0);
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  // Copy the shape: sizes() returns a non-owning reference into the tensor,
  // and `filters` is rebound to a view below, so the reference could dangle
  // by the time it is used for the final view().
  const std::vector<int64_t> filterShape = filters.sizes().vec();
  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
  if (kernelVolume <= 0) {
    // Nothing contributes to either gradient, and std::max_element on an
    // empty range would return the end iterator, which must not be
    // dereferenced.
    return {inputGrad, filtersGrad};
  }

  // The pair counts are needed on the host to size buffers and drive the loop.
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  const int *pairCounts = indicePairNumCpu.data_ptr<int>();
  const int *maxCountIter =
      std::max_element(pairCounts, pairCounts + kernelVolume);
  int indicePairMaxOffset = maxCountIter - pairCounts;
  int indicePairMaxSize = *maxCountIter;

  // Scratch buffers sized for the busiest kernel offset and reused by all.
  torch::Tensor inputBuffer =
      torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);

  filters = filters.view({-1, numInPlanes, numOutPlanes});
  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
  if (subM) {
    auto filterGradSub = filtersGrad[indicePairMaxOffset];
    torch::mm_out(filterGradSub, features.t(), outGrad);
    torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
  }
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = pairCounts[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        features.scalar_type(), "IndiceConvBackwardKernel", [&] {
          // Gather the paired rows of the features and of the output gradient.
          if (device == torch::kCPU) {
            functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;
            functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtorOut;
            gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
                       tv::torch2tv<const scalar_t>(features),
                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                       nHot);
            gatherFtorOut(
                tv::CPU(), tv::torch2tv<scalar_t>(outputBuffer),
                tv::torch2tv<const scalar_t>(outGrad),
                tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                nHot);
          } else {
            functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
                gatherFtor;
            functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
                gatherFtorOut;
            gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
                       tv::torch2tv<const scalar_t>(features),
                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                       nHot);
            TV_CHECK_CUDA_ERR();
            gatherFtorOut(
                tv::TorchGPU(), tv::torch2tv<scalar_t>(outputBuffer),
                tv::torch2tv<const scalar_t>(outGrad),
                tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                nHot);
            TV_CHECK_CUDA_ERR();
          }
          // Views over the first nHot rows of the scratch buffers.
          auto filterGradSub = filtersGrad[i];
          auto outputBufferBlob = torch::from_blob(
              outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);
          auto inputBufferBlob = torch::from_blob(
              inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);

          // Filter gradient first: the second multiply overwrites the
          // gathered inputs in inputBuffer with the input-gradient rows.
          torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
          torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
          // Accumulate the input-gradient rows back onto their input sites.
          if (device == torch::kCPU) {
            functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>
                scatterFtor;
            scatterFtor(
                tv::CPU(), tv::torch2tv<scalar_t>(inputGrad),
                tv::torch2tv<const scalar_t>(inputBuffer),
                tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
          } else {
            functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
                scatterFtor;
            scatterFtor(
                tv::TorchGPU(), tv::torch2tv<scalar_t>(inputGrad),
                tv::torch2tv<const scalar_t>(inputBuffer),
                tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
            TV_CHECK_CUDA_ERR();
          }
        });
  }
  return {inputGrad, filtersGrad.view(filterShape)};
}

// Explicit instantiation definitions: emit the templated launchers for the
// supported dimensionalities in this translation unit.
// Forward launcher: 2, 3 and 4 spatial dimensions.
template std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<2>(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

template std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<3>(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

template std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<4>(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

// Backward launcher: only 2 and 3 spatial dimensions are instantiated here
// (there is no 4-D counterpart in this file).
template std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher<2>(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

template std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher<3>(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/stack_ball_query_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "pytorch_cuda_helper.hpp"
#include "stack_ball_query_cuda_kernel.cuh"
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

void StackBallQueryForwardCUDAKernelLauncher(float max_radius, int nsample,
                                             const Tensor new_xyz,
                                             const Tensor new_xyz_batch_cnt,
                                             const Tensor xyz,
                                             const Tensor xyz_batch_cnt,
                                             Tensor idx) {
  at::cuda::CUDAGuard device_guard(new_xyz.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  //   const float *new_xyz_ptr = new_xyz.data_ptr<float>();
  //   const float *xyz_ptr = xyz.data_ptr<float>();
  //   const int *new_xyz_batch_cnt_ptr = new_xyz_batch_cnt.data_ptr<int>();
  //   const int *xyz_batch_cnt_ptr = xyz_batch_cnt.data_ptr<int>();
  //   int *idx_ptr = idx.data_ptr<int>();

  int B = xyz_batch_cnt.size(0);
  int M = new_xyz.size(0);

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(DIVUP(M, THREADS_PER_BLOCK));
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      new_xyz.scalar_type(), "stack_ball_query_forward_cuda_kernel", [&] {
        stack_ball_query_forward_cuda_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                B, M, max_radius, nsample, new_xyz.data_ptr<scalar_t>(),
                new_xyz_batch_cnt.data_ptr<int>(), xyz.data_ptr<scalar_t>(),
                xyz_batch_cnt.data_ptr<int>(), idx.data_ptr<int>());
      });

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/stack_group_points_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu
#include <stdio.h>
#include <stdlib.h>

#include "pytorch_cuda_helper.hpp"
#include "stack_group_points_cuda_kernel.cuh"

void StackGroupPointsForwardCUDAKernelLauncher(
    int b, int c, int m, int nsample, const Tensor features_tensor,
    const Tensor features_batch_cnt_tensor, const Tensor idx_tensor,
    const Tensor idx_batch_cnt_tensor, Tensor out_tensor) {
  // points: (B, C, N)
  // idx: (B, npoints, nsample)
  // output:
  //      out: (B, C, npoints, nsample)
  at::cuda::CUDAGuard device_guard(features_tensor.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  dim3 blocks(DIVUP(m * c * nsample, THREADS_PER_BLOCK));
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      features_tensor.scalar_type(), "stack_group_points_forward_cuda_kernel",
      [&] {
        stack_group_points_forward_cuda_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                b, c, m, nsample, features_tensor.data_ptr<scalar_t>(),
                features_batch_cnt_tensor.data_ptr<int>(),
                idx_tensor.data_ptr<int>(),
                idx_batch_cnt_tensor.data_ptr<int>(),
                out_tensor.data_ptr<scalar_t>());
      });

  AT_CUDA_CHECK(cudaGetLastError());
}

void StackGroupPointsBackwardCUDAKernelLauncher(
    int b, int c, int m, int n, int nsample, const Tensor grad_out_tensor,
    const Tensor idx_tensor, const Tensor idx_batch_cnt_tensor,
    const Tensor features_batch_cnt_tensor, Tensor grad_features_tensor) {
  at::cuda::CUDAGuard device_guard(grad_features_tensor.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  dim3 blocks(DIVUP(m * c * nsample, THREADS_PER_BLOCK));
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_features_tensor.scalar_type(),
      "stack_group_points_backward_cuda_kernel", [&] {
        stack_group_points_backward_cuda_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                b, c, m, n, nsample, grad_out_tensor.data_ptr<scalar_t>(),
                idx_tensor.data_ptr<int>(),
                idx_batch_cnt_tensor.data_ptr<int>(),
                features_batch_cnt_tensor.data_ptr<int>(),
                grad_features_tensor.data_ptr<scalar_t>());
      });

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/sync_bn_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cuda_helper.hpp"
#include "sync_bn_cuda_kernel.cuh"

// Launches sync_bn_forward_mean_cuda_kernel with one thread block per channel
// (input.size(1)) on the current CUDA stream of `input`'s device.
//
// `input` is read through its first three dimensions (num, channels, spatial)
// and dispatched on its floating dtype (including half); `mean` is accessed
// as float and receives the kernel's result.
void SyncBNForwardMeanCUDAKernelLauncher(const Tensor input, Tensor mean) {
  const int num = input.size(0);
  const int channels = input.size(1);
  const int spatial = input.size(2);

  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "sync_bn_forward_mean_cuda_kernel", [&] {
        auto *input_ptr = input.data_ptr<scalar_t>();
        auto *mean_ptr = mean.data_ptr<float>();
        sync_bn_forward_mean_cuda_kernel<scalar_t>
            <<<channels, THREADS_PER_BLOCK, 0, stream>>>(
                input_ptr, mean_ptr, num, channels, spatial);
      });
  // Surface launch-time errors immediately.
  AT_CUDA_CHECK(cudaGetLastError());
}

// Launches sync_bn_forward_var_cuda_kernel with one thread block per channel
// (input.size(1)) on the current CUDA stream of `input`'s device.
//
// `input` is read through its first three dimensions (num, channels, spatial)
// and dispatched on its floating dtype (including half); `mean` and `var` are
// accessed as float, with `var` receiving the kernel's result.
void SyncBNForwardVarCUDAKernelLauncher(const Tensor input, const Tensor mean,
                                        Tensor var) {
  int num = input.size(0);
  int channels = input.size(1);
  int spatial = input.size(2);

  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // The dispatch label appears in the "not implemented for <dtype>" error, so
  // it must name the kernel that is actually launched here.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "sync_bn_forward_var_cuda_kernel", [&] {
        sync_bn_forward_var_cuda_kernel<scalar_t>
            <<<channels, THREADS_PER_BLOCK, 0, stream>>>(
                input.data_ptr<scalar_t>(), mean.data_ptr<float>(),
                var.data_ptr<float>(), num, channels, spatial);
      });
  // Surface launch-time errors immediately.
  AT_CUDA_CHECK(cudaGetLastError());
}

// Launches sync_bn_forward_output_cuda_kernel with one thread block per
// channel (input.size(1)) on the current CUDA stream of `input`'s device.
//
// `input` and `output` use the dispatched floating dtype (including half);
// mean, var, running_mean, running_var, weight, bias, norm and std are all
// accessed as float. `eps`, `momentum` and `group_size` are forwarded to the
// kernel unchanged.
void SyncBNForwardOutputCUDAKernelLauncher(
    const Tensor input, const Tensor mean, const Tensor var,
    Tensor running_mean, Tensor running_var, const Tensor weight,
    const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps,
    float momentum, int group_size) {
  int num = input.size(0);
  int channels = input.size(1);
  int spatial = input.size(2);

  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // The dispatch label appears in the "not implemented for <dtype>" error, so
  // it must name the kernel that is actually launched here.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "sync_bn_forward_output_cuda_kernel", [&] {
        sync_bn_forward_output_cuda_kernel<scalar_t>
            <<<channels, THREADS_PER_BLOCK, 0, stream>>>(
                input.data_ptr<scalar_t>(), mean.data_ptr<float>(),
                var.data_ptr<float>(), running_mean.data_ptr<float>(),
                running_var.data_ptr<float>(), weight.data_ptr<float>(),
                bias.data_ptr<float>(), norm.data_ptr<float>(),
                std.data_ptr<float>(), output.data_ptr<scalar_t>(), num,
                channels, spatial, eps, momentum, group_size);
      });
  // Surface launch-time errors immediately.
  AT_CUDA_CHECK(cudaGetLastError());
}

void SyncBNBackwardParamCUDAKernelLauncher(const Tensor grad_output,
                                           const Tensor norm,
                                           Tensor grad_weight,
                                           Tensor grad_bias) {
  int num = grad_output.size(0);
  int channels = grad_output.size(1);
  int spatial = grad_output.size(2);

  at::cuda::CUDAGuard device_guard(grad_output.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "sync_bn_backward_param_cuda_kernel", [&] {
        sync_bn_backward_param_cuda_kernel<scalar_t>
            <<<channels, THREADS_PER_BLOCK, 0, stream>>>(
                grad_output.data_ptr<scalar_t>(), norm.data_ptr<float>(),
                grad_weight.data_ptr<float>(), grad_bias.data_ptr<float>(), num,
                channels, spatial);
      });
  AT_CUDA_CHECK(cudaGetLastError());
}

void SyncBNBackwardDataCUDAKernelLauncher(const Tensor grad_output,
                                          const Tensor weight,
                                          const Tensor grad_weight,
                                          const Tensor grad_bias,
                                          const Tensor norm, const Tensor std,
                                          Tensor grad_input) {
  int output_size = grad_input.numel();
  int num = grad_input.size(0);
  int channels = grad_input.size(1);
  int spatial = grad_input.size(2);

  at::cuda::CUDAGuard device_guard(grad_input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "sync_bn_backward_data_cuda_kernel", [&] {
        sync_bn_backward_data_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, grad_output.data_ptr<scalar_t>(),
                weight.data_ptr<float>(), grad_weight.data_ptr<float>(),
                grad_bias.data_ptr<float>(), norm.data_ptr<float>(),
                std.data_ptr<float>(), grad_input.data_ptr<scalar_t>(), num,
                channels, spatial);
      });
  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/three_interpolate_cuda.cu
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate_gpu.cu

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "pytorch_cuda_helper.hpp"
#include "three_interpolate_cuda_kernel.cuh"

void ThreeInterpolateForwardCUDAKernelLauncher(int b, int c, int m, int n,
                                               const Tensor points,
                                               const Tensor idx,
                                               const Tensor weight,
                                               Tensor out) {
  // points: (B, C, M)
  // idx: (B, N, 3)
  // weight: (B, N, 3)
  // output:
  //      out: (B, C, N)

  at::cuda::CUDAGuard device_guard(points.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), c, b);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      points.scalar_type(), "three_interpolate_forward_cuda_kernel", [&] {
        three_interpolate_forward_cuda_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                b, c, m, n, points.data_ptr<scalar_t>(), idx.data_ptr<int>(),
                weight.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
      });

  AT_CUDA_CHECK(cudaGetLastError());
}

void ThreeInterpolateBackwardCUDAKernelLauncher(int b, int c, int n, int m,
                                                const Tensor grad_out,
                                                const Tensor idx,
                                                const Tensor weight,
                                                Tensor grad_points) {
  // grad_out: (B, C, N)
  // weight: (B, N, 3)
  // output:
  //      grad_points: (B, C, M)

  at::cuda::CUDAGuard device_guard(grad_out.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), c, b);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_out.scalar_type(), "three_interpolate_backward_cuda_kernel", [&] {
        three_interpolate_backward_cuda_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                b, c, n, m, grad_out.data_ptr<scalar_t>(), idx.data_ptr<int>(),
                weight.data_ptr<scalar_t>(), grad_points.data_ptr<scalar_t>());
      });

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/three_nn_cuda.cu
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate_gpu.cu

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "pytorch_cuda_helper.hpp"
#include "three_nn_cuda_kernel.cuh"

void ThreeNNForwardCUDAKernelLauncher(int b, int n, int m, const Tensor unknown,
                                      const Tensor known, Tensor dist2,
                                      Tensor idx) {
  // unknown: (B, N, 3)
  // known: (B, M, 3)
  // output:
  //      dist2: (B, N, 3)
  //      idx: (B, N, 3)

  at::cuda::CUDAGuard device_guard(unknown.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), b);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      unknown.scalar_type(), "three_nn_forward_cuda_kernel", [&] {
        three_nn_forward_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
            b, n, m, unknown.data_ptr<scalar_t>(), known.data_ptr<scalar_t>(),
            dist2.data_ptr<scalar_t>(), idx.data_ptr<int>());
      });

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/tin_shift_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cuda_helper.hpp"
#include "pytorch_device_registry.hpp"
#include "tin_shift_cuda_kernel.cuh"

// Launches tin_shift_forward_cuda_kernel on the current CUDA stream of the
// device that holds `input`.
//
// Sizes are taken from `input` dimensions 0..3 (named batch, t, channels and
// hw below) and from `shift` dimension 1 (number of groups). The launch grid
// is sized from batch * hw * channels, while the element count handed to the
// kernel is output.numel(). `shift` is accessed as an int buffer.
void TINShiftForwardCUDAKernelLauncher(Tensor input, Tensor shift,
                                       Tensor output) {
  const int total_out = output.numel();
  const int n_batch = input.size(0);
  const int n_t = input.size(1);
  const int n_channels = input.size(2);
  const int n_hw = input.size(3);
  const int n_groups = shift.size(1);
  const int channels_per_group = n_channels / n_groups;
  const int launch_elems = n_batch * n_hw * n_channels;

  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t cur_stream = at::cuda::getCurrentCUDAStream();
  const int grid_size = GET_BLOCKS(launch_elems);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "tin_shift_forward_cuda_kernel", [&] {
        auto *in_ptr = input.data_ptr<scalar_t>();
        auto *shift_ptr = shift.data_ptr<int>();
        auto *out_ptr = output.data_ptr<scalar_t>();
        tin_shift_forward_cuda_kernel<scalar_t>
            <<<grid_size, THREADS_PER_BLOCK, 0, cur_stream>>>(
                total_out, in_ptr, shift_ptr, out_ptr, n_batch, n_channels,
                n_t, n_hw, n_groups, channels_per_group);
      });

  // Surface any launch failure as an exception.
  AT_CUDA_CHECK(cudaGetLastError());
}

// Launches tin_shift_backward_cuda_kernel on the current CUDA stream of the
// device that holds `grad_output`.
//
// Sizes are taken from `grad_output` dimensions 0..3 (named batch, t,
// channels and hw below) and from `shift` dimension 1 (number of groups). The
// launch grid is sized from batch * hw * channels, while the element count
// handed to the kernel is grad_output.numel(). `shift` is accessed as an int
// buffer and `grad_input` is passed as the kernel's output buffer.
void TINShiftBackwardCUDAKernelLauncher(Tensor grad_output, Tensor shift,
                                        Tensor grad_input) {
  const int total_out = grad_output.numel();
  const int n_batch = grad_output.size(0);
  const int n_t = grad_output.size(1);
  const int n_channels = grad_output.size(2);
  const int n_hw = grad_output.size(3);
  const int n_groups = shift.size(1);
  const int channels_per_group = n_channels / n_groups;
  const int launch_elems = n_batch * n_hw * n_channels;

  at::cuda::CUDAGuard device_guard(grad_output.device());
  cudaStream_t cur_stream = at::cuda::getCurrentCUDAStream();
  const int grid_size = GET_BLOCKS(launch_elems);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "tin_shift_backward_cuda_kernel", [&] {
        auto *grad_out_ptr = grad_output.data_ptr<scalar_t>();
        auto *shift_ptr = shift.data_ptr<int>();
        auto *grad_in_ptr = grad_input.data_ptr<scalar_t>();
        tin_shift_backward_cuda_kernel<scalar_t>
            <<<grid_size, THREADS_PER_BLOCK, 0, cur_stream>>>(
                total_out, grad_out_ptr, shift_ptr, grad_in_ptr, n_batch,
                n_channels, n_t, n_hw, n_groups, channels_per_group);
      });

  // Surface any launch failure as an exception.
  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu
================================================
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include <c10/util/Half.h>
#include <torch/types.h>

#include "pytorch_cuda_helper.hpp"

// Parameter bundle passed by value to the upfirdn2d CUDA kernels.
struct upfirdn2d_kernel_params {
  const void *x;   // input buffer; the kernels cast it to `const T *`
  const float *f;  // filter taps (always float)
  void *y;         // output buffer; the kernels cast it to `T *`

  int2 up;    // upsampling factor (x, y)
  int2 down;  // downsampling factor (x, y)
  int2 pad0;  // padding offset (x, y) subtracted when locating the input
  int flip;   // non-zero selects the mirrored filter indexing in the kernels
  float gain;  // scale multiplied into every output value before storing

  int4 inSize;  // [width, height, channel, batch]
  int4 inStride;    // per-axis strides used to index x, in elements of T
  int2 filterSize;  // [width, height]
  int2 filterStride;  // per-axis strides used to index f, in floats
  int4 outSize;  // [width, height, channel, batch]
  int4 outStride;  // per-axis strides used to index y, in elements of T
  // The kernels form a flat index `major * sizeMinor + minor` and split it
  // into (batch, channel) by dividing by inSize.z.
  int sizeMinor;
  int sizeMajor;

  int loopMinor;  // upper bound on minor iterations per thread/block
  int loopMajor;  // upper bound on major iterations per blockIdx.z
  int loopX;      // upper bound on X iterations per thread/block
  // Divisor that splits the x launch index into (output row, minor index).
  int launchMinor;
  int launchMajor;  // NOTE(review): not read by the kernels visible here
};

//------------------------------------------------------------------------
// CUDA kernel specialization.

// Result of kernel selection: which kernel to run and its tiling/loop
// configuration, as filled in by choose_upfirdn2d_kernel().
struct upfirdn2d_kernel_spec {
  void *kernel;   // type-erased pointer to the selected __global__ kernel
  int tileOutW;   // output tile width; -1 when the large kernel is selected
  int tileOutH;   // output tile height; -1 when the large kernel is selected
  int loopMinor;  // minor-loop count associated with the selected kernel
  int loopX;      // X-loop count associated with the selected kernel
};

//------------------------------------------------------------------------
// CUDA kernel selection.

// Forward declaration: returns the kernel pointer and tiling configuration to
// use for parameters `p` with element type T. Defined further below.
template <class T>
upfirdn2d_kernel_spec choose_upfirdn2d_kernel(const upfirdn2d_kernel_params &p);
//------------------------------------------------------------------------

// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

//------------------------------------------------------------------------
// Helpers.

// Trait mapping an element type T to the type the kernels use for their local
// arithmetic (`InternalType<T>::scalar_t`): double stays double, float stays
// float, and c10::Half is widened to float. Only these three element types
// are supported; the primary template is intentionally left undefined.
template <class T>
struct InternalType;

template <>
struct InternalType<double> {
  using scalar_t = double;
};

template <>
struct InternalType<float> {
  using scalar_t = float;
};

template <>
struct InternalType<c10::Half> {
  using scalar_t = float;
};

// Integer division of a by b that rounds toward negative infinity instead of
// toward zero. The dividend is first shifted by a multiple of b so that the
// truncating `/` yields the floored quotient, then the shift is removed.
static __device__ __forceinline__ int floor_div(int a, int b) {
  const int shift = 1 - a / b;
  const int shifted = a + shift * b;
  return shifted / b - shift;
}

//------------------------------------------------------------------------
// Generic CUDA implementation for large filters.

// Generic upfirdn2d kernel: each thread computes output values directly from
// the input and filter buffers in `p`, accumulating in
// InternalType<T>::scalar_t and storing the result as T.
//
// Thread mapping:
//   - the x thread index encodes (outY, minor) via p.launchMinor,
//   - the y thread index selects the first output column,
//   - blockIdx.z selects the first "major" index.
// Each thread then loops over up to p.loopMajor major indices, p.loopMinor
// minor indices and p.loopX output columns.
template <class T>
static __global__ void upfirdn2d_kernel_large(upfirdn2d_kernel_params p) {
  typedef typename InternalType<T>::scalar_t scalar_t;

  // Calculate thread index.
  int minorBase = blockIdx.x * blockDim.x + threadIdx.x;
  int outY = minorBase / p.launchMinor;
  minorBase -= outY * p.launchMinor;
  int outXBase = blockIdx.y * p.loopX * blockDim.y + threadIdx.y;
  int majorBase = blockIdx.z * p.loopMajor;
  // Bitwise | evaluates every comparison (no short-circuit); threads whose
  // starting position is out of range exit here.
  if (outXBase >= p.outSize.x | outY >= p.outSize.y | majorBase >= p.sizeMajor)
    return;

  // Setup Y receptive field.
  // inY is the first input row (clamped to [0, inSize.y]) and h the number of
  // input rows that contribute to output row outY.
  int midY = outY * p.down.y + p.up.y - 1 - p.pad0.y;
  int inY = min(max(floor_div(midY, p.up.y), 0), p.inSize.y);
  int h =
      min(max(floor_div(midY + p.filterSize.y, p.up.y), 0), p.inSize.y) - inY;
  // Filter row paired with input row inY; mirrored when p.flip is set.
  int filterY = midY + p.filterSize.y - (inY + 1) * p.up.y;
  if (p.flip) filterY = p.filterSize.y - 1 - filterY;

  // Loop over major, minor, and X.
  for (int majorIdx = 0, major = majorBase;
       majorIdx < p.loopMajor & major < p.sizeMajor; majorIdx++, major++)
    for (int minorIdx = 0, minor = minorBase;
         minorIdx < p.loopMinor & minor < p.sizeMinor;
         minorIdx++, minor += p.launchMinor) {
      // Split the flat (major, minor) index into batch n and channel c.
      int nc = major * p.sizeMinor + minor;
      int n = nc / p.inSize.z;
      int c = nc - n * p.inSize.z;
      for (int loopX = 0, outX = outXBase; loopX < p.loopX & outX < p.outSize.x;
           loopX++, outX += blockDim.y) {
        // Setup X receptive field.
        // Same construction as for Y: inX is the first input column and w the
        // number of contributing input columns.
        int midX = outX * p.down.x + p.up.x - 1 - p.pad0.x;
        int inX = min(max(floor_div(midX, p.up.x), 0), p.inSize.x);
        int w =
            min(max(floor_div(midX + p.filterSize.x, p.up.x), 0), p.inSize.x) -
            inX;
        int filterX = midX + p.filterSize.x - (inX + 1) * p.up.x;
        if (p.flip) filterX = p.filterSize.x - 1 - filterX;

        // Initialize pointers.
        const T *xp =
            &((const T *)p.x)[inX * p.inStride.x + inY * p.inStride.y +
                              c * p.inStride.z + n * p.inStride.w];
        const float *fp =
            &p.f[filterX * p.filterStride.x + filterY * p.filterStride.y];
        // Moving one input sample moves the filter pointer by `up` taps,
        // forward when flipped and backward otherwise.
        int filterStepX = ((p.flip) ? p.up.x : -p.up.x) * p.filterStride.x;
        int filterStepY = ((p.flip) ? p.up.y : -p.up.y) * p.filterStride.y;

        // Inner loop.
        // Accumulate input * filter over the h x w receptive field.
        scalar_t v = 0;
        for (int y = 0; y < h; y++) {
          for (int x = 0; x < w; x++) {
            v += (scalar_t)(*xp) * (scalar_t)(*fp);
            xp += p.inStride.x;
            fp += filterStepX;
          }
          // Rewind the w column steps and advance both pointers by one row.
          xp += p.inStride.y - w * p.inStride.x;
          fp += filterStepY - w * filterStepX;
        }

        // Store result.
        v *= p.gain;
        ((T *)p.y)[outX * p.outStride.x + outY * p.outStride.y +
                   c * p.outStride.z + n * p.outStride.w] = (T)v;
      }
    }
}

//------------------------------------------------------------------------
// Specialized CUDA implementation for small filters.

// Tiled upfirdn2d kernel with compile-time up/down factors, filter bounds and
// tile shape. Each block stages the filter and one input tile in shared
// memory, then its threads compute a tileOutW x tileOutH output tile for
// loopMinor consecutive minor (channel) indices.
//
// Template parameters:
//   upx, upy         upsampling factors
//   downx, downy     downsampling factors
//   filterW, filterH shared filter array extents; taps at or beyond
//                    p.filterSize are stored as zero
//   tileOutW/H       output tile extents
//   loopMinor        number of minor indices handled per block
template <class T, int upx, int upy, int downx, int downy, int filterW,
          int filterH, int tileOutW, int tileOutH, int loopMinor>
static __global__ void upfirdn2d_kernel_small(upfirdn2d_kernel_params p) {
  typedef typename InternalType<T>::scalar_t scalar_t;
  // Input tile extents derived from the output tile, downsampling factor,
  // filter extent and upsampling factor.
  const int tileInW = ((tileOutW - 1) * downx + filterW - 1) / upx + 1;
  const int tileInH = ((tileOutH - 1) * downy + filterH - 1) / upy + 1;
  __shared__ volatile scalar_t sf[filterH][filterW];
  __shared__ volatile scalar_t sx[tileInH][tileInW][loopMinor];

  // Calculate tile index.
  // blockIdx.x encodes (tile row, minor group) via p.launchMinor.
  int minorBase = blockIdx.x;
  int tileOutY = minorBase / p.launchMinor;
  minorBase -= tileOutY * p.launchMinor;
  minorBase *= loopMinor;
  tileOutY *= tileOutH;
  int tileOutXBase = blockIdx.y * p.loopX * tileOutW;
  int majorBase = blockIdx.z * p.loopMajor;
  // Bitwise | evaluates every comparison (no short-circuit). The condition
  // depends only on block indices and p.
  if (tileOutXBase >= p.outSize.x | tileOutY >= p.outSize.y |
      majorBase >= p.sizeMajor)
    return;

  // Load filter (flipped).
  // Threads stride over the filterH x filterW shared array; the source index
  // is mirrored unless p.flip is set, and out-of-range taps become zero.
  for (int tapIdx = threadIdx.x; tapIdx < filterH * filterW;
       tapIdx += blockDim.x) {
    int fy = tapIdx / filterW;
    int fx = tapIdx - fy * filterW;
    scalar_t v = 0;
    if (fx < p.filterSize.x & fy < p.filterSize.y) {
      int ffx = (p.flip) ? fx : p.filterSize.x - 1 - fx;
      int ffy = (p.flip) ? fy : p.filterSize.y - 1 - fy;
      v = (scalar_t)p.f[ffx * p.filterStride.x + ffy * p.filterStride.y];
    }
    sf[fy][fx] = v;
  }

  // Loop over major and X.
  for (int majorIdx = 0, major = majorBase;
       majorIdx < p.loopMajor & major < p.sizeMajor; majorIdx++, major++) {
    // Split the flat (major, minor) index into batch n and first channel.
    int baseNC = major * p.sizeMinor + minorBase;
    int n = baseNC / p.inSize.z;
    int baseC = baseNC - n * p.inSize.z;
    for (int loopX = 0, tileOutX = tileOutXBase;
         loopX < p.loopX & tileOutX < p.outSize.x;
         loopX++, tileOutX += tileOutW) {
      // Load input pixels.
      int tileMidX = tileOutX * downx + upx - 1 - p.pad0.x;
      int tileMidY = tileOutY * downy + upy - 1 - p.pad0.y;
      int tileInX = floor_div(tileMidX, upx);
      int tileInY = floor_div(tileMidY, upy);
      // Barrier before sx is overwritten (and, on the first pass, after the
      // filter load above).
      __syncthreads();
      for (int inIdx = threadIdx.x; inIdx < tileInH * tileInW * loopMinor;
           inIdx += blockDim.x) {
        // Decompose inIdx into (relInY, relInX, relC), relC fastest.
        int relC = inIdx;
        int relInX = relC / loopMinor;
        int relInY = relInX / tileInW;
        relC -= relInX * loopMinor;
        relInX -= relInY * tileInW;
        int c = baseC + relC;
        int inX = tileInX + relInX;
        int inY = tileInY + relInY;
        // Positions outside the input or past the last channel read as zero.
        scalar_t v = 0;
        if (inX >= 0 & inY >= 0 & inX < p.inSize.x & inY < p.inSize.y &
            c < p.inSize.z)
          v = (scalar_t)((const T *)
                             p.x)[inX * p.inStride.x + inY * p.inStride.y +
                                  c * p.inStride.z + n * p.inStride.w];
        sx[relInY][relInX][relC] = v;
      }

      // Loop over output pixels.
      // Barrier so the whole input tile is in shared memory before use.
      __syncthreads();
      for (int outIdx = threadIdx.x; outIdx < tileOutH * tileOutW * loopMinor;
           outIdx += blockDim.x) {
        // Decompose outIdx into (relOutY, relOutX, relC), relC fastest.
        int relC = outIdx;
        int relOutX = relC / loopMinor;
        int relOutY = relOutX / tileOutW;
        relC -= relOutX * loopMinor;
        relOutX -= relOutY * tileOutW;
        int c = baseC + relC;
        int outX = tileOutX + relOutX;
        int outY = tileOutY + relOutY;

        // Setup receptive field.
        int midX = tileMidX + relOutX * downx;
        int midY = tileMidY + relOutY * downy;
        int inX = floor_div(midX, upx);
        int inY = floor_div(midY, upy);
        int relInX = inX - tileInX;
        int relInY = inY - tileInY;
        int filterX = (inX + 1) * upx - midX - 1;  // flipped
        int filterY = (inY + 1) * upy - midY - 1;  // flipped

        // Inner loop.
        // Only in-range outputs are computed; each sums
        // (filterH / upy) x (filterW / upx) products from shared memory.
        if (outX < p.outSize.x & outY < p.outSize.y & c < p.outSize.z) {
          scalar_t v = 0;
#pragma unroll
          for (int y = 0; y < filterH / upy; y++)
#pragma unroll
            for (int x = 0; x < filterW / upx; x++)
              v += sx[relInY + y][relInX + x][relC] *
                   sf[filterY + y * upy][filterX + x * upx];
          v *= p.gain;
          ((T *)p.y)[outX * p.outStride.x + outY * p.outStride.y +
                     c * p.outStride.z + n * p.outStride.w] = (T)v;
        }
      }
    }
  }
}

//------------------------------------------------------------------------
// CUDA kernel selection.

// Selects the kernel and tiling configuration for the given parameters.
//
// The returned spec is {kernel, tileOutW, tileOutH, loopMinor, loopX}.
//
// Selection works by successive overrides:
//   1. Start with upfirdn2d_kernel_large: {-1, -1, 1, 4} by default, or
//      {-1, -1, 4, 1} when p.inStride.z == 1 (labelled channels_last below).
//   2. For each supported (up, down) combination, a chain of independent
//      `if` statements tests the memory layout (s != 1 vs. s == 1) and upper
//      bounds on the filter size. Every matching `if` overwrites `spec`, so
//      the LAST match wins; the order of the statements is therefore
//      significant.
//
// For upfirdn2d_kernel_small the template arguments after T are
// <upx, upy, downx, downy, filterW, filterH, tileOutW, tileOutH, loopMinor>;
// the trailing initializer values repeat tileOutW, tileOutH and loopMinor and
// set loopX to 1.
//
// If no specialized case matches, the large-kernel spec from step 1 is
// returned unchanged.
template <class T>
upfirdn2d_kernel_spec choose_upfirdn2d_kernel(
    const upfirdn2d_kernel_params &p) {
  // s is the input channel stride; fx/fy are the filter width/height.
  int s = p.inStride.z, fx = p.filterSize.x, fy = p.filterSize.y;
  upfirdn2d_kernel_spec spec = {(void *)upfirdn2d_kernel_large<T>, -1, -1, 1,
                                4};  // contiguous
  if (s == 1)
    spec = {(void *)upfirdn2d_kernel_large<T>, -1, -1, 4, 1};  // channels_last

  // No up/downsampling.
  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) {
    // contiguous
    if (s != 1 && fx <= 24 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 24, 24, 64, 32, 1>,
              64, 32, 1, 1};
    if (s != 1 && fx <= 16 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 16, 16, 64, 32, 1>,
              64, 32, 1, 1};
    if (s != 1 && fx <= 7 && fy <= 7)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 7, 7, 64, 16, 1>,
              64, 16, 1, 1};
    if (s != 1 && fx <= 6 && fy <= 6)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 6, 6, 64, 16, 1>,
              64, 16, 1, 1};
    if (s != 1 && fx <= 5 && fy <= 5)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 5, 5, 64, 16, 1>,
              64, 16, 1, 1};
    if (s != 1 && fx <= 4 && fy <= 4)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 4, 4, 64, 16, 1>,
              64, 16, 1, 1};
    if (s != 1 && fx <= 3 && fy <= 3)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 3, 3, 64, 16, 1>,
              64, 16, 1, 1};
    if (s != 1 && fx <= 24 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 24, 1, 128, 8, 1>,
              128, 8, 1, 1};
    if (s != 1 && fx <= 16 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 16, 1, 128, 8, 1>,
              128, 8, 1, 1};
    if (s != 1 && fx <= 8 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 8, 1, 128, 8, 1>,
              128, 8, 1, 1};
    if (s != 1 && fx <= 1 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 24, 32, 32, 1>,
              32, 32, 1, 1};
    if (s != 1 && fx <= 1 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 16, 32, 32, 1>,
              32, 32, 1, 1};
    if (s != 1 && fx <= 1 && fy <= 8)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 8, 32, 32, 1>,
              32, 32, 1, 1};
    // channels_last
    if (s == 1 && fx <= 24 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 24, 24, 32, 32, 1>,
              32, 32, 1, 1};
    if (s == 1 && fx <= 16 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 16, 16, 32, 32, 1>,
              32, 32, 1, 1};
    if (s == 1 && fx <= 7 && fy <= 7)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 7, 7, 16, 16, 8>,
              16, 16, 8, 1};
    if (s == 1 && fx <= 6 && fy <= 6)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 6, 6, 16, 16, 8>,
              16, 16, 8, 1};
    if (s == 1 && fx <= 5 && fy <= 5)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 5, 5, 16, 16, 8>,
              16, 16, 8, 1};
    if (s == 1 && fx <= 4 && fy <= 4)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 4, 4, 16, 16, 8>,
              16, 16, 8, 1};
    if (s == 1 && fx <= 3 && fy <= 3)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 3, 3, 16, 16, 8>,
              16, 16, 8, 1};
    if (s == 1 && fx <= 24 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 24, 1, 128, 1, 16>,
              128, 1, 16, 1};
    if (s == 1 && fx <= 16 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 16, 1, 128, 1, 16>,
              128, 1, 16, 1};
    if (s == 1 && fx <= 8 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 8, 1, 128, 1, 16>,
              128, 1, 16, 1};
    if (s == 1 && fx <= 1 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 24, 1, 128, 16>,
              1, 128, 16, 1};
    if (s == 1 && fx <= 1 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 16, 1, 128, 16>,
              1, 128, 16, 1};
    if (s == 1 && fx <= 1 && fy <= 8)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 8, 1, 128, 16>,
              1, 128, 16, 1};
  }

  // 2x upsampling.
  if (p.up.x == 2 && p.up.y == 2 && p.down.x == 1 && p.down.y == 1) {
    // contiguous
    if (s != 1 && fx <= 24 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 24, 24, 64, 32, 1>,
              64, 32, 1, 1};
    if (s != 1 && fx <= 16 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 16, 16, 64, 32, 1>,
              64, 32, 1, 1};
    if (s != 1 && fx <= 8 && fy <= 8)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 8, 8, 64, 16, 1>,
              64, 16, 1, 1};
    if (s != 1 && fx <= 6 && fy <= 6)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 6, 6, 64, 16, 1>,
              64, 16, 1, 1};
    if (s != 1 && fx <= 4 && fy <= 4)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 4, 4, 64, 16, 1>,
              64, 16, 1, 1};
    if (s != 1 && fx <= 2 && fy <= 2)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 2, 2, 64, 16, 1>,
              64, 16, 1, 1};
    // channels_last
    if (s == 1 && fx <= 24 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 24, 24, 32, 32, 1>,
              32, 32, 1, 1};
    if (s == 1 && fx <= 16 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 16, 16, 32, 32, 1>,
              32, 32, 1, 1};
    if (s == 1 && fx <= 8 && fy <= 8)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 8, 8, 16, 16, 8>,
              16, 16, 8, 1};
    if (s == 1 && fx <= 6 && fy <= 6)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 6, 6, 16, 16, 8>,
              16, 16, 8, 1};
    if (s == 1 && fx <= 4 && fy <= 4)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 4, 4, 16, 16, 8>,
              16, 16, 8, 1};
    if (s == 1 && fx <= 2 && fy <= 2)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 2, 2, 16, 16, 8>,
              16, 16, 8, 1};
  }
  // 2x upsampling along x only.
  if (p.up.x == 2 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) {
    // contiguous
    if (s != 1 && fx <= 24 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 24, 1, 128, 8, 1>,
              128, 8, 1, 1};
    if (s != 1 && fx <= 16 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 16, 1, 128, 8, 1>,
              128, 8, 1, 1};
    if (s != 1 && fx <= 8 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 8, 1, 128, 8, 1>,
              128, 8, 1, 1};
    // channels_last
    if (s == 1 && fx <= 24 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 24, 1, 128, 1, 16>,
              128, 1, 16, 1};
    if (s == 1 && fx <= 16 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 16, 1, 128, 1, 16>,
              128, 1, 16, 1};
    if (s == 1 && fx <= 8 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 8, 1, 128, 1, 16>,
              128, 1, 16, 1};
  }
  // 2x upsampling along y only.
  if (p.up.x == 1 && p.up.y == 2 && p.down.x == 1 && p.down.y == 1) {
    // contiguous
    if (s != 1 && fx <= 1 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 24, 32, 32, 1>,
              32, 32, 1, 1};
    if (s != 1 && fx <= 1 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 16, 32, 32, 1>,
              32, 32, 1, 1};
    if (s != 1 && fx <= 1 && fy <= 8)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 8, 32, 32, 1>,
              32, 32, 1, 1};
    // channels_last
    if (s == 1 && fx <= 1 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 24, 1, 128, 16>,
              1, 128, 16, 1};
    if (s == 1 && fx <= 1 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 16, 1, 128, 16>,
              1, 128, 16, 1};
    if (s == 1 && fx <= 1 && fy <= 8)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 8, 1, 128, 16>,
              1, 128, 16, 1};
  }

  // 2x downsampling.
  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 2 && p.down.y == 2) {
    // contiguous
    if (s != 1 && fx <= 24 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 24, 24, 32, 16, 1>,
              32, 16, 1, 1};
    if (s != 1 && fx <= 16 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 16, 16, 32, 16, 1>,
              32, 16, 1, 1};
    if (s != 1 && fx <= 8 && fy <= 8)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 8, 8, 32, 8, 1>, 32,
              8, 1, 1};
    if (s != 1 && fx <= 6 && fy <= 6)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 6, 6, 32, 8, 1>, 32,
              8, 1, 1};
    if (s != 1 && fx <= 4 && fy <= 4)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 4, 4, 32, 8, 1>, 32,
              8, 1, 1};
    if (s != 1 && fx <= 2 && fy <= 2)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 2, 2, 32, 8, 1>, 32,
              8, 1, 1};
    // channels_last
    if (s == 1 && fx <= 24 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 24, 24, 16, 16, 1>,
              16, 16, 1, 1};
    if (s == 1 && fx <= 16 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 16, 16, 16, 16, 1>,
              16, 16, 1, 1};
    if (s == 1 && fx <= 8 && fy <= 8)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 8, 8, 8, 8, 8>, 8,
              8, 8, 1};
    if (s == 1 && fx <= 6 && fy <= 6)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 6, 6, 8, 8, 8>, 8,
              8, 8, 1};
    if (s == 1 && fx <= 4 && fy <= 4)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 4, 4, 8, 8, 8>, 8,
              8, 8, 1};
    if (s == 1 && fx <= 2 && fy <= 2)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 2, 2, 8, 8, 8>, 8,
              8, 8, 1};
  }
  // 2x downsampling along x only.
  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 2 && p.down.y == 1) {
    // contiguous
    if (s != 1 && fx <= 24 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 24, 1, 64, 8, 1>,
              64, 8, 1, 1};
    if (s != 1 && fx <= 16 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 16, 1, 64, 8, 1>,
              64, 8, 1, 1};
    if (s != 1 && fx <= 8 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 8, 1, 64, 8, 1>, 64,
              8, 1, 1};
    // channels_last
    if (s == 1 && fx <= 24 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 24, 1, 64, 1, 8>,
              64, 1, 8, 1};
    if (s == 1 && fx <= 16 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 16, 1, 64, 1, 8>,
              64, 1, 8, 1};
    if (s == 1 && fx <= 8 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 8, 1, 64, 1, 8>, 64,
              1, 8, 1};
  }
  // 2x downsampling along y only.
  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 2) {
    // contiguous
    if (s != 1 && fx <= 1 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 24, 32, 16, 1>,
              32, 16, 1, 1};
    if (s != 1 && fx <= 1 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 16, 32, 16, 1>,
              32, 16, 1, 1};
    if (s != 1 && fx <= 1 && fy <= 8)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 8, 32, 16, 1>,
              32, 16, 1, 1};
    // channels_last
    if (s == 1 && fx <= 1 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 24, 1, 64, 8>, 1,
              64, 8, 1};
    if (s == 1 && fx <= 1 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 16, 1, 64, 8>, 1,
              64, 8, 1};
    if (s == 1 && fx <= 1 && fy <= 8)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 8, 1, 64, 8>, 1,
              64, 8, 1};
  }

  // 4x upsampling.
  if (p.up.x == 4 && p.up.y == 4 && p.down.x == 1 && p.down.y == 1) {
    // contiguous
    if (s != 1 && fx <= 48 && fy <= 48)
      spec = {(void *)upfirdn2d_kernel_small<T, 4, 4, 1, 1, 48, 48, 64, 32, 1>,
              64, 32, 1, 1};
    if (s != 1 && fx <= 32 && fy <= 32)
      spec = {(void *)upfirdn2d_kernel_small<T, 4, 4, 1, 1, 32, 32, 64, 32, 1>,
              64, 32, 1, 1};
    // channels_last
    if (s == 1 && fx <= 48 && fy <= 48)
      spec = {(void *)upfirdn2d_kernel_small<T, 4, 4, 1, 1, 48, 48, 32, 32, 1>,
              32, 32, 1, 1};
    if (s == 1 && fx <= 32 && fy <= 32)
      spec = {(void *)upfirdn2d_kernel_small<T, 4, 4, 1, 1, 32, 32, 32, 32, 1>,
              32, 32, 1, 1};
  }
  // 4x upsampling along x only.
  if (p.up.x == 4 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) {
    // contiguous
    if (s != 1 && fx <= 48 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 4, 1, 1, 1, 48, 1, 128, 8, 1>,
              128, 8, 1, 1};
    if (s != 1 && fx <= 32 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 4, 1, 1, 1, 32, 1, 128, 8, 1>,
              128, 8, 1, 1};
    // channels_last
    if (s == 1 && fx <= 48 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 4, 1, 1, 1, 48, 1, 128, 1, 16>,
              128, 1, 16, 1};
    if (s == 1 && fx <= 32 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 4, 1, 1, 1, 32, 1, 128, 1, 16>,
              128, 1, 16, 1};
  }
  // 4x upsampling along y only.
  if (p.up.x == 1 && p.up.y == 4 && p.down.x == 1 && p.down.y == 1) {
    // contiguous
    if (s != 1 && fx <= 1 && fy <= 48)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 4, 1, 1, 1, 48, 32, 32, 1>,
              32, 32, 1, 1};
    if (s != 1 && fx <= 1 && fy <= 32)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 4, 1, 1, 1, 32, 32, 32, 1>,
              32, 32, 1, 1};
    // channels_last
    if (s == 1 && fx <= 1 && fy <= 48)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 4, 1, 1, 1, 48, 1, 128, 16>,
              1, 128, 16, 1};
    if (s == 1 && fx <= 1 && fy <= 32)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 4, 1, 1, 1, 32, 1, 128, 16>,
              1, 128, 16, 1};
  }

  // 4x downsampling (inefficient).
  // Only the single-axis cases are specialized here; 4x downsampling along
  // both axes keeps the large-kernel default.
  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 4 && p.down.y == 1) {
    // contiguous
    if (s != 1 && fx <= 48 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 4, 1, 48, 1, 32, 8, 1>,
              32, 8, 1, 1};
    if (s != 1 && fx <= 32 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 4, 1, 32, 1, 32, 8, 1>,
              32, 8, 1, 1};
    // channels_last
    if (s == 1 && fx <= 48 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 4, 1, 48, 1, 32, 1, 8>,
              32, 1, 8, 1};
    if (s == 1 && fx <= 32 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 4, 1, 32, 1, 32, 1, 8>,
              32, 1, 8, 1};
  }
  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 4) {
    // contiguous
    if (s != 1 && fx <= 1 && fy <= 48)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 4, 1, 48, 32, 8, 1>,
              32, 8, 1, 1};
    if (s != 1 && fx <= 1 && fy <= 32)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 4, 1, 32, 32, 8, 1>,
              32, 8, 1, 1};
    // channels_last
    if (s == 1 && fx <= 1 && fy <= 48)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 4, 1, 48, 1, 32, 8>, 1,
              32, 8, 1};
    if (s == 1 && fx <= 1 && fy <= 32)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 4, 1, 32, 1, 32, 8>, 1,
              32, 8, 1};
  }
  return spec;
}

//------------------------------------------------------------------------
// Explicit template instantiations.

template upfirdn2d_kernel_spec choose_upfirdn2d_kernel<double>(
    const upfirdn2d_kernel_params &p);
template upfirdn2d_kernel_spec choose_upfirdn2d_kernel<float>(
    const upfirdn2d_kernel_params &p);
template upfirdn2d_kernel_spec choose_upfirdn2d_kernel<c10::Half>(
    const upfirdn2d_kernel_params &p);

//------------------------------------------------------------------------

//------------------------------------------------------------------------

// Host-side entry point of the upfirdn2d CUDA op.
//
// Validates the arguments, allocates the output tensor, fills in an
// upfirdn2d_kernel_params struct, selects a kernel through
// choose_upfirdn2d_kernel<scalar_t>() and launches it on the current CUDA
// stream.
//
// Args:
//   x: rank-4, non-empty CUDA tensor (floating point or half, per the
//     dispatch macro below).
//   f: rank-2, non-empty float32 filter on the same device as x.
//   upx, upy: upsampling factors, each >= 1.
//   downx, downy: downsampling factors, each >= 1.
//   padx0, padx1, pady0, pady1: padding amounts used in the output-size
//     formula; only padx0/pady0 are forwarded to the kernel (p.pad0).
//   flip: forwarded to the kernel as p.flip (1 or 0).
//   gain: forwarded to the kernel as p.gain.
//
// Returns:
//   A newly allocated tensor of shape {x.size(0), x.size(1), outH, outW}
//   created with x's options and suggested memory format.
//
// Raises (via TORCH_CHECK): if any of the argument or size checks below fail.
torch::Tensor upfirdn2d_op(torch::Tensor x, torch::Tensor f, int upx, int upy,
                           int downx, int downy, int padx0, int padx1,
                           int pady0, int pady1, bool flip, float gain) {
  // Validate arguments.
  TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device");
  TORCH_CHECK(f.device() == x.device(),
              "f must reside on the same device as x");
  TORCH_CHECK(f.dtype() == torch::kFloat, "f must be float32");
  TORCH_CHECK(x.numel() <= INT_MAX, "x is too large");
  TORCH_CHECK(f.numel() <= INT_MAX, "f is too large");
  TORCH_CHECK(x.numel() > 0, "x has zero size");
  TORCH_CHECK(f.numel() > 0, "f has zero size");
  TORCH_CHECK(x.dim() == 4, "x must be rank 4");
  TORCH_CHECK(f.dim() == 2, "f must be rank 2");
  // Largest element offset reachable in x must fit in an int, because sizes
  // and strides are narrowed to int when packed into the kernel params below.
  TORCH_CHECK((x.size(0) - 1) * x.stride(0) + (x.size(1) - 1) * x.stride(1) +
                      (x.size(2) - 1) * x.stride(2) +
                      (x.size(3) - 1) * x.stride(3) <=
                  INT_MAX,
              "x memory footprint is too large");
  TORCH_CHECK(f.size(0) >= 1 && f.size(1) >= 1, "f must be at least 1x1");
  TORCH_CHECK(upx >= 1 && upy >= 1, "upsampling factor must be at least 1");
  TORCH_CHECK(downx >= 1 && downy >= 1,
              "downsampling factor must be at least 1");

  // Create output tensor.
  const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
  // Output extent per axis: (in * up + pad0 + pad1 - filter + down) / down,
  // using integer division.
  int outW =
      ((int)x.size(3) * upx + padx0 + padx1 - (int)f.size(1) + downx) / downx;
  int outH =
      ((int)x.size(2) * upy + pady0 + pady1 - (int)f.size(0) + downy) / downy;
  TORCH_CHECK(outW >= 1 && outH >= 1, "output must be at least 1x1");
  torch::Tensor y = torch::empty({x.size(0), x.size(1), outH, outW},
                                 x.options(), x.suggest_memory_format());
  TORCH_CHECK(y.numel() <= INT_MAX, "output is too large");
  // Same int-range footprint check as for x, applied to the output.
  TORCH_CHECK((y.size(0) - 1) * y.stride(0) + (y.size(1) - 1) * y.stride(1) +
                      (y.size(2) - 1) * y.stride(2) +
                      (y.size(3) - 1) * y.stride(3) <=
                  INT_MAX,
              "output memory footprint is too large");

  // Initialize CUDA kernel parameters.
  // Sizes and strides are packed in reversed dimension order:
  // .x <- dim 3, .y <- dim 2, .z <- dim 1, .w <- dim 0.
  upfirdn2d_kernel_params p;
  p.x = x.data_ptr();
  p.f = f.data_ptr<float>();
  p.y = y.data_ptr();
  p.up = make_int2(upx, upy);
  p.down = make_int2(downx, downy);
  p.pad0 = make_int2(padx0, pady0);
  p.flip = (flip) ? 1 : 0;
  p.gain = gain;
  p.inSize =
      make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));
  p.inStride = make_int4((int)x.stride(3), (int)x.stride(2), (int)x.stride(1),
                         (int)x.stride(0));
  p.filterSize = make_int2((int)f.size(1), (int)f.size(0));
  p.filterStride = make_int2((int)f.stride(1), (int)f.stride(0));
  p.outSize =
      make_int4((int)y.size(3), (int)y.size(2), (int)y.size(1), (int)y.size(0));
  p.outStride = make_int4((int)y.stride(3), (int)y.stride(2), (int)y.stride(1),
                          (int)y.stride(0));
  // When dim 1 of x has stride 1, dim 0 is the "major" extent and dim 1 the
  // "minor" extent; otherwise dims 0 and 1 are folded into the major extent
  // and the minor extent is 1.
  p.sizeMajor = (p.inStride.z == 1) ? p.inSize.w : p.inSize.w * p.inSize.z;
  p.sizeMinor = (p.inStride.z == 1) ? p.inSize.z : 1;

  // Choose CUDA kernel.
  upfirdn2d_kernel_spec spec;
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "upfirdn2d_cuda", [&] {
    spec = choose_upfirdn2d_kernel<scalar_t>(p);
  });

  // Set looping options.
  // loopMajor = ceil(sizeMajor / 16384); the launch counts are the ceil
  // divisions of each extent by its loop count.
  p.loopMajor = (p.sizeMajor - 1) / 16384 + 1;
  p.loopMinor = spec.loopMinor;
  p.loopX = spec.loopX;
  p.launchMinor = (p.sizeMinor - 1) / p.loopMinor + 1;
  p.launchMajor = (p.sizeMajor - 1) / p.loopMajor + 1;

  // Compute grid size.
  dim3 blockSize, gridSize;
  if (spec.tileOutW < 0)  // large
  {
    // A negative tileOutW selects this branch: fixed 4x32 thread block, with
    // the grid sized from the output extents and the block shape.
    blockSize = dim3(4, 32, 1);
    gridSize =
        dim3(((p.outSize.y - 1) / blockSize.x + 1) * p.launchMinor,
             (p.outSize.x - 1) / (blockSize.y * p.loopX) + 1, p.launchMajor);
  } else  // small
  {
    // 256-thread block; the grid is sized from the spec's output tile
    // dimensions instead of the block shape.
    blockSize = dim3(256, 1, 1);
    gridSize =
        dim3(((p.outSize.y - 1) / spec.tileOutH + 1) * p.launchMinor,
             (p.outSize.x - 1) / (spec.tileOutW * p.loopX) + 1, p.launchMajor);
  }

  // Launch CUDA kernel.
  // The kernel is launched through its type-erased pointer (spec.kernel) with
  // the params struct as its single argument and no dynamic shared memory.
  void *args[] = {&p};
#ifdef MMCV_WITH_HIP
  AT_CUDA_CHECK(hipLaunchKernel(spec.kernel, gridSize, blockSize, args, 0,
                                at::cuda::getCurrentCUDAStream()));
#else
  AT_CUDA_CHECK(cudaLaunchKernel(spec.kernel, gridSize, blockSize, args, 0,
                                 at::cuda::getCurrentCUDAStream()));
#endif

  return y;
}


================================================
FILE: mmcv/ops/csrc/pytorch/cuda/voxelization_cuda.cu
================================================
// Copyright (c) OpenMMLab. All rights reserved.
#include <stdio.h>
#include <stdlib.h>

#include "pytorch_cuda_helper.hpp"
#include "voxelization_cuda_kernel.cuh"

// Launches the deterministic hard-voxelization pipeline on CUDA.
//
// Steps (each a separate kernel on the current stream):
//   1. dynamic_voxelize_kernel: compute a voxel coordinate per point.
//   2. point_to_voxelidx_kernel: per point, find its slot inside its voxel and
//      the index of the first point sharing the same coordinate.
//   3. determin_voxel_num: single-thread kernel that assigns voxel indices and
//      counts voxels / points per voxel.
//   4. assign_point_to_voxel: copy point features into `voxels`.
//   5. assign_voxel_coors: copy voxel coordinates into `coors`.
//
// Args:
//   points: (num_points, num_features) input tensor.
//   voxels, coors, num_points_per_voxel: output buffers written by the
//     kernels. They are accessed through .contiguous(), so callers are
//     expected to pass contiguous tensors (a non-contiguous output would be
//     written into a temporary copy).
//   voxel_size: voxel extent along x, y, z (first three entries are read).
//   coors_range: {x_min, y_min, z_min, x_max, y_max, z_max}.
//   max_points, max_voxels: limits forwarded to the kernels.
//   NDim: number of coordinate dimensions per point (default 3).
//
// Returns:
//   The voxel count produced by step 3, read back from the device. Returns 0
//   without launching anything when `points` is empty.
int HardVoxelizeForwardCUDAKernelLauncher(
    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const int max_points,
    const int max_voxels, const int NDim = 3) {
  at::cuda::CUDAGuard device_guard(points.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  const int num_points = points.size(0);
  const int num_features = points.size(1);

  // An empty input would yield a grid dimension of 0, which is an invalid
  // launch configuration. Nothing to voxelize, so report zero voxels, matching
  // NondeterministicHardVoxelizeForwardCUDAKernelLauncher.
  if (num_points == 0) return 0;

  const float voxel_x = voxel_size[0];
  const float voxel_y = voxel_size[1];
  const float voxel_z = voxel_size[2];
  const float coors_x_min = coors_range[0];
  const float coors_y_min = coors_range[1];
  const float coors_z_min = coors_range[2];
  const float coors_x_max = coors_range[3];
  const float coors_y_max = coors_range[4];
  const float coors_z_max = coors_range[5];

  // Number of voxels along each axis.
  const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);

  // map points to voxel coors
  at::Tensor temp_coors =
      at::zeros({num_points, NDim}, points.options().dtype(at::kInt));

  dim3 grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
  dim3 block(512);

  // 1. link point to corresponding voxel coors
  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "hard_voxelize_kernel", ([&] {
        dynamic_voxelize_kernel<scalar_t, int><<<grid, block, 0, stream>>>(
            points.contiguous().data_ptr<scalar_t>(),
            temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,
            coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,
            coors_z_max, grid_x, grid_y, grid_z, num_points, num_features,
            NDim);
      }));

  AT_CUDA_CHECK(cudaGetLastError());

  // 2. map point to the idx of the corresponding voxel, find duplicate coor
  // create some temporary variables (all initialised to -1)
  auto point_to_pointidx = -at::ones(
      {
          num_points,
      },
      points.options().dtype(at::kInt));
  auto point_to_voxelidx = -at::ones(
      {
          num_points,
      },
      points.options().dtype(at::kInt));

  dim3 map_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
  dim3 map_block(512);

  AT_DISPATCH_ALL_TYPES(
      temp_coors.scalar_type(), "determin_duplicate", ([&] {
        point_to_voxelidx_kernel<int><<<map_grid, map_block, 0, stream>>>(
            temp_coors.contiguous().data_ptr<int>(),
            point_to_voxelidx.contiguous().data_ptr<int>(),
            point_to_pointidx.contiguous().data_ptr<int>(), max_points,
            max_voxels, num_points, NDim);
      }));

  AT_CUDA_CHECK(cudaGetLastError());

  // 3. determine voxel num and voxel's coor index
  // make the logic in the CUDA device could accelerate about 10 times
  auto coor_to_voxelidx = -at::ones(
      {
          num_points,
      },
      points.options().dtype(at::kInt));
  auto voxel_num = at::zeros(
      {
          1,
      },
      points.options().dtype(at::kInt));  // must be zero from the beginning

  // Launched with a single thread (<<<1, 1>>>).
  AT_DISPATCH_ALL_TYPES(temp_coors.scalar_type(), "determin_duplicate", ([&] {
                          determin_voxel_num<int><<<1, 1, 0, stream>>>(
                              num_points_per_voxel.contiguous().data_ptr<int>(),
                              point_to_voxelidx.contiguous().data_ptr<int>(),
                              point_to_pointidx.contiguous().data_ptr<int>(),
                              coor_to_voxelidx.contiguous().data_ptr<int>(),
                              voxel_num.contiguous().data_ptr<int>(),
                              max_points, max_voxels, num_points);
                        }));

  AT_CUDA_CHECK(cudaGetLastError());

  // 4. copy point features to voxels
  // Step 4 & 5 could be parallel
  // Use the dispatched scalar_t (not a hard-coded float) so that the element
  // type matches `points`, as in the nondeterministic launcher.
  auto pts_output_size = num_points * num_features;
  dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(pts_output_size, 512), 4096));
  dim3 cp_block(512);
  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "assign_point_to_voxel", ([&] {
        assign_point_to_voxel<scalar_t, int>
            <<<cp_grid, cp_block, 0, stream>>>(
                pts_output_size, points.contiguous().data_ptr<scalar_t>(),
                point_to_voxelidx.contiguous().data_ptr<int>(),
                coor_to_voxelidx.contiguous().data_ptr<int>(),
                voxels.contiguous().data_ptr<scalar_t>(), max_points,
                num_features, num_points, NDim);
      }));

  AT_CUDA_CHECK(cudaGetLastError());

  // 5. copy coors of each voxels
  auto coors_output_size = num_points * NDim;
  dim3 coors_cp_grid(
      std::min(at::cuda::ATenCeilDiv(coors_output_size, 512), 4096));
  dim3 coors_cp_block(512);
  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "assign_point_to_voxel", ([&] {
        assign_voxel_coors<float, int>
            <<<coors_cp_grid, coors_cp_block, 0, stream>>>(
                coors_output_size, temp_coors.contiguous().data_ptr<int>(),
                point_to_voxelidx.contiguous().data_ptr<int>(),
                coor_to_voxelidx.contiguous().data_ptr<int>(),
                coors.contiguous().data_ptr<int>(), num_points, NDim);
      }));

  AT_CUDA_CHECK(cudaGetLastError());

  // Copy the voxel counter back to the host.
  auto voxel_num_cpu = voxel_num.to(at::kCPU);
  int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];

  return voxel_num_int;
}

// Launches the nondeterministic hard-voxelization pipeline on CUDA.
//
// Steps:
//   1. dynamic_voxelize_kernel computes a voxel coordinate per point.
//   2. Rows with any negative coordinate are replaced by all -1, then
//      at::unique_dim deduplicates the coordinates and yields, per point, the
//      index of its unique coordinate (coors_map).
//   3. nondeterministic_get_assign_pos fills pts_id, reduce_count and
//      coors_order using coors_count as a shared counter.
//   4. nondeterministic_assign_point_voxel writes `voxels`, `coors` and
//      `num_points_per_voxel`.
//
// Args:
//   points: (num_points, num_features) input tensor.
//   voxels, coors, num_points_per_voxel: output buffers, accessed through
//     .contiguous(); callers are expected to pass contiguous tensors.
//   voxel_size: voxel extent along x, y, z (first three entries are read).
//   coors_range: {x_min, y_min, z_min, x_max, y_max, z_max}.
//   max_points, max_voxels: limits forwarded to the assignment kernel.
//   NDim: number of coordinate dimensions per point (default 3).
//
// Returns:
//   min(max_voxels, number of unique valid voxel coordinates); 0 when `points`
//   is empty.
int NondeterministicHardVoxelizeForwardCUDAKernelLauncher(
    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const int max_points,
    const int max_voxels, const int NDim = 3) {
  at::cuda::CUDAGuard device_guard(points.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  const int num_points = points.size(0);
  const int num_features = points.size(1);

  // Nothing to do (and a zero-sized grid would be an invalid launch).
  if (num_points == 0) return 0;

  dim3 blocks(
      std::min(at::cuda::ATenCeilDiv(num_points, THREADS_PER_BLOCK), 4096));
  dim3 threads(THREADS_PER_BLOCK);

  const float voxel_x = voxel_size[0];
  const float voxel_y = voxel_size[1];
  const float voxel_z = voxel_size[2];
  const float coors_x_min = coors_range[0];
  const float coors_y_min = coors_range[1];
  const float coors_z_min = coors_range[2];
  const float coors_x_max = coors_range[3];
  const float coors_y_max = coors_range[4];
  const float coors_z_max = coors_range[5];

  // Number of voxels along each axis.
  const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);

  // map points to voxel coors
  at::Tensor temp_coors =
      at::zeros({num_points, NDim}, points.options().dtype(at::kInt));

  // 1. link point to corresponding voxel coors
  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "hard_voxelize_kernel", ([&] {
        dynamic_voxelize_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(
            points.contiguous().data_ptr<scalar_t>(),
            temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,
            coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,
            coors_z_max, grid_x, grid_y, grid_z, num_points, num_features,
            NDim);
      }));
  // Check the launch right away so that a failure is reported here rather
  // than by a later, unrelated ATen call.
  AT_CUDA_CHECK(cudaGetLastError());

  at::Tensor coors_map;
  at::Tensor reduce_count;

  // Any row containing a negative coordinate becomes entirely -1, so all such
  // rows collapse into one unique entry.
  auto coors_clean = temp_coors.masked_fill(temp_coors.lt(0).any(-1, true), -1);

  // unique_dim(self, dim=0, sorted=true, return_inverse=true,
  //            return_counts=false)
  std::tie(temp_coors, coors_map, reduce_count) =
      at::unique_dim(coors_clean, 0, true, true, false);

  if (temp_coors[0][0].lt(0).item<bool>()) {
    // the first element of temp_coors is (-1,-1,-1) and should be removed
    temp_coors = temp_coors.slice(0, 1);
    // Shift the inverse indices accordingly; points of the removed entry end
    // up with index -1.
    coors_map = coors_map - 1;
  }

  int num_coors = temp_coors.size(0);
  temp_coors = temp_coors.to(at::kInt);
  coors_map = coors_map.to(at::kInt);

  at::Tensor coors_count = at::zeros({1}, coors_map.options());
  at::Tensor coors_order = at::empty({num_coors}, coors_map.options());
  at::Tensor pts_id = at::zeros({num_points}, coors_map.options());
  reduce_count = at::zeros({num_coors}, coors_map.options());

  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "get_assign_pos", ([&] {
        nondeterministic_get_assign_pos<<<blocks, threads, 0, stream>>>(
            num_points, coors_map.contiguous().data_ptr<int32_t>(),
            pts_id.contiguous().data_ptr<int32_t>(),
            coors_count.contiguous().data_ptr<int32_t>(),
            reduce_count.contiguous().data_ptr<int32_t>(),
            coors_order.contiguous().data_ptr<int32_t>());
      }));
  AT_CUDA_CHECK(cudaGetLastError());

  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "assign_point_to_voxel", ([&] {
        nondeterministic_assign_point_voxel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                num_points, points.contiguous().data_ptr<scalar_t>(),
                coors_map.contiguous().data_ptr<int32_t>(),
                pts_id.contiguous().data_ptr<int32_t>(),
                temp_coors.contiguous().data_ptr<int32_t>(),
                reduce_count.contiguous().data_ptr<int32_t>(),
                coors_order.contiguous().data_ptr<int32_t>(),
                voxels.contiguous().data_ptr<scalar_t>(),
                coors.contiguous().data_ptr<int32_t>(),
                num_points_per_voxel.contiguous().data_ptr<int32_t>(),
                max_voxels, max_points, num_features, NDim);
      }));
  AT_CUDA_CHECK(cudaGetLastError());
  return max_voxels < num_coors ? max_voxels : num_coors;
}

// Launches dynamic voxelization on CUDA: computes one voxel coordinate per
// input point and writes it to `coors`.
//
// Args:
//   points: (num_points, num_features) input tensor.
//   coors: output buffer for the per-point voxel coordinates, accessed through
//     .contiguous(); callers are expected to pass a contiguous int tensor.
//   voxel_size: voxel extent along x, y, z (first three entries are read).
//   coors_range: {x_min, y_min, z_min, x_max, y_max, z_max}.
//   NDim: number of coordinate dimensions per point (default 3).
//
// Does nothing when `points` is empty.
void DynamicVoxelizeForwardCUDAKernelLauncher(
    const at::Tensor &points, at::Tensor &coors,
    const std::vector<float> voxel_size, const std::vector<float> coors_range,
    const int NDim = 3) {
  at::cuda::CUDAGuard device_guard(points.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  const int num_points = points.size(0);
  const int num_features = points.size(1);

  // With no points the grid would have 0 blocks, which is an invalid launch
  // configuration; there is nothing to compute, so return early.
  if (num_points == 0) return;

  const float voxel_x = voxel_size[0];
  const float voxel_y = voxel_size[1];
  const float voxel_z = voxel_size[2];
  const float coors_x_min = coors_range[0];
  const float coors_y_min = coors_range[1];
  const float coors_z_min = coors_range[2];
  const float coors_x_max = coors_range[3];
  const float coors_y_max = coors_range[4];
  const float coors_z_max = coors_range[5];

  // Number of voxels along each axis.
  const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);

  // One thread per point, rounded up to whole blocks.
  const int col_blocks = at::cuda::ATenCeilDiv(num_points, THREADS_PER_BLOCK);
  dim3 blocks(col_blocks);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_ALL_TYPES(points.scalar_type(), "dynamic_voxelize_kernel", [&] {
    dynamic_voxelize_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(
        points.contiguous().data_ptr<scalar_t>(),
        coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,
        coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,
        coors_z_max, grid_x, grid_y, grid_z, num_points, num_features, NDim);
  });

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/deform_conv.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatching wrapper for the deformable im2col step.
//
// Forwards every argument unchanged to DISPATCH_DEVICE_IMPL under the key
// `deformable_im2col_impl`; presumably this selects the implementation
// registered for the tensors' device (see pytorch_device_registry.hpp).
//
// In this file it is called from deform_conv_forward with one im2col_step
// sized slice of the input and offset, and `columns` as the destination
// (data_col).
void deformable_im2col_impl(Tensor data_im, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor data_col) {
  DISPATCH_DEVICE_IMPL(deformable_im2col_impl, data_im, data_offset, channels,
                       height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,
                       stride_w, dilation_h, dilation_w, parallel_imgs,
                       deformable_group, data_col);
}

// Device-dispatching wrapper for the deformable col2im step.
//
// Forwards every argument unchanged to DISPATCH_DEVICE_IMPL under the key
// `deformable_col2im_impl`; presumably this selects the implementation
// registered for the tensors' device (see pytorch_device_registry.hpp).
//
// In this file it is called from deform_conv_backward_input with the column
// buffer as data_col and one slice of gradInput as the destination (grad_im).
void deformable_col2im_impl(Tensor data_col, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor grad_im) {
  DISPATCH_DEVICE_IMPL(deformable_col2im_impl, data_col, data_offset, channels,
                       height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,
                       stride_w, dilation_h, dilation_w, parallel_imgs,
                       deformable_group, grad_im);
}

// Device-dispatching wrapper for the deformable col2im step with respect to
// the offsets.
//
// Forwards every argument unchanged to DISPATCH_DEVICE_IMPL under the key
// `deformable_col2im_coord_impl`; presumably this selects the implementation
// registered for the tensors' device (see pytorch_device_registry.hpp).
//
// In this file it is called from deform_conv_backward_input with the column
// buffer as data_col, one slice of the input and offset, and one slice of
// gradOffset as the destination (grad_offset).
void deformable_col2im_coord_impl(
    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
    const int height, const int width, const int ksize_h, const int ksize_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int parallel_imgs,
    const int deformable_group, Tensor grad_offset) {
  DISPATCH_DEVICE_IMPL(deformable_col2im_coord_impl, data_col, data_im,
                       data_offset, channels, height, width, ksize_h, ksize_w,
                       pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
                       parallel_imgs, deformable_group, grad_offset);
}

// Validates the shapes and hyper-parameters of a deformable convolution call.
//
// Args:
//   input: 3D (C, H, W) or 4D (N, C, H, W) input tensor.
//   offset: offset tensor with the same rank as `input`; its channel dim must
//     equal deformable_group * 2 * kH * kW and its spatial dims must equal the
//     computed output size.
//   gradOutput: optional (may be NULL); when given, its channel and spatial
//     dims are checked against the computed output shape.
//   weight: contiguous 4D tensor (nOutputPlane, nInputPlane / group, kH, kW).
//   kH, kW: kernel size; dH, dW: stride; padH, padW: padding;
//   dilationH, dilationW: dilation.
//   group, deformable_group: convolution / deformable group counts.
//
// Raises (via TORCH_CHECK) when any constraint is violated. TORCH_CHECK
// concatenates its message arguments (it does not interpret printf-style
// placeholders), so values are passed as separate arguments.
void deform_conv_shape_check(at::Tensor input, at::Tensor offset,
                             at::Tensor *gradOutput, at::Tensor weight, int kH,
                             int kW, int dH, int dW, int padH, int padW,
                             int dilationH, int dilationW, int group,
                             int deformable_group) {
  TORCH_CHECK(
      weight.ndimension() == 4,
      "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, but got: ",
      weight.ndimension(), "D");

  TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");

  TORCH_CHECK(kW > 0 && kH > 0,
              "kernel size should be greater than zero, but got kH: ", kH,
              " kW: ", kW);

  TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW),
              "kernel size should be consistent with weight, but got kH: ", kH,
              " kW: ", kW, " weight.size(2): ", weight.size(2),
              ", weight.size(3): ", weight.size(3));

  TORCH_CHECK(dW > 0 && dH > 0,
              "stride should be greater than zero, but got dH: ", dH,
              " dW: ", dW);

  TORCH_CHECK(dilationW > 0 && dilationH > 0,
              "dilation should be greater than 0, but got dilationH: ",
              dilationH, " dilationW: ", dilationW);

  // Guards the multiplication / modulo below against zero or negative groups.
  TORCH_CHECK(group > 0 && deformable_group > 0,
              "group and deformable_group should be greater than zero, but "
              "got group: ",
              group, " deformable_group: ", deformable_group);

  // Check the rank before indexing any dimension of the input.
  int ndim = input.ndimension();
  TORCH_CHECK(ndim == 3 || ndim == 4,
              "3D or 4D input tensor expected but got: ", ndim, "D");

  // Channel / height / width dims for an unbatched (3D) input; shifted by one
  // when a leading batch dim is present.
  int dimf = 0;
  int dimh = 1;
  int dimw = 2;

  if (ndim == 4) {
    dimf++;
    dimh++;
    dimw++;
  }

  long nInputPlane = weight.size(1) * group;
  long inputHeight = input.size(dimh);
  long inputWidth = input.size(dimw);
  long nOutputPlane = weight.size(0);
  long outputHeight =
      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
  long outputWidth =
      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;

  TORCH_CHECK(nInputPlane % deformable_group == 0,
              "input channels must be divisible by deformable group size");

  TORCH_CHECK(outputWidth >= 1 && outputHeight >= 1, "Given input size: (",
              nInputPlane, " x ", inputHeight, " x ", inputWidth,
              "). Calculated output size: (", nOutputPlane, " x ",
              outputHeight, " x ", outputWidth,
              "). Output size is too small");

  // Use dimf/dimh/dimw (not fixed indices) so that unbatched 3D tensors are
  // checked against the right dimensions as well.
  TORCH_CHECK(input.size(dimf) == nInputPlane,
              "invalid number of input planes, expected: ", nInputPlane,
              ", but got: ", input.size(dimf));

  TORCH_CHECK((inputHeight >= kH && inputWidth >= kW),
              "input image is smaller than kernel");

  TORCH_CHECK(offset.ndimension() == ndim,
              "offset must have the same number of dimensions as input, "
              "expected: ",
              ndim, ", but got: ", offset.ndimension());

  TORCH_CHECK(
      (offset.size(dimh) == outputHeight && offset.size(dimw) == outputWidth),
      "invalid spatial size of offset, expected height: ", outputHeight,
      " width: ", outputWidth, ", but got height: ", offset.size(dimh),
      " width: ", offset.size(dimw));

  TORCH_CHECK((offset.size(dimf) == deformable_group * 2 * kH * kW),
              "invalid number of channels of offset");

  if (gradOutput != NULL) {
    TORCH_CHECK(gradOutput->size(dimf) == nOutputPlane,
                "invalid number of gradOutput planes, expected: ",
                nOutputPlane, ", but got: ", gradOutput->size(dimf));

    TORCH_CHECK((gradOutput->size(dimh) == outputHeight &&
                 gradOutput->size(dimw) == outputWidth),
                "invalid size of gradOutput, expected height: ", outputHeight,
                " width: ", outputWidth,
                " , but got height: ", gradOutput->size(dimh),
                " width: ", gradOutput->size(dimw));
  }
}

// Forward pass of deformable convolution.
//
// The batch is processed in chunks of `im2col_step` images: for each chunk the
// deformable im2col fills `columns`, then a per-group matrix multiply with the
// weight accumulates into an intermediate buffer, which is finally copied into
// `output`.
//
// Args:
//   input: 3D (C, H, W) or 4D (N, C, H, W) input.
//   weight: 4D contiguous weight (nOutputPlane, nInputPlane / group, kH, kW).
//   offset: sampling offsets, same rank as `input`.
//   output: destination buffer; the result is written into it in place.
//   columns, ones: scratch tensors; both are re-allocated locally as needed,
//     the caller's tensors are not resized.
//   kW, kH, dW, dH, padW, padH, dilationW, dilationH: kernel size, stride,
//     padding and dilation. Note the W-before-H order here, while
//     deform_conv_shape_check takes H before W.
//   group, deformable_group: convolution / deformable group counts.
//   im2col_step: number of images handled per im2col call; must be positive
//     and divide the batch size.
//
// Raises (via TORCH_CHECK / the shape check) on invalid devices, shapes or an
// im2col_step that does not divide the batch size.
void deform_conv_forward(Tensor input, Tensor weight, Tensor offset,
                         Tensor output, Tensor columns, Tensor ones, int kW,
                         int kH, int dW, int dH, int padW, int padH,
                         int dilationW, int dilationH, int group,
                         int deformable_group, int im2col_step) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(offset);
    CHECK_CUDA_INPUT(weight);
    CHECK_CUDA_INPUT(output);
    CHECK_CUDA_INPUT(columns);
    CHECK_CUDA_INPUT(ones);
#else
    AT_ERROR("DeformConv is not compiled with GPU support");
#endif
  }
#ifndef MMCV_WITH_MUSA
  else {
    CHECK_CPU_INPUT(input);
    CHECK_CPU_INPUT(offset);
    CHECK_CPU_INPUT(weight);
    CHECK_CPU_INPUT(output);
    CHECK_CPU_INPUT(columns);
    CHECK_CPU_INPUT(ones);
  }
#endif
  deform_conv_shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH,
                          padW, dilationH, dilationW, group, deformable_group);
  at::DeviceGuard guard(input.device());

  int batch = 1;
  if (input.ndimension() == 3) {
    // Force batch. Use the out-of-place unsqueeze so that the caller's
    // tensors keep their shape (the backward functions use views likewise).
    batch = 0;
    input = input.unsqueeze(0);
    offset = offset.unsqueeze(0);
  }

  long batchSize = input.size(0);
  long nInputPlane = input.size(1);
  long inputHeight = input.size(2);
  long inputWidth = input.size(3);

  long nOutputPlane = weight.size(0);

  long outputWidth =
      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
  long outputHeight =
      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;

  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");

  // Every view below splits the batch into batchSize / im2col_step chunks, so
  // the step must be positive and divide the batch size exactly.
  TORCH_CHECK(im2col_step > 0 && batchSize % im2col_step == 0,
              "batch size (", batchSize,
              ") must be divisible by im2col_step (", im2col_step, ")");

  output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane,
                        outputHeight, outputWidth});
  columns = at::zeros(
      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
      input.options());

  if (ones.ndimension() != 2 ||
      ones.size(0) * ones.size(1) < outputHeight * outputWidth) {
    ones = at::ones({outputHeight, outputWidth}, input.options());
  }

  input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
                      inputHeight, inputWidth});
  offset =
      offset.view({batchSize / im2col_step, im2col_step,
                   deformable_group * 2 * kH * kW, outputHeight, outputWidth});

  // Accumulator laid out as (chunk, outPlane, step * outH, outW); reshaped to
  // expose the group dimension for the per-group matmul.
  Tensor output_buffer = at::zeros({batchSize / im2col_step, nOutputPlane,
                                    im2col_step * outputHeight, outputWidth},
                                   output.options());

  output_buffer = output_buffer.view(
      {output_buffer.size(0), group, output_buffer.size(1) / group,
       output_buffer.size(2), output_buffer.size(3)});

  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
    deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,
                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
                           dilationW, im2col_step, deformable_group, columns);

    // Temporarily expose the group dimension of columns and weight.
    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
    weight = weight.view({group, weight.size(0) / group, weight.size(1),
                          weight.size(2), weight.size(3)});

    for (int g = 0; g < group; g++) {
      // output_buffer[elt][g] += weight[g] (flattened) x columns[g]
      output_buffer[elt][g] = output_buffer[elt][g]
                                  .flatten(1)
                                  .addmm_(weight[g].flatten(1), columns[g])
                                  .view_as(output_buffer[elt][g]);
    }
    // Restore the ungrouped shapes for the next iteration.
    columns =
        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
                          weight.size(3), weight.size(4)});
  }

  output_buffer = output_buffer.view(
      {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2),
       output_buffer.size(3), output_buffer.size(4)});

  // Reorder (chunk, outPlane, step, H, W) -> (chunk, step, outPlane, H, W) to
  // match the layout of `output` and copy the result into it.
  output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane,
                                      im2col_step, outputHeight, outputWidth});
  output_buffer.transpose_(1, 2);
  output.copy_(output_buffer);

  // The reshapes below only rebind the local (by-value) tensor handles.
  output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth});

  input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
  offset = offset.view(
      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});

  if (batch == 0) {
    output = output.view({nOutputPlane, outputHeight, outputWidth});
    input = input.view({nInputPlane, inputHeight, inputWidth});
    offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
  }
}

// Backward pass of deformable convolution with respect to the input and the
// offsets.
//
// The batch is processed in chunks of `im2col_step` images: for each chunk the
// column gradient is computed with a per-group matrix multiply
// (weight^T x gradOutput), then scattered into gradOffset
// (deformable_col2im_coord_impl) and gradInput (deformable_col2im_impl).
//
// Args:
//   input: 3D (C, H, W) or 4D (N, C, H, W) forward input.
//   offset: sampling offsets, same rank as `input`.
//   gradOutput: gradient of the forward output.
//   gradInput, gradOffset: destination buffers, written in place.
//   weight: 4D contiguous weight (nOutputPlane, nInputPlane / group, kH, kW).
//   columns: scratch tensor; re-allocated locally.
//   kW, kH, dW, dH, padW, padH, dilationW, dilationH: kernel size, stride,
//     padding and dilation. Note the W-before-H order here, while
//     deform_conv_shape_check takes H before W.
//   group, deformable_group: convolution / deformable group counts.
//   im2col_step: number of images handled per chunk; must be positive and
//     divide the batch size.
//
// Raises (via TORCH_CHECK / the shape check) on invalid devices, shapes or an
// im2col_step that does not divide the batch size.
void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,
                                Tensor gradInput, Tensor gradOffset,
                                Tensor weight, Tensor columns, int kW, int kH,
                                int dW, int dH, int padW, int padH,
                                int dilationW, int dilationH, int group,
                                int deformable_group, int im2col_step) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(offset);
    CHECK_CUDA_INPUT(gradOutput);
    CHECK_CUDA_INPUT(gradInput);
    CHECK_CUDA_INPUT(gradOffset);
    CHECK_CUDA_INPUT(weight);
    CHECK_CUDA_INPUT(columns);
#else
    AT_ERROR("DeformConv is not compiled with GPU support");
#endif
  }
#ifndef MMCV_WITH_MUSA
  else {
    CHECK_CPU_INPUT(input);
    CHECK_CPU_INPUT(offset);
    CHECK_CPU_INPUT(gradOutput);
    CHECK_CPU_INPUT(gradInput);
    CHECK_CPU_INPUT(gradOffset);
    CHECK_CPU_INPUT(weight);
    CHECK_CPU_INPUT(columns);
  }
#endif
  deform_conv_shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW,
                          padH, padW, dilationH, dilationW, group,
                          deformable_group);

  at::DeviceGuard guard(input.device());

  int batch = 1;
  if (input.ndimension() == 3) {
    // Force batch
    batch = 0;
    input = input.view({1, input.size(0), input.size(1), input.size(2)});
    offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
    gradOutput = gradOutput.view(
        {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
  }

  long batchSize = input.size(0);
  long nInputPlane = input.size(1);
  long inputHeight = input.size(2);
  long inputWidth = input.size(3);

  long nOutputPlane = weight.size(0);

  long outputWidth =
      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
  long outputHeight =
      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;

  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");

  // Every view below splits the batch into batchSize / im2col_step chunks, so
  // the step must be positive and divide the batch size exactly.
  TORCH_CHECK(im2col_step > 0 && batchSize % im2col_step == 0,
              "batch size (", batchSize,
              ") must be divisible by im2col_step (", im2col_step, ")");

  gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
  columns = at::zeros(
      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
      input.options());

  // change order of grad output:
  // (chunk, step, outPlane, H, W) -> (chunk, outPlane, step, H, W)
  gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
                                nOutputPlane, outputHeight, outputWidth});
  gradOutput.transpose_(1, 2);

  gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane,
                              inputHeight, inputWidth});
  input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
                      inputHeight, inputWidth});
  gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step,
                                deformable_group * 2 * kH * kW, outputHeight,
                                outputWidth});
  offset =
      offset.view({batchSize / im2col_step, im2col_step,
                   deformable_group * 2 * kH * kW, outputHeight, outputWidth});

  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
    // divide into groups
    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
    weight = weight.view({group, weight.size(0) / group, weight.size(1),
                          weight.size(2), weight.size(3)});
    gradOutput = gradOutput.view(
        {gradOutput.size(0), group, gradOutput.size(1) / group,
         gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)});

    for (int g = 0; g < group; g++) {
      // beta = 0, alpha = 1: columns[g] is overwritten with
      // weight[g]^T x gradOutput[elt][g].
      columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
                                     gradOutput[elt][g].flatten(1), 0.0f, 1.0f);
    }

    // Restore the ungrouped shapes of columns and gradOutput.
    columns =
        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
    gradOutput = gradOutput.view(
        {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),
         gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});

    deformable_col2im_coord_impl(columns, input[elt], offset[elt], nInputPlane,
                                 inputHeight, inputWidth, kH, kW, padH, padW,
                                 dH, dW, dilationH, dilationW, im2col_step,
                                 deformable_group, gradOffset[elt]);

    deformable_col2im_impl(columns, offset[elt], nInputPlane, inputHeight,
                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
                           dilationW, im2col_step, deformable_group,
                           gradInput[elt]);

    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
                          weight.size(3), weight.size(4)});
  }

  // Undo the transpose and chunking. The reshapes below only rebind the local
  // (by-value) tensor handles.
  gradOutput.transpose_(1, 2);
  gradOutput =
      gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});

  gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
  input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
  gradOffset = gradOffset.view(
      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
  offset = offset.view(
      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});

  if (batch == 0) {
    gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
    input = input.view({nInputPlane, inputHeight, inputWidth});
    gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth});
    offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
    // Take the sizes from gradOffset itself: `offset` is already 3D at this
    // point, so indexing its dim 3 would be out of range.
    gradOffset = gradOffset.view(
        {gradOffset.size(1), gradOffset.size(2), gradOffset.size(3)});
  }
}

// Accumulates the weight gradient of a deformable convolution into
// `gradWeight`.
//
// The batch is processed in chunks of `im2col_step` samples. For each chunk
// the input is unfolded into `columns` by deformable_im2col_impl, and each
// group's slice of `gradWeight` is updated with
//   addmm_(gradOutput_slice, columns_slice^T, 1.0, scale)
// on a flattened slice, then written back into `gradWeight[g]`.
//
// Observations from this body:
//   - The `columns` tensor passed in is only validated; it is then replaced
//     by a freshly zero-allocated buffer.
//   - `ones` is only validated and never read afterwards.
//   - `batchSize / im2col_step` is integer division; this presumably relies
//     on the caller guaranteeing divisibility -- TODO confirm.
void deform_conv_backward_parameters(Tensor input, Tensor offset,
                                     Tensor gradOutput, Tensor gradWeight,
                                     Tensor columns, Tensor ones, int kW,
                                     int kH, int dW, int dH, int padW, int padH,
                                     int dilationW, int dilationH, int group,
                                     int deformable_group, float scale,
                                     int im2col_step) {
  // Validate every tensor with the check macro that matches input's device
  // (the CHECK_*_INPUT macros are defined outside this file).
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(offset);
    CHECK_CUDA_INPUT(gradOutput);
    CHECK_CUDA_INPUT(gradWeight);
    CHECK_CUDA_INPUT(columns);
    CHECK_CUDA_INPUT(ones);
#else
    AT_ERROR("DeformConv is not compiled with GPU support");
#endif
  }
  // The CPU checks below are compiled out when MMCV_WITH_MUSA is defined.
#ifndef MMCV_WITH_MUSA
  else {
    CHECK_CPU_INPUT(input);
    CHECK_CPU_INPUT(offset);
    CHECK_CPU_INPUT(gradOutput);
    CHECK_CPU_INPUT(gradWeight);
    CHECK_CPU_INPUT(columns);
    CHECK_CPU_INPUT(ones);
  }
#endif

  deform_conv_shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH,
                          dW, padH, padW, dilationH, dilationW, group,
                          deformable_group);
  at::DeviceGuard guard(input.device());

  // `batch` is a flag: 1 for batched (4-D) input, 0 when a 3-D input was
  // promoted to a batch of one below.
  int batch = 1;

  if (input.ndimension() == 3) {
    // Force batch
    batch = 0;
    input = input.view(
        at::IntList({1, input.size(0), input.size(1), input.size(2)}));
    gradOutput = gradOutput.view(
        {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
    // NOTE(review): `offset` is not given a leading batch dimension here,
    // although it is later checked against batchSize and viewed as 5-D.
    // Confirm whether callers always pass a 4-D offset on this path.
  }

  long batchSize = input.size(0);
  long nInputPlane = input.size(1);
  long inputHeight = input.size(2);
  long inputWidth = input.size(3);

  long nOutputPlane = gradWeight.size(0);

  // Output spatial size from input size, padding, dilation and stride.
  long outputWidth =
      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
  long outputHeight =
      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;

  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");

  // Fresh im2col buffer; the caller-supplied `columns` handle is dropped.
  columns = at::zeros(
      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
      input.options());

  // Split the batch into chunks of im2col_step and move the channel axis in
  // front of the per-chunk sample axis.
  gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
                                nOutputPlane, outputHeight, outputWidth});
  gradOutput.transpose_(1, 2);

  // Copy the transposed gradient into a contiguous buffer so it can be viewed
  // with the sample and height axes merged.
  Tensor gradOutputBuffer = at::zeros_like(gradOutput);
  gradOutputBuffer =
      gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step,
                             outputHeight, outputWidth});
  gradOutputBuffer = gradOutputBuffer.contiguous();
  gradOutputBuffer.copy_(gradOutput);
  gradOutputBuffer =
      gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane,
                             im2col_step * outputHeight, outputWidth});

  // Undo the transpose and restore gradOutput's 4-D layout.
  gradOutput.transpose_(1, 2);
  gradOutput =
      gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});

  // Chunked views of input and offset, indexed by `elt` in the loop below.
  input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
                      inputHeight, inputWidth});
  offset =
      offset.view({batchSize / im2col_step, im2col_step,
                   deformable_group * 2 * kH * kW, outputHeight, outputWidth});

  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
    // Unfold the current chunk of the input into `columns`.
    deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,
                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
                           dilationW, im2col_step, deformable_group, columns);

    // divide into group
    gradOutputBuffer = gradOutputBuffer.view(
        {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group,
         gradOutputBuffer.size(2), gradOutputBuffer.size(3)});
    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
    gradWeight =
        gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1),
                         gradWeight.size(2), gradWeight.size(3)});

    // Per group: flattened gradWeight slice += scale * gradOut * columns^T.
    for (int g = 0; g < group; g++) {
      gradWeight[g] = gradWeight[g]
                          .flatten(1)
                          .addmm_(gradOutputBuffer[elt][g].flatten(1),
                                  columns[g].transpose(1, 0), 1.0, scale)
                          .view_as(gradWeight[g]);
    }
    // Merge the group axis back so the next iteration starts from the same
    // ungrouped shapes.
    gradOutputBuffer = gradOutputBuffer.view(
        {gradOutputBuffer.size(0),
         gradOutputBuffer.size(1) * gradOutputBuffer.size(2),
         gradOutputBuffer.size(3), gradOutputBuffer.size(4)});
    columns =
        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
    gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1),
                                  gradWeight.size(2), gradWeight.size(3),
                                  gradWeight.size(4)});
  }

  // Restore the un-chunked 4-D views of input and offset.
  input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
  offset = offset.view(
      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});

  // Drop the batch dimension that was added for 3-D input.
  if (batch == 0) {
    gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
    input = input.view({nInputPlane, inputHeight, inputWidth});
  }
}


================================================
FILE: mmcv/ops/csrc/pytorch/deform_roi_pool.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub for the deformable RoI pooling forward pass.
// Hands every argument, unchanged and in declaration order, to
// DISPATCH_DEVICE_IMPL under the key `deform_roi_pool_forward_impl`. The macro
// is defined outside this file and presumably selects the implementation
// registered for the tensors' device -- confirm in
// pytorch_device_registry.hpp.
void deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,
                                  Tensor output, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int sampling_ratio, float gamma) {
  DISPATCH_DEVICE_IMPL(deform_roi_pool_forward_impl, input, rois, offset,
                       output, pooled_height, pooled_width, spatial_scale,
                       sampling_ratio, gamma);
}

// Dispatch stub for the deformable RoI pooling backward pass.
// Hands every argument, unchanged and in declaration order, to
// DISPATCH_DEVICE_IMPL under the key `deform_roi_pool_backward_impl`. The
// macro is defined outside this file and presumably selects the
// implementation registered for the tensors' device -- confirm in
// pytorch_device_registry.hpp.
void deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,
                                   Tensor rois, Tensor offset,
                                   Tensor grad_input, Tensor grad_offset,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale, int sampling_ratio,
                                   float gamma) {
  DISPATCH_DEVICE_IMPL(deform_roi_pool_backward_impl, grad_output, input, rois,
                       offset, grad_input, grad_offset, pooled_height,
                       pooled_width, spatial_scale, sampling_ratio, gamma);
}

// Thin wrapper: forwards all arguments unchanged, in the same order, to
// deform_roi_pool_forward_impl. Returns nothing; any result is presumably
// written into `output` by the selected implementation.
void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,
                             Tensor output, int pooled_height, int pooled_width,
                             float spatial_scale, int sampling_ratio,
                             float gamma) {
  deform_roi_pool_forward_impl(input, rois, offset, output, pooled_height,
                               pooled_width, spatial_scale, sampling_ratio,
                               gamma);
}

// Thin wrapper: forwards all arguments unchanged, in the same order, to
// deform_roi_pool_backward_impl. Returns nothing; gradients are presumably
// written into `grad_input` / `grad_offset` by the selected implementation.
void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
                              Tensor offset, Tensor grad_input,
                              Tensor grad_offset, int pooled_height,
                              int pooled_width, float spatial_scale,
                              int sampling_ratio, float gamma) {
  deform_roi_pool_backward_impl(grad_output, input, rois, offset, grad_input,
                                grad_offset, pooled_height, pooled_width,
                                spatial_scale, sampling_ratio, gamma);
}


================================================
FILE: mmcv/ops/csrc/pytorch/diff_iou_rotated.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub: passes `vertices`, `mask` and `num_valid` to
// DISPATCH_DEVICE_IMPL under the key
// `diff_iou_rotated_sort_vertices_forward_impl` and returns the Tensor it
// produces. The macro is defined outside this file; it presumably selects the
// implementation registered for the tensors' device.
Tensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,
                                                   Tensor num_valid) {
  return DISPATCH_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl,
                              vertices, mask, num_valid);
}

// Thin wrapper around diff_iou_rotated_sort_vertices_forward_impl: the three
// tensors are forwarded unchanged and the resulting Tensor is returned.
Tensor diff_iou_rotated_sort_vertices_forward(Tensor vertices, Tensor mask,
                                              Tensor num_valid) {
  Tensor sorted_result =
      diff_iou_rotated_sort_vertices_forward_impl(vertices, mask, num_valid);
  return sorted_result;
}


================================================
FILE: mmcv/ops/csrc/pytorch/filtered_lrelu.cpp
================================================
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub: passes every argument, unchanged and in declaration order, to
// DISPATCH_DEVICE_IMPL under the key `filtered_lrelu_op_impl` and returns the
// resulting (Tensor, Tensor, int) tuple. The meaning of the tuple elements is
// defined by the registered implementation, which is not visible here.
std::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu_op_impl(
    torch::Tensor x, torch::Tensor fu, torch::Tensor fd, torch::Tensor b,
    torch::Tensor si, int up, int down, int px0, int px1, int py0, int py1,
    int sx, int sy, float gain, float slope, float clamp, bool flip_filters,
    bool writeSigns) {
  return DISPATCH_DEVICE_IMPL(filtered_lrelu_op_impl, x, fu, fd, b, si, up,
                              down, px0, px1, py0, py1, sx, sy, gain, slope,
                              clamp, flip_filters, writeSigns);
}

// Thin wrapper around filtered_lrelu_op_impl: all arguments are forwarded
// unchanged, in the same order, and the implementation's tuple is returned.
std::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu(
    torch::Tensor x, torch::Tensor fu, torch::Tensor fd, torch::Tensor b,
    torch::Tensor si, int up, int down, int px0, int px1, int py0, int py1,
    int sx, int sy, float gain, float slope, float clamp, bool flip_filters,
    bool writeSigns) {
  std::tuple<torch::Tensor, torch::Tensor, int> op_result =
      filtered_lrelu_op_impl(x, fu, fd, b, si, up, down, px0, px1, py0, py1,
                             sx, sy, gain, slope, clamp, flip_filters,
                             writeSigns);
  return op_result;
}

// Dispatch stub: passes every argument, unchanged and in declaration order, to
// DISPATCH_DEVICE_IMPL under the key `filtered_lrelu_act_op_impl` and returns
// the Tensor produced by the selected implementation (not visible here).
torch::Tensor filtered_lrelu_act_op_impl(torch::Tensor x, torch::Tensor si,
                                         int sx, int sy, float gain,
                                         float slope, float clamp,
                                         bool writeSigns) {
  return DISPATCH_DEVICE_IMPL(filtered_lrelu_act_op_impl, x, si, sx, sy, gain,
                              slope, clamp, writeSigns);
}

// Thin wrapper around filtered_lrelu_act_op_impl: all arguments are forwarded
// unchanged, in the same order, and the implementation's Tensor is returned.
torch::Tensor filtered_lrelu_act_(torch::Tensor x, torch::Tensor si, int sx,
                                  int sy, float gain, float slope, float clamp,
                                  bool writeSigns) {
  torch::Tensor activated =
      filtered_lrelu_act_op_impl(x, si, sx, sy, gain, slope, clamp, writeSigns);
  return activated;
}


================================================
FILE: mmcv/ops/csrc/pytorch/focal_loss.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_DIOPI
#include <diopi/diopirt.h>
#include <diopi/functions.h>
#include <diopi/functions_mmcv.h>
#include <torch/csrc/utils/pybind.h>

#include "csrc_dipu/diopirt/diopirt_impl.h"
#include "csrc_dipu/runtime/device/deviceapis.h"
#include "csrc_dipu/utils/helpfunc.hpp"

using dipu::VENDOR_TYPE;
using dipu::diopi_helper::toDiopiScalar;
using dipu::diopi_helper::toDiopiTensorHandle;
#endif

// Dispatch stub for the sigmoid focal loss forward pass: passes all arguments,
// unchanged and in declaration order, to DISPATCH_DEVICE_IMPL under the key
// `sigmoid_focal_loss_forward_impl`. Returns nothing; the result is presumably
// written into `output` by the selected implementation.
void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha) {
  DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, input, target, weight,
                       output, gamma, alpha);
}

// Dispatch stub for the sigmoid focal loss backward pass: passes all
// arguments, unchanged and in declaration order, to DISPATCH_DEVICE_IMPL under
// the key `sigmoid_focal_loss_backward_impl`. Returns nothing; the gradient is
// presumably written into `grad_input` by the selected implementation.
void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
                                      Tensor weight, Tensor grad_input,
                                      float gamma, float alpha) {
  DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, input, target, weight,
                       grad_input, gamma, alpha);
}

// Dispatch stub for the softmax focal loss forward pass: passes all arguments,
// unchanged and in declaration order, to DISPATCH_DEVICE_IMPL under the key
// `softmax_focal_loss_forward_impl`. Returns nothing; the result is presumably
// written into `output` by the selected implementation.
void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha) {
  DISPATCH_DEVICE_IMPL(softmax_focal_loss_forward_impl, input, target, weight,
                       output, gamma, alpha);
}

// Dispatch stub for the softmax focal loss backward pass: passes all
// arguments (including the extra `buff` tensor), unchanged and in declaration
// order, to DISPATCH_DEVICE_IMPL under the key
// `softmax_focal_loss_backward_impl`. How `buff` is used is decided by the
// selected implementation, which is not visible here.
void softmax_focal_loss_backward_impl(Tensor input, Tensor target,
                                      Tensor weight, Tensor buff,
                                      Tensor grad_input, float gamma,
                                      float alpha) {
  DISPATCH_DEVICE_IMPL(softmax_focal_loss_backward_impl, input, target, weight,
                       buff, grad_input, gamma, alpha);
}

#ifdef MMCV_WITH_DIOPI
// DIOPI path for the sigmoid focal loss forward pass.
//
// Flow:
//   1. If DIOPI reports `input` on the host, call
//      sigmoid_focal_loss_forward_impl directly and return.
//   2. Otherwise build a DIOPI context on the current DIPU stream and, when
//      the address of diopiSigmoidFocalLossMmcv is non-null, call it. For the
//      "NPU" vendor the Python GIL is released for the duration of the call.
//      Return on diopiSuccess.
//   3. On any other outcome, log a warning, copy the tensors to CPU, run
//      sigmoid_focal_loss_forward_impl there and copy the CPU result back
//      into `output`.
void sigmoid_focal_loss_forward_diopi(Tensor input, Tensor target,
                                      Tensor weight, Tensor output, float gamma,
                                      float alpha) {
  auto input_p = toDiopiTensorHandle(input);
  diopiDevice_t device;
  diopiGetTensorDevice(input_p, &device);
  // Host tensors take the regular dispatch path.
  if (device == diopi_host) {
    sigmoid_focal_loss_forward_impl(input, target, weight, output, gamma,
                                    alpha);
    return;
  }
  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
  diopiContextHandle_t ch = &ctx;
  auto target_p = toDiopiTensorHandle(target);
  auto weight_p = toDiopiTensorHandle(weight);
  auto output_p = toDiopiTensorHandle(output);
  // Only call the vendor kernel when its address is non-null (presumably the
  // symbol may be absent on some vendors -- confirm).
  if (reinterpret_cast<void *>(diopiSigmoidFocalLossMmcv) != nullptr) {
    if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), "NPU") == 0) {
      // `no_gil` lives until the end of this branch, so the GIL is held again
      // before the CPU fallback below runs.
      pybind11::gil_scoped_release no_gil;
      auto ret = diopiSigmoidFocalLossMmcv(ch, output_p, input_p, target_p,
                                           weight_p, gamma, alpha);
      if (ret == diopiSuccess) return;
    } else {
      auto ret = diopiSigmoidFocalLossMmcv(ch, output_p, input_p, target_p,
                                           weight_p, gamma, alpha);
      if (ret == diopiSuccess) return;
    }
  }
  // Vendor kernel unavailable or unsuccessful: compute on CPU copies.
  LOG(WARNING)
      << "Fallback to cpu: mmcv ext op sigmoid_focal_loss_forward_impl";
  auto input_cpu = input.cpu();
  auto target_cpu = target.cpu();
  auto weight_cpu = weight.cpu();
  auto output_cpu = output.cpu();
  sigmoid_focal_loss_forward_impl(input_cpu, target_cpu, weight_cpu, output_cpu,
                                  gamma, alpha);
  // Copy the CPU result back into the caller's tensor.
  output.copy_(output_cpu);
  return;
}

// DIOPI path for the sigmoid focal loss backward pass.
//
// Flow:
//   1. If DIOPI reports `input` on the host, call
//      sigmoid_focal_loss_backward_impl directly and return.
//   2. Otherwise build a DIOPI context on the current DIPU stream and, when
//      the address of diopiSigmoidFocalLossBackwardMmcv is non-null, call it.
//      For the "NPU" vendor the Python GIL is released for the duration of
//      the call. Return on diopiSuccess.
//   3. On any other outcome, log a warning, copy the tensors to CPU, run
//      sigmoid_focal_loss_backward_impl there and copy the CPU gradient back
//      into `grad_input`.
void sigmoid_focal_loss_backward_diopi(Tensor input, Tensor target,
                                       Tensor weight, Tensor grad_input,
                                       float gamma, float alpha) {
  auto input_p = toDiopiTensorHandle(input);
  diopiDevice_t device;
  diopiGetTensorDevice(input_p, &device);
  // Host tensors take the regular dispatch path.
  if (device == diopi_host) {
    sigmoid_focal_loss_backward_impl(input, target, weight, grad_input, gamma,
                                     alpha);
    return;
  }
  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
  diopiContextHandle_t ch = &ctx;
  auto target_p = toDiopiTensorHandle(target);
  auto weight_p = toDiopiTensorHandle(weight);
  auto grad_input_p = toDiopiTensorHandle(grad_input);
  // Only call the vendor kernel when its address is non-null (presumably the
  // symbol may be absent on some vendors -- confirm).
  if (reinterpret_cast<void *>(diopiSigmoidFocalLossBackwardMmcv) != nullptr) {
    if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), "NPU") == 0) {
      // `no_gil` lives until the end of this branch, so the GIL is held again
      // before the CPU fallback below runs.
      pybind11::gil_scoped_release no_gil;
      auto ret = diopiSigmoidFocalLossBackwardMmcv(
          ch, grad_input_p, input_p, target_p, weight_p, gamma, alpha);
      if (ret == diopiSuccess) return;
    } else {
      auto ret = diopiSigmoidFocalLossBackwardMmcv(
          ch, grad_input_p, input_p, target_p, weight_p, gamma, alpha);
      if (ret == diopiSuccess) return;
    }
  }
  // Vendor kernel unavailable or unsuccessful: compute on CPU copies.
  // The message names the backward op so the log identifies the right path.
  LOG(WARNING)
      << "Fallback to cpu: mmcv ext op sigmoid_focal_loss_backward_impl";
  auto input_cpu = input.cpu();
  auto target_cpu = target.cpu();
  auto weight_cpu = weight.cpu();
  auto grad_input_cpu = grad_input.cpu();
  sigmoid_focal_loss_backward_impl(input_cpu, target_cpu, weight_cpu,
                                   grad_input_cpu, gamma, alpha);
  // Copy the CPU gradient back into the caller's tensor.
  grad_input.copy_(grad_input_cpu);
  return;
}
#endif

// Entry point for the sigmoid focal loss forward pass. When MMCV_WITH_DIOPI is
// defined at build time the call goes to sigmoid_focal_loss_forward_diopi;
// otherwise it goes straight to sigmoid_focal_loss_forward_impl. Arguments are
// forwarded unchanged in both cases.
void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
                                Tensor output, float gamma, float alpha) {
#ifdef MMCV_WITH_DIOPI
  sigmoid_focal_loss_forward_diopi(input, target, weight, output, gamma, alpha);
#else
  sigmoid_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
#endif
}

// Entry point for the sigmoid focal loss backward pass. When MMCV_WITH_DIOPI
// is defined at build time the call goes to sigmoid_focal_loss_backward_diopi;
// otherwise it goes straight to sigmoid_focal_loss_backward_impl. Arguments
// are forwarded unchanged in both cases.
void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
                                 Tensor grad_input, float gamma, float alpha) {
#ifdef MMCV_WITH_DIOPI
  sigmoid_focal_loss_backward_diopi(input, target, weight, grad_input, gamma,
                                    alpha);
#else
  sigmoid_focal_loss_backward_impl(input, target, weight, grad_input, gamma,
                                   alpha);
#endif
}

// Thin wrapper: forwards all arguments unchanged to
// softmax_focal_loss_forward_impl. Unlike the sigmoid variant there is no
// DIOPI branch here.
void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
                                Tensor output, float gamma, float alpha) {
  softmax_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
}

// Thin wrapper: forwards all arguments unchanged to
// softmax_focal_loss_backward_impl. Unlike the sigmoid variant there is no
// DIOPI branch here.
void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
                                 Tensor buff, Tensor grad_input, float gamma,
                                 float alpha) {
  softmax_focal_loss_backward_impl(input, target, weight, buff, grad_input,
                                   gamma, alpha);
}


================================================
FILE: mmcv/ops/csrc/pytorch/furthest_point_sample.cpp
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling.cpp

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub: passes the three tensors and the integers b, n, m, unchanged
// and in declaration order, to DISPATCH_DEVICE_IMPL under the key
// `furthest_point_sampling_forward_impl`. The macro is defined outside this
// file; it presumably selects the implementation registered for the tensors'
// device.
void furthest_point_sampling_forward_impl(Tensor points_tensor,
                                          Tensor temp_tensor, Tensor idx_tensor,
                                          int b, int n, int m) {
  DISPATCH_DEVICE_IMPL(furthest_point_sampling_forward_impl, points_tensor,
                       temp_tensor, idx_tensor, b, n, m);
}

// Dispatch stub: passes the three tensors and the integers b, n, m, unchanged
// and in declaration order, to DISPATCH_DEVICE_IMPL under the key
// `furthest_point_sampling_with_dist_forward_impl`. The macro is defined
// outside this file; it presumably selects the implementation registered for
// the tensors' device.
void furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor,
                                                    Tensor temp_tensor,
                                                    Tensor idx_tensor, int b,
                                                    int n, int m) {
  DISPATCH_DEVICE_IMPL(furthest_point_sampling_with_dist_forward_impl,
                       points_tensor, temp_tensor, idx_tensor, b, n, m);
}

// Thin wrapper: forwards all arguments unchanged, in the same order, to
// furthest_point_sampling_forward_impl.
void furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor,
                                     Tensor idx_tensor, int b, int n, int m) {
  furthest_point_sampling_forward_impl(points_tensor, temp_tensor, idx_tensor,
                                       b, n, m);
}

// Thin wrapper: forwards all arguments unchanged, in the same order, to
// furthest_point_sampling_with_dist_forward_impl.
void furthest_point_sampling_with_dist_forward(Tensor points_tensor,
                                               Tensor temp_tensor,
                                               Tensor idx_tensor, int b, int n,
                                               int m) {
  furthest_point_sampling_with_dist_forward_impl(points_tensor, temp_tensor,
                                                 idx_tensor, b, n, m);
}


================================================
FILE: mmcv/ops/csrc/pytorch/fused_bias_leakyrelu.cpp
================================================
// Modified from
// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_bias_act.cpp

/*
Copyright (c) 2021, NVIDIA Corporation. All rights reserved.

NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator
Augmentation (ADA)
=======================================================================

1. Definitions

"Licensor" means any person or entity that distributes its Work.

"Software" means the original work of authorship made available under
this License.

"Work" means the Software and any additions to or derivative works of
the Software that are made available under this License.

The terms "reproduce," "reproduction," "derivative works," and
"distribution" have the meaning as provided under U.S. copyright law;
provided, however, that for the purposes of this License, derivative
works shall not include works that remain separable from, or merely
link (or bind by name) to the interfaces of, the Work.

Works, including the Software, are "made available" under this License
by including in or with the Work either (a) a copyright notice
referencing the applicability of this License to the Work, or (b) a
copy of this License.

2. License Grants

    2.1 Copyright Grant. Subject to the terms and conditions of this
    License, each Licensor grants to you a perpetual, worldwide,
    non-exclusive, royalty-free, copyright license to reproduce,
    prepare derivative works of, publicly display, publicly perform,
    sublicense and distribute its Work and any resulting derivative
    works in any form.

3. Limitations

    3.1 Redistribution. You may reproduce or distribute the Work only
    if (a) you do so under this License, (b) you include a complete
    copy of this License with your distribution, and (c) you retain
    without modification any copyright, patent, trademark, or
    attribution notices that are present in the Work.

    3.2 Derivative Works. You may specify that additional or different
    terms apply to the use, reproduction, and distribution of your
    derivative works of the Work ("Your Terms") only if (a) Your Terms
    provide that the use limitation in Section 3.3 applies to your
    derivative works, and (b) you identify the specific derivative
    works that are subject to Your Terms. Notwithstanding Your Terms,
    this License (including the redistribution requirements in Section
    3.1) will continue to apply to the Work itself.

    3.3 Use Limitation. The Work and any derivative works thereof only
    may be used or intended for use non-commercially. Notwithstanding
    the foregoing, NVIDIA and its affiliates may use the Work and any
    derivative works commercially. As used herein, "non-commercially"
    means for research or evaluation purposes only.

    3.4 Patent Claims. If you bring or threaten to bring a patent claim
    against any Licensor (including any claim, cross-claim or
    counterclaim in a lawsuit) to enforce any patents that you allege
    are infringed by any Work, then your rights under this License from
    such Licensor (including the grant in Section 2.1) will terminate
    immediately.

    3.5 Trademarks. This License does not grant any rights to use any
    Licensor’s or its affiliates’ names, logos, or trademarks, except
    as necessary to reproduce the notices described in this License.

    3.6 Termination. If you violate any term of this License, then your
    rights under this License (including the grant in Section 2.1) will
    terminate immediately.

4. Disclaimer of Warranty.

THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
THIS LICENSE.

5. Limitation of Liability.

EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
THE POSSIBILITY OF SUCH DAMAGES.

=======================================================================
*/

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub: passes every argument, unchanged and in declaration order, to
// DISPATCH_DEVICE_IMPL under the key `fused_bias_leakyrelu_op_impl` and
// returns the Tensor produced by the selected implementation. The meaning of
// `act` and `grad` is defined by that implementation, which is not visible
// here.
torch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor& input,
                                           const torch::Tensor& bias,
                                           const torch::Tensor& refer, int act,
                                           int grad, float alpha, float scale) {
  return DISPATCH_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, input, bias, refer,
                              act, grad, alpha, scale);
}

// Thin wrapper around fused_bias_leakyrelu_op_impl: all arguments are
// forwarded unchanged, in the same order, and the implementation's Tensor is
// returned.
torch::Tensor fused_bias_leakyrelu(const torch::Tensor& input,
                                   const torch::Tensor& bias,
                                   const torch::Tensor& refer, int act,
                                   int grad, float alpha, float scale) {
  torch::Tensor activated =
      fused_bias_leakyrelu_op_impl(input, bias, refer, act, grad, alpha, scale);
  return activated;
}


================================================
FILE: mmcv/ops/csrc/pytorch/fused_spconv_ops.cpp
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub: passes every argument, unchanged and in declaration order, to
// DISPATCH_DEVICE_IMPL under the key
// `fused_indice_conv_batchnorm_forward_impl` and returns the Tensor produced
// by the selected implementation (not visible here).
torch::Tensor fused_indice_conv_batchnorm_forward_impl(
    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
    int64_t _inverse, int64_t _subM) {
  return DISPATCH_DEVICE_IMPL(fused_indice_conv_batchnorm_forward_impl,
                              features, filters, bias, indicePairs, indiceNum,
                              numActOut, _inverse, _subM);
}

// Thin wrapper around fused_indice_conv_batchnorm_forward_impl: all arguments
// are forwarded unchanged, in the same order, and the implementation's Tensor
// is returned.
torch::Tensor fused_indice_conv_batchnorm_forward(
    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
    int64_t _inverse, int64_t _subM) {
  torch::Tensor fused_output = fused_indice_conv_batchnorm_forward_impl(
      features, filters, bias, indicePairs, indiceNum, numActOut, _inverse,
      _subM);
  return fused_output;
}


================================================
FILE: mmcv/ops/csrc/pytorch/gather_points.cpp
================================================
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub: passes b, c, n, npoints and the three tensors, unchanged and
// in declaration order, to DISPATCH_DEVICE_IMPL under the key
// `gather_points_forward_impl`. Returns nothing; the result is presumably
// written into `out` by the selected implementation.
void gather_points_forward_impl(int b, int c, int n, int npoints,
                                const Tensor points, const Tensor idx,
                                Tensor out) {
  DISPATCH_DEVICE_IMPL(gather_points_forward_impl, b, c, n, npoints, points,
                       idx, out);
}

// Dispatch stub: passes b, c, n, npoints and the three tensors, unchanged and
// in declaration order, to DISPATCH_DEVICE_IMPL under the key
// `gather_points_backward_impl`. Returns nothing; the gradient is presumably
// written into `grad_points` by the selected implementation.
void gather_points_backward_impl(int b, int c, int n, int npoints,
                                 const Tensor grad_out, const Tensor idx,
                                 Tensor grad_points) {
  DISPATCH_DEVICE_IMPL(gather_points_backward_impl, b, c, n, npoints, grad_out,
                       idx, grad_points);
}

// Thin wrapper around gather_points_forward_impl. Note the argument
// reordering: this function takes the tensors first and the sizes last,
// whereas the impl takes (b, c, n, npoints) first and the tensors last.
void gather_points_forward(Tensor points_tensor, Tensor idx_tensor,
                           Tensor out_tensor, int b, int c, int n,
                           int npoints) {
  gather_points_forward_impl(b, c, n, npoints, points_tensor, idx_tensor,
                             out_tensor);
}

// Thin wrapper around gather_points_backward_impl. Note the argument
// reordering: this function takes the tensors first and the sizes last,
// whereas the impl takes (b, c, n, npoints) first and the tensors last.
void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                            Tensor grad_points_tensor, int b, int c, int n,
                            int npoints) {
  gather_points_backward_impl(b, c, n, npoints, grad_out_tensor, idx_tensor,
                              grad_points_tensor);
}


================================================
FILE: mmcv/ops/csrc/pytorch/group_points.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub: passes b, c, n, npoints, nsample and the three tensors,
// unchanged and in declaration order, to DISPATCH_DEVICE_IMPL under the key
// `group_points_forward_impl`. Returns nothing; the result is presumably
// written into `out` by the selected implementation.
void group_points_forward_impl(int b, int c, int n, int npoints, int nsample,
                               const Tensor points, const Tensor idx,
                               Tensor out) {
  DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
                       points, idx, out);
}

// Dispatch stub: passes b, c, n, npoints, nsample and the three tensors,
// unchanged and in declaration order, to DISPATCH_DEVICE_IMPL under the key
// `group_points_backward_impl`. Returns nothing; the gradient is presumably
// written into `grad_points` by the selected implementation.
void group_points_backward_impl(int b, int c, int n, int npoints, int nsample,
                                const Tensor grad_out, const Tensor idx,
                                Tensor grad_points) {
  DISPATCH_DEVICE_IMPL(group_points_backward_impl, b, c, n, npoints, nsample,
                       grad_out, idx, grad_points);
}

// Thin wrapper around group_points_forward_impl. Note the argument
// reordering: this function takes the tensors first and the sizes last,
// whereas the impl takes (b, c, n, npoints, nsample) first and the tensors
// last.
//
// The call goes through the impl function (which performs the device
// dispatch under the same key) instead of repeating the dispatch macro here,
// matching group_points_backward.
void group_points_forward(Tensor points_tensor, Tensor idx_tensor,
                          Tensor out_tensor, int b, int c, int n, int npoints,
                          int nsample) {
  group_points_forward_impl(b, c, n, npoints, nsample, points_tensor,
                            idx_tensor, out_tensor);
}

// Thin wrapper around group_points_backward_impl. Note the argument
// reordering: this function takes the tensors first and the sizes last,
// whereas the impl takes (b, c, n, npoints, nsample) first and the tensors
// last.
void group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                           Tensor grad_points_tensor, int b, int c, int n,
                           int npoints, int nsample) {
  group_points_backward_impl(b, c, n, npoints, nsample, grad_out_tensor,
                             idx_tensor, grad_points_tensor);
}

// Dispatch stub: passes b, c, m, n, nsample and the five tensors, unchanged
// and in declaration order, to DISPATCH_DEVICE_IMPL under the key
// `stack_group_points_backward_impl`. Returns nothing; the gradient is
// presumably written into `grad_features_tensor` by the selected
// implementation.
void stack_group_points_backward_impl(int b, int c, int m, int n, int nsample,
                                      const Tensor grad_out_tensor,
                                      const Tensor idx_tensor,
                                      const Tensor idx_batch_cnt_tensor,
                                      const Tensor features_batch_cnt_tensor,
                                      Tensor grad_features_tensor) {
  DISPATCH_DEVICE_IMPL(stack_group_points_backward_impl, b, c, m, n, nsample,
                       grad_out_tensor, idx_tensor, idx_batch_cnt_tensor,
                       features_batch_cnt_tensor, grad_features_tensor);
}

// Thin wrapper around stack_group_points_backward_impl. Note the argument
// reordering: this function takes the tensors first and the sizes last,
// whereas the impl takes (b, c, m, n, nsample) first and the tensors last.
void stack_group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                                 Tensor idx_batch_cnt_tensor,
                                 Tensor features_batch_cnt_tensor,
                                 Tensor grad_features_tensor, int b, int c,
                                 int m, int n, int nsample) {
  stack_group_points_backward_impl(
      b, c, m, n, nsample, grad_out_tensor, idx_tensor, idx_batch_cnt_tensor,
      features_batch_cnt_tensor, grad_features_tensor);
}

// Dispatch stub: passes b, c, m, nsample and the five tensors, unchanged and
// in declaration order, to DISPATCH_DEVICE_IMPL under the key
// `stack_group_points_forward_impl`. Returns nothing; the result is presumably
// written into `out_tensor` by the selected implementation.
void stack_group_points_forward_impl(int b, int c, int m, int nsample,
                                     const Tensor features_tensor,
                                     const Tensor features_batch_cnt_tensor,
                                     const Tensor idx_tensor,
                                     const Tensor idx_batch_cnt_tensor,
                                     Tensor out_tensor) {
  DISPATCH_DEVICE_IMPL(stack_group_points_forward_impl, b, c, m, nsample,
                       features_tensor, features_batch_cnt_tensor, idx_tensor,
                       idx_batch_cnt_tensor, out_tensor);
}

// Thin wrapper around stack_group_points_forward_impl. Note the argument
// reordering: this function takes the tensors first and the sizes last,
// whereas the impl takes (b, c, m, nsample) first and the tensors last.
//
// The call goes through the impl function (which performs the device
// dispatch under the same key) instead of repeating the dispatch macro here,
// matching stack_group_points_backward.
void stack_group_points_forward(Tensor features_tensor,
                                Tensor features_batch_cnt_tensor,
                                Tensor idx_tensor, Tensor idx_batch_cnt_tensor,
                                Tensor out_tensor, int b, int c, int m,
                                int nsample) {
  stack_group_points_forward_impl(b, c, m, nsample, features_tensor,
                                  features_batch_cnt_tensor, idx_tensor,
                                  idx_batch_cnt_tensor, out_tensor);
}


================================================
FILE: mmcv/ops/csrc/pytorch/info.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/vision.cpp
#include "pytorch_cpp_helper.hpp"

#ifdef MMCV_WITH_CUDA
#ifdef MMCV_WITH_HIP
#include <hip/hip_runtime_api.h>
// Returns the HIP runtime version as reported by hipRuntimeGetVersion().
// The local is zero-initialised so that the function returns a defined
// value (0) even if the call does not write its output argument; the
// call's status code is still ignored, as before.
int get_hiprt_version() {
  int runtimeVersion = 0;
  hipRuntimeGetVersion(&runtimeVersion);
  return runtimeVersion;
}
#else
#include <cuda_runtime_api.h>
// Returns the CUDART_VERSION macro, i.e. a value fixed at compile time
// (presumably provided by the cuda_runtime_api.h include directly above).
int get_cudart_version() { return CUDART_VERSION; }
#endif
#endif

// Describes the CUDA / HIP toolkit this extension was compiled against.
//
// Returns:
//   - "not available" when MMCV_WITH_CUDA is not defined;
//   - the integer from get_hiprt_version() printed as-is for HIP builds;
//   - "<major>.<minor>[.<patch>]" derived from get_cudart_version() for
//     CUDA builds, where the patch part is omitted when it is zero.
std::string get_compiling_cuda_version() {
#if !defined(MMCV_WITH_CUDA)
  return std::string("not available");
#elif defined(MMCV_WITH_HIP)
  std::ostringstream version;
  version << get_hiprt_version();
  return version.str();
#else
  // Same decoding as PyTorch's CUDAHooks.cpp:
  // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231
  const int v = get_cudart_version();
  const int patch = v % 10;
  std::ostringstream version;
  version << (v / 1000) << "." << (v / 10 % 100);
  if (patch != 0) {
    version << "." << patch;
  }
  return version.str();
#endif
}

// similar to
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp
// Builds a string naming the compiler used for this translation unit from
// the compiler's predefined macros. Each matching section appends its text
// with no separator:
//   "GCC <major>.<minor>"           when __GNUC__ is set and __clang__ is not
//   "clang <major>.<minor>.<patch>" when __clang_major__ is set
//   "MSVC <_MSC_FULL_VER>"          when _MSC_VER is set
// The result is empty if none of these macros is defined.
std::string get_compiler_version() {
  std::ostringstream out;

#if defined(__GNUC__) && !defined(__clang__)
  out << "GCC " << __GNUC__ << "." << __GNUC_MINOR__;
#endif

#if defined(__clang_major__)
  out << "clang " << __clang_major__ << "." << __clang_minor__ << "."
      << __clang_patchlevel__;
#endif

#if defined(_MSC_VER)
  out << "MSVC " << _MSC_FULL_VER;
#endif

  return out.str();
}


================================================
FILE: mmcv/ops/csrc/pytorch/iou3d.cpp
================================================
// Modified from
// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms.cpp

/*
3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Number of bits in an unsigned long long (its byte size times 8).
// Not referenced by any function in this file; presumably mirrors the block
// size / bitmask width used by the device NMS kernels -- TODO confirm.
const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;

// Device-dispatch stub for the BEV box-overlap computation.
// Passes its arguments unchanged to DISPATCH_DEVICE_IMPL keyed on this
// function; the macro comes from an included header and presumably invokes
// the implementation registered for the tensors' device.
void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
                                          const int num_b, const Tensor boxes_b,
                                          Tensor ans_overlap) {
  DISPATCH_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, num_a, boxes_a,
                       num_b, boxes_b, ans_overlap);
}

// Device-dispatch stub for 3D NMS.
// `keep` and `keep_num` are taken by non-const reference and forwarded, with
// `boxes` and the overlap threshold, to DISPATCH_DEVICE_IMPL keyed on this
// function (presumably resolved per device by the registry header).
void iou3d_nms3d_forward_impl(const Tensor boxes, Tensor &keep,
                              Tensor &keep_num, float nms_overlap_thresh) {
  DISPATCH_DEVICE_IMPL(iou3d_nms3d_forward_impl, boxes, keep, keep_num,
                       nms_overlap_thresh);
}

// Device-dispatch stub for the "normal" variant of 3D NMS.
// Same signature and forwarding pattern as iou3d_nms3d_forward_impl, but
// keyed on this function so a separate implementation can be registered.
void iou3d_nms3d_normal_forward_impl(const Tensor boxes, Tensor &keep,
                                     Tensor &keep_num,
                                     float nms_overlap_thresh) {
  DISPATCH_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, boxes, keep, keep_num,
                       nms_overlap_thresh);
}

// Public entry point for the BEV overlap between two sets of boxes.
//
// Shapes as stated by the original author (none of them is checked here):
//   boxes (presumably boxes_a): (N, 7) [x, y, z, dx, dy, dz, heading]
//   boxes_b:                    (M, 5)
//   ans_overlap:                (N, M)
//
// The box counts are read from the leading dimension of each input and
// narrowed to int, which is the type the impl takes.
void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
                                     Tensor ans_overlap) {
  const int count_a = static_cast<int>(boxes_a.size(0));
  const int count_b = static_cast<int>(boxes_b.size(0));

  iou3d_boxes_overlap_bev_forward_impl(count_a, boxes_a, count_b, boxes_b,
                                       ans_overlap);
}

// Public entry point for 3D NMS.
// Applies CHECK_CONTIGUOUS to `boxes` and `keep`, then forwards all
// arguments to iou3d_nms3d_forward_impl. `keep_num` is forwarded without a
// contiguity check.
void iou3d_nms3d_forward(Tensor boxes, Tensor keep, Tensor keep_num,
                         float nms_overlap_thresh) {
  // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
  // params keep: (N)
  CHECK_CONTIGUOUS(boxes);
  CHECK_CONTIGUOUS(keep);

  iou3d_nms3d_forward_impl(boxes, keep, keep_num, nms_overlap_thresh);
}

// Public entry point for the "normal" variant of 3D NMS.
// Applies CHECK_CONTIGUOUS to `boxes` and `keep`, then forwards all
// arguments to iou3d_nms3d_normal_forward_impl. `keep_num` is forwarded
// without a contiguity check.
void iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
                                float nms_overlap_thresh) {
  // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
  // params keep: (N)

  CHECK_CONTIGUOUS(boxes);
  CHECK_CONTIGUOUS(keep);

  iou3d_nms3d_normal_forward_impl(boxes, keep, keep_num, nms_overlap_thresh);
}


================================================
FILE: mmcv/ops/csrc/pytorch/knn.cpp
================================================
// Modified from
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatch stub for the KNN query.
// Passes the sizes (b, n, m, nsample) and the four tensors unchanged to
// DISPATCH_DEVICE_IMPL keyed on this function; the macro is defined in an
// included header and presumably selects the per-device implementation.
void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
                      const Tensor new_xyz, Tensor idx, Tensor dist2) {
  DISPATCH_DEVICE_IMPL(knn_forward_impl, b, n, m, nsample, xyz, new_xyz, idx,
                       dist2);
}

// Public entry point for the KNN query.
// Takes the tensors first and the integer sizes last, and forwards them to
// knn_forward_impl, which expects the sizes first. No validation is done
// here.
void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,
                 Tensor dist2_tensor, int b, int n, int m, int nsample) {
  knn_forward_impl(b, n, m, nsample, xyz_tensor, new_xyz_tensor, idx_tensor,
                   dist2_tensor);
}


================================================
FILE: mmcv/ops/csrc/pytorch/masked_conv2d.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatch stub for the masked im2col step.
// Forwards the image, the two mask index tensors, the output `col` and the
// kernel / padding sizes unchanged to DISPATCH_DEVICE_IMPL keyed on this
// function (presumably resolved per device by the registry header).
void masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor col,
                                const int kernel_h, const int kernel_w,
                                const int pad_h, const int pad_w) {
  DISPATCH_DEVICE_IMPL(masked_im2col_forward_impl, im, mask_h_idx, mask_w_idx,
                       col, kernel_h, kernel_w, pad_h, pad_w);
}

// Device-dispatch stub for the masked col2im step.
// Forwards `col`, the two mask index tensors, the output `im` and the
// height / width / channels sizes unchanged to DISPATCH_DEVICE_IMPL keyed on
// this function (presumably resolved per device by the registry header).
void masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor im, int height,
                                int width, int channels) {
  DISPATCH_DEVICE_IMPL(masked_col2im_forward_impl, col, mask_h_idx, mask_w_idx,
                       im, height, width, channels);
}

// Public entry point for masked im2col.
// Has the same parameter list as masked_im2col_forward_impl and forwards
// every argument to it in the same order.
void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor col,
                           const int kernel_h, const int kernel_w,
                           const int pad_h, const int pad_w) {
  masked_im2col_forward_impl(im, mask_h_idx, mask_w_idx, col, kernel_h,
                             kernel_w, pad_h, pad_w);
}

// Public entry point for masked col2im.
// Has the same parameter list as masked_col2im_forward_impl and forwards
// every argument to it in the same order.
void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor im, int height,
                           int width, int channels) {
  masked_col2im_forward_impl(col, mask_h_idx, mask_w_idx, im, height, width,
                             channels);
}


================================================
FILE: mmcv/ops/csrc/pytorch/min_area_polygons.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatch stub for the min-area-polygons op.
// Forwards `pointsets` and the output `polygons` to DISPATCH_DEVICE_IMPL
// keyed on this function (presumably resolved per device by the registry
// header).
void min_area_polygons_impl(const Tensor pointsets, Tensor polygons) {
  DISPATCH_DEVICE_IMPL(min_area_polygons_impl, pointsets, polygons);
}

// Public entry point for the min-area-polygons op; forwards both tensors to
// min_area_polygons_impl without any checks.
void min_area_polygons(const Tensor pointsets, Tensor polygons) {
  min_area_polygons_impl(pointsets, polygons);
}


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/ball_query_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2022 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "mlu_common_helper.h"

void ball_query_forward_mlu(int b, int n, int m, float min_radius,
                            float max_radius, int nsample, const Tensor new_xyz,
                            const Tensor xyz, Tensor idx) {
  auto new_xyz_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      new_xyz, new_xyz.suggest_memory_format());
  auto xyz_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      xyz, new_xyz.suggest_memory_format());
  auto idx_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      idx, new_xyz.suggest_memory_format());

  MluOpTensorDescriptor new_xyz_desc, xyz_desc, idx_desc;
  new_xyz_desc.set(new_xyz_contiguous);
  xyz_desc.set(xyz_contiguous);
  idx_desc.set(idx_contiguous);

  auto new_xyz_impl = torch_mlu::getMluTensorImpl(new_xyz_contiguous);
  auto xyz_impl = torch_mlu::getMluTensorImpl(xyz_contiguous);
  auto idx_impl = torch_mlu::getMluTensorImpl(idx_contiguous);
  auto new_xyz_ptr = new_xyz_impl->cnnlMalloc();
  auto xyz_ptr = xyz_impl->cnnlMalloc();
  auto idx_ptr = idx_impl->cnnlMalloc();

  auto handle = mluOpGetCurrentHandle();
  TORCH_MLUOP_CHECK(mluOpBallQuery(
      handle, new_xyz_desc.desc(), new_xyz_ptr, xyz_desc.desc(), xyz_ptr,
      min_radius, max_radius, nsample, idx_desc.desc(), idx_ptr));
}

// Forward declaration of the device-agnostic dispatcher; its definition is
// not in this file.
void ball_query_forward_impl(int b, int n, int m, float min_radius,
                             float max_radius, int nsample,
                             const Tensor new_xyz, const Tensor xyz,
                             Tensor idx);

// Associates ball_query_forward_mlu with the MLU device for that dispatcher
// (the macro is defined elsewhere; presumably it runs at static-init time).
REGISTER_DEVICE_IMPL(ball_query_forward_impl, MLU, ball_query_forward_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/bbox_overlaps_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2021 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/

#include "mlu_common_helper.h"

// MLU implementation of bbox_overlaps, backed by mluOpBboxOverlaps.
//
// bboxes1, bboxes2: 2-D Float or Half tensors of the same dtype (checked).
// ious:             output tensor; left untouched when either input has
//                   zero rows.
// mode, aligned, offset: forwarded to mluOpBboxOverlaps as-is.
// Raises via TORCH_CHECK on a failed dtype / dimension check and via
// TORCH_MLUOP_CHECK on a failed mluOp call.
void bbox_overlaps_mlu(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                       const int32_t mode, const bool aligned,
                       const int32_t offset) {
  // check dtype
  TORCH_CHECK(
      bboxes1.scalar_type() == at::kFloat || bboxes1.scalar_type() == at::kHalf,
      "Data type of input should be Float or Half. But now input type is ",
      bboxes1.scalar_type(), ".");
  TORCH_CHECK(bboxes1.scalar_type() == bboxes2.scalar_type(),
              "bboxes1's dtype should be the same with bboxes2's dtype.");

  // params check
  TORCH_CHECK(bboxes1.dim() == 2, "bboxes1 should be a 2d tensor, got ",
              bboxes1.dim(), "D");
  TORCH_CHECK(bboxes2.dim() == 2, "bboxes2 should be a 2d tensor, got ",
              bboxes2.dim(), "D");

  auto rows = bboxes1.size(0);
  auto cols = bboxes2.size(0);

  if (rows * cols == 0) {
    // return if zero element
    return;
  }

  // Each invocation is expected to introduce <name>_desc and <name>_ptr for
  // its tensor (both are used in the call below); the macro itself is
  // defined in mlu_common_helper.h.
  INITIAL_MLU_PARAM_WITH_TENSOR(bboxes1);
  INITIAL_MLU_PARAM_WITH_TENSOR(bboxes2);
  INITIAL_MLU_PARAM_WITH_TENSOR(ious);

  // get compute handle
  auto handle = mluOpGetCurrentHandle();

  TORCH_MLUOP_CHECK(mluOpBboxOverlaps(
      handle, mode, aligned, offset, bboxes1_desc.desc(), bboxes1_ptr,
      bboxes2_desc.desc(), bboxes2_ptr, ious_desc.desc(), ious_ptr));
}

// Forward declaration of the device-agnostic dispatcher; its definition is
// not in this file.
void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                        const int mode, const bool aligned, const int offset);

// Associates bbox_overlaps_mlu with the MLU device for that dispatcher
// (the macro is defined elsewhere).
REGISTER_DEVICE_IMPL(bbox_overlaps_impl, MLU, bbox_overlaps_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/box_iou_rotated.cpp
================================================
/*************************************************************************
 * Copyright (C) 2022 by Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "mlu_common_helper.h"

void BoxIouRotatedMLUKernelLauncher(const Tensor boxes1, const Tensor boxes2,
                                    Tensor ious, const int mode_flag,
                                    const bool aligned) {
  // get compute handle
  auto handle = mluOpGetCurrentHandle();

  auto boxes1_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      boxes1, boxes1.suggest_memory_format());
  auto boxes2_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      boxes2, boxes2.suggest_memory_format());
  auto ious_contiguous =
      torch_mlu::cnnl::ops::cnnl_contiguous(ious, ious.suggest_memory_format());

  MluOpTensorDescriptor boxes1_desc, boxes2_desc, ious_desc;
  boxes1_desc.set(boxes1_contiguous);
  boxes2_desc.set(boxes2_contiguous);
  ious_desc.set(ious_contiguous);

  auto boxes1_impl = torch_mlu::getMluTensorImpl(boxes1_contiguous);
  auto boxes2_impl = torch_mlu::getMluTensorImpl(boxes2_contiguous);
  auto ious_impl = torch_mlu::getMluTensorImpl(ious_contiguous);

  auto boxes1_ptr = boxes1_impl->cnnlMalloc();
  auto boxes2_ptr = boxes2_impl->cnnlMalloc();
  auto ious_ptr = ious_impl->cnnlMalloc();

  CNLOG(INFO) << "Call mluOpBoxIouRotated().";
  TORCH_MLUOP_CHECK(mluOpBoxIouRotated(
      handle, mode_flag, aligned, boxes1_desc.desc(), boxes1_ptr,
      boxes2_desc.desc(), boxes2_ptr, ious_desc.desc(), ious_ptr));
}

// Adapter with the signature expected by box_iou_rotated_impl; forwards all
// arguments unchanged to BoxIouRotatedMLUKernelLauncher.
void box_iou_rotated_mlu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                         const int mode_flag, const bool aligned) {
  BoxIouRotatedMLUKernelLauncher(boxes1, boxes2, ious, mode_flag, aligned);
}

// Forward declaration of the device-agnostic dispatcher; its definition is
// not in this file.
void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                          const int mode_flag, const bool aligned);

// Associates box_iou_rotated_mlu with the MLU device for that dispatcher
// (the macro is defined elsewhere).
REGISTER_DEVICE_IMPL(box_iou_rotated_impl, MLU, box_iou_rotated_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/carafe_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2022 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "mlu_common_helper.h"

void CARAFEForwardMLUKernelLauncher(const Tensor input, const Tensor mask,
                                    Tensor rinput, Tensor routput, Tensor rmask,
                                    Tensor output, const int kernel_size,
                                    const int group_size,
                                    const int scale_factor) {
  // check tensor data type
  TORCH_CHECK(
      input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
      "Data type of input should be Float or Half. But now input type is ",
      input.scalar_type(), ".");

  TORCH_CHECK(mask.scalar_type() == input.scalar_type(),
              "Data types of input and mask should be the same, but got ",
              input.scalar_type(), " and ", mask.scalar_type());

  // check number of dimensions
  TORCH_CHECK(input.dim() == 4, "input should be a 4-D tensor, but has ",
              input.dim(), "D.");
  TORCH_CHECK(mask.dim() == 4, "mask should be a 4-D tensor, but has ",
              input.dim(), "D.");

  // return fast on zero-element tensor
  if (output.numel() == 0) {
    output = at::zeros(output.sizes().vec(), output.options());
    return;
  }

  // convert NCHW to NHWC
  auto memory_format_input_nhwc =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
  auto rinput_ =
      torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format_input_nhwc);

  auto memory_format_mask_nhwc =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(mask.dim());
  auto rmask_ =
      torch_mlu::cnnl::ops::cnnl_contiguous(mask, memory_format_mask_nhwc);

  auto memory_format_output_nhwc =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(output.dim());
  auto routput_ =
      torch_mlu::cnnl::ops::cnnl_contiguous(output, memory_format_output_nhwc);

  // set tensor descriptor
  MluOpTensorDescriptor input_desc, mask_desc, output_desc;
  input_desc.set_with_layout(rinput_, MLUOP_LAYOUT_NHWC);
  mask_desc.set_with_layout(rmask_, MLUOP_LAYOUT_NHWC);
  output_desc.set_with_layout(routput_, MLUOP_LAYOUT_NHWC);

  // get ptr of tensors
  auto input_impl = torch_mlu::getMluTensorImpl(rinput_);
  auto input_ptr = input_impl->cnnlMalloc();
  auto mask_impl = torch_mlu::getMluTensorImpl(rmask_);
  auto mask_ptr = mask_impl->cnnlMalloc();
  auto output_impl = torch_mlu::getMluTensorImpl(routput_);
  auto output_ptr = output_impl->cnnlMalloc();

  // set op descriptor
  auto handle = mluOpGetCurrentHandle();
  mluOpCarafeDescriptor_t carafe_desc;
  TORCH_MLUOP_CHECK(mluOpCreateCarafeDescriptor(&carafe_desc));
  TORCH_MLUOP_CHECK(mluOpSetCarafeDescriptor(
      carafe_desc, input.dim(), kernel_size, group_size, scale_factor));
  // launch kernel
  TORCH_MLUOP_CHECK(mluOpCarafeForward(handle, carafe_desc, input_desc.desc(),
                                       input_ptr, mask_desc.desc(), mask_ptr,
                                       output_desc.desc(), output_ptr));
  // destroy op descriptor
  TORCH_MLUOP_CHECK(mluOpDestroyCarafeDescriptor(carafe_desc));

  // copy output from NHWC back into NCHW
  rinput.copy_(rinput_);
  output.copy_(routput_);
}

void CARAFEBackwardMLUKernelLauncher(
    const Tensor grad_output, const Tensor rinput, const Tensor mask,
    Tensor rgrad_output, Tensor rgrad_input_hs, Tensor rgrad_input,
    Tensor rgrad_mask, Tensor grad_input, Tensor grad_mask,
    const int kernel_size, const int group_size, const int scale_factor) {
  // data type check
  TORCH_CHECK(grad_output.scalar_type() == at::kFloat ||
                  grad_output.scalar_type() == at::kHalf,
              "grad_output type should be Float or Half, got ",
              grad_output.scalar_type());
  TORCH_CHECK(grad_output.scalar_type() == mask.scalar_type(),
              "mask should have the same type as grad_output");

  // dim check
  TORCH_CHECK(grad_output.dim() == 4, "grad_output should be a 4d tensor, got ",
              grad_output.dim(), "D");

  // param check
  TORCH_CHECK(kernel_size < 137, "kernel_size should be less than 137, got ",
              kernel_size);

  // convert NCHW to NHWC
  auto memory_format_input_nhwc =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(rinput.dim());
  auto rinput_ =
      torch_mlu::cnnl::ops::cnnl_contiguous(rinput, memory_format_input_nhwc);

  auto memory_format_mask_nhwc =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(mask.dim());
  auto rmask_ =
      torch_mlu::cnnl::ops::cnnl_contiguous(mask, memory_format_mask_nhwc);

  auto memory_format_grad_output_nhwc =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(grad_output.dim());
  auto rgrad_output_ = torch_mlu::cnnl::ops::cnnl_contiguous(
      grad_output, memory_format_grad_output_nhwc);

  auto memory_format_grad_input_nhwc =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(grad_input.dim());
  auto rgrad_input_ = torch_mlu::cnnl::ops::cnnl_contiguous(
                          grad_input, memory_format_grad_input_nhwc)
                          .zero_();

  auto memory_format_grad_mask_nhwc =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(grad_mask.dim());
  auto rgrad_mask_ = torch_mlu::cnnl::ops::cnnl_contiguous(
      grad_mask, memory_format_grad_mask_nhwc);

  // set tensor descriptor
  MluOpTensorDescriptor input_desc, mask_desc;
  input_desc.set_with_layout(rinput_, MLUOP_LAYOUT_NHWC);
  mask_desc.set_with_layout(rmask_, MLUOP_LAYOUT_NHWC);

  MluOpTensorDescriptor grad_output_desc, grad_input_desc, grad_mask_desc;
  grad_output_desc.set_with_layout(rgrad_output_, MLUOP_LAYOUT_NHWC);
  grad_input_desc.set_with_layout(rgrad_input_, MLUOP_LAYOUT_NHWC);
  grad_mask_desc.set_with_layout(rgrad_mask_, MLUOP_LAYOUT_NHWC);

  // get ptr of tensors
  auto input_impl = torch_mlu::getMluTensorImpl(rinput_);
  auto input_ptr = input_impl->cnnlMalloc();
  auto mask_impl = torch_mlu::getMluTensorImpl(rmask_);
  auto mask_ptr = mask_impl->cnnlMalloc();
  auto grad_output_impl = torch_mlu::getMluTensorImpl(rgrad_output_);
  auto grad_output_ptr = grad_output_impl->cnnlMalloc();
  auto grad_input_impl = torch_mlu::getMluTensorImpl(rgrad_input_);
  auto grad_input_ptr = grad_input_impl->cnnlMalloc();
  auto grad_mask_impl = torch_mlu::getMluTensorImpl(rgrad_mask_);
  auto grad_mask_ptr = grad_mask_impl->cnnlMalloc();

  // set op descriptor
  auto handle = mluOpGetCurrentHandle();
  mluOpCarafeDescriptor_t carafe_desc;
  TORCH_MLUOP_CHECK(mluOpCreateCarafeDescriptor(&carafe_desc));
  TORCH_MLUOP_CHECK(mluOpSetCarafeDescriptor(
      carafe_desc, grad_output.dim(), kernel_size, group_size, scale_factor));
  // launch kernel
  TORCH_MLUOP_CHECK(mluOpCarafeBackward(
      handle, carafe_desc, input_desc.desc(), input_ptr, mask_desc.desc(),
      mask_ptr, grad_output_desc.desc(), grad_output_ptr,
      grad_input_desc.desc(), grad_input_ptr, grad_mask_desc.desc(),
      grad_mask_ptr));
  // destroy op descriptor
  TORCH_MLUOP_CHECK(mluOpDestroyCarafeDescriptor(carafe_desc));

  // copy output from NHWC back into NCHW
  grad_input.copy_(rgrad_input_);
  grad_mask.copy_(rgrad_mask_);
}

// Adapter with the signature expected by carafe_forward_impl; forwards all
// arguments, in the same order, to CARAFEForwardMLUKernelLauncher.
void carafe_forward_mlu(Tensor features, Tensor masks, Tensor rfeatures,
                        Tensor routput, Tensor rmasks, Tensor output,
                        int kernel_size, int group_size, int scale_factor) {
  CARAFEForwardMLUKernelLauncher(features, masks, rfeatures, routput, rmasks,
                                 output, kernel_size, group_size, scale_factor);
}

// Adapter with the signature expected by carafe_backward_impl; forwards all
// arguments, in the same order, to CARAFEBackwardMLUKernelLauncher.
void carafe_backward_mlu(Tensor top_grad, Tensor rfeatures, Tensor masks,
                         Tensor rtop_grad, Tensor rbottom_grad_hs,
                         Tensor rbottom_grad, Tensor rmask_grad,
                         Tensor bottom_grad, Tensor mask_grad, int kernel_size,
                         int group_size, int scale_factor) {
  CARAFEBackwardMLUKernelLauncher(top_grad, rfeatures, masks, rtop_grad,
                                  rbottom_grad_hs, rbottom_grad, rmask_grad,
                                  bottom_grad, mask_grad, kernel_size,
                                  group_size, scale_factor);
}

// Forward declarations of the device-agnostic CARAFE dispatchers; their
// definitions are not in this file.
void carafe_forward_impl(Tensor features, Tensor masks, Tensor rfeatures,
                         Tensor routput, Tensor rmasks, Tensor output,
                         int kernel_size, int group_size, int scale_factor);

void carafe_backward_impl(Tensor top_grad, Tensor rfeatures, Tensor masks,
                          Tensor rtop_grad, Tensor rbottom_grad_hs,
                          Tensor rbottom_grad, Tensor rmask_grad,
                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,
                          int group_size, int scale_factor);

// Associate the MLU adapters above with the MLU device for each dispatcher
// (the macro is defined elsewhere).
REGISTER_DEVICE_IMPL(carafe_forward_impl, MLU, carafe_forward_mlu);
REGISTER_DEVICE_IMPL(carafe_backward_impl, MLU, carafe_backward_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/deform_roi_pool_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2022 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "mlu_common_helper.h"

// Launches the deformable RoI pooling forward op on MLU through
// mluOpDeformRoiPoolForward.
//
// input:   feature tensor; converted to channels-last for the kernel.
// rois:    RoI tensor; made contiguous in its own suggested format.
// offset:  optional; when undefined or empty, a NULL descriptor and a NULL
//          pointer are passed to the kernel instead.
// output:  receives the result (copied back from the channels-last buffer).
// pooled_height, pooled_width, spatial_scale, sampling_ratio, gamma:
//          forwarded to the kernel unchanged.
// A failing mluOp call is reported through TORCH_MLUOP_CHECK.
void DeformRoIPoolForwardMLUKernelLauncher(Tensor input, Tensor rois,
                                           Tensor offset, Tensor output,
                                           int pooled_height, int pooled_width,
                                           float spatial_scale,
                                           int sampling_ratio, float gamma) {
  // output shares input's channels-last format (both use input.dim()).
  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
  auto input_ = torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);
  auto rois_contiguous =
      torch_mlu::cnnl::ops::cnnl_contiguous(rois, rois.suggest_memory_format());
  auto output_contiguous =
      torch_mlu::cnnl::ops::cnnl_contiguous(output, memory_format);

  MluOpTensorDescriptor input_desc, rois_desc, offset_desc, output_desc;
  input_desc.set_with_layout(input_, MLUOP_LAYOUT_NHWC);
  rois_desc.set(rois_contiguous);
  output_desc.set_with_layout(output_contiguous, MLUOP_LAYOUT_NHWC);

  mluOpTensorDescriptor_t offset_real_desc = NULL;
  void *offset_ptr = NULL;
  // Declared at function scope so the tensor behind offset_ptr stays alive
  // until the kernel call below. When it was local to the `if`, a freshly
  // made contiguous copy (if cnnl_contiguous returns one) was released at
  // the closing brace while offset_ptr still pointed into it.
  Tensor offset_contiguous;
  if (offset.defined() && offset.numel() > 0) {
    offset_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
        offset, offset.suggest_memory_format());
    offset_desc.set(offset_contiguous);
    offset_real_desc = offset_desc.desc();
    auto offset_impl = torch_mlu::getMluTensorImpl(offset_contiguous);
    offset_ptr = offset_impl->cnnlMalloc();
  }

  // get ptr of tensors
  auto input_impl = torch_mlu::getMluTensorImpl(input_);
  auto input_ptr = input_impl->cnnlMalloc();
  auto rois_impl = torch_mlu::getMluTensorImpl(rois_contiguous);
  auto rois_ptr = rois_impl->cnnlMalloc();
  auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);
  auto output_ptr = output_impl->cnnlMalloc();

  // get compute handle
  auto handle = mluOpGetCurrentHandle();
  TORCH_MLUOP_CHECK(mluOpDeformRoiPoolForward(
      handle, input_desc.desc(), input_ptr, rois_desc.desc(), rois_ptr,
      offset_real_desc, offset_ptr, pooled_height, pooled_width, spatial_scale,
      sampling_ratio, gamma, output_desc.desc(), output_ptr));

  // hand the channels-last result back to the caller's tensor
  output.copy_(output_contiguous);
}

// Launches the deformable RoI pooling backward op on MLU through
// mluOpDeformRoiPoolBackward.
//
// grad_output: gradient of the forward output; converted to channels-last.
// input:       forward input; converted to channels-last.
// rois:        RoI tensor; made contiguous in its own suggested format.
// offset:      optional; NULL descriptor / pointer when undefined or empty.
// grad_input:  receives the input gradient (copied back from the
//              channels-last buffer).
// grad_offset: optional; NULL descriptor / pointer when undefined or empty,
//              otherwise receives the offset gradient.
// pooled_height, pooled_width, spatial_scale, sampling_ratio, gamma:
//              forwarded to the kernel unchanged.
// A failing mluOp call is reported through TORCH_MLUOP_CHECK.
void DeformRoIPoolBackwardMLUKernelLauncher(
    Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
    Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
    float spatial_scale, int sampling_ratio, float gamma) {
  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(grad_output.dim());
  auto grad_output_ =
      torch_mlu::cnnl::ops::cnnl_contiguous(grad_output, memory_format);
  // From here on memory_format is the one derived from input.dim(); it is
  // used for both input and grad_input.
  memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
  auto input_ = torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);
  auto rois_contiguous =
      torch_mlu::cnnl::ops::cnnl_contiguous(rois, rois.suggest_memory_format());
  auto grad_input_ =
      torch_mlu::cnnl::ops::cnnl_contiguous(grad_input, memory_format);

  // get ptr of tensors
  auto grad_output_impl = torch_mlu::getMluTensorImpl(grad_output_);
  auto grad_output_ptr = grad_output_impl->cnnlMalloc();
  auto input_impl = torch_mlu::getMluTensorImpl(input_);
  auto input_ptr = input_impl->cnnlMalloc();
  auto rois_impl = torch_mlu::getMluTensorImpl(rois_contiguous);
  auto rois_ptr = rois_impl->cnnlMalloc();
  auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input_);
  auto grad_input_ptr = grad_input_impl->cnnlMalloc();

  MluOpTensorDescriptor grad_output_desc, input_desc, rois_desc, offset_desc,
      grad_input_desc, grad_offset_desc;
  grad_output_desc.set_with_layout(grad_output_, MLUOP_LAYOUT_NHWC);
  input_desc.set_with_layout(input_, MLUOP_LAYOUT_NHWC);
  rois_desc.set(rois_contiguous);
  grad_input_desc.set_with_layout(grad_input_, MLUOP_LAYOUT_NHWC);

  // The two optional tensors are declared at function scope so whatever
  // offset_ptr / grad_offset_ptr point into stays alive until the kernel
  // call. When they were local to their `if` blocks, a freshly made
  // contiguous copy (if cnnl_contiguous returns one) was released at the
  // closing brace while its pointer was still in use.
  mluOpTensorDescriptor_t offset_real_desc = NULL;
  void *offset_ptr = NULL;
  Tensor offset_contiguous;
  if (offset.defined() && offset.numel() > 0) {
    offset_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
        offset, offset.suggest_memory_format());
    offset_desc.set(offset_contiguous);
    offset_real_desc = offset_desc.desc();
    auto offset_impl = torch_mlu::getMluTensorImpl(offset_contiguous);
    offset_ptr = offset_impl->cnnlMalloc();
  }
  mluOpTensorDescriptor_t grad_offset_real_desc = NULL;
  void *grad_offset_ptr = NULL;
  Tensor grad_offset_contiguous;
  if (grad_offset.defined() && grad_offset.numel() > 0) {
    grad_offset_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
        grad_offset, grad_offset.suggest_memory_format());
    grad_offset_desc.set(grad_offset_contiguous);
    grad_offset_real_desc = grad_offset_desc.desc();
    auto grad_offset_impl = torch_mlu::getMluTensorImpl(grad_offset_contiguous);
    grad_offset_ptr = grad_offset_impl->cnnlMalloc();
  }

  // get compute handle
  auto handle = mluOpGetCurrentHandle();
  TORCH_MLUOP_CHECK(mluOpDeformRoiPoolBackward(
      handle, grad_output_desc.desc(), grad_output_ptr, input_desc.desc(),
      input_ptr, rois_desc.desc(), rois_ptr, offset_real_desc, offset_ptr,
      pooled_height, pooled_width, spatial_scale, sampling_ratio, gamma,
      grad_input_desc.desc(), grad_input_ptr, grad_offset_real_desc,
      grad_offset_ptr));

  // Hand the results back to the caller's tensors. grad_offset is treated
  // like grad_input: the kernel wrote into its contiguous buffer, so copy
  // that buffer back whenever the kernel was given one.
  grad_input.copy_(grad_input_);
  if (grad_offset_ptr != NULL) {
    grad_offset.copy_(grad_offset_contiguous);
  }
}

// Adapter with the signature expected by deform_roi_pool_forward_impl;
// forwards all arguments, in the same order, to
// DeformRoIPoolForwardMLUKernelLauncher.
void deform_roi_pool_forward_mlu(Tensor input, Tensor rois, Tensor offset,
                                 Tensor output, int pooled_height,
                                 int pooled_width, float spatial_scale,
                                 int sampling_ratio, float gamma) {
  DeformRoIPoolForwardMLUKernelLauncher(input, rois, offset, output,
                                        pooled_height, pooled_width,
                                        spatial_scale, sampling_ratio, gamma);
}

// Adapter with the signature expected by deform_roi_pool_backward_impl;
// forwards all arguments, in the same order, to
// DeformRoIPoolBackwardMLUKernelLauncher.
void deform_roi_pool_backward_mlu(Tensor grad_output, Tensor input, Tensor rois,
                                  Tensor offset, Tensor grad_input,
                                  Tensor grad_offset, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int sampling_ratio, float gamma) {
  DeformRoIPoolBackwardMLUKernelLauncher(
      grad_output, input, rois, offset, grad_input, grad_offset, pooled_height,
      pooled_width, spatial_scale, sampling_ratio, gamma);
}

// Forward declarations of the device-agnostic deformable RoI pooling
// dispatchers; their definitions are not in this file.
void deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,
                                  Tensor output, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int sampling_ratio, float gamma);

void deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,
                                   Tensor rois, Tensor offset,
                                   Tensor grad_input, Tensor grad_offset,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale, int sampling_ratio,
                                   float gamma);

// Associate the MLU adapters above with the MLU device for each dispatcher
// (the macro is defined elsewhere).
REGISTER_DEVICE_IMPL(deform_roi_pool_forward_impl, MLU,
                     deform_roi_pool_forward_mlu);
REGISTER_DEVICE_IMPL(deform_roi_pool_backward_impl, MLU,
                     deform_roi_pool_backward_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/diff_iou_rotated_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2023 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "mlu_common_helper.h"

// Runs mluOpDiffIouRotatedSortVerticesForward on MLU and returns its index
// output.
//
// Requirements enforced below: `vertices` is Float, `mask` is Bool,
// `num_valid` is Int32, and dimension 2 of both `vertices` and `mask` is 24.
// Returns an Int32 tensor of shape (vertices.size(0), vertices.size(1), 9)
// created with `num_valid`'s options, or an empty Int32 tensor of shape (0)
// when `vertices` has no elements.
Tensor diff_iou_rotated_sort_vertices_forward_mlu(Tensor vertices, Tensor mask,
                                                  Tensor num_valid) {
  // dtype validation
  TORCH_CHECK(vertices.scalar_type() == at::kFloat,
              "vertices type should be Float, got ", vertices.scalar_type());
  TORCH_CHECK(mask.scalar_type() == at::kBool, "mask should be Bool, got ",
              mask.scalar_type());
  TORCH_CHECK(num_valid.scalar_type() == at::kInt,
              "num_valid type should be Int32, got ", num_valid.scalar_type());
  // shape validation
  TORCH_CHECK(vertices.size(2) == 24, "vertices.dim(2) should be 24, got ",
              vertices.size(2));
  TORCH_CHECK(mask.size(2) == 24, "mask.dim(2) should be 24, got ",
              mask.size(2));

  // Both possible results share the same tensor options.
  const auto idx_options = num_valid.options().dtype(at::kInt);

  // Nothing to sort: hand back an empty index tensor.
  if (vertices.numel() == 0) {
    return at::empty({0}, idx_options);
  }

  auto idx = at::empty({vertices.size(0), vertices.size(1), 9}, idx_options);

  // Each macro defines <name>_desc and <name>_ptr for the tensor it is given.
  INITIAL_MLU_PARAM_WITH_TENSOR(vertices);
  INITIAL_MLU_PARAM_WITH_TENSOR(mask);
  INITIAL_MLU_PARAM_WITH_TENSOR(num_valid);
  INITIAL_MLU_PARAM_WITH_TENSOR(idx);

  mluOpHandle_t op_handle = mluOpGetCurrentHandle();

  TORCH_MLUOP_CHECK(mluOpDiffIouRotatedSortVerticesForward(
      op_handle, vertices_desc.desc(), vertices_ptr, mask_desc.desc(),
      mask_ptr, num_valid_desc.desc(), num_valid_ptr, idx_desc.desc(),
      idx_ptr));
  return idx;
}

// Forward declaration of the `*_impl` entry point; its definition is not part
// of this file.
Tensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,
                                                   Tensor num_valid);

// Associate the entry point with the MLU implementation above.
REGISTER_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl, MLU,
                     diff_iou_rotated_sort_vertices_forward_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/focal_loss_sigmoid_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2021 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include <string>
#include <vector>

#include "mlu_common_helper.h"

// Sigmoid focal loss forward on MLU via mluOpFocalLossSigmoidForward, using
// MLUOP_COMPUTATION_FAST and no loss reduction.
//
// input  : Float or Half.
// target : Int or Long; converted to Int32 before the kernel call.
// weight : same dtype as `input` when it owns data; a tensor whose data
//          pointer is null is accepted and only logged.
// output : written in place. If `output` is not contiguous the kernel writes
//          to a contiguous temporary that is copied back afterwards.
// gamma  : must be >= 0.
// Returns without doing anything when input, target or output is empty.
void sigmoid_focal_loss_forward_mlu(Tensor input, Tensor target, Tensor weight,
                                    Tensor output, const float gamma,
                                    const float alpha) {
  // params check
  TORCH_CHECK(gamma >= 0, "gamma should be greater than or equal to 0. ",
              "But now gamma is ", gamma, ".");

  // check dtype
  TORCH_CHECK(
      input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
      "Data type of input should be Float or Half. But now input type is ",
      input.scalar_type(), ".");

  TORCH_CHECK(
      (target.scalar_type() == at::kInt || target.scalar_type() == at::kLong),
      "target type should be Int or Long. ", "But now target type is ",
      target.scalar_type(), ".");

  if (weight.data_ptr() != nullptr) {
    TORCH_CHECK(weight.scalar_type() == input.scalar_type(),
                "Data types of input and weight should be the same. But now "
                "input type is ",
                input.scalar_type(), ", weight type is ", weight.scalar_type(),
                ".");
  } else {
    CNLOG(INFO) << "weight is a empty tensor.";
  }

  // return if zero-element
  if (input.numel() == 0 || target.numel() == 0 || output.numel() == 0) {
    return;
  }

  // contiguous
  auto input_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      input, input.suggest_memory_format());
  // target only supports int32
  auto target_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      target.toType(at::kInt), target.suggest_memory_format());
  auto weight_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      weight, weight.suggest_memory_format());
  auto output_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      output, output.suggest_memory_format());

  // set tensor descriptor
  MluOpTensorDescriptor input_desc, target_desc, weight_desc, output_desc;
  input_desc.set(input_contiguous);
  target_desc.set(target_contiguous);
  weight_desc.set(weight_contiguous);
  output_desc.set(output_contiguous);

  // get ptr of tensors
  auto input_impl = torch_mlu::getMluTensorImpl(input_contiguous);
  auto input_ptr = input_impl->cnnlMalloc();
  auto target_impl = torch_mlu::getMluTensorImpl(target_contiguous);
  auto target_ptr = target_impl->cnnlMalloc();
  auto weight_impl = torch_mlu::getMluTensorImpl(weight_contiguous);
  auto weight_ptr = weight_impl->cnnlMalloc();
  auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);
  auto output_ptr = output_impl->cnnlMalloc();

  // set preferred computation mode and reduction approach
  mluOpComputationPreference_t prefer = MLUOP_COMPUTATION_FAST;
  mluOpLossReduction_t reduction = MLUOP_LOSS_REDUCTION_NONE;

  auto handle = mluOpGetCurrentHandle();

  // launch kernel
  TORCH_MLUOP_CHECK(mluOpFocalLossSigmoidForward(
      handle, prefer, reduction, input_desc.desc(), input_ptr,
      target_desc.desc(), target_ptr, weight_desc.desc(), weight_ptr, alpha,
      gamma, output_desc.desc(), output_ptr));

  // The kernel wrote into `output_contiguous`. When that is a separate buffer
  // (i.e. `output` was not contiguous), propagate the result to the caller's
  // tensor; otherwise the result would be silently dropped.
  if (!output_contiguous.is_same(output)) {
    output.copy_(output_contiguous);
  }
}

// Sigmoid focal loss backward on MLU via mluOpFocalLossSigmoidBackward, using
// MLUOP_COMPUTATION_HIGH_PRECISION and no loss reduction.
//
// input  : Float or Half.
// target : Int or Long; converted to Int32 before the kernel call.
// weight : same dtype as `input` when it owns data; a tensor whose data
//          pointer is null is accepted and only logged.
// output : destination of the kernel result, written in place. If `output` is
//          not contiguous the kernel writes to a contiguous temporary that is
//          copied back afterwards.
//          NOTE(review): the matching `*_impl` declaration names this
//          parameter `grad_input` -- presumably the gradient w.r.t. `input`.
// gamma  : must be >= 0.
// Returns without doing anything when input, target or output is empty.
void sigmoid_focal_loss_backward_mlu(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, const float gamma,
                                     const float alpha) {
  // params check
  TORCH_CHECK(gamma >= 0, "gamma should be greater than or equal to 0. ",
              "But now gamma is ", gamma, ".");
  // check dtype
  TORCH_CHECK(
      input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
      "Data type of input should be Float or Half. But now input type is ",
      input.scalar_type(), ".");

  TORCH_CHECK(
      (target.scalar_type() == at::kInt || target.scalar_type() == at::kLong),
      "target type should be Int or Long. ", "But now target type is ",
      target.scalar_type(), ".");

  if (weight.data_ptr() != nullptr) {
    TORCH_CHECK(weight.scalar_type() == input.scalar_type(),
                "Data types of input and weight should be the same. But now "
                "input type is ",
                input.scalar_type(), ", weight type is ", weight.scalar_type(),
                ".");
  } else {
    CNLOG(INFO) << "weight is a empty tensor.";
  }

  if (input.numel() == 0 || target.numel() == 0 || output.numel() == 0) {
    // return if zero-element
    return;
  }

  // contiguous
  auto input_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      input, input.suggest_memory_format());
  // target only supports int32
  auto target_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      target.toType(at::kInt), target.suggest_memory_format());
  auto weight_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      weight, weight.suggest_memory_format());
  auto output_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      output, output.suggest_memory_format());

  // set tensor descriptor
  MluOpTensorDescriptor input_desc, target_desc, weight_desc, output_desc;
  input_desc.set(input_contiguous);
  target_desc.set(target_contiguous);
  weight_desc.set(weight_contiguous);
  output_desc.set(output_contiguous);

  // get ptr of tensors
  auto input_impl = torch_mlu::getMluTensorImpl(input_contiguous);
  auto input_ptr = input_impl->cnnlMalloc();
  auto target_impl = torch_mlu::getMluTensorImpl(target_contiguous);
  auto target_ptr = target_impl->cnnlMalloc();
  auto weight_impl = torch_mlu::getMluTensorImpl(weight_contiguous);
  auto weight_ptr = weight_impl->cnnlMalloc();
  auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);
  auto output_ptr = output_impl->cnnlMalloc();

  // set preferred computation mode and reduction approach
  // backward only supports MLUOP_COMPUTATION_HIGH_PRECISION
  mluOpComputationPreference_t prefer = MLUOP_COMPUTATION_HIGH_PRECISION;
  mluOpLossReduction_t reduction = MLUOP_LOSS_REDUCTION_NONE;

  auto handle = mluOpGetCurrentHandle();

  // launch kernel
  TORCH_MLUOP_CHECK(mluOpFocalLossSigmoidBackward(
      handle, prefer, reduction, input_desc.desc(), input_ptr,
      target_desc.desc(), target_ptr, weight_desc.desc(), weight_ptr, alpha,
      gamma, output_desc.desc(), output_ptr));

  // The kernel wrote into `output_contiguous`. When that is a separate buffer
  // (i.e. `output` was not contiguous), propagate the result to the caller's
  // tensor; otherwise the result would be silently dropped.
  if (!output_contiguous.is_same(output)) {
    output.copy_(output_contiguous);
  }
}

// Forward declarations of the `*_impl` entry points; their definitions are not
// part of this file.
void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha);

// Note: the fourth parameter is called `grad_input` here, while the MLU
// implementation registered below names it `output`.
void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
                                      Tensor weight, Tensor grad_input,
                                      float gamma, float alpha);

// Associate each entry point with its MLU implementation.
REGISTER_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, MLU,
                     sigmoid_focal_loss_forward_mlu);
REGISTER_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, MLU,
                     sigmoid_focal_loss_backward_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/iou3d_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2022 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/

#include "mlu_common_helper.h"

// Runs mluOpNms over `boxes` on MLU.
//
// boxes         : input boxes; boxes.size(0) is used as the maximum number of
//                 outputs. Nothing is done when `boxes` has no elements.
// keep          : receives the kernel's output; the kernel writes through
//                 `keep`'s own device pointer.
// keep_num      : receives the kernel's output-size value through its own
//                 device pointer.
// iou_threshold : IoU threshold stored in the NMS descriptor.
//
// The NMS descriptor is destroyed on both the success and the failure path.
void IoU3DNMS3DMLUKernelLauncher(Tensor boxes, Tensor &keep, Tensor &keep_num,
                                 float iou_threshold) {
  if (boxes.numel() == 0) {
    return;
  }

  int input_box_num = boxes.size(0);
  auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes);
  // Int32 conversion of `keep`, used ONLY to build the output descriptor; the
  // pointer handed to the kernel below comes from `keep` itself.
  // NOTE(review): presumably this relies on how torch_mlu stores `keep` on
  // the device -- confirm before changing.
  auto output = keep.to(boxes.options().dtype(at::kInt));

  MluOpTensorDescriptor boxes_desc, output_desc;
  boxes_desc.set(boxes_);
  output_desc.set(output);

  // workspace
  size_t workspace_size = 0;
  auto handle = mluOpGetCurrentHandle();
  TORCH_MLUOP_CHECK(mluOpGetNmsWorkspaceSize(handle, boxes_desc.desc(), NULL,
                                             &workspace_size));
  auto workspace = at::empty(workspace_size, boxes.options().dtype(at::kByte));

  // get ptr of tensors
  auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_);
  auto boxes_ptr = boxes_impl->cnnlMalloc();
  auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
  auto workspace_ptr = workspace_impl->cnnlMalloc();
  auto output_impl = torch_mlu::getMluTensorImpl(keep);
  auto output_ptr = output_impl->cnnlMalloc();
  auto output_size_impl = torch_mlu::getMluTensorImpl(keep_num);
  auto output_size_ptr = output_size_impl->cnnlMalloc();

  // nms desc
  mluOpNmsDescriptor_t nms_desc;
  const mluOpNmsBoxPointMode_t box_mode = (mluOpNmsBoxPointMode_t)0;
  const mluOpNmsOutputMode_t output_mode = (mluOpNmsOutputMode_t)0;
  const mluOpNmsAlgo_t algo = (mluOpNmsAlgo_t)0;
  const mluOpNmsMethodMode_t method_mode = (mluOpNmsMethodMode_t)0;
  const float soft_nms_sigma = 0.0;
  const float confidence_threshold = 0.0;
  const int input_layout = 0;
  const bool pad_to_max_output_size = false;
  const int max_output_size = input_box_num;
  const float offset = 0.0;

  TORCH_MLUOP_CHECK(mluOpCreateNmsDescriptor(&nms_desc));
  try {
    TORCH_MLUOP_CHECK(mluOpSetNmsDescriptor(
        nms_desc, box_mode, output_mode, algo, method_mode, iou_threshold,
        soft_nms_sigma, max_output_size, confidence_threshold, offset,
        input_layout, pad_to_max_output_size));

    TORCH_MLUOP_CHECK(mluOpNms(handle, nms_desc, boxes_desc.desc(), boxes_ptr,
                               NULL, NULL, workspace_ptr, workspace_size,
                               output_desc.desc(), output_ptr,
                               output_size_ptr));
  } catch (...) {
    // TORCH_MLUOP_CHECK throws on failure; release the descriptor before
    // propagating so it is not leaked. The destroy status is ignored here
    // because the original error is the one worth reporting.
    mluOpDestroyNmsDescriptor(nms_desc);
    throw;
  }
  TORCH_MLUOP_CHECK(mluOpDestroyNmsDescriptor(nms_desc));
}

// MLU implementation of 3D NMS: forwards all arguments to
// IoU3DNMS3DMLUKernelLauncher, passing `nms_overlap_thresh` as its IoU
// threshold.
void iou3d_nms3d_forward_mlu(const Tensor boxes, Tensor &keep, Tensor &keep_num,
                             float nms_overlap_thresh) {
  IoU3DNMS3DMLUKernelLauncher(boxes, keep, keep_num, nms_overlap_thresh);
}

// Forward declaration of the `*_impl` entry point (defined outside this file)
// and its association with the MLU implementation above.
void iou3d_nms3d_forward_impl(const Tensor boxes, Tensor &keep,
                              Tensor &keep_num, float nms_overlap_thresh);
REGISTER_DEVICE_IMPL(iou3d_nms3d_forward_impl, MLU, iou3d_nms3d_forward_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/masked_conv2d_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2022 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"

// Declarations of the device kernel launchers used by this file; their
// definitions live outside this file.
//
// Common arguments: `k_dim`/`k_type` give the launch dimensions and function
// type, `queue` is the queue to launch on, and `k_dtype` is the element type
// of the image/column data.
void KernelMaskedIm2colForward(
    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
    cnrtDataType_t k_dtype, const void *im_ptr, const int height,
    const int width, const int channels, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const void *mask_h_idx_ptr,
    const void *mask_w_idx_ptr, const int mask_cnt, void *col_ptr);

void KernelMaskedCol2imForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                               cnrtQueue_t queue, cnrtDataType_t k_dtype,
                               const void *col_ptr, const int height,
                               const int width, const int channels,
                               const void *mask_h_idx_ptr,
                               const void *mask_w_idx_ptr, const int mask_cnt,
                               void *im_ptr);

// Launch policy shared by the masked im2col / col2im kernels.
//
// Writes the launch shape into `*k_dim` and the function type into `*k_type`:
//   x = cores per cluster,
//   y = CEIL_ALIGN(mask_cnt, cores per cluster) / cores per cluster, capped at
//       the device's cluster count,
//   z = 1, type = CNRT_FUNC_TYPE_UNION1.
static void policyFunc(const int mask_cnt, cnrtDim3_t *k_dim,
                       cnrtFunctionType_t *k_type) {
  const size_t clusters_on_device =
      torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
  const size_t cores_per_cluster =
      torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  const size_t aligned_tasks = CEIL_ALIGN(mask_cnt, cores_per_cluster);

  // Never request more clusters than the device provides.
  size_t clusters_wanted = aligned_tasks / cores_per_cluster;
  if (clusters_wanted > clusters_on_device) {
    clusters_wanted = clusters_on_device;
  }

  *k_type = CNRT_FUNC_TYPE_UNION1;
  k_dim->x = cores_per_cluster;
  k_dim->y = clusters_wanted;
  k_dim->z = 1;
}

void MaskedIm2colForwardMLUKernelLauncher(const Tensor im,
                                          const Tensor mask_h_idx,
                                          const Tensor mask_w_idx, Tensor col,
                                          const int kernel_h,
                                          const int kernel_w, const int pad_h,
                                          const int pad_w) {
  // Check dtype.
  TORCH_CHECK(im.scalar_type() == at::kFloat || im.scalar_type() == at::kHalf,
              "im type should be Float or Half, got ", im.scalar_type(), ".");
  TORCH_CHECK(mask_h_idx.scalar_type() == at::kInt ||
                  mask_h_idx.scalar_type() == at::kLong,
              "mask_h_idx type should be Int or Long, got ",
              mask_h_idx.scalar_type(), ".");
  TORCH_CHECK(mask_w_idx.scalar_type() == at::kInt ||
                  mask_w_idx.scalar_type() == at::kLong,
              "mask_w_idx type should be Int or Long, got ",
              mask_w_idx.scalar_type(), ".");
  TORCH_CHECK(kernel_h > 0, "kernel_h should greater than 0, got ", kernel_h,
              ".");
  TORCH_CHECK(kernel_w > 0, "kernel_w should greater than 0, got ", kernel_w,
              ".");

  // zero element check
  TORCH_CHECK(im.numel() > 0, "im.numel should greater than zero, got ",
              im.numel(), ".");
  TORCH_CHECK(col.size(0) > 0, "col.size(0) should greater than zero, got ",
              col.size(0), ".");

  // large tensor check
  const size_t max_input_num = 2147483648;  // 2^31, 2G num
  TORCH_CHECK(im.numel() < max_input_num,
              "im.numel() should be less than 2147483648, got ", im.numel(),
              ".");
  TORCH_CHECK(col.numel() < max_input_num,
              "col.numel() should be less than 2147483648, got ", col.numel(),
              ".");

  const int channels = im.size(1);
  const int height = im.size(2);
  const int width = im.size(3);
  const int mask_cnt = mask_h_idx.size(0);

  // auto im_t = im.permute({0, 2, 3, 1}).contiguous();
  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(im.dim());
  auto im_ = torch_mlu::cnnl::ops::cnnl_contiguous(im, memory_format);
  auto col_ =
      at::zeros({mask_cnt, kernel_h * kernel_w, channels}, col.options());
  // calculate task dimension
  cnrtDim3_t k_dim;
  cnrtFunctionType_t k_type;
  policyFunc(mask_cnt, &k_dim, &k_type);

  // get compute queue
  auto queue = torch_mlu::getCurQueue();
  // get ptr of tensors
  auto im_impl = torch_mlu::getMluTensorImpl(im_);
  auto im_ptr = im_impl->cnnlMalloc();
  auto mask_h_idx_impl = torch_mlu::getMluTensorImpl(mask_h_idx);
  auto mask_h_idx_ptr = mask_h_idx_impl->cnnlMalloc();
  auto mask_w_idx_impl = torch_mlu::getMluTensorImpl(mask_w_idx);
  auto mask_w_idx_ptr = mask_w_idx_impl->cnnlMalloc();
  auto col_impl = torch_mlu::getMluTensorImpl(col_);
  auto col_ptr = col_impl->cnnlMalloc();

  // get comput dtype of input
  cnrtDataType_t data_type = torch_mlu::toCnrtDtype(im.dtype());

  // launch kernel
  CNLOG(INFO) << "Launch Kernel MLUKernelMaskedIm2colForward<<<" << k_dim.x
              << ", " << k_dim.y << ", " << k_dim.z << ">>>";
  KernelMaskedIm2colForward(k_dim, k_type, queue, data_type, im_ptr, height,
                            width, channels, kernel_h, kernel_w, pad_h, pad_w,
                            mask_h_idx_ptr, mask_w_idx_ptr, mask_cnt, col_ptr);

  col.copy_(col_.permute({2, 1, 0})
                .reshape({channels * kernel_h * kernel_w, mask_cnt})
                .contiguous());
}

void MaskedCol2imForwardMLUKernelLauncher(const Tensor col,
                                          const Tensor mask_h_idx,
                                          const Tensor mask_w_idx, Tensor im,
                                          const int height, const int width,
                                          const int channels) {
  // Check dtype.
  TORCH_CHECK(col.scalar_type() == at::kFloat || col.scalar_type() == at::kHalf,
              "col type should be Float or Half, got ", col.scalar_type(), ".");
  TORCH_CHECK(mask_h_idx.scalar_type() == at::kInt ||
                  mask_h_idx.scalar_type() == at::kLong,
              "mask_h_idx type should be Int or Long, got ",
              mask_h_idx.scalar_type(), ".");
  TORCH_CHECK(mask_w_idx.scalar_type() == at::kInt ||
                  mask_w_idx.scalar_type() == at::kLong,
              "mask_w_idx type should be Int or Long, got ",
              mask_w_idx.scalar_type(), ".");

  // zero element check
  TORCH_CHECK(im.numel() > 0, "im.numel should greater than zero, got ",
              im.numel(), ".");
  TORCH_CHECK(col.size(0) > 0, "col.size(0) should greater than zero, got ",
              col.size(0), ".");

  // large tensor check
  const size_t max_input_num = 2147483648;  // 2^31, 2G num
  TORCH_CHECK(im.numel() < max_input_num,
              "im.numel() should be less than 2147483648, got ", im.numel(),
              ".");
  TORCH_CHECK(col.numel() < max_input_num,
              "col.numel() should be less than 2147483648, got ", col.numel(),
              ".");

  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(im.dim());
  at::Tensor im_ =
      at::empty({1, channels, height, width}, im.options(), memory_format)
          .zero_();

  auto col_t = torch_mlu::cnnl::ops::cnnl_contiguous(col.transpose(0, 1));

  const int mask_cnt = mask_h_idx.size(0);
  // calculate task dimension
  cnrtDim3_t k_dim;
  cnrtFunctionType_t k_type;
  policyFunc(mask_cnt, &k_dim, &k_type);

  // get compute queue
  auto queue = torch_mlu::getCurQueue();
  // get ptr of tensors
  auto im_impl = torch_mlu::getMluTensorImpl(im_);
  auto im_ptr = im_impl->cnnlMalloc();
  auto mask_h_idx_impl = torch_mlu::getMluTensorImpl(mask_h_idx);
  auto mask_h_idx_ptr = mask_h_idx_impl->cnnlMalloc();
  auto mask_w_idx_impl = torch_mlu::getMluTensorImpl(mask_w_idx);
  auto mask_w_idx_ptr = mask_w_idx_impl->cnnlMalloc();
  auto col_t_impl = torch_mlu::getMluTensorImpl(col_t);
  auto col_t_ptr = col_t_impl->cnnlMalloc();

  // get comput dtype of input
  cnrtDataType_t data_type = torch_mlu::toCnrtDtype(col.dtype());

  // launch kernel
  CNLOG(INFO) << "Launch Kernel MLUKernelMaskedCol2imForward<<<" << k_dim.x
              << ", " << k_dim.y << ", " << k_dim.z << ">>>";

  KernelMaskedCol2imForward(k_dim, k_type, queue, data_type, col_t_ptr, height,
                            width, channels, mask_h_idx_ptr, mask_w_idx_ptr,
                            mask_cnt, im_ptr);

  im.copy_(im_);
}

// MLU implementation of masked im2col: forwards all arguments unchanged to
// MaskedIm2colForwardMLUKernelLauncher.
void masked_im2col_forward_mlu(const Tensor im, const Tensor mask_h_idx,
                               const Tensor mask_w_idx, Tensor col,
                               const int kernel_h, const int kernel_w,
                               const int pad_h, const int pad_w) {
  // im: (n, ic, h, w), kernel size (kh, kw)
  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
  MaskedIm2colForwardMLUKernelLauncher(im, mask_h_idx, mask_w_idx, col,
                                       kernel_h, kernel_w, pad_h, pad_w);
}

// MLU implementation of masked col2im: forwards all arguments unchanged to
// MaskedCol2imForwardMLUKernelLauncher.
void masked_col2im_forward_mlu(const Tensor col, const Tensor mask_h_idx,
                               const Tensor mask_w_idx, Tensor im, int height,
                               int width, int channels) {
  // im: (n, ic, h, w), kernel size (kh, kw)
  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
  MaskedCol2imForwardMLUKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,
                                       width, channels);
}

// Forward declarations of the `*_impl` entry points; their definitions are not
// part of this file.
void masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor col,
                                const int kernel_h, const int kernel_w,
                                const int pad_h, const int pad_w);

void masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor im, int height,
                                int width, int channels);

// Associate each entry point with its MLU implementation.
REGISTER_DEVICE_IMPL(masked_im2col_forward_impl, MLU,
                     masked_im2col_forward_mlu);
REGISTER_DEVICE_IMPL(masked_col2im_forward_impl, MLU,
                     masked_col2im_forward_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.cpp
================================================
/*************************************************************************
 * Copyright (C) 2022 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "mlu_common_helper.h"

// Descriptors
// Translates a tensor dtype into the corresponding mluOp data type by looking
// up the dtype's name. Returns MLUOP_DTYPE_INVALID for names that are not in
// the table.
mluOpDataType_t getMluOpDataType(const caffe2::TypeMeta& data_type) {
  // Built once on first use (function-local static) instead of on every call;
  // the table is immutable afterwards.
  static const std::map<std::string, mluOpDataType_t> mapping_type = {
      {std::string("c10::Half"), MLUOP_DTYPE_HALF},
      {std::string("float"), MLUOP_DTYPE_FLOAT},
      {std::string("double"), MLUOP_DTYPE_DOUBLE},
      {std::string("int8"), MLUOP_DTYPE_INT8},
      {std::string("signed char"), MLUOP_DTYPE_INT8},
      {std::string("short int"), MLUOP_DTYPE_INT16},
      {std::string("short"), MLUOP_DTYPE_INT16},
      {std::string("int"), MLUOP_DTYPE_INT32},
      {std::string("long int"), MLUOP_DTYPE_INT64},
      {std::string("long"), MLUOP_DTYPE_INT64},
      {std::string("unsigned char"), MLUOP_DTYPE_UINT8},
      {std::string("bool"), MLUOP_DTYPE_BOOL},
      {std::string("c10::complex<c10::Half>"), MLUOP_DTYPE_COMPLEX_HALF},
      {std::string("c10::complex<float>"), MLUOP_DTYPE_COMPLEX_FLOAT}};

  // Single lookup; the iterator is reused for both the test and the result.
  const auto found = mapping_type.find(std::string(data_type.name()));
  if (found != mapping_type.end()) {
    return found->second;
  }
  return MLUOP_DTYPE_INVALID;
}

// Layout
// Chooses an mluOp layout from the tensor's rank and suggested memory format:
//   4-D: NHWC when the format is ChannelsLast, otherwise NCHW;
//   5-D: NDHWC when the format is ChannelsLast3d, otherwise NCDHW;
//   any other rank: MLUOP_LAYOUT_ARRAY.
mluOpTensorLayout_t getMluOpSuggestLayout(const at::Tensor& input) {
  const auto format = input.suggest_memory_format();
  const auto rank = input.dim();

  if (rank == 4) {
    return (format == at::MemoryFormat::ChannelsLast) ? MLUOP_LAYOUT_NHWC
                                                      : MLUOP_LAYOUT_NCHW;
  }
  if (rank == 5) {
    return (format == at::MemoryFormat::ChannelsLast3d) ? MLUOP_LAYOUT_NDHWC
                                                        : MLUOP_LAYOUT_NCDHW;
  }
  return MLUOP_LAYOUT_ARRAY;
}

// Translates a reduce_t into the matching mluOp reduce mode
// (MAX -> DMAX, SUM -> DSUM, MEAN -> DMEAN). Any other value fails a
// TORCH_CHECK with "Unsupported reduce type: ...".
mluOpReduceMode_t getMluOpReduceMode(const reduce_t reduce_type) {
  switch (reduce_type) {
    case reduce_t::MAX:
      return MLUOP_REDUCE_DMAX;
    case reduce_t::SUM:
      return MLUOP_REDUCE_DSUM;
    case reduce_t::MEAN:
      return MLUOP_REDUCE_DMEAN;
    default:
      TORCH_CHECK(false, "Unsupported reduce type: ", to_string(reduce_type));
      // Not reached when the check throws; keeps every path returning a value.
      return MLUOP_REDUCE_DSUM;
  }
}

// Configures this descriptor from tensor `t`: the data type comes from
// getMluOpDataType, the layout from getMluOpSuggestLayout, and the dimensions
// from t.sizes() cast to int. A 0-dim tensor is described as a single
// dimension of size 1.
void MluOpTensorDescriptor::set(Tensor t) {
  mluOpDataType_t data_type = getMluOpDataType(t.dtype());
  mluOpTensorLayout_t layout = getMluOpSuggestLayout(t);
  // Fetch the sizes once; the previous code copied the whole size vector on
  // every loop iteration.
  const auto sizes = t.sizes();
  std::vector<int> dim_array;
  if (t.dim() == 0) {
    dim_array.push_back(
        1);  // ScalarTensor(0-dim 1-item Tensor) view like size = 1 as default;
  } else {
    dim_array.reserve(sizes.size());
    for (const auto dim_size : sizes) {
      dim_array.push_back(static_cast<int>(dim_size));
    }
  }
  set_desc(t, layout, data_type, dim_array);
}

// Configures this descriptor from tensor `t` using an explicitly chosen
// `layout`, passing both shape and stride to mluOpSetTensorDescriptorEx.
//
//   NHWC / NDHWC / NLC : shape and stride are rewritten by
//                        convertShapeAndStride.
//   HWCN               : shape is reordered from (N, C, H, W) to (H, W, C, N)
//                        and strides are recomputed as for a contiguous HWCN
//                        tensor; `t` must be 4-D.
//   anything else      : shape and stride of `t` are used as they are.
//
// Fails a TORCH_CHECK when the HWCN layout is requested for a tensor that is
// not 4-D, or when mluOpSetTensorDescriptorEx does not succeed.
void MluOpTensorDescriptor::set_with_layout(Tensor t,
                                            mluOpTensorLayout_t layout) {
  mluOpDataType_t data_type = getMluOpDataType(t.dtype());
  int t_dim = t.dim();
  std::vector<int> shape_info = checkUpperBoundAndCastTo<int>(t.sizes().vec());
  std::vector<int> stride_info =
      checkUpperBoundAndCastTo<int>(t.strides().vec());
  if (layout == MLUOP_LAYOUT_NHWC || layout == MLUOP_LAYOUT_NDHWC ||
      layout == MLUOP_LAYOUT_NLC) {
    convertShapeAndStride(shape_info, stride_info);
  } else if (layout == MLUOP_LAYOUT_HWCN) {
    // The conversion below indexes elements 0..3 of the size, shape and stride
    // vectors; reject other ranks instead of reading/writing out of bounds.
    TORCH_CHECK(t_dim == 4, "HWCN layout requires a 4-D tensor, but got a ",
                t_dim, "-D tensor.");
    auto convertDepthWiseConvShapeStride = [](const std::vector<int64_t>& vec,
                                              std::vector<int>& target_vec,
                                              std::vector<int>& stride_vec) {
      // NCHW --> HWCN
      target_vec[0] = static_cast<int>(vec[2]);
      target_vec[1] = static_cast<int>(vec[3]);
      target_vec[2] = static_cast<int>(vec[1]);
      target_vec[3] = static_cast<int>(vec[0]);
      // Calculate Stride just like contiguous of HWCN.
      stride_vec[3] = 1;
      stride_vec[2] = target_vec[3] * stride_vec[3];
      stride_vec[1] = target_vec[2] * stride_vec[2];
      stride_vec[0] = target_vec[1] * stride_vec[1];
    };
    convertDepthWiseConvShapeStride(t.sizes().vec(), shape_info, stride_info);
  }
  TORCH_CHECK(mluOpSetTensorDescriptorEx(
                  desc_, layout, data_type, t_dim, shape_info.data(),
                  stride_info.data()) == MLUOP_STATUS_SUCCESS,
              "mluOpSetTensorDescriptorEx execution failed.");
}

// Applies `layout`, `dtype` and `dims` to the wrapped descriptor through
// mluOpSetTensorDescriptor. The tensor argument `t` is not read here.
void MluOpTensorDescriptor::set_desc(const at::Tensor& t,
                                     mluOpTensorLayout_t layout,
                                     mluOpDataType_t dtype,
                                     std::vector<int>& dims) {
  const int num_dims = static_cast<int>(dims.size());
  int* dim_values = dims.data();
  TORCH_MLUOP_CHECK(
      mluOpSetTensorDescriptor(desc_, layout, dtype, num_dims, dim_values));
}

// Handles
// Guards the one-time sizing of `mmcv_mluop_handles`.
std::once_flag mmcv_mluop_init_flag;
// Held while a handle's queue is updated and the handle is read.
std::mutex mmcv_mluop_mutex;
// One handle slot per device, indexed by device index.
static std::vector<MluOpHandle> mmcv_mluop_handles;

// Returns the mluOp handle associated with `device_index`, after binding it to
// that device's current queue.
//
// device_index : device whose handle is wanted; -1 selects the current
//                device.
// Fails a TORCH_CHECK when the resolved index is outside the range of known
// devices.
mluOpHandle_t mluOpGetCurrentHandle(c10::DeviceIndex device_index) {
  // First call only: create one handle slot per device.
  std::call_once(mmcv_mluop_init_flag, []() {
    c10::DeviceIndex num_devices = torch_mlu::device_count();
    mmcv_mluop_handles.resize(num_devices);
  });

  if (device_index == -1) {
    device_index = torch_mlu::current_device();
  }
  std::lock_guard<std::mutex> mmcv_mluop_guard(mmcv_mluop_mutex);
  // Validate before indexing: an out-of-range index would otherwise access
  // the handle vector out of bounds.
  TORCH_CHECK(
      device_index >= 0 &&
          static_cast<size_t>(device_index) < mmcv_mluop_handles.size(),
      "Invalid MLU device index ", static_cast<int>(device_index),
      ", expected a value in [0, ", mmcv_mluop_handles.size(), ").");
  auto queue = torch_mlu::getCurrentQueue(device_index).queue();
  mmcv_mluop_handles[device_index].setQueue(queue);
  return mmcv_mluop_handles[device_index].handle;
}


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h
================================================
/*************************************************************************
 * Copyright (C) 2022 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#pragma once
#include <ATen/ATen.h>
#include <c10/core/ScalarType.h>

#include "aten.h"
#include "mlu_op.h"
#include "pytorch_device_registry.hpp"

// Version triple 0.8.1 exposed as MLUOP_MAJOR / MLUOP_MINOR / MLUOP_PATCHLEVEL.
// NOTE(review): presumably the mluOp release these helpers were written
// against — confirm before bumping.
#define MLUOP_MAJOR 0
#define MLUOP_MINOR 8
#define MLUOP_PATCHLEVEL 1

/*************************************************************************
 * This MACRO contains operations of simple tensor to mlu-tensor.
 * For a tensor variable NAME it declares, in the enclosing scope:
 *   NAME_contigous - result of cnnl_contiguous(NAME,
 *                    NAME.suggest_memory_format()); note that the generated
 *                    identifier is spelled "contigous" (sic),
 *   NAME_desc      - a MluOpTensorDescriptor set from NAME_contigous,
 *   NAME_impl      - torch_mlu::getMluTensorImpl(NAME_contigous),
 *   NAME_ptr       - NAME_impl->cnnlMalloc().
 * The expansion already ends with ';'.
 *************************************************************************/
#define INITIAL_MLU_PARAM_WITH_TENSOR(NAME)                         \
  auto NAME##_contigous = torch_mlu::cnnl::ops::cnnl_contiguous(    \
      NAME, NAME.suggest_memory_format());                          \
  MluOpTensorDescriptor NAME##_desc;                                \
  NAME##_desc.set(NAME##_contigous);                                \
  auto NAME##_impl = torch_mlu::getMluTensorImpl(NAME##_contigous); \
  auto NAME##_ptr = NAME##_impl->cnnlMalloc();

// Evaluates EXPR (an expression yielding mluOpStatus_t) once. When the result
// is not MLUOP_STATUS_SUCCESS it emits an empty CNLOG(ERROR) record and then
// fails through TORCH_CHECK with the text from mluOpGetErrorString.
// Only defined if no TORCH_MLUOP_CHECK macro exists yet.
// NOTE(review): the expansion ends with "while (0);" — the trailing semicolon
// means a call site's own ';' yields an extra empty statement, which breaks
// use as the sole statement of an unbraced if/else.
#ifndef TORCH_MLUOP_CHECK
#define TORCH_MLUOP_CHECK(EXPR)                                          \
  do {                                                                   \
    mluOpStatus_t status = EXPR;                                         \
    if (status != MLUOP_STATUS_SUCCESS) {                                \
      CNLOG(ERROR) << "";                                                \
      TORCH_CHECK(false, "MLUOPS error: ", mluOpGetErrorString(status)); \
    }                                                                    \
  } while (0);
#endif

// Reduction kinds; getMluOpReduceMode() converts them to mluOpReduceMode_t.
enum class reduce_t { SUM = 0, MEAN = 1, MAX = 2 };

// Maps a reduce_t value to its lowercase name. Any value other than the three
// enumerators yields "unknown reduce type".
inline std::string to_string(reduce_t reduce_type) {
  switch (reduce_type) {
    case reduce_t::SUM:
      return "sum";
    case reduce_t::MEAN:
      return "mean";
    case reduce_t::MAX:
      return "max";
    default:
      return "unknown reduce type";
  }
}

// Conversions from PyTorch-side descriptions to mluOp enums; the definitions
// live outside this header.
// Maps a tensor's TypeMeta to an mluOpDataType_t.
mluOpDataType_t getMluOpDataType(const caffe2::TypeMeta& data_type);
// Picks an mluOpTensorLayout_t for the given tensor.
mluOpTensorLayout_t getMluOpSuggestLayout(const at::Tensor& input);
// Maps a reduce_t value to an mluOpReduceMode_t.
mluOpReduceMode_t getMluOpReduceMode(const reduce_t reduce_type);

// Owning wrapper around an mluOpTensorDescriptor_t: the constructor creates
// the descriptor and the destructor destroys it.
// NOTE(review): no copy operations are declared, so a copy would share desc_
// and destroy it twice — confirm instances are never copied.
// NOTE(review): TORCH_MLUOP_CHECK fails through TORCH_CHECK; if that throws
// inside the destructor the program would terminate — confirm this is intended.
class MluOpTensorDescriptor {
 public:
  // Creates an empty descriptor; fails through TORCH_MLUOP_CHECK on error.
  MluOpTensorDescriptor() {
    TORCH_MLUOP_CHECK(mluOpCreateTensorDescriptor(&desc_));
  };
  // Destroys the descriptor created by the constructor.
  ~MluOpTensorDescriptor() {
    TORCH_MLUOP_CHECK(mluOpDestroyTensorDescriptor(desc_));
  }

  // Fills the descriptor from a tensor (defined out of line).
  void set(at::Tensor);
  // Fills the descriptor from a tensor using the caller-supplied mluOp layout
  // (defined out of line).
  void set_with_layout(at::Tensor, mluOpTensorLayout_t layout);
  // Returns the raw descriptor for passing to mluOp calls.
  mluOpTensorDescriptor_t desc() { return desc_; }

 private:
  // Raw descriptor owned by this object.
  mluOpTensorDescriptor_t desc_;
  // Applies layout, data type and dims to desc_ via mluOpSetTensorDescriptor.
  void set_desc(const at::Tensor&, mluOpTensorLayout_t, mluOpDataType_t,
                std::vector<int>& dims);
};

// Returns the mluOp handle for `device_index`, bound to that device's current
// queue; the default of -1 selects the current device.
mluOpHandle_t mluOpGetCurrentHandle(c10::DeviceIndex device_index = -1);

// Owns one mluOp handle: created in the constructor, destroyed in the
// destructor. Every mluOp call is checked with TORCH_MLUOP_CHECK.
class MluOpHandle {
 public:
  // Raw handle; starts out null and is filled in by the constructor.
  mluOpHandle_t handle = nullptr;

  MluOpHandle() { TORCH_MLUOP_CHECK(mluOpCreate(&handle)); }

  ~MluOpHandle() {
    // Nothing to release when no handle is held.
    if (handle == nullptr) {
      return;
    }
    TORCH_MLUOP_CHECK(mluOpDestroy(handle));
    handle = nullptr;
  }

  // Binds the handle to the given queue.
  void setQueue(cnrtQueue_t queue) {
    TORCH_MLUOP_CHECK(mluOpSetQueue(handle, queue));
  }
};

// Rewrites shape and stride from channels_first order to channels_last (or
// channels_last_3d) order, in place. The result is not PyTorch's logical
// layout; it follows the order in which the data is actually stored.
// example: modify channels_last tensor dim to nhwc tensor desc.
//            N    C H W  -->   N    H W C
//          C*H*W  1 W C  --> C*H*W  W C 1
//
// shape_info / stride_info: per-dimension sizes and strides; must have equal
// length (checked with TORCH_MLU_CHECK). Both are overwritten with the
// reordered values: entry 0 stays in place and entry 1 moves to the end.
// Inputs with fewer than two dimensions are left untouched, because there is
// nothing to rotate and indexing element 0 of an empty vector is undefined.
template <typename T>
void convertShapeAndStride(std::vector<T>& shape_info,
                           std::vector<T>& stride_info) {
  TORCH_MLU_CHECK(shape_info.size() == stride_info.size(),
                  "shape size need equal to stride size.");
  const size_t dim = shape_info.size();
  if (dim < 2) {
    return;
  }
  std::vector<T> temp_shape_info(dim);
  std::vector<T> temp_stride_info(dim);
  temp_shape_info[0] = shape_info[0];
  temp_stride_info[0] = stride_info[0];
  // Unsigned arithmetic throughout: dim >= 2 here, so dim - 1 cannot wrap.
  for (size_t i = 0; i + 1 < dim; ++i) {
    // Source positions run 2, 3, ..., dim - 1, 1 for i = 0, 1, ..., dim - 2.
    const size_t index = (i + 1) % (dim - 1) + 1;
    temp_shape_info[i + 1] = shape_info[index];
    temp_stride_info[i + 1] = stride_info[index];
  }
  shape_info.swap(temp_shape_info);
  stride_info.swap(temp_stride_info);
}

// Torch reports shapes and strides as int64_t, while mluOp descriptors take
// int32. This helper converts element by element and reports an error through
// TORCH_MLU_CHECK as soon as a value exceeds the largest DST_T value.
// Only the upper bound is checked.
template <typename DST_T, typename SRC_T>
std::vector<DST_T> checkUpperBoundAndCastTo(const std::vector<SRC_T>& input) {
  std::vector<DST_T> converted;
  converted.reserve(input.size());
  for (auto it = input.begin(); it != input.end(); ++it) {
    const auto& element = *it;
    if (element > std::numeric_limits<DST_T>::max()) {
      TORCH_MLU_CHECK(false, "Requires dim size not greater than ",
                      std::numeric_limits<DST_T>::max(), ". But got ", element,
                      ".");
    }
    converted.push_back(static_cast<DST_T>(element));
  }
  return converted;
}


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/ms_deform_attn_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2022 by Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "mlu_common_helper.h"
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"

// Runs mluOpMsDeformAttnForward and returns the result viewed as
// (batch_size, num_queries, num_heads * channels).
// batch_size, num_heads and channels are read from dims 0, 2 and 3 of
// `value`; num_queries is dim 1 of `sampling_loc`.
Tensor MsDeformAttnForwardLauncher(const Tensor& value,
                                   const Tensor& spatial_shapes,
                                   const Tensor& level_start_index,
                                   const Tensor& sampling_loc,
                                   const Tensor& attn_weight,
                                   const int im2col_step) {
  const int batch_size = value.size(0);
  const int num_queries = sampling_loc.size(1);
  const int num_heads = value.size(2);
  const int channels = value.size(3);

  auto handle = mluOpGetCurrentHandle();

  // The two index tensors are handed to mluOp converted to at::kInt.
  auto spatial_shapes_int = spatial_shapes.to(at::kInt);
  auto level_start_index_int = level_start_index.to(at::kInt);
  // Zero-initialised 4-D result buffer with the options of `value`.
  auto output = at::zeros({batch_size, num_queries, num_heads, channels},
                          value.options());

  // Each line declares <name>_contigous, <name>_desc, <name>_impl, <name>_ptr.
  INITIAL_MLU_PARAM_WITH_TENSOR(value);
  INITIAL_MLU_PARAM_WITH_TENSOR(spatial_shapes_int);
  INITIAL_MLU_PARAM_WITH_TENSOR(level_start_index_int);
  INITIAL_MLU_PARAM_WITH_TENSOR(sampling_loc);
  INITIAL_MLU_PARAM_WITH_TENSOR(attn_weight);
  INITIAL_MLU_PARAM_WITH_TENSOR(output);

  TORCH_MLUOP_CHECK(mluOpMsDeformAttnForward(
      handle, value_desc.desc(), value_ptr, spatial_shapes_int_desc.desc(),
      spatial_shapes_int_ptr, level_start_index_int_desc.desc(),
      level_start_index_int_ptr, sampling_loc_desc.desc(), sampling_loc_ptr,
      attn_weight_desc.desc(), attn_weight_ptr, im2col_step, output_desc.desc(),
      output_ptr));

  // Merge the head and channel dims for the caller.
  return output.view({batch_size, num_queries, num_heads * channels});
}

void MsDeformAttnBackwardLauncher(
    const Tensor& value, const Tensor& spatial_shapes,
    const Tensor& level_start_index, const Tensor& sampling_loc,
    const Tensor& attn_weight, const Tensor& grad_output, Tensor& grad_value,
    Tensor& grad_sampling_loc, Tensor& grad_attn_weight,
    const int im2col_step) {
  auto handle = mluOpGetCurrentHandle();
  auto spatial_shapes_int = spatial_shapes.to(at::kInt);
  auto level_start_index_int = level_start_index.to(at::kInt);
  const int batch_size = value.size(0);
  const int num_heads = value.size(2);
  const int channels = value.size(3);
  const int num_queries = sampling_loc.size(1);

  auto grad_output_dim4 =
      grad_output.view({batch_size, num_queries, num_heads, channels});
  // auto grad_output_dim4 = grad_output.view({batch_size, num_queries,
  // num_heads, channels}).detach();
  INITIAL_MLU_PARAM_WITH_TENSOR(value);
  INITIAL_MLU_PARAM_WITH_TENSOR(spatial_shapes_int);
  INITIAL_MLU_PARAM_WITH_TENSOR(level_start_index_int);
  INITIAL_MLU_PARAM_WITH_TENSOR(sampling_loc);
  INITIAL_MLU_PARAM_WITH_TENSOR(attn_weight);
  INITIAL_MLU_PARAM_WITH_TENSOR(grad_output_dim4);
  // INITIAL_MLU_PARAM_WITH_TENSOR(grad_output);
  INITIAL_MLU_PARAM_WITH_TENSOR(grad_value);
  INITIAL_MLU_PARAM_WITH_TENSOR(grad_sampling_loc);
  INITIAL_MLU_PARAM_WITH_TENSOR(grad_attn_weight);

  mluOpMsDeformAttnBackward(
      handle, value_desc.desc(), value_ptr, spatial_shapes_int_desc.desc(),
      spatial_shapes_int_ptr, level_start_index_int_desc.desc(),
      level_start_index_int_ptr, sampling_loc_desc.desc(), sampling_loc_ptr,
      attn_weight_desc.desc(), attn_weight_ptr, grad_output_dim4_desc.desc(),
      grad_output_dim4_ptr, im2col_step, grad_value_desc.desc(), grad_value_ptr,
      grad_sampling_loc_desc.desc(), grad_sampling_loc_ptr,
      grad_attn_weight_desc.desc(), grad_attn_weight_ptr);

  return;
}

// MLU entry point for the forward pass: forwards every argument unchanged to
// MsDeformAttnForwardLauncher and returns its result.
Tensor ms_deform_attn_mlu_forward(const Tensor& value,
                                  const Tensor& spatial_shapes,
                                  const Tensor& level_start_index,
                                  const Tensor& sampling_loc,
                                  const Tensor& attn_weight,
                                  const int im2col_step) {
  return MsDeformAttnForwardLauncher(value, spatial_shapes, level_start_index,
                                     sampling_loc, attn_weight, im2col_step);
}

// MLU entry point for the backward pass: forwards every argument unchanged to
// MsDeformAttnBackwardLauncher. Gradients are delivered through grad_value,
// grad_sampling_loc and grad_attn_weight.
void ms_deform_attn_mlu_backward(
    const Tensor& value, const Tensor& spatial_shapes,
    const Tensor& level_start_index, const Tensor& sampling_loc,
    const Tensor& attn_weight, const Tensor& grad_output, Tensor& grad_value,
    Tensor& grad_sampling_loc, Tensor& grad_attn_weight,
    const int im2col_step) {
  return MsDeformAttnBackwardLauncher(value, spatial_shapes, level_start_index,
                                      sampling_loc, attn_weight, grad_output,
                                      grad_value, grad_sampling_loc,
                                      grad_attn_weight, im2col_step);
}

// Declaration of the device-generic forward entry point (defined elsewhere);
// needed so it can be named in the registration below.
Tensor ms_deform_attn_impl_forward(const Tensor& value,
                                   const Tensor& spatial_shapes,
                                   const Tensor& level_start_index,
                                   const Tensor& sampling_loc,
                                   const Tensor& attn_weight,
                                   const int im2col_step);

// Declaration of the device-generic backward entry point (defined elsewhere).
void ms_deform_attn_impl_backward(
    const Tensor& value, const Tensor& spatial_shapes,
    const Tensor& level_start_index, const Tensor& sampling_loc,
    const Tensor& attn_weight, const Tensor& grad_output, Tensor& grad_value,
    Tensor& grad_sampling_loc, Tensor& grad_attn_weight, const int im2col_step);

// Register the MLU functions as the MLU implementations of the entry points.
REGISTER_DEVICE_IMPL(ms_deform_attn_impl_forward, MLU,
                     ms_deform_attn_mlu_forward);

REGISTER_DEVICE_IMPL(ms_deform_attn_impl_backward, MLU,
                     ms_deform_attn_mlu_backward);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2021 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/

#include "mlu_common_helper.h"

// Runs mluOpNms on `boxes` / `scores` and returns the kept box indices as an
// int64 tensor, truncated to the number of boxes mluOp reports as kept.
// An empty `boxes` tensor short-circuits to an empty int64 tensor.
// `offset` is passed to mluOpSetNmsDescriptor as a float.
// NOTE(review): nms_desc is destroyed only on the success path; if one of the
// checked calls between create and destroy fails, the descriptor is not
// released.
Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
                            int offset) {
  if (boxes.numel() == 0) {
    return at::empty({0}, boxes.options().dtype(at::kLong));
  }

  // upper bound on the number of kept boxes: one slot per input box
  int max_output_boxes = boxes.size(0);

  // make boxes and scores contiguous; no transpose is performed here
  auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes);
  auto scores_ = torch_mlu::cnnl::ops::cnnl_contiguous(scores);
  // int32 buffers for the kept indices and for the kept-box count
  auto output = at::empty({max_output_boxes}, boxes.options().dtype(at::kInt));
  auto output_size = at::empty({1}, scores.options().dtype(at::kInt));

  MluOpTensorDescriptor boxes_desc, scores_desc, output_desc;
  boxes_desc.set(boxes_);
  scores_desc.set(scores_);
  output_desc.set(output);

  // workspace: size is queried from mluOp, then allocated as a byte tensor
  size_t workspace_size = 0;
  auto handle = mluOpGetCurrentHandle();
  TORCH_MLUOP_CHECK(mluOpGetNmsWorkspaceSize(
      handle, boxes_desc.desc(), scores_desc.desc(), &workspace_size));
  auto workspace = at::empty(workspace_size, boxes.options().dtype(at::kByte));

  // get the device pointers of all tensors handed to mluOp
  auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_);
  auto boxes_ptr = boxes_impl->cnnlMalloc();
  auto scores_impl = torch_mlu::getMluTensorImpl(scores_);
  auto scores_ptr = scores_impl->cnnlMalloc();
  auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
  auto workspace_ptr = workspace_impl->cnnlMalloc();
  auto output_impl = torch_mlu::getMluTensorImpl(output);
  auto output_ptr = output_impl->cnnlMalloc();
  auto output_size_impl = torch_mlu::getMluTensorImpl(output_size);
  auto output_size_ptr = output_size_impl->cnnlMalloc();

  // nms desc
  // NOTE(review): the four mode/algo enums are selected by casting 0; their
  // meaning is defined by the mluOp headers — confirm against the enum
  // definitions.
  mluOpNmsDescriptor_t nms_desc;
  const mluOpNmsBoxPointMode_t box_mode = (mluOpNmsBoxPointMode_t)0;
  const mluOpNmsOutputMode_t output_mode = (mluOpNmsOutputMode_t)0;
  const mluOpNmsAlgo_t algo = (mluOpNmsAlgo_t)0;
  const mluOpNmsMethodMode_t method_mode = (mluOpNmsMethodMode_t)0;
  const float soft_nms_sigma = 0.0;
  const float confidence_threshold = 0.0;
  const int input_layout = 0;
  const bool pad_to_max_output_size = false;
  const int max_output_size = max_output_boxes;

  TORCH_MLUOP_CHECK(mluOpCreateNmsDescriptor(&nms_desc));
  TORCH_MLUOP_CHECK(mluOpSetNmsDescriptor(
      nms_desc, box_mode, output_mode, algo, method_mode, iou_threshold,
      soft_nms_sigma, max_output_size, confidence_threshold, (float)offset,
      input_layout, pad_to_max_output_size));

  TORCH_MLUOP_CHECK(mluOpNms(handle, nms_desc, boxes_desc.desc(), boxes_ptr,
                             scores_desc.desc(), scores_ptr, workspace_ptr,
                             workspace_size, output_desc.desc(), output_ptr,
                             output_size_ptr));
  TORCH_MLUOP_CHECK(mluOpDestroyNmsDescriptor(nms_desc));
  // copy the kept-box count to the host, then keep only that many indices
  int output_num = *static_cast<int *>(output_size.cpu().data_ptr());
  auto ret = output.to(boxes.options().dtype(at::kLong));
  return ret.slice(0, 0, output_num);
}

// MLU entry point for NMS: forwards all arguments to NMSMLUKernelLauncher and
// returns its result.
Tensor nms_mlu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  return NMSMLUKernelLauncher(boxes, scores, iou_threshold, offset);
}

// Declaration of the device-generic NMS entry point (defined elsewhere),
// followed by registration of nms_mlu as its MLU implementation.
Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset);
REGISTER_DEVICE_IMPL(nms_impl, MLU, nms_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/nms_rotated_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2021 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "mlu_common_helper.h"

// Runs mluOpNmsRotated on `boxes` / `scores` and returns the kept box indices
// as an int64 tensor, truncated to the count reported by mluOp.
// An empty `boxes` tensor short-circuits to an empty int64 tensor.
Tensor nms_rotated_mlu(Tensor boxes, Tensor scores, float iou_threshold) {
  if (boxes.numel() == 0) {
    return at::empty({0}, boxes.options().dtype(at::kLong));
  }

  const int num_boxes = boxes.size(0);
  auto boxes_contig = torch_mlu::cnnl::ops::cnnl_contiguous(boxes);
  auto scores_contig = torch_mlu::cnnl::ops::cnnl_contiguous(scores);
  // int32 buffers: one index slot per input box, plus the kept-box count.
  auto keep = at::empty({num_boxes}, boxes.options().dtype(at::kInt));
  auto keep_count = at::empty({1}, scores.options().dtype(at::kInt));

  MluOpTensorDescriptor boxes_desc, scores_desc, keep_desc;
  boxes_desc.set(boxes_contig);
  scores_desc.set(scores_contig);
  keep_desc.set(keep);

  // Ask mluOp how much scratch memory it needs, then allocate it as bytes.
  size_t scratch_bytes = 0;
  auto handle = mluOpGetCurrentHandle();
  TORCH_MLUOP_CHECK(mluOpGetNmsRotatedWorkspaceSize(handle, boxes_desc.desc(),
                                                    &scratch_bytes));
  auto scratch = at::empty(scratch_bytes, boxes.options().dtype(at::kByte));

  // Device pointers of every tensor handed to the kernel.
  auto boxes_ptr = torch_mlu::getMluTensorImpl(boxes_contig)->cnnlMalloc();
  auto scores_ptr = torch_mlu::getMluTensorImpl(scores_contig)->cnnlMalloc();
  auto scratch_ptr = torch_mlu::getMluTensorImpl(scratch)->cnnlMalloc();
  auto keep_ptr = torch_mlu::getMluTensorImpl(keep)->cnnlMalloc();
  auto keep_count_ptr = torch_mlu::getMluTensorImpl(keep_count)->cnnlMalloc();

  TORCH_MLUOP_CHECK(mluOpNmsRotated(
      handle, iou_threshold, boxes_desc.desc(), boxes_ptr, scores_desc.desc(),
      scores_ptr, scratch_ptr, scratch_bytes, keep_desc.desc(), keep_ptr,
      (int *)keep_count_ptr));

  // Copy the count to the host, then keep only that many indices as int64.
  const int kept = *static_cast<int *>(keep_count.cpu().data_ptr());
  auto keep_long = keep.to(boxes.options().dtype(at::kLong));
  return keep_long.slice(0, 0, kept);
}


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/psamask_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2022 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "mlu_common_helper.h"

void PSAMaskForwardMLUKernelLauncher(const int psa_type, const Tensor x,
                                     Tensor y, const int num_,
                                     const int h_feature, const int w_feature,
                                     const int h_mask, const int w_mask,
                                     const int half_h_mask,
                                     const int half_w_mask) {
  int y_c = y.size(1);

  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(x.dim());
  auto x_tensor = torch_mlu::cnnl::ops::cnnl_contiguous(x, memory_format);
  at::Tensor y_tmp =
      at::empty({num_, y_c, h_feature, w_feature}, x.options(), memory_format);

  MluOpTensorDescriptor x_desc, y_desc;
  x_desc.set_with_layout(x_tensor, MLUOP_LAYOUT_NHWC);
  y_desc.set_with_layout(y_tmp, MLUOP_LAYOUT_NHWC);

  auto handle = mluOpGetCurrentHandle();
  auto x_impl = torch_mlu::getMluTensorImpl(x_tensor);
  auto x_ptr = x_impl->cnnlMalloc();
  auto y_impl = torch_mlu::getMluTensorImpl(y_tmp);
  auto y_ptr = y_impl->cnnlMalloc();

  TORCH_MLUOP_CHECK(mluOpPsamaskForward(handle, psa_type, x_desc.desc(), x_ptr,
                                        h_mask, w_mask, y_desc.desc(), y_ptr));

  y.copy_(y_tmp);
}

void PSAMaskBackwardMLUKernelLauncher(const int psa_type, const Tensor dy,
                                      Tensor dx, const int num_,
                                      const int h_feature, const int w_feature,
                                      const int h_mask, const int w_mask,
                                      const int half_h_mask,
                                      const int half_w_mask) {
  int dx_c = dx.size(1);

  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(dy.dim());
  auto dy_tensor = torch_mlu::cnnl::ops::cnnl_contiguous(dy, memory_format);
  at::Tensor dx_tmp = at::empty({num_, dx_c, h_feature, w_feature},
                                dy.options(), memory_format);

  MluOpTensorDescriptor dy_desc, dx_tmp_desc;
  dy_desc.set_with_layout(dy_tensor, MLUOP_LAYOUT_NHWC);
  dx_tmp_desc.set_with_layout(dx_tmp, MLUOP_LAYOUT_NHWC);

  auto handle = mluOpGetCurrentHandle();

  // get ptr of tensors
  auto dx_impl = torch_mlu::getMluTensorImpl(dx_tmp);
  auto dx_ptr = dx_impl->cnnlMalloc();
  auto dy_impl = torch_mlu::getMluTensorImpl(dy_tensor);
  auto dy_ptr = dy_impl->cnnlMalloc();

  TORCH_MLUOP_CHECK(mluOpPsamaskBackward(handle, psa_type, dy_desc.desc(),
                                         dy_ptr, h_mask, w_mask,
                                         dx_tmp_desc.desc(), dx_ptr));

  dx.copy_(dx_tmp);
}

// MLU entry point for PSAMask forward: forwards every argument unchanged to
// PSAMaskForwardMLUKernelLauncher, which writes the result into `output`.
void psamask_forward_mlu(const int psa_type, const Tensor input, Tensor output,
                         const int num_, const int h_feature,
                         const int w_feature, const int h_mask,
                         const int w_mask, const int half_h_mask,
                         const int half_w_mask) {
  PSAMaskForwardMLUKernelLauncher(psa_type, input, output, num_, h_feature,
                                  w_feature, h_mask, w_mask, half_h_mask,
                                  half_w_mask);
}

// MLU entry point for PSAMask backward: forwards every argument unchanged to
// PSAMaskBackwardMLUKernelLauncher, which writes the result into `grad_input`.
void psamask_backward_mlu(const int psa_type, const Tensor grad_output,
                          Tensor grad_input, const int num_,
                          const int h_feature, const int w_feature,
                          const int h_mask, const int w_mask,
                          const int half_h_mask, const int half_w_mask) {
  PSAMaskBackwardMLUKernelLauncher(psa_type, grad_output, grad_input, num_,
                                   h_feature, w_feature, h_mask, w_mask,
                                   half_h_mask, half_w_mask);
}

// Declaration of the device-generic PSAMask forward entry point (defined
// elsewhere); needed so it can be named in the registration below.
void psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,
                          const int num_, const int h_feature,
                          const int w_feature, const int h_mask,
                          const int w_mask, const int half_h_mask,
                          const int half_w_mask);

// Declaration of the device-generic PSAMask backward entry point (defined
// elsewhere).
void psamask_backward_impl(const int psa_type, const Tensor grad_output,
                           Tensor grad_input, const int num_,
                           const int h_feature, const int w_feature,
                           const int h_mask, const int w_mask,
                           const int half_h_mask, const int half_w_mask);

// Register the MLU functions as the MLU implementations of the entry points.
REGISTER_DEVICE_IMPL(psamask_forward_impl, MLU, psamask_forward_mlu);
REGISTER_DEVICE_IMPL(psamask_backward_impl, MLU, psamask_backward_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/roi_align_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2021 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "mlu_common_helper.h"

// Runs mluOpRoiAlignForward_v2 on `input` / `rois` and copies the result into
// `output`.
// The kernel works on channels-last data: `input` is made contiguous in that
// format and the result is produced in a temporary of shape
// (rois.size(0), input.size(1), aligned_height, aligned_width).
// Only pool_mode == 1 ('avg') is accepted; anything else fails the TORCH_CHECK
// at the top, so the pool_mode == 0 branch below is currently unreachable. It
// is kept, with its descriptors wired to the matching tensors, for when max
// pooling is enabled.
void ROIAlignForwardMLUKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                      Tensor argmax_y, Tensor argmax_x,
                                      int aligned_height, int aligned_width,
                                      float spatial_scale, int sampling_ratio,
                                      int pool_mode, bool aligned) {
  // params check
  TORCH_CHECK(pool_mode == 1, "pool_mode only supports 'avg' currently");
  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
  auto input_tensor =
      torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);

  auto num_rois = rois.size(0);
  auto channels = input.size(1);

  auto output_contiguous =
      at::empty({num_rois, channels, aligned_height, aligned_width},
                input.options(), memory_format);
  // get tensor impl
  auto self_impl = torch_mlu::getMluTensorImpl(input_tensor);
  auto rois_impl = torch_mlu::getMluTensorImpl(rois);
  auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);

  MluOpTensorDescriptor input_desc, rois_desc, argmax_y_desc, argmax_x_desc,
      output_desc;
  input_desc.set_with_layout(input_tensor, MLUOP_LAYOUT_NHWC);
  rois_desc.set_with_layout(rois, MLUOP_LAYOUT_ARRAY);
  output_desc.set_with_layout(output_contiguous, MLUOP_LAYOUT_NHWC);

  // get the mlu ptr
  auto self_ptr = self_impl->cnnlMalloc();
  auto rois_ptr = rois_impl->cnnlMalloc();
  auto output_ptr = output_impl->cnnlMalloc();

  mluOpRoiAlignForwardDescriptor_t roialign_desc;
  TORCH_MLUOP_CHECK(mluOpCreateRoiAlignForwardDescriptor(&roialign_desc));
  TORCH_MLUOP_CHECK(mluOpSetRoiAlignForwardDescriptor_v2(
      roialign_desc, aligned_height, aligned_width, sampling_ratio,
      spatial_scale, pool_mode, aligned));

  auto handle = mluOpGetCurrentHandle();
  if (pool_mode == 0) {
    auto argmax_y_contiguous =
        torch_mlu::cnnl::ops::cnnl_contiguous(argmax_y, memory_format);
    auto argmax_x_contiguous =
        torch_mlu::cnnl::ops::cnnl_contiguous(argmax_x, memory_format);
    auto argmax_x_impl = torch_mlu::getMluTensorImpl(argmax_x_contiguous);
    auto argmax_y_impl = torch_mlu::getMluTensorImpl(argmax_y_contiguous);
    auto argmax_x_ptr = argmax_x_impl->cnnlMalloc();
    auto argmax_y_ptr = argmax_y_impl->cnnlMalloc();
    // Each descriptor describes its own tensor (y from y, x from x).
    argmax_y_desc.set_with_layout(argmax_y_contiguous, MLUOP_LAYOUT_NHWC);
    argmax_x_desc.set_with_layout(argmax_x_contiguous, MLUOP_LAYOUT_NHWC);
    TORCH_MLUOP_CHECK(mluOpRoiAlignForward_v2(
        handle, roialign_desc, input_desc.desc(), self_ptr, rois_desc.desc(),
        rois_ptr, output_desc.desc(), output_ptr, argmax_x_desc.desc(),
        argmax_x_ptr, argmax_y_desc.desc(), argmax_y_ptr));
    argmax_x.copy_(argmax_x_contiguous);
    argmax_y.copy_(argmax_y_contiguous);
  } else {
    // Average pooling takes no argmax tensors.
    TORCH_MLUOP_CHECK(mluOpRoiAlignForward_v2(
        handle, roialign_desc, input_desc.desc(), self_ptr, rois_desc.desc(),
        rois_ptr, output_desc.desc(), output_ptr, NULL, NULL, NULL, NULL));
  }
  TORCH_MLUOP_CHECK(mluOpDestroyRoiAlignForwardDescriptor(roialign_desc));
  // Hand the result back in the caller's tensor.
  output.copy_(output_contiguous);
}

// Runs mluOpRoiAlignBackward_v2 on `grad` / `rois` and copies the result into
// `grad_input`.
// The kernel works on channels-last data: `grad` is made contiguous in that
// format and the gradient is accumulated into a zero-filled temporary with the
// shape of `grad_input`.
// Only pool_mode == 1 ('avg') is accepted; anything else fails the TORCH_CHECK
// at the top, so the pool_mode == 0 branch below is currently unreachable. It
// is kept, with each argmax descriptor paired with its own tensor and pointer,
// for when max pooling is enabled.
// aligned_height and aligned_width are accepted but not used here.
void ROIAlignBackwardMLUKernelLauncher(Tensor grad, Tensor rois,
                                       Tensor argmax_y, Tensor argmax_x,
                                       Tensor grad_input, int aligned_height,
                                       int aligned_width, float spatial_scale,
                                       int sampling_ratio, int pool_mode,
                                       bool aligned) {
  // params check
  TORCH_CHECK(pool_mode == 1, "pool_mode only supports 'avg' currently");
  int batch_size = grad_input.size(0);
  int channels = grad_input.size(1);
  int height = grad_input.size(2);
  int width = grad_input.size(3);
  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(grad.dim());
  auto grad_ = torch_mlu::cnnl::ops::cnnl_contiguous(grad, memory_format);
  auto grad_input_ = at::empty({batch_size, channels, height, width},
                               grad.options(), memory_format)
                         .zero_();

  // get tensor impl
  auto grad_impl = torch_mlu::getMluTensorImpl(grad_);
  auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input_);
  auto rois_impl = torch_mlu::getMluTensorImpl(rois);

  // get the mlu ptr
  auto grad_ptr = grad_impl->cnnlMalloc();
  auto rois_ptr = rois_impl->cnnlMalloc();
  auto grad_input_ptr = grad_input_impl->cnnlMalloc();

  MluOpTensorDescriptor grads_desc, rois_desc, argmax_y_desc, argmax_x_desc,
      grad_input_desc;
  grads_desc.set_with_layout(grad_, MLUOP_LAYOUT_NHWC);
  rois_desc.set_with_layout(rois, MLUOP_LAYOUT_ARRAY);
  grad_input_desc.set_with_layout(grad_input_, MLUOP_LAYOUT_NHWC);

  auto handle = mluOpGetCurrentHandle();
  if (pool_mode == 0) {
    auto argmax_y_contiguous =
        torch_mlu::cnnl::ops::cnnl_contiguous(argmax_y, memory_format);
    auto argmax_x_contiguous =
        torch_mlu::cnnl::ops::cnnl_contiguous(argmax_x, memory_format);
    auto argmax_x_impl = torch_mlu::getMluTensorImpl(argmax_x_contiguous);
    auto argmax_y_impl = torch_mlu::getMluTensorImpl(argmax_y_contiguous);
    auto argmax_x_ptr = argmax_x_impl->cnnlMalloc();
    auto argmax_y_ptr = argmax_y_impl->cnnlMalloc();
    // Each descriptor describes its own tensor (y from y, x from x).
    argmax_y_desc.set_with_layout(argmax_y_contiguous, MLUOP_LAYOUT_NHWC);
    argmax_x_desc.set_with_layout(argmax_x_contiguous, MLUOP_LAYOUT_NHWC);
    // The x descriptor travels with the x pointer, the y descriptor with y.
    TORCH_MLUOP_CHECK(mluOpRoiAlignBackward_v2(
        handle, grads_desc.desc(), grad_ptr, rois_desc.desc(), rois_ptr,
        argmax_x_desc.desc(), argmax_x_ptr, argmax_y_desc.desc(), argmax_y_ptr,
        spatial_scale, sampling_ratio, aligned, pool_mode,
        grad_input_desc.desc(), grad_input_ptr));
  } else {
    // Average pooling takes no argmax tensors.
    TORCH_MLUOP_CHECK(mluOpRoiAlignBackward_v2(
        handle, grads_desc.desc(), grad_ptr, rois_desc.desc(), rois_ptr, NULL,
        NULL, NULL, NULL, spatial_scale, sampling_ratio, aligned, pool_mode,
        grad_input_desc.desc(), grad_input_ptr));
  }
  // Hand the result back in the caller's tensor.
  grad_input.copy_(grad_input_);
}

// MLU entry point for RoIAlign forward: forwards every argument unchanged to
// ROIAlignForwardMLUKernelLauncher, which writes the result into `output`.
void roi_align_forward_mlu(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax_y, Tensor argmax_x, int aligned_height,
                           int aligned_width, float spatial_scale,
                           int sampling_ratio, int pool_mode, bool aligned) {
  ROIAlignForwardMLUKernelLauncher(input, rois, output, argmax_y, argmax_x,
                                   aligned_height, aligned_width, spatial_scale,
                                   sampling_ratio, pool_mode, aligned);
}

// MLU entry point for RoIAlign backward: forwards every argument unchanged to
// ROIAlignBackwardMLUKernelLauncher, which writes the result into
// `grad_input`.
void roi_align_backward_mlu(Tensor grad_output, Tensor rois, Tensor argmax_y,
                            Tensor argmax_x, Tensor grad_input,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned) {
  ROIAlignBackwardMLUKernelLauncher(
      grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,
      aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned);
}

// Declaration of the device-generic RoIAlign forward entry point (defined
// elsewhere); needed so it can be named in the registration below.
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned);

// Declaration of the device-generic RoIAlign backward entry point (defined
// elsewhere).
void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,
                             Tensor argmax_x, Tensor grad_input,
                             int aligned_height, int aligned_width,
                             float spatial_scale, int sampling_ratio,
                             int pool_mode, bool aligned);

// Register the MLU functions as the MLU implementations of the entry points.
REGISTER_DEVICE_IMPL(roi_align_forward_impl, MLU, roi_align_forward_mlu);
REGISTER_DEVICE_IMPL(roi_align_backward_impl, MLU, roi_align_backward_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/roi_align_rotated_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2022 by Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "mlu_common_helper.h"

// Runs the rotated RoIAlign forward pass on MLU via
// mluOpRoiAlignRotatedForward.
//
// `input` and `output` are staged as channels-last tensors and described to
// the operator with the NHWC layout; `rois` is made contiguous in its own
// suggested memory format. After the operator call the staged result is
// copied back into the caller's `output`.
void ROIAlignRotatedForwardMLUKernelLauncher(Tensor input, Tensor rois,
                                             Tensor output, int pooled_height,
                                             int pooled_width,
                                             float spatial_scale,
                                             int sampling_ratio, bool aligned,
                                             bool clockwise) {
  namespace mlu_ops = torch_mlu::cnnl::ops;

  // Staging tensors in the memory formats the operator is given.
  const auto nhwc_format =
      mlu_ops::get_channels_last_memory_format(input.dim());
  auto input_nhwc = mlu_ops::cnnl_contiguous(input, nhwc_format);
  auto rois_dense =
      mlu_ops::cnnl_contiguous(rois, rois.suggest_memory_format());
  auto output_nhwc = mlu_ops::cnnl_contiguous(output, nhwc_format);

  // Tensor descriptors.
  MluOpTensorDescriptor input_desc;
  MluOpTensorDescriptor rois_desc;
  MluOpTensorDescriptor output_desc;
  input_desc.set_with_layout(input_nhwc, MLUOP_LAYOUT_NHWC);
  rois_desc.set(rois_dense);
  output_desc.set_with_layout(output_nhwc, MLUOP_LAYOUT_NHWC);

  // Device pointers of the staged tensors.
  auto input_ptr = torch_mlu::getMluTensorImpl(input_nhwc)->cnnlMalloc();
  auto rois_ptr = torch_mlu::getMluTensorImpl(rois_dense)->cnnlMalloc();
  auto output_ptr = torch_mlu::getMluTensorImpl(output_nhwc)->cnnlMalloc();

  // Run the operator on the current compute handle.
  auto handle = mluOpGetCurrentHandle();
  TORCH_MLUOP_CHECK(mluOpRoiAlignRotatedForward(
      handle, input_desc.desc(), input_ptr, rois_desc.desc(), rois_ptr,
      pooled_height, pooled_width, sampling_ratio, spatial_scale, aligned,
      clockwise, output_desc.desc(), output_ptr));

  // Hand the result back to the caller's tensor.
  output.copy_(output_nhwc);
}

// Runs the rotated RoIAlign backward pass on MLU via
// mluOpRoiAlignRotatedBackward.
//
// `top_grad` (incoming gradient) and `bottom_grad` (gradient to produce) are
// staged as channels-last tensors and described with the NHWC layout; `rois`
// is made contiguous in its own suggested memory format. After the operator
// call the staged gradient is copied back into the caller's `bottom_grad`.
void ROIAlignRotatedBackwardMLUKernelLauncher(
    Tensor top_grad, Tensor rois, Tensor bottom_grad, int pooled_height,
    int pooled_width, float spatial_scale, int sampling_ratio, bool aligned,
    bool clockwise) {
  namespace mlu_ops = torch_mlu::cnnl::ops;

  // Staging tensors in the memory formats the operator is given.
  const auto nhwc_format =
      mlu_ops::get_channels_last_memory_format(top_grad.dim());
  auto top_grad_nhwc = mlu_ops::cnnl_contiguous(top_grad, nhwc_format);
  auto rois_dense =
      mlu_ops::cnnl_contiguous(rois, rois.suggest_memory_format());
  auto bottom_grad_nhwc = mlu_ops::cnnl_contiguous(bottom_grad, nhwc_format);

  // Device pointers of the staged tensors.
  auto top_grad_ptr = torch_mlu::getMluTensorImpl(top_grad_nhwc)->cnnlMalloc();
  auto rois_ptr = torch_mlu::getMluTensorImpl(rois_dense)->cnnlMalloc();
  auto bottom_grad_ptr =
      torch_mlu::getMluTensorImpl(bottom_grad_nhwc)->cnnlMalloc();

  // Tensor descriptors.
  MluOpTensorDescriptor top_grad_desc;
  MluOpTensorDescriptor rois_desc;
  MluOpTensorDescriptor bottom_grad_desc;
  top_grad_desc.set_with_layout(top_grad_nhwc, MLUOP_LAYOUT_NHWC);
  rois_desc.set(rois_dense);
  bottom_grad_desc.set_with_layout(bottom_grad_nhwc, MLUOP_LAYOUT_NHWC);

  // Run the operator on the current compute handle.
  auto handle = mluOpGetCurrentHandle();
  TORCH_MLUOP_CHECK(mluOpRoiAlignRotatedBackward(
      handle, top_grad_desc.desc(), top_grad_ptr, rois_desc.desc(), rois_ptr,
      pooled_height, pooled_width, sampling_ratio, spatial_scale, aligned,
      clockwise, bottom_grad_desc.desc(), bottom_grad_ptr));

  // Hand the gradient back to the caller's tensor.
  bottom_grad.copy_(bottom_grad_nhwc);
}

// MLU entry point for the rotated RoIAlign forward pass.
//
// Forwards every argument to ROIAlignRotatedForwardMLUKernelLauncher;
// `aligned_height` / `aligned_width` land in the launcher's `pooled_height` /
// `pooled_width` parameters.
void roi_align_rotated_forward_mlu(Tensor input, Tensor rois, Tensor output,
                                   int aligned_height, int aligned_width,
                                   float spatial_scale, int sampling_ratio,
                                   bool aligned, bool clockwise) {
  ROIAlignRotatedForwardMLUKernelLauncher(input, rois, output, aligned_height,
                                          aligned_width, spatial_scale,
                                          sampling_ratio, aligned, clockwise);
}

// MLU entry point for the rotated RoIAlign backward pass.
//
// Forwards every argument to ROIAlignRotatedBackwardMLUKernelLauncher;
// `aligned_height` / `aligned_width` land in the launcher's `pooled_height` /
// `pooled_width` parameters.
void roi_align_rotated_backward_mlu(Tensor top_grad, Tensor rois,
                                    Tensor bottom_grad, int aligned_height,
                                    int aligned_width, float spatial_scale,
                                    int sampling_ratio, bool aligned,
                                    bool clockwise) {
  ROIAlignRotatedBackwardMLUKernelLauncher(
      top_grad, rois, bottom_grad, aligned_height, aligned_width, spatial_scale,
      sampling_ratio, aligned, clockwise);
}

// Dispatch entry point for rotated RoIAlign forward. Only declared here; it
// is not defined in this file.
void roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,
                                    int aligned_height, int aligned_width,
                                    float spatial_scale, int sampling_ratio,
                                    bool aligned, bool clockwise);

// Dispatch entry point for rotated RoIAlign backward. Only declared here; it
// is not defined in this file.
void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
                                     Tensor bottom_grad, int aligned_height,
                                     int aligned_width, float spatial_scale,
                                     int sampling_ratio, bool aligned,
                                     bool clockwise);

// Register the MLU functions defined above as the MLU implementations of the
// two dispatch entry points.
REGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, MLU,
                     roi_align_rotated_forward_mlu);
REGISTER_DEVICE_IMPL(roi_align_rotated_backward_impl, MLU,
                     roi_align_rotated_backward_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/roi_pool_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2022 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"

// Host-side launcher of the RoI pooling forward device kernel. Only declared
// here; the definition lives outside this file.
//
// k_dim / k_type / queue select the task dimensions, function type and queue
// for the launch. `data_type` is the element type of `input_data`. The kernel
// writes into `output_data` and the int32 buffer `argmax`.
// NOTE(review): the memory layout expected for the data pointers is not
// visible from this declaration; the caller in this file passes channels-last
// staging buffers.
void KernelRoiPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                          cnrtQueue_t queue, cnrtDataType_t data_type,
                          const void *input_data, const void *input_rois,
                          const int batch, const int channels, const int height,
                          const int width, const int pooled_height,
                          const int pooled_width, const int rois_num,
                          const float spatial_scale, void *output_data,
                          int *argmax);

// Host-side launcher of the RoI pooling backward device kernel. Only declared
// here; the definition lives outside this file.
//
// k_dim / k_type / queue select the task dimensions, function type and queue
// for the launch. `k_dtype` is the element type of the gradient buffers. The
// kernel reads `grad_output_ptr`, `rois_ptr` and the int32 `argmax_ptr`, and
// writes into `grad_input_ptr`.
// NOTE(review): whether the kernel accumulates into or overwrites
// `grad_input_ptr` is not visible here; the caller in this file zero-fills
// the buffer before the launch.
void KernelRoiPoolBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                           cnrtQueue_t queue, cnrtDataType_t k_dtype,
                           const void *grad_output_ptr, const void *rois_ptr,
                           const int *argmax_ptr, void *grad_input_ptr,
                           const int box_num, const int pooled_height,
                           const int pooled_width, const int channels,
                           const int batch, const int height, const int width,
                           const float spatial_scale);

// Launch policy for the forward kernel.
//
// Uses a UNION1 task whose x dimension is the number of cores per cluster and
// whose y dimension is the number of clusters needed to cover `bin_num` bins
// (one bin per core, rounded up), capped at the device's cluster count.
static void policyFuncForward(const int bin_num, cnrtDim3_t *k_dim,
                              cnrtFunctionType_t *k_type) {
  auto cores_per_cluster = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  auto clusters_available = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);

  // Full clusters, plus one more when a partial cluster is left over.
  unsigned int clusters_needed = bin_num / cores_per_cluster;
  if (bin_num % cores_per_cluster > 0) {
    clusters_needed += 1;
  }

  *k_type = CNRT_FUNC_TYPE_UNION1;
  k_dim->x = cores_per_cluster;
  if (clusters_needed > clusters_available) {
    k_dim->y = clusters_available;
  } else {
    k_dim->y = clusters_needed;
  }
  k_dim->z = 1;
}

// Validates the arguments and launches the MLU RoI pooling forward kernel
// (KernelRoiPoolForward).
//
// Args:
//   input:  4-D Float/Half tensor, indexed as (batch, channels, height,
//           width).
//   rois:   2-D tensor with the same dtype as `input`.
//   output: result tensor; its first dimension is used as the number of RoIs.
//   argmax: 4-D Int/Long tensor that receives the argmax produced by the
//           kernel.
//   pooled_height, pooled_width: size of the pooled grid per RoI.
//   spatial_scale: must lie in (0, 1].
//
// Raises (via TORCH_CHECK) when a dtype, rank or `spatial_scale` constraint is
// violated. Returns without launching anything when any of the four tensors
// is empty. Otherwise the kernel runs on channels-last staging buffers and the
// results are copied back into `output` and `argmax`.
void ROIPoolForwardMLUKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                     Tensor argmax, int pooled_height,
                                     int pooled_width, float spatial_scale) {
  // Check dtype.
  TORCH_CHECK(
      input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
      "input type should be Float or Half, got ", input.scalar_type());
  TORCH_CHECK(input.scalar_type() == rois.scalar_type(),
              "rois should have the same type as input");

  // Check dtype relationship.
  TORCH_CHECK(
      argmax.scalar_type() == at::kLong || argmax.scalar_type() == at::kInt,
      "argmax type should be Int or Long, got ", argmax.scalar_type());

  // Check shape.
  TORCH_CHECK(input.dim() == 4, "input should be 4d tensor, got ", input.dim(),
              "D");
  TORCH_CHECK(rois.dim() == 2, "rois should be 2d tensor, got ", rois.dim(),
              "D");
  TORCH_CHECK(argmax.dim() == 4, "argmax should be 4d tensor, got ",
              argmax.dim(), "D");

  TORCH_CHECK(spatial_scale > 0 && spatial_scale <= 1,
              "spatial_scale should be within (0, 1], got ", spatial_scale);

  // Zero element check. The tensors are received by value, so re-binding
  // `output` or `argmax` to a freshly allocated tensor here would never be
  // visible to the caller; an early return is all that is needed.
  if (input.numel() == 0 || rois.numel() == 0 || output.numel() == 0 ||
      argmax.numel() == 0) {
    return;
  }

  // compute kernel params
  auto batch = input.size(0);
  auto channels = input.size(1);
  auto height = input.size(2);
  auto width = input.size(3);
  auto rois_num = output.size(0);

  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
  auto input_ = torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);
  // The kernel only receives a raw pointer for the RoIs (no strides), so make
  // sure it points at densely packed data.
  auto rois_ =
      torch_mlu::cnnl::ops::cnnl_contiguous(rois, rois.suggest_memory_format());

  // Channels-last staging buffers the kernel writes into.
  at::Tensor output_ =
      at::empty({rois_num, channels, pooled_height, pooled_width},
                input.options(), memory_format);
  at::Tensor argmax_ =
      at::empty({rois_num, channels, pooled_height, pooled_width},
                argmax.options(), memory_format);

  // calculate task dimension
  cnrtDim3_t k_dim;
  cnrtFunctionType_t k_type;
  policyFuncForward(rois_num * pooled_height * pooled_width, &k_dim, &k_type);

  // get compute queue
  auto queue = torch_mlu::getCurQueue();

  // get ptr of tensors
  auto input_impl = torch_mlu::getMluTensorImpl(input_);
  auto input_ptr = input_impl->cnnlMalloc();
  auto rois_impl = torch_mlu::getMluTensorImpl(rois_);
  auto rois_ptr = rois_impl->cnnlMalloc();
  auto output_impl = torch_mlu::getMluTensorImpl(output_);
  auto output_ptr = output_impl->cnnlMalloc();
  auto argmax_impl = torch_mlu::getMluTensorImpl(argmax_);
  auto argmax_ptr = argmax_impl->cnnlMalloc();

  // get compute dtype of input
  cnrtDataType_t data_type = torch_mlu::toCnrtDtype(input_.dtype());

  // launch kernel
  CNLOG(INFO) << "Launch Kernel MLUKernelRoiPoolForward<<<" << k_dim.x << ", "
              << k_dim.y << ", " << k_dim.z << ">>>";

  KernelRoiPoolForward(k_dim, k_type, queue, data_type, input_ptr, rois_ptr,
                       batch, channels, height, width, pooled_height,
                       pooled_width, rois_num, spatial_scale, output_ptr,
                       (int *)argmax_ptr);

  // Copy the staged results back into the caller's tensors.
  output.copy_(output_);
  argmax.copy_(argmax_);
}

// Launch policy for the backward kernel: a UNION1 task whose x dimension is
// the device's cores-per-cluster attribute and whose y dimension is its
// cluster-count attribute.
static void policyFuncBackward(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) {
  auto cores_per_cluster = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  auto cluster_count = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);

  *k_type = CNRT_FUNC_TYPE_UNION1;
  k_dim->x = cores_per_cluster;
  k_dim->y = cluster_count;
  k_dim->z = 1;
}

// Validates the arguments and launches the MLU RoI pooling backward kernel
// (KernelRoiPoolBackward).
//
// Args:
//   grad_output: 4-D Float/Half gradient of the pooled output.
//   rois:        2-D tensor with the same dtype as `grad_output`; its first
//                dimension must match `grad_output`'s.
//   argmax:      4-D Int/Long tensor whose dimensions 1..3 must match
//                `grad_output`'s.
//   grad_input:  tensor that receives the computed gradient; indexed as
//                (batch, channels, height, width).
//   pooled_height, pooled_width: size of the pooled grid per RoI.
//   spatial_scale: must lie in (0, 1].
//
// Raises (via TORCH_CHECK) when a dtype, rank, shape-relationship or
// `spatial_scale` constraint is violated. Returns without launching anything
// when any of the four tensors is empty. Otherwise the kernel writes into a
// zero-filled channels-last staging buffer that is then copied into
// `grad_input`.
void ROIPoolBackwardMLUKernelLauncher(Tensor grad_output, Tensor rois,
                                      Tensor argmax, Tensor grad_input,
                                      int pooled_height, int pooled_width,
                                      float spatial_scale) {
  // Check dtype.
  TORCH_CHECK(
      argmax.scalar_type() == at::kLong || argmax.scalar_type() == at::kInt,
      "argmax type should be Int or Long, got ", argmax.scalar_type());
  TORCH_CHECK((grad_output.scalar_type() == at::kFloat ||
               grad_output.scalar_type() == at::kHalf),
              "grad_output type should be Float or Half, got ",
              grad_output.scalar_type());

  // Check dtype relationship.
  TORCH_CHECK((rois.scalar_type() == grad_output.scalar_type()),
              "rois should have the same type as grad_output");

  // Check shape.
  TORCH_CHECK(grad_output.dim() == 4, "grad_output should be 4d tensor, got ",
              grad_output.dim(), "D");
  TORCH_CHECK(rois.dim() == 2, "rois should be 2d tensor, got ", rois.dim(),
              "D");
  TORCH_CHECK(argmax.dim() == 4, "argmax should be 4d tensor, got ",
              argmax.dim(), "D");

  TORCH_CHECK(spatial_scale > 0 && spatial_scale <= 1,
              "spatial_scale should be within (0, 1], got ", spatial_scale);

  // Check relationship between tensor.
  // Check the relationship of n.
  TORCH_CHECK(grad_output.size(0) == rois.size(0),
              "grad_output.size(0) = ", grad_output.size(0),
              ", while rois.size(0) = ", rois.size(0),
              ". They should be the same.");

  // Check the relationship of channels.
  TORCH_CHECK(grad_output.size(1) == argmax.size(1),
              "grad_output.size(1) = ", grad_output.size(1),
              ", while argmax.size(1) = ", argmax.size(1),
              ". They should be the same.");

  // Check the relationship of height and width.
  TORCH_CHECK(grad_output.size(2) == argmax.size(2),
              "argmax.size(2) = ", argmax.size(2),
              ", while grad_output.size(2) = ", grad_output.size(2),
              ". They should be the same.");
  TORCH_CHECK(grad_output.size(3) == argmax.size(3),
              "argmax.size(3) = ", argmax.size(3),
              ", while grad_output.size(3) = ", grad_output.size(3),
              ". They should be the same.");

  // Check zero element.
  if (grad_output.numel() == 0 || rois.numel() == 0 || argmax.numel() == 0 ||
      grad_input.numel() == 0) {
    // return if zero-element
    return;
  }

  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(grad_output.dim());
  auto grad_output_ =
      torch_mlu::cnnl::ops::cnnl_contiguous(grad_output, memory_format);
  auto argmax_ = torch_mlu::cnnl::ops::cnnl_contiguous(argmax, memory_format);
  // The kernel only receives a raw pointer for the RoIs (no strides), so make
  // sure it points at densely packed data.
  auto rois_ =
      torch_mlu::cnnl::ops::cnnl_contiguous(rois, rois.suggest_memory_format());

  int boxes_num = grad_output.size(0);
  int no = grad_input.size(0);
  int channels = grad_input.size(1);
  int height = grad_input.size(2);
  int width = grad_input.size(3);
  // Zero-filled channels-last staging buffer for the computed gradient.
  auto grad_input_ = at::empty({no, channels, height, width},
                               grad_input.options(), memory_format)
                         .zero_();

  // get tensor impl
  auto grad_output_impl = torch_mlu::getMluTensorImpl(grad_output_);
  auto rois_impl = torch_mlu::getMluTensorImpl(rois_);
  auto argmax_impl = torch_mlu::getMluTensorImpl(argmax_);
  auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input_);

  // get compute queue
  auto queue = torch_mlu::getCurQueue();

  // get mlu ptr
  auto grad_output_ptr = grad_output_impl->cnnlMalloc();
  auto rois_ptr = rois_impl->cnnlMalloc();
  auto argmax_ptr = argmax_impl->cnnlMalloc();
  auto grad_input_ptr = grad_input_impl->cnnlMalloc();

  // calculate task dimension
  cnrtDataType_t k_dtype = torch_mlu::toCnrtDtype(grad_input.dtype());
  cnrtDim3_t k_dim;
  cnrtFunctionType_t k_type;
  policyFuncBackward(&k_dim, &k_type);

  CNLOG(INFO) << "Launch Kernel MLUKernelRoiPoolBackward<<<" << k_dim.x << ", "
              << k_dim.y << ", " << k_dim.z << ">>>";

  KernelRoiPoolBackward(k_dim, k_type, queue, k_dtype, grad_output_ptr,
                        rois_ptr, (int *)argmax_ptr, grad_input_ptr, boxes_num,
                        pooled_height, pooled_width, channels, no, height,
                        width, spatial_scale);

  // Copy the staged gradient back into the caller's tensor.
  grad_input.copy_(grad_input_);
}

// MLU entry point for the RoI pooling forward pass.
//
// Forwards every argument, unchanged and in the same order, to
// ROIPoolForwardMLUKernelLauncher.
void roi_pool_forward_mlu(Tensor input, Tensor rois, Tensor output,
                          Tensor argmax, int pooled_height, int pooled_width,
                          float spatial_scale) {
  ROIPoolForwardMLUKernelLauncher(input, rois, output, argmax, pooled_height,
                                  pooled_width, spatial_scale);
}

// MLU entry point for the RoI pooling backward pass.
//
// Forwards every argument, unchanged and in the same order, to
// ROIPoolBackwardMLUKernelLauncher.
void roi_pool_backward_mlu(Tensor grad_output, Tensor rois, Tensor argmax,
                           Tensor grad_input, int pooled_height,
                           int pooled_width, float spatial_scale) {
  ROIPoolBackwardMLUKernelLauncher(grad_output, rois, argmax, grad_input,
                                   pooled_height, pooled_width, spatial_scale);
}

// Dispatch entry point for RoI pooling forward. Only declared here; it is not
// defined in this file.
void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax, int pooled_height, int pooled_width,
                           float spatial_scale);

// Dispatch entry point for RoI pooling backward. Only declared here; it is
// not defined in this file.
void roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,
                            Tensor grad_input, int pooled_height,
                            int pooled_width, float spatial_scale);

// Register the MLU functions defined above as the MLU implementations of the
// two dispatch entry points.
REGISTER_DEVICE_IMPL(roi_pool_forward_impl, MLU, roi_pool_forward_mlu);
REGISTER_DEVICE_IMPL(roi_pool_backward_impl, MLU, roi_pool_backward_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/roiaware_pool3d_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2022 by Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "mlu_common_helper.h"

// Runs the RoI-aware 3D pooling forward pass on MLU via
// mluOpRoiawarePool3dForward.
//
// Inputs:  rois, pts, pts_feature.
// Outputs: argmax, pts_idx_of_voxels, pooled_features (written by the
//          operator).
// The scalar arguments (pool_method, boxes_num, pts_num, channels,
// max_pts_each_voxel, out_x, out_y, out_z) are passed to the operator as-is.
//
// Every tensor is made contiguous in its own suggested memory format before
// the call. If that produced a temporary for one of the output tensors, the
// temporary is copied back afterwards so the caller always sees the result.
void RoiawarePool3dForwardMLUKernelLauncher(
    const int pool_method, const int boxes_num, const int pts_num,
    const int channels, const int max_pts_each_voxel, const int out_x,
    const int out_y, const int out_z, const Tensor rois, const Tensor pts,
    const Tensor pts_feature, Tensor pts_idx_of_voxels, Tensor pooled_features,
    Tensor argmax) {
  // get compute handle
  auto handle = mluOpGetCurrentHandle();

  auto rois_contiguous =
      torch_mlu::cnnl::ops::cnnl_contiguous(rois, rois.suggest_memory_format());
  auto pts_contiguous =
      torch_mlu::cnnl::ops::cnnl_contiguous(pts, pts.suggest_memory_format());
  auto pts_feature_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      pts_feature, pts_feature.suggest_memory_format());
  auto argmax_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      argmax, argmax.suggest_memory_format());
  auto pts_idx_of_voxels_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      pts_idx_of_voxels, pts_idx_of_voxels.suggest_memory_format());
  auto pooled_features_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      pooled_features, pooled_features.suggest_memory_format());

  MluOpTensorDescriptor rois_desc, pts_desc, pts_feature_desc, argmax_desc,
      pts_idx_of_voxels_desc, pooled_features_desc;
  rois_desc.set(rois_contiguous);
  pts_desc.set(pts_contiguous);
  pts_feature_desc.set(pts_feature_contiguous);
  argmax_desc.set(argmax_contiguous);
  pts_idx_of_voxels_desc.set(pts_idx_of_voxels_contiguous);
  pooled_features_desc.set(pooled_features_contiguous);

  // allocate extra space for workspace
  size_t workspace_size = 0;
  TORCH_MLUOP_CHECK(mluOpGetRoiawarePool3dForwardWorkspaceSize(
      handle, rois_desc.desc(), pts_desc.desc(), pts_feature_desc.desc(),
      &workspace_size));

  auto workspace = at::empty(workspace_size, rois.options().dtype(at::kByte));
  auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
  auto workspace_ptr = workspace_impl->cnnlMalloc();

  auto rois_impl = torch_mlu::getMluTensorImpl(rois_contiguous);
  auto pts_impl = torch_mlu::getMluTensorImpl(pts_contiguous);
  auto pts_feature_impl = torch_mlu::getMluTensorImpl(pts_feature_contiguous);
  auto argmax_impl = torch_mlu::getMluTensorImpl(argmax_contiguous);
  auto pts_idx_of_voxels_impl =
      torch_mlu::getMluTensorImpl(pts_idx_of_voxels_contiguous);
  auto pooled_features_impl =
      torch_mlu::getMluTensorImpl(pooled_features_contiguous);

  auto rois_ptr = rois_impl->cnnlMalloc();
  auto pts_ptr = pts_impl->cnnlMalloc();
  auto pts_feature_ptr = pts_feature_impl->cnnlMalloc();
  auto argmax_ptr = argmax_impl->cnnlMalloc();
  auto pts_idx_of_voxels_ptr = pts_idx_of_voxels_impl->cnnlMalloc();
  auto pooled_features_ptr = pooled_features_impl->cnnlMalloc();

  CNLOG(INFO) << "Call mluOpRoiawarePool3dForward().";
  TORCH_MLUOP_CHECK(mluOpRoiawarePool3dForward(
      handle, pool_method, boxes_num, pts_num, channels, rois_desc.desc(),
      rois_ptr, pts_desc.desc(), pts_ptr, pts_feature_desc.desc(),
      pts_feature_ptr, workspace_ptr, workspace_size, max_pts_each_voxel, out_x,
      out_y, out_z, argmax_desc.desc(), argmax_ptr,
      pts_idx_of_voxels_desc.desc(), pts_idx_of_voxels_ptr,
      pooled_features_desc.desc(), pooled_features_ptr));

  // The operator wrote into the contiguous tensors. When one of them is a
  // temporary (the caller's tensor was not contiguous), propagate the result;
  // otherwise the data is already in place and no copy is needed.
  if (!argmax.is_same(argmax_contiguous)) {
    argmax.copy_(argmax_contiguous);
  }
  if (!pts_idx_of_voxels.is_same(pts_idx_of_voxels_contiguous)) {
    pts_idx_of_voxels.copy_(pts_idx_of_voxels_contiguous);
  }
  if (!pooled_features.is_same(pooled_features_contiguous)) {
    pooled_features.copy_(pooled_features_contiguous);
  }
}

// MLU entry point for the RoI-aware 3D pooling forward pass.
//
// Adapts the argument order to RoiawarePool3dForwardMLUKernelLauncher:
// `pool_method` moves from last to first, and the output tensors are passed
// as (pts_idx_of_voxels, pooled_features, argmax) rather than in this
// function's own order (argmax, pts_idx_of_voxels, pooled_features).
void roiaware_pool3d_forward_mlu(int boxes_num, int pts_num, int channels,
                                 int max_pts_each_voxel, int out_x, int out_y,
                                 int out_z, const Tensor rois, const Tensor pts,
                                 const Tensor pts_feature, Tensor argmax,
                                 Tensor pts_idx_of_voxels,
                                 Tensor pooled_features, int pool_method) {
  RoiawarePool3dForwardMLUKernelLauncher(
      pool_method, boxes_num, pts_num, channels, max_pts_each_voxel, out_x,
      out_y, out_z, rois, pts, pts_feature, pts_idx_of_voxels, pooled_features,
      argmax);
}

// Dispatch entry point for RoI-aware 3D pooling forward. Only declared here;
// it is not defined in this file.
void roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels,
                                  int max_pts_each_voxel, int out_x, int out_y,
                                  int out_z, const Tensor rois,
                                  const Tensor pts, const Tensor pts_feature,
                                  Tensor argmax, Tensor pts_idx_of_voxels,
                                  Tensor pooled_features, int pool_method);

// Register roiaware_pool3d_forward_mlu as the MLU implementation of the
// dispatch entry point.
REGISTER_DEVICE_IMPL(roiaware_pool3d_forward_impl, MLU,
                     roiaware_pool3d_forward_mlu);

void RoiawarePool3dBackwardMLUKernelLauncher(
    int pool_method, int boxes_num, int out_x, int out_y, int out_z,
    int channels, int max_pts_each_voxel, const Tensor pts_idx_of_voxels,
    const Tensor argmax, const Tensor grad_out, Tensor grad_in) {
  // get compute handle
  auto handle = mluOpGetCurrentHandle();
  auto pts_idx_of_voxels_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      pts_idx_of_voxels, pts_idx_of_voxels.suggest_memory_format());
  auto argmax_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      argmax, argmax.suggest_memory_format());
  auto grad_out_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      grad_out, grad_out.suggest_memory_format());
  auto grad_in_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      grad_in, grad_in.suggest_memory_format());

  MluOpTensorDescriptor pts_idx_of_voxels_desc, argmax_desc, grad_out_desc,
      grad_in_desc;

  pts_idx_of_voxels_desc.set(pts_idx_of_voxels_contiguous);
  argmax_desc.set(argmax_contiguous);
  grad_out_desc.set(grad_out_contiguous);
  grad_in_desc.set(grad_in_contiguous);

  auto pts_idx_of_voxels_impl =
      torch_mlu::getMluTensorImpl(pts_idx_of_voxels_contiguous);
  auto argmax_impl = torch_mlu::getMluTensorImpl(argmax_contiguous);
  auto grad_out_impl = torch_mlu::getMluTensorImpl(grad_out_contiguous);
  auto grad_in_impl = torch_mlu::getMluTensorImpl(grad_in_contiguous);

  auto pts_idx_of_voxels_ptr = pts_idx_of_voxels_impl->cnnlMalloc();
  auto argmax_ptr = argmax_impl->cnnlMalloc();
  auto grad_out_ptr = grad_out_impl->cnnlMalloc();
  auto grad_in_ptr = grad_in_impl->cnnlMalloc();

  CNLOG(INFO) << "Call mluOpRoiawarePool3dBackward().";
  TORCH_MLUOP_CHECK(mluOpRoiawarePool3dBackward(
      handle, pool_method, boxes_num, out_x, out_y, out_z, channels,
      max_pts_each_voxel, pts_idx_of_voxels_desc.desc(), pts_idx_of_voxels_ptr,
      argmax_desc.desc(), argmax_ptr, grad_out_desc.desc(), grad_out_ptr,
      grad_in_desc.desc(), grad_in_ptr));
}

// MLU entry point for the RoI-aware 3D pooling backward pass.
//
// Adapts the argument order to RoiawarePool3dBackwardMLUKernelLauncher:
// `pool_method` moves from last to first; all other arguments keep their
// relative order.
void roiaware_pool3d_backward_mlu(int boxes_num, int out_x, int out_y,
                                  int out_z, int channels,
                                  int max_pts_each_voxel,
                                  const Tensor pts_idx_of_voxels,
                                  const Tensor argmax, const Tensor grad_out,
                                  Tensor grad_in, int pool_method) {
  RoiawarePool3dBackwardMLUKernelLauncher(
      pool_method, boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel,
      pts_idx_of_voxels, argmax, grad_out, grad_in);
}

// Dispatch entry point for RoI-aware 3D pooling backward. Only declared here;
// it is not defined in this file.
void roiaware_pool3d_backward_impl(int boxes_num, int out_x, int out_y,
                                   int out_z, int channels,
                                   int max_pts_each_voxel,
                                   const Tensor pts_idx_of_voxels,
                                   const Tensor argmax, const Tensor grad_out,
                                   Tensor grad_in, int pool_method);

// Register roiaware_pool3d_backward_mlu as the MLU implementation of the
// dispatch entry point.
REGISTER_DEVICE_IMPL(roiaware_pool3d_backward_impl, MLU,
                     roiaware_pool3d_backward_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/roipoint_pool3d_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2022 by Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "mlu_common_helper.h"

// Validates the arguments and runs the RoI point 3D pooling forward pass on
// MLU via mluOpRoiPointPool3d.
//
// Inputs:  xyz, boxes3d, pts_feature (all sharing the dtype of
//          `pooled_features`, which must be Float or Half).
// Outputs: pooled_features and the Int tensor pooled_empty_flag (written by
//          the operator).
// The scalar arguments (batch_size, pts_num, boxes_num, feature_in_len,
// sampled_pts_num) are passed to the operator as-is.
//
// Raises (via TORCH_CHECK) on dtype or shape violations and when an input has
// 2^31 or more elements. Returns without calling the operator when any tensor
// is empty. Every tensor is made contiguous in its own suggested memory
// format before the call; if that produced a temporary for an output tensor,
// the temporary is copied back so the caller always sees the result.
void RoIPointPool3dForwardMLUKernelLauncher(
    int batch_size, int pts_num, int boxes_num, int feature_in_len,
    int sampled_pts_num, const Tensor xyz, const Tensor boxes3d,
    const Tensor pts_feature, Tensor pooled_features,
    Tensor pooled_empty_flag) {
  // check datatype
  TORCH_CHECK(((xyz.scalar_type() == pooled_features.scalar_type()) &&
               (boxes3d.scalar_type() == pooled_features.scalar_type()) &&
               (pts_feature.scalar_type() == pooled_features.scalar_type())),
              "data types of xyz, boxes3d, pts_feature and pooled_features "
              "should be the same, ",
              "but now xyz type is ", xyz.scalar_type(), ", boxes3d type is ",
              boxes3d.scalar_type(), ", pts_feature type is ",
              pts_feature.scalar_type(), ", pooled_features type is ",
              pooled_features.scalar_type(), ".");
  TORCH_CHECK(
      (xyz.scalar_type() == at::kFloat || xyz.scalar_type() == at::kHalf),
      "xyz type should be Float or Half, got ", xyz.scalar_type(), ".");
  TORCH_CHECK((pooled_empty_flag.scalar_type() == at::kInt),
              "pooled_empty_flag type should be Int, got ",
              pooled_empty_flag.scalar_type(), ".");

  // check shape
  TORCH_CHECK(boxes3d.dim() == 3, "boxes3d should be a 3d tensor, got ",
              boxes3d.dim(), "D.");
  TORCH_CHECK(pts_feature.dim() == 3, "pts_feature should be a 3d tensor, got ",
              pts_feature.dim(), "D.");

  TORCH_CHECK(boxes3d.size(2) == 7,
              "the 3rd dimensions of boxes3d should be 7, got ",
              boxes3d.size(2), ".");
  TORCH_CHECK((boxes3d.size(0) == batch_size),
              "the 1st dimensions of boxes3d should be batch_size, ",
              "but now the 1st dimension of boxes3d is ", boxes3d.size(0),
              ", and batch_size is ", batch_size, ".");
  TORCH_CHECK((pts_feature.size(0) == batch_size),
              "the 1st dimensions of pts_feature should be batch_size, ",
              "but now the 1st dimension of pts_feature is ",
              pts_feature.size(0), ", and batch_size is ", batch_size, ".");
  TORCH_CHECK((pts_feature.size(1) == pts_num),
              "the 2nd dimensions of pts_feature should be pts_num, ",
              "but now the 2nd dimension of pts_feature is ",
              pts_feature.size(1), ", and pts_num is ", pts_num, ".");

  // check zero element
  if (xyz.numel() == 0 || pts_feature.numel() == 0 || boxes3d.numel() == 0 ||
      pooled_features.numel() == 0 || pooled_empty_flag.numel() == 0) {
    return;
  }

  // large tensor check
  const size_t max_input_size = 2147483648;
  TORCH_CHECK(xyz.numel() < max_input_size,
              "xyz element num should be less than 2^31, got ", xyz.numel(),
              ".");
  TORCH_CHECK(boxes3d.numel() < max_input_size,
              "boxes3d element num should be less than 2^31, got ",
              boxes3d.numel(), ".");
  TORCH_CHECK(pts_feature.numel() < max_input_size,
              "pts_feature element num should be less than 2^31, got ",
              pts_feature.numel(), ".");

  // set contiguous
  auto xyz_contiguous =
      torch_mlu::cnnl::ops::cnnl_contiguous(xyz, xyz.suggest_memory_format());
  auto pts_feature_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      pts_feature, pts_feature.suggest_memory_format());
  auto boxes3d_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      boxes3d, boxes3d.suggest_memory_format());
  auto pooled_features_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      pooled_features, pooled_features.suggest_memory_format());
  auto pooled_empty_flag_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      pooled_empty_flag, pooled_empty_flag.suggest_memory_format());

  // get ptr of tensors
  auto xyz_impl = torch_mlu::getMluTensorImpl(xyz_contiguous);
  auto xyz_ptr = xyz_impl->cnnlMalloc();
  auto pts_feature_impl = torch_mlu::getMluTensorImpl(pts_feature_contiguous);
  auto pts_feature_ptr = pts_feature_impl->cnnlMalloc();
  auto boxes3d_impl = torch_mlu::getMluTensorImpl(boxes3d_contiguous);
  auto boxes3d_ptr = boxes3d_impl->cnnlMalloc();
  auto pooled_features_impl =
      torch_mlu::getMluTensorImpl(pooled_features_contiguous);
  auto pooled_features_ptr = pooled_features_impl->cnnlMalloc();
  auto pooled_empty_flag_impl =
      torch_mlu::getMluTensorImpl(pooled_empty_flag_contiguous);
  auto pooled_empty_flag_ptr = pooled_empty_flag_impl->cnnlMalloc();

  // create tensor descriptors
  MluOpTensorDescriptor xyz_desc, pts_feature_desc, boxes3d_desc,
      pooled_features_desc, pooled_empty_flag_desc;
  xyz_desc.set(xyz_contiguous);
  pts_feature_desc.set(pts_feature_contiguous);
  boxes3d_desc.set(boxes3d_contiguous);
  pooled_features_desc.set(pooled_features_contiguous);
  pooled_empty_flag_desc.set(pooled_empty_flag_contiguous);

  // get workspace
  size_t workspace_size = 0;
  auto handle = mluOpGetCurrentHandle();
  TORCH_MLUOP_CHECK(mluOpGetRoiPointPool3dWorkspaceSize(
      handle, batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
      xyz_desc.desc(), pts_feature_desc.desc(), boxes3d_desc.desc(),
      pooled_features_desc.desc(), pooled_empty_flag_desc.desc(),
      &workspace_size));

  auto workspace = at::empty(workspace_size, xyz.options().dtype(at::kByte));
  auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
  auto workspace_ptr = workspace_impl->cnnlMalloc();
  TORCH_MLUOP_CHECK(mluOpRoiPointPool3d(
      handle, batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
      xyz_desc.desc(), xyz_ptr, pts_feature_desc.desc(), pts_feature_ptr,
      boxes3d_desc.desc(), boxes3d_ptr, workspace_ptr, workspace_size,
      pooled_features_desc.desc(), pooled_features_ptr,
      pooled_empty_flag_desc.desc(), (int *)pooled_empty_flag_ptr));

  // The operator wrote into the contiguous tensors. When one of them is a
  // temporary (the caller's tensor was not contiguous), propagate the result;
  // otherwise the data is already in place and no copy is needed.
  if (!pooled_features.is_same(pooled_features_contiguous)) {
    pooled_features.copy_(pooled_features_contiguous);
  }
  if (!pooled_empty_flag.is_same(pooled_empty_flag_contiguous)) {
    pooled_empty_flag.copy_(pooled_empty_flag_contiguous);
  }
}

// MLU implementation of the RoIPointPool3d forward op. All arguments are
// handed to RoIPointPool3dForwardMLUKernelLauncher unchanged; results are
// written into `pooled_features` and `pooled_empty_flag`.
void roipoint_pool3d_forward_mlu(int batch_size, int pts_num, int boxes_num,
                                 int feature_in_len, int sampled_pts_num,
                                 const Tensor xyz, const Tensor boxes3d,
                                 const Tensor pts_feature,
                                 Tensor pooled_features,
                                 Tensor pooled_empty_flag) {
  RoIPointPool3dForwardMLUKernelLauncher(batch_size, pts_num, boxes_num,
                                         feature_in_len, sampled_pts_num,
                                         xyz, boxes3d, pts_feature,
                                         pooled_features, pooled_empty_flag);
}

// Declaration only: the definition is not in this file. It is declared here
// so that it can be named in the REGISTER_DEVICE_IMPL invocation below.
void roipoint_pool3d_forward_impl(int batch_size, int pts_num, int boxes_num,
                                  int feature_in_len, int sampled_pts_num,
                                  const Tensor xyz, const Tensor boxes3d,
                                  const Tensor pts_feature,
                                  Tensor pooled_features,
                                  Tensor pooled_empty_flag);

// Associate roipoint_pool3d_forward_mlu with roipoint_pool3d_forward_impl for
// the MLU device key.
REGISTER_DEVICE_IMPL(roipoint_pool3d_forward_impl, MLU,
                     roipoint_pool3d_forward_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/rotated_feature_align_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2022 by Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "mlu_common_helper.h"

void RotatedFeatureAlignForwardMLUKernelLauncher(const Tensor features,
                                                 const Tensor best_bboxes,
                                                 const float spatial_scale,
                                                 const int points,
                                                 Tensor output) {
  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(features.dim());
  auto features_ =
      torch_mlu::cnnl::ops::cnnl_contiguous(features, memory_format);
  auto best_bboxes_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      best_bboxes, best_bboxes.suggest_memory_format());
  auto output_contiguous =
      torch_mlu::cnnl::ops::cnnl_contiguous(output, memory_format);

  MluOpTensorDescriptor features_desc, best_bboxes_desc, output_desc;
  features_desc.set_with_layout(features_, MLUOP_LAYOUT_NHWC);
  best_bboxes_desc.set(best_bboxes_contiguous);
  output_desc.set_with_layout(output_contiguous, MLUOP_LAYOUT_NHWC);

  // get ptr of tensors
  auto features_impl = torch_mlu::getMluTensorImpl(features_);
  auto features_ptr = features_impl->cnnlMalloc();
  auto best_bboxes_impl = torch_mlu::getMluTensorImpl(best_bboxes_contiguous);
  auto best_bboxes_ptr = best_bboxes_impl->cnnlMalloc();
  auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);
  auto output_ptr = output_impl->cnnlMalloc();

  // get compute handle
  auto handle = mluOpGetCurrentHandle();
  TORCH_MLUOP_CHECK(mluOpRotatedFeatureAlignForward(
      handle, features_desc.desc(), features_ptr, best_bboxes_desc.desc(),
      best_bboxes_ptr, spatial_scale, points, output_desc.desc(), output_ptr));

  output.copy_(output_contiguous);
}

void RotatedFeatureAlignBackwardMLUKernelLauncher(const Tensor top_grad,
                                                  const Tensor best_bboxes,
                                                  const float spatial_scale,
                                                  const int points,
                                                  Tensor bottom_grad) {
  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(top_grad.dim());
  auto top_grad_ =
      torch_mlu::cnnl::ops::cnnl_contiguous(top_grad, memory_format);
  auto best_bboxes_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      best_bboxes, best_bboxes.suggest_memory_format());
  auto bottom_grad_ =
      torch_mlu::cnnl::ops::cnnl_contiguous(bottom_grad, memory_format);

  // get ptr of tensors
  auto top_grad_impl = torch_mlu::getMluTensorImpl(top_grad_);
  auto top_grad_ptr = top_grad_impl->cnnlMalloc();
  auto best_bboxes_impl = torch_mlu::getMluTensorImpl(best_bboxes_contiguous);
  auto best_bboxes_ptr = best_bboxes_impl->cnnlMalloc();
  auto bottom_grad_impl = torch_mlu::getMluTensorImpl(bottom_grad_);
  auto bottom_grad_ptr = bottom_grad_impl->cnnlMalloc();

  MluOpTensorDescriptor top_grad_desc, best_bboxes_desc, bottom_grad_desc;
  top_grad_desc.set_with_layout(top_grad_, MLUOP_LAYOUT_NHWC);
  best_bboxes_desc.set(best_bboxes_contiguous);
  bottom_grad_desc.set_with_layout(bottom_grad_, MLUOP_LAYOUT_NHWC);

  // get compute handle
  auto handle = mluOpGetCurrentHandle();
  TORCH_MLUOP_CHECK(mluOpRotatedFeatureAlignBackward(
      handle, top_grad_desc.desc(), top_grad_ptr, best_bboxes_desc.desc(),
      best_bboxes_ptr, spatial_scale, points, bottom_grad_desc.desc(),
      bottom_grad_ptr));
  bottom_grad.copy_(bottom_grad_);
}

// MLU implementation of the rotated_feature_align forward op; forwards its
// arguments unchanged to RotatedFeatureAlignForwardMLUKernelLauncher.
void rotated_feature_align_forward_mlu(const Tensor features,
                                       const Tensor best_bboxes,
                                       const float spatial_scale,
                                       const int points, Tensor output) {
  RotatedFeatureAlignForwardMLUKernelLauncher(
      features, best_bboxes, spatial_scale, points, output);
}

// MLU implementation of the rotated_feature_align backward op; forwards its
// arguments unchanged to RotatedFeatureAlignBackwardMLUKernelLauncher.
void rotated_feature_align_backward_mlu(const Tensor top_grad,
                                        const Tensor best_bboxes,
                                        const float spatial_scale,
                                        const int points, Tensor bottom_grad) {
  RotatedFeatureAlignBackwardMLUKernelLauncher(top_grad, best_bboxes,
                                               spatial_scale, points,
                                               bottom_grad);
}

// Declarations only: the definitions are not in this file. They are declared
// here so that they can be named in the REGISTER_DEVICE_IMPL invocations
// below.
void rotated_feature_align_forward_impl(const Tensor features,
                                        const Tensor best_bboxes,
                                        const float spatial_scale,
                                        const int points, Tensor output);

void rotated_feature_align_backward_impl(const Tensor top_grad,
                                         const Tensor best_bboxes,
                                         const float spatial_scale,
                                         const int points, Tensor bottom_grad);

// Associate the MLU forward/backward functions with the corresponding *_impl
// entry points for the MLU device key.
REGISTER_DEVICE_IMPL(rotated_feature_align_forward_impl, MLU,
                     rotated_feature_align_forward_mlu);
REGISTER_DEVICE_IMPL(rotated_feature_align_backward_impl, MLU,
                     rotated_feature_align_backward_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/scatter_points_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2023 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "mlu_common_helper.h"

// Forward pass of dynamic point-to-voxel scatter on MLU via mlu-ops.
//
// Args:
//   feats:       float32 tensor; size(0) is the number of input points and
//                size(1) the number of feature channels.
//   coors:       int32 tensor with the same size(0) as `feats`.
//   reduce_type: reduction mode, translated with getMluOpReduceMode.
//
// Returns four tensors, in order:
//   reduced_feats [voxel_num, num_feats], out_coors [voxel_num, 3],
//   coors_map [num_input], reduce_count [voxel_num],
// where voxel_num is read back from the device after the kernel ran.
// For empty input, detached clones of `feats` and `coors` plus two empty
// int32 tensors are returned instead.
//
// Raises (via TORCH_CHECK) on wrong dtypes, mismatched leading sizes, or when
// the kernel reports more voxels than input points.
std::vector<Tensor> dynamic_point_to_voxel_forward_mlu(
    const Tensor &feats, const Tensor &coors, const reduce_t reduce_type) {
  // params check
  TORCH_CHECK(feats.scalar_type() == at::kFloat,
              "feats type should be Float, got ", feats.scalar_type());
  TORCH_CHECK(coors.scalar_type() == at::kInt,
              "coors type should be Int32, got ", coors.scalar_type());
  TORCH_CHECK(feats.size(0) == coors.size(0),
              "feats.dim(0) and coors.dim(0) should be same, got ",
              feats.size(0), " vs ", coors.size(0));

  const int num_input = feats.size(0);
  const int num_feats = feats.size(1);
  // zero-element check
  if (num_input == 0)
    return {feats.clone().detach(), coors.clone().detach(),
            coors.new_empty({0}, torch::kInt32),
            coors.new_empty({0}, torch::kInt32)};

  auto mlu_reduce_type = getMluOpReduceMode(reduce_type);
  // Outputs are allocated for the worst case of one voxel per input point and
  // trimmed to voxel_num rows before returning.
  auto reduced_feats = at::empty({num_input, num_feats}, feats.options());
  // NOTE(review): the second dimension is hard-coded to 3 and coors.size(1)
  // is never validated here - confirm callers always pass 3-column coors.
  auto out_coors = at::empty({num_input, 3}, coors.options());
  auto coors_map = at::empty({num_input}, coors.options());
  auto reduce_count = at::empty({num_input}, coors.options());
  auto voxel_num = at::empty({1}, coors.options());

  // INITIAL_MLU_PARAM_WITH_TENSOR is defined outside this file; judging by the
  // identifiers used below, it declares <name>_desc and <name>_ptr for the
  // given tensor.
  INITIAL_MLU_PARAM_WITH_TENSOR(feats);
  INITIAL_MLU_PARAM_WITH_TENSOR(coors);
  INITIAL_MLU_PARAM_WITH_TENSOR(reduced_feats);
  INITIAL_MLU_PARAM_WITH_TENSOR(out_coors);
  INITIAL_MLU_PARAM_WITH_TENSOR(coors_map);
  INITIAL_MLU_PARAM_WITH_TENSOR(reduce_count);
  INITIAL_MLU_PARAM_WITH_TENSOR(voxel_num);

  // get compute handle
  auto handle = mluOpGetCurrentHandle();

  // Query the scratch size and back it with a byte tensor created from
  // feats' options.
  size_t workspace_size;
  TORCH_MLUOP_CHECK(mluOpGetDynamicPointToVoxelForwardWorkspaceSize(
      handle, feats_desc.desc(), coors_desc.desc(), &workspace_size));
  auto workspace_tensor =
      at::empty(workspace_size, feats.options().dtype(at::kByte));
  INITIAL_MLU_PARAM_WITH_TENSOR(workspace_tensor);

  // launch kernel
  TORCH_MLUOP_CHECK(mluOpDynamicPointToVoxelForward(
      handle, mlu_reduce_type, feats_desc.desc(), feats_ptr, coors_desc.desc(),
      coors_ptr, workspace_tensor_ptr, workspace_size,
      reduced_feats_desc.desc(), reduced_feats_ptr, out_coors_desc.desc(),
      out_coors_ptr, coors_map_desc.desc(), coors_map_ptr,
      reduce_count_desc.desc(), reduce_count_ptr, voxel_num_desc.desc(),
      voxel_num_ptr));

  // Copy the single-element voxel count to the host and read it; the CPU
  // temporary stays alive until the end of this statement.
  int voxel_num_value = *static_cast<int *>(voxel_num.cpu().data_ptr());
  TORCH_CHECK(voxel_num_value <= feats.size(0),
              "voxel_num should be less than or equal to feats_num, got ",
              voxel_num_value, " vs ", feats.size(0));
  // coors_map keeps one entry per input point; the others are cut down to the
  // first voxel_num rows.
  return {reduced_feats.slice(0, 0, voxel_num_value),
          out_coors.slice(0, 0, voxel_num_value), coors_map,
          reduce_count.slice(0, 0, voxel_num_value)};
}

// Backward pass of dynamic point-to-voxel scatter on MLU via mlu-ops.
//
// Args:
//   grad_feats:         output; zero-filled first, then written by the kernel.
//   grad_reduced_feats: float32 gradient w.r.t. the reduced features.
//   feats:              float32 input features of the forward pass.
//   reduced_feats:      float32 reduced features; size(0) is used as the
//                       voxel count passed to the kernel.
//   coors_idx:          int32 tensor.
//   reduce_count:       int32 tensor.
//   reduce_type:        reduction mode; only reduce_t::MAX is accepted.
//
// Returns early, leaving `grad_feats` zeroed, when there are no input points
// or no voxels. Raises (via TORCH_CHECK) on wrong dtypes or an unsupported
// reduce type.
void dynamic_point_to_voxel_backward_mlu(
    Tensor &grad_feats, const Tensor &grad_reduced_feats, const Tensor &feats,
    const Tensor &reduced_feats, const Tensor &coors_idx,
    const Tensor &reduce_count, const reduce_t reduce_type) {
  // params check
  TORCH_CHECK(grad_reduced_feats.scalar_type() == at::kFloat,
              "grad_reduced_feats type should be Float, got ",
              grad_reduced_feats.scalar_type());
  TORCH_CHECK(feats.scalar_type() == at::kFloat,
              "feats type should be Float, got ", feats.scalar_type());
  TORCH_CHECK(reduced_feats.scalar_type() == at::kFloat,
              "reduced_feats type should be Float, got ",
              reduced_feats.scalar_type());
  TORCH_CHECK(coors_idx.scalar_type() == at::kInt,
              "coors_idx type should be Int32, got ", coors_idx.scalar_type());
  TORCH_CHECK(reduce_count.scalar_type() == at::kInt,
              "reduce_count type should be Int32, got ",
              reduce_count.scalar_type());

  const int num_input = feats.size(0);
  const int num_reduced = reduced_feats.size(0);

  grad_feats.fill_(0);

  // zero-element check
  if (num_input == 0 || num_reduced == 0) return;

  // TODO(miaochen): remove this after mlu-ops supports other mode of reduce.
  TORCH_CHECK(reduce_type == reduce_t::MAX,
              "only supports max reduce in current version, got ",
              to_string(reduce_type));

  // The voxel count is passed to the kernel as a one-element int32 device
  // tensor. from_blob needs a mutable address, hence the non-const copy;
  // clone() detaches the tensor from this stack variable before the transfer.
  int voxel_num_value = num_reduced;
  auto opts = torch::TensorOptions().dtype(torch::kInt32);
  auto voxel_num =
      torch::from_blob(&voxel_num_value, {1}, opts).clone().to(at::kMLU);
  auto mlu_reduce_type = getMluOpReduceMode(reduce_type);

  // INITIAL_MLU_PARAM_WITH_TENSOR is defined outside this file; judging by the
  // identifiers used below, it declares <name>_desc and <name>_ptr for the
  // given tensor.
  INITIAL_MLU_PARAM_WITH_TENSOR(grad_feats);
  INITIAL_MLU_PARAM_WITH_TENSOR(grad_reduced_feats);
  INITIAL_MLU_PARAM_WITH_TENSOR(feats);
  INITIAL_MLU_PARAM_WITH_TENSOR(reduced_feats);
  INITIAL_MLU_PARAM_WITH_TENSOR(coors_idx);
  INITIAL_MLU_PARAM_WITH_TENSOR(reduce_count);
  INITIAL_MLU_PARAM_WITH_TENSOR(voxel_num);

  // get compute handle
  auto handle = mluOpGetCurrentHandle();

  // Query the scratch size and back it with a byte tensor created from
  // feats' options.
  size_t workspace_size = 0;
  TORCH_MLUOP_CHECK(mluOpGetDynamicPointToVoxelBackwardWorkspaceSize(
      handle, mlu_reduce_type, grad_feats_desc.desc(), feats_desc.desc(),
      grad_reduced_feats_desc.desc(), coors_idx_desc.desc(),
      reduce_count_desc.desc(), voxel_num_desc.desc(), &workspace_size));
  auto workspace_tensor =
      at::empty(workspace_size, feats.options().dtype(at::kByte));
  INITIAL_MLU_PARAM_WITH_TENSOR(workspace_tensor);

  // launch kernel
  TORCH_MLUOP_CHECK(mluOpDynamicPointToVoxelBackward(
      handle, mlu_reduce_type, grad_reduced_feats_desc.desc(),
      grad_reduced_feats_ptr, feats_desc.desc(), feats_ptr,
      reduced_feats_desc.desc(), reduced_feats_ptr, coors_idx_desc.desc(),
      coors_idx_ptr, reduce_count_desc.desc(), reduce_count_ptr,
      voxel_num_desc.desc(), voxel_num_ptr, workspace_tensor_ptr,
      workspace_size, grad_feats_desc.desc(), grad_feats_ptr));
}

// Declarations only: the definitions are not in this file. They are declared
// here so that they can be named in the REGISTER_DEVICE_IMPL invocations
// below.
std::vector<Tensor> dynamic_point_to_voxel_forward_impl(
    const Tensor &feats, const Tensor &coors, const reduce_t reduce_type);

void dynamic_point_to_voxel_backward_impl(
    Tensor &grad_feats, const Tensor &grad_reduced_feats, const Tensor &feats,
    const Tensor &reduced_feats, const Tensor &coors_idx,
    const Tensor &reduce_count, const reduce_t reduce_type);

// Associate the MLU forward/backward functions with the corresponding *_impl
// entry points for the MLU device key.
REGISTER_DEVICE_IMPL(dynamic_point_to_voxel_forward_impl, MLU,
                     dynamic_point_to_voxel_forward_mlu);
REGISTER_DEVICE_IMPL(dynamic_point_to_voxel_backward_impl, MLU,
                     dynamic_point_to_voxel_backward_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/sparse_conv_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2022 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include <torch/script.h>

#include <vector>

#include "mlu_common_helper.h"
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"

// Builds the indice-pair tables for an NDim-dimensional sparse convolution on
// MLU via mluOpGetIndicePairs.
//
// Args:
//   indices:         tensor whose size(0) is the number of active inputs and
//                    whose size(1) - 1 is taken as the coordinate dimension.
//   batchSize:       batch size, narrowed to int for mlu-ops.
//   outSpatialShape, spatialShape, kernelSize, stride, padding, dilation:
//                    per-dimension parameters; the first NDim entries of each
//                    are narrowed to int and passed to mlu-ops.
//   outPadding:      not used in this function.
//   _subM:           non-zero selects the sub_m branch of the return value.
//   _transpose:      forwarded to the sparse convolution descriptor.
//
// Returns {out_indices (first num_act_out rows), indicePairs, indiceNum} when
// sub_m is 0, otherwise {indices, indicePairs, indiceNum}.
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsForwardMLUKernelLauncher(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
  // The following code is copied from
  // mmcv/ops/csrc/pytorch/cuda/spconv_ops_cuda.cu to ensure the output is
  // available for network train. The outputs of this function have correct
  // shape but wrong value.
  // NOTE(review): the tensors returned below are filled by
  // mluOpGetIndicePairs, so the "wrong value" remark above may be stale -
  // confirm and update.
  auto numAct = indices.size(0);
  auto kernelVolume = kernelSize[0];
  int sub_m = (int)_subM;
  int transpose = (int)_transpose;
  int batch = (int)batchSize;
  auto coorDim = indices.size(1) - 1;

  // kernelVolume = product of all kernel extents.
  for (int i = 1; i < kernelSize.size(); ++i) {
    kernelVolume *= kernelSize[i];
  }

  // outputVolume = product of all output spatial extents.
  auto outputVolume = outSpatialShape[0];
  for (int i = 1; i < outSpatialShape.size(); ++i) {
    outputVolume *= outSpatialShape[i];
  }
  // Output tables, all int32 on the device of `indices`: indicePairs is
  // initialised to -1, indiceNum and out_indices to 0.
  torch::Tensor indicePairs = at::full({kernelVolume, 2, numAct}, -1,
                                       indices.options().dtype(at::kInt));
  torch::Tensor indiceNum =
      at::zeros({kernelVolume}, indices.options().dtype(at::kInt));
  // Rows allocated for out_indices: numAct when sub_m == 1, otherwise the
  // smaller of numAct * kernelVolume and batch * outputVolume.
  int out_size = sub_m == 1
                     ? numAct
                     : std::min(numAct * kernelVolume, batch * outputVolume);
  torch::Tensor out_indices =
      at::zeros({out_size, coorDim + 1}, indices.options().dtype(at::kInt));
  auto indices_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      indices, at::MemoryFormat::Contiguous);
  auto indicePairs_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      indicePairs, at::MemoryFormat::Contiguous);
  auto indiceNum_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      indiceNum, at::MemoryFormat::Contiguous);
  auto out_indices_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      out_indices, at::MemoryFormat::Contiguous);

  // mlu-ops takes the convolution parameters as int arrays, so the first NDim
  // entries of each int64 vector are narrowed into int vectors.
  std::vector<int> input_space;
  std::vector<int> filter_space;
  std::vector<int> output_space;
  std::vector<int> padding32;
  std::vector<int> stride32;
  std::vector<int> dilation32;
  for (int i = 0; i < NDim; i++) {
    input_space.push_back(spatialShape[i]);
    filter_space.push_back(kernelSize[i]);
    output_space.push_back(outSpatialShape[i]);
    padding32.push_back(padding[i]);
    stride32.push_back(stride[i]);
    dilation32.push_back(dilation[i]);
  }
  MluOpTensorDescriptor indices_desc, out_indices_desc, indicePairs_desc,
      indiceNum_desc;
  indices_desc.set(indices_contiguous);
  indicePairs_desc.set(indicePairs_contiguous);
  indiceNum_desc.set(indiceNum_contiguous);
  out_indices_desc.set(out_indices_contiguous);
  // Overwrite every descriptor with an explicit ARRAY layout, INT32 dtype and
  // the dimensions computed above.
  {
    mluOpTensorLayout_t layout = MLUOP_LAYOUT_ARRAY;
    mluOpDataType_t dtype = MLUOP_DTYPE_INT32;
    std::vector<int> dims;
    dims = {numAct, coorDim + 1};
    TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(
        indices_desc.desc(), layout, dtype, dims.size(), dims.data()));
    dims = {kernelVolume, 2, numAct};
    TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(
        indicePairs_desc.desc(), layout, dtype, dims.size(), dims.data()));
    dims = {kernelVolume};
    TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(
        indiceNum_desc.desc(), layout, dtype, dims.size(), dims.data()));
    dims = {out_size, coorDim + 1};
    TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(
        out_indices_desc.desc(), layout, dtype, dims.size(), dims.data()));
  }

  // The sparse convolution descriptor is created and destroyed manually.
  // NOTE(review): if any TORCH_MLUOP_CHECK between here and the destroy call
  // throws, the descriptor is not released - consider a scope guard.
  mluOpSparseConvolutionDescriptor_t sparse_conv_desc;
  TORCH_MLUOP_CHECK(mluOpCreateSparseConvolutionDescriptor(&sparse_conv_desc));
  TORCH_MLUOP_CHECK(mluOpSetSparseConvolutionDescriptor(
      sparse_conv_desc, NDim + 2, batch, padding32.data(), stride32.data(),
      dilation32.data(), input_space.data(), filter_space.data(),
      output_space.data(), sub_m, transpose, 0));

  // Query the scratch size and back it with a byte tensor on the device of
  // `indices`.
  auto handle = mluOpGetCurrentHandle();
  size_t workspace_size = 0;
  TORCH_MLUOP_CHECK(mluOpGetIndicePairsWorkspaceSize(
      handle, sparse_conv_desc, indices_desc.desc(), indicePairs_desc.desc(),
      out_indices_desc.desc(), indiceNum_desc.desc(), &workspace_size));
  // Despite its name, this is the workspace tensor itself, not a size.
  auto indice_workspace_size =
      at::empty(workspace_size, indices.options().dtype(at::kByte));

  auto indices_impl = torch_mlu::getMluTensorImpl(indices_contiguous);
  auto out_indices_impl = torch_mlu::getMluTensorImpl(out_indices_contiguous);
  auto indicePairs_impl = torch_mlu::getMluTensorImpl(indicePairs_contiguous);
  auto indiceNum_impl = torch_mlu::getMluTensorImpl(indiceNum_contiguous);
  auto indice_workspace_impl =
      torch_mlu::getMluTensorImpl(indice_workspace_size);

  auto indices_ptr = indices_impl->cnnlMalloc();
  auto out_indices_ptr = out_indices_impl->cnnlMalloc();
  auto indicePairs_ptr = indicePairs_impl->cnnlMalloc();
  auto indiceNum_ptr = indiceNum_impl->cnnlMalloc();
  auto indice_workspace_ptr = indice_workspace_impl->cnnlMalloc();

  TORCH_MLUOP_CHECK(mluOpGetIndicePairs(
      handle, sparse_conv_desc, indices_desc.desc(), indices_ptr,
      indice_workspace_ptr, workspace_size, indicePairs_desc.desc(),
      indicePairs_ptr, out_indices_desc.desc(), out_indices_ptr,
      indiceNum_desc.desc(), indiceNum_ptr));
  // Read the number of active outputs back from the descriptor before it is
  // destroyed.
  int num_act_out = 0;
  TORCH_MLUOP_CHECK(
      mluOpGetSparseConvolutionNumActOut(sparse_conv_desc, &num_act_out));
  TORCH_MLUOP_CHECK(mluOpDestroySparseConvolutionDescriptor(sparse_conv_desc));
  // Without sub_m only the first num_act_out rows of out_indices are
  // returned; with sub_m the input indices are returned as the output indices.
  if (!sub_m) {
    return {out_indices.slice(0, 0, num_act_out), indicePairs, indiceNum};
  } else {
    return {indices, indicePairs, indiceNum};
  }
}

// Forward pass of the indice (sparse) convolution on MLU via mlu-ops.
//
// Args:
//   features:    input features.
//   filters:     4-D or 5-D weights; the last dimension is taken as the
//                number of output channels.
//   indicePairs: indice-pair table, passed through to mlu-ops.
//   indiceNum:   pair counts; copied to the host as int64 because a host
//                pointer is handed to the mlu-ops calls.
//   numActOut:   number of rows of the returned tensor.
//   _inverse, _subM: flags forwarded unchanged to mlu-ops.
//
// Returns a float32 tensor of shape [numActOut, C] on the device of
// `features`.
torch::Tensor IndiceConvForwardMLUKernelLauncher(
    torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,
    torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse,
    int64_t _subM) {
  // Host-side int64 copy of indiceNum; the named tensor keeps the buffer
  // behind `indice_num` alive for the whole function.
  auto pair_counts_host = indiceNum.to({torch::kCPU}).to(torch::kInt64);
  auto indice_num = pair_counts_host.data_ptr<int64_t>();

  // Zero-initialised result buffer.
  int out_channels;
  if (filters.dim() == 4) {
    out_channels = filters.size(3);
  } else {
    out_channels = filters.size(4);
  }
  torch::Tensor output = at::zeros({numActOut, out_channels},
                                   features.options().dtype(at::kFloat));

  // Contiguous tensors used for the descriptors and the device pointers.
  auto features_contig = torch_mlu::cnnl::ops::cnnl_contiguous(
      features, at::MemoryFormat::Contiguous);
  auto filters_contig = torch_mlu::cnnl::ops::cnnl_contiguous(
      filters, at::MemoryFormat::Contiguous);
  auto pairs_contig = torch_mlu::cnnl::ops::cnnl_contiguous(
      indicePairs, at::MemoryFormat::Contiguous);
  auto output_contig = torch_mlu::cnnl::ops::cnnl_contiguous(
      output, at::MemoryFormat::Contiguous);

  MluOpTensorDescriptor features_desc;
  MluOpTensorDescriptor filters_desc;
  MluOpTensorDescriptor indice_pairs_desc;
  MluOpTensorDescriptor output_desc;
  features_desc.set(features_contig);
  filters_desc.set(filters_contig);
  indice_pairs_desc.set(pairs_contig);
  output_desc.set(output_contig);

  // Re-issue every descriptor with MLUOP_LAYOUT_ARRAY, keeping the dtype and
  // dimensions that set() derived from the tensor.
  MluOpTensorDescriptor *array_descs[] = {&features_desc, &filters_desc,
                                          &indice_pairs_desc, &output_desc};
  for (MluOpTensorDescriptor *d : array_descs) {
    mluOpTensorLayout_t layout;
    mluOpDataType_t dtype;
    int dim;
    int dims[8];
    TORCH_MLUOP_CHECK(
        mluOpGetTensorDescriptor(d->desc(), &layout, &dtype, &dim, dims));
    TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(d->desc(), MLUOP_LAYOUT_ARRAY,
                                               dtype, dim, dims));
  }

  // Query the scratch size and back it with a byte tensor on the device of
  // `features`.
  auto handle = mluOpGetCurrentHandle();
  size_t workspace_size = 0;
  TORCH_MLUOP_CHECK(mluOpGetIndiceConvolutionForwardWorkspaceSize(
      handle, features_desc.desc(), filters_desc.desc(),
      indice_pairs_desc.desc(), output_desc.desc(), indice_num, numActOut,
      _inverse, _subM, &workspace_size));
  auto workspace =
      at::empty(workspace_size, features.options().dtype(at::kByte));

  // Device addresses of inputs, scratch space and output.
  auto features_ptr = torch_mlu::getMluTensorImpl(features_contig)->cnnlMalloc();
  auto filters_ptr = torch_mlu::getMluTensorImpl(filters_contig)->cnnlMalloc();
  auto indice_pairs_ptr =
      torch_mlu::getMluTensorImpl(pairs_contig)->cnnlMalloc();
  auto workspace_ptr = torch_mlu::getMluTensorImpl(workspace)->cnnlMalloc();
  auto output_ptr = torch_mlu::getMluTensorImpl(output)->cnnlMalloc();

  TORCH_MLUOP_CHECK(mluOpIndiceConvolutionForward(
      handle, features_desc.desc(), features_ptr, filters_desc.desc(),
      filters_ptr, indice_pairs_desc.desc(), indice_pairs_ptr, indice_num,
      numActOut, _inverse, _subM, workspace_ptr, workspace_size,
      output_desc.desc(), output_ptr));

  return output;
}

// Backward pass of the indice (sparse) convolution on MLU via mlu-ops.
//
// Args:
//   features:    input features; size(0) and size(1) give the shape of the
//                returned input gradient.
//   filters:     4-D or 5-D weights; the returned filter gradient has the
//                same shape.
//   outGrad:     gradient with respect to the convolution output.
//   indicePairs: indice-pair table, passed through to mlu-ops.
//   indiceNum:   pair counts; copied to the host as int64 because a host
//                pointer is handed to the mlu-ops calls.
//   _inverse, _subM: flags forwarded unchanged to mlu-ops.
//
// Returns {input_grad, filters_grad}, both float32.
//
// Raises (via TORCH_CHECK / TORCH_MLUOP_CHECK) when `filters` is neither 4-D
// nor 5-D or when any mlu-ops call reports a failure.
std::vector<torch::Tensor> IndiceConvBackwardMLUKernelLauncher(
    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
    int64_t _subM) {
  // Reject unsupported filter ranks up front; otherwise filters_grad would
  // have no defined shape.
  TORCH_CHECK(filters.dim() == 4 || filters.dim() == 5,
              "filters should be a 4-D or 5-D tensor, got ", filters.dim(),
              "-D");

  // Host-side int64 copy of indiceNum; the named tensors keep the buffer
  // behind `indice_num` alive for the whole function.
  auto indice_num_cpu = indiceNum.to({torch::kCPU});
  auto indice_num_cpu_64 = indice_num_cpu.to(torch::kInt64);
  auto indice_num = indice_num_cpu_64.data_ptr<int64_t>();

  // Zero-initialised gradient buffers (freshly allocated, hence contiguous).
  torch::Tensor input_grad = at::zeros({features.size(0), features.size(1)},
                                       features.options().dtype(at::kFloat));
  torch::Tensor filters_grad =
      at::zeros(filters.sizes(), filters.options().dtype(at::kFloat));

  auto features_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      features, at::MemoryFormat::Contiguous);
  auto filters_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      filters, at::MemoryFormat::Contiguous);
  auto output_grad_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      outGrad, at::MemoryFormat::Contiguous);
  auto indice_pairs_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      indicePairs, at::MemoryFormat::Contiguous);

  MluOpTensorDescriptor features_desc, output_grad_desc, filters_desc,
      indice_pairs_desc, input_grad_desc, filters_grad_desc;
  features_desc.set(features_contiguous);
  filters_desc.set(filters_contiguous);
  output_grad_desc.set(output_grad_contiguous);
  indice_pairs_desc.set(indice_pairs_contiguous);
  // Describe the gradient buffers themselves, so that descriptor dtype and
  // shape always match the memory the kernels write to.
  input_grad_desc.set(input_grad);
  filters_grad_desc.set(filters_grad);

  // need to set desc layout with mluOp functions
  {
    mluOpTensorLayout_t layout;
    mluOpDataType_t dtype;
    int dim;
    int dims[8];

    // Plain ARRAY layout for everything except the filters.
    MluOpTensorDescriptor *array_descs[] = {&features_desc, &output_grad_desc,
                                            &indice_pairs_desc,
                                            &input_grad_desc};
    for (MluOpTensorDescriptor *d : array_descs) {
      TORCH_MLUOP_CHECK(
          mluOpGetTensorDescriptor(d->desc(), &layout, &dtype, &dim, dims));
      TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(
          d->desc(), MLUOP_LAYOUT_ARRAY, dtype, dim, dims));
    }

    // 4-D filters are described as HWCN, 5-D filters as a plain array.
    TORCH_MLUOP_CHECK(mluOpGetTensorDescriptor(filters_desc.desc(), &layout,
                                               &dtype, &dim, dims));
    const mluOpTensorLayout_t filters_layout =
        dim == 4 ? MLUOP_LAYOUT_HWCN : MLUOP_LAYOUT_ARRAY;
    TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(
        filters_desc.desc(), filters_layout, dtype, dim, dims));
  }

  // Query both scratch sizes; a failing query now raises instead of leaving
  // the size at 0.
  auto handle = mluOpGetCurrentHandle();
  size_t data_workspace_size = 0;
  TORCH_MLUOP_CHECK(mluOpGetIndiceConvolutionBackwardDataWorkspaceSize(
      handle, output_grad_desc.desc(), filters_desc.desc(),
      indice_pairs_desc.desc(), input_grad_desc.desc(), indice_num, _inverse,
      &data_workspace_size));

  size_t filters_workspace_size = 0;
  TORCH_MLUOP_CHECK(mluOpGetIndiceConvolutionBackwardFilterWorkspaceSize(
      handle, features_desc.desc(), output_grad_desc.desc(),
      indice_pairs_desc.desc(), filters_grad_desc.desc(), indice_num, _inverse,
      _subM, &filters_workspace_size));

  auto indice_convbpdata_workspace =
      at::empty(data_workspace_size, features.options().dtype(at::kByte));
  auto indice_convbpfilter_workspace =
      at::empty(filters_workspace_size, filters.options().dtype(at::kByte));

  // Device addresses of inputs and scratch buffers.
  auto features_impl = torch_mlu::getMluTensorImpl(features_contiguous);
  auto filters_impl = torch_mlu::getMluTensorImpl(filters_contiguous);
  auto output_grad_impl = torch_mlu::getMluTensorImpl(output_grad_contiguous);
  auto indice_pairs_impl = torch_mlu::getMluTensorImpl(indice_pairs_contiguous);
  auto indice_convbpdata_workspace_impl =
      torch_mlu::getMluTensorImpl(indice_convbpdata_workspace);
  auto indice_convbpfilter_workspace_impl =
      torch_mlu::getMluTensorImpl(indice_convbpfilter_workspace);

  auto features_ptr = features_impl->cnnlMalloc();
  auto filters_ptr = filters_impl->cnnlMalloc();
  auto output_grad_ptr = output_grad_impl->cnnlMalloc();
  auto indice_pairs_ptr = indice_pairs_impl->cnnlMalloc();
  auto indice_convbpdata_workspace_ptr =
      indice_convbpdata_workspace_impl->cnnlMalloc();
  auto indice_convbpfilter_workspace_ptr =
      indice_convbpfilter_workspace_impl->cnnlMalloc();

  // outputs
  auto input_grad_impl = torch_mlu::getMluTensorImpl(input_grad);
  auto input_grad_ptr = input_grad_impl->cnnlMalloc();
  auto filters_grad_impl = torch_mlu::getMluTensorImpl(filters_grad);
  auto filters_grad_ptr = filters_grad_impl->cnnlMalloc();

  // Gradient with respect to the input features.
  TORCH_MLUOP_CHECK(mluOpIndiceConvolutionBackwardData(
      handle, output_grad_desc.desc(), output_grad_ptr, filters_desc.desc(),
      filters_ptr, indice_pairs_desc.desc(), indice_pairs_ptr, indice_num,
      _inverse, _subM, indice_convbpdata_workspace_ptr, data_workspace_size,
      input_grad_desc.desc(), input_grad_ptr));

  // Gradient with respect to the filters.
  TORCH_MLUOP_CHECK(mluOpIndiceConvolutionBackwardFilter(
      handle, features_desc.desc(), features_ptr, output_grad_desc.desc(),
      output_grad_ptr, indice_pairs_desc.desc(), indice_pairs_ptr, indice_num,
      _inverse, _subM, indice_convbpfilter_workspace_ptr,
      filters_workspace_size, filters_grad_desc.desc(), filters_grad_ptr));

  return {input_grad, filters_grad};
}

// MLU implementation of the indice convolution forward op; forwards its
// arguments unchanged to IndiceConvForwardMLUKernelLauncher and returns the
// launcher's result.
torch::Tensor indice_conv_forward_mlu(torch::Tensor features,
                                      torch::Tensor filters,
                                      torch::Tensor indicePairs,
                                      torch::Tensor indiceNum,
                                      int64_t numActOut, int64_t _inverse,
                                      int64_t _subM) {
  torch::Tensor output = IndiceConvForwardMLUKernelLauncher(
      features, filters, indicePairs, indiceNum, numActOut, _inverse, _subM);
  return output;
}

// MLU implementation of the indice convolution backward op; forwards its
// arguments unchanged to IndiceConvBackwardMLUKernelLauncher and returns the
// launcher's result.
std::vector<torch::Tensor> indice_conv_backward_mlu(
    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
    int64_t _subM) {
  std::vector<torch::Tensor> grads = IndiceConvBackwardMLUKernelLauncher(
      features, filters, outGrad, indicePairs, indiceNum, _inverse, _subM);
  return grads;
}

// Declarations only: the definitions are not in this file. They are declared
// here so that they can be named in the REGISTER_DEVICE_IMPL invocations
// below.
torch::Tensor indice_conv_forward_impl(torch::Tensor features,
                                       torch::Tensor filters,
                                       torch::Tensor indicePairs,
                                       torch::Tensor indiceNum,
                                       int64_t numActOut, int64_t _inverse,
                                       int64_t _subM);

std::vector<torch::Tensor> indice_conv_backward_impl(
    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
    int64_t _subM);

// Associate the MLU forward/backward functions with the corresponding *_impl
// entry points for the MLU device key.
REGISTER_DEVICE_IMPL(indice_conv_forward_impl, MLU, indice_conv_forward_mlu);
REGISTER_DEVICE_IMPL(indice_conv_backward_impl, MLU, indice_conv_backward_mlu);

// Explicit instantiations of GetIndicePairsForwardMLUKernelLauncher for the
// template arguments 2, 3 and 4, so that these specializations are emitted by
// this translation unit. (The template argument is presumably the number of
// spatial dimensions -- confirm against the template definition.)
template std::vector<torch::Tensor> GetIndicePairsForwardMLUKernelLauncher<2>(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

template std::vector<torch::Tensor> GetIndicePairsForwardMLUKernelLauncher<3>(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

template std::vector<torch::Tensor> GetIndicePairsForwardMLUKernelLauncher<4>(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/three_nn_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2022 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "mlu_common_helper.h"

void ThreeNNMLUKernelLauncher(int b, int n, int m, const Tensor unknown,
                              const Tensor known, Tensor dist2, Tensor idx) {
  auto unknown_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      unknown, unknown.suggest_memory_format());
  auto known_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      known, known.suggest_memory_format());
  auto dist2_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
      dist2, dist2.suggest_memory_format());
  auto idx_contiguous =
      torch_mlu::cnnl::ops::cnnl_contiguous(idx, idx.suggest_memory_format());

  MluOpTensorDescriptor unknown_desc, known_desc, dist2_desc, idx_desc;
  unknown_desc.set(unknown_contiguous);
  known_desc.set(known_contiguous);
  dist2_desc.set(dist2_contiguous);
  idx_desc.set(idx_contiguous);

  auto handle = mluOpGetCurrentHandle();
  size_t workspace_size = 0;
  TORCH_MLUOP_CHECK(mluOpGetThreeNNForwardWorkspaceSize(
      handle, known_desc.desc(), &workspace_size));
  auto known_workspace =
      at::empty(workspace_size, known.options().dtype(at::kByte));

  auto unknown_impl = torch_mlu::getMluTensorImpl(unknown_contiguous);
  auto known_impl = torch_mlu::getMluTensorImpl(known_contiguous);
  auto dist2_impl = torch_mlu::getMluTensorImpl(dist2_contiguous);
  auto idx_impl = torch_mlu::getMluTensorImpl(idx_contiguous);
  auto workspace_impl = torch_mlu::getMluTensorImpl(known_workspace);
  auto unknown_ptr = unknown_impl->cnnlMalloc();
  auto known_ptr = known_impl->cnnlMalloc();
  auto dist2_ptr = dist2_impl->cnnlMalloc();
  auto idx_ptr = idx_impl->cnnlMalloc();
  auto workspace_ptr = workspace_impl->cnnlMalloc();

  TORCH_MLUOP_CHECK(mluOpThreeNNForward(
      handle, unknown_desc.desc(), unknown_ptr, known_desc.desc(), known_ptr,
      workspace_ptr, workspace_size, dist2_desc.desc(), dist2_ptr,
      idx_desc.desc(), idx_ptr));
}

// MLU backend entry point for three_nn forward; forwards all arguments
// unchanged to ThreeNNMLUKernelLauncher.
void three_nn_forward_mlu(int b, int n, int m, const Tensor unknown,
                          const Tensor known, Tensor dist2, Tensor idx) {
  ThreeNNMLUKernelLauncher(b, n, m, unknown, known, dist2, idx);
}

// Declaration of the dispatch entry point (defined outside this translation
// unit), needed so it can be named in the registration below.
void three_nn_forward_impl(int b, int n, int m, const Tensor unknown,
                           const Tensor known, Tensor dist2, Tensor idx);

// Register three_nn_forward_mlu as the MLU implementation of the entry point.
REGISTER_DEVICE_IMPL(three_nn_forward_impl, MLU, three_nn_forward_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/tin_shift_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2022 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "mlu_common_helper.h"

// Validates the arguments and runs the mluOp TIN-shift forward operator.
//
// Requirements enforced here (each raises through TORCH_CHECK on failure):
//   - `input` is Float or Half and 4-dimensional;
//   - `shift` is 2-dimensional;
//   - dimension 0 of `input` and `shift` agree and are non-zero;
//   - dimension 3 of `input` is non-zero.
// When dimension 1 of `input` is empty the function returns without
// launching anything.
void TINShiftForwardMLUKernelLauncher(Tensor input, Tensor shift,
                                      Tensor output) {
  // dtype and rank first, so the size() calls below are known to be valid
  const auto input_dtype = input.scalar_type();
  TORCH_CHECK(input_dtype == at::kFloat || input_dtype == at::kHalf,
              "input type should be Float or Half, got ", input_dtype, ".");
  const auto input_rank = input.dim();
  TORCH_CHECK(input_rank == 4, "input should be a 4d tensor, got ", input_rank,
              "d.");
  const auto shift_rank = shift.dim();
  TORCH_CHECK(shift_rank == 2, "shift should be a 2d tensor, got ", shift_rank,
              "d.");

  // size constraints
  const auto input_batch = input.size(0);
  const auto shift_batch = shift.size(0);
  TORCH_CHECK(
      input_batch == shift_batch,
      "input batch size should be the same as shift's, input batch size is ",
      input_batch, " and shift batch size is ", shift_batch, ".");
  TORCH_CHECK(input_batch != 0, "Input batch size should not be zero.");
  TORCH_CHECK(input.size(3) != 0,
              "The last dim size of input should not be zero.");

  // nothing to do for an empty dimension 1
  if (input.size(1) == 0) {
    return;
  }

  // input: contiguous form, descriptor, device pointer
  auto input_c = torch_mlu::cnnl::ops::cnnl_contiguous(
      input, input.suggest_memory_format());
  MluOpTensorDescriptor input_desc;
  input_desc.set(input_c);
  auto input_ptr = torch_mlu::getMluTensorImpl(input_c)->cnnlMalloc();

  // shift: contiguous form, descriptor, device pointer
  auto shift_c = torch_mlu::cnnl::ops::cnnl_contiguous(
      shift, shift.suggest_memory_format());
  MluOpTensorDescriptor shift_desc;
  shift_desc.set(shift_c);
  auto shift_ptr = torch_mlu::getMluTensorImpl(shift_c)->cnnlMalloc();

  // output: contiguous form, descriptor, device pointer
  auto output_c = torch_mlu::cnnl::ops::cnnl_contiguous(
      output, output.suggest_memory_format());
  MluOpTensorDescriptor output_desc;
  output_desc.set(output_c);
  auto output_ptr = torch_mlu::getMluTensorImpl(output_c)->cnnlMalloc();

  // launch on the current mluOp handle
  auto handle = mluOpGetCurrentHandle();
  TORCH_MLUOP_CHECK(mluOpTinShiftForward(handle, input_desc.desc(), input_ptr,
                                         shift_desc.desc(), shift_ptr,
                                         output_desc.desc(), output_ptr));
}

// Validates the arguments and runs the mluOp TIN-shift backward operator.
//
// Requirements enforced here (each raises through TORCH_CHECK on failure):
//   - `grad_output` is Float or Half and 4-dimensional;
//   - `shift` is 2-dimensional;
//   - dimension 0 of `grad_output` and `shift` agree and are non-zero;
//   - dimension 3 of `grad_output` is non-zero.
// When dimension 1 of `grad_output` is empty the function returns without
// launching anything.
void TINShiftBackwardMLUKernelLauncher(Tensor grad_output, Tensor shift,
                                       Tensor grad_input) {
  // dtype and rank first, so the size() calls below are known to be valid
  const auto grad_dtype = grad_output.scalar_type();
  TORCH_CHECK(grad_dtype == at::kFloat || grad_dtype == at::kHalf,
              "grad_output type should be Float or Half, got ", grad_dtype,
              ".");
  const auto grad_rank = grad_output.dim();
  TORCH_CHECK(grad_rank == 4, "grad_output should be a 4d tensor, got ",
              grad_rank, "d.");
  const auto shift_rank = shift.dim();
  TORCH_CHECK(shift_rank == 2, "shift should be a 2d tensor, got ", shift_rank,
              "d.");

  // size constraints
  const auto grad_batch = grad_output.size(0);
  const auto shift_batch = shift.size(0);
  TORCH_CHECK(grad_batch == shift_batch,
              "grad_output batch size should be the same as shift's, "
              "grad_output batch size is ",
              grad_batch, ", shift batch size is ", shift_batch, ".");
  TORCH_CHECK(grad_batch != 0, "grad_output batch size should not be zero.");
  TORCH_CHECK(grad_output.size(3) != 0,
              "The last dim size of grad_output should not be zero.");

  // nothing to do for an empty dimension 1
  if (grad_output.size(1) == 0) {
    return;
  }

  // grad_output: contiguous form, descriptor, device pointer
  auto grad_output_c = torch_mlu::cnnl::ops::cnnl_contiguous(
      grad_output, grad_output.suggest_memory_format());
  MluOpTensorDescriptor grad_output_desc;
  grad_output_desc.set(grad_output_c);
  auto grad_output_ptr =
      torch_mlu::getMluTensorImpl(grad_output_c)->cnnlMalloc();

  // shift: contiguous form, descriptor, device pointer
  auto shift_c = torch_mlu::cnnl::ops::cnnl_contiguous(
      shift, shift.suggest_memory_format());
  MluOpTensorDescriptor shift_desc;
  shift_desc.set(shift_c);
  auto shift_ptr = torch_mlu::getMluTensorImpl(shift_c)->cnnlMalloc();

  // grad_input: contiguous form, descriptor, device pointer
  auto grad_input_c = torch_mlu::cnnl::ops::cnnl_contiguous(
      grad_input, grad_input.suggest_memory_format());
  MluOpTensorDescriptor grad_input_desc;
  grad_input_desc.set(grad_input_c);
  auto grad_input_ptr = torch_mlu::getMluTensorImpl(grad_input_c)->cnnlMalloc();

  // launch on the current mluOp handle
  auto handle = mluOpGetCurrentHandle();
  TORCH_MLUOP_CHECK(mluOpTinShiftBackward(
      handle, grad_output_desc.desc(), grad_output_ptr, shift_desc.desc(),
      shift_ptr, grad_input_desc.desc(), grad_input_ptr));
}

// MLU backend entry point for TIN-shift forward; forwards all arguments
// unchanged to TINShiftForwardMLUKernelLauncher.
void tin_shift_forward_mlu(Tensor input, Tensor shift, Tensor output) {
  TINShiftForwardMLUKernelLauncher(input, shift, output);
}

// MLU backend entry point for TIN-shift backward; forwards all arguments
// unchanged to TINShiftBackwardMLUKernelLauncher.
void tin_shift_backward_mlu(Tensor grad_output, Tensor shift,
                            Tensor grad_input) {
  TINShiftBackwardMLUKernelLauncher(grad_output, shift, grad_input);
}

// Declarations of the dispatch entry points (defined outside this translation
// unit), needed so they can be named in the registrations below.
void tin_shift_forward_impl(Tensor input, Tensor shift, Tensor output);

void tin_shift_backward_impl(Tensor grad_output, Tensor shift,
                             Tensor grad_input);

// Register the MLU functions above as the MLU implementations of the entry
// points.
REGISTER_DEVICE_IMPL(tin_shift_forward_impl, MLU, tin_shift_forward_mlu);
REGISTER_DEVICE_IMPL(tin_shift_backward_impl, MLU, tin_shift_backward_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/mlu/voxelization_mlu.cpp
================================================
/*************************************************************************
 * Copyright (C) 2022 by Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "mlu_common_helper.h"

/*************************************************************************
 * INITIAL_MLU_PARAM_WITH_TENSOR(NAME) prepares the tensor variable NAME
 * for an mluOp call. It declares, in the enclosing scope:
 *   NAME_contiguous : cnnl_contiguous(NAME, NAME.suggest_memory_format())
 *   NAME_desc       : MluOpTensorDescriptor set from NAME_contiguous
 *   NAME_impl       : torch_mlu::getMluTensorImpl(NAME_contiguous)
 *   NAME_ptr        : NAME_impl->cnnlMalloc()
 * NAME is token-pasted, so it must be a plain identifier, and the macro
 * can be expanded only once per NAME within a scope (it declares
 * variables).
 *************************************************************************/
#define INITIAL_MLU_PARAM_WITH_TENSOR(NAME)                          \
  auto NAME##_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(    \
      NAME, NAME.suggest_memory_format());                           \
  MluOpTensorDescriptor NAME##_desc;                                 \
  NAME##_desc.set(NAME##_contiguous);                                \
  auto NAME##_impl = torch_mlu::getMluTensorImpl(NAME##_contiguous); \
  auto NAME##_ptr = NAME##_impl->cnnlMalloc();

// Runs the mluOp voxelization operator and returns the voxel count it
// reports.
//
// Arguments:
//   points                - input tensor handed to the operator.
//   voxels, coors,
//   num_points_per_voxel  - tensors handed to the operator as outputs.
//   voxel_size,
//   coors_range           - host float vectors; each is copied into a 1-D
//                           float32 tensor and moved to the MLU.
//   max_points, max_voxels, NDim
//                         - forwarded to the operator unchanged.
// Returns:
//   The single int32 value the operator writes into its voxel-number output,
//   read back on the CPU.
//
// NOTE(review): the literal `true` passed to both mluOp calls is a flag whose
// meaning is defined by the mluOp API (presumably "deterministic") -- confirm.
int HardVoxelizeForwardMLUKernelLauncher(
    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const int max_points,
    const int max_voxels, const int NDim = 3) {
  // Mutable copies: the parameters are const, and from_blob is given the
  // buffer through data().
  std::vector<float> _voxel_size(voxel_size.begin(), voxel_size.end());
  std::vector<float> _coors_range(coors_range.begin(), coors_range.end());
  auto opts = torch::TensorOptions().dtype(torch::kFloat32);
  // clone() gives each tensor its own storage before it is moved to the MLU,
  // so nothing keeps pointing at the local vectors.
  auto voxel_size_tensor =
      torch::from_blob(_voxel_size.data(), {int64_t(_voxel_size.size())}, opts)
          .clone()
          .to(at::kMLU);
  auto coors_range_tensor =
      torch::from_blob(_coors_range.data(), {int64_t(_coors_range.size())},
                       opts)
          .clone()
          .to(at::kMLU);
  // Each expansion declares <name>_desc and <name>_ptr for the tensor.
  INITIAL_MLU_PARAM_WITH_TENSOR(points);
  INITIAL_MLU_PARAM_WITH_TENSOR(voxels);
  INITIAL_MLU_PARAM_WITH_TENSOR(coors);
  INITIAL_MLU_PARAM_WITH_TENSOR(num_points_per_voxel);
  INITIAL_MLU_PARAM_WITH_TENSOR(voxel_size_tensor);
  INITIAL_MLU_PARAM_WITH_TENSOR(coors_range_tensor);

  // One-element int32 tensor that receives the voxel count.
  auto voxel_num_tensor = at::empty({1}, points.options().dtype(torch::kInt32));
  INITIAL_MLU_PARAM_WITH_TENSOR(voxel_num_tensor);

  // Initialised so the value is well defined even if the query below does not
  // write it.
  size_t workspace_size = 0;
  auto handle = mluOpGetCurrentHandle();
  TORCH_MLUOP_CHECK(mluOpGetVoxelizationWorkspaceSize(
      handle, points_desc.desc(), voxel_size_tensor_desc.desc(),
      coors_range_tensor_desc.desc(), max_points, max_voxels, NDim, true,
      voxels_desc.desc(), coors_desc.desc(), num_points_per_voxel_desc.desc(),
      voxel_num_tensor_desc.desc(), &workspace_size));
  // Scratch buffer of the requested size, as a byte tensor.
  auto workspace_tensor =
      at::empty(workspace_size, points.options().dtype(at::kByte));
  INITIAL_MLU_PARAM_WITH_TENSOR(workspace_tensor);

  TORCH_MLUOP_CHECK(mluOpVoxelization(
      handle, points_desc.desc(), points_ptr, voxel_size_tensor_desc.desc(),
      voxel_size_tensor_ptr, coors_range_tensor_desc.desc(),
      coors_range_tensor_ptr, max_points, max_voxels, NDim, true,
      workspace_tensor_ptr, workspace_size, voxels_desc.desc(), voxels_ptr,
      coors_desc.desc(), coors_ptr, num_points_per_voxel_desc.desc(),
      num_points_per_voxel_ptr, voxel_num_tensor_desc.desc(),
      voxel_num_tensor_ptr));
  // Copy the count back to the host and return it.
  auto voxel_num_cpu = voxel_num_tensor.to(at::kCPU);
  int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];
  return voxel_num_int;
}

// MLU backend entry point for hard voxelization; forwards all arguments
// unchanged to HardVoxelizeForwardMLUKernelLauncher and returns its result.
int hard_voxelize_forward_mlu(const at::Tensor &points, at::Tensor &voxels,
                              at::Tensor &coors,
                              at::Tensor &num_points_per_voxel,
                              const std::vector<float> voxel_size,
                              const std::vector<float> coors_range,
                              const int max_points, const int max_voxels,
                              const int NDim) {
  return HardVoxelizeForwardMLUKernelLauncher(
      points, voxels, coors, num_points_per_voxel, voxel_size, coors_range,
      max_points, max_voxels, NDim);
}

// Declaration of the dispatch entry point (defined outside this translation
// unit), needed so it can be named in the registration below.
int hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,
                               at::Tensor &coors,
                               at::Tensor &num_points_per_voxel,
                               const std::vector<float> voxel_size,
                               const std::vector<float> coors_range,
                               const int max_points, const int max_voxels,
                               const int NDim);

// Register hard_voxelize_forward_mlu as the MLU implementation of the entry
// point.
REGISTER_DEVICE_IMPL(hard_voxelize_forward_impl, MLU,
                     hard_voxelize_forward_mlu);


================================================
FILE: mmcv/ops/csrc/pytorch/modulated_deform_conv.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_DIOPI
#include <diopi/diopirt.h>
#include <diopi/functions.h>
#include <diopi/functions_mmcv.h>
#include <torch/csrc/utils/pybind.h>

#include "csrc_dipu/diopirt/diopirt_impl.h"
#include "csrc_dipu/runtime/device/deviceapis.h"
#include "csrc_dipu/utils/helpfunc.hpp"

using dipu::VENDOR_TYPE;
using dipu::diopi_helper::toDiopiScalar;
using dipu::diopi_helper::toDiopiTensorHandle;
#endif

// Dispatch entry point for the modulated deformable im2col step.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under this function's own
// name; the macro selects the registered backend (presumably by the device of
// the tensor arguments -- see pytorch_device_registry.hpp). `data_col` is the
// last argument and is the only non-const tensor, i.e. the output buffer.
void modulated_deformable_im2col_impl(
    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor data_col) {
  DISPATCH_DEVICE_IMPL(modulated_deformable_im2col_impl, data_im, data_offset,
                       data_mask, batch_size, channels, height_im, width_im,
                       height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,
                       stride_h, stride_w, dilation_h, dilation_w,
                       deformable_group, data_col);
}

// Dispatch entry point for the modulated deformable col2im step.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under this function's own
// name; the macro selects the registered backend (presumably by the device of
// the tensor arguments -- see pytorch_device_registry.hpp). `grad_im` is the
// only non-const tensor, i.e. the output buffer.
void modulated_deformable_col2im_impl(
    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor grad_im) {
  DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_impl, data_col, data_offset,
                       data_mask, batch_size, channels, height_im, width_im,
                       height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,
                       stride_h, stride_w, dilation_h, dilation_w,
                       deformable_group, grad_im);
}

// Dispatch entry point for the modulated deformable col2im "coord" step.
// Forwards all arguments to DISPATCH_DEVICE_IMPL under this function's own
// name; the macro selects the registered backend (presumably by the device of
// the tensor arguments -- see pytorch_device_registry.hpp). `grad_offset` and
// `grad_mask` are the non-const tensors, i.e. the output buffers.
void modulated_deformable_col2im_coord_impl(
    const Tensor data_col, const Tensor data_im, const Tensor data_offset,
    const Tensor data_mask, const int batch_size, const int channels,
    const int height_im, const int width_im, const int height_col,
    const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int deformable_group,
    Tensor grad_offset, Tensor grad_mask) {
  DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, data_col,
                       data_im, data_offset, data_mask, batch_size, channels,
                       height_im, width_im, height_col, width_col, kernel_h,
                       kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
                       dilation_w, deformable_group, grad_offset, grad_mask);
}

// Device-generic forward pass of modulated deformable convolution.
//
// For every batch element the input is unfolded into `columns` by
// modulated_deformable_im2col_impl, and each group's slice of `output` is
// then accumulated with a matrix product of that group's flattened weights
// and columns. The bias is added at the end when `with_bias` is set.
//
// Arguments (tensors are indexed as 4-D here: size(0)..size(3)):
//   input   - read as {batch, channels, height, width}.
//   weight  - read as {channels_out, channels / group, kernel_h, kernel_w}.
//   bias    - viewed as {1, bias.size(0), 1, 1}; only used if with_bias.
//   ones    - re-created locally when too small; not otherwise read by this
//             function.
//   offset, mask - indexed per batch element and passed to the im2col step.
//   output  - viewed as {batch, channels_out, height_out, width_out} and
//             zeroed; the views share storage with the caller's tensor, so
//             the result is written in place.
//   columns - replaced by a freshly allocated zero buffer; the caller's
//             tensor is not written.
//   kernel_*, stride_*, pad_*, dilation_*, group, deformable_group -
//             convolution geometry.
//
// Raises (via TORCH_CHECK) when the weight's kernel size differs from
// kernel_h/kernel_w or when channels != weight.size(1) * group.
void modulated_deform_conv_forward_fallthrough(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias) {
  at::DeviceGuard guard(input.device());

  const int batch = input.size(0);
  const int channels = input.size(1);
  const int height = input.size(2);
  const int width = input.size(3);

  const int channels_out = weight.size(0);
  const int channels_kernel = weight.size(1);
  const int kernel_h_ = weight.size(2);
  const int kernel_w_ = weight.size(3);

  // The message pieces are concatenated, not printf-formatted, so the values
  // are passed as separate arguments: requested kernel vs. weight's kernel.
  TORCH_CHECK(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
              "Input shape and kernel shape won't match: (", kernel_h, " x ",
              kernel_w, " vs ", kernel_h_, " x ", kernel_w_, ").");
  TORCH_CHECK(channels == channels_kernel * group,
              "Input shape and kernel channels won't match: (", channels,
              " vs ", channels_kernel * group, ").");

  const int height_out =
      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int width_out =
      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;

  if (ones.ndimension() != 2 ||
      ones.size(0) * ones.size(1) < height_out * width_out) {
    // Resize plane and fill with ones...
    ones = at::ones({height_out, width_out}, input.options());
  }

  // resize output
  output = output.view({batch, channels_out, height_out, width_out}).zero_();
  // resize temporary columns
  columns =
      at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out},
                input.options());

  // split the output channel dimension into {group, channels_out / group}
  output = output.view({output.size(0), group, output.size(1) / group,
                        output.size(2), output.size(3)});

  for (int b = 0; b < batch; b++) {
    // unfold one batch element into `columns`
    modulated_deformable_im2col_impl(
        input[b], offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, columns);

    // divide into group
    weight = weight.view({group, weight.size(0) / group, weight.size(1),
                          weight.size(2), weight.size(3)});
    columns = columns.view({group, columns.size(0) / group, columns.size(1)});

    for (int g = 0; g < group; g++) {
      // output[b][g] += weight[g] (flattened) x columns[g]
      output[b][g] = output[b][g]
                         .flatten(1)
                         .addmm_(weight[g].flatten(1), columns[g])
                         .view_as(output[b][g]);
    }

    // restore the ungrouped shapes for the next iteration
    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
                          weight.size(3), weight.size(4)});
    columns =
        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
  }

  // merge {group, channels_out / group} back into one channel dimension
  output = output.view({output.size(0), output.size(1) * output.size(2),
                        output.size(3), output.size(4)});

  if (with_bias) {
    output += bias.view({1, bias.size(0), 1, 1});
  }
}

// Device-generic backward pass of modulated deformable convolution.
//
// For every batch element this function:
//   1. fills `columns` with weight^T x grad_output for each group;
//   2. calls modulated_deformable_col2im_coord_impl to write grad_offset[b]
//      and grad_mask[b];
//   3. calls modulated_deformable_col2im_impl to write grad_input[b];
//   4. re-unfolds the input with modulated_deformable_im2col_impl and
//      accumulates grad_weight (and grad_bias when `with_bias`) via addmm_.
//
// Arguments (tensors are indexed as 4-D here: size(0)..size(3)):
//   input       - read as {batch, channels, height, width}.
//   weight      - read as {out_channels, channels / group, kernel_h,
//                 kernel_w}.
//   bias        - not read by this function.
//   ones        - re-created locally when too small; used as the all-ones
//                 vector for the bias gradient.
//   offset, mask - indexed per batch element and passed to the helper steps.
//   columns     - replaced by a freshly allocated zero buffer; the caller's
//                 tensor is not written.
//   grad_input, grad_weight, grad_bias, grad_offset, grad_mask -
//                 gradient buffers updated in place through views/indexing.
//   grad_output - incoming gradient, viewed per group.
//   kernel_*, stride_*, pad_*, dilation_*, group, deformable_group -
//                 convolution geometry.
//
// Raises (via TORCH_CHECK) when the weight's kernel size differs from
// kernel_h/kernel_w or when channels != weight.size(1) * group.
void modulated_deform_conv_backward_fallthrough(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
    const bool with_bias) {
  at::DeviceGuard guard(input.device());

  const int batch = input.size(0);
  const int channels = input.size(1);
  const int height = input.size(2);
  const int width = input.size(3);

  const int channels_kernel = weight.size(1);
  const int kernel_h_ = weight.size(2);
  const int kernel_w_ = weight.size(3);
  // The message pieces are concatenated, not printf-formatted, so the values
  // are passed as separate arguments: requested kernel vs. weight's kernel.
  TORCH_CHECK(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
              "Input shape and kernel shape won't match: (", kernel_h, " x ",
              kernel_w, " vs ", kernel_h_, " x ", kernel_w_, ").");
  TORCH_CHECK(channels == channels_kernel * group,
              "Input shape and kernel channels won't match: (", channels,
              " vs ", channels_kernel * group, ").");

  const int height_out =
      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int width_out =
      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;

  if (ones.ndimension() != 2 ||
      ones.size(0) * ones.size(1) < height_out * width_out) {
    // Resize plane and fill with ones...
    ones = at::ones({height_out, width_out}, input.options());
  }

  grad_input = grad_input.view({batch, channels, height, width});
  columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out},
                      input.options());

  // split the channel dimension of grad_output into {group, C_out / group}
  grad_output =
      grad_output.view({grad_output.size(0), group, grad_output.size(1) / group,
                        grad_output.size(2), grad_output.size(3)});

  for (int b = 0; b < batch; b++) {
    // divide into groups
    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
    weight = weight.view({group, weight.size(0) / group, weight.size(1),
                          weight.size(2), weight.size(3)});

    for (int g = 0; g < group; g++) {
      // beta = 0, alpha = 1: columns[g] is overwritten with
      // weight[g]^T x grad_output[b][g]
      columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
                        grad_output[b][g].flatten(1), 0.0f, 1.0f);
    }

    // restore the ungrouped shapes expected by the helper steps
    columns =
        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
                          weight.size(3), weight.size(4)});

    // gradient w.r.t. input coordinate data
    modulated_deformable_col2im_coord_impl(
        columns, input[b], offset[b], mask[b], 1, channels, height, width,
        height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
        stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
        grad_mask[b]);
    // gradient w.r.t. input data
    modulated_deformable_col2im_impl(
        columns, offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, grad_input[b]);

    // gradient w.r.t. weight, dWeight should accumulate across the batch and
    // group
    modulated_deformable_im2col_impl(
        input[b], offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, columns);

    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
    grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
                                    grad_weight.size(1), grad_weight.size(2),
                                    grad_weight.size(3)});
    if (with_bias)
      grad_bias = grad_bias.view({group, grad_bias.size(0) / group});

    for (int g = 0; g < group; g++) {
      // grad_weight[g] += grad_output[b][g] x columns[g]^T
      grad_weight[g] =
          grad_weight[g]
              .flatten(1)
              .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))
              .view_as(grad_weight[g]);
      if (with_bias) {
        // grad_bias[g] += grad_output[b][g] x ones (sum over positions)
        grad_bias[g] =
            grad_bias[g]
                .view({-1, 1})
                .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))
                .view(-1);
      }
    }

    // restore the ungrouped shapes for the next iteration
    columns =
        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
    grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1),
                                    grad_weight.size(2), grad_weight.size(3),
                                    grad_weight.size(4)});
    if (with_bias)
      grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});
  }
  // merge {group, C_out / group} back into one channel dimension
  grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1),
                                  grad_output.size(2), grad_output.size(3),
                                  grad_output.size(4)});
}

#ifdef MMCV_WITH_DIOPI
// DIOPI-backed forward pass of modulated deformable convolution.
//
// Control flow:
//   1. If `input` lives on the host (diopi_host), run the device-generic
//      fallthrough implementation directly and return.
//   2. Otherwise, if diopiModulatedDeformConvMmcv has a non-null address,
//      call it on the current DIPU stream and return when it reports
//      diopiSuccess.
//   3. Otherwise (symbol absent or call unsuccessful) log a warning, copy all
//      tensors to the CPU, run the fallthrough implementation there and copy
//      the CPU result back into `output`.
void modulated_deform_conv_forward_diopi(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias) {
  auto input_p = toDiopiTensorHandle(input);
  diopiDevice_t device;
  diopiGetTensorDevice(input_p, &device);
  // Host tensors never go through DIOPI.
  if (device == diopi_host) {
    modulated_deform_conv_forward_fallthrough(
        input, weight, bias, ones, offset, mask, output, columns, kernel_h,
        kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w,
        group, deformable_group, with_bias);
    return;
  }
  // DIOPI context bound to the current DIPU stream.
  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
  diopiContextHandle_t ch = &ctx;
  auto weight_p = toDiopiTensorHandle(weight);
  auto bias_p = toDiopiTensorHandle(bias);
  auto ones_p = toDiopiTensorHandle(ones);
  auto offset_p = toDiopiTensorHandle(offset);
  auto mask_p = toDiopiTensorHandle(mask);
  auto output_p = toDiopiTensorHandle(output);
  auto columns_p = toDiopiTensorHandle(columns);
  // Only call the DIOPI kernel when its address is non-null (presumably the
  // symbol is weakly linked and may be missing for some vendors -- confirm).
  if (reinterpret_cast<void*>(diopiModulatedDeformConvMmcv) != nullptr) {
    if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), "NPU") == 0) {
      // For the "NPU" vendor the Python GIL is released for the duration of
      // the call; it is re-acquired when `no_gil` goes out of scope.
      pybind11::gil_scoped_release no_gil;
      auto ret = diopiModulatedDeformConvMmcv(
          ch, output_p, columns_p, ones_p, input_p, weight_p, bias_p, offset_p,
          mask_p, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
          dilation_h, dilation_w, group, deformable_group, with_bias);
      if (ret == diopiSuccess) return;
    } else {
      // Other vendors: same call with the GIL held.
      auto ret = diopiModulatedDeformConvMmcv(
          ch, output_p, columns_p, ones_p, input_p, weight_p, bias_p, offset_p,
          mask_p, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
          dilation_h, dilation_w, group, deformable_group, with_bias);
      if (ret == diopiSuccess) return;
    }
  }
  // CPU fallback: compute on host copies, then copy the result back.
  LOG(WARNING) << "Fallback to cpu: mmcv ext op modulated_deform_conv_forward";
  auto input_cpu = input.cpu();
  auto weight_cpu = weight.cpu();
  auto bias_cpu = bias.cpu();
  auto ones_cpu = ones.cpu();
  auto offset_cpu = offset.cpu();
  auto mask_cpu = mask.cpu();
  auto output_cpu = output.cpu();
  auto columns_cpu = columns.cpu();
  modulated_deform_conv_forward_fallthrough(
      input_cpu, weight_cpu, bias_cpu, ones_cpu, offset_cpu, mask_cpu,
      output_cpu, columns_cpu, kernel_h, kernel_w, stride_h, stride_w, pad_h,
      pad_w, dilation_h, dilation_w, group, deformable_group, with_bias);
  output.copy_(output_cpu);
  return;
}

// DIOPI-backed backward pass of modulated deformable convolution.
//
// Control flow:
//   1. If `input` lives on the host (diopi_host), run the device-generic
//      fallthrough implementation directly and return.
//   2. Otherwise, if diopiModulatedDeformConvBackwardMmcv has a non-null
//      address, call it on the current DIPU stream and return when it reports
//      diopiSuccess.
//   3. Otherwise (symbol absent or call unsuccessful) log a warning, copy all
//      tensors to the CPU, run the fallthrough implementation there and copy
//      the five gradient tensors (grad_input, grad_weight, grad_bias,
//      grad_offset, grad_mask) back into the caller's tensors.
void modulated_deform_conv_backward_diopi(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
    const bool with_bias) {
  auto input_p = toDiopiTensorHandle(input);
  diopiDevice_t device;
  diopiGetTensorDevice(input_p, &device);
  // Host tensors never go through DIOPI.
  if (device == diopi_host) {
    modulated_deform_conv_backward_fallthrough(
        input, weight, bias, ones, offset, mask, columns, grad_input,
        grad_weight, grad_bias, grad_offset, grad_mask, grad_output, kernel_h,
        kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w,
        group, deformable_group, with_bias);
    return;
  }
  // DIOPI context bound to the current DIPU stream.
  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
  diopiContextHandle_t ch = &ctx;
  auto weight_p = toDiopiTensorHandle(weight);
  auto bias_p = toDiopiTensorHandle(bias);
  auto ones_p = toDiopiTensorHandle(ones);
  auto offset_p = toDiopiTensorHandle(offset);
  auto mask_p = toDiopiTensorHandle(mask);
  auto columns_p = toDiopiTensorHandle(columns);
  auto grad_input_p = toDiopiTensorHandle(grad_input);
  auto grad_weight_p = toDiopiTensorHandle(grad_weight);
  auto grad_bias_p = toDiopiTensorHandle(grad_bias);
  auto grad_offset_p = toDiopiTensorHandle(grad_offset);
  auto grad_mask_p = toDiopiTensorHandle(grad_mask);
  auto grad_output_p = toDiopiTensorHandle(grad_output);

  // Only call the DIOPI kernel when its address is non-null (presumably the
  // symbol is weakly linked and may be missing for some vendors -- confirm).
  if (reinterpret_cast<void*>(diopiModulatedDeformConvBackwardMmcv) !=
      nullptr) {
    if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), "NPU") == 0) {
      // For the "NPU" vendor the Python GIL is released for the duration of
      // the call; it is re-acquired when `no_gil` goes out of scope.
      pybind11::gil_scoped_release no_gil;
      auto ret = diopiModulatedDeformConvBackwardMmcv(
          ch, grad_input_p, grad_weight_p, grad_bias_p, grad_offset_p,
          grad_mask_p, input_p, weight_p, bias_p, ones_p, offset_p, mask_p,
          columns_p, grad_output_p, kernel_h, kernel_w, stride_h, stride_w,
          pad_h, pad_w, dilation_h, dilation_w, group, deformable_group,
          with_bias);
      if (ret == diopiSuccess) return;
    } else {
      // Other vendors: same call with the GIL held.
      auto ret = diopiModulatedDeformConvBackwardMmcv(
          ch, grad_input_p, grad_weight_p, grad_bias_p, grad_offset_p,
          grad_mask_p, input_p, weight_p, bias_p, ones_p, offset_p, mask_p,
          columns_p, grad_output_p, kernel_h, kernel_w, stride_h, stride_w,
          pad_h, pad_w, dilation_h, dilation_w, group, deformable_group,
          with_bias);
      if (ret == diopiSuccess) return;
    }
  }
  // CPU fallback: compute on host copies, then copy the gradients back.
  // The message names the backward op so the log identifies which pass fell
  // back.
  LOG(WARNING) << "Fallback to cpu: mmcv ext op modulated_deform_conv_backward";
  auto input_cpu = input.cpu();
  auto weight_cpu = weight.cpu();
  auto bias_cpu = bias.cpu();
  auto ones_cpu = ones.cpu();
  auto offset_cpu = offset.cpu();
  auto mask_cpu = mask.cpu();
  auto columns_cpu = columns.cpu();
  auto grad_input_cpu = grad_input.cpu();
  auto grad_weight_cpu = grad_weight.cpu();
  auto grad_bias_cpu = grad_bias.cpu();
  auto grad_offset_cpu = grad_offset.cpu();
  auto grad_mask_cpu = grad_mask.cpu();
  auto grad_output_cpu = grad_output.cpu();
  modulated_deform_conv_backward_fallthrough(
      input_cpu, weight_cpu, bias_cpu, ones_cpu, offset_cpu, mask_cpu,
      columns_cpu, grad_input_cpu, grad_weight_cpu, grad_bias_cpu,
      grad_offset_cpu, grad_mask_cpu, grad_output_cpu, kernel_h, kernel_w,
      stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
      deformable_group, with_bias);
  grad_input.copy_(grad_input_cpu);
  grad_weight.copy_(grad_weight_cpu);
  grad_bias.copy_(grad_bias_cpu);
  grad_offset.copy_(grad_offset_cpu);
  grad_mask.copy_(grad_mask_cpu);
  return;
}
#endif

// Entry point for the modulated deformable convolution forward pass.
//
// The implementation is selected at compile time: when MMCV_WITH_DIOPI is
// defined the call goes to modulated_deform_conv_forward_diopi, otherwise it
// goes to modulated_deform_conv_forward_fallthrough. In both configurations
// every argument is forwarded unchanged and in the same order.
void modulated_deform_conv_forward(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias) {
#ifdef MMCV_WITH_DIOPI
  // DIOPI-enabled build: use the DIOPI wrapper.
  modulated_deform_conv_forward_diopi(
      input, weight, bias, ones, offset, mask, output, columns, kernel_h,
      kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
      deformable_group, with_bias);
#else
  // Default build: call the fallthrough implementation directly.
  modulated_deform_conv_forward_fallthrough(
      input, weight, bias, ones, offset, mask, output, columns, kernel_h,
      kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
      deformable_group, with_bias);
#endif
}

// Entry point for the modulated deformable convolution backward pass.
//
// The implementation is selected at compile time: when MMCV_WITH_DIOPI is
// defined the call goes to modulated_deform_conv_backward_diopi, otherwise it
// goes to modulated_deform_conv_backward_fallthrough. In both configurations
// every argument is forwarded unchanged and in the same order.
void modulated_deform_conv_backward(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
    const bool with_bias) {
#ifdef MMCV_WITH_DIOPI
  // DIOPI-enabled build: use the DIOPI wrapper.
  modulated_deform_conv_backward_diopi(
      input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
      grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
      stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
      deformable_group, with_bias);
#else
  // Default build: call the fallthrough implementation directly.
  modulated_deform_conv_backward_fallthrough(
      input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
      grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
      stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
      deformable_group, with_bias);
#endif
}


================================================
FILE: mmcv/ops/csrc/pytorch/mps/bbox_overlaps_mps.mm
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "pytorch_device_registry.hpp"

#include "MPSLibrary.h"
#include "MPSStream.h"
#include "MPSUtils.h"

using at::Tensor;

// Metal Shading Language source of the bbox-overlap kernel. The text is
// compiled at runtime, so it must not be edited casually: every character
// inside the raw string is part of the program.
//
// Kernel summary (bbox_overlap_mps_kernel):
//  * each thread writes exactly one element, ious[index];
//  * when `aligned` is false the flat index is split into
//    base1 = index / num_bbox2 and base2 = index % num_bbox2, otherwise both
//    boxes are read at `index`;
//  * box areas and the intersection width/height all add `offset`;
//  * mode == 0 divides the intersection by (area1 + area2 - intersection),
//    any other mode divides by area1; the denominator is floored at `offset`.
const static std::string kSourceCode = R"(
#include <metal_math>
#include <metal_stdlib>
using namespace metal;

kernel void bbox_overlap_mps_kernel(constant const float4* bboxes1,
                       constant const float4* bboxes2,
                       device float* ious,
                       constant int& num_bbox1,
                       constant int& num_bbox2,
                       constant int& mode,
                       constant bool& aligned,
                       constant int& offset,
                       uint index [[thread_position_in_grid]])
{
    int base1 = index;
    int base2 = index;
    if(!aligned){
      base1 = index / num_bbox2;
      base2 = index % num_bbox2;
    }

    const float f_offset = float(offset);

    const float4 b1 = bboxes1[base1];
    const float b1_area = (b1[2]-b1[0]+f_offset)*(b1[3]-b1[1]+f_offset);

    const float4 b2 = bboxes2[base2];
    const float b2_area = (b2[2]-b2[0]+f_offset)*(b2[3]-b2[1]+f_offset);

    const float2 left_top = fmax(b1.xy, b2.xy);
    const float2 right_bottom = fmin(b1.zw, b2.zw);
    const float2 wh = fmax(right_bottom - left_top + f_offset, 0.0f);
    const float interS = wh.x * wh.y;

    const float baseS =
        fmax(mode == 0 ? b1_area + b2_area - interS : b1_area, f_offset);
    ious[index] = interS / baseS;
}
)";

// Encodes and commits the bbox-overlap Metal kernel on the current MPS stream.
//
// bboxes1, bboxes2: box tensors bound to the kernel's first two buffers; their
//                   size(0) is passed as num_bbox1 / num_bbox2.
// ious:             output tensor; one kernel thread is dispatched per element.
// mode, aligned, offset: forwarded unchanged to the kernel.
//
// The Metal library is compiled from kSourceCode on first use and fetched from
// MPSLibraryManager under the name "bbox_overlap" afterwards.
void BBoxOverlapsMPSKernelLauncher(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                                   const int mode, const bool aligned, const int offset) {
  // Nothing to compute for an empty output. Return before any encoder is
  // created so that a zero-sized grid / threadgroup is never dispatched and no
  // command encoder is left open.
  if (ious.numel() == 0) return;

  // get stream
  auto stream = at::mps::getCurrentMPSStream();
  auto library_manager = MPSLibraryManager::getInstance();
  MPSLibrary* library;
  const static std::string kLibraryName = "bbox_overlap";
  if (library_manager->hasLibrary(kLibraryName))
    library = library_manager->getLibrary(kLibraryName);
  else
    library = library_manager->createLibraryFromSouce(kLibraryName, kSourceCode);
  auto func_pso = library->getComputePipelineState("bbox_overlap_mps_kernel");

  // create command buffer and encoder
  MTLCommandBuffer_t command_buffer = stream->commandBuffer();
  MTLComputeCommandEncoder_t compute_encoder = [command_buffer computeCommandEncoder];

  // set pso and buffer
  int num_bbox1 = bboxes1.size(0);
  int num_bbox2 = bboxes2.size(0);
  int num_elements = ious.numel();
  setMTLArgs(compute_encoder, func_pso, bboxes1, bboxes2, ious, num_bbox1, num_bbox2, mode, aligned,
             offset);

  // set grid size: one thread per output element, with the threadgroup capped
  // at the pipeline's maximum and never larger than the grid itself
  MTLSize grid_size = MTLSizeMake(num_elements, 1, 1);
  const NSUInteger total_threads = static_cast<NSUInteger>(num_elements);
  NSUInteger thread_group_size_x = func_pso.maxTotalThreadsPerThreadgroup;
  if (thread_group_size_x > total_threads) {
    thread_group_size_x = total_threads;
  }
  MTLSize thread_group_size = MTLSizeMake(thread_group_size_x, 1, 1);

  // encoding
  [compute_encoder dispatchThreads:grid_size threadsPerThreadgroup:thread_group_size];
  [compute_encoder endEncoding];

  // commit, not sure if flush is required
  stream->commit(false);
}

// MPS implementation of bbox_overlaps: forwards every argument unchanged to
// BBoxOverlapsMPSKernelLauncher.
void bbox_overlaps_mps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious, const int mode,
                       const bool aligned, const int offset) {
  BBoxOverlapsMPSKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset);
}

// Declaration of the device-dispatched bbox_overlaps_impl, followed by the
// registration of bbox_overlaps_mps as its implementation for the MPS device.
void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious, const int mode,
                        const bool aligned, const int offset);
REGISTER_DEVICE_IMPL(bbox_overlaps_impl, MPS, bbox_overlaps_mps);


================================================
FILE: mmcv/ops/csrc/pytorch/ms_deform_attn.cpp
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatch stub for the multi-scale deformable attention forward pass.
// DISPATCH_DEVICE_IMPL selects the implementation registered under the name
// ms_deform_attn_impl_forward and calls it with the arguments listed below;
// its result is returned to the caller as-is.
Tensor ms_deform_attn_impl_forward(const Tensor &value,
                                   const Tensor &spatial_shapes,
                                   const Tensor &level_start_index,
                                   const Tensor &sampling_loc,
                                   const Tensor &attn_weight,
                                   const int im2col_step) {
  return DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_forward, value,
                              spatial_shapes, level_start_index, sampling_loc,
                              attn_weight, im2col_step);
}

// Device-dispatch stub for the multi-scale deformable attention backward pass.
// DISPATCH_DEVICE_IMPL selects the implementation registered under the name
// ms_deform_attn_impl_backward. The three non-const tensors (grad_value,
// grad_sampling_loc, grad_attn_weight) are passed by mutable reference and are
// the outputs of the call; the function itself returns nothing.
void ms_deform_attn_impl_backward(
    const Tensor &value, const Tensor &spatial_shapes,
    const Tensor &level_start_index, const Tensor &sampling_loc,
    const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,
    Tensor &grad_sampling_loc, Tensor &grad_attn_weight,
    const int im2col_step) {
  DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_backward, value, spatial_shapes,
                       level_start_index, sampling_loc, attn_weight,
                       grad_output, grad_value, grad_sampling_loc,
                       grad_attn_weight, im2col_step);
}

// Public forward entry point for multi-scale deformable attention.
// Holds a device guard on value's device for the duration of the call and
// delegates the computation to ms_deform_attn_impl_forward, returning its
// result.
Tensor ms_deform_attn_forward(const Tensor &value, const Tensor &spatial_shapes,
                              const Tensor &level_start_index,
                              const Tensor &sampling_loc,
                              const Tensor &attn_weight,
                              const int im2col_step) {
  // The guard is declared first so it outlives the delegated call.
  at::DeviceGuard device_scope(value.device());
  Tensor attended = ms_deform_attn_impl_forward(
      value, spatial_shapes, level_start_index, sampling_loc, attn_weight,
      im2col_step);
  return attended;
}

// Public backward entry point for multi-scale deformable attention.
// Holds a device guard on value's device for the duration of the call and
// delegates to ms_deform_attn_impl_backward, which receives the three
// gradient tensors by mutable reference.
void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,
                             const Tensor &level_start_index,
                             const Tensor &sampling_loc,
                             const Tensor &attn_weight,
                             const Tensor &grad_output, Tensor &grad_value,
                             Tensor &grad_sampling_loc,
                             Tensor &grad_attn_weight, const int im2col_step) {
  // The guard is declared first so it outlives the delegated call.
  at::DeviceGuard device_scope(value.device());
  ms_deform_attn_impl_backward(
      value, spatial_shapes, level_start_index, sampling_loc, attn_weight,
      grad_output, grad_value, grad_sampling_loc, grad_attn_weight,
      im2col_step);
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/active_rotated_filter_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/musa/ActiveRotatingFilter_musa.cu
#include "active_rotated_filter_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

void ActiveRotatedFilterForwardMUSAKernelLauncher(const Tensor input,
                                                  const Tensor indices,
                                                  Tensor output) {
  int num_output_planes = input.size(0);
  int num_input_planes = input.size(1);
  int num_orientations = input.size(2);
  int kH = input.size(3);
  int kW = input.size(4);
  int num_rotations = indices.size(3);
  int nEntry = num_orientations * kH * kW;
  int output_size = input.numel();

  c10::musa::MUSAGuard device_guard(input.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "active_rotated_filter_forward_musa_kernel", [&] {
        active_rotated_filter_forward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, input.data_ptr<scalar_t>(),
                indices.data_ptr<int>(), num_input_planes, num_output_planes,
                num_orientations, num_rotations, nEntry,
                output.data_ptr<scalar_t>());
      });
  AT_MUSA_CHECK(musaGetLastError());
}

void ActiveRotatedFilterBackwardMUSAKernelLauncher(const Tensor grad_out,
                                                   const Tensor indices,
                                                   Tensor grad_in) {
  int num_orientations = indices.size(0);
  int kH = indices.size(1);
  int kW = indices.size(2);
  int num_rotations = indices.size(3);
  int num_output_planes = grad_out.size(0) / num_rotations;
  int num_input_planes = grad_out.size(1) / num_orientations;
  int nEntry = num_orientations * kH * kW;
  int output_size = grad_in.numel();

  c10::musa::MUSAGuard device_guard(indices.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES(
      grad_out.scalar_type(), "active_rotated_filter_backward_musa_kernel",
      [&] {
        active_rotated_filter_backward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, grad_out.data_ptr<scalar_t>(),
                indices.data_ptr<int>(), num_input_planes, num_output_planes,
                num_orientations, num_rotations, nEntry,
                grad_in.data_ptr<scalar_t>());
      });
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/assign_score_withk_musa.mu
================================================
// Modified from
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu
#include <stdio.h>
#include <stdlib.h>

#include "assign_score_withk_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

void AssignScoreWithKForwardMUSAKernelLauncher(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor& points, const Tensor& centers, const Tensor& scores,
    const Tensor& knn_idx, Tensor& output) {
  c10::musa::MUSAGuard device_guard(points.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  dim3 blocks(GET_BLOCKS(B * O * N1 * K, THREADS_PER_BLOCK));
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES(
      points.scalar_type(), "assign_score_withk_forward_musa_kernel", [&] {
        assign_score_withk_forward_musa_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                B, N0, N1, M, K, O, aggregate, points.data_ptr<scalar_t>(),
                centers.data_ptr<scalar_t>(), scores.data_ptr<scalar_t>(),
                knn_idx.data_ptr<int64_t>(), output.data_ptr<scalar_t>());
      });

  AT_MUSA_CHECK(musaGetLastError());
}

void AssignScoreWithKBackwardMUSAKernelLauncher(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor& grad_out, const Tensor& points, const Tensor& centers,
    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
    Tensor& grad_centers, Tensor& grad_scores) {
  c10::musa::MUSAGuard device_guard(grad_out.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  dim3 blocks1(GET_BLOCKS(B * M * O, THREADS_PER_BLOCK));
  dim3 threads1(THREADS_PER_BLOCK);
  dim3 blocks2(GET_BLOCKS(B * N1 * K * M, THREADS_PER_BLOCK));
  dim3 threads2(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES(
      grad_out.scalar_type(), "assign_score_withk_points_backward_musa_kernel",
      [&] {
        assign_score_withk_points_backward_musa_kernel<scalar_t>
            <<<blocks1, threads1, 0, stream>>>(
                B, N0, N1, M, K, O, aggregate, grad_out.data_ptr<scalar_t>(),
                scores.data_ptr<scalar_t>(), knn_idx.data_ptr<int64_t>(),
                grad_points.data_ptr<scalar_t>(),
                grad_centers.data_ptr<scalar_t>());
      });

  AT_DISPATCH_FLOATING_TYPES(
      grad_out.scalar_type(), "assign_score_withk_scores_backward_musa_kernel",
      [&] {
        assign_score_withk_scores_backward_musa_kernel<scalar_t>
            <<<blocks2, threads2, 0, stream>>>(
                B, N0, N1, M, K, O, aggregate, grad_out.data_ptr<scalar_t>(),
                points.data_ptr<scalar_t>(), centers.data_ptr<scalar_t>(),
                knn_idx.data_ptr<int64_t>(), grad_scores.data_ptr<scalar_t>());
      });

  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/ball_query_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "ball_query_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

void BallQueryForwardMUSAKernelLauncher(int b, int n, int m, float min_radius,
                                        float max_radius, int nsample,
                                        const Tensor new_xyz, const Tensor xyz,
                                        Tensor idx) {
  // new_xyz: (B, M, 3)
  // xyz: (B, N, 3)
  // output:
  //      idx: (B, M, nsample)

  c10::musa::MUSAGuard device_guard(new_xyz.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(m, THREADS_PER_BLOCK), b);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES(
      new_xyz.scalar_type(), "ball_query_forward_musa_kernel", [&] {
        ball_query_forward_musa_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                b, n, m, min_radius, max_radius, nsample,
                new_xyz.data_ptr<scalar_t>(), xyz.data_ptr<scalar_t>(),
                idx.data_ptr<int>());
      });

  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/bbox_overlaps_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "bbox_overlaps_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"


// Explicit specialization of bbox_overlaps_musa_kernel for at::Half.
// It does no work of its own: the at::Half pointers are reinterpreted as
// __half pointers and everything is forwarded to
// bbox_overlaps_musa_kernel_half, with the remaining arguments unchanged.
template <>
__global__ void bbox_overlaps_musa_kernel<at::Half>(
    const at::Half* bbox1, const at::Half* bbox2, at::Half* ious,
    const int num_bbox1, const int num_bbox2, const int mode,
    const bool aligned, const int offset) {
  bbox_overlaps_musa_kernel_half(reinterpret_cast<const __half*>(bbox1),
                                 reinterpret_cast<const __half*>(bbox2),
                                 reinterpret_cast<__half*>(ious), num_bbox1,
                                 num_bbox2, mode, aligned, offset);
}


void BBoxOverlapsMUSAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
                                    Tensor ious, const int mode,
                                    const bool aligned, const int offset) {
  int output_size = ious.numel();
  int num_bbox1 = bboxes1.size(0);
  int num_bbox2 = bboxes2.size(0);

  c10::musa::MUSAGuard device_guard(bboxes1.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      bboxes1.scalar_type(), "bbox_overlaps_musa_kernel", ([&] {
        bbox_overlaps_musa_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                bboxes1.data_ptr<scalar_t>(), bboxes2.data_ptr<scalar_t>(),
                ious.data_ptr<scalar_t>(), num_bbox1, num_bbox2, mode, aligned,
                offset);
      }));
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/bezier_align_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "bezier_align_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

// Forward launcher for BezierAlign on MUSA.
// channels/height/width are taken from input sizes 1..3 and the launch is
// sized from output.numel(). spatial_scale is converted to the dispatched
// scalar type before being handed to the kernel; the other scalar arguments
// are forwarded unchanged. Runs on the current MUSA stream of input's device.
void BezierAlignForwardMUSAKernelLauncher(Tensor input, Tensor rois,
                                          Tensor output, int aligned_height,
                                          int aligned_width,
                                          float spatial_scale,
                                          int sampling_ratio, bool aligned) {
  const int total = output.numel();
  const int num_channels = input.size(1);
  const int feat_h = input.size(2);
  const int feat_w = input.size(3);

  c10::musa::MUSAGuard device_guard(input.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "bezier_align_forward_musa_kernel", [&] {
        scalar_t *in_ptr = input.data_ptr<scalar_t>();
        scalar_t *rois_ptr = rois.data_ptr<scalar_t>();
        scalar_t *out_ptr = output.data_ptr<scalar_t>();
        const scalar_t scale = static_cast<scalar_t>(spatial_scale);
        bezier_align_forward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(total), THREADS_PER_BLOCK, 0, stream>>>(
                total, in_ptr, rois_ptr, out_ptr, aligned_height,
                aligned_width, scale, sampling_ratio, aligned, num_channels,
                feat_h, feat_w);
      });

  AT_MUSA_CHECK(musaGetLastError());
}

// Backward launcher for BezierAlign on MUSA.
// channels/height/width are taken from grad_input sizes 1..3 and the launch is
// sized from grad_output.numel(). spatial_scale is converted to the dispatched
// scalar type before being handed to the kernel. Runs on the current MUSA
// stream of grad_output's device.
void BezierAlignBackwardMUSAKernelLauncher(
    Tensor grad_output, Tensor rois, Tensor grad_input, int aligned_height,
    int aligned_width, float spatial_scale, int sampling_ratio, bool aligned) {
  const int total = grad_output.numel();
  const int num_channels = grad_input.size(1);
  const int feat_h = grad_input.size(2);
  const int feat_w = grad_input.size(3);

  c10::musa::MUSAGuard device_guard(grad_output.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "bezier_align_backward_musa_kernel", [&] {
        scalar_t *grad_out_ptr = grad_output.data_ptr<scalar_t>();
        scalar_t *rois_ptr = rois.data_ptr<scalar_t>();
        scalar_t *grad_in_ptr = grad_input.data_ptr<scalar_t>();
        const scalar_t scale = static_cast<scalar_t>(spatial_scale);
        bezier_align_backward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(total), THREADS_PER_BLOCK, 0, stream>>>(
                total, grad_out_ptr, rois_ptr, grad_in_ptr, aligned_height,
                aligned_width, scale, sampling_ratio, aligned, num_channels,
                feat_h, feat_w);
      });

  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/bias_act_musa.mu
================================================
// Modified from
// https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/bias_act.cpp

// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

#include <c10/util/Half.h>
#include <musa_runtime.h>
#include <torch/types.h>

#include "pytorch_musa_helper.hpp"

// Argument bundle passed by value to bias_act_kernel.
// The pointer members are untyped; the kernel casts them to its template
// element type T. Optional inputs are NULL when the corresponding tensor is
// empty (see bias_act_op, which fills this struct).
struct bias_act_kernel_params {
  const void *x;     // [sizeX]
  const void *b;     // [sizeB] or NULL
  const void *xref;  // [sizeX] or NULL
  const void *yref;  // [sizeX] or NULL
  const void *dy;    // [sizeX] or NULL
  void *y;           // [sizeX]

  int grad;     // selects which formula the kernel evaluates (read as G)
  int act;      // activation id, 1..9; picks the kernel instantiation
  float alpha;  // slope used by the lrelu branch
  float gain;   // multiplier applied to the result; also divides yref
  float clamp;  // clamp bound; negative disables clamping

  int sizeX;  // number of elements in x
  int sizeB;  // number of elements in b
  int stepB;  // divisor turning an element index into a bias index
  int loopX;  // number of elements processed by each thread
};

// MUSA kernel selection.

// Forward declaration; the definition follows the kernel further down in this
// file. Returns the kernel instantiation matching p.act as an untyped
// function pointer, or NULL when p.act is not a known activation id.
template <class T>
void *choose_bias_act_kernel(const bias_act_kernel_params &p);
//------------------------------------------------------------------------
// Helpers.

// Maps a storage element type T to the type used for arithmetic inside the
// kernel (InternalType<T>::scalar_t). double and float compute in their own
// type; c10::Half computes in float. The primary template is left undefined,
// so any other T fails to compile.
template <class T>
struct InternalType;
template <>
struct InternalType<double> {
  typedef double scalar_t;
};
template <>
struct InternalType<float> {
  typedef float scalar_t;
};
template <>
struct InternalType<c10::Half> {
  typedef float scalar_t;
};

//------------------------------------------------------------------------
// MUSA kernel.

// Fused bias + activation kernel.
//
// Template parameters:
//   T  storage element type of all buffers in `p`.
//   A  activation id (1 linear, 2 relu, 3 lrelu, 4 tanh, 5 sigmoid, 6 elu,
//      7 selu, 8 softplus, 9 swish), resolved at compile time.
//
// p.grad (read as G) selects the formula: G == 0 evaluates the activation on
// x + b; G == 1 and G == 2 evaluate expressions in terms of yy = yref / gain
// (or xref for swish) scaled by x -- presumably the first- and second-order
// gradients; confirm against the caller. The result is multiplied by
// gain * dy, optionally clamped, and stored to p.y.
//
// Work distribution: each block covers p.loopX * blockDim.x consecutive
// elements; a thread starts at its threadIdx.x within that span and advances
// by blockDim.x for up to p.loopX iterations, stopping at p.sizeX.
template <class T, int A>
__global__ void bias_act_kernel(bias_act_kernel_params p) {
  // Arithmetic type: float for Half/float storage, double for double.
  typedef typename InternalType<T>::scalar_t scalar_t;
  int G = p.grad;
  scalar_t alpha = (scalar_t)p.alpha;
  scalar_t gain = (scalar_t)p.gain;
  scalar_t clamp = (scalar_t)p.clamp;
  scalar_t one = (scalar_t)1;
  scalar_t two = (scalar_t)2;
  // Thresholds beyond which the exp-based formulas are replaced by their
  // limiting values.
  scalar_t expRange = (scalar_t)80;
  scalar_t halfExpRange = (scalar_t)40;
  scalar_t seluScale = (scalar_t)1.0507009873554804934193349852946;
  scalar_t seluAlpha = (scalar_t)1.6732632423543772848170429916717;

  // Loop over elements.
  int xi = blockIdx.x * p.loopX * blockDim.x + threadIdx.x;
  for (int loopIdx = 0; loopIdx < p.loopX && xi < p.sizeX;
       loopIdx++, xi += blockDim.x) {
    // Load.
    // Missing optional inputs default to 0, except dy which defaults to 1.
    scalar_t x = (scalar_t)((const T *)p.x)[xi];
    // Bias element index: (xi / stepB) % sizeB.
    scalar_t b =
        (p.b) ? (scalar_t)((const T *)p.b)[(xi / p.stepB) % p.sizeB] : 0;
    scalar_t xref = (p.xref) ? (scalar_t)((const T *)p.xref)[xi] : 0;
    scalar_t yref = (p.yref) ? (scalar_t)((const T *)p.yref)[xi] : 0;
    scalar_t dy = (p.dy) ? (scalar_t)((const T *)p.dy)[xi] : one;
    // yref with the gain divided back out; 0 when gain is 0.
    scalar_t yy = (gain != 0) ? yref / gain : 0;
    scalar_t y = 0;

    // Apply bias.
    // The bias goes onto x when G == 0 and onto xref otherwise.
    ((G == 0) ? x : xref) += b;

    // linear
    if (A == 1) {
      if (G == 0) y = x;
      if (G == 1) y = x;
    }

    // relu
    if (A == 2) {
      if (G == 0) y = (x > 0) ? x : 0;
      if (G == 1) y = (yy > 0) ? x : 0;
    }

    // lrelu
    if (A == 3) {
      if (G == 0) y = (x > 0) ? x : x * alpha;
      if (G == 1) y = (yy > 0) ? x : x * alpha;
    }

    // tanh
    if (A == 4) {
      if (G == 0) {
        scalar_t c = exp(x);
        scalar_t d = one / c;
        // Saturate to -1 / +1 outside [-expRange, expRange].
        y = (x < -expRange) ? -one : (x > expRange) ? one : (c - d) / (c + d);
      }
      if (G == 1) y = x * (one - yy * yy);
      if (G == 2) y = x * (one - yy * yy) * (-two * yy);
    }

    // sigmoid
    if (A == 5) {
      if (G == 0) y = (x < -expRange) ? 0 : one / (exp(-x) + one);
      if (G == 1) y = x * yy * (one - yy);
      if (G == 2) y = x * yy * (one - yy) * (one - two * yy);
    }

    // elu
    if (A == 6) {
      if (G == 0) y = (x >= 0) ? x : exp(x) - one;
      if (G == 1) y = (yy >= 0) ? x : x * (yy + one);
      if (G == 2) y = (yy >= 0) ? 0 : x * (yy + one);
    }

    // selu
    if (A == 7) {
      if (G == 0)
        y = (x >= 0) ? seluScale * x : (seluScale * seluAlpha) * (exp(x) - one);
      if (G == 1)
        y = (yy >= 0) ? x * seluScale : x * (yy + seluScale * seluAlpha);
      if (G == 2) y = (yy >= 0) ? 0 : x * (yy + seluScale * seluAlpha);
    }

    // softplus
    if (A == 8) {
      if (G == 0) y = (x > expRange) ? x : log(exp(x) + one);
      if (G == 1) y = x * (one - exp(-yy));
      if (G == 2) {
        scalar_t c = exp(-yy);
        y = x * c * (one - c);
      }
    }

    // swish
    if (A == 9) {
      if (G == 0)
        y = (x < -expRange) ? 0 : x / (exp(-x) + one);
      else {
        // Unlike the other activations, the G != 0 formulas are written in
        // terms of xref (with the bias already added), not yy.
        scalar_t c = exp(xref);
        scalar_t d = c + one;
        if (G == 1)
          y = (xref > halfExpRange) ? x : x * c * (xref + d) / (d * d);
        else
          y = (xref > halfExpRange)
                  ? 0
                  : x * c * (xref * (two - d) + two * d) / (d * d * d);
        // Recompute yref from xref so the clamp test below has a value to
        // compare against.
        yref = (xref < -expRange) ? 0 : xref / (exp(-xref) + one) * gain;
      }
    }

    // Apply gain.
    y *= gain * dy;

    // Clamp.
    // A negative clamp disables this step. For G == 0 the result is limited
    // to [-clamp, clamp]; otherwise the result is zeroed wherever yref lies
    // outside the open interval (-clamp, clamp). The comparisons are combined
    // with bitwise '&' on their boolean results.
    if (clamp >= 0) {
      if (G == 0)
        y = (y > -clamp & y < clamp) ? y : (y >= 0) ? clamp : -clamp;
      else
        y = (yref > -clamp & yref < clamp) ? y : 0;
    }

    // Store.
    ((T *)p.y)[xi] = (T)y;
  }
}

//------------------------------------------------------------------------
// MUSA kernel selection.

template <class T>
void *choose_bias_act_kernel(const bias_act_kernel_params &p) {
  if (p.act == 1) return (void *)bias_act_kernel<T, 1>;
  if (p.act == 2) return (void *)bias_act_kernel<T, 2>;
  if (p.act == 3) return (void *)bias_act_kernel<T, 3>;
  if (p.act == 4) return (void *)bias_act_kernel<T, 4>;
  if (p.act == 5) return (void *)bias_act_kernel<T, 5>;
  if (p.act == 6) return (void *)bias_act_kernel<T, 6>;
  if (p.act == 7) return (void *)bias_act_kernel<T, 7>;
  if (p.act == 8) return (void *)bias_act_kernel<T, 8>;
  if (p.act == 9) return (void *)bias_act_kernel<T, 9>;
  return NULL;
}

//------------------------------------------------------------------------

// Reports whether x and y have the same rank, the same size in every
// dimension, and the same stride in every dimension of size 2 or more.
// Strides of dimensions with fewer than two elements are ignored.
static bool has_same_layout(torch::Tensor x, torch::Tensor y) {
  const int64_t rank = x.dim();
  if (rank != y.dim()) return false;

  for (int64_t d = 0; d < rank; ++d) {
    const int64_t extent = x.size(d);
    if (extent != y.size(d)) return false;
    // Only compare strides where the dimension actually steps through memory.
    const bool stride_matters = extent >= 2;
    if (stride_matters && x.stride(d) != y.stride(d)) return false;
  }
  return true;
}

//------------------------------------------------------------------------
// Host entry point for the fused bias + activation op on MUSA.
//
// Arguments:
//   x     input tensor; must be on the MUSA (privateuseone) device and be
//         non-overlapping and dense.
//   b     rank-1 bias, or an empty tensor for "no bias"; when non-empty it
//         must have x.size(dim) elements and match x in dtype and device.
//   xref, yref, dy
//         optional reference tensors; each is either empty or must match x in
//         shape, dtype, device and layout.
//   grad  formula selector forwarded to the kernel; must be non-negative.
//   dim   dimension of x that the bias is applied along.
//   act   activation id, 1..9 (see choose_bias_act_kernel).
//   alpha, gain, clamp
//         scalar parameters forwarded to the kernel.
//
// Returns a new tensor created with empty_like(x) holding the result.
// Raises (via TORCH_CHECK) when any validation fails or when no kernel exists
// for `act`.
torch::Tensor bias_act_op(const torch::Tensor &x, const torch::Tensor &b,
                          const torch::Tensor &xref, const torch::Tensor &yref,
                          const torch::Tensor &dy, int grad, int dim, int act,
                          float alpha, float gain, float clamp) {
  // Validate arguments.
  TORCH_CHECK(x.is_privateuseone(), "x must reside on MUSA device");
  TORCH_CHECK(
      b.numel() == 0 || (b.dtype() == x.dtype() && b.device() == x.device()),
      "b must have the same dtype and device as x");
  TORCH_CHECK(xref.numel() == 0 ||
                  (xref.sizes() == x.sizes() && xref.dtype() == x.dtype() &&
                   xref.device() == x.device()),
              "xref must have the same shape, dtype, and device as x");
  TORCH_CHECK(yref.numel() == 0 ||
                  (yref.sizes() == x.sizes() && yref.dtype() == x.dtype() &&
                   yref.device() == x.device()),
              "yref must have the same shape, dtype, and device as x");
  // The condition checks shape as well, so the message says so, matching the
  // xref / yref checks above.
  TORCH_CHECK(
      dy.numel() == 0 || (dy.sizes() == x.sizes() && dy.dtype() == x.dtype() &&
                          dy.device() == x.device()),
      "dy must have the same shape, dtype, and device as x");
  // The kernel indexes with int, so the element count must fit in one.
  TORCH_CHECK(x.numel() <= INT_MAX, "x is too large");
  TORCH_CHECK(b.dim() == 1, "b must have rank 1");
  TORCH_CHECK(b.numel() == 0 || (dim >= 0 && dim < x.dim()),
              "dim is out of bounds");
  TORCH_CHECK(b.numel() == 0 || b.numel() == x.size(dim),
              "b has wrong number of elements");
  TORCH_CHECK(grad >= 0, "grad must be non-negative");

  // Validate layout.
  TORCH_CHECK(x.is_non_overlapping_and_dense(),
              "x must be non-overlapping and dense");
  TORCH_CHECK(b.is_contiguous(), "b must be contiguous");
  TORCH_CHECK(xref.numel() == 0 || has_same_layout(xref, x),
              "xref must have the same layout as x");
  TORCH_CHECK(yref.numel() == 0 || has_same_layout(yref, x),
              "yref must have the same layout as x");
  TORCH_CHECK(dy.numel() == 0 || has_same_layout(dy, x),
              "dy must have the same layout as x");

  // Create output tensor.
  const at::musa::OptionalMUSAGuard device_guard(device_of(x));
  torch::Tensor y = torch::empty_like(x);
  TORCH_CHECK(has_same_layout(y, x), "y must have the same layout as x");

  // Initialize MUSA kernel parameters.
  // Empty optional tensors are passed as NULL so the kernel can skip them.
  bias_act_kernel_params p;
  p.x = x.data_ptr();
  p.b = (b.numel()) ? b.data_ptr() : NULL;
  p.xref = (xref.numel()) ? xref.data_ptr() : NULL;
  p.yref = (yref.numel()) ? yref.data_ptr() : NULL;
  p.dy = (dy.numel()) ? dy.data_ptr() : NULL;
  p.y = y.data_ptr();
  p.grad = grad;
  p.act = act;
  p.alpha = alpha;
  p.gain = gain;
  p.clamp = clamp;
  p.sizeX = (int)x.numel();
  p.sizeB = (int)b.numel();
  p.stepB = (b.numel()) ? (int)x.stride(dim) : 1;

  // Choose MUSA kernel.
  // Start from nullptr so the check below never reads an indeterminate
  // pointer, and label the dispatch with this op's own name so that an
  // unsupported-dtype error points at bias_act rather than upfirdn2d.
  void *kernel = nullptr;
  AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "bias_act_musa", [&] {
    kernel = choose_bias_act_kernel<scalar_t>(p);
  });
  TORCH_CHECK(kernel, "no MUSA kernel found for the specified activation func");

  // Launch MUSA kernel.
  // Each thread handles loopX elements, so one block covers
  // loopX * blockSize elements.
  p.loopX = 4;
  int blockSize = 4 * 32;
  int gridSize = (p.sizeX - 1) / (p.loopX * blockSize) + 1;
  void *args[] = {&p};
#ifdef MMCV_WITH_HIP
  AT_MUSA_CHECK(hipLaunchKernel(kernel, gridSize, blockSize, args, 0,
                                c10::musa::getCurrentMUSAStream()));
#else
  AT_MUSA_CHECK(musaLaunchKernel(kernel, gridSize, blockSize, args, 0,
                                 c10::musa::getCurrentMUSAStream()));
#endif

  return y;
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/border_align_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "border_align_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

void BorderAlignForwardMUSAKernelLauncher(const Tensor &input,
                                          const Tensor &boxes, Tensor output,
                                          Tensor argmax_idx,
                                          const int pool_size) {
  // shape assertion
  AT_ASSERTM(input.ndimension() == 4,
             "non-empty 4D(batch mode) tensor expected for input feature");
  AT_ASSERTM(boxes.ndimension() == 3,
             "boxes must be 3D tensor with size of [B, H*W, 4]");

  int batch_size = input.size(0);
  int feat_channels = input.size(1);
  int channels = feat_channels / 4;
  int height = input.size(2);
  int width = input.size(3);
  // shape [N, box_size, 4] for boxes. (x1, y1, x2, y2) format
  int box_size = boxes.size(1);
  // shape [N, channels, box_size, 4] for output
  int nthreads = batch_size * channels * box_size;

  c10::musa::MUSAGuard device_guard(input.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  dim3 block(128, 4);
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "border_align_forward_musa_kernel", [&] {
        border_align_forward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(nthreads), block, 0, stream>>>(
                nthreads, input.data_ptr<scalar_t>(),
                boxes.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
                argmax_idx.data_ptr<int>(), channels, box_size, height, width,
                pool_size);
      });

  AT_MUSA_CHECK(musaGetLastError());
}

void BorderAlignBackwardMUSAKernelLauncher(const Tensor &grad_output,
                                           const Tensor &boxes,
                                           const Tensor &argmax_idx,
                                           Tensor grad_input,
                                           const int pool_size) {
  int batch_size = grad_input.size(0);
  int feat_channels = grad_input.size(1);
  int channels = feat_channels / 4;
  int height = grad_input.size(2);
  int width = grad_input.size(3);
  int box_size = boxes.size(1);
  int nthreads = batch_size * channels * box_size;

  c10::musa::MUSAGuard device_guard(grad_output.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  dim3 block(128, 4);
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "border_align_backward_musa_kernel", [&] {
        border_align_backward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(nthreads), block, 0, stream>>>(
                nthreads, grad_output.data_ptr<scalar_t>(),
                boxes.data_ptr<scalar_t>(), argmax_idx.data_ptr<int>(),
                grad_input.data_ptr<scalar_t>(), channels, box_size, height,
                width, pool_size);
      });

  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/box_iou_quadri_musa.mu
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "box_iou_quadri_musa.muh"
#include "pytorch_musa_helper.hpp"

void box_iou_quadri_musa(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                         const int mode_flag, const bool aligned) {
  using scalar_t = float;
  AT_ASSERTM(boxes1.is_privateuseone(), "boxes1 must be a MUSA tensor");
  AT_ASSERTM(boxes2.is_privateuseone(), "boxes2 must be a MUSA tensor");

  int output_size = ious.numel();
  int num_boxes1 = boxes1.size(0);
  int num_boxes2 = boxes2.size(0);

  c10::musa::MUSAGuard device_guard(boxes1.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  box_iou_quadri_musa_kernel<scalar_t>
      <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
          num_boxes1, num_boxes2, boxes1.data_ptr<scalar_t>(),
          boxes2.data_ptr<scalar_t>(), (scalar_t*)ious.data_ptr<scalar_t>(),
          mode_flag, aligned);
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/box_iou_rotated_musa.mu
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
#include "box_iou_rotated_musa.muh"
#include "pytorch_musa_helper.hpp"

void box_iou_rotated_musa(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                          const int mode_flag, const bool aligned) {
  using scalar_t = float;
  AT_ASSERTM(boxes1.is_privateuseone(), "boxes1 must be a MUSA tensor");
  AT_ASSERTM(boxes2.is_privateuseone(), "boxes2 must be a MUSA tensor");

  int output_size = ious.numel();
  int num_boxes1 = boxes1.size(0);
  int num_boxes2 = boxes2.size(0);

  c10::musa::MUSAGuard device_guard(boxes1.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  box_iou_rotated_musa_kernel<scalar_t>
      <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
          num_boxes1, num_boxes2, boxes1.data_ptr<scalar_t>(),
          boxes2.data_ptr<scalar_t>(), (scalar_t*)ious.data_ptr<scalar_t>(),
          mode_flag, aligned);
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/carafe_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "carafe_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

#include <iostream>

#if MUSA_ARCH > 21
// Launches the CARAFE forward pass on the current MUSA stream of the device
// that holds `features`.
//
// Four kernels are issued to the same stream, in this order:
//   1. BatchTranspose2DMUSAKernel : features -> rfeatures
//   2. BatchTranspose2DMUSAKernel : masks    -> rmasks
//   3. CARAFEForward              : (rfeatures, rmasks) -> routput
//   4. BatchTranspose2DMUSAKernel : routput  -> output
// rfeatures, routput and rmasks are resized in place here and used as
// intermediate buffers; `output` receives the final result.
// kernel_size, group_size and scale_factor are forwarded to CARAFEForward.
// NOTE(review): the dispatch labels suggest stages 1-2 convert NCHW to NHWC
// and stage 4 converts back -- confirm against BatchTranspose2DMUSAKernel.
void CARAFEForwardMUSAKernelLauncher(const Tensor features, const Tensor masks,
                                     Tensor rfeatures, Tensor routput,
                                     Tensor rmasks, Tensor output,
                                     const int kernel_size,
                                     const int group_size,
                                     const int scale_factor) {
  // Batch, channel and output spatial sizes are taken from `output`.
  const int batch_size = output.size(0);
  const int channels = output.size(1);
  const int output_height = output.size(2);
  const int output_width = output.size(3);

  // Input spatial sizes are taken from `features`.
  const int input_height = features.size(2);
  const int input_width = features.size(3);

  const int mask_channels = masks.size(1);

  // Intermediate buffers are resized with the channel count as last dim.
  rfeatures.resize_({batch_size, input_height, input_width, channels});
  routput.resize_({batch_size, output_height, output_width, channels});
  rmasks.resize_({batch_size, output_height, output_width, mask_channels});

  // one warp per pixel
  c10::musa::MUSAGuard device_guard(features.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  // Stage 1: transpose features into rfeatures; grid is one block per
  // (batch, tile-row, tile-column) with tiles of kTileDim.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      features.scalar_type(), "NCHW2NHWC_Feature", ([&] {
        const scalar_t *bottom_data = features.data_ptr<scalar_t>();
        scalar_t *top_data = rfeatures.data_ptr<scalar_t>();
        const int dh = divideUP(channels, kTileDim);
        const int dw = divideUP(input_height * input_width, kTileDim);
        BatchTranspose2DMUSAKernel<scalar_t>
            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
                batch_size, channels, input_height * input_width, dh, dw,
                bottom_data, top_data);
      }));
  // Stage 2: transpose masks into rmasks. The dispatch type is taken from
  // features, so masks is accessed with the features scalar type.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      features.scalar_type(), "NCHW2NHWC_Masks", ([&] {
        const scalar_t *bottom_data = masks.data_ptr<scalar_t>();
        scalar_t *top_data = rmasks.data_ptr<scalar_t>();
        const int dh = divideUP(mask_channels, kTileDim);
        const int dw = divideUP(output_height * output_width, kTileDim);
        BatchTranspose2DMUSAKernel<scalar_t>
            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
                batch_size, mask_channels, output_height * output_width, dh, dw,
                bottom_data, top_data);
      }));
  // Stage 3: the CARAFE kernel itself, reading the two intermediate buffers
  // and writing routput. THREADS_PER_PIXEL threads are budgeted per output
  // pixel when sizing the launch.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      features.scalar_type(), "CARAFELaucherForward", ([&] {
        const int num_kernels =
            batch_size * output_height * output_width * THREADS_PER_PIXEL;
        const scalar_t *bottom_data = rfeatures.data_ptr<scalar_t>();
        const scalar_t *bottom_masks = rmasks.data_ptr<scalar_t>();
        scalar_t *top_data = routput.data_ptr<scalar_t>();
        CARAFEForward<scalar_t><<<divideUP(num_kernels, THREADS_PER_BLOCK),
                                  THREADS_PER_BLOCK, 0, stream>>>(
            num_kernels, bottom_data, bottom_masks, kernel_size, group_size,
            scale_factor, channels, input_height, input_width, output_height,
            output_width, mask_channels, top_data);
      }));
  // Stage 4: transpose routput into the caller-visible `output`.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      features.scalar_type(), "NHWC2NCHW", ([&] {
        const scalar_t *bottom_data = routput.data_ptr<scalar_t>();
        scalar_t *top_data = output.data_ptr<scalar_t>();
        const int dh = divideUP(output_height * output_width, kTileDim);
        const int dw = divideUP(channels, kTileDim);
        BatchTranspose2DMUSAKernel<scalar_t>
            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
                batch_size, output_height * output_width, channels, dh, dw,
                bottom_data, top_data);
      }));
  // A single error check covers all four launches above.
  AT_MUSA_CHECK(musaGetLastError());
}

// Launches the CARAFE backward pass on the current MUSA stream of the device
// that holds `top_grad`.
//
// Six kernels are issued to the same stream, in this order:
//   1. BatchTranspose2DMUSAKernel : top_grad -> rtop_grad
//   2. CARAFEBackward_Feature     : (rtop_grad, masks) -> rbottom_grad_hs
//   3. FeatureSum                 : rbottom_grad_hs -> rbottom_grad
//   4. BatchTranspose2DMUSAKernel : rbottom_grad -> bottom_grad
//   5. CARAFEBackward_Mask        : (rtop_grad, rfeatures) -> rmask_grad
//   6. BatchTranspose2DMUSAKernel : rmask_grad -> mask_grad
// rtop_grad, rbottom_grad, rbottom_grad_hs and rmask_grad are resized in
// place here and used as intermediate buffers; bottom_grad and mask_grad
// receive the final gradients.
void CARAFEBackwardMUSAKernelLauncher(
    const Tensor top_grad, const Tensor rfeatures, const Tensor masks,
    Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad,
    Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad,
    const int kernel_size, const int group_size, const int scale_factor) {
  // Batch, channel and output spatial sizes are taken from `top_grad`.
  const int batch_size = top_grad.size(0);
  const int channels = top_grad.size(1);
  const int output_height = top_grad.size(2);
  const int output_width = top_grad.size(3);

  // Input spatial sizes are taken from `bottom_grad`.
  const int input_height = bottom_grad.size(2);
  const int input_width = bottom_grad.size(3);

  const int mask_channels = masks.size(1);

  // Intermediate buffers are resized with the channel count as last dim.
  // rbottom_grad_hs uses the output spatial size, rbottom_grad the input one.
  rtop_grad.resize_({batch_size, output_height, output_width, channels});
  rbottom_grad.resize_({batch_size, input_height, input_width, channels});
  rbottom_grad_hs.resize_({batch_size, output_height, output_width, channels});
  rmask_grad.resize_({batch_size, output_height, output_width, mask_channels});

  c10::musa::MUSAGuard device_guard(top_grad.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  // Stage 1: transpose top_grad into rtop_grad.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "NCHW2NHWC_Top_Grad", ([&] {
        const scalar_t *bottom_data = top_grad.data_ptr<scalar_t>();
        scalar_t *top_data = rtop_grad.data_ptr<scalar_t>();
        const int dh = divideUP(channels, kTileDim);
        const int dw = divideUP(output_height * output_width, kTileDim);
        BatchTranspose2DMUSAKernel<scalar_t>
            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
                batch_size, channels, output_height * output_width, dh, dw,
                bottom_data, top_data);
      }));

  // Stage 2: feature gradient at output resolution, written to
  // rbottom_grad_hs.
  // NOTE(review): `masks` is passed to the kernel as-is here, whereas the
  // forward launcher feeds the transposed rmasks buffer to CARAFEForward --
  // confirm the layout CARAFEBackward_Feature expects and what the caller
  // passes as `masks`.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "CARAFELaucherBackward_Feature", ([&] {
        const int num_kernels =
            batch_size * output_height * output_width * THREADS_PER_PIXEL;
        const scalar_t *top_diff = rtop_grad.data_ptr<scalar_t>();
        const scalar_t *bottom_masks = masks.data_ptr<scalar_t>();
        scalar_t *bottom_diff = rbottom_grad_hs.data_ptr<scalar_t>();

        CARAFEBackward_Feature<scalar_t>
            <<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
               stream>>>(num_kernels, top_diff, bottom_masks, kernel_size,
                         group_size, scale_factor, channels, input_height,
                         input_width, output_height, output_width,
                         mask_channels, bottom_diff);
      }));
  // Stage 3: FeatureSum turns rbottom_grad_hs into rbottom_grad; the launch
  // is sized by the input (not output) spatial dimensions.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "FeatureSum", ([&] {
        const int num_kernels =
            batch_size * input_height * input_width * THREADS_PER_PIXEL;
        const scalar_t *bottom_diff_hs = rbottom_grad_hs.data_ptr<scalar_t>();
        scalar_t *bottom_diff = rbottom_grad.data_ptr<scalar_t>();

        FeatureSum<scalar_t>
            <<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
               stream>>>(num_kernels, bottom_diff_hs, scale_factor, channels,
                         input_height, input_width, bottom_diff);
      }));
  // Stage 4: transpose rbottom_grad into the caller-visible bottom_grad.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "NHWC2NCHW_Bottom_Grad", ([&] {
        const scalar_t *bottom_data = rbottom_grad.data_ptr<scalar_t>();
        scalar_t *top_data = bottom_grad.data_ptr<scalar_t>();
        const int dh = divideUP(input_height * input_width, kTileDim);
        const int dw = divideUP(channels, kTileDim);
        BatchTranspose2DMUSAKernel<scalar_t>
            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
                batch_size, input_height * input_width, channels, dh, dw,
                bottom_data, top_data);
      }));

  // Stage 5: mask gradient, written to rmask_grad. Unlike the feature
  // stages, the launch budgets WARP_SIZE threads per (pixel, mask channel).
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "CARAFELaucherBackward_Mask", ([&] {
        const int num_kernels = batch_size * output_height * output_width *
                                mask_channels * WARP_SIZE;
        const scalar_t *top_diff = rtop_grad.data_ptr<scalar_t>();
        const scalar_t *bottom_data = rfeatures.data_ptr<scalar_t>();
        scalar_t *mask_diff = rmask_grad.data_ptr<scalar_t>();

        CARAFEBackward_Mask<scalar_t>
            <<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
               stream>>>(num_kernels, top_diff, bottom_data, kernel_size,
                         group_size, scale_factor, channels, input_height,
                         input_width, output_height, output_width,
                         mask_channels, mask_diff);
      }));
  // Stage 6: transpose rmask_grad into the caller-visible mask_grad.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "NHWC2NCHW_Mask_Grad", ([&] {
        const scalar_t *bottom_data = rmask_grad.data_ptr<scalar_t>();
        scalar_t *top_data = mask_grad.data_ptr<scalar_t>();
        const int dh = divideUP(output_height * output_width, kTileDim);
        const int dw = divideUP(mask_channels, kTileDim);
        BatchTranspose2DMUSAKernel<scalar_t>
            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
                batch_size, output_height * output_width, mask_channels, dh, dw,
                bottom_data, top_data);
      }));

  // A single error check covers all six launches above.
  AT_MUSA_CHECK(musaGetLastError());
}
#endif //MUSA_ARCH


================================================
FILE: mmcv/ops/csrc/pytorch/musa/carafe_naive_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "carafe_naive_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

// Launches carafe_naive_forward_musa_kernel on the current MUSA stream of
// the device that holds `features`, writing into `output`.
//
// The grid is sized by output.numel(); channel/height/width passed to the
// kernel are dimensions 1..3 of `output`. Only float and double are
// dispatched. kernel_size, group_size and scale_factor are forwarded as-is.
void CARAFENAIVEForwardMUSAKernelLauncher(const Tensor features,
                                          const Tensor masks, Tensor output,
                                          const int kernel_size,
                                          const int group_size,
                                          const int scale_factor) {
  const int total_elems = output.numel();
  const int out_channels = output.size(1);
  const int out_height = output.size(2);
  const int out_width = output.size(3);

  c10::musa::MUSAGuard device_guard(features.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES(
      features.scalar_type(), "CARAFENAIVEForward", ([&] {
        auto *features_ptr = features.data_ptr<scalar_t>();
        auto *masks_ptr = masks.data_ptr<scalar_t>();
        auto *output_ptr = output.data_ptr<scalar_t>();
        carafe_naive_forward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(total_elems), THREADS_PER_BLOCK, 0, stream>>>(
                total_elems, features_ptr, masks_ptr, output_ptr, kernel_size,
                group_size, scale_factor, out_channels, out_height, out_width);
      }));

  AT_MUSA_CHECK(musaGetLastError());
}

// Launches carafe_naive_backward_musa_kernel on the current MUSA stream of
// the device that holds `top_grad`, writing into bottom_grad and mask_grad.
//
// The grid is sized by top_grad.numel(); channel/height/width passed to the
// kernel are dimensions 1..3 of `top_grad`. Only float and double are
// dispatched. kernel_size, group_size and scale_factor are forwarded as-is.
void CARAFENAIVEBackwardMUSAKernelLauncher(
    const Tensor top_grad, const Tensor features, const Tensor masks,
    Tensor bottom_grad, Tensor mask_grad, const int kernel_size,
    const int group_size, const int scale_factor) {
  const int total_elems = top_grad.numel();
  const int grad_channels = top_grad.size(1);
  const int grad_height = top_grad.size(2);
  const int grad_width = top_grad.size(3);

  c10::musa::MUSAGuard device_guard(top_grad.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES(
      top_grad.scalar_type(), "CARAFENAIVEBackward", ([&] {
        auto *top_grad_ptr = top_grad.data_ptr<scalar_t>();
        auto *features_ptr = features.data_ptr<scalar_t>();
        auto *masks_ptr = masks.data_ptr<scalar_t>();
        auto *bottom_grad_ptr = bottom_grad.data_ptr<scalar_t>();
        auto *mask_grad_ptr = mask_grad.data_ptr<scalar_t>();
        carafe_naive_backward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(total_elems), THREADS_PER_BLOCK, 0, stream>>>(
                total_elems, top_grad_ptr, features_ptr, masks_ptr,
                bottom_grad_ptr, mask_grad_ptr, kernel_size, group_size,
                scale_factor, grad_channels, grad_height, grad_width);
      }));

  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/chamfer_distance_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cpp
#include "chamfer_distance_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"
#if MUSA_ARCH > 21
void ChamferDistanceForwardMUSAKernelLauncher(
    const Tensor xyz1, const Tensor xyz2, const Tensor dist1,
    const Tensor dist2, const Tensor idx1, const Tensor idx2) {
  int batch_size = xyz1.size(0);
  int n = xyz1.size(1);
  int m = xyz2.size(1);

  c10::musa::MUSAGuard device_guard(xyz1.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      xyz1.scalar_type(), "chamfer_distance_forward_musa_kernel", [&] {
        chamfer_distance_forward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(batch_size * n), THREADS_PER_BLOCK, 0, stream>>>(
                batch_size, n, xyz1.data_ptr<scalar_t>(), m,
                xyz2.data_ptr<scalar_t>(), dist1.data_ptr<scalar_t>(),
                idx1.data_ptr<int>());
      });
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      xyz1.scalar_type(), "chamfer_distance_forward_musa_kernel", [&] {
        chamfer_distance_forward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(batch_size * m), THREADS_PER_BLOCK, 0, stream>>>(
                batch_size, m, xyz2.data_ptr<scalar_t>(), n,
                xyz1.data_ptr<scalar_t>(), dist2.data_ptr<scalar_t>(),
                idx2.data_ptr<int>());
      });
  AT_MUSA_CHECK(musaGetLastError());
}

void ChamferDistanceBackwardMUSAKernelLauncher(
    const Tensor xyz1, const Tensor xyz2, Tensor idx1, Tensor idx2,
    Tensor grad_dist1, Tensor grad_dist2, Tensor grad_xyz1, Tensor grad_xyz2) {
  int batch_size = xyz1.size(0);
  int n = xyz1.size(1);
  int m = xyz2.size(1);

  c10::musa::MUSAGuard device_guard(xyz1.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES(
      xyz1.scalar_type(), "chamfer_distance_backward_musa_kernel", [&] {
        chamfer_distance_backward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(batch_size * n), THREADS_PER_BLOCK / 2, 0, stream>>>(
                batch_size, m, xyz1.data_ptr<scalar_t>(), n,
                xyz2.data_ptr<scalar_t>(), grad_dist1.data_ptr<scalar_t>(),
                idx1.data_ptr<int>(), grad_xyz1.data_ptr<scalar_t>(),
                grad_xyz2.data_ptr<scalar_t>());
      });
  AT_DISPATCH_FLOATING_TYPES(
      xyz1.scalar_type(), "chamfer_distance_backward_musa_kernel", [&] {
        chamfer_distance_backward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(batch_size * m), THREADS_PER_BLOCK / 2, 0, stream>>>(
                batch_size, n, xyz2.data_ptr<scalar_t>(), m,
                xyz1.data_ptr<scalar_t>(), grad_dist2.data_ptr<scalar_t>(),
                idx2.data_ptr<int>(), grad_xyz2.data_ptr<scalar_t>(),
                grad_xyz1.data_ptr<scalar_t>());
      });
  AT_MUSA_CHECK(musaGetLastError());
}
#else
#warning "chamfer_distance is supported when MUSA_ARCH > 21"
#endif  //MUSA_ARCH


================================================
FILE: mmcv/ops/csrc/pytorch/musa/convex_iou.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/SDL-GuoZonghao/BeyondBoundingBox/blob/main/mmdet/ops/iou/src/convex_iou_kernel.cu
#include "convex_iou_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

// Launches convex_iou_musa_kernel on the current MUSA stream of the device
// that holds `pointsets`, writing into `ious`.
//
// The grid is sized by ious.numel() while each block uses half of
// THREADS_PER_BLOCK threads. Only float and double are dispatched.
void ConvexIoUMUSAKernelLauncher(const Tensor pointsets, const Tensor polygons,
                                 Tensor ious) {
  const int n_outputs = ious.numel();
  const int n_pointsets = pointsets.size(0);
  const int n_polygons = polygons.size(0);

  c10::musa::MUSAGuard device_guard(pointsets.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES(
      pointsets.scalar_type(), "convex_iou_musa_kernel", ([&] {
        auto *pointsets_ptr = pointsets.data_ptr<scalar_t>();
        auto *polygons_ptr = polygons.data_ptr<scalar_t>();
        auto *ious_ptr = ious.data_ptr<scalar_t>();
        convex_iou_musa_kernel<scalar_t>
            <<<GET_BLOCKS(n_outputs), THREADS_PER_BLOCK / 2, 0, stream>>>(
                n_pointsets, n_polygons, pointsets_ptr, polygons_ptr,
                ious_ptr);
      }));
  AT_MUSA_CHECK(musaGetLastError());
}

// Launches convex_giou_musa_kernel on the current MUSA stream of the device
// that holds `pointsets`, writing into `output`.
//
// The grid is sized by output.numel() while each block uses half of
// THREADS_PER_BLOCK threads. Only float and double are dispatched.
void ConvexGIoUMUSAKernelLauncher(const Tensor pointsets, const Tensor polygons,
                                  Tensor output) {
  const int n_outputs = output.numel();
  const int n_pointsets = pointsets.size(0);
  const int n_polygons = polygons.size(0);

  c10::musa::MUSAGuard device_guard(pointsets.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES(
      pointsets.scalar_type(), "convex_giou_musa_kernel", ([&] {
        auto *pointsets_ptr = pointsets.data_ptr<scalar_t>();
        auto *polygons_ptr = polygons.data_ptr<scalar_t>();
        auto *output_ptr = output.data_ptr<scalar_t>();
        convex_giou_musa_kernel<scalar_t>
            <<<GET_BLOCKS(n_outputs), THREADS_PER_BLOCK / 2, 0, stream>>>(
                n_pointsets, n_polygons, pointsets_ptr, polygons_ptr,
                output_ptr);
      }));
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/correlation_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/ClementPinard/Pytorch-Correlation-extension/blob/master/Correlation_Module/correlation_cuda_kernel.cu
// Original licence: Under MIT License

#include "correlation_musa.muh"
#include "pytorch_musa_helper.hpp"

// Launches correlation_forward_musa_kernel on the current MUSA stream of the
// device that holds `input1`, writing into `output`.
//
// input1 / input2 : 4-D tensors (accessed through 4-D packed accessors after
//                   a {0, 2, 3, 1} permutation made contiguous).
// output          : 5-D tensor (accessed through a 5-D packed accessor).
// kH, kW, patchH, patchW, padH, padW, dilationH, dilationW, dilation_patchH,
// dilation_patchW, dH, dW : forwarded to the kernel unchanged; kH/kW,
//                   padH/padW, dilationH/dilationW and dH/dW also determine
//                   the output height/width computed below.
//
// Raises (via AT_MUSA_CHECK) if the launch left a pending MUSA error.
void CorrelationForwardMUSAKernelLauncher(Tensor input1, Tensor input2,
                                          Tensor output, int kH, int kW,
                                          int patchH, int patchW, int padH,
                                          int padW, int dilationH,
                                          int dilationW, int dilation_patchH,
                                          int dilation_patchW, int dH, int dW) {
  const int batch_size = input1.size(0);
  const int iH = input1.size(2);
  const int iW = input1.size(3);
  // Extent covered by the kernel once dilation is applied.
  const int dilatedKH = (kH - 1) * dilationH + 1;
  const int dilatedKW = (kW - 1) * dilationW + 1;

  // Output spatial size from input size, padding, dilated kernel and stride.
  const auto oH = (iH + 2 * padH - dilatedKH) / dH + 1;
  const auto oW = (iW + 2 * padW - dilatedKW) / dW + 1;

  // Move dimension 1 to the end and materialise the result contiguously.
  auto trInput1 = input1.permute({0, 2, 3, 1}).contiguous();
  auto trInput2 = input2.permute({0, 2, 3, 1}).contiguous();

  // One block per (batch item, 4x4 tile of output positions).
  const dim3 threads(WARP_SIZE, 4, 4);
  const dim3 blocks(batch_size, (oH + 3) >> 2, (oW + 3) >> 2);

  c10::musa::MUSAGuard device_guard(input1.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input1.scalar_type(), "correlation_forward_musa", ([&] {
        TensorAcc4R trInput1_acc =
            trInput1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
        TensorAcc4R trInput2_acc =
            trInput2.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
        TensorAcc5R output_acc =
            output.packed_accessor32<scalar_t, 5, RestrictPtrTraits>();

        correlation_forward_musa_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                trInput1_acc, trInput2_acc, output_acc, kH, kW, patchH, patchW,
                padH, padW, dilationH, dilationW, dilation_patchH,
                dilation_patchW, dH, dW, oH, oW);
      }));

  // Report launch failures here, like the other MUSA launchers do, instead
  // of letting them surface at some later, unrelated MUSA call.
  AT_MUSA_CHECK(musaGetLastError());
}

// Launches the two correlation backward kernels on the current MUSA stream
// of the device that holds `input1`:
//   - correlation_backward_musa_kernel_input1: (grad_output, input2)
//     -> grad_input1
//   - correlation_backward_musa_kernel_input2: (grad_output, input1)
//     -> grad_input2
//
// input1 / input2           : 4-D tensors, permuted with {0, 2, 3, 1} and
//                             made contiguous before being handed over.
// grad_input1 / grad_input2 : 4-D tensors written by the kernels.
// grad_output               : 5-D tensor.
// The remaining integer arguments are forwarded to both kernels unchanged.
//
// Raises (via AT_MUSA_CHECK) if either launch left a pending MUSA error.
void CorrelationBackwardMUSAKernelLauncher(
    Tensor grad_output, Tensor input1, Tensor input2, Tensor grad_input1,
    Tensor grad_input2, int kH, int kW, int patchH, int patchW, int padH,
    int padW, int dilationH, int dilationW, int dilation_patchH,
    int dilation_patchW, int dH, int dW) {
  const int batch_size = input1.size(0);
  const int iH = input1.size(2);
  const int iW = input1.size(3);

  // Move dimension 1 to the end and materialise the result contiguously.
  auto trInput1 = input1.permute({0, 2, 3, 1}).contiguous();
  auto trInput2 = input2.permute({0, 2, 3, 1}).contiguous();
  // One block per (batch item, input row, input column).
  const dim3 blocks(batch_size, iH, iW);
  const dim3 threads(THREADS_PER_BLOCK);

  c10::musa::MUSAGuard device_guard(input1.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input1.scalar_type(), "correlation_backward_musa", ([&] {
        // Dynamic shared-memory request: one scalar per patch position.
        const int grad_cache_size = patchH * patchW * sizeof(scalar_t);
        TensorAcc4R input1_acc =
            trInput1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
        TensorAcc4R input2_acc =
            trInput2.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
        TensorAcc4R grad_input1_acc =
            grad_input1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
        TensorAcc4R grad_input2_acc =
            grad_input2.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
        TensorAcc5R grad_output_acc =
            grad_output.packed_accessor32<scalar_t, 5, RestrictPtrTraits>();

        correlation_backward_musa_kernel_input1<scalar_t>
            <<<blocks, threads, grad_cache_size, stream>>>(
                grad_output_acc, input2_acc, grad_input1_acc, kH, kW, patchH,
                patchW, padH, padW, dilationH, dilationW, dilation_patchH,
                dilation_patchW, dH, dW);

        correlation_backward_musa_kernel_input2<scalar_t>
            <<<blocks, threads, grad_cache_size, stream>>>(
                grad_output_acc, input1_acc, grad_input2_acc, kH, kW, patchH,
                patchW, padH, padW, dilationH, dilationW, dilation_patchH,
                dilation_patchW, dH, dW);
      }));

  // Report launch failures here, like the other MUSA launchers do, instead
  // of letting them surface at some later, unrelated MUSA call.
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/deform_conv_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "deform_conv_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

// Launches deformable_im2col_gpu_kernel on the current MUSA stream, reading
// data_im and data_offset and writing data_col.
//
// The column height/width are derived from the image size, padding, dilated
// kernel extent and stride. The launch is sized by
// channels * height_col * width_col * parallel_imgs work items.
// No device guard is set here; the stream is whatever is current on entry.
void deformable_im2col_musa(Tensor data_im, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor data_col) {
  // num_axes should be smaller than block size
  // todo: check parallel_imgs is correctly passed in
  const int kernel_extent_h = dilation_h * (ksize_h - 1) + 1;
  const int kernel_extent_w = dilation_w * (ksize_w - 1) + 1;
  const int col_h = (height + 2 * pad_h - kernel_extent_h) / stride_h + 1;
  const int col_w = (width + 2 * pad_w - kernel_extent_w) / stride_w + 1;
  const int total_work = channels * col_h * col_w * parallel_imgs;
  const int channels_per_group = channels / deformable_group;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_im.scalar_type(), "deformable_im2col_gpu", ([&] {
        const scalar_t *im_ptr = data_im.data_ptr<scalar_t>();
        const scalar_t *offset_ptr = data_offset.data_ptr<scalar_t>();
        scalar_t *col_ptr = data_col.data_ptr<scalar_t>();

        deformable_im2col_gpu_kernel<<<GET_BLOCKS(total_work),
                                       THREADS_PER_BLOCK, 0,
                                       c10::musa::getCurrentMUSAStream()>>>(
            total_work, im_ptr, offset_ptr, height, width, ksize_h, ksize_w,
            pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
            channels_per_group, parallel_imgs, channels, deformable_group,
            col_h, col_w, col_ptr);
      }));
  AT_MUSA_CHECK(musaGetLastError());
}

// Launches deformable_col2im_gpu_kernel on the current MUSA stream, reading
// data_col and data_offset and writing grad_im.
//
// The column height/width are derived from the image size, padding, dilated
// kernel extent and stride. The launch is sized by
// channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs.
// No device guard is set here; the stream is whatever is current on entry.
void deformable_col2im_musa(Tensor data_col, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor grad_im) {
  // todo: make sure parallel_imgs is passed in correctly
  const int kernel_extent_h = dilation_h * (ksize_h - 1) + 1;
  const int kernel_extent_w = dilation_w * (ksize_w - 1) + 1;
  const int col_h = (height + 2 * pad_h - kernel_extent_h) / stride_h + 1;
  const int col_w = (width + 2 * pad_w - kernel_extent_w) / stride_w + 1;
  const int total_work =
      channels * ksize_h * ksize_w * col_h * col_w * parallel_imgs;
  const int channels_per_group = channels / deformable_group;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
        const scalar_t *col_ptr = data_col.data_ptr<scalar_t>();
        const scalar_t *offset_ptr = data_offset.data_ptr<scalar_t>();
        scalar_t *grad_im_ptr = grad_im.data_ptr<scalar_t>();

        deformable_col2im_gpu_kernel<<<GET_BLOCKS(total_work),
                                       THREADS_PER_BLOCK, 0,
                                       c10::musa::getCurrentMUSAStream()>>>(
            total_work, col_ptr, offset_ptr, channels, height, width, ksize_h,
            ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
            channels_per_group, parallel_imgs, deformable_group, col_h, col_w,
            grad_im_ptr);
      }));
  AT_MUSA_CHECK(musaGetLastError());
}

// Launches deformable_col2im_coord_gpu_kernel on the current MUSA stream,
// reading data_col, data_im and data_offset and writing grad_offset.
//
// The column height/width are derived from the image size, padding, dilated
// kernel extent and stride. The launch is sized by
// height_col * width_col * 2 * ksize_h * ksize_w * deformable_group *
// parallel_imgs. Note that the per-group channel count passed to this kernel
// includes the ksize_h * ksize_w factor, unlike the im2col/col2im launchers.
// No device guard is set here; the stream is whatever is current on entry.
void deformable_col2im_coord_musa(
    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
    const int height, const int width, const int ksize_h, const int ksize_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int parallel_imgs,
    const int deformable_group, Tensor grad_offset) {
  const int kernel_extent_h = dilation_h * (ksize_h - 1) + 1;
  const int kernel_extent_w = dilation_w * (ksize_w - 1) + 1;
  const int col_h = (height + 2 * pad_h - kernel_extent_h) / stride_h + 1;
  const int col_w = (width + 2 * pad_w - kernel_extent_w) / stride_w + 1;
  const int total_work = col_h * col_w * 2 * ksize_h * ksize_w *
                         deformable_group * parallel_imgs;
  const int channels_per_group =
      channels * ksize_h * ksize_w / deformable_group;
  // Two values per kernel tap per deformable group.
  const int offset_channels = 2 * ksize_h * ksize_w * deformable_group;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
        const scalar_t *col_ptr = data_col.data_ptr<scalar_t>();
        const scalar_t *im_ptr = data_im.data_ptr<scalar_t>();
        const scalar_t *offset_ptr = data_offset.data_ptr<scalar_t>();
        scalar_t *grad_offset_ptr = grad_offset.data_ptr<scalar_t>();

        deformable_col2im_coord_gpu_kernel<<<
            GET_BLOCKS(total_work), THREADS_PER_BLOCK, 0,
            c10::musa::getCurrentMUSAStream()>>>(
            total_work, col_ptr, im_ptr, offset_ptr, channels, height, width,
            ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
            dilation_w, channels_per_group, parallel_imgs, offset_channels,
            deformable_group, col_h, col_w, grad_offset_ptr);
      }));
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/deform_roi_pool_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "deform_roi_pool_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

// Launches deform_roi_pool_forward_musa_kernel on the current MUSA stream of
// the device that holds `input`, writing into `output`.
//
// The grid is sized by output.numel(); channels/height/width passed to the
// kernel are dimensions 1..3 of `input`. spatial_scale and gamma are cast to
// the dispatched scalar type before the launch. Only float and double are
// dispatched.
void DeformRoIPoolForwardMUSAKernelLauncher(Tensor input, Tensor rois,
                                            Tensor offset, Tensor output,
                                            int pooled_height, int pooled_width,
                                            float spatial_scale,
                                            int sampling_ratio, float gamma) {
  const int total_elems = output.numel();
  const int in_channels = input.size(1);
  const int in_height = input.size(2);
  const int in_width = input.size(3);

  c10::musa::MUSAGuard device_guard(input.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "deform_roi_pool_forward_musa_kernel", ([&] {
        auto *input_ptr = input.data_ptr<scalar_t>();
        auto *rois_ptr = rois.data_ptr<scalar_t>();
        auto *offset_ptr = offset.data_ptr<scalar_t>();
        auto *output_ptr = output.data_ptr<scalar_t>();
        const scalar_t scale = static_cast<scalar_t>(spatial_scale);
        const scalar_t gamma_t = static_cast<scalar_t>(gamma);
        deform_roi_pool_forward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(total_elems), THREADS_PER_BLOCK, 0, stream>>>(
                total_elems, input_ptr, rois_ptr, offset_ptr, output_ptr,
                pooled_height, pooled_width, scale, sampling_ratio, gamma_t,
                in_channels, in_height, in_width);
      }));

  AT_MUSA_CHECK(musaGetLastError());
}

// Host-side launcher for the deformable RoI pooling backward kernel.
//
// The channel/height/width extents are taken from dims 1..3 of `grad_input`,
// and the launch grid is sized with GET_BLOCKS(grad_output.numel()). The
// scalar type is dispatched on `grad_output`; `spatial_scale` and `gamma` are
// converted to that type before being passed on. The kernel receives both
// `grad_input` and `grad_offset` as writable buffers. It is enqueued on the
// current MUSA stream and any launch error is reported via AT_MUSA_CHECK.
void DeformRoIPoolBackwardMUSAKernelLauncher(
    Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
    Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
    float spatial_scale, int sampling_ratio, float gamma) {
  const int num_outputs = grad_output.numel();
  const int num_channels = grad_input.size(1);
  const int feat_height = grad_input.size(2);
  const int feat_width = grad_input.size(3);

  // Make the device that owns `grad_output` current for the launch.
  c10::musa::MUSAGuard device_guard(grad_output.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES(
      grad_output.scalar_type(), "deform_roi_pool_backward_musa_kernel", [&] {
        scalar_t *grad_output_ptr = grad_output.data_ptr<scalar_t>();
        scalar_t *input_ptr = input.data_ptr<scalar_t>();
        scalar_t *rois_ptr = rois.data_ptr<scalar_t>();
        scalar_t *offset_ptr = offset.data_ptr<scalar_t>();
        scalar_t *grad_input_ptr = grad_input.data_ptr<scalar_t>();
        scalar_t *grad_offset_ptr = grad_offset.data_ptr<scalar_t>();
        const scalar_t scale_t = static_cast<scalar_t>(spatial_scale);
        const scalar_t gamma_t = static_cast<scalar_t>(gamma);

        deform_roi_pool_backward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(num_outputs), THREADS_PER_BLOCK, 0, stream>>>(
                num_outputs, grad_output_ptr, input_ptr, rois_ptr, offset_ptr,
                grad_input_ptr, grad_offset_ptr, pooled_height, pooled_width,
                scale_t, sampling_ratio, gamma_t, num_channels, feat_height,
                feat_width);
      });

  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/diff_iou_rotated_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Adapted from
// https://github.com/lilanxiao/Rotated_IoU/musa_op/sort_vert_kernel.cu  # noqa
#include "diff_iou_rotated_musa_kernel.muh"
#include "pytorch_cpp_helper.hpp"
#include "pytorch_musa_helper.hpp"

// Host-side launcher for the rotated-IoU vertex sorting kernel.
//
// All three inputs must be contiguous MUSA tensors. `vertices` is read as
// float, `mask` as bool and `num_valid` as int. The result is a freshly
// allocated, zero-initialised Int tensor of shape
// {vertices.size(0), vertices.size(1), MAX_NUM_VERT_IDX} on the same device
// as `vertices`, which the kernel fills in. The launch uses one block per
// entry of dim 0 and opt_n_thread(vertices.size(1)) threads per block.
at::Tensor DiffIoURotatedSortVerticesMUSAKernelLauncher(at::Tensor vertices,
                                                        at::Tensor mask,
                                                        at::Tensor num_valid) {
  c10::musa::MUSAGuard device_guard(vertices.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  // Validate layout first, then device placement, for every input.
  CHECK_CONTIGUOUS(vertices);
  CHECK_CONTIGUOUS(mask);
  CHECK_CONTIGUOUS(num_valid);
  CHECK_MUSA(vertices);
  CHECK_MUSA(mask);
  CHECK_MUSA(num_valid);

  const int dim0 = vertices.size(0);
  const int dim1 = vertices.size(1);
  const int dim2 = vertices.size(2);

  const auto idx_options =
      at::device(vertices.device()).dtype(at::ScalarType::Int);
  at::Tensor idx = torch::zeros({dim0, dim1, MAX_NUM_VERT_IDX}, idx_options);

  float *vertices_ptr = vertices.data_ptr<float>();
  bool *mask_ptr = mask.data_ptr<bool>();
  int *num_valid_ptr = num_valid.data_ptr<int>();
  int *idx_ptr = idx.data_ptr<int>();

  diff_iou_rotated_sort_vertices_forward_musa_kernel<<<
      dim0, opt_n_thread(dim1), 0, stream>>>(dim0, dim1, dim2, vertices_ptr,
                                             mask_ptr, num_valid_ptr, idx_ptr);
  AT_MUSA_CHECK(musaGetLastError());

  return idx;
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/filtered_lrelu.mu
================================================
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include <c10/util/Half.h>
#include <musa_runtime.h>
#include <torch/types.h>

#include <cstdint>

#include "pytorch_musa_helper.hpp"
#include "pytorch_device_registry.hpp"

//------------------------------------------------------------------------
// MUSA kernel parameters.

// Parameter bundle passed by value to setup_filters_kernel and
// filtered_lrelu_kernel. The field order defines the layout seen by those
// kernels, so fields must not be reordered independently of them.
struct filtered_lrelu_kernel_params {
  // These parameters decide which kernel to use.
  int up;        // upsampling ratio (1, 2, 4)
  int down;      // downsampling ratio (1, 2, 4)
  int2 fuShape;  // [size, 1] | [size, size]
  int2 fdShape;  // [size, 1] | [size, size]

  int _dummy;  // Alignment.

  // Rest of the parameters.
  const void *x;     // Input tensor.
  void *y;           // Output tensor.
  const void *b;     // Bias tensor.
  unsigned char *s;  // Sign tensor in/out. NULL if unused.
  const float *fu;   // Upsampling filter.
  const float *fd;   // Downsampling filter.

  int2 pad0;    // Left/top padding.
  float gain;   // Additional gain factor.
  float slope;  // Leaky ReLU slope on negative side.
  float clamp;  // Clamp after nonlinearity.
  int flip;     // Filter kernel flip for gradient computation.

  int tilesXdim;  // Original number of horizontal output tiles.
  int tilesXrep;  // Number of horizontal tiles per CTA.
  int blockZofs;  // Block z offset to support large minibatch, channel
                  // dimensions.

  int4 xShape;  // [width, height, channel, batch]
  int4 yShape;  // [width, height, channel, batch]
  int2 sShape;  // [width, height] - width is in bytes. Contiguous. Zeros if
                // unused.
  int2 sOfs;  // [ofs_x, ofs_y] - offset between upsampled data and sign tensor.
  int swLimit;  // Active width of sign tensor in bytes.

  longlong4 xStride;   // Strides of all tensors except signs, same component
                       // order as shapes.
  longlong4 yStride;   // Output tensor strides.
  int64_t bStride;     // Bias stride, multiplied by the channel index.
  longlong3 fuStride;  // Upsampling filter strides.
  longlong3 fdStride;  // Downsampling filter strides.
};

// Parameter bundle for the activation-only kernel variant (the kernel itself
// is selected through choose_filtered_lrelu_act_kernel and is not defined in
// this part of the file). Note that, unlike filtered_lrelu_kernel_params,
// the sign tensor width here is expressed in elements rather than bytes.
struct filtered_lrelu_act_kernel_params {
  void *x;           // Input/output, modified in-place.
  unsigned char *s;  // Sign tensor in/out. NULL if unused.

  float gain;   // Additional gain factor.
  float slope;  // Leaky ReLU slope on negative side.
  float clamp;  // Clamp after nonlinearity.

  int4 xShape;        // [width, height, channel, batch]
  longlong4 xStride;  // Input/output tensor strides, same order as in shape.
  int2 sShape;  // [width, height] - width is in elements. Contiguous. Zeros if
                // unused.
  int2 sOfs;  // [ofs_x, ofs_y] - offset between upsampled data and sign tensor.
};

//------------------------------------------------------------------------
// MUSA kernel specialization.

// Describes one selected kernel specialization together with the launch
// geometry it expects. Returned by choose_filtered_lrelu_kernel. The kernel
// entry points are stored as untyped pointers.
struct filtered_lrelu_kernel_spec {
  void *setup;   // Function for filter kernel setup.
  void *exec;    // Function for main operation.
  int2 tileOut;  // Width/height of launch tile.
  int numWarps;  // Number of warps per thread block, determines launch block
                 // size.
  int xrep;      // For processing multiple horizontal tiles per thread block.
  int dynamicSharedKB;  // How much dynamic shared memory the exec kernel wants.
};

//------------------------------------------------------------------------
// MUSA kernel selection.

// Selects a filtered_lrelu kernel specialization for element type T and index
// type index_t, given the launch parameters and a shared-memory budget
// `sharedKB`. signWrite / signRead are compile-time flags matching the
// template parameters of the same name on filtered_lrelu_kernel.
// Declaration only; the definition is not in this part of the file.
template <class T, class index_t, bool signWrite, bool signRead>
filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(
    const filtered_lrelu_kernel_params &p, int sharedKB);
// Returns an untyped pointer to the activation-only kernel for the given
// element type and sign flags. Declaration only.
template <class T, bool signWrite, bool signRead>
void *choose_filtered_lrelu_act_kernel(void);

//------------------------------------------------------------------------
// Helpers.

// Values for the `filterMode` template parameter of filtered_lrelu_kernel.
// The first two letters describe the upsampling filter and the last two the
// downsampling filter (S = separable, F = full).
enum              // Filter modes.
{ MODE_SUSD = 0,  // Separable upsampling, separable downsampling.
  MODE_FUSD = 1,  // Full upsampling, separable downsampling.
  MODE_SUFD = 2,  // Separable upsampling, full downsampling.
  MODE_FUFD = 3,  // Full upsampling, full downsampling.
};

// Traits that map a tensor element type T to the types used for arithmetic
// inside the kernels: a scalar type, 2- and 4-wide vector types, zero
// constructors for those vectors, and a symmetric clamp to [-c, c].
// Only the specializations below are defined.
template <class T>
struct InternalType;

// double: computed in double precision.
template <>
struct InternalType<double> {
  using scalar_t = double;
  using vec2_t = double2;
  using vec4_t = double4;
  __device__ __forceinline__ static vec2_t zero_vec2(void) {
    return make_double2(0, 0);
  }
  __device__ __forceinline__ static vec4_t zero_vec4(void) {
    return make_double4(0, 0, 0, 0);
  }
  __device__ __forceinline__ static double clamp(double x, double c) {
    const double lower_bounded = fmax(x, -c);
    return fmin(lower_bounded, c);
  }
};

// float: computed in single precision.
template <>
struct InternalType<float> {
  using scalar_t = float;
  using vec2_t = float2;
  using vec4_t = float4;
  __device__ __forceinline__ static vec2_t zero_vec2(void) {
    return make_float2(0, 0);
  }
  __device__ __forceinline__ static vec4_t zero_vec4(void) {
    return make_float4(0, 0, 0, 0);
  }
  __device__ __forceinline__ static float clamp(float x, float c) {
    const float lower_bounded = fmaxf(x, -c);
    return fminf(lower_bounded, c);
  }
};

// c10::Half: stored as half, but all internal arithmetic uses float.
template <>
struct InternalType<c10::Half> {
  using scalar_t = float;
  using vec2_t = float2;
  using vec4_t = float4;
  __device__ __forceinline__ static vec2_t zero_vec2(void) {
    return make_float2(0, 0);
  }
  __device__ __forceinline__ static vec4_t zero_vec4(void) {
    return make_float4(0, 0, 0, 0);
  }
  __device__ __forceinline__ static float clamp(float x, float c) {
    const float lower_bounded = fmaxf(x, -c);
    return fminf(lower_bounded, c);
  }
};

// Two-argument minimum and maximum. These are function-like macros, so each
// argument may be evaluated more than once; do not pass expressions with side
// effects.
#define MIN(A, B) ((A) < (B) ? (A) : (B))
#define MAX(A, B) ((A) > (B) ? (A) : (B))
// Integer division of A by B that rounds a positive A upwards.
//  - B == 1: yields A unchanged.
//  - B == 2 / B == 4: adds B-1, casts to int and shifts right by 1 / 2.
//  - any other B: adds B-1 only when A > 0, then divides, so a non-positive A
//    goes through plain truncating division.
// NOTE(review): the shift branches rely on right-shifting a negative int
// behaving arithmetically when A is negative - confirm for the target
// compiler. A and B are each expanded several times.
#define CEIL_DIV(A, B)                                   \
  (((B) == 1)                                            \
       ? (A)                                             \
       : ((B) == 2) ? ((int)((A) + 1) >> 1)              \
                    : ((B) == 4) ? ((int)((A) + 3) >> 2) \
                                 : (((A) + ((A) > 0 ? (B)-1 : 0)) / (B)))

// Splits i into quotient y = i / N and remainder x = i - y * N for a
// compile-time divisor N.
//
// When N is not a power of two and N <= 256, the quotient is obtained with a
// multiply by a precomputed reciprocal followed by a 24-bit shift instead of
// a division; that path assumes i < N * 256. For every other N a plain
// division is used.
template <int N>
__device__ __forceinline__ void fast_div_mod(int &x, int &y, unsigned int i) {
  const bool useReciprocal = ((N & (N - 1)) != 0) && (N <= 256);

  // Both arms are evaluated in unsigned arithmetic, then narrowed to int.
  y = useReciprocal ? ((i * ((1 << 24) / N + 1)) >> 24)  // i < N*256 assumed.
                    : (i / N);

  x = i - y * N;
}

// Reads a stride that is stored as a 64-bit integer through a pointer of type
// T, i.e. the first sizeof(T) bytes of the stored value are reinterpreted as
// a T and returned. Callers pick T to match the index type they compute with.
template <class T>
__device__ __forceinline__ T get_stride(const int64_t &x) {
  const T *asIndexType = reinterpret_cast<const T *>(&x);
  return *asIndexType;
}

//------------------------------------------------------------------------
// Filters, setup kernel, copying function.

// Largest supported filter extent per axis. Each filter slot below holds
// MAX_FILTER_SIZE * MAX_FILTER_SIZE floats.
#define MAX_FILTER_SIZE 32

// Combined up/down filter buffers so that transfer can be done with one copy.
// Each buffer holds two slots: the upsampling filter first, then the
// downsampling filter. setup_filters_kernel fills g_fbuf and copy_filters()
// copies the whole of it into c_fbuf.
__device__ float
    g_fbuf[2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE];  // Filters in global memory,
                                                    // written by setup kernel.
__device__ __constant__ float
    c_fbuf[2 * MAX_FILTER_SIZE *
           MAX_FILTER_SIZE];  // Filters in constant memory, read by main
                              // kernel.

// Accessors to combined buffers to index up/down filters individually.
// *_fu is the first slot (upsampling), *_fd the second slot (downsampling).
#define c_fu (c_fbuf)
#define c_fd (c_fbuf + MAX_FILTER_SIZE * MAX_FILTER_SIZE)
#define g_fu (g_fbuf)
#define g_fd (g_fbuf + MAX_FILTER_SIZE * MAX_FILTER_SIZE)

// Fills the global-memory filter buffers g_fu / g_fd from p.fu / p.fd.
//
// Every one of the MAX_FILTER_SIZE * MAX_FILTER_SIZE slots of each buffer is
// written: slots that fall outside the filter's shape get 0, the others get
// the corresponding filter tap. Taps are read in reversed order unless p.flip
// is set. When a filter's shape.y is > 0 both the x and y strides are used
// and the slot's row is bounded by shape.y; otherwise only row 0 is populated
// using the x stride alone. Threads of the block stride over the slots by
// blockDim.x.
static __global__ void setup_filters_kernel(filtered_lrelu_kernel_params p) {
  for (int slot = threadIdx.x; slot < MAX_FILTER_SIZE * MAX_FILTER_SIZE;
       slot += blockDim.x) {
    int col, row;
    fast_div_mod<MAX_FILTER_SIZE>(col, row, slot);

    // Upsampling filter.
    {
      const bool hasRows = p.fuShape.y > 0;
      const bool outside =
          (col >= p.fuShape.x) || (hasRows ? (row >= p.fuShape.y) : (row > 0));
      float tap = 0.0f;
      if (!outside) {
        const int srcX = p.flip ? col : (p.fuShape.x - 1 - col);
        if (hasRows) {
          const int srcY = p.flip ? row : (p.fuShape.y - 1 - row);
          tap = p.fu[srcX * p.fuStride.x + srcY * p.fuStride.y];
        } else {
          tap = p.fu[srcX * p.fuStride.x];
        }
      }
      g_fu[slot] = tap;
    }

    // Downsampling filter.
    {
      const bool hasRows = p.fdShape.y > 0;
      const bool outside =
          (col >= p.fdShape.x) || (hasRows ? (row >= p.fdShape.y) : (row > 0));
      float tap = 0.0f;
      if (!outside) {
        const int srcX = p.flip ? col : (p.fdShape.x - 1 - col);
        if (hasRows) {
          const int srcY = p.flip ? row : (p.fdShape.y - 1 - row);
          tap = p.fd[srcX * p.fdStride.x + srcY * p.fdStride.y];
        } else {
          tap = p.fd[srcX * p.fdStride.x];
        }
      }
      g_fd[slot] = tap;
    }
  }
}

// Host helper that transfers the filters prepared by the setup kernel from
// the global-memory buffer g_fbuf into the constant-memory buffer c_fbuf.
// The copy covers both filter slots and is enqueued asynchronously on
// `stream`. Returns the first non-zero error code encountered, or the result
// of the copy call.
static musaError_t copy_filters(musaStream_t stream) {
  const size_t numBytes =
      2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE * sizeof(float);

  // Resolve the device address of the global buffer to use as copy source.
  void *globalFilters = nullptr;
  musaError_t status = musaGetSymbolAddress(&globalFilters, g_fbuf);
  if (status) {
    return status;
  }

  status = musaMemcpyToSymbolAsync(c_fbuf, globalFilters, numBytes, 0,
                                   musaMemcpyDeviceToDevice, stream);
  return status;
}

//------------------------------------------------------------------------
// Coordinate spaces:
// - Relative to input tensor:      inX, inY, tileInX, tileInY
// - Relative to input tile:        relInX, relInY, tileInW, tileInH
// - Relative to upsampled tile:    relUpX, relUpY, tileUpW, tileUpH
// - Relative to output tile:       relOutX, relOutY, tileOutW, tileOutH
// - Relative to output tensor:     outX, outY, tileOutX, tileOutY
//
// Relationships between coordinate spaces:
// - inX = tileInX + relInX
// - inY = tileInY + relInY
// - relUpX = relInX * up + phaseInX
// - relUpY = relInY * up + phaseInY
// - relUpX = relOutX * down
// - relUpY = relOutY * down
// - outX = tileOutX + relOutX
// - outY = tileOutY + relOutY

// Externally sized shared-memory buffer. filtered_lrelu_kernel uses it as the
// backing store for its tile buffers only when its sharedKB template
// parameter is above 48; for smaller budgets the kernel declares its own
// statically sized __shared__ array instead.
extern __shared__ char
    s_buf_raw[];  // When sharedKB <= 48, allocate shared memory statically
                  // inside the kernel, otherwise use the externally allocated
                  // shared memory buffer.

template <class T, class index_t, int sharedKB, bool signWrite, bool signRead,
          int filterMode, int up, int fuSize, int down, int fdSize,
          int tileOutW, int tileOutH, int threadsPerBlock, bool enableXrep,
          bool enableWriteSkip>
static __global__ void filtered_lrelu_kernel(filtered_lrelu_kernel_params p) {
  // Check that we don't try to support non-existing filter modes.
  static_assert(up == 1 || up == 2 || up == 4,
                "only up=1, up=2, up=4 scales supported");
  static_assert(down == 1 || down == 2 || down == 4,
                "only down=1, down=2, down=4 scales supported");
  static_assert(fuSize >= up,
                "upsampling filter size must be at least upsampling factor");
  static_assert(
      fdSize >= down,
      "downsampling filter size must be at least downsampling factor");
  static_assert(
      fuSize % up == 0,
      "upsampling filter size must be divisible with upsampling factor");
  static_assert(
      fdSize % down == 0,
      "downsampling filter size must be divisible with downsampling factor");
  static_assert(fuSize <= MAX_FILTER_SIZE && fdSize <= MAX_FILTER_SIZE,
                "filter size greater than MAX_FILTER_SIZE");
  static_assert(up != 1 || (fuSize == 1 && (filterMode == MODE_FUFD ||
                                            filterMode == MODE_FUSD)),
                "up=1 supported only for 1x1 full filters");
  static_assert(down != 1 || (fdSize == 1 && (filterMode == MODE_FUFD ||
                                              filterMode == MODE_SUFD)),
                "down=1 supported only for 1x1 full filters");
  static_assert(
      !(up == 4 && (filterMode == MODE_FUFD || filterMode == MODE_FUSD)),
      "full filters not supported for up=4");
  static_assert(
      !(down == 4 && (filterMode == MODE_FUFD || filterMode == MODE_SUFD)),
      "full filters not supported for down=4");

  // Static definitions.
  typedef typename InternalType<T>::scalar_t scalar_t;
  typedef typename InternalType<T>::vec2_t vec2_t;
  typedef typename InternalType<T>::vec4_t vec4_t;
  const int tileUpW = (tileOutW * down + (fdSize - 1) - (down - 1) + 3) &
                      ~3;  // Upsampled tile width, rounded up to multiple of 4.
  const int tileUpH =
      tileOutH * down + (fdSize - 1) - (down - 1);  // Upsampled tile height.
  const int tileInW =
      CEIL_DIV(tileUpW + (fuSize - 1), up);  // Input tile width.
  const int tileInH =
      CEIL_DIV(tileUpH + (fuSize - 1), up);  // Input tile height.
  const int tileUpH_up =
      CEIL_DIV(tileUpH, up) *
      up;  // Upsampled tile height rounded up to a multiple of up.
  const int tileInH_up =
      CEIL_DIV(tileUpH_up + (fuSize - 1),
               up);  // For allocations only, to avoid shared memory read
                     // overruns with up=2 and up=4.

  // Merge 1x1 downsampling into last upsampling step for upf1 and ups2.
  const bool downInline =
      (down == 1) && ((up == 1 && filterMode == MODE_FUFD) ||
                      (up == 2 && filterMode == MODE_SUFD));

  // Sizes of logical buffers.
  const int szIn = tileInH_up * tileInW;
  const int szUpX = tileInH_up * tileUpW;
  const int szUpXY = downInline ? 0 : (tileUpH * tileUpW);
  const int szDownX = tileUpH * tileOutW;

  // Sizes for shared memory arrays.
  const int s_buf0_size_base =
      (filterMode == MODE_SUSD)
          ? MAX(szIn, szUpXY)
          : (filterMode == MODE_FUSD)
                ? MAX(szIn, szDownX)
                : (filterMode == MODE_SUFD)
                      ? MAX(szIn, szUpXY)
                      : (filterMode == MODE_FUFD) ? szIn : -1;
  const int s_buf1_size_base =
      (filterMode == MODE_SUSD)
          ? MAX(szUpX, szDownX)
          : (filterMode == MODE_FUSD)
                ? szUpXY
                : (filterMode == MODE_SUFD)
                      ? szUpX
                      : (filterMode == MODE_FUFD) ? szUpXY : -1;

  // Ensure U128 alignment.
  const int s_buf0_size = (s_buf0_size_base + 3) & ~3;
  const int s_buf1_size = (s_buf1_size_base + 3) & ~3;

  // Check at compile time that we don't use too much shared memory.
  static_assert(
      (s_buf0_size + s_buf1_size) * sizeof(scalar_t) <= (sharedKB << 10),
      "shared memory overflow");

  // Declare shared memory arrays.
  scalar_t *s_buf0;
  scalar_t *s_buf1;
  if (sharedKB <= 48) {
    // Allocate shared memory arrays here.
    __shared__ scalar_t
        s_buf0_st[(sharedKB > 48)
                      ? (1 << 24)
                      : (s_buf0_size +
                         s_buf1_size)];  // Prevent launching if this isn't
                                         // optimized away when unused.
    s_buf0 = s_buf0_st;
    s_buf1 = s_buf0 + s_buf0_size;
  } else {
    // Use the dynamically allocated shared memory array.
    s_buf0 = (scalar_t *)s_buf_raw;
    s_buf1 = s_buf0 + s_buf0_size;
  }

  // Pointers to the buffers.
  scalar_t *
      s_tileIn;  // Input tile:                      [relInX * tileInH + relInY]
  scalar_t *s_tileUpX;   // After horizontal upsampling:     [relInY * tileUpW +
                         // relUpX]
  scalar_t *s_tileUpXY;  // After upsampling:                [relUpY * tileUpW +
                         // relUpX]
  scalar_t *s_tileDownX;  // After horizontal downsampling:   [relUpY * tileOutW
                          // + relOutX]
  if (filterMode == MODE_SUSD) {
    s_tileIn = s_buf0;
    s_tileUpX = s_buf1;
    s_tileUpXY = s_buf0;
    s_tileDownX = s_buf1;
  } else if (filterMode == MODE_FUSD) {
    s_tileIn = s_buf0;
    s_tileUpXY = s_buf1;
    s_tileDownX = s_buf0;
  } else if (filterMode == MODE_SUFD) {
    s_tileIn = s_buf0;
    s_tileUpX = s_buf1;
    s_tileUpXY = s_buf0;
  } else if (filterMode == MODE_FUFD) {
    s_tileIn = s_buf0;
    s_tileUpXY = s_buf1;
  }

  // Allow large grids in z direction via per-launch offset.
  int channelIdx = blockIdx.z + p.blockZofs;
  int batchIdx = channelIdx / p.yShape.z;
  channelIdx -= batchIdx * p.yShape.z;

  // Offset to output feature map. In bytes.
  index_t mapOfsOut = channelIdx * get_stride<index_t>(p.yStride.z) +
                      batchIdx * get_stride<index_t>(p.yStride.w);

  // Sign shift amount.
  uint32_t signXo = ((threadIdx.x + p.sOfs.x) << 1) & 6;

// Inner tile loop.
#pragma unroll 1
  for (int tileIdx = 0;
       !enableXrep ||
       (tileIdx < MIN(p.tilesXrep, p.tilesXdim - p.tilesXrep * blockIdx.y));
       tileIdx++) {
    // Locate output tile.
    int tileX = enableXrep ? blockIdx.y * p.tilesXrep + tileIdx : blockIdx.x;
    int tileOutX = tileX * tileOutW;
    int tileOutY = (enableXrep ? blockIdx.x : blockIdx.y) * tileOutH;

    // Locate input tile.
    int tmpX = tileOutX * down - p.pad0.x;
    int tmpY = tileOutY * down - p.pad0.y;
    int tileInX = CEIL_DIV(tmpX, up);
    int tileInY = CEIL_DIV(tmpY, up);
    const int phaseInX = tileInX * up - tmpX;
    const int phaseInY = tileInY * up - tmpY;

    // Extra sync if input and output buffers are the same and we are not on
    // first tile.
    if (enableXrep && tileIdx > 0 &&
        (filterMode == MODE_FUSD || (filterMode == MODE_SUFD && !downInline) ||
         (filterMode == MODE_FUFD && downInline)))
      __syncthreads();

    // Load input tile & apply bias. Unrolled.
    scalar_t b =
        (scalar_t) * (const T *)((const char *)p.b +
                                 (channelIdx * get_stride<index_t>(p.bStride)));
    index_t mapOfsIn = channelIdx * get_stride<index_t>(p.xStride.z) +
                       batchIdx * get_stride<index_t>(p.xStride.w);
    int idx = threadIdx.x;
    const int loopCountIN = CEIL_DIV(tileInW * tileInH, threadsPerBlock);
#pragma unroll
    for (int loop = 0; loop < loopCountIN; loop++) {
      int relInX, relInY;
      fast_div_mod<tileInW>(relInX, relInY, idx);
      int inX = tileInX + relInX;
      int inY = tileInY + relInY;
      scalar_t v = 0;

      if ((uint32_t)inX < p.xShape.x && (uint32_t)inY < p.xShape.y)
        v = (scalar_t) * ((const T *)((const char *)p.x +
                                      (inX * get_stride<index_t>(p.xStride.x) +
                                       inY * get_stride<index_t>(p.xStride.y) +
                                       mapOfsIn))) +
            b;

      bool skip = (loop == loopCountIN - 1) && (idx >= tileInW * tileInH);
      if (!skip) s_tileIn[idx] = v;

      idx += threadsPerBlock;
    }

    if (filterMode == MODE_SUSD ||
        filterMode == MODE_SUFD)  // Separable upsampling filter.
    {
      // Horizontal upsampling.
      __syncthreads();
      if (up == 4) {
        for (int idx = threadIdx.x * up; idx < tileUpW * tileInH;
             idx += blockDim.x * up) {
          int relUpX0, relInY;
          fast_div_mod<tileUpW>(relUpX0, relInY, idx);
          int relInX0 = relUpX0 / up;
          int src0 = relInX0 + tileInW * relInY;
          int dst = relInY * tileUpW + relUpX0;
          vec4_t v = InternalType<T>::zero_vec4();
          scalar_t a = s_tileIn[src0];
          if (phaseInX == 0) {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileIn[src0 + step + 1];
              v.y += a * (scalar_t)c_fu[step * up + 3];
              v.z += a * (scalar_t)c_fu[step * up + 2];
              v.w += a * (scalar_t)c_fu[step * up + 1];
            }
          } else if (phaseInX == 1) {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 1];
              v.y += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileIn[src0 + step + 1];
              v.z += a * (scalar_t)c_fu[step * up + 3];
              v.w += a * (scalar_t)c_fu[step * up + 2];
            }
          } else if (phaseInX == 2) {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 2];
              v.y += a * (scalar_t)c_fu[step * up + 1];
              v.z += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileIn[src0 + step + 1];
              v.w += a * (scalar_t)c_fu[step * up + 3];
            }
          } else  // (phaseInX == 3)
          {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 3];
              v.y += a * (scalar_t)c_fu[step * up + 2];
              v.z += a * (scalar_t)c_fu[step * up + 1];
              v.w += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileIn[src0 + step + 1];
            }
          }
          s_tileUpX[dst + 0] = v.x;
          s_tileUpX[dst + 1] = v.y;
          s_tileUpX[dst + 2] = v.z;
          s_tileUpX[dst + 3] = v.w;
        }
      } else if (up == 2) {
        bool p0 = (phaseInX == 0);
        for (int idx = threadIdx.x * up; idx < tileUpW * tileInH;
             idx += blockDim.x * up) {
          int relUpX0, relInY;
          fast_div_mod<tileUpW>(relUpX0, relInY, idx);
          int relInX0 = relUpX0 / up;
          int src0 = relInX0 + tileInW * relInY;
          int dst = relInY * tileUpW + relUpX0;
          vec2_t v = InternalType<T>::zero_vec2();
          scalar_t a = s_tileIn[src0];
          if (p0)  // (phaseInX == 0)
          {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileIn[src0 + step + 1];
              v.y += a * (scalar_t)c_fu[step * up + 1];
            }
          } else  // (phaseInX == 1)
          {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 1];
              v.y += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileIn[src0 + step + 1];
            }
          }
          s_tileUpX[dst + 0] = v.x;
          s_tileUpX[dst + 1] = v.y;
        }
      }

      // Vertical upsampling & nonlinearity.

      __syncthreads();
      int groupMask = 15 << ((threadIdx.x & 31) & ~3);
      int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH
                          : 0;  // Skip already written signs.
      int sShapeMaxY =
          MIN(p.sShape.y,
              tileOutY * down + tileUpH);  // Avoid out-of-tile sign writes.
      if (up == 4) {
        minY -= 3;  // Adjust according to block height.
        for (int idx = threadIdx.x; idx < tileUpW * tileUpH_up / up;
             idx += blockDim.x) {
          int relUpX, relInY0;
          fast_div_mod<tileUpW>(relUpX, relInY0, idx);
          int relUpY0 = relInY0 * up;
          int src0 = relInY0 * tileUpW + relUpX;
          int dst = relUpY0 * tileUpW + relUpX;
          vec4_t v = InternalType<T>::zero_vec4();

          scalar_t a = s_tileUpX[src0];
          if (phaseInY == 0) {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileUpX[src0 + (step + 1) * tileUpW];
              v.y += a * (scalar_t)c_fu[step * up + 3];
              v.z += a * (scalar_t)c_fu[step * up + 2];
              v.w += a * (scalar_t)c_fu[step * up + 1];
            }
          } else if (phaseInY == 1) {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 1];
              v.y += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileUpX[src0 + (step + 1) * tileUpW];
              v.z += a * (scalar_t)c_fu[step * up + 3];
              v.w += a * (scalar_t)c_fu[step * up + 2];
            }
          } else if (phaseInY == 2) {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 2];
              v.y += a * (scalar_t)c_fu[step * up + 1];
              v.z += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileUpX[src0 + (step + 1) * tileUpW];
              v.w += a * (scalar_t)c_fu[step * up + 3];
            }
          } else  // (phaseInY == 3)
          {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 3];
              v.y += a * (scalar_t)c_fu[step * up + 2];
              v.z += a * (scalar_t)c_fu[step * up + 1];
              v.w += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileUpX[src0 + (step + 1) * tileUpW];
            }
          }

          int x = tileOutX * down + relUpX;
          int y = tileOutY * down + relUpY0;
          int signX = x + p.sOfs.x;
          int signY = y + p.sOfs.y;
          int signZ = blockIdx.z + p.blockZofs;
          int signXb = signX >> 2;
          index_t si0 =
              signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
          index_t si1 = si0 + p.sShape.x;
          index_t si2 = si0 + p.sShape.x * 2;
          index_t si3 = si0 + p.sShape.x * 3;

          v.x *= (scalar_t)((float)up * (float)up * p.gain);
          v.y *= (scalar_t)((float)up * (float)up * p.gain);
          v.z *= (scalar_t)((float)up * (float)up * p.gain);
          v.w *= (scalar_t)((float)up * (float)up * p.gain);

          if (signWrite) {
            if (!enableWriteSkip) {
              // Determine and write signs.
              int sx = __float_as_uint(v.x) >> 31 << 0;
              int sy = __float_as_uint(v.y) >> 31 << 8;
              int sz = __float_as_uint(v.z) >> 31 << 16;
              int sw = __float_as_uint(v.w) >> 31 << 24;
              if (sx) v.x *= p.slope;
              if (sy) v.y *= p.slope;
              if (sz) v.z *= p.slope;
              if (sw) v.w *= p.slope;
              if (fabsf(v.x) > p.clamp) {
                sx = 2 << 0;
                v.x = InternalType<T>::clamp(v.x, p.clamp);
              }
              if (fabsf(v.y) > p.clamp) {
                sy = 2 << 8;
                v.y = InternalType<T>::clamp(v.y, p.clamp);
              }
              if (fabsf(v.z) > p.clamp) {
                sz = 2 << 16;
                v.z = InternalType<T>::clamp(v.z, p.clamp);
              }
              if (fabsf(v.w) > p.clamp) {
                sw = 2 << 24;
                v.w = InternalType<T>::clamp(v.w, p.clamp);
              }

              if ((uint32_t)signXb < p.swLimit && signY >= minY) {
                // Combine signs.
                uint32_t s = sx + sy + sw + sz;
                s <<= (signX & 3) << 1;
#ifdef MMCV_WITH_HIP
                s |= __shfl_xor(s, 1);
                s |= __shfl_xor(s, 2);
#else
                s |= __shfl_xor_sync(groupMask, s, 1);
                s |= __shfl_xor_sync(groupMask, s, 2);
#endif

                // Write signs.
                if ((uint32_t)(signY + 0) < sShapeMaxY) {
                  p.s[si0] = (unsigned char)(s >> 0);
                }
                if ((uint32_t)(signY + 1) < sShapeMaxY) {
                  p.s[si1] = (unsigned char)(s >> 8);
                }
                if ((uint32_t)(signY + 2) < sShapeMaxY) {
                  p.s[si2] = (unsigned char)(s >> 16);
                }
                if ((uint32_t)(signY + 3) < sShapeMaxY) {
                  p.s[si3] = (unsigned char)(s >> 24);
                }
              }
            } else {
              // Determine and write signs.
              if ((uint32_t)signXb < p.swLimit && signY >= minY) {
                int sx = __float_as_uint(v.x) >> 31 << 0;
                int sy = __float_as_uint(v.y) >> 31 << 8;
                int sz = __float_as_uint(v.z) >> 31 << 16;
                int sw = __float_as_uint(v.w) >> 31 << 24;
                if (sx) v.x *= p.slope;
                if (sy) v.y *= p.slope;
                if (sz) v.z *= p.slope;
                if (sw) v.w *= p.slope;
                if (fabsf(v.x) > p.clamp) {
                  sx = 2 << 0;
                  v.x = InternalType<T>::clamp(v.x, p.clamp);
                }
                if (fabsf(v.y) > p.clamp) {
                  sy = 2 << 8;
                  v.y = InternalType<T>::clamp(v.y, p.clamp);
                }
                if (fabsf(v.z) > p.clamp) {
                  sz = 2 << 16;
                  v.z = InternalType<T>::clamp(v.z, p.clamp);
                }
                if (fabsf(v.w) > p.clamp) {
                  sw = 2 << 24;
                  v.w = InternalType<T>::clamp(v.w, p.clamp);
                }

                // Combine signs.
                uint32_t s = sx + sy + sw + sz;
                s <<= (signX & 3) << 1;
#ifdef MMCV_WITH_HIP
                s |= __shfl_xor(s, 1);
                s |= __shfl_xor(s, 2);
#else
                s |= __shfl_xor_sync(groupMask, s, 1);
                s |= __shfl_xor_sync(groupMask, s, 2);
#endif

                // Write signs.
                if ((uint32_t)(signY + 0) < sShapeMaxY) {
                  p.s[si0] = (unsigned char)(s >> 0);
                }
                if ((uint32_t)(signY + 1) < sShapeMaxY) {
                  p.s[si1] = (unsigned char)(s >> 8);
                }
                if ((uint32_t)(signY + 2) < sShapeMaxY) {
                  p.s[si2] = (unsigned char)(s >> 16);
                }
                if ((uint32_t)(signY + 3) < sShapeMaxY) {
                  p.s[si3] = (unsigned char)(s >> 24);
                }
              } else {
                // Just compute the values.
                if (v.x < 0.f) v.x *= p.slope;
                v.x = InternalType<T>::clamp(v.x, p.clamp);
                if (v.y < 0.f) v.y *= p.slope;
                v.y = InternalType<T>::clamp(v.y, p.clamp);
                if (v.z < 0.f) v.z *= p.slope;
                v.z = InternalType<T>::clamp(v.z, p.clamp);
                if (v.w < 0.f) v.w *= p.slope;
                v.w = InternalType<T>::clamp(v.w, p.clamp);
              }
            }
          } else if (signRead)  // Read signs and apply.
          {
            if ((uint32_t)signXb < p.swLimit) {
              int ss = (signX & 3) << 1;
              if ((uint32_t)(signY + 0) < p.sShape.y) {
                int s = p.s[si0] >> ss;
                if (s & 1) v.x *= p.slope;
                if (s & 2) v.x = 0.f;
              }
              if ((uint32_t)(signY + 1) < p.sShape.y) {
                int s = p.s[si1] >> ss;
                if (s & 1) v.y *= p.slope;
                if (s & 2) v.y = 0.f;
              }
              if ((uint32_t)(signY + 2) < p.sShape.y) {
                int s = p.s[si2] >> ss;
                if (s & 1) v.z *= p.slope;
                if (s & 2) v.z = 0.f;
              }
              if ((uint32_t)(signY + 3) < p.sShape.y) {
                int s = p.s[si3] >> ss;
                if (s & 1) v.w *= p.slope;
                if (s & 2) v.w = 0.f;
              }
            }
          } else  // Forward pass with no sign write.
          {
            if (v.x < 0.f) v.x *= p.slope;
            v.x = InternalType<T>::clamp(v.x, p.clamp);
            if (v.y < 0.f) v.y *= p.slope;
            v.y = InternalType<T>::clamp(v.y, p.clamp);
            if (v.z < 0.f) v.z *= p.slope;
            v.z = InternalType<T>::clamp(v.z, p.clamp);
            if (v.w < 0.f) v.w *= p.slope;
            v.w = InternalType<T>::clamp(v.w, p.clamp);
          }

          s_tileUpXY[dst + 0 * tileUpW] = v.x;
          if (relUpY0 + 1 < tileUpH) s_tileUpXY[dst + 1 * tileUpW] = v.y;
          if (relUpY0 + 2 < tileUpH) s_tileUpXY[dst + 2 * tileUpW] = v.z;
          if (relUpY0 + 3 < tileUpH) s_tileUpXY[dst + 3 * tileUpW] = v.w;
        }
      } else if (up == 2) {
        minY -= 1;  // Adjust according to block height.
        for (int idx = threadIdx.x; idx < tileUpW * tileUpH_up / up;
             idx += blockDim.x) {
          int relUpX, relInY0;
          fast_div_mod<tileUpW>(relUpX, relInY0, idx);
          int relUpY0 = relInY0 * up;
          int src0 = relInY0 * tileUpW + relUpX;
          int dst = relUpY0 * tileUpW + relUpX;
          vec2_t v = InternalType<T>::zero_vec2();

          scalar_t a = s_tileUpX[src0];
          if (phaseInY == 0) {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileUpX[src0 + (step + 1) * tileUpW];
              v.y += a * (scalar_t)c_fu[step * up + 1];
            }
          } else  // (phaseInY == 1)
          {
#pragma unroll
            for (int step = 0; step < fuSize / up; step++) {
              v.x += a * (scalar_t)c_fu[step * up + 1];
              v.y += a * (scalar_t)c_fu[step * up + 0];
              a = s_tileUpX[src0 + (step + 1) * tileUpW];
            }
          }

          int x = tileOutX * down + relUpX;
          int y = tileOutY * down + relUpY0;
          int signX = x + p.sOfs.x;
          int signY = y + p.sOfs.y;
          int signZ = blockIdx.z + p.blockZofs;
          int signXb = signX >> 2;
          index_t si0 =
              signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
          index_t si1 = si0 + p.sShape.x;

          v.x *= (scalar_t)((float)up * (float)up * p.gain);
          v.y *= (scalar_t)((float)up * (float)up * p.gain);

          if (signWrite) {
            if (!enableWriteSkip) {
              // Determine and write signs.
              int sx = __float_as_uint(v.x) >> 31 << 0;
              int sy = __float_as_uint(v.y) >> 31 << 8;
              if (sx) v.x *= p.slope;
              if (sy) v.y *= p.slope;
              if (fabsf(v.x) > p.clamp) {
                sx = 2 << 0;
                v.x = InternalType<T>::clamp(v.x, p.clamp);
              }
              if (fabsf(v.y) > p.clamp) {
                sy = 2 << 8;
                v.y = InternalType<T>::clamp(v.y, p.clamp);
              }

              if ((uint32_t)signXb < p.swLimit && signY >= minY) {
                // Combine signs.
                int s = sx + sy;
                s <<= signXo;
#ifdef MMCV_WITH_HIP
                s |= __shfl_xor(s, 1);
                s |= __shfl_xor(s, 2);
#else
                s |= __shfl_xor_sync(groupMask, s, 1);
                s |= __shfl_xor_sync(groupMask, s, 2);
#endif

                // Write signs.
                if ((uint32_t)(signY + 0) < sShapeMaxY) {
                  p.s[si0] = (unsigned char)(s >> 0);
                }
                if ((uint32_t)(signY + 1) < sShapeMaxY) {
                  p.s[si1] = (unsigned char)(s >> 8);
                }
              }
            } else {
              // Determine and write signs.
              if ((uint32_t)signXb < p.swLimit && signY >= minY) {
                int sx = __float_as_uint(v.x) >> 31 << 0;
                int sy = __float_as_uint(v.y) >> 31 << 8;
                if (sx) v.x *= p.slope;
                if (sy) v.y *= p.slope;
                if (fabsf(v.x) > p.clamp) {
                  sx = 2 << 0;
                  v.x = InternalType<T>::clamp(v.x, p.clamp);
                }
                if (fabsf(v.y) > p.clamp) {
                  sy = 2 << 8;
                  v.y = InternalType<T>::clamp(v.y, p.clamp);
                }

                // Combine signs.
                int s = sx + sy;
                s <<= signXo;
#ifdef MMCV_WITH_HIP
                s |= __shfl_xor(s, 1);
                s |= __shfl_xor(s, 2);
#else
                s |= __shfl_xor_sync(groupMask, s, 1);
                s |= __shfl_xor_sync(groupMask, s, 2);
#endif

                // Write signs.
                if ((uint32_t)(signY + 0) < sShapeMaxY) {
                  p.s[si0] = (unsigned char)(s >> 0);
                }
                if ((uint32_t)(signY + 1) < sShapeMaxY) {
                  p.s[si1] = (unsigned char)(s >> 8);
                }
              } else {
                // Just compute the values.
                if (v.x < 0.f) v.x *= p.slope;
                v.x = InternalType<T>::clamp(v.x, p.clamp);
                if (v.y < 0.f) v.y *= p.slope;
                v.y = InternalType<T>::clamp(v.y, p.clamp);
              }
            }
          } else if (signRead)  // Read signs and apply.
          {
            if ((uint32_t)signXb < p.swLimit) {
              if ((uint32_t)(signY + 0) < p.sShape.y) {
                int s = p.s[si0] >> signXo;
                if (s & 1) v.x *= p.slope;
                if (s & 2) v.x = 0.f;
              }
              if ((uint32_t)(signY + 1) < p.sShape.y) {
                int s = p.s[si1] >> signXo;
                if (s & 1) v.y *= p.slope;
                if (s & 2) v.y = 0.f;
              }
            }
          } else  // Forward pass with no sign write.
          {
            if (v.x < 0.f) v.x *= p.slope;
            v.x = InternalType<T>::clamp(v.x, p.clamp);
            if (v.y < 0.f) v.y *= p.slope;
            v.y = InternalType<T>::clamp(v.y, p.clamp);
          }

          if (!downInline) {
            // Write into temporary buffer.
            s_tileUpXY[dst] = v.x;
            if (relUpY0 < tileUpH - 1) s_tileUpXY[dst + tileUpW] = v.y;
          } else {
            // Write directly into output buffer.
            if ((uint32_t)x < p.yShape.x) {
              int ymax = MIN(p.yShape.y, tileUpH + tileOutY * down);
              index_t ofs = x * get_stride<index_t>(p.yStride.x) +
                            y * get_stride<index_t>(p.yStride.y) + mapOfsOut;
              if ((uint32_t)y + 0 < p.yShape.y)
                *((T *)((char *)p.y + ofs)) = (T)(v.x * (scalar_t)c_fd[0]);
              if ((uint32_t)y + 1 < ymax)
                *((T *)((char *)p.y + ofs + get_stride<index_t>(p.yStride.y))) =
                    (T)(v.y * (scalar_t)c_fd[0]);
            }
          }
        }
      }
    } else if (filterMode == MODE_FUSD || filterMode == MODE_FUFD) {
      // Full upsampling filter.

      if (up == 2) {
        // 2 x 2-wide.
        __syncthreads();
        int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH + p.sOfs.y
                            : 0;  // Skip already written signs.
        for (int idx = threadIdx.x * 4; idx < tileUpW * tileUpH;
             idx += blockDim.x * 4) {
          int relUpX0, relUpY0;
          fast_div_mod<tileUpW>(relUpX0, relUpY0, idx);
          int relInX0 = CEIL_DIV(relUpX0 - phaseInX, up);
          int relInY0 = CEIL_DIV(relUpY0 - phaseInY, up);
          int src0 = relInX0 + tileInW * relInY0;
          int tap0y = (relInY0 * up + phaseInY - relUpY0);

#define X_LOOP(TAPY, PX)                                             \
  for (int sx = 0; sx < fuSize / up; sx++) {                         \
    v.x += a * (scalar_t)c_fu[(sx * up + (((PX)-0) & (up - 1))) +    \
                              (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \
    v.z += b * (scalar_t)c_fu[(sx * up + (((PX)-0) & (up - 1))) +    \
                              (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \
    if ((PX) == 0) {                                                 \
      a = b;                                                         \
      b = s_tileIn[src0 + 2 + sx + sy * tileInW];                    \
    }                                                                \
    v.y += a * (scalar_t)c_fu[(sx * up + (((PX)-1) & (up - 1))) +    \
                              (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \
    v.w += b * (scalar_t)c_fu[(sx * up + (((PX)-1) & (up - 1))) +    \
                              (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \
    if ((PX) == 1) {                                                 \
      a = b;                                                         \
      b = s_tileIn[src0 + 2 + sx + sy * tileInW];                    \
    }                                                                \
  }

          vec4_t v = InternalType<T>::zero_vec4();
          if (tap0y == 0 && phaseInX == 0)
#pragma unroll
            for (int sy = 0; sy < fuSize / up; sy++) {
              scalar_t a = s_tileIn[src0 + sy * tileInW];
              scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
#pragma unroll
              X_LOOP(0, 0)
            }
          if (tap0y == 0 && phaseInX == 1)
#pragma unroll
            for (int sy = 0; sy < fuSize / up; sy++) {
              scalar_t a = s_tileIn[src0 + sy * tileInW];
              scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
#pragma unroll
              X_LOOP(0, 1)
            }
          if (tap0y == 1 && phaseInX == 0)
#pragma unroll
            for (int sy = 0; sy < fuSize / up; sy++) {
              scalar_t a = s_tileIn[src0 + sy * tileInW];
              scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
#pragma unroll
              X_LOOP(1, 0)
            }
          if (tap0y == 1 && phaseInX == 1)
#pragma unroll
            for (int sy = 0; sy < fuSize / up; sy++) {
              scalar_t a = s_tileIn[src0 + sy * tileInW];
              scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
#pragma unroll
              X_LOOP(1, 1)
            }

#undef X_LOOP

          int x = tileOutX * down + relUpX0;
          int y = tileOutY * down + relUpY0;
          int signX = x + p.sOfs.x;
          int signY = y + p.sOfs.y;
          int signZ = blockIdx.z + p.blockZofs;
          int signXb = signX >> 2;
          index_t si =
              signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);

          v.x *= (scalar_t)((float)up * (float)up * p.gain);
          v.y *= (scalar_t)((float)up * (float)up * p.gain);
          v.z *= (scalar_t)((float)up * (float)up * p.gain);
          v.w *= (scalar_t)((float)up * (float)up * p.gain);

          if (signWrite) {
            if (!enableWriteSkip) {
              // Determine and write signs.
              int sx = __float_as_uint(v.x) >> 31;
              int sy = __float_as_uint(v.y) >> 31;
              int sz = __float_as_uint(v.z) >> 31;
              int sw = __float_as_uint(v.w) >> 31;
              if (sx) v.x *= p.slope;
              if (fabsf(v.x) > p.clamp) {
                sx = 2;
                v.x = InternalType<T>::clamp(v.x, p.clamp);
              }
              if (sy) v.y *= p.slope;
              if (fabsf(v.y) > p.clamp) {
                sy = 2;
                v.y = InternalType<T>::clamp(v.y, p.clamp);
              }
              if (sz) v.z *= p.slope;
              if (fabsf(v.z) > p.clamp) {
                sz = 2;
                v.z = InternalType<T>::clamp(v.z, p.clamp);
              }
              if (sw) v.w *= p.slope;
              if (fabsf(v.w) > p.clamp) {
                sw = 2;
                v.w = InternalType<T>::clamp(v.w, p.clamp);
              }

              if ((uint32_t)signXb < p.swLimit &&
                  (uint32_t)signY < p.sShape.y && signY >= minY) {
                p.s[si] = sx + (sy << 2) + (sz << 4) + (sw << 6);
              }
            } else {
              // Determine and write signs.
              if ((uint32_t)signXb < p.swLimit &&
                  (uint32_t)signY < p.sShape.y && signY >= minY) {
                int sx = __float_as_uint(v.x) >> 31;
                int sy = __float_as_uint(v.y) >> 31;
                int sz = __float_as_uint(v.z) >> 31;
                int sw = __float_as_uint(v.w) >> 31;
                if (sx) v.x *= p.slope;
                if (fabsf(v.x) > p.clamp) {
                  sx = 2;
                  v.x = InternalType<T>::clamp(v.x, p.clamp);
                }
                if (sy) v.y *= p.slope;
                if (fabsf(v.y) > p.clamp) {
                  sy = 2;
                  v.y = InternalType<T>::clamp(v.y, p.clamp);
                }
                if (sz) v.z *= p.slope;
                if (fabsf(v.z) > p.clamp) {
                  sz = 2;
                  v.z = InternalType<T>::clamp(v.z, p.clamp);
                }
                if (sw) v.w *= p.slope;
                if (fabsf(v.w) > p.clamp) {
                  sw = 2;
                  v.w = InternalType<T>::clamp(v.w, p.clamp);
                }

                p.s[si] = sx + (sy << 2) + (sz << 4) + (sw << 6);
              } else {
                // Just compute the values.
                if (v.x < 0.f) v.x *= p.slope;
                v.x = InternalType<T>::clamp(v.x, p.clamp);
                if (v.y < 0.f) v.y *= p.slope;
                v.y = InternalType<T>::clamp(v.y, p.clamp);
                if (v.z < 0.f) v.z *= p.slope;
                v.z = InternalType<T>::clamp(v.z, p.clamp);
                if (v.w < 0.f) v.w *= p.slope;
                v.w = InternalType<T>::clamp(v.w, p.clamp);
              }
            }
          } else if (signRead)  // Read sign and apply.
          {
            if ((uint32_t)signY < p.sShape.y) {
              int s = 0;
              if ((uint32_t)signXb < p.swLimit) s = p.s[si];
              if ((uint32_t)signXb + 1 < p.swLimit) s |= p.s[si + 1] << 8;
              s >>= (signX & 3) << 1;
              if (s & 0x01) v.x *= p.slope;
              if (s & 0x02) v.x = 0.f;
              if (s & 0x04) v.y *= p.slope;
              if (s & 0x08) v.y = 0.f;
              if (s & 0x10) v.z *= p.slope;
              if (s & 0x20) v.z = 0.f;
              if (s & 0x40) v.w *= p.slope;
              if (s & 0x80) v.w = 0.f;
            }
          } else  // Forward pass with no sign write.
          {
            if (v.x < 0.f) v.x *= p.slope;
            v.x = InternalType<T>::clamp(v.x, p.clamp);
            if (v.y < 0.f) v.y *= p.slope;
            v.y = InternalType<T>::clamp(v.y, p.clamp);
            if (v.z < 0.f) v.z *= p.slope;
            v.z = InternalType<T>::clamp(v.z, p.clamp);
            if (v.w < 0.f) v.w *= p.slope;
            v.w = InternalType<T>::clamp(v.w, p.clamp);
          }

          s_tileUpXY[idx + 0] = v.x;
          s_tileUpXY[idx + 1] = v.y;
          s_tileUpXY[idx + 2] = v.z;
          s_tileUpXY[idx + 3] = v.w;
        }
      } else if (up == 1) {
        __syncthreads();
        uint32_t groupMask = 15 << ((threadIdx.x & 31) & ~3);
        int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH
                            : 0;  // Skip already written signs.
        for (int idx = threadIdx.x; idx < tileUpW * tileUpH;
             idx += blockDim.x) {
          int relUpX0, relUpY0;
          fast_div_mod<tileUpW>(relUpX0, relUpY0, idx);
          scalar_t v = s_tileIn[idx] * (scalar_t)c_fu[0];  // 1x1 filter.

          int x = tileOutX * down + relUpX0;
          int y = tileOutY * down + relUpY0;
          int signX = x + p.sOfs.x;
          int signY = y + p.sOfs.y;
          int signZ = blockIdx.z + p.blockZofs;
          int signXb = signX >> 2;
          index_t si =
              signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
          v *= (scalar_t)((float)up * (float)up * p.gain);

          if (signWrite) {
            if (!enableWriteSkip) {
              // Determine and write sign.
              uint32_t s = 0;
              uint32_t signXbit = (1u << signXo);
              if (v < 0.f) {
                s = signXbit;
                v *= p.slope;
              }
              if (fabsf(v) > p.clamp) {
                s = signXbit * 2;
                v = InternalType<T>::clamp(v, p.clamp);
              }
              if ((uint32_t)signXb < p.swLimit &&
                  (uint32_t)signY < p.sShape.y && signY >= minY) {
#ifdef MMCV_WITH_HIP
                s += __shfl_xor(s, 1);  // Coalesce.
                s += __shfl_xor(s, 2);  // Coalesce.
#else
                s += __shfl_xor_sync(groupMask, s, 1);  // Coalesce.
                s += __shfl_xor_sync(groupMask, s, 2);  // Coalesce.
#endif
                p.s[si] = s;  // Write.
              }
            } else {
              // Determine and write sign.
              if ((uint32_t)signXb < p.swLimit &&
                  (uint32_t)signY < p.sShape.y && signY >= minY) {
                uint32_t s = 0;
                uint32_t signXbit = (1u << signXo);
                if (v < 0.f) {
                  s = signXbit;
                  v *= p.slope;
                }
                if (fabsf(v) > p.clamp) {
                  s = signXbit * 2;
                  v = InternalType<T>::clamp(v, p.clamp);
                }
#ifdef MMCV_WITH_HIP
                s += __shfl_xor(s, 1);  // Coalesce.
                s += __shfl_xor(s, 2);  // Coalesce.
#else
                s += __shfl_xor_sync(groupMask, s, 1);  // Coalesce.
                s += __shfl_xor_sync(groupMask, s, 2);  // Coalesce.
#endif
                p.s[si] = s;  // Write.
              } else {
                // Just compute the value.
                if (v < 0.f) v *= p.slope;
                v = InternalType<T>::clamp(v, p.clamp);
              }
            }
          } else if (signRead) {
            // Read sign and apply if within sign tensor bounds.
            if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y) {
              int s = p.s[si];
              s >>= signXo;
              if (s & 1) v *= p.slope;
              if (s & 2) v = 0.f;
            }
          } else  // Forward pass with no sign write.
          {
            if (v < 0.f) v *= p.slope;
            v = InternalType<T>::clamp(v, p.clamp);
          }

          if (!downInline)  // Write into temporary buffer.
            s_tileUpXY[idx] = v;
          else if ((uint32_t)x < p.yShape.x &&
                   (uint32_t)y <
                       p.yShape.y)  // Write directly into output buffer
            *((T *)((char *)p.y + (x * get_stride<index_t>(p.yStride.x) +
                                   y * get_stride<index_t>(p.yStride.y) +
                                   mapOfsOut))) = (T)(v * (scalar_t)c_fd[0]);
        }
      }
    }

    // Downsampling.
    if (filterMode == MODE_SUSD || filterMode == MODE_FUSD) {
      // Horizontal downsampling.
      __syncthreads();
      if (down == 4 && tileOutW % 4 == 0) {
        // Calculate 4 pixels at a time.
        for (int idx = threadIdx.x * 4; idx < tileOutW * tileUpH;
             idx += blockDim.x * 4) {
          int relOutX0, relUpY;
          fast_div_mod<tileOutW>(relOutX0, relUpY, idx);
          int relUpX0 = relOutX0 * down;
          int src0 = relUpY * tileUpW + relUpX0;
          vec4_t v = InternalType<T>::zero_vec4();
#pragma unroll
          for (int step = 0; step < fdSize; step++) {
            v.x += s_tileUpXY[src0 + 0 + step] * (scalar_t)c_fd[step];
            v.y += s_tileUpXY[src0 + 4 + step] * (scalar_t)c_fd[step];
            v.z += s_tileUpXY[src0 + 8 + step] * (scalar_t)c_fd[step];
            v.w += s_tileUpXY[src0 + 12 + step] * (scalar_t)c_fd[step];
          }
          s_tileDownX[idx + 0] = v.x;
          s_tileDownX[idx + 1] = v.y;
          s_tileDownX[idx + 2] = v.z;
          s_tileDownX[idx + 3] = v.w;
        }
      } else if ((down == 2 || down == 4) && (tileOutW % 2 == 0)) {
        // Calculate 2 pixels at a time.
        for (int idx = threadIdx.x * 2; idx < tileOutW * tileUpH;
             idx += blockDim.x * 2) {
          int relOutX0, relUpY;
          fast_div_mod<tileOutW>(relOutX0, relUpY, idx);
          int relUpX0 = relOutX0 * down;
          int src0 = relUpY * tileUpW + relUpX0;
          vec2_t v = InternalType<T>::zero_vec2();
#pragma unroll
          for (int step = 0; step < fdSize; step++) {
            v.x += s_tileUpXY[src0 + 0 + step] * (scalar_t)c_fd[step];
            v.y += s_tileUpXY[src0 + down + step] * (scalar_t)c_fd[step];
          }
          s_tileDownX[idx + 0] = v.x;
          s_tileDownX[idx + 1] = v.y;
        }
      } else {
        // Calculate 1 pixel at a time.
        for (int idx = threadIdx.x; idx < tileOutW * tileUpH;
             idx += blockDim.x) {
          int relOutX0, relUpY;
          fast_div_mod<tileOutW>(relOutX0, relUpY, idx);
          int relUpX0 = relOutX0 * down;
          int src = relUpY * tileUpW + relUpX0;
          scalar_t v = 0.f;
#pragma unroll
          for (int step = 0; step < fdSize; step++)
            v += s_tileUpXY[src + step] * (scalar_t)c_fd[step];
          s_tileDownX[idx] = v;
        }
      }

      // Vertical downsampling & store output tile.
      __syncthreads();
      for (int idx = threadIdx.x; idx < tileOutW * tileOutH;
           idx += blockDim.x) {
        int relOutX, relOutY0;
        fast_div_mod<tileOutW>(relOutX, relOutY0, idx);
        int relUpY0 = relOutY0 * down;
        int src0 = relUpY0 * tileOutW + relOutX;
        scalar_t v = 0;
#pragma unroll
        for (int step = 0; step < fdSize; step++)
          v += s_tileDownX[src0 + step * tileOutW] * (scalar_t)c_fd[step];

        int outX = tileOutX + relOutX;
        int outY = tileOutY + relOutY0;

        if (outX < p.yShape.x & outY < p.yShape.y)
          *((T *)((char *)p.y + (outX * get_stride<index_t>(p.yStride.x) +
                                 outY * get_stride<index_t>(p.yStride.y) +
                                 mapOfsOut))) = (T)v;
      }
    } else if (filterMode == MODE_SUFD || filterMode == MODE_FUFD) {
      // Full downsampling filter.
      if (down == 2) {
        // 2-wide.
        __syncthreads();
        for (int idx = threadIdx.x * 2; idx < tileOutW * tileOutH;
             idx += blockDim.x * 2) {
          int relOutX0, relOutY0;
          fast_div_mod<tileOutW>(relOutX0, relOutY0, idx);
          int relUpX0 = relOutX0 * down;
          int relUpY0 = relOutY0 * down;
          int src0 = relUpY0 * tileUpW + relUpX0;
          vec2_t v = InternalType<T>::zero_vec2();
#pragma unroll
          for (int sy = 0; sy < fdSize; sy++)
#pragma unroll
            for (int sx = 0; sx < fdSize; sx++) {
              v.x += s_tileUpXY[src0 + 0 + sx + sy * tileUpW] *
                     (scalar_t)c_fd[sx + sy * MAX_FILTER_SIZE];
              v.y += s_tileUpXY[src0 + 2 + sx + sy * tileUpW] *
                     (scalar_t)c_fd[sx + sy * MAX_FILTER_SIZE];
            }

          int outX = tileOutX + relOutX0;
          int outY = tileOutY + relOutY0;
          if ((uint32_t)outY < p.yShape.y) {
            index_t ofs = outX * get_stride<index_t>(p.yStride.x) +
                          outY * get_stride<index_t>(p.yStride.y) + mapOfsOut;
            if (outX + 0 < p.yShape.x) *((T *)((char *)p.y + ofs)) = (T)v.x;
            if (outX + 1 < p.yShape.x)
              *((T *)((char *)p.y + ofs + get_stride<index_t>(p.yStride.x))) =
                  (T)v.y;
          }
        }
      } else if (down == 1 && !downInline) {
        // Thread per pixel.
        __syncthreads();
        for (int idx = threadIdx.x; idx < tileOutW * tileOutH;
             idx += blockDim.x) {
          int relOutX0, relOutY0;
          fast_div_mod<tileOutW>(relOutX0, relOutY0, idx);
          scalar_t v = s_tileUpXY[idx] * (scalar_t)c_fd[0];  // 1x1 filter.

          int outX = tileOutX + relOutX0;
          int outY = tileOutY + relOutY0;
          if ((uint32_t)outX < p.yShape.x && (uint32_t)outY < p.yShape.y)
            *((T *)((char *)p.y + (outX * get_stride<index_t>(p.yStride.x) +
                                   outY * get_stride<index_t>(p.yStride.y) +
                                   mapOfsOut))) = (T)v;
        }
      }
    }

    if (!enableXrep) break;
  }
}

//------------------------------------------------------------------------
// Compute activation function and signs for upsampled data tensor, modifying
// data tensor in-place. Used for accelerating the generic variant. Sign tensor
// is known to be contiguous, and p.x and p.s have the same z, w dimensions.
// 64-bit indexing is always used.

// Template parameters:
//   T         - element type stored in p.x; arithmetic uses
//               InternalType<T>::scalar_t.
//   signWrite - apply gain, leaky ReLU and clamp to p.x and record a 2-bit
//               code per element in p.s (1 = value was negative, 2 = value
//               was clamped; the clamp code replaces the sign code).
//   signRead  - apply gain to p.x, then replay the codes found in p.s: bit 0
//               multiplies by p.slope, bit 1 zeroes the value.
// With both flags false the kernel applies gain, leaky ReLU and clamp without
// touching p.s.
template <class T, bool signWrite, bool signRead>
static __global__ void filtered_lrelu_act_kernel(
    filtered_lrelu_act_kernel_params p) {
  typedef typename InternalType<T>::scalar_t scalar_t;

  // Each thread owns one column; rows and planes are covered by the loops.
  const int32_t col = threadIdx.x + blockIdx.x * blockDim.x;

  // Sign-write mode walks the rows of the sign tensor, the other modes walk
  // the rows of the data tensor.
  int32_t rowCount;
  if (signWrite)
    rowCount = p.sShape.y;
  else
    rowCount = p.xShape.y;

  // Number of combined (channel, minibatch) planes.
  const int32_t planeCount = p.xShape.z * p.xShape.w;

  // Step by the grid size so tensors larger than the grid are fully covered.
  for (int32_t plane = blockIdx.z; plane < planeCount; plane += gridDim.z) {
    // Split the combined plane index into minibatch (w) and channel (z).
    const int32_t batchIdx = plane / p.xShape.z;
    const int32_t chanIdx = plane - batchIdx * p.xShape.z;

    for (int32_t row = blockIdx.y; row < rowCount; row += gridDim.y) {
      if (signWrite) {
        // Threads outside p.x still take part in the shuffles below and
        // contribute a zero code.
        uint32_t code = 0;
        if (col < p.xShape.x && row < p.xShape.y) {
          const int64_t elemOfs = col * p.xStride.x + row * p.xStride.y +
                                  chanIdx * p.xStride.z +
                                  batchIdx * p.xStride.w;
          T *elem = ((T *)p.x) + elemOfs;
          scalar_t val = (scalar_t)(*elem);

          // Gain first, then leaky ReLU, then clamp.
          val *= p.gain;
          if (val < 0.f) {
            val *= p.slope;
            code = 1;  // Negative input.
          }
          if (fabsf(val) > p.clamp) {
            val = InternalType<T>::clamp(val, p.clamp);
            code = 2;  // Clamped; overrides the sign code.
          }

          *elem = (T)val;  // Store the activated value in place.
        }

        // Pack the codes of 16 neighbouring threads into one 32-bit word:
        // move this thread's 2 bits into its slot, then OR-combine across the
        // group. The mask selects the lower or upper 16 lanes (assumes
        // threadIdx.x & 31 is the lane index -- TODO confirm launch config).
        uint32_t halfMask = 0x0000ffffu;
        if (threadIdx.x & 16) halfMask = 0xffff0000u;
        code <<= ((threadIdx.x & 15) << 1);
#pragma unroll
        for (int delta = 1; delta <= 8; delta <<= 1) {
#ifdef MMCV_WITH_HIP
          code |= __shfl_xor(code, delta);
#else
          code |= __shfl_xor_sync(halfMask, code, delta);
#endif
        }

        // Only the first thread of each group of 16 stores the packed word,
        // and only if its column lies inside p.s. The row is always inside
        // because the row loop is bounded by p.sShape.y in this mode.
        const bool isLeader = (threadIdx.x & 15) == 0;
        if (isLeader && col < p.sShape.x) {
          uint64_t signOfs =
              col +
              p.sShape.x * (row + (int64_t)p.sShape.y * plane);  // Contiguous.
          ((uint32_t *)p.s)[signOfs >> 4] = code;
        }
      } else if (signRead) {
        // Nothing to do for columns outside p.x; the row is always inside
        // because the row loop is bounded by p.xShape.y in this mode.
        if (col >= p.xShape.x) continue;

        const int64_t elemOfs = col * p.xStride.x + row * p.xStride.y +
                                chanIdx * p.xStride.z + batchIdx * p.xStride.w;
        T *elem = ((T *)p.x) + elemOfs;
        scalar_t val = (scalar_t)(*elem);
        val *= p.gain;

        // Position in the sign tensor. Unsigned arithmetic makes a negative
        // result wrap to a large value, which then fails the bounds test.
        const uint32_t signCol = col + p.sOfs.x;
        const uint32_t signRow = row + p.sOfs.y;

        // Values that fall outside the sign tensor only receive the gain.
        if (signCol < p.sShape.x && signRow < p.sShape.y) {
          // Four 2-bit codes per byte, rows stored contiguously.
          uint64_t signOfs =
              (signCol >> 2) +
              (p.sShape.x >> 2) * (signRow + (uint64_t)p.sShape.y * plane);
          const uint32_t code = p.s[signOfs] >> ((signCol & 3) << 1);
          if (code & 1) val *= p.slope;  // Input was negative.
          if (code & 2) val = 0.f;       // Input was clamped.
        }

        *elem = (T)val;  // Store the result in place.
      } else {
        // Plain forward pass: no sign tensor involved. The row is always
        // inside p.x because the row loop is bounded by p.xShape.y.
        if (col >= p.xShape.x) continue;

        const int64_t elemOfs = col * p.xStride.x + row * p.xStride.y +
                                chanIdx * p.xStride.z + batchIdx * p.xStride.w;
        T *elem = ((T *)p.x) + elemOfs;
        scalar_t val = (scalar_t)(*elem);
        val *= p.gain;
        if (val < 0.f) val *= p.slope;
        if (fabsf(val) > p.clamp) val = InternalType<T>::clamp(val, p.clamp);
        *elem = (T)val;  // Store the result in place.
      }
    }
  }
}

template <class T, bool signWrite, bool signRead>
void *choose_filtered_lrelu_act_kernel(void) {
  return (void *)filtered_lrelu_act_kernel<T, signWrite, signRead>;
}

//------------------------------------------------------------------------
// MUSA kernel selection.

// Picks the filtered_lrelu kernel specialization for the launch described by
// `p`, given `sharedKB` kilobytes of shared memory per block.
//
// The CASE table below is scanned top to bottom and the first entry whose
// conditions all hold is returned:
//   - sharedKB is at least the entry's SH;
//   - the entry's MODE agrees with the filter layouts: fuShape.y == 0 selects
//     the MODE_SU* entries and fuShape.y > 0 the MODE_FU* ones; likewise
//     fdShape.y == 0 selects MODE_*SD and fdShape.y > 0 selects MODE_*FD
//     (the caller stores y == 0 for rank-1, i.e. separable, filters);
//   - p.up == U and p.down == D, and both filter extents fit within FU / FD.
// On a match the spec receives the setup and exec kernel pointers, the output
// tile size, warp count, xrep and dynamicSharedKB (0 for the 48 KB entries,
// SH otherwise). If nothing matches, the zero-initialized spec is returned,
// so callers detect "no kernel" by testing `exec` for null.
template <class T, class index_t, bool signWrite, bool signRead>
filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(
    const filtered_lrelu_kernel_params &p, int sharedKB) {
  filtered_lrelu_kernel_spec s = {0};

  // Return the first matching kernel.
  // The static_asserts inside CASE validate each table row at compile time;
  // the runtime `if` chain decides whether the row applies to `p`.
#define CASE(SH, U, FU, D, FD, MODE, TW, TH, W, XR, WS)                        \
  if (sharedKB >= SH)                                                          \
    if ((p.fuShape.y == 0 && (MODE == MODE_SUSD || MODE == MODE_SUFD)) ||      \
        (p.fuShape.y > 0 && (MODE == MODE_FUSD || MODE == MODE_FUFD)))         \
      if ((p.fdShape.y == 0 && (MODE == MODE_SUSD || MODE == MODE_FUSD)) ||    \
          (p.fdShape.y > 0 && (MODE == MODE_SUFD || MODE == MODE_FUFD)))       \
        if (p.up == U && p.fuShape.x <= FU && p.fuShape.y <= FU &&             \
            p.down == D && p.fdShape.x <= FD && p.fdShape.y <= FD) {           \
          static_assert((D * TW % 4) == 0,                                     \
                        "down * tileWidth must be divisible by 4");            \
          static_assert(                                                       \
              FU % U == 0,                                                     \
              "upscaling filter size must be multiple of upscaling factor");   \
          static_assert(FD % D == 0,                                           \
                        "downscaling filter size must be multiple of "         \
                        "downscaling factor");                                 \
          s.setup = (void *)setup_filters_kernel;                              \
          s.exec = (void *)                                                    \
              filtered_lrelu_kernel<T, index_t, SH, signWrite, signRead, MODE, \
                                    U, FU, D, FD, TW, TH, W * 32, !!XR, !!WS>; \
          s.tileOut = make_int2(TW, TH);                                       \
          s.numWarps = W;                                                      \
          s.xrep = XR;                                                         \
          s.dynamicSharedKB = (SH == 48) ? 0 : SH;                             \
          return s;                                                            \
        }

  // Launch parameters for various kernel specializations.
  // Small filters must be listed before large filters, otherwise the kernel for
  // larger filter will always match first. Kernels that use more shared memory
  // must be listed before those that use less, for the same reason.

  CASE(/*sharedKB*/ 48, /*up,fu*/ 1, 1, /*down,fd*/ 1, 1, /*mode*/ MODE_FUFD,
       /*tw,th,warps,xrep,wskip*/ 64, 178, 32, 0, 0)  // 1t-upf1-downf1
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 1, 1, /*mode*/ MODE_SUFD,
       /*tw,th,warps,xrep,wskip*/ 152, 95, 16, 0, 0)  // 4t-ups2-downf1
  CASE(/*sharedKB*/ 48, /*up,fu*/ 1, 1, /*down,fd*/ 2, 8, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 56, 22, 16, 0, 0)  // 4t-upf1-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 2, 8, /*mode*/ MODE_SUSD,
       /*tw,th,warps,xrep,wskip*/ 56, 29, 16, 11, 0)  // 4t-ups2-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 2, 8, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 60, 28, 16, 0, 0)  // 4t-upf2-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 2, 8, /*mode*/ MODE_SUFD,
       /*tw,th,warps,xrep,wskip*/ 56, 28, 16, 0, 0)  // 4t-ups2-downf2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 16, /*down,fd*/ 2, 8, /*mode*/ MODE_SUSD,
       /*tw,th,warps,xrep,wskip*/ 56, 31, 16, 11, 0)  // 4t-ups4-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 16, /*down,fd*/ 2, 8, /*mode*/ MODE_SUFD,
       /*tw,th,warps,xrep,wskip*/ 56, 36, 16, 0, 0)  // 4t-ups4-downf2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 4, 16, /*mode*/ MODE_SUSD,
       /*tw,th,warps,xrep,wskip*/ 16, 22, 16, 12, 0)  // 4t-ups2-downs4
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 4, 16, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 29, 15, 16, 0, 0)  // 4t-upf2-downs4
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 1, 1, /*mode*/ MODE_SUFD,
       /*tw,th,warps,xrep,wskip*/ 96, 150, 28, 0, 0)  // 6t-ups2-downf1
  CASE(/*sharedKB*/ 48, /*up,fu*/ 1, 1, /*down,fd*/ 2, 12, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 32, 35, 24, 0, 0)  // 6t-upf1-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 2, 12, /*mode*/ MODE_SUSD,
       /*tw,th,warps,xrep,wskip*/ 32, 46, 16, 10, 0)  // 6t-ups2-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 2, 12, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 58, 28, 24, 8, 0)  // 6t-upf2-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 2, 12, /*mode*/ MODE_SUFD,
       /*tw,th,warps,xrep,wskip*/ 52, 28, 16, 0, 0)  // 6t-ups2-downf2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 24, /*down,fd*/ 2, 12, /*mode*/ MODE_SUSD,
       /*tw,th,warps,xrep,wskip*/ 32, 51, 16, 5, 0)  // 6t-ups4-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 24, /*down,fd*/ 2, 12, /*mode*/ MODE_SUFD,
       /*tw,th,warps,xrep,wskip*/ 32, 56, 16, 6, 0)  // 6t-ups4-downf2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 4, 24, /*mode*/ MODE_SUSD,
       /*tw,th,warps,xrep,wskip*/ 16, 18, 16, 12, 0)  // 6t-ups2-downs4
  CASE(/*sharedKB*/ 96, /*up,fu*/ 2, 12, /*down,fd*/ 4, 24, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 27, 31, 32, 6, 0)  // 6t-upf2-downs4 96kB
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 4, 24, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 27, 13, 24, 0, 0)  // 6t-upf2-downs4
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 1, 1, /*mode*/ MODE_SUFD,
       /*tw,th,warps,xrep,wskip*/ 148, 89, 24, 0, 0)  // 8t-ups2-downf1
  CASE(/*sharedKB*/ 48, /*up,fu*/ 1, 1, /*down,fd*/ 2, 16, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 32, 31, 16, 5, 0)  // 8t-upf1-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 2, 16, /*mode*/ MODE_SUSD,
       /*tw,th,warps,xrep,wskip*/ 32, 41, 16, 9, 0)  // 8t-ups2-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 2, 16, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 56, 26, 24, 0, 0)  // 8t-upf2-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 2, 16, /*mode*/ MODE_SUFD,
       /*tw,th,warps,xrep,wskip*/ 32, 40, 16, 0, 0)  // 8t-ups2-downf2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 32, /*down,fd*/ 2, 16, /*mode*/ MODE_SUSD,
       /*tw,th,warps,xrep,wskip*/ 32, 46, 24, 5, 0)  // 8t-ups4-downs2
  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 32, /*down,fd*/ 2, 16, /*mode*/ MODE_SUFD,
       /*tw,th,warps,xrep,wskip*/ 32, 50, 16, 0, 0)  // 8t-ups4-downf2
  CASE(/*sharedKB*/ 96, /*up,fu*/ 2, 16, /*down,fd*/ 4, 32, /*mode*/ MODE_SUSD,
       /*tw,th,warps,xrep,wskip*/ 24, 24, 32, 12, 1)  // 8t-ups2-downs4 96kB
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 4, 32, /*mode*/ MODE_SUSD,
       /*tw,th,warps,xrep,wskip*/ 16, 13, 16, 10, 1)  // 8t-ups2-downs4
  CASE(/*sharedKB*/ 96, /*up,fu*/ 2, 16, /*down,fd*/ 4, 32, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 25, 28, 28, 4, 0)  // 8t-upf2-downs4 96kB
  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 4, 32, /*mode*/ MODE_FUSD,
       /*tw,th,warps,xrep,wskip*/ 25, 10, 24, 0, 0)  // 8t-upf2-downs4

#undef CASE
  return s;  // No kernel found.
}

//------------------------------------------------------------------------

// Build switch for the fused filtered_lrelu op: defaults to 1 and is reset to
// 0 for non-HIP builds compiled with GCC older than 6.
// NOTE(review): nothing visible after this point tests
// BUILD_FILTERED_LRELU_OP before it is #undef'd. The `#ifndef MMCV_WITH_HIP`
// opened below is closed only by the `#else` / `#endif` pair that follows
// REGISTER_DEVICE_IMPL, so filtered_lrelu_op appears to be gated on
// MMCV_WITH_HIP alone. Confirm whether an `#if BUILD_FILTERED_LRELU_OP == 1`
// guard was dropped.
#define BUILD_FILTERED_LRELU_OP 1

#ifndef MMCV_WITH_HIP
#ifdef __GNUC__
#if __GNUC__ < 6
#undef BUILD_FILTERED_LRELU_OP
#define BUILD_FILTERED_LRELU_OP 0
#endif
#endif

// Fused upsample -> bias -> leaky ReLU (gain / slope / clamp) -> downsample on
// a rank-4 MUSA tensor.
//
// Parameters:
//   x            rank-4 input on a MUSA device (float16 or float32 per the
//                checks below).
//   fu, fd       float32 up/down-sampling filters, rank 1 (separable) or 2.
//   b            rank-1 bias with x.size(1) entries, same dtype as x.
//   si           sign tensor to read; an empty tensor disables sign reading.
//   up, down     integer resampling factors, both >= 1.
//   px0..py1     padding terms that enter the upsampled-buffer size.
//   sx, sy       offsets into the sign tensor.
//   gain, slope, clamp  activation parameters forwarded to the kernel.
//   flip_filters forwarded to the kernel as p.flip.
//   writeSigns   allocate and fill a sign tensor that is returned to caller.
//
// Returns (y, so, 0) on success, where `so` is the freshly written sign tensor
// (left undefined when writeSigns is false). Returns two undefined tensors and
// -1 when no kernel specialization matches the filter/shared-memory setup.
// All other problems are reported through TORCH_CHECK.
std::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu_op(
    torch::Tensor x, torch::Tensor fu, torch::Tensor fd, torch::Tensor b,
    torch::Tensor si, int up, int down, int px0, int px1, int py0, int py1,
    int sx, int sy, float gain, float slope, float clamp, bool flip_filters,
    bool writeSigns) {
  // Set MUSA device.
  TORCH_CHECK(x.is_privateuseone(), "x must reside on MUSA device");
  const at::musa::OptionalMUSAGuard device_guard(device_of(x));

  // Validate arguments.
  TORCH_CHECK(fu.device() == x.device() && fd.device() == x.device() &&
                  b.device() == x.device(),
              "all input tensors must reside on the same device");
  TORCH_CHECK(fu.dtype() == torch::kFloat && fd.dtype() == torch::kFloat,
              "fu and fd must be float32");
  TORCH_CHECK(b.dtype() == x.dtype(), "x and b must have the same dtype");
  TORCH_CHECK(x.dtype() == torch::kHalf || x.dtype() == torch::kFloat,
              "x and b must be float16 or float32");
  TORCH_CHECK(x.dim() == 4, "x must be rank 4");
  TORCH_CHECK(x.size(0) * x.size(1) <= INT_MAX && x.size(2) <= INT_MAX &&
                  x.size(3) <= INT_MAX,
              "x is too large");
  TORCH_CHECK(x.numel() > 0, "x is empty");
  TORCH_CHECK(
      (fu.dim() == 1 || fu.dim() == 2) && (fd.dim() == 1 || fd.dim() == 2),
      "fu and fd must be rank 1 or 2");
  TORCH_CHECK(fu.size(0) <= INT_MAX && fu.size(-1) <= INT_MAX,
              "fu is too large");
  TORCH_CHECK(fd.size(0) <= INT_MAX && fd.size(-1) <= INT_MAX,
              "fd is too large");
  TORCH_CHECK(fu.numel() > 0, "fu is empty");
  TORCH_CHECK(fd.numel() > 0, "fd is empty");
  TORCH_CHECK(b.dim() == 1 && b.size(0) == x.size(1),
              "b must be a vector with the same number of channels as x");
  TORCH_CHECK(up >= 1 && down >= 1, "up and down must be at least 1");

  // Figure out how much shared memory is available on the device.
  // NOTE(review): this function sits inside `#ifndef MMCV_WITH_HIP`, so the
  // MMCV_WITH_HIP branches in this body look unreachable -- confirm.
  int maxSharedBytes = 0;
#ifdef MMCV_WITH_HIP
  musaDeviceGetAttribute(&maxSharedBytes,
                         hipDeviceAttributeSharedMemPerBlockOptin,
                         x.device().index());
#else
  AT_MUSA_CHECK(musaDeviceGetAttribute(&maxSharedBytes,
                                       musaDevAttrMaxSharedMemoryPerBlockOptin,
                                       x.device().index()));
#endif
  int sharedKB = maxSharedBytes >> 10;

  // Populate enough launch parameters to check if a MUSA kernel exists.
  filtered_lrelu_kernel_params p;
  p.up = up;
  p.down = down;
  p.fuShape =
      make_int2((int)fu.size(-1),
                fu.dim() == 2 ? (int)fu.size(0)
                              : 0);  // shape [n, 0] indicates separable filter.
  p.fdShape = make_int2((int)fd.size(-1), fd.dim() == 2 ? (int)fd.size(0) : 0);
  filtered_lrelu_kernel_spec test_spec =
      choose_filtered_lrelu_kernel<float, int32_t, false, false>(p, sharedKB);
  if (!test_spec.exec) {
    // No kernel found - return empty tensors and indicate missing kernel with
    // return code of -1.
    return std::make_tuple(torch::Tensor(), torch::Tensor(), -1);
  }

  // Input/output element size.
  int64_t sz = (x.dtype() == torch::kHalf) ? 2 : 4;

  // Input sizes. The filter terms are "taps minus one"; for a rank-1 filter
  // size(0) and size(-1) are the same dimension.
  int64_t xw = (int)x.size(3);
  int64_t xh = (int)x.size(2);
  int64_t fut_w = (int)fu.size(-1) - 1;
  int64_t fut_h = (int)fu.size(0) - 1;
  int64_t fdt_w = (int)fd.size(-1) - 1;
  int64_t fdt_h = (int)fd.size(0) - 1;

  // Logical size of upsampled buffer.
  int64_t cw = xw * up + (px0 + px1) - fut_w;
  int64_t ch = xh * up + (py0 + py1) - fut_h;
  TORCH_CHECK(
      cw > fdt_w && ch > fdt_h,
      "upsampled buffer must be at least the size of downsampling filter");
  TORCH_CHECK(cw <= INT_MAX && ch <= INT_MAX, "upsampled buffer is too large");

  // Compute output size and allocate.
  int64_t yw = (cw - fdt_w + (down - 1)) / down;
  int64_t yh = (ch - fdt_h + (down - 1)) / down;
  TORCH_CHECK(yw > 0 && yh > 0, "output must be at least 1x1");
  TORCH_CHECK(yw <= INT_MAX && yh <= INT_MAX, "output is too large");
  torch::Tensor y = torch::empty({x.size(0), x.size(1), yh, yw}, x.options(),
                                 x.suggest_memory_format());

  // Allocate sign tensor. Each byte of the sign tensor packs four elements
  // (hence the >> 2 / << 2 conversions between bytes and elements).
  torch::Tensor so;
  torch::Tensor s = si;
  bool readSigns = !!s.numel();
  int64_t sw_active = 0;  // Active width of sign tensor.
  if (writeSigns) {
    sw_active = yw * down - (down - 1) + fdt_w;   // Active width in elements.
    int64_t sh = yh * down - (down - 1) + fdt_h;  // Height = active height.
    int64_t sw = (sw_active + 15) & ~15;  // Width  = active width in elements,
                                          // rounded up to multiple of 16.
    TORCH_CHECK(sh <= INT_MAX && (sw >> 2) <= INT_MAX, "signs is too large");
    s = so = torch::empty({x.size(0), x.size(1), sh, sw >> 2},
                          x.options().dtype(torch::kUInt8),
                          at::MemoryFormat::Contiguous);
  } else if (readSigns)
    sw_active = s.size(3) << 2;

  // Validate sign tensor if in use.
  if (readSigns || writeSigns) {
    TORCH_CHECK(s.is_contiguous(), "signs must be contiguous");
    TORCH_CHECK(s.dtype() == torch::kUInt8, "signs must be uint8");
    TORCH_CHECK(s.device() == x.device(),
                "signs must reside on the same device as x");
    TORCH_CHECK(s.dim() == 4, "signs must be rank 4");
    TORCH_CHECK(s.size(0) == x.size(0) && s.size(1) == x.size(1),
                "signs must have same batch & channels as x");
    TORCH_CHECK(s.size(2) <= INT_MAX && s.size(3) <= INT_MAX,
                "signs is too large");
  }

  // Populate rest of MUSA kernel parameters.
  p.x = x.data_ptr();
  p.y = y.data_ptr();
  p.b = b.data_ptr();
  p.s = (readSigns || writeSigns) ? s.data_ptr<unsigned char>() : 0;
  p.fu = fu.data_ptr<float>();
  p.fd = fd.data_ptr<float>();
  p.pad0 = make_int2(px0, py0);
  p.gain = gain;
  p.slope = slope;
  p.clamp = clamp;
  p.flip = (flip_filters) ? 1 : 0;
  p.xShape =
      make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));
  p.yShape =
      make_int4((int)y.size(3), (int)y.size(2), (int)y.size(1), (int)y.size(0));
  p.sShape = (readSigns || writeSigns)
                 ? make_int2((int)s.size(3), (int)s.size(2))
                 : make_int2(0, 0);  // Width is in bytes. Contiguous.
  p.sOfs = make_int2(sx, sy);
  p.swLimit = (sw_active + 3) >> 2;  // Rounded up to bytes.

  // x, y, b strides are in bytes.
  p.xStride = make_longlong4(sz * x.stride(3), sz * x.stride(2),
                             sz * x.stride(1), sz * x.stride(0));
  p.yStride = make_longlong4(sz * y.stride(3), sz * y.stride(2),
                             sz * y.stride(1), sz * y.stride(0));
  p.bStride = sz * b.stride(0);

  // fu, fd strides are in elements.
  p.fuStride =
      make_longlong3(fu.stride(-1), fu.dim() == 2 ? fu.stride(0) : 0, 0);
  p.fdStride =
      make_longlong3(fd.stride(-1), fd.dim() == 2 ? fd.stride(0) : 0, 0);

  // Determine if indices don't fit in int32. Support negative strides although
  // Torch currently never produces those.
  bool index64b = false;
  if (std::abs(p.bStride * x.size(1)) > INT_MAX) index64b = true;
  if (std::min(x.size(0) * p.xStride.w, 0ll) +
          std::min(x.size(1) * p.xStride.z, 0ll) +
          std::min(x.size(2) * p.xStride.y, 0ll) +
          std::min(x.size(3) * p.xStride.x, 0ll) <
      -INT_MAX)
    index64b = true;
  if (std::max(x.size(0) * p.xStride.w, 0ll) +
          std::max(x.size(1) * p.xStride.z, 0ll) +
          std::max(x.size(2) * p.xStride.y, 0ll) +
          std::max(x.size(3) * p.xStride.x, 0ll) >
      INT_MAX)
    index64b = true;
  if (std::min(y.size(0) * p.yStride.w, 0ll) +
          std::min(y.size(1) * p.yStride.z, 0ll) +
          std::min(y.size(2) * p.yStride.y, 0ll) +
          std::min(y.size(3) * p.yStride.x, 0ll) <
      -INT_MAX)
    index64b = true;
  if (std::max(y.size(0) * p.yStride.w, 0ll) +
          std::max(y.size(1) * p.yStride.z, 0ll) +
          std::max(y.size(2) * p.yStride.y, 0ll) +
          std::max(y.size(3) * p.yStride.x, 0ll) >
      INT_MAX)
    index64b = true;
  if (s.numel() > INT_MAX) index64b = true;

  // Choose MUSA kernel.
  // NOTE(review): the dtype check above admits float16, while PyTorch
  // documents AT_DISPATCH_FLOATING_TYPES as covering float and double only.
  // Confirm whether half inputs are expected to get past this dispatch.
  // No branch handles writeSigns && readSigns together; that combination
  // leaves spec.exec null and is rejected by the check that follows.
  filtered_lrelu_kernel_spec spec = {0};
  AT_DISPATCH_FLOATING_TYPES(
      x.scalar_type(), "filtered_lrelu_musa", [&] {
        if constexpr (sizeof(scalar_t) <=
                      4)  // Exclude doubles. constexpr
                          // prevents template instantiation.
        {
          // Choose kernel based on index type, datatype and sign read/write
          // modes.
          if (!index64b && writeSigns && !readSigns)
            spec = choose_filtered_lrelu_kernel<scalar_t, int32_t, true, false>(
                p, sharedKB);
          else if (!index64b && !writeSigns && readSigns)
            spec = choose_filtered_lrelu_kernel<scalar_t, int32_t, false, true>(
                p, sharedKB);
          else if (!index64b && !writeSigns && !readSigns)
            spec =
                choose_filtered_lrelu_kernel<scalar_t, int32_t, false, false>(
                    p, sharedKB);
          else if (index64b && writeSigns && !readSigns)
            spec = choose_filtered_lrelu_kernel<scalar_t, int64_t, true, false>(
                p, sharedKB);
          else if (index64b && !writeSigns && readSigns)
            spec = choose_filtered_lrelu_kernel<scalar_t, int64_t, false, true>(
                p, sharedKB);
          else if (index64b && !writeSigns && !readSigns)
            spec =
                choose_filtered_lrelu_kernel<scalar_t, int64_t, false, false>(
                    p, sharedKB);
        }
      });
  // This should not happen because we tested earlier that a kernel exists.
  TORCH_CHECK(spec.exec, "internal error - MUSA kernel not found");

  // Launch MUSA kernel.
  // `args` holds the address of `p`, so later writes to p (tilesXrep,
  // blockZofs, ...) are picked up by each subsequent launch call.
  void *args[] = {&p};
  int bx = spec.numWarps * 32;
  int gx = (p.yShape.x - 1) / spec.tileOut.x + 1;
  int gy = (p.yShape.y - 1) / spec.tileOut.y + 1;
  int gz = p.yShape.z * p.yShape.w;

  // Repeat multiple horizontal tiles in a CTA?
  if (spec.xrep) {
    p.tilesXrep = spec.xrep;
    p.tilesXdim = gx;

    gx = (gx + p.tilesXrep - 1) / p.tilesXrep;
    std::swap(gx, gy);
  } else {
    p.tilesXrep = 0;
    p.tilesXdim = 0;
  }
#ifdef MMCV_WITH_HIP
  AT_MUSA_CHECK(hipLaunchKernel(spec.setup, 1, 1024, args, 0,
                                c10::musa::getCurrentMUSAStream()));
#else
  // Launch filter setup kernel.
  AT_MUSA_CHECK(musaLaunchKernel(spec.setup, 1, 1024, args, 0,
                                 c10::musa::getCurrentMUSAStream()));
#endif

  // Copy kernels to constant memory.
  // copy_filters takes no sign-mode arguments here, and the spec.exec check
  // above guarantees writeSigns and readSigns are not both set, so every
  // path that reaches this point issues this one call.
  AT_MUSA_CHECK((copy_filters(c10::musa::getCurrentMUSAStream())));

  // Set cache and shared memory configurations for main kernel.
  // FIXME:TODO FIX BUG
  AT_MUSA_CHECK(musaFuncSetCacheConfig(spec.exec, musaFuncCachePreferShared));
  if (spec.dynamicSharedKB)  // Need dynamically allocated shared memory?
#ifdef MMCV_WITH_HIP
    AT_MUSA_CHECK(hipFuncSetAttribute(
        spec.exec, hipFuncAttributeMaxDynamicSharedMemorySize,
        spec.dynamicSharedKB << 10));
#else
    AT_MUSA_CHECK(musaFuncSetAttribute(
        spec.exec, musaFuncAttributeMaxDynamicSharedMemorySize,
        spec.dynamicSharedKB << 10));
#endif
  // FIXME:TODO FIX BUG
  AT_MUSA_CHECK(
      musaFuncSetSharedMemConfig(spec.exec, musaSharedMemBankSizeFourByte));

  // Launch main kernel.
  const int maxSubGz = 65535;  // MUSA maximum for block z dimension.
  for (int zofs = 0; zofs < gz;
       zofs += maxSubGz)  // Do multiple launches if gz is too big.
  {
    p.blockZofs = zofs;
    int subGz = std::min(maxSubGz, gz - zofs);
// FIXME:TODO FIX BUG
#ifdef MMCV_WITH_HIP
    AT_MUSA_CHECK(hipLaunchKernel(spec.exec, dim3(gx, gy, subGz), bx, args,
                                  spec.dynamicSharedKB << 10,
                                  c10::musa::getCurrentMUSAStream()));
#else
    AT_MUSA_CHECK(musaLaunchKernel(spec.exec, dim3(gx, gy, subGz), bx, args,
                                   spec.dynamicSharedKB << 10,
                                   c10::musa::getCurrentMUSAStream()));
#endif
  }

  // Done.
  return std::make_tuple(y, so, 0);
}

// Declaration only: filtered_lrelu_op_impl has the same signature as
// filtered_lrelu_op and is defined outside this section of the file.
std::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu_op_impl(
    torch::Tensor x, torch::Tensor fu, torch::Tensor fd, torch::Tensor b,
    torch::Tensor si, int up, int down, int px0, int px1, int py0, int py1,
    int sx, int sy, float gain, float slope, float clamp, bool flip_filters,
    bool writeSigns);

// Associates filtered_lrelu_op with filtered_lrelu_op_impl for the MUSA
// device (presumably through mmcv's device registry -- the macro is defined
// elsewhere; verify there).
REGISTER_DEVICE_IMPL(filtered_lrelu_op_impl, MUSA, filtered_lrelu_op);

#else

#pragma message(                           \
    "filtered_lrelu_op is not available. " \
    "Please update your compiler and musa version.")

#endif
#undef BUILD_FILTERED_LRELU_OP

//------------------------------------------------------------------------

// Launches filtered_lrelu_act_kernel on a rank-4 MUSA tensor `x`; the kernel
// writes its result back through p.x, so `x` is updated in place.
//
//   si          sign tensor to read; an empty tensor disables sign reading.
//   sx, sy      offsets applied when indexing the sign tensor.
//   gain, slope, clamp  activation parameters forwarded to the kernel.
//   writeSigns  allocate a new sign tensor, let the kernel fill it, and
//               return it. Takes priority over reading when both apply.
//
// Returns the newly written sign tensor, or an undefined tensor when
// writeSigns is false.
torch::Tensor filtered_lrelu_act_op(torch::Tensor x, torch::Tensor si, int sx,
                                    int sy, float gain, float slope,
                                    float clamp, bool writeSigns) {
  // The launch targets whichever MUSA device holds x.
  TORCH_CHECK(x.is_privateuseone(), "x must reside on MUSA device");
  const at::musa::OptionalMUSAGuard device_guard(device_of(x));

  // Checks on x itself.
  TORCH_CHECK(x.dim() == 4, "x must be rank 4");
  TORCH_CHECK(x.size(0) * x.size(1) <= INT_MAX && x.size(2) <= INT_MAX &&
                  x.size(3) <= INT_MAX,
              "x is too large");
  TORCH_CHECK(x.numel() > 0, "x is empty");
  TORCH_CHECK(x.dtype() == torch::kHalf || x.dtype() == torch::kFloat ||
                  x.dtype() == torch::kDouble,
              "x must be float16, float32 or float64");

  // Decide which sign tensor (if any) the kernel works with.
  torch::Tensor signsOut;
  torch::Tensor signs = si;
  const bool readSigns = signs.numel() != 0;
  if (writeSigns) {
    // Width in elements, padded to a multiple of 16 for coalescing; the
    // tensor stores four elements per byte.
    const int64_t paddedWidth = (x.size(3) + 15) & ~15;
    signsOut = torch::empty({x.size(0), x.size(1), x.size(2), paddedWidth >> 2},
                            x.options().dtype(torch::kUInt8),
                            at::MemoryFormat::Contiguous);
    signs = signsOut;
  }
  const bool usesSigns = readSigns || writeSigns;

  // Checks on the sign tensor, only when one is involved.
  if (usesSigns) {
    TORCH_CHECK(signs.is_contiguous(), "signs must be contiguous");
    TORCH_CHECK(signs.dtype() == torch::kUInt8, "signs must be uint8");
    TORCH_CHECK(signs.device() == x.device(),
                "signs must reside on the same device as x");
    TORCH_CHECK(signs.dim() == 4, "signs must be rank 4");
    TORCH_CHECK(signs.size(0) == x.size(0) && signs.size(1) == x.size(1),
                "signs must have same batch & channels as x");
    TORCH_CHECK(signs.size(2) <= INT_MAX && (signs.size(3) << 2) <= INT_MAX,
                "signs tensor is too large");
  }

  // Fill in the kernel parameter block.
  filtered_lrelu_act_kernel_params p;
  p.x = x.data_ptr();
  p.s = usesSigns ? signs.data_ptr<unsigned char>() : 0;
  p.gain = gain;
  p.slope = slope;
  p.clamp = clamp;
  p.xShape =
      make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));
  p.xStride =
      make_longlong4(x.stride(3), x.stride(2), x.stride(1), x.stride(0));
  if (usesSigns) {
    // Width is expressed in elements here (bytes * 4). Contiguous.
    p.sShape = make_int2((int)signs.size(3) << 2, (int)signs.size(2));
  } else {
    p.sShape = make_int2(0, 0);
  }
  p.sOfs = make_int2(sx, sy);

  // Pick the kernel instantiation: writing wins over reading.
  void *func = 0;
  AT_DISPATCH_FLOATING_TYPES(
      x.scalar_type(), "filtered_lrelu_act_musa", [&] {
        func = writeSigns
                   ? choose_filtered_lrelu_act_kernel<scalar_t, true, false>()
               : readSigns
                   ? choose_filtered_lrelu_act_kernel<scalar_t, false, true>()
                   : choose_filtered_lrelu_act_kernel<scalar_t, false, false>();
      });
  TORCH_CHECK(func, "internal error - MUSA kernel not found");

  // Launch configuration.
  void *args[] = {&p};
  int bx = 128;  // 4 warps per block.

  // The grid spans the sign tensor when writing signs, otherwise x.
  uint32_t gx = writeSigns ? p.sShape.x : p.xShape.x;
  uint32_t gy = writeSigns ? p.sShape.y : p.xShape.y;
  uint32_t gz =
      p.xShape.z * p.xShape.w;  // Same as in p.sShape if signs are in use.
  gx = (gx - 1) / bx + 1;

  // Grid y and z are capped at the launch limit; the kernel loops internally
  // to cover whatever is left.
  const uint32_t gmax = 65535;
  gy = std::min(gy, gmax);
  gz = std::min(gz, gmax);

  // Launch.
#ifdef MMCV_WITH_HIP
  AT_MUSA_CHECK(hipLaunchKernel(func, dim3(gx, gy, gz), bx, args, 0,
                                c10::musa::getCurrentMUSAStream()));
#else
  AT_MUSA_CHECK(musaLaunchKernel(func, dim3(gx, gy, gz), bx, args, 0,
                                 c10::musa::getCurrentMUSAStream()));
#endif

  return signsOut;
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/focal_loss_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_musa_helper.hpp"
#include "sigmoid_focal_loss_musa_kernel.muh"
#include "softmax_focal_loss_musa_kernel.muh"

// Launches sigmoid_focal_loss_forward_musa_kernel with one work item per
// element of `output`, on the current MUSA stream of input's device.
// Asserts on the host that the largest value in `target` does not exceed
// input.size(1) before launching.
void SigmoidFocalLossForwardMUSAKernelLauncher(Tensor input, Tensor target,
                                               Tensor weight, Tensor output,
                                               const float gamma,
                                               const float alpha) {
  const int total = output.numel();
  const int classes = input.size(1);
  AT_ASSERTM(target.max().item<int64_t>() <= (int64_t)classes,
             "target label should smaller or equal than num classes");

  c10::musa::MUSAGuard device_guard(input.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "sigmoid_focal_loss_forward_musa_kernel", [&] {
        scalar_t* in_ptr = input.data_ptr<scalar_t>();
        int64_t* target_ptr = target.data_ptr<int64_t>();
        scalar_t* weight_ptr = weight.data_ptr<scalar_t>();
        scalar_t* out_ptr = output.data_ptr<scalar_t>();
        sigmoid_focal_loss_forward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(total), THREADS_PER_BLOCK, 0, stream>>>(
                total, in_ptr, target_ptr, weight_ptr, out_ptr, gamma, alpha,
                classes);
      });

  // Surface any launch error.
  AT_MUSA_CHECK(musaGetLastError());
}

// Launches sigmoid_focal_loss_backward_musa_kernel with one work item per
// element of `grad_input`, on the current MUSA stream of grad_input's device.
// The scalar type is taken from `input`.
void SigmoidFocalLossBackwardMUSAKernelLauncher(Tensor input, Tensor target,
                                                Tensor weight,
                                                Tensor grad_input,
                                                const float gamma,
                                                const float alpha) {
  const int total = grad_input.numel();
  const int classes = input.size(1);

  c10::musa::MUSAGuard device_guard(grad_input.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "sigmoid_focal_loss_backward_musa_kernel", [&] {
        scalar_t* in_ptr = input.data_ptr<scalar_t>();
        int64_t* target_ptr = target.data_ptr<int64_t>();
        scalar_t* weight_ptr = weight.data_ptr<scalar_t>();
        scalar_t* grad_ptr = grad_input.data_ptr<scalar_t>();
        sigmoid_focal_loss_backward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(total), THREADS_PER_BLOCK, 0, stream>>>(
                total, in_ptr, target_ptr, weight_ptr, grad_ptr, gamma, alpha,
                classes);
      });

  // Surface any launch error.
  AT_MUSA_CHECK(musaGetLastError());
}

// Launches softmax_focal_loss_forward_musa_kernel with one work item per
// element of `output`, on the current MUSA stream of softmax's device.
// Asserts on the host that the largest value in `target` does not exceed
// softmax.size(1) before launching. Dispatches over float and double only.
void SoftmaxFocalLossForwardMUSAKernelLauncher(Tensor softmax, Tensor target,
                                               Tensor weight, Tensor output,
                                               const float gamma,
                                               const float alpha) {
  const int total = output.numel();
  const int classes = softmax.size(1);
  AT_ASSERTM(target.max().item<int64_t>() <= (int64_t)classes,
             "target label should smaller or equal than num classes");

  c10::musa::MUSAGuard device_guard(softmax.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  AT_DISPATCH_FLOATING_TYPES(
      softmax.scalar_type(), "softmax_focal_loss_forward_musa_kernel", [&] {
        scalar_t* softmax_ptr = softmax.data_ptr<scalar_t>();
        int64_t* target_ptr = target.data_ptr<int64_t>();
        scalar_t* weight_ptr = weight.data_ptr<scalar_t>();
        scalar_t* out_ptr = output.data_ptr<scalar_t>();
        softmax_focal_loss_forward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(total), THREADS_PER_BLOCK, 0, stream>>>(
                total, softmax_ptr, target_ptr, weight_ptr, out_ptr, gamma,
                alpha, classes);
      });

  // Surface any launch error.
  AT_MUSA_CHECK(musaGetLastError());
}

// Runs the two-stage softmax focal loss backward pass on the current MUSA
// stream of grad_input's device:
//   1. softmax_focal_loss_backward_musa1_kernel, one work item per element
//      of `buff`, writing into `buff`;
//   2. softmax_focal_loss_backward_musa2_kernel, one work item per element
//      of `grad_input`, reading `buff` and writing `grad_input`.
// A launch-error check follows each stage.
void SoftmaxFocalLossBackwardMUSAKernelLauncher(Tensor softmax, Tensor target,
                                                Tensor weight, Tensor buff,
                                                Tensor grad_input,
                                                const float gamma,
                                                const float alpha) {
  const int classes = softmax.size(1);
  const int buff_total = buff.numel();

  c10::musa::MUSAGuard device_guard(grad_input.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  // Stage 1: fill the intermediate buffer.
  AT_DISPATCH_FLOATING_TYPES(
      grad_input.scalar_type(), "softmax_focal_loss_backward_musa1_kernel",
      [&] {
        scalar_t* softmax_ptr = softmax.data_ptr<scalar_t>();
        int64_t* target_ptr = target.data_ptr<int64_t>();
        scalar_t* weight_ptr = weight.data_ptr<scalar_t>();
        scalar_t* buff_ptr = buff.data_ptr<scalar_t>();
        softmax_focal_loss_backward_musa1_kernel<scalar_t>
            <<<GET_BLOCKS(buff_total), THREADS_PER_BLOCK, 0, stream>>>(
                buff_total, softmax_ptr, target_ptr, weight_ptr, buff_ptr,
                gamma, alpha, classes);
      });
  AT_MUSA_CHECK(musaGetLastError());

  // Stage 2: turn the buffer into the input gradient.
  const int grad_total = grad_input.numel();
  AT_DISPATCH_FLOATING_TYPES(
      grad_input.scalar_type(), "softmax_focal_loss_backward_musa2_kernel",
      [&] {
        scalar_t* softmax_ptr = softmax.data_ptr<scalar_t>();
        int64_t* target_ptr = target.data_ptr<int64_t>();
        scalar_t* buff_ptr = buff.data_ptr<scalar_t>();
        scalar_t* grad_ptr = grad_input.data_ptr<scalar_t>();
        softmax_focal_loss_backward_musa2_kernel<scalar_t>
            <<<GET_BLOCKS(grad_total), THREADS_PER_BLOCK, 0, stream>>>(
                grad_total, softmax_ptr, target_ptr, buff_ptr, grad_ptr,
                classes);
      });
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/furthest_point_sample_musa.mu
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling_gpu.cu

#include <stdio.h>
#include <stdlib.h>

#include "furthest_point_sample_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

// Returns the largest power of two that does not exceed `work_size`, clamped
// to the range [1, 1024]. Non-positive `work_size` yields 1.
//
// Computed with integer shifts only: a floating-point log2 ratio can land
// just below an exact integer and lose one power of two, and converting
// log(0) or log(negative) to int is undefined.
inline int opt_n_threads(int work_size) {
  int n_threads = 1;
  // n_threads < 1024 keeps the shift below 2048, so it cannot overflow.
  while (n_threads < 1024 && (n_threads << 1) <= work_size) {
    n_threads <<= 1;
  }
  return n_threads;
}

void FurthestPointSamplingForwardMUSAKernelLauncher(int b, int n, int m,
                                                    const float* dataset,
                                                    float* temp, int* idxs) {
  // dataset: (B, N, 3)
  // tmp: (B, N)
  // output:
  //      idx: (B, M)

  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  unsigned int n_threads = opt_n_threads(n);

  switch (n_threads) {
    case 1024:
      furthest_point_sampling_forward_musa_kernel<1024>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 512:
      furthest_point_sampling_forward_musa_kernel<512>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 256:
      furthest_point_sampling_forward_musa_kernel<256>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 128:
      furthest_point_sampling_forward_musa_kernel<128>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 64:
      furthest_point_sampling_forward_musa_kernel<64>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 32:
      furthest_point_sampling_forward_musa_kernel<32>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 16:
      furthest_point_sampling_forward_musa_kernel<16>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 8:
      furthest_point_sampling_forward_musa_kernel<8>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 4:
      furthest_point_sampling_forward_musa_kernel<4>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 2:
      furthest_point_sampling_forward_musa_kernel<2>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 1:
      furthest_point_sampling_forward_musa_kernel<1>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    default:
      furthest_point_sampling_forward_musa_kernel<512>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
  }

  AT_MUSA_CHECK(musaGetLastError());
}

void FurthestPointSamplingWithDistForwardMUSAKernelLauncher(
    int b, int n, int m, const float* dataset, float* temp, int* idxs) {
  // Launches furthest_point_sampling_with_dist_forward_musa_kernel on the
  // current MUSA stream with a grid of `b` blocks.
  //
  // Shapes (as noted by the original authors):
  //   dataset: (B, N, N)
  //   temp:    (B, N)
  //   idxs:    (B, M)   <- output

  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  // Threads per block picked by opt_n_threads() from the point count.
  const unsigned int n_threads = opt_n_threads(n);

// The kernel takes its block size as a template argument, so every supported
// runtime thread count needs its own instantiation. The launch itself is
// identical for all of them.
#define FPS_WITH_DIST_LAUNCH(BLOCK_SIZE)                             \
  furthest_point_sampling_with_dist_forward_musa_kernel<BLOCK_SIZE> \
      <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs)

  switch (n_threads) {
    case 1024:
      FPS_WITH_DIST_LAUNCH(1024);
      break;
    case 512:
      FPS_WITH_DIST_LAUNCH(512);
      break;
    case 256:
      FPS_WITH_DIST_LAUNCH(256);
      break;
    case 128:
      FPS_WITH_DIST_LAUNCH(128);
      break;
    case 64:
      FPS_WITH_DIST_LAUNCH(64);
      break;
    case 32:
      FPS_WITH_DIST_LAUNCH(32);
      break;
    case 16:
      FPS_WITH_DIST_LAUNCH(16);
      break;
    case 8:
      FPS_WITH_DIST_LAUNCH(8);
      break;
    case 4:
      FPS_WITH_DIST_LAUNCH(4);
      break;
    case 2:
      FPS_WITH_DIST_LAUNCH(2);
      break;
    case 1:
      FPS_WITH_DIST_LAUNCH(1);
      break;
    default:
      // Any other thread count falls back to the 512 instantiation.
      FPS_WITH_DIST_LAUNCH(512);
  }

#undef FPS_WITH_DIST_LAUNCH

  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/fused_bias_leakyrelu_musa.mu
================================================
// Modified from
// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_bias_act_kernel.cu
// Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
//
// This work is made available under the Nvidia Source Code License-NC.
// To view a copy of this license, visit
// https://nvlabs.github.io/stylegan2/license.html

#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include "torch_musa/csrc/aten/musa/MUSAContext.h"
#include <musa.h>
#include <musa_runtime.h>
#include <torch/types.h>

#include <ATen/musa/MUSA_PORT_ApplyUtils.muh>

template <typename scalar_t>
static __global__ void fused_bias_act_kernel(
    scalar_t* out, const scalar_t* p_x, const scalar_t* p_b,
    const scalar_t* p_ref, int act, int grad, scalar_t alpha, scalar_t scale,
    int loop_x, int size_x, int step_b, int size_b, int use_bias, int use_ref) {
  // Element-wise fused "optional bias add -> activation -> scale".
  //
  // Every thread processes at most `loop_x` elements spaced blockDim.x apart,
  // stopping early once its index reaches `size_x`.
  //
  // The operation is selected by act * 10 + grad:
  //   act = 1: linear layer, act = 3: leaky relu layer
  //   grad = 0: direct forward path
  //   grad = 1 / grad = 2: first / second order passes (per the original
  //   notes); the first order leaky relu pass gates on `p_ref`, not on x.
  // Any unlisted combination behaves like the linear forward path.
  const scalar_t zero = 0.0;
  const int mode = act * 10 + grad;

  int pos = blockIdx.x * loop_x * blockDim.x + threadIdx.x;

  for (int iter = 0; iter < loop_x && pos < size_x;
       ++iter, pos += blockDim.x) {
    scalar_t val = p_x[pos];

    if (use_bias) {
      // One bias entry is shared by `step_b` consecutive elements and the
      // bias index wraps around every `size_b` entries.
      val += p_b[(pos / step_b) % size_b];
    }

    const scalar_t ref = use_ref ? p_ref[pos] : zero;

    scalar_t res;

    switch (mode) {
      case 30:
        res = (val > 0.0) ? val : val * alpha;
        break;
      case 31:
        res = (ref > 0.0) ? val : val * alpha;
        break;

      case 12:
      case 32:
        res = 0.0;
        break;

      case 10:
      case 11:
      default:
        res = val;
        break;
    }

    out[pos] = res * scale;
  }
}

// Applies the fused bias + activation + scale kernel to `input` and returns
// the result as a new tensor shaped like the (contiguous) input.
//
//   input: tensor to transform.
//   bias:  bias values; an empty tensor disables the bias add.
//   refer: reference tensor read by the kernel when non-empty; an empty
//          tensor makes the kernel use zero instead.
//   act, grad, alpha, scale: forwarded unchanged to fused_bias_act_kernel.
//
// Raises a c10::Error if the kernel launch is rejected by the runtime.
torch::Tensor fused_bias_leakyrelu_op(const torch::Tensor& input,
                                      const torch::Tensor& bias,
                                      const torch::Tensor& refer, int act,
                                      int grad, float alpha, float scale) {
  int curDevice = -1;
  musaGetDevice(&curDevice);
  musaStream_t stream = at::musa::getCurrentMUSAStream(curDevice);

  // The kernel indexes flat memory, so work on contiguous copies.
  auto x = input.contiguous();
  auto b = bias.contiguous();
  auto ref = refer.contiguous();

  int use_bias = b.numel() ? 1 : 0;
  int use_ref = ref.numel() ? 1 : 0;

  int size_x = x.numel();
  int size_b = b.numel();

  // Number of consecutive elements sharing one bias entry: the product of
  // all dimensions after dim 1.
  int step_b = 1;
  for (int i = 1 + 1; i < x.dim(); i++) {
    step_b *= x.size(i);
  }

  // Each block covers loop_x * block_size elements; round the grid up.
  int loop_x = 4;
  int block_size = 4 * 32;
  int grid_size = (size_x - 1) / (loop_x * block_size) + 1;

  auto y = torch::empty_like(x);

  AT_DISPATCH_FLOATING_TYPES(
      x.scalar_type(), "fused_bias_act_kernel", [&] {
        fused_bias_act_kernel<scalar_t><<<grid_size, block_size, 0, stream>>>(
            y.data_ptr<scalar_t>(), x.data_ptr<scalar_t>(),
            b.data_ptr<scalar_t>(), ref.data_ptr<scalar_t>(), act, grad, alpha,
            scale, loop_x, size_x, step_b, size_b, use_bias, use_ref);
      });

  // A rejected launch would otherwise go unnoticed and the caller would get
  // back the uninitialised output of empty_like().
  const musaError_t launch_status = musaGetLastError();
  TORCH_CHECK(launch_status == musaSuccess,
              "fused_bias_act_kernel launch failed: ",
              musaGetErrorString(launch_status));

  return y;
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/fused_spconv_ops_musa.mu
================================================
#include <musa_runtime_api.h>
#include <torch/script.h>
// clang-format off
// TODO: make spconv_utils.h order agnostic
#include "../spconv_utils.h"
// clang-format on
#include <utils/spconv/spconv/indice.h>
#include <utils/spconv/spconv/reordering.h>

#include "pytorch_musa_helper.hpp"

// Sparse convolution with a fused bias: for every kernel offset, gathers the
// input rows listed in `indicePairs`, multiplies them by that offset's filter
// slice and scatter-adds the product into the output.
//
//   features:    input feature rows; size(1) is the input channel count.
//   filters:     filter bank; its last two dims are viewed as
//                (numInPlanes, numOutPlanes) per kernel offset.
//   bias:        copied into the freshly allocated output before accumulation.
//   indicePairs: index pairs per kernel offset; size(0) is the kernel volume.
//   indiceNum:   number of valid pairs per kernel offset (read as int on CPU).
//   numActOut:   number of output rows.
//   _inverse:    non-zero swaps which side of a pair is gathered from and
//                which is scattered to.
//   _subM:       non-zero enables the shortcut for the offset with most pairs.
//
// Returns the (numActOut, numOutPlanes) output tensor.
torch::Tensor FusedIndiceConvBatchnormMUSAKernelLauncher(
    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
    int64_t _inverse, int64_t _subM) {
  c10::musa::MUSAGuard device_guard(features.device());
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
  auto device = features.device().type();
  auto ndim = filters.dim() - 2;
  auto kernelVolume = indicePairs.size(0);
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  // Pair counts are needed on the host to size buffers and drive the loop.
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  // Locate the kernel offset with the most pairs: its count sizes the scratch
  // buffers and its position is the offset treated specially when subM is set.
  auto indicePairMaxSizeIter =
      std::max_element(indicePairNumCpu.data_ptr<int>(),
                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
  int indicePairMaxOffset =
      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
  int indicePairMaxSize = *indicePairMaxSizeIter;

  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());

  // Output starts out holding the bias values.
  torch::Tensor output =
      torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
  // Scratch buffers large enough for the busiest kernel offset.
  torch::Tensor inputBuffer =
      torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
  // NOTE(review): mm_out writes its result into `output`, which appears to
  // replace the bias values copied above when subM is set - confirm intended.
  if (subM) {  // the center index of subm conv don't need gather and scatter
               // add.
    torch::mm_out(output, features, filters[indicePairMaxOffset]);
  }
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    // Skip offsets without pairs and, for subM, the offset handled above.
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }

    AT_DISPATCH_FLOATING_TYPES(
        features.scalar_type(), "FusedIndiceConvBatchnormKernel", [&] {
          // Views over the first nHot rows of the scratch buffers; they share
          // memory with inputBuffer / outputBuffer.
          auto outputBufferBlob = torch::from_blob(
              outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);
          auto inputBufferBlob = torch::from_blob(
              inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);

          // Gather: fill inputBuffer from features using the index list
          // selected by subview(i, inverse).
          if (device == torch::kCPU) {
            functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;
            gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
                       tv::torch2tv<const scalar_t>(features),
                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                       nHot);
          } else {
            functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
                gatherFtor;
            gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
                       tv::torch2tv<const scalar_t>(features),
                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                       nHot);
            TV_CHECK_MUSA_ERR();
            /* slower than SparseGatherFunctor, may due to int->long conversion
            auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
            auto indicePairBlob =
            torch::from_blob(indicePairLong.data_ptr<long>(), {nHot},
            indicePairOptions); torch::index_select_out(inputBufferBlob,
            features, 0, indicePairBlob);*/
          }
          // Dense product of the gathered rows with this offset's filter.
          torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);

          // Scatter-add: accumulate outputBuffer into output using the index
          // list on the opposite side, subview(i, !inverse).
          if (device == torch::kCPU) {
            functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>
                scatterFtor;
            scatterFtor(
                tv::CPU(), tv::torch2tv<scalar_t>(output),
                tv::torch2tv<const scalar_t>(outputBuffer),
                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
                true);
          } else {
            functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
                scatterFtor;
            scatterFtor(
                tv::TorchGPU(), tv::torch2tv<scalar_t>(output),
                tv::torch2tv<const scalar_t>(outputBuffer),
                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
                true);
            TV_CHECK_MUSA_ERR();
          }
        });
  }

  return output;
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/gather_points_musa.mu
================================================
#include <stdio.h>
#include <stdlib.h>

#include "gather_points_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

void GatherPointsForwardMUSAKernelLauncher(int b, int c, int n, int npoints,
                                           const Tensor points,
                                           const Tensor idx, Tensor out) {
  // points: (B, C, N)
  // idx: (B, npoints)
  // output:
  //      out: (B, C, npoints)

  c10::musa::MUSAGuard device_guard(points.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(npoints, THREADS_PER_BLOCK), c, b);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      points.scalar_type(), "gather_points_forward_musa_kernel", [&] {
        gather_points_forward_musa_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                b, c, n, npoints, points.data_ptr<scalar_t>(),
                idx.data_ptr<int>(), out.data_ptr<scalar_t>());
      });

  AT_MUSA_CHECK(musaGetLastError());
}

void GatherPointsBackwardMUSAKernelLauncher(int b, int c, int n, int npoints,
                                            const Tensor grad_out,
                                            const Tensor idx,
                                            Tensor grad_points) {
  // grad_out: (B, C, npoints)
  // idx: (B, npoints)
  // output:
  //      grad_points: (B, C, N)

  c10::musa::MUSAGuard device_guard(grad_out.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(npoints, THREADS_PER_BLOCK), c, b);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_out.scalar_type(), "gather_points_backward_musa_kernel", [&] {
        gather_points_backward_musa_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                b, c, n, npoints, grad_out.data_ptr<scalar_t>(),
                idx.data_ptr<int>(), grad_points.data_ptr<scalar_t>());
      });

  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/group_points_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu
#include <stdio.h>
#include <stdlib.h>

#include "group_points_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

void GroupPointsForwardMUSAKernelLauncher(int b, int c, int n, int npoints,
                                          int nsample, const Tensor points,
                                          const Tensor idx, Tensor out) {
  // points: (B, C, N)
  // idx: (B, npoints, nsample)
  // output:
  //      out: (B, C, npoints, nsample)

  c10::musa::MUSAGuard device_guard(points.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(npoints * nsample, THREADS_PER_BLOCK), c, b);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      points.scalar_type(), "group_points_forward_musa_kernel", [&] {
        group_points_forward_musa_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                b, c, n, npoints, nsample, points.data_ptr<scalar_t>(),
                idx.data_ptr<int>(), out.data_ptr<scalar_t>());
      });

  AT_MUSA_CHECK(musaGetLastError());
}

void GroupPointsBackwardMUSAKernelLauncher(int b, int c, int n, int npoints,
                                           int nsample, const Tensor grad_out,
                                           const Tensor idx,
                                           Tensor grad_points) {
  // grad_out: (B, C, npoints, nsample)
  // idx: (B, npoints, nsample)
  // output:
  //      grad_points: (B, C, N)

  c10::musa::MUSAGuard device_guard(grad_out.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(npoints * nsample, THREADS_PER_BLOCK), c, b);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_out.scalar_type(), "group_points_backward_musa_kernel", [&] {
        group_points_backward_musa_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                b, c, n, npoints, nsample, grad_out.data_ptr<scalar_t>(),
                idx.data_ptr<int>(), grad_points.data_ptr<scalar_t>());
      });

  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/iou3d_musa.mu
================================================
// Modified from
// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms_kernel.cu

/*
3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/

#include <stdio.h>

#include "iou3d_musa_kernel.muh"
#include "nms_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

void IoU3DBoxesOverlapBevForwardMUSAKernelLauncher(const int num_a,
                                                   const Tensor boxes_a,
                                                   const int num_b,
                                                   const Tensor boxes_b,
                                                   Tensor ans_overlap) {
  c10::musa::MUSAGuard device_guard(boxes_a.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(num_b, THREADS_PER_BLOCK_IOU3D),
              GET_BLOCKS(num_a, THREADS_PER_BLOCK_IOU3D));
  dim3 threads(THREADS_PER_BLOCK_IOU3D, THREADS_PER_BLOCK_IOU3D);

  iou3d_boxes_overlap_bev_forward_musa_kernel<<<blocks, threads, 0, stream>>>(
      num_a, boxes_a.data_ptr<float>(), num_b, boxes_b.data_ptr<float>(),
      ans_overlap.data_ptr<float>());

  AT_MUSA_CHECK(musaGetLastError());
}

void IoU3DNMS3DForwardMUSAKernelLauncher(const Tensor boxes, Tensor& keep,
                                         Tensor& keep_num,
                                         float nms_overlap_thresh) {
  using namespace at::indexing;
  c10::musa::MUSAGuard device_guard(boxes.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  int boxes_num = boxes.size(0);

  const int col_blocks =
      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
  Tensor mask =
      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));

  dim3 blocks(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS),
              GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS));
  dim3 threads(THREADS_PER_BLOCK_NMS);

  iou3d_nms3d_forward_musa_kernel<<<blocks, threads, 0, stream>>>(
      boxes_num, nms_overlap_thresh, boxes.data_ptr<float>(),
      (unsigned long long*)mask.data_ptr<int64_t>());

  at::Tensor keep_t = at::zeros(
      {boxes_num}, boxes.options().dtype(at::kBool).device(::at::musa::kMUSA));
  gather_keep_from_mask<<<1, min(col_blocks, THREADS_PER_BLOCK),
                          col_blocks * sizeof(unsigned long long), stream>>>(
      keep_t.data_ptr<bool>(), (unsigned long long*)mask.data_ptr<int64_t>(),
      boxes_num);

  auto keep_data = keep_t.nonzero().index({Slice(), 0});
  keep_num.fill_(at::Scalar(keep_data.size(0)));
  keep.index_put_({Slice(0, keep_data.size(0))}, keep_data);
  AT_MUSA_CHECK(musaGetLastError());
}

void IoU3DNMS3DNormalForwardMUSAKernelLauncher(const Tensor boxes, Tensor& keep,
                                               Tensor& keep_num,
                                               float nms_overlap_thresh) {
  using namespace at::indexing;
  c10::musa::MUSAGuard device_guard(boxes.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  int boxes_num = boxes.size(0);

  const int col_blocks =
      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
  Tensor mask =
      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));

  dim3 blocks(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS),
              GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS));
  dim3 threads(THREADS_PER_BLOCK_NMS);

  iou3d_nms3d_normal_forward_musa_kernel<<<blocks, threads, 0, stream>>>(
      boxes_num, nms_overlap_thresh, boxes.data_ptr<float>(),
      (unsigned long long*)mask.data_ptr<int64_t>());

  at::Tensor keep_t = at::zeros(
      {boxes_num}, boxes.options().dtype(at::kBool).device(::at::musa::kMUSA));
  gather_keep_from_mask<<<1, min(col_blocks, THREADS_PER_BLOCK),
                          col_blocks * sizeof(unsigned long long), stream>>>(
      keep_t.data_ptr<bool>(), (unsigned long long*)mask.data_ptr<int64_t>(),
      boxes_num);

  auto keep_data = keep_t.nonzero().index({Slice(), 0});
  keep_num.fill_(at::Scalar(keep_data.size(0)));
  keep.index_put_({Slice(0, keep_data.size(0))}, keep_data);
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/knn_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap

#include <cmath>
#include <cstdio>

#include "knn_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

void KNNForwardMUSAKernelLauncher(int b, int n, int m, int nsample,
                                  const Tensor xyz, const Tensor new_xyz,
                                  Tensor idx, Tensor dist2) {
  // param new_xyz: (B, m, 3)
  // param xyz: (B, n, 3)
  // param idx: (B, m, nsample)

  c10::musa::MUSAGuard device_guard(new_xyz.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(m, THREADS_PER_BLOCK), b);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES(
      new_xyz.scalar_type(), "knn_forward_musa_kernel", [&] {
        knn_forward_musa_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
            b, n, m, nsample, xyz.data_ptr<scalar_t>(),
            new_xyz.data_ptr<scalar_t>(), idx.data_ptr<int>(),
            dist2.data_ptr<scalar_t>());
      });

  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/masked_conv2d_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "masked_conv2d_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

void MaskedIm2colForwardMUSAKernelLauncher(const Tensor bottom_data,
                                           const Tensor mask_h_idx,
                                           const Tensor mask_w_idx,
                                           Tensor top_data, const int kernel_h,
                                           const int kernel_w, const int pad_h,
                                           const int pad_w) {
  int channels = bottom_data.size(1);
  int height = bottom_data.size(2);
  int width = bottom_data.size(3);
  int mask_cnt = mask_h_idx.size(0);
  int output_size = mask_cnt * channels;

  c10::musa::MUSAGuard device_guard(bottom_data.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES(
      bottom_data.scalar_type(), "MaskedIm2colLaucherForward", ([&] {
        const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>();
        const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>();
        const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>();
        scalar_t *top_data_ = top_data.data_ptr<scalar_t>();
        MaskedIm2colForward<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, bottom_data_, height, width, kernel_h, kernel_w,
                pad_h, pad_w, mask_h_idx_, mask_w_idx_, mask_cnt, top_data_);
      }));
  AT_MUSA_CHECK(musaGetLastError());
}

void MaskedCol2imForwardMUSAKernelLauncher(
    const Tensor bottom_data, const Tensor mask_h_idx, const Tensor mask_w_idx,
    Tensor top_data, const int height, const int width, const int channels) {
  int mask_cnt = mask_h_idx.size(0);
  int output_size = mask_cnt * channels;

  c10::musa::MUSAGuard device_guard(bottom_data.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES(
      bottom_data.scalar_type(), "MaskedCol2imLaucherForward", ([&] {
        const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>();
        const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>();
        const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>();
        scalar_t *top_data_ = top_data.data_ptr<scalar_t>();

        MaskedCol2imForward<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, bottom_data_, height, width, channels, mask_h_idx_,
                mask_w_idx_, mask_cnt, top_data_);
      }));
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/min_area_polygons.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/SDL-GuoZonghao/BeyondBoundingBox/blob/main/mmdet/ops/minareabbox/src/minareabbox_kernel.cu
#include "min_area_polygons_musa.muh"
#include "pytorch_musa_helper.hpp"

void MinAreaPolygonsMUSAKernelLauncher(const Tensor pointsets,
                                       Tensor polygons) {
  int num_pointsets = pointsets.size(0);
  const int output_size = polygons.numel();
  c10::musa::MUSAGuard device_guard(pointsets.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES(
      pointsets.scalar_type(), "min_area_polygons_musa_kernel", ([&] {
        min_area_polygons_musa_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                num_pointsets, pointsets.data_ptr<scalar_t>(),
                polygons.data_ptr<scalar_t>());
      }));
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/modulated_deform_conv_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "modulated_deform_conv_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

void modulated_deformable_im2col_musa(
    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor data_col) {
  // num_axes should be smaller than block size
  const int channel_per_deformable_group = channels / deformable_group;
  const int num_kernels = channels * batch_size * height_col * width_col;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] {
        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
        scalar_t *data_col_ = data_col.data_ptr<scalar_t>();

        modulated_deformable_im2col_gpu_kernel<<<
            GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
            c10::musa::getCurrentMUSAStream()>>>(
            num_kernels, data_im_, data_offset_, data_mask_, height_im,
            width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
            dilation_h, dilation_w, channel_per_deformable_group, batch_size,
            channels, deformable_group, height_col, width_col, data_col_);
      }));
  AT_MUSA_CHECK(musaGetLastError());
}

void modulated_deformable_col2im_musa(
    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor grad_im) {
  const int channel_per_deformable_group = channels / deformable_group;
  const int num_kernels =
      channels * kernel_h * kernel_w * batch_size * height_col * width_col;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] {
        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
        scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();

        modulated_deformable_col2im_gpu_kernel<<<
            GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
            c10::musa::getCurrentMUSAStream()>>>(
            num_kernels, data_col_, data_offset_, data_mask_, channels,
            height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h,
            stride_w, dilation_h, dilation_w, channel_per_deformable_group,
            batch_size, deformable_group, height_col, width_col, grad_im_);
      }));
  AT_MUSA_CHECK(musaGetLastError());
}

void modulated_deformable_col2im_coord_musa(
    const Tensor data_col, const Tensor data_im, const Tensor data_offset,
    const Tensor data_mask, const int batch_size, const int channels,
    const int height_im, const int width_im, const int height_col,
    const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int deformable_group,
    Tensor grad_offset, Tensor grad_mask) {
  const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h *
                          kernel_w * deformable_group;
  const int channel_per_deformable_group =
      channels * kernel_h * kernel_w / deformable_group;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] {
        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
        scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
        scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>();

        modulated_deformable_col2im_coord_gpu_kernel<<<
            GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
            c10::musa::getCurrentMUSAStream()>>>(
            num_kernels, data_col_, data_im_, data_offset_, data_mask_,
            channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w,
            stride_h, stride_w, dilation_h, dilation_w,
            channel_per_deformable_group, batch_size,
            2 * kernel_h * kernel_w * deformable_group, deformable_group,
            height_col, width_col, grad_offset_, grad_mask_);
      }));
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/ms_deform_attn_musa.mu
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/

#include <ATen/ATen.h>
#include "torch_musa/csrc/aten/musa/MUSAContext.h"
#include <musa.h>
#include <musa_runtime.h>

#include <THC/THCAtomics.muh>
#include <vector>

#include "ms_deform_attn_musa_kernel.muh"

// Launches ms_deformable_im2col_gpu_kernel on `stream`, writing the result
// to `data_col`. One work item is scheduled per
// (batch, query, head, channel) combination.
//
// A failed launch is reported on stdout only; no error is raised.
template <typename scalar_t>
void ms_deformable_im2col_musa(musaStream_t stream, const scalar_t *data_value,
                               const int64_t *data_spatial_shapes,
                               const int64_t *data_level_start_index,
                               const scalar_t *data_sampling_loc,
                               const scalar_t *data_attn_weight,
                               const int batch_size, const int spatial_size,
                               const int num_heads, const int channels,
                               const int num_levels, const int num_query,
                               const int num_point, scalar_t *data_col) {
  // The same count sizes the grid and is passed to the kernel as its element
  // count.
  const int total_items = batch_size * num_query * num_heads * channels;
  const int threads_per_block = THREADS_PER_BLOCK;
  const int block_count = GET_BLOCKS(total_items, threads_per_block);

  ms_deformable_im2col_gpu_kernel<scalar_t>
      <<<block_count, threads_per_block, 0, stream>>>(
          total_items, data_value, data_spatial_shapes, data_level_start_index,
          data_sampling_loc, data_attn_weight, batch_size, spatial_size,
          num_heads, channels, num_levels, num_query, num_point, data_col);

  const musaError_t status = musaGetLastError();
  if (status == musaSuccess) {
    return;
  }
  printf("error in ms_deformable_im2col_musa: %s\n",
         musaGetErrorString(status));
}

// Host-side launcher for the multi-scale deformable attention backward
// (col2im) kernels.
//
// Chooses one kernel variant from `channels` and enqueues it on `stream`:
//   * channels <= THREADS_PER_BLOCK (block size == channels):
//       - 1, 2, 4, 8, 16, 32   -> shm_blocksize_aware_reduce_v1<scalar_t, N>
//       - 64, 128, 256, 512    -> shm_blocksize_aware_reduce_v2<scalar_t, N>
//       - any other value < 64 -> shm_reduce_v1 (dynamic shared memory)
//       - any other value      -> shm_reduce_v2 (dynamic shared memory)
//   * channels > THREADS_PER_BLOCK (block size == THREADS_PER_BLOCK):
//       - (channels & (THREADS_PER_BLOCK - 1)) == 0
//                              -> shm_reduce_v2_multi_blocks (dynamic shm)
//       - otherwise            -> gm
//
// Variants launched with dynamic shared memory request
// block_size * 3 * sizeof(scalar_t) bytes. All arguments are forwarded to the
// kernel unchanged. A launch error is only reported through printf.
template <typename scalar_t>
void ms_deformable_col2im_musa(
    musaStream_t stream, const scalar_t *grad_col, const scalar_t *data_value,
    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
    const int batch_size, const int spatial_size, const int num_heads,
    const int channels, const int num_levels, const int num_query,
    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight) {
  // Element count: sizes the grid and is passed as the kernel's first
  // argument.
  const int total_elems = batch_size * num_query * num_heads * channels;
  // Block size is `channels`, capped at THREADS_PER_BLOCK.
  const int block_dim =
      (channels > THREADS_PER_BLOCK) ? THREADS_PER_BLOCK : channels;
  const auto grid_dim = GET_BLOCKS(total_elems, block_dim);
  // Dynamic shared memory size used by the non-templated reduce variants.
  const size_t dyn_shm_bytes = block_dim * 3 * sizeof(scalar_t);

  if (channels <= THREADS_PER_BLOCK) {
    switch (channels) {
      case 1:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,
                                                                      1>
            <<<grid_dim, block_dim, 0, stream>>>(
                total_elems, grad_col, data_value, data_spatial_shapes,
                data_level_start_index, data_sampling_loc, data_attn_weight,
                batch_size, spatial_size, num_heads, channels, num_levels,
                num_query, num_point, grad_value, grad_sampling_loc,
                grad_attn_weight);
        break;
      case 2:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,
                                                                      2>
            <<<grid_dim, block_dim, 0, stream>>>(
                total_elems, grad_col, data_value, data_spatial_shapes,
                data_level_start_index, data_sampling_loc, data_attn_weight,
                batch_size, spatial_size, num_heads, channels, num_levels,
                num_query, num_point, grad_value, grad_sampling_loc,
                grad_attn_weight);
        break;
      case 4:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,
                                                                      4>
            <<<grid_dim, block_dim, 0, stream>>>(
                total_elems, grad_col, data_value, data_spatial_shapes,
                data_level_start_index, data_sampling_loc, data_attn_weight,
                batch_size, spatial_size, num_heads, channels, num_levels,
                num_query, num_point, grad_value, grad_sampling_loc,
                grad_attn_weight);
        break;
      case 8:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,
                                                                      8>
            <<<grid_dim, block_dim, 0, stream>>>(
                total_elems, grad_col, data_value, data_spatial_shapes,
                data_level_start_index, data_sampling_loc, data_attn_weight,
                batch_size, spatial_size, num_heads, channels, num_levels,
                num_query, num_point, grad_value, grad_sampling_loc,
                grad_attn_weight);
        break;
      case 16:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,
                                                                      16>
            <<<grid_dim, block_dim, 0, stream>>>(
                total_elems, grad_col, data_value, data_spatial_shapes,
                data_level_start_index, data_sampling_loc, data_attn_weight,
                batch_size, spatial_size, num_heads, channels, num_levels,
                num_query, num_point, grad_value, grad_sampling_loc,
                grad_attn_weight);
        break;
      case 32:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,
                                                                      32>
            <<<grid_dim, block_dim, 0, stream>>>(
                total_elems, grad_col, data_value, data_spatial_shapes,
                data_level_start_index, data_sampling_loc, data_attn_weight,
                batch_size, spatial_size, num_heads, channels, num_levels,
                num_query, num_point, grad_value, grad_sampling_loc,
                grad_attn_weight);
        break;
      case 64:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,
                                                                      64>
            <<<grid_dim, block_dim, 0, stream>>>(
                total_elems, grad_col, data_value, data_spatial_shapes,
                data_level_start_index, data_sampling_loc, data_attn_weight,
                batch_size, spatial_size, num_heads, channels, num_levels,
                num_query, num_point, grad_value, grad_sampling_loc,
                grad_attn_weight);
        break;
      case 128:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,
                                                                      128>
            <<<grid_dim, block_dim, 0, stream>>>(
                total_elems, grad_col, data_value, data_spatial_shapes,
                data_level_start_index, data_sampling_loc, data_attn_weight,
                batch_size, spatial_size, num_heads, channels, num_levels,
                num_query, num_point, grad_value, grad_sampling_loc,
                grad_attn_weight);
        break;
      case 256:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,
                                                                      256>
            <<<grid_dim, block_dim, 0, stream>>>(
                total_elems, grad_col, data_value, data_spatial_shapes,
                data_level_start_index, data_sampling_loc, data_attn_weight,
                batch_size, spatial_size, num_heads, channels, num_levels,
                num_query, num_point, grad_value, grad_sampling_loc,
                grad_attn_weight);
        break;
      case 512:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,
                                                                      512>
            <<<grid_dim, block_dim, 0, stream>>>(
                total_elems, grad_col, data_value, data_spatial_shapes,
                data_level_start_index, data_sampling_loc, data_attn_weight,
                batch_size, spatial_size, num_heads, channels, num_levels,
                num_query, num_point, grad_value, grad_sampling_loc,
                grad_attn_weight);
        break;
      default:
        // No compile-time block-size specialisation for this channel count.
        if (channels < 64) {
          ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
              <<<grid_dim, block_dim, dyn_shm_bytes, stream>>>(
                  total_elems, grad_col, data_value, data_spatial_shapes,
                  data_level_start_index, data_sampling_loc, data_attn_weight,
                  batch_size, spatial_size, num_heads, channels, num_levels,
                  num_query, num_point, grad_value, grad_sampling_loc,
                  grad_attn_weight);
        } else {
          ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
              <<<grid_dim, block_dim, dyn_shm_bytes, stream>>>(
                  total_elems, grad_col, data_value, data_spatial_shapes,
                  data_level_start_index, data_sampling_loc, data_attn_weight,
                  batch_size, spatial_size, num_heads, channels, num_levels,
                  num_query, num_point, grad_value, grad_sampling_loc,
                  grad_attn_weight);
        }
    }
  } else {
    // `-` binds tighter than `&`; the parentheses only make the mask explicit.
    const bool low_bits_clear = (channels & (THREADS_PER_BLOCK - 1)) == 0;
    if (low_bits_clear) {
      ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
          <<<grid_dim, block_dim, dyn_shm_bytes, stream>>>(
              total_elems, grad_col, data_value, data_spatial_shapes,
              data_level_start_index, data_sampling_loc, data_attn_weight,
              batch_size, spatial_size, num_heads, channels, num_levels,
              num_query, num_point, grad_value, grad_sampling_loc,
              grad_attn_weight);
    } else {
      ms_deformable_col2im_gpu_kernel_gm<scalar_t>
          <<<grid_dim, block_dim, 0, stream>>>(
              total_elems, grad_col, data_value, data_spatial_shapes,
              data_level_start_index, data_sampling_loc, data_attn_weight,
              batch_size, spatial_size, num_heads, channels, num_levels,
              num_query, num_point, grad_value, grad_sampling_loc,
              grad_attn_weight);
    }
  }

  const musaError_t launch_status = musaGetLastError();
  if (launch_status != musaSuccess) {
    printf("error in ms_deformable_col2im_musa: %s\n",
           musaGetErrorString(launch_status));
  }
}

// Forward pass of multi-scale deformable attention on MUSA tensors.
//
// Dimensions are read from the inputs as follows:
//   value.size(0..3)       -> batch, spatial_size, num_heads, channels
//   spatial_shapes.size(0) -> num_levels
//   sampling_loc.size(1)   -> num_query
//   sampling_loc.size(4)   -> num_point
// spatial_shapes and level_start_index are accessed as int64 data; value,
// sampling_loc and attn_weight are accessed with value's floating dtype.
//
// The batch is processed in chunks of min(batch, im2col_step) samples, one
// kernel launch per chunk on the current MUSA stream.
//
// Returns a tensor of shape {batch, num_query, num_heads * channels} with
// value's options.
//
// Raises (through AT_ASSERTM) when an input is not contiguous, is not a
// PrivateUse1 (MUSA) tensor, when im2col_step is not positive, or when batch
// is not divisible by the effective step.
at::Tensor ms_deform_attn_musa_forward(const at::Tensor &value,
                                       const at::Tensor &spatial_shapes,
                                       const at::Tensor &level_start_index,
                                       const at::Tensor &sampling_loc,
                                       const at::Tensor &attn_weight,
                                       const int im2col_step) {
  AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
  AT_ASSERTM(spatial_shapes.is_contiguous(),
             "spatial_shapes tensor has to be contiguous");
  AT_ASSERTM(level_start_index.is_contiguous(),
             "level_start_index tensor has to be contiguous");
  AT_ASSERTM(sampling_loc.is_contiguous(),
             "sampling_loc tensor has to be contiguous");
  AT_ASSERTM(attn_weight.is_contiguous(),
             "attn_weight tensor has to be contiguous");

  AT_ASSERTM(value.is_privateuseone(), "value must be a MUSA tensor");
  AT_ASSERTM(spatial_shapes.is_privateuseone(), "spatial_shapes must be a MUSA tensor");
  AT_ASSERTM(level_start_index.is_privateuseone(),
             "level_start_index must be a MUSA tensor");
  AT_ASSERTM(sampling_loc.is_privateuseone(), "sampling_loc must be a MUSA tensor");
  AT_ASSERTM(attn_weight.is_privateuseone(), "attn_weight must be a MUSA tensor");

  // A non-positive step would make the modulo/division below undefined.
  AT_ASSERTM(im2col_step > 0, "im2col_step(", im2col_step,
             ") must be positive");

  const int batch = value.size(0);
  const int spatial_size = value.size(1);
  const int num_heads = value.size(2);
  const int channels = value.size(3);

  const int num_levels = spatial_shapes.size(0);

  const int num_query = sampling_loc.size(1);
  const int num_point = sampling_loc.size(4);

  // Empty batch: nothing to launch, and min(batch, im2col_step) would be 0,
  // which must not be used as a divisor.
  if (batch == 0) {
    return at::zeros({batch, num_query, num_heads * channels},
                     value.options());
  }

  const int im2col_step_ = std::min(batch, im2col_step);

  // AT_ASSERTM concatenates its message arguments; it does not interpret
  // printf-style format specifiers.
  AT_ASSERTM(batch % im2col_step_ == 0, "batch(", batch,
             ") must be divisible by im2col_step(", im2col_step_, ")");

  auto output =
      at::zeros({batch, num_query, num_heads, channels}, value.options());

  const int batch_n = im2col_step_;
  auto output_n = output.view(
      {batch / im2col_step_, batch_n, num_query, num_heads, channels});
  // Per-sample element counts, kept in 64 bits so that the pointer offsets
  // computed below cannot overflow int for large inputs.
  const int64_t per_value_size =
      static_cast<int64_t>(spatial_size) * num_heads * channels;
  const int64_t per_sample_loc_size =
      static_cast<int64_t>(num_query) * num_heads * num_levels * num_point * 2;
  const int64_t per_attn_weight_size =
      static_cast<int64_t>(num_query) * num_heads * num_levels * num_point;
  for (int n = 0; n < batch / im2col_step_; ++n) {
    // Output slice written by this chunk.
    auto columns = output_n.select(0, n);
    AT_DISPATCH_FLOATING_TYPES(
        value.scalar_type(), "ms_deform_attn_forward_musa", ([&] {
          ms_deformable_im2col_musa(
              c10::musa::getCurrentMUSAStream(),
              value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
              spatial_shapes.data_ptr<int64_t>(),
              level_start_index.data_ptr<int64_t>(),
              sampling_loc.data_ptr<scalar_t>() +
                  n * im2col_step_ * per_sample_loc_size,
              attn_weight.data_ptr<scalar_t>() +
                  n * im2col_step_ * per_attn_weight_size,
              batch_n, spatial_size, num_heads, channels, num_levels, num_query,
              num_point, columns.data_ptr<scalar_t>());
        }));
  }

  output = output.view({batch, num_query, num_heads * channels});

  return output;
}

// Backward pass of multi-scale deformable attention on MUSA tensors.
//
// Dimensions are read from the inputs as follows:
//   value.size(0..3)       -> batch, spatial_size, num_heads, channels
//   spatial_shapes.size(0) -> num_levels
//   sampling_loc.size(1)   -> num_query
//   sampling_loc.size(4)   -> num_point
//
// The batch is processed in chunks of min(batch, im2col_step) samples, one
// launch per chunk on the current MUSA stream. Results are written through
// the raw data pointers of grad_value, grad_sampling_loc and
// grad_attn_weight, at the same per-chunk offsets used for value,
// sampling_loc and attn_weight respectively.
// NOTE(review): the three gradient tensors are not validated here; they are
// presumably pre-allocated, contiguous and laid out like their forward
// counterparts - confirm against the caller.
//
// Raises (through AT_ASSERTM) when a checked input is not contiguous, is not
// a PrivateUse1 (MUSA) tensor, when im2col_step is not positive, or when
// batch is not divisible by the effective step.
void ms_deform_attn_musa_backward(
    const at::Tensor &value, const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index, const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight, const at::Tensor &grad_output,
    at::Tensor &grad_value, at::Tensor &grad_sampling_loc,
    at::Tensor &grad_attn_weight, const int im2col_step) {
  AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
  AT_ASSERTM(spatial_shapes.is_contiguous(),
             "spatial_shapes tensor has to be contiguous");
  AT_ASSERTM(level_start_index.is_contiguous(),
             "level_start_index tensor has to be contiguous");
  AT_ASSERTM(sampling_loc.is_contiguous(),
             "sampling_loc tensor has to be contiguous");
  AT_ASSERTM(attn_weight.is_contiguous(),
             "attn_weight tensor has to be contiguous");
  AT_ASSERTM(grad_output.is_contiguous(),
             "grad_output tensor has to be contiguous");

  AT_ASSERTM(value.is_privateuseone(), "value must be a MUSA tensor");
  AT_ASSERTM(spatial_shapes.is_privateuseone(), "spatial_shapes must be a MUSA tensor");
  AT_ASSERTM(level_start_index.is_privateuseone(),
             "level_start_index must be a MUSA tensor");
  AT_ASSERTM(sampling_loc.is_privateuseone(), "sampling_loc must be a MUSA tensor");
  AT_ASSERTM(attn_weight.is_privateuseone(), "attn_weight must be a MUSA tensor");
  AT_ASSERTM(grad_output.is_privateuseone(), "grad_output must be a MUSA tensor");

  // A non-positive step would make the modulo/division below undefined.
  AT_ASSERTM(im2col_step > 0, "im2col_step(", im2col_step,
             ") must be positive");

  const int batch = value.size(0);
  const int spatial_size = value.size(1);
  const int num_heads = value.size(2);
  const int channels = value.size(3);

  const int num_levels = spatial_shapes.size(0);

  const int num_query = sampling_loc.size(1);
  const int num_point = sampling_loc.size(4);

  // Empty batch: nothing to launch, and min(batch, im2col_step) would be 0,
  // which must not be used as a divisor.
  if (batch == 0) {
    return;
  }

  const int im2col_step_ = std::min(batch, im2col_step);

  // AT_ASSERTM concatenates its message arguments; it does not interpret
  // printf-style format specifiers.
  AT_ASSERTM(batch % im2col_step_ == 0, "batch(", batch,
             ") must be divisible by im2col_step(", im2col_step_, ")");

  const int batch_n = im2col_step_;
  // Per-sample element counts, kept in 64 bits so that the pointer offsets
  // computed below cannot overflow int for large inputs.
  const int64_t per_value_size =
      static_cast<int64_t>(spatial_size) * num_heads * channels;
  const int64_t per_sample_loc_size =
      static_cast<int64_t>(num_query) * num_heads * num_levels * num_point * 2;
  const int64_t per_attn_weight_size =
      static_cast<int64_t>(num_query) * num_heads * num_levels * num_point;
  auto grad_output_n = grad_output.view(
      {batch / im2col_step_, batch_n, num_query, num_heads, channels});

  for (int n = 0; n < batch / im2col_step_; ++n) {
    // Upstream-gradient slice consumed by this chunk.
    auto grad_output_g = grad_output_n.select(0, n);
    AT_DISPATCH_FLOATING_TYPES(
        value.scalar_type(), "ms_deform_attn_backward_musa", ([&] {
          ms_deformable_col2im_musa(
              c10::musa::getCurrentMUSAStream(),
              grad_output_g.data_ptr<scalar_t>(),
              value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
              spatial_shapes.data_ptr<int64_t>(),
              level_start_index.data_ptr<int64_t>(),
              sampling_loc.data_ptr<scalar_t>() +
                  n * im2col_step_ * per_sample_loc_size,
              attn_weight.data_ptr<scalar_t>() +
                  n * im2col_step_ * per_attn_weight_size,
              batch_n, spatial_size, num_heads, channels, num_levels, num_query,
              num_point,
              grad_value.data_ptr<scalar_t>() +
                  n * im2col_step_ * per_value_size,
              grad_sampling_loc.data_ptr<scalar_t>() +
                  n * im2col_step_ * per_sample_loc_size,
              grad_attn_weight.data_ptr<scalar_t>() +
                  n * im2col_step_ * per_attn_weight_size);
        }));
  }
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/musabind.cpp
================================================
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
// ---- assign_score_withk -----------------------------------------------------
// Kernel launchers: declared here, defined in another translation unit.
void AssignScoreWithKForwardMUSAKernelLauncher(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor &points, const Tensor &centers, const Tensor &scores,
    const Tensor &knn_idx, Tensor &output);

void AssignScoreWithKBackwardMUSAKernelLauncher(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor &grad_out, const Tensor &points, const Tensor &centers,
    const Tensor &scores, const Tensor &knn_idx, Tensor &grad_points,
    Tensor &grad_centers, Tensor &grad_scores);

// MUSA forward entry point: forwards every argument unchanged to the forward
// kernel launcher.
void assign_score_withk_forward_musa(int B, int N0, int N1, int M, int K, int O,
                                     int aggregate, const Tensor &points,
                                     const Tensor &centers,
                                     const Tensor &scores,
                                     const Tensor &knn_idx, Tensor &output) {
  AssignScoreWithKForwardMUSAKernelLauncher(
      B, N0, N1, M, K, O, aggregate, points, centers, scores, knn_idx, output);
}

// MUSA backward entry point: forwards every argument unchanged to the backward
// kernel launcher.
void assign_score_withk_backward_musa(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor &grad_out, const Tensor &points, const Tensor &centers,
    const Tensor &scores, const Tensor &knn_idx, Tensor &grad_points,
    Tensor &grad_centers, Tensor &grad_scores) {
  AssignScoreWithKBackwardMUSAKernelLauncher(
      B, N0, N1, M, K, O, aggregate, grad_out, points, centers, scores, knn_idx,
      grad_points, grad_centers, grad_scores);
}

// Device-dispatch entry points the MUSA implementations are registered under.
void assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,
                                     int aggregate, const Tensor &points,
                                     const Tensor &centers,
                                     const Tensor &scores,
                                     const Tensor &knn_idx, Tensor &output);

void assign_score_withk_backward_impl(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor &grad_out, const Tensor &points, const Tensor &centers,
    const Tensor &scores, const Tensor &knn_idx, Tensor &grad_points,
    Tensor &grad_centers, Tensor &grad_scores);

REGISTER_DEVICE_IMPL(assign_score_withk_forward_impl, MUSA,
                     assign_score_withk_forward_musa);
REGISTER_DEVICE_IMPL(assign_score_withk_backward_impl, MUSA,
                     assign_score_withk_backward_musa);

// ---- ball_query -------------------------------------------------------------
// Kernel launcher: declared here, defined in another translation unit.
void BallQueryForwardMUSAKernelLauncher(int b, int n, int m, float min_radius,
                                        float max_radius, int nsample,
                                        const Tensor new_xyz, const Tensor xyz,
                                        Tensor idx);

// MUSA entry point: forwards every argument unchanged to the kernel launcher.
void ball_query_forward_musa(int b, int n, int m, float min_radius,
                             float max_radius, int nsample,
                             const Tensor new_xyz, const Tensor xyz,
                             Tensor idx) {
  BallQueryForwardMUSAKernelLauncher(b, n, m, min_radius, max_radius, nsample,
                                     new_xyz, xyz, idx);
}

// Device-dispatch entry point the MUSA implementation is registered under.
void ball_query_forward_impl(int b, int n, int m, float min_radius,
                             float max_radius, int nsample,
                             const Tensor new_xyz, const Tensor xyz,
                             Tensor idx);
REGISTER_DEVICE_IMPL(ball_query_forward_impl, MUSA, ball_query_forward_musa);

// ---- stack_ball_query -------------------------------------------------------
// Kernel launcher: declared here, defined in another translation unit.
void StackBallQueryForwardMUSAKernelLauncher(float max_radius, int nsample,
                                             const Tensor new_xyz,
                                             const Tensor new_xyz_batch_cnt,
                                             const Tensor xyz,
                                             const Tensor xyz_batch_cnt,
                                             Tensor idx);

// MUSA entry point: forwards every argument unchanged to the kernel launcher.
void stack_ball_query_forward_musa(float max_radius, int nsample,
                                   const Tensor new_xyz,
                                   const Tensor new_xyz_batch_cnt,
                                   const Tensor xyz, const Tensor xyz_batch_cnt,
                                   Tensor idx) {
  StackBallQueryForwardMUSAKernelLauncher(
      max_radius, nsample, new_xyz, new_xyz_batch_cnt, xyz, xyz_batch_cnt, idx);
}

// Device-dispatch entry point the MUSA implementation is registered under.
void stack_ball_query_forward_impl(float max_radius, int nsample,
                                   const Tensor new_xyz,
                                   const Tensor new_xyz_batch_cnt,
                                   const Tensor xyz, const Tensor xyz_batch_cnt,
                                   Tensor idx);
REGISTER_DEVICE_IMPL(stack_ball_query_forward_impl, MUSA,
                     stack_ball_query_forward_musa);

// ---- bbox_overlaps ----------------------------------------------------------
// Forward declarations: the kernel launcher (defined in another translation
// unit) and the device-dispatch entry point.
void BBoxOverlapsMUSAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
                                    Tensor ious, const int mode,
                                    const bool aligned, const int offset);
void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                        const int mode, const bool aligned, const int offset);

// MUSA entry point: hands all arguments, unchanged, to the kernel launcher.
void bbox_overlaps_musa(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                        const int mode, const bool aligned, const int offset) {
  BBoxOverlapsMUSAKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset);
}

REGISTER_DEVICE_IMPL(bbox_overlaps_impl, MUSA, bbox_overlaps_musa);

// ---- border_align -----------------------------------------------------------
// Kernel launchers: declared here, defined in another translation unit.
void BorderAlignForwardMUSAKernelLauncher(const Tensor &input,
                                          const Tensor &boxes, Tensor output,
                                          Tensor argmax_idx,
                                          const int pool_size);
void BorderAlignBackwardMUSAKernelLauncher(const Tensor &grad_output,
                                           const Tensor &boxes,
                                           const Tensor &argmax_idx,
                                           Tensor grad_input,
                                           const int pool_size);

// Device-dispatch entry points the MUSA implementations are registered under.
void border_align_forward_impl(const Tensor &input, const Tensor &boxes,
                               Tensor output, Tensor argmax_idx,
                               const int pool_size);
void border_align_backward_impl(const Tensor &grad_output, const Tensor &boxes,
                                const Tensor &argmax_idx, Tensor grad_input,
                                const int pool_size);

// MUSA forward entry point: hands all arguments, unchanged, to the forward
// kernel launcher.
void border_align_forward_musa(const Tensor &input, const Tensor &boxes,
                               Tensor output, Tensor argmax_idx,
                               const int pool_size) {
  BorderAlignForwardMUSAKernelLauncher(input, boxes, output, argmax_idx,
                                       pool_size);
}

// MUSA backward entry point: hands all arguments, unchanged, to the backward
// kernel launcher.
void border_align_backward_musa(const Tensor &grad_output, const Tensor &boxes,
                                const Tensor &argmax_idx, Tensor grad_input,
                                const int pool_size) {
  BorderAlignBackwardMUSAKernelLauncher(grad_output, boxes, argmax_idx,
                                        grad_input, pool_size);
}

REGISTER_DEVICE_IMPL(border_align_forward_impl, MUSA,
                     border_align_forward_musa);
REGISTER_DEVICE_IMPL(border_align_backward_impl, MUSA,
                     border_align_backward_musa);

// ---- box_iou_rotated --------------------------------------------------------
// MUSA implementation; only declared in this file (no wrapper body here), so
// its definition must come from another translation unit.
void box_iou_rotated_musa(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                          const int mode_flag, const bool aligned);

// Device-dispatch entry point the MUSA implementation is registered under.
void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                          const int mode_flag, const bool aligned);
REGISTER_DEVICE_IMPL(box_iou_rotated_impl, MUSA, box_iou_rotated_musa);

// ---- box_iou_quadri ---------------------------------------------------------
// MUSA implementation; only declared in this file (no wrapper body here), so
// its definition must come from another translation unit.
void box_iou_quadri_musa(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                         const int mode_flag, const bool aligned);

// Device-dispatch entry point the MUSA implementation is registered under.
void box_iou_quadri_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                         const int mode_flag, const bool aligned);
REGISTER_DEVICE_IMPL(box_iou_quadri_impl, MUSA, box_iou_quadri_musa);

#if ((!defined(MUSA_ARCH)) || (defined(MUSA_ARCH)) && (MUSA_ARCH > 21))

void CARAFEForwardMUSAKernelLauncher(const Tensor features, const Tensor masks,
                                     Tensor rfeatures, Tensor routput,
                                     Tensor rmasks, Tensor output,
                                     const int kernel_size,
                                     const int group_size,
                                     const int scale_factor);

void CARAFEBackwardMUSAKernelLauncher(
    const Tensor top_grad, const Tensor rfeatures, const Tensor masks,
    Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad,
    Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad,
    const int kernel_size, const int group_size, const int scale_factor);

void carafe_forward_musa(Tensor features, Tensor masks, Tensor rfeatures,
                         Tensor routput, Tensor rmasks, Tensor output,
                         int kernel_size, int group_size, int scale_factor) {
  CARAFEForwardMUSAKernelLauncher(features, masks, rfeatures, routput, rmasks,
                                  output, kernel_size, group_size,
                                  scale_factor);
}

void carafe_backward_musa(Tensor top_grad, Tensor rfeatures, Tensor masks,
                          Tensor rtop_grad, Tensor rbottom_grad_hs,
                          Tensor rbottom_grad, Tensor rmask_grad,
                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,
                          int group_size, int scale_factor) {
  CARAFEBackwardMUSAKernelLauncher(top_grad, rfeatures, masks, rtop_grad,
                                   rbottom_grad_hs, rbottom_grad, rmask_grad,
                                   bottom_grad, mask_grad, kernel_size,
                                   group_size, scale_factor);
}

void carafe_forward_impl(Tensor features, Tensor masks, Tensor rfeatures,
                         Tensor routput, Tensor rmasks, Tensor output,
                         int kernel_size, int group_size, int scale_factor);

void carafe_backward_impl(Tensor top_grad, Tensor rfeatures, Tensor masks,
                          Tensor rtop_grad, Tensor rbottom_grad_hs,
                          Tensor rbottom_grad, Tensor rmask_grad,
                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,
                          int group_size, int scale_factor);

REGISTER_DEVICE_IMPL(carafe_forward_impl, MUSA, carafe_forward_musa);
REGISTER_DEVICE_IMPL(carafe_backward_impl, MUSA, carafe_backward_musa);
#endif

// ---- carafe_naive -----------------------------------------------------------
// Kernel launchers: declared here, defined in another translation unit.
void CARAFENAIVEForwardMUSAKernelLauncher(const Tensor features,
                                          const Tensor masks, Tensor output,
                                          const int kernel_size,
                                          const int group_size,
                                          const int scale_factor);
void CARAFENAIVEBackwardMUSAKernelLauncher(
    const Tensor top_grad, const Tensor features, const Tensor masks,
    Tensor bottom_grad, Tensor mask_grad, const int kernel_size,
    const int group_size, const int scale_factor);

// Device-dispatch entry points the MUSA implementations are registered under.
void carafe_naive_forward_impl(Tensor features, Tensor masks, Tensor output,
                               int kernel_size, int group_size,
                               int scale_factor);
void carafe_naive_backward_impl(Tensor top_grad, Tensor features, Tensor masks,
                                Tensor bottom_grad, Tensor mask_grad,
                                int kernel_size, int group_size,
                                int scale_factor);

// MUSA forward entry point: hands all arguments, unchanged, to the forward
// kernel launcher.
void carafe_naive_forward_musa(Tensor features, Tensor masks, Tensor output,
                               int kernel_size, int group_size,
                               int scale_factor) {
  CARAFENAIVEForwardMUSAKernelLauncher(features, masks, output, kernel_size,
                                       group_size, scale_factor);
}

// MUSA backward entry point: hands all arguments, unchanged, to the backward
// kernel launcher.
void carafe_naive_backward_musa(Tensor top_grad, Tensor features, Tensor masks,
                                Tensor bottom_grad, Tensor mask_grad,
                                int kernel_size, int group_size,
                                int scale_factor) {
  CARAFENAIVEBackwardMUSAKernelLauncher(top_grad, features, masks, bottom_grad,
                                        mask_grad, kernel_size, group_size,
                                        scale_factor);
}

REGISTER_DEVICE_IMPL(carafe_naive_forward_impl, MUSA,
                     carafe_naive_forward_musa);
REGISTER_DEVICE_IMPL(carafe_naive_backward_impl, MUSA,
                     carafe_naive_backward_musa);

void CorrelationForwardMUSAKernelLauncher(Tensor input1, Tensor input2,
                                          Tensor output, int kH, int kW,
                                          int patchH, int patchW, int padH,
                                          int padW, int dilationH,
                                          int dilationW, int dilation_patchH,
                                          int dilation_patchW, int dH, int dW);

void CorrelationBackwardMUSAKernelLauncher(Tensor grad_output, Tensor input1,
                                           Tensor input2, Tensor grad_input1,
                                           Tensor grad_input2, int kH, int kW,
                                           int patchH, int patchW, int padH,
                                           int padW, int dilationH,
                                           int dilationW, int dilation_patchH,
                                           int dilation_patchW, int dH, int dW);

void correlation_forward_musa(Tensor input1, Tensor input2, Tensor output,
                              int kH, int kW, int patchH, int patchW, int padH,
                              int padW, int dilationH, int dilationW,
                              int dilation_patchH, int dilation_patchW, int dH,
                              int dW) {
  CorrelationForwardMUSAKernelLauncher(
      input1, input2, output, kH, kW, patchH, patchW, padH, padW, dilationH,
      dilationW, dilation_patchH, dilation_patchW, dH, dW);
}

void correlation_backward_musa(Tensor grad_output, Tensor input1, Tensor input2,
                               Tensor grad_input1, Tensor grad_input2, int kH,
                               int kW, int patchH, int patchW, int padH,
                               int padW, int dilationH, int dilationW,
                               int dilation_patchH, int dilation_patchW, int dH,
                               int dW) {
  CorrelationBackwardMUSAKernelLauncher(
      grad_output, input1, input2, grad_input1, grad_input2, kH, kW, patchH,
      patchW, padH, padW, dilationH, dilationW, dilation_patchH,
      dilation_patchW, dH, dW);
}

void correlation_forward_impl(Tensor input1, Tensor input2, Tensor output,
                              int kH, int kW, int patchH, int patchW, int padH,
                              int padW, int dilationH, int dilationW,
                              int dilation_patchH, int dilation_patchW, int dH,
                              int dW);

void correlation_backward_impl(Tensor grad_output, Tensor input1, Tensor input2,
                               Tensor grad_input1, Tensor grad_input2, int kH,
                               int kW, int patchH, int patchW, int padH,
                               int padW, int dilationH, int dilationW,
                               int dilation_patchH, int dilation_patchW, int dH,
                               int dW);

REGISTER_DEVICE_IMPL(correlation_forward_impl, MUSA, correlation_forward_musa);
REGISTER_DEVICE_IMPL(correlation_backward_impl, MUSA,
                     correlation_backward_musa);

// MUSA deformable-convolution helpers. Only declared in this section (no
// wrapper bodies here); they are passed directly to REGISTER_DEVICE_IMPL
// further down, so their definitions must be provided elsewhere.
void deformable_im2col_musa(Tensor data_im, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor data_col);

// Same parameter list as deformable_im2col_musa, except the column tensor
// comes first and the trailing tensor is grad_im.
void deformable_col2im_musa(Tensor data_col, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor grad_im);

// Takes both data_col and data_im; the trailing tensor is grad_offset.
void deformable_col2im_coord_musa(
    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
    const int height, const int width, const int ksize_h, const int ksize_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int parallel_imgs,
    const int deformable_group, Tensor grad_offset);

// Declarations of the deformable-convolution `*_impl` functions; each has the
// same signature as the `*_musa` function registered for it below. Their
// definitions are not in this section.
void deformable_im2col_impl(Tensor data_im, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor data_col);

void deformable_col2im_impl(Tensor data_col, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor grad_im);

void deformable_col2im_coord_impl(
    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
    const int height, const int width, const int ksize_h, const int ksize_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int parallel_imgs,
    const int deformable_group, Tensor grad_offset);

// Associate each `*_impl` function with its MUSA counterpart.
// NOTE(review): REGISTER_DEVICE_IMPL is defined elsewhere; presumably it adds
// the function to a per-device dispatch table -- confirm against the macro.
REGISTER_DEVICE_IMPL(deformable_im2col_impl, MUSA, deformable_im2col_musa);
REGISTER_DEVICE_IMPL(deformable_col2im_impl, MUSA, deformable_col2im_musa);
REGISTER_DEVICE_IMPL(deformable_col2im_coord_impl, MUSA,
                     deformable_col2im_coord_musa);

// Kernel launchers for deformable RoI pooling. Declared only; they are called
// by the deform_roi_pool_*_musa wrappers and defined outside this section.
void DeformRoIPoolForwardMUSAKernelLauncher(Tensor input, Tensor rois,
                                            Tensor offset, Tensor output,
                                            int pooled_height, int pooled_width,
                                            float spatial_scale,
                                            int sampling_ratio, float gamma);

void DeformRoIPoolBackwardMUSAKernelLauncher(
    Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
    Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
    float spatial_scale, int sampling_ratio, float gamma);

// MUSA backend for deformable RoI pooling (forward): passes all arguments
// through, in order, to DeformRoIPoolForwardMUSAKernelLauncher.
void deform_roi_pool_forward_musa(Tensor input, Tensor rois, Tensor offset,
                                  Tensor output, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int sampling_ratio, float gamma) {
  DeformRoIPoolForwardMUSAKernelLauncher(
      input, rois, offset, output, pooled_height, pooled_width, spatial_scale,
      sampling_ratio, gamma);
}

// MUSA backend for deformable RoI pooling (backward): passes all arguments
// through, in order, to DeformRoIPoolBackwardMUSAKernelLauncher.
void deform_roi_pool_backward_musa(Tensor grad_output, Tensor input,
                                   Tensor rois, Tensor offset,
                                   Tensor grad_input, Tensor grad_offset,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale, int sampling_ratio,
                                   float gamma) {
  DeformRoIPoolBackwardMUSAKernelLauncher(grad_output, input, rois, offset,
                                          grad_input, grad_offset,
                                          pooled_height, pooled_width,
                                          spatial_scale, sampling_ratio, gamma);
}

// Declarations of the deformable RoI pooling `*_impl` functions; signatures
// match the MUSA wrappers registered for them below. Definitions are not in
// this section.
void deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,
                                  Tensor output, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int sampling_ratio, float gamma);

void deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,
                                   Tensor rois, Tensor offset,
                                   Tensor grad_input, Tensor grad_offset,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale, int sampling_ratio,
                                   float gamma);

// Associate each `*_impl` function with its MUSA wrapper.
// NOTE(review): REGISTER_DEVICE_IMPL is defined elsewhere; presumably it adds
// the wrapper to a per-device dispatch table -- confirm against the macro.
REGISTER_DEVICE_IMPL(deform_roi_pool_forward_impl, MUSA,
                     deform_roi_pool_forward_musa);
REGISTER_DEVICE_IMPL(deform_roi_pool_backward_impl, MUSA,
                     deform_roi_pool_backward_musa);

// Kernel launchers for sigmoid/softmax focal loss. Declared only; they are
// called by the *_focal_loss_*_musa wrappers and defined outside this section.
void SigmoidFocalLossForwardMUSAKernelLauncher(Tensor input, Tensor target,
                                               Tensor weight, Tensor output,
                                               const float gamma,
                                               const float alpha);

void SigmoidFocalLossBackwardMUSAKernelLauncher(Tensor input, Tensor target,
                                                Tensor weight,
                                                Tensor grad_input,
                                                const float gamma,
                                                const float alpha);

void SoftmaxFocalLossForwardMUSAKernelLauncher(Tensor softmax, Tensor target,
                                               Tensor weight, Tensor output,
                                               const float gamma,
                                               const float alpha);

// Unlike the sigmoid backward launcher, this one takes an extra `buff` tensor.
void SoftmaxFocalLossBackwardMUSAKernelLauncher(Tensor softmax, Tensor target,
                                                Tensor weight, Tensor buff,
                                                Tensor grad_input,
                                                const float gamma,
                                                const float alpha);

// MUSA backend for sigmoid focal loss (forward): forwards all arguments,
// unchanged, to SigmoidFocalLossForwardMUSAKernelLauncher.
void sigmoid_focal_loss_forward_musa(Tensor input, Tensor target,
                                     Tensor weight, Tensor output,
                                     float gamma, float alpha) {
  SigmoidFocalLossForwardMUSAKernelLauncher(
      input, target, weight, output, gamma, alpha);
}

// MUSA backend for sigmoid focal loss (backward): forwards all arguments,
// unchanged, to SigmoidFocalLossBackwardMUSAKernelLauncher.
void sigmoid_focal_loss_backward_musa(Tensor input, Tensor target,
                                      Tensor weight, Tensor grad_input,
                                      float gamma, float alpha) {
  SigmoidFocalLossBackwardMUSAKernelLauncher(
      input, target, weight, grad_input, gamma, alpha);
}

// MUSA backend for softmax focal loss (forward): forwards all arguments,
// unchanged, to SoftmaxFocalLossForwardMUSAKernelLauncher. The wrapper's
// `input` is received by the launcher under the parameter name `softmax`.
void softmax_focal_loss_forward_musa(Tensor input, Tensor target,
                                     Tensor weight, Tensor output,
                                     float gamma, float alpha) {
  SoftmaxFocalLossForwardMUSAKernelLauncher(
      input, target, weight, output, gamma, alpha);
}

// MUSA backend for softmax focal loss (backward): forwards all arguments,
// unchanged, to SoftmaxFocalLossBackwardMUSAKernelLauncher. The wrapper's
// `input` is received by the launcher under the parameter name `softmax`.
void softmax_focal_loss_backward_musa(Tensor input, Tensor target,
                                      Tensor weight, Tensor buff,
                                      Tensor grad_input, float gamma,
                                      float alpha) {
  SoftmaxFocalLossBackwardMUSAKernelLauncher(
      input, target, weight, buff, grad_input, gamma, alpha);
}

// Declarations of the focal loss `*_impl` functions; signatures match the
// MUSA wrappers registered for them below. Definitions are not in this
// section.
void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha);

void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
                                      Tensor weight, Tensor grad_input,
                                      float gamma, float alpha);

void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha);

void softmax_focal_loss_backward_impl(Tensor input, Tensor target,
                                      Tensor weight, Tensor buff,
                                      Tensor grad_input, float gamma,
                                      float alpha);

// Associate each `*_impl` function with its MUSA wrapper.
// NOTE(review): REGISTER_DEVICE_IMPL is defined elsewhere; presumably it adds
// the wrapper to a per-device dispatch table -- confirm against the macro.
REGISTER_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, MUSA,
                     sigmoid_focal_loss_forward_musa);
REGISTER_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, MUSA,
                     sigmoid_focal_loss_backward_musa);
REGISTER_DEVICE_IMPL(softmax_focal_loss_forward_impl, MUSA,
                     softmax_focal_loss_forward_musa);
REGISTER_DEVICE_IMPL(softmax_focal_loss_backward_impl, MUSA,
                     softmax_focal_loss_backward_musa);

// Kernel launchers for furthest point sampling. Declared only; defined
// outside this section. Both take raw pointers rather than Tensor objects.
void FurthestPointSamplingForwardMUSAKernelLauncher(int b, int n, int m,
                                                    const float *dataset,
                                                    float *temp, int *idxs);

void FurthestPointSamplingWithDistForwardMUSAKernelLauncher(
    int b, int n, int m, const float *dataset, float *temp, int *idxs);

// MUSA backend for furthest point sampling: extracts raw data pointers from
// the tensors (float for points_tensor and temp_tensor, int for idx_tensor)
// and passes them, after the sizes b, n, m, to
// FurthestPointSamplingForwardMUSAKernelLauncher.
void furthest_point_sampling_forward_musa(Tensor points_tensor,
                                          Tensor temp_tensor, Tensor idx_tensor,
                                          int b, int n, int m) {
  // Pointers are extracted in the same order as the launcher consumes them.
  const float *points_ptr = points_tensor.data_ptr<float>();
  float *temp_ptr = temp_tensor.data_ptr<float>();
  int *idx_ptr = idx_tensor.data_ptr<int>();

  FurthestPointSamplingForwardMUSAKernelLauncher(b, n, m, points_ptr, temp_ptr,
                                                 idx_ptr);
}

// MUSA backend for the "with dist" variant of furthest point sampling:
// extracts raw data pointers from the tensors (float for points_tensor and
// temp_tensor, int for idx_tensor) and passes them, after the sizes b, n, m,
// to FurthestPointSamplingWithDistForwardMUSAKernelLauncher.
void furthest_point_sampling_with_dist_forward_musa(Tensor points_tensor,
                                                    Tensor temp_tensor,
                                                    Tensor idx_tensor, int b,
                                                    int n, int m) {
  // Pointers are extracted in the same order as the launcher consumes them.
  const float *points_ptr = points_tensor.data_ptr<float>();
  float *temp_ptr = temp_tensor.data_ptr<float>();
  int *idx_ptr = idx_tensor.data_ptr<int>();

  FurthestPointSamplingWithDistForwardMUSAKernelLauncher(
      b, n, m, points_ptr, temp_ptr, idx_ptr);
}

// Declarations of the furthest point sampling `*_impl` functions; signatures
// match the MUSA wrappers registered for them below. Definitions are not in
// this section.
void furthest_point_sampling_forward_impl(Tensor points_tensor,
                                          Tensor temp_tensor, Tensor idx_tensor,
                                          int b, int n, int m);

void furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor,
                                                    Tensor temp_tensor,
                                                    Tensor idx_tensor, int b,
                                                    int n, int m);

// Associate each `*_impl` function with its MUSA wrapper.
// NOTE(review): REGISTER_DEVICE_IMPL is defined elsewhere; presumably it adds
// the wrapper to a per-device dispatch table -- confirm against the macro.
REGISTER_DEVICE_IMPL(furthest_point_sampling_forward_impl, MUSA,
                     furthest_point_sampling_forward_musa);
REGISTER_DEVICE_IMPL(furthest_point_sampling_with_dist_forward_impl, MUSA,
                     furthest_point_sampling_with_dist_forward_musa);

// fused_bias_leakyrelu_op is only declared here (defined elsewhere) and is
// registered directly, without a `*_musa` wrapper in between.
torch::Tensor fused_bias_leakyrelu_op(const torch::Tensor &input,
                                      const torch::Tensor &bias,
                                      const torch::Tensor &refer, int act,
                                      int grad, float alpha, float scale);

// Declaration of the `*_impl` function with the identical signature.
torch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor &input,
                                           const torch::Tensor &bias,
                                           const torch::Tensor &refer, int act,
                                           int grad, float alpha, float scale);
// NOTE(review): REGISTER_DEVICE_IMPL is defined elsewhere; presumably it adds
// the function to a per-device dispatch table -- confirm against the macro.
REGISTER_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, MUSA,
                     fused_bias_leakyrelu_op);

// Declaration of the `*_impl` function for the bias-act op; defined elsewhere.
torch::Tensor bias_act_op_impl(const torch::Tensor &input,
                               const torch::Tensor &bias,
                               const torch::Tensor &xref,
                               const torch::Tensor &yref,
                               const torch::Tensor &dy, int grad, int dim,
                               int act, float alpha, float gain, float clamp);

// bias_act_op has the identical signature; it is only declared here and is
// registered directly, without a `*_musa` wrapper in between.
torch::Tensor bias_act_op(const torch::Tensor &input, const torch::Tensor &bias,
                          const torch::Tensor &xref, const torch::Tensor &yref,
                          const torch::Tensor &dy, int grad, int dim, int act,
                          float alpha, float gain, float clamp);

// NOTE(review): REGISTER_DEVICE_IMPL is defined elsewhere; presumably it adds
// the function to a per-device dispatch table -- confirm against the macro.
REGISTER_DEVICE_IMPL(bias_act_op_impl, MUSA, bias_act_op);

// Declaration of the `*_impl` function for the filtered-lrelu activation op;
// defined elsewhere.
torch::Tensor filtered_lrelu_act_op_impl(torch::Tensor x, torch::Tensor si,
                                         int sx, int sy, float gain,
                                         float slope, float clamp,
                                         bool writeSigns);

// filtered_lrelu_act_op has the identical signature; it is only declared here
// and is registered directly, without a `*_musa` wrapper in between.
torch::Tensor filtered_lrelu_act_op(torch::Tensor x, torch::Tensor si, int sx,
                                    int sy, float gain, float slope,
                                    float clamp, bool writeSigns);

// NOTE(review): REGISTER_DEVICE_IMPL is defined elsewhere; presumably it adds
// the function to a per-device dispatch table -- confirm against the macro.
REGISTER_DEVICE_IMPL(filtered_lrelu_act_op_impl, MUSA, filtered_lrelu_act_op);

// Kernel launchers for gather_points. Declared only; they are called by the
// gather_points_*_musa wrappers and defined outside this section.
void GatherPointsForwardMUSAKernelLauncher(int b, int c, int n, int npoints,
                                           const Tensor points,
                                           const Tensor idx, Tensor out);

void GatherPointsBackwardMUSAKernelLauncher(int b, int c, int n, int npoints,
                                            const Tensor grad_out,
                                            const Tensor idx,
                                            Tensor grad_points);

// MUSA backend for gather_points (forward): forwards all arguments,
// unchanged, to GatherPointsForwardMUSAKernelLauncher.
void gather_points_forward_musa(int b, int c, int n, int npoints,
                                const Tensor points, const Tensor idx,
                                Tensor out) {
  GatherPointsForwardMUSAKernelLauncher(b, c, n, npoints, points, idx, out);
}

// MUSA backend for gather_points (backward): forwards all arguments,
// unchanged, to GatherPointsBackwardMUSAKernelLauncher.
void gather_points_backward_musa(int b, int c, int n, int npoints,
                                 const Tensor grad_out, const Tensor idx,
                                 Tensor grad_points) {
  GatherPointsBackwardMUSAKernelLauncher(b, c, n, npoints, grad_out, idx,
                                         grad_points);
}

// Declarations of the gather_points `*_impl` functions; signatures match the
// MUSA wrappers registered for them below. Definitions are not in this
// section.
void gather_points_forward_impl(int b, int c, int n, int npoints,
                                const Tensor points, const Tensor idx,
                                Tensor out);

void gather_points_backward_impl(int b, int c, int n, int npoints,
                                 const Tensor grad_out, const Tensor idx,
                                 Tensor grad_points);

// Associate each `*_impl` function with its MUSA wrapper.
// NOTE(review): REGISTER_DEVICE_IMPL is defined elsewhere; presumably it adds
// the wrapper to a per-device dispatch table -- confirm against the macro.
REGISTER_DEVICE_IMPL(gather_points_forward_impl, MUSA,
                     gather_points_forward_musa);
REGISTER_DEVICE_IMPL(gather_points_backward_impl, MUSA,
                     gather_points_backward_musa);

// Kernel launchers for group_points. Declared only; they are called by the
// group_points_*_musa wrappers and defined outside this section.
void GroupPointsForwardMUSAKernelLauncher(int b, int c, int n, int npoints,
                                          int nsample, const Tensor points,
                                          const Tensor idx, Tensor out);

void GroupPointsBackwardMUSAKernelLauncher(int b, int c, int n, int npoints,
                                           int nsample, const Tensor grad_out,
                                           const Tensor idx,
                                           Tensor grad_points);

// MUSA backend for group_points (forward): forwards all arguments,
// unchanged, to GroupPointsForwardMUSAKernelLauncher.
void group_points_forward_musa(int b, int c, int n, int npoints, int nsample,
                               const Tensor points, const Tensor idx,
                               Tensor out) {
  GroupPointsForwardMUSAKernelLauncher(b, c, n, npoints, nsample, points, idx,
                                       out);
}

// MUSA backend for group_points (backward): forwards all arguments,
// unchanged, to GroupPointsBackwardMUSAKernelLauncher.
void group_points_backward_musa(int b, int c, int n, int npoints, int nsample,
                                const Tensor grad_out, const Tensor idx,
                                Tensor grad_points) {
  GroupPointsBackwardMUSAKernelLauncher(b, c, n, npoints, nsample, grad_out,
                                        idx, grad_points);
}

// Declarations of the group_points `*_impl` functions; signatures match the
// MUSA wrappers registered for them below. Definitions are not in this
// section.
void group_points_forward_impl(int b, int c, int n, int npoints, int nsample,
                               const Tensor points, const Tensor idx,
                               Tensor out);

void group_points_backward_impl(int b, int c, int n, int npoints, int nsample,
                                const Tensor grad_out, const Tensor idx,
                                Tensor grad_points);

// Associate each `*_impl` function with its MUSA wrapper.
// NOTE(review): REGISTER_DEVICE_IMPL is defined elsewhere; presumably it adds
// the wrapper to a per-device dispatch table -- confirm against the macro.
REGISTER_DEVICE_IMPL(group_points_forward_impl, MUSA,
                     group_points_forward_musa);
REGISTER_DEVICE_IMPL(group_points_backward_impl, MUSA,
                     group_points_backward_musa);

// Kernel launchers for stack_group_points. Declared only; they are called by
// the stack_group_points_*_musa wrappers and defined outside this section.
void StackGroupPointsForwardMUSAKernelLauncher(
    int b, int c, int m, int nsample, const Tensor features_tensor,
    const Tensor features_batch_cnt_tensor, const Tensor idx_tensor,
    const Tensor idx_batch_cnt_tensor, Tensor out_tensor);
// The backward launcher takes an additional size `n` and orders its tensor
// arguments differently from the forward launcher.
void StackGroupPointsBackwardMUSAKernelLauncher(
    int b, int c, int m, int n, int nsample, const Tensor grad_out_tensor,
    const Tensor idx_tensor, const Tensor idx_batch_cnt_tensor,
    const Tensor features_batch_cnt_tensor, Tensor grad_features_tensor);

// MUSA backend for stack_group_points (forward): forwards all arguments,
// unchanged, to StackGroupPointsForwardMUSAKernelLauncher.
void stack_group_points_forward_musa(int b, int c, int m, int nsample,
                                     const Tensor features_tensor,
                                     const Tensor features_batch_cnt_tensor,
                                     const Tensor idx_tensor,
                                     const Tensor idx_batch_cnt_tensor,
                                     Tensor out_tensor) {
  StackGroupPointsForwardMUSAKernelLauncher(
      b, c, m, nsample, features_tensor, features_batch_cnt_tensor, idx_tensor,
      idx_batch_cnt_tensor, out_tensor);
}

// MUSA backend for stack_group_points (backward): forwards all arguments,
// unchanged, to StackGroupPointsBackwardMUSAKernelLauncher.
void stack_group_points_backward_musa(int b, int c, int m, int n, int nsample,
                                      const Tensor grad_out_tensor,
                                      const Tensor idx_tensor,
                                      const Tensor idx_batch_cnt_tensor,
                                      const Tensor features_batch_cnt_tensor,
                                      Tensor grad_features_tensor) {
  StackGroupPointsBackwardMUSAKernelLauncher(
      b, c, m, n, nsample, grad_out_tensor, idx_tensor, idx_batch_cnt_tensor,
      features_batch_cnt_tensor, grad_features_tensor);
}

// Declarations of the stack_group_points `*_impl` functions; signatures match
// the MUSA wrappers registered for them below. Definitions are not in this
// section.
void stack_group_points_forward_impl(int b, int c, int m, int nsample,
                                     const Tensor features_tensor,
                                     const Tensor features_batch_cnt_tensor,
                                     const Tensor idx_tensor,
                                     const Tensor idx_batch_cnt_tensor,
                                     Tensor out_tensor);

void stack_group_points_backward_impl(int b, int c, int m, int n, int nsample,
                                      const Tensor grad_out_tensor,
                                      const Tensor idx_tensor,
                                      const Tensor idx_batch_cnt_tensor,
                                      const Tensor features_batch_cnt_tensor,
                                      Tensor grad_features_tensor);

// Associate each `*_impl` function with its MUSA wrapper.
// NOTE(review): REGISTER_DEVICE_IMPL is defined elsewhere; presumably it adds
// the wrapper to a per-device dispatch table -- confirm against the macro.
REGISTER_DEVICE_IMPL(stack_group_points_forward_impl, MUSA,
                     stack_group_points_forward_musa);
REGISTER_DEVICE_IMPL(stack_group_points_backward_impl, MUSA,
                     stack_group_points_backward_musa);

// Kernel launchers for the IoU3D ops. Declared only; they are called by the
// iou3d_*_musa wrappers and defined outside this section.
void IoU3DBoxesOverlapBevForwardMUSAKernelLauncher(const int num_a,
                                                   const Tensor boxes_a,
                                                   const int num_b,
                                                   const Tensor boxes_b,
                                                   Tensor ans_overlap);

// Both NMS launchers take `keep` and `keep_num` by non-const reference.
void IoU3DNMS3DForwardMUSAKernelLauncher(const Tensor boxes, Tensor &keep,
                                         Tensor &keep_num,
                                         float nms_overlap_thresh);

void IoU3DNMS3DNormalForwardMUSAKernelLauncher(const Tensor boxes, Tensor &keep,
                                               Tensor &keep_num,
                                               float nms_overlap_thresh);

// MUSA backend for the IoU3D BEV box-overlap op: forwards all arguments,
// unchanged, to IoU3DBoxesOverlapBevForwardMUSAKernelLauncher.
void iou3d_boxes_overlap_bev_forward_musa(const int num_a, const Tensor boxes_a,
                                          const int num_b, const Tensor boxes_b,
                                          Tensor ans_overlap) {
  IoU3DBoxesOverlapBevForwardMUSAKernelLauncher(num_a, boxes_a, num_b, boxes_b,
                                                ans_overlap);
}

// MUSA backend for IoU3D NMS: forwards all arguments to
// IoU3DNMS3DForwardMUSAKernelLauncher. `keep` and `keep_num` are passed on by
// reference.
void iou3d_nms3d_forward_musa(const Tensor boxes, Tensor &keep,
                              Tensor &keep_num, float nms_overlap_thresh) {
  IoU3DNMS3DForwardMUSAKernelLauncher(boxes, keep, keep_num,
                                      nms_overlap_thresh);
}

// MUSA backend for the "normal" IoU3D NMS variant: forwards all arguments to
// IoU3DNMS3DNormalForwardMUSAKernelLauncher. `keep` and `keep_num` are passed
// on by reference.
void iou3d_nms3d_normal_forward_musa(const Tensor boxes, Tensor &keep,
                                     Tensor &keep_num,
                                     float nms_overlap_thresh) {
  IoU3DNMS3DNormalForwardMUSAKernelLauncher(boxes, keep, keep_num,
                                            nms_overlap_thresh);
}

// Declarations of the IoU3D `*_impl` functions; signatures match the MUSA
// wrappers registered for them below. Definitions are not in this section.
void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
                                          const int num_b, const Tensor boxes_b,
                                          Tensor ans_overlap);

void iou3d_nms3d_forward_impl(const Tensor boxes, Tensor &keep,
                              Tensor &keep_num, float nms_overlap_thresh);

void iou3d_nms3d_normal_forward_impl(const Tensor boxes, Tensor &keep,
                                     Tensor &keep_num,
                                     float nms_overlap_thresh);

// Associate each `*_impl` function with its MUSA wrapper.
// NOTE(review): REGISTER_DEVICE_IMPL is defined elsewhere; presumably it adds
// the wrapper to a per-device dispatch table -- confirm against the macro.
REGISTER_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, MUSA,
                     iou3d_boxes_overlap_bev_forward_musa);
REGISTER_DEVICE_IMPL(iou3d_nms3d_forward_impl, MUSA, iou3d_nms3d_forward_musa);
REGISTER_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, MUSA,
                     iou3d_nms3d_normal_forward_musa);

// Kernel launcher for KNN. Declared only; it is called by knn_forward_musa
// and defined outside this section.
void KNNForwardMUSAKernelLauncher(int b, int n, int m, int nsample,
                                  const Tensor xyz, const Tensor new_xyz,
                                  Tensor idx, Tensor dist2);

// MUSA backend for KNN (forward): forwards all arguments, unchanged, to
// KNNForwardMUSAKernelLauncher.
void knn_forward_musa(int b, int n, int m, int nsample,
                      const Tensor xyz, const Tensor new_xyz,
                      Tensor idx, Tensor dist2) {
  KNNForwardMUSAKernelLauncher(
      b, n, m, nsample, xyz, new_xyz, idx, dist2);
}

// Declaration of the KNN `*_impl` function (defined elsewhere), followed by
// the registration of knn_forward_musa for it.
// NOTE(review): REGISTER_DEVICE_IMPL is defined elsewhere; presumably it adds
// the wrapper to a per-device dispatch table -- confirm against the macro.
void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
                      const Tensor new_xyz, Tensor idx, Tensor dist2);
REGISTER_DEVICE_IMPL(knn_forward_impl, MUSA, knn_forward_musa);

// Kernel launchers for masked im2col / col2im. Declared only; they are called
// by the masked_*_forward_musa wrappers and defined outside this section.
void MaskedIm2colForwardMUSAKernelLauncher(const Tensor bottom_data,
                                           const Tensor mask_h_idx,
                                           const Tensor mask_w_idx,
                                           Tensor top_data, const int kernel_h,
                                           const int kernel_w, const int pad_h,
                                           const int pad_w);

void MaskedCol2imForwardMUSAKernelLauncher(const Tensor bottom_data,
                                           const Tensor mask_h_idx,
                                           const Tensor mask_w_idx,
                                           Tensor top_data, const int height,
                                           const int width, const int channels);

// MUSA backend for masked im2col (forward): forwards all arguments,
// unchanged, to MaskedIm2colForwardMUSAKernelLauncher.
void masked_im2col_forward_musa(const Tensor im, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor col,
                                const int kernel_h, const int kernel_w,
                                const int pad_h, const int pad_w) {
  // Shape notes carried over from the original author:
  //   im: (n, ic, h, w), kernel size (kh, kw)
  //   kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
  MaskedIm2colForwardMUSAKernelLauncher(
      im, mask_h_idx, mask_w_idx, col, kernel_h, kernel_w, pad_h, pad_w);
}

// MUSA backend for masked col2im (forward): forwards all arguments,
// unchanged, to MaskedCol2imForwardMUSAKernelLauncher.
void masked_col2im_forward_musa(const Tensor col, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor im, int height,
                                int width, int channels) {
  // Shape notes carried over from the original author:
  //   im: (n, ic, h, w), kernel size (kh, kw)
  //   kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
  MaskedCol2imForwardMUSAKernelLauncher(
      col, mask_h_idx, mask_w_idx, im, height, width, channels);
}

// Declarations of the masked im2col / col2im `*_impl` functions; signatures
// match the MUSA wrappers registered for them below. Definitions are not in
// this section.
void masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor col,
                                const int kernel_h, const int kernel_w,
                                const int pad_h, const int pad_w);

void masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor im, int height,
                                int width, int channels);

// Associate each `*_impl` function with its MUSA wrapper.
// NOTE(review): REGISTER_DEVICE_IMPL is defined elsewhere; presumably it adds
// the wrapper to a per-device dispatch table -- confirm against the macro.
REGISTER_DEVICE_IMPL(masked_im2col_forward_impl, MUSA,
                     masked_im2col_forward_musa);
REGISTER_DEVICE_IMPL(masked_col2im_forward_impl, MUSA,
                     masked_col2im_forward_musa);

// MUSA modulated deformable-convolution helpers. Only declared in this
// section (no wrapper bodies here); they are passed directly to
// REGISTER_DEVICE_IMPL further down, so their definitions must be provided
// elsewhere.
void modulated_deformable_im2col_musa(
    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor data_col);

// Same scalar parameters as above; the column tensor comes first and the
// trailing tensor is grad_im.
void modulated_deformable_col2im_musa(
    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor grad_im);

// Takes both data_col and data_im; the trailing tensors are grad_offset and
// grad_mask.
void modulated_deformable_col2im_coord_musa(
    const Tensor data_col, const Tensor data_im, const Tensor data_offset,
    const Tensor data_mask, const int batch_size, const int channels,
    const int height_im, const int width_im, const int height_col,
    const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int deformable_group,
    Tensor grad_offset, Tensor grad_mask);

// Declarations of the modulated deformable-convolution `*_impl` functions;
// each has the same signature as the `*_musa` function registered for it
// below. Their definitions are not in this section.
void modulated_deformable_im2col_impl(
    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor data_col);

void modulated_deformable_col2im_impl(
    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor grad_im);

void modulated_deformable_col2im_coord_impl(
    const Tensor data_col, const Tensor data_im, const Tensor data_offset,
    const Tensor data_mask, const int batch_size, const int channels,
    const int height_im, const int width_im, const int height_col,
    const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int deformable_group,
    Tensor grad_offset, Tensor grad_mask);

// Associate each `*_impl` function with its MUSA counterpart.
// NOTE(review): REGISTER_DEVICE_IMPL is defined elsewhere; presumably it adds
// the function to a per-device dispatch table -- confirm against the macro.
REGISTER_DEVICE_IMPL(modulated_deformable_im2col_impl, MUSA,
                     modulated_deformable_im2col_musa);
REGISTER_DEVICE_IMPL(modulated_deformable_col2im_impl, MUSA,
                     modulated_deformable_col2im_musa);
REGISTER_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, MUSA,
                     modulated_deformable_col2im_coord_musa);

// MUSA multi-scale deformable attention functions. Only declared in this
// section; they are passed directly to REGISTER_DEVICE_IMPL further down, so
// their definitions must be provided elsewhere.
// The forward function returns a Tensor.
Tensor ms_deform_attn_musa_forward(const Tensor &value,
                                   const Tensor &spatial_shapes,
                                   const Tensor &level_start_index,
                                   const Tensor &sampling_loc,
                                   const Tensor &attn_weight,
                                   const int im2col_step);

// The backward function returns nothing; grad_value, grad_sampling_loc and
// grad_attn_weight are taken by non-const reference.
void ms_deform_attn_musa_backward(
    const Tensor &value, const Tensor &spatial_shapes,
    const Tensor &level_start_index, const Tensor &sampling_loc,
    const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,
    Tensor &grad_sampling_loc, Tensor &grad_attn_weight, const int im2col_step);

// Declarations of the multi-scale deformable attention `*_impl_*` functions;
// each has the same signature as the `*_musa_*` function registered for it
// below. Their definitions are not in this section.
Tensor ms_deform_attn_impl_forward(const Tensor &value,
                                   const Tensor &spatial_shapes,
                                   const Tensor &level_start_index,
                                   const Tensor &sampling_loc,
                                   const Tensor &attn_weight,
                                   const int im2col_step);

void ms_deform_attn_impl_backward(
    const Tensor &value, const Tensor &spatial_shapes,
    const Tensor &level_start_index, const Tensor &sampling_loc,
    const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,
    Tensor &grad_sampling_loc, Tensor &grad_attn_weight, const int im2col_step);

// Associate each `*_impl_*` function with its MUSA counterpart.
// NOTE(review): REGISTER_DEVICE_IMPL is defined elsewhere; presumably it adds
// the function to a per-device dispatch table -- confirm against the macro.
REGISTER_DEVICE_IMPL(ms_deform_attn_impl_forward, MUSA,
                     ms_deform_attn_musa_forward);
REGISTER_DEVICE_IMPL(ms_deform_attn_impl_backward, MUSA,
                     ms_deform_attn_musa_backward);

// Kernel launcher for NMS; returns a Tensor. Declared only; it is called by
// nms_musa and defined outside this section.
Tensor NMSMUSAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
                             int offset);

// MUSA backend for NMS: forwards all arguments, unchanged, to
// NMSMUSAKernelLauncher and returns the Tensor it produces.
Tensor nms_musa(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  Tensor result = NMSMUSAKernelLauncher(boxes, scores, iou_threshold, offset);
  return result;
}

// Declaration of the NMS `*_impl` function (defined elsewhere), followed by
// the registration of nms_musa for it.
// NOTE(review): REGISTER_DEVICE_IMPL is defined elsewhere; presumably it adds
// the wrapper to a per-device dispatch table -- confirm against the macro.
Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset);
REGISTER_DEVICE_IMPL(nms_impl, MUSA, nms_musa);

// Kernel launchers for the points-in-boxes ops ("part" and "all" variants,
// identical parameter lists). Declared only; they are called by the
// points_in_boxes_*_forward_musa wrappers and defined outside this section.
void PointsInBoxesPartForwardMUSAKernelLauncher(int batch_size, int boxes_num,
                                                int pts_num, const Tensor boxes,
                                                const Tensor pts,
                                                Tensor box_idx_of_points);

void PointsInBoxesAllForwardMUSAKernelLauncher(int batch_size, int boxes_num,
                                               int pts_num, const Tensor boxes,
                                               const Tensor pts,
                                               Tensor box_idx_of_points);

// MUSA backend for points_in_boxes_part_forward_impl.
//
// Forwards every argument, in the same order, to
// PointsInBoxesPartForwardMUSAKernelLauncher. The launcher receives
// box_idx_of_points as its only non-const tensor.
void points_in_boxes_part_forward_musa(int batch_size, int boxes_num,
                                       int pts_num, const Tensor boxes,
                                       const Tensor pts,
                                       Tensor box_idx_of_points) {
  PointsInBoxesPartForwardMUSAKernelLauncher(batch_size, boxes_num, pts_num,
                                             boxes, pts, box_idx_of_points);
}

// MUSA backend for points_in_boxes_all_forward_impl.
//
// Forwards every argument, in the same order, to
// PointsInBoxesAllForwardMUSAKernelLauncher. The launcher receives
// box_idx_of_points as its only non-const tensor.
void points_in_boxes_all_forward_musa(int batch_size, int boxes_num,
                                      int pts_num, const Tensor boxes,
                                      const Tensor pts,
                                      Tensor box_idx_of_points) {
  PointsInBoxesAllForwardMUSAKernelLauncher(batch_size, boxes_num, pts_num,
                                            boxes, pts, box_idx_of_points);
}

// Declaration of the "part" points-in-boxes entry point.
void points_in_boxes_part_forward_impl(int batch_size, int boxes_num,
                                       int pts_num, const Tensor boxes,
                                       const Tensor pts,
                                       Tensor box_idx_of_points);

// Declaration of the "all" points-in-boxes entry point.
void points_in_boxes_all_forward_impl(int batch_size, int boxes_num,
                                      int pts_num, const Tensor boxes,
                                      const Tensor pts,
                                      Tensor box_idx_of_points);
// Bind the MUSA wrappers defined above to the two entry points.
REGISTER_DEVICE_IMPL(points_in_boxes_part_forward_impl, MUSA,
                     points_in_boxes_part_forward_musa);
REGISTER_DEVICE_IMPL(points_in_boxes_all_forward_impl, MUSA,
                     points_in_boxes_all_forward_musa);

// Forward declaration of the PSAMask forward kernel launcher (defined
// elsewhere). `output` is the only non-const tensor parameter.
void PSAMaskForwardMUSAKernelLauncher(const int psa_type, const Tensor input,
                                      Tensor output, const int num_,
                                      const int h_feature, const int w_feature,
                                      const int h_mask, const int w_mask,
                                      const int half_h_mask,
                                      const int half_w_mask);

// Forward declaration of the PSAMask backward kernel launcher (defined
// elsewhere). `grad_input` is the only non-const tensor parameter.
void PSAMaskBackwardMUSAKernelLauncher(
    const int psa_type, const Tensor grad_output, Tensor grad_input,
    const int num_, const int h_feature, const int w_feature, const int h_mask,
    const int w_mask, const int half_h_mask, const int half_w_mask);

// MUSA backend for psamask_forward_impl: relays all ten arguments, in their
// original order, to PSAMaskForwardMUSAKernelLauncher.
void psamask_forward_musa(const int psa_type, const Tensor input, Tensor output,
                          const int num_, const int h_feature,
                          const int w_feature, const int h_mask,
                          const int w_mask, const int half_h_mask,
                          const int half_w_mask) {
  PSAMaskForwardMUSAKernelLauncher(
      psa_type, input, output, num_,  // op type, tensors, count
      h_feature, w_feature,           // feature extents
      h_mask, w_mask,                 // mask extents
      half_h_mask, half_w_mask);      // half mask extents
}

// MUSA backend for psamask_backward_impl: relays all ten arguments, in their
// original order, to PSAMaskBackwardMUSAKernelLauncher.
void psamask_backward_musa(const int psa_type, const Tensor grad_output,
                           Tensor grad_input, const int num_,
                           const int h_feature, const int w_feature,
                           const int h_mask, const int w_mask,
                           const int half_h_mask, const int half_w_mask) {
  PSAMaskBackwardMUSAKernelLauncher(
      psa_type, grad_output, grad_input, num_,  // op type, tensors, count
      h_feature, w_feature,                     // feature extents
      h_mask, w_mask,                           // mask extents
      half_h_mask, half_w_mask);                // half mask extents
}

// Declaration of the PSAMask forward entry point.
void psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,
                          const int num_, const int h_feature,
                          const int w_feature, const int h_mask,
                          const int w_mask, const int half_h_mask,
                          const int half_w_mask);

// Declaration of the PSAMask backward entry point.
void psamask_backward_impl(const int psa_type, const Tensor grad_output,
                           Tensor grad_input, const int num_,
                           const int h_feature, const int w_feature,
                           const int h_mask, const int w_mask,
                           const int half_h_mask, const int half_w_mask);
// Bind the MUSA wrappers defined above to the two entry points.
REGISTER_DEVICE_IMPL(psamask_forward_impl, MUSA, psamask_forward_musa);
REGISTER_DEVICE_IMPL(psamask_backward_impl, MUSA, psamask_backward_musa);

// Forward declaration of the ROIAlign forward kernel launcher (defined
// elsewhere).
void ROIAlignForwardMUSAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                       Tensor argmax_y, Tensor argmax_x,
                                       int aligned_height, int aligned_width,
                                       float spatial_scale, int sampling_ratio,
                                       int pool_mode, bool aligned);

// Forward declaration of the ROIAlign backward kernel launcher (defined
// elsewhere).
void ROIAlignBackwardMUSAKernelLauncher(Tensor grad_output, Tensor rois,
                                        Tensor argmax_y, Tensor argmax_x,
                                        Tensor grad_input, int aligned_height,
                                        int aligned_width, float spatial_scale,
                                        int sampling_ratio, int pool_mode,
                                        bool aligned);

// MUSA backend for roi_align_forward_impl: relays all eleven arguments, in
// their original order, to ROIAlignForwardMUSAKernelLauncher.
void roi_align_forward_musa(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned) {
  ROIAlignForwardMUSAKernelLauncher(input, rois, output, argmax_y, argmax_x,
                                    aligned_height, aligned_width,
                                    spatial_scale, sampling_ratio, pool_mode,
                                    aligned);
}

// MUSA backend for roi_align_backward_impl: relays all eleven arguments, in
// their original order, to ROIAlignBackwardMUSAKernelLauncher.
void roi_align_backward_musa(Tensor grad_output, Tensor rois, Tensor argmax_y,
                             Tensor argmax_x, Tensor grad_input,
                             int aligned_height, int aligned_width,
                             float spatial_scale, int sampling_ratio,
                             int pool_mode, bool aligned) {
  ROIAlignBackwardMUSAKernelLauncher(grad_output, rois, argmax_y, argmax_x,
                                     grad_input, aligned_height, aligned_width,
                                     spatial_scale, sampling_ratio, pool_mode,
                                     aligned);
}

// Declaration of the ROIAlign forward entry point.
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned);

// Declaration of the ROIAlign backward entry point.
void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,
                             Tensor argmax_x, Tensor grad_input,
                             int aligned_height, int aligned_width,
                             float spatial_scale, int sampling_ratio,
                             int pool_mode, bool aligned);

// Bind the MUSA wrappers defined above to the two entry points.
REGISTER_DEVICE_IMPL(roi_align_forward_impl, MUSA, roi_align_forward_musa);
REGISTER_DEVICE_IMPL(roi_align_backward_impl, MUSA, roi_align_backward_musa);

// Forward declaration of the rotated ROIAlign forward kernel launcher (defined
// elsewhere). Unlike the wrapper that calls it, the launcher takes the tensor
// extents (channels, height, width, num_rois) as explicit arguments and the
// output tensor last.
void ROIAlignRotatedForwardMUSAKernelLauncher(
    const at::Tensor input, const at::Tensor rois, const float spatial_scale,
    const int sampling_ratio, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, at::Tensor output);

// Forward declaration of the rotated ROIAlign backward kernel launcher
// (defined elsewhere); same layout as the forward launcher, with bottom_grad
// as the trailing non-const tensor.
void ROIAlignRotatedBackwardMUSAKernelLauncher(
    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
    const int sampling_ratio, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, at::Tensor bottom_grad);

// MUSA backend for roi_align_rotated_forward_impl.
//
// Checks that `rois` has exactly 6 entries along dim 1 (raising
// "wrong roi size" otherwise), reads the extents of dims 1-3 of `input`, and
// hands everything to ROIAlignRotatedForwardMUSAKernelLauncher.
// aligned_height/aligned_width are passed in the launcher's
// pooled_height/pooled_width positions.
void roi_align_rotated_forward_musa(Tensor input, Tensor rois, Tensor output,
                                    int aligned_height, int aligned_width,
                                    float spatial_scale, int sampling_ratio,
                                    bool aligned, bool clockwise) {
  // ROI count and per-ROI entry count
  const int roi_count = rois.size(0);
  const int roi_entries = rois.size(1);
  if (roi_entries != 6) {
    AT_ERROR("wrong roi size");
  }

  // Extents of the input along dims 1, 2 and 3
  const int channels = input.size(1);
  const int height = input.size(2);
  const int width = input.size(3);

  ROIAlignRotatedForwardMUSAKernelLauncher(
      input, rois, spatial_scale, sampling_ratio, aligned, clockwise, channels,
      height, width, roi_count, aligned_height, aligned_width, output);
}

// MUSA backend for roi_align_rotated_backward_impl.
//
// Checks that `rois` has exactly 6 entries along dim 1 (raising
// "wrong roi size" otherwise), reads the extents of dims 1-3 of
// `bottom_grad`, and hands everything to
// ROIAlignRotatedBackwardMUSAKernelLauncher. aligned_height/aligned_width are
// passed in the launcher's pooled_height/pooled_width positions.
void roi_align_rotated_backward_musa(Tensor top_grad, Tensor rois,
                                     Tensor bottom_grad, int aligned_height,
                                     int aligned_width, float spatial_scale,
                                     int sampling_ratio, bool aligned,
                                     bool clockwise) {
  // ROI count and per-ROI entry count
  const int roi_count = rois.size(0);
  const int roi_entries = rois.size(1);
  if (roi_entries != 6) {
    AT_ERROR("wrong roi size");
  }

  // Extents of the gradient buffer along dims 1, 2 and 3
  const int channels = bottom_grad.size(1);
  const int height = bottom_grad.size(2);
  const int width = bottom_grad.size(3);

  ROIAlignRotatedBackwardMUSAKernelLauncher(
      top_grad, rois, spatial_scale, sampling_ratio, aligned, clockwise,
      channels, height, width, roi_count, aligned_height, aligned_width,
      bottom_grad);
}

// Declaration of the rotated ROIAlign forward entry point.
void roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,
                                    int aligned_height, int aligned_width,
                                    float spatial_scale, int sampling_ratio,
                                    bool aligned, bool clockwise);

// Declaration of the rotated ROIAlign backward entry point.
void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
                                     Tensor bottom_grad, int aligned_height,
                                     int aligned_width, float spatial_scale,
                                     int sampling_ratio, bool aligned,
                                     bool clockwise);
// Bind the MUSA wrappers defined above to the two entry points.
REGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, MUSA,
                     roi_align_rotated_forward_musa);
REGISTER_DEVICE_IMPL(roi_align_rotated_backward_impl, MUSA,
                     roi_align_rotated_backward_musa);

// Forward declaration of the RiROIAlignRotated forward kernel launcher
// (defined elsewhere). Takes the tensor extents as explicit arguments and the
// output tensor last.
void RiROIAlignRotatedForwardMUSAKernelLauncher(
    const at::Tensor features, const at::Tensor rois, const float spatial_scale,
    const int num_samples, const bool clockwise, const int channels,
    const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, const int num_orientations,
    at::Tensor output);

// Forward declaration of the RiROIAlignRotated backward kernel launcher
// (defined elsewhere); bottom_grad is the trailing non-const tensor.
void RiROIAlignRotatedBackwardMUSAKernelLauncher(
    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
    const int num_samples, const bool clockwise, const int channels,
    const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, const int num_orientations,
    at::Tensor bottom_grad);

// MUSA backend for riroi_align_rotated_forward_impl.
//
// Validates the inputs, derives the per-orientation channel count and the
// extents of dims 2-3 from `features`, then calls
// RiROIAlignRotatedForwardMUSAKernelLauncher.
//
// Raises (via AT_ERROR / CHECK_CONTIGUOUS) when:
//   - rois does not have exactly 6 entries along dim 1,
//   - features or rois is not contiguous,
//   - num_orientations is not positive (it is used as a divisor below).
void riroi_align_rotated_forward_musa(Tensor features, Tensor rois,
                                      Tensor output, int pooled_height,
                                      int pooled_width, float spatial_scale,
                                      int num_samples, int num_orientations,
                                      bool clockwise) {
  // Number of ROIs
  int num_rois = rois.size(0);
  int size_rois = rois.size(1);
  if (size_rois != 6) {
    AT_ERROR("wrong roi size");
  }
  CHECK_CONTIGUOUS(features);
  CHECK_CONTIGUOUS(rois);
  // Integer division by zero is undefined behaviour; reject it with a regular
  // error instead of letting the process crash.
  if (num_orientations <= 0) {
    AT_ERROR("num_orientations must be positive");
  }
  // Dim 1 of features is split evenly across the orientations.
  int num_channels = features.size(1) / num_orientations;
  int data_height = features.size(2);
  int data_width = features.size(3);
  RiROIAlignRotatedForwardMUSAKernelLauncher(
      features, rois, spatial_scale, num_samples, clockwise, num_channels,
      data_height, data_width, num_rois, pooled_height, pooled_width,
      num_orientations, output);
}

// MUSA backend for riroi_align_rotated_backward_impl.
//
// Validates the inputs, derives the per-orientation channel count and the
// extents of dims 2-3 from `bottom_grad`, then calls
// RiROIAlignRotatedBackwardMUSAKernelLauncher.
//
// Raises (via AT_ERROR / CHECK_CONTIGUOUS) when:
//   - rois does not have exactly 6 entries along dim 1,
//   - top_grad or rois is not contiguous,
//   - num_orientations is not positive (it is used as a divisor below).
void riroi_align_rotated_backward_musa(Tensor top_grad, Tensor rois,
                                       Tensor bottom_grad, int pooled_height,
                                       int pooled_width, float spatial_scale,
                                       int num_samples, int num_orientations,
                                       bool clockwise) {
  // Number of ROIs
  int num_rois = rois.size(0);
  int size_rois = rois.size(1);
  if (size_rois != 6) {
    AT_ERROR("wrong roi size");
  }
  CHECK_CONTIGUOUS(top_grad);
  CHECK_CONTIGUOUS(rois);
  // Integer division by zero is undefined behaviour; reject it with a regular
  // error instead of letting the process crash.
  if (num_orientations <= 0) {
    AT_ERROR("num_orientations must be positive");
  }
  // Dim 1 of bottom_grad is split evenly across the orientations.
  int num_channels = bottom_grad.size(1) / num_orientations;
  int data_height = bottom_grad.size(2);
  int data_width = bottom_grad.size(3);
  RiROIAlignRotatedBackwardMUSAKernelLauncher(
      top_grad, rois, spatial_scale, num_samples, clockwise, num_channels,
      data_height, data_width, num_rois, pooled_height, pooled_width,
      num_orientations, bottom_grad);
}

// Declaration of the RiROIAlignRotated forward entry point.
void riroi_align_rotated_forward_impl(Tensor features, Tensor rois,
                                      Tensor output, int pooled_height,
                                      int pooled_width, float spatial_scale,
                                      int num_samples, int num_orientations,
                                      bool clockwise);

// Declaration of the RiROIAlignRotated backward entry point.
void riroi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
                                       Tensor bottom_grad, int pooled_height,
                                       int pooled_width, float spatial_scale,
                                       int num_samples, int num_orientations,
                                       bool clockwise);

// Bind the MUSA wrappers defined above to the two entry points.
REGISTER_DEVICE_IMPL(riroi_align_rotated_forward_impl, MUSA,
                     riroi_align_rotated_forward_musa);
REGISTER_DEVICE_IMPL(riroi_align_rotated_backward_impl, MUSA,
                     riroi_align_rotated_backward_musa);

// Forward declaration of the RoiawarePool3d forward kernel launcher (defined
// elsewhere). argmax, pts_idx_of_voxels and pooled_features are the non-const
// tensor parameters.
void RoiawarePool3dForwardMUSAKernelLauncher(
    int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,
    int out_y, int out_z, const Tensor rois, const Tensor pts,
    const Tensor pts_feature, Tensor argmax, Tensor pts_idx_of_voxels,
    Tensor pooled_features, int pool_method);

// Forward declaration of the RoiawarePool3d backward kernel launcher (defined
// elsewhere). grad_in is the only non-const tensor parameter.
void RoiawarePool3dBackwardMUSAKernelLauncher(
    int boxes_num, int out_x, int out_y, int out_z, int channels,
    int max_pts_each_voxel, const Tensor pts_idx_of_voxels, const Tensor argmax,
    const Tensor grad_out, Tensor grad_in, int pool_method);

// MUSA backend for roiaware_pool3d_forward_impl.
//
// Forwards every argument, in the same order, to
// RoiawarePool3dForwardMUSAKernelLauncher. argmax, pts_idx_of_voxels and
// pooled_features are the non-const tensors handed to the launcher.
void roiaware_pool3d_forward_musa(int boxes_num, int pts_num, int channels,
                                  int max_pts_each_voxel, int out_x, int out_y,
                                  int out_z, const Tensor rois,
                                  const Tensor pts, const Tensor pts_feature,
                                  Tensor argmax, Tensor pts_idx_of_voxels,
                                  Tensor pooled_features, int pool_method) {
  RoiawarePool3dForwardMUSAKernelLauncher(
      boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
      rois, pts, pts_feature, argmax, pts_idx_of_voxels, pooled_features,
      pool_method);
}

// MUSA backend for roiaware_pool3d_backward_impl.
//
// Forwards every argument, in the same order, to
// RoiawarePool3dBackwardMUSAKernelLauncher. grad_in is the only non-const
// tensor handed to the launcher.
void roiaware_pool3d_backward_musa(int boxes_num, int out_x, int out_y,
                                   int out_z, int channels,
                                   int max_pts_each_voxel,
                                   const Tensor pts_idx_of_voxels,
                                   const Tensor argmax, const Tensor grad_out,
                                   Tensor grad_in, int pool_method) {
  RoiawarePool3dBackwardMUSAKernelLauncher(
      boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel,
      pts_idx_of_voxels, argmax, grad_out, grad_in, pool_method);
}

// Declaration of the RoiawarePool3d forward entry point.
void roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels,
                                  int max_pts_each_voxel, int out_x, int out_y,
                                  int out_z, const Tensor rois,
                                  const Tensor pts, const Tensor pts_feature,
                                  Tensor argmax, Tensor pts_idx_of_voxels,
                                  Tensor pooled_features, int pool_method);

// Declaration of the RoiawarePool3d backward entry point.
void roiaware_pool3d_backward_impl(int boxes_num, int out_x, int out_y,
                                   int out_z, int channels,
                                   int max_pts_each_voxel,
                                   const Tensor pts_idx_of_voxels,
                                   const Tensor argmax, const Tensor grad_out,
                                   Tensor grad_in, int pool_method);

// Bind the MUSA wrappers defined above to the two entry points.
REGISTER_DEVICE_IMPL(roiaware_pool3d_forward_impl, MUSA,
                     roiaware_pool3d_forward_musa);
REGISTER_DEVICE_IMPL(roiaware_pool3d_backward_impl, MUSA,
                     roiaware_pool3d_backward_musa);

// Forward declaration of the RoIPointPool3d forward kernel launcher (defined
// elsewhere). pooled_features and pooled_empty_flag are the non-const tensor
// parameters.
void RoIPointPool3dForwardMUSAKernelLauncher(
    int batch_size, int pts_num, int boxes_num, int feature_in_len,
    int sampled_pts_num, const Tensor xyz, const Tensor boxes3d,
    const Tensor pts_feature, Tensor pooled_features, Tensor pooled_empty_flag);

// MUSA backend for roipoint_pool3d_forward_impl.
//
// Forwards every argument, in the same order, to
// RoIPointPool3dForwardMUSAKernelLauncher. pooled_features and
// pooled_empty_flag are the non-const tensors handed to the launcher.
void roipoint_pool3d_forward_musa(int batch_size, int pts_num, int boxes_num,
                                  int feature_in_len, int sampled_pts_num,
                                  const Tensor xyz, const Tensor boxes3d,
                                  const Tensor pts_feature,
                                  Tensor pooled_features,
                                  Tensor pooled_empty_flag) {
  RoIPointPool3dForwardMUSAKernelLauncher(
      batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, xyz,
      boxes3d, pts_feature, pooled_features, pooled_empty_flag);
}

// Declaration of the RoIPointPool3d forward entry point, followed by the
// binding of its MUSA wrapper.
void roipoint_pool3d_forward_impl(int batch_size, int pts_num, int boxes_num,
                                  int feature_in_len, int sampled_pts_num,
                                  const Tensor xyz, const Tensor boxes3d,
                                  const Tensor pts_feature,
                                  Tensor pooled_features,
                                  Tensor pooled_empty_flag);
REGISTER_DEVICE_IMPL(roipoint_pool3d_forward_impl, MUSA,
                     roipoint_pool3d_forward_musa);

// Forward declaration of the ROIPool forward kernel launcher (defined
// elsewhere).
void ROIPoolForwardMUSAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                      Tensor argmax, int pooled_height,
                                      int pooled_width, float spatial_scale);

// Forward declaration of the ROIPool backward kernel launcher (defined
// elsewhere).
void ROIPoolBackwardMUSAKernelLauncher(Tensor grad_output, Tensor rois,
                                       Tensor argmax, Tensor grad_input,
                                       int pooled_height, int pooled_width,
                                       float spatial_scale);

// MUSA backend for roi_pool_forward_impl: relays all seven arguments, in
// their original order, to ROIPoolForwardMUSAKernelLauncher.
void roi_pool_forward_musa(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax, int pooled_height, int pooled_width,
                           float spatial_scale) {
  ROIPoolForwardMUSAKernelLauncher(
      input, rois, output, argmax,  // tensors
      pooled_height, pooled_width,  // pooled extents
      spatial_scale);
}

// MUSA backend for roi_pool_backward_impl: relays all seven arguments, in
// their original order, to ROIPoolBackwardMUSAKernelLauncher.
void roi_pool_backward_musa(Tensor grad_output, Tensor rois, Tensor argmax,
                            Tensor grad_input, int pooled_height,
                            int pooled_width, float spatial_scale) {
  ROIPoolBackwardMUSAKernelLauncher(
      grad_output, rois, argmax, grad_input,  // tensors
      pooled_height, pooled_width,            // pooled extents
      spatial_scale);
}

// Declarations of the ROIPool forward/backward entry points, followed by the
// binding of their MUSA wrappers.
void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax, int pooled_height, int pooled_width,
                           float spatial_scale);
void roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,
                            Tensor grad_input, int pooled_height,
                            int pooled_width, float spatial_scale);
REGISTER_DEVICE_IMPL(roi_pool_forward_impl, MUSA, roi_pool_forward_musa);
REGISTER_DEVICE_IMPL(roi_pool_backward_impl, MUSA, roi_pool_backward_musa);

// Reduction selector used by the dynamic point-to-voxel functions below, with
// explicit values SUM = 0, MEAN = 1, MAX = 2.
typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;

// Forward declaration of the dynamic point-to-voxel forward kernel launcher
// (defined elsewhere); returns a vector of tensors.
std::vector<at::Tensor> DynamicPointToVoxelForwardMUSAKernelLauncher(
    const at::Tensor &feats, const at::Tensor &coors,
    const reduce_t reduce_type);

// Forward declaration of the dynamic point-to-voxel backward kernel launcher
// (defined elsewhere). grad_feats is the only non-const reference. Note that
// the fifth parameter is named coors_map here, while the wrapper that calls it
// names the corresponding argument coors_idx.
void DynamicPointToVoxelBackwardMUSAKernelLauncher(
    at::Tensor &grad_feats, const at::Tensor &grad_reduced_feats,
    const at::Tensor &feats, const at::Tensor &reduced_feats,
    const at::Tensor &coors_map, const at::Tensor &reduce_count,
    const reduce_t reduce_type);

// MUSA backend for dynamic_point_to_voxel_forward_impl.
//
// Forwards feats, coors and reduce_type to
// DynamicPointToVoxelForwardMUSAKernelLauncher and returns the vector of
// tensors it produces.
std::vector<torch::Tensor> dynamic_point_to_voxel_forward_musa(
    const torch::Tensor &feats, const torch::Tensor &coors,
    const reduce_t reduce_type) {
  return DynamicPointToVoxelForwardMUSAKernelLauncher(feats, coors,
                                                      reduce_type);
}

// MUSA backend for dynamic_point_to_voxel_backward_impl.
//
// Forwards every argument, in the same order, to
// DynamicPointToVoxelBackwardMUSAKernelLauncher. grad_feats is the only
// non-const reference; coors_idx is passed in the launcher's coors_map
// position.
void dynamic_point_to_voxel_backward_musa(
    torch::Tensor &grad_feats, const torch::Tensor &grad_reduced_feats,
    const torch::Tensor &feats, const torch::Tensor &reduced_feats,
    const torch::Tensor &coors_idx, const torch::Tensor &reduce_count,
    const reduce_t reduce_type) {
  DynamicPointToVoxelBackwardMUSAKernelLauncher(grad_feats, grad_reduced_feats,
                                                feats, reduced_feats, coors_idx,
                                                reduce_count, reduce_type);
}

// Declaration of the dynamic point-to-voxel forward entry point.
std::vector<torch::Tensor> dynamic_point_to_voxel_forward_impl(
    const torch::Tensor &feats, const torch::Tensor &coors,
    const reduce_t reduce_type);

// Declaration of the dynamic point-to-voxel backward entry point.
void dynamic_point_to_voxel_backward_impl(
    torch::Tensor &grad_feats, const torch::Tensor &grad_reduced_feats,
    const torch::Tensor &feats, const torch::Tensor &reduced_feats,
    const torch::Tensor &coors_idx, const torch::Tensor &reduce_count,
    const reduce_t reduce_type);

// Bind the MUSA wrappers defined above to the two entry points.
REGISTER_DEVICE_IMPL(dynamic_point_to_voxel_forward_impl, MUSA,
                     dynamic_point_to_voxel_forward_musa);
REGISTER_DEVICE_IMPL(dynamic_point_to_voxel_backward_impl, MUSA,
                     dynamic_point_to_voxel_backward_musa);

// Forward declarations of the five SyncBN kernel launchers (all defined
// elsewhere). In each one, the non-const Tensor parameters are the ones the
// launcher may write to.

// Forward pass, mean step.
void SyncBNForwardMeanMUSAKernelLauncher(const Tensor input, Tensor mean);

// Forward pass, variance step; takes the mean as an input.
void SyncBNForwardVarMUSAKernelLauncher(const Tensor input, const Tensor mean,
                                        Tensor var);

// Forward pass, output step.
void SyncBNForwardOutputMUSAKernelLauncher(
    const Tensor input, const Tensor mean, const Tensor var,
    Tensor running_mean, Tensor running_var, const Tensor weight,
    const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps,
    float momentum, int group_size);

// Backward pass, parameter-gradient step.
void SyncBNBackwardParamMUSAKernelLauncher(const Tensor grad_output,
                                           const Tensor norm,
                                           Tensor grad_weight,
                                           Tensor grad_bias);

// Backward pass, input-gradient step; takes grad_weight and grad_bias as
// inputs.
void SyncBNBackwardDataMUSAKernelLauncher(const Tensor grad_output,
                                          const Tensor weight,
                                          const Tensor grad_weight,
                                          const Tensor grad_bias,
                                          const Tensor norm, const Tensor std,
                                          Tensor grad_input);

// MUSA backend for sync_bn_forward_mean_impl: relays both tensors to
// SyncBNForwardMeanMUSAKernelLauncher.
void sync_bn_forward_mean_musa(const Tensor input, Tensor mean) {
  SyncBNForwardMeanMUSAKernelLauncher(
      input,   // read-only source
      mean);   // non-const tensor handed to the launcher
}

// MUSA backend for sync_bn_forward_var_impl: relays the three tensors to
// SyncBNForwardVarMUSAKernelLauncher.
void sync_bn_forward_var_musa(const Tensor input, const Tensor mean,
                              Tensor var) {
  SyncBNForwardVarMUSAKernelLauncher(
      input, mean,  // read-only sources
      var);         // non-const tensor handed to the launcher
}

// MUSA backend for sync_bn_forward_output_impl: relays all thirteen
// arguments, in their original order, to
// SyncBNForwardOutputMUSAKernelLauncher.
void sync_bn_forward_output_musa(const Tensor input, const Tensor mean,
                                 const Tensor var, Tensor running_mean,
                                 Tensor running_var, const Tensor weight,
                                 const Tensor bias, Tensor norm, Tensor std,
                                 Tensor output, float eps, float momentum,
                                 int group_size) {
  SyncBNForwardOutputMUSAKernelLauncher(
      input, mean, var,           // batch statistics inputs
      running_mean, running_var,  // running statistics
      weight, bias,               // affine parameters
      norm, std, output,          // non-const result tensors
      eps, momentum, group_size);
}

// MUSA backend for sync_bn_backward_param_impl: relays the four tensors to
// SyncBNBackwardParamMUSAKernelLauncher.
void sync_bn_backward_param_musa(const Tensor grad_output, const Tensor norm,
                                 Tensor grad_weight, Tensor grad_bias) {
  SyncBNBackwardParamMUSAKernelLauncher(
      grad_output, norm,        // read-only sources
      grad_weight, grad_bias);  // non-const tensors handed to the launcher
}

// MUSA backend for sync_bn_backward_data_impl: relays the seven tensors, in
// their original order, to SyncBNBackwardDataMUSAKernelLauncher.
void sync_bn_backward_data_musa(const Tensor grad_output, const Tensor weight,
                                const Tensor grad_weight,
                                const Tensor grad_bias, const Tensor norm,
                                const Tensor std, Tensor grad_input) {
  SyncBNBackwardDataMUSAKernelLauncher(
      grad_output, weight, grad_weight, grad_bias, norm, std,  // read-only
      grad_input);  // non-const tensor handed to the launcher
}

// Declarations of the five SyncBN entry points; the REGISTER_DEVICE_IMPL lines
// further down bind a MUSA wrapper to each.

// Forward pass, mean step.
void sync_bn_forward_mean_impl(const Tensor input, Tensor mean);

// Forward pass, variance step.
void sync_bn_forward_var_impl(const Tensor input, const Tensor mean,
                              Tensor var);

// Forward pass, output step.
void sync_bn_forward_output_impl(const Tensor input, const Tensor mean,
                                 const Tensor var, Tensor running_mean,
                                 Tensor running_var, const Tensor weight,
                                 const Tensor bias, Tensor norm, Tensor std,
                                 Tensor output, float eps, float momentum,
                                 int group_size);

// Backward pass, parameter-gradient step.
void sync_bn_backward_param_impl(const Tensor grad_output, const Tensor norm,
                                 Tensor grad_weight, Tensor grad_bias);

// Backward pass, input-gradient step.
void sync_bn_backward_data_impl(const Tensor grad_output, const Tensor weight,
                                const Tensor grad_weight,
                                const Tensor grad_bias, const Tensor norm,
                                const Tensor std, Tensor grad_input);

// Bind the MUSA wrappers defined above to the five entry points.
REGISTER_DEVICE_IMPL(sync_bn_forward_mean_impl, MUSA,
                     sync_bn_forward_mean_musa);
REGISTER_DEVICE_IMPL(sync_bn_forward_var_impl, MUSA, sync_bn_forward_var_musa);
REGISTER_DEVICE_IMPL(sync_bn_forward_output_impl, MUSA,
                     sync_bn_forward_output_musa);
REGISTER_DEVICE_IMPL(sync_bn_backward_param_impl, MUSA,
                     sync_bn_backward_param_musa);
REGISTER_DEVICE_IMPL(sync_bn_backward_data_impl, MUSA,
                     sync_bn_backward_data_musa);

// Forward declaration of the three-interpolate forward kernel launcher
// (defined elsewhere). Integer parameters are ordered (b, c, m, n).
void ThreeInterpolateForwardMUSAKernelLauncher(int b, int c, int m, int n,
                                               const Tensor points,
                                               const Tensor idx,
                                               const Tensor weight, Tensor out);

// Forward declaration of the three-interpolate backward kernel launcher
// (defined elsewhere). Note the integer parameters are ordered (b, c, n, m),
// i.e. n and m are swapped relative to the forward launcher.
void ThreeInterpolateBackwardMUSAKernelLauncher(int b, int c, int n, int m,
                                                const Tensor grad_out,
                                                const Tensor idx,
                                                const Tensor weight,
                                                Tensor grad_points);

// MUSA backend for three_interpolate_forward_impl.
//
// Forwards every argument, in the same order (b, c, m, n, ...), to
// ThreeInterpolateForwardMUSAKernelLauncher. `out` is the only non-const
// tensor handed to the launcher.
void three_interpolate_forward_musa(int b, int c, int m, int n,
                                    const Tensor points, const Tensor idx,
                                    const Tensor weight, Tensor out) {
  ThreeInterpolateForwardMUSAKernelLauncher(b, c, m, n, points, idx, weight,
                                            out);
}

// MUSA backend for three_interpolate_backward_impl.
//
// Forwards every argument, in the same order (b, c, n, m, ...), to
// ThreeInterpolateBackwardMUSAKernelLauncher. `grad_points` is the only
// non-const tensor handed to the launcher.
void three_interpolate_backward_musa(int b, int c, int n, int m,
                                     const Tensor grad_out, const Tensor idx,
                                     const Tensor weight, Tensor grad_points) {
  ThreeInterpolateBackwardMUSAKernelLauncher(b, c, n, m, grad_out, idx, weight,
                                             grad_points);
}

// Declaration of the three-interpolate forward entry point; integer
// parameters are ordered (b, c, m, n).
void three_interpolate_forward_impl(int b, int c, int m, int n,
                                    const Tensor points, const Tensor idx,
                                    const Tensor weight, Tensor out);

// Declaration of the three-interpolate backward entry point; integer
// parameters are ordered (b, c, n, m).
void three_interpolate_backward_impl(int b, int c, int n, int m,
                                     const Tensor grad_out, const Tensor idx,
                                     const Tensor weight, Tensor grad_points);
// Bind the MUSA wrappers defined above to the two entry points.
REGISTER_DEVICE_IMPL(three_interpolate_forward_impl, MUSA,
                     three_interpolate_forward_musa);
REGISTER_DEVICE_IMPL(three_interpolate_backward_impl, MUSA,
                     three_interpolate_backward_musa);

// Forward declaration of the three-NN forward kernel launcher (defined
// elsewhere). dist2 and idx are the non-const tensor parameters.
void ThreeNNForwardMUSAKernelLauncher(int b, int n, int m, const Tensor unknown,
                                      const Tensor known, Tensor dist2,
                                      Tensor idx);

// MUSA backend for three_nn_forward_impl.
//
// Forwards every argument, in the same order, to
// ThreeNNForwardMUSAKernelLauncher. dist2 and idx are the non-const tensors
// handed to the launcher.
void three_nn_forward_musa(int b, int n, int m, const Tensor unknown,
                           const Tensor known, Tensor dist2, Tensor idx) {
  ThreeNNForwardMUSAKernelLauncher(b, n, m, unknown, known, dist2, idx);
}

// Declaration of the three-NN forward entry point, followed by the binding of
// its MUSA wrapper.
void three_nn_forward_impl(int b, int n, int m, const Tensor unknown,
                           const Tensor known, Tensor dist2, Tensor idx);
REGISTER_DEVICE_IMPL(three_nn_forward_impl, MUSA, three_nn_forward_musa);

// Forward declaration of the TINShift forward kernel launcher (defined
// elsewhere).
void TINShiftForwardMUSAKernelLauncher(Tensor input, Tensor shift,
                                       Tensor output);

// Forward declaration of the TINShift backward kernel launcher (defined
// elsewhere).
void TINShiftBackwardMUSAKernelLauncher(Tensor grad_output, Tensor shift,
                                        Tensor grad_input);

// MUSA backend for tin_shift_forward_impl: relays the three tensors, in their
// original order, to TINShiftForwardMUSAKernelLauncher.
void tin_shift_forward_musa(Tensor input, Tensor shift, Tensor output) {
  TINShiftForwardMUSAKernelLauncher(
      input,
      shift,
      output);
}

// MUSA backend for tin_shift_backward_impl: relays the three tensors, in
// their original order, to TINShiftBackwardMUSAKernelLauncher.
void tin_shift_backward_musa(Tensor grad_output, Tensor shift,
                             Tensor grad_input) {
  TINShiftBackwardMUSAKernelLauncher(
      grad_output,
      shift,
      grad_input);
}

// Declarations of the TINShift forward/backward entry points, followed by the
// binding of their MUSA wrappers.
void tin_shift_forward_impl(Tensor input, Tensor shift, Tensor output);
void tin_shift_backward_impl(Tensor grad_output, Tensor shift,
                             Tensor grad_input);
REGISTER_DEVICE_IMPL(tin_shift_forward_impl, MUSA, tin_shift_forward_musa);
REGISTER_DEVICE_IMPL(tin_shift_backward_impl, MUSA, tin_shift_backward_musa);

// The upfirdn2d binding is compiled only when MUSA_ARCH is undefined, or when
// it is defined and greater than 21. `&&` binds tighter than `||`, so the
// condition below groups as: !defined || (defined && (MUSA_ARCH > 21)).
#if ((!defined(MUSA_ARCH)) || (defined(MUSA_ARCH)) && (MUSA_ARCH > 21))
// MUSA implementation of upfirdn2d; only declared here.
torch::Tensor upfirdn2d_op(torch::Tensor input, torch::Tensor filter, int upx,
                           int upy, int downx, int downy, int padx0, int padx1,
                           int pady0, int pady1, bool flip, float gain);

// Entry point that upfirdn2d_op is bound to; unlike the other ops in this
// file, the implementation is registered directly without a *_musa wrapper.
torch::Tensor upfirdn2d_op_impl(torch::Tensor input, torch::Tensor filter,
                                int upx, int upy, int downx, int downy,
                                int padx0, int padx1, int pady0, int pady1,
                                bool flip, float gain);
REGISTER_DEVICE_IMPL(upfirdn2d_op_impl, MUSA, upfirdn2d_op);
#endif

// Forward declaration of the hard-voxelize kernel launcher (defined
// elsewhere). Returns an int; NDim defaults to 3.
int HardVoxelizeForwardMUSAKernelLauncher(
    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const int max_points,
    const int max_voxels, const int NDim = 3);

// Forward declaration of the non-deterministic hard-voxelize kernel launcher
// (defined elsewhere); same parameter list as the launcher above.
int NondeterministicHardVoxelizeForwardMUSAKernelLauncher(
    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const int max_points,
    const int max_voxels, const int NDim = 3);

// Forward declaration of the dynamic-voxelize kernel launcher (defined
// elsewhere). coors is the only non-const reference; NDim defaults to 3.
void DynamicVoxelizeForwardMUSAKernelLauncher(
    const at::Tensor &points, at::Tensor &coors,
    const std::vector<float> voxel_size, const std::vector<float> coors_range,
    const int NDim = 3);

// MUSA backend for hard_voxelize_forward_impl.
//
// Forwards every argument, in the same order, to
// HardVoxelizeForwardMUSAKernelLauncher and returns its int result unchanged.
// NDim has no default here (the launcher's default of 3 is not used); the
// caller's value is always passed through.
int hard_voxelize_forward_musa(const at::Tensor &points, at::Tensor &voxels,
                               at::Tensor &coors,
                               at::Tensor &num_points_per_voxel,
                               const std::vector<float> voxel_size,
                               const std::vector<float> coors_range,
                               const int max_points, const int max_voxels,
                               const int NDim) {
  return HardVoxelizeForwardMUSAKernelLauncher(
      points, voxels, coors, num_points_per_voxel, voxel_size, coors_range,
      max_points, max_voxels, NDim);
}

// MUSA backend for nondeterministic_hard_voxelize_forward_impl.
//
// Forwards every argument, in the same order, to
// NondeterministicHardVoxelizeForwardMUSAKernelLauncher and returns its int
// result unchanged. NDim has no default here; the caller's value is always
// passed through.
int nondeterministic_hard_voxelize_forward_musa(
    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const int max_points,
    const int max_voxels, const int NDim) {
  return NondeterministicHardVoxelizeForwardMUSAKernelLauncher(
      points, voxels, coors, num_points_per_voxel, voxel_size, coors_range,
      max_points, max_voxels, NDim);
}

void dynamic_voxelize_forward_musa(const at::Tensor &points, at::Tensor &coors,
                                   const std::vector<float> voxel_size,
                                   const std::vector<float> coors_range,
                                   const int NDim) {
  DynamicVoxelizeForwardMUSAKernelLauncher(points, coors, voxel_size,
                                           coors_range, NDim);
};

int hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,
                               at::Tensor &coors,
                               at::Tensor &num_points_per_voxel,
                               const std::vector<float> voxel_size,
                               const std::vector<float> coors_range,
                               const int max_points, const int max_voxels,
                               const int NDim);

int nondeterministic_hard_voxelize_forward_impl(
    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const int max_points,
    const int max_voxels, const int NDim);

void dynamic_voxelize_forward_impl(const at::Tensor &points, at::Tensor &coors,
                                   const std::vector<float> voxel_size,
                                   const std::vector<float> coors_range,
                                   const int NDim);

REGISTER_DEVICE_IMPL(hard_voxelize_forward_impl, MUSA,
                     hard_voxelize_forward_musa);
REGISTER_DEVICE_IMPL(nondeterministic_hard_voxelize_forward_impl, MUSA,
                     nondeterministic_hard_voxelize_forward_musa);
REGISTER_DEVICE_IMPL(dynamic_voxelize_forward_impl, MUSA,
                     dynamic_voxelize_forward_musa);

// ---- Rotated feature align: MUSA bindings ----------------------------------
//
// Kernel launchers are only declared here; their definitions are not visible
// in this file.
void RotatedFeatureAlignForwardMUSAKernelLauncher(const Tensor features,
                                                  const Tensor best_bboxes,
                                                  const float spatial_scale,
                                                  const int points,
                                                  Tensor output);

void RotatedFeatureAlignBackwardMUSAKernelLauncher(const Tensor top_grad,
                                                   const Tensor best_bboxes,
                                                   const float spatial_scale,
                                                   const int points,
                                                   Tensor bottom_grad);

// MUSA wrapper for the forward pass: forwards all arguments, in the same
// order, to RotatedFeatureAlignForwardMUSAKernelLauncher.
void rotated_feature_align_forward_musa(const Tensor features,
                                        const Tensor best_bboxes,
                                        const float spatial_scale,
                                        const int points, Tensor output) {
  RotatedFeatureAlignForwardMUSAKernelLauncher(features, best_bboxes,
                                               spatial_scale, points, output);
}

// MUSA wrapper for the backward pass: forwards all arguments, in the same
// order, to RotatedFeatureAlignBackwardMUSAKernelLauncher.
void rotated_feature_align_backward_musa(const Tensor top_grad,
                                         const Tensor best_bboxes,
                                         const float spatial_scale,
                                         const int points, Tensor bottom_grad) {
  RotatedFeatureAlignBackwardMUSAKernelLauncher(
      top_grad, best_bboxes, spatial_scale, points, bottom_grad);
}

// Declarations of the entry points named in the registrations below.
void rotated_feature_align_forward_impl(const Tensor features,
                                        const Tensor best_bboxes,
                                        const float spatial_scale,
                                        const int points, Tensor output);

void rotated_feature_align_backward_impl(const Tensor top_grad,
                                         const Tensor best_bboxes,
                                         const float spatial_scale,
                                         const int points, Tensor bottom_grad);

// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds each *_musa wrapper as the MUSA implementation of the matching
// *_impl entry point -- confirm against the macro definition.
REGISTER_DEVICE_IMPL(rotated_feature_align_forward_impl, MUSA,
                     rotated_feature_align_forward_musa);
REGISTER_DEVICE_IMPL(rotated_feature_align_backward_impl, MUSA,
                     rotated_feature_align_backward_musa);

// ---- Points in polygons: MUSA bindings -------------------------------------
//
// Launcher declaration. Note its parameter order: rows/cols come before the
// output tensor.
void PointsInPolygonsForwardMUSAKernelLauncher(const at::Tensor points,
                                               const at::Tensor polygons,
                                               const int rows, const int cols,
                                               at::Tensor output);

// MUSA wrapper. Takes `output` before `rows`/`cols` and reorders the
// arguments to match the launcher's (points, polygons, rows, cols, output)
// signature.
void points_in_polygons_forward_musa(const Tensor points, const Tensor polygons,
                                     Tensor output, const int rows,
                                     const int cols) {
  PointsInPolygonsForwardMUSAKernelLauncher(points, polygons, rows, cols,
                                            output);
}

// Declaration of the entry point named in the registration below.
void points_in_polygons_forward_impl(const Tensor points, const Tensor polygons,
                                     Tensor output, const int rows,
                                     const int cols);

// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds the *_musa wrapper as the MUSA implementation of the *_impl entry
// point -- confirm against the macro definition.
REGISTER_DEVICE_IMPL(points_in_polygons_forward_impl, MUSA,
                     points_in_polygons_forward_musa);

// ---- Indice max-pool forward: MUSA bindings --------------------------------
//
// Launcher declaration; the definition is not visible in this file.
torch::Tensor IndiceMaxpoolForwardMUSAKernelLauncher(torch::Tensor features,
                                                     torch::Tensor indicePairs,
                                                     torch::Tensor indiceNum,
                                                     int64_t numAct);

// MUSA wrapper: forwards all arguments unchanged to
// IndiceMaxpoolForwardMUSAKernelLauncher and returns the tensor it produces.
torch::Tensor indice_maxpool_forward_musa(torch::Tensor features,
                                          torch::Tensor indicePairs,
                                          torch::Tensor indiceNum,
                                          int64_t numAct) {
  return IndiceMaxpoolForwardMUSAKernelLauncher(features, indicePairs,
                                                indiceNum, numAct);
}

// Declaration of the entry point named in the registration below.
torch::Tensor indice_maxpool_forward_impl(torch::Tensor features,
                                          torch::Tensor indicePairs,
                                          torch::Tensor indiceNum,
                                          int64_t numAct);
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds the *_musa wrapper as the MUSA implementation of the *_impl entry
// point -- confirm against the macro definition.
REGISTER_DEVICE_IMPL(indice_maxpool_forward_impl, MUSA,
                     indice_maxpool_forward_musa);

// ---- Indice max-pool backward: MUSA bindings -------------------------------
//
// Launcher declaration; the definition is not visible in this file.
torch::Tensor IndiceMaxpoolBackwardMUSAKernelLauncher(torch::Tensor features,
                                                      torch::Tensor outFeatures,
                                                      torch::Tensor outGrad,
                                                      torch::Tensor indicePairs,
                                                      torch::Tensor indiceNum);

// MUSA wrapper: forwards all arguments unchanged to
// IndiceMaxpoolBackwardMUSAKernelLauncher and returns the tensor it produces.
torch::Tensor indice_maxpool_backward_musa(torch::Tensor features,
                                           torch::Tensor outFeatures,
                                           torch::Tensor outGrad,
                                           torch::Tensor indicePairs,
                                           torch::Tensor indiceNum) {
  return IndiceMaxpoolBackwardMUSAKernelLauncher(features, outFeatures, outGrad,
                                                 indicePairs, indiceNum);
}

// Declaration of the entry point named in the registration below.
torch::Tensor indice_maxpool_backward_impl(torch::Tensor features,
                                           torch::Tensor outFeatures,
                                           torch::Tensor outGrad,
                                           torch::Tensor indicePairs,
                                           torch::Tensor indiceNum);

// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds the *_musa wrapper as the MUSA implementation of the *_impl entry
// point -- confirm against the macro definition. The statement is terminated
// with ';' like every other registration in this file.
REGISTER_DEVICE_IMPL(indice_maxpool_backward_impl, MUSA,
                     indice_maxpool_backward_musa);

// ---- Indice convolution forward: MUSA bindings -----------------------------
//
// Launcher declaration; the definition is not visible in this file.
torch::Tensor IndiceConvForwardMUSAKernelLauncher(
    torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,
    torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse,
    int64_t _subM);

// MUSA wrapper: forwards all arguments unchanged to
// IndiceConvForwardMUSAKernelLauncher and returns the tensor it produces.
torch::Tensor indice_conv_forward_musa(torch::Tensor features,
                                       torch::Tensor filters,
                                       torch::Tensor indicePairs,
                                       torch::Tensor indiceNum,
                                       int64_t numActOut, int64_t _inverse,
                                       int64_t _subM) {
  return IndiceConvForwardMUSAKernelLauncher(
      features, filters, indicePairs, indiceNum, numActOut, _inverse, _subM);
}

// Declaration of the entry point named in the registration below.
torch::Tensor indice_conv_forward_impl(torch::Tensor features,
                                       torch::Tensor filters,
                                       torch::Tensor indicePairs,
                                       torch::Tensor indiceNum,
                                       int64_t numActOut, int64_t _inverse,
                                       int64_t _subM);

// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds the *_musa wrapper as the MUSA implementation of the *_impl entry
// point -- confirm against the macro definition.
REGISTER_DEVICE_IMPL(indice_conv_forward_impl, MUSA, indice_conv_forward_musa);

// ---- Indice convolution backward: MUSA bindings ----------------------------
//
// Launcher declaration; the definition is not visible in this file.
std::vector<torch::Tensor> IndiceConvBackwardMUSAKernelLauncher(
    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
    int64_t _subM);

// MUSA wrapper: forwards all arguments unchanged to
// IndiceConvBackwardMUSAKernelLauncher and returns the vector of tensors it
// produces.
std::vector<torch::Tensor> indice_conv_backward_musa(
    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
    int64_t _subM) {
  return IndiceConvBackwardMUSAKernelLauncher(
      features, filters, outGrad, indicePairs, indiceNum, _inverse, _subM);
}

// Declaration of the entry point named in the registration below.
std::vector<torch::Tensor> indice_conv_backward_impl(
    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
    int64_t _subM);

// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds the *_musa wrapper as the MUSA implementation of the *_impl entry
// point -- confirm against the macro definition.
REGISTER_DEVICE_IMPL(indice_conv_backward_impl, MUSA,
                     indice_conv_backward_musa);

// ---- Fused indice convolution + batchnorm: MUSA bindings -------------------
//
// Launcher declaration; the definition is not visible in this file.
torch::Tensor FusedIndiceConvBatchnormMUSAKernelLauncher(
    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
    int64_t _inverse, int64_t _subM);

// MUSA wrapper: forwards all arguments unchanged to
// FusedIndiceConvBatchnormMUSAKernelLauncher and returns the tensor it
// produces.
torch::Tensor fused_indice_conv_batchnorm_forward_musa(
    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
    int64_t _inverse, int64_t _subM) {
  return FusedIndiceConvBatchnormMUSAKernelLauncher(features, filters, bias,
                                                    indicePairs, indiceNum,
                                                    numActOut, _inverse, _subM);
}

// Declaration of the entry point named in the registration below.
torch::Tensor fused_indice_conv_batchnorm_forward_impl(
    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
    int64_t _inverse, int64_t _subM);

// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds the *_musa wrapper as the MUSA implementation of the *_impl entry
// point -- confirm against the macro definition. The statement is terminated
// with ';' like every other registration in this file.
REGISTER_DEVICE_IMPL(fused_indice_conv_batchnorm_forward_impl, MUSA,
                     fused_indice_conv_batchnorm_forward_musa);

// ---- Min-area polygons: MUSA bindings --------------------------------------
//
// Launcher declaration; the definition is not visible in this file.
void MinAreaPolygonsMUSAKernelLauncher(const Tensor pointsets, Tensor polygons);

// MUSA wrapper: hands both tensors straight to the launcher.
void min_area_polygons_musa(const Tensor pointsets, Tensor polygons) {
  MinAreaPolygonsMUSAKernelLauncher(pointsets, polygons);
}

// Declaration of the entry point named in the registration below.
void min_area_polygons_impl(const Tensor pointsets, Tensor polygons);

// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds min_area_polygons_musa as the MUSA implementation of
// min_area_polygons_impl -- confirm against the macro definition.
REGISTER_DEVICE_IMPL(min_area_polygons_impl, MUSA, min_area_polygons_musa);

// ---- Active rotated filter: MUSA bindings ----------------------------------
//
// Kernel launchers are only declared here; their definitions are not visible
// in this file.
void ActiveRotatedFilterForwardMUSAKernelLauncher(const Tensor input,
                                                  const Tensor indices,
                                                  Tensor output);

void ActiveRotatedFilterBackwardMUSAKernelLauncher(const Tensor grad_out,
                                                   const Tensor indices,
                                                   Tensor grad_in);

// MUSA wrapper for the forward pass: forwards (input, indices, output) to the
// forward launcher unchanged.
void active_rotated_filter_forward_musa(const Tensor input,
                                        const Tensor indices, Tensor output) {
  ActiveRotatedFilterForwardMUSAKernelLauncher(input, indices, output);
}

// MUSA wrapper for the backward pass: forwards (grad_out, indices, grad_in) to
// the backward launcher unchanged.
void active_rotated_filter_backward_musa(const Tensor grad_out,
                                         const Tensor indices, Tensor grad_in) {
  ActiveRotatedFilterBackwardMUSAKernelLauncher(grad_out, indices, grad_in);
}

// Declarations of the entry points named in the registrations below.
void active_rotated_filter_forward_impl(const Tensor input,
                                        const Tensor indices, Tensor output);

void active_rotated_filter_backward_impl(const Tensor grad_out,
                                         const Tensor indices, Tensor grad_in);

// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds each *_musa wrapper as the MUSA implementation of the matching
// *_impl entry point -- confirm against the macro definition.
REGISTER_DEVICE_IMPL(active_rotated_filter_forward_impl, MUSA,
                     active_rotated_filter_forward_musa);
REGISTER_DEVICE_IMPL(active_rotated_filter_backward_impl, MUSA,
                     active_rotated_filter_backward_musa);

// ---- Convex IoU / GIoU: MUSA bindings --------------------------------------
//
// Kernel launchers are only declared here; their definitions are not visible
// in this file.
void ConvexIoUMUSAKernelLauncher(const Tensor pointsets, const Tensor polygons,
                                 Tensor ious);

void ConvexGIoUMUSAKernelLauncher(const Tensor pointsets, const Tensor polygons,
                                  Tensor output);

// MUSA wrapper: forwards (pointsets, polygons, ious) to the IoU launcher.
void convex_iou_musa(const Tensor pointsets, const Tensor polygons,
                     Tensor ious) {
  ConvexIoUMUSAKernelLauncher(pointsets, polygons, ious);
}

// MUSA wrapper: forwards (pointsets, polygons, output) to the GIoU launcher.
void convex_giou_musa(const Tensor pointsets, const Tensor polygons,
                      Tensor output) {
  ConvexGIoUMUSAKernelLauncher(pointsets, polygons, output);
}

// Declarations of the entry points named in the registrations below.
void convex_iou_impl(const Tensor pointsets, const Tensor polygons,
                     Tensor ious);

void convex_giou_impl(const Tensor pointsets, const Tensor polygons,
                      Tensor output);

// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds each *_musa wrapper as the MUSA implementation of the matching
// *_impl entry point -- confirm against the macro definition.
REGISTER_DEVICE_IMPL(convex_iou_impl, MUSA, convex_iou_musa);
REGISTER_DEVICE_IMPL(convex_giou_impl, MUSA, convex_giou_musa);

// ---- Differentiable rotated IoU, vertex sorting: MUSA bindings -------------
//
// Launcher declaration; the definition is not visible in this file.
Tensor DiffIoURotatedSortVerticesMUSAKernelLauncher(Tensor vertices,
                                                    Tensor mask,
                                                    Tensor num_valid);

// MUSA wrapper: forwards (vertices, mask, num_valid) to the launcher and
// returns the tensor it produces.
Tensor diff_iou_rotated_sort_vertices_forward_musa(Tensor vertices, Tensor mask,
                                                   Tensor num_valid) {
  return DiffIoURotatedSortVerticesMUSAKernelLauncher(vertices, mask,
                                                      num_valid);
}

// Declaration of the entry point named in the registration below.
Tensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,
                                                   Tensor num_valid);

// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds the *_musa wrapper as the MUSA implementation of the *_impl entry
// point -- confirm against the macro definition.
REGISTER_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl, MUSA,
                     diff_iou_rotated_sort_vertices_forward_musa);

// ---- Chamfer distance: MUSA bindings ---------------------------------------
//
// The forward launcher declaration, both wrappers and both registrations are
// compiled only when MUSA_ARCH is undefined or greater than 21. The backward
// launcher declaration sits outside the guard and is always visible.
#if ((!defined(MUSA_ARCH)) || (defined(MUSA_ARCH)) && (MUSA_ARCH > 21))
void ChamferDistanceForwardMUSAKernelLauncher(
    const Tensor xyz1, const Tensor xyz2, const Tensor dist1,
    const Tensor dist2, const Tensor idx1, const Tensor idx2);
#endif

void ChamferDistanceBackwardMUSAKernelLauncher(
    const Tensor xyz1, const Tensor xyz2, Tensor idx1, Tensor idx2,
    Tensor grad_dist1, Tensor grad_dist2, Tensor grad_xyz1, Tensor grad_xyz2);

#if ((!defined(MUSA_ARCH)) || (defined(MUSA_ARCH)) && (MUSA_ARCH > 21))
// MUSA wrapper for the forward pass: forwards all six tensors, in order, to
// ChamferDistanceForwardMUSAKernelLauncher.
void chamfer_distance_forward_musa(const Tensor xyz1, const Tensor xyz2,
                                   const Tensor dist1, const Tensor dist2,
                                   const Tensor idx1, const Tensor idx2) {
  ChamferDistanceForwardMUSAKernelLauncher(xyz1, xyz2, dist1, dist2, idx1,
                                           idx2);
}

// MUSA wrapper for the backward pass: forwards all eight tensors, in order, to
// ChamferDistanceBackwardMUSAKernelLauncher.
void chamfer_distance_backward_musa(const Tensor xyz1, const Tensor xyz2,
                                    Tensor idx1, Tensor idx2, Tensor graddist1,
                                    Tensor graddist2, Tensor gradxyz1,
                                    Tensor gradxyz2) {
  ChamferDistanceBackwardMUSAKernelLauncher(xyz1, xyz2, idx1, idx2, graddist1,
                                            graddist2, gradxyz1, gradxyz2);
}

// Declarations of the entry points named in the registrations below.
void chamfer_distance_forward_impl(const Tensor xyz1, const Tensor xyz2,
                                   const Tensor dist1, const Tensor dist2,
                                   const Tensor idx1, const Tensor idx2);

void chamfer_distance_backward_impl(const Tensor xyz1, const Tensor xyz2,
                                    Tensor idx1, Tensor idx2, Tensor graddist1,
                                    Tensor graddist2, Tensor gradxyz1,
                                    Tensor gradxyz2);

// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds each *_musa wrapper as the MUSA implementation of the matching
// *_impl entry point -- confirm against the macro definition.
REGISTER_DEVICE_IMPL(chamfer_distance_forward_impl, MUSA,
                     chamfer_distance_forward_musa);
REGISTER_DEVICE_IMPL(chamfer_distance_backward_impl, MUSA,
                     chamfer_distance_backward_musa);
#endif

// ---- PrRoI pooling: MUSA bindings ------------------------------------------
//
// Forward declarations of the three kernel launchers (forward, backward with
// respect to the input, backward with respect to the RoI coordinates).
void PrROIPoolForwardMUSAKernelLauncher(Tensor input, Tensor rois,
                                        Tensor output, int pooled_height,
                                        int pooled_width, float spatial_scale);

void PrROIPoolBackwardMUSAKernelLauncher(Tensor grad_output, Tensor rois,
                                         Tensor grad_input, int pooled_height,
                                         int pooled_width, float spatial_scale);

void PrROIPoolCoorBackwardMUSAKernelLauncher(
    Tensor output, Tensor grad_output, Tensor input, Tensor rois,
    Tensor grad_rois, int pooled_height, int pooled_width, float spatial_scale);

// MUSA wrapper: forwards all arguments unchanged to
// PrROIPoolForwardMUSAKernelLauncher.
void prroi_pool_forward_musa(Tensor input, Tensor rois, Tensor output,
                             int pooled_height, int pooled_width,
                             float spatial_scale) {
  PrROIPoolForwardMUSAKernelLauncher(input, rois, output, pooled_height,
                                     pooled_width, spatial_scale);
}

// MUSA wrapper: forwards all arguments unchanged to
// PrROIPoolBackwardMUSAKernelLauncher.
void prroi_pool_backward_musa(Tensor grad_output, Tensor rois,
                              Tensor grad_input, int pooled_height,
                              int pooled_width, float spatial_scale) {
  PrROIPoolBackwardMUSAKernelLauncher(grad_output, rois, grad_input,
                                      pooled_height, pooled_width,
                                      spatial_scale);
}

// MUSA wrapper: forwards all arguments unchanged to
// PrROIPoolCoorBackwardMUSAKernelLauncher.
void prroi_pool_coor_backward_musa(Tensor output, Tensor grad_output,
                                   Tensor input, Tensor rois, Tensor grad_rois,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale) {
  PrROIPoolCoorBackwardMUSAKernelLauncher(output, grad_output, input, rois,
                                          grad_rois, pooled_height,
                                          pooled_width, spatial_scale);
}

// Declarations of the entry points named in the registrations below.
void prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
                             int pooled_height, int pooled_width,
                             float spatial_scale);
void prroi_pool_backward_impl(Tensor grad_output, Tensor rois,
                              Tensor grad_input, int pooled_height,
                              int pooled_width, float spatial_scale);
void prroi_pool_coor_backward_impl(Tensor output, Tensor grad_output,
                                   Tensor input, Tensor rois, Tensor grad_rois,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale);
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds each *_musa wrapper as the MUSA implementation of the matching
// *_impl entry point -- confirm against the macro definition.
REGISTER_DEVICE_IMPL(prroi_pool_forward_impl, MUSA, prroi_pool_forward_musa);
REGISTER_DEVICE_IMPL(prroi_pool_backward_impl, MUSA, prroi_pool_backward_musa);
REGISTER_DEVICE_IMPL(prroi_pool_coor_backward_impl, MUSA,
                     prroi_pool_coor_backward_musa);

// ---- Bezier align: MUSA bindings -------------------------------------------
//
// Kernel launchers are only declared here; their definitions are not visible
// in this file.
void BezierAlignForwardMUSAKernelLauncher(Tensor input, Tensor rois,
                                          Tensor output, int aligned_height,
                                          int aligned_width,
                                          float spatial_scale,
                                          int sampling_ratio, bool aligned);

void BezierAlignBackwardMUSAKernelLauncher(
    Tensor grad_output, Tensor rois, Tensor grad_input, int aligned_height,
    int aligned_width, float spatial_scale, int sampling_ratio, bool aligned);

// Declarations of the entry points named in the registrations below. Their
// parameter lists match the launchers' exactly.
void bezier_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                               int aligned_height, int aligned_width,
                               float spatial_scale, int sampling_ratio,
                               bool aligned);

void bezier_align_backward_impl(Tensor grad_output, Tensor rois,
                                Tensor grad_input, int aligned_height,
                                int aligned_width, float spatial_scale,
                                int sampling_ratio, bool aligned);

// Unlike the other ops in this file, the launchers themselves are passed to
// the registration macro; there is no intermediate *_musa wrapper.
// NOTE(review): REGISTER_DEVICE_IMPL is defined outside this chunk; presumably
// it binds the third argument as the MUSA implementation of the first --
// confirm against the macro definition.
REGISTER_DEVICE_IMPL(bezier_align_forward_impl, MUSA,
                     BezierAlignForwardMUSAKernelLauncher);
REGISTER_DEVICE_IMPL(bezier_align_backward_impl, MUSA,
                     BezierAlignBackwardMUSAKernelLauncher);


================================================
FILE: mmcv/ops/csrc/pytorch/musa/nms_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "nms_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

// Launches the MUSA NMS kernels and returns, as an int64 tensor, the entries
// of the descending-score ordering whose keep flag was set.
//
// boxes / scores: inputs on a MUSA device; boxes are read as float.
// iou_threshold, offset: passed through to the nms_musa kernel unchanged.
Tensor NMSMUSAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
                             int offset) {
  c10::musa::MUSAGuard device_guard(boxes.device());

  // No boxes: return an empty int64 tensor without launching anything.
  if (boxes.numel() == 0) {
    return at::empty({0}, boxes.options().dtype(at::kLong));
  }

  // Permutation that ranks scores from highest to lowest, and the boxes
  // gathered in that order.
  auto sort_result = scores.sort(0, /*descending=*/true);
  auto ranking = std::get<1>(sort_result);
  auto ranked_boxes = boxes.index_select(0, ranking);

  const int num_boxes = boxes.size(0);
  // Number of 64-bit mask words per box row.
  const int words_per_row =
      (num_boxes + threadsPerBlock - 1) / threadsPerBlock;
  // Grid extent used for the pairwise kernel (both x and y).
  const int grid_extent = GET_BLOCKS(num_boxes, threadsPerBlock);

  Tensor suppress_mask =
      at::empty({num_boxes, words_per_row}, boxes.options().dtype(at::kLong));
  // The kernels address the int64 storage as unsigned 64-bit words.
  unsigned long long* mask_words =
      reinterpret_cast<unsigned long long*>(suppress_mask.data_ptr<int64_t>());

  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  dim3 pair_grid(grid_extent, grid_extent);
  dim3 pair_block(threadsPerBlock);
  nms_musa<<<pair_grid, pair_block, 0, stream>>>(
      num_boxes, iou_threshold, offset, ranked_boxes.data_ptr<float>(),
      mask_words);

  // Reduce the mask into one boolean keep flag per ranked box. A single block
  // is launched, with one mask row's worth of dynamic shared memory.
  at::Tensor keep_flags = at::zeros(
      {num_boxes}, boxes.options().dtype(at::kBool).device(::at::musa::kMUSA));
  const size_t shared_bytes = words_per_row * sizeof(unsigned long long);
  gather_keep_from_mask<<<1, min(words_per_row, THREADS_PER_BLOCK),
                          shared_bytes, stream>>>(keep_flags.data_ptr<bool>(),
                                                  mask_words, num_boxes);
  AT_MUSA_CHECK(musaGetLastError());

  return ranking.masked_select(keep_flags);
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/nms_quadri_musa.mu
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "nms_quadri_musa.muh"
#include "pytorch_musa_helper.hpp"

// Quadrilateral NMS on MUSA.
//
// Launches nms_quadri_musa_kernel over `dets_sorted` to fill a bitmask,
// copies the mask to the host, then walks the detections in index order and
// keeps every detection whose bit has not been set by an earlier kept one.
// Returns the entries of `order_t` selected by the kept positions. An empty
// `dets` yields an empty tensor with `order_t`'s dtype and device.
//
// dets, scores: must be MUSA tensors (asserted below).
// order_t:      index tensor that the kept positions are mapped through.
// dets_sorted:  detection tensor handed to the kernel; its dtype selects the
//               floating-point instantiation.
// iou_threshold, multi_label: passed through to the kernel unchanged.
Tensor nms_quadri_musa(const Tensor dets, const Tensor scores,
                       const Tensor order_t, const Tensor dets_sorted,
                       float iou_threshold, const int multi_label) {
  AT_ASSERTM(dets.is_privateuseone(), "dets must be a MUSA tensor");
  AT_ASSERTM(scores.is_privateuseone(), "scores must be a MUSA tensor");
  c10::musa::MUSAGuard device_guard(dets.device());

  int dets_num = dets.size(0);

  // With no detections col_blocks would be 0 and the kernel would be launched
  // on a zero-sized grid; there is nothing to keep, so return early.
  if (dets_num == 0) {
    return at::empty({0}, order_t.options());
  }

  const int col_blocks = at::musa::ATenCeilDiv(dets_num, threadsPerBlock);

  // One 64-bit word per (detection, column block) pair. The element count is
  // computed in 64-bit so the product cannot overflow int on the host side.
  Tensor mask = at::empty({static_cast<int64_t>(dets_num) * col_blocks},
                          dets.options().dtype(at::kLong));

  dim3 blocks(col_blocks, col_blocks);
  dim3 threads(threadsPerBlock);
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  AT_DISPATCH_FLOATING_TYPES(
      dets_sorted.scalar_type(), "nms_quadri_kernel_musa", [&] {
        nms_quadri_musa_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
            dets_num, iou_threshold, dets_sorted.data_ptr<scalar_t>(),
            (unsigned long long*)mask.data_ptr<int64_t>(), multi_label);
      });

  Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long* mask_host =
      (unsigned long long*)mask_cpu.data_ptr<int64_t>();

  // Accumulated "removed" bits; std::vector value-initializes every word to 0,
  // so no explicit clearing is needed.
  std::vector<unsigned long long> remv(col_blocks);

  Tensor keep =
      at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
  int64_t* keep_out = keep.data_ptr<int64_t>();

  int num_to_keep = 0;
  for (int i = 0; i < dets_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    // Detection i survives only if no previously kept detection flagged it.
    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out[num_to_keep++] = i;
      // OR this detection's mask row into the running removal set.
      unsigned long long* p =
          mask_host + static_cast<int64_t>(i) * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }

  AT_MUSA_CHECK(musaGetLastError());
  return order_t.index(
      {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
           .to(order_t.device(), keep.scalar_type())});
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/nms_rotated_musa.mu
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu
#include "nms_rotated_musa.muh"
#include "pytorch_musa_helper.hpp"

// Rotated-box NMS on MUSA.
//
// Launches nms_rotated_musa_kernel over `dets_sorted` to fill a bitmask,
// copies the mask to the host, then walks the detections in index order and
// keeps every detection whose bit has not been set by an earlier kept one.
// Returns the entries of `order_t` selected by the kept positions. An empty
// `dets` yields an empty tensor with `order_t`'s dtype and device.
//
// dets, scores: must be MUSA tensors (asserted below).
// order_t:      index tensor that the kept positions are mapped through.
// dets_sorted:  detection tensor handed to the kernel; its dtype selects the
//               floating-point instantiation.
// iou_threshold, multi_label: passed through to the kernel unchanged.
Tensor nms_rotated_musa(const Tensor dets, const Tensor scores,
                        const Tensor order_t, const Tensor dets_sorted,
                        float iou_threshold, const int multi_label) {
  AT_ASSERTM(dets.is_privateuseone(), "dets must be a MUSA tensor");
  AT_ASSERTM(scores.is_privateuseone(), "scores must be a MUSA tensor");
  c10::musa::MUSAGuard device_guard(dets.device());

  int dets_num = dets.size(0);

  // With no detections col_blocks would be 0 and the kernel would be launched
  // on a zero-sized grid; there is nothing to keep, so return early.
  if (dets_num == 0) {
    return at::empty({0}, order_t.options());
  }

  const int col_blocks = at::musa::ATenCeilDiv(dets_num, threadsPerBlock);

  // One 64-bit word per (detection, column block) pair. The element count is
  // computed in 64-bit so the product cannot overflow int on the host side.
  Tensor mask = at::empty({static_cast<int64_t>(dets_num) * col_blocks},
                          dets.options().dtype(at::kLong));

  dim3 blocks(col_blocks, col_blocks);
  dim3 threads(threadsPerBlock);
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  AT_DISPATCH_FLOATING_TYPES(
      dets_sorted.scalar_type(), "nms_rotated_kernel_musa", [&] {
        nms_rotated_musa_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
            dets_num, iou_threshold, dets_sorted.data_ptr<scalar_t>(),
            (unsigned long long*)mask.data_ptr<int64_t>(), multi_label);
      });

  Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long* mask_host =
      (unsigned long long*)mask_cpu.data_ptr<int64_t>();

  // Accumulated "removed" bits; std::vector value-initializes every word to 0,
  // so no explicit clearing is needed.
  std::vector<unsigned long long> remv(col_blocks);

  Tensor keep =
      at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
  int64_t* keep_out = keep.data_ptr<int64_t>();

  int num_to_keep = 0;
  for (int i = 0; i < dets_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    // Detection i survives only if no previously kept detection flagged it.
    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out[num_to_keep++] = i;
      // OR this detection's mask row into the running removal set.
      unsigned long long* p =
          mask_host + static_cast<int64_t>(i) * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }

  AT_MUSA_CHECK(musaGetLastError());
  return order_t.index(
      {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
           .to(order_t.device(), keep.scalar_type())});
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/points_in_boxes_musa.mu
================================================
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.

#include <stdio.h>

#include "points_in_boxes_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

void PointsInBoxesPartForwardMUSAKernelLauncher(int batch_size, int boxes_num,
                                                int pts_num, const Tensor boxes,
                                                const Tensor pts,
                                                Tensor box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is
  // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x,
  // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default
  // -1

  c10::musa::MUSAGuard device_guard(boxes.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), batch_size);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES(
      boxes.scalar_type(), "points_in_boxes_part_forward_musa_kernel", [&] {
        points_in_boxes_part_forward_musa_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                batch_size, boxes_num, pts_num, boxes.data_ptr<scalar_t>(),
                pts.data_ptr<scalar_t>(), box_idx_of_points.data_ptr<int>());
      });

  AT_MUSA_CHECK(musaGetLastError());
}

void PointsInBoxesAllForwardMUSAKernelLauncher(int batch_size, int boxes_num,
                                               int pts_num, const Tensor boxes,
                                               const Tensor pts,
                                               Tensor box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center, each box params pts: (B, npoints, 3)
  // [x, y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints),
  // default -1

  c10::musa::MUSAGuard device_guard(boxes.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), batch_size);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES(
      boxes.scalar_type(), "points_in_boxes_all_forward_musa_kernel", [&] {
        points_in_boxes_all_forward_musa_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                batch_size, boxes_num, pts_num, boxes.data_ptr<scalar_t>(),
                pts.data_ptr<scalar_t>(), box_idx_of_points.data_ptr<int>());
      });

  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/points_in_polygons_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/ming71/CUDA/blob/master/point_justify/points_justify_kernel.cu

#include <stdio.h>

#include "points_in_polygons_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

// Launches points_in_polygons_forward_musa_kernel on the current MUSA stream
// of the device that holds `points`.
//
// The kernel is launched over rows * cols work items. The dtype of `points`
// selects the floating-point instantiation, and `polygons` and `output` are
// accessed with that same scalar type.
void PointsInPolygonsForwardMUSAKernelLauncher(const at::Tensor points,
                                               const at::Tensor polygons,
                                               const int rows, const int cols,
                                               at::Tensor output) {
  c10::musa::MUSAGuard device_guard(points.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  // Total number of work items handed to the kernel.
  const int num_items = rows * cols;

  AT_DISPATCH_FLOATING_TYPES(
      points.scalar_type(), "points_in_polygons_forward_musa_kernel", ([&] {
        points_in_polygons_forward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(num_items), THREADS_PER_BLOCK, 0, stream>>>(
                num_items, points.data_ptr<scalar_t>(),
                polygons.data_ptr<scalar_t>(), rows, cols,
                output.data_ptr<scalar_t>());
      }));
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/prroi_pool_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "prroi_pool_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

// Launches the float instantiation of prroi_pool_forward_musa_kernel on the
// current MUSA stream of the device that holds `input`.
//
// The launch covers output.numel() work items. Dimensions 1, 2 and 3 of
// `input` are passed to the kernel as channels, height and width. All tensors
// are accessed as float.
void PrROIPoolForwardMUSAKernelLauncher(Tensor input, Tensor rois,
                                        Tensor output, int pooled_height,
                                        int pooled_width, float spatial_scale) {
  const int num_outputs = output.numel();
  const int num_channels = input.size(1);
  const int in_height = input.size(2);
  const int in_width = input.size(3);

  c10::musa::MUSAGuard device_guard(input.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  float* input_data = input.data_ptr<float>();
  float* rois_data = rois.data_ptr<float>();
  float* output_data = output.data_ptr<float>();

  prroi_pool_forward_musa_kernel<float>
      <<<GET_BLOCKS(num_outputs), THREADS_PER_BLOCK, 0, stream>>>(
          num_outputs, input_data, rois_data, output_data, pooled_height,
          pooled_width, spatial_scale, num_channels, in_height, in_width);

  AT_MUSA_CHECK(musaGetLastError());
}

// Host-side launcher for the PrRoI pooling backward kernel (gradient with
// respect to the input feature map).
//
// Channel count and spatial extent are taken from dimensions 1..3 of
// `grad_input`; the launch grid is sized from `grad_output.numel()`. Every
// tensor is accessed as a `float` buffer and the kernel is enqueued on the
// current MUSA stream of `grad_output`'s device.
void PrROIPoolBackwardMUSAKernelLauncher(Tensor grad_output, Tensor rois,
                                         Tensor grad_input, int pooled_height,
                                         int pooled_width,
                                         float spatial_scale) {
  const int num_outputs = grad_output.numel();
  const int num_channels = grad_input.size(1);
  const int in_height = grad_input.size(2);
  const int in_width = grad_input.size(3);

  c10::musa::MUSAGuard device_guard(grad_output.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  float *grad_output_ptr = grad_output.data_ptr<float>();
  float *rois_ptr = rois.data_ptr<float>();
  float *grad_input_ptr = grad_input.data_ptr<float>();

  prroi_pool_backward_musa_kernel<float>
      <<<GET_BLOCKS(num_outputs), THREADS_PER_BLOCK, 0, stream>>>(
          num_outputs, grad_output_ptr, rois_ptr, grad_input_ptr,
          pooled_height, pooled_width, spatial_scale, num_channels, in_height,
          in_width);

  // Surface launch failures immediately.
  AT_MUSA_CHECK(musaGetLastError());
}

// Host-side launcher for the PrRoI pooling coordinate-backward kernel, which
// writes into `grad_rois`.
//
// Channel count and spatial extent are taken from dimensions 1..3 of `input`;
// the launch grid is sized from `grad_output.numel()`. Every tensor is
// accessed as a `float` buffer and the kernel is enqueued on the current MUSA
// stream of `grad_output`'s device.
void PrROIPoolCoorBackwardMUSAKernelLauncher(Tensor output, Tensor grad_output,
                                             Tensor input, Tensor rois,
                                             Tensor grad_rois,
                                             int pooled_height,
                                             int pooled_width,
                                             float spatial_scale) {
  const int num_outputs = grad_output.numel();
  const int num_channels = input.size(1);
  const int in_height = input.size(2);
  const int in_width = input.size(3);

  c10::musa::MUSAGuard device_guard(grad_output.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  float *output_ptr = output.data_ptr<float>();
  float *grad_output_ptr = grad_output.data_ptr<float>();
  float *input_ptr = input.data_ptr<float>();
  float *rois_ptr = rois.data_ptr<float>();
  float *grad_rois_ptr = grad_rois.data_ptr<float>();

  prroi_pool_coor_backward_musa_kernel<float>
      <<<GET_BLOCKS(num_outputs), THREADS_PER_BLOCK, 0, stream>>>(
          num_outputs, output_ptr, grad_output_ptr, input_ptr, rois_ptr,
          grad_rois_ptr, pooled_height, pooled_width, spatial_scale,
          num_channels, in_height, in_width);

  // Surface launch failures immediately.
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/psamask_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src

#include <torch/serialize/tensor.h>

#include "psamask_musa_kernel.muh"
#include "pytorch_musa_helper.hpp"

void PSAMaskForwardMUSAKernelLauncher(const int psa_type, const Tensor input,
                                      Tensor output, const int num_,
                                      const int h_feature, const int w_feature,
                                      const int h_mask, const int w_mask,
                                      const int half_h_mask,
                                      const int half_w_mask) {
  int nthreads = num_ * h_feature * w_feature;
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  if (psa_type == 0)
    AT_DISPATCH_FLOATING_TYPES(
        input.scalar_type(), "psamask_collect_forward_musa", [&] {
          psamask_collect_forward_musa<scalar_t><<<nthreads, 512, 0, stream>>>(
              nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
              half_w_mask, input.data_ptr<scalar_t>(),
              output.data_ptr<scalar_t>());
        });
  else
    AT_DISPATCH_FLOATING_TYPES(
        input.scalar_type(), "psamask_distribute_forward_musa", [&] {
          psamask_distribute_forward_musa<scalar_t>
              <<<nthreads, 512, 0, stream>>>(
                  nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
                  half_w_mask, input.data_ptr<scalar_t>(),
                  output.data_ptr<scalar_t>());
        });
}

void PSAMaskBackwardMUSAKernelLauncher(
    const int psa_type, const Tensor grad_output, Tensor grad_input,
    const int num_, const int h_feature, const int w_feature, const int h_mask,
    const int w_mask, const int half_h_mask, const int half_w_mask) {
  int nthreads = num_ * h_feature * w_feature;
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  if (psa_type == 0)
    AT_DISPATCH_FLOATING_TYPES(
        grad_input.scalar_type(), "psamask_collect_backward_musa", [&] {
          psamask_collect_backward_musa<scalar_t><<<nthreads, 512, 0, stream>>>(
              nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
              half_w_mask, grad_output.data_ptr<scalar_t>(),
              grad_input.data_ptr<scalar_t>());
        });
  else
    AT_DISPATCH_FLOATING_TYPES(
        grad_input.scalar_type(), "psamask_distribute_backward_musa", [&] {
          psamask_distribute_backward_musa<scalar_t>
              <<<nthreads, 512, 0, stream>>>(
                  nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
                  half_w_mask, grad_output.data_ptr<scalar_t>(),
                  grad_input.data_ptr<scalar_t>());
        });
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/riroi_align_rotated_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_musa_helper.hpp"
#include "riroi_align_rotated_musa_kernel.muh"

// Host-side launcher for the rotation-invariant RoIAlign (rotated) forward
// kernel.
//
// The work size is
//   num_rois * pooled_height * pooled_width * channels * num_orientations.
// The kernel is dispatched on the floating dtype of `features` and enqueued
// on the current MUSA stream of `features`' device.
void RiROIAlignRotatedForwardMUSAKernelLauncher(
    const at::Tensor features, const at::Tensor rois, const float spatial_scale,
    const int num_samples, const bool clockwise, const int channels,
    const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, const int num_orientations,
    at::Tensor output) {
  const int total_elems =
      num_rois * pooled_height * pooled_width * channels * num_orientations;

  c10::musa::MUSAGuard device_guard(features.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  AT_DISPATCH_FLOATING_TYPES(
      features.scalar_type(), "riroi_align_rotated_forward_musa_kernel", ([&] {
        const scalar_t *feature_ptr = features.data_ptr<scalar_t>();
        const scalar_t *roi_ptr = rois.data_ptr<scalar_t>();
        scalar_t *out_ptr = output.data_ptr<scalar_t>();
        // Convert the scale once to the dispatched element type.
        const scalar_t scale = static_cast<scalar_t>(spatial_scale);

        riroi_align_rotated_forward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(total_elems), THREADS_PER_BLOCK, 0, stream>>>(
                total_elems, feature_ptr, roi_ptr, scale, num_samples,
                clockwise, channels, height, width, pooled_height,
                pooled_width, num_orientations, out_ptr);
      }));

  // Surface launch failures immediately.
  AT_MUSA_CHECK(musaGetLastError());
}

// Host-side launcher for the rotation-invariant RoIAlign (rotated) backward
// kernel, which writes into `bottom_grad`.
//
// The work size is
//   num_rois * pooled_height * pooled_width * channels * num_orientations.
// The kernel is dispatched on the floating dtype of `top_grad` and enqueued
// on the current MUSA stream of `top_grad`'s device.
void RiROIAlignRotatedBackwardMUSAKernelLauncher(
    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
    const int num_samples, const bool clockwise, const int channels,
    const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, const int num_orientations,
    at::Tensor bottom_grad) {
  const int total_elems =
      num_rois * pooled_height * pooled_width * channels * num_orientations;

  c10::musa::MUSAGuard device_guard(top_grad.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  AT_DISPATCH_FLOATING_TYPES(
      top_grad.scalar_type(), "riroi_align_rotated_backward_musa_kernel", ([&] {
        const scalar_t *grad_out_ptr = top_grad.data_ptr<scalar_t>();
        const scalar_t *roi_ptr = rois.data_ptr<scalar_t>();
        scalar_t *grad_in_ptr = bottom_grad.data_ptr<scalar_t>();

        // `spatial_scale` is handed over as a float; conversion to the
        // kernel's parameter type happens at the call.
        riroi_align_rotated_backward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(total_elems), THREADS_PER_BLOCK, 0, stream>>>(
                total_elems, grad_out_ptr, roi_ptr, spatial_scale, num_samples,
                clockwise, channels, height, width, pooled_height, pooled_width,
                num_orientations, grad_in_ptr);
      }));

  // Surface launch failures immediately.
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/roi_align_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_musa_helper.hpp"
#include "roi_align_musa_kernel.muh"

// Host-side launcher for the RoIAlign forward kernel.
//
// Channel count and spatial extent are taken from dimensions 1..3 of `input`;
// the launch grid is sized from `output.numel()`. The kernel is dispatched on
// the dtype of `input` (floating types and half) and enqueued on the current
// MUSA stream of `input`'s device. `argmax_y` / `argmax_x` are passed through
// to the kernel with the same element type as `input`.
void ROIAlignForwardMUSAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                       Tensor argmax_y, Tensor argmax_x,
                                       int aligned_height, int aligned_width,
                                       float spatial_scale, int sampling_ratio,
                                       int pool_mode, bool aligned) {
  const int num_outputs = output.numel();
  const int num_channels = input.size(1);
  const int in_height = input.size(2);
  const int in_width = input.size(3);

  c10::musa::MUSAGuard device_guard(input.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "roi_align_forward_musa_kernel", [&] {
        scalar_t *input_ptr = input.data_ptr<scalar_t>();
        scalar_t *rois_ptr = rois.data_ptr<scalar_t>();
        scalar_t *output_ptr = output.data_ptr<scalar_t>();
        scalar_t *argmax_y_ptr = argmax_y.data_ptr<scalar_t>();
        scalar_t *argmax_x_ptr = argmax_x.data_ptr<scalar_t>();
        // Convert the scale once to the dispatched element type.
        const scalar_t scale = static_cast<scalar_t>(spatial_scale);

        roi_align_forward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(num_outputs), THREADS_PER_BLOCK, 0, stream>>>(
                num_outputs, input_ptr, rois_ptr, output_ptr, argmax_y_ptr,
                argmax_x_ptr, aligned_height, aligned_width, scale,
                sampling_ratio, pool_mode, aligned, num_channels, in_height,
                in_width);
      });

  // Surface launch failures immediately.
  AT_MUSA_CHECK(musaGetLastError());
}

// Host-side launcher for the RoIAlign backward kernel, which writes into
// `grad_input`.
//
// Channel count and spatial extent are taken from dimensions 1..3 of
// `grad_input`; the launch grid is sized from `grad_output.numel()`. The
// kernel is dispatched on the dtype of `grad_output` (floating types and
// half) and enqueued on the current MUSA stream of `grad_output`'s device.
void ROIAlignBackwardMUSAKernelLauncher(Tensor grad_output, Tensor rois,
                                        Tensor argmax_y, Tensor argmax_x,
                                        Tensor grad_input, int aligned_height,
                                        int aligned_width, float spatial_scale,
                                        int sampling_ratio, int pool_mode,
                                        bool aligned) {
  const int num_outputs = grad_output.numel();
  const int num_channels = grad_input.size(1);
  const int in_height = grad_input.size(2);
  const int in_width = grad_input.size(3);

  c10::musa::MUSAGuard device_guard(grad_output.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "roi_align_backward_musa_kernel", [&] {
        scalar_t *grad_output_ptr = grad_output.data_ptr<scalar_t>();
        scalar_t *rois_ptr = rois.data_ptr<scalar_t>();
        scalar_t *argmax_y_ptr = argmax_y.data_ptr<scalar_t>();
        scalar_t *argmax_x_ptr = argmax_x.data_ptr<scalar_t>();
        scalar_t *grad_input_ptr = grad_input.data_ptr<scalar_t>();
        // Convert the scale once to the dispatched element type.
        const scalar_t scale = static_cast<scalar_t>(spatial_scale);

        roi_align_backward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(num_outputs), THREADS_PER_BLOCK, 0, stream>>>(
                num_outputs, grad_output_ptr, rois_ptr, argmax_y_ptr,
                argmax_x_ptr, grad_input_ptr, aligned_height, aligned_width,
                scale, sampling_ratio, pool_mode, aligned, num_channels,
                in_height, in_width);
      });

  // Surface launch failures immediately.
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/roi_align_rotated_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_musa_helper.hpp"
#include "roi_align_rotated_musa_kernel.muh"

// Host-side launcher for the rotated RoIAlign forward kernel.
//
// The work size is num_rois * pooled_height * pooled_width * channels. The
// kernel is dispatched on the dtype of `input` (floating types and half).
//
// The launch makes `input`'s device current and targets the current MUSA
// stream. Launching without an explicit stream would enqueue the kernel on
// the default stream, which is not ordered with work the framework has queued
// on a non-default current stream, and launching without a device guard would
// target whichever device happens to be current.
void ROIAlignRotatedForwardMUSAKernelLauncher(
    const at::Tensor input, const at::Tensor rois, const float spatial_scale,
    const int sampling_ratio, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, at::Tensor output) {
  const int output_size = num_rois * pooled_height * pooled_width * channels;
  c10::musa::MUSAGuard device_guard(input.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "ROIAlignRotatedLaucherForward", ([&] {
        const scalar_t *bottom_data = input.data_ptr<scalar_t>();
        const scalar_t *rois_data = rois.data_ptr<scalar_t>();
        scalar_t *top_data = output.data_ptr<scalar_t>();

        roi_align_rotated_forward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, bottom_data, rois_data, scalar_t(spatial_scale),
                sampling_ratio, aligned, clockwise, channels, height, width,
                pooled_height, pooled_width, top_data);
      }));

  // Surface launch failures immediately.
  AT_MUSA_CHECK(musaGetLastError());
}

// Host-side launcher for the rotated RoIAlign backward kernel, which writes
// into `bottom_grad`.
//
// The work size is num_rois * pooled_height * pooled_width * channels. The
// kernel is dispatched on the dtype of `top_grad` (floating types and half).
//
// The launch makes `top_grad`'s device current and targets the current MUSA
// stream. Launching without an explicit stream would enqueue the kernel on
// the default stream, which is not ordered with work the framework has queued
// on a non-default current stream, and launching without a device guard would
// target whichever device happens to be current.
void ROIAlignRotatedBackwardMUSAKernelLauncher(
    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
    const int sampling_ratio, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, at::Tensor bottom_grad) {
  const int output_size = num_rois * pooled_height * pooled_width * channels;
  c10::musa::MUSAGuard device_guard(top_grad.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "ROIAlignLaucherBackward", ([&] {
        const scalar_t *top_diff = top_grad.data_ptr<scalar_t>();
        const scalar_t *rois_data = rois.data_ptr<scalar_t>();
        scalar_t *bottom_diff = bottom_grad.data_ptr<scalar_t>();
        roi_align_rotated_backward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, top_diff, rois_data, spatial_scale, sampling_ratio,
                aligned, clockwise, channels, height, width, pooled_height,
                pooled_width, bottom_diff);
      }));
  // Surface launch failures immediately.
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/roi_pool_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_musa_helper.hpp"
#include "roi_pool_musa_kernel.muh"

// Host-side launcher for the RoI pooling forward kernel.
//
// Channel count and spatial extent are taken from dimensions 1..3 of `input`;
// the launch grid is sized from `output.numel()`. The kernel is dispatched on
// the dtype of `input` (floating types and half), while `argmax` is always
// accessed as an `int` buffer. The kernel is enqueued on the current MUSA
// stream of `input`'s device.
void ROIPoolForwardMUSAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                      Tensor argmax, int pooled_height,
                                      int pooled_width, float spatial_scale) {
  const int num_outputs = output.numel();
  const int num_channels = input.size(1);
  const int in_height = input.size(2);
  const int in_width = input.size(3);

  c10::musa::MUSAGuard device_guard(input.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "roi_pool_forward_musa_kernel", [&] {
        scalar_t *input_ptr = input.data_ptr<scalar_t>();
        scalar_t *rois_ptr = rois.data_ptr<scalar_t>();
        scalar_t *output_ptr = output.data_ptr<scalar_t>();
        int *argmax_ptr = argmax.data_ptr<int>();
        // Convert the scale once to the dispatched element type.
        const scalar_t scale = static_cast<scalar_t>(spatial_scale);

        roi_pool_forward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(num_outputs), THREADS_PER_BLOCK, 0, stream>>>(
                num_outputs, input_ptr, rois_ptr, output_ptr, argmax_ptr,
                pooled_height, pooled_width, scale, num_channels, in_height,
                in_width);
      });

  // Surface launch failures immediately.
  AT_MUSA_CHECK(musaGetLastError());
}

// Host-side launcher for the RoI pooling backward kernel, which writes into
// `grad_input`.
//
// Channel count and spatial extent are taken from dimensions 1..3 of
// `grad_input`; the launch grid is sized from `grad_output.numel()`. The
// kernel is dispatched on the dtype of `grad_output` (floating types and
// half), while `argmax` is always accessed as an `int` buffer.
// `spatial_scale` is accepted for interface symmetry with the forward
// launcher but is not forwarded to the kernel.
void ROIPoolBackwardMUSAKernelLauncher(Tensor grad_output, Tensor rois,
                                       Tensor argmax, Tensor grad_input,
                                       int pooled_height, int pooled_width,
                                       float spatial_scale) {
  const int num_outputs = grad_output.numel();
  const int num_channels = grad_input.size(1);
  const int in_height = grad_input.size(2);
  const int in_width = grad_input.size(3);

  c10::musa::MUSAGuard device_guard(grad_output.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "roi_pool_backward_musa_kernel", [&] {
        scalar_t *grad_output_ptr = grad_output.data_ptr<scalar_t>();
        scalar_t *rois_ptr = rois.data_ptr<scalar_t>();
        int *argmax_ptr = argmax.data_ptr<int>();
        scalar_t *grad_input_ptr = grad_input.data_ptr<scalar_t>();

        roi_pool_backward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(num_outputs), THREADS_PER_BLOCK, 0, stream>>>(
                num_outputs, grad_output_ptr, rois_ptr, argmax_ptr,
                grad_input_ptr, pooled_height, pooled_width, num_channels,
                in_height, in_width);
      });

  // Surface launch failures immediately.
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/roiaware_pool3d_musa.mu
================================================
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.

#include <stdio.h>

#include "pytorch_musa_helper.hpp"
#include "roiaware_pool3d_musa_kernel.muh"

// Host-side launcher for RoI-aware 3D pooling (forward).
//
// Runs three kernel stages on the current MUSA stream of `pts_feature`'s
// device, checking the launch status after each stage:
//   1. generate_pts_mask_for_box3d  - fills the temporary `pts_mask`
//   2. collect_inside_pts_for_box3d - fills `pts_idx_of_voxels` from the mask
//   3. roiaware_maxpool3d / roiaware_avgpool3d - fills `pooled_features`
//      (and `argmax` for max pooling)
void RoiawarePool3dForwardMUSAKernelLauncher(
    int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,
    int out_y, int out_z, const Tensor rois, const Tensor pts,
    const Tensor pts_feature, Tensor argmax, Tensor pts_idx_of_voxels,
    Tensor pooled_features, int pool_method) {
  // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  //              coordinate
  // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
  // params pts_feature: (npoints, C)
  // params argmax: (N, out_x, out_y, out_z, C)
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
  // params pooled_features: (N, out_x, out_y, out_z, C)
  // params pool_method: 0: max_pool, 1: avg_pool

  c10::musa::MUSAGuard device_guard(pts_feature.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  // Temporary (boxes_num, pts_num) int tensor on pts_feature's device, with
  // every entry initialised to -1.
  Tensor pts_mask =
      -at::ones({boxes_num, pts_num}, pts_feature.options().dtype(at::kInt));

  // Stage 1 grid: x covers pts_num, y indexes the boxes.
  dim3 blocks_mask(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), boxes_num);
  dim3 threads(THREADS_PER_BLOCK);

  // Dispatched on the dtype of `rois`; `pts` is read with the same element
  // type.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      rois.scalar_type(), "generate_pts_mask_for_box3d", [&] {
        generate_pts_mask_for_box3d<scalar_t>
            <<<blocks_mask, threads, 0, stream>>>(
                boxes_num, pts_num, out_x, out_y, out_z,
                rois.data_ptr<scalar_t>(), pts.data_ptr<scalar_t>(),
                pts_mask.data_ptr<int>());
      });

  AT_MUSA_CHECK(musaGetLastError());

  // TODO: Merge the collect and pool functions, SS

  // Stage 2 grid: 1-D, covering boxes_num.
  dim3 blocks_collect(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK));

  // Dispatched on the integral dtype of `pts_idx_of_voxels`.
  // NOTE(review): the pooling stage below reads the same tensor through
  // data_ptr<int>(), so callers presumably always pass an int32 tensor -
  // confirm against the Python wrapper.
  AT_DISPATCH_INTEGRAL_TYPES(
      pts_idx_of_voxels.scalar_type(), "collect_inside_pts_for_box3d", [&] {
        collect_inside_pts_for_box3d<scalar_t>
            <<<blocks_collect, threads, 0, stream>>>(
                boxes_num, pts_num, max_pts_each_voxel, out_x, out_y, out_z,
                pts_mask.data_ptr<int>(),
                pts_idx_of_voxels.data_ptr<scalar_t>());
      });

  AT_MUSA_CHECK(musaGetLastError());

  // Stage 3 grid: x covers the out_x * out_y * out_z voxels, y indexes the
  // channels, z indexes the boxes.
  dim3 blocks_pool(GET_BLOCKS(out_x * out_y * out_z, THREADS_PER_BLOCK),
                   channels, boxes_num);
  // Any pool_method other than 0 or 1 launches no pooling kernel.
  if (pool_method == 0) {
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        pts_feature.scalar_type(), "roiaware_maxpool3d", [&] {
          roiaware_maxpool3d<scalar_t><<<blocks_pool, threads, 0, stream>>>(
              boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y,
              out_z, pts_feature.data_ptr<scalar_t>(),
              pts_idx_of_voxels.data_ptr<int>(),
              pooled_features.data_ptr<scalar_t>(), argmax.data_ptr<int>());
        });
  } else if (pool_method == 1) {
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        pts_feature.scalar_type(), "roiaware_avgpool3d", [&] {
          roiaware_avgpool3d<scalar_t><<<blocks_pool, threads, 0, stream>>>(
              boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y,
              out_z, pts_feature.data_ptr<scalar_t>(),
              pts_idx_of_voxels.data_ptr<int>(),
              pooled_features.data_ptr<scalar_t>());
        });
  }

  AT_MUSA_CHECK(musaGetLastError());
}

void RoiawarePool3dBackwardMUSAKernelLauncher(
    int boxes_num, int out_x, int out_y, int out_z, int channels,
    int max_pts_each_voxel, const Tensor pts_idx_of_voxels, const Tensor argmax,
    const Tensor grad_out, Tensor grad_in, int pool_method) {
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
  // params argmax: (N, out_x, out_y, out_z, C)
  // params grad_out: (N, out_x, out_y, out_z, C)
  // params grad_in: (npoints, C), return value
  // params pool_method: 0: max_pool, 1: avg_pool

  c10::musa::MUSAGuard device_guard(grad_out.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  dim3 blocks(GET_BLOCKS(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
              boxes_num);
  dim3 threads(THREADS_PER_BLOCK);

  if (pool_method == 0) {
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        grad_in.scalar_type(), "roiaware_maxpool3d_backward", [&] {
          roiaware_maxpool3d_backward<scalar_t><<<blocks, threads, 0, stream>>>(
              boxes_num, channels, out_x, out_y, out_z, argmax.data_ptr<int>(),
              grad_out.data_ptr<scalar_t>(), grad_in.data_ptr<scalar_t>());
        });
  } else if (pool_method == 1) {
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        grad_in.scalar_type(), "roiaware_avgpool3d_backward", [&] {
          roiaware_avgpool3d_backward<scalar_t><<<blocks, threads, 0, stream>>>(
              boxes_num, channels, out_x, out_y, out_z, max_pts_each_voxel,
              pts_idx_of_voxels.data_ptr<int>(), grad_out.data_ptr<scalar_t>(),
              grad_in.data_ptr<scalar_t>());
        });
  }

  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/roipoint_pool3d_musa.mu
================================================
/*
Modified from
https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu
Point cloud feature pooling
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/

#include <math.h>
#include <stdio.h>

#include "pytorch_musa_helper.hpp"
#include "roipoint_pool3d_musa_kernel.muh"

void RoIPointPool3dForwardMUSAKernelLauncher(
    int batch_size, int pts_num, int boxes_num, int feature_in_len,
    int sampled_pts_num, const Tensor xyz, const Tensor boxes3d,
    const Tensor pts_feature, Tensor pooled_features,
    Tensor pooled_empty_flag) {
  Tensor pts_assign = at::empty({batch_size, pts_num, boxes_num},
                                boxes3d.options().dtype(at::kInt));

  c10::musa::MUSAGuard device_guard(xyz.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      xyz.scalar_type(), "assign_pts_to_box3d", [&] {
        assign_pts_to_box3d<scalar_t><<<blocks, threads, 0, stream>>>(
            batch_size, pts_num, boxes_num, xyz.data_ptr<scalar_t>(),
            boxes3d.data_ptr<scalar_t>(), pts_assign.data_ptr<int>());
      });

  Tensor pts_idx = at::empty({batch_size, boxes_num, sampled_pts_num},
                             boxes3d.options().dtype(at::kInt));

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks2(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK), batch_size);

  get_pooled_idx<<<blocks2, threads, 0, stream>>>(
      batch_size, pts_num, boxes_num, sampled_pts_num,
      pts_assign.data_ptr<int>(), pts_idx.data_ptr<int>(),
      pooled_empty_flag.data_ptr<int>());

  dim3 blocks_pool(GET_BLOCKS(sampled_pts_num, THREADS_PER_BLOCK), boxes_num,
                   batch_size);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      xyz.scalar_type(), "roipoint_pool3d_forward", [&] {
        roipoint_pool3d_forward<scalar_t><<<blocks_pool, threads, 0, stream>>>(
            batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
            xyz.data_ptr<scalar_t>(), pts_idx.data_ptr<int>(),
            pts_feature.data_ptr<scalar_t>(),
            pooled_features.data_ptr<scalar_t>(),
            pooled_empty_flag.data_ptr<int>());
      });
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/rotated_feature_align_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu
#include "pytorch_musa_helper.hpp"
#include "rotated_feature_align_musa_kernel.muh"

void RotatedFeatureAlignForwardMUSAKernelLauncher(const Tensor features,
                                                  const Tensor best_bboxes,
                                                  const float spatial_scale,
                                                  const int points,
                                                  Tensor output) {
  c10::musa::MUSAGuard device_guard(features.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  const int output_size = features.numel();
  AT_DISPATCH_FLOATING_TYPES(
      features.scalar_type(), "rotated_feature_align_forward_musa_kernel",
      ([&] {
        const scalar_t* bottom_data = features.data_ptr<scalar_t>();
        const scalar_t* bboxes_data = best_bboxes.data_ptr<scalar_t>();
        scalar_t* top_data = output.data_ptr<scalar_t>();

        rotated_feature_align_forward_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, points, bottom_data, bboxes_data,
                scalar_t(spatial_scale), features.size(1), features.size(2),
                features.size(3), top_data);
      }));
  AT_MUSA_CHECK(musaGetLastError());
}

void RotatedFeatureAlignBackwardMUSAKernelLauncher(const Tensor top_grad,
                                                   const Tensor best_bboxes,
                                                   const float spatial_scale,
                                                   const int points,
                                                   Tensor bottom_grad) {
  c10::musa::MUSAGuard device_guard(top_grad.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  const int output_size = top_grad.numel();
  AT_DISPATCH_FLOATING_TYPES(
      top_grad.scalar_type(), "rotated_feature_align_backward_musa_kernel",
      ([&] {
        const scalar_t* top_diff = top_grad.data_ptr<scalar_t>();
        const scalar_t* bboxes_data = best_bboxes.data_ptr<scalar_t>();
        scalar_t* bottom_diff = bottom_grad.data_ptr<scalar_t>();

        rotated_feature_align_backward_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, points, top_diff, bboxes_data,
                scalar_t(spatial_scale), top_grad.size(1), top_grad.size(2),
                top_grad.size(3), bottom_diff);
      }));
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/scatter_points_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved.
#include <stdio.h>
#include <stdlib.h>
#include <torch/types.h>

#include "pytorch_musa_helper.hpp"
#include "scatter_points_musa_kernel.muh"

// Reduces per-point features into per-voxel features (forward).
//
// Points are grouped by their coordinate row in `coors`; rows containing any
// negative coordinate are treated as invalid and excluded from the output.
//
// Returns {reduced_feats, out_coors, coors_map, reduce_count}:
//   reduced_feats: (num_voxels, num_feats), same options as `feats`
//   out_coors:     the unique valid coordinate rows
//   coors_map:     int32 index of each input point's voxel (-1 for points
//                  whose coordinates were invalid)
//   reduce_count:  int32 number of points per voxel
// For an empty input the result is detached clones of `feats` and `coors`
// plus two empty int32 tensors.
std::vector<at::Tensor> DynamicPointToVoxelForwardMUSAKernelLauncher(
    const at::Tensor &feats, const at::Tensor &coors,
    const reduce_t reduce_type) {
  const int num_input = feats.size(0);
  const int num_feats = feats.size(1);

  // Nothing to reduce: skip the unique/launch path entirely.
  if (num_input == 0)
    return {feats.clone().detach(), coors.clone().detach(),
            coors.new_empty({0}, torch::kInt32),
            coors.new_empty({0}, torch::kInt32)};

  at::Tensor out_coors;
  at::Tensor coors_map;
  at::Tensor reduce_count;

  // Overwrite every row that has at least one negative coordinate with -1 in
  // all columns, so all invalid points collapse into one sentinel row.
  auto coors_clean = coors.masked_fill(coors.lt(0).any(-1, true), -1);

  // Sorted unique rows along dim 0, together with the inverse mapping
  // (input row -> unique row) and the per-row counts.
  std::tie(out_coors, coors_map, reduce_count) =
      at::unique_dim(coors_clean, 0, true, true, true);

  if (out_coors[0][0].lt(0).item<bool>()) {
    // The first unique row is the all -1 sentinel (it sorts first); drop it
    // and shift the mapping so that invalid points map to -1.
    out_coors = out_coors.slice(0, 1);
    reduce_count = reduce_count.slice(0, 1);
    coors_map = coors_map - 1;
  }

  // The kernels below read these through int32_t pointers.
  coors_map = coors_map.to(torch::kInt32);
  reduce_count = reduce_count.to(torch::kInt32);

  auto reduced_feats =
      at::empty({out_coors.size(0), num_feats}, feats.options());

  c10::musa::MUSAGuard device_guard(feats.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  AT_DISPATCH_FLOATING_TYPES(
      feats.scalar_type(), "feats_reduce_kernel", ([&] {
        // Initialise the accumulator: -inf for MAX, zero for the other
        // reduction types.
        if (reduce_type == reduce_t::MAX)
          reduced_feats.fill_(-std::numeric_limits<scalar_t>::infinity());
        else
          reduced_feats.fill_(static_cast<scalar_t>(0));

        // Grid covers num_input, capped at maxGridDim blocks.
        dim3 blocks(std::min(
            at::musa::ATenCeilDiv(num_input, THREADS_PER_BLOCK), maxGridDim));
        dim3 threads(THREADS_PER_BLOCK);
        feats_reduce_kernel<<<blocks, threads, 0, stream>>>(
            feats.data_ptr<scalar_t>(), coors_map.data_ptr<int32_t>(),
            reduced_feats.data_ptr<scalar_t>(), num_input, num_feats,
            reduce_type);
        // For MEAN the kernel result is divided by the per-voxel point count.
        if (reduce_type == reduce_t::MEAN)
          reduced_feats /= reduce_count.unsqueeze(-1).to(reduced_feats.dtype());
      }));

  AT_MUSA_CHECK(musaGetLastError());

  return {reduced_feats, out_coors, coors_map, reduce_count};
}

// Propagates voxel gradients back to the input points (backward).
//
// `grad_feats` is zero-filled first and is the only tensor written.
//   * MEAN / SUM: a single kernel (add_reduce_traceback_grad_kernel) consumes
//     `grad_reduced_feats`, `coors_map` and `reduce_count`.
//   * otherwise (MAX): two kernels run back to back. The first fills the
//     temporary `reduce_from` (num_reduced, num_feats) int32 tensor, which is
//     initialised to `num_input`; the second uses it to scatter
//     `grad_reduced_feats` into `grad_feats`.
//
// Returns immediately (leaving `grad_feats` zeroed) when there are no input
// points or no reduced voxels.
void DynamicPointToVoxelBackwardMUSAKernelLauncher(
    at::Tensor &grad_feats, const at::Tensor &grad_reduced_feats,
    const at::Tensor &feats, const at::Tensor &reduced_feats,
    const at::Tensor &coors_map, const at::Tensor &reduce_count,
    const reduce_t reduce_type) {
  const int num_input = feats.size(0);
  const int num_reduced = reduced_feats.size(0);
  const int num_feats = feats.size(1);

  grad_feats.fill_(0);
  // copy voxel grad to points

  if (num_input == 0 || num_reduced == 0) return;
  c10::musa::MUSAGuard device_guard(feats.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  if (reduce_type == reduce_t::MEAN || reduce_type == reduce_t::SUM) {
    AT_DISPATCH_FLOATING_TYPES(
        grad_reduced_feats.scalar_type(), "add_reduce_traceback_grad_kernel",
        ([&] {
          // Grid covers num_input, capped at maxGridDim blocks.
          dim3 blocks(std::min(
              at::musa::ATenCeilDiv(num_input, THREADS_PER_BLOCK), maxGridDim));
          dim3 threads(THREADS_PER_BLOCK);
          add_reduce_traceback_grad_kernel<<<blocks, threads, 0, stream>>>(
              grad_feats.data_ptr<scalar_t>(),
              grad_reduced_feats.data_ptr<scalar_t>(),
              coors_map.data_ptr<int32_t>(), reduce_count.data_ptr<int32_t>(),
              num_input, num_feats, reduce_type);
        }));

    AT_MUSA_CHECK(musaGetLastError());
  } else {
    // Every entry starts at num_input, which is not a valid point index.
    auto reduce_from = at::full({num_reduced, num_feats}, num_input,
                                coors_map.options().dtype(torch::kInt32));
    AT_DISPATCH_FLOATING_TYPES(
        grad_reduced_feats.scalar_type(),
        "max_reduce_traceback_scatter_idx_kernel", ([&] {
          // Grid covers num_input, capped at maxGridDim blocks.
          dim3 blocks(std::min(
              at::musa::ATenCeilDiv(num_input, THREADS_PER_BLOCK), maxGridDim));
          dim3 threads(THREADS_PER_BLOCK);
          max_reduce_traceback_scatter_idx_kernel<<<blocks, threads, 0,
                                                    stream>>>(
              feats.data_ptr<scalar_t>(), reduced_feats.data_ptr<scalar_t>(),
              reduce_from.data_ptr<int32_t>(), coors_map.data_ptr<int32_t>(),
              num_input, num_feats);
        }));

    AT_MUSA_CHECK(musaGetLastError());

    // The dispatch label names the kernel actually launched, so that dtype
    // errors raised by the dispatcher point at the right stage.
    AT_DISPATCH_FLOATING_TYPES(
        grad_reduced_feats.scalar_type(), "max_reduce_scatter_grad_kernel",
        ([&] {
          // Grid covers num_reduced, capped at maxGridDim blocks.
          dim3 blocks(
              std::min(at::musa::ATenCeilDiv(num_reduced, THREADS_PER_BLOCK),
                       maxGridDim));
          dim3 threads(THREADS_PER_BLOCK);
          max_reduce_scatter_grad_kernel<<<blocks, threads, 0, stream>>>(
              grad_feats.data_ptr<scalar_t>(),
              grad_reduced_feats.data_ptr<scalar_t>(),
              reduce_from.data_ptr<int32_t>(), num_reduced, num_feats);
        }));

    AT_MUSA_CHECK(musaGetLastError());
  }
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/sparse_indice.mu
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <ATen/ATen.h>
// clang-format off
// TODO: make spconv_utils.h order agnostic
#include "../spconv_utils.h"
// clang-format on
#include <utils/spconv/spconv/indice.h>
#include <utils/spconv/spconv/mp_helper.h>
#include <utils/spconv/tensorview/helper_launch.h>
#include <utils/spconv/tensorview/tensorview.h>

#include <chrono>
#include <limits>
#include <spconv/indice.muh>
#include <type_traits>

#include "pytorch_musa_helper.hpp"

namespace functor {
// MUSA specialization of phase 1 of index-pair generation for regular
// (non-submanifold) sparse convolution.
//
// Launches a single preparation kernel over the `indicesIn.dim(0)` active
// inputs on the stream supplied by `d`: prepareDeConvIndicePairsKernel when
// `transpose` is set, prepareIndicePairsKernel otherwise. Both kernels
// receive the same argument list.
//
// Returns 0 without launching anything when there are no active inputs, and
// 1 once the launch has been issued and checked with TV_CHECK_MUSA_ERR.
//
// NOTE(review): the literal 4096 is a compile-time kernel parameter defined
// in spconv/indice.muh; presumably an upper bound on the kernel volume --
// confirm against the kernel definition.
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1<tv::TorchGPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::TorchGPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose) {
    // The previously declared `batchSize` local was never read and has been
    // dropped; the kernels take everything they need from the tensor views.
    auto numActIn = indicesIn.dim(0);
    // Nothing to prepare for an empty input; skip the launch entirely.
    if (numActIn == 0) return 0;
    if (transpose)
      prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096>
          <<<tv::launch::getBlocks(numActIn), tv::launch::MUSA_NUM_THREADS, 0,
             d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
                              indiceNum, indicePairUnique, kernelSize, stride,
                              padding, dilation, outSpatialShape);
    else
      prepareIndicePairsKernel<Index, IndexGrid, NDim, 4096>
          <<<tv::launch::getBlocks(numActIn), tv::launch::MUSA_NUM_THREADS, 0,
             d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
                              indiceNum, indicePairUnique, kernelSize, stride,
                              padding, dilation, outSpatialShape);
    // Surface launch errors from whichever kernel was issued above.
    TV_CHECK_MUSA_ERR();
    return 1;
  }
};

// MUSA specialization of phase 2 of index-pair generation for regular
// (non-submanifold) sparse convolution.
//
// Issues, in order and on the stream supplied by `d`:
//   1. assignGridAndIndiceOutKernel over `numAct` elements,
//   2. assignIndicePairsKernel over `indicesIn.dim(0)` elements,
//   3. resetGridKernel over `numAct` elements, only when `resetGrid` is set.
// Each launch is followed by TV_CHECK_MUSA_ERR.
//
// `numAct` is `indicePairUnique.dim(0) - 1`; presumably the last entry of
// `indicePairUnique` is a sentinel that is not an output location -- TODO
// confirm against the caller that builds `indicePairUnique`.
//
// Returns 0 without launching anything when there are no active inputs,
// otherwise `numAct`. `transpose` is accepted for interface parity but is
// not read here.
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2<tv::TorchGPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::TorchGPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    // The previously declared `kernelVolume` local was never read and has
    // been dropped.
    Index batchSize = gridsOut.dim(0);
    auto numActIn = indicesIn.dim(0);
    // Nothing to assign for an empty input; skip every launch.
    if (numActIn == 0) return 0;
    Index numAct = indicePairUnique.dim(0) - 1;
    assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numAct), tv::launch::MUSA_NUM_THREADS, 0,
           d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
                            indicePairUnique, outSpatialShape, batchSize);
    TV_CHECK_MUSA_ERR();
    assignIndicePairsKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numActIn), tv::launch::MUSA_NUM_THREADS, 0,
           d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
                            indicePairUnique, outSpatialShape);
    TV_CHECK_MUSA_ERR();

    // Optionally restore the grid entries touched above so the caller can
    // reuse `gridsOut` for a later call.
    if (resetGrid) {
      resetGridKernel<Index, IndexGrid, NDim>
          <<<tv::launch::getBlocks(numAct), tv::launch::MUSA_NUM_THREADS, 0,
             d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);
      TV_CHECK_MUSA_ERR();
    }
    return numAct;
  }
};

// MUSA specialization that builds index pairs for submanifold sparse
// convolution.
//
// On the stream supplied by `d` it launches prepareSubMGridKernel, then
// getSubMIndicePairsKernel, and finally (only when `resetGrid` is set)
// resetGridSubMKernel. All three launches cover the `indicesIn.dim(0)`
// active inputs and are each checked with TV_CHECK_MUSA_ERR.
//
// Returns 0 when there are no active inputs, otherwise the number of active
// inputs. `transpose` is part of the interface but is not read here.
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::TorchGPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::TorchGPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    const auto activeCount = indicesIn.dim(0);
    if (activeCount == 0) {
      return 0;
    }

    // Every kernel below uses the same launch configuration.
    const auto gridSize = tv::launch::getBlocks(activeCount);
    const auto blockSize = tv::launch::MUSA_NUM_THREADS;
    const auto stream = d.getStream();

    // Step 1: populate the grid from the active input coordinates.
    prepareSubMGridKernel<Index, IndexGrid, NDim>
        <<<gridSize, blockSize, 0, stream>>>(indicesIn, gridsOut,
                                             outSpatialShape);
    TV_CHECK_MUSA_ERR();

    // Step 2: derive the index pairs and per-offset counts from that grid.
    getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
        <<<gridSize, blockSize, 0, stream>>>(
            indicesIn, gridsOut, indicePairs, indiceNum, kernelSize, stride,
            padding, dilation, outSpatialShape);
    TV_CHECK_MUSA_ERR();

    // Step 3 (optional): reset the grid entries touched in step 1.
    if (resetGrid) {
      resetGridSubMKernel<Index, IndexGrid, NDim>
          <<<gridSize, blockSize, 0, stream>>>(indicesIn.data(), gridsOut,
                                               outSpatialShape, activeCount);
      TV_CHECK_MUSA_ERR();
    }
    return activeCount;
  }
};
}  // namespace functor

// Explicit template instantiations for the tv::TorchGPU backend.
//
// DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM) instantiates the four index-pair
// functors for the given index type and spatial dimensionality, always with
// `int` as the grid element type.
#define DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM)                             \
  template struct functor::CreateConvIndicePairFunctor<tv::TorchGPU, Index,   \
                                                       int, NDIM>;            \
  template struct functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, Index, \
                                                         int, NDIM>;          \
  template struct functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, Index, \
                                                         int, NDIM>;          \
  template struct functor::CreateSubMIndicePairFunctor<tv::TorchGPU, Index,   \
                                                       int, NDIM>;

// DECLARE_GPU_INDEX(Index) repeats the instantiation above for 1 to 4
// spatial dimensions.
#define DECLARE_GPU_INDEX(Index)          \
  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 1); \
  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 2); \
  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 3); \
  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 4);

// Only `int` indices are instantiated in this translation unit.
DECLARE_GPU_INDEX(int);

// The helper macros are local to this file; do not leak them.
#undef DECLARE_GPU_INDEX
#undef DECLARE_GPU_SPECS_INDEX_NDIM


================================================
FILE: mmcv/ops/csrc/pytorch/musa/sparse_maxpool.mu
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <ATen/ATen.h>
// clang-format off
// TODO: make spconv_utils.h order agnostic
#include "../spconv_utils.h"
// clang-format on
#include <utils/spconv/spconv/maxpool.h>
#include <utils/spconv/spconv/mp_helper.h>
#include <utils/spconv/tensorview/helper_launch.h>
#include <utils/spconv/tensorview/tensorview.h>

#include <chrono>
#include <limits>
#include <type_traits>
#include <utils/spconv/tensorview/helper_kernel.muh>

#include "pytorch_musa_helper.hpp"

// Forward sparse max-pool kernel, scalar (non-vectorized) block variant.
//
// For every (input row, output row) pair taken from indicesIn / indicesOut,
// the output feature is replaced by the input feature whenever the input is
// strictly greater. Each thread handles NumILP pairs per outer iteration and
// the single feature column `threadIdx.x` inside the NumTLP-wide column slab
// selected by `blockIdx.y`.
//
// The loop body performs no per-pair bounds check; `numHot` only bounds the
// block's base pair index.
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdBlockKernel(scalar_t *outFeatures,
                                      const scalar_t *inFeatures,
                                      const Index *indicesIn,
                                      const Index *indicesOut, int numHot,
                                      int numPlanes) {
  // Pair offsets, relative to the block's base pair, owned by this thread.
  int pairOffset[NumILP];
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    pairOffset[k] = threadIdx.y + k * blockDim.y;
  }

  // Shift both feature pointers to this block's column slab.
  outFeatures += blockIdx.y * NumTLP;
  inFeatures += blockIdx.y * NumTLP;

  for (int base = blockIdx.x * blockDim.x; base < numHot;
       base += blockDim.x * gridDim.x) {
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      const int pair = base + pairOffset[k];
      const Index src = indicesIn[pair] * numPlanes + threadIdx.x;
      const Index dst = indicesOut[pair] * numPlanes + threadIdx.x;
      const scalar_t candidate = inFeatures[src];
      const scalar_t current = outFeatures[dst];
      // Keep the running maximum in the output buffer.
      if (candidate > current) {
        outFeatures[dst] = candidate;
      }
    }
  }
}

// Forward sparse max-pool kernel, generic block variant without bounds
// checks.
//
// Pairs are distributed with tv::KernelLoopX and feature columns with
// tv::KernelLoopY. For each visited pair and column, the output feature is
// overwritten by the input feature when the input is strictly greater. Each
// thread processes NumILP pairs per outer iteration, spaced
// `gridDim.x * blockDim.x` apart.
//
// No check is made that `ix + stride` stays below `numHot`; the bounds-checked
// counterpart is maxPoolFwdGenericKernel.
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericBlockKernel(scalar_t *outFeatures,
                                             const scalar_t *inFeatures,
                                             const Index *indicesIn,
                                             const Index *indicesOut,
                                             int numHot, int numPlanes) {
  int pairStride[NumILP];  // distance from the base pair to the k-th pair
  Index srcRow[NumILP];    // flattened start of each input row
  Index dstRow[NumILP];    // flattened start of each output row
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    pairStride[k] = k * gridDim.x * blockDim.x;
  }

  for (int base : tv::KernelLoopX<int, NumILP>(numHot)) {
    // Resolve the row offsets once; they are reused for every column below.
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      const int pair = base + pairStride[k];
      srcRow[k] = indicesIn[pair] * numPlanes;
      dstRow[k] = indicesOut[pair] * numPlanes;
    }

    for (int plane : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int k = 0; k < NumILP; ++k) {
        const scalar_t candidate = inFeatures[srcRow[k] + plane];
        const scalar_t current = outFeatures[dstRow[k] + plane];
        if (candidate > current) {
          outFeatures[dstRow[k] + plane] = candidate;
        }
      }
    }
  }
}

// Forward sparse max-pool kernel, vectorized block variant.
//
// Same update rule as the scalar kernels (output = input where input is
// strictly greater), but features are moved through VecType-sized loads and
// stores, i.e. `vecloadFactor = sizeof(VecType) / sizeof(scalar_t)` scalars at
// a time.
//
// `idxi` / `idxo` index the feature buffers reinterpreted as VecType arrays,
// so `numPlanes` acts as the row stride in VecType units; the launch site in
// SparseMaxPoolForwardFunctor passes `numPlanes / vecloadFactor`.
// NOTE(review): the reinterpret_casts assume the feature rows are suitably
// aligned for VecType -- confirm at the call site.
//
// The loop body performs no per-pair bounds check; `numHot` only bounds the
// block's base pair index.
template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType>
__global__ void maxPoolFwdVecBlockKernel(scalar_t *outFeatures,
                                         const scalar_t *inFeatures,
                                         const Index *indicesIn,
                                         const Index *indicesOut, int numHot,
                                         int numPlanes) {
  // Pair offsets (relative to the block's base pair) owned by this thread.
  int ILPStrideY[NumILP];
  // Number of scalars carried by one VecType load/store.
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);
  // Local staging buffers, each holding exactly one VecType worth of scalars.
  scalar_t bufi[vecloadFactor];
  scalar_t bufo[vecloadFactor];
  Index idxi, idxo;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
  // Shift both feature pointers (in scalar units) to the NumTLP-wide column
  // slab selected by blockIdx.y.
  outFeatures += blockIdx.y * NumTLP;
  inFeatures += blockIdx.y * NumTLP;
  for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
       ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      // Offsets below are in VecType elements, not scalars.
      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      // Wide loads of the current output and the candidate input.
      reinterpret_cast<VecType *>(bufo)[0] =
          reinterpret_cast<VecType *>(outFeatures)[idxo];
      reinterpret_cast<VecType *>(bufi)[0] =
          reinterpret_cast<const VecType *>(inFeatures)[idxi];
      // Element-wise maximum, accumulated into the output staging buffer.
#pragma unroll
      for (int i = 0; i < vecloadFactor; i++) {
        if (bufi[i] > bufo[i]) {
          bufo[i] = bufi[i];
        }
      }
      // Wide store of the merged result (written even if nothing changed).
      reinterpret_cast<VecType *>(outFeatures)[idxo] =
          reinterpret_cast<VecType *>(bufo)[0];
    }
  }
}

// Forward sparse max-pool kernel, generic bounds-checked variant.
//
// Identical in structure to maxPoolFwdGenericBlockKernel, except that every
// pair index is compared against `numHot` before it is used, so the kernel
// is safe when the pair count is not a multiple of the work per block. For
// each in-range pair and feature column, the output feature is replaced by
// the input feature when the input is strictly greater.
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericKernel(scalar_t *outFeatures,
                                        const scalar_t *inFeatures,
                                        const Index *indicesIn,
                                        const Index *indicesOut, int numHot,
                                        int numPlanes) {
  int pairStride[NumILP];  // distance from the base pair to the k-th pair
  Index srcRow[NumILP];    // flattened start of each input row
  Index dstRow[NumILP];    // flattened start of each output row
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    pairStride[k] = k * gridDim.x * blockDim.x;
  }

  for (int base : tv::KernelLoopX<int, NumILP>(numHot)) {
    // Resolve row offsets for the in-range pairs only; out-of-range slots are
    // left untouched and are skipped again below.
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      const int pair = base + pairStride[k];
      if (pair < numHot) {
        srcRow[k] = indicesIn[pair] * numPlanes;
        dstRow[k] = indicesOut[pair] * numPlanes;
      }
    }

    for (int plane : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int k = 0; k < NumILP; ++k) {
        if (base + pairStride[k] >= numHot) {
          continue;
        }
        const scalar_t candidate = inFeatures[srcRow[k] + plane];
        const scalar_t current = outFeatures[dstRow[k] + plane];
        if (candidate > current) {
          outFeatures[dstRow[k] + plane] = candidate;
        }
      }
    }
  }
}

// Backward sparse max-pool kernel, scalar (non-vectorized) block variant.
//
// For every (input row, output row) pair, the output gradient `fout` is
// accumulated into the input gradient `fin` wherever the input feature
// compares equal to the pooled output feature. Each thread handles NumILP
// pairs per outer iteration and the feature column `threadIdx.x` inside the
// NumTLP-wide column slab selected by `blockIdx.y`.
//
// The loop body performs no per-pair bounds check; `numHot` only bounds the
// block's base pair index.
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdBlockKernel(const scalar_t *outFeatures,
                                      const scalar_t *inFeatures,
                                      const scalar_t *fout, scalar_t *fin,
                                      const Index *indicesIn,
                                      const Index *indicesOut, int numHot,
                                      int numPlanes) {
  // Pair offsets, relative to the block's base pair, owned by this thread.
  int pairOffset[NumILP];
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    pairOffset[k] = threadIdx.y + k * blockDim.y;
  }

  // Shift all four buffers to this block's column slab.
  outFeatures += blockIdx.y * NumTLP;
  inFeatures += blockIdx.y * NumTLP;
  fout += blockIdx.y * NumTLP;
  fin += blockIdx.y * NumTLP;

  for (int base = blockIdx.x * blockDim.x; base < numHot;
       base += blockDim.x * gridDim.x) {
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      const int pair = base + pairOffset[k];
      const Index src = indicesIn[pair] * numPlanes + threadIdx.x;
      const Index dst = indicesOut[pair] * numPlanes + threadIdx.x;
      const scalar_t inputValue = inFeatures[src];
      const scalar_t pooledValue = outFeatures[dst];
      // Only inputs equal to the pooled value receive gradient.
      if (inputValue == pooledValue) {
        fin[src] += fout[dst];
      }
    }
  }
}

// Backward sparse max-pool kernel, generic block variant without bounds
// checks.
//
// Pairs are distributed with tv::KernelLoopX and feature columns with
// tv::KernelLoopY. For each visited pair and column, the output gradient
// `fout` is accumulated into the input gradient `fin` when the input feature
// compares equal to the pooled output feature.
//
// No check is made that `ix + stride` stays below `numHot`; the bounds-checked
// counterpart is maxPoolBwdGenericKernel.
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericBlockKernel(
    const scalar_t *outFeatures, const scalar_t *inFeatures,
    const scalar_t *fout, scalar_t *fin, const Index *indicesIn,
    const Index *indicesOut, int numHot, int numPlanes) {
  int pairStride[NumILP];  // distance from the base pair to the k-th pair
  Index srcRow[NumILP];    // flattened start of each input row
  Index dstRow[NumILP];    // flattened start of each output row
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    pairStride[k] = k * gridDim.x * blockDim.x;
  }

  for (int base : tv::KernelLoopX<int, NumILP>(numHot)) {
    // Resolve the row offsets once; they are reused for every column below.
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      const int pair = base + pairStride[k];
      srcRow[k] = indicesIn[pair] * numPlanes;
      dstRow[k] = indicesOut[pair] * numPlanes;
    }

    for (int plane : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int k = 0; k < NumILP; ++k) {
        const scalar_t inputValue = inFeatures[srcRow[k] + plane];
        const scalar_t pooledValue = outFeatures[dstRow[k] + plane];
        // Only inputs equal to the pooled value receive gradient.
        if (inputValue == pooledValue) {
          fin[srcRow[k] + plane] += fout[dstRow[k] + plane];
        }
      }
    }
  }
}

// Backward sparse max-pool kernel, vectorized block variant.
//
// For every (input row, output row) pair, the output gradient `fout` is
// accumulated into the input gradient `fin` wherever the input feature
// compares equal to the pooled output feature. Data moves through
// VecType-sized loads and stores, i.e.
// `vecloadFactor = sizeof(VecType) / sizeof(scalar_t)` scalars at a time.
//
// `idxi` / `idxo` index the buffers reinterpreted as VecType arrays, so
// `numPlanes` acts as the row stride in VecType units; the launch site in
// SparseMaxPoolBackwardFunctor passes `numPlanes / vecloadFactor`.
// NOTE(review): the reinterpret_casts assume the rows are suitably aligned
// for VecType -- confirm at the call site.
//
// Fix: `fout` and `fin` are now advanced to the column slab selected by
// `blockIdx.y`, exactly like `outFeatures` / `inFeatures` (and like
// maxPoolBwdBlockKernel). Previously only the feature pointers were shifted,
// so with more than one slab (`gridDim.y > 1`) every slab compared its own
// features but read and accumulated the gradients of slab 0.
//
// The loop body performs no per-pair bounds check; `numHot` only bounds the
// block's base pair index.
template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType>
__global__ void maxPoolBwdVecBlockKernel(const scalar_t *outFeatures,
                                         const scalar_t *inFeatures,
                                         const scalar_t *fout, scalar_t *fin,
                                         const Index *indicesIn,
                                         const Index *indicesOut, int numHot,
                                         int numPlanes) {
  // Pair offsets (relative to the block's base pair) owned by this thread.
  int ILPStrideY[NumILP];
  // Number of scalars carried by one VecType load/store.
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);
  // Local staging buffers, each holding exactly one VecType worth of scalars.
  scalar_t bufi[vecloadFactor];   // input features
  scalar_t bufo[vecloadFactor];   // pooled output features
  scalar_t bufdi[vecloadFactor];  // input gradient (read-modify-write)
  scalar_t bufdo[vecloadFactor];  // output gradient
  Index idxi, idxo;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
  // Shift all four buffers (in scalar units) to this block's column slab so
  // features and gradients refer to the same columns.
  outFeatures += blockIdx.y * NumTLP;
  inFeatures += blockIdx.y * NumTLP;
  fout += blockIdx.y * NumTLP;
  fin += blockIdx.y * NumTLP;
  for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
       ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      // Offsets below are in VecType elements, not scalars.
      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      reinterpret_cast<VecType *>(bufo)[0] =
          reinterpret_cast<const VecType *>(outFeatures)[idxo];
      reinterpret_cast<VecType *>(bufi)[0] =
          reinterpret_cast<const VecType *>(inFeatures)[idxi];
      reinterpret_cast<VecType *>(bufdo)[0] =
          reinterpret_cast<const VecType *>(fout)[idxo];
      reinterpret_cast<VecType *>(bufdi)[0] =
          reinterpret_cast<VecType *>(fin)[idxi];

      // Only inputs equal to the pooled value receive gradient.
#pragma unroll
      for (int i = 0; i < vecloadFactor; i++) {
        if (bufi[i] == bufo[i]) {
          bufdi[i] += bufdo[i];
        }
      }
      // Wide store of the updated input gradient (written even if unchanged).
      reinterpret_cast<VecType *>(fin)[idxi] =
          reinterpret_cast<VecType *>(bufdi)[0];
    }
  }
}

// Backward sparse max-pool kernel, generic bounds-checked variant.
//
// Identical in structure to maxPoolBwdGenericBlockKernel, except that every
// pair index is compared against `numHot` before it is used. For each
// in-range pair and feature column, the output gradient `fout` is
// accumulated into the input gradient `fin` when the input feature compares
// equal to the pooled output feature.
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericKernel(const scalar_t *outFeatures,
                                        const scalar_t *inFeatures,
                                        const scalar_t *fout, scalar_t *fin,
                                        const Index *indicesIn,
                                        const Index *indicesOut, int numHot,
                                        int numPlanes) {
  int pairStride[NumILP];  // distance from the base pair to the k-th pair
  Index srcRow[NumILP];    // flattened start of each input row
  Index dstRow[NumILP];    // flattened start of each output row
#pragma unroll
  for (int k = 0; k < NumILP; ++k) {
    pairStride[k] = k * gridDim.x * blockDim.x;
  }

  for (int base : tv::KernelLoopX<int, NumILP>(numHot)) {
    // Resolve row offsets for the in-range pairs only; out-of-range slots are
    // left untouched and are skipped again below.
#pragma unroll
    for (int k = 0; k < NumILP; ++k) {
      const int pair = base + pairStride[k];
      if (pair < numHot) {
        srcRow[k] = indicesIn[pair] * numPlanes;
        dstRow[k] = indicesOut[pair] * numPlanes;
      }
    }

    for (int plane : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int k = 0; k < NumILP; ++k) {
        if (base + pairStride[k] >= numHot) {
          continue;
        }
        const scalar_t inputValue = inFeatures[srcRow[k] + plane];
        const scalar_t pooledValue = outFeatures[dstRow[k] + plane];
        // Only inputs equal to the pooled value receive gradient.
        if (inputValue == pooledValue) {
          fin[srcRow[k] + plane] += fout[dstRow[k] + plane];
        }
      }
    }
  }
}

namespace functor {
// MUSA specialization of the sparse max-pool forward pass for one set of
// (input row, output row) index pairs.
//
// Arguments:
//   d           - device handle; only its stream is used.
//   outFeatures - output features, updated in place with the running maximum.
//   inFeatures  - input features; `dim(1)` is taken as the channel count.
//   indices     - index pairs; `subview(0)` supplies the input rows and
//                 `subview(1)` the output rows.
//   size        - number of pairs to process; nothing is launched if <= 0.
//
// Strategy: try the tile widths in `kernel_block_t` in order and use the
// first one that divides the channel count. For that width the bulk of the
// pairs (a multiple of the tile width) goes through the vectorized block
// kernel and the remainder through the bounds-checked generic kernel. If no
// width divides the channel count, fall back to the generic kernels with a
// tile width of 64.
template <typename scalar_t, typename Index>
struct SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, Index> {
  // Vector type for wide loads/stores: int2 for at::Half, int4 otherwise.
  using vecload_type_t =
      std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
  // Candidate channel-tile widths, listed in the order they are tried.
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> outFeatures,
                  tv::TensorView<const scalar_t> inFeatures,
                  tv::TensorView<const Index> indices, int size) {
    if (size <= 0) return;
    int numPlanes = inFeatures.dim(1);
    // Stays true until one of the candidate tile widths has been used.
    bool notFound = true;
    // Scalars per vector load; used to size the thread block and to convert
    // the channel count into vector units for the vectorized kernel.
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);
    // The lambda is handed each entry of kernel_block_t as `NumTLP`
    // (presumably once per entry, in list order -- see mp_helper.h).
    mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices,
                                 &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;

      // Largest multiple of NumTLP that does not exceed `size`.
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (numHotBlock >= NumTLP) {
            // Bulk part: vectorized kernel, grid.x capped at 512 blocks,
            // one grid.y block per NumTLP-wide channel slab.
            maxPoolFwdVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
                                     vecload_type_t>
                <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(outFeatures.data(), inFeatures.data(),
                                    indices.subview(0).data(),
                                    indices.subview(1).data(), numHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_MUSA_ERR();
          }

          if (size > numHotBlock) {
            // Tail part: the remaining `size - numHotBlock` pairs go through
            // the bounds-checked kernel.
            maxPoolFwdGenericKernel<scalar_t, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
                   0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
                                       indices.subview(0).data() + numHotBlock,
                                       indices.subview(1).data() + numHotBlock,
                                       size - numHotBlock, numPlanes);
            TV_CHECK_MUSA_ERR();
          }
          // Prevent the remaining (smaller) tile widths from launching again.
          notFound = false;
        }
      }
    });

    // Fallback: the channel count is not a multiple of any candidate width.
    if (notFound) {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (numHotBlock >= NumTLP) {
        // Bulk part: unchecked generic kernel; grid.y is rounded up so the
        // partial last channel slab is covered.
        maxPoolFwdGenericBlockKernel<scalar_t, Index, NumTLP, NumILP>
            <<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(),
                indices.subview(0).data(), indices.subview(1).data(),
                numHotBlock, numPlanes);
        TV_CHECK_MUSA_ERR();
      }

      if (size > numHotBlock) {
        // Tail part: bounds-checked generic kernel for the leftover pairs.
        maxPoolFwdGenericKernel<scalar_t, Index, NumTLP, NumILP>
            <<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(),
                indices.subview(0).data() + numHotBlock,
                indices.subview(1).data() + numHotBlock, size - numHotBlock,
                numPlanes);
        TV_CHECK_MUSA_ERR();
      }
    }
  }
};

// MUSA specialization of the sparse max-pool backward pass for one set of
// (input row, output row) index pairs.
//
// Arguments:
//   d           - device handle; only its stream is used.
//   outFeatures - pooled output features from the forward pass.
//   inFeatures  - input features; `dim(1)` is taken as the channel count.
//   fout        - gradient with respect to the output features.
//   fin         - gradient with respect to the input features, accumulated
//                 in place.
//   indices     - index pairs; `subview(0)` supplies the input rows and
//                 `subview(1)` the output rows.
//   size        - number of pairs to process; nothing is launched if <= 0.
//
// Strategy: mirrors the forward functor. The first tile width in
// `kernel_block_t` that divides the channel count is used, with the bulk of
// the pairs going through the vectorized block kernel and the remainder
// through the bounds-checked generic kernel. If no width divides the channel
// count, the generic kernels are used with a tile width of 64.
template <typename scalar_t, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::TorchGPU, scalar_t, Index> {
  // Vector type for wide loads/stores: int2 for at::Half, int4 otherwise.
  using vecload_type_t =
      std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
  // Candidate channel-tile widths, listed in the order they are tried.
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::TorchGPU &d,
                  tv::TensorView<const scalar_t> outFeatures,
                  tv::TensorView<const scalar_t> inFeatures,
                  tv::TensorView<const scalar_t> fout,
                  tv::TensorView<scalar_t> fin,
                  tv::TensorView<const Index> indices, int size) {
    if (size <= 0) return;
    int numPlanes = inFeatures.dim(1);
    // Stays true until one of the candidate tile widths has been used.
    bool notFound = true;
    // Scalars per vector load; used to size the thread block and to convert
    // the channel count into vector units for the vectorized kernel.
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);
    // The lambda is handed each entry of kernel_block_t as `NumTLP`
    // (presumably once per entry, in list order -- see mp_helper.h).
    mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &fout, &fin,
                                 &indices, &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;

      // Largest multiple of NumTLP that does not exceed `size`.
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (numHotBlock >= NumTLP) {
            // Bulk part: vectorized kernel, grid.x capped at 512 blocks,
            // one grid.y block per NumTLP-wide channel slab.
            maxPoolBwdVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
                                     vecload_type_t>
                <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(outFeatures.data(), inFeatures.data(),
                                    fout.data(), fin.data(),
                                    indices.subview(0).data(),
                                    indices.subview(1).data(), numHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_MUSA_ERR();
          }

          if (size > numHotBlock) {
            // Tail part: the remaining `size - numHotBlock` pairs go through
            // the bounds-checked kernel.
            maxPoolBwdGenericKernel<scalar_t, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
                   0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
                                       fout.data(), fin.data(),
                                       indices.subview(0).data() + numHotBlock,
                                       indices.subview(1).data() + numHotBlock,
                                       size - numHotBlock, numPlanes);
            TV_CHECK_MUSA_ERR();
          }
          // Prevent the remaining (smaller) tile widths from launching again.
          notFound = false;
        }
      }
    });

    // Fallback: the channel count is not a multiple of any candidate width.
    if (notFound) {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (numHotBlock >= NumTLP) {
        // Bulk part: unchecked generic kernel; grid.y is rounded up so the
        // partial last channel slab is covered.
        maxPoolBwdGenericBlockKernel<scalar_t, Index, NumTLP, NumILP>
            <<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(), fout.data(), fin.data(),
                indices.subview(0).data(), indices.subview(1).data(),
                numHotBlock, numPlanes);
        TV_CHECK_MUSA_ERR();
      }

      if (size > numHotBlock) {
        // Tail part: bounds-checked generic kernel for the leftover pairs.
        maxPoolBwdGenericKernel<scalar_t, Index, NumTLP, NumILP>
            <<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(), fout.data(), fin.data(),
                indices.subview(0).data() + numHotBlock,
                indices.subview(1).data() + numHotBlock, size - numHotBlock,
                numPlanes);
        TV_CHECK_MUSA_ERR();
      }
    }
  }
};

}  // namespace functor

// Explicit template instantiations for the tv::TorchGPU backend.
//
// DECLARE_GPU_SPECS_T_INDEX(scalar_t, Index) instantiates both the forward
// and the backward max-pool functor for one scalar/index type combination.
#define DECLARE_GPU_SPECS_T_INDEX(scalar_t, Index)                             \
  template struct functor::SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, \
                                                       Index>;                 \
  template struct functor::SparseMaxPoolBackwardFunctor<tv::TorchGPU,          \
                                                        scalar_t, Index>;

// DECLARE_GPU_SPECS(scalar_t) fixes the index type to `int`.
#define DECLARE_GPU_SPECS(scalar_t) DECLARE_GPU_SPECS_T_INDEX(scalar_t, int);

// Scalar types instantiated in this translation unit.
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);

// The helper macros are local to this file; do not leak them.
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX


================================================
FILE: mmcv/ops/csrc/pytorch/musa/sparse_pool_ops_musa.mu
================================================
#include <musa_runtime_api.h>
#include <torch/script.h>
// clang-format off
// TODO: make spconv_utils.h order agnostic
#include "../spconv_utils.h"
// clang-format on
#include <utils/spconv/spconv/maxpool.h>

#include "pytorch_musa_helper.hpp"

// Sparse max-pool forward launcher.
//
// Allocates a zero-initialized `{numAct, features.size(1)}` output with the
// dtype and device of `features`, then, for every slice `i` along the first
// dimension of `indicePairs` whose pair count in `indiceNum` is positive,
// runs SparseMaxPoolForwardFunctor on that slice. The functor backend is
// chosen from the device type of `features` (tv::CPU for CPU tensors,
// tv::TorchGPU otherwise).
//
// `indiceNum` is copied to the host and read as `int`, so it is expected to
// hold 32-bit integers. Only the floating types covered by
// AT_DISPATCH_FLOATING_TYPES are dispatched.
//
// Returns the pooled output tensor.
torch::Tensor IndiceMaxpoolForwardMUSAKernelLauncher(torch::Tensor features,
                                                     torch::Tensor indicePairs,
                                                     torch::Tensor indiceNum,
                                                     int64_t numAct) {
  c10::musa::MUSAGuard device_guard(features.device());
  const auto deviceType = features.device().type();
  const auto numOffsets = indicePairs.size(0);
  const auto numChannels = features.size(1);
  // Pair counts are consulted on the host to decide which slices to process.
  auto pairCountsHost = indiceNum.to({torch::kCPU});
  torch::Tensor output = torch::zeros(
      {numAct, numChannels},
      torch::TensorOptions().dtype(features.dtype()).device(features.device()));

  for (int offset = 0; offset < numOffsets; ++offset) {
    const auto pairCount = pairCountsHost.data_ptr<int>()[offset];
    // Slices without any pair contribute nothing to the output.
    if (pairCount > 0) {
      AT_DISPATCH_FLOATING_TYPES(
          features.scalar_type(), "IndiceMaxpoolForwardKernel", [&] {
            if (deviceType != torch::kCPU) {
              functor::SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, int>
                  poolOnDevice;
              poolOnDevice(
                  tv::TorchGPU(), tv::torch2tv<scalar_t>(output),
                  tv::torch2tv<const scalar_t>(features),
                  tv::torch2tv<const int>(indicePairs).subview(offset),
                  pairCount);
              TV_CHECK_MUSA_ERR();
            } else {
              functor::SparseMaxPoolForwardFunctor<tv::CPU, scalar_t, int>
                  poolOnHost;
              poolOnHost(tv::CPU(), tv::torch2tv<scalar_t>(output),
                         tv::torch2tv<const scalar_t>(features),
                         tv::torch2tv<const int>(indicePairs).subview(offset),
                         pairCount);
            }
          });
    }
  }
  return output;
}

// Sparse max-pool backward launcher.
//
// Allocates a zero-initialized input gradient with the shape, dtype and
// device of `features`, then, for every slice `i` along the first dimension
// of `indicePairs` whose pair count in `indiceNum` is positive, runs
// SparseMaxPoolBackwardFunctor on that slice. The functor backend is chosen
// from the device type of `features` (tv::CPU for CPU tensors, tv::TorchGPU
// otherwise).
//
// `indiceNum` is copied to the host and read as `int`, so it is expected to
// hold 32-bit integers. Only the floating types covered by
// AT_DISPATCH_FLOATING_TYPES are dispatched.
//
// Returns the gradient with respect to `features`.
torch::Tensor IndiceMaxpoolBackwardMUSAKernelLauncher(torch::Tensor features,
                                                      torch::Tensor outFeatures,
                                                      torch::Tensor outGrad,
                                                      torch::Tensor indicePairs,
                                                      torch::Tensor indiceNum) {
  c10::musa::MUSAGuard device_guard(features.device());
  const auto deviceType = features.device().type();
  // Not read below; the call is kept as-is because querying dimension 1 can
  // itself reject tensors that lack a second dimension.
  auto numInPlanes = features.size(1);
  // Pair counts are consulted on the host to decide which slices to process.
  auto pairCountsHost = indiceNum.to({torch::kCPU});
  torch::Tensor inputGrad = torch::zeros(
      features.sizes(),
      torch::TensorOptions().dtype(features.dtype()).device(features.device()));
  const auto numOffsets = indicePairs.size(0);

  for (int offset = 0; offset < numOffsets; ++offset) {
    const auto pairCount = pairCountsHost.data_ptr<int>()[offset];
    // Slices without any pair contribute nothing to the gradient.
    if (pairCount > 0) {
      AT_DISPATCH_FLOATING_TYPES(
          features.scalar_type(), "IndiceMaxpoolBackwardKernel", [&] {
            if (deviceType != torch::kCPU) {
              functor::SparseMaxPoolBackwardFunctor<tv::TorchGPU, scalar_t, int>
                  gradOnDevice;
              gradOnDevice(
                  tv::TorchGPU(), tv::torch2tv<const scalar_t>(outFeatures),
                  tv::torch2tv<const scalar_t>(features),
                  tv::torch2tv<const scalar_t>(outGrad),
                  tv::torch2tv<scalar_t>(inputGrad),
                  tv::torch2tv<const int>(indicePairs).subview(offset),
                  pairCount);
              TV_CHECK_MUSA_ERR();
            } else {
              functor::SparseMaxPoolBackwardFunctor<tv::CPU, scalar_t, int>
                  gradOnHost;
              gradOnHost(tv::CPU(), tv::torch2tv<const scalar_t>(outFeatures),
                         tv::torch2tv<const scalar_t>(features),
                         tv::torch2tv<const scalar_t>(outGrad),
                         tv::torch2tv<scalar_t>(inputGrad),
                         tv::torch2tv<const int>(indicePairs).subview(offset),
                         pairCount);
            }
          });
    }
  }
  return inputGrad;
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/sparse_reordering.mu
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <ATen/ATen.h>
// clang-format off
// TODO: make spconv_utils.h order agnostic
#include "../spconv_utils.h"
// clang-format on
#include <utils/spconv/spconv/mp_helper.h>
#include <utils/spconv/spconv/reordering.h>
#include <utils/spconv/tensorview/helper_launch.h>
#include <utils/spconv/tensorview/tensorview.h>

#include <chrono>
#include <limits>
#include <spconv/reordering.muh>
#include <type_traits>
#include <utils/spconv/tensorview/helper_kernel.muh>

#include "pytorch_musa_helper.hpp"

namespace functor {
// MUSA (tv::TorchGPU) specialization of the sparse gather functor.
//
// Launches one of the gather kernels declared in spconv/reordering.muh to
// fill `buffer` from `features` using `indices`. The kernel bodies are not
// visible here; presumably row k of `buffer` receives row indices[k] of
// `features` -- TODO confirm against the kernel header.
template <typename scalar_t, typename Index>
struct SparseGatherFunctor<tv::TorchGPU, scalar_t, Index> {
  // Vector type used for wide loads: int2 for at::Half, int4 otherwise.
  using vecload_type_t =
      std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
  // Candidate tile sizes (NumTLP) offered to mp_for_each below.
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
  // d:        device handle; only d.getStream() is used.
  // buffer:   destination view.
  // features: source view; features.dim(1) is taken as the plane count.
  // indices:  index view handed to the kernels.
  // size:     number of entries to gather; nothing is launched if <= 0.
  void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> buffer,
                  tv::TensorView<const scalar_t> features,
                  tv::TensorView<const Index> indices, int size) {
    if (size <= 0) return;
    int numPlanes = features.dim(1);
    // Stays true until one candidate tile size has handled the request.
    bool notFound = true;
    // Number of scalar_t elements covered by one vecload_type_t.
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);
    mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
                                 &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;
      // Largest multiple of NumTLP that does not exceed `size`.
      int nHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        // Vectorized kernels are only used when the plane count is an exact
        // multiple of the tile size.
        if (numPlanes % NumTLP == 0) {
          if (nHotBlock >= NumTLP) {
            // Full tiles: the first nHotBlock entries.
            gatherVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
                                 vecload_type_t>
                <<<dim3(numPlanes / NumTLP, size / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(buffer.data(), features.data(),
                                    indices.data(), nHotBlock,
                                    numPlanes / vecloadFactor);

            TV_CHECK_MUSA_ERR();
          }
          if (size - nHotBlock > 0) {
            // Remainder: the trailing (size - nHotBlock) entries, with the
            // buffer and index pointers advanced past the full tiles.
            gatherVecKernel<scalar_t, Index, int(NumTLP), NumILP,
                            vecload_type_t>
                <<<dim3(1, numPlanes / NumTLP),
                   dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
                   d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
                                    features.data(), indices.data() + nHotBlock,
                                    size - nHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_MUSA_ERR();
          }
          notFound = false;
        }
      }
    });

    if (notFound) {
      // No candidate tile size divides numPlanes: use the generic kernel
      // with a fixed tile of 64 and a rounded-up grid.
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      gatherGenericKernel<scalar_t, Index, NumTLP, NumILP>
          <<<dim3(tv::launch::DivUp(size, NumTLP),
                  tv::launch::DivUp(numPlanes, NumTLP)),
             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
              buffer.data(), features.data(), indices.data(), size, numPlanes);
      TV_CHECK_MUSA_ERR();
    }
  }
};
// MUSA (tv::TorchGPU) specialization of the sparse scatter-add functor.
//
// Launches one of the scatter-add kernels declared in spconv/reordering.muh
// with `outFeatures` as destination and `buffer` as source. The kernel
// bodies are not visible here; presumably row k of `buffer` is accumulated
// into row indices[k] of `outFeatures` -- TODO confirm against the kernel
// header.
template <typename scalar_t, typename Index>
struct SparseScatterAddFunctor<tv::TorchGPU, scalar_t, Index> {
  // Vector type used for wide accesses: int2 for at::Half, int4 otherwise.
  using vecload_type_t =
      std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
  // Candidate tile sizes (NumTLP) offered to mp_for_each below.
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
  // d:           device handle; only d.getStream() is used.
  // outFeatures: destination view; outFeatures.dim(1) is the plane count.
  // buffer:      source view.
  // indices:     index view handed to the kernels.
  // size:        number of entries to scatter; nothing is launched if <= 0.
  // stable:      not referenced anywhere in this specialization.
  void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> outFeatures,
                  tv::TensorView<const scalar_t> buffer,
                  tv::TensorView<const Index> indices, int size, bool stable) {
    if (size <= 0) return;
    int numPlanes = outFeatures.dim(1);
    // Stays true until one candidate tile size has handled the request.
    bool notFound = true;
    constexpr int vecloadFactor =
        sizeof(vecload_type_t) / sizeof(scalar_t);  // important for half.
    mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
                                 &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;
      // Largest multiple of NumTLP that does not exceed `size`.
      int nHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        // The vectorized block kernel is only used when the plane count is
        // an exact multiple of the tile size.
        if (numPlanes % NumTLP == 0) {
          if (nHotBlock >= NumTLP) {
            // Full tiles: the first nHotBlock entries.
            scatterAddVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
                                     vecload_type_t>
                <<<dim3(numPlanes / NumTLP, size / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(outFeatures.data(), buffer.data(),
                                    indices.data(), nHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_MUSA_ERR();
          }
          if (size - nHotBlock > 0) {
            // Remainder: handled by the generic (non-vectorized) kernel, so
            // the plane count is passed unscaled.
            scatterAddGenericKernel<scalar_t, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
                   0, d.getStream()>>>(
                    outFeatures.data(), buffer.data() + nHotBlock * numPlanes,
                    indices.data() + nHotBlock, size - nHotBlock, numPlanes);
            TV_CHECK_MUSA_ERR();
          }
          notFound = false;
        }
      }
    });
    if (notFound) {
      // No candidate tile size divides numPlanes: use the generic kernel
      // with a fixed tile of 64 and a rounded-up grid.
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      scatterAddGenericKernel<scalar_t, Index, NumTLP, NumILP>
          <<<dim3(tv::launch::DivUp(size, NumTLP),
                  tv::launch::DivUp(numPlanes, NumTLP)),
             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
              outFeatures.data(), buffer.data(), indices.data(), size,
              numPlanes);
      TV_CHECK_MUSA_ERR();
    }
  }
};

}  // namespace functor

// Explicitly instantiate both TorchGPU functors for one (scalar, index) pair.
#define DECLARE_GPU_SPECS_T_INDEX(scalar_t, Index)                             \
  template struct functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, Index>; \
  template struct functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t,     \
                                                   Index>;

// Only `int` indices are instantiated.
#define DECLARE_GPU_SPECS(scalar_t) DECLARE_GPU_SPECS_T_INDEX(scalar_t, int);

// Scalar types available to callers of the TorchGPU functors.
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);

// The helper macros are local to this translation unit.
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX


================================================
FILE: mmcv/ops/csrc/pytorch/musa/spconv_ops_musa.mu
================================================
#include <musa_runtime_api.h>
#include <torch/script.h>
// clang-format off
// TODO: make spconv_utils.h order agnostic
#include "../spconv_utils.h"
// clang-format on
#include <utils/spconv/spconv/indice.h>
#include <utils/spconv/spconv/reordering.h>

#include "pytorch_musa_helper.hpp"

// Builds the indice-pair tables for a sparse convolution (forward variant).
//
// Template parameter:
//   NDim - number of coordinate dimensions; must equal indices.size(1) - 1.
// Parameters:
//   indices         - int32 index tensor with NDim + 1 columns (the extra
//                     column is presumably the batch index -- TODO confirm).
//   batchSize       - used only to size the internally allocated gridOut.
//   outSpatialShape - output extent per dimension (size NDim).
//   spatialShape    - not referenced in this function body.
//   kernelSize, stride, padding, dilation - per-dimension settings (size NDim).
//   outPadding      - only its size is validated here.
//   _subM           - non-zero selects the sub-manifold path, which forces
//                     stride to 1 and padding to kernelSize / 2.
//   _transpose      - non-zero is forwarded as `transpose` to the functors.
// Returns:
//   subM:      {indices, indicePairs, indiceNum}
//   otherwise: {outInds[0:numActOut], indicePairs, indiceNum}
// Raises a TV runtime error if any size check fails or the kernel volume
// exceeds 4096.
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsForwardMUSAKernelLauncher(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
  c10::musa::MUSAGuard device_guard(indices.device());
  bool subM = _subM != 0;
  bool transpose = _transpose != 0;
  auto numAct = indices.size(0);
  auto coorDim = indices.size(1) - 1;
  // Every per-dimension argument must match the coordinate dimensionality.
  TV_ASSERT_RT_ERR(NDim == coorDim, "error");
  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
  // Product of all kernel extents.
  auto kernelVolume = kernelSize[0];
  for (int i = 1; i < kernelSize.size(); ++i) {
    kernelVolume *= kernelSize[i];
  }
  TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
  // Product of all output extents.
  auto outputVolume = outSpatialShape[0];
  for (int i = 1; i < outSpatialShape.size(); ++i) {
    outputVolume *= outSpatialShape[i];
  }
  // Work tensors, all int32 on the device of `indices`:
  //   indicePairs (kernelVolume, 2, numAct) filled with -1
  //   indiceNum   (kernelVolume)            filled with 0
  //   gridOut     (batchSize * outputVolume) filled with -1
  torch::Tensor indicePairs =
      torch::full({kernelVolume, 2, numAct}, -1,
                  torch::dtype(torch::kInt32).device(indices.device()));
  torch::Tensor indiceNum = torch::zeros(
      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
  torch::Tensor gridOut =
      torch::full({batchSize * outputVolume}, -1,
                  torch::dtype(torch::kInt32).device(indices.device()));
  int64_t numActOut = -1;
  // 32-bit copies of the per-dimension settings for the functors.
  tv::SimpleVector<int, NDim> outSpatialShape32;
  tv::SimpleVector<int, NDim> kernelSize32;
  tv::SimpleVector<int, NDim> stride32;
  tv::SimpleVector<int, NDim> padding32;
  tv::SimpleVector<int, NDim> dilation32;
  // Scratch for the two-phase GPU path; INT_MAX marks unused slots.
  auto indicePairUnique = torch::full(
      {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
      torch::dtype(torch::kInt32).device(indices.device()));
  for (int i = 0; i < NDim; ++i) {
    outSpatialShape32.push_back(outSpatialShape[i]);
    kernelSize32.push_back(kernelSize[i]);
    if (subM) {
      // Sub-manifold: caller-provided stride and padding are ignored.
      stride32.push_back(1);
      padding32.push_back(kernelSize[i] / 2);
      dilation32.push_back(dilation[i]);
    } else {
      stride32.push_back(stride[i]);
      padding32.push_back(padding[i]);
      dilation32.push_back(dilation[i]);
    }
  }
  if (subM) {
    if (indices.device().type() == torch::kCPU) {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose);
    } else {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::TorchGPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose);
    }
    // The input indices are returned unchanged; numActOut is not used here.
    return {indices, indicePairs, indiceNum};
  } else {
    // Candidate output indices: one slot per (active input, kernel offset).
    torch::Tensor outInds =
        torch::zeros({numAct * kernelVolume, coorDim + 1},
                     torch::dtype(torch::kInt32).device(indices.device()));
    if (indices.device().type() == torch::kCPU) {
      auto getIndicePairFtor =
          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
          transpose);
    } else {
      // GPU path runs in two phases: P1, then torch::_unique on the scratch
      // tensor, then P2 with the deduplicated values.
      auto getIndicePairFtorP1 =
          functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, int, int,
                                                 NDim>();
      auto getIndicePairFtorP2 =
          functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, int, int,
                                                 NDim>();
      numActOut = getIndicePairFtorP1(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
          padding32, dilation32, outSpatialShape32, transpose);
      if (numActOut > 0) {
        auto res = torch::_unique(indicePairUnique);
        indicePairUnique = std::get<0>(res);
        numActOut = getIndicePairFtorP2(
            tv::TorchGPU(), tv::torch2tv<const int>(indices),
            tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose);
      }
    }
    // Only the first numActOut rows of outInds are returned.
    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
  }
}

// Builds the indice-pair tables for a sparse convolution, using a grid
// buffer supplied by the caller instead of allocating one.
//
// Template parameter:
//   NDim - number of coordinate dimensions; must equal indices.size(1) - 1.
// Parameters:
//   indices         - int32 index tensor with NDim + 1 columns (the extra
//                     column is presumably the batch index -- TODO confirm).
//   gridOut         - caller-owned int32 buffer with at least
//                     outputVolume * batchSize elements. The CPU paths refill
//                     it with -1 afterwards; the GPU paths pass an extra
//                     `true` to the functors (presumably a "reset grid" flag
//                     -- TODO confirm against the functor declarations).
//   batchSize       - used only in the gridOut size check.
//   outSpatialShape - output extent per dimension (size NDim).
//   spatialShape    - not referenced in this function body.
//   kernelSize, stride, padding, dilation - per-dimension settings (size NDim).
//   outPadding      - only its size is validated here.
//   _subM           - non-zero selects the sub-manifold path, which forces
//                     stride to 1 and padding to kernelSize / 2.
//   _transpose      - non-zero is forwarded as `transpose` to the functors.
// Returns:
//   subM:      {indices, indicePairs, indiceNum}
//   otherwise: {outInds[0:numActOut], indicePairs, indiceNum}
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsBackwardMUSAKernelLauncher(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
  c10::musa::MUSAGuard device_guard(indices.device());
  bool subM = _subM != 0;
  bool transpose = _transpose != 0;
  auto numAct = indices.size(0);
  auto coorDim = indices.size(1) - 1;
  // Every per-dimension argument must match the coordinate dimensionality.
  TV_ASSERT_RT_ERR(NDim == coorDim, "error");
  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
  // Product of all kernel extents.
  auto kernelVolume = kernelSize[0];
  for (int i = 1; i < kernelSize.size(); ++i) {
    kernelVolume *= kernelSize[i];
  }
  TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
  // Product of all output extents.
  auto outputVolume = outSpatialShape[0];
  for (int i = 1; i < outSpatialShape.size(); ++i) {
    outputVolume *= outSpatialShape[i];
  }
  // The caller's grid must be large enough for every batch element.
  TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, "error");
  // Work tensors, both int32 on the device of `indices`:
  //   indicePairs (kernelVolume, 2, numAct) filled with -1
  //   indiceNum   (kernelVolume)            filled with 0
  torch::Tensor indicePairs =
      torch::full({kernelVolume, 2, numAct}, -1,
                  torch::dtype(torch::kInt32).device(indices.device()));
  torch::Tensor indiceNum = torch::zeros(
      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
  int64_t numActOut = -1;
  // 32-bit copies of the per-dimension settings for the functors.
  tv::SimpleVector<int, NDim> outSpatialShape32;
  tv::SimpleVector<int, NDim> kernelSize32;
  tv::SimpleVector<int, NDim> stride32;
  tv::SimpleVector<int, NDim> padding32;
  tv::SimpleVector<int, NDim> dilation32;
  // Scratch for the two-phase GPU path; INT_MAX marks unused slots.
  auto indicePairUnique = torch::full(
      {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
      torch::dtype(torch::kInt32).device(indices.device()));
  for (int i = 0; i < NDim; ++i) {
    outSpatialShape32.push_back(outSpatialShape[i]);
    kernelSize32.push_back(kernelSize[i]);
    if (subM) {
      // Sub-manifold: caller-provided stride and padding are ignored.
      stride32.push_back(1);
      padding32.push_back(kernelSize[i] / 2);
      dilation32.push_back(dilation[i]);
    } else {
      stride32.push_back(stride[i]);
      padding32.push_back(padding[i]);
      dilation32.push_back(dilation[i]);
    }
  }
  if (subM) {
    if (indices.device().type() == torch::kCPU) {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose);
      // Restore the caller's grid to its "empty" state.
      gridOut.fill_(-1);
    } else {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::TorchGPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose, true);
    }
    // The input indices are returned unchanged; numActOut is not used here.
    return {indices, indicePairs, indiceNum};
  } else {
    // Candidate output indices: one slot per (active input, kernel offset).
    torch::Tensor outInds =
        torch::zeros({numAct * kernelVolume, coorDim + 1},
                     torch::dtype(torch::kInt32).device(indices.device()));
    if (indices.device().type() == torch::kCPU) {
      auto getIndicePairFtor =
          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
          transpose, true);
      // Restore the caller's grid to its "empty" state.
      gridOut.fill_(-1);
    } else {
      // GPU path runs in two phases: P1, then torch::_unique on the scratch
      // tensor, then P2 with the deduplicated values.
      auto getIndicePairFtorP1 =
          functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, int, int,
                                                 NDim>();
      auto getIndicePairFtorP2 =
          functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, int, int,
                                                 NDim>();
      numActOut = getIndicePairFtorP1(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
          padding32, dilation32, outSpatialShape32, transpose);
      if (numActOut > 0) {
        auto res = torch::_unique(indicePairUnique);
        indicePairUnique = std::get<0>(res);
        numActOut = getIndicePairFtorP2(
            tv::TorchGPU(), tv::torch2tv<const int>(indices),
            tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
            true);
      }
    }
    // Only the first numActOut rows of outInds are returned.
    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
  }
}

// Forward pass of the indice (sparse) convolution.
//
// For every kernel offset with at least one active pair, the input rows are
// gathered into a scratch buffer, multiplied by that offset's filter slice
// and scatter-added into the output.
//
// Parameters:
//   features    - input features; features.size(1) is the input plane count.
//   filters     - filter tensor; its last dimension is the output plane
//                 count. It is viewed as (-1, numInPlanes, numOutPlanes).
//   indicePairs - int32 pair table; indicePairs.size(0) is the kernel volume.
//   indiceNum   - int32 number of valid pairs per kernel offset.
//   numActOut   - number of output rows to allocate.
//   _inverse    - non-zero swaps which half of each pair is used for the
//                 gather and which for the scatter.
//   _subM       - non-zero enables the sub-manifold shortcut: the offset
//                 with the most pairs is applied as one dense matmul and
//                 skipped in the loop.
// Returns:
//   A (numActOut, numOutPlanes) tensor with the dtype/device of `features`.
torch::Tensor IndiceConvForwardMUSAKernelLauncher(
    torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,
    torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse,
    int64_t _subM) {
  c10::musa::MUSAGuard device_guard(features.device());
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
  auto device = features.device().type();
  auto ndim = filters.dim() - 2;
  auto kernelVolume = indicePairs.size(0);
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  // Pair counts are read on the host to size the scratch buffers and to
  // drive the per-offset loop; fetch the pointer once.
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  const int *indicePairNumPtr = indicePairNumCpu.data_ptr<int>();
  auto indicePairMaxSizeIter =
      std::max_element(indicePairNumPtr, indicePairNumPtr + kernelVolume);
  int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumPtr;
  int indicePairMaxSize = *indicePairMaxSizeIter;

  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());

  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
  // Scratch buffers sized for the busiest kernel offset.
  torch::Tensor inputBuffer =
      torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
  if (subM) {
    // Sub-manifold shortcut: apply the busiest offset directly.
    torch::mm_out(output, features, filters[indicePairMaxOffset]);
  }
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumPtr[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }

    AT_DISPATCH_FLOATING_TYPES(
        features.scalar_type(), "IndiceConvForwardKernel", [&] {
          // Views over the first nHot rows of the scratch buffers.
          auto outputBufferBlob = torch::from_blob(
              outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);
          auto inputBufferBlob = torch::from_blob(
              inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);

          if (device == torch::kCPU) {
            functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;
            gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
                       tv::torch2tv<const scalar_t>(features),
                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                       nHot);
          } else {
            functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
                gatherFtor;
            gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
                       tv::torch2tv<const scalar_t>(features),
                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                       nHot);
            TV_CHECK_MUSA_ERR();
            // NOTE: a torch::index_select_out based gather was tried and
            // found slower than SparseGatherFunctor, possibly due to the
            // int->long index conversion it requires.
          }
          torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);

          if (device == torch::kCPU) {
            functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>
                scatterFtor;
            scatterFtor(
                tv::CPU(), tv::torch2tv<scalar_t>(output),
                tv::torch2tv<const scalar_t>(outputBuffer),
                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
                true);
          } else {
            functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
                scatterFtor;
            scatterFtor(
                tv::TorchGPU(), tv::torch2tv<scalar_t>(output),
                tv::torch2tv<const scalar_t>(outputBuffer),
                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
                true);
            TV_CHECK_MUSA_ERR();
          }
        });
  }
  return output;
}

// Backward pass of the indice (sparse) convolution.
//
// For each kernel offset with active pairs, the matching rows of `features`
// and `outGrad` are gathered into scratch buffers; the filter gradient for
// that offset is written with one matmul, the input-gradient rows are
// computed with a second matmul and scatter-added into the result.
//
// Parameters:
//   features    - forward input; features.size(1) is the input plane count.
//   filters     - filter tensor; its last dimension is the output plane count.
//   outGrad     - gradient with respect to the forward output.
//   indicePairs - int32 pair table; indicePairs.size(0) is the kernel volume.
//   indiceNum   - int32 number of valid pairs per kernel offset.
//   _inverse    - non-zero swaps the roles of the two halves of each pair.
//   _subM       - non-zero enables the sub-manifold shortcut for the offset
//                 holding the most pairs.
// Returns:
//   {inputGrad, filtersGrad}, shaped like `features` and `filters`.
std::vector<torch::Tensor> IndiceConvBackwardMUSAKernelLauncher(
    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
    int64_t _subM) {
  c10::musa::MUSAGuard device_guard(features.device());
  bool isSubM = _subM != 0;
  bool useInverse = _inverse != 0;

  auto devType = features.device().type();
  auto kernelVolume = indicePairs.size(0);
  auto inPlanes = features.size(1);
  // Last filter dimension holds the output plane count.
  auto outPlanes = filters.size(filters.dim() - 1);

  // Host copy of the per-offset pair counts.
  auto pairCountsCpu = indiceNum.to({torch::kCPU});
  int *pairCounts = pairCountsCpu.data_ptr<int>();
  auto busiest = std::max_element(pairCounts, pairCounts + kernelVolume);
  int busiestOffset = busiest - pairCounts;
  int busiestCount = *busiest;

  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  auto filterShape = filters.sizes();

  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
  // Scratch buffers sized for the busiest kernel offset.
  torch::Tensor inputBuffer = torch::zeros({busiestCount, inPlanes}, options);
  torch::Tensor outputBuffer = torch::zeros({busiestCount, outPlanes}, options);

  filters = filters.view({-1, inPlanes, outPlanes});
  filtersGrad = filtersGrad.view({-1, inPlanes, outPlanes});

  if (isSubM) {
    // Sub-manifold shortcut: the busiest offset is handled densely.
    auto busiestFilterGrad = filtersGrad[busiestOffset];
    torch::mm_out(busiestFilterGrad, features.t(), outGrad);
    torch::mm_out(inputGrad, outGrad, filters[busiestOffset].t());
  }

  for (int k = 0; k < kernelVolume; ++k) {
    auto nHot = pairCounts[k];
    if (nHot <= 0) {
      continue;
    }
    if (isSubM && k == busiestOffset) {
      // Already handled by the dense shortcut above.
      continue;
    }

    AT_DISPATCH_FLOATING_TYPES(
        features.scalar_type(), "IndiceConvBackwardKernel", [&] {
          // Step 1: gather feature rows and output-gradient rows.
          if (devType != torch::kCPU) {
            functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
                gatherIn;
            functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
                gatherOut;
            gatherIn(
                tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
                tv::torch2tv<const scalar_t>(features),
                tv::torch2tv<const int>(indicePairs).subview(k, useInverse),
                nHot);
            TV_CHECK_MUSA_ERR();
            gatherOut(
                tv::TorchGPU(), tv::torch2tv<scalar_t>(outputBuffer),
                tv::torch2tv<const scalar_t>(outGrad),
                tv::torch2tv<const int>(indicePairs).subview(k, !useInverse),
                nHot);
            TV_CHECK_MUSA_ERR();
          } else {
            functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherIn;
            functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherOut;
            gatherIn(
                tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
                tv::torch2tv<const scalar_t>(features),
                tv::torch2tv<const int>(indicePairs).subview(k, useInverse),
                nHot);
            gatherOut(
                tv::CPU(), tv::torch2tv<scalar_t>(outputBuffer),
                tv::torch2tv<const scalar_t>(outGrad),
                tv::torch2tv<const int>(indicePairs).subview(k, !useInverse),
                nHot);
          }

          // Step 2: matmuls on views over the first nHot scratch rows.
          auto offsetFilterGrad = filtersGrad[k];
          auto outRows = torch::from_blob(outputBuffer.data_ptr<scalar_t>(),
                                          {nHot, outPlanes}, options);
          auto inRows = torch::from_blob(inputBuffer.data_ptr<scalar_t>(),
                                         {nHot, inPlanes}, options);
          torch::mm_out(offsetFilterGrad, inRows.t(), outRows);
          // inRows is overwritten with the input-gradient rows.
          torch::mm_out(inRows, outRows, filters[k].t());

          // Step 3: scatter-add the input-gradient rows.
          if (devType != torch::kCPU) {
            functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
                scatterIn;
            scatterIn(
                tv::TorchGPU(), tv::torch2tv<scalar_t>(inputGrad),
                tv::torch2tv<const scalar_t>(inputBuffer),
                tv::torch2tv<const int>(indicePairs).subview(k, useInverse),
                nHot);
            TV_CHECK_MUSA_ERR();
          } else {
            functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int> scatterIn;
            scatterIn(
                tv::CPU(), tv::torch2tv<scalar_t>(inputGrad),
                tv::torch2tv<const scalar_t>(inputBuffer),
                tv::torch2tv<const int>(indicePairs).subview(k, useInverse),
                nHot);
          }
        });
  }
  return {inputGrad, filtersGrad.view(filterShape)};
}

// Explicit instantiations of the forward launcher for NDim = 2, 3 and 4, so
// the template definitions above can stay in this translation unit.
template std::vector<torch::Tensor> GetIndicePairsForwardMUSAKernelLauncher<2>(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

template std::vector<torch::Tensor> GetIndicePairsForwardMUSAKernelLauncher<3>(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

template std::vector<torch::Tensor> GetIndicePairsForwardMUSAKernelLauncher<4>(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

// Explicit instantiations of the backward launcher for NDim = 2 and 3.
// NOTE(review): unlike the forward launcher there is no <4> instantiation
// here -- confirm that no caller needs the 4-D backward variant.
template std::vector<torch::Tensor> GetIndicePairsBackwardMUSAKernelLauncher<2>(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

template std::vector<torch::Tensor> GetIndicePairsBackwardMUSAKernelLauncher<3>(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);


================================================
FILE: mmcv/ops/csrc/pytorch/musa/stack_ball_query_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "pytorch_musa_helper.hpp"
#include "stack_ball_query_musa_kernel.muh"
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

void StackBallQueryForwardMUSAKernelLauncher(float max_radius, int nsample,
                                             const Tensor new_xyz,
                                             const Tensor new_xyz_batch_cnt,
                                             const Tensor xyz,
                                             const Tensor xyz_batch_cnt,
                                             Tensor idx) {
  c10::musa::MUSAGuard device_guard(new_xyz.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  //   const float *new_xyz_ptr = new_xyz.data_ptr<float>();
  //   const float *xyz_ptr = xyz.data_ptr<float>();
  //   const int *new_xyz_batch_cnt_ptr = new_xyz_batch_cnt.data_ptr<int>();
  //   const int *xyz_batch_cnt_ptr = xyz_batch_cnt.data_ptr<int>();
  //   int *idx_ptr = idx.data_ptr<int>();

  int B = xyz_batch_cnt.size(0);
  int M = new_xyz.size(0);

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(DIVUP(M, THREADS_PER_BLOCK));
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      new_xyz.scalar_type(), "stack_ball_query_forward_musa_kernel", [&] {
        stack_ball_query_forward_musa_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                B, M, max_radius, nsample, new_xyz.data_ptr<scalar_t>(),
                new_xyz_batch_cnt.data_ptr<int>(), xyz.data_ptr<scalar_t>(),
                xyz_batch_cnt.data_ptr<int>(), idx.data_ptr<int>());
      });

  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/stack_group_points_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu
#include <stdio.h>
#include <stdlib.h>

#include "pytorch_musa_helper.hpp"
#include "stack_group_points_musa_kernel.muh"

void StackGroupPointsForwardMUSAKernelLauncher(
    int b, int c, int m, int nsample, const Tensor features_tensor,
    const Tensor features_batch_cnt_tensor, const Tensor idx_tensor,
    const Tensor idx_batch_cnt_tensor, Tensor out_tensor) {
  // points: (B, C, N)
  // idx: (B, npoints, nsample)
  // output:
  //      out: (B, C, npoints, nsample)
  c10::musa::MUSAGuard device_guard(features_tensor.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  dim3 blocks(DIVUP(m * c * nsample, THREADS_PER_BLOCK));
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      features_tensor.scalar_type(), "stack_group_points_forward_musa_kernel",
      [&] {
        stack_group_points_forward_musa_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                b, c, m, nsample, features_tensor.data_ptr<scalar_t>(),
                features_batch_cnt_tensor.data_ptr<int>(),
                idx_tensor.data_ptr<int>(),
                idx_batch_cnt_tensor.data_ptr<int>(),
                out_tensor.data_ptr<scalar_t>());
      });

  AT_MUSA_CHECK(musaGetLastError());
}

void StackGroupPointsBackwardMUSAKernelLauncher(
    int b, int c, int m, int n, int nsample, const Tensor grad_out_tensor,
    const Tensor idx_tensor, const Tensor idx_batch_cnt_tensor,
    const Tensor features_batch_cnt_tensor, Tensor grad_features_tensor) {
  c10::musa::MUSAGuard device_guard(grad_features_tensor.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  dim3 blocks(DIVUP(m * c * nsample, THREADS_PER_BLOCK));
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_features_tensor.scalar_type(),
      "stack_group_points_backward_musa_kernel", [&] {
        stack_group_points_backward_musa_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                b, c, m, n, nsample, grad_out_tensor.data_ptr<scalar_t>(),
                idx_tensor.data_ptr<int>(),
                idx_batch_cnt_tensor.data_ptr<int>(),
                features_batch_cnt_tensor.data_ptr<int>(),
                grad_features_tensor.data_ptr<scalar_t>());
      });

  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/sync_bn_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_musa_helper.hpp"
#include "sync_bn_musa_kernel.muh"

void SyncBNForwardMeanMUSAKernelLauncher(const Tensor input, Tensor mean) {
  int num = input.size(0);
  int channels = input.size(1);
  int spatial = input.size(2);

  c10::musa::MUSAGuard device_guard(input.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "sync_bn_forward_mean_musa_kernel", [&] {
        sync_bn_forward_mean_musa_kernel<scalar_t>
            <<<channels, THREADS_PER_BLOCK, 0, stream>>>(
                input.data_ptr<scalar_t>(), mean.data_ptr<float>(), num,
                channels, spatial);
      });
  AT_MUSA_CHECK(musaGetLastError());
}

// Launches sync_bn_forward_var_musa_kernel with one block per channel.
//
// The first three dimensions of `input` are passed to the kernel as
// (num, channels, spatial). `mean` and `var` are always accessed as float
// buffers, independent of the dispatched input type; the kernel writes its
// result through `var`.
void SyncBNForwardVarMUSAKernelLauncher(const Tensor input, const Tensor mean,
                                        Tensor var) {
  int num = input.size(0);
  int channels = input.size(1);
  int spatial = input.size(2);

  c10::musa::MUSAGuard device_guard(input.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  // The dispatch label appears in the error raised for unsupported dtypes, so
  // it must name the kernel that is actually launched here.
  AT_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "sync_bn_forward_var_musa_kernel", [&] {
        sync_bn_forward_var_musa_kernel<scalar_t>
            <<<channels, THREADS_PER_BLOCK, 0, stream>>>(
                input.data_ptr<scalar_t>(), mean.data_ptr<float>(),
                var.data_ptr<float>(), num, channels, spatial);
      });
  // Surface launch-time errors as an exception.
  AT_MUSA_CHECK(musaGetLastError());
}

// Launches sync_bn_forward_output_musa_kernel with one block per channel.
//
// The first three dimensions of `input` are passed to the kernel as
// (num, channels, spatial). Every statistics/affine tensor (mean, var,
// running_mean, running_var, weight, bias, norm, std) is accessed as a float
// buffer; only `input` and `output` use the dispatched scalar type.
// `eps`, `momentum` and `group_size` are forwarded to the kernel unchanged.
void SyncBNForwardOutputMUSAKernelLauncher(
    const Tensor input, const Tensor mean, const Tensor var,
    Tensor running_mean, Tensor running_var, const Tensor weight,
    const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps,
    float momentum, int group_size) {
  int num = input.size(0);
  int channels = input.size(1);
  int spatial = input.size(2);

  c10::musa::MUSAGuard device_guard(input.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  // The dispatch label appears in the error raised for unsupported dtypes, so
  // it must name the kernel that is actually launched here.
  AT_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "sync_bn_forward_output_musa_kernel", [&] {
        sync_bn_forward_output_musa_kernel<scalar_t>
            <<<channels, THREADS_PER_BLOCK, 0, stream>>>(
                input.data_ptr<scalar_t>(), mean.data_ptr<float>(),
                var.data_ptr<float>(), running_mean.data_ptr<float>(),
                running_var.data_ptr<float>(), weight.data_ptr<float>(),
                bias.data_ptr<float>(), norm.data_ptr<float>(),
                std.data_ptr<float>(), output.data_ptr<scalar_t>(), num,
                channels, spatial, eps, momentum, group_size);
      });
  // Surface launch-time errors as an exception.
  AT_MUSA_CHECK(musaGetLastError());
}

void SyncBNBackwardParamMUSAKernelLauncher(const Tensor grad_output,
                                           const Tensor norm,
                                           Tensor grad_weight,
                                           Tensor grad_bias) {
  int num = grad_output.size(0);
  int channels = grad_output.size(1);
  int spatial = grad_output.size(2);

  c10::musa::MUSAGuard device_guard(grad_output.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES(
      grad_output.scalar_type(), "sync_bn_backward_param_musa_kernel", [&] {
        sync_bn_backward_param_musa_kernel<scalar_t>
            <<<channels, THREADS_PER_BLOCK, 0, stream>>>(
                grad_output.data_ptr<scalar_t>(), norm.data_ptr<float>(),
                grad_weight.data_ptr<float>(), grad_bias.data_ptr<float>(), num,
                channels, spatial);
      });
  AT_MUSA_CHECK(musaGetLastError());
}

void SyncBNBackwardDataMUSAKernelLauncher(const Tensor grad_output,
                                          const Tensor weight,
                                          const Tensor grad_weight,
                                          const Tensor grad_bias,
                                          const Tensor norm, const Tensor std,
                                          Tensor grad_input) {
  int output_size = grad_input.numel();
  int num = grad_input.size(0);
  int channels = grad_input.size(1);
  int spatial = grad_input.size(2);

  c10::musa::MUSAGuard device_guard(grad_input.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES(
      grad_output.scalar_type(), "sync_bn_backward_data_musa_kernel", [&] {
        sync_bn_backward_data_musa_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, grad_output.data_ptr<scalar_t>(),
                weight.data_ptr<float>(), grad_weight.data_ptr<float>(),
                grad_bias.data_ptr<float>(), norm.data_ptr<float>(),
                std.data_ptr<float>(), grad_input.data_ptr<scalar_t>(), num,
                channels, spatial);
      });
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/three_interpolate_musa.mu
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate_gpu.cu

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "pytorch_musa_helper.hpp"
#include "three_interpolate_musa_kernel.muh"

void ThreeInterpolateForwardMUSAKernelLauncher(int b, int c, int m, int n,
                                               const Tensor points,
                                               const Tensor idx,
                                               const Tensor weight,
                                               Tensor out) {
  // points: (B, C, M)
  // idx: (B, N, 3)
  // weight: (B, N, 3)
  // output:
  //      out: (B, C, N)

  c10::musa::MUSAGuard device_guard(points.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), c, b);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      points.scalar_type(), "three_interpolate_forward_musa_kernel", [&] {
        three_interpolate_forward_musa_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                b, c, m, n, points.data_ptr<scalar_t>(), idx.data_ptr<int>(),
                weight.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
      });

  AT_MUSA_CHECK(musaGetLastError());
}

void ThreeInterpolateBackwardMUSAKernelLauncher(int b, int c, int n, int m,
                                                const Tensor grad_out,
                                                const Tensor idx,
                                                const Tensor weight,
                                                Tensor grad_points) {
  // grad_out: (B, C, N)
  // weight: (B, N, 3)
  // output:
  //      grad_points: (B, C, M)

  c10::musa::MUSAGuard device_guard(grad_out.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), c, b);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_out.scalar_type(), "three_interpolate_backward_musa_kernel", [&] {
        three_interpolate_backward_musa_kernel<scalar_t>
            <<<blocks, threads, 0, stream>>>(
                b, c, n, m, grad_out.data_ptr<scalar_t>(), idx.data_ptr<int>(),
                weight.data_ptr<scalar_t>(), grad_points.data_ptr<scalar_t>());
      });

  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/three_nn_musa.mu
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate_gpu.cu

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "pytorch_musa_helper.hpp"
#include "three_nn_musa_kernel.muh"

void ThreeNNForwardMUSAKernelLauncher(int b, int n, int m, const Tensor unknown,
                                      const Tensor known, Tensor dist2,
                                      Tensor idx) {
  // unknown: (B, N, 3)
  // known: (B, M, 3)
  // output:
  //      dist2: (B, N, 3)
  //      idx: (B, N, 3)

  c10::musa::MUSAGuard device_guard(unknown.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  // blockIdx.x(col), blockIdx.y(row)
  dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), b);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES(
      unknown.scalar_type(), "three_nn_forward_musa_kernel", [&] {
        three_nn_forward_musa_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
            b, n, m, unknown.data_ptr<scalar_t>(), known.data_ptr<scalar_t>(),
            dist2.data_ptr<scalar_t>(), idx.data_ptr<int>());
      });

  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/tin_shift_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_musa_helper.hpp"
#include "pytorch_device_registry.hpp"
#include "tin_shift_musa_kernel.muh"

// Host-side launcher for tin_shift_forward_musa_kernel.
//
// Sizes are read from `input` dimensions 0..3 as
// (batch_size, t_size, channels, hw_size) and from `shift` dimension 1 as
// group_size. The grid is sized from batch_size * hw_size * channels, while
// the element count handed to the kernel is output.numel(). `shift` is
// accessed as an int buffer.
void TINShiftForwardMUSAKernelLauncher(Tensor input, Tensor shift,
                                       Tensor output) {
  int total_out = output.numel();
  int n_batch = input.size(0);
  int n_frames = input.size(1);
  int n_channels = input.size(2);
  int n_hw = input.size(3);
  int n_groups = shift.size(1);
  int channels_per_group = n_channels / n_groups;
  int launch_elems = n_batch * n_hw * n_channels;

  c10::musa::MUSAGuard device_guard(input.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "tin_shift_forward_musa_kernel", [&] {
        scalar_t *input_ptr = input.data_ptr<scalar_t>();
        int *shift_ptr = shift.data_ptr<int>();
        scalar_t *output_ptr = output.data_ptr<scalar_t>();

        tin_shift_forward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(launch_elems), THREADS_PER_BLOCK, 0, stream>>>(
                total_out, input_ptr, shift_ptr, output_ptr, n_batch,
                n_channels, n_frames, n_hw, n_groups, channels_per_group);
      });

  // Surface launch-time errors as an exception.
  AT_MUSA_CHECK(musaGetLastError());
}

// Host-side launcher for tin_shift_backward_musa_kernel.
//
// Sizes are read from `grad_output` dimensions 0..3 as
// (batch_size, t_size, channels, hw_size) and from `shift` dimension 1 as
// group_size. The grid is sized from batch_size * hw_size * channels, while
// the element count handed to the kernel is grad_output.numel(). `shift` is
// accessed as an int buffer; the kernel writes through `grad_input`.
void TINShiftBackwardMUSAKernelLauncher(Tensor grad_output, Tensor shift,
                                        Tensor grad_input) {
  int total_out = grad_output.numel();
  int n_batch = grad_output.size(0);
  int n_frames = grad_output.size(1);
  int n_channels = grad_output.size(2);
  int n_hw = grad_output.size(3);
  int n_groups = shift.size(1);
  int channels_per_group = n_channels / n_groups;
  int launch_elems = n_batch * n_hw * n_channels;

  c10::musa::MUSAGuard device_guard(grad_output.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "tin_shift_backward_musa_kernel", [&] {
        scalar_t *grad_output_ptr = grad_output.data_ptr<scalar_t>();
        int *shift_ptr = shift.data_ptr<int>();
        scalar_t *grad_input_ptr = grad_input.data_ptr<scalar_t>();

        tin_shift_backward_musa_kernel<scalar_t>
            <<<GET_BLOCKS(launch_elems), THREADS_PER_BLOCK, 0, stream>>>(
                total_out, grad_output_ptr, shift_ptr, grad_input_ptr, n_batch,
                n_channels, n_frames, n_hw, n_groups, channels_per_group);
      });

  // Surface launch-time errors as an exception.
  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/musa/upfirdn2d_kernel.mu
================================================
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include <c10/util/Half.h>
#include <torch/types.h>

#include "pytorch_musa_helper.hpp"
#if MUSA_ARCH > 21
// Argument bundle for the upfirdn2d kernels. Both kernels in this file take
// one instance of this struct by value as their only parameter.
struct upfirdn2d_kernel_params {
  // Input buffer; the kernels cast it to `const T *` and index it via inStride.
  const void *x;
  // Filter taps; indexed via filterStride.
  const float *f;
  // Output buffer; the kernels cast it to `T *` and index it via outStride.
  void *y;

  // Up/down-sampling factors and leading padding per axis. The kernels map an
  // output coordinate to the upsampled grid as `out * down + up - 1 - pad0`.
  int2 up;
  int2 down;
  int2 pad0;
  // Non-zero reverses the direction in which the filter is traversed.
  int flip;
  // Scalar multiplied into every output value just before it is stored.
  float gain;

  int4 inSize;  // [width, height, channel, batch]
  int4 inStride;
  int2 filterSize;  // [width, height]
  int2 filterStride;
  int4 outSize;  // [width, height, channel, batch]
  int4 outStride;
  // The kernels form a flat index `major * sizeMinor + minor` and split it
  // into (batch, channel) by dividing by inSize.z; sizeMinor and sizeMajor
  // bound the two parts.
  int sizeMinor;
  int sizeMajor;

  // Upper bounds on the kernels' loops over minor, major and X respectively.
  int loopMinor;
  int loopMajor;
  int loopX;
  // Divisor used to split the x block/thread index into a row and a minor
  // index.
  int launchMinor;
  // Not read by the kernels in this file.
  // NOTE(review): presumably used by the host when sizing the grid -- confirm.
  int launchMajor;
};

//------------------------------------------------------------------------
// MUSA kernel specialization.

// Result of kernel selection: which kernel instantiation to launch and the
// tiling constants that go with it. choose_upfirdn2d_kernel fills it by
// positional brace-initialisation, so the field order matters.
struct upfirdn2d_kernel_spec {
  // Type-erased pointer to the selected __global__ kernel instantiation.
  void *kernel;
  // Output tile size; -1 for the generic large-filter kernel, otherwise equal
  // to the tileOutW / tileOutH template arguments of the small kernel.
  int tileOutW;
  int tileOutH;
  // Equal to the loopMinor template argument when a small kernel is selected.
  int loopMinor;
  // NOTE(review): presumably copied into upfirdn2d_kernel_params::loopX by the
  // host launcher -- confirm against the caller.
  int loopX;
};

//------------------------------------------------------------------------
// MUSA kernel selection.

template <class T>
upfirdn2d_kernel_spec choose_upfirdn2d_kernel(const upfirdn2d_kernel_params &p);
//------------------------------------------------------------------------

// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

//------------------------------------------------------------------------
// Helpers.

// Maps a storage type T to the type the kernels use for arithmetic:
// double and float compute in their own type, c10::Half computes in float.
// Only these three specializations exist; the primary template is left
// undefined.
template <class T>
struct InternalType;

template <>
struct InternalType<double> {
  using scalar_t = double;
};

template <>
struct InternalType<float> {
  using scalar_t = float;
};

template <>
struct InternalType<c10::Half> {
  using scalar_t = float;
};

// Branch-free integer division intended to round toward negative infinity
// (as the name suggests) instead of toward zero: the dividend is shifted by a
// whole multiple of b before the truncating division, and the same multiple
// is subtracted from the quotient afterwards.
static __device__ __forceinline__ int floor_div(int a, int b) {
  const int shift = 1 - a / b;
  const int shifted_quotient = (a + shift * b) / b;
  return shifted_quotient - shift;
}

//------------------------------------------------------------------------
// Generic MUSA implementation for large filters.

// Generic upfirdn2d kernel. It is the default choice in
// choose_upfirdn2d_kernel and stays selected when no specialized small-filter
// instantiation matches. Each thread computes output values one at a time by
// walking the contributing input window and filter taps directly through the
// pointers in `p`, without shared memory.
template <class T>
static __global__ void upfirdn2d_kernel_large(upfirdn2d_kernel_params p) {
  // Accumulation type (float when T is c10::Half; see InternalType).
  typedef typename InternalType<T>::scalar_t scalar_t;

  // Calculate thread index.
  // The x thread index encodes both the output row and the starting minor
  // index: row = index / launchMinor, minor = index % launchMinor.
  int minorBase = blockIdx.x * blockDim.x + threadIdx.x;
  int outY = minorBase / p.launchMinor;
  minorBase -= outY * p.launchMinor;
  int outXBase = blockIdx.y * p.loopX * blockDim.y + threadIdx.y;
  int majorBase = blockIdx.z * p.loopMajor;
  // Bitwise `|` (not `||`): all three comparisons are evaluated without
  // short-circuiting. The same pattern is used in the loop conditions below.
  if (outXBase >= p.outSize.x | outY >= p.outSize.y | majorBase >= p.sizeMajor)
    return;

  // Setup Y receptive field.
  // inY is the first contributing input row, clamped to [0, inSize.y]; h is
  // the number of contributing rows after clamping the end the same way.
  int midY = outY * p.down.y + p.up.y - 1 - p.pad0.y;
  int inY = min(max(floor_div(midY, p.up.y), 0), p.inSize.y);
  int h =
      min(max(floor_div(midY + p.filterSize.y, p.up.y), 0), p.inSize.y) - inY;
  int filterY = midY + p.filterSize.y - (inY + 1) * p.up.y;
  if (p.flip) filterY = p.filterSize.y - 1 - filterY;

  // Loop over major, minor, and X.
  for (int majorIdx = 0, major = majorBase;
       majorIdx < p.loopMajor & major < p.sizeMajor; majorIdx++, major++)
    for (int minorIdx = 0, minor = minorBase;
         minorIdx < p.loopMinor & minor < p.sizeMinor;
         minorIdx++, minor += p.launchMinor) {
      // Split the flat (major, minor) index into batch n and channel c.
      int nc = major * p.sizeMinor + minor;
      int n = nc / p.inSize.z;
      int c = nc - n * p.inSize.z;
      for (int loopX = 0, outX = outXBase; loopX < p.loopX & outX < p.outSize.x;
           loopX++, outX += blockDim.y) {
        // Setup X receptive field.
        // Same construction as for Y: inX is the clamped first column, w the
        // number of contributing columns.
        int midX = outX * p.down.x + p.up.x - 1 - p.pad0.x;
        int inX = min(max(floor_div(midX, p.up.x), 0), p.inSize.x);
        int w =
            min(max(floor_div(midX + p.filterSize.x, p.up.x), 0), p.inSize.x) -
            inX;
        int filterX = midX + p.filterSize.x - (inX + 1) * p.up.x;
        if (p.flip) filterX = p.filterSize.x - 1 - filterX;

        // Initialize pointers.
        const T *xp =
            &((const T *)p.x)[inX * p.inStride.x + inY * p.inStride.y +
                              c * p.inStride.z + n * p.inStride.w];
        const float *fp =
            &p.f[filterX * p.filterStride.x + filterY * p.filterStride.y];
        // The filter pointer advances by `up` taps per input sample; the sign
        // of the step depends on p.flip.
        int filterStepX = ((p.flip) ? p.up.x : -p.up.x) * p.filterStride.x;
        int filterStepY = ((p.flip) ? p.up.y : -p.up.y) * p.filterStride.y;

        // Inner loop.
        // Accumulate the h x w window; after each row both pointers are
        // rewound along X and advanced one step along Y.
        scalar_t v = 0;
        for (int y = 0; y < h; y++) {
          for (int x = 0; x < w; x++) {
            v += (scalar_t)(*xp) * (scalar_t)(*fp);
            xp += p.inStride.x;
            fp += filterStepX;
          }
          xp += p.inStride.y - w * p.inStride.x;
          fp += filterStepY - w * filterStepX;
        }

        // Store result.
        v *= p.gain;
        ((T *)p.y)[outX * p.outStride.x + outY * p.outStride.y +
                   c * p.outStride.z + n * p.outStride.w] = (T)v;
      }
    }
}

//------------------------------------------------------------------------
// Specialized MUSA implementation for small filters.

// Specialized upfirdn2d kernel with compile-time up/down factors, filter
// extent, output tile size and minor-loop count. Each block stages the filter
// and one input tile in shared memory and then computes a
// tileOutH x tileOutW x loopMinor block of outputs from those staged copies.
template <class T, int upx, int upy, int downx, int downy, int filterW,
          int filterH, int tileOutW, int tileOutH, int loopMinor>
static __global__ void upfirdn2d_kernel_small(upfirdn2d_kernel_params p) {
  // Accumulation type (float when T is c10::Half; see InternalType).
  typedef typename InternalType<T>::scalar_t scalar_t;
  // Compile-time extent of the input tile, derived from the template
  // arguments.
  const int tileInW = ((tileOutW - 1) * downx + filterW - 1) / upx + 1;
  const int tileInH = ((tileOutH - 1) * downy + filterH - 1) / upy + 1;
  // Shared staging buffers: sf holds the filter, sx holds the input tile for
  // loopMinor consecutive channels.
  __shared__ volatile scalar_t sf[filterH][filterW];
  __shared__ volatile scalar_t sx[tileInH][tileInW][loopMinor];

  // Calculate tile index.
  // blockIdx.x encodes both the tile row and the minor group:
  // row = index / launchMinor, minor group = index % launchMinor.
  int minorBase = blockIdx.x;
  int tileOutY = minorBase / p.launchMinor;
  minorBase -= tileOutY * p.launchMinor;
  minorBase *= loopMinor;
  tileOutY *= tileOutH;
  int tileOutXBase = blockIdx.y * p.loopX * tileOutW;
  int majorBase = blockIdx.z * p.loopMajor;
  // Bitwise `|` / `&` (not `||` / `&&`) are used for the conditions in this
  // kernel: every comparison is evaluated without short-circuiting.
  if (tileOutXBase >= p.outSize.x | tileOutY >= p.outSize.y |
      majorBase >= p.sizeMajor)
    return;

  // Load filter (flipped).
  // Taps outside the runtime filter size are stored as zero, so the
  // compile-time filterH x filterW extent may exceed the actual filter.
  for (int tapIdx = threadIdx.x; tapIdx < filterH * filterW;
       tapIdx += blockDim.x) {
    int fy = tapIdx / filterW;
    int fx = tapIdx - fy * filterW;
    scalar_t v = 0;
    if (fx < p.filterSize.x & fy < p.filterSize.y) {
      int ffx = (p.flip) ? fx : p.filterSize.x - 1 - fx;
      int ffy = (p.flip) ? fy : p.filterSize.y - 1 - fy;
      v = (scalar_t)p.f[ffx * p.filterStride.x + ffy * p.filterStride.y];
    }
    sf[fy][fx] = v;
  }

  // Loop over major and X.
  for (int majorIdx = 0, major = majorBase;
       majorIdx < p.loopMajor & major < p.sizeMajor; majorIdx++, major++) {
    // Split the flat index into batch n and the first channel of this group.
    int baseNC = major * p.sizeMinor + minorBase;
    int n = baseNC / p.inSize.z;
    int baseC = baseNC - n * p.inSize.z;
    for (int loopX = 0, tileOutX = tileOutXBase;
         loopX < p.loopX & tileOutX < p.outSize.x;
         loopX++, tileOutX += tileOutW) {
      // Load input pixels.
      int tileMidX = tileOutX * downx + upx - 1 - p.pad0.x;
      int tileMidY = tileOutY * downy + upy - 1 - p.pad0.y;
      int tileInX = floor_div(tileMidX, upx);
      int tileInY = floor_div(tileMidY, upy);
      // Barrier before sx is (re)written by the load loop below.
      __syncthreads();
      for (int inIdx = threadIdx.x; inIdx < tileInH * tileInW * loopMinor;
           inIdx += blockDim.x) {
        // Decompose the flat index into (relInY, relInX, relC), channel
        // fastest.
        int relC = inIdx;
        int relInX = relC / loopMinor;
        int relInY = relInX / tileInW;
        relC -= relInX * loopMinor;
        relInX -= relInY * tileInW;
        int c = baseC + relC;
        int inX = tileInX + relInX;
        int inY = tileInY + relInY;
        // Positions outside the input (or channels past inSize.z) are staged
        // as zero.
        scalar_t v = 0;
        if (inX >= 0 & inY >= 0 & inX < p.inSize.x & inY < p.inSize.y &
            c < p.inSize.z)
          v = (scalar_t)(
              (const T *)p.x)[inX * p.inStride.x + inY * p.inStride.y +
                              c * p.inStride.z + n * p.inStride.w];
        sx[relInY][relInX][relC] = v;
      }

      // Loop over output pixels.
      // Barrier between writing sx above and reading it below.
      __syncthreads();
      for (int outIdx = threadIdx.x; outIdx < tileOutH * tileOutW * loopMinor;
           outIdx += blockDim.x) {
        // Decompose the flat index into (relOutY, relOutX, relC), channel
        // fastest.
        int relC = outIdx;
        int relOutX = relC / loopMinor;
        int relOutY = relOutX / tileOutW;
        relC -= relOutX * loopMinor;
        relOutX -= relOutY * tileOutW;
        int c = baseC + relC;
        int outX = tileOutX + relOutX;
        int outY = tileOutY + relOutY;

        // Setup receptive field.
        int midX = tileMidX + relOutX * downx;
        int midY = tileMidY + relOutY * downy;
        int inX = floor_div(midX, upx);
        int inY = floor_div(midY, upy);
        int relInX = inX - tileInX;
        int relInY = inY - tileInY;
        int filterX = (inX + 1) * upx - midX - 1;  // flipped
        int filterY = (inY + 1) * upy - midY - 1;  // flipped

        // Inner loop.
        // Only in-range outputs are computed and stored; the trip counts are
        // compile-time constants (filterH / upy and filterW / upx).
        if (outX < p.outSize.x & outY < p.outSize.y & c < p.outSize.z) {
          scalar_t v = 0;
#pragma unroll
          for (int y = 0; y < filterH / upy; y++)
#pragma unroll
            for (int x = 0; x < filterW / upx; x++)
              v += sx[relInY + y][relInX + x][relC] *
                   sf[filterY + y * upy][filterX + x * upx];
          v *= p.gain;
          ((T *)p.y)[outX * p.outStride.x + outY * p.outStride.y +
                     c * p.outStride.z + n * p.outStride.w] = (T)v;
        }
      }
    }
  }
}

//------------------------------------------------------------------------
// MUSA kernel selection.

template <class T>
upfirdn2d_kernel_spec choose_upfirdn2d_kernel(
    const upfirdn2d_kernel_params &p) {
  int s = p.inStride.z, fx = p.filterSize.x, fy = p.filterSize.y;
  upfirdn2d_kernel_spec spec = {(void *)upfirdn2d_kernel_large<T>, -1, -1, 1,
                                4};  // contiguous
  if (s == 1)
    spec = {(void *)upfirdn2d_kernel_large<T>, -1, -1, 4, 1};  // channels_last

  // No up/downsampling.
  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) {
    // contiguous
    if (s != 1 && fx <= 24 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 24, 24, 64, 32, 1>,
              64, 32, 1, 1};
    if (s != 1 && fx <= 16 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 16, 16, 64, 32, 1>,
              64, 32, 1, 1};
    if (s != 1 && fx <= 7 && fy <= 7)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 7, 7, 64, 16, 1>,
              64, 16, 1, 1};
    if (s != 1 && fx <= 6 && fy <= 6)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 6, 6, 64, 16, 1>,
              64, 16, 1, 1};
    if (s != 1 && fx <= 5 && fy <= 5)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 5, 5, 64, 16, 1>,
              64, 16, 1, 1};
    if (s != 1 && fx <= 4 && fy <= 4)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 4, 4, 64, 16, 1>,
              64, 16, 1, 1};
    if (s != 1 && fx <= 3 && fy <= 3)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 3, 3, 64, 16, 1>,
              64, 16, 1, 1};
    if (s != 1 && fx <= 24 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 24, 1, 128, 8, 1>,
              128, 8, 1, 1};
    if (s != 1 && fx <= 16 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 16, 1, 128, 8, 1>,
              128, 8, 1, 1};
    if (s != 1 && fx <= 8 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 8, 1, 128, 8, 1>,
              128, 8, 1, 1};
    if (s != 1 && fx <= 1 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 24, 32, 32, 1>,
              32, 32, 1, 1};
    if (s != 1 && fx <= 1 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 16, 32, 32, 1>,
              32, 32, 1, 1};
    if (s != 1 && fx <= 1 && fy <= 8)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 8, 32, 32, 1>,
              32, 32, 1, 1};
    // channels_last
    if (s == 1 && fx <= 24 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 24, 24, 32, 32, 1>,
              32, 32, 1, 1};
    if (s == 1 && fx <= 16 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 16, 16, 32, 32, 1>,
              32, 32, 1, 1};
    if (s == 1 && fx <= 7 && fy <= 7)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 7, 7, 16, 16, 8>,
              16, 16, 8, 1};
    if (s == 1 && fx <= 6 && fy <= 6)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 6, 6, 16, 16, 8>,
              16, 16, 8, 1};
    if (s == 1 && fx <= 5 && fy <= 5)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 5, 5, 16, 16, 8>,
              16, 16, 8, 1};
    if (s == 1 && fx <= 4 && fy <= 4)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 4, 4, 16, 16, 8>,
              16, 16, 8, 1};
    if (s == 1 && fx <= 3 && fy <= 3)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 3, 3, 16, 16, 8>,
              16, 16, 8, 1};
    if (s == 1 && fx <= 24 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 24, 1, 128, 1, 16>,
              128, 1, 16, 1};
    if (s == 1 && fx <= 16 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 16, 1, 128, 1, 16>,
              128, 1, 16, 1};
    if (s == 1 && fx <= 8 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 8, 1, 128, 1, 16>,
              128, 1, 16, 1};
    if (s == 1 && fx <= 1 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 24, 1, 128, 16>,
              1, 128, 16, 1};
    if (s == 1 && fx <= 1 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 16, 1, 128, 16>,
              1, 128, 16, 1};
    if (s == 1 && fx <= 1 && fy <= 8)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 8, 1, 128, 16>,
              1, 128, 16, 1};
  }

  // 2x upsampling.
  if (p.up.x == 2 && p.up.y == 2 && p.down.x == 1 && p.down.y == 1) {
    // contiguous
    if (s != 1 && fx <= 24 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 24, 24, 64, 32, 1>,
              64, 32, 1, 1};
    if (s != 1 && fx <= 16 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 16, 16, 64, 32, 1>,
              64, 32, 1, 1};
    if (s != 1 && fx <= 8 && fy <= 8)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 8, 8, 64, 16, 1>,
              64, 16, 1, 1};
    if (s != 1 && fx <= 6 && fy <= 6)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 6, 6, 64, 16, 1>,
              64, 16, 1, 1};
    if (s != 1 && fx <= 4 && fy <= 4)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 4, 4, 64, 16, 1>,
              64, 16, 1, 1};
    if (s != 1 && fx <= 2 && fy <= 2)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 2, 2, 64, 16, 1>,
              64, 16, 1, 1};
    // channels_last
    if (s == 1 && fx <= 24 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 24, 24, 32, 32, 1>,
              32, 32, 1, 1};
    if (s == 1 && fx <= 16 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 16, 16, 32, 32, 1>,
              32, 32, 1, 1};
    if (s == 1 && fx <= 8 && fy <= 8)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 8, 8, 16, 16, 8>,
              16, 16, 8, 1};
    if (s == 1 && fx <= 6 && fy <= 6)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 6, 6, 16, 16, 8>,
              16, 16, 8, 1};
    if (s == 1 && fx <= 4 && fy <= 4)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 4, 4, 16, 16, 8>,
              16, 16, 8, 1};
    if (s == 1 && fx <= 2 && fy <= 2)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 2, 2, 16, 16, 8>,
              16, 16, 8, 1};
  }
  if (p.up.x == 2 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) {
    // contiguous
    if (s != 1 && fx <= 24 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 24, 1, 128, 8, 1>,
              128, 8, 1, 1};
    if (s != 1 && fx <= 16 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 16, 1, 128, 8, 1>,
              128, 8, 1, 1};
    if (s != 1 && fx <= 8 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 8, 1, 128, 8, 1>,
              128, 8, 1, 1};
    // channels_last
    if (s == 1 && fx <= 24 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 24, 1, 128, 1, 16>,
              128, 1, 16, 1};
    if (s == 1 && fx <= 16 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 16, 1, 128, 1, 16>,
              128, 1, 16, 1};
    if (s == 1 && fx <= 8 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 8, 1, 128, 1, 16>,
              128, 1, 16, 1};
  }
  if (p.up.x == 1 && p.up.y == 2 && p.down.x == 1 && p.down.y == 1) {
    // contiguous
    if (s != 1 && fx <= 1 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 24, 32, 32, 1>,
              32, 32, 1, 1};
    if (s != 1 && fx <= 1 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 16, 32, 32, 1>,
              32, 32, 1, 1};
    if (s != 1 && fx <= 1 && fy <= 8)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 8, 32, 32, 1>,
              32, 32, 1, 1};
    // channels_last
    if (s == 1 && fx <= 1 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 24, 1, 128, 16>,
              1, 128, 16, 1};
    if (s == 1 && fx <= 1 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 16, 1, 128, 16>,
              1, 128, 16, 1};
    if (s == 1 && fx <= 1 && fy <= 8)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 8, 1, 128, 16>,
              1, 128, 16, 1};
  }

  // 2x downsampling.
  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 2 && p.down.y == 2) {
    // contiguous
    if (s != 1 && fx <= 24 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 24, 24, 32, 16, 1>,
              32, 16, 1, 1};
    if (s != 1 && fx <= 16 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 16, 16, 32, 16, 1>,
              32, 16, 1, 1};
    if (s != 1 && fx <= 8 && fy <= 8)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 8, 8, 32, 8, 1>, 32,
              8, 1, 1};
    if (s != 1 && fx <= 6 && fy <= 6)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 6, 6, 32, 8, 1>, 32,
              8, 1, 1};
    if (s != 1 && fx <= 4 && fy <= 4)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 4, 4, 32, 8, 1>, 32,
              8, 1, 1};
    if (s != 1 && fx <= 2 && fy <= 2)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 2, 2, 32, 8, 1>, 32,
              8, 1, 1};
    // channels_last
    if (s == 1 && fx <= 24 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 24, 24, 16, 16, 1>,
              16, 16, 1, 1};
    if (s == 1 && fx <= 16 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 16, 16, 16, 16, 1>,
              16, 16, 1, 1};
    if (s == 1 && fx <= 8 && fy <= 8)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 8, 8, 8, 8, 8>, 8,
              8, 8, 1};
    if (s == 1 && fx <= 6 && fy <= 6)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 6, 6, 8, 8, 8>, 8,
              8, 8, 1};
    if (s == 1 && fx <= 4 && fy <= 4)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 4, 4, 8, 8, 8>, 8,
              8, 8, 1};
    if (s == 1 && fx <= 2 && fy <= 2)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 2, 2, 8, 8, 8>, 8,
              8, 8, 1};
  }
  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 2 && p.down.y == 1) {
    // contiguous
    if (s != 1 && fx <= 24 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 24, 1, 64, 8, 1>,
              64, 8, 1, 1};
    if (s != 1 && fx <= 16 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 16, 1, 64, 8, 1>,
              64, 8, 1, 1};
    if (s != 1 && fx <= 8 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 8, 1, 64, 8, 1>, 64,
              8, 1, 1};
    // channels_last
    if (s == 1 && fx <= 24 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 24, 1, 64, 1, 8>,
              64, 1, 8, 1};
    if (s == 1 && fx <= 16 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 16, 1, 64, 1, 8>,
              64, 1, 8, 1};
    if (s == 1 && fx <= 8 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 8, 1, 64, 1, 8>, 64,
              1, 8, 1};
  }
  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 2) {
    // contiguous
    if (s != 1 && fx <= 1 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 24, 32, 16, 1>,
              32, 16, 1, 1};
    if (s != 1 && fx <= 1 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 16, 32, 16, 1>,
              32, 16, 1, 1};
    if (s != 1 && fx <= 1 && fy <= 8)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 8, 32, 16, 1>,
              32, 16, 1, 1};
    // channels_last
    if (s == 1 && fx <= 1 && fy <= 24)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 24, 1, 64, 8>, 1,
              64, 8, 1};
    if (s == 1 && fx <= 1 && fy <= 16)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 16, 1, 64, 8>, 1,
              64, 8, 1};
    if (s == 1 && fx <= 1 && fy <= 8)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 8, 1, 64, 8>, 1,
              64, 8, 1};
  }

  // 4x upsampling.
  if (p.up.x == 4 && p.up.y == 4 && p.down.x == 1 && p.down.y == 1) {
    // contiguous
    if (s != 1 && fx <= 48 && fy <= 48)
      spec = {(void *)upfirdn2d_kernel_small<T, 4, 4, 1, 1, 48, 48, 64, 32, 1>,
              64, 32, 1, 1};
    if (s != 1 && fx <= 32 && fy <= 32)
      spec = {(void *)upfirdn2d_kernel_small<T, 4, 4, 1, 1, 32, 32, 64, 32, 1>,
              64, 32, 1, 1};
    // channels_last
    if (s == 1 && fx <= 48 && fy <= 48)
      spec = {(void *)upfirdn2d_kernel_small<T, 4, 4, 1, 1, 48, 48, 32, 32, 1>,
              32, 32, 1, 1};
    if (s == 1 && fx <= 32 && fy <= 32)
      spec = {(void *)upfirdn2d_kernel_small<T, 4, 4, 1, 1, 32, 32, 32, 32, 1>,
              32, 32, 1, 1};
  }
  if (p.up.x == 4 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) {
    // contiguous
    if (s != 1 && fx <= 48 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 4, 1, 1, 1, 48, 1, 128, 8, 1>,
              128, 8, 1, 1};
    if (s != 1 && fx <= 32 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 4, 1, 1, 1, 32, 1, 128, 8, 1>,
              128, 8, 1, 1};
    // channels_last
    if (s == 1 && fx <= 48 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 4, 1, 1, 1, 48, 1, 128, 1, 16>,
              128, 1, 16, 1};
    if (s == 1 && fx <= 32 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 4, 1, 1, 1, 32, 1, 128, 1, 16>,
              128, 1, 16, 1};
  }
  if (p.up.x == 1 && p.up.y == 4 && p.down.x == 1 && p.down.y == 1) {
    // contiguous
    if (s != 1 && fx <= 1 && fy <= 48)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 4, 1, 1, 1, 48, 32, 32, 1>,
              32, 32, 1, 1};
    if (s != 1 && fx <= 1 && fy <= 32)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 4, 1, 1, 1, 32, 32, 32, 1>,
              32, 32, 1, 1};
    // channels_last
    if (s == 1 && fx <= 1 && fy <= 48)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 4, 1, 1, 1, 48, 1, 128, 16>,
              1, 128, 16, 1};
    if (s == 1 && fx <= 1 && fy <= 32)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 4, 1, 1, 1, 32, 1, 128, 16>,
              1, 128, 16, 1};
  }

  // 4x downsampling (inefficient).
  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 4 && p.down.y == 1) {
    // contiguous
    if (s != 1 && fx <= 48 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 4, 1, 48, 1, 32, 8, 1>,
              32, 8, 1, 1};
    if (s != 1 && fx <= 32 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 4, 1, 32, 1, 32, 8, 1>,
              32, 8, 1, 1};
    // channels_last
    if (s == 1 && fx <= 48 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 4, 1, 48, 1, 32, 1, 8>,
              32, 1, 8, 1};
    if (s == 1 && fx <= 32 && fy <= 1)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 4, 1, 32, 1, 32, 1, 8>,
              32, 1, 8, 1};
  }
  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 4) {
    // contiguous
    if (s != 1 && fx <= 1 && fy <= 48)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 4, 1, 48, 32, 8, 1>,
              32, 8, 1, 1};
    if (s != 1 && fx <= 1 && fy <= 32)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 4, 1, 32, 32, 8, 1>,
              32, 8, 1, 1};
    // channels_last
    if (s == 1 && fx <= 1 && fy <= 48)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 4, 1, 48, 1, 32, 8>, 1,
              32, 8, 1};
    if (s == 1 && fx <= 1 && fy <= 32)
      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 4, 1, 32, 1, 32, 8>, 1,
              32, 8, 1};
  }
  return spec;
}

//------------------------------------------------------------------------
// Explicit template instantiations.
//
// choose_upfirdn2d_kernel<T> is explicitly instantiated here for double,
// float and c10::Half, the element types listed below.

template upfirdn2d_kernel_spec choose_upfirdn2d_kernel<double>(
    const upfirdn2d_kernel_params &p);
template upfirdn2d_kernel_spec choose_upfirdn2d_kernel<float>(
    const upfirdn2d_kernel_params &p);
template upfirdn2d_kernel_spec choose_upfirdn2d_kernel<c10::Half>(
    const upfirdn2d_kernel_params &p);

//------------------------------------------------------------------------

//------------------------------------------------------------------------

// Host-side entry point of the MUSA upfirdn2d op.
//
// Validates the arguments, allocates the output tensor, fills in an
// upfirdn2d_kernel_params struct, asks choose_upfirdn2d_kernel() for a kernel
// spec and launches that kernel on the current MUSA stream.
//
// Args:
//   x: rank-4, non-empty input tensor for which x.is_privateuseone() holds.
//      Dimension 3 is used as the width and dimension 2 as the height.
//   f: rank-2, non-empty float32 filter on the same device as x.
//   upx, upy: upsampling factors, each >= 1.
//   downx, downy: downsampling factors, each >= 1.
//   padx0, padx1, pady0, pady1: padding amounts used in the output-size
//      formula; only padx0 and pady0 are stored in the kernel parameters.
//   flip: stored in the kernel parameters as 1 (true) or 0 (false).
//   gain: stored in the kernel parameters unchanged.
//
// Returns:
//   A newly allocated tensor of shape {x.size(0), x.size(1), outH, outW} that
//   uses x's options and x's suggested memory format.
//
// Raises (via TORCH_CHECK / AT_MUSA_CHECK): on any violated precondition, on
//   an output smaller than 1x1, on sizes or footprints above INT_MAX, or when
//   the kernel launch reports an error.
torch::Tensor upfirdn2d_op(torch::Tensor x, torch::Tensor f, int upx, int upy,
                           int downx, int downy, int padx0, int padx1,
                           int pady0, int pady1, bool flip, float gain) {
  // Validate arguments.
  TORCH_CHECK(x.is_privateuseone(), "x must reside on MUSA device");
  TORCH_CHECK(f.device() == x.device(),
              "f must reside on the same device as x");
  TORCH_CHECK(f.dtype() == torch::kFloat, "f must be float32");
  // Sizes and strides are narrowed to int below, hence the INT_MAX limits.
  TORCH_CHECK(x.numel() <= INT_MAX, "x is too large");
  TORCH_CHECK(f.numel() <= INT_MAX, "f is too large");
  TORCH_CHECK(x.numel() > 0, "x has zero size");
  TORCH_CHECK(f.numel() > 0, "f has zero size");
  TORCH_CHECK(x.dim() == 4, "x must be rank 4");
  TORCH_CHECK(f.dim() == 2, "f must be rank 2");
  // Largest element offset reachable through x's strides.
  TORCH_CHECK((x.size(0) - 1) * x.stride(0) + (x.size(1) - 1) * x.stride(1) +
                      (x.size(2) - 1) * x.stride(2) +
                      (x.size(3) - 1) * x.stride(3) <=
                  INT_MAX,
              "x memory footprint is too large");
  TORCH_CHECK(f.size(0) >= 1 && f.size(1) >= 1, "f must be at least 1x1");
  TORCH_CHECK(upx >= 1 && upy >= 1, "upsampling factor must be at least 1");
  TORCH_CHECK(downx >= 1 && downy >= 1,
              "downsampling factor must be at least 1");

  // Create output tensor.
  const at::musa::OptionalMUSAGuard device_guard(device_of(x));
  // Output extent: (in * up + pad0 + pad1 - filter + down) / down, using
  // integer division.
  int outW =
      ((int)x.size(3) * upx + padx0 + padx1 - (int)f.size(1) + downx) / downx;
  int outH =
      ((int)x.size(2) * upy + pady0 + pady1 - (int)f.size(0) + downy) / downy;
  TORCH_CHECK(outW >= 1 && outH >= 1, "output must be at least 1x1");
  torch::Tensor y = torch::empty({x.size(0), x.size(1), outH, outW},
                                 x.options(), x.suggest_memory_format());
  TORCH_CHECK(y.numel() <= INT_MAX, "output is too large");
  // Same reachable-offset bound as for x, applied to the output.
  TORCH_CHECK((y.size(0) - 1) * y.stride(0) + (y.size(1) - 1) * y.stride(1) +
                      (y.size(2) - 1) * y.stride(2) +
                      (y.size(3) - 1) * y.stride(3) <=
                  INT_MAX,
              "output memory footprint is too large");

  // Initialize MUSA kernel parameters.
  // The int4 size/stride fields are filled in reversed dimension order:
  // .x <- dim 3, .y <- dim 2, .z <- dim 1, .w <- dim 0.
  upfirdn2d_kernel_params p;
  p.x = x.data_ptr();
  p.f = f.data_ptr<float>();
  p.y = y.data_ptr();
  p.up = make_int2(upx, upy);
  p.down = make_int2(downx, downy);
  p.pad0 = make_int2(padx0, pady0);
  p.flip = (flip) ? 1 : 0;
  p.gain = gain;
  p.inSize =
      make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));
  p.inStride = make_int4((int)x.stride(3), (int)x.stride(2), (int)x.stride(1),
                         (int)x.stride(0));
  p.filterSize = make_int2((int)f.size(1), (int)f.size(0));
  p.filterStride = make_int2((int)f.stride(1), (int)f.stride(0));
  p.outSize =
      make_int4((int)y.size(3), (int)y.size(2), (int)y.size(1), (int)y.size(0));
  p.outStride = make_int4((int)y.stride(3), (int)y.stride(2), (int)y.stride(1),
                          (int)y.stride(0));
  // When dim 1 of x has stride 1, "major" is dim 0 and "minor" is dim 1;
  // otherwise dims 0 and 1 are folded together into "major" and "minor" is 1.
  p.sizeMajor = (p.inStride.z == 1) ? p.inSize.w : p.inSize.w * p.inSize.z;
  p.sizeMinor = (p.inStride.z == 1) ? p.inSize.z : 1;

  // Choose MUSA kernel.
  // NOTE(review): choose_upfirdn2d_kernel is also explicitly instantiated for
  // c10::Half in this file, but this dispatch macro does not name Half;
  // confirm whether half-precision inputs are meant to be supported here.
  upfirdn2d_kernel_spec spec;
  AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "upfirdn2d_musa", [&] {
    spec = choose_upfirdn2d_kernel<scalar_t>(p);
  });

  // Set looping options.
  // loopMajor splits sizeMajor into chunks of at most 16384; launchMajor and
  // launchMinor are the ceil-divided counts that end up in the grid size.
  p.loopMajor = (p.sizeMajor - 1) / 16384 + 1;
  p.loopMinor = spec.loopMinor;
  p.loopX = spec.loopX;
  p.launchMinor = (p.sizeMinor - 1) / p.loopMinor + 1;
  p.launchMajor = (p.sizeMajor - 1) / p.loopMajor + 1;

  // Compute grid size.
  dim3 blockSize, gridSize;
  if (spec.tileOutW < 0)  // large
  {
    // A negative tile width marks the "large" kernel: the grid is derived
    // from the block shape instead of from the spec's tile size.
    blockSize = dim3(4, 32, 1);
    gridSize =
        dim3(((p.outSize.y - 1) / blockSize.x + 1) * p.launchMinor,
             (p.outSize.x - 1) / (blockSize.y * p.loopX) + 1, p.launchMajor);
  } else  // small
  {
    // "Small" kernels: the grid is derived from the spec's output tile size.
    blockSize = dim3(256, 1, 1);
    gridSize =
        dim3(((p.outSize.y - 1) / spec.tileOutH + 1) * p.launchMinor,
             (p.outSize.x - 1) / (spec.tileOutW * p.loopX) + 1, p.launchMajor);
  }

  // Launch MUSA kernel.
  // The kernel takes the parameter struct as its single argument; the launch
  // uses 0 bytes of dynamic shared memory on the current MUSA stream.
  void *args[] = {&p};
#ifdef MMCV_WITH_HIP
  AT_MUSA_CHECK(hipLaunchKernel(spec.kernel, gridSize, blockSize, args, 0,
                                c10::musa::getCurrentMUSAStream()));
#else
  AT_MUSA_CHECK(musaLaunchKernel(spec.kernel, gridSize, blockSize, args, 0,
                                 c10::musa::getCurrentMUSAStream()));
#endif

  return y;
}
#else
#warning "upfirdn2d is supported when MUSA_ARCH > 21"
#endif  //MUSA_ARCH


================================================
FILE: mmcv/ops/csrc/pytorch/musa/voxelization_musa.mu
================================================
// Copyright (c) OpenMMLab. All rights reserved.
#include <stdio.h>
#include <stdlib.h>

#include "pytorch_musa_helper.hpp"
#include "voxelization_musa_kernel.muh"

// Launches the hard-voxelization pipeline on the MUSA device.
//
// The work is split into five kernel launches on the current MUSA stream:
//   1. map every point to its voxel coordinate,
//   2. map every point to the index of its voxel and detect duplicates,
//   3. count the voxels and assign each one an output slot (single thread),
//   4. copy the point features into `voxels`,
//   5. copy the voxel coordinates into `coors`.
//
// Args:
//   points: 2-D tensor; size(0) is the number of points and size(1) the
//           number of features per point.
//   voxels, coors, num_points_per_voxel: output tensors written by the
//           kernels. `voxels` is accessed with the same element type as
//           `points`; `coors` and `num_points_per_voxel` are accessed as int.
//   voxel_size: voxel extent, indexed [0..2] as x, y, z.
//   coors_range: indexed [0..2] as the x/y/z minimum and [3..5] as the x/y/z
//           maximum of the voxelized range.
//   max_points, max_voxels: limits forwarded to the kernels.
//   NDim: number of coordinate components per voxel (default 3).
//
// Returns:
//   The voxel count read back from the device, or 0 for an empty `points`.
int HardVoxelizeForwardMUSAKernelLauncher(
    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const int max_points,
    const int max_voxels, const int NDim = 3) {
  c10::musa::MUSAGuard device_guard(points.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  const int num_points = points.size(0);
  const int num_features = points.size(1);

  // An empty point cloud yields no voxels. Returning here also avoids
  // requesting launches with a zero-block grid further down, and matches
  // NondeterministicHardVoxelizeForwardMUSAKernelLauncher.
  if (num_points == 0) return 0;

  const float voxel_x = voxel_size[0];
  const float voxel_y = voxel_size[1];
  const float voxel_z = voxel_size[2];
  const float coors_x_min = coors_range[0];
  const float coors_y_min = coors_range[1];
  const float coors_z_min = coors_range[2];
  const float coors_x_max = coors_range[3];
  const float coors_y_max = coors_range[4];
  const float coors_z_max = coors_range[5];

  // Number of voxels along each axis.
  const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);

  // map points to voxel coors
  at::Tensor temp_coors =
      at::zeros({num_points, NDim}, points.options().dtype(at::kInt));

  dim3 grid(std::min(at::musa::ATenCeilDiv(num_points, 512), 4096));
  dim3 block(512);

  // 1. link point to corresponding voxel coors
  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "hard_voxelize_kernel", ([&] {
        dynamic_voxelize_kernel<scalar_t, int><<<grid, block, 0, stream>>>(
            points.contiguous().data_ptr<scalar_t>(),
            temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,
            coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,
            coors_z_max, grid_x, grid_y, grid_z, num_points, num_features,
            NDim);
      }));

  AT_MUSA_CHECK(musaGetLastError());

  // 2. map point to the idx of the corresponding voxel, find duplicate coor
  // create some temporary variables (initialised to -1)
  auto point_to_pointidx = -at::ones(
      {
          num_points,
      },
      points.options().dtype(at::kInt));
  auto point_to_voxelidx = -at::ones(
      {
          num_points,
      },
      points.options().dtype(at::kInt));

  dim3 map_grid(std::min(at::musa::ATenCeilDiv(num_points, 512), 4096));
  dim3 map_block(512);

  AT_DISPATCH_ALL_TYPES(
      temp_coors.scalar_type(), "determin_duplicate", ([&] {
        point_to_voxelidx_kernel<int><<<map_grid, map_block, 0, stream>>>(
            temp_coors.contiguous().data_ptr<int>(),
            point_to_voxelidx.contiguous().data_ptr<int>(),
            point_to_pointidx.contiguous().data_ptr<int>(), max_points,
            max_voxels, num_points, NDim);
      }));

  AT_MUSA_CHECK(musaGetLastError());

  // 3. determine voxel num and voxel's coor index
  // Doing this on the device (with a single thread) was reported to be about
  // 10 times faster than the alternative.
  auto coor_to_voxelidx = -at::ones(
      {
          num_points,
      },
      points.options().dtype(at::kInt));
  auto voxel_num = at::zeros(
      {
          1,
      },
      points.options().dtype(at::kInt));  // must be zero from the beginning

  AT_DISPATCH_ALL_TYPES(temp_coors.scalar_type(), "determin_duplicate", ([&] {
                          determin_voxel_num<int><<<1, 1, 0, stream>>>(
                              num_points_per_voxel.contiguous().data_ptr<int>(),
                              point_to_voxelidx.contiguous().data_ptr<int>(),
                              point_to_pointidx.contiguous().data_ptr<int>(),
                              coor_to_voxelidx.contiguous().data_ptr<int>(),
                              voxel_num.contiguous().data_ptr<int>(),
                              max_points, max_voxels, num_points);
                        }));

  AT_MUSA_CHECK(musaGetLastError());

  // 4. copy point features to voxels
  // Step 4 & 5 could be parallel
  auto pts_output_size = num_points * num_features;
  dim3 cp_grid(std::min(at::musa::ATenCeilDiv(pts_output_size, 512), 4096));
  dim3 cp_block(512);
  // Use the dispatched scalar_t for the feature pointers so that every dtype
  // accepted by the dispatch macro works, exactly as in step 1.
  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "assign_point_to_voxel", ([&] {
        assign_point_to_voxel<scalar_t, int><<<cp_grid, cp_block, 0, stream>>>(
            pts_output_size, points.contiguous().data_ptr<scalar_t>(),
            point_to_voxelidx.contiguous().data_ptr<int>(),
            coor_to_voxelidx.contiguous().data_ptr<int>(),
            voxels.contiguous().data_ptr<scalar_t>(), max_points, num_features,
            num_points, NDim);
      }));

  // 5. copy coors of each voxels
  auto coors_output_size = num_points * NDim;
  dim3 coors_cp_grid(
      std::min(at::musa::ATenCeilDiv(coors_output_size, 512), 4096));
  dim3 coors_cp_block(512);
  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "assign_point_to_voxel", ([&] {
        assign_voxel_coors<float, int>
            <<<coors_cp_grid, coors_cp_block, 0, stream>>>(
                coors_output_size, temp_coors.contiguous().data_ptr<int>(),
                point_to_voxelidx.contiguous().data_ptr<int>(),
                coor_to_voxelidx.contiguous().data_ptr<int>(),
                coors.contiguous().data_ptr<int>(), num_points, NDim);
      }));

  // Reports launch errors of steps 4 and 5.
  AT_MUSA_CHECK(musaGetLastError());

  // Bring the device-side voxel counter back to the host.
  auto voxel_num_cpu = voxel_num.to(at::kCPU);
  int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];

  return voxel_num_int;
}

// Hard voxelization on the MUSA device, non-deterministic variant.
//
// Every point is first mapped to a voxel coordinate. The distinct coordinates
// are then found with at::unique_dim, and two further kernels assign the
// points to their voxels and fill `voxels`, `coors` and
// `num_points_per_voxel`.
//
// Args:
//   points: 2-D tensor; size(0) is the number of points and size(1) the
//           number of features per point.
//   voxels, coors, num_points_per_voxel: output tensors written by the last
//           kernel.
//   voxel_size: voxel extent, indexed [0..2] as x, y, z.
//   coors_range: indexed [0..2] as the x/y/z minimum and [3..5] as the x/y/z
//           maximum of the voxelized range.
//   max_points, max_voxels: limits forwarded to the last kernel.
//   NDim: number of coordinate components per voxel (default 3).
//
// Returns:
//   min(max_voxels, number of distinct valid voxel coordinates), or 0 for an
//   empty `points`.
int NondeterministicHardVoxelizeForwardMUSAKernelLauncher(
    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const int max_points,
    const int max_voxels, const int NDim = 3) {
  c10::musa::MUSAGuard device_guard(points.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  const int num_points = points.size(0);
  const int num_features = points.size(1);

  // An empty point cloud produces no voxels.
  if (num_points == 0) {
    return 0;
  }

  // All three kernels below share one launch shape.
  dim3 launch_blocks(
      std::min(at::musa::ATenCeilDiv(num_points, THREADS_PER_BLOCK), 4096));
  dim3 launch_threads(THREADS_PER_BLOCK);

  // Voxel extent and range bounds, unpacked per axis.
  const float size_x = voxel_size[0];
  const float size_y = voxel_size[1];
  const float size_z = voxel_size[2];
  const float min_x = coors_range[0];
  const float min_y = coors_range[1];
  const float min_z = coors_range[2];
  const float max_x = coors_range[3];
  const float max_y = coors_range[4];
  const float max_z = coors_range[5];

  // Number of voxels along each axis.
  const int cells_x = round((max_x - min_x) / size_x);
  const int cells_y = round((max_y - min_y) / size_y);
  const int cells_z = round((max_z - min_z) / size_z);

  // Per-point voxel coordinates, filled by the first kernel.
  at::Tensor voxel_coors =
      at::zeros({num_points, NDim}, points.options().dtype(at::kInt));

  // 1. compute the voxel coordinate of every point
  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "hard_voxelize_kernel", ([&] {
        dynamic_voxelize_kernel<scalar_t, int>
            <<<launch_blocks, launch_threads, 0, stream>>>(
                points.contiguous().data_ptr<scalar_t>(),
                voxel_coors.contiguous().data_ptr<int>(), size_x, size_y,
                size_z, min_x, min_y, min_z, max_x, max_y, max_z, cells_x,
                cells_y, cells_z, num_points, num_features, NDim);
      }));

  // 2. collapse every row that has a negative component to all -1, then
  //    find the distinct rows together with the point -> row mapping
  const auto has_negative = voxel_coors.lt(0).any(-1, true);
  auto masked_coors = voxel_coors.masked_fill(has_negative, -1);

  at::Tensor point_to_coor;
  at::Tensor per_coor_count;
  std::tie(voxel_coors, point_to_coor, per_coor_count) =
      at::unique_dim(masked_coors, 0, true, true, false);

  const bool leading_row_invalid = voxel_coors[0][0].lt(0).item<bool>();
  if (leading_row_invalid) {
    // The first distinct row is (-1,-1,-1): drop it and shift the mapping so
    // that it keeps pointing at the remaining rows.
    voxel_coors = voxel_coors.slice(0, 1);
    point_to_coor = point_to_coor - 1;
  }

  const int num_coors = voxel_coors.size(0);
  voxel_coors = voxel_coors.to(at::kInt);
  point_to_coor = point_to_coor.to(at::kInt);

  // Scratch buffers for the assignment kernels, all with the int mapping's
  // options.
  const auto scratch_options = point_to_coor.options();
  at::Tensor coors_count = at::zeros({1}, scratch_options);
  at::Tensor coors_order = at::empty({num_coors}, scratch_options);
  at::Tensor pts_id = at::zeros({num_points}, scratch_options);
  per_coor_count = at::zeros({num_coors}, scratch_options);

  // 3. determine the position of every point inside its voxel
  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "get_assign_pos", ([&] {
        nondeterministic_get_assign_pos<<<launch_blocks, launch_threads, 0,
                                          stream>>>(
            num_points, point_to_coor.contiguous().data_ptr<int32_t>(),
            pts_id.contiguous().data_ptr<int32_t>(),
            coors_count.contiguous().data_ptr<int32_t>(),
            per_coor_count.contiguous().data_ptr<int32_t>(),
            coors_order.contiguous().data_ptr<int32_t>());
      }));

  // 4. write the point features, voxel coordinates and per-voxel counts
  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "assign_point_to_voxel", ([&] {
        nondeterministic_assign_point_voxel<scalar_t>
            <<<launch_blocks, launch_threads, 0, stream>>>(
                num_points, points.contiguous().data_ptr<scalar_t>(),
                point_to_coor.contiguous().data_ptr<int32_t>(),
                pts_id.contiguous().data_ptr<int32_t>(),
                voxel_coors.contiguous().data_ptr<int32_t>(),
                per_coor_count.contiguous().data_ptr<int32_t>(),
                coors_order.contiguous().data_ptr<int32_t>(),
                voxels.contiguous().data_ptr<scalar_t>(),
                coors.contiguous().data_ptr<int32_t>(),
                num_points_per_voxel.contiguous().data_ptr<int32_t>(),
                max_voxels, max_points, num_features, NDim);
      }));
  AT_MUSA_CHECK(musaGetLastError());

  // Never report more voxels than the caller asked for.
  return std::min(max_voxels, num_coors);
}

// Launches dynamic voxelization on the MUSA device: a single kernel that
// writes the voxel coordinate of every point into `coors`.
//
// Args:
//   points: 2-D tensor; size(0) is the number of points and size(1) the
//           number of features per point.
//   coors: output tensor, accessed as int.
//   voxel_size: voxel extent, indexed [0..2] as x, y, z.
//   coors_range: indexed [0..2] as the x/y/z minimum and [3..5] as the x/y/z
//           maximum of the voxelized range.
//   NDim: number of coordinate components per point (default 3).
//
// For an empty `points` the function returns without launching anything.
void DynamicVoxelizeForwardMUSAKernelLauncher(
    const at::Tensor &points, at::Tensor &coors,
    const std::vector<float> voxel_size, const std::vector<float> coors_range,
    const int NDim = 3) {
  c10::musa::MUSAGuard device_guard(points.device());
  musaStream_t stream = c10::musa::getCurrentMUSAStream();

  const int num_points = points.size(0);
  const int num_features = points.size(1);

  // Nothing to do for an empty point cloud. Returning here also avoids
  // requesting a launch with a zero-block grid, and matches the early exit
  // of NondeterministicHardVoxelizeForwardMUSAKernelLauncher.
  if (num_points == 0) return;

  const float voxel_x = voxel_size[0];
  const float voxel_y = voxel_size[1];
  const float voxel_z = voxel_size[2];
  const float coors_x_min = coors_range[0];
  const float coors_y_min = coors_range[1];
  const float coors_z_min = coors_range[2];
  const float coors_x_max = coors_range[3];
  const float coors_y_max = coors_range[4];
  const float coors_z_max = coors_range[5];

  // Number of voxels along each axis.
  const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);

  // Enough blocks of THREADS_PER_BLOCK threads to cover every point.
  const int col_blocks = at::musa::ATenCeilDiv(num_points, THREADS_PER_BLOCK);
  dim3 blocks(col_blocks);
  dim3 threads(THREADS_PER_BLOCK);

  AT_DISPATCH_ALL_TYPES(points.scalar_type(), "dynamic_voxelize_kernel", [&] {
    dynamic_voxelize_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(
        points.contiguous().data_ptr<scalar_t>(),
        coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,
        coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,
        coors_z_max, grid_x, grid_y, grid_z, num_points, num_features, NDim);
  });

  AT_MUSA_CHECK(musaGetLastError());
}


================================================
FILE: mmcv/ops/csrc/pytorch/nms.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_DIOPI
#include <diopi/diopirt.h>
#include <diopi/functions.h>
#include <diopi/functions_mmcv.h>
#include <torch/csrc/utils/pybind.h>

#include "csrc_dipu/base/basedef.h"
#include "csrc_dipu/diopirt/diopirt_impl.h"
#include "csrc_dipu/runtime/device/deviceapis.h"
#include "csrc_dipu/utils/helpfunc.hpp"

using dipu::VENDOR_TYPE;
using dipu::diopi_helper::toDiopiScalar;
using dipu::diopi_helper::toDiopiTensorHandle;
#endif

// Device-dispatched NMS: forwards all arguments unchanged through
// DISPATCH_DEVICE_IMPL under the key `nms_impl` and returns its result.
// (The macro presumably comes from pytorch_device_registry.hpp and selects
// the implementation registered for the tensors' device -- confirm there.)
Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  return DISPATCH_DEVICE_IMPL(nms_impl, boxes, scores, iou_threshold, offset);
}

// Device-dispatched Soft-NMS: forwards all arguments unchanged through
// DISPATCH_DEVICE_IMPL under the key `softnms_impl` and returns its result.
Tensor softnms_impl(Tensor boxes, Tensor scores, Tensor dets,
                    float iou_threshold, float sigma, float min_score,
                    int method, int offset) {
  return DISPATCH_DEVICE_IMPL(softnms_impl, boxes, scores, dets, iou_threshold,
                              sigma, min_score, method, offset);
}

// Device-dispatched NMS-match: forwards `dets` and `iou_threshold` through
// DISPATCH_DEVICE_IMPL under the key `nms_match_impl` and returns the
// resulting vector of int vectors.
std::vector<std::vector<int> > nms_match_impl(Tensor dets,
                                              float iou_threshold) {
  return DISPATCH_DEVICE_IMPL(nms_match_impl, dets, iou_threshold);
}

#ifdef MMCV_WITH_DIOPI
// NMS entry point used when MMCV is built with DIOPI.
//
// Order of attempts:
//   1. If DIOPI reports `boxes` as a host tensor, call nms_impl directly.
//   2. Otherwise, if `boxes` lives on the DIPU device type and diopiNmsMmcv
//      is available, call it and return its output on success.
//   3. Otherwise (or if the DIOPI call does not succeed) log a warning, copy
//      `boxes` and `scores` to the CPU and run nms_impl on the copies.
Tensor nms_diopi(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  auto boxes_p = toDiopiTensorHandle(boxes);
  diopiDevice_t device;
  diopiGetTensorDevice(boxes_p, &device);
  if (device == diopi_host) {
    return nms_impl(boxes, scores, iou_threshold, offset);
  }
  // DIOPI context bound to the current DIPU stream.
  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
  diopiContextHandle_t ch = &ctx;
  // `out` starts as a default-constructed Tensor; the DIOPI call receives a
  // pointer to its handle so it can provide the result through it.
  Tensor out;
  auto outp = toDiopiTensorHandle(out);
  diopiTensorHandle_t* outhandle = &outp;
  auto scores_p = toDiopiTensorHandle(scores);
  bool is_mock_cuda = boxes.device().type() == dipu::DIPU_DEVICE_TYPE;
  // The function-pointer test guards against diopiNmsMmcv being unavailable.
  if (is_mock_cuda && reinterpret_cast<void*>(diopiNmsMmcv) != nullptr) {
    if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), "NPU") == 0) {
      // NPU vendor only: the Python GIL is released for the rest of this
      // scope, i.e. while the DIOPI call runs.
      pybind11::gil_scoped_release no_gil;
      auto ret =
          diopiNmsMmcv(ch, outhandle, boxes_p, scores_p, iou_threshold, offset);
      if (ret == diopiSuccess) {
        auto tensorhandle = reinterpret_cast<Tensor*>(*outhandle);
        return *tensorhandle;
      }
    } else {
      // Every other vendor: same call, with the GIL left untouched.
      auto ret =
          diopiNmsMmcv(ch, outhandle, boxes_p, scores_p, iou_threshold, offset);
      if (ret == diopiSuccess) {
        auto tensorhandle = reinterpret_cast<Tensor*>(*outhandle);
        return *tensorhandle;
      }
    }
  }
  // Reached when the DIOPI path was not taken or did not report success.
  LOG(WARNING) << "Fallback to cpu: mmcv ext op nms";
  auto boxes_cpu = boxes.cpu();
  auto scores_cpu = scores.cpu();
  return nms_impl(boxes_cpu, scores_cpu, iou_threshold, offset);
}
#endif

// Public NMS entry point: routes to nms_diopi when built with
// MMCV_WITH_DIOPI, and to the device-dispatched nms_impl otherwise.
Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
#ifdef MMCV_WITH_DIOPI
  return nms_diopi(boxes, scores, iou_threshold, offset);
#else
  return nms_impl(boxes, scores, iou_threshold, offset);
#endif
}

// Public Soft-NMS entry point: forwards every argument unchanged to
// softnms_impl and returns its result.
Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,
               float sigma, float min_score, int method, int offset) {
  return softnms_impl(boxes, scores, dets, iou_threshold, sigma, min_score,
                      method, offset);
}

// Public NMS-match entry point: forwards both arguments unchanged to
// nms_match_impl and returns its result.
std::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold) {
  return nms_match_impl(dets, iou_threshold);
}


================================================
FILE: mmcv/ops/csrc/pytorch/nms_quadri.cpp
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "pytorch_cpp_helper.hpp"

Tensor nms_quadri_cpu(const Tensor dets, const Tensor scores,
                      const float iou_threshold);

#ifdef MMCV_WITH_CUDA
Tensor nms_quadri_cuda(const Tensor dets, const Tensor scores,
                       const Tensor order, const Tensor dets_sorted,
                       const float iou_threshold, const int multi_label);
#elif MMCV_WITH_MUSA
Tensor nms_quadri_musa(const Tensor dets, const Tensor scores,
                       const Tensor order, const Tensor dets_sorted,
                       const float iou_threshold, const int multi_label);
#endif

// Interface for Python.
//
// Routes quadrilateral NMS to the backend that matches the device of `dets`:
//   * CUDA device        -> nms_quadri_cuda (or nms_quadri_musa in a
//                           MUSA-only build; raises if neither backend was
//                           compiled in),
//   * PrivateUse1 device -> nms_quadri_musa in a MUSA-only build,
//   * anything else      -> nms_quadri_cpu, which ignores `order`,
//                           `dets_sorted` and `multi_label`.
Tensor nms_quadri(const Tensor dets, const Tensor scores, const Tensor order,
                  const Tensor dets_sorted, const float iou_threshold,
                  const int multi_label) {
  assert(dets.device().is_cuda() == scores.device().is_cuda());
  if (dets.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    return nms_quadri_cuda(dets, scores, order, dets_sorted, iou_threshold,
                           multi_label);
#elif MMCV_WITH_MUSA
    return nms_quadri_musa(dets, scores, order, dets_sorted, iou_threshold,
                           multi_label);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }

  // MUSA tensors are reported as PrivateUse1 rather than CUDA, so they never
  // enter the branch above and need their own route to the MUSA backend.
  // The guard mirrors the condition under which nms_quadri_musa is declared.
#if !defined(MMCV_WITH_CUDA) && defined(MMCV_WITH_MUSA)
  if (dets.device().type() == at::kPrivateUse1) {
    return nms_quadri_musa(dets, scores, order, dets_sorted, iou_threshold,
                           multi_label);
  }
#endif

  return nms_quadri_cpu(dets, scores, iou_threshold);
}


================================================
FILE: mmcv/ops/csrc/pytorch/nms_rotated.cpp
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated.h
#include "pytorch_cpp_helper.hpp"

Tensor nms_rotated_cpu(const Tensor dets, const Tensor scores,
                       const float iou_threshold);

#ifdef MMCV_WITH_CUDA
Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores,
                        const Tensor order, const Tensor dets_sorted,
                        const float iou_threshold, const int multi_label);
#endif

#ifdef MMCV_WITH_NPU
Tensor nms_rotated_npu(const Tensor dets, const Tensor scores,
                       const Tensor labels, const float iou_threshold);
#endif

#ifdef MMCV_WITH_MLU
Tensor nms_rotated_mlu(const Tensor dets, const Tensor scores,
                       const float iou_threshold);
#endif

#ifdef MMCV_WITH_MUSA
Tensor nms_rotated_musa(const Tensor dets, const Tensor scores,
                        const Tensor order, const Tensor dets_sorted,
                        const float iou_threshold, const int multi_label);
#endif

// Interface for Python.
//
// Routes rotated NMS to the backend that matches the device of `dets`. The
// else-if chain is assembled by the preprocessor, so only the branches whose
// MMCV_WITH_* macro is defined exist in a given build:
//   * CUDA device           -> nms_rotated_cuda (raises when built without
//                              MMCV_WITH_CUDA),
//   * XLA device            -> nms_rotated_npu  (MMCV_WITH_XLA),
//   * PrivateUse1 device    -> nms_rotated_npu  (MMCV_WITH_KPRIVATE),
//   * MLU device            -> nms_rotated_mlu  (MMCV_WITH_MLU),
//   * PrivateUse1 device    -> nms_rotated_musa (MMCV_WITH_MUSA),
//   * anything else         -> nms_rotated_cpu on contiguous inputs.
// If both MMCV_WITH_KPRIVATE and MMCV_WITH_MUSA are defined, the KPRIVATE
// branch comes first and therefore wins for PrivateUse1 tensors.
// `labels` is only used by the NPU backend; `order`, `dets_sorted` and
// `multi_label` only by the CUDA and MUSA backends.
Tensor nms_rotated(const Tensor dets, const Tensor scores, const Tensor order,
                   const Tensor dets_sorted, const Tensor labels,
                   const float iou_threshold, const int multi_label) {
  assert(dets.device().is_cuda() == scores.device().is_cuda());
  if (dets.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    return nms_rotated_cuda(dets, scores, order, dets_sorted.contiguous(),
                            iou_threshold, multi_label);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
#ifdef MMCV_WITH_XLA
  } else if (dets.device().type() == at::kXLA) {
    return nms_rotated_npu(dets, scores, labels, iou_threshold);
#endif
#ifdef MMCV_WITH_KPRIVATE
  } else if (dets.device().type() == at::kPrivateUse1) {
    return nms_rotated_npu(dets, scores, labels, iou_threshold);
#endif
#ifdef MMCV_WITH_MLU
  } else if (dets.device().type() == at::kMLU) {
    return nms_rotated_mlu(dets, scores, iou_threshold);
#endif
#ifdef MMCV_WITH_MUSA
  } else if (dets.device().type() == ::at::kPrivateUse1) {
    return nms_rotated_musa(dets, scores, order, dets_sorted.contiguous(),
                            iou_threshold, multi_label);
#endif
  }

  // No device-specific branch matched: fall back to the CPU implementation.
  return nms_rotated_cpu(dets.contiguous(), scores.contiguous(), iou_threshold);
}


================================================
FILE: mmcv/ops/csrc/pytorch/npu/active_rotated_filter_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

void active_rotated_filter_forward_impl(const Tensor input,
                                        const Tensor indices, Tensor output);

void active_rotated_filter_backward_impl(const Tensor grad_out,
                                         const Tensor indices, Tensor grad_in);

// NPU forward of the active rotated filter: runs the NPU operator named
// "ActiveRotatedFilter" with `input` and `indices` as inputs (in that order)
// and `output` as its output.
void active_rotated_filter_forward_npu(const Tensor input, const Tensor indices,
                                       Tensor output) {
  OpCommand cmd;
  cmd.Name("ActiveRotatedFilter")
      .Input(input)
      .Input(indices)
      .Output(output)
      .Run();
}

// NPU backward of the active rotated filter: runs the NPU operator named
// "ActiveRotatedFilterGrad" with `grad_out` and `indices` as inputs (in that
// order) and `grad_in` as its output.
void active_rotated_filter_backward_npu(const Tensor grad_out,
                                        const Tensor indices, Tensor grad_in) {
  OpCommand cmd;
  cmd.Name("ActiveRotatedFilterGrad")
      .Input(grad_out)
      .Input(indices)
      .Output(grad_in)
      .Run();
}

REGISTER_NPU_IMPL(active_rotated_filter_forward_impl,
                  active_rotated_filter_forward_npu);

REGISTER_NPU_IMPL(active_rotated_filter_backward_impl,
                  active_rotated_filter_backward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/assign_score_withk_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// NPU forward of assign_score_withk.
//
// `points` and `centers` are permuted with dimension order {0, 3, 1, 2}
// before being handed to aclnnAssignScoreWithk; `scores` and `knn_idx` are
// passed as they are. The scalar arguments B, N0, N1, M, K, O and `aggregate`
// are forwarded unchanged, and the result is written to `output`.
void assign_score_withk_forward_npu(int B, int N0, int N1, int M, int K, int O,
                                    int aggregate, const Tensor& points,
                                    const Tensor& centers, const Tensor& scores,
                                    const Tensor& knn_idx, Tensor& output) {
  at::Tensor points_trans = points.permute({0, 3, 1, 2});
  at::Tensor centers_trans = centers.permute({0, 3, 1, 2});
  EXEC_NPU_CMD(aclnnAssignScoreWithk, points_trans, centers_trans, scores,
               knn_idx, B, N0, N1, M, K, O, aggregate, output);
}

void assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,
                                     int aggregate, const Tensor& points,
                                     const Tensor& centers,
                                     const Tensor& scores,
                                     const Tensor& knn_idx, Tensor& output);

REGISTER_NPU_IMPL(assign_score_withk_forward_impl,
                  assign_score_withk_forward_npu);

void assign_score_withk_backward_npu(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor& grad_out, const Tensor& points, const Tensor& centers,
    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
    Tensor& grad_centers, Tensor& grad_scores) {
  at::Tensor grad_out_trans = grad_out.permute({0, 2, 3, 1});

  EXEC_NPU_CMD(aclnnAssignScoreWithkGrad, grad_out_trans, points, centers,
               scores, knn_idx, B, N0, N1, M, K, O, aggregate, grad_scores,
               grad_points, grad_centers);
}

// Declaration of the entry point that the registration below binds the NPU
// backward function to; its definition is not part of this file.
void assign_score_withk_backward_impl(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor& grad_out, const Tensor& points, const Tensor& centers,
    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
    Tensor& grad_centers, Tensor& grad_scores);

REGISTER_NPU_IMPL(assign_score_withk_backward_impl,
                  assign_score_withk_backward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/ball_query_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

void ball_query_forward_npu(int b, int n, int m, float min_radius,
                            float max_radius, int nsample, const Tensor new_xyz,
                            const Tensor xyz, Tensor idx) {
  int64_t nsample_i64 = nsample;

  // transpose new_xyz from [B, M, 3] to [M, B, 3]
  at::Tensor new_xyz_transpose = new_xyz.transpose(0, 1).to(at::kFloat);

  // transpose xyz from [B, N, 3] to [B, 3, N]
  at::Tensor xyz_transpose = xyz.transpose(1, 2).to(at::kFloat);

  // transpose idx from [B, M, nsample] to [M, B, nsample]
  at::Tensor idx_transpose = idx.transpose(0, 1).contiguous();

  OpCommand cmd;
  cmd.Name("BallQuery")
      .Input(xyz_transpose)
      .Input(new_xyz_transpose)
      .Output(idx_transpose)
      .Attr("min_radius", min_radius)
      .Attr("max_radius", max_radius)
      .Attr("sample_num", nsample_i64)
      .Run();

  idx_transpose = idx_transpose.transpose(0, 1).contiguous();
  idx.copy_(idx_transpose);
}

// Declaration of the entry point that the registration below binds the NPU
// function to; its definition is not part of this file.
void ball_query_forward_impl(int b, int n, int m, float min_radius,
                             float max_radius, int nsample,
                             const Tensor new_xyz, const Tensor xyz,
                             Tensor idx);

REGISTER_NPU_IMPL(ball_query_forward_impl, ball_query_forward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/bbox_overlaps_npu.cpp
================================================
#include "common_util.h"
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// Entry point that bbox_overlaps_npu is registered for at the end of this
// file; only declared here.
void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                        const int mode, const bool aligned, const int offset);

void bbox_overlaps_npu(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                       const int mode, const bool aligned, const int offset) {
  string modeStr = "iou";
  if (mode == 1) {
    modeStr = "iof";
  }
  bool swap_flag = false;
  at::Tensor bboxesFP32 = bboxes2;
  at::Tensor gtboxesFP32 = bboxes1;
  if (bboxes2.size(0) < bboxes1.size(0)) {
    swap_flag = true;
    bboxesFP32 = bboxes1;
    gtboxesFP32 = bboxes2;
  }
  if (bboxes2.scalar_type() != at::kFloat) {
    bboxesFP32 = bboxesFP32.to(at::kFloat);
    gtboxesFP32 = gtboxesFP32.to(at::kFloat);
  }
  c10::SmallVector<int64_t, 8> iousSize = {gtboxesFP32.size(0),
                                           bboxesFP32.size(0)};
  if (aligned) {
    iousSize = {gtboxesFP32.size(0), 1};
  }
  at::Tensor iousFP32 = at::empty(iousSize, bboxesFP32.options());
  bboxesFP32 = aligned ? bboxesFP32.transpose(0, 1) : bboxesFP32;
  gtboxesFP32 = aligned ? gtboxesFP32.transpose(0, 1) : gtboxesFP32;
  OpCommand cmd;
  cmd.Name("Iou")
      .Input(bboxesFP32)
      .Input(gtboxesFP32)
      .Output(iousFP32)
      .Attr("mode", modeStr)
      .Attr("eps", (float)offset)
      .Attr("aligned", aligned)
      .Run();
  if (bboxes2.scalar_type() != at::kFloat) {
    iousFP32 = iousFP32.to(at::kHalf);
  }
  iousFP32 = swap_flag ? iousFP32.transpose(0, 1) : iousFP32;
  ious.copy_(iousFP32);
}

// Bind bbox_overlaps_npu to the bbox_overlaps_impl entry point.
REGISTER_NPU_IMPL(bbox_overlaps_impl, bbox_overlaps_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/border_align_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// Entry point that border_align_forward_npu is registered for; only declared
// in this file.
void border_align_forward_impl(const Tensor &input, const Tensor &boxes,
                               Tensor output, Tensor argmax_idx,
                               const int pool_size);

void border_align_forward_npu(const Tensor &input, const Tensor &boxes,
                              Tensor output, Tensor argmax_idx,
                              const int pool_size) {
  TORCH_CHECK(input.size(0) == boxes.size(0),
              "The batch sizes of feature map and rois must be the same.");
  TORCH_CHECK(input.size(1) % 4 == 0,
              "The number of channels must be divisible by 4.");
  TORCH_CHECK(pool_size >= 2, "The pool size should be larger than 2.");
  int32_t batch_size = input.size(0);
  int32_t channels = input.size(1);
  int32_t height = input.size(2);
  int32_t width = input.size(3);
  at::Tensor feature_map = input.permute({0, 2, 3, 1}).contiguous();
  at::Tensor rois_map = boxes.contiguous();
  at::Tensor temp_tensor = at::zeros(
      {batch_size, height * width, pool_size + 1, channels}, input.options());
  EXEC_NPU_CMD(aclnnBorderAlign, feature_map, rois_map, pool_size, temp_tensor);
  auto max_result = temp_tensor.max(-2);
  at::Tensor output_ = std::get<0>(max_result).to(at::kFloat);
  output_ = output_.reshape({batch_size, height * width, 4, channels / 4})
                .permute({0, 3, 1, 2})
                .contiguous();
  output.copy_(output_);
  at::Tensor argmax_idx_ = std::get<1>(max_result).to(at::kInt);
  argmax_idx_ =
      argmax_idx_.reshape({batch_size, height * width, 4, channels / 4})
          .permute({0, 3, 1, 2})
          .contiguous();
  argmax_idx.copy_(argmax_idx_);
}
// Bind the NPU forward function to border_align_forward_impl.
REGISTER_NPU_IMPL(border_align_forward_impl, border_align_forward_npu);

// Entry point that border_align_backward_npu is registered for; only declared
// in this file.
void border_align_backward_impl(const Tensor &grad_output, const Tensor &boxes,
                                const Tensor &argmax_idx, Tensor grad_input,
                                const int pool_size);

// NPU backward for border_align.
// Checks the ranks of the incoming tensors, derives the scalar arguments from
// their sizes and calls aclnnBorderAlignGrad with `grad_input` as the last
// argument.
//   channels   : grad_output.size(1)
//   box_size   : boxes.size(1)
//   height/width: grad_input.size(2) / grad_input.size(3)
//   batch_size : grad_output.size(0)
void border_align_backward_npu(const Tensor &grad_output, const Tensor &boxes,
                               const Tensor &argmax_idx, Tensor grad_input,
                               const int pool_size) {
  TORCH_CHECK(grad_output.dim() == 4,
              "grad_out.dim() must be 4, but got: ", grad_output.dim());
  // The message names the tensor that is actually being checked.
  TORCH_CHECK(boxes.dim() == 3,
              "boxes.dim() must be 3, but got: ", boxes.dim());
  TORCH_CHECK(argmax_idx.dim() == 4,
              "argmax_idx.dim() must be 4, but got: ", argmax_idx.dim());

  int32_t batch_size = grad_output.size(0);
  int32_t channels = grad_output.size(1);
  int32_t box_size = boxes.size(1);
  int32_t height = grad_input.size(2);
  int32_t width = grad_input.size(3);

  EXEC_NPU_CMD(aclnnBorderAlignGrad, grad_output, boxes, argmax_idx, channels,
               box_size, height, width, pool_size, batch_size, grad_input);
}
// Bind the NPU backward function to border_align_backward_impl.
REGISTER_NPU_IMPL(border_align_backward_impl, border_align_backward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/box_iou_quadri_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// Entry point that box_iou_quadri_npu is registered for; only declared in
// this file.
void box_iou_quadri_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                         const int mode_flag, const bool aligned);

void box_iou_quadri_npu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                        const int mode_flag, const bool aligned) {
  TORCH_CHECK(boxes1.size(1) == 8, "boxes1 must be 2D tensor (N, 8)");
  TORCH_CHECK(boxes1.size(1) == 8, "boxes1 must be 2D tensor (N, 8)");

  EXEC_NPU_CMD(aclnnBoxIou, boxes1, boxes2, mode_flag, aligned, ious);
  return;
}

// Bind box_iou_quadri_npu to the box_iou_quadri_impl entry point.
REGISTER_NPU_IMPL(box_iou_quadri_impl, box_iou_quadri_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// Entry point that box_iou_rotated_npu is registered for; only declared in
// this file.
void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                          const int mode_flag, const bool aligned);

// NPU implementation of box_iou_rotated.
// Requires both box tensors to have 5 entries along dimension 1, then calls
// aclnnBoxIou with `mode_flag`, `aligned` and `ious` as the last argument.
void box_iou_rotated_npu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                         const int mode_flag, const bool aligned) {
  TORCH_CHECK(boxes1.size(1) == 5, "boxes1 must be 2D tensor (N, 5)");
  TORCH_CHECK(boxes2.size(1) == 5, "boxes2 must be 2D tensor (N, 5)");
  EXEC_NPU_CMD(aclnnBoxIou, boxes1, boxes2, mode_flag, aligned, ious);
  return;
}

// Bind box_iou_rotated_npu to the box_iou_rotated_impl entry point.
REGISTER_NPU_IMPL(box_iou_rotated_impl, box_iou_rotated_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

namespace {
// Value passed to aclnnBoxesOverlapBevV1 as its mode flag.
constexpr int32_t MODE_FLAG_OVERLAP = 0;
// Value passed to aclnnBoxesOverlapBevV1 as its box-format flag.
constexpr int32_t FORMAT_FLAG_XYZWHDR = 3;
};  // namespace

// Entry point that iou3d_boxes_overlap_bev_forward_npu is registered for;
// only declared in this file.
void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
                                          const int num_b, const Tensor boxes_b,
                                          Tensor ans_overlap);

// NPU implementation of iou3d_boxes_overlap_bev_forward.
// Requires both box tensors to have 7 entries along dimension 1 and calls
// aclnnBoxesOverlapBevV1 with a fixed configuration; `ans_overlap` is passed
// as the last argument. `num_a` and `num_b` are not read here.
void iou3d_boxes_overlap_bev_forward_npu(const int num_a, const Tensor boxes_a,
                                         const int num_b, const Tensor boxes_b,
                                         Tensor ans_overlap) {
  TORCH_CHECK(boxes_a.size(1) == 7, "boxes_a must be 2D tensor (N, 7)");
  TORCH_CHECK(boxes_b.size(1) == 7, "boxes_b must be 2D tensor (N, 7)");

  // Fixed operator configuration, listed in the order the call expects it.
  int32_t box_format = FORMAT_FLAG_XYZWHDR;
  bool is_clockwise = true;
  int32_t overlap_mode = MODE_FLAG_OVERLAP;
  bool is_aligned = false;
  double overlap_margin = 1e-5;

  EXEC_NPU_CMD(aclnnBoxesOverlapBevV1, boxes_a, boxes_b, box_format,
               is_clockwise, overlap_mode, is_aligned, overlap_margin,
               ans_overlap);
}

// Bind the NPU function to iou3d_boxes_overlap_bev_forward_impl.
REGISTER_NPU_IMPL(iou3d_boxes_overlap_bev_forward_impl,
                  iou3d_boxes_overlap_bev_forward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// NPU forward for chamfer_distance.
// Both point tensors are re-ordered with transpose(1, 2) followed by
// transpose(0, 1) (original axes 0, 1, 2 end up in the order 2, 0, 1) and fed
// to the "ChamferDistance" operator, which writes `dist1`, `dist2`, `idx1` and
// `idx2`.
void chamfer_distance_forward_npu(Tensor XYZ1, Tensor XYZ2, Tensor dist1,
                                  Tensor dist2, Tensor idx1, Tensor idx2) {
  // Build the transposed views directly; no placeholder tensors are needed.
  at::Tensor xyz1 = XYZ1.transpose(1, 2).transpose(0, 1);
  at::Tensor xyz2 = XYZ2.transpose(1, 2).transpose(0, 1);
  OpCommand cmd;
  cmd.Name("ChamferDistance")
      .Input(xyz1)
      .Input(xyz2)
      .Output(dist1)
      .Output(dist2)
      .Output(idx1)
      .Output(idx2)
      .Run();
}

// NPU backward for chamfer_distance: forwards all tensors unchanged to
// aclnnChamferDistanceBackward, with `grad_xyz1` and `grad_xyz2` as the last
// two arguments.
void chamfer_distance_backward_npu(Tensor xyz1, Tensor xyz2, Tensor idx1,
                                   Tensor idx2, Tensor grad_dist1,
                                   Tensor grad_dist2, Tensor grad_xyz1,
                                   Tensor grad_xyz2) {
  EXEC_NPU_CMD(aclnnChamferDistanceBackward, xyz1, xyz2, idx1, idx2, grad_dist1,
               grad_dist2, grad_xyz1, grad_xyz2);
}

// Entry points (declared only) and the registrations that bind the NPU
// forward/backward functions above to them.
void chamfer_distance_forward_impl(Tensor XYZ1, Tensor XYZ2, Tensor dist1,
                                   Tensor dist2, Tensor idx1, Tensor idx2);
REGISTER_NPU_IMPL(chamfer_distance_forward_impl, chamfer_distance_forward_npu);

void chamfer_distance_backward_impl(Tensor xyz1, Tensor xyz2, Tensor idx1,
                                    Tensor idx2, Tensor grad_dist1,
                                    Tensor grad_dist2, Tensor grad_xyz1,
                                    Tensor grad_xyz2);
REGISTER_NPU_IMPL(chamfer_distance_backward_impl,
                  chamfer_distance_backward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/common_util.h
================================================
#ifndef MMCV_OPS_CSRC_COMMON__UTIL_HPP_
#define MMCV_OPS_CSRC_COMMON__UTIL_HPP_

// NOTE(review): the purpose of SIZE is not evident from this header alone —
// presumably a shared capacity/size constant for files that include it;
// confirm against its users.
const int SIZE = 8;

#endif  // MMCV_OPS_CSRC_COMMON__UTIL_HPP_


================================================
FILE: mmcv/ops/csrc/pytorch/npu/deform_roi_pool.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// Entry points that the NPU functions below are registered for at the end of
// this file; only declared here.
void deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,
                                  Tensor output, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int sampling_ratio, float gamma);

void deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,
                                   Tensor rois, Tensor offset,
                                   Tensor grad_input, Tensor grad_offset,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale, int sampling_ratio,
                                   float gamma);

// NPU forward for deform_roi_pool: runs the "DeformableRoiPool" operator on
// `input`, `rois` and `offset`, writing into `output`.
// The pooled size is passed as the two-element attribute "output_size"
// ({pooled_height, pooled_width}); `sampling_ratio` is widened to 64 bits.
void deform_roi_pool_forward_npu(Tensor input, Tensor rois, Tensor offset,
                                 Tensor output, int pooled_height,
                                 int pooled_width, float spatial_scale,
                                 int sampling_ratio, float gamma) {
  int64_t sampling_ratio_i64 = static_cast<int64_t>(sampling_ratio);
  // `pooled_hw` owns the storage that the IntArrayRef below points at.
  c10::SmallVector<int64_t, 2> pooled_hw = {pooled_height, pooled_width};
  at::IntArrayRef pooled_hw_ref(pooled_hw);

  OpCommand cmd;
  cmd.Name("DeformableRoiPool")
      .Input(input)
      .Input(rois)
      .Input(offset)
      .Output(output)
      .Attr("spatial_scale", spatial_scale)
      .Attr("output_size", pooled_hw_ref)
      .Attr("sampling_ratio", sampling_ratio_i64)
      .Attr("gamma", gamma)
      .Run();
}

// NPU backward for deform_roi_pool: runs the "DeformableRoiPoolGrad" operator
// on `grad_output`, `input`, `rois` and `offset`, writing into `grad_input`
// and `grad_offset`.
// The pooled size is passed as the two-element attribute "output_size"
// ({pooled_height, pooled_width}); `sampling_ratio` is widened to 64 bits.
void deform_roi_pool_backward_npu(Tensor grad_output, Tensor input, Tensor rois,
                                  Tensor offset, Tensor grad_input,
                                  Tensor grad_offset, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int sampling_ratio, float gamma) {
  int64_t sampling_ratio_i64 = static_cast<int64_t>(sampling_ratio);
  // `pooled_hw` owns the storage that the IntArrayRef below points at.
  c10::SmallVector<int64_t, 2> pooled_hw = {pooled_height, pooled_width};
  at::IntArrayRef pooled_hw_ref(pooled_hw);

  OpCommand cmd;
  cmd.Name("DeformableRoiPoolGrad")
      .Input(grad_output)
      .Input(input)
      .Input(rois)
      .Input(offset)
      .Output(grad_input)
      .Output(grad_offset)
      .Attr("output_size", pooled_hw_ref)
      .Attr("spatial_scale", spatial_scale)
      .Attr("sampling_ratio", sampling_ratio_i64)
      .Attr("gamma", gamma)
      .Run();
}

// Bind the NPU functions above to the deform_roi_pool_{forward,backward}_impl
// entry points.
REGISTER_NPU_IMPL(deform_roi_pool_forward_impl, deform_roi_pool_forward_npu);

REGISTER_NPU_IMPL(deform_roi_pool_backward_impl, deform_roi_pool_backward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/diff_iou_rotated_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"
using namespace NPU_NAME_SPACE;
using namespace std;

// NPU implementation of diff_iou_rotated_sort_vertices.
// Validates the ranks of the three inputs, converts `mask` to float and calls
// aclnnDiffIouRotatedSortVertices. Returns a freshly allocated tensor of shape
// (vertices.size(0), vertices.size(1), 9) that uses `num_valid`'s options.
Tensor diff_iou_rotated_sort_vertices_npu(Tensor vertices, Tensor mask,
                                          Tensor num_valid) {
  TORCH_CHECK(vertices.dim() == 4,
              "vertices must be a 4D Tensor, but got: ", vertices.dim());
  TORCH_CHECK(mask.dim() == 3,
              "mask must be a 3D Tensor, but got: ", mask.dim());
  TORCH_CHECK(num_valid.dim() == 2,
              "num_valid must be a 2D Tensor, but got: ", num_valid.dim());

  // Leading two extents of `vertices` define the result's leading extents.
  uint32_t extent0 = vertices.size(0);
  uint32_t extent1 = vertices.size(1);

  at::Tensor mask_as_float = mask.to(at::kFloat);
  at::Tensor sorted_indices =
      at::empty({extent0, extent1, 9}, num_valid.options());

  EXEC_NPU_CMD(aclnnDiffIouRotatedSortVertices, vertices, mask_as_float,
               num_valid, sorted_indices);

  return sorted_indices;
}

// Declaration of the entry point that the registration below binds the NPU
// function to; its definition is not part of this file.
Tensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,
                                                   Tensor num_valid);

REGISTER_NPU_IMPL(diff_iou_rotated_sort_vertices_forward_impl,
                  diff_iou_rotated_sort_vertices_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/focal_loss_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"
using namespace NPU_NAME_SPACE;
using namespace std;

// NPU forward for sigmoid focal loss: runs the "SigmoidFocalLoss" operator
// with reduction "none" and writes into `output`.
//   target_y : for a single class, 1 - target reshaped to input's shape;
//              otherwise one_hot(target, n_class). Converted to int.
//   weight_y : weight[target] expanded to input's shape when `weight` is
//              non-empty, otherwise all ones.
void sigmoid_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight,
                                    Tensor output, float gamma, float alpha) {
  int64_t n_class = input.size(1);

  // Both branches assign target_y, so no placeholder tensor is allocated.
  at::Tensor target_y;
  if (n_class == 1) {
    target_y = at::reshape(target, input.sizes());
    target_y = at::mul(target_y, -1.0);
    target_y = at::add(target_y, 1.0);
  } else {
    target_y = at::one_hot(target, n_class);
  }
  target_y = target_y.to(at::kInt);

  // Only materialize the all-ones weight when no weight was supplied.
  at::Tensor weight_y;
  if (weight.size(0) > 0) {
    at::Tensor weight_selected = weight.gather(0, target);
    weight_selected = weight_selected.unsqueeze(1);
    weight_y = weight_selected.expand_as(input);
  } else {
    weight_y = at::ones_like(input);
  }

  OpCommand cmd;
  string reduction = "none";
  cmd.Name("SigmoidFocalLoss")
      .Input(input)
      .Input(target_y)
      .Input(weight_y)
      .Output(output)
      .Attr("gamma", gamma)
      .Attr("alpha", alpha)
      .Attr("reduction", reduction)
      .Run();
}

// Entry point that sigmoid_focal_loss_forward_npu is registered for at the end
// of this file; only declared here.
void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha);

// NPU backward for sigmoid focal loss: runs the "SigmoidFocalLossGrad"
// operator with reduction "none" and an all-ones upstream gradient, writing
// into `grad_input`.
//   target_y : for a single class, target reshaped to input's shape;
//              otherwise 1 - one_hot(target, n_class). Converted to int.
//   weight_y : `weight` broadcast to input's shape when non-empty, otherwise
//              all ones.
// NOTE(review): the forward function inverts the target in the opposite
// branch and gathers weight by target instead of broadcasting it — confirm
// that this asymmetry matches the operators' contracts.
void sigmoid_focal_loss_backward_npu(Tensor input, Tensor target, Tensor weight,
                                     Tensor grad_input, float gamma,
                                     float alpha) {
  int64_t n_class = input.size(1);

  // Both branches assign target_y, so no placeholder tensor is allocated.
  at::Tensor target_y;
  if (n_class == 1) {
    target_y = at::reshape(target, input.sizes());
  } else {
    target_y = at::one_hot(target, n_class);
    target_y = at::mul(target_y, -1.0);
    target_y = at::add(target_y, 1.0);
  }
  target_y = target_y.to(at::kInt);

  at::Tensor grad_up = at::ones_like(input);

  // Only materialize the all-ones weight when no weight was supplied.
  at::Tensor weight_y;
  if (weight.size(0) > 0) {
    weight_y = at::broadcast_to(weight, input.sizes());
  } else {
    weight_y = at::ones_like(input);
  }

  OpCommand cmd;
  string reduction = "none";
  cmd.Name("SigmoidFocalLossGrad")
      .Input(input)
      .Input(target_y)
      .Input(grad_up)
      .Input(weight_y)
      .Output(grad_input)
      .Attr("gamma", gamma)
      .Attr("alpha", alpha)
      .Attr("reduction", reduction)
      .Run();
}

// Entry point that sigmoid_focal_loss_backward_npu is registered for at the
// end of this file; only declared here.
void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
                                      Tensor weight, Tensor grad_input,
                                      float gamma, float alpha);

// NPU forward for softmax focal loss.
// Step 1: run "SoftmaxFocalLoss" (reduction "none") into a temporary tensor
//         shaped like `input`.
// Step 2: run "Slice" with offsets {0, 0} and sizes {input.size(0), 1} on that
//         temporary, writing the slice into `output`.
//   target_y : one_hot(target, n_class) converted to int.
//   weight_y : `weight` broadcast to input's shape when non-empty, otherwise
//              all ones.
void softmax_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight,
                                    Tensor output, float gamma, float alpha) {
  int64_t n_class = input.size(1);
  at::Tensor target_y = at::one_hot(target, n_class);
  target_y = target_y.to(at::kInt);

  // Only materialize the all-ones weight when no weight was supplied.
  at::Tensor weight_y;
  if (weight.size(0) > 0) {
    weight_y = at::broadcast_to(weight, input.sizes());
  } else {
    weight_y = at::ones_like(input);
  }

  at::Tensor op_output = at::ones_like(input);
  OpCommand cmd;
  string reduction = "none";
  cmd.Name("SoftmaxFocalLoss")
      .Input(input)
      .Input(target_y)
      .Input(weight_y)
      .Output(op_output)
      .Attr("gamma", gamma)
      .Attr("alpha", alpha)
      .Attr("reduction", reduction)
      .Run();

  // Slice parameters are built directly in the container type handed to the
  // operator; no intermediate array views or copy loops are required.
  int64_t n_batch = input.size(0);
  c10::SmallVector<int64_t, 8> offsetVec = {0, 0};
  c10::SmallVector<int64_t, 8> sizeVec = {n_batch, 1};

  OpCommand cmd2;
  cmd2.Name("Slice")
      .Input(op_output)
      .Input(offsetVec)
      .Input(sizeVec)
      .Output(output)
      .Run();
}

// Entry point that softmax_focal_loss_forward_npu is registered for at the end
// of this file; only declared here. The fourth parameter is named `grad_input`
// in this declaration but `output` in the NPU function.
void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
                                     Tensor grad_input, float gamma,
                                     float alpha);

// NPU backward for softmax focal loss: runs the "SoftmaxFocalLossGrad"
// operator with reduction "none" and an all-ones upstream gradient, writing
// into `grad_input`. `buff` is accepted for interface compatibility and not
// read here.
//   target_y : one_hot(target, n_class) converted to int.
//   weight_y : `weight` broadcast to input's shape when non-empty, otherwise
//              all ones.
void softmax_focal_loss_backward_npu(Tensor input, Tensor target, Tensor weight,
                                     Tensor buff, Tensor grad_input,
                                     float gamma, float alpha) {
  int64_t n_class = input.size(1);
  at::Tensor target_y = at::one_hot(target, n_class);
  target_y = target_y.to(at::kInt);

  at::Tensor grad_up = at::ones_like(input);

  // Only materialize the all-ones weight when no weight was supplied.
  at::Tensor weight_y;
  if (weight.size(0) > 0) {
    weight_y = at::broadcast_to(weight, input.sizes());
  } else {
    weight_y = at::ones_like(input);
  }

  OpCommand cmd;
  string reduction = "none";
  cmd.Name("SoftmaxFocalLossGrad")
      .Input(input)
      .Input(target_y)
      .Input(grad_up)
      .Input(weight_y)
      .Output(grad_input)
      .Attr("gamma", gamma)
      .Attr("alpha", alpha)
      .Attr("reduction", reduction)
      .Run();
}

// Entry point for the softmax backward pass; only declared in this file.
void softmax_focal_loss_backward_impl(Tensor input, Tensor target,
                                      Tensor weight, Tensor buff,
                                      Tensor grad_input, float gamma,
                                      float alpha);

// Bind the four NPU focal-loss functions to their *_impl entry points.
REGISTER_NPU_IMPL(sigmoid_focal_loss_forward_impl,
                  sigmoid_focal_loss_forward_npu);

REGISTER_NPU_IMPL(sigmoid_focal_loss_backward_impl,
                  sigmoid_focal_loss_backward_npu);

REGISTER_NPU_IMPL(softmax_focal_loss_forward_impl,
                  softmax_focal_loss_forward_npu);

REGISTER_NPU_IMPL(softmax_focal_loss_backward_impl,
                  softmax_focal_loss_backward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/furthest_point_sample_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// NPU forward for furthest point sampling.
// Requires points_tensor.size(1) >= m, then calls aclnnFurthestPointSampling
// with a contiguous transpose(1, 2) copy of the points, a contiguous copy of
// `temp_tensor`, the sample count `m`, and `idx_tensor` as the last argument.
// `b` and `n` are not read here.
void furthest_point_sampling_forward_npu(Tensor points_tensor,
                                         Tensor temp_tensor, Tensor idx_tensor,
                                         int b, int n, int m) {
  TORCH_CHECK(
      (points_tensor.size(1) >= m),
      "the num of sampled points should smaller than total num of points.");

  at::Tensor distance_buffer = temp_tensor.contiguous();
  // Swap dimensions 1 and 2 of the point tensor for the operator.
  at::Tensor points_transposed = points_tensor.transpose(1, 2).contiguous();

  EXEC_NPU_CMD(aclnnFurthestPointSampling, points_transposed, distance_buffer,
               m, idx_tensor);
}

// Declaration of the entry point that the registration below binds the NPU
// function to; its definition is not part of this file.
void furthest_point_sampling_forward_impl(Tensor points_tensor,
                                          Tensor temp_tensor, Tensor idx_tensor,
                                          int b, int n, int m);

REGISTER_NPU_IMPL(furthest_point_sampling_forward_impl,
                  furthest_point_sampling_forward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/furthest_point_sampling_with_dist_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"
using namespace NPU_NAME_SPACE;
using namespace std;

// NPU forward for furthest point sampling with a distance input.
// Requires points_tensor.sizes()[1] >= m, then passes the tensors unchanged to
// aclnnFurthestPointSamplingWithDist together with `m`; `idx_tensor` is the
// last argument. `b` and `n` are not read here.
void furthest_point_sampling_with_dist_npu(Tensor points_tensor,
                                           Tensor temp_tensor,
                                           Tensor idx_tensor, int b, int n,
                                           int m) {
  TORCH_CHECK(
      (points_tensor.sizes()[1] >= m),
      "the num of sampled points should smaller than total num of points.");
  EXEC_NPU_CMD(aclnnFurthestPointSamplingWithDist, points_tensor, temp_tensor,
               m, idx_tensor);
}

// Declaration of the entry point that the registration below binds the NPU
// function to; its definition is not part of this file.
void furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor,
                                                    Tensor temp_tensor,
                                                    Tensor idx_tensor, int b,
                                                    int n, int m);

REGISTER_NPU_IMPL(furthest_point_sampling_with_dist_forward_impl,
                  furthest_point_sampling_with_dist_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/fused_bias_leakyrelu_npu.cpp
================================================
#include "common_util.h"
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// Entry point that fused_bias_leakyrelu_npu is registered for at the end of
// this file; only declared here.
Tensor fused_bias_leakyrelu_op_impl(const Tensor &input, const Tensor &bias,
                                    const Tensor &refer, int act, int grad,
                                    float alpha, float scale);

// NPU implementation of fused_bias_leakyrelu.
//   grad == 0 : forward. `bias` is reshaped so that only dimension 1 keeps its
//               extent (for inputs with more than one dimension), broadcast to
//               input's shape, and passed to "FusedBiasLeakyRelu".
//   grad == 1 : backward. "FusedBiasLeakyReluGrad" is run on `input` and
//               `refer`.
// For any other `grad` value no operator runs and the uninitialized
// empty_like(input) tensor is returned. `act` is not read here.
Tensor fused_bias_leakyrelu_npu(const Tensor &input, const Tensor &bias,
                                const Tensor &refer, int act, int grad,
                                float alpha, float scale) {
  at::Tensor result = at::empty_like(input);

  if (grad == 0) {
    // Start from input's shape, then collapse every axis except axis 1.
    c10::SmallVector<int64_t, 8> bias_shape;
    for (const int64_t extent : input.sizes()) {
      bias_shape.emplace_back(extent);
    }
    const size_t rank = bias_shape.size();
    if (rank > 1) {
      for (size_t axis = 0; axis < rank; ++axis) {
        if (axis != 1) {
          bias_shape[axis] = 1;
        }
      }
    }

    // at::broadcast_to is used here in place of the NPU-native npu_broadcast.
    at::Tensor bias_reshaped = at::reshape(bias, bias_shape);
    at::Tensor bias_full = at::broadcast_to(bias_reshaped, input.sizes());

    OpCommand cmd;
    cmd.Name("FusedBiasLeakyRelu")
        .Input(input)
        .Input(bias_full)
        .Output(result)
        .Attr("scale", scale)
        .Attr("negative_slope", alpha)
        .Run();
  } else if (grad == 1) {
    OpCommand cmd;
    cmd.Name("FusedBiasLeakyReluGrad")
        .Input(input)
        .Input(refer)
        .Output(result)
        .Attr("scale", scale)
        .Attr("negative_slope", alpha)
        .Run();
  }

  return result;
}

// Bind fused_bias_leakyrelu_npu to the fused_bias_leakyrelu_op_impl entry
// point.
REGISTER_NPU_IMPL(fused_bias_leakyrelu_op_impl, fused_bias_leakyrelu_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/gather_points_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// NPU forward for gather_points: runs the "GatherV2" operator on `points` and
// `idx` with axis 2 and batch_dims 1, writing into `out`.
void gather_points_forward_npu(int b, int c, int n, int npoints,
                               const Tensor points, const Tensor idx,
                               Tensor out) {
  // b, c, n, and npoints do not need to be passed into gatherv2,
  // b, c, n, and npoints are calculated inside the operator
  // gatherv2 operator in ascend needs to set axis to 2, batch_dims is 1
  // `N` (the SmallVector capacity) is not declared in this file; it comes from
  // an included header/namespace.
  c10::SmallVector<int64_t, N> axis = {2};
  int64_t batch_dims = 1;

  OpCommand cmd;
  cmd.Name("GatherV2")
      .Input(points)
      .Input(idx)
      .Input(axis)
      .Output(out)
      .Attr("batch_dims", batch_dims)
      .Run();
}
// NPU backward for gather_points: scatters `grad_out` into `grad_points` with
// the "InplaceIndexAdd" operator along axis 0 of a flattened 2-D view.
// `c` and `npoints` are not read here.
void gather_points_backward_npu(int b, int c, int n, int npoints,
                                const Tensor grad_out, const Tensor idx,
                                Tensor grad_points) {
  // Work on int32 indices; a conversion is only made when idx is another type.
  at::Tensor indices = idx;
  if (idx.scalar_type() != at::ScalarType::Int) {
    indices = idx.to(at::kInt);
  }
  // NOTE(review): when idx is already Int, `indices` is the same tensor handle
  // as `idx`, so this in-place unsqueeze presumably changes the caller's idx
  // as well — confirm that this is intended.
  if (idx.dim() == 0) {
    indices.unsqueeze_(0);
  }
  int64_t dim = 0;
  // Copy idx's shape; it is the broadcast target for the batch offsets below.
  auto shape = idx.sizes();
  c10::SmallVector<int64_t, 8> pad_size;
  for (uint64_t i = 0; i < shape.size(); i++) {
    pad_size.emplace_back(shape[i]);
  }
  // Swap dims 1 and 2 and merge the first two dims into one, giving 2-D views
  // of the gradient buffer and of the incoming gradient.
  at::Tensor trans_grad_points = grad_points.transpose(1, 2).contiguous();
  at::Tensor grad_points_view = trans_grad_points.view(
      {trans_grad_points.sizes()[0] * trans_grad_points.sizes()[1],
       trans_grad_points.sizes()[2]});
  at::Tensor trans_grad_out = grad_out.transpose(1, 2).contiguous();
  trans_grad_out = trans_grad_out.view(
      {trans_grad_out.sizes()[0] * trans_grad_out.sizes()[1],
       trans_grad_out.sizes()[2]});
  // Per-batch offset (batch index * n) added to every index so that indices
  // address rows of the flattened view — assumes grad_points is (b, c, n);
  // TODO confirm.
  auto index = at::arange(0, b);
  index = index.to(grad_out.device());
  index = at::mul(index, n);
  index = index.view({b, 1});
  index = at::broadcast_to(index, pad_size);
  indices = at::add(index, indices);
  indices = indices.view({-1});
  // grad_points_view is both an input and the output of the operator.
  OpCommand cmd;
  cmd.Name("InplaceIndexAdd")
      .Input(grad_points_view)
      .Input(indices)
      .Input(trans_grad_out)
      .Output(grad_points_view)
      .Attr("axis", dim)
      .Run();
  // Restore the 3-D layout, undo the transpose and write back to the caller.
  at::Tensor grad_points_result =
      grad_points_view.view(trans_grad_points.sizes());
  grad_points_result = grad_points_result.transpose(1, 2);
  grad_points.copy_(grad_points_result);
}

// Entry points (declared only) and the registrations that bind the NPU
// forward/backward functions above to them.
void gather_points_forward_impl(int b, int c, int n, int npoints,
                                const Tensor points, const Tensor idx,
                                Tensor out);
void gather_points_backward_impl(int b, int c, int n, int npoints,
                                 const Tensor grad_out, const Tensor idx,
                                 Tensor grad_points);

REGISTER_NPU_IMPL(gather_points_forward_impl, gather_points_forward_npu);
REGISTER_NPU_IMPL(gather_points_backward_impl, gather_points_backward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/group_points_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// NPU forward for group_points: gathers rows of the flattened (b * n, c)
// feature matrix with the "GatherV2" operator and rearranges the result inside
// `out`.
void group_points_forward_npu(int b, int c, int n, int npoints, int nsample,
                              const Tensor points, const Tensor idx,
                              Tensor out) {
  // b, c, n, and npoints do not need to be passed into gatherv2,
  // b, c, n, and npoints are calculated inside the operator
  // gatherv2 operator in ascend needs to set axis to 0, batch_dims is 0
  // `N` (the SmallVector capacity) is not declared in this file; it comes from
  // an included header/namespace.
  c10::SmallVector<int64_t, N> axis = {0};
  int64_t batch_dims = 0;

  // Add batch_index * n to every index so it addresses a row of the flattened
  // feature matrix, then flatten the indices to 1-D.
  auto index = at::arange(0, b);
  index = index.to(points.device());
  index = index.view({-1, 1, 1});
  index = at::mul(index, n);
  at::Tensor indices = at::add(index, idx);
  indices = indices.view({-1});

  // Swap dims 1 and 2 of the features and flatten them to (b * n, c).
  at::Tensor trans_features = points.transpose(1, 2);
  at::Tensor features = trans_features.contiguous();
  features = features.view({b * n, c});

  OpCommand cmd;
  cmd.Name("GatherV2")
      .Input(features)
      .Input(indices)
      .Input(axis)
      .Output(out)
      .Attr("batch_dims", batch_dims)
      .Run();

  // The operator result sits in out's storage; reinterpret it as
  // (b, npoints, nsample, c), move the c axis to position 1 and copy the
  // contiguous result back into `out`.
  at::Tensor output =
      out.view({b, npoints, nsample, c}).transpose(1, 3).transpose(2, 3);
  at::Tensor res = output.contiguous();
  out.copy_(res);
}

void group_points_backward_npu(int b, int c, int n, int npoints, int nsample,
                               const Tensor grad_out, const Tensor idx,
                               Tensor grad_features) {
  at::Tensor trans_idx = idx.view({b * npoints * nsample});
  at::Tensor trans_grad_out = grad_out.permute({0, 2, 3, 1});
  at::Tensor grad_out_tensor = trans_grad_out.contiguous();
  grad_out_tensor = grad_out_tensor.view({b * npoints * nsample, c});
  at::Tensor out = at::zeros({b, n, c}, grad_out.options());

  EXEC_NPU_CMD(aclnnGroupPointsGrad, grad_out_tensor, trans_idx, b, c, n,
               npoints, nsample, out);

  at::Tensor grad_points = out.transpose(1, 2);

  grad_features.copy_(grad_points);
}

// Declarations of the group_points entry points; they are only declared here,
// not defined in this file.
void group_points_forward_impl(int b, int c, int n, int npoints, int nsample,
                               const Tensor points, const Tensor idx,
                               Tensor out);
void group_points_backward_impl(int b, int c, int n, int npoints, int nsample,
                                const Tensor points, const Tensor idx,
                                Tensor out);

// Pair each entry point with its NPU implementation above. Presumably this
// hooks into the device dispatch registry -- confirm against the definition
// of REGISTER_NPU_IMPL.
REGISTER_NPU_IMPL(group_points_forward_impl, group_points_forward_npu);
REGISTER_NPU_IMPL(group_points_backward_impl, group_points_backward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/knn_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"
#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
#include "torch_npu/csrc/framework/utils/OpAdapter.h"

using namespace NPU_NAME_SPACE;
using namespace std;

void knn_forward_npu(int b, int n, int m, int nsample, const Tensor xyz,
                     const Tensor new_xyz, Tensor idx, Tensor dist2) {
  // transpose known from [B, N, 3] to [B, 3, N]
  at::Tensor source = xyz.transpose(2, 1).contiguous();
  at::Tensor target = new_xyz.contiguous();

  bool is_from_knn = true;
  EXEC_NPU_CMD(aclnnKnn, source, target, is_from_knn, nsample, dist2, idx);
}

// Declaration of the KNN entry point; it is not defined in this file.
void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
                      const Tensor new_xyz, Tensor idx, Tensor dist2);

// Pair the entry point with the NPU implementation above (presumably through
// the device dispatch registry -- confirm against REGISTER_NPU_IMPL).
REGISTER_NPU_IMPL(knn_forward_impl, knn_forward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/ms_deform_attn_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// Declaration of the multi-scale deformable attention forward entry point.
// It is declared here so it can be named in REGISTER_NPU_IMPL further down;
// its definition is not in this file.
Tensor ms_deform_attn_impl_forward(const Tensor &value,
                                   const Tensor &value_spatial_shapes,
                                   const Tensor &value_level_start_index,
                                   const Tensor &sampling_locations,
                                   const Tensor &attention_weights,
                                   const int im2col_step);

// Validates the inputs accepted by the NPU deformable-attention kernels.
//
// Raises (through TORCH_CHECK) unless `value` is float32 or float16,
// value.size(2) is within [4, 8], value.size(3) is within [32, 256] and
// attention_weights.size(4) is within [4, 8].
void check_support(const Tensor &value, const Tensor &attention_weights) {
  const auto value_dtype = value.scalar_type();
  const bool dtype_supported =
      value_dtype == at::kFloat || value_dtype == at::kHalf;
  TORCH_CHECK(dtype_supported, "Dtype of value should be float32 or float16.");

  // All three sizes are read before any range check is evaluated.
  const int64_t num_heads = value.size(2);
  const int64_t embed_dims = value.size(3);
  const int64_t num_points = attention_weights.size(4);

  const bool heads_ok = 4 <= num_heads && num_heads <= 8;
  const bool embed_ok = 32 <= embed_dims && embed_dims <= 256;
  const bool points_ok = 4 <= num_points && num_points <= 8;

  TORCH_CHECK(heads_ok, "num_heads should be in the range of [4, 8]");
  TORCH_CHECK(embed_ok, "embed_dims should be in the range of [32, 256]");
  TORCH_CHECK(points_ok, "num_points should be in the range of [4, 8]");
}

// NPU forward pass of multi-scale deformable attention.
//
// After check_support() passes, floating inputs are converted to float32 and
// the shape / start-index tensors to int32 (only when they are not already in
// that dtype). The kernel writes into a zero-initialised float32 tensor of
// shape (value.size(0), sampling_locations.size(1),
// value.size(2) * value.size(3)), which is converted back to the dtype of
// `value` before being returned. `im2col_step` is not used here.
Tensor ms_deform_attn_forward_npu(const Tensor &value,
                                  const Tensor &value_spatial_shapes,
                                  const Tensor &value_level_start_index,
                                  const Tensor &sampling_locations,
                                  const Tensor &attention_weights,
                                  const int im2col_step) {
  check_support(value, attention_weights);

  // Returns `t` itself when it already has the wanted dtype.
  const auto with_dtype = [](const at::Tensor &t, at::ScalarType wanted) {
    return t.scalar_type() != wanted ? t.to(wanted) : t;
  };

  at::Tensor value_f32 = with_dtype(value, at::kFloat);
  at::Tensor shapes_i32 = with_dtype(value_spatial_shapes, at::kInt);
  at::Tensor start_index_i32 = with_dtype(value_level_start_index, at::kInt);
  at::Tensor locations_f32 = with_dtype(sampling_locations, at::kFloat);
  at::Tensor weights_f32 = with_dtype(attention_weights, at::kFloat);

  c10::SmallVector<int64_t, 3> result_shape = {
      value.size(0), sampling_locations.size(1), value.size(2) * value.size(3)};
  at::Tensor result_f32 = at::zeros(result_shape, value_f32.options());

  EXEC_NPU_CMD(aclnnMultiScaleDeformableAttnFunction, value_f32, shapes_i32,
               start_index_i32, locations_f32, weights_f32, result_f32);

  // Hand the result back in the caller's original precision.
  const auto caller_dtype = value.scalar_type();
  if (caller_dtype == at::kFloat) {
    return result_f32;
  }
  return result_f32.to(caller_dtype);
}

// Pair the forward entry point with the NPU implementation above (presumably
// through the device dispatch registry -- confirm against REGISTER_NPU_IMPL).
REGISTER_NPU_IMPL(ms_deform_attn_impl_forward, ms_deform_attn_forward_npu);

// Declaration of the backward entry point; it is not defined in this file.
void ms_deform_attn_impl_backward(
    const Tensor &value, const Tensor &spatial_shapes,
    const Tensor &level_start_index, const Tensor &sampling_loc,
    const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,
    Tensor &grad_sampling_loc, Tensor &grad_attn_weight, const int im2col_step);

// NPU backward pass of multi-scale deformable attention.
//
// Runs the same check_support() validation as the forward pass and then hands
// every tensor straight to aclnnMultiScaleDeformableAttentionGrad, which is
// given grad_value, grad_sampling_loc and grad_attn_weight as its trailing
// arguments. `im2col_step` is not used here.
//
// NOTE(review): unlike ms_deform_attn_forward_npu, no dtype conversion is
// applied to the inputs before the kernel call -- confirm the kernel accepts
// every dtype that check_support() lets through.
void ms_deform_attn_backward_npu(
    const Tensor &value, const Tensor &spatial_shapes,
    const Tensor &level_start_index, const Tensor &sampling_loc,
    const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,
    Tensor &grad_sampling_loc, Tensor &grad_attn_weight,
    const int im2col_step) {
  check_support(value, attn_weight);
  EXEC_NPU_CMD(aclnnMultiScaleDeformableAttentionGrad, value, spatial_shapes,
               level_start_index, sampling_loc, attn_weight, grad_output,
               grad_value, grad_sampling_loc, grad_attn_weight);
}

// Pair the backward entry point with the NPU implementation above (presumably
// through the device dispatch registry -- confirm against REGISTER_NPU_IMPL).
REGISTER_NPU_IMPL(ms_deform_attn_impl_backward, ms_deform_attn_backward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/nms3d_normal_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;

// NPU implementation of "normal" 3D NMS in two kernel launches.
//
// aclnnNms3dNormal fills an int16 mask of shape (box_num, mask_num), where
// mask_num is box_num rounded up to a multiple of 16 (and 16 when box_num is
// 0). aclnnGatherNms3dMask then reduces that mask into the kept indices and
// their count, which are copied into `keep` and `num_out`.
void iou3d_nms3d_normal_forward_npu(const Tensor boxes, Tensor &keep,
                                    Tensor &num_out, float nms_overlap_thresh) {
  constexpr int32_t kMaskAlign = 16;
  const double iou_threshold = nms_overlap_thresh;

  const int32_t box_num = boxes.size(0);
  const int32_t mask_num = ((box_num - 1) / kMaskAlign + 1) * kMaskAlign;

  const auto mask_options = boxes.options().dtype(at::kShort);
  at::Tensor mask = at::empty({box_num, mask_num}, mask_options);
  EXEC_NPU_CMD(aclnnNms3dNormal, boxes, iou_threshold, mask);

  Tensor keep_t = at::zeros({box_num}, mask.options());
  Tensor num_out_t = at::zeros(1, mask.options());
  EXEC_NPU_CMD(aclnnGatherNms3dMask, mask, keep_t, num_out_t);

  // item() reads the count back from the one-element result tensor.
  const auto kept_count = num_out_t.item().toLong();
  num_out.fill_(kept_count);
  keep.copy_(keep_t);
}

// Declaration of the normal-NMS entry point; it is not defined in this file.
void iou3d_nms3d_normal_forward_impl(const Tensor boxes, Tensor &keep,
                                     Tensor &num_out, float nms_overlap_thresh);

// Pair the entry point with the NPU implementation above (presumably through
// the device dispatch registry -- confirm against REGISTER_NPU_IMPL).
REGISTER_NPU_IMPL(iou3d_nms3d_normal_forward_impl,
                  iou3d_nms3d_normal_forward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/nms3d_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

constexpr int32_t BOX_DIM = 7;

void iou3d_nms3d_forward_npu(const Tensor boxes, Tensor &keep, Tensor &num_out,
                             float nms_overlap_thresh) {
  TORCH_CHECK((boxes.sizes()[1] == BOX_DIM),
              "Input boxes shape should be (N, 7)");
  int32_t box_num = boxes.size(0);
  int32_t data_align = 16;
  int32_t mask_num = ((box_num - 1) / data_align + 1) * data_align;
  const double iou_threshold = nms_overlap_thresh;
  at::Tensor mask =
      at::empty({box_num, mask_num}, boxes.options().dtype(at::kShort));
  EXEC_NPU_CMD(aclnnNms3d, boxes, iou_threshold, mask);

  Tensor keep_t = at::zeros({box_num}, mask.options());
  Tensor num_out_t = at::zeros(1, mask.options());
  EXEC_NPU_CMD(aclnnGatherNms3dMask, mask, keep_t, num_out_t);
  num_out.fill_(num_out_t.item().toLong());
  keep.copy_(keep_t);
}

// Declaration of the 3D NMS entry point; it is not defined in this file.
void iou3d_nms3d_forward_impl(const Tensor boxes, Tensor &keep, Tensor &num_out,
                              float nms_overlap_thresh);

// Pair the entry point with the NPU implementation above (presumably through
// the device dispatch registry -- confirm against REGISTER_NPU_IMPL).
REGISTER_NPU_IMPL(iou3d_nms3d_forward_impl, iou3d_nms3d_forward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/nms_npu.cpp
================================================
#include "common_util.h"
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// NPU implementation of 2D NMS on top of the NonMaxSuppressionV3 operator.
//
// `boxes` must be float32. The IoU threshold, a score threshold of 0 and a
// maximum output size equal to the number of boxes are passed as scalar
// tensors. The operator writes into an int32 buffer pre-filled with -1; the
// number of entries greater than -1 gives the length of the leading slice
// that is returned as int64 indices.
Tensor nms_npu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  TORCH_CHECK((boxes.scalar_type() == at::ScalarType::Float),
              "The type of boxes tensor passed in nms_npu should be float");

  const auto float_options = boxes.options().dtype(at::kFloat);
  const auto int_options = boxes.options().dtype(at::kInt);
  const int64_t box_count = boxes.size(0);
  int64_t offset_64 = offset;

  // Zero-dimensional tensors carrying the operator's scalar inputs.
  at::Tensor iou_threshold_t = at::empty({}, float_options).fill_(iou_threshold);
  at::Tensor score_threshold_t = at::empty({}, float_options).fill_(0);
  at::Tensor max_output_t = at::empty({}, int_options).fill_(box_count);

  c10::SmallVector<int64_t, 8> selected_shape = {box_count};
  at::Tensor selected = at::empty(selected_shape, int_options).fill_(-1);

  OpCommand cmd;
  cmd.Name("NonMaxSuppressionV3")
      .Input(boxes)
      .Input(scores)
      .Input(max_output_t)
      .Input(iou_threshold_t)
      .Input(score_threshold_t)
      .Attr("offset", offset_64)
      .Output(selected)
      .Run();

  // Count the slots that no longer hold the -1 filler.
  auto valid_count = at::sum(at::gt(selected, -1).to(at::kInt), at::kInt);
  const auto keep_len = valid_count.item().toLong();
  return selected.slice(0, 0, keep_len).to(at::kLong);
}

// Declaration of the NMS entry point; it is not defined in this file.
Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset);

// Pair the entry point with the NPU implementation above (presumably through
// the device dispatch registry -- confirm against REGISTER_NPU_IMPL).
REGISTER_NPU_IMPL(nms_impl, nms_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/nms_rotated_npu.cpp
================================================
#include "common_util.h"
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;

// NPU implementation of rotated NMS on top of the RotatedNMS operator.
//
// When `dets` is not float32, both `dets` and `scores` are converted to
// float32 before the call. The operator produces the selected boxes and the
// selected indices; only the indices are returned, converted to int64.
// The command is run with Sync() on outputs 0 and 1.
Tensor nms_rotated_npu(const Tensor dets, const Tensor scores,
                       const Tensor labels, const float iou_threshold) {
  auto originDtype = dets.scalar_type();
  at::Tensor detsCast = dets;
  at::Tensor scoresCast = scores;
  if (originDtype != at::kFloat) {
    detsCast = detsCast.to(at::kFloat);
    scoresCast = scoresCast.to(at::kFloat);
  }
  c10::SmallVector<int64_t, 8> selectedIndexSize = {dets.size(0)};

  // Allocate the box output from the converted tensor so that its dtype
  // matches the boxes actually fed to the operator, not the caller's dtype.
  at::Tensor selectedBox = at::empty_like(detsCast);
  at::Tensor selectedIndex =
      at::empty(selectedIndexSize, dets.options().dtype(at::kInt));

  c10::SmallVector<int64_t, N> output_sync_idx = {0, 1};
  OpCommand cmd;
  cmd.Sync(output_sync_idx)
      .Name("RotatedNMS")
      .Input(detsCast)
      .Input(scoresCast)
      .Input(labels)
      .Output(selectedBox)
      .Output(selectedIndex)
      .Attr("iou_threshold", (float)iou_threshold)
      .Attr("is_angle", false)
      .Run();
  selectedIndex = selectedIndex.to(at::kLong);
  return selectedIndex;
}


================================================
FILE: mmcv/ops/csrc/pytorch/npu/points_in_box_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// NPU implementation of points_in_boxes_part, delegated to aclnnPointsInBox.
//
// `boxes` has axes 1 and 2 swapped and is made contiguous before the call;
// the kernel writes its result into `box_idx_of_points`. The scalar arguments
// batch_size, boxes_num and pts_num are not used by this implementation.
void points_in_boxes_part_forward_impl_npu(int batch_size, int boxes_num,
                                           int pts_num, const Tensor boxes,
                                           const Tensor pts,
                                           Tensor box_idx_of_points) {
  auto boxes_trans = boxes.transpose(1, 2).contiguous();
  EXEC_NPU_CMD(aclnnPointsInBox, boxes_trans, pts, box_idx_of_points);
}
// Declaration of the points_in_boxes_part entry point; it is not defined in
// this file.
void points_in_boxes_part_forward_impl(int batch_size, int boxes_num,
                                       int pts_num, const Tensor boxes,
                                       const Tensor pts,
                                       Tensor box_idx_of_points);
// Pair the entry point with the NPU implementation above (presumably through
// the device dispatch registry -- confirm against REGISTER_NPU_IMPL).
REGISTER_NPU_IMPL(points_in_boxes_part_forward_impl,
                  points_in_boxes_part_forward_impl_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/points_in_box_npu_all.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// NPU implementation of points_in_boxes_all, delegated to aclnnPointsInBoxAll.
//
// `boxes` has axes 1 and 2 swapped and is made contiguous before the call;
// the kernel writes its result into `box_idx_of_points`. The scalar arguments
// batch_size, boxes_num and pts_num are not used by this implementation.
void points_in_boxes_all_forward_impl_npu(int batch_size, int boxes_num,
                                          int pts_num, const Tensor boxes,
                                          const Tensor pts,
                                          Tensor box_idx_of_points) {
  auto boxes_trans = boxes.transpose(1, 2).contiguous();
  EXEC_NPU_CMD(aclnnPointsInBoxAll, boxes_trans, pts, box_idx_of_points);
}
// Declaration of the points_in_boxes_all entry point; it is not defined in
// this file.
void points_in_boxes_all_forward_impl(int batch_size, int boxes_num,
                                      int pts_num, const Tensor boxes,
                                      const Tensor pts,
                                      Tensor box_idx_of_points);
// Pair the entry point with the NPU implementation above (presumably through
// the device dispatch registry -- confirm against REGISTER_NPU_IMPL).
REGISTER_NPU_IMPL(points_in_boxes_all_forward_impl,
                  points_in_boxes_all_forward_impl_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/points_in_polygons_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// Largest first-dimension size of `polygons` accepted by the check below.
constexpr int32_t MAX_POLYGONS_BATCH = 2800;

// NPU implementation of points_in_polygons via the PointsInPolygons operator.
//
// Raises through TORCH_CHECK when polygons.size(0) exceeds
// MAX_POLYGONS_BATCH. `polygons` is transposed on its first two axes and made
// contiguous before being passed to the operator, which writes into `output`.
// `rows` and `cols` are not used by this implementation.
void points_in_polygons_npu(const Tensor points, Tensor polygons, Tensor output,
                            const int rows, const int cols) {
  // The limit is inclusive, so report it as such and show the actual numbers.
  TORCH_CHECK((polygons.sizes()[0] <= MAX_POLYGONS_BATCH),
              "The batch of polygons tensor must not exceed ",
              MAX_POLYGONS_BATCH, ", but got ", polygons.sizes()[0]);
  at::Tensor trans_polygons = polygons.transpose(0, 1);
  OpCommand cmd;
  at::Tensor new_trans_polygons = trans_polygons.contiguous();
  cmd.Name("PointsInPolygons")
      .Input(points, (string) "points")
      .Input(new_trans_polygons, (string) "polygons")
      .Output(output)
      .Run();
}

// Declaration of the points_in_polygons entry point; it is not defined in
// this file.
void points_in_polygons_forward_impl(const Tensor points, Tensor polygons,
                                     Tensor output, const int rows,
                                     const int cols);

// Pair the entry point with the NPU implementation above (presumably through
// the device dispatch registry -- confirm against REGISTER_NPU_IMPL).
REGISTER_NPU_IMPL(points_in_polygons_forward_impl, points_in_polygons_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/psa_mask_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// NPU forward pass of PSAMask: runs the "PSAMask" operator with `x` as input
// and `y` as output. Every integer argument is widened to int64_t and passed
// as an operator attribute of the same name.
void psamask_forward_npu(const int psa_type, const Tensor x, Tensor y,
                         const int num, const int h_feature,
                         const int w_feature, const int h_mask,
                         const int w_mask, const int half_h_mask,
                         const int half_w_mask) {
  // Attributes are handed over as 64-bit integers.
  const auto to_i64 = [](int v) -> int64_t { return v; };

  OpCommand cmd;
  cmd.Name("PSAMask").Input(x).Output(y);
  cmd.Attr("psa_type", to_i64(psa_type))
      .Attr("num", to_i64(num))
      .Attr("h_feature", to_i64(h_feature))
      .Attr("w_feature", to_i64(w_feature))
      .Attr("h_mask", to_i64(h_mask))
      .Attr("w_mask", to_i64(w_mask))
      .Attr("half_h_mask", to_i64(half_h_mask))
      .Attr("half_w_mask", to_i64(half_w_mask));
  cmd.Run();
}

// Declaration of the PSAMask forward entry point; it is not defined in this
// file and is named in a REGISTER_NPU_IMPL call further down.
void psamask_forward_impl(const int psa_type, const Tensor x, Tensor y,
                          const int num, const int h_feature,
                          const int w_feature, const int h_mask,
                          const int w_mask, const int half_h_mask,
                          const int half_w_mask);

// NPU backward pass of PSAMask: runs the "PSAMaskGrad" operator with `y_grad`
// as input and `x_grad` as output. Every integer argument is widened to
// int64_t and passed as an operator attribute of the same name.
void psamask_backward_npu(const int psa_type, const Tensor y_grad,
                          Tensor x_grad, const int num, const int h_feature,
                          const int w_feature, const int h_mask,
                          const int w_mask, const int half_h_mask,
                          const int half_w_mask) {
  // Attributes are handed over as 64-bit integers.
  const auto to_i64 = [](int v) -> int64_t { return v; };

  OpCommand cmd;
  cmd.Name("PSAMaskGrad").Input(y_grad).Output(x_grad);
  cmd.Attr("psa_type", to_i64(psa_type))
      .Attr("num", to_i64(num))
      .Attr("h_feature", to_i64(h_feature))
      .Attr("w_feature", to_i64(w_feature))
      .Attr("h_mask", to_i64(h_mask))
      .Attr("w_mask", to_i64(w_mask))
      .Attr("half_h_mask", to_i64(half_h_mask))
      .Attr("half_w_mask", to_i64(half_w_mask));
  cmd.Run();
}

// Declaration of the PSAMask backward entry point; it is not defined in this
// file.
void psamask_backward_impl(const int psa_type, const Tensor y_grad,
                           Tensor x_grad, const int num, const int h_feature,
                           const int w_feature, const int h_mask,
                           const int w_mask, const int half_h_mask,
                           const int half_w_mask);

// Pair both entry points with their NPU implementations (presumably through
// the device dispatch registry -- confirm against REGISTER_NPU_IMPL).
REGISTER_NPU_IMPL(psamask_forward_impl, psamask_forward_npu);
REGISTER_NPU_IMPL(psamask_backward_impl, psamask_backward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/roi_align_npu.cpp
================================================
#include "common_util.h"
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// NPU forward pass of RoIAlign via the "ROIAlign" operator.
//
// `input` must be float32. The operator's roi_end_mode attribute is 2 when
// `aligned` is true and 0 otherwise (a warning is logged in the latter case).
// argmax_y, argmax_x and pool_mode are not used by this implementation.
void roi_align_forward_npu(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax_y, Tensor argmax_x, int aligned_height,
                           int aligned_width, float spatial_scale,
                           int sampling_ratio, int pool_mode, bool aligned) {
  TORCH_CHECK(input.scalar_type() == at::kFloat,
              "input should be a float tensor");

  if (!aligned) {
    LOG(WARNING) << "The [aligned] attr in roi_align op is false";
  }
  int64_t roi_end_mode = aligned ? 2 : 0;

  // Integer attributes are passed to the operator as 64-bit values.
  int64_t pooled_height = aligned_height;
  int64_t pooled_width = aligned_width;
  int64_t sample_num = sampling_ratio;

  OpCommand cmd;
  cmd.Name("ROIAlign").Input(input).Input(rois).Output(output);
  cmd.Attr("spatial_scale", spatial_scale)
      .Attr("pooled_height", pooled_height)
      .Attr("pooled_width", pooled_width)
      .Attr("sample_num", sample_num)
      .Attr("roi_end_mode", roi_end_mode);
  cmd.Run();
}

// NPU backward pass of RoIAlign via the "ROIAlignGrad" operator.
//
// `grad_output` must be float32. The shape of `grad_input` is passed to the
// operator as the xdiff_shape attribute, and roi_end_mode is 2 when `aligned`
// is true and 0 otherwise (a warning is logged in the latter case).
// argmax_y, argmax_x and pool_mode are not used by this implementation.
void roi_align_backward_npu(Tensor grad_output, Tensor rois, Tensor argmax_y,
                            Tensor argmax_x, Tensor grad_input,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned) {
  TORCH_CHECK(grad_output.scalar_type() == at::kFloat,
              "input should be a float tensor");

  // Integer attributes are passed to the operator as 64-bit values.
  int64_t pooled_height = aligned_height;
  int64_t pooled_width = aligned_width;
  int64_t sample_num = sampling_ratio;

  if (!aligned) {
    LOG(WARNING) << "The [aligned] attr in roi_align_grad op is false";
  }
  int64_t roi_end_mode = aligned ? 2 : 0;

  // Copy the gradient's shape into the container type used for the attribute.
  const auto input_sizes = grad_input.sizes();
  c10::SmallVector<int64_t, 8> xdiff_shape(input_sizes.begin(),
                                           input_sizes.end());

  OpCommand cmd;
  cmd.Name("ROIAlignGrad").Input(grad_output).Input(rois).Output(grad_input);
  cmd.Attr("xdiff_shape", xdiff_shape)
      .Attr("pooled_width", pooled_width)
      .Attr("pooled_height", pooled_height)
      .Attr("spatial_scale", spatial_scale)
      .Attr("sample_num", sample_num)
      .Attr("roi_end_mode", roi_end_mode);
  cmd.Run();
}

// Declarations of the RoIAlign entry points; they are not defined in this
// file.
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned);

void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,
                             Tensor argmax_x, Tensor grad_input,
                             int aligned_height, int aligned_width,
                             float spatial_scale, int sampling_ratio,
                             int pool_mode, bool aligned);

// Pair both entry points with their NPU implementations (presumably through
// the device dispatch registry -- confirm against REGISTER_NPU_IMPL).
REGISTER_NPU_IMPL(roi_align_forward_impl, roi_align_forward_npu);
REGISTER_NPU_IMPL(roi_align_backward_impl, roi_align_backward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/roi_align_rotated_npu.cpp
================================================
#include "common_util.h"
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// NPU forward pass of rotated RoIAlign via the "RoiAlignRotated" operator.
//
// The operator is fed contiguous copies with permuted axes: `input` and
// `output` with order {0, 2, 3, 1}, `rois` with its two axes swapped. The
// result is permuted back with {0, 3, 1, 2} and copied into `output`.
void roi_align_rotated_forward_npu(Tensor input, Tensor rois, Tensor output,
                                   int aligned_height, int aligned_width,
                                   float spatial_scale, int sampling_ratio,
                                   bool aligned, bool clockwise) {
  // Integer attributes are passed to the operator as 64-bit values.
  int64_t pooled_h = aligned_height;
  int64_t pooled_w = aligned_width;
  int64_t sampling_ratio_64 = sampling_ratio;

  at::Tensor input_last = input.permute({0, 2, 3, 1}).contiguous();
  at::Tensor rois_swapped = rois.permute({1, 0}).contiguous();
  at::Tensor output_last = output.permute({0, 2, 3, 1}).contiguous();

  OpCommand cmd;
  cmd.Name("RoiAlignRotated")
      .Input(input_last)
      .Input(rois_swapped)
      .Output(output_last);
  cmd.Attr("pooled_h", pooled_h)
      .Attr("pooled_w", pooled_w)
      .Attr("spatial_scale", spatial_scale)
      .Attr("sampling_ratio", sampling_ratio_64)
      .Attr("aligned", aligned)
      .Attr("clockwise", clockwise);
  cmd.Run();

  // Restore the caller's axis order before writing the result back.
  output.copy_(output_last.permute({0, 3, 1, 2}).contiguous());
}

// NPU backward pass of rotated RoIAlign via the "RoiAlignRotatedGrad"
// operator.
//
// The operator is fed contiguous copies with permuted axes: `top_grad` and
// `bottom_grad` with order {0, 2, 3, 1}, `rois` with its two axes swapped.
// The shape of the permuted `bottom_grad` is passed as the y_grad_shape
// attribute. The result is permuted back with {0, 3, 1, 2} and copied into
// `bottom_grad`.
void roi_align_rotated_backward_npu(Tensor top_grad, Tensor rois,
                                    Tensor bottom_grad, int aligned_height,
                                    int aligned_width, float spatial_scale,
                                    int sampling_ratio, bool aligned,
                                    bool clockwise) {
  int64_t aligned_height_64 = aligned_height;
  int64_t aligned_width_64 = aligned_width;
  int64_t sampling_ratio_64 = sampling_ratio;

  at::Tensor top_grad_trans = top_grad.permute({0, 2, 3, 1}).contiguous();
  at::Tensor rois_trans = rois.permute({1, 0}).contiguous();
  at::Tensor bottom_grad_trans = bottom_grad.permute({0, 2, 3, 1}).contiguous();

  c10::SmallVector<int64_t, 8> y_grad_shape;
  auto shape = bottom_grad_trans.sizes();
  for (uint64_t i = 0; i < shape.size(); i++) {
    y_grad_shape.emplace_back(shape[i]);
  }
  OpCommand cmd;
  cmd.Name("RoiAlignRotatedGrad")
      .Input(top_grad_trans)
      .Input(rois_trans)
      .Output(bottom_grad_trans)
      .Attr("y_grad_shape", y_grad_shape)
      // Height goes to pooled_h and width to pooled_w, matching the forward
      // pass; they were previously swapped, which only shows up when the
      // pooled output is not square.
      .Attr("pooled_h", aligned_height_64)
      .Attr("pooled_w", aligned_width_64)
      .Attr("spatial_scale", spatial_scale)
      .Attr("sampling_ratio", sampling_ratio_64)
      .Attr("aligned", aligned)
      .Attr("clockwise", clockwise)
      .Run();

  bottom_grad_trans = bottom_grad_trans.permute({0, 3, 1, 2}).contiguous();
  bottom_grad.copy_(bottom_grad_trans);
}

// Declarations of the rotated RoIAlign entry points; they are not defined in
// this file.
void roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,
                                    int aligned_height, int aligned_width,
                                    float spatial_scale, int sampling_ratio,
                                    bool aligned, bool clockwise);

void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
                                     Tensor bottom_grad, int aligned_height,
                                     int aligned_width, float spatial_scale,
                                     int sampling_ratio, bool aligned,
                                     bool clockwise);

// Pair both entry points with their NPU implementations (presumably through
// the device dispatch registry -- confirm against REGISTER_NPU_IMPL).
REGISTER_NPU_IMPL(roi_align_rotated_forward_impl,
                  roi_align_rotated_forward_npu);
REGISTER_NPU_IMPL(roi_align_rotated_backward_impl,
                  roi_align_rotated_backward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/roi_pool_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// NPU forward pass of RoIPool via the "RoiPoolingWithArgMax" operator.
//
// The operator receives `input`, `rois` and an uninitialised int32 tensor
// shaped like `rois`, and writes `output` and `argmax`. When input.sizes()[1]
// is not a multiple of 16, the extra attribute "_exclude_engines" = "AiCore"
// is attached to the command before it runs.
void roi_pool_forward_npu(Tensor input, Tensor rois, Tensor output,
                          Tensor argmax, int pooled_height, int pooled_width,
                          float spatial_scale) {
  // Integer attributes are passed to the operator as 64-bit values.
  int64_t pooled_h = pooled_height;
  int64_t pooled_w = pooled_width;
  int64_t pool_channel = 1;

  at::Tensor roi_actual_num =
      at::empty_like(rois, rois.options().dtype(at::kInt));

  const bool needs_engine_exclusion = input.sizes()[1] % 16 != 0;

  OpCommand cmd;
  cmd.Name("RoiPoolingWithArgMax")
      .Input(input)
      .Input(rois)
      .Input(roi_actual_num)
      .Output(output)
      .Output(argmax)
      .Attr("pooled_h", pooled_h)
      .Attr("pooled_w", pooled_w)
      .Attr("spatial_scale_h", spatial_scale)
      .Attr("spatial_scale_w", spatial_scale)
      .Attr("pool_channel", pool_channel);
  if (needs_engine_exclusion) {
    cmd.Attr("_exclude_engines", (string) "AiCore");
  }
  cmd.Run();
}

// NPU backward pass of RoIPool via the "RoiPoolingGradWithArgMax" operator.
//
// `grad_output` and `argmax` are passed as views with axis order
// {0, 2, 3, 1}. The operator also receives a tensor of ones shaped like
// `grad_input` (same axis order) and an uninitialised int32 tensor shaped
// like `rois`. Its output is permuted back with {0, 3, 1, 2}, made contiguous
// and copied into `grad_input`.
void roi_pool_backward_npu(Tensor grad_output, Tensor rois, Tensor argmax,
                           Tensor grad_input, int pooled_height,
                           int pooled_width, float spatial_scale) {
  // Integer attributes are passed to the operator as 64-bit values.
  int64_t pooled_h = pooled_height;
  int64_t pooled_w = pooled_width;
  int64_t pool_channel = 1;

  // Same result as transpose(1, 2) followed by transpose(2, 3).
  at::Tensor argmax_last = argmax.permute({0, 2, 3, 1});
  at::Tensor grad_output_last = grad_output.permute({0, 2, 3, 1});
  at::Tensor roi_actual_num =
      at::empty_like(rois, rois.options().dtype(at::kInt));
  at::Tensor ones_last = at::ones_like(grad_input).permute({0, 2, 3, 1});
  at::Tensor grad_last = at::zeros_like(ones_last);

  OpCommand cmd;
  cmd.Name("RoiPoolingGradWithArgMax")
      .Input(grad_output_last)
      .Input(ones_last)
      .Input(rois)
      .Input(roi_actual_num)
      .Input(argmax_last)
      .Output(grad_last);
  cmd.Attr("pooled_h", pooled_h)
      .Attr("pooled_w", pooled_w)
      .Attr("spatial_scale_h", spatial_scale)
      .Attr("spatial_scale_w", spatial_scale)
      .Attr("pool_channel", pool_channel);
  cmd.Run();

  // Same result as transpose(2, 3) followed by transpose(1, 2).
  grad_input.copy_(grad_last.permute({0, 3, 1, 2}).contiguous());
}

// Declarations of the RoIPool entry points; they are not defined in this
// file.
void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax, int pooled_height, int pooled_width,
                           float spatial_scale);

void roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,
                            Tensor grad_input, int pooled_height,
                            int pooled_width, float spatial_scale);

// Pair both entry points with their NPU implementations (presumably through
// the device dispatch registry -- confirm against REGISTER_NPU_IMPL).
REGISTER_NPU_IMPL(roi_pool_forward_impl, roi_pool_forward_npu);
REGISTER_NPU_IMPL(roi_pool_backward_impl, roi_pool_backward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/roiaware_pool3d_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"
using namespace NPU_NAME_SPACE;
using namespace std;

void roiaware_pool3d_forward_npu(int boxes_num, int pts_num, int channels,
                                 int max_pts_each_voxel, int out_x, int out_y,
                                 int out_z, const Tensor rois, const Tensor pts,
                                 const Tensor pts_feature, Tensor argmax,
                                 Tensor pts_idx_of_voxels,
                                 Tensor pooled_features, int pool_method) {
  at::Tensor rois_cast = rois;
  at::Tensor pts_cast = pts;
  at::Tensor pts_feature_cast = pts_feature;
  at::Tensor pooled_features_cast = pooled_features;

  auto dtype = rois.dtype();
  if (dtype == at::kHalf) {
    rois_cast = rois_cast.to(at::kFloat);
    pts_cast = pts_cast.to(at::kFloat);
    pts_feature_cast = pts_feature_cast.to(at::kFloat);
    pooled_features_cast = pooled_features_cast.to(at::kFloat);
  }

  EXEC_NPU_CMD(aclnnRoiawarePool3d, rois_cast, pts_cast, pts_feature_cast,
               pool_method, max_pts_each_voxel, out_x, out_y, out_z, argmax,
               pts_idx_of_voxels, pooled_features_cast);

  if (dtype == at::kHalf) {
    pooled_features_cast = pooled_features_cast.to(at::kHalf);
  }

  pooled_features.copy_(pooled_features_cast);
}

void roiaware_pool3d_backward_npu(int boxes_num, int out_x, int out_y,
                                  int out_z, int channels,
                                  int max_pts_each_voxel,
                                  const Tensor pts_idx_of_voxels,
                                  const Tensor argmax, const Tensor grad_out,
                                  Tensor grad_in, int pool_method) {
  int32_t npoints = grad_in.size(0);

  auto dtype = grad_out.dtype();
  at::Tensor grad_out_cast = grad_out;
  at::Tensor grad_in_cast = grad_in;

  if (dtype == at::kHalf) {
    grad_out_cast = grad_out.to(at::kFloat);
    grad_in_cast = grad_in_cast.to(at::kFloat);
  }

  if (pool_method == 0) {
    // maxpool3d
    EXEC_NPU_CMD(aclnnRoiawareMaxpool3dGrad, argmax, grad_out_cast, boxes_num,
                 out_x, out_y, out_z, channels, npoints, grad_in_cast);
  } else if (pool_method == 1) {
    // avgpool3d
    EXEC_NPU_CMD(aclnnRoiawareAvgpool3dGrad, pts_idx_of_voxels, grad_out_cast,
                 boxes_num, out_x, out_y, out_z, channels, npoints,
                 max_pts_each_voxel, grad_in_cast);
  }

  if (dtype == at::kHalf) {
    grad_in_cast = grad_in_cast.to(at::kHalf);
  }

  grad_in.copy_(grad_in_cast);
}

// Declarations of the RoI-aware 3D pooling entry points; they are not defined
// in this file.
void roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels,
                                  int max_pts_each_voxel, int out_x, int out_y,
                                  int out_z, const Tensor rois,
                                  const Tensor pts, const Tensor pts_feature,
                                  Tensor argmax, Tensor pts_idx_of_voxels,
                                  Tensor pooled_features, int pool_method);

void roiaware_pool3d_backward_impl(int boxes_num, int out_x, int out_y,
                                   int out_z, int channels,
                                   int max_pts_each_voxel,
                                   const Tensor pts_idx_of_voxels,
                                   const Tensor argmax, const Tensor grad_out,
                                   Tensor grad_in, int pool_method);

// Pair both entry points with their NPU implementations (presumably through
// the device dispatch registry -- confirm against REGISTER_NPU_IMPL).
REGISTER_NPU_IMPL(roiaware_pool3d_forward_impl, roiaware_pool3d_forward_npu);
REGISTER_NPU_IMPL(roiaware_pool3d_backward_impl, roiaware_pool3d_backward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/roipoint_pool3d_forward.cpp
================================================
#include "common_util.h"
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// NPU forward pass of RoI-point 3D pooling.
//
// `xyz` and `pts_feature` are transposed on dims (1, 2) and made contiguous
// before being handed to aclnnRoipointPool3dForward. The kernel writes into a
// temporary buffer whose last two dims are swapped relative to
// `pooled_features`; the result is transposed back and copied into
// `pooled_features`. `pooled_empty_flag` is passed to the kernel as-is.
//
// `batch_size`, `pts_num`, `boxes_num` and `feature_in_len` are not read:
// every size used here is taken from the tensors themselves.
void roipoint_pool3d_forward_impl_npu(int batch_size, int pts_num,
                                      int boxes_num, int feature_in_len,
                                      int sampled_pts_num, const Tensor xyz,
                                      const Tensor boxes3d,
                                      const Tensor pts_feature,
                                      Tensor pooled_features,
                                      Tensor pooled_empty_flag) {
  auto points_trans = xyz.transpose(1, 2).contiguous();
  auto point_features_trans = pts_feature.transpose(1, 2).contiguous();

  // Temporary kernel output: the channel dim (xyz + feature channels) comes
  // before the sampled-point dim.
  c10::SmallVector<int64_t, 8> features_trans_size = {
      xyz.size(0), boxes3d.size(1), xyz.size(2) + pts_feature.size(2),
      sampled_pts_num};
  at::Tensor pooled_features_trans =
      at::empty(features_trans_size, xyz.options());

  EXEC_NPU_CMD(aclnnRoipointPool3dForward, points_trans, point_features_trans,
               boxes3d, sampled_pts_num, pooled_features_trans,
               pooled_empty_flag);

  // Swap the last two dims back and write into the caller's tensor.
  auto pooled_features_cache =
      pooled_features_trans.transpose(2, 3).contiguous();
  pooled_features.copy_(pooled_features_cache);
}

// Declaration only: named here so it can be used as the registration key
// below. Its definition is not in this file.
void roipoint_pool3d_forward_impl(int batch_size, int pts_num, int boxes_num,
                                  int feature_in_len, int sampled_pts_num,
                                  const Tensor xyz, const Tensor boxes3d,
                                  const Tensor pts_feature,
                                  Tensor pooled_features,
                                  Tensor pooled_empty_flag);

// NOTE(review): REGISTER_NPU_IMPL is defined outside this file; presumably it
// associates the entry point with the NPU implementation above - confirm.
REGISTER_NPU_IMPL(roipoint_pool3d_forward_impl,
                  roipoint_pool3d_forward_impl_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/rotated_feature_align_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// Declaration only: forward entry point, used as a registration key at the
// bottom of this file. Its definition is not in this file.
void rotated_feature_align_forward_impl(const Tensor features,
                                        const Tensor best_bboxes,
                                        const float spatial_scale,
                                        const int points, Tensor output);

// Declaration only: backward entry point, used as a registration key at the
// bottom of this file.
void rotated_feature_align_backward_impl(const Tensor top_grad,
                                         const Tensor best_bboxes,
                                         const float spatial_scale,
                                         const int points, Tensor bottom_grad);

void rotated_feature_align_forward_npu(const Tensor features,
                                       const Tensor best_bboxes,
                                       const float spatial_scale,
                                       const int points, Tensor output) {
  int64_t points_ = (int64_t)points;
  at::Tensor best_bboxes_ = best_bboxes.transpose(2, 3).transpose(1, 2);
  OpCommand cmd;
  cmd.Name("RotatedFeatureAlign")
      .Input(features)
      .Input(best_bboxes_)
      .Output(output)
      .Attr("spatial_scale", spatial_scale)
      .Attr("points", points_)
      .Run();
}

void rotated_feature_align_backward_npu(const Tensor top_grad,
                                        const Tensor best_bboxes,
                                        const float spatial_scale,
                                        const int points, Tensor bottom_grad) {
  int64_t points_ = (int64_t)points;
  at::Tensor best_bboxes_ = best_bboxes.transpose(2, 3).transpose(1, 2);
  OpCommand cmd;
  cmd.Name("RotatedFeatureAlignGrad")
      .Input(top_grad)
      .Input(best_bboxes_)
      .Output(bottom_grad)
      .Attr("spatial_scale", spatial_scale)
      .Attr("points", points_)
      .Run();
}

// NOTE(review): REGISTER_NPU_IMPL is defined outside this file; presumably it
// associates each *_impl entry point with its NPU implementation - confirm.
REGISTER_NPU_IMPL(rotated_feature_align_forward_impl,
                  rotated_feature_align_forward_npu);

REGISTER_NPU_IMPL(rotated_feature_align_backward_impl,
                  rotated_feature_align_backward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/stack_ball_query_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

void stack_ball_query_forward_npu(float max_radius, int nsample,
                                  const Tensor new_xyz,
                                  const Tensor new_xyz_batch_cnt,
                                  const Tensor xyz, const Tensor xyz_batch_cnt,
                                  Tensor idx) {
  at::Tensor xyz_transpose = xyz.transpose(0, 1).contiguous();
  double max_radius_double = double(max_radius);
  EXEC_NPU_CMD(aclnnStackBallQuery, xyz_transpose, new_xyz, xyz_batch_cnt,
               new_xyz_batch_cnt, max_radius_double, nsample, idx);
}

// Declaration only: named here so it can be used as the registration key
// below. Its definition is not in this file.
void stack_ball_query_forward_impl(float max_radius, int nsample,
                                   const Tensor new_xyz,
                                   const Tensor new_xyz_batch_cnt,
                                   const Tensor xyz, const Tensor xyz_batch_cnt,
                                   Tensor idx);

// NOTE(review): REGISTER_NPU_IMPL is defined outside this file; presumably it
// associates the entry point with the NPU implementation above - confirm.
REGISTER_NPU_IMPL(stack_ball_query_forward_impl, stack_ball_query_forward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/stack_group_points_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// NPU forward pass of stacked point grouping.
//
// Forwards the four input tensors and `out_tensor` to aclnnStackGroupPoints
// unchanged. The scalar arguments `b`, `c`, `n` and `nsample` are not read
// here.
void stack_group_points_forward_npu(int b, int c, int n, int nsample,
                                    const Tensor features_tensor,
                                    const Tensor features_batch_cnt_tensor,
                                    const Tensor idx_tensor,
                                    const Tensor idx_batch_cnt_tensor,
                                    Tensor out_tensor) {
  EXEC_NPU_CMD(aclnnStackGroupPoints, features_tensor,
               features_batch_cnt_tensor, idx_tensor, idx_batch_cnt_tensor,
               out_tensor);
}

// Declaration only: named here so it can be used as the registration key
// below. Its definition is not in this file.
void stack_group_points_forward_impl(int b, int c, int n, int nsample,
                                     const Tensor features_tensor,
                                     const Tensor features_batch_cnt_tensor,
                                     const Tensor idx_tensor,
                                     const Tensor idx_batch_cnt_tensor,
                                     Tensor out_tensor);

// NOTE(review): REGISTER_NPU_IMPL is defined outside this file; presumably it
// associates the entry point with the NPU implementation above - confirm.
REGISTER_NPU_IMPL(stack_group_points_forward_impl,
                  stack_group_points_forward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/three_interpolate_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"
#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
#include "torch_npu/csrc/framework/utils/OpAdapter.h"

using namespace NPU_NAME_SPACE;
using namespace std;

void three_interpolate_forward_npu(int b, int c, int m, int n,
                                   const Tensor points, const Tensor idx,
                                   const Tensor weight, Tensor out) {
  auto originDtype = points.scalar_type();
  TORCH_CHECK((originDtype == at::kFloat || originDtype == at::kHalf),
              "three_interpolate_forward ascend only support fp32 and fp16.");

  auto point_c_trans = points.transpose(1, 2);

  OpCommand cmd;
  cmd.Name("ThreeInterpolate")
      .Input(point_c_trans)
      .Input(idx)
      .Input(weight)
      .Output(out)
      .Run();

  auto output = out.view({b, n, c}).transpose(1, 2);
  auto res = output.contiguous();
  out.copy_(res);
}

void three_interpolate_backward_npu(int b, int c, int n, int m,
                                    const Tensor grad_out, const Tensor idx,
                                    const Tensor weight, Tensor grad_points) {
  auto originDtype = grad_out.scalar_type();
  TORCH_CHECK((originDtype == at::kFloat || originDtype == at::kHalf),
              "three_interpolate_backward ascend only support fp32 and fp16.");

  auto grad_x = at::unsqueeze(grad_out, 3);
  auto grad_y = at::unsqueeze(grad_points, 3);

  EXEC_NPU_CMD(aclnnThreeInterpolateBackward, grad_x, idx, weight, m, grad_y);

  auto output = at::squeeze(grad_y, 3);
  auto res = output.contiguous();
  grad_points.copy_(res);
}

// Declaration only: forward entry point, used as a registration key below.
// Its definition is not in this file.
void three_interpolate_forward_impl(int b, int c, int m, int n,
                                    const Tensor points, const Tensor idx,
                                    const Tensor weight, Tensor out);

// Declaration only: backward entry point, used as a registration key below.
// Note that the trailing scalars are ordered (n, m) here but (m, n) above.
void three_interpolate_backward_impl(int b, int c, int n, int m,
                                     const Tensor grad_out, const Tensor idx,
                                     const Tensor weight, Tensor grad_points);

// NOTE(review): REGISTER_NPU_IMPL is defined outside this file; presumably it
// associates each *_impl entry point with its NPU implementation - confirm.
REGISTER_NPU_IMPL(three_interpolate_forward_impl,
                  three_interpolate_forward_npu);

REGISTER_NPU_IMPL(three_interpolate_backward_impl,
                  three_interpolate_backward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/three_nn_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"
#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
#include "torch_npu/csrc/framework/utils/OpAdapter.h"

using namespace NPU_NAME_SPACE;
using namespace std;

void three_nn_forward_npu(int b, int n, int m, const Tensor unknown,
                          const Tensor known, Tensor dist2, Tensor idx) {
  at::Tensor source = known.contiguous();
  at::Tensor target = unknown.contiguous();

  bool is_from_knn = false;
  int nsample = 3;
  EXEC_NPU_CMD(aclnnKnn, source, target, is_from_knn, nsample, dist2, idx);
}

// Declaration only: named here so it can be used as the registration key
// below. Its definition is not in this file.
void three_nn_forward_impl(int b, int n, int m, const Tensor unknown,
                           const Tensor known, Tensor dist2, Tensor idx);

// NOTE(review): REGISTER_NPU_IMPL is defined outside this file; presumably it
// associates the entry point with the NPU implementation above - confirm.
REGISTER_NPU_IMPL(three_nn_forward_impl, three_nn_forward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/npu/voxelization_npu.cpp
================================================
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

// Declaration only: hard-voxelization entry point, used as a registration key
// at the bottom of this file. Returns an int; `NDim` defaults to 3.
int hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,
                               at::Tensor &coors,
                               at::Tensor &num_points_per_voxel,
                               const std::vector<float> voxel_size,
                               const std::vector<float> coors_range,
                               const int max_points, const int max_voxels,
                               const int NDim = 3);

// Declaration only: dynamic-voxelization entry point, used as a registration
// key at the bottom of this file. `NDim` defaults to 3.
void dynamic_voxelize_forward_impl(const at::Tensor &points, at::Tensor &coors,
                                   const std::vector<float> voxel_size,
                                   const std::vector<float> coors_range,
                                   const int NDim = 3);

// NPU implementation of hard voxelization.
//
// Uploads `voxel_size` (3 floats) and `coors_range` (6 floats) to the device
// of `points`, runs the "Voxelization" operator with `voxels`, `coors`,
// `num_points_per_voxel` and a one-element scratch tensor as outputs, and
// returns the first element of that scratch tensor after copying it to the
// host.
//
// `NDim` is not read. The operator is always run with deterministic = true.
//
// NOTE(review): the blob shapes {3} and {6} are hard-coded, so this assumes
// voxel_size.size() >= 3 and coors_range.size() >= 6 - confirm at call sites.
int hard_voxelize_forward_npu(const at::Tensor &points, at::Tensor &voxels,
                              at::Tensor &coors,
                              at::Tensor &num_points_per_voxel,
                              const std::vector<float> voxel_size,
                              const std::vector<float> coors_range,
                              const int max_points, const int max_voxels,
                              const int NDim = 3) {
  // One-element int32 output that receives the voxel count. Allocated as
  // int32 directly instead of allocating in the points dtype and casting.
  at::Tensor voxel_num = at::empty({1}, points.options().dtype(at::kInt));

  // from_blob wraps the vector storage without copying; the .to() call then
  // produces the tensor that is actually fed to the operator.
  at::Tensor voxel_size_cpu = at::from_blob(
      const_cast<float *>(voxel_size.data()), {3}, dtype(at::kFloat));
  at::Tensor voxel_size_npu = voxel_size_cpu.to(points.device());

  at::Tensor coors_range_cpu = at::from_blob(
      const_cast<float *>(coors_range.data()), {6}, dtype(at::kFloat));
  at::Tensor coors_range_npu = coors_range_cpu.to(points.device());

  // Operator attributes are passed as 64-bit integers.
  int64_t max_points_ = (int64_t)max_points;
  int64_t max_voxels_ = (int64_t)max_voxels;

  // only support true now
  bool deterministic = true;

  OpCommand cmd;
  cmd.Name("Voxelization")
      .Input(points)
      .Input(voxel_size_npu)
      .Input(coors_range_npu)
      .Output(voxels)
      .Output(coors)
      .Output(num_points_per_voxel)
      .Output(voxel_num)
      .Attr("max_points", max_points_)
      .Attr("max_voxels", max_voxels_)
      .Attr("deterministic", deterministic)
      .Run();

  // Bring the count back to the host before reading it.
  auto voxel_num_cpu = voxel_num.to(at::kCPU);
  int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];
  return voxel_num_int;
}

// NPU implementation of dynamic voxelization.
//
// Reads `coors_range` as [min_x, min_y, min_z, max_x, max_y, max_z] and
// `voxel_size` as [x, y, z], derives the grid extent per axis as
// round((max - min) / voxel), and calls aclnnDynamicVoxelization on a
// (0, 1)-transposed view of `points`. The kernel writes into a zero-filled
// int32 {3, num_points} buffer, which is transposed in place and copied into
// `coors`.
//
// `NDim` is not read.
void dynamic_voxelize_forward_npu(const at::Tensor &points, at::Tensor &coors,
                                  const std::vector<float> voxel_size,
                                  const std::vector<float> coors_range,
                                  const int NDim = 3) {
  // Keep the full 64-bit size rather than narrowing it to 32 bits.
  const int64_t pts_num = points.size(0);
  at::Tensor ptsTrans = at::transpose(points, 0, 1);

  double coors_min_x = coors_range[0];
  double coors_min_y = coors_range[1];
  double coors_min_z = coors_range[2];
  double coors_max_x = coors_range[3];
  double coors_max_y = coors_range[4];
  double coors_max_z = coors_range[5];
  double voxel_x = voxel_size[0];
  double voxel_y = voxel_size[1];
  double voxel_z = voxel_size[2];

  // Number of voxels along each axis.
  int grid_x = std::round((coors_max_x - coors_min_x) / voxel_x);
  int grid_y = std::round((coors_max_y - coors_min_y) / voxel_y);
  int grid_z = std::round((coors_max_z - coors_min_z) / voxel_z);

  at::Tensor tmp_coors =
      at::zeros({3, pts_num}, points.options().dtype(at::kInt));
  EXEC_NPU_CMD(aclnnDynamicVoxelization, ptsTrans, coors_min_x, coors_min_y,
               coors_min_z, voxel_x, voxel_y, voxel_z, grid_x, grid_y, grid_z,
               tmp_coors);

  // {3, num_points} -> {num_points, 3} before writing to the caller's tensor.
  tmp_coors.transpose_(0, 1);
  coors.copy_(tmp_coors);
}

// NOTE(review): REGISTER_NPU_IMPL is defined outside this file; presumably it
// associates each *_impl entry point with its NPU implementation - confirm.
REGISTER_NPU_IMPL(hard_voxelize_forward_impl, hard_voxelize_forward_npu);
REGISTER_NPU_IMPL(dynamic_voxelize_forward_impl, dynamic_voxelize_forward_npu);


================================================
FILE: mmcv/ops/csrc/pytorch/pixel_group.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
// It is modified from https://github.com/WenmuZhou/PAN.pytorch

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Forwards every argument through DISPATCH_DEVICE_IMPL and returns its
// result.
// NOTE(review): the macro is defined outside this file; presumably it selects
// the implementation registered for the tensors' device - confirm.
std::vector<std::vector<float>> pixel_group_impl(
    Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,
    Tensor kernel_contour, int kernel_region_num, float dis_threshold) {
  return DISPATCH_DEVICE_IMPL(pixel_group_impl, score, mask, embedding,
                              kernel_label, kernel_contour, kernel_region_num,
                              dis_threshold);
}

// Public entry point for pixel grouping.
//
// Makes all five tensor arguments contiguous and hands them, together with
// the two scalars, to pixel_group_impl, returning its result unchanged.
std::vector<std::vector<float>> pixel_group(
    Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,
    Tensor kernel_contour, int kernel_region_num, float distance_threshold) {
  return pixel_group_impl(score.contiguous(), mask.contiguous(),
                          embedding.contiguous(), kernel_label.contiguous(),
                          kernel_contour.contiguous(), kernel_region_num,
                          distance_threshold);
}


================================================
FILE: mmcv/ops/csrc/pytorch/points_in_boxes.cpp
================================================
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Forwards every argument through DISPATCH_DEVICE_IMPL.
// NOTE(review): the macro is defined outside this file; presumably it selects
// the implementation registered for the tensors' device - confirm.
void points_in_boxes_part_forward_impl(int batch_size, int boxes_num,
                                       int pts_num, const Tensor boxes,
                                       const Tensor pts,
                                       Tensor box_idx_of_points) {
  DISPATCH_DEVICE_IMPL(points_in_boxes_part_forward_impl, batch_size, boxes_num,
                       pts_num, boxes, pts, box_idx_of_points);
}

// Forwards every argument through DISPATCH_DEVICE_IMPL.
// NOTE(review): the macro is defined outside this file; presumably it selects
// the implementation registered for the tensors' device - confirm.
void points_in_boxes_all_forward_impl(int batch_size, int boxes_num,
                                      int pts_num, const Tensor boxes,
                                      const Tensor pts,
                                      Tensor box_idx_of_points) {
  DISPATCH_DEVICE_IMPL(points_in_boxes_all_forward_impl, batch_size, boxes_num,
                       pts_num, boxes, pts, box_idx_of_points);
}

// Entry point for the "part" variant of points-in-boxes.
//
//   boxes_tensor:             (B, N, 7) [x, y, z, x_size, y_size, z_size, rz]
//                             in LiDAR coordinate, z is the bottom center
//   pts_tensor:               (B, npoints, 3) [x, y, z] in LiDAR coordinate
//   box_idx_of_points_tensor: (B, npoints), default -1
//
// Reads the batch size, box count and point count from the tensor shapes and
// forwards everything to points_in_boxes_part_forward_impl.
void points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor,
                                  Tensor box_idx_of_points_tensor) {
  const int num_batches = static_cast<int>(boxes_tensor.size(0));
  const int num_boxes = static_cast<int>(boxes_tensor.size(1));
  const int num_pts = static_cast<int>(pts_tensor.size(1));

  points_in_boxes_part_forward_impl(num_batches, num_boxes, num_pts,
                                    boxes_tensor, pts_tensor,
                                    box_idx_of_points_tensor);
}

// Entry point for the "all" variant of points-in-boxes.
//
//   boxes_tensor:             (B, N, 7) [x, y, z, x_size, y_size, z_size, rz]
//                             in LiDAR coordinate, z is the bottom center
//   pts_tensor:               (B, npoints, 3) [x, y, z] in LiDAR coordinate
//   box_idx_of_points_tensor: (B, npoints), default -1
//
// Reads the batch size, box count and point count from the tensor shapes and
// forwards everything to points_in_boxes_all_forward_impl.
void points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor,
                                 Tensor box_idx_of_points_tensor) {
  const int num_batches = static_cast<int>(boxes_tensor.size(0));
  const int num_boxes = static_cast<int>(boxes_tensor.size(1));
  const int num_pts = static_cast<int>(pts_tensor.size(1));

  points_in_boxes_all_forward_impl(num_batches, num_boxes, num_pts,
                                   boxes_tensor, pts_tensor,
                                   box_idx_of_points_tensor);
}


================================================
FILE: mmcv/ops/csrc/pytorch/points_in_polygons.cpp
================================================
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Forwards every argument through DISPATCH_DEVICE_IMPL.
// NOTE(review): the macro is defined outside this file; presumably it selects
// the implementation registered for the tensors' device - confirm.
void points_in_polygons_forward_impl(const Tensor points, const Tensor polygons,
                                     Tensor output, const int rows,
                                     const int cols) {
  DISPATCH_DEVICE_IMPL(points_in_polygons_forward_impl, points, polygons,
                       output, rows, cols);
}

// Entry point for points-in-polygons: takes the row count from
// points.size(0) and the column count from polygons.size(0), then delegates
// to points_in_polygons_forward_impl.
void points_in_polygons_forward(Tensor points, Tensor polygons, Tensor output) {
  const int num_points = static_cast<int>(points.size(0));
  const int num_polygons = static_cast<int>(polygons.size(0));
  points_in_polygons_forward_impl(points, polygons, output, num_points,
                                  num_polygons);
}


================================================
FILE: mmcv/ops/csrc/pytorch/prroi_pool.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Forwards every argument through DISPATCH_DEVICE_IMPL.
// NOTE(review): the macro is defined outside this file; presumably it selects
// the implementation registered for the tensors' device - confirm.
void prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
                             int pooled_height, int pooled_width,
                             float spatial_scale) {
  DISPATCH_DEVICE_IMPL(prroi_pool_forward_impl, input, rois, output,
                       pooled_height, pooled_width, spatial_scale);
}

// Forwards every argument through DISPATCH_DEVICE_IMPL.
// NOTE(review): the macro is defined outside this file; presumably it selects
// the implementation registered for the tensors' device - confirm.
void prroi_pool_backward_impl(Tensor grad_output, Tensor rois,
                              Tensor grad_input, int pooled_height,
                              int pooled_width, float spatial_scale) {
  DISPATCH_DEVICE_IMPL(prroi_pool_backward_impl, grad_output, rois, grad_input,
                       pooled_height, pooled_width, spatial_scale);
}

// Forwards every argument through DISPATCH_DEVICE_IMPL.
// NOTE(review): the macro is defined outside this file; presumably it selects
// the implementation registered for the tensors' device - confirm.
void prroi_pool_coor_backward_impl(Tensor output, Tensor grad_output,
                                   Tensor input, Tensor rois, Tensor grad_rois,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale) {
  DISPATCH_DEVICE_IMPL(prroi_pool_coor_backward_impl, output, grad_output,
                       input, rois, grad_rois, pooled_height, pooled_width,
                       spatial_scale);
}

// Public forward entry point: passes its arguments unchanged, in the same
// order, to prroi_pool_forward_impl.
void prroi_pool_forward(Tensor input, Tensor rois, Tensor output,
                        int pooled_height, int pooled_width,
                        float spatial_scale) {
  prroi_pool_forward_impl(input, rois, output, pooled_height, pooled_width,
                          spatial_scale);
}

// Public backward entry point: passes its arguments unchanged, in the same
// order, to prroi_pool_backward_impl.
void prroi_pool_backward(Tensor grad_output, Tensor rois, Tensor grad_input,
                         int pooled_height, int pooled_width,
                         float spatial_scale) {
  prroi_pool_backward_impl(grad_output, rois, grad_input, pooled_height,
                           pooled_width, spatial_scale);
}

// Public entry point for the coordinate backward pass: passes its arguments
// unchanged, in the same order, to prroi_pool_coor_backward_impl.
void prroi_pool_coor_backward(Tensor output, Tensor grad_output, Tensor input,
                              Tensor rois, Tensor grad_rois, int pooled_height,
                              int pooled_width, float spatial_scale) {
  prroi_pool_coor_backward_impl(output, grad_output, input, rois, grad_rois,
                                pooled_height, pooled_width, spatial_scale);
}


================================================
FILE: mmcv/ops/csrc/pytorch/psamask.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Forwards every argument through DISPATCH_DEVICE_IMPL.
// NOTE(review): the macro is defined outside this file; presumably it selects
// the implementation registered for the tensors' device - confirm.
void psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,
                          const int num_, const int h_feature,
                          const int w_feature, const int h_mask,
                          const int w_mask, const int half_h_mask,
                          const int half_w_mask) {
  DISPATCH_DEVICE_IMPL(psamask_forward_impl, psa_type, input, output, num_,
                       h_feature, w_feature, h_mask, w_mask, half_h_mask,
                       half_w_mask);
}

// Forwards every argument through DISPATCH_DEVICE_IMPL.
// NOTE(review): the macro is defined outside this file; presumably it selects
// the implementation registered for the tensors' device - confirm.
void psamask_backward_impl(const int psa_type, const Tensor grad_output,
                           Tensor grad_input, const int num_,
                           const int h_feature, const int w_feature,
                           const int h_mask, const int w_mask,
                           const int half_h_mask, const int half_w_mask) {
  DISPATCH_DEVICE_IMPL(psamask_backward_impl, psa_type, grad_output, grad_input,
                       num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
                       half_w_mask);
}

// Public forward entry point. Delegates to psamask_forward_impl; the only
// difference is argument order: `psa_type` moves from third position here to
// first position in the impl call.
void psamask_forward(const Tensor input, Tensor output, const int psa_type,
                     const int num_, const int h_feature, const int w_feature,
                     const int h_mask, const int w_mask, const int half_h_mask,
                     const int half_w_mask) {
  psamask_forward_impl(psa_type, input, output, num_, h_feature, w_feature,
                       h_mask, w_mask, half_h_mask, half_w_mask);
}

// Public backward entry point. Delegates to psamask_backward_impl; the only
// difference is argument order: `psa_type` moves from third position here to
// first position in the impl call.
// NOTE(review): the const qualifier sits on `grad_input` here but on
// `grad_output` in psamask_backward_impl; both are passed by value, so the
// call still compiles - confirm whether the mismatch is intentional.
void psamask_backward(Tensor grad_output, const Tensor grad_input,
                      const int psa_type, const int num_, const int h_feature,
                      const int w_feature, const int h_mask, const int w_mask,
                      const int half_h_mask, const int half_w_mask) {
  psamask_backward_impl(psa_type, grad_output, grad_input, num_, h_feature,
                        w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
}


================================================
FILE: mmcv/ops/csrc/pytorch/pybind.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include <torch/extension.h>

#include "pytorch_cpp_helper.hpp"

// Build-information helpers returning strings. Declared here only; the
// definitions are not in this part of the file.
std::string get_compiler_version();
std::string get_compiling_cuda_version();

// assign_score_withk forward / backward entry points (declarations only).
// Tensors that are written to are taken by non-const reference.
void assign_score_withk_forward(const Tensor &points, const Tensor &centers,
                                const Tensor &scores, const Tensor &knn_idx,
                                Tensor &output, int B, int N0, int N1, int M,
                                int K, int O, int aggregate);

void assign_score_withk_backward(const Tensor &grad_out, const Tensor &points,
                                 const Tensor &centers, const Tensor &scores,
                                 const Tensor &knn_idx, Tensor &grad_points,
                                 Tensor &grad_centers, Tensor &grad_scores,
                                 int B, int N0, int N1, int M, int K, int O,
                                 int aggregate);

// CARAFE entry points (declarations only): the "naive" forward / backward
// pair, followed by the regular forward / backward pair, which takes
// additional r-prefixed tensor arguments.
void carafe_naive_forward(Tensor features, Tensor masks, Tensor output,
                          int kernel_size, int group_size, int scale_factor);

void carafe_naive_backward(Tensor top_grad, Tensor features, Tensor masks,
                           Tensor bottom_grad, Tensor mask_grad,
                           int kernel_size, int group_size, int scale_factor);

void carafe_forward(Tensor features, Tensor masks, Tensor rfeatures,
                    Tensor routput, Tensor rmasks, Tensor output,
                    int kernel_size, int group_size, int scale_factor);

void carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,
                     Tensor rtop_grad, Tensor rbottom_grad_hs,
                     Tensor rbottom_grad, Tensor rmask_grad, Tensor bottom_grad,
                     Tensor mask_grad, int kernel_size, int group_size,
                     int scale_factor);

// Deformable convolution entry points (declarations only): forward, backward
// with respect to the input/offset, and backward with respect to the
// parameters. Note that width precedes height in each scalar pair
// (kW, kH, dW, dH, ...).
void deform_conv_forward(Tensor input, Tensor weight, Tensor offset,
                         Tensor output, Tensor columns, Tensor ones, int kW,
                         int kH, int dW, int dH, int padW, int padH,
                         int dilationW, int dilationH, int group,
                         int deformable_group, int im2col_step);

void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,
                                Tensor gradInput, Tensor gradOffset,
                                Tensor weight, Tensor columns, int kW, int kH,
                                int dW, int dH, int padW, int padH,
                                int dilationW, int dilationH, int group,
                                int deformable_group, int im2col_step);

void deform_conv_backward_parameters(Tensor input, Tensor offset,
                                     Tensor gradOutput, Tensor gradWeight,
                                     Tensor columns, Tensor ones, int kW,
                                     int kH, int dW, int dH, int padW, int padH,
                                     int dilationW, int dilationH, int group,
                                     int deformable_group, float scale,
                                     int im2col_step);

// Deformable RoI pooling forward / backward entry points (declarations only).
void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,
                             Tensor output, int pooled_height, int pooled_width,
                             float spatial_scale, int sampling_ratio,
                             float gamma);

void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
                              Tensor offset, Tensor grad_input,
                              Tensor grad_offset, int pooled_height,
                              int pooled_width, float spatial_scale,
                              int sampling_ratio, float gamma);

// Point-grouping entry points (declarations only): the plain forward /
// backward pair, then the "stack" variants, which additionally take
// per-batch count tensors.
void group_points_forward(Tensor points_tensor, Tensor idx_tensor,
                          Tensor out_tensor, int b, int c, int n, int npoints,
                          int nsample);

void group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                           Tensor grad_points_tensor, int b, int c, int n,
                           int npoints, int nsample);

void stack_group_points_forward(Tensor features_tensor,
                                Tensor features_batch_cnt_tensor,
                                Tensor idx_tensor, Tensor idx_batch_cnt_tensor,
                                Tensor out_tensor, int b, int c, int m,
                                int nsample);

void stack_group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                                 Tensor idx_batch_cnt_tensor,
                                 Tensor features_batch_cnt_tensor,
                                 Tensor grad_features_tensor, int b, int c,
                                 int m, int n, int nsample);

// RoI-point 3D pooling forward entry point (declaration only); takes tensors
// only, with no explicit size arguments.
void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature,
                             Tensor pooled_features, Tensor pooled_empty_flag);

// Point-gathering forward / backward entry points (declarations only).
void gather_points_forward(Tensor points_tensor, Tensor idx_tensor,
                           Tensor out_tensor, int b, int c, int n, int npoints);

void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                            Tensor grad_points_tensor, int b, int c, int n,
                            int npoints);

// Focal-loss entry points (declarations only): sigmoid and softmax variants,
// each with a forward and a backward function. The softmax backward takes an
// extra `buff` tensor.
void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
                                Tensor output, float gamma, float alpha);

void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
                                 Tensor grad_input, float gamma, float alpha);

void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
                                Tensor output, float gamma, float alpha);

void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
                                 Tensor buff, Tensor grad_input, float gamma,
                                 float alpha);

// Three-point interpolation forward / backward entry points (declarations
// only). Note that the trailing scalars are ordered (m, n) in the forward
// declaration and (n, m) in the backward declaration.
void three_interpolate_forward(Tensor points_tensor, Tensor idx_tensor,
                               Tensor weight_tensor, Tensor out_tensor, int b,
                               int c, int m, int n);

void three_interpolate_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                                Tensor weight_tensor, Tensor grad_points_tensor,
                                int b, int c, int n, int m);

// Three-nearest-neighbour forward entry point (declaration only).
void three_nn_forward(Tensor unknown_tensor, Tensor known_tensor,
                      Tensor dist2_tensor, Tensor idx_tensor, int b, int n,
                      int m);

// Box-overlap entry point (declaration only).
void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                   const int mode, const bool aligned, const int offset);

// k-nearest-neighbour forward entry point (declaration only).
void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,
                 Tensor dist2_tensor, int b, int n, int m, int nsample);

// iou3d entry points (declarations only): BEV box overlap and two 3D NMS
// variants.
void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
                                     Tensor ans_overlap);

void iou3d_nms3d_forward(Tensor boxes, Tensor keep, Tensor keep_num,
                         float nms_overlap_thresh);

void iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
                                float nms_overlap_thresh);

// Furthest-point-sampling entry points (declarations only); the two
// declarations share the same parameter list.
void furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor,
                                     Tensor idx_tensor, int b, int n, int m);

void furthest_point_sampling_with_dist_forward(Tensor points_tensor,
                                               Tensor temp_tensor,
                                               Tensor idx_tensor, int b, int n,
                                               int m);

// Masked im2col / col2im forward entry points (declarations only).
void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor col,
                           const int kernel_h, const int kernel_w,
                           const int pad_h, const int pad_w);

void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor im, int height,
                           int width, int channels);

// Modulated deformable convolution forward / backward entry points
// (declarations only). Here height precedes width in each scalar pair.
void modulated_deform_conv_forward(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias);

void modulated_deform_conv_backward(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
    const bool with_bias);

// Declarations of the ms_deform_attn forward/backward pair.

// Forward takes all tensors by const reference and returns the result as a
// new Tensor.
Tensor ms_deform_attn_forward(const Tensor &value, const Tensor &spatial_shapes,
                              const Tensor &level_start_index,
                              const Tensor &sampling_loc,
                              const Tensor &attn_weight, const int im2col_step);

// Backward returns void. grad_value, grad_sampling_loc and grad_attn_weight
// are the only non-const references — presumably the gradients are written
// into them (TODO confirm against the definition).
void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,
                             const Tensor &level_start_index,
                             const Tensor &sampling_loc,
                             const Tensor &attn_weight,
                             const Tensor &grad_output, Tensor &grad_value,
                             Tensor &grad_sampling_loc,
                             Tensor &grad_attn_weight, const int im2col_step);

// Declarations of the NMS family. Each is registered under its own name in
// PYBIND11_MODULE below with matching py::arg names.

// Returns a Tensor; the binding docstring reads "nms (CPU/CUDA) ".
Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset);

// Returns a Tensor; the binding docstring reads "softnms (CPU) ".
Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,
               float sigma, float min_score, int method, int offset);

// Returns a nested vector of ints; the binding docstring reads
// "nms_match (CPU) ".
std::vector<std::vector<int>> nms_match(Tensor dets, float iou_threshold);

// Declaration of pixel_group; returns a nested vector of floats. Registered
// as "pixel_group" in PYBIND11_MODULE below (docstring "pixel group (CPU) ").
// NOTE(review): the sixth parameter is `kernel_region_num` here but is exposed
// to Python as py::arg("kernel_region_label").
std::vector<std::vector<float>> pixel_group(
    Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,
    Tensor kernel_contour, int kernel_region_num, float distance_threshold);

// Declaration of contour_expand; returns a nested vector of ints. Registered
// as "contour_expand" in PYBIND11_MODULE below with matching py::arg names.
// NOTE(review): the binding docstring there is spelled "contour exapnd".
std::vector<std::vector<int>> contour_expand(Tensor kernel_mask,
                                             Tensor internal_kernel_label,
                                             int min_kernel_area,
                                             int kernel_num);

// Declarations of the roi_align forward/backward pair. Both return void, take
// every Tensor by value, and are registered under their own names in
// PYBIND11_MODULE below with matching py::arg names. The trailing six scalar
// parameters are identical in both signatures.
void roi_align_forward(Tensor input, Tensor rois, Tensor output,
                       Tensor argmax_y, Tensor argmax_x, int aligned_height,
                       int aligned_width, float spatial_scale,
                       int sampling_ratio, int pool_mode, bool aligned);

void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,
                        Tensor argmax_x, Tensor grad_input, int aligned_height,
                        int aligned_width, float spatial_scale,
                        int sampling_ratio, int pool_mode, bool aligned);

// Declarations of the roi_pool forward/backward pair. Both return void, take
// every Tensor by value, and are registered under their own names in
// PYBIND11_MODULE below with matching py::arg names.
void roi_pool_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax,
                      int pooled_height, int pooled_width, float spatial_scale);

void roi_pool_backward(Tensor grad_output, Tensor rois, Tensor argmax,
                       Tensor grad_input, int pooled_height, int pooled_width,
                       float spatial_scale);

// Declarations of the five sync_bn entry points. All return void and are
// registered under their own names in PYBIND11_MODULE below with matching
// py::arg names. In each signature the const Tensors come first and the
// non-const Tensors follow — presumably inputs then in-place outputs
// (TODO confirm against the definitions).
void sync_bn_forward_mean(const Tensor input, Tensor mean);

void sync_bn_forward_var(const Tensor input, const Tensor mean, Tensor var);

void sync_bn_forward_output(const Tensor input, const Tensor mean,
                            const Tensor var, const Tensor weight,
                            const Tensor bias, Tensor running_mean,
                            Tensor running_var, Tensor norm, Tensor std,
                            Tensor output, float eps, float momentum,
                            int group_size);

void sync_bn_backward_param(const Tensor grad_output, const Tensor norm,
                            Tensor grad_weight, Tensor grad_bias);

void sync_bn_backward_data(const Tensor grad_output, const Tensor weight,
                           const Tensor grad_weight, const Tensor grad_bias,
                           const Tensor norm, const Tensor std,
                           Tensor grad_input);

// Declarations of the psamask forward/backward pair. Both return void and
// share the same eight trailing const int parameters.
void psamask_forward(const Tensor input, Tensor output, const int psa_type,
                     const int num_, const int h_feature, const int w_feature,
                     const int h_mask, const int w_mask, const int half_h_mask,
                     const int half_w_mask);

// NOTE(review): here grad_output is non-const while grad_input is const, the
// reverse of the input/output const pattern used by psamask_forward. The
// qualifier is top-level on a by-value parameter, so it does not change the
// function's type; confirm whether the placement is intentional.
void psamask_backward(Tensor grad_output, const Tensor grad_input,
                      const int psa_type, const int num_, const int h_feature,
                      const int w_feature, const int h_mask, const int w_mask,
                      const int half_h_mask, const int half_w_mask);

// Declarations of the tin_shift forward/backward pair. Both return void and
// take three Tensors by value, with `shift` in the middle position.
void tin_shift_forward(Tensor input, Tensor shift, Tensor output);

void tin_shift_backward(Tensor grad_output, Tensor shift, Tensor grad_input);

// Declarations of the two ball-query entry points; both return void.

// Takes an explicit b/n/m triple plus a [min_radius, max_radius] pair.
void ball_query_forward(Tensor new_xyz_tensor, Tensor xyz_tensor,
                        Tensor idx_tensor, int b, int n, int m,
                        float min_radius, float max_radius, int nsample);

// "Stack" variant: takes per-tensor *_batch_cnt Tensors instead of b/n/m and
// only a max_radius (no min_radius parameter).
void stack_ball_query_forward(Tensor new_xyz_tensor, Tensor new_xyz_batch_cnt,
                              Tensor xyz_tensor, Tensor xyz_batch_cnt,
                              Tensor idx_tensor, float max_radius, int nsample);

// Declarations of the three prroi_pool entry points. All return void, take
// their Tensors by value, and end with the same
// (pooled_height, pooled_width, spatial_scale) scalars.
void prroi_pool_forward(Tensor input, Tensor rois, Tensor output,
                        int pooled_height, int pooled_width,
                        float spatial_scale);

void prroi_pool_backward(Tensor grad_output, Tensor rois, Tensor grad_input,
                         int pooled_height, int pooled_width,
                         float spatial_scale);

// Separate backward entry point that takes a `grad_rois` Tensor — presumably
// the gradient with respect to the RoI coordinates (TODO confirm).
void prroi_pool_coor_backward(Tensor output, Tensor grad_output, Tensor input,
                              Tensor rois, Tensor grad_rois, int pooled_height,
                              int pooled_width, float spatial_scale);

// Declarations of the get_indice_pairs function templates, parameterised on
// the unsigned dimension count NDim. Both return a vector of tensors.
// PYBIND11_MODULE below registers explicit instantiations under
// dimension-suffixed names: forward as <2>, <3> and <4>
// ("get_indice_pairs_{2,3,4}d_forward"), backward as <2> and <3>
// ("get_indice_pairs_{2,3}d_backward") in the portion visible here.
// The forward prototype spells the type as torch::Tensor and the backward one
// as Tensor — presumably the same type via a using-declaration that is not
// visible in this part of the file.
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_forward(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

// Same parameter list as the forward template with an extra `gridOut` Tensor
// in second position.
template <unsigned NDim>
std::vector<Tensor> get_indice_pairs_backward(
    Tensor indices, Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

// Declarations of the indice_conv entry points. The flag-like parameters
// (_inverse, _subM) are declared as int64_t rather than bool.

// Returns a single Tensor; registered as "indice_conv_forward" in
// PYBIND11_MODULE below with matching py::arg names.
Tensor indice_conv_forward(Tensor features, Tensor filters, Tensor indicePairs,
                           Tensor indiceNum, int64_t numActOut,
                           int64_t _inverse, int64_t _subM);

// Returns a vector of Tensors; registered as "indice_conv_backward" in
// PYBIND11_MODULE below with matching py::arg names.
std::vector<Tensor> indice_conv_backward(Tensor features, Tensor filters,
                                         Tensor outGrad, Tensor indicePairs,
                                         Tensor indiceNum, int64_t _inverse,
                                         int64_t _subM);

// Same parameters as indice_conv_forward plus a `bias` Tensor.
// NOTE(review): despite the "batchnorm" in the C++ name, PYBIND11_MODULE below
// exposes this function to Python as "fused_indice_conv_forward".
Tensor fused_indice_conv_batchnorm_forward(Tensor features, Tensor filters,
                                           Tensor bias, Tensor indicePairs,
                                           Tensor indiceNum, int64_t numActOut,
                                           int64_t _inverse, int64_t _subM);

// Declarations of the indice_maxpool forward/backward pair. Both take their
// Tensors by value and return a Tensor.
Tensor indice_maxpool_forward(Tensor features, Tensor indicePairs,
                              Tensor indiceNum, int64_t numAct);

// Unlike the forward declaration, this one has no numAct parameter.
Tensor indice_maxpool_backward(Tensor features, Tensor outFeatures,
                               Tensor outGrad, Tensor indicePairs,
                               Tensor indiceNum);

// Declaration of box_iou_rotated. Returns void; `ious` is the only non-const
// Tensor parameter — presumably the output buffer (TODO confirm).
void box_iou_rotated(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                     const int mode_flag, const bool aligned);

// Declaration of nms_rotated. Every parameter is const-qualified and the
// result is returned as a Tensor. Takes a `labels` Tensor in addition to
// dets/scores/order/dets_sorted.
Tensor nms_rotated(const Tensor dets, const Tensor scores, const Tensor order,
                   const Tensor dets_sorted, const Tensor labels,
                   const float iou_threshold, const int multi_label);

// Declaration of upfirdn2d; returns a Tensor. Registered as "upfirdn2d" in
// PYBIND11_MODULE below (docstring "upfirdn2d (CUDA)") with py::arg names
// that match these parameters one-to-one.
Tensor upfirdn2d(torch::Tensor input, torch::Tensor filter, int upx, int upy,
                 int downx, int downy, int padx0, int padx1, int pady0,
                 int pady1, bool flip, float gain);

// Declaration of fused_bias_leakyrelu; takes its tensors by const reference
// and returns a Tensor. Registered as "fused_bias_leakyrelu" in
// PYBIND11_MODULE below (docstring "fused_bias_leakyrelu (CUDA)").
// NOTE(review): the third parameter is `refer` here but is exposed to Python
// as py::arg("empty").
Tensor fused_bias_leakyrelu(const Tensor &input, const Tensor &bias,
                            const Tensor &refer, int act, int grad, float alpha,
                            float scale);

// Declarations of the roi_align_rotated forward/backward pair. Both return
// void, take three Tensors by value, and share the same six trailing scalar
// parameters (ending in the `aligned` and `clockwise` flags).
void roi_align_rotated_forward(Tensor input, Tensor rois, Tensor output,
                               int pooled_height, int pooled_width,
                               float spatial_scale, int sampling_ratio,
                               bool aligned, bool clockwise);

void roi_align_rotated_backward(Tensor grad_output, Tensor rois,
                                Tensor grad_input, int pooled_height,
                                int pooled_width, float spatial_scale,
                                int sampling_ratio, bool aligned,
                                bool clockwise);

// Declarations of the dynamic_point_to_voxel forward/backward pair. Both take
// the reduction mode as a string (`reduce_type`).

// Forward takes its tensors by const reference and returns a vector of
// tensors.
std::vector<torch::Tensor> dynamic_point_to_voxel_forward(
    const torch::Tensor &feats, const torch::Tensor &coors,
    const std::string &reduce_type);

// Backward returns void. `grad_feats` is the only non-const reference —
// presumably the gradient is written into it (TODO confirm).
void dynamic_point_to_voxel_backward(torch::Tensor &grad_feats,
                                     const torch::Tensor &grad_reduced_feats,
                                     const torch::Tensor &feats,
                                     const torch::Tensor &reduced_feats,
                                     const torch::Tensor &coors_idx,
                                     const torch::Tensor &reduce_count,
                                     const std::string &reduce_type);

// Declarations of the two voxelization entry points; both return void and
// take points / voxel_size / coors_range by const reference.

// voxels, coors, num_points_per_voxel and voxel_num are non-const references
// — presumably results written in place (TODO confirm). Also takes a
// `deterministic` flag.
void hard_voxelize_forward(const at::Tensor &points,
                           const at::Tensor &voxel_size,
                           const at::Tensor &coors_range, at::Tensor &voxels,
                           at::Tensor &coors, at::Tensor &num_points_per_voxel,
                           at::Tensor &voxel_num, const int max_points,
                           const int max_voxels, const int NDim,
                           const bool deterministic);

// Only `coors` is a non-const reference here; there are no max_points /
// max_voxels limits in this signature.
void dynamic_voxelize_forward(const at::Tensor &points,
                              const at::Tensor &voxel_size,
                              const at::Tensor &coors_range, at::Tensor &coors,
                              const int NDim);

// Declarations of the border_align forward/backward pair; both return void.
// The const-reference Tensors come first; the remaining Tensors (output and
// argmax_idx in forward, grad_input in backward) are passed by value —
// presumably filled in place (TODO confirm).
void border_align_forward(const Tensor &input, const Tensor &boxes,
                          Tensor output, Tensor argmax_idx,
                          const int pool_size);

void border_align_backward(const Tensor &grad_output, const Tensor &boxes,
                           const Tensor &argmax_idx, Tensor grad_input,
                           const int pool_size);

// Declarations of the three points_in_boxes entry points. All return void and
// take (boxes_tensor, pts_tensor, <result tensor>) by value; only the name of
// the third parameter differs between the cpu variant and the part/all
// variants.
void points_in_boxes_cpu_forward(Tensor boxes_tensor, Tensor pts_tensor,
                                 Tensor pts_indices_tensor);

void points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor,
                                  Tensor box_idx_of_points_tensor);

void points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor,
                                 Tensor box_idx_of_points_tensor);

// Declarations of the roiaware_pool3d forward/backward pair. Both return
// void, take their Tensors by value, and select the pooling variant through
// the trailing int `pool_method`.
void roiaware_pool3d_forward(Tensor rois, Tensor pts, Tensor pts_feature,
                             Tensor argmax, Tensor pts_idx_of_voxels,
                             Tensor pooled_features, int pool_method);

void roiaware_pool3d_backward(Tensor pts_idx_of_voxels, Tensor argmax,
                              Tensor grad_out, Tensor grad_in, int pool_method);

// Declarations of the correlation forward/backward pair. Both return void,
// take their Tensors by value, and share the same twelve trailing int
// parameters (kH, kW, patchH, patchW, padH, padW, dilationH, dilationW,
// dilation_patchH, dilation_patchW, dH, dW).
void correlation_forward(Tensor input1, Tensor input2, Tensor output, int kH,
                         int kW, int patchH, int patchW, int padH, int padW,
                         int dilationH, int dilationW, int dilation_patchH,
                         int dilation_patchW, int dH, int dW);

void correlation_backward(Tensor grad_output, Tensor input1, Tensor input2,
                          Tensor grad_input1, Tensor grad_input2, int kH,
                          int kW, int patchH, int patchW, int padH, int padW,
                          int dilationH, int dilationW, int dilation_patchH,
                          int dilation_patchW, int dH, int dW);

// Declarations of the rotated_feature_align forward/backward pair. Both
// return void. In each signature the third Tensor (output / bottom_grad) is
// the only non-const one — presumably the buffer written in place
// (TODO confirm).
void rotated_feature_align_forward(const Tensor features,
                                   const Tensor best_bboxes, Tensor output,
                                   const float spatial_scale, const int points);

void rotated_feature_align_backward(const Tensor top_grad,
                                    const Tensor best_bboxes,
                                    Tensor bottom_grad,
                                    const float spatial_scale,
                                    const int points);

// Declarations of the riroi_align_rotated forward/backward pair. Both return
// void, take three Tensors by value, and share the same six trailing scalar
// parameters (ending in num_orientations and the `clockwise` flag).
void riroi_align_rotated_forward(Tensor features, Tensor rois, Tensor output,
                                 int pooled_height, int pooled_width,
                                 float spatial_scale, int num_samples,
                                 int num_orientations, bool clockwise);

void riroi_align_rotated_backward(Tensor top_grad, Tensor rois,
                                  Tensor bottom_grad, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int num_samples, int num_orientations,
                                  bool clockwise);

// Declaration of points_in_polygons_forward. Returns void and takes three
// Tensors by value; `output` is presumably filled in place (TODO confirm).
void points_in_polygons_forward(Tensor points, Tensor polygons, Tensor output);

// Declaration of min_area_polygons. Returns void; `polygons` is the non-const
// Tensor — presumably the result buffer (TODO confirm).
void min_area_polygons(const Tensor pointsets, Tensor polygons);

// Declarations of the active_rotated_filter forward/backward pair. Both
// return void and take (const Tensor, const Tensor indices, Tensor); the
// trailing non-const Tensor is presumably written in place (TODO confirm).
void active_rotated_filter_forward(const Tensor input, const Tensor indices,
                                   Tensor output);

void active_rotated_filter_backward(const Tensor grad_out, const Tensor indices,
                                    Tensor grad_in);

// Declarations of convex_iou and convex_giou. Both return void and take
// (const pointsets, const polygons, non-const result Tensor); only the name
// of the third parameter differs (`ious` vs `output`).
void convex_iou(const Tensor pointsets, const Tensor polygons, Tensor ious);

void convex_giou(const Tensor pointsets, const Tensor polygons, Tensor output);

// Declaration of diff_iou_rotated_sort_vertices_forward. Takes three
// at::Tensor arguments by value and returns the result as a new at::Tensor.
at::Tensor diff_iou_rotated_sort_vertices_forward(at::Tensor vertices,
                                                  at::Tensor mask,
                                                  at::Tensor num_valid);

// Declaration of chamfer_distance_forward. Returns void and takes six Tensors
// by value, all const-qualified.
// The last parameter is named idx2 so the prototype pairs up with dist1/dist2
// and with the idx1/idx2 names used by chamfer_distance_backward. Parameter
// names in a declaration do not affect the function's type or linkage, so
// this is purely a readability/consistency fix.
void chamfer_distance_forward(const Tensor xyz1, const Tensor xyz2,
                              const Tensor dist1, const Tensor dist2,
                              const Tensor idx1, const Tensor idx2);

// Declaration of chamfer_distance_backward. Returns void; xyz1/xyz2 are const
// while the index, graddist* and gradxyz* Tensors are non-const by-value
// parameters. gradxyz1/gradxyz2 are presumably the outputs (TODO confirm).
void chamfer_distance_backward(const Tensor xyz1, const Tensor xyz2,
                               Tensor idx1, Tensor idx2, Tensor graddist1,
                               Tensor graddist2, Tensor gradxyz1,
                               Tensor gradxyz2);

// Declaration of bias_act. Takes five Tensors by const reference followed by
// three ints (grad, dim, act) and three floats (alpha, gain, clamp), and
// returns the result as a new Tensor.
Tensor bias_act(const Tensor &input, const Tensor &bias, const Tensor &xref,
                const Tensor &yref, const Tensor &dy, int grad, int dim,
                int act, float alpha, float gain, float clamp);

// Declarations of the filtered_lrelu entry points. Both take their tensors by
// value and share the trailing gain/slope/clamp floats and the `writeSigns`
// flag.

// Returns a (Tensor, Tensor, int) tuple; the meaning of each element is not
// visible from the prototype.
std::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu(
    torch::Tensor x, torch::Tensor fu, torch::Tensor fd, torch::Tensor b,
    torch::Tensor si, int up, int down, int px0, int px1, int py0, int py1,
    int sx, int sy, float gain, float slope, float clamp, bool flip_filters,
    bool writeSigns);

// Returns a single Tensor. The trailing underscore in the name presumably
// follows the PyTorch in-place naming convention (TODO confirm).
torch::Tensor filtered_lrelu_act_(torch::Tensor x, torch::Tensor si, int sx,
                                  int sy, float gain, float slope, float clamp,
                                  bool writeSigns);

// Declaration of box_iou_quadri. Same parameter list as box_iou_rotated:
// returns void, and `ious` is the only non-const Tensor — presumably the
// output buffer (TODO confirm).
void box_iou_quadri(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                    const int mode_flag, const bool aligned);

// Declaration of nms_quadri. Every parameter is const-qualified and the
// result is returned as a Tensor. Unlike nms_rotated, this signature has no
// `labels` Tensor.
Tensor nms_quadri(const Tensor dets, const Tensor scores, const Tensor order,
                  const Tensor dets_sorted, const float iou_threshold,
                  const int multi_label);

// Declarations of the bezier_align forward/backward pair. Both return void,
// take three Tensors by value, and share the same five trailing scalar
// parameters (aligned_height, aligned_width, spatial_scale, sampling_ratio,
// aligned).
void bezier_align_forward(Tensor input, Tensor rois, Tensor output,
                          int aligned_height, int aligned_width,
                          float spatial_scale, int sampling_ratio,
                          bool aligned);

void bezier_align_backward(Tensor grad_output, Tensor rois, Tensor grad_input,
                           int aligned_height, int aligned_width,
                           float spatial_scale, int sampling_ratio,
                           bool aligned);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("upfirdn2d", &upfirdn2d, "upfirdn2d (CUDA)", py::arg("input"),
        py::arg("filter"), py::arg("upx"), py::arg("upy"), py::arg("downx"),
        py::arg("downy"), py::arg("padx0"), py::arg("padx1"), py::arg("pady0"),
        py::arg("pady1"), py::arg("flip"), py::arg("gain"));
  m.def("fused_bias_leakyrelu", &fused_bias_leakyrelu,
        "fused_bias_leakyrelu (CUDA)", py::arg("input"), py::arg("bias"),
        py::arg("empty"), py::arg("act"), py::arg("grad"), py::arg("alpha"),
        py::arg("scale"));
  m.def("gather_points_forward", &gather_points_forward,
        "gather_points_forward", py::arg("points_tensor"),
        py::arg("idx_tensor"), py::arg("out_tensor"), py::arg("b"),
        py::arg("c"), py::arg("n"), py::arg("npoints"));
  m.def("gather_points_backward", &gather_points_backward,
        "gather_points_backward", py::arg("grad_out_tensor"),
        py::arg("idx_tensor"), py::arg("grad_points_tensor"), py::arg("b"),
        py::arg("c"), py::arg("n"), py::arg("npoints"));
  m.def("get_compiler_version", &get_compiler_version, "get_compiler_version");
  m.def("get_compiling_cuda_version", &get_compiling_cuda_version,
        "get_compiling_cuda_version");
  m.def("assign_score_withk_forward", &assign_score_withk_forward,
        "assign_score_withk_forward", py::arg("points"), py::arg("centers"),
        py::arg("scores"), py::arg("knn_idx"), py::arg("output"), py::arg("B"),
        py::arg("N0"), py::arg("N1"), py::arg("M"), py::arg("K"), py::arg("O"),
        py::arg("aggregate"));
  m.def("assign_score_withk_backward", &assign_score_withk_backward,
        "assign_score_withk_backward", py::arg("grad_out"), py::arg("points"),
        py::arg("centers"), py::arg("scores"), py::arg("knn_idx"),
        py::arg("grad_points"), py::arg("grad_centers"), py::arg("grad_scores"),
        py::arg("B"), py::arg("N0"), py::arg("N1"), py::arg("M"), py::arg("K"),
        py::arg("O"), py::arg("aggregate"));
  m.def("knn_forward", &knn_forward, "knn_forward", py::arg("xyz_tensor"),
        py::arg("new_xyz_tensor"), py::arg("idx_tensor"),
        py::arg("dist2_tensor"), py::arg("b"), py::arg("n"), py::arg("m"),
        py::arg("nsample"));
  m.def("carafe_naive_forward", &carafe_naive_forward, "carafe_naive_forward",
        py::arg("features"), py::arg("masks"), py::arg("output"),
        py::arg("kernel_size"), py::arg("group_size"), py::arg("scale_factor"));
  m.def("carafe_naive_backward", &carafe_naive_backward,
        "carafe_naive_backward", py::arg("top_grad"), py::arg("features"),
        py::arg("masks"), py::arg("bottom_grad"), py::arg("mask_grad"),
        py::arg("kernel_size"), py::arg("group_size"), py::arg("scale_factor"));
  m.def("carafe_forward", &carafe_forward, "carafe_forward",
        py::arg("features"), py::arg("masks"), py::arg("rfeatures"),
        py::arg("routput"), py::arg("rmasks"), py::arg("output"),
        py::arg("kernel_size"), py::arg("group_size"), py::arg("scale_factor"));
  m.def("carafe_backward", &carafe_backward, "carafe_backward",
        py::arg("top_grad"), py::arg("rfeatures"), py::arg("masks"),
        py::arg("rtop_grad"), py::arg("rbottom_grad_hs"),
        py::arg("rbottom_grad"), py::arg("rmask_grad"), py::arg("bottom_grad"),
        py::arg("mask_grad"), py::arg("kernel_size"), py::arg("group_size"),
        py::arg("scale_factor"));
  m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward",
        py::arg("input"), py::arg("weight"), py::arg("offset"),
        py::arg("output"), py::arg("columns"), py::arg("ones"), py::arg("kW"),
        py::arg("kH"), py::arg("dW"), py::arg("dH"), py::arg("padW"),
        py::arg("padH"), py::arg("dilationW"), py::arg("dilationH"),
        py::arg("group"), py::arg("deformable_group"), py::arg("im2col_step"));
  m.def("deform_conv_backward_input", &deform_conv_backward_input,
        "deform_conv_backward_input", py::arg("input"), py::arg("offset"),
        py::arg("gradOutput"), py::arg("gradInput"), py::arg("gradOffset"),
        py::arg("weight"), py::arg("columns"), py::arg("kW"), py::arg("kH"),
        py::arg("dW"), py::arg("dH"), py::arg("padW"), py::arg("padH"),
        py::arg("dilationW"), py::arg("dilationH"), py::arg("group"),
        py::arg("deformable_group"), py::arg("im2col_step"));
  m.def("deform_conv_backward_parameters", &deform_conv_backward_parameters,
        "deform_conv_backward_parameters", py::arg("input"), py::arg("offset"),
        py::arg("gradOutput"), py::arg("gradWeight"), py::arg("columns"),
        py::arg("ones"), py::arg("kW"), py::arg("kH"), py::arg("dW"),
        py::arg("dH"), py::arg("padW"), py::arg("padH"), py::arg("dilationW"),
        py::arg("dilationH"), py::arg("group"), py::arg("deformable_group"),
        py::arg("scale"), py::arg("im2col_step"));
  m.def("deform_roi_pool_forward", &deform_roi_pool_forward,
        "deform roi pool forward", py::arg("input"), py::arg("rois"),
        py::arg("offset"), py::arg("output"), py::arg("pooled_height"),
        py::arg("pooled_width"), py::arg("spatial_scale"),
        py::arg("sampling_ratio"), py::arg("gamma"));
  m.def("deform_roi_pool_backward", &deform_roi_pool_backward,
        "deform roi pool backward", py::arg("grad_output"), py::arg("input"),
        py::arg("rois"), py::arg("offset"), py::arg("grad_input"),
        py::arg("grad_offset"), py::arg("pooled_height"),
        py::arg("pooled_width"), py::arg("spatial_scale"),
        py::arg("sampling_ratio"), py::arg("gamma"));
  m.def("roipoint_pool3d_forward", &roipoint_pool3d_forward,
        "roipoint_pool3d_forward", py::arg("xyz"), py::arg("boxes3d"),
        py::arg("pts_feature"), py::arg("pooled_features"),
        py::arg("pooled_empty_flag"));
  m.def("sigmoid_focal_loss_forward", &sigmoid_focal_loss_forward,
        "sigmoid_focal_loss_forward ", py::arg("input"), py::arg("target"),
        py::arg("weight"), py::arg("output"), py::arg("gamma"),
        py::arg("alpha"));
  m.def("sigmoid_focal_loss_backward", &sigmoid_focal_loss_backward,
        "sigmoid_focal_loss_backward", py::arg("input"), py::arg("target"),
        py::arg("weight"), py::arg("grad_input"), py::arg("gamma"),
        py::arg("alpha"));
  m.def("softmax_focal_loss_forward", &softmax_focal_loss_forward,
        "softmax_focal_loss_forward", py::arg("input"), py::arg("target"),
        py::arg("weight"), py::arg("output"), py::arg("gamma"),
        py::arg("alpha"));
  m.def("softmax_focal_loss_backward", &softmax_focal_loss_backward,
        "softmax_focal_loss_backward", py::arg("input"), py::arg("target"),
        py::arg("weight"), py::arg("buff"), py::arg("grad_input"),
        py::arg("gamma"), py::arg("alpha"));
  m.def("three_interpolate_forward", &three_interpolate_forward,
        "three_interpolate_forward", py::arg("points_tensor"),
        py::arg("idx_tensor"), py::arg("weight_tensor"), py::arg("out_tensor"),
        py::arg("b"), py::arg("c"), py::arg("m"), py::arg("n"));
  m.def("three_interpolate_backward", &three_interpolate_backward,
        "three_interpolate_backward", py::arg("grad_out_tensor"),
        py::arg("idx_tensor"), py::arg("weight_tensor"),
        py::arg("grad_points_tensor"), py::arg("b"), py::arg("c"), py::arg("n"),
        py::arg("m"));
  m.def("three_nn_forward", &three_nn_forward, "three_nn_forward",
        py::arg("unknown_tensor"), py::arg("known_tensor"),
        py::arg("dist2_tensor"), py::arg("idx_tensor"), py::arg("b"),
        py::arg("n"), py::arg("m"));
  m.def("bbox_overlaps", &bbox_overlaps, "bbox_overlaps", py::arg("bboxes1"),
        py::arg("bboxes2"), py::arg("ious"), py::arg("mode"),
        py::arg("aligned"), py::arg("offset"));
  m.def("group_points_forward", &group_points_forward, "group_points_forward",
        py::arg("points_tensor"), py::arg("idx_tensor"), py::arg("out_tensor"),
        py::arg("b"), py::arg("c"), py::arg("n"), py::arg("npoints"),
        py::arg("nsample"));
  m.def("group_points_backward", &group_points_backward,
        "group_points_backward", py::arg("grad_out_tensor"),
        py::arg("idx_tensor"), py::arg("grad_points_tensor"), py::arg("b"),
        py::arg("c"), py::arg("n"), py::arg("npoints"), py::arg("nsample"));
  m.def("stack_group_points_forward", &stack_group_points_forward,
        "stack_group_points_forward", py::arg("features_tensor"),
        py::arg("features_batch_cnt_tensor"), py::arg("idx_tensor"),
        py::arg("idx_batch_cnt_tensor"), py::arg("out_tensor"), py::arg("b"),
        py::arg("c"), py::arg("m"), py::arg("nsample"));
  m.def("stack_group_points_backward", &stack_group_points_backward,
        "stack_group_points_backward", py::arg("grad_out_tensor"),
        py::arg("idx_tensor"), py::arg("idx_batch_cnt_tensor"),
        py::arg("features_batch_cnt_tensor"), py::arg("grad_features_tensor"),
        py::arg("b"), py::arg("c"), py::arg("m"), py::arg("n"),
        py::arg("nsample"));
  m.def("knn_forward", &knn_forward, "knn_forward", py::arg("b"), py::arg("n"),
        py::arg("m"), py::arg("nsample"), py::arg("xyz_tensor"),
        py::arg("new_xyz_tensor"), py::arg("idx_tensor"),
        py::arg("dist2_tensor"));
  m.def("iou3d_boxes_overlap_bev_forward", &iou3d_boxes_overlap_bev_forward,
        "iou3d_boxes_overlap_bev_forward", py::arg("boxes_a"),
        py::arg("boxes_b"), py::arg("ans_iou"));
  m.def("iou3d_nms3d_forward", &iou3d_nms3d_forward, "iou3d_nms3d_forward",
        py::arg("boxes"), py::arg("keep"), py::arg("num_out"),
        py::arg("nms_overlap_thresh"));
  m.def("iou3d_nms3d_normal_forward", &iou3d_nms3d_normal_forward,
        "iou3d_nms3d_normal_forward", py::arg("boxes"), py::arg("keep"),
        py::arg("num_out"), py::arg("nms_overlap_thresh"));
  m.def("furthest_point_sampling_forward", &furthest_point_sampling_forward,
        "furthest_point_sampling_forward", py::arg("points_tensor"),
        py::arg("temp_tensor"), py::arg("idx_tensor"), py::arg("b"),
        py::arg("n"), py::arg("m"));
  m.def("furthest_point_sampling_with_dist_forward",
        &furthest_point_sampling_with_dist_forward,
        "furthest_point_sampling_with_dist_forward", py::arg("points_tensor"),
        py::arg("temp_tensor"), py::arg("idx_tensor"), py::arg("b"),
        py::arg("n"), py::arg("m"));
  m.def("masked_im2col_forward", &masked_im2col_forward,
        "masked_im2col_forward", py::arg("im"), py::arg("mask_h_idx"),
        py::arg("mask_w_idx"), py::arg("col"), py::arg("kernel_h"),
        py::arg("kernel_w"), py::arg("pad_h"), py::arg("pad_w"));
  m.def("masked_col2im_forward", &masked_col2im_forward,
        "masked_col2im_forward", py::arg("col"), py::arg("mask_h_idx"),
        py::arg("mask_w_idx"), py::arg("im"), py::arg("height"),
        py::arg("width"), py::arg("channels"));
  m.def("modulated_deform_conv_forward", &modulated_deform_conv_forward,
        "modulated deform conv forward", py::arg("input"), py::arg("weight"),
        py::arg("bias"), py::arg("ones"), py::arg("offset"), py::arg("mask"),
        py::arg("output"), py::arg("columns"), py::arg("kernel_h"),
        py::arg("kernel_w"), py::arg("stride_h"), py::arg("stride_w"),
        py::arg("pad_h"), py::arg("pad_w"), py::arg("dilation_h"),
        py::arg("dilation_w"), py::arg("group"), py::arg("deformable_group"),
        py::arg("with_bias"));
  m.def("modulated_deform_conv_backward", &modulated_deform_conv_backward,
        "modulated deform conv backward", py::arg("input"), py::arg("weight"),
        py::arg("bias"), py::arg("ones"), py::arg("offset"), py::arg("mask"),
        py::arg("columns"), py::arg("grad_input"), py::arg("grad_weight"),
        py::arg("grad_bias"), py::arg("grad_offset"), py::arg("grad_mask"),
        py::arg("grad_output"), py::arg("kernel_h"), py::arg("kernel_w"),
        py::arg("stride_h"), py::arg("stride_w"), py::arg("pad_h"),
        py::arg("pad_w"), py::arg("dilation_h"), py::arg("dilation_w"),
        py::arg("group"), py::arg("deformable_group"), py::arg("with_bias"));
  m.def("nms", &nms, "nms (CPU/CUDA) ", py::arg("boxes"), py::arg("scores"),
        py::arg("iou_threshold"), py::arg("offset"));
  m.def("softnms", &softnms, "softnms (CPU) ", py::arg("boxes"),
        py::arg("scores"), py::arg("dets"), py::arg("iou_threshold"),
        py::arg("sigma"), py::arg("min_score"), py::arg("method"),
        py::arg("offset"));
  m.def("nms_match", &nms_match, "nms_match (CPU) ", py::arg("dets"),
        py::arg("iou_threshold"));
  m.def("pixel_group", &pixel_group, "pixel group (CPU) ", py::arg("score"),
        py::arg("mask"), py::arg("embedding"), py::arg("kernel_label"),
        py::arg("kernel_contour"), py::arg("kernel_region_label"),
        py::arg("distance_threshold"));
  m.def("contour_expand", &contour_expand, "contour exapnd (CPU) ",
        py::arg("kernel_mask"), py::arg("internal_kernel_label"),
        py::arg("min_kernel_area"), py::arg("kernel_num"));
  m.def("roi_align_forward", &roi_align_forward, "roi_align forward",
        py::arg("input"), py::arg("rois"), py::arg("output"),
        py::arg("argmax_y"), py::arg("argmax_x"), py::arg("aligned_height"),
        py::arg("aligned_width"), py::arg("spatial_scale"),
        py::arg("sampling_ratio"), py::arg("pool_mode"), py::arg("aligned"));
  m.def("roi_align_backward", &roi_align_backward, "roi_align backward",
        py::arg("grad_output"), py::arg("rois"), py::arg("argmax_y"),
        py::arg("argmax_x"), py::arg("grad_input"), py::arg("aligned_height"),
        py::arg("aligned_width"), py::arg("spatial_scale"),
        py::arg("sampling_ratio"), py::arg("pool_mode"), py::arg("aligned"));
  m.def("roi_pool_forward", &roi_pool_forward, "roi_pool forward",
        py::arg("input"), py::arg("rois"), py::arg("output"), py::arg("argmax"),
        py::arg("pooled_height"), py::arg("pooled_width"),
        py::arg("spatial_scale"));
  m.def("roi_pool_backward", &roi_pool_backward, "roi_pool backward",
        py::arg("grad_output"), py::arg("rois"), py::arg("argmax"),
        py::arg("grad_input"), py::arg("pooled_height"),
        py::arg("pooled_width"), py::arg("spatial_scale"));
  m.def("sync_bn_forward_mean", &sync_bn_forward_mean, "sync_bn forward_mean",
        py::arg("input"), py::arg("mean"));
  m.def("sync_bn_forward_var", &sync_bn_forward_var, "sync_bn forward_var",
        py::arg("input"), py::arg("mean"), py::arg("var"));
  m.def("sync_bn_forward_output", &sync_bn_forward_output,
        "sync_bn forward_output", py::arg("input"), py::arg("mean"),
        py::arg("var"), py::arg("weight"), py::arg("bias"),
        py::arg("running_mean"), py::arg("running_var"), py::arg("norm"),
        py::arg("std"), py::arg("output"), py::arg("eps"), py::arg("momentum"),
        py::arg("group_size"));
  m.def("sync_bn_backward_param", &sync_bn_backward_param,
        "sync_bn backward_param", py::arg("grad_output"), py::arg("norm"),
        py::arg("grad_weight"), py::arg("grad_bias"));
  m.def("sync_bn_backward_data", &sync_bn_backward_data,
        "sync_bn backward_data", py::arg("grad_output"), py::arg("weight"),
        py::arg("grad_weight"), py::arg("grad_bias"), py::arg("norm"),
        py::arg("std"), py::arg("grad_input"));
  m.def("get_indice_pairs_2d_forward", &get_indice_pairs_forward<2>,
        "get_indice_pairs_2d_forward", py::arg("indices"), py::arg("batchSize"),
        py::arg("outSpatialShape"), py::arg("spatialShape"),
        py::arg("kernelSize"), py::arg("stride"), py::arg("padding"),
        py::arg("dilation"), py::arg("outPadding"), py::arg("_subM"),
        py::arg("_transpose"));
  m.def("get_indice_pairs_3d_forward", &get_indice_pairs_forward<3>,
        "get_indice_pairs_3d_forward", py::arg("indices"), py::arg("batchSize"),
        py::arg("outSpatialShape"), py::arg("spatialShape"),
        py::arg("kernelSize"), py::arg("stride"), py::arg("padding"),
        py::arg("dilation"), py::arg("outPadding"), py::arg("_subM"),
        py::arg("_transpose"));
  m.def("get_indice_pairs_4d_forward", &get_indice_pairs_forward<4>,
        "get_indice_pairs_4d_forward", py::arg("indices"), py::arg("batchSize"),
        py::arg("outSpatialShape"), py::arg("spatialShape"),
        py::arg("kernelSize"), py::arg("stride"), py::arg("padding"),
        py::arg("dilation"), py::arg("outPadding"), py::arg("_subM"),
        py::arg("_transpose"));
  m.def("get_indice_pairs_2d_backward", &get_indice_pairs_backward<2>,
        "get_indice_pairs_2d_backward", py::arg("indices"), py::arg("gridOut"),
        py::arg("batchSize"), py::arg("outSpatialShape"),
        py::arg("spatialShape"), py::arg("kernelSize"), py::arg("stride"),
        py::arg("padding"), py::arg("dilation"), py::arg("outPadding"),
        py::arg("_subM"), py::arg("_transpose"));
  m.def("get_indice_pairs_3d_backward", &get_indice_pairs_backward<3>,
        "get_indice_pairs_3d_backward", py::arg("indices"), py::arg("gridOut"),
        py::arg("batchSize"), py::arg("outSpatialShape"),
        py::arg("spatialShape"), py::arg("kernelSize"), py::arg("stride"),
        py::arg("padding"), py::arg("dilation"), py::arg("outPadding"),
        py::arg("_subM"), py::arg("_transpose"));
  m.def("indice_conv_forward", &indice_conv_forward, "indice_conv_forward",
        py::arg("features"), py::arg("filters"), py::arg("indicePairs"),
        py::arg("indiceNum"), py::arg("numActOut"), py::arg("_inverse"),
        py::arg("_subM"));
  m.def("indice_conv_backward", &indice_conv_backward, "indice_conv_backward",
        py::arg("features"), py::arg("filters"), py::arg("outGrad"),
        py::arg("indicePairs"), py::arg("indiceNum"), py::arg("_inverse"),
        py::arg("_subM"));
  m.def("fused_indice_conv_forward", &fused_indice_conv_batchnorm_forward,
        "fused_indice_conv_forward", py::arg("features"), py::arg("filters"),
        py::arg("bias"), py::arg("indicePairs"), py::arg("indiceNum"),
        py::arg("numActOut"), py::arg("_inverse"), py::arg("_subM"));
  m.def("indice_maxpool_forward", &indice_maxpool_forward,
        "indice_maxpool_forward", py::arg("features"), py::arg("indicePairs"),
        py::arg("indiceNum"), py::arg("numAct"));
  m.def("indice_maxpool_backward", &indice_maxpool_backward,
        "indice_maxpool_backward", py::arg("features"), py::arg("outFeatures"),
        py::arg("outGrad"), py::arg("indicePairs"), py::arg("indiceNum"));
  m.def("psamask_forward", &psamask_forward, "PSAMASK forward (CPU/CUDA)",
        py::arg("input"), py::arg("output"), py::arg("psa_type"),
        py::arg("num_"), py::arg("h_feature"), py::arg("w_feature"),
        py::arg("h_mask"), py::arg("w_mask"), py::arg("half_h_mask"),
        py::arg("half_w_mask"));
  m.def("psamask_backward", &psamask_backward, "PSAMASK backward (CPU/CUDA)",
        py::arg("grad_output"), py::arg("grad_input"), py::arg("psa_type"),
        py::arg("num_"), py::arg("h_feature"), py::arg("w_feature"),
        py::arg("h_mask"), py::arg("w_mask"), py::arg("half_h_mask"),
        py::arg("half_w_mask"));
  m.def("tin_shift_forward", &tin_shift_forward, "tin_shift forward",
        py::arg("input"), py::arg("shift"), py::arg("output"));
  m.def("tin_shift_backward", &tin_shift_backward, "tin_shift backward",
        py::arg("grad_output"), py::arg("shift"), py::arg("grad_input"));
  m.def("box_iou_rotated", &box_iou_rotated, "IoU for rotated boxes",
        py::arg("boxes1"), py::arg("boxes2"), py::arg("ious"),
        py::arg("mode_flag"), py::arg("aligned"));
  m.def("nms_rotated", &nms_rotated, "NMS for rotated boxes", py::arg("dets"),
        py::arg("scores"), py::arg("order"), py::arg("dets_sorted"),
        py::arg("labels"), py::arg("iou_threshold"), py::arg("multi_label"));
  m.def("ball_query_forward", &ball_query_forward, "ball_query_forward",
        py::arg("new_xyz_tensor"), py::arg("xyz_tensor"), py::arg("idx_tensor"),
        py::arg("b"), py::arg("n"), py::arg("m"), py::arg("min_radius"),
        py::arg("max_radius"), py::arg("nsample"));
  m.def("stack_ball_query_forward", &stack_ball_query_forward,
        "stack_ball_query_forward", py::arg("new_xyz_tensor"),
        py::arg("new_xyz_batch_cnt"), py::arg("xyz_tensor"),
        py::arg("xyz_batch_cnt"), py::arg("idx_tensor"), py::arg("max_radius"),
        py::arg("nsample"));
  m.def("roi_align_rotated_forward", &roi_align_rotated_forward,
        "roi_align_rotated forward", py::arg("input"), py::arg("rois"),
        py::arg("output"), py::arg("pooled_height"), py::arg("pooled_width"),
        py::arg("spatial_scale"), py::arg("sampling_ratio"), py::arg("aligned"),
        py::arg("clockwise"));
  m.def("roi_align_rotated_backward", &roi_align_rotated_backward,
        "roi_align_rotated backward", py::arg("rois"), py::arg("grad_input"),
        py::arg("grad_output"), py::arg("pooled_height"),
        py::arg("pooled_width"), py::arg("spatial_scale"),
        py::arg("sampling_ratio"), py::arg("aligned"), py::arg("clockwise"));
  m.def("dynamic_point_to_voxel_forward", &dynamic_point_to_voxel_forward,
        "dynamic_point_to_voxel_forward", py::arg("feats"), py::arg("coors"),
        py::arg("reduce_type"));
  m.def("dynamic_point_to_voxel_backward", &dynamic_point_to_voxel_backward,
        "dynamic_point_to_voxel_backward", py::arg("grad_feats"),
        py::arg("grad_reduced_feats"), py::arg("feats"),
        py::arg("reduced_feats"), py::arg("coors_idx"), py::arg("reduce_count"),
        py::arg("reduce_type"));
  m.def("hard_voxelize_forward", &hard_voxelize_forward,
        "hard_voxelize_forward", py::arg("points"), py::arg("voxel_size"),
        py::arg("coors_range"), py::arg("voxels"), py::arg("coors"),
        py::arg("num_points_per_voxel"), py::arg("voxel_num"),
        py::arg("max_points"), py::arg("max_voxels"), py::arg("NDim"),
        py::arg("deterministic"));
  m.def("dynamic_voxelize_forward", &dynamic_voxelize_forward,
        "dynamic_voxelize_forward", py::arg("points"), py::arg("voxel_size"),
        py::arg("coors_range"), py::arg("coors"), py::arg("NDim"));
  m.def("ms_deform_attn_forward", &ms_deform_attn_forward,
        "forward function of multi-scale deformable attention",
        py::arg("value"), py::arg("value_spatial_shapes"),
        py::arg("value_level_start_index"), py::arg("sampling_locations"),
        py::arg("attention_weights"), py::arg("im2col_step"));
  m.def("ms_deform_attn_backward", &ms_deform_attn_backward,
        "backward function of multi-scale deformable attention",
        py::arg("value"), py::arg("value_spatial_shapes"),
        py::arg("value_level_start_index"), py::arg("sampling_locations"),
        py::arg("attention_weights"), py::arg("grad_output"),
        py::arg("grad_value"), py::arg("grad_sampling_loc"),
        py::arg("grad_attn_weight"), py::arg("im2col_step"));
  m.def("border_align_forward", &border_align_forward,
        "forward function of border_align", py::arg("input"), py::arg("boxes"),
        py::arg("output"), py::arg("argmax_idx"), py::arg("pool_size"));
  m.def("border_align_backward", &border_align_backward,
        "backward function of border_align", py::arg("grad_output"),
        py::arg("boxes"), py::arg("argmax_idx"), py::arg("grad_input"),
        py::arg("pool_size"));
  m.def("correlation_forward", &correlation_forward, "Correlation forward",
        py::arg("input1"), py::arg("input2"), py::arg("output"), py::arg("kH"),
        py::arg("kW"), py::arg("patchH"), py::arg("patchW"), py::arg("padH"),
        py::arg("padW"), py::arg("dilationH"), py::arg("dilationW"),
        py::arg("dilation_patchH"), py::arg("dilation_patchW"), py::arg("dH"),
        py::arg("dW"));
  m.def("correlation_backward", &correlation_backward, "Correlation backward",
        py::arg("grad_output"), py::arg("input1"), py::arg("input2"),
        py::arg("grad_input1"), py::arg("grad_input2"), py::arg("kH"),
        py::arg("kW"), py::arg("patchH"), py::arg("patchW"), py::arg("padH"),
        py::arg("padW"), py::arg("dilationH"), py::arg("dilationW"),
        py::arg("dilation_patchH"), py::arg("dilation_patchW"), py::arg("dH"),
        py::arg("dW"));
  m.def("points_in_boxes_cpu_forward", &points_in_boxes_cpu_forward,
        "points_in_boxes_cpu_forward", py::arg("boxes_tensor"),
        py::arg("pts_tensor"), py::arg("pts_indices_tensor"));
  m.def("points_in_boxes_part_forward", &points_in_boxes_part_forward,
        "points_in_boxes_part_forward", py::arg("boxes_tensor"),
        py::arg("pts_tensor"), py::arg("box_idx_of_points_tensor"));
  m.def("points_in_boxes_all_forward", &points_in_boxes_all_forward,
        "points_in_boxes_all_forward", py::arg("boxes_tensor"),
        py::arg("pts_tensor"), py::arg("box_idx_of_points_tensor"));
  m.def("roiaware_pool3d_forward", &roiaware_pool3d_forward,
        "roiaware_pool3d_forward", py::arg("rois"), py::arg("pts"),
        py::arg("pts_feature"), py::arg("argmax"), py::arg("pts_idx_of_voxels"),
        py::arg("pooled_features"), py::arg("pool_method"));
  m.def("roiaware_pool3d_backward", &roiaware_pool3d_backward,
        "roiaware_pool3d_backward", py::arg("pts_idx_of_voxels"),
        py::arg("argmax"), py::arg("grad_out"), py::arg("grad_in"),
        py::arg("pool_method"));
  m.def("rotated_feature_align_forward", &rotated_feature_align_forward,
        "Feature Refine forward (CUDA)", py::arg("features"),
        py::arg("best_bboxes"), py::arg("output"), py::arg("spatial_scale"),
        py::arg("points"));
  m.def("rotated_feature_align_backward", &rotated_feature_align_backward,
        "Feature Refine backward (CUDA)", py::arg("top_grad"),
        py::arg("best_bboxes"), py::arg("bottom_grad"),
        py::arg("spatial_scale"), py::arg("points"));
  m.def("riroi_align_rotated_forward", &riroi_align_rotated_forward,
        "riroi_align_rotated forward", py::arg("features"), py::arg("rois"),
        py::arg("output"), py::arg("pooled_height"), py::arg("pooled_width"),
        py::arg("spatial_scale"), py::arg("num_samples"),
        py::arg("num_orientations"), py::arg("clockwise"));
  m.def("riroi_align_rotated_backward", &riroi_align_rotated_backward,
        "riroi_align_rotated backward", py::arg("top_grad"), py::arg("rois"),
        py::arg("bottom_grad"), py::arg("pooled_height"),
        py::arg("pooled_width"), py::arg("spatial_scale"),
        py::arg("num_samples"), py::arg("num_orientations"),
        py::arg("clockwise"));
  m.def("points_in_polygons_forward", &points_in_polygons_forward,
        "points_in_polygons_forward", py::arg("points"), py::arg("polygons"),
        py::arg("output"));
  m.def("min_area_polygons", &min_area_polygons, "min_area_polygons",
        py::arg("pointsets"), py::arg("polygons"));
  m.def("active_rotated_filter_forward", &active_rotated_filter_forward,
        "active_rotated_filter_forward", py::arg("input"), py::arg("indices"),
        py::arg("output"));
  m.def("active_rotated_filter_backward", &active_rotated_filter_backward,
        "active_rotated_filter_backward", py::arg("grad_out"),
        py::arg("indices"), py::arg("grad_in"));
  m.def("convex_iou", &convex_iou, "convex_iou", py::arg("pointsets"),
        py::arg("polygons"), py::arg("ious"));
  m.def("convex_giou", &convex_giou, "convex_giou", py::arg("pointsets"),
        py::arg("polygons"), py::arg("output"));
  m.def("diff_iou_rotated_sort_vertices_forward",
        &diff_iou_rotated_sort_vertices_forward,
        "diff_iou_rotated_sort_vertices_forward", py::arg("vertices"),
        py::arg("mask"), py::arg("num_valid"));
  m.def("chamfer_distance_forward", &chamfer_distance_forward,
        "chamfer_distance_forward", py::arg("xyz1"), py::arg("xyz2"),
        py::arg("dist1"), py::arg("dist2"), py::arg("idx1"), py::arg("idx2"));
  m.def("chamfer_distance_backward", &chamfer_distance_backward,
        "chamfer_distance_backward", py::arg("xyz1"), py::arg("xyz2"),
        py::arg("idx1"), py::arg("idx2"), py::arg("graddist1"),
        py::arg("graddist2"), py::arg("gradxyz1"), py::arg("gradxyz2"));
  m.def("prroi_pool_forward", &prroi_pool_forward, "prroi_pool forward",
        py::arg("input"), py::arg("rois"), py::arg("output"),
        py::arg("pooled_height"), py::arg("pooled_width"),
        py::arg("spatial_scale"));
  m.def("prroi_pool_backward", &prroi_pool_backward, "prroi_pool_backward",
        py::arg("grad_output"), py::arg("rois"), py::arg("grad_input"),
        py::arg("pooled_height"), py::arg("pooled_width"),
        py::arg("spatial_scale"));
  m.def("prroi_pool_coor_backward", &prroi_pool_coor_backward,
        "prroi_pool_coor_backward", py::arg("output"), py::arg("grad_output"),
        py::arg("input"), py::arg("rois"), py::arg("grad_rois"),
        py::arg("pooled_height"), py::arg("pooled_width"),
        py::arg("spatial_scale"));
  m.def("bias_act", &bias_act, "bias_act (CUDA)", py::arg("input"),
        py::arg("bias"), py::arg("xref"), py::arg("yref"), py::arg("dy"),
        py::arg("grad"), py::arg("dim"), py::arg("act"), py::arg("alpha"),
        py::arg("gain"), py::arg("clamp"));
  m.def("filtered_lrelu", &filtered_lrelu, "filtered_lrelu (CUDA)",
        py::arg("x"), py::arg("fu"), py::arg("fd"), py::arg("b"), py::arg("si"),
        py::arg("up"), py::arg("down"), py::arg("px0"), py::arg("px1"),
        py::arg("py0"), py::arg("py1"), py::arg("sx"), py::arg("sy"),
        py::arg("gain"), py::arg("slope"), py::arg("clamp"),
        py::arg("flip_filters"), py::arg("writeSigns"));
  m.def("filtered_lrelu_act_", &filtered_lrelu_act_,
        "filtered_lrelu_act_ (CUDA)", py::arg("x"), py::arg("si"),
        py::arg("sx"), py::arg("sy"), py::arg("gain"), py::arg("slope"),
        py::arg("clamp"), py::arg("writeSigns"));
  m.def("box_iou_quadri", &box_iou_quadri, "IoU for quadrilateral boxes",
        py::arg("boxes1"), py::arg("boxes2"), py::arg("ious"),
        py::arg("mode_flag"), py::arg("aligned"));
  m.def("nms_quadri", &nms_quadri, "NMS for quadrilateral boxes",
        py::arg("dets"), py::arg("scores"), py::arg("order"),
        py::arg("dets_sorted"), py::arg("iou_threshold"),
        py::arg("multi_label"));
  m.def("bezier_align_forward", &bezier_align_forward, "bezier_align forward",
        py::arg("input"), py::arg("rois"), py::arg("output"),
        py::arg("aligned_height"), py::arg("aligned_width"),
        py::arg("spatial_scale"), py::arg("sampling_ratio"),
        py::arg("aligned"));
  m.def("bezier_align_backward", &bezier_align_backward,
        "bezier_align backward", py::arg("grad_output"), py::arg("rois"),
        py::arg("grad_input"), py::arg("aligned_height"),
        py::arg("aligned_width"), py::arg("spatial_scale"),
        py::arg("sampling_ratio"), py::arg("aligned"));
}


================================================
FILE: mmcv/ops/csrc/pytorch/riroi_align_rotated.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub for the riroi_align_rotated forward pass.
// Every argument is handed unchanged, in declaration order, to
// DISPATCH_DEVICE_IMPL keyed on this function's own name. Presumably the
// macro (pytorch_device_registry.hpp) selects the backend registered for the
// tensors' device -- confirm against that header.
void riroi_align_rotated_forward_impl(Tensor features, Tensor rois,
                                      Tensor output, int pooled_height,
                                      int pooled_width, float spatial_scale,
                                      int num_samples, int num_orientations,
                                      bool clockwise) {
  DISPATCH_DEVICE_IMPL(riroi_align_rotated_forward_impl, features, rois, output,
                       pooled_height, pooled_width, spatial_scale, num_samples,
                       num_orientations, clockwise);
}

// Dispatch stub for the riroi_align_rotated backward pass.
// Every argument is handed unchanged, in declaration order, to
// DISPATCH_DEVICE_IMPL keyed on this function's own name. Presumably the
// macro (pytorch_device_registry.hpp) selects the backend registered for the
// tensors' device -- confirm against that header.
void riroi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
                                       Tensor bottom_grad, int pooled_height,
                                       int pooled_width, float spatial_scale,
                                       int num_samples, int num_orientations,
                                       bool clockwise) {
  DISPATCH_DEVICE_IMPL(riroi_align_rotated_backward_impl, top_grad, rois,
                       bottom_grad, pooled_height, pooled_width, spatial_scale,
                       num_samples, num_orientations, clockwise);
}

// Entry point bound to Python as "riroi_align_rotated_forward".
// Does no work of its own: all nine arguments are passed through, in order,
// to riroi_align_rotated_forward_impl.
void riroi_align_rotated_forward(Tensor features, Tensor rois, Tensor output,
                                 int pooled_height, int pooled_width,
                                 float spatial_scale, int num_samples,
                                 int num_orientations, bool clockwise) {
  return riroi_align_rotated_forward_impl(
      features, rois, output, pooled_height, pooled_width, spatial_scale,
      num_samples, num_orientations, clockwise);
}

// Entry point bound to Python as "riroi_align_rotated_backward".
// Does no work of its own: all nine arguments are passed through, in order,
// to riroi_align_rotated_backward_impl.
void riroi_align_rotated_backward(Tensor top_grad, Tensor rois,
                                  Tensor bottom_grad, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int num_samples, int num_orientations,
                                  bool clockwise) {
  return riroi_align_rotated_backward_impl(
      top_grad, rois, bottom_grad, pooled_height, pooled_width, spatial_scale,
      num_samples, num_orientations, clockwise);
}


================================================
FILE: mmcv/ops/csrc/pytorch/roi_align.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_DIOPI
#include <diopi/diopirt.h>
#include <diopi/functions.h>
#include <diopi/functions_mmcv.h>
#include <torch/csrc/utils/pybind.h>

#include "csrc_dipu/base/basedef.h"
#include "csrc_dipu/diopirt/diopirt_impl.h"
#include "csrc_dipu/runtime/device/deviceapis.h"
#include "csrc_dipu/utils/helpfunc.hpp"

using dipu::VENDOR_TYPE;
using dipu::diopi_helper::toDiopiScalar;
using dipu::diopi_helper::toDiopiTensorHandle;
#endif

// Dispatch stub for the roi_align forward pass.
// Every argument is handed unchanged, in declaration order, to
// DISPATCH_DEVICE_IMPL keyed on this function's own name. Presumably the
// macro (pytorch_device_registry.hpp) selects the backend registered for the
// tensors' device -- confirm against that header.
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned) {
  DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y,
                       argmax_x, aligned_height, aligned_width, spatial_scale,
                       sampling_ratio, pool_mode, aligned);
}

// Dispatch stub for the roi_align backward pass.
// Every argument is handed unchanged, in declaration order, to
// DISPATCH_DEVICE_IMPL keyed on this function's own name. Presumably the
// macro (pytorch_device_registry.hpp) selects the backend registered for the
// tensors' device -- confirm against that header.
void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,
                             Tensor argmax_x, Tensor grad_input,
                             int aligned_height, int aligned_width,
                             float spatial_scale, int sampling_ratio,
                             int pool_mode, bool aligned) {
  DISPATCH_DEVICE_IMPL(roi_align_backward_impl, grad_output, rois, argmax_y,
                       argmax_x, grad_input, aligned_height, aligned_width,
                       spatial_scale, sampling_ratio, pool_mode, aligned);
}

#ifdef MMCV_WITH_DIOPI
// DIOPI-aware forward pass for roi_align.
//
// Order of attempts:
//   1. If DIOPI reports the input tensor as a host tensor, call
//      roi_align_forward_impl directly and return.
//   2. Otherwise, when the input is on the DIPU device type and the
//      diopiRoiAlignMmcv symbol is non-null, call it; return if it reports
//      diopiSuccess.
//   3. Otherwise log a warning, recompute the op on host copies of all
//      tensors, and copy every result tensor back to the caller's tensors.
//
// Results are written into `output`, `argmax_y` and `argmax_x`; nothing is
// returned.
void roi_align_forward_diopi(Tensor input, Tensor rois, Tensor output,
                             Tensor argmax_y, Tensor argmax_x,
                             int aligned_height, int aligned_width,
                             float spatial_scale, int sampling_ratio,
                             int pool_mode, bool aligned) {
  auto input_p = toDiopiTensorHandle(input);
  diopiDevice_t device;
  diopiGetTensorDevice(input_p, &device);
  if (device == diopi_host) {
    roi_align_forward_impl(input, rois, output, argmax_y, argmax_x,
                           aligned_height, aligned_width, spatial_scale,
                           sampling_ratio, pool_mode, aligned);
    return;
  }
  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
  diopiContextHandle_t ch = &ctx;
  auto rois_p = toDiopiTensorHandle(rois);
  auto out_p = toDiopiTensorHandle(output);
  auto argmax_y_p = toDiopiTensorHandle(argmax_y);
  auto argmax_x_p = toDiopiTensorHandle(argmax_x);
  bool is_mock_cuda = input.device().type() == dipu::DIPU_DEVICE_TYPE;
  // The null test on the function address presumably guards against a
  // vendor library that does not provide this kernel -- TODO confirm.
  if (is_mock_cuda && reinterpret_cast<void *>(diopiRoiAlignMmcv) != nullptr) {
    if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), "NPU") == 0) {
      // Only the NPU vendor path runs the kernel with the GIL released.
      pybind11::gil_scoped_release no_gil;
      auto ret = diopiRoiAlignMmcv(
          ch, out_p, argmax_y_p, argmax_x_p, input_p, rois_p, aligned_height,
          aligned_width, sampling_ratio, pool_mode, spatial_scale, aligned);
      if (ret == diopiSuccess) return;
    } else {
      auto ret = diopiRoiAlignMmcv(
          ch, out_p, argmax_y_p, argmax_x_p, input_p, rois_p, aligned_height,
          aligned_width, sampling_ratio, pool_mode, spatial_scale, aligned);
      if (ret == diopiSuccess) return;
    }
  }
  LOG(WARNING) << "Fallback to cpu: mmcv ext op roi_align_forward";
  auto input_cpu = input.cpu();
  auto rois_cpu = rois.cpu();
  auto out_cpu = output.cpu();
  auto argmax_y_cpu = argmax_y.cpu();
  auto argmax_x_cpu = argmax_x.cpu();
  roi_align_forward_impl(input_cpu, rois_cpu, out_cpu, argmax_y_cpu,
                         argmax_x_cpu, aligned_height, aligned_width,
                         spatial_scale, sampling_ratio, pool_mode, aligned);
  // The host copies are separate tensors, so each result must be copied back
  // explicitly. argmax_y/argmax_x are passed to the kernel next to `output`
  // and are later read by roi_align_backward; leaving them out would hand the
  // backward pass argmax tensors the forward pass never filled in.
  output.copy_(out_cpu);
  argmax_y.copy_(argmax_y_cpu);
  argmax_x.copy_(argmax_x_cpu);
}

// DIOPI-aware backward pass for roi_align.
//
// Order of attempts:
//   1. If DIOPI reports grad_output as a host tensor, call
//      roi_align_backward_impl directly and return.
//   2. Otherwise, when grad_output is on the DIPU device type and the
//      diopiRoiAlignBackwardMmcv symbol is non-null, call it; return if it
//      reports diopiSuccess.
//   3. Otherwise log a warning, recompute on host copies and copy the
//      resulting gradient back into `grad_input`.
void roi_align_backward_diopi(Tensor grad_output, Tensor rois, Tensor argmax_y,
                              Tensor argmax_x, Tensor grad_input,
                              int aligned_height, int aligned_width,
                              float spatial_scale, int sampling_ratio,
                              int pool_mode, bool aligned) {
  auto grad_out_handle = toDiopiTensorHandle(grad_output);
  diopiDevice_t tensor_device;
  diopiGetTensorDevice(grad_out_handle, &tensor_device);
  if (tensor_device == diopi_host) {
    roi_align_backward_impl(grad_output, rois, argmax_y, argmax_x, grad_input,
                            aligned_height, aligned_width, spatial_scale,
                            sampling_ratio, pool_mode, aligned);
    return;
  }

  auto rois_handle = toDiopiTensorHandle(rois);
  auto argmax_y_handle = toDiopiTensorHandle(argmax_y);
  auto argmax_x_handle = toDiopiTensorHandle(argmax_x);
  auto grad_in_handle = toDiopiTensorHandle(grad_input);
  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
  diopiContextHandle_t ctx_handle = &ctx;

  const bool on_dipu_device =
      grad_output.device().type() == dipu::DIPU_DEVICE_TYPE;
  if (on_dipu_device &&
      reinterpret_cast<void *>(diopiRoiAlignBackwardMmcv) != nullptr) {
    // Single definition of the kernel call, shared by both vendor branches.
    auto launch_kernel = [&]() {
      return diopiRoiAlignBackwardMmcv(
          ctx_handle, grad_in_handle, grad_out_handle, rois_handle,
          argmax_y_handle, argmax_x_handle, aligned_height, aligned_width,
          sampling_ratio, pool_mode, spatial_scale, aligned);
    };
    if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), "NPU") == 0) {
      // Only the NPU vendor path runs the kernel with the GIL released.
      pybind11::gil_scoped_release no_gil;
      if (launch_kernel() == diopiSuccess) return;
    } else if (launch_kernel() == diopiSuccess) {
      return;
    }
  }

  LOG(WARNING) << "Fallback to cpu: mmcv ext op roi_align_backward";
  auto host_grad_output = grad_output.cpu();
  auto host_rois = rois.cpu();
  auto host_argmax_y = argmax_y.cpu();
  auto host_argmax_x = argmax_x.cpu();
  auto host_grad_input = grad_input.cpu();
  roi_align_backward_impl(host_grad_output, host_rois, host_argmax_y,
                          host_argmax_x, host_grad_input, aligned_height,
                          aligned_width, spatial_scale, sampling_ratio,
                          pool_mode, aligned);
  // Copy the gradient computed on the host back into the caller's tensor.
  grad_input.copy_(host_grad_input);
}
#endif

// Entry point for the roi_align forward op.
// Builds without MMCV_WITH_DIOPI call roi_align_forward_impl; builds with it
// route through roi_align_forward_diopi. Arguments are passed through
// unchanged in both cases.
void roi_align_forward(Tensor input, Tensor rois, Tensor output,
                       Tensor argmax_y, Tensor argmax_x, int aligned_height,
                       int aligned_width, float spatial_scale,
                       int sampling_ratio, int pool_mode, bool aligned) {
#ifndef MMCV_WITH_DIOPI
  return roi_align_forward_impl(
      input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,
      spatial_scale, sampling_ratio, pool_mode, aligned);
#else
  return roi_align_forward_diopi(
      input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,
      spatial_scale, sampling_ratio, pool_mode, aligned);
#endif
}

// Entry point for the roi_align backward op.
// Builds without MMCV_WITH_DIOPI call roi_align_backward_impl; builds with it
// route through roi_align_backward_diopi. Arguments are passed through
// unchanged in both cases.
void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,
                        Tensor argmax_x, Tensor grad_input, int aligned_height,
                        int aligned_width, float spatial_scale,
                        int sampling_ratio, int pool_mode, bool aligned) {
#ifndef MMCV_WITH_DIOPI
  return roi_align_backward_impl(
      grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,
      aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned);
#else
  return roi_align_backward_diopi(
      grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,
      aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned);
#endif
}


================================================
FILE: mmcv/ops/csrc/pytorch/roi_align_rotated.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub for the roi_align_rotated forward pass.
// Every argument is handed unchanged, in declaration order, to
// DISPATCH_DEVICE_IMPL keyed on this function's own name. Presumably the
// macro (pytorch_device_registry.hpp) selects the backend registered for the
// tensors' device -- confirm against that header.
void roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,
                                    int aligned_height, int aligned_width,
                                    float spatial_scale, int sampling_ratio,
                                    bool aligned, bool clockwise) {
  DISPATCH_DEVICE_IMPL(roi_align_rotated_forward_impl, input, rois, output,
                       aligned_height, aligned_width, spatial_scale,
                       sampling_ratio, aligned, clockwise);
}

// Dispatch stub for the roi_align_rotated backward pass.
// Every argument is handed unchanged, in declaration order, to
// DISPATCH_DEVICE_IMPL keyed on this function's own name. Presumably the
// macro (pytorch_device_registry.hpp) selects the backend registered for the
// tensors' device -- confirm against that header.
void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
                                     Tensor bottom_grad, int aligned_height,
                                     int aligned_width, float spatial_scale,
                                     int sampling_ratio, bool aligned,
                                     bool clockwise) {
  DISPATCH_DEVICE_IMPL(roi_align_rotated_backward_impl, top_grad, rois,
                       bottom_grad, aligned_height, aligned_width,
                       spatial_scale, sampling_ratio, aligned, clockwise);
}

// Entry point bound to Python as "roi_align_rotated_forward".
// Does no work of its own: all arguments are passed through, in order, to
// roi_align_rotated_forward_impl.
void roi_align_rotated_forward(Tensor input, Tensor rois, Tensor output,
                               int aligned_height, int aligned_width,
                               float spatial_scale, int sampling_ratio,
                               bool aligned, bool clockwise) {
  return roi_align_rotated_forward_impl(
      input, rois, output, aligned_height, aligned_width, spatial_scale,
      sampling_ratio, aligned, clockwise);
}

// Entry point bound to Python as "roi_align_rotated_backward".
// Does no work of its own: all arguments are passed through, in order, to
// roi_align_rotated_backward_impl.
void roi_align_rotated_backward(Tensor top_grad, Tensor rois,
                                Tensor bottom_grad, int aligned_height,
                                int aligned_width, float spatial_scale,
                                int sampling_ratio, bool aligned,
                                bool clockwise) {
  return roi_align_rotated_backward_impl(
      top_grad, rois, bottom_grad, aligned_height, aligned_width,
      spatial_scale, sampling_ratio, aligned, clockwise);
}


================================================
FILE: mmcv/ops/csrc/pytorch/roi_pool.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub for the roi_pool forward pass.
// Every argument is handed unchanged, in declaration order, to
// DISPATCH_DEVICE_IMPL keyed on this function's own name. Presumably the
// macro (pytorch_device_registry.hpp) selects the backend registered for the
// tensors' device -- confirm against that header.
void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax, int pooled_height, int pooled_width,
                           float spatial_scale) {
  DISPATCH_DEVICE_IMPL(roi_pool_forward_impl, input, rois, output, argmax,
                       pooled_height, pooled_width, spatial_scale);
}

// Dispatch stub for the roi_pool backward pass.
// Every argument is handed unchanged, in declaration order, to
// DISPATCH_DEVICE_IMPL keyed on this function's own name. Presumably the
// macro (pytorch_device_registry.hpp) selects the backend registered for the
// tensors' device -- confirm against that header.
void roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,
                            Tensor grad_input, int pooled_height,
                            int pooled_width, float spatial_scale) {
  DISPATCH_DEVICE_IMPL(roi_pool_backward_impl, grad_output, rois, argmax,
                       grad_input, pooled_height, pooled_width, spatial_scale);
}

// Entry point for the roi_pool forward op.
// Does no work of its own: all arguments are passed through, in order, to
// roi_pool_forward_impl.
void roi_pool_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax,
                      int pooled_height, int pooled_width,
                      float spatial_scale) {
  return roi_pool_forward_impl(input, rois, output, argmax,
                               pooled_height, pooled_width,
                               spatial_scale);
}

// Entry point for the roi_pool backward op.
// Does no work of its own: all arguments are passed through, in order, to
// roi_pool_backward_impl.
void roi_pool_backward(Tensor grad_output, Tensor rois, Tensor argmax,
                       Tensor grad_input, int pooled_height, int pooled_width,
                       float spatial_scale) {
  return roi_pool_backward_impl(grad_output, rois, argmax, grad_input,
                                pooled_height, pooled_width,
                                spatial_scale);
}


================================================
FILE: mmcv/ops/csrc/pytorch/roiaware_pool3d.cpp
================================================
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub for the roiaware_pool3d forward pass.
// Takes the sizes already extracted by roiaware_pool3d_forward plus the
// tensors themselves, and hands everything unchanged to
// DISPATCH_DEVICE_IMPL keyed on this function's own name. Presumably the
// macro (pytorch_device_registry.hpp) selects the backend registered for the
// tensors' device -- confirm against that header.
void roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels,
                                  int max_pts_each_voxel, int out_x, int out_y,
                                  int out_z, const Tensor rois,
                                  const Tensor pts, const Tensor pts_feature,
                                  Tensor argmax, Tensor pts_idx_of_voxels,
                                  Tensor pooled_features, int pool_method) {
  DISPATCH_DEVICE_IMPL(roiaware_pool3d_forward_impl, boxes_num, pts_num,
                       channels, max_pts_each_voxel, out_x, out_y, out_z, rois,
                       pts, pts_feature, argmax, pts_idx_of_voxels,
                       pooled_features, pool_method);
}

// Dispatch stub for the roiaware_pool3d backward pass.
// Takes the sizes already extracted by roiaware_pool3d_backward plus the
// tensors themselves, and hands everything unchanged to
// DISPATCH_DEVICE_IMPL keyed on this function's own name. Presumably the
// macro (pytorch_device_registry.hpp) selects the backend registered for the
// tensors' device -- confirm against that header.
void roiaware_pool3d_backward_impl(int boxes_num, int out_x, int out_y,
                                   int out_z, int channels,
                                   int max_pts_each_voxel,
                                   const Tensor pts_idx_of_voxels,
                                   const Tensor argmax, const Tensor grad_out,
                                   Tensor grad_in, int pool_method) {
  DISPATCH_DEVICE_IMPL(roiaware_pool3d_backward_impl, boxes_num, out_x, out_y,
                       out_z, channels, max_pts_each_voxel, pts_idx_of_voxels,
                       argmax, grad_out, grad_in, pool_method);
}

// Entry point bound to Python as "roiaware_pool3d_forward".
// Reads the problem sizes off the tensor shapes, validates the voxel grid
// extent, and forwards everything to roiaware_pool3d_forward_impl.
//
// Raises (via TORCH_CHECK) when any of out_x / out_y / out_z is 256 or more.
void roiaware_pool3d_forward(Tensor rois, Tensor pts, Tensor pts_feature,
                             Tensor argmax, Tensor pts_idx_of_voxels,
                             Tensor pooled_features, int pool_method) {
  // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, ry] in LiDAR
  // coordinate
  // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
  // params pts_feature: (npoints, C)
  // params argmax: (N, out_x, out_y, out_z, C)
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
  // params pooled_features: (N, out_x, out_y, out_z, C)
  // params pool_method: 0: max_pool 1: avg_pool
  int boxes_num = rois.size(0);
  int pts_num = pts.size(0);
  int channels = pts_feature.size(1);
  int max_pts_each_voxel = pts_idx_of_voxels.size(4);  // index 0 is the counter
  int out_x = pts_idx_of_voxels.size(1);
  int out_y = pts_idx_of_voxels.size(2);
  int out_z = pts_idx_of_voxels.size(3);
  // Voxel indices are encoded with 8 bits per axis. A plain assert() is
  // compiled out under NDEBUG, so release builds would accept an oversized
  // grid without complaint; TORCH_CHECK enforces the limit in every build.
  TORCH_CHECK((out_x < 256) && (out_y < 256) && (out_z < 256),
              "roiaware_pool3d_forward: out_x, out_y and out_z must each be "
              "smaller than 256 (index is encoded with 8bit), but got out_x=",
              out_x, ", out_y=", out_y, ", out_z=", out_z);

  roiaware_pool3d_forward_impl(boxes_num, pts_num, channels, max_pts_each_voxel,
                               out_x, out_y, out_z, rois, pts, pts_feature,
                               argmax, pts_idx_of_voxels, pooled_features,
                               pool_method);
}

void roiaware_pool3d_backward(Tensor pts_idx_of_voxels, Tensor argmax,
                              Tensor grad_out, Tensor grad_in,
                              int pool_method) {
  // Public entry point of the RoI-aware 3D pooling backward pass: derives the
  // problem sizes from the tensor shapes and delegates to
  // roiaware_pool3d_backward_impl.
  //
  // Expected layouts:
  //   pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
  //   argmax:            (N, out_x, out_y, out_z, C)
  //   grad_out:          (N, out_x, out_y, out_z, C)
  //   grad_in:           (npoints, C), return value
  //   pool_method:       0: max_pool 1: avg_pool

  // Channel count comes from the last dimension of the incoming gradient.
  const int num_channels = grad_out.size(4);

  // Everything else is read from the voxel index tensor; slot 0 of its last
  // dimension is the counter.
  const int num_boxes = pts_idx_of_voxels.size(0);
  const int grid_x = pts_idx_of_voxels.size(1);
  const int grid_y = pts_idx_of_voxels.size(2);
  const int grid_z = pts_idx_of_voxels.size(3);
  const int pts_per_voxel = pts_idx_of_voxels.size(4);

  roiaware_pool3d_backward_impl(num_boxes, grid_x, grid_y, grid_z,
                                num_channels, pts_per_voxel, pts_idx_of_voxels,
                                argmax, grad_out, grad_in, pool_method);
}


================================================
FILE: mmcv/ops/csrc/pytorch/roipoint_pool3d.cpp
================================================
/*
Modified from
https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d.cpp
Point cloud feature pooling
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatch stub for the RoI point pooling forward pass.
// Every argument is handed, unchanged and in order, to DISPATCH_DEVICE_IMPL.
// That macro is defined outside this file; presumably it selects the
// implementation registered for the tensors' device (confirm in
// pytorch_device_registry.hpp).
void roipoint_pool3d_forward_impl(int batch_size, int pts_num, int boxes_num,
                                  int feature_in_len, int sampled_pts_num,
                                  const Tensor xyz, const Tensor boxes3d,
                                  const Tensor pts_feature,
                                  Tensor pooled_features,
                                  Tensor pooled_empty_flag) {
  DISPATCH_DEVICE_IMPL(roipoint_pool3d_forward_impl, batch_size, pts_num,
                       boxes_num, feature_in_len, sampled_pts_num, xyz, boxes3d,
                       pts_feature, pooled_features, pooled_empty_flag);
}

void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature,
                             Tensor pooled_features, Tensor pooled_empty_flag) {
  // Public entry point of the RoI point pooling forward pass: derives the
  // problem sizes from the tensor shapes and delegates to
  // roipoint_pool3d_forward_impl.
  //
  // Expected layouts:
  //   xyz:               (B, N, 3)
  //   boxes3d:           (B, M, 7)
  //   pts_feature:       (B, N, C)
  //   pooled_features:   (B, M, 512, 3+C)
  //   pooled_empty_flag: (B, M)

  // Number of points sampled per box is the third dimension of the output.
  const int num_sampled = pooled_features.size(2);
  const int num_feat_in = pts_feature.size(2);
  const int num_boxes = boxes3d.size(1);
  const int num_points = xyz.size(1);
  const int num_batches = xyz.size(0);

  roipoint_pool3d_forward_impl(num_batches, num_points, num_boxes, num_feat_in,
                               num_sampled, xyz, boxes3d, pts_feature,
                               pooled_features, pooled_empty_flag);
}


================================================
FILE: mmcv/ops/csrc/pytorch/rotated_feature_align.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_cuda.cpp

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatch stub for the rotated feature align forward pass.
// Every argument is handed, unchanged and in order, to DISPATCH_DEVICE_IMPL
// (defined outside this file; presumably it selects the implementation
// registered for the tensors' device -- confirm in
// pytorch_device_registry.hpp).
void rotated_feature_align_forward_impl(const Tensor features,
                                        const Tensor best_bboxes,
                                        const float spatial_scale,
                                        const int points, Tensor output) {
  DISPATCH_DEVICE_IMPL(rotated_feature_align_forward_impl, features,
                       best_bboxes, spatial_scale, points, output);
}

// Device-dispatch stub for the rotated feature align backward pass.
// Every argument is handed, unchanged and in order, to DISPATCH_DEVICE_IMPL
// (defined outside this file; presumably it selects the implementation
// registered for the tensors' device -- confirm in
// pytorch_device_registry.hpp).
void rotated_feature_align_backward_impl(const Tensor top_grad,
                                         const Tensor best_bboxes,
                                         const float spatial_scale,
                                         const int points, Tensor bottom_grad) {
  DISPATCH_DEVICE_IMPL(rotated_feature_align_backward_impl, top_grad,
                       best_bboxes, spatial_scale, points, bottom_grad);
}

// Public entry point of the rotated feature align forward pass.
// Pure pass-through to rotated_feature_align_forward_impl. Note that the
// argument order differs: here `output` comes third, while the impl takes it
// last, after `spatial_scale` and `points`.
void rotated_feature_align_forward(const Tensor features,
                                   const Tensor best_bboxes, Tensor output,
                                   const float spatial_scale,
                                   const int points) {
  rotated_feature_align_forward_impl(features, best_bboxes, spatial_scale,
                                     points, output);
}

// Public entry point of the rotated feature align backward pass.
// Pure pass-through to rotated_feature_align_backward_impl. Note that the
// argument order differs: here `bottom_grad` comes third, while the impl takes
// it last, after `spatial_scale` and `points`.
void rotated_feature_align_backward(const Tensor top_grad,
                                    const Tensor best_bboxes,
                                    Tensor bottom_grad,
                                    const float spatial_scale,
                                    const int points) {
  rotated_feature_align_backward_impl(top_grad, best_bboxes, spatial_scale,
                                      points, bottom_grad);
}


================================================
FILE: mmcv/ops/csrc/pytorch/scatter_points.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Reduction mode used by the dynamic point-to-voxel ops in this file.
// The numeric values are explicit (SUM = 0, MEAN = 1, MAX = 2); the string
// names "sum" / "mean" / "max" are mapped onto them by convert_reduce_type().
typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;

// Device-dispatch stub for the dynamic point-to-voxel forward pass.
// Returns whatever DISPATCH_DEVICE_IMPL yields for the given arguments. The
// macro is defined outside this file; presumably it selects the
// implementation registered for the tensors' device (confirm in
// pytorch_device_registry.hpp).
std::vector<torch::Tensor> dynamic_point_to_voxel_forward_impl(
    const torch::Tensor &feats, const torch::Tensor &coors,
    const reduce_t reduce_type) {
  return DISPATCH_DEVICE_IMPL(dynamic_point_to_voxel_forward_impl, feats, coors,
                              reduce_type);
}

// Device-dispatch stub for the dynamic point-to-voxel backward pass.
// `grad_feats` is the only non-const reference, so it is presumably the
// output buffer (confirm against the device implementations). All arguments
// are handed, unchanged and in order, to DISPATCH_DEVICE_IMPL, which is
// defined outside this file.
void dynamic_point_to_voxel_backward_impl(
    torch::Tensor &grad_feats, const torch::Tensor &grad_reduced_feats,
    const torch::Tensor &feats, const torch::Tensor &reduced_feats,
    const torch::Tensor &coors_idx, const torch::Tensor &reduce_count,
    const reduce_t reduce_type) {
  DISPATCH_DEVICE_IMPL(dynamic_point_to_voxel_backward_impl, grad_feats,
                       grad_reduced_feats, feats, reduced_feats, coors_idx,
                       reduce_count, reduce_type);
}

// Translates the textual reduction name ("max", "sum" or "mean") into the
// matching reduce_t enumerator. Any other string is rejected through
// TORCH_CHECK with a message that names the offending value.
inline reduce_t convert_reduce_type(const std::string &reduce_type) {
  if (reduce_type == "max") {
    return reduce_t::MAX;
  }
  if (reduce_type == "sum") {
    return reduce_t::SUM;
  }
  if (reduce_type == "mean") {
    return reduce_t::MEAN;
  }

  // Unknown name: report it.
  TORCH_CHECK(false, "do not support reduce type " + reduce_type);
  // Fallback value, kept so that every control path returns a reduce_t.
  return reduce_t::SUM;
}

// Public entry point of the dynamic point-to-voxel forward pass.
// Converts the textual `reduce_type` with convert_reduce_type() (which rejects
// anything other than "max", "sum" or "mean") and returns the result of
// dynamic_point_to_voxel_forward_impl.
std::vector<torch::Tensor> dynamic_point_to_voxel_forward(
    const torch::Tensor &feats, const torch::Tensor &coors,
    const std::string &reduce_type) {
  return dynamic_point_to_voxel_forward_impl(feats, coors,
                                             convert_reduce_type(reduce_type));
}

// Public entry point of the dynamic point-to-voxel backward pass.
// Converts the textual `reduce_type` with convert_reduce_type() (which rejects
// anything other than "max", "sum" or "mean") and forwards all tensors, in the
// same order, to dynamic_point_to_voxel_backward_impl.
void dynamic_point_to_voxel_backward(torch::Tensor &grad_feats,
                                     const torch::Tensor &grad_reduced_feats,
                                     const torch::Tensor &feats,
                                     const torch::Tensor &reduced_feats,
                                     const torch::Tensor &coors_idx,
                                     const torch::Tensor &reduce_count,
                                     const std::string &reduce_type) {
  dynamic_point_to_voxel_backward_impl(grad_feats, grad_reduced_feats, feats,
                                       reduced_feats, coors_idx, reduce_count,
                                       convert_reduce_type(reduce_type));
}


================================================
FILE: mmcv/ops/csrc/pytorch/sparse_pool_ops.cpp
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatch stub for the sparse (indice-based) max-pool forward pass.
// Returns whatever DISPATCH_DEVICE_IMPL yields for the given arguments. The
// macro is defined outside this file; presumably it selects the
// implementation registered for the tensors' device (confirm in
// pytorch_device_registry.hpp).
torch::Tensor indice_maxpool_forward_impl(torch::Tensor features,
                                          torch::Tensor indicePairs,
                                          torch::Tensor indiceNum,
                                          int64_t numAct) {
  return DISPATCH_DEVICE_IMPL(indice_maxpool_forward_impl, features,
                              indicePairs, indiceNum, numAct);
}

// Public entry point of the sparse max-pool forward pass; a pure pass-through
// that returns the result of indice_maxpool_forward_impl.
torch::Tensor indice_maxpool_forward(torch::Tensor features,
                                     torch::Tensor indicePairs,
                                     torch::Tensor indiceNum, int64_t numAct) {
  return indice_maxpool_forward_impl(features, indicePairs, indiceNum, numAct);
}

// Device-dispatch stub for the sparse (indice-based) max-pool backward pass.
// Returns whatever DISPATCH_DEVICE_IMPL yields for the given arguments. The
// macro is defined outside this file; presumably it selects the
// implementation registered for the tensors' device (confirm in
// pytorch_device_registry.hpp).
torch::Tensor indice_maxpool_backward_impl(torch::Tensor features,
                                           torch::Tensor outFeatures,
                                           torch::Tensor outGrad,
                                           torch::Tensor indicePairs,
                                           torch::Tensor indiceNum) {
  return DISPATCH_DEVICE_IMPL(indice_maxpool_backward_impl, features,
                              outFeatures, outGrad, indicePairs, indiceNum);
}

// Public entry point of the sparse max-pool backward pass; a pure
// pass-through that returns the result of indice_maxpool_backward_impl.
torch::Tensor indice_maxpool_backward(torch::Tensor features,
                                      torch::Tensor outFeatures,
                                      torch::Tensor outGrad,
                                      torch::Tensor indicePairs,
                                      torch::Tensor indiceNum) {
  return indice_maxpool_backward_impl(features, outFeatures, outGrad,
                                      indicePairs, indiceNum);
}


================================================
FILE: mmcv/ops/csrc/pytorch/spconv_ops.cpp
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Declaration of the CUDA launcher that builds the indice pairs for an
// NDim-dimensional sparse convolution. Its definition is not in this file.
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

// CUDA adapter used by get_indice_pairs_forward(): forwards every argument,
// unchanged and in order, to GetIndicePairsForwardCUDAKernelLauncher<NDim>
// and returns its result.
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_forward_cuda(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
  return GetIndicePairsForwardCUDAKernelLauncher<NDim>(
      indices, batchSize, outSpatialShape, spatialShape, kernelSize, stride,
      padding, dilation, outPadding, _subM, _transpose);
}

// Declaration of the MLU launcher that builds the indice pairs for an
// NDim-dimensional sparse convolution. Its definition is not in this file.
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsForwardMLUKernelLauncher(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

// MLU adapter used by get_indice_pairs_forward(): forwards every argument,
// unchanged and in order, to GetIndicePairsForwardMLUKernelLauncher<NDim>
// and returns its result.
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_forward_mlu(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
  return GetIndicePairsForwardMLUKernelLauncher<NDim>(
      indices, batchSize, outSpatialShape, spatialShape, kernelSize, stride,
      padding, dilation, outPadding, _subM, _transpose);
}

// Declaration of the MUSA launcher that builds the indice pairs for an
// NDim-dimensional sparse convolution. Its definition is not in this file.
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsForwardMUSAKernelLauncher(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

// MUSA adapter used by get_indice_pairs_forward(): forwards every argument,
// unchanged and in order, to GetIndicePairsForwardMUSAKernelLauncher<NDim>
// and returns its result.
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_forward_musa(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
  return GetIndicePairsForwardMUSAKernelLauncher<NDim>(
      indices, batchSize, outSpatialShape, spatialShape, kernelSize, stride,
      padding, dilation, outPadding, _subM, _transpose);
}

// Declaration of the CUDA launcher for the "backward" indice-pair generation
// of an NDim-dimensional sparse convolution; compared with the forward
// launcher it takes an additional `gridOut` tensor. Its definition is not in
// this file.
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

// CUDA adapter used by get_indice_pairs_backward(): forwards every argument,
// unchanged and in order, to GetIndicePairsBackwardCUDAKernelLauncher<NDim>
// and returns its result.
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_backward_cuda(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
  return GetIndicePairsBackwardCUDAKernelLauncher<NDim>(
      indices, gridOut, batchSize, outSpatialShape, spatialShape, kernelSize,
      stride, padding, dilation, outPadding, _subM, _transpose);
}

#ifdef MMCV_WITH_MUSA
// Declaration of the MUSA launcher for the "backward" indice-pair generation
// of an NDim-dimensional sparse convolution; compared with the forward
// launcher it takes an additional `gridOut` tensor. Its definition is not in
// this file.
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsBackwardMUSAKernelLauncher(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

// MUSA adapter used by get_indice_pairs_backward(): forwards every argument,
// unchanged and in order, to GetIndicePairsBackwardMUSAKernelLauncher<NDim>
// and returns its result.
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_backward_musa(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
  return GetIndicePairsBackwardMUSAKernelLauncher<NDim>(
      indices, gridOut, batchSize, outSpatialShape, spatialShape, kernelSize,
      stride, padding, dilation, outPadding, _subM, _transpose);
}
#endif

// Builds the indice pairs for an NDim-dimensional sparse convolution by
// routing to the backend that matches the device of `indices`.
//
// Routing (the branches are stitched together with preprocessor guards):
//   * CUDA tensor: validated with CHECK_CUDA_INPUT and passed to
//     get_indice_pairs_forward_cuda when built with MMCV_WITH_CUDA; otherwise
//     an error says the op was not compiled with GPU support.
//   * MLU tensor (only when built with MMCV_WITH_MLU):
//     get_indice_pairs_forward_mlu.
//   * MUSA tensor (only when built with MMCV_WITH_MUSA):
//     get_indice_pairs_forward_musa.
//   * anything else: error "not implemented on CPU" (this message is used for
//     every device that none of the branches above matched).
//
// NOTE(review): unlike get_indice_pairs_backward, the MUSA branch here does
// not run CHECK_MUSA_INPUT on `indices` -- confirm whether that is intended.
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_forward(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
  if (indices.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(indices);

    return get_indice_pairs_forward_cuda<NDim>(
        indices, batchSize, outSpatialShape, spatialShape, kernelSize, stride,
        padding, dilation, outPadding, _subM, _transpose);
#else
    AT_ERROR("get_indice_pairs is not compiled with GPU support");
#endif
    // The `} else if` lines below close the previous branch themselves, so
    // each optional backend adds exactly one extra branch when enabled.
#ifdef MMCV_WITH_MLU
  } else if (indices.device().type() == at::kMLU) {
    return get_indice_pairs_forward_mlu<NDim>(
        indices, batchSize, outSpatialShape, spatialShape, kernelSize, stride,
        padding, dilation, outPadding, _subM, _transpose);
#endif
#ifdef MMCV_WITH_MUSA
  } else if (indices.device().type() == at::kMUSA) {
    return get_indice_pairs_forward_musa<NDim>(
        indices, batchSize, outSpatialShape, spatialShape, kernelSize, stride,
        padding, dilation, outPadding, _subM, _transpose);
#endif
  } else {
    AT_ERROR("get_indice_pairs is not implemented on CPU");
  }
}

// "Backward" variant of the indice-pair generation for an NDim-dimensional
// sparse convolution; takes an additional `gridOut` tensor and routes to the
// backend that matches the device of `indices`.
//
// Routing:
//   * CUDA tensor: `indices` and `gridOut` are validated with
//     CHECK_CUDA_INPUT and passed to get_indice_pairs_backward_cuda when built
//     with MMCV_WITH_CUDA; otherwise an error says the op was not compiled
//     with GPU support.
//   * MUSA tensor (only when built with MMCV_WITH_MUSA): both tensors are
//     validated with CHECK_MUSA_INPUT and passed to
//     get_indice_pairs_backward_musa.
//   * anything else: error "not implemented on CPU". There is no MLU branch
//     in this function.
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_backward(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
  if (indices.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(indices);
    CHECK_CUDA_INPUT(gridOut);

    return get_indice_pairs_backward_cuda<NDim>(
        indices, gridOut, batchSize, outSpatialShape, spatialShape, kernelSize,
        stride, padding, dilation, outPadding, _subM, _transpose);
#else
    AT_ERROR("get_indice_pairs is not compiled with GPU support");
#endif
  } else {
#ifdef MMCV_WITH_MUSA
    if (indices.device().type() == at::kMUSA) {
      CHECK_MUSA_INPUT(indices);
      CHECK_MUSA_INPUT(gridOut);
      return get_indice_pairs_backward_musa<NDim>(
          indices, gridOut, batchSize, outSpatialShape, spatialShape,
          kernelSize, stride, padding, dilation, outPadding, _subM, _transpose);
    }
#endif
    // Reached for every non-CUDA device that was not handled above.
    AT_ERROR("get_indice_pairs is not implemented on CPU");
  }
}

// Device-dispatch stub for the sparse (indice-based) convolution forward pass.
// Returns whatever DISPATCH_DEVICE_IMPL yields for the given arguments. The
// macro is defined outside this file; presumably it selects the
// implementation registered for the tensors' device (confirm in
// pytorch_device_registry.hpp).
torch::Tensor indice_conv_forward_impl(torch::Tensor features,
                                       torch::Tensor filters,
                                       torch::Tensor indicePairs,
                                       torch::Tensor indiceNum,
                                       int64_t numActOut, int64_t _inverse,
                                       int64_t _subM) {
  return DISPATCH_DEVICE_IMPL(indice_conv_forward_impl, features, filters,
                              indicePairs, indiceNum, numActOut, _inverse,
                              _subM);
}

// Public entry point of the sparse convolution forward pass; a pure
// pass-through that returns the result of indice_conv_forward_impl.
torch::Tensor indice_conv_forward(torch::Tensor features, torch::Tensor filters,
                                  torch::Tensor indicePairs,
                                  torch::Tensor indiceNum, int64_t numActOut,
                                  int64_t _inverse, int64_t _subM) {
  return indice_conv_forward_impl(features, filters, indicePairs, indiceNum,
                                  numActOut, _inverse, _subM);
}

// Device-dispatch stub for the sparse (indice-based) convolution backward
// pass. Returns whatever DISPATCH_DEVICE_IMPL yields for the given arguments.
// The macro is defined outside this file; presumably it selects the
// implementation registered for the tensors' device (confirm in
// pytorch_device_registry.hpp).
std::vector<torch::Tensor> indice_conv_backward_impl(
    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
    int64_t _subM) {
  return DISPATCH_DEVICE_IMPL(indice_conv_backward_impl, features, filters,
                              outGrad, indicePairs, indiceNum, _inverse, _subM);
}

// Public entry point of the sparse convolution backward pass; a pure
// pass-through that returns the result of indice_conv_backward_impl.
std::vector<torch::Tensor> indice_conv_backward(
    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
    int64_t _subM) {
  return indice_conv_backward_impl(features, filters, outGrad, indicePairs,
                                   indiceNum, _inverse, _subM);
}

// Explicit instantiations of the templated dispatchers defined above, so that
// other translation units can link against them.
// get_indice_pairs_forward is instantiated for NDim = 2, 3 and 4.
template std::vector<torch::Tensor> get_indice_pairs_forward<2>(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

template std::vector<torch::Tensor> get_indice_pairs_forward<3>(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

template std::vector<torch::Tensor> get_indice_pairs_forward<4>(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

// get_indice_pairs_backward is instantiated for NDim = 2 and 3 only (there is
// no 4-dimensional instantiation in this file).
template std::vector<torch::Tensor> get_indice_pairs_backward<2>(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

template std::vector<torch::Tensor> get_indice_pairs_backward<3>(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);


================================================
FILE: mmcv/ops/csrc/pytorch/spconv_utils.h
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <ATen/ATen.h>
#ifdef MMCV_WITH_MUSA
#include "pytorch_musa_helper.hpp"
#include "torch_musa/csrc/aten/musa/MUSAContext.h"
#else
#include <ATen/cuda/CUDAContext.h>

#include "pytorch_cuda_helper.hpp"
#endif
#include <torch/script.h>
#include <utils/spconv/tensorview/tensorview.h>

// Helpers that bridge torch tensors and the `tv` tensor-view utilities.
// The stream wrappers come in two variants selected at compile time: MUSA
// when MMCV_WITH_MUSA is defined, CUDA otherwise.
namespace tv {
#ifdef MMCV_WITH_MUSA
// Holds a MUSA stream handle (default 0) and exposes it via getStream().
struct GPU {
  GPU(musaStream_t s = 0) : mStream(s) {}
  virtual musaStream_t getStream() const { return mStream; }
  musaStream_t mStream = 0;
};

// GPU variant whose getStream() ignores the stored handle and returns the
// stream reported by at::musa::getCurrentMUSAStream().
struct TorchGPU : public tv::GPU {
  virtual musaStream_t getStream() const override {
    return at::musa::getCurrentMUSAStream();
  }
};

#else
// Holds a CUDA stream handle (default 0) and exposes it via getStream().
struct GPU {
  GPU(cudaStream_t s = 0) : mStream(s) {}
  virtual cudaStream_t getStream() const { return mStream; }
  cudaStream_t mStream = 0;
};

// GPU variant whose getStream() ignores the stored handle and returns the
// stream reported by at::cuda::getCurrentCUDAStream().
struct TorchGPU : public tv::GPU {
  virtual cudaStream_t getStream() const override {
    return at::cuda::getCurrentCUDAStream();
  }
};
#endif

// Verifies that the tensor's runtime scalar type matches the compile-time
// `scalar_t` (top-level const is stripped before comparing). Supported tensor
// types are Double, Float, Int, Half and Long; a mismatch or any other type is
// reported through TV_ASSERT_RT_ERR with the message "error". That macro is
// defined outside this file; presumably it raises a runtime error (confirm in
// tensorview.h).
template <typename scalar_t>
void check_torch_dtype(const torch::Tensor &tensor) {
  switch (tensor.scalar_type()) {
    case at::ScalarType::Double: {
      auto val = std::is_same<std::remove_const_t<scalar_t>, double>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    case at::ScalarType::Float: {
      auto val = std::is_same<std::remove_const_t<scalar_t>, float>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    case at::ScalarType::Int: {
      auto val = std::is_same<std::remove_const_t<scalar_t>, int>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    case at::ScalarType::Half: {
      auto val = std::is_same<std::remove_const_t<scalar_t>, at::Half>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    case at::ScalarType::Long: {
      // NOTE(review): this compares against `long`, whose width is
      // platform-dependent, while a Long tensor presumably stores 64-bit
      // integers -- confirm whether int64_t is what callers should use.
      auto val = std::is_same<std::remove_const_t<scalar_t>, long>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    default:
      // Any scalar type not listed above is rejected.
      TV_ASSERT_RT_ERR(false, "error");
  }
}

// Wraps a torch tensor in a tv::TensorView<scalar_t>.
// The dtype is validated first with check_torch_dtype; the view is then built
// from the tensor's data pointer and a tv::Shape filled with the tensor's
// sizes. Strides are not consulted here, so the tensor is presumably expected
// to be contiguous (confirm against callers).
template <typename scalar_t>
tv::TensorView<scalar_t> torch2tv(const torch::Tensor &tensor) {
  check_torch_dtype<scalar_t>(tensor);
  tv::Shape shape;
  for (auto i : tensor.sizes()) {
    shape.push_back(i);
  }
  return tv::TensorView<scalar_t>(
      tensor.data_ptr<std::remove_const_t<scalar_t>>(), shape);
}
}  // namespace tv


================================================
FILE: mmcv/ops/csrc/pytorch/sync_bn.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatch stub for the SyncBN forward "mean" step.
// Hands both arguments to DISPATCH_DEVICE_IMPL (defined outside this file;
// presumably it selects the implementation registered for the tensors'
// device -- confirm in pytorch_device_registry.hpp).
void sync_bn_forward_mean_impl(const Tensor input, Tensor mean) {
  DISPATCH_DEVICE_IMPL(sync_bn_forward_mean_impl, input, mean);
}

// Device-dispatch stub for the SyncBN forward "variance" step.
// Hands all arguments to DISPATCH_DEVICE_IMPL (defined outside this file;
// presumably it selects the implementation registered for the tensors'
// device -- confirm in pytorch_device_registry.hpp).
void sync_bn_forward_var_impl(const Tensor input, const Tensor mean,
                              Tensor var) {
  DISPATCH_DEVICE_IMPL(sync_bn_forward_var_impl, input, mean, var);
}

// Device-dispatch stub for the SyncBN forward "output" step.
// Hands all arguments, unchanged and in order, to DISPATCH_DEVICE_IMPL
// (defined outside this file; presumably it selects the implementation
// registered for the tensors' device -- confirm in
// pytorch_device_registry.hpp).
void sync_bn_forward_output_impl(const Tensor input, const Tensor mean,
                                 const Tensor var, Tensor running_mean,
                                 Tensor running_var, const Tensor weight,
                                 const Tensor bias, Tensor norm, Tensor std,
                                 Tensor output, float eps, float momentum,
                                 int group_size) {
  DISPATCH_DEVICE_IMPL(sync_bn_forward_output_impl, input, mean, var,
                       running_mean, running_var, weight, bias, norm, std,
                       output, eps, momentum, group_size);
}

// Device-dispatch stub for the SyncBN backward "parameter gradient" step.
// Hands all arguments to DISPATCH_DEVICE_IMPL (defined outside this file;
// presumably it selects the implementation registered for the tensors'
// device -- confirm in pytorch_device_registry.hpp).
void sync_bn_backward_param_impl(const Tensor grad_output, const Tensor norm,
                                 Tensor grad_weight, Tensor grad_bias) {
  DISPATCH_DEVICE_IMPL(sync_bn_backward_param_impl, grad_output, norm,
                       grad_weight, grad_bias);
}

// Device-dispatch stub for the SyncBN backward "input gradient" step.
// Hands all arguments to DISPATCH_DEVICE_IMPL (defined outside this file;
// presumably it selects the implementation registered for the tensors'
// device -- confirm in pytorch_device_registry.hpp).
void sync_bn_backward_data_impl(const Tensor grad_output, const Tensor weight,
                                const Tensor grad_weight,
                                const Tensor grad_bias, const Tensor norm,
                                const Tensor std, Tensor grad_input) {
  DISPATCH_DEVICE_IMPL(sync_bn_backward_data_impl, grad_output, weight,
                       grad_weight, grad_bias, norm, std, grad_input);
}

// Public entry point of the SyncBN forward "mean" step; a pure pass-through
// to sync_bn_forward_mean_impl.
void sync_bn_forward_mean(const Tensor input, Tensor mean) {
  sync_bn_forward_mean_impl(input, mean);
}

// Public entry point of the SyncBN forward "variance" step; a pure
// pass-through to sync_bn_forward_var_impl.
void sync_bn_forward_var(const Tensor input, const Tensor mean, Tensor var) {
  sync_bn_forward_var_impl(input, mean, var);
}

// Public entry point of the SyncBN forward "output" step.
// Pass-through to sync_bn_forward_output_impl. Note that the argument order
// differs: here `weight` and `bias` precede `running_mean` / `running_var`,
// while the impl expects the running statistics first.
void sync_bn_forward_output(const Tensor input, const Tensor mean,
                            const Tensor var, const Tensor weight,
                            const Tensor bias, Tensor running_mean,
                            Tensor running_var, Tensor norm, Tensor std,
                            Tensor output, float eps, float momentum,
                            int group_size) {
  sync_bn_forward_output_impl(input, mean, var, running_mean, running_var,
                              weight, bias, norm, std, output, eps, momentum,
                              group_size);
}

// Public entry point of the SyncBN backward "parameter gradient" step; a pure
// pass-through to sync_bn_backward_param_impl.
void sync_bn_backward_param(const Tensor grad_output, const Tensor norm,
                            Tensor grad_weight, Tensor grad_bias) {
  sync_bn_backward_param_impl(grad_output, norm, grad_weight, grad_bias);
}

// Public entry point of the SyncBN backward "input gradient" step; a pure
// pass-through to sync_bn_backward_data_impl with the same argument order.
void sync_bn_backward_data(const Tensor grad_output, const Tensor weight,
                           const Tensor grad_weight, const Tensor grad_bias,
                           const Tensor norm, const Tensor std,
                           Tensor grad_input) {
  sync_bn_backward_data_impl(grad_output, weight, grad_weight, grad_bias, norm,
                             std, grad_input);
}


================================================
FILE: mmcv/ops/csrc/pytorch/three_interpolate.cpp
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatch stub for the three-point interpolation forward pass.
// Hands all arguments, unchanged and in order, to DISPATCH_DEVICE_IMPL
// (defined outside this file; presumably it selects the implementation
// registered for the tensors' device -- confirm in
// pytorch_device_registry.hpp).
void three_interpolate_forward_impl(int b, int c, int m, int n,
                                    const Tensor points, const Tensor idx,
                                    const Tensor weight, Tensor out) {
  DISPATCH_DEVICE_IMPL(three_interpolate_forward_impl, b, c, m, n, points, idx,
                       weight, out);
}

// Device-dispatch stub for the three-point interpolation backward pass.
// Note the size arguments are ordered (b, c, n, m) here, whereas the forward
// stub uses (b, c, m, n). All arguments are handed, unchanged and in order,
// to DISPATCH_DEVICE_IMPL (defined outside this file).
void three_interpolate_backward_impl(int b, int c, int n, int m,
                                     const Tensor grad_out, const Tensor idx,
                                     const Tensor weight, Tensor grad_points) {
  DISPATCH_DEVICE_IMPL(three_interpolate_backward_impl, b, c, n, m, grad_out,
                       idx, weight, grad_points);
}

// Public entry point of the three-point interpolation forward pass.
// Pass-through to three_interpolate_forward_impl; the only change is that the
// size arguments (b, c, m, n) move from the end of the list to the front.
void three_interpolate_forward(Tensor points_tensor, Tensor idx_tensor,
                               Tensor weight_tensor, Tensor out_tensor, int b,
                               int c, int m, int n) {
  three_interpolate_forward_impl(b, c, m, n, points_tensor, idx_tensor,
                                 weight_tensor, out_tensor);
}

// Public entry point of the three-point interpolation backward pass.
// Pass-through to three_interpolate_backward_impl; the only change is that
// the size arguments (b, c, n, m) move from the end of the list to the front.
void three_interpolate_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                                Tensor weight_tensor, Tensor grad_points_tensor,
                                int b, int c, int n, int m) {
  three_interpolate_backward_impl(b, c, n, m, grad_out_tensor, idx_tensor,
                                  weight_tensor, grad_points_tensor);
}


================================================
FILE: mmcv/ops/csrc/pytorch/three_nn.cpp
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatch stub for the three-nearest-neighbour forward pass.
// Hands all arguments, unchanged and in order, to DISPATCH_DEVICE_IMPL
// (defined outside this file; presumably it selects the implementation
// registered for the tensors' device -- confirm in
// pytorch_device_registry.hpp).
void three_nn_forward_impl(int b, int n, int m, const Tensor unknown,
                           const Tensor known, Tensor dist2, Tensor idx) {
  DISPATCH_DEVICE_IMPL(three_nn_forward_impl, b, n, m, unknown, known, dist2,
                       idx);
}

// Public entry point of the three-nearest-neighbour forward pass.
// Pass-through to three_nn_forward_impl; the only change is that the size
// arguments (b, n, m) move from the end of the list to the front.
void three_nn_forward(Tensor unknown_tensor, Tensor known_tensor,
                      Tensor dist2_tensor, Tensor idx_tensor, int b, int n,
                      int m) {
  three_nn_forward_impl(b, n, m, unknown_tensor, known_tensor, dist2_tensor,
                        idx_tensor);
}


================================================
FILE: mmcv/ops/csrc/pytorch/tin_shift.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-dispatch stub for the TIN shift forward pass.
// Hands all arguments to DISPATCH_DEVICE_IMPL (defined outside this file;
// presumably it selects the implementation registered for the tensors'
// device -- confirm in pytorch_device_registry.hpp).
void tin_shift_forward_impl(Tensor input, Tensor shift, Tensor output) {
  DISPATCH_DEVICE_IMPL(tin_shift_forward_impl, input, shift, output);
}

// Device-dispatch stub for the TIN shift backward pass.
// Hands all arguments to DISPATCH_DEVICE_IMPL (defined outside this file;
// presumably it selects the implementation registered for the tensors'
// device -- confirm in pytorch_device_registry.hpp).
void tin_shift_backward_impl(Tensor grad_output, Tensor shift,
                             Tensor grad_input) {
  DISPATCH_DEVICE_IMPL(tin_shift_backward_impl, grad_output, shift, grad_input);
}

// Public entry point of the TIN shift forward pass; a pure pass-through to
// tin_shift_forward_impl.
void tin_shift_forward(Tensor input, Tensor shift, Tensor output) {
  tin_shift_forward_impl(input, shift, output);
}

// Public entry point for the tin_shift backward op; forwards its arguments
// unchanged to tin_shift_backward_impl.
void tin_shift_backward(Tensor grad_output, Tensor shift, Tensor grad_input) {
  tin_shift_backward_impl(grad_output, shift, grad_input);
}


================================================
FILE: mmcv/ops/csrc/pytorch/upfirdn2d.cpp
================================================
// Modified from
// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/upfirdn2d.cpp

/*
Copyright (c) 2021, NVIDIA Corporation. All rights reserved.

NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator
Augmentation (ADA)
=======================================================================

1. Definitions

"Licensor" means any person or entity that distributes its Work.

"Software" means the original work of authorship made available under
this License.

"Work" means the Software and any additions to or derivative works of
the Software that are made available under this License.

The terms "reproduce," "reproduction," "derivative works," and
"distribution" have the meaning as provided under U.S. copyright law;
provided, however, that for the purposes of this License, derivative
works shall not include works that remain separable from, or merely
link (or bind by name) to the interfaces of, the Work.

Works, including the Software, are "made available" under this License
by including in or with the Work either (a) a copyright notice
referencing the applicability of this License to the Work, or (b) a
copy of this License.

2. License Grants

    2.1 Copyright Grant. Subject to the terms and conditions of this
    License, each Licensor grants to you a perpetual, worldwide,
    non-exclusive, royalty-free, copyright license to reproduce,
    prepare derivative works of, publicly display, publicly perform,
    sublicense and distribute its Work and any resulting derivative
    works in any form.

3. Limitations

    3.1 Redistribution. You may reproduce or distribute the Work only
    if (a) you do so under this License, (b) you include a complete
    copy of this License with your distribution, and (c) you retain
    without modification any copyright, patent, trademark, or
    attribution notices that are present in the Work.

    3.2 Derivative Works. You may specify that additional or different
    terms apply to the use, reproduction, and distribution of your
    derivative works of the Work ("Your Terms") only if (a) Your Terms
    provide that the use limitation in Section 3.3 applies to your
    derivative works, and (b) you identify the specific derivative
    works that are subject to Your Terms. Notwithstanding Your Terms,
    this License (including the redistribution requirements in Section
    3.1) will continue to apply to the Work itself.

    3.3 Use Limitation. The Work and any derivative works thereof only
    may be used or intended for use non-commercially. Notwithstanding
    the foregoing, NVIDIA and its affiliates may use the Work and any
    derivative works commercially. As used herein, "non-commercially"
    means for research or evaluation purposes only.

    3.4 Patent Claims. If you bring or threaten to bring a patent claim
    against any Licensor (including any claim, cross-claim or
    counterclaim in a lawsuit) to enforce any patents that you allege
    are infringed by any Work, then your rights under this License from
    such Licensor (including the grant in Section 2.1) will terminate
    immediately.

    3.5 Trademarks. This License does not grant any rights to use any
    Licensor’s or its affiliates’ names, logos, or trademarks, except
    as necessary to reproduce the notices described in this License.

    3.6 Termination. If you violate any term of this License, then your
    rights under this License (including the grant in Section 2.1) will
    terminate immediately.

4. Disclaimer of Warranty.

THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
THIS LICENSE.

5. Limitation of Liability.

EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
THE POSSIBILITY OF SUCH DAMAGES.

=======================================================================
*/

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatch stub for the upfirdn2d op: passes every argument to
// DISPATCH_DEVICE_IMPL under the key upfirdn2d_op_impl and returns the tensor
// it yields.
// NOTE(review): presumably resolves to the implementation registered for the
// tensors' device (see pytorch_device_registry.hpp) -- confirm.
torch::Tensor upfirdn2d_op_impl(torch::Tensor input, torch::Tensor filter,
                                int upx, int upy, int downx, int downy,
                                int padx0, int padx1, int pady0, int pady1,
                                bool flip, float gain) {
  return DISPATCH_DEVICE_IMPL(upfirdn2d_op_impl, input, filter, upx, upy, downx,
                              downy, padx0, padx1, pady0, pady1, flip, gain);
}

// Public entry point for the upfirdn2d op; forwards its arguments unchanged to
// upfirdn2d_op_impl and returns that function's result.
torch::Tensor upfirdn2d(torch::Tensor input, torch::Tensor filter, int upx,
                        int upy, int downx, int downy, int padx0, int padx1,
                        int pady0, int pady1, bool flip, float gain) {
  return upfirdn2d_op_impl(input, filter, upx, upy, downx, downy, padx0, padx1,
                           pady0, pady1, flip, gain);
}


================================================
FILE: mmcv/ops/csrc/pytorch/voxelization.cpp
================================================
// Copyright (c) OpenMMLab. All rights reserved.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_DIOPI
#include <diopi/diopirt.h>
#include <diopi/functions.h>
#include <diopi/functions_mmcv.h>
#include <torch/csrc/utils/pybind.h>

#include "csrc_dipu/diopirt/diopirt_impl.h"
#include "csrc_dipu/runtime/device/deviceapis.h"
#include "csrc_dipu/utils/helpfunc.hpp"

using dipu::VENDOR_TYPE;
using dipu::diopi_helper::toDiopiScalar;
using dipu::diopi_helper::toDiopiTensorHandle;
#endif

// Dispatch stub for deterministic hard voxelization: passes every argument to
// DISPATCH_DEVICE_IMPL under the key hard_voxelize_forward_impl and returns the
// int it yields. Callers in this file store that int into the first element of
// their voxel_num tensor.
// NOTE(review): presumably resolves to the implementation registered for the
// tensors' device (see pytorch_device_registry.hpp) -- confirm.
int hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,
                               at::Tensor &coors,
                               at::Tensor &num_points_per_voxel,
                               const std::vector<float> voxel_size,
                               const std::vector<float> coors_range,
                               const int max_points, const int max_voxels,
                               const int NDim = 3) {
  return DISPATCH_DEVICE_IMPL(hard_voxelize_forward_impl, points, voxels, coors,
                              num_points_per_voxel, voxel_size, coors_range,
                              max_points, max_voxels, NDim);
}

// Dispatch stub for the non-deterministic hard voxelization variant. Same
// parameter list as hard_voxelize_forward_impl, but dispatched under the key
// nondeterministic_hard_voxelize_forward_impl; returns the int yielded by the
// dispatched implementation.
// NOTE(review): presumably resolves to the implementation registered for the
// tensors' device (see pytorch_device_registry.hpp) -- confirm.
int nondeterministic_hard_voxelize_forward_impl(
    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const int max_points,
    const int max_voxels, const int NDim = 3) {
  return DISPATCH_DEVICE_IMPL(nondeterministic_hard_voxelize_forward_impl,
                              points, voxels, coors, num_points_per_voxel,
                              voxel_size, coors_range, max_points, max_voxels,
                              NDim);
}

// Dispatch stub for dynamic voxelization: passes
// (points, coors, voxel_size, coors_range, NDim) to DISPATCH_DEVICE_IMPL under
// the key dynamic_voxelize_forward_impl. Nothing is returned; `coors` is the
// only non-const tensor argument.
// NOTE(review): presumably resolves to the implementation registered for the
// tensors' device (see pytorch_device_registry.hpp) -- confirm.
void dynamic_voxelize_forward_impl(const at::Tensor &points, at::Tensor &coors,
                                   const std::vector<float> voxel_size,
                                   const std::vector<float> coors_range,
                                   const int NDim = 3) {
  DISPATCH_DEVICE_IMPL(dynamic_voxelize_forward_impl, points, coors, voxel_size,
                       coors_range, NDim);
}

#ifdef MMCV_WITH_DIOPI
// DIOPI-backed implementation of hard voxelization.
//
// Control flow:
//   1. If `points` reports the diopi_host device, call
//      hard_voxelize_forward_impl directly on the given tensors.
//   2. Otherwise try the vendor kernel diopiHardVoxelizeMmcv and return as soon
//      as it reports diopiSuccess.
//   3. Otherwise fall back: copy every tensor to the CPU, call
//      hard_voxelize_forward_impl there and copy the outputs back into the
//      caller's tensors.
//
// Only the deterministic variant is available on paths (1) and (3). A
// non-deterministic request on either path is rejected through TORCH_CHECK, so
// the caller gets an error it can handle instead of the process being aborted.
//
// voxel_size / coors_range are read as contiguous float buffers; voxel_num is
// written through an int64_t pointer to its first element.
void hard_voxelize_forward_diopi(const at::Tensor &points,
                                 const at::Tensor &voxel_size,
                                 const at::Tensor &coors_range,
                                 at::Tensor &voxels, at::Tensor &coors,
                                 at::Tensor &num_points_per_voxel,
                                 at::Tensor &voxel_num, const int max_points,
                                 const int max_voxels, const int NDim = 3,
                                 const bool deterministic = true) {
  auto points_p = toDiopiTensorHandle(points);
  diopiDevice_t device;
  diopiGetTensorDevice(points_p, &device);
  if (device == diopi_host) {
    // Path 1: tensors already live on the host.
    int64_t *voxel_num_data = voxel_num.data_ptr<int64_t>();
    std::vector<float> voxel_size_v(
        voxel_size.data_ptr<float>(),
        voxel_size.data_ptr<float>() + voxel_size.numel());
    std::vector<float> coors_range_v(
        coors_range.data_ptr<float>(),
        coors_range.data_ptr<float>() + coors_range.numel());

    TORCH_CHECK(
        deterministic,
        "nondeterministic hard_voxelize_forward is not supported on host!");
    *voxel_num_data = hard_voxelize_forward_impl(
        points, voxels, coors, num_points_per_voxel, voxel_size_v,
        coors_range_v, max_points, max_voxels, NDim);
    return;
  }

  // Path 2: device tensors, try the vendor kernel on the current DIPU stream.
  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
  diopiContextHandle_t ch = &ctx;
  auto voxel_size_p = toDiopiTensorHandle(voxel_size);
  auto coors_range_p = toDiopiTensorHandle(coors_range);
  auto voxels_p = toDiopiTensorHandle(voxels);
  auto coors_p = toDiopiTensorHandle(coors);
  auto num_points_per_voxel_p = toDiopiTensorHandle(num_points_per_voxel);
  auto voxel_num_p = toDiopiTensorHandle(voxel_num);
  // NOTE(review): the null test suggests the symbol may be weakly linked and
  // absent for some vendors -- confirm against the DIOPI build setup.
  if (reinterpret_cast<void *>(diopiHardVoxelizeMmcv) != nullptr) {
    if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), "NPU") == 0) {
      // On NPU the Python GIL is released for the duration of the kernel call.
      pybind11::gil_scoped_release no_gil;
      auto ret = diopiHardVoxelizeMmcv(
          ch, voxels_p, coors_p, num_points_per_voxel_p, voxel_num_p, points_p,
          voxel_size_p, coors_range_p, max_points, max_voxels, NDim,
          deterministic);
      if (ret == diopiSuccess) return;
    } else {
      auto ret = diopiHardVoxelizeMmcv(
          ch, voxels_p, coors_p, num_points_per_voxel_p, voxel_num_p, points_p,
          voxel_size_p, coors_range_p, max_points, max_voxels, NDim,
          deterministic);
      if (ret == diopiSuccess) return;
    }
  }

  // Path 3: vendor kernel missing or failed.
  LOG(WARNING) << "Fallback to cpu: mmcv ext op hard_voxelize_forward";
  // Reject the unsupported variant before paying for device-to-host copies,
  // and raise (like the host path) rather than abort()ing the interpreter.
  TORCH_CHECK(
      deterministic,
      "nondeterministic hard_voxelize_forward is not supported on host!");
  auto points_cpu = points.cpu();
  auto voxel_size_cpu = voxel_size.cpu();
  auto coors_range_cpu = coors_range.cpu();
  auto voxels_cpu = voxels.cpu();
  auto coors_cpu = coors.cpu();
  auto num_points_per_voxel_cpu = num_points_per_voxel.cpu();
  auto voxel_num_cpu = voxel_num.cpu();

  int64_t *voxel_num_data_cpu = voxel_num_cpu.data_ptr<int64_t>();
  std::vector<float> voxel_size_v_cpu(
      voxel_size_cpu.data_ptr<float>(),
      voxel_size_cpu.data_ptr<float>() + voxel_size_cpu.numel());
  std::vector<float> coors_range_v_cpu(
      coors_range_cpu.data_ptr<float>(),
      coors_range_cpu.data_ptr<float>() + coors_range_cpu.numel());

  *voxel_num_data_cpu = hard_voxelize_forward_impl(
      points_cpu, voxels_cpu, coors_cpu, num_points_per_voxel_cpu,
      voxel_size_v_cpu, coors_range_v_cpu, max_points, max_voxels, NDim);

  // Publish the CPU results back into the caller's (device) tensors.
  voxels.copy_(voxels_cpu);
  coors.copy_(coors_cpu);
  num_points_per_voxel.copy_(num_points_per_voxel_cpu);
  voxel_num.copy_(voxel_num_cpu);
}

// DIOPI-backed implementation of dynamic voxelization.
//
// Control flow:
//   1. If `points` reports the diopi_host device, call
//      dynamic_voxelize_forward_impl directly on the given tensors.
//   2. Otherwise try the vendor kernel diopiDynamicVoxelizeMmcv and return as
//      soon as it reports diopiSuccess.
//   3. Otherwise fall back: copy the tensors to the CPU, call
//      dynamic_voxelize_forward_impl there and copy `coors` back.
//
// voxel_size / coors_range are read as contiguous float buffers.
void dynamic_voxelize_forward_diopi(const at::Tensor &points,
                                    const at::Tensor &voxel_size,
                                    const at::Tensor &coors_range,
                                    at::Tensor &coors, const int NDim = 3) {
  auto points_p = toDiopiTensorHandle(points);
  diopiDevice_t device;
  diopiGetTensorDevice(points_p, &device);
  if (device == diopi_host) {
    // Path 1: tensors already live on the host.
    std::vector<float> voxel_size_v(
        voxel_size.data_ptr<float>(),
        voxel_size.data_ptr<float>() + voxel_size.numel());
    std::vector<float> coors_range_v(
        coors_range.data_ptr<float>(),
        coors_range.data_ptr<float>() + coors_range.numel());
    dynamic_voxelize_forward_impl(points, coors, voxel_size_v, coors_range_v,
                                  NDim);
    return;
  }
  // Path 2: device tensors, try the vendor kernel on the current DIPU stream.
  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
  diopiContextHandle_t ch = &ctx;
  auto voxel_size_p = toDiopiTensorHandle(voxel_size);
  auto coors_range_p = toDiopiTensorHandle(coors_range);
  auto coors_p = toDiopiTensorHandle(coors);
  // NOTE(review): the null test suggests the symbol may be weakly linked and
  // absent for some vendors -- confirm against the DIOPI build setup.
  if (reinterpret_cast<void *>(diopiDynamicVoxelizeMmcv) != nullptr) {
    if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), "NPU") == 0) {
      // On NPU the Python GIL is released for the duration of the kernel call.
      pybind11::gil_scoped_release no_gil;
      auto ret = diopiDynamicVoxelizeMmcv(ch, coors_p, points_p, voxel_size_p,
                                          coors_range_p, NDim);
      if (ret == diopiSuccess) return;
    } else {
      auto ret = diopiDynamicVoxelizeMmcv(ch, coors_p, points_p, voxel_size_p,
                                          coors_range_p, NDim);
      if (ret == diopiSuccess) return;
    }
  }
  // Path 3: vendor kernel missing or failed; compute on CPU copies.
  LOG(WARNING) << "Fallback to cpu: mmcv ext op dynamic_voxelize_forward";
  auto points_cpu = points.cpu();
  auto voxel_size_cpu = voxel_size.cpu();
  auto coors_range_cpu = coors_range.cpu();
  auto coors_cpu = coors.cpu();

  std::vector<float> voxel_size_v_cpu(
      voxel_size_cpu.data_ptr<float>(),
      voxel_size_cpu.data_ptr<float>() + voxel_size_cpu.numel());
  std::vector<float> coors_range_v_cpu(
      coors_range_cpu.data_ptr<float>(),
      coors_range_cpu.data_ptr<float>() + coors_range_cpu.numel());
  dynamic_voxelize_forward_impl(points_cpu, coors_cpu, voxel_size_v_cpu,
                                coors_range_v_cpu, NDim);
  // Publish the CPU result back into the caller's tensor.
  coors.copy_(coors_cpu);
  return;
}
#endif

// Public entry point for hard voxelization.
//
// With MMCV_WITH_DIOPI defined, all work is delegated to
// hard_voxelize_forward_diopi. Otherwise the voxel_size and coors_range
// tensors are copied into std::vector<float>, the deterministic or
// non-deterministic dispatch stub is selected by `deterministic`, and the int
// it returns is written to the first element of voxel_num.
void hard_voxelize_forward(const at::Tensor &points,
                           const at::Tensor &voxel_size,
                           const at::Tensor &coors_range, at::Tensor &voxels,
                           at::Tensor &coors, at::Tensor &num_points_per_voxel,
                           at::Tensor &voxel_num, const int max_points,
                           const int max_voxels, const int NDim = 3,
                           const bool deterministic = true) {
#ifdef MMCV_WITH_DIOPI
  hard_voxelize_forward_diopi(points, voxel_size, coors_range, voxels, coors,
                              num_points_per_voxel, voxel_num, max_points,
                              max_voxels, NDim, deterministic);
#else
  // Resolve the output slot first, then snapshot the two float tensors.
  int64_t *const count_out = voxel_num.data_ptr<int64_t>();

  const float *const size_begin = voxel_size.data_ptr<float>();
  const std::vector<float> size_vec(size_begin,
                                    size_begin + voxel_size.numel());

  const float *const range_begin = coors_range.data_ptr<float>();
  const std::vector<float> range_vec(range_begin,
                                     range_begin + coors_range.numel());

  const int produced =
      deterministic
          ? hard_voxelize_forward_impl(points, voxels, coors,
                                       num_points_per_voxel, size_vec,
                                       range_vec, max_points, max_voxels, NDim)
          : nondeterministic_hard_voxelize_forward_impl(
                points, voxels, coors, num_points_per_voxel, size_vec,
                range_vec, max_points, max_voxels, NDim);
  *count_out = produced;
#endif
}

// Public entry point for dynamic voxelization.
//
// With MMCV_WITH_DIOPI defined, all work is delegated to
// dynamic_voxelize_forward_diopi. Otherwise the voxel_size and coors_range
// tensors are copied into std::vector<float> and handed, together with
// points / coors / NDim, to dynamic_voxelize_forward_impl.
void dynamic_voxelize_forward(const at::Tensor &points,
                              const at::Tensor &voxel_size,
                              const at::Tensor &coors_range, at::Tensor &coors,
                              const int NDim = 3) {
#ifdef MMCV_WITH_DIOPI
  dynamic_voxelize_forward_diopi(points, voxel_size, coors_range, coors, NDim);
#else
  const float *const size_begin = voxel_size.data_ptr<float>();
  const std::vector<float> size_vec(size_begin,
                                    size_begin + voxel_size.numel());

  const float *const range_begin = coors_range.data_ptr<float>();
  const std::vector<float> range_vec(range_begin,
                                     range_begin + coors_range.numel());

  dynamic_voxelize_forward_impl(points, coors, size_vec, range_vec, NDim);
#endif
}


================================================
FILE: mmcv/ops/deform_conv.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmengine.logging import print_log
from mmengine.registry import MODELS
from mmengine.utils import deprecated_api_warning
from torch import Tensor
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair, _single

from mmcv.utils import IS_MLU_AVAILABLE
from ..utils import ext_loader
from .modulated_deform_conv import ModulatedDeformConv2dFunction

# Handle to the ``_ext`` extension; only the three deform-conv ops called by
# ``DeformConv2dFunction`` in this module are requested.
ext_module = ext_loader.load_ext('_ext', [
    'deform_conv_forward', 'deform_conv_backward_input',
    'deform_conv_backward_parameters'
])


class DeformConv2dFunction(Function):
    """Autograd function implementing deformable 2D convolution.

    Forward and backward call into the compiled ``ext_module`` ops, except
    when the input lives on an ``npu`` device: there the forward pass is
    delegated to ``ModulatedDeformConv2dFunction._npu_forward`` (with an
    all-ones mask) and the backward pass to
    ``torch_npu.npu_deformable_conv2dbk``.
    """

    @staticmethod
    def symbolic(g,
                 input,
                 offset,
                 weight,
                 stride,
                 padding,
                 dilation,
                 groups,
                 deform_groups,
                 bias=False,
                 im2col_step=32):
        """Build the ``mmcv::MMCVDeformConv2d`` graph node via ``g.op``.

        ``input``, ``offset`` and ``weight`` are passed as node inputs; the
        remaining arguments are attached as ``*_i`` attributes.
        """
        return g.op(
            'mmcv::MMCVDeformConv2d',
            input,
            offset,
            weight,
            stride_i=stride,
            padding_i=padding,
            dilation_i=dilation,
            groups_i=groups,
            deform_groups_i=deform_groups,
            bias_i=bias,
            im2col_step_i=im2col_step)

    @staticmethod
    def _npu_backward(ctx, grad_output):
        """Backward pass for ``npu`` devices, computed by ``torch_npu``.

        Expects ``ctx.saved_tensors`` to hold five tensors: input, weight,
        offset_out, offset_all and the index used to reorder the offset
        gradient.
        NOTE(review): these are presumably saved by
        ``ModulatedDeformConv2dFunction._npu_forward`` - confirm there.

        Returns:
            tuple: Gradients for ``input``, ``offset`` and ``weight``,
            followed by seven ``None`` placeholders for the remaining
            arguments of :meth:`forward`.
        """
        import torch_npu
        input_tensor, weight, offset_out, offset_all, sort_index_for_npu_bp = \
            ctx.saved_tensors
        grad_input, grad_weight, grad_offset_all, grad_bias = \
            torch_npu.npu_deformable_conv2dbk(
                input_tensor, grad_output, offset_out, weight, offset_all,
                kernel_size=[weight.shape[2], weight.shape[3]],
                stride=[1, 1, ctx.stride[0], ctx.stride[1]],
                padding=[ctx.padding[0], ctx.padding[0], ctx.padding[1],
                         ctx.padding[1]],
                dilation=[1, 1, ctx.dilation[0], ctx.dilation[1]],
                groups=ctx.groups, deformable_groups=ctx.deform_groups,
                modulated=True)
        # Bring the offset gradient back into the caller's channel order.
        grad_offset = grad_offset_all.index_select(1, sort_index_for_npu_bp)
        return grad_input, grad_offset, grad_weight, \
            None, None, None, None, None, None, None

    @staticmethod
    def forward(ctx,
                input: Tensor,
                offset: Tensor,
                weight: Tensor,
                stride: Union[int, Tuple[int, ...]] = 1,
                padding: Union[int, Tuple[int, ...]] = 0,
                dilation: Union[int, Tuple[int, ...]] = 1,
                groups: int = 1,
                deform_groups: int = 1,
                bias: bool = False,
                im2col_step: int = 32) -> Tensor:
        """Run the deformable convolution.

        Args:
            ctx: Autograd context; the hyper-parameters and the tensors
                needed by :meth:`backward` are stored on it.
            input (Tensor): 4D input feature map.
            offset (Tensor): Sampling offsets. ``input`` and ``weight`` are
                cast to its dtype before the op is called.
            weight (Tensor): Convolution weight; dims 2 and 3 are used as
                kernel height and width.
            stride (int | tuple[int]): Expanded with ``_pair``. Default: 1.
            padding (int | tuple[int]): Expanded with ``_pair``. Default: 0.
            dilation (int | tuple[int]): Expanded with ``_pair``. Default: 1.
            groups (int): Passed to the op as ``group``. Default: 1.
            deform_groups (int): Passed to the op as ``deformable_group``.
                Default: 1.
            bias (bool): Must be ``False``. Default: False.
            im2col_step (int): Upper bound on the step handed to the op; the
                batch size must be divisible by
                ``min(im2col_step, batch_size)``. Default: 32.

        Returns:
            Tensor: The convolution output.

        Raises:
            ValueError: If ``input`` is not 4D, or if the computed output
                size has a non-positive dimension.
        """
        if input is not None and input.dim() != 4:
            # Implicit concatenation keeps the message on one line; a
            # backslash continuation inside the literal would embed the
            # next line's indentation in the message.
            raise ValueError(
                f'Expected 4D tensor as input, got {input.dim()}D tensor '
                'instead.')
        assert bias is False, 'Only support bias is False.'
        ctx.stride = _pair(stride)
        ctx.padding = _pair(padding)
        ctx.dilation = _pair(dilation)
        ctx.groups = groups
        ctx.deform_groups = deform_groups
        ctx.im2col_step = im2col_step
        ctx.device = input.device.type

        # When pytorch version >= 1.6.0, amp is adopted for fp16 mode;
        # amp won't cast the type of model (float32), but "offset" is cast
        # to float16 by nn.Conv2d automatically, leading to the type
        # mismatch with input (when it is float32) or weight.
        # The flag for whether to use fp16 or amp is the type of "offset",
        # we cast weight and input to temporarily support fp16 and amp
        # whatever the pytorch version is.
        input = input.type_as(offset)
        weight = weight.type_as(input)
        if ctx.device == 'npu':
            # Reuse the modulated NPU kernel with an all-ones mask; the mask
            # takes the shape of one half of ``offset`` along dim 1.
            mask_shape, _ = torch.chunk(offset, 2, dim=1)
            mask = torch.ones_like(mask_shape).to(input.device)
            bias = input.new_empty(0)
            output = ModulatedDeformConv2dFunction._npu_forward(
                ctx, input, offset, mask, weight, bias)
            return output
        ctx.save_for_backward(input, offset, weight)

        output = input.new_empty([
            int(i)
            for i in DeformConv2dFunction._output_size(ctx, input, weight)
        ])

        ctx.bufs_ = [input.new_empty(0), input.new_empty(0)]  # columns, ones

        cur_im2col_step = min(ctx.im2col_step, input.size(0))
        assert (input.size(0) % cur_im2col_step
                ) == 0, 'batch size must be divisible by im2col_step'
        ext_module.deform_conv_forward(
            input,
            weight,
            offset,
            output,
            ctx.bufs_[0],
            ctx.bufs_[1],
            kW=weight.size(3),
            kH=weight.size(2),
            dW=ctx.stride[1],
            dH=ctx.stride[0],
            padW=ctx.padding[1],
            padH=ctx.padding[0],
            dilationW=ctx.dilation[1],
            dilationH=ctx.dilation[0],
            group=ctx.groups,
            deformable_group=ctx.deform_groups,
            im2col_step=cur_im2col_step)
        return output

    @staticmethod
    @once_differentiable
    def backward(
        ctx, grad_output: Tensor
    ) -> Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor], None,
               None, None, None, None, None, None]:
        """Compute gradients w.r.t. ``input``, ``offset`` and ``weight``.

        ``input`` and ``offset`` gradients are produced together whenever
        either one is required; the ``weight`` gradient only when it is
        required. Gradients that are not computed are returned as ``None``,
        as are the seven trailing slots for the remaining arguments of
        :meth:`forward`.
        """
        if ctx.device == 'npu':
            return DeformConv2dFunction._npu_backward(ctx, grad_output)
        input, offset, weight = ctx.saved_tensors

        grad_input = grad_offset = grad_weight = None

        cur_im2col_step = min(ctx.im2col_step, input.size(0))
        assert (input.size(0) % cur_im2col_step
                ) == 0, 'batch size must be divisible by im2col_step'

        grad_output = grad_output.contiguous()
        if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
            grad_input = torch.zeros_like(input)
            grad_offset = torch.zeros_like(offset)
            ext_module.deform_conv_backward_input(
                input,
                offset,
                grad_output,
                grad_input,
                grad_offset,
                weight,
                ctx.bufs_[0],
                kW=weight.size(3),
                kH=weight.size(2),
                dW=ctx.stride[1],
                dH=ctx.stride[0],
                padW=ctx.padding[1],
                padH=ctx.padding[0],
                dilationW=ctx.dilation[1],
                dilationH=ctx.dilation[0],
                group=ctx.groups,
                deformable_group=ctx.deform_groups,
                im2col_step=cur_im2col_step)

        if ctx.needs_input_grad[2]:
            grad_weight = torch.zeros_like(weight)
            ext_module.deform_conv_backward_parameters(
                input,
                offset,
                grad_output,
                grad_weight,
                ctx.bufs_[0],
                ctx.bufs_[1],
                kW=weight.size(3),
                kH=weight.size(2),
                dW=ctx.stride[1],
                dH=ctx.stride[0],
                padW=ctx.padding[1],
                padH=ctx.padding[0],
                dilationW=ctx.dilation[1],
                dilationH=ctx.dilation[0],
                group=ctx.groups,
                deformable_group=ctx.deform_groups,
                scale=1,
                im2col_step=cur_im2col_step)

        return grad_input, grad_offset, grad_weight, \
            None, None, None, None, None, None, None

    @staticmethod
    def _output_size(ctx, input, weight):
        """Return the output shape ``(N, C_out, *spatial)`` as a tuple.

        Each spatial size is
        ``(in + 2 * pad - (dilation * (k - 1) + 1)) // stride + 1``, using
        ``ctx.padding``, ``ctx.dilation`` and ``ctx.stride``.

        Raises:
            ValueError: If any entry of the resulting shape is not positive.
        """
        channels = weight.size(0)
        output_size = (input.size(0), channels)
        for d in range(input.dim() - 2):
            in_size = input.size(d + 2)
            pad = ctx.padding[d]
            kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1
            stride_ = ctx.stride[d]
            output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, )
        if not all(s > 0 for s in output_size):
            raise ValueError(
                'convolution input is too small (output would be ' +
                'x'.join(map(str, output_size)) + ')')
        return output_size


# Functional alias: ``deform_conv2d(...)`` is ``DeformConv2dFunction.apply``,
# taking the arguments of ``DeformConv2dFunction.forward`` after ``ctx``.
deform_conv2d = DeformConv2dFunction.apply


class DeformConv2d(nn.Module):
    r"""Deformable 2D convolution.

    Applies a deformable 2D convolution over an input signal composed of
    several input planes. DeformConv2d was described in the paper
    `Deformable Convolutional Networks
    <https://arxiv.org/pdf/1703.06211.pdf>`_

    Note:
        The argument ``im2col_step`` was added in version 1.3.17, which means
        number of samples processed by the ``im2col_cuda_kernel`` per call.
        It enables users to define ``batch_size`` and ``im2col_step`` more
        flexibly and solved `issue mmcv#1440
        <https://github.com/open-mmlab/mmcv/issues/1440>`_.

    Args:
        in_channels (int): Number of channels in the input image.
        out_channels (int): Number of channels produced by the convolution.
        kernel_size(int, tuple): Size of the convolving kernel.
        stride(int, tuple): Stride of the convolution. Default: 1.
        padding (int or tuple): Zero-padding added to both sides of the input.
            Default: 0.
        dilation (int or tuple): Spacing between kernel elements. Default: 1.
        groups (int): Number of blocked connections from input.
            channels to output channels. Default: 1.
        deform_groups (int): Number of deformable group partitions.
        bias (bool): If True, adds a learnable bias to the output.
            Default: False.
        im2col_step (int): Number of samples processed by im2col_cuda_kernel
            per call. It will work when ``batch_size`` > ``im2col_step``, but
            ``batch_size`` must be divisible by ``im2col_step``. Default: 32.
            `New in version 1.3.17.`
    """

    @deprecated_api_warning({'deformable_groups': 'deform_groups'},
                            cls_name='DeformConv2d')
    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: Union[int, Tuple[int, ...]],
                 stride: Union[int, Tuple[int, ...]] = 1,
                 padding: Union[int, Tuple[int, ...]] = 0,
                 dilation: Union[int, Tuple[int, ...]] = 1,
                 groups: int = 1,
                 deform_groups: int = 1,
                 bias: bool = False,
                 im2col_step: int = 32) -> None:
        super().__init__()

        assert not bias, \
            f'bias={bias} is not supported in DeformConv2d.'
        assert in_channels % groups == 0, \
            f'in_channels {in_channels} cannot be divisible by groups {groups}'
        # Implicit concatenation keeps the message on one line; a backslash
        # continuation inside the literal would embed the next line's
        # indentation in the message.
        assert out_channels % groups == 0, \
            f'out_channels {out_channels} cannot be divisible by groups ' \
            f'{groups}'

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = _pair(kernel_size)
        self.stride = _pair(stride)
        self.padding = _pair(padding)
        self.dilation = _pair(dilation)
        self.groups = groups
        self.deform_groups = deform_groups
        self.im2col_step = im2col_step
        # enable compatibility with nn.Conv2d
        self.transposed = False
        self.output_padding = _single(0)

        # only weight, no bias
        self.weight = nn.Parameter(
            torch.Tensor(out_channels, in_channels // self.groups,
                         *self.kernel_size))

        self.reset_parameters()

    def reset_parameters(self):
        """(Re-)initialize ``self.weight`` in place."""
        # switch the initialization of `self.weight` to the standard kaiming
        # method described in `Delving deep into rectifiers: Surpassing
        # human-level performance on ImageNet classification` - He, K. et al.
        # (2015), using a uniform distribution
        nn.init.kaiming_uniform_(self.weight, nonlinearity='relu')

    def forward(self, x: Tensor, offset: Tensor) -> Tensor:
        """Deformable Convolutional forward function.

        If the input is spatially smaller than the kernel, ``x`` and
        ``offset`` are zero-padded on the bottom/right up to the kernel size
        before the op is called, and the same number of rows/columns is
        cropped from the output afterwards.

        Args:
            x (Tensor): Input feature, shape (B, C_in, H_in, W_in)
            offset (Tensor): Offset for deformable convolution, shape
                (B, deform_groups*kernel_size[0]*kernel_size[1]*2,
                H_out, W_out), H_out, W_out are equal to the output's.

                An offset is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`.
                The spatial arrangement is like:

                .. code:: text

                    (x0, y0) (x1, y1) (x2, y2)
                    (x3, y3) (x4, y4) (x5, y5)
                    (x6, y6) (x7, y7) (x8, y8)

        Returns:
            Tensor: Output of the layer.
        """
        # To fix an assert error in deform_conv_cuda.cpp:128
        # input image is smaller than kernel
        input_pad = (x.size(2) < self.kernel_size[0]) or (x.size(3) <
                                                          self.kernel_size[1])
        if input_pad:
            pad_h = max(self.kernel_size[0] - x.size(2), 0)
            pad_w = max(self.kernel_size[1] - x.size(3), 0)
            x = F.pad(x, (0, pad_w, 0, pad_h), 'constant', 0).contiguous()
            offset = F.pad(offset, (0, pad_w, 0, pad_h), 'constant', 0)
            offset = offset.contiguous()
        out = deform_conv2d(x, offset, self.weight, self.stride, self.padding,
                            self.dilation, self.groups, self.deform_groups,
                            False, self.im2col_step)
        if input_pad:
            # Drop the rows/columns that only exist because of the padding.
            out = out[:, :, :out.size(2) - pad_h, :out.size(3) -
                      pad_w].contiguous()
        return out

    def __repr__(self):
        """Return a multi-line summary of the layer's hyper-parameters."""
        s = self.__class__.__name__
        s += f'(in_channels={self.in_channels},\n'
        s += f'out_channels={self.out_channels},\n'
        s += f'kernel_size={self.kernel_size},\n'
        s += f'stride={self.stride},\n'
        s += f'padding={self.padding},\n'
        s += f'dilation={self.dilation},\n'
        s += f'groups={self.groups},\n'
        s += f'deform_groups={self.deform_groups},\n'
        # bias is not supported in DeformConv2d.
        s += 'bias=False)'
        return s


@MODELS.register_module('DCN')
class DeformConv2dPack(DeformConv2d):
    """Deformable convolution that predicts its own offsets.

    ``forward`` only takes the feature map: an internal ``conv_offset``
    convolution computes the offset tensor from the input, so the module can
    be used like a normal Conv layer. All constructor arguments are forwarded
    unchanged to the parent class.

    The offset tensor is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`.
    The spatial arrangement is like:

    .. code:: text

        (x0, y0) (x1, y1) (x2, y2)
        (x3, y3) (x4, y4) (x5, y5)
        (x6, y6) (x7, y7) (x8, y8)

    Args:
        in_channels (int): Same as nn.Conv2d.
        out_channels (int): Same as nn.Conv2d.
        kernel_size (int or tuple[int]): Same as nn.Conv2d.
        stride (int or tuple[int]): Same as nn.Conv2d.
        padding (int or tuple[int]): Same as nn.Conv2d.
        dilation (int or tuple[int]): Same as nn.Conv2d.
        groups (int): Same as nn.Conv2d.
        bias (bool or str): If specified as `auto`, it will be decided by the
            norm_cfg. Bias will be set as True if norm_cfg is None, otherwise
            False.
    """

    _version = 2

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Two offset values (y and x) per kernel position and deform group.
        offset_channels = (
            self.deform_groups * 2 * self.kernel_size[0] *
            self.kernel_size[1])
        self.conv_offset = nn.Conv2d(
            self.in_channels,
            offset_channels,
            kernel_size=self.kernel_size,
            stride=_pair(self.stride),
            padding=_pair(self.padding),
            dilation=_pair(self.dilation),
            bias=True)
        self.init_offset()

    def init_offset(self):
        """Zero the weight and bias of ``conv_offset`` in place."""
        for param in (self.conv_offset.weight, self.conv_offset.bias):
            param.data.zero_()

    def forward(self, x: Tensor) -> Tensor:  # type: ignore
        """Predict offsets from ``x`` and apply the deformable convolution.

        Args:
            x (Tensor): Input feature map.

        Returns:
            Tensor: Output of ``deform_conv2d``.
        """
        predicted_offset = self.conv_offset(x)
        return deform_conv2d(x, predicted_offset, self.weight, self.stride,
                             self.padding, self.dilation, self.groups,
                             self.deform_groups, False, self.im2col_step)

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        """Rename legacy offset-conv keys, then defer to the parent loader.

        When the checkpoint metadata carries no version or a version below 2,
        keys named ``<prefix without its last character>_offset.weight`` /
        ``..._offset.bias`` are moved to ``<prefix>conv_offset.weight`` /
        ``<prefix>conv_offset.bias`` unless the new key already exists.
        """
        version = local_metadata.get('version', None)

        if version is None or version < 2:
            # the key is different in early versions
            # In version < 2, DeformConvPack loads previous benchmark models.
            legacy_prefix = prefix[:-1]
            for suffix in ('weight', 'bias'):
                new_key = prefix + 'conv_offset.' + suffix
                old_key = legacy_prefix + '_offset.' + suffix
                if new_key not in state_dict and old_key in state_dict:
                    state_dict[new_key] = state_dict.pop(old_key)

        # NOTE(review): this message is emitted for checkpoints that are
        # already at version 2 or newer, not for the ones renamed above;
        # confirm whether that is intended.
        if version is not None and version > 1:
            print_log(
                f'DeformConv2dPack {prefix.rstrip(".")} is upgraded to '
                'version 2.',
                logger='current')

        super()._load_from_state_dict(state_dict, prefix, local_metadata,
                                      strict, missing_keys, unexpected_keys,
                                      error_msgs)


if IS_MLU_AVAILABLE:
    import torchvision
    from mmengine.utils import digit_version
    from torchvision.ops import deform_conv2d as tv_deform_conv2d

    @MODELS.register_module('DCN', force=True)
    class DeformConv2dPack_MLU(DeformConv2d):
        """This class is the DCN implementation of the MLU device.

        The MLU backend support of the operator has been implemented in
        torchvision. The mmcv registration mechanism is used for
        multiplexing here: the class is registered under the name ``DCN``
        with ``force=True`` and its forward pass calls the torchvision
        implementation of DCN.

        Args:
            in_channels (int): Same as nn.Conv2d.
            out_channels (int): Same as nn.Conv2d.
            kernel_size (int or tuple[int]): Same as nn.Conv2d.
            stride (int): Same as nn.Conv2d, while tuple is not supported.
            padding (int): Same as nn.Conv2d, while tuple is not supported.
            dilation (int): Same as nn.Conv2d, while tuple is not supported.
            groups (int): Same as nn.Conv2d.
            bias (bool or str): If specified as `auto`, it will be decided by
                the norm_cfg. Bias will be set as True if norm_cfg is None,
                otherwise False.
            im2col_step (int): Number of samples processed by
                im2col_cuda_kernel per call. It will work when ``batch_size``
                > ``im2col_step``, but ``batch_size`` must be divisible by
                ``im2col_step``. Default: 32. `New in version 1.7.2.
                Currently not supported on MLU devices.`
        """

        def __init__(self, *args, **kwargs):
            installed = digit_version(torchvision.__version__)
            required = digit_version('0.10.0a0')
            assert installed >= required, \
                'the version of torchvision should be >= 0.10.0'
            super().__init__(*args, **kwargs)

            # Two offset values (y and x) per kernel position and group.
            offset_channels = (
                self.deform_groups * 2 * self.kernel_size[0] *
                self.kernel_size[1])
            self.conv_offset = nn.Conv2d(
                self.in_channels,
                offset_channels,
                kernel_size=self.kernel_size,
                stride=_pair(self.stride),
                padding=_pair(self.padding),
                dilation=_pair(self.dilation),
                bias=True)
            self.init_offset()

        def init_offset(self):
            """Zero the weight and bias of ``conv_offset`` in place."""
            for param in (self.conv_offset.weight, self.conv_offset.bias):
                param.data.zero_()

        def forward(self, x: Tensor) -> Tensor:  # type: ignore
            """Predict offsets from ``x`` and run torchvision's deform conv.

            The batch size must be divisible by
            ``min(self.im2col_step, batch_size)``. The input and the weight
            are cast to the dtype of the predicted offset before the call.
            """
            batch_size = x.size(0)
            step = min(self.im2col_step, batch_size)
            assert batch_size % step == 0, \
                'batch size must be divisible by im2col_step'
            offset = self.conv_offset(x)
            x = x.type_as(offset)
            kernel = self.weight.type_as(x)
            # No bias tensor is passed to the torchvision op.
            return tv_deform_conv2d(x, offset, kernel, None, self.stride,
                                    self.padding, self.dilation)


================================================
FILE: mmcv/ops/deform_roi_pool.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional, Tuple

from torch import Tensor, nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair

from ..utils import ext_loader

# Load the `_ext` extension through `ext_loader` (presumably the compiled
# ops library), naming the forward and backward deform-RoI-pool kernels
# that `DeformRoIPoolFunction` calls.
ext_module = ext_loader.load_ext(
    '_ext', ['deform_roi_pool_forward', 'deform_roi_pool_backward'])


class DeformRoIPoolFunction(Function):
    """Autograd function around the deform-RoI-pool extension kernels.

    ``forward`` and ``backward`` allocate zero-initialised result tensors
    and hand them to ``ext_module.deform_roi_pool_forward`` /
    ``ext_module.deform_roi_pool_backward``, which are expected to fill
    them.
    """

    @staticmethod
    def symbolic(g, input, rois, offset, output_size, spatial_scale,
                 sampling_ratio, gamma):
        """Create the ``mmcv::MMCVDeformRoIPool`` graph node.

        ``offset`` is passed as a third operand only when it is not None.
        """
        if offset is None:
            operands = (input, rois)
        else:
            operands = (input, rois, offset)
        return g.op(
            'mmcv::MMCVDeformRoIPool',
            *operands,
            pooled_height_i=output_size[0],
            pooled_width_i=output_size[1],
            spatial_scale_f=spatial_scale,
            sampling_ratio_f=sampling_ratio,
            gamma_f=gamma,
        )

    @staticmethod
    def forward(ctx,
                input: Tensor,
                rois: Tensor,
                offset: Optional[Tensor],
                output_size: Tuple[int, ...],
                spatial_scale: float = 1.0,
                sampling_ratio: int = 0,
                gamma: float = 0.1) -> Tensor:
        """Pool every RoI of ``input`` into an ``output_size`` grid.

        Args:
            input (Tensor): Feature map; its dim 1 becomes the channel
                dimension of the output.
            rois (Tensor): RoIs with 5 columns ``(idx, x1, y1, x2, y2)``.
            offset (Tensor, optional): Sampling offsets. ``None`` is
                replaced by an empty tensor before calling the kernel.
            output_size (tuple[int]): Pooled height and width, normalised
                with ``_pair``.
            spatial_scale (float): Forwarded to the kernel as a float.
            sampling_ratio (int): Forwarded to the kernel as an int.
            gamma (float): Forwarded to the kernel as a float.

        Returns:
            Tensor: Tensor of shape
            ``(rois.size(0), input.size(1), pooled_h, pooled_w)``.
        """
        if offset is None:
            # An empty tensor stands in for "no offset".
            offset = input.new_zeros(0)
        ctx.output_size = _pair(output_size)
        ctx.spatial_scale = float(spatial_scale)
        ctx.sampling_ratio = int(sampling_ratio)
        ctx.gamma = float(gamma)

        assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!'

        pooled_h = ctx.output_size[0]
        pooled_w = ctx.output_size[1]
        output = input.new_zeros(
            (rois.size(0), input.size(1), pooled_h, pooled_w))

        ext_module.deform_roi_pool_forward(
            input,
            rois,
            offset,
            output,
            pooled_height=pooled_h,
            pooled_width=pooled_w,
            spatial_scale=ctx.spatial_scale,
            sampling_ratio=ctx.sampling_ratio,
            gamma=ctx.gamma)

        ctx.save_for_backward(input, rois, offset)
        return output

    @staticmethod
    @once_differentiable
    def backward(
        ctx, grad_output: Tensor
    ) -> Tuple[Tensor, None, Tensor, None, None, None, None]:
        """Compute gradients w.r.t. ``input`` and ``offset``.

        Returns one entry per ``forward`` argument; only the input and the
        offset receive gradients. The offset gradient is ``None`` when the
        saved offset tensor is empty.
        """
        input, rois, offset = ctx.saved_tensors
        grad_input = grad_output.new_zeros(input.shape)
        grad_offset = grad_output.new_zeros(offset.shape)

        ext_module.deform_roi_pool_backward(
            grad_output,
            input,
            rois,
            offset,
            grad_input,
            grad_offset,
            pooled_height=ctx.output_size[0],
            pooled_width=ctx.output_size[1],
            spatial_scale=ctx.spatial_scale,
            sampling_ratio=ctx.sampling_ratio,
            gamma=ctx.gamma)

        # An empty gradient corresponds to the "no offset" placeholder.
        offset_grad = grad_offset if grad_offset.numel() != 0 else None
        return grad_input, None, offset_grad, None, None, None, None


# Functional entry point: an alias of ``DeformRoIPoolFunction.apply``. It is
# called in this file as
# ``deform_roi_pool(input, rois, offset, output_size, spatial_scale,
# sampling_ratio, gamma)``.
deform_roi_pool = DeformRoIPoolFunction.apply


class DeformRoIPool(nn.Module):
    """Module wrapper around the ``deform_roi_pool`` function.

    Args:
        output_size (tuple[int]): Pooled output size; normalised with
            ``_pair`` so a single int is expanded to a pair.
        spatial_scale (float): Stored as a float and forwarded to
            ``deform_roi_pool``. Defaults to 1.0.
        sampling_ratio (int): Stored as an int and forwarded to
            ``deform_roi_pool``. Defaults to 0.
        gamma (float): Stored as a float and forwarded to
            ``deform_roi_pool``. Defaults to 0.1.
    """

    def __init__(self,
                 output_size: Tuple[int, ...],
                 spatial_scale: float = 1.0,
                 sampling_ratio: int = 0,
                 gamma: float = 0.1):
        super().__init__()
        # Normalise the configuration once so that ``forward`` can pass it
        # on unchanged.
        self.output_size = _pair(output_size)
        self.spatial_scale = float(spatial_scale)
        self.sampling_ratio = int(sampling_ratio)
        self.gamma = float(gamma)

    def forward(self,
                input: Tensor,
                rois: Tensor,
                offset: Optional[Tensor] = None) -> Tensor:
        """Pool ``rois`` from ``input``, optionally with sampling offsets.

        Args:
            input (Tensor): Input feature map.
            rois (Tensor): RoIs to pool.
            offset (Tensor, optional): Sampling offsets. Defaults to None.

        Returns:
            Tensor: Result of ``deform_roi_pool``.
        """
        pool_cfg = (self.output_size, self.spatial_scale,
                    self.sampling_ratio, self.gamma)
        return deform_roi_pool(input, rois, offset, *pool_cfg)


class DeformRoIPoolPack(DeformRoIPool):
    """Deformable RoI pooling that predicts its own offsets.

    A first pooling pass without offsets produces features that a small
    fully-connected network (``offset_fc``) turns into per-bin offsets; a
    second pooling pass then uses these offsets. The last layer of
    ``offset_fc`` is zero-initialised, so the predicted offsets start at
    zero.

    Args:
        output_size (tuple[int]): Pooled output size.
        output_channels (int): Number of channels the input feature map
            must have (checked in ``forward``).
        deform_fc_channels (int): Width of the hidden layers of
            ``offset_fc``. Defaults to 1024.
        spatial_scale (float): Forwarded to the parent. Defaults to 1.0.
        sampling_ratio (int): Forwarded to the parent. Defaults to 0.
        gamma (float): Forwarded to the parent. Defaults to 0.1.
    """

    def __init__(self,
                 output_size: Tuple[int, ...],
                 output_channels: int,
                 deform_fc_channels: int = 1024,
                 spatial_scale: float = 1.0,
                 sampling_ratio: int = 0,
                 gamma: float = 0.1):
        super().__init__(output_size, spatial_scale, sampling_ratio, gamma)

        self.output_channels = output_channels
        self.deform_fc_channels = deform_fc_channels

        num_bins = self.output_size[0] * self.output_size[1]
        hidden = self.deform_fc_channels
        offset_layers = [
            nn.Linear(num_bins * self.output_channels, hidden),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, hidden),
            nn.ReLU(inplace=True),
            # Two offset values per pooled bin.
            nn.Linear(hidden, num_bins * 2),
        ]
        self.offset_fc = nn.Sequential(*offset_layers)
        last_fc = self.offset_fc[-1]
        last_fc.weight.data.zero_()
        last_fc.bias.data.zero_()

    def forward(self, input: Tensor, rois: Tensor) -> Tensor:  # type: ignore
        """Pool ``rois`` from ``input`` using internally predicted offsets.

        Args:
            input (Tensor): Feature map whose dim 1 equals
                ``output_channels``.
            rois (Tensor): RoIs to pool.

        Returns:
            Tensor: Result of the offset-guided ``deform_roi_pool`` call.
        """
        assert input.size(1) == self.output_channels
        pool_cfg = (self.output_size, self.spatial_scale,
                    self.sampling_ratio, self.gamma)
        # First pass: pooling without offsets, used to predict the offsets.
        pooled = deform_roi_pool(input, rois, None, *pool_cfg)
        num_rois = rois.size(0)
        offset = self.offset_fc(pooled.view(num_rois, -1))
        offset = offset.view(num_rois, 2, self.output_size[0],
                             self.output_size[1])
        # Second pass: pooling guided by the predicted offsets.
        return deform_roi_pool(input, rois, offset, *pool_cfg)


class ModulatedDeformRoIPoolPack(DeformRoIPool):
    """Deformable RoI pooling with predicted offsets and a modulation mask.

    A first pooling pass without offsets produces features from which
    ``offset_fc`` predicts per-bin offsets and ``mask_fc`` predicts a
    per-bin mask (its last layer is a sigmoid). The result of the second,
    offset-guided pooling pass is multiplied by that mask. The final linear
    layers of both networks are zero-initialised.

    Args:
        output_size (tuple[int]): Pooled output size.
        output_channels (int): Number of channels the input feature map
            must have (checked in ``forward``).
        deform_fc_channels (int): Width of the hidden layers of
            ``offset_fc`` and ``mask_fc``. Defaults to 1024.
        spatial_scale (float): Forwarded to the parent. Defaults to 1.0.
        sampling_ratio (int): Forwarded to the parent. Defaults to 0.
        gamma (float): Forwarded to the parent. Defaults to 0.1.
    """

    def __init__(self,
                 output_size: Tuple[int, ...],
                 output_channels: int,
                 deform_fc_channels: int = 1024,
                 spatial_scale: float = 1.0,
                 sampling_ratio: int = 0,
                 gamma: float = 0.1):
        super().__init__(output_size, spatial_scale, sampling_ratio, gamma)

        self.output_channels = output_channels
        self.deform_fc_channels = deform_fc_channels

        num_bins = self.output_size[0] * self.output_size[1]
        in_features = num_bins * self.output_channels
        hidden = self.deform_fc_channels

        offset_layers = [
            nn.Linear(in_features, hidden),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, hidden),
            nn.ReLU(inplace=True),
            # Two offset values per pooled bin.
            nn.Linear(hidden, num_bins * 2),
        ]
        self.offset_fc = nn.Sequential(*offset_layers)
        last_offset_fc = self.offset_fc[-1]
        last_offset_fc.weight.data.zero_()
        last_offset_fc.bias.data.zero_()

        mask_layers = [
            nn.Linear(in_features, hidden),
            nn.ReLU(inplace=True),
            # One mask value per pooled bin.
            nn.Linear(hidden, num_bins * 1),
            nn.Sigmoid(),
        ]
        self.mask_fc = nn.Sequential(*mask_layers)
        last_mask_fc = self.mask_fc[2]
        last_mask_fc.weight.data.zero_()
        last_mask_fc.bias.data.zero_()

    def forward(self, input: Tensor, rois: Tensor) -> Tensor:  # type: ignore
        """Pool ``rois`` with predicted offsets and scale by the mask.

        Args:
            input (Tensor): Feature map whose dim 1 equals
                ``output_channels``.
            rois (Tensor): RoIs to pool.

        Returns:
            Tensor: Offset-guided pooling result multiplied by the mask.
        """
        assert input.size(1) == self.output_channels
        pool_cfg = (self.output_size, self.spatial_scale,
                    self.sampling_ratio, self.gamma)
        out_h, out_w = self.output_size[0], self.output_size[1]
        # First pass: pooling without offsets, shared by both predictors.
        pooled = deform_roi_pool(input, rois, None, *pool_cfg)
        num_rois = rois.size(0)
        offset = self.offset_fc(pooled.view(num_rois, -1))
        offset = offset.view(num_rois, 2, out_h, out_w)
        mask = self.mask_fc(pooled.view(num_rois, -1))
        mask = mask.view(num_rois, 1, out_h, out_w)
        # Second pass: pooling guided by the predicted offsets.
        deformed = deform_roi_pool(input, rois, offset, *pool_cfg)
        return deformed * mask


================================================
FILE: mmcv/ops/deprecated_wrappers.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
# This file is for backward compatibility.
# Module wrappers for empty tensor have been moved to mmcv.cnn.bricks.
import warnings

from ..cnn.bricks.wrappers import Conv2d, ConvTranspose2d, Linear, MaxPool2d


class Conv2d_deprecated(Conv2d):
    """Backward-compatible alias of the ``Conv2d`` wrapper.

    Behaves like the parent wrapper but emits a ``DeprecationWarning`` on
    construction, asking users to import the wrapper from ``mmcv.cnn``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # stacklevel=2 attributes the warning to the code that instantiates
        # the wrapper instead of to this module, so it is reported against
        # the call site that has to change its import.
        warnings.warn(
            'Importing Conv2d wrapper from "mmcv.ops" will be deprecated in'
            ' the future. Please import them from "mmcv.cnn" instead',
            DeprecationWarning,
            stacklevel=2)


class ConvTranspose2d_deprecated(ConvTranspose2d):
    """Backward-compatible alias of the ``ConvTranspose2d`` wrapper.

    Behaves like the parent wrapper but emits a ``DeprecationWarning`` on
    construction, asking users to import the wrapper from ``mmcv.cnn``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # stacklevel=2 attributes the warning to the code that instantiates
        # the wrapper instead of to this module, so it is reported against
        # the call site that has to change its import.
        warnings.warn(
            'Importing ConvTranspose2d wrapper from "mmcv.ops" will be '
            'deprecated in the future. Please import them from "mmcv.cnn" '
            'instead',
            DeprecationWarning,
            stacklevel=2)


class MaxPool2d_deprecated(MaxPool2d):
    """Backward-compatible alias of the ``MaxPool2d`` wrapper.

    Behaves like the parent wrapper but emits a ``DeprecationWarning`` on
    construction, asking users to import the wrapper from ``mmcv.cnn``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # stacklevel=2 attributes the warning to the code that instantiates
        # the wrapper instead of to this module, so it is reported against
        # the call site that has to change its import.
        warnings.warn(
            'Importing MaxPool2d wrapper from "mmcv.ops" will be deprecated in'
            ' the future. Please import them from "mmcv.cnn" instead',
            DeprecationWarning,
            stacklevel=2)


class Linear_deprecated(Linear):
    """Backward-compatible alias of the ``Linear`` wrapper.

    Behaves like the parent wrapper but emits a ``DeprecationWarning`` on
    construction, asking users to import the wrapper from ``mmcv.cnn``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # stacklevel=2 attributes the warning to the code that instantiates
        # the wrapper instead of to this module, so it is reported against
        # the call site that has to change its import.
        warnings.warn(
            'Importing Linear wrapper from "mmcv.ops" will be deprecated in'
            ' the future. Please import them from "mmcv.cnn" instead',
            DeprecationWarning,
            stacklevel=2)


================================================
FILE: mmcv/ops/diff_iou_rotated.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
# Adapted from https://github.com/lilanxiao/Rotated_IoU/blob/master/box_intersection_2d.py  # noqa
# Adapted from https://github.com/lilanxiao/Rotated_IoU/blob/master/oriented_iou_loss.py  # noqa
from typing import Tuple

import torch
from torch import Tensor
from torch.autograd import Function

from ..utils import ext_loader

# Small constant added to the denominator in `box_intersection` when the
# intersection points are computed, for numerical stability.
EPSILON = 1e-8
# Load the `_ext` extension through `ext_loader` (presumably the compiled
# ops library), naming the vertex-sorting kernel that `SortVertices.forward`
# calls.
ext_module = ext_loader.load_ext('_ext',
                                 ['diff_iou_rotated_sort_vertices_forward'])


class SortVertices(Function):
    """Autograd wrapper around the vertex-sorting extension kernel.

    The returned indices are marked as non-differentiable (except when
    ``torch.__version__`` is ``'parrots'``), and ``backward`` returns an
    empty tuple.
    """

    @staticmethod
    def forward(ctx, vertices, mask, num_valid):
        """Return the indices computed by the extension kernel."""
        sorted_idx = ext_module.diff_iou_rotated_sort_vertices_forward(
            vertices, mask, num_valid)
        running_on_parrots = torch.__version__ == 'parrots'
        if not running_on_parrots:
            ctx.mark_non_differentiable(sorted_idx)
        return sorted_idx

    @staticmethod
    def backward(ctx, gradout):
        """Return no gradients."""
        return ()


def box_intersection(corners1: Tensor,
                     corners2: Tensor) -> Tuple[Tensor, Tensor]:
    """Intersect every edge of box1 with every edge of box2.

    Convention: if two edges are collinear, there is no intersection point.

    Args:
        corners1 (Tensor): (B, N, 4, 2) First batch of boxes.
        corners2 (Tensor): (B, N, 4, 2) Second batch of boxes.

    Returns:
        Tuple:
         - Tensor: Intersection points, one per (edge of box1, edge of
           box2) pair, zeroed where the pair does not intersect. As
           computed here the shape is (B, N, 4, 4, 1, 2): ``split`` keeps
           a singleton axis in front of the coordinate axis.
         - Tensor: Boolean mask of valid intersections, of shape
           (B, N, 4, 4, 1) for the same reason.
    """
    # Edge k joins corner k to corner (k + 1) % 4 and is stored as
    # (x_start, y_start, x_end, y_end).
    next_corner = [1, 2, 3, 0]
    edges1 = torch.cat([corners1, corners1[:, :, next_corner, :]], dim=3)
    edges2 = torch.cat([corners2, corners2[:, :, next_corner, :]], dim=3)
    # Broadcast so that axis 2 indexes edges of box1 and axis 3 indexes
    # edges of box2.
    x1, y1, x2, y2 = edges1.unsqueeze(3).split([1, 1, 1, 1], dim=-1)
    x3, y3, x4, y4 = edges2.unsqueeze(2).split([1, 1, 1, 1], dim=-1)
    # math: https://en.wikipedia.org/wiki/Line%E2%80%93line_intersection
    denom = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4)
    parallel = denom == .0
    numer_t = (x1 - x3) * (y3 - y4) - (y1 - y3) * (x3 - x4)
    numer_u = (x1 - x2) * (y1 - y3) - (y1 - y2) * (x1 - x3)

    # Parameters along each segment; parallel pairs get -1 so that they
    # fail the (0, 1) range test below.
    t = numer_t / denom
    t[parallel] = -1.
    u = -numer_u / denom
    u[parallel] = -1.
    on_segment1 = (t > 0) & (t < 1)  # intersection on line segment 1
    on_segment2 = (u > 0) & (u < 1)  # intersection on line segment 2
    mask = on_segment1 & on_segment2

    # overwrite with EPSILON. otherwise numerically unstable
    t = numer_t / (denom + EPSILON)
    points = torch.stack([x1 + t * (x2 - x1), y1 + t * (y2 - y1)], dim=-1)
    points = points * mask.float().unsqueeze(-1)
    return points, mask


def box1_in_box2(corners1: Tensor, corners2: Tensor) -> Tensor:
    """Test which corners of box1 lie inside box2.

    Each corner of box1 is projected onto the two sides of box2 that start
    at its first corner (towards corner 1 and towards corner 3). The corner
    counts as inside when both normalised projections lie in ``[0, 1]``,
    with a tolerance of 1e-6, so corners exactly on an edge are valid too.

    Args:
        corners1 (Tensor): (B, N, 4, 2) First batch of boxes.
        corners2 (Tensor): (B, N, 4, 2) Second batch of boxes.

    Returns:
        Tensor: (B, N, 4) True where the i-th corner of box1 is in box2.
    """
    origin = corners2[:, :, 0:1, :]  # (B, N, 1, 2)
    side_ab = corners2[:, :, 1:2, :] - origin  # (B, N, 1, 2)
    side_ad = corners2[:, :, 3:4, :] - origin  # (B, N, 1, 2)
    rel = corners1 - origin  # (B, N, 4, 2)

    def _within_side(side: Tensor) -> Tensor:
        # NOTE: dividing the projection by the squared side length keeps
        # the test stable when the two boxes are exactly the same and for
        # different scales of bboxes.
        frac = torch.sum(side * rel, dim=-1) / torch.sum(side * side, dim=-1)
        return (frac > -1e-6) * (frac < 1 + 1e-6)  # (B, N, 4)

    return _within_side(side_ab) * _within_side(side_ad)


def box_in_box(corners1: Tensor, corners2: Tensor) -> Tuple[Tensor, Tensor]:
    """Run the corner-containment test in both directions.

    Args:
        corners1 (Tensor): (B, N, 4, 2) First batch of boxes.
        corners2 (Tensor): (B, N, 4, 2) Second batch of boxes.

    Returns:
        Tuple:
         - Tensor: (B, N, 4) True if i-th corner of box1 is in box2.
         - Tensor: (B, N, 4) True if i-th corner of box2 is in box1.
    """
    return (box1_in_box2(corners1, corners2),
            box1_in_box2(corners2, corners1))


def build_vertices(corners1: Tensor, corners2: Tensor, c1_in_2: Tensor,
                   c2_in_1: Tensor, intersections: Tensor,
                   valid_mask: Tensor) -> Tuple[Tensor, Tensor]:
    """Collect all candidate vertices of the intersection polygon.

    The candidates are the 4 corners of box1, the 4 corners of box2 and the
    16 edge-edge intersection points, concatenated in that order together
    with their validity flags.

    Args:
        corners1 (Tensor): (B, N, 4, 2) First batch of boxes.
        corners2 (Tensor): (B, N, 4, 2) Second batch of boxes.
        c1_in_2 (Tensor): (B, N, 4) True if i-th corner of box1 is in box2.
        c2_in_1 (Tensor): (B, N, 4) True if i-th corner of box2 is in box1.
        intersections (Tensor): Intersections; reshaped to (B, N, 16, 2).
        valid_mask (Tensor): Valid intersections mask; reshaped to
            (B, N, 16).

    Returns:
        Tuple:
         - Tensor: (B, N, 24, 2) Vertices of intersection area;
               only some elements are valid.
         - Tensor: (B, N, 24) Mask of valid elements in vertices.
    """
    # NOTE: invalid intersections are zero and have zero gradient (they
    # were masked by multiplying with 0); this can be used as a trick.
    batch, num_boxes = corners1.size()[0], corners1.size()[1]
    crossing_points = intersections.view([batch, num_boxes, -1, 2])
    crossing_valid = valid_mask.view([batch, num_boxes, -1])
    # (B, N, 4 + 4 + 16, 2)
    vertices = torch.cat([corners1, corners2, crossing_points], dim=2)
    # Bool (B, N, 4 + 4 + 16)
    mask = torch.cat([c1_in_2, c2_in_1, crossing_valid], dim=2)
    return vertices, mask


def sort_indices(vertices: Tensor, mask: Tensor) -> Tensor:
    """Sort indices.

    The vertices are first centred on the mean of the valid vertices and
    then handed to ``SortVertices``.

    Note:
        why 9? the polygon has maximal 8 vertices.
        +1 to duplicate the first element.
        the index should have following structure:
            (A, B, C, ... , A, X, X, X)
        and X indicates the index of arbitrary elements in the last
        16 (intersections not corners) with value 0 and mask False.
        (cause they have zero value and zero gradient)

    Args:
        vertices (Tensor): (B, N, 24, 2) Box vertices.
        mask (Tensor): (B, N, 24) Mask.

    Returns:
        Tensor: (B, N, 9) Sorted indices.

    """
    valid_count = mask.int().sum(dim=2).int()  # (B, N)
    masked_sum = torch.sum(
        vertices * mask.float().unsqueeze(-1), dim=2, keepdim=True)
    centroid = masked_sum / valid_count.unsqueeze(-1).unsqueeze(-1)
    # normalization makes sorting easier
    centred = vertices - centroid
    sorted_idx = SortVertices.apply(centred, mask, valid_count)
    return sorted_idx.long()


def calculate_area(idx_sorted: Tensor,
                   vertices: Tensor) -> Tuple[Tensor, Tensor]:
    """Compute the polygon area from sorted vertex indices.

    The vertices are gathered in the given order and the area is half the
    absolute sum of the cross products of consecutive vertices.

    Args:
        idx_sorted (Tensor): (B, N, 9) Sorted vertex ids.
        vertices (Tensor): (B, N, 24, 2) Vertices.

    Returns:
        Tuple:
         - Tensor (B, N): Area of intersection.
         - Tensor: (B, N, 9, 2) Vertices of polygon with zero padding.
    """
    # Use the same index for the x and the y coordinate of each vertex.
    gather_idx = idx_sorted.unsqueeze(-1).repeat([1, 1, 1, 2])
    selected = torch.gather(vertices, 2, gather_idx)
    current = selected[:, :, 0:-1]
    following = selected[:, :, 1:]
    cross = current[..., 0] * following[..., 1] \
        - current[..., 1] * following[..., 0]
    area = torch.abs(torch.sum(cross, dim=2)) / 2
    return area, selected


def oriented_box_intersection_2d(corners1: Tensor,
                                 corners2: Tensor) -> Tuple[Tensor, Tensor]:
    """Calculate intersection area of 2d rotated boxes.

    Chains the helpers of this module: edge intersections, corner
    containment, candidate vertex collection, vertex sorting and finally
    the area computation.

    Args:
        corners1 (Tensor): (B, N, 4, 2) First batch of boxes.
        corners2 (Tensor): (B, N, 4, 2) Second batch of boxes.

    Returns:
        Tuple:
         - Tensor (B, N): Area of intersection.
         - Tensor (B, N, 9, 2): Vertices of polygon with zero padding.
    """
    crossings, crossings_valid = box_intersection(corners1, corners2)
    c1_in_2, c2_in_1 = box_in_box(corners1, corners2)
    candidates, candidates_valid = build_vertices(corners1, corners2,
                                                  c1_in_2, c2_in_1,
                                                  crossings, crossings_valid)
    order = sort_indices(candidates, candidates_valid)
    return calculate_area(order, candidates)


def box2corners(box: Tensor) -> Tensor:
    """Convert rotated 2d box coordinate to corners.

    The four corners are built around the origin from the half extents,
    rotated by ``alpha`` and then shifted to the box centre.

    Args:
        box (Tensor): (B, N, 5) with x, y, w, h, alpha.

    Returns:
        Tensor: (B, N, 4, 2) Corners.
    """
    batch = box.size()[0]
    x, y, w, h, alpha = box.split([1, 1, 1, 1, 1], dim=-1)
    # Signed half extents of the four corners before rotation.
    # ``new_tensor`` already creates them with the dtype and device of box.
    local_x = box.new_tensor([0.5, -0.5, -0.5, 0.5]) * w  # (B, N, 4)
    local_y = box.new_tensor([0.5, 0.5, -0.5, -0.5]) * h  # (B, N, 4)
    local = torch.stack([local_x, local_y], dim=-1)  # (B, N, 4, 2)

    sin, cos = torch.sin(alpha), torch.cos(alpha)
    rot_T = torch.stack(
        [torch.cat([cos, sin], dim=-1),
         torch.cat([-sin, cos], dim=-1)],
        dim=-2)  # (B, N, 2, 2)
    rotated = torch.bmm(local.view([-1, 4, 2]), rot_T.view([-1, 2, 2]))
    rotated = rotated.view([batch, -1, 4, 2])  # (B * N, 4, 2) -> (B, N, 4, 2)
    # Shift every corner by the box centre.
    centre = torch.cat([x, y], dim=-1).unsqueeze(2)  # (B, N, 1, 2)
    return rotated + centre


def diff_iou_rotated_2d(box1: Tensor, box2: Tensor) -> Tensor:
    """Calculate differentiable iou of rotated 2d boxes.

    The intersection area comes from ``oriented_box_intersection_2d``; the
    box areas are the products of columns 2 and 3 (w and h) of each box.

    Args:
        box1 (Tensor): (B, N, 5) First box.
        box2 (Tensor): (B, N, 5) Second box.

    Returns:
        Tensor: (B, N) IoU.
    """
    overlap, _ = oriented_box_intersection_2d(
        box2corners(box1), box2corners(box2))  # (B, N)
    area1 = box1[:, :, 2] * box1[:, :, 3]
    area2 = box2[:, :, 2] * box2[:, :, 3]
    return overlap / (area1 + area2 - overlap)


def diff_iou_rotated_3d(box3d1: Tensor, box3d2: Tensor) -> Tensor:
    """Calculate differentiable iou of rotated 3d boxes.

    The 2d intersection area of the rotated footprints (columns 0, 1, 3, 4
    and 6) is multiplied by the overlap of the two extents along z, which
    are centred on column 2 with size column 5.

    Args:
        box3d1 (Tensor): (B, N, 3+3+1) First box (x,y,z,w,h,l,alpha).
        box3d2 (Tensor): (B, N, 3+3+1) Second box (x,y,z,w,h,l,alpha).

    Returns:
        Tensor: (B, N) IoU.
    """
    footprint_cols = [0, 1, 3, 4, 6]  # 2d box
    corners1 = box2corners(box3d1[..., footprint_cols])
    corners2 = box2corners(box3d2[..., footprint_cols])
    overlap_2d, _ = oriented_box_intersection_2d(corners1, corners2)

    z1, z2 = box3d1[..., 2], box3d2[..., 2]
    half1 = box3d1[..., 5] * 0.5
    half2 = box3d2[..., 5] * 0.5
    upper = torch.min(z1 + half1, z2 + half2)
    lower = torch.max(z1 - half1, z2 - half2)
    # A negative difference means the extents do not overlap along z.
    z_overlap = (upper - lower).clamp_(min=0.)

    overlap_3d = overlap_2d * z_overlap
    volume1 = box3d1[..., 3] * box3d1[..., 4] * box3d1[..., 5]
    volume2 = box3d2[..., 3] * box3d2[..., 4] * box3d2[..., 5]
    return overlap_3d / (volume1 + volume2 - overlap_3d)


================================================
FILE: mmcv/ops/filtered_lrelu.py
================================================
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

# source: https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/filtered_lrelu.py # noqa
import warnings
from typing import Dict, Optional, Union

import numpy as np
import torch

from ..utils import IS_MUSA_AVAILABLE, ext_loader
from .bias_act import bias_act
from .upfirdn2d import _get_filter_size, _parse_padding, upfirdn2d

# Load the `_ext` extension through `ext_loader` (presumably the compiled
# ops library), naming the two filtered-lrelu symbols listed below.
ext_module = ext_loader.load_ext('_ext',
                                 ['filtered_lrelu', 'filtered_lrelu_act_'])

# NOTE(review): not referenced in the visible part of this file; presumably
# carried over from the upstream StyleGAN3 source - confirm before removing.
_plugin = None


def filtered_lrelu(input: torch.Tensor,
                   filter_up: Optional[torch.Tensor] = None,
                   filter_down: Optional[torch.Tensor] = None,
                   bias: Optional[torch.Tensor] = None,
                   up: int = 1,
                   down: int = 1,
                   padding: int = 0,
                   gain: float = np.sqrt(2),
                   slope: float = 0.2,
                   clamp: Optional[Union[float, int]] = None,
                   flip_filter: bool = False,
                   use_custom_op: bool = True):
    """Filtered leaky ReLU for a batch of 2D images.

    Performs the following sequence of operations for each channel:

    1. Add channel-specific bias if `bias` is provided.

    2. Upsample the image by inserting N-1 zeros after each pixel (`up`).

    3. Pad the image with the specified number of zeros on each side
       (`padding`). Negative padding corresponds to cropping the image.

    4. Convolve the image with the specified upsampling FIR filter
       (`filter_up`), shrinking it so that the footprint of all output
       pixels lies within the input image.

    5. Multiply each value by the provided gain factor (`gain`).

    6. Apply leaky ReLU activation function to each value.

    7. Clamp each value between -clamp and +clamp, if `clamp` parameter is
       provided.

    8. Convolve the image with the specified downsampling FIR filter
       (`filter_down`), shrinking it so that the footprint of all output
       pixels lies within the input image.

    9. Downsample the image by keeping every Nth pixel (`down`).

    The fused op is considerably more efficient than performing the same
    calculation using standard PyTorch ops. It supports gradients of arbitrary
    order.

    Dispatch: with ``use_custom_op`` enabled, CUDA inputs go through the
    fused op. MUSA inputs are moved to the CPU together with the bias and
    both filters and are processed by the reference implementation; the
    result is not moved back. Every other case uses the reference
    implementation directly.

    Args:
        input (torch.Tensor): Float32/float16/float64 input tensor of the shape
            `[batch_size, num_channels, in_height, in_width]`.
        filter_up (torch.Tensor): Float32 upsampling FIR filter of the shape
            `[filter_height, filter_width]` (non-separable), `[filter_taps]`
            (separable), or `None` (identity). Defaults to None.
        filter_down (torch.Tensor): Float32 downsampling FIR filter of the
            shape `[filter_height, filter_width]` (non-separable),
            `[filter_taps]` (separable), or `None` (identity).
            Defaults to None.
        bias (torch.Tensor): Bias vector, or `None` to disable. Must be
            a 1D tensor of the same type as `input`. The length of vector must
            match the channel dimension of `input`. Defaults to None.
        up (int): Integer upsampling factor. Defaults to 1.
        down (int): Integer downsampling factor. Defaults to 1.
        padding (int): Padding with respect to the upsampled image. Can be a
            single number or a list/tuple `[x, y]` or `[x_before, x_after,
            y_before, y_after]`. Defaults to 0.
        gain (float): Overall scaling factor for signal magnitude.
            Defaults to np.sqrt(2).
        slope (float): Slope on the negative side of leaky ReLU.
            Defaults to 0.2.
        clamp (Optional[Union[float, int]]): Maximum magnitude for leaky ReLU
            output. Defaults to None.
        flip_filter (bool): False = convolution, True = correlation.
            Defaults to False.
        use_custom_op (bool): Whether to use customized op.
            Defaults to True.

    Returns:
        Tensor of the shape `[batch_size, num_channels, out_height,
        out_width]`.
    """
    assert isinstance(input, torch.Tensor)

    if use_custom_op and input.is_cuda:
        fused_op = _filtered_lrelu_cuda(
            up=up,
            down=down,
            padding=padding,
            gain=gain,
            slope=slope,
            clamp=clamp,
            flip_filter=flip_filter)
        return fused_op.apply(input, filter_up, filter_down, bias, None, 0, 0)

    if use_custom_op and IS_MUSA_AVAILABLE and input.is_musa:
        # @MTAI there have some bugs
        # Move every tensor to the CPU before running the reference path.
        input = input.cpu()
        bias = None if bias is None else bias.cpu()
        filter_up = None if filter_up is None else filter_up.cpu()
        filter_down = None if filter_down is None else filter_down.cpu()

    return _filtered_lrelu_ref(
        input,
        filter_up=filter_up,
        filter_down=filter_down,
        bias=bias,
        up=up,
        down=down,
        padding=padding,
        gain=gain,
        slope=slope,
        clamp=clamp,
        flip_filter=flip_filter)


def _filtered_lrelu_ref(input: torch.Tensor,
                        filter_up: Optional[torch.Tensor] = None,
                        filter_down: Optional[torch.Tensor] = None,
                        bias: Optional[torch.Tensor] = None,
                        up: int = 1,
                        down: int = 1,
                        padding: int = 0,
                        gain: float = np.sqrt(2),
                        slope: float = 0.2,
                        clamp: Optional[Union[float, int]] = None,
                        flip_filter: bool = False):
    """Reference implementation of `filtered_lrelu()` composed from the
    existing `upfirdn2d()` and `bias_act()` ops.

    The pipeline is: add bias, upsample with `filter_up`, apply the leaky
    ReLU (with gain and optional clamp), then downsample with `filter_down`.
    It is slow and memory-hungry compared with the custom op.

    Args:
        input (torch.Tensor): Input tensor of the shape
            `[batch_size, num_channels, in_height, in_width]`.
        filter_up (torch.Tensor, optional): Upsampling FIR filter, or `None`
            for identity. Defaults to None.
        filter_down (torch.Tensor, optional): Downsampling FIR filter, or
            `None` for identity. Defaults to None.
        bias (torch.Tensor, optional): Bias tensor with the same dtype as
            `input`, or `None` to disable. Defaults to None.
        up (int): Integer upsampling factor (>= 1). Defaults to 1.
        down (int): Integer downsampling factor (>= 1). Defaults to 1.
        padding (int | list | tuple): Padding with respect to the upsampled
            image, in any form accepted by `_parse_padding`. Defaults to 0.
        gain (float): Overall scaling factor for signal magnitude, must be
            positive. Defaults to np.sqrt(2).
        slope (float): Slope on the negative side of leaky ReLU, must be
            non-negative. Defaults to 0.2.
        clamp (float or int, optional): Maximum magnitude for leaky ReLU
            output, must be non-negative. Defaults to None.
        flip_filter (bool): False = convolution, True = correlation.
            Defaults to False.

    Returns:
        Tensor of the shape `[batch_size, num_channels, out_height,
        out_width]` with the same dtype as `input`.
    """
    # Validate arguments (same checks, same order as the fast path expects).
    assert isinstance(input, torch.Tensor) and input.ndim == 4
    fu_w, fu_h = _get_filter_size(filter_up)
    fd_w, fd_h = _get_filter_size(filter_down)
    assert bias is None or (isinstance(bias, torch.Tensor)
                            and bias.dtype == input.dtype)
    assert isinstance(up, int) and up >= 1
    assert isinstance(down, int) and down >= 1
    pad_x0, pad_x1, pad_y0, pad_y1 = _parse_padding(padding)
    assert gain == float(gain) and gain > 0
    assert slope == float(slope) and slope >= 0
    assert clamp is None or (clamp == float(clamp) and clamp >= 0)

    # Work out the shape and dtype the result must have.
    num_batch, num_channels, src_h, src_w = input.shape
    expected_dtype = input.dtype
    lost_w = (fu_w - 1) + (fd_w - 1)
    lost_h = (fu_h - 1) + (fd_h - 1)
    dst_w = (src_w * up + (pad_x0 + pad_x1) - lost_w + (down - 1)) // down
    dst_h = (src_h * up + (pad_y0 + pad_y1) - lost_h + (down - 1)) // down
    expected_shape = (num_batch, num_channels, dst_h, dst_w)

    # Step 1: bias only (no activation yet).
    result = bias_act(input=input, bias=bias)
    # Step 2: upsample; the gain of up**2 is applied by upfirdn2d.
    result = upfirdn2d(
        input=result,
        filter=filter_up,
        up=up,
        padding=[pad_x0, pad_x1, pad_y0, pad_y1],
        gain=up**2,
        flip_filter=flip_filter)
    # Step 3: leaky ReLU with gain and optional clamping.
    result = bias_act(
        input=result, act='lrelu', alpha=slope, gain=gain, clamp=clamp)
    # Step 4: downsample.
    result = upfirdn2d(
        input=result,
        filter=filter_down,
        down=down,
        flip_filter=flip_filter)

    assert result.shape == expected_shape
    assert result.dtype == expected_dtype
    return result


# Cache of autograd Function classes built by `_filtered_lrelu_cuda`, keyed
# by the tuple of hyper-parameters the class was specialized for.
_filtered_lrelu_cuda_cache: Dict = dict()


def _filtered_lrelu_cuda(up: int = 1,
                         down: int = 1,
                         padding: int = 0,
                         gain: float = np.sqrt(2),
                         slope: float = 0.2,
                         clamp: Optional[Union[float, int]] = None,
                         flip_filter: bool = False):
    """Fast CUDA implementation of `filtered_lrelu()` using custom ops.

    Builds a `torch.autograd.Function` subclass specialized for the given
    hyper-parameters. Classes are memoized in `_filtered_lrelu_cuda_cache`,
    so repeated calls with the same parameters return the same class.

    Args:
        up (int): Integer upsampling factor. Defaults to 1.
        down (int): Integer downsampling factor. Defaults to 1.
        padding (int | list | tuple): Padding with respect to the upsampled
            image. Can be a single number or a list/tuple `[x, y]` or
            `[x_before, x_after, y_before, y_after]`. Defaults to 0.
        gain (float): Overall scaling factor for signal magnitude.
            Defaults to np.sqrt(2).
        slope (float): Slope on the negative side of leaky ReLU.
            Defaults to 0.2.
        clamp (float or int): Maximum magnitude for leaky ReLU
            output. Defaults to None.
        flip_filter (bool): False = convolution, True = correlation.
            Defaults to False.

    Returns:
        The `FilteredLReluCuda` autograd Function class bound to these
        parameters. Invoke it as
        `cls.apply(input, filter_up, filter_down, bias, si, sx, sy)`.
    """
    assert isinstance(up, int) and up >= 1
    assert isinstance(down, int) and down >= 1
    px0, px1, py0, py1 = _parse_padding(padding)
    # The scalar parameters are coerced to plain floats; they are part of
    # the cache key below and are passed on to the extension.
    assert gain == float(gain) and gain > 0
    gain = float(gain)
    assert slope == float(slope) and slope >= 0
    slope = float(slope)
    assert clamp is None or (clamp == float(clamp) and clamp >= 0)
    # A missing clamp is represented as +inf.
    clamp = float(clamp if clamp is not None else 'inf')

    # Lookup from cache.
    key = (up, down, px0, px1, py0, py1, gain, slope, clamp, flip_filter)
    if key in _filtered_lrelu_cuda_cache:
        return _filtered_lrelu_cuda_cache[key]

    # Forward op.
    class FilteredLReluCuda(torch.autograd.Function):

        @staticmethod
        def forward(ctx, input, filter_up, filter_down, bias, si, sx, sy):
            # pylint: disable=arguments-differ
            # `si` is an optional sign tensor and `sx`/`sy` are forwarded to
            # the extension unchanged (presumably offsets into the sign
            # tensor -- TODO confirm against the extension source).
            assert isinstance(input, torch.Tensor) and input.ndim == 4

            # Replace empty up/downsample kernels with full 1x1 kernels
            # (faster than separable).
            if filter_up is None:
                filter_up = torch.ones([1, 1],
                                       dtype=torch.float32,
                                       device=input.device)
            if filter_down is None:
                filter_down = torch.ones([1, 1],
                                         dtype=torch.float32,
                                         device=input.device)
            assert 1 <= filter_up.ndim <= 2
            assert 1 <= filter_down.ndim <= 2

            # Replace separable 1x1 kernels with full 1x1 kernels when scale
            # factor is 1.
            if up == 1 and filter_up.ndim == 1 and filter_up.shape[0] == 1:
                filter_up = filter_up.square()[None]
            if down == 1 and filter_down.ndim == 1 and filter_down.shape[
                    0] == 1:
                filter_down = filter_down.square()[None]

            # Missing sign input tensor.
            # An empty tensor stands in for "no sign input"; it is moved to
            # the compute device right before each extension call.
            if si is None:
                si = torch.empty([0])

            # Missing bias tensor.
            if bias is None:
                bias = torch.zeros([input.shape[1]],
                                   dtype=input.dtype,
                                   device=input.device)

            # Construct internal sign tensor only if gradients are needed.
            write_signs = (si.numel() == 0) and (input.requires_grad
                                                 or bias.requires_grad)

            # Warn if input storage strides are not in decreasing order due to
            # e.g. channels-last layout.
            # Dimensions of size 1 are ignored in this check.
            strides = [
                input.stride(i) for i in range(input.ndim) if input.size(i) > 1
            ]
            if any(a < b for a, b in zip(strides[:-1], strides[1:])):
                warnings.warn(
                    'low-performance memory layout detected in filtered_lrelu '
                    'input', RuntimeWarning)

            # Call C++/Cuda plugin if datatype is supported.
            if input.dtype in [torch.float16, torch.float32]:
                if torch.cuda.current_stream(
                        input.device) != torch.cuda.default_stream(
                            input.device):
                    warnings.warn(
                        'filtered_lrelu called with non-default cuda stream '
                        'but concurrent execution is not supported',
                        RuntimeWarning)
                y, so, return_code = ext_module.filtered_lrelu(
                    input, filter_up, filter_down, bias, si.to(input.device),
                    up, down, px0, px1, py0, py1, sx, sy, gain, slope, clamp,
                    flip_filter, write_signs)
            else:
                # Other dtypes always take the generic fallback below.
                return_code = -1

            # No Cuda kernel found? Fall back to generic implementation.
            # Still more memory efficient than the reference implementation
            # because only the bit-packed sign tensor is retained for gradient
            # computation.
            if return_code < 0:
                warnings.warn(
                    'filtered_lrelu called with parameters that have no '
                    'optimized CUDA kernel, using generic fallback',
                    RuntimeWarning)

                y = input.add(bias.unsqueeze(-1).unsqueeze(-1))  # Add bias.
                y = upfirdn2d(
                    input=y,
                    filter=filter_up,
                    up=up,
                    padding=[px0, px1, py0, py1],
                    gain=float(up**2),
                    flip_filter=flip_filter)  # Upsample.
                # Activation function and sign handling. Modifies y in-place.
                so = ext_module.filtered_lrelu_act_(y, si.to(y.device), sx, sy,
                                                    gain, slope, clamp,
                                                    write_signs)
                y = upfirdn2d(
                    input=y,
                    filter=filter_down,
                    down=down,
                    flip_filter=flip_filter)  # Downsample.

            # Prepare for gradient computation.
            # Keep the caller-provided sign tensor when there is one,
            # otherwise the sign tensor returned by the extension.
            ctx.save_for_backward(filter_up, filter_down,
                                  (si if si.numel() else so))
            ctx.x_shape = input.shape
            ctx.y_shape = y.shape
            ctx.s_ofs = sx, sy
            return y

        @staticmethod
        def backward(ctx, dy):  # pylint: disable=arguments-differ
            filter_up, filter_down, si = ctx.saved_tensors
            _, _, xh, xw = ctx.x_shape
            _, _, yh, yw = ctx.y_shape
            sx, sy = ctx.s_ofs
            # `needs_input_grad` is indexed in forward's argument order:
            # 0=input, 1=filter_up, 2=filter_down, 3=bias, 4=si, 5=sx, 6=sy.
            # Only the input and bias gradients are implemented; every other
            # argument must not require a gradient.
            dx = None  # 0
            dfu = None
            assert not ctx.needs_input_grad[1]
            dfd = None
            assert not ctx.needs_input_grad[2]
            db = None  # 3
            dsi = None
            assert not ctx.needs_input_grad[4]
            dsx = None
            assert not ctx.needs_input_grad[5]
            dsy = None
            assert not ctx.needs_input_grad[6]

            if ctx.needs_input_grad[0] or ctx.needs_input_grad[3]:
                # The input gradient is obtained by running the same op on
                # `dy` with up/down swapped, the two filters swapped, the
                # filter orientation inverted, no clamp, and the saved sign
                # tensor supplied as `si`.
                pp = [
                    (filter_up.shape[-1] - 1) + (filter_down.shape[-1] - 1) -
                    px0,
                    xw * up - yw * down + px0 - (up - 1),
                    (filter_up.shape[0] - 1) + (filter_down.shape[0] - 1) -
                    py0,
                    xh * up - yh * down + py0 - (up - 1),
                ]
                gg = gain * (up**2) / (down**2)
                ff = (not flip_filter)
                sx = sx - (filter_up.shape[-1] - 1) + px0
                sy = sy - (filter_up.shape[0] - 1) + py0
                dx = _filtered_lrelu_cuda(
                    up=down,
                    down=up,
                    padding=pp,
                    gain=gg,
                    slope=slope,
                    clamp=None,
                    flip_filter=ff).apply(dy, filter_down, filter_up, None, si,
                                          sx, sy)

            if ctx.needs_input_grad[3]:
                # Bias gradient: input gradient summed over every dimension
                # except the channel dimension.
                db = dx.sum([0, 2, 3])

            return dx, dfu, dfd, db, dsi, dsx, dsy

    # Add to cache.
    _filtered_lrelu_cuda_cache[key] = FilteredLReluCuda
    return FilteredLReluCuda


# Cache of autograd Function classes built by `_filtered_lrelu_musa`, keyed
# by the tuple of hyper-parameters the class was specialized for.
_filtered_lrelu_musa_cache: Dict = dict()


def _filtered_lrelu_musa(up: int = 1,
                         down: int = 1,
                         padding: int = 0,
                         gain: float = np.sqrt(2),
                         slope: float = 0.2,
                         clamp: Optional[Union[float, int]] = None,
                         flip_filter: bool = False):
    """Fast MUSA implementation of `filtered_lrelu()` using custom ops.

    Builds a `torch.autograd.Function` subclass specialized for the given
    hyper-parameters. Classes are memoized in `_filtered_lrelu_musa_cache`,
    so repeated calls with the same parameters return the same class.

    Args:
        up (int): Integer upsampling factor. Defaults to 1.
        down (int): Integer downsampling factor. Defaults to 1.
        padding (int | list | tuple): Padding with respect to the upsampled
            image. Can be a single number or a list/tuple `[x, y]` or
            `[x_before, x_after, y_before, y_after]`. Defaults to 0.
        gain (float): Overall scaling factor for signal magnitude.
            Defaults to np.sqrt(2).
        slope (float): Slope on the negative side of leaky ReLU.
            Defaults to 0.2.
        clamp (float or int): Maximum magnitude for leaky ReLU
            output. Defaults to None.
        flip_filter (bool): False = convolution, True = correlation.
            Defaults to False.

    Returns:
        The `FilteredLReluMusa` autograd Function class bound to these
        parameters. Invoke it as
        `cls.apply(input, filter_up, filter_down, bias, si, sx, sy)`.
    """
    assert isinstance(up, int) and up >= 1
    assert isinstance(down, int) and down >= 1
    px0, px1, py0, py1 = _parse_padding(padding)
    # The scalar parameters are coerced to plain floats; they are part of
    # the cache key below and are passed on to the extension.
    assert gain == float(gain) and gain > 0
    gain = float(gain)
    assert slope == float(slope) and slope >= 0
    slope = float(slope)
    assert clamp is None or (clamp == float(clamp) and clamp >= 0)
    # A missing clamp is represented as +inf.
    clamp = float(clamp if clamp is not None else 'inf')

    # Lookup from cache.
    key = (up, down, px0, px1, py0, py1, gain, slope, clamp, flip_filter)
    if key in _filtered_lrelu_musa_cache:
        return _filtered_lrelu_musa_cache[key]

    # Forward op.
    class FilteredLReluMusa(torch.autograd.Function):

        @staticmethod
        def forward(ctx, input, filter_up, filter_down, bias, si, sx, sy):
            # pylint: disable=arguments-differ
            # `si` is an optional sign tensor and `sx`/`sy` are forwarded to
            # the extension unchanged (presumably offsets into the sign
            # tensor -- TODO confirm against the extension source).
            assert isinstance(input, torch.Tensor) and input.ndim == 4

            # Replace empty up/downsample kernels with full 1x1 kernels
            # (faster than separable).
            if filter_up is None:
                filter_up = torch.ones([1, 1],
                                       dtype=torch.float32,
                                       device=input.device)
            if filter_down is None:
                filter_down = torch.ones([1, 1],
                                         dtype=torch.float32,
                                         device=input.device)
            assert 1 <= filter_up.ndim <= 2
            assert 1 <= filter_down.ndim <= 2

            # Replace separable 1x1 kernels with full 1x1 kernels when scale
            # factor is 1.
            if up == 1 and filter_up.ndim == 1 and filter_up.shape[0] == 1:
                filter_up = filter_up.square()[None]
            if down == 1 and filter_down.ndim == 1 and filter_down.shape[
                    0] == 1:
                filter_down = filter_down.square()[None]

            # Missing sign input tensor.
            # An empty tensor stands in for "no sign input"; it is moved to
            # the compute device right before each extension call.
            if si is None:
                si = torch.empty([0])

            # Missing bias tensor.
            if bias is None:
                bias = torch.zeros([input.shape[1]],
                                   dtype=input.dtype,
                                   device=input.device)

            # Construct internal sign tensor only if gradients are needed.
            write_signs = (si.numel() == 0) and (input.requires_grad
                                                 or bias.requires_grad)

            # Warn if input storage strides are not in decreasing order due to
            # e.g. channels-last layout.
            # Dimensions of size 1 are ignored in this check.
            strides = [
                input.stride(i) for i in range(input.ndim) if input.size(i) > 1
            ]
            if any(a < b for a, b in zip(strides[:-1], strides[1:])):
                warnings.warn(
                    'low-performance memory layout detected in filtered_lrelu '
                    'input', RuntimeWarning)

            # Call C++/MUSA plugin if datatype is supported.
            if input.dtype in [torch.float16, torch.float32]:
                if torch.musa.current_stream(
                        input.device) != torch.musa.default_stream(
                            input.device):
                    warnings.warn(
                        'filtered_lrelu called with non-default musa stream '
                        'but concurrent execution is not supported',
                        RuntimeWarning)
                y, so, return_code = ext_module.filtered_lrelu(
                    input, filter_up, filter_down, bias, si.to(input.device),
                    up, down, px0, px1, py0, py1, sx, sy, gain, slope, clamp,
                    flip_filter, write_signs)
            else:
                # Other dtypes always take the generic fallback below.
                return_code = -1

            # No Musa kernel found? Fall back to generic implementation.
            # Still more memory efficient than the reference implementation
            # because only the bit-packed sign tensor is retained for gradient
            # computation.
            if return_code < 0:
                warnings.warn(
                    'filtered_lrelu called with parameters that have no '
                    'optimized MUSA kernel, using generic fallback',
                    RuntimeWarning)

                y = input.add(bias.unsqueeze(-1).unsqueeze(-1))  # Add bias.
                y = upfirdn2d(
                    input=y,
                    filter=filter_up,
                    up=up,
                    padding=[px0, px1, py0, py1],
                    gain=float(up**2),
                    flip_filter=flip_filter)  # Upsample.
                # Activation function and sign handling. Modifies y in-place.
                so = ext_module.filtered_lrelu_act_(y, si.to(y.device), sx, sy,
                                                    gain, slope, clamp,
                                                    write_signs)
                y = upfirdn2d(
                    input=y,
                    filter=filter_down,
                    down=down,
                    flip_filter=flip_filter)  # Downsample.

            # Prepare for gradient computation.
            # Keep the caller-provided sign tensor when there is one,
            # otherwise the sign tensor returned by the extension.
            ctx.save_for_backward(filter_up, filter_down,
                                  (si if si.numel() else so))
            ctx.x_shape = input.shape
            ctx.y_shape = y.shape
            ctx.s_ofs = sx, sy
            return y

        @staticmethod
        def backward(ctx, dy):  # pylint: disable=arguments-differ
            filter_up, filter_down, si = ctx.saved_tensors
            _, _, xh, xw = ctx.x_shape
            _, _, yh, yw = ctx.y_shape
            sx, sy = ctx.s_ofs
            # `needs_input_grad` is indexed in forward's argument order:
            # 0=input, 1=filter_up, 2=filter_down, 3=bias, 4=si, 5=sx, 6=sy.
            # Only the input and bias gradients are implemented; every other
            # argument must not require a gradient.
            dx = None  # 0
            dfu = None
            assert not ctx.needs_input_grad[1]
            dfd = None
            assert not ctx.needs_input_grad[2]
            db = None  # 3
            dsi = None
            assert not ctx.needs_input_grad[4]
            dsx = None
            assert not ctx.needs_input_grad[5]
            dsy = None
            assert not ctx.needs_input_grad[6]

            if ctx.needs_input_grad[0] or ctx.needs_input_grad[3]:
                # The input gradient is obtained by running the same op on
                # `dy` with up/down swapped, the two filters swapped, the
                # filter orientation inverted, no clamp, and the saved sign
                # tensor supplied as `si`.
                pp = [
                    (filter_up.shape[-1] - 1) + (filter_down.shape[-1] - 1) -
                    px0,
                    xw * up - yw * down + px0 - (up - 1),
                    (filter_up.shape[0] - 1) + (filter_down.shape[0] - 1) -
                    py0,
                    xh * up - yh * down + py0 - (up - 1),
                ]
                gg = gain * (up**2) / (down**2)
                ff = (not flip_filter)
                sx = sx - (filter_up.shape[-1] - 1) + px0
                sy = sy - (filter_up.shape[0] - 1) + py0
                dx = _filtered_lrelu_musa(
                    up=down,
                    down=up,
                    padding=pp,
                    gain=gg,
                    slope=slope,
                    clamp=None,
                    flip_filter=ff).apply(dy, filter_down, filter_up, None, si,
                                          sx, sy)

            if ctx.needs_input_grad[3]:
                # Bias gradient: input gradient summed over every dimension
                # except the channel dimension.
                db = dx.sum([0, 2, 3])

            return dx, dfu, dfd, db, dsi, dsx, dsy

    # Add to cache.
    _filtered_lrelu_musa_cache[key] = FilteredLReluMusa
    return FilteredLReluMusa


================================================
FILE: mmcv/ops/focal_loss.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional, Union

import torch
import torch.nn as nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable

from ..utils import ext_loader

# Extension module providing the forward/backward ops that the focal loss
# autograd Functions in this file call.
ext_module = ext_loader.load_ext('_ext', [
    'sigmoid_focal_loss_forward', 'sigmoid_focal_loss_backward',
    'softmax_focal_loss_forward', 'softmax_focal_loss_backward'
])


class SigmoidFocalLossFunction(Function):
    """Autograd Function wrapping the sigmoid focal loss extension ops."""

    @staticmethod
    def forward(ctx,
                input: torch.Tensor,
                target: Union[torch.LongTensor, torch.cuda.LongTensor],
                gamma: float = 2.0,
                alpha: float = 0.25,
                weight: Optional[torch.Tensor] = None,
                reduction: str = 'mean') -> torch.Tensor:
        """Compute the sigmoid focal loss.

        Args:
            input (torch.Tensor): 2D tensor of predictions, one row per
                sample.
            target (torch.Tensor): 1D ``torch.long`` tensor with one entry
                per row of ``input``.
            gamma (float): Focal loss ``gamma``. Defaults to 2.0.
            alpha (float): Focal loss ``alpha``. Defaults to 0.25.
            weight (torch.Tensor, optional): 1D tensor with one entry per
                column of ``input``. Defaults to None.
            reduction (str): One of ``'none'``, ``'mean'`` or ``'sum'``.
                Defaults to ``'mean'``.

        Returns:
            torch.Tensor: Element-wise loss with the shape of ``input`` for
            ``'none'``, otherwise a scalar tensor.
        """
        # Shape and dtype contract of the extension op.
        assert target.dtype == torch.long
        assert input.dim() == 2
        assert target.dim() == 1
        num_samples = input.size(0)
        assert num_samples == target.size(0)
        if weight is not None:
            assert weight.dim() == 1
            assert input.size(1) == weight.size(0)
        else:
            # An empty tensor is handed to the op when no weight is given.
            weight = input.new_empty(0)

        ctx.reduction_dict = {'none': 0, 'mean': 1, 'sum': 2}
        assert reduction in ctx.reduction_dict

        ctx.gamma = float(gamma)
        ctx.alpha = float(alpha)
        ctx.reduction = ctx.reduction_dict[reduction]

        # The op receives a zero-initialized buffer as its output argument.
        loss = input.new_zeros(input.shape)
        ext_module.sigmoid_focal_loss_forward(
            input, target, weight, loss, gamma=ctx.gamma, alpha=ctx.alpha)

        mode = ctx.reduction
        if mode == ctx.reduction_dict['mean']:
            # 'mean' divides by the number of samples, not of elements.
            loss = loss.sum() / num_samples
        elif mode == ctx.reduction_dict['sum']:
            loss = loss.sum()

        ctx.save_for_backward(input, target, weight)
        return loss

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output: torch.Tensor) -> tuple:
        """Return the gradient w.r.t. ``input``; other inputs get ``None``."""
        saved_input, saved_target, saved_weight = ctx.saved_tensors

        grad = saved_input.new_zeros(saved_input.shape)
        ext_module.sigmoid_focal_loss_backward(
            saved_input,
            saved_target,
            saved_weight,
            grad,
            gamma=ctx.gamma,
            alpha=ctx.alpha)

        # Chain rule with the incoming gradient, then undo the 'mean' scale.
        grad *= grad_output
        if ctx.reduction == ctx.reduction_dict['mean']:
            grad /= saved_input.size(0)
        return (grad, ) + (None, ) * 5


# Functional entry point: calls `SigmoidFocalLossFunction` through autograd.
sigmoid_focal_loss = SigmoidFocalLossFunction.apply


class SigmoidFocalLoss(nn.Module):
    """Module wrapper around `sigmoid_focal_loss`.

    Args:
        gamma (float): Focal loss ``gamma``.
        alpha (float): Focal loss ``alpha``.
        weight (torch.Tensor, optional): Weight tensor, registered as the
            buffer ``weight``. Defaults to None.
        reduction (str): Reduction mode forwarded to the loss function.
            Defaults to ``'mean'``.
    """

    def __init__(self,
                 gamma: float,
                 alpha: float,
                 weight: Optional[torch.Tensor] = None,
                 reduction: str = 'mean'):
        super().__init__()
        # Registered as a buffer rather than stored as a plain attribute.
        self.register_buffer('weight', weight)
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

    def forward(
        self,
        input: torch.Tensor,
        target: Union[torch.LongTensor, torch.cuda.LongTensor],
    ) -> torch.Tensor:
        """Evaluate the loss for ``input`` and ``target`` using the stored
        hyper-parameters."""
        settings = (self.gamma, self.alpha, self.weight, self.reduction)
        return sigmoid_focal_loss(input, target, *settings)

    def __repr__(self):
        cls_name = type(self).__name__
        return (f'{cls_name}(gamma={self.gamma}, '
                f'alpha={self.alpha}, '
                f'reduction={self.reduction})')


class SoftmaxFocalLossFunction(Function):
    """Autograd Function wrapping the softmax focal loss extension ops."""

    @staticmethod
    def forward(ctx,
                input: torch.Tensor,
                target: Union[torch.LongTensor, torch.cuda.LongTensor],
                gamma: float = 2.0,
                alpha: float = 0.25,
                weight: Optional[torch.Tensor] = None,
                reduction: str = 'mean') -> torch.Tensor:
        """Compute the softmax focal loss.

        Args:
            input (torch.Tensor): 2D tensor of logits, one row per sample.
            target (torch.Tensor): 1D ``torch.long`` tensor with one entry
                per row of ``input``.
            gamma (float): Focal loss ``gamma``. Defaults to 2.0.
            alpha (float): Focal loss ``alpha``. Defaults to 0.25.
            weight (torch.Tensor, optional): 1D tensor with one entry per
                column of ``input``. Defaults to None.
            reduction (str): One of ``'none'``, ``'mean'`` or ``'sum'``.
                Defaults to ``'mean'``.

        Returns:
            torch.Tensor: Per-sample loss of length ``input.size(0)`` for
            ``'none'``, otherwise a scalar tensor.
        """
        assert target.dtype == torch.long
        assert input.dim() == 2
        assert target.dim() == 1
        assert input.size(0) == target.size(0)
        if weight is None:
            # An empty tensor is handed to the op when no weight is given.
            weight = input.new_empty(0)
        else:
            assert weight.dim() == 1
            assert input.size(1) == weight.size(0)
        ctx.reduction_dict = {'none': 0, 'mean': 1, 'sum': 2}
        assert reduction in ctx.reduction_dict.keys()

        ctx.gamma = float(gamma)
        ctx.alpha = float(alpha)
        ctx.reduction = ctx.reduction_dict[reduction]

        # Softmax over dim 1; the row maximum is subtracted before
        # exponentiating to avoid overflow.
        channel_stats, _ = torch.max(input, dim=1)
        input_softmax = input - channel_stats.unsqueeze(1).expand_as(input)
        input_softmax.exp_()

        channel_stats = input_softmax.sum(dim=1)
        input_softmax /= channel_stats.unsqueeze(1).expand_as(input)

        # The op receives a zero-initialized 1D buffer as its output.
        output = input.new_zeros(input.size(0))
        ext_module.softmax_focal_loss_forward(
            input_softmax,
            target,
            weight,
            output,
            gamma=ctx.gamma,
            alpha=ctx.alpha)

        if ctx.reduction == ctx.reduction_dict['mean']:
            output = output.sum() / input.size(0)
        elif ctx.reduction == ctx.reduction_dict['sum']:
            output = output.sum()
        # The probabilities (not the raw logits) are what backward consumes.
        ctx.save_for_backward(input_softmax, target, weight)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output: torch.Tensor) -> tuple:
        """Return the gradient w.r.t. ``input``; other inputs get ``None``.

        Decorated with ``once_differentiable`` (like the sigmoid variant):
        the gradient is filled in-place by the extension op, which autograd
        cannot trace, so a double backward must raise instead of silently
        yielding wrong values.
        """
        input_softmax, target, weight = ctx.saved_tensors
        # Scratch buffer required by the extension op, one entry per sample.
        buff = input_softmax.new_zeros(input_softmax.size(0))
        grad_input = input_softmax.new_zeros(input_softmax.size())

        ext_module.softmax_focal_loss_backward(
            input_softmax,
            target,
            weight,
            buff,
            grad_input,
            gamma=ctx.gamma,
            alpha=ctx.alpha)

        # Chain rule with the incoming gradient, then undo the 'mean' scale.
        grad_input *= grad_output
        if ctx.reduction == ctx.reduction_dict['mean']:
            grad_input /= input_softmax.size(0)
        return grad_input, None, None, None, None, None


# Functional entry point: calls `SoftmaxFocalLossFunction` through autograd.
softmax_focal_loss = SoftmaxFocalLossFunction.apply


class SoftmaxFocalLoss(nn.Module):
    """Module wrapper around `softmax_focal_loss`.

    Args:
        gamma (float): Focal loss ``gamma``.
        alpha (float): Focal loss ``alpha``.
        weight (torch.Tensor, optional): Weight tensor, registered as the
            buffer ``weight``. Defaults to None.
        reduction (str): Reduction mode forwarded to the loss function.
            Defaults to ``'mean'``.
    """

    def __init__(self,
                 gamma: float,
                 alpha: float,
                 weight: Optional[torch.Tensor] = None,
                 reduction: str = 'mean'):
        super().__init__()
        # Registered as a buffer rather than stored as a plain attribute.
        self.register_buffer('weight', weight)
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

    def forward(
        self,
        input: torch.Tensor,
        target: Union[torch.LongTensor, torch.cuda.LongTensor],
    ) -> torch.Tensor:
        """Evaluate the loss for ``input`` and ``target`` using the stored
        hyper-parameters."""
        settings = (self.gamma, self.alpha, self.weight, self.reduction)
        return softmax_focal_loss(input, target, *settings)

    def __repr__(self):
        cls_name = type(self).__name__
        return (f'{cls_name}(gamma={self.gamma}, '
                f'alpha={self.alpha}, '
                f'reduction={self.reduction})')


================================================
FILE: mmcv/ops/furthest_point_sample.py
================================================
import torch
from mmengine.device import is_cuda_available, is_musa_available
from torch.autograd import Function

from ..utils import ext_loader

# Extension module providing the two furthest-point-sampling forward ops
# called by the autograd Functions in this file.
ext_module = ext_loader.load_ext('_ext', [
    'furthest_point_sampling_forward',
    'furthest_point_sampling_with_dist_forward'
])


class FurthestPointSampling(Function):
    """Uses iterative furthest point sampling to select a set of features whose
    corresponding points have the furthest distance."""

    @staticmethod
    def forward(ctx, points_xyz: torch.Tensor,
                num_points: int) -> torch.Tensor:
        """
        Args:
            points_xyz (torch.Tensor): (B, N, 3) where N > num_points.
                Must be contiguous.
            num_points (int): Number of points in the sampled set.

        Returns:
            torch.Tensor: (B, num_points) int32 indices of the sampled
            points. The result is marked non-differentiable.
        """
        assert points_xyz.is_contiguous()

        B, N = points_xyz.size()[:2]
        if points_xyz.device.type == 'npu':
            # NPU path kept as-is: build on the host, then move to the NPU.
            output = torch.IntTensor(B, num_points).npu()
            temp = torch.FloatTensor(B, N).fill_(1e10).npu()
        else:
            # Allocate the work buffers on the same device as the input
            # rather than on whichever backend/device happens to be current.
            # This keeps multi-device setups consistent and never leaves
            # `output`/`temp` undefined.
            output = points_xyz.new_zeros([B, num_points], dtype=torch.int32)
            temp = points_xyz.new_zeros([B, N],
                                        dtype=torch.float32).fill_(1e10)

        # `temp` starts at a very large value (1e10) for every point.
        ext_module.furthest_point_sampling_forward(
            points_xyz,
            temp,
            output,
            b=B,
            n=N,
            m=num_points,
        )
        if torch.__version__ != 'parrots':
            ctx.mark_non_differentiable(output)
        return output

    @staticmethod
    def backward(xyz, a=None):
        # Neither input receives a gradient (the output holds indices).
        return None, None


class FurthestPointSamplingWithDist(Function):
    """Iterative furthest point sampling that works on a pairwise distance
    matrix instead of point coordinates."""

    @staticmethod
    def forward(ctx, points_dist: torch.Tensor,
                num_points: int) -> torch.Tensor:
        """
        Args:
            points_dist (torch.Tensor): (B, N, N) Distance between each point
                pair. Must be contiguous.
            num_points (int): Number of points in the sampled set.

        Returns:
            torch.Tensor: (B, num_points) indices of the sampled points.
        """
        assert points_dist.is_contiguous()

        batch_size, num_candidates, _ = points_dist.size()
        # Scratch buffer handed to the op, pre-filled with a large value.
        scratch = points_dist.new_zeros([batch_size, num_candidates])
        scratch = scratch.fill_(1e10)
        # int32 index buffer that the op writes its result into.
        sampled_idx = points_dist.new_zeros([batch_size, num_points],
                                            dtype=torch.int32)

        ext_module.furthest_point_sampling_with_dist_forward(
            points_dist,
            scratch,
            sampled_idx,
            b=batch_size,
            n=num_candidates,
            m=num_points)

        is_parrots = torch.__version__ == 'parrots'
        if not is_parrots:
            ctx.mark_non_differentiable(sampled_idx)
        return sampled_idx

    @staticmethod
    def backward(xyz, a=None):
        # No gradient for either forward input.
        return None, None


# Functional entry points: calling these runs the corresponding autograd
# Function through ``Function.apply``.
furthest_point_sample = FurthestPointSampling.apply
furthest_point_sample_with_dist = FurthestPointSamplingWithDist.apply


================================================
FILE: mmcv/ops/fused_bias_leakyrelu.py
================================================
# modified from https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_act.py # noqa:E501

# Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
# NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator
# Augmentation (ADA)
# =======================================================================

# 1. Definitions

# "Licensor" means any person or entity that distributes its Work.

# "Software" means the original work of authorship made available under
# this License.

# "Work" means the Software and any additions to or derivative works of
# the Software that are made available under this License.

# The terms "reproduce," "reproduction," "derivative works," and
# "distribution" have the meaning as provided under U.S. copyright law;
# provided, however, that for the purposes of this License, derivative
# works shall not include works that remain separable from, or merely
# link (or bind by name) to the interfaces of, the Work.

# Works, including the Software, are "made available" under this License
# by including in or with the Work either (a) a copyright notice
# referencing the applicability of this License to the Work, or (b) a
# copy of this License.

# 2. License Grants

#     2.1 Copyright Grant. Subject to the terms and conditions of this
#     License, each Licensor grants to you a perpetual, worldwide,
#     non-exclusive, royalty-free, copyright license to reproduce,
#     prepare derivative works of, publicly display, publicly perform,
#     sublicense and distribute its Work and any resulting derivative
#     works in any form.

# 3. Limitations

#     3.1 Redistribution. You may reproduce or distribute the Work only
#     if (a) you do so under this License, (b) you include a complete
#     copy of this License with your distribution, and (c) you retain
#     without modification any copyright, patent, trademark, or
#     attribution notices that are present in the Work.

#     3.2 Derivative Works. You may specify that additional or different
#     terms apply to the use, reproduction, and distribution of your
#     derivative works of the Work ("Your Terms") only if (a) Your Terms
#     provide that the use limitation in Section 3.3 applies to your
#     derivative works, and (b) you identify the specific derivative
#     works that are subject to Your Terms. Notwithstanding Your Terms,
#     this License (including the redistribution requirements in Section
#     3.1) will continue to apply to the Work itself.

#     3.3 Use Limitation. The Work and any derivative works thereof only
#     may be used or intended for use non-commercially. Notwithstanding
#     the foregoing, NVIDIA and its affiliates may use the Work and any
#     derivative works commercially. As used herein, "non-commercially"
#     means for research or evaluation purposes only.

#     3.4 Patent Claims. If you bring or threaten to bring a patent claim
#     against any Licensor (including any claim, cross-claim or
#     counterclaim in a lawsuit) to enforce any patents that you allege
#     are infringed by any Work, then your rights under this License from
#     such Licensor (including the grant in Section 2.1) will terminate
#     immediately.

#     3.5 Trademarks. This License does not grant any rights to use any
#     Licensor’s or its affiliates’ names, logos, or trademarks, except
#     as necessary to reproduce the notices described in this License.

#     3.6 Termination. If you violate any term of this License, then your
#     rights under this License (including the grant in Section 2.1) will
#     terminate immediately.

# 4. Disclaimer of Warranty.

# THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
# NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
# THIS LICENSE.

# 5. Limitation of Liability.

# EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
# THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
# SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
# INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
# OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
# (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
# LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
# COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
# THE POSSIBILITY OF SUCH DAMAGES.

# =======================================================================

import torch
import torch.nn.functional as F
from torch import nn
from torch.autograd import Function

from ..utils import ext_loader

# Handle to the ``_ext`` extension, requesting the ``fused_bias_leakyrelu`` op
# that the autograd Functions below call.
ext_module = ext_loader.load_ext('_ext', ['fused_bias_leakyrelu'])


class FusedBiasLeakyReLUFunctionBackward(Function):
    """Calculate second order derivative.

    This Function wraps the backward pass of the fused leaky relu operation
    so that the backward pass itself can be differentiated.
    """

    @staticmethod
    def forward(ctx, grad_output: torch.Tensor, out: torch.Tensor,
                negative_slope: float, scale: float) -> tuple:
        """Compute the gradients w.r.t. the input and the bias.

        Args:
            grad_output (torch.Tensor): Gradient flowing into the activation
                output.
            out (torch.Tensor): Output saved by the forward activation.
            negative_slope (float): Passed to the op as ``alpha``.
            scale (float): Passed to the op as ``scale``.

        Returns:
            tuple: ``(grad_input, grad_bias)``.
        """
        ctx.save_for_backward(out)
        ctx.negative_slope = negative_slope
        ctx.scale = scale

        # Zero-element tensor passed in the op's second argument slot.
        placeholder = grad_output.new_empty(0)
        grad_input = ext_module.fused_bias_leakyrelu(
            grad_output,
            placeholder,
            out,
            act=3,
            grad=1,
            alpha=negative_slope,
            scale=scale)

        # Sum over every axis except axis 1; for tensors with at most two
        # dimensions the trailing range is empty and only axis 0 is reduced.
        reduce_dims = [0] + list(range(2, grad_input.ndim))
        grad_bias = grad_input.sum(reduce_dims).detach()

        return grad_input, grad_bias

    @staticmethod
    def backward(ctx, gradgrad_input: torch.Tensor,
                 gradgrad_bias: nn.Parameter) -> tuple:
        """Propagate second order gradients back to ``grad_output``.

        Returns:
            tuple: Gradient for ``grad_output`` followed by ``None`` for the
            three remaining forward inputs.
        """
        saved_out, = ctx.saved_tensors

        # The second order derivative, in fact, contains two parts, while
        # the first part is zero. Thus, we directly consider the second part
        # which is similar to the first order derivative in implementation.
        bias_term = gradgrad_bias.to(saved_out.dtype)
        gradgrad_out = ext_module.fused_bias_leakyrelu(
            gradgrad_input,
            bias_term,
            saved_out,
            act=3,
            grad=1,
            alpha=ctx.negative_slope,
            scale=ctx.scale)

        return gradgrad_out, None, None, None


class FusedBiasLeakyReLUFunction(Function):
    """Autograd Function wrapping the fused bias + leaky ReLU op."""

    @staticmethod
    def forward(ctx, input: torch.Tensor, bias: nn.Parameter,
                negative_slope: float, scale: float) -> torch.Tensor:
        """Run the fused op and keep its output for the backward pass.

        Args:
            input (torch.Tensor): Input feature map.
            bias (nn.Parameter): Bias handed to the op.
            negative_slope (float): Passed to the op as ``alpha``.
            scale (float): Passed to the op as ``scale``.

        Returns:
            torch.Tensor: Result returned by the op.
        """
        # Zero-element tensor passed in the op's third argument slot.
        placeholder = input.new_empty(0)
        activated = ext_module.fused_bias_leakyrelu(
            input,
            bias,
            placeholder,
            act=3,
            grad=0,
            alpha=negative_slope,
            scale=scale)

        # The backward pass is computed from the output.
        ctx.save_for_backward(activated)
        ctx.negative_slope = negative_slope
        ctx.scale = scale

        return activated

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor) -> tuple:
        """Return gradients for ``input`` and ``bias``; the two scalar
        arguments get ``None``."""
        saved_out, = ctx.saved_tensors

        # Delegating to another Function keeps this step differentiable.
        grads = FusedBiasLeakyReLUFunctionBackward.apply(
            grad_output, saved_out, ctx.negative_slope, ctx.scale)
        grad_input, grad_bias = grads

        return grad_input, grad_bias, None, None


class FusedBiasLeakyReLU(nn.Module):
    r"""Fused bias leaky ReLU.

    This function is introduced in the StyleGAN2:
    `Analyzing and Improving the Image Quality of StyleGAN
    <http://arxiv.org/abs/1912.04958>`_

    The bias term comes from the convolution operation. In addition, to keep
    the variance of the feature map or gradients unchanged, they also adopt a
    scale similar to Kaiming initialization. However, since the
    :math:`1+{alpha}^2` is too small, we can just ignore it. Therefore, the
    final scale is just :math:`\sqrt{2}`. Of course, you may change it with
    your own scale.

    TODO: Implement the CPU version.

    Args:
        num_channels (int): The channel number of the feature map.
        negative_slope (float, optional): Same as nn.LeakyReLU.
            Defaults to 0.2.
        scale (float, optional): A scalar to adjust the variance of the feature
            map. Defaults to 2**0.5.
    """

    def __init__(self,
                 num_channels: int,
                 negative_slope: float = 0.2,
                 scale: float = 2**0.5):
        super().__init__()

        self.negative_slope = negative_slope
        self.scale = scale
        # Learnable per-channel bias, initialised to zero.
        zero_bias = torch.zeros(num_channels)
        self.bias = nn.Parameter(zero_bias)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Apply the fused bias and leaky ReLU to ``input``."""
        return fused_bias_leakyrelu(
            input,
            self.bias,
            negative_slope=self.negative_slope,
            scale=self.scale)


def fused_bias_leakyrelu(input: torch.Tensor,
                         bias: nn.Parameter,
                         negative_slope: float = 0.2,
                         scale: float = 2**0.5) -> torch.Tensor:
    r"""Fused bias leaky ReLU function.

    This function is introduced in the StyleGAN2:
    `Analyzing and Improving the Image Quality of StyleGAN
    <http://arxiv.org/abs/1912.04958>`_

    The bias term comes from the convolution operation. In addition, to keep
    the variance of the feature map or gradients unchanged, they also adopt a
    scale similar to Kaiming initialization. However, since the
    :math:`1+{alpha}^2` is too small, we can just ignore it. Therefore, the
    final scale is just :math:`\sqrt{2}`. Of course, you may change it with
    your own scale.

    Inputs on a ``cuda``, ``musa`` or ``npu`` device go through the fused
    op; every other device uses the pure PyTorch reference implementation.

    Args:
        input (torch.Tensor): Input feature map.
        bias (nn.Parameter): The bias from convolution operation.
        negative_slope (float, optional): Same as nn.LeakyReLU.
            Defaults to 0.2.
        scale (float, optional): A scalar to adjust the variance of the feature
            map. Defaults to 2**0.5.

    Returns:
        torch.Tensor: Feature map after non-linear activation.
    """
    # Dispatch on ``device.type`` rather than on ``is_cuda`` / ``is_musa``.
    # ``Tensor.is_musa`` comes from the MUSA backend, not stock PyTorch, so
    # probing it for a CPU tensor raised AttributeError before the reference
    # path could run.
    if input.device.type not in ('cuda', 'musa', 'npu'):
        return bias_leakyrelu_ref(input, bias, negative_slope, scale)

    return FusedBiasLeakyReLUFunction.apply(input, bias.to(input.dtype),
                                            negative_slope, scale)


def bias_leakyrelu_ref(x: torch.Tensor,
                       bias: nn.Parameter,
                       negative_slope: float = 0.2,
                       scale: float = 2**0.5) -> torch.Tensor:
    """Reference implementation of bias + leaky ReLU + scale in pure PyTorch.

    Args:
        x (torch.Tensor): Input feature map; axis 1 is the channel axis.
        bias (nn.Parameter): 1-D bias with one entry per channel of ``x``.
            ``None`` skips the bias addition.
        negative_slope (float, optional): Negative slope of the leaky ReLU.
            Defaults to 0.2.
        scale (float, optional): Factor multiplied onto the activation.
            Defaults to 2**0.5.

    Returns:
        torch.Tensor: ``leaky_relu(x + bias) * scale``.
    """

    if bias is not None:
        assert bias.ndim == 1
        assert bias.shape[0] == x.shape[1]
        # Reshape to (1, C, 1, ...) so the bias broadcasts along axis 1.
        x = x + bias.reshape([-1 if i == 1 else 1 for i in range(x.ndim)])

    x = F.leaky_relu(x, negative_slope)
    # Skip the multiplication when it would be a no-op.
    if scale != 1:
        x = x * scale

    return x


================================================
FILE: mmcv/ops/gather_points.py
================================================
from typing import Tuple

import torch
from torch.autograd import Function

from ..utils import ext_loader

# Handle to the ``_ext`` extension, requesting the forward and backward
# gather ops by name.
ext_module = ext_loader.load_ext(
    '_ext', ['gather_points_forward', 'gather_points_backward'])


class GatherPoints(Function):
    """Gather points with given index."""

    @staticmethod
    def forward(ctx, features: torch.Tensor,
                indices: torch.Tensor) -> torch.Tensor:
        """
        Args:
            features (torch.Tensor): (B, C, N) features to gather. Must be
                contiguous.
            indices (torch.Tensor): (B, M) where M is the number of points.
                Must be contiguous.

        Returns:
            torch.Tensor: (B, C, M) where M is the number of points.
        """
        assert features.is_contiguous()
        assert indices.is_contiguous()

        batch_size, num_gathered = indices.size()
        _, channels, num_source = features.size()
        gathered = features.new_zeros((batch_size, channels, num_gathered))

        ext_module.gather_points_forward(
            features,
            indices,
            gathered,
            b=batch_size,
            c=channels,
            n=num_source,
            npoints=num_gathered)

        # Everything backward needs to rebuild a (B, C, N) gradient.
        ctx.for_backwards = (indices, channels, num_source)
        is_parrots = torch.__version__ == 'parrots'
        if not is_parrots:
            ctx.mark_non_differentiable(indices)
        return gathered

    @staticmethod
    def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, None]:
        """
        Args:
            grad_out (torch.Tensor): Gradient of the gathered output.

        Returns:
            Tuple[torch.Tensor, None]: (B, C, N) gradient of the features and
            ``None`` for the indices.
        """
        indices, channels, num_source = ctx.for_backwards
        batch_size, num_gathered = indices.size()

        grad_features = grad_out.new_zeros(
            (batch_size, channels, num_source))
        # The op receives a contiguous copy of the incoming gradient data.
        contiguous_grad = grad_out.data.contiguous()
        ext_module.gather_points_backward(
            contiguous_grad,
            indices,
            grad_features.data,
            b=batch_size,
            c=channels,
            n=num_source,
            npoints=num_gathered)
        return grad_features, None


# Functional entry point: calling it runs ``GatherPoints`` through
# ``Function.apply``.
gather_points = GatherPoints.apply


================================================
FILE: mmcv/ops/group_points.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional, Tuple, Union

import torch
from torch import nn as nn
from torch.autograd import Function

from ..utils import ext_loader
from .ball_query import ball_query
from .knn import knn

# Handle to the ``_ext`` extension, requesting the forward/backward grouping
# ops for both batched and stacked inputs by name.
ext_module = ext_loader.load_ext('_ext', [
    'group_points_forward', 'group_points_backward',
    'stack_group_points_forward', 'stack_group_points_backward'
])


class QueryAndGroup(nn.Module):
    """Groups points with a ball query of radius.

    Args:
        max_radius (float): The maximum radius of the balls.
            If None is given, we will use kNN sampling instead of ball query.
        sample_num (int): Maximum number of features to gather in the ball.
        min_radius (float, optional): The minimum radius of the balls.
            Default: 0.
        use_xyz (bool, optional): Whether to use xyz.
            Default: True.
        return_grouped_xyz (bool, optional): Whether to return grouped xyz.
            Default: False.
        normalize_xyz (bool, optional): Whether to normalize xyz.
            Not allowed when ``max_radius`` is None. Default: False.
        uniform_sample (bool, optional): Whether to sample uniformly.
            Default: False
        return_unique_cnt (bool, optional): Whether to return the count of
            unique samples. Requires ``uniform_sample``. Default: False.
        return_grouped_idx (bool, optional): Whether to return grouped idx.
            Default: False.
    """

    def __init__(self,
                 max_radius: float,
                 sample_num: int,
                 min_radius: float = 0.,
                 use_xyz: bool = True,
                 return_grouped_xyz: bool = False,
                 normalize_xyz: bool = False,
                 uniform_sample: bool = False,
                 return_unique_cnt: bool = False,
                 return_grouped_idx: bool = False):
        super().__init__()
        # Query configuration.
        self.max_radius = max_radius
        self.min_radius = min_radius
        self.sample_num = sample_num
        # Feature composition and sampling options.
        self.use_xyz = use_xyz
        self.normalize_xyz = normalize_xyz
        self.uniform_sample = uniform_sample
        # Optional extra outputs.
        self.return_grouped_xyz = return_grouped_xyz
        self.return_unique_cnt = return_unique_cnt
        self.return_grouped_idx = return_grouped_idx

        # The unique counts are only computed while sampling uniformly.
        if self.return_unique_cnt:
            assert self.uniform_sample, (
                'uniform_sample should be True when '
                'returning the count of unique samples')
        # Normalisation divides by max_radius, which must then be a number.
        if self.max_radius is None:
            assert not self.normalize_xyz, (
                'can not normalize grouped xyz when max_radius is None')

    def forward(
        self,
        points_xyz: torch.Tensor,
        center_xyz: torch.Tensor,
        features: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, Tuple]:
        """
        Args:
            points_xyz (torch.Tensor): (B, N, 3) xyz coordinates of the
                points.
            center_xyz (torch.Tensor): (B, npoint, 3) coordinates of the
                centroids.
            features (torch.Tensor): (B, C, N) The features of grouped
                points.

        Returns:
            Tuple | torch.Tensor: (B, 3 + C, npoint, sample_num) Grouped
            concatenated coordinates and features of points. When any of the
            ``return_*`` options is enabled, a tuple is returned whose extra
            entries follow in the order grouped xyz, unique count, grouped
            idx.
        """
        # Ball query when a radius is configured, kNN otherwise.
        # idx is of shape [B, npoint, sample_num]
        if self.max_radius is not None:
            idx = ball_query(self.min_radius, self.max_radius, self.sample_num,
                             points_xyz, center_xyz)
        else:
            idx = knn(self.sample_num, points_xyz, center_xyz, False)
            idx = idx.transpose(1, 2).contiguous()

        if self.uniform_sample:
            num_batches, num_regions = idx.shape[0], idx.shape[1]
            unique_cnt = torch.zeros((num_batches, num_regions))
            for i_batch in range(num_batches):
                for i_region in range(num_regions):
                    unique_ind = torch.unique(idx[i_batch, i_region, :])
                    num_unique = unique_ind.shape[0]
                    unique_cnt[i_batch, i_region] = num_unique
                    # Fill the remaining slots by drawing at random from the
                    # unique indices.
                    num_missing = self.sample_num - num_unique
                    sample_ind = torch.randint(
                        0, num_unique, (num_missing, ), dtype=torch.long)
                    resampled = unique_ind[sample_ind]
                    idx[i_batch, i_region, :] = torch.cat(
                        (unique_ind, resampled))

        xyz_trans = points_xyz.transpose(1, 2).contiguous()
        # (B, 3, npoint, sample_num)
        grouped_xyz = grouping_operation(xyz_trans, idx)
        # Offsets of every grouped point relative to its centre.
        centers = center_xyz.transpose(1, 2).unsqueeze(-1)
        grouped_xyz_diff = grouped_xyz - centers
        if self.normalize_xyz:
            grouped_xyz_diff /= self.max_radius

        if features is None:
            assert (self.use_xyz
                    ), 'Cannot have not features and not use xyz as a feature!'
            new_features = grouped_xyz_diff
        else:
            grouped_features = grouping_operation(features, idx)
            if self.use_xyz:
                # (B, C + 3, npoint, sample_num)
                new_features = torch.cat([grouped_xyz_diff, grouped_features],
                                         dim=1)
            else:
                new_features = grouped_features

        outputs = [new_features]
        if self.return_grouped_xyz:
            outputs.append(grouped_xyz)
        if self.return_unique_cnt:
            outputs.append(unique_cnt)
        if self.return_grouped_idx:
            outputs.append(idx)

        # A single result is returned bare rather than as a 1-tuple.
        return outputs[0] if len(outputs) == 1 else tuple(outputs)


class GroupAll(nn.Module):
    """Group xyz with feature.

    Args:
        use_xyz (bool): Whether to use xyz.
    """

    def __init__(self, use_xyz: bool = True):
        super().__init__()
        self.use_xyz = use_xyz

    def forward(self,
                xyz: torch.Tensor,
                new_xyz: torch.Tensor,
                features: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Args:
            xyz (Tensor): (B, N, 3) xyz coordinates of the features.
            new_xyz (Tensor): new xyz coordinates of the features. Not used
                by this method.
            features (Tensor): (B, C, N) features to group.

        Returns:
            Tensor: (B, C + 3, 1, N) Grouped feature. Only the xyz part is
            returned when ``features`` is None, and only the feature part
            when ``use_xyz`` is False.
        """
        # (B, 3, 1, N)
        grouped_xyz = xyz.transpose(1, 2).unsqueeze(2)
        if features is None:
            return grouped_xyz

        # (B, C, 1, N)
        grouped_features = features.unsqueeze(2)
        if not self.use_xyz:
            return grouped_features

        # (B, 3 + C, 1, N)
        return torch.cat([grouped_xyz, grouped_features], dim=1)


class GroupingOperation(Function):
    """Group feature with given index."""

    @staticmethod
    def forward(
            ctx,
            features: torch.Tensor,
            indices: torch.Tensor,
            features_batch_cnt: Optional[torch.Tensor] = None,
            indices_batch_cnt: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Args:
            features (Tensor): Tensor of features to group, input shape is
                (B, C, N) or stacked inputs (N1 + N2 ..., C).
            indices (Tensor):  The indices of features to group with, input
                shape is (B, npoint, nsample) or stacked inputs
                (M1 + M2 ..., nsample).
            features_batch_cnt (Tensor, optional): Input features nums in
                each batch, just like (N1, N2, ...). Defaults to None.
                New in version 1.7.0.
            indices_batch_cnt (Tensor, optional): Input indices nums in
                each batch, just like (M1, M2, ...). Defaults to None.
                New in version 1.7.0.

        Returns:
            Tensor: Grouped features, the shape is (B, C, npoint, nsample)
            or (M1 + M2 ..., C, nsample).
        """
        features = features.contiguous()
        indices = indices.contiguous()

        # The stacked layout is used only when both counts are supplied.
        stacked = (features_batch_cnt is not None
                   and indices_batch_cnt is not None)
        if stacked:
            assert features_batch_cnt.dtype == torch.int
            assert indices_batch_cnt.dtype == torch.int
            num_indices, nsample = indices.size()
            num_features, channels = features.size()
            batch_size = indices_batch_cnt.shape[0]
            grouped = features.new_zeros((num_indices, channels, nsample))
            ext_module.stack_group_points_forward(
                features,
                features_batch_cnt,
                indices,
                indices_batch_cnt,
                grouped,
                b=batch_size,
                m=num_indices,
                c=channels,
                nsample=nsample)
            # A 5-tuple marks the stacked layout for backward.
            ctx.for_backwards = (batch_size, num_features, indices,
                                 features_batch_cnt, indices_batch_cnt)
        else:
            batch_size, npoint, nsample = indices.size()
            _, channels, num_features = features.size()
            grouped = features.new_zeros(batch_size, channels, npoint,
                                         nsample)

            ext_module.group_points_forward(
                features,
                indices,
                grouped,
                b=batch_size,
                c=channels,
                n=num_features,
                npoints=npoint,
                nsample=nsample)

            # A 2-tuple marks the batched layout for backward.
            ctx.for_backwards = (indices, num_features)
        return grouped

    @staticmethod
    def backward(ctx, grad_out: torch.Tensor) -> Tuple:
        """
        Args:
            grad_out (Tensor): (B, C, npoint, nsample) tensor of the gradients
                of the output from forward, or (M1 + M2 ..., C, nsample) for
                stacked inputs.

        Returns:
            Tuple: The gradient of the features, (B, C, N) or
            (N1 + N2 ..., C) for stacked inputs, followed by ``None`` for
            the remaining inputs.
        """
        saved = ctx.for_backwards
        # The tuple length tells which layout forward used.
        if len(saved) == 5:
            (batch_size, num_features, idx, features_batch_cnt,
             idx_batch_cnt) = saved

            num_indices, channels, nsample = grad_out.size()
            grad_features = grad_out.new_zeros(num_features, channels)

            contiguous_grad = grad_out.data.contiguous()
            ext_module.stack_group_points_backward(
                contiguous_grad,
                idx,
                idx_batch_cnt,
                features_batch_cnt,
                grad_features.data,
                b=batch_size,
                c=channels,
                m=num_indices,
                n=num_features,
                nsample=nsample)
            return grad_features, None, None, None

        idx, num_features = saved

        batch_size, channels, npoint, nsample = grad_out.size()
        grad_features = grad_out.new_zeros(batch_size, channels, num_features)

        contiguous_grad = grad_out.data.contiguous()
        ext_module.group_points_backward(
            contiguous_grad,
            idx,
            grad_features.data,
            b=batch_size,
            c=channels,
            n=num_features,
            npoints=npoint,
            nsample=nsample)
        # NOTE(review): only two gradients are returned here, which matches
        # the two-argument call form; confirm the behaviour when the batch
        # counts are passed explicitly as None.
        return grad_features, None


# Functional entry point: calling it runs ``GroupingOperation`` through
# ``Function.apply``.
grouping_operation = GroupingOperation.apply


================================================
FILE: mmcv/ops/info.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch

# Two interchangeable implementations of the same pair of helpers are defined
# here; which one is used depends on whether the imported ``torch`` reports
# the version string 'parrots'.
if torch.__version__ == 'parrots':
    import parrots

    def get_compiler_version():
        """Return 'GCC ' followed by ``parrots.version.compiler``."""
        return 'GCC ' + parrots.version.compiler

    def get_compiling_cuda_version():
        """Return ``parrots.version.cuda``."""
        return parrots.version.cuda
else:
    # Imported lazily so that the parrots branch never touches the extension.
    from ..utils import ext_loader
    ext_module = ext_loader.load_ext(
        '_ext', ['get_compiler_version', 'get_compiling_cuda_version'])

    def get_compiler_version():
        """Return whatever the extension's ``get_compiler_version`` op
        reports."""
        return ext_module.get_compiler_version()

    def get_compiling_cuda_version():
        """Return whatever the extension's ``get_compiling_cuda_version`` op
        reports."""
        return ext_module.get_compiling_cuda_version()


================================================
FILE: mmcv/ops/iou3d.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from typing import Optional

import torch
from torch import Tensor

from ..utils import ext_loader

# Handle to the ``_ext`` extension, requesting the BEV overlap op and the two
# 3D NMS ops by name.
ext_module = ext_loader.load_ext('_ext', [
    'iou3d_boxes_overlap_bev_forward', 'iou3d_nms3d_forward',
    'iou3d_nms3d_normal_forward'
])


def boxes_overlap_bev(boxes_a: Tensor, boxes_b: Tensor) -> Tensor:
    """Calculate boxes BEV overlap.

    Args:
        boxes_a (torch.Tensor): Input boxes a with shape (M, 7).
        boxes_b (torch.Tensor): Input boxes b with shape (N, 7).

    Returns:
        torch.Tensor: BEV overlap result with shape (M, N).
    """
    num_a, num_b = boxes_a.shape[0], boxes_b.shape[0]
    # Result buffer with the dtype and device of ``boxes_a``; the op writes
    # into it in place.
    ans_overlap = boxes_a.new_zeros((num_a, num_b))

    contiguous_a = boxes_a.contiguous()
    contiguous_b = boxes_b.contiguous()
    ext_module.iou3d_boxes_overlap_bev_forward(contiguous_a, contiguous_b,
                                               ans_overlap)

    return ans_overlap


def boxes_iou3d(boxes_a: Tensor, boxes_b: Tensor) -> Tensor:
    """Calculate boxes 3D IoU.

    The BEV overlap comes from the extension op; the overlap along the
    height axis is computed here from columns 2 (centre) and 5 (extent).

    Args:
        boxes_a (torch.Tensor): Input boxes a with shape (M, 7).
        boxes_b (torch.Tensor): Input boxes b with shape (N, 7).

    Returns:
        torch.Tensor: 3D IoU result with shape (M, N).
    """
    assert boxes_a.shape[1] == boxes_b.shape[1] == 7, \
        'Input boxes shape should be (N, 7)'

    # Vertical extent of each box, laid out as a column for ``boxes_a`` and
    # as a row for ``boxes_b`` so that they broadcast to (M, N).
    top_a = (boxes_a[:, 2] + boxes_a[:, 5] / 2).view(-1, 1)
    bottom_a = (boxes_a[:, 2] - boxes_a[:, 5] / 2).view(-1, 1)
    top_b = (boxes_b[:, 2] + boxes_b[:, 5] / 2).view(1, -1)
    bottom_b = (boxes_b[:, 2] - boxes_b[:, 5] / 2).view(1, -1)

    num_a, num_b = boxes_a.shape[0], boxes_b.shape[0]
    overlaps_bev = boxes_a.new_zeros((num_a, num_b))
    ext_module.iou3d_boxes_overlap_bev_forward(boxes_a.contiguous(),
                                               boxes_b.contiguous(),
                                               overlaps_bev)

    # Height of the shared vertical interval; zero when disjoint.
    highest_bottom = torch.max(bottom_a, bottom_b)
    lowest_top = torch.min(top_a, top_b)
    overlaps_h = (lowest_top - highest_bottom).clamp(min=0)
    overlaps_3d = overlaps_bev * overlaps_h

    vol_a = (boxes_a[:, 3] * boxes_a[:, 4] * boxes_a[:, 5]).view(-1, 1)
    vol_b = (boxes_b[:, 3] * boxes_b[:, 4] * boxes_b[:, 5]).view(1, -1)
    # The clamp keeps the denominator away from zero.
    union = torch.clamp(vol_a + vol_b - overlaps_3d, min=1e-6)
    return overlaps_3d / union


def nms3d(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor:
    """3D NMS function GPU implementation (for BEV boxes).

    Args:
        boxes (torch.Tensor): Input boxes with the shape of (N, 7)
            ([x, y, z, dx, dy, dz, heading]).
        scores (torch.Tensor): Scores of boxes with the shape of (N).
        iou_threshold (float): Overlap threshold of NMS.

    Returns:
        torch.Tensor: Indexes after NMS.
    """
    assert boxes.size(1) == 7, 'Input boxes shape should be (N, 7)'
    # The op receives the boxes sorted by descending score.
    _, order = scores.sort(0, descending=True)
    sorted_boxes = boxes[order].contiguous()

    # Output buffers written by the op: kept positions and how many of them
    # are valid.
    kept = sorted_boxes.new_zeros(sorted_boxes.size(0), dtype=torch.long)
    num_out = sorted_boxes.new_zeros(size=(), dtype=torch.long)
    ext_module.iou3d_nms3d_forward(
        sorted_boxes, kept, num_out, nms_overlap_thresh=iou_threshold)

    # Map positions in the sorted order back to indices of the input boxes.
    valid = kept[:num_out].to(sorted_boxes.device)
    return order[valid].contiguous()


def nms3d_normal(boxes: Tensor, scores: Tensor,
                 iou_threshold: float) -> Tensor:
    """Normal 3D NMS function GPU implementation. The overlap of two boxes for
    IoU calculation is defined as the exact overlapping area of the two boxes
    WITH their yaw angle set to 0.

    Args:
        boxes (torch.Tensor): Input boxes with shape (N, 7).
            ([x, y, z, dx, dy, dz, heading]).
        scores (torch.Tensor): Scores of predicted boxes with shape (N).
        iou_threshold (float): Overlap threshold of NMS.

    Returns:
        torch.Tensor: Remaining indices with scores in descending order.
    """
    assert boxes.shape[1] == 7, 'Input boxes shape should be (N, 7)'
    # The op receives the boxes sorted by descending score.
    _, order = scores.sort(0, descending=True)
    sorted_boxes = boxes[order].contiguous()

    # Output buffers written by the op: kept positions and how many of them
    # are valid.
    kept = sorted_boxes.new_zeros(sorted_boxes.size(0), dtype=torch.long)
    num_out = sorted_boxes.new_zeros(size=(), dtype=torch.long)
    ext_module.iou3d_nms3d_normal_forward(
        sorted_boxes, kept, num_out, nms_overlap_thresh=iou_threshold)

    # Map positions in the sorted order back to indices of the input boxes.
    valid = kept[:num_out].to(sorted_boxes.device)
    return order[valid].contiguous()


def _xyxyr2xywhr(boxes: Tensor) -> Tensor:
    """Convert [x1, y1, x2, y2, heading] box to [x, y, dx, dy, heading] box.

    Emits a ``DeprecationWarning`` on every call.

    Args:
        boxes (torch.Tensor): Input boxes with shape (N, 5).

    Returns:
        torch.Tensor: Converted boxes with shape (N, 5).
    """
    warnings.warn(
        'This function is deprecated and will be removed in the future.',
        DeprecationWarning)

    x1, y1 = boxes[:, 0], boxes[:, 1]
    x2, y2 = boxes[:, 2], boxes[:, 3]
    heading = boxes[:, 4]

    # Centre is the corner midpoint, size is the corner difference.
    center_x = (x1 + x2) / 2
    center_y = (y1 + y2) / 2
    size_x = x2 - x1
    size_y = y2 - y1
    return torch.stack((center_x, center_y, size_x, size_y, heading), dim=-1)


def boxes_iou_bev(boxes_a: Tensor, boxes_b: Tensor) -> Tensor:
    """Calculate boxes IoU in the Bird's Eye View.

    Deprecated: emits a ``DeprecationWarning`` and forwards to
    ``box_iou_rotated`` after converting both inputs from corner form.

    Args:
        boxes_a (torch.Tensor): Input boxes a with shape (M, 5)
            ([x1, y1, x2, y2, ry]).
        boxes_b (torch.Tensor): Input boxes b with shape (N, 5)
            ([x1, y1, x2, y2, ry]).

    Returns:
        torch.Tensor: IoU result with shape (M, N).
    """
    from .box_iou_rotated import box_iou_rotated

    warnings.warn(
        '`iou3d.boxes_iou_bev` is deprecated and will be removed in'
        ' the future. Please, use `box_iou_rotated.box_iou_rotated`.',
        DeprecationWarning)

    converted_a = _xyxyr2xywhr(boxes_a)
    converted_b = _xyxyr2xywhr(boxes_b)
    return box_iou_rotated(converted_a, converted_b)


def nms_bev(boxes: Tensor,
            scores: Tensor,
            thresh: float,
            pre_max_size: Optional[int] = None,
            post_max_size: Optional[int] = None) -> Tensor:
    """Run rotated NMS on BEV boxes (deprecated wrapper).

    The overlap of two boxes for IoU calculation is defined as the exact
    overlapping area of the two boxes. The candidate list can be truncated
    before NMS with ``pre_max_size`` and the result can be truncated after
    NMS with ``post_max_size``. A ``DeprecationWarning`` is emitted on every
    call; the suppression itself is done by ``nms.nms_rotated``.

    Args:
        boxes (torch.Tensor): Input boxes with the shape of (N, 5)
            ([x1, y1, x2, y2, ry]).
        scores (torch.Tensor): Scores of boxes with the shape of (N,).
        thresh (float): Overlap threshold of NMS.
        pre_max_size (int, optional): Max size of boxes before NMS.
            Default: None.
        post_max_size (int, optional): Max size of boxes after NMS.
            Default: None.

    Returns:
        torch.Tensor: Indexes after NMS.
    """
    from .nms import nms_rotated

    warnings.warn(
        '`iou3d.nms_bev` is deprecated and will be removed in'
        ' the future. Please, use `nms.nms_rotated`.', DeprecationWarning)
    assert boxes.size(1) == 5, 'Input boxes shape should be (N, 5)'

    # Rank the candidates from the highest to the lowest score and
    # optionally keep only the best ``pre_max_size`` of them.
    _, order = scores.sort(0, descending=True)
    if pre_max_size is not None:
        order = order[:pre_max_size]

    ranked_boxes = _xyxyr2xywhr(boxes)[order]
    ranked_scores = scores[order]
    kept_in_ranked = nms_rotated(ranked_boxes, ranked_scores, thresh)[1]

    # The NMS indices refer to the ranked subset; translate them back to
    # positions in the caller's original ``boxes``.
    keep = order[kept_in_ranked]
    if post_max_size is None:
        return keep
    return keep[:post_max_size]


def nms_normal_bev(boxes: Tensor, scores: Tensor, thresh: float) -> Tensor:
    """Run axis-aligned NMS on BEV boxes (deprecated wrapper).

    The overlap of two boxes for IoU calculation is defined as the exact
    overlapping area of the two boxes WITH their yaw angle set to 0. A
    ``DeprecationWarning`` is emitted on every call; the suppression itself
    is done by ``nms.nms``.

    Args:
        boxes (torch.Tensor): Input boxes with shape (N, 5)
            ([x1, y1, x2, y2, ry]).
        scores (torch.Tensor): Scores of predicted boxes with shape (N,).
        thresh (float): Overlap threshold of NMS.

    Returns:
        torch.Tensor: Remaining indices with scores in descending order.
    """
    from .nms import nms

    warnings.warn(
        '`iou3d.nms_normal_bev` is deprecated and will be removed in'
        ' the future. Please, use `nms.nms`.', DeprecationWarning)
    assert boxes.shape[1] == 5, 'Input boxes shape should be (N, 5)'

    # The trailing ``ry`` column is dropped so only [x1, y1, x2, y2] is
    # passed on.
    axis_aligned = boxes[:, :-1]
    return nms(axis_aligned, scores, thresh)[1]


================================================
FILE: mmcv/ops/knn.py
================================================
from typing import Optional

import torch
from mmengine.device import is_cuda_available, is_musa_available
from torch.autograd import Function

from ..utils import ext_loader

# Handle to the ``_ext`` extension module; ``knn_forward`` is the op that
# ``KNN.forward`` calls through it.
ext_module = ext_loader.load_ext('_ext', ['knn_forward'])


class KNN(Function):
    r"""KNN (CUDA/MUSA) based on heap data structure.

    Modified from `PAConv <https://github.com/CVMI-Lab/PAConv/tree/main/
    scene_seg/lib/pointops/src/knnquery_heap>`_.

    Find k-nearest points.
    """

    @staticmethod
    def forward(ctx,
                k: int,
                xyz: torch.Tensor,
                center_xyz: Optional[torch.Tensor] = None,
                transposed: bool = False) -> torch.Tensor:
        """
        Args:
            k (int): number of nearest neighbors.
            xyz (torch.Tensor): (B, N, 3) if transposed == False, else
                (B, 3, N). xyz coordinates of the features.
            center_xyz (torch.Tensor, optional): (B, npoint, 3) if transposed
                is False, else (B, 3, npoint). centers of the knn query.
                Default: None.
            transposed (bool, optional): whether the input tensors are
                transposed. Should not explicitly use this keyword when
                calling knn (=KNN.apply), just add the fourth param.
                Default: False.

        Returns:
            torch.Tensor: (B, k, npoint) tensor with the indices of the
            features that form k-nearest neighbours.
        """
        assert 0 < k < 100, 'k should be in range(0, 100)'

        # Without explicit centers every point queries its own neighbours.
        if center_xyz is None:
            center_xyz = xyz

        if transposed:
            xyz = xyz.transpose(2, 1).contiguous()
            center_xyz = center_xyz.transpose(2, 1).contiguous()

        assert xyz.is_contiguous()  # [B, N, 3]
        assert center_xyz.is_contiguous()  # [B, npoint, 3]

        center_xyz_device = center_xyz.get_device()
        assert center_xyz_device == xyz.get_device(), \
            'center_xyz and xyz should be put on the same device'
        # Make the tensors' device the current one before calling the op.
        if xyz.device.type != 'npu' and is_cuda_available():
            if torch.cuda.current_device() != center_xyz_device:
                torch.cuda.set_device(center_xyz_device)
        if xyz.device.type != 'npu' and is_musa_available():
            if torch.musa.current_device() != center_xyz_device:
                torch.musa.set_device(center_xyz_device)

        B, npoint, _ = center_xyz.shape
        N = xyz.shape[1]

        if xyz.device.type == 'npu':
            dist2 = center_xyz.new_zeros((B, npoint, k)).float()
            idx = center_xyz.new_zeros((B, npoint, k)).int()
            ext_module.knn_forward(
                xyz, center_xyz, idx, dist2, b=B, n=N, m=npoint, nsample=k)
            zeros_idx = torch.zeros(
                xyz.shape[0], center_xyz.shape[1], k, dtype=torch.int32).npu()
            # NOTE(review): the tensor returned by ``where`` is discarded, so
            # this statement leaves ``idx`` untouched. Confirm the intended
            # masking before changing it.
            idx.where(dist2 >= 1e10, zeros_idx)
            idx = idx.transpose(2, 1).contiguous()  # [B, k, npoint]
            return idx.int()

        # Output buffers that are handed to the extension op.
        idx = center_xyz.new_zeros((B, npoint, k)).int()
        dist2 = center_xyz.new_zeros((B, npoint, k)).float()

        ext_module.knn_forward(
            xyz, center_xyz, idx, dist2, b=B, n=N, m=npoint, nsample=k)
        # idx shape to [B, k, npoint]
        idx = idx.transpose(2, 1).contiguous()
        if torch.__version__ != 'parrots':
            ctx.mark_non_differentiable(idx)
        return idx

    @staticmethod
    def backward(ctx, a=None):
        """Return no gradients; the only output is an index tensor.

        One ``None`` is returned per ``forward`` input (``k``, ``xyz``,
        ``center_xyz``, ``transposed``). Autograd tolerates surplus trailing
        ``None`` entries, so this also covers calls that omit the optional
        arguments.
        """
        return None, None, None, None


# Functional entry point: ``knn(k, xyz[, center_xyz[, transposed]])`` runs
# ``KNN.forward`` through ``Function.apply``.
knn = KNN.apply


================================================
FILE: mmcv/ops/masked_conv.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import math
from typing import Optional, Tuple, Union

import torch
import torch.nn as nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair

from ..utils import ext_loader

# Handle to the ``_ext`` extension module; the two listed ops are the ones
# ``MaskedConv2dFunction.forward`` calls through it.
ext_module = ext_loader.load_ext(
    '_ext', ['masked_im2col_forward', 'masked_col2im_forward'])


class MaskedConv2dFunction(Function):
    """Autograd function computing a 2D convolution only at masked positions.

    Only the forward pass is implemented; ``backward`` returns no gradients.
    """

    @staticmethod
    def symbolic(g, features, mask, weight, bias, padding, stride=1):
        """Emit the ``mmcv::MMCVMaskedConv2d`` node for ONNX export."""
        return g.op(
            'mmcv::MMCVMaskedConv2d',
            features,
            mask,
            weight,
            bias,
            padding_i=padding,
            stride_i=stride)

    @staticmethod
    def forward(ctx,
                features: torch.Tensor,
                mask: torch.Tensor,
                weight: torch.nn.Parameter,
                bias: Optional[torch.nn.Parameter],
                padding: int = 0,
                stride: int = 1) -> torch.Tensor:
        """Convolve ``features`` at the positions where ``mask > 0``.

        Args:
            features (torch.Tensor): Input of shape (1, C, H, W).
            mask (torch.Tensor): Mask of shape (1, H, W); positions with a
                value greater than 0 are selected.
            weight (torch.nn.Parameter): Kernel of shape
                (out_channel, in_channel, kernel_h, kernel_w).
            bias (torch.nn.Parameter, optional): Bias of shape
                (out_channel,). ``None`` means no bias is added on the
                non-NPU path.
            padding (int or tuple): Padding, expanded with ``_pair``.
            stride (int or tuple): Stride, expanded with ``_pair``. Only 1 is
                supported.

        Returns:
            torch.Tensor: Output of shape (1, out_channel, out_h, out_w),
            created as zeros and filled from the masked columns.

        Raises:
            ValueError: If the stride is not 1, or (NPU only) if the mask
                does not match the spatial size of the convolution output.
        """
        assert mask.dim() == 3 and mask.size(0) == 1
        assert features.dim() == 4 and features.size(0) == 1
        assert features.size()[2:] == mask.size()[1:]
        pad_h, pad_w = _pair(padding)
        stride_h, stride_w = _pair(stride)
        if stride_h != 1 or stride_w != 1:
            raise ValueError(
                'Stride could not only be 1 in masked_conv2d currently.')
        out_channel, in_channel, kernel_h, kernel_w = weight.size()

        if features.device.type == 'npu':
            # On NPU a dense convolution is computed and then zeroed
            # outside the mask.
            import torch_npu
            output = torch_npu.npu_conv2d(
                features,
                weight,
                bias,
                stride=(stride_h, stride_w),
                padding=(pad_h, pad_w),
                dilation=(1, 1),
                groups=1)
            if mask.size()[1:] != output.size()[2:]:
                raise ValueError(
                    'The mask is inconsistent with the shape of output_conv.')
            mask = mask > 0
            mask = mask.type(output.dtype)
            output = output * mask
            return output

        batch_size = features.size(0)
        out_h = int(
            math.floor(
                torch.true_divide((features.size(2) + 2 * pad_h -
                                   (kernel_h - 1) - 1), stride_h) + 1))
        out_w = int(
            math.floor(
                torch.true_divide((features.size(3) + 2 * pad_w -
                                   (kernel_w - 1) - 1), stride_w) + 1))
        # (row, col) coordinates of every selected mask position.
        mask_inds = torch.nonzero(mask[0] > 0, as_tuple=False)
        output = features.new_zeros(batch_size, out_channel, out_h, out_w)
        if mask_inds.numel() > 0:
            mask_h_idx = mask_inds[:, 0].contiguous()
            mask_w_idx = mask_inds[:, 1].contiguous()
            # One column per selected position, filled by the extension op.
            data_col = features.new_zeros(in_channel * kernel_h * kernel_w,
                                          mask_inds.size(0))
            ext_module.masked_im2col_forward(
                features,
                mask_h_idx,
                mask_w_idx,
                data_col,
                kernel_h=kernel_h,
                kernel_w=kernel_w,
                pad_h=pad_h,
                pad_w=pad_w)
            flat_weight = weight.view(out_channel, -1)
            if bias is None:
                masked_output = torch.mm(flat_weight, data_col)
            else:
                # bias + flat_weight @ data_col (beta = alpha = 1).
                masked_output = torch.addmm(bias[:, None], flat_weight,
                                            data_col)
            ext_module.masked_col2im_forward(
                masked_output,
                mask_h_idx,
                mask_w_idx,
                output,
                height=out_h,
                width=out_w,
                channels=out_channel)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output: torch.Tensor) -> tuple:
        """Return no gradients.

        One ``None`` is returned per ``forward`` input (``features``,
        ``mask``, ``weight``, ``bias``, ``padding``, ``stride``). Autograd
        tolerates surplus trailing ``None`` entries, so calls that omit
        ``stride`` are covered as well.
        """
        return (None, ) * 6


# Functional entry point: runs ``MaskedConv2dFunction.forward`` through
# ``Function.apply``.
masked_conv2d = MaskedConv2dFunction.apply


class MaskedConv2d(nn.Conv2d):
    """Conv2d layer whose forward pass can be restricted by a mask.

    Without a mask the layer behaves exactly like the official Conv2d. With
    a mask the computation is handed to ``masked_conv2d``, which does not
    implement the backward function and only supports the stride parameter
    to be 1 currently.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: Union[int, Tuple[int, ...]],
                 stride: int = 1,
                 padding: int = 0,
                 dilation: int = 1,
                 groups: int = 1,
                 bias: bool = True):
        super().__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias)

    def forward(self,
                input: torch.Tensor,
                mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Apply the convolution, optionally only at masked positions.

        Args:
            input (torch.Tensor): Input feature map.
            mask (torch.Tensor, optional): Mask forwarded to
                ``masked_conv2d``. Default: None.

        Returns:
            torch.Tensor: The convolution result.
        """
        if mask is not None:
            return masked_conv2d(input, mask, self.weight, self.bias,
                                 self.padding)
        # No mask given: fall back to the normal Conv2d.
        return super().forward(input)


================================================
FILE: mmcv/ops/merge_cells.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import math
from abc import abstractmethod
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F

from ..cnn import ConvModule


class BaseMergeCell(nn.Module):
    """Base class of the merge cells used in NAS-FPN and NAS-FCOS.

    A cell receives two feature maps. Each one optionally passes through its
    own input convolution, both are resized to a common target size, and the
    pair is fused by ``_binary_op``, which every concrete cell defines. When
    ``with_out_conv`` is True the fused map additionally passes through an
    output convolution.

    Args:
        fused_channels (int): number of input channels in out_conv layer.
        out_channels (int): number of output channels in out_conv layer.
        with_out_conv (bool): Whether to use out_conv layer
        out_conv_cfg (dict): Config dict for convolution layer, which should
            contain "groups", "kernel_size", "padding", "bias" to build
            out_conv layer.
        out_norm_cfg (dict): Config dict for normalization layer in out_conv.
        out_conv_order (tuple): The order of conv/norm/activation layers in
            out_conv.
        with_input1_conv (bool): Whether to use convolution on input1.
        with_input2_conv (bool): Whether to use convolution on input2.
        input_conv_cfg (dict): Config dict for building input1_conv layer and
            input2_conv layer, which is expected to contain the type of
            convolution.
            Default: None, which means using conv2d.
        input_norm_cfg (dict): Config dict for normalization layer in
            input1_conv and input2_conv layer. Default: None.
        upsample_mode (str): Interpolation method used to resize the output
            of input1_conv and input2_conv to target size. Currently, we
            support ['nearest', 'bilinear']. Default: 'nearest'.
    """

    def __init__(self,
                 fused_channels: Optional[int] = 256,
                 out_channels: Optional[int] = 256,
                 with_out_conv: bool = True,
                 out_conv_cfg: dict = dict(
                     groups=1, kernel_size=3, padding=1, bias=True),
                 out_norm_cfg: Optional[dict] = None,
                 out_conv_order: tuple = ('act', 'conv', 'norm'),
                 with_input1_conv: bool = False,
                 with_input2_conv: bool = False,
                 input_conv_cfg: Optional[dict] = None,
                 input_norm_cfg: Optional[dict] = None,
                 upsample_mode: str = 'nearest'):
        super().__init__()
        assert upsample_mode in ['nearest', 'bilinear']
        self.with_out_conv = with_out_conv
        self.with_input1_conv = with_input1_conv
        self.with_input2_conv = with_input2_conv
        self.upsample_mode = upsample_mode

        if self.with_out_conv:
            self.out_conv = ConvModule(
                fused_channels,  # type: ignore
                out_channels,  # type: ignore
                **out_conv_cfg,
                norm_cfg=out_norm_cfg,
                order=out_conv_order)

        # A disabled input convolution is an empty ``nn.Sequential``, which
        # returns its input unchanged.
        if with_input1_conv:
            self.input1_conv = self._build_input_conv(
                out_channels, input_conv_cfg, input_norm_cfg)
        else:
            self.input1_conv = nn.Sequential()

        if with_input2_conv:
            self.input2_conv = self._build_input_conv(
                out_channels, input_conv_cfg, input_norm_cfg)
        else:
            self.input2_conv = nn.Sequential()

    def _build_input_conv(self, channel, conv_cfg, norm_cfg):
        """Build a channel-preserving 3x3 ``ConvModule`` with padding 1."""
        return ConvModule(
            channel,
            channel,
            3,
            padding=1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            bias=True)

    @abstractmethod
    def _binary_op(self, x1, x2):
        """Fuse the two resized inputs; defined by each concrete cell."""
        pass

    def _resize(self, x, size):
        """Bring the last two dims of ``x`` to ``size``.

        ``x`` is returned as is when it already has the target size. When
        its spatial size compares smaller than ``size`` it is interpolated
        with ``self.upsample_mode``; otherwise it is zero-padded up to a
        multiple of the target size (if necessary) and max-pooled.
        """
        current = x.shape[-2:]
        if current == size:
            return x
        if current < size:
            return F.interpolate(x, size=size, mode=self.upsample_mode)

        # Shrinking path: pooling needs each dim to be an exact multiple of
        # the target, so pad symmetrically (extra pixel on the right/bottom).
        if x.shape[-2] % size[-2] != 0 or x.shape[-1] % size[-1] != 0:
            height, width = x.shape[-2:]
            target_h, target_w = size
            extra_h = math.ceil(height / target_h) * target_h - height
            extra_w = math.ceil(width / target_w) * target_w - width
            left = extra_w // 2
            right = extra_w - left
            top = extra_h // 2
            bottom = extra_h - top
            x = F.pad(x, (left, right, top, bottom), mode='constant',
                      value=0.0)
        window = (x.shape[-2] // size[-2], x.shape[-1] // size[-1])
        return F.max_pool2d(x, kernel_size=window, stride=window)

    def forward(self,
                x1: torch.Tensor,
                x2: torch.Tensor,
                out_size: Optional[tuple] = None) -> torch.Tensor:
        """Merge two feature maps into one.

        Args:
            x1 (torch.Tensor): First input; must share batch size and
                channel count with ``x2``.
            x2 (torch.Tensor): Second input.
            out_size (tuple, optional): Target (h, w). When None, the larger
                of the two input sizes (as compared by ``max``) is used.

        Returns:
            torch.Tensor: The fused (and optionally convolved) feature map.
        """
        assert x1.shape[:2] == x2.shape[:2]
        assert out_size is None or len(out_size) == 2
        if out_size is None:  # resize to larger one
            out_size = max(x1.size()[2:], x2.size()[2:])

        x1 = self._resize(self.input1_conv(x1), out_size)
        x2 = self._resize(self.input2_conv(x2), out_size)

        fused = self._binary_op(x1, x2)
        if not self.with_out_conv:
            return fused
        return self.out_conv(fused)


class SumCell(BaseMergeCell):
    """Merge cell that fuses its two inputs by element-wise addition."""

    def __init__(self, in_channels: int, out_channels: int, **kwargs):
        # Addition keeps the channel count, so ``in_channels`` is used
        # directly as the fused width.
        super().__init__(
            fused_channels=in_channels, out_channels=out_channels, **kwargs)

    def _binary_op(self, x1, x2):
        """Return ``x1 + x2``."""
        fused = x1 + x2
        return fused


class ConcatCell(BaseMergeCell):
    """Merge cell that fuses its two inputs by channel concatenation."""

    def __init__(self, in_channels: int, out_channels: int, **kwargs):
        # Concatenating along dim 1 doubles the channel count seen by the
        # output convolution.
        super().__init__(
            fused_channels=in_channels * 2,
            out_channels=out_channels,
            **kwargs)

    def _binary_op(self, x1, x2):
        """Return ``x1`` and ``x2`` concatenated along dim 1."""
        return torch.cat([x1, x2], dim=1)


class GlobalPoolingCell(BaseMergeCell):
    """Merge cell that gates ``x1`` with a globally pooled ``x2``.

    The second input is average-pooled to 1x1 and passed through a sigmoid;
    the result scales the first input before it is added to the second.
    """

    def __init__(self,
                 in_channels: Optional[int] = None,
                 out_channels: Optional[int] = None,
                 **kwargs):
        super().__init__(
            fused_channels=in_channels, out_channels=out_channels, **kwargs)
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))

    def _binary_op(self, x1, x2):
        """Return ``x2 + sigmoid(global_pool(x2)) * x1``."""
        pooled = self.global_pool(x2)
        gate = pooled.sigmoid()
        return x2 + gate * x1


================================================
FILE: mmcv/ops/min_area_polygons.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from ..utils import ext_loader

# Handle to the ``_ext`` extension module; ``min_area_polygons`` is the op
# that the function of the same name calls through it.
ext_module = ext_loader.load_ext('_ext', ['min_area_polygons'])


def min_area_polygons(pointsets: torch.Tensor) -> torch.Tensor:
    """Find the smallest polygon that surrounds all points of each point set.

    Args:
        pointsets (Tensor): point sets with shape (N, 18).

    Returns:
        torch.Tensor: Return the smallest polygons with shape (N, 8).
    """
    num_sets = pointsets.size(0)
    # Zero-initialised result buffer (same dtype/device as the input) that
    # is handed to the extension op.
    result = pointsets.new_zeros((num_sets, 8))
    ext_module.min_area_polygons(pointsets, result)
    return result


================================================
FILE: mmcv/ops/modulated_deform_conv.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import math
from typing import Optional, Tuple, Union

import torch
import torch.nn as nn
from mmengine.logging import print_log
from mmengine.registry import MODELS
from mmengine.utils import deprecated_api_warning
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair, _single

from mmcv.utils import IS_MLU_AVAILABLE
from ..utils import ext_loader

# Handle to the ``_ext`` extension module; the two listed ops are the ones
# ``ModulatedDeformConv2dFunction`` calls in its forward and backward pass.
ext_module = ext_loader.load_ext(
    '_ext',
    ['modulated_deform_conv_forward', 'modulated_deform_conv_backward'])


class ModulatedDeformConv2dFunction(Function):
    """Autograd function for the modulated deformable 2D convolution.

    Forward and backward are delegated to the extension ops
    ``modulated_deform_conv_forward`` / ``modulated_deform_conv_backward``
    or, when the input lives on an ``npu`` device, to ``torch_npu``.
    """

    @staticmethod
    def symbolic(g, input, offset, mask, weight, bias, stride, padding,
                 dilation, groups, deform_groups):
        """Emit the ``mmcv::MMCVModulatedDeformConv2d`` node for ONNX export.

        ``bias`` is appended to the node inputs only when it is not None.
        """
        input_tensors = [input, offset, mask, weight]
        if bias is not None:
            input_tensors.append(bias)
        return g.op(
            'mmcv::MMCVModulatedDeformConv2d',
            *input_tensors,
            stride_i=stride,
            padding_i=padding,
            dilation_i=dilation,
            groups_i=groups,
            deform_groups_i=deform_groups)

    @staticmethod
    def _calculate_sort_index(kernel_h, kernel_w, deformable_group):
        """Build the offset-channel permutation used on NPU and its inverse.

        The forward permutation lists all odd channel indices followed by
        all even ones; the backward permutation undoes it. Both are returned
        as int tensors moved to the NPU.
        """
        split_num = deformable_group * 2 * kernel_h * kernel_w
        sort_index = list(range(split_num))
        sort_index_fp = (sort_index[1::2] + sort_index[::2])
        # Inverse permutation: position of every original index inside
        # ``sort_index_fp``.
        sort_index_bp_dict = {i: idx for idx, i in enumerate(sort_index_fp)}
        sort_index_bp = [sort_index_bp_dict[i] for i in sort_index]
        sort_index_fp = torch.IntTensor(sort_index_fp)
        sort_index_bp = torch.IntTensor(sort_index_bp)
        sort_index_fp = sort_index_fp.npu()
        sort_index_bp = sort_index_bp.npu()
        return sort_index_fp, sort_index_bp

    @staticmethod
    def _npu_forward(ctx, input_tensor, offset, mask, weight, bias):
        """Forward pass on NPU via ``torch_npu.npu_deformable_conv2d``.

        The offset channels are permuted and concatenated with ``mask``
        before the call. Tensors for backward are saved only when at least
        one of the inputs requires grad.
        """
        _, _, kernel_h, kernel_w = weight.shape
        # An empty tensor is the "no bias" placeholder created in forward.
        conv2d_bias = bias if len(bias) > 0 else None
        sort_index_fp, sort_index_bp = \
            ModulatedDeformConv2dFunction._calculate_sort_index(
                kernel_h, kernel_w, ctx.deform_groups)
        select_offset = offset.index_select(1, sort_index_fp)
        offset_all = torch.cat([select_offset, mask], dim=1)
        import torch_npu
        output, offset_out = torch_npu.npu_deformable_conv2d(
            input_tensor,
            weight,
            offset_all,
            conv2d_bias,
            kernel_size=[kernel_h, kernel_w],
            stride=[1, 1, ctx.stride[0], ctx.stride[1]],
            padding=[
                ctx.padding[0], ctx.padding[0], ctx.padding[1], ctx.padding[1]
            ],
            dilation=[1, 1, ctx.dilation[0], ctx.dilation[1]],
            groups=ctx.groups,
            deformable_groups=ctx.deform_groups,
            modulated=True)
        if weight.requires_grad or mask.requires_grad or offset.requires_grad \
                or input_tensor.requires_grad:
            ctx.save_for_backward(input_tensor, weight, offset_out, offset_all,
                                  sort_index_bp)
        return output

    @staticmethod
    def _npu_backward(ctx, grad_output):
        """Backward pass on NPU via ``torch_npu.npu_deformable_conv2dbk``.

        The combined offset/mask gradient is split back into an offset
        gradient (restored to the original channel order) and a mask
        gradient. The returned tuple carries more trailing ``None`` entries
        than ``forward`` has non-tensor inputs.
        """
        input_tensor, weight, offset_out, offset_all, sort_index_bp = \
            ctx.saved_tensors
        import torch_npu
        grad_input, grad_weight, grad_offset_all, grad_bias = \
            torch_npu.npu_deformable_conv2dbk(
                input_tensor, grad_output, offset_out, weight, offset_all,
                kernel_size=[weight.shape[2], weight.shape[3]],
                stride=[1, 1, ctx.stride[0], ctx.stride[1]],
                padding=[ctx.padding[0], ctx.padding[0], ctx.padding[1],
                         ctx.padding[1]],
                dilation=[1, 1, ctx.dilation[0], ctx.dilation[1]],
                groups=ctx.groups, deformable_groups=ctx.deform_groups,
                modulated=True)
        grad_offset = grad_offset_all.index_select(1, sort_index_bp)
        # The channels after the offset part belong to the mask.
        grad_mask = grad_offset_all[:, grad_offset.shape[1]:, :, :]
        if not ctx.with_bias:
            grad_bias = None
        return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias,
                None, None, None, None, None, None, None, None)

    @staticmethod
    def forward(ctx,
                input: torch.Tensor,
                offset: torch.Tensor,
                mask: torch.Tensor,
                weight: nn.Parameter,
                bias: Optional[nn.Parameter] = None,
                stride: int = 1,
                padding: int = 0,
                dilation: int = 1,
                groups: int = 1,
                deform_groups: int = 1) -> torch.Tensor:
        """Run the modulated deformable convolution.

        Args:
            input (torch.Tensor): 4D input tensor.
            offset (torch.Tensor): Sampling offsets; its dtype decides the
                dtype the other tensors are cast to.
            mask (torch.Tensor): Modulation mask.
            weight (nn.Parameter): Convolution kernel.
            bias (nn.Parameter, optional): Bias. Default: None.
            stride (int or tuple): Stride, expanded with ``_pair``.
            padding (int or tuple): Padding, expanded with ``_pair``.
            dilation (int or tuple): Dilation, expanded with ``_pair``.
            groups (int): Forwarded to the op as ``group``.
            deform_groups (int): Forwarded to the op as
                ``deformable_group``.

        Returns:
            torch.Tensor: The convolution output.

        Raises:
            ValueError: If ``input`` is not 4D, or if the computed output
                size is not positive.
        """
        if input is not None and input.dim() != 4:
            raise ValueError(
                f'Expected 4D tensor as input, got {input.dim()}D tensor '
                'instead.')
        ctx.stride = _pair(stride)
        ctx.padding = _pair(padding)
        ctx.dilation = _pair(dilation)
        ctx.groups = groups
        ctx.deform_groups = deform_groups
        ctx.with_bias = bias is not None
        ctx.device = input.device.type
        if not ctx.with_bias:
            bias = input.new_empty(0)  # fake tensor
        # When pytorch version >= 1.6.0, amp is adopted for fp16 mode;
        # amp won't cast the type of model (float32), but "offset" is cast
        # to float16 by nn.Conv2d automatically, leading to the type
        # mismatch with input (when it is float32) or weight.
        # The flag for whether to use fp16 or amp is the type of "offset",
        # we cast weight and input to temporarily support fp16 and amp
        # whatever the pytorch version is.
        input = input.type_as(offset)
        weight = weight.type_as(input)
        bias = bias.type_as(input)  # type: ignore
        mask = mask.type_as(input)
        if ctx.device == 'npu':
            output = ModulatedDeformConv2dFunction._npu_forward(
                ctx, input, offset, mask, weight, bias)
            return output
        ctx.save_for_backward(input, offset, mask, weight, bias)
        output = input.new_empty([
            int(i) for i in ModulatedDeformConv2dFunction._output_size(
                ctx, input, weight)
        ])
        # Two scratch buffers shared between the forward and backward ops.
        ctx._bufs = [input.new_empty(0), input.new_empty(0)]
        ext_module.modulated_deform_conv_forward(
            input,
            weight,
            bias,
            ctx._bufs[0],
            offset,
            mask,
            output,
            ctx._bufs[1],
            kernel_h=weight.size(2),
            kernel_w=weight.size(3),
            stride_h=ctx.stride[0],
            stride_w=ctx.stride[1],
            pad_h=ctx.padding[0],
            pad_w=ctx.padding[1],
            dilation_h=ctx.dilation[0],
            dilation_w=ctx.dilation[1],
            group=ctx.groups,
            deformable_group=ctx.deform_groups,
            with_bias=ctx.with_bias)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output: torch.Tensor) -> tuple:
        """Compute gradients for input, offset, mask, weight and bias.

        Returns one entry per ``forward`` input; the five non-tensor
        arguments get ``None``, and so does ``bias`` when the forward call
        had no bias.
        """
        if ctx.device == 'npu':
            return ModulatedDeformConv2dFunction._npu_backward(
                ctx, grad_output)
        input, offset, mask, weight, bias = ctx.saved_tensors
        # Zero-initialised gradient buffers handed to the extension op.
        grad_input = torch.zeros_like(input)
        grad_offset = torch.zeros_like(offset)
        grad_mask = torch.zeros_like(mask)
        grad_weight = torch.zeros_like(weight)
        grad_bias = torch.zeros_like(bias)
        grad_output = grad_output.contiguous()
        ext_module.modulated_deform_conv_backward(
            input,
            weight,
            bias,
            ctx._bufs[0],
            offset,
            mask,
            ctx._bufs[1],
            grad_input,
            grad_weight,
            grad_bias,
            grad_offset,
            grad_mask,
            grad_output,
            kernel_h=weight.size(2),
            kernel_w=weight.size(3),
            stride_h=ctx.stride[0],
            stride_w=ctx.stride[1],
            pad_h=ctx.padding[0],
            pad_w=ctx.padding[1],
            dilation_h=ctx.dilation[0],
            dilation_w=ctx.dilation[1],
            group=ctx.groups,
            deformable_group=ctx.deform_groups,
            with_bias=ctx.with_bias)
        if not ctx.with_bias:
            grad_bias = None

        return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias,
                None, None, None, None, None)

    @staticmethod
    def _output_size(ctx, input, weight):
        """Return the output size ``(N, out_channels, *spatial)``.

        Each spatial extent is
        ``(in + 2 * pad - (dilation * (kernel - 1) + 1)) // stride + 1``.

        Raises:
            ValueError: If any entry of the resulting size is not positive.
        """
        channels = weight.size(0)
        output_size = (input.size(0), channels)
        for d in range(input.dim() - 2):
            in_size = input.size(d + 2)
            pad = ctx.padding[d]
            kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1
            stride_ = ctx.stride[d]
            output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, )
        if not all(s > 0 for s in output_size):
            raise ValueError(
                'convolution input is too small (output would be ' +
                'x'.join(map(str, output_size)) + ')')
        return output_size


# Functional entry point: runs ``ModulatedDeformConv2dFunction.forward``
# through ``Function.apply``.
modulated_deform_conv2d = ModulatedDeformConv2dFunction.apply


class ModulatedDeformConv2d(nn.Module):
    """Modulated deformable 2D convolution layer.

    The layer owns the convolution weight (and an optional bias) and applies
    ``modulated_deform_conv2d`` with ``offset`` and ``mask`` tensors supplied
    by the caller. The deprecated keyword ``deformable_groups`` is accepted
    as an alias of ``deform_groups``.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int or tuple[int]): Kernel size, expanded with
            ``_pair``.
        stride (int): Stride, expanded with ``_pair``. Default: 1.
        padding (int): Padding, expanded with ``_pair``. Default: 0.
        dilation (int): Dilation, expanded with ``_pair``. Default: 1.
        groups (int): Divides ``in_channels`` for the weight shape and is
            forwarded to the op. Default: 1.
        deform_groups (int): Forwarded to the op. Default: 1.
        bias (bool or str): A truthy value creates a bias parameter;
            otherwise ``bias`` is registered as None. Default: True.
    """

    @deprecated_api_warning({'deformable_groups': 'deform_groups'},
                            cls_name='ModulatedDeformConv2d')
    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: Union[int, Tuple[int]],
                 stride: int = 1,
                 padding: int = 0,
                 dilation: int = 1,
                 groups: int = 1,
                 deform_groups: int = 1,
                 bias: Union[bool, str] = True):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        # Spatial hyper-parameters are stored as 2-tuples.
        self.kernel_size = _pair(kernel_size)
        self.stride = _pair(stride)
        self.padding = _pair(padding)
        self.dilation = _pair(dilation)
        self.groups = groups
        self.deform_groups = deform_groups
        # enable compatibility with nn.Conv2d
        self.transposed = False
        self.output_padding = _single(0)

        weight_shape = (out_channels, in_channels // groups,
                        *self.kernel_size)
        self.weight = nn.Parameter(torch.Tensor(*weight_shape))
        if not bias:
            self.register_parameter('bias', None)
        else:
            self.bias = nn.Parameter(torch.Tensor(out_channels))
        self.init_weights()

    def init_weights(self):
        """Initialise the weight uniformly and zero the bias.

        The weight is drawn from ``U(-b, b)`` with
        ``b = 1 / sqrt(in_channels * prod(kernel_size))``.
        """
        fan_in = self.in_channels
        for extent in self.kernel_size:
            fan_in *= extent
        bound = 1. / math.sqrt(fan_in)
        self.weight.data.uniform_(-bound, bound)
        if self.bias is not None:
            self.bias.data.zero_()

    def forward(self, x: torch.Tensor, offset: torch.Tensor,
                mask: torch.Tensor) -> torch.Tensor:
        """Apply the modulated deformable convolution to ``x``.

        Args:
            x (torch.Tensor): Input feature map.
            offset (torch.Tensor): Sampling offsets.
            mask (torch.Tensor): Modulation mask.

        Returns:
            torch.Tensor: The convolution output.
        """
        return modulated_deform_conv2d(
            x, offset, mask, self.weight, self.bias, self.stride,
            self.padding, self.dilation, self.groups, self.deform_groups)


@MODELS.register_module('DCNv2')
class ModulatedDeformConv2dPack(ModulatedDeformConv2d):
    """Modulated deformable convolution that predicts its own offsets.

    An internal ``conv_offset`` convolution produces the offsets and the
    modulation mask from the input, so the layer can be called like a normal
    convolution with a single tensor argument.

    Args:
        in_channels (int): Same as nn.Conv2d.
        out_channels (int): Same as nn.Conv2d.
        kernel_size (int or tuple[int]): Same as nn.Conv2d.
        stride (int): Same as nn.Conv2d, while tuple is not supported.
        padding (int): Same as nn.Conv2d, while tuple is not supported.
        dilation (int): Same as nn.Conv2d, while tuple is not supported.
        groups (int): Same as nn.Conv2d.
        deform_groups (int): Multiplier for the number of offset/mask
            channels predicted by ``conv_offset``.
        bias (bool or str): Whether the main convolution has a bias. Any
            truthy value enables it in this class (NOTE(review): an
            ``'auto'`` value is presumably resolved by the caller - confirm).
    """

    _version = 2

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        kernel_h, kernel_w = self.kernel_size[0], self.kernel_size[1]
        # Three maps per kernel tap and deform group: two offset components
        # plus one mask (see the 3-way chunk in ``forward``).
        offset_channels = self.deform_groups * 3 * kernel_h * kernel_w
        self.conv_offset = nn.Conv2d(
            self.in_channels,
            offset_channels,
            kernel_size=self.kernel_size,
            stride=self.stride,
            padding=self.padding,
            dilation=self.dilation,
            bias=True)
        self.init_weights()

    def init_weights(self) -> None:
        """Initialise the main weights and zero the offset predictor."""
        super().init_weights()
        # The parent constructor calls this hook before ``conv_offset`` has
        # been created, so it has to tolerate the attribute being absent.
        if not hasattr(self, 'conv_offset'):
            return
        self.conv_offset.weight.data.zero_()
        self.conv_offset.bias.data.zero_()

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # type: ignore
        """Predict offsets and mask from ``x`` and run the deformable conv.

        Args:
            x (torch.Tensor): Input feature map.

        Returns:
            torch.Tensor: Result of ``modulated_deform_conv2d``.
        """
        predicted = self.conv_offset(x)
        offset_a, offset_b, raw_mask = torch.chunk(predicted, 3, dim=1)
        offset = torch.cat((offset_a, offset_b), dim=1)
        mask = torch.sigmoid(raw_mask)
        return modulated_deform_conv2d(x, offset, mask, self.weight, self.bias,
                                       self.stride, self.padding,
                                       self.dilation, self.groups,
                                       self.deform_groups)

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        """Load parameters, renaming legacy ``*_offset`` keys when needed.

        For checkpoints without a version, or with a version below 2, keys
        of the form ``<name>_offset.{weight,bias}`` are moved to
        ``<name>.conv_offset.{weight,bias}`` in ``state_dict`` (in place)
        unless the new key is already present.
        """
        version = local_metadata.get('version', None)

        if version is None or version < 2:
            # ``prefix`` ends with '.', so ``prefix[:-1]`` is the bare module
            # name that old checkpoints suffixed with '_offset'.
            renames = (
                (prefix + 'conv_offset.weight',
                 prefix[:-1] + '_offset.weight'),
                (prefix + 'conv_offset.bias', prefix[:-1] + '_offset.bias'),
            )
            for new_key, legacy_key in renames:
                if new_key not in state_dict and legacy_key in state_dict:
                    state_dict[new_key] = state_dict.pop(legacy_key)

        # NOTE(review): this message is emitted when the checkpoint is
        # already at version 2 or later, not when a legacy one was converted
        # above - confirm that this is the intended condition.
        if version is not None and version > 1:
            print_log(
                f'ModulatedDeformConvPack {prefix.rstrip(".")} is upgraded to '
                'version 2.',
                logger='current')

        super()._load_from_state_dict(state_dict, prefix, local_metadata,
                                      strict, missing_keys, unexpected_keys,
                                      error_msgs)


if IS_MLU_AVAILABLE:
    import torchvision
    from mmengine.utils import digit_version
    from torchvision.ops import deform_conv2d as tv_deform_conv2d

    @MODELS.register_module('DCNv2', force=True)
    class ModulatedDeformConv2dPack_MLU(ModulatedDeformConv2d):
        """DCNv2 layer for MLU devices, backed by torchvision.

        The class is registered under the same ``'DCNv2'`` name with
        ``force=True``, so it replaces the default registration whenever
        ``IS_MLU_AVAILABLE`` is true. The convolution itself is delegated to
        ``torchvision.ops.deform_conv2d``.

        Args:
            in_channels (int): Same as nn.Conv2d.
            out_channels (int): Same as nn.Conv2d.
            kernel_size (int or tuple[int]): Same as nn.Conv2d.
            stride (int): Same as nn.Conv2d, while tuple is not supported.
            padding (int): Same as nn.Conv2d, while tuple is not supported.
            dilation (int): Same as nn.Conv2d, while tuple is not supported.
            groups (int): Same as nn.Conv2d.
            bias (bool or str): Whether the main convolution has a bias. Any
                truthy value enables it in this class.
        """

        def __init__(self, *args, **kwargs):
            installed = digit_version(torchvision.__version__)
            required = digit_version('0.10.0a0')
            assert installed >= required, \
                'the version of torchvision should be >= 0.10.0'
            super().__init__(*args, **kwargs)
            kernel_h, kernel_w = self.kernel_size[0], self.kernel_size[1]
            # Two offset components plus one mask per kernel tap and group.
            offset_channels = self.deform_groups * 3 * kernel_h * kernel_w
            self.conv_offset = nn.Conv2d(
                self.in_channels,
                offset_channels,
                kernel_size=self.kernel_size,
                stride=self.stride,
                padding=self.padding,
                dilation=self.dilation,
                bias=True)
            self.init_weights()

        def init_weights(self):
            """Initialise the main weights and zero the offset predictor."""
            super().init_weights()
            # Called once by the parent constructor before ``conv_offset``
            # exists, hence the guard.
            if not hasattr(self, 'conv_offset'):
                return
            self.conv_offset.weight.data.zero_()
            self.conv_offset.bias.data.zero_()

        def forward(self, x):
            """Predict offsets/mask from ``x`` and call torchvision's op."""
            predicted = self.conv_offset(x)
            offset_a, offset_b, raw_mask = torch.chunk(predicted, 3, dim=1)
            offset = torch.cat((offset_a, offset_b), dim=1)
            mask = torch.sigmoid(raw_mask)
            # Bring input, weight and mask to the dtype of the predicted
            # offsets before handing them to torchvision.
            x = x.type_as(offset)
            weight = self.weight.type_as(x)
            mask = mask.type_as(x)
            return tv_deform_conv2d(
                x,
                offset,
                weight,
                bias=self.bias,
                stride=self.stride,
                padding=self.padding,
                dilation=self.dilation,
                mask=mask)


================================================
FILE: mmcv/ops/multi_scale_deform_attn.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import math
import warnings
from typing import Optional, no_type_check

import mmengine
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmengine.model import BaseModule, constant_init, xavier_init
from mmengine.registry import MODELS
from mmengine.utils import deprecated_api_warning
from torch.autograd.function import Function, once_differentiable

from mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,
                        IS_NPU_AVAILABLE)
from ..utils import ext_loader

# Extension handle obtained through ``ext_loader``; the two op names requested
# here are the ones invoked as ``ext_module.<name>`` by
# ``MultiScaleDeformableAttnFunction`` below.
ext_module = ext_loader.load_ext(
    '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])


class MultiScaleDeformableAttnFunction(Function):
    """Autograd wrapper around the ``ms_deform_attn_*`` extension ops."""

    @staticmethod
    def forward(ctx, value: torch.Tensor, value_spatial_shapes: torch.Tensor,
                value_level_start_index: torch.Tensor,
                sampling_locations: torch.Tensor,
                attention_weights: torch.Tensor,
                im2col_step: int) -> torch.Tensor:
        """GPU/MLU version of multi-scale deformable attention.

        Args:
            value (torch.Tensor): The value has shape
                (bs, num_keys, num_heads, embed_dims//num_heads)
            value_spatial_shapes (torch.Tensor): Spatial shape of
                each feature map, has shape (num_levels, 2),
                last dimension 2 represent (h, w)
            value_level_start_index (torch.Tensor): The start index of each
                level, has shape (num_levels, ) and can be represented as
                [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
            sampling_locations (torch.Tensor): The location of sampling points,
                has shape
                (bs ,num_queries, num_heads, num_levels, num_points, 2),
                the last dimension 2 represent (x, y).
            attention_weights (torch.Tensor): The weight of sampling points
                used when calculate the attention, has shape
                (bs ,num_queries, num_heads, num_levels, num_points),
            im2col_step (int): The step used in image to column. Stored on
                ``ctx`` and forwarded unchanged to the extension op.

        Returns:
            torch.Tensor: has shape (bs, num_queries, embed_dims)
        """

        # Kept on ctx (not via save_for_backward) so backward can pass the
        # same value to the extension.
        ctx.im2col_step = im2col_step

        # When pytorch version >= 1.6.0, amp is adopted for fp16 mode;
        # amp won't cast the type of sampling_locations, attention_weights
        # (float32), but "value" is cast to float16, leading to the type
        # mismatch with input (when it is float32) or weight.
        # The flag for whether to use fp16 or amp is the type of "value",
        # we cast sampling_locations and attention_weights to
        # temporarily support fp16 and amp whatever the
        # pytorch version is.
        sampling_locations = sampling_locations.type_as(value)
        attention_weights = attention_weights.type_as(value)

        output = ext_module.ms_deform_attn_forward(
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
            im2col_step=ctx.im2col_step)
        # The already-cast tensors are saved, so backward sees the same
        # dtypes the forward op was given. The order here must match the
        # unpacking order in backward.
        ctx.save_for_backward(value, value_spatial_shapes,
                              value_level_start_index, sampling_locations,
                              attention_weights)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output: torch.Tensor) -> tuple:
        """GPU/MLU version of backward function.

        Args:
            grad_output (torch.Tensor): Gradient of output tensor of forward.

        Returns:
            tuple[Tensor]: One entry per forward input, in order. Gradients
            are returned for ``value``, ``sampling_locations`` and
            ``attention_weights``; the other three entries are ``None``.
        """
        value, value_spatial_shapes, value_level_start_index, \
            sampling_locations, attention_weights = ctx.saved_tensors
        # Zero-filled buffers handed to the extension op together with the
        # inputs; they are returned below as the gradients.
        grad_value = torch.zeros_like(value)
        grad_sampling_loc = torch.zeros_like(sampling_locations)
        grad_attn_weight = torch.zeros_like(attention_weights)

        ext_module.ms_deform_attn_backward(
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
            grad_output.contiguous(),
            grad_value,
            grad_sampling_loc,
            grad_attn_weight,
            im2col_step=ctx.im2col_step)

        return grad_value, None, None, \
            grad_sampling_loc, grad_attn_weight, None


def multi_scale_deformable_attn_pytorch(
        value: torch.Tensor, value_spatial_shapes: torch.Tensor,
        sampling_locations: torch.Tensor,
        attention_weights: torch.Tensor) -> torch.Tensor:
    """Pure-PyTorch multi-scale deformable attention (CPU fallback).

    Each feature level is sampled with ``F.grid_sample`` at the requested
    locations, and the samples are combined with the attention weights.

    Args:
        value (torch.Tensor): The value has shape
            (bs, num_keys, num_heads, embed_dims//num_heads)
        value_spatial_shapes (torch.Tensor): Spatial shape of
            each feature map, has shape (num_levels, 2),
            last dimension 2 represent (h, w)
        sampling_locations (torch.Tensor): The location of sampling points,
            has shape
            (bs ,num_queries, num_heads, num_levels, num_points, 2),
            the last dimension 2 represent (x, y), normalised to [0, 1].
        attention_weights (torch.Tensor): The weight of sampling points used
            when calculate the attention, has shape
            (bs ,num_queries, num_heads, num_levels, num_points),

    Returns:
        torch.Tensor: has shape (bs, num_queries, embed_dims)
    """

    batch, _, n_heads, head_dim = value.shape
    _, n_queries, n_heads, n_levels, n_points, _ =\
        sampling_locations.shape

    # Cut the flattened key axis back into one chunk per feature level.
    level_sizes = [height * width for height, width in value_spatial_shapes]
    per_level_values = value.split(level_sizes, dim=1)

    # grid_sample expects coordinates in [-1, 1] rather than [0, 1].
    grids = 2 * sampling_locations - 1

    sampled_per_level = []
    for lvl, (height, width) in enumerate(value_spatial_shapes):
        # (bs, H*W, heads, dim) -> (bs, H*W, heads*dim)
        # -> (bs, heads*dim, H*W) -> (bs*heads, dim, H, W)
        feature_map = per_level_values[lvl].flatten(2).transpose(1, 2)
        feature_map = feature_map.reshape(batch * n_heads, head_dim, height,
                                          width)
        # (bs, queries, heads, points, 2) -> (bs, heads, queries, points, 2)
        # -> (bs*heads, queries, points, 2)
        level_grid = grids[:, :, :, lvl].transpose(1, 2).flatten(0, 1)
        # Result: (bs*heads, dim, queries, points)
        sampled = F.grid_sample(
            feature_map,
            level_grid,
            mode='bilinear',
            padding_mode='zeros',
            align_corners=False)
        sampled_per_level.append(sampled)

    # (bs, queries, heads, levels, points)
    # -> (bs, heads, queries, levels, points)
    # -> (bs*heads, 1, queries, levels*points)
    weights = attention_weights.transpose(1, 2).reshape(
        batch * n_heads, 1, n_queries, n_levels * n_points)

    # (bs*heads, dim, queries, levels*points), weighted and reduced over the
    # joint level/point axis.
    stacked = torch.stack(sampled_per_level, dim=-2).flatten(-2)
    attended = (stacked * weights).sum(-1)
    attended = attended.view(batch, n_heads * head_dim, n_queries)
    return attended.transpose(1, 2).contiguous()


@MODELS.register_module()
class MultiScaleDeformableAttention(BaseModule):
    """An attention module used in Deformable-Detr.

    `Deformable DETR: Deformable Transformers for End-to-End Object Detection.
    <https://arxiv.org/pdf/2010.04159.pdf>`_.

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_heads (int): Parallel attention heads. Must divide
            ``embed_dims``. Default: 8.
        num_levels (int): The number of feature map used in
            Attention. Default: 4.
        num_points (int): The number of sampling points for
            each query in each head. Default: 4.
        im2col_step (int): The step used in image_to_column.
            Default: 64.
        dropout (float): Probability of the Dropout layer applied to the
            projected output before the identity is added.
            Default: 0.1.
        batch_first (bool): Key, Query and Value are shape of
            (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to False.
        norm_cfg (dict): Config dict for normalization layer. Only stored
            on the instance; no layer is built from it in this class.
            Default: None.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        value_proj_ratio (float): The expansion ratio of value_proj.
            Default: 1.0.

    Raises:
        ValueError: If ``embed_dims`` is not divisible by ``num_heads``.
    """

    def __init__(self,
                 embed_dims: int = 256,
                 num_heads: int = 8,
                 num_levels: int = 4,
                 num_points: int = 4,
                 im2col_step: int = 64,
                 dropout: float = 0.1,
                 batch_first: bool = False,
                 norm_cfg: Optional[dict] = None,
                 init_cfg: Optional[mmengine.ConfigDict] = None,
                 value_proj_ratio: float = 1.0):
        super().__init__(init_cfg)
        if embed_dims % num_heads != 0:
            raise ValueError(f'embed_dims must be divisible by num_heads, '
                             f'but got {embed_dims} and {num_heads}')
        dim_per_head = embed_dims // num_heads
        self.norm_cfg = norm_cfg
        self.dropout = nn.Dropout(dropout)
        self.batch_first = batch_first

        # you'd better set dim_per_head to a power of 2
        # which is more efficient in the CUDA/MUSA implementation
        def _is_power_of_2(n):
            # Rejects non-int and negative input; 0 is reported as False.
            if (not isinstance(n, int)) or (n < 0):
                raise ValueError(
                    'invalid input for _is_power_of_2: {} (type: {})'.format(
                        n, type(n)))
            return (n & (n - 1) == 0) and n != 0

        # Only a warning: a non-power-of-2 head size is still accepted.
        if not _is_power_of_2(dim_per_head):
            warnings.warn(
                "You'd better set embed_dims in "
                'MultiScaleDeformAttention to make '
                'the dimension of each attention head a power of 2 '
                'which is more efficient in our CUDA/MUSA implementation.')

        self.im2col_step = im2col_step
        self.embed_dims = embed_dims
        self.num_levels = num_levels
        self.num_heads = num_heads
        self.num_points = num_points
        # One (x, y) offset per head, level and sampling point.
        self.sampling_offsets = nn.Linear(
            embed_dims, num_heads * num_levels * num_points * 2)
        # One attention logit per head, level and sampling point.
        self.attention_weights = nn.Linear(embed_dims,
                                           num_heads * num_levels * num_points)
        value_proj_size = int(embed_dims * value_proj_ratio)
        self.value_proj = nn.Linear(embed_dims, value_proj_size)
        self.output_proj = nn.Linear(value_proj_size, embed_dims)
        self.init_weights()

    def init_weights(self) -> None:
        """Default initialization for Parameters of Module."""
        constant_init(self.sampling_offsets, 0.)
        device = next(self.parameters()).device
        # One angle per head, evenly spaced over a full turn.
        thetas = torch.arange(
            self.num_heads, dtype=torch.float32,
            device=device) * (2.0 * math.pi / self.num_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        # Scale each direction so its largest absolute component is 1, then
        # replicate to (num_heads, num_levels, num_points, 2).
        grid_init = (grid_init /
                     grid_init.abs().max(-1, keepdim=True)[0]).view(
                         self.num_heads, 1, 1,
                         2).repeat(1, self.num_levels, self.num_points, 1)
        # The i-th sampling point starts (i + 1) steps along the direction.
        for i in range(self.num_points):
            grid_init[:, :, i, :] *= i + 1

        # The bias of sampling_offsets is replaced by the flattened grid.
        self.sampling_offsets.bias.data = grid_init.view(-1)
        constant_init(self.attention_weights, val=0., bias=0.)
        xavier_init(self.value_proj, distribution='uniform', bias=0.)
        xavier_init(self.output_proj, distribution='uniform', bias=0.)
        self._is_init = True

    @no_type_check
    @deprecated_api_warning({'residual': 'identity'},
                            cls_name='MultiScaleDeformableAttention')
    def forward(self,
                query: torch.Tensor,
                key: Optional[torch.Tensor] = None,
                value: Optional[torch.Tensor] = None,
                identity: Optional[torch.Tensor] = None,
                query_pos: Optional[torch.Tensor] = None,
                key_padding_mask: Optional[torch.Tensor] = None,
                reference_points: Optional[torch.Tensor] = None,
                spatial_shapes: Optional[torch.Tensor] = None,
                level_start_index: Optional[torch.Tensor] = None,
                **kwargs) -> torch.Tensor:
        """Forward Function of MultiScaleDeformAttention.

        Shapes below are given for ``batch_first=False``; with
        ``batch_first=True`` the first two dimensions of ``query`` and
        ``value`` are expected as ``(bs, n, embed_dims)`` and no permutation
        is applied.

        Args:
            query (torch.Tensor): Query of Transformer with shape
                (num_query, bs, embed_dims).
            key (torch.Tensor): The key tensor with shape
                `(num_key, bs, embed_dims)`. Accepted for interface
                compatibility; it is not read by this method.
            value (torch.Tensor): The value tensor with shape
                `(num_key, bs, embed_dims)`. If None, `query` (before
                `query_pos` is added) will be used.
            identity (torch.Tensor): The tensor used for addition, with the
                same shape as `query`. Default None. If None,
                `query` (before `query_pos` is added) will be used.
            query_pos (torch.Tensor): The positional encoding for `query`.
                Default: None.
            key_padding_mask (torch.Tensor): Mask for `value`, with
                shape [bs, num_key]. Projected values at masked positions
                are filled with 0.
            reference_points (torch.Tensor):  The normalized reference
                points with shape (bs, num_query, num_levels, 2),
                all elements is range in [0, 1], top-left (0,0),
                bottom-right (1, 1), including padding area.
                or (N, Length_{query}, num_levels, 4), add
                additional two dimensions is (w, h) to
                form reference boxes.
            spatial_shapes (torch.Tensor): Spatial shape of features in
                different levels. With shape (num_levels, 2),
                last dimension represents (h, w).
            level_start_index (torch.Tensor): The start index of each level.
                A tensor has shape ``(num_levels, )`` and can be represented
                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].

        Returns:
            torch.Tensor: forwarded results with shape
            [num_query, bs, embed_dims].

        Raises:
            ValueError: If the last dimension of ``reference_points`` is
                neither 2 nor 4.
        """

        if value is None:
            value = query

        if identity is None:
            identity = query
        if query_pos is not None:
            query = query + query_pos
        if not self.batch_first:
            # change to (bs, num_query ,embed_dims)
            query = query.permute(1, 0, 2)
            value = value.permute(1, 0, 2)

        bs, num_query, _ = query.shape
        bs, num_value, _ = value.shape
        # The flattened value length must equal the total number of
        # positions over all feature levels.
        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value

        value = self.value_proj(value)
        if key_padding_mask is not None:
            value = value.masked_fill(key_padding_mask[..., None], 0.0)
        # Split the projected channels across heads.
        value = value.view(bs, num_value, self.num_heads, -1)
        sampling_offsets = self.sampling_offsets(query).view(
            bs, num_query, self.num_heads, self.num_levels, self.num_points, 2)
        # Softmax is taken jointly over all levels and points of a head, then
        # the result is reshaped back to separate level/point axes.
        attention_weights = self.attention_weights(query).view(
            bs, num_query, self.num_heads, self.num_levels * self.num_points)
        attention_weights = attention_weights.softmax(-1)

        attention_weights = attention_weights.view(bs, num_query,
                                                   self.num_heads,
                                                   self.num_levels,
                                                   self.num_points)
        if reference_points.shape[-1] == 2:
            # Point references: offsets are divided by each level's (w, h).
            offset_normalizer = torch.stack(
                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
            sampling_locations = reference_points[:, :, None, :, None, :] \
                + sampling_offsets \
                / offset_normalizer[None, None, None, :, None, :]
        elif reference_points.shape[-1] == 4:
            # Box references: offsets are scaled by half of the last two box
            # components and divided by the number of points.
            sampling_locations = reference_points[:, :, None, :, None, :2] \
                + sampling_offsets / self.num_points \
                * reference_points[:, :, None, :, None, 2:] \
                * 0.5
        else:
            raise ValueError(
                f'Last dim of reference_points must be'
                f' 2 or 4, but get {reference_points.shape[-1]} instead.')
        # Use the extension-backed op when the tensor lives on a supported
        # accelerator; otherwise fall back to the pure-PyTorch version.
        if ((IS_CUDA_AVAILABLE and value.is_cuda)
                or (IS_MLU_AVAILABLE and value.is_mlu)
                or (IS_MUSA_AVAILABLE and value.is_musa)
                or (IS_NPU_AVAILABLE and value.device.type == 'npu')):
            output = MultiScaleDeformableAttnFunction.apply(
                value, spatial_shapes, level_start_index, sampling_locations,
                attention_weights, self.im2col_step)
        else:
            output = multi_scale_deformable_attn_pytorch(
                value, spatial_shapes, sampling_locations, attention_weights)

        output = self.output_proj(output)

        if not self.batch_first:
            # (num_query, bs ,embed_dims)
            output = output.permute(1, 0, 2)

        # Residual connection: dropout on the attention output only.
        return self.dropout(output) + identity


================================================
FILE: mmcv/ops/nms.py
================================================
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import torch
from mmengine.utils import deprecated_api_warning
from torch import Tensor

from ..utils import ext_loader

# Extension handle obtained through ``ext_loader``; the op names requested
# here are later invoked as ``ext_module.<name>`` (e.g. ``nms``, ``softnms``).
ext_module = ext_loader.load_ext(
    '_ext', ['nms', 'softnms', 'nms_match', 'nms_rotated', 'nms_quadri'])


# This function is modified from: https://github.com/pytorch/vision/
class NMSop(torch.autograd.Function):
    """Autograd-function wrapper around the extension ``nms`` op."""

    @staticmethod
    def forward(ctx: Any, bboxes: Tensor, scores: Tensor, iou_threshold: float,
                offset: int, score_threshold: float, max_num: int) -> Tensor:
        """Run NMS with optional score pre-filtering and result capping.

        Args:
            ctx (Any): Autograd context (unused).
            bboxes (Tensor): Candidate boxes.
            scores (Tensor): One score per box.
            iou_threshold (float): IoU threshold forwarded to the op.
            offset (int): Offset forwarded to the op.
            score_threshold (float): When positive, boxes whose score is not
                strictly greater are dropped before NMS.
            max_num (int): When positive, keep at most this many indices.

        Returns:
            Tensor: Indices of the kept boxes, relative to the unfiltered
            input.
        """
        filter_by_score = score_threshold > 0
        if filter_by_score:
            keep_mask = scores > score_threshold
            # Positions of the surviving boxes in the original input, used to
            # translate the op's output back afterwards.
            original_positions = torch.nonzero(
                keep_mask, as_tuple=False).squeeze(dim=1)
            bboxes = bboxes[keep_mask]
            scores = scores[keep_mask]

        inds = ext_module.nms(
            bboxes, scores, iou_threshold=float(iou_threshold), offset=offset)

        if max_num > 0:
            inds = inds[:max_num]
        if not filter_by_score:
            return inds
        return original_positions[inds]


class SoftNMSop(torch.autograd.Function):
    """Autograd-function wrapper around the extension ``softnms`` op."""

    @staticmethod
    def forward(ctx: Any, boxes: Tensor, scores: Tensor, iou_threshold: float,
                sigma: float, min_score: float, method: int,
                offset: int) -> Tuple[Tensor, Tensor]:
        """Run Soft-NMS on CPU copies of the inputs.

        Args:
            ctx (Any): Autograd context (unused).
            boxes (Tensor): Candidate boxes.
            scores (Tensor): One score per box.
            iou_threshold (float): IoU threshold forwarded to the op.
            sigma (float): Sigma forwarded to the op.
            min_score (float): Minimum score forwarded to the op.
            method (int): Integer method code forwarded to the op.
            offset (int): Offset forwarded to the op.

        Returns:
            tuple[Tensor, Tensor]: A CPU buffer with one 5-column row per
            input box, passed to the op as its output argument, and the
            indices returned by the op.
        """
        num_boxes = boxes.size(0)
        dets = boxes.new_empty((num_boxes, 5), device='cpu')
        op_options = dict(
            iou_threshold=float(iou_threshold),
            sigma=float(sigma),
            min_score=float(min_score),
            method=int(method),
            offset=int(offset))
        inds = ext_module.softnms(boxes.cpu(), scores.cpu(), dets.cpu(),
                                  **op_options)
        return dets, inds

    @staticmethod
    def symbolic(g, boxes, scores, iou_threshold, sigma, min_score, method,
                 offset):
        """Emit the ``mmcv::SoftNonMaxSuppression`` node for ONNX export."""
        from packaging import version
        assert version.parse(torch.__version__) >= version.parse('1.7.0')
        return g.op(
            'mmcv::SoftNonMaxSuppression',
            boxes,
            scores,
            iou_threshold_f=float(iou_threshold),
            sigma_f=float(sigma),
            min_score_f=float(min_score),
            method_i=int(method),
            offset_i=int(offset),
            outputs=2)


# Alias for arguments and return values that may be either a torch tensor or
# a NumPy array.
array_like_type = Union[Tensor, np.ndarray]


@deprecated_api_warning({'iou_thr': 'iou_threshold'})
def nms(boxes: array_like_type,
        scores: array_like_type,
        iou_threshold: float,
        offset: int = 0,
        score_threshold: float = 0,
        max_num: int = -1) -> Tuple[array_like_type, array_like_type]:
    """Run non-maximum suppression on tensors or NumPy arrays.

    NumPy inputs are converted to tensors before ``NMSop`` is applied. When
    ``boxes`` is a NumPy array, both results are converted back to NumPy
    arrays; otherwise tensors are returned.

    Arguments:
        boxes (torch.Tensor or np.ndarray): boxes in shape (N, 4).
        scores (torch.Tensor or np.ndarray): scores in shape (N, ).
        iou_threshold (float): IoU threshold for NMS.
        offset (int, 0 or 1): boxes' width or height is (x2 - x1 + offset).
        score_threshold (float): score threshold for NMS.
        max_num (int): maximum number of boxes after NMS.

    Returns:
        tuple: kept dets in shape (K, 5) (boxes with the score appended as
        the last column) and the kept indices, which always have the same
        data type as the input.

    Example:
        >>> boxes = np.array([[49.1, 32.4, 51.0, 35.9],
        >>>                   [49.3, 32.9, 51.0, 35.3],
        >>>                   [49.2, 31.8, 51.0, 35.4],
        >>>                   [35.1, 11.5, 39.1, 15.7],
        >>>                   [35.6, 11.8, 39.3, 14.2],
        >>>                   [35.3, 11.5, 39.9, 14.5],
        >>>                   [35.2, 11.7, 39.7, 15.7]], dtype=np.float32)
        >>> scores = np.array([0.9, 0.9, 0.5, 0.5, 0.5, 0.4, 0.3],
        >>>                   dtype=np.float32)
        >>> iou_threshold = 0.6
        >>> dets, inds = nms(boxes, scores, iou_threshold)
        >>> assert len(inds) == len(dets) == 3
    """
    accepted_types = (Tensor, np.ndarray)
    assert isinstance(boxes, accepted_types)
    assert isinstance(scores, accepted_types)

    # The return type follows ``boxes`` alone, not ``scores``.
    is_numpy = isinstance(boxes, np.ndarray)
    if is_numpy:
        boxes = torch.from_numpy(boxes)
    if isinstance(scores, np.ndarray):
        scores = torch.from_numpy(scores)

    assert boxes.size(1) == 4
    assert boxes.size(0) == scores.size(0)
    assert offset in (0, 1)

    inds = NMSop.apply(boxes, scores, iou_threshold, offset, score_threshold,
                       max_num)
    kept_boxes = boxes[inds]
    kept_scores = scores[inds].reshape(-1, 1)
    dets = torch.cat((kept_boxes, kept_scores), dim=1)
    if not is_numpy:
        return dets, inds
    dets = dets.cpu().numpy()
    inds = inds.cpu().numpy()
    return dets, inds


@deprecated_api_warning({'iou_thr': 'iou_threshold'})
def soft_nms(boxes: array_like_type,
             scores: array_like_type,
             iou_threshold: float = 0.3,
             sigma: float = 0.5,
             min_score: float = 1e-3,
             method: str = 'linear',
             offset: int = 0) -> Tuple[array_like_type, array_like_type]:
    """Dispatch to only CPU Soft NMS implementations.

    The input can be either a torch tensor or numpy array.
    The returned type will always be the same as inputs. Inputs are moved to
    CPU for the computation; tensor results are moved back to the device of
    ``boxes``.

    Args:
        boxes (torch.Tensor or np.ndarray): boxes in shape (N, 4).
        scores (torch.Tensor or np.ndarray): scores in shape (N, ).
        iou_threshold (float): IoU threshold for NMS.
        sigma (float): hyperparameter for gaussian method
        min_score (float): score filter threshold
        method (str): one of 'naive', 'linear' or 'gaussian'
        offset (int, 0 or 1): boxes' width or height is (x2 - x1 + offset).

    Returns:
        tuple: kept dets (boxes and scores) and indice, which always have
        the same data type as the input.

    Example:
        >>> boxes = np.array([[4., 3., 5., 3.],
        >>>                   [4., 3., 5., 4.],
        >>>                   [3., 1., 3., 1.],
        >>>                   [3., 1., 3., 1.],
        >>>                   [3., 1., 3., 1.],
        >>>                   [3., 1., 3., 1.]], dtype=np.float32)
        >>> scores = np.array([0.9, 0.9, 0.5, 0.5, 0.4, 0.0], dtype=np.float32)
        >>> iou_threshold = 0.6
        >>> dets, inds = soft_nms(boxes, scores, iou_threshold, sigma=0.5)
        >>> assert len(inds) == len(dets) == 5
    """

    assert isinstance(boxes, (Tensor, np.ndarray))
    assert isinstance(scores, (Tensor, np.ndarray))
    # The return type follows ``boxes`` alone, not ``scores``.
    is_numpy = False
    if isinstance(boxes, np.ndarray):
        is_numpy = True
        boxes = torch.from_numpy(boxes)
    if isinstance(scores, np.ndarray):
        scores = torch.from_numpy(scores)
    assert boxes.size(1) == 4
    assert boxes.size(0) == scores.size(0)
    assert offset in (0, 1)
    # Integer codes understood by the extension op.
    method_dict = {'naive': 0, 'linear': 1, 'gaussian': 2}
    assert method in method_dict

    if torch.__version__ == 'parrots':
        # parrots calls the extension op directly instead of going through
        # the autograd Function wrapper.
        dets = boxes.new_empty((boxes.size(0), 5), device='cpu')
        indata_list = [boxes.cpu(), scores.cpu(), dets.cpu()]
        indata_dict = {
            'iou_threshold': float(iou_threshold),
            'sigma': float(sigma),
            # Cast like every other numeric argument (and like the
            # non-parrots branch) so int / NumPy scalars are accepted.
            'min_score': float(min_score),
            'method': method_dict[method],
            'offset': int(offset)
        }
        inds = ext_module.softnms(*indata_list, **indata_dict)
    else:
        dets, inds = SoftNMSop.apply(boxes.cpu(), scores.cpu(),
                                     float(iou_threshold), float(sigma),
                                     float(min_score), method_dict[method],
                                     int(offset))

    # ``dets`` was allocated with one row per input box; keep only as many
    # rows as indices were returned.
    dets = dets[:inds.size(0)]

    if is_numpy:
        dets = dets.cpu().numpy()
        inds = inds.cpu().numpy()
        return dets, inds
    else:
        return dets.to(device=boxes.device), inds.to(device=boxes.device)


def batched_nms(boxes: Tensor,
                scores: Tensor,
                idxs: Tensor,
                nms_cfg: Optional[Dict],
                class_agnostic: bool = False) -> Tuple[Tensor, Tensor]:
    r"""Performs non-maximum suppression in a batched fashion.

    Modified from `torchvision/ops/boxes.py#L39
    <https://github.com/pytorch/vision/blob/
    505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39>`_.
    In order to perform NMS independently per class, we add an offset to all
    the boxes. The offset is dependent only on the class idx, and is large
    enough so that boxes from different classes do not overlap.

    Note:
        In v1.4.1 and later, ``batched_nms`` supports skipping the NMS and
        returns sorted raw results when `nms_cfg` is None.

    Args:
        boxes (torch.Tensor): boxes in shape (N, 4) or (N, 5).
        scores (torch.Tensor): scores in shape (N, ).
        idxs (torch.Tensor): each index value correspond to a bbox cluster,
            and NMS will not be applied between elements of different idxs,
            shape (N, ).
        nms_cfg (dict | optional): Supports skipping the nms when `nms_cfg`
            is None, otherwise it should specify nms type and other
            parameters like `iou_threshold`. The dict is copied before any
            key is removed. Possible keys include the following.

            - type (str | callable): the NMS op, either a callable or the
              name of one visible in this module. Defaults to ``'nms'``.
            - class_agnostic (bool): overrides the ``class_agnostic``
              argument when present.
            - iou_threshold (float): IoU threshold used for NMS.
            - split_thr (float): threshold number of boxes. In some cases the
              number of boxes is large (e.g., 200k). To avoid OOM during
              training, the users could set `split_thr` to a small value.
              If the number of boxes is greater than the threshold, it will
              perform NMS on each group of boxes separately and sequentially.
              Defaults to 10000.
            - max_num (int): in the split path, keep at most this many
              results; in the non-split path it is forwarded to the NMS op
              together with every other remaining key.
        class_agnostic (bool): if true, nms is class agnostic,
            i.e. IoU thresholding happens over all boxes,
            regardless of the predicted class. Defaults to False.

    Returns:
        tuple: kept dets and indice.

        - boxes (Tensor): Bboxes with score after nms, has shape
          (num_bboxes, 5). last dimension 5 arrange as
          (x1, y1, x2, y2, score)
        - keep (Tensor): The indices of remaining boxes in input
          boxes.
    """
    # skip nms when nms_cfg is None
    if nms_cfg is None:
        scores, inds = scores.sort(descending=True)
        boxes = boxes[inds]
        return torch.cat([boxes, scores[:, None]], -1), inds

    nms_cfg_ = nms_cfg.copy()
    class_agnostic = nms_cfg_.pop('class_agnostic', class_agnostic)
    if class_agnostic or boxes.numel() == 0:
        # No per-class offset is needed for class-agnostic NMS. An empty
        # box set is also passed through untouched because ``boxes.max()``
        # raises on a tensor without elements.
        boxes_for_nms = boxes
    else:
        # When using rotated boxes, only apply offsets on center.
        if boxes.size(-1) == 5:
            # Strictly, the maximum coordinates of the rotating box
            # (x,y,w,h,a) should be calculated by polygon coordinates.
            # But the conversion from rotated box to polygon will
            # slow down the speed.
            # So we use max(x,y) + max(w,h) as max coordinate
            # which is larger than polygon max coordinate
            # max(x1, y1, x2, y2,x3, y3, x4, y4)
            max_coordinate = boxes[..., :2].max() + boxes[..., 2:4].max()
            offsets = idxs.to(boxes) * (
                max_coordinate + torch.tensor(1).to(boxes))
            boxes_ctr_for_nms = boxes[..., :2] + offsets[:, None]
            boxes_for_nms = torch.cat([boxes_ctr_for_nms, boxes[..., 2:5]],
                                      dim=-1)
        else:
            max_coordinate = boxes.max()
            offsets = idxs.to(boxes) * (
                max_coordinate + torch.tensor(1).to(boxes))
            boxes_for_nms = boxes + offsets[:, None]

    nms_op = nms_cfg_.pop('type', 'nms')
    if isinstance(nms_op, str):
        # NOTE(security): ``eval`` executes the configured string as Python
        # code, so ``nms_cfg['type']`` must never come from untrusted input.
        nms_op = eval(nms_op)

    split_thr = nms_cfg_.pop('split_thr', 10000)
    # Won't split to multiple nms nodes when exporting to onnx
    if boxes_for_nms.shape[0] < split_thr:
        dets, keep = nms_op(boxes_for_nms, scores, **nms_cfg_)
        boxes = boxes[keep]

        # This assumes `dets` has arbitrary dimensions where
        # the last dimension is score.
        # Currently it supports bounding boxes [x1, y1, x2, y2, score] or
        # rotated boxes [cx, cy, w, h, angle_radian, score].

        scores = dets[:, -1]
    else:
        max_num = nms_cfg_.pop('max_num', -1)
        total_mask = scores.new_zeros(scores.size(), dtype=torch.bool)
        # Some type of nms would reweight the score, such as SoftNMS
        scores_after_nms = scores.new_zeros(scores.size())
        for cls_id in torch.unique(idxs):
            mask = (idxs == cls_id).nonzero(as_tuple=False).view(-1)
            dets, keep = nms_op(boxes_for_nms[mask], scores[mask], **nms_cfg_)
            total_mask[mask[keep]] = True
            scores_after_nms[mask[keep]] = dets[:, -1]
        keep = total_mask.nonzero(as_tuple=False).view(-1)

        # Re-sort the survivors of all groups by their (possibly
        # re-weighted) score so the output is globally ordered.
        scores, inds = scores_after_nms[keep].sort(descending=True)
        keep = keep[inds]
        boxes = boxes[keep]

        if max_num > 0:
            keep = keep[:max_num]
            boxes = boxes[:max_num]
            scores = scores[:max_num]

    boxes = torch.cat([boxes, scores[:, None]], -1)
    return boxes, keep


def nms_match(dets: array_like_type,
              iou_threshold: float) -> List[array_like_type]:
    """Group detections by NMS instead of discarding the suppressed ones.

    NMS match works like NMS, but every suppressed bbox is recorded together
    with the kept bbox that suppressed it, so each kept bbox yields one group
    of indices. Inside a group the indices follow score order.

    Args:
        dets (torch.Tensor | np.ndarray): Det boxes with scores, shape (N, 5).
        iou_threshold (float): IoU thresh for NMS.

    Returns:
        list[torch.Tensor | np.ndarray]: One entry per matched group; each
        entry holds the indices of that group in score order. The entries
        are long tensors for tensor input and int arrays for numpy input.
        An empty list is returned when ``dets`` has no rows.
    """
    is_tensor = isinstance(dets, Tensor)

    groups = []
    if dets.shape[0] != 0:
        assert dets.shape[-1] == 5, 'inputs dets.shape should be (N, 5), ' \
                                    f'but get {dets.shape}'
        # The extension op is fed a CPU tensor in both cases.
        if is_tensor:
            dets_cpu = dets.detach().cpu()
        else:
            dets_cpu = torch.from_numpy(dets)
        groups = ext_module.nms_match(
            dets_cpu, iou_threshold=float(iou_threshold))
        if torch.__version__ == 'parrots':
            groups = groups.tolist()  # type: ignore

    if is_tensor:
        return [dets.new_tensor(group, dtype=torch.long) for group in groups]
    return [np.array(group, dtype=int) for group in groups]


def nms_rotated(dets: Tensor,
                scores: Tensor,
                iou_threshold: float,
                labels: Optional[Tensor] = None,
                clockwise: bool = True) -> Tuple[Tensor, Tensor]:
    """Run non-maximum suppression (NMS) on rotated boxes based on their
    intersection-over-union (IoU).

    Rotated NMS iteratively removes lower scoring rotated boxes which have an
    IoU greater than iou_threshold with another (higher scoring) rotated box.

    Args:
        dets (torch.Tensor):  Rotated boxes in shape (N, 5).
            They are expected to be in
            (x_ctr, y_ctr, width, height, angle_radian) format.
        scores (torch.Tensor): scores in shape (N, ).
        iou_threshold (float): IoU thresh for NMS.
        labels (torch.Tensor, optional): boxes' label in shape (N,).
        clockwise (bool): flag indicating whether the positive angular
            orientation is clockwise. default True.
            `New in version 1.4.3.`

    Returns:
        tuple: kept dets(boxes and scores) and indice, which is always the
        same data type as the input. When ``dets`` has no rows, the input
        ``dets`` is returned unchanged together with ``None``.
    """
    if dets.shape[0] == 0:
        return dets, None

    # The op works on clockwise angles; negate the last column otherwise.
    dets_cw = dets
    if not clockwise:
        flip_mat = dets.new_ones(dets.shape[-1])
        flip_mat[-1] = -1
        dets_cw = dets * flip_mat

    multi_label = labels is not None
    if multi_label:
        input_labels = labels
    else:
        input_labels = scores.new_empty(0, dtype=torch.int)

    if dets.device.type in ('npu', 'mlu'):
        # These backends receive an empty ``order`` and the unsorted boxes.
        order = scores.new_empty(0, dtype=torch.long)
        keep_inds = ext_module.nms_rotated(dets_cw, scores, order, dets_cw,
                                           input_labels, iou_threshold,
                                           multi_label)
    else:
        dets_wl = dets_cw
        if multi_label:
            dets_wl = torch.cat(
                (dets_cw, labels.unsqueeze(1)), 1)  # type: ignore
        order = scores.sort(0, descending=True)[1]
        dets_sorted = dets_wl.index_select(0, order)

        if torch.__version__ == 'parrots':
            keep_inds = ext_module.nms_rotated(
                dets_wl,
                scores,
                order,
                dets_sorted,
                input_labels,
                iou_threshold=iou_threshold,
                multi_label=multi_label)
        else:
            keep_inds = ext_module.nms_rotated(dets_wl, scores, order,
                                               dets_sorted, input_labels,
                                               iou_threshold, multi_label)

    # Output rows are the original (unflipped) boxes plus a score column.
    kept = torch.cat((dets[keep_inds], scores[keep_inds].reshape(-1, 1)),
                     dim=1)
    return kept, keep_inds


def nms_quadri(dets: Tensor,
               scores: Tensor,
               iou_threshold: float,
               labels: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
    """Run non-maximum suppression (NMS) on quadrilateral boxes based on
    their intersection-over-union (IoU).

    Quadri NMS iteratively removes lower scoring quadrilateral boxes
    which have an IoU greater than iou_threshold with another (higher
    scoring) quadrilateral box.

    Args:
        dets (torch.Tensor):  Quadri boxes in shape (N, 8).
            They are expected to be in
            (x1, y1, ..., x4, y4) format.
        scores (torch.Tensor): scores in shape (N, ).
        iou_threshold (float): IoU thresh for NMS.
        labels (torch.Tensor, optional): boxes' label in shape (N,).

    Returns:
        tuple: kept dets(boxes and scores) and indice, which is always the
        same data type as the input. When ``dets`` has no rows, the input
        ``dets`` is returned unchanged together with ``None``.
    """
    if dets.shape[0] == 0:
        return dets, None

    has_labels = labels is not None
    dets_with_labels = dets
    if has_labels:
        # Labels travel with the boxes as one extra trailing column.
        dets_with_labels = torch.cat(
            (dets, labels.unsqueeze(1)), 1)  # type: ignore

    order = scores.sort(0, descending=True)[1]
    dets_sorted = dets_with_labels.index_select(0, order)

    keep_inds = ext_module.nms_quadri(dets_with_labels, scores, order,
                                      dets_sorted, iou_threshold, has_labels)
    kept = torch.cat((dets[keep_inds], scores[keep_inds].reshape(-1, 1)),
                     dim=1)
    return kept, keep_inds


================================================
FILE: mmcv/ops/pixel_group.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Union

import numpy as np
import torch
from torch import Tensor

from ..utils import ext_loader

# Handle returned by ``ext_loader.load_ext`` for the ``_ext`` extension; only
# the ``pixel_group`` op is requested here.
ext_module = ext_loader.load_ext('_ext', ['pixel_group'])


def pixel_group(
    score: Union[np.ndarray, Tensor],
    mask: Union[np.ndarray, Tensor],
    embedding: Union[np.ndarray, Tensor],
    kernel_label: Union[np.ndarray, Tensor],
    kernel_contour: Union[np.ndarray, Tensor],
    kernel_region_num: int,
    distance_threshold: float,
) -> List[List[float]]:
    """Group pixels into text instances, which is widely used in text
    detection methods.

    Numpy inputs are converted to tensors with ``torch.from_numpy`` before
    the extension op is called.

    Arguments:
        score (np.array or torch.Tensor): The foreground score with size hxw.
        mask (np.array or Tensor): The foreground mask with size hxw.
        embedding (np.array or torch.Tensor): The embedding with size hxwxc to
            distinguish instances.
        kernel_label (np.array or torch.Tensor): The instance kernel index with
            size hxw.
        kernel_contour (np.array or torch.Tensor): The kernel contour with
            size hxw.
        kernel_region_num (int): The instance kernel region number. Must be
            an ``int``.
        distance_threshold (float): The embedding distance threshold between
            kernel and pixel in one instance. Must be a ``float``.

    Returns:
        list[list[float]]: The instance coordinates and attributes list. Each
        element consists of averaged confidence, pixel number, and coordinates
        (x_i, y_i for all pixels) in order.
    """
    assert isinstance(score, (torch.Tensor, np.ndarray))
    assert isinstance(mask, (torch.Tensor, np.ndarray))
    assert isinstance(embedding, (torch.Tensor, np.ndarray))
    assert isinstance(kernel_label, (torch.Tensor, np.ndarray))
    assert isinstance(kernel_contour, (torch.Tensor, np.ndarray))
    assert isinstance(kernel_region_num, int)
    assert isinstance(distance_threshold, float)

    if isinstance(score, np.ndarray):
        score = torch.from_numpy(score)
    if isinstance(mask, np.ndarray):
        mask = torch.from_numpy(mask)
    if isinstance(embedding, np.ndarray):
        embedding = torch.from_numpy(embedding)
    if isinstance(kernel_label, np.ndarray):
        kernel_label = torch.from_numpy(kernel_label)
    if isinstance(kernel_contour, np.ndarray):
        kernel_contour = torch.from_numpy(kernel_contour)

    if torch.__version__ == 'parrots':
        label = ext_module.pixel_group(
            score,
            mask,
            embedding,
            kernel_label,
            kernel_contour,
            kernel_region_num=kernel_region_num,
            distance_threshold=distance_threshold)
        label = label.tolist()
        label = label[0]
        # The flat result starts with ``kernel_region_num`` length entries;
        # the per-region values follow back to back, so it is split here
        # into one array per region.
        list_index = kernel_region_num
        pixel_assignment = []
        for x in range(kernel_region_num):
            region_len = int(label[x])
            # ``np.float`` was removed in NumPy 1.24; the builtin ``float``
            # it aliased yields the same float64 dtype.
            pixel_assignment.append(
                np.array(
                    label[list_index:list_index + region_len], dtype=float))
            list_index = list_index + region_len
    else:
        pixel_assignment = ext_module.pixel_group(score, mask, embedding,
                                                  kernel_label, kernel_contour,
                                                  kernel_region_num,
                                                  distance_threshold)
    return pixel_assignment


================================================
FILE: mmcv/ops/point_sample.py
================================================
# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend  # noqa

from typing import Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.nn.modules.utils import _pair


def bilinear_grid_sample(im: Tensor,
                         grid: Tensor,
                         align_corners: bool = False) -> Tensor:
    """Sample ``im`` at the locations given by a flow-field ``grid`` using
    bilinear interpolation with zero padding outside the image.

    Only the bilinear interpolation method is supported.

    Args:
        im (torch.Tensor): Input feature map, shape (N, C, H, W)
        grid (torch.Tensor): Point coordinates, shape (N, Hg, Wg, 2)
        align_corners (bool): If set to True, the extrema (-1 and 1) are
            considered as referring to the center points of the input's
            corner pixels. If set to False, they are instead considered as
            referring to the corner points of the input's corner pixels,
            making the sampling more resolution agnostic.

    Returns:
        torch.Tensor: A tensor with sampled points, shape (N, C, Hg, Wg)
    """
    n, c, h, w = im.shape
    gn, gh, gw, _ = grid.shape
    assert n == gn

    # Map normalized coordinates in [-1, 1] to pixel coordinates.
    px = grid[..., 0]
    py = grid[..., 1]
    if align_corners:
        px = ((px + 1) / 2) * (w - 1)
        py = ((py + 1) / 2) * (h - 1)
    else:
        px = ((px + 1) * w - 1) / 2
        py = ((py + 1) * h - 1) / 2
    px = px.view(n, -1)
    py = py.view(n, -1)

    # Integer corners that surround every sampling location.
    left = torch.floor(px).long()
    top = torch.floor(py).long()
    right = left + 1
    bottom = top + 1

    # Bilinear weights, computed from the unclipped corner positions.
    w_left_top = ((right - px) * (bottom - py)).unsqueeze(1)
    w_left_bottom = ((right - px) * (py - top)).unsqueeze(1)
    w_right_top = ((px - left) * (bottom - py)).unsqueeze(1)
    w_right_bottom = ((px - left) * (py - top)).unsqueeze(1)

    # A one pixel zero border reproduces grid_sample's default zero padding.
    padded = F.pad(im, pad=[1, 1, 1, 1], mode='constant', value=0)
    padded_h = h + 2
    padded_w = w + 2

    def _shift_and_clip(index: Tensor, upper: int) -> Tensor:
        # Move the index into padded coordinates, then clip to [0, upper].
        index = index + 1
        index = torch.where(index < 0, torch.tensor(0), index)
        return torch.where(index > upper, torch.tensor(upper), index)

    left = _shift_and_clip(left, padded_w - 1)
    right = _shift_and_clip(right, padded_w - 1)
    top = _shift_and_clip(top, padded_h - 1)
    bottom = _shift_and_clip(bottom, padded_h - 1)

    flat = padded.view(n, c, -1)

    def _pick(col: Tensor, row: Tensor) -> Tensor:
        # Gather the pixel at (row, col) for every channel.
        linear = (col + row * padded_w).unsqueeze(1).expand(-1, c, -1)
        return torch.gather(flat, 2, linear)

    sampled = (
        _pick(left, top) * w_left_top + _pick(left, bottom) * w_left_bottom +
        _pick(right, top) * w_right_top +
        _pick(right, bottom) * w_right_bottom)
    return sampled.reshape(n, c, gh, gw)


def normalize(grid: Tensor) -> Tensor:
    """Map a grid from the range [-1, 1] to the range [0, 1].

    Args:
        grid (torch.Tensor): The grid to be normalized, range [-1, 1].

    Returns:
        torch.Tensor: Normalized grid, range [0, 1].
    """
    shifted = grid + 1.0
    return shifted / 2.0


def denormalize(grid: Tensor) -> Tensor:
    """Map a grid from the range [0, 1] back to the range [-1, 1].

    Args:
        grid (torch.Tensor): The grid to be denormalized, range [0, 1].

    Returns:
        torch.Tensor: Denormalized grid, range [-1, 1].
    """
    stretched = grid * 2.0
    return stretched - 1.0


def generate_grid(num_grid: int, size: Tuple[int, int],
                  device: torch.device) -> Tensor:
    """Build a regular grid of points in [0, 1] x [0, 1] coordinate space
    and repeat it for every region.

    Args:
        num_grid (int): The number of grids to sample, one for each region.
        size (tuple[int, int]): The side size of the regular grid.
        device (torch.device): Desired device of returned tensor.

    Returns:
        torch.Tensor: A tensor of shape (num_grid, size[0]*size[1], 2) that
        contains coordinates for the regular grids.
    """
    # The identity affine transform yields the base grid in [-1, 1].
    identity = torch.tensor([[[1., 0., 0.], [0., 1., 0.]]], device=device)
    target_size = torch.Size((1, 1, *size))
    base_grid = F.affine_grid(identity, target_size, align_corners=False)
    unit_grid = normalize(base_grid)
    # ``expand`` shares one copy of the grid across all ``num_grid`` rows.
    return unit_grid.view(1, -1, 2).expand(num_grid, -1, -1)


def rel_roi_point_to_abs_img_point(rois: Tensor,
                                   rel_roi_points: Tensor) -> Tensor:
    """Turn point coordinates that are relative to their RoI into absolute
    image coordinates.

    The computation runs under ``torch.no_grad()``.

    Args:
        rois (torch.Tensor): RoIs or BBoxes, shape (N, 4) or (N, 5). With
            five columns the first one (batch index) is dropped.
        rel_roi_points (torch.Tensor): Point coordinates inside RoI, relative
            to RoI, location, range (0, 1), shape (N, P, 2)
    Returns:
        torch.Tensor: Image based absolute point coordinates, shape (N, P, 2)
    """
    with torch.no_grad():
        assert rel_roi_points.size(0) == rois.size(0)
        assert rois.dim() == 2
        assert rel_roi_points.dim() == 3
        assert rel_roi_points.size(2) == 2
        # remove batch idx
        boxes = rois[:, 1:] if rois.size(1) == 5 else rois
        rel = rel_roi_points.clone()
        x_min = boxes[:, None, 0]
        y_min = boxes[:, None, 1]
        # Separate x/y tensors are built and stacked afterwards instead of
        # writing into ``rel`` in place, which avoids an ONNX export error.
        xs = rel[:, :, 0] * (boxes[:, None, 2] - x_min)
        ys = rel[:, :, 1] * (boxes[:, None, 3] - y_min)
        xs += x_min
        ys += y_min
        points = torch.stack([xs, ys], dim=2)
    return points


def get_shape_from_feature_map(x: Tensor) -> Tensor:
    """Return the spatial resolution of a feature map as a tensor, which
    takes exporting to onnx into account.

    Args:
        x (torch.Tensor): Input tensor, shape (N, C, H, W)

    Returns:
        torch.Tensor: Spatial resolution (width, height), shape (1, 1, 2)
    """
    height_width = torch.tensor(x.shape[2:])
    # Flip (H, W) into (W, H) so it lines up with (x, y) point coordinates.
    width_height = height_width.flip(0).view(1, 1, 2)
    return width_height.to(x.device).float()


def abs_img_point_to_rel_img_point(abs_img_points: Tensor,
                                   img: Union[tuple, Tensor],
                                   spatial_scale: float = 1.) -> Tensor:
    """Turn absolute image point coordinates into coordinates relative to
    the image size, as needed for sampling.

    Args:
        abs_img_points (torch.Tensor): Image based absolute point coordinates,
            shape (N, P, 2)
        img (tuple or torch.Tensor): (height, width) of image or feature map.
            A tensor must have four dimensions; its last two are used.
        spatial_scale (float, optional): Scale points by this factor.
            Default: 1.

    Returns:
        Tensor: Image based relative point coordinates for sampling, shape
        (N, P, 2).
    """
    is_hw_pair = isinstance(img, tuple) and len(img) == 2
    is_feature_map = isinstance(img, torch.Tensor) and len(img.shape) == 4
    assert is_hw_pair or is_feature_map

    if not isinstance(img, tuple):
        scale = get_shape_from_feature_map(img)
    else:
        height, width = img
        # Points are (x, y), so the divisor is ordered (width, height).
        scale = torch.tensor([width, height],
                             dtype=torch.float,
                             device=abs_img_points.device).view(1, 1, 2)

    return abs_img_points / scale * spatial_scale


def rel_roi_point_to_rel_img_point(rois: Tensor,
                                   rel_roi_points: Tensor,
                                   img: Union[tuple, Tensor],
                                   spatial_scale: float = 1.) -> Tensor:
    """Convert roi based relative point coordinates to image based relative
    point coordinates.

    The points are first converted to absolute image coordinates and then
    made relative to the image size.

    Args:
        rois (torch.Tensor): RoIs or BBoxes, shape (N, 4) or (N, 5)
        rel_roi_points (torch.Tensor): Point coordinates inside RoI, relative
            to RoI, location, range (0, 1), shape (N, P, 2)
        img (tuple or torch.Tensor): (height, width) of image or feature map.
        spatial_scale (float, optional): Scale points by this factor.
            Default: 1.

    Returns:
        torch.Tensor: Image based relative point coordinates for sampling,
        shape (N, P, 2).
    """
    return abs_img_point_to_rel_img_point(
        rel_roi_point_to_abs_img_point(rois, rel_roi_points), img,
        spatial_scale)


def point_sample(input: Tensor,
                 points: Tensor,
                 align_corners: bool = False,
                 **kwargs) -> Tensor:
    """Wrap :func:`grid_sample` so it also accepts 3D point_coords tensors.

    Unlike :func:`torch.nn.functional.grid_sample` it assumes point_coords to
    lie inside ``[0, 1] x [0, 1]`` square.

    Args:
        input (torch.Tensor): Feature map, shape (N, C, H, W).
        points (torch.Tensor): Image based absolute point coordinates
            (normalized), range [0, 1] x [0, 1], shape (N, P, 2) or
            (N, Hgrid, Wgrid, 2).
        align_corners (bool, optional): Whether align_corners.
            Default: False
        **kwargs: Extra keyword arguments forwarded to
            :func:`torch.nn.functional.grid_sample`.

    Returns:
        torch.Tensor: Features of `point` on `input`, shape (N, C, P) or
        (N, C, Hgrid, Wgrid).
    """
    # (N, P, 2) points get a dummy grid axis that is removed again below.
    squeeze_back = points.dim() == 3
    if squeeze_back:
        points = points.unsqueeze(2)
    sampled = F.grid_sample(
        input, denormalize(points), align_corners=align_corners, **kwargs)
    return sampled.squeeze(3) if squeeze_back else sampled


class SimpleRoIAlign(nn.Module):
    """RoI feature extractor that samples a regular grid of points inside
    every RoI with :func:`point_sample`."""

    def __init__(self,
                 output_size: Tuple[int],
                 spatial_scale: float,
                 aligned: bool = True) -> None:
        """Simple RoI align in PointRend, faster than standard RoIAlign.

        Args:
            output_size (tuple[int]): h, w
            spatial_scale (float): scale the input boxes by this number
            aligned (bool): if False, use the legacy implementation in
                MMDetection, align_corners=True will be used in F.grid_sample.
                If True, align the results more perfectly.
        """

        super().__init__()
        self.output_size = _pair(output_size)
        self.spatial_scale = float(spatial_scale)
        # to be consistent with other RoI ops
        self.use_torchvision = False
        self.aligned = aligned

    def forward(self, features: Tensor, rois: Tensor) -> Tensor:
        """Extract one feature grid per RoI.

        Args:
            features (torch.Tensor): Feature maps, shape (N, C, H, W).
            rois (torch.Tensor): RoIs whose first column is the index of the
                image they belong to; the remaining columns are handed to
                ``rel_roi_point_to_rel_img_point``.

        Returns:
            torch.Tensor: RoI features, shape
            (num_rois, C, output_size[0], output_size[1]). Per-image results
            are concatenated in ascending batch order, so rows line up with
            ``rois`` only when the RoIs are grouped by ascending batch index.
        """
        num_imgs = features.size(0)
        num_rois = rois.size(0)
        channels = features.size(1)

        # Without any RoI there is nothing to sample and ``torch.cat`` would
        # fail on an empty list, so return an empty result directly.
        if num_rois == 0:
            return features.new_zeros((0, channels, *self.output_size))

        rel_roi_points = generate_grid(
            num_rois, self.output_size, device=rois.device)

        point_feats = []
        for batch_ind in range(num_imgs):
            # unravel batch dim
            feat = features[batch_ind].unsqueeze(0)
            inds = (rois[:, 0].long() == batch_ind)
            if inds.any():
                rel_img_points = rel_roi_point_to_rel_img_point(
                    rois[inds], rel_roi_points[inds], feat,
                    self.spatial_scale).unsqueeze(0)
                point_feat = point_sample(
                    feat, rel_img_points, align_corners=not self.aligned)
                point_feat = point_feat.squeeze(0).transpose(0, 1)
                point_feats.append(point_feat)

        point_feats_t = torch.cat(point_feats, dim=0)

        roi_feats = point_feats_t.reshape(num_rois, channels,
                                          *self.output_size)

        return roi_feats

    def __repr__(self) -> str:
        """Return a summary string with the configuration of the module."""
        return (f'{self.__class__.__name__}('
                f'output_size={self.output_size}, '
                f'spatial_scale={self.spatial_scale}, '
                f'aligned={self.aligned})')


================================================
FILE: mmcv/ops/points_in_boxes.py
================================================
import torch
from mmengine.device import is_cuda_available, is_musa_available
from torch import Tensor

from ..utils import ext_loader

# Handle returned by ``ext_loader.load_ext`` for the ``_ext`` extension; the
# three forward ops used by the functions below are requested by name.
ext_module = ext_loader.load_ext('_ext', [
    'points_in_boxes_part_forward', 'points_in_boxes_cpu_forward',
    'points_in_boxes_all_forward'
])


def points_in_boxes_part(points: Tensor, boxes: Tensor) -> Tensor:
    """Find the box in which each point is (CUDA/MUSA).

    Args:
        points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate.
        boxes (torch.Tensor): [B, T, 7],
            num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz] in
            LiDAR/DEPTH coordinate, (x, y, z) is the bottom center. The
            input tensor is never modified.

    Returns:
        torch.Tensor: Return the box indices of points with the shape of
        (B, M). Default background = -1.
    """
    assert points.shape[0] == boxes.shape[0], \
        'Points and boxes should have the same batch size, ' \
        f'but got {points.shape[0]} and {boxes.shape[0]}'
    assert boxes.shape[2] == 7, \
        'boxes dimension should be 7, ' \
        f'but got unexpected shape {boxes.shape[2]}'
    assert points.shape[2] == 3, \
        'points dimension should be 3, ' \
        f'but got unexpected shape {points.shape[2]}'
    batch_size, num_points, _ = points.shape

    box_idxs_of_pts = points.new_zeros((batch_size, num_points),
                                       dtype=torch.int).fill_(-1)

    # If manually put the tensor 'points' or 'boxes' on a device
    # which is not the current device, some temporary variables
    # will be created on the current device in the cuda/musa op,
    # and the output will be incorrect.
    # Therefore, we force the current device to be the same
    # as the device of the tensors if it was not.
    # Please refer to https://github.com/open-mmlab/mmdetection3d/issues/305
    # for the incorrect output before the fix.
    points_device = points.get_device()
    assert points_device == boxes.get_device(), \
        'Points and boxes should be put on the same device'
    if points.device.type != 'npu':
        if is_cuda_available():
            if torch.cuda.current_device() != points_device:
                torch.cuda.set_device(points_device)
        elif is_musa_available():
            if torch.musa.current_device() != points_device:
                torch.musa.set_device(points_device)
    else:
        # On NPU, z is raised by half of z_size before the op is called
        # (presumably the kernel expects the geometric center - TODO
        # confirm). Work on a copy so the caller's ``boxes`` are not shifted
        # again on every call.
        boxes = boxes.clone()
        boxes[:, :, 2] += boxes[:, :, 5] / 2.0

    ext_module.points_in_boxes_part_forward(boxes.contiguous(),
                                            points.contiguous(),
                                            box_idxs_of_pts)

    return box_idxs_of_pts


def points_in_boxes_cpu(points: Tensor, boxes: Tensor) -> Tensor:
    """Find all boxes that contain each point (CPU). This is the CPU
    counterpart of :meth:`points_in_boxes_all`.

    Args:
        points (torch.Tensor): [B, M, 3], [x, y, z] in
            LiDAR/DEPTH coordinate
        boxes (torch.Tensor): [B, T, 7],
            num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz],
            (x, y, z) is the bottom center.

    Returns:
        torch.Tensor: Return the box indices of points with the shape of
        (B, M, T). Default background = 0.
    """
    assert points.shape[0] == boxes.shape[0], \
        'Points and boxes should have the same batch size, ' \
        f'but got {points.shape[0]} and {boxes.shape[0]}'
    assert boxes.shape[2] == 7, \
        'boxes dimension should be 7, ' \
        f'but got unexpected shape {boxes.shape[2]}'
    assert points.shape[2] == 3, \
        'points dimension should be 3, ' \
        f'but got unexpected shape {points.shape[2]}'
    batch_size, num_points = points.shape[0], points.shape[1]
    num_boxes = boxes.shape[1]

    # The op fills a (T, M) slice per sample; it is transposed at the end.
    per_box = points.new_zeros((batch_size, num_boxes, num_points),
                               dtype=torch.int)
    for sample_boxes, sample_points, sample_out in zip(
            boxes, points, per_box):
        ext_module.points_in_boxes_cpu_forward(
            sample_boxes.float().contiguous(),
            sample_points.float().contiguous(), sample_out)

    return per_box.transpose(1, 2)


def points_in_boxes_all(points: Tensor, boxes: Tensor) -> Tensor:
    """Find all boxes in which each point is (CUDA/MUSA).

    Args:
        points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate
        boxes (torch.Tensor): [B, T, 7],
            num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz],
            (x, y, z) is the bottom center.

    Returns:
        torch.Tensor: Return the box indices of points with the shape of
        (B, M, T). Default background = 0.
    """
    assert points.shape[0] == boxes.shape[0], \
        'Points and boxes should have the same batch size, ' \
        f'but got {points.shape[0]} and {boxes.shape[0]}'
    assert boxes.shape[2] == 7, \
        'boxes dimension should be 7, ' \
        f'but got unexpected shape {boxes.shape[2]}'
    assert points.shape[2] == 3, \
        'points dimension should be 3, ' \
        f'but got unexpected shape {points.shape[2]}'
    batch_size, num_points, _ = points.shape
    num_boxes = boxes.shape[1]

    box_idxs_of_pts = points.new_zeros((batch_size, num_points, num_boxes),
                                       dtype=torch.int)

    # The cuda/musa op creates temporary variables on the current device, so
    # the output is incorrect when the tensors live on another device.
    # Force the current device to match the device of the tensors.
    points_device = points.get_device()
    assert points_device == boxes.get_device(), \
        'Points and boxes should be put on the same device'
    if points.device.type != 'npu':
        if is_cuda_available():
            if torch.cuda.current_device() != points_device:
                torch.cuda.set_device(points_device)
        elif is_musa_available():
            if torch.musa.current_device() != points_device:
                torch.musa.set_device(points_device)

    ext_module.points_in_boxes_all_forward(boxes.contiguous(),
                                           points.contiguous(),
                                           box_idxs_of_pts)

    return box_idxs_of_pts


================================================
FILE: mmcv/ops/points_in_polygons.py
================================================
import torch
from torch import Tensor

from ..utils import ext_loader

# Handle to the `_ext` extension obtained through ext_loader;
# `points_in_polygons_forward` is the only op this module calls on it.
ext_module = ext_loader.load_ext('_ext', ['points_in_polygons_forward'])


def points_in_polygons(points: Tensor, polygons: Tensor) -> Tensor:
    """Test every point against every polygon for containment.

    This is used in the ATSS assignment for rotated boxes. When a point lies
    exactly on a polygon boundary the judgment may be inaccurate, but the
    effect on assignment is limited.

    Args:
        points (torch.Tensor): Shape (B, 2), each row being (x, y). B is the
            number of points.
        polygons (torch.Tensor): Shape (M, 8), each row being
            (x1, y1, x2, y2, x3, y3, x4, y4). M is the number of polygons.

    Returns:
        torch.Tensor: float32 tensor of shape (B, M) on the device of
        ``points``; 1 indicates that the point is inside the polygon and
        0 that it is outside.
    """
    assert points.size(1) == 2, \
        'points dimension should be 2, ' \
        f'but got unexpected shape {points.size(1)}'
    assert polygons.size(1) == 8, \
        'polygons dimension should be 8, ' \
        f'but got unexpected shape {polygons.size(1)}'

    # Result buffer, filled in place by the extension op.
    result_shape = (points.size(0), polygons.size(0))
    inside = torch.zeros(
        result_shape, dtype=torch.float32, device=points.device)
    ext_module.points_in_polygons_forward(points.contiguous(),
                                          polygons.contiguous(), inside)
    return inside


================================================
FILE: mmcv/ops/points_sampler.py
================================================
from typing import List

import torch
from torch import Tensor
from torch import nn as nn

from .furthest_point_sample import (furthest_point_sample,
                                    furthest_point_sample_with_dist)


def calc_square_dist(point_feat_a: Tensor,
                     point_feat_b: Tensor,
                     norm: bool = True) -> Tensor:
    """Compute pairwise distances between two batched sets of features.

    The pairwise Euclidean distance is obtained with ``torch.cdist``. What is
    returned depends on ``norm``:

    - ``norm=True``: the Euclidean distance divided by the channel count C.
    - ``norm=False``: the squared Euclidean distance.

    Args:
        point_feat_a (torch.Tensor): (B, N, C) Feature vector of each point.
        point_feat_b (torch.Tensor): (B, M, C) Feature vector of each point.
        norm (bool, optional): Whether to return the channel-normalized
            distance instead of the squared distance. Default: True.

    Returns:
        torch.Tensor: (B, N, M) Distance between each point pair.
    """
    pairwise = torch.cdist(point_feat_a, point_feat_b)
    if not norm:
        return torch.square(pairwise)
    return pairwise / point_feat_a.shape[-1]


def get_sampler_cls(sampler_type: str) -> type:
    """Get the type and mode of points sampler.

    Args:
        sampler_type (str): The type of points sampler.
            The valid value are "D-FPS", "F-FPS", or "FS".

    Returns:
        class: Points sampler type (the class itself, not an instance).

    Raises:
        KeyError: If ``sampler_type`` is not one of the supported names.
    """
    sampler_mappings = {
        'D-FPS': DFPSSampler,
        'F-FPS': FFPSSampler,
        'FS': FSSampler,
    }
    if sampler_type not in sampler_mappings:
        # Build the message from adjacent literals so that no source
        # indentation leaks into it, and raise outside of an ``except``
        # block so the traceback is not chained to an internal lookup error.
        raise KeyError(
            f'Supported `sampler_type` are {list(sampler_mappings)}, '
            f'but got {sampler_type}')
    return sampler_mappings[sampler_type]


class PointsSampler(nn.Module):
    """Points sampling.

    Args:
        num_point (list[int]): Number of sample points.
        fps_mod_list (list[str], optional): Type of FPS method, valid mod
            ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS'].
            F-FPS: using feature distances for FPS.
            D-FPS: using Euclidean distances of points for FPS.
            FS: using F-FPS and D-FPS simultaneously.
        fps_sample_range_list (list[int], optional):
            Range of points to apply FPS. Default: [-1].
    """

    def __init__(self,
                 num_point: List[int],
                 fps_mod_list: List[str] = ['D-FPS'],
                 fps_sample_range_list: List[int] = [-1]) -> None:
        super().__init__()
        # FPS would be applied to different fps_mod in the list,
        # so the length of the num_point should be equal to
        # fps_mod_list and fps_sample_range_list.
        assert len(num_point) == len(fps_mod_list) == len(
            fps_sample_range_list)
        # Keep private copies: the arguments may be the shared default lists
        # or lists the caller keeps mutating, and the module must not alias
        # either of them.
        self.num_point = list(num_point)
        self.fps_sample_range_list = list(fps_sample_range_list)
        self.samplers = nn.ModuleList()
        for fps_mod in fps_mod_list:
            self.samplers.append(get_sampler_cls(fps_mod)())
        self.fp16_enabled = False

    def forward(self, points_xyz: Tensor, features: Tensor) -> Tensor:
        """
        Args:
            points_xyz (torch.Tensor): (B, N, 3) xyz coordinates of
                the points.
            features (torch.Tensor): (B, C, N) features of the points.
                May be None, in which case None is forwarded to the
                samplers.

        Returns:
            torch.Tensor: (B, num_indices) Indices of sampled points, i.e.
            the per-sampler results concatenated along dim 1.
        """
        # Half precision inputs are promoted to float32 before sampling.
        if points_xyz.dtype == torch.half:
            points_xyz = points_xyz.to(torch.float32)
        if features is not None and features.dtype == torch.half:
            features = features.to(torch.float32)

        indices = []
        last_fps_end_index = 0
        for fps_sample_range, sampler, npoint in zip(
                self.fps_sample_range_list, self.samplers, self.num_point):
            assert fps_sample_range < points_xyz.shape[1]

            # -1 means "up to the last point", i.e. an open-ended slice.
            end_index = None if fps_sample_range == -1 else fps_sample_range
            sample_points_xyz = points_xyz[:, last_fps_end_index:end_index]
            if features is not None:
                sample_features = features[:, :,
                                           last_fps_end_index:end_index]
            else:
                sample_features = None

            fps_idx = sampler(sample_points_xyz.contiguous(), sample_features,
                              npoint)

            # The sampler indexes into the slice; shift back to indices of
            # the full point set.
            indices.append(fps_idx + last_fps_end_index)
            # The next sampler starts where this range ended.
            # NOTE(review): a range of -1 anywhere but last leaves the start
            # index at -1 for the next sampler - confirm this is never
            # configured.
            last_fps_end_index = fps_sample_range

        return torch.cat(indices, dim=1)


class DFPSSampler(nn.Module):
    """FPS sampler working on Euclidean distances of points (D-FPS)."""

    def __init__(self) -> None:
        super().__init__()

    def forward(self, points: Tensor, features: Tensor, npoint: int) -> Tensor:
        """Sample ``npoint`` indices with D-FPS.

        ``features`` is accepted for interface parity with the other
        samplers and is not used here.
        """
        xyz = points.contiguous()
        return furthest_point_sample(xyz, npoint)


class FFPSSampler(nn.Module):
    """FPS sampler working on feature distances (F-FPS)."""

    def __init__(self) -> None:
        super().__init__()

    def forward(self, points: Tensor, features: Tensor, npoint: int) -> Tensor:
        """Sample ``npoint`` indices with F-FPS.

        The point coordinates and the transposed features are concatenated
        along the last dimension, and the pairwise distances of those joint
        vectors (computed with ``norm=False``) drive the sampling.
        """
        assert features is not None, \
            'feature input to FFPS_Sampler should not be None'
        # features.transpose(1, 2) lines the features up with ``points`` so
        # both can be joined along dim 2.
        joint = torch.cat([points, features.transpose(1, 2)], dim=2)
        pairwise_dist = calc_square_dist(joint, joint, norm=False)
        return furthest_point_sample_with_dist(pairwise_dist, npoint)


class FSSampler(nn.Module):
    """Sampler combining F-FPS and D-FPS (FS)."""

    def __init__(self) -> None:
        super().__init__()

    def forward(self, points: Tensor, features: Tensor, npoint: int) -> Tensor:
        """Sample with both F-FPS and D-FPS and join the results.

        Each sampler is asked for ``npoint`` indices; the F-FPS indices come
        first, followed by the D-FPS indices, concatenated along dim 1.
        """
        assert features is not None, \
            'feature input to FS_Sampler should not be None'
        feature_sampler, distance_sampler = FFPSSampler(), DFPSSampler()
        idx_by_feature = feature_sampler(points, features, npoint)
        idx_by_distance = distance_sampler(points, features, npoint)
        return torch.cat([idx_by_feature, idx_by_distance], dim=1)


================================================
FILE: mmcv/ops/prroi_pool.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple, Union

import torch
import torch.nn as nn
from mmengine.utils.dl_utils import TORCH_VERSION
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair

from ..utils import ext_loader

# Handle to the `_ext` extension obtained through ext_loader; the listed names
# are the ops this module invokes on it (forward, gradient w.r.t. the
# features and gradient w.r.t. the RoI coordinates).
ext_module = ext_loader.load_ext(
    '_ext',
    ['prroi_pool_forward', 'prroi_pool_backward', 'prroi_pool_coor_backward'])


class PrRoIPoolFunction(Function):
    """Autograd function dispatching precise RoI pooling to the extension."""

    @staticmethod
    def symbolic(g, features, rois, output_size, spatial_scale):
        """Emit the custom ``mmcv::PrRoIPool`` node during ONNX export."""
        return g.op(
            'mmcv::PrRoIPool',
            features,
            rois,
            pooled_height_i=int(output_size[0]),
            pooled_width_i=int(output_size[1]),
            spatial_scale_f=float(spatial_scale))

    @staticmethod
    def forward(ctx,
                features: torch.Tensor,
                rois: torch.Tensor,
                output_size: Tuple,
                spatial_scale: float = 1.0) -> torch.Tensor:
        """Pool ``features`` inside every RoI.

        Args:
            features (torch.Tensor): float32 feature map.
            rois (torch.Tensor): float32 RoIs; one output entry is produced
                per row.
            output_size (Tuple): (pooled_height, pooled_width).
            spatial_scale (float): Scale passed on to the op.
                Defaults to 1.0.

        Returns:
            torch.Tensor: Tensor of shape
            (rois.size(0), features.size(1), pooled_height, pooled_width).

        Raises:
            ValueError: If ``features`` or ``rois`` is not float32.
        """
        if features.dtype != torch.float32 or rois.dtype != torch.float32:
            # ``dtype`` is an attribute, not a method: calling it would raise
            # a TypeError and hide the intended error.
            raise ValueError('Precise RoI Pooling only takes float input, got '
                             f'{features.dtype} for features and '
                             f'{rois.dtype} for rois.')

        pooled_height = int(output_size[0])
        pooled_width = int(output_size[1])
        spatial_scale = float(spatial_scale)

        features = features.contiguous()
        rois = rois.contiguous()
        output_shape = (rois.size(0), features.size(1), pooled_height,
                        pooled_width)
        output = features.new_zeros(output_shape)
        params = (pooled_height, pooled_width, spatial_scale)

        ext_module.prroi_pool_forward(
            features,
            rois,
            output,
            pooled_height=params[0],
            pooled_width=params[1],
            spatial_scale=params[2])
        ctx.params = params
        # everything here is contiguous.
        ctx.save_for_backward(features, rois, output)

        return output

    @staticmethod
    @once_differentiable
    def backward(
        ctx, grad_output: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, None, None, None]:
        """Compute gradients w.r.t. the features and the RoI coordinates.

        Each gradient buffer starts as zeros and is only filled by the
        extension when the corresponding input requires a gradient (or
        always under parrots).
        """
        features, rois, output = ctx.saved_tensors
        grad_input = grad_output.new_zeros(*features.shape)
        grad_coor = grad_output.new_zeros(*rois.shape)
        pooled_height, pooled_width, spatial_scale = ctx.params

        # Both kernels take a contiguous gradient; convert it once.
        grad_output = grad_output.contiguous()

        if features.requires_grad or TORCH_VERSION == 'parrots':
            ext_module.prroi_pool_backward(
                grad_output,
                rois,
                grad_input,
                pooled_height=pooled_height,
                pooled_width=pooled_width,
                spatial_scale=spatial_scale)
        if rois.requires_grad or TORCH_VERSION == 'parrots':
            ext_module.prroi_pool_coor_backward(
                output,
                grad_output,
                features,
                rois,
                grad_coor,
                pooled_height=pooled_height,
                pooled_width=pooled_width,
                spatial_scale=spatial_scale)

        # Gradients for (features, rois) followed by None placeholders for
        # the non-tensor arguments.
        return grad_input, grad_coor, None, None, None


# Functional entry point: prroi_pool(features, rois, output_size,
# spatial_scale).
prroi_pool = PrRoIPoolFunction.apply


class PrRoIPool(nn.Module):
    """Precise RoI pooling layer. The implementation of PrRoIPool is
    modified from https://github.com/vacancy/PreciseRoIPooling/

    Precise RoI Pooling (PrRoIPool) is an integration-based (bilinear
    interpolation) average pooling method for RoI Pooling. It performs no
    quantization and its gradient w.r.t. the bounding box coordinates is
    continuous. Compared with related operators it is:

    1. different from the original RoI Pooling proposed in Fast R-CNN: each
    bin is average pooled rather than max pooled, and the gradient on the
    bounding box coordinates is continuous, so a loss can be differentiated
    w.r.t. the coordinates of each RoI and the RoI coordinates optimized.
    2. different from the RoI Align proposed in Mask R-CNN: a full
    integration-based average pooling replaces the sampling of a constant
    number of points, which makes the gradient w.r.t. the coordinates
    continuous.

    Args:
        output_size (Union[int, tuple]): h, w.
        spatial_scale (float, optional): scale the input boxes by this number.
            Defaults to 1.0.
    """

    def __init__(self,
                 output_size: Union[int, tuple],
                 spatial_scale: float = 1.0):
        super().__init__()

        # An int is expanded to an (h, w) pair.
        self.output_size = _pair(output_size)
        self.spatial_scale = float(spatial_scale)

    def forward(self, features: torch.Tensor,
                rois: torch.Tensor) -> torch.Tensor:
        """Pool the features of every RoI.

        Args:
            features (torch.Tensor): The feature map.
            rois (torch.Tensor): The RoI bboxes in [tl_x, tl_y, br_x, br_y]
                format.

        Returns:
            torch.Tensor: The pooled results.
        """
        pooled = prroi_pool(features, rois, self.output_size,
                            self.spatial_scale)
        return pooled

    def __repr__(self):
        cls_name = self.__class__.__name__
        return (f'{cls_name}(output_size={self.output_size}, '
                f'spatial_scale={self.spatial_scale})')


================================================
FILE: mmcv/ops/psa_mask.py
================================================
# Modified from https://github.com/hszhao/semseg/blob/master/lib/psa
from typing import Optional, Tuple

import torch
from torch import nn
from torch.autograd import Function
from torch.nn.modules.utils import _pair

from ..utils import ext_loader

# Handle to the `_ext` extension obtained through ext_loader; the listed names
# are the forward and backward ops this module invokes on it.
ext_module = ext_loader.load_ext('_ext',
                                 ['psamask_forward', 'psamask_backward'])


class PSAMaskFunction(Function):
    """Autograd function dispatching the PSA mask op to the extension."""

    @staticmethod
    def symbolic(g, input, psa_type, mask_size):
        """Emit the custom ``mmcv::MMCVPSAMask`` node during ONNX export."""
        return g.op(
            'mmcv::MMCVPSAMask',
            input,
            psa_type_i=psa_type,
            mask_size_i=mask_size)

    @staticmethod
    def forward(ctx, input: torch.Tensor, psa_type: str,
                mask_size: int) -> torch.Tensor:
        """Run the forward op.

        ``input`` must be 4-D with ``h_mask * w_mask`` channels; the result
        has ``h_feature * w_feature`` channels and the same batch and
        spatial sizes as ``input``.
        """
        ctx.psa_type = psa_type
        ctx.mask_size = _pair(mask_size)
        ctx.save_for_backward(input)

        mask_h, mask_w = ctx.mask_size
        num, num_channels, feat_h, feat_w = input.size()
        assert num_channels == mask_h * mask_w
        result = input.new_zeros((num, feat_h * feat_w, feat_h, feat_w))

        # Geometry description handed to the extension op.
        geometry = dict(
            psa_type=psa_type,
            num_=num,
            h_feature=feat_h,
            w_feature=feat_w,
            h_mask=mask_h,
            w_mask=mask_w,
            half_h_mask=(mask_h - 1) // 2,
            half_w_mask=(mask_w - 1) // 2)
        ext_module.psamask_forward(input, result, **geometry)
        return result

    @staticmethod
    def backward(
            ctx, grad_output: torch.Tensor
    ) -> Tuple[torch.Tensor, None, None, None]:
        """Run the backward op into a zero-initialized input gradient."""
        saved_input = ctx.saved_tensors[0]
        mask_h, mask_w = ctx.mask_size
        num, num_channels, feat_h, feat_w = saved_input.size()
        grad_input = grad_output.new_zeros(
            (num, num_channels, feat_h, feat_w))

        # Same geometry description as in forward.
        geometry = dict(
            psa_type=ctx.psa_type,
            num_=num,
            h_feature=feat_h,
            w_feature=feat_w,
            h_mask=mask_h,
            w_mask=mask_w,
            half_h_mask=(mask_h - 1) // 2,
            half_w_mask=(mask_w - 1) // 2)
        ext_module.psamask_backward(grad_output, grad_input, **geometry)
        return grad_input, None, None, None


# Functional entry point: psa_mask(input, psa_type, mask_size).
psa_mask = PSAMaskFunction.apply


class PSAMask(nn.Module):
    """Module wrapper around :func:`psa_mask`.

    Args:
        psa_type (str): Either 'collect' or 'distribute'. It is translated
            to the integer code handed to the op: 0 for 'collect' and 1 for
            'distribute'.
        mask_size (tuple, optional): Mask size forwarded to the op.
            Defaults to None.
    """

    def __init__(self, psa_type: str, mask_size: Optional[tuple] = None):
        super().__init__()
        assert psa_type in ['collect', 'distribute']
        self.psa_type_enum = 0 if psa_type == 'collect' else 1
        self.mask_size = mask_size
        self.psa_type = psa_type

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Apply the PSA mask op to ``input``."""
        return psa_mask(input, self.psa_type_enum, self.mask_size)

    def __repr__(self):
        cls_name = self.__class__.__name__
        return (f'{cls_name}(psa_type={self.psa_type}, '
                f'mask_size={self.mask_size})')


================================================
FILE: mmcv/ops/riroi_align_rotated.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Any, Optional, Tuple, Union

import torch
import torch.nn as nn
from mmengine.utils import is_tuple_of
from torch.autograd import Function

from ..utils import ext_loader

# Handle to the `_ext` extension obtained through ext_loader; the listed names
# are the forward and backward ops this module invokes on it.
ext_module = ext_loader.load_ext(
    '_ext', ['riroi_align_rotated_forward', 'riroi_align_rotated_backward'])


class RiRoIAlignRotatedFunction(Function):
    """Autograd function dispatching RiRoI align (rotated) to the extension."""

    @staticmethod
    def forward(ctx: Any,
                features: torch.Tensor,
                rois: torch.Tensor,
                out_size: Union[int, tuple],
                spatial_scale: float,
                num_samples: int = 0,
                num_orientations: int = 8,
                clockwise: bool = False) -> torch.Tensor:
        """Pool ``features`` inside every rotated RoI.

        Args:
            features (torch.Tensor): 4-D feature map.
            rois (torch.Tensor): RoIs; one output entry is produced per row.
            out_size (int | tuple[int]): Output size, either one int used
                for both height and width or an (h, w) tuple of ints.
            spatial_scale (float): Scale passed on to the op.
            num_samples (int): Sample count passed on to the op.
            num_orientations (int): Orientation count passed on to the op.
            clockwise (bool): Angle convention flag passed on to the op.

        Returns:
            torch.Tensor: Tensor of shape
            (rois.size(0), features.size(1), out_h, out_w).

        Raises:
            TypeError: If ``out_size`` is neither an int nor a tuple of ints.
        """
        if isinstance(out_size, int):
            out_h = out_size
            out_w = out_size
        elif is_tuple_of(out_size, int):
            assert len(out_size) == 2
            out_h, out_w = out_size
        else:
            raise TypeError(
                f'"out_size" should be an integer or tuple of integers,'
                f' but got {out_size}')
        ctx.spatial_scale = spatial_scale
        ctx.num_samples = num_samples
        ctx.num_orientations = num_orientations
        ctx.clockwise = clockwise
        ctx.save_for_backward(rois)
        ctx.feature_size = features.size()

        # The 4-way unpack also enforces a 4-D feature map.
        _, num_channels, _, _ = features.size()
        num_rois = rois.size(0)

        output = features.new_zeros(num_rois, num_channels, out_h, out_w)

        ext_module.riroi_align_rotated_forward(
            features,
            rois,
            output,
            pooled_height=out_h,
            pooled_width=out_w,
            spatial_scale=spatial_scale,
            num_samples=num_samples,
            num_orientations=num_orientations,
            clockwise=clockwise)
        return output

    @staticmethod
    def backward(
        ctx: Any, grad_output: torch.Tensor
    ) -> Optional[Tuple[Optional[torch.Tensor], None, None, None, None, None,
                        None]]:
        """Compute the gradient w.r.t. the features.

        Returns:
            tuple: One entry per ``forward`` input. Only the first entry
            (features) can carry a gradient; it is None when the features
            do not require one.
        """
        feature_size = ctx.feature_size
        spatial_scale = ctx.spatial_scale
        num_orientations = ctx.num_orientations
        clockwise = ctx.clockwise
        num_samples = ctx.num_samples
        rois = ctx.saved_tensors[0]
        assert feature_size is not None
        batch_size, num_channels, feature_h, feature_w = feature_size

        out_w = grad_output.size(3)
        out_h = grad_output.size(2)

        grad_input = None

        if ctx.needs_input_grad[0]:
            # NOTE(review): the buffer takes dtype/device from ``rois``, not
            # from ``grad_output`` - confirm this is intended when the two
            # differ.
            grad_input = rois.new_zeros(batch_size, num_channels, feature_h,
                                        feature_w)
            ext_module.riroi_align_rotated_backward(
                grad_output.contiguous(),
                rois,
                grad_input,
                pooled_height=out_h,
                pooled_width=out_w,
                spatial_scale=spatial_scale,
                num_samples=num_samples,
                num_orientations=num_orientations,
                clockwise=clockwise)

        # Always hand back one entry per forward input, even when no
        # gradient is needed for the features.
        return grad_input, None, None, None, None, None, None


# Functional entry point mirroring RiRoIAlignRotatedFunction.forward.
riroi_align_rotated = RiRoIAlignRotatedFunction.apply


class RiRoIAlignRotated(nn.Module):
    """Rotation-invariant RoI align pooling layer for rotated proposals.

    The layer takes a feature map of shape (N, C, H, W) together with rois
    of shape (n, 6), where every roi is decoded as (batch_index, center_x,
    center_y, w, h, angle) and the angle is given in radian.

    The details are described in the paper `ReDet: A Rotation-equivariant
    Detector for Aerial Object Detection  <https://arxiv.org/abs/2103.07733>`_.

    Args:
        out_size (tuple): fixed dimensional RoI output with shape (h, w).
        spatial_scale (float): scale the input boxes by this number
        num_samples (int): number of inputs samples to take for each
            output sample. 0 to take samples densely for current models.
        num_orientations (int): number of oriented channels.
        clockwise (bool): If True, the angle in each proposal follows a
            clockwise fashion in image space, otherwise, the angle is
            counterclockwise. Default: False.
    """

    def __init__(self,
                 out_size: tuple,
                 spatial_scale: float,
                 num_samples: int = 0,
                 num_orientations: int = 8,
                 clockwise: bool = False):
        super().__init__()

        # Numeric settings are coerced to their canonical Python types;
        # ``out_size`` and ``clockwise`` are stored as given.
        self.out_size = out_size
        self.spatial_scale = float(spatial_scale)
        self.num_samples = int(num_samples)
        self.num_orientations = int(num_orientations)
        self.clockwise = clockwise

    def forward(self, features: torch.Tensor,
                rois: torch.Tensor) -> torch.Tensor:
        """Pool ``features`` inside ``rois`` with the stored settings."""
        settings = (self.out_size, self.spatial_scale, self.num_samples,
                    self.num_orientations, self.clockwise)
        return RiRoIAlignRotatedFunction.apply(features, rois, *settings)


================================================
FILE: mmcv/ops/roi_align.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Any

import torch
import torch.nn as nn
from mmengine.utils import deprecated_api_warning
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair

from ..utils import ext_loader

# Handle to the `_ext` extension obtained through ext_loader; the listed names
# are the forward and backward ops this module invokes on it.
ext_module = ext_loader.load_ext('_ext',
                                 ['roi_align_forward', 'roi_align_backward'])


class RoIAlignFunction(Function):
    """Autograd function dispatching RoI align to the extension ops."""

    @staticmethod
    def symbolic(g, input, rois, output_size, spatial_scale, sampling_ratio,
                 pool_mode, aligned):
        """Build the ONNX graph for this op around the ``RoiAlign`` node.

        The (n, 5) ``rois`` input is split into the batch indices (column 0)
        and the box coordinates (columns 1-4) that ``RoiAlign`` takes as
        separate inputs.
        """
        from torch.onnx import TensorProtoDataType
        from torch.onnx.symbolic_opset9 import sub

        def _select(g, self, dim, index):
            # Gather the entries given by ``index`` along axis ``dim``.
            return g.op('Gather', self, index, axis_i=dim)

        # batch_indices = rois[:, 0].long()
        batch_indices = _select(
            g, rois, 1,
            g.op('Constant', value_t=torch.tensor([0], dtype=torch.long)))
        batch_indices = g.op('Squeeze', batch_indices, axes_i=[1])
        batch_indices = g.op(
            'Cast', batch_indices, to_i=TensorProtoDataType.INT64)
        # rois = rois[:, 1:]
        rois = _select(
            g, rois, 1,
            g.op(
                'Constant',
                value_t=torch.tensor([1, 2, 3, 4], dtype=torch.long)))

        if aligned:
            # rois -= 0.5/spatial_scale
            aligned_offset = g.op(
                'Constant',
                value_t=torch.tensor([0.5 / spatial_scale],
                                     dtype=torch.float32))
            rois = sub(g, rois, aligned_offset)
        # roi align
        # A negative sampling_ratio is clamped to 0 before being exported.
        return g.op(
            'RoiAlign',
            input,
            rois,
            batch_indices,
            output_height_i=output_size[0],
            output_width_i=output_size[1],
            spatial_scale_f=spatial_scale,
            sampling_ratio_i=max(0, sampling_ratio),
            mode_s=pool_mode)

    @staticmethod
    def forward(ctx: Any,
                input: torch.Tensor,
                rois: torch.Tensor,
                output_size: int,
                spatial_scale: float = 1.0,
                sampling_ratio: int = 0,
                pool_mode: str = 'avg',
                aligned: bool = True) -> torch.Tensor:
        """Pool ``input`` inside every RoI.

        Args:
            input (torch.Tensor): Feature map.
            rois (torch.Tensor): (n, 5) RoIs as (idx, x1, y1, x2, y2).
            output_size (int | tuple): Output size; expanded to an (h, w)
                pair with ``_pair``.
            spatial_scale (float): Scale passed on to the op.
            sampling_ratio (int): Sampling ratio passed on to the op.
            pool_mode (str): 'avg' or 'max'.
            aligned (bool): Alignment flag passed on to the op.

        Returns:
            torch.Tensor: Tensor of shape
            (rois.size(0), input.size(1), output_h, output_w).
        """
        ctx.output_size = _pair(output_size)
        ctx.spatial_scale = spatial_scale
        ctx.sampling_ratio = sampling_ratio
        assert pool_mode in ('max', 'avg')
        # The op takes the pooling mode as an integer: 0 = max, 1 = avg.
        ctx.pool_mode = 0 if pool_mode == 'max' else 1
        ctx.aligned = aligned
        ctx.input_shape = input.size()

        assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!'

        output_shape = (rois.size(0), input.size(1), ctx.output_size[0],
                        ctx.output_size[1])
        output = input.new_zeros(output_shape)
        # Full-size argmax buffers are only allocated for max pooling; avg
        # pooling passes empty placeholders.
        if ctx.pool_mode == 0:
            argmax_y = input.new_zeros(output_shape)
            argmax_x = input.new_zeros(output_shape)
        else:
            argmax_y = input.new_zeros(0)
            argmax_x = input.new_zeros(0)

        ext_module.roi_align_forward(
            input,
            rois,
            output,
            argmax_y,
            argmax_x,
            aligned_height=ctx.output_size[0],
            aligned_width=ctx.output_size[1],
            spatial_scale=ctx.spatial_scale,
            sampling_ratio=ctx.sampling_ratio,
            pool_mode=ctx.pool_mode,
            aligned=ctx.aligned)

        ctx.save_for_backward(rois, argmax_y, argmax_x)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx: Any, grad_output: torch.Tensor) -> tuple:
        """Compute the gradient w.r.t. ``input``.

        Only the first returned entry carries a gradient; the entries for
        the remaining forward arguments are None.
        """
        rois, argmax_y, argmax_x = ctx.saved_tensors
        grad_input = grad_output.new_zeros(ctx.input_shape)
        # complex head architecture may cause grad_output uncontiguous.
        grad_output = grad_output.contiguous()
        ext_module.roi_align_backward(
            grad_output,
            rois,
            argmax_y,
            argmax_x,
            grad_input,
            aligned_height=ctx.output_size[0],
            aligned_width=ctx.output_size[1],
            spatial_scale=ctx.spatial_scale,
            sampling_ratio=ctx.sampling_ratio,
            pool_mode=ctx.pool_mode,
            aligned=ctx.aligned)
        return grad_input, None, None, None, None, None, None


# Functional entry point mirroring RoIAlignFunction.forward.
roi_align = RoIAlignFunction.apply


class RoIAlign(nn.Module):
    """RoI align pooling layer.

    Args:
        output_size (tuple): h, w
        spatial_scale (float): scale the input boxes by this number
        sampling_ratio (int): number of inputs samples to take for each
            output sample. 0 to take samples densely for current models.
        pool_mode (str, 'avg' or 'max'): pooling mode in each bin. It is not
            forwarded to torchvision when ``use_torchvision`` is True.
        aligned (bool): if False, use the legacy implementation in
            MMDetection. If True, align the results more perfectly.
        use_torchvision (bool): whether to use roi_align from torchvision.

    Note:
        The implementation of RoIAlign when aligned=True is modified from
        https://github.com/facebookresearch/detectron2/

        The meaning of aligned=True:

        Given a continuous coordinate c, its two neighboring pixel
        indices (in our pixel model) are computed by floor(c - 0.5) and
        ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete
        indices [0] and [1] (which are sampled from the underlying signal
        at continuous coordinates 0.5 and 1.5). But the original roi_align
        (aligned=False) does not subtract the 0.5 when computing
        neighboring pixel indices and therefore it uses pixels with a
        slightly incorrect alignment (relative to our pixel model) when
        performing bilinear interpolation.

        With `aligned=True`,
        we first appropriately scale the ROI and then shift it by -0.5
        prior to calling roi_align. This produces the correct neighbors;

        The difference does not make a difference to the model's
        performance if ROIAlign is used together with conv layers.
    """

    @deprecated_api_warning(
        {
            'out_size': 'output_size',
            'sample_num': 'sampling_ratio'
        },
        cls_name='RoIAlign')
    def __init__(self,
                 output_size: tuple,
                 spatial_scale: float = 1.0,
                 sampling_ratio: int = 0,
                 pool_mode: str = 'avg',
                 aligned: bool = True,
                 use_torchvision: bool = False):
        super().__init__()

        self.output_size = _pair(output_size)
        self.spatial_scale = float(spatial_scale)
        self.sampling_ratio = int(sampling_ratio)
        self.pool_mode = pool_mode
        self.aligned = aligned
        self.use_torchvision = use_torchvision

    def forward(self, input: torch.Tensor, rois: torch.Tensor) -> torch.Tensor:
        """
        Args:
            input: NCHW images
            rois: Bx5 boxes. First column is the index into N.
                The other 4 columns are xyxy.

        Returns:
            torch.Tensor: The pooled features. ``rois`` is never modified.
        """
        if not self.use_torchvision:
            return roi_align(input, rois, self.output_size, self.spatial_scale,
                             self.sampling_ratio, self.pool_mode, self.aligned)

        from torchvision.ops import roi_align as tv_roi_align
        if 'aligned' in tv_roi_align.__code__.co_varnames:
            return tv_roi_align(input, rois, self.output_size,
                                self.spatial_scale, self.sampling_ratio,
                                self.aligned)

        # This torchvision has no ``aligned`` argument: emulate it by
        # shifting the box coordinates (not the batch index column).
        if self.aligned:
            # Out-of-place on purpose: an in-place ``-=`` would alter the
            # caller's tensor, so calling the layer twice with the same
            # ``rois`` would shift the boxes twice.
            rois = rois - rois.new_tensor([0.] +
                                          [0.5 / self.spatial_scale] * 4)
        return tv_roi_align(input, rois, self.output_size,
                            self.spatial_scale, self.sampling_ratio)

    def __repr__(self):
        s = self.__class__.__name__
        s += f'(output_size={self.output_size}, '
        s += f'spatial_scale={self.spatial_scale}, '
        s += f'sampling_ratio={self.sampling_ratio}, '
        s += f'pool_mode={self.pool_mode}, '
        s += f'aligned={self.aligned}, '
        s += f'use_torchvision={self.use_torchvision})'
        return s


================================================
FILE: mmcv/ops/roi_align_rotated.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Any, Optional, Tuple, Union

import torch
import torch.nn as nn
from mmengine.utils import deprecated_api_warning
from torch.autograd import Function
from torch.nn.modules.utils import _pair

from ..utils import ext_loader

# Handle to the `_ext` extension obtained through ext_loader; the listed names
# are the forward and backward ops this module invokes on it.
ext_module = ext_loader.load_ext(
    '_ext', ['roi_align_rotated_forward', 'roi_align_rotated_backward'])


class RoIAlignRotatedFunction(Function):
    """Autograd function wrapping the rotated RoI align extension ops.

    ``forward`` calls ``ext_module.roi_align_rotated_forward`` and
    ``backward`` calls ``ext_module.roi_align_rotated_backward``. Only the
    input feature map receives a gradient; the gradient returned for
    ``rois`` and for every non-tensor argument is ``None``.
    """

    @staticmethod
    def symbolic(g, input, rois, output_size, spatial_scale, sampling_ratio,
                 aligned, clockwise):
        """Emit the ``mmcv::MMCVRoIAlignRotated`` node for ONNX export.

        Args:
            g: Graph object exposing ``op``.
            input: Graph value of the feature map.
            rois: Graph value of the rois.
            output_size (int | tuple[int, int]): Pooled (h, w). A single int
                is used for both sides.
            spatial_scale (float): Exported as ``spatial_scale_f``.
            sampling_ratio (int): Exported as ``sampling_ratio_i``.
            aligned (bool): Exported as ``aligned_i``.
            clockwise (bool): Exported as ``clockwise_i``.

        Raises:
            TypeError: If ``output_size`` is neither an int nor a tuple.
        """
        if isinstance(output_size, int):
            out_h = output_size
            out_w = output_size
        elif isinstance(output_size, tuple):
            assert len(output_size) == 2
            assert isinstance(output_size[0], int)
            assert isinstance(output_size[1], int)
            out_h, out_w = output_size
        else:
            raise TypeError(
                '"output_size" must be an integer or tuple of integers')
        return g.op(
            'mmcv::MMCVRoIAlignRotated',
            input,
            rois,
            output_height_i=out_h,
            # Width must come from ``out_w``; exporting ``out_h`` here gave
            # wrong shapes for non-square output sizes.
            output_width_i=out_w,
            spatial_scale_f=spatial_scale,
            sampling_ratio_i=sampling_ratio,
            aligned_i=aligned,
            clockwise_i=clockwise)

    @staticmethod
    def forward(ctx: Any,
                input: torch.Tensor,
                rois: torch.Tensor,
                output_size: Union[int, tuple],
                spatial_scale: float,
                sampling_ratio: int = 0,
                aligned: bool = True,
                clockwise: bool = False) -> torch.Tensor:
        """Pool each rotated RoI into a fixed-size feature.

        Args:
            input (torch.Tensor): 4-D feature map; it is unpacked as
                (batch, channels, height, width).
            rois (torch.Tensor): One row per RoI; ``rois.size(0)`` is the
                number of RoIs.
            output_size (int | tuple): Pooled (h, w), normalised by
                ``_pair``.
            spatial_scale (float): Forwarded to the extension op.
            sampling_ratio (int): Forwarded to the extension op.
            aligned (bool): Forwarded to the extension op.
            clockwise (bool): Forwarded to the extension op.

        Returns:
            torch.Tensor: Tensor of shape
            (num_rois, channels, output_size[0], output_size[1]).
        """
        # Everything backward needs is stashed on ctx up front.
        ctx.output_size = _pair(output_size)
        ctx.spatial_scale = spatial_scale
        ctx.sampling_ratio = sampling_ratio
        ctx.aligned = aligned
        ctx.clockwise = clockwise
        ctx.save_for_backward(rois)
        ctx.feature_size = input.size()

        # Unpacking also rejects inputs that are not 4-D.
        batch_size, num_channels, data_height, data_width = input.size()
        num_rois = rois.size(0)

        # The op writes its result into this pre-allocated buffer.
        output = input.new_zeros(num_rois, num_channels, ctx.output_size[0],
                                 ctx.output_size[1])
        ext_module.roi_align_rotated_forward(
            input,
            rois,
            output,
            pooled_height=ctx.output_size[0],
            pooled_width=ctx.output_size[1],
            spatial_scale=ctx.spatial_scale,
            sampling_ratio=ctx.sampling_ratio,
            aligned=ctx.aligned,
            clockwise=ctx.clockwise)
        return output

    @staticmethod
    def backward(
        ctx: Any, grad_output: torch.Tensor
    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], None, None,
               None, None, None]:
        """Propagate the pooled-feature gradient back to the feature map.

        Args:
            grad_output (torch.Tensor): Gradient w.r.t. the forward output;
                dims 2 and 3 are read as pooled height and width.

        Returns:
            tuple: ``(grad_input, None, None, None, None, None, None)``.
            ``grad_input`` is ``None`` when the input needs no gradient.
        """
        feature_size = ctx.feature_size
        rois = ctx.saved_tensors[0]
        assert feature_size is not None
        batch_size, num_channels, data_height, data_width = feature_size

        out_w = grad_output.size(3)
        out_h = grad_output.size(2)

        grad_input = grad_rois = None

        if ctx.needs_input_grad[0]:
            # Zero-filled buffer with the input's shape, filled by the op.
            grad_input = rois.new_zeros(batch_size, num_channels, data_height,
                                        data_width)
            ext_module.roi_align_rotated_backward(
                grad_output.contiguous(),
                rois,
                grad_input,
                pooled_height=out_h,
                pooled_width=out_w,
                spatial_scale=ctx.spatial_scale,
                sampling_ratio=ctx.sampling_ratio,
                aligned=ctx.aligned,
                clockwise=ctx.clockwise)
        return grad_input, grad_rois, None, None, None, None, None


# Functional alias: calling this is calling ``RoIAlignRotatedFunction.apply``.
roi_align_rotated = RoIAlignRotatedFunction.apply


class RoIAlignRotated(nn.Module):
    """Module wrapper around rotated RoI align pooling.

    The layer takes a feature map of shape (N, C, H, W) together with rois
    of shape (n, 6). Each roi row is
    (batch_index, center_x, center_y, w, h, angle), with the angle given in
    radian.

    Args:
        output_size (int | tuple): Pooled size (h, w); normalised with
            ``_pair``.
        spatial_scale (float): Factor by which the input boxes are scaled.
        sampling_ratio (int): Number of input samples taken per output
            sample. 0 means dense sampling for current models.
        aligned (bool): If False, use the legacy implementation in
            MMDetection. If True, align the results more perfectly.
            Default: True.
        clockwise (bool): If True, proposal angles are clockwise in image
            space; otherwise they are counterclockwise. Default: False.

    Note:
        The ``aligned=True`` implementation is modified from
        https://github.com/facebookresearch/detectron2/

        What ``aligned=True`` means:

        For a continuous coordinate c, the two neighbouring pixel indices
        (in our pixel model) are floor(c - 0.5) and ceil(c - 0.5). For
        instance c=1.3 has the neighbours [0] and [1], which are sampled
        from the underlying signal at continuous coordinates 0.5 and 1.5.
        The original roi_align (aligned=False) skips the 0.5 subtraction
        when computing the neighbouring indices, so bilinear interpolation
        uses pixels that are slightly misaligned relative to our pixel
        model.

        With `aligned=True`, the ROI is first scaled appropriately and then
        shifted by -0.5 before roi_align is called, which yields the
        correct neighbours.

        The difference has no effect on model performance when ROIAlign is
        used together with conv layers.
    """

    @deprecated_api_warning(
        {
            'out_size': 'output_size',
            'sample_num': 'sampling_ratio'
        },
        cls_name='RoIAlignRotated')
    def __init__(self,
                 output_size: Union[int, tuple],
                 spatial_scale: float,
                 sampling_ratio: int = 0,
                 aligned: bool = True,
                 clockwise: bool = False):
        super().__init__()

        # Boolean flags are stored untouched; the numeric settings are
        # coerced to the types used downstream.
        self.aligned = aligned
        self.clockwise = clockwise
        self.output_size = _pair(output_size)
        self.spatial_scale = float(spatial_scale)
        self.sampling_ratio = int(sampling_ratio)

    def forward(self, input: torch.Tensor, rois: torch.Tensor) -> torch.Tensor:
        """Apply ``RoIAlignRotatedFunction`` using the stored settings."""
        settings = (self.output_size, self.spatial_scale,
                    self.sampling_ratio, self.aligned, self.clockwise)
        return RoIAlignRotatedFunction.apply(input, rois, *settings)

    def __repr__(self):
        """Return ``ClassName(key=value, ...)`` listing the settings."""
        fields = ', '.join((
            f'output_size={self.output_size}',
            f'spatial_scale={self.spatial_scale}',
            f'sampling_ratio={self.sampling_ratio}',
            f'aligned={self.aligned}',
            f'clockwise={self.clockwise}',
        ))
        return f'{self.__class__.__name__}({fields})'


================================================
FILE: mmcv/ops/roi_pool.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Any, Tuple, Union

import torch
import torch.nn as nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair

from ..utils import ext_loader

# Handle returned by ``ext_loader.load_ext`` for the ``_ext`` extension. The
# listed names are the two ops this module calls on it (forward / backward).
ext_module = ext_loader.load_ext('_ext',
                                 ['roi_pool_forward', 'roi_pool_backward'])


class RoIPoolFunction(Function):
    """Autograd function wrapping the RoI pooling extension ops.

    ``forward`` calls ``ext_module.roi_pool_forward`` and ``backward`` calls
    ``ext_module.roi_pool_backward``. Only the input feature map receives a
    gradient.
    """

    @staticmethod
    def symbolic(g, input, rois, output_size, spatial_scale):
        """Emit a ``MaxRoiPool`` node for ONNX export.

        ``output_size`` is exported as ``pooled_shape_i`` and
        ``spatial_scale`` as ``spatial_scale_f``.
        """
        return g.op(
            'MaxRoiPool',
            input,
            rois,
            pooled_shape_i=output_size,
            spatial_scale_f=spatial_scale)

    @staticmethod
    def forward(ctx: Any,
                input: torch.Tensor,
                rois: torch.Tensor,
                output_size: Union[int, tuple],
                spatial_scale: float = 1.0) -> torch.Tensor:
        """Pool each RoI into a fixed-size feature.

        Args:
            input (torch.Tensor): Feature map; dim 1 is taken as the channel
                count of the output.
            rois (torch.Tensor): (n, 5) boxes, each (idx, x1, y1, x2, y2).
            output_size (int | tuple): Pooled (h, w), normalised by
                ``_pair``.
            spatial_scale (float): Forwarded to the extension op.
                Default: 1.0.

        Returns:
            torch.Tensor: Tensor of shape
            (n, channels, output_size[0], output_size[1]).
        """
        ctx.output_size = _pair(output_size)
        ctx.spatial_scale = spatial_scale
        ctx.input_shape = input.size()

        assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!'

        output_shape = (rois.size(0), input.size(1), ctx.output_size[0],
                        ctx.output_size[1])
        output = input.new_zeros(output_shape)
        # Integer buffer filled by the forward op and replayed in backward.
        argmax = input.new_zeros(output_shape, dtype=torch.int)

        ext_module.roi_pool_forward(
            input,
            rois,
            output,
            argmax,
            pooled_height=ctx.output_size[0],
            pooled_width=ctx.output_size[1],
            spatial_scale=ctx.spatial_scale)

        ctx.save_for_backward(rois, argmax)
        return output

    @staticmethod
    @once_differentiable
    def backward(
            ctx: Any, grad_output: torch.Tensor
    ) -> Tuple[torch.Tensor, None, None, None]:
        """Route the pooled-feature gradient back to the feature map.

        Args:
            grad_output (torch.Tensor): Gradient w.r.t. the forward output.

        Returns:
            tuple: ``(grad_input, None, None, None)`` where ``grad_input``
            has the shape of the forward input.
        """
        rois, argmax = ctx.saved_tensors
        # The incoming gradient may have arbitrary strides. The other
        # extension-backed ops pass a contiguous gradient, so do the same
        # here (presumably the op reads it as a dense buffer - TODO confirm).
        grad_output = grad_output.contiguous()
        grad_input = grad_output.new_zeros(ctx.input_shape)

        ext_module.roi_pool_backward(
            grad_output,
            rois,
            argmax,
            grad_input,
            pooled_height=ctx.output_size[0],
            pooled_width=ctx.output_size[1],
            spatial_scale=ctx.spatial_scale)

        return grad_input, None, None, None


# Functional alias: calling this is calling ``RoIPoolFunction.apply``.
roi_pool = RoIPoolFunction.apply


class RoIPool(nn.Module):
    """Module wrapper around :func:`roi_pool`.

    Args:
        output_size (int | tuple): Pooled size (h, w); normalised with
            ``_pair``.
        spatial_scale (float): Scale passed through to ``roi_pool``.
            Default: 1.0.
    """

    def __init__(self,
                 output_size: Union[int, tuple],
                 spatial_scale: float = 1.0):
        super().__init__()

        self.spatial_scale = float(spatial_scale)
        self.output_size = _pair(output_size)

    def forward(self, input: torch.Tensor, rois: torch.Tensor) -> torch.Tensor:
        """Pool ``rois`` from ``input`` using the stored settings."""
        pooled = roi_pool(input, rois, self.output_size, self.spatial_scale)
        return pooled

    def __repr__(self):
        """Return ``ClassName(output_size=..., spatial_scale=...)``."""
        fields = ', '.join((
            f'output_size={self.output_size}',
            f'spatial_scale={self.spatial_scale}',
        ))
        return f'{self.__class__.__name__}({fields})'


================================================
FILE: mmcv/ops/roiaware_pool3d.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Any, Tuple, Union

import mmengine
import torch
from torch import nn as nn
from torch.autograd import Function

from ..utils import ext_loader

# Handle returned by ``ext_loader.load_ext`` for the ``_ext`` extension. The
# listed names are the two ops this module calls on it (forward / backward).
ext_module = ext_loader.load_ext(
    '_ext', ['roiaware_pool3d_forward', 'roiaware_pool3d_backward'])


class RoIAwarePool3d(nn.Module):
    """Encode the geometry-specific features of each 3D proposal.

    See `PartA2 <https://arxiv.org/pdf/1907.03670.pdf>`_ for the details of
    the method.

    Args:
        out_size (int or tuple): Size of the output features, either n or
            [n1, n2, n3].
        max_pts_per_voxel (int, optional): Maximum number of points kept per
            voxel. Default: 128.
        mode (str, optional): RoIAware pooling method, 'max' or 'avg'.
            Stored on the module as 0 for 'max' and 1 for 'avg'.
            Default: 'max'.
    """

    def __init__(self,
                 out_size: Union[int, tuple],
                 max_pts_per_voxel: int = 128,
                 mode: str = 'max'):
        super().__init__()

        self.out_size = out_size
        self.max_pts_per_voxel = max_pts_per_voxel
        assert mode in ('max', 'avg')
        # The autograd function takes the pooling method as an integer code.
        self.mode = {'max': 0, 'avg': 1}[mode]

    def forward(self, rois: torch.Tensor, pts: torch.Tensor,
                pts_feature: torch.Tensor) -> torch.Tensor:
        """Pool point features inside every RoI.

        Args:
            rois (torch.Tensor): [N, 7], in LiDAR coordinate,
                (x, y, z) is the bottom center of rois.
            pts (torch.Tensor): [npoints, 3], coordinates of input points.
            pts_feature (torch.Tensor): [npoints, C], features of input points.

        Returns:
            torch.Tensor: Pooled features whose shape is
            [N, out_x, out_y, out_z, C].
        """
        settings = (self.out_size, self.max_pts_per_voxel, self.mode)
        return RoIAwarePool3dFunction.apply(rois, pts, pts_feature, *settings)


class RoIAwarePool3dFunction(Function):
    """Autograd function wrapping the RoI-aware 3D pooling extension ops."""

    @staticmethod
    def forward(ctx: Any, rois: torch.Tensor, pts: torch.Tensor,
                pts_feature: torch.Tensor, out_size: Union[int, tuple],
                max_pts_per_voxel: int, mode: int) -> torch.Tensor:
        """Pool point features into a voxel grid for every RoI.

        Args:
            rois (torch.Tensor): [N, 7], in LiDAR coordinate,
                (x, y, z) is the bottom center of rois.
            pts (torch.Tensor): [npoints, 3], coordinates of input points.
            pts_feature (torch.Tensor): [npoints, C], features of input points.
            out_size (int or tuple): Size of the output features. An int n
                is used for all three axes; otherwise a 3-tuple of ints.
            max_pts_per_voxel (int): The maximum number of points per voxel.
            mode (int): Pooling method of RoIAware, 0 (max pool) or 1 (average
                pool).

        Returns:
            torch.Tensor: Pooled features whose shape is
            [N, out_x, out_y, out_z, C].
        """
        if not isinstance(out_size, int):
            assert len(out_size) == 3
            assert mmengine.is_tuple_of(out_size, int)
            out_x, out_y, out_z = out_size
        else:
            out_x = out_y = out_z = out_size

        num_pts = pts.shape[0]
        num_rois = rois.shape[0]
        num_channels = pts_feature.shape[-1]

        # All three buffers share the (roi, x, y, z) leading dimensions and
        # are filled in by the extension op.
        grid = (num_rois, out_x, out_y, out_z)
        pooled_features = pts_feature.new_zeros(grid + (num_channels, ))
        argmax = pts_feature.new_zeros(
            grid + (num_channels, ), dtype=torch.int)
        pts_idx_of_voxels = pts_feature.new_zeros(
            grid + (max_pts_per_voxel, ), dtype=torch.int)

        ext_module.roiaware_pool3d_forward(
            rois,
            pts,
            pts_feature,
            argmax,
            pts_idx_of_voxels,
            pooled_features,
            pool_method=mode)

        # Bundle stored on ctx for the backward pass.
        ctx.roiaware_pool3d_for_backward = (pts_idx_of_voxels, argmax, mode,
                                            num_pts, num_channels)
        return pooled_features

    @staticmethod
    def backward(
        ctx: Any, grad_out: torch.Tensor
    ) -> Tuple[None, None, torch.Tensor, None, None, None]:
        """Return the gradient for ``pts_feature`` only.

        Args:
            grad_out (torch.Tensor): Gradient w.r.t. the pooled features.

        Returns:
            tuple: ``None`` for every forward argument except
            ``pts_feature``, whose gradient has shape [npoints, C].
        """
        (pts_idx_of_voxels, argmax, mode, num_pts,
         num_channels) = ctx.roiaware_pool3d_for_backward

        grad_in = grad_out.new_zeros((num_pts, num_channels))
        ext_module.roiaware_pool3d_backward(
            pts_idx_of_voxels,
            argmax,
            grad_out.contiguous(),
            grad_in,
            pool_method=mode)

        return None, None, grad_in, None, None, None


================================================
FILE: mmcv/ops/roipoint_pool3d.py
================================================
from typing import Any, Tuple

import torch
from torch import nn as nn
from torch.autograd import Function

from ..utils import ext_loader

# Handle returned by ``ext_loader.load_ext`` for the ``_ext`` extension; only
# the forward op is requested because this module implements no backward.
ext_module = ext_loader.load_ext('_ext', ['roipoint_pool3d_forward'])


class RoIPointPool3d(nn.Module):
    """Encode the geometry-specific features of each 3D proposal.

    See the `Paper of PartA2 <https://arxiv.org/pdf/1907.03670.pdf>`_ for
    more details.

    Args:
        num_sampled_points (int, optional): Number of samples in each roi.
            Default: 512.
    """

    def __init__(self, num_sampled_points: int = 512):
        super().__init__()
        self.num_sampled_points = num_sampled_points

    def forward(self, points: torch.Tensor, point_features: torch.Tensor,
                boxes3d: torch.Tensor) -> Tuple[torch.Tensor]:
        """Pool points and their features inside every 3D box.

        Args:
            points (torch.Tensor): Input points whose shape is (B, N, 3).
            point_features (torch.Tensor): Features of input points whose shape
                is (B, N, C).
            boxes3d (torch.Tensor): Input bounding boxes whose shape is
                (B, M, 7).

        Returns:
            tuple[torch.Tensor]: A tuple of two elements. The first is the
            pooled features of shape (B, M, num_sampled_points, 3 + C); the
            second is an empty flag of shape (B, M).
        """
        pooled = RoIPointPool3dFunction.apply(points, point_features, boxes3d,
                                              self.num_sampled_points)
        return pooled


class RoIPointPool3dFunction(Function):
    """Forward-only autograd function wrapping ``roipoint_pool3d_forward``."""

    @staticmethod
    def forward(
            ctx: Any,
            points: torch.Tensor,
            point_features: torch.Tensor,
            boxes3d: torch.Tensor,
            num_sampled_points: int = 512
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Pool points and their features inside every 3D box.

        Args:
            points (torch.Tensor): Input points whose shape is (B, N, 3).
            point_features (torch.Tensor): Features of input points whose shape
                is (B, N, C).
            boxes3d (torch.Tensor): Input bounding boxes whose shape is
                (B, M, 7).
            num_sampled_points (int, optional): The num of sampled points.
                Default: 512.

        Returns:
            tuple[torch.Tensor]: A tuple of two elements. The first is the
            pooled features of shape (B, M, num_sampled_points, 3 + C); the
            second is an int empty flag of shape (B, M).
        """
        assert len(points.shape) == 3 and points.shape[2] == 3
        batch_size, boxes_num, feature_len = points.shape[0], boxes3d.shape[
            1], point_features.shape[2]
        pooled_boxes3d = boxes3d.view(batch_size, -1, 7)
        # Output buffers filled in by the extension op.
        pooled_features = point_features.new_zeros(
            (batch_size, boxes_num, num_sampled_points, 3 + feature_len))
        pooled_empty_flag = point_features.new_zeros(
            (batch_size, boxes_num)).int()

        ext_module.roipoint_pool3d_forward(points.contiguous(),
                                           pooled_boxes3d.contiguous(),
                                           point_features.contiguous(),
                                           pooled_features, pooled_empty_flag)

        return pooled_features, pooled_empty_flag

    @staticmethod
    def backward(ctx: Any,
                 grad_out: torch.Tensor,
                 grad_empty_flag: Any = None) -> torch.Tensor:
        """Reject differentiation; this op has no backward implementation.

        ``forward`` returns two tensors, so autograd passes two gradients.
        The second one is accepted (and ignored) so that the caller sees the
        intended ``NotImplementedError`` instead of an arity ``TypeError``.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError


================================================
FILE: mmcv/ops/rotated_feature_align.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Any

import torch
from torch.autograd import Function
from torch.autograd.function import once_differentiable

from ..utils import ext_loader

# Handle returned by ``ext_loader.load_ext`` for the ``_ext`` extension. The
# listed names are the two ops this module calls on it (forward / backward).
ext_module = ext_loader.load_ext(
    '_ext',
    ['rotated_feature_align_forward', 'rotated_feature_align_backward'])


class RotatedFeatureAlignFunction(Function):
    """Align features to refined rotated anchors by interpolation.

    Feature interpolation is used to fetch the position information that
    corresponds to the refined rotate anchors, and the feature maps are
    rebuilt pixel by pixel to achieve feature alignment.

    See the paper `R3Det: Refined Single-Stage Detector with Feature
    Refinement for Rotating Object <https://arxiv.org/abs/1908.05612>`_.
    """

    @staticmethod
    def symbolic(g, features, best_rbboxes, spatial_scale, points):
        """Emit the ``mmcv::MMCVRotatedFeatureAlign`` node for ONNX export."""
        assert points in [1, 5]
        attrs = dict(spatial_scale_f=spatial_scale, points_i=points)
        return g.op('mmcv::MMCVRotatedFeatureAlign', features, best_rbboxes,
                    **attrs)

    @staticmethod
    def forward(ctx: Any, features: torch.Tensor, best_rbboxes: torch.Tensor,
                spatial_scale: float, points: int) -> torch.Tensor:
        """Compute the refined features.

        Args:
            features (torch.Tensor): Input features with shape [N,C,H,W].
            best_rbboxes (torch.Tensor): Refined rotate anchors with
                shape [N,H,W,5].
                NOTE(review): the previous docs gave the coordinate format
                as "(cx,cx,h,w,a)", which looks like a typo - confirm the
                actual ordering against the extension op.
            spatial_scale (float): The scale of feature map size and
                input image size.
            points (int): The number of sample points. Only 1 and 5 are
                supported.

        Returns:
            torch.Tensor: Refined features with shape [N,C,H,W].
        """
        ctx.points = points
        ctx.spatial_scale = spatial_scale
        ctx.save_for_backward(best_rbboxes)
        assert points in [1, 5]
        # The op writes into a zero buffer shaped like the input features.
        refined = torch.zeros_like(features)
        ext_module.rotated_feature_align_forward(
            features,
            best_rbboxes,
            refined,
            spatial_scale=spatial_scale,
            points=points)
        return refined

    @staticmethod
    @once_differentiable
    def backward(ctx: Any, grad_output: torch.Tensor) -> tuple:
        """Compute the gradient of the input features.

        Args:
            grad_output (torch.Tensor): The gradient of output features
                with shape [N,C,H,W].

        Returns:
            tuple: ``(grad_input, None, None, None)``. ``grad_input`` is the
            gradient of input features with shape [N,C,H,W], or ``None``
            when the features need no gradient.
        """
        (anchors, ) = ctx.saved_tensors[:1]
        num_points = ctx.points
        scale = ctx.spatial_scale
        grad_input = None
        if ctx.needs_input_grad[0]:
            grad_input = torch.zeros_like(grad_output)
            ext_module.rotated_feature_align_backward(
                grad_output.contiguous(),
                anchors,
                grad_input,
                spatial_scale=scale,
                points=num_points)
        return grad_input, None, None, None


def rotated_feature_align(features: torch.Tensor,
                          best_rbboxes: torch.Tensor,
                          spatial_scale: float = 1 / 8,
                          points: int = 1) -> torch.Tensor:
    """Functional form of :class:`RotatedFeatureAlignFunction`.

    Args:
        features (torch.Tensor): Input features.
        best_rbboxes (torch.Tensor): Refined rotate anchors.
        spatial_scale (float): Passed through unchanged. Default: 1 / 8.
        points (int): Number of sample points. Default: 1.

    Returns:
        torch.Tensor: The result of ``RotatedFeatureAlignFunction.apply``.
    """
    call_args = (features, best_rbboxes, spatial_scale, points)
    return RotatedFeatureAlignFunction.apply(*call_args)


================================================
FILE: mmcv/ops/saconv.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmengine.model import constant_init
from mmengine.registry import MODELS
from mmengine.utils import digit_version
from mmengine.utils.dl_utils import TORCH_VERSION

from mmcv.cnn import ConvAWS2d
from mmcv.ops.deform_conv import deform_conv2d


@MODELS.register_module(name='SAC')
class SAConv2d(ConvAWS2d):
    """SAC (Switchable Atrous Convolution)

    This is an implementation of `DetectoRS: Detecting Objects with Recursive
    Feature Pyramid and Switchable Atrous Convolution
    <https://arxiv.org/abs/2006.02334>`_.

    Args:
        in_channels (int): Number of channels in the input image
        out_channels (int): Number of channels produced by the convolution
        kernel_size (int or tuple): Size of the convolving kernel
        stride (int or tuple, optional): Stride of the convolution. Default: 1
        padding (int or tuple, optional): Zero-padding added to both sides of
            the input. Default: 0
        dilation (int or tuple, optional): Spacing between kernel elements.
            Default: 1
        groups (int, optional): Number of blocked connections from input
            channels to output channels. Default: 1
        bias (bool, optional): If ``True``, adds a learnable bias to the
            output. Default: ``True``
        use_deform: If ``True``, replace convolution with deformable
            convolution. Default: ``False``.

    Note:
        ``padding_mode`` is not a parameter of this constructor and is not
        forwarded to the parent class.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 use_deform=False):
        super().__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias)
        self.use_deform = use_deform
        # 1x1 conv producing the single-channel switch map.
        self.switch = nn.Conv2d(
            self.in_channels, 1, kernel_size=1, stride=stride, bias=True)
        # Added to the base weight for the large-dilation branch; zeroed in
        # ``init_weights``.
        self.weight_diff = nn.Parameter(torch.Tensor(self.weight.size()))
        self.pre_context = nn.Conv2d(
            self.in_channels, self.in_channels, kernel_size=1, bias=True)
        self.post_context = nn.Conv2d(
            self.out_channels, self.out_channels, kernel_size=1, bias=True)
        if self.use_deform:
            # One offset predictor per branch (small / large dilation).
            self.offset_s = nn.Conv2d(
                self.in_channels,
                18,
                kernel_size=3,
                padding=1,
                stride=stride,
                bias=True)
            self.offset_l = nn.Conv2d(
                self.in_channels,
                18,
                kernel_size=3,
                padding=1,
                stride=stride,
                bias=True)
        self.init_weights()

    def init_weights(self):
        """Initialise the SAC-specific layers.

        The switch gets weight 0 and bias 1, ``weight_diff`` is zeroed, and
        the context (and, if present, offset) convs are constant-initialised
        with 0.
        """
        constant_init(self.switch, 0, bias=1)
        self.weight_diff.data.zero_()
        constant_init(self.pre_context, 0)
        constant_init(self.post_context, 0)
        if self.use_deform:
            constant_init(self.offset_s, 0)
            constant_init(self.offset_l, 0)

    def forward(self, x):
        """Run pre-context, the two switched conv branches and post-context.

        Args:
            x (torch.Tensor): Input feature map.

        Returns:
            torch.Tensor: ``switch * out_s + (1 - switch) * out_l`` plus the
            post-context term, where ``out_l`` uses 3x padding and dilation
            and ``weight + weight_diff``.
        """
        # pre-context
        avg_x = F.adaptive_avg_pool2d(x, output_size=1)
        avg_x = self.pre_context(avg_x)
        avg_x = avg_x.expand_as(x)
        x = x + avg_x
        # switch
        avg_x = F.pad(x, pad=(2, 2, 2, 2), mode='reflect')
        avg_x = F.avg_pool2d(avg_x, kernel_size=5, stride=1, padding=0)
        switch = self.switch(avg_x)
        # sac
        weight = self._get_weight(self.weight)
        zero_bias = torch.zeros(
            self.out_channels, device=weight.device, dtype=weight.dtype)

        if self.use_deform:
            offset = self.offset_s(avg_x)
            out_s = deform_conv2d(x, offset, weight, self.stride, self.padding,
                                  self.dilation, self.groups, 1)
        else:
            if (TORCH_VERSION == 'parrots'
                    or digit_version(TORCH_VERSION) < digit_version('1.5.0')):
                out_s = super().conv2d_forward(x, weight)
            elif digit_version(TORCH_VERSION) >= digit_version('1.8.0'):
                # bias is a required argument of _conv_forward in torch 1.8.0
                out_s = super()._conv_forward(x, weight, zero_bias)
            else:
                out_s = super()._conv_forward(x, weight)
        # The large branch reuses the parent conv, which reads
        # ``self.padding`` / ``self.dilation``, so they are tripled
        # temporarily. ``try/finally`` guarantees they are restored even if
        # the convolution raises; otherwise every later call would run with
        # the tripled values.
        ori_p = self.padding
        ori_d = self.dilation
        self.padding = tuple(3 * p for p in self.padding)
        self.dilation = tuple(3 * d for d in self.dilation)
        try:
            weight = weight + self.weight_diff
            if self.use_deform:
                offset = self.offset_l(avg_x)
                out_l = deform_conv2d(x, offset, weight, self.stride,
                                      self.padding, self.dilation, self.groups,
                                      1)
            else:
                if (TORCH_VERSION == 'parrots'
                        or digit_version(TORCH_VERSION) <
                        digit_version('1.5.0')):
                    out_l = super().conv2d_forward(x, weight)
                elif digit_version(TORCH_VERSION) >= digit_version('1.8.0'):
                    # bias is a required argument of _conv_forward in torch
                    # 1.8.0
                    out_l = super()._conv_forward(x, weight, zero_bias)
                else:
                    out_l = super()._conv_forward(x, weight)
        finally:
            self.padding = ori_p
            self.dilation = ori_d

        out = switch * out_s + (1 - switch) * out_l
        # post-context
        avg_x = F.adaptive_avg_pool2d(out, output_size=1)
        avg_x = self.post_context(avg_x)
        avg_x = avg_x.expand_as(out)
        out = out + avg_x
        return out


================================================
FILE: mmcv/ops/scatter_points.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Any, List, Optional, Tuple

import torch
import torch.nn.functional as F
from torch import nn
from torch.autograd import Function

from ..utils import ext_loader

# Handle returned by ``ext_loader.load_ext`` for the ``_ext`` extension. The
# listed names are the two ops this module calls on it (forward / backward).
ext_module = ext_loader.load_ext(
    '_ext',
    ['dynamic_point_to_voxel_forward', 'dynamic_point_to_voxel_backward'])


class _DynamicScatter(Function):
    """Autograd function wrapping the dynamic point-to-voxel extension ops."""

    @staticmethod
    def forward(ctx: Any,
                feats: torch.Tensor,
                coors: torch.Tensor,
                reduce_type: str = 'max') -> Tuple[torch.Tensor, torch.Tensor]:
        """Convert kitti points(N, >=3) to voxels.

        Args:
            feats (torch.Tensor): [N, C]. Points features to be reduced
                into voxels.
            coors (torch.Tensor): [N, ndim]. Corresponding voxel coordinates
                (specifically multi-dim voxel index) of each points.
            reduce_type (str, optional): Reduce op. support 'max', 'sum' and
                'mean'. Default: 'max'.

        Returns:
            tuple[torch.Tensor]: A tuple of two elements. The first is the
            voxel features with shape [M, C], each reduced from the input
            features sharing the same voxel coordinates. The second is the
            voxel coordinates with shape [M, ndim].
        """
        voxel_feats, voxel_coors, point2voxel_map, voxel_points_count = (
            ext_module.dynamic_point_to_voxel_forward(feats, coors,
                                                      reduce_type))
        ctx.reduce_type = reduce_type
        ctx.save_for_backward(feats, voxel_feats, point2voxel_map,
                              voxel_points_count)
        # Coordinates are returned to the caller but carry no gradient.
        ctx.mark_non_differentiable(voxel_coors)
        return voxel_feats, voxel_coors

    @staticmethod
    def backward(ctx: Any,
                 grad_voxel_feats: torch.Tensor,
                 grad_voxel_coors: Optional[torch.Tensor] = None) -> tuple:
        """Return the gradient for ``feats``; other inputs get ``None``.

        Args:
            grad_voxel_feats (torch.Tensor): Gradient w.r.t. the voxel
                features.
            grad_voxel_coors (torch.Tensor, optional): Unused.

        Returns:
            tuple: ``(grad_feats, None, None)``.
        """
        saved = ctx.saved_tensors
        feats, voxel_feats, point2voxel_map, voxel_points_count = saved
        grad_feats = torch.zeros_like(feats)
        # TODO: whether to use index put or use cuda_backward
        # To use index put, need point to voxel index
        ext_module.dynamic_point_to_voxel_backward(
            grad_feats, grad_voxel_feats.contiguous(), feats, voxel_feats,
            point2voxel_map, voxel_points_count, ctx.reduce_type)
        return grad_feats, None, None


# Functional alias: calling this is calling ``_DynamicScatter.apply``.
dynamic_scatter = _DynamicScatter.apply


class DynamicScatter(nn.Module):
    """Scatters points into voxels, used in the voxel encoder with dynamic
    voxelization.

    Note:
        The CPU and GPU implementation get the same output, but have numerical
        difference after summation and division (e.g., 5e-7).

    Args:
        voxel_size (list): list [x, y, z] size of three dimension.
        point_cloud_range (list): The coordinate range of points, [x_min,
            y_min, z_min, x_max, y_max, z_max].
        average_points (bool): whether to use avg pooling to scatter points
            into voxel.
    """

    def __init__(self, voxel_size: List, point_cloud_range: List,
                 average_points: bool):
        super().__init__()

        self.voxel_size = voxel_size
        self.point_cloud_range = point_cloud_range
        self.average_points = average_points

    def forward_single(
            self, points: torch.Tensor,
            coors: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Scatters points into voxels.

        Args:
            points (torch.Tensor): Points to be reduced into voxels.
            coors (torch.Tensor): Corresponding voxel coordinates (specifically
                multi-dim voxel index) of each points.

        Returns:
            tuple[torch.Tensor]: A tuple contains two elements. The first one
            is the voxel features with shape [M, C] which are respectively
            reduced from input features that share the same voxel coordinates.
            The second is voxel coordinates with shape [M, ndim].
        """
        # 'mean' averages the points of a voxel, otherwise the max is taken.
        reduce = 'mean' if self.average_points else 'max'
        return dynamic_scatter(points.contiguous(), coors.contiguous(), reduce)

    def forward(self, points: torch.Tensor,
                coors: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Scatters points/features into voxels.

        When ``coors`` has three columns it is scattered in one go. Otherwise
        the first column is read as the batch index: every sample is scattered
        separately and the batch index is put back in front of the resulting
        voxel coordinates.

        Args:
            points (torch.Tensor): Points to be reduced into voxels.
            coors (torch.Tensor): Corresponding voxel coordinates (specifically
                multi-dim voxel index) of each points.

        Returns:
            tuple[torch.Tensor]: A tuple contains two elements. The first one
            is the voxel features with shape [M, C] which are respectively
            reduced from input features that share the same voxel coordinates.
            The second is voxel coordinates with shape [M, ndim].
        """
        if coors.size(-1) == 3:
            return self.forward_single(points, coors)

        if coors.size(0) == 0:
            # No points means no voxels. Slicing (instead of allocating new
            # tensors) keeps dtype, device and the autograd link to `points`.
            return points[:0], coors[:0]

        batch_ids = coors[:, 0]
        # Take the largest batch index rather than the one in the last row,
        # so that rows which are not sorted by batch index are not dropped.
        batch_size = int(batch_ids.max()) + 1

        voxels, voxel_coors = [], []
        for i in range(batch_size):
            inds = torch.where(batch_ids == i)
            voxel, voxel_coor = self.forward_single(points[inds],
                                                    coors[inds][:, 1:])
            # Prepend the batch index column that was stripped above.
            coor_pad = F.pad(voxel_coor, (1, 0), mode='constant', value=i)
            voxel_coors.append(coor_pad)
            voxels.append(voxel)
        features = torch.cat(voxels, dim=0)
        feature_coors = torch.cat(voxel_coors, dim=0)

        return features, feature_coors

    def __repr__(self):
        s = self.__class__.__name__ + '('
        s += 'voxel_size=' + str(self.voxel_size)
        s += ', point_cloud_range=' + str(self.point_cloud_range)
        s += ', average_points=' + str(self.average_points)
        s += ')'
        return s


================================================
FILE: mmcv/ops/sparse_conv.py
================================================
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

import numpy as np
import torch
from mmengine.registry import MODELS
from torch.nn import init
from torch.nn.parameter import Parameter

from . import sparse_functional as Fsp
from . import sparse_ops as ops
from .sparse_modules import SparseModule
from .sparse_structure import SparseConvTensor


def _calculate_fan_in_and_fan_out_hwio(tensor):
    dimensions = tensor.ndimension()
    if dimensions < 2:
        raise ValueError('fan in and fan out can not be computed for tensor'
                         'with fewer than 2 dimensions')

    if dimensions == 2:  # Linear
        fan_in = tensor.size(-2)
        fan_out = tensor.size(-1)
    else:
        num_input_fmaps = tensor.size(-2)
        num_output_fmaps = tensor.size(-1)
        receptive_field_size = 1
        if tensor.dim() > 2:
            receptive_field_size = tensor[..., 0, 0].numel()
        fan_in = num_input_fmaps * receptive_field_size
        fan_out = num_output_fmaps * receptive_field_size

    return fan_in, fan_out


class SparseConvolution(SparseModule):
    """N-dimensional sparse convolution operating on ``SparseConvTensor``.

    The flags select between a regular sparse convolution, a submanifold
    convolution (``subm``), a transposed convolution (``transposed``) and an
    inverse convolution (``inverse``).

    Args:
        ndim (int): Number of spatial dimensions.
        in_channels (int): Number of input feature channels.
        out_channels (int): Number of output feature channels.
        kernel_size (int | list | tuple): Kernel size. A scalar is repeated
            for every spatial dimension. Default: 3.
        stride (int | list | tuple): Stride, expanded like ``kernel_size``.
            Default: 1.
        padding (int | list | tuple): Padding, expanded like
            ``kernel_size``. Default: 0.
        dilation (int | list | tuple): Dilation, expanded like
            ``kernel_size``. Per dimension either the stride or the dilation
            has to be 1. Default: 1.
        groups (int): Only 1 is accepted. Default: 1.
        bias (bool): Whether to create a bias parameter. Default: True.
        subm (bool): Submanifold mode; the output keeps the input spatial
            shape. Default: False.
        output_padding (int | list | tuple): Output padding, expanded like
            ``kernel_size``. Default: 0.
        transposed (bool): Transposed convolution. Default: False.
        inverse (bool): Reuse the indice pairs stored under ``indice_key``
            by an earlier layer. Default: False.
        indice_key (optional): Key used to look up and store indice pairs in
            the ``indice_dict`` of the input tensor. Default: None.
        fused_bn (bool): Route the computation through
            ``ops.fused_indice_conv``, which receives the bias directly.
            Default: False.
    """

    def __init__(self,
                 ndim,
                 in_channels,
                 out_channels,
                 kernel_size=3,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 subm=False,
                 output_padding=0,
                 transposed=False,
                 inverse=False,
                 indice_key=None,
                 fused_bn=False):
        super().__init__()
        assert groups == 1

        def _per_dim(value):
            # Lists and tuples are taken as given; anything else is repeated
            # once per spatial dimension.
            if isinstance(value, (list, tuple)):
                return value
            return [value] * ndim

        kernel_size = _per_dim(kernel_size)
        stride = _per_dim(stride)
        padding = _per_dim(padding)
        dilation = _per_dim(dilation)
        output_padding = _per_dim(output_padding)

        for dim_dilation, dim_stride in zip(dilation, stride):
            assert dim_stride == 1 or dim_dilation == 1, "don't support this."

        # Channel configuration.
        self.ndim = ndim
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.groups = groups

        # Geometry of the kernel.
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.output_padding = output_padding
        # A kernel with a single position reduces to a matrix multiplication.
        self.conv1x1 = np.prod(kernel_size) == 1

        # Operating mode.
        self.subm = subm
        self.transposed = transposed
        self.inverse = inverse
        self.indice_key = indice_key
        self.fused_bn = fused_bn

        # The weight is stored as (*kernel_size, in_channels, out_channels).
        self.weight = Parameter(
            torch.Tensor(*kernel_size, in_channels, out_channels))
        if bias:
            self.bias = Parameter(torch.Tensor(out_channels))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        """Initialize the weight and, when present, the bias."""
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is None:
            return
        fan_in, _ = _calculate_fan_in_and_fan_out_hwio(self.weight)
        bound = 1 / math.sqrt(fan_in)
        init.uniform_(self.bias, -bound, bound)

    @staticmethod
    def _build_output(input, features, indices, spatial_shape, batch_size):
        """Wrap ``features`` in a new tensor sharing the caches of ``input``."""
        out_tensor = SparseConvTensor(features, indices, spatial_shape,
                                      batch_size)
        out_tensor.indice_dict = input.indice_dict
        out_tensor.grid = input.grid
        return out_tensor

    def forward(self, input):
        """Apply the convolution.

        Args:
            input (SparseConvTensor): Input sparse tensor.

        Returns:
            SparseConvTensor: Output tensor. It shares ``indice_dict`` and
            ``grid`` with ``input``.
        """
        assert isinstance(input, SparseConvTensor)
        features = input.features
        device = features.device
        indices = input.indices
        spatial_shape = input.spatial_shape
        batch_size = input.batch_size

        if self.subm:
            out_spatial_shape = spatial_shape
        elif self.transposed:
            out_spatial_shape = ops.get_deconv_output_size(
                spatial_shape, self.kernel_size, self.stride, self.padding,
                self.dilation, self.output_padding)
        else:
            out_spatial_shape = ops.get_conv_output_size(
                spatial_shape, self.kernel_size, self.stride, self.padding,
                self.dilation)

        if self.conv1x1:
            # Single kernel position: a plain matrix product on the features,
            # indices and spatial shape are passed through unchanged.
            flat_weight = self.weight.view(self.in_channels,
                                           self.out_channels)
            features = torch.mm(input.features, flat_weight)
            if self.bias is not None:
                features += self.bias
            return self._build_output(input, features, input.indices,
                                      input.spatial_shape, input.batch_size)

        data = input.find_indice_pair(self.indice_key)
        if self.inverse:
            # The inverse layer depends on the pairs of its partner layer and
            # restores the spatial shape recorded with them.
            assert data is not None and self.indice_key is not None
            _, outids, indice_pairs, indice_pair_num, out_spatial_shape = data
            assert indice_pairs.shape[0] == np.prod(
                self.kernel_size
            ), 'inverse conv must have same kernel size as its couple conv'
        elif self.indice_key is not None and data is not None:
            # Reuse pairs computed by an earlier layer with the same key.
            outids, _, indice_pairs, indice_pair_num, _ = data
        else:
            outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
                indices,
                batch_size,
                spatial_shape,
                self.kernel_size,
                self.stride,
                self.padding,
                self.dilation,
                self.output_padding,
                self.subm,
                self.transposed,
                grid=input.grid)
            input.indice_dict[self.indice_key] = (outids, indices,
                                                  indice_pairs,
                                                  indice_pair_num,
                                                  spatial_shape)

        if self.fused_bn:
            assert self.bias is not None
            out_features = ops.fused_indice_conv(
                features, self.weight, self.bias, indice_pairs.to(device),
                indice_pair_num, outids.shape[0], self.inverse, self.subm)
        else:
            if self.subm:
                conv_fn = Fsp.indice_subm_conv
            elif self.inverse:
                conv_fn = Fsp.indice_inverse_conv
            else:
                conv_fn = Fsp.indice_conv
            out_features = conv_fn(features, self.weight,
                                   indice_pairs.to(device), indice_pair_num,
                                   outids.shape[0])
            if self.bias is not None:
                out_features += self.bias

        return self._build_output(input, out_features, outids,
                                  out_spatial_shape, batch_size)


@MODELS.register_module()
class SparseConv2d(SparseConvolution):
    """Sparse convolution over 2 spatial dimensions.

    Fixes ``ndim=2`` and hands every other argument to
    :class:`SparseConvolution`.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 indice_key=None):
        super().__init__(
            ndim=2,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            indice_key=indice_key)


@MODELS.register_module()
class SparseConv3d(SparseConvolution):
    """Sparse convolution over 3 spatial dimensions.

    Fixes ``ndim=3`` and hands every other argument to
    :class:`SparseConvolution`.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 indice_key=None):
        super().__init__(
            ndim=3,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            indice_key=indice_key)


@MODELS.register_module()
class SparseConv4d(SparseConvolution):
    """Sparse convolution over 4 spatial dimensions.

    Fixes ``ndim=4`` and hands every other argument to
    :class:`SparseConvolution`.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 indice_key=None):
        super().__init__(
            ndim=4,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            indice_key=indice_key)


@MODELS.register_module()
class SparseConvTranspose2d(SparseConvolution):
    """Transposed sparse convolution over 2 spatial dimensions.

    Fixes ``ndim=2`` and ``transposed=True`` and hands every other argument
    to :class:`SparseConvolution`.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 indice_key=None):
        super().__init__(
            ndim=2,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            transposed=True,
            indice_key=indice_key)


@MODELS.register_module()
class SparseConvTranspose3d(SparseConvolution):
    """Transposed sparse convolution over 3 spatial dimensions.

    Fixes ``ndim=3`` and ``transposed=True`` and hands every other argument
    to :class:`SparseConvolution`.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 indice_key=None):
        super().__init__(
            ndim=3,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            transposed=True,
            indice_key=indice_key)


@MODELS.register_module()
class SparseInverseConv2d(SparseConvolution):
    """Inverse sparse convolution over 2 spatial dimensions.

    Fixes ``ndim=2`` and ``inverse=True``. In that mode
    :class:`SparseConvolution` requires indice pairs to be already stored
    under ``indice_key``. Stride, padding and dilation keep their base-class
    defaults.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 indice_key=None,
                 bias=True):
        super().__init__(
            ndim=2,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            bias=bias,
            inverse=True,
            indice_key=indice_key)


@MODELS.register_module()
class SparseInverseConv3d(SparseConvolution):
    """Inverse sparse convolution over 3 spatial dimensions.

    Fixes ``ndim=3`` and ``inverse=True``. In that mode
    :class:`SparseConvolution` requires indice pairs to be already stored
    under ``indice_key``. Stride, padding and dilation keep their base-class
    defaults.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 indice_key=None,
                 bias=True):
        super().__init__(
            ndim=3,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            bias=bias,
            inverse=True,
            indice_key=indice_key)


@MODELS.register_module()
class SubMConv2d(SparseConvolution):
    """Submanifold sparse convolution over 2 spatial dimensions.

    Fixes ``ndim=2`` and ``subm=True`` and hands every other argument to
    :class:`SparseConvolution`.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 indice_key=None):
        super().__init__(
            ndim=2,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            subm=True,
            indice_key=indice_key)


@MODELS.register_module()
class SubMConv3d(SparseConvolution):
    """Submanifold sparse convolution over 3 spatial dimensions.

    Fixes ``ndim=3`` and ``subm=True`` and hands every other argument to
    :class:`SparseConvolution`.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 indice_key=None):
        super().__init__(
            ndim=3,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            subm=True,
            indice_key=indice_key)


@MODELS.register_module()
class SubMConv4d(SparseConvolution):
    """Submanifold sparse convolution over 4 spatial dimensions.

    Fixes ``ndim=4`` and ``subm=True`` and hands every other argument to
    :class:`SparseConvolution`.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 indice_key=None):
        super().__init__(
            ndim=4,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            subm=True,
            indice_key=indice_key)


================================================
FILE: mmcv/ops/sparse_functional.py
================================================
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any

import torch
from torch.autograd import Function

from . import sparse_ops as ops


class SparseConvFunction(Function):
    """Autograd function behind the regular sparse convolution.

    Please refer to `SECOND <https://www.mdpi.com/1424-8220/18/10/3337>`_ for
    more details.
    """

    @staticmethod
    def forward(ctx: Any, features: torch.Tensor, filters: torch.nn.Parameter,
                indice_pairs: torch.Tensor, indice_pair_num: torch.Tensor,
                num_activate_out: torch.Tensor) -> torch.Tensor:
        """Run ``ops.indice_conv`` and remember its inputs for backward.

        Args:
            features (torch.Tensor): Input features to convolve.
            filters (torch.nn.parameter.Parameter): Convolution filters.
            indice_pairs (torch.Tensor): Pairs linking input locations to
                output locations.
            indice_pair_num (torch.Tensor): Number of indice pairs.
            num_activate_out: Number of active outputs, forwarded unchanged
                to ``ops.indice_conv``.

        Returns:
            torch.Tensor: Output features from gather-gemm-scatter.
        """
        ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
        out_features = ops.indice_conv(features, filters, indice_pairs,
                                       indice_pair_num, num_activate_out,
                                       False)
        return out_features

    @staticmethod
    def backward(ctx: Any, grad_output: torch.Tensor) -> tuple:
        """Return gradients for ``features`` and ``filters``.

        The three index/count arguments of ``forward`` get ``None``.
        """
        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
        grads = ops.indice_conv_backward(features, filters, grad_output,
                                         indice_pairs, indice_pair_num, False)
        input_bp, filters_bp = grads
        return input_bp, filters_bp, None, None, None


class SparseInverseConvFunction(Function):
    """Autograd function behind the inverse sparse convolution."""

    @staticmethod
    def forward(ctx: Any, features: torch.Tensor, filters: torch.nn.Parameter,
                indice_pairs: torch.Tensor, indice_pair_num: torch.Tensor,
                num_activate_out: torch.Tensor) -> torch.Tensor:
        """Run ``ops.indice_conv`` and remember its inputs for backward.

        Args:
            features (torch.Tensor): Input features to convolve.
            filters (torch.nn.parameter.Parameter): Convolution filters.
            indice_pairs (torch.Tensor): Pairs linking input locations to
                output locations.
            indice_pair_num (torch.Tensor): Number of indice pairs.
            num_activate_out: Number of active outputs, forwarded unchanged
                to ``ops.indice_conv``.

        Returns:
            torch.Tensor: Output features from gather-gemm-scatter.
        """
        ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
        # The two trailing flags are presumably (inverse, subm) - confirm
        # against ``ops.indice_conv``.
        out_features = ops.indice_conv(features, filters, indice_pairs,
                                       indice_pair_num, num_activate_out,
                                       True, False)
        return out_features

    @staticmethod
    def backward(ctx: Any, grad_output: torch.Tensor) -> tuple:
        """Return gradients for ``features`` and ``filters``.

        The three index/count arguments of ``forward`` get ``None``.
        """
        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
        grads = ops.indice_conv_backward(features, filters, grad_output,
                                         indice_pairs, indice_pair_num, True,
                                         False)
        input_bp, filters_bp = grads
        return input_bp, filters_bp, None, None, None


class SubMConvFunction(Function):
    """Autograd function behind the submanifold sparse convolution."""

    @staticmethod
    def forward(ctx: Any, features: torch.Tensor, filters: torch.nn.Parameter,
                indice_pairs: torch.Tensor, indice_pair_num: torch.Tensor,
                num_activate_out: torch.Tensor) -> torch.Tensor:
        """Run ``ops.indice_conv`` and remember its inputs for backward.

        Args:
            features (torch.Tensor): Input features to convolve.
            filters (torch.nn.parameter.Parameter): Convolution filters.
            indice_pairs (torch.Tensor): Pairs linking input locations to
                output locations.
            indice_pair_num (torch.Tensor): Number of indice pairs.
            num_activate_out: Number of active outputs, forwarded unchanged
                to ``ops.indice_conv``.

        Returns:
            torch.Tensor: Output features from gather-gemm-scatter.
        """
        ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
        # The two trailing flags are presumably (inverse, subm) - confirm
        # against ``ops.indice_conv``.
        out_features = ops.indice_conv(features, filters, indice_pairs,
                                       indice_pair_num, num_activate_out,
                                       False, True)
        return out_features

    @staticmethod
    def backward(ctx: Any, grad_output: torch.Tensor) -> tuple:
        """Return gradients for ``features`` and ``filters``.

        The three index/count arguments of ``forward`` get ``None``.
        """
        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
        grads = ops.indice_conv_backward(features, filters, grad_output,
                                         indice_pairs, indice_pair_num, False,
                                         True)
        input_bp, filters_bp = grads
        return input_bp, filters_bp, None, None, None


class SparseMaxPoolFunction(Function):
    """Autograd function behind sparse max pooling."""

    @staticmethod
    def forward(ctx, features: torch.Tensor, indice_pairs: torch.Tensor,
                indice_pair_num: torch.Tensor,
                num_activate_out: torch.Tensor) -> torch.Tensor:
        """Run ``ops.indice_maxpool`` and remember what backward needs.

        Args:
            features (torch.Tensor): Input features to pool.
            indice_pairs (torch.Tensor): Pairs linking input locations to
                output locations.
            indice_pair_num (torch.Tensor): Number of indice pairs.
            num_activate_out: Number of active outputs, forwarded unchanged
                to ``ops.indice_maxpool``.

        Returns:
            torch.Tensor: Output features from sparse maxpooling.
        """
        pooled = ops.indice_maxpool(features, indice_pairs, indice_pair_num,
                                    num_activate_out)
        # The pooled output is saved as well: backward passes it on to
        # ``ops.indice_maxpool_backward`` together with the input features.
        ctx.save_for_backward(indice_pairs, indice_pair_num, features, pooled)
        return pooled

    @staticmethod
    def backward(ctx: Any, grad_output: torch.Tensor) -> tuple:
        """Return the gradient for ``features``.

        The three index/count arguments of ``forward`` get ``None``.
        """
        indice_pairs, indice_pair_num, features, pooled = ctx.saved_tensors
        input_bp = ops.indice_maxpool_backward(features, pooled, grad_output,
                                               indice_pairs, indice_pair_num)
        return input_bp, None, None, None


# Functional aliases: each name is the ``apply`` entry point of the autograd
# Function class named on the right-hand side.
indice_conv = SparseConvFunction.apply
indice_inverse_conv = SparseInverseConvFunction.apply
indice_subm_conv = SubMConvFunction.apply
indice_maxpool = SparseMaxPoolFunction.apply


================================================
FILE: mmcv/ops/sparse_modules.py
================================================
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from collections import OrderedDict
from typing import Any, List, Optional, Union

import torch
from torch import nn

from .sparse_structure import SparseConvTensor


def is_spconv_module(module: nn.Module) -> bool:
    """Return whether ``module`` is an instance of :class:`SparseModule`."""
    return isinstance(module, SparseModule)


def is_sparse_conv(module: nn.Module) -> bool:
    """Return whether ``module`` is a ``SparseConvolution`` instance."""
    # Imported at call time: ``sparse_conv`` itself imports from this module.
    from . import sparse_conv
    return isinstance(module, sparse_conv.SparseConvolution)


def _mean_update(vals: Union[int, List], m_vals: Union[int, List],
                 t: float) -> List:
    outputs = []
    if not isinstance(vals, list):
        vals = [vals]
    if not isinstance(m_vals, list):
        m_vals = [m_vals]
    for val, m_val in zip(vals, m_vals):
        output = t / float(t + 1) * m_val + 1 / float(t + 1) * val
        outputs.append(output)
    if len(outputs) == 1:
        outputs = outputs[0]
    return outputs


class SparseModule(nn.Module):
    """Marker base class without behaviour of its own.

    ``SparseSequential`` hands the whole sparse tensor to modules deriving
    from this class instead of only its features.
    """


class SparseSequential(SparseModule):
    r"""A sequential container. Modules will be added to it in the order they
    are passed in the constructor. Alternatively, an ordered dict of modules
    can also be passed in.

    Modules deriving from ``SparseModule`` receive the sparse tensor itself.
    Any other module is applied to the ``features`` of the sparse tensor, or
    to the input directly once it is no longer a ``SparseConvTensor``.

    Example:
        >>> # using Sequential:
        >>> from mmcv.ops import SparseSequential
        >>> model = SparseSequential(
        ...     SparseConv2d(1,20,5),
        ...     nn.ReLU(),
        ...     SparseConv2d(20,64,5),
        ...     nn.ReLU()
        ... )

        >>> # using Sequential with OrderedDict
        >>> model = SparseSequential(OrderedDict([
        ...     ('conv1', SparseConv2d(1,20,5)),
        ...     ('relu1', nn.ReLU()),
        ...     ('conv2', SparseConv2d(20,64,5)),
        ...     ('relu2', nn.ReLU())
        ... ]))

        >>> # using Sequential with kwargs(python 3.6+)
        >>> model = SparseSequential(
        ...     conv1=SparseConv2d(1,20,5),
        ...     relu1=nn.ReLU(),
        ...     conv2=SparseConv2d(20,64,5),
        ...     relu2=nn.ReLU()
        ... )
    """

    def __init__(self, *args, **kwargs):
        super().__init__()
        if len(args) == 1 and isinstance(args[0], OrderedDict):
            for key, module in args[0].items():
                self.add_module(key, module)
        else:
            for idx, module in enumerate(args):
                self.add_module(str(idx), module)
        for name, module in kwargs.items():
            if sys.version_info < (3, 6):
                raise ValueError('kwargs only supported in py36+')
            if name in self._modules:
                raise ValueError('name exists.')
            self.add_module(name, module)
        # Sparsity of the input seen by each sparse module, keyed by name.
        self._sparity_dict = {}

    def __getitem__(self, idx: int) -> nn.Module:
        """Return the ``idx``-th module; negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        if not (-len(self) <= idx < len(self)):
            raise IndexError(f'index {idx} is out of range')
        return list(self._modules.values())[idx]

    def __len__(self):
        return len(self._modules)

    @property
    def sparity_dict(self):
        """dict: Input sparsity recorded per sparse module in ``forward``."""
        return self._sparity_dict

    def add(self, module: Any, name: Optional[str] = None) -> None:
        """Append ``module``, naming it by its position when no name is given.

        Raises:
            KeyError: If the generated positional name is already taken.
        """
        if name is None:
            name = str(len(self._modules))
            if name in self._modules:
                raise KeyError('name exists')
        self.add_module(name, module)

    def forward(
        self, input: Union[torch.Tensor, SparseConvTensor]
    ) -> Union[torch.Tensor, SparseConvTensor]:
        """Run the contained modules in order.

        Args:
            input (SparseConvTensor | torch.Tensor): Input of the first
                module.

        Returns:
            SparseConvTensor | torch.Tensor: Output of the last module.
        """
        for k, module in self._modules.items():
            if is_spconv_module(module):
                assert isinstance(input, SparseConvTensor)
                self._sparity_dict[k] = input.sparity
                input = module(input)
            else:
                if isinstance(input, SparseConvTensor):
                    # Dense modules are skipped when there are no active
                    # locations to process.
                    if input.indices.shape[0] != 0:
                        input.features = module(input.features)
                else:
                    input = module(input)
        return input

    def fused(self):
        """Return a new container with conv + BatchNorm1d pairs folded.

        Every sparse convolution directly followed by ``nn.BatchNorm1d`` is
        replaced by a single ``SparseConvolution`` with ``fused_bn=True``
        whose weight and bias absorb the running statistics and the affine
        parameters of the batch norm. All other modules are reused as they
        are. The returned container names its modules by position.

        Returns:
            SparseSequential: The fused container.
        """
        from .sparse_conv import SparseConvolution
        mods = list(self._modules.values())
        fused_mods = []
        idx = 0
        while idx < len(mods):
            mod = mods[idx]
            followed_by_bn = idx + 1 < len(mods) and isinstance(
                mods[idx + 1], nn.BatchNorm1d)
            if not (is_sparse_conv(mod) and followed_by_bn):
                fused_mods.append(mod)
                idx += 1
                continue

            bn = mods[idx + 1]
            conv = SparseConvolution(
                ndim=mod.ndim,
                in_channels=mod.in_channels,
                out_channels=mod.out_channels,
                kernel_size=mod.kernel_size,
                stride=mod.stride,
                padding=mod.padding,
                dilation=mod.dilation,
                groups=mod.groups,
                bias=True,
                subm=mod.subm,
                output_padding=mod.output_padding,
                transposed=mod.transposed,
                inverse=mod.inverse,
                indice_key=mod.indice_key,
                fused_bn=True,
            )
            conv.load_state_dict(mod.state_dict(), False)
            conv.to(mod.weight.device)
            if mod.bias is None:
                # Nothing was loaded into the freshly initialized bias, so it
                # starts from zero. A bias of the original conv is kept and
                # folded below.
                conv.bias.data.zero_()
            # Batch norm divides by sqrt(var + eps); eps belongs inside the
            # square root for the fused layer to match conv followed by bn.
            scale = bn.weight.data / torch.sqrt(bn.running_var + bn.eps)
            # The weight ends in out_channels, so the scale broadcasts over
            # the last dimension.
            conv.weight.data[:] = conv.weight.data * scale
            conv.bias.data[:] = (conv.bias.data -
                                 bn.running_mean) * scale + bn.bias.data
            fused_mods.append(conv)
            idx += 2
        return SparseSequential(*fused_mods)


class ToDense(SparseModule):
    """Convert SparseConvTensor to NCHW dense tensor."""

    def forward(self, x: SparseConvTensor):
        """Return the result of ``x.dense()``."""
        dense_tensor = x.dense()
        return dense_tensor


class RemoveGrid(SparseModule):
    """Remove pre-allocated grid buffer."""

    def forward(self, x: SparseConvTensor):
        """Set ``x.grid`` to ``None`` in place and return the same ``x``."""
        setattr(x, 'grid', None)
        return x


================================================
FILE: mmcv/ops/sparse_ops.py
================================================
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch

from ..utils import ext_loader

# Handle to the compiled ``_ext`` extension. The list names the ops that the
# functions in this module call on it (presumably ``load_ext`` checks that
# they exist -- confirm in ``ext_loader``). Note that no 4d "backward"
# indice-pair op is listed.
ext_module = ext_loader.load_ext('_ext', [
    'get_indice_pairs_2d_forward', 'get_indice_pairs_3d_forward',
    'get_indice_pairs_4d_forward', 'get_indice_pairs_2d_backward',
    'get_indice_pairs_3d_backward', 'indice_conv_forward',
    'indice_conv_backward', 'fused_indice_conv_forward',
    'indice_maxpool_forward', 'indice_maxpool_backward'
])


def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
    """Compute the per-axis output extent of a convolution.

    All arguments are indexable sequences with one entry per spatial axis.
    An axis whose kernel size is ``-1`` always yields an output extent of 1.

    Returns:
        list: The output extent of every axis, in input order.
    """
    extents = []
    for axis, in_extent in enumerate(input_size):
        k = kernel_size[axis]
        # Receptive span covered by a dilated kernel, minus one element.
        span = dilation[axis] * (k - 1)
        regular = (in_extent + 2 * padding[axis] - span - 1) // stride[axis] + 1
        extents.append(1 if k == -1 else regular)
    return extents


def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,
                           output_padding):
    """Compute the per-axis output extent of a transposed convolution.

    All arguments are indexable sequences with one entry per spatial axis.
    ``dilation`` is accepted but does not enter the computation.

    Returns:
        list: The output extent of every axis, in input order.

    Raises:
        ValueError: If any axis has a kernel size of ``-1``.
    """
    extents = []
    for axis, in_extent in enumerate(input_size):
        k = kernel_size[axis]
        if k == -1:
            raise ValueError("deconv don't support kernel_size < 0")
        upsampled = (in_extent - 1) * stride[axis]
        extents.append(upsampled - 2 * padding[axis] + k +
                       output_padding[axis])
    return extents


def get_indice_pairs(indices,
                     batch_size,
                     spatial_shape,
                     ksize=3,
                     stride=1,
                     padding=0,
                     dilation=1,
                     out_padding=0,
                     subm=False,
                     transpose=False,
                     grid=None):
    """Dispatch to the extension op that builds sparse-conv indice pairs.

    The number of spatial dims is taken as ``indices.shape[1] - 1``. Scalar
    ``ksize``/``stride``/``padding``/``dilation``/``out_padding`` values are
    repeated once per spatial dim; lists and tuples are used as given.

    The output shape equals ``spatial_shape`` when ``subm`` is true;
    otherwise it comes from ``get_deconv_output_size`` (``transpose``) or
    ``get_conv_output_size``.

    Without ``grid`` the ``*_forward`` ops (2d/3d/4d) are used; with ``grid``
    the ``*_backward`` ops (2d/3d only) are used and ``grid`` is passed as
    the second argument.

    Returns:
        Whatever the selected extension op returns.

    Raises:
        AssertionError: If some axis has both stride and dilation != 1.
        NotImplementedError: If no op exists for this number of dims.
    """
    ndim = indices.shape[1] - 1

    def _per_axis(value):
        # Repeat a scalar setting once for every spatial axis.
        if isinstance(value, (list, tuple)):
            return value
        return [value] * ndim

    ksize = _per_axis(ksize)
    stride = _per_axis(stride)
    padding = _per_axis(padding)
    dilation = _per_axis(dilation)
    out_padding = _per_axis(out_padding)

    for axis_dilation, axis_stride in zip(dilation, stride):
        assert any([axis_stride == 1,
                    axis_dilation == 1]), "don't support this."

    if subm:
        out_shape = spatial_shape
    elif transpose:
        out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
                                           padding, dilation, out_padding)
    else:
        out_shape = get_conv_output_size(spatial_shape, ksize, stride,
                                         padding, dilation)

    if grid is None:
        supported_dims = (2, 3, 4)
        suffix = 'forward'
        leading_args = (indices, )
    else:
        supported_dims = (2, 3)
        suffix = 'backward'
        leading_args = (indices, grid)

    if ndim not in supported_dims:
        raise NotImplementedError

    # Only the op that is actually needed is looked up on the extension.
    pair_op = getattr(ext_module, f'get_indice_pairs_{ndim}d_{suffix}')
    return pair_op(*leading_args, batch_size, out_shape, spatial_shape, ksize,
                   stride, padding, dilation, out_padding, int(subm),
                   int(transpose))


def indice_conv(features,
                filters,
                indice_pairs,
                indice_pair_num,
                num_activate_out,
                inverse=False,
                subm=False):
    """Call the extension's ``indice_conv_forward`` op.

    Only ``float32`` and ``half`` filters are accepted; ``inverse`` and
    ``subm`` are handed to the op as integers.

    Raises:
        NotImplementedError: For any other ``filters.dtype``.
    """
    if filters.dtype not in (torch.float32, torch.half):
        raise NotImplementedError
    return ext_module.indice_conv_forward(features, filters, indice_pairs,
                                          indice_pair_num, num_activate_out,
                                          int(inverse), int(subm))


def fused_indice_conv(features, filters, bias, indice_pairs, indice_pair_num,
                      num_activate_out, inverse, subm):
    """Call the extension's ``fused_indice_conv_forward`` op.

    The op is used when ``features`` is ``half`` or ``filters`` is
    ``float32``; ``inverse`` and ``subm`` are handed to it as integers.

    Raises:
        NotImplementedError: If neither dtype condition holds.
    """
    # Fixed: the attribute is ``dtype``. The former ``filters.dtypes`` raised
    # AttributeError whenever ``features`` was not half precision.
    if features.dtype == torch.half or filters.dtype == torch.float32:
        func = ext_module.fused_indice_conv_forward
    else:
        raise NotImplementedError

    return func(features, filters, bias, indice_pairs, indice_pair_num,
                num_activate_out, int(inverse), int(subm))


def indice_conv_backward(features,
                         filters,
                         out_bp,
                         indice_pairs,
                         indice_pair_num,
                         inverse=False,
                         subm=False):
    """Call the extension's ``indice_conv_backward`` op.

    Only ``float32`` and ``half`` filters are accepted; ``inverse`` and
    ``subm`` are handed to the op as integers.

    Raises:
        NotImplementedError: For any other ``filters.dtype``.
    """
    supported = filters.dtype in (torch.float32, torch.half)
    if not supported:
        raise NotImplementedError
    return ext_module.indice_conv_backward(features, filters, out_bp,
                                           indice_pairs, indice_pair_num,
                                           int(inverse), int(subm))


def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
    """Call the extension's ``indice_maxpool_forward`` op.

    Raises:
        NotImplementedError: If ``features`` is neither ``float32`` nor
            ``half``.
    """
    if features.dtype not in (torch.float32, torch.half):
        raise NotImplementedError
    return ext_module.indice_maxpool_forward(features, indice_pairs,
                                             indice_pair_num,
                                             num_activate_out)


def indice_maxpool_backward(features, out_features, out_bp, indice_pairs,
                            indice_pair_num):
    """Call the extension's ``indice_maxpool_backward`` op.

    Raises:
        NotImplementedError: If ``features`` is neither ``float32`` nor
            ``half``.
    """
    dtype_ok = features.dtype in (torch.float32, torch.half)
    if not dtype_ok:
        raise NotImplementedError
    return ext_module.indice_maxpool_backward(features, out_features, out_bp,
                                              indice_pairs, indice_pair_num)


================================================
FILE: mmcv/ops/sparse_pool.py
================================================
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# import sparse_functional as Fsp
# import sparse_ops as ops
from .sparse_functional import indice_maxpool
from .sparse_modules import SparseModule
from .sparse_ops import get_conv_output_size, get_indice_pairs
from .sparse_structure import SparseConvTensor


class SparseMaxPool(SparseModule):
    """Max pooling for ``SparseConvTensor`` inputs.

    Args:
        ndim: Number of spatial dimensions.
        kernel_size: Scalar, or list/tuple with one entry per dimension.
        stride: Scalar, or list/tuple with one entry per dimension.
        padding: Scalar, or list/tuple with one entry per dimension.
        dilation: Scalar, or list/tuple with one entry per dimension.
        subm: When true the output keeps the input spatial shape.
    """

    def __init__(self,
                 ndim,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 subm=False):
        super().__init__()

        def _per_axis(value):
            # Repeat a scalar setting once for every spatial dimension.
            if isinstance(value, (list, tuple)):
                return value
            return [value] * ndim

        self.ndim = ndim
        self.kernel_size = _per_axis(kernel_size)
        self.stride = _per_axis(stride)
        self.padding = _per_axis(padding)
        self.subm = subm
        self.dilation = _per_axis(dilation)

    def forward(self, input):
        """Pool ``input`` and return a new ``SparseConvTensor``.

        The result shares ``indice_dict`` and ``grid`` with ``input``.
        """
        assert isinstance(input, SparseConvTensor)
        feats = input.features
        feat_device = feats.device
        in_shape = input.spatial_shape
        num_batches = input.batch_size

        if self.subm:
            pooled_shape = in_shape
        else:
            pooled_shape = get_conv_output_size(in_shape, self.kernel_size,
                                                self.stride, self.padding,
                                                self.dilation)

        out_indices, pairs, pair_counts = get_indice_pairs(
            input.indices, num_batches, in_shape, self.kernel_size,
            self.stride, self.padding, self.dilation, 0, self.subm)

        # The pair tensors are moved onto the feature device before pooling.
        pooled_feats = indice_maxpool(feats, pairs.to(feat_device),
                                      pair_counts.to(feat_device),
                                      out_indices.shape[0])

        result = SparseConvTensor(pooled_feats, out_indices, pooled_shape,
                                  num_batches)
        result.indice_dict = input.indice_dict
        result.grid = input.grid
        return result


class SparseMaxPool2d(SparseMaxPool):
    """``SparseMaxPool`` with the number of spatial dimensions fixed to 2.

    ``subm`` is not forwarded, so the parent's default applies.
    """

    def __init__(self, kernel_size, stride=1, padding=0, dilation=1):
        super().__init__(2, kernel_size, stride, padding, dilation)


class SparseMaxPool3d(SparseMaxPool):
    """``SparseMaxPool`` with the number of spatial dimensions fixed to 3.

    ``subm`` is not forwarded, so the parent's default applies.
    """

    def __init__(self, kernel_size, stride=1, padding=0, dilation=1):
        super().__init__(3, kernel_size, stride, padding, dilation)


================================================
FILE: mmcv/ops/sparse_structure.py
================================================
from typing import List, Optional, Tuple, Union

import numpy as np
import torch


def scatter_nd(indices: torch.Tensor, updates: torch.Tensor,
               shape: Union[List[int], Tuple[int, ...]]) -> torch.Tensor:
    """Pytorch edition of tensorflow scatter_nd.

    A zero tensor of size ``shape`` (dtype and device taken from
    ``updates``) is created and ``updates`` is written at the positions
    given by the last axis of ``indices``.

    Repeated indices are not handled: unlike tensorflow, values are not
    accumulated, so use this carefully when an index occurs more than once.

    Args:
        indices (torch.Tensor): Integer tensor whose last axis indexes the
            leading ``indices.shape[-1]`` dims of the output.
        updates (torch.Tensor): Values to scatter.
        shape (list | tuple): Size of the output tensor.

    Returns:
        torch.Tensor: The scattered tensor of size ``shape``.
    """
    ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
    ndim = indices.shape[-1]
    # ``list(...)`` lets tuple / ``torch.Size`` shapes work as well as lists.
    output_shape = list(indices.shape[:-1]) + list(shape[ndim:])
    flatted_indices = indices.view(-1, ndim)
    # Index with a tuple: indexing with a plain list of tensors is
    # deprecated in newer PyTorch releases.
    slices = tuple(flatted_indices[:, i] for i in range(ndim)) + (Ellipsis, )
    ret[slices] = updates.view(*output_shape)
    return ret


class SparseConvTensor:
    """Container bundling sparse features with their indices.

    Args:
        features (torch.Tensor): Feature rows; ``dense()`` uses
            ``features.shape[1]`` as the channel count.
        indices (torch.Tensor): One index row per feature row. Stored as
            ``int32``; other dtypes are converted on construction.
        spatial_shape (list | tuple): Extent of every spatial dimension.
        batch_size (int): Number of samples in the batch.
        grid (torch.Tensor, optional): Pre-allocated grid buffer.
            Defaults to None.
    """

    def __init__(self,
                 features: torch.Tensor,
                 indices: torch.Tensor,
                 spatial_shape: Union[List, Tuple],
                 batch_size: int,
                 grid: Optional[torch.Tensor] = None):
        self.features = features
        self.indices = indices
        if self.indices.dtype != torch.int32:
            # ``Tensor.int()`` returns a converted copy, so the result must
            # be assigned back; the bare call used to be a no-op.
            self.indices = self.indices.int()
        self.spatial_shape = spatial_shape
        self.batch_size = batch_size
        # Cache consulted by ``find_indice_pair``.
        self.indice_dict: dict = {}
        self.grid = grid

    @property
    def spatial_size(self):
        """Product of all entries of ``spatial_shape``."""
        return np.prod(self.spatial_shape)

    def find_indice_pair(self, key):
        """Return the cached entry for ``key``.

        ``None`` is returned when ``key`` is ``None`` or not cached.
        """
        if key is None:
            return None
        return self.indice_dict.get(key)

    def dense(self, channels_first: bool = True) -> torch.Tensor:
        """Scatter the features into a dense tensor.

        Args:
            channels_first (bool): If True the channel axis is moved right
                after the batch axis; otherwise it stays last.
                Defaults to True.

        Returns:
            torch.Tensor: Dense tensor of size ``(batch, C, *spatial)`` or
            ``(batch, *spatial, C)``.
        """
        output_shape = [self.batch_size] + list(
            self.spatial_shape) + [self.features.shape[1]]
        res = scatter_nd(self.indices.long(), self.features, output_shape)
        if not channels_first:
            return res
        ndim = len(self.spatial_shape)
        # Permutation (0, ndim + 1, 1, ..., ndim): channels move to axis 1.
        trans_params = list(range(0, ndim + 1))
        trans_params.insert(1, ndim + 1)
        return res.permute(*trans_params).contiguous()

    @property
    def sparity(self):
        """Number of index rows divided by ``spatial_size * batch_size``."""
        return (self.indices.shape[0] / np.prod(self.spatial_shape) /
                self.batch_size)


================================================
FILE: mmcv/ops/sync_bn.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional

import torch
import torch.distributed as dist
import torch.nn.functional as F
from mmengine.device import is_cuda_available, is_musa_available
from mmengine.registry import MODELS
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.module import Module
from torch.nn.parameter import Parameter

from ..utils import ext_loader

# Handle to the compiled ``_ext`` extension; the list names the sync-BN ops
# called by ``SyncBatchNormFunction`` (presumably checked by ``load_ext`` --
# confirm in ``ext_loader``).
ext_module = ext_loader.load_ext('_ext', [
    'sync_bn_forward_mean', 'sync_bn_forward_var', 'sync_bn_forward_output',
    'sync_bn_backward_param', 'sync_bn_backward_data'
])


class SyncBatchNormFunction(Function):
    """Autograd function behind ``SyncBatchNorm``.

    The per-channel mean and variance are computed by extension kernels and,
    when ``group_size > 1``, summed across the process group with
    ``dist.all_reduce`` before the output is normalized. Empty inputs (batch
    size 0) skip the kernels but still take part in the collective calls.
    """

    @staticmethod
    def symbolic(g, input, running_mean, running_var, weight, bias, momentum,
                 eps, group, group_size, stats_mode):
        """Emit the ``mmcv::MMCVSyncBatchNorm`` op for graph export."""
        # NOTE(review): ``stats_mode`` is passed without a type suffix,
        # unlike the other attributes (``_f`` / ``_i``) -- confirm that the
        # exporter accepts this.
        return g.op(
            'mmcv::MMCVSyncBatchNorm',
            input,
            running_mean,
            running_var,
            weight,
            bias,
            momentum_f=momentum,
            eps_f=eps,
            group_i=group,
            group_size_i=group_size,
            stats_mode=stats_mode)

    @staticmethod
    def forward(self, input: torch.Tensor, running_mean: torch.Tensor,
                running_var: torch.Tensor, weight: torch.Tensor,
                bias: torch.Tensor, momentum: float, eps: float, group: int,
                group_size: int, stats_mode: str) -> torch.Tensor:
        """Normalize ``input`` with statistics synchronized over the group.

        Note that the first parameter, named ``self`` here, is the autograd
        context object.

        Args:
            input (torch.Tensor): Half or float tensor with at least 3 dims
                (it is flattened from dim 2 on).
            running_mean (torch.Tensor): Passed to the output kernel.
            running_var (torch.Tensor): Passed to the output kernel.
            weight (torch.Tensor): Passed to the output kernel and saved for
                backward.
            bias (torch.Tensor): Passed to the output kernel.
            momentum (float): Scaled by the update flag before it is handed
                to the output kernel.
            eps (float): Passed to the output kernel.
            group: Process group used for ``dist.all_reduce``.
            group_size (int): Size of ``group``; no reduction when <= 1.
            stats_mode (str): ``'default'`` or ``'N'``.

        Returns:
            torch.Tensor: Tensor with the same shape as ``input``.

        Raises:
            NotImplementedError: If ``stats_mode`` is neither ``'default'``
                nor ``'N'``.
        """
        # Stash the hyper-parameters on the context for ``backward``.
        self.momentum = momentum
        self.eps = eps
        self.group = group
        self.group_size = group_size
        self.stats_mode = stats_mode

        # The accepted tensor classes depend on the available backend.
        if is_cuda_available():
            assert isinstance(
                    input, (torch.HalfTensor, torch.FloatTensor,
                            torch.cuda.HalfTensor, torch.cuda.FloatTensor)), \
                f'only support Half or Float Tensor, but {input.type()}'
        elif is_musa_available():
            assert isinstance(
                    input, (torch.HalfTensor, torch.FloatTensor,
                            torch.musa.HalfTensor, torch.musa.FloatTensor)), \
                f'only support Half or Float Tensor, but {input.type()}'
        else:
            assert isinstance(
                    input, (torch.HalfTensor, torch.FloatTensor)), \
                f'only support Half or Float Tensor, but {input.type()}'
        output = torch.zeros_like(input)
        # 3D views (dim 0, channels, everything else flattened).
        # ``output3d`` is a view, so the kernel writes into ``output``.
        input3d = input.flatten(start_dim=2)
        output3d = output.view_as(input3d)
        num_channels = input3d.size(1)

        # ensure mean/var/norm/std are initialized as zeros
        # ``torch.empty()`` does not guarantee that
        mean = torch.zeros(
            num_channels, dtype=torch.float, device=input3d.device)
        var = torch.zeros(
            num_channels, dtype=torch.float, device=input3d.device)
        norm = torch.zeros_like(
            input3d, dtype=torch.float, device=input3d.device)
        std = torch.zeros(
            num_channels, dtype=torch.float, device=input3d.device)

        batch_size = input3d.size(0)
        if batch_size > 0:
            ext_module.sync_bn_forward_mean(input3d, mean)
            batch_flag = torch.ones([1], device=mean.device, dtype=mean.dtype)
        else:
            # skip updating mean and leave it as zeros when the input is empty
            batch_flag = torch.zeros([1], device=mean.device, dtype=mean.dtype)

        # synchronize mean and the batch flag
        vec = torch.cat([mean, batch_flag])
        if self.stats_mode == 'N':
            # Weight mean and flag by the local batch size, so the reduced
            # flag becomes the total batch size over all ranks.
            vec *= batch_size
        if self.group_size > 1:
            dist.all_reduce(vec, group=self.group)
        # Last entry: reduced flag (``'default'``) or total batch (``'N'``).
        total_batch = vec[-1].detach()
        mean = vec[:num_channels]

        if self.stats_mode == 'default':
            mean = mean / self.group_size
        elif self.stats_mode == 'N':
            # ``clamp(min=1)`` avoids a division by zero for empty batches.
            mean = mean / total_batch.clamp(min=1)
        else:
            raise NotImplementedError

        # leave var as zeros when the input is empty
        if batch_size > 0:
            ext_module.sync_bn_forward_var(input3d, mean, var)

        # Same weighting / reduction scheme as for the mean above.
        if self.stats_mode == 'N':
            var *= batch_size
        if self.group_size > 1:
            dist.all_reduce(var, group=self.group)

        if self.stats_mode == 'default':
            var /= self.group_size
        elif self.stats_mode == 'N':
            var /= total_batch.clamp(min=1)
        else:
            raise NotImplementedError

        # if the total batch size over all the ranks is zero,
        # we should not update the statistics in the current batch
        update_flag = total_batch.clamp(max=1)
        momentum = update_flag * self.momentum
        ext_module.sync_bn_forward_output(
            input3d,
            mean,
            var,
            weight,
            bias,
            running_mean,
            running_var,
            norm,
            std,
            output3d,
            eps=self.eps,
            momentum=momentum,
            group_size=self.group_size)
        self.save_for_backward(norm, std, weight)
        return output

    @staticmethod
    @once_differentiable
    def backward(self, grad_output: torch.Tensor) -> tuple:
        """Compute gradients for ``input``, ``weight`` and ``bias``.

        Returns:
            tuple: One entry per ``forward`` argument; every entry except
            those for ``input``, ``weight`` and ``bias`` is ``None``.
        """
        norm, std, weight = self.saved_tensors
        grad_weight = torch.zeros_like(weight)
        grad_bias = torch.zeros_like(weight)
        grad_input = torch.zeros_like(grad_output)
        grad_output3d = grad_output.flatten(start_dim=2)
        # View of ``grad_input``: the data kernel fills ``grad_input``.
        grad_input3d = grad_input.view_as(grad_output3d)

        batch_size = grad_input3d.size(0)
        if batch_size > 0:
            ext_module.sync_bn_backward_param(grad_output3d, norm, grad_weight,
                                              grad_bias)

        # all reduce
        if self.group_size > 1:
            dist.all_reduce(grad_weight, group=self.group)
            dist.all_reduce(grad_bias, group=self.group)
            # Average the summed parameter gradients over the group.
            grad_weight /= self.group_size
            grad_bias /= self.group_size

        if batch_size > 0:
            ext_module.sync_bn_backward_data(grad_output3d, weight,
                                             grad_weight, grad_bias, norm, std,
                                             grad_input3d)

        return grad_input, None, None, grad_weight, grad_bias, \
            None, None, None, None, None


@MODELS.register_module(name='MMSyncBN')
class SyncBatchNorm(Module):
    """Synchronized Batch Normalization.

    Args:
        num_features (int): number of features/channels in input tensor
        eps (float, optional): a value added to the denominator for numerical
            stability. Defaults to 1e-5.
        momentum (float, optional): the value used for the running_mean and
            running_var computation. Defaults to 0.1.
        affine (bool, optional): whether to use learnable affine parameters.
            Defaults to True.
        track_running_stats (bool, optional): whether to track the running
            mean and variance during training. When set to False, this
            module does not track such statistics, and initializes statistics
            buffers ``running_mean`` and ``running_var`` as ``None``. When
            these buffers are ``None``, this module always uses batch
            statistics in both training and eval modes. Defaults to True.
        group (int, optional): synchronization of stats happens within
            each process group individually. By default it is synchronized
            across the whole world. Defaults to None.
        stats_mode (str, optional): The statistical mode. Available options
            are ``'default'`` and ``'N'``. Defaults to 'default'.
            When ``stats_mode=='default'``, the overall statistics are
            computed from those of each worker with equal weight, i.e., the
            statistics are synchronized and simply divided by the group
            size. This mode produces inaccurate statistics when empty
            tensors occur.
            When ``stats_mode=='N'``, the overall statistics are computed
            from the total number of batches over all workers, ignoring the
            group size, i.e., the statistics are synchronized and then
            divided by the total batch ``N``. This mode is beneficial when
            empty tensors occur during training, as it averages the total
            mean by the real number of batches.
    """

    def __init__(self,
                 num_features: int,
                 eps: float = 1e-5,
                 momentum: float = 0.1,
                 affine: bool = True,
                 track_running_stats: bool = True,
                 group: Optional[int] = None,
                 stats_mode: str = 'default'):
        super().__init__()
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum
        self.affine = affine
        self.track_running_stats = track_running_stats

        # Fall back to the world group when none is given.
        if group is None:
            group = dist.group.WORLD
        self.group = group
        self.group_size = dist.get_world_size(group)

        assert stats_mode in ['default', 'N'], \
            f'"stats_mode" only accepts "default" and "N", got "{stats_mode}"'
        self.stats_mode = stats_mode

        # Affine parameters are registered as ``None`` when disabled.
        for param_name in ('weight', 'bias'):
            if self.affine:
                param = Parameter(torch.Tensor(num_features))
            else:
                param = None
            self.register_parameter(param_name, param)

        # Running statistics are registered as ``None`` when not tracked.
        if self.track_running_stats:
            stat_buffers = (
                ('running_mean', torch.zeros(num_features)),
                ('running_var', torch.ones(num_features)),
                ('num_batches_tracked', torch.tensor(0, dtype=torch.long)),
            )
        else:
            stat_buffers = (
                ('running_mean', None),
                ('running_var', None),
                ('num_batches_tracked', None),
            )
        for buffer_name, buffer_value in stat_buffers:
            self.register_buffer(buffer_name, buffer_value)

        self.reset_parameters()

    def reset_running_stats(self):
        """Reset the running statistics (no-op when they are not tracked)."""
        if not self.track_running_stats:
            return
        self.running_mean.zero_()
        self.running_var.fill_(1)
        self.num_batches_tracked.zero_()

    def reset_parameters(self):
        """Reset the running statistics and the affine parameters."""
        self.reset_running_stats()
        if not self.affine:
            return
        self.weight.data.uniform_()  # pytorch use ones_()
        self.bias.data.zero_()

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Apply batch normalization to ``input``.

        ``SyncBatchNormFunction`` is used in training mode or when running
        statistics are not tracked; otherwise ``F.batch_norm`` is applied
        with the stored running statistics.

        Raises:
            ValueError: If ``input`` has fewer than 2 dimensions.
        """
        if input.dim() < 2:
            raise ValueError(
                f'expected at least 2D input, got {input.dim()}D input')

        cumulative = self.momentum is None
        average_factor = 0.0 if cumulative else self.momentum

        updating_stats = self.training and self.track_running_stats
        if updating_stats and self.num_batches_tracked is not None:
            self.num_batches_tracked += 1
            if cumulative:
                # Cumulative moving average; with a momentum the factor
                # already holds the exponential-moving-average value.
                average_factor = 1.0 / float(self.num_batches_tracked)

        if not self.training and self.track_running_stats:
            return F.batch_norm(input, self.running_mean, self.running_var,
                                self.weight, self.bias, False, average_factor,
                                self.eps)
        return SyncBatchNormFunction.apply(input, self.running_mean,
                                           self.running_var, self.weight,
                                           self.bias, average_factor,
                                           self.eps, self.group,
                                           self.group_size, self.stats_mode)

    def __repr__(self):
        return (f'{self.__class__.__name__}'
                f'({self.num_features}, '
                f'eps={self.eps}, '
                f'momentum={self.momentum}, '
                f'affine={self.affine}, '
                f'track_running_stats={self.track_running_stats}, '
                f'group_size={self.group_size},'
                f'stats_mode={self.stats_mode})')


================================================
FILE: mmcv/ops/three_interpolate.py
================================================
from typing import Any, Tuple

import torch
from torch.autograd import Function

from ..utils import ext_loader

# Handle to the compiled ``_ext`` extension; the list names the two ops used
# by ``ThreeInterpolate`` (presumably checked by ``load_ext`` -- confirm in
# ``ext_loader``).
ext_module = ext_loader.load_ext(
    '_ext', ['three_interpolate_forward', 'three_interpolate_backward'])


class ThreeInterpolate(Function):
    """Performs weighted linear interpolation on 3 features.

    Please refer to `Paper of PointNet++ <https://arxiv.org/abs/1706.02413>`_
    for more details.
    """

    @staticmethod
    def forward(ctx: Any, features: torch.Tensor, indices: torch.Tensor,
                weight: torch.Tensor) -> torch.Tensor:
        """
        Args:
            features (torch.Tensor): (B, C, M) Features descriptors to be
                interpolated.
            indices (torch.Tensor): (B, n, 3) indices of three nearest
                neighbor features for the target features.
            weight (torch.Tensor): (B, n, 3) weights of three nearest
                neighbor features for the target features.

        Returns:
            torch.Tensor: (B, C, n) tensor of the interpolated features
        """
        # Every input is required to be contiguous.
        for tensor in (features, indices, weight):
            assert tensor.is_contiguous()

        batch, channels, num_src = features.size()
        num_dst = indices.size(1)
        # Kept on the context so ``backward`` can size the gradient.
        ctx.three_interpolate_for_backward = (indices, weight, num_src)

        interpolated = features.new_empty(batch, channels, num_dst)
        ext_module.three_interpolate_forward(
            features,
            indices,
            weight,
            interpolated,
            b=batch,
            c=channels,
            m=num_src,
            n=num_dst)
        return interpolated

    @staticmethod
    def backward(
        ctx, grad_out: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Args:
            grad_out (torch.Tensor): (B, C, N) tensor with gradients of outputs

        Returns:
            torch.Tensor: (B, C, M) tensor with gradients of features; no
            gradient is returned for ``indices`` and ``weight``.
        """
        saved_indices, saved_weight, num_src = \
            ctx.three_interpolate_for_backward
        batch, channels, num_dst = grad_out.size()

        grad_features = grad_out.new_zeros(batch, channels, num_src)
        contiguous_grad = grad_out.data.contiguous()

        ext_module.three_interpolate_backward(
            contiguous_grad,
            saved_indices,
            saved_weight,
            grad_features.data,
            b=batch,
            c=channels,
            n=num_dst,
            m=num_src)
        return grad_features, None, None


three_interpolate = ThreeInterpolate.apply


================================================
FILE: mmcv/ops/three_nn.py
================================================
from typing import Any, Tuple

import torch
from torch.autograd import Function

from ..utils import ext_loader

# Handle to the compiled ``_ext`` extension; only ``three_nn_forward`` is
# requested (presumably checked by ``load_ext`` -- confirm in ``ext_loader``).
ext_module = ext_loader.load_ext('_ext', ['three_nn_forward'])


class ThreeNN(Function):
    """Find the top-3 nearest neighbors of the target set from the source set.

    Please refer to `Paper of PointNet++ <https://arxiv.org/abs/1706.02413>`_
    for more details.
    """

    @staticmethod
    def forward(ctx: Any, target: torch.Tensor,
                source: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            target (torch.Tensor): shape (B, N, 3), points set that needs to
                find the nearest neighbors.
            source (torch.Tensor): shape (B, M, 3), points set that is used
                to find the nearest neighbors of points in target set.

        Returns:
            tuple[torch.Tensor, torch.Tensor]: The (B, N, 3) L2 distances of
            each target point to its top three nearest neighbors, and the
            (B, N, 3) int32 tensor filled by the op alongside them.
        """
        target = target.contiguous()
        source = source.contiguous()

        batch, num_target, _ = target.size()
        num_source = source.size(1)

        if source.device.type == 'npu':
            # strict to fp32
            source = source.transpose(2, 1).contiguous()
            original_dtype = source.dtype
            was_half = original_dtype == torch.float16
            if was_half:
                target = target.float()
                source = source.float()
            sq_dist = target.new_empty(batch, num_target, 3)
            neighbor_idx = target.new_empty(
                batch, num_target, 3, dtype=torch.int32)
            ext_module.three_nn_forward(
                target,
                source,
                sq_dist,
                neighbor_idx,
                b=batch,
                n=num_target,
                m=num_source)
            distances = torch.sqrt(sq_dist)
            if was_half:
                # Return distances in the caller's original precision.
                distances = distances.half()
            return distances, neighbor_idx.int()

        sq_dist = target.new_empty(batch, num_target, 3)
        neighbor_idx = target.new_empty(
            batch, num_target, 3, dtype=torch.int32)
        ext_module.three_nn_forward(
            target,
            source,
            sq_dist,
            neighbor_idx,
            b=batch,
            n=num_target,
            m=num_source)
        if torch.__version__ != 'parrots':
            ctx.mark_non_differentiable(neighbor_idx)

        return torch.sqrt(sq_dist), neighbor_idx

    @staticmethod
    def backward(ctx, a=None, b=None):
        """Return no gradient for either input."""
        return None, None


three_nn = ThreeNN.apply


================================================
FILE: mmcv/ops/tin_shift.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
# Code reference from "Temporal Interlacing Network"
# https://github.com/deepcs233/TIN/blob/master/cuda_shift/rtc_wrap.py
# Hao Shao, Shengju Qian, Yu Liu
# shaoh19@mails.tsinghua.edu.cn, sjqian@cse.cuhk.edu.hk, yuliu@ee.cuhk.edu.hk

import torch
import torch.nn as nn
from torch.autograd import Function

from ..utils import ext_loader

# Handle to the compiled ``_ext`` extension; the list names the two ops used
# by ``TINShiftFunction`` (presumably checked by ``load_ext`` -- confirm in
# ``ext_loader``).
ext_module = ext_loader.load_ext('_ext',
                                 ['tin_shift_forward', 'tin_shift_backward'])


class TINShiftFunction(Function):
    """Autograd function wrapping the ``tin_shift`` extension ops."""

    @staticmethod
    def forward(ctx, input, shift):
        """Validate the shapes and run ``tin_shift_forward``.

        Raises:
            ValueError: If the first dims of ``input`` and ``shift`` differ,
                or if ``input.size(2)`` is not a positive multiple of
                ``shift.size(1)``.
        """
        batch_in, batch_shift = input.size(0), shift.size(0)
        if batch_in != batch_shift:
            raise ValueError(
                'The first dim (batch) of `input` and `shift` should be '
                f'same, but got {input.size(0)} and {shift.size(0)}.')

        C = input.size(2)
        num_segments = shift.size(1)
        per_segment, leftover = C // num_segments, C % num_segments
        if per_segment <= 0 or leftover != 0:
            raise ValueError('C should be a multiple of num_segments, '
                             f'but got C={C} and num_segments={num_segments}.')

        ctx.save_for_backward(shift)

        shifted = torch.zeros_like(input)
        ext_module.tin_shift_forward(input, shift, shifted)
        return shifted

    @staticmethod
    def backward(ctx, grad_output):
        """Run ``tin_shift_backward``; the gradient for ``shift`` is zeros."""
        (shift, ) = ctx.saved_tensors
        grad_data = grad_output.new_zeros(grad_output.size())
        grad_shift = shift.new_zeros(shift.size())
        ext_module.tin_shift_backward(grad_output, shift, grad_data)
        return grad_data, grad_shift


tin_shift = TINShiftFunction.apply


class TINShift(nn.Module):
    """Temporal Interlace Shift.

    Temporal Interlace shift is a differentiable temporal-wise frame shifting
    which is proposed in "Temporal Interlacing Network".

    Please refer to
    `Temporal Interlacing Network <https://arxiv.org/abs/2001.06499>`_
    for more details.

    Code is modified from
    https://github.com/mit-han-lab/temporal-shift-module
    """

    def forward(self, input, shift):
        """Perform temporal interlace shift.

        Args:
            input (torch.Tensor): Feature map with shape
                [N, num_segments, C, H * W].
            shift (torch.Tensor): Shift tensor with shape [N, num_segments].

        Returns:
            Feature map after temporal interlace shift.
        """
        # The module holds no state; it only forwards to ``tin_shift``.
        return tin_shift(input, shift)


================================================
FILE: mmcv/ops/upfirdn2d.py
================================================
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

# source: https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/upfirdn2d.py # noqa
"""Custom PyTorch ops for efficient resampling of 2D images."""
from typing import Dict, List, Union

import torch

from ..utils import ext_loader
from .conv2d_gradfix import conv2d

# Load the compiled ``_ext`` extension, requesting the ``upfirdn2d`` op.
ext_module = ext_loader.load_ext('_ext', ['upfirdn2d'])


def _parse_scaling(scaling):
    """Parse scaling into list [x, y]"""
    if isinstance(scaling, int):
        scaling = [scaling, scaling]
    assert isinstance(scaling, (list, tuple))
    assert all(isinstance(x, int) for x in scaling)
    sx, sy = scaling
    assert sx >= 1 and sy >= 1
    return sx, sy


def _parse_padding(padding):
    """Parse padding into list [padx0, padx1, pady0, pady1]"""
    if isinstance(padding, int):
        padding = [padding, padding]
    assert isinstance(padding, (list, tuple))
    assert all(isinstance(x, int) for x in padding)
    if len(padding) == 2:
        padx, pady = padding
        padding = [padx, padx, pady, pady]
    padx0, padx1, pady0, pady1 = padding
    return padx0, padx1, pady0, pady1


def _get_filter_size(filter):
    """Get width and height of filter kernel."""
    if filter is None:
        return 1, 1
    assert isinstance(filter, torch.Tensor) and filter.ndim in [1, 2]
    fw = filter.shape[-1]
    fh = filter.shape[0]
    fw = int(fw)
    fh = int(fh)
    assert fw >= 1 and fh >= 1
    return fw, fh


def upfirdn2d(input: torch.Tensor,
              filter: torch.Tensor,
              up: int = 1,
              down: int = 1,
              padding: Union[int, List[int]] = 0,
              flip_filter: bool = False,
              gain: Union[float, int] = 1,
              use_custom_op: bool = True):
    """Pad, upsample, filter, and downsample a batch of 2D images.

    For every channel the following steps are performed in order:

    1. Upsample the image by inserting N-1 zeros after each pixel (`up`).

    2. Pad the image with the specified number of zeros on each side
    (`padding`). Negative padding corresponds to cropping the image.

    3. Convolve the image with the specified 2D FIR filter (`filter`),
    shrinking it so that the footprint of all output pixels lies within
    the input image.

    4. Downsample the image by keeping every Nth pixel (`down`).

    This sequence of operations bears close resemblance to
        scipy.signal.upfirdn().

    The fused op is considerably more efficient than performing the same
    calculation using standard PyTorch ops. It supports gradients of arbitrary
    order.

    The custom op is only used when ``use_custom_op`` is True and the input
    lives on a ``cuda`` or ``musa`` device; otherwise the reference
    implementation built from standard PyTorch ops is used.

    Args:
        input (torch.Tensor): Float32/float64/float16 input tensor of the shape
            `[batch_size, num_channels, in_height, in_width]`.
        filter (torch.Tensor): Float32 FIR filter of the shape `[filter_height,
            filter_width]` (non-separable), `[filter_taps]` (separable), or
            `None` (identity).
        up (int): Integer upsampling factor. Can be a single int or a
            list/tuple `[x, y]`. Defaults to 1.
        down (int): Integer downsampling factor. Can be a single int
            or a list/tuple `[x, y]`. Defaults to 1.
        padding (int | tuple[int]): Padding with respect to the upsampled
            image. Can be a single number or a list/tuple `[x, y]` or
            `[x_before, x_after, y_before, y_after]`. Defaults to 0.
        flip_filter (bool): False = convolution, True = correlation.
            Defaults to False.
        gain (int): Overall scaling factor for signal magnitude.
            Defaults to 1.
        use_custom_op (bool): Whether to use customized op.
            Defaults to True.

    Returns:
        Tensor of the shape `[batch_size, num_channels, out_height, out_width]`
    """
    assert isinstance(input, torch.Tensor)

    # Every backend takes the same operator configuration.
    op_config = dict(
        up=up,
        down=down,
        padding=padding,
        flip_filter=flip_filter,
        gain=gain)

    if use_custom_op:
        device_type = input.device.type
        if device_type == 'cuda':
            return _upfirdn2d_cuda(**op_config).apply(input, filter)
        if device_type == 'musa':
            return _upfirdn2d_musa(**op_config).apply(input, filter)

    return _upfirdn2d_ref(input, filter, **op_config)


def _upfirdn2d_ref(input: torch.Tensor,
                   filter: torch.Tensor,
                   up: int = 1,
                   down: int = 1,
                   padding: Union[int, List[int]] = 0,
                   flip_filter: bool = False,
                   gain: Union[float, int] = 1):
    """Slow reference implementation of `upfirdn2d()` using standard PyTorch
    ops.

    The steps are: validate arguments, upsample by zero insertion, pad or
    crop, convolve each channel with the filter, then downsample by strided
    slicing.

    Args:
        input (torch.Tensor): Float32/float64/float16 input tensor of the shape
            `[batch_size, num_channels, in_height, in_width]`.
        filter (torch.Tensor): Float32 FIR filter of the shape `[filter_height,
            filter_width]` (non-separable), `[filter_taps]` (separable), or
            `None` (identity). Must not require gradients.
        up (int): Integer upsampling factor. Can be a single int or a
            list/tuple `[x, y]`. Defaults to 1.
        down (int): Integer downsampling factor. Can be a single int
            or a list/tuple `[x, y]`. Defaults to 1.
        padding (int | tuple[int]): Padding with respect to the upsampled
            image. Can be a single number or a list/tuple `[x, y]` or
            `[x_before, x_after, y_before, y_after]`. Defaults to 0.
        flip_filter (bool): False = convolution, True = correlation.
            Defaults to False.
        gain (int): Overall scaling factor for signal magnitude.
            Defaults to 1.

    Returns:
        torch.Tensor: Tensor of the shape `[batch_size, num_channels,
            out_height, out_width]`.
    """
    # Validate arguments.
    assert isinstance(input, torch.Tensor) and input.ndim == 4
    if filter is None:
        # A missing filter is replaced by a 1x1 kernel of ones.
        filter = torch.ones([1, 1], dtype=torch.float32, device=input.device)
    assert isinstance(filter, torch.Tensor) and filter.ndim in [1, 2]
    assert filter.dtype == torch.float32 and not filter.requires_grad
    batch_size, num_channels, in_height, in_width = input.shape
    upx, upy = _parse_scaling(up)
    downx, downy = _parse_scaling(down)
    padx0, padx1, pady0, pady1 = _parse_padding(padding)

    # Check that upsampled buffer is not smaller than the filter.
    upW = in_width * upx + padx0 + padx1
    upH = in_height * upy + pady0 + pady1
    assert upW >= filter.shape[-1] and upH >= filter.shape[0]

    # Upsample by inserting zeros.
    # A singleton axis is added after each spatial axis, padded with
    # ``up - 1`` trailing zeros, and then merged back into the spatial axis,
    # which interleaves the zeros after every pixel.
    x = input.reshape([batch_size, num_channels, in_height, 1, in_width, 1])
    x = torch.nn.functional.pad(x, [0, upx - 1, 0, 0, 0, upy - 1])
    x = x.reshape([batch_size, num_channels, in_height * upy, in_width * upx])

    # Pad or crop.
    # Positive amounts are applied as zero padding; negative amounts are
    # applied afterwards as crops via slicing.
    x = torch.nn.functional.pad(
        x, [max(padx0, 0),
            max(padx1, 0),
            max(pady0, 0),
            max(pady1, 0)])
    x = x[:, :,
          max(-pady0, 0):x.shape[2] - max(-pady1, 0),
          max(-padx0, 0):x.shape[3] - max(-padx1, 0)]

    # Setup filter.
    # A 1D filter is applied twice below (once per axis), so each pass
    # carries gain**0.5; a 2D filter carries the full gain in one pass.
    filter = filter * (gain**(filter.ndim / 2))
    filter = filter.to(x.dtype)
    if not flip_filter:
        # NOTE(review): presumably ``conv2d`` computes a correlation like
        # torch's conv2d, so the kernel is flipped to obtain a true
        # convolution -- confirm against ``conv2d_gradfix``.
        filter = filter.flip(list(range(filter.ndim)))

    # Convolve with the filter.
    # The kernel is repeated once per channel and applied with
    # ``groups=num_channels`` so every channel is filtered independently.
    filter = filter[None, None].repeat([num_channels, 1] + [1] * filter.ndim)
    if filter.ndim == 4:
        x = conv2d(input=x, weight=filter, groups=num_channels)
    else:
        # Separable case: one pass with the taps laid out along the width
        # axis, then one pass with them laid out along the height axis.
        x = conv2d(input=x, weight=filter.unsqueeze(2), groups=num_channels)
        x = conv2d(input=x, weight=filter.unsqueeze(3), groups=num_channels)

    # Downsample by throwing away pixels.
    x = x[:, :, ::downy, ::downx]
    return x


# Cache of the autograd Function classes built by ``_upfirdn2d_cuda``, keyed
# by the parsed (up, down, padding, flip_filter, gain) configuration.
_upfirdn2d_cuda_cache: Dict = dict()


def _upfirdn2d_cuda(up: int = 1,
                    down: int = 1,
                    padding: Union[int, List[int]] = 0,
                    flip_filter: bool = False,
                    gain: Union[float, int] = 1):
    """Fast CUDA implementation of `upfirdn2d()` using custom ops.

    This is a factory: it builds (or fetches from the module-level cache) a
    ``torch.autograd.Function`` subclass specialised for the given
    configuration. Callers run the op with
    ``_upfirdn2d_cuda(...).apply(x, f)``.

    Args:
        up (int): Integer upsampling factor. Can be a single int or a
            list/tuple `[x, y]`. Defaults to 1.
        down (int): Integer downsampling factor. Can be a single int
            or a list/tuple `[x, y]`. Defaults to 1.
        padding (int | tuple[int]): Padding with respect to the upsampled
            image. Can be a single number or a list/tuple `[x, y]` or
            `[x_before, x_after, y_before, y_after]`. Defaults to 0.
        flip_filter (bool): False = convolution, True = correlation.
            Defaults to False.
        gain (int): Overall scaling factor for signal magnitude.
            Defaults to 1.

    Returns:
        type: The ``torch.autograd.Function`` subclass for this
        configuration. Its ``apply(x, f)`` takes a 4D input tensor and a
        filter (or ``None``) and returns the result of the extension op.
    """
    # Parse arguments.
    upx, upy = _parse_scaling(up)
    downx, downy = _parse_scaling(down)
    padx0, padx1, pady0, pady1 = _parse_padding(padding)

    # Lookup from cache.
    key = (upx, upy, downx, downy, padx0, padx1, pady0, pady1, flip_filter,
           gain)
    if key in _upfirdn2d_cuda_cache:
        return _upfirdn2d_cuda_cache[key]

    # Forward op.
    class Upfirdn2dCuda(torch.autograd.Function):

        @staticmethod
        def forward(ctx, x, f):  # pylint: disable=arguments-differ
            assert isinstance(x, torch.Tensor) and x.ndim == 4
            if f is None:
                # A missing filter is replaced by a 1x1 kernel of ones.
                f = torch.ones([1, 1], dtype=torch.float32, device=x.device)
            if f.ndim == 1 and f.shape[0] == 1:
                f = f.square().unsqueeze(
                    0)  # Convert separable-1 into full-1x1.
            assert isinstance(f, torch.Tensor) and f.ndim in [1, 2]
            y = x
            if f.ndim == 2:
                # Non-separable filter: a single fused call.
                y = ext_module.upfirdn2d(y, f, upx, upy, downx, downy, padx0,
                                         padx1, pady0, pady1, flip_filter,
                                         gain)
            else:
                # Separable filter: one pass with only the x
                # scaling/padding, then one pass with only the y
                # scaling/padding. The gain is applied in the second pass
                # only (the first pass uses 1.0).
                y = ext_module.upfirdn2d(y, f.unsqueeze(0), upx, 1, downx, 1,
                                         padx0, padx1, 0, 0, flip_filter, 1.0)
                y = ext_module.upfirdn2d(y, f.unsqueeze(1), 1, upy, 1, downy,
                                         0, 0, pady0, pady1, flip_filter, gain)
            # The filter and the input shape are what backward needs.
            ctx.save_for_backward(f)
            ctx.x_shape = x.shape
            return y

        @staticmethod
        def backward(ctx, dy):  # pylint: disable=arguments-differ
            f, = ctx.saved_tensors
            _, _, ih, iw = ctx.x_shape
            _, _, oh, ow = dy.shape
            fw, fh = _get_filter_size(f)
            # Padding for the gradient op, derived from the filter size, the
            # forward padding and the input/output sizes.
            p = [
                fw - padx0 - 1,
                iw * upx - ow * downx + padx0 - upx + 1,
                fh - pady0 - 1,
                ih * upy - oh * downy + pady0 - upy + 1,
            ]
            dx = None
            df = None

            if ctx.needs_input_grad[0]:
                # The input gradient is the same op with up/down swapped and
                # the filter flip inverted; going through the factory again
                # keeps the gradient itself differentiable.
                dx = _upfirdn2d_cuda(
                    up=down,
                    down=up,
                    padding=p,
                    flip_filter=(not flip_filter),
                    gain=gain).apply(dy, f)

            # Gradients with respect to the filter are not supported.
            assert not ctx.needs_input_grad[1]
            return dx, df

    # Add to cache.
    _upfirdn2d_cuda_cache[key] = Upfirdn2dCuda
    return Upfirdn2dCuda


# Cache of the autograd Function classes built by ``_upfirdn2d_musa``, keyed
# by the parsed (up, down, padding, flip_filter, gain) configuration.
_upfirdn2d_musa_cache: Dict = dict()


def _upfirdn2d_musa(up: int = 1,
                    down: int = 1,
                    padding: Union[int, List[int]] = 0,
                    flip_filter: bool = False,
                    gain: Union[float, int] = 1):
    """Fast MUSA implementation of `upfirdn2d()` using custom ops.

    This is a factory: it builds (or fetches from the module-level cache) a
    ``torch.autograd.Function`` subclass specialised for the given
    configuration. Callers run the op with
    ``_upfirdn2d_musa(...).apply(x, f)``.

    Args:
        up (int): Integer upsampling factor. Can be a single int or a
            list/tuple `[x, y]`. Defaults to 1.
        down (int): Integer downsampling factor. Can be a single int
            or a list/tuple `[x, y]`. Defaults to 1.
        padding (int | tuple[int]): Padding with respect to the upsampled
            image. Can be a single number or a list/tuple `[x, y]` or
            `[x_before, x_after, y_before, y_after]`. Defaults to 0.
        flip_filter (bool): False = convolution, True = correlation.
            Defaults to False.
        gain (int): Overall scaling factor for signal magnitude.
            Defaults to 1.

    Returns:
        type: The ``torch.autograd.Function`` subclass for this
        configuration. Its ``apply(x, f)`` takes a 4D input tensor and a
        filter (or ``None``) and returns the result of the extension op.
    """
    # Parse arguments.
    upx, upy = _parse_scaling(up)
    downx, downy = _parse_scaling(down)
    padx0, padx1, pady0, pady1 = _parse_padding(padding)

    # Lookup from cache.
    key = (upx, upy, downx, downy, padx0, padx1, pady0, pady1, flip_filter,
           gain)
    if key in _upfirdn2d_musa_cache:
        return _upfirdn2d_musa_cache[key]

    # Forward op.
    class Upfirdn2dMusa(torch.autograd.Function):

        @staticmethod
        def forward(ctx, x, f):  # pylint: disable=arguments-differ
            assert isinstance(x, torch.Tensor) and x.ndim == 4
            if f is None:
                # A missing filter is replaced by a 1x1 kernel of ones.
                f = torch.ones([1, 1], dtype=torch.float32, device=x.device)
            if f.ndim == 1 and f.shape[0] == 1:
                f = f.square().unsqueeze(
                    0)  # Convert separable-1 into full-1x1.
            assert isinstance(f, torch.Tensor) and f.ndim in [1, 2]
            y = x
            if f.ndim == 2:
                # Non-separable filter: a single fused call.
                y = ext_module.upfirdn2d(y, f, upx, upy, downx, downy, padx0,
                                         padx1, pady0, pady1, flip_filter,
                                         gain)
            else:
                # Separable filter: one pass with only the x
                # scaling/padding, then one pass with only the y
                # scaling/padding. The gain is applied in the second pass
                # only (the first pass uses 1.0).
                y = ext_module.upfirdn2d(y, f.unsqueeze(0), upx, 1, downx, 1,
                                         padx0, padx1, 0, 0, flip_filter, 1.0)
                y = ext_module.upfirdn2d(y, f.unsqueeze(1), 1, upy, 1, downy,
                                         0, 0, pady0, pady1, flip_filter, gain)
            # The filter and the input shape are what backward needs.
            ctx.save_for_backward(f)
            ctx.x_shape = x.shape
            return y

        @staticmethod
        def backward(ctx, dy):  # pylint: disable=arguments-differ
            f, = ctx.saved_tensors
            _, _, ih, iw = ctx.x_shape
            _, _, oh, ow = dy.shape
            fw, fh = _get_filter_size(f)
            # Padding for the gradient op, derived from the filter size, the
            # forward padding and the input/output sizes.
            p = [
                fw - padx0 - 1,
                iw * upx - ow * downx + padx0 - upx + 1,
                fh - pady0 - 1,
                ih * upy - oh * downy + pady0 - upy + 1,
            ]
            dx = None
            df = None

            if ctx.needs_input_grad[0]:
                # The input gradient is the same op with up/down swapped and
                # the filter flip inverted; going through the factory again
                # keeps the gradient itself differentiable.
                dx = _upfirdn2d_musa(
                    up=down,
                    down=up,
                    padding=p,
                    flip_filter=(not flip_filter),
                    gain=gain).apply(dy, f)

            # Gradients with respect to the filter are not supported.
            assert not ctx.needs_input_grad[1]
            return dx, df

    # Add to cache.
    _upfirdn2d_musa_cache[key] = Upfirdn2dMusa
    return Upfirdn2dMusa


def filter2d(input: torch.Tensor,
             filter: torch.Tensor,
             padding: Union[int, List[int]] = 0,
             flip_filter: bool = False,
             gain: Union[float, int] = 1,
             use_custom_op: bool = True):
    """Apply a 2D FIR filter to a batch of 2D images.

    By default, the result is padded so that its shape matches the input.
    User-specified padding is applied on top of that, with negative values
    indicating cropping. Pixels outside the image are assumed to be zero.

    Args:
        input (torch.Tensor): Float32/float64/float16 input tensor of the shape
            `[batch_size, num_channels, in_height, in_width]`.
        filter (torch.Tensor): Float32 FIR filter of the shape `[filter_height,
            filter_width]` (non-separable), `[filter_taps]` (separable), or
            `None`.
        padding (int | tuple[int]): Padding with respect to the output.
            Can be a single number or a list/tuple `[x, y]` or `[x_before,
            x_after, y_before, y_after]`. Defaults to 0.
        flip_filter (bool): False = convolution, True = correlation.
            Defaults to False.
        gain (int): Overall scaling factor for signal magnitude.
            Defaults to 1.
        use_custom_op (bool): Whether to use customized op.
            Defaults to True.

    Returns:
        Tensor of the shape `[batch_size, num_channels, out_height,
        out_width]`.
    """
    left, right, top, bottom = _parse_padding(padding)
    kernel_w, kernel_h = _get_filter_size(filter)

    # Add the padding that compensates for the filter footprint on top of
    # the user-supplied amounts.
    total_padding = [
        left + kernel_w // 2,
        right + (kernel_w - 1) // 2,
        top + kernel_h // 2,
        bottom + (kernel_h - 1) // 2,
    ]

    return upfirdn2d(
        input,
        filter,
        padding=total_padding,
        flip_filter=flip_filter,
        gain=gain,
        use_custom_op=use_custom_op)


def upsample2d(input: torch.Tensor,
               filter: torch.Tensor,
               up: int = 2,
               padding: Union[int, List[int]] = 0,
               flip_filter: bool = False,
               gain: Union[float, int] = 1,
               use_custom_op: bool = True):
    """Upsample a batch of 2D images with the given 2D FIR filter.

    By default, the result is padded so that its shape is a multiple of the
    input.
    User-specified padding is applied on top of that, with negative values
    indicating cropping. Pixels outside the image are assumed to be zero.

    Args:
        input (torch.Tensor): Float32/float64/float16 input tensor of the shape
            `[batch_size, num_channels, in_height, in_width]`.
        filter (torch.Tensor): Float32 FIR filter of the shape `[filter_height,
            filter_width]` (non-separable), `[filter_taps]` (separable), or
            `None` (identity).
        up (int): Integer upsampling factor. Can be a single int or a
            list/tuple `[x, y]`. Defaults to 2.
        padding (int | tuple[int]): Padding with respect to the output.
            Can be a single number or a list/tuple `[x, y]` or `[x_before,
            x_after, y_before, y_after]`. Defaults to 0.
        flip_filter (bool): False = convolution, True = correlation. Defaults
            to False.
        gain (int): Overall scaling factor for signal magnitude. Defaults to 1.
        use_custom_op (bool): Whether to use customized op.
            Defaults to True.

    Returns:
        torch.Tensor: Tensor of the shape `[batch_size, num_channels,
        out_height, out_width]`
    """
    upx, upy = _parse_scaling(up)
    left, right, top, bottom = _parse_padding(padding)
    kernel_w, kernel_h = _get_filter_size(filter)

    # Filter/upsampling-dependent padding added on top of the user-supplied
    # amounts.
    total_padding = [
        left + (kernel_w + upx - 1) // 2,
        right + (kernel_w - upx) // 2,
        top + (kernel_h + upy - 1) // 2,
        bottom + (kernel_h - upy) // 2,
    ]
    # The gain is multiplied by the total upsampling factor (presumably to
    # offset the magnitude lost to zero insertion).
    scaled_gain = gain * upx * upy

    return upfirdn2d(
        input,
        filter,
        up=up,
        padding=total_padding,
        flip_filter=flip_filter,
        gain=scaled_gain,
        use_custom_op=use_custom_op)


def downsample2d(input: torch.Tensor,
                 filter: torch.Tensor,
                 down: int = 2,
                 padding: Union[int, List[int]] = 0,
                 flip_filter: bool = False,
                 gain: Union[float, int] = 1,
                 use_custom_op: bool = True):
    """Downsample a batch of 2D images with the given 2D FIR filter.

    By default, the result is padded so that its shape is a fraction of the
    input.
    User-specified padding is applied on top of that, with negative values
    indicating cropping. Pixels outside the image are assumed to be zero.

    Args:
        input (torch.Tensor): Float32/float64/float16 input tensor of the shape
            `[batch_size, num_channels, in_height, in_width]`.
        filter (torch.Tensor): Float32 FIR filter of the shape `[filter_height,
            filter_width]` (non-separable), `[filter_taps]` (separable), or
            `None` (identity).
        down (int): Integer downsampling factor. Can be a single int or a
            list/tuple `[x, y]`. Defaults to 2.
        padding (int | tuple[int]): Padding with respect to the input.
            Can be a single number or a list/tuple `[x, y]` or `[x_before,
            x_after, y_before, y_after]`. Defaults to 0.
        flip_filter (bool): False = convolution, True = correlation. Defaults
            to False.
        gain (int): Overall scaling factor for signal magnitude. Defaults to 1.
        use_custom_op (bool): Whether to use customized op.
            Defaults to True.

    Returns:
        torch.Tensor: Tensor of the shape `[batch_size, num_channels,
        out_height, out_width]`.
    """
    downx, downy = _parse_scaling(down)
    left, right, top, bottom = _parse_padding(padding)
    kernel_w, kernel_h = _get_filter_size(filter)

    # Filter/downsampling-dependent padding added on top of the
    # user-supplied amounts.
    total_padding = [
        left + (kernel_w - downx + 1) // 2,
        right + (kernel_w - downx) // 2,
        top + (kernel_h - downy + 1) // 2,
        bottom + (kernel_h - downy) // 2,
    ]

    return upfirdn2d(
        input,
        filter,
        down=down,
        padding=total_padding,
        flip_filter=flip_filter,
        gain=gain,
        use_custom_op=use_custom_op)


================================================
FILE: mmcv/ops/voxelize.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Any, List, Tuple, Union

import torch
from torch import nn
from torch.autograd import Function
from torch.nn.modules.utils import _pair

from ..utils import ext_loader

# Load the compiled ``_ext`` extension, requesting the two voxelization
# kernels used below.
ext_module = ext_loader.load_ext(
    '_ext', ['dynamic_voxelize_forward', 'hard_voxelize_forward'])


class _Voxelization(Function):
    """Autograd ``Function`` wrapping the voxelization extension kernels."""

    @staticmethod
    def forward(
            ctx: Any,
            points: torch.Tensor,
            voxel_size: Union[tuple, float],
            coors_range: Union[tuple, float],
            max_points: int = 35,
            max_voxels: int = 20000,
            deterministic: bool = True) -> Union[Tuple[torch.Tensor], Tuple]:
        """Convert kitti points(N, >=3) to voxels.

        Dynamic voxelization is used when ``max_points`` or ``max_voxels``
        is -1; otherwise hard voxelization is used.

        Args:
            points (torch.Tensor): [N, ndim]. Points[:, :3] contain xyz points
                and points[:, 3:] contain other information like reflectivity.
            voxel_size (tuple or float): The size of voxel with the shape of
                [3].
            coors_range (tuple or float): The coordinate range of voxel with
                the shape of [6].
            max_points (int, optional): Maximum points contained in a voxel.
                -1 selects dynamic voxelization. Default: 35.
            max_voxels (int, optional): Maximum voxels this function creates.
                -1 selects dynamic voxelization. For SECOND, 20000 is a good
                choice. Users should shuffle points before calling this
                function because max_voxels may drop points. Default: 20000.
            deterministic (bool): Forwarded to the hard-voxelization kernel
                as its ``deterministic`` flag; it only affects hard
                voxelization. The non-deterministic version is considerably
                faster but not deterministic. Default: True. For more
                information on this argument and the implementation
                insights, please refer to the following links:
                https://github.com/open-mmlab/mmdetection3d/issues/894
                https://github.com/open-mmlab/mmdetection3d/pull/904
                It is an experimental feature and we will appreciate it if
                you could share with us the failing cases.

        Returns:
            torch.Tensor | tuple[torch.Tensor]: For dynamic voxelization, a
            single int tensor of per-point voxel coordinates with shape
            [N, 3]. For hard voxelization, a tuple of three tensors: the
            voxels with shape [M, max_points, ndim], the voxel coordinates
            with shape [M, 3], and the number of points per voxel with shape
            [M].
        """
        size_tensor = torch.tensor(voxel_size, dtype=torch.float)
        range_tensor = torch.tensor(coors_range, dtype=torch.float)

        use_dynamic = max_points == -1 or max_voxels == -1
        if use_dynamic:
            # One coordinate triple per input point, filled in by the kernel.
            coors = points.new_zeros(size=(points.size(0), 3), dtype=torch.int)
            ext_module.dynamic_voxelize_forward(
                points, size_tensor, range_tensor, coors, NDim=3)
            return coors

        # Hard voxelization: allocate worst-case buffers, let the kernel
        # fill them and report how many voxels were actually produced.
        voxel_buf = points.new_zeros(
            size=(max_voxels, max_points, points.size(1)))
        coor_buf = points.new_zeros(size=(max_voxels, 3), dtype=torch.int)
        count_buf = points.new_zeros(size=(max_voxels, ), dtype=torch.int)
        voxel_num = torch.zeros(size=(), dtype=torch.long)
        ext_module.hard_voxelize_forward(
            points,
            size_tensor,
            range_tensor,
            voxel_buf,
            coor_buf,
            count_buf,
            voxel_num,
            max_points=max_points,
            max_voxels=max_voxels,
            NDim=3,
            deterministic=deterministic)

        # Keep only the leading ``voxel_num`` valid entries of each buffer.
        return tuple(
            buf[:voxel_num] for buf in (voxel_buf, coor_buf, count_buf))


# Functional alias for ``_Voxelization.apply``.
voxelization = _Voxelization.apply


class Voxelization(nn.Module):
    """Convert kitti points(N, >=3) to voxels.

    Please refer to `Point-Voxel CNN for Efficient 3D Deep Learning
    <https://arxiv.org/abs/1907.03739>`_ for more details.

    Args:
        voxel_size (tuple or float): The size of voxel with the shape of [3].
        point_cloud_range (tuple or float): The coordinate range of voxel with
            the shape of [6].
        max_num_points (int): Maximum points contained in a voxel. If
            max_num_points=-1, it means using dynamic_voxelize.
        max_voxels (tuple or int, optional): Maximum voxels this function
            creates, as a (training, testing) pair; a single int is used for
            both. For SECOND, 20000 is a good choice. Users should shuffle
            points before calling this function because max_voxels may drop
            points. Default: 20000.
        deterministic (bool): Forwarded to the voxelization op; it only
            affects hard voxelization. Default: True.
    """

    def __init__(self,
                 voxel_size: List,
                 point_cloud_range: List,
                 max_num_points: int,
                 max_voxels: Union[tuple, int] = 20000,
                 deterministic: bool = True):
        """
        Args:
            voxel_size (list): list [x, y, z] size of three dimension
            point_cloud_range (list):
                [x_min, y_min, z_min, x_max, y_max, z_max]
            max_num_points (int): max number of points per voxel
            max_voxels (tuple or int): max number of voxels in
                (training, testing) time
            deterministic (bool): Forwarded to the voxelization op; it only
                affects hard voxelization. The non-deterministic version is
                considerably faster but not deterministic. Default: True.
                For more information on this argument and the implementation
                insights, please refer to the following links:
                https://github.com/open-mmlab/mmdetection3d/issues/894
                https://github.com/open-mmlab/mmdetection3d/pull/904
                It is an experimental feature and we will appreciate it if
                you could share with us the failing cases.
        """
        super().__init__()

        self.voxel_size = voxel_size
        self.point_cloud_range = point_cloud_range
        self.max_num_points = max_num_points
        # Stored as a (training, testing) pair.
        self.max_voxels = (
            max_voxels if isinstance(max_voxels, tuple) else _pair(max_voxels))
        self.deterministic = deterministic

        range_tensor = torch.tensor(point_cloud_range, dtype=torch.float32)
        size_tensor = torch.tensor(voxel_size, dtype=torch.float32)
        # Number of voxels along each axis: (max - min) / voxel size.
        extent = range_tensor[3:] - range_tensor[:3]
        self.grid_size = torch.round(extent / size_tensor).long()
        # the origin shape is as [x-len, y-len, z-len]
        # [w, h, d] -> [d, h, w]
        self.pcd_shape = [*self.grid_size[:2], 1][::-1]

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Voxelize ``input`` with the configured settings.

        The training-time voxel budget is used when the module is in
        training mode, the testing-time budget otherwise.
        """
        budget_index = 0 if self.training else 1
        max_voxels = self.max_voxels[budget_index]

        return voxelization(input, self.voxel_size, self.point_cloud_range,
                            self.max_num_points, max_voxels,
                            self.deterministic)

    def __repr__(self):
        """Return a string listing the module's configuration."""
        field_names = ('voxel_size', 'point_cloud_range', 'max_num_points',
                       'max_voxels', 'deterministic')
        described = ', '.join(
            name + '=' + str(getattr(self, name)) for name in field_names)
        return self.__class__.__name__ + '(' + described + ')'


================================================
FILE: mmcv/transforms/__init__.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from .base import BaseTransform
from .builder import TRANSFORMS
from .loading import LoadAnnotations, LoadImageFromFile
from .processing import (CenterCrop, MultiScaleFlipAug, Normalize, Pad,
                         RandomChoiceResize, RandomFlip, RandomGrayscale,
                         RandomResize, Resize, TestTimeAug)
from .wrappers import (Compose, KeyMapper, RandomApply, RandomChoice,
                       TransformBroadcaster)

# ``torch`` is optional for this package: the tensor-formatting transforms
# are only imported and exported when ``torch`` can be imported.
try:
    import torch  # noqa: F401
except ImportError:
    # Without torch: export everything except the formatting transforms.
    __all__ = [
        'BaseTransform', 'TRANSFORMS', 'TransformBroadcaster', 'Compose',
        'RandomChoice', 'KeyMapper', 'LoadImageFromFile', 'LoadAnnotations',
        'Normalize', 'Resize', 'Pad', 'RandomFlip', 'RandomChoiceResize',
        'CenterCrop', 'RandomGrayscale', 'MultiScaleFlipAug', 'RandomResize',
        'RandomApply', 'TestTimeAug'
    ]
else:
    from .formatting import ImageToTensor, ToTensor, to_tensor

    # With torch: additionally export ToTensor, to_tensor and ImageToTensor.
    __all__ = [
        'BaseTransform', 'TRANSFORMS', 'TransformBroadcaster', 'Compose',
        'RandomChoice', 'KeyMapper', 'LoadImageFromFile', 'LoadAnnotations',
        'Normalize', 'Resize', 'Pad', 'ToTensor', 'to_tensor', 'ImageToTensor',
        'RandomFlip', 'RandomChoiceResize', 'CenterCrop', 'RandomGrayscale',
        'MultiScaleFlipAug', 'RandomResize', 'RandomApply', 'TestTimeAug'
    ]


================================================
FILE: mmcv/transforms/base.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from abc import ABCMeta, abstractmethod
from typing import Dict, List, Optional, Tuple, Union


class BaseTransform(metaclass=ABCMeta):
    """Abstract parent class of all transformations.

    Instances are callable; calling one forwards the result dict to
    :meth:`transform`.
    """

    def __call__(self,
                 results: Dict) -> Optional[Union[Dict, Tuple[List, List]]]:
        """Run the transform by delegating to :meth:`transform`."""
        transformed = self.transform(results)
        return transformed

    @abstractmethod
    def transform(self,
                  results: Dict) -> Optional[Union[Dict, Tuple[List, List]]]:
        """Apply the transformation. Every subclass of BaseTransform must
        override this method.

        The method receives the result dict and may add new items to it or
        modify existing ones. Returning the result dict at the end allows
        multiple transforms to be chained into a pipeline.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """


================================================
FILE: mmcv/transforms/builder.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from mmengine.registry import TRANSFORMS  # noqa: F401


================================================
FILE: mmcv/transforms/formatting.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Sequence, Union

import mmengine
import numpy as np
import torch

from .base import BaseTransform
from .builder import TRANSFORMS


def to_tensor(
    data: Union[torch.Tensor, np.ndarray, Sequence, int,
                float]) -> torch.Tensor:
    """Turn a supported Python/NumPy object into a :obj:`torch.Tensor`.

    The checks run in this order: :class:`torch.Tensor`,
    :class:`numpy.ndarray`, non-string :class:`Sequence`, :class:`int` and
    finally :class:`float`.

    Args:
        data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
            be converted.

    Returns:
        torch.Tensor: the converted data. A tensor input is returned as is.

    Raises:
        TypeError: If ``data`` is none of the supported types.
    """
    # A tensor needs no conversion at all.
    if isinstance(data, torch.Tensor):
        return data

    if isinstance(data, np.ndarray):
        return torch.from_numpy(data)

    # Strings are sequences too, but they are not convertible data.
    if isinstance(data, Sequence) and not mmengine.is_str(data):
        return torch.tensor(data)

    # Scalars are wrapped into one-element tensors.
    if isinstance(data, int):
        return torch.LongTensor([data])
    if isinstance(data, float):
        return torch.FloatTensor([data])

    raise TypeError(f'type {type(data)} cannot be converted to tensor.')


@TRANSFORMS.register_module()
class ToTensor(BaseTransform):
    """Convert selected entries of the result dict to :obj:`torch.Tensor`.

    A key may address a nested entry by joining the nesting levels with
    dots, e.g. ``'a.b'`` refers to ``results['a']['b']``.

    Required keys:

    - all these keys in `keys`

    Modified Keys:

    - all these keys in `keys`

    Args:
        keys (Sequence[str]): Keys that need to be converted to Tensor.
    """

    def __init__(self, keys: Sequence[str]) -> None:
        self.keys = keys

    def transform(self, results: dict) -> dict:
        """Replace every configured entry by its tensor counterpart.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: `keys` in results will be updated.

        Raises:
            KeyError: If any level of a (dotted) key is missing.
        """
        for dotted_key in self.keys:
            # Everything before the last dot names the containers to
            # descend through; the last part is the entry to convert.
            *parent_names, leaf_name = dotted_key.split('.')

            container = results
            for name in parent_names:
                if name not in container:
                    raise KeyError(f'Can not find key {dotted_key}')
                container = container[name]

            if leaf_name not in container:
                raise KeyError(f'Can not find key {dotted_key}')
            container[leaf_name] = to_tensor(container[leaf_name])

        return results

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}(keys={self.keys})'


@TRANSFORMS.register_module()
class ImageToTensor(BaseTransform):
    """Convert image to :obj:`torch.Tensor` by given keys.

    The dimension order of input image is (H, W, C). The pipeline will convert
    it to (C, H, W). If only 2 dimension (H, W) is given, the output would be
    (1, H, W).

    Required keys:

    - all these keys in `keys`

    Modified Keys:

    - all these keys in `keys`

    Args:
        keys (Sequence[str]): Key of images to be converted to Tensor.
    """

    # ``keys`` is iterated and each element is used to index ``results``,
    # so it is a sequence of key names rather than a dict.
    def __init__(self, keys: Sequence[str]) -> None:
        self.keys = keys

    def transform(self, results: dict) -> dict:
        """Transform function to convert image in results to
        :obj:`torch.Tensor` and transpose the channel order.

        Args:
            results (dict): Result dict contains the image data to convert.

        Returns:
            dict: The result dict contains the image converted
            to :obj:``torch.Tensor`` and transposed to (C, H, W) order.
        """
        for key in self.keys:
            img = results[key]
            if len(img.shape) < 3:
                # Append a trailing channel axis so that the transpose below
                # is valid for (H, W) inputs as well.
                img = np.expand_dims(img, -1)
            # Move channels first; ``contiguous`` materialises the transposed
            # layout in the resulting tensor.
            chw_tensor = to_tensor(img.transpose(2, 0, 1))
            results[key] = chw_tensor.contiguous()
        return results

    def __repr__(self) -> str:
        return self.__class__.__name__ + f'(keys={self.keys})'


================================================
FILE: mmcv/transforms/loading.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from typing import Optional

import mmengine.fileio as fileio
import numpy as np

import mmcv
from .base import BaseTransform
from .builder import TRANSFORMS


@TRANSFORMS.register_module()
class LoadImageFromFile(BaseTransform):
    """Load an image from file.

    Required Keys:

    - img_path

    Modified Keys:

    - img
    - img_shape
    - ori_shape

    Args:
        to_float32 (bool): Whether to convert the loaded image to a float32
            numpy array. If set to False, the loaded image is an uint8 array.
            Defaults to False.
        color_type (str): The flag argument for :func:`mmcv.imfrombytes`.
            Defaults to 'color'.
        imdecode_backend (str): The image decoding backend type. The backend
            argument for :func:`mmcv.imfrombytes`.
            See :func:`mmcv.imfrombytes` for details.
            Defaults to 'cv2'.
        file_client_args (dict, optional): Arguments to instantiate a
            FileClient. See :class:`mmengine.fileio.FileClient` for details.
            Defaults to None. It will be deprecated in future. Please use
            ``backend_args`` instead.
            Deprecated in version 2.0.0rc4.
        ignore_empty (bool): Whether to allow loading empty image or file path
            not existent. Defaults to False.
        backend_args (dict, optional): Instantiates the corresponding file
            backend. It may contain `backend` key to specify the file
            backend. If it contains, the file backend corresponding to this
            value will be used and initialized with the remaining values,
            otherwise the corresponding file backend will be selected
            based on the prefix of the file path. Defaults to None.
            New in version 2.0.0rc4.
    """

    def __init__(self,
                 to_float32: bool = False,
                 color_type: str = 'color',
                 imdecode_backend: str = 'cv2',
                 file_client_args: Optional[dict] = None,
                 ignore_empty: bool = False,
                 *,
                 backend_args: Optional[dict] = None) -> None:
        self.to_float32 = to_float32
        self.color_type = color_type
        self.imdecode_backend = imdecode_backend
        self.ignore_empty = ignore_empty

        # At most one of the two I/O configurations ends up being set; both
        # are stored as shallow copies of the caller's dicts.
        self.file_client_args: Optional[dict] = None
        self.backend_args: Optional[dict] = None

        uses_legacy_client = file_client_args is not None
        if uses_legacy_client:
            warnings.warn(
                '"file_client_args" will be deprecated in future. '
                'Please use "backend_args" instead', DeprecationWarning)
            if backend_args is not None:
                raise ValueError(
                    '"file_client_args" and "backend_args" cannot be set '
                    'at the same time.')
            self.file_client_args = file_client_args.copy()

        if backend_args is not None:
            self.backend_args = backend_args.copy()

    def transform(self, results: dict) -> Optional[dict]:
        """Read and decode the image referenced by ``results['img_path']``.

        Args:
            results (dict): Result dict from
                :class:`mmengine.dataset.BaseDataset`.

        Returns:
            dict: The dict contains loaded image and meta information.
            ``None`` is returned instead when reading or decoding raises
            and ``ignore_empty`` is True.
        """
        img_path = results['img_path']
        try:
            if self.file_client_args is None:
                raw_bytes = fileio.get(
                    img_path, backend_args=self.backend_args)
            else:
                # Legacy path driven by the deprecated ``file_client_args``.
                client = fileio.FileClient.infer_client(
                    self.file_client_args, img_path)
                raw_bytes = client.get(img_path)
            img = mmcv.imfrombytes(
                raw_bytes, flag=self.color_type, backend=self.imdecode_backend)
        except Exception as e:
            if not self.ignore_empty:
                raise e
            return None

        # in some cases, images are not read successfully, the img would be
        # `None`, refer to https://github.com/open-mmlab/mmpretrain/issues/1427
        assert img is not None, f'failed to load image: {img_path}'

        if self.to_float32:
            img = img.astype(np.float32)

        height_width = img.shape[:2]
        results['img'] = img
        results['img_shape'] = height_width
        results['ori_shape'] = height_width
        return results

    def __repr__(self):
        # Only the I/O configuration that is in effect is reported.
        if self.file_client_args is not None:
            io_part = f'file_client_args={self.file_client_args})'
        else:
            io_part = f'backend_args={self.backend_args})'

        return (f'{self.__class__.__name__}('
                f'ignore_empty={self.ignore_empty}, '
                f'to_float32={self.to_float32}, '
                f"color_type='{self.color_type}', "
                f"imdecode_backend='{self.imdecode_backend}', "
                f'{io_part}')


@TRANSFORMS.register_module()
class LoadAnnotations(BaseTransform):
    """Load and process the ``instances`` and ``seg_map`` annotation provided
    by dataset.

    The annotation format is as the following:

    .. code-block:: python

        {
            'instances':
            [
                {
                # List of 4 numbers representing the bounding box of the
                # instance, in (x1, y1, x2, y2) order.
                'bbox': [x1, y1, x2, y2],

                # Label of image classification.
                'bbox_label': 1,

                # Used in key point detection.
                # Can only load the format of [x1, y1, v1, ..., xn, yn, vn].
                # v[i] means the visibility of this keypoint. n must be equal
                # to the number of keypoint categories.
                'keypoints': [x1, y1, v1, ..., xn, yn, vn]
                }
            ]
            # Filename of semantic or panoptic segmentation ground truth file.
            'seg_map_path': 'a/b/c'
        }

    After this module, the annotation has been changed to the format below:

    .. code-block:: python

        {
            # In (x1, y1, x2, y2) order, float type. N is the number of bboxes
            # in np.float32
            'gt_bboxes': np.ndarray(N, 4)
             # In np.int64 type.
            'gt_bboxes_labels': np.ndarray(N, )
             # In uint8 type.
            'gt_seg_map': np.ndarray (H, W)
             # with (x, y, v) order, in np.float32 type.
            'gt_keypoints': np.ndarray(N, NK, 3)
        }

    Required Keys:

    - instances

      - bbox (optional)
      - bbox_label
      - keypoints (optional)

    - seg_map_path (optional)

    Added Keys:

    - gt_bboxes (np.float32)
    - gt_bboxes_labels (np.int64)
    - gt_seg_map (np.uint8)
    - gt_keypoints (np.float32)

    Args:
        with_bbox (bool): Whether to parse and load the bbox annotation.
            Defaults to True.
        with_label (bool): Whether to parse and load the label annotation.
            Defaults to True.
        with_seg (bool): Whether to parse and load the semantic segmentation
            annotation. Defaults to False.
        with_keypoints (bool): Whether to parse and load the keypoints
            annotation. Defaults to False.
        imdecode_backend (str): The image decoding backend type. The backend
            argument for :func:`mmcv.imfrombytes`.
            See :func:`mmcv.imfrombytes` for details.
            Defaults to 'cv2'.
        file_client_args (dict, optional): Arguments to instantiate a
            FileClient. See :class:`mmengine.fileio.FileClient` for details.
            Defaults to None. It will be deprecated in future. Please use
            ``backend_args`` instead.
            Deprecated in version 2.0.0rc4.
        backend_args (dict, optional): Instantiates the corresponding file
            backend. It may contain `backend` key to specify the file
            backend. If it contains, the file backend corresponding to this
            value will be used and initialized with the remaining values,
            otherwise the corresponding file backend will be selected
            based on the prefix of the file path. Defaults to None.
            New in version 2.0.0rc4.
    """

    def __init__(
        self,
        with_bbox: bool = True,
        with_label: bool = True,
        with_seg: bool = False,
        with_keypoints: bool = False,
        imdecode_backend: str = 'cv2',
        file_client_args: Optional[dict] = None,
        *,
        backend_args: Optional[dict] = None,
    ) -> None:
        super().__init__()
        self.with_bbox = with_bbox
        self.with_label = with_label
        self.with_seg = with_seg
        self.with_keypoints = with_keypoints
        self.imdecode_backend = imdecode_backend

        # At most one of the two I/O configurations ends up being set; both
        # are stored as shallow copies of the caller's dicts.
        self.file_client_args: Optional[dict] = None
        self.backend_args: Optional[dict] = None
        if file_client_args is not None:
            warnings.warn(
                '"file_client_args" will be deprecated in future. '
                'Please use "backend_args" instead', DeprecationWarning)
            if backend_args is not None:
                raise ValueError(
                    '"file_client_args" and "backend_args" cannot be set '
                    'at the same time.')

            self.file_client_args = file_client_args.copy()
        if backend_args is not None:
            self.backend_args = backend_args.copy()

    def _load_bboxes(self, results: dict) -> None:
        """Private function to load bounding box annotations.

        Writes ``results['gt_bboxes']`` as a float32 array reshaped to
        ``(-1, 4)``.

        Args:
            results (dict): Result dict from
                :class:`mmengine.dataset.BaseDataset`. Updated in place.
        """
        gt_bboxes = [instance['bbox'] for instance in results['instances']]
        results['gt_bboxes'] = np.array(
            gt_bboxes, dtype=np.float32).reshape(-1, 4)

    def _load_labels(self, results: dict) -> None:
        """Private function to load label annotations.

        Writes ``results['gt_bboxes_labels']`` as an int64 array.

        Args:
            results (dict): Result dict from
                :class:`mmengine.dataset.BaseDataset`. Updated in place.
        """
        gt_bboxes_labels = [
            instance['bbox_label'] for instance in results['instances']
        ]
        results['gt_bboxes_labels'] = np.array(
            gt_bboxes_labels, dtype=np.int64)

    def _load_seg_map(self, results: dict) -> None:
        """Private function to load semantic segmentation annotations.

        Reads the file at ``results['seg_map_path']``, decodes it with the
        'unchanged' flag and stores the squeezed array in
        ``results['gt_seg_map']``.

        Args:
            results (dict): Result dict from
                :class:`mmengine.dataset.BaseDataset`. Updated in place.
        """
        if self.file_client_args is not None:
            # Legacy path driven by the deprecated ``file_client_args``.
            file_client = fileio.FileClient.infer_client(
                self.file_client_args, results['seg_map_path'])
            img_bytes = file_client.get(results['seg_map_path'])
        else:
            img_bytes = fileio.get(
                results['seg_map_path'], backend_args=self.backend_args)

        results['gt_seg_map'] = mmcv.imfrombytes(
            img_bytes, flag='unchanged',
            backend=self.imdecode_backend).squeeze()

    def _load_kps(self, results: dict) -> None:
        """Private function to load keypoints annotations.

        Writes ``results['gt_keypoints']`` as a float32 array of shape
        ``(N, NK, 3)``, where N is the number of instances. When there is
        no keypoint data at all (e.g. no instances), the array has shape
        ``(N, 0, 3)``.

        Args:
            results (dict): Result dict from
                :class:`mmengine.dataset.BaseDataset`. Updated in place.
        """
        gt_keypoints = [
            instance['keypoints'] for instance in results['instances']
        ]
        num_instances = len(gt_keypoints)
        keypoints = np.array(gt_keypoints, np.float32)
        if keypoints.size == 0:
            # With zero instances ``reshape((0, -1, 3))`` raises because the
            # ``-1`` axis cannot be inferred, so spell the empty shape out.
            keypoints = keypoints.reshape((num_instances, 0, 3))
        else:
            keypoints = keypoints.reshape((num_instances, -1, 3))
        results['gt_keypoints'] = keypoints

    def transform(self, results: dict) -> dict:
        """Function to load multiple types annotations.

        Args:
            results (dict): Result dict from
                :class:`mmengine.dataset.BaseDataset`.

        Returns:
            dict: The dict contains loaded bounding box, label and
            semantic segmentation and keypoints annotations.
        """

        if self.with_bbox:
            self._load_bboxes(results)
        if self.with_label:
            self._load_labels(results)
        if self.with_seg:
            self._load_seg_map(results)
        if self.with_keypoints:
            self._load_kps(results)
        return results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(with_bbox={self.with_bbox}, '
        repr_str += f'with_label={self.with_label}, '
        repr_str += f'with_seg={self.with_seg}, '
        repr_str += f'with_keypoints={self.with_keypoints}, '
        repr_str += f"imdecode_backend='{self.imdecode_backend}', "

        # Only the I/O configuration that is in effect is reported.
        if self.file_client_args is not None:
            repr_str += f'file_client_args={self.file_client_args})'
        else:
            repr_str += f'backend_args={self.backend_args})'

        return repr_str


================================================
FILE: mmcv/transforms/processing.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import random
import warnings
from itertools import product
from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union

import mmengine
import numpy as np

import mmcv
from mmcv.image.geometric import _scale_size
from .base import BaseTransform
from .builder import TRANSFORMS
from .utils import cache_randomness
from .wrappers import Compose

Number = Union[int, float]


@TRANSFORMS.register_module()
class Normalize(BaseTransform):
    """Normalize the image.

    Required Keys:

    - img

    Modified Keys:

    - img

    Added Keys:

    - img_norm_cfg

      - mean
      - std
      - to_rgb


    Args:
        mean (sequence): Mean values of 3 channels.
        std (sequence): Std values of 3 channels.
        to_rgb (bool): Whether to convert the image from BGR to RGB before
            normalizing the image. If ``to_rgb=True``, the order of mean and
            std should be RGB. If ``to_rgb=False``, the order of mean and std
            should be the same order of the image. Defaults to True.
    """

    def __init__(self,
                 mean: Sequence[Number],
                 std: Sequence[Number],
                 to_rgb: bool = True) -> None:
        # The statistics are stored as float32 arrays.
        self.to_rgb = to_rgb
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)

    def transform(self, results: dict) -> dict:
        """Normalize ``results['img']`` and record the settings used.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Normalized results, with the 'img_norm_cfg' key added to
            the result dict.
        """
        normalized = mmcv.imnormalize(results['img'], self.mean, self.std,
                                      self.to_rgb)
        results['img'] = normalized
        results['img_norm_cfg'] = {
            'mean': self.mean,
            'std': self.std,
            'to_rgb': self.to_rgb,
        }
        return results

    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}'
                f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})')


@TRANSFORMS.register_module()
class Resize(BaseTransform):
    """Resize images & bbox & seg & keypoints.

    This transform resizes the input image according to ``scale`` or
    ``scale_factor``. Bboxes, seg map and keypoints are then resized with the
    same scale factor.
    If ``scale`` and ``scale_factor`` are both set, it will use ``scale`` to
    resize.

    Required Keys:

    - img
    - gt_bboxes (optional)
    - gt_seg_map (optional)
    - gt_keypoints (optional)

    Modified Keys:

    - img
    - gt_bboxes
    - gt_seg_map
    - gt_keypoints
    - img_shape

    Added Keys:

    - scale
    - scale_factor
    - keep_ratio

    Args:
        scale (int or tuple): Images scales for resizing. Defaults to None
        scale_factor (float or tuple[float]): Scale factors for resizing.
            Defaults to None.
        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
            image. Defaults to False.
        clip_object_border (bool): Whether to clip the objects
            outside the border of the image. In some dataset like MOT17, the gt
            bboxes are allowed to cross the border of images. Therefore, we
            don't need to clip the gt bboxes in these cases. Defaults to True.
        backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
            These two backends generates slightly different results. Defaults
            to 'cv2'.
        interpolation (str): Interpolation method, accepted values are
            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
            to 'bilinear'.
    """

    def __init__(self,
                 scale: Optional[Union[int, Tuple[int, int]]] = None,
                 scale_factor: Optional[Union[float, Tuple[float,
                                                           float]]] = None,
                 keep_ratio: bool = False,
                 clip_object_border: bool = True,
                 backend: str = 'cv2',
                 interpolation='bilinear') -> None:
        assert scale is not None or scale_factor is not None, (
            '`scale` and `scale_factor` can not both be `None`')
        if scale is None:
            self.scale = None
        else:
            # A single int is expanded to the same value for both entries.
            if isinstance(scale, int):
                self.scale = (scale, scale)
            else:
                self.scale = scale

        self.backend = backend
        self.interpolation = interpolation
        self.keep_ratio = keep_ratio
        self.clip_object_border = clip_object_border
        if scale_factor is None:
            self.scale_factor = None
        elif isinstance(scale_factor, float):
            self.scale_factor = (scale_factor, scale_factor)
        elif isinstance(scale_factor, tuple):
            assert (len(scale_factor)) == 2
            self.scale_factor = scale_factor
        else:
            raise TypeError(
                f'expect scale_factor is float or Tuple(float), but '
                f'get {type(scale_factor)}')

    def _resize_img(self, results: dict) -> None:
        """Resize images with ``results['scale']``.

        Also writes ``img_shape``, ``scale_factor`` (as ``(w_scale,
        h_scale)``) and ``keep_ratio`` into ``results``. Nothing happens
        when ``results`` has no image.
        """

        if results.get('img', None) is not None:
            if self.keep_ratio:
                img, scale_factor = mmcv.imrescale(
                    results['img'],
                    results['scale'],
                    interpolation=self.interpolation,
                    return_scale=True,
                    backend=self.backend)
                # the w_scale and h_scale has minor difference
                # a real fix should be done in the mmcv.imrescale in the future
                new_h, new_w = img.shape[:2]
                h, w = results['img'].shape[:2]
                w_scale = new_w / w
                h_scale = new_h / h
            else:
                img, w_scale, h_scale = mmcv.imresize(
                    results['img'],
                    results['scale'],
                    interpolation=self.interpolation,
                    return_scale=True,
                    backend=self.backend)
            results['img'] = img
            results['img_shape'] = img.shape[:2]
            results['scale_factor'] = (w_scale, h_scale)
            results['keep_ratio'] = self.keep_ratio

    def _resize_bboxes(self, results: dict) -> None:
        """Resize bounding boxes with ``results['scale_factor']``."""
        if results.get('gt_bboxes', None) is not None:
            # Tile (w_scale, h_scale) twice to cover the four box coordinates.
            bboxes = results['gt_bboxes'] * np.tile(
                np.array(results['scale_factor']), 2)
            if self.clip_object_border:
                # Even columns are clipped to the image width, odd columns
                # to the image height.
                bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0,
                                          results['img_shape'][1])
                bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0,
                                          results['img_shape'][0])
            results['gt_bboxes'] = bboxes

    def _resize_seg(self, results: dict) -> None:
        """Resize semantic segmentation map with ``results['scale']``.

        The map is always resized with 'nearest' interpolation, regardless
        of ``self.interpolation``.
        """
        if results.get('gt_seg_map', None) is not None:
            if self.keep_ratio:
                gt_seg = mmcv.imrescale(
                    results['gt_seg_map'],
                    results['scale'],
                    interpolation='nearest',
                    backend=self.backend)
            else:
                gt_seg = mmcv.imresize(
                    results['gt_seg_map'],
                    results['scale'],
                    interpolation='nearest',
                    backend=self.backend)
            results['gt_seg_map'] = gt_seg

    def _resize_keypoints(self, results: dict) -> None:
        """Resize keypoints with ``results['scale_factor']``.

        Only the first two entries of the last axis are scaled (and
        optionally clipped). The keypoint array is modified in place.
        """
        if results.get('gt_keypoints', None) is not None:
            keypoints = results['gt_keypoints']

            keypoints[:, :, :2] = keypoints[:, :, :2] * np.array(
                results['scale_factor'])
            if self.clip_object_border:
                keypoints[:, :, 0] = np.clip(keypoints[:, :, 0], 0,
                                             results['img_shape'][1])
                keypoints[:, :, 1] = np.clip(keypoints[:, :, 1], 0,
                                             results['img_shape'][0])
            results['gt_keypoints'] = keypoints

    def transform(self, results: dict) -> dict:
        """Transform function to resize images, bounding boxes, semantic
        segmentation map and keypoints.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Resized results, 'img', 'gt_bboxes', 'gt_seg_map',
            'gt_keypoints', 'scale', 'scale_factor', 'img_shape',
            and 'keep_ratio' keys are updated in result dict.
        """

        if self.scale:
            results['scale'] = self.scale
        else:
            # Derive the target scale from the reversed image shape and the
            # configured scale factor.
            img_shape = results['img'].shape[:2]
            results['scale'] = _scale_size(img_shape[::-1],
                                           self.scale_factor)  # type: ignore
        self._resize_img(results)
        self._resize_bboxes(results)
        self._resize_seg(results)
        self._resize_keypoints(results)
        return results

    def __repr__(self) -> str:
        # A single closing parenthesis at the very end keeps the output
        # balanced.
        repr_str = self.__class__.__name__
        repr_str += f'(scale={self.scale}, '
        repr_str += f'scale_factor={self.scale_factor}, '
        repr_str += f'keep_ratio={self.keep_ratio}, '
        repr_str += f'clip_object_border={self.clip_object_border}, '
        repr_str += f'backend={self.backend}, '
        repr_str += f'interpolation={self.interpolation})'
        return repr_str


@TRANSFORMS.register_module()
class Pad(BaseTransform):
    """Pad the image & segmentation map.

    There are three padding modes: (1) pad to a fixed size and (2) pad to the
    minimum size that is divisible by some number. and (3)pad to square. Also,
    pad to square and pad to the minimum size can be used as the same time.

    Required Keys:

    - img
    - gt_bboxes (optional)
    - gt_seg_map (optional)

    Modified Keys:

    - img
    - gt_seg_map
    - img_shape

    Added Keys:

    - pad_shape
    - pad_fixed_size
    - pad_size_divisor

    Args:
        size (tuple, optional): Fixed padding size.
            Expected padding shape (w, h). Defaults to None.
        size_divisor (int, optional): The divisor of padded size. Defaults to
            None.
        pad_to_square (bool): Whether to pad the image into a square.
            Currently only used for YOLOX. Defaults to False.
        pad_val (Number | dict[str, Number], optional): Padding value for if
            the pad_mode is "constant". If it is a single number, the value
            to pad the image is the number and to pad the semantic
            segmentation map is 255. If it is a dict, it should have the
            following keys:

            - img: The value to pad the image.
            - seg: The value to pad the semantic segmentation map.

            Defaults to dict(img=0, seg=255).
        padding_mode (str): Type of padding. Should be: constant, edge,
            reflect or symmetric. Defaults to 'constant'.

            - constant: pads with a constant value, this value is specified
              with pad_val.
            - edge: pads with the last value at the edge of the image.
            - reflect: pads with reflection of image without repeating the last
              value on the edge. For example, padding [1, 2, 3, 4] with 2
              elements on both sides in reflect mode will result in
              [3, 2, 1, 2, 3, 4, 3, 2].
            - symmetric: pads with reflection of image repeating the last value
              on the edge. For example, padding [1, 2, 3, 4] with 2 elements on
              both sides in symmetric mode will result in
              [2, 1, 1, 2, 3, 4, 4, 3]
    """

    def __init__(self,
                 size: Optional[Tuple[int, int]] = None,
                 size_divisor: Optional[int] = None,
                 pad_to_square: bool = False,
                 pad_val: Union[Number, dict] = dict(img=0, seg=255),
                 padding_mode: str = 'constant') -> None:
        self.size = size
        self.size_divisor = size_divisor
        # A bare number (int or float) only configures the image padding
        # value; the segmentation map keeps the default 255.
        if isinstance(pad_val, (int, float)):
            pad_val = dict(img=pad_val, seg=255)
        assert isinstance(pad_val, dict), (
            f'pad_val must be a number or a dict, but got {type(pad_val)}')
        self.pad_val = pad_val
        self.pad_to_square = pad_to_square

        if pad_to_square:
            # ``size_divisor`` may still be combined with ``pad_to_square``.
            assert size is None, \
                'The size must be None when pad_to_square is True'
        else:
            assert size is not None or size_divisor is not None, \
                'only one of size and size_divisor should be valid'
            assert size is None or size_divisor is None
        assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']
        self.padding_mode = padding_mode

    def _pad_img(self, results: dict) -> None:
        """Pad images according to ``self.size``.

        Also writes ``pad_shape``, ``pad_fixed_size``, ``pad_size_divisor``
        and ``img_shape`` into ``results``.
        """
        pad_val = self.pad_val.get('img', 0)

        size = None
        if self.pad_to_square:
            max_size = max(results['img'].shape[:2])
            size = (max_size, max_size)
        if self.size_divisor is not None:
            if size is None:
                size = (results['img'].shape[0], results['img'].shape[1])
            # Round each side up to the next multiple of the divisor.
            pad_h = int(np.ceil(
                size[0] / self.size_divisor)) * self.size_divisor
            pad_w = int(np.ceil(
                size[1] / self.size_divisor)) * self.size_divisor
            size = (pad_h, pad_w)
        elif self.size is not None:
            # ``self.size`` is (w, h); reverse it to (h, w).
            size = self.size[::-1]
        # Expand a scalar into one value per channel for 3-dim images.
        if isinstance(pad_val, (int, float)) and results['img'].ndim == 3:
            pad_val = tuple(pad_val for _ in range(results['img'].shape[2]))
        padded_img = mmcv.impad(
            results['img'],
            shape=size,
            pad_val=pad_val,
            padding_mode=self.padding_mode)

        results['img'] = padded_img
        results['pad_shape'] = padded_img.shape
        results['pad_fixed_size'] = self.size
        results['pad_size_divisor'] = self.size_divisor
        results['img_shape'] = padded_img.shape[:2]

    def _pad_seg(self, results: dict) -> None:
        """Pad semantic segmentation map according to
        ``results['pad_shape']``."""
        if results.get('gt_seg_map', None) is not None:
            pad_val = self.pad_val.get('seg', 255)
            # Expand a scalar into one value per channel for 3-dim maps.
            if isinstance(pad_val,
                          (int, float)) and results['gt_seg_map'].ndim == 3:
                pad_val = tuple(
                    pad_val for _ in range(results['gt_seg_map'].shape[2]))
            results['gt_seg_map'] = mmcv.impad(
                results['gt_seg_map'],
                shape=results['pad_shape'][:2],
                pad_val=pad_val,
                padding_mode=self.padding_mode)

    def transform(self, results: dict) -> dict:
        """Call function to pad images, masks, semantic segmentation maps.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Updated result dict.
        """
        # The image must be padded first: ``_pad_seg`` reads the
        # ``pad_shape`` entry that ``_pad_img`` writes.
        self._pad_img(results)
        self._pad_seg(results)
        return results

    def __repr__(self) -> str:
        # A single closing parenthesis at the very end keeps the output
        # balanced.
        repr_str = self.__class__.__name__
        repr_str += f'(size={self.size}, '
        repr_str += f'size_divisor={self.size_divisor}, '
        repr_str += f'pad_to_square={self.pad_to_square}, '
        repr_str += f'pad_val={self.pad_val}, '
        repr_str += f'padding_mode={self.padding_mode})'
        return repr_str


@TRANSFORMS.register_module()
class CenterCrop(BaseTransform):
    """Crop the center of the image, segmentation masks, bounding boxes and key
    points. If the crop area exceeds the original image and ``auto_pad`` is
    True, the original image will be padded before cropping. If ``auto_pad``
    is False, the crop size is reduced to fit inside the image instead.

    Required Keys:

    - img
    - gt_seg_map (optional)
    - gt_bboxes (optional)
    - gt_keypoints (optional)

    Modified Keys:

    - img
    - img_shape
    - gt_seg_map (optional)
    - gt_bboxes (optional)
    - gt_keypoints (optional)

    Added Key:

    - pad_shape


    Args:
        crop_size (Union[int, Tuple[int, int]]):  Expected size after cropping
            with the format of (w, h). If set to an integer, then cropping
            width and height are equal to this integer.
        auto_pad (bool): Whether to pad the image if it's smaller than the
            ``crop_size``. Defaults to False.
        pad_cfg (dict): Base config for padding. Refer to ``mmcv.Pad`` for
            detail. Defaults to ``dict(type='Pad')``.
        clip_object_border (bool): Whether to clip the objects
            outside the border of the image. In some dataset like MOT17, the
            gt bboxes are allowed to cross the border of images. Therefore,
            we don't need to clip the gt bboxes in these cases.
            Defaults to True.
    """

    def __init__(self,
                 crop_size: Union[int, Tuple[int, int]],
                 auto_pad: bool = False,
                 pad_cfg: dict = dict(type='Pad'),
                 clip_object_border: bool = True) -> None:
        super().__init__()
        # The message is parenthesized so that both literals belong to the
        # assertion; an unparenthesized second line would be a separate
        # (dead) expression statement and the message would be cut short.
        assert isinstance(crop_size, int) or (
            isinstance(crop_size, tuple) and len(crop_size) == 2
        ), ('The expected crop_size is an integer, or a tuple containing two '
            'integers')

        if isinstance(crop_size, int):
            crop_size = (crop_size, crop_size)
        assert crop_size[0] > 0 and crop_size[1] > 0
        self.crop_size = crop_size
        self.auto_pad = auto_pad

        # copy so that the (shared) default dict is never aliased
        self.pad_cfg = pad_cfg.copy()
        # size will be overwritten
        if 'size' in self.pad_cfg and auto_pad:
            warnings.warn('``size`` is set in ``pad_cfg``,'
                          'however this argument will be overwritten'
                          ' according to crop size and image size')

        self.clip_object_border = clip_object_border

    def _crop_img(self, results: dict, bboxes: np.ndarray) -> None:
        """Crop image and record its new shape.

        Writes ``img``, ``img_shape`` and ``pad_shape`` into ``results``.

        Args:
            results (dict): Result dict contains the data to transform.
            bboxes (np.ndarray): Shape (4, ), location of cropped bboxes.
        """
        if results.get('img', None) is not None:
            img = mmcv.imcrop(results['img'], bboxes=bboxes)
            img_shape = img.shape[:2]  # type: ignore
            results['img'] = img
            results['img_shape'] = img_shape
            results['pad_shape'] = img_shape

    def _crop_seg_map(self, results: dict, bboxes: np.ndarray) -> None:
        """Crop semantic segmentation map.

        Args:
            results (dict): Result dict contains the data to transform.
            bboxes (np.ndarray): Shape (4, ), location of cropped bboxes.
        """
        if results.get('gt_seg_map', None) is not None:
            img = mmcv.imcrop(results['gt_seg_map'], bboxes=bboxes)
            results['gt_seg_map'] = img

    def _crop_bboxes(self, results: dict, bboxes: np.ndarray) -> None:
        """Update bounding boxes according to CenterCrop.

        The boxes are shifted by the crop's top-left corner and, when
        ``clip_object_border`` is set, clipped to the shape of
        ``results['img']`` (already cropped when called from
        :meth:`transform`).

        Args:
            results (dict): Result dict contains the data to transform.
            bboxes (np.ndarray): Shape (4, ), location of cropped bboxes.
        """
        if 'gt_bboxes' in results:
            offset_w = bboxes[0]
            offset_h = bboxes[1]
            bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h])
            # gt_bboxes has shape (num_gts, 4) in (tl_x, tl_y, br_x, br_y)
            # order.
            gt_bboxes = results['gt_bboxes'] - bbox_offset
            if self.clip_object_border:
                gt_bboxes[:, 0::2] = np.clip(gt_bboxes[:, 0::2], 0,
                                             results['img'].shape[1])
                gt_bboxes[:, 1::2] = np.clip(gt_bboxes[:, 1::2], 0,
                                             results['img'].shape[0])
            results['gt_bboxes'] = gt_bboxes

    def _crop_keypoints(self, results: dict, bboxes: np.ndarray) -> None:
        """Update key points according to CenterCrop. Keypoints that not in the
        cropped image will be set invisible.

        Args:
            results (dict): Result dict contains the data to transform.
            bboxes (np.ndarray): Shape (4, ), location of cropped bboxes.
        """
        if 'gt_keypoints' in results:
            offset_w = bboxes[0]
            offset_h = bboxes[1]
            keypoints_offset = np.array([offset_w, offset_h, 0])
            # gt_keypoints has shape (N, NK, 3) in (x, y, visibility) order,
            # NK = number of points per object
            gt_keypoints = results['gt_keypoints'] - keypoints_offset
            # set gt_kepoints out of the result image invisible
            height, width = results['img'].shape[:2]
            valid_pos = (gt_keypoints[:, :, 0] >=
                         0) * (gt_keypoints[:, :, 0] <
                               width) * (gt_keypoints[:, :, 1] >= 0) * (
                                   gt_keypoints[:, :, 1] < height)
            gt_keypoints[:, :, 2] = np.where(valid_pos, gt_keypoints[:, :, 2],
                                             0)
            gt_keypoints[:, :, 0] = np.clip(gt_keypoints[:, :, 0], 0,
                                            results['img'].shape[1])
            gt_keypoints[:, :, 1] = np.clip(gt_keypoints[:, :, 1], 0,
                                            results['img'].shape[0])
            results['gt_keypoints'] = gt_keypoints

    def transform(self, results: dict) -> dict:
        """Apply center crop on results.

        Args:
            results (dict): Result dict contains the data to transform.

        Returns:
            dict: Results with CenterCropped image and semantic segmentation
            map.
        """
        crop_width, crop_height = self.crop_size[0], self.crop_size[1]

        assert 'img' in results, '`img` is not found in results'
        img = results['img']
        # img.shape has length 2 for grayscale, length 3 for color
        img_height, img_width = img.shape[:2]

        if crop_height > img_height or crop_width > img_width:
            if self.auto_pad:
                # pad the area
                img_height = max(img_height, crop_height)
                img_width = max(img_width, crop_width)
                pad_size = (img_width, img_height)
                _pad_cfg = self.pad_cfg.copy()
                _pad_cfg.update(dict(size=pad_size))
                pad_transform = TRANSFORMS.build(_pad_cfg)
                results = pad_transform(results)
            else:
                # without padding the crop cannot exceed the image
                crop_height = min(crop_height, img_height)
                crop_width = min(crop_width, img_width)

        # (x2, y2) is the inclusive bottom-right corner, hence the ``- 1``
        y1 = max(0, int(round((img_height - crop_height) / 2.)))
        x1 = max(0, int(round((img_width - crop_width) / 2.)))
        y2 = min(img_height, y1 + crop_height) - 1
        x2 = min(img_width, x1 + crop_width) - 1
        bboxes = np.array([x1, y1, x2, y2])

        # crop the image first: the bbox/keypoint steps read the cropped
        # image's shape from ``results['img']``
        self._crop_img(results, bboxes)
        # crop the gt_seg_map
        self._crop_seg_map(results, bboxes)
        # crop the bounding box
        self._crop_bboxes(results, bboxes)
        # crop the keypoints
        self._crop_keypoints(results, bboxes)
        return results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(crop_size = {self.crop_size}'
        repr_str += f', auto_pad={self.auto_pad}'
        repr_str += f', pad_cfg={self.pad_cfg}'
        repr_str += f',clip_object_border = {self.clip_object_border})'
        return repr_str


@TRANSFORMS.register_module()
class RandomGrayscale(BaseTransform):
    """Randomly convert image to grayscale with a probability.

    Required Key:

    - img

    Modified Key:

    - img

    Args:
        prob (float): Probability that image should be converted to
            grayscale. Defaults to 0.1.
        keep_channels (bool): Whether keep channel number the same as
            input. Defaults to False.
        channel_weights (tuple): The grayscale weights of each channel,
            and the weights will be normalized. For example, (1, 2, 1)
            will be normalized as (0.25, 0.5, 0.25). Defaults to
            (1., 1., 1.).
        color_format (str): Color format set to be any of 'bgr',
            'rgb', 'hsv'. Note: 'hsv' image will be transformed into 'bgr'
            format no matter whether it is grayscaled. Defaults to 'bgr'.
    """

    def __init__(self,
                 prob: float = 0.1,
                 keep_channels: bool = False,
                 channel_weights: Sequence[float] = (1., 1., 1.),
                 color_format: str = 'bgr') -> None:
        super().__init__()
        assert 0. <= prob <= 1., ('The range of ``prob`` value is [0., 1.],' +
                                  f' but got {prob} instead')
        self.prob = prob
        self.keep_channels = keep_channels
        self.channel_weights = channel_weights
        assert color_format in ['bgr', 'rgb', 'hsv']
        self.color_format = color_format

    @cache_randomness
    def _random_prob(self):
        """Draw the random number that is compared against ``prob``."""
        return random.random()

    def transform(self, results: dict) -> dict:
        """Apply random grayscale on results.

        Args:
            results (dict): Result dict contains the data to transform.

        Returns:
           dict: Results with grayscale image.
        """
        img = results['img']
        # convert hsv to bgr
        if self.color_format == 'hsv':
            img = mmcv.hsv2bgr(img)
        # give 2-dim images an explicit channel axis
        img = img[..., None] if img.ndim == 2 else img
        num_output_channels = img.shape[2]
        if self._random_prob() < self.prob:
            if num_output_channels > 1:
                # The message is parenthesized so that all three literals
                # are part of the assertion instead of dead statements.
                assert num_output_channels == len(self.channel_weights), (
                    'The length of ``channel_weights`` are supposed to be '
                    f'num_output_channels, but got {len(self.channel_weights)}'
                    ' instead.')
                normalized_weights = (
                    np.array(self.channel_weights) / sum(self.channel_weights))
                # weighted sum over the channel axis
                img = (normalized_weights * img).sum(axis=2)
                img = img.astype('uint8')
                if self.keep_channels:
                    # replicate the gray plane to the original channel count
                    img = img[:, :, None]
                    results['img'] = np.dstack(
                        [img for _ in range(num_output_channels)])
                else:
                    results['img'] = img
                return results
        # not selected for conversion, or a single-channel input: only the
        # dtype cast (and the hsv -> bgr conversion above) is applied
        img = img.astype('uint8')
        results['img'] = img
        return results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(prob = {self.prob}'
        repr_str += f', keep_channels = {self.keep_channels}'
        repr_str += f', channel_weights = {self.channel_weights}'
        repr_str += f', color_format = {self.color_format})'
        return repr_str


@TRANSFORMS.register_module()
class MultiScaleFlipAug(BaseTransform):
    """Test-time augmentation with multiple scales and flipping.

    An example configuration is as followed:

    .. code-block::

        dict(
            type='MultiScaleFlipAug',
            scales=[(1333, 400), (1333, 800)],
            allow_flip=True,
            transforms=[
                dict(type='Normalize', **img_norm_cfg),
                dict(type='Pad', size_divisor=1),
                dict(type='ImageToTensor', keys=['img']),
                dict(type='Collect', keys=['img'])
            ])

    ``results`` will be resized using all the sizes in ``scales``.
    If ``allow_flip`` is True, then flipped results will also be added into
    output list.

    For the above configuration, there are four combinations of resize
    and flip:

    - Resize to (1333, 400) + no flip
    - Resize to (1333, 400) + flip
    - Resize to (1333, 800) + no flip
    - resize to (1333, 800) + flip

    The four results are then transformed with ``transforms`` argument,
    whose output must contain the keys ``inputs`` and ``data_sample``.
    After that, results are wrapped into lists of the same length as below:

    .. code-block::

        dict(
            inputs=[...],
            data_sample=[...]
        )

    Where the length of ``inputs`` and ``data_sample`` are both 4.

    Required Keys:

    - Depending on the requirements of the ``transforms`` parameter.

    Modified Keys:

    - All output keys of each transform.

    Args:
        transforms (list[dict]): Transforms to be applied to each resized
            and flipped data.
        scales (tuple | list[tuple] | None): Images scales for resizing.
        scale_factor (float or tuple[float]): Scale factors for resizing.
            Only used when ``scales`` is None. Defaults to None.
        allow_flip (bool): Whether apply flip augmentation. Defaults to False.
        flip_direction (str | list[str]): Flip augmentation directions,
            options are "horizontal", "vertical" and "diagonal". If
            flip_direction is a list, multiple flip augmentations will be
            applied. It has no effect when ``allow_flip`` is False. Defaults
            to "horizontal".
        resize_cfg (dict): Base config for resizing. Defaults to
            ``dict(type='Resize', keep_ratio=True)``.
        flip_cfg (dict): Base config for flipping. Defaults to
            ``dict(type='RandomFlip')``.
    """

    def __init__(
        self,
        transforms: List[dict],
        scales: Optional[Union[Tuple, List[Tuple]]] = None,
        scale_factor: Optional[Union[float, List[float]]] = None,
        allow_flip: bool = False,
        flip_direction: Union[str, List[str]] = 'horizontal',
        resize_cfg: dict = dict(type='Resize', keep_ratio=True),
        flip_cfg: dict = dict(type='RandomFlip')
    ) -> None:
        super().__init__()
        self.transforms = Compose(transforms)  # type: ignore

        if scales is not None:
            self.scales = scales if isinstance(scales, list) else [scales]
            self.scale_key = 'scale'
            assert mmengine.is_list_of(self.scales, tuple)
        else:
            # if ``scales`` and ``scale_factor`` both be ``None``
            if scale_factor is None:
                self.scales = [1.]  # type: ignore
            elif isinstance(scale_factor, list):
                self.scales = scale_factor  # type: ignore
            else:
                self.scales = [scale_factor]  # type: ignore

            self.scale_key = 'scale_factor'

        self.allow_flip = allow_flip
        self.flip_direction = flip_direction if isinstance(
            flip_direction, list) else [flip_direction]
        assert mmengine.is_list_of(self.flip_direction, str)
        if not self.allow_flip and self.flip_direction != ['horizontal']:
            warnings.warn(
                'flip_direction has no effect when flip is set to False')
        # Both base configs are copied so that neither the caller's dict nor
        # the shared default-argument dict is aliased by this instance.
        self.resize_cfg = resize_cfg.copy()
        self.flip_cfg = flip_cfg.copy()

    def transform(self, results: dict) -> Dict:
        """Apply test time augment transforms on results.

        The input dict itself is left untouched; every scale/flip combination
        works on its own shallow copy.

        Args:
            results (dict): Result dict contains the data to transform.

        Returns:
            dict: The augmented data, where each value is wrapped
            into a list. The keys are ``inputs`` and ``data_sample``.
        """

        data_samples = []
        inputs = []
        flip_args = [(False, '')]
        if self.allow_flip:
            flip_args += [(True, direction)
                          for direction in self.flip_direction]
        for scale in self.scales:
            for flip, direction in flip_args:
                _resize_cfg = self.resize_cfg.copy()
                _resize_cfg.update({self.scale_key: scale})
                _resize_flip = [_resize_cfg]

                if flip:
                    _flip_cfg = self.flip_cfg.copy()
                    _flip_cfg.update(prob=1.0, direction=direction)
                    _resize_flip.append(_flip_cfg)

                # Every branch starts from "not flipped"; the flip transform
                # (if any) overwrites these. They are written to the copy so
                # the caller's dict is not modified.
                _results = results.copy()
                _results['flip'] = False
                _results['flip_direction'] = None

                resize_flip = Compose(_resize_flip)
                _results = resize_flip(_results)
                packed_results = self.transforms(_results)  # type: ignore

                inputs.append(packed_results['inputs'])  # type: ignore
                data_samples.append(
                    packed_results['data_sample'])  # type: ignore
        return dict(inputs=inputs, data_sample=data_samples)

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(transforms={self.transforms}'
        repr_str += f', scales={self.scales}'
        repr_str += f', allow_flip={self.allow_flip}'
        repr_str += f', flip_direction={self.flip_direction})'
        return repr_str


@TRANSFORMS.register_module()
class TestTimeAug(BaseTransform):
    """Test-time augmentation transform.

    An example configuration is as followed:

    .. code-block::

        dict(type='TestTimeAug',
             transforms=[
                [dict(type='Resize', scale=(1333, 400), keep_ratio=True),
                 dict(type='Resize', scale=(1333, 800), keep_ratio=True)],
                [dict(type='RandomFlip', prob=1.),
                 dict(type='RandomFlip', prob=0.)],
                [dict(type='PackDetInputs',
                      meta_keys=('img_id', 'img_path', 'ori_shape',
                                 'img_shape', 'scale_factor', 'flip',
                                 'flip_direction'))]])

    ``results`` will be transformed using all transforms defined in
    ``transforms`` arguments.

    For the above configuration, there are four combinations of resize
    and flip:

    - Resize to (1333, 400) + no flip
    - Resize to (1333, 400) + flip
    - Resize to (1333, 800) + no flip
    - resize to (1333, 800) + flip

    After that, results are wrapped into lists of the same length as below:

    .. code-block::

        dict(
            inputs=[...],
            data_samples=[...]
        )

    The length of ``inputs`` and ``data_samples`` are both 4.

    Required Keys:

    - Depending on the requirements of the ``transforms`` parameter.

    Modified Keys:

    - All output keys of each transform.

    Args:
        transforms (list[list[dict]]): Transforms to be applied to data sampled
            from dataset. ``transforms`` is a list of list, and each list
            element usually represents a series of transforms with the same
            type and different arguments. Data will be processed by each list
            elements sequentially. See more information in :meth:`transform`.
            Entries may be config dicts or callables; the given lists are
            not modified.

    Raises:
        TypeError: If an entry is neither a dict nor callable.
    """

    def __init__(self, transforms: list):
        # Build into fresh lists instead of writing the built objects back
        # into the caller's (config) lists.
        built_transforms = []
        for transform_list in transforms:
            built_list = []
            for transform in transform_list:
                if isinstance(transform, dict):
                    built_list.append(TRANSFORMS.build(transform))
                elif callable(transform):
                    built_list.append(transform)
                else:
                    raise TypeError(
                        'transform must be callable or a dict, but got'
                        f' {type(transform)}')
            built_transforms.append(built_list)

        # one pipeline per combination of one transform from every list
        self.subroutines = [
            Compose(subroutine) for subroutine in product(*built_transforms)
        ]

    def transform(self, results: dict) -> dict:
        """Apply all transforms defined in :attr:`transforms` to the results.

        As the example given in :obj:`TestTimeAug`, ``transforms`` consists of
        2 ``Resize``, 2 ``RandomFlip`` and 1 ``PackDetInputs``.
        The data sampled from dataset will be processed as follows:

        1. Data will be processed by 2 ``Resize`` and return a list
           of 2 results.
        2. Each result in list will be further passed to 2
           ``RandomFlip``, and aggregates into a list of 4 results.
        3. Each result will be processed by ``PackDetInputs``, and
           return a list of dict.
        4. Aggregates the same fields of results, and finally returns
           a dict. Each value of the dict represents 4 transformed
           results.

        Args:
            results (dict): Result dict contains the data to transform.

        Returns:
            dict: The augmented data, where each value is wrapped
            into a list.
        """
        results_list = []  # type: ignore
        for subroutine in self.subroutines:
            # every pipeline gets its own deep copy of the input
            result = subroutine(copy.deepcopy(results))
            # The None check comes first; after the dict check it could
            # never fire and its more specific hint would be lost.
            assert result is not None, (
                f'Data processed by {subroutine} in `TestTimeAug` should not '
                'be None! Please check your validation dataset and the '
                f'transforms in {subroutine}')
            assert isinstance(result, dict), (
                f'Data processed by {subroutine} must return a dict, but got '
                f'{result}')
            results_list.append(result)

        # the keys of the first result decide which fields are collected
        aug_data_dict = {
            key: [item[key] for item in results_list]  # type: ignore
            for key in results_list[0]  # type: ignore
        }
        return aug_data_dict

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += 'transforms=\n'
        for subroutine in self.subroutines:
            repr_str += f'{repr(subroutine)}\n'
        return repr_str


@TRANSFORMS.register_module()
class RandomChoiceResize(BaseTransform):
    """Resize images & bbox & mask to a scale picked from a list of scales.

    One entry of ``scales`` is drawn at random for every call and handed to
    the wrapped resize transform, which then resizes the image together with
    the bboxes and masks.

    The target scale is determined as follows:

    - if ``scales`` is a list, the target scale is sampled uniformly from
      the list.
    - otherwise ``scales`` itself is the only candidate and is always used.

    Required Keys:

    - img
    - gt_bboxes (optional)
    - gt_seg_map (optional)
    - gt_keypoints (optional)

    Modified Keys:

    - img
    - img_shape
    - gt_bboxes (optional)
    - gt_seg_map (optional)
    - gt_keypoints (optional)

    Added Keys:

    - scale
    - scale_factor
    - scale_idx
    - keep_ratio


    Args:
        scales (Union[list, Tuple]): Images scales for resizing.
        resize_type (str): The type of resize class to use. Defaults to
            "Resize".
        **resize_kwargs: Other keyword arguments for the ``resize_type``.

    Note:
        By defaults, the ``resize_type`` is "Resize", if it's not overwritten
        by your registry, it indicates the :class:`mmcv.Resize`. And therefore,
        ``resize_kwargs`` accepts any keyword arguments of it, like
        ``keep_ratio``, ``interpolation`` and so on.

        If you want to use your custom resize class, the class should accept
        ``scale`` argument and have ``scale`` attribution which determines the
        resize shape.
    """

    def __init__(
        self,
        scales: Sequence[Union[int, Tuple]],
        resize_type: str = 'Resize',
        **resize_kwargs,
    ) -> None:
        super().__init__()
        # anything that is not a list is treated as one single candidate
        self.scales = scales if isinstance(scales, list) else [scales]
        assert mmengine.is_seq_of(self.scales, (tuple, int))

        self.resize_cfg = dict(type=resize_type, **resize_kwargs)
        # Build the resize transform once with a placeholder scale; the real
        # scale is assigned on every call of ``transform``.
        build_cfg = {'scale': 0}
        build_cfg.update(self.resize_cfg)
        self.resize = TRANSFORMS.build(build_cfg)

    @cache_randomness
    def _random_select(self) -> Tuple[int, int]:
        """Pick one of the candidate scales at random.

        Returns:
            (tuple, int): Returns a tuple ``(scale, scale_idx)``,
            where ``scale`` is the selected image scale and
            ``scale_idx`` is the selected index in the given candidates.
        """
        picked_idx = np.random.randint(len(self.scales))
        return self.scales[picked_idx], picked_idx

    def transform(self, results: dict) -> dict:
        """Resize ``results`` to a randomly selected scale.

        Args:
            results (dict): Result dict contains the data to transform.

        Returns:
            dict: The output of the wrapped resize transform with the
            additional key ``scale_idx`` (index of the selected scale).
        """
        chosen_scale, chosen_idx = self._random_select()
        self.resize.scale = chosen_scale
        resized = self.resize(results)
        resized['scale_idx'] = chosen_idx
        return resized

    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}(scales={self.scales}'
                f', resize_cfg={self.resize_cfg})')


@TRANSFORMS.register_module()
class RandomFlip(BaseTransform):
    """Flip the image & bbox & keypoints & segmentation map. Added or Updated
    keys: flip, flip_direction, img, gt_bboxes, gt_seg_map, and
    gt_keypoints. There are 3 flip modes:

    - ``prob`` is float, ``direction`` is string: the image will be
      ``direction``ly flipped with probability of ``prob`` .
      E.g., ``prob=0.5``, ``direction='horizontal'``,
      then image will be horizontally flipped with probability of 0.5.

    - ``prob`` is float, ``direction`` is list of string: the image will
      be ``direction[i]``ly flipped with probability of
      ``prob/len(direction)``.
      E.g., ``prob=0.5``, ``direction=['horizontal', 'vertical']``,
      then image will be horizontally flipped with probability of 0.25,
      vertically with probability of 0.25.

    - ``prob`` is list of float, ``direction`` is list of string:
      given ``len(prob) == len(direction)``, the image will
      be ``direction[i]``ly flipped with probability of ``prob[i]``.
      E.g., ``prob=[0.3, 0.5]``, ``direction=['horizontal',
      'vertical']``, then image will be horizontally flipped with
      probability of 0.3, vertically with probability of 0.5.

    Required Keys:

    - img
    - gt_bboxes (optional)
    - gt_seg_map (optional)
    - gt_keypoints (optional)

    Modified Keys:

    - img
    - gt_bboxes (optional)
    - gt_seg_map (optional)
    - gt_keypoints (optional)

    Added Keys:

    - flip
    - flip_direction
    - swap_seg_labels (optional)

    Args:
        prob (float | list[float], optional): The flipping probability.
            The signature default is None, but a float or a list of floats
            must be given; any other value (including None) raises
            ``ValueError``.
        direction(str | list[str]): The flipping direction. Options are
            'horizontal', 'vertical' and 'diagonal'.
            If ``prob`` is a list, ``direction`` must have the same length.
            Each element in ``prob`` indicates the flip probability of
            corresponding direction. Defaults to 'horizontal'.
        swap_seg_labels (list, optional): The label pair need to be swapped
            for ground truth, like 'left arm' and 'right arm' need to be
            swapped after horizontal flipping. For example, ``[(1, 5)]``,
            where 1/5 is the label of the left/right arm. Defaults to None.

    Raises:
        ValueError: If ``prob`` is neither a float nor a list, or
            ``direction`` is neither a str nor a list.
    """

    def __init__(self,
                 prob: Optional[Union[float, Iterable[float]]] = None,
                 direction: Union[str, Sequence[Optional[str]]] = 'horizontal',
                 swap_seg_labels: Optional[Sequence] = None) -> None:
        if isinstance(prob, list):
            assert mmengine.is_list_of(prob, float)
            assert 0 <= sum(prob) <= 1
        elif isinstance(prob, float):
            assert 0 <= prob <= 1
        else:
            # Adjacent literals are used instead of a backslash continuation
            # inside the string, which would embed the next line's
            # indentation in the message.
            raise ValueError('probs must be float or list of float, but '
                             f'got `{type(prob)}`.')
        self.prob = prob
        self.swap_seg_labels = swap_seg_labels

        valid_directions = ['horizontal', 'vertical', 'diagonal']
        if isinstance(direction, str):
            assert direction in valid_directions
        elif isinstance(direction, list):
            assert mmengine.is_list_of(direction, str)
            assert set(direction).issubset(set(valid_directions))
        else:
            raise ValueError('direction must be either str or list of str, '
                             f'but got `{type(direction)}`.')
        self.direction = direction

        if isinstance(prob, list):
            # one probability per direction
            assert len(prob) == len(self.direction)

    def _flip_bbox(self, bboxes: np.ndarray, img_shape: Tuple[int, int],
                   direction: str) -> np.ndarray:
        """Flip bboxes horizontally, vertically or diagonally.

        Args:
            bboxes (numpy.ndarray): Bounding boxes, shape (..., 4*k)
            img_shape (tuple[int]): Image shape (height, width)
            direction (str): Flip direction. Options are 'horizontal',
                'vertical', and 'diagonal'.

        Returns:
            numpy.ndarray: Flipped bounding boxes (a new array).

        Raises:
            ValueError: If ``direction`` is not one of the three options.
        """
        assert bboxes.shape[-1] % 4 == 0
        flipped = bboxes.copy()
        h, w = img_shape
        # Mirroring swaps the roles of the two corners along the flipped
        # axis: the new first coordinate comes from the old second one.
        if direction == 'horizontal':
            flipped[..., 0::4] = w - bboxes[..., 2::4]
            flipped[..., 2::4] = w - bboxes[..., 0::4]
        elif direction == 'vertical':
            flipped[..., 1::4] = h - bboxes[..., 3::4]
            flipped[..., 3::4] = h - bboxes[..., 1::4]
        elif direction == 'diagonal':
            flipped[..., 0::4] = w - bboxes[..., 2::4]
            flipped[..., 1::4] = h - bboxes[..., 3::4]
            flipped[..., 2::4] = w - bboxes[..., 0::4]
            flipped[..., 3::4] = h - bboxes[..., 1::4]
        else:
            raise ValueError(
                "Flipping direction must be 'horizontal', 'vertical', "
                f"or 'diagonal', but got '{direction}'")
        return flipped

    def _flip_keypoints(
        self,
        keypoints: np.ndarray,
        img_shape: Tuple[int, int],
        direction: str,
    ) -> np.ndarray:
        """Flip keypoints horizontally, vertically or diagonally.

        Args:
            keypoints (numpy.ndarray): Keypoints whose last axis starts with
                (x, y); any further entries on that axis are carried over
                unchanged.
            img_shape (tuple[int]): Image shape (height, width)
            direction (str): Flip direction. Options are 'horizontal',
                'vertical', and 'diagonal'.

        Returns:
            numpy.ndarray: Flipped keypoints.

        Raises:
            ValueError: If ``direction`` is not one of the three options.
        """

        # split off everything after (x, y) so it is not touched by the flip
        meta_info = keypoints[..., 2:]
        keypoints = keypoints[..., :2]
        flipped = keypoints.copy()
        h, w = img_shape
        if direction == 'horizontal':
            flipped[..., 0::2] = w - keypoints[..., 0::2]
        elif direction == 'vertical':
            flipped[..., 1::2] = h - keypoints[..., 1::2]
        elif direction == 'diagonal':
            flipped[..., 0::2] = w - keypoints[..., 0::2]
            flipped[..., 1::2] = h - keypoints[..., 1::2]
        else:
            raise ValueError(
                "Flipping direction must be 'horizontal', 'vertical', "
                f"or 'diagonal', but got '{direction}'")
        flipped = np.concatenate([flipped, meta_info], axis=-1)
        return flipped

    def _flip_seg_map(self, seg_map: np.ndarray,
                      direction: str) -> np.ndarray:
        """Flip segmentation map and swap the configured label pairs.

        Args:
            seg_map (numpy.ndarray): segmentation map, shape (H, W).
            direction (str): Flip direction, forwarded to ``mmcv.imflip``.

        Returns:
            numpy.ndarray: Flipped segmentation map.
        """
        seg_map = mmcv.imflip(seg_map, direction=direction)
        if self.swap_seg_labels is not None:
            # to handle datasets with left/right annotations
            # like 'Left-arm' and 'Right-arm' in LIP dataset
            # Modified from https://github.com/openseg-group/openseg.pytorch/blob/master/lib/datasets/tools/cv2_aug_transforms.py # noqa:E501
            # Licensed under MIT license
            # ``temp`` keeps the labels as they were before any swap, so a
            # pixel is never swapped back by a later assignment.
            temp = seg_map.copy()
            assert isinstance(self.swap_seg_labels, (tuple, list))
            for pair in self.swap_seg_labels:
                assert isinstance(pair, (tuple, list)) and len(pair) == 2, \
                    'swap_seg_labels must be a sequence with pair, but got ' \
                    f'{self.swap_seg_labels}.'
                seg_map[temp == pair[0]] = pair[1]
                seg_map[temp == pair[1]] = pair[0]
        return seg_map

    @cache_randomness
    def _choose_direction(self) -> str:
        """Choose the flip direction according to `prob` and `direction`.

        Returns None when "no flip" is drawn.
        """
        if isinstance(self.direction,
                      Sequence) and not isinstance(self.direction, str):
            # None means non-flip
            direction_list: list = list(self.direction) + [None]
        elif isinstance(self.direction, str):
            # None means non-flip
            direction_list = [self.direction, None]

        if isinstance(self.prob, list):
            # the remaining probability mass goes to "no flip"
            non_prob: float = 1 - sum(self.prob)
            prob_list = self.prob + [non_prob]
        elif isinstance(self.prob, float):
            non_prob = 1. - self.prob
            # exclude non-flip
            single_ratio = self.prob / (len(direction_list) - 1)
            prob_list = [single_ratio] * (len(direction_list) - 1) + [non_prob]

        cur_dir = np.random.choice(direction_list, p=prob_list)

        return cur_dir

    def _flip(self, results: dict) -> None:
        """Flip images, bounding boxes, semantic segmentation map and
        keypoints in the direction given by ``results['flip_direction']``."""
        # flip image
        results['img'] = mmcv.imflip(
            results['img'], direction=results['flip_direction'])

        img_shape = results['img'].shape[:2]

        # flip bboxes
        if results.get('gt_bboxes', None) is not None:
            results['gt_bboxes'] = self._flip_bbox(results['gt_bboxes'],
                                                   img_shape,
                                                   results['flip_direction'])

        # flip keypoints
        if results.get('gt_keypoints', None) is not None:
            results['gt_keypoints'] = self._flip_keypoints(
                results['gt_keypoints'], img_shape, results['flip_direction'])

        # flip seg map
        if results.get('gt_seg_map', None) is not None:
            results['gt_seg_map'] = self._flip_seg_map(
                results['gt_seg_map'], direction=results['flip_direction'])
            results['swap_seg_labels'] = self.swap_seg_labels

    def _flip_on_direction(self, results: dict) -> None:
        """Function to flip images, bounding boxes, semantic segmentation map
        and keypoints. Also records ``flip`` and ``flip_direction``."""
        cur_dir = self._choose_direction()
        if cur_dir is None:
            results['flip'] = False
            results['flip_direction'] = None
        else:
            results['flip'] = True
            results['flip_direction'] = cur_dir
            self._flip(results)

    def transform(self, results: dict) -> dict:
        """Transform function to flip images, bounding boxes, semantic
        segmentation map and keypoints.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Flipped results, 'img', 'gt_bboxes', 'gt_seg_map',
            'gt_keypoints', 'flip', and 'flip_direction' keys are
            updated in result dict.
        """
        self._flip_on_direction(results)

        return results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(prob={self.prob}, '
        repr_str += f'direction={self.direction})'

        return repr_str


@TRANSFORMS.register_module()
class RandomResize(BaseTransform):
    """Random resize images & bbox & keypoints.

    How to choose the target scale to resize the image will follow the rules
    below:

    - if ``scale`` is a sequence of tuple

    .. math::
        target\\_scale[0] \\sim Uniform([scale[0][0], scale[1][0]])
    .. math::
        target\\_scale[1] \\sim Uniform([scale[0][1], scale[1][1]])

    Following the resize order of width and height in cv2, ``scale[i][0]``
    is for width, and ``scale[i][1]`` is for height. Both values are sampled
    as integers and the bounds are inclusive.

    - if ``scale`` is a tuple

    .. math::
        ratio \\sim Uniform([ratio\\_range[0], ratio\\_range[1]])
    .. math::
        target\\_scale[i] = int(scale[i] * ratio), \\quad i = 0, 1

    ``ratio_range`` is ``(min_ratio, max_ratio)``. A single ratio is sampled
    and applied to both ``scale[0]`` (width) and ``scale[1]`` (height).

    - if ``keep_ratio`` is True, the minimum value of ``target_scale`` will be
      used to set the shorter side and the maximum value will be used to
      set the longer side.

    - if ``keep_ratio`` is False, the value of ``target_scale`` will be used to
      resize the width and height accordingly.

    Required Keys:

    - img
    - gt_bboxes
    - gt_seg_map
    - gt_keypoints

    Modified Keys:

    - img
    - gt_bboxes
    - gt_seg_map
    - gt_keypoints
    - img_shape

    Added Keys:

    - scale
    - scale_factor
    - keep_ratio

    Args:
        scale (tuple or Sequence[tuple]): Images scales for resizing.
            Either a single ``(w, h)`` tuple of int (``ratio_range`` is then
            required) or a sequence of exactly two tuples giving the lower
            and upper bound of the scale.
        ratio_range (tuple[float], optional): (min_ratio, max_ratio).
            Defaults to None.
        resize_type (str): The type of resize class to use. Defaults to
            "Resize".
        **resize_kwargs: Other keyword arguments for the ``resize_type``.

    Note:
        By defaults, the ``resize_type`` is "Resize", if it's not overwritten
        by your registry, it indicates the :class:`mmcv.Resize`. And therefore,
        ``resize_kwargs`` accepts any keyword arguments of it, like
        ``keep_ratio``, ``interpolation`` and so on.

        If you want to use your custom resize class, the class should accept
        ``scale`` argument and have ``scale`` attribution which determines the
        resize shape.
    """

    def __init__(
        self,
        scale: Union[Tuple[int, int], Sequence[Tuple[int, int]]],
        ratio_range: Optional[Tuple[float, float]] = None,
        resize_type: str = 'Resize',
        **resize_kwargs,
    ) -> None:

        self.scale = scale
        self.ratio_range = ratio_range

        self.resize_cfg = dict(type=resize_type, **resize_kwargs)
        # Build the wrapped resize transform with a placeholder scale. The
        # real scale is sampled and assigned for every sample in
        # ``transform``.
        self.resize = TRANSFORMS.build({'scale': 0, **self.resize_cfg})

    @staticmethod
    def _random_sample(scales: Sequence[Tuple[int, int]]) -> tuple:
        """Private function to randomly sample a scale from a sequence of
        tuples.

        Args:
            scales (Sequence[tuple]): Images scale range for sampling.
                There must be two tuples in scales, which specify the lower
                and upper bound of image scales.

        Returns:
            tuple: The targeted scale of the image to be resized.
        """

        # Accept any sequence (list or tuple) of tuples, in agreement with
        # the ``is_seq_of`` dispatch in ``_random_scale`` and with the
        # ``Sequence`` annotation. Requiring a list here would reject a
        # tuple of tuples that the dispatch has already accepted.
        assert mmengine.is_seq_of(scales, tuple) and len(scales) == 2
        scale_0 = [scales[0][0], scales[1][0]]
        scale_1 = [scales[0][1], scales[1][1]]
        # ``randint`` excludes the upper bound, hence the ``+ 1``
        edge_0 = np.random.randint(min(scale_0), max(scale_0) + 1)
        edge_1 = np.random.randint(min(scale_1), max(scale_1) + 1)
        scale = (edge_0, edge_1)
        return scale

    @staticmethod
    def _random_sample_ratio(scale: tuple, ratio_range: Tuple[float,
                                                              float]) -> tuple:
        """Private function to randomly sample a scale from a tuple.

        A ratio will be randomly sampled from the range specified by
        ``ratio_range``. Then it would be multiplied with ``scale`` to
        generate sampled scale. The same ratio is used for both entries of
        ``scale``.

        Args:
            scale (tuple): Images scale base to multiply with ratio.
            ratio_range (tuple[float]): The minimum and maximum ratio to scale
                the ``scale``.

        Returns:
            tuple: The targeted scale of the image to be resized.
        """

        assert isinstance(scale, tuple) and len(scale) == 2
        min_ratio, max_ratio = ratio_range
        assert min_ratio <= max_ratio
        ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio
        scale = int(scale[0] * ratio), int(scale[1] * ratio)
        return scale

    @cache_randomness
    def _random_scale(self) -> tuple:
        """Private function to randomly sample a scale according to the type
        of ``scale``.

        Returns:
            tuple: The targeted scale of the image to be resized.

        Raises:
            NotImplementedError: If ``self.scale`` is neither a tuple of int
                nor a sequence of tuples.
        """

        if mmengine.is_tuple_of(self.scale, int):
            assert self.ratio_range is not None and len(self.ratio_range) == 2
            scale = self._random_sample_ratio(
                self.scale,  # type: ignore
                self.ratio_range)
        elif mmengine.is_seq_of(self.scale, tuple):
            scale = self._random_sample(self.scale)  # type: ignore
        else:
            raise NotImplementedError('Do not support sampling function '
                                      f'for "{self.scale}"')

        return scale

    def transform(self, results: dict) -> dict:
        """Transform function to resize images, bounding boxes, semantic
        segmentation map.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Resized results, ``img``, ``gt_bboxes``, ``gt_seg_map``,
            ``gt_keypoints``, ``scale``, ``scale_factor``, ``img_shape``, and
            ``keep_ratio`` keys are updated in result dict.
        """
        results['scale'] = self._random_scale()
        # Hand the sampled scale to the wrapped resize transform before
        # delegating the actual resizing to it
        self.resize.scale = results['scale']
        results = self.resize(results)
        return results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(scale={self.scale}, '
        repr_str += f'ratio_range={self.ratio_range}, '
        repr_str += f'resize_cfg={self.resize_cfg})'
        return repr_str


================================================
FILE: mmcv/transforms/utils.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.

import copy
import functools
import inspect
import weakref
from collections import defaultdict
from collections.abc import Iterable
from contextlib import contextmanager
from typing import Callable, Union

from .base import BaseTransform


class cache_randomness:
    """Decorator that marks the method with random return value(s) in a
    transform class.

    This decorator is usually used together with the context-manager
    :func:`cache_random_params`. In this context, a decorated method will
    cache its return value(s) at the first time of being invoked, and always
    return the cached values when being invoked again.

    The decorator is a descriptor: accessing the method through an instance
    returns a copy bound (via a weak reference) to that instance, while
    accessing it through the class returns the descriptor itself.

    .. note::
        Only an instance method can be decorated with ``cache_randomness``.
    """

    def __init__(self, func):

        # Check `func` is to be bound as an instance method
        if not inspect.isfunction(func):
            raise TypeError('Unsupport callable to decorate with '
                            '@cache_randomness.')
        func_args = inspect.getfullargspec(func).args
        if len(func_args) == 0 or func_args[0] != 'self':
            raise TypeError(
                '@cache_randomness should only be used to decorate '
                'instance methods (the first argument is ``self``).')

        functools.update_wrapper(self, func)
        self.func = func
        # Weak reference to the bound instance. It stays None on the
        # descriptor stored in the class and is only set on the bound
        # copies created in ``__get__``.
        self.instance_ref = None

    def __set_name__(self, owner, name):
        # Maintain a record of decorated methods in the class. Every class
        # owns its list: a subclass starts from a copy of the inherited
        # record instead of appending to the list of its parent, so base
        # classes (and sibling classes) are not polluted with method names
        # they do not define.
        methods = owner.__dict__.get('_methods_with_randomness')
        if methods is None:
            methods = list(getattr(owner, '_methods_with_randomness', []))
            setattr(owner, '_methods_with_randomness', methods)

        # Here `name` equals to `self.__name__`, i.e., the name of the
        # decorated function, due to the invocation of `update_wrapper` in
        # `self.__init__()`. A method overridden in a subclass is recorded
        # only once.
        if name not in methods:
            methods.append(name)

    def __call__(self, *args, **kwargs):
        """Invoke the decorated method, caching its output if enabled.

        Returns:
            The (possibly cached) return value of the decorated method.

        Raises:
            TypeError: If invoked through the class without an instance.
        """
        # Get the transform instance whose method is decorated
        # by cache_randomness
        if self.instance_ref is None:
            # Unbound invocation through the class, e.g.
            # ``Cls.method(instance, ...)``: the instance is the first
            # positional argument, like for a plain function.
            if not args:
                raise TypeError(
                    f'{self.__name__}() missing 1 required positional '
                    "argument: 'self'")
            instance, args = args[0], args[1:]
        else:
            instance = self.instance_ref()
        name = self.__name__

        # Check the flag ``self._cache_enabled``, which should be
        # set by the contextmanagers like ``cache_random_params``
        cache_enabled = getattr(instance, '_cache_enabled', False)

        if cache_enabled:
            # Initialize the cache of the transform instances. The flag
            # ``cache_enabled`` is set by contextmanagers like
            # ``cache_random_params``.
            if not hasattr(instance, '_cache'):
                setattr(instance, '_cache', {})

            if name not in instance._cache:
                instance._cache[name] = self.func(instance, *args, **kwargs)
            # Return the cached value
            return instance._cache[name]
        else:
            # Clear cache
            if hasattr(instance, '_cache'):
                del instance._cache
            # Return function output
            return self.func(instance, *args, **kwargs)

    def __get__(self, obj, cls):
        # Accessed through the class (e.g. by introspection tools): there is
        # no instance to bind, so return the descriptor itself instead of
        # failing on ``weakref.ref(None)``.
        if obj is None:
            return self

        # Bind a copy rather than the shared descriptor, so that multiple
        # transform instances (possibly used from different threads) never
        # share or overwrite each other's ``instance_ref``.
        bound = copy.copy(self)
        bound.instance_ref = weakref.ref(obj)
        return bound


def avoid_cache_randomness(cls):
    """Class decorator that forbids caching randomness for a data transform
    (subclass of :class:`BaseTransform`).

    Once a class is decorated, an error is raised when:

        1. the class defines (or inherits) a method decorated with
    `cache_randomness`;
        2. an instance of the class is used within the context
    `cache_random_params`.

    It is meant for transforms whose random behavior cannot be isolated in
    a method, and thus cannot be decorated with `cache_randomness`. Marking
    them prevents an unintentional use within the context of caching
    randomness, which may lead to unexpected results.

    Args:
        cls (type): The transform class to mark.

    Returns:
        type: The same class, with the marker attributes attached.
    """

    # Only data transform classes may be decorated
    assert issubclass(cls, BaseTransform)

    # Opting out of caching contradicts declaring cacheable random methods
    random_methods = getattr(cls, '_methods_with_randomness', None)
    if random_methods:
        raise RuntimeError(
            f'Class {cls.__name__} decorated with '
            '``avoid_cache_randomness`` should not have methods decorated '
            'with ``cache_randomness`` (invalid methods: '
            f'{cls._methods_with_randomness})')

    class AvoidCacheRandomness:
        """Descriptor telling whether the accessing class itself is marked."""

        def __get__(self, obj, objtype=None):
            # Look the marker up in the namespace of ``objtype`` only, not
            # through regular attribute lookup. A subclass of a decorated
            # class therefore does not inherit the mark and reports False
            # unless it is decorated as well.
            own_namespace = objtype.__dict__
            return own_namespace.get('_avoid_cache_randomness', False)

    setattr(cls, 'avoid_cache_randomness', AvoidCacheRandomness())
    setattr(cls, '_avoid_cache_randomness', True)

    return cls


@contextmanager
def cache_random_params(transforms: Union[BaseTransform, Iterable]):
    """Context-manager that enables the cache of return values of methods
    decorated with ``cache_randomness`` in transforms.

    In this mode, decorated methods will cache their return values on the
    first invoking, and always return the cached value afterward. This allow
    to apply random transforms in a deterministic way. For example, apply same
    transforms on multiple examples. See ``cache_randomness`` for more
    information.

    Args:
        transforms (BaseTransform|list[BaseTransform]): The transforms to
            enable cache. Iterables are traversed recursively.

    Raises:
        RuntimeError: If one of the transforms is marked with
            ``avoid_cache_randomness``, or (when a patched ``transform`` is
            called) if a method decorated with ``cache_randomness`` is
            invoked more than once while processing one data sample.
    """

    # key2method stores the original methods that are replaced by the wrapped
    # ones. These methods will be restituted when exiting the context. An
    # entry is removed once the method is restored, so the presence of a key
    # tells that the corresponding method is currently patched.
    key2method = dict()

    # key2counter stores the usage number of each cache_randomness. This is
    # used to check that any cache_randomness is invoked once during processing
    # on data sample.
    key2counter: dict = defaultdict(int)

    def _add_invoke_counter(obj, method_name):
        method = getattr(obj, method_name)
        key = f'{id(obj)}.{method_name}'
        key2method[key] = method

        @functools.wraps(method)
        def wrapped(*args, **kwargs):
            key2counter[key] += 1
            return method(*args, **kwargs)

        return wrapped

    def _add_invoke_checker(obj, method_name):
        # check that the method in _methods_with_randomness has been
        # invoked at most once
        method = getattr(obj, method_name)
        key = f'{id(obj)}.{method_name}'
        key2method[key] = method

        @functools.wraps(method)
        def wrapped(*args, **kwargs):
            # clear counter
            for name in obj._methods_with_randomness:
                key = f'{id(obj)}.{name}'
                key2counter[key] = 0

            output = method(*args, **kwargs)

            for name in obj._methods_with_randomness:
                key = f'{id(obj)}.{name}'
                if key2counter[key] > 1:
                    raise RuntimeError(
                        'The method decorated with ``cache_randomness`` '
                        'should be invoked at most once during processing '
                        f'one data sample. The method {name} of {obj} has '
                        f'been invoked {key2counter[key]} times.')
            return output

        return wrapped

    def _start_cache(t: BaseTransform):
        # Check if cache is allowed for `t`
        if getattr(t, 'avoid_cache_randomness', False):
            raise RuntimeError(
                f'Class {t.__class__.__name__} decorated with '
                '``avoid_cache_randomness`` is not allowed to be used with'
                ' ``cache_random_params`` (e.g. wrapped by '
                '``ApplyToMultiple`` with ``share_random_params==True``).')

        # Skip transforms w/o random method
        if not hasattr(t, '_methods_with_randomness'):
            return

        # Skip a transform that is already patched in this context (the same
        # instance listed more than once). Wrapping it again would count
        # every invocation twice and lose the original methods.
        if f'{id(t)}.transform' in key2method:
            return

        # Set cache enabled flag
        setattr(t, '_cache_enabled', True)

        # Store the original method and init the counter
        setattr(t, 'transform', _add_invoke_checker(t, 'transform'))
        for name in getattr(t, '_methods_with_randomness'):
            # A name recorded more than once must be wrapped only once
            if f'{id(t)}.{name}' in key2method:
                continue
            setattr(t, name, _add_invoke_counter(t, name))

    def _end_cache(t: BaseTransform):
        # Skip transforms w/o random method
        if not hasattr(t, '_methods_with_randomness'):
            return

        # Skip transforms that have not been patched, or have been restored
        # already. This happens when ``_start_cache`` raised before reaching
        # ``t``: cleaning up blindly would raise AttributeError/KeyError here
        # and hide the original, more informative exception.
        key_transform = f'{id(t)}.transform'
        if key_transform not in key2method:
            return

        # Remove cache enabled flag
        if hasattr(t, '_cache_enabled'):
            delattr(t, '_cache_enabled')
        if hasattr(t, '_cache'):
            delattr(t, '_cache')

        # Restore the original method
        for name in getattr(t, '_methods_with_randomness'):
            key = f'{id(t)}.{name}'
            if key in key2method:
                setattr(t, name, key2method.pop(key))

        setattr(t, 'transform', key2method.pop(key_transform))

    def _apply(t: Union[BaseTransform, Iterable],
               func: Callable[[BaseTransform], None]):
        if isinstance(t, BaseTransform):
            func(t)
        # A string iterates over strings forever, so never descend into it
        if isinstance(t, Iterable) and not isinstance(t, str):
            for _t in t:
                _apply(_t, func)

    try:
        _apply(transforms, _start_cache)
        yield
    finally:
        _apply(transforms, _end_cache)


================================================
FILE: mmcv/transforms/wrappers.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.

from typing import Any, Callable, Dict, List, Optional, Sequence, Union

import mmengine
import numpy as np

from .base import BaseTransform
from .builder import TRANSFORMS
from .utils import cache_random_params, cache_randomness

# Type of a transform or a transform config: either a config dict, or a
# callable that takes a result dict and returns a result dict
Transform = Union[Dict, Callable[[Dict], Dict]]

# Sentinel value assigned to keys by KeyMapper._map_input. Keys holding this
# value are filtered out in KeyMapper._apply_transforms, so they are
# invisible to the wrapped transforms.
# There are 2 possible cases:
# 1. The key is missing in results while ``allow_nonexist_keys`` is True
# 2. The key is manually set as ... (Ellipsis) in ``mapping``, which means
# the original value in results should be ignored
IgnoreKey = object()

# Import nullcontext if python>=3.7, otherwise use a simple alternative
# implementation.
try:
    from contextlib import nullcontext  # type: ignore
except ImportError:
    from contextlib import contextmanager

    @contextmanager  # type: ignore
    def nullcontext(resource=None):
        """Fallback context manager that does nothing and yields
        ``resource``."""
        try:
            yield resource
        finally:
            pass


@TRANSFORMS.register_module()
class Compose(BaseTransform):
    """Chain several transforms and apply them one after another.

    Args:
        transforms (list[dict | callable]): Sequence of transform object or
            config dict to be composed. A single transform (not wrapped in
            a sequence) is accepted as well.

    Examples:
        >>> pipeline = [
        >>>     dict(type='Compose',
        >>>         transforms=[
        >>>             dict(type='LoadImageFromFile'),
        >>>             dict(type='Normalize')
        >>>         ]
        >>>     )
        >>> ]
    """

    def __init__(self, transforms: Union[Transform, Sequence[Transform]]):
        super().__init__()

        # A lone transform (config dict or callable) forms a one-step
        # pipeline
        if not isinstance(transforms, Sequence):
            transforms = [transforms]

        self.transforms: List = []
        for item in transforms:
            if isinstance(item, dict):
                # Config dicts are turned into transform objects by the
                # registry
                item = TRANSFORMS.build(item)
            elif not callable(item):
                raise TypeError('transform must be callable or a dict, but got'
                                f' {type(item)}')
            self.transforms.append(item)

    def __iter__(self):
        """Iterate over the composed transforms in application order."""
        return iter(self.transforms)

    def transform(self, results: Dict) -> Optional[Dict]:
        """Run the composed transforms on ``results`` in order.

        The chain stops as soon as one transform returns None.

        Args:
            results (dict): A result dict contains the results to transform.

        Returns:
            dict or None: Transformed results.
        """
        for step in self.transforms:
            results = step(results)  # type: ignore
            if results is None:
                # Nothing left to pass on to the remaining transforms
                break
        return results

    def __repr__(self):
        """Build the string representation, one transform per line."""
        body = ''.join(f'\n    {t}' for t in self.transforms)
        return f'{self.__class__.__name__}({body}\n)'


@TRANSFORMS.register_module()
class KeyMapper(BaseTransform):
    """A transform wrapper to map and reorganize the input/output of the
    wrapped transforms (or sub-pipeline).

    Args:
        transforms (list[dict | callable], optional): Sequence of transform
            object or config dict to be wrapped.
        mapping (dict): A dict that defines the input key mapping.
            The keys corresponds to the inner key (i.e., kwargs of the
            ``transform`` method), and should be string type. The values
            corresponds to the outer keys (i.e., the keys of the
            data/results), and should have a type of string, list or dict.
            None means not applying input mapping. Default: None.
        remapping (dict): A dict that defines the output key mapping.
            The keys and values have the same meanings and rules as in the
            ``mapping``. Default: None.
        auto_remap (bool, optional): If True, an inverse of the mapping will
            be used as the remapping. If auto_remap is not given, it will be
            automatically set True if 'remapping' is not given, and vice
            versa. Default: None.
        allow_nonexist_keys (bool): If True, an outer key of the mapping that
            is missing in the input data is marked as ignored and hidden from
            the wrapped transforms. If False, no check is made by this
            wrapper: a missing outer key is looked up with ``dict.get`` and
            thus reaches the wrapped transforms with the value ``None``.
            Default: False.

    Examples:
        >>> # Example 1: KeyMapper 'gt_img' to 'img'
        >>> pipeline = [
        >>>     # Use KeyMapper to convert outer (original) field name
        >>>     # 'gt_img' to inner (used by inner transforms) field name
        >>>     # 'img'
        >>>     dict(type='KeyMapper',
        >>>         mapping={'img': 'gt_img'},
        >>>         # auto_remap=True means output key mapping is the revert of
        >>>         # the input key mapping, e.g. inner 'img' will be mapped
        >>>         # back to outer 'gt_img'
        >>>         auto_remap=True,
        >>>         transforms=[
        >>>             # In all transforms' implementation just use 'img'
        >>>             # as a standard field name
        >>>             dict(type='Crop', crop_size=(384, 384)),
        >>>             dict(type='Normalize'),
        >>>         ])
        >>> ]

        >>> # Example 2: Collect and structure multiple items
        >>> pipeline = [
        >>>     # The inner field 'imgs' will be a dict with keys 'img_src'
        >>>     # and 'img_tar', whose values are outer fields 'img1' and
        >>>     # 'img2' respectively.
        >>>     dict(type='KeyMapper',
        >>>         mapping=dict(
        >>>             imgs=dict(
        >>>                 img_src='img1',
        >>>                 img_tar='img2')),
        >>>         transforms=...)
        >>> ]

        >>> # Example 3: Manually set ignored keys by "..."
        >>> pipeline = [
        >>>     ...
        >>>     dict(type='KeyMapper',
        >>>         mapping={
        >>>             # map outer key "gt_img" to inner key "img"
        >>>             'img': 'gt_img',
        >>>             # ignore outer key "mask"
        >>>             'mask': ...,
        >>>         },
        >>>         transforms=[
        >>>             dict(type='RandomFlip'),
        >>>         ])
        >>>     ...
        >>> ]
    """

    def __init__(self,
                 transforms: Union[Transform, List[Transform], None] = None,
                 mapping: Optional[Dict] = None,
                 remapping: Optional[Dict] = None,
                 auto_remap: Optional[bool] = None,
                 allow_nonexist_keys: bool = False):

        super().__init__()

        self.allow_nonexist_keys = allow_nonexist_keys
        self.mapping = mapping

        # When not specified, remap automatically iff no explicit remapping
        # is given
        if auto_remap is None:
            auto_remap = remapping is None
        self.auto_remap = auto_remap

        if self.auto_remap:
            if remapping is not None:
                raise ValueError('KeyMapper: ``remapping`` must be None if '
                                 '`auto_remap` is set True.')
            # The input mapping is reused as the output mapping
            self.remapping = mapping
        else:
            self.remapping = remapping

        if transforms is None:
            transforms = []
        self.transforms = Compose(transforms)

    def __iter__(self):
        """Allow easy iteration over the transform sequence."""
        return iter(self.transforms)

    def _map_input(self, data: Dict,
                   mapping: Optional[Dict]) -> Dict[str, Any]:
        """KeyMapper inputs for the wrapped transforms by gathering and
        renaming data items according to the mapping.

        Args:
            data (dict): The original input data
            mapping (dict, optional): The input key mapping. See the document
                of ``mmcv.transforms.wrappers.KeyMapper`` for details. If
                set None, return a shallow copy of the input data directly.

        Returns:
            dict: The input data with remapped keys. This will be the actual
                input of the wrapped pipeline.
        """

        if mapping is None:
            return data.copy()

        def _map(data, m):
            if isinstance(m, dict):
                # m is a dict {inner_key:outer_key, ...}
                return {k_in: _map(data, k_out) for k_in, k_out in m.items()}
            if isinstance(m, (tuple, list)):
                # m is a list or tuple [outer_key1, outer_key2, ...]
                # This is the case when we collect items from the original
                # data to form a list or tuple to feed to the wrapped
                # transforms.
                return m.__class__(_map(data, e) for e in m)

            # allow manually mark a key to be ignored by ...
            if m is ...:
                return IgnoreKey

            # m is an outer_key
            if self.allow_nonexist_keys:
                # A missing key is hidden from the wrapped transforms
                return data.get(m, IgnoreKey)
            else:
                # A missing key is forwarded as None
                return data.get(m)

        collected = _map(data, mapping)

        # Retain unmapped items
        inputs = data.copy()
        inputs.update(collected)

        return inputs

    def _map_output(self, data: Dict,
                    remapping: Optional[Dict]) -> Dict[str, Any]:
        """KeyMapper outputs from the wrapped transforms by gathering and
        renaming data items according to the remapping.

        Args:
            data (dict): The output of the wrapped pipeline.
            remapping (dict, optional): The output key mapping. See the
                document of ``mmcv.transforms.wrappers.KeyMapper`` for
                details. If ``remapping is None``, no key mapping will be
                applied but only remove the special token ``IgnoreKey``.

        Returns:
            dict: The output with remapped keys.
        """

        # Remove ``IgnoreKey``
        if remapping is None:
            return {k: v for k, v in data.items() if v is not IgnoreKey}

        def _map(data, m):
            if isinstance(m, dict):
                assert isinstance(data, dict)
                results = {}
                for k_in, k_out in m.items():
                    assert k_in in data
                    results.update(_map(data[k_in], k_out))
                return results
            if isinstance(m, (list, tuple)):
                assert isinstance(data, (list, tuple))
                assert len(data) == len(m)
                results = {}
                for m_i, d_i in zip(m, data):
                    results.update(_map(d_i, m_i))
                return results

            # ``m is ...`` means the key is marked ignored, in which case the
            # inner results will not affect the outer results in remapping.
            # Another case that will have ``data is IgnoreKey`` is that the
            # key is missing in the inputs. In this case, if the inner key is
            # created by the wrapped transforms, it will be remapped to the
            # corresponding outer key during remapping.
            if m is ... or data is IgnoreKey:
                return {}

            return {m: data}

        # Note that unmapped items are not retained, which is different from
        # the behavior in _map_input. This is to avoid original data items
        # being overwritten by intermediate namesakes
        return _map(data, remapping)

    def _apply_transforms(self, inputs: Dict) -> Dict:
        """Apply ``self.transforms``.

        Note that the special token ``IgnoreKey`` will be invisible to
        ``self.transforms``, but not removed in this method. It will be
        eventually removed in :func:`self._map_output`.

        Args:
            inputs (dict): The mapped inputs, possibly holding ``IgnoreKey``
                values.

        Returns:
            dict: A copy of ``inputs`` updated with the outputs of the
                wrapped transforms.

        Raises:
            ValueError: If the wrapped transforms return None.
        """
        # Keep the full inputs (with the ``IgnoreKey`` markers) as the base
        # of the results, and only feed the visible items to the transforms
        results = inputs.copy()
        inputs = {k: v for k, v in inputs.items() if v is not IgnoreKey}
        outputs = self.transforms(inputs)

        if outputs is None:
            raise ValueError(
                f'Transforms wrapped by {self.__class__.__name__} should '
                'not return None.')

        results.update(outputs)  # type: ignore
        return results

    def transform(self, results: Dict) -> Dict:
        """Apply mapping, wrapped transforms and remapping.

        Args:
            results (dict): The result dict, updated in place with the
                remapped outputs.

        Returns:
            dict: The updated ``results``.
        """

        # Apply mapping
        inputs = self._map_input(results, self.mapping)
        # Apply wrapped transforms
        outputs = self._apply_transforms(inputs)
        # Apply remapping
        outputs = self._map_output(outputs, self.remapping)

        results.update(outputs)  # type: ignore
        return results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(transforms = {self.transforms}'
        repr_str += f', mapping = {self.mapping}'
        repr_str += f', remapping = {self.remapping}'
        repr_str += f', auto_remap = {self.auto_remap}'
        repr_str += f', allow_nonexist_keys = {self.allow_nonexist_keys})'
        return repr_str


@TRANSFORMS.register_module()
class TransformBroadcaster(KeyMapper):
    """A transform wrapper that applies the wrapped transforms to multiple
    data items. For example, apply Resize to multiple images.

    Args:
        transforms (list[dict | callable]): Sequence of transform object or
            config dict to be wrapped.
        mapping (dict): A dict that defines the input key mapping.
            Note that to apply the transforms to multiple data items, the
            outer keys of the target items should be remapped as a list with
            the standard inner key (The key required by the wrapped transform).
            See the following example and the document of
            ``mmcv.transforms.wrappers.KeyMapper`` for details.
        remapping (dict): A dict that defines the output key mapping.
            The keys and values have the same meanings and rules as in the
            ``mapping``. Default: None.
        auto_remap (bool, optional): If True, an inverse of the mapping will
            be used as the remapping. If auto_remap is not given, it will be
            automatically set True if 'remapping' is not given, and vice
            versa. Default: None.
        allow_nonexist_keys (bool): If False, the outer keys in the mapping
            must exist in the input data, or an exception will be raised.
            Default: False.
        share_random_params (bool): If True, the random transform
            (e.g., RandomFlip) will be conducted in a deterministic way and
            have the same behavior on all data items. For example, to randomly
            flip either both input image and ground-truth image, or none.
            Default: False.

    .. note::
        To apply the transforms to each elements of a list or tuple, instead
        of separating data items, you can map the outer key of the target
        sequence to the standard inner key. See example 2.

    Examples:
        >>> # Example 1: Broadcast to enumerated keys, each contains a single
        >>> # data element
        >>> pipeline = [
        >>>     dict(type='LoadImageFromFile', key='lq'),  # low-quality img
        >>>     dict(type='LoadImageFromFile', key='gt'),  # ground-truth img
        >>>     # TransformBroadcaster maps multiple outer fields to the
        >>>     # standard inner field and processes them with the wrapped
        >>>     # transforms respectively
        >>>     dict(type='TransformBroadcaster',
        >>>         # case 1: from multiple outer fields
        >>>         mapping={'img': ['lq', 'gt']},
        >>>         auto_remap=True,
        >>>         # share_random_params=True means using identical random
        >>>         # parameters in every processing
        >>>         share_random_params=True,
        >>>         transforms=[
        >>>             dict(type='Crop', crop_size=(384, 384)),
        >>>             dict(type='Normalize'),
        >>>         ])
        >>> ]

        >>> # Example 2: Broadcast to keys that contains data sequences
        >>> pipeline = [
        >>>     dict(type='LoadImageFromFile', key='lq'),  # low-quality img
        >>>     dict(type='LoadImageFromFile', key='gt'),  # ground-truth img
        >>>     # TransformBroadcaster maps multiple outer fields to the
        >>>     # standard inner field and processes them with the wrapped
        >>>     # transforms respectively
        >>>     dict(type='TransformBroadcaster',
        >>>         # case 2: from one outer field that contains multiple
        >>>         # data elements (e.g. a list)
        >>>         # mapping={'img': 'images'},
        >>>         auto_remap=True,
        >>>         share_random_params=True,
        >>>         transforms=[
        >>>             dict(type='Crop', crop_size=(384, 384)),
        >>>             dict(type='Normalize'),
        >>>         ])
        >>> ]

        >>> # Example 3: Set ignored keys in broadcasting
        >>> pipeline = [
        >>>        dict(type='TransformBroadcaster',
        >>>            # Broadcast the wrapped transforms to multiple images
        >>>            # 'lq' and 'gt', but only update 'img_shape' once
        >>>            mapping={
        >>>                'img': ['lq', 'gt'],
        >>>                'img_shape': ['img_shape', ...],
        >>>             },
        >>>            auto_remap=True,
        >>>            share_random_params=True,
        >>>            transforms=[
        >>>                # `RandomCrop` will modify the field "img",
        >>>                # and optionally update "img_shape" if it exists
        >>>                dict(type='RandomCrop'),
        >>>            ])
        >>>    ]
    """

    def __init__(self,
                 transforms: List[Union[Dict, Callable[[Dict], Dict]]],
                 mapping: Optional[Dict] = None,
                 remapping: Optional[Dict] = None,
                 auto_remap: Optional[bool] = None,
                 allow_nonexist_keys: bool = False,
                 share_random_params: bool = False):
        super().__init__(transforms, mapping, remapping, auto_remap,
                         allow_nonexist_keys)

        self.share_random_params = share_random_params

    def scatter_sequence(self, data: Dict) -> List[Dict]:
        """Split the broadcasting targets into one input dict per target.

        Args:
            data (dict): The mapped inputs. Every broadcast key must hold a
                sequence, and all of these sequences must share one length.

        Returns:
            list[dict]: One shallow copy of ``data`` per target, in which
            each broadcast key holds the corresponding single element.

        Raises:
            ValueError: If the broadcast sequences have different lengths.
        """
        # Broadcast over the mapped inner keys, or over every key when no
        # mapping is configured
        keys = self.mapping.keys() if self.mapping else data.keys()

        # Infer the number of targets from the first non-empty sequence and
        # check the following ones against it
        seq_len = 0
        key_rep = None
        for key in keys:
            assert isinstance(data[key], Sequence)
            length = len(data[key])
            if not seq_len:
                seq_len, key_rep = length, key
            elif length != seq_len:
                raise ValueError('Got inconsistent sequence length: '
                                 f'{seq_len} ({key_rep}) vs. '
                                 f'{length} ({key})')

        assert seq_len > 0, 'Fail to get the number of broadcasting targets'

        def pick(index: int) -> Dict:
            """Build the input dict of the ``index``-th target."""
            item = data.copy()
            item.update((key, data[key][index]) for key in keys)
            return item

        return [pick(index) for index in range(seq_len)]  # type: ignore

    def transform(self, results: Dict):
        """Broadcast wrapped transforms to multiple targets."""

        # Map the outer keys to the inner keys, then split per target
        mapped = self._map_input(results, self.mapping)
        scattered_inputs = self.scatter_sequence(mapped)

        # With ``share_random_params``, the context manager
        # :func:`cache_random_params` lets the cacheable methods of the
        # transforms cache their outputs, so the random parameters are
        # generated only once and shared by all data items.
        manager = (
            cache_random_params  # type: ignore
            if self.share_random_params else nullcontext)

        scattered_outputs = []
        with manager(self.transforms):
            for item in scattered_inputs:
                scattered_outputs.append(self._apply_transforms(item))

        # Collate the per-target outputs (list of dict to dict of list),
        # using the keys of the first output
        collated = {}
        for key in scattered_outputs[0]:
            collated[key] = [item[key] for item in scattered_outputs]

        # Map the inner keys back to the outer keys
        remapped = self._map_output(collated, self.remapping)

        results.update(remapped)
        return results

    def __repr__(self) -> str:
        fields = (
            f'transforms = {self.transforms}',
            f'mapping = {self.mapping}',
            f'remapping = {self.remapping}',
            f'auto_remap = {self.auto_remap}',
            f'allow_nonexist_keys = {self.allow_nonexist_keys}',
            f'share_random_params = {self.share_random_params}',
        )
        return f'{self.__class__.__name__}({", ".join(fields)})'


@TRANSFORMS.register_module()
class RandomChoice(BaseTransform):
    """Process data with a randomly chosen transform from given candidates.

    Args:
        transforms (list[list]): A list of transform candidates, each is a
            sequence of transforms.
        prob (list[float], optional): The probabilities associated
            with each pipeline. The length should be equal to the pipeline
            number and the sum should be 1 (up to floating point rounding).
            If not given, a uniform distribution will be assumed.

    Examples:
        >>> # config
        >>> pipeline = [
        >>>     dict(type='RandomChoice',
        >>>         transforms=[
        >>>             [dict(type='RandomHorizontalFlip')],  # subpipeline 1
        >>>             [dict(type='RandomRotate')],  # subpipeline 2
        >>>         ]
        >>>     )
        >>> ]
    """

    def __init__(self,
                 transforms: List[Union[Transform, List[Transform]]],
                 prob: Optional[List[float]] = None):

        super().__init__()

        if prob is not None:
            # Imported locally because it is only needed for this check
            import math

            assert mmengine.is_seq_of(prob, float)
            assert len(transforms) == len(prob), \
                '``transforms`` and ``prob`` must have same lengths. ' \
                f'Got {len(transforms)} vs {len(prob)}.'
            # Compare with a tolerance: an exact ``== 1`` rejects valid
            # inputs such as [0.1, 0.2, 0.7], whose float sum is
            # 0.9999999999999999.
            total = math.fsum(prob)
            assert math.isclose(total, 1.0, abs_tol=1e-8), \
                f'The sum of ``prob`` must be 1. Got {total}.'

        self.prob = prob
        # Each candidate becomes its own composed sub-pipeline
        self.transforms = [Compose(pipeline) for pipeline in transforms]

    def __iter__(self):
        """Iterate over the candidate sub-pipelines."""
        return iter(self.transforms)

    @cache_randomness
    def random_pipeline_index(self) -> int:
        """Return a random transform index."""
        indices = np.arange(len(self.transforms))
        return np.random.choice(indices, p=self.prob)

    def transform(self, results: Dict) -> Optional[Dict]:
        """Randomly choose a transform to apply."""
        idx = self.random_pipeline_index()
        return self.transforms[idx](results)

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(transforms = {self.transforms}'
        repr_str += f', prob = {self.prob})'
        return repr_str


@TRANSFORMS.register_module()
class RandomApply(BaseTransform):
    """Apply the wrapped transforms with a given probability, otherwise pass
    the data through untouched.

    Args:
        transforms (list[dict | callable]): The transform or transform list
            to randomly apply.
        prob (float): The probability to apply transforms. Default: 0.5

    Examples:
        >>> # config
        >>> pipeline = [
        >>>     dict(type='RandomApply',
        >>>         transforms=[dict(type='HorizontalFlip')],
        >>>         prob=0.3)
        >>> ]
    """

    def __init__(self,
                 transforms: Union[Transform, List[Transform]],
                 prob: float = 0.5):

        super().__init__()
        self.prob = prob
        self.transforms = Compose(transforms)

    def __iter__(self):
        """Iterate over the wrapped transforms."""
        return iter(self.transforms)

    @cache_randomness
    def random_apply(self) -> bool:
        """Draw a random number and tell whether the transforms should be
        applied this time."""
        sample = np.random.rand()
        return sample < self.prob

    def transform(self, results: Dict) -> Optional[Dict]:
        """Apply the wrapped transforms, or return ``results`` unchanged."""
        if not self.random_apply():
            return results
        return self.transforms(results)  # type: ignore

    def __repr__(self) -> str:
        fields = (
            f'transforms = {self.transforms}',
            f'prob = {self.prob}',
        )
        return f'{self.__class__.__name__}({", ".join(fields)})'


================================================
FILE: mmcv/utils/__init__.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from .device_type import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE,
                          IS_MPS_AVAILABLE, IS_MUSA_AVAILABLE,
                          IS_NPU_AVAILABLE)
from .env import collect_env
from .parrots_jit import jit, skip_no_elena

# Names re-exported by this package: the device-availability flags, the
# environment collector and the parrots JIT helpers imported above.
__all__ = [
    'IS_MLU_AVAILABLE', 'IS_MPS_AVAILABLE', 'IS_CUDA_AVAILABLE',
    'IS_NPU_AVAILABLE', 'IS_MUSA_AVAILABLE', 'collect_env', 'jit',
    'skip_no_elena'
]


================================================
FILE: mmcv/utils/device_type.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from mmengine.device import (is_cuda_available, is_mlu_available,
                             is_mps_available, is_musa_available,
                             is_npu_available)

# Each flag is computed once, when this module is imported, by calling the
# corresponding mmengine availability check. The flags are not refreshed
# afterwards.
IS_MLU_AVAILABLE = is_mlu_available()
IS_MPS_AVAILABLE = is_mps_available()
IS_CUDA_AVAILABLE = is_cuda_available()
IS_NPU_AVAILABLE = is_npu_available()
IS_MUSA_AVAILABLE = is_musa_available()


================================================
FILE: mmcv/utils/env.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
"""This file holding some environment constant for sharing by other files."""

import os.path as osp
import subprocess

import torch
from mmengine.utils.dl_utils import collect_env as mmengine_collect_env

import mmcv


def _extract_version_summary(output: str,
                             start_marker: str,
                             end_marker: str = '') -> str:
    """Cut the version summary out of a compiler's version output.

    Args:
        output (str): The decoded compiler output.
        start_marker (str): Text at which the summary starts. The last
            occurrence is used.
        end_marker (str): Text before which the summary ends. The last
            occurrence is used. An empty string means "until the end".

    Returns:
        str: The stripped text between the two markers. If the start marker
        is missing the summary starts at the beginning of ``output``; if the
        end marker is missing (or precedes the start) it runs to the end.
    """
    # ``rfind`` returns -1 for a missing marker. Using -1 as a slice bound
    # would silently drop the last character or return a garbage fragment.
    start = max(output.rfind(start_marker), 0)
    end = output.rfind(end_marker) if end_marker else len(output)
    if end < start:
        end = len(output)
    return output[start:end].strip()


def collect_env():
    """Collect the information of the running environments.

    Returns:
        dict: The environment information. The following fields are contained.

            - sys.platform: The variable of ``sys.platform``.
            - Python: Python version.
            - CUDA available: Bool, indicating if CUDA is available.
            - GPU devices: Device type of each GPU.
            - CUDA_HOME (optional): The env var ``CUDA_HOME``.
            - NVCC (optional): NVCC version.
            - GCC: GCC version, "n/a" if GCC is not installed.
            - MSVC: Microsoft Visual C++ Compiler version, Windows only.
            - PyTorch: PyTorch version.
            - PyTorch compiling details: The output of \
                ``torch.__config__.show()``.
            - TorchVision (optional): TorchVision version.
            - OpenCV: OpenCV version.
            - MMEngine: MMEngine version.
            - MMCV: MMCV version.
            - MMCV Compiler: The GCC version for compiling MMCV ops.
            - MMCV CUDA Compiler: The CUDA version for compiling MMCV ops.
    """
    env_info = mmengine_collect_env()

    # MMEngine does not add the hipcc compiler information when collecting
    # environment information, so it is added here. When MMEngine v0.3.0 is
    # released, the code here can be removed.
    cuda_available = torch.cuda.is_available()
    if cuda_available and env_info.get('NVCC') == 'Not Available':
        CUDA_HOME = env_info.get('CUDA_HOME')
        if CUDA_HOME is not None and osp.isdir(CUDA_HOME):
            # Pick the compiler binary, its version flag and the markers
            # that delimit the version summary in its output
            if CUDA_HOME == '/opt/rocm':
                binary, flag = 'hip/bin/hipcc', '--version'
                start_marker, end_marker = 'HIP version:', ''
            else:
                binary, flag = 'bin/nvcc', '-V'
                start_marker, end_marker = 'Cuda compilation tools', 'Build '
            try:
                nvcc = osp.join(CUDA_HOME, binary)
                nvcc = subprocess.check_output(
                    f'"{nvcc}" {flag}', shell=True)
                nvcc = _extract_version_summary(
                    nvcc.decode('utf-8').strip(), start_marker, end_marker)
            except subprocess.SubprocessError:
                nvcc = 'Not Available'
            env_info['NVCC'] = nvcc

    env_info['MMCV'] = mmcv.__version__

    # The compiled ops are optional, so report "n/a" when they are missing
    try:
        from mmcv.ops import get_compiler_version, get_compiling_cuda_version
    except ModuleNotFoundError:
        env_info['MMCV Compiler'] = 'n/a'
        env_info['MMCV CUDA Compiler'] = 'n/a'
    else:
        env_info['MMCV Compiler'] = get_compiler_version()
        env_info['MMCV CUDA Compiler'] = get_compiling_cuda_version()

    return env_info


================================================
FILE: mmcv/utils/ext_loader.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import importlib
import os
import pkgutil
import warnings
from collections import namedtuple

import torch

# Two implementations of ``load_ext`` are provided; which one is defined
# depends on whether ``torch.__version__`` reports 'parrots'.
if torch.__version__ != 'parrots':

    def load_ext(name, funcs):
        """Import the extension module ``mmcv.<name>`` and check its content.

        Args:
            name (str): Module name relative to the ``mmcv`` package.
            funcs (list[str]): Names that the module is expected to provide.

        Returns:
            module: The imported extension module.

        Raises:
            AssertionError: If one of ``funcs`` is missing from the module.
        """
        ext = importlib.import_module('mmcv.' + name)
        for fun in funcs:
            assert hasattr(ext, fun), f'{fun} miss in module {name}'
        return ext
else:
    from parrots import extension
    from parrots.base import ParrotsException

    # Ops for which ``load_ext`` exposes the loaded function's ``op``
    # attribute; for every other op it exposes ``op_`` instead.
    has_return_value_ops = [
        'nms',
        'softnms',
        'nms_match',
        'nms_rotated',
        'top_pool_forward',
        'top_pool_backward',
        'bottom_pool_forward',
        'bottom_pool_backward',
        'left_pool_forward',
        'left_pool_backward',
        'right_pool_forward',
        'right_pool_backward',
        'fused_bias_leakyrelu',
        'upfirdn2d',
        'ms_deform_attn_forward',
        'pixel_group',
        'contour_expand',
        'diff_iou_rotated_sort_vertices_forward',
    ]

    def get_fake_func(name, e):
        """Build a placeholder for an op that failed to load.

        Args:
            name (str): Name of the op, used in the warning message.
            e (Exception): The exception raised while loading the op.

        Returns:
            callable: A function that accepts any arguments, emits a
            warning and then raises ``e``.
        """

        def fake_func(*args, **kwargs):
            warnings.warn(f'{name} is not supported in parrots now')
            raise e

        return fake_func

    def load_ext(name, funcs):
        """Load the given ops with the parrots extension loader.

        Args:
            name (str): Extension name passed to ``extension.load``.
            funcs (list[str]): Names of the ops to load.

        Returns:
            ExtModule: A namedtuple with one field per entry of ``funcs``.
            An op that failed to load is replaced by a placeholder that
            raises the loading error when it is called.
        """
        ExtModule = namedtuple('ExtModule', funcs)
        ext_list = []
        # The libraries are looked up in the parent directory of the
        # directory containing this file
        lib_root = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
        for fun in funcs:
            try:
                ext_fun = extension.load(fun, name, lib_dir=lib_root)
            except ParrotsException as e:
                # 'No element registered' errors are not reported at load
                # time; every other loading error is surfaced as a warning.
                # Either way the failure is deferred until the op is called.
                if 'No element registered' not in e.message:
                    warnings.warn(e.message)
                ext_fun = get_fake_func(fun, e)
                ext_list.append(ext_fun)
            else:
                if fun in has_return_value_ops:
                    ext_list.append(ext_fun.op)
                else:
                    ext_list.append(ext_fun.op_)
        return ExtModule(*ext_list)


def check_ops_exist() -> bool:
    """Check whether the compiled extension module ``mmcv._ext`` exists.

    Returns:
        bool: True if a loadable ``mmcv._ext`` module is found, otherwise
        False.

    Raises:
        ImportError: If the lookup itself fails, e.g. because the parent
            package cannot be imported.
    """
    # ``pkgutil.find_loader`` is deprecated since Python 3.12 and is
    # scheduled for removal, so ``importlib.util.find_spec`` is used
    # instead. The error handling below mirrors what ``find_loader`` did.
    import importlib.util

    try:
        spec = importlib.util.find_spec('mmcv._ext')
    except (ImportError, AttributeError, TypeError, ValueError) as ex:
        raise ImportError(
            f'Error while finding loader for mmcv._ext ({type(ex)}: {ex})'
        ) from ex
    return spec is not None and spec.loader is not None


================================================
FILE: mmcv/utils/parrots_jit.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import os

from mmengine.utils.dl_utils.parrots_wrapper import TORCH_VERSION

# The parrots JIT is only used when explicitly switched on through the
# ``PARROTS_JIT_OPTION`` environment variable.
parrots_jit_option = os.getenv('PARROTS_JIT_OPTION')

if TORCH_VERSION == 'parrots' and parrots_jit_option == 'ON':
    from parrots.jit import pat as jit
else:

    def jit(func=None,
            check_input=None,
            full_shape=True,
            derivate=False,
            coderize=False,
            optimize=False):
        """Pass-through stand-in for ``parrots.jit.pat``.

        It can be used both as ``@jit`` and as ``@jit(...)``. All the
        options are accepted and ignored.

        Args:
            func (callable, optional): The function to decorate. It is None
                when the decorator is used with arguments.

        Returns:
            callable: ``func`` itself when it is given, otherwise a
            decorator that wraps its target in a pass-through function.
        """
        # Imported locally so that this fallback is self-contained
        from functools import wraps

        def wrapper(func):

            # Keep the name, docstring and signature of the wrapped function
            @wraps(func)
            def wrapper_inner(*args, **kargs):
                return func(*args, **kargs)

            return wrapper_inner

        if func is None:
            return wrapper
        else:
            return func


if TORCH_VERSION == 'parrots':
    from parrots.utils.tester import skip_no_elena
else:

    def skip_no_elena(func):
        """Pass-through stand-in for the parrots ``skip_no_elena`` decorator.

        Args:
            func (callable): The function to decorate.

        Returns:
            callable: A wrapper that forwards every call to ``func``.
        """
        # Imported locally so that this fallback is self-contained
        from functools import wraps

        # Keep the name, docstring and signature of the wrapped function
        @wraps(func)
        def wrapper(*args, **kargs):
            return func(*args, **kargs)

        return wrapper


================================================
FILE: mmcv/version.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
__version__ = '2.2.0'


def parse_version_info(version_str: str, length: int = 4) -> tuple:
    """Parse a version string into a tuple.

    The result holds ``length`` release numbers (truncated or padded with
    zeros) followed by two items describing the release kind:

    - pre-release: the pre-release tag and number, e.g. ``('rc', 1)``
    - development release: ``('dev', N)``
    - post-release: ``('post', N)``
    - final release: ``(0, 0)``

    Args:
        version_str (str): The version string.
        length (int): The maximum number of version levels. Default: 4.

    Returns:
        tuple[int | str]: The version info, e.g., "1.3.0" is parsed into
            (1, 3, 0, 0, 0, 0), and "2.0.0rc1" is parsed into
            (2, 0, 0, 0, 'rc', 1) (when length is set to 4).
    """
    from packaging.version import parse
    version = parse(version_str)
    assert version.release, f'failed to parse version {version_str}'
    release: list = list(version.release)[:length]
    # Pad with zeros up to ``length`` levels (no-op when already long enough)
    release += [0] * (length - len(release))
    if version.pre is not None:
        # ``pre`` is a (tag, number) pair
        release.extend(version.pre)
    elif version.dev is not None:
        # Dev releases count as pre-releases but carry no ``pre`` segment;
        # ``dev`` is a plain int
        release.extend(['dev', version.dev])
    elif version.post is not None:
        # ``post`` is a plain int, not a pair
        release.extend(['post', version.post])
    else:
        release.extend([0, 0])
    return tuple(release)


# Numeric tuple built from the first three dot-separated fields of
# ``__version__``. Each of these fields must be a plain integer, otherwise
# ``int()`` raises ValueError at import time.
version_info = tuple(int(x) for x in __version__.split('.')[:3])

__all__ = ['__version__', 'version_info', 'parse_version_info']


================================================
FILE: mmcv/video/__init__.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from .io import Cache, VideoReader, frames2video
from .optflow import (dequantize_flow, flow_from_bytes, flow_warp, flowread,
                      flowwrite, quantize_flow, sparse_flow_from_bytes)
from .processing import concat_video, convert_video, cut_video, resize_video

# Names re-exported by this package: video reading/writing, optical flow
# utilities and video processing helpers imported above.
__all__ = [
    'Cache', 'VideoReader', 'frames2video', 'convert_video', 'resize_video',
    'cut_video', 'concat_video', 'flowread', 'flowwrite', 'quantize_flow',
    'dequantize_flow', 'flow_warp', 'flow_from_bytes', 'sparse_flow_from_bytes'
]


================================================
FILE: mmcv/video/io.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from collections import OrderedDict

import cv2
from cv2 import (CAP_PROP_FOURCC, CAP_PROP_FPS, CAP_PROP_FRAME_COUNT,
                 CAP_PROP_FRAME_HEIGHT, CAP_PROP_FRAME_WIDTH,
                 CAP_PROP_POS_FRAMES, VideoWriter_fourcc)
from mmengine.utils import (check_file_exist, mkdir_or_exist, scandir,
                            track_progress)


class Cache:
    """A bounded key-value cache that evicts the oldest inserted item.

    Items are evicted in insertion order: neither :meth:`get` nor a repeated
    :meth:`put` of an existing key refreshes an item.

    Args:
        capacity (int): Maximum number of items. It is converted with
            ``int()`` and must be positive after the conversion.

    Raises:
        ValueError: If the converted capacity is not positive.
    """

    def __init__(self, capacity):
        # Validate the converted value: checking the raw argument would let
        # e.g. 0.5 through, which truncates to a capacity of 0 and makes
        # ``put`` fail on an empty cache.
        capacity = int(capacity)
        if capacity <= 0:
            raise ValueError('capacity must be a positive integer')
        self._cache = OrderedDict()
        self._capacity = capacity

    @property
    def capacity(self):
        """int: Maximum number of items the cache can hold."""
        return self._capacity

    @property
    def size(self):
        """int: Number of items currently stored."""
        return len(self._cache)

    def put(self, key, val):
        """Store ``val`` under ``key`` unless the key is already cached.

        When the cache is full, the oldest inserted item is evicted first.
        """
        if key in self._cache:
            return
        if len(self._cache) >= self.capacity:
            self._cache.popitem(last=False)
        self._cache[key] = val

    def get(self, key, default=None):
        """Return the cached value of ``key``, or ``default`` if missing."""
        return self._cache.get(key, default)


class VideoReader:
    """Video class with similar usage to a list object.

    This video wrapper class provides convenient apis to access frames.
    There exists an issue of OpenCV's VideoCapture class that jumping to a
    certain frame may be inaccurate. It is fixed in this class by checking
    the position after jumping each time.
    Cache is used when decoding videos. So if the same frame is visited for
    the second time, there is no need to decode again if it is stored in the
    cache.

    Args:
        filename (str): Path or http(s) url of the video. The existence
            check is skipped for urls.
        cache_capacity (int): Maximum number of decoded frames kept in the
            cache. Must be positive. Default: 10.

    Examples:
        >>> import mmcv
        >>> v = mmcv.VideoReader('sample.mp4')
        >>> len(v)  # get the total frame number with `len()`
        120
        >>> for img in v:  # v is iterable
        >>>     mmcv.imshow(img)
        >>> v[5]  # get the 6th frame
    """

    def __init__(self, filename, cache_capacity=10):
        # Only local paths are checked for existence; urls are handed to
        # OpenCV as they are
        if not filename.startswith(('https://', 'http://')):
            check_file_exist(filename, 'Video file not found: ' + filename)
        self._vcap = cv2.VideoCapture(filename)
        assert cache_capacity > 0
        self._cache = Cache(cache_capacity)
        # Index of the next frame to be returned by ``read``
        self._position = 0
        # Basic properties are queried once and stored
        self._width = int(self._vcap.get(CAP_PROP_FRAME_WIDTH))
        self._height = int(self._vcap.get(CAP_PROP_FRAME_HEIGHT))
        self._fps = self._vcap.get(CAP_PROP_FPS)
        self._frame_cnt = int(self._vcap.get(CAP_PROP_FRAME_COUNT))
        self._fourcc = self._vcap.get(CAP_PROP_FOURCC)

    @property
    def vcap(self):
        """:obj:`cv2.VideoCapture`: The raw VideoCapture object."""
        return self._vcap

    @property
    def opened(self):
        """bool: Indicate whether the video is opened."""
        return self._vcap.isOpened()

    @property
    def width(self):
        """int: Width of video frames."""
        return self._width

    @property
    def height(self):
        """int: Height of video frames."""
        return self._height

    @property
    def resolution(self):
        """tuple: Video resolution (width, height)."""
        return (self._width, self._height)

    @property
    def fps(self):
        """float: FPS of the video."""
        return self._fps

    @property
    def frame_cnt(self):
        """int: Total frames of the video."""
        return self._frame_cnt

    @property
    def fourcc(self):
        """The "four character code" property of the video, stored exactly
        as returned by ``VideoCapture.get(CAP_PROP_FOURCC)``.

        NOTE(review): this is presumably a numeric code rather than a
        4-character str - confirm before relying on the type.
        """
        return self._fourcc

    @property
    def position(self):
        """int: Current cursor position, indicating frame decoded."""
        return self._position

    def _get_real_position(self):
        """Return the frame position reported by the capture object, rounded
        to the nearest int."""
        return int(round(self._vcap.get(CAP_PROP_POS_FRAMES)))

    def _set_real_position(self, frame_id):
        """Move the capture object and the cursor to ``frame_id``.

        After asking the capture object to jump, the position it reports is
        read back. If it is behind ``frame_id``, the missing frames are read
        and discarded to make up the difference.
        """
        self._vcap.set(CAP_PROP_POS_FRAMES, frame_id)
        pos = self._get_real_position()
        for _ in range(frame_id - pos):
            self._vcap.read()
        self._position = frame_id

    def read(self):
        """Read the next frame.

        If the next frame have been decoded before and in the cache, then
        return it directly, otherwise decode, cache and return it.

        Returns:
            ndarray or None: Return the frame if successful, otherwise None.
        """
        # ``Cache`` defines neither ``__bool__`` nor ``__len__``, so this
        # condition holds for any Cache instance.
        if self._cache:
            img = self._cache.get(self._position)
            if img is not None:
                ret = True
            else:
                # Re-synchronize the capture object with the cursor before
                # decoding, e.g. after frames were served from the cache
                if self._position != self._get_real_position():
                    self._set_real_position(self._position)
                ret, img = self._vcap.read()
                if ret:
                    self._cache.put(self._position, img)
        else:
            ret, img = self._vcap.read()
        # The cursor only advances when a frame was obtained
        if ret:
            self._position += 1
        return img

    def get_frame(self, frame_id):
        """Get frame by index.

        Args:
            frame_id (int): Index of the expected frame, 0-based.

        Returns:
            ndarray or None: Return the frame if successful, otherwise None.

        Raises:
            IndexError: If ``frame_id`` is negative or not smaller than the
                total frame number.
        """
        if frame_id < 0 or frame_id >= self._frame_cnt:
            raise IndexError(
                f'"frame_id" must be between 0 and {self._frame_cnt - 1}')
        # The requested frame is the next one: a plain sequential read
        if frame_id == self._position:
            return self.read()
        if self._cache:
            img = self._cache.get(frame_id)
            if img is not None:
                # Cache hit: only the cursor is moved, the capture object is
                # re-synchronized lazily by ``read``
                self._position = frame_id + 1
                return img
        self._set_real_position(frame_id)
        ret, img = self._vcap.read()
        if ret:
            if self._cache:
                self._cache.put(self._position, img)
            self._position += 1
        return img

    def current_frame(self):
        """Get the current frame (frame that is just visited).

        The frame is looked up in the cache, so None is also returned when
        the frame is no longer cached.

        Returns:
            ndarray or None: If the video is fresh, return None, otherwise
            return the frame.
        """
        if self._position == 0:
            return None
        return self._cache.get(self._position - 1)

    def cvt2frames(self,
                   frame_dir,
                   file_start=0,
                   filename_tmpl='{:06d}.jpg',
                   start=0,
                   max_num=0,
                   show_progress=True):
        """Convert a video to frame images.

        Args:
            frame_dir (str): Output directory to store all the frame images.
            file_start (int): Filenames will start from the specified number.
            filename_tmpl (str): Filename template with the index as the
                placeholder.
            start (int): The starting frame index.
            max_num (int): Maximum number of frames to be written. 0 means
                all the frames from ``start`` on.
            show_progress (bool): Whether to show a progress bar.

        Raises:
            ValueError: If there is no frame to write, i.e. ``start`` is not
                less than the total frame number.
        """
        mkdir_or_exist(frame_dir)
        if max_num == 0:
            task_num = self.frame_cnt - start
        else:
            task_num = min(self.frame_cnt - start, max_num)
        if task_num <= 0:
            raise ValueError('start must be less than total frame number')
        if start > 0:
            self._set_real_position(start)

        def write_frame(file_idx):
            # Frames are read sequentially; a failed read is skipped
            img = self.read()
            if img is None:
                return
            filename = osp.join(frame_dir, filename_tmpl.format(file_idx))
            cv2.imwrite(filename, img)

        if show_progress:
            track_progress(write_frame, range(file_start,
                                              file_start + task_num))
        else:
            for i in range(task_num):
                write_frame(file_start + i)

    def __len__(self):
        """Return the total number of frames."""
        return self.frame_cnt

    def __getitem__(self, index):
        """Get one frame by index, or a list of frames by slice.

        Negative indices count from the end of the video.
        """
        if isinstance(index, slice):
            return [
                self.get_frame(i)
                for i in range(*index.indices(self.frame_cnt))
            ]
        # support negative indexing
        if index < 0:
            index += self.frame_cnt
            if index < 0:
                raise IndexError('index out of range')
        return self.get_frame(index)

    def __iter__(self):
        """Rewind to the first frame and return the reader itself."""
        self._set_real_position(0)
        return self

    def __next__(self):
        """Return the next frame, or stop the iteration when reading
        fails."""
        img = self.read()
        if img is not None:
            return img
        else:
            raise StopIteration

    # Alias of ``__next__``
    next = __next__

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Release the underlying capture object when leaving the context
        self._vcap.release()


def frames2video(frame_dir: str,
                 video_file: str,
                 fps: float = 30,
                 fourcc: str = 'XVID',
                 filename_tmpl: str = '{:06d}.jpg',
                 start: int = 0,
                 end: int = 0,
                 show_progress: bool = True) -> None:
    """Read the frame images from a directory and join them as a video.

    The resolution of the video is taken from the start frame.

    Args:
        frame_dir (str): The directory containing video frames.
        video_file (str): Output filename.
        fps (float): FPS of the output video.
        fourcc (str): Fourcc of the output video, this should be compatible
            with the output file type.
        filename_tmpl (str): Filename template with the index as the variable.
        start (int): Starting frame index.
        end (int): Ending frame index (exclusive). If 0, it is set to the
            number of files in ``frame_dir`` that have the extension of
            ``filename_tmpl``.
        show_progress (bool): Whether to show a progress bar.

    Raises:
        OSError: If the start frame exists but cannot be read as an image.
    """
    if end == 0:
        ext = filename_tmpl.split('.')[-1]
        end = sum(1 for _ in scandir(frame_dir, ext))
    first_file = osp.join(frame_dir, filename_tmpl.format(start))
    check_file_exist(first_file, 'The start frame not found: ' + first_file)
    img = cv2.imread(first_file)
    # ``cv2.imread`` returns None instead of raising when decoding fails
    if img is None:
        raise OSError(f'Failed to read the start frame: {first_file}')
    height, width = img.shape[:2]
    resolution = (width, height)
    vwriter = cv2.VideoWriter(video_file, VideoWriter_fourcc(*fourcc), fps,
                              resolution)

    def write_frame(file_idx):
        filename = osp.join(frame_dir, filename_tmpl.format(file_idx))
        img = cv2.imread(filename)
        vwriter.write(img)

    # Release the writer even if writing a frame fails, so that the output
    # file handle is not leaked
    try:
        if show_progress:
            track_progress(write_frame, range(start, end))
        else:
            for i in range(start, end):
                write_frame(i)
    finally:
        vwriter.release()


================================================
FILE: mmcv/video/optflow.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from typing import Tuple, Union

import cv2
import numpy as np
from mmengine.utils import is_str

from mmcv.arraymisc import dequantize, quantize
from mmcv.image import imread, imwrite


def flowread(flow_or_path: Union[np.ndarray, str],
             quantize: bool = False,
             concat_axis: int = 0,
             *args,
             **kwargs) -> np.ndarray:
    """Load an optical flow map from memory or from disk.

    Args:
        flow_or_path (ndarray or str): Either an already loaded flow map or
            the path of a flow file.
        quantize (bool): If True the file is read as a quantized dx/dy image
            pair and the remaining args are forwarded to
            :func:`dequantize_flow`; otherwise it is read as a ``.flo`` file.
        concat_axis (int): Axis (0 or 1) along which dx and dy were
            concatenated. Only used when ``quantize`` is True.

    Returns:
        ndarray: Optical flow represented as a (h, w, 2) numpy array. An
        ndarray input is returned as is after its shape has been checked.
    """
    # An in-memory flow is only validated, never copied or converted.
    if isinstance(flow_or_path, np.ndarray):
        is_hw2 = flow_or_path.ndim == 3 and flow_or_path.shape[-1] == 2
        if not is_hw2:
            raise ValueError(f'Invalid flow with shape {flow_or_path.shape}')
        return flow_or_path
    if not is_str(flow_or_path):
        raise TypeError(f'"flow_or_path" must be a filename or numpy array, '
                        f'not {type(flow_or_path)}')

    if quantize:
        assert concat_axis in [0, 1]
        packed = imread(flow_or_path, flag='unchanged')
        if packed.ndim != 2:
            raise OSError(
                f'{flow_or_path} is not a valid quantized flow file, '
                f'its dimension is {packed.ndim}.')
        assert packed.shape[concat_axis] % 2 == 0
        # The two halves of the image are the quantized dx and dy planes.
        dx, dy = np.split(packed, 2, axis=concat_axis)
        flow = dequantize_flow(dx, dy, *args, **kwargs)
        return flow.astype(np.float32)

    with open(flow_or_path, 'rb') as flo:
        # Layout read here: 4-byte magic, int32 width, int32 height, then
        # width * height * 2 float32 values.
        try:
            magic = flo.read(4).decode('utf-8')
        except Exception:
            raise OSError(f'Invalid flow file: {flow_or_path}')
        if magic != 'PIEH':
            raise OSError(f'Invalid flow file: {flow_or_path}, '
                          'header does not contain PIEH')

        width = np.fromfile(flo, np.int32, 1).squeeze()
        height = np.fromfile(flo, np.int32, 1).squeeze()
        values = np.fromfile(flo, np.float32, width * height * 2)
        flow = values.reshape((height, width, 2))

    return flow.astype(np.float32)


def flowwrite(flow: np.ndarray,
              filename: str,
              quantize: bool = False,
              concat_axis: int = 0,
              *args,
              **kwargs) -> None:
    """Save an optical flow map to a file.

    Without quantization the flow is stored losslessly as a ``.flo`` file
    (magic ``PIEH``, int32 width, int32 height, float32 data). With
    quantization, dx and dy are quantized, concatenated along
    ``concat_axis`` into one image and written with :func:`imwrite`, which is
    lossy but much smaller.

    Args:
        flow (ndarray): (h, w, 2) array of optical flow.
        filename (str): Output filepath.
        quantize (bool): Whether to quantize the flow and store it as an
            image. If True, remaining args are forwarded to
            :func:`quantize_flow`.
        concat_axis (int): The axis that dx and dy are concatenated,
            can be either 0 or 1. Ignored if quantize is False.
    """
    if quantize:
        assert concat_axis in [0, 1]
        dx, dy = quantize_flow(flow, *args, **kwargs)
        stacked = np.concatenate((dx, dy), axis=concat_axis)
        imwrite(stacked, filename)
        return

    with open(filename, 'wb') as out:
        out.write(b'PIEH')
        # The header stores width before height.
        dims = np.array([flow.shape[1], flow.shape[0]], dtype=np.int32)
        dims.tofile(out)
        flow.astype(np.float32).tofile(out)
        out.flush()


def quantize_flow(flow: np.ndarray,
                  max_val: float = 0.02,
                  norm: bool = True) -> tuple:
    """Quantize the two flow components to ``uint8``.

    The quantized planes are far smaller than the raw flow and can be
    dumped as jpeg images.

    Args:
        flow (ndarray): (h, w, 2) array of optical flow.
        max_val (float): Maximum value of flow, values beyond
                        [-max_val, max_val] will be truncated.
        norm (bool): Whether to divide dx by the image width and dy by the
            image height before quantizing.

    Returns:
        tuple[ndarray]: Quantized dx and dy.
    """
    height, width, _ = flow.shape
    dx, dy = flow[..., 0], flow[..., 1]
    if norm:
        # Plain division creates new arrays, leaving the caller's flow intact.
        dx, dy = dx / width, dy / height
    # 255 levels (not 256) so that a zero flow dequantizes back to zero.
    return tuple(
        quantize(component, -max_val, max_val, 255, np.uint8)
        for component in (dx, dy))


def dequantize_flow(dx: np.ndarray,
                    dy: np.ndarray,
                    max_val: float = 0.02,
                    denorm: bool = True) -> np.ndarray:
    """Rebuild a flow map from its quantized dx and dy planes.

    Args:
        dx (ndarray): Quantized dx.
        dy (ndarray): Quantized dy.
        max_val (float): Maximum value used when quantizing.
        denorm (bool): Whether to multiply dx by the width and dy by the
            height of the planes.

    Returns:
        ndarray: Dequantized flow, dx and dy stacked along the last axis.
    """
    assert dx.shape == dy.shape
    assert dx.ndim == 2 or (dx.ndim == 3 and dx.shape[-1] == 1)

    restored_x = dequantize(dx, -max_val, max_val, 255)
    restored_y = dequantize(dy, -max_val, max_val, 255)

    if denorm:
        rows, cols = restored_x.shape[0], restored_x.shape[1]
        restored_x *= cols  # type: ignore
        restored_y *= rows  # type: ignore
    return np.dstack((restored_x, restored_y))


def flow_warp(img: np.ndarray,
              flow: np.ndarray,
              filling_value: int = 0,
              interpolate_mode: str = 'nearest') -> np.ndarray:
    """Warp an image with an optical flow field.

    Every output pixel samples ``img`` at its own position displaced by the
    flow. Pixels whose displaced position floors outside
    ``[0, height - 1) x [0, width - 1)`` keep ``filling_value``.

    Args:
        img (ndarray): Image to be warped.
        flow (ndarray): Optical Flow.
        filling_value (int): The missing pixels will be set with filling_value.
        interpolate_mode (str): bilinear -> Bilinear Interpolation;
                                nearest -> Nearest Neighbor.

    Returns:
        ndarray: Warped image with the same shape of img
    """
    warnings.warn('This function is just for prototyping and cannot '
                  'guarantee the computational efficiency.')
    assert flow.ndim == 3, 'Flow must be in 3D arrays.'
    height, width = flow.shape[0], flow.shape[1]
    channels = img.shape[2]

    output = np.ones(
        (height, width, channels), dtype=img.dtype) * filling_value

    # Here "dx" runs along rows and "dy" along columns: flow channel 1 is
    # added to the row index and flow channel 0 to the column index.
    row_idx, col_idx = np.indices((height, width))
    dx = row_idx + flow[:, :, 1]
    dy = col_idx + flow[:, :, 0]
    sx = np.floor(dx).astype(int)
    sy = np.floor(dy).astype(int)
    valid = (sx >= 0) & (sx < height - 1) & (sy >= 0) & (sy < width - 1)

    if interpolate_mode == 'nearest':
        src_rows = dx[valid].round().astype(int)
        src_cols = dy[valid].round().astype(int)
        output[valid, :] = img[src_rows, src_cols, :]
    elif interpolate_mode == 'bilinear':
        # dirty workaround for integer positions: nudge the coordinates so
        # that floor and ceil never coincide.
        eps_ = 1e-6
        vx = (dx + eps_)[valid]
        vy = (dy + eps_)[valid]
        x_lo, x_hi = np.floor(vx), np.ceil(vx)
        y_lo, y_hi = np.floor(vy), np.ceil(vy)
        top, down = x_lo.astype(int), x_hi.astype(int)
        left, right = y_lo.astype(int), y_hi.astype(int)
        # Each corner is weighted by the distance to the opposite side.
        w_top = (x_hi - vx)[:, None]
        w_down = (vx - x_lo)[:, None]
        w_left = (y_hi - vy)[:, None]
        w_right = (vy - y_lo)[:, None]
        left_top_ = img[top, left, :] * w_top * w_left
        left_down_ = img[down, left, :] * w_down * w_left
        right_top_ = img[top, right, :] * w_top * w_right
        right_down_ = img[down, right, :] * w_down * w_right
        output[valid, :] = left_top_ + left_down_ + right_top_ + right_down_
    else:
        raise NotImplementedError(
            'We only support interpolation modes of nearest and bilinear, '
            f'but got {interpolate_mode}.')
    return output.astype(img.dtype)


def flow_from_bytes(content: bytes) -> np.ndarray:
    """Read dense optical flow from bytes.

    The expected layout is a 4-byte ``PIEH`` magic, an int32 width, an int32
    height and then ``width * height * 2`` float32 values.

    .. note::
        This load optical flow function works for FlyingChairs, FlyingThings3D,
        Sintel, FlyingChairsOcc datasets, but cannot load the data from
        ChairsSDHom.

    Args:
        content (bytes): Optical flow bytes got from files or other streams.

    Returns:
        ndarray: Loaded optical flow with the shape (H, W, 2). The array is a
        view on ``content`` and therefore read-only.

    Raises:
        Exception: If the first 4 bytes are not ``PIEH``.
    """
    # Compare raw bytes: decoding the header first would turn arbitrary
    # binary junk into a UnicodeDecodeError instead of the header error.
    if content[:4] != b'PIEH':
        raise Exception('Flow file header does not contain PIEH')
    # Width and height follow the magic; convert to Python ints so the
    # element count below cannot wrap around in int32 arithmetic.
    width, height = (
        int(v) for v in np.frombuffer(content, np.int32, count=2, offset=4))
    # Everything after the first 12 bytes is flow data; ``offset`` reads it
    # in place rather than copying a slice of the buffer.
    flow = np.frombuffer(
        content, np.float32, count=width * height * 2, offset=12)
    return flow.reshape((height, width, 2))


def sparse_flow_from_bytes(content: bytes) -> Tuple[np.ndarray, np.ndarray]:
    """Decode a KITTI-style sparse optical flow image from bytes.

    This function is modified from RAFT load the `KITTI datasets
    <https://github.com/princeton-vl/RAFT/blob/224320502d66c356d88e6c712f38129e60661e80/core/utils/frame_utils.py#L102>`_.

    Args:
        content (bytes): Optical flow bytes got from files or other streams.

    Returns:
        Tuple(ndarray, ndarray): Loaded optical flow with the shape (H, W, 2)
        and flow valid mask with the shape (H, W).
    """  # noqa: E501

    encoded = np.frombuffer(content, np.uint8)
    decoded = cv2.imdecode(encoded, cv2.IMREAD_ANYDEPTH | cv2.IMREAD_COLOR)
    # Reverse the channel axis, then work in float32.
    planes = decoded[:, :, ::-1].astype(np.float32)
    # First two channels hold the flow, the third one the valid mask.
    valid = planes[:, :, 2]
    flow = (planes[:, :, :2] - 2**15) / 64.0
    return flow, valid


================================================
FILE: mmcv/video/processing.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import os
import os.path as osp
import subprocess
import tempfile
from typing import List, Optional, Union

from mmengine.utils import requires_executable


@requires_executable('ffmpeg')
def convert_video(in_file: str,
                  out_file: str,
                  print_cmd: bool = False,
                  pre_options: str = '',
                  **kwargs) -> None:
    """Run ffmpeg on a video file.

    This is a thin general-purpose wrapper; the executed command is::

        ffmpeg -y <pre_options> -i <in_file> <options> <out_file>

    Keyword arguments are turned into ffmpeg options as follows:

    - key=val: "-key val"
    - key=True: "-key"
    - key=False: ""
    - log_level=val: "-loglevel val" (val must be a known ffmpeg level)

    Args:
        in_file (str): Input video filename.
        out_file (str): Output video filename.
        print_cmd (bool): Whether to print the final ffmpeg command.
        pre_options (str): Options appears before "-i <in_file>".
    """
    known_levels = [
        'quiet', 'panic', 'fatal', 'error', 'warning', 'info', 'verbose',
        'debug', 'trace'
    ]
    options = []
    for key, value in kwargs.items():
        # Booleans are flags: present when True, omitted when False.
        if isinstance(value, bool):
            if value:
                options.append(f'-{key}')
            continue
        if key == 'log_level':
            assert value in known_levels
            options.append(f'-loglevel {value}')
            continue
        options.append(f'-{key} {value}')

    joined_options = ' '.join(options)
    cmd = f'ffmpeg -y {pre_options} -i {in_file} {joined_options} {out_file}'
    if print_cmd:
        print(cmd)
    # NOTE(review): the command is a plain string executed through the shell,
    # so file names and option values are not escaped -- do not pass
    # untrusted input here.
    subprocess.call(cmd, shell=True)


@requires_executable('ffmpeg')
def resize_video(in_file: str,
                 out_file: str,
                 size: Optional[tuple] = None,
                 ratio: Union[tuple, float, None] = None,
                 keep_ar: bool = False,
                 log_level: str = 'info',
                 print_cmd: bool = False) -> None:
    """Resize a video with ffmpeg's ``scale`` filter.

    Exactly one of ``size`` and ``ratio`` has to be given.

    Args:
        in_file (str): Input video filename.
        out_file (str): Output video filename.
        size (tuple): Expected size (w, h), eg, (320, 240) or (320, -1).
        ratio (tuple or float): Expected resize ratio, (2, 0.5) means
            (w*2, h*0.5). A single number is used for both directions.
        keep_ar (bool): Whether to keep original aspect ratio. Only used
            together with ``size``.
        log_level (str): Logging level of ffmpeg.
        print_cmd (bool): Whether to print the final ffmpeg command.

    Raises:
        ValueError: If neither or both of ``size`` and ``ratio`` are given.
    """
    has_size = size is not None
    has_ratio = ratio is not None
    if not (has_size or has_ratio):
        raise ValueError('expected size or ratio must be specified')
    if has_size and has_ratio:
        raise ValueError('size and ratio cannot be specified at the same time')

    if size:
        if keep_ar:
            scale_filter = f'scale=w={size[0]}:h={size[1]}:' \
                           'force_original_aspect_ratio=decrease'
        else:
            scale_filter = f'scale={size[0]}:{size[1]}'
    else:
        factors = ratio if isinstance(ratio, tuple) else (ratio, ratio)
        scale_filter = \
            f'scale="trunc(iw*{factors[0]}):trunc(ih*{factors[1]})"'
    convert_video(
        in_file, out_file, print_cmd, log_level=log_level, vf=scale_filter)


@requires_executable('ffmpeg')
def cut_video(in_file: str,
              out_file: str,
              start: Optional[float] = None,
              end: Optional[float] = None,
              vcodec: Optional[str] = None,
              acodec: Optional[str] = None,
              log_level: str = 'info',
              print_cmd: bool = False) -> None:
    """Cut a clip from a video.

    Args:
        in_file (str): Input video filename.
        out_file (str): Output video filename.
        start (None or float): Start time (in seconds).
        end (None or float): End time (in seconds).
        vcodec (None or str): Output video codec, None for unchanged
            (stream copy).
        acodec (None or str): Output audio codec, None for unchanged
            (stream copy).
        log_level (str): Logging level of ffmpeg.
        print_cmd (bool): Whether to print the final ffmpeg command.
    """
    options = {'log_level': log_level}
    # Forward an explicitly requested codec; previously only the ``None``
    # case was handled, so a given codec was silently dropped.
    options['vcodec'] = 'copy' if vcodec is None else vcodec
    options['acodec'] = 'copy' if acodec is None else acodec
    if start:
        options['ss'] = start  # type: ignore
    else:
        start = 0
    if end:
        # ffmpeg's ``-t`` takes a duration, not an end timestamp.
        options['t'] = end - start  # type: ignore
    convert_video(in_file, out_file, print_cmd, **options)


@requires_executable('ffmpeg')
def concat_video(video_list: List,
                 out_file: str,
                 vcodec: Optional[str] = None,
                 acodec: Optional[str] = None,
                 log_level: str = 'info',
                 print_cmd: bool = False) -> None:
    """Concatenate multiple videos into a single one.

    The absolute paths of the inputs are written to a temporary list file
    that is passed to ffmpeg with ``-f concat -safe 0`` and removed
    afterwards.

    Args:
        video_list (list): A list of video filenames
        out_file (str): Output video filename
        vcodec (None or str): Output video codec, None for unchanged
            (stream copy).
        acodec (None or str): Output audio codec, None for unchanged
            (stream copy).
        log_level (str): Logging level of ffmpeg.
        print_cmd (bool): Whether to print the final ffmpeg command.
    """
    tmp_filehandler, tmp_filename = tempfile.mkstemp(suffix='.txt', text=True)
    # Clean up the descriptor and the list file even if writing the list or
    # invoking ffmpeg raises.
    try:
        with open(tmp_filename, 'w') as f:
            for filename in video_list:
                f.write(f'file {osp.abspath(filename)}\n')
        options = {'log_level': log_level}
        # Forward an explicitly requested codec; previously only the ``None``
        # case was handled, so a given codec was silently dropped.
        options['vcodec'] = 'copy' if vcodec is None else vcodec
        options['acodec'] = 'copy' if acodec is None else acodec
        convert_video(
            tmp_filename,
            out_file,
            print_cmd,
            pre_options='-f concat -safe 0',
            **options)
    finally:
        os.close(tmp_filehandler)
        os.remove(tmp_filename)


================================================
FILE: mmcv/visualization/__init__.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from .color import Color, color_val
from .image import imshow, imshow_bboxes, imshow_det_bboxes
from .optflow import flow2rgb, flowshow, make_color_wheel

# Names re-exported by this package: the color helpers, the image display
# helpers and the optical flow visualization helpers imported above.
__all__ = [
    'Color', 'color_val', 'imshow', 'imshow_bboxes', 'imshow_det_bboxes',
    'flowshow', 'flow2rgb', 'make_color_wheel'
]


================================================
FILE: mmcv/visualization/color.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from enum import Enum
from typing import Union

import numpy as np
from mmengine.utils import is_str


class Color(Enum):
    """An enum that defines common colors.

    Contains red, green, blue, cyan, yellow, magenta, white and black.

    Each value is a 3-tuple of 0-255 channel intensities in BGR order, e.g.
    ``red`` is ``(0, 0, 255)`` and ``blue`` is ``(255, 0, 0)``.
    """
    # Primary colors (blue, green, red channel order).
    red = (0, 0, 255)
    green = (0, 255, 0)
    blue = (255, 0, 0)
    # Mixtures of two primaries.
    cyan = (255, 255, 0)
    yellow = (0, 255, 255)
    magenta = (255, 0, 255)
    # All channels at maximum / minimum.
    white = (255, 255, 255)
    black = (0, 0, 0)


def color_val(color: Union[Color, str, tuple, int, np.ndarray]) -> tuple:
    """Normalize a color specification to a 3-tuple.

    Accepted inputs are the name of a :obj:`Color` member, a :obj:`Color`
    member, a 3-tuple, a single int (used for all three channels) or a 1-D
    ndarray with 3 elements. Channel values must lie in [0, 255].

    Args:
        color (:obj:`Color`/str/tuple/int/ndarray): Color inputs

    Returns:
        tuple[int]: A tuple of 3 integers indicating BGR channels.

    Raises:
        TypeError: If ``color`` is of none of the supported types.
    """
    if is_str(color):
        # Look the member up by name.
        return Color[color].value  # type: ignore
    if isinstance(color, Color):
        return color.value
    if isinstance(color, tuple):
        assert len(color) == 3
        assert all(0 <= channel <= 255 for channel in color)
        return color
    if isinstance(color, int):
        assert 0 <= color <= 255
        return (color, ) * 3
    if isinstance(color, np.ndarray):
        assert color.ndim == 1 and color.size == 3
        assert np.all((color >= 0) & (color <= 255))
        return tuple(color.astype(np.uint8))
    raise TypeError(f'Invalid type for color: {type(color)}')


================================================
FILE: mmcv/visualization/image.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional, Union

import cv2
import numpy as np

from mmcv.image import imread, imwrite
from .color import Color, color_val

# a type alias declares the optional types of color argument
ColorType = Union[Color, str, tuple, int, np.ndarray]


def imshow(img: Union[str, np.ndarray],
           win_name: str = '',
           wait_time: int = 0):
    """Display an image in an OpenCV window.

    Args:
        img (str or ndarray): The image to be displayed.
        win_name (str): The window name.
        wait_time (int): Value of waitKey param. With 0 the call returns
            once a key is pressed or the window is no longer visible.
    """
    cv2.imshow(win_name, imread(img))
    if wait_time != 0:
        cv2.waitKey(wait_time)
        return

    # Poll instead of blocking in waitKey(0) so that closing the window does
    # not leave the caller hanging.
    while True:
        key = cv2.waitKey(1)
        window_gone = cv2.getWindowProperty(win_name,
                                            cv2.WND_PROP_VISIBLE) < 1
        if window_gone or key != -1:
            return


def imshow_bboxes(img: Union[str, np.ndarray],
                  bboxes: Union[list, np.ndarray],
                  colors: ColorType = 'green',
                  top_k: int = -1,
                  thickness: int = 1,
                  show: bool = True,
                  win_name: str = '',
                  wait_time: int = 0,
                  out_file: Optional[str] = None):
    """Draw groups of bboxes on an image.

    Args:
        img (str or ndarray): The image to be displayed.
        bboxes (list or ndarray): A list of ndarray of shape (k, 4). A single
            ndarray is treated as a one-element list.
        colors (Color or str or tuple or int or ndarray): One color per bbox
            group when given as a list; otherwise the same color is used for
            every group.
        top_k (int): Plot the first k bboxes only if set positive.
        thickness (int): Thickness of lines.
        show (bool): Whether to show the image.
        win_name (str): The window name.
        wait_time (int): Value of waitKey param.
        out_file (str, optional): The filename to write the image.

    Returns:
        ndarray: The image with bboxes drawn on it.
    """
    canvas = np.ascontiguousarray(imread(img))

    if isinstance(bboxes, np.ndarray):
        bboxes = [bboxes]
    if not isinstance(colors, list):
        colors = [colors] * len(bboxes)
    colors = [color_val(c) for c in colors]
    assert len(bboxes) == len(colors)

    for group_idx, group in enumerate(bboxes):
        group = group.astype(np.int32)
        num_boxes = group.shape[0]
        # Non-positive top_k means "draw everything".
        limit = num_boxes if top_k <= 0 else min(top_k, num_boxes)
        for row in range(limit):
            x1, y1 = group[row, 0], group[row, 1]
            x2, y2 = group[row, 2], group[row, 3]
            cv2.rectangle(
                canvas, (x1, y1), (x2, y2),
                colors[group_idx],
                thickness=thickness)

    if show:
        imshow(canvas, win_name, wait_time)
    if out_file is not None:
        imwrite(canvas, out_file)
    return canvas


def imshow_det_bboxes(img: Union[str, np.ndarray],
                      bboxes: np.ndarray,
                      labels: np.ndarray,
                      class_names: Optional[List[str]] = None,
                      score_thr: float = 0,
                      bbox_color: ColorType = 'green',
                      text_color: ColorType = 'green',
                      thickness: int = 1,
                      font_scale: float = 0.5,
                      show: bool = True,
                      win_name: str = '',
                      wait_time: int = 0,
                      out_file: Optional[str] = None):
    """Draw detection boxes with their class label (and score) on an image.

    Args:
        img (str or ndarray): The image to be displayed.
        bboxes (ndarray): Bounding boxes (with scores), shaped (n, 4) or
            (n, 5).
        labels (ndarray): Labels of bboxes.
        class_names (list[str]): Names of each classes. Without it the text
            ``cls <label>`` is drawn.
        score_thr (float): Minimum score of bboxes to be shown. A positive
            value requires bboxes of shape (n, 5).
        bbox_color (Color or str or tuple or int or ndarray): Color
            of bbox lines.
        text_color (Color or str or tuple or int or ndarray): Color
            of texts.
        thickness (int): Thickness of lines.
        font_scale (float): Font scales of texts.
        show (bool): Whether to show the image.
        win_name (str): The window name.
        wait_time (int): Value of waitKey param.
        out_file (str or None): The filename to write the image.

    Returns:
        ndarray: The image with bboxes drawn on it.
    """
    assert bboxes.ndim == 2
    assert labels.ndim == 1
    assert bboxes.shape[0] == labels.shape[0]
    assert bboxes.shape[1] == 4 or bboxes.shape[1] == 5
    canvas = np.ascontiguousarray(imread(img))

    if score_thr > 0:
        # The score lives in the last column.
        assert bboxes.shape[1] == 5
        keep = bboxes[:, -1] > score_thr
        bboxes = bboxes[keep, :]
        labels = labels[keep]

    bbox_color = color_val(bbox_color)
    text_color = color_val(text_color)

    for bbox, label in zip(bboxes, labels):
        x1, y1, x2, y2 = bbox.astype(np.int32)[:4]
        cv2.rectangle(
            canvas, (x1, y1), (x2, y2), bbox_color, thickness=thickness)
        if class_names is not None:
            label_text = class_names[label]
        else:
            label_text = f'cls {label}'
        if len(bbox) > 4:
            label_text += f'|{bbox[-1]:.02f}'
        # Put the caption just above the top-left corner of the box.
        cv2.putText(canvas, label_text, (x1, y1 - 2),
                    cv2.FONT_HERSHEY_COMPLEX, font_scale, text_color)

    if show:
        imshow(canvas, win_name, wait_time)
    if out_file is not None:
        imwrite(canvas, out_file)
    return canvas


================================================
FILE: mmcv/visualization/optflow.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional, Union

import numpy as np

from mmcv.image import rgb2bgr
from mmcv.video import flowread
from .image import imshow


def flowshow(flow: Union[np.ndarray, str],
             win_name: str = '',
             wait_time: int = 0) -> None:
    """Visualize an optical flow map in a window.

    The flow is loaded with :func:`flowread`, converted with
    :func:`flow2rgb`, passed through ``rgb2bgr`` and shown with
    :func:`imshow`.

    Args:
        flow (ndarray or str): The optical flow to be displayed.
        win_name (str): The window name.
        wait_time (int): Value of waitKey param.
    """
    flow_field = flowread(flow)
    rgb_image = flow2rgb(flow_field)
    bgr_image = rgb2bgr(rgb_image)
    imshow(bgr_image, win_name, wait_time)


def flow2rgb(flow: np.ndarray,
             color_wheel: Optional[np.ndarray] = None,
             unknown_thr: float = 1e6) -> np.ndarray:
    """Render a flow map as an RGB image.

    The direction of each flow vector selects a (linearly interpolated)
    color from the wheel; its magnitude, normalized by the largest magnitude
    in the map, blends that color with white.

    Args:
        flow (ndarray): Array of optical flow with shape (h, w, 2).
        color_wheel (ndarray or None): Color wheel of shape (num_bins, 3)
            used to map flow field to RGB colorspace. Default color wheel
            will be used if not specified.
        unknown_thr (float): Components that are NaN or whose absolute value
            exceeds this threshold mark the pixel as unknown; unknown pixels
            are set to 0 in the output.

    Returns:
        ndarray: RGB image that can be visualized.
    """
    assert flow.ndim == 3 and flow.shape[-1] == 2
    if color_wheel is None:
        color_wheel = make_color_wheel()
    assert color_wheel.ndim == 2 and color_wheel.shape[1] == 3
    num_bins = color_wheel.shape[0]

    # Work on copies: the components are modified in place below.
    dx = flow[..., 0].copy()
    dy = flow[..., 1].copy()

    unknown = (
        np.isnan(dx) | np.isnan(dy) | (np.abs(dx) > unknown_thr) |
        (np.abs(dy) > unknown_thr))
    dx[unknown] = 0
    dy[unknown] = 0

    # Scale so that the longest vector has unit length, unless the whole
    # field is (numerically) zero.
    magnitude = np.sqrt(dx**2 + dy**2)
    if np.any(magnitude > np.finfo(float).eps):
        largest = np.max(magnitude)
        dx /= largest
        dy /= largest

    rad = np.sqrt(dx**2 + dy**2)
    angle = np.arctan2(-dy, -dx) / np.pi

    # Fractional position on the wheel and its two neighbouring bins.
    bin_pos = (angle + 1) / 2 * (num_bins - 1)
    lower = np.floor(bin_pos).astype(int)
    upper = (lower + 1) % num_bins
    frac = (bin_pos - lower.astype(np.float32))[..., None]
    rgb = (1 - frac) * color_wheel[lower, :] + frac * color_wheel[upper, :]

    within_unit = rad <= 1
    rgb[within_unit] = 1 - rad[within_unit, None] * (1 - rgb[within_unit])
    rgb[np.logical_not(within_unit)] *= 0.75

    rgb[unknown, :] = 0

    return rgb


def make_color_wheel(bins: Optional[Union[list, tuple]] = None) -> np.ndarray:
    """Build a color wheel.

    The wheel walks through six color transitions; within each one a single
    RGB channel ramps linearly while the other two stay fixed.

    Args:
        bins(list or tuple, optional): Specify the number of bins for each
            color range, corresponding to six ranges: red -> yellow,
            yellow -> green, green -> cyan, cyan -> blue, blue -> magenta,
            magenta -> red. [15, 6, 4, 11, 13, 6] is used for default
            (see Middlebury).

    Returns:
        ndarray: Color wheel of shape (total_bins, 3).
    """
    if bins is None:
        bins = [15, 6, 4, 11, 13, 6]
    assert len(bins) == 6

    RY, YG, GC, CB, BM, MR = tuple(bins)

    def rising(count):
        # Linear ramp 0, 1/count, ..., (count - 1)/count.
        return np.arange(count) / count

    # One (r, g, b) triple per transition; scalars broadcast over the range.
    transitions = [
        (1, rising(RY), 0),  # red -> yellow
        (1 - rising(YG), 1, 0),  # yellow -> green
        (0, 1, rising(GC)),  # green -> cyan
        (0, 1 - rising(CB), 1),  # cyan -> blue
        (rising(BM), 0, 1),  # blue -> magenta
        (1, 0, 1 - rising(MR)),  # magenta -> red
    ]

    total_bins = RY + YG + GC + CB + BM + MR
    wheel = np.zeros((3, total_bins), dtype=np.float32)

    begin = 0
    for seg_idx, rgb in enumerate(transitions):
        stop = begin + bins[seg_idx]
        for channel, values in enumerate(rgb):
            wheel[channel, begin:stop] = values
        begin = stop

    return wheel.T


================================================
FILE: requirements/build.txt
================================================
pytest-runner


================================================
FILE: requirements/docs.txt
================================================
docutils==0.16.0
markdown>=3.4.0
myst-parser
opencv-python
-e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
sphinx==4.0.2
sphinx-copybutton
sphinx_markdown_tables>=0.0.16
torch
urllib3<2.0.0


================================================
FILE: requirements/optional.txt
================================================
ninja
psutil


================================================
FILE: requirements/runtime.txt
================================================
addict
mmengine>=0.3.0
numpy
packaging
Pillow
pyyaml
regex;sys_platform=='win32'
yapf


================================================
FILE: requirements/test.txt
================================================
coverage
lmdb
onnx
onnxoptimizer
onnxruntime
pytest
PyTurboJPEG
scipy
tifffile


================================================
FILE: requirements.txt
================================================
-r requirements/build.txt
-r requirements/optional.txt
-r requirements/runtime.txt
-r requirements/test.txt


================================================
FILE: setup.cfg
================================================
[bdist_wheel]
universal=1

[aliases]
test=pytest

[yapf]
based_on_style = pep8
blank_line_before_nested_class_or_def = true
split_before_expression_after_opening_paren = true

[isort]
line_length = 79
multi_line_output = 0
extra_standard_library = pkg_resources,setuptools,logging,os,warnings,abc
known_first_party = mmcv
known_third_party = addict,cv2,matplotlib,numpy,onnx,packaging,pytest,pytorch_sphinx_theme,scipy,sphinx,torch,torchvision,yaml,yapf
no_lines_before = STDLIB,LOCALFOLDER
default_section = THIRDPARTY

# ignore-words-list needs to be lowercase format. For example, if we want to
# ignore word "BA", then we need to append "ba" to ignore-words-list rather
# than "BA"
[codespell]
quiet-level = 3
ignore-words-list = inout,hist,ba,ro,inh


================================================
FILE: setup.py
================================================
import glob
import os
import platform
import re
from pkg_resources import DistributionNotFound, get_distribution, parse_version
from setuptools import find_packages, setup

# Pick the extension build backend at import time. EXT_TYPE stays '' when
# torch cannot be imported; otherwise it is 'parrots' or 'pytorch'.
EXT_TYPE = ''
try:
    import torch
    if torch.__version__ == 'parrots':
        # Parrots reports the literal version string 'parrots'.
        from parrots.utils.build_extension import BuildExtension
        EXT_TYPE = 'parrots'
    elif (hasattr(torch, 'is_mlu_available') and torch.is_mlu_available()) or \
            os.getenv('FORCE_MLU', '0') == '1':
        # MLU build: detected at runtime or forced with FORCE_MLU=1.
        from torch_mlu.utils.cpp_extension import BuildExtension
        EXT_TYPE = 'pytorch'
    elif (hasattr(torch, 'is_musa_available') and torch.is_musa_available()) \
            or os.getenv('FORCE_MUSA', '0') == '1':
        # MUSA build: detected at runtime or forced with FORCE_MUSA=1.
        from torch_musa.utils.musa_extension import BuildExtension
        EXT_TYPE = 'pytorch'
    else:
        # Regular PyTorch build.
        from torch.utils.cpp_extension import BuildExtension
        EXT_TYPE = 'pytorch'
    cmd_class = {'build_ext': BuildExtension}
except ModuleNotFoundError:
    # Without torch (or the selected backend package) no build_ext command
    # is registered.
    cmd_class = {}
    print('Skip building ext ops due to the absence of torch.')


def choose_requirement(primary, secondary):
    """Return ``primary`` if any version of it is installed, else
    ``secondary``.

    Only the distribution name of ``primary`` (the part before the first of
    ``!``, ``<``, ``>`` or ``=``) is looked up; its version specifier is
    ignored for the check.
    """
    package = re.split(r'[!<>=]', primary)[0]
    try:
        get_distribution(package)
    except DistributionNotFound:
        return secondary
    else:
        return str(primary)


def get_version():
    """Return ``__version__`` as defined in ``mmcv/version.py``.

    The file is executed rather than imported, and the path is resolved
    relative to the current working directory.
    """
    version_file = 'mmcv/version.py'
    # Execute into an explicit namespace: relying on exec() to populate the
    # function's locals() no longer works on Python 3.13+ (PEP 667), where
    # locals() returns an independent snapshot.
    namespace = {}
    with open(version_file, encoding='utf-8') as f:
        exec(compile(f.read(), version_file, 'exec'), namespace)
    return namespace['__version__']


def parse_requirements(fname='requirements/runtime.txt', with_version=True):
    """Parse the package dependencies listed in a requirements file.

    Blank lines, comment lines and trailing ``# ...`` comments are ignored.
    ``-r <file>`` lines are followed recursively (the path is used as
    written, i.e. relative to the current working directory) and
    ``-e ...#egg=<name>`` lines contribute ``<name>``. Environment markers
    (the part after ``;``) are always kept.

    Args:
        fname (str): path to requirements file
        with_version (bool, default=True): if True keep the version
            specifier of each requirement, otherwise strip it.

    Returns:
        List[str]: list of requirements items; empty if ``fname`` does not
            exist.

    CommandLine:
        python -c "import setup; print(setup.parse_requirements())"
    """
    from os.path import exists

    # All PEP 440 comparison operators. Multi-character operators are listed
    # before their one-character prefixes so ``>=`` is never split as ``>``.
    version_pat = re.compile(r'(===|~=|==|!=|<=|>=|<|>)')
    # A ``#`` preceded by whitespace starts a trailing comment. A ``#``
    # glued to the text (as in ``#egg=name``) is left alone.
    inline_comment_pat = re.compile(r'\s+#.*$')

    def parse_line(line):
        """Parse information from a line in a requirements text file."""
        if line.startswith('-r '):
            # Allow specifying requirements in other files
            yield from parse_require_file(line.split()[1])
            return

        info = {'line': line}
        if line.startswith('-e '):
            info['package'] = line.split('#egg=')[1]
        else:
            # Split off the environment marker first so that comparison
            # operators inside the marker are never mistaken for the
            # package's own version specifier.
            requirement, _, marker = line.partition(';')
            marker = marker.strip()
            if marker:
                info['platform_deps'] = marker

            parts = [
                p.strip() for p in version_pat.split(requirement, maxsplit=1)
            ]
            info['package'] = parts[0]
            if len(parts) > 1:
                op, version = parts[1:]
                info['version'] = (op, version)
        yield info

    def parse_require_file(fpath):
        """Yield the parsed info of every requirement line in ``fpath``."""
        with open(fpath, encoding='utf-8') as f:
            for line in f:
                line = inline_comment_pat.sub('', line.strip())
                if line and not line.startswith('#'):
                    yield from parse_line(line)

    def gen_packages_items():
        """Yield one requirement string per parsed requirement."""
        if not exists(fname):
            return
        for info in parse_require_file(fname):
            parts = [info['package']]
            if with_version and 'version' in info:
                parts.extend(info['version'])
            platform_deps = info.get('platform_deps')
            if platform_deps is not None:
                parts.append(';' + platform_deps)
            yield ''.join(parts)

    return list(gen_packages_items())


# Runtime dependencies, read from the default file requirements/runtime.txt.
install_requires = parse_requirements()

try:
    # cv2 is already importable (e.g. OpenCV installed via conda): no pip
    # OpenCV requirement is added, only the major version is checked.
    import cv2  # NOQA: F401
    major, minor, *rest = cv2.__version__.split('.')
    if int(major) < 3:
        raise RuntimeError(
            f'OpenCV >=3 is required but {cv2.__version__} is installed')
except ImportError:
    # cv2 is not importable: require an OpenCV wheel. For each pair the
    # first entry is used if that distribution is already installed,
    # otherwise the second entry is used.
    CHOOSE_INSTALL_REQUIRES = [('opencv-python-headless>=3',
                                'opencv-python>=3')]
    for main, secondary in CHOOSE_INSTALL_REQUIRES:
        install_requires.append(choose_requirement(main, secondary))


def get_extensions():
    """Build the list of extension modules passed to ``setup()``.

    Nothing is built when the environment variable ``MMCV_WITH_OPS`` is
    ``'0'`` or when ``EXT_TYPE`` is neither ``'parrots'`` nor ``'pytorch'``.
    Otherwise a single ``mmcv._ext`` extension is configured; for the
    ``'pytorch'`` backend the device flavour is the first matching one of
    DIOPI, CUDA/ROCm, MLU, MPS, NPU, MUSA, falling back to CPU only.

    Returns:
        list: zero or one extension objects.
    """
    extensions = []

    # building of the compiled ops is explicitly disabled
    if os.getenv('MMCV_WITH_OPS', '1') == '0':
        return extensions

    if EXT_TYPE == 'parrots':
        ext_name = 'mmcv._ext'
        from parrots.utils.build_extension import Extension

        # new parrots op impl do not use MMCV_USE_PARROTS
        # define_macros = [('MMCV_USE_PARROTS', None)]
        define_macros = []
        include_dirs = []
        op_files = glob.glob('./mmcv/ops/csrc/pytorch/cuda/*.cu') +\
            glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') +\
            glob.glob('./mmcv/ops/csrc/parrots/*.cpp')
        include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))
        include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/cuda'))
        # These three sources are excluded from the parrots build.
        # list.remove raises ValueError if a file was not found by the globs.
        op_files.remove('./mmcv/ops/csrc/pytorch/cuda/iou3d_cuda.cu')
        op_files.remove('./mmcv/ops/csrc/pytorch/cpu/bbox_overlaps_cpu.cpp')
        op_files.remove('./mmcv/ops/csrc/pytorch/cuda/bias_act_cuda.cu')
        # optional extra nvcc argument supplied by the user
        cuda_args = os.getenv('MMCV_CUDA_ARGS')
        extra_compile_args = {
            'nvcc': [cuda_args, '-std=c++14'] if cuda_args else ['-std=c++14'],
            'cxx': ['-std=c++14'],
        }
        if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':
            define_macros += [('MMCV_WITH_CUDA', None)]
            extra_compile_args['nvcc'] += [
                '-D__CUDA_NO_HALF_OPERATORS__',
                '-D__CUDA_NO_HALF_CONVERSIONS__',
                '-D__CUDA_NO_HALF2_OPERATORS__',
            ]
        ext_ops = Extension(
            name=ext_name,
            sources=op_files,
            include_dirs=include_dirs,
            define_macros=define_macros,
            extra_compile_args=extra_compile_args,
            cuda=True,
            pytorch=True)
        extensions.append(ext_ops)
    elif EXT_TYPE == 'pytorch':
        ext_name = 'mmcv._ext'
        from torch.utils.cpp_extension import CppExtension, CUDAExtension

        # prevent ninja from using too many resources
        # NOTE(review): max() makes 4 a lower bound for the job count rather
        # than a cap - confirm this is the intended behaviour.
        try:
            import psutil
            num_cpu = len(psutil.Process().cpu_affinity())
            cpu_use = max(4, num_cpu - 1)
        except (ModuleNotFoundError, AttributeError):
            cpu_use = 4

        # a MAX_JOBS value already present in the environment is kept
        os.environ.setdefault('MAX_JOBS', str(cpu_use))
        define_macros = []

        # Before PyTorch1.8.0, when compiling CUDA code, `cxx` is a
        # required key passed to PyTorch. Even if there is no flag passed
        # to cxx, users also need to pass an empty list to PyTorch.
        # Since PyTorch1.8.0, it has a default value so users do not need
        # to pass an empty list anymore.
        # More details at https://github.com/pytorch/pytorch/pull/45956
        extra_compile_args = {'cxx': []}

        # C++14 for torch <= 1.12.1, C++17 for newer versions; only the flag
        # spelling differs between Windows and the other platforms.
        if platform.system() != 'Windows':
            if parse_version(torch.__version__) <= parse_version('1.12.1'):
                extra_compile_args['cxx'] = ['-std=c++14']
            else:
                extra_compile_args['cxx'] = ['-std=c++17']
        else:
            if parse_version(torch.__version__) <= parse_version('1.12.1'):
                extra_compile_args['cxx'] = ['/std:c++14']
            else:
                extra_compile_args['cxx'] = ['/std:c++17']

        include_dirs = []
        library_dirs = []
        libraries = []

        extra_objects = []
        extra_link_args = []
        # ROCm build: torch reports a HIP version and ROCM_HOME is set
        is_rocm_pytorch = False
        try:
            from torch.utils.cpp_extension import ROCM_HOME
            is_rocm_pytorch = True if ((torch.version.hip is not None) and
                                       (ROCM_HOME is not None)) else False
        except ImportError:
            pass

        if os.getenv('MMCV_WITH_DIOPI', '0') == '1':
            import mmengine  # NOQA: F401
            from mmengine.utils.version_utils import digit_version
            assert digit_version(mmengine.__version__) >= digit_version(
                '0.7.4'), f'mmengine >= 0.7.4 is required \
                but {mmengine.__version__} is installed'

            print(f'Compiling {ext_name} with CPU and DIPU')
            define_macros += [('MMCV_WITH_DIOPI', None)]
            define_macros += [('DIOPI_ATTR_WEAK', None)]
            op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \
                glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp')
            extension = CppExtension
            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))
            # Locations are taken from the environment. os.getenv returns
            # None for an unset variable, so the string concatenations below
            # raise TypeError if DIOPI_PATH, DIPU_PATH or PYTORCH_DIR is
            # missing.
            dipu_root = os.getenv('DIPU_ROOT')
            diopi_path = os.getenv('DIOPI_PATH')
            dipu_path = os.getenv('DIPU_PATH')
            vendor_include_dirs = os.getenv('VENDOR_INCLUDE_DIRS')
            nccl_include_dirs = os.getenv('NCCL_INCLUDE_DIRS')
            pytorch_dir = os.getenv('PYTORCH_DIR')
            include_dirs.append(dipu_root)
            include_dirs.append(diopi_path + '/include')
            include_dirs.append(dipu_path + '/dist/include')
            include_dirs.append(vendor_include_dirs)
            # NOTE(review): unlike the lines above there is no '/' before
            # 'torch/include', so PYTORCH_DIR presumably has to end with a
            # path separator - confirm.
            include_dirs.append(pytorch_dir + 'torch/include')
            if nccl_include_dirs:
                include_dirs.append(nccl_include_dirs)
            library_dirs += [dipu_root]
            libraries += ['torch_dipu']
        elif is_rocm_pytorch or torch.cuda.is_available() or os.getenv(
                'FORCE_CUDA', '0') == '1':
            # CUDA build; a ROCm build additionally defines MMCV_WITH_HIP
            if is_rocm_pytorch:
                define_macros += [('MMCV_WITH_HIP', None)]
            define_macros += [('MMCV_WITH_CUDA', None)]
            cuda_args = os.getenv('MMCV_CUDA_ARGS')
            extra_compile_args['nvcc'] = [cuda_args] if cuda_args else []
            op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \
                glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') + \
                glob.glob('./mmcv/ops/csrc/pytorch/cuda/*.cu') + \
                glob.glob('./mmcv/ops/csrc/pytorch/cuda/*.cpp')
            extension = CUDAExtension
            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/pytorch'))
            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))
            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/cuda'))
        elif (hasattr(torch, 'is_mlu_available') and
                torch.is_mlu_available()) or \
                os.getenv('FORCE_MLU', '0') == '1':
            from torch_mlu.utils.cpp_extension import MLUExtension

            def get_mluops_version(file_path):
                """Return 'v<major>.<minor>.<patchlevel>' read from a header.

                For every line containing MLUOP_MAJOR, MLUOP_MINOR or
                MLUOP_PATCHLEVEL the third space-separated token is taken;
                the last matching line wins. A header lacking one of the
                three names leaves the corresponding variable unbound.
                """
                with open(file_path) as f:
                    for line in f:
                        if re.search('MLUOP_MAJOR', line):
                            major = line.strip().split(' ')[2]
                        if re.search('MLUOP_MINOR', line):
                            minor = line.strip().split(' ')[2]
                        if re.search('MLUOP_PATCHLEVEL', line):
                            patchlevel = line.strip().split(' ')[2]
                mluops_version = f'v{major}.{minor}.{patchlevel}'
                return mluops_version

            # the mlu-ops version expected by the bundled helper header
            mmcv_mluops_version = get_mluops_version(
                './mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h')
            mlu_ops_path = os.getenv('MMCV_MLU_OPS_PATH')
            if mlu_ops_path:
                # a user supplied mlu-ops checkout: its version must match
                exists_mluops_version = get_mluops_version(
                    mlu_ops_path + '/bangc-ops/mlu_op.h')
                if exists_mluops_version != mmcv_mluops_version:
                    print('the version of mlu-ops provided is %s,'
                          ' while %s is needed.' %
                          (exists_mluops_version, mmcv_mluops_version))
                    exit()
                # Make ./mlu-ops a symlink to the supplied checkout. An
                # existing symlink is replaced.
                # NOTE(review): for an existing non-link path os.symlink
                # fails, which surfaces as the FileExistsError below -
                # looks intentional, confirm.
                try:
                    if os.path.exists('mlu-ops'):
                        if os.path.islink('mlu-ops'):
                            os.remove('mlu-ops')
                            os.symlink(mlu_ops_path, 'mlu-ops')
                        elif os.path.abspath('mlu-ops') != mlu_ops_path:
                            os.symlink(mlu_ops_path, 'mlu-ops')
                    else:
                        os.symlink(mlu_ops_path, 'mlu-ops')
                except Exception:
                    raise FileExistsError(
                        'mlu-ops already exists, please move it out,'
                        'or rename or remove it.')
            else:
                if not os.path.exists('mlu-ops'):
                    # no local copy: download the matching tagged archive
                    # from GitHub and unpack it as ./mlu-ops
                    import requests
                    mluops_url = 'https://github.com/Cambricon/mlu-ops/' + \
                        'archive/refs/tags/' + mmcv_mluops_version + '.zip'
                    req = requests.get(mluops_url)
                    with open('./mlu-ops.zip', 'wb') as f:
                        try:
                            f.write(req.content)
                        except Exception:
                            raise ImportError('failed to download mlu-ops')

                    from zipfile import BadZipFile, ZipFile
                    with ZipFile('./mlu-ops.zip', 'r') as archive:
                        try:
                            archive.extractall()
                            # top-level directory name inside the archive
                            dir_name = archive.namelist()[0].split('/')[0]
                            os.rename(dir_name, 'mlu-ops')
                        except BadZipFile:
                            print('invalid mlu-ops.zip file')
                else:
                    # an existing ./mlu-ops must have the expected version
                    exists_mluops_version = get_mluops_version(
                        './mlu-ops/bangc-ops/mlu_op.h')
                    if exists_mluops_version != mmcv_mluops_version:
                        print('the version of provided mlu-ops is %s,'
                              ' while %s is needed.' %
                              (exists_mluops_version, mmcv_mluops_version))
                        exit()

            define_macros += [('MMCV_WITH_MLU', None)]
            mlu_args = os.getenv('MMCV_MLU_ARGS', '-DNDEBUG ')
            mluops_includes = []
            mluops_includes.append('-I' +
                                   os.path.abspath('./mlu-ops/bangc-ops'))
            mluops_includes.append(
                '-I' + os.path.abspath('./mlu-ops/bangc-ops/kernels'))
            # the conditional expression applies to the whole concatenation:
            # mlu_args is prepended only when it is non-empty
            extra_compile_args['cncc'] = [mlu_args] + \
                mluops_includes if mlu_args else mluops_includes
            extra_compile_args['cxx'] += ['-fno-gnu-unique']
            op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \
                glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') + \
                glob.glob('./mmcv/ops/csrc/pytorch/mlu/*.cpp') + \
                glob.glob('./mmcv/ops/csrc/common/mlu/*.mlu') + \
                glob.glob(
                    './mlu-ops/bangc-ops/core/**/*.cpp', recursive=True) + \
                glob.glob(
                    './mlu-ops/bangc-ops/kernels/**/*.cpp', recursive=True) + \
                glob.glob(
                    './mlu-ops/bangc-ops/kernels/**/*.mlu', recursive=True)
            extra_link_args = [
                '-Wl,--whole-archive',
                './mlu-ops/bangc-ops/kernels/kernel_wrapper/lib/libextops.a',
                '-Wl,--no-whole-archive'
            ]
            extension = MLUExtension
            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))
            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/mlu'))
            include_dirs.append(os.path.abspath('./mlu-ops/bangc-ops'))
        elif (hasattr(torch.backends, 'mps')
              and torch.backends.mps.is_available()) or os.getenv(
                  'FORCE_MPS', '0') == '1':
            # objc compiler support
            from distutils.unixccompiler import UnixCCompiler
            if '.mm' not in UnixCCompiler.src_extensions:
                UnixCCompiler.src_extensions.append('.mm')
                UnixCCompiler.language_map['.mm'] = 'objc'

            define_macros += [('MMCV_WITH_MPS', None)]
            # the cxx flags chosen above are discarded for the MPS build
            extra_compile_args = {}
            extra_compile_args['cxx'] = ['-Wall', '-std=c++17']
            extra_compile_args['cxx'] += [
                '-framework', 'Metal', '-framework', 'Foundation'
            ]
            extra_compile_args['cxx'] += ['-ObjC++']
            # src
            op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \
                glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp')
            # TODO: support mps ops on torch>=2.1.0
            if parse_version(torch.__version__) < parse_version('2.1.0'):
                op_files += glob.glob('./mmcv/ops/csrc/common/mps/*.mm') + \
                    glob.glob('./mmcv/ops/csrc/pytorch/mps/*.mm')
            extension = CppExtension
            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))
            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/mps'))
        elif (os.getenv('FORCE_NPU', '0') == '1'):
            # the NPU build is only selected explicitly through FORCE_NPU=1
            print(f'Compiling {ext_name} only with CPU and NPU')
            try:
                import importlib

                from torch_npu.utils.cpp_extension import NpuExtension
                extra_compile_args['cxx'] += [
                    '-D__FILENAME__=\"$$(notdir $$(abspath $$<))\"'
                ]
                # headers shipped inside the installed torch_npu package
                extra_compile_args['cxx'] += [
                    '-I' + importlib.util.find_spec(
                        'torch_npu').submodule_search_locations[0] +
                    '/include/third_party/acl/inc'
                ]
                extra_compile_args['cxx'] += [
                    '-I' + importlib.util.find_spec(
                        'torch_npu').submodule_search_locations[0] +
                    '/include/third_party/hccl/inc'
                ]
                define_macros += [('MMCV_WITH_NPU', None)]
                extension = NpuExtension
                # exactly one of the two macros is defined, depending on
                # whether torch is older than 2.1.0
                if parse_version(torch.__version__) < parse_version('2.1.0'):
                    define_macros += [('MMCV_WITH_XLA', None)]
                if parse_version(torch.__version__) >= parse_version('2.1.0'):
                    define_macros += [('MMCV_WITH_KPRIVATE', None)]
            except Exception:
                # any failure in the block above is reported as a missing
                # torch_npu
                raise ImportError('can not find any torch_npu')
            # src
            op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \
                glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') + \
                glob.glob('./mmcv/ops/csrc/common/npu/*.cpp') + \
                glob.glob('./mmcv/ops/csrc/pytorch/npu/*.cpp')
            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))
            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/npu'))
        elif hasattr(torch, 'musa') or os.getenv('FORCE_MUSA', '0') == '1':
            from torch_musa.testing import get_musa_arch
            from torch_musa.utils.musa_extension import MUSAExtension
            # the MUSA architecture is passed both as a macro and through
            # the MUSA_ARCH environment variable
            define_macros += [('MMCV_WITH_MUSA', None),
                              ('MUSA_ARCH', str(get_musa_arch()))]
            os.environ['MUSA_ARCH'] = str(get_musa_arch())
            op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \
                glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') + \
                glob.glob('./mmcv/ops/csrc/pytorch/musa/*.mu') + \
                glob.glob('./mmcv/ops/csrc/pytorch/musa/*.cpp')
            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/pytorch'))
            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))
            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/musa'))
            extension = MUSAExtension
        else:
            # no accelerator selected: CPU-only build
            print(f'Compiling {ext_name} only with CPU')
            op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \
                glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp')
            extension = CppExtension
            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))

        # Since the PR (https://github.com/open-mmlab/mmcv/pull/1463) uses
        # c++14 features, the argument ['std=c++14'] must be added here.
        # However, in the windows environment, some standard libraries
        # will depend on c++17 or higher. In fact, for the windows
        # environment, the compiler will choose the appropriate compiler
        # to compile those cpp files, so there is no need to add the
        # argument
        if 'nvcc' in extra_compile_args and platform.system() != 'Windows':
            if parse_version(torch.__version__) <= parse_version('1.12.1'):
                extra_compile_args['nvcc'] += ['-std=c++14']
            else:
                extra_compile_args['nvcc'] += ['-std=c++17']

        # instantiate whichever extension class the selected branch chose
        ext_ops = extension(
            name=ext_name,
            sources=op_files,
            include_dirs=include_dirs,
            define_macros=define_macros,
            extra_objects=extra_objects,
            extra_compile_args=extra_compile_args,
            library_dirs=library_dirs,
            libraries=libraries,
            extra_link_args=extra_link_args)
        extensions.append(ext_ops)
    return extensions


# Package metadata and build configuration. The version, the requirement
# lists and the extension modules are computed by the helpers defined above.
setup(
    # The distribution is named 'mmcv-lite' unless MMCV_WITH_OPS is exactly
    # '1' (the default).
    # NOTE(review): get_extensions() only skips the ops when the variable is
    # '0', so any other value yields 'mmcv-lite' with ops built - confirm.
    name='mmcv' if os.getenv('MMCV_WITH_OPS', '1') == '1' else 'mmcv-lite',
    version=get_version(),
    description='OpenMMLab Computer Vision Foundation',
    keywords='computer vision',
    packages=find_packages(),
    include_package_data=True,
    classifiers=[
        'Development Status :: 4 - Beta',
        'License :: OSI Approved :: Apache Software License',
        'Operating System :: OS Independent',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Topic :: Utilities',
    ],
    url='https://github.com/open-mmlab/mmcv',
    author='MMCV Contributors',
    author_email='openmmlab@gmail.com',
    install_requires=install_requires,
    # optional dependency groups, each read from its own requirements file
    extras_require={
        'all': parse_requirements('requirements.txt'),
        'tests': parse_requirements('requirements/test.txt'),
        'build': parse_requirements('requirements/build.txt'),
        'optional': parse_requirements('requirements/optional.txt'),
    },
    python_requires='>=3.7',
    ext_modules=get_extensions(),
    # {'build_ext': BuildExtension} when a backend was found, otherwise {}
    cmdclass=cmd_class,
    zip_safe=False)


================================================
FILE: tests/test_arraymisc.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.

import numpy as np
import pytest

import mmcv


def test_quantize():
    """Check shape, dtype, per-element values and argument validation of
    ``mmcv.quantize``."""
    num_levels = 20
    data = np.random.randn(10, 10)

    # default dtype: compare every element with a scalar reference
    quantized = mmcv.quantize(data, -1, 1, num_levels)
    assert quantized.shape == data.shape
    assert quantized.dtype == np.dtype('int64')
    for idx in np.ndindex(*data.shape):
        clipped = max(min(data[idx], 1), -1)
        expected_bin = int(np.floor(10 * (1 + clipped)))
        assert quantized[idx] == min(num_levels - 1, expected_bin)

    # an explicitly requested dtype is honoured
    quantized = mmcv.quantize(data, -1, 1, 20, dtype=np.uint8)
    assert quantized.shape == data.shape
    assert quantized.dtype == np.dtype('uint8')

    # each of these argument sets must be rejected with ValueError
    rejected_calls = [
        ((-1, 1), dict(levels=0)),
        ((-1, 1), dict(levels=10.0)),
        ((2, 1, num_levels), dict()),
    ]
    for args, kwargs in rejected_calls:
        with pytest.raises(ValueError):
            mmcv.quantize(data, *args, **kwargs)


def test_dequantize():
    """Check shape, dtype, per-element values and argument validation of
    ``mmcv.dequantize``."""
    num_levels = 20
    quantized = np.random.randint(num_levels, size=(10, 10))

    # default dtype: compare every element with a scalar reference
    restored = mmcv.dequantize(quantized, -1, 1, num_levels)
    assert restored.shape == quantized.shape
    assert restored.dtype == np.dtype('float64')
    for idx in np.ndindex(*quantized.shape):
        assert restored[idx] == (quantized[idx] + 0.5) / 10 - 1

    # an explicitly requested dtype is honoured
    restored = mmcv.dequantize(quantized, -1, 1, num_levels, dtype=np.float32)
    assert restored.shape == quantized.shape
    assert restored.dtype == np.dtype('float32')

    # each of these argument sets must be rejected with ValueError
    rejected_calls = [
        ((-1, 1), dict(levels=0)),
        ((-1, 1), dict(levels=10.0)),
        ((2, 1, num_levels), dict()),
    ]
    for args, kwargs in rejected_calls:
        with pytest.raises(ValueError):
            mmcv.dequantize(restored, *args, **kwargs)


def test_joint():
    """Round-trip ``quantize`` -> ``dequantize`` and bound the error."""
    # 1000 levels over [-1, 1]
    num_levels = 1000
    source = np.random.randn(100, 100)
    restored = mmcv.dequantize(
        mmcv.quantize(source, -1, 1, num_levels), -1, 1, num_levels)

    below = source < -1
    above = source > 1
    inside = (source >= -1) & (source <= 1)
    # values outside the range come back as -0.999 / 0.999
    assert np.abs(restored[below] + 0.999).max() < 1e-6
    assert np.abs(restored[above] - 0.999).max() < 1e-6
    # values inside the range are recovered to within 1e-3
    assert np.abs((restored - source)[inside]).max() <= 1e-3

    # 99 levels: inputs clipped to [-0.01, 0.01] are all recovered as 0
    num_levels = 99
    source = np.clip(np.random.randn(100) / 1000, -0.01, 0.01)
    restored = mmcv.dequantize(
        mmcv.quantize(source, -1, 1, num_levels), -1, 1, num_levels)
    assert np.all(restored == 0)


================================================
FILE: tests/test_cnn/test_build_layers.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import inspect
from importlib import import_module

import numpy as np
import pytest
import torch
import torch.nn as nn
from mmengine.registry import MODELS
from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm
from torch.nn import ReflectionPad2d, Upsample

from mmcv.cnn.bricks import (ContextBlock, ConvModule, ConvTranspose2d,
                             GeneralizedAttention, NonLocal2d,
                             build_activation_layer, build_conv_layer,
                             build_norm_layer, build_padding_layer,
                             build_plugin_layer, build_upsample_layer, is_norm)
from mmcv.cnn.bricks.activation import Clamp
from mmcv.cnn.bricks.norm import infer_abbr as infer_norm_abbr
from mmcv.cnn.bricks.plugin import infer_abbr as infer_plugin_abbr
from mmcv.cnn.bricks.upsample import PixelShufflePack


def test_build_conv_layer():
    """Check ``build_conv_layer`` error handling, the built-in conv types and
    every conv-like entry of the ``MODELS`` registry."""
    # cfg must be a dict
    with pytest.raises(TypeError):
        build_conv_layer('Conv2d')

    # `type` must be in cfg
    with pytest.raises(KeyError):
        build_conv_layer(dict(kernel_size=3))

    # unsupported conv type
    with pytest.raises(KeyError):
        build_conv_layer(dict(type='FancyConv'))

    kwargs = dict(
        in_channels=4, out_channels=8, kernel_size=3, groups=2, dilation=2)

    # cfg=None and 'Conv' are expected to give nn.Conv2d, 'deconv' is
    # expected to give nn.ConvTranspose2d
    builtin_cases = (
        (None, nn.Conv2d),
        (dict(type='Conv'), nn.Conv2d),
        (dict(type='deconv'), nn.ConvTranspose2d),
    )
    for conv_cfg, expected_cls in builtin_cases:
        built = build_conv_layer(conv_cfg, **kwargs)
        assert isinstance(built, expected_cls)
        assert built.in_channels == kwargs['in_channels']
        assert built.out_channels == kwargs['out_channels']
        assert built.kernel_size == (kwargs['kernel_size'],
                                     kwargs['kernel_size'])
        assert built.groups == kwargs['groups']
        assert built.dilation == (kwargs['dilation'], kwargs['dilation'])

    # sparse convs cannot support the case when groups>1
    kwargs.pop('groups')

    no_dilation_types = ('SparseInverseConv2d', 'SparseInverseConv3d')
    for registered_name, registered_cls in MODELS.module_dict.items():
        # build once from the registered name and once from the class itself
        for type_spec in (registered_name, registered_cls):
            # SparseInverseConv2d and SparseInverseConv3d do not have the
            # argument 'dilation'
            if registered_name in no_dilation_types:
                kwargs.pop('dilation')
            if 'conv' not in registered_name.lower():
                continue
            built = build_conv_layer(dict(type=type_spec), **kwargs)
            assert isinstance(built, registered_cls)
            assert built.in_channels == kwargs['in_channels']
            assert built.out_channels == kwargs['out_channels']
            kwargs['dilation'] = 2  # recover the key


def test_infer_norm_abbr():
    """Check the abbreviation ``infer_norm_abbr`` returns for several
    locally defined classes."""
    # class_type must be a class
    with pytest.raises(TypeError):
        infer_norm_abbr(0)

    # a class carrying an explicit `_abbr_` attribute
    class MyNorm:

        _abbr_ = 'mn'

    # classes without `_abbr_`, differing only in their names
    class FancyBatchNorm:
        pass

    class FancyInstanceNorm:
        pass

    class FancyLayerNorm:
        pass

    class FancyGroupNorm:
        pass

    class FancyNorm:
        pass

    expected_abbr = {
        MyNorm: 'mn',
        FancyBatchNorm: 'bn',
        FancyInstanceNorm: 'in',
        FancyLayerNorm: 'ln',
        FancyGroupNorm: 'gn',
        FancyNorm: 'norm_layer',
    }
    for norm_cls, abbr in expected_abbr.items():
        assert infer_norm_abbr(norm_cls) == abbr


def test_build_norm_layer():
    """Check ``build_norm_layer`` error handling and the (name, layer) pair
    it returns for the registered norm types."""
    # cfg must be a dict
    with pytest.raises(TypeError):
        build_norm_layer('BN', 3)

    # `type` must be in cfg
    with pytest.raises(KeyError):
        build_norm_layer(dict(), 3)

    # unsupported norm type
    with pytest.raises(KeyError):
        build_norm_layer(dict(type='FancyNorm'), 3)

    # postfix must be int or str
    with pytest.raises(AssertionError):
        build_norm_layer(dict(type='BN'), 3, postfix=[1, 2])

    # `num_groups` must be in cfg when using 'GN'
    with pytest.raises(AssertionError):
        build_norm_layer(dict(type='GN'), 3)

    # test each type of norm layer in norm_cfg
    abbr_mapping = {
        'BN': 'bn',
        'BN1d': 'bn',
        'BN2d': 'bn',
        'BN3d': 'bn',
        'SyncBN': 'bn',
        'GN': 'gn',
        'LN': 'ln',
        'IN': 'in',
        'IN1d': 'in',
        'IN2d': 'in',
        'IN3d': 'in',
    }
    for registered_name, registered_cls in MODELS.module_dict.items():
        # only the types listed above are checked; MMSyncBN is skipped
        if registered_name not in abbr_mapping \
                or registered_name == 'MMSyncBN':
            continue
        is_group_norm = registered_name == 'GN'
        for postfix in ['_test', 1]:
            # build once from the registered name and once from the class
            for type_spec in (registered_name, registered_cls):
                cfg = dict(type=type_spec)
                if is_group_norm:
                    cfg['num_groups'] = 3
                name, layer = build_norm_layer(cfg, 3, postfix=postfix)
                assert name == abbr_mapping[registered_name] + str(postfix)
                assert isinstance(layer, registered_cls)
                if is_group_norm:
                    assert layer.num_channels == 3
                    assert layer.num_groups == cfg['num_groups']
                elif registered_name != 'LN':
                    assert layer.num_features == 3


def test_build_activation_layer():
    """Check ``build_activation_layer`` error handling, every registered
    activation type and the clamping range of ``Clamp``/``Clip``."""
    act_names = [
        'ReLU', 'LeakyReLU', 'PReLU', 'RReLU', 'ReLU6', 'ELU', 'Sigmoid',
        'Tanh'
    ]

    # also collect every nn.Module subclass exposed by the activation modules
    for module_name in ['activation', 'hsigmoid', 'hswish', 'swish']:
        act_module = import_module(f'mmcv.cnn.bricks.{module_name}')
        for key, value in act_module.__dict__.items():
            if inspect.isclass(value) and issubclass(value, nn.Module):
                act_names.append(key)

    with pytest.raises(TypeError):
        # cfg must be a dict
        cfg = 'ReLU'
        build_activation_layer(cfg)

    with pytest.raises(KeyError):
        # `type` must be in cfg
        cfg = dict()
        build_activation_layer(cfg)

    with pytest.raises(KeyError):
        # unsupported activation type
        cfg = dict(type='FancyReLU')
        build_activation_layer(cfg)

    # test each type of activation layer in activation_cfg
    for type_name, module in MODELS.module_dict.items():
        if type_name in act_names:
            cfg['type'] = type_name
            layer = build_activation_layer(cfg)
            assert isinstance(layer, module)

    # sanity check for Clamp, built from its registered name and from the
    # class itself (the loop variable must be used, otherwise the class
    # path is never exercised)
    for type_name in ('Clamp', Clamp):
        act = build_activation_layer(dict(type=type_name))
        x = torch.randn(10) * 1000
        y = act(x)
        assert np.logical_and((y >= -1).numpy(), (y <= 1).numpy()).all()

    # custom lower bound, built through the 'Clip' name
    act = build_activation_layer(dict(type='Clip', min=0))
    y = act(x)
    assert np.logical_and((y >= 0).numpy(), (y <= 1).numpy()).all()
    # custom upper bound
    act = build_activation_layer(dict(type='Clamp', max=0))
    y = act(x)
    assert np.logical_and((y >= -1).numpy(), (y <= 0).numpy()).all()


def test_build_padding_layer():
    """Check ``build_padding_layer`` error handling and built layers."""
    # names to look for among the modules registered in ``MODELS``: the
    # three aliases plus every nn.Module subclass visible in the padding
    # module
    pad_module = import_module('mmcv.cnn.bricks.padding')
    pad_names = ['zero', 'reflect', 'replicate']
    pad_names.extend(
        key for key, value in vars(pad_module).items()
        if inspect.isclass(value) and issubclass(value, nn.Module))

    # cfg must be a dict
    with pytest.raises(TypeError):
        build_padding_layer('reflect')

    # `type` must be in cfg
    with pytest.raises(KeyError):
        build_padding_layer(dict())

    # unsupported padding type
    with pytest.raises(KeyError):
        build_padding_layer(dict(type='FancyPad'))

    # every registered padding layer can be built with a padding of 2
    for registered_name, registered_cls in MODELS.module_dict.items():
        if registered_name not in pad_names:
            continue
        built = build_padding_layer(dict(type=registered_name), 2)
        assert isinstance(built, registered_cls)

    # the layer can be named by class or by alias; padding 2 turns 5x5
    # into 9x9
    for pad_type in (ReflectionPad2d, 'reflect'):
        input_x = torch.randn(1, 2, 5, 5)
        padding_layer = build_padding_layer(dict(type=pad_type), 2)
        assert padding_layer(input_x).shape == (1, 2, 9, 9)


def test_upsample_layer():
    """Check ``build_upsample_layer`` error handling and built layers."""
    with pytest.raises(TypeError):
        # cfg must be a dict
        cfg = 'bilinear'
        build_upsample_layer(cfg)

    with pytest.raises(KeyError):
        # `type` must be in cfg
        cfg = dict()
        build_upsample_layer(cfg)

    with pytest.raises(KeyError):
        # unsupported upsample type
        cfg = dict(type='FancyUpsample')
        build_upsample_layer(cfg)

    for type_name in ['nearest', 'bilinear']:
        # build a fresh cfg instead of reusing the dict left over from the
        # ``pytest.raises`` block above
        cfg = dict(type=type_name)
        layer = build_upsample_layer(cfg)
        assert isinstance(layer, nn.Upsample)
        assert layer.mode == type_name

    # the layer class itself can be given as `type`
    cfg = dict()
    cfg['type'] = Upsample
    layer_from_cls = build_upsample_layer(cfg)
    assert isinstance(layer_from_cls, nn.Upsample)
    assert layer_from_cls.mode == 'nearest'

    # constructor arguments supplied inside cfg
    cfg = dict(
        type='deconv', in_channels=3, out_channels=3, kernel_size=3, stride=2)
    layer = build_upsample_layer(cfg)
    assert isinstance(layer, nn.ConvTranspose2d)

    # exercise both the registry name and the class (the loop variable was
    # previously ignored, so only the class form was ever tested here)
    for type_name in ('deconv', ConvTranspose2d):
        cfg = dict(type=type_name)
        kwargs = dict(in_channels=3, out_channels=3, kernel_size=3, stride=2)

        # constructor arguments supplied as keyword arguments
        layer = build_upsample_layer(cfg, **kwargs)
        assert isinstance(layer, nn.ConvTranspose2d)
        assert layer.in_channels == kwargs['in_channels']
        assert layer.out_channels == kwargs['out_channels']
        assert layer.kernel_size == (kwargs['kernel_size'],
                                     kwargs['kernel_size'])
        assert layer.stride == (kwargs['stride'], kwargs['stride'])

        # the same constructor arguments supplied positionally
        layer = build_upsample_layer(cfg, 3, 3, 3, 2)
        assert isinstance(layer, nn.ConvTranspose2d)
        assert layer.in_channels == kwargs['in_channels']
        assert layer.out_channels == kwargs['out_channels']
        assert layer.kernel_size == (kwargs['kernel_size'],
                                     kwargs['kernel_size'])
        assert layer.stride == (kwargs['stride'], kwargs['stride'])

    for type_name in ('pixel_shuffle', PixelShufflePack):
        cfg = dict(
            type=type_name,
            in_channels=3,
            out_channels=3,
            scale_factor=2,
            upsample_kernel=3)
        layer = build_upsample_layer(cfg)

        assert isinstance(layer, PixelShufflePack)
        assert layer.scale_factor == 2
        assert layer.upsample_kernel == 3


def test_pixel_shuffle_pack():
    """A 2x ``PixelShufflePack`` doubles the spatial size of its input."""
    feats = torch.rand(2, 3, 10, 10)
    upsampler = PixelShufflePack(3, 3, scale_factor=2, upsample_kernel=3)
    # `upsample_kernel` is expected to show up as the inner conv kernel size
    assert upsampler.upsample_conv.kernel_size == (3, 3)
    upsampled = upsampler(feats)
    assert upsampled.shape == (2, 3, 20, 20)


def test_is_norm():
    """Check ``is_norm`` on norm layers, a subclass, a conv and bad input."""
    # (norm class, constructor args); GroupNorm is the only one built with
    # two positional arguments
    single_arg_norms = (
        nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.InstanceNorm1d,
        nn.InstanceNorm2d, nn.InstanceNorm3d, nn.LayerNorm
    )
    norm_cases = [(norm_cls, (3, )) for norm_cls in single_arg_norms]
    norm_cases.append((nn.GroupNorm, (3, 6)))

    for norm_cls, init_args in norm_cases:
        norm_layer = norm_cls(*init_args)
        assert is_norm(norm_layer)
        # excluding the layer's own class makes it no longer count
        assert not is_norm(norm_layer, exclude=(norm_cls, ))

    class MyNorm(nn.BatchNorm2d):
        pass

    # a subclass is recognised, and can be excluded through its base class
    # given either bare or wrapped in a tuple
    custom_layer = MyNorm(3)
    assert is_norm(custom_layer)
    for excluded in (_BatchNorm, (_BatchNorm, )):
        assert not is_norm(custom_layer, exclude=excluded)

    # a conv layer is not a norm layer
    assert not is_norm(nn.Conv2d(3, 8, 1))

    # `exclude` given as a string (bare or inside a tuple) raises TypeError
    for bad_exclude in ('BN', ('BN', )):
        bn_layer = nn.BatchNorm1d(3)
        with pytest.raises(TypeError):
            is_norm(bn_layer, exclude=bad_exclude)


def test_infer_plugin_abbr():
    """Check how ``infer_plugin_abbr`` derives a plugin abbreviation."""
    # class_type must be a class
    with pytest.raises(TypeError):
        infer_plugin_abbr(0)

    class MyPlugin:

        _abbr_ = 'mp'

    class FancyPlugin:
        pass

    # an explicit `_abbr_` attribute is returned as is
    assert infer_plugin_abbr(MyPlugin) == 'mp'
    # without `_abbr_` the expected result is the snake-cased class name
    assert infer_plugin_abbr(FancyPlugin) == 'fancy_plugin'


def test_build_plugin_layer():
    """Check ``build_plugin_layer`` error handling, names and layer types.

    Each plugin is built both from its registry name and from its class,
    with several postfixes; the returned name is expected to be the plugin
    abbreviation followed by the postfix.
    """
    with pytest.raises(TypeError):
        # cfg must be a dict
        cfg = 'Plugin'
        build_plugin_layer(cfg)

    with pytest.raises(KeyError):
        # `type` must be in cfg
        cfg = dict()
        build_plugin_layer(cfg)

    with pytest.raises(KeyError):
        # unsupported plugin type
        cfg = dict(type='FancyPlugin')
        build_plugin_layer(cfg)

    with pytest.raises(AssertionError):
        # postfix must be int or str
        cfg = dict(type='ConvModule')
        build_plugin_layer(cfg, postfix=[1, 2])

    # test ContextBlock
    for type_name in ('ContextBlock', ContextBlock):
        for postfix in ['', '_test', 1]:
            cfg = dict(type=type_name)
            name, layer = build_plugin_layer(
                cfg, postfix=postfix, in_channels=16, ratio=1. / 4)
            assert name == 'context_block' + str(postfix)
            assert isinstance(layer, MODELS.module_dict['ContextBlock'])

    # test GeneralizedAttention
    for type_name in ('GeneralizedAttention', GeneralizedAttention):
        for postfix in ['', '_test', 1]:
            cfg = dict(type=type_name)
            name, layer = build_plugin_layer(
                cfg, postfix=postfix, in_channels=16)
            assert name == 'gen_attention_block' + str(postfix)
            assert isinstance(layer,
                              MODELS.module_dict['GeneralizedAttention'])

    # test NonLocal2d
    for type_name in ('NonLocal2d', NonLocal2d):
        for postfix in ['', '_test', 1]:
            # use the loop variable so the class form is exercised too (it
            # was previously hard-coded to the string 'NonLocal2d')
            cfg = dict(type=type_name)
            name, layer = build_plugin_layer(
                cfg, postfix=postfix, in_channels=16)
            assert name == 'nonlocal_block' + str(postfix)
            assert isinstance(layer, MODELS.module_dict['NonLocal2d'])

    # test ConvModule
    for postfix in ['', '_test', 1]:
        for type_name in ('ConvModule', ConvModule):
            cfg = dict(type=type_name)
            name, layer = build_plugin_layer(
                cfg,
                postfix=postfix,
                in_channels=16,
                out_channels=4,
                kernel_size=3)
            assert name == 'conv_block' + str(postfix)
            assert isinstance(layer, MODELS.module_dict['ConvModule'])


================================================
FILE: tests/test_cnn/test_context_block.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch

from mmcv.cnn.bricks import ContextBlock


def test_context_block():
    """Check ``ContextBlock`` argument validation and forward shapes."""
    invalid_kwargs = (
        # pooling_type should be in ['att', 'avg']
        dict(pooling_type='unsupport_type'),
        # fusion_types should be of type list or tuple
        dict(fusion_types='unsupport_type'),
        # fusion_types should be in ['channel_add', 'channel_mul']
        dict(fusion_types=('unsupport_type', )),
    )
    for kwargs in invalid_kwargs:
        with pytest.raises(AssertionError):
            ContextBlock(16, 1. / 4, **kwargs)

    # test pooling_type='att'
    feats = torch.randn(2, 16, 20, 20)
    block = ContextBlock(16, 1. / 4, pooling_type='att')
    result = block(feats)
    assert block.conv_mask.in_channels == 16
    assert block.conv_mask.out_channels == 1
    assert result.shape == feats.shape

    # test pooling_type='avg'
    feats = torch.randn(2, 16, 20, 20)
    block = ContextBlock(16, 1. / 4, pooling_type='avg')
    result = block(feats)
    assert hasattr(block, 'avg_pool')
    assert result.shape == feats.shape

    # (fusion_types, channel_add_conv is set, channel_mul_conv is set)
    fusion_cases = (
        (('channel_add', ), True, False),
        (('channel_mul', ), False, True),
        (('channel_add', 'channel_mul'), True, True),
    )
    for fusion_types, has_add, has_mul in fusion_cases:
        feats = torch.randn(2, 16, 20, 20)
        block = ContextBlock(16, 1. / 4, fusion_types=fusion_types)
        result = block(feats)
        # a fusion conv exists only for the requested fusion types
        assert (block.channel_add_conv is not None) == has_add
        assert (block.channel_mul_conv is not None) == has_mul
        assert result.shape == feats.shape


================================================
FILE: tests/test_cnn/test_conv2d_adaptive_padding.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from mmcv.cnn.bricks import Conv2dAdaptivePadding


def test_conv2d_samepadding():
    """Check ``Conv2dAdaptivePadding`` output sizes for stride 1 and 2."""
    # (input spatial size, stride, expected output spatial size)
    cases = (
        (28, 1, 28),
        (13, 1, 13),
        (28, 2, 14),
        (13, 2, 7),
    )
    for in_size, stride, out_size in cases:
        inputs = torch.rand((1, 3, in_size, in_size))
        conv = Conv2dAdaptivePadding(3, 3, kernel_size=3, stride=stride)
        output = conv(inputs)
        assert output.shape == torch.Size([1, 3, out_size, out_size])


================================================
FILE: tests/test_cnn/test_conv_module.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from unittest.mock import patch

import pytest
import torch
import torch.nn as nn
from mmengine.registry import MODELS
from mmengine.utils import digit_version
from mmengine.utils.dl_utils import TORCH_VERSION

from mmcv.cnn.bricks import ConvModule, HSigmoid, HSwish


@MODELS.register_module()
class ExampleConv(nn.Module):
    """Small conv layer registered in ``MODELS`` for use via ``conv_cfg``.

    All constructor arguments are stored as attributes, but only
    ``in_channels``, ``out_channels`` and ``kernel_size`` are passed to the
    wrapped ``nn.Conv2d``. The wrapped conv weight is set to zero by
    ``init_weights``, which the constructor calls.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 norm_cfg=None):
        super().__init__()
        # channel / kernel description
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        # remaining arguments are only recorded, not used by `conv0`
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups
        self.bias = bias
        self.norm_cfg = norm_cfg
        # fixed attributes, presumably read by code that treats this module
        # like a torch conv layer -- TODO confirm
        self.output_padding = (0, 0, 0)
        self.transposed = False

        self.conv0 = nn.Conv2d(in_channels, out_channels, kernel_size)
        self.init_weights()

    def forward(self, x):
        """Apply the wrapped convolution to ``x``."""
        return self.conv0(x)

    def init_weights(self):
        """Fill the wrapped convolution weight with zeros."""
        nn.init.constant_(self.conv0.weight, 0)


def test_conv_module():
    """Check ``ConvModule`` validation, layer composition and shapes."""
    # conv_cfg must be a dict or None
    with pytest.raises(AssertionError):
        ConvModule(3, 8, 2, conv_cfg='conv')

    # norm_cfg must be a dict or None
    with pytest.raises(AssertionError):
        ConvModule(3, 8, 2, norm_cfg='norm')

    # softmax is not supported
    with pytest.raises(KeyError):
        ConvModule(3, 8, 2, act_cfg=dict(type='softmax'))

    # conv + norm + act
    conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN'))
    assert conv.with_activation
    assert hasattr(conv, 'activate')
    assert conv.with_norm
    assert hasattr(conv, 'norm')
    x = torch.rand(1, 3, 256, 256)
    assert conv(x).shape == (1, 8, 255, 255)

    # conv + norm with efficient mode: after copying the state from one
    # module to the other, both modes must produce the same output
    efficient_conv = ConvModule(
        3, 8, 2, norm_cfg=dict(type='BN'), efficient_conv_bn_eval=True).eval()
    plain_conv = ConvModule(
        3, 8, 2, norm_cfg=dict(type='BN'),
        efficient_conv_bn_eval=False).eval()
    state_pairs = zip(efficient_conv.state_dict().values(),
                      plain_conv.state_dict().values())
    for src_tensor, dst_tensor in state_pairs:
        dst_tensor.copy_(src_tensor)

    efficient_mode_output = efficient_conv(x)
    plain_mode_output = plain_conv(x)
    assert torch.allclose(efficient_mode_output, plain_mode_output, atol=1e-5)

    # `conv` attribute can be dynamically modified in efficient mode
    efficient_conv = ConvModule(
        3, 8, 2, norm_cfg=dict(type='BN'), efficient_conv_bn_eval=True).eval()
    new_conv = nn.Conv2d(3, 8, 2).eval()
    efficient_conv.conv = new_conv
    efficient_mode_output = efficient_conv(x)
    # reference result: run the replaced conv, norm and activation by hand
    normed = efficient_conv.norm(new_conv(x))
    plain_mode_output = efficient_conv.activate(normed)
    assert torch.allclose(efficient_mode_output, plain_mode_output, atol=1e-5)

    # conv + act
    conv = ConvModule(3, 8, 2)
    assert conv.with_activation
    assert hasattr(conv, 'activate')
    assert not conv.with_norm
    assert conv.norm is None
    x = torch.rand(1, 3, 256, 256)
    assert conv(x).shape == (1, 8, 255, 255)

    # conv
    conv = ConvModule(3, 8, 2, act_cfg=None)
    assert not conv.with_norm
    assert conv.norm is None
    assert not conv.with_activation
    assert not hasattr(conv, 'activate')
    x = torch.rand(1, 3, 256, 256)
    assert conv(x).shape == (1, 8, 255, 255)

    # conv with its own `init_weights` method
    conv_module = ConvModule(
        3, 8, 2, conv_cfg=dict(type='ExampleConv'), act_cfg=None)
    zero_weight = torch.zeros(8, 3, 2, 2)
    assert torch.equal(conv_module.conv.conv0.weight, zero_weight)

    # every case below uses kernel 3 with padding 1, so the spatial size of
    # `x` is preserved
    same_shape = (1, 8, 256, 256)

    # with_spectral_norm=True
    conv = ConvModule(3, 8, 3, padding=1, with_spectral_norm=True)
    assert hasattr(conv.conv, 'weight_orig')
    assert conv(x).shape == same_shape

    # padding_mode='reflect'
    conv = ConvModule(3, 8, 3, padding=1, padding_mode='reflect')
    assert isinstance(conv.padding_layer, nn.ReflectionPad2d)
    assert conv(x).shape == same_shape

    # non-existing padding mode
    with pytest.raises(KeyError):
        conv = ConvModule(3, 8, 3, padding=1, padding_mode='non_exists')

    # activations expected to be built as the matching torch.nn class
    torch_activations = (
        ('LeakyReLU', nn.LeakyReLU),
        ('Tanh', nn.Tanh),
        ('Sigmoid', nn.Sigmoid),
        ('PReLU', nn.PReLU),
    )
    for act_type, act_cls in torch_activations:
        conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type=act_type))
        assert isinstance(conv.activate, act_cls)
        assert conv(x).shape == same_shape

    # HSwish: the expected class depends on the torch version
    conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='HSwish'))
    uses_mmcv_hswish = (
        TORCH_VERSION == 'parrots'
        or digit_version(TORCH_VERSION) < digit_version('1.7'))
    expected_hswish = HSwish if uses_mmcv_hswish else nn.Hardswish
    assert isinstance(conv.activate, expected_hswish)
    assert conv(x).shape == same_shape

    # HSigmoid
    conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='HSigmoid'))
    assert isinstance(conv.activate, HSigmoid)
    assert conv(x).shape == same_shape


def test_bias():
    """Check the conv bias default and the redundant-bias warning."""
    # bias: auto, without norm
    assert ConvModule(3, 8, 2).conv.bias is not None

    # bias: auto, with norm
    assert ConvModule(3, 8, 2, norm_cfg=dict(type='BN')).conv.bias is None

    # bias: False, without norm
    assert ConvModule(3, 8, 2, bias=False).conv.bias is None

    # bias: True, with batch norm / instance norm -> exactly one warning
    for norm_type in ('BN', 'IN'):
        with pytest.warns(UserWarning) as record:
            ConvModule(3, 8, 2, bias=True, norm_cfg=dict(type=norm_type))
        assert len(record) == 1
        message = record[0].message.args[0]
        assert message == 'Unnecessary conv bias before batch/instance norm'

    # bias: True, with other norm. ConvModule must stay silent here, so the
    # only recorded warning is the one this test emits itself
    gn_cfg = dict(type='GN', num_groups=1)
    with pytest.warns(UserWarning) as record:
        ConvModule(3, 8, 2, bias=True, norm_cfg=gn_cfg)
        warnings.warn('No warnings')
    assert len(record) == 1
    assert record[0].message.args[0] == 'No warnings'


def conv_forward(self, x):
    """Stand-in ``forward`` that appends ``'_conv'`` to ``x``."""
    suffix = '_conv'
    return x + suffix


def bn_forward(self, x):
    """Stand-in ``forward`` that appends ``'_bn'`` to ``x``."""
    suffix = '_bn'
    return x + suffix


def relu_forward(self, x):
    """Stand-in ``forward`` that appends ``'_relu'`` to ``x``."""
    suffix = '_relu'
    return x + suffix


@patch('torch.nn.ReLU.forward', relu_forward)
@patch('torch.nn.BatchNorm2d.forward', bn_forward)
@patch('torch.nn.Conv2d.forward', conv_forward)
def test_order():
    """Check ``order`` validation and the order in which layers run.

    The conv, norm and activation forwards are patched with functions that
    append a suffix to a string, so the returned string records which
    layers ran and in what sequence.
    """
    invalid_orders = (
        # order must be a tuple
        ['conv', 'norm', 'act'],
        # length of order must be 3
        ('conv', 'norm'),
        # order must be an order of 'conv', 'norm', 'act'
        ('conv', 'norm', 'norm'),
        ('conv', 'norm', 'something'),
    )
    for bad_order in invalid_orders:
        with pytest.raises(AssertionError):
            ConvModule(3, 8, 2, order=bad_order)

    # ('conv', 'norm', 'act')
    conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN'))
    assert conv('input') == 'input_conv_bn_relu'

    # ('norm', 'conv', 'act')
    conv = ConvModule(
        3, 8, 2, norm_cfg=dict(type='BN'), order=('norm', 'conv', 'act'))
    assert conv('input') == 'input_bn_conv_relu'

    # ('conv', 'norm', 'act'), activate=False
    conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN'))
    assert conv('input', activate=False) == 'input_conv_bn'

    # ('conv', 'norm', 'act'), norm=False
    conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN'))
    assert conv('input', norm=False) == 'input_conv_relu'


================================================
FILE: tests/test_cnn/test_depthwise_seperable_conv_module.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
import torch.nn as nn

from mmcv.cnn.bricks import DepthwiseSeparableConvModule


def test_depthwise_separable_conv():
    """Check ``DepthwiseSeparableConvModule`` configuration and shapes."""
    # passing `groups` explicitly must be rejected
    with pytest.raises(AssertionError):
        DepthwiseSeparableConvModule(4, 8, 2, groups=2)

    # test default config
    conv = DepthwiseSeparableConvModule(3, 8, 2)
    depthwise, pointwise = conv.depthwise_conv, conv.pointwise_conv
    assert depthwise.conv.groups == 3
    assert pointwise.conv.kernel_size == (1, 1)
    assert not depthwise.with_norm
    assert not pointwise.with_norm
    assert depthwise.activate.__class__.__name__ == 'ReLU'
    assert pointwise.activate.__class__.__name__ == 'ReLU'
    x = torch.rand(1, 3, 256, 256)
    assert conv(x).shape == (1, 8, 255, 255)

    # test dw_norm_cfg / pw_norm_cfg / norm_cfg:
    # (keyword, depthwise gets BN, pointwise gets BN)
    norm_cases = (
        ('dw_norm_cfg', True, False),
        ('pw_norm_cfg', False, True),
        ('norm_cfg', True, True),
    )
    for norm_kwarg, dw_has_bn, pw_has_bn in norm_cases:
        conv = DepthwiseSeparableConvModule(
            3, 8, 2, **{norm_kwarg: dict(type='BN')})
        branches = ((conv.depthwise_conv, dw_has_bn),
                    (conv.pointwise_conv, pw_has_bn))
        for branch, has_bn in branches:
            if has_bn:
                assert branch.norm_name == 'bn'
            else:
                assert not branch.with_norm
        x = torch.rand(1, 3, 256, 256)
        assert conv(x).shape == (1, 8, 255, 255)

    # add test for ['norm', 'conv', 'act']
    conv = DepthwiseSeparableConvModule(3, 8, 2, order=('norm', 'conv', 'act'))
    x = torch.rand(1, 3, 256, 256)
    assert conv(x).shape == (1, 8, 255, 255)

    # the remaining cases use kernel 3 with padding 1, so the spatial size
    # of `x` is preserved
    same_shape = (1, 8, 256, 256)

    # spectral norm is applied to both inner convs
    conv = DepthwiseSeparableConvModule(
        3, 8, 3, padding=1, with_spectral_norm=True)
    assert hasattr(conv.depthwise_conv.conv, 'weight_orig')
    assert hasattr(conv.pointwise_conv.conv, 'weight_orig')
    assert conv(x).shape == same_shape

    # reflect padding shows up on the depthwise conv
    conv = DepthwiseSeparableConvModule(
        3, 8, 3, padding=1, padding_mode='reflect')
    assert isinstance(conv.depthwise_conv.padding_layer, nn.ReflectionPad2d)
    assert conv(x).shape == same_shape

    # test dw_act_cfg / pw_act_cfg / act_cfg:
    # (keyword, depthwise activation name, pointwise activation name)
    act_cases = (
        ('dw_act_cfg', 'LeakyReLU', 'ReLU'),
        ('pw_act_cfg', 'ReLU', 'LeakyReLU'),
        ('act_cfg', 'LeakyReLU', 'LeakyReLU'),
    )
    for act_kwarg, dw_act_name, pw_act_name in act_cases:
        conv = DepthwiseSeparableConvModule(
            3, 8, 3, padding=1, **{act_kwarg: dict(type='LeakyReLU')})
        assert conv.depthwise_conv.activate.__class__.__name__ == dw_act_name
        assert conv.pointwise_conv.activate.__class__.__name__ == pw_act_name
        assert conv(x).shape == same_shape


================================================
FILE: tests/test_cnn/test_flops_counter.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
import torch.nn as nn

from mmcv.cnn import get_model_complexity_info
from mmcv.cnn.utils.flops_counter import flops_to_string, params_to_string

try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO

# Reference table consumed by `test_flops_counter`: each entry pairs a layer
# instance (`model`) and the shape tuple handed to
# `get_model_complexity_info` (`input`) with the expected `flops` and
# `params` values.
# yapf: disable
gt_results = [
    {'model': nn.Conv1d(3, 8, 3), 'input': (3, 16), 'flops': 1120.0, 'params': 80.0},  # noqa: E501
    {'model': nn.Conv2d(3, 8, 3), 'input': (3, 16, 16), 'flops': 43904.0, 'params': 224.0},  # noqa: E501
    {'model': nn.Conv3d(3, 8, 3), 'input': (3, 3, 16, 16), 'flops': 128576.0, 'params': 656.0},  # noqa: E501
    {'model': nn.ReLU(), 'input': (3, 16, 16), 'flops': 768.0, 'params': 0},  # noqa: E501
    {'model': nn.PReLU(), 'input': (3, 16, 16), 'flops': 768.0, 'params': 1},  # noqa: E501
    {'model': nn.ELU(), 'input': (3, 16, 16), 'flops': 768.0, 'params': 0},  # noqa: E501
    {'model': nn.LeakyReLU(), 'input': (3, 16, 16), 'flops': 768.0, 'params': 0},  # noqa: E501
    {'model': nn.ReLU6(), 'input': (3, 16, 16), 'flops': 768.0, 'params': 0},  # noqa: E501
    {'model': nn.MaxPool1d(2), 'input': (3, 16), 'flops': 48.0, 'params': 0},  # noqa: E501
    {'model': nn.MaxPool2d(2), 'input': (3, 16, 16), 'flops': 768.0, 'params': 0},  # noqa: E501
    {'model': nn.MaxPool3d(2), 'input': (3, 3, 16, 16), 'flops': 2304.0, 'params': 0},  # noqa: E501
    {'model': nn.AvgPool1d(2), 'input': (3, 16), 'flops': 48.0, 'params': 0},  # noqa: E501
    {'model': nn.AvgPool2d(2), 'input': (3, 16, 16), 'flops': 768.0, 'params': 0},  # noqa: E501
    {'model': nn.AvgPool3d(2), 'input': (3, 3, 16, 16), 'flops': 2304.0, 'params': 0},  # noqa: E501
    {'model': nn.AdaptiveMaxPool1d(2), 'input': (3, 16), 'flops': 48.0, 'params': 0},  # noqa: E501
    {'model': nn.AdaptiveMaxPool2d(2), 'input': (3, 16, 16), 'flops': 768.0, 'params': 0},  # noqa: E501
    {'model': nn.AdaptiveMaxPool3d(2), 'input': (3, 3, 16, 16), 'flops': 2304.0, 'params': 0},  # noqa: E501
    {'model': nn.AdaptiveAvgPool1d(2), 'input': (3, 16), 'flops': 48.0, 'params': 0},  # noqa: E501
    {'model': nn.AdaptiveAvgPool2d(2), 'input': (3, 16, 16), 'flops': 768.0, 'params': 0},  # noqa: E501
    {'model': nn.AdaptiveAvgPool3d(2), 'input': (3, 3, 16, 16), 'flops': 2304.0, 'params': 0},  # noqa: E501
    {'model': nn.BatchNorm1d(3), 'input': (3, 16), 'flops': 96.0, 'params': 6.0},  # noqa: E501
    {'model': nn.BatchNorm2d(3), 'input': (3, 16, 16), 'flops': 1536.0, 'params': 6.0},  # noqa: E501
    {'model': nn.BatchNorm3d(3), 'input': (3, 3, 16, 16), 'flops': 4608.0, 'params': 6.0},  # noqa: E501
    {'model': nn.GroupNorm(2, 6), 'input': (6, 16, 16), 'flops': 3072.0, 'params': 12.0},  # noqa: E501
    {'model': nn.InstanceNorm1d(3, affine=True), 'input': (3, 16), 'flops': 96.0, 'params': 6.0},  # noqa: E501
    {'model': nn.InstanceNorm2d(3, affine=True), 'input': (3, 16, 16), 'flops': 1536.0, 'params': 6.0},  # noqa: E501
    {'model': nn.InstanceNorm3d(3, affine=True), 'input': (3, 3, 16, 16), 'flops': 4608.0, 'params': 6.0},  # noqa: E501
    {'model': nn.LayerNorm((3, 16, 16)), 'input': (3, 16, 16), 'flops': 1536.0, 'params': 1536.0},  # noqa: E501
    {'model': nn.LayerNorm((3, 16, 16), elementwise_affine=False), 'input': (3, 16, 16), 'flops': 768.0, 'params': 0},  # noqa: E501
    {'model': nn.Linear(1024, 2), 'input': (1024, ), 'flops': 2048.0, 'params': 2050.0},  # noqa: E501
    {'model': nn.ConvTranspose2d(3, 8, 3), 'input': (3, 16, 16), 'flops': 57888, 'params': 224.0},  # noqa: E501
    {'model': nn.Upsample((32, 32)), 'input': (3, 16, 16), 'flops': 3072.0, 'params': 0}  # noqa: E501
]
# yapf: enable


class ExampleModel(nn.Module):
    """Model whose ``forward`` receives an input shape instead of a tensor.

    ``forward`` prepends a batch dimension of 1 to ``imgs``, draws a random
    tensor of that shape and feeds it to a 3->8 channel 3x3 convolution.
    """

    def __init__(self):
        super().__init__()
        self.conv2d = nn.Conv2d(3, 8, 3)

    def forward(self, imgs):
        """Run the conv on a random tensor of shape ``(1, *imgs)``."""
        batch_shape = (1, *imgs)
        random_batch = torch.randn(batch_shape)
        return self.conv2d(random_batch)


def input_constructor(x):
    """Wrap ``x`` as the ``imgs`` keyword argument of a model call."""
    return {'imgs': x}


def test_flops_counter():
    """Check ``get_model_complexity_info`` validation and reported values."""
    # input_res should be a tuple
    with pytest.raises(AssertionError):
        get_model_complexity_info(nn.Conv2d(3, 8, 3), [1, 3, 16, 16])

    # len(input_res) >= 2
    with pytest.raises(AssertionError):
        get_model_complexity_info(nn.Conv2d(3, 8, 3), tuple())

    # test common layers
    for case in gt_results:
        flops, params = get_model_complexity_info(
            case['model'],
            case['input'],
            as_strings=False,
            print_per_layer_stat=False)
        assert flops == case['flops']
        assert params == case['params']

    # test input constructor
    flops, params = get_model_complexity_info(
        ExampleModel(), (3, 16, 16),
        as_strings=False,
        print_per_layer_stat=False,
        input_constructor=input_constructor)
    assert flops == 43904.0
    assert params == 224.0

    # test output string
    flops, params = get_model_complexity_info(
        nn.Conv3d(3, 8, 3), (3, 3, 512, 512), print_per_layer_stat=False)
    assert flops == '0.17 GFLOPs'
    assert params == '656'

    # test print per layer status: the report is written to `ost`
    report = StringIO()
    get_model_complexity_info(nn.Conv1d(3, 8, 3), (3, 16), ost=report)
    assert report.getvalue() == \
        'Conv1d(0.0 M, 100.000% Params, 0.0 GFLOPs, 100.000% FLOPs, 3, 8, kernel_size=(3,), stride=(1,))\n'  # noqa: E501

    # test when model is not a common instance
    stacked = nn.Sequential(
        nn.Conv2d(3, 8, 3), nn.Flatten(), nn.Linear(1568, 2))
    flops, params = get_model_complexity_info(
        stacked, (3, 16, 16), as_strings=False, print_per_layer_stat=True)
    assert flops == 47040.0
    assert params == 3362


def test_flops_to_string():
    """Check ``flops_to_string`` with explicit units, precision and None."""
    flops = 6.54321 * 10.**9

    # (extra positional arguments, expected string)
    explicit_unit_cases = (
        ((), '6.54 GFLOPs'),
        (('MFLOPs', ), '6543.21 MFLOPs'),
        (('KFLOPs', ), '6543210.0 KFLOPs'),
        (('FLOPs', ), '6543210000.0 FLOPs'),
    )
    for extra_args, expected in explicit_unit_cases:
        assert flops_to_string(flops, *extra_args) == expected
    assert flops_to_string(flops, precision=4) == '6.5432 GFLOPs'

    # with None as the second argument the expected unit follows the
    # magnitude of the value
    auto_unit_cases = (
        (6.54321 * 10.**9, '6.54 GFLOPs'),
        (3.21 * 10.**7, '32.1 MFLOPs'),
        (5.4 * 10.**3, '5.4 KFLOPs'),
        (987, '987 FLOPs'),
    )
    for value, expected in auto_unit_cases:
        assert flops_to_string(value, None) == expected


def test_params_to_string():
    """Check ``params_to_string`` with default and explicit units."""
    # default call: the expected unit follows the magnitude of the value
    default_cases = (
        (3.21 * 10.**7, '32.1 M'),
        (4.56 * 10.**5, '456.0 k'),
        (7.89 * 10.**2, '789.0'),
    )
    for value, expected in default_cases:
        assert params_to_string(value) == expected

    # explicit unit given as the second positional argument
    num_params = 6.54321 * 10.**7
    explicit_unit_cases = (
        ('M', '65.43 M'),
        ('K', '65432.1 K'),
        ('', '65432100.0'),
    )
    for unit, expected in explicit_unit_cases:
        assert params_to_string(num_params, unit) == expected
    assert params_to_string(num_params, precision=4) == '65.4321 M'


================================================
FILE: tests/test_cnn/test_fuse_conv_bn.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn

from mmcv.cnn import ConvModule, fuse_conv_bn


def test_fuse_conv_bn():
    """Outputs before and after ``fuse_conv_bn`` must be equal."""
    inputs = torch.rand((1, 3, 5, 5))
    # a standalone BN followed by two conv+BN modules
    layers = [
        nn.BatchNorm2d(3),
        ConvModule(3, 5, 3, norm_cfg=dict(type='BN')),
        ConvModule(5, 5, 3, norm_cfg=dict(type='BN')),
    ]
    modules = nn.Sequential(*layers)
    fused_modules = fuse_conv_bn(modules)
    # NOTE(review): if `fuse_conv_bn` modifies `modules` in place, both
    # sides of this comparison run the same module -- confirm
    assert torch.equal(modules(inputs), fused_modules(inputs))


================================================
FILE: tests/test_cnn/test_generalized_attention.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from mmcv.cnn.bricks import GeneralizedAttention


def test_context_block():
    """Exercise GeneralizedAttention across attention types and options.

    Each section builds a block, checks the sub-modules that the chosen
    configuration is expected to create, and verifies that the forward pass
    preserves the input shape.
    """

    # test attention_type='1000'
    imgs = torch.randn(2, 16, 20, 20)
    gen_attention_block = GeneralizedAttention(16, attention_type='1000')
    assert gen_attention_block.query_conv.in_channels == 16
    assert gen_attention_block.key_conv.in_channels == 16
    # The original asserted key_conv twice; the value projection was
    # never checked.
    assert gen_attention_block.value_conv.in_channels == 16
    out = gen_attention_block(imgs)
    assert out.shape == imgs.shape

    # test attention_type='0100'
    imgs = torch.randn(2, 16, 20, 20)
    gen_attention_block = GeneralizedAttention(16, attention_type='0100')
    assert gen_attention_block.query_conv.in_channels == 16
    assert gen_attention_block.appr_geom_fc_x.in_features == 8
    assert gen_attention_block.appr_geom_fc_y.in_features == 8
    out = gen_attention_block(imgs)
    assert out.shape == imgs.shape

    # test attention_type='0010'
    imgs = torch.randn(2, 16, 20, 20)
    gen_attention_block = GeneralizedAttention(16, attention_type='0010')
    assert gen_attention_block.key_conv.in_channels == 16
    assert hasattr(gen_attention_block, 'appr_bias')
    out = gen_attention_block(imgs)
    assert out.shape == imgs.shape

    # test attention_type='0001'
    imgs = torch.randn(2, 16, 20, 20)
    gen_attention_block = GeneralizedAttention(16, attention_type='0001')
    assert gen_attention_block.appr_geom_fc_x.in_features == 8
    assert gen_attention_block.appr_geom_fc_y.in_features == 8
    assert hasattr(gen_attention_block, 'geom_bias')
    out = gen_attention_block(imgs)
    assert out.shape == imgs.shape

    # test spatial_range >= 0
    imgs = torch.randn(2, 256, 20, 20)
    gen_attention_block = GeneralizedAttention(256, spatial_range=10)
    assert hasattr(gen_attention_block, 'local_constraint_map')
    out = gen_attention_block(imgs)
    assert out.shape == imgs.shape

    # test q_stride > 1
    imgs = torch.randn(2, 16, 20, 20)
    gen_attention_block = GeneralizedAttention(16, q_stride=2)
    assert gen_attention_block.q_downsample is not None
    out = gen_attention_block(imgs)
    assert out.shape == imgs.shape

    # test kv_stride > 1
    imgs = torch.randn(2, 16, 20, 20)
    gen_attention_block = GeneralizedAttention(16, kv_stride=2)
    assert gen_attention_block.kv_downsample is not None
    out = gen_attention_block(imgs)
    assert out.shape == imgs.shape

    # test fp16 with attention_type='1111' (needs a GPU, skipped otherwise)
    if torch.cuda.is_available():
        imgs = torch.randn(2, 16, 20, 20).cuda().to(torch.half)
        gen_attention_block = GeneralizedAttention(
            16,
            spatial_range=-1,
            num_heads=8,
            attention_type='1111',
            kv_stride=2)
        gen_attention_block.cuda().type(torch.half)
        out = gen_attention_block(imgs)
        assert out.shape == imgs.shape


================================================
FILE: tests/test_cnn/test_hsigmoid.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch

from mmcv.cnn.bricks import HSigmoid


def test_hsigmoid():
    """Check HSigmoid argument validation and its forward values."""

    def _reference(x, bias, divisor):
        # Clip (x + bias) / divisor into [0, 1] with explicit max/min.
        floor = torch.max((x + bias) / divisor, torch.zeros(x.shape))
        return torch.min(floor, torch.ones(x.shape))

    # a zero divisor must be rejected
    with pytest.raises(AssertionError):
        HSigmoid(divisor=0)

    shape = torch.Size([1, 3, 64, 64])
    # (constructor args, bias and divisor used by the reference formula);
    # the first row is the default-constructed module.
    cases = (
        ((), 3, 6),
        ((1, 2, 0, 1), 1, 2),
    )
    for ctor_args, bias, divisor in cases:
        act = HSigmoid(*ctor_args)
        x = torch.randn(shape)
        result = act(x)
        reference = _reference(x, bias, divisor)
        # shape first, then exact values
        assert result.shape == reference.shape
        assert torch.equal(result, reference)


================================================
FILE: tests/test_cnn/test_hswish.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch.nn.functional import relu6

from mmcv.cnn.bricks import HSwish


def test_hswish():
    """Check HSwish's inplace flag and its output against a relu6 formula."""
    # The inner ``act`` attribute follows the ``inplace`` argument.
    inplace_act = HSwish(inplace=True)
    assert inplace_act.act.inplace
    act = HSwish()
    assert not act.act.inplace

    x = torch.randn(1, 3, 64, 64)
    reference = x * relu6(x + 3) / 6
    result = act(x)
    # shape first, then exact values
    assert result.shape == reference.shape
    assert torch.equal(result, reference)


================================================
FILE: tests/test_cnn/test_non_local.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
import torch.nn as nn

from mmcv.cnn import NonLocal1d, NonLocal2d, NonLocal3d
from mmcv.cnn.bricks.non_local import _NonLocalNd


def test_nonlocal():
    """Check _NonLocalNd mode validation and its construction variants."""
    # an unknown mode is rejected with ValueError
    with pytest.raises(ValueError):
        _NonLocalNd(3, mode='unsupport_mode')

    # Build with the default init first, then with zeros_init disabled;
    # each time without and with a BN norm_cfg.
    for init_kwargs in ({}, dict(zeros_init=False)):
        _NonLocalNd(3, **init_kwargs)
        _NonLocalNd(3, norm_cfg=dict(type='BN'), **init_kwargs)


def test_nonlocal3d():
    """Run NonLocal3d in every mode, with and without sub-sampling."""

    def _on_parrots_gpu():
        # NonLocal is only implemented on gpu in parrots
        return torch.__version__ == 'parrots' and torch.cuda.is_available()

    # default construction ('embedded_gaussian' mode)
    imgs = torch.randn(2, 3, 10, 20, 20)
    block = NonLocal3d(3)
    if _on_parrots_gpu():
        imgs = imgs.cuda()
        block.cuda()
    assert block(imgs).shape == imgs.shape

    # 'dot_product' and 'concatenation' modes: the mode is stored as given
    for mode in ('dot_product', 'concatenation'):
        block = NonLocal3d(3, mode=mode)
        assert block.mode == mode
        if _on_parrots_gpu():
            block.cuda()
        assert block(imgs).shape == imgs.shape

    # 'gaussian' mode: no ``phi`` attribute is created
    block = NonLocal3d(3, mode='gaussian')
    assert not hasattr(block, 'phi')
    assert block.mode == 'gaussian'
    if _on_parrots_gpu():
        block.cuda()
    assert block(imgs).shape == imgs.shape

    # 'gaussian' mode with sub_sample: g gets a pooling stage, phi is a pool
    block = NonLocal3d(3, mode='gaussian', sub_sample=True)
    assert isinstance(block.g, nn.Sequential) and len(block.g) == 2
    assert isinstance(block.g[1], nn.MaxPool3d)
    assert block.g[1].kernel_size == (1, 2, 2)
    assert isinstance(block.phi, nn.MaxPool3d)
    if _on_parrots_gpu():
        block.cuda()
    assert block(imgs).shape == imgs.shape

    # 'dot_product' mode with sub_sample: both g and phi end in a pool
    block = NonLocal3d(3, mode='dot_product', sub_sample=True)
    for branch in (block.g, block.phi):
        assert isinstance(branch, nn.Sequential) and len(branch) == 2
        assert isinstance(branch[1], nn.MaxPool3d)
        assert branch[1].kernel_size == (1, 2, 2)
    if _on_parrots_gpu():
        block.cuda()
    assert block(imgs).shape == imgs.shape


def test_nonlocal2d():
    """Run NonLocal2d in every mode, with and without sub-sampling."""

    def _on_parrots_gpu():
        # parrots builds run these blocks on the GPU when one is present
        return torch.__version__ == 'parrots' and torch.cuda.is_available()

    # Default construction ('embedded_gaussian') followed by each explicit
    # mode; every round draws a fresh input.
    plain_cases = (
        {},
        dict(mode='dot_product'),
        dict(mode='concatenation'),
        dict(mode='gaussian'),
    )
    for kwargs in plain_cases:
        imgs = torch.randn(2, 3, 20, 20)
        block = NonLocal2d(3, **kwargs)
        if kwargs.get('mode') == 'gaussian':
            # 'gaussian' mode creates no ``phi`` attribute
            assert not hasattr(block, 'phi')
        if _on_parrots_gpu():
            imgs = imgs.cuda()
            block.cuda()
        assert block(imgs).shape == imgs.shape

    # 'gaussian' mode with sub_sample (reuses the last input)
    block = NonLocal2d(3, mode='gaussian', sub_sample=True)
    assert isinstance(block.g, nn.Sequential) and len(block.g) == 2
    assert isinstance(block.g[1], nn.MaxPool2d)
    assert block.g[1].kernel_size == (2, 2)
    assert isinstance(block.phi, nn.MaxPool2d)
    if _on_parrots_gpu():
        block.cuda()
    assert block(imgs).shape == imgs.shape

    # 'dot_product' mode with sub_sample: both g and phi end in a pool
    block = NonLocal2d(3, mode='dot_product', sub_sample=True)
    for branch in (block.g, block.phi):
        assert isinstance(branch, nn.Sequential) and len(branch) == 2
        assert isinstance(branch[1], nn.MaxPool2d)
        assert branch[1].kernel_size == (2, 2)
    if _on_parrots_gpu():
        block.cuda()
    assert block(imgs).shape == imgs.shape


def test_nonlocal1d():
    """Run NonLocal1d in every mode, with and without sub-sampling."""

    def _on_parrots_gpu():
        # parrots builds run these blocks on the GPU when one is present
        return torch.__version__ == 'parrots' and torch.cuda.is_available()

    # Default construction ('embedded_gaussian') followed by each explicit
    # mode; every round draws a fresh input.
    plain_cases = (
        {},
        dict(mode='dot_product'),
        dict(mode='concatenation'),
        dict(mode='gaussian'),
    )
    for kwargs in plain_cases:
        imgs = torch.randn(2, 3, 20)
        block = NonLocal1d(3, **kwargs)
        if kwargs.get('mode') == 'gaussian':
            # 'gaussian' mode creates no ``phi`` attribute
            assert not hasattr(block, 'phi')
        if _on_parrots_gpu():
            imgs = imgs.cuda()
            block.cuda()
        assert block(imgs).shape == imgs.shape

    # 'gaussian' mode with sub_sample (reuses the last input)
    block = NonLocal1d(3, mode='gaussian', sub_sample=True)
    assert isinstance(block.g, nn.Sequential) and len(block.g) == 2
    assert isinstance(block.g[1], nn.MaxPool1d)
    assert block.g[1].kernel_size == 2
    assert isinstance(block.phi, nn.MaxPool1d)
    if _on_parrots_gpu():
        block.cuda()
    assert block(imgs).shape == imgs.shape

    # 'dot_product' mode with sub_sample: both g and phi end in a pool
    block = NonLocal1d(3, mode='dot_product', sub_sample=True)
    for branch in (block.g, block.phi):
        assert isinstance(branch, nn.Sequential) and len(branch) == 2
        assert isinstance(branch[1], nn.MaxPool1d)
        assert branch[1].kernel_size == 2
    if _on_parrots_gpu():
        block.cuda()
    assert block(imgs).shape == imgs.shape


================================================
FILE: tests/test_cnn/test_rfsearch/test_operator.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from copy import deepcopy

import torch
import torch.nn as nn

from mmcv.cnn.rfsearch.operator import Conv2dRFSearchOp

# Search settings shared by the operator tests in this file: each test hands
# this dict (or a modified deep copy of it) to Conv2dRFSearchOp, and the
# ``init_alphas`` value is what freshly expanded branch weights are compared
# against.
global_config = dict(
    step=0,
    max_step=12,
    search_interval=1,
    exp_rate=0.5,
    init_alphas=0.01,
    mmin=1,
    mmax=24,
    num_branches=2,
    skip_layer=['stem', 'layer1'])


# test with 3x3 conv
def test_rfsearch_operator_3x3():
    """Walk a 3x3 conv through two expand/estimate rounds of RF search."""
    out_shape = (1, 3, 32, 32)

    def _check_expanded(op, x, rates):
        # Candidate rates match, every branch weight equals init_alphas,
        # and the forward pass keeps the spatial size.
        assert len(op.dilation_rates) == len(rates)
        for idx, rate in enumerate(rates):
            assert op.dilation_rates[idx] == rate
        assert torch.all(
            op.branch_weights.data == global_config['init_alphas']).item()
        assert op(x).shape == out_shape

    def _estimate_and_check(op, x, rate, padding):
        # Estimation must leave exactly one rate, mirrored on the conv.
        op.estimate_rates()
        assert len(op.dilation_rates) == 1
        assert op.dilation_rates[0] == rate
        assert op.op_layer.dilation == rate
        assert op.op_layer.padding == padding
        assert op(x).shape == out_shape

    conv = nn.Conv2d(
        in_channels=3, out_channels=3, kernel_size=3, stride=1, padding=1)
    operator = Conv2dRFSearchOp(conv, global_config)
    x = torch.randn(1, 3, 32, 32)

    # no_grad is needed for the in-place writes to branch_weights below
    with torch.no_grad():
        # construction already expands to (1, 1) and (2, 2)
        _check_expanded(operator, x, [(1, 1), (2, 2)])

        # equal branch weights -> (2, 2)
        _estimate_and_check(operator, x, (2, 2), (2, 2))

        # expanding around (2, 2) -> (1, 1) and (3, 3)
        operator.expand_rates()
        _check_expanded(operator, x, [(1, 1), (3, 3)])

        # weight the larger rate more heavily -> (3, 3)
        for idx, weight in enumerate((0.1, 0.4)):
            operator.branch_weights[idx] = weight
        _estimate_and_check(operator, x, (3, 3), (3, 3))


# test with 5x5 conv
def test_rfsearch_operator_5x5():
    """Walk a 5x5 conv through two expand/estimate rounds of RF search."""
    out_shape = (1, 3, 32, 32)

    def _check_expanded(op, x, rates):
        # Candidate rates match, every branch weight equals init_alphas,
        # and the forward pass keeps the spatial size.
        assert len(op.dilation_rates) == len(rates)
        for idx, rate in enumerate(rates):
            assert op.dilation_rates[idx] == rate
        assert torch.all(
            op.branch_weights.data == global_config['init_alphas']).item()
        assert op(x).shape == out_shape

    def _estimate_and_check(op, x, rate, padding):
        # Estimation must leave exactly one rate, mirrored on the conv.
        op.estimate_rates()
        assert len(op.dilation_rates) == 1
        assert op.dilation_rates[0] == rate
        assert op.op_layer.dilation == rate
        assert op.op_layer.padding == padding
        assert op(x).shape == out_shape

    conv = nn.Conv2d(
        in_channels=3, out_channels=3, kernel_size=5, stride=1, padding=2)
    operator = Conv2dRFSearchOp(conv, global_config)
    x = torch.randn(1, 3, 32, 32)

    with torch.no_grad():
        # construction already expands to (1, 1) and (2, 2)
        _check_expanded(operator, x, [(1, 1), (2, 2)])

        # equal branch weights -> (2, 2); padding grows to (4, 4)
        _estimate_and_check(operator, x, (2, 2), (4, 4))

        # expanding around (2, 2) -> (1, 1) and (3, 3)
        operator.expand_rates()
        _check_expanded(operator, x, [(1, 1), (3, 3)])

        # weight the larger rate more heavily -> (3, 3); padding (6, 6)
        for idx, weight in enumerate((0.1, 0.4)):
            operator.branch_weights[idx] = weight
        _estimate_and_check(operator, x, (3, 3), (6, 6))


# test with 5x5 conv num_branches=3
def test_rfsearch_operator_5x5_branch3():
    """Walk a 5x5 conv through RF search with ``num_branches`` set to 3."""
    out_shape = (1, 3, 32, 32)

    def _check_expanded(op, x, rates):
        # Candidate rates match, every branch weight equals init_alphas,
        # and the forward pass keeps the spatial size.
        assert len(op.dilation_rates) == len(rates)
        for idx, rate in enumerate(rates):
            assert op.dilation_rates[idx] == rate
        assert torch.all(
            op.branch_weights.data == global_config['init_alphas']).item()
        assert op(x).shape == out_shape

    def _estimate_and_check(op, x, rate, padding):
        # Estimation must leave exactly one rate, mirrored on the conv.
        op.estimate_rates()
        assert len(op.dilation_rates) == 1
        assert op.dilation_rates[0] == rate
        assert op.op_layer.dilation == rate
        assert op.op_layer.padding == padding
        assert op(x).shape == out_shape

    conv = nn.Conv2d(
        in_channels=3, out_channels=3, kernel_size=5, stride=1, padding=2)
    # private copy so the shared module-level config is left untouched
    config = deepcopy(global_config)
    config['num_branches'] = 3
    operator = Conv2dRFSearchOp(conv, config)
    x = torch.randn(1, 3, 32, 32)

    with torch.no_grad():
        # construction expands to only two rates: (1, 1) and (2, 2)
        _check_expanded(operator, x, [(1, 1), (2, 2)])

        # equal branch weights -> (2, 2); padding grows to (4, 4)
        _estimate_and_check(operator, x, (2, 2), (4, 4))

        # expanding around (2, 2) now yields three rates
        operator.expand_rates()
        _check_expanded(operator, x, [(1, 1), (2, 2), (3, 3)])

        # weights of 0.1 / 0.3 / 0.6 -> (3, 3); padding (6, 6)
        for idx, weight in enumerate((0.1, 0.3, 0.6)):
            operator.branch_weights[idx] = weight
        _estimate_and_check(operator, x, (3, 3), (6, 6))


# test with 1x5 conv
def test_rfsearch_operator_1x5():
    """Walk a 1x5 conv through RF search; only the width rate may grow."""
    out_shape = (1, 3, 32, 32)

    def _check_expanded(op, x, rates):
        # Candidate rates match, every branch weight equals init_alphas,
        # and the forward pass keeps the spatial size.
        assert len(op.dilation_rates) == len(rates)
        for idx, rate in enumerate(rates):
            assert op.dilation_rates[idx] == rate
        assert torch.all(
            op.branch_weights.data == global_config['init_alphas']).item()
        assert op(x).shape == out_shape

    def _estimate_and_check(op, x, rate, padding):
        # Estimation must leave exactly one rate, mirrored on the conv.
        op.estimate_rates()
        assert len(op.dilation_rates) == 1
        assert op.dilation_rates[0] == rate
        assert op.op_layer.dilation == rate
        assert op.op_layer.padding == padding
        assert op(x).shape == out_shape

    conv = nn.Conv2d(
        in_channels=3,
        out_channels=3,
        kernel_size=(1, 5),
        stride=1,
        padding=(0, 2))
    operator = Conv2dRFSearchOp(conv, global_config)
    x = torch.randn(1, 3, 32, 32)

    # construction already expands to (1, 1) and (1, 2); this first check
    # runs with gradients enabled
    _check_expanded(operator, x, [(1, 1), (1, 2)])

    with torch.no_grad():
        # equal branch weights -> (1, 2); padding grows to (0, 4)
        _estimate_and_check(operator, x, (1, 2), (0, 4))

        # expanding around (1, 2) -> (1, 1) and (1, 3)
        operator.expand_rates()
        _check_expanded(operator, x, [(1, 1), (1, 3)])

        # weights of 0.2 / 0.8 -> (1, 3); padding (0, 6)
        for idx, weight in enumerate((0.2, 0.8)):
            operator.branch_weights[idx] = weight
        _estimate_and_check(operator, x, (1, 3), (0, 6))


# test with 5x5 conv initial_dilation=(2, 2)
def test_rfsearch_operator_5x5_d2x2():
    """Walk a 5x5 conv that starts at dilation (2, 2) through RF search."""
    out_shape = (1, 3, 32, 32)

    def _check_expanded(op, x, rates):
        # Candidate rates match, every branch weight equals init_alphas,
        # and the forward pass keeps the spatial size.
        assert len(op.dilation_rates) == len(rates)
        for idx, rate in enumerate(rates):
            assert op.dilation_rates[idx] == rate
        assert torch.all(
            op.branch_weights.data == global_config['init_alphas']).item()
        assert op(x).shape == out_shape

    def _estimate_and_check(op, x, rate, padding):
        # Estimation must leave exactly one rate, mirrored on the conv.
        op.estimate_rates()
        assert len(op.dilation_rates) == 1
        assert op.dilation_rates[0] == rate
        assert op.op_layer.dilation == rate
        assert op.op_layer.padding == padding
        assert op(x).shape == out_shape

    conv = nn.Conv2d(
        in_channels=3,
        out_channels=3,
        kernel_size=5,
        stride=1,
        padding=4,
        dilation=(2, 2))
    operator = Conv2dRFSearchOp(conv, global_config)
    x = torch.randn(1, 3, 32, 32)

    with torch.no_grad():
        # construction expands around (2, 2) to (1, 1) and (3, 3)
        _check_expanded(operator, x, [(1, 1), (3, 3)])

        # equal branch weights -> back to (2, 2) with padding (4, 4)
        _estimate_and_check(operator, x, (2, 2), (4, 4))

        # expanding again -> (1, 1) and (3, 3)
        operator.expand_rates()
        _check_expanded(operator, x, [(1, 1), (3, 3)])

        # weight the smaller rate more heavily (0.8 / 0.2) -> (1, 1);
        # padding shrinks to (2, 2)
        for idx, weight in enumerate((0.8, 0.2)):
            operator.branch_weights[idx] = weight
        _estimate_and_check(operator, x, (1, 1), (2, 2))


# test with 5x5 conv initial_dilation=(1, 2)
def test_rfsearch_operator_5x5_d1x2():
    """Walk a 5x5 conv that starts at dilation (1, 2) through RF search."""
    out_shape = (1, 3, 32, 32)

    def _check_expanded(op, x, rates):
        # Candidate rates match, every branch weight equals init_alphas,
        # and the forward pass keeps the spatial size.
        assert len(op.dilation_rates) == len(rates)
        for idx, rate in enumerate(rates):
            assert op.dilation_rates[idx] == rate
        assert torch.all(
            op.branch_weights.data == global_config['init_alphas']).item()
        assert op(x).shape == out_shape

    def _estimate_and_check(op, x, rate, padding):
        # Estimation must leave exactly one rate, mirrored on the conv.
        op.estimate_rates()
        assert len(op.dilation_rates) == 1
        assert op.dilation_rates[0] == rate
        assert op.op_layer.dilation == rate
        assert op.op_layer.padding == padding
        assert op(x).shape == out_shape

    conv = nn.Conv2d(
        in_channels=3,
        out_channels=3,
        kernel_size=5,
        stride=1,
        padding=(2, 4),
        dilation=(1, 2))
    operator = Conv2dRFSearchOp(conv, global_config)
    x = torch.randn(1, 3, 32, 32)

    with torch.no_grad():
        # construction expands around (1, 2) to (1, 1) and (2, 3)
        _check_expanded(operator, x, [(1, 1), (2, 3)])

        # equal branch weights -> (2, 2) with padding (4, 4)
        _estimate_and_check(operator, x, (2, 2), (4, 4))

        # expanding around (2, 2) -> (1, 1) and (3, 3)
        operator.expand_rates()
        _check_expanded(operator, x, [(1, 1), (3, 3)])

        # weights of 0.1 / 0.8 -> (3, 3); padding (6, 6)
        for idx, weight in enumerate((0.1, 0.8)):
            operator.branch_weights[idx] = weight
        _estimate_and_check(operator, x, (3, 3), (6, 6))


================================================
FILE: tests/test_cnn/test_rfsearch/test_search.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.

import torch.nn as nn

from mmcv.cnn.rfsearch import Conv2dRFSearchOp, RFSearchHook


def test_rfsearchhook():
    """Check which convs RFSearchHook.init_model() rewrites in each mode."""

    def _make_conv(in_channels, out_channels, kernel_size, stride, padding,
                   dilation):
        return nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation)

    class Model(nn.Module):
        """Small conv stack with square, 1x1 and 1x3 kernels."""

        def __init__(self):
            super().__init__()
            self.stem = _make_conv(1, 2, 3, 1, 1, 1)
            self.conv0 = _make_conv(2, 2, 3, 1, 1, 1)
            self.layer0 = nn.Sequential(
                _make_conv(2, 2, 3, 1, 1, 1), _make_conv(2, 2, 3, 1, 1, 1))
            self.conv1 = _make_conv(2, 2, 1, 1, 0, 1)
            self.conv2 = _make_conv(2, 2, 3, 1, 1, 1)
            self.conv3 = _make_conv(2, 2, (1, 3), 1, (0, 1), 1)

        def forward(self, x):
            # stem -> layer0 -> conv0 -> conv1 -> conv2 -> conv3
            stages = (self.stem, self.layer0, self.conv0, self.conv1,
                      self.conv2, self.conv3)
            for stage in stages:
                x = stage(x)
            return x

        def train_step(self, x, optimizer, **kwargs):
            return dict(loss=self(x).mean(), num_samples=x.shape[0])

    rfsearch_cfg = {
        'mode': 'search',
        'rfstructure_file': None,
        'config': {
            'search': {
                'step': 0,
                'max_step': 12,
                'search_interval': 1,
                'exp_rate': 0.5,
                'init_alphas': 0.01,
                'mmin': 1,
                'mmax': 24,
                'num_branches': 2,
                'skip_layer': ['stem', 'conv0', 'layer0.1'],
            },
        },
    }

    def _build_hook(mode):
        # NOTE(review): every hook is handed the same ``config`` dict object;
        # whether RFSearchHook copies it is not visible here, so hooks are
        # built and given their structure strictly one after another.
        hook = RFSearchHook(
            mode, rfsearch_cfg['config'], by_epoch=True, verbose=True)
        hook.config['structure'] = {
            'module.layer0.0': [1, 1],
            'module.conv2': [2, 2],
            'module.conv3': [1, 1]
        }
        return hook

    def _check_multi_branch_layout(net):
        # layers named in skip_layer stay plain convs; layer0[0] is wrapped
        assert not isinstance(net.stem, Conv2dRFSearchOp)
        assert not isinstance(net.conv0, Conv2dRFSearchOp)
        assert isinstance(net.layer0[0], Conv2dRFSearchOp)
        assert not isinstance(net.layer0[1], Conv2dRFSearchOp)
        # the 1x1 conv is left alone, the 3x3 and 1x3 convs are wrapped
        assert not isinstance(net.conv1, Conv2dRFSearchOp)
        assert isinstance(net.conv2, Conv2dRFSearchOp)
        assert isinstance(net.conv3, Conv2dRFSearchOp)
        assert net.conv2.dilation_rates == [(1, 1), (3, 3)]
        assert net.conv3.dilation_rates == [(1, 1), (1, 2)]

    search_hook = _build_hook('search')
    single_branch_hook = _build_hook('fixed_single_branch')
    multi_branch_hook = _build_hook('fixed_multi_branch')

    # 1. init_model() in 'search' mode
    model = Model()
    search_hook.init_model(model)
    _check_multi_branch_layout(model)

    # 2. init_model() in 'fixed_single_branch' mode: no wrapper ops, the
    # convs carry the dilation listed in the structure
    model = Model()
    single_branch_hook.init_model(model)
    for layer in (model.conv1, model.conv2, model.conv3):
        assert not isinstance(layer, Conv2dRFSearchOp)
    assert model.conv1.dilation == (1, 1)
    assert model.conv2.dilation == (2, 2)
    assert model.conv3.dilation == (1, 1)

    # 3. init_model() in 'fixed_multi_branch' mode
    model = Model()
    multi_branch_hook.init_model(model)
    _check_multi_branch_layout(model)


================================================
FILE: tests/test_cnn/test_scale.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch

from mmcv.cnn.bricks import LayerScale, Scale


def test_scale():
    """Check Scale's stored factor, its dtype and the output shape."""
    # (constructor args, expected stored factor): default first, then 10.
    cases = (
        ((), 1.),
        ((10., ), 10.),
    )
    for ctor_args, factor in cases:
        layer = Scale(*ctor_args)
        assert layer.scale.data == factor
        assert layer.scale.dtype == torch.float
        feats = torch.rand(1, 3, 64, 64)
        result = layer(feats)
        assert result.shape == (1, 3, 64, 64)


def test_layer_scale():
    """Check LayerScale validation, initial weight and forward scaling."""
    # an unknown data_format is rejected
    with pytest.raises(AssertionError):
        LayerScale(dim=10, data_format='BNC')

    # the weight starts as a vector filled with 1e-5
    layer = LayerScale(dim=10)
    assert torch.equal(layer.weight,
                       torch.ones(10, requires_grad=True) * 1e-5)

    # Out-of-place forward: the output keeps the input shape and equals the
    # input scaled by 1e-5, for both layouts and several ranks.
    forward_cases = (
        ('channels_last', (4, 49, 256)),
        ('channels_last', (4, 7, 49, 256)),
        ('channels_first', (4, 256, 7, 7)),
        ('channels_first', (4, 256, 7, 7, 7)),
    )
    for data_format, shape in forward_cases:
        layer = LayerScale(dim=256, inplace=False, data_format=data_format)
        x = torch.randn(shape)
        out = layer(x)
        assert tuple(out.size()) == shape
        assert torch.equal(x * 1e-5, out)

    # In-place forward hands back the very same tensor object.
    layer = LayerScale(dim=256, inplace=True, data_format='channels_first')
    x = torch.randn((4, 256, 7, 7))
    out = layer(x)
    assert tuple(out.size()) == (4, 256, 7, 7)
    assert x is out


================================================
FILE: tests/test_cnn/test_silu.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from mmcv.cnn.bricks import build_activation_layer


def test_silu():
    """SiLU built from a config equals ``x * sigmoid(x)``, also inplace."""
    silu = build_activation_layer(dict(type='SiLU'))
    x = torch.randn(1, 3, 64, 64)
    reference = x * torch.sigmoid(x)
    result = silu(x)
    # shape and value must match the reference
    assert result.shape == reference.shape
    assert torch.allclose(result, reference)

    # test inplace
    inplace_silu = build_activation_layer(dict(type='SiLU', inplace=True))
    assert inplace_silu.inplace
    x = torch.randn(1, 3, 64, 64)
    # the reference is computed before ``x`` is overwritten by the layer
    reference = x * torch.sigmoid(x)
    result = inplace_silu(x)
    assert result.shape == reference.shape
    assert torch.allclose(result, reference)
    # the input itself holds the activation and is the returned object
    assert torch.allclose(x, reference)
    assert x is result


================================================
FILE: tests/test_cnn/test_swish.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn.functional as F

from mmcv.cnn.bricks import Swish


def test_swish():
    """Swish output equals ``x * sigmoid(x)`` exactly."""
    swish = Swish()
    x = torch.randn(1, 3, 64, 64)
    reference = x * F.sigmoid(x)
    result = swish(x)
    # same shape, bit-identical values
    assert result.shape == reference.shape
    assert torch.equal(result, reference)


================================================
FILE: tests/test_cnn/test_transformer.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import copy

import pytest
import torch
from mmengine.model import ModuleList

from mmcv.cnn.bricks.drop import DropPath
from mmcv.cnn.bricks.transformer import (FFN, AdaptivePadding,
                                         BaseTransformerLayer,
                                         MultiheadAttention, PatchEmbed,
                                         PatchMerging,
                                         TransformerLayerSequence)


def test_adaptive_padding():
    """AdaptivePadding pads inputs to the expected spatial sizes."""

    def padded_hw(pad_layer, height, width):
        # spatial size of a random (1, 1, height, width) input after padding
        padded = pad_layer(torch.rand(1, 1, height, width))
        return padded.shape[2], padded.shape[3]

    for mode in ('same', 'corner'):
        # int arguments: padding to divisible by 16
        pad_layer = AdaptivePadding(
            kernel_size=16, stride=16, dilation=1, padding=mode)
        assert padded_hw(pad_layer, 15, 17) == (16, 32)
        assert padded_hw(pad_layer, 16, 17) == (16, 32)

        # tuple arguments: padding to divisible by 2
        pad_layer = AdaptivePadding(
            kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1), padding=mode)
        assert padded_hw(pad_layer, 11, 13) == (12, 14)

        # no padding
        pad_layer = AdaptivePadding(
            kernel_size=(2, 2),
            stride=(10, 10),
            dilation=(1, 1),
            padding=mode)
        assert padded_hw(pad_layer, 10, 13) == (10, 13)

        # all padding
        pad_layer = AdaptivePadding(
            kernel_size=(11, 11),
            stride=(10, 10),
            dilation=(1, 1),
            padding=mode)
        assert padded_hw(pad_layer, 11, 13) == (21, 21)

        # a (4, 5) kernel with dilation 2 is actually (7, 9), so it must pad
        # like a plain (7, 9) kernel
        sample = torch.rand(1, 1, 11, 13)
        dilated = AdaptivePadding(
            kernel_size=(4, 5), stride=(3, 4), dilation=(2, 2), padding=mode)
        dilation_out = dilated(sample)
        assert (dilation_out.shape[2], dilation_out.shape[3]) == (16, 21)
        plain = AdaptivePadding(
            kernel_size=(7, 9), stride=(3, 4), dilation=(1, 1), padding=mode)
        kernel79_out = plain(sample)
        assert (kernel79_out.shape[2], kernel79_out.shape[3]) == (16, 21)
        assert kernel79_out.shape == dilation_out.shape

    # assert only support "same" "corner"
    with pytest.raises(AssertionError):
        AdaptivePadding(
            kernel_size=(7, 9), stride=(3, 4), dilation=(1, 1), padding=1)


def test_patch_embed():
    """Check the output shapes of PatchEmbed.

    Covers integer padding with and without dilation and normalisation,
    ``init_out_size`` when ``input_size`` is given, and the adaptive
    ``'same'`` / ``'corner'`` padding modes.
    """
    B = 2
    C = 3
    embed_dims = 10

    # 3x3 kernel with stride 1 on a 3x4 input
    dummy_input = torch.rand(B, C, 3, 4)
    patch_embed = PatchEmbed(
        in_channels=C,
        embed_dims=embed_dims,
        kernel_size=3,
        stride=1,
        padding=0,
        dilation=1,
        norm_cfg=None)
    x1, shape = patch_embed(dummy_input)
    # test out shape
    assert x1.shape == (2, 2, 10)
    # test outsize is correct
    assert shape == (1, 2)
    # test L = out_h * out_w
    assert shape[0] * shape[1] == x1.shape[1]

    # test dilation: 5x5 kernel, stride 2, dilation 2 on a 10x10 input
    kernel_size = 5
    stride = 2
    dilation = 2
    dummy_input = torch.rand(B, C, 10, 10)
    patch_embed = PatchEmbed(
        in_channels=C,
        embed_dims=embed_dims,
        kernel_size=kernel_size,
        stride=stride,
        padding=0,
        dilation=dilation,
        norm_cfg=None,
    )
    x2, shape = patch_embed(dummy_input)
    assert x2.shape == (2, 1, 10)
    assert shape == (1, 1)
    assert shape[0] * shape[1] == x2.shape[1]

    # same layer with a norm layer and an explicit ``input_size``
    input_size = (10, 10)
    dummy_input = torch.rand(B, C, 10, 10)
    patch_embed = PatchEmbed(
        in_channels=C,
        embed_dims=embed_dims,
        kernel_size=kernel_size,
        stride=stride,
        padding=0,
        dilation=dilation,
        norm_cfg=dict(type='LN'),
        input_size=input_size)
    x3, shape = patch_embed(dummy_input)
    assert x3.shape == (2, 1, 10)
    assert shape == (1, 1)
    assert shape[0] * shape[1] == x3.shape[1]

    # ``init_out_size`` must follow the convolution output-size formula on
    # each axis (axis i is checked against input_size[i])
    for axis, size in enumerate(input_size):
        expected = (size - dilation * (kernel_size - 1) - 1) // stride + 1
        assert patch_embed.init_out_size[axis] == expected

    # non-square input
    H = 11
    W = 12
    input_size = (H, W)
    dummy_input = torch.rand(B, C, H, W)
    patch_embed = PatchEmbed(
        in_channels=C,
        embed_dims=embed_dims,
        kernel_size=kernel_size,
        stride=stride,
        padding=0,
        dilation=dilation,
        norm_cfg=dict(type='LN'),
        input_size=input_size)
    _, shape = patch_embed(dummy_input)
    # when input_size equal to real input
    # the out_size should be equal to `init_out_size`
    assert shape == patch_embed.init_out_size

    # test adap padding
    in_c = 2
    embed_dims = 3
    # (input_size, kernel_size, stride, expected out_size)
    adaptive_cases = (
        ((5, 5), (5, 5), (1, 1), (5, 5)),  # stride is 1
        ((5, 5), (5, 5), (5, 5), (1, 1)),  # kernel_size == stride
        ((6, 5), (5, 5), (5, 5), (2, 1)),  # kernel_size == stride
        ((6, 5), (6, 2), (6, 2), (1, 3)),  # non-square kernel and stride
    )
    for padding in ('same', 'corner'):
        for input_size, kernel_size, stride, expected in adaptive_cases:
            x = torch.rand(B, in_c, *input_size)
            patch_embed = PatchEmbed(
                in_channels=in_c,
                embed_dims=embed_dims,
                kernel_size=kernel_size,
                stride=stride,
                padding=padding,
                dilation=1,
                bias=False)

            x_out, out_size = patch_embed(x)
            assert out_size == expected
            # the token count is the product of the output height and width
            assert x_out.size() == (B, expected[0] * expected[1],
                                    embed_dims)
            assert x_out.size(1) == out_size[0] * out_size[1]


def test_patch_merging():
    """Check PatchMerging output shapes for int and adaptive padding."""

    # Test the model with int padding: 3x3 kernel, stride 3, padding 1
    merger = PatchMerging(
        in_channels=3,
        out_channels=4,
        kernel_size=3,
        stride=3,
        padding=1,
        dilation=1,
        bias=False)
    tokens = torch.rand(1, 100, 3)
    x_out, out_size = merger(tokens, (10, 10))
    assert x_out.size() == (1, 16, 4)
    assert out_size == (4, 4)
    # assert out size is consistent with real output
    assert x_out.size(1) == out_size[0] * out_size[1]

    # int padding with dilation: 6x6 kernel, stride 3, padding 2, dilation 2
    merger = PatchMerging(
        in_channels=4,
        out_channels=5,
        kernel_size=6,
        stride=3,
        padding=2,
        dilation=2,
        bias=False)
    tokens = torch.rand(1, 100, 4)
    x_out, out_size = merger(tokens, (10, 10))
    assert x_out.size() == (1, 4, 5)
    assert out_size == (2, 2)
    # assert out size is consistent with real output
    assert x_out.size(1) == out_size[0] * out_size[1]

    # Test with adaptive padding
    in_c = 2
    out_c = 3
    B = 2
    # (input_size, kernel_size, stride, expected out_size)
    adaptive_cases = (
        ((5, 5), (5, 5), (1, 1), (5, 5)),  # stride is 1
        ((5, 5), (5, 5), (5, 5), (1, 1)),  # kernel_size == stride
        ((6, 5), (5, 5), (5, 5), (2, 1)),  # kernel_size == stride
        ((6, 5), (6, 2), (6, 2), (1, 3)),  # different kernel and stride
    )
    for padding in ('same', 'corner'):
        for input_size, kernel_size, stride, expected in adaptive_cases:
            L = input_size[0] * input_size[1]
            tokens = torch.rand(B, L, in_c)
            merger = PatchMerging(
                in_channels=in_c,
                out_channels=out_c,
                kernel_size=kernel_size,
                stride=stride,
                padding=padding,
                dilation=1,
                bias=False)

            x_out, out_size = merger(tokens, input_size)
            assert x_out.size() == (B, expected[0] * expected[1], 3)
            assert out_size == expected
            assert x_out.size(1) == out_size[0] * out_size[1]


def test_multiheadattention():
    """MultiheadAttention gives matching sums batch-first or query-first."""
    shared_cfg = dict(embed_dims=5, num_heads=5, attn_drop=0, proj_drop=0)

    # building with a plain ``Dropout`` layer only has to succeed
    MultiheadAttention(
        dropout_layer=dict(type='Dropout', drop_prob=0.),
        batch_first=True,
        **shared_cfg)

    batch_dim, embed_dim, num_query = 2, 5, 100
    attn_batch_first = MultiheadAttention(
        dropout_layer=dict(type='DropPath', drop_prob=0.),
        batch_first=True,
        **shared_cfg)
    attn_query_first = MultiheadAttention(
        dropout_layer=dict(type='DropPath', drop_prob=0.),
        batch_first=False,
        **shared_cfg)

    # give both modules the same parameter values
    query_first_params = dict(attn_query_first.named_parameters())
    for name, param in attn_batch_first.named_parameters():
        query_first_params[name].data = param.data

    input_batch_first = torch.rand(batch_dim, num_query, embed_dim)
    input_query_first = input_batch_first.transpose(0, 1)

    # query only
    out_query_first = attn_query_first(input_query_first).sum()
    out_batch_first = attn_batch_first(input_batch_first).sum()
    assert torch.allclose(out_query_first, out_batch_first)

    key_batch_first = torch.rand(batch_dim, num_query, embed_dim)
    key_query_first = key_batch_first.transpose(0, 1)

    # query and key
    out_query_first = attn_query_first(input_query_first,
                                       key_query_first).sum()
    out_batch_first = attn_batch_first(input_batch_first,
                                       key_batch_first).sum()
    assert torch.allclose(out_query_first, out_batch_first)

    identity = torch.ones_like(input_query_first)

    # check deprecated arguments can be used normally
    with_residual = attn_query_first(
        input_query_first, key_query_first, residual=identity).sum()
    expected = (
        attn_batch_first(input_batch_first, key_batch_first).sum() +
        identity.sum() - input_batch_first.sum())
    assert torch.allclose(with_residual, expected)

    with_identity = attn_query_first(
        input_query_first, key_query_first, identity=identity).sum()
    expected = (
        attn_batch_first(input_batch_first, key_batch_first).sum() +
        identity.sum() - input_batch_first.sum())
    assert torch.allclose(with_identity, expected)

    # one more forward pass whose result is not checked
    attn_query_first(
        input_query_first, key_query_first, identity=identity).sum()


def test_ffn():
    """Check FFN argument validation and its identity handling."""
    with pytest.raises(AssertionError):
        # num_fcs should be no less than 2
        FFN(num_fcs=1)
    ffn = FFN(dropout=0, add_identity=True)

    input_tensor = torch.rand(2, 20, 256)
    input_tensor_nbc = input_tensor.transpose(0, 1)
    assert torch.allclose(ffn(input_tensor).sum(), ffn(input_tensor_nbc).sum())

    # Passing another tensor as the shortcut must swap the input's
    # contribution to the sum for that tensor's. The comparison results have
    # to be asserted, otherwise these checks can never fail.
    residual = torch.rand_like(input_tensor)
    # deprecated keyword ``residual``
    assert torch.allclose(
        ffn(input_tensor, residual=residual).sum(),
        ffn(input_tensor).sum() + residual.sum() - input_tensor.sum())

    # keyword ``identity``
    assert torch.allclose(
        ffn(input_tensor, identity=residual).sum(),
        ffn(input_tensor).sum() + residual.sum() - input_tensor.sum())

    # test with layer_scale
    ffn = FFN(dropout=0, add_identity=True, layer_scale_init_value=0.1)

    input_tensor = torch.rand(2, 20, 256)
    input_tensor_nbc = input_tensor.transpose(0, 1)
    assert torch.allclose(ffn(input_tensor).sum(), ffn(input_tensor_nbc).sum())


@pytest.mark.skipif(not torch.cuda.is_available(), reason='Cuda not available')
def test_basetransformerlayer_cuda():
    """Deep-copied BaseTransformerLayers still run on CUDA.

    The test checks that the layer's behaviour remains consistent after
    being deepcopied.
    """
    template = BaseTransformerLayer(
        operation_order=('self_attn', 'ffn'),
        batch_first=True,
        attn_cfgs=dict(
            type='MultiheadAttention',
            embed_dims=256,
            num_heads=8,
        ),
    )
    copies = ModuleList([copy.deepcopy(template) for _ in range(2)])
    copies.to('cuda')
    feats = torch.rand(2, 10, 256).cuda()
    # chain the copies; every stage must keep the input shape
    for layer in copies:
        feats = layer(feats)
        assert feats.shape == torch.Size([2, 10, 256])


@pytest.mark.parametrize('embed_dims', [False, 256])
def test_basetransformerlayer(embed_dims):
    """Build BaseTransformerLayer with deprecated arguments and run it.

    ``embed_dims`` is only put into the FFN config when it is truthy.
    """
    order = ('self_attn', 'norm', 'ffn', 'norm')
    feedforward_channels = 2048
    ffn_dropout = 0.1

    # the attention config is deliberately a one-element tuple
    attn_cfgs = (dict(type='MultiheadAttention', embed_dims=256,
                      num_heads=8), )
    ffn_cfgs = dict(
        type='FFN',
        feedforward_channels=1024,
        num_fcs=2,
        ffn_drop=0.,
        act_cfg=dict(type='ReLU', inplace=True),
    )
    if embed_dims:
        ffn_cfgs['embed_dims'] = embed_dims

    # test deprecated_args
    layer = BaseTransformerLayer(
        attn_cfgs=attn_cfgs,
        ffn_cfgs=ffn_cfgs,
        feedforward_channels=feedforward_channels,
        ffn_dropout=ffn_dropout,
        operation_order=order)
    assert layer.batch_first is False
    assert layer.ffns[0].feedforward_channels == feedforward_channels

    # without ``ffn_cfgs`` and with ``batch_first=True``
    attn_cfgs = (dict(type='MultiheadAttention', num_heads=8,
                      embed_dims=256), )
    layer = BaseTransformerLayer(
        attn_cfgs=attn_cfgs,
        feedforward_channels=feedforward_channels,
        ffn_dropout=ffn_dropout,
        operation_order=order,
        batch_first=True)
    assert layer.attentions[0].batch_first
    # a forward pass only has to run without error
    layer(torch.rand(2, 10, 256))


def test_transformerlayersequence():
    """TransformerLayerSequence builds ``num_layers`` layers from a dict."""

    def layer_cfg(second_attn):
        # a fresh BaseTransformerLayer config on every call
        return dict(
            type='BaseTransformerLayer',
            attn_cfgs=[
                dict(
                    type='MultiheadAttention',
                    embed_dims=256,
                    num_heads=8,
                    dropout=0.1), second_attn
            ],
            feedforward_channels=1024,
            ffn_dropout=0.1,
            operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn',
                             'norm'))

    sequence = TransformerLayerSequence(
        num_layers=6,
        transformerlayers=layer_cfg(
            dict(type='MultiheadAttention', embed_dims=256, num_heads=4)))
    assert len(sequence.layers) == 6
    assert sequence.pre_norm is False

    with pytest.raises(AssertionError):
        # if transformerlayers is a list, len(transformerlayers)
        # should be equal to num_layers
        TransformerLayerSequence(
            num_layers=6,
            transformerlayers=[
                layer_cfg(dict(type='MultiheadAttention', embed_dims=256))
            ])


def test_drop_path():
    """DropPath returns its input object unless it is actively dropping."""
    # zero drop probability: the input object is returned
    layer = DropPath(drop_prob=0)
    feats = torch.rand(2, 3, 4, 5)
    assert layer(feats) is feats

    # non-zero probability outside training mode: still the input object
    layer = DropPath(drop_prob=0.1)
    layer.training = False
    feats = torch.rand(2, 3, 4, 5)
    assert layer(feats) is feats
    # in training mode a different tensor object comes back
    layer.training = True
    assert layer(feats) is not feats


================================================
FILE: tests/test_cnn/test_wrappers.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from unittest.mock import patch

import pytest
import torch
import torch.nn as nn
from mmengine.utils import digit_version
from mmengine.utils.dl_utils import TORCH_VERSION

from mmcv.cnn.bricks import (Conv2d, Conv3d, ConvTranspose2d, ConvTranspose3d,
                             Linear, MaxPool2d, MaxPool3d)

# Version string patched into ``torch.__version__`` by the tests below:
# 'parrots' is kept as is, anything else is replaced by '1.1'.
torch_version = 'parrots' if torch.__version__ == 'parrots' else '1.1'


@patch('torch.__version__', torch_version)
@pytest.mark.parametrize(
    'in_w,in_h,in_channel,out_channel,kernel_size,stride,padding,dilation',
    [(10, 10, 1, 1, 3, 1, 0, 1), (20, 20, 3, 3, 5, 2, 1, 2)])
def test_conv2d(in_w, in_h, in_channel, out_channel, kernel_size, stride,
                padding, dilation):
    """Compare the Conv2d wrapper with ``nn.Conv2d`` on an empty batch.

    Runs with ``torch.__version__`` patched to ``torch_version``.

    CommandLine:
        xdoctest -m tests/test_cnn/test_wrappers.py test_conv2d
    """
    conv_args = (in_channel, out_channel, kernel_size)
    conv_kwargs = dict(stride=stride, padding=padding, dilation=dilation)

    # train mode
    # wrapper op on a batch of size zero
    empty_batch = torch.randn(0, in_channel, in_h, in_w)
    torch.manual_seed(0)
    wrapped = Conv2d(*conv_args, **conv_kwargs)
    empty_out = wrapped(empty_batch)

    # identically seeded torch op on a batch of 3 as shape reference
    batch = torch.randn(3, in_channel, in_h, in_w).requires_grad_(True)
    torch.manual_seed(0)
    reference = nn.Conv2d(*conv_args, **conv_kwargs)
    reference_out = reference(batch)

    assert empty_out.shape[0] == 0
    assert empty_out.shape[1:] == reference_out.shape[1:]

    # backward through the empty output still yields a weight gradient
    empty_out.sum().backward()
    assert wrapped.weight.grad is not None
    assert wrapped.weight.grad.shape == wrapped.weight.shape

    assert torch.equal(wrapped(batch), reference_out)

    # eval mode
    empty_batch = torch.randn(0, in_channel, in_h, in_w)
    wrapped = Conv2d(*conv_args, **conv_kwargs)
    wrapped.eval()
    wrapped(empty_batch)


@patch('torch.__version__', torch_version)
@pytest.mark.parametrize(
    'in_w,in_h,in_t,in_channel,out_channel,kernel_size,stride,padding,dilation',  # noqa: E501
    [(10, 10, 10, 1, 1, 3, 1, 0, 1), (20, 20, 20, 3, 3, 5, 2, 1, 2)])
def test_conv3d(in_w, in_h, in_t, in_channel, out_channel, kernel_size, stride,
                padding, dilation):
    """Compare the Conv3d wrapper with ``nn.Conv3d`` on an empty batch.

    Runs with ``torch.__version__`` patched to ``torch_version``.

    CommandLine:
        xdoctest -m tests/test_cnn/test_wrappers.py test_conv3d
    """
    conv_args = (in_channel, out_channel, kernel_size)
    conv_kwargs = dict(stride=stride, padding=padding, dilation=dilation)

    # train mode
    # wrapper op on a batch of size zero
    empty_batch = torch.randn(0, in_channel, in_t, in_h, in_w)
    torch.manual_seed(0)
    wrapped = Conv3d(*conv_args, **conv_kwargs)
    empty_out = wrapped(empty_batch)

    # identically seeded torch op on a batch of 3 as shape reference
    batch = torch.randn(3, in_channel, in_t, in_h, in_w).requires_grad_(True)
    torch.manual_seed(0)
    reference = nn.Conv3d(*conv_args, **conv_kwargs)
    reference_out = reference(batch)

    assert empty_out.shape[0] == 0
    assert empty_out.shape[1:] == reference_out.shape[1:]

    # backward through the empty output still yields a weight gradient
    empty_out.sum().backward()
    assert wrapped.weight.grad is not None
    assert wrapped.weight.grad.shape == wrapped.weight.shape

    assert torch.equal(wrapped(batch), reference_out)

    # eval mode
    empty_batch = torch.randn(0, in_channel, in_t, in_h, in_w)
    wrapped = Conv3d(*conv_args, **conv_kwargs)
    wrapped.eval()
    wrapped(empty_batch)


@patch('torch.__version__', torch_version)
@pytest.mark.parametrize(
    'in_w,in_h,in_channel,out_channel,kernel_size,stride,padding,dilation',
    [(10, 10, 1, 1, 3, 1, 0, 1), (20, 20, 3, 3, 5, 2, 1, 2)])
def test_conv_transposed_2d(in_w, in_h, in_channel, out_channel, kernel_size,
                            stride, padding, dilation):
    """Compare the ConvTranspose2d wrapper with ``nn.ConvTranspose2d``.

    Runs with ``torch.__version__`` patched to ``torch_version``.
    """
    # wrapper op on a batch of size zero
    empty_batch = torch.randn(0, in_channel, in_h, in_w, requires_grad=True)
    # out padding must be smaller than either stride or dilation;
    # under parrots it is forced to 0
    if torch.__version__ == 'parrots':
        op = 0
    else:
        op = min(stride, dilation) - 1
    conv_args = (in_channel, out_channel, kernel_size)
    conv_kwargs = dict(
        stride=stride, padding=padding, dilation=dilation, output_padding=op)

    torch.manual_seed(0)
    wrapped = ConvTranspose2d(*conv_args, **conv_kwargs)
    empty_out = wrapped(empty_batch)

    # identically seeded torch op on a batch of 3 as shape reference
    batch = torch.randn(3, in_channel, in_h, in_w)
    torch.manual_seed(0)
    reference = nn.ConvTranspose2d(*conv_args, **conv_kwargs)
    reference_out = reference(batch)

    assert empty_out.shape[0] == 0
    assert empty_out.shape[1:] == reference_out.shape[1:]

    # backward through the empty output still yields a weight gradient
    empty_out.sum().backward()
    assert wrapped.weight.grad is not None
    assert wrapped.weight.grad.shape == wrapped.weight.shape

    assert torch.equal(wrapped(batch), reference_out)

    # eval mode
    empty_batch = torch.randn(0, in_channel, in_h, in_w)
    wrapped = ConvTranspose2d(*conv_args, **conv_kwargs)
    wrapped.eval()
    wrapped(empty_batch)


@patch('torch.__version__', torch_version)
@pytest.mark.parametrize(
    'in_w,in_h,in_t,in_channel,out_channel,kernel_size,stride,padding,dilation',  # noqa: E501
    [(10, 10, 10, 1, 1, 3, 1, 0, 1), (20, 20, 20, 3, 3, 5, 2, 1, 2)])
def test_conv_transposed_3d(in_w, in_h, in_t, in_channel, out_channel,
                            kernel_size, stride, padding, dilation):
    """Compare the ConvTranspose3d wrapper with ``nn.ConvTranspose3d``.

    Runs with ``torch.__version__`` patched to ``torch_version``.
    """
    # wrapper op on a batch of size zero
    empty_batch = torch.randn(
        0, in_channel, in_t, in_h, in_w, requires_grad=True)
    # out padding must be smaller than either stride or dilation
    op = min(stride, dilation) - 1
    conv_args = (in_channel, out_channel, kernel_size)
    conv_kwargs = dict(
        stride=stride, padding=padding, dilation=dilation, output_padding=op)

    torch.manual_seed(0)
    wrapped = ConvTranspose3d(*conv_args, **conv_kwargs)
    empty_out = wrapped(empty_batch)

    # identically seeded torch op on a batch of 3 as shape reference
    batch = torch.randn(3, in_channel, in_t, in_h, in_w)
    torch.manual_seed(0)
    reference = nn.ConvTranspose3d(*conv_args, **conv_kwargs)
    reference_out = reference(batch)

    assert empty_out.shape[0] == 0
    assert empty_out.shape[1:] == reference_out.shape[1:]

    # backward through the empty output still yields a weight gradient
    empty_out.sum().backward()
    assert wrapped.weight.grad is not None
    assert wrapped.weight.grad.shape == wrapped.weight.shape

    assert torch.equal(wrapped(batch), reference_out)

    # eval mode
    empty_batch = torch.randn(0, in_channel, in_t, in_h, in_w)
    wrapped = ConvTranspose3d(*conv_args, **conv_kwargs)
    wrapped.eval()
    wrapped(empty_batch)


@patch('torch.__version__', torch_version)
@pytest.mark.parametrize(
    'in_w,in_h,in_channel,out_channel,kernel_size,stride,padding,dilation',
    [(10, 10, 1, 1, 3, 1, 0, 1), (20, 20, 3, 3, 5, 2, 1, 2)])
def test_max_pool_2d(in_w, in_h, in_channel, out_channel, kernel_size, stride,
                     padding, dilation):
    """Compare the MaxPool2d wrapper with ``nn.MaxPool2d``.

    Runs with ``torch.__version__`` patched to ``torch_version``.
    ``out_channel`` is part of the parametrization but is not used.
    """
    pool_kwargs = dict(stride=stride, padding=padding, dilation=dilation)

    # wrapper op on a batch of size zero
    empty_batch = torch.randn(0, in_channel, in_h, in_w, requires_grad=True)
    wrapped = MaxPool2d(kernel_size, **pool_kwargs)
    empty_out = wrapped(empty_batch)

    # torch op on a batch of 3 as shape reference
    batch = torch.randn(3, in_channel, in_h, in_w)
    reference = nn.MaxPool2d(kernel_size, **pool_kwargs)
    reference_out = reference(batch)

    assert empty_out.shape[0] == 0
    assert empty_out.shape[1:] == reference_out.shape[1:]

    assert torch.equal(wrapped(batch), reference_out)


@patch('torch.__version__', torch_version)
@pytest.mark.parametrize(
    'in_w,in_h,in_t,in_channel,out_channel,kernel_size,stride,padding,dilation',  # noqa: E501
    [(10, 10, 10, 1, 1, 3, 1, 0, 1), (20, 20, 20, 3, 3, 5, 2, 1, 2)])
@pytest.mark.skipif(
    torch.__version__ == 'parrots' and not torch.cuda.is_available(),
    reason='parrots requires CUDA support')
def test_max_pool_3d(in_w, in_h, in_t, in_channel, out_channel, kernel_size,
                     stride, padding, dilation):
    """Compare the MaxPool3d wrapper with ``nn.MaxPool3d``.

    Runs with ``torch.__version__`` patched to ``torch_version``; under
    parrots the inputs are moved to CUDA first. ``out_channel`` is part of
    the parametrization but is not used.
    """
    on_parrots = torch.__version__ == 'parrots'
    pool_kwargs = dict(stride=stride, padding=padding, dilation=dilation)

    # wrapper op on a batch of size zero
    empty_batch = torch.randn(
        0, in_channel, in_t, in_h, in_w, requires_grad=True)
    wrapped = MaxPool3d(kernel_size, **pool_kwargs)
    if on_parrots:
        empty_batch = empty_batch.cuda()
    empty_out = wrapped(empty_batch)

    # torch op on a batch of 3 as shape reference
    batch = torch.randn(3, in_channel, in_t, in_h, in_w)
    reference = nn.MaxPool3d(kernel_size, **pool_kwargs)
    if on_parrots:
        batch = batch.cuda()
    reference_out = reference(batch)

    assert empty_out.shape[0] == 0
    assert empty_out.shape[1:] == reference_out.shape[1:]

    assert torch.equal(wrapped(batch), reference_out)


@patch('torch.__version__', torch_version)
@pytest.mark.parametrize('in_w,in_h,in_feature,out_feature', [(10, 10, 1, 1),
                                                              (20, 20, 3, 3)])
def test_linear(in_w, in_h, in_feature, out_feature):
    """Compare the Linear wrapper with ``nn.Linear`` on an empty batch.

    Runs with ``torch.__version__`` patched to ``torch_version``.
    ``in_w`` and ``in_h`` are part of the parametrization but are not used.
    """
    # wrapper op on a batch of size zero
    empty_batch = torch.randn(0, in_feature, requires_grad=True)
    torch.manual_seed(0)
    wrapped = Linear(in_feature, out_feature)
    empty_out = wrapped(empty_batch)

    # identically seeded torch op on a batch of 3 as shape reference
    batch = torch.randn(3, in_feature)
    torch.manual_seed(0)
    reference = nn.Linear(in_feature, out_feature)
    reference_out = reference(batch)

    assert empty_out.shape[0] == 0
    assert empty_out.shape[1:] == reference_out.shape[1:]

    # backward through the empty output still yields a weight gradient
    empty_out.sum().backward()
    assert wrapped.weight.grad is not None
    assert wrapped.weight.grad.shape == wrapped.weight.shape

    assert torch.equal(wrapped(batch), reference_out)

    # eval mode
    empty_batch = torch.randn(0, in_feature)
    wrapped = Linear(in_feature, out_feature)
    wrapped.eval()
    wrapped(empty_batch)


@patch('mmcv.cnn.bricks.wrappers.TORCH_VERSION', (1, 10))
def test_nn_op_forward_called():
    """Check that each wrapper hands its input to the torch ``forward``.

    With ``TORCH_VERSION`` patched to ``(1, 10)``, every wrapper is called
    with an empty and a non-empty batch while the matching
    ``torch.nn.<name>.forward`` is mocked; the mock must have been called
    with exactly the tensor that was passed in.
    """
    # (wrapper names, per-sample input shape, constructor arguments)
    cases = (
        (('Conv2d', 'ConvTranspose2d', 'MaxPool2d'), (3, 10, 10), (3, 2, 1)),
        (('Conv3d', 'ConvTranspose3d', 'MaxPool3d'), (3, 10, 10, 10),
         (3, 2, 1)),
        (('Linear', ), (3, ), (3, 3)),
    )

    for names, sample_shape, ctor_args in cases:
        for name in names:
            with patch(f'torch.nn.{name}.forward') as mocked_forward:
                # batch size 0 is the empty input, 1 the non-empty input
                for batch_size in (0, 1):
                    inputs = torch.randn(batch_size, *sample_shape)
                    # names are hard-coded above, so eval only resolves the
                    # wrapper classes imported by this module
                    module = eval(name)(*ctor_args)
                    module(inputs)
                    mocked_forward.assert_called_with(inputs)


@pytest.mark.skipif(
    digit_version(TORCH_VERSION) < digit_version('1.10'),
    reason='MaxPool2d and MaxPool3d will fail fx for torch<=1.9')
def test_fx_compatibility():
    """Ensure ``torch.fx.symbolic_trace`` runs on every wrapper module."""
    from torch import fx

    # wrapper classes grouped by the constructor arguments they are built with
    groups = (
        ((MaxPool2d, MaxPool3d), (1, )),
        ((Linear, ), (1, 1)),
        ((Conv2d, ConvTranspose2d, Conv3d, ConvTranspose3d), (1, 1, 1)),
    )
    for wrapper_classes, ctor_args in groups:
        for wrapper_cls in wrapper_classes:
            # the test passes as long as tracing does not raise
            fx.symbolic_trace(wrapper_cls(*ctor_args))


================================================
FILE: tests/test_image/test_colorspace.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import cv2
import numpy as np
import pytest
from numpy.testing import assert_array_almost_equal, assert_array_equal

import mmcv
from mmcv.image.colorspace import (_convert_input_type_range,
                                   _convert_output_type_range)


def test_bgr2gray():
    """Compare ``mmcv.bgr2gray`` with a weighted sum of the B, G, R planes."""
    in_img = np.random.rand(10, 10, 3).astype(np.float32)
    out_img = mmcv.bgr2gray(in_img)
    blue, green, red = (in_img[:, :, c] for c in range(3))
    expected = blue * 0.114 + green * 0.587 + red * 0.299
    assert_array_almost_equal(out_img, expected, decimal=4)
    # passing True as second argument keeps a trailing channel axis
    gray_3d = mmcv.bgr2gray(in_img, True)
    assert gray_3d.shape == (10, 10, 1)
    assert_array_almost_equal(gray_3d[..., 0], out_img, decimal=4)


def test_rgb2gray():
    """Compare ``mmcv.rgb2gray`` with a weighted sum of the R, G, B planes."""
    in_img = np.random.rand(10, 10, 3).astype(np.float32)
    out_img = mmcv.rgb2gray(in_img)
    red, green, blue = (in_img[:, :, c] for c in range(3))
    expected = red * 0.299 + green * 0.587 + blue * 0.114
    assert_array_almost_equal(out_img, expected, decimal=4)
    # passing True as second argument keeps a trailing channel axis
    gray_3d = mmcv.rgb2gray(in_img, True)
    assert gray_3d.shape == (10, 10, 1)
    assert_array_almost_equal(gray_3d[..., 0], out_img, decimal=4)


def test_gray2bgr():
    """Every channel produced by ``mmcv.gray2bgr`` must equal the input."""
    in_img = np.random.rand(10, 10).astype(np.float32)
    out_img = mmcv.gray2bgr(in_img)
    assert out_img.shape == (10, 10, 3)
    # iterate over the three channel planes
    for plane in np.moveaxis(out_img, -1, 0):
        assert_array_almost_equal(plane, in_img, decimal=4)


def test_gray2rgb():
    """Every channel produced by ``mmcv.gray2rgb`` must equal the input."""
    in_img = np.random.rand(10, 10).astype(np.float32)
    out_img = mmcv.gray2rgb(in_img)
    assert out_img.shape == (10, 10, 3)
    # iterate over the three channel planes
    for plane in np.moveaxis(out_img, -1, 0):
        assert_array_almost_equal(plane, in_img, decimal=4)


def test_bgr2rgb():
    """``mmcv.bgr2rgb`` must swap the first and last channel."""
    in_img = np.random.rand(10, 10, 3).astype(np.float32)
    out_img = mmcv.bgr2rgb(in_img)
    assert out_img.shape == in_img.shape
    # output channel k must hold input channel 2 - k
    for dst_channel, src_channel in enumerate((2, 1, 0)):
        assert_array_equal(out_img[..., dst_channel],
                           in_img[..., src_channel])


def test_rgb2bgr():
    """``mmcv.rgb2bgr`` must swap the first and last channel."""
    in_img = np.random.rand(10, 10, 3).astype(np.float32)
    out_img = mmcv.rgb2bgr(in_img)
    assert out_img.shape == in_img.shape
    # output channel k must hold input channel 2 - k
    for dst_channel, src_channel in enumerate((2, 1, 0)):
        assert_array_equal(out_img[..., dst_channel],
                           in_img[..., src_channel])


def test_bgr2hsv():
    """Compare ``mmcv.bgr2hsv`` with a per-pixel HSV reference computation.

    The reference hue is expressed in degrees (0-360); the branch used
    depends on which of the B, G, R channels holds the maximum.
    """
    in_img = np.random.rand(10, 10, 3).astype(np.float32)
    out_img = mmcv.bgr2hsv(in_img)
    max_channel = in_img.argmax(axis=2)
    expected = np.empty_like(in_img)
    for row, col in np.ndindex(*in_img.shape[:2]):
        b, g, r = in_img[row, col]
        v = max(r, g, b)
        spread = v - min(r, g, b)
        s = spread / v if v != 0 else 0
        if max_channel[row, col] == 0:
            # blue is the largest channel
            h = 240 + 60 * (r - g) / spread
        elif max_channel[row, col] == 1:
            # green is the largest channel
            h = 120 + 60 * (b - r) / spread
        else:
            # red is the largest channel
            h = 60 * (g - b) / spread
        if h < 0:
            h += 360
        expected[row, col, :] = [h, s, v]
    assert_array_almost_equal(out_img, expected, decimal=2)


def test_convert_input_type_range():
    """Check ``_convert_input_type_range`` for float32 and uint8 inputs.

    Both supported dtypes must come back as float32 with a mean absolute
    value below 1; any other dtype must raise ``TypeError``.
    """
    # The img type should be np.float32 or np.uint8
    with pytest.raises(TypeError):
        bad_img = np.random.rand(10, 10, 3).astype(np.uint64)
        _convert_input_type_range(bad_img)

    # float32 values in [0, 1), then uint8 values in [0, 255)
    for value_scale, dtype in ((1, np.float32), (255, np.uint8)):
        in_img = (np.random.rand(10, 10, 3) * value_scale).astype(dtype)
        out_img = _convert_input_type_range(in_img)
        assert out_img.dtype == np.float32
        assert np.absolute(out_img).mean() < 1


def test_convert_output_type_range():
    """Check ``_convert_output_type_range`` for float32 and uint8 targets.

    A float32 image with values in [0, 255) must come back with a mean
    absolute value below 1 for a float32 target and above 1 for a uint8
    target; any other target dtype must raise ``TypeError``.
    """
    # The dst_type should be np.float32 or np.uint8
    with pytest.raises(TypeError):
        bad_input = np.random.rand(10, 10, 3).astype(np.float32)
        _convert_output_type_range(bad_input, np.uint64)

    # np.float32 target
    wide_range_img = (np.random.rand(10, 10, 3) * 255).astype(np.float32)
    as_float = _convert_output_type_range(wide_range_img, np.float32)
    assert as_float.dtype == np.float32
    assert np.absolute(as_float).mean() < 1

    # np.uint8 target
    wide_range_img = (np.random.rand(10, 10, 3) * 255).astype(np.float32)
    as_uint8 = _convert_output_type_range(wide_range_img, np.uint8)
    assert as_uint8.dtype == np.uint8
    assert np.absolute(as_uint8).mean() > 1


def assert_image_almost_equal(x, y, atol=1):
    """Assert that two uint8 images differ by at most ``atol`` per element.

    Args:
        x (np.ndarray): First image, must have dtype uint8.
        y (np.ndarray): Second image, must have dtype uint8.
        atol (int): Largest allowed absolute difference. Defaults to 1.

    Raises:
        AssertionError: If a dtype is not uint8 or any element differs by
            more than ``atol``.
    """
    assert x.dtype == np.uint8
    assert y.dtype == np.uint8
    # widen to int32 first so the subtraction cannot wrap around
    abs_diff = np.abs(x.astype(np.int32) - y.astype(np.int32))
    assert (abs_diff <= atol).all()


def test_rgb2ycbcr():
    """Compare ``mmcv.rgb2ycbcr`` with a per-pixel reference computation.

    Covers the unsupported-dtype error, float32 and uint8 inputs, and the
    ``y_only=True`` variant for both dtypes.
    """
    # The img type should be np.float32 or np.uint8
    with pytest.raises(TypeError):
        in_img = np.random.rand(10, 10, 3).astype(np.uint64)
        mmcv.rgb2ycbcr(in_img)

    # float32, all three output channels
    in_img = np.random.rand(10, 10, 3).astype(np.float32)
    out_img = mmcv.rgb2ycbcr(in_img)
    expected = np.empty_like(in_img)
    for pos in np.ndindex(*in_img.shape[:2]):
        r, g, b = in_img[pos]
        y_val = 16 + r * 65.481 + g * 128.553 + b * 24.966
        cb_val = 128 - r * 37.797 - g * 74.203 + b * 112.0
        cr_val = 128 + r * 112.0 - g * 93.786 - b * 18.214
        expected[pos] = [y_val, cb_val, cr_val]
    expected /= 255.
    assert_array_almost_equal(out_img, expected, decimal=2)

    # float32, y_only=True (reuses the float32 image from above)
    out_img = mmcv.rgb2ycbcr(in_img, y_only=True)
    expected_y = np.empty_like(out_img, dtype=out_img.dtype)
    for pos in np.ndindex(*in_img.shape[:2]):
        r, g, b = in_img[pos]
        expected_y[pos] = 16 + r * 65.481 + g * 128.553 + b * 24.966
    expected_y /= 255.
    assert_array_almost_equal(out_img, expected_y, decimal=2)

    # uint8, all three output channels
    in_img = (np.random.rand(10, 10, 3) * 255).astype(np.uint8)
    out_img = mmcv.rgb2ycbcr(in_img)
    # allocated from the uint8 image so the reference buffer is uint8 too
    expected = np.empty_like(in_img)
    unit_img = in_img / 255.
    for pos in np.ndindex(*unit_img.shape[:2]):
        r, g, b = unit_img[pos]
        y_val = 16 + r * 65.481 + g * 128.553 + b * 24.966
        cb_val = 128 - r * 37.797 - g * 74.203 + b * 112.0
        cr_val = 128 + r * 112.0 - g * 93.786 - b * 18.214
        expected[pos] = [y_val.round(), cb_val.round(), cr_val.round()]
    assert_image_almost_equal(out_img, expected)

    # uint8, y_only=True (fresh random image)
    in_img = (np.random.rand(10, 10, 3) * 255).astype(np.uint8)
    out_img = mmcv.rgb2ycbcr(in_img, y_only=True)
    expected_y = np.empty_like(out_img, dtype=out_img.dtype)
    unit_img = in_img / 255.
    for pos in np.ndindex(*unit_img.shape[:2]):
        r, g, b = unit_img[pos]
        y_val = 16 + r * 65.481 + g * 128.553 + b * 24.966
        expected_y[pos] = y_val.round()
    assert_image_almost_equal(out_img, expected_y)


def test_bgr2ycbcr():
    """Compare ``mmcv.bgr2ycbcr`` with a per-pixel reference computation.

    Covers float32 and uint8 inputs, each with and without ``y_only=True``.
    Pixels are unpacked in B, G, R order.
    """
    # float32, all three output channels
    in_img = np.random.rand(10, 10, 3).astype(np.float32)
    out_img = mmcv.bgr2ycbcr(in_img)
    expected = np.empty_like(in_img)
    for pos in np.ndindex(*in_img.shape[:2]):
        b, g, r = in_img[pos]
        y_val = 16 + r * 65.481 + g * 128.553 + b * 24.966
        cb_val = 128 - r * 37.797 - g * 74.203 + b * 112.0
        cr_val = 128 + r * 112.0 - g * 93.786 - b * 18.214
        expected[pos] = [y_val, cb_val, cr_val]
    expected /= 255.
    assert_array_almost_equal(out_img, expected, decimal=2)

    # float32, y_only=True (fresh random image)
    in_img = np.random.rand(10, 10, 3).astype(np.float32)
    out_img = mmcv.bgr2ycbcr(in_img, y_only=True)
    expected_y = np.empty_like(out_img, dtype=out_img.dtype)
    for pos in np.ndindex(*in_img.shape[:2]):
        b, g, r = in_img[pos]
        expected_y[pos] = 16 + r * 65.481 + g * 128.553 + b * 24.966
    expected_y /= 255.
    assert_array_almost_equal(out_img, expected_y, decimal=2)

    # uint8, all three output channels
    in_img = (np.random.rand(10, 10, 3) * 255).astype(np.uint8)
    out_img = mmcv.bgr2ycbcr(in_img)
    # allocated from the uint8 image so the reference buffer is uint8 too
    expected = np.empty_like(in_img)
    unit_img = in_img / 255.
    for pos in np.ndindex(*unit_img.shape[:2]):
        b, g, r = unit_img[pos]
        y_val = 16 + r * 65.481 + g * 128.553 + b * 24.966
        cb_val = 128 - r * 37.797 - g * 74.203 + b * 112.0
        cr_val = 128 + r * 112.0 - g * 93.786 - b * 18.214
        expected[pos] = [y_val.round(), cb_val.round(), cr_val.round()]
    assert_image_almost_equal(out_img, expected)

    # uint8, y_only=True (fresh random image)
    in_img = (np.random.rand(10, 10, 3) * 255).astype(np.uint8)
    out_img = mmcv.bgr2ycbcr(in_img, y_only=True)
    expected_y = np.empty_like(out_img, dtype=out_img.dtype)
    unit_img = in_img / 255.
    for pos in np.ndindex(*unit_img.shape[:2]):
        b, g, r = unit_img[pos]
        y_val = 16 + r * 65.481 + g * 128.553 + b * 24.966
        expected_y[pos] = y_val.round()
    assert_image_almost_equal(out_img, expected_y)


def test_ycbcr2rgb():
    """Compare ``mmcv.ycbcr2rgb`` with a per-pixel reference computation.

    Covers the unsupported-dtype error plus float32 and uint8 inputs.
    """
    # The img type should be np.float32 or np.uint8
    with pytest.raises(TypeError):
        in_img = np.random.rand(10, 10, 3).astype(np.uint64)
        mmcv.ycbcr2rgb(in_img)

    # float32
    in_img = np.random.rand(10, 10, 3).astype(np.float32)
    out_img = mmcv.ycbcr2rgb(in_img)
    expected = np.empty_like(in_img)
    # the reference formulas below work on the [0, 255] range
    in_img *= 255.
    for pos in np.ndindex(*in_img.shape[:2]):
        y, cb, cr = in_img[pos]
        red = -222.921 + y * 0.00456621 * 255 + cr * 0.00625893 * 255
        green = 135.576 + y * 0.00456621 * 255 - cb * 0.00153632 * 255 - \
            cr * 0.00318811 * 255
        blue = -276.836 + y * 0.00456621 * 255. + cb * 0.00791071 * 255
        expected[pos] = [red, green, blue]
    expected /= 255.
    assert_array_almost_equal(out_img, expected, decimal=2)

    # uint8
    in_img = (np.random.rand(10, 10, 3) * 255).astype(np.uint8)
    out_img = mmcv.ycbcr2rgb(in_img)
    expected = np.empty_like(in_img)
    for pos in np.ndindex(*in_img.shape[:2]):
        y, cb, cr = in_img[pos]
        red = -222.921 + y * 0.00456621 * 255 + cr * 0.00625893 * 255
        green = 135.576 + y * 0.00456621 * 255 - cb * 0.00153632 * 255 - \
            cr * 0.00318811 * 255
        blue = -276.836 + y * 0.00456621 * 255. + cb * 0.00791071 * 255
        expected[pos] = [red.round(), green.round(), blue.round()]
    assert_image_almost_equal(out_img, expected)


def test_ycbcr2bgr():
    """Compare ``mmcv.ycbcr2bgr`` with a per-pixel reference computation.

    Covers float32 and uint8 inputs; the reference is stored in B, G, R
    order.
    """
    # float32
    in_img = np.random.rand(10, 10, 3).astype(np.float32)
    out_img = mmcv.ycbcr2bgr(in_img)
    expected = np.empty_like(in_img)
    # the reference formulas below work on the [0, 255] range
    in_img *= 255.
    for pos in np.ndindex(*in_img.shape[:2]):
        y, cb, cr = in_img[pos]
        red = -222.921 + y * 0.00456621 * 255 + cr * 0.00625893 * 255
        green = 135.576 + y * 0.00456621 * 255 - cb * 0.00153632 * 255 - \
            cr * 0.00318811 * 255
        blue = -276.836 + y * 0.00456621 * 255. + cb * 0.00791071 * 255
        expected[pos] = [blue, green, red]
    expected /= 255.
    assert_array_almost_equal(out_img, expected, decimal=2)

    # uint8
    in_img = (np.random.rand(10, 10, 3) * 255).astype(np.uint8)
    out_img = mmcv.ycbcr2bgr(in_img)
    expected = np.empty_like(in_img)
    for pos in np.ndindex(*in_img.shape[:2]):
        y, cb, cr = in_img[pos]
        red = -222.921 + y * 0.00456621 * 255 + cr * 0.00625893 * 255
        green = 135.576 + y * 0.00456621 * 255 - cb * 0.00153632 * 255 - \
            cr * 0.00318811 * 255
        blue = -276.836 + y * 0.00456621 * 255. + cb * 0.00791071 * 255
        expected[pos] = [blue.round(), green.round(), red.round()]
    assert_image_almost_equal(out_img, expected)


def test_bgr2hls():
    """Compare ``mmcv.bgr2hls`` with a per-pixel HLS reference computation.

    The reference hue is expressed in degrees (0-360). Pixels whose three
    channels are equal are treated as achromatic (hue and saturation 0)
    instead of running the general formulas, which would divide by zero.
    """
    in_img = np.random.rand(10, 10, 3).astype(np.float32)
    out_img = mmcv.bgr2hls(in_img)
    argmax = in_img.argmax(axis=2)
    computed_hls = np.empty_like(in_img)
    for i in range(in_img.shape[0]):
        for j in range(in_img.shape[1]):
            b, g, r = in_img[i, j]
            maxc = max(r, g, b)
            minc = min(r, g, b)
            _l = (minc + maxc) / 2.0
            if minc == maxc:
                # achromatic pixel: keep h and s at zero; previously these
                # values were unconditionally overwritten below
                h = 0.0
                s = 0.0
            else:
                delta = maxc - minc
                if _l <= 0.5:
                    s = delta / (maxc + minc)
                else:
                    s = delta / (2.0 - maxc - minc)
                if argmax[i, j] == 2:
                    # red is the largest channel
                    h = 60 * (g - b) / delta
                elif argmax[i, j] == 1:
                    # green is the largest channel
                    h = 60 * (2.0 + (b - r) / delta)
                else:
                    # blue is the largest channel
                    h = 60 * (4.0 + (r - g) / delta)
                if h < 0:
                    h += 360
            computed_hls[i, j, :] = [h, _l, s]
    assert_array_almost_equal(out_img, computed_hls, decimal=2)


@pytest.mark.parametrize('src,dst,ref', [
    ('bgr', 'gray', cv2.COLOR_BGR2GRAY),
    ('rgb', 'gray', cv2.COLOR_RGB2GRAY),
    ('bgr', 'rgb', cv2.COLOR_BGR2RGB),
    ('rgb', 'bgr', cv2.COLOR_RGB2BGR),
    ('bgr', 'hsv', cv2.COLOR_BGR2HSV),
    ('hsv', 'bgr', cv2.COLOR_HSV2BGR),
    ('bgr', 'hls', cv2.COLOR_BGR2HLS),
    ('hls', 'bgr', cv2.COLOR_HLS2BGR),
])
def test_imconvert(src, dst, ref):
    """``mmcv.imconvert`` must match ``cv2.cvtColor`` with the given code."""
    img = np.random.rand(10, 10, 3).astype(np.float32)
    converted = mmcv.imconvert(img, src, dst)
    expected = cv2.cvtColor(img, ref)
    assert_array_equal(converted, expected)


================================================
FILE: tests/test_image/test_geometric.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp

import cv2
import numpy as np
import pytest
from numpy.testing import assert_array_equal

import mmcv


class TestGeometric:

    @classmethod
    def setup_class(cls):
        """Load the shared test image once for all tests of the class."""
        this_dir = osp.dirname(__file__)
        cls.data_dir = osp.join(this_dir, '../data')
        # the test img resolution is 400x300
        cls.img_path = osp.join(cls.data_dir, 'color.jpg')
        cls.img = cv2.imread(cls.img_path)

    def test_imresize(self):
        """Check ``mmcv.imresize`` output shapes, scales, ``out`` and backends.
        """
        target_size = (1000, 600)
        expected_shape = (600, 1000, 3)

        resized_img = mmcv.imresize(self.img, target_size)
        assert resized_img.shape == expected_shape

        # third positional argument True additionally returns the scales
        resized_img, w_scale, h_scale = mmcv.imresize(self.img, target_size,
                                                      True)
        assert resized_img.shape == expected_shape
        assert w_scale == 2.5
        assert h_scale == 2.0

        # a preallocated output buffer must be filled and returned as-is
        out_buffer = np.empty(expected_shape, dtype=self.img.dtype)
        resized_img = mmcv.imresize(self.img, target_size, out=out_buffer)
        assert id(out_buffer) == id(resized_img)
        assert_array_equal(out_buffer, mmcv.imresize(self.img, target_size))

        for mode in ['nearest', 'bilinear', 'bicubic', 'area', 'lanczos']:
            resized_img = mmcv.imresize(
                self.img, target_size, interpolation=mode)
            assert resized_img.shape == expected_shape

        # test pillow resize
        pillow_modes = [
            'nearest', 'bilinear', 'bicubic', 'box', 'lanczos', 'hamming'
        ]
        for mode in pillow_modes:
            resized_img = mmcv.imresize(
                self.img, target_size, interpolation=mode, backend='pillow')
            assert resized_img.shape == expected_shape

        # resize backend must be 'cv2' or 'pillow'
        with pytest.raises(ValueError):
            mmcv.imresize(self.img, target_size, backend='not support')

    def test_imresize_to_multiple(self):
        """Check ``mmcv.imresize_to_multiple`` shapes, scales and errors.

        The source image is 400x300 (w x h), which is what the expected
        scale factors are derived from.
        """
        # test size and keep_ratio = False
        for divisor, expected_shape in ((16, (528, 512, 3)),
                                        ((16, 32), (544, 512, 3))):
            resized_img = mmcv.imresize_to_multiple(
                self.img, divisor=divisor, size=(511, 513), keep_ratio=False)
            assert resized_img.shape == expected_shape

        # (keyword arguments, expected output shape) with return_scale=True
        scaled_cases = (
            # test size, keep_ratio = True, and return_scale
            (dict(divisor=16, size=(1000, 600), keep_ratio=True),
             (608, 800, 3)),
            (dict(divisor=(18, 16), size=(1000, 600), keep_ratio=True),
             (608, 810, 3)),
            # test scale_factor and return_scale
            (dict(divisor=16, scale_factor=2), (608, 800, 3)),
            (dict(divisor=16, scale_factor=(2, 3)), (912, 800, 3)),
            (dict(divisor=(18, 16), scale_factor=(2, 3)), (912, 810, 3)),
        )
        for kwargs, expected_shape in scaled_cases:
            resized_img, w_scale, h_scale = mmcv.imresize_to_multiple(
                self.img, return_scale=True, **kwargs)
            out_h, out_w = expected_shape[:2]
            assert resized_img.shape == expected_shape
            assert h_scale == out_h / 300
            assert w_scale == out_w / 400

        # one of size and scale_factor should be given
        for bad_kwargs in (dict(size=(1000, 600), scale_factor=2),
                           dict(size=None, scale_factor=None)):
            with pytest.raises(ValueError):
                mmcv.imresize_to_multiple(self.img, divisor=16, **bad_kwargs)

    def test_imresize_like(self):
        """``mmcv.imresize_like`` must adopt the target image's height/width.
        """
        target = np.zeros((100, 200, 3))
        assert mmcv.imresize_like(self.img, target).shape == (100, 200, 3)

    def test_rescale_size(self):
        """Check ``mmcv.rescale_size`` for factor scales, max sizes and errors.

        All cases start from the size ``(400, 300)``.
        """
        # rescale by a factor, also returning the scale factor
        new_size, scale_factor = mmcv.rescale_size((400, 300), 1.5, True)
        assert new_size == (600, 450) and scale_factor == 1.5
        new_size, scale_factor = mmcv.rescale_size((400, 300), 0.934, True)
        assert new_size == (374, 280) and scale_factor == 0.934

        # rescale by a factor, size only
        new_size = mmcv.rescale_size((400, 300), 1.5)
        assert new_size == (600, 450)
        new_size = mmcv.rescale_size((400, 300), 0.934)
        assert new_size == (374, 280)

        # rescale to fit a tuple scale, also returning the scale factor
        new_size, scale_factor = mmcv.rescale_size((400, 300), (1000, 600),
                                                   True)
        assert new_size == (800, 600) and scale_factor == 2.0
        new_size, scale_factor = mmcv.rescale_size((400, 300), (180, 200),
                                                   True)
        assert new_size == (200, 150) and scale_factor == 0.5

        # rescale to fit a tuple scale, size only
        new_size = mmcv.rescale_size((400, 300), (1000, 600))
        assert new_size == (800, 600)
        new_size = mmcv.rescale_size((400, 300), (180, 200))
        assert new_size == (200, 150)

        # a negative factor is rejected
        with pytest.raises(ValueError):
            mmcv.rescale_size((400, 300), -0.5)
        # a list scale is rejected; the call must pass the arguments to
        # rescale_size itself so the TypeError comes from the scale check
        # and not from calling the function without arguments
        with pytest.raises(TypeError):
            mmcv.rescale_size((400, 300), [100, 100])

    def test_imrescale(self):
        """Check ``mmcv.imrescale`` for factor scales, max sizes and errors."""
        # rescale by a certain factor
        for factor, expected_shape in ((1.5, (450, 600, 3)),
                                       (0.934, (280, 374, 3))):
            assert mmcv.imrescale(self.img, factor).shape == expected_shape

        # rescale by a certain max_size:
        # resize (400, 300) to (max_1000, max_600), then to (max_200, max_180)
        max_size_cases = (
            ((1000, 600), (600, 800, 3), 2.0),
            ((180, 200), (150, 200, 3), 0.5),
        )
        for max_size, expected_shape, expected_scale in max_size_cases:
            resized_img = mmcv.imrescale(self.img, max_size)
            assert resized_img.shape == expected_shape
            resized_img, scale = mmcv.imrescale(
                self.img, max_size, return_scale=True)
            assert resized_img.shape == expected_shape
            assert scale == expected_scale

        # test exceptions
        with pytest.raises(ValueError):
            mmcv.imrescale(self.img, -0.5)
        with pytest.raises(TypeError):
            mmcv.imrescale(self.img, [100, 100])

    def test_imflip(self):
        """Check ``mmcv.imflip`` for all directions on color and gray images.

        Each flipped result is compared in one vectorized call against the
        input with the corresponding axes reversed, instead of asserting
        element by element in nested Python loops.
        """
        # direction must be "horizontal" or "vertical" or "diagonal"
        with pytest.raises(AssertionError):
            mmcv.imflip(np.random.rand(80, 60, 3), direction='random')

        # color image first, then grayscale image
        for shape in ((80, 60, 3), (80, 60)):
            img = np.random.rand(*shape)

            # test horizontal flip (the call without a direction argument):
            # columns are reversed
            flipped_img = mmcv.imflip(img)
            assert flipped_img.shape == img.shape
            assert_array_equal(flipped_img, img[:, ::-1])

            # test vertical flip: rows are reversed
            flipped_img = mmcv.imflip(img, direction='vertical')
            assert flipped_img.shape == img.shape
            assert_array_equal(flipped_img, img[::-1])

            # test diagonal flip: rows and columns are reversed
            flipped_img = mmcv.imflip(img, direction='diagonal')
            assert flipped_img.shape == img.shape
            assert_array_equal(flipped_img, img[::-1, ::-1])

    def test_imflip_(self):
        """Check the in-place ``mmcv.imflip_`` on color and gray images.

        For every direction the returned array must be the very object that
        was passed in, and its content must equal the untouched original
        with the corresponding axes reversed. Comparisons are vectorized
        instead of asserting element by element in nested Python loops.
        """
        # direction must be "horizontal" or "vertical" or "diagonal"
        with pytest.raises(AssertionError):
            mmcv.imflip_(np.random.rand(80, 60, 3), direction='random')

        # color image first, then grayscale image
        for shape in ((80, 60, 3), (80, 60)):
            img = np.random.rand(*shape)

            # test horizontal flip (the call without a direction argument):
            # columns are reversed
            img_for_flip = img.copy()
            flipped_img = mmcv.imflip_(img_for_flip)
            assert flipped_img.shape == img.shape
            assert flipped_img is img_for_flip
            assert_array_equal(img_for_flip, img[:, ::-1])

            # test vertical flip: rows are reversed
            img_for_flip = img.copy()
            flipped_img = mmcv.imflip_(img_for_flip, direction='vertical')
            assert flipped_img.shape == img.shape
            assert flipped_img is img_for_flip
            assert_array_equal(img_for_flip, img[::-1])

            # test diagonal flip: rows and columns are reversed
            img_for_flip = img.copy()
            flipped_img = mmcv.imflip_(img_for_flip, direction='diagonal')
            assert flipped_img.shape == img.shape
            assert flipped_img is img_for_flip
            assert_array_equal(img_for_flip, img[::-1, ::-1])

    def test_imcrop(self):
        """Compare ``mmcv.imcrop`` results with stored reference patches.

        Reference arrays are loaded from the ``patches`` directory under
        ``self.data_dir``.
        """
        # yapf: disable
        bboxes = np.array([[100, 100, 199, 199],  # center
                           [0, 0, 150, 100],  # left-top corner
                           [250, 200, 399, 299],  # right-bottom corner
                           [0, 100, 399, 199],  # wide
                           [150, 0, 299, 299]])  # tall
        # yapf: enable

        # crop one bbox: a 1-D bbox gives an array, a (1, 4) bbox gives a list
        single_patch = mmcv.imcrop(self.img, bboxes[0, :])
        patches = mmcv.imcrop(self.img, bboxes[[0], :])
        assert single_patch.shape == (100, 100, 3)
        patch_path = osp.join(self.data_dir, 'patches')
        first_ref = np.load(patch_path + '/0.npy')
        assert_array_equal(single_patch, first_ref)
        assert isinstance(patches, list)
        assert len(patches) == 1
        assert_array_equal(patches[0], first_ref)

        # crop with no scaling and padding
        patches = mmcv.imcrop(self.img, bboxes)
        assert len(patches) == bboxes.shape[0]
        for i, cropped in enumerate(patches):
            assert_array_equal(cropped, np.load(patch_path + f'/{i}.npy'))

        # crop with scaling and no padding
        patches = mmcv.imcrop(self.img, bboxes, 1.2)
        for i, cropped in enumerate(patches):
            assert_array_equal(cropped,
                               np.load(patch_path + f'/scale_{i}.npy'))

        # crop with scaling and padding
        patches = mmcv.imcrop(self.img, bboxes, 1.2, pad_fill=[255, 255, 0])
        for i, cropped in enumerate(patches):
            assert_array_equal(cropped, np.load(patch_path + f'/pad_{i}.npy'))
        patches = mmcv.imcrop(self.img, bboxes, 1.2, pad_fill=0)
        for i, cropped in enumerate(patches):
            assert_array_equal(cropped,
                               np.load(patch_path + f'/pad0_{i}.npy'))

    def test_impad(self):
        """Exercise ``mmcv.impad`` with explicit paddings, target shapes,
        per-channel pad values, the four padding modes and invalid arguments.

        The sections run in order and later ones reuse ``img`` and ``pad_val``
        left over from earlier ones, so statements must not be reordered.
        """
        # grayscale image
        img = np.random.rand(10, 10).astype(np.float32)
        # the assertions below show (0, 0, 2, 5) adds 2 columns on the right
        # and 5 rows at the bottom
        padded_img = mmcv.impad(img, padding=(0, 0, 2, 5), pad_val=0)
        assert_array_equal(img, padded_img[:10, :10])
        assert_array_equal(
            np.zeros((5, 12), dtype='float32'), padded_img[10:, :])
        assert_array_equal(
            np.zeros((15, 2), dtype='float32'), padded_img[:, 10:])

        # RGB image
        img = np.random.rand(10, 10, 3).astype(np.float32)
        padded_img = mmcv.impad(img, padding=(0, 0, 2, 5), pad_val=0)
        assert_array_equal(img, padded_img[:10, :10, :])
        assert_array_equal(
            np.zeros((5, 12, 3), dtype='float32'), padded_img[10:, :, :])
        assert_array_equal(
            np.zeros((15, 2, 3), dtype='float32'), padded_img[:, 10:, :])

        # RGB image with different values for three channels.
        img = np.random.randint(256, size=(10, 10, 3)).astype('uint8')
        padded_img = mmcv.impad(
            img, padding=(0, 0, 2, 5), pad_val=(100, 110, 120))
        assert_array_equal(img, padded_img[:10, :10, :])
        # broadcasting the per-channel values over an all-ones block yields
        # the expected padded region
        assert_array_equal(
            np.array([100, 110, 120], dtype='uint8') * np.ones(
                (5, 12, 3), dtype='uint8'), padded_img[10:, :, :])
        assert_array_equal(
            np.array([100, 110, 120], dtype='uint8') * np.ones(
                (15, 2, 3), dtype='uint8'), padded_img[:, 10:, :])

        # Pad the grayscale image to shape (15, 12)
        img = np.random.rand(10, 10).astype(np.float32)
        padded_img = mmcv.impad(img, shape=(15, 12))
        assert_array_equal(img, padded_img[:10, :10])
        assert_array_equal(
            np.zeros((5, 12), dtype='float32'), padded_img[10:, :])
        assert_array_equal(
            np.zeros((15, 2), dtype='float32'), padded_img[:, 10:])

        # Pad the RGB image to shape (15, 12)
        img = np.random.rand(10, 10, 3).astype(np.float32)
        padded_img = mmcv.impad(img, shape=(15, 12))
        assert_array_equal(img, padded_img[:10, :10, :])
        assert_array_equal(
            np.zeros((5, 12, 3), dtype='float32'), padded_img[10:, :, :])
        assert_array_equal(
            np.zeros((15, 2, 3), dtype='float32'), padded_img[:, 10:, :])

        # Pad the RGB image to shape (15, 12) with different values for
        # three channels.
        img = np.random.randint(256, size=(10, 10, 3)).astype('uint8')
        padded_img = mmcv.impad(img, shape=(15, 12), pad_val=(100, 110, 120))
        assert_array_equal(img, padded_img[:10, :10, :])
        assert_array_equal(
            np.array([100, 110, 120], dtype='uint8') * np.ones(
                (5, 12, 3), dtype='uint8'), padded_img[10:, :, :])
        assert_array_equal(
            np.array([100, 110, 120], dtype='uint8') * np.ones(
                (15, 2, 3), dtype='uint8'), padded_img[:, 10:, :])

        # RGB image with padding=[5, 2]
        img = np.random.rand(10, 10, 3).astype(np.float32)
        padded_img = mmcv.impad(img, padding=(5, 2), pad_val=0)

        # a 2-tuple pads 5 columns on both sides and 2 rows on top and
        # bottom; the four corner blocks must all be zero
        assert padded_img.shape == (14, 20, 3)
        assert_array_equal(img, padded_img[2:12, 5:15, :])
        assert_array_equal(
            np.zeros((2, 5, 3), dtype='float32'), padded_img[:2, :5, :])
        assert_array_equal(
            np.zeros((2, 5, 3), dtype='float32'), padded_img[12:, :5, :])
        assert_array_equal(
            np.zeros((2, 5, 3), dtype='float32'), padded_img[:2, 15:, :])
        assert_array_equal(
            np.zeros((2, 5, 3), dtype='float32'), padded_img[12:, 15:, :])

        # RGB image with type(pad_val) = tuple
        pad_val = (0, 1, 2)
        img = np.random.rand(10, 10, 3).astype(np.float32)
        padded_img = mmcv.impad(img, padding=(0, 0, 5, 2), pad_val=pad_val)

        assert padded_img.shape == (12, 15, 3)
        assert_array_equal(img, padded_img[:10, :10, :])
        # bottom strip, checked one channel at a time
        assert_array_equal(pad_val[0] * np.ones((2, 15, 1), dtype='float32'),
                           padded_img[10:, :, 0:1])
        assert_array_equal(pad_val[1] * np.ones((2, 15, 1), dtype='float32'),
                           padded_img[10:, :, 1:2])
        assert_array_equal(pad_val[2] * np.ones((2, 15, 1), dtype='float32'),
                           padded_img[10:, :, 2:3])

        # right strip, checked one channel at a time
        assert_array_equal(pad_val[0] * np.ones((12, 5, 1), dtype='float32'),
                           padded_img[:, 10:, 0:1])
        assert_array_equal(pad_val[1] * np.ones((12, 5, 1), dtype='float32'),
                           padded_img[:, 10:, 1:2])
        assert_array_equal(pad_val[2] * np.ones((12, 5, 1), dtype='float32'),
                           padded_img[:, 10:, 2:3])

        # test different padding mode with channel number = 3
        # (only the output shape is checked, not the padded values)
        for mode in ['constant', 'edge', 'reflect', 'symmetric']:
            img = np.random.rand(10, 10, 3).astype(np.float32)
            padded_img = mmcv.impad(
                img, padding=(0, 0, 5, 2), pad_val=pad_val, padding_mode=mode)
            assert padded_img.shape == (12, 15, 3)

        # test different padding mode with channel number = 1
        for mode in ['constant', 'edge', 'reflect', 'symmetric']:
            img = np.random.rand(10, 10).astype(np.float32)
            padded_img = mmcv.impad(
                img, padding=(0, 0, 5, 2), pad_val=0, padding_mode=mode)
            assert padded_img.shape == (12, 15)

        # Padding must be an int or a 2- or 4-element tuple.
        # ``img`` is still the 2-D image from the loop above.
        with pytest.raises(ValueError):
            mmcv.impad(img, padding=(1, 1, 1))

        # pad_val must be an int or a tuple
        with pytest.raises(TypeError):
            mmcv.impad(img, padding=(1, 1, 1, 1), pad_val='wrong')

        # When pad_val is a tuple,
        # len(pad_val) should be equal to img.shape[-1]
        img = np.random.rand(10, 10, 3).astype(np.float32)
        with pytest.raises(AssertionError):
            mmcv.impad(img, padding=3, pad_val=(100, 200))

        # an unknown padding mode is rejected
        with pytest.raises(AssertionError):
            mmcv.impad(img, padding=2, pad_val=0, padding_mode='unknown')

        # ``shape`` and ``padding`` cannot be given together
        with pytest.raises(AssertionError):
            mmcv.impad(img, shape=(12, 15), padding=(0, 0, 5, 2))

        # Pad shape smaller than image shape
        padded_img = mmcv.impad(img, shape=(8, 8))
        assert padded_img.shape == (10, 10, 3)

    def test_impad_to_multiple(self):
        """Check the output shape of ``mmcv.impad_to_multiple``.

        Each case is ``(input shape, divisor, expected padded shape)``; the
        last one is already divisible and must keep its shape.
        """
        cases = (
            ((11, 14, 3), 4, (12, 16, 3)),
            ((20, 12), 5, (20, 15)),
            ((20, 12), 2, (20, 12)),
        )
        for in_shape, divisor, out_shape in cases:
            img = np.random.rand(*in_shape).astype(np.float32)
            padded_img = mmcv.impad_to_multiple(img, divisor)
            assert padded_img.shape == out_shape

    def test_cutout(self):
        """Validate argument checking and the erased region of ``mmcv.cutout``.

        The expected patch positions depend on NumPy's global RNG, which is
        why the seed is reset before each group of position-sensitive calls.
        """

        def as_uint8(rows):
            return np.array(rows).astype(np.uint8)

        img = as_uint8([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

        # shape must be int or tuple
        with pytest.raises(AssertionError):
            mmcv.cutout(img, 2.5)
        # pad_val must be int or float or tuple with the same length
        # of img channels
        with pytest.raises(AssertionError):
            mmcv.cutout(img, 1, (1, 2, 3))
        with pytest.raises(TypeError):
            mmcv.cutout(img, 1, None)

        # a patch larger than the image wipes everything
        assert_array_equal(mmcv.cutout(img, 6), np.zeros_like(img))
        # a zero-sized patch leaves the image untouched
        assert_array_equal(mmcv.cutout(img, 0), img)

        # integer shape: a single pixel is replaced
        np.random.seed(0)
        expected = as_uint8([[1, 2, 3], [4, 0, 6], [7, 8, 9]])
        assert_array_equal(mmcv.cutout(img, 1), expected)
        expected = as_uint8([[1, 2, 3], [4, 10, 6], [7, 8, 9]])
        assert_array_equal(mmcv.cutout(img, 1, pad_val=10), expected)

        # tuple shape: a 1x2 patch is replaced
        np.random.seed(0)
        expected = as_uint8([[1, 2, 3], [0, 0, 6], [7, 8, 9]])
        assert_array_equal(mmcv.cutout(img, (1, 2)), expected)
        expected = as_uint8([[1, 2, 3], [10, 10, 6], [7, 8, 9]])
        assert_array_equal(mmcv.cutout(img, (1, 2), pad_val=10), expected)

    def test_imrotate(self):
        """Compare ``mmcv.imrotate`` against hand-written rotated arrays for
        a square and a non-square image."""
        square = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).astype(np.uint8)
        assert_array_equal(mmcv.imrotate(square, 0), square)
        square_cases = (
            (90, [[7, 4, 1], [8, 5, 2], [9, 6, 3]]),
            (-90, [[3, 6, 9], [2, 5, 8], [1, 4, 7]]),
        )
        for angle, expected in square_cases:
            assert_array_equal(
                mmcv.imrotate(square, angle), np.array(expected))

        # 2x4 image rotated by 90 degrees under different keyword options
        wide = np.array([[1, 2, 3, 4], [5, 6, 7, 8]]).astype(np.uint8)
        wide_cases = (
            (dict(), [[0, 6, 2, 0], [0, 7, 3, 0]]),
            (dict(center=(0, 0)), [[1, 0, 0, 0], [2, 0, 0, 0]]),
            (dict(border_value=255), [[255, 6, 2, 255], [255, 7, 3, 255]]),
            (dict(auto_bound=True), [[5, 1], [6, 2], [7, 3], [8, 4]]),
            (dict(border_mode='replicate'), [[6, 6, 2, 2], [7, 7, 3, 3]]),
        )
        for kwargs, expected in wide_cases:
            assert_array_equal(
                mmcv.imrotate(wide, 90, **kwargs), np.array(expected))

        # a custom center combined with auto_bound is rejected
        with pytest.raises(ValueError):
            mmcv.imrotate(wide, 90, center=(0, 0), auto_bound=True)

    def test_imshear(self):
        """Check ``mmcv.imshear`` on a 3x3 image in both directions, with
        scalar and tuple border values, and with invalid arguments."""
        gray = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).astype(np.uint8)
        assert_array_equal(mmcv.imshear(gray, 0), gray)

        # magnitude=1, horizontal
        expected = np.array([[1, 2, 3], [0, 4, 5], [0, 0, 7]], dtype=np.uint8)
        assert_array_equal(mmcv.imshear(gray, 1), expected)

        # magnitude=-1, vertical
        expected = np.array([[1, 5, 9], [4, 8, 0], [7, 0, 0]], dtype=np.uint8)
        assert_array_equal(mmcv.imshear(gray, -1, 'vertical'), expected)

        # magnitude=1, vertical, border value 100
        fill = 100
        expected = np.array([[1, fill, fill], [4, 2, fill], [7, 5, 3]],
                            dtype=np.uint8)
        assert_array_equal(mmcv.imshear(gray, 1, 'vertical', fill), expected)

        # same shear on an (h, w, 3) image built by repeating the channel
        color = np.stack([gray] * 3, axis=-1)
        expected_color = np.stack([expected] * 3, axis=-1)
        assert_array_equal(
            mmcv.imshear(color, 1, 'vertical', fill), expected_color)
        # the border value may also be given as one entry per channel
        assert_array_equal(
            mmcv.imshear(color, 1, 'vertical', (fill, fill, fill)),
            expected_color)

        # a tuple whose length differs from the channel count is rejected
        with pytest.raises(AssertionError):
            mmcv.imshear(color, 0.5, 'horizontal', (fill, ))

        # a list is not an accepted border value type
        with pytest.raises(ValueError):
            mmcv.imshear(color, 0.5, 'horizontal', [fill])

        # only the supported directions are accepted
        with pytest.raises(AssertionError):
            mmcv.imshear(color, 0.5, 'diagonal')

    def test_imtranslate(self):
        """Check ``mmcv.imtranslate`` for both directions, scalar and
        per-channel border values, and invalid arguments."""
        gray = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.uint8)
        assert_array_equal(mmcv.imtranslate(gray, 0), gray)

        # offset=1, horizontal
        expected = np.array([[128, 1, 2], [128, 4, 5], [128, 7, 8]],
                            dtype=np.uint8)
        shifted = mmcv.imtranslate(gray, 1, border_value=128)
        assert_array_equal(shifted, expected)

        # offset=-1, vertical
        expected = np.array([[4, 5, 6], [7, 8, 9], [0, 0, 0]], dtype=np.uint8)
        shifted = mmcv.imtranslate(gray, -1, 'vertical')
        assert_array_equal(shifted, expected)

        # offset=-2, horizontal, on a 2x4 image with three equal channels
        plane = np.array([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=np.uint8)
        color = np.stack([plane] * 3, axis=-1)
        expected_plane = [[3, 4, 128, 128], [7, 8, 128, 128]]
        expected = np.stack([expected_plane] * 3, axis=-1)
        shifted = mmcv.imtranslate(color, -2, border_value=128)
        assert_array_equal(shifted, expected)

        # offset=2, vertical: the whole image is shifted out, leaving only
        # the per-channel border value
        border_value = (110, 120, 130)
        expected = np.stack(
            [np.ones((2, 4)) * channel_fill for channel_fill in border_value],
            axis=-1).astype(np.uint8)
        shifted = mmcv.imtranslate(color, 2, 'vertical', border_value)
        assert_array_equal(shifted, expected)

        # test invalid number elements in border_value
        with pytest.raises(AssertionError):
            mmcv.imtranslate(color, 1, border_value=(1, ))
        # test invalid type of border_value
        with pytest.raises(ValueError):
            mmcv.imtranslate(color, 1, border_value=[1, 2, 3])
        # test invalid value of direction
        with pytest.raises(AssertionError):
            mmcv.imtranslate(color, 1, 'diagonal')


================================================
FILE: tests/test_image/test_image_misc.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
from numpy.testing import assert_array_equal

import mmcv

# torch is optional for this module: when the import fails the name is bound
# to None so that the skipif marker on the test below can disable it.
try:
    import torch
except ImportError:
    torch = None


@pytest.mark.skipif(torch is None, reason='requires torch library')
def test_tensor2imgs():
    """Check input validation and channel handling of ``mmcv.tensor2imgs``.

    Every invalid input gets its own ``pytest.raises`` block: a block is left
    as soon as the first statement raises, so a second call placed in the same
    block would never be executed.
    """

    # test tensor obj: a numpy array is not accepted
    array = np.random.rand(2, 3, 3)
    with pytest.raises(AssertionError):
        mmcv.tensor2imgs(array)

    # test tensor ndim
    tensor = torch.randn(2, 3, 3)
    with pytest.raises(AssertionError):
        mmcv.tensor2imgs(tensor)

    # test tensor dim-1: a channel dimension of 4 is rejected
    tensor = torch.randn(2, 4, 3, 3)
    with pytest.raises(AssertionError):
        mmcv.tensor2imgs(tensor)

    # test mean length, 3-channel tensor with a 1-element mean
    tensor = torch.randn(2, 3, 5, 5)
    with pytest.raises(AssertionError):
        mmcv.tensor2imgs(tensor, mean=(1, ))
    # test mean length, 1-channel tensor with a 3-element mean
    tensor = torch.randn(2, 1, 5, 5)
    with pytest.raises(AssertionError):
        mmcv.tensor2imgs(tensor, mean=(0, 0, 0))

    # test std length, 3-channel tensor with a 1-element std
    tensor = torch.randn(2, 3, 5, 5)
    with pytest.raises(AssertionError):
        mmcv.tensor2imgs(tensor, std=(1, ))
    # test std length, 1-channel tensor with a 3-element std
    tensor = torch.randn(2, 1, 5, 5)
    with pytest.raises(AssertionError):
        mmcv.tensor2imgs(tensor, std=(1, 1, 1))

    # test to_rgb: channel swapping is rejected for a 1-channel tensor
    tensor = torch.randn(2, 1, 5, 5)
    with pytest.raises(AssertionError):
        mmcv.tensor2imgs(tensor, mean=(0, ), std=(1, ), to_rgb=True)

    # test rgb=True: the reference reverses the channel dimension
    tensor = torch.randn(2, 3, 5, 5)
    gts = [
        t.cpu().numpy().transpose(1, 2, 0).astype(np.uint8)
        for t in tensor.flip(1)
    ]
    outputs = mmcv.tensor2imgs(tensor, to_rgb=True)
    for gt, output in zip(gts, outputs):
        assert_array_equal(gt, output)

    # test rgb=False: channels keep their order
    tensor = torch.randn(2, 3, 5, 5)
    gts = [t.cpu().numpy().transpose(1, 2, 0).astype(np.uint8) for t in tensor]
    outputs = mmcv.tensor2imgs(tensor, to_rgb=False)
    for gt, output in zip(gts, outputs):
        assert_array_equal(gt, output)

    # test tensor channel 1 and rgb=False: the channel axis is squeezed away
    tensor = torch.randn(2, 1, 5, 5)
    gts = [t.squeeze(0).cpu().numpy().astype(np.uint8) for t in tensor]
    outputs = mmcv.tensor2imgs(tensor, to_rgb=False)
    for gt, output in zip(gts, outputs):
        assert_array_equal(gt, output)


================================================
FILE: tests/test_image/test_io.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import os
import os.path as osp
import sys
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, patch

import cv2
import mmengine
import numpy as np
import pytest
import torch
from mmengine.fileio.file_client import HTTPBackend, PetrelBackend
from numpy.testing import assert_allclose, assert_array_equal

import mmcv

# Skip every test in this module when torch reports the 'parrots' version
# string; the skip happens at import time, before any test is collected.
if torch.__version__ == 'parrots':
    pytest.skip('not necessary in parrots test', allow_module_level=True)


class TestIO:

    @classmethod
    def setup_class(cls):
        """Resolve fixture paths, load the reference image and register
        stand-in ``petrel_client`` modules for the Petrel backend tests."""
        data_dir = osp.join(osp.dirname(__file__), '../data')
        cls.data_dir = data_dir

        def fixture(filename):
            return osp.join(data_dir, filename)

        # the test img resolution is 400x300
        cls.img_path = fixture('color.jpg')
        cls.gray_img_path = fixture('grayscale.jpg')
        cls.gray_img_dim3_path = fixture('grayscale_dim3.jpg')
        cls.gray_alpha_img_path = fixture('gray_alpha.png')
        cls.palette_img_path = fixture('palette.gif')
        cls.exif_img_path = fixture('color_exif.jpg')
        cls.tiff_path = fixture('uint16-5channel.tif')

        # pathlib variants of the two most used fixtures
        cls.img_path_obj = Path(cls.img_path)
        cls.gray_img_path_obj = Path(cls.gray_img_path)

        cls.img = cv2.imread(cls.img_path)

        # petrel s3 path
        cls.s3_path = 's3://path/of/your/file.jpg'
        # http path
        cls.http_path = 'http://path/of/your/file.jpg'

        # add mock package
        for module_name in ('petrel_client', 'petrel_client.client'):
            sys.modules[module_name] = MagicMock()

    @classmethod
    def teardown_class(cls):
        """Undo the process-wide state this test class touched.

        Clears the cached ``mmengine.FileClient`` instances and removes the
        ``MagicMock`` modules that ``setup_class`` registered in
        ``sys.modules``, so later test modules see neither.
        """
        # reset cached clients so they cannot influence other unit tests
        mmengine.FileClient._instances = {}
        # only delete entries that are still mocks; a real module registered
        # under the same name is left untouched
        for module_name in ('petrel_client', 'petrel_client.client'):
            if isinstance(sys.modules.get(module_name), MagicMock):
                del sys.modules[module_name]

    def assert_img_equal(self, img, ref_img, ratio_thr=0.999):
        """Assert that two images agree up to a tolerance of one level.

        Shapes and dtypes must match exactly. The number of elements whose
        absolute difference is at most 1, divided by ``height * width`` of
        ``ref_img``, must exceed ``ratio_thr``.

        NOTE(review): the count runs over every element (channels included)
        while the divisor is only ``height * width``, so the ratio can exceed
        1 for multi-channel images - confirm this leniency is intended.
        """
        assert img.shape == ref_img.shape
        assert img.dtype == ref_img.dtype
        # widen to int32 first so unsigned inputs cannot wrap on subtraction
        abs_diff = np.abs(img.astype('int32') - ref_img.astype('int32'))
        num_close = np.sum(abs_diff <= 1)
        num_pixels = ref_img.shape[0] * ref_img.shape[1]
        assert num_close / float(num_pixels) > ratio_thr

    def test_imread(self):
        """Read the fixture images through every supported backend.

        Covers string and ``Path`` inputs, ndarray pass-through, the mocked
        Petrel and HTTP file backends, the per-call ``backend`` argument, the
        global backend switch, invalid arguments and EXIF orientation
        handling. The test switches the global backend several times, so the
        sections must stay in this order.
        """
        # backend cv2
        mmcv.use_backend('cv2')

        # file_client_args and backend_args can not be both set
        with pytest.raises(
                ValueError,
                match='"file_client_args" and "backend_args" cannot be set'):
            mmcv.imread(
                self.img_path,
                file_client_args={'backend': 'disk'},
                backend_args={'backend': 'disk'})

        # HardDiskBackend
        img_cv2_color_bgr = mmcv.imread(self.img_path)
        assert img_cv2_color_bgr.shape == (300, 400, 3)
        img_cv2_color_rgb = mmcv.imread(self.img_path, channel_order='rgb')
        assert img_cv2_color_rgb.shape == (300, 400, 3)
        assert_array_equal(img_cv2_color_rgb[:, :, ::-1], img_cv2_color_bgr)
        img_cv2_grayscale1 = mmcv.imread(self.img_path, 'grayscale')
        assert img_cv2_grayscale1.shape == (300, 400)
        img_cv2_grayscale2 = mmcv.imread(self.gray_img_path)
        assert img_cv2_grayscale2.shape == (300, 400, 3)
        img_cv2_unchanged = mmcv.imread(self.gray_img_path, 'unchanged')
        assert img_cv2_unchanged.shape == (300, 400)
        # an ndarray input is accepted as well
        img_cv2_unchanged = mmcv.imread(img_cv2_unchanged)
        assert_array_equal(img_cv2_unchanged, mmcv.imread(img_cv2_unchanged))

        # same checks with pathlib.Path inputs
        img_cv2_color_bgr = mmcv.imread(self.img_path_obj)
        assert img_cv2_color_bgr.shape == (300, 400, 3)
        img_cv2_color_rgb = mmcv.imread(self.img_path_obj, channel_order='rgb')
        assert img_cv2_color_rgb.shape == (300, 400, 3)
        assert_array_equal(img_cv2_color_rgb[:, :, ::-1], img_cv2_color_bgr)
        img_cv2_grayscale1 = mmcv.imread(self.img_path_obj, 'grayscale')
        assert img_cv2_grayscale1.shape == (300, 400)
        img_cv2_grayscale2 = mmcv.imread(self.gray_img_path_obj)
        assert img_cv2_grayscale2.shape == (300, 400, 3)
        img_cv2_unchanged = mmcv.imread(self.gray_img_path_obj, 'unchanged')
        assert img_cv2_unchanged.shape == (300, 400)
        with pytest.raises(TypeError):
            mmcv.imread(1)

        # PetrelBackend
        img_cv2_color_bgr = mmcv.imread(self.img_path)
        with patch.object(
                PetrelBackend, 'get',
                return_value=img_cv2_color_bgr) as mock_method:
            img_cv2_color_bgr_petrel = mmcv.imread(self.s3_path, backend='cv2')
            img_cv2_color_bgr_petrel_with_args = mmcv.imread(
                self.s3_path,
                backend='cv2',
                file_client_args={'backend': 'petrel'})
            mock_method.assert_called()
            assert_array_equal(img_cv2_color_bgr_petrel,
                               img_cv2_color_bgr_petrel_with_args)

            mock_method.reset_mock()

            img_cv2_color_bgr_petrel_with_args = mmcv.imread(
                self.s3_path,
                backend='cv2',
                backend_args={'backend': 'petrel'})
            mock_method.assert_called()
            assert_array_equal(img_cv2_color_bgr_petrel,
                               img_cv2_color_bgr_petrel_with_args)

        # HTTPBackend
        img_cv2_color_bgr = mmcv.imread(self.img_path)
        with patch.object(
                HTTPBackend, 'get',
                return_value=img_cv2_color_bgr) as mock_method:
            img_cv2_color_bgr_http = mmcv.imread(self.http_path, backend='cv2')
            img_cv2_color_bgr_http_with_args = mmcv.imread(
                self.http_path,
                backend='cv2',
                file_client_args={'backend': 'http'})
            mock_method.assert_called()
            assert_array_equal(img_cv2_color_bgr_http,
                               img_cv2_color_bgr_http_with_args)

            mock_method.reset_mock()

            img_cv2_color_bgr_http_with_args = mmcv.imread(
                self.http_path,
                backend='cv2',
                backend_args={'backend': 'http'})
            mock_method.assert_called()
            assert_array_equal(img_cv2_color_bgr_http,
                               img_cv2_color_bgr_http_with_args)

        with pytest.raises(FileNotFoundError):
            mmcv.imread('/not/exists/' + self.img_path)

        # test arg backend pillow
        img_pil_gray_alpha = mmcv.imread(
            self.gray_alpha_img_path, 'grayscale', backend='pillow')
        assert img_pil_gray_alpha.shape == (400, 500)
        # the bottom-right region must be uniform: every value equals the mean
        mean = img_pil_gray_alpha[300:, 400:].mean()
        assert_allclose(img_pil_gray_alpha[300:, 400:] - mean, 0)
        img_pil_gray_alpha = mmcv.imread(
            self.gray_alpha_img_path, backend='pillow')
        mean = img_pil_gray_alpha[300:, 400:].mean(axis=(0, 1))
        assert_allclose(img_pil_gray_alpha[300:, 400:] - mean, 0)
        assert img_pil_gray_alpha.shape == (400, 500, 3)
        img_pil_gray_alpha = mmcv.imread(
            self.gray_alpha_img_path, 'unchanged', backend='pillow')
        assert img_pil_gray_alpha.shape == (400, 500, 2)
        img_pil_palette = mmcv.imread(
            self.palette_img_path, 'grayscale', backend='pillow')
        assert img_pil_palette.shape == (300, 400)
        img_pil_palette = mmcv.imread(self.palette_img_path, backend='pillow')
        assert img_pil_palette.shape == (300, 400, 3)
        img_pil_palette = mmcv.imread(
            self.palette_img_path, 'unchanged', backend='pillow')
        assert img_pil_palette.shape == (300, 400)

        # backend pillow
        mmcv.use_backend('pillow')
        img_pil_grayscale1 = mmcv.imread(self.img_path, 'grayscale')
        assert img_pil_grayscale1.shape == (300, 400)
        img_pil_gray_alpha = mmcv.imread(self.gray_alpha_img_path, 'grayscale')
        assert img_pil_gray_alpha.shape == (400, 500)
        mean = img_pil_gray_alpha[300:, 400:].mean()
        assert_allclose(img_pil_gray_alpha[300:, 400:] - mean, 0)
        img_pil_gray_alpha = mmcv.imread(self.gray_alpha_img_path)
        mean = img_pil_gray_alpha[300:, 400:].mean(axis=(0, 1))
        assert_allclose(img_pil_gray_alpha[300:, 400:] - mean, 0)
        assert img_pil_gray_alpha.shape == (400, 500, 3)
        img_pil_gray_alpha = mmcv.imread(self.gray_alpha_img_path, 'unchanged')
        assert img_pil_gray_alpha.shape == (400, 500, 2)
        img_pil_palette = mmcv.imread(self.palette_img_path, 'grayscale')
        assert img_pil_palette.shape == (300, 400)
        img_pil_palette = mmcv.imread(self.palette_img_path)
        assert img_pil_palette.shape == (300, 400, 3)
        img_pil_palette = mmcv.imread(self.palette_img_path, 'unchanged')
        assert img_pil_palette.shape == (300, 400)
        img_pil_grayscale2 = mmcv.imread(self.gray_img_path)
        assert img_pil_grayscale2.shape == (300, 400, 3)
        img_pil_unchanged = mmcv.imread(self.gray_img_path, 'unchanged')
        assert img_pil_unchanged.shape == (300, 400)
        img_pil_unchanged = mmcv.imread(img_pil_unchanged)
        assert_array_equal(img_pil_unchanged, mmcv.imread(img_pil_unchanged))

        img_pil_color_bgr = mmcv.imread(self.img_path_obj)
        assert img_pil_color_bgr.shape == (300, 400, 3)
        img_pil_color_rgb = mmcv.imread(self.img_path_obj, channel_order='rgb')
        assert img_pil_color_rgb.shape == (300, 400, 3)
        # pillow and cv2 only need to agree on more than half of the values
        assert (img_pil_color_rgb == img_cv2_color_rgb).sum() / float(
            img_cv2_color_rgb.size) > 0.5
        assert_array_equal(img_pil_color_rgb[:, :, ::-1], img_pil_color_bgr)
        img_pil_grayscale1 = mmcv.imread(self.img_path_obj, 'grayscale')
        assert img_pil_grayscale1.shape == (300, 400)
        img_pil_grayscale2 = mmcv.imread(self.gray_img_path_obj)
        assert img_pil_grayscale2.shape == (300, 400, 3)
        img_pil_unchanged = mmcv.imread(self.gray_img_path_obj, 'unchanged')
        assert img_pil_unchanged.shape == (300, 400)
        with pytest.raises(TypeError):
            mmcv.imread(1)

        # backend turbojpeg
        mmcv.use_backend('turbojpeg')

        img_turbojpeg_color_bgr = mmcv.imread(self.img_path)
        assert img_turbojpeg_color_bgr.shape == (300, 400, 3)
        assert_array_equal(img_turbojpeg_color_bgr, img_cv2_color_bgr)

        img_turbojpeg_color_rgb = mmcv.imread(
            self.img_path, channel_order='rgb')
        assert img_turbojpeg_color_rgb.shape == (300, 400, 3)
        assert_array_equal(img_turbojpeg_color_rgb, img_cv2_color_rgb)

        with pytest.raises(ValueError):
            mmcv.imread(self.img_path, channel_order='unsupport_order')

        img_turbojpeg_grayscale1 = mmcv.imread(self.img_path, flag='grayscale')
        assert img_turbojpeg_grayscale1.shape == (300, 400)
        assert_array_equal(img_turbojpeg_grayscale1, img_cv2_grayscale1)

        img_turbojpeg_grayscale2 = mmcv.imread(self.gray_img_path)
        assert img_turbojpeg_grayscale2.shape == (300, 400, 3)
        assert_array_equal(img_turbojpeg_grayscale2, img_cv2_grayscale2)

        img_turbojpeg_grayscale2 = mmcv.imread(img_turbojpeg_grayscale2)
        assert_array_equal(img_turbojpeg_grayscale2,
                           mmcv.imread(img_turbojpeg_grayscale2))

        # the 'unchanged' flag is rejected while turbojpeg is active
        with pytest.raises(ValueError):
            mmcv.imread(self.gray_img_path, 'unchanged')

        with pytest.raises(TypeError):
            mmcv.imread(1)

        with pytest.raises(AssertionError):
            mmcv.use_backend('unsupport_backend')

        # the second positional parameter is ``flag``, so this call checks
        # that an unknown flag is rejected
        with pytest.raises(ValueError):
            mmcv.imread(self.img_path, 'unsupported_backend')

        # an unknown backend requested for a single call is rejected too
        with pytest.raises(ValueError):
            mmcv.imread(self.img_path, backend='unsupported_backend')

        # backend tifffile, multi channel tiff file(> 4 channels).
        mmcv.use_backend('tifffile')
        img_tifffile = mmcv.imread(self.tiff_path)
        assert img_tifffile.shape == (200, 150, 5)

        # switch back to cv2 for the remaining checks and for later tests
        mmcv.use_backend('cv2')

        # consistent exif behaviour
        img_cv2_exif = mmcv.imread(self.exif_img_path)
        img_pil_exif = mmcv.imread(self.exif_img_path, backend='pillow')
        assert img_cv2_exif.shape == (400, 300, 3)
        assert img_pil_exif.shape == (400, 300, 3)
        img_cv2_exif_unchanged = mmcv.imread(
            self.exif_img_path, flag='unchanged')
        img_pil_exif_unchanged = mmcv.imread(
            self.exif_img_path, backend='pillow', flag='unchanged')
        assert img_cv2_exif_unchanged.shape == (300, 400, 3)
        assert img_pil_exif_unchanged.shape == (300, 400, 3)
        img_cv2_color_ignore_exif = mmcv.imread(
            self.exif_img_path, flag='color_ignore_orientation')
        img_pil_color_ignore_exif = mmcv.imread(
            self.exif_img_path,
            backend='pillow',
            flag='color_ignore_orientation')
        assert img_cv2_color_ignore_exif.shape == (300, 400, 3)
        assert img_pil_color_ignore_exif.shape == (300, 400, 3)
        img_cv2_grayscale_ignore_exif = mmcv.imread(
            self.exif_img_path, flag='grayscale_ignore_orientation')
        img_pil_grayscale_ignore_exif = mmcv.imread(
            self.exif_img_path,
            backend='pillow',
            flag='grayscale_ignore_orientation')
        assert img_cv2_grayscale_ignore_exif.shape == (300, 400)
        assert img_pil_grayscale_ignore_exif.shape == (300, 400)

    def test_imfrombytes(self):
        """Decode in-memory image bytes with the cv2, pillow and turbojpeg
        backends.

        The cv2 results serve as the reference: turbojpeg output must match
        them exactly, while pillow only has to agree on more than half of the
        values. The global backend is restored to ``'cv2'`` even when an
        assertion fails, so a failure here cannot cascade into other tests.
        """

        def read_bytes(path):
            with open(path, 'rb') as f:
                return f.read()

        # ``bytes`` objects are immutable, so each fixture is read only once
        color_bytes = read_bytes(self.img_path)
        gray_bytes = read_bytes(self.gray_img_path)
        gray_dim3_bytes = read_bytes(self.gray_img_dim3_path)

        try:
            # backend cv2, channel order: bgr
            mmcv.use_backend('cv2')
            img_cv2 = mmcv.imfrombytes(color_bytes)
            assert img_cv2.shape == (300, 400, 3)

            # backend cv2, channel order: rgb
            img_rgb_cv2 = mmcv.imfrombytes(color_bytes, channel_order='rgb')
            assert img_rgb_cv2.shape == (300, 400, 3)
            assert_array_equal(img_rgb_cv2, img_cv2[:, :, ::-1])

            # backend cv2, grayscale, decode as 3 channels
            gray_img_rgb_cv2 = mmcv.imfrombytes(gray_bytes)
            assert gray_img_rgb_cv2.shape == (300, 400, 3)

            # backend cv2, grayscale
            gray_img_cv2 = mmcv.imfrombytes(gray_bytes, flag='grayscale')
            assert gray_img_cv2.shape == (300, 400)

            # backend cv2, grayscale dim3
            gray_img_dim3_cv2 = mmcv.imfrombytes(
                gray_dim3_bytes, flag='grayscale')
            assert gray_img_dim3_cv2.shape == (300, 400)

            # arg backend pillow, channel order: bgr
            img_pillow = mmcv.imfrombytes(color_bytes, backend='pillow')
            assert img_pillow.shape == (300, 400, 3)
            # Pillow and opencv decoding may not be the same
            assert (img_cv2 == img_pillow).sum() / float(img_cv2.size) > 0.5

            # backend pillow, channel order: bgr
            mmcv.use_backend('pillow')
            img_pillow = mmcv.imfrombytes(color_bytes)
            assert img_pillow.shape == (300, 400, 3)
            # Pillow and opencv decoding may not be the same
            assert (img_cv2 == img_pillow).sum() / float(img_cv2.size) > 0.5

            # backend turbojpeg, channel order: bgr
            mmcv.use_backend('turbojpeg')
            img_turbojpeg = mmcv.imfrombytes(color_bytes)
            assert img_turbojpeg.shape == (300, 400, 3)
            assert_array_equal(img_cv2, img_turbojpeg)

            # backend turbojpeg, channel order: rgb
            img_rgb_turbojpeg = mmcv.imfrombytes(
                color_bytes, channel_order='rgb')
            assert img_rgb_turbojpeg.shape == (300, 400, 3)
            assert_array_equal(img_rgb_turbojpeg, img_cv2[:, :, ::-1])

            # backend turbojpeg, grayscale, decode as 3 channels
            gray_img_turbojpeg = mmcv.imfrombytes(gray_bytes)
            assert gray_img_turbojpeg.shape == (300, 400, 3)
            assert_array_equal(gray_img_rgb_cv2, gray_img_turbojpeg)

            # backend turbojpeg, grayscale
            gray_img_turbojpeg = mmcv.imfrombytes(
                gray_bytes, flag='grayscale')
            assert gray_img_turbojpeg.shape == (300, 400)
            assert_array_equal(gray_img_cv2, gray_img_turbojpeg)

            # backend turbojpeg, grayscale dim3
            gray_img_dim3_turbojpeg = mmcv.imfrombytes(
                gray_dim3_bytes, flag='grayscale')
            assert gray_img_dim3_turbojpeg.shape == (300, 400)
            assert_array_equal(gray_img_dim3_cv2, gray_img_dim3_turbojpeg)
        finally:
            # never leave pillow/turbojpeg selected for the tests that follow
            mmcv.use_backend('cv2')

        # only the decoding call belongs inside the raises block
        with pytest.raises(ValueError):
            mmcv.imfrombytes(color_bytes, backend='unsupported_backend')

    def test_imwrite(self):
        """Write an image to disk and to a mocked Petrel backend.

        The on-disk round trip happens inside a private temporary directory
        that is removed even when a step fails, so no stray file is left in
        the shared temp dir and parallel runs cannot collide on the name.
        """
        img = mmcv.imread(self.img_path)

        with tempfile.TemporaryDirectory() as tmp_dir:
            out_file = osp.join(tmp_dir, 'mmcv_test.jpg')

            # file_client_args and backend_args can not be both set
            with pytest.raises(
                    ValueError,
                    match='"file_client_args" and "backend_args" cannot be '
                    'set'):
                mmcv.imwrite(
                    img,
                    out_file,
                    file_client_args={'backend': 'disk'},
                    backend_args={'backend': 'disk'})

            mmcv.imwrite(img, out_file)
            rewrite_img = mmcv.imread(out_file)
        # JPEG is lossy, hence the tolerant comparison
        self.assert_img_equal(img, rewrite_img)

        # test petrel client
        with patch.object(
                PetrelBackend, 'put', return_value=None) as mock_method:
            ret = mmcv.imwrite(img, self.s3_path)
            ret_with_args = mmcv.imwrite(
                img, self.s3_path, file_client_args={'backend': 'petrel'})
            assert ret
            assert ret_with_args
            mock_method.assert_called()

            mock_method.reset_mock()

            ret_with_args = mmcv.imwrite(
                img, self.s3_path, backend_args={'backend': 'petrel'})
            assert ret_with_args
            mock_method.assert_called()

        # an unknown file extension cannot be encoded
        with pytest.raises(cv2.error):
            mmcv.imwrite(img, 'error_file.jppg')

    @patch('mmcv.image.io.TurboJPEG', None)
    def test_no_turbojpeg(self):
        """Selecting turbojpeg must raise ``ImportError`` when ``TurboJPEG``
        is patched to ``None``.

        The cv2 backend is re-selected in ``finally`` so that it is restored
        even if the expected error is not raised.
        """
        try:
            with pytest.raises(ImportError):
                mmcv.use_backend('turbojpeg')
        finally:
            # presumably use_backend may record the requested backend before
            # failing, so always switch back explicitly
            mmcv.use_backend('cv2')

    @patch('mmcv.image.io.Image', None)
    def test_no_pillow(self):
        """Selecting pillow must raise ``ImportError`` when ``Image`` is
        patched to ``None``.

        The cv2 backend is re-selected in ``finally`` so that it is restored
        even if the expected error is not raised.
        """
        try:
            with pytest.raises(ImportError):
                mmcv.use_backend('pillow')
        finally:
            # presumably use_backend may record the requested backend before
            # failing, so always switch back explicitly
            mmcv.use_backend('cv2')


================================================
FILE: tests/test_image/test_photometric.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp

import cv2
import numpy as np
import pytest
from numpy.testing import assert_array_equal

import mmcv


class TestPhotometric:

    @classmethod
    def setup_class(cls):
        """Load the shared test image and the per-channel mean/std values
        used by the normalization tests."""
        # the test img resolution is 400x300
        cls.img_path = osp.join(osp.dirname(__file__), '../data/color.jpg')
        cls.img = cv2.imread(cls.img_path)
        # cv2.imread returns None instead of raising when the file cannot be
        # read; fail here with a clear message rather than with an obscure
        # error inside every test that touches ``cls.img``.
        assert cls.img is not None, 'failed to read ' + cls.img_path
        cls.mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)
        cls.std = np.array([58.395, 57.12, 57.375], dtype=np.float32)

    def test_imnormalize(self):
        """``imnormalize`` must match the manual formula and return a new
        object instead of its input."""
        rgb_img = self.img[:, :, ::-1]
        expected = (rgb_img - self.mean) / self.std

        # first the stored image with default arguments, then the
        # channel-reversed view with ``to_rgb`` switched off
        cases = ((self.img, {}), (rgb_img, {'to_rgb': False}))
        for source, extra_kwargs in cases:
            result = mmcv.imnormalize(source, self.mean, self.std,
                                      **extra_kwargs)
            assert np.allclose(result, expected)
            assert result is not source

    def test_imnormalize_(self):
        """``imnormalize_`` must write into its input and return that very
        object."""
        bgr_buf = np.float32(self.img)
        rgb_buf = np.float32(self.img[:, :, ::-1])
        expected = (rgb_buf - self.mean) / self.std

        # default arguments on the float copy of the stored image
        result = mmcv.imnormalize_(bgr_buf, self.mean, self.std)
        assert np.allclose(bgr_buf, expected)
        assert result is bgr_buf

        # channel-reversed copy with ``to_rgb`` switched off
        result = mmcv.imnormalize_(
            rgb_buf, self.mean, self.std, to_rgb=False)
        assert np.allclose(result, expected)
        assert result is rgb_buf

    def test_imdenormalize(self):
        """``imdenormalize`` must undo the normalization, with and without
        the final channel flip."""
        normalized = (self.img[:, :, ::-1] - self.mean) / self.std
        expected_rgb = normalized * self.std + self.mean
        expected_bgr = expected_rgb[:, :, ::-1]

        # default arguments: compared with the channel-flipped baseline
        restored = mmcv.imdenormalize(normalized, self.mean, self.std)
        assert np.allclose(restored, expected_bgr)

        # to_bgr=False: compared with the unflipped baseline
        restored = mmcv.imdenormalize(
            normalized, self.mean, self.std, to_bgr=False)
        assert np.allclose(restored, expected_rgb)

    def test_iminvert(self):
        """``iminvert`` maps every uint8 value ``v`` to ``255 - v``."""
        source = np.array(
            [[0, 128, 255], [1, 127, 254], [2, 129, 253]], dtype=np.uint8)
        expected = np.array(
            [[255, 127, 0], [254, 128, 1], [253, 126, 2]], dtype=np.uint8)
        inverted = mmcv.iminvert(source)
        assert_array_equal(inverted, expected)

    def test_solarize(self):
        """``solarize`` with its default threshold and with threshold 100."""
        source = np.array(
            [[0, 128, 255], [1, 127, 254], [2, 129, 253]], dtype=np.uint8)

        # default threshold: 127 is kept while 128 and above are inverted
        expected_default = np.array(
            [[0, 127, 0], [1, 127, 1], [2, 126, 2]], dtype=np.uint8)
        assert_array_equal(mmcv.solarize(source), expected_default)

        # threshold 100: 127 now gets inverted to 128 as well
        expected_thr100 = np.array(
            [[0, 127, 0], [1, 128, 1], [2, 126, 2]], dtype=np.uint8)
        assert_array_equal(mmcv.solarize(source, 100), expected_thr100)

    def test_posterize(self):
        """``posterize`` keeps only the requested number of high bits."""
        source = np.array(
            [[0, 128, 255], [1, 127, 254], [2, 129, 253]], dtype=np.uint8)

        # 1 bit: every value collapses to 0 or 128
        expected_1bit = np.array(
            [[0, 128, 128], [0, 0, 128], [0, 128, 128]], dtype=np.uint8)
        assert_array_equal(mmcv.posterize(source, 1), expected_1bit)

        # 3 bits: values are rounded down to multiples of 32
        expected_3bit = np.array(
            [[0, 128, 224], [0, 96, 224], [0, 128, 224]], dtype=np.uint8)
        assert_array_equal(mmcv.posterize(source, 3), expected_3bit)

    def test_adjust_color(self, nb_rand_test=100):
        """Check ``mmcv.adjust_color`` against a manual blend of the image
        with its grayscale version, and against the pillow backend.

        Args:
            nb_rand_test (int): Number of randomly sampled images used for
                the cv2-vs-pillow comparison.
        """
        img = np.array([[0, 128, 255], [1, 127, 254], [2, 129, 253]],
                       dtype=np.uint8)
        img = np.stack([img, img, img], axis=-1)
        # default arguments must leave the image unchanged
        assert_array_equal(mmcv.adjust_color(img), img)
        img_gray = mmcv.bgr2gray(img)
        img_r = np.stack([img_gray, img_gray, img_gray], axis=-1)
        # alpha=0 yields the 3-channel grayscale image
        assert_array_equal(mmcv.adjust_color(img, 0), img_r)
        assert_array_equal(mmcv.adjust_color(img, 0, 1), img_r)
        # remaining cases: expected result is
        # round(clip(img * alpha + img_r * beta + gamma, 0, 255))
        assert_array_equal(
            mmcv.adjust_color(img, 0.5, 0.5),
            np.round(np.clip((img * 0.5 + img_r * 0.5), 0,
                             255)).astype(img.dtype))
        assert_array_equal(
            mmcv.adjust_color(img, 1, 1.5),
            np.round(np.clip(img * 1 + img_r * 1.5, 0, 255)).astype(img.dtype))
        assert_array_equal(
            mmcv.adjust_color(img, 0.8, -0.6, gamma=2),
            np.round(np.clip(img * 0.8 - 0.6 * img_r + 2, 0,
                             255)).astype(img.dtype))
        assert_array_equal(
            mmcv.adjust_color(img, 0.8, -0.6, gamma=-0.6),
            np.round(np.clip(img * 0.8 - 0.6 * img_r - 0.6, 0,
                             255)).astype(img.dtype))

        # test float type of image
        img = img.astype(np.float32)
        assert_array_equal(
            np.round(mmcv.adjust_color(img, 0.8, -0.6, gamma=-0.6)),
            np.round(np.clip(img * 0.8 - 0.6 * img_r - 0.6, 0, 255)))

        # compare the cv2 and pillow backends on randomly sampled images;
        # they are allowed to differ by at most 2 per pixel.
        for _ in range(nb_rand_test):
            img = np.clip(np.random.normal(0, 1, (256, 256, 3)) * 260, 0,
                          255).astype(np.uint8)
            factor = np.random.uniform()
            cv2_img = mmcv.adjust_color(img, alpha=factor)
            pil_img = mmcv.adjust_color(img, alpha=factor, backend='pillow')
            np.testing.assert_allclose(cv2_img, pil_img, rtol=0, atol=2)

        # the input type must be uint8 for pillow backend
        with pytest.raises(AssertionError):
            mmcv.adjust_color(img.astype(np.float32), backend='pillow')

        # backend must be 'cv2' or 'pillow'
        with pytest.raises(ValueError):
            mmcv.adjust_color(img.astype(np.uint8), backend='not support')

    def test_imequalize(self, nb_rand_test=100):
        """``imequalize`` must reproduce ``PIL.ImageOps.equalize`` exactly.

        Args:
            nb_rand_test (int): Number of randomly sampled images to check.
        """

        def _pil_equalize(array):
            # reference result computed with PIL.ImageOps.equalize
            from PIL import Image, ImageOps
            return np.asarray(ImageOps.equalize(Image.fromarray(array)))

        plane = np.array(
            [[0, 128, 255], [1, 127, 254], [2, 129, 253]], dtype=np.uint8)
        img = np.stack([plane] * 3, axis=-1)
        assert_array_equal(mmcv.imequalize(img), _pil_equalize(img))

        # test equalize with case step=0: the image must come back unchanged
        plane = np.array(
            [[0, 0, 0], [120, 120, 120], [255, 255, 255]], dtype=np.uint8)
        img = np.stack([plane] * 3, axis=-1)
        assert_array_equal(mmcv.imequalize(img), img)

        # test equalize with randomly sampled images
        for _ in range(nb_rand_test):
            noise = np.random.normal(0, 1, (256, 256, 3)) * 260
            img = np.clip(noise, 0, 255).astype(np.uint8)
            assert_array_equal(mmcv.imequalize(img), _pil_equalize(img))

    def test_adjust_brightness(self, nb_rand_test=100):
        """Check fixed factors and compare the cv2 and pillow backends of
        ``adjust_brightness``.

        Args:
            nb_rand_test (int): Number of randomly sampled images to check.
        """
        plane = np.array(
            [[0, 128, 255], [1, 127, 254], [2, 129, 253]], dtype=np.uint8)
        img = np.stack([plane] * 3, axis=-1)

        # factor 1.0 leaves the image untouched
        assert_array_equal(mmcv.adjust_brightness(img, 1.), img)
        # factor 0.0 produces an all-zero image
        assert_array_equal(mmcv.adjust_brightness(img, 0.), np.zeros_like(img))

        # randomly sampled images and factors: both backends may differ by
        # at most 1 per pixel
        for _ in range(nb_rand_test):
            noise = np.random.uniform(0, 1, (1000, 1200, 3)) * 260
            img = np.clip(noise, 0, 255).astype(np.uint8)
            factor = np.random.uniform() + np.random.choice([0, 1])
            cv2_out = mmcv.adjust_brightness(img, factor)
            pil_out = mmcv.adjust_brightness(img, factor, backend='pillow')
            np.testing.assert_allclose(
                cv2_out.astype(np.int32),
                pil_out.astype(np.int32),
                rtol=0,
                atol=1)

        # the input type must be uint8 for pillow backend
        float_img = img.astype(np.float32)
        with pytest.raises(AssertionError):
            mmcv.adjust_brightness(float_img, backend='pillow')

        # backend must be 'cv2' or 'pillow'
        uint8_img = img.astype(np.uint8)
        with pytest.raises(ValueError):
            mmcv.adjust_brightness(uint8_img, backend='not support')

    def test_adjust_contrast(self, nb_rand_test=100):
        """Check fixed factors and compare the cv2 and pillow backends of
        ``adjust_contrast``.

        Args:
            nb_rand_test (int): Number of randomly sampled images to check.
        """
        plane = np.array(
            [[0, 128, 255], [1, 127, 254], [2, 129, 253]], dtype=np.uint8)
        img = np.stack([plane] * 3, axis=-1)

        # factor 1.0 leaves the image untouched
        assert_array_equal(mmcv.adjust_contrast(img, 1.), img)
        # with factor 0.0 both backends must agree exactly
        cv2_out = mmcv.adjust_contrast(img, 0.)
        pil_out = mmcv.adjust_contrast(img, 0., backend='pillow')
        assert_array_equal(cv2_out, pil_out)

        # Randomly sampled images and factors. The gap (at most 1) between
        # PIL.ImageEnhance.Contrast and mmcv.adjust_contrast comes from the
        # different color-to-gray conversions used by mmcv and PIL.
        for _ in range(nb_rand_test):
            noise = np.random.uniform(0, 1, (1200, 1000, 3)) * 260
            img = np.clip(noise, 0, 255).astype(np.uint8)
            factor = np.random.uniform() + np.random.choice([0, 1])
            cv2_out = mmcv.adjust_contrast(img, factor)
            pil_out = mmcv.adjust_contrast(img, factor, backend='pillow')
            np.testing.assert_allclose(
                cv2_out.astype(np.int32),
                pil_out.astype(np.int32),
                rtol=0,
                atol=1)

        # the input type must be uint8 for pillow backend
        float_img = img.astype(np.float32)
        with pytest.raises(AssertionError):
            mmcv.adjust_contrast(float_img, backend='pillow')

        # backend must be 'cv2' or 'pillow'
        uint8_img = img.astype(np.uint8)
        with pytest.raises(ValueError):
            mmcv.adjust_contrast(uint8_img, backend='not support')

    def test_auto_contrast(self, nb_rand_test=100):
        """``auto_contrast`` must reproduce ``PIL.ImageOps.autocontrast``.

        Args:
            nb_rand_test (int): Number of randomly sampled images to check.
        """

        def _pil_autocontrast(bgr, cutoff=0):
            from PIL import Image
            from PIL.ImageOps import autocontrast

            # Image.fromarray interprets the data as RGB rather than BGR, so
            # flip the channels on the way in and back again on the way out.
            rgb = Image.fromarray(bgr[..., ::-1], mode='RGB')
            return np.asarray(autocontrast(rgb, cutoff))[..., ::-1]

        plane = np.array(
            [[0, 128, 255], [1, 127, 254], [2, 129, 253]], dtype=np.uint8)
        img = np.stack([plane] * 3, axis=-1)

        # test case without cut-off
        assert_array_equal(mmcv.auto_contrast(img), _pil_autocontrast(img))

        # (cutoff given to mmcv, cutoff given to PIL) pairs covering an int,
        # a float, a tuple and a cut-off whose doubled value exceeds 100
        cutoff_pairs = ((10, 10), (12.5, 12.5), ((10, 10), 10), (60, 60))
        for mmcv_cutoff, pil_cutoff in cutoff_pairs:
            assert_array_equal(
                mmcv.auto_contrast(img, mmcv_cutoff),
                _pil_autocontrast(img, pil_cutoff))

        # test auto_contrast with randomly sampled images and cut-offs.
        for _ in range(nb_rand_test):
            noise = np.random.uniform(0, 1, (1200, 1000, 3)) * 260
            img = np.clip(noise, 0, 255).astype(np.uint8)
            # cut-offs are not set as tuple since in `build.yml`, pillow 6.2.2
            # is installed, which does not support setting the low and high
            # cut-off differently.
            # With pillow above 8.0.0, cutoff can be set as tuple
            cutoff = np.random.rand() * 100
            assert_array_equal(
                mmcv.auto_contrast(img, cutoff),
                _pil_autocontrast(img, cutoff))

    def test_adjust_sharpness(self, nb_rand_test=100):
        """Check kernel validation and compare ``adjust_sharpness`` with
        ``PIL.ImageEnhance.Sharpness``.

        Args:
            nb_rand_test (int): Number of randomly sampled images to check.
        """

        def _pil_sharpness(array, factor):
            # reference result computed with PIL.ImageEnhance.Sharpness
            from PIL import Image
            from PIL.ImageEnhance import Sharpness
            enhancer = Sharpness(Image.fromarray(array))
            return np.asarray(enhancer.enhance(factor))

        plane = np.array(
            [[0, 128, 255], [1, 127, 254], [2, 129, 253]], dtype=np.uint8)
        img = np.stack([plane] * 3, axis=-1)

        # test case with invalid type of kernel
        with pytest.raises(AssertionError):
            mmcv.adjust_sharpness(img, 1., kernel=1.)
        # test case with invalid shape of kernel
        bad_kernel = np.ones((3, 3, 3))
        with pytest.raises(AssertionError):
            mmcv.adjust_sharpness(img, 1., kernel=bad_kernel)
        # test case with all-zero kernel, factor 0.0
        zero_kernel = np.zeros((3, 3))
        blurred = mmcv.adjust_sharpness(img, 0., kernel=zero_kernel)
        assert_array_equal(blurred, np.zeros_like(img))

        # test case with factor 1.0
        assert_array_equal(mmcv.adjust_sharpness(img, 1.), img)

        # Randomly sampled images and factors. The gap between
        # PIL.ImageEnhance.Sharpness and mmcv.adjust_sharpness mainly comes
        # from the different ways of handling image edges when applying
        # filters, so only the interior (one-pixel border dropped) is compared.
        interior = (slice(1, -1), slice(1, -1))
        for _ in range(nb_rand_test):
            noise = np.random.uniform(0, 1, (1000, 1200, 3)) * 260
            img = np.clip(noise, 0, 255).astype(np.uint8)
            factor = np.random.uniform()
            mmcv_out = mmcv.adjust_sharpness(img, factor).astype(np.int32)
            pil_out = _pil_sharpness(img, factor).astype(np.int32)
            np.testing.assert_allclose(
                mmcv_out[interior], pil_out[interior], rtol=0, atol=1)

    def test_adjust_lighting(self):
        """Check argument validation of ``adjust_lighting`` and that
        ``alphastd=0`` leaves the image unchanged."""
        plane = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).astype(np.uint8)
        img = np.stack([plane] * 3, axis=-1)

        # Each (eigval, eigvec) pair must be rejected: the first two are not
        # both np.ndarray, the last two do not have matching
        # eigval / eigvec sizes.
        invalid_pairs = (
            (1, np.ones((3, 1))),
            (np.array([1]), (1, 1, 1)),
            (np.array([1]), np.eye(2)),
            (np.array([1]), np.array([1])),
        )
        for eigval, eigvec in invalid_pairs:
            with pytest.raises(AssertionError):
                mmcv.adjust_lighting(img, eigval, eigvec)

        # with alphastd=0 even random eigen data must not alter the image
        eigval = np.random.normal(0, 1, 2)
        eigvec = np.random.normal(0, 1, (3, 2))
        img_adjusted = mmcv.adjust_lighting(img, eigval, eigvec, alphastd=0.)
        assert_array_equal(img_adjusted, img)

    def test_lut_transform(self):
        """Check argument validation of ``lut_transform`` and compare its
        output with ``cv2.LUT``."""
        lut_table = np.arange(256)

        # test assertion image values should between 0 and 255.
        for bad_value in (256, -1):
            with pytest.raises(AssertionError):
                mmcv.lut_transform(np.array([bad_value]), lut_table)

        # test assertion lut_table should be ndarray with shape (256, )
        with pytest.raises(AssertionError):
            mmcv.lut_transform(np.array([0]), list(range(256)))
        with pytest.raises(AssertionError):
            mmcv.lut_transform(np.array([1]), np.arange(257))

        # the stored image is handed to cv2.LUT as it is
        result = mmcv.lut_transform(self.img, lut_table)
        assert np.allclose(result, cv2.LUT(self.img, lut_table))

        # float and 5-D integer inputs are cast to uint8 for the reference
        float_img = np.array(
            [[[0, 128, 255], [255, 128, 0]], [[0, 128, 255], [255, 128, 0]]],
            dtype=float)
        nd_img = np.random.randint(0, 256, size=(7, 8, 9, 10, 11))
        for input_img in (float_img, nd_img):
            result = mmcv.lut_transform(input_img, lut_table)
            reference = cv2.LUT(
                np.array(input_img, dtype=np.uint8), lut_table)
            assert np.allclose(result, reference)

    def test_clahe(self):
        """Check argument validation of ``mmcv.clahe`` and compare its output
        with OpenCV's CLAHE applied directly, channel by channel."""

        def _clahe(img, clip_limit=40.0, tile_grid_size=(8, 8)):
            # reference implementation calling OpenCV directly
            clahe = cv2.createCLAHE(clip_limit, tile_grid_size)
            return clahe.apply(np.array(img, dtype=np.uint8))

        # test assertion image should have the right shape
        with pytest.raises(AssertionError):
            mmcv.clahe(self.img)

        # test assertion tile_grid_size should be a tuple with 2 integers
        for bad_grid in ((8.0, 8.0), (8, 8, 8), [8, 8]):
            with pytest.raises(AssertionError):
                mmcv.clahe(self.img[:, :, 0], tile_grid_size=bad_grid)

        # test every channel with the default clip limit and clip_limit=1.2
        for extra_args in ((), (1.2, )):
            for i in range(self.img.shape[-1]):
                channel = self.img[:, :, i]
                img = mmcv.clahe(channel, *extra_args)
                img_std = _clahe(channel, *extra_args)
                assert np.allclose(img, img_std)
                # Slicing creates a new view object every time, so comparing
                # ``id`` against ``self.img[:, :, i]`` could never fail.
                # Check the underlying buffers instead to make sure the
                # results are real copies of the source image.
                assert not np.shares_memory(img, self.img)
                assert not np.shares_memory(img_std, self.img)

    def test_adjust_hue(self):
        """Check argument validation of ``adjust_hue`` and compare its cv2
        and pillow backends."""
        from PIL import Image

        # test case with img is not ndarray
        pil_img = Image.fromarray(self.img)
        with pytest.raises(TypeError):
            mmcv.adjust_hue(pil_img, hue_factor=0.0)

        # test case with hue_factor > 0.5 or hue_factor < -0.5
        for bad_factor in (-0.6, 0.6):
            with pytest.raises(ValueError):
                mmcv.adjust_hue(self.img, hue_factor=bad_factor)

        # both backends have to agree within a tolerance of 10
        for factor in np.arange(-0.5, 0.5, 0.2):
            pil_res = np.array(
                mmcv.adjust_hue(self.img, hue_factor=factor, backend='pillow'))
            cv2_res = mmcv.adjust_hue(self.img, hue_factor=factor)
            assert np.allclose(pil_res, cv2_res, atol=10.0)

        # the pillow backend rejects float32 input
        float_img = self.img.astype(np.float32)
        with pytest.raises(AssertionError):
            mmcv.adjust_hue(float_img, hue_factor=0, backend='pillow')

        # backend must be 'cv2' or 'pillow'
        uint8_img = self.img.astype(np.uint8)
        with pytest.raises(ValueError):
            mmcv.adjust_hue(uint8_img, hue_factor=0, backend='not support')


================================================
FILE: tests/test_ops/test_active_rotated_filter.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch

from mmcv.ops import active_rotated_filter
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE

# Input features fed to ``active_rotated_filter`` in the test below. The
# literal has shape (8, 1, 1, 3, 3): eight 3x3 blocks, each wrapped in two
# singleton axes.
# NOTE(review): presumably laid out as (output planes, input planes,
# orientations, H, W) -- confirm against the op's documentation.
np_feature = np.array([[[[[-1.4934e-01, 1.1341e+00, -1.6241e-01],
                          [-1.0986e+00, -1.1463e+00, -1.3176e+00],
                          [1.4808e+00, 7.6572e-01, -1.4548e+00]]]],
                       [[[[1.9370e+00, 6.2799e-01, 2.5834e-02],
                          [-1.4242e+00, 7.6566e-01, 1.0015e+00],
                          [9.8669e-01, 4.1356e-01, 6.1068e-01]]]],
                       [[[[1.4565e+00, 1.4960e+00, 2.4339e-01],
                          [-2.2484e-01, 7.5942e-01, -8.1184e-01],
                          [-1.7077e+00, 1.0658e+00, 3.8311e-01]]]],
                       [[[[8.4734e-01, 1.0904e+00, 2.4356e+00],
                          [9.5822e-01, 2.2260e-01, -2.4450e-01],
                          [-1.5078e+00, 7.0902e-02, -1.5921e+00]]]],
                       [[[[2.1173e+00, -7.3524e-01, 1.8888e+00],
                          [1.0169e+00, 4.7033e-01, -1.0875e+00],
                          [-1.0736e+00, -5.2245e-01, -2.8733e-01]]]],
                       [[[[-5.6433e-01, 1.5835e+00, -1.5826e+00],
                          [-8.8974e-01, -4.3128e-01, -2.2423e-01],
                          [1.6552e-03, -1.7292e+00, 2.6639e-01]]]],
                       [[[[-1.2951e-01, 1.3493e+00, -1.9329e+00],
                          [5.6248e-01, -5.1189e-01, 1.3614e+00],
                          [3.3680e-01, -8.7148e-01, 5.0592e-01]]]],
                       [[[[1.6781e-02, -8.3929e-01, 1.2060e+00],
                          [-1.0764e+00, 4.7821e-01, 1.5342e+00],
                          [-4.4542e-01, -1.8606e+00, 3.0827e-01]]]]])

# Index table passed to ``active_rotated_filter``; the literal has shape
# (1, 3, 3, 8) and holds values in 1..9.
# NOTE(review): presumably, for each cell of the 3x3 window, the 1-based
# target position in the flattened window under each of 8 rotations --
# confirm against the op's documentation.
np_indices = np.array([[[[1, 2, 3, 6, 9, 8, 7, 4], [2, 3, 6, 9, 8, 7, 4, 1],
                         [3, 6, 9, 8, 7, 4, 1, 2]],
                        [[4, 1, 2, 3, 6, 9, 8, 7], [5, 5, 5, 5, 5, 5, 5, 5],
                         [6, 9, 8, 7, 4, 1, 2, 3]],
                        [[7, 4, 1, 2, 3, 6, 9, 8], [8, 7, 4, 1, 2, 3, 6, 9],
                         [9, 8, 7, 4, 1, 2, 3, 6]]]])

# Golden forward result compared against the op's output in the test below.
# The literal has shape (64, 1, 3, 3): for every 3x3 block of ``np_feature``
# there are 8 consecutive entries, each a rearrangement of that block's
# values (the first of the 8 equals the block itself).
expected_output = np.array([[[[-1.4934e-01, 1.1341e+00, -1.6241e-01],
                              [-1.0986e+00, -1.1463e+00, -1.3176e+00],
                              [1.4808e+00, 7.6572e-01, -1.4548e+00]]],
                            [[[-1.0986e+00, -1.4934e-01, 1.1341e+00],
                              [1.4808e+00, -1.1463e+00, -1.6241e-01],
                              [7.6572e-01, -1.4548e+00, -1.3176e+00]]],
                            [[[1.4808e+00, -1.0986e+00, -1.4934e-01],
                              [7.6572e-01, -1.1463e+00, 1.1341e+00],
                              [-1.4548e+00, -1.3176e+00, -1.6241e-01]]],
                            [[[7.6572e-01, 1.4808e+00, -1.0986e+00],
                              [-1.4548e+00, -1.1463e+00, -1.4934e-01],
                              [-1.3176e+00, -1.6241e-01, 1.1341e+00]]],
                            [[[-1.4548e+00, 7.6572e-01, 1.4808e+00],
                              [-1.3176e+00, -1.1463e+00, -1.0986e+00],
                              [-1.6241e-01, 1.1341e+00, -1.4934e-01]]],
                            [[[-1.3176e+00, -1.4548e+00, 7.6572e-01],
                              [-1.6241e-01, -1.1463e+00, 1.4808e+00],
                              [1.1341e+00, -1.4934e-01, -1.0986e+00]]],
                            [[[-1.6241e-01, -1.3176e+00, -1.4548e+00],
                              [1.1341e+00, -1.1463e+00, 7.6572e-01],
                              [-1.4934e-01, -1.0986e+00, 1.4808e+00]]],
                            [[[1.1341e+00, -1.6241e-01, -1.3176e+00],
                              [-1.4934e-01, -1.1463e+00, -1.4548e+00],
                              [-1.0986e+00, 1.4808e+00, 7.6572e-01]]],
                            [[[1.9370e+00, 6.2799e-01, 2.5834e-02],
                              [-1.4242e+00, 7.6566e-01, 1.0015e+00],
                              [9.8669e-01, 4.1356e-01, 6.1068e-01]]],
                            [[[-1.4242e+00, 1.9370e+00, 6.2799e-01],
                              [9.8669e-01, 7.6566e-01, 2.5834e-02],
                              [4.1356e-01, 6.1068e-01, 1.0015e+00]]],
                            [[[9.8669e-01, -1.4242e+00, 1.9370e+00],
                              [4.1356e-01, 7.6566e-01, 6.2799e-01],
                              [6.1068e-01, 1.0015e+00, 2.5834e-02]]],
                            [[[4.1356e-01, 9.8669e-01, -1.4242e+00],
                              [6.1068e-01, 7.6566e-01, 1.9370e+00],
                              [1.0015e+00, 2.5834e-02, 6.2799e-01]]],
                            [[[6.1068e-01, 4.1356e-01, 9.8669e-01],
                              [1.0015e+00, 7.6566e-01, -1.4242e+00],
                              [2.5834e-02, 6.2799e-01, 1.9370e+00]]],
                            [[[1.0015e+00, 6.1068e-01, 4.1356e-01],
                              [2.5834e-02, 7.6566e-01, 9.8669e-01],
                              [6.2799e-01, 1.9370e+00, -1.4242e+00]]],
                            [[[2.5834e-02, 1.0015e+00, 6.1068e-01],
                              [6.2799e-01, 7.6566e-01, 4.1356e-01],
                              [1.9370e+00, -1.4242e+00, 9.8669e-01]]],
                            [[[6.2799e-01, 2.5834e-02, 1.0015e+00],
                              [1.9370e+00, 7.6566e-01, 6.1068e-01],
                              [-1.4242e+00, 9.8669e-01, 4.1356e-01]]],
                            [[[1.4565e+00, 1.4960e+00, 2.4339e-01],
                              [-2.2484e-01, 7.5942e-01, -8.1184e-01],
                              [-1.7077e+00, 1.0658e+00, 3.8311e-01]]],
                            [[[-2.2484e-01, 1.4565e+00, 1.4960e+00],
                              [-1.7077e+00, 7.5942e-01, 2.4339e-01],
                              [1.0658e+00, 3.8311e-01, -8.1184e-01]]],
                            [[[-1.7077e+00, -2.2484e-01, 1.4565e+00],
                              [1.0658e+00, 7.5942e-01, 1.4960e+00],
                              [3.8311e-01, -8.1184e-01, 2.4339e-01]]],
                            [[[1.0658e+00, -1.7077e+00, -2.2484e-01],
                              [3.8311e-01, 7.5942e-01, 1.4565e+00],
                              [-8.1184e-01, 2.4339e-01, 1.4960e+00]]],
                            [[[3.8311e-01, 1.0658e+00, -1.7077e+00],
                              [-8.1184e-01, 7.5942e-01, -2.2484e-01],
                              [2.4339e-01, 1.4960e+00, 1.4565e+00]]],
                            [[[-8.1184e-01, 3.8311e-01, 1.0658e+00],
                              [2.4339e-01, 7.5942e-01, -1.7077e+00],
                              [1.4960e+00, 1.4565e+00, -2.2484e-01]]],
                            [[[2.4339e-01, -8.1184e-01, 3.8311e-01],
                              [1.4960e+00, 7.5942e-01, 1.0658e+00],
                              [1.4565e+00, -2.2484e-01, -1.7077e+00]]],
                            [[[1.4960e+00, 2.4339e-01, -8.1184e-01],
                              [1.4565e+00, 7.5942e-01, 3.8311e-01],
                              [-2.2484e-01, -1.7077e+00, 1.0658e+00]]],
                            [[[8.4734e-01, 1.0904e+00, 2.4356e+00],
                              [9.5822e-01, 2.2260e-01, -2.4450e-01],
                              [-1.5078e+00, 7.0902e-02, -1.5921e+00]]],
                            [[[9.5822e-01, 8.4734e-01, 1.0904e+00],
                              [-1.5078e+00, 2.2260e-01, 2.4356e+00],
                              [7.0902e-02, -1.5921e+00, -2.4450e-01]]],
                            [[[-1.5078e+00, 9.5822e-01, 8.4734e-01],
                              [7.0902e-02, 2.2260e-01, 1.0904e+00],
                              [-1.5921e+00, -2.4450e-01, 2.4356e+00]]],
                            [[[7.0902e-02, -1.5078e+00, 9.5822e-01],
                              [-1.5921e+00, 2.2260e-01, 8.4734e-01],
                              [-2.4450e-01, 2.4356e+00, 1.0904e+00]]],
                            [[[-1.5921e+00, 7.0902e-02, -1.5078e+00],
                              [-2.4450e-01, 2.2260e-01, 9.5822e-01],
                              [2.4356e+00, 1.0904e+00, 8.4734e-01]]],
                            [[[-2.4450e-01, -1.5921e+00, 7.0902e-02],
                              [2.4356e+00, 2.2260e-01, -1.5078e+00],
                              [1.0904e+00, 8.4734e-01, 9.5822e-01]]],
                            [[[2.4356e+00, -2.4450e-01, -1.5921e+00],
                              [1.0904e+00, 2.2260e-01, 7.0902e-02],
                              [8.4734e-01, 9.5822e-01, -1.5078e+00]]],
                            [[[1.0904e+00, 2.4356e+00, -2.4450e-01],
                              [8.4734e-01, 2.2260e-01, -1.5921e+00],
                              [9.5822e-01, -1.5078e+00, 7.0902e-02]]],
                            [[[2.1173e+00, -7.3524e-01, 1.8888e+00],
                              [1.0169e+00, 4.7033e-01, -1.0875e+00],
                              [-1.0736e+00, -5.2245e-01, -2.8733e-01]]],
                            [[[1.0169e+00, 2.1173e+00, -7.3524e-01],
                              [-1.0736e+00, 4.7033e-01, 1.8888e+00],
                              [-5.2245e-01, -2.8733e-01, -1.0875e+00]]],
                            [[[-1.0736e+00, 1.0169e+00, 2.1173e+00],
                              [-5.2245e-01, 4.7033e-01, -7.3524e-01],
                              [-2.8733e-01, -1.0875e+00, 1.8888e+00]]],
                            [[[-5.2245e-01, -1.0736e+00, 1.0169e+00],
                              [-2.8733e-01, 4.7033e-01, 2.1173e+00],
                              [-1.0875e+00, 1.8888e+00, -7.3524e-01]]],
                            [[[-2.8733e-01, -5.2245e-01, -1.0736e+00],
                              [-1.0875e+00, 4.7033e-01, 1.0169e+00],
                              [1.8888e+00, -7.3524e-01, 2.1173e+00]]],
                            [[[-1.0875e+00, -2.8733e-01, -5.2245e-01],
                              [1.8888e+00, 4.7033e-01, -1.0736e+00],
                              [-7.3524e-01, 2.1173e+00, 1.0169e+00]]],
                            [[[1.8888e+00, -1.0875e+00, -2.8733e-01],
                              [-7.3524e-01, 4.7033e-01, -5.2245e-01],
                              [2.1173e+00, 1.0169e+00, -1.0736e+00]]],
                            [[[-7.3524e-01, 1.8888e+00, -1.0875e+00],
                              [2.1173e+00, 4.7033e-01, -2.8733e-01],
                              [1.0169e+00, -1.0736e+00, -5.2245e-01]]],
                            [[[-5.6433e-01, 1.5835e+00, -1.5826e+00],
                              [-8.8974e-01, -4.3128e-01, -2.2423e-01],
                              [1.6552e-03, -1.7292e+00, 2.6639e-01]]],
                            [[[-8.8974e-01, -5.6433e-01, 1.5835e+00],
                              [1.6552e-03, -4.3128e-01, -1.5826e+00],
                              [-1.7292e+00, 2.6639e-01, -2.2423e-01]]],
                            [[[1.6552e-03, -8.8974e-01, -5.6433e-01],
                              [-1.7292e+00, -4.3128e-01, 1.5835e+00],
                              [2.6639e-01, -2.2423e-01, -1.5826e+00]]],
                            [[[-1.7292e+00, 1.6552e-03, -8.8974e-01],
                              [2.6639e-01, -4.3128e-01, -5.6433e-01],
                              [-2.2423e-01, -1.5826e+00, 1.5835e+00]]],
                            [[[2.6639e-01, -1.7292e+00, 1.6552e-03],
                              [-2.2423e-01, -4.3128e-01, -8.8974e-01],
                              [-1.5826e+00, 1.5835e+00, -5.6433e-01]]],
                            [[[-2.2423e-01, 2.6639e-01, -1.7292e+00],
                              [-1.5826e+00, -4.3128e-01, 1.6552e-03],
                              [1.5835e+00, -5.6433e-01, -8.8974e-01]]],
                            [[[-1.5826e+00, -2.2423e-01, 2.6639e-01],
                              [1.5835e+00, -4.3128e-01, -1.7292e+00],
                              [-5.6433e-01, -8.8974e-01, 1.6552e-03]]],
                            [[[1.5835e+00, -1.5826e+00, -2.2423e-01],
                              [-5.6433e-01, -4.3128e-01, 2.6639e-01],
                              [-8.8974e-01, 1.6552e-03, -1.7292e+00]]],
                            [[[-1.2951e-01, 1.3493e+00, -1.9329e+00],
                              [5.6248e-01, -5.1189e-01, 1.3614e+00],
                              [3.3680e-01, -8.7148e-01, 5.0592e-01]]],
                            [[[5.6248e-01, -1.2951e-01, 1.3493e+00],
                              [3.3680e-01, -5.1189e-01, -1.9329e+00],
                              [-8.7148e-01, 5.0592e-01, 1.3614e+00]]],
                            [[[3.3680e-01, 5.6248e-01, -1.2951e-01],
                              [-8.7148e-01, -5.1189e-01, 1.3493e+00],
                              [5.0592e-01, 1.3614e+00, -1.9329e+00]]],
                            [[[-8.7148e-01, 3.3680e-01, 5.6248e-01],
                              [5.0592e-01, -5.1189e-01, -1.2951e-01],
                              [1.3614e+00, -1.9329e+00, 1.3493e+00]]],
                            [[[5.0592e-01, -8.7148e-01, 3.3680e-01],
                              [1.3614e+00, -5.1189e-01, 5.6248e-01],
                              [-1.9329e+00, 1.3493e+00, -1.2951e-01]]],
                            [[[1.3614e+00, 5.0592e-01, -8.7148e-01],
                              [-1.9329e+00, -5.1189e-01, 3.3680e-01],
                              [1.3493e+00, -1.2951e-01, 5.6248e-01]]],
                            [[[-1.9329e+00, 1.3614e+00, 5.0592e-01],
                              [1.3493e+00, -5.1189e-01, -8.7148e-01],
                              [-1.2951e-01, 5.6248e-01, 3.3680e-01]]],
                            [[[1.3493e+00, -1.9329e+00, 1.3614e+00],
                              [-1.2951e-01, -5.1189e-01, 5.0592e-01],
                              [5.6248e-01, 3.3680e-01, -8.7148e-01]]],
                            [[[1.6781e-02, -8.3929e-01, 1.2060e+00],
                              [-1.0764e+00, 4.7821e-01, 1.5342e+00],
                              [-4.4542e-01, -1.8606e+00, 3.0827e-01]]],
                            [[[-1.0764e+00, 1.6781e-02, -8.3929e-01],
                              [-4.4542e-01, 4.7821e-01, 1.2060e+00],
                              [-1.8606e+00, 3.0827e-01, 1.5342e+00]]],
                            [[[-4.4542e-01, -1.0764e+00, 1.6781e-02],
                              [-1.8606e+00, 4.7821e-01, -8.3929e-01],
                              [3.0827e-01, 1.5342e+00, 1.2060e+00]]],
                            [[[-1.8606e+00, -4.4542e-01, -1.0764e+00],
                              [3.0827e-01, 4.7821e-01, 1.6781e-02],
                              [1.5342e+00, 1.2060e+00, -8.3929e-01]]],
                            [[[3.0827e-01, -1.8606e+00, -4.4542e-01],
                              [1.5342e+00, 4.7821e-01, -1.0764e+00],
                              [1.2060e+00, -8.3929e-01, 1.6781e-02]]],
                            [[[1.5342e+00, 3.0827e-01, -1.8606e+00],
                              [1.2060e+00, 4.7821e-01, -4.4542e-01],
                              [-8.3929e-01, 1.6781e-02, -1.0764e+00]]],
                            [[[1.2060e+00, 1.5342e+00, 3.0827e-01],
                              [-8.3929e-01, 4.7821e-01, -1.8606e+00],
                              [1.6781e-02, -1.0764e+00, -4.4542e-01]]],
                            [[[-8.3929e-01, 1.2060e+00, 1.5342e+00],
                              [1.6781e-02, 4.7821e-01, 3.0827e-01],
                              [-1.0764e+00, -4.4542e-01, -1.8606e+00]]]])

# Expected gradient w.r.t. the input features when the output is
# back-propagated with all ones: a constant 8 for every element, in the
# same (8, 1, 1, 3, 3) layout as ``np_feature``.
expected_grad = np.full((8, 1, 1, 3, 3), 8.)


@pytest.mark.parametrize('device', [
    'cpu',
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'npu',
        marks=pytest.mark.skipif(
            not IS_NPU_AVAILABLE, reason='requires NPU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
])
def test_active_rotated_filter(device):
    """Check ``active_rotated_filter`` forward values and input gradient.

    The op is run on ``device`` with the module-level ``np_feature`` and
    ``np_indices`` fixtures, then back-propagated with an all-ones upstream
    gradient. Both results are compared with the module-level references
    using an absolute tolerance of 1e-3.
    """
    indices = torch.tensor(np_indices, dtype=torch.int, device=device)
    feature = torch.tensor(
        np_feature, dtype=torch.float, device=device, requires_grad=True)

    rotated = active_rotated_filter(feature, indices)
    rotated.backward(torch.ones_like(rotated))

    forward_values = rotated.data.cpu().numpy()
    assert np.allclose(forward_values, expected_output, atol=1e-3)

    input_grad = feature.grad.data.cpu().numpy()
    assert np.allclose(input_grad, expected_grad, atol=1e-3)


================================================
FILE: tests/test_ops/test_assign_score_withk.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch

from mmcv.ops import assign_score_withk
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'npu',
        marks=pytest.mark.skipif(
            not IS_NPU_AVAILABLE, reason='requires NPU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
])
def test_paconv_assign_scores(device):
    """Compare ``assign_score_withk`` with hard-coded reference values.

    The forward result is checked on every parametrized device. Gradients of
    ``scores``, ``points`` and ``centers`` are only checked when ``device``
    is ``'cuda'`` or ``'musa'``. All comparisons use ``atol=1e-6``.
    """
    # Only these devices get gradients enabled and the backward check.
    check_backward = device in ('cuda', 'musa')

    scores = torch.tensor(
        [[[[0.06947571, 0.6065746], [0.28462553, 0.8378516],
           [0.7595994, 0.97220325], [0.519155, 0.766185]],
          [[0.15348864, 0.6051019], [0.21510637, 0.31916398],
           [0.00236845, 0.5842595], [0.6783676, 0.5216348]]],
         [[[0.23089725, 0.5568468], [0.7405102, 0.06438422],
           [0.6887394, 0.22089851], [0.0502342, 0.79228795]],
          [[0.44883424, 0.15427643], [0.13817799, 0.34856772],
           [0.7989621, 0.33788306], [0.15699774, 0.7693662]]]],
        device=device).float()
    points = torch.tensor(
        [[[[0.06001121, 0.92963666, 0.5753327, 0.7251477],
           [0.53563064, 0.23129565, 0.92366195, 0.44261628]],
          [[0.5770022, 0.56625944, 0.23560429, 0.11178821],
           [0.7735967, 0.95678777, 0.25468266, 0.02895975]],
          [[0.0589869, 0.09017515, 0.5977862, 0.02797985],
           [0.603862, 0.35991007, 0.85761684, 0.3096559]],
          [[0.22359002, 0.13983732, 0.5544243, 0.68863827],
           [0.85646236, 0.75651926, 0.8638947, 0.83600986]],
          [[0.45424145, 0.27458847, 0.6456112, 0.47162914],
           [0.15773582, 0.47645122, 0.79964715, 0.3323908]],
          [[0.8351399, 0.84696376, 0.9431732, 0.29418713],
           [0.77168906, 0.6996871, 0.19354361, 0.03392768]],
          [[0.30976456, 0.7074133, 0.581795, 0.976677],
           [0.69656056, 0.07199162, 0.4708506, 0.29117996]],
          [[0.5829035, 0.30201727, 0.76556486, 0.0935446],
           [0.88030535, 0.16129416, 0.9242525, 0.49545723]]],
         [[[0.50899494, 0.06482804, 0.44939405, 0.37704808],
           [0.47028124, 0.11969638, 0.62823206, 0.28560323]],
          [[0.40690207, 0.689753, 0.51636654, 0.23040164],
           [0.06935787, 0.00488842, 0.22462702, 0.09182382]],
          [[0.26611632, 0.00184339, 0.7730655, 0.5228131],
           [0.87776035, 0.77895886, 0.2787183, 0.16620636]],
          [[0.502574, 0.04039001, 0.5368497, 0.98379374],
           [0.40973026, 0.3238272, 0.9733018, 0.13988364]],
          [[0.04586202, 0.20983845, 0.20662665, 0.22270602],
           [0.60387236, 0.5155574, 0.51237285, 0.6528438]],
          [[0.45735973, 0.86821306, 0.61054605, 0.8370336],
           [0.45193362, 0.3734138, 0.7825672, 0.5699416]],
          [[0.44591594, 0.12447512, 0.09282011, 0.7055254],
           [0.25223452, 0.46696228, 0.7051136, 0.892151]],
          [[0.49615085, 0.47321403, 0.93138885, 0.7652197],
           [0.38766378, 0.30332977, 0.23131835, 0.02863514]]]],
        device=device).float()
    centers = torch.tensor(
        [[[[0.83878064, 0.96658987, 0.8033424, 0.9598312],
           [0.45035273, 0.8768925, 0.977736, 0.54547966]],
          [[0.01041394, 0.597893, 0.36212963, 0.4410367],
           [0.94879234, 0.8372817, 0.21237361, 0.67945415]],
          [[0.5096087, 0.26401454, 0.60034937, 0.5417416],
           [0.87591463, 0.546456, 0.4096033, 0.16373193]],
          [[0.79547447, 0.1482386, 0.12840575, 0.45384115],
           [0.5640288, 0.944541, 0.5745328, 0.73229736]],
          [[0.93011934, 0.7406011, 0.62621707, 0.8677915],
           [0.91563636, 0.3595413, 0.6678378, 0.6085383]],
          [[0.22431666, 0.65617776, 0.7483924, 0.6263364],
           [0.30968404, 0.78204364, 0.14899081, 0.09628749]],
          [[0.73675203, 0.72104895, 0.4648038, 0.6101647],
           [0.7817645, 0.16572917, 0.3311919, 0.43407398]],
          [[0.8193154, 0.09559608, 0.05978829, 0.90262103],
           [0.4256065, 0.8165596, 0.8206446, 0.6604721]]],
         [[[0.7159653, 0.18600845, 0.21433902, 0.3159626],
           [0.3921569, 0.33221376, 0.5061177, 0.7961841]],
          [[0.95338356, 0.04785997, 0.67185795, 0.6538394],
           [0.4729132, 0.33404195, 0.17750603, 0.8445621]],
          [[0.6755793, 0.16193843, 0.75943846, 0.92123103],
           [0.2781859, 0.03114432, 0.710638, 0.52729136]],
          [[0.8376105, 0.10858494, 0.13208169, 0.365772],
           [0.5930795, 0.27390373, 0.14036089, 0.170403]],
          [[0.3479789, 0.89855295, 0.04844379, 0.9871029],
           [0.29781651, 0.0244137, 0.9179047, 0.8081611]],
          [[0.12460887, 0.44991326, 0.19382608, 0.35037738],
           [0.2773472, 0.4362057, 0.36757517, 0.5993509]],
          [[0.29630446, 0.90046406, 0.5417113, 0.13510644],
           [0.09623539, 0.04226565, 0.32001644, 0.44358212]],
          [[0.5274848, 0.82096446, 0.9415489, 0.7123748],
           [0.7537517, 0.8086482, 0.85345286, 0.7472754]]]],
        device=device).float()
    if check_backward:
        for leaf in (points, scores, centers):
            leaf.requires_grad_()
    knn_idx = torch.tensor(
        [[[6, 7, 4, 6], [2, 4, 2, 4]], [[7, 1, 3, 2], [6, 0, 2, 6]]],
        device=device).long()
    expected_output = torch.tensor(
        [[[[-0.08134781, 0.03877336, -0.8212776, -0.2869547],
           [-0.23378491, -0.24112664, -0.1600166, -0.4121864]],
          [[-0.05780616, -0.12298299, -0.0370461, -0.07889931],
           [-0.13956165, -0.02006848, -0.10940295, -0.0293439]],
          [[0.09284145, 0.58250105, 0.5927749, 0.16774094],
           [0.27070042, 0.13422406, 0.2617501, 0.23416464]],
          [[-0.06121218, -0.09561322, -0.20408826, 0.08079343],
           [0.00944228, 0.03874819, 0.08404065, 0.04041629]]],
         [[[-0.2110898, -0.13335688, -0.09315082, 0.08512095],
           [0.09121774, 0.15976946, 0.23994486, 0.14350912]],
          [[-0.36167958, -0.14891288, -0.64470863, -0.0646704],
           [-0.28276974, -0.08847666, -0.46904767, 0.20491874]],
          [[-0.34877953, -0.35533834, -0.25225785, -0.4638189],
           [-0.1420663, 0.09467781, 0.17088932, 0.22580585]],
          [[-0.3879708, -0.3991068, 0.05276498, -0.46989647],
           [0.32522714, -0.02163534, 0.21604237, 0.4346682]]]]).float()

    # Forward pass with the 'sum' aggregation mode.
    result = assign_score_withk(scores, points, centers, knn_idx, 'sum')
    assert torch.allclose(result.detach().cpu(), expected_output, atol=1e-6)

    if not check_backward:
        return

    # Backward pass: the scalar loss is the sum of all output elements.
    result.sum().backward()
    expected_scores_grad = torch.tensor(
        [[[[0.04288036, -0.18217683], [-0.78873926, 0.7485497],
           [-0.6866992, 0.05346543], [0.04288036, -0.18217683]],
          [[-1.1407862, 0.13533896], [-0.06964391, -0.22948086],
           [-1.1407862, 0.13533896], [-0.06964391, -0.22948086]]],
         [[[-0.3363995, -2.212181], [-1.1589496, -2.7724311],
           [-0.9387654, -1.3163853], [-1.4385346, -1.0614843]],
          [[-0.5048497, 1.4143617], [-0.47332114, 0.6017133],
           [-0.30974793, 1.1995442], [-0.5048497, 1.4143617]]]]).float()
    expected_points_grad = torch.tensor(
        [[[[0., 0., 0., 0.], [0., 0., 0., 0.]],
          [[0., 0., 0., 0.], [0., 0., 0., 0.]],
          [[0.15585709, 0.15585709, 0.15585709, 0.15585709],
           [1.1893613, 1.1893613, 1.1893613, 1.1893613]],
          [[0., 0., 0., 0.], [0., 0., 0., 0.]],
          [[1.6530733, 1.6530733, 1.6530733, 1.6530733],
           [1.8130021, 1.8130021, 1.8130021, 1.8130021]],
          [[0., 0., 0., 0.], [0., 0., 0., 0.]],
          [[0.58863074, 0.58863074, 0.58863074, 0.58863074],
           [1.3727596, 1.3727596, 1.3727596, 1.3727596]],
          [[0.28462553, 0.28462553, 0.28462553, 0.28462553],
           [0.8378516, 0.8378516, 0.8378516, 0.8378516]]],
         [[[0.13817799, 0.13817799, 0.13817799, 0.13817799],
           [0.34856772, 0.34856772, 0.34856772, 0.34856772]],
          [[0.7405102, 0.7405102, 0.7405102, 0.7405102],
           [0.06438422, 0.06438422, 0.06438422, 0.06438422]],
          [[0.8491963, 0.8491963, 0.8491963, 0.8491963],
           [1.1301711, 1.1301711, 1.1301711, 1.1301711]],
          [[0.6887394, 0.6887394, 0.6887394, 0.6887394],
           [0.22089851, 0.22089851, 0.22089851, 0.22089851]],
          [[0., 0., 0., 0.], [0., 0., 0., 0.]],
          [[0., 0., 0., 0.], [0., 0., 0., 0.]],
          [[0.605832, 0.605832, 0.605832, 0.605832],
           [0.92364264, 0.92364264, 0.92364264, 0.92364264]],
          [[0.23089725, 0.23089725, 0.23089725, 0.23089725],
           [0.5568468, 0.5568468, 0.5568468, 0.5568468]]]]).float()
    expected_centers_grad = torch.tensor(
        [[[[0., 0., 0., 0.], [0., 0., 0., 0.]],
          [[0., 0., 0., 0.], [0., 0., 0., 0.]],
          [[-1.0493311, -1.0493311, -1.0493311, -1.0493311],
           [-2.0301602, -2.0301602, -2.0301602, -2.0301602]],
          [[0., 0., 0., 0.], [0., 0., 0., 0.]],
          [[0., 0., 0., 0.], [0., 0., 0., 0.]],
          [[0., 0., 0., 0.], [0., 0., 0., 0.]],
          [[-1.6328557, -1.6328557, -1.6328557, -1.6328557],
           [-3.1828144, -3.1828144, -3.1828144, -3.1828144]],
          [[0., 0., 0., 0.], [0., 0., 0., 0.]]],
         [[[0., 0., 0., 0.], [0., 0., 0., 0.]],
          [[0., 0., 0., 0.], [0., 0., 0., 0.]],
          [[0., 0., 0., 0.], [0., 0., 0., 0.]],
          [[0., 0., 0., 0.], [0., 0., 0., 0.]],
          [[0., 0., 0., 0.], [0., 0., 0., 0.]],
          [[0., 0., 0., 0.], [0., 0., 0., 0.]],
          [[-1.5429721, -1.5429721, -1.5429721, -1.5429721],
           [-1.6100934, -1.6100934, -1.6100934, -1.6100934]],
          [[-1.7103812, -1.7103812, -1.7103812, -1.7103812],
           [-1.6344175, -1.6344175, -1.6344175, -1.6344175]]]]).float()

    grad_pairs = (
        (scores, expected_scores_grad),
        (points, expected_points_grad),
        (centers, expected_centers_grad),
    )
    for leaf, expected_grad in grad_pairs:
        assert torch.allclose(
            leaf.grad.detach().cpu(), expected_grad, atol=1e-6)


================================================
FILE: tests/test_ops/test_ball_query.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch

from mmcv.ops import ball_query
from mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,
                        IS_NPU_AVAILABLE)


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'mlu',
        marks=pytest.mark.skipif(
            not IS_MLU_AVAILABLE, reason='requires MLU support')),
    pytest.param(
        'npu',
        marks=pytest.mark.skipif(
            not IS_NPU_AVAILABLE, reason='requires NPU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
])
def test_ball_query(device):
    """Check batched ``ball_query`` indices against hard-coded references.

    Two calls are made on the same point sets: one with leading arguments
    ``(0, 0.2, 5)`` and a "dilated" one with ``(0.2, 0.4, 5)``. Every
    returned index must equal the expected one exactly.
    """
    new_xyz = torch.tensor(
        [[[-0.0740, 1.3147, -1.3625], [-2.2769, 2.7817, -0.2334],
          [-0.4003, 2.4666, -0.5116], [-0.0740, 1.3147, -1.3625],
          [-0.0740, 1.3147, -1.3625]],
         [[-2.0289, 2.4952, -0.1708], [-2.0668, 6.0278, -0.4875],
          [0.4066, 1.4211, -0.2947], [-2.0289, 2.4952, -0.1708],
          [-2.0289, 2.4952, -0.1708]]],
        device=device)

    xyz = torch.tensor(
        [[[-0.0740, 1.3147, -1.3625], [0.5555, 1.0399, -1.3634],
          [-0.4003, 2.4666, -0.5116], [-0.5251, 2.4379, -0.8466],
          [-0.9691, 1.1418, -1.3733], [-0.2232, 0.9561, -1.3626],
          [-2.2769, 2.7817, -0.2334], [-0.2822, 1.3192, -1.3645],
          [0.1533, 1.5024, -1.0432], [0.4917, 1.1529, -1.3496]],
         [[-2.0289, 2.4952, -0.1708], [-0.7188, 0.9956, -0.5096],
          [-2.0668, 6.0278, -0.4875], [-1.9304, 3.3092, 0.6610],
          [0.0949, 1.4332, 0.3140], [-1.2879, 2.0008, -0.7791],
          [-0.7252, 0.9611, -0.6371], [0.4066, 1.4211, -0.2947],
          [0.3220, 1.4447, 0.3548], [-0.9744, 2.3856, -1.2000]]],
        device=device)

    # plain ball query
    plain_idx = ball_query(0, 0.2, 5, xyz, new_xyz)
    plain_expected = torch.tensor(
        [[[0, 0, 0, 0, 0], [6, 6, 6, 6, 6], [2, 2, 2, 2, 2], [0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0]],
         [[0, 0, 0, 0, 0], [2, 2, 2, 2, 2], [7, 7, 7, 7, 7], [0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0]]],
        device=device)
    assert torch.all(plain_idx == plain_expected)

    # test dilated ball query
    dilated_idx = ball_query(0.2, 0.4, 5, xyz, new_xyz)
    dilated_expected = torch.tensor(
        [[[0, 5, 7, 0, 0], [6, 6, 6, 6, 6], [2, 3, 2, 2, 2], [0, 5, 7, 0, 0],
          [0, 5, 7, 0, 0]],
         [[0, 0, 0, 0, 0], [2, 2, 2, 2, 2], [7, 7, 7, 7, 7], [0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0]]],
        device=device)
    assert torch.all(dilated_idx == dilated_expected)


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'npu',
        marks=pytest.mark.skipif(
            not IS_NPU_AVAILABLE, reason='requires NPU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
])
def test_stack_ball_query(device):
    """Check ``ball_query`` when per-batch counts are passed explicitly.

    The query runs on the default float inputs first. It is then repeated
    in double precision (only for ``'cuda'`` and ``'npu'``) and in half
    precision (all devices), with the expected indices cast to the same
    dtype before the comparison.
    """
    new_xyz = torch.tensor(
        [[-0.0740, 1.3147, -1.3625], [-2.2769, 2.7817, -0.2334],
         [-0.4003, 2.4666, -0.5116], [-0.0740, 1.3147, -1.3625],
         [-0.0740, 1.3147, -1.3625], [-2.0289, 2.4952, -0.1708],
         [-2.0668, 6.0278, -0.4875], [0.4066, 1.4211, -0.2947],
         [-2.0289, 2.4952, -0.1708], [-2.0289, 2.4952, -0.1708]],
        device=device)
    new_xyz_batch_cnt = torch.tensor([5, 5], dtype=torch.int32, device=device)
    xyz = torch.tensor(
        [[-0.0740, 1.3147, -1.3625], [0.5555, 1.0399, -1.3634],
         [-0.4003, 2.4666, -0.5116], [-0.5251, 2.4379, -0.8466],
         [-0.9691, 1.1418, -1.3733], [-0.2232, 0.9561, -1.3626],
         [-2.2769, 2.7817, -0.2334], [-0.2822, 1.3192, -1.3645],
         [0.1533, 1.5024, -1.0432], [0.4917, 1.1529, -1.3496],
         [-2.0289, 2.4952, -0.1708], [-0.7188, 0.9956, -0.5096],
         [-2.0668, 6.0278, -0.4875], [-1.9304, 3.3092, 0.6610],
         [0.0949, 1.4332, 0.3140], [-1.2879, 2.0008, -0.7791],
         [-0.7252, 0.9611, -0.6371], [0.4066, 1.4211, -0.2947],
         [0.3220, 1.4447, 0.3548], [-0.9744, 2.3856, -1.2000]],
        device=device)
    xyz_batch_cnt = torch.tensor([10, 10], dtype=torch.int32, device=device)
    expected_idx = torch.tensor(
        [[0, 0, 0, 0, 0], [6, 6, 6, 6, 6], [2, 2, 2, 2, 2], [0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [2, 2, 2, 2, 2], [7, 7, 7, 7, 7],
         [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]],
        device=device)

    def stacked_query(points, queries):
        # Same leading arguments and batch counts for every precision.
        return ball_query(0, 0.2, 5, points, queries, xyz_batch_cnt,
                          new_xyz_batch_cnt)

    assert torch.all(stacked_query(xyz, new_xyz) == expected_idx)

    # Double precision is only exercised on CUDA and NPU; half on all.
    extra_dtypes = [torch.half]
    if device in ('cuda', 'npu'):
        extra_dtypes.insert(0, torch.double)

    for dtype in extra_dtypes:
        xyz = xyz.to(dtype)
        new_xyz = new_xyz.to(dtype)
        expected_idx = expected_idx.to(dtype)
        assert torch.all(stacked_query(xyz, new_xyz) == expected_idx)


================================================
FILE: tests/test_ops/test_bbox.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch
from mmengine.utils import digit_version

from mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MPS_AVAILABLE,
                        IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE)


class TestBBox:
    """Tests for ``mmcv.ops.bbox_overlaps`` across devices and dtypes."""

    def _test_bbox_overlaps(self, device='cpu', dtype=torch.float):
        """Run four overlap scenarios on ``device`` with tensors of ``dtype``.

        Each scenario compares the op's output with a hard-coded reference
        using a relative tolerance of 1e-2.
        """
        from mmcv.ops import bbox_overlaps

        def as_boxes(rows):
            # Build on CPU first, then move and cast, as the op is fed
            # tensors of the requested device/dtype.
            return torch.tensor(rows).to(device).type(dtype)

        def check(boxes1, boxes2, expected, **kwargs):
            lhs = as_boxes(boxes1)
            rhs = as_boxes(boxes2)
            reference = np.array(expected)
            ious = bbox_overlaps(lhs, rhs, **kwargs)
            assert np.allclose(ious.cpu().numpy(), reference, rtol=1e-2)

        # pairwise overlaps, offset=1
        check([[1.0, 1.0, 3.0, 4.0], [2.0, 2.0, 3.0, 4.0],
               [7.0, 7.0, 8.0, 8.0]],
              [[0.0, 2.0, 2.0, 5.0], [2.0, 1.0, 3.0, 3.0]],
              [[0.33333334, 0.5], [0.2, 0.5], [0.0, 0.0]],
              offset=1)

        # aligned overlaps, offset=1
        check([[1.0, 1.0, 3.0, 4.0], [2.0, 2.0, 3.0, 4.0]],
              [[0.0, 2.0, 2.0, 5.0], [2.0, 1.0, 3.0, 3.0]],
              [0.33333334, 0.5],
              aligned=True,
              offset=1)

        # a single box against four boxes, offset=1
        check([[0.0, 0.0, 3.0, 3.0]],
              [[4.0, 0.0, 5.0, 3.0], [3.0, 0.0, 4.0, 3.0],
               [2.0, 0.0, 3.0, 3.0], [1.0, 0.0, 2.0, 3.0]],
              [0, 0.2, 0.5, 0.5],
              offset=1)

        # 1000 aligned pairs of shifted boxes, default offset
        shifts = range(1000)
        check([[10.0 + i, 10.0 + i, 30.0 + i, 30.0 + i] for i in shifts],
              [[20.0 + i, 20.0 + i, 40.0 + i, 40.0 + i] for i in shifts],
              [1 / 7] * 1000,
              aligned=True)

    @pytest.mark.parametrize('device', [
        'cpu',
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support')),
        pytest.param(
            'mps',
            marks=pytest.mark.skipif(
                not IS_MPS_AVAILABLE
                or digit_version(torch.__version__) >= digit_version('2.1.0'),
                reason='requires MPS support')),
        pytest.param(
            'npu',
            marks=pytest.mark.skipif(
                not IS_NPU_AVAILABLE, reason='requires NPU support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
    ])
    def test_bbox_overlaps_float(self, device):
        """Run the shared overlap scenarios with ``torch.float`` tensors."""
        self._test_bbox_overlaps(device, dtype=torch.float)

    @pytest.mark.parametrize('device', [
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support')),
        pytest.param(
            'npu',
            marks=pytest.mark.skipif(
                not IS_NPU_AVAILABLE, reason='requires NPU support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
    ])
    def test_bbox_overlaps_half(self, device):
        """Run the shared overlap scenarios with ``torch.half`` tensors."""
        self._test_bbox_overlaps(device, dtype=torch.half)


================================================
FILE: tests/test_ops/test_bezier_align.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch

from mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE

# Fixture pair consumed by ``test_bezieralign``:
# ``inputs[0]`` is the input feature map and ``inputs[1]`` the rois.
inputs = (
    [[[
        [1., 2., 5., 6.],
        [3., 4., 7., 8.],
        [9., 10., 13., 14.],
        [11., 12., 15., 16.],
    ]]],
    [[0., 0., 0., 1, 0., 2., 0., 3., 0., 3., 3., 2., 3., 1., 3., 0., 3.]],
)

# Reference pair for the same test:
# ``outputs[0]`` is the expected forward result and ``outputs[1]`` the
# expected gradient with respect to the input feature map.
outputs = (
    [[[
        [1., 1.75, 3.5, 5.25],
        [2.5, 3.25, 5., 6.75],
        [6., 6.75, 8.5, 10.25],
        [9.5, 10.25, 12., 13.75],
    ]]],
    [[[
        [1.5625, 1.5625, 1.5625, 0.3125],
        [1.5625, 1.5625, 1.5625, 0.3125],
        [1.5625, 1.5625, 1.5625, 0.3125],
        [0.3125, 0.3125, 0.3125, 0.0625],
    ]]],
)


@pytest.mark.parametrize('device', [
    'cpu',
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
])
@pytest.mark.parametrize('dtype', [
    torch.float,
    pytest.param(
        torch.double,
        marks=[
            pytest.mark.skipif(
                IS_MUSA_AVAILABLE,
                reason='MUSA does not support for 64-bit floating point')
        ]), torch.half
])
def test_bezieralign(device, dtype):
    """Check ``bezier_align`` forward output and input gradient.

    The test is skipped when the op cannot be imported. Results are cast
    to float before being compared with the module-level ``outputs``
    references, using an absolute tolerance of 1e-3.
    """
    try:
        from mmcv.ops import bezier_align
    except ModuleNotFoundError:
        pytest.skip('test requires compilation')

    feature_np, rois_np = np.array(inputs[0]), np.array(inputs[1])
    want_output, want_grad = np.array(outputs[0]), np.array(outputs[1])

    output_size = (4, 4)
    spatial_scale = 1.0
    sampling_ratio = 1

    x = torch.tensor(
        feature_np, dtype=dtype, device=device, requires_grad=True)
    rois = torch.tensor(rois_np, dtype=dtype, device=device)

    aligned = bezier_align(x, rois, output_size, spatial_scale,
                           sampling_ratio, False)
    # Back-propagate an all-ones upstream gradient.
    aligned.backward(torch.ones_like(aligned))

    got_output = aligned.data.type(torch.float).cpu().numpy()
    assert np.allclose(got_output, want_output, atol=1e-3)

    got_grad = x.grad.data.type(torch.float).cpu().numpy()
    assert np.allclose(got_grad, want_grad, atol=1e-3)


================================================
FILE: tests/test_ops/test_bias_act.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch

from mmcv.ops import bias_act
from mmcv.ops.bias_act import EasyDict
from mmcv.utils import IS_MUSA_AVAILABLE

_USING_PARROTS = True
try:
    from parrots.autograd import gradcheck
except ImportError:
    from torch.autograd import gradcheck, gradgradcheck
    _USING_PARROTS = False


class TestBiasAct:
    """Forward and gradient tests for ``bias_act`` on CPU, CUDA and MUSA.

    The CPU, CUDA and MUSA tests share one set of forward checks through
    ``_check_forward``; the accelerator tests additionally run numerical
    gradient checks through ``_check_gradients``.
    """

    # Activation names exercised by the forward shape checks.
    _ACTS = ('relu', 'lrelu', 'tanh', 'sigmoid', 'elu', 'selu', 'softplus',
             'swish')

    @classmethod
    def setup_class(cls):
        """Create the random input and bias shared by all tests."""
        cls.input_tensor = torch.randn((1, 3), requires_grad=True)
        cls.bias = torch.randn(3, requires_grad=True)

    def _check_gradients(self, to_device):
        """Run first (and, without parrots, second) order gradient checks.

        Args:
            to_device (callable): Maps a CPU tensor to the device under
                test, e.g. ``lambda t: t.cuda()``.
        """

        def fresh_inputs():
            # A new device copy per check, so checks do not share tensors.
            return to_device(self.input_tensor), to_device(self.bias)

        if _USING_PARROTS:
            gradcheck(bias_act, fresh_inputs(), delta=1e-4, pt_atol=1e-3)
        else:
            gradcheck(bias_act, fresh_inputs(), eps=1e-4, atol=1e-3)
            gradgradcheck(bias_act, fresh_inputs(), eps=1e-4, atol=1e-3)

    def _check_forward(self, to_device):
        """Run the forward checks shared by every device.

        Args:
            to_device (callable): Maps a CPU tensor to the device under
                test; the identity function for the CPU test.
        """
        input_tensor = to_device(self.input_tensor)
        bias = to_device(self.bias)

        out = bias_act(input_tensor, bias)
        assert out.shape == (1, 3)

        # test with different dim
        other_input = to_device(torch.randn((1, 1, 3), requires_grad=True))
        other_bias = to_device(torch.randn(3, requires_grad=True))
        out = bias_act(other_input, other_bias, dim=2)
        assert out.shape == (1, 1, 3)

        # test with different act
        for act in self._ACTS:
            out = bias_act(input_tensor, bias, act=act)
            assert out.shape == (1, 3)

        # test with different alpha
        out = bias_act(input_tensor, bias, act='lrelu', alpha=0.1)
        assert out.shape == (1, 3)

        # test with different gain
        out1 = bias_act(input_tensor, bias, act='lrelu', gain=0.2)
        out2 = bias_act(input_tensor, bias, act='lrelu', gain=0.1)
        assert torch.allclose(out1, out2 * 2)

        # test with different clamp: each output is bounded by its own
        # clamp value (the clamp=0.2 case used to be compared with 0.5,
        # which could not detect a missing clamp between 0.2 and 0.5).
        for clamp in (0.5, 0.2):
            out = bias_act(input_tensor, bias, act='lrelu', clamp=clamp)
            assert out.max() <= clamp

    def test_bias_act_cpu(self):
        """Forward checks on CPU tensors."""
        self._check_forward(lambda tensor: tensor)

    @pytest.mark.skipif(not torch.cuda.is_available(), reason='requires cuda')
    def test_bias_act_cuda(self):
        """Gradient and forward checks on CUDA tensors."""
        self._check_gradients(lambda tensor: tensor.cuda())
        self._check_forward(lambda tensor: tensor.cuda())

    @pytest.mark.skipif(not IS_MUSA_AVAILABLE, reason='requires musa')
    def test_bias_act_musa(self):
        """Gradient and forward checks on MUSA tensors."""
        self._check_gradients(lambda tensor: tensor.musa())
        self._check_forward(lambda tensor: tensor.musa())

    def test_easy_dict(self):
        """Attribute get, set and delete on an ``EasyDict`` (``cuda_idx``)."""
        easy_dict = EasyDict(
            func=lambda x, **_: x,
            def_alpha=0,
            def_gain=1,
            cuda_idx=1,
            ref='',
            has_2nd_grad=False)
        _ = easy_dict.def_alpha
        easy_dict.def_alpha = 1
        del easy_dict.def_alpha

    def test_easy_dict_musa(self):
        """Attribute get, set and delete on an ``EasyDict`` (``musa_idx``)."""
        easy_dict = EasyDict(
            func=lambda x, **_: x,
            def_alpha=0,
            def_gain=1,
            musa_idx=1,
            ref='',
            has_2nd_grad=False)
        _ = easy_dict.def_alpha
        easy_dict.def_alpha = 1
        del easy_dict.def_alpha


================================================
FILE: tests/test_ops/test_bilinear_grid_sample.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch
import torch.nn.functional as F


class TestBilinearGridSample:
    """Compare ``bilinear_grid_sample`` with ``F.grid_sample``."""

    def _test_bilinear_grid_sample(self,
                                   dtype=torch.float,
                                   align_corners=False,
                                   multiplier=1,
                                   precision=1e-3):
        """Sample one random feature map with both implementations.

        Args:
            dtype (torch.dtype): dtype of the random input and of the grid.
            align_corners (bool): Forwarded to ``F.affine_grid`` and to both
                sampling functions.
            multiplier (float): Factor applied to the identity sampling grid.
                Magnitudes above 1 push coordinates outside ``[-1, 1]``.
            precision (float): Relative tolerance handed to ``np.allclose``.
        """
        from mmcv.ops.point_sample import bilinear_grid_sample

        feat = torch.rand(1, 1, 20, 20, dtype=dtype)
        # Identity affine transform, resampled to a 15x15 output grid.
        theta = torch.Tensor([[[1, 0, 0], [0, 1, 0]]])
        grid = F.affine_grid(
            theta, (1, 1, 15, 15), align_corners=align_corners).type_as(feat)
        grid *= multiplier

        out = bilinear_grid_sample(feat, grid, align_corners=align_corners)
        ref_out = F.grid_sample(feat, grid, align_corners=align_corners)

        # ``precision`` is a relative tolerance; atol keeps numpy's default.
        assert np.allclose(
            out.detach().cpu().numpy(),
            ref_out.detach().cpu().numpy(),
            rtol=precision)

    def test_bilinear_grid_sample(self):
        """Run every (align_corners, multiplier) case for both dtypes."""
        cases = (
            (False, 1),
            (True, 1),
            (True, 5),
            (False, 10),
            (True, -6),
            (False, -10),
        )
        for dtype in (torch.double, torch.float):
            for align_corners, multiplier in cases:
                self._test_bilinear_grid_sample(dtype, align_corners,
                                                multiplier)


================================================
FILE: tests/test_ops/test_border_align.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import copy

import numpy as np
import pytest
import torch

from mmcv.utils import IS_MUSA_AVAILABLE

# Input feature map in [1, 4c, h, w] layout. The nested list below has shape
# (1, 4, 3, 4), i.e. c=1, h=3, w=4.
input_arr = [[[[1., 2., 3., 4.], [5., 6., 7., 8.], [9., 10., 11., 12.]],
              [[6, 7, 5, 8], [2, 1, 3, 4], [12, 9, 11, 10]],
              [[-2, -3, 2, 0], [-4, -5, 1, -1], [-1, -1, -1, -1]],
              [[0, -1, 2, 1], [-4, -3, -2, -1], [-1, -2, -3, -4]]]]
# Boxes in [1, h*w, 4] layout: 12 boxes of four values each.
boxes_arr = [[[0, 0, 2, 1], [1, 0, 3, 1], [1, 0, 2, 1], [0, 0, 3, 1],
              [0, 0, 1, 2], [0, 0, 2, 2], [1, 0, 2, 1], [1, 0, 3, 1],
              [0, 1, 1, 2], [0, 0, 3, 2], [1, 0, 3, 2], [2, 0, 3, 2]]]
# Expected forward results, keyed by pool_size.
output_dict = {
    # Each value is laid out as [1, c, h*w, 4].
    # The expected outputs were checked by hand.

    # pool_size=1
    1: [[[[3., 6., 1., 2.], [4., 7., -1., 1.], [3., 7., 1., 2.],
          [4., 6., -1., 1.], [2., 12., -1., -1.], [3., 12., -1., 2.],
          [3., 7., 1., 2.], [4., 7., -1., 1.], [6., 12., -1., -2.],
          [4., 12., -1., 1.], [4., 9., -1., 1.], [4., 11., -1., 1.]]]],

    # pool_size=2
    2: [[[[3., 6., 1., 2.], [4., 7., 1., 1.], [3., 7., 1., 2.],
          [4., 6., -1., 1.], [2., 12., -1., -1.], [3., 12., -1., 2.],
          [3., 7., 1., 2.], [4., 7., 1., 1.], [6., 12., -1., -2.],
          [4., 12., -1., 1.], [4., 9., -1., 1.], [4., 11., -1., 1.]]]],
}
# Expected gradients with respect to the input, keyed by pool_size.
input_grad_dict = {
    # Each value is laid out as [1, 4c, h, w], matching input_arr.
    # The expected gradients were checked by hand.

    # pool_size=1
    1: [[[[0., 1., 4., 6.], [0., 1., 0., 0.], [0., 0., 0., 0.]],
         [[2., 4., 0., 0.], [0., 0., 0., 0.], [4., 1., 1., 0.]],
         [[0., 0., 0., 0.], [0., 0., 3., 3.], [0., 2., 1., 3.]],
         [[0., 1., 4., 6.], [0., 0., 0., 0.], [0., 1., 0., 0.]]]],

    # pool_size=2
    2: [[[[0., 1., 4., 6.], [0., 1., 0., 0.], [0., 0., 0., 0.]],
         [[2., 4., 0., 0.], [0., 0., 0., 0.], [4., 1., 1., 0.]],
         [[0., 0., 0., 0.], [0., 0., 5., 1.], [0., 2., 1., 3.]],
         [[0., 1., 4., 6.], [0., 0., 0., 0.], [0., 1., 0., 0.]]]],
}


def _test_border_align_allclose(device, dtype, pool_size):
    """Check ``border_align`` and ``BorderAlign`` against stored references.

    Both the functional and the module form are run forward and backward
    (with an all-ones upstream gradient) and compared with the module-level
    fixtures for the given ``pool_size``.

    Args:
        device (str): Target device; 'cuda' and 'musa' are skipped when the
            corresponding backend is unavailable.
        dtype (torch.dtype): dtype of the input and box tensors.
        pool_size (int): Key into ``output_dict`` / ``input_grad_dict`` and
            argument forwarded to the op.
    """
    # Backend guards: pytest.skip raises, so two plain ifs are enough.
    if device == 'cuda' and not torch.cuda.is_available():
        pytest.skip('test requires GPU')
    if device == 'musa' and not IS_MUSA_AVAILABLE:
        pytest.skip('test requires GPU')
    try:
        from mmcv.ops import BorderAlign, border_align
    except ModuleNotFoundError:
        pytest.skip('BorderAlign op is not successfully compiled')

    expected_out = np.array(output_dict[pool_size])
    expected_grad = np.array(input_grad_dict[pool_size])

    feats = torch.tensor(
        np.array(input_arr), dtype=dtype, device=device, requires_grad=True)
    boxes = torch.tensor(np.array(boxes_arr), dtype=dtype, device=device)

    def as_numpy(tensor):
        # Bring a result back to the host as a numpy array of ``dtype``.
        return tensor.data.type(dtype).cpu().numpy()

    # Functional interface, run on a deep copy so that ``feats`` keeps a
    # clean ``.grad`` for the module check below.
    feats_copy = copy.deepcopy(feats)
    func_out = border_align(feats_copy, boxes, pool_size)
    func_out.backward(torch.ones_like(func_out))
    assert np.allclose(as_numpy(func_out), expected_out, atol=1e-5)
    assert np.allclose(as_numpy(feats_copy.grad), expected_grad, atol=1e-5)

    # Module interface.
    module_out = BorderAlign(pool_size)(feats, boxes)
    module_out.backward(torch.ones_like(module_out))
    assert np.allclose(as_numpy(module_out), expected_out, atol=1e-5)
    assert np.allclose(as_numpy(feats.grad), expected_grad, atol=1e-5)


# Availability of each device is checked inside the helper, which skips the
# case when the backend is missing.
@pytest.mark.parametrize('device', ['cuda', 'musa'])
@pytest.mark.parametrize('dtype', [
    torch.float,
    torch.half,
    # The double case is skipped whenever MUSA is available, whichever device
    # the case targets.
    pytest.param(
        torch.double,
        marks=pytest.mark.skipif(
            IS_MUSA_AVAILABLE,
            reason='MUSA does not support for 64-bit floating point')),
])
@pytest.mark.parametrize('pool_size', [1, 2])
def test_border_align(device, dtype, pool_size):
    """Run the border_align check for one device/dtype/pool_size case."""
    _test_border_align_allclose(device, dtype, pool_size)


================================================
FILE: tests/test_ops/test_box_iou_quadri.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch

from mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE


class TestBoxIoUQuadri:
    """Check ``box_iou_quadri`` against fixed reference overlaps."""

    @pytest.mark.parametrize('device', [
        'cpu',
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'npu',
            marks=pytest.mark.skipif(
                not IS_NPU_AVAILABLE, reason='requires NPU support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
    ])
    def test_box_iou_quadri_cuda(self, device):
        """Default-mode overlaps, pairwise and aligned, on ``device``."""
        from mmcv.ops import box_iou_quadri

        # Two sets of three boxes, eight values per box.
        quads_a = np.asarray([[1.0, 1.0, 3.0, 4.0, 4.0, 4.0, 4.0, 1.0],
                              [2.0, 2.0, 3.0, 4.0, 4.0, 2.0, 3.0, 1.0],
                              [7.0, 7.0, 8.0, 8.0, 9.0, 7.0, 8.0, 6.0]],
                             dtype=np.float32)
        quads_b = np.asarray([[0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 2.0, 0.0],
                              [2.0, 1.0, 2.0, 4.0, 4.0, 4.0, 4.0, 1.0],
                              [7.0, 6.0, 7.0, 8.0, 9.0, 8.0, 9.0, 6.0]],
                             dtype=np.float32)
        want_pairwise = np.asarray(
            [[0.0714, 1.0000, 0.0000], [0.0000, 0.5000, 0.0000],
             [0.0000, 0.0000, 0.5000]],
            dtype=np.float32)
        want_aligned = np.asarray([0.0714, 0.5000, 0.5000], dtype=np.float32)

        tensor_a = torch.from_numpy(quads_a).to(device)
        tensor_b = torch.from_numpy(quads_b).to(device)

        got_pairwise = box_iou_quadri(tensor_a, tensor_b).cpu().numpy()
        assert np.allclose(got_pairwise, want_pairwise, atol=1e-4)

        got_aligned = box_iou_quadri(tensor_a, tensor_b, aligned=True)
        assert np.allclose(
            got_aligned.cpu().numpy(), want_aligned, atol=1e-4)

    @pytest.mark.parametrize('device', [
        'cpu',
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'npu',
            marks=pytest.mark.skipif(
                not IS_NPU_AVAILABLE, reason='requires NPU support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
    ])
    def test_box_iou_quadri_iof_cuda(self, device):
        """``mode='iof'`` overlaps, pairwise and aligned, on ``device``."""
        from mmcv.ops import box_iou_quadri

        # Same two box sets as the default-mode test.
        quads_a = np.asarray([[1.0, 1.0, 3.0, 4.0, 4.0, 4.0, 4.0, 1.0],
                              [2.0, 2.0, 3.0, 4.0, 4.0, 2.0, 3.0, 1.0],
                              [7.0, 7.0, 8.0, 8.0, 9.0, 7.0, 8.0, 6.0]],
                             dtype=np.float32)
        quads_b = np.asarray([[0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 2.0, 0.0],
                              [2.0, 1.0, 2.0, 4.0, 4.0, 4.0, 4.0, 1.0],
                              [7.0, 6.0, 7.0, 8.0, 9.0, 8.0, 9.0, 6.0]],
                             dtype=np.float32)
        want_pairwise = np.asarray(
            [[0.1111, 1.0000, 0.0000], [0.0000, 1.0000, 0.0000],
             [0.0000, 0.0000, 1.0000]],
            dtype=np.float32)
        want_aligned = np.asarray([0.1111, 1.0000, 1.0000], dtype=np.float32)

        tensor_a = torch.from_numpy(quads_a).to(device)
        tensor_b = torch.from_numpy(quads_b).to(device)

        got_pairwise = box_iou_quadri(tensor_a, tensor_b, mode='iof')
        assert np.allclose(
            got_pairwise.cpu().numpy(), want_pairwise, atol=1e-4)

        got_aligned = box_iou_quadri(
            tensor_a, tensor_b, mode='iof', aligned=True)
        assert np.allclose(
            got_aligned.cpu().numpy(), want_aligned, atol=1e-4)


================================================
FILE: tests/test_ops/test_box_iou_rotated.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch

from mmcv.ops import box_iou_rotated
from mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,
                        IS_NPU_AVAILABLE)


class TestBoxIoURotated:
    """Check ``box_iou_rotated`` against fixed reference overlaps.

    Every test uses the same two box sets and differs only in the device and
    in the overlap mode, so the shared steps live in ``_check``.
    """

    # Five values per box; the last one is the rotation angle, whose sign is
    # flipped by the tests for the counter-clockwise convention.
    _BOXES1 = [[1.0, 1.0, 3.0, 4.0, 0.5], [2.0, 2.0, 3.0, 4.0, 0.6],
               [7.0, 7.0, 8.0, 8.0, 0.4]]
    _BOXES2 = [[0.0, 2.0, 2.0, 5.0, 0.3], [2.0, 1.0, 3.0, 3.0, 0.5],
               [5.0, 5.0, 6.0, 7.0, 0.4]]

    # Expected results with the op's default mode.
    _EXPECT_IOUS = [[0.3708, 0.4351, 0.0000], [0.1104, 0.4487, 0.0424],
                    [0.0000, 0.0000, 0.3622]]
    _EXPECT_IOUS_ALIGNED = [0.3708, 0.4487, 0.3622]

    # Expected results with mode='iof'.
    _EXPECT_IOFS = [[0.4959, 0.5306, 0.0000], [0.1823, 0.5420, 0.1832],
                    [0.0000, 0.0000, 0.4404]]
    _EXPECT_IOFS_ALIGNED = [0.4959, 0.5420, 0.4404]

    # Non-CPU devices, each skipped when its backend is unavailable.
    _DEVICES = [
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support')),
        pytest.param(
            'npu',
            marks=pytest.mark.skipif(
                not IS_NPU_AVAILABLE, reason='requires NPU support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
    ]

    def _check(self, device, expect_ious, expect_ious_aligned, **kwargs):
        """Run the clockwise and counter-clockwise checks on ``device``.

        Args:
            device (str): Device the box tensors are moved to.
            expect_ious (list): Expected pairwise result (3x3).
            expect_ious_aligned (list): Expected aligned result (3 values).
            **kwargs: Extra keyword arguments for ``box_iou_rotated``, e.g.
                ``mode='iof'``.
        """
        # Build fresh arrays on every call: the counter-clockwise part below
        # negates the angle column in place.
        boxes1 = torch.from_numpy(
            np.asarray(self._BOXES1, dtype=np.float32)).to(device)
        boxes2 = torch.from_numpy(
            np.asarray(self._BOXES2, dtype=np.float32)).to(device)
        np_expect_ious = np.asarray(expect_ious, dtype=np.float32)
        np_expect_ious_aligned = np.asarray(
            expect_ious_aligned, dtype=np.float32)

        # test cw angle definition
        ious = box_iou_rotated(boxes1, boxes2, **kwargs)
        assert np.allclose(ious.cpu().numpy(), np_expect_ious, atol=1e-4)

        ious = box_iou_rotated(boxes1, boxes2, aligned=True, **kwargs)
        assert np.allclose(
            ious.cpu().numpy(), np_expect_ious_aligned, atol=1e-4)

        # test ccw angle definition: negated angles with clockwise=False
        # must give the same overlaps.
        boxes1[..., -1] *= -1
        boxes2[..., -1] *= -1
        ious = box_iou_rotated(boxes1, boxes2, clockwise=False, **kwargs)
        assert np.allclose(ious.cpu().numpy(), np_expect_ious, atol=1e-4)

        ious = box_iou_rotated(
            boxes1, boxes2, aligned=True, clockwise=False, **kwargs)
        assert np.allclose(
            ious.cpu().numpy(), np_expect_ious_aligned, atol=1e-4)

    def test_box_iou_rotated_cpu(self):
        """Default-mode overlaps on CPU."""
        self._check('cpu', self._EXPECT_IOUS, self._EXPECT_IOUS_ALIGNED)

    @pytest.mark.parametrize('device', _DEVICES)
    def test_box_iou_rotated(self, device):
        """Default-mode overlaps on an accelerator device."""
        self._check(device, self._EXPECT_IOUS, self._EXPECT_IOUS_ALIGNED)

    def test_box_iou_rotated_iof_cpu(self):
        """``mode='iof'`` overlaps on CPU."""
        self._check(
            'cpu',
            self._EXPECT_IOFS,
            self._EXPECT_IOFS_ALIGNED,
            mode='iof')

    @pytest.mark.parametrize('device', _DEVICES)
    def test_box_iou_rotated_iof(self, device):
        """``mode='iof'`` overlaps on an accelerator device."""
        self._check(
            device,
            self._EXPECT_IOFS,
            self._EXPECT_IOFS_ALIGNED,
            mode='iof')


================================================
FILE: tests/test_ops/test_carafe.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch
from torch.autograd import gradcheck

from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE


class TestCarafe:
    """Gradient and reference-data checks for the CARAFE ops."""

    def test_carafe_naive_gradcheck(self):
        """Numerical gradient check of ``CARAFENaive``."""
        # Report a skip instead of a silent pass when no accelerator exists.
        if (not torch.cuda.is_available()) and (not IS_MUSA_AVAILABLE):
            pytest.skip('requires CUDA or MUSA support')
        from mmcv.ops import CARAFENaive

        # The double-precision gradcheck itself runs only on CUDA; on a
        # MUSA-only host this test just verifies that the op can be imported.
        if IS_CUDA_AVAILABLE:
            feat = torch.randn(
                2, 64, 3, 3, requires_grad=True, device='cuda').double()
            mask = torch.randn(
                2, 100, 6, 6, requires_grad=True,
                device='cuda').sigmoid().double()
            gradcheck(CARAFENaive(5, 4, 2), (feat, mask), atol=1e-4, eps=1e-4)

    def test_carafe_gradcheck(self):
        """Numerical gradient check of ``CARAFE``."""
        # Report a skip instead of a silent pass when no accelerator exists.
        if (not torch.cuda.is_available()) and (not IS_MUSA_AVAILABLE):
            pytest.skip('requires CUDA or MUSA support')
        from mmcv.ops import CARAFE

        # The double-precision gradcheck itself runs only on CUDA; on a
        # MUSA-only host this test just verifies that the op can be imported.
        if IS_CUDA_AVAILABLE:
            feat = torch.randn(
                2, 64, 3, 3, requires_grad=True, device='cuda').double()
            mask = torch.randn(
                2, 100, 6, 6, requires_grad=True,
                device='cuda').sigmoid().double()
            gradcheck(CARAFE(5, 4, 2), (feat, mask), atol=1e-4, eps=1e-4)

    @pytest.mark.parametrize('device', [
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
    ])
    def test_carafe_allclose(self, device):
        """Compare ``CARAFE`` output and gradients with stored references.

        The reference tensors are raw float32 dumps under
        ``tests/data/for_carafe``, read relative to the working directory.
        """
        try:
            from mmcv.ops import CARAFE
        except ModuleNotFoundError:
            pytest.skip('test requires compilation')

        np_feat = np.fromfile(
            'tests/data/for_carafe/carafe_feat.bin', dtype=np.float32)
        np_mask = np.fromfile(
            'tests/data/for_carafe/carafe_mask.bin', dtype=np.float32)
        np_output = np.fromfile(
            'tests/data/for_carafe/carafe_output.bin', dtype=np.float32)
        np_feat_grad = np.fromfile(
            'tests/data/for_carafe/carafe_feat_grad.bin', dtype=np.float32)
        np_mask_grad = np.fromfile(
            'tests/data/for_carafe/carafe_mask_grad.bin', dtype=np.float32)

        np_feat = np_feat.reshape((2, 64, 3, 3))
        np_mask = np_mask.reshape((2, 100, 6, 6))
        np_output = np_output.reshape((2, 64, 6, 6))
        np_feat_grad = np_feat_grad.reshape((2, 64, 3, 3))
        np_mask_grad = np_mask_grad.reshape((2, 100, 6, 6))

        # Create the tensors on the CPU, move them to the device, and only
        # then enable gradients, so the device tensors are the leaves whose
        # ``.grad`` is read below.
        feat_cpu = torch.FloatTensor(np_feat)
        mask_cpu = torch.FloatTensor(np_mask)
        feat = feat_cpu.to(device)
        mask = mask_cpu.to(device)
        feat.requires_grad = True
        mask.requires_grad = True

        carafe = CARAFE(5, 4, 2)
        carafe.to(device)
        carafe.train()
        output = carafe(feat, mask)

        output.backward(torch.ones_like(output))
        assert np.allclose(
            output.data.type(torch.float).cpu().numpy(), np_output, atol=1e-3)
        assert np.allclose(
            feat.grad.data.type(torch.float).cpu().numpy(),
            np_feat_grad,
            atol=1e-3)
        assert np.allclose(
            mask.grad.data.type(torch.float).cpu().numpy(),
            np_mask_grad,
            atol=1e-3)


================================================
FILE: tests/test_ops/test_cc_attention.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch
import torch.nn as nn

from mmcv.utils import IS_MUSA_AVAILABLE


class Loss(nn.Module):
    """Toy criterion returning the mean of ``input - target``."""

    def __init__(self):
        super().__init__()

    def forward(self, input, target):
        """Flatten both tensors and average their element-wise difference.

        Args:
            input (torch.Tensor): Predicted values.
            target (torch.Tensor): Reference values with the same number of
                elements as ``input``.

        Returns:
            torch.Tensor: Scalar tensor holding the mean difference.
        """
        flat_difference = input.view(-1) - target.view(-1)
        return flat_difference.mean()


class TestCrissCrossAttention:
    """Reference-data check for ``CrissCrossAttention``."""

    def test_cc_attention(self):
        """Run a forward/backward pass and compare with the stored output.

        MUSA takes priority when available, then CUDA, then CPU. The
        reference tensors are raw float32 dumps read relative to the working
        directory.
        """
        if IS_MUSA_AVAILABLE:
            device = torch.device('musa:0')
        elif torch.cuda.is_available():
            device = torch.device('cuda:0')
        else:
            device = torch.device('cpu')

        from mmcv.ops import CrissCrossAttention

        data_shape = (1, 32, 45, 45)
        np_input = np.fromfile(
            'tests/data/for_ccattention/ccattention_input.bin',
            dtype=np.float32).reshape(data_shape)
        np_expected = np.fromfile(
            'tests/data/for_ccattention/ccattention_output.bin',
            dtype=np.float32).reshape(data_shape)

        feats = torch.FloatTensor(np_input)
        expected = torch.FloatTensor(np_expected)
        feats.requires_grad = True
        shape = feats.shape

        # The module takes the channel count of the input.
        cca = CrissCrossAttention(shape[1])
        cca.to(device)
        feats = feats.to(device)
        label = torch.ones(data_shape).to(device)
        cca.train()

        result = cca(feats)
        # The backward pass is only exercised; its gradients are not checked.
        Loss()(result, label).backward()
        result = result.detach().cpu().numpy()

        assert np.allclose(result, expected.numpy())
        assert result.shape == shape


================================================
FILE: tests/test_ops/test_chamfer_distance.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch

from mmcv.ops import chamfer_distance
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE


def chamfer_distance_forward_groundtruth(xyz1, xyz2, dtype):
    """Brute-force reference for the forward pass of ``chamfer_distance``.

    Args:
        xyz1 (np.ndarray): Point set of shape (batch, n1, 2).
        xyz2 (np.ndarray): Point set of shape (batch, n2, 2).
        dtype (torch.dtype): Torch dtype under test; mapped to a numpy dtype
            with ``torch_to_np_type`` for the distance outputs.

    Returns:
        list: ``[dist1, dist2, idx1, idx2]``. ``dist1`` / ``idx1`` have shape
        (batch, n1) and hold, for each point of ``xyz1``, the squared distance
        to and the index of its nearest point in ``xyz2``. ``dist2`` /
        ``idx2`` have shape (batch, n2) and hold the same for each point of
        ``xyz2`` against ``xyz1``. Indices are int32.
    """
    np_dtype = torch_to_np_type(dtype)

    def nearest(src, dst):
        # For every point of ``src`` find the closest point of ``dst``.
        # Output sizes come from ``src`` itself, so the two point sets may
        # contain different numbers of points.
        bs, n_src, _ = src.shape
        n_dst = dst.shape[1]
        dist = np.zeros((bs, n_src)).astype(np_dtype)
        idx = np.zeros((bs, n_src)).astype('int32')
        for b in range(bs):
            for i in range(n_src):
                x1, y1 = src[b][i]
                # Large sentinel, replaced by the first smaller distance.
                dist[b][i] = 10000000
                for j in range(n_dst):
                    x2, y2 = dst[b][j]
                    sq = (x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2)
                    # Strict comparison keeps the first of equal minima.
                    if dist[b][i] > sq:
                        dist[b][i] = sq
                        idx[b][i] = j
        return dist, idx

    dist1, idx1 = nearest(xyz1, xyz2)
    dist2, idx2 = nearest(xyz2, xyz1)
    return [dist1, dist2, idx1, idx2]


def torch_to_np_type(dtype):
    """Map a torch floating dtype to its numpy counterpart.

    Args:
        dtype (torch.dtype): ``torch.half`` or ``torch.float32``.

    Returns:
        type | None: ``np.float16`` or ``np.float32``; ``None`` for any other
        dtype.
    """
    if dtype == torch.float32:
        return np.float32
    return np.float16 if dtype == torch.half else None


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'npu',
        marks=pytest.mark.skipif(
            not IS_NPU_AVAILABLE, reason='requires NPU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
])
@pytest.mark.parametrize('dtype', [torch.half, torch.float32])
@pytest.mark.parametrize('shape', [(2, 600, 2), (1, 1, 2), (7, 7, 2)])
def test_chamfer_distance_npu_dynamic_shape(dtype, device, shape):
    """Compare ``chamfer_distance`` with the brute-force reference.

    Two random point sets of shape ``(shape[0], shape[1], 2)`` are drawn
    uniformly from [-10, 10); all four outputs of the op must match the
    reference within rtol=1e-3 and atol=1e-4.
    """
    np_dtype = torch_to_np_type(dtype)
    point_shape = (shape[0], shape[1], 2)
    xyz1 = np.random.uniform(-10.0, 10.0, point_shape).astype(np_dtype)
    xyz2 = np.random.uniform(-10.0, 10.0, point_shape).astype(np_dtype)

    device_points = [
        torch.tensor(points, dtype=dtype).to(device)
        for points in (xyz1, xyz2)
    ]
    expected_output = chamfer_distance_forward_groundtruth(xyz1, xyz2, dtype)
    output = chamfer_distance(*device_points)

    # Outputs 0-3 are compared one by one with the reference list.
    for i in range(4):
        assert np.allclose(output[i].cpu().numpy(), expected_output[i], 1e-3,
                           1e-4)


================================================
FILE: tests/test_ops/test_contour_expand.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch


def test_contour_expand():
    """Check ``contour_expand`` on a 10x10 map with two labelled kernels.

    The same fixture is passed once as numpy arrays and once as torch
    tensors; both results must equal the expected label map.
    """
    from mmcv.ops import contour_expand

    # Rows 3-6 carry all the content; every other cell is zero.
    rows = slice(3, 7)

    # Initial labels: kernel 1 in columns 2-3, kernel 2 in column 8.
    np_internal_kernel_label = np.zeros((10, 10), dtype=np.int32)
    np_internal_kernel_label[rows, 2:4] = 1
    np_internal_kernel_label[rows, 8] = 2

    # First mask covers columns 2-8; second mask is the labelled area.
    wide_mask = np.zeros((10, 10), dtype=np.uint8)
    wide_mask[rows, 2:9] = 1
    label_mask = (np_internal_kernel_label > 0).astype(np.uint8)
    np_kernel_mask = np.stack([wide_mask, label_mask])

    min_area = 1
    kernel_region_num = 3

    # Expected map: label 1 in columns 2-5 and label 2 in columns 6-8.
    expected = np.zeros((10, 10), dtype=int)
    expected[rows, 2:6] = 1
    expected[rows, 6:9] = 2
    gt = expected.tolist()

    result = contour_expand(np_kernel_mask, np_internal_kernel_label, min_area,
                            kernel_region_num)
    assert np.allclose(result, gt)

    # Same inputs as torch tensors.
    result = contour_expand(
        torch.from_numpy(np_kernel_mask),
        torch.from_numpy(np_internal_kernel_label), min_area,
        kernel_region_num)
    assert np.allclose(result, gt)


================================================
FILE: tests/test_ops/test_conv_gradfix.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
import torch.nn as nn
from torch.autograd import gradcheck, gradgradcheck

from mmcv.ops import conv2d, conv_transpose2d
from mmcv.utils import IS_MUSA_AVAILABLE


class TestCond2d:
    """Shape and gradient checks for the ``conv2d`` wrapper."""

    @classmethod
    def setup_class(cls):
        """Create one random input and one 3x3 weight shared by the tests."""
        cls.input = torch.randn((1, 3, 32, 32), requires_grad=True)
        cls.weight = nn.Parameter(torch.randn(1, 3, 3, 3))

    @pytest.mark.skipif(not torch.cuda.is_available(), reason='requires cuda')
    def test_conv2d_cuda(self):
        """Output shape, gradcheck and gradgradcheck on CUDA."""
        # Positional arguments: input, weight, no bias, then 1 and 1.
        call_args = (self.input.cuda(), self.weight.cuda(), None, 1, 1)
        assert conv2d(*call_args).shape == (1, 1, 32, 32)
        for checker in (gradcheck, gradgradcheck):
            checker(conv2d, call_args, eps=1e-2, atol=0.1)

    @pytest.mark.skipif(not IS_MUSA_AVAILABLE, reason='requires musa')
    def test_conv2d_musa(self):
        """Output shape, gradcheck and gradgradcheck on MUSA."""
        # Positional arguments: input, weight, no bias, then 1 and 1.
        call_args = (self.input.musa(), self.weight.musa(), None, 1, 1)
        assert conv2d(*call_args).shape == (1, 1, 32, 32)
        for checker in (gradcheck, gradgradcheck):
            checker(conv2d, call_args, eps=1e-2, atol=0.1)


class TestCond2dTansposed:
    """Shape and gradient checks for ``conv_transpose2d`` from ``mmcv.ops``."""

    @classmethod
    def setup_class(cls):
        """Create one random input and one random weight shared by all tests."""
        cls.input = torch.randn((1, 3, 32, 32), requires_grad=True)
        cls.weight = nn.Parameter(torch.randn(3, 1, 3, 3))

    @staticmethod
    def _check_forward_and_grads(x, weight):
        """Run ``conv_transpose2d`` once, then verify its gradients.

        Args:
            x: Input tensor already placed on the device under test.
            weight: Transposed-convolution weight on the same device as ``x``.
        """
        # Positional arguments after the weight: bias=None, then 1 and 1.
        call_args = (x, weight, None, 1, 1)
        out = conv_transpose2d(*call_args)
        assert out.shape == (1, 1, 32, 32)
        gradcheck(conv_transpose2d, call_args, eps=1e-2, atol=1e-2)
        gradgradcheck(conv_transpose2d, call_args, eps=1e-2, atol=1e-2)

    @pytest.mark.skipif(not torch.cuda.is_available(), reason='requires cuda')
    def test_conv2d_transposed_cuda(self):
        """Exercise ``conv_transpose2d`` on CUDA tensors."""
        self._check_forward_and_grads(self.input.cuda(), self.weight.cuda())

    @pytest.mark.skipif(not IS_MUSA_AVAILABLE, reason='requires musa')
    def test_conv2d_transposed_musa(self):
        """Exercise ``conv_transpose2d`` on MUSA tensors."""
        self._check_forward_and_grads(self.input.musa(), self.weight.musa())


================================================
FILE: tests/test_ops/test_convex_iou.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch

from mmcv.ops import convex_giou, convex_iou
from mmcv.utils import IS_MUSA_AVAILABLE

# Two point sets, 18 values each (presumably nine (x, y) pairs per row --
# TODO confirm against the op's documentation).
np_pointsets = np.asarray([
    [
        1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0, 1.0,
        3.0, 3.0, 1.0, 2.0, 3.0, 3.0, 2.0, 1.5, 1.5,
    ],
    [
        1.5, 1.5, 2.5, 2.5, 1.5, 2.5, 2.5, 1.5, 1.5,
        3.5, 3.5, 1.5, 2.5, 3.5, 3.5, 2.5, 2.0, 2.0,
    ],
])

# Two polygons, 8 values each.
np_polygons = np.asarray([
    [1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 1.0],
    [1.0, 1.0, 1.0, 3.0, 3.0, 3.0, 3.0, 1.0],
])

# Reference value compared against ``convex_iou(pointsets, polygons)``.
np_expected_iou = np.asarray([
    [0.2857, 0.8750],
    [0.0588, 0.4286],
])

# Reference values compared against the two outputs of ``convex_giou``.
np_expected_giou = np.asarray([0.2857, 0.3831])

np_expected_grad = np.asarray([
    [
        0.0204, 0.0408, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0612, -0.0408, -0.0408, 0.0816,
        -0.0408, -0.0816, -0.0816, -0.0408, 0.0000, 0.0000,
    ],
    [
        -0.1848, -0.1848, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, -0.1076, -0.0801, -0.0801, -0.1076,
        -0.0367, -0.0734, -0.0734, -0.0367, 0.0000, 0.0000,
    ],
])


@pytest.mark.skipif(
    not torch.cuda.is_available(), reason='requires CUDA support')
def test_convex_iou():
    """Compare ``convex_iou`` on CUDA with the stored reference values."""
    pointsets, polygons, expected_iou = (
        torch.from_numpy(array).cuda().float()
        for array in (np_pointsets, np_polygons, np_expected_iou))
    ious = convex_iou(pointsets, polygons)
    assert torch.allclose(ious, expected_iou, atol=1e-3)


@pytest.mark.skipif(
    not torch.cuda.is_available(), reason='requires CUDA support')
def test_convex_giou():
    """Compare both outputs of ``convex_giou`` on CUDA with the references."""

    def as_cuda_float(array):
        # Move first, then cast, in the same order for every fixture.
        return torch.from_numpy(array).cuda().float()

    pointsets = as_cuda_float(np_pointsets)
    polygons = as_cuda_float(np_polygons)
    expected_giou = as_cuda_float(np_expected_giou)
    expected_grad = as_cuda_float(np_expected_grad)

    giou, grad = convex_giou(pointsets, polygons)
    for actual, expected in ((giou, expected_giou), (grad, expected_grad)):
        assert torch.allclose(actual, expected, atol=1e-3)


@pytest.mark.skipif(not IS_MUSA_AVAILABLE, reason='requires musa')
def test_convex_miou():
    """Compare ``convex_iou`` on MUSA with the stored reference values."""
    pointsets, polygons, expected_iou = (
        torch.from_numpy(array).musa().float()
        for array in (np_pointsets, np_polygons, np_expected_iou))
    ious = convex_iou(pointsets, polygons)
    assert torch.allclose(ious, expected_iou, atol=1e-3)


================================================
FILE: tests/test_ops/test_corner_pool.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
"""
CommandLine:
    pytest tests/test_ops/test_corner_pool.py
"""
import pytest
import torch

from mmcv.ops import CornerPool


def test_corner_pool_device_and_dtypes_cpu():
    """Check every ``CornerPool`` mode on small CPU integer tensors.

    CommandLine:
        xdoctest -m tests/test_ops/test_corner_pool.py \
            test_corner_pool_device_and_dtypes_cpu
    """
    # pool mode must in ['bottom', 'left', 'right', 'top']
    with pytest.raises(AssertionError):
        CornerPool('corner')

    # Input shared by the horizontal ('left' / 'right') modes.
    lr_tensor = torch.tensor([[[[0, 0, 0, 0, 0], [2, 1, 3, 0, 2],
                                [5, 4, 1, 1, 6], [0, 0, 0, 0, 0],
                                [0, 0, 0, 0, 0]]]])
    # Input shared by the vertical ('top' / 'bottom') modes.
    tb_tensor = torch.tensor([[[[0, 3, 1, 0, 0], [0, 1, 1, 0, 0],
                                [0, 3, 4, 0, 0], [0, 2, 2, 0, 0],
                                [0, 0, 2, 0, 0]]]])

    left_answer = torch.tensor([[[[0, 0, 0, 0, 0], [3, 3, 3, 2, 2],
                                  [6, 6, 6, 6, 6], [0, 0, 0, 0, 0],
                                  [0, 0, 0, 0, 0]]]])
    right_answer = torch.tensor([[[[0, 0, 0, 0, 0], [2, 2, 3, 3, 3],
                                   [5, 5, 5, 5, 6], [0, 0, 0, 0, 0],
                                   [0, 0, 0, 0, 0]]]])
    top_answer = torch.tensor([[[[0, 3, 4, 0, 0], [0, 3, 4, 0, 0],
                                 [0, 3, 4, 0, 0], [0, 2, 2, 0, 0],
                                 [0, 0, 2, 0, 0]]]])
    bottom_answer = torch.tensor([[[[0, 3, 1, 0, 0], [0, 3, 1, 0, 0],
                                    [0, 3, 4, 0, 0], [0, 3, 4, 0, 0],
                                    [0, 3, 4, 0, 0]]]])

    # (mode, input, expected output), evaluated in this order.
    cases = (
        ('left', lr_tensor, left_answer),
        ('right', lr_tensor, right_answer),
        ('top', tb_tensor, top_answer),
        ('bottom', tb_tensor, bottom_answer),
    )
    for mode, feature, expected in cases:
        pooled = CornerPool(mode)(feature)
        # The op must keep the tensor type of its input.
        assert pooled.type() == feature.type()
        assert torch.equal(pooled, expected)


================================================
FILE: tests/test_ops/test_correlation.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch

from mmcv.ops import Correlation
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE

# The two inputs passed to the Correlation layer.
_input1 = [[[
    [1., 2., 3.],
    [0., 1., 2.],
    [3., 5., 2.],
]]]
_input2 = [[[
    [1., 2., 3.],
    [3., 1., 2.],
    [8., 5., 2.],
]]]

# Reference output; each entry equals the product of the matching entries
# of ``_input1`` and ``_input2``.
gt_out_shape = (1, 1, 1, 3, 3)
_gt_out = [[[[
    [1., 4., 9.],
    [0., 1., 4.],
    [24., 25., 4.],
]]]]
gt_input1_grad = [[[
    [1., 2., 3.],
    [3., 1., 2.],
    [8., 5., 2.],
]]]


def assert_equal_tensor(tensor_a, tensor_b):
    """Assert that two tensors are element-wise equal.

    Args:
        tensor_a: First tensor.
        tensor_b: Second tensor, compared element by element with
            ``tensor_a``.

    Raises:
        AssertionError: If any pair of elements differs.
    """
    elementwise_match = torch.eq(tensor_a, tensor_b)
    assert torch.all(elementwise_match)


class TestCorrelation:
    """Forward and backward checks for ``Correlation`` on CUDA or MUSA."""

    def _test_correlation(self, dtype=torch.float):
        """Run the layer on the fixtures and compare output and gradients.

        Args:
            dtype: Torch dtype used for the inputs and the reference output.
        """
        layer = Correlation(max_displacement=0)

        # CUDA takes precedence over MUSA when both are reported available.
        if IS_CUDA_AVAILABLE:
            backend = 'cuda'
        elif IS_MUSA_AVAILABLE:
            backend = 'musa'

        input1 = getattr(torch.tensor(_input1, dtype=dtype), backend)()
        input2 = getattr(torch.tensor(_input2, dtype=dtype), backend)()
        for tensor in (input1, input2):
            tensor.requires_grad = True

        out = layer(input1, input2)
        out.backward(torch.ones_like(out))

        # `eq_cpu` is not implemented for 'Half' in torch1.5.0,
        # so the comparison is done on cuda/musa tensors
        # rather than on cpu tensors
        gt_out = getattr(torch.tensor(_gt_out, dtype=dtype), backend)()
        assert_equal_tensor(out, gt_out)
        # Each input's gradient is expected to equal the other input.
        assert_equal_tensor(input1.grad.detach(), input2)
        assert_equal_tensor(input2.grad.detach(), input1)

    @pytest.mark.skipif(
        (not torch.cuda.is_available()) and (not IS_MUSA_AVAILABLE),
        reason='requires CUDA/MUSA support')
    def test_correlation(self):
        """Run the check for float, double (CUDA only) and half."""
        dtypes = [torch.float]
        if IS_CUDA_AVAILABLE:
            dtypes.append(torch.double)
        dtypes.append(torch.half)
        for dtype in dtypes:
            self._test_correlation(dtype)


================================================
FILE: tests/test_ops/test_deform_conv.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch
from mmengine.utils import digit_version
from mmengine.utils.dl_utils import TORCH_VERSION

from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE

# Turn TF32 off on the MLU cnnl backend before any test runs.
if IS_MLU_AVAILABLE:
    torch.backends.cnnl.allow_tf32 = False

# Pick the autocast implementation that matches the available backend.
# If PyTorch version >= 1.6.0 and fp16 is enabled, autocast is imported and
# used, so the modules are tested against it. A failed import is tolerated
# and simply leaves ``autocast`` undefined.
try:
    if IS_MUSA_AVAILABLE:
        from torch_musa.core.amp import autocast
    else:
        from torch.cuda.amp import autocast
except ImportError:
    pass

# Single-sample input; the tests repeat it along axis 0 to build a batch.
# NOTE: the name shadows the builtin ``input`` but is kept because the tests
# in this module reference it.
input = [[[[1., 2., 3.], [0., 1., 2.], [3., 5., 2.]]]]
# Fixed parameters loaded into the model; the tests reshape them to
# (8, 1, 2, 2), (8,) and (1, 1, 2, 2) respectively.
offset_weight = [[[0.1, 0.4, 0.6, 0.1]], [[0.3, 0.2, 0.1, 0.3]],
                 [[0.5, 0.5, 0.2, 0.8]], [[0.8, 0.3, 0.9, 0.1]],
                 [[0.3, 0.1, 0.2, 0.5]], [[0.3, 0.7, 0.5, 0.3]],
                 [[0.6, 0.2, 0.5, 0.3]], [[0.4, 0.1, 0.8, 0.4]]]
offset_bias = [0.7, 0.1, 0.8, 0.5, 0.6, 0.5, 0.4, 0.7]
deform_weight = [[[0.4, 0.2, 0.1, 0.9]]]

# Per-sample reference output and gradients.
gt_out = [[[[1.650, 0.], [0.000, 0.]]]]
gt_x_grad = [[[[-0.666, 0.204, 0.000], [0.030, -0.416, 0.012],
               [0.000, 0.252, 0.129]]]]
gt_offset_weight_grad = [[[[1.44, 2.88], [0.00, 1.44]]],
                         [[[-0.72, -1.44], [0.00, -0.72]]],
                         [[[0.00, 0.00], [0.00, 0.00]]],
                         [[[0.00, 0.00], [0.00, 0.00]]],
                         [[[-0.10, -0.20], [0.00, -0.10]]],
                         [[[-0.08, -0.16], [0.00, -0.08]]],
                         [[[-0.54, -1.08], [0.00, -0.54]]],
                         [[[-0.54, -1.08], [0.00, -0.54]]]]
# A plain list of 8 values. A stray trailing comma previously turned this
# into a 1-tuple wrapping the list, which only compared correctly thanks to
# NumPy broadcasting.
gt_offset_bias_grad = [1.44, -0.72, 0., 0., -0.10, -0.08, -0.54, -0.54]
gt_deform_weight_grad = [[[[3.62, 0.], [0.40, 0.18]]]]


class TestDeformconv:
    """Forward/backward checks for ``DeformConv2dPack`` plus constructor
    argument checks for ``DeformConv2d``."""

    @staticmethod
    def _build_deform_conv_pack(device, im2col_step):
        """Create a 1->1 channel, 2x2 ``DeformConv2dPack`` with fixed weights.

        Args:
            device (str): 'cpu', 'cuda', 'mlu' or 'musa'. 'mlu' selects
                ``DeformConv2dPack_MLU``; any other value uses
                ``DeformConv2dPack``.
            im2col_step (int): Forwarded to the layer constructor.

        Returns:
            The layer with the module-level fixture weights loaded, moved to
            ``device`` when that is 'cuda', 'mlu' or 'musa'.
        """
        if device == 'mlu':
            from mmcv.ops import DeformConv2dPack_MLU as DeformConv2dPack
        else:
            from mmcv.ops import DeformConv2dPack
        model = DeformConv2dPack(
            in_channels=1,
            out_channels=1,
            kernel_size=2,
            stride=1,
            padding=0,
            im2col_step=im2col_step)
        model.conv_offset.weight.data = torch.nn.Parameter(
            torch.Tensor(offset_weight).reshape(8, 1, 2, 2))
        model.conv_offset.bias.data = torch.nn.Parameter(
            torch.Tensor(offset_bias).reshape(8))
        model.weight.data = torch.nn.Parameter(
            torch.Tensor(deform_weight).reshape(1, 1, 2, 2))
        if device == 'cuda':
            model.cuda()
        elif device == 'mlu':
            model.mlu()
        elif device == 'musa':
            model.musa()
        return model

    @staticmethod
    def _assert_matches_ground_truth(model, x, out, batch_size, threshold):
        """Compare the output and all gradients with the reference fixtures.

        Args:
            model: Layer whose parameter gradients are inspected.
            x: Input tensor; ``x.grad`` must already be populated.
            out: Output of ``model(x)``.
            batch_size (int): Number of times the single-sample fixtures were
                repeated to build ``x``.
            threshold (float): Relative tolerance given to ``np.allclose``.
        """
        repeated_gt_out = np.repeat(gt_out, batch_size, axis=0)
        repeated_gt_x_grad = np.repeat(gt_x_grad, batch_size, axis=0)
        assert np.allclose(
            out.data.detach().cpu().numpy(), repeated_gt_out, rtol=threshold)
        assert np.allclose(
            x.grad.detach().cpu().numpy(), repeated_gt_x_grad, rtol=threshold)
        # the batch size of the input is increased which results in
        # a larger gradient so we need to divide by the batch_size
        assert np.allclose(
            model.conv_offset.weight.grad.detach().cpu().numpy() / batch_size,
            gt_offset_weight_grad,
            rtol=threshold)
        assert np.allclose(
            model.conv_offset.bias.grad.detach().cpu().numpy() / batch_size,
            gt_offset_bias_grad,
            rtol=threshold)
        assert np.allclose(
            model.weight.grad.detach().cpu().numpy() / batch_size,
            gt_deform_weight_grad,
            rtol=threshold)

    @staticmethod
    def _assert_deform_conv2d_argument_checks():
        """Verify the constructor-time checks of ``DeformConv2d``."""
        from mmcv.ops import DeformConv2d

        # test bias
        model = DeformConv2d(1, 1, 2, stride=1, padding=0)
        assert not hasattr(model, 'bias')
        # test bias=True
        with pytest.raises(AssertionError):
            DeformConv2d(1, 1, 2, stride=1, padding=0, bias=True)
        # test in_channels % group != 0
        with pytest.raises(AssertionError):
            DeformConv2d(3, 2, 3, groups=2)
        # test out_channels % group != 0
        with pytest.raises(AssertionError):
            DeformConv2d(3, 4, 3, groups=3)

    def _test_deformconv(self,
                         dtype=torch.float,
                         threshold=1e-3,
                         device='cuda',
                         batch_size=10,
                         im2col_step=2):
        """Run forward/backward with an explicitly typed model and compare.

        Args:
            dtype: Dtype of both the input and the model.
            threshold (float): Relative tolerance for the comparisons.
            device (str): Device the test runs on.
            batch_size (int): Number of repetitions of the single-sample
                fixture. The argument is honoured here; it used to be
                overwritten with 10, which made the ``batch_size=1`` case
                of ``test_deformconv_float`` a no-op.
            im2col_step (int): Forwarded to the layer constructor.
        """
        if not torch.cuda.is_available() and device == 'cuda':
            pytest.skip('test requires GPU')
        repeated_input = np.repeat(input, batch_size, axis=0)
        x = torch.tensor(repeated_input, device=device, dtype=dtype)
        x.requires_grad = True
        model = self._build_deform_conv_pack(device, im2col_step)
        model.type(dtype)

        out = model(x)
        out.backward(torch.ones_like(out))

        self._assert_matches_ground_truth(model, x, out, batch_size,
                                          threshold)
        self._assert_deform_conv2d_argument_checks()

    def _test_amp_deformconv(self,
                             input_dtype,
                             threshold=1e-3,
                             device='cuda',
                             batch_size=10,
                             im2col_step=2):
        """The function to test amp released on pytorch 1.6.0.

        The type of input data might be torch.float or torch.half,
        so we should test deform_conv in both cases. With amp, the
        data type of model will NOT be set manually.

        Args:
            input_dtype: torch.float or torch.half.
            threshold: the same as above function.
            device (str): Device the test runs on.
            batch_size (int): Number of repetitions of the single-sample
                fixture.
            im2col_step (int): Forwarded to the layer constructor.
        """
        if not torch.cuda.is_available() and device == 'cuda':
            # Report a skip, as _test_deformconv does, rather than a pass.
            pytest.skip('test requires GPU')
        repeated_input = np.repeat(input, batch_size, axis=0)
        x = torch.Tensor(repeated_input).to(device).type(input_dtype)
        x.requires_grad = True
        model = self._build_deform_conv_pack(device, im2col_step)

        out = model(x)
        out.backward(torch.ones_like(out))

        self._assert_matches_ground_truth(model, x, out, batch_size,
                                          threshold)
        self._assert_deform_conv2d_argument_checks()

    @pytest.mark.parametrize(
        'device, threshold',
        [('cpu', 1e-1),
         pytest.param(
             'cuda',
             1e-3,
             marks=pytest.mark.skipif(
                 not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
         pytest.param(
             'mlu',
             1e-3,
             marks=pytest.mark.skipif(
                 not IS_MLU_AVAILABLE, reason='requires MLU support')),
         pytest.param(
             'musa',
             1e-3,
             marks=pytest.mark.skipif(
                 not IS_MUSA_AVAILABLE, reason='requires MUSA support'))])
    def test_deformconv_float(self, device, threshold):
        """Float32 checks, including the batch-size / im2col_step cases."""
        self._test_deformconv(torch.float, device=device, threshold=threshold)
        # test batch_size < im2col_step
        self._test_deformconv(
            torch.float, batch_size=1, im2col_step=2, device=device)
        # test batch_size % im2col_step != 0
        with pytest.raises(
                AssertionError,
                match='batch size must be divisible by im2col_step'):
            self._test_deformconv(
                torch.float, batch_size=10, im2col_step=3, device=device)

    @pytest.mark.parametrize('device', [
        'cpu',
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support')),
    ])
    def test_deformconv_double(self, device):
        """Float64 check with the default tolerance."""
        self._test_deformconv(torch.double, device=device)

    @pytest.mark.parametrize('device, threshold', [
        pytest.param(
            'cuda',
            1e-1,
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            1e-1,
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support')),
        pytest.param(
            'musa',
            1e-1,
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
    ])
    def test_deformconv_half(self, device, threshold):
        """Float16 check, followed by the autocast (amp) variants."""
        self._test_deformconv(torch.half, device=device, threshold=threshold)
        # test amp when torch version >= '1.6.0', the type of
        # input data for deformconv might be torch.float or torch.half
        if (TORCH_VERSION != 'parrots'
                and digit_version(TORCH_VERSION) >= digit_version('1.6.0')):
            with autocast(enabled=True):
                self._test_amp_deformconv(
                    torch.float, device=device, threshold=threshold)
                self._test_amp_deformconv(
                    torch.half, device=device, threshold=threshold)


================================================
FILE: tests/test_ops/test_deform_roi_pool.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import os

import numpy as np
import pytest
import torch

from mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,
                        IS_NPU_AVAILABLE)

# Prefer the parrots gradcheck when it is installed; otherwise fall back to
# the torch implementation and remember which one is in use.
try:
    from parrots.autograd import gradcheck
except ImportError:
    from torch.autograd import gradcheck
    _USING_PARROTS = False
else:
    _USING_PARROTS = True

cur_dir = os.path.dirname(os.path.abspath(__file__))

# Each case is a pair (feature map, rois). Every roi row holds five values.
inputs = [
    (
        [[[[1., 2.], [3., 4.]]]],
        [[0., 0., 0., 1., 1.]],
    ),
    (
        [[[[1., 2.], [3., 4.]], [[4., 3.], [2., 1.]]]],
        [[0., 0., 0., 1., 1.]],
    ),
    (
        [[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.],
           [11., 12., 15., 16.]]]],
        [[0., 0., 0., 3., 3.]],
    ),
]

# Each entry is a pair (expected output, expected input gradient) for the
# case at the same position in ``inputs``.
outputs = [
    (
        [[[[1, 1.25], [1.5, 1.75]]]],
        [[[[3.0625, 0.4375], [0.4375, 0.0625]]]],
    ),
    (
        [[[[1., 1.25], [1.5, 1.75]], [[4, 3.75], [3.5, 3.25]]]],
        [[[[3.0625, 0.4375], [0.4375, 0.0625]],
          [[3.0625, 0.4375], [0.4375, 0.0625]]]],
    ),
    (
        [[[[1.9375, 4.75], [7.5625, 10.375]]]],
        [[[[0.47265625, 0.4296875, 0.4296875, 0.04296875],
           [0.4296875, 0.390625, 0.390625, 0.0390625],
           [0.4296875, 0.390625, 0.390625, 0.0390625],
           [0.04296875, 0.0390625, 0.0390625, 0.00390625]]]],
    ),
]


class TestDeformRoIPool:
    """Gradient and value checks for the deformable RoI pooling layers."""

    @staticmethod
    def _gradcheck_on_cuda(pool_cls):
        """Run ``gradcheck`` for ``pool_cls`` on every input fixture.

        Args:
            pool_cls: Pooling layer class, built with a (2, 2) output size,
                ``spatial_scale=1.0`` and ``sampling_ratio=2``.
        """
        pool_h = 2
        pool_w = 2
        spatial_scale = 1.0
        sampling_ratio = 2

        for case in inputs:
            np_input = np.array(case[0])
            np_rois = np.array(case[1])

            x = torch.tensor(
                np_input, device='cuda', dtype=torch.float, requires_grad=True)
            rois = torch.tensor(np_rois, device='cuda', dtype=torch.float)
            output_c = x.size(1)

            droipool = pool_cls((pool_h, pool_w),
                                output_c,
                                spatial_scale=spatial_scale,
                                sampling_ratio=sampling_ratio).cuda()

            # parrots and torch expose different gradcheck signatures.
            if _USING_PARROTS:
                gradcheck(droipool, (x, rois), no_grads=[rois])
            else:
                gradcheck(droipool, (x, rois), eps=1e-2, atol=1e-2)

    def test_deform_roi_pool_gradcheck(self):
        """Gradient check for ``DeformRoIPoolPack`` (CUDA only)."""
        if not torch.cuda.is_available():
            # Skip explicitly so the run is not reported as a pass.
            pytest.skip('requires CUDA support')
        from mmcv.ops import DeformRoIPoolPack
        self._gradcheck_on_cuda(DeformRoIPoolPack)

    def test_modulated_deform_roi_pool_gradcheck(self):
        """Gradient check for ``ModulatedDeformRoIPoolPack`` (CUDA only)."""
        if not torch.cuda.is_available():
            # Skip explicitly so the run is not reported as a pass.
            pytest.skip('requires CUDA support')
        from mmcv.ops import ModulatedDeformRoIPoolPack
        self._gradcheck_on_cuda(ModulatedDeformRoIPoolPack)

    def _test_deform_roi_pool_allclose(self, device, dtype=torch.float):
        """Compare ``DeformRoIPoolPack`` output and input gradient with the
        reference fixtures.

        Args:
            device (str): Device the tensors and the layer are placed on.
            dtype: Accepted for the parametrized caller.
                NOTE(review): this argument is currently unused -- all
                tensors are created as ``torch.float``, so the double/half
                parametrizations repeat the float32 check. Confirm whether
                that is intended before wiring it through.
        """
        from mmcv.ops import DeformRoIPoolPack
        pool_h = 2
        pool_w = 2
        spatial_scale = 1.0
        sampling_ratio = 2

        for case, expected in zip(inputs, outputs):
            np_input = np.array(case[0])
            np_rois = np.array(case[1])
            np_output = np.array(expected[0])
            np_grad = np.array(expected[1])

            x = torch.tensor(
                np_input, device=device, dtype=torch.float, requires_grad=True)
            rois = torch.tensor(np_rois, device=device, dtype=torch.float)
            output_c = x.size(1)
            droipool = DeformRoIPoolPack(
                (pool_h, pool_w),
                output_c,
                spatial_scale=spatial_scale,
                sampling_ratio=sampling_ratio).to(device)

            pooled = droipool(x, rois)
            pooled.backward(torch.ones_like(pooled))
            # The third positional argument of np.allclose is rtol.
            assert np.allclose(pooled.data.cpu().numpy(), np_output, 1e-3)
            assert np.allclose(x.grad.data.cpu().numpy(), np_grad, 1e-3)

    @pytest.mark.parametrize('device', [
        pytest.param(
            'npu',
            marks=pytest.mark.skipif(
                not IS_NPU_AVAILABLE, reason='requires NPU support')),
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
    ])
    @pytest.mark.parametrize('dtype', [
        torch.float,
        pytest.param(
            torch.double,
            marks=pytest.mark.skipif(
                IS_MLU_AVAILABLE or IS_MUSA_AVAILABLE,
                reason='MLU, MUSA does not support for 64-bit floating point'),
        ), torch.half
    ])
    def test_deform_roi_pool_allclose(self, device, dtype):
        """Value check for every available device and listed dtype."""
        self._test_deform_roi_pool_allclose(device, dtype)


================================================
FILE: tests/test_ops/test_diff_iou_rotated.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch

from mmcv.ops import diff_iou_rotated_2d, diff_iou_rotated_3d
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE

# Turn TF32 off for MLU matmul before the tests run; presumably so results
# stay within the tight tolerances asserted below -- TODO confirm.
if IS_MLU_AVAILABLE:
    torch.backends.mlu.matmul.allow_tf32 = False


# TODO @MTAI there are some bugs for musa!
@pytest.mark.parametrize('device', [
    pytest.param(
        backend,
        marks=pytest.mark.skipif(
            not available, reason=f'requires {backend.upper()} support'))
    for backend, available in (
        ('cuda', IS_CUDA_AVAILABLE),
        ('mlu', IS_MLU_AVAILABLE),
        ('musa', IS_MUSA_AVAILABLE),
    )
])
def test_diff_iou_rotated_2d(device):
    """Compare ``diff_iou_rotated_2d`` with reference IoUs for five pairs.

    Args:
        device (str): Device the boxes are moved to before calling the op.
    """
    # The first operand is the same 5-value box repeated five times.
    reference_box = [0.5, 0.5, 1., 1., .0]
    np_boxes1 = np.asarray([[reference_box] * 5], dtype=np.float32)
    np_boxes2 = np.asarray([[
        [0.5, 0.5, 1., 1., .0],
        [0.5, 0.5, 1., 1., np.pi / 2],
        [0.5, 0.5, 1., 1., np.pi / 4],
        [1., 1., 1., 1., .0],
        [1.5, 1.5, 1., 1., .0],
    ]],
                           dtype=np.float32)

    boxes1, boxes2 = (
        torch.from_numpy(array).to(device)
        for array in (np_boxes1, np_boxes2))

    np_expect_ious = np.asarray([[1., 1., .7071, 1 / 7, .0]])
    ious = diff_iou_rotated_2d(boxes1, boxes2).cpu().numpy()
    assert np.allclose(ious, np_expect_ious, atol=1e-4)


@pytest.mark.parametrize('device', [
    pytest.param(
        backend,
        marks=pytest.mark.skipif(
            not available, reason=f'requires {backend.upper()} support'))
    for backend, available in (
        ('cuda', IS_CUDA_AVAILABLE),
        ('mlu', IS_MLU_AVAILABLE),
        ('musa', IS_MUSA_AVAILABLE),
    )
])
def test_diff_iou_rotated_3d(device):
    """Compare ``diff_iou_rotated_3d`` with reference IoUs for five pairs.

    Args:
        device (str): Device the boxes are moved to before calling the op.
    """
    # The first operand is the same 7-value box repeated five times.
    reference_box = [.5, .5, .5, 1., 1., 1., .0]
    np_boxes1 = np.asarray([[reference_box] * 5], dtype=np.float32)
    np_boxes2 = np.asarray([[
        [.5, .5, .5, 1., 1., 1., .0],
        [.5, .5, .5, 1., 1., 2., np.pi / 2],
        [.5, .5, .5, 1., 1., 1., np.pi / 4],
        [1., 1., 1., 1., 1., 1., .0],
        [-1.5, -1.5, -1.5, 2.5, 2.5, 2.5, .0],
    ]],
                           dtype=np.float32)

    boxes1, boxes2 = (
        torch.from_numpy(array).to(device)
        for array in (np_boxes1, np_boxes2))

    np_expect_ious = np.asarray([[1., .5, .7071, 1 / 15, .0]])
    ious = diff_iou_rotated_3d(boxes1, boxes2).cpu().numpy()
    assert np.allclose(ious, np_expect_ious, atol=1e-4)


================================================
FILE: tests/test_ops/test_filtered_lrelu.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from mmengine.utils import digit_version
from mmengine.utils.dl_utils.parrots_wrapper import is_rocm_pytorch

from mmcv.ops import filtered_lrelu
from mmcv.utils import IS_MUSA_AVAILABLE


class TestFilteredLrelu:
    """Shape and scaling checks for ``filtered_lrelu`` on CPU, CUDA and MUSA.

    The same sequence of assertions is run for every device; only the way the
    CPU fixtures are moved to the device differs.
    """

    @classmethod
    def setup_class(cls):
        # Shared CPU fixtures; the device tests move them before use.
        cls.input_tensor = torch.randn((1, 3, 16, 16), requires_grad=True)
        cls.bias = torch.randn(3, requires_grad=True)
        cls.filter_up = torch.randn((2, 2))
        cls.filter_down = torch.randn((2, 2))

    def _check_filtered_lrelu(self, to_device):
        """Run the shared ``filtered_lrelu`` assertions for one device.

        Args:
            to_device (callable): Maps a CPU tensor to the tensor that is
                passed to ``filtered_lrelu`` (identity for CPU,
                ``Tensor.cuda`` / ``Tensor.musa`` for accelerators).
        """
        x = to_device(self.input_tensor)
        bias = to_device(self.bias)
        filter_up = to_device(self.filter_up)
        filter_down = to_device(self.filter_down)
        base_shape = (1, 3, 16, 16)

        # default arguments
        out = filtered_lrelu(x, bias=bias)
        assert out.shape == base_shape

        # 2x up-sampling followed by 2x down-sampling
        out = filtered_lrelu(
            x,
            bias=bias,
            filter_up=filter_up,
            filter_down=filter_down,
            up=2,
            down=2,
            padding=1,
            clamp=0.5)
        assert out.shape == base_shape

        # test with different filter_up
        out = filtered_lrelu(
            x,
            bias=bias,
            filter_up=to_device(torch.randn((4, 4))),
            filter_down=filter_down,
            up=2,
            down=2,
            padding=2,
            clamp=0.5)
        assert out.shape == base_shape

        # test with different filter_down
        out = filtered_lrelu(
            x,
            bias=bias,
            filter_up=filter_up,
            filter_down=to_device(torch.randn((4, 4))),
            up=2,
            down=2,
            padding=2,
            clamp=0.5)
        assert out.shape == base_shape

        # test with different b (four channels instead of three)
        x4 = to_device(torch.randn((1, 4, 16, 16), requires_grad=True))
        bias4 = to_device(torch.randn(4, requires_grad=True))
        out = filtered_lrelu(
            x4,
            bias=bias4,
            filter_up=filter_up,
            filter_down=filter_down,
            up=2,
            down=2,
            padding=1,
            clamp=0.5)
        assert out.shape == (1, 4, 16, 16)

        # test with different up
        out = filtered_lrelu(
            x,
            bias=bias,
            filter_up=filter_up,
            filter_down=filter_down,
            up=4,
            down=2,
            padding=1,
            clamp=0.5)
        assert out.shape == (1, 3, 32, 32)

        # test with different down
        out = filtered_lrelu(
            x,
            bias=bias,
            filter_up=filter_up,
            filter_down=filter_down,
            up=2,
            down=4,
            padding=1,
            clamp=0.5)
        assert out.shape == (1, 3, 8, 8)

        # test with different gain: doubling the gain doubles the output
        out1 = filtered_lrelu(x, bias=bias, gain=0.2)
        out2 = filtered_lrelu(x, bias=bias, gain=0.1)
        assert torch.allclose(out1, 2 * out2)

        # test with different slope
        out = filtered_lrelu(x, bias=bias, slope=0.2)
        assert out.shape == base_shape

        # test with different clamp: the output never exceeds the clamp value
        out1 = filtered_lrelu(x, bias=bias, clamp=0.2)
        out2 = filtered_lrelu(x, bias=bias, clamp=0.1)
        assert out1.max() <= 0.2
        assert out2.max() <= 0.1

        # test with different flip_filter; check the result of *this* call
        # rather than the stale output of the slope test above
        out = filtered_lrelu(x, bias=bias, flip_filter=True)
        assert out.shape == base_shape

    def test_filtered_lrelu_cpu(self):
        self._check_filtered_lrelu(lambda tensor: tensor)

    @pytest.mark.skipif(
        not torch.cuda.is_available() or is_rocm_pytorch()
        or digit_version(torch.version.cuda) < digit_version('10.2'),
        reason='requires cuda>=10.2')
    def test_filtered_lrelu_cuda(self):
        self._check_filtered_lrelu(lambda tensor: tensor.cuda())

    @pytest.mark.skipif(not IS_MUSA_AVAILABLE, reason='requires musa')
    def test_filtered_lrelu_musa(self):
        self._check_filtered_lrelu(lambda tensor: tensor.musa())


================================================
FILE: tests/test_ops/test_focal_loss.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch

from mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,
                        IS_NPU_AVAILABLE)

# Use the parrots ``gradcheck`` when parrots can be imported, otherwise fall
# back to torch's. ``_USING_PARROTS`` records which implementation is bound.
_USING_PARROTS = True
try:
    from parrots.autograd import gradcheck
except ImportError:
    from torch.autograd import gradcheck
    _USING_PARROTS = False

# torch.set_printoptions(precision=8, threshold=100)

# Test cases as (x, y) pairs: the first element is converted to the float
# loss input ``x`` and the second to the integer targets ``y``.
inputs = [
    ([[1., 0], [0, 1.]], [0, 1]),
    ([[1., 0, -1.], [0, 1., 2.]], [2, 1]),
    ([[1e-6, 2e-6, 3e-6], [4e-6, 5e-5, 6e-4], [7e-3, 8e-2, 9e-1]], [1, 2, 0]),
]

# Reference values for ``softmax_focal_loss``, one (loss, grad of x) tuple per
# entry of ``inputs``; the tests use alpha=0.25, gamma=2.0, 'mean' reduction.
softmax_outputs = [(0.00566451, [[-0.00657264, 0.00657264],
                                 [0.00657264, -0.00657264]]),
                   (0.34956908, [[0.10165970, 0.03739851, -0.13905823],
                                 [0.01227554, -0.10298023, 0.09070466]]),
                   (0.15754992, [[0.02590877, -0.05181759, 0.02590882],
                                 [0.02589641, 0.02589760, -0.05179400],
                                 [-0.07307514, 0.02234372, 0.05073142]])]

# Reference values for ``sigmoid_focal_loss``, laid out like
# ``softmax_outputs`` and checked with the same alpha/gamma/reduction.
sigmoid_outputs = [(0.13562961, [[-0.00657264, 0.11185755],
                                 [0.11185755, -0.00657264]]),
                   (1.10251057, [[0.28808805, 0.11185755, -0.09602935],
                                 [0.11185755, -0.00657264, 0.40376765]]),
                   (0.42287254, [[0.07457182, -0.02485716, 0.07457201],
                                 [0.07457211, 0.07457669, -0.02483728],
                                 [-0.02462499, 0.08277918, 0.18050370]])]


class Testfocalloss:
    """Value and gradient checks for the softmax / sigmoid focal losses.

    The softmax checks and both gradient checks run on CUDA only; the sigmoid
    value checks are parametrized over the available accelerators. CUDA-only
    checks are reported as skipped (not passed) when CUDA is unavailable.
    """

    def _test_softmax(self, dtype=torch.float):
        """Compare ``softmax_focal_loss`` and its gradient with references.

        Args:
            dtype (torch.dtype): Floating point type of the loss input.
        """
        if not torch.cuda.is_available():
            # Skip explicitly so a CPU-only run does not report a false pass.
            pytest.skip('requires CUDA support')
        from mmcv.ops import softmax_focal_loss
        alpha = 0.25
        gamma = 2.0
        for case, output in zip(inputs, softmax_outputs):
            np_x = np.array(case[0])
            np_y = np.array(case[1])
            np_x_grad = np.array(output[1])

            x = torch.from_numpy(np_x).cuda().type(dtype)
            x.requires_grad_()
            y = torch.from_numpy(np_y).cuda().long()

            loss = softmax_focal_loss(x, y, gamma, alpha, None, 'mean')
            loss.backward()

            assert np.allclose(loss.data.cpu().numpy(), output[0], rtol=1e-2)
            assert np.allclose(x.grad.data.cpu(), np_x_grad, rtol=1e-2)

    def _test_sigmoid(self, device, dtype=torch.float):
        """Compare ``sigmoid_focal_loss`` and its gradient with references.

        Args:
            device (str): Device the tensors are moved to.
            dtype (torch.dtype): Floating point type of the loss input.
        """
        from mmcv.ops import sigmoid_focal_loss
        alpha = 0.25
        gamma = 2.0
        for case, output in zip(inputs, sigmoid_outputs):
            np_x = np.array(case[0])
            np_y = np.array(case[1])
            np_x_grad = np.array(output[1])

            x = torch.from_numpy(np_x).to(device).type(dtype)
            x.requires_grad_()
            y = torch.from_numpy(np_y).to(device).long()

            loss = sigmoid_focal_loss(x, y, gamma, alpha, None, 'mean')
            loss.backward()

            assert np.allclose(loss.data.cpu().numpy(), output[0], rtol=1e-2)
            assert np.allclose(x.grad.data.cpu(), np_x_grad, rtol=1e-2)

    def _test_grad_softmax(self, dtype=torch.float):
        """Run ``gradcheck`` on ``SoftmaxFocalLoss`` for every test case.

        Args:
            dtype (torch.dtype): Floating point type of the loss input.
        """
        if not torch.cuda.is_available():
            pytest.skip('requires CUDA support')
        from mmcv.ops import SoftmaxFocalLoss
        alpha = 0.25
        gamma = 2.0
        for case in inputs:
            np_x = np.array(case[0])
            np_y = np.array(case[1])

            x = torch.from_numpy(np_x).cuda().type(dtype)
            x.requires_grad_()
            y = torch.from_numpy(np_y).cuda().long()

            floss = SoftmaxFocalLoss(gamma, alpha)
            # The numerical gradient check is only run with torch; under
            # parrots the module is merely constructed.
            if not _USING_PARROTS:
                gradcheck(floss, (x, y), eps=1e-2, atol=1e-2)

    def _test_grad_sigmoid(self, dtype=torch.float):
        """Run ``gradcheck`` on ``SigmoidFocalLoss`` for every test case.

        Args:
            dtype (torch.dtype): Floating point type of the loss input.
        """
        if not torch.cuda.is_available():
            pytest.skip('requires CUDA support')
        from mmcv.ops import SigmoidFocalLoss
        alpha = 0.25
        gamma = 2.0
        for case in inputs:
            np_x = np.array(case[0])
            np_y = np.array(case[1])

            x = torch.from_numpy(np_x).cuda().type(dtype)
            x.requires_grad_()
            y = torch.from_numpy(np_y).cuda().long()

            floss = SigmoidFocalLoss(gamma, alpha)
            # The numerical gradient check is only run with torch; under
            # parrots the module is merely constructed.
            if not _USING_PARROTS:
                gradcheck(floss, (x, y), eps=1e-2, atol=1e-2)

    def test_softmax_float(self):
        self._test_softmax(dtype=torch.float)

    def test_softmax_half(self):
        self._test_softmax(dtype=torch.half)

    @pytest.mark.parametrize('device', [
        pytest.param(
            'npu',
            marks=pytest.mark.skipif(
                not IS_NPU_AVAILABLE, reason='requires NPU support')),
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
    ])
    def test_sigmoid_float(self, device):
        self._test_sigmoid(device=device, dtype=torch.float)

    @pytest.mark.parametrize('device', [
        pytest.param(
            'npu',
            marks=pytest.mark.skipif(
                not IS_NPU_AVAILABLE, reason='requires NPU support')),
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
    ])
    def test_sigmoid_half(self, device):
        self._test_sigmoid(device=device, dtype=torch.half)

    def test_grad_softmax_float(self):
        self._test_grad_softmax(dtype=torch.float)

    def test_grad_sigmoid_float(self):
        self._test_grad_sigmoid(dtype=torch.float)


================================================
FILE: tests/test_ops/test_furthest_point_sample.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch

from mmcv.ops import furthest_point_sample, furthest_point_sample_with_dist
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'npu',
        marks=pytest.mark.skipif(
            not IS_NPU_AVAILABLE, reason='requires NPU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
])
def test_fps(device):
    """Sample 3 of 5 points per batch and compare with the known indices."""
    first_cloud = [
        [-0.2748, 1.0020, -1.1674],
        [0.1015, 1.3952, -1.2681],
        [-0.8070, 2.4137, -0.5845],
        [-1.0001, 2.1982, -0.5859],
        [0.3841, 1.8983, -0.7431],
    ]
    second_cloud = [
        [-1.0696, 3.0758, -0.1899],
        [-0.2559, 3.5521, -0.1402],
        [0.8164, 4.0081, -0.1839],
        [-1.1000, 3.0213, -0.8205],
        [-0.0518, 3.7251, -0.3950],
    ]
    points = torch.tensor([first_cloud, second_cloud]).to(device)

    sampled = furthest_point_sample(points, 3)
    wanted = torch.tensor([[0, 2, 4], [0, 2, 1]]).to(device)
    assert torch.all(sampled == wanted)


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'npu',
        marks=pytest.mark.skipif(
            not IS_NPU_AVAILABLE, reason='requires NPU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
])
def test_fps_with_dist(device):
    """Check ``furthest_point_sample_with_dist`` on two inputs.

    First a pairwise squared-distance matrix built from a small hand-written
    point set is sampled, then a distance matrix stored on disk is sampled
    and compared with the stored indices.
    """
    from pathlib import Path

    import numpy as np

    xyz = torch.tensor([[[-0.2748, 1.0020, -1.1674], [0.1015, 1.3952, -1.2681],
                         [-0.8070, 2.4137,
                          -0.5845], [-1.0001, 2.1982, -0.5859],
                         [0.3841, 1.8983, -0.7431]],
                        [[-1.0696, 3.0758,
                          -0.1899], [-0.2559, 3.5521, -0.1402],
                         [0.8164, 4.0081, -0.1839], [-1.1000, 3.0213, -0.8205],
                         [-0.0518, 3.7251, -0.3950]]]).to(device)

    expected_idx = torch.tensor([[0, 2, 4], [0, 2, 1]]).to(device)
    # Pairwise squared distances between all points of each batch element.
    xyz_square_dist = ((xyz.unsqueeze(dim=1) -
                        xyz.unsqueeze(dim=2))**2).sum(-1)
    idx = furthest_point_sample_with_dist(xyz_square_dist, 3)
    assert torch.all(idx == expected_idx)

    # Resolve the fixtures relative to this file (tests/test_ops/...) so the
    # test does not depend on pytest being started from the repository root.
    data_dir = Path(__file__).resolve().parent.parent / 'data' / 'for_3d_ops'
    fps_idx = np.load(str(data_dir / 'fps_idx.npy'))
    features_for_fps_distance = np.load(
        str(data_dir / 'features_for_fps_distance.npy'))
    expected_idx = torch.from_numpy(fps_idx).to(device)
    features_for_fps_distance = torch.from_numpy(features_for_fps_distance).to(
        device)

    idx = furthest_point_sample_with_dist(features_for_fps_distance, 16)
    assert torch.all(idx == expected_idx)


================================================
FILE: tests/test_ops/test_fused_bias_leakyrelu.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch

from mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE

# Use the parrots ``gradcheck`` when parrots can be imported, otherwise fall
# back to torch. NOTE: ``gradgradcheck`` is only bound on the torch path.
_USING_PARROTS = True
try:
    from parrots.autograd import gradcheck
except ImportError:
    from torch.autograd import gradcheck, gradgradcheck
    _USING_PARROTS = False


class TestFusedBiasLeakyReLU:
    """First and second order gradient checks for ``FusedBiasLeakyReLU``."""

    @classmethod
    def setup_class(cls):
        # The fixtures are placed on the first available accelerator (CUDA,
        # then NPU, then MUSA). When none is available nothing is created;
        # every parametrized test below is skipped in that case.
        if IS_CUDA_AVAILABLE:
            cls.input_tensor = torch.randn((2, 2, 2, 2),
                                           requires_grad=True).cuda()
            cls.bias = torch.zeros(2, requires_grad=True).cuda()
        elif IS_NPU_AVAILABLE:
            cls.input_tensor = torch.randn((2, 2, 2, 2),
                                           requires_grad=True).npu()
            cls.bias = torch.zeros(2, requires_grad=True).npu()
        elif IS_MUSA_AVAILABLE:
            cls.input_tensor = torch.randn((2, 2, 2, 2),
                                           requires_grad=True).musa()
            cls.bias = torch.zeros(2, requires_grad=True).musa()

    @pytest.mark.parametrize('device', [
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'npu',
            marks=pytest.mark.skipif(
                not IS_NPU_AVAILABLE, reason='requires NPU support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
    ])
    def test_gradient(self, device):
        """Numerically check the first order gradient of the module."""
        from mmcv.ops import FusedBiasLeakyReLU
        if _USING_PARROTS:
            # The parrots gradcheck takes ``delta`` / ``pt_atol`` and is only
            # exercised on CUDA.
            if IS_CUDA_AVAILABLE:
                gradcheck(
                    FusedBiasLeakyReLU(2).cuda(),
                    self.input_tensor,
                    delta=1e-4,
                    pt_atol=1e-3)
        else:
            gradcheck(
                FusedBiasLeakyReLU(2).to(device),
                self.input_tensor,
                eps=1e-4,
                atol=1e-3)

    @pytest.mark.parametrize('device', [
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'npu',
            marks=pytest.mark.skipif(
                not IS_NPU_AVAILABLE, reason='requires NPU support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
    ])
    def test_gradgradient(self, device):
        """Numerically check the second order gradient of the module."""
        if _USING_PARROTS:
            # ``gradgradcheck`` is only imported from torch; without this
            # guard the call below would fail with a NameError under parrots.
            pytest.skip('gradgradcheck is not available with parrots')

        from mmcv.ops import FusedBiasLeakyReLU
        gradgradcheck(
            FusedBiasLeakyReLU(2).to(device),
            self.input_tensor,
            eps=1e-4,
            atol=1e-3)


================================================
FILE: tests/test_ops/test_gather_points.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch

from mmcv.ops import gather_points
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE


class TestGatherPoints:
    """Value checks for ``gather_points`` in fp32 and fp16."""

    @pytest.mark.parametrize('device', [
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'npu',
            marks=pytest.mark.skipif(
                not IS_NPU_AVAILABLE, reason='requires NPU support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
    ])
    def test_gather_points_all_close(self, device):
        """Gather six indexed columns per batch and compare with references."""
        first_batch = [
            [-1.6095, -0.1029, -0.8876, -1.2447, -2.4031,
             0.3708, -1.1586, -1.4967, -0.4800, 0.2252],
            [1.9138, 3.4979, 1.6854, 1.5631, 3.6776,
             3.1154, 2.1705, 2.5221, 2.0411, 3.1446],
            [-1.4173, 0.3073, -1.4339, -1.4340, -1.2770,
             -0.2867, -1.4162, -1.4044, -1.4245, -1.4074],
        ]
        second_batch = [
            [0.2160, 0.0842, 0.3661, -0.2749, -0.4909,
             -0.6066, -0.8773, -0.0745, -0.9496, 0.1434],
            [1.3644, 1.8087, 1.6855, 1.9563, 1.2746,
             1.9662, 0.9566, 1.8778, 1.1437, 1.3639],
            [-0.7172, 0.1692, 0.2241, 0.0721, -0.7540,
             0.0462, -0.6227, 0.3223, -0.6944, -0.5294],
        ]
        source = torch.tensor([first_batch, second_batch],
                              dtype=torch.float,
                              device=device)
        indices = torch.tensor([[0, 1, 4, 0, 0, 0], [0, 5, 6, 0, 0, 0]],
                               dtype=torch.int32,
                               device=device)
        reference = torch.tensor(
            [
                [
                    [-1.6095, -0.1029, -2.4031, -1.6095, -1.6095, -1.6095],
                    [1.9138, 3.4979, 3.6776, 1.9138, 1.9138, 1.9138],
                    [-1.4173, 0.3073, -1.2770, -1.4173, -1.4173, -1.4173],
                ],
                [
                    [0.2160, -0.6066, -0.8773, 0.2160, 0.2160, 0.2160],
                    [1.3644, 1.9662, 0.9566, 1.3644, 1.3644, 1.3644],
                    [-0.7172, 0.0462, -0.6227, -0.7172, -0.7172, -0.7172],
                ],
            ],
            dtype=torch.float,
            device=device)

        gathered = gather_points(source, indices)
        assert torch.allclose(gathered, reference)

        # test fp16
        gathered_half = gather_points(source.half(), indices)
        assert torch.allclose(gathered_half, reference.half())


================================================
FILE: tests/test_ops/test_group_points.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch

from mmcv.ops import grouping_operation
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'npu',
        marks=pytest.mark.skipif(
            not IS_NPU_AVAILABLE, reason='requires NPU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
])
@pytest.mark.parametrize('dtype', [
    torch.half,
    torch.float,
    pytest.param(
        torch.double,
        marks=[
            pytest.mark.skipif(
                IS_MUSA_AVAILABLE,
                reason='MUSA does not support for 64-bit floating point')
        ]),
])
def test_grouping_points(dtype, device):
    """Check forward values and input gradients of ``grouping_operation``.

    The output itself is used as the upstream gradient for the backward
    pass, and the accumulated gradient of the features is compared with
    stored reference values.
    """
    # Every group repeats a single index three times.
    group_ids = [(0, 3, 8, 0, 0, 0), (0, 6, 9, 0, 0, 0)]
    idx = torch.tensor([[[i] * 3 for i in ids]
                        for ids in group_ids]).int().to(device)

    features = torch.tensor(
        [
            [
                [0.5798, -0.7981, -0.9280, -1.3311, 1.3687,
                 0.9277, -0.4164, -1.8274, 0.9268, 0.8414],
                [5.4247, 1.5113, 2.3944, 1.4740, 5.0300,
                 5.1030, 1.9360, 2.1939, 2.1581, 3.4666],
                [-1.6266, -1.0281, -1.0393, -1.6931, -1.3982,
                 -0.5732, -1.0830, -1.7561, -1.6786, -1.6967],
            ],
            [
                [-0.0380, -0.1880, -1.5724, 0.6905, -0.3190,
                 0.7798, -0.3693, -0.9457, -0.2942, -1.8527],
                [1.1773, 1.5009, 2.6399, 5.9242, 1.0962,
                 2.7346, 6.0865, 1.5555, 4.3303, 2.8229],
                [-0.6646, -0.6870, -0.1125, -0.2224, -0.3445,
                 -1.4049, 0.4990, -0.7037, -0.9924, 0.0386],
            ],
        ],
        dtype=dtype).to(device)
    features.requires_grad_()

    grouped = grouping_operation(features, idx)
    grouped.backward(grouped)
    feature_grad = features.grad

    # One value per (batch, channel, group); each is repeated three times
    # along the last axis of the expected output.
    picked = [
        [
            (0.5798, -1.3311, 0.9268, 0.5798, 0.5798, 0.5798),
            (5.4247, 1.4740, 2.1581, 5.4247, 5.4247, 5.4247),
            (-1.6266, -1.6931, -1.6786, -1.6266, -1.6266, -1.6266),
        ],
        [
            (-0.0380, -0.3693, -1.8527, -0.0380, -0.0380, -0.0380),
            (1.1773, 6.0865, 2.8229, 1.1773, 1.1773, 1.1773),
            (-0.6646, 0.4990, 0.0386, -0.6646, -0.6646, -0.6646),
        ],
    ]
    wanted_output = torch.tensor(
        [[[[value] * 3 for value in channel] for channel in batch]
         for batch in picked],
        dtype=dtype).to(device)
    wanted_grad = torch.tensor(
        [
            [
                [6.9576, 0.0000, 0.0000, -3.9933, 0.0000,
                 0.0000, 0.0000, 0.0000, 2.7804, 0.0000],
                [65.0964, 0.0000, 0.0000, 4.4220, 0.0000,
                 0.0000, 0.0000, 0.0000, 6.4743, 0.0000],
                [-19.5192, 0.0000, 0.0000, -5.0793, 0.0000,
                 0.0000, 0.0000, 0.0000, -5.0358, 0.0000],
            ],
            [
                [-0.4560, 0.0000, 0.0000, 0.0000, 0.0000,
                 0.0000, -1.1079, 0.0000, 0.0000, -5.5581],
                [14.1276, 0.0000, 0.0000, 0.0000, 0.0000,
                 0.0000, 18.2595, 0.0000, 0.0000, 8.4687],
                [-7.9752, 0.0000, 0.0000, 0.0000, 0.0000,
                 0.0000, 1.4970, 0.0000, 0.0000, 0.1158],
            ],
        ],
        dtype=dtype).to(device)

    assert torch.allclose(grouped, wanted_output)
    assert torch.allclose(feature_grad, wanted_grad)


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'npu',
        marks=pytest.mark.skipif(
            not IS_NPU_AVAILABLE, reason='requires NPU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
])
@pytest.mark.parametrize('dtype', [
    torch.half,
    torch.float,
    pytest.param(
        torch.double,
        marks=pytest.mark.skipif(
            IS_MUSA_AVAILABLE,
            reason='MUSA does not support for 64-bit floating point')),
])
def test_stack_grouping_points(dtype, device):
    """Check ``grouping_operation`` in its stacked-batch calling form.

    ``features`` holds 6 rows of 10 values split into two batches of 3 rows
    (``features_batch_cnt``); ``idx`` holds 12 index triples split into two
    batches of 6 (``indices_batch_cnt``). The reference output gathers the
    indexed row of the matching batch for every index and, as the literal
    below shows, is all zeros wherever the index is not below the number of
    rows in that batch.

    Args:
        dtype: Floating point dtype of the feature tensor.
        device: Device string the tensors are moved to.
    """
    if device == 'npu' and dtype == torch.double:
        # Report the combination as skipped instead of silently passing.
        pytest.skip('double precision is not tested on NPU')
    idx = torch.tensor([[0, 0, 0], [3, 3, 3], [8, 8, 8], [1, 1, 1], [0, 0, 0],
                        [2, 2, 2], [0, 0, 0], [6, 6, 6], [9, 9, 9], [0, 0, 0],
                        [1, 1, 1], [0, 0, 0]]).int().to(device)
    features = torch.tensor(
        [[
            0.5798, -0.7981, -0.9280, -1.3311, 1.3687, 0.9277, -0.4164,
            -1.8274, 0.9268, 0.8414
        ],
         [
             5.4247, 1.5113, 2.3944, 1.4740, 5.0300, 5.1030, 1.9360, 2.1939,
             2.1581, 3.4666
         ],
         [
             -1.6266, -1.0281, -1.0393, -1.6931, -1.3982, -0.5732, -1.0830,
             -1.7561, -1.6786, -1.6967
         ],
         [
             -0.0380, -0.1880, -1.5724, 0.6905, -0.3190, 0.7798, -0.3693,
             -0.9457, -0.2942, -1.8527
         ],
         [
             1.1773, 1.5009, 2.6399, 5.9242, 1.0962, 2.7346, 6.0865, 1.5555,
             4.3303, 2.8229
         ],
         [
             -0.6646, -0.6870, -0.1125, -0.2224, -0.3445, -1.4049, 0.4990,
             -0.7037, -0.9924, 0.0386
         ]],
        dtype=dtype).to(device)
    # Number of feature rows / index rows belonging to each of the 2 batches.
    features_batch_cnt = torch.tensor([3, 3]).int().to(device)
    indices_batch_cnt = torch.tensor([6, 6]).int().to(device)
    output = grouping_operation(features, idx, features_batch_cnt,
                                indices_batch_cnt)
    expected_output = torch.tensor(
        [[[0.5798, 0.5798, 0.5798], [-0.7981, -0.7981, -0.7981],
          [-0.9280, -0.9280, -0.9280], [-1.3311, -1.3311, -1.3311],
          [1.3687, 1.3687, 1.3687], [0.9277, 0.9277, 0.9277],
          [-0.4164, -0.4164, -0.4164], [-1.8274, -1.8274, -1.8274],
          [0.9268, 0.9268, 0.9268], [0.8414, 0.8414, 0.8414]],
         [[0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000]],
         [[0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000]],
         [[5.4247, 5.4247, 5.4247], [1.5113, 1.5113, 1.5113],
          [2.3944, 2.3944, 2.3944], [1.4740, 1.4740, 1.4740],
          [5.0300, 5.0300, 5.0300], [5.1030, 5.1030, 5.1030],
          [1.9360, 1.9360, 1.9360], [2.1939, 2.1939, 2.1939],
          [2.1581, 2.1581, 2.1581], [3.4666, 3.4666, 3.4666]],
         [[0.5798, 0.5798, 0.5798], [-0.7981, -0.7981, -0.7981],
          [-0.9280, -0.9280, -0.9280], [-1.3311, -1.3311, -1.3311],
          [1.3687, 1.3687, 1.3687], [0.9277, 0.9277, 0.9277],
          [-0.4164, -0.4164, -0.4164], [-1.8274, -1.8274, -1.8274],
          [0.9268, 0.9268, 0.9268], [0.8414, 0.8414, 0.8414]],
         [[-1.6266, -1.6266, -1.6266], [-1.0281, -1.0281, -1.0281],
          [-1.0393, -1.0393, -1.0393], [-1.6931, -1.6931, -1.6931],
          [-1.3982, -1.3982, -1.3982], [-0.5732, -0.5732, -0.5732],
          [-1.0830, -1.0830, -1.0830], [-1.7561, -1.7561, -1.7561],
          [-1.6786, -1.6786, -1.6786], [-1.6967, -1.6967, -1.6967]],
         [[-0.0380, -0.0380, -0.0380], [-0.1880, -0.1880, -0.1880],
          [-1.5724, -1.5724, -1.5724], [0.6905, 0.6905, 0.6905],
          [-0.3190, -0.3190, -0.3190], [0.7798, 0.7798, 0.7798],
          [-0.3693, -0.3693, -0.3693], [-0.9457, -0.9457, -0.9457],
          [-0.2942, -0.2942, -0.2942], [-1.8527, -1.8527, -1.8527]],
         [[0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000]],
         [[0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000]],
         [[-0.0380, -0.0380, -0.0380], [-0.1880, -0.1880, -0.1880],
          [-1.5724, -1.5724, -1.5724], [0.6905, 0.6905, 0.6905],
          [-0.3190, -0.3190, -0.3190], [0.7798, 0.7798, 0.7798],
          [-0.3693, -0.3693, -0.3693], [-0.9457, -0.9457, -0.9457],
          [-0.2942, -0.2942, -0.2942], [-1.8527, -1.8527, -1.8527]],
         [[1.1773, 1.1773, 1.1773], [1.5009, 1.5009, 1.5009],
          [2.6399, 2.6399, 2.6399], [5.9242, 5.9242, 5.9242],
          [1.0962, 1.0962, 1.0962], [2.7346, 2.7346, 2.7346],
          [6.0865, 6.0865, 6.0865], [1.5555, 1.5555, 1.5555],
          [4.3303, 4.3303, 4.3303], [2.8229, 2.8229, 2.8229]],
         [[-0.0380, -0.0380, -0.0380], [-0.1880, -0.1880, -0.1880],
          [-1.5724, -1.5724, -1.5724], [0.6905, 0.6905, 0.6905],
          [-0.3190, -0.3190, -0.3190], [0.7798, 0.7798, 0.7798],
          [-0.3693, -0.3693, -0.3693], [-0.9457, -0.9457, -0.9457],
          [-0.2942, -0.2942, -0.2942], [-1.8527, -1.8527, -1.8527]]],
        dtype=dtype).to(device)
    assert torch.allclose(output, expected_output)


================================================
FILE: tests/test_ops/test_info.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch


class TestInfo:
    """Smoke test for the build-information getters of ``mmcv.ops``."""

    def test_info(self):
        """Both version getters must return something when CUDA is present.

        The test does nothing on machines without CUDA.
        """
        if torch.cuda.is_available():
            from mmcv.ops import (get_compiler_version,
                                  get_compiling_cuda_version)

            # Query the compiler version first, then the CUDA version.
            versions = (get_compiler_version(), get_compiling_cuda_version())
            for version in versions:
                assert version is not None


================================================
FILE: tests/test_ops/test_iou3d.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch

from mmcv.ops import boxes_iou3d, boxes_overlap_bev, nms3d, nms3d_normal
from mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,
                        IS_NPU_AVAILABLE)


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support'))
])
def test_boxes_overlap_bev(device):
    """Compare ``boxes_overlap_bev`` with hand-computed overlap values.

    The second box set is one box repeated with rotations of 0, pi/2 and
    pi/4 in its last column. The check runs once on the 3x3 case and once
    with the second set repeated 555 times.
    """
    base_box = [1.0, 1.0, 1.0, 2.0, 2.0, 2.0]
    first_np = np.asarray(
        [
            [1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 0.0],
            [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.0],
            [3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 0.0],
        ],
        dtype=np.float32)
    second_np = np.asarray(
        [base_box + [angle] for angle in (0.0, np.pi / 2, np.pi / 4)],
        dtype=np.float32)
    rotated_overlap = (8 + 8 * 2**0.5) / (3 + 2 * 2**0.5)
    expected_np = np.asarray(
        [
            [4.0, 4.0, rotated_overlap],
            [1.0, 1.0, 1.0],
            [0.0, 0.0, 0.0],
        ],
        dtype=np.float32)

    first = torch.from_numpy(first_np).to(device)
    second = torch.from_numpy(second_np).to(device)

    # Small case: 3 boxes against 3 boxes.
    result = boxes_overlap_bev(first, second).cpu().numpy()
    assert np.allclose(result, expected_np, atol=1e-4)

    # Large case: every box of the second set repeated 555 times in a row.
    repeats = 555
    second = second.repeat_interleave(repeats, 0)
    result = boxes_overlap_bev(first, second).cpu().numpy()
    assert np.allclose(result, expected_np.repeat(repeats, 1), atol=1e-4)


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
])
def test_boxes_iou3d(device):
    """Compare ``boxes_iou3d`` with hand-computed IoU values for 3x3 boxes."""
    base_box = [1.0, 1.0, 1.0, 2.0, 2.0, 2.0]
    first_np = np.asarray(
        [
            [1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 0.0],
            [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.0],
            [3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 0.0],
        ],
        dtype=np.float32)
    # The same box with its last column set to 0, pi/2 and pi/4.
    second_np = np.asarray(
        [base_box + [angle] for angle in (0.0, np.pi / 2, np.pi / 4)],
        dtype=np.float32)
    expected_np = np.asarray(
        [
            [1.0, 1.0, 1.0 / 2**0.5],
            [1.0 / 15, 1.0 / 15, 1.0 / 15],
            [0.0, 0.0, 0.0],
        ],
        dtype=np.float32)

    first = torch.from_numpy(first_np).to(device)
    second = torch.from_numpy(second_np).to(device)

    result = boxes_iou3d(first, second).cpu().numpy()
    assert np.allclose(result, expected_np, atol=1e-4)


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'mlu',
        marks=pytest.mark.skipif(
            not IS_MLU_AVAILABLE, reason='requires MLU support')),
    pytest.param(
        'npu',
        marks=pytest.mark.skipif(
            not IS_NPU_AVAILABLE, reason='requires NPU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
])
def test_nms3d(device):
    """Check ``nms3d`` on 5 hand-made boxes and on 555 seeded random boxes."""
    threshold = 0.3

    # Five boxes with known kept indices.
    small_boxes = torch.from_numpy(
        np.asarray(
            [
                [1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 0.0],
                [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.0],
                [3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 0.3],
                [3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 0.0],
                [3.0, 3.2, 3.2, 3.0, 2.0, 2.0, 0.3],
            ],
            dtype=np.float32))
    small_scores = torch.from_numpy(
        np.array([0.6, 0.9, 0.1, 0.2, 0.15], dtype=np.float32))
    expected_kept = np.array([1, 0, 3])
    kept = nms3d(
        small_boxes.to(device),
        small_scores.to(device),
        iou_threshold=threshold)
    assert np.allclose(kept.cpu().numpy(), expected_kept)

    # The many-box case is not run on MLU: the CUDA kernel promotes float to
    # double during the computation (https://github.com/open-mmlab/mmcv/blob
    # /master/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp#L61) while the
    # MLU kernel always stays in float, so the two give different results.
    if device == 'mlu':
        return

    np.random.seed(42)
    many_boxes = torch.from_numpy(
        np.random.rand(555, 7).astype(np.float32))
    many_scores = torch.from_numpy(np.random.rand(555).astype(np.float32))
    kept = nms3d(
        many_boxes.to(device), many_scores.to(device), iou_threshold=threshold)
    assert len(kept.cpu().numpy()) == 176


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'npu',
        marks=pytest.mark.skipif(
            not IS_NPU_AVAILABLE, reason='requires NPU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
])
def test_nms3d_normal(device):
    """Check ``nms3d_normal`` on 5 hand-made and 555 seeded random boxes."""
    threshold = 0.3

    # Five boxes with known kept indices.
    small_boxes = torch.from_numpy(
        np.asarray(
            [
                [1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 0.0],
                [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.0],
                [3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 0.3],
                [3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 0.0],
                [3.0, 3.2, 3.2, 3.0, 2.0, 2.0, 0.3],
            ],
            dtype=np.float32))
    small_scores = torch.from_numpy(
        np.array([0.6, 0.9, 0.1, 0.2, 0.15], dtype=np.float32))
    expected_kept = np.array([1, 0, 3])
    kept = nms3d_normal(
        small_boxes.to(device),
        small_scores.to(device),
        iou_threshold=threshold)
    assert np.allclose(kept.cpu().numpy(), expected_kept)

    # Seeded random boxes: only the number of survivors is pinned.
    np.random.seed(42)
    many_boxes = torch.from_numpy(
        np.random.rand(555, 7).astype(np.float32))
    many_scores = torch.from_numpy(np.random.rand(555).astype(np.float32))
    kept = nms3d_normal(
        many_boxes.to(device), many_scores.to(device), iou_threshold=threshold)
    assert len(kept.cpu().numpy()) == 148


================================================
FILE: tests/test_ops/test_knn.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch

from mmcv.ops import knn
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
])
def test_knn(device):
    """Compare ``knn`` with a brute-force nearest-neighbour search.

    Three calls are checked: separate query points, the same inputs passed
    with the last two axes swapped plus a ``True`` fourth argument, and the
    reference points used as their own queries.
    """
    num_neighbours = 5

    def brute_force_idx(queries, references):
        # Squared distance between every query and every reference point,
        # followed by the indices of the smallest distances per query.
        delta = queries.unsqueeze(2) - references.unsqueeze(1)
        sq_dist = (delta * delta).sum(-1)
        nearest = sq_dist.topk(k=num_neighbours, dim=2, largest=False)[1]
        return nearest.transpose(2, 1)

    new_xyz = torch.tensor([
        [
            [-0.0740, 1.3147, -1.3625],
            [-2.2769, 2.7817, -0.2334],
            [-0.4003, 2.4666, -0.5116],
            [-0.0740, 1.3147, -1.3625],
            [-0.0740, 1.3147, -1.3625],
        ],
        [
            [-2.0289, 2.4952, -0.1708],
            [-2.0668, 6.0278, -0.4875],
            [0.4066, 1.4211, -0.2947],
            [-2.0289, 2.4952, -0.1708],
            [-2.0289, 2.4952, -0.1708],
        ],
    ]).to(device)

    xyz = torch.tensor([
        [
            [-0.0740, 1.3147, -1.3625],
            [0.5555, 1.0399, -1.3634],
            [-0.4003, 2.4666, -0.5116],
            [-0.5251, 2.4379, -0.8466],
            [-0.9691, 1.1418, -1.3733],
            [-0.2232, 0.9561, -1.3626],
            [-2.2769, 2.7817, -0.2334],
            [-0.2822, 1.3192, -1.3645],
            [0.1533, 1.5024, -1.0432],
            [0.4917, 1.1529, -1.3496],
        ],
        [
            [-2.0289, 2.4952, -0.1708],
            [-0.7188, 0.9956, -0.5096],
            [-2.0668, 6.0278, -0.4875],
            [-1.9304, 3.3092, 0.6610],
            [0.0949, 1.4332, 0.3140],
            [-1.2879, 2.0008, -0.7791],
            [-0.7252, 0.9611, -0.6371],
            [0.4066, 1.4211, -0.2947],
            [0.3220, 1.4447, 0.3548],
            [-0.9744, 2.3856, -1.2000],
        ],
    ]).to(device)

    # Separate query points.
    found = knn(num_neighbours, xyz, new_xyz)
    reference = brute_force_idx(new_xyz, xyz)
    assert torch.all(found == reference)

    # Same data with the last two axes swapped and the fourth argument True.
    found = knn(num_neighbours,
                xyz.transpose(1, 2).contiguous(),
                new_xyz.transpose(1, 2).contiguous(), True)
    assert torch.all(found == reference)

    # Reference points queried against themselves.
    found = knn(num_neighbours, xyz, xyz)
    reference = brute_force_idx(xyz, xyz)
    assert torch.all(found == reference)


================================================
FILE: tests/test_ops/test_masked_conv2d.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch

from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE

# When an MLU backend is available, switch TF32 off for both the CNNL and
# the matmul backends before any test in this module runs.
# NOTE(review): presumably this keeps MLU results close enough to the stored
# float reference output -- confirm with the MLU maintainers.
if IS_MLU_AVAILABLE:
    torch.backends.cnnl.allow_tf32 = False
    torch.backends.mlu.matmul.allow_tf32 = False


class TestMaskedConv2d:
    """Regression test for ``MaskedConv2d`` against stored ``.npy`` data."""

    @pytest.mark.parametrize('device', [
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
    ])
    def test_masked_conv2d_all_close(self, device):
        """Run a 3->3 channel, 3x3 masked conv and compare with the fixture.

        The fixture files are located relative to this test file (the
        ``data/for_masked_conv2d`` directory next to ``test_ops``), so the
        test no longer depends on the directory pytest is started from.

        Args:
            device: Device string the tensors and the layer are moved to.
        """
        import os

        from mmcv.ops import MaskedConv2d

        # tests/test_ops/<this file> -> tests/data/for_masked_conv2d
        tests_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        data_dir = os.path.join(tests_dir, 'data', 'for_masked_conv2d')

        def load(name):
            """Load one ``masked_conv2d_for_<name>.npy`` fixture array."""
            return np.load(
                os.path.join(data_dir, f'masked_conv2d_for_{name}.npy'))

        def as_tensor(array):
            """Copy a numpy array into a float tensor on ``device``."""
            return torch.tensor(array, dtype=torch.float, device=device)

        np_input = load('input')
        np_mask = load('mask')
        np_weight = load('weight')
        np_bias = load('bias')
        np_output = load('output')

        inputs = as_tensor(np_input)
        mask = as_tensor(np_mask)
        conv = MaskedConv2d(3, 3, 3, 1, 1).to(device)
        # Replace the randomly initialised parameters with the stored ones.
        conv.weight = torch.nn.Parameter(as_tensor(np_weight))
        conv.bias = torch.nn.Parameter(as_tensor(np_bias))
        output = conv(inputs, mask)
        # Third positional argument of np.allclose is the relative tolerance.
        assert np.allclose(output.data.cpu().numpy(), np_output, 1e-3)


================================================
FILE: tests/test_ops/test_merge_cells.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
"""
CommandLine:
    pytest tests/test_ops/test_merge_cells.py
"""
import math

import pytest
import torch
import torch.nn.functional as F

from mmcv.ops.merge_cells import (BaseMergeCell, ConcatCell, GlobalPoolingCell,
                                  SumCell)


# Every (14, 7) size below tests the case where
# the input size is not divisible by the target size.
@pytest.mark.parametrize(
    'inputs_x, inputs_y',
    [(torch.randn([2, 256, 16, 16]), torch.randn([2, 256, 32, 32])),
     (torch.randn([2, 256, 14, 7]), torch.randn([2, 256, 32, 32]))])
def test_sum_cell(inputs_x, inputs_y):
    """``SumCell`` output must have the requested (or default) size.

    With an explicit ``out_size`` the result matches the tensor the size was
    taken from; without it the result matches ``inputs_y``.
    """
    cell = SumCell(256, 256)
    for template in (inputs_x, inputs_y):
        merged = cell(inputs_x, inputs_y, out_size=template.shape[-2:])
        assert merged.size() == template.size()
    merged = cell(inputs_x, inputs_y)
    assert merged.size() == inputs_y.size()


@pytest.mark.parametrize(
    'inputs_x, inputs_y',
    [(torch.randn([2, 256, 16, 16]), torch.randn([2, 256, 32, 32])),
     (torch.randn([2, 256, 14, 7]), torch.randn([2, 256, 32, 32]))])
def test_concat_cell(inputs_x, inputs_y):
    """``ConcatCell`` output must have the requested (or default) size.

    With an explicit ``out_size`` the result matches the tensor the size was
    taken from; without it the result matches ``inputs_y``.
    """
    cell = ConcatCell(256, 256)
    for template in (inputs_x, inputs_y):
        merged = cell(inputs_x, inputs_y, out_size=template.shape[-2:])
        assert merged.size() == template.size()
    merged = cell(inputs_x, inputs_y)
    assert merged.size() == inputs_y.size()


@pytest.mark.parametrize(
    'inputs_x, inputs_y',
    [(torch.randn([2, 256, 16, 16]), torch.randn([2, 256, 32, 32])),
     (torch.randn([2, 256, 14, 7]), torch.randn([2, 256, 32, 32]))])
def test_global_pool_cell(inputs_x, inputs_y):
    """``GlobalPoolingCell`` output must match ``inputs_x`` in size.

    The cell is built twice: once with ``with_out_conv=False`` and once with
    256 input and output channels.
    """
    out_size = inputs_x.shape[-2:]
    # Each cell is created right before it is used, one after the other.
    constructor_args = (((), {'with_out_conv': False}), ((256, 256), {}))
    for args, kwargs in constructor_args:
        cell = GlobalPoolingCell(*args, **kwargs)
        pooled = cell(inputs_x, inputs_y, out_size=out_size)
        assert pooled.size() == inputs_x.size()


@pytest.mark.parametrize('target_size', [(256, 256), (128, 128), (64, 64),
                                         (14, 7)])
def test_resize_methods(target_size):
    """Compare ``BaseMergeCell._resize`` with plain ``torch`` resizing.

    A target that is not smaller than the 128x128 input in both dimensions
    is checked against ``F.interpolate``; a smaller target is checked against
    zero padding followed by non-overlapping max pooling.
    """
    inputs_x = torch.randn([2, 256, 128, 128])
    src_h, src_w = inputs_x.shape[-2:]
    dst_h, dst_w = target_size

    if src_h <= dst_h or src_w <= dst_w:
        # Upsampling path: every supported interpolation mode must match.
        for mode in ('nearest', 'bilinear'):
            cell = BaseMergeCell(upsample_mode=mode)
            resized = cell._resize(inputs_x, target_size)
            reference = F.interpolate(inputs_x, size=target_size, mode=mode)
            assert resized.equal(reference)
        return

    # Downsampling path.
    cell = BaseMergeCell()
    resized = cell._resize(inputs_x, target_size)

    # Amount needed to reach the next multiple of the target size.
    extra_h = -src_h % dst_h
    extra_w = -src_w % dst_w
    padded = inputs_x
    if extra_h or extra_w:
        left = extra_w // 2
        top = extra_h // 2
        # F.pad order: (left, right, top, bottom); the odd pixel goes last.
        padding = (left, extra_w - left, top, extra_h - top)
        padded = F.pad(inputs_x, padding, mode='constant', value=0.0)

    window = (padded.shape[-2] // dst_h, padded.shape[-1] // dst_w)
    reference = F.max_pool2d(padded, kernel_size=window, stride=window)
    print(resized.shape, reference.shape)
    assert (resized == reference).all()
    assert resized.shape[-2:] == target_size


================================================
FILE: tests/test_ops/test_min_area_polygons.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch

from mmcv.ops import min_area_polygons
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE

# Two point sets, each a flat row of 18 values. The rows differ only in
# the values at positions 2 and 3.
np_pointsets = np.asarray([
    [
        1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0, 1.0,
        3.0, 3.0, 1.0, 2.0, 3.0, 3.0, 2.0, 1.5, 1.5,
    ],
    [
        1.0, 1.0, 8.0, 8.0, 1.0, 2.0, 2.0, 1.0, 1.0,
        3.0, 3.0, 1.0, 2.0, 3.0, 3.0, 2.0, 1.5, 1.5,
    ],
])

# Reference result of ``min_area_polygons`` for the two rows above
# (8 values per row).
expected_polygons = np.asarray([
    [3.0000, 1.0000, 1.0000, 1.0000, 1.0000, 3.0000, 3.0000, 3.0000],
    [8.0, 8.0, 2.3243, 0.0541, 0.0541, 1.6757, 5.7297, 9.6216],
])


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
])
def test_min_area_polygons(device):
    """``min_area_polygons`` must reproduce the stored reference polygons."""
    # Move the module-level point sets to the device as float32.
    pointsets = torch.from_numpy(np_pointsets).to(device).float()
    polygons = min_area_polygons(pointsets).cpu().numpy()

    assert np.allclose(polygons, expected_polygons, atol=1e-4)


================================================
FILE: tests/test_ops/test_modulated_deform_conv.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import os

import numpy
import pytest
import torch
from mmengine.utils import digit_version
from mmengine.utils.dl_utils import TORCH_VERSION

from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE

if IS_MUSA_AVAILABLE:
    try:
        from torch_musa.core.amp import autocast
    except ImportError:
        pass
else:
    try:
        # If PyTorch version >= 1.6.0 and fp16 is enabled,
        # torch.cuda.amp.autocast would be imported and used;
        # we should test if our modules support it.
        from torch.cuda.amp import autocast
    except ImportError:
        pass

# Directory that contains this test module.
cur_dir = os.path.dirname(os.path.abspath(__file__))

# Input fixture: a 1x1x3x3 nested list whose rows are all [1, 2, 3].
input_t = [[[[1., 2., 3.] for _ in range(3)]]]
# Reference forward output (1x1x4x4).
output_t = [[[
    [0.5, 1.5, 2.5, 1.5],
    [1.0, 3.0, 5.0, 3.0],
    [1.0, 3.0, 5.0, 3.0],
    [0.5, 1.5, 2.5, 1.5],
]]]
# Reference gradient with respect to the input.
input_grad = [[[[2., 2., 2.] for _ in range(3)]]]
# Reference gradient of the deformable convolution weight.
dcn_w_grad = [[[[9., 9.], [9., 9.]]]]
# Reference gradient of ``conv_offset.weight``: 12 entries of shape 1x2x2.
dcn_offset_w_grad = [
    [[[-7.0, -4.0], [0.0, 0.0]]],
    [[[-9.0, 7.5], [-6.0, 5.0]]],
    [[[-4.0, -7.0], [0.0, 0.0]]],
    [[[-7.5, -9.0], [-5.0, -6.0]]],
    [[[-7.0, -4.0], [-7.0, -4.0]]],
    [[[-6.0, 5.0], [-9.0, 7.5]]],
    [[[-4.0, -7.0], [-4.0, -7.0]]],
    [[[-5.0, -6.0], [-7.5, -9.0]]],
    [[[10.5, 6.0], [7.0, 4.0]]],
    [[[6.0, 10.5], [4.0, 7.0]]],
    [[[7.0, 4.0], [10.5, 6.0]]],
    [[[4.0, 7.0], [6.0, 10.5]]],
]
# Reference gradient of ``conv_offset.bias``: four (-3.0, -1.5) pairs
# followed by four 4.5 values.
dcn_offset_b_grad = [-3.0, -1.5] * 4 + [4.5] * 4


class TestMdconv:
    """Forward/backward checks for ``ModulatedDeformConv2dPack``.

    Every test runs a 1-in/1-out channel layer with a 2x2 kernel on the
    module-level ``input_t`` fixture and compares the output and all
    gradients with the module-level reference values.
    """

    @staticmethod
    def _skip_if_cuda_missing(device):
        """Skip the running test when CUDA is requested but unavailable."""
        if not torch.cuda.is_available() and device == 'cuda':
            pytest.skip('test requires GPU')

    @staticmethod
    def _build_dcn(device):
        """Create the layer under test on ``device`` with all-one weights."""
        if device == 'mlu':
            from mmcv.ops import \
                ModulatedDeformConv2dPack_MLU as ModulatedDeformConv2dPack
        else:
            from mmcv.ops import ModulatedDeformConv2dPack

        dcn = ModulatedDeformConv2dPack(
            1,
            1,
            kernel_size=(2, 2),
            stride=1,
            padding=1,
            deform_groups=1,
            bias=False).to(device)
        dcn.weight.data.fill_(1.)
        return dcn

    @staticmethod
    def _run_and_check(dcn, inputs):
        """Run forward and backward, then compare with the references.

        The third positional argument of ``numpy.allclose`` is the relative
        tolerance.
        """
        output = dcn(inputs)
        output.sum().backward()
        assert numpy.allclose(output.cpu().detach().numpy(), output_t, 1e-2)
        assert numpy.allclose(inputs.grad.cpu().detach().numpy(), input_grad,
                              1e-2)
        assert numpy.allclose(dcn.weight.grad.cpu().detach().numpy(),
                              dcn_w_grad, 1e-2)
        assert numpy.allclose(
            dcn.conv_offset.weight.grad.cpu().detach().numpy(),
            dcn_offset_w_grad, 1e-2)
        assert numpy.allclose(dcn.conv_offset.bias.grad.cpu().detach().numpy(),
                              dcn_offset_b_grad, 1e-2)

    def _test_mdconv(self, dtype=torch.float, device='cuda'):
        """Test the layer with input and module both cast to ``dtype``.

        Args:
            dtype: Dtype used for the input tensor and the module.
            device: Device string to run on.
        """
        self._skip_if_cuda_missing(device)

        inputs = torch.tensor(input_t, dtype=dtype, device=device)
        inputs.requires_grad = True

        dcn = self._build_dcn(device)
        dcn.type(dtype)
        self._run_and_check(dcn, inputs)

    def _test_amp_mdconv(self, input_dtype=torch.float, device='cuda'):
        """The function to test amp released on pytorch 1.6.0.

        The type of input data might be torch.float or torch.half,
        so we should test mdconv in both cases. With amp, the data
        type of model will NOT be set manually.

        Args:
            input_dtype: torch.float or torch.half.
            device: Device string to run on.
        """
        # Same skip behaviour as ``_test_mdconv`` instead of a silent return.
        self._skip_if_cuda_missing(device)

        inputs = torch.tensor(input_t).to(device).type(input_dtype)
        inputs.requires_grad = True

        dcn = self._build_dcn(device)
        self._run_and_check(dcn, inputs)

    @pytest.mark.parametrize('device', [
        'cpu',
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
    ])
    def test_mdconv_float(self, device):
        """Single-precision run on every available backend."""
        self._test_mdconv(dtype=torch.float, device=device)

    @pytest.mark.parametrize('device', [
        'cpu',
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support')),
    ])
    def test_mdconv_double(self, device):
        """Double-precision run (MUSA is not in the device list)."""
        self._test_mdconv(dtype=torch.double, device=device)

    @pytest.mark.parametrize('device', [
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
    ])
    def test_mdconv_half(self, device):
        """Half-precision run, plus autocast runs on PyTorch >= 1.6.0."""
        self._test_mdconv(torch.half, device=device)
        # test amp when torch version >= '1.6.0', the type of
        # input data for mdconv might be torch.float or torch.half
        if (TORCH_VERSION != 'parrots'
                and digit_version(TORCH_VERSION) >= digit_version('1.6.0')):
            with autocast(enabled=True):
                self._test_amp_mdconv(torch.float, device=device)
                self._test_amp_mdconv(torch.half, device=device)


================================================
FILE: tests/test_ops/test_ms_deformable_attn.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch

from mmcv.ops.multi_scale_deform_attn import (
    MultiScaleDeformableAttention, MultiScaleDeformableAttnFunction,
    multi_scale_deformable_attn_pytorch)
from mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,
                        IS_NPU_AVAILABLE)

# Both flags start optimistic and are cleared by the import fallbacks below.
_USING_PARROTS = True
_IS_AUTOCAST_AVAILABLE = True
# Prefer the parrots gradcheck; fall back to the PyTorch one and record
# that parrots is not in use.
try:
    from parrots.autograd import gradcheck
except ImportError:
    from torch.autograd import gradcheck
    _USING_PARROTS = False

# Pick the autocast context manager of the active backend. When the import
# fails, ``autocast`` stays undefined and only the flag records that fact.
if IS_MUSA_AVAILABLE:
    try:
        from torch.musa.amp import autocast
    except ImportError:
        _IS_AUTOCAST_AVAILABLE = False
        pass
else:
    try:
        # torch.cuda.amp.autocast exists from PyTorch 1.6.0 on; when it can
        # be imported, the tests should also verify that the modules in this
        # file work under it.
        from torch.cuda.amp import autocast
    except ImportError:
        _IS_AUTOCAST_AVAILABLE = False
        pass


@pytest.mark.parametrize('device', [
    'cpu',
    pytest.param(
        'cuda:0',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'mlu',
        marks=pytest.mark.skipif(
            not IS_MLU_AVAILABLE, reason='requires MLU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
])
def test_multiscale_deformable_attention(device):
    """Smoke test for ``MultiScaleDeformableAttention``.

    Checks that a head count of 7 is rejected for 256 embedding dims, then
    runs one forward pass with default settings and one with
    ``value_proj_ratio`` set. Only successful execution is verified.
    """
    # embed_dims must be divisible by num_heads.
    with pytest.raises(ValueError):
        MultiScaleDeformableAttention(embed_dims=256, num_heads=7)

    device = torch.device(device)
    num_query, bs = 5, 1

    # Default configuration with 3 embedding dims.
    attn = MultiScaleDeformableAttention(
        embed_dims=3, num_levels=2, num_heads=3)
    attn.init_weights()
    query = torch.rand(num_query, bs, 3).to(device)
    key = torch.rand(num_query, bs, 3).to(device)
    spatial_shapes = torch.Tensor([[2, 2], [1, 1]]).long().to(device)
    level_start_index = torch.Tensor([0, 4]).long().to(device)
    reference_points = torch.rand(bs, num_query, 2, 2).to(device)
    # Keyword arguments shared by both forward passes.
    sampling_kwargs = dict(
        reference_points=reference_points,
        spatial_shapes=spatial_shapes,
        level_start_index=level_start_index)
    attn.to(device)
    attn(query, key, key, **sampling_kwargs)

    # test with value_proj_ratio
    wide_dims = 6
    query = torch.rand(num_query, bs, wide_dims).to(device)
    key = torch.rand(num_query, bs, wide_dims).to(device)
    attn = MultiScaleDeformableAttention(
        embed_dims=wide_dims,
        num_levels=2,
        num_heads=3,
        value_proj_ratio=0.5)
    attn.init_weights()
    attn.to(device)
    attn(query, key, key, **sampling_kwargs)


def test_forward_multi_scale_deformable_attn_pytorch():
    """Smoke-test the pure-PyTorch implementation with double inputs.

    Only checks that the call completes; the result is discarded.
    """
    batch, heads, head_dim = 1, 2, 2
    queries, levels, points = 2, 2, 2
    shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)
    # Total number of value positions: sum of H * W over all levels.
    num_values = int(shapes.prod(1).sum())

    torch.manual_seed(3)
    value = torch.rand(batch, num_values, heads, head_dim) * 0.01
    sampling_locations = torch.rand(batch, queries, heads, levels, points, 2)
    attention_weights = torch.rand(batch, queries, heads, levels,
                                   points) + 1e-5
    # Normalise so the weights of each (query, head) sum to one over all
    # levels and points.
    weight_total = attention_weights.sum(-1, keepdim=True)
    weight_total = weight_total.sum(-2, keepdim=True)
    attention_weights /= weight_total

    multi_scale_deformable_attn_pytorch(
        value.double(), shapes, sampling_locations.double(),
        attention_weights.double()).detach()


@pytest.mark.skipif(not IS_CUDA_AVAILABLE, reason='requires CUDA support')
def test_forward_equal_with_pytorch_double():
    """CUDA op and pure-PyTorch implementation must agree in double."""
    batch, heads, head_dim = 1, 2, 2
    queries, levels, points = 2, 2, 2
    shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)
    level_sizes = shapes.prod(1)
    # Each level starts where the previous ones end: [0, H0 * W0, ...].
    level_start_index = torch.cat((shapes.new_zeros(
        (1, )), level_sizes.cumsum(0)[:-1]))
    num_values = int(level_sizes.sum())

    torch.manual_seed(3)
    value = torch.rand(batch, num_values, heads, head_dim) * 0.01
    sampling_locations = torch.rand(batch, queries, heads, levels, points, 2)
    attention_weights = torch.rand(batch, queries, heads, levels,
                                   points) + 1e-5
    weight_total = attention_weights.sum(-1, keepdim=True)
    weight_total = weight_total.sum(-2, keepdim=True)
    attention_weights /= weight_total
    im2col_step = 2

    expected = multi_scale_deformable_attn_pytorch(
        value.double(), shapes, sampling_locations.double(),
        attention_weights.double())
    expected = expected.detach().cpu()

    actual = MultiScaleDeformableAttnFunction.apply(
        value.cuda().double(), shapes.cuda(), level_start_index.cuda(),
        sampling_locations.cuda().double(),
        attention_weights.cuda().double(), im2col_step)
    actual = actual.detach().cpu()

    assert torch.allclose(actual, expected)
    abs_diff = (actual - expected).abs()
    max_abs_err = abs_diff.max()
    max_rel_err = (abs_diff / expected.abs()).max()
    assert max_abs_err < 1e-18
    assert max_rel_err < 1e-15


@pytest.mark.skipif(not IS_NPU_AVAILABLE, reason='requires NPU support')
def test_forward_equal_with_pytorch_npu():
    """NPU op and pure-PyTorch implementation must agree in float."""
    batch, heads, head_dim = 6, 4, 8
    queries, levels, points = 10000, 4, 8
    shapes = torch.as_tensor([(60, 40), (30, 20), (16, 24), (53, 32)],
                             dtype=torch.int32)
    level_sizes = shapes.prod(1)
    # Each level starts where the previous ones end: [0, H0 * W0, ...].
    level_start_index = torch.cat((shapes.new_zeros(
        (1, )), level_sizes.cumsum(0)[:-1]))
    num_values = int(level_sizes.sum())

    torch.manual_seed(3)
    value = torch.rand(batch, num_values, heads, head_dim) * 0.01
    sampling_locations = torch.rand(batch, queries, heads, levels, points, 2)
    attention_weights = torch.rand(batch, queries, heads, levels,
                                   points) + 1e-5
    weight_total = attention_weights.sum(-1, keepdim=True)
    weight_total = weight_total.sum(-2, keepdim=True)
    attention_weights /= weight_total
    im2col_step = 2

    expected = multi_scale_deformable_attn_pytorch(
        value.float(), shapes, sampling_locations.float(),
        attention_weights.float())
    expected = expected.detach().cpu()

    actual = MultiScaleDeformableAttnFunction.apply(
        value.npu().float(), shapes.npu(), level_start_index.npu(),
        sampling_locations.npu().float(),
        attention_weights.npu().float(), im2col_step)
    actual = actual.detach().cpu()

    assert torch.allclose(actual, expected)
    abs_diff = (actual - expected).abs()
    max_abs_err = abs_diff.max()
    max_rel_err = (abs_diff / expected.abs()).max()
    assert max_abs_err < 1e-18
    assert max_rel_err < 1e-15


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'mlu',
        marks=pytest.mark.skipif(
            not IS_MLU_AVAILABLE, reason='requires MLU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
])
def test_forward_equal_with_pytorch_float(device):
    """Device op and pure-PyTorch implementation must agree in float."""
    batch, heads, head_dim = 1, 2, 2
    queries, levels, points = 2, 2, 2
    shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)
    level_sizes = shapes.prod(1)
    # Each level starts where the previous ones end: [0, H0 * W0, ...].
    level_start_index = torch.cat((shapes.new_zeros(
        (1, )), level_sizes.cumsum(0)[:-1]))
    num_values = int(level_sizes.sum())

    torch.manual_seed(3)
    value = torch.rand(batch, num_values, heads, head_dim) * 0.01
    sampling_locations = torch.rand(batch, queries, heads, levels, points, 2)
    attention_weights = torch.rand(batch, queries, heads, levels,
                                   points) + 1e-5
    weight_total = attention_weights.sum(-1, keepdim=True)
    weight_total = weight_total.sum(-2, keepdim=True)
    attention_weights /= weight_total
    im2col_step = 2

    expected = multi_scale_deformable_attn_pytorch(
        value, shapes, sampling_locations, attention_weights)
    expected = expected.detach().cpu()

    actual = MultiScaleDeformableAttnFunction.apply(
        value.to(device), shapes.to(device), level_start_index.to(device),
        sampling_locations.to(device), attention_weights.to(device),
        im2col_step)
    actual = actual.detach().cpu()

    assert torch.allclose(actual, expected, rtol=1e-2, atol=1e-3)
    abs_diff = (actual - expected).abs()
    max_abs_err = abs_diff.max()
    max_rel_err = (abs_diff / expected.abs()).max()
    assert max_abs_err < 1e-9
    assert max_rel_err < 1e-6


@pytest.mark.skipif(
    not _IS_AUTOCAST_AVAILABLE, reason='requires autocast support')
@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
])
def test_forward_equal_with_autocast(device):
    """Device op under ``autocast`` must match the PyTorch implementation.

    The op is run twice inside an enabled ``autocast`` context, once with a
    float ``value`` and once with a half ``value``, and compared with the
    float result of the pure-PyTorch implementation.
    """
    batch, heads, head_dim = 1, 2, 2
    queries, levels, points = 2, 2, 2
    shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)
    level_sizes = shapes.prod(1)
    # Each level starts where the previous ones end: [0, H0 * W0, ...].
    level_start_index = torch.cat((shapes.new_zeros(
        (1, )), level_sizes.cumsum(0)[:-1]))
    num_values = int(level_sizes.sum())

    torch.manual_seed(3)
    value = torch.rand(batch, num_values, heads, head_dim) * 0.01
    sampling_locations = torch.rand(batch, queries, heads, levels, points, 2)
    attention_weights = torch.rand(batch, queries, heads, levels,
                                   points) + 1e-5
    weight_total = attention_weights.sum(-1, keepdim=True)
    weight_total = weight_total.sum(-2, keepdim=True)
    attention_weights /= weight_total
    im2col_step = 2

    expected = multi_scale_deformable_attn_pytorch(
        value, shapes, sampling_locations, attention_weights)
    expected = expected.detach().cpu()

    def run_with_value_dtype(dtype):
        # Only ``value`` is cast to ``dtype``; the other inputs keep the
        # dtype they were created with.
        with autocast(enabled=True):
            return MultiScaleDeformableAttnFunction.apply(
                value.to(device).type(dtype), shapes.to(device),
                level_start_index.to(device), sampling_locations.to(device),
                attention_weights.to(device), im2col_step).detach().cpu()

    # float test
    actual = run_with_value_dtype(torch.float)
    assert torch.allclose(actual, expected, rtol=1e-2, atol=1e-3)
    abs_diff = (actual - expected).abs()
    max_abs_err = abs_diff.max()
    max_rel_err = (abs_diff / expected.abs()).max()
    assert max_abs_err < 1e-9
    assert max_rel_err < 1e-6

    # half test
    actual = run_with_value_dtype(torch.half)
    assert torch.allclose(actual, expected.half(), rtol=1e-2, atol=1e-3)
    abs_diff = (actual - expected).abs()
    max_abs_err = abs_diff.max()
    max_rel_err = (abs_diff / expected.abs()).max()
    assert max_abs_err < 1e-5
    assert max_rel_err < 1e-2


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'mlu',
        marks=pytest.mark.skipif(
            not IS_MLU_AVAILABLE, reason='requires MLU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
])
@pytest.mark.parametrize('dtype', [
    torch.float,
    pytest.param(
        torch.double,
        marks=pytest.mark.skipif(
            IS_MLU_AVAILABLE or IS_MUSA_AVAILABLE,
            reason='MLU, MUSA does not support for 64-bit floating point')),
    torch.half
])
@pytest.mark.parametrize('channels', [
    4,
    30,
    32,
    64,
    71,
    1025,
])
def test_gradient_numerical(channels,
                            device,
                            dtype,
                            grad_value=True,
                            grad_sampling_loc=True,
                            grad_attn_weight=True):
    """Numerically check the gradients of the op with ``gradcheck``.

    Args:
        channels (int): Size of the last dimension of ``value``.
        device (str): Device the inputs are moved to.
        dtype (torch.dtype): Parametrized dtype. It is replaced by a
            device-specific dtype for every device this test is
            parametrized over (see the table in the body).
        grad_value (bool): Whether ``value`` requires a gradient.
        grad_sampling_loc (bool): Whether ``sampling_locations`` requires
            a gradient.
        grad_attn_weight (bool): Whether ``attention_weights`` requires a
            gradient.
    """
    batch, heads = 1, 2
    queries, levels, points = 2, 2, 2
    shapes = torch.as_tensor([(3, 2), (2, 1)], dtype=torch.long).to(device)
    level_sizes = shapes.prod(1)
    # Each level starts where the previous ones end: [0, H0 * W0, ...].
    level_start_index = torch.cat((shapes.new_zeros(
        (1, )), level_sizes.cumsum(0)[:-1]))
    num_values = int(level_sizes.sum())

    value = torch.rand(batch, num_values, heads, channels).to(device) * 0.01
    sampling_locations = torch.rand(batch, queries, heads, levels, points,
                                    2).to(device)
    attention_weights = torch.rand(batch, queries, heads, levels,
                                   points).to(device) + 1e-5
    weight_total = attention_weights.sum(-1, keepdim=True)
    weight_total = weight_total.sum(-2, keepdim=True)
    attention_weights /= weight_total
    im2col_step = 2

    func = MultiScaleDeformableAttnFunction.apply

    value.requires_grad = grad_value
    sampling_locations.requires_grad = grad_sampling_loc
    attention_weights.requires_grad = grad_attn_weight

    # dtype and finite-difference step actually used on each device. For any
    # other device the parametrized ``dtype`` is kept and ``eps`` stays
    # undefined.
    per_device = {
        'cuda': (torch.double, 1e-6),
        'mlu': (torch.float, 1e-4),
        'musa': (torch.float, 1e-4),
    }
    if device in per_device:
        dtype, eps = per_device[device]

    check_inputs = (value.to(dtype), shapes, level_start_index,
                    sampling_locations.to(dtype), attention_weights.to(dtype),
                    im2col_step)
    if _USING_PARROTS:
        # The parrots ``gradcheck`` is told explicitly which inputs have no
        # gradient.
        assert gradcheck(
            func,
            check_inputs,
            no_grads=[shapes, level_start_index],
            eps=eps)
    else:
        assert gradcheck(func, check_inputs, eps=eps, atol=1e-2)


@pytest.mark.skipif(not IS_NPU_AVAILABLE, reason='requires NPU support')
def test_backward_equal_with_pytorch_npu():
    """NPU gradients of the op must match the pure-PyTorch implementation.

    Gradients with respect to ``value``, ``sampling_locations`` and
    ``attention_weights`` are computed once on the host with
    ``multi_scale_deformable_attn_pytorch`` and once on the NPU with
    ``MultiScaleDeformableAttnFunction``, using an all-ones output gradient
    in both cases, and then compared.
    """
    N, M, D = 6, 4, 8
    Lq, L, P = 10000, 4, 8
    shapes = torch.as_tensor([(60, 40), (30, 20), (16, 24), (53, 32)],
                             dtype=torch.int32)
    # Each level starts where the previous ones end: [0, H0 * W0, ...].
    level_start_index = torch.cat((shapes.new_zeros(
        (1, )), shapes.prod(1).cumsum(0)[:-1]))
    S = sum((H * W).item() for H, W in shapes)

    torch.manual_seed(3)
    value = torch.rand(N, S, M, D) * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2)
    attention_weights = torch.rand(N, Lq, M, L, P) + 1e-5
    attention_weights /= attention_weights.sum(
        -1, keepdim=True).sum(
            -2, keepdim=True)
    im2col_step = 2
    value.requires_grad = True
    sampling_locations.requires_grad = True
    attention_weights.requires_grad = True

    # Reference gradients from the pure-PyTorch implementation.
    output_pytorch = multi_scale_deformable_attn_pytorch(
        value.float(), shapes, sampling_locations.float(),
        attention_weights.float())
    grad_output_pytorch = torch.ones_like(output_pytorch)
    output_pytorch.backward(grad_output_pytorch)
    grad_value = value.grad.detach().cpu()
    grad_location = sampling_locations.grad.detach().cpu()
    grad_attn_weight = attention_weights.grad.detach().cpu()

    # Build independent *leaf* tensors on the NPU. Moving a tensor that
    # requires grad with ``.npu()`` alone yields a non-leaf tensor: its
    # ``.grad`` is never populated, and the NPU backward pass would instead
    # accumulate into the host tensors' ``.grad``, which the reference
    # gradients above alias. Detaching first avoids both problems.
    value_npu = value.detach().npu().requires_grad_()
    shapes_npu = shapes.npu()
    level_start_index_npu = level_start_index.npu()
    sampling_locations_npu = (
        sampling_locations.detach().npu().requires_grad_())
    attention_weights_npu = attention_weights.detach().npu().requires_grad_()

    output_npu = MultiScaleDeformableAttnFunction.apply(
        value_npu.float(), shapes_npu, level_start_index_npu,
        sampling_locations_npu.float(), attention_weights_npu.float(),
        im2col_step)
    grad_output_npu = torch.ones_like(output_npu)
    output_npu.backward(grad_output_npu)
    grad_value_npu = value_npu.grad.detach().cpu()
    grad_location_npu = sampling_locations_npu.grad.detach().cpu()
    grad_attn_weight_npu = attention_weights_npu.grad.detach().cpu()

    # gradient w.r.t. value
    assert torch.allclose(grad_value_npu, grad_value)
    max_abs_err_1 = (grad_value_npu - grad_value).abs().max()
    max_rel_err_1 = ((grad_value_npu - grad_value).abs() /
                     grad_value.abs()).max()
    assert max_abs_err_1 < 1e-5
    assert max_rel_err_1 < 1e-4

    # gradient w.r.t. sampling locations
    assert torch.allclose(grad_location_npu, grad_location)
    max_abs_err_2 = (grad_location_npu - grad_location).abs().max()
    max_rel_err_2 = ((grad_location_npu - grad_location).abs() /
                     grad_location.abs()).max()
    assert max_abs_err_2 < 1e-5
    assert max_rel_err_2 < 1e-4

    # gradient w.r.t. attention weights
    assert torch.allclose(grad_attn_weight_npu, grad_attn_weight)
    max_abs_err_3 = (grad_attn_weight_npu - grad_attn_weight).abs().max()
    max_rel_err_3 = ((grad_attn_weight_npu - grad_attn_weight).abs() /
                     grad_attn_weight.abs()).max()
    assert max_abs_err_3 < 1e-5
    assert max_rel_err_3 < 1e-4


================================================
FILE: tests/test_ops/test_nms.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import mmengine
import numpy as np
import pytest
import torch

from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE


class Testnms:
    """Tests for ``nms``, ``soft_nms``, ``nms_match`` and ``batched_nms``."""

    @pytest.mark.parametrize('device', [
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
    ])
    def test_nms_allclose(self, device):
        """``nms`` returns the expected result on CPU and on ``device``."""
        from mmcv.ops import nms
        np_boxes = np.array([[6.0, 3.0, 8.0, 7.0], [3.0, 6.0, 9.0, 11.0],
                             [3.0, 7.0, 10.0, 12.0], [1.0, 4.0, 13.0, 7.0]],
                            dtype=np.float32)
        np_scores = np.array([0.6, 0.9, 0.7, 0.2], dtype=np.float32)
        np_inds = np.array([1, 0, 3])
        np_dets = np.array([[3.0, 6.0, 9.0, 11.0, 0.9],
                            [6.0, 3.0, 8.0, 7.0, 0.6],
                            [1.0, 4.0, 13.0, 7.0, 0.2]])
        boxes = torch.from_numpy(np_boxes)
        scores = torch.from_numpy(np_scores)
        dets, inds = nms(boxes, scores, iou_threshold=0.3, offset=0)
        assert np.allclose(dets, np_dets)  # test cpu
        assert np.allclose(inds, np_inds)  # test cpu
        dets, inds = nms(
            boxes.to(device), scores.to(device), iou_threshold=0.3, offset=0)
        assert np.allclose(dets.cpu().numpy(), np_dets)  # test gpu
        assert np.allclose(inds.cpu().numpy(), np_inds)  # test gpu

    def test_softnms_allclose(self):
        """``soft_nms`` matches the stored results for every method."""
        if not torch.cuda.is_available():
            # Report a skip instead of silently passing without running.
            pytest.skip('requires CUDA support')
        from mmcv.ops import soft_nms
        np_boxes = np.array([[6.0, 3.0, 8.0, 7.0], [3.0, 6.0, 9.0, 11.0],
                             [3.0, 7.0, 10.0, 12.0], [1.0, 4.0, 13.0, 7.0]],
                            dtype=np.float32)
        np_scores = np.array([0.6, 0.9, 0.7, 0.2], dtype=np.float32)

        # Expected detections and kept indices, keyed by soft-NMS method.
        np_output = {
            'linear': {
                'dets':
                np.array(
                    [[3., 6., 9., 11., 0.9], [6., 3., 8., 7., 0.6],
                     [3., 7., 10., 12., 0.29024392], [1., 4., 13., 7., 0.2]],
                    dtype=np.float32),
                'inds':
                np.array([1, 0, 2, 3], dtype=np.int64)
            },
            'gaussian': {
                'dets':
                np.array([[3., 6., 9., 11., 0.9], [6., 3., 8., 7., 0.59630775],
                          [3., 7., 10., 12., 0.35275510],
                          [1., 4., 13., 7., 0.18650459]],
                         dtype=np.float32),
                'inds':
                np.array([1, 0, 2, 3], dtype=np.int64)
            },
            'naive': {
                'dets':
                np.array([[3., 6., 9., 11., 0.9], [6., 3., 8., 7., 0.6],
                          [1., 4., 13., 7., 0.2]],
                         dtype=np.float32),
                'inds':
                np.array([1, 0, 3], dtype=np.int64)
            }
        }

        boxes = torch.from_numpy(np_boxes)
        scores = torch.from_numpy(np_scores)

        # (iou_threshold, sigma, min_score, method)
        configs = [[0.3, 0.5, 0.01, 'linear'], [0.3, 0.5, 0.01, 'gaussian'],
                   [0.3, 0.5, 0.01, 'naive']]

        def check_all_methods(boxes, scores):
            for iou, sig, mscore, m in configs:
                dets, inds = soft_nms(
                    boxes,
                    scores,
                    iou_threshold=iou,
                    sigma=sig,
                    min_score=mscore,
                    method=m)
                assert np.allclose(dets.cpu().numpy(), np_output[m]['dets'])
                assert np.allclose(inds.cpu().numpy(), np_output[m]['inds'])

        # CPU tensors first, then the same inputs moved to CUDA. The CUDA
        # pass is not run under parrots.
        check_all_methods(boxes, scores)
        if torch.__version__ != 'parrots':
            check_all_methods(boxes.cuda(), scores.cuda())

    def test_nms_match(self):
        """``nms_match`` groups detections consistently with ``nms``."""
        if not torch.cuda.is_available():
            # Report a skip instead of silently passing without running.
            pytest.skip('requires CUDA support')
        from mmcv.ops import nms, nms_match
        iou_thr = 0.6
        # empty input
        empty_dets = np.array([])
        assert len(nms_match(empty_dets, iou_thr)) == 0

        # non empty ndarray input
        np_dets = np.array(
            [[49.1, 32.4, 51.0, 35.9, 0.9], [49.3, 32.9, 51.0, 35.3, 0.9],
             [35.3, 11.5, 39.9, 14.5, 0.4], [35.2, 11.7, 39.7, 15.7, 0.3]],
            dtype=np.float32)
        np_groups = nms_match(np_dets, iou_thr)
        assert isinstance(np_groups[0], np.ndarray)
        assert len(np_groups) == 2
        tensor_dets = torch.from_numpy(np_dets)
        boxes = tensor_dets[:, :4]
        scores = tensor_dets[:, 4]
        nms_keep_inds = nms(boxes.contiguous(), scores.contiguous(),
                            iou_thr)[1]
        # The first index of every group must be a box kept by plain nms.
        assert {g[0].item() for g in np_groups} == set(nms_keep_inds.tolist())

        # non empty tensor input
        tensor_dets = torch.from_numpy(np_dets)
        tensor_groups = nms_match(tensor_dets, iou_thr)
        assert isinstance(tensor_groups[0], torch.Tensor)
        # Tensor and ndarray inputs must give the same groups.
        assert len(tensor_groups) == len(np_groups)
        for tensor_group, np_group in zip(tensor_groups, np_groups):
            assert np.equal(tensor_group.numpy(), np_group).all()

        # input of wrong shape
        wrong_dets = np.zeros((2, 3))
        with pytest.raises(AssertionError):
            nms_match(wrong_dets, iou_thr)

    def test_batched_nms(self):
        """``batched_nms`` gives the same result with and without splitting."""
        from mmcv.ops import batched_nms
        # NOTE: the path is relative to the current working directory.
        results = mmengine.load('./tests/data/batched_nms_data.pkl')
        boxes_in = torch.from_numpy(results['boxes'])
        scores_in = torch.from_numpy(results['scores'])
        idxs_in = torch.from_numpy(results['idxs'])

        nms_max_num = 100
        nms_cfg = dict(
            type='nms',
            iou_threshold=0.7,
            score_threshold=0.5,
            max_num=nms_max_num)
        boxes, keep = batched_nms(
            boxes_in, scores_in, idxs_in, nms_cfg, class_agnostic=False)

        # Same configuration, now with ``split_thr`` set.
        nms_cfg.update(split_thr=100)
        seq_boxes, seq_keep = batched_nms(
            boxes_in, scores_in, idxs_in, nms_cfg, class_agnostic=False)

        assert torch.equal(keep, seq_keep)
        assert torch.equal(boxes, seq_boxes)
        assert torch.equal(keep,
                           torch.from_numpy(results['keep'][:nms_max_num]))

        nms_cfg = dict(type='soft_nms', iou_threshold=0.7)
        boxes, keep = batched_nms(
            boxes_in, scores_in, idxs_in, nms_cfg, class_agnostic=False)

        nms_cfg.update(split_thr=100)
        seq_boxes, seq_keep = batched_nms(
            boxes_in, scores_in, idxs_in, nms_cfg, class_agnostic=False)

        assert torch.equal(keep, seq_keep)
        assert torch.equal(boxes, seq_boxes)

        # test skip nms when `nms_cfg` is None
        seq_boxes, seq_keep = batched_nms(
            boxes_in, scores_in, idxs_in, None, class_agnostic=False)
        assert len(seq_keep) == len(results['boxes'])
        # assert score is descending order
        assert ((seq_boxes[:, -1][1:] - seq_boxes[:, -1][:-1]) < 0).all()


================================================
FILE: tests/test_ops/test_nms_quadri.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch

from mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE


class TestNMSQuadri:
    """Checks ``nms_quadri`` directly and through ``batched_nms``."""

    # Device matrix shared by every test in this class.
    _on_devices = pytest.mark.parametrize('device', [
        'cpu',
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
    ])

    @staticmethod
    def _quads_with_scores():
        """Return a fresh array of four rows: eight box values and a score."""
        return np.array([[1.0, 1.0, 3.0, 4.0, 4.0, 4.0, 4.0, 1.0, 0.7],
                         [2.0, 2.0, 3.0, 4.0, 4.0, 2.0, 3.0, 1.0, 0.8],
                         [7.0, 7.0, 8.0, 8.0, 9.0, 7.0, 8.0, 6.0, 0.5],
                         [0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 2.0, 0.0, 0.9]],
                        dtype=np.float32)

    @_on_devices
    def test_ml_nms_quadri(self, device):
        """``nms_quadri`` called with a per-box label tensor."""
        from mmcv.ops import nms_quadri

        want_dets = np.array([[0., 0., 0., 2., 2., 2., 2., 0.],
                              [2., 2., 3., 4., 4., 2., 3., 1.],
                              [7., 7., 8., 8., 9., 7., 8., 6.]],
                             dtype=np.float32)
        want_keep = np.array([3, 1, 2], dtype=np.int64)

        quads = torch.from_numpy(self._quads_with_scores()).to(device)
        labels = torch.from_numpy(np.array([1, 0, 1, 0],
                                           dtype=np.float32)).to(device)

        dets, keep_inds = nms_quadri(quads[:, :8], quads[:, -1], 0.3, labels)

        # Only the eight box columns of the result are compared.
        assert np.allclose(dets.cpu().numpy()[:, :8], want_dets)
        assert np.allclose(keep_inds.cpu().numpy(), want_keep)

    @_on_devices
    def test_nms_quadri(self, device):
        """``nms_quadri`` called without labels."""
        from mmcv.ops import nms_quadri

        want_dets = np.array([[0., 0., 0., 2., 2., 2., 2., 0.],
                              [2., 2., 3., 4., 4., 2., 3., 1.],
                              [7., 7., 8., 8., 9., 7., 8., 6.]],
                             dtype=np.float32)
        want_keep = np.array([3, 1, 2], dtype=np.int64)

        quads = torch.from_numpy(self._quads_with_scores()).to(device)

        dets, keep_inds = nms_quadri(quads[:, :8], quads[:, -1], 0.3)
        assert np.allclose(dets.cpu().numpy()[:, :8], want_dets)
        assert np.allclose(keep_inds.cpu().numpy(), want_keep)

    @_on_devices
    def test_batched_nms(self, device):
        """``batched_nms`` configured to use ``nms_quadri``."""
        # test batched_nms with nms_quadri
        from mmcv.ops import batched_nms

        np_boxes = self._quads_with_scores()
        np_labels = np.array([1, 0, 1, 0], dtype=np.float32)
        nms_cfg = dict(type='nms_quadri', iou_threshold=0.3)

        def run(class_agnostic):
            # Fresh tensors are built for every call.
            return batched_nms(
                torch.from_numpy(np_boxes[:, :8]).to(device),
                torch.from_numpy(np_boxes[:, -1]).to(device),
                torch.from_numpy(np_labels).to(device),
                nms_cfg,
                class_agnostic=class_agnostic)

        # test class_agnostic is True
        want_agnostic_dets = np.array([[0., 0., 0., 2., 2., 2., 2., 0.],
                                       [2., 2., 3., 4., 4., 2., 3., 1.],
                                       [7., 7., 8., 8., 9., 7., 8., 6.]],
                                      dtype=np.float32)
        want_agnostic_keep = np.array([3, 1, 2], dtype=np.int64)
        boxes, keep = run(True)
        assert np.allclose(boxes.cpu().numpy()[:, :8], want_agnostic_dets)
        assert np.allclose(keep.cpu().numpy(), want_agnostic_keep)

        # test class_agnostic is False
        want_dets = np.array([[0., 0., 0., 2., 2., 2., 2., 0.],
                              [2., 2., 3., 4., 4., 2., 3., 1.],
                              [1., 1., 3., 4., 4., 4., 4., 1.],
                              [7., 7., 8., 8., 9., 7., 8., 6.]],
                             dtype=np.float32)
        want_keep = np.array([3, 1, 0, 2], dtype=np.int64)
        boxes, keep = run(False)
        assert np.allclose(boxes.cpu().numpy()[:, :8], want_dets)
        assert np.allclose(keep.cpu().numpy(), want_keep)


================================================
FILE: tests/test_ops/test_nms_rotated.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch

from mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,
                        IS_NPU_AVAILABLE)


class TestNmsRotated:
    """Exercise ``nms_rotated`` directly and through ``batched_nms``.

    All cases start from the same four rows of six float32 values: the first
    five columns are passed as the boxes and the last column as the scores.
    The counter-clockwise variants negate the second-to-last column before
    the call and negate it back on the result before comparing.
    """

    # Shared ``device`` parametrization: one entry per accelerator backend,
    # each skipped when its availability flag is falsy.
    _DEVICE_PARAMS = [
        pytest.param(
            'npu',
            marks=pytest.mark.skipif(
                not IS_NPU_AVAILABLE, reason='requires NPU support')),
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
    ]

    @staticmethod
    def _scored_boxes():
        """Return a fresh float32 array holding the four input rows."""
        rows = [
            [6.0, 3.0, 8.0, 7.0, 0.5, 0.7],
            [3.0, 6.0, 9.0, 11.0, 0.6, 0.8],
            [3.0, 7.0, 10.0, 12.0, 0.3, 0.5],
            [1.0, 4.0, 13.0, 7.0, 0.6, 0.9],
        ]
        return np.array(rows, dtype=np.float32)

    @staticmethod
    def _three_survivors():
        """Return the expected boxes and indices when rows 3, 1, 0 are kept.
        """
        kept_rows = [
            [1.0, 4.0, 13.0, 7.0, 0.6],
            [3.0, 6.0, 9.0, 11.0, 0.6],
            [6.0, 3.0, 8.0, 7.0, 0.5],
        ]
        return (np.array(kept_rows, dtype=np.float32),
                np.array([3, 1, 0], dtype=np.int64))

    @staticmethod
    def _assert_result(dets, keep_inds, want_dets, want_inds):
        """Compare the first five det columns and the kept indices."""
        assert np.allclose(dets.cpu().numpy()[:, :5], want_dets)
        assert np.allclose(keep_inds.cpu().numpy(), want_inds)

    @pytest.mark.parametrize('device', _DEVICE_PARAMS)
    def test_ml_nms_rotated(self, device):
        """Run ``nms_rotated`` with per-box labels on ``device``."""
        from mmcv.ops import nms_rotated

        want_dets, want_inds = self._three_survivors()
        boxes = torch.from_numpy(self._scored_boxes()).to(device)
        labels = torch.from_numpy(
            np.array([1, 0, 1, 0], dtype=np.float32)).to(device)

        # test cw angle definition
        dets, keep_inds = nms_rotated(boxes[:, :5], boxes[:, -1], 0.5, labels)
        self._assert_result(dets, keep_inds, want_dets, want_inds)

        # test ccw angle definition
        boxes[..., -2] *= -1
        dets, keep_inds = nms_rotated(
            boxes[:, :5], boxes[:, -1], 0.5, labels, clockwise=False)
        dets[..., -2] *= -1
        self._assert_result(dets, keep_inds, want_dets, want_inds)

    @pytest.mark.parametrize('device', _DEVICE_PARAMS)
    def test_nms_rotated(self, device):
        """Run ``nms_rotated`` without labels on ``device``."""
        from mmcv.ops import nms_rotated

        want_dets, want_inds = self._three_survivors()
        boxes = torch.from_numpy(self._scored_boxes()).to(device)

        # test cw angle definition
        dets, keep_inds = nms_rotated(boxes[:, :5], boxes[:, -1], 0.5)
        self._assert_result(dets, keep_inds, want_dets, want_inds)

        # test ccw angle definition
        boxes[..., -2] *= -1
        dets, keep_inds = nms_rotated(
            boxes[:, :5], boxes[:, -1], 0.5, clockwise=False)
        dets[..., -2] *= -1
        self._assert_result(dets, keep_inds, want_dets, want_inds)

    def test_batched_nms(self):
        """Run ``batched_nms`` configured with ``nms_rotated`` on CPU tensors.
        """
        # test batched_nms with nms_rotated
        from mmcv.ops import batched_nms

        np_boxes = self._scored_boxes()
        np_labels = np.array([1, 0, 1, 0], dtype=np.float32)

        agnostic_dets, agnostic_inds = self._three_survivors()
        per_class_dets = np.array(
            [
                [1.0, 4.0, 13.0, 7.0, 0.6],
                [3.0, 6.0, 9.0, 11.0, 0.6],
                [6.0, 3.0, 8.0, 7.0, 0.5],
                [3.0, 7.0, 10.0, 12.0, 0.3],
            ],
            dtype=np.float32)
        per_class_inds = np.array([3, 1, 0, 2], dtype=np.int64)

        nms_cfg = dict(type='nms_rotated', iou_threshold=0.5)

        # class_agnostic=True is checked first, then class_agnostic=False
        scenarios = (
            (True, agnostic_dets, agnostic_inds),
            (False, per_class_dets, per_class_inds),
        )
        for class_agnostic, want_dets, want_inds in scenarios:
            boxes, keep = batched_nms(
                torch.from_numpy(np_boxes[:, :5]),
                torch.from_numpy(np_boxes[:, -1]),
                torch.from_numpy(np_labels),
                nms_cfg,
                class_agnostic=class_agnostic)
            self._assert_result(boxes, keep, want_dets, want_inds)


================================================
FILE: tests/test_ops/test_onnx.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import os

import numpy as np
import onnx
import pytest
import torch
import torch.nn as nn

# Path every test in this module exports its ONNX model to; the autouse
# fixture below removes the file before and after each test.
onnx_file = 'tmp.onnx'
# Skip the whole module when torch reports the 'parrots' version string.
if torch.__version__ == 'parrots':
    pytest.skip('not supported in parrots now', allow_module_level=True)


@pytest.fixture(autouse=True)
def run_before_and_after_test():
    """Remove ``onnx_file`` both before and after every test in the module.
    """

    def _discard_onnx_file():
        # Only delete when the file is actually present.
        if os.path.exists(onnx_file):
            os.remove(onnx_file)

    # clear onnx_file before test
    _discard_onnx_file()

    yield

    # clear onnx_file after test
    _discard_onnx_file()


class WrapFunction(nn.Module):
    """Expose a plain callable as an ``nn.Module``.

    ``forward`` passes every positional and keyword argument straight to the
    stored callable and returns its result unchanged.
    """

    def __init__(self, wrapped_function):
        """Store ``wrapped_function`` as the callable to delegate to."""
        super().__init__()
        self.wrapped_function = wrapped_function

    def forward(self, *args, **kwargs):
        """Call the stored callable with the given arguments."""
        target = self.wrapped_function
        return target(*args, **kwargs)


def test_roialign():
    """Compare ``roi_align`` in PyTorch with its ONNX export.

    Each case is exported to ``onnx_file`` with opset 11, executed with
    onnxruntime on the CPU provider and compared against the PyTorch result
    with ``atol=1e-3``. The test is skipped when onnxruntime or the
    ``roi_align`` op cannot be imported.
    """
    rt = pytest.importorskip('onnxruntime')
    try:
        from mmcv.ops import roi_align
    except (ImportError, ModuleNotFoundError):
        pytest.skip('roi_align op is not successfully compiled')

    # roi align config
    pool_h, pool_w = 2, 2
    spatial_scale, sampling_ratio = 1.0, 2
    out_size = (pool_w, pool_h)

    # (feature map, rois) pairs
    cases = [
        ([[[[1., 2.], [3., 4.]]]], [[0., 0., 0., 1., 1.]]),
        ([[[[1., 2.], [3., 4.]], [[4., 3.], [2., 1.]]]],
         [[0., 0., 0., 1., 1.]]),
        ([[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.],
            [11., 12., 15., 16.]]]], [[0., 0., 0., 3., 3.]]),
    ]

    def align(torch_input, torch_rois):
        return roi_align(torch_input, torch_rois, out_size, spatial_scale,
                         sampling_ratio, 'avg', True)

    for feat_values, roi_values in cases:
        feat = torch.from_numpy(np.array(feat_values, dtype=np.float32))
        rois = torch.from_numpy(np.array(roi_values, dtype=np.float32))

        # compute pytorch_output
        with torch.no_grad():
            pytorch_output = align(feat, rois)

        # export and load onnx model
        exportable = WrapFunction(align)
        with torch.no_grad():
            torch.onnx.export(
                exportable, (feat, rois),
                onnx_file,
                export_params=True,
                keep_initializers_as_inputs=True,
                input_names=['input', 'rois'],
                opset_version=11)

        onnx_model = onnx.load(onnx_file)
        session_options = rt.SessionOptions()

        # graph inputs that are not initializers must be exactly the two
        # tensors fed below
        graph = onnx_model.graph
        fed_names = ({node.name for node in graph.input} -
                     {node.name for node in graph.initializer})
        assert len(fed_names) == 2

        # compute onnx_output
        sess = rt.InferenceSession(
            onnx_file, session_options, providers=['CPUExecutionProvider'])
        feeds = {
            'input': feat.detach().numpy(),
            'rois': rois.detach().numpy(),
        }
        onnx_output = sess.run(None, feeds)[0]

        # allclose
        assert np.allclose(pytorch_output, onnx_output, atol=1e-3)


@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_roipool():
    """Compare ``roi_pool`` on CUDA with its ONNX export run on CPU.

    Each case is exported to ``onnx_file`` with opset 11, executed with
    onnxruntime on the CPU provider and compared against the PyTorch result
    with ``atol=1e-3``.
    """
    rt = pytest.importorskip('onnxruntime')
    from mmcv.ops import roi_pool

    # roi pool config
    pool_h, pool_w = 2, 2
    spatial_scale = 1.0
    out_size = (pool_w, pool_h)

    # (feature map, rois) pairs
    cases = [
        ([[[[1., 2.], [3., 4.]]]], [[0., 0., 0., 1., 1.]]),
        ([[[[1., 2.], [3., 4.]], [[4., 3.], [2., 1.]]]],
         [[0., 0., 0., 1., 1.]]),
        ([[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.],
            [11., 12., 15., 16.]]]], [[0., 0., 0., 3., 3.]]),
    ]

    def pool(torch_input, torch_rois):
        return roi_pool(torch_input, torch_rois, out_size, spatial_scale)

    for feat_values, roi_values in cases:
        feat = torch.from_numpy(
            np.array(feat_values, dtype=np.float32)).cuda()
        rois = torch.from_numpy(np.array(roi_values, dtype=np.float32)).cuda()

        # compute pytorch_output
        with torch.no_grad():
            pytorch_output = pool(feat, rois).cpu()

        # export and load onnx model
        exportable = WrapFunction(pool)
        with torch.no_grad():
            torch.onnx.export(
                exportable, (feat, rois),
                onnx_file,
                export_params=True,
                keep_initializers_as_inputs=True,
                input_names=['input', 'rois'],
                opset_version=11)
        onnx_model = onnx.load(onnx_file)

        # graph inputs that are not initializers must be exactly the two
        # tensors fed below
        graph = onnx_model.graph
        fed_names = ({node.name for node in graph.input} -
                     {node.name for node in graph.initializer})
        assert len(fed_names) == 2

        # compute onnx_output
        sess = rt.InferenceSession(
            onnx_file, providers=['CPUExecutionProvider'])
        feeds = {
            'input': feat.detach().cpu().numpy(),
            'rois': rois.detach().cpu().numpy(),
        }
        onnx_output = sess.run(None, feeds)[0]

        # allclose
        assert np.allclose(pytorch_output, onnx_output, atol=1e-3)


def _test_symbolic(model, inputs, symbol_name):
    """Export ``model`` to ONNX and assert a ``symbol_name`` node exists.

    Args:
        model: Module handed to ``torch.onnx.export``.
        inputs: Example inputs handed to ``torch.onnx.export``.
        symbol_name (str): ``op_type`` that at least one node of the exported
            graph must have.
    """
    with torch.no_grad():
        torch.onnx.export(model, inputs, onnx_file, opset_version=11)

    import onnx
    exported_graph = onnx.load(onnx_file).graph

    assert any(node.op_type == symbol_name for node in exported_graph.node)


@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_border_align():
    """The exported ``BorderAlign`` graph must hold an MMCVBorderAlign node.
    """
    from mmcv.ops import BorderAlign

    border_align = BorderAlign(2)
    feature = torch.rand(1, 8, 2, 2).cuda()
    bboxes = torch.rand(1, 4, 4).cuda()
    export_args = (feature, bboxes)
    _test_symbolic(border_align, export_args, 'MMCVBorderAlign')


@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_carafe():
    """The exported ``CARAFENaive`` graph must hold an MMCVCARAFENaive node.
    """
    from mmcv.ops import CARAFENaive

    features = torch.randn(2, 64, 3, 3, device='cuda').double()
    masks = torch.randn(2, 100, 6, 6, device='cuda').sigmoid().double()
    upsampler = CARAFENaive(5, 4, 2)
    _test_symbolic(upsampler, (features, masks), 'MMCVCARAFENaive')


@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_deform_conv():
    """The exported ``DeformConv2dPack`` must hold an MMCVDeformConv2d node.
    """
    from mmcv.ops import DeformConv2dPack

    feature = torch.randn(1, 2, 4, 4, device='cuda')
    conv = DeformConv2dPack(2, 4, 3, 1, 1).cuda()
    _test_symbolic(conv, (feature, ), 'MMCVDeformConv2d')


@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_modulated_deform_conv():
    """The exported ``ModulatedDeformConv2dPack`` must hold an
    MMCVModulatedDeformConv2d node.
    """
    from mmcv.ops import ModulatedDeformConv2dPack

    feature = torch.randn(1, 2, 4, 4, device='cuda')
    conv = ModulatedDeformConv2dPack(2, 4, 3, 1, 1).cuda()
    # The tensor is passed bare (not wrapped in a tuple), as before.
    _test_symbolic(conv, feature, 'MMCVModulatedDeformConv2d')


@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_deform_roi_pool():
    """The exported ``DeformRoIPoolPack`` must hold an MMCVDeformRoIPool node.
    """
    from mmcv.ops import DeformRoIPoolPack

    feature = torch.tensor([[[[1., 2.], [3., 4.]]]], device='cuda')
    rois = torch.tensor([[0., 0., 0., 1., 1.]], device='cuda')

    # The second constructor argument is taken from dimension 1 of the input.
    output_c = feature.size(1)
    pool_h, pool_w = 2, 2
    pool_kwargs = dict(spatial_scale=1.0, sampling_ratio=2)
    model = DeformRoIPoolPack((pool_h, pool_w), output_c,
                              **pool_kwargs).cuda()

    _test_symbolic(model, (feature, rois), 'MMCVDeformRoIPool')


@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_masked_conv():
    """The exported ``MaskedConv2d`` must hold an MMCVMaskedConv2d node."""
    from mmcv.ops import MaskedConv2d

    feature = torch.rand(1, 2, 4, 4, device='cuda')
    mask = torch.rand(1, 4, 4, device='cuda')
    conv = MaskedConv2d(2, 4, 3, 1, 1).cuda()
    _test_symbolic(conv, (feature, mask), 'MMCVMaskedConv2d')


@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_pr_roi_pool():
    """The exported ``PrRoIPool`` graph must hold a PrRoIPool node."""
    from mmcv.ops import PrRoIPool

    out_size = (2, 2)  # (pool_h, pool_w)
    spatial_scale = 1.0
    feature = torch.tensor([[[[1., 2.], [3., 4.]]]], device='cuda')
    rois = torch.tensor([[0., 0., 0., 1., 1.]], device='cuda')
    pool = PrRoIPool(out_size, spatial_scale).cuda()
    _test_symbolic(pool, (feature, rois), 'PrRoIPool')


@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_psa_mask():
    """The exported ``PSAMask`` graph must hold an MMCVPSAMask node."""
    from mmcv.ops import PSAMask

    feature = torch.rand(4, 16, 8, 8).cuda()
    psa_mask = PSAMask('collect', (4, 4)).cuda()
    # The tensor is passed bare (not wrapped in a tuple), as before.
    _test_symbolic(psa_mask, feature, 'MMCVPSAMask')


@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_roi_align_rotated():
    """The exported ``RoIAlignRotated`` must hold an MMCVRoIAlignRotated node.
    """
    from mmcv.ops import RoIAlignRotated

    out_size = (2, 2)  # (pool_h, pool_w)
    spatial_scale, sampling_ratio = 1.0, 2
    feature = torch.tensor([[[[1., 2.], [3., 4.]]]], device='cuda')
    rois = torch.tensor([[0., 0.5, 0.5, 1., 1., 0]], device='cuda')
    aligner = RoIAlignRotated(out_size, spatial_scale, sampling_ratio).cuda()
    _test_symbolic(aligner, (feature, rois), 'MMCVRoIAlignRotated')


@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_roi_feaeture_align():
    """The exported ``rotated_feature_align`` function must hold an
    MMCVRotatedFeatureAlign node.
    """
    from mmcv.ops import rotated_feature_align

    # The op is a function, so it is wrapped in a module for the exporter.
    exportable = WrapFunction(rotated_feature_align)
    feature = torch.rand(1, 1, 2, 2, device='cuda')
    bbox = torch.rand(1, 2, 2, 5, device='cuda')
    _test_symbolic(exportable, (feature, bbox), 'MMCVRotatedFeatureAlign')


================================================
FILE: tests/test_ops/test_pixel_group.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch


def test_pixel_group():
    """Check ``pixel_group`` on a 10x10 fixture for numpy and tensor inputs.

    The same five arrays are passed first as numpy arrays and then as
    tensors created with ``torch.from_numpy``; both calls must yield the same
    three result entries.
    """
    from mmcv.ops import pixel_group

    # Row templates from which the 10x10 grids are assembled: rows 0-2 and
    # 7-9 are all zero, rows 3-6 carry the content.
    blank = [0] * 10
    scored = [0] + [0.9] * 8 + [0]
    labelled = [0, 0, 1, 1, 1, 0, 0, 0, 2, 0]
    contour_edge = [0, 0, 1, 1, 1, 0, 0, 0, 1, 0]
    contour_mid = [0, 0, 1, 0, 1, 0, 0, 0, 1, 0]

    def grid(content_rows):
        return [blank] * 3 + content_rows + [blank] * 3

    np_score = np.array(grid([scored] * 4)).astype(np.float32)
    np_mask = (np_score > 0.5)
    np_embedding = np.zeros((10, 10, 8)).astype(np.float32)
    np_embedding[:, :7] = 0.9
    np_embedding[:, 7:] = 10.0
    np_kernel_label = np.array(grid([labelled] * 4)).astype(np.int32)
    np_kernel_contour = np.array(
        grid([contour_edge, contour_mid, contour_mid,
              contour_edge])).astype(np.uint8)

    kernel_region_num = 3
    distance_threshold = float(0.8)

    gt_1 = [
        0.8999997973442078, 24.0,
        1.0, 3.0, 2.0, 3.0, 3.0, 3.0, 4.0, 3.0, 5.0, 3.0, 6.0, 3.0,
        1.0, 4.0, 2.0, 4.0, 3.0, 4.0, 4.0, 4.0, 5.0, 4.0, 6.0, 4.0,
        1.0, 5.0, 2.0, 5.0, 3.0, 5.0, 4.0, 5.0, 5.0, 5.0, 6.0, 5.0,
        1.0, 6.0, 2.0, 6.0, 3.0, 6.0, 4.0, 6.0, 5.0, 6.0, 6.0, 6.0,
    ]
    gt_2 = [
        0.9000000357627869, 8.0,
        7.0, 3.0, 8.0, 3.0, 7.0, 4.0, 8.0, 4.0,
        7.0, 5.0, 8.0, 5.0, 7.0, 6.0, 8.0, 6.0,
    ]

    def check(groups):
        assert np.allclose(groups[0], [0, 0])
        assert np.allclose(groups[1], gt_1)
        assert np.allclose(groups[2], gt_2)

    numpy_args = (np_score, np_mask, np_embedding, np_kernel_label,
                  np_kernel_contour)
    check(pixel_group(*numpy_args, kernel_region_num, distance_threshold))

    # test torch Tensor
    tensor_args = tuple(torch.from_numpy(arr) for arr in numpy_args)
    check(pixel_group(*tensor_args, kernel_region_num, distance_threshold))


================================================
FILE: tests/test_ops/test_points_in_polygons.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch

from mmcv.ops import points_in_polygons
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'npu',
        marks=pytest.mark.skipif(
            not IS_NPU_AVAILABLE, reason='requires NPU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
])
def test_points_in_polygons(device):
    """Check ``points_in_polygons`` for five points against three polygons.

    The expected result has one row per point and one column per polygon and
    is compared with a relative tolerance of ``1e-3``.
    """
    point_values = np.array([
        [300., 300.],
        [400., 400.],
        [100., 100],
        [300, 250],
        [100, 0],
    ])
    polygon_values = np.array([
        [200., 200., 400., 400., 500., 200., 400., 100.],
        [400., 400., 500., 500., 600., 300., 500., 200.],
        [300., 300., 600., 700., 700., 700., 700., 100.],
    ])
    expected_output = np.array([
        [0., 0., 0.],
        [0., 0., 1.],
        [0., 0., 0.],
        [1., 0., 0.],
        [0., 0., 0.],
    ]).astype(np.float32)

    points = torch.tensor(point_values, dtype=torch.float32, device=device)
    polygons = torch.tensor(
        polygon_values, dtype=torch.float32, device=device)

    result = points_in_polygons(points, polygons).cpu().numpy()
    assert np.allclose(result, expected_output, 1e-3)


================================================
FILE: tests/test_ops/test_prroi_pool.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch

from mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE

# Use parrots' gradcheck when parrots can be imported; otherwise fall back to
# torch's gradcheck and record that parrots is not in use.
_USING_PARROTS = True
try:
    from parrots.autograd import gradcheck
except ImportError:
    from torch.autograd import gradcheck

    _USING_PARROTS = False

# Test cases as (feature map, rois) pairs; the tests convert both members to
# float32 arrays before building tensors from them.
inputs = [([[[[1., 2.], [3., 4.]]]], [[0., 0., 0., 1., 1.]]),
          ([[[[1., 2.], [3., 4.]], [[4., 3.], [2.,
                                               1.]]]], [[0., 0., 0., 1., 1.]]),
          ([[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.],
              [11., 12., 15., 16.]]]], [[0., 0., 0., 3., 3.]])]
# Expected results, paired index-by-index with ``inputs``. Each entry is
# (expected output, expected gradient of the feature map, expected gradient
# of the rois) after backpropagating a tensor of ones.
outputs = [
    ([[[[1.75, 2.25], [2.75, 3.25]]]], [[[[1., 1.],
                                          [1., 1.]]]], [[0., 2., 4., 2., 4.]]),
    ([[[[1.75, 2.25], [2.75, 3.25]],
       [[3.25, 2.75], [2.25, 1.75]]]], [[[[1., 1.], [1., 1.]],
                                         [[1., 1.],
                                          [1., 1.]]]], [[0., 0., 0., 0., 0.]]),
    ([[[[3.75, 6.91666651],
        [10.08333302,
         13.25]]]], [[[[0.11111111, 0.22222224, 0.22222222, 0.11111111],
                       [0.22222224, 0.444444448, 0.44444448, 0.22222224],
                       [0.22222224, 0.44444448, 0.44444448, 0.22222224],
                       [0.11111111, 0.22222224, 0.22222224, 0.11111111]]]],
     [[0.0, 3.33333302, 6.66666603, 3.33333349, 6.66666698]])
]


class TestPrRoiPool:
    """Gradient and value checks for PrRoI pooling on accelerator devices.

    The cases come from the module-level ``inputs`` and ``outputs`` lists.
    """

    # Shared ``device`` parametrization, skipped per backend availability.
    _DEVICE_PARAMS = [
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
    ]

    @pytest.mark.parametrize('device', _DEVICE_PARAMS)
    def test_roipool_gradcheck(self, device):
        """Run ``gradcheck`` on ``PrRoIPool`` for every input case."""
        from mmcv.ops import PrRoIPool

        out_size = (2, 2)  # (pool_h, pool_w)
        spatial_scale = 1.0

        for feat_values, roi_values in inputs:
            x = torch.tensor(
                np.array(feat_values, dtype=np.float32),
                device=device,
                requires_grad=True)
            rois = torch.tensor(
                np.array(roi_values, dtype=np.float32), device=device)

            froipool = PrRoIPool(out_size, spatial_scale)

            # parrots and torch gradcheck take different keyword arguments
            if _USING_PARROTS:
                check_kwargs = dict(no_grads=[rois])
            else:
                check_kwargs = dict(eps=1e-2, atol=1e-2)
            gradcheck(froipool, (x, rois), **check_kwargs)

    def _test_roipool_allclose(self, device, dtype=torch.float):
        """Compare ``prroi_pool`` outputs and gradients with ``outputs``.

        Args:
            device: Device the input tensors are created on.
            dtype: Dtype of the input tensors. Defaults to ``torch.float``.
        """
        from mmcv.ops import prroi_pool

        out_size = (2, 2)  # (pool_h, pool_w)
        spatial_scale = 1.0

        def as_f32(values):
            return np.array(values, dtype=np.float32)

        for (feat_values, roi_values), expected in zip(inputs, outputs):
            want_out, want_x_grad, want_rois_grad = map(as_f32, expected)

            x = torch.tensor(
                as_f32(feat_values),
                dtype=dtype,
                device=device,
                requires_grad=True)
            rois = torch.tensor(
                as_f32(roi_values),
                dtype=dtype,
                device=device,
                requires_grad=True)

            pooled = prroi_pool(x, rois, out_size, spatial_scale)
            pooled.backward(torch.ones_like(pooled))

            # The gradients are only populated once backward() has run.
            comparisons = (
                (pooled, want_out),
                (x.grad, want_x_grad),
                (rois.grad, want_rois_grad),
            )
            for got, want in comparisons:
                assert np.allclose(got.data.cpu().numpy(), want, 1e-3)

    @pytest.mark.parametrize('device', _DEVICE_PARAMS)
    def test_roipool_allclose_float(self, device):
        """Run the value comparison with ``torch.float`` tensors."""
        self._test_roipool_allclose(device, dtype=torch.float)


================================================
FILE: tests/test_ops/test_psa_mask.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch
import torch.nn as nn

from mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,
                        IS_NPU_AVAILABLE)


class Loss(nn.Module):
    """Mean of the element-wise difference between input and target."""

    def __init__(self):
        super().__init__()

    def forward(self, input, target):
        """Flatten both tensors and return ``mean(input - target)``."""
        difference = input.view(-1) - target.view(-1)
        return difference.mean()


class TestPSAMask:
    """Compare ``PSAMask`` outputs with reference data stored on disk.

    Each mode is run on CPU first and then on the parametrized device; both
    forward results must match the reference values and shape, and a
    backward pass through ``Loss`` is executed after each forward.
    """

    # Shared ``device`` parametrization, skipped per backend availability.
    _DEVICE_PARAMS = [
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support')),
        pytest.param(
            'npu',
            marks=pytest.mark.skipif(
                not IS_NPU_AVAILABLE, reason='requires NPU support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
    ]

    def _check_mode(self, device, psa_type, expected_path):
        """Run ``PSAMask(psa_type, (4, 4))`` on CPU and then on ``device``.

        Args:
            device: Device for the second forward/backward pass.
            psa_type (str): First argument given to ``PSAMask``.
            expected_path (str): File with the float32 reference output.
        """
        from mmcv.ops import PSAMask

        criterion = Loss()

        raw_input = np.fromfile(
            'tests/data/for_psa_mask/psa_input.bin', dtype=np.float32)
        expected = np.fromfile(expected_path, dtype=np.float32)
        expected = expected.reshape((4, 64, 8, 8))

        label = torch.ones((4, 64, 8, 8))
        feats = torch.FloatTensor(raw_input.reshape((4, 16, 8, 8)))
        feats.requires_grad = True

        psamask = PSAMask(psa_type, (4, 4))

        # CPU pass
        prediction = psamask(feats)
        criterion(prediction, label).backward()
        prediction = prediction.detach().numpy()
        assert np.allclose(prediction, expected)
        assert prediction.shape == expected.shape

        psamask.to(device)
        feats = feats.to(device)
        label = label.to(device)

        # device pass
        prediction = psamask(feats)
        criterion(prediction, label).backward()
        prediction = prediction.detach().cpu().numpy()
        assert np.allclose(prediction, expected)
        assert prediction.shape == expected.shape

    @pytest.mark.parametrize('device', _DEVICE_PARAMS)
    def test_psa_mask_collect(self, device):
        """Check the 'collect' mode against its reference output."""
        self._check_mode(device, 'collect',
                         'tests/data/for_psa_mask/psa_output_collect.bin')

    @pytest.mark.parametrize('device', _DEVICE_PARAMS)
    def test_psa_mask_distribute(self, device):
        """Check the 'distribute' mode against its reference output."""
        self._check_mode(device, 'distribute',
                         'tests/data/for_psa_mask/psa_output_distribute.bin')


================================================
FILE: tests/test_ops/test_riroi_align_rotated.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch

from mmcv.ops import RiRoIAlignRotated

# Pick the gradcheck implementation based on whether torch reports the
# 'parrots' version string, and record which one is in use.
if torch.__version__ == 'parrots':
    from parrots.autograd import gradcheck
    _USING_PARROTS = True
else:
    from torch.autograd import gradcheck
    _USING_PARROTS = False

# Input feature map shared by both tests below.
np_feature = np.array([[[[1, 2], [3, 4]], [[1, 2], [4, 3]], [[4, 3], [2, 1]],
                        [[1, 2], [5, 6]], [[3, 4], [7, 8]], [[9, 10], [13,
                                                                       14]],
                        [[11, 12], [15, 16]], [[1, 1], [2, 2]]]])
# Two rois of six values each; the last value is given as a fraction of pi
# (presumably the rotation angle -- confirm against the op's documentation).
np_rois = np.array([[0., 0.5, 0.5, 1., 1., np.pi / 3],
                    [0., 1., 1., 3., 3., np.pi / 2]])
# Reference forward output, compared with atol=1e-3 in
# test_roialign_rotated_allclose.
expect_output = np.array([[[[1.8425, 1.3516], [2.3151, 1.8241]],
                           [[2.4779, 1.7416], [3.2173, 2.5632]],
                           [[2.7149, 2.2638], [2.6540, 2.3673]],
                           [[2.9461, 2.8638], [2.8028, 2.7205]],
                           [[4.1943, 2.7214], [5.6119, 4.1391]],
                           [[7.5276, 6.0547], [8.9453, 7.4724]],
                           [[12.1943, 10.7214], [13.6119, 12.1391]],
                           [[9.5489, 8.4237], [10.5763, 9.4511]]],
                          [[[7.6562, 12.5625], [4.0000, 6.6250]],
                           [[1.0000, 1.3125], [0.5000, 0.6562]],
                           [[1.6562, 1.9375], [1.0000, 1.3125]],
                           [[1.8438, 2.0547], [0.7500, 1.1562]],
                           [[0.8438, 3.0625], [0.2500, 1.1875]],
                           [[2.6562, 2.5625], [1.5000, 1.6250]],
                           [[3.6562, 4.5625], [2.0000, 2.6250]],
                           [[6.6562, 10.5625], [3.5000, 5.6250]]]])

# Reference gradient of the feature map after backpropagating a tensor of
# ones, compared with atol=1e-3 in test_roialign_rotated_allclose.
expect_grad = np.array([[[[1.4727, 1.5586], [1.5586, 1.6602]],
                         [[1.4727, 1.5586], [1.5586, 1.6602]],
                         [[1.4727, 1.5586], [1.5586, 1.6602]],
                         [[1.4727, 1.5586], [1.5586, 1.6602]],
                         [[1.4727, 1.5586], [1.5586, 1.6602]],
                         [[1.4727, 1.5586], [1.5586, 1.6602]],
                         [[1.4727, 1.5586], [1.5586, 1.6602]],
                         [[1.4727, 1.5586], [1.5586, 1.6602]]]])

# Arguments passed to RiRoIAlignRotated by the tests below, in the order
# ((pool_h, pool_w), spatial_scale, num_samples, num_orientations,
# clockwise).
pool_h = 2
pool_w = 2
spatial_scale = 1.0
num_samples = 2
# NOTE(review): sampling_ratio is not referenced by either test in this
# module; the tests pass num_samples instead.
sampling_ratio = 2
num_orientations = 8
clockwise = False


@pytest.mark.skipif(
    not torch.cuda.is_available(), reason='requires CUDA support')
def test_roialign_rotated_gradcheck():
    """Numerically verify the gradient of RiRoIAlignRotated on CUDA."""
    rois = torch.tensor(np_rois, dtype=torch.float, device='cuda')
    feature = torch.tensor(
        np_feature, dtype=torch.float, device='cuda', requires_grad=True)
    pooler = RiRoIAlignRotated((pool_h, pool_w), spatial_scale, num_samples,
                               num_orientations, clockwise)
    op_inputs = (feature, rois)

    # The two gradcheck implementations take differently named keyword
    # arguments, so dispatch on the one that was imported.
    if not _USING_PARROTS:
        gradcheck(pooler, op_inputs, eps=1e-3, atol=1e-3)
        return
    gradcheck(pooler, op_inputs, no_grads=[rois], delta=1e-3, pt_atol=1e-3)


@pytest.mark.skipif(
    not torch.cuda.is_available(), reason='requires CUDA support')
def test_roialign_rotated_allclose():
    """Compare the forward output and input gradient with stored values."""
    rois = torch.tensor(np_rois, dtype=torch.float, device='cuda')
    feature = torch.tensor(
        np_feature, dtype=torch.float, device='cuda', requires_grad=True)
    pooler = RiRoIAlignRotated((pool_h, pool_w), spatial_scale, num_samples,
                               num_orientations, clockwise)

    pooled = pooler(feature, rois)
    # Back-propagate an all-ones gradient so that ``feature.grad`` can be
    # compared with the stored reference gradient.
    pooled.backward(torch.ones_like(pooled))

    actual_output = pooled.data.type(torch.float).cpu().numpy()
    assert np.allclose(actual_output, expect_output, atol=1e-3)
    actual_grad = feature.grad.data.type(torch.float).cpu().numpy()
    assert np.allclose(actual_grad, expect_grad, atol=1e-3)


================================================
FILE: tests/test_ops/test_roi_align.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch

from mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,
                        IS_NPU_AVAILABLE)

# Prefer the parrots gradcheck and fall back to the PyTorch one when parrots
# cannot be imported. _USING_PARROTS records which of the two was imported.
_USING_PARROTS = True
try:
    from parrots.autograd import gradcheck
except ImportError:
    from torch.autograd import gradcheck
    _USING_PARROTS = False

# yapf:disable

# Test cases: each entry is a (feature map, rois) pair. The helpers below
# read them as case[0] and case[1].
inputs = [([[[[1., 2.], [3., 4.]]]],
           [[0., 0., 0., 1., 1.]]),
          ([[[[1., 2.], [3., 4.]],
             [[4., 3.], [2., 1.]]]],
           [[0., 0., 0., 1., 1.]]),
          ([[[[1., 2., 5., 6.], [3., 4., 7., 8.],
              [9., 10., 13., 14.], [11., 12., 15., 16.]]]],
           [[0., 0., 0., 3., 3.]])]
# Reference results, aligned with ``inputs``: each entry is an
# (expected output, expected input gradient) pair.
outputs = [([[[[1.0, 1.25], [1.5, 1.75]]]],
            [[[[3.0625, 0.4375], [0.4375, 0.0625]]]]),
           ([[[[1.0, 1.25], [1.5, 1.75]],
              [[4.0, 3.75], [3.5, 3.25]]]],
            [[[[3.0625, 0.4375], [0.4375, 0.0625]],
              [[3.0625, 0.4375], [0.4375, 0.0625]]]]),
           ([[[[1.9375, 4.75], [7.5625, 10.375]]]],
            [[[[0.47265625, 0.42968750, 0.42968750, 0.04296875],
               [0.42968750, 0.39062500, 0.39062500, 0.03906250],
               [0.42968750, 0.39062500, 0.39062500, 0.03906250],
               [0.04296875, 0.03906250, 0.03906250, 0.00390625]]]])]
# yapf:enable

# Pooling configuration used by _test_roialign_gradcheck.
pool_h = 2
pool_w = 2
spatial_scale = 1.0
sampling_ratio = 2


def _test_roialign_gradcheck(device, dtype):
    """Numerically verify the RoIAlign gradient for every case in ``inputs``.

    The test is skipped when the op cannot be imported or when ``dtype`` is
    ``torch.half``.

    Args:
        device (str): Device on which the test tensors are created.
        dtype (torch.dtype): Floating point type of the test tensors.
    """
    try:
        from mmcv.ops import RoIAlign
    except ModuleNotFoundError:
        pytest.skip('RoIAlign op is not successfully compiled')
    if dtype is torch.half:
        pytest.skip('grad check does not support fp16')
    for case in inputs:
        np_input = np.array(case[0])
        np_rois = np.array(case[1])

        x = torch.tensor(
            np_input, dtype=dtype, device=device, requires_grad=True)
        rois = torch.tensor(np_rois, dtype=dtype, device=device)

        froipool = RoIAlign((pool_h, pool_w), spatial_scale, sampling_ratio)

        # Choose the keyword names from the flag that records which
        # ``gradcheck`` was imported at module level, so the call always
        # matches the imported implementation.
        if _USING_PARROTS:
            gradcheck(
                froipool, (x, rois), no_grads=[rois], delta=1e-5, pt_atol=1e-5)
        else:
            gradcheck(froipool, (x, rois), eps=1e-5, atol=1e-5)


def _test_roialign_allclose(device, dtype):
    """Check ``roi_align`` output and input gradient against stored values.

    Args:
        device (str): Device on which the test tensors are created.
        dtype (torch.dtype): Floating point type of the test tensors.
    """
    try:
        from mmcv.ops import roi_align
    except ModuleNotFoundError:
        pytest.skip('test requires compilation')
    pool_h, pool_w = 2, 2
    spatial_scale = 1.0
    sampling_ratio = 2
    output_size = (pool_h, pool_w)

    for (feat, boxes), (want_output, want_grad) in zip(inputs, outputs):
        np_output = np.array(want_output)
        np_grad = np.array(want_grad)

        x = torch.tensor(
            np.array(feat), dtype=dtype, device=device, requires_grad=True)
        rois = torch.tensor(np.array(boxes), dtype=dtype, device=device)

        pooled = roi_align(x, rois, output_size, spatial_scale,
                           sampling_ratio, 'avg', True)
        # Back-propagate an all-ones gradient to populate ``x.grad``.
        pooled.backward(torch.ones_like(pooled))

        actual_output = pooled.data.type(torch.float).cpu().numpy()
        assert np.allclose(actual_output, np_output, atol=1e-3)
        actual_grad = x.grad.data.type(torch.float).cpu().numpy()
        assert np.allclose(actual_grad, np_grad, atol=1e-3)


@pytest.mark.parametrize('dtype', [torch.float, torch.half])
@pytest.mark.parametrize('device', [
    'cpu',
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'mlu',
        marks=pytest.mark.skipif(
            not IS_MLU_AVAILABLE, reason='requires MLU support')),
    pytest.param(
        'npu',
        marks=pytest.mark.skipif(
            not IS_NPU_AVAILABLE, reason='requires NPU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
])
def test_roialign_float(device, dtype):
    """Run the value comparison for ``torch.float`` and ``torch.half``.

    'cpu' is always exercised; each accelerator device is skipped when the
    corresponding availability flag is false.
    """
    _test_roialign_allclose(device=device, dtype=dtype)


@pytest.mark.parametrize('device', [
    'cpu',
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
])
def test_roialign_float64(device):
    """Run the value comparison and the gradient check in ``torch.double``.

    Only 'cpu' and, when available, 'cuda' are parametrized here.
    """
    _test_roialign_allclose(device=device, dtype=torch.double)
    _test_roialign_gradcheck(device=device, dtype=torch.double)


================================================
FILE: tests/test_ops/test_roi_align_rotated.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch

from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE

# Prefer the parrots gradcheck and fall back to the PyTorch one when parrots
# cannot be imported. _USING_PARROTS records which of the two was imported.
_USING_PARROTS = True
try:
    from parrots.autograd import gradcheck
except ImportError:
    from torch.autograd import gradcheck
    _USING_PARROTS = False
# yapf:disable
# Test cases: each entry is a (feature map, rois) pair. Every roi row has six
# values; the last one is 0 or np.pi / 2 and is presumably the rotation angle
# in radians -- TODO confirm against the op documentation.
inputs = [([[[[1., 2.], [3., 4.]]]],
           [[0., 0.5, 0.5, 1., 1., 0]]),
          ([[[[1., 2.], [3., 4.]]]],
           [[0., 0.5, 0.5, 1., 1., np.pi / 2]]),
          ([[[[1., 2.], [3., 4.]],
             [[4., 3.], [2., 1.]]]],
           [[0., 0.5, 0.5, 1., 1., 0]]),
          ([[[[1., 2., 5., 6.], [3., 4., 7., 8.],
              [9., 10., 13., 14.], [11., 12., 15., 16.]]]],
           [[0., 1.5, 1.5, 3., 3., 0]]),
          ([[[[1., 2., 5., 6.], [3., 4., 7., 8.],
              [9., 10., 13., 14.], [11., 12., 15., 16.]]]],
           [[0., 1.5, 1.5, 3., 3., np.pi / 2]])]
# Reference results, aligned with ``inputs``: each entry is an
# (expected output, expected input gradient) pair.
outputs = [([[[[1.0, 1.25], [1.5, 1.75]]]],
            [[[[3.0625, 0.4375], [0.4375, 0.0625]]]]),
           ([[[[1.5, 1], [1.75, 1.25]]]],
            [[[[3.0625, 0.4375], [0.4375, 0.0625]]]]),
           ([[[[1.0, 1.25], [1.5, 1.75]],
              [[4.0, 3.75], [3.5, 3.25]]]],
            [[[[3.0625, 0.4375], [0.4375, 0.0625]],
              [[3.0625, 0.4375], [0.4375, 0.0625]]]]),
           ([[[[1.9375, 4.75], [7.5625, 10.375]]]],
            [[[[0.47265625, 0.42968750, 0.42968750, 0.04296875],
               [0.42968750, 0.39062500, 0.39062500, 0.03906250],
               [0.42968750, 0.39062500, 0.39062500, 0.03906250],
               [0.04296875, 0.03906250, 0.03906250, 0.00390625]]]]),
           ([[[[7.5625, 1.9375], [10.375, 4.75]]]],
            [[[[0.47265625, 0.42968750, 0.42968750, 0.04296875],
               [0.42968750, 0.39062500, 0.39062500, 0.03906250],
               [0.42968750, 0.39062500, 0.39062500, 0.03906250],
               [0.04296875, 0.03906250, 0.03906250, 0.00390625]]]])]
# yapf:enable

# Pooling configuration used by _test_roialign_rotated_gradcheck.
pool_h = 2
pool_w = 2
spatial_scale = 1.0
sampling_ratio = 2


def _test_roialign_rotated_gradcheck(device, dtype):
    """Numerically verify the RoIAlignRotated gradient for all ``inputs``.

    The test is skipped when the op cannot be imported or when ``dtype`` is
    ``torch.half``.

    Args:
        device (str): Device on which the test tensors are created.
        dtype (torch.dtype): Floating point type of the test tensors.
    """
    try:
        from mmcv.ops import RoIAlignRotated
    except ModuleNotFoundError:
        pytest.skip('RoIAlignRotated op is not successfully compiled')
    if dtype is torch.half:
        pytest.skip('grad check does not support fp16')
    for case in inputs:
        np_input = np.array(case[0])
        np_rois = np.array(case[1])

        x = torch.tensor(
            np_input, dtype=dtype, device=device, requires_grad=True)
        rois = torch.tensor(np_rois, dtype=dtype, device=device)

        froipool = RoIAlignRotated((pool_h, pool_w), spatial_scale,
                                   sampling_ratio)
        # Choose the keyword names from the flag that records which
        # ``gradcheck`` was imported at module level, so the call always
        # matches the imported implementation.
        if _USING_PARROTS:
            gradcheck(
                froipool, (x, rois), no_grads=[rois], delta=1e-5, pt_atol=1e-5)
        else:
            gradcheck(froipool, (x, rois), eps=1e-5, atol=1e-5)


def _test_roialign_rotated_allclose(device, dtype):
    """Check ``roi_align_rotated`` values and the deprecated argument names.

    Args:
        device (str): Device on which the test tensors are created.
        dtype (torch.dtype): Floating point type of the test tensors.
    """
    try:
        from mmcv.ops import RoIAlignRotated, roi_align_rotated
    except ModuleNotFoundError:
        pytest.skip('test requires compilation')
    pool_h, pool_w = 2, 2
    spatial_scale = 1.0
    sampling_ratio = 2
    pooled_size = (pool_h, pool_w)

    for (feat, boxes), (want_output, want_grad) in zip(inputs, outputs):
        np_output = np.array(want_output)
        np_grad = np.array(want_grad)

        x = torch.tensor(
            np.array(feat), dtype=dtype, device=device, requires_grad=True)
        rois = torch.tensor(np.array(boxes), dtype=dtype, device=device)

        pooled = roi_align_rotated(x, rois, pooled_size, spatial_scale,
                                   sampling_ratio, True)
        # Back-propagate an all-ones gradient to populate ``x.grad``.
        pooled.backward(torch.ones_like(pooled))

        actual_output = pooled.data.type(torch.float).cpu().numpy()
        assert np.allclose(actual_output, np_output, atol=1e-3)
        actual_grad = x.grad.data.type(torch.float).cpu().numpy()
        assert np.allclose(actual_grad, np_grad, atol=1e-3)

    # The deprecated keyword names must give the same result as the current
    # ones. ``x`` and ``rois`` are the tensors of the last loop iteration.
    legacy_module = RoIAlignRotated(
        out_size=pooled_size,
        spatial_scale=spatial_scale,
        sample_num=sampling_ratio)
    legacy_result = legacy_module(x, rois)

    current_module = RoIAlignRotated(
        output_size=pooled_size,
        spatial_scale=spatial_scale,
        sampling_ratio=sampling_ratio)
    current_result = current_module(x, rois)

    assert np.allclose(
        legacy_result.data.type(torch.float).cpu().numpy(),
        current_result.data.type(torch.float).cpu().numpy())


@pytest.mark.parametrize('device', [
    'cpu',
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'mlu',
        marks=pytest.mark.skipif(
            not IS_MLU_AVAILABLE, reason='requires MLU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
])
@pytest.mark.parametrize('dtype', [
    torch.float,
    # The double case is skipped for every device, 'cpu' included, whenever
    # an MLU or MUSA device is available.
    pytest.param(
        torch.double,
        marks=pytest.mark.skipif(
            IS_MLU_AVAILABLE or IS_MUSA_AVAILABLE,
            reason='MLU, MUSA does not support for 64-bit floating point')),
    torch.half
])
def test_roialign_rotated(device, dtype):
    """Run the RoIAlignRotated checks for one device/dtype combination.

    The numerical gradient check only runs for ``torch.double``; the value
    comparison runs for every dtype.
    """
    # check double only
    if dtype is torch.double:
        _test_roialign_rotated_gradcheck(device=device, dtype=dtype)
    _test_roialign_rotated_allclose(device=device, dtype=dtype)


================================================
FILE: tests/test_ops/test_roi_pool.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import os

import numpy as np
import pytest
import torch

from mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,
                        IS_NPU_AVAILABLE)

# Prefer the parrots gradcheck and fall back to the PyTorch one when parrots
# cannot be imported. _USING_PARROTS records which of the two was imported.
_USING_PARROTS = True
try:
    from parrots.autograd import gradcheck
except ImportError:
    from torch.autograd import gradcheck

    _USING_PARROTS = False

# Directory containing this test file.
# NOTE(review): not referenced by the tests visible in this file -- confirm
# whether it is still needed.
cur_dir = os.path.dirname(os.path.abspath(__file__))

# Test cases: each entry is a (feature map, rois) pair, read by the tests as
# case[0] and case[1].
inputs = [([[[[1., 2.], [3., 4.]]]], [[0., 0., 0., 1., 1.]]),
          ([[[[1., 2.], [3., 4.]], [[4., 3.], [2.,
                                               1.]]]], [[0., 0., 0., 1., 1.]]),
          ([[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.],
              [11., 12., 15., 16.]]]], [[0., 0., 0., 3., 3.]])]
# Reference results, aligned with ``inputs``: each entry is an
# (expected output, expected input gradient) pair.
outputs = [([[[[1., 2.], [3., 4.]]]], [[[[1., 1.], [1., 1.]]]]),
           ([[[[1., 2.], [3., 4.]], [[4., 3.], [2., 1.]]]], [[[[1., 1.],
                                                               [1., 1.]],
                                                              [[1., 1.],
                                                               [1., 1.]]]]),
           ([[[[4., 8.], [12., 16.]]]], [[[[0., 0., 0., 0.], [0., 1., 0., 1.],
                                           [0., 0., 0., 0.], [0., 1., 0.,
                                                              1.]]]])]


class TestRoiPool:
    """Tests for the ``RoIPool`` module and the ``roi_pool`` function."""

    def test_roipool_gradcheck(self):
        """Numerically check the RoIPool gradient on CUDA.

        The test is reported as skipped, instead of silently passing, when
        CUDA is not available.
        """
        if not torch.cuda.is_available():
            pytest.skip('requires CUDA support')
        from mmcv.ops import RoIPool
        pool_h = 2
        pool_w = 2
        spatial_scale = 1.0

        for case in inputs:
            np_input = np.array(case[0])
            np_rois = np.array(case[1])

            x = torch.tensor(np_input, device='cuda', requires_grad=True)
            rois = torch.tensor(np_rois, device='cuda')

            froipool = RoIPool((pool_h, pool_w), spatial_scale)

            # The gradient check only runs with the PyTorch gradcheck; it is
            # not executed when parrots is in use.
            if not _USING_PARROTS:
                gradcheck(froipool, (x, rois), eps=1e-2, atol=1e-2)

    def _test_roipool_allclose(self, device, dtype=torch.float):
        """Compare ``roi_pool`` output and input gradient with stored values.

        Args:
            device (str): Device on which the test tensors are created.
            dtype (torch.dtype): Floating point type of the test tensors.
                Defaults to ``torch.float``.
        """
        from mmcv.ops import roi_pool
        pool_h = 2
        pool_w = 2
        spatial_scale = 1.0

        for case, expected in zip(inputs, outputs):
            np_input = np.array(case[0])
            np_rois = np.array(case[1])
            np_output = np.array(expected[0])
            np_grad = np.array(expected[1])

            x = torch.tensor(
                np_input, dtype=dtype, device=device, requires_grad=True)
            rois = torch.tensor(np_rois, dtype=dtype, device=device)
            output = roi_pool(x, rois, (pool_h, pool_w), spatial_scale)
            # Back-propagate an all-ones gradient to populate ``x.grad``.
            output.backward(torch.ones_like(output))
            # The third positional argument of np.allclose is the relative
            # tolerance; it is spelled out here to make that explicit.
            assert np.allclose(
                output.data.cpu().numpy(), np_output, rtol=1e-3)
            assert np.allclose(x.grad.data.cpu().numpy(), np_grad, rtol=1e-3)

    @pytest.mark.parametrize('device', [
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support')),
        pytest.param(
            'npu',
            marks=pytest.mark.skipif(
                not IS_NPU_AVAILABLE, reason='requires NPU support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
    ])
    @pytest.mark.parametrize('dtype', [
        torch.float,
        pytest.param(
            torch.double,
            marks=pytest.mark.skipif(
                IS_MLU_AVAILABLE or IS_NPU_AVAILABLE or IS_MUSA_AVAILABLE,
                reason='MLU, NPU, MUSA '
                'does not support for 64-bit floating point')), torch.half
    ])
    def test_roipool_allclose(self, device, dtype):
        """Run the value comparison for one device/dtype combination."""
        self._test_roipool_allclose(device, dtype)


================================================
FILE: tests/test_ops/test_roiaware_pool3d.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch

from mmcv.ops import (RoIAwarePool3d, points_in_boxes_all, points_in_boxes_cpu,
                      points_in_boxes_part)
from mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,
                        IS_NPU_AVAILABLE)


@pytest.mark.parametrize('dtype', [
    torch.float, torch.half,
    pytest.param(
        torch.double,
        marks=pytest.mark.skipif(
            IS_MLU_AVAILABLE or IS_MUSA_AVAILABLE,
            reason='MLU, MUSA does not support for double'))
])
@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'mlu',
        marks=pytest.mark.skipif(
            not IS_MLU_AVAILABLE, reason='requires MLU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
])
def test_RoIAwarePool3d(device, dtype):
    """Check the output shape and total sum of max and avg RoIAwarePool3d."""
    modes = ('max', 'avg')
    # Build both modules up front, in the order 'max' then 'avg'.
    pools = {
        mode: RoIAwarePool3d(out_size=4, max_pts_per_voxel=128, mode=mode)
        for mode in modes
    }
    expected_sums = {'max': 51.100, 'avg': 49.750}

    # boxes (m, 7) with bottom center in lidar coordinate
    rois = torch.tensor(
        [[1.0, 2.0, 3.0, 5.0, 4.0, 6.0, -0.3 - np.pi / 2],
         [-10.0, 23.0, 16.0, 20.0, 10.0, 20.0, -0.5 - np.pi / 2]],
        dtype=dtype).to(device)
    # points (n, 3) in lidar coordinate
    pts = torch.tensor(
        [[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],
         [0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],
         [4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [-16, -18, 9],
         [-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4]],
        dtype=dtype).to(device)
    # The point coordinates double as the per-point features.
    pts_feature = pts.clone()

    for mode in modes:
        pooled = pools[mode](rois=rois, pts=pts, pts_feature=pts_feature)
        assert pooled.shape == torch.Size([2, 4, 4, 4, 3])
        reference = torch.tensor(expected_sums[mode], dtype=dtype).to(device)
        assert torch.allclose(pooled.sum(), reference, 1e-3)


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'npu',
        marks=pytest.mark.skipif(
            not IS_NPU_AVAILABLE, reason='requires NPU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
])
def test_points_in_boxes_part(device):
    """Check the per-point box index returned by ``points_in_boxes_part``."""
    # Case 1: two batches, one box each.
    # boxes (b, t, 7) with bottom center in lidar coordinate
    batch_boxes = torch.tensor(
        [[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3]],
         [[-10.0, 23.0, 16.0, 10, 20, 20, 0.5]]],
        dtype=torch.float32).to(device)
    # points (b, m, 3) in lidar coordinate
    batch_pts = torch.tensor(
        [[[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],
          [0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],
          [4.7, 3.5, -12.2]],
         [[3.8, 7.6, -2], [-10.6, -12.9, -20], [-16, -18, 9], [-21.3, -52, -5],
          [0, 0, 0], [6, 7, 8], [-2, -3, -4], [6, 4, 9]]],
        dtype=torch.float32).to(device)

    assigned = points_in_boxes_part(points=batch_pts, boxes=batch_boxes)
    wanted = torch.tensor(
        [[0, 0, 0, 0, 0, -1, -1, -1], [-1, -1, -1, -1, -1, -1, -1, -1]],
        dtype=torch.int32).to(device)
    assert assigned.shape == torch.Size([2, 8])
    assert (assigned == wanted).all()

    # Case 2: a single box rotated by 30 degrees.
    rotated_box = torch.tensor([[[0.0, 0.0, 0.0, 1.0, 20.0, 1.0, 0.523598]]],
                               dtype=torch.float32).to(device)
    ring_pts = torch.tensor(
        [[[4, 6.928, 0], [6.928, 4, 0], [4, -6.928, 0], [6.928, -4, 0],
          [-4, 6.928, 0], [-6.928, 4, 0], [-4, -6.928, 0], [-6.928, -4, 0]]],
        dtype=torch.float32).to(device)
    assigned = points_in_boxes_part(points=ring_pts, boxes=rotated_box)
    wanted = torch.tensor([[-1, -1, 0, -1, 0, -1, -1, -1]],
                          dtype=torch.int32).to(device)
    assert (assigned == wanted).all()


def test_points_in_boxes_cpu():
    """Check the point-in-box indicator returned by ``points_in_boxes_cpu``.

    The expected tensors hold one 0/1 entry per (point, box) pair.
    """
    boxes = torch.tensor(
        [[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3],
          [-10.0, 23.0, 16.0, 10, 20, 20, 0.5]]],
        dtype=torch.float32
    )  # boxes of shape (1, 2, 7) with bottom center in lidar coordinate
    pts = torch.tensor(
        [[[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],
          [0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],
          [4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [
              -16, -18, 9
          ], [-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4]]],
        dtype=torch.float32)  # points of shape (1, 15, 3) in lidar coordinate

    point_indices = points_in_boxes_cpu(points=pts, boxes=boxes)
    expected_point_indices = torch.tensor(
        [[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0], [0, 0],
          [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]],
        dtype=torch.int32)
    assert point_indices.shape == torch.Size([1, 15, 2])
    assert (point_indices == expected_point_indices).all()

    # Second case: a single box whose last value is about 30 degrees in
    # radians, with eight surrounding points.
    boxes = torch.tensor([[[0.0, 0.0, 0.0, 1.0, 20.0, 1.0, 0.523598]]],
                         dtype=torch.float32)  # 30 degrees
    pts = torch.tensor(
        [[[4, 6.928, 0], [6.928, 4, 0], [4, -6.928, 0], [6.928, -4, 0],
          [-4, 6.928, 0], [-6.928, 4, 0], [-4, -6.928, 0], [-6.928, -4, 0]]],
        dtype=torch.float32)
    point_indices = points_in_boxes_cpu(points=pts, boxes=boxes)
    expected_point_indices = torch.tensor(
        [[[0], [0], [1], [0], [1], [0], [0], [0]]], dtype=torch.int32)
    assert (point_indices == expected_point_indices).all()


@pytest.mark.skipif(
    not torch.cuda.is_available(), reason='requires CUDA support')
def test_points_in_boxes_all():
    """Check ``points_in_boxes_all`` on CUDA, and on ``cuda:1`` if present."""

    def _verify(points, boxes, expected):
        """Run the op and compare its shape and values with ``expected``."""
        point_indices = points_in_boxes_all(points=points, boxes=boxes)
        assert point_indices.shape == torch.Size([1, 15, 2])
        assert (point_indices == expected).all()

    # boxes (m, 7) with bottom center in lidar coordinate
    boxes = torch.tensor(
        [[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3],
          [-10.0, 23.0, 16.0, 10, 20, 20, 0.5]]],
        dtype=torch.float32).cuda()
    # points (n, 3) in lidar coordinate
    pts = torch.tensor(
        [[[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],
          [0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],
          [4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20],
          [-16, -18, 9], [-21.3, -52, -5], [0, 0, 0], [6, 7, 8],
          [-2, -3, -4]]],
        dtype=torch.float32).cuda()
    expected_point_indices = torch.tensor(
        [[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0], [0, 0],
          [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]],
        dtype=torch.int32).cuda()

    _verify(pts, boxes, expected_point_indices)

    # Repeat on a second GPU when one exists.
    if torch.cuda.device_count() > 1:
        _verify(
            pts.to('cuda:1'), boxes.to('cuda:1'),
            expected_point_indices.to('cuda:1'))


================================================
FILE: tests/test_ops/test_roipoint_pool3d.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch

from mmcv.ops import RoIPointPool3d
from mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,
                        IS_NPU_AVAILABLE)


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'mlu',
        marks=pytest.mark.skipif(
            not IS_MLU_AVAILABLE, reason='requires MLU support')),
    pytest.param(
        'npu',
        marks=pytest.mark.skipif(
            not IS_NPU_AVAILABLE, reason='requires NPU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
])
@pytest.mark.parametrize('dtype', [
    torch.float, torch.half,
    pytest.param(
        torch.double,
        marks=pytest.mark.skipif(
            IS_MLU_AVAILABLE or IS_NPU_AVAILABLE or IS_MUSA_AVAILABLE,
            reason='MLU, NPU, MUSA does not support for double'))
])
def test_roipoint(device, dtype):
    """Compare RoIPointPool3d features and empty flags with stored values."""
    xyz = [[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],
           [0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],
           [4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20],
           [-16, -18, 9], [-21.3, -52, -5], [0, 0, 0], [6, 7, 8],
           [-2, -3, -4]]
    # Add a leading batch dimension of size one before moving to the device.
    points = torch.tensor(xyz, dtype=dtype).unsqueeze(0).to(device)
    # The point coordinates double as the point features.
    feats = points.clone()
    rois = torch.tensor([[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3],
                          [-10.0, 23.0, 16.0, 10, 20, 20, 0.5]]],
                        dtype=dtype).to(device)

    pooler = RoIPointPool3d(num_sampled_points=4)
    roi_feat, empty_flag = pooler(points, feats, rois)

    first_roi = [[1, 2, 3.3, 1, 2, 3.3], [1.2, 2.5, 3, 1.2, 2.5, 3],
                 [0.8, 2.1, 3.5, 0.8, 2.1, 3.5],
                 [1.6, 2.6, 3.6, 1.6, 2.6, 3.6]]
    second_roi = [[-9.2, 21, 18.2, -9.2, 21, 18.2],
                  [-9.2, 21, 18.2, -9.2, 21, 18.2],
                  [-9.2, 21, 18.2, -9.2, 21, 18.2],
                  [-9.2, 21, 18.2, -9.2, 21, 18.2]]
    expected_roi_feat = torch.tensor([[first_roi, second_roi]],
                                     dtype=dtype).to(device)
    expected_empty_flag = torch.tensor([[0, 0]]).int().to(device)

    assert torch.allclose(roi_feat, expected_roi_feat)
    assert torch.allclose(empty_flag, expected_empty_flag)


================================================
FILE: tests/test_ops/test_rotated_feature_align.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch

from mmcv.ops import rotated_feature_align
from mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,
                        IS_NPU_AVAILABLE)


# The former blanket ``skipif(not torch.cuda.is_available())`` was removed: it
# skipped every parametrized device, including 'cpu', 'mlu', 'npu' and 'musa',
# on machines without CUDA. Each device carries its own skip condition below.
@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'mlu',
        marks=pytest.mark.skipif(
            not IS_MLU_AVAILABLE, reason='requires MLU support')),
    pytest.param(
        'npu',
        marks=pytest.mark.skipif(
            not IS_NPU_AVAILABLE, reason='requires NPU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
    pytest.param(
        'cpu',
        marks=pytest.mark.skipif(
            torch.__version__ == 'parrots', reason='requires PyTorch support'))
])
def test_rotated_feature_align(device):
    """Compare ``rotated_feature_align`` output and gradient to references.

    Args:
        device (str): Device on which the test tensors are created.
    """
    feature = torch.tensor([[[[1.2924, -0.2172, -0.5222, 0.1172],
                              [0.9144, 1.2248, 1.3115, -0.9690],
                              [-0.8949, -1.1797, -0.9093, -0.3961],
                              [-0.4586, 0.5062, -0.7947, -0.7397]],
                             [[-1.0943, -0.7495, 1.3461, -1.1652],
                              [0.2034, 0.6763, -1.2357, 0.5231],
                              [-1.0062, 1.2592, 1.4225, -0.3951],
                              [-0.1242, -1.6240, 0.1932, 2.7181]],
                             [[-1.6271, -1.0276, 0.0578, -0.2997],
                              [-0.9684, -1.6946, -1.3188, -1.1938],
                              [-1.6744, -0.8917, -0.6556,
                               1.0073], [-0.1205, 0.3671, -0.3731, -0.5347]]],
                            [[[0.7035, 0.2089, -0.1774, 3.4670],
                              [-0.8505, -0.9278, 1.4714, 0.1644],
                              [0.0898, 0.3531, -0.4007, 0.1927],
                              [1.2569, -0.2636, -0.5223, 0.0616]],
                             [[0.1760, -0.7639, -0.4600, -1.3260],
                              [-0.9921, -0.2970, -0.8955, 1.0508],
                              [1.3515, -0.1641, 1.9679, 1.1986],
                              [-0.3616, 0.6287, 0.4933, 0.3360]],
                             [[-0.5860, 0.2124, -0.8700, 2.4200],
                              [-0.0551, -1.5103, -1.6779, 0.8399],
                              [0.8431, 1.2414, -1.1243, -0.3887],
                              [-2.1254, 0.6047, -0.3515, 0.7254]]]],
                           device=device,
                           requires_grad=True)

    bbox = torch.tensor(
        [[[[1.3080e+01, 1.2688e+01, 1.1214e+01, 9.3944e+01, -9.1905e-01],
           [3.8104e+01, 1.0134e+01, 1.4659e+02, 9.0306e+01, -9.8211e-01],
           [-5.3213e+01, 4.9508e+01, 5.1513e+01, 3.2055e+01, -3.1954e-01],
           [2.6974e+01, 2.5248e+01, 5.4495e+01, 3.1083e+00, -6.2127e-01]],
          [[-1.5604e+01, -5.1908e+01, 2.3998e+02, 1.5008e+01, -1.2546e+00],
           [3.1354e+01, -7.3635e+00, 6.7879e+01, 3.5081e+01, -3.3851e-01],
           [-5.3292e+00, 9.1946e+00, 1.2834e+01, 1.0485e+01, -1.3039e+00],
           [-2.3925e+01, 3.6623e+01, 3.9875e+01, 7.2009e+01, -6.5934e-01]],
          [[7.2114e+01, -2.3781e+01, 2.9106e+01, 8.4501e+01, -1.1340e+00],
           [2.6258e+01, -7.7034e+00, 1.7629e+02, 1.0615e+02, -1.2156e+00],
           [3.8057e+01, 4.6016e+01, 1.2965e+01, 6.9384e+00, -1.0855e+00],
           [2.4428e+01, -1.6189e+01, 2.0572e+02, 3.1622e+01, -1.5719e-01]],
          [[3.8226e+00, 2.9608e+01, 1.4457e+01, 6.8179e+01, -9.1997e-01],
           [2.5003e+01, -4.2490e+01, 9.6007e+01, 4.9086e+01, -1.4786e+00],
           [8.5983e+01, 5.4980e+01, 7.8080e+01, 1.0003e+02, -1.0926e+00],
           [9.9065e+00, 4.1457e+01, 5.9799e+00, 1.7973e+01, -5.6313e-01]]],
         [[[-1.8244e+01, 4.6309e+00, 5.3010e+01, 2.4310e+01, -7.0345e-01],
           [1.9419e+01, 3.6704e+01, 5.2390e+01, 5.4133e+01, -3.7730e-01],
           [5.6387e+01, 2.3752e+01, 9.0441e+00, 1.7792e+01, -1.5583e+00],
           [3.6303e+01, 1.6396e+01, 2.0283e+01, 1.9148e+01, -8.3419e-01]],
          [[3.2169e+01, 3.0521e+01, 2.6283e+01, 1.9680e+02, -3.0454e-01],
           [2.5788e+01, -3.2189e+01, 8.8882e+01, 1.0207e+02, -1.5328e+00],
           [8.4676e+00, -1.6668e+01, 2.4657e+01, 1.1275e+02, -4.0388e-01],
           [-1.0799e+01, 6.0422e+00, 9.5807e+00, 3.3677e+01, -3.5438e-01]],
          [[6.9363e+01, 1.0850e+01, 2.5968e+01, 2.2311e+01, -1.6408e-01],
           [2.8140e+00, 4.6843e+00, 3.1289e+00, 2.1480e+01, -6.7583e-01],
           [2.6661e+01, 4.5290e+01, 6.1679e+00, 3.0005e+01, -8.9806e-01],
           [5.0871e+00, 1.3234e+01, 9.2087e+01, 4.9622e+01, -2.8020e-01]],
          [[-1.2643e+01, 2.5176e+01, 5.0488e+01, 5.4246e+01, -4.4840e-01],
           [-3.4521e+01, 9.8435e-01, 5.2413e+01, 9.7996e+00, -8.4218e-01],
           [4.9829e+01, -1.0808e+01, 2.9848e+01, 7.3579e+01, -6.2672e-01],
           [8.0446e+01, 2.8064e+01, 4.5273e+01, 5.3809e+01, -1.2359e+00]]]],
        device=device,
        requires_grad=True)

    expected_output = torch.tensor([[[[1.1095, -0.2172, -0.5222, -0.6225],
                                      [0.9144, 0.7662, 1.0487, -0.9690],
                                      [-0.8949, -1.6384, -0.9093, -0.3961],
                                      [-0.8604, 0.5062, -0.7947, -0.7397]],
                                     [[-0.3961, -0.7495, 1.3461, 1.5528],
                                      [0.2034, 0.5522, -1.6722, 0.5231],
                                      [-1.0062, 1.1350, 1.4225, -0.3951],
                                      [-0.4826, -1.6240, 0.1932, 2.7181]],
                                     [[-2.6436, -1.0276, 0.0578, -0.8344],
                                      [-0.9684, -1.8151, -2.1843, -1.1938],
                                      [-1.6744, -1.0121, -0.6556, 1.0073],
                                      [-0.8474, 0.3671, -0.3731, -0.5347]]],
                                    [[[0.7035, 0.2089, -0.1774, 3.4670],
                                      [-0.8505, -0.9278, 1.4714, 0.1644],
                                      [0.0898, 0.3064, -0.4007, 0.5849],
                                      [1.2569, -0.2636, -0.5223, 0.0616]],
                                     [[0.1760, -0.7639, -0.4600, -1.3260],
                                      [-0.9921, -0.2970, -0.8955, 1.0508],
                                      [1.3515, -0.6125, 1.9679, 0.5550],
                                      [-0.3616, 0.6287, 0.4933, 0.3360]],
                                     [[-0.5860, 0.2124, -0.8700, 2.4200],
                                      [-0.0551, -1.5103, -1.6779, 0.8399],
                                      [0.8431, 0.8455, -1.1243, -1.5994],
                                      [-2.1254, 0.6047, -0.3515, 0.7254]]]],
                                   device=device)

    expected_grad = torch.tensor([
        [[[1.0000, 1.8507, 1.1493, 1.5222], [1.0000, 1.1511, 1.2139, 1.4778],
          [1.0000, 1.2629, 1.3721, 1.0000], [3.0000, 1.0000, 1.0000, 2.0000]],
         [[1.0000, 1.8507, 1.1493, 1.5222], [1.0000, 1.1511, 1.2139, 1.4778],
          [1.0000, 1.2629, 1.3721, 1.0000], [3.0000, 1.0000, 1.0000, 2.0000]],
         [[1.0000, 1.8507, 1.1493, 1.5222], [1.0000, 1.1511, 1.2139, 1.4778],
          [1.0000, 1.2629, 1.3721, 1.0000], [3.0000, 1.0000, 1.0000, 2.0000]]],
        [[[1.2687, 1.5055, 1.2382, 1.0000], [1.1458, 1.4258, 1.4160, 1.0000],
          [1.0000, 1.0000, 1.0000, 1.0000], [1.0000, 1.0000, 1.0000, 1.0000]],
         [[1.2687, 1.5055, 1.2382, 1.0000], [1.1458, 1.4258, 1.4160, 1.0000],
          [1.0000, 1.0000, 1.0000, 1.0000], [1.0000, 1.0000, 1.0000, 1.0000]],
         [[1.2687, 1.5055, 1.2382, 1.0000], [1.1458, 1.4258, 1.4160, 1.0000],
          [1.0000, 1.0000, 1.0000, 1.0000], [1.0000, 1.0000, 1.0000, 1.0000]]]
    ],
                                 device=device)

    output = rotated_feature_align(
        feature, bbox, spatial_scale=1 / 8, points=1)
    # Back-propagate an all-ones gradient to populate ``feature.grad``.
    output.backward(torch.ones_like(output))
    # The third positional argument of torch.allclose is the relative
    # tolerance.
    assert torch.allclose(output, expected_output, 1e-2)
    assert torch.allclose(feature.grad, expected_grad, 1e-2)


================================================
FILE: tests/test_ops/test_saconv.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn

from mmcv.ops import SAConv2d


def test_sacconv():
    """Check that ``SAConv2d`` yields the same output shape as ``nn.Conv2d``.

    Four configurations are exercised: the default layer, a dilated kernel,
    the deformable variant (on GPU when CUDA is available, otherwise on CPU)
    and a grouped convolution.
    """
    base_kwargs = dict(kernel_size=3, padding=1)
    dilated_kwargs = dict(kernel_size=3, padding=2, dilation=2)

    # default configuration
    inputs = torch.rand(1, 3, 256, 256)
    plain_layer = SAConv2d(3, 5, **base_kwargs)
    plain_out = plain_layer(inputs)
    baseline_out = nn.Conv2d(3, 5, **base_kwargs)(inputs)
    assert plain_out.shape == baseline_out.shape

    # dilation >= 2
    dilated_layer = SAConv2d(3, 5, **dilated_kwargs)
    dilated_out = dilated_layer(inputs)
    baseline_out = nn.Conv2d(3, 5, **dilated_kwargs)(inputs)
    assert dilated_out.shape == baseline_out.shape

    # deformable variant
    deform_layer = SAConv2d(3, 5, use_deform=True, **base_kwargs)
    if torch.cuda.is_available():
        # rebuild both the input and the layer on the GPU
        inputs = torch.rand(1, 3, 256, 256).cuda()
        deform_layer = SAConv2d(3, 5, use_deform=True, **base_kwargs).cuda()
        deform_out = deform_layer(inputs).cuda()
        baseline_layer = nn.Conv2d(3, 5, **base_kwargs).cuda()
    else:
        deform_out = deform_layer(inputs)
        baseline_layer = nn.Conv2d(3, 5, **base_kwargs)
    baseline_out = baseline_layer(inputs)
    assert deform_out.shape == baseline_out.shape

    # groups >= 2
    inputs = torch.rand(1, 4, 256, 256)
    grouped_layer = SAConv2d(4, 4, groups=2, **base_kwargs)
    grouped_out = grouped_layer(inputs)
    baseline_out = nn.Conv2d(4, 4, groups=2, **base_kwargs)(inputs)
    assert grouped_out.shape == baseline_out.shape


================================================
FILE: tests/test_ops/test_scatter_points.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from torch.autograd import gradcheck

from mmcv.ops import DynamicScatter
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE

# Skip every test in this module when torch reports its version as 'parrots'.
if torch.__version__ == 'parrots':
    pytest.skip('not supported in parrots now', allow_module_level=True)


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'mlu',
        marks=pytest.mark.skipif(
            not IS_MLU_AVAILABLE, reason='requires MLU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
])
def test_dynamic_scatter(device):
    """Exercise ``DynamicScatter`` in mean and max reduction modes.

    Covers empty input, input whose coordinates are all ``-1``, random input
    compared against a reference built with plain tensor ops (with and
    without ``-1`` coordinates), and a numerical gradient check.
    """
    dsmean = DynamicScatter([0.32, 0.32, 6],
                            [-74.88, -74.88, -2, 74.88, 74.88, 4], True)
    dsmax = DynamicScatter([0.32, 0.32, 6],
                           [-74.88, -74.88, -2, 74.88, 74.88, 4], False)

    def _sort_by_voxel(feats_out, coors_out):
        # Order the scatter output by a scalar key of its three coordinates
        # so that it lines up with the sorted reference coordinates.
        key = (coors_out[:, 0] * 400 + coors_out[:, 1] * 20 +
               coors_out[:, 2])
        order = key.argsort()
        return feats_out[order], coors_out[order]

    def _check_against_reference(coor_low):
        # Random points whose coordinates are drawn from [coor_low, 20).
        points = torch.rand(
            size=(200000, 3), dtype=torch.float32, device=device) * 100 - 50
        point_coors = torch.randint(
            low=coor_low,
            high=20,
            size=(200000, 3),
            dtype=torch.int32,
            device=device)

        # Reference: unique coordinates without negative entries, reduced
        # voxel by voxel.
        expected_coors = point_coors.unique(dim=0, sorted=True)
        keep = expected_coors.min(dim=-1).values >= 0
        expected_coors = expected_coors[keep]
        mean_rows = []
        max_rows = []
        for voxel_coor in expected_coors:
            members = (point_coors == voxel_coor).all(dim=-1)
            mean_rows.append(points[members].mean(dim=0))
            max_rows.append(points[members].max(dim=0).values)
        expected_mean = torch.stack(mean_rows)
        expected_max = torch.stack(max_rows)

        mean_feats, mean_coors = _sort_by_voxel(*dsmean(points, point_coors))
        max_feats, max_coors = _sort_by_voxel(*dsmax(points, point_coors))

        assert (mean_coors == expected_coors).all()
        assert torch.allclose(
            mean_feats, expected_mean, atol=1e-2, rtol=1e-5)
        assert (max_coors == expected_coors).all()
        assert torch.allclose(max_feats, expected_max, atol=1e-2, rtol=1e-5)

    # test empty input
    no_feats = torch.empty(size=(0, 3), dtype=torch.float32, device=device)
    no_coors = torch.empty(size=(0, 3), dtype=torch.int32, device=device)
    no_feats.requires_grad_()

    mean_feats_out, mean_coors_out = dsmean(no_feats, no_coors)
    mean_feats_out.sum().backward()
    max_feats_out, max_coors_out = dsmax(no_feats, no_coors)
    max_feats_out.sum().backward()

    assert mean_feats_out.shape == no_feats.shape
    assert max_feats_out.shape == no_feats.shape
    assert mean_coors_out.shape == no_coors.shape
    assert max_coors_out.shape == no_coors.shape

    # test empty reduced output (every coordinate is -1)
    dropped_feats = torch.rand(
        size=(200000, 3), dtype=torch.float32, device=device) * 100 - 50
    dropped_coors = torch.randint(
        low=-1, high=0, size=(200000, 3), dtype=torch.int32, device=device)
    dropped_feats.requires_grad_()

    reduced_mean, _ = dsmean(dropped_feats, dropped_coors)
    reduced_mean.sum().backward()
    assert (dropped_feats.grad == 0).all()

    reduced_max, _ = dsmax(dropped_feats, dropped_coors)
    reduced_max.sum().backward()
    assert (dropped_feats.grad == 0).all()

    # test non-empty input
    _check_against_reference(-1)

    # test non-empty input without any point out of bound
    _check_against_reference(0)

    # test grad #
    grad_feats = torch.rand(
        size=(100, 4), dtype=torch.float32, device=device) * 100 - 50
    grad_coors = torch.randint(
        low=-1, high=3, size=(100, 3), dtype=torch.int32, device=device)
    grad_feats.requires_grad_()
    # TODO(Cambricon): mlu only support max reduce in current version.
    if not IS_MLU_AVAILABLE:
        gradcheck(
            dsmean, (grad_feats, grad_coors), eps=1e-2, atol=1e-2, rtol=1e-5)
    gradcheck(dsmax, (grad_feats, grad_coors), eps=1e-2, atol=1e-2, rtol=1e-5)


================================================
FILE: tests/test_ops/test_spconv.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from torch import nn

from mmcv.cnn import build_conv_layer, build_norm_layer
from mmcv.ops import (SparseConvTensor, SparseInverseConv3d, SparseSequential,
                      SubMConv3d)

# Skip every test in this module when torch reports its version as 'parrots'.
if torch.__version__ == 'parrots':
    pytest.skip('not supported in parrots now', allow_module_level=True)

from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE


def make_sparse_convmodule(in_channels,
                           out_channels,
                           kernel_size,
                           indice_key,
                           stride=1,
                           padding=0,
                           conv_type='SubMConv3d',
                           norm_cfg=None,
                           order=('conv', 'norm', 'act')):
    """Assemble a sparse conv / norm / activation stack.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int|tuple(int)): Kernel size of the convolution.
        indice_key (str): Indice key used for the sparse tensor.
        stride (int|tuple(int)): Stride of the convolution. Not passed to
            the ``SparseInverseConv*`` types.
        padding (int or list[int]): Padding of the input. Not passed to
            the ``SparseInverseConv*`` types.
        conv_type (str): Sparse conv type in spconv.
        norm_cfg (dict[str]): Config of the normalization layer.
        order (tuple[str]): Order of the conv/norm/activation layers, given
            as a tuple of at most three items taken from "conv", "norm" and
            "act", e.g. ("conv", "norm", "act") or ("act", "conv", "norm").

    Returns:
        spconv.SparseSequential: The sparse convolution module.
    """
    allowed_layers = {'conv', 'norm', 'act'}
    assert isinstance(order, tuple) and len(order) <= 3
    assert set(order) <= allowed_layers

    conv_cfg = dict(type=conv_type, indice_key=indice_key)
    inverse_conv_types = ('SparseInverseConv3d', 'SparseInverseConv2d',
                          'SparseInverseConv1d')

    modules = []
    for layer_name in order:
        if layer_name == 'conv':
            conv_kwargs = dict(bias=False)
            if conv_type not in inverse_conv_types:
                # only the non-inverse convs receive stride and padding
                conv_kwargs.update(stride=stride, padding=padding)
            conv = build_conv_layer(conv_cfg, in_channels, out_channels,
                                    kernel_size, **conv_kwargs)
            modules.append(conv)
        elif layer_name == 'norm':
            # build_norm_layer's result is indexed at 1 to get the layer
            norm = build_norm_layer(norm_cfg, out_channels)[1]
            modules.append(norm)
        elif layer_name == 'act':
            modules.append(nn.ReLU(inplace=True))

    return SparseSequential(*modules)


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'mlu',
        marks=pytest.mark.skipif(
            not IS_MLU_AVAILABLE, reason='requires MLU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
])
def test_make_sparse_convmodule(device):
    """Build sparse conv modules and check their layers and output shape."""
    if IS_CUDA_AVAILABLE:
        torch.cuda.empty_cache()
    elif IS_MUSA_AVAILABLE:
        torch.musa.empty_cache()

    # n, point_features
    point_feats = torch.tensor(
        [[6.56126, 0.9648336, -1.7339306, 0.315],
         [6.8162713, -2.480431, -1.3616394, 0.36],
         [11.643568, -4.744306, -1.3580885, 0.16],
         [23.482342, 6.5036807, 0.5806964, 0.35]],
        dtype=torch.float32,
        device=device)
    # n, 4(batch, ind_x, ind_y, ind_z)
    voxel_coors = torch.tensor(
        [[0, 12, 819, 131], [0, 16, 750, 136], [1, 16, 705, 232],
         [1, 35, 930, 469]],
        dtype=torch.int32,
        device=device)

    sp_input = SparseConvTensor(point_feats, voxel_coors, [41, 1600, 1408],
                                2)

    # conv -> norm -> act with a submanifold convolution
    subm_block = make_sparse_convmodule(
        4,
        16,
        3,
        'test0',
        stride=1,
        padding=0,
        conv_type='SubMConv3d',
        norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
        order=('conv', 'norm', 'act'))
    subm_block = subm_block.to(device)

    conv_layer = subm_block[0]
    assert isinstance(conv_layer, SubMConv3d)
    assert conv_layer.in_channels == 4
    assert conv_layer.out_channels == 16
    norm_layer = subm_block[1]
    assert isinstance(norm_layer, torch.nn.BatchNorm1d)
    assert norm_layer.eps == 0.001
    assert norm_layer.momentum == 0.01
    assert isinstance(subm_block[2], torch.nn.ReLU)

    # test forward
    sp_output = subm_block(sp_input)
    assert sp_output.features.shape == torch.Size([4, 16])

    # device == mlu: not support inverse==1 yet
    if device == 'mlu':
        return

    # norm -> act -> conv with an inverse convolution
    inverse_block = make_sparse_convmodule(
        4,
        16,
        3,
        'test1',
        stride=1,
        padding=0,
        conv_type='SparseInverseConv3d',
        norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
        order=('norm', 'act', 'conv'))
    inverse_block = inverse_block.to(device)
    assert isinstance(inverse_block[2], SparseInverseConv3d)
    assert isinstance(inverse_block[0], torch.nn.BatchNorm1d)
    assert isinstance(inverse_block[1], torch.nn.ReLU)


================================================
FILE: tests/test_ops/test_syncbn.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import os
import platform

import numpy as np
import pytest
import torch
import torch.distributed as dist
import torch.nn as nn

from mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE

if platform.system() == 'Windows':
    import regex as re
else:
    import re


class TestSyncBN:
    """Compare ``mmcv.ops.SyncBatchNorm`` with ``nn.BatchNorm2d``.

    The checks only run inside a 4-task SLURM job; otherwise the helper
    methods print a hint and return without asserting anything.
    """

    def dist_init(self):
        """Initialise ``torch.distributed`` from SLURM environment variables.

        Reads ``SLURM_PROCID``, ``SLURM_NTASKS``, ``SLURM_LOCALID`` and
        ``SLURM_NODELIST``, exports ``MASTER_ADDR``, ``MASTER_PORT``,
        ``WORLD_SIZE`` and ``RANK``, then creates the process group and
        selects the local device.
        """
        rank = int(os.environ['SLURM_PROCID'])
        world_size = int(os.environ['SLURM_NTASKS'])
        local_rank = int(os.environ['SLURM_LOCALID'])
        node_list = str(os.environ['SLURM_NODELIST'])

        # Digit runs 1..4 of the node list are joined into a dotted address.
        # NOTE(review): assumes the node name embeds an IPv4 address after one
        # leading numeric field - confirm against the cluster's naming scheme.
        node_parts = re.findall('[0-9]+', node_list)
        os.environ['MASTER_ADDR'] = (f'{node_parts[1]}.{node_parts[2]}' +
                                     f'.{node_parts[3]}.{node_parts[4]}')
        os.environ['MASTER_PORT'] = '12341'
        os.environ['WORLD_SIZE'] = str(world_size)
        os.environ['RANK'] = str(rank)

        if IS_CUDA_AVAILABLE:
            dist.init_process_group('nccl')
            torch.cuda.set_device(local_rank)
        elif IS_MUSA_AVAILABLE:
            dist.init_process_group('mccl')
            torch.musa.set_device(local_rank)

    # NOTE(review): this helper is only called directly by the ``test_*``
    # methods below, which never pass ``device``, so it always runs with the
    # default 'cuda'. Presumably the parametrize mark has no effect on such
    # direct calls - confirm.
    @pytest.mark.parametrize('device', [
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
    ])
    def _test_syncbn_train(self, size=1, half=False, device='cuda'):
        """Check SyncBatchNorm in training mode against BatchNorm2d.

        Args:
            size (int): Number of ranks per process group; one of 1, 2, 4.
            half (bool): Whether to cast the input and the output gradient
                to half precision.
            device (str): Device the tensors and modules are moved to.
        """

        # Silently do nothing unless launched as a 4-task SLURM job.
        if 'SLURM_NTASKS' not in os.environ or int(
                os.environ['SLURM_NTASKS']) != 4:
            print('must run with slurm has 4 processes!\n'
                  'srun -p test --gres=gpu:4 -n4')
            return
        else:
            print('Running syncbn test')
        from mmcv.ops import SyncBatchNorm

        assert size in (1, 2, 4)
        if not dist.is_initialized():
            self.dist_init()
        rank = dist.get_rank()

        torch.manual_seed(9)
        if IS_CUDA_AVAILABLE:
            torch.cuda.manual_seed(9)
        elif IS_MUSA_AVAILABLE:
            torch.musa.manual_seed(9)

        # 16 samples in total: each of the 4 ranks later takes a slice of 4.
        self.x = torch.rand(16, 3, 2, 3).to(device)
        self.y_bp = torch.rand(16, 3, 2, 3).to(device)

        if half:
            self.x = self.x.half()
            self.y_bp = self.y_bp.half()
        # Make every rank use rank 0's input and output gradient.
        dist.broadcast(self.x, src=0)
        dist.broadcast(self.y_bp, src=0)

        if IS_CUDA_AVAILABLE:
            torch.cuda.synchronize()
        elif IS_MUSA_AVAILABLE:
            torch.musa.synchronize()
        # Every rank executes every new_group call, including those for
        # groups it does not belong to, and then picks its own group.
        if size == 1:
            groups = [None, None, None, None]
            groups[0] = dist.new_group([0])
            groups[1] = dist.new_group([1])
            groups[2] = dist.new_group([2])
            groups[3] = dist.new_group([3])
            group = groups[rank]
        elif size == 2:
            groups = [None, None, None, None]
            groups[0] = groups[1] = dist.new_group([0, 1])
            groups[2] = groups[3] = dist.new_group([2, 3])
            group = groups[rank]
        elif size == 4:
            group = dist.group.WORLD
        syncbn = SyncBatchNorm(3, group=group).to(device)
        syncbn.weight.data[0] = 0.2
        syncbn.weight.data[1] = 0.5
        syncbn.weight.data[2] = 0.7
        syncbn.train()

        # Reference layer with the same per-channel weights.
        bn = nn.BatchNorm2d(3).to(device)
        bn.weight.data[0] = 0.2
        bn.weight.data[1] = 0.5
        bn.weight.data[2] = 0.7
        bn.train()

        # SyncBatchNorm only sees this rank's 4 samples.
        sx = self.x[rank * 4:rank * 4 + 4]
        sx.requires_grad_()
        sy = syncbn(sx)
        sy.backward(self.y_bp[rank * 4:rank * 4 + 4])

        smean = syncbn.running_mean
        svar = syncbn.running_var
        sx_grad = sx.grad
        sw_grad = syncbn.weight.grad
        sb_grad = syncbn.bias.grad

        # The reference BatchNorm2d is fed the samples of the whole group.
        if size == 1:
            x = self.x[rank * 4:rank * 4 + 4]
            y_bp = self.y_bp[rank * 4:rank * 4 + 4]
        elif size == 2:
            x = self.x[rank // 2 * 8:rank // 2 * 8 + 8]
            y_bp = self.y_bp[rank // 2 * 8:rank // 2 * 8 + 8]
        elif size == 4:
            x = self.x
            y_bp = self.y_bp
        x.requires_grad_()
        y = bn(x)
        y.backward(y_bp)

        # Cut the reference output back to this rank's 4 samples.
        if size == 2:
            y = y[rank % 2 * 4:rank % 2 * 4 + 4]
        elif size == 4:
            y = y[rank * 4:rank * 4 + 4]

        mean = bn.running_mean
        var = bn.running_var
        # Reference parameter gradients are divided by the group size.
        # NOTE(review): presumably SyncBatchNorm averages weight/bias
        # gradients over the group - confirm against the op.
        if size == 1:
            x_grad = x.grad
            w_grad = bn.weight.grad
            b_grad = bn.bias.grad
        elif size == 2:
            x_grad = x.grad[rank % 2 * 4:rank % 2 * 4 + 4]
            w_grad = bn.weight.grad / 2
            b_grad = bn.bias.grad / 2
        elif size == 4:
            x_grad = x.grad[rank * 4:rank * 4 + 4]
            w_grad = bn.weight.grad / 4
            b_grad = bn.bias.grad / 4

        # The third positional argument of np.allclose is rtol.
        assert np.allclose(mean.data.cpu().numpy(),
                           smean.data.cpu().numpy(), 1e-3)
        assert np.allclose(var.data.cpu().numpy(),
                           svar.data.cpu().numpy(), 1e-3)
        assert np.allclose(y.data.cpu().numpy(), sy.data.cpu().numpy(), 1e-3)
        assert np.allclose(w_grad.data.cpu().numpy(),
                           sw_grad.data.cpu().numpy(), 1e-3)
        assert np.allclose(b_grad.data.cpu().numpy(),
                           sb_grad.data.cpu().numpy(), 1e-3)
        assert np.allclose(x_grad.data.cpu().numpy(),
                           sx_grad.data.cpu().numpy(), 1e-2)

    # NOTE(review): as with ``_test_syncbn_train``, this helper is only
    # called directly without ``device``, so it runs with the default 'cuda'.
    # Presumably the parametrize mark has no effect on such direct calls -
    # confirm.
    @pytest.mark.parametrize('device', [
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'musa',
            marks=pytest.mark.skipif(
                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
    ])
    def _test_syncbn_empty_train(self, size=1, half=False, device='cuda'):
        """Check SyncBatchNorm (``stats_mode='N'``) on a zero-sized batch.

        Mirrors ``_test_syncbn_train`` but uses inputs with batch size 0,
        and finally checks that an unknown ``stats_mode`` is rejected.

        Args:
            size (int): Number of ranks per process group; one of 1, 2, 4.
            half (bool): Whether to cast the input and the output gradient
                to half precision.
            device (str): Device the tensors and modules are moved to.
        """

        # Silently do nothing unless launched as a 4-task SLURM job.
        if 'SLURM_NTASKS' not in os.environ or int(
                os.environ['SLURM_NTASKS']) != 4:
            print('must run with slurm has 4 processes!\n'
                  'srun -p test --gres=gpu:4 -n4')
            return
        else:
            print('Running syncbn test')
        from mmcv.ops import SyncBatchNorm

        assert size in (1, 2, 4)
        if not dist.is_initialized():
            self.dist_init()
        rank = dist.get_rank()

        torch.manual_seed(9)
        if IS_CUDA_AVAILABLE:
            torch.cuda.manual_seed(9)
        elif IS_MUSA_AVAILABLE:
            torch.musa.manual_seed(9)

        # Batch dimension is 0, so every slice taken below is empty as well.
        self.x = torch.rand(0, 3, 2, 3).to(device)
        self.y_bp = torch.rand(0, 3, 2, 3).to(device)

        if half:
            self.x = self.x.half()
            self.y_bp = self.y_bp.half()
        dist.broadcast(self.x, src=0)
        dist.broadcast(self.y_bp, src=0)

        if IS_CUDA_AVAILABLE:
            torch.cuda.synchronize()
        elif IS_MUSA_AVAILABLE:
            torch.musa.synchronize()
        # Every rank executes every new_group call, including those for
        # groups it does not belong to, and then picks its own group.
        if size == 1:
            groups = [None, None, None, None]
            groups[0] = dist.new_group([0])
            groups[1] = dist.new_group([1])
            groups[2] = dist.new_group([2])
            groups[3] = dist.new_group([3])
            group = groups[rank]
        elif size == 2:
            groups = [None, None, None, None]
            groups[0] = groups[1] = dist.new_group([0, 1])
            groups[2] = groups[3] = dist.new_group([2, 3])
            group = groups[rank]
        elif size == 4:
            group = dist.group.WORLD

        syncbn = SyncBatchNorm(3, group=group, stats_mode='N').to(device)
        syncbn.weight.data[0] = 0.2
        syncbn.weight.data[1] = 0.5
        syncbn.weight.data[2] = 0.7
        syncbn.train()

        # Reference layer with the same per-channel weights.
        bn = nn.BatchNorm2d(3).to(device)
        bn.weight.data[0] = 0.2
        bn.weight.data[1] = 0.5
        bn.weight.data[2] = 0.7
        bn.train()

        sx = self.x[rank * 4:rank * 4 + 4]
        sx.requires_grad_()
        sy = syncbn(sx)
        sy.backward(self.y_bp[rank * 4:rank * 4 + 4])
        smean = syncbn.running_mean
        svar = syncbn.running_var
        sx_grad = sx.grad
        sw_grad = syncbn.weight.grad
        sb_grad = syncbn.bias.grad

        if size == 1:
            x = self.x[rank * 4:rank * 4 + 4]
            y_bp = self.y_bp[rank * 4:rank * 4 + 4]
        elif size == 2:
            x = self.x[rank // 2 * 8:rank // 2 * 8 + 8]
            y_bp = self.y_bp[rank // 2 * 8:rank // 2 * 8 + 8]
        elif size == 4:
            x = self.x
            y_bp = self.y_bp
        x.requires_grad_()
        y = bn(x)
        y.backward(y_bp)

        if size == 2:
            y = y[rank % 2 * 4:rank % 2 * 4 + 4]
        elif size == 4:
            y = y[rank * 4:rank * 4 + 4]

        mean = bn.running_mean
        var = bn.running_var
        if size == 1:
            x_grad = x.grad
            w_grad = bn.weight.grad
            b_grad = bn.bias.grad
        elif size == 2:
            x_grad = x.grad[rank % 2 * 4:rank % 2 * 4 + 4]
            w_grad = bn.weight.grad / 2
            b_grad = bn.bias.grad / 2
        elif size == 4:
            x_grad = x.grad[rank * 4:rank * 4 + 4]
            w_grad = bn.weight.grad / 4
            b_grad = bn.bias.grad / 4

        # The third positional argument of np.allclose is rtol.
        assert np.allclose(mean.data.cpu().numpy(),
                           smean.data.cpu().numpy(), 1e-3)
        assert np.allclose(var.data.cpu().numpy(),
                           svar.data.cpu().numpy(), 1e-3)
        assert np.allclose(y.data.cpu().numpy(), sy.data.cpu().numpy(), 1e-3)
        assert np.allclose(w_grad.data.cpu().numpy(),
                           sw_grad.data.cpu().numpy(), 1e-3)
        assert np.allclose(b_grad.data.cpu().numpy(),
                           sb_grad.data.cpu().numpy(), 1e-3)
        assert np.allclose(x_grad.data.cpu().numpy(),
                           sx_grad.data.cpu().numpy(), 1e-2)

        # 'stats_mode' only allows 'default' and 'N'
        with pytest.raises(AssertionError):
            SyncBatchNorm(3, group=group, stats_mode='X')

    # Entry points collected by pytest: one per group size, in full and
    # half precision, for the regular and the zero-sized batch checks.
    def test_syncbn_1(self):
        self._test_syncbn_train(size=1)

    def test_syncbn_2(self):
        self._test_syncbn_train(size=2)

    def test_syncbn_4(self):
        self._test_syncbn_train(size=4)

    def test_syncbn_1_half(self):
        self._test_syncbn_train(size=1, half=True)

    def test_syncbn_2_half(self):
        self._test_syncbn_train(size=2, half=True)

    def test_syncbn_4_half(self):
        self._test_syncbn_train(size=4, half=True)

    def test_syncbn_empty_1(self):
        self._test_syncbn_empty_train(size=1)

    def test_syncbn_empty_2(self):
        self._test_syncbn_empty_train(size=2)

    def test_syncbn_empty_4(self):
        self._test_syncbn_empty_train(size=4)

    def test_syncbn_empty_1_half(self):
        self._test_syncbn_empty_train(size=1, half=True)

    def test_syncbn_empty_2_half(self):
        self._test_syncbn_empty_train(size=2, half=True)

    def test_syncbn_empty_4_half(self):
        self._test_syncbn_empty_train(size=4, half=True)


================================================
FILE: tests/test_ops/test_three_interpolate.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch

from mmcv.ops import three_interpolate
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE


@pytest.mark.parametrize('dtype', [
    torch.half, torch.float,
    pytest.param(
        torch.double,
        marks=pytest.mark.skipif(
            IS_NPU_AVAILABLE or IS_MUSA_AVAILABLE,
            reason='NPU, MUSA does not support for 64-bit floating point'))
])
@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'npu',
        marks=pytest.mark.skipif(
            not IS_NPU_AVAILABLE, reason='requires NPU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
])
def test_three_interpolate(dtype, device):
    """Compare ``three_interpolate`` with precomputed values.

    The inputs are a 2x5x6 feature tensor plus 2x6x3 index and weight
    tensors; the result is expected to match a 2x5x6 reference tensor.
    """
    if IS_MUSA_AVAILABLE:
        torch.musa.empty_cache()

    feats = torch.tensor(
        [
            [[2.4350, 4.7516, 4.4995, 2.4350, 2.4350, 2.4350],
             [3.1236, 2.6278, 3.0447, 3.1236, 3.1236, 3.1236],
             [2.6732, 2.8677, 2.6436, 2.6732, 2.6732, 2.6732],
             [0.0124, 7.0150, 7.0199, 0.0124, 0.0124, 0.0124],
             [0.3207, 0.0000, 0.3411, 0.3207, 0.3207, 0.3207]],
            [[0.0000, 0.9544, 2.4532, 0.0000, 0.0000, 0.0000],
             [0.5346, 1.9176, 1.4715, 0.5346, 0.5346, 0.5346],
             [0.0000, 0.2744, 2.0842, 0.0000, 0.0000, 0.0000],
             [0.3414, 1.5063, 1.6209, 0.3414, 0.3414, 0.3414],
             [0.5814, 0.0103, 0.0000, 0.5814, 0.5814, 0.5814]],
        ],
        dtype=dtype,
        device=device)

    neighbor_idx = torch.tensor(
        [
            [[0, 1, 2], [2, 3, 4], [2, 3, 4],
             [0, 1, 2], [0, 1, 2], [0, 1, 3]],
            [[0, 2, 3], [1, 3, 4], [2, 1, 4],
             [0, 2, 4], [0, 2, 4], [0, 1, 2]],
        ],
        device=device).int()

    neighbor_weight = torch.tensor(
        [
            [[3.3333e-01, 3.3333e-01, 3.3333e-01],
             [1.0000e+00, 5.8155e-08, 2.2373e-08],
             [1.0000e+00, 1.7737e-08, 1.7356e-08],
             [3.3333e-01, 3.3333e-01, 3.3333e-01],
             [3.3333e-01, 3.3333e-01, 3.3333e-01],
             [3.3333e-01, 3.3333e-01, 3.3333e-01]],
            [[3.3333e-01, 3.3333e-01, 3.3333e-01],
             [1.0000e+00, 1.3651e-08, 7.7312e-09],
             [1.0000e+00, 1.7148e-08, 1.4070e-08],
             [3.3333e-01, 3.3333e-01, 3.3333e-01],
             [3.3333e-01, 3.3333e-01, 3.3333e-01],
             [3.3333e-01, 3.3333e-01, 3.3333e-01]],
        ],
        dtype=dtype,
        device=device)

    interpolated = three_interpolate(feats, neighbor_idx, neighbor_weight)

    reference = torch.tensor(
        [
            [[3.8953e+00, 4.4995e+00, 4.4995e+00,
              3.8953e+00, 3.8953e+00, 3.2072e+00],
             [2.9320e+00, 3.0447e+00, 3.0447e+00,
              2.9320e+00, 2.9320e+00, 2.9583e+00],
             [2.7281e+00, 2.6436e+00, 2.6436e+00,
              2.7281e+00, 2.7281e+00, 2.7380e+00],
             [4.6824e+00, 7.0199e+00, 7.0199e+00,
              4.6824e+00, 4.6824e+00, 2.3466e+00],
             [2.2060e-01, 3.4110e-01, 3.4110e-01,
              2.2060e-01, 2.2060e-01, 2.1380e-01]],
            [[8.1773e-01, 9.5440e-01, 2.4532e+00,
              8.1773e-01, 8.1773e-01, 1.1359e+00],
             [8.4689e-01, 1.9176e+00, 1.4715e+00,
              8.4689e-01, 8.4689e-01, 1.3079e+00],
             [6.9473e-01, 2.7440e-01, 2.0842e+00,
              6.9473e-01, 6.9473e-01, 7.8619e-01],
             [7.6789e-01, 1.5063e+00, 1.6209e+00,
              7.6789e-01, 7.6789e-01, 1.1562e+00],
             [3.8760e-01, 1.0300e-02, 8.3569e-09,
              3.8760e-01, 3.8760e-01, 1.9723e-01]],
        ],
        dtype=dtype,
        device=device)

    assert torch.allclose(interpolated, reference, rtol=1e-3, atol=1e-4)


================================================
FILE: tests/test_ops/test_three_nn.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch

from mmcv.ops import three_nn
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE

# Input passed to ``three_nn`` as its second argument: a nested list with
# 2 batches of 5 three-component entries.
known = [[[-1.8373, 3.5605, -0.7867], [0.7615, 2.9420, 0.2314],
          [-0.6503, 3.6637, -1.0622], [-1.8373, 3.5605, -0.7867],
          [-1.8373, 3.5605, -0.7867]],
         [[-1.3399, 1.9991, -0.3698], [-0.0799, 0.9698, -0.8457],
          [0.0858, 2.4721, -0.1928], [-1.3399, 1.9991, -0.3698],
          [-1.3399, 1.9991, -0.3698]]]

# Input passed to ``three_nn`` as its first argument: a nested list with
# 2 batches of 10 three-component entries.
unknown = [[[-1.8373, 3.5605, -0.7867], [0.7615, 2.9420, 0.2314],
            [-0.6503, 3.6637, -1.0622], [-1.5237, 2.3976, -0.8097],
            [-0.0722, 3.4017, -0.2880], [0.5198, 3.0661, -0.4605],
            [-2.0185, 3.5019, -0.3236], [0.5098, 3.1020, 0.5799],
            [-1.6137, 3.8443, -0.5269], [0.7341, 2.9626, -0.3189]],
           [[-1.3399, 1.9991, -0.3698], [-0.0799, 0.9698, -0.8457],
            [0.0858, 2.4721, -0.1928], [-0.9022, 1.6560, -1.3090],
            [0.1156, 1.6901, -0.4366], [-0.6477, 2.3576, -0.1563],
            [-0.8482, 1.1466, -1.2704], [-0.8753, 2.0845, -0.3460],
            [-0.5621, 1.4233, -1.2858], [-0.5883, 1.3114, -1.2899]]]

# Expected first output of ``three_nn``: three values for each of the
# 2 x 10 ``unknown`` entries.
expected_dist = [[[0.0000, 0.0000, 0.0000], [0.0000, 2.0463, 2.8588],
                  [0.0000, 1.2229, 1.2229], [1.2047, 1.2047, 1.2047],
                  [1.0011, 1.0845, 1.8411], [0.7433, 1.4451, 2.4304],
                  [0.5007, 0.5007, 0.5007], [0.4587, 2.0875, 2.7544],
                  [0.4450, 0.4450, 0.4450], [0.5514, 1.7206, 2.6811]],
                 [[0.0000, 0.0000, 0.0000], [0.0000, 1.6464, 1.6952],
                  [0.0000, 1.5125, 1.5125], [1.0915, 1.0915, 1.0915],
                  [0.8197, 0.8511, 1.4894], [0.7433, 0.8082, 0.8082],
                  [0.8955, 1.3340, 1.3340], [0.4730, 0.4730, 0.4730],
                  [0.7949, 1.3325, 1.3325], [0.7566, 1.3727, 1.3727]]]

# Expected second output of ``three_nn``: three integer indices for each of
# the 2 x 10 ``unknown`` entries (presumably positions within ``known``).
expected_idx = [[[0, 3, 4], [1, 2, 0], [2, 0, 3], [0, 3, 4], [2, 1, 0],
                 [1, 2, 0], [0, 3, 4], [1, 2, 0], [0, 3, 4], [1, 2, 0]],
                [[0, 3, 4], [1, 2, 0], [2, 0, 3], [0, 3, 4], [2, 1, 0],
                 [2, 0, 3], [1, 0, 3], [0, 3, 4], [1, 0, 3], [1, 0, 3]]]


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'mlu',
        marks=pytest.mark.skipif(
            not IS_MLU_AVAILABLE, reason='requires MLU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))
])
@pytest.mark.parametrize('dtype,rtol', [(torch.float, 1e-8),
                                        (torch.half, 1e-3)])
def test_three_nn(device, dtype, rtol):
    """Compare both outputs of three_nn with the stored references."""
    # NOTE(review): the parametrized dtype is overwritten here, so the
    # torch.half case also runs in float32 (only rtol differs). Confirm
    # whether this override is intentional.
    dtype = torch.float

    query = torch.tensor(unknown, device=device, dtype=dtype)
    reference = torch.tensor(known, device=device, dtype=dtype)
    want_dist = torch.tensor(expected_dist, device=device, dtype=dtype)
    want_idx = torch.tensor(expected_idx, device=device)

    got_dist, got_idx = three_nn(query, reference)

    # Distances are compared with a tolerance, indices must match exactly.
    assert torch.allclose(got_dist, want_dist, rtol=rtol, atol=1e-4)
    assert torch.all(got_idx == want_idx)


================================================
FILE: tests/test_ops/test_tin_shift.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import os

import numpy as np
import pytest
import torch

from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE

_USING_PARROTS = True
try:
    from parrots.autograd import gradcheck
except ImportError:
    from torch.autograd import gradcheck

    _USING_PARROTS = False

cur_dir = os.path.dirname(os.path.abspath(__file__))

# Input fixture shared by the tin_shift tests; nested list of shape
# (2, 4, 8, 2) that is converted to a tensor inside each helper.
inputs = ([[[[0.88572276, 0.46422583], [0.97408265, 0.59547687],
             [0.030812204, 0.96236038], [0.75418317, 0.44058233],
             [0.33279222, 0.00084149837], [0.7069388, 0.23255438],
             [0.13547045, 0.81549376], [0.40174931, 0.36317211]],
            [[0.57444429, 0.15905505], [0.39897251, 0.25790238],
             [0.93282568, 0.18451685], [0.92526674, 0.18283755],
             [0.31664443, 0.59323865], [0.1957739, 0.42505842],
             [0.081158757, 0.81340349], [0.43456328, 0.30195212]],
            [[0.8198145, 0.05990988], [0.98062474, 0.34803438],
             [0.10412294, 0.37183142], [0.15021622, 0.038857818],
             [0.40985721, 0.42253625], [0.71150124, 0.59778064],
             [0.83851069, 0.15194464], [0.097513378, 0.74820143]],
            [[0.80680406, 0.49327564], [0.17821097, 0.12980539],
             [0.50657678, 0.14446253], [0.04178369, 0.53071898],
             [0.84983683, 0.3826949], [0.32193625, 0.91275406],
             [0.75628334, 0.52934098], [0.27994192, 0.3053292]]],
           [[[0.082397044, 0.4210068], [0.23563534, 0.7938987],
             [0.63669145, 0.69397897], [0.8844561, 0.97854084],
             [0.79027033, 0.60640401], [0.63528901, 0.72172403],
             [0.0097346902, 0.70800996], [0.87891227, 0.13674974]],
            [[0.74329448, 0.0243572], [0.82178867, 0.85750699],
             [0.7568835, 0.73146772], [0.5031184, 0.30479157],
             [0.28713053, 0.47414285], [0.4682079, 0.067471564],
             [0.48368263, 0.14590704], [0.25397325, 0.19946373]],
            [[0.4291026, 0.068739474], [0.7159555, 0.79903615],
             [0.76412082, 0.85348046], [0.081224024, 0.82264912],
             [0.97173303, 0.24291694], [0.48957139, 0.43488795],
             [0.67382395, 0.21889746], [0.36712623, 0.67127824]],
            [[0.12054044, 0.18096751], [0.86675781, 0.54755616],
             [0.68208277, 0.15164375], [0.79991871, 0.80811197],
             [0.85256428, 0.68253738], [0.185983, 0.95642138],
             [0.48102546, 0.28009653], [0.35726011, 0.58168036]]]])

# Two integer shift configurations, each a nested list of shape (2, 4).
# They are paired by position with the entries of `outputs` and `grads`.
shifts = [([[1, 0, 1, -2], [-2, 1, -1, 1]]), ([[2, 1, 2, -1], [-1, 2, 0, 2]])]

# Expected tin_shift(inputs, shift) results, one nested list of shape
# (2, 4, 8, 2) per entry of `shifts`; compared in _test_tinshift_allclose.
outputs = [([[[[0.0, 0.0], [0.0, 0.0], [0.030812, 0.96236], [0.75418, 0.44058],
               [0.0, 0.0], [0.0, 0.0], [0.83851, 0.15194], [0.097513, 0.7482]],
              [[0.88572, 0.46423], [0.97408, 0.59548], [0.93283, 0.18452],
               [0.92527, 0.18284], [0.33279, 0.0008415], [0.70694, 0.23255],
               [0.75628, 0.52934], [0.27994, 0.30533]],
              [[0.57444, 0.15906], [0.39897, 0.2579], [0.10412, 0.37183],
               [0.15022, 0.038858], [0.31664, 0.59324], [0.19577, 0.42506],
               [0.0, 0.0], [0.0, 0.0]],
              [[0.81981, 0.05991], [0.98062, 0.34803], [0.50658, 0.14446],
               [0.041784, 0.53072], [0.40986, 0.42254], [0.7115, 0.59778],
               [0.0, 0.0], [0.0, 0.0]]],
             [[[0.4291, 0.068739], [0.71596, 0.79904], [0.0, 0.0], [0.0, 0.0],
               [0.28713, 0.47414], [0.46821, 0.067472], [0.0, 0.0], [0.0,
                                                                     0.0]],
              [[0.12054, 0.18097], [0.86676, 0.54756], [0.63669, 0.69398],
               [0.88446, 0.97854], [0.97173, 0.24292], [0.48957, 0.43489],
               [0.0097347, 0.70801], [0.87891, 0.13675]],
              [[0.0, 0.0], [0.0, 0.0], [0.75688, 0.73147], [0.50312, 0.30479],
               [0.85256, 0.68254], [0.18598, 0.95642], [0.48368, 0.14591],
               [0.25397, 0.19946]],
              [[0.0, 0.0], [0.0, 0.0], [0.76412, 0.85348], [0.081224, 0.82265],
               [0.0, 0.0], [0.0, 0.0], [0.67382, 0.2189], [0.36713,
                                                           0.67128]]]]),
           ([[[[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0],
               [0.0, 0.0], [0.081159, 0.8134], [0.43456, 0.30195]],
              [[0.0, 0.0], [0.0, 0.0], [0.030812, 0.96236], [0.75418, 0.44058],
               [0.0, 0.0], [0.0, 0.0], [0.83851, 0.15194], [0.097513, 0.7482]],
              [[0.88572, 0.46423], [0.97408, 0.59548], [0.93283, 0.18452],
               [0.92527, 0.18284], [0.33279, 0.0008415], [0.70694, 0.23255],
               [0.75628, 0.52934], [0.27994, 0.30533]],
              [[0.57444, 0.15906], [0.39897, 0.2579], [0.10412, 0.37183],
               [0.15022, 0.038858], [0.31664, 0.59324], [0.19577, 0.42506],
               [0.0, 0.0], [0.0, 0.0]]],
             [[[0.74329, 0.024357], [0.82179, 0.85751], [0.0, 0.0], [0.0, 0.0],
               [0.79027, 0.6064], [0.63529, 0.72172], [0.0, 0.0], [0.0, 0.0]],
              [[0.4291, 0.068739], [0.71596, 0.79904], [0.0, 0.0], [0.0, 0.0],
               [0.28713, 0.47414], [0.46821, 0.067472], [0.0, 0.0], [0.0,
                                                                     0.0]],
              [[0.12054, 0.18097], [0.86676, 0.54756], [0.63669, 0.69398],
               [0.88446, 0.97854], [0.97173, 0.24292], [0.48957, 0.43489],
               [0.0097347, 0.70801], [0.87891, 0.13675]],
              [[0.0, 0.0], [0.0, 0.0], [0.75688, 0.73147], [0.50312, 0.30479],
               [0.85256, 0.68254], [0.18598, 0.95642], [0.48368, 0.14591],
               [0.25397, 0.19946]]]])]

# Expected gradients of the input, one nested list of shape (2, 4, 8, 2) per
# entry of `shifts`. _test_tinshift_allclose compares them with x.grad after
# calling backward with an all-ones tensor.
grads = [
    [[[[0., 0.], [0., 0.], [1., 1.], [1., 1.], [0., 0.], [0., 0.], [1., 1.],
       [1., 1.]],
      [[1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.],
       [1., 1.]],
      [[1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [0., 0.],
       [0., 0.]],
      [[1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [0., 0.],
       [0., 0.]]],
     [[[1., 1.], [1., 1.], [0., 0.], [0., 0.], [1., 1.], [1., 1.], [0., 0.],
       [0., 0.]],
      [[1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.],
       [1., 1.]],
      [[0., 0.], [0., 0.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.],
       [1., 1.]],
      [[0., 0.], [0., 0.], [1., 1.], [1., 1.], [0., 0.], [0., 0.], [1., 1.],
       [1., 1.]]]],
    [[[[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [1., 1.],
       [1., 1.]],
      [[0., 0.], [0., 0.], [1., 1.], [1., 1.], [0., 0.], [0., 0.], [1., 1.],
       [1., 1.]],
      [[1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.],
       [1., 1.]],
      [[1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [0., 0.],
       [0., 0.]]],
     [[[1., 1.], [1., 1.], [0., 0.], [0., 0.], [1., 1.], [1., 1.], [0., 0.],
       [0., 0.]],
      [[1., 1.], [1., 1.], [0., 0.], [0., 0.], [1., 1.], [1., 1.], [0., 0.],
       [0., 0.]],
      [[1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.],
       [1., 1.]],
      [[0., 0.], [0., 0.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.],
       [1., 1.]]]]
]


def _test_tinshift_gradcheck(device, dtype):
    """Run gradcheck on tin_shift for every entry of ``shifts``.

    Skips the calling test when the op cannot be imported or when ``dtype``
    is ``torch.half``.
    """
    try:
        from mmcv.ops import tin_shift
    except ModuleNotFoundError:
        pytest.skip('TINShift op is not successfully compiled')

    if dtype == torch.half:
        pytest.skip('"add_cpu/sub_cpu" not implemented for Half')

    # Under parrots gradcheck is called with its defaults; the torch version
    # is given explicit (loose) tolerances.
    if torch.__version__ == 'parrots':
        check_kwargs = {}
    else:
        check_kwargs = dict(atol=1, rtol=0.1)

    for shift_cfg in shifts:
        x = torch.tensor(
            np.array(inputs), dtype=dtype, device=device, requires_grad=True)
        shift_t = torch.tensor(np.array(shift_cfg), device=device).int()
        gradcheck(tin_shift, (x, shift_t), **check_kwargs)


def _test_tinshift_allclose(device, dtype):
    """Check forward results and input gradients against the fixtures.

    For each entry of ``shifts`` the op output is compared with the matching
    entry of ``outputs`` and, after backward with an all-ones gradient,
    ``x.grad`` is compared with the matching entry of ``grads``.
    """
    try:
        from mmcv.ops import tin_shift
    except ModuleNotFoundError:
        pytest.skip('TINShift op is not successfully compiled')

    def as_float_numpy(tensor):
        # Cast to float32 on the host so every dtype is compared the same way.
        return tensor.data.type(torch.float).cpu().numpy()

    for shift_cfg, want_out, want_grad in zip(shifts, outputs, grads):
        want_out = np.array(want_out)
        want_grad = np.array(want_grad)

        x = torch.tensor(
            np.array(inputs), dtype=dtype, device=device, requires_grad=True)
        shift_t = torch.tensor(np.array(shift_cfg), device=device).int()

        result = tin_shift(x, shift_t)
        result.backward(torch.ones_like(result))

        assert np.allclose(as_float_numpy(result), want_out, rtol=1e-3)
        assert np.allclose(as_float_numpy(x.grad), want_grad, rtol=1e-3)


def _test_tinshift_assert(device, dtype):
    """Check that tin_shift rejects mismatched input/shift shapes."""
    try:
        from mmcv.ops import tin_shift
    except ModuleNotFoundError:
        pytest.skip('TINShift op is not successfully compiled')

    bad_inputs = [
        torch.rand(2, 3, 4, 2),
        torch.rand(2, 3, 4, 2),
        torch.rand(1, 3, 4, 2),
    ]
    bad_shifts = [torch.rand(2, 3), torch.rand(2, 5)]

    # NOTE(review): zip stops at the shorter list, so the third input above
    # is never passed to tin_shift -- confirm whether a third shift is
    # missing.
    for raw_x, raw_shift in zip(bad_inputs, bad_shifts):
        x = raw_x.to(device).type(dtype)
        shift = raw_shift.to(device).type(dtype)

        # A ValueError should be raised if ops get inputs with wrong shapes.
        with pytest.raises(ValueError):
            tin_shift(x, shift)


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'mlu',
        marks=pytest.mark.skipif(
            not IS_MLU_AVAILABLE, reason='requires MLU support')),
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
])
@pytest.mark.parametrize('dtype', [
    torch.float,
    pytest.param(
        torch.double,
        marks=pytest.mark.skipif(
            IS_MLU_AVAILABLE or IS_MUSA_AVAILABLE,
            reason='MLU, MUSA does not support for 64-bit floating point')),
    torch.half
])
def test_tinshift(device, dtype):
    """Run all tin_shift checks for one device/dtype combination.

    The shape-validation check runs before the gradient check on purpose:
    ``_test_tinshift_gradcheck`` calls ``pytest.skip`` for ``torch.half``,
    which ends the test immediately, so anything placed after it would never
    run for that dtype.
    """
    _test_tinshift_allclose(device=device, dtype=dtype)
    _test_tinshift_assert(device=device, dtype=dtype)
    _test_tinshift_gradcheck(device=device, dtype=dtype)


================================================
FILE: tests/test_ops/test_upfirdn2d.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch

_USING_PARROTS = True
try:
    from parrots.autograd import gradcheck
except ImportError:
    from torch.autograd import gradcheck, gradgradcheck
    _USING_PARROTS = False


class TestUpFirDn2d:
    """Unit test for UpFirDn2d.

    Only the basic upsampling case is covered here. More general tests are
    expected to be included in the unit tests for the UpFirDnUpsample and
    UpFirDnDownSample modules.
    """

    @classmethod
    def setup_class(cls):
        # 4x4 kernel built as the outer product of a 1-D tap vector and
        # normalised so that its entries sum to one.
        taps = torch.tensor([1., 3., 3., 1.])
        outer = taps[:, None] * taps[None, :]
        cls.kernel = outer / outer.sum()
        cls.factor = 2
        extra = cls.kernel.shape[0] - cls.factor
        cls.pad = ((extra + 1) // 2 + cls.factor - 1, extra // 2)

        cls.input_tensor = torch.randn((2, 3, 4, 4), requires_grad=True)

    @pytest.mark.skipif(not torch.cuda.is_available(), reason='requires cuda')
    def test_upfirdn2d(self):
        from mmcv.ops import upfirdn2d

        def grad_args():
            # Build fresh CUDA copies for every gradient check.
            return (self.input_tensor.cuda(),
                    self.kernel.type_as(self.input_tensor).cuda(),
                    self.factor, 1, self.pad)

        if _USING_PARROTS:
            gradcheck(upfirdn2d, grad_args(), delta=1e-4, pt_atol=1e-3)
        else:
            gradcheck(upfirdn2d, grad_args(), eps=1e-4, atol=1e-3)
            gradgradcheck(upfirdn2d, grad_args(), eps=1e-4, atol=1e-3)

        # test with different up
        random_kernel = torch.randn(3, 3)
        upsampled = upfirdn2d(
            self.input_tensor.cuda(),
            filter=random_kernel.cuda(),
            up=2,
            padding=1)
        assert upsampled.shape == (2, 3, 8, 8)

        # test with different down
        larger_input = torch.randn(2, 3, 8, 8)
        downsampled = upfirdn2d(
            larger_input.cuda(),
            filter=self.kernel.cuda(),
            down=2,
            padding=1)
        assert downsampled.shape == (2, 3, 4, 4)

        # test with different flip_filter
        flipped = upfirdn2d(
            self.input_tensor.cuda(),
            filter=self.kernel.cuda(),
            flip_filter=True)
        assert flipped.shape == (2, 3, 1, 1)

        # test with different gain: doubling the gain must double the output
        high_gain = upfirdn2d(
            self.input_tensor.cuda(), filter=self.kernel.cuda(), gain=0.2)
        low_gain = upfirdn2d(
            self.input_tensor.cuda(), filter=self.kernel.cuda(), gain=0.1)
        assert torch.allclose(high_gain, low_gain * 2)


================================================
FILE: tests/test_ops/test_voxelization.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch

from mmcv.ops import Voxelization
from mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,
                        IS_NPU_AVAILABLE)


def _get_voxel_points_indices(points, coors, voxel):
    result_form = np.equal(coors, voxel)
    return result_form[:, 0] & result_form[:, 1] & result_form[:, 2]


@pytest.mark.parametrize('device_type', [
    'cpu',
    pytest.param(
        'cuda:0',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support'))
])
def test_voxelization(device_type):
    """Check hard and dynamic voxelization against the stored reference."""
    voxel_size = [0.5, 0.5, 0.5]
    point_cloud_range = [0, -40, -3, 70.4, 40, 1]

    reference = np.load(
        'tests/data/for_3d_ops/test_voxel.npy', allow_pickle=True).item()
    # NOTE(review): the 'coors'/'voxels' names here look swapped relative to
    # test_voxelization_nondeterministic, which unpacks the same call as
    # (voxels, coors, ...). The comparisons below are self-consistent;
    # confirm the naming against the reference file.
    expected_coors = reference['coors']
    expected_voxels = reference['voxels']
    expected_num_points_per_voxel = reference['num_points_per_voxel']

    points = torch.tensor(reference['points'])
    dynamic_voxelization = Voxelization(voxel_size, point_cloud_range, -1)
    hard_voxelization = Voxelization(voxel_size, point_cloud_range, 1000)

    def as_numpy(tensor):
        return tensor.cpu().detach().numpy()

    # test hard_voxelization on cpu/gpu
    points = points.contiguous().to(torch.device(device_type))
    coors, voxels, num_points_per_voxel = hard_voxelization.forward(points)
    assert np.all(as_numpy(coors) == expected_coors)
    assert np.all(as_numpy(voxels) == expected_voxels)
    assert np.all(
        as_numpy(num_points_per_voxel) == expected_num_points_per_voxel)

    # test dynamic_voxelization on cpu/gpu
    coors = as_numpy(dynamic_voxelization.forward(points))
    points = as_numpy(points)
    for i in range(expected_voxels.shape[0]):
        selected = points[_get_voxel_points_indices(points, coors,
                                                    expected_voxels[i])]
        count = selected.shape[0]
        assert count > 0
        assert np.all(selected == expected_coors[i][:count])
        assert count == expected_num_points_per_voxel[i]


@pytest.mark.skipif(not IS_CUDA_AVAILABLE, reason='requires CUDA support')
def test_voxelization_nondeterministic():
    """Check the ``deterministic=False`` hard voxelization on CUDA.

    The output of the non-deterministic instance is validated against the
    per-point coordinates produced by a second Voxelization instance built
    with ``max_num_points=-1``, rather than against a stored reference.
    """
    voxel_size = [0.5, 0.5, 0.5]
    point_cloud_range = [0, -40, -3, 70.4, 40, 1]

    voxel_dict = np.load(
        'tests/data/for_3d_ops/test_voxel.npy', allow_pickle=True).item()
    points = voxel_dict['points']

    points = torch.tensor(points)
    max_num_points = -1
    dynamic_voxelization = Voxelization(voxel_size, point_cloud_range,
                                        max_num_points)

    max_num_points = 10
    max_voxels = 50
    hard_voxelization = Voxelization(
        voxel_size,
        point_cloud_range,
        max_num_points,
        max_voxels,
        deterministic=False)

    # test hard_voxelization (non-deterministic version) on gpu
    # `points` is already a tensor at this point; it is wrapped in
    # torch.tensor again before being moved to the GPU.
    points = torch.tensor(points).contiguous().to(device='cuda:0')
    voxels, coors, num_points_per_voxel = hard_voxelization.forward(points)
    coors = coors.cpu().detach().numpy().tolist()
    voxels = voxels.cpu().detach().numpy().tolist()
    num_points_per_voxel = num_points_per_voxel.cpu().detach().numpy().tolist()

    coors_all = dynamic_voxelization.forward(points)
    coors_all = coors_all.cpu().detach().numpy().tolist()

    coors_set = {tuple(c) for c in coors}
    coors_all_set = {tuple(c) for c in coors_all}

    # Every returned coordinate is unique and also appears in the
    # coordinates produced by dynamic_voxelization.
    assert len(coors_set) == len(coors)
    assert len(coors_set - coors_all_set) == 0

    # From here on `points` is a plain nested list, not a tensor.
    points = points.cpu().detach().numpy().tolist()

    # Group the input points by the coordinate dynamic_voxelization gave them.
    coors_points_dict = {}
    for c, ps in zip(coors_all, points):
        if tuple(c) not in coors_points_dict:
            coors_points_dict[tuple(c)] = set()
        coors_points_dict[tuple(c)].add(tuple(ps))

    for c, ps, n in zip(coors, voxels, num_points_per_voxel):
        ideal_voxel_points_set = coors_points_dict[tuple(c)]
        voxel_points_set = {tuple(p) for p in ps[:n]}
        # The first n points stored for a voxel must be distinct.
        assert len(voxel_points_set) == n
        if n < max_num_points:
            # Below the cap: the stored points must be exactly the grouped
            # ones, and the remaining slots must be all zeros.
            assert voxel_points_set == ideal_voxel_points_set
            for p in ps[n:]:
                assert max(p) == min(p) == 0
        else:
            # At the cap: the stored points must be a subset of the grouped
            # ones.
            assert len(voxel_points_set - ideal_voxel_points_set) == 0

    # test hard_voxelization (non-deterministic version) on gpu
    # with all input point in range
    # Rebuild a tensor from the list and keep only the first max_voxels
    # points.
    points = torch.tensor(points).contiguous().to(device='cuda:0')[:max_voxels]
    coors_all = dynamic_voxelization.forward(points)
    # Keep only points whose coordinates are all non-negative.
    valid_mask = coors_all.ge(0).all(-1)
    points = points[valid_mask]
    coors_all = coors_all[valid_mask]
    coors_all = coors_all.cpu().detach().numpy().tolist()

    voxels, coors, num_points_per_voxel = hard_voxelization.forward(points)
    coors = coors.cpu().detach().numpy().tolist()

    coors_set = {tuple(c) for c in coors}
    coors_all_set = {tuple(c) for c in coors_all}

    # With the filtered input, both instances must yield the same number of
    # distinct coordinates, and none may repeat.
    assert len(coors_set) == len(coors) == len(coors_all_set)


@pytest.mark.parametrize(
    'device_type',
    [
        pytest.param(
            # this is only used for dipu device testing case.
            # dipu will mock to cuda automatically on mlu physical device.
            'cuda:0',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support'))
    ])
def test_voxelization_mlu(device_type):
    """Check hard voxelization on MLU against the stored reference."""
    reference = np.load(
        'tests/data/for_3d_ops/test_voxel.npy', allow_pickle=True).item()
    expected_coors = reference['coors']
    expected_voxels = reference['voxels']
    expected_num_points_per_voxel = reference['num_points_per_voxel']

    points = torch.tensor(reference['points'])
    hard_voxelization = Voxelization(
        [0.5, 0.5, 0.5], [0, -40, -3, 70.4, 40, 1], 1000)

    # test hard_voxelization on mlu
    points = points.contiguous().to(torch.device(device_type))
    coors, voxels, num_points_per_voxel = (
        out.cpu().detach().numpy()
        for out in hard_voxelization.forward(points))

    assert np.all(coors == expected_coors)
    assert np.all(voxels == expected_voxels)
    assert np.all(num_points_per_voxel == expected_num_points_per_voxel)


@pytest.mark.parametrize('device_type', [
    pytest.param(
        'npu',
        marks=pytest.mark.skipif(
            not IS_NPU_AVAILABLE, reason='requires NPU support'))
])
def test_voxelization_npu(device_type):
    """Check hard voxelization on NPU against the stored reference."""
    reference = np.load(
        'tests/data/for_3d_ops/test_voxel.npy', allow_pickle=True).item()
    expected = (reference['coors'], reference['voxels'],
                reference['num_points_per_voxel'])

    points = torch.tensor(reference['points'])
    hard_voxelization = Voxelization(
        voxel_size=[0.5, 0.5, 0.5],
        point_cloud_range=[0, -40, -3, 70.4, 40, 1],
        max_num_points=1000)

    # test hard_voxelization on npu
    points = points.contiguous().to(torch.device(device_type))
    coors, voxels, num_points_per_voxel = hard_voxelization.forward(points)

    # Compare the three outputs with the reference, in order.
    actual = (coors, voxels, num_points_per_voxel)
    for got, want in zip(actual, expected):
        assert np.all(got.cpu().detach().numpy() == want)


@pytest.mark.parametrize('device_type', [
    pytest.param(
        'musa',
        marks=pytest.mark.skipif(
            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),
])
def test_voxelization_musa(device_type):
    """Check hard voxelization on MUSA against the stored reference."""
    size = [0.5, 0.5, 0.5]
    pc_range = [0, -40, -3, 70.4, 40, 1]

    reference = np.load(
        'tests/data/for_3d_ops/test_voxel.npy', allow_pickle=True).item()
    want_coors = reference['coors']
    want_voxels = reference['voxels']
    want_counts = reference['num_points_per_voxel']

    cloud = torch.tensor(reference['points'])
    hard_voxelization = Voxelization(size, pc_range, 1000)

    # test hard_voxelization on musa
    cloud = cloud.contiguous().to(torch.device(device_type))
    coors, voxels, counts = hard_voxelization.forward(cloud)

    def host(tensor):
        return tensor.cpu().detach().numpy()

    assert np.all(host(coors) == want_coors)
    assert np.all(host(voxels) == want_voxels)
    assert np.all(host(counts) == want_counts)


================================================
FILE: tests/test_transforms/test_transforms_formatting.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
try:
    import torch
except ModuleNotFoundError:
    torch = None
else:
    from mmcv.transforms import ToTensor, to_tensor, ImageToTensor

import copy

import numpy as np
import pytest


@pytest.mark.skipif(condition=torch is None, reason='No torch in current env')
def test_to_tensor():
    """to_tensor converts supported inputs and rejects a string."""
    # One sample per supported input type: torch.Tensor, numpy.ndarray,
    # list, int and float.
    supported_samples = (
        torch.tensor([1, 2, 3]),
        np.array([1, 2, 3]),
        [1, 2, 3],
        1,
        1.0,
    )
    for sample in supported_samples:
        assert isinstance(to_tensor(sample), torch.Tensor)

    # The type of the input object is invalid
    invalid_sample = '123'
    with pytest.raises(TypeError):
        to_tensor(invalid_sample)


@pytest.mark.skipif(condition=torch is None, reason='No torch in current env')
class TestToTensor:
    """Tests for the ToTensor transform."""

    def test_init(self):
        TRANSFORM = ToTensor(keys=['img_label'])
        assert TRANSFORM.keys == ['img_label']

    def test_transform(self):
        TRANSFORMS = ToTensor(['instances.bbox', 'img_label'])

        # Test multi-level key and single-level key (multi-level key is
        # not in results): the transform must raise KeyError. Only the call
        # expected to raise lives inside the context manager, because
        # statements placed after it would never execute.
        results = {'instances': {'label': [1]}, 'img_label': [1]}
        with pytest.raises(KeyError):
            TRANSFORMS.transform(copy.deepcopy(results))

        # Test multi-level key (multi-level key is in results)
        results = {'instances': {'bbox': [[0, 0, 10, 10]]}, 'img_label': [1]}
        results_tensor = TRANSFORMS.transform(copy.deepcopy(results))
        assert isinstance(results_tensor['instances']['bbox'], torch.Tensor)

    def test_repr(self):
        TRANSFORMS = ToTensor(['instances.bbox', 'img_label'])
        TRANSFORMS_str = str(TRANSFORMS)
        # The bare isinstance() expression used here before checked nothing.
        assert isinstance(TRANSFORMS_str, str)


@pytest.mark.skipif(condition=torch is None, reason='No torch in current env')
class TestImageToTensor:
    """Tests for the ImageToTensor transform."""

    def test_init(self):
        transform = ImageToTensor(['img'])
        assert transform.keys == ['img']

    def test_transform(self):
        transform = ImageToTensor(['img'])

        # (input array shape, expected tensor shape): a single-channel image
        # and a three-channel image.
        shape_cases = (
            ((224, 224), (1, 224, 224)),
            ((224, 224, 3), (3, 224, 224)),
        )
        for in_shape, out_shape in shape_cases:
            converted = transform.transform({'img': np.zeros(in_shape)})
            assert converted['img'].shape == out_shape

    def test_repr(self):
        text = str(ImageToTensor(['img']))
        assert isinstance(text, str)


================================================
FILE: tests/test_transforms/test_transforms_loading.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import os.path as osp

import numpy as np
import pytest

from mmcv.transforms import LoadAnnotations, LoadImageFromFile


class TestLoadImageFromFile:
    """Tests for the LoadImageFromFile transform."""

    def test_load_img(self):
        # file_client_args and backend_args can not be both set
        expected_msg = '"file_client_args" and "backend_args" cannot be set'
        with pytest.raises(ValueError, match=expected_msg):
            LoadImageFromFile(
                file_client_args={'backend': 'disk'},
                backend_args={'backend': 'disk'})

        data_dir = osp.join(osp.dirname(__file__), '../data')
        color_path = osp.join(data_dir, 'color.jpg')

        # colour image with default settings
        loader = LoadImageFromFile()
        loaded = loader(copy.deepcopy(dict(img_path=color_path)))
        assert loaded['img_path'] == color_path
        assert loaded['img'].shape == (300, 400, 3)
        assert loaded['img'].dtype == np.uint8
        assert loaded['img_shape'] == (300, 400)
        assert loaded['ori_shape'] == (300, 400)
        expected_repr = loader.__class__.__name__ + \
            "(ignore_empty=False, to_float32=False, color_type='color', " + \
            "imdecode_backend='cv2', backend_args=None)"
        assert repr(loader) == expected_repr

        # to_float32 (fed with the previous output dict)
        loader = LoadImageFromFile(to_float32=True)
        loaded = loader(copy.deepcopy(loaded))
        assert loaded['img'].dtype == np.float32

        # gray image
        gray_path = osp.join(data_dir, 'grayscale.jpg')
        loader = LoadImageFromFile()
        gray = loader(copy.deepcopy(dict(img_path=gray_path)))
        assert gray['img'].shape == (300, 400, 3)
        assert gray['img'].dtype == np.uint8

        loader = LoadImageFromFile(color_type='unchanged')
        gray = loader(copy.deepcopy(gray))
        assert gray['img'].shape == (300, 400)
        assert gray['img'].dtype == np.uint8

        # test load empty: a missing file either raises or yields None,
        # depending on ignore_empty
        gray['img_path'] = osp.join(data_dir, 'fake.jpg')
        loader = LoadImageFromFile(ignore_empty=False)
        with pytest.raises(FileNotFoundError):
            loader(copy.deepcopy(gray))
        loader = LoadImageFromFile(ignore_empty=True)
        assert loader(copy.deepcopy(gray)) is None


class TestLoadAnnotations:
    """Tests for the LoadAnnotations transform."""

    @classmethod
    def setup_class(cls):
        """Build the annotation dict shared by all tests in this class.

        Declared as a classmethod, following pytest's xunit-style
        ``setup_class`` convention, so that ``cls`` is bound explicitly.
        """
        data_prefix = osp.join(osp.dirname(__file__), '../data')
        seg_map = osp.join(data_prefix, 'grayscale.jpg')
        cls.results = {
            'seg_map_path':
            seg_map,
            'instances': [{
                'bbox': [0, 0, 10, 20],
                'bbox_label': 1,
                'keypoints': [1, 2, 3]
            }, {
                'bbox': [10, 10, 110, 120],
                'bbox_label': 2,
                'keypoints': [4, 5, 6]
            }]
        }

    def test_init(self):
        # file_client_args and backend_args can not be both set
        with pytest.raises(
                ValueError,
                match='"file_client_args" and "backend_args" cannot be set'):
            LoadAnnotations(
                file_client_args={'backend': 'disk'},
                backend_args={'backend': 'disk'})

    def test_load_bboxes(self):
        """Only bounding boxes are requested; they come back as float32."""
        transform = LoadAnnotations(
            with_bbox=True,
            with_label=False,
            with_seg=False,
            with_keypoints=False,
        )
        # Each test works on a deep copy of the shared class-level dict.
        results = transform(copy.deepcopy(self.results))
        assert 'gt_bboxes' in results
        assert (results['gt_bboxes'] == np.array([[0, 0, 10, 20],
                                                  [10, 10, 110, 120]])).all()
        assert results['gt_bboxes'].dtype == np.float32

    def test_load_labels(self):
        """Only labels are requested; they come back as int64."""
        transform = LoadAnnotations(
            with_bbox=False,
            with_label=True,
            with_seg=False,
            with_keypoints=False,
        )
        results = transform(copy.deepcopy(self.results))
        assert 'gt_bboxes_labels' in results
        assert (results['gt_bboxes_labels'] == np.array([1, 2])).all()
        assert results['gt_bboxes_labels'].dtype == np.int64

    def test_load_kps(self):
        """Only keypoints are requested; they come back as float32."""
        transform = LoadAnnotations(
            with_bbox=False,
            with_label=False,
            with_seg=False,
            with_keypoints=True,
        )
        results = transform(copy.deepcopy(self.results))
        assert 'gt_keypoints' in results
        assert (results['gt_keypoints'] == np.array([[[1, 2, 3]],
                                                     [[4, 5, 6]]])).all()
        assert results['gt_keypoints'].dtype == np.float32

    def test_load_seg_map(self):
        """Only the segmentation map is requested; it is loaded as uint8."""
        transform = LoadAnnotations(
            with_bbox=False,
            with_label=False,
            with_seg=True,
            with_keypoints=False,
        )
        results = transform(copy.deepcopy(self.results))
        assert 'gt_seg_map' in results
        assert results['gt_seg_map'].shape[:2] == (300, 400)
        assert results['gt_seg_map'].dtype == np.uint8

    def test_repr(self):
        transform = LoadAnnotations(
            with_bbox=True,
            with_label=False,
            with_seg=False,
            with_keypoints=False,
        )
        assert repr(transform) == (
            'LoadAnnotations(with_bbox=True, '
            'with_label=False, with_seg=False, '
            "with_keypoints=False, imdecode_backend='cv2', "
            'backend_args=None)')


================================================
FILE: tests/test_transforms/test_transforms_processing.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import os.path as osp
from unittest.mock import Mock

import numpy as np
import pytest

import mmcv
from mmcv.transforms import (TRANSFORMS, Normalize, Pad, RandomFlip,
                             RandomResize, Resize, TestTimeAug)
from mmcv.transforms.base import BaseTransform

try:
    import torch
except ModuleNotFoundError:
    torch = None
else:
    import torchvision

from numpy.testing import assert_array_almost_equal, assert_array_equal
from PIL import Image


class TestNormalize:
    """Tests for the ``Normalize`` transform."""

    @staticmethod
    def _norm_cfg():
        """Return a fresh copy of the normalization settings used below."""
        return dict(
            mean=[123.675, 116.28, 103.53],
            std=[58.395, 57.12, 57.375],
            to_rgb=True)

    def test_normalize(self):
        """Compare the transform output against a manual computation."""
        cfg = self._norm_cfg()
        normalize = Normalize(**cfg)

        img_path = osp.join(osp.dirname(__file__), '../data/color.jpg')
        img = mmcv.imread(img_path, 'color')
        # Keep an untouched copy to compute the reference from.
        reference = copy.deepcopy(img)

        out = normalize(dict(img=img))

        # The reference reverses the last (channel) axis before applying
        # mean/std, which is what this test expects from ``to_rgb=True``.
        mean = np.array(cfg['mean'])
        std = np.array(cfg['std'])
        expected = (reference[..., ::-1] - mean) / std
        assert np.allclose(out['img'], expected)

    def test_repr(self):
        """``repr`` must show mean, std and the ``to_rgb`` flag."""
        normalize = Normalize(**self._norm_cfg())
        expected = ('Normalize(mean=[123.675 116.28  103.53 ], '
                    'std=[58.395 57.12  57.375], to_rgb=True)')
        assert repr(normalize) == expected


class TestResize:
    """Tests for the ``Resize`` transform."""

    def test_resize(self):
        """Check argument validation, output shapes and annotation scaling."""
        sample = dict(
            img=np.random.random((1333, 800, 3)),
            gt_seg_map=np.random.random((1333, 800, 3)),
            gt_bboxes=np.array([[0, 0, 112, 112]]),
            gt_keypoints=np.array([[[20, 50, 1]]]))

        # Invalid constructor arguments.
        with pytest.raises(AssertionError):
            Resize(scale=None, scale_factor=None)
        with pytest.raises(TypeError):
            Resize(scale_factor=[])

        # (constructor kwargs, expected img.shape[:2], expected scale_factor)
        shape_cases = [
            # scale is int
            (dict(scale=2000), (2000, 2000), (2000 / 800, 2000 / 1333)),
            # scale is tuple
            (dict(scale=(2000, 2000)), (2000, 2000),
             (2000 / 800, 2000 / 1333)),
            # scale_factor is float
            (dict(scale_factor=2.0), (2666, 1600), (2.0, 2.0)),
            # scale_factor is tuple
            (dict(scale_factor=(1.5, 2)), (2666, 1200), (1.5, 2)),
            # keep_ratio is True
            (dict(scale=(2000, 2000), keep_ratio=True), (2000, 1200),
             (1200 / 800, 2000 / 1333)),
        ]
        for kwargs, expected_shape, expected_factor in shape_cases:
            resized = Resize(**kwargs)(copy.deepcopy(sample))
            assert resized['img'].shape[:2] == expected_shape
            assert resized['scale_factor'] == expected_factor

        # Boxes, keypoints and the seg map must follow the image.
        resized = Resize(scale_factor=(1.5, 2))(copy.deepcopy(sample))
        assert (resized['gt_bboxes'] == np.array([[0, 0, 168, 224]])).all()
        assert (resized['gt_keypoints'] == np.array([[[30, 100, 1]]])).all()
        assert resized['gt_seg_map'].shape[:2] == (2666, 1200)

        # clip_object_border=False: the box may extend past the resized image.
        unclipped_input = dict(
            img=np.random.random((300, 400, 3)),
            gt_bboxes=np.array([[200, 150, 600, 450]]))
        resizer = Resize(scale=(200, 150), clip_object_border=False)
        resized = resizer(unclipped_input)
        assert (resized['gt_bboxes'] == np.array([100, 75, 300, 225])).all()

    def test_repr(self):
        """``repr`` must list all constructor arguments."""
        resizer = Resize(scale=(2000, 2000), keep_ratio=True)
        expected = ('Resize(scale=(2000, 2000), '
                    'scale_factor=None, keep_ratio=True, '
                    'clip_object_border=True), backend=cv2), '
                    'interpolation=bilinear)')
        assert repr(resizer) == expected


class TestPad:
    """Tests for the ``Pad`` transform."""

    def test_pad(self):
        """Check argument validation and padding of ``img``/``gt_seg_map``.

        Covers fixed ``size``, ``size_divisor``, ``pad_to_square``,
        ``padding_mode`` and the different accepted forms of ``pad_val``.
        """
        # test size and size_divisor are both set
        with pytest.raises(AssertionError):
            Pad(size=(10, 10), size_divisor=2)

        # test size and size_divisor are both None
        with pytest.raises(AssertionError):
            Pad(size=None, size_divisor=None)

        # test size and pad_to_square are both set
        with pytest.raises(AssertionError):
            Pad(size=(10, 10), pad_to_square=True)

        # test pad_val is not int or tuple
        with pytest.raises(AssertionError):
            Pad(size=(10, 10), pad_val=[])

        # test padding_mode is not 'constant', 'edge', 'reflect' or 'symmetric'
        with pytest.raises(AssertionError):
            Pad(size=(10, 10), padding_mode='edg')

        data_info = dict(
            img=np.random.random((1333, 800, 3)),
            gt_seg_map=np.random.random((1333, 800, 3)),
            gt_bboxes=np.array([[0, 0, 112, 112]]),
            gt_keypoints=np.array([[[20, 50, 1]]]))

        # test pad img / gt_seg_map with size
        trans = Pad(size=(1200, 2000))
        results = trans(copy.deepcopy(data_info))
        assert results['img'].shape[:2] == (2000, 1200)
        assert results['gt_seg_map'].shape[:2] == (2000, 1200)

        # test pad img/gt_seg_map with size_divisor
        trans = Pad(size_divisor=11)
        results = trans(copy.deepcopy(data_info))
        assert results['img'].shape[:2] == (1342, 803)
        assert results['gt_seg_map'].shape[:2] == (1342, 803)

        # test pad img/gt_seg_map with pad_to_square
        trans = Pad(pad_to_square=True)
        results = trans(copy.deepcopy(data_info))
        assert results['img'].shape[:2] == (1333, 1333)
        assert results['gt_seg_map'].shape[:2] == (1333, 1333)

        # test pad img/gt_seg_map with pad_to_square and size_divisor
        trans = Pad(pad_to_square=True, size_divisor=11)
        results = trans(copy.deepcopy(data_info))
        assert results['img'].shape[:2] == (1342, 1342)
        assert results['gt_seg_map'].shape[:2] == (1342, 1342)

        # test padding_mode
        # An all-ones image padded in 'edge' mode must stay all ones.
        new_img = np.ones((1333, 800, 3))
        data_info['img'] = new_img
        trans = Pad(pad_to_square=True, padding_mode='edge')
        results = trans(copy.deepcopy(data_info))
        assert (results['img'] == np.ones((1333, 1333, 3))).all()

        # test pad_val is dict
        # test rgb image, size=(2000, 2000)
        trans = Pad(
            size=(2000, 2000),
            pad_val=dict(img=(12, 12, 12), seg=(10, 10, 10)))
        results = trans(copy.deepcopy(data_info))
        assert (results['img'][1333:2000, 800:2000, :] == 12).all()
        assert (results['gt_seg_map'][1333:2000, 800:2000, :] == 10).all()

        # Without a 'seg' entry the seg map is expected to be padded with 255.
        trans = Pad(size=(2000, 2000), pad_val=dict(img=(12, 12, 12)))
        results = trans(copy.deepcopy(data_info))
        assert (results['img'][1333:2000, 800:2000, :] == 12).all()
        assert (results['gt_seg_map'][1333:2000, 800:2000, :] == 255).all()

        # test rgb image, pad_to_square=True
        trans = Pad(
            pad_to_square=True,
            pad_val=dict(img=(12, 12, 12), seg=(10, 10, 10)))
        results = trans(copy.deepcopy(data_info))
        assert (results['img'][:, 800:1333, :] == 12).all()
        assert (results['gt_seg_map'][:, 800:1333, :] == 10).all()

        trans = Pad(pad_to_square=True, pad_val=dict(img=(12, 12, 12)))
        results = trans(copy.deepcopy(data_info))
        assert (results['img'][:, 800:1333, :] == 12).all()
        assert (results['gt_seg_map'][:, 800:1333, :] == 255).all()

        # test pad_val is int
        # test rgb image
        trans = Pad(size=(2000, 2000), pad_val=12)
        results = trans(copy.deepcopy(data_info))
        assert (results['img'][1333:2000, 800:2000, :] == 12).all()
        assert (results['gt_seg_map'][1333:2000, 800:2000, :] == 255).all()
        # test gray image
        new_img = np.random.random((1333, 800))
        data_info['img'] = new_img
        new_semantic_seg = np.random.random((1333, 800))
        data_info['gt_seg_map'] = new_semantic_seg
        trans = Pad(size=(2000, 2000), pad_val=12)
        results = trans(copy.deepcopy(data_info))
        assert (results['img'][1333:2000, 800:2000] == 12).all()
        assert (results['gt_seg_map'][1333:2000, 800:2000] == 255).all()

    def test_repr(self):
        """``repr`` must list all constructor arguments."""
        trans = Pad(pad_to_square=True, size_divisor=11, padding_mode='edge')
        assert repr(trans) == (
            'Pad(size=None, size_divisor=11, pad_to_square=True, '
            "pad_val={'img': 0, 'seg': 255}), padding_mode=edge)")


class TestCenterCrop:
    """Tests for the ``CenterCrop`` transform built through ``TRANSFORMS``."""

    @classmethod
    def setup_class(cls):
        """Load the reference image and create a random segmentation map."""
        img = mmcv.imread(
            osp.join(osp.dirname(__file__), '../data/color.jpg'), 'color')
        cls.original_img = copy.deepcopy(img)
        seg = np.random.randint(0, 19, (300, 400)).astype(np.uint8)
        cls.gt_semantic_map = copy.deepcopy(seg)

    @staticmethod
    def reset_results(results, original_img, gt_semantic_map):
        """Overwrite the input entries of ``results`` with fresh fixtures.

        Args:
            results (dict): Dict to fill; it is modified in place.
            original_img (np.ndarray): Image stored (as a copy) under ``img``.
            gt_semantic_map (np.ndarray): Map stored (as a copy) under
                ``gt_seg_map``.

        Returns:
            dict: The same ``results`` object.
        """
        results['img'] = copy.deepcopy(original_img)
        results['gt_seg_map'] = copy.deepcopy(gt_semantic_map)
        results['gt_bboxes'] = np.array([[0, 0, 210, 160],
                                         [200, 150, 400, 300]])
        results['gt_keypoints'] = np.array([[[20, 50, 1]], [[200, 150, 1]],
                                            [[300, 225, 1]]])
        return results

    def _crop(self, results, **cfg):
        """Build ``CenterCrop(**cfg)`` and apply it to re-initialised inputs."""
        module = TRANSFORMS.build(dict(type='CenterCrop', **cfg))
        results = self.reset_results(results, self.original_img,
                                     self.gt_semantic_map)
        return module(results)

    def _check_region(self, results, rows, cols):
        """Assert img / seg map equal the given slice of the originals."""
        assert (results['img'] == self.original_img[rows, cols, ...]).all()
        assert (results['gt_seg_map'] == self.gt_semantic_map[rows,
                                                              cols]).all()

    @staticmethod
    def _check_annotations(results, bboxes, keypoints):
        """Assert the boxes and keypoints in ``results`` match exactly."""
        assert np.equal(results['gt_bboxes'], np.array(bboxes)).all()
        assert np.equal(results['gt_keypoints'], np.array(keypoints)).all()

    def test_error(self):
        """Invalid ``crop_size`` values must be rejected at build time.

        Building the transform does not involve torch (``test_repr`` builds
        it unconditionally), so this test is not skipped when torch is
        missing.
        """
        invalid_sizes = [
            -1,  # size is smaller than 0
            (224, -1),  # tuple with one value smaller than 0
            (224, ),  # tuple with len(size) < 2
            (224, 224, 3),  # tuple with len(size) > 2
        ]
        for crop_size in invalid_sizes:
            with pytest.raises(AssertionError):
                TRANSFORMS.build(dict(type='CenterCrop', crop_size=crop_size))

    def test_repr(self):
        """``repr`` of the built transform must be a string."""
        transform = dict(type='CenterCrop', crop_size=224)
        center_crop_module = TRANSFORMS.build(transform)
        assert isinstance(repr(center_crop_module), str)

    def test_transform(self):
        """Check cropped image, seg map, boxes and keypoints for many sizes."""
        # The same dict is reused for every case; ``_crop`` resets its inputs.
        results = {}
        everything = slice(None)

        square_bboxes = [[0, 0, 122, 122], [112, 112, 224, 224]]
        square_kps = [[[0, 12, 0]], [[112, 112, 1]], [[212, 187, 1]]]

        # test CenterCrop when size is int
        results = self._crop(results, crop_size=224)
        assert results['img_shape'] == (224, 224)
        self._check_region(results, slice(38, 262), slice(88, 312))
        self._check_annotations(results, square_bboxes, square_kps)

        # test CenterCrop when size is tuple
        results = self._crop(results, crop_size=(224, 224))
        assert results['img_shape'] == (224, 224)
        self._check_region(results, slice(38, 262), slice(88, 312))
        self._check_annotations(results, square_bboxes, square_kps)

        # test CenterCrop when crop_height != crop_width
        results = self._crop(results, crop_size=(224, 256))
        assert results['img_shape'] == (256, 224)
        self._check_region(results, slice(22, 278), slice(88, 312))
        self._check_annotations(
            results, [[0, 0, 122, 138], [112, 128, 224, 256]],
            [[[0, 28, 0]], [[112, 128, 1]], [[212, 203, 1]]])

        untouched_bboxes = [[0, 0, 210, 160], [200, 150, 400, 300]]
        untouched_kps = [[[20, 50, 1]], [[200, 150, 1]], [[300, 225, 1]]]

        # test CenterCrop when crop_size is equal to img.shape
        img_height, img_width, _ = self.original_img.shape
        results = self._crop(results, crop_size=(img_width, img_height))
        assert results['img_shape'] == (300, 400)
        self._check_region(results, everything, everything)
        self._check_annotations(results, untouched_bboxes, untouched_kps)

        # test CenterCrop when crop_size is larger than img.shape
        results = self._crop(
            results, crop_size=(img_width * 2, img_height * 2))
        assert results['img_shape'] == (300, 400)
        self._check_region(results, everything, everything)
        self._check_annotations(results, untouched_bboxes, untouched_kps)

        half_width_bboxes = [[0, 0, 110, 160], [100, 150, 200, 300]]
        half_width_kps = [[[0, 50, 0]], [[100, 150, 1]], [[200, 225, 0]]]

        # test with padding, pad_val given as int
        results = self._crop(
            results,
            crop_size=(img_width // 2, img_height * 2),
            auto_pad=True,
            pad_cfg=dict(type='Pad', padding_mode='constant', pad_val=12))
        assert results['img_shape'] == (600, 200)
        assert results['img'].shape[:2] == results['gt_seg_map'].shape
        assert (results['img'][300:600, 100:300, ...] == 12).all()
        assert (results['gt_seg_map'][300:600, 100:300] == 255).all()
        self._check_annotations(results, half_width_bboxes, half_width_kps)

        # test with padding, pad_val given as dict
        results = self._crop(
            results,
            crop_size=(img_width // 2, img_height * 2),
            auto_pad=True,
            pad_cfg=dict(
                type='Pad',
                padding_mode='constant',
                pad_val=dict(img=13, seg=33)))
        assert results['img_shape'] == (600, 200)
        assert (results['img'][300:600, 100:300, ...] == 13).all()
        assert (results['gt_seg_map'][300:600, 100:300] == 33).all()
        self._check_annotations(results, half_width_bboxes, half_width_kps)

        # test CenterCrop when crop_width is smaller than img_width
        results = self._crop(results, crop_size=(img_width // 2, img_height))
        assert results['img_shape'] == (img_height, img_width // 2)
        self._check_region(results, everything, slice(100, 300))
        self._check_annotations(results, half_width_bboxes, half_width_kps)

        # test CenterCrop when crop_height is smaller than img_height
        results = self._crop(results, crop_size=(img_width, img_height // 2))
        assert results['img_shape'] == (img_height // 2, img_width)
        self._check_region(results, slice(75, 225), everything)
        self._check_annotations(
            results, [[0, 0, 210, 85], [200, 75, 400, 150]],
            [[[20, 0, 0]], [[200, 75, 1]], [[300, 150, 0]]])

    @pytest.mark.skipif(
        condition=torch is None, reason='No torch in current env')
    def test_torchvision_compare(self):
        """The crop must match ``torchvision.transforms.CenterCrop``."""
        results = self._crop({}, crop_size=224)

        tv_center_crop = torchvision.transforms.CenterCrop(size=224)
        pil_img = Image.fromarray(self.original_img)
        pil_seg = Image.fromarray(self.gt_semantic_map)
        cropped_img = np.array(tv_center_crop(pil_img))
        cropped_seg = np.array(tv_center_crop(pil_seg))
        assert np.equal(results['img'], cropped_img).all()
        assert np.equal(results['gt_seg_map'], cropped_seg).all()


class TestRandomGrayscale:
    """Tests for the ``RandomGrayscale`` transform."""

    @classmethod
    def setup_class(cls):
        """Create a random 10x10 three-channel ``uint8`` image."""
        cls.img = (np.random.rand(10, 10, 3) * 255).astype(np.uint8)

    def test_repr(self):
        """``repr`` of the built transform must be a string."""
        cfg = dict(
            type='RandomGrayscale',
            prob=1.,
            channel_weights=(0.299, 0.587, 0.114),
            keep_channels=True)
        module = TRANSFORMS.build(cfg)
        assert isinstance(repr(module), str)

    def test_error(self):
        """A ``prob`` of 2 must be rejected at build time."""
        cfg = dict(type='RandomGrayscale', prob=2)
        with pytest.raises(AssertionError):
            TRANSFORMS.build(cfg)

    def test_transform(self):
        """Check the output for prob=1, prob=0 and a one-channel input."""
        # The same dict is reused; only its 'img' entry is replaced per case.
        results = dict()

        # prob=1 with keep_channels: every output channel is the weighted sum.
        always_gray = TRANSFORMS.build(
            dict(
                type='RandomGrayscale',
                prob=1.,
                channel_weights=(0.299, 0.587, 0.114),
                keep_channels=True))
        results['img'] = copy.deepcopy(self.img)
        out = always_gray(results)['img']
        c0, c1, c2 = self.img[:, :, 0], self.img[:, :, 1], self.img[:, :, 2]
        expected_gray = (c0 * 0.299 + c1 * 0.587 + c2 * 0.114).astype(
            np.uint8)
        for channel in range(out.shape[2]):
            assert_array_almost_equal(
                out[:, :, channel], expected_gray, decimal=4)
        assert out.shape == (10, 10, 3)

        # prob=0: the image is returned unchanged.
        never_gray = TRANSFORMS.build(dict(type='RandomGrayscale', prob=0.))
        results['img'] = copy.deepcopy(self.img)
        out = never_gray(results)['img']
        assert_array_equal(out, self.img)
        assert out.shape == (10, 10, 3)

        # One-channel input with prob=1: expected to come back unchanged.
        single_channel_cfg = dict(type='RandomGrayscale', prob=1.)
        results['img'] = self.img[:, :, 0:1]
        single_channel = TRANSFORMS.build(single_channel_cfg)
        out = single_channel(results)['img']
        assert_array_equal(out, self.img[:, :, 0:1])
        assert out.shape == (10, 10, 1)


@TRANSFORMS.register_module()
class MockPackTaskInputs(BaseTransform):
    """Registered stand-in packing transform used by the tests below.

    It returns a dict holding the incoming image under ``inputs`` and a
    ``Mock`` object under ``data_sample``.
    """

    def __init__(self) -> None:
        super().__init__()

    def transform(self, results):
        """Return ``results['img']`` packed together with a mock sample."""
        return {'inputs': results['img'], 'data_sample': Mock()}


class TestMultiScaleFlipAug:
    """Tests for the ``MultiScaleFlipAug`` wrapper transform."""

    @classmethod
    def setup_class(cls):
        """Load the reference image and keep a pristine copy of it."""
        cls.img = mmcv.imread(
            osp.join(osp.dirname(__file__), '../data/color.jpg'), 'color')
        cls.original_img = copy.deepcopy(cls.img)

    @staticmethod
    def _full_pipeline():
        """Return a fresh Normalize -> Pad -> ImageToTensor -> pack config."""
        img_norm_cfg = dict(
            mean=[123.675, 116.28, 103.53],
            std=[58.395, 57.12, 57.375],
            to_rgb=True)
        return [
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='MockPackTaskInputs')
        ]

    def _num_outputs(self, transforms, **aug_kwargs):
        """Build the wrapper, run it on the image, return ``len(inputs)``."""
        cfg = dict(
            type='MultiScaleFlipAug', transforms=transforms, **aug_kwargs)
        module = TRANSFORMS.build(cfg)
        results = dict()
        results['img'] = copy.deepcopy(self.original_img)
        packed_results = module(results)
        return len(packed_results['inputs'])

    def test_error(self):
        """Invalid ``scales`` / ``flip_direction`` must fail at build time."""
        # test assertion if scales is not tuple or list of tuple
        with pytest.raises(AssertionError):
            transform = dict(
                type='MultiScaleFlipAug', scales=[1333, 800], transforms=[])
            TRANSFORMS.build(transform)

        # test assertion if flip_direction is not str or list of str
        with pytest.raises(AssertionError):
            transform = dict(
                type='MultiScaleFlipAug',
                scales=[(1333, 800)],
                flip_direction=1,
                transforms=[])
            TRANSFORMS.build(transform)

    @pytest.mark.skipif(
        condition=torch is None, reason='No torch in current env')
    def test_multi_scale_flip_aug(self):
        """Check the number of augmented outputs for several settings."""
        scales = [(1333, 800), (800, 600), (640, 480)]
        directions = ['horizontal', 'vertical', 'diagonal']

        # test with only the packing transform in the pipeline
        # 12 outputs: presumably 3 scales x (no flip + 3 flip directions).
        assert self._num_outputs(
            [dict(type='MockPackTaskInputs')],
            scales=scales,
            allow_flip=True,
            flip_direction=directions) == 12

        # test with allow_flip=False
        assert self._num_outputs(
            [dict(type='MockPackTaskInputs')],
            scales=scales,
            allow_flip=False,
            flip_direction=directions) == 3

        # test with a full transform pipeline
        assert self._num_outputs(
            self._full_pipeline(),
            scales=scales,
            allow_flip=True,
            flip_direction=directions) == 12

        # test with scale_factor
        assert self._num_outputs(
            self._full_pipeline(),
            scale_factor=[0.5, 1., 2.],
            allow_flip=True,
            flip_direction=directions) == 12

        # test no resize (neither scales nor scale_factor is given)
        assert self._num_outputs(
            self._full_pipeline(),
            allow_flip=True,
            flip_direction=directions) == 4


class TestRandomChoiceResize:
    """Tests for the ``RandomChoiceResize`` transform."""

    @classmethod
    def setup_class(cls):
        """Load the reference image and keep a pristine copy of it."""
        cls.img = mmcv.imread(
            osp.join(osp.dirname(__file__), '../data/color.jpg'), 'color')
        cls.original_img = copy.deepcopy(cls.img)

    def reset_results(self, results):
        """Store fresh copies of the image as ``img`` and ``gt_seg_map``."""
        for key in ('img', 'gt_seg_map'):
            results[key] = copy.deepcopy(self.original_img)

    @staticmethod
    def _build(**kwargs):
        """Build a ``RandomChoiceResize`` transform from keyword settings."""
        return TRANSFORMS.build(dict(type='RandomChoiceResize', **kwargs))

    def test_repr(self):
        """``repr`` of the built transform must be a string."""
        module = self._build(scales=[(1333, 800), (1333, 600)])
        assert isinstance(repr(module), str)

    def test_error(self):
        """Scales given as bare numbers (including a float) are rejected."""
        with pytest.raises(AssertionError):
            self._build(scales=[0.5, 1, 2])

    def test_random_multiscale_resize(self):
        """Check output sizes, aspect ratio and box clipping."""
        # One dict is reused throughout; its inputs are reset before each run.
        results = dict()

        # A single scale leaves no choice.
        module = self._build(scales=[(1333, 800)])
        self.reset_results(results)
        results = module(results)
        assert results['img'].shape == (800, 1333, 3)

        # With several scales the output must be one of the candidates.
        candidates = [(1333, 800), (1333, 600)]
        module = self._build(scales=candidates)
        self.reset_results(results)
        results = module(results)
        out_h, out_w = results['img'].shape[0], results['img'].shape[1]
        assert (out_w, out_h) in candidates

        # keep_ratio: height / width must be unchanged by the resize.
        module = self._build(
            scales=[(900, 600)], resize_type='Resize', keep_ratio=True)
        self.reset_results(results)
        ratio_before = results['img'].shape[0] / results['img'].shape[1]
        results = module(results)
        ratio_after = results['img'].shape[0] / results['img'].shape[1]
        assert_array_almost_equal(ratio_before, ratio_after)

        # clip_object_border=True: the box is cut at the image border.
        box = [[200, 150, 600, 450]]
        module = self._build(
            scales=[(200, 150)],
            resize_type='Resize',
            clip_object_border=True)
        self.reset_results(results)
        results['gt_bboxes'] = np.array(box)
        results = module(results)
        assert results['img'].shape == (150, 200, 3)
        clipped = np.array([[100, 75, 200, 150]])
        assert np.equal(results['gt_bboxes'], clipped).all()

        # clip_object_border=False: the box may extend past the image.
        module = self._build(
            scales=[(200, 150)],
            resize_type='Resize',
            clip_object_border=False)
        self.reset_results(results)
        results['gt_bboxes'] = np.array(box)
        results = module(results)
        assert results['img'].shape == (150, 200, 3)
        unclipped = np.array([[100, 75, 300, 225]])
        assert np.equal(results['gt_bboxes'], unclipped).all()


class TestRandomFlip:
    """Unit tests for the ``RandomFlip`` transform."""

    def test_init(self):
        """Check the argument validation done by the constructor."""

        # prob is float
        transform = RandomFlip(0.1)
        assert transform.prob == 0.1

        # prob is None: the constructor is expected to reject it, so there is
        # no instance to inspect afterwards
        with pytest.raises(ValueError):
            RandomFlip(None)

        # prob is a list
        transform = RandomFlip([0.1, 0.2], ['horizontal', 'vertical'])
        assert len(transform.prob) == 2
        assert len(transform.direction) == 2

        # direction is an invalid type
        with pytest.raises(ValueError):
            RandomFlip(0.1, 1)

        # prob is an invalid type
        with pytest.raises(ValueError):
            RandomFlip('0.1')

    def test_transform(self):
        """Check flipped boxes / seg maps for every supported direction."""

        results = {
            'img': np.random.random((224, 224, 3)),
            'gt_bboxes': np.array([[0, 1, 100, 101]]),
            'gt_keypoints': np.array([[[100, 100, 1.0]]]),
            # the seg map is flipped independently of the image, so the
            # gt_seg_map of the test data does not have to match the image.
            'gt_seg_map': np.array([[0, 1], [2, 3]])
        }

        # horizontal flip
        transform = RandomFlip([1.0], ['horizontal'])
        results_update = transform.transform(copy.deepcopy(results))
        assert (results_update['gt_bboxes'] == np.array([[124, 1, 224,
                                                          101]])).all()
        assert (results_update['gt_seg_map'] == np.array([[1, 0], [3,
                                                                   2]])).all()

        # diagonal flip
        transform = RandomFlip([1.0], ['diagonal'])
        results_update = transform.transform(copy.deepcopy(results))
        assert (results_update['gt_bboxes'] == np.array([[124, 123, 224,
                                                          223]])).all()
        assert (results_update['gt_seg_map'] == np.array([[3, 2], [1,
                                                                   0]])).all()

        # vertical flip
        transform = RandomFlip([1.0], ['vertical'])
        results_update = transform.transform(copy.deepcopy(results))
        assert (results_update['gt_bboxes'] == np.array([[0, 123, 100,
                                                          223]])).all()
        assert (results_update['gt_seg_map'] == np.array([[2, 3], [0,
                                                                   1]])).all()

        # horizontal flip when direction is not given
        transform = RandomFlip(1.0)
        results_update = transform.transform(copy.deepcopy(results))
        assert (results_update['gt_bboxes'] == np.array([[124, 1, 224,
                                                          101]])).all()
        assert (results_update['gt_seg_map'] == np.array([[1, 0], [3,
                                                                   2]])).all()

        # horizontal flip and swap label pair
        transform = RandomFlip([1.0], ['horizontal'],
                               swap_seg_labels=[[0, 1]])
        results_update = transform.transform(copy.deepcopy(results))
        assert (results_update['gt_seg_map'] == np.array([[0, 1], [3,
                                                                   2]])).all()
        assert results_update['swap_seg_labels'] == [[0, 1]]

        # no flip at all when prob is 0
        transform = RandomFlip(0.0)
        results_update = transform.transform(copy.deepcopy(results))
        assert (results_update['gt_bboxes'] == np.array([[0, 1, 100,
                                                          101]])).all()
        assert (results_update['gt_seg_map'] == np.array([[0, 1], [2,
                                                                   3]])).all()

        # flip direction is invalid in bbox flip
        with pytest.raises(ValueError):
            transform = RandomFlip(1.0)
            transform._flip_bbox(results['gt_bboxes'], (224, 224), 'invalid')

        # flip direction is invalid in keypoints flip
        with pytest.raises(ValueError):
            transform = RandomFlip(1.0)
            transform._flip_keypoints(results['gt_keypoints'], (224, 224),
                                      'invalid')

        # swap pair is invalid
        with pytest.raises(AssertionError):
            transform = RandomFlip(1.0, swap_seg_labels='invalid')
            transform._flip_seg_map(results['gt_seg_map'], 'horizontal')

    def test_repr(self):
        """``str()`` of the transform must produce a string."""
        transform = RandomFlip(0.1)
        transform_str = str(transform)
        assert isinstance(transform_str, str)


class TestRandomResize:
    """Unit tests for the ``RandomResize`` transform."""

    def test_init(self):
        """The base scale passed to the constructor is stored unchanged."""
        transform = RandomResize(
            (224, 224),
            (1.0, 2.0),
        )
        assert transform.scale == (224, 224)

    def test_repr(self):
        """``str()`` of the transform must produce a string."""
        transform = RandomResize(
            (224, 224),
            (1.0, 2.0),
        )
        transform_str = str(transform)
        assert isinstance(transform_str, str)

    def test_transform(self):
        """Check sampled scales and the resized annotations."""

        # target scale sampled from a base scale and a ratio range
        results = {}
        transform = RandomResize((224, 224), (1.0, 2.0))
        results_update = transform.transform(copy.deepcopy(results))
        assert 224 <= results_update['scale'][0] <= 448
        assert 224 <= results_update['scale'][1] <= 448

        # keep ratio is True
        results = {
            'img': np.random.random((224, 224, 3)),
            'gt_seg_map': np.random.random((224, 224, 3)),
            'gt_bboxes': np.array([[0, 0, 112, 112]]),
            'gt_keypoints': np.array([[[112, 112]]])
        }

        transform = RandomResize((224, 224), (1.0, 2.0),
                                 resize_type='Resize',
                                 keep_ratio=True)
        results_update = transform.transform(copy.deepcopy(results))
        assert 224 <= results_update['img_shape'][0] <= 448
        assert 224 <= results_update['img_shape'][1] <= 448
        assert results_update['keep_ratio']
        # The box must be checked on the transformed results: a 224 pixel
        # wide image is enlarged by a factor in [1, 2], so x2 = 112 has to
        # end up in [112, 224].
        assert 112 <= results_update['gt_bboxes'][0][2] <= 224
        # The transform worked on a deep copy, so the input stays untouched.
        assert results['gt_bboxes'][0][2] == 112

        # keep ratio is False (only checks that the transform runs)
        transform = RandomResize((224, 224), (1.0, 2.0),
                                 resize_type='Resize',
                                 keep_ratio=False)
        results_update = transform.transform(copy.deepcopy(results))

        # scale is a list of two tuples: every side of the sampled scale has
        # to lie in the range spanned by the two tuples
        results = {}
        transform = RandomResize([(224, 448), (112, 224)],
                                 resize_type='Resize',
                                 keep_ratio=True)
        results_update = transform.transform(copy.deepcopy(results))
        assert 224 <= results_update['scale'][1] <= 448
        assert 112 <= results_update['scale'][0] <= 224

        # the type of scale is invalid (mix of tuple and list)
        with pytest.raises(NotImplementedError):
            results = {}
            transform = RandomResize([(224, 448), [112, 224]],
                                     resize_type='Resize',
                                     keep_ratio=True)
            results_update = transform.transform(copy.deepcopy(results))


class TestTestTimeAug:
    """Tests for building, applying and printing ``TestTimeAug``."""

    def test_init(self):
        """Two resizes x two flips x one normalize yield four subroutines."""
        resize_cfgs = [
            dict(type='Resize', scale=(1333, 800), keep_ratio=True),
            dict(type='Resize', scale=(1333, 400), keep_ratio=True),
        ]
        flip_cfgs = [
            dict(type='RandomFlip', prob=1.),
            dict(type='RandomFlip', prob=0.),
        ]
        normalize_cfgs = [
            dict(type='Normalize', mean=(0, 0, 0), std=(1, 1, 1)),
        ]

        tta_transform = TestTimeAug([resize_cfgs, flip_cfgs, normalize_cfgs])
        built = tta_transform.subroutines
        assert len(built) == 4

        # The first two subroutines are inspected stage by stage.
        stage_types = (Resize, RandomFlip, Normalize)
        for index in (0, 1):
            stages = built[index].transforms
            for position, stage_type in enumerate(stage_types):
                assert isinstance(stages[position], stage_type)

    def test_transform(self):
        """Each augmented image equals the manually chained transforms."""
        sample = {
            'img': np.random.random((224, 224, 3)),
            'gt_bboxes': np.array([[0, 1, 100, 101]]),
            'gt_keypoints': np.array([[[100, 100, 1.0]]]),
            'gt_seg_map': np.random.random((224, 224, 3))
        }
        reference_input = copy.deepcopy(sample)

        resize_cfgs = [
            dict(type='Resize', scale=(1333, 800), keep_ratio=True),
            dict(type='Resize', scale=(1333, 400), keep_ratio=True),
        ]
        flip_cfgs = [
            dict(type='RandomFlip', prob=0.),
            dict(type='RandomFlip', prob=1.),
        ]
        normalize_cfgs = [
            dict(type='Normalize', mean=(0, 0, 0), std=(1, 1, 1)),
        ]

        tta_transform = TestTimeAug([resize_cfgs, flip_cfgs, normalize_cfgs])
        augmented = tta_transform.transform(sample)
        assert len(augmented['img']) == 4

        # Pick the individual transform instances out of the subroutines.
        subroutines = tta_transform.subroutines
        first_resize = subroutines[0].transforms[0]
        second_resize = subroutines[2].transforms[0]
        first_flip = subroutines[0].transforms[1]
        second_flip = subroutines[1].transforms[1]
        normalize = subroutines[0].transforms[2]

        # Rebuild the expected outputs by hand: the resize varies slowest,
        # the flip varies fastest.
        expected = []
        for resize in (first_resize, second_resize):
            for flip in (first_flip, second_flip):
                resized = resize.transform(copy.deepcopy(reference_input))
                flipped = flip.transform(resized)
                expected.append(normalize.transform(flipped))

        for index in range(4):
            assert np.allclose(expected[index]['img'],
                               augmented['img'][index])

    def test_repr(self):
        """The first lines of ``repr()`` describe the first subroutine."""
        resize_cfgs = [
            dict(type='Resize', scale=(1333, 800), keep_ratio=True),
            dict(type='Resize', scale=(1333, 400), keep_ratio=True),
        ]
        flip_cfgs = [
            dict(type='RandomFlip', prob=0.),
            dict(type='RandomFlip', prob=1.),
        ]
        normalize_cfgs = [
            dict(type='Normalize', mean=(0, 0, 0), std=(1, 1, 1)),
        ]

        tta_transform = TestTimeAug([resize_cfgs, flip_cfgs, normalize_cfgs])
        lines = repr(tta_transform).split('\n')
        assert lines[0] == 'TestTimeAugtransforms='
        assert lines[1] == 'Compose('
        assert lines[2].startswith('    Resize(scale=(1333, 800)')
        assert lines[3].startswith('    RandomFlip(prob=0.0')
        assert lines[4].startswith('    Normalize(mean=[0. 0. 0.]')


================================================
FILE: tests/test_transforms/test_transforms_wrapper.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import warnings

import numpy as np
import pytest

from mmcv.transforms.base import BaseTransform
from mmcv.transforms.builder import TRANSFORMS
from mmcv.transforms.utils import (avoid_cache_randomness, cache_random_params,
                                   cache_randomness)
from mmcv.transforms.wrappers import (Compose, KeyMapper, RandomApply,
                                      RandomChoice, TransformBroadcaster)


@TRANSFORMS.register_module()
class AddToValue(BaseTransform):
    """Test helper that increases ``results['value']`` by a fixed addend."""

    def __init__(self, addend=0) -> None:
        super().__init__()
        self.addend = addend

    def add(self, results, addend):
        """Add ``addend`` to every leaf found under ``results['value']``.

        Lists and dicts are traversed recursively and rebuilt; any other
        value is combined with ``addend`` through ``+``. A ``UserWarning`` is
        emitted when the top-level value is a list or a dict.

        Args:
            results (dict): Mapping holding the ``'value'`` entry.
            addend: Amount added to each leaf.

        Returns:
            dict: The same ``results`` object with ``'value'`` replaced.
        """
        current = results['value']

        # Report container inputs with the dedicated warning messages.
        notices = ((list, 'value is a list'), (dict, 'value is a dict'))
        for kind, message in notices:
            if isinstance(current, kind):
                warnings.warn(message, UserWarning)

        def _shift(node):
            # Recurse into containers, add at the leaves.
            if isinstance(node, list):
                return [_shift(item) for item in node]
            if isinstance(node, dict):
                return {key: _shift(item) for key, item in node.items()}
            return node + addend

        results['value'] = _shift(current)
        return results

    def transform(self, results):
        """Apply :meth:`add` with the addend stored on the instance."""
        return self.add(results, self.addend)

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}addend = {self.addend}'


@TRANSFORMS.register_module()
class RandomAddToValue(AddToValue):
    """Test helper that adds a freshly drawn random number ``repeat`` times
    to ``results['value']``."""

    def __init__(self, repeat=1) -> None:
        # The fixed addend of the parent class is not used here.
        super().__init__(addend=None)
        self.repeat = repeat

    @cache_randomness
    def get_random_addend(self):
        """Draw the random addend via ``np.random.rand()``."""
        return np.random.rand()

    def transform(self, results):
        """Call :meth:`add` ``self.repeat`` times, drawing an addend for each
        call."""
        for _step in range(self.repeat):
            offset = self.get_random_addend()
            results = self.add(results, addend=offset)
        return results

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}repeat = {self.repeat}'


@TRANSFORMS.register_module()
class SumTwoValues(BaseTransform):
    """Test helper storing ``num_1 + num_2`` under ``results['sum']``.

    When only one of the two keys is present its value is used as the sum;
    when both are missing the sum is ``np.nan``.
    """

    def transform(self, results):
        has_first = 'num_1' in results
        has_second = 'num_2' in results

        if not (has_first or has_second):
            total = np.nan
        elif has_first and has_second:
            total = results['num_1'] + results['num_2']
        elif has_first:
            total = results['num_1']
        else:
            total = results['num_2']

        results['sum'] = total
        return results

    def __repr__(self) -> str:
        return self.__class__.__name__


def test_compose():
    """Build ``Compose`` from configs / instances and run a pipeline whose
    transform returns ``None``."""

    # Case 1: build from cfg
    from_cfg = Compose([dict(type='AddToValue')])
    _ = str(from_cfg)

    # Case 2: build from transform list
    Compose([AddToValue()])

    # Case 3: invalid build arguments (a nested list is not accepted)
    nested_cfg = [[dict(type='AddToValue')]]
    with pytest.raises(TypeError):
        Compose(nested_cfg)

    # Case 4: contain transform with None output
    class DummyTransform(BaseTransform):

        def transform(self, results):
            return None

    none_pipeline = Compose([DummyTransform()])
    output = none_pipeline({})
    assert output is None


def test_cache_random_parameters():
    """Exercise ``cache_random_params`` on a transform with one random
    method."""

    random_add = RandomAddToValue()

    # Case 1: cache random parameters
    assert hasattr(RandomAddToValue, '_methods_with_randomness')
    assert 'get_random_addend' in RandomAddToValue._methods_with_randomness

    with cache_random_params(random_add):
        first = random_add({'value': 0})
        second = random_add({'value': 0})
        np.testing.assert_equal(first['value'], second['value'])

    # Case 2: do not cache random parameters
    first = random_add({'value': 0})
    second = random_add({'value': 0})
    with pytest.raises(AssertionError):
        np.testing.assert_equal(first['value'], second['value'])

    # Case 3: allow to invoke random method 0 times
    never_random = RandomAddToValue(repeat=0)
    with cache_random_params(never_random):
        _ = never_random({'value': 0})

    # Case 4: NOT allow to invoke random method >1 times
    twice_random = RandomAddToValue(repeat=2)
    with pytest.raises(RuntimeError):
        with cache_random_params(twice_random):
            _ = twice_random({'value': 0})

    # Case 5: apply on nested transforms
    nested = Compose([RandomAddToValue()])
    with cache_random_params(nested):
        first = nested({'value': 0})
        second = nested({'value': 0})
        np.testing.assert_equal(first['value'], second['value'])


def test_key_mapper():
    """Exercise ``KeyMapper`` with plain, list, dict and nested mappings,
    the ``auto_remap`` and ``allow_nonexist_keys`` options, ignored keys
    (``...``), iteration and ``repr``."""
    # Case 0: only remap
    pipeline = KeyMapper(
        transforms=[AddToValue(addend=1)], remapping={'value': 'v_out'})

    results = dict(value=0)
    results = pipeline(results)

    np.testing.assert_equal(results['value'], 0)  # should be unchanged
    np.testing.assert_equal(results['v_out'], 1)

    # Case 1: simple remap
    pipeline = KeyMapper(
        transforms=[AddToValue(addend=1)],
        mapping={'value': 'v_in'},
        remapping={'value': 'v_out'})

    results = dict(value=0, v_in=1)
    results = pipeline(results)

    np.testing.assert_equal(results['value'], 0)  # should be unchanged
    np.testing.assert_equal(results['v_in'], 1)
    np.testing.assert_equal(results['v_out'], 2)

    # Case 2: collecting list
    pipeline = KeyMapper(
        transforms=[AddToValue(addend=2)],
        mapping={'value': ['v_in_1', 'v_in_2']},
        remapping={'value': ['v_out_1', 'v_out_2']})
    results = dict(value=0, v_in_1=1, v_in_2=2)

    with pytest.warns(UserWarning, match='value is a list'):
        results = pipeline(results)

    np.testing.assert_equal(results['value'], 0)  # should be unchanged
    np.testing.assert_equal(results['v_in_1'], 1)
    np.testing.assert_equal(results['v_in_2'], 2)
    np.testing.assert_equal(results['v_out_1'], 3)
    np.testing.assert_equal(results['v_out_2'], 4)

    # Case 3: collecting dict
    pipeline = KeyMapper(
        transforms=[AddToValue(addend=2)],
        mapping={'value': {
            'v1': 'v_in_1',
            'v2': 'v_in_2'
        }},
        remapping={'value': {
            'v1': 'v_out_1',
            'v2': 'v_out_2'
        }})
    results = dict(value=0, v_in_1=1, v_in_2=2)

    with pytest.warns(UserWarning, match='value is a dict'):
        results = pipeline(results)

    np.testing.assert_equal(results['value'], 0)  # should be unchanged
    np.testing.assert_equal(results['v_in_1'], 1)
    np.testing.assert_equal(results['v_in_2'], 2)
    np.testing.assert_equal(results['v_out_1'], 3)
    np.testing.assert_equal(results['v_out_2'], 4)

    # Case 4: collecting list with auto_remap mode
    pipeline = KeyMapper(
        transforms=[AddToValue(addend=2)],
        mapping=dict(value=['v_in_1', 'v_in_2']),
        auto_remap=True)
    results = dict(value=0, v_in_1=1, v_in_2=2)

    with pytest.warns(UserWarning, match='value is a list'):
        results = pipeline(results)

    np.testing.assert_equal(results['value'], 0)
    np.testing.assert_equal(results['v_in_1'], 3)
    np.testing.assert_equal(results['v_in_2'], 4)

    # Case 5: collecting dict with auto_remap mode
    pipeline = KeyMapper(
        transforms=[AddToValue(addend=2)],
        mapping=dict(value=dict(v1='v_in_1', v2='v_in_2')),
        auto_remap=True)
    results = dict(value=0, v_in_1=1, v_in_2=2)

    with pytest.warns(UserWarning, match='value is a dict'):
        results = pipeline(results)

    np.testing.assert_equal(results['value'], 0)
    np.testing.assert_equal(results['v_in_1'], 3)
    np.testing.assert_equal(results['v_in_2'], 4)

    # Case 6: nested collection with auto_remap mode
    pipeline = KeyMapper(
        transforms=[AddToValue(addend=2)],
        mapping=dict(value=['v1', dict(v2=['v21', 'v22'], v3='v3')]),
        auto_remap=True)
    results = dict(value=0, v1=1, v21=2, v22=3, v3=4)

    with pytest.warns(UserWarning, match='value is a list'):
        results = pipeline(results)

    np.testing.assert_equal(results['value'], 0)
    np.testing.assert_equal(results['v1'], 3)
    np.testing.assert_equal(results['v21'], 4)
    np.testing.assert_equal(results['v22'], 5)
    np.testing.assert_equal(results['v3'], 6)

    # Case 7: `remapping` must not be given together with `auto_remap=True`
    with pytest.raises(ValueError):
        pipeline = KeyMapper(
            transforms=[AddToValue(addend=1)],
            mapping=dict(value='v_in'),
            remapping=dict(value='v_out'),
            auto_remap=True)

    # Case 8: allow_nonexist_keys
    pipeline = KeyMapper(
        transforms=[SumTwoValues()],
        mapping=dict(num_1='a', num_2='b'),
        auto_remap=False,
        allow_nonexist_keys=True)

    results = pipeline(dict(a=1, b=2))
    np.testing.assert_equal(results['sum'], 3)

    # the mapped key 'b' is missing from the input here
    results = pipeline(dict(a=1))
    np.testing.assert_equal(results['sum'], 1)

    # Case 9: use wrapper as a transform
    transform = KeyMapper(mapping=dict(b='a'), auto_remap=False)
    results = transform(dict(a=1))
    # note that the original key 'a' will not be removed
    assert results == dict(a=1, b=1)

    # Case 10: manually set keys ignored
    pipeline = KeyMapper(
        transforms=[SumTwoValues()],
        mapping=dict(num_1='a', num_2=...),  # num_2 (b) will be ignored
        auto_remap=False,
        # allow_nonexist_keys will not affect manually ignored keys
        allow_nonexist_keys=False)

    results = pipeline(dict(a=1, b=2))
    np.testing.assert_equal(results['sum'], 1)

    # Test basic functions
    pipeline = KeyMapper(
        transforms=[AddToValue(addend=1)],
        mapping=dict(value='v_in'),
        remapping=dict(value='v_out'))

    # __iter__
    for _ in pipeline:
        pass

    # __repr__ (the inner part is the repr defined by AddToValue)
    assert repr(pipeline) == (
        'KeyMapper(transforms = Compose(\n    ' + 'AddToValueaddend = 1' +
        '\n), mapping = {\'value\': \'v_in\'}, ' +
        'remapping = {\'value\': \'v_out\'}, auto_remap = False, ' +
        'allow_nonexist_keys = False)')


def test_transform_broadcaster():
    """Exercise ``TransformBroadcaster`` on list inputs, several keys,
    shared random parameters, partial broadcasting (``...``) and ``repr``."""

    # Case 1: apply to list in results
    pipeline = TransformBroadcaster(
        transforms=[AddToValue(addend=1)],
        mapping=dict(value='values'),
        auto_remap=True)
    results = dict(values=[1, 2])

    results = pipeline(results)

    np.testing.assert_equal(results['values'], [2, 3])

    # Case 2: apply to multiple keys
    pipeline = TransformBroadcaster(
        transforms=[AddToValue(addend=1)],
        mapping=dict(value=['v_1', 'v_2']),
        auto_remap=True)
    results = dict(v_1=1, v_2=2)

    results = pipeline(results)

    np.testing.assert_equal(results['v_1'], 2)
    np.testing.assert_equal(results['v_2'], 3)

    # Case 3: apply to multiple groups of keys
    pipeline = TransformBroadcaster(
        transforms=[SumTwoValues()],
        mapping=dict(num_1=['a_1', 'b_1'], num_2=['a_2', 'b_2']),
        remapping=dict(sum=['a', 'b']),
        auto_remap=False)

    results = dict(a_1=1, a_2=2, b_1=3, b_2=4)
    results = pipeline(results)

    np.testing.assert_equal(results['a'], 3)
    np.testing.assert_equal(results['b'], 7)

    # Case 4: apply to all keys (neither mapping nor remapping is given)
    pipeline = TransformBroadcaster(
        transforms=[SumTwoValues()], mapping=None, remapping=None)
    results = dict(num_1=[1, 2, 3], num_2=[4, 5, 6])

    results = pipeline(results)

    np.testing.assert_equal(results['sum'], [5, 7, 9])

    # Case 5: inconsistent sequence length
    with pytest.raises(ValueError):
        pipeline = TransformBroadcaster(
            transforms=[SumTwoValues()],
            mapping=dict(num_1='list_1', num_2='list_2'),
            auto_remap=False)

        results = dict(list_1=[1, 2], list_2=[1, 2, 3])
        _ = pipeline(results)

    # Case 6: share random parameter
    pipeline = TransformBroadcaster(
        transforms=[RandomAddToValue()],
        mapping=dict(value='values'),
        auto_remap=True,
        share_random_params=True)

    results = dict(values=[0, 0])
    results = pipeline(results)

    np.testing.assert_equal(results['values'][0], results['values'][1])

    # Case 7: partial broadcasting
    # `...` in the mapping: the second group has no `num_2` input
    pipeline = TransformBroadcaster(
        transforms=[SumTwoValues()],
        mapping=dict(num_1=['a_1', 'b_1'], num_2=['a_2', ...]),
        remapping=dict(sum=['a', 'b']),
        auto_remap=False)

    results = dict(a_1=1, a_2=2, b_1=3, b_2=4)
    results = pipeline(results)

    np.testing.assert_equal(results['a'], 3)
    np.testing.assert_equal(results['b'], 3)

    # `...` in the remapping: the second output is not written back
    pipeline = TransformBroadcaster(
        transforms=[SumTwoValues()],
        mapping=dict(num_1=['a_1', 'b_1'], num_2=['a_2', 'b_2']),
        remapping=dict(sum=['a', ...]),
        auto_remap=False)

    results = dict(a_1=1, a_2=2, b_1=3, b_2=4)
    results = pipeline(results)

    np.testing.assert_equal(results['a'], 3)
    assert 'b' not in results

    # Test repr
    assert repr(pipeline) == (
        'TransformBroadcaster(transforms = Compose(\n' + '    SumTwoValues' +
        '\n), mapping = {\'num_1\': [\'a_1\', \'b_1\'], ' +
        '\'num_2\': [\'a_2\', \'b_2\']}, ' +
        'remapping = {\'sum\': [\'a\', Ellipsis]}, auto_remap = False, ' +
        'allow_nonexist_keys = False, share_random_params = False)')


def test_random_choice():
    """Exercise ``RandomChoice`` alone and nested in a
    ``TransformBroadcaster`` that shares the random parameters."""

    # Case 1: given probability
    add_one = [AddToValue(addend=1.0)]
    add_two = [AddToValue(addend=2.0)]
    chooser = RandomChoice(transforms=[add_one, add_two], prob=[1.0, 0.0])

    outcome = chooser({'value': 1})
    np.testing.assert_equal(outcome['value'], 2.0)

    # Case 2: default probability
    chooser = RandomChoice(
        transforms=[[AddToValue(addend=1.0)], [AddToValue(addend=2.0)]])

    _ = chooser({'value': 1})

    # Case 3: nested RandomChoice in TransformBroadcaster
    inner_choice = RandomChoice(
        transforms=[[AddToValue(addend=1.0)], [AddToValue(addend=2.0)]])
    pipeline = TransformBroadcaster(
        transforms=[inner_choice],
        mapping={'value': 'values'},
        auto_remap=True,
        share_random_params=True)

    outcome = pipeline({'values': [0] * 10})
    # check share_random_params=True works so that all values are same
    values = outcome['values']
    reference = values[0]
    assert all(item == reference for item in values)

    # repr
    expected_repr = (
        'TransformBroadcaster(transforms = Compose(\n'
        '    RandomChoice(transforms = [Compose(\n'
        '    AddToValueaddend = 1.0'
        '\n), Compose(\n'
        '    AddToValueaddend = 2.0'
        '\n)]prob = None)'
        '\n), mapping = {\'value\': \'values\'}, '
        'remapping = {\'value\': \'values\'}, auto_remap = True, '
        'allow_nonexist_keys = False, share_random_params = True)')
    assert repr(pipeline) == expected_repr


def test_random_apply():
    """Exercise ``RandomApply`` alone and nested in a
    ``TransformBroadcaster`` that shares the random parameters."""

    # Case 1: simple use
    always = RandomApply(transforms=[AddToValue(addend=1.0)], prob=1.0)
    outcome = always({'value': 1})
    np.testing.assert_equal(outcome['value'], 2.0)

    never = RandomApply(transforms=[AddToValue(addend=1.0)], prob=0.0)
    outcome = never({'value': 1})
    np.testing.assert_equal(outcome['value'], 1.0)

    # Case 2: nested RandomApply in TransformBroadcaster
    coin_flip = RandomApply(transforms=[AddToValue(addend=1)], prob=0.5)
    pipeline = TransformBroadcaster(
        transforms=[coin_flip],
        mapping={'value': 'values'},
        auto_remap=True,
        share_random_params=True)

    outcome = pipeline({'values': [0] * 10})
    # check share_random_params=True works so that all values are same
    values = outcome['values']
    reference = values[0]
    assert all(item == reference for item in values)

    # __iter__
    for _ in pipeline:
        pass

    # repr
    expected_repr = (
        'TransformBroadcaster(transforms = Compose(\n'
        '    RandomApply(transforms = Compose(\n'
        '    AddToValueaddend = 1'
        '\n), prob = 0.5)'
        '\n), mapping = {\'value\': \'values\'}, '
        'remapping = {\'value\': \'values\'}, auto_remap = True, '
        'allow_nonexist_keys = False, share_random_params = True)')
    assert repr(pipeline) == expected_repr


def test_utils():
    """Exercise the ``cache_randomness`` / ``avoid_cache_randomness``
    decorators together with the ``cache_random_params`` context manager."""
    # Test cache_randomness: normal case
    class DummyTransform(BaseTransform):

        @cache_randomness
        def func(self):
            return np.random.rand()

        def transform(self, results):
            _ = self.func()
            return results

    # the transform is called both outside and inside the caching context
    transform = DummyTransform()
    _ = transform({})
    with cache_random_params(transform):
        _ = transform({})

    # Test cache_randomness: invalid function type
    # (the error is expected while the class body is being evaluated)
    with pytest.raises(TypeError):

        class DummyTransform(BaseTransform):

            @cache_randomness
            @staticmethod
            def func():
                return np.random.rand()

            def transform(self, results):
                return results

    # Test cache_randomness: invalid function argument list
    # (the first parameter is named `cls` instead of `self`)
    with pytest.raises(TypeError):

        class DummyTransform(BaseTransform):

            @cache_randomness
            def func(cls):
                return np.random.rand()

            def transform(self, results):
                return results

    # Test avoid_cache_randomness: invalid mixture with cache_randomness
    with pytest.raises(RuntimeError):

        @avoid_cache_randomness
        class DummyTransform(BaseTransform):

            @cache_randomness
            def func(self):
                pass

            def transform(self, results):
                return results

    # Test avoid_cache_randomness: raise error in cache_random_params
    with pytest.raises(RuntimeError):

        @avoid_cache_randomness
        class DummyTransform(BaseTransform):

            def transform(self, results):
                return results

        transform = DummyTransform()
        with cache_random_params(transform):
            pass

    # Test avoid_cache_randomness: non-inheritable
    # (a plain subclass of a decorated class must be accepted by
    # cache_random_params without raising)
    @avoid_cache_randomness
    class DummyBaseTransform(BaseTransform):

        def transform(self, results):
            return results

    class DummyTransform(DummyBaseTransform):
        pass

    transform = DummyTransform()
    with cache_random_params(transform):
        pass


================================================
FILE: tests/test_utils/test_env.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import sys

import pytest

import mmcv


def test_collect_env():
    """Check the keys and a few values reported by ``collect_env``.

    The test is skipped when PyTorch cannot be imported.
    """
    try:
        import torch  # noqa: F401
    except ModuleNotFoundError:
        pytest.skip('skipping tests that require PyTorch')

    from mmcv.utils import collect_env
    info = collect_env()

    # Keys that have to be reported on every platform.
    always_present = (
        'sys.platform', 'Python', 'CUDA available', 'PyTorch',
        'PyTorch compiling details', 'OpenCV', 'MMCV', 'MMCV Compiler', 'GCC',
        'MMCV CUDA Compiler')
    for name in always_present:
        assert name in info

    # Keys that are only required in particular environments.
    if info['CUDA available']:
        assert 'CUDA_HOME' in info
        assert 'NVCC' in info

    if sys.platform == 'win32':
        assert 'MSVC' in info

    # Values that can be recomputed independently.
    python_version = sys.version.replace('\n', '')
    assert info['sys.platform'] == sys.platform
    assert info['Python'] == python_version
    assert info['MMCV'] == mmcv.__version__


================================================
FILE: tests/test_utils/test_parrots_jit.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from mmengine.utils.dl_utils import TORCH_VERSION

import mmcv

# Skip this module as a whole; `allow_module_level=True` is what permits
# calling `pytest.skip` outside of a test function.
pytest.skip('this test not ready now', allow_module_level=True)
# Marker that skips a test case unless TORCH_VERSION equals 'parrots'.
skip_no_parrots = pytest.mark.skipif(
    TORCH_VERSION != 'parrots', reason='test case under parrots environment')


class TestJit:
    """Tests for the ``mmcv.jit`` decorator.

    Each test compares a jitted function with an identical plain-Python
    function, and/or inspects the size of the jit cache.
    """

    def test_add_dict(self):
        """A jitted function taking a dict must match its plain twin."""

        @mmcv.jit
        def add_dict(oper):
            rets = oper['x'] + oper['y']
            return {'result': rets}

        def add_dict_pyfunc(oper):
            rets = oper['x'] + oper['y']
            return {'result': rets}

        a = torch.rand((3, 4))
        b = torch.rand((3, 4))
        oper = {'x': a, 'y': b}

        rets_t = add_dict(oper)
        rets = add_dict_pyfunc(oper)
        # Check the key on the jitted output; the pure-Python reference
        # contains it by construction, so asserting on it proves nothing.
        assert 'result' in rets_t
        assert (rets_t['result'] == rets['result']).all()

    def test_add_list(self):
        """A jitted function mixing a list of dicts with keyword tensors."""

        @mmcv.jit
        def add_list(oper, x, y):
            rets = {}
            for idx, pair in enumerate(oper):
                rets[f'k{idx}'] = pair['x'] + pair['y']
            rets[f'k{len(oper)}'] = x + y
            return rets

        def add_list_pyfunc(oper, x, y):
            rets = {}
            for idx, pair in enumerate(oper):
                rets[f'k{idx}'] = pair['x'] + pair['y']
            rets[f'k{len(oper)}'] = x + y
            return rets

        pair_num = 3
        oper = []
        for _ in range(pair_num):
            oper.append({'x': torch.rand((3, 4)), 'y': torch.rand((3, 4))})
        a = torch.rand((3, 4))
        b = torch.rand((3, 4))
        rets = add_list_pyfunc(oper, x=a, y=b)
        rets_t = add_list(oper, x=a, y=b)
        # One entry per pair plus the trailing ``x + y`` entry.
        for idx in range(pair_num + 1):
            assert f'k{idx}' in rets_t
            assert (rets[f'k{idx}'] == rets_t[f'k{idx}']).all()

    @skip_no_parrots
    def test_jit_cache(self):
        """Changing a non-tensor value that drives a branch adds an entry."""

        @mmcv.jit
        def func(oper):
            if oper['const'] > 1:
                return oper['x'] * 2 + oper['y']
            else:
                return oper['x'] * 2 - oper['y']

        def pyfunc(oper):
            if oper['const'] > 1:
                return oper['x'] * 2 + oper['y']
            else:
                return oper['x'] * 2 - oper['y']

        assert len(func._cache._cache) == 0

        oper = {'const': 2, 'x': torch.rand((3, 4)), 'y': torch.rand((3, 4))}
        rets_plus = pyfunc(oper)
        rets_plus_t = func(oper)
        assert (rets_plus == rets_plus_t).all()
        assert len(func._cache._cache) == 1

        oper['const'] = 0.5
        rets_minus = pyfunc(oper)
        rets_minus_t = func(oper)
        assert (rets_minus == rets_minus_t).all()
        assert len(func._cache._cache) == 2

        # (2x - y) + (2x + y) == 4x, so a quarter of the sum recovers x.
        rets_a = (rets_minus_t + rets_plus_t) / 4
        assert torch.allclose(oper['x'], rets_a)

    @skip_no_parrots
    def test_jit_shape(self):
        """A new input shape is expected to add a new cache entry."""

        @mmcv.jit
        def func(a):
            return a + 1

        assert len(func._cache._cache) == 0

        a = torch.ones((3, 4))
        r = func(a)
        assert r.shape == (3, 4)
        assert (r == 2).all()
        assert len(func._cache._cache) == 1

        a = torch.ones((2, 3, 4))
        r = func(a)
        assert r.shape == (2, 3, 4)
        assert (r == 2).all()
        assert len(func._cache._cache) == 2

    @skip_no_parrots
    def test_jit_kwargs(self):
        """Positional and keyword call forms must share one cache entry."""

        @mmcv.jit
        def func(a, b):
            return torch.mean((a - b) * (a - b))

        assert len(func._cache._cache) == 0
        x = torch.rand((16, 32))
        y = torch.rand((16, 32))
        func(x, y)
        assert len(func._cache._cache) == 1
        func(x, b=y)
        assert len(func._cache._cache) == 1
        func(b=y, a=x)
        assert len(func._cache._cache) == 1

    def test_jit_derivate(self):
        """Gradients flow through a function jitted with ``derivate=True``."""

        @mmcv.jit(derivate=True)
        def func(x, y):
            return (x + 2) * (y - 2)

        a = torch.rand((3, 4))
        b = torch.rand((3, 4))
        a.requires_grad = True

        # d/dx of (x + 2) * (y - 2) is (y - 2); only ``a`` requires grad.
        c = func(a, b)
        assert c.requires_grad
        d = torch.empty_like(c)
        d.fill_(1.0)
        c.backward(d)
        assert torch.allclose(a.grad, (b - 2))
        assert b.grad is None

        # Reset the accumulated gradient and repeat with a scaled upstream
        # gradient.
        a.grad = None
        c = func(a, b)
        assert c.requires_grad
        d = torch.empty_like(c)
        d.fill_(2.7)
        c.backward(d)
        assert torch.allclose(a.grad, 2.7 * (b - 2))
        assert b.grad is None

    def test_jit_optimize(self):
        """``optimize=True`` must not change the numerical result."""

        @mmcv.jit(optimize=True)
        def func(a, b):
            return torch.mean((a - b) * (a - b))

        def pyfunc(a, b):
            return torch.mean((a - b) * (a - b))

        a = torch.rand((16, 32))
        b = torch.rand((16, 32))

        c = func(a, b)
        d = pyfunc(a, b)
        assert torch.allclose(c, d)

    @mmcv.skip_no_elena
    def test_jit_coderize(self):
        """``coderize=True`` must not change the result on CUDA tensors."""
        if not torch.cuda.is_available():
            # Report the test as skipped rather than silently passing.
            pytest.skip('requires CUDA')

        @mmcv.jit(coderize=True)
        def func(a, b):
            return (a + b) * (a - b)

        def pyfunc(a, b):
            return (a + b) * (a - b)

        a = torch.rand((16, 32), device='cuda')
        b = torch.rand((16, 32), device='cuda')

        c = func(a, b)
        d = pyfunc(a, b)
        assert torch.allclose(c, d)

    def test_jit_value_dependent(self):
        """A function calling ``torch.nonzero`` still matches the reference."""

        @mmcv.jit
        def func(a, b):
            torch.nonzero(a)
            return torch.mean((a - b) * (a - b))

        def pyfunc(a, b):
            torch.nonzero(a)
            return torch.mean((a - b) * (a - b))

        a = torch.rand((16, 32))
        b = torch.rand((16, 32))

        c = func(a, b)
        d = pyfunc(a, b)
        assert torch.allclose(c, d)

    @skip_no_parrots
    def test_jit_check_input(self):
        """``check_input`` is expected to reject a function using random."""

        def func(x):
            y = torch.rand_like(x)
            return x + y

        a = torch.ones((3, 4))
        with pytest.raises(AssertionError):
            func = mmcv.jit(func, check_input=(a, ))

    @skip_no_parrots
    def test_jit_partial_shape(self):
        """With ``full_shape=False`` the cache grows only with the rank.

        Two different 2-D shapes are expected to share one entry, and two
        different 3-D shapes to share a second one.
        """

        @mmcv.jit(full_shape=False)
        def func(a, b):
            return torch.mean((a - b) * (a - b))

        def pyfunc(a, b):
            return torch.mean((a - b) * (a - b))

        a = torch.rand((3, 4))
        b = torch.rand((3, 4))
        assert torch.allclose(func(a, b), pyfunc(a, b))
        assert len(func._cache._cache) == 1

        a = torch.rand((6, 5))
        b = torch.rand((6, 5))
        assert torch.allclose(func(a, b), pyfunc(a, b))
        assert len(func._cache._cache) == 1

        a = torch.rand((3, 4, 5))
        b = torch.rand((3, 4, 5))
        assert torch.allclose(func(a, b), pyfunc(a, b))
        assert len(func._cache._cache) == 2

        a = torch.rand((1, 9, 8))
        b = torch.rand((1, 9, 8))
        assert torch.allclose(func(a, b), pyfunc(a, b))
        assert len(func._cache._cache) == 2

    def test_instance_method(self):
        """A jitted method must read the state of the instance it is on."""

        class T:

            def __init__(self, shape):
                self._c = torch.rand(shape)

            @mmcv.jit
            def test_method(self, x, y):
                return (x * self._c) + y

        shape = (16, 32)
        t = T(shape)
        a = torch.rand(shape)
        b = torch.rand(shape)
        res = (a * t._c) + b
        jit_res = t.test_method(a, b)
        assert torch.allclose(res, jit_res)

        # A second instance carries a different ``_c``; the jitted method
        # must use it rather than the first instance's value.
        t = T(shape)
        res = (a * t._c) + b
        jit_res = t.test_method(a, b)
        assert torch.allclose(res, jit_res)


================================================
FILE: tests/test_video/test_optflow.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import os
import os.path as osp
import tempfile

import cv2
import numpy as np
import pytest
from numpy.testing import assert_array_almost_equal, assert_array_equal

import mmcv


def test_flowread():
    """Exercise ``mmcv.flowread`` on a .flo file, an array and images."""
    data_root = osp.join(osp.dirname(__file__), '../data')
    expected_shape = (60, 80, 2)

    # read .flo file
    flo_flow = mmcv.flowread(osp.join(data_root, 'optflow.flo'))
    assert flo_flow.shape == expected_shape

    # pseudo read: an array passed in comes back with equal contents
    assert_array_equal(flo_flow, mmcv.flowread(flo_flow))

    # read quantized flow concatenated vertically
    vertical_path = osp.join(data_root, 'optflow_concat0.jpg')
    vertical_flow = mmcv.flowread(vertical_path, quantize=True, denorm=True)
    assert vertical_flow.shape == expected_shape

    # read quantized flow concatenated horizontally
    horizontal_path = osp.join(data_root, 'optflow_concat1.jpg')
    horizontal_flow = mmcv.flowread(
        horizontal_path, quantize=True, concat_axis=1, denorm=True)
    assert horizontal_flow.shape == expected_shape

    # test exceptions
    non_flow_path = osp.join(data_root, 'color.jpg')
    with pytest.raises(TypeError):
        mmcv.flowread(1)
    # A colour image is rejected both as a .flo file and as quantized flow.
    for read_kwargs in ({}, {'quantize': True}):
        with pytest.raises(IOError):
            mmcv.flowread(non_flow_path, **read_kwargs)
    with pytest.raises(ValueError):
        mmcv.flowread(np.zeros((100, 100, 1)))


def test_flowwrite():
    """Round-trip ``mmcv.flowwrite`` through .flo and quantized .jpg files.

    Every file is written inside a private temporary directory that is
    removed when the test ends. Nothing is left behind when an assertion
    fails, and concurrent runs cannot clash on a shared file name.
    """
    flow = np.random.rand(100, 100, 2).astype(np.float32)

    with tempfile.TemporaryDirectory() as tmp_dir:
        # write to a .flo file and read it back unchanged
        flo_filename = osp.join(tmp_dir, 'mmcv_test_flow.flo')
        mmcv.flowwrite(flow, flo_filename)
        flow_from_file = mmcv.flowread(flo_filename)
        assert_array_equal(flow, flow_from_file)

        # write the quantized flow to a .jpg file, once per concat axis
        tmp_filename = osp.join(tmp_dir, 'mmcv_test_flow.jpg')
        for concat_axis in range(2):
            mmcv.flowwrite(
                flow, tmp_filename, quantize=True, concat_axis=concat_axis)
            # dx and dy are stacked along ``concat_axis``, doubling that side.
            shape = (200, 100) if concat_axis == 0 else (100, 200)
            assert osp.isfile(tmp_filename)
            assert mmcv.imread(tmp_filename, flag='unchanged').shape == shape
            os.remove(tmp_filename)

        # test exceptions
        with pytest.raises(AssertionError):
            mmcv.flowwrite(flow, tmp_filename, quantize=True, concat_axis=2)


def test_quantize_flow():
    """Compare ``mmcv.quantize_flow`` with an element-wise reference."""
    flow = (np.random.rand(10, 8, 2).astype(np.float32) - 0.5) * 15

    # norm=False: shift by max_val, clamp to [0, 2 * max_val], scale to uint8.
    max_val = 5.0
    dx, dy = mmcv.quantize_flow(flow, max_val=max_val, norm=False)
    expected = np.zeros_like(flow, dtype=np.uint8)
    for i, j, k in np.ndindex(*expected.shape):
        shifted = flow[i, j, k] + max_val
        clamped = min(max(shifted, 0), 2 * max_val)
        expected[i, j, k] = min(np.floor(255 * clamped / (2 * max_val)), 254)
    assert_array_equal(dx, expected[..., 0])
    assert_array_equal(dy, expected[..., 1])

    # norm=True: channel 0 is divided by the width and channel 1 by the
    # height before the same shift/clamp/scale steps.
    max_val = 0.5
    dx, dy = mmcv.quantize_flow(flow, max_val=max_val, norm=True)
    expected = np.zeros_like(flow, dtype=np.uint8)
    for i, j, k in np.ndindex(*expected.shape):
        scale = flow.shape[1] if k == 0 else flow.shape[0]
        shifted = flow[i, j, k] / scale + max_val
        clamped = min(max(shifted, 0), 2 * max_val)
        expected[i, j, k] = min(np.floor(255 * clamped / (2 * max_val)), 254)
    assert_array_equal(dx, expected[..., 0])
    assert_array_equal(dy, expected[..., 1])


def test_dequantize_flow():
    """Compare ``mmcv.dequantize_flow`` with an element-wise reference."""
    dx = np.random.randint(256, size=(10, 8), dtype=np.uint8)
    dy = np.random.randint(256, size=(10, 8), dtype=np.uint8)

    # denorm=False: map each quantized value back to [-max_val, max_val].
    max_val = 5.0
    flow = mmcv.dequantize_flow(dx, dy, max_val=max_val, denorm=False)
    expected = np.zeros_like(flow, dtype=np.float32)
    for i, j in np.ndindex(*expected.shape[:2]):
        x_val = float(dx[i, j] + 0.5) * 2 * max_val / 255 - max_val
        y_val = float(dy[i, j] + 0.5) * 2 * max_val / 255 - max_val
        expected[i, j, 0] = x_val
        expected[i, j, 1] = y_val
    assert_array_almost_equal(flow, expected)

    # denorm=True: additionally scale dx by the width and dy by the height.
    max_val = 0.5
    flow = mmcv.dequantize_flow(dx, dy, max_val=max_val, denorm=True)
    h, w = dx.shape
    expected = np.zeros_like(flow, dtype=np.float32)
    for i, j in np.ndindex(*expected.shape[:2]):
        x_val = (float(dx[i, j] + 0.5) * 2 * max_val / 255 - max_val) * w
        y_val = (float(dy[i, j] + 0.5) * 2 * max_val / 255 - max_val) * h
        expected[i, j, 0] = x_val
        expected[i, j, 1] = y_val
    assert_array_almost_equal(flow, expected)


def test_flow2rgb():
    """Check ``mmcv.flow2rgb`` on a one-row flow map against fixed colours."""
    # One row of five flow vectors; the last one has an infinite component.
    vectors = [[0, 0], [0.5, 0.5], [1, 1], [2, 1], [3, np.inf]]
    flow = np.array([vectors], dtype=np.float32)
    flow_img = mmcv.flow2rgb(flow)
    # yapf: disable
    expected = np.array([[[1., 1., 1.],
                          [1., 0.826074731, 0.683772236],
                          [1., 0.652149462, 0.367544472],
                          [1., 0.265650552, 5.96046448e-08],
                          [0., 0., 0.]]],
                        dtype=np.float32)
    # yapf: enable
    assert_array_almost_equal(flow_img, expected)


def test_flow_warp():
    """Check ``mmcv.flow_warp`` interpolation modes and input checks."""
    # With a flow of ones everywhere, both interpolation modes must agree.
    rgb = np.zeros((5, 5, 3))
    rgb[2, 2, 0] = 1
    ones_flow = np.ones((5, 5, 2))
    nearest = mmcv.flow_warp(rgb, ones_flow, interpolate_mode='nearest')
    bilinear = mmcv.flow_warp(rgb, ones_flow, interpolate_mode='bilinear')
    assert_array_almost_equal(nearest, bilinear, decimal=5)

    # Single-channel image with a fractional displacement at one pixel.
    gray = np.zeros((5, 5, 1))
    gray[2, 2, 0] = 1
    gray[2, 3, 0] = 0.75
    frac_flow = np.zeros((5, 5, 2))
    frac_flow[2, 2, :] = [0.5, 0.7]

    # Only the displaced pixel is expected to change.
    expected = np.copy(gray)
    expected[2, 2] = 0.5 * 0.3 + 0.75 * 0.5 * 0.3
    bilinear = mmcv.flow_warp(gray, frac_flow, interpolate_mode='bilinear')
    assert_array_almost_equal(expected, bilinear, decimal=5)

    # An unknown interpolation mode is rejected.
    with pytest.raises(NotImplementedError):
        mmcv.flow_warp(gray, frac_flow, interpolate_mode='xxx')

    # A flow without the trailing channel axis is rejected.
    with pytest.raises(AssertionError):
        mmcv.flow_warp(gray, frac_flow[:, :, 0], interpolate_mode='xxx')


def test_make_color_wheel():
    """Check ``mmcv.make_color_wheel`` against fixed reference tables.

    Covers the default wheel and a wheel built with two bins per segment.
    """
    # yapf: disable
    expected_default = np.array([
        [1., 0., 0.],
        [1., 0.06666667, 0.],
        [1., 0.13333334, 0.],
        [1., 0.2, 0.],
        [1., 0.26666668, 0.],
        [1., 0.33333334, 0.],
        [1., 0.4, 0.],
        [1., 0.46666667, 0.],
        [1., 0.53333336, 0.],
        [1., 0.6, 0.],
        [1., 0.6666667, 0.],
        [1., 0.73333335, 0.],
        [1., 0.8, 0.],
        [1., 0.8666667, 0.],
        [1., 0.93333334, 0.],
        [1., 1., 0.],
        [0.8333333, 1., 0.],
        [0.6666667, 1., 0.],
        [0.5, 1., 0.],
        [0.33333334, 1., 0.],
        [0.16666667, 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.25],
        [0., 1., 0.5],
        [0., 1., 0.75],
        [0., 1., 1.],
        [0., 0.90909094, 1.],
        [0., 0.8181818, 1.],
        [0., 0.72727275, 1.],
        [0., 0.6363636, 1.],
        [0., 0.54545456, 1.],
        [0., 0.45454547, 1.],
        [0., 0.36363637, 1.],
        [0., 0.27272728, 1.],
        [0., 0.18181819, 1.],
        [0., 0.09090909, 1.],
        [0., 0., 1.],
        [0.07692308, 0., 1.],
        [0.15384616, 0., 1.],
        [0.23076923, 0., 1.],
        [0.30769232, 0., 1.],
        [0.3846154, 0., 1.],
        [0.46153846, 0., 1.],
        [0.53846157, 0., 1.],
        [0.61538464, 0., 1.],
        [0.6923077, 0., 1.],
        [0.7692308, 0., 1.],
        [0.84615386, 0., 1.],
        [0.9230769, 0., 1.],
        [1., 0., 1.],
        [1., 0., 0.8333333],
        [1., 0., 0.6666667],
        [1., 0., 0.5],
        [1., 0., 0.33333334],
        [1., 0., 0.16666667],
    ], dtype=np.float32)

    expected_custom = np.array([
        [1., 0., 0.],
        [1., 0.5, 0.],
        [1., 1., 0.],
        [0.5, 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.5],
        [0., 1., 1.],
        [0., 0.5, 1.],
        [0., 0., 1.],
        [0.5, 0., 1.],
        [1., 0., 1.],
        [1., 0., 0.5],
    ], dtype=np.float32)
    # yapf: enable

    default_wheel = mmcv.make_color_wheel()
    custom_wheel = mmcv.make_color_wheel([2, 2, 2, 2, 2, 2])

    assert_array_equal(default_wheel, expected_default)
    assert_array_equal(custom_wheel, expected_custom)


def test_flow_from_bytes():
    """``mmcv.flow_from_bytes`` must agree with ``mmcv.flowread``."""
    data_root = osp.join(osp.dirname(__file__), '../data')
    flo_path = osp.join(data_root, 'optflow.flo')

    # Reference: decode the .flo file through the path-based reader.
    reference = mmcv.flowread(flo_path)

    # Decode the same content from its raw bytes.
    with open(flo_path, 'rb') as stream:
        raw = stream.read()
    decoded = mmcv.flow_from_bytes(raw)

    assert decoded.shape == (60, 80, 2)
    assert np.all(decoded == reference)


def test_sparse_flow_from_bytes():
    """``mmcv.sparse_flow_from_bytes`` must match a direct cv2 decode."""
    data_root = osp.join(osp.dirname(__file__), '../data')
    png_path = osp.join(data_root, 'sparse_flow.png')

    # read flow from bytes
    with open(png_path, 'rb') as stream:
        raw_bytes = stream.read()
    flow, valid = mmcv.sparse_flow_from_bytes(raw_bytes)

    # test flow shape is [H, W, 2] and valid shape is [H, W]
    assert flow.shape[:2] == valid.shape
    assert flow.shape[2] == 2

    # read flow from file: reverse the channel order returned by cv2, take
    # the first two channels as the flow (offset by 2**15, scaled by 1/64)
    # and the third as the valid mask.
    decoded = cv2.imread(png_path, cv2.IMREAD_ANYDEPTH | cv2.IMREAD_COLOR)
    decoded = decoded[:, :, ::-1].astype(np.float32)
    expected_flow = (decoded[:, :, :2] - 2**15) / 64.0
    expected_valid = decoded[:, :, 2]

    assert np.all(flow == expected_flow)
    assert np.all(valid == expected_valid)


================================================
FILE: tests/test_video/test_processing.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import os
import os.path as osp
import platform
import tempfile

import pytest

import mmcv


class TestVideoEditor:
    """Tests for ``mmcv.cut_video``, ``concat_video`` and ``resize_video``."""

    @classmethod
    def setup_class(cls):
        """Record the sample video path and its known frame count."""
        cls.video_path = osp.join(osp.dirname(__file__), '../data/test.mp4')
        cls.num_frames = 168

    @pytest.mark.skipif(platform.system() == 'Windows', reason='skip windows')
    def test_cut_concat_video(self):
        """Cut the sample at 3 seconds, then join the two parts again."""
        tmp_dir = tempfile.gettempdir()
        head_file = osp.join(tmp_dir, '.mmcv_test1.mp4')
        tail_file = osp.join(tmp_dir, '.mmcv_test2.mp4')

        mmcv.cut_video(self.video_path, head_file, end=3, vcodec='h264')
        mmcv.cut_video(self.video_path, tail_file, start=3, vcodec='h264')
        head = mmcv.VideoReader(head_file)
        tail = mmcv.VideoReader(tail_file)
        # The first part is expected to hold 75 frames, the second the rest.
        assert len(head) == 75
        assert len(tail) == self.num_frames - 75

        merged_file = osp.join(tmp_dir, '.mmcv_test.mp4')
        mmcv.concat_video([head_file, tail_file], merged_file)
        merged = mmcv.VideoReader(merged_file)
        assert len(merged) == self.num_frames

        for path in (head_file, tail_file, merged_file):
            os.remove(path)

    @pytest.mark.skipif(platform.system() == 'Windows', reason='skip windows')
    def test_resize_video(self):
        """Resize by explicit size and by ratio, with and without keep_ar."""
        out_file = osp.join(tempfile.gettempdir(), '.mmcv_test.mp4')

        # (extra positional args, keyword args, expected resolution)
        cases = [
            (((200, 100), ), dict(log_level='panic'), (200, 100)),
            ((), dict(ratio=2), (294 * 2, 240 * 2)),
            (((1000, 480), ), dict(keep_ar=True), (294 * 2, 240 * 2)),
            ((), dict(ratio=(2, 1.5), keep_ar=True), (294 * 2, 360)),
        ]
        for extra_args, options, expected in cases:
            mmcv.resize_video(
                self.video_path, out_file, *extra_args, **options)
            reader = mmcv.VideoReader(out_file)
            assert reader.resolution == expected
            os.remove(out_file)


================================================
FILE: tests/test_video/test_reader.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import os
import os.path as osp
import shutil
import tempfile
from collections import OrderedDict

import pytest

import mmcv


class TestCache:
    """Tests for ``mmcv.Cache``: construction, insertion and lookup."""

    def test_init(self):
        """A zero capacity is rejected; a valid cache starts empty."""
        with pytest.raises(ValueError):
            mmcv.Cache(0)

        fresh = mmcv.Cache(100)
        assert fresh.capacity == 100
        assert fresh.size == 0

    def test_put(self):
        """Filling past capacity drops the oldest entry."""
        cache = mmcv.Cache(3)
        for expected_size, value in enumerate((1, 2, 3), start=1):
            cache.put(f'k{value}', value)
            assert cache.size == expected_size
        assert cache._cache == OrderedDict([('k1', 1), ('k2', 2), ('k3', 3)])

        # A fourth key evicts 'k1', the first one inserted.
        after_eviction = OrderedDict([('k2', 2), ('k3', 3), ('k4', 4)])
        cache.put('k4', 4)
        assert cache.size == 3
        assert cache._cache == after_eviction

        # Putting a key that is already present leaves the order untouched.
        cache.put('k2', 2)
        assert cache._cache == after_eviction

    def test_get(self):
        """Missing keys yield ``None`` or the supplied default."""
        cache = mmcv.Cache(3)
        assert cache.get('key_none') is None
        assert cache.get('key_none', 0) == 0

        cache.put('k1', 1)
        assert cache.get('k1') == 1


class TestVideoReader:
    """Tests for ``mmcv.VideoReader`` and the frame/video converters."""

    @classmethod
    def setup_class(cls):
        """Record the local sample video, its frame count and a remote URL."""
        cls.video_path = osp.join(osp.dirname(__file__), '../data/test.mp4')
        cls.num_frames = 168
        cls.video_url = 'https://download.openmmlab.com/mmcv/test_data/sample-mp4-file.mp4'  # noqa: E501

    def test_load(self):
        """Opening a file or a URL exposes the expected video properties."""
        # read from video file
        v = mmcv.VideoReader(self.video_path)
        assert v.width == 294
        assert v.height == 240
        assert v.fps == 25
        assert v.frame_cnt == self.num_frames
        assert len(v) == self.num_frames
        assert v.opened
        import cv2
        assert isinstance(v.vcap, type(cv2.VideoCapture()))

        # read from video url
        v = mmcv.VideoReader(self.video_url)
        assert v.width == 320
        assert v.height == 240
        assert v.fps == 15
        assert v.frame_cnt == 1889
        assert len(v) == 1889
        assert v.opened
        assert isinstance(v.vcap, type(cv2.VideoCapture()))

    def test_read(self):
        """Sequential reads and indexed access return the expected frames.

        Frames are identified by their rounded mean pixel value.
        """
        v = mmcv.VideoReader(self.video_path)
        img = v.read()
        assert int(round(img.mean())) == 94
        img = v.get_frame(63)
        assert int(round(img.mean())) == 94
        img = v[64]
        assert int(round(img.mean())) == 205
        # Negative indices count from the end: -104 maps to frame 64.
        img = v[-104]
        assert int(round(img.mean())) == 205
        img = v[63]
        assert int(round(img.mean())) == 94
        img = v[-105]
        assert int(round(img.mean())) == 94
        # After accessing frame 63, the next sequential read is frame 64.
        img = v.read()
        assert int(round(img.mean())) == 205
        with pytest.raises(IndexError):
            v.get_frame(self.num_frames + 1)
        with pytest.raises(IndexError):
            v[-self.num_frames - 1]

    def test_slice(self):
        """Slicing supports negative bounds, reverse steps and open ends."""
        v = mmcv.VideoReader(self.video_path)
        imgs = v[-105:-103]
        assert int(round(imgs[0].mean())) == 94
        assert int(round(imgs[1].mean())) == 205
        assert len(imgs) == 2
        imgs = v[63:65]
        assert int(round(imgs[0].mean())) == 94
        assert int(round(imgs[1].mean())) == 205
        assert len(imgs) == 2
        imgs = v[64:62:-1]
        assert int(round(imgs[0].mean())) == 205
        assert int(round(imgs[1].mean())) == 94
        assert len(imgs) == 2
        imgs = v[:5]
        assert len(imgs) == 5
        for img in imgs:
            assert int(round(img.mean())) == 94
        imgs = v[165:]
        assert len(imgs) == 3
        for img in imgs:
            assert int(round(img.mean())) == 0
        imgs = v[-3:]
        assert len(imgs) == 3
        for img in imgs:
            assert int(round(img.mean())) == 0

    def test_current_frame(self):
        """``current_frame`` is ``None`` until a frame has been read."""
        v = mmcv.VideoReader(self.video_path)
        assert v.current_frame() is None
        v.read()
        img = v.current_frame()
        assert int(round(img.mean())) == 94

    def test_position(self):
        """``position`` advances with reads and follows ``get_frame``."""
        v = mmcv.VideoReader(self.video_path)
        assert v.position == 0
        for _ in range(10):
            v.read()
        assert v.position == 10
        v.get_frame(99)
        assert v.position == 100

    def test_iterator(self):
        """Iterating a reader yields every frame exactly once."""
        cnt = 0
        for img in mmcv.VideoReader(self.video_path):
            cnt += 1
            assert img.shape == (240, 294, 3)
        assert cnt == self.num_frames

    def test_with(self):
        """The reader is open inside a ``with`` block and closed after it."""
        with mmcv.VideoReader(self.video_path) as v:
            assert v.opened
        assert not v.opened

    def test_cvt2frames(self):
        """``cvt2frames`` writes one image per frame with the given naming."""
        v = mmcv.VideoReader(self.video_path)
        frame_dir = tempfile.mkdtemp()
        v.cvt2frames(frame_dir)
        assert osp.isdir(frame_dir)
        for i in range(self.num_frames):
            filename = f'{frame_dir}/{i:06d}.jpg'
            assert osp.isfile(filename)
            os.remove(filename)

        v = mmcv.VideoReader(self.video_path)
        v.cvt2frames(frame_dir, show_progress=False)
        assert osp.isdir(frame_dir)
        for i in range(self.num_frames):
            filename = f'{frame_dir}/{i:06d}.jpg'
            assert osp.isfile(filename)
            os.remove(filename)

        # Custom template and range: 20 frames named 100.JPEG .. 119.JPEG.
        v = mmcv.VideoReader(self.video_path)
        v.cvt2frames(
            frame_dir,
            file_start=100,
            filename_tmpl='{:03d}.JPEG',
            start=100,
            max_num=20)
        assert osp.isdir(frame_dir)
        for i in range(100, 120):
            filename = f'{frame_dir}/{i:03d}.JPEG'
            assert osp.isfile(filename)
            os.remove(filename)
        shutil.rmtree(frame_dir)

    def test_frames2video(self):
        """``frames2video`` rebuilds a video from extracted frames.

        The frame directory and the output video are removed in a
        ``finally`` block, so a failing assertion leaves nothing behind.
        Each reader on the output file is closed before the file is
        rewritten or deleted.
        """
        v = mmcv.VideoReader(self.video_path)
        frame_dir = tempfile.mkdtemp()
        out_filename = osp.join(tempfile.gettempdir(), 'mmcv_test.avi')
        try:
            v.cvt2frames(frame_dir)
            assert osp.isdir(frame_dir)
            for i in range(self.num_frames):
                filename = f'{frame_dir}/{i:06d}.jpg'
                assert osp.isfile(filename)

            # Default arguments: all frames, 30 fps expected.
            mmcv.frames2video(frame_dir, out_filename)
            with mmcv.VideoReader(out_filename) as v:
                assert v.fps == 30
                assert len(v) == self.num_frames

            # Explicit fps and a [10, 50) frame range: 40 frames expected.
            mmcv.frames2video(
                frame_dir,
                out_filename,
                fps=25,
                start=10,
                end=50,
                show_progress=False)

            with mmcv.VideoReader(out_filename) as v:
                assert v.fps == 25
                assert len(v) == 40
        finally:
            # Best-effort cleanup; it must not mask a test failure.
            shutil.rmtree(frame_dir, ignore_errors=True)
            if osp.exists(out_filename):
                os.remove(out_filename)


================================================
FILE: tests/test_visualization.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest

import mmcv


def test_color():
    """Check ``mmcv.color_val`` conversions and its input validation."""
    # (input, expected tuple) pairs for every accepted input kind.
    accepted = [
        (mmcv.Color.blue, (255, 0, 0)),
        ('green', (0, 255, 0)),
        ((1, 2, 3), (1, 2, 3)),
        (100, (100, 100, 100)),
        (np.zeros(3, dtype=int), (0, 0, 0)),
    ]
    for color, expected in accepted:
        assert mmcv.color_val(color) == expected

    # (input, expected exception) pairs: a list, a float and an
    # out-of-range channel value are all rejected.
    rejected = [
        ([255, 255, 255], TypeError),
        (1.0, TypeError),
        ((0, 0, 500), AssertionError),
    ]
    for color, error in rejected:
        with pytest.raises(error):
            mmcv.color_val(color)